diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,78330 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 11184, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.94134477825465e-05, + "grad_norm": 2.112427348262952, + "learning_rate": 5.952380952380952e-07, + "loss": 1.1653, + "step": 1 + }, + { + "epoch": 0.000178826895565093, + "grad_norm": 2.434476006168246, + "learning_rate": 1.1904761904761904e-06, + "loss": 1.2202, + "step": 2 + }, + { + "epoch": 0.0002682403433476395, + "grad_norm": 2.592543824809968, + "learning_rate": 1.7857142857142857e-06, + "loss": 1.2558, + "step": 3 + }, + { + "epoch": 0.000357653791130186, + "grad_norm": 2.903628976944987, + "learning_rate": 2.3809523809523808e-06, + "loss": 1.2867, + "step": 4 + }, + { + "epoch": 0.0004470672389127325, + "grad_norm": 2.235258973652649, + "learning_rate": 2.9761904761904763e-06, + "loss": 1.1749, + "step": 5 + }, + { + "epoch": 0.000536480686695279, + "grad_norm": 2.8563099406925687, + "learning_rate": 3.5714285714285714e-06, + "loss": 1.2253, + "step": 6 + }, + { + "epoch": 0.0006258941344778255, + "grad_norm": 2.494368565995805, + "learning_rate": 4.166666666666667e-06, + "loss": 1.2004, + "step": 7 + }, + { + "epoch": 0.000715307582260372, + "grad_norm": 2.0993643405018343, + "learning_rate": 4.7619047619047615e-06, + "loss": 1.1268, + "step": 8 + }, + { + "epoch": 0.0008047210300429185, + "grad_norm": 2.369377613971426, + "learning_rate": 5.357142857142857e-06, + "loss": 1.1513, + "step": 9 + }, + { + "epoch": 0.000894134477825465, + "grad_norm": 1.985857758610558, + "learning_rate": 5.9523809523809525e-06, + "loss": 1.0869, + "step": 10 + }, + { + "epoch": 0.0009835479256080114, + "grad_norm": 1.3694751488224577, + "learning_rate": 6.547619047619048e-06, + "loss": 1.0877, + "step": 11 + }, + { + "epoch": 0.001072961373390558, + "grad_norm": 0.8334012801165972, + "learning_rate": 7.142857142857143e-06, + "loss": 1.1164, + "step": 12 + }, + { + "epoch": 0.0011623748211731044, + "grad_norm": 0.5771252526676611, + "learning_rate": 7.738095238095238e-06, + "loss": 0.9837, + "step": 13 + }, + { + "epoch": 0.001251788268955651, + "grad_norm": 0.5444408792960981, + "learning_rate": 8.333333333333334e-06, + "loss": 0.9946, + "step": 14 + }, + { + "epoch": 0.0013412017167381974, + "grad_norm": 0.4248010326392547, + "learning_rate": 8.92857142857143e-06, + "loss": 1.0005, + "step": 15 + }, + { + "epoch": 0.001430615164520744, + "grad_norm": 0.391484329986641, + "learning_rate": 9.523809523809523e-06, + "loss": 0.9996, + "step": 16 + }, + { + "epoch": 0.0015200286123032904, + "grad_norm": 0.3949198780085505, + "learning_rate": 1.011904761904762e-05, + "loss": 1.0328, + "step": 17 + }, + { + "epoch": 0.001609442060085837, + "grad_norm": 0.3169724926637417, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.9521, + "step": 18 + }, + { + "epoch": 0.0016988555078683834, + "grad_norm": 0.3846211659961457, + "learning_rate": 1.130952380952381e-05, + "loss": 1.04, + "step": 19 + }, + { + "epoch": 0.00178826895565093, + "grad_norm": 0.2813372595807122, + "learning_rate": 1.1904761904761905e-05, + "loss": 0.9178, + "step": 20 + }, + { + "epoch": 0.0018776824034334764, + "grad_norm": 0.2590629806163614, + "learning_rate": 1.25e-05, + "loss": 0.9298, + "step": 21 + }, + { + "epoch": 0.001967095851216023, + "grad_norm": 0.2778615780364784, + "learning_rate": 1.3095238095238096e-05, + "loss": 0.9788, + "step": 22 + }, + { + "epoch": 0.0020565092989985696, + "grad_norm": 0.22855427230090553, + "learning_rate": 1.3690476190476192e-05, + "loss": 0.8963, + "step": 23 + }, + { + "epoch": 0.002145922746781116, + "grad_norm": 0.22287931409542902, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.947, + "step": 24 + }, + { + "epoch": 0.0022353361945636626, + "grad_norm": 0.2454499638292665, + "learning_rate": 1.4880952380952381e-05, + "loss": 0.932, + "step": 25 + }, + { + "epoch": 0.002324749642346209, + "grad_norm": 0.23006228145493962, + "learning_rate": 1.5476190476190476e-05, + "loss": 0.9565, + "step": 26 + }, + { + "epoch": 0.0024141630901287556, + "grad_norm": 0.20994741139554976, + "learning_rate": 1.6071428571428572e-05, + "loss": 0.9477, + "step": 27 + }, + { + "epoch": 0.002503576537911302, + "grad_norm": 0.19591699117272993, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.9338, + "step": 28 + }, + { + "epoch": 0.0025929899856938486, + "grad_norm": 0.186456720547025, + "learning_rate": 1.7261904761904763e-05, + "loss": 0.9213, + "step": 29 + }, + { + "epoch": 0.002682403433476395, + "grad_norm": 0.18213872963574826, + "learning_rate": 1.785714285714286e-05, + "loss": 0.923, + "step": 30 + }, + { + "epoch": 0.0027718168812589416, + "grad_norm": 0.17888130644984732, + "learning_rate": 1.8452380952380954e-05, + "loss": 0.9016, + "step": 31 + }, + { + "epoch": 0.002861230329041488, + "grad_norm": 0.2484784417478269, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.9079, + "step": 32 + }, + { + "epoch": 0.0029506437768240345, + "grad_norm": 0.24654017637779918, + "learning_rate": 1.9642857142857145e-05, + "loss": 0.9479, + "step": 33 + }, + { + "epoch": 0.003040057224606581, + "grad_norm": 0.20437989499642115, + "learning_rate": 2.023809523809524e-05, + "loss": 0.9255, + "step": 34 + }, + { + "epoch": 0.0031294706723891275, + "grad_norm": 0.20824372367632946, + "learning_rate": 2.0833333333333336e-05, + "loss": 0.9143, + "step": 35 + }, + { + "epoch": 0.003218884120171674, + "grad_norm": 0.20963793630920013, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.9183, + "step": 36 + }, + { + "epoch": 0.0033082975679542205, + "grad_norm": 0.1654939037518689, + "learning_rate": 2.2023809523809524e-05, + "loss": 0.8891, + "step": 37 + }, + { + "epoch": 0.003397711015736767, + "grad_norm": 0.1659276772175719, + "learning_rate": 2.261904761904762e-05, + "loss": 0.8949, + "step": 38 + }, + { + "epoch": 0.0034871244635193135, + "grad_norm": 0.15052328344161278, + "learning_rate": 2.3214285714285715e-05, + "loss": 0.8908, + "step": 39 + }, + { + "epoch": 0.00357653791130186, + "grad_norm": 0.11685867264985557, + "learning_rate": 2.380952380952381e-05, + "loss": 0.8428, + "step": 40 + }, + { + "epoch": 0.0036659513590844065, + "grad_norm": 0.16580050851010159, + "learning_rate": 2.4404761904761906e-05, + "loss": 0.9181, + "step": 41 + }, + { + "epoch": 0.0037553648068669528, + "grad_norm": 0.15240992237865722, + "learning_rate": 2.5e-05, + "loss": 0.8967, + "step": 42 + }, + { + "epoch": 0.0038447782546494995, + "grad_norm": 0.1363392022631291, + "learning_rate": 2.5595238095238093e-05, + "loss": 0.8552, + "step": 43 + }, + { + "epoch": 0.003934191702432046, + "grad_norm": 0.133844965457179, + "learning_rate": 2.6190476190476192e-05, + "loss": 0.8887, + "step": 44 + }, + { + "epoch": 0.004023605150214592, + "grad_norm": 0.16617829208017074, + "learning_rate": 2.6785714285714288e-05, + "loss": 0.9305, + "step": 45 + }, + { + "epoch": 0.004113018597997139, + "grad_norm": 0.12830162938182604, + "learning_rate": 2.7380952380952383e-05, + "loss": 0.8337, + "step": 46 + }, + { + "epoch": 0.0042024320457796855, + "grad_norm": 0.15343402426297933, + "learning_rate": 2.797619047619048e-05, + "loss": 0.8603, + "step": 47 + }, + { + "epoch": 0.004291845493562232, + "grad_norm": 0.13439121051031538, + "learning_rate": 2.857142857142857e-05, + "loss": 0.8666, + "step": 48 + }, + { + "epoch": 0.004381258941344778, + "grad_norm": 0.15462718685804583, + "learning_rate": 2.916666666666667e-05, + "loss": 0.8577, + "step": 49 + }, + { + "epoch": 0.004470672389127325, + "grad_norm": 0.1618660309417901, + "learning_rate": 2.9761904761904762e-05, + "loss": 0.9205, + "step": 50 + }, + { + "epoch": 0.0045600858369098714, + "grad_norm": 0.1392891233809839, + "learning_rate": 3.0357142857142857e-05, + "loss": 0.8723, + "step": 51 + }, + { + "epoch": 0.004649499284692418, + "grad_norm": 0.13906213824136998, + "learning_rate": 3.095238095238095e-05, + "loss": 0.8368, + "step": 52 + }, + { + "epoch": 0.004738912732474964, + "grad_norm": 0.14042330444254278, + "learning_rate": 3.154761904761905e-05, + "loss": 0.8457, + "step": 53 + }, + { + "epoch": 0.004828326180257511, + "grad_norm": 0.1358034012308072, + "learning_rate": 3.2142857142857144e-05, + "loss": 0.8883, + "step": 54 + }, + { + "epoch": 0.004917739628040057, + "grad_norm": 0.13888715770562465, + "learning_rate": 3.273809523809524e-05, + "loss": 0.9036, + "step": 55 + }, + { + "epoch": 0.005007153075822604, + "grad_norm": 0.14839759755839002, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.8489, + "step": 56 + }, + { + "epoch": 0.00509656652360515, + "grad_norm": 0.16096822534237254, + "learning_rate": 3.392857142857143e-05, + "loss": 0.8531, + "step": 57 + }, + { + "epoch": 0.005185979971387697, + "grad_norm": 0.19150467307557667, + "learning_rate": 3.4523809523809526e-05, + "loss": 0.9197, + "step": 58 + }, + { + "epoch": 0.005275393419170243, + "grad_norm": 0.151106361964583, + "learning_rate": 3.511904761904762e-05, + "loss": 0.8573, + "step": 59 + }, + { + "epoch": 0.00536480686695279, + "grad_norm": 0.17236446797265506, + "learning_rate": 3.571428571428572e-05, + "loss": 0.8755, + "step": 60 + }, + { + "epoch": 0.005454220314735336, + "grad_norm": 0.21216931086355062, + "learning_rate": 3.630952380952381e-05, + "loss": 0.8538, + "step": 61 + }, + { + "epoch": 0.005543633762517883, + "grad_norm": 0.2512819972717374, + "learning_rate": 3.690476190476191e-05, + "loss": 0.8759, + "step": 62 + }, + { + "epoch": 0.005633047210300429, + "grad_norm": 0.20372657846177758, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.8448, + "step": 63 + }, + { + "epoch": 0.005722460658082976, + "grad_norm": 0.3301565754607396, + "learning_rate": 3.809523809523809e-05, + "loss": 0.6656, + "step": 64 + }, + { + "epoch": 0.005811874105865522, + "grad_norm": 0.2322953754460605, + "learning_rate": 3.8690476190476195e-05, + "loss": 0.8861, + "step": 65 + }, + { + "epoch": 0.005901287553648069, + "grad_norm": 0.2696409929829177, + "learning_rate": 3.928571428571429e-05, + "loss": 0.8673, + "step": 66 + }, + { + "epoch": 0.005990701001430615, + "grad_norm": 0.2012803778417414, + "learning_rate": 3.9880952380952386e-05, + "loss": 0.8021, + "step": 67 + }, + { + "epoch": 0.006080114449213162, + "grad_norm": 0.2575642063186467, + "learning_rate": 4.047619047619048e-05, + "loss": 0.8301, + "step": 68 + }, + { + "epoch": 0.006169527896995708, + "grad_norm": 0.20738460528509226, + "learning_rate": 4.107142857142857e-05, + "loss": 0.8549, + "step": 69 + }, + { + "epoch": 0.006258941344778255, + "grad_norm": 0.18987568540801977, + "learning_rate": 4.166666666666667e-05, + "loss": 0.8247, + "step": 70 + }, + { + "epoch": 0.006348354792560801, + "grad_norm": 0.23339429539903328, + "learning_rate": 4.226190476190476e-05, + "loss": 0.7914, + "step": 71 + }, + { + "epoch": 0.006437768240343348, + "grad_norm": 0.20526630379269273, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.8203, + "step": 72 + }, + { + "epoch": 0.006527181688125894, + "grad_norm": 0.21178023859455444, + "learning_rate": 4.345238095238096e-05, + "loss": 0.8379, + "step": 73 + }, + { + "epoch": 0.006616595135908441, + "grad_norm": 0.19642644136128676, + "learning_rate": 4.404761904761905e-05, + "loss": 0.82, + "step": 74 + }, + { + "epoch": 0.006706008583690987, + "grad_norm": 0.14979201269541748, + "learning_rate": 4.464285714285715e-05, + "loss": 0.8114, + "step": 75 + }, + { + "epoch": 0.006795422031473534, + "grad_norm": 0.1533843517037106, + "learning_rate": 4.523809523809524e-05, + "loss": 0.7953, + "step": 76 + }, + { + "epoch": 0.00688483547925608, + "grad_norm": 0.24589888198707474, + "learning_rate": 4.5833333333333334e-05, + "loss": 0.7944, + "step": 77 + }, + { + "epoch": 0.006974248927038627, + "grad_norm": 0.21472095993842374, + "learning_rate": 4.642857142857143e-05, + "loss": 0.777, + "step": 78 + }, + { + "epoch": 0.007063662374821173, + "grad_norm": 0.23711243441455712, + "learning_rate": 4.7023809523809525e-05, + "loss": 0.8415, + "step": 79 + }, + { + "epoch": 0.00715307582260372, + "grad_norm": 0.21468375863184724, + "learning_rate": 4.761904761904762e-05, + "loss": 0.872, + "step": 80 + }, + { + "epoch": 0.007242489270386266, + "grad_norm": 0.30058427114296155, + "learning_rate": 4.8214285714285716e-05, + "loss": 0.8065, + "step": 81 + }, + { + "epoch": 0.007331902718168813, + "grad_norm": 0.18913519221456512, + "learning_rate": 4.880952380952381e-05, + "loss": 0.7919, + "step": 82 + }, + { + "epoch": 0.007421316165951359, + "grad_norm": 0.19761741835369034, + "learning_rate": 4.940476190476191e-05, + "loss": 0.817, + "step": 83 + }, + { + "epoch": 0.0075107296137339056, + "grad_norm": 0.21175213232958234, + "learning_rate": 5e-05, + "loss": 0.7971, + "step": 84 + }, + { + "epoch": 0.007600143061516452, + "grad_norm": 0.1815714572635503, + "learning_rate": 5.05952380952381e-05, + "loss": 0.7828, + "step": 85 + }, + { + "epoch": 0.007689556509298999, + "grad_norm": 0.1982693087157195, + "learning_rate": 5.119047619047619e-05, + "loss": 0.8035, + "step": 86 + }, + { + "epoch": 0.007778969957081545, + "grad_norm": 0.15606061313944483, + "learning_rate": 5.1785714285714296e-05, + "loss": 0.6039, + "step": 87 + }, + { + "epoch": 0.007868383404864092, + "grad_norm": 0.2006816876027447, + "learning_rate": 5.2380952380952384e-05, + "loss": 0.8096, + "step": 88 + }, + { + "epoch": 0.007957796852646639, + "grad_norm": 0.19194501765807784, + "learning_rate": 5.297619047619048e-05, + "loss": 0.7805, + "step": 89 + }, + { + "epoch": 0.008047210300429184, + "grad_norm": 0.2752349850749797, + "learning_rate": 5.3571428571428575e-05, + "loss": 0.817, + "step": 90 + }, + { + "epoch": 0.008136623748211731, + "grad_norm": 0.18169510334355143, + "learning_rate": 5.4166666666666664e-05, + "loss": 0.8408, + "step": 91 + }, + { + "epoch": 0.008226037195994278, + "grad_norm": 0.17012223817628744, + "learning_rate": 5.4761904761904766e-05, + "loss": 0.7991, + "step": 92 + }, + { + "epoch": 0.008315450643776824, + "grad_norm": 0.18563427917782419, + "learning_rate": 5.535714285714286e-05, + "loss": 0.7906, + "step": 93 + }, + { + "epoch": 0.008404864091559371, + "grad_norm": 0.17225942019969434, + "learning_rate": 5.595238095238096e-05, + "loss": 0.7654, + "step": 94 + }, + { + "epoch": 0.008494277539341916, + "grad_norm": 0.17376816272023407, + "learning_rate": 5.6547619047619046e-05, + "loss": 0.8244, + "step": 95 + }, + { + "epoch": 0.008583690987124463, + "grad_norm": 0.21917194307966617, + "learning_rate": 5.714285714285714e-05, + "loss": 0.8001, + "step": 96 + }, + { + "epoch": 0.00867310443490701, + "grad_norm": 0.18180746515079294, + "learning_rate": 5.773809523809524e-05, + "loss": 0.8128, + "step": 97 + }, + { + "epoch": 0.008762517882689556, + "grad_norm": 0.20005199910410984, + "learning_rate": 5.833333333333334e-05, + "loss": 0.7967, + "step": 98 + }, + { + "epoch": 0.008851931330472103, + "grad_norm": 0.15169527602335664, + "learning_rate": 5.8928571428571435e-05, + "loss": 0.7668, + "step": 99 + }, + { + "epoch": 0.00894134477825465, + "grad_norm": 0.2389662121203403, + "learning_rate": 5.9523809523809524e-05, + "loss": 0.8178, + "step": 100 + }, + { + "epoch": 0.009030758226037196, + "grad_norm": 0.14873395086486962, + "learning_rate": 6.011904761904762e-05, + "loss": 0.7709, + "step": 101 + }, + { + "epoch": 0.009120171673819743, + "grad_norm": 0.1761574023896183, + "learning_rate": 6.0714285714285715e-05, + "loss": 0.7572, + "step": 102 + }, + { + "epoch": 0.009209585121602288, + "grad_norm": 0.17691798480463905, + "learning_rate": 6.130952380952381e-05, + "loss": 0.8129, + "step": 103 + }, + { + "epoch": 0.009298998569384835, + "grad_norm": 0.168096947131579, + "learning_rate": 6.19047619047619e-05, + "loss": 0.7959, + "step": 104 + }, + { + "epoch": 0.009388412017167383, + "grad_norm": 0.24631118007303146, + "learning_rate": 6.25e-05, + "loss": 0.7926, + "step": 105 + }, + { + "epoch": 0.009477825464949928, + "grad_norm": 0.17895614019746353, + "learning_rate": 6.30952380952381e-05, + "loss": 0.8233, + "step": 106 + }, + { + "epoch": 0.009567238912732475, + "grad_norm": 0.23362379710538378, + "learning_rate": 6.369047619047619e-05, + "loss": 0.7752, + "step": 107 + }, + { + "epoch": 0.009656652360515022, + "grad_norm": 0.16748138856750122, + "learning_rate": 6.428571428571429e-05, + "loss": 0.7857, + "step": 108 + }, + { + "epoch": 0.009746065808297568, + "grad_norm": 0.23204034117548553, + "learning_rate": 6.488095238095238e-05, + "loss": 0.7682, + "step": 109 + }, + { + "epoch": 0.009835479256080115, + "grad_norm": 0.1545960086280794, + "learning_rate": 6.547619047619048e-05, + "loss": 0.7758, + "step": 110 + }, + { + "epoch": 0.00992489270386266, + "grad_norm": 0.21798199856815906, + "learning_rate": 6.607142857142857e-05, + "loss": 0.7756, + "step": 111 + }, + { + "epoch": 0.010014306151645207, + "grad_norm": 0.16778287071881462, + "learning_rate": 6.666666666666667e-05, + "loss": 0.7671, + "step": 112 + }, + { + "epoch": 0.010103719599427755, + "grad_norm": 0.21341237976197366, + "learning_rate": 6.726190476190477e-05, + "loss": 0.7855, + "step": 113 + }, + { + "epoch": 0.0101931330472103, + "grad_norm": 0.17468839458442748, + "learning_rate": 6.785714285714286e-05, + "loss": 0.8414, + "step": 114 + }, + { + "epoch": 0.010282546494992847, + "grad_norm": 0.18019131476080916, + "learning_rate": 6.845238095238096e-05, + "loss": 0.7435, + "step": 115 + }, + { + "epoch": 0.010371959942775394, + "grad_norm": 0.17819920408953274, + "learning_rate": 6.904761904761905e-05, + "loss": 0.7618, + "step": 116 + }, + { + "epoch": 0.01046137339055794, + "grad_norm": 0.18033638445602337, + "learning_rate": 6.964285714285715e-05, + "loss": 0.7624, + "step": 117 + }, + { + "epoch": 0.010550786838340487, + "grad_norm": 0.171141331701975, + "learning_rate": 7.023809523809524e-05, + "loss": 0.8002, + "step": 118 + }, + { + "epoch": 0.010640200286123032, + "grad_norm": 0.18903040219927092, + "learning_rate": 7.083333333333334e-05, + "loss": 0.7907, + "step": 119 + }, + { + "epoch": 0.01072961373390558, + "grad_norm": 0.19237615230274016, + "learning_rate": 7.142857142857143e-05, + "loss": 0.768, + "step": 120 + }, + { + "epoch": 0.010819027181688127, + "grad_norm": 0.15731317822028204, + "learning_rate": 7.202380952380953e-05, + "loss": 0.7763, + "step": 121 + }, + { + "epoch": 0.010908440629470672, + "grad_norm": 0.23985268511464525, + "learning_rate": 7.261904761904762e-05, + "loss": 0.7865, + "step": 122 + }, + { + "epoch": 0.010997854077253219, + "grad_norm": 0.14876732710449098, + "learning_rate": 7.321428571428571e-05, + "loss": 0.7682, + "step": 123 + }, + { + "epoch": 0.011087267525035766, + "grad_norm": 0.1985793834549767, + "learning_rate": 7.380952380952382e-05, + "loss": 0.7932, + "step": 124 + }, + { + "epoch": 0.011176680972818312, + "grad_norm": 0.32763442543319676, + "learning_rate": 7.440476190476191e-05, + "loss": 0.8115, + "step": 125 + }, + { + "epoch": 0.011266094420600859, + "grad_norm": 0.23406398825030167, + "learning_rate": 7.500000000000001e-05, + "loss": 0.804, + "step": 126 + }, + { + "epoch": 0.011355507868383404, + "grad_norm": 0.19650976314139115, + "learning_rate": 7.55952380952381e-05, + "loss": 0.7518, + "step": 127 + }, + { + "epoch": 0.011444921316165951, + "grad_norm": 0.15761760299962, + "learning_rate": 7.619047619047618e-05, + "loss": 0.7808, + "step": 128 + }, + { + "epoch": 0.011534334763948498, + "grad_norm": 0.16893751186875686, + "learning_rate": 7.67857142857143e-05, + "loss": 0.77, + "step": 129 + }, + { + "epoch": 0.011623748211731044, + "grad_norm": 0.17260454584859253, + "learning_rate": 7.738095238095239e-05, + "loss": 0.743, + "step": 130 + }, + { + "epoch": 0.011713161659513591, + "grad_norm": 0.17614247486193257, + "learning_rate": 7.797619047619048e-05, + "loss": 0.7638, + "step": 131 + }, + { + "epoch": 0.011802575107296138, + "grad_norm": 0.14698941077706276, + "learning_rate": 7.857142857142858e-05, + "loss": 0.7373, + "step": 132 + }, + { + "epoch": 0.011891988555078684, + "grad_norm": 0.1772201233818989, + "learning_rate": 7.916666666666666e-05, + "loss": 0.7804, + "step": 133 + }, + { + "epoch": 0.01198140200286123, + "grad_norm": 0.17407716135118306, + "learning_rate": 7.976190476190477e-05, + "loss": 0.7397, + "step": 134 + }, + { + "epoch": 0.012070815450643776, + "grad_norm": 0.17745904659745315, + "learning_rate": 8.035714285714287e-05, + "loss": 0.8013, + "step": 135 + }, + { + "epoch": 0.012160228898426323, + "grad_norm": 0.1535171720026128, + "learning_rate": 8.095238095238096e-05, + "loss": 0.8106, + "step": 136 + }, + { + "epoch": 0.01224964234620887, + "grad_norm": 0.1593318191584122, + "learning_rate": 8.154761904761904e-05, + "loss": 0.7103, + "step": 137 + }, + { + "epoch": 0.012339055793991416, + "grad_norm": 0.14342805594124658, + "learning_rate": 8.214285714285714e-05, + "loss": 0.645, + "step": 138 + }, + { + "epoch": 0.012428469241773963, + "grad_norm": 0.18719519821651992, + "learning_rate": 8.273809523809524e-05, + "loss": 0.7738, + "step": 139 + }, + { + "epoch": 0.01251788268955651, + "grad_norm": 0.1662120219173177, + "learning_rate": 8.333333333333334e-05, + "loss": 0.7391, + "step": 140 + }, + { + "epoch": 0.012607296137339056, + "grad_norm": 0.14134491930782606, + "learning_rate": 8.392857142857144e-05, + "loss": 0.7914, + "step": 141 + }, + { + "epoch": 0.012696709585121603, + "grad_norm": 0.16897230577050237, + "learning_rate": 8.452380952380952e-05, + "loss": 0.7615, + "step": 142 + }, + { + "epoch": 0.012786123032904148, + "grad_norm": 0.17537233236504932, + "learning_rate": 8.511904761904762e-05, + "loss": 0.7576, + "step": 143 + }, + { + "epoch": 0.012875536480686695, + "grad_norm": 0.18799852659279506, + "learning_rate": 8.571428571428571e-05, + "loss": 0.8037, + "step": 144 + }, + { + "epoch": 0.012964949928469242, + "grad_norm": 0.1518570155878049, + "learning_rate": 8.630952380952382e-05, + "loss": 0.7711, + "step": 145 + }, + { + "epoch": 0.013054363376251788, + "grad_norm": 0.17727032282377775, + "learning_rate": 8.690476190476192e-05, + "loss": 0.8098, + "step": 146 + }, + { + "epoch": 0.013143776824034335, + "grad_norm": 0.17414219903724018, + "learning_rate": 8.75e-05, + "loss": 0.7972, + "step": 147 + }, + { + "epoch": 0.013233190271816882, + "grad_norm": 0.15792901215114324, + "learning_rate": 8.80952380952381e-05, + "loss": 0.7381, + "step": 148 + }, + { + "epoch": 0.013322603719599427, + "grad_norm": 0.18175687981826458, + "learning_rate": 8.869047619047619e-05, + "loss": 0.7836, + "step": 149 + }, + { + "epoch": 0.013412017167381975, + "grad_norm": 0.1726086356122119, + "learning_rate": 8.92857142857143e-05, + "loss": 0.7402, + "step": 150 + }, + { + "epoch": 0.01350143061516452, + "grad_norm": 0.16330066573775154, + "learning_rate": 8.988095238095238e-05, + "loss": 0.7457, + "step": 151 + }, + { + "epoch": 0.013590844062947067, + "grad_norm": 0.16480086942789293, + "learning_rate": 9.047619047619048e-05, + "loss": 0.7904, + "step": 152 + }, + { + "epoch": 0.013680257510729614, + "grad_norm": 0.18112710659089498, + "learning_rate": 9.107142857142857e-05, + "loss": 0.8027, + "step": 153 + }, + { + "epoch": 0.01376967095851216, + "grad_norm": 0.16362857970468397, + "learning_rate": 9.166666666666667e-05, + "loss": 0.7663, + "step": 154 + }, + { + "epoch": 0.013859084406294707, + "grad_norm": 0.2227190071433268, + "learning_rate": 9.226190476190478e-05, + "loss": 0.7492, + "step": 155 + }, + { + "epoch": 0.013948497854077254, + "grad_norm": 0.17637683328908896, + "learning_rate": 9.285714285714286e-05, + "loss": 0.7728, + "step": 156 + }, + { + "epoch": 0.0140379113018598, + "grad_norm": 0.1576160332855807, + "learning_rate": 9.345238095238095e-05, + "loss": 0.7904, + "step": 157 + }, + { + "epoch": 0.014127324749642347, + "grad_norm": 0.17909184941756384, + "learning_rate": 9.404761904761905e-05, + "loss": 0.7686, + "step": 158 + }, + { + "epoch": 0.014216738197424892, + "grad_norm": 0.14298697022242737, + "learning_rate": 9.464285714285715e-05, + "loss": 0.7327, + "step": 159 + }, + { + "epoch": 0.01430615164520744, + "grad_norm": 0.21191307246217986, + "learning_rate": 9.523809523809524e-05, + "loss": 0.7222, + "step": 160 + }, + { + "epoch": 0.014395565092989986, + "grad_norm": 0.16303806902992754, + "learning_rate": 9.583333333333334e-05, + "loss": 0.7497, + "step": 161 + }, + { + "epoch": 0.014484978540772532, + "grad_norm": 0.1460367472716386, + "learning_rate": 9.642857142857143e-05, + "loss": 0.7737, + "step": 162 + }, + { + "epoch": 0.014574391988555079, + "grad_norm": 0.15148809123778514, + "learning_rate": 9.702380952380953e-05, + "loss": 0.7197, + "step": 163 + }, + { + "epoch": 0.014663805436337626, + "grad_norm": 0.16421379071349437, + "learning_rate": 9.761904761904762e-05, + "loss": 0.7624, + "step": 164 + }, + { + "epoch": 0.014753218884120171, + "grad_norm": 0.10176382753928688, + "learning_rate": 9.821428571428572e-05, + "loss": 0.5922, + "step": 165 + }, + { + "epoch": 0.014842632331902719, + "grad_norm": 0.26185552050855365, + "learning_rate": 9.880952380952381e-05, + "loss": 0.7766, + "step": 166 + }, + { + "epoch": 0.014932045779685264, + "grad_norm": 0.2614366868896481, + "learning_rate": 9.940476190476191e-05, + "loss": 0.7158, + "step": 167 + }, + { + "epoch": 0.015021459227467811, + "grad_norm": 0.15394066589899832, + "learning_rate": 0.0001, + "loss": 0.784, + "step": 168 + }, + { + "epoch": 0.015110872675250358, + "grad_norm": 0.1850265292629983, + "learning_rate": 0.0001005952380952381, + "loss": 0.764, + "step": 169 + }, + { + "epoch": 0.015200286123032904, + "grad_norm": 0.1452511414413871, + "learning_rate": 0.0001011904761904762, + "loss": 0.7851, + "step": 170 + }, + { + "epoch": 0.01528969957081545, + "grad_norm": 0.14718909459065663, + "learning_rate": 0.00010178571428571428, + "loss": 0.796, + "step": 171 + }, + { + "epoch": 0.015379113018597998, + "grad_norm": 0.1754010351094989, + "learning_rate": 0.00010238095238095237, + "loss": 0.7514, + "step": 172 + }, + { + "epoch": 0.015468526466380543, + "grad_norm": 0.16748693121212316, + "learning_rate": 0.00010297619047619047, + "loss": 0.7594, + "step": 173 + }, + { + "epoch": 0.01555793991416309, + "grad_norm": 0.16686144193725502, + "learning_rate": 0.00010357142857142859, + "loss": 0.7881, + "step": 174 + }, + { + "epoch": 0.015647353361945636, + "grad_norm": 0.14552816966908122, + "learning_rate": 0.00010416666666666667, + "loss": 0.7517, + "step": 175 + }, + { + "epoch": 0.015736766809728183, + "grad_norm": 0.2222350748814435, + "learning_rate": 0.00010476190476190477, + "loss": 0.7937, + "step": 176 + }, + { + "epoch": 0.01582618025751073, + "grad_norm": 0.13920772954028826, + "learning_rate": 0.00010535714285714286, + "loss": 0.7363, + "step": 177 + }, + { + "epoch": 0.015915593705293277, + "grad_norm": 0.17282846465085883, + "learning_rate": 0.00010595238095238096, + "loss": 0.7699, + "step": 178 + }, + { + "epoch": 0.01600500715307582, + "grad_norm": 0.15869090425441293, + "learning_rate": 0.00010654761904761906, + "loss": 0.8288, + "step": 179 + }, + { + "epoch": 0.016094420600858368, + "grad_norm": 0.1490580724333226, + "learning_rate": 0.00010714285714285715, + "loss": 0.7718, + "step": 180 + }, + { + "epoch": 0.016183834048640915, + "grad_norm": 0.16359011397048406, + "learning_rate": 0.00010773809523809523, + "loss": 0.7159, + "step": 181 + }, + { + "epoch": 0.016273247496423462, + "grad_norm": 0.12972448336040554, + "learning_rate": 0.00010833333333333333, + "loss": 0.7296, + "step": 182 + }, + { + "epoch": 0.01636266094420601, + "grad_norm": 0.15031834709630368, + "learning_rate": 0.00010892857142857142, + "loss": 0.7555, + "step": 183 + }, + { + "epoch": 0.016452074391988557, + "grad_norm": 0.13502257583952706, + "learning_rate": 0.00010952380952380953, + "loss": 0.754, + "step": 184 + }, + { + "epoch": 0.0165414878397711, + "grad_norm": 0.16878004005967365, + "learning_rate": 0.00011011904761904763, + "loss": 0.7901, + "step": 185 + }, + { + "epoch": 0.016630901287553648, + "grad_norm": 0.1290945180853397, + "learning_rate": 0.00011071428571428572, + "loss": 0.7395, + "step": 186 + }, + { + "epoch": 0.016720314735336195, + "grad_norm": 0.14631787575822042, + "learning_rate": 0.00011130952380952382, + "loss": 0.7634, + "step": 187 + }, + { + "epoch": 0.016809728183118742, + "grad_norm": 0.15500834830132287, + "learning_rate": 0.00011190476190476191, + "loss": 0.7743, + "step": 188 + }, + { + "epoch": 0.01689914163090129, + "grad_norm": 0.16803538873212687, + "learning_rate": 0.00011250000000000001, + "loss": 0.7643, + "step": 189 + }, + { + "epoch": 0.016988555078683833, + "grad_norm": 0.13879424831565437, + "learning_rate": 0.00011309523809523809, + "loss": 0.7583, + "step": 190 + }, + { + "epoch": 0.01707796852646638, + "grad_norm": 0.14004271084731082, + "learning_rate": 0.00011369047619047619, + "loss": 0.739, + "step": 191 + }, + { + "epoch": 0.017167381974248927, + "grad_norm": 0.13555028171656225, + "learning_rate": 0.00011428571428571428, + "loss": 0.7463, + "step": 192 + }, + { + "epoch": 0.017256795422031474, + "grad_norm": 0.1675912355593016, + "learning_rate": 0.00011488095238095238, + "loss": 0.7629, + "step": 193 + }, + { + "epoch": 0.01734620886981402, + "grad_norm": 0.16165838220836778, + "learning_rate": 0.00011547619047619047, + "loss": 0.7731, + "step": 194 + }, + { + "epoch": 0.017435622317596565, + "grad_norm": 0.1434507582927404, + "learning_rate": 0.00011607142857142858, + "loss": 0.792, + "step": 195 + }, + { + "epoch": 0.017525035765379112, + "grad_norm": 0.18633817960712476, + "learning_rate": 0.00011666666666666668, + "loss": 0.7653, + "step": 196 + }, + { + "epoch": 0.01761444921316166, + "grad_norm": 0.21252744011539074, + "learning_rate": 0.00011726190476190477, + "loss": 0.7829, + "step": 197 + }, + { + "epoch": 0.017703862660944206, + "grad_norm": 0.14382091705863137, + "learning_rate": 0.00011785714285714287, + "loss": 0.7721, + "step": 198 + }, + { + "epoch": 0.017793276108726754, + "grad_norm": 0.2652428532700769, + "learning_rate": 0.00011845238095238097, + "loss": 0.7532, + "step": 199 + }, + { + "epoch": 0.0178826895565093, + "grad_norm": 0.18268525037569788, + "learning_rate": 0.00011904761904761905, + "loss": 0.8034, + "step": 200 + }, + { + "epoch": 0.017972103004291844, + "grad_norm": 0.18653152328449307, + "learning_rate": 0.00011964285714285714, + "loss": 0.8051, + "step": 201 + }, + { + "epoch": 0.01806151645207439, + "grad_norm": 0.15353148100532757, + "learning_rate": 0.00012023809523809524, + "loss": 0.7642, + "step": 202 + }, + { + "epoch": 0.01815092989985694, + "grad_norm": 0.19892905387542276, + "learning_rate": 0.00012083333333333333, + "loss": 0.7444, + "step": 203 + }, + { + "epoch": 0.018240343347639486, + "grad_norm": 0.14355075903277686, + "learning_rate": 0.00012142857142857143, + "loss": 0.7623, + "step": 204 + }, + { + "epoch": 0.018329756795422033, + "grad_norm": 0.13585580759613153, + "learning_rate": 0.00012202380952380954, + "loss": 0.7372, + "step": 205 + }, + { + "epoch": 0.018419170243204577, + "grad_norm": 0.14399557699452184, + "learning_rate": 0.00012261904761904762, + "loss": 0.7462, + "step": 206 + }, + { + "epoch": 0.018508583690987124, + "grad_norm": 0.1326253554828262, + "learning_rate": 0.00012321428571428572, + "loss": 0.7477, + "step": 207 + }, + { + "epoch": 0.01859799713876967, + "grad_norm": 0.17278900766301702, + "learning_rate": 0.0001238095238095238, + "loss": 0.7826, + "step": 208 + }, + { + "epoch": 0.018687410586552218, + "grad_norm": 0.14365542452358232, + "learning_rate": 0.0001244047619047619, + "loss": 0.7682, + "step": 209 + }, + { + "epoch": 0.018776824034334765, + "grad_norm": 0.15219399423069133, + "learning_rate": 0.000125, + "loss": 0.7325, + "step": 210 + }, + { + "epoch": 0.01886623748211731, + "grad_norm": 0.13339308355995036, + "learning_rate": 0.0001255952380952381, + "loss": 0.782, + "step": 211 + }, + { + "epoch": 0.018955650929899856, + "grad_norm": 0.14674564679550597, + "learning_rate": 0.0001261904761904762, + "loss": 0.7466, + "step": 212 + }, + { + "epoch": 0.019045064377682403, + "grad_norm": 0.15980867870274504, + "learning_rate": 0.0001267857142857143, + "loss": 0.7551, + "step": 213 + }, + { + "epoch": 0.01913447782546495, + "grad_norm": 0.15138941779069306, + "learning_rate": 0.00012738095238095238, + "loss": 0.7927, + "step": 214 + }, + { + "epoch": 0.019223891273247497, + "grad_norm": 0.1345826953294758, + "learning_rate": 0.00012797619047619048, + "loss": 0.7256, + "step": 215 + }, + { + "epoch": 0.019313304721030045, + "grad_norm": 0.12870754111693739, + "learning_rate": 0.00012857142857142858, + "loss": 0.737, + "step": 216 + }, + { + "epoch": 0.019402718168812588, + "grad_norm": 0.14477400313402036, + "learning_rate": 0.00012916666666666667, + "loss": 0.7839, + "step": 217 + }, + { + "epoch": 0.019492131616595135, + "grad_norm": 0.12399406547060428, + "learning_rate": 0.00012976190476190477, + "loss": 0.7541, + "step": 218 + }, + { + "epoch": 0.019581545064377683, + "grad_norm": 0.1372388223514834, + "learning_rate": 0.00013035714285714286, + "loss": 0.7394, + "step": 219 + }, + { + "epoch": 0.01967095851216023, + "grad_norm": 0.13525017297394623, + "learning_rate": 0.00013095238095238096, + "loss": 0.7518, + "step": 220 + }, + { + "epoch": 0.019760371959942777, + "grad_norm": 0.13681830037294287, + "learning_rate": 0.00013154761904761905, + "loss": 0.7706, + "step": 221 + }, + { + "epoch": 0.01984978540772532, + "grad_norm": 0.13900443331425666, + "learning_rate": 0.00013214285714285715, + "loss": 0.7703, + "step": 222 + }, + { + "epoch": 0.019939198855507868, + "grad_norm": 0.12867822376285218, + "learning_rate": 0.00013273809523809524, + "loss": 0.7447, + "step": 223 + }, + { + "epoch": 0.020028612303290415, + "grad_norm": 0.164151071936308, + "learning_rate": 0.00013333333333333334, + "loss": 0.7184, + "step": 224 + }, + { + "epoch": 0.020118025751072962, + "grad_norm": 0.14524777621915758, + "learning_rate": 0.00013392857142857144, + "loss": 0.7259, + "step": 225 + }, + { + "epoch": 0.02020743919885551, + "grad_norm": 0.17705695063972163, + "learning_rate": 0.00013452380952380953, + "loss": 0.7269, + "step": 226 + }, + { + "epoch": 0.020296852646638053, + "grad_norm": 0.15944199037240572, + "learning_rate": 0.00013511904761904763, + "loss": 0.7591, + "step": 227 + }, + { + "epoch": 0.0203862660944206, + "grad_norm": 0.11975749634032441, + "learning_rate": 0.00013571428571428572, + "loss": 0.7276, + "step": 228 + }, + { + "epoch": 0.020475679542203147, + "grad_norm": 0.13868911221583805, + "learning_rate": 0.00013630952380952382, + "loss": 0.7214, + "step": 229 + }, + { + "epoch": 0.020565092989985694, + "grad_norm": 0.15239812873941141, + "learning_rate": 0.0001369047619047619, + "loss": 0.7617, + "step": 230 + }, + { + "epoch": 0.02065450643776824, + "grad_norm": 0.11488162929031359, + "learning_rate": 0.0001375, + "loss": 0.6829, + "step": 231 + }, + { + "epoch": 0.02074391988555079, + "grad_norm": 0.15665216257363299, + "learning_rate": 0.0001380952380952381, + "loss": 0.7769, + "step": 232 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 0.13993832609769888, + "learning_rate": 0.0001386904761904762, + "loss": 0.7484, + "step": 233 + }, + { + "epoch": 0.02092274678111588, + "grad_norm": 0.127685378790649, + "learning_rate": 0.0001392857142857143, + "loss": 0.7511, + "step": 234 + }, + { + "epoch": 0.021012160228898426, + "grad_norm": 0.17982808634372074, + "learning_rate": 0.0001398809523809524, + "loss": 0.7472, + "step": 235 + }, + { + "epoch": 0.021101573676680974, + "grad_norm": 0.1368472664157066, + "learning_rate": 0.00014047619047619049, + "loss": 0.7555, + "step": 236 + }, + { + "epoch": 0.02119098712446352, + "grad_norm": 0.14175301439160032, + "learning_rate": 0.00014107142857142858, + "loss": 0.7833, + "step": 237 + }, + { + "epoch": 0.021280400572246064, + "grad_norm": 0.14709124865170675, + "learning_rate": 0.00014166666666666668, + "loss": 0.7696, + "step": 238 + }, + { + "epoch": 0.02136981402002861, + "grad_norm": 0.1363779293539838, + "learning_rate": 0.00014226190476190477, + "loss": 0.7481, + "step": 239 + }, + { + "epoch": 0.02145922746781116, + "grad_norm": 0.12377755606538512, + "learning_rate": 0.00014285714285714287, + "loss": 0.7726, + "step": 240 + }, + { + "epoch": 0.021548640915593706, + "grad_norm": 0.12055665175062664, + "learning_rate": 0.00014345238095238096, + "loss": 0.7216, + "step": 241 + }, + { + "epoch": 0.021638054363376253, + "grad_norm": 0.13390390671835695, + "learning_rate": 0.00014404761904761906, + "loss": 0.7328, + "step": 242 + }, + { + "epoch": 0.021727467811158797, + "grad_norm": 0.11921221367204354, + "learning_rate": 0.00014464285714285715, + "loss": 0.7351, + "step": 243 + }, + { + "epoch": 0.021816881258941344, + "grad_norm": 0.13434828029131995, + "learning_rate": 0.00014523809523809525, + "loss": 0.7591, + "step": 244 + }, + { + "epoch": 0.02190629470672389, + "grad_norm": 0.11832129438128867, + "learning_rate": 0.00014583333333333335, + "loss": 0.7427, + "step": 245 + }, + { + "epoch": 0.021995708154506438, + "grad_norm": 0.1302292281400204, + "learning_rate": 0.00014642857142857141, + "loss": 0.7508, + "step": 246 + }, + { + "epoch": 0.022085121602288985, + "grad_norm": 0.13273841653067733, + "learning_rate": 0.00014702380952380954, + "loss": 0.7617, + "step": 247 + }, + { + "epoch": 0.022174535050071532, + "grad_norm": 0.14466817514627803, + "learning_rate": 0.00014761904761904763, + "loss": 0.7516, + "step": 248 + }, + { + "epoch": 0.022263948497854076, + "grad_norm": 0.1547161698389477, + "learning_rate": 0.00014821428571428573, + "loss": 0.7581, + "step": 249 + }, + { + "epoch": 0.022353361945636623, + "grad_norm": 0.14425153435402596, + "learning_rate": 0.00014880952380952382, + "loss": 0.7811, + "step": 250 + }, + { + "epoch": 0.02244277539341917, + "grad_norm": 0.134406746653966, + "learning_rate": 0.00014940476190476192, + "loss": 0.7514, + "step": 251 + }, + { + "epoch": 0.022532188841201718, + "grad_norm": 0.15328863204776966, + "learning_rate": 0.00015000000000000001, + "loss": 0.7181, + "step": 252 + }, + { + "epoch": 0.022621602288984265, + "grad_norm": 0.12818812374943625, + "learning_rate": 0.0001505952380952381, + "loss": 0.7199, + "step": 253 + }, + { + "epoch": 0.02271101573676681, + "grad_norm": 0.19004917580528, + "learning_rate": 0.0001511904761904762, + "loss": 0.7299, + "step": 254 + }, + { + "epoch": 0.022800429184549355, + "grad_norm": 0.21974812268282254, + "learning_rate": 0.00015178571428571427, + "loss": 0.7231, + "step": 255 + }, + { + "epoch": 0.022889842632331903, + "grad_norm": 0.1777804365075326, + "learning_rate": 0.00015238095238095237, + "loss": 0.727, + "step": 256 + }, + { + "epoch": 0.02297925608011445, + "grad_norm": 0.15233354797189486, + "learning_rate": 0.00015297619047619046, + "loss": 0.7437, + "step": 257 + }, + { + "epoch": 0.023068669527896997, + "grad_norm": 0.13355024711128596, + "learning_rate": 0.0001535714285714286, + "loss": 0.7892, + "step": 258 + }, + { + "epoch": 0.02315808297567954, + "grad_norm": 0.1343239661671809, + "learning_rate": 0.00015416666666666668, + "loss": 0.774, + "step": 259 + }, + { + "epoch": 0.023247496423462088, + "grad_norm": 0.15193908525084832, + "learning_rate": 0.00015476190476190478, + "loss": 0.7875, + "step": 260 + }, + { + "epoch": 0.023336909871244635, + "grad_norm": 0.1175705792788769, + "learning_rate": 0.00015535714285714287, + "loss": 0.6754, + "step": 261 + }, + { + "epoch": 0.023426323319027182, + "grad_norm": 0.10718814299286035, + "learning_rate": 0.00015595238095238097, + "loss": 0.5795, + "step": 262 + }, + { + "epoch": 0.02351573676680973, + "grad_norm": 0.15892523865551067, + "learning_rate": 0.00015654761904761906, + "loss": 0.7616, + "step": 263 + }, + { + "epoch": 0.023605150214592276, + "grad_norm": 0.12860029557575955, + "learning_rate": 0.00015714285714285716, + "loss": 0.7921, + "step": 264 + }, + { + "epoch": 0.02369456366237482, + "grad_norm": 0.13865212279684797, + "learning_rate": 0.00015773809523809523, + "loss": 0.7349, + "step": 265 + }, + { + "epoch": 0.023783977110157367, + "grad_norm": 0.1518123340386917, + "learning_rate": 0.00015833333333333332, + "loss": 0.7553, + "step": 266 + }, + { + "epoch": 0.023873390557939914, + "grad_norm": 0.14206997699927346, + "learning_rate": 0.00015892857142857142, + "loss": 0.7383, + "step": 267 + }, + { + "epoch": 0.02396280400572246, + "grad_norm": 0.1345768443614876, + "learning_rate": 0.00015952380952380954, + "loss": 0.7418, + "step": 268 + }, + { + "epoch": 0.02405221745350501, + "grad_norm": 0.15508100643560638, + "learning_rate": 0.00016011904761904764, + "loss": 0.7927, + "step": 269 + }, + { + "epoch": 0.024141630901287552, + "grad_norm": 0.11255924917657936, + "learning_rate": 0.00016071428571428573, + "loss": 0.7298, + "step": 270 + }, + { + "epoch": 0.0242310443490701, + "grad_norm": 0.11921930865977283, + "learning_rate": 0.00016130952380952383, + "loss": 0.7293, + "step": 271 + }, + { + "epoch": 0.024320457796852647, + "grad_norm": 0.14236696578734664, + "learning_rate": 0.00016190476190476192, + "loss": 0.7423, + "step": 272 + }, + { + "epoch": 0.024409871244635194, + "grad_norm": 0.12062643919634249, + "learning_rate": 0.00016250000000000002, + "loss": 0.73, + "step": 273 + }, + { + "epoch": 0.02449928469241774, + "grad_norm": 0.17336289407735728, + "learning_rate": 0.0001630952380952381, + "loss": 0.7427, + "step": 274 + }, + { + "epoch": 0.024588698140200285, + "grad_norm": 0.15048904194452006, + "learning_rate": 0.00016369047619047618, + "loss": 0.7773, + "step": 275 + }, + { + "epoch": 0.02467811158798283, + "grad_norm": 0.14676789709707452, + "learning_rate": 0.00016428571428571428, + "loss": 0.7496, + "step": 276 + }, + { + "epoch": 0.02476752503576538, + "grad_norm": 0.11894198126161419, + "learning_rate": 0.00016488095238095237, + "loss": 0.6093, + "step": 277 + }, + { + "epoch": 0.024856938483547926, + "grad_norm": 0.1798882267456007, + "learning_rate": 0.00016547619047619047, + "loss": 0.7289, + "step": 278 + }, + { + "epoch": 0.024946351931330473, + "grad_norm": 0.12978484197543982, + "learning_rate": 0.0001660714285714286, + "loss": 0.726, + "step": 279 + }, + { + "epoch": 0.02503576537911302, + "grad_norm": 0.1377057349275337, + "learning_rate": 0.0001666666666666667, + "loss": 0.7301, + "step": 280 + }, + { + "epoch": 0.025125178826895564, + "grad_norm": 0.13741857628459928, + "learning_rate": 0.00016726190476190478, + "loss": 0.7278, + "step": 281 + }, + { + "epoch": 0.02521459227467811, + "grad_norm": 0.13565733823758808, + "learning_rate": 0.00016785714285714288, + "loss": 0.7826, + "step": 282 + }, + { + "epoch": 0.025304005722460658, + "grad_norm": 0.11485511067610578, + "learning_rate": 0.00016845238095238097, + "loss": 0.5672, + "step": 283 + }, + { + "epoch": 0.025393419170243205, + "grad_norm": 0.7713111211622931, + "learning_rate": 0.00016904761904761904, + "loss": 0.7682, + "step": 284 + }, + { + "epoch": 0.025482832618025753, + "grad_norm": 0.8343624952858678, + "learning_rate": 0.00016964285714285714, + "loss": 0.8063, + "step": 285 + }, + { + "epoch": 0.025572246065808296, + "grad_norm": 0.16519321464496228, + "learning_rate": 0.00017023809523809523, + "loss": 0.752, + "step": 286 + }, + { + "epoch": 0.025661659513590843, + "grad_norm": 0.1750689262475944, + "learning_rate": 0.00017083333333333333, + "loss": 0.7426, + "step": 287 + }, + { + "epoch": 0.02575107296137339, + "grad_norm": 0.17217169142154534, + "learning_rate": 0.00017142857142857143, + "loss": 0.7277, + "step": 288 + }, + { + "epoch": 0.025840486409155938, + "grad_norm": 0.1481485639868935, + "learning_rate": 0.00017202380952380955, + "loss": 0.7859, + "step": 289 + }, + { + "epoch": 0.025929899856938485, + "grad_norm": 0.2010897859872673, + "learning_rate": 0.00017261904761904764, + "loss": 0.7455, + "step": 290 + }, + { + "epoch": 0.02601931330472103, + "grad_norm": 0.15071056661804152, + "learning_rate": 0.00017321428571428574, + "loss": 0.7818, + "step": 291 + }, + { + "epoch": 0.026108726752503576, + "grad_norm": 0.1587892110036088, + "learning_rate": 0.00017380952380952383, + "loss": 0.7573, + "step": 292 + }, + { + "epoch": 0.026198140200286123, + "grad_norm": 0.1915258590182573, + "learning_rate": 0.0001744047619047619, + "loss": 0.7701, + "step": 293 + }, + { + "epoch": 0.02628755364806867, + "grad_norm": 0.14570674382799684, + "learning_rate": 0.000175, + "loss": 0.7692, + "step": 294 + }, + { + "epoch": 0.026376967095851217, + "grad_norm": 0.13339014127427842, + "learning_rate": 0.0001755952380952381, + "loss": 0.7449, + "step": 295 + }, + { + "epoch": 0.026466380543633764, + "grad_norm": 0.1426855768320904, + "learning_rate": 0.0001761904761904762, + "loss": 0.7663, + "step": 296 + }, + { + "epoch": 0.026555793991416308, + "grad_norm": 0.14966820998883643, + "learning_rate": 0.00017678571428571428, + "loss": 0.7706, + "step": 297 + }, + { + "epoch": 0.026645207439198855, + "grad_norm": 0.18835944726326798, + "learning_rate": 0.00017738095238095238, + "loss": 0.7593, + "step": 298 + }, + { + "epoch": 0.026734620886981402, + "grad_norm": 0.21513925701133205, + "learning_rate": 0.00017797619047619048, + "loss": 0.7538, + "step": 299 + }, + { + "epoch": 0.02682403433476395, + "grad_norm": 0.16166025085845018, + "learning_rate": 0.0001785714285714286, + "loss": 0.7436, + "step": 300 + }, + { + "epoch": 0.026913447782546496, + "grad_norm": 0.14886514873637338, + "learning_rate": 0.0001791666666666667, + "loss": 0.7666, + "step": 301 + }, + { + "epoch": 0.02700286123032904, + "grad_norm": 0.1358038479926043, + "learning_rate": 0.00017976190476190476, + "loss": 0.729, + "step": 302 + }, + { + "epoch": 0.027092274678111587, + "grad_norm": 0.1325790424420621, + "learning_rate": 0.00018035714285714286, + "loss": 0.7436, + "step": 303 + }, + { + "epoch": 0.027181688125894134, + "grad_norm": 0.13168074471976537, + "learning_rate": 0.00018095238095238095, + "loss": 0.7243, + "step": 304 + }, + { + "epoch": 0.02727110157367668, + "grad_norm": 0.1362677889787748, + "learning_rate": 0.00018154761904761905, + "loss": 0.8016, + "step": 305 + }, + { + "epoch": 0.02736051502145923, + "grad_norm": 0.1377829661690502, + "learning_rate": 0.00018214285714285714, + "loss": 0.7436, + "step": 306 + }, + { + "epoch": 0.027449928469241772, + "grad_norm": 0.12567032756074942, + "learning_rate": 0.00018273809523809524, + "loss": 0.7512, + "step": 307 + }, + { + "epoch": 0.02753934191702432, + "grad_norm": 0.11063621144784686, + "learning_rate": 0.00018333333333333334, + "loss": 0.5879, + "step": 308 + }, + { + "epoch": 0.027628755364806867, + "grad_norm": 0.1512579715740937, + "learning_rate": 0.00018392857142857143, + "loss": 0.7178, + "step": 309 + }, + { + "epoch": 0.027718168812589414, + "grad_norm": 0.149254831304389, + "learning_rate": 0.00018452380952380955, + "loss": 0.7353, + "step": 310 + }, + { + "epoch": 0.02780758226037196, + "grad_norm": 0.1196630295065358, + "learning_rate": 0.00018511904761904765, + "loss": 0.7536, + "step": 311 + }, + { + "epoch": 0.027896995708154508, + "grad_norm": 0.1407653215490422, + "learning_rate": 0.00018571428571428572, + "loss": 0.7156, + "step": 312 + }, + { + "epoch": 0.027986409155937052, + "grad_norm": 0.10915098902903411, + "learning_rate": 0.0001863095238095238, + "loss": 0.7509, + "step": 313 + }, + { + "epoch": 0.0280758226037196, + "grad_norm": 0.13279387066211473, + "learning_rate": 0.0001869047619047619, + "loss": 0.7283, + "step": 314 + }, + { + "epoch": 0.028165236051502146, + "grad_norm": 0.12376651016133287, + "learning_rate": 0.0001875, + "loss": 0.7507, + "step": 315 + }, + { + "epoch": 0.028254649499284693, + "grad_norm": 0.12883923764847408, + "learning_rate": 0.0001880952380952381, + "loss": 0.7231, + "step": 316 + }, + { + "epoch": 0.02834406294706724, + "grad_norm": 0.11447575986310286, + "learning_rate": 0.0001886904761904762, + "loss": 0.705, + "step": 317 + }, + { + "epoch": 0.028433476394849784, + "grad_norm": 0.1249976731727308, + "learning_rate": 0.0001892857142857143, + "loss": 0.7027, + "step": 318 + }, + { + "epoch": 0.02852288984263233, + "grad_norm": 0.10999765218342103, + "learning_rate": 0.00018988095238095239, + "loss": 0.7104, + "step": 319 + }, + { + "epoch": 0.02861230329041488, + "grad_norm": 0.12301344850162459, + "learning_rate": 0.00019047619047619048, + "loss": 0.7401, + "step": 320 + }, + { + "epoch": 0.028701716738197425, + "grad_norm": 0.1252243321360845, + "learning_rate": 0.00019107142857142858, + "loss": 0.7392, + "step": 321 + }, + { + "epoch": 0.028791130185979973, + "grad_norm": 0.12454563173250499, + "learning_rate": 0.00019166666666666667, + "loss": 0.7661, + "step": 322 + }, + { + "epoch": 0.028880543633762516, + "grad_norm": 0.12261749433007706, + "learning_rate": 0.00019226190476190477, + "loss": 0.7253, + "step": 323 + }, + { + "epoch": 0.028969957081545063, + "grad_norm": 0.11850379769303693, + "learning_rate": 0.00019285714285714286, + "loss": 0.7234, + "step": 324 + }, + { + "epoch": 0.02905937052932761, + "grad_norm": 0.1295802752191494, + "learning_rate": 0.00019345238095238096, + "loss": 0.7289, + "step": 325 + }, + { + "epoch": 0.029148783977110158, + "grad_norm": 0.13315498439785817, + "learning_rate": 0.00019404761904761905, + "loss": 0.7416, + "step": 326 + }, + { + "epoch": 0.029238197424892705, + "grad_norm": 0.1216516605668618, + "learning_rate": 0.00019464285714285715, + "loss": 0.7455, + "step": 327 + }, + { + "epoch": 0.029327610872675252, + "grad_norm": 0.1306333628753076, + "learning_rate": 0.00019523809523809525, + "loss": 0.7493, + "step": 328 + }, + { + "epoch": 0.029417024320457796, + "grad_norm": 0.1264849792341156, + "learning_rate": 0.00019583333333333334, + "loss": 0.732, + "step": 329 + }, + { + "epoch": 0.029506437768240343, + "grad_norm": 0.13762416127180963, + "learning_rate": 0.00019642857142857144, + "loss": 0.7536, + "step": 330 + }, + { + "epoch": 0.02959585121602289, + "grad_norm": 0.1347474086047056, + "learning_rate": 0.00019702380952380953, + "loss": 0.7449, + "step": 331 + }, + { + "epoch": 0.029685264663805437, + "grad_norm": 0.13075389426253128, + "learning_rate": 0.00019761904761904763, + "loss": 0.7389, + "step": 332 + }, + { + "epoch": 0.029774678111587984, + "grad_norm": 0.11050948990280264, + "learning_rate": 0.00019821428571428572, + "loss": 0.7467, + "step": 333 + }, + { + "epoch": 0.029864091559370528, + "grad_norm": 0.13362788949084944, + "learning_rate": 0.00019880952380952382, + "loss": 0.7417, + "step": 334 + }, + { + "epoch": 0.029953505007153075, + "grad_norm": 0.1457666342857604, + "learning_rate": 0.00019940476190476191, + "loss": 0.7521, + "step": 335 + }, + { + "epoch": 0.030042918454935622, + "grad_norm": 0.14138664411889168, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 336 + }, + { + "epoch": 0.03013233190271817, + "grad_norm": 0.11785714439686411, + "learning_rate": 0.0001999999958065604, + "loss": 0.7304, + "step": 337 + }, + { + "epoch": 0.030221745350500717, + "grad_norm": 0.16849518759513168, + "learning_rate": 0.0001999999832262419, + "loss": 0.7129, + "step": 338 + }, + { + "epoch": 0.03031115879828326, + "grad_norm": 0.11432212643920293, + "learning_rate": 0.00019999996225904558, + "loss": 0.7681, + "step": 339 + }, + { + "epoch": 0.030400572246065807, + "grad_norm": 0.12609985616462815, + "learning_rate": 0.00019999993290497318, + "loss": 0.7636, + "step": 340 + }, + { + "epoch": 0.030489985693848354, + "grad_norm": 0.10609146480421928, + "learning_rate": 0.0001999998951640272, + "loss": 0.7171, + "step": 341 + }, + { + "epoch": 0.0305793991416309, + "grad_norm": 0.12391404428244573, + "learning_rate": 0.0001999998490362108, + "loss": 0.7361, + "step": 342 + }, + { + "epoch": 0.03066881258941345, + "grad_norm": 0.11945643593506687, + "learning_rate": 0.0001999997945215278, + "loss": 0.748, + "step": 343 + }, + { + "epoch": 0.030758226037195996, + "grad_norm": 0.1084990872726355, + "learning_rate": 0.0001999997316199828, + "loss": 0.7519, + "step": 344 + }, + { + "epoch": 0.03084763948497854, + "grad_norm": 0.11709964110330885, + "learning_rate": 0.00019999966033158108, + "loss": 0.7253, + "step": 345 + }, + { + "epoch": 0.030937052932761087, + "grad_norm": 0.12160569806623323, + "learning_rate": 0.00019999958065632862, + "loss": 0.7637, + "step": 346 + }, + { + "epoch": 0.031026466380543634, + "grad_norm": 0.11777985221990753, + "learning_rate": 0.0001999994925942321, + "loss": 0.7634, + "step": 347 + }, + { + "epoch": 0.03111587982832618, + "grad_norm": 0.12678261746309777, + "learning_rate": 0.00019999939614529893, + "loss": 0.7524, + "step": 348 + }, + { + "epoch": 0.031205293276108728, + "grad_norm": 0.12412399658073946, + "learning_rate": 0.00019999929130953714, + "loss": 0.74, + "step": 349 + }, + { + "epoch": 0.03129470672389127, + "grad_norm": 0.1136190559699973, + "learning_rate": 0.00019999917808695558, + "loss": 0.7163, + "step": 350 + }, + { + "epoch": 0.03138412017167382, + "grad_norm": 0.11225946185708106, + "learning_rate": 0.0001999990564775637, + "loss": 0.7478, + "step": 351 + }, + { + "epoch": 0.031473533619456366, + "grad_norm": 0.12486316508891385, + "learning_rate": 0.00019999892648137174, + "loss": 0.7581, + "step": 352 + }, + { + "epoch": 0.03156294706723891, + "grad_norm": 0.13456517018042574, + "learning_rate": 0.00019999878809839056, + "loss": 0.7066, + "step": 353 + }, + { + "epoch": 0.03165236051502146, + "grad_norm": 0.17041159039915588, + "learning_rate": 0.0001999986413286318, + "loss": 0.7246, + "step": 354 + }, + { + "epoch": 0.031741773962804004, + "grad_norm": 0.13375548904919324, + "learning_rate": 0.00019999848617210776, + "loss": 0.7798, + "step": 355 + }, + { + "epoch": 0.031831187410586555, + "grad_norm": 0.1359813104709954, + "learning_rate": 0.00019999832262883148, + "loss": 0.7338, + "step": 356 + }, + { + "epoch": 0.0319206008583691, + "grad_norm": 0.1240410304716203, + "learning_rate": 0.00019999815069881663, + "loss": 0.7334, + "step": 357 + }, + { + "epoch": 0.03201001430615164, + "grad_norm": 0.11852151204574862, + "learning_rate": 0.00019999797038207763, + "loss": 0.7716, + "step": 358 + }, + { + "epoch": 0.03209942775393419, + "grad_norm": 0.10674430819539216, + "learning_rate": 0.00019999778167862964, + "loss": 0.7709, + "step": 359 + }, + { + "epoch": 0.032188841201716736, + "grad_norm": 0.1132537116966304, + "learning_rate": 0.00019999758458848847, + "loss": 0.7257, + "step": 360 + }, + { + "epoch": 0.03227825464949929, + "grad_norm": 0.11651479415697885, + "learning_rate": 0.00019999737911167065, + "loss": 0.7718, + "step": 361 + }, + { + "epoch": 0.03236766809728183, + "grad_norm": 0.11980907431116007, + "learning_rate": 0.00019999716524819337, + "loss": 0.72, + "step": 362 + }, + { + "epoch": 0.032457081545064374, + "grad_norm": 0.11844180094563787, + "learning_rate": 0.00019999694299807465, + "loss": 0.7525, + "step": 363 + }, + { + "epoch": 0.032546494992846925, + "grad_norm": 0.12199210710149219, + "learning_rate": 0.0001999967123613331, + "loss": 0.7472, + "step": 364 + }, + { + "epoch": 0.03263590844062947, + "grad_norm": 0.10958483468398363, + "learning_rate": 0.000199996473337988, + "loss": 0.7377, + "step": 365 + }, + { + "epoch": 0.03272532188841202, + "grad_norm": 0.1051986535902932, + "learning_rate": 0.0001999962259280595, + "loss": 0.7464, + "step": 366 + }, + { + "epoch": 0.03281473533619456, + "grad_norm": 0.13866310998657386, + "learning_rate": 0.00019999597013156824, + "loss": 0.7203, + "step": 367 + }, + { + "epoch": 0.032904148783977114, + "grad_norm": 0.11052413444571026, + "learning_rate": 0.00019999570594853575, + "loss": 0.706, + "step": 368 + }, + { + "epoch": 0.03299356223175966, + "grad_norm": 0.11400289552409312, + "learning_rate": 0.0001999954333789842, + "loss": 0.7143, + "step": 369 + }, + { + "epoch": 0.0330829756795422, + "grad_norm": 0.10666746725993716, + "learning_rate": 0.00019999515242293637, + "loss": 0.7497, + "step": 370 + }, + { + "epoch": 0.03317238912732475, + "grad_norm": 0.14031515026503666, + "learning_rate": 0.0001999948630804159, + "loss": 0.7774, + "step": 371 + }, + { + "epoch": 0.033261802575107295, + "grad_norm": 0.11247136326311645, + "learning_rate": 0.000199994565351447, + "loss": 0.7486, + "step": 372 + }, + { + "epoch": 0.033351216022889846, + "grad_norm": 0.13118163131251545, + "learning_rate": 0.00019999425923605468, + "loss": 0.7259, + "step": 373 + }, + { + "epoch": 0.03344062947067239, + "grad_norm": 0.11010111703285963, + "learning_rate": 0.0001999939447342646, + "loss": 0.7264, + "step": 374 + }, + { + "epoch": 0.03353004291845493, + "grad_norm": 0.10819477834963062, + "learning_rate": 0.00019999362184610316, + "loss": 0.7017, + "step": 375 + }, + { + "epoch": 0.033619456366237484, + "grad_norm": 0.11770072883972092, + "learning_rate": 0.00019999329057159736, + "loss": 0.753, + "step": 376 + }, + { + "epoch": 0.03370886981402003, + "grad_norm": 0.11815679336757667, + "learning_rate": 0.0001999929509107751, + "loss": 0.7577, + "step": 377 + }, + { + "epoch": 0.03379828326180258, + "grad_norm": 0.12141837188323223, + "learning_rate": 0.00019999260286366477, + "loss": 0.7186, + "step": 378 + }, + { + "epoch": 0.03388769670958512, + "grad_norm": 0.10704084450839539, + "learning_rate": 0.00019999224643029565, + "loss": 0.7334, + "step": 379 + }, + { + "epoch": 0.033977110157367665, + "grad_norm": 0.12968310978409553, + "learning_rate": 0.0001999918816106975, + "loss": 0.7188, + "step": 380 + }, + { + "epoch": 0.034066523605150216, + "grad_norm": 0.1157580292522495, + "learning_rate": 0.00019999150840490105, + "loss": 0.7235, + "step": 381 + }, + { + "epoch": 0.03415593705293276, + "grad_norm": 0.13091796931024197, + "learning_rate": 0.00019999112681293757, + "loss": 0.7241, + "step": 382 + }, + { + "epoch": 0.03424535050071531, + "grad_norm": 0.12459136449536122, + "learning_rate": 0.000199990736834839, + "loss": 0.7519, + "step": 383 + }, + { + "epoch": 0.034334763948497854, + "grad_norm": 0.11323488201726499, + "learning_rate": 0.00019999033847063811, + "loss": 0.7451, + "step": 384 + }, + { + "epoch": 0.0344241773962804, + "grad_norm": 0.11143602114133866, + "learning_rate": 0.00019998993172036828, + "loss": 0.7099, + "step": 385 + }, + { + "epoch": 0.03451359084406295, + "grad_norm": 0.12996651494432812, + "learning_rate": 0.00019998951658406364, + "loss": 0.7499, + "step": 386 + }, + { + "epoch": 0.03460300429184549, + "grad_norm": 0.1176115343174283, + "learning_rate": 0.000199989093061759, + "loss": 0.788, + "step": 387 + }, + { + "epoch": 0.03469241773962804, + "grad_norm": 0.12869759749333287, + "learning_rate": 0.00019998866115348988, + "loss": 0.7481, + "step": 388 + }, + { + "epoch": 0.034781831187410586, + "grad_norm": 0.09831573576699329, + "learning_rate": 0.0001999882208592925, + "loss": 0.6024, + "step": 389 + }, + { + "epoch": 0.03487124463519313, + "grad_norm": 0.15402331970219518, + "learning_rate": 0.00019998777217920385, + "loss": 0.7713, + "step": 390 + }, + { + "epoch": 0.03496065808297568, + "grad_norm": 0.11305305272296767, + "learning_rate": 0.0001999873151132614, + "loss": 0.7138, + "step": 391 + }, + { + "epoch": 0.035050071530758224, + "grad_norm": 0.1267858222275246, + "learning_rate": 0.00019998684966150365, + "loss": 0.7251, + "step": 392 + }, + { + "epoch": 0.035139484978540775, + "grad_norm": 0.11391072775879073, + "learning_rate": 0.00019998637582396958, + "loss": 0.7327, + "step": 393 + }, + { + "epoch": 0.03522889842632332, + "grad_norm": 0.1313298027320374, + "learning_rate": 0.0001999858936006989, + "loss": 0.7247, + "step": 394 + }, + { + "epoch": 0.03531831187410586, + "grad_norm": 0.1251943622008024, + "learning_rate": 0.00019998540299173207, + "loss": 0.7482, + "step": 395 + }, + { + "epoch": 0.03540772532188841, + "grad_norm": 0.09288032471934188, + "learning_rate": 0.00019998490399711024, + "loss": 0.561, + "step": 396 + }, + { + "epoch": 0.035497138769670956, + "grad_norm": 0.12219727834495733, + "learning_rate": 0.0001999843966168753, + "loss": 0.7432, + "step": 397 + }, + { + "epoch": 0.03558655221745351, + "grad_norm": 0.11922805602961195, + "learning_rate": 0.00019998388085106972, + "loss": 0.7477, + "step": 398 + }, + { + "epoch": 0.03567596566523605, + "grad_norm": 0.11328431774138491, + "learning_rate": 0.00019998335669973682, + "loss": 0.7295, + "step": 399 + }, + { + "epoch": 0.0357653791130186, + "grad_norm": 0.13485829532688168, + "learning_rate": 0.00019998282416292055, + "loss": 0.7319, + "step": 400 + }, + { + "epoch": 0.035854792560801145, + "grad_norm": 0.11774123379118764, + "learning_rate": 0.00019998228324066557, + "loss": 0.6997, + "step": 401 + }, + { + "epoch": 0.03594420600858369, + "grad_norm": 0.1204120346000939, + "learning_rate": 0.00019998173393301723, + "loss": 0.7371, + "step": 402 + }, + { + "epoch": 0.03603361945636624, + "grad_norm": 0.10666356631443329, + "learning_rate": 0.0001999811762400216, + "loss": 0.7386, + "step": 403 + }, + { + "epoch": 0.03612303290414878, + "grad_norm": 0.11327238736560379, + "learning_rate": 0.0001999806101617255, + "loss": 0.7607, + "step": 404 + }, + { + "epoch": 0.036212446351931334, + "grad_norm": 0.11212826521203365, + "learning_rate": 0.00019998003569817637, + "loss": 0.7279, + "step": 405 + }, + { + "epoch": 0.03630185979971388, + "grad_norm": 0.12898016182315172, + "learning_rate": 0.00019997945284942235, + "loss": 0.749, + "step": 406 + }, + { + "epoch": 0.03639127324749642, + "grad_norm": 0.1252959376202027, + "learning_rate": 0.0001999788616155124, + "loss": 0.7702, + "step": 407 + }, + { + "epoch": 0.03648068669527897, + "grad_norm": 0.12107163393688677, + "learning_rate": 0.00019997826199649605, + "loss": 0.7117, + "step": 408 + }, + { + "epoch": 0.036570100143061515, + "grad_norm": 0.10489774559424611, + "learning_rate": 0.00019997765399242364, + "loss": 0.6888, + "step": 409 + }, + { + "epoch": 0.036659513590844066, + "grad_norm": 0.11416813009234353, + "learning_rate": 0.0001999770376033461, + "loss": 0.7376, + "step": 410 + }, + { + "epoch": 0.03674892703862661, + "grad_norm": 0.10492833876149472, + "learning_rate": 0.00019997641282931515, + "loss": 0.7277, + "step": 411 + }, + { + "epoch": 0.03683834048640915, + "grad_norm": 0.1032674162874515, + "learning_rate": 0.00019997577967038324, + "loss": 0.7118, + "step": 412 + }, + { + "epoch": 0.036927753934191704, + "grad_norm": 0.10399006809790065, + "learning_rate": 0.0001999751381266034, + "loss": 0.7734, + "step": 413 + }, + { + "epoch": 0.03701716738197425, + "grad_norm": 0.13541703489387838, + "learning_rate": 0.00019997448819802948, + "loss": 0.7554, + "step": 414 + }, + { + "epoch": 0.0371065808297568, + "grad_norm": 0.10969017324824866, + "learning_rate": 0.00019997382988471595, + "loss": 0.7424, + "step": 415 + }, + { + "epoch": 0.03719599427753934, + "grad_norm": 0.1148426344712516, + "learning_rate": 0.00019997316318671806, + "loss": 0.7157, + "step": 416 + }, + { + "epoch": 0.037285407725321885, + "grad_norm": 0.1516337267746433, + "learning_rate": 0.00019997248810409173, + "loss": 0.718, + "step": 417 + }, + { + "epoch": 0.037374821173104436, + "grad_norm": 0.14440057522138566, + "learning_rate": 0.0001999718046368935, + "loss": 0.7967, + "step": 418 + }, + { + "epoch": 0.03746423462088698, + "grad_norm": 0.12019070203382322, + "learning_rate": 0.0001999711127851808, + "loss": 0.7044, + "step": 419 + }, + { + "epoch": 0.03755364806866953, + "grad_norm": 0.12607426930772245, + "learning_rate": 0.0001999704125490116, + "loss": 0.7499, + "step": 420 + }, + { + "epoch": 0.037643061516452074, + "grad_norm": 0.09983006163819239, + "learning_rate": 0.00019996970392844462, + "loss": 0.7193, + "step": 421 + }, + { + "epoch": 0.03773247496423462, + "grad_norm": 0.10828984320730105, + "learning_rate": 0.0001999689869235393, + "loss": 0.74, + "step": 422 + }, + { + "epoch": 0.03782188841201717, + "grad_norm": 0.10337828048275076, + "learning_rate": 0.00019996826153435582, + "loss": 0.7068, + "step": 423 + }, + { + "epoch": 0.03791130185979971, + "grad_norm": 0.10900660793278547, + "learning_rate": 0.00019996752776095495, + "loss": 0.7081, + "step": 424 + }, + { + "epoch": 0.03800071530758226, + "grad_norm": 0.1202271362897271, + "learning_rate": 0.00019996678560339824, + "loss": 0.7539, + "step": 425 + }, + { + "epoch": 0.038090128755364806, + "grad_norm": 0.10929726989122661, + "learning_rate": 0.00019996603506174795, + "loss": 0.6926, + "step": 426 + }, + { + "epoch": 0.03817954220314735, + "grad_norm": 0.1099693882310794, + "learning_rate": 0.00019996527613606708, + "loss": 0.7023, + "step": 427 + }, + { + "epoch": 0.0382689556509299, + "grad_norm": 0.11196851633890027, + "learning_rate": 0.00019996450882641916, + "loss": 0.7301, + "step": 428 + }, + { + "epoch": 0.038358369098712444, + "grad_norm": 0.1164006382904638, + "learning_rate": 0.00019996373313286867, + "loss": 0.7479, + "step": 429 + }, + { + "epoch": 0.038447782546494995, + "grad_norm": 0.10597869875526951, + "learning_rate": 0.00019996294905548056, + "loss": 0.7481, + "step": 430 + }, + { + "epoch": 0.03853719599427754, + "grad_norm": 0.1073024941334283, + "learning_rate": 0.00019996215659432066, + "loss": 0.7482, + "step": 431 + }, + { + "epoch": 0.03862660944206009, + "grad_norm": 0.11684680912369817, + "learning_rate": 0.00019996135574945544, + "loss": 0.7238, + "step": 432 + }, + { + "epoch": 0.03871602288984263, + "grad_norm": 0.10951005993940047, + "learning_rate": 0.00019996054652095198, + "loss": 0.7094, + "step": 433 + }, + { + "epoch": 0.038805436337625177, + "grad_norm": 0.12515876792322317, + "learning_rate": 0.00019995972890887823, + "loss": 0.7194, + "step": 434 + }, + { + "epoch": 0.03889484978540773, + "grad_norm": 0.12188184585208146, + "learning_rate": 0.00019995890291330272, + "loss": 0.7405, + "step": 435 + }, + { + "epoch": 0.03898426323319027, + "grad_norm": 0.11617722530099231, + "learning_rate": 0.00019995806853429477, + "loss": 0.6987, + "step": 436 + }, + { + "epoch": 0.03907367668097282, + "grad_norm": 0.14510705040615918, + "learning_rate": 0.0001999572257719243, + "loss": 0.7335, + "step": 437 + }, + { + "epoch": 0.039163090128755365, + "grad_norm": 0.10901710660669646, + "learning_rate": 0.00019995637462626205, + "loss": 0.7236, + "step": 438 + }, + { + "epoch": 0.03925250357653791, + "grad_norm": 0.12408317331442519, + "learning_rate": 0.00019995551509737936, + "loss": 0.7314, + "step": 439 + }, + { + "epoch": 0.03934191702432046, + "grad_norm": 0.1010039341925824, + "learning_rate": 0.00019995464718534835, + "loss": 0.7188, + "step": 440 + }, + { + "epoch": 0.039431330472103, + "grad_norm": 0.11459594290840534, + "learning_rate": 0.00019995377089024178, + "loss": 0.7527, + "step": 441 + }, + { + "epoch": 0.039520743919885554, + "grad_norm": 0.11749841751122932, + "learning_rate": 0.00019995288621213318, + "loss": 0.7004, + "step": 442 + }, + { + "epoch": 0.0396101573676681, + "grad_norm": 0.12499459197586718, + "learning_rate": 0.0001999519931510967, + "loss": 0.7799, + "step": 443 + }, + { + "epoch": 0.03969957081545064, + "grad_norm": 0.10429828196808294, + "learning_rate": 0.00019995109170720728, + "loss": 0.748, + "step": 444 + }, + { + "epoch": 0.03978898426323319, + "grad_norm": 0.11746440897093881, + "learning_rate": 0.0001999501818805405, + "loss": 0.7737, + "step": 445 + }, + { + "epoch": 0.039878397711015735, + "grad_norm": 0.12225752187197354, + "learning_rate": 0.0001999492636711727, + "loss": 0.7648, + "step": 446 + }, + { + "epoch": 0.039967811158798286, + "grad_norm": 0.11398033516615481, + "learning_rate": 0.00019994833707918084, + "loss": 0.6859, + "step": 447 + }, + { + "epoch": 0.04005722460658083, + "grad_norm": 0.11895799439127805, + "learning_rate": 0.00019994740210464268, + "loss": 0.7467, + "step": 448 + }, + { + "epoch": 0.04014663805436337, + "grad_norm": 0.10817670472530296, + "learning_rate": 0.00019994645874763658, + "loss": 0.7252, + "step": 449 + }, + { + "epoch": 0.040236051502145924, + "grad_norm": 0.1045523000757615, + "learning_rate": 0.00019994550700824172, + "loss": 0.7332, + "step": 450 + }, + { + "epoch": 0.04032546494992847, + "grad_norm": 0.11556138790783273, + "learning_rate": 0.00019994454688653784, + "loss": 0.7546, + "step": 451 + }, + { + "epoch": 0.04041487839771102, + "grad_norm": 0.10666012604427984, + "learning_rate": 0.00019994357838260557, + "loss": 0.7417, + "step": 452 + }, + { + "epoch": 0.04050429184549356, + "grad_norm": 0.12418604147311628, + "learning_rate": 0.00019994260149652603, + "loss": 0.7786, + "step": 453 + }, + { + "epoch": 0.040593705293276106, + "grad_norm": 0.10971331215093318, + "learning_rate": 0.00019994161622838126, + "loss": 0.7222, + "step": 454 + }, + { + "epoch": 0.040683118741058656, + "grad_norm": 0.10601623517542058, + "learning_rate": 0.0001999406225782538, + "loss": 0.7477, + "step": 455 + }, + { + "epoch": 0.0407725321888412, + "grad_norm": 0.1294338850851335, + "learning_rate": 0.00019993962054622703, + "loss": 0.7368, + "step": 456 + }, + { + "epoch": 0.04086194563662375, + "grad_norm": 0.12069690511246088, + "learning_rate": 0.00019993861013238497, + "loss": 0.7181, + "step": 457 + }, + { + "epoch": 0.040951359084406294, + "grad_norm": 0.10168148224330094, + "learning_rate": 0.0001999375913368124, + "loss": 0.7272, + "step": 458 + }, + { + "epoch": 0.04104077253218884, + "grad_norm": 0.1302055154691596, + "learning_rate": 0.00019993656415959472, + "loss": 0.7504, + "step": 459 + }, + { + "epoch": 0.04113018597997139, + "grad_norm": 0.14381015915665538, + "learning_rate": 0.00019993552860081814, + "loss": 0.7031, + "step": 460 + }, + { + "epoch": 0.04121959942775393, + "grad_norm": 0.12867262992827644, + "learning_rate": 0.00019993448466056938, + "loss": 0.7423, + "step": 461 + }, + { + "epoch": 0.04130901287553648, + "grad_norm": 0.1129391527597728, + "learning_rate": 0.00019993343233893615, + "loss": 0.732, + "step": 462 + }, + { + "epoch": 0.041398426323319026, + "grad_norm": 0.12908915004440646, + "learning_rate": 0.00019993237163600663, + "loss": 0.7933, + "step": 463 + }, + { + "epoch": 0.04148783977110158, + "grad_norm": 0.10901937851433495, + "learning_rate": 0.00019993130255186977, + "loss": 0.7434, + "step": 464 + }, + { + "epoch": 0.04157725321888412, + "grad_norm": 0.11801033989957715, + "learning_rate": 0.00019993022508661525, + "loss": 0.7361, + "step": 465 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 0.11890288631296461, + "learning_rate": 0.00019992913924033349, + "loss": 0.7308, + "step": 466 + }, + { + "epoch": 0.041756080114449215, + "grad_norm": 0.10843017649230466, + "learning_rate": 0.00019992804501311543, + "loss": 0.6722, + "step": 467 + }, + { + "epoch": 0.04184549356223176, + "grad_norm": 0.1113694890678067, + "learning_rate": 0.00019992694240505293, + "loss": 0.7177, + "step": 468 + }, + { + "epoch": 0.04193490701001431, + "grad_norm": 0.11893947887254087, + "learning_rate": 0.00019992583141623848, + "loss": 0.7242, + "step": 469 + }, + { + "epoch": 0.04202432045779685, + "grad_norm": 0.10018523914391919, + "learning_rate": 0.00019992471204676525, + "loss": 0.6928, + "step": 470 + }, + { + "epoch": 0.0421137339055794, + "grad_norm": 0.08382853240450226, + "learning_rate": 0.00019992358429672704, + "loss": 0.6106, + "step": 471 + }, + { + "epoch": 0.04220314735336195, + "grad_norm": 0.1264160602519555, + "learning_rate": 0.00019992244816621852, + "loss": 0.7092, + "step": 472 + }, + { + "epoch": 0.04229256080114449, + "grad_norm": 0.14384826602062137, + "learning_rate": 0.00019992130365533497, + "loss": 0.7346, + "step": 473 + }, + { + "epoch": 0.04238197424892704, + "grad_norm": 0.12374693664684093, + "learning_rate": 0.00019992015076417233, + "loss": 0.6926, + "step": 474 + }, + { + "epoch": 0.042471387696709585, + "grad_norm": 0.11610990038880226, + "learning_rate": 0.00019991898949282732, + "loss": 0.7048, + "step": 475 + }, + { + "epoch": 0.04256080114449213, + "grad_norm": 0.13703685183381903, + "learning_rate": 0.00019991781984139736, + "loss": 0.798, + "step": 476 + }, + { + "epoch": 0.04265021459227468, + "grad_norm": 0.1285460186292709, + "learning_rate": 0.00019991664180998048, + "loss": 0.7296, + "step": 477 + }, + { + "epoch": 0.04273962804005722, + "grad_norm": 0.10346220626599661, + "learning_rate": 0.00019991545539867556, + "loss": 0.6949, + "step": 478 + }, + { + "epoch": 0.042829041487839774, + "grad_norm": 0.12299959874944408, + "learning_rate": 0.00019991426060758202, + "loss": 0.7159, + "step": 479 + }, + { + "epoch": 0.04291845493562232, + "grad_norm": 0.11462484386435523, + "learning_rate": 0.00019991305743680013, + "loss": 0.7364, + "step": 480 + }, + { + "epoch": 0.04300786838340486, + "grad_norm": 0.11743209829125795, + "learning_rate": 0.00019991184588643077, + "loss": 0.764, + "step": 481 + }, + { + "epoch": 0.04309728183118741, + "grad_norm": 0.1176048983095752, + "learning_rate": 0.00019991062595657558, + "loss": 0.7239, + "step": 482 + }, + { + "epoch": 0.043186695278969955, + "grad_norm": 0.10795861778517826, + "learning_rate": 0.00019990939764733684, + "loss": 0.7245, + "step": 483 + }, + { + "epoch": 0.043276108726752506, + "grad_norm": 0.11131076344543407, + "learning_rate": 0.0001999081609588176, + "loss": 0.7155, + "step": 484 + }, + { + "epoch": 0.04336552217453505, + "grad_norm": 0.12128510310620275, + "learning_rate": 0.0001999069158911215, + "loss": 0.7116, + "step": 485 + }, + { + "epoch": 0.04345493562231759, + "grad_norm": 0.10645155558974909, + "learning_rate": 0.00019990566244435307, + "loss": 0.741, + "step": 486 + }, + { + "epoch": 0.043544349070100144, + "grad_norm": 0.11451142739530648, + "learning_rate": 0.0001999044006186174, + "loss": 0.7284, + "step": 487 + }, + { + "epoch": 0.04363376251788269, + "grad_norm": 0.12118923670090724, + "learning_rate": 0.00019990313041402024, + "loss": 0.7919, + "step": 488 + }, + { + "epoch": 0.04372317596566524, + "grad_norm": 0.10851590456819543, + "learning_rate": 0.00019990185183066825, + "loss": 0.7212, + "step": 489 + }, + { + "epoch": 0.04381258941344778, + "grad_norm": 0.11448404327058281, + "learning_rate": 0.00019990056486866858, + "loss": 0.7261, + "step": 490 + }, + { + "epoch": 0.043902002861230326, + "grad_norm": 0.10065687067471084, + "learning_rate": 0.00019989926952812916, + "loss": 0.7106, + "step": 491 + }, + { + "epoch": 0.043991416309012876, + "grad_norm": 0.08924008444181503, + "learning_rate": 0.00019989796580915866, + "loss": 0.5899, + "step": 492 + }, + { + "epoch": 0.04408082975679542, + "grad_norm": 0.11564138863611917, + "learning_rate": 0.0001998966537118664, + "loss": 0.6829, + "step": 493 + }, + { + "epoch": 0.04417024320457797, + "grad_norm": 0.11787380662666952, + "learning_rate": 0.0001998953332363625, + "loss": 0.743, + "step": 494 + }, + { + "epoch": 0.044259656652360514, + "grad_norm": 0.11724641927772524, + "learning_rate": 0.00019989400438275758, + "loss": 0.7061, + "step": 495 + }, + { + "epoch": 0.044349070100143065, + "grad_norm": 0.12552557237510614, + "learning_rate": 0.00019989266715116316, + "loss": 0.7325, + "step": 496 + }, + { + "epoch": 0.04443848354792561, + "grad_norm": 0.09782816270083837, + "learning_rate": 0.0001998913215416914, + "loss": 0.7239, + "step": 497 + }, + { + "epoch": 0.04452789699570815, + "grad_norm": 0.11357586681728593, + "learning_rate": 0.00019988996755445517, + "loss": 0.7223, + "step": 498 + }, + { + "epoch": 0.0446173104434907, + "grad_norm": 0.11766669246784206, + "learning_rate": 0.00019988860518956796, + "loss": 0.7305, + "step": 499 + }, + { + "epoch": 0.044706723891273246, + "grad_norm": 0.11636416692001553, + "learning_rate": 0.0001998872344471441, + "loss": 0.7498, + "step": 500 + }, + { + "epoch": 0.0447961373390558, + "grad_norm": 0.10320881159917296, + "learning_rate": 0.00019988585532729848, + "loss": 0.7561, + "step": 501 + }, + { + "epoch": 0.04488555078683834, + "grad_norm": 0.10789673998978924, + "learning_rate": 0.00019988446783014683, + "loss": 0.688, + "step": 502 + }, + { + "epoch": 0.044974964234620884, + "grad_norm": 0.10518465906325315, + "learning_rate": 0.0001998830719558055, + "loss": 0.7501, + "step": 503 + }, + { + "epoch": 0.045064377682403435, + "grad_norm": 0.11597321668189191, + "learning_rate": 0.00019988166770439154, + "loss": 0.7437, + "step": 504 + }, + { + "epoch": 0.04515379113018598, + "grad_norm": 0.12282022165290798, + "learning_rate": 0.00019988025507602274, + "loss": 0.7397, + "step": 505 + }, + { + "epoch": 0.04524320457796853, + "grad_norm": 0.10846048812919656, + "learning_rate": 0.00019987883407081753, + "loss": 0.7464, + "step": 506 + }, + { + "epoch": 0.04533261802575107, + "grad_norm": 0.13885597719749024, + "learning_rate": 0.00019987740468889519, + "loss": 0.7146, + "step": 507 + }, + { + "epoch": 0.04542203147353362, + "grad_norm": 0.12141567351578623, + "learning_rate": 0.00019987596693037552, + "loss": 0.7801, + "step": 508 + }, + { + "epoch": 0.04551144492131617, + "grad_norm": 0.11595265796344859, + "learning_rate": 0.00019987452079537913, + "loss": 0.7502, + "step": 509 + }, + { + "epoch": 0.04560085836909871, + "grad_norm": 0.12013671401495915, + "learning_rate": 0.00019987306628402727, + "loss": 0.7275, + "step": 510 + }, + { + "epoch": 0.04569027181688126, + "grad_norm": 0.1199679975547481, + "learning_rate": 0.00019987160339644198, + "loss": 0.6924, + "step": 511 + }, + { + "epoch": 0.045779685264663805, + "grad_norm": 0.10713847094575711, + "learning_rate": 0.00019987013213274593, + "loss": 0.7415, + "step": 512 + }, + { + "epoch": 0.04586909871244635, + "grad_norm": 0.11321432475299306, + "learning_rate": 0.0001998686524930625, + "loss": 0.7209, + "step": 513 + }, + { + "epoch": 0.0459585121602289, + "grad_norm": 0.11160775706127539, + "learning_rate": 0.0001998671644775158, + "loss": 0.7218, + "step": 514 + }, + { + "epoch": 0.04604792560801144, + "grad_norm": 0.11413115126920971, + "learning_rate": 0.00019986566808623062, + "loss": 0.7353, + "step": 515 + }, + { + "epoch": 0.046137339055793994, + "grad_norm": 0.11720037156940456, + "learning_rate": 0.00019986416331933246, + "loss": 0.7672, + "step": 516 + }, + { + "epoch": 0.04622675250357654, + "grad_norm": 0.10888555633254604, + "learning_rate": 0.00019986265017694755, + "loss": 0.7467, + "step": 517 + }, + { + "epoch": 0.04631616595135908, + "grad_norm": 0.10712111754452124, + "learning_rate": 0.00019986112865920277, + "loss": 0.7358, + "step": 518 + }, + { + "epoch": 0.04640557939914163, + "grad_norm": 0.11489077042706304, + "learning_rate": 0.00019985959876622574, + "loss": 0.7634, + "step": 519 + }, + { + "epoch": 0.046494992846924176, + "grad_norm": 0.11478995395635694, + "learning_rate": 0.00019985806049814474, + "loss": 0.7213, + "step": 520 + }, + { + "epoch": 0.046584406294706726, + "grad_norm": 0.12790945837213263, + "learning_rate": 0.0001998565138550888, + "loss": 0.7594, + "step": 521 + }, + { + "epoch": 0.04667381974248927, + "grad_norm": 0.12249053410965338, + "learning_rate": 0.00019985495883718764, + "loss": 0.7473, + "step": 522 + }, + { + "epoch": 0.04676323319027181, + "grad_norm": 0.12586396099326086, + "learning_rate": 0.0001998533954445717, + "loss": 0.7055, + "step": 523 + }, + { + "epoch": 0.046852646638054364, + "grad_norm": 0.12601236821671155, + "learning_rate": 0.00019985182367737202, + "loss": 0.7404, + "step": 524 + }, + { + "epoch": 0.04694206008583691, + "grad_norm": 0.11520427146646561, + "learning_rate": 0.00019985024353572054, + "loss": 0.7289, + "step": 525 + }, + { + "epoch": 0.04703147353361946, + "grad_norm": 0.11562246367647036, + "learning_rate": 0.0001998486550197497, + "loss": 0.7378, + "step": 526 + }, + { + "epoch": 0.047120886981402, + "grad_norm": 0.12073698147609761, + "learning_rate": 0.00019984705812959276, + "loss": 0.7468, + "step": 527 + }, + { + "epoch": 0.04721030042918455, + "grad_norm": 0.11523261388082548, + "learning_rate": 0.0001998454528653836, + "loss": 0.7539, + "step": 528 + }, + { + "epoch": 0.047299713876967096, + "grad_norm": 0.11827934853546702, + "learning_rate": 0.00019984383922725695, + "loss": 0.7315, + "step": 529 + }, + { + "epoch": 0.04738912732474964, + "grad_norm": 0.12770096736827222, + "learning_rate": 0.00019984221721534805, + "loss": 0.749, + "step": 530 + }, + { + "epoch": 0.04747854077253219, + "grad_norm": 0.1065666940754891, + "learning_rate": 0.00019984058682979297, + "loss": 0.7096, + "step": 531 + }, + { + "epoch": 0.047567954220314734, + "grad_norm": 0.1114168666798643, + "learning_rate": 0.00019983894807072848, + "loss": 0.7624, + "step": 532 + }, + { + "epoch": 0.047657367668097285, + "grad_norm": 0.09553287751294812, + "learning_rate": 0.00019983730093829194, + "loss": 0.6957, + "step": 533 + }, + { + "epoch": 0.04774678111587983, + "grad_norm": 0.10292111615290603, + "learning_rate": 0.00019983564543262156, + "loss": 0.7217, + "step": 534 + }, + { + "epoch": 0.04783619456366237, + "grad_norm": 0.1040308603948985, + "learning_rate": 0.0001998339815538562, + "loss": 0.7223, + "step": 535 + }, + { + "epoch": 0.04792560801144492, + "grad_norm": 0.11528620003861911, + "learning_rate": 0.00019983230930213536, + "loss": 0.7256, + "step": 536 + }, + { + "epoch": 0.04801502145922747, + "grad_norm": 0.11257728296786353, + "learning_rate": 0.00019983062867759928, + "loss": 0.7676, + "step": 537 + }, + { + "epoch": 0.04810443490701002, + "grad_norm": 0.10766834244217263, + "learning_rate": 0.00019982893968038896, + "loss": 0.7529, + "step": 538 + }, + { + "epoch": 0.04819384835479256, + "grad_norm": 0.11712320429625325, + "learning_rate": 0.00019982724231064602, + "loss": 0.7237, + "step": 539 + }, + { + "epoch": 0.048283261802575105, + "grad_norm": 0.1238257311975794, + "learning_rate": 0.00019982553656851284, + "loss": 0.7754, + "step": 540 + }, + { + "epoch": 0.048372675250357655, + "grad_norm": 0.12898458889898778, + "learning_rate": 0.00019982382245413248, + "loss": 0.7401, + "step": 541 + }, + { + "epoch": 0.0484620886981402, + "grad_norm": 0.11497638203217167, + "learning_rate": 0.00019982209996764866, + "loss": 0.7257, + "step": 542 + }, + { + "epoch": 0.04855150214592275, + "grad_norm": 0.12620074779334023, + "learning_rate": 0.0001998203691092059, + "loss": 0.7234, + "step": 543 + }, + { + "epoch": 0.04864091559370529, + "grad_norm": 0.10950190777064969, + "learning_rate": 0.00019981862987894934, + "loss": 0.7436, + "step": 544 + }, + { + "epoch": 0.04873032904148784, + "grad_norm": 0.12980278818033106, + "learning_rate": 0.0001998168822770248, + "loss": 0.7326, + "step": 545 + }, + { + "epoch": 0.04881974248927039, + "grad_norm": 0.10893976104627578, + "learning_rate": 0.0001998151263035789, + "loss": 0.7044, + "step": 546 + }, + { + "epoch": 0.04890915593705293, + "grad_norm": 0.12070880519113099, + "learning_rate": 0.00019981336195875894, + "loss": 0.7169, + "step": 547 + }, + { + "epoch": 0.04899856938483548, + "grad_norm": 0.10988801197585908, + "learning_rate": 0.00019981158924271283, + "loss": 0.6997, + "step": 548 + }, + { + "epoch": 0.049087982832618025, + "grad_norm": 0.11305919258093258, + "learning_rate": 0.00019980980815558925, + "loss": 0.7321, + "step": 549 + }, + { + "epoch": 0.04917739628040057, + "grad_norm": 0.11229895028115337, + "learning_rate": 0.00019980801869753765, + "loss": 0.748, + "step": 550 + }, + { + "epoch": 0.04926680972818312, + "grad_norm": 0.10513710986583921, + "learning_rate": 0.00019980622086870803, + "loss": 0.723, + "step": 551 + }, + { + "epoch": 0.04935622317596566, + "grad_norm": 0.09936378911343739, + "learning_rate": 0.00019980441466925118, + "loss": 0.7226, + "step": 552 + }, + { + "epoch": 0.049445636623748214, + "grad_norm": 0.09897703220211836, + "learning_rate": 0.00019980260009931864, + "loss": 0.7408, + "step": 553 + }, + { + "epoch": 0.04953505007153076, + "grad_norm": 0.11036612355429895, + "learning_rate": 0.00019980077715906256, + "loss": 0.7595, + "step": 554 + }, + { + "epoch": 0.0496244635193133, + "grad_norm": 0.12001162536426885, + "learning_rate": 0.0001997989458486358, + "loss": 0.7218, + "step": 555 + }, + { + "epoch": 0.04971387696709585, + "grad_norm": 0.13605483963650108, + "learning_rate": 0.000199797106168192, + "loss": 0.7476, + "step": 556 + }, + { + "epoch": 0.049803290414878396, + "grad_norm": 0.11299272875376487, + "learning_rate": 0.00019979525811788542, + "loss": 0.6855, + "step": 557 + }, + { + "epoch": 0.049892703862660946, + "grad_norm": 0.1619680017236769, + "learning_rate": 0.0001997934016978711, + "loss": 0.7302, + "step": 558 + }, + { + "epoch": 0.04998211731044349, + "grad_norm": 0.11774366839966968, + "learning_rate": 0.00019979153690830463, + "loss": 0.7362, + "step": 559 + }, + { + "epoch": 0.05007153075822604, + "grad_norm": 0.15779885269643176, + "learning_rate": 0.00019978966374934254, + "loss": 0.7785, + "step": 560 + }, + { + "epoch": 0.050160944206008584, + "grad_norm": 0.11790663249297109, + "learning_rate": 0.00019978778222114185, + "loss": 0.6679, + "step": 561 + }, + { + "epoch": 0.05025035765379113, + "grad_norm": 0.11725584645733778, + "learning_rate": 0.00019978589232386035, + "loss": 0.7076, + "step": 562 + }, + { + "epoch": 0.05033977110157368, + "grad_norm": 0.10454527734147512, + "learning_rate": 0.0001997839940576566, + "loss": 0.7246, + "step": 563 + }, + { + "epoch": 0.05042918454935622, + "grad_norm": 0.1033228217086727, + "learning_rate": 0.00019978208742268977, + "loss": 0.6926, + "step": 564 + }, + { + "epoch": 0.05051859799713877, + "grad_norm": 0.11504169802913661, + "learning_rate": 0.00019978017241911977, + "loss": 0.7317, + "step": 565 + }, + { + "epoch": 0.050608011444921316, + "grad_norm": 0.12695317097417186, + "learning_rate": 0.00019977824904710722, + "loss": 0.7194, + "step": 566 + }, + { + "epoch": 0.05069742489270386, + "grad_norm": 0.09963235328057644, + "learning_rate": 0.00019977631730681343, + "loss": 0.6977, + "step": 567 + }, + { + "epoch": 0.05078683834048641, + "grad_norm": 0.11748257244227107, + "learning_rate": 0.0001997743771984004, + "loss": 0.7439, + "step": 568 + }, + { + "epoch": 0.050876251788268954, + "grad_norm": 0.11008609637763098, + "learning_rate": 0.00019977242872203083, + "loss": 0.7018, + "step": 569 + }, + { + "epoch": 0.050965665236051505, + "grad_norm": 0.10900295025116093, + "learning_rate": 0.00019977047187786818, + "loss": 0.7579, + "step": 570 + }, + { + "epoch": 0.05105507868383405, + "grad_norm": 0.11411849025827475, + "learning_rate": 0.00019976850666607657, + "loss": 0.738, + "step": 571 + }, + { + "epoch": 0.05114449213161659, + "grad_norm": 0.13097701605421644, + "learning_rate": 0.00019976653308682076, + "loss": 0.6941, + "step": 572 + }, + { + "epoch": 0.05123390557939914, + "grad_norm": 0.12024171762857046, + "learning_rate": 0.0001997645511402663, + "loss": 0.7281, + "step": 573 + }, + { + "epoch": 0.05132331902718169, + "grad_norm": 0.12699880313531772, + "learning_rate": 0.00019976256082657946, + "loss": 0.7671, + "step": 574 + }, + { + "epoch": 0.05141273247496424, + "grad_norm": 0.10803736235730309, + "learning_rate": 0.00019976056214592708, + "loss": 0.7501, + "step": 575 + }, + { + "epoch": 0.05150214592274678, + "grad_norm": 0.13906432120700568, + "learning_rate": 0.00019975855509847686, + "loss": 0.7124, + "step": 576 + }, + { + "epoch": 0.051591559370529325, + "grad_norm": 0.12206589510551503, + "learning_rate": 0.00019975653968439712, + "loss": 0.7083, + "step": 577 + }, + { + "epoch": 0.051680972818311875, + "grad_norm": 0.10861138263470603, + "learning_rate": 0.00019975451590385684, + "loss": 0.7174, + "step": 578 + }, + { + "epoch": 0.05177038626609442, + "grad_norm": 0.11448968413928572, + "learning_rate": 0.0001997524837570258, + "loss": 0.6835, + "step": 579 + }, + { + "epoch": 0.05185979971387697, + "grad_norm": 0.13866904515832584, + "learning_rate": 0.0001997504432440744, + "loss": 0.7317, + "step": 580 + }, + { + "epoch": 0.05194921316165951, + "grad_norm": 0.11670048414916542, + "learning_rate": 0.00019974839436517382, + "loss": 0.7125, + "step": 581 + }, + { + "epoch": 0.05203862660944206, + "grad_norm": 0.12532280572255777, + "learning_rate": 0.00019974633712049587, + "loss": 0.7131, + "step": 582 + }, + { + "epoch": 0.05212804005722461, + "grad_norm": 0.10769242011757571, + "learning_rate": 0.00019974427151021304, + "loss": 0.7489, + "step": 583 + }, + { + "epoch": 0.05221745350500715, + "grad_norm": 0.12461520417637519, + "learning_rate": 0.00019974219753449867, + "loss": 0.7518, + "step": 584 + }, + { + "epoch": 0.0523068669527897, + "grad_norm": 0.10953324901268817, + "learning_rate": 0.00019974011519352663, + "loss": 0.7195, + "step": 585 + }, + { + "epoch": 0.052396280400572245, + "grad_norm": 0.11615238782479433, + "learning_rate": 0.0001997380244874716, + "loss": 0.7438, + "step": 586 + }, + { + "epoch": 0.05248569384835479, + "grad_norm": 0.10369171711840804, + "learning_rate": 0.0001997359254165089, + "loss": 0.704, + "step": 587 + }, + { + "epoch": 0.05257510729613734, + "grad_norm": 0.11782000085840258, + "learning_rate": 0.00019973381798081457, + "loss": 0.7166, + "step": 588 + }, + { + "epoch": 0.05266452074391988, + "grad_norm": 0.10698830124244714, + "learning_rate": 0.0001997317021805654, + "loss": 0.7259, + "step": 589 + }, + { + "epoch": 0.052753934191702434, + "grad_norm": 0.10939708777081238, + "learning_rate": 0.0001997295780159388, + "loss": 0.7166, + "step": 590 + }, + { + "epoch": 0.05284334763948498, + "grad_norm": 0.10814655876601852, + "learning_rate": 0.00019972744548711293, + "loss": 0.7782, + "step": 591 + }, + { + "epoch": 0.05293276108726753, + "grad_norm": 0.12038185721649475, + "learning_rate": 0.00019972530459426663, + "loss": 0.7427, + "step": 592 + }, + { + "epoch": 0.05302217453505007, + "grad_norm": 0.125755456815292, + "learning_rate": 0.00019972315533757954, + "loss": 0.6993, + "step": 593 + }, + { + "epoch": 0.053111587982832616, + "grad_norm": 0.10929738075990249, + "learning_rate": 0.00019972099771723177, + "loss": 0.7039, + "step": 594 + }, + { + "epoch": 0.053201001430615166, + "grad_norm": 0.12324727562580268, + "learning_rate": 0.00019971883173340439, + "loss": 0.7312, + "step": 595 + }, + { + "epoch": 0.05329041487839771, + "grad_norm": 0.12287586736472603, + "learning_rate": 0.00019971665738627902, + "loss": 0.7443, + "step": 596 + }, + { + "epoch": 0.05337982832618026, + "grad_norm": 0.12328130152011688, + "learning_rate": 0.00019971447467603804, + "loss": 0.7204, + "step": 597 + }, + { + "epoch": 0.053469241773962804, + "grad_norm": 0.1357083949468057, + "learning_rate": 0.00019971228360286445, + "loss": 0.7749, + "step": 598 + }, + { + "epoch": 0.05355865522174535, + "grad_norm": 0.10594094447532902, + "learning_rate": 0.00019971008416694208, + "loss": 0.703, + "step": 599 + }, + { + "epoch": 0.0536480686695279, + "grad_norm": 0.10623294074489598, + "learning_rate": 0.00019970787636845535, + "loss": 0.7233, + "step": 600 + }, + { + "epoch": 0.05373748211731044, + "grad_norm": 0.10786445551766645, + "learning_rate": 0.00019970566020758947, + "loss": 0.7103, + "step": 601 + }, + { + "epoch": 0.05382689556509299, + "grad_norm": 0.1247480440408876, + "learning_rate": 0.0001997034356845303, + "loss": 0.7418, + "step": 602 + }, + { + "epoch": 0.053916309012875537, + "grad_norm": 0.11246239369104992, + "learning_rate": 0.00019970120279946436, + "loss": 0.6858, + "step": 603 + }, + { + "epoch": 0.05400572246065808, + "grad_norm": 0.11207948610234435, + "learning_rate": 0.00019969896155257896, + "loss": 0.7106, + "step": 604 + }, + { + "epoch": 0.05409513590844063, + "grad_norm": 0.11690908072858175, + "learning_rate": 0.00019969671194406205, + "loss": 0.7311, + "step": 605 + }, + { + "epoch": 0.054184549356223174, + "grad_norm": 0.11168017008247025, + "learning_rate": 0.0001996944539741023, + "loss": 0.6895, + "step": 606 + }, + { + "epoch": 0.054273962804005725, + "grad_norm": 0.12540053957396544, + "learning_rate": 0.00019969218764288914, + "loss": 0.7584, + "step": 607 + }, + { + "epoch": 0.05436337625178827, + "grad_norm": 0.10490950176394435, + "learning_rate": 0.0001996899129506126, + "loss": 0.7208, + "step": 608 + }, + { + "epoch": 0.05445278969957081, + "grad_norm": 0.1102186705446395, + "learning_rate": 0.0001996876298974634, + "loss": 0.7101, + "step": 609 + }, + { + "epoch": 0.05454220314735336, + "grad_norm": 0.11650635735416583, + "learning_rate": 0.00019968533848363311, + "loss": 0.7346, + "step": 610 + }, + { + "epoch": 0.05463161659513591, + "grad_norm": 0.12571952336406506, + "learning_rate": 0.00019968303870931386, + "loss": 0.6891, + "step": 611 + }, + { + "epoch": 0.05472103004291846, + "grad_norm": 0.10590165981500597, + "learning_rate": 0.00019968073057469857, + "loss": 0.6917, + "step": 612 + }, + { + "epoch": 0.054810443490701, + "grad_norm": 0.09984396461228537, + "learning_rate": 0.00019967841407998076, + "loss": 0.7087, + "step": 613 + }, + { + "epoch": 0.054899856938483545, + "grad_norm": 0.11036758121008955, + "learning_rate": 0.00019967608922535476, + "loss": 0.7382, + "step": 614 + }, + { + "epoch": 0.054989270386266095, + "grad_norm": 0.1272770528843975, + "learning_rate": 0.00019967375601101552, + "loss": 0.7349, + "step": 615 + }, + { + "epoch": 0.05507868383404864, + "grad_norm": 0.10269623112569996, + "learning_rate": 0.00019967141443715872, + "loss": 0.6924, + "step": 616 + }, + { + "epoch": 0.05516809728183119, + "grad_norm": 0.12438182128141506, + "learning_rate": 0.0001996690645039808, + "loss": 0.7046, + "step": 617 + }, + { + "epoch": 0.05525751072961373, + "grad_norm": 0.12072741928187801, + "learning_rate": 0.00019966670621167877, + "loss": 0.7569, + "step": 618 + }, + { + "epoch": 0.05534692417739628, + "grad_norm": 0.14811907532668595, + "learning_rate": 0.0001996643395604505, + "loss": 0.7619, + "step": 619 + }, + { + "epoch": 0.05543633762517883, + "grad_norm": 0.10662367150637247, + "learning_rate": 0.00019966196455049442, + "loss": 0.7203, + "step": 620 + }, + { + "epoch": 0.05552575107296137, + "grad_norm": 0.1337097693472042, + "learning_rate": 0.00019965958118200972, + "loss": 0.7602, + "step": 621 + }, + { + "epoch": 0.05561516452074392, + "grad_norm": 0.1143049278974281, + "learning_rate": 0.00019965718945519633, + "loss": 0.6899, + "step": 622 + }, + { + "epoch": 0.055704577968526466, + "grad_norm": 0.12360139635729928, + "learning_rate": 0.00019965478937025483, + "loss": 0.7006, + "step": 623 + }, + { + "epoch": 0.055793991416309016, + "grad_norm": 0.1125189834669064, + "learning_rate": 0.00019965238092738643, + "loss": 0.7204, + "step": 624 + }, + { + "epoch": 0.05588340486409156, + "grad_norm": 0.11102365475114467, + "learning_rate": 0.00019964996412679325, + "loss": 0.685, + "step": 625 + }, + { + "epoch": 0.055972818311874104, + "grad_norm": 0.10115570056518214, + "learning_rate": 0.00019964753896867788, + "loss": 0.7167, + "step": 626 + }, + { + "epoch": 0.056062231759656654, + "grad_norm": 0.09933274016665551, + "learning_rate": 0.00019964510545324382, + "loss": 0.7048, + "step": 627 + }, + { + "epoch": 0.0561516452074392, + "grad_norm": 0.09742232742217655, + "learning_rate": 0.00019964266358069504, + "loss": 0.6721, + "step": 628 + }, + { + "epoch": 0.05624105865522175, + "grad_norm": 0.11666390495021792, + "learning_rate": 0.00019964021335123645, + "loss": 0.7418, + "step": 629 + }, + { + "epoch": 0.05633047210300429, + "grad_norm": 0.11473253666729206, + "learning_rate": 0.00019963775476507348, + "loss": 0.7281, + "step": 630 + }, + { + "epoch": 0.056419885550786836, + "grad_norm": 0.12711773330978177, + "learning_rate": 0.00019963528782241237, + "loss": 0.7131, + "step": 631 + }, + { + "epoch": 0.056509298998569386, + "grad_norm": 0.1223419094131092, + "learning_rate": 0.00019963281252346, + "loss": 0.7004, + "step": 632 + }, + { + "epoch": 0.05659871244635193, + "grad_norm": 0.11465358010995287, + "learning_rate": 0.00019963032886842393, + "loss": 0.7318, + "step": 633 + }, + { + "epoch": 0.05668812589413448, + "grad_norm": 0.10464808891101271, + "learning_rate": 0.00019962783685751253, + "loss": 0.7427, + "step": 634 + }, + { + "epoch": 0.056777539341917024, + "grad_norm": 0.11516619034921198, + "learning_rate": 0.0001996253364909348, + "loss": 0.6932, + "step": 635 + }, + { + "epoch": 0.05686695278969957, + "grad_norm": 0.10659993544036764, + "learning_rate": 0.00019962282776890037, + "loss": 0.6935, + "step": 636 + }, + { + "epoch": 0.05695636623748212, + "grad_norm": 0.11201635213303553, + "learning_rate": 0.0001996203106916197, + "loss": 0.6985, + "step": 637 + }, + { + "epoch": 0.05704577968526466, + "grad_norm": 0.10180066964248725, + "learning_rate": 0.00019961778525930387, + "loss": 0.7474, + "step": 638 + }, + { + "epoch": 0.05713519313304721, + "grad_norm": 0.09745592081627077, + "learning_rate": 0.00019961525147216475, + "loss": 0.5956, + "step": 639 + }, + { + "epoch": 0.05722460658082976, + "grad_norm": 0.1078918625160968, + "learning_rate": 0.00019961270933041477, + "loss": 0.6889, + "step": 640 + }, + { + "epoch": 0.0573140200286123, + "grad_norm": 0.10663437751792978, + "learning_rate": 0.00019961015883426716, + "loss": 0.7028, + "step": 641 + }, + { + "epoch": 0.05740343347639485, + "grad_norm": 0.12144941030029725, + "learning_rate": 0.0001996075999839358, + "loss": 0.7363, + "step": 642 + }, + { + "epoch": 0.057492846924177395, + "grad_norm": 0.11791736279621355, + "learning_rate": 0.0001996050327796353, + "loss": 0.6639, + "step": 643 + }, + { + "epoch": 0.057582260371959945, + "grad_norm": 0.11312778079647914, + "learning_rate": 0.00019960245722158108, + "loss": 0.7124, + "step": 644 + }, + { + "epoch": 0.05767167381974249, + "grad_norm": 0.11011056816335249, + "learning_rate": 0.000199599873309989, + "loss": 0.693, + "step": 645 + }, + { + "epoch": 0.05776108726752503, + "grad_norm": 0.11828040701013658, + "learning_rate": 0.00019959728104507586, + "loss": 0.6845, + "step": 646 + }, + { + "epoch": 0.05785050071530758, + "grad_norm": 0.1252335272434012, + "learning_rate": 0.00019959468042705903, + "loss": 0.7557, + "step": 647 + }, + { + "epoch": 0.05793991416309013, + "grad_norm": 0.11245533253040425, + "learning_rate": 0.00019959207145615665, + "loss": 0.741, + "step": 648 + }, + { + "epoch": 0.05802932761087268, + "grad_norm": 0.1134288361127154, + "learning_rate": 0.00019958945413258748, + "loss": 0.6907, + "step": 649 + }, + { + "epoch": 0.05811874105865522, + "grad_norm": 0.13456706586357714, + "learning_rate": 0.00019958682845657108, + "loss": 0.7612, + "step": 650 + }, + { + "epoch": 0.058208154506437765, + "grad_norm": 0.12181618138508356, + "learning_rate": 0.00019958419442832765, + "loss": 0.7137, + "step": 651 + }, + { + "epoch": 0.058297567954220315, + "grad_norm": 0.11847793743560917, + "learning_rate": 0.00019958155204807812, + "loss": 0.751, + "step": 652 + }, + { + "epoch": 0.05838698140200286, + "grad_norm": 0.10364390193738336, + "learning_rate": 0.00019957890131604405, + "loss": 0.7071, + "step": 653 + }, + { + "epoch": 0.05847639484978541, + "grad_norm": 0.12223601254268882, + "learning_rate": 0.0001995762422324478, + "loss": 0.7472, + "step": 654 + }, + { + "epoch": 0.05856580829756795, + "grad_norm": 0.12807325334262984, + "learning_rate": 0.00019957357479751236, + "loss": 0.7264, + "step": 655 + }, + { + "epoch": 0.058655221745350504, + "grad_norm": 0.1145721264964095, + "learning_rate": 0.00019957089901146148, + "loss": 0.7252, + "step": 656 + }, + { + "epoch": 0.05874463519313305, + "grad_norm": 0.13594382098808488, + "learning_rate": 0.00019956821487451953, + "loss": 0.7138, + "step": 657 + }, + { + "epoch": 0.05883404864091559, + "grad_norm": 0.11711071850732034, + "learning_rate": 0.00019956552238691166, + "loss": 0.7507, + "step": 658 + }, + { + "epoch": 0.05892346208869814, + "grad_norm": 0.12416061121658581, + "learning_rate": 0.00019956282154886369, + "loss": 0.7391, + "step": 659 + }, + { + "epoch": 0.059012875536480686, + "grad_norm": 0.11290608658993254, + "learning_rate": 0.00019956011236060207, + "loss": 0.7196, + "step": 660 + }, + { + "epoch": 0.059102288984263236, + "grad_norm": 0.1141418165609107, + "learning_rate": 0.0001995573948223541, + "loss": 0.6929, + "step": 661 + }, + { + "epoch": 0.05919170243204578, + "grad_norm": 0.12532393723367152, + "learning_rate": 0.00019955466893434767, + "loss": 0.7576, + "step": 662 + }, + { + "epoch": 0.059281115879828324, + "grad_norm": 0.1101643896941401, + "learning_rate": 0.00019955193469681137, + "loss": 0.7105, + "step": 663 + }, + { + "epoch": 0.059370529327610874, + "grad_norm": 0.11195155794091761, + "learning_rate": 0.00019954919210997453, + "loss": 0.7246, + "step": 664 + }, + { + "epoch": 0.05945994277539342, + "grad_norm": 0.11451845777959563, + "learning_rate": 0.00019954644117406718, + "loss": 0.7275, + "step": 665 + }, + { + "epoch": 0.05954935622317597, + "grad_norm": 0.11237340011364323, + "learning_rate": 0.00019954368188932002, + "loss": 0.6846, + "step": 666 + }, + { + "epoch": 0.05963876967095851, + "grad_norm": 0.10871583289833159, + "learning_rate": 0.0001995409142559645, + "loss": 0.725, + "step": 667 + }, + { + "epoch": 0.059728183118741056, + "grad_norm": 0.1163102689317058, + "learning_rate": 0.0001995381382742327, + "loss": 0.7102, + "step": 668 + }, + { + "epoch": 0.059817596566523606, + "grad_norm": 0.11207744331429788, + "learning_rate": 0.00019953535394435744, + "loss": 0.7039, + "step": 669 + }, + { + "epoch": 0.05990701001430615, + "grad_norm": 0.10857342211719301, + "learning_rate": 0.0001995325612665723, + "loss": 0.7151, + "step": 670 + }, + { + "epoch": 0.0599964234620887, + "grad_norm": 0.11984881728151953, + "learning_rate": 0.00019952976024111143, + "loss": 0.7134, + "step": 671 + }, + { + "epoch": 0.060085836909871244, + "grad_norm": 0.12233460343984336, + "learning_rate": 0.00019952695086820975, + "loss": 0.7549, + "step": 672 + }, + { + "epoch": 0.06017525035765379, + "grad_norm": 0.12784318254089702, + "learning_rate": 0.0001995241331481029, + "loss": 0.7026, + "step": 673 + }, + { + "epoch": 0.06026466380543634, + "grad_norm": 0.11880656823804864, + "learning_rate": 0.00019952130708102722, + "loss": 0.7358, + "step": 674 + }, + { + "epoch": 0.06035407725321888, + "grad_norm": 0.12424959913416043, + "learning_rate": 0.0001995184726672197, + "loss": 0.7022, + "step": 675 + }, + { + "epoch": 0.06044349070100143, + "grad_norm": 0.11322948928177734, + "learning_rate": 0.00019951562990691807, + "loss": 0.7414, + "step": 676 + }, + { + "epoch": 0.06053290414878398, + "grad_norm": 0.1275016135667461, + "learning_rate": 0.00019951277880036073, + "loss": 0.7313, + "step": 677 + }, + { + "epoch": 0.06062231759656652, + "grad_norm": 0.1262313374855093, + "learning_rate": 0.0001995099193477868, + "loss": 0.7559, + "step": 678 + }, + { + "epoch": 0.06071173104434907, + "grad_norm": 0.10125065451536432, + "learning_rate": 0.00019950705154943613, + "loss": 0.5677, + "step": 679 + }, + { + "epoch": 0.060801144492131615, + "grad_norm": 0.16255796192454, + "learning_rate": 0.00019950417540554925, + "loss": 0.7045, + "step": 680 + }, + { + "epoch": 0.060890557939914165, + "grad_norm": 0.08839990408013344, + "learning_rate": 0.00019950129091636732, + "loss": 0.5946, + "step": 681 + }, + { + "epoch": 0.06097997138769671, + "grad_norm": 0.14509418643898894, + "learning_rate": 0.00019949839808213227, + "loss": 0.778, + "step": 682 + }, + { + "epoch": 0.06106938483547925, + "grad_norm": 0.12177444712259722, + "learning_rate": 0.00019949549690308677, + "loss": 0.7624, + "step": 683 + }, + { + "epoch": 0.0611587982832618, + "grad_norm": 0.12068006489364042, + "learning_rate": 0.0001994925873794741, + "loss": 0.7081, + "step": 684 + }, + { + "epoch": 0.06124821173104435, + "grad_norm": 0.1390032969018271, + "learning_rate": 0.00019948966951153824, + "loss": 0.7197, + "step": 685 + }, + { + "epoch": 0.0613376251788269, + "grad_norm": 0.1209108576040945, + "learning_rate": 0.000199486743299524, + "loss": 0.7509, + "step": 686 + }, + { + "epoch": 0.06142703862660944, + "grad_norm": 0.11949722041776238, + "learning_rate": 0.00019948380874367674, + "loss": 0.7842, + "step": 687 + }, + { + "epoch": 0.06151645207439199, + "grad_norm": 0.11559187350575702, + "learning_rate": 0.00019948086584424256, + "loss": 0.7022, + "step": 688 + }, + { + "epoch": 0.061605865522174535, + "grad_norm": 0.09492258171753662, + "learning_rate": 0.00019947791460146833, + "loss": 0.6018, + "step": 689 + }, + { + "epoch": 0.06169527896995708, + "grad_norm": 0.12373053864727085, + "learning_rate": 0.00019947495501560153, + "loss": 0.7303, + "step": 690 + }, + { + "epoch": 0.06178469241773963, + "grad_norm": 0.12097659299434896, + "learning_rate": 0.00019947198708689042, + "loss": 0.7513, + "step": 691 + }, + { + "epoch": 0.06187410586552217, + "grad_norm": 0.12080976296236795, + "learning_rate": 0.00019946901081558386, + "loss": 0.7394, + "step": 692 + }, + { + "epoch": 0.061963519313304724, + "grad_norm": 0.13965320847521365, + "learning_rate": 0.0001994660262019315, + "loss": 0.7867, + "step": 693 + }, + { + "epoch": 0.06205293276108727, + "grad_norm": 0.13598641858270544, + "learning_rate": 0.0001994630332461836, + "loss": 0.7201, + "step": 694 + }, + { + "epoch": 0.06214234620886981, + "grad_norm": 0.12038495027902943, + "learning_rate": 0.00019946003194859125, + "loss": 0.7207, + "step": 695 + }, + { + "epoch": 0.06223175965665236, + "grad_norm": 0.10122942863849041, + "learning_rate": 0.00019945702230940614, + "loss": 0.7218, + "step": 696 + }, + { + "epoch": 0.062321173104434906, + "grad_norm": 0.1307310043062138, + "learning_rate": 0.0001994540043288807, + "loss": 0.7365, + "step": 697 + }, + { + "epoch": 0.062410586552217456, + "grad_norm": 0.11157432438197508, + "learning_rate": 0.00019945097800726802, + "loss": 0.7308, + "step": 698 + }, + { + "epoch": 0.0625, + "grad_norm": 0.08045774617023814, + "learning_rate": 0.00019944794334482194, + "loss": 0.6003, + "step": 699 + }, + { + "epoch": 0.06258941344778254, + "grad_norm": 0.10184587527776315, + "learning_rate": 0.0001994449003417969, + "loss": 0.6993, + "step": 700 + }, + { + "epoch": 0.06267882689556509, + "grad_norm": 0.10443072116770834, + "learning_rate": 0.00019944184899844822, + "loss": 0.6842, + "step": 701 + }, + { + "epoch": 0.06276824034334764, + "grad_norm": 0.11298109912007183, + "learning_rate": 0.00019943878931503176, + "loss": 0.7759, + "step": 702 + }, + { + "epoch": 0.06285765379113019, + "grad_norm": 0.11520438195625131, + "learning_rate": 0.0001994357212918041, + "loss": 0.7428, + "step": 703 + }, + { + "epoch": 0.06294706723891273, + "grad_norm": 0.11294767799869675, + "learning_rate": 0.00019943264492902258, + "loss": 0.7286, + "step": 704 + }, + { + "epoch": 0.06303648068669528, + "grad_norm": 0.11035907009437985, + "learning_rate": 0.00019942956022694523, + "loss": 0.694, + "step": 705 + }, + { + "epoch": 0.06312589413447782, + "grad_norm": 0.12452601974413496, + "learning_rate": 0.00019942646718583076, + "loss": 0.7051, + "step": 706 + }, + { + "epoch": 0.06321530758226038, + "grad_norm": 0.1263725538348333, + "learning_rate": 0.00019942336580593852, + "loss": 0.7104, + "step": 707 + }, + { + "epoch": 0.06330472103004292, + "grad_norm": 0.11039002199640827, + "learning_rate": 0.0001994202560875287, + "loss": 0.7462, + "step": 708 + }, + { + "epoch": 0.06339413447782546, + "grad_norm": 0.12112971334548035, + "learning_rate": 0.00019941713803086204, + "loss": 0.709, + "step": 709 + }, + { + "epoch": 0.06348354792560801, + "grad_norm": 0.10814321559431908, + "learning_rate": 0.0001994140116362001, + "loss": 0.7118, + "step": 710 + }, + { + "epoch": 0.06357296137339055, + "grad_norm": 0.11530094201473134, + "learning_rate": 0.0001994108769038051, + "loss": 0.7316, + "step": 711 + }, + { + "epoch": 0.06366237482117311, + "grad_norm": 0.1242804064639501, + "learning_rate": 0.00019940773383393987, + "loss": 0.7592, + "step": 712 + }, + { + "epoch": 0.06375178826895565, + "grad_norm": 0.1079458666296503, + "learning_rate": 0.00019940458242686802, + "loss": 0.7536, + "step": 713 + }, + { + "epoch": 0.0638412017167382, + "grad_norm": 0.12224585659677908, + "learning_rate": 0.00019940142268285395, + "loss": 0.7322, + "step": 714 + }, + { + "epoch": 0.06393061516452074, + "grad_norm": 0.12395710637080087, + "learning_rate": 0.0001993982546021626, + "loss": 0.7088, + "step": 715 + }, + { + "epoch": 0.06402002861230328, + "grad_norm": 0.12992332968148873, + "learning_rate": 0.00019939507818505966, + "loss": 0.7565, + "step": 716 + }, + { + "epoch": 0.06410944206008584, + "grad_norm": 0.1117590305720774, + "learning_rate": 0.00019939189343181157, + "loss": 0.7008, + "step": 717 + }, + { + "epoch": 0.06419885550786839, + "grad_norm": 0.12180863848930482, + "learning_rate": 0.00019938870034268542, + "loss": 0.6894, + "step": 718 + }, + { + "epoch": 0.06428826895565093, + "grad_norm": 0.1292117440511776, + "learning_rate": 0.00019938549891794898, + "loss": 0.7477, + "step": 719 + }, + { + "epoch": 0.06437768240343347, + "grad_norm": 0.11476588898138726, + "learning_rate": 0.0001993822891578708, + "loss": 0.7367, + "step": 720 + }, + { + "epoch": 0.06446709585121602, + "grad_norm": 0.11619327094250655, + "learning_rate": 0.00019937907106272002, + "loss": 0.7158, + "step": 721 + }, + { + "epoch": 0.06455650929899857, + "grad_norm": 0.10875633750128895, + "learning_rate": 0.00019937584463276657, + "loss": 0.7169, + "step": 722 + }, + { + "epoch": 0.06464592274678112, + "grad_norm": 0.13219152366832768, + "learning_rate": 0.00019937260986828108, + "loss": 0.7407, + "step": 723 + }, + { + "epoch": 0.06473533619456366, + "grad_norm": 0.11084427860043608, + "learning_rate": 0.0001993693667695348, + "loss": 0.6964, + "step": 724 + }, + { + "epoch": 0.0648247496423462, + "grad_norm": 0.13038621079325124, + "learning_rate": 0.0001993661153367997, + "loss": 0.7277, + "step": 725 + }, + { + "epoch": 0.06491416309012875, + "grad_norm": 0.11964838489617755, + "learning_rate": 0.00019936285557034858, + "loss": 0.758, + "step": 726 + }, + { + "epoch": 0.0650035765379113, + "grad_norm": 0.1136324319602629, + "learning_rate": 0.00019935958747045472, + "loss": 0.7065, + "step": 727 + }, + { + "epoch": 0.06509298998569385, + "grad_norm": 0.11704701021160062, + "learning_rate": 0.00019935631103739225, + "loss": 0.7027, + "step": 728 + }, + { + "epoch": 0.0651824034334764, + "grad_norm": 0.10911551576429433, + "learning_rate": 0.00019935302627143594, + "loss": 0.6991, + "step": 729 + }, + { + "epoch": 0.06527181688125894, + "grad_norm": 0.08690960004924812, + "learning_rate": 0.00019934973317286138, + "loss": 0.5953, + "step": 730 + }, + { + "epoch": 0.0653612303290415, + "grad_norm": 0.12342881145118767, + "learning_rate": 0.00019934643174194462, + "loss": 0.7108, + "step": 731 + }, + { + "epoch": 0.06545064377682404, + "grad_norm": 0.1258919021166248, + "learning_rate": 0.00019934312197896262, + "loss": 0.7079, + "step": 732 + }, + { + "epoch": 0.06554005722460658, + "grad_norm": 0.12215408305540847, + "learning_rate": 0.00019933980388419297, + "loss": 0.7164, + "step": 733 + }, + { + "epoch": 0.06562947067238913, + "grad_norm": 0.11957393507544746, + "learning_rate": 0.00019933647745791393, + "loss": 0.7291, + "step": 734 + }, + { + "epoch": 0.06571888412017167, + "grad_norm": 0.13641277031430527, + "learning_rate": 0.0001993331427004045, + "loss": 0.7381, + "step": 735 + }, + { + "epoch": 0.06580829756795423, + "grad_norm": 0.11370861208428537, + "learning_rate": 0.00019932979961194435, + "loss": 0.6973, + "step": 736 + }, + { + "epoch": 0.06589771101573677, + "grad_norm": 0.1163151425135368, + "learning_rate": 0.00019932644819281389, + "loss": 0.7129, + "step": 737 + }, + { + "epoch": 0.06598712446351931, + "grad_norm": 0.11385329868213157, + "learning_rate": 0.00019932308844329417, + "loss": 0.709, + "step": 738 + }, + { + "epoch": 0.06607653791130186, + "grad_norm": 0.10581146824870467, + "learning_rate": 0.00019931972036366696, + "loss": 0.6667, + "step": 739 + }, + { + "epoch": 0.0661659513590844, + "grad_norm": 0.11936537198109698, + "learning_rate": 0.00019931634395421475, + "loss": 0.7193, + "step": 740 + }, + { + "epoch": 0.06625536480686696, + "grad_norm": 0.1192599952999057, + "learning_rate": 0.0001993129592152207, + "loss": 0.7119, + "step": 741 + }, + { + "epoch": 0.0663447782546495, + "grad_norm": 0.12381874735285966, + "learning_rate": 0.00019930956614696874, + "loss": 0.7318, + "step": 742 + }, + { + "epoch": 0.06643419170243205, + "grad_norm": 0.12932955016317565, + "learning_rate": 0.0001993061647497434, + "loss": 0.7521, + "step": 743 + }, + { + "epoch": 0.06652360515021459, + "grad_norm": 0.1087070857165062, + "learning_rate": 0.0001993027550238299, + "loss": 0.7004, + "step": 744 + }, + { + "epoch": 0.06661301859799713, + "grad_norm": 0.13394310970145654, + "learning_rate": 0.00019929933696951433, + "loss": 0.748, + "step": 745 + }, + { + "epoch": 0.06670243204577969, + "grad_norm": 0.12541636793161148, + "learning_rate": 0.00019929591058708324, + "loss": 0.7471, + "step": 746 + }, + { + "epoch": 0.06679184549356224, + "grad_norm": 0.11786969328045184, + "learning_rate": 0.00019929247587682406, + "loss": 0.7184, + "step": 747 + }, + { + "epoch": 0.06688125894134478, + "grad_norm": 0.11518632443200923, + "learning_rate": 0.00019928903283902486, + "loss": 0.6556, + "step": 748 + }, + { + "epoch": 0.06697067238912732, + "grad_norm": 0.13181274898311382, + "learning_rate": 0.00019928558147397439, + "loss": 0.7044, + "step": 749 + }, + { + "epoch": 0.06706008583690987, + "grad_norm": 0.12232927131258715, + "learning_rate": 0.0001992821217819621, + "loss": 0.7154, + "step": 750 + }, + { + "epoch": 0.06714949928469242, + "grad_norm": 0.08560295989624506, + "learning_rate": 0.00019927865376327816, + "loss": 0.5885, + "step": 751 + }, + { + "epoch": 0.06723891273247497, + "grad_norm": 0.14187803853187703, + "learning_rate": 0.00019927517741821343, + "loss": 0.6981, + "step": 752 + }, + { + "epoch": 0.06732832618025751, + "grad_norm": 0.12047911503457301, + "learning_rate": 0.00019927169274705945, + "loss": 0.7021, + "step": 753 + }, + { + "epoch": 0.06741773962804005, + "grad_norm": 0.11976135893565193, + "learning_rate": 0.00019926819975010852, + "loss": 0.6778, + "step": 754 + }, + { + "epoch": 0.0675071530758226, + "grad_norm": 0.12086948711082494, + "learning_rate": 0.00019926469842765352, + "loss": 0.7307, + "step": 755 + }, + { + "epoch": 0.06759656652360516, + "grad_norm": 0.12723860624134717, + "learning_rate": 0.00019926118877998817, + "loss": 0.7527, + "step": 756 + }, + { + "epoch": 0.0676859799713877, + "grad_norm": 0.10709916239569715, + "learning_rate": 0.0001992576708074068, + "loss": 0.701, + "step": 757 + }, + { + "epoch": 0.06777539341917024, + "grad_norm": 0.11920921157180489, + "learning_rate": 0.00019925414451020442, + "loss": 0.6803, + "step": 758 + }, + { + "epoch": 0.06786480686695279, + "grad_norm": 0.1200114647398503, + "learning_rate": 0.00019925060988867682, + "loss": 0.7173, + "step": 759 + }, + { + "epoch": 0.06795422031473533, + "grad_norm": 0.10588135384777184, + "learning_rate": 0.00019924706694312045, + "loss": 0.6786, + "step": 760 + }, + { + "epoch": 0.06804363376251789, + "grad_norm": 0.1244877418364086, + "learning_rate": 0.00019924351567383243, + "loss": 0.6918, + "step": 761 + }, + { + "epoch": 0.06813304721030043, + "grad_norm": 0.1395466393726822, + "learning_rate": 0.00019923995608111058, + "loss": 0.7447, + "step": 762 + }, + { + "epoch": 0.06822246065808298, + "grad_norm": 0.14459927368394349, + "learning_rate": 0.0001992363881652535, + "loss": 0.7607, + "step": 763 + }, + { + "epoch": 0.06831187410586552, + "grad_norm": 0.13371942400338285, + "learning_rate": 0.0001992328119265604, + "loss": 0.7106, + "step": 764 + }, + { + "epoch": 0.06840128755364806, + "grad_norm": 0.10791213299984778, + "learning_rate": 0.0001992292273653312, + "loss": 0.6753, + "step": 765 + }, + { + "epoch": 0.06849070100143062, + "grad_norm": 0.11196779126844479, + "learning_rate": 0.00019922563448186652, + "loss": 0.7189, + "step": 766 + }, + { + "epoch": 0.06858011444921316, + "grad_norm": 0.11763013811207632, + "learning_rate": 0.00019922203327646772, + "loss": 0.7435, + "step": 767 + }, + { + "epoch": 0.06866952789699571, + "grad_norm": 0.11742358384450564, + "learning_rate": 0.0001992184237494368, + "loss": 0.7189, + "step": 768 + }, + { + "epoch": 0.06875894134477825, + "grad_norm": 0.12820327800389347, + "learning_rate": 0.00019921480590107653, + "loss": 0.7321, + "step": 769 + }, + { + "epoch": 0.0688483547925608, + "grad_norm": 0.1081989027307377, + "learning_rate": 0.0001992111797316903, + "loss": 0.7226, + "step": 770 + }, + { + "epoch": 0.06893776824034335, + "grad_norm": 0.12328211585120659, + "learning_rate": 0.00019920754524158226, + "loss": 0.7077, + "step": 771 + }, + { + "epoch": 0.0690271816881259, + "grad_norm": 0.11028261294379708, + "learning_rate": 0.00019920390243105716, + "loss": 0.6843, + "step": 772 + }, + { + "epoch": 0.06911659513590844, + "grad_norm": 0.09349207994925489, + "learning_rate": 0.00019920025130042062, + "loss": 0.6164, + "step": 773 + }, + { + "epoch": 0.06920600858369098, + "grad_norm": 0.10325239895653397, + "learning_rate": 0.0001991965918499788, + "loss": 0.6756, + "step": 774 + }, + { + "epoch": 0.06929542203147353, + "grad_norm": 0.10826998457128595, + "learning_rate": 0.00019919292408003862, + "loss": 0.6969, + "step": 775 + }, + { + "epoch": 0.06938483547925609, + "grad_norm": 0.12427239098721765, + "learning_rate": 0.0001991892479909077, + "loss": 0.7374, + "step": 776 + }, + { + "epoch": 0.06947424892703863, + "grad_norm": 0.11886131111091837, + "learning_rate": 0.0001991855635828943, + "loss": 0.7296, + "step": 777 + }, + { + "epoch": 0.06956366237482117, + "grad_norm": 0.11820247671544, + "learning_rate": 0.00019918187085630752, + "loss": 0.7177, + "step": 778 + }, + { + "epoch": 0.06965307582260372, + "grad_norm": 0.12400422508613523, + "learning_rate": 0.000199178169811457, + "loss": 0.754, + "step": 779 + }, + { + "epoch": 0.06974248927038626, + "grad_norm": 0.11464223220231128, + "learning_rate": 0.00019917446044865312, + "loss": 0.7169, + "step": 780 + }, + { + "epoch": 0.06983190271816882, + "grad_norm": 0.09370369413710516, + "learning_rate": 0.00019917074276820705, + "loss": 0.5907, + "step": 781 + }, + { + "epoch": 0.06992131616595136, + "grad_norm": 0.1088261483016479, + "learning_rate": 0.00019916701677043054, + "loss": 0.7051, + "step": 782 + }, + { + "epoch": 0.0700107296137339, + "grad_norm": 0.1046738380272891, + "learning_rate": 0.00019916328245563611, + "loss": 0.718, + "step": 783 + }, + { + "epoch": 0.07010014306151645, + "grad_norm": 0.12392512719517194, + "learning_rate": 0.0001991595398241369, + "loss": 0.7732, + "step": 784 + }, + { + "epoch": 0.07018955650929899, + "grad_norm": 0.10177047120300407, + "learning_rate": 0.0001991557888762469, + "loss": 0.7078, + "step": 785 + }, + { + "epoch": 0.07027896995708155, + "grad_norm": 0.11860555575258958, + "learning_rate": 0.00019915202961228058, + "loss": 0.6936, + "step": 786 + }, + { + "epoch": 0.0703683834048641, + "grad_norm": 0.11991420681588535, + "learning_rate": 0.00019914826203255333, + "loss": 0.6921, + "step": 787 + }, + { + "epoch": 0.07045779685264664, + "grad_norm": 0.1187688976298137, + "learning_rate": 0.00019914448613738106, + "loss": 0.7014, + "step": 788 + }, + { + "epoch": 0.07054721030042918, + "grad_norm": 0.11571748716187652, + "learning_rate": 0.00019914070192708047, + "loss": 0.7225, + "step": 789 + }, + { + "epoch": 0.07063662374821172, + "grad_norm": 0.10987513496863859, + "learning_rate": 0.00019913690940196894, + "loss": 0.724, + "step": 790 + }, + { + "epoch": 0.07072603719599428, + "grad_norm": 0.1267720832359393, + "learning_rate": 0.00019913310856236452, + "loss": 0.7392, + "step": 791 + }, + { + "epoch": 0.07081545064377683, + "grad_norm": 0.12392817426348426, + "learning_rate": 0.00019912929940858607, + "loss": 0.7416, + "step": 792 + }, + { + "epoch": 0.07090486409155937, + "grad_norm": 0.10663731144014597, + "learning_rate": 0.00019912548194095297, + "loss": 0.6629, + "step": 793 + }, + { + "epoch": 0.07099427753934191, + "grad_norm": 0.10114715904871975, + "learning_rate": 0.0001991216561597854, + "loss": 0.7477, + "step": 794 + }, + { + "epoch": 0.07108369098712447, + "grad_norm": 0.1191663624303711, + "learning_rate": 0.00019911782206540423, + "loss": 0.7549, + "step": 795 + }, + { + "epoch": 0.07117310443490701, + "grad_norm": 0.12096940252281077, + "learning_rate": 0.00019911397965813107, + "loss": 0.7414, + "step": 796 + }, + { + "epoch": 0.07126251788268956, + "grad_norm": 0.10362217291965285, + "learning_rate": 0.0001991101289382881, + "loss": 0.7255, + "step": 797 + }, + { + "epoch": 0.0713519313304721, + "grad_norm": 0.11686283189619318, + "learning_rate": 0.0001991062699061983, + "loss": 0.7044, + "step": 798 + }, + { + "epoch": 0.07144134477825465, + "grad_norm": 0.11150850670553546, + "learning_rate": 0.00019910240256218535, + "loss": 0.686, + "step": 799 + }, + { + "epoch": 0.0715307582260372, + "grad_norm": 0.12932215230283942, + "learning_rate": 0.00019909852690657359, + "loss": 0.7157, + "step": 800 + }, + { + "epoch": 0.07162017167381975, + "grad_norm": 0.11720681834088088, + "learning_rate": 0.00019909464293968804, + "loss": 0.7086, + "step": 801 + }, + { + "epoch": 0.07170958512160229, + "grad_norm": 0.11362094598828655, + "learning_rate": 0.0001990907506618545, + "loss": 0.7197, + "step": 802 + }, + { + "epoch": 0.07179899856938483, + "grad_norm": 0.12097101557045933, + "learning_rate": 0.00019908685007339932, + "loss": 0.7557, + "step": 803 + }, + { + "epoch": 0.07188841201716738, + "grad_norm": 0.10627689302772272, + "learning_rate": 0.00019908294117464975, + "loss": 0.6949, + "step": 804 + }, + { + "epoch": 0.07197782546494993, + "grad_norm": 0.1065278647657006, + "learning_rate": 0.00019907902396593352, + "loss": 0.6676, + "step": 805 + }, + { + "epoch": 0.07206723891273248, + "grad_norm": 0.11227925985614019, + "learning_rate": 0.00019907509844757925, + "loss": 0.7274, + "step": 806 + }, + { + "epoch": 0.07215665236051502, + "grad_norm": 0.10792119284596699, + "learning_rate": 0.00019907116461991605, + "loss": 0.7028, + "step": 807 + }, + { + "epoch": 0.07224606580829757, + "grad_norm": 0.12273267570731222, + "learning_rate": 0.00019906722248327397, + "loss": 0.7451, + "step": 808 + }, + { + "epoch": 0.07233547925608011, + "grad_norm": 0.12269172364833716, + "learning_rate": 0.0001990632720379836, + "loss": 0.7261, + "step": 809 + }, + { + "epoch": 0.07242489270386267, + "grad_norm": 0.1137117399123865, + "learning_rate": 0.00019905931328437624, + "loss": 0.7088, + "step": 810 + }, + { + "epoch": 0.07251430615164521, + "grad_norm": 0.11606801108213334, + "learning_rate": 0.00019905534622278388, + "loss": 0.7425, + "step": 811 + }, + { + "epoch": 0.07260371959942775, + "grad_norm": 0.12688811582935725, + "learning_rate": 0.00019905137085353926, + "loss": 0.7091, + "step": 812 + }, + { + "epoch": 0.0726931330472103, + "grad_norm": 0.11320867381286354, + "learning_rate": 0.0001990473871769758, + "loss": 0.679, + "step": 813 + }, + { + "epoch": 0.07278254649499284, + "grad_norm": 0.12220248475981585, + "learning_rate": 0.00019904339519342764, + "loss": 0.713, + "step": 814 + }, + { + "epoch": 0.0728719599427754, + "grad_norm": 0.11974052699586082, + "learning_rate": 0.00019903939490322948, + "loss": 0.7242, + "step": 815 + }, + { + "epoch": 0.07296137339055794, + "grad_norm": 0.104765472209328, + "learning_rate": 0.0001990353863067169, + "loss": 0.7216, + "step": 816 + }, + { + "epoch": 0.07305078683834049, + "grad_norm": 0.10662777854981569, + "learning_rate": 0.00019903136940422605, + "loss": 0.7099, + "step": 817 + }, + { + "epoch": 0.07314020028612303, + "grad_norm": 0.10775053023602181, + "learning_rate": 0.00019902734419609389, + "loss": 0.7137, + "step": 818 + }, + { + "epoch": 0.07322961373390557, + "grad_norm": 0.10943959169281135, + "learning_rate": 0.00019902331068265793, + "loss": 0.7438, + "step": 819 + }, + { + "epoch": 0.07331902718168813, + "grad_norm": 0.11451279786933859, + "learning_rate": 0.00019901926886425653, + "loss": 0.7266, + "step": 820 + }, + { + "epoch": 0.07340844062947068, + "grad_norm": 0.12821443726762802, + "learning_rate": 0.00019901521874122859, + "loss": 0.6975, + "step": 821 + }, + { + "epoch": 0.07349785407725322, + "grad_norm": 0.1212553619304692, + "learning_rate": 0.00019901116031391386, + "loss": 0.7036, + "step": 822 + }, + { + "epoch": 0.07358726752503576, + "grad_norm": 0.10757339662284422, + "learning_rate": 0.0001990070935826527, + "loss": 0.69, + "step": 823 + }, + { + "epoch": 0.0736766809728183, + "grad_norm": 0.12153199623119398, + "learning_rate": 0.00019900301854778617, + "loss": 0.6793, + "step": 824 + }, + { + "epoch": 0.07376609442060086, + "grad_norm": 0.11380919772615132, + "learning_rate": 0.00019899893520965604, + "loss": 0.7358, + "step": 825 + }, + { + "epoch": 0.07385550786838341, + "grad_norm": 0.11970138303303433, + "learning_rate": 0.00019899484356860473, + "loss": 0.743, + "step": 826 + }, + { + "epoch": 0.07394492131616595, + "grad_norm": 0.12296126974722046, + "learning_rate": 0.00019899074362497552, + "loss": 0.7511, + "step": 827 + }, + { + "epoch": 0.0740343347639485, + "grad_norm": 0.11531880008848143, + "learning_rate": 0.00019898663537911213, + "loss": 0.7107, + "step": 828 + }, + { + "epoch": 0.07412374821173104, + "grad_norm": 0.10809094058693469, + "learning_rate": 0.00019898251883135922, + "loss": 0.6784, + "step": 829 + }, + { + "epoch": 0.0742131616595136, + "grad_norm": 0.12784934132571188, + "learning_rate": 0.00019897839398206197, + "loss": 0.7199, + "step": 830 + }, + { + "epoch": 0.07430257510729614, + "grad_norm": 0.12277751079252588, + "learning_rate": 0.00019897426083156634, + "loss": 0.7087, + "step": 831 + }, + { + "epoch": 0.07439198855507868, + "grad_norm": 0.11469299022141605, + "learning_rate": 0.000198970119380219, + "loss": 0.7267, + "step": 832 + }, + { + "epoch": 0.07448140200286123, + "grad_norm": 0.11377672563137296, + "learning_rate": 0.0001989659696283673, + "loss": 0.7504, + "step": 833 + }, + { + "epoch": 0.07457081545064377, + "grad_norm": 0.1272174230794, + "learning_rate": 0.00019896181157635923, + "loss": 0.7073, + "step": 834 + }, + { + "epoch": 0.07466022889842633, + "grad_norm": 0.10472467704660554, + "learning_rate": 0.0001989576452245435, + "loss": 0.7095, + "step": 835 + }, + { + "epoch": 0.07474964234620887, + "grad_norm": 0.12798218542382755, + "learning_rate": 0.00019895347057326962, + "loss": 0.6826, + "step": 836 + }, + { + "epoch": 0.07483905579399142, + "grad_norm": 0.13111665779100562, + "learning_rate": 0.00019894928762288766, + "loss": 0.7227, + "step": 837 + }, + { + "epoch": 0.07492846924177396, + "grad_norm": 0.10246053239779968, + "learning_rate": 0.00019894509637374843, + "loss": 0.6868, + "step": 838 + }, + { + "epoch": 0.0750178826895565, + "grad_norm": 0.10571170922250024, + "learning_rate": 0.00019894089682620349, + "loss": 0.7169, + "step": 839 + }, + { + "epoch": 0.07510729613733906, + "grad_norm": 0.10144139281661389, + "learning_rate": 0.00019893668898060502, + "loss": 0.6909, + "step": 840 + }, + { + "epoch": 0.0751967095851216, + "grad_norm": 0.13711550664887814, + "learning_rate": 0.00019893247283730593, + "loss": 0.7346, + "step": 841 + }, + { + "epoch": 0.07528612303290415, + "grad_norm": 0.11830084732535436, + "learning_rate": 0.0001989282483966598, + "loss": 0.7099, + "step": 842 + }, + { + "epoch": 0.07537553648068669, + "grad_norm": 0.11981830486618023, + "learning_rate": 0.00019892401565902096, + "loss": 0.7236, + "step": 843 + }, + { + "epoch": 0.07546494992846924, + "grad_norm": 0.11750927894690985, + "learning_rate": 0.0001989197746247444, + "loss": 0.701, + "step": 844 + }, + { + "epoch": 0.07555436337625179, + "grad_norm": 0.08198635017958308, + "learning_rate": 0.0001989155252941858, + "loss": 0.5903, + "step": 845 + }, + { + "epoch": 0.07564377682403434, + "grad_norm": 0.11249044753934653, + "learning_rate": 0.00019891126766770158, + "loss": 0.7114, + "step": 846 + }, + { + "epoch": 0.07573319027181688, + "grad_norm": 0.10048848649933247, + "learning_rate": 0.00019890700174564878, + "loss": 0.6791, + "step": 847 + }, + { + "epoch": 0.07582260371959942, + "grad_norm": 0.11618045507552753, + "learning_rate": 0.00019890272752838518, + "loss": 0.6803, + "step": 848 + }, + { + "epoch": 0.07591201716738197, + "grad_norm": 0.11294070078498683, + "learning_rate": 0.00019889844501626928, + "loss": 0.6936, + "step": 849 + }, + { + "epoch": 0.07600143061516453, + "grad_norm": 0.1153288512373971, + "learning_rate": 0.00019889415420966026, + "loss": 0.7123, + "step": 850 + }, + { + "epoch": 0.07609084406294707, + "grad_norm": 0.1149617966032092, + "learning_rate": 0.00019888985510891792, + "loss": 0.7106, + "step": 851 + }, + { + "epoch": 0.07618025751072961, + "grad_norm": 0.11103600078739605, + "learning_rate": 0.00019888554771440288, + "loss": 0.7212, + "step": 852 + }, + { + "epoch": 0.07626967095851216, + "grad_norm": 0.10500695933830785, + "learning_rate": 0.00019888123202647636, + "loss": 0.6818, + "step": 853 + }, + { + "epoch": 0.0763590844062947, + "grad_norm": 0.12095063159124995, + "learning_rate": 0.00019887690804550035, + "loss": 0.7281, + "step": 854 + }, + { + "epoch": 0.07644849785407726, + "grad_norm": 0.11207199587489737, + "learning_rate": 0.00019887257577183744, + "loss": 0.6956, + "step": 855 + }, + { + "epoch": 0.0765379113018598, + "grad_norm": 0.10860229325928329, + "learning_rate": 0.00019886823520585105, + "loss": 0.6806, + "step": 856 + }, + { + "epoch": 0.07662732474964234, + "grad_norm": 0.10751348455795418, + "learning_rate": 0.00019886388634790517, + "loss": 0.7269, + "step": 857 + }, + { + "epoch": 0.07671673819742489, + "grad_norm": 0.1117702471513851, + "learning_rate": 0.0001988595291983645, + "loss": 0.7029, + "step": 858 + }, + { + "epoch": 0.07680615164520745, + "grad_norm": 0.10164756004944163, + "learning_rate": 0.00019885516375759457, + "loss": 0.7112, + "step": 859 + }, + { + "epoch": 0.07689556509298999, + "grad_norm": 0.11949049485137775, + "learning_rate": 0.00019885079002596138, + "loss": 0.7084, + "step": 860 + }, + { + "epoch": 0.07698497854077253, + "grad_norm": 0.12173602602567372, + "learning_rate": 0.00019884640800383186, + "loss": 0.7211, + "step": 861 + }, + { + "epoch": 0.07707439198855508, + "grad_norm": 0.12812704720192927, + "learning_rate": 0.00019884201769157346, + "loss": 0.7072, + "step": 862 + }, + { + "epoch": 0.07716380543633762, + "grad_norm": 0.09851483196157856, + "learning_rate": 0.0001988376190895544, + "loss": 0.7099, + "step": 863 + }, + { + "epoch": 0.07725321888412018, + "grad_norm": 0.09220576622209264, + "learning_rate": 0.0001988332121981436, + "loss": 0.6719, + "step": 864 + }, + { + "epoch": 0.07734263233190272, + "grad_norm": 0.10457895257754841, + "learning_rate": 0.00019882879701771063, + "loss": 0.7216, + "step": 865 + }, + { + "epoch": 0.07743204577968527, + "grad_norm": 0.12018305162376172, + "learning_rate": 0.00019882437354862585, + "loss": 0.7188, + "step": 866 + }, + { + "epoch": 0.07752145922746781, + "grad_norm": 0.09774872068665684, + "learning_rate": 0.00019881994179126017, + "loss": 0.6788, + "step": 867 + }, + { + "epoch": 0.07761087267525035, + "grad_norm": 0.13967708323237948, + "learning_rate": 0.00019881550174598536, + "loss": 0.7454, + "step": 868 + }, + { + "epoch": 0.07770028612303291, + "grad_norm": 0.12209317445790095, + "learning_rate": 0.00019881105341317372, + "loss": 0.7239, + "step": 869 + }, + { + "epoch": 0.07778969957081545, + "grad_norm": 0.12666858642769663, + "learning_rate": 0.00019880659679319838, + "loss": 0.7543, + "step": 870 + }, + { + "epoch": 0.077879113018598, + "grad_norm": 0.13486877337411107, + "learning_rate": 0.00019880213188643307, + "loss": 0.7257, + "step": 871 + }, + { + "epoch": 0.07796852646638054, + "grad_norm": 0.13628906610921968, + "learning_rate": 0.00019879765869325233, + "loss": 0.6979, + "step": 872 + }, + { + "epoch": 0.07805793991416309, + "grad_norm": 0.1179446695054151, + "learning_rate": 0.00019879317721403124, + "loss": 0.6983, + "step": 873 + }, + { + "epoch": 0.07814735336194564, + "grad_norm": 0.10304084882443239, + "learning_rate": 0.00019878868744914569, + "loss": 0.7067, + "step": 874 + }, + { + "epoch": 0.07823676680972819, + "grad_norm": 0.11120098037453476, + "learning_rate": 0.00019878418939897223, + "loss": 0.7227, + "step": 875 + }, + { + "epoch": 0.07832618025751073, + "grad_norm": 0.12020543164415914, + "learning_rate": 0.00019877968306388811, + "loss": 0.6928, + "step": 876 + }, + { + "epoch": 0.07841559370529327, + "grad_norm": 0.10256974075315298, + "learning_rate": 0.00019877516844427127, + "loss": 0.7217, + "step": 877 + }, + { + "epoch": 0.07850500715307582, + "grad_norm": 0.11666712683155775, + "learning_rate": 0.00019877064554050036, + "loss": 0.698, + "step": 878 + }, + { + "epoch": 0.07859442060085838, + "grad_norm": 0.12020975539213138, + "learning_rate": 0.00019876611435295466, + "loss": 0.726, + "step": 879 + }, + { + "epoch": 0.07868383404864092, + "grad_norm": 0.12252526669508058, + "learning_rate": 0.00019876157488201424, + "loss": 0.715, + "step": 880 + }, + { + "epoch": 0.07877324749642346, + "grad_norm": 0.11176327688123622, + "learning_rate": 0.00019875702712805984, + "loss": 0.7131, + "step": 881 + }, + { + "epoch": 0.078862660944206, + "grad_norm": 0.12296256726006946, + "learning_rate": 0.00019875247109147278, + "loss": 0.75, + "step": 882 + }, + { + "epoch": 0.07895207439198855, + "grad_norm": 0.1252293276784564, + "learning_rate": 0.0001987479067726353, + "loss": 0.7424, + "step": 883 + }, + { + "epoch": 0.07904148783977111, + "grad_norm": 0.1319889153150054, + "learning_rate": 0.00019874333417193007, + "loss": 0.7177, + "step": 884 + }, + { + "epoch": 0.07913090128755365, + "grad_norm": 0.10559687721926225, + "learning_rate": 0.00019873875328974073, + "loss": 0.6907, + "step": 885 + }, + { + "epoch": 0.0792203147353362, + "grad_norm": 0.1186110630362134, + "learning_rate": 0.00019873416412645133, + "loss": 0.711, + "step": 886 + }, + { + "epoch": 0.07930972818311874, + "grad_norm": 0.11005975207581717, + "learning_rate": 0.00019872956668244687, + "loss": 0.6931, + "step": 887 + }, + { + "epoch": 0.07939914163090128, + "grad_norm": 0.11182970425454669, + "learning_rate": 0.00019872496095811286, + "loss": 0.7047, + "step": 888 + }, + { + "epoch": 0.07948855507868384, + "grad_norm": 0.12259406557405075, + "learning_rate": 0.00019872034695383558, + "loss": 0.7062, + "step": 889 + }, + { + "epoch": 0.07957796852646638, + "grad_norm": 0.11124869740652987, + "learning_rate": 0.0001987157246700021, + "loss": 0.6832, + "step": 890 + }, + { + "epoch": 0.07966738197424893, + "grad_norm": 0.11908210168823634, + "learning_rate": 0.00019871109410699996, + "loss": 0.6933, + "step": 891 + }, + { + "epoch": 0.07975679542203147, + "grad_norm": 0.12122223276465903, + "learning_rate": 0.00019870645526521758, + "loss": 0.6858, + "step": 892 + }, + { + "epoch": 0.07984620886981401, + "grad_norm": 0.1093216930077197, + "learning_rate": 0.000198701808145044, + "loss": 0.7117, + "step": 893 + }, + { + "epoch": 0.07993562231759657, + "grad_norm": 0.12316880875003004, + "learning_rate": 0.00019869715274686898, + "loss": 0.7215, + "step": 894 + }, + { + "epoch": 0.08002503576537912, + "grad_norm": 0.12903912885983154, + "learning_rate": 0.00019869248907108294, + "loss": 0.7804, + "step": 895 + }, + { + "epoch": 0.08011444921316166, + "grad_norm": 0.11228683174136096, + "learning_rate": 0.00019868781711807705, + "loss": 0.7237, + "step": 896 + }, + { + "epoch": 0.0802038626609442, + "grad_norm": 0.12012360202801098, + "learning_rate": 0.0001986831368882431, + "loss": 0.7559, + "step": 897 + }, + { + "epoch": 0.08029327610872675, + "grad_norm": 0.11282855885862857, + "learning_rate": 0.00019867844838197365, + "loss": 0.7341, + "step": 898 + }, + { + "epoch": 0.0803826895565093, + "grad_norm": 0.12476118849258976, + "learning_rate": 0.00019867375159966192, + "loss": 0.7138, + "step": 899 + }, + { + "epoch": 0.08047210300429185, + "grad_norm": 0.111240341225397, + "learning_rate": 0.00019866904654170179, + "loss": 0.6781, + "step": 900 + }, + { + "epoch": 0.08056151645207439, + "grad_norm": 0.11775931858331944, + "learning_rate": 0.0001986643332084879, + "loss": 0.7311, + "step": 901 + }, + { + "epoch": 0.08065092989985694, + "grad_norm": 0.12374998293679369, + "learning_rate": 0.00019865961160041556, + "loss": 0.7184, + "step": 902 + }, + { + "epoch": 0.08074034334763948, + "grad_norm": 0.11875020742924076, + "learning_rate": 0.0001986548817178807, + "loss": 0.687, + "step": 903 + }, + { + "epoch": 0.08082975679542204, + "grad_norm": 0.13346718588060752, + "learning_rate": 0.0001986501435612801, + "loss": 0.7465, + "step": 904 + }, + { + "epoch": 0.08091917024320458, + "grad_norm": 0.12489523521066836, + "learning_rate": 0.00019864539713101108, + "loss": 0.7415, + "step": 905 + }, + { + "epoch": 0.08100858369098712, + "grad_norm": 0.12473291917186245, + "learning_rate": 0.00019864064242747174, + "loss": 0.6993, + "step": 906 + }, + { + "epoch": 0.08109799713876967, + "grad_norm": 0.11890811428469056, + "learning_rate": 0.00019863587945106084, + "loss": 0.7251, + "step": 907 + }, + { + "epoch": 0.08118741058655221, + "grad_norm": 0.1345240190800998, + "learning_rate": 0.00019863110820217785, + "loss": 0.6989, + "step": 908 + }, + { + "epoch": 0.08127682403433477, + "grad_norm": 0.12329638750484594, + "learning_rate": 0.00019862632868122292, + "loss": 0.7048, + "step": 909 + }, + { + "epoch": 0.08136623748211731, + "grad_norm": 0.09847259453976093, + "learning_rate": 0.00019862154088859697, + "loss": 0.6978, + "step": 910 + }, + { + "epoch": 0.08145565092989986, + "grad_norm": 0.11643819965543482, + "learning_rate": 0.00019861674482470144, + "loss": 0.7096, + "step": 911 + }, + { + "epoch": 0.0815450643776824, + "grad_norm": 0.12426788274095572, + "learning_rate": 0.00019861194048993863, + "loss": 0.6917, + "step": 912 + }, + { + "epoch": 0.08163447782546494, + "grad_norm": 0.11727826899906225, + "learning_rate": 0.00019860712788471148, + "loss": 0.69, + "step": 913 + }, + { + "epoch": 0.0817238912732475, + "grad_norm": 0.11581590124962876, + "learning_rate": 0.00019860230700942356, + "loss": 0.7644, + "step": 914 + }, + { + "epoch": 0.08181330472103004, + "grad_norm": 0.125045597519101, + "learning_rate": 0.00019859747786447928, + "loss": 0.7341, + "step": 915 + }, + { + "epoch": 0.08190271816881259, + "grad_norm": 0.12787909991865232, + "learning_rate": 0.00019859264045028358, + "loss": 0.7621, + "step": 916 + }, + { + "epoch": 0.08199213161659513, + "grad_norm": 0.11839990194317032, + "learning_rate": 0.00019858779476724219, + "loss": 0.7103, + "step": 917 + }, + { + "epoch": 0.08208154506437768, + "grad_norm": 0.09480846187785935, + "learning_rate": 0.00019858294081576155, + "loss": 0.7221, + "step": 918 + }, + { + "epoch": 0.08217095851216023, + "grad_norm": 0.10931793958822753, + "learning_rate": 0.00019857807859624869, + "loss": 0.7037, + "step": 919 + }, + { + "epoch": 0.08226037195994278, + "grad_norm": 0.11751931405344297, + "learning_rate": 0.00019857320810911144, + "loss": 0.6842, + "step": 920 + }, + { + "epoch": 0.08234978540772532, + "grad_norm": 0.12254102133242308, + "learning_rate": 0.00019856832935475827, + "loss": 0.6948, + "step": 921 + }, + { + "epoch": 0.08243919885550786, + "grad_norm": 0.1458939516618382, + "learning_rate": 0.00019856344233359837, + "loss": 0.7283, + "step": 922 + }, + { + "epoch": 0.08252861230329042, + "grad_norm": 0.12026597979804166, + "learning_rate": 0.0001985585470460416, + "loss": 0.7104, + "step": 923 + }, + { + "epoch": 0.08261802575107297, + "grad_norm": 0.11781249889322008, + "learning_rate": 0.00019855364349249848, + "loss": 0.7484, + "step": 924 + }, + { + "epoch": 0.08270743919885551, + "grad_norm": 0.11619499451860368, + "learning_rate": 0.00019854873167338033, + "loss": 0.7443, + "step": 925 + }, + { + "epoch": 0.08279685264663805, + "grad_norm": 0.11897290453840265, + "learning_rate": 0.0001985438115890991, + "loss": 0.736, + "step": 926 + }, + { + "epoch": 0.0828862660944206, + "grad_norm": 0.11781035481074364, + "learning_rate": 0.00019853888324006735, + "loss": 0.7426, + "step": 927 + }, + { + "epoch": 0.08297567954220315, + "grad_norm": 0.1227974083058488, + "learning_rate": 0.00019853394662669847, + "loss": 0.7197, + "step": 928 + }, + { + "epoch": 0.0830650929899857, + "grad_norm": 0.11796184207831006, + "learning_rate": 0.00019852900174940655, + "loss": 0.7288, + "step": 929 + }, + { + "epoch": 0.08315450643776824, + "grad_norm": 0.10998382824006687, + "learning_rate": 0.00019852404860860618, + "loss": 0.7313, + "step": 930 + }, + { + "epoch": 0.08324391988555079, + "grad_norm": 0.11300340024683005, + "learning_rate": 0.00019851908720471285, + "loss": 0.6516, + "step": 931 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.13223417182892025, + "learning_rate": 0.0001985141175381427, + "loss": 0.6637, + "step": 932 + }, + { + "epoch": 0.08342274678111589, + "grad_norm": 0.12993076060526496, + "learning_rate": 0.00019850913960931243, + "loss": 0.7149, + "step": 933 + }, + { + "epoch": 0.08351216022889843, + "grad_norm": 0.1059024821649147, + "learning_rate": 0.00019850415341863962, + "loss": 0.6822, + "step": 934 + }, + { + "epoch": 0.08360157367668097, + "grad_norm": 0.11267594810264554, + "learning_rate": 0.00019849915896654242, + "loss": 0.6743, + "step": 935 + }, + { + "epoch": 0.08369098712446352, + "grad_norm": 0.11229687416979144, + "learning_rate": 0.0001984941562534397, + "loss": 0.7036, + "step": 936 + }, + { + "epoch": 0.08378040057224606, + "grad_norm": 0.14056278995031982, + "learning_rate": 0.00019848914527975108, + "loss": 0.6982, + "step": 937 + }, + { + "epoch": 0.08386981402002862, + "grad_norm": 0.11427420993233472, + "learning_rate": 0.00019848412604589678, + "loss": 0.7053, + "step": 938 + }, + { + "epoch": 0.08395922746781116, + "grad_norm": 0.12483946220502204, + "learning_rate": 0.00019847909855229775, + "loss": 0.7583, + "step": 939 + }, + { + "epoch": 0.0840486409155937, + "grad_norm": 0.08328826469360924, + "learning_rate": 0.00019847406279937567, + "loss": 0.5822, + "step": 940 + }, + { + "epoch": 0.08413805436337625, + "grad_norm": 0.12258527149003881, + "learning_rate": 0.00019846901878755287, + "loss": 0.6924, + "step": 941 + }, + { + "epoch": 0.0842274678111588, + "grad_norm": 0.12562073865156181, + "learning_rate": 0.00019846396651725237, + "loss": 0.7238, + "step": 942 + }, + { + "epoch": 0.08431688125894135, + "grad_norm": 0.11957074652731456, + "learning_rate": 0.0001984589059888979, + "loss": 0.6981, + "step": 943 + }, + { + "epoch": 0.0844062947067239, + "grad_norm": 0.1164439478505767, + "learning_rate": 0.00019845383720291392, + "loss": 0.7556, + "step": 944 + }, + { + "epoch": 0.08449570815450644, + "grad_norm": 0.10959607066585283, + "learning_rate": 0.00019844876015972552, + "loss": 0.7099, + "step": 945 + }, + { + "epoch": 0.08458512160228898, + "grad_norm": 0.10559907924911559, + "learning_rate": 0.0001984436748597585, + "loss": 0.7101, + "step": 946 + }, + { + "epoch": 0.08467453505007153, + "grad_norm": 0.1217642499682193, + "learning_rate": 0.00019843858130343933, + "loss": 0.691, + "step": 947 + }, + { + "epoch": 0.08476394849785408, + "grad_norm": 0.12469237373307174, + "learning_rate": 0.00019843347949119526, + "loss": 0.726, + "step": 948 + }, + { + "epoch": 0.08485336194563663, + "grad_norm": 0.11922867197003774, + "learning_rate": 0.00019842836942345415, + "loss": 0.7043, + "step": 949 + }, + { + "epoch": 0.08494277539341917, + "grad_norm": 0.11242521268615731, + "learning_rate": 0.00019842325110064454, + "loss": 0.7122, + "step": 950 + }, + { + "epoch": 0.08503218884120171, + "grad_norm": 0.12346249346336992, + "learning_rate": 0.00019841812452319575, + "loss": 0.6828, + "step": 951 + }, + { + "epoch": 0.08512160228898426, + "grad_norm": 0.11887965356097024, + "learning_rate": 0.0001984129896915377, + "loss": 0.7686, + "step": 952 + }, + { + "epoch": 0.08521101573676682, + "grad_norm": 0.11737709476832696, + "learning_rate": 0.00019840784660610106, + "loss": 0.6782, + "step": 953 + }, + { + "epoch": 0.08530042918454936, + "grad_norm": 0.11018017725990227, + "learning_rate": 0.00019840269526731716, + "loss": 0.6859, + "step": 954 + }, + { + "epoch": 0.0853898426323319, + "grad_norm": 0.12468357158425819, + "learning_rate": 0.00019839753567561807, + "loss": 0.701, + "step": 955 + }, + { + "epoch": 0.08547925608011445, + "grad_norm": 0.13279310588525303, + "learning_rate": 0.0001983923678314365, + "loss": 0.7036, + "step": 956 + }, + { + "epoch": 0.08556866952789699, + "grad_norm": 0.12068924723303537, + "learning_rate": 0.00019838719173520585, + "loss": 0.7326, + "step": 957 + }, + { + "epoch": 0.08565808297567955, + "grad_norm": 0.13279644702159446, + "learning_rate": 0.00019838200738736027, + "loss": 0.747, + "step": 958 + }, + { + "epoch": 0.08574749642346209, + "grad_norm": 0.11717555387515059, + "learning_rate": 0.0001983768147883345, + "loss": 0.7102, + "step": 959 + }, + { + "epoch": 0.08583690987124463, + "grad_norm": 0.12985181638226392, + "learning_rate": 0.0001983716139385641, + "loss": 0.722, + "step": 960 + }, + { + "epoch": 0.08592632331902718, + "grad_norm": 0.12320323325683502, + "learning_rate": 0.00019836640483848528, + "loss": 0.726, + "step": 961 + }, + { + "epoch": 0.08601573676680972, + "grad_norm": 0.12584247601195797, + "learning_rate": 0.00019836118748853485, + "loss": 0.6709, + "step": 962 + }, + { + "epoch": 0.08610515021459228, + "grad_norm": 0.1381155166043056, + "learning_rate": 0.00019835596188915044, + "loss": 0.7333, + "step": 963 + }, + { + "epoch": 0.08619456366237482, + "grad_norm": 0.11610078406333452, + "learning_rate": 0.00019835072804077027, + "loss": 0.6838, + "step": 964 + }, + { + "epoch": 0.08628397711015737, + "grad_norm": 0.10061228382686652, + "learning_rate": 0.00019834548594383332, + "loss": 0.6891, + "step": 965 + }, + { + "epoch": 0.08637339055793991, + "grad_norm": 0.11103908474942432, + "learning_rate": 0.00019834023559877923, + "loss": 0.7022, + "step": 966 + }, + { + "epoch": 0.08646280400572245, + "grad_norm": 0.1036401538106972, + "learning_rate": 0.00019833497700604835, + "loss": 0.6831, + "step": 967 + }, + { + "epoch": 0.08655221745350501, + "grad_norm": 0.09889338004697737, + "learning_rate": 0.0001983297101660817, + "loss": 0.6867, + "step": 968 + }, + { + "epoch": 0.08664163090128756, + "grad_norm": 0.10354937771186412, + "learning_rate": 0.00019832443507932103, + "loss": 0.697, + "step": 969 + }, + { + "epoch": 0.0867310443490701, + "grad_norm": 0.10577811202280137, + "learning_rate": 0.00019831915174620872, + "loss": 0.6716, + "step": 970 + }, + { + "epoch": 0.08682045779685264, + "grad_norm": 0.12171642709236408, + "learning_rate": 0.0001983138601671879, + "loss": 0.7457, + "step": 971 + }, + { + "epoch": 0.08690987124463519, + "grad_norm": 0.10887959248968702, + "learning_rate": 0.00019830856034270235, + "loss": 0.7266, + "step": 972 + }, + { + "epoch": 0.08699928469241774, + "grad_norm": 0.11479072740209732, + "learning_rate": 0.0001983032522731966, + "loss": 0.6595, + "step": 973 + }, + { + "epoch": 0.08708869814020029, + "grad_norm": 0.10562127632370794, + "learning_rate": 0.00019829793595911577, + "loss": 0.6774, + "step": 974 + }, + { + "epoch": 0.08717811158798283, + "grad_norm": 0.11324535057695885, + "learning_rate": 0.0001982926114009058, + "loss": 0.7148, + "step": 975 + }, + { + "epoch": 0.08726752503576538, + "grad_norm": 0.1155174902483848, + "learning_rate": 0.00019828727859901317, + "loss": 0.705, + "step": 976 + }, + { + "epoch": 0.08735693848354792, + "grad_norm": 0.1129065341387722, + "learning_rate": 0.00019828193755388522, + "loss": 0.666, + "step": 977 + }, + { + "epoch": 0.08744635193133048, + "grad_norm": 0.08875254580948977, + "learning_rate": 0.00019827658826596984, + "loss": 0.6058, + "step": 978 + }, + { + "epoch": 0.08753576537911302, + "grad_norm": 0.11935339037783838, + "learning_rate": 0.00019827123073571572, + "loss": 0.6973, + "step": 979 + }, + { + "epoch": 0.08762517882689556, + "grad_norm": 0.11588626455954308, + "learning_rate": 0.00019826586496357216, + "loss": 0.6891, + "step": 980 + }, + { + "epoch": 0.08771459227467811, + "grad_norm": 0.12292864596730763, + "learning_rate": 0.00019826049094998912, + "loss": 0.6765, + "step": 981 + }, + { + "epoch": 0.08780400572246065, + "grad_norm": 0.1327483101908775, + "learning_rate": 0.00019825510869541743, + "loss": 0.7357, + "step": 982 + }, + { + "epoch": 0.08789341917024321, + "grad_norm": 0.1086413715631611, + "learning_rate": 0.00019824971820030842, + "loss": 0.6957, + "step": 983 + }, + { + "epoch": 0.08798283261802575, + "grad_norm": 0.12770893056858484, + "learning_rate": 0.0001982443194651142, + "loss": 0.7048, + "step": 984 + }, + { + "epoch": 0.0880722460658083, + "grad_norm": 0.12461905858569935, + "learning_rate": 0.00019823891249028756, + "loss": 0.7252, + "step": 985 + }, + { + "epoch": 0.08816165951359084, + "grad_norm": 0.12439395978315505, + "learning_rate": 0.00019823349727628197, + "loss": 0.6756, + "step": 986 + }, + { + "epoch": 0.0882510729613734, + "grad_norm": 0.11246876763020815, + "learning_rate": 0.00019822807382355163, + "loss": 0.6832, + "step": 987 + }, + { + "epoch": 0.08834048640915594, + "grad_norm": 0.11967561788568037, + "learning_rate": 0.0001982226421325513, + "loss": 0.6926, + "step": 988 + }, + { + "epoch": 0.08842989985693848, + "grad_norm": 0.08632738573772984, + "learning_rate": 0.00019821720220373665, + "loss": 0.6013, + "step": 989 + }, + { + "epoch": 0.08851931330472103, + "grad_norm": 0.10829531605196364, + "learning_rate": 0.00019821175403756384, + "loss": 0.6924, + "step": 990 + }, + { + "epoch": 0.08860872675250357, + "grad_norm": 0.11475472814879897, + "learning_rate": 0.00019820629763448987, + "loss": 0.7192, + "step": 991 + }, + { + "epoch": 0.08869814020028613, + "grad_norm": 0.09542684699818124, + "learning_rate": 0.00019820083299497228, + "loss": 0.7149, + "step": 992 + }, + { + "epoch": 0.08878755364806867, + "grad_norm": 0.12034299001001134, + "learning_rate": 0.00019819536011946945, + "loss": 0.7183, + "step": 993 + }, + { + "epoch": 0.08887696709585122, + "grad_norm": 0.11388212628809354, + "learning_rate": 0.00019818987900844032, + "loss": 0.7042, + "step": 994 + }, + { + "epoch": 0.08896638054363376, + "grad_norm": 0.11399229669855085, + "learning_rate": 0.00019818438966234464, + "loss": 0.7205, + "step": 995 + }, + { + "epoch": 0.0890557939914163, + "grad_norm": 0.11378961880284288, + "learning_rate": 0.00019817889208164277, + "loss": 0.6969, + "step": 996 + }, + { + "epoch": 0.08914520743919886, + "grad_norm": 0.1234818880615404, + "learning_rate": 0.0001981733862667958, + "loss": 0.7146, + "step": 997 + }, + { + "epoch": 0.0892346208869814, + "grad_norm": 0.11120781950006957, + "learning_rate": 0.00019816787221826548, + "loss": 0.6944, + "step": 998 + }, + { + "epoch": 0.08932403433476395, + "grad_norm": 0.12172054631076844, + "learning_rate": 0.0001981623499365143, + "loss": 0.7249, + "step": 999 + }, + { + "epoch": 0.08941344778254649, + "grad_norm": 0.10699957289298079, + "learning_rate": 0.00019815681942200535, + "loss": 0.6718, + "step": 1000 + }, + { + "epoch": 0.08950286123032904, + "grad_norm": 0.1313702920597125, + "learning_rate": 0.00019815128067520252, + "loss": 0.7099, + "step": 1001 + }, + { + "epoch": 0.0895922746781116, + "grad_norm": 0.12252301591401848, + "learning_rate": 0.0001981457336965703, + "loss": 0.7312, + "step": 1002 + }, + { + "epoch": 0.08968168812589414, + "grad_norm": 0.1327395379384629, + "learning_rate": 0.0001981401784865739, + "loss": 0.7178, + "step": 1003 + }, + { + "epoch": 0.08977110157367668, + "grad_norm": 0.11625163455381615, + "learning_rate": 0.00019813461504567933, + "loss": 0.6785, + "step": 1004 + }, + { + "epoch": 0.08986051502145923, + "grad_norm": 0.12564164909996145, + "learning_rate": 0.00019812904337435306, + "loss": 0.7456, + "step": 1005 + }, + { + "epoch": 0.08994992846924177, + "grad_norm": 0.11019146542335403, + "learning_rate": 0.00019812346347306242, + "loss": 0.6896, + "step": 1006 + }, + { + "epoch": 0.09003934191702433, + "grad_norm": 0.13039752777583025, + "learning_rate": 0.00019811787534227543, + "loss": 0.7144, + "step": 1007 + }, + { + "epoch": 0.09012875536480687, + "grad_norm": 0.1168832640817367, + "learning_rate": 0.0001981122789824607, + "loss": 0.7328, + "step": 1008 + }, + { + "epoch": 0.09021816881258941, + "grad_norm": 0.11487642247721462, + "learning_rate": 0.00019810667439408767, + "loss": 0.7009, + "step": 1009 + }, + { + "epoch": 0.09030758226037196, + "grad_norm": 0.12025967545829183, + "learning_rate": 0.0001981010615776263, + "loss": 0.7168, + "step": 1010 + }, + { + "epoch": 0.0903969957081545, + "grad_norm": 0.10803759393984985, + "learning_rate": 0.00019809544053354738, + "loss": 0.705, + "step": 1011 + }, + { + "epoch": 0.09048640915593706, + "grad_norm": 0.11076508356676809, + "learning_rate": 0.00019808981126232236, + "loss": 0.7367, + "step": 1012 + }, + { + "epoch": 0.0905758226037196, + "grad_norm": 0.11878922782154387, + "learning_rate": 0.0001980841737644233, + "loss": 0.706, + "step": 1013 + }, + { + "epoch": 0.09066523605150215, + "grad_norm": 0.12836509928114864, + "learning_rate": 0.00019807852804032305, + "loss": 0.7246, + "step": 1014 + }, + { + "epoch": 0.09075464949928469, + "grad_norm": 0.13304410964584917, + "learning_rate": 0.00019807287409049512, + "loss": 0.7059, + "step": 1015 + }, + { + "epoch": 0.09084406294706723, + "grad_norm": 0.10585404403544645, + "learning_rate": 0.00019806721191541367, + "loss": 0.6927, + "step": 1016 + }, + { + "epoch": 0.09093347639484979, + "grad_norm": 0.12265509169411787, + "learning_rate": 0.00019806154151555356, + "loss": 0.701, + "step": 1017 + }, + { + "epoch": 0.09102288984263233, + "grad_norm": 0.11389181644777972, + "learning_rate": 0.00019805586289139043, + "loss": 0.7173, + "step": 1018 + }, + { + "epoch": 0.09111230329041488, + "grad_norm": 0.12354847778240381, + "learning_rate": 0.0001980501760434005, + "loss": 0.7251, + "step": 1019 + }, + { + "epoch": 0.09120171673819742, + "grad_norm": 0.12431066137793205, + "learning_rate": 0.00019804448097206068, + "loss": 0.7388, + "step": 1020 + }, + { + "epoch": 0.09129113018597997, + "grad_norm": 0.13432678959010583, + "learning_rate": 0.0001980387776778487, + "loss": 0.731, + "step": 1021 + }, + { + "epoch": 0.09138054363376252, + "grad_norm": 0.10674987119342495, + "learning_rate": 0.00019803306616124282, + "loss": 0.7238, + "step": 1022 + }, + { + "epoch": 0.09146995708154507, + "grad_norm": 0.1125648349694289, + "learning_rate": 0.00019802734642272206, + "loss": 0.7558, + "step": 1023 + }, + { + "epoch": 0.09155937052932761, + "grad_norm": 0.12248976578227386, + "learning_rate": 0.00019802161846276615, + "loss": 0.7308, + "step": 1024 + }, + { + "epoch": 0.09164878397711015, + "grad_norm": 0.12544140127779743, + "learning_rate": 0.0001980158822818555, + "loss": 0.6935, + "step": 1025 + }, + { + "epoch": 0.0917381974248927, + "grad_norm": 0.12334601829257896, + "learning_rate": 0.00019801013788047116, + "loss": 0.7504, + "step": 1026 + }, + { + "epoch": 0.09182761087267526, + "grad_norm": 0.1412769809429634, + "learning_rate": 0.00019800438525909491, + "loss": 0.751, + "step": 1027 + }, + { + "epoch": 0.0919170243204578, + "grad_norm": 0.12658914281047676, + "learning_rate": 0.00019799862441820923, + "loss": 0.7282, + "step": 1028 + }, + { + "epoch": 0.09200643776824034, + "grad_norm": 0.1351938461576739, + "learning_rate": 0.0001979928553582973, + "loss": 0.7175, + "step": 1029 + }, + { + "epoch": 0.09209585121602289, + "grad_norm": 0.12069736848655868, + "learning_rate": 0.0001979870780798429, + "loss": 0.7127, + "step": 1030 + }, + { + "epoch": 0.09218526466380543, + "grad_norm": 0.1297216653815746, + "learning_rate": 0.00019798129258333065, + "loss": 0.7468, + "step": 1031 + }, + { + "epoch": 0.09227467811158799, + "grad_norm": 0.12156917963813453, + "learning_rate": 0.00019797549886924566, + "loss": 0.6982, + "step": 1032 + }, + { + "epoch": 0.09236409155937053, + "grad_norm": 0.12384882127560556, + "learning_rate": 0.00019796969693807394, + "loss": 0.6913, + "step": 1033 + }, + { + "epoch": 0.09245350500715308, + "grad_norm": 0.10501001930256934, + "learning_rate": 0.00019796388679030205, + "loss": 0.6746, + "step": 1034 + }, + { + "epoch": 0.09254291845493562, + "grad_norm": 0.10449987094921984, + "learning_rate": 0.0001979580684264173, + "loss": 0.6843, + "step": 1035 + }, + { + "epoch": 0.09263233190271816, + "grad_norm": 0.12255036857138393, + "learning_rate": 0.00019795224184690764, + "loss": 0.6852, + "step": 1036 + }, + { + "epoch": 0.09272174535050072, + "grad_norm": 0.11390603317538942, + "learning_rate": 0.00019794640705226175, + "loss": 0.7052, + "step": 1037 + }, + { + "epoch": 0.09281115879828326, + "grad_norm": 0.15509764786065933, + "learning_rate": 0.00019794056404296898, + "loss": 0.7215, + "step": 1038 + }, + { + "epoch": 0.09290057224606581, + "grad_norm": 0.1088677457092348, + "learning_rate": 0.0001979347128195194, + "loss": 0.7179, + "step": 1039 + }, + { + "epoch": 0.09298998569384835, + "grad_norm": 0.10935453936775572, + "learning_rate": 0.00019792885338240374, + "loss": 0.701, + "step": 1040 + }, + { + "epoch": 0.0930793991416309, + "grad_norm": 0.1170746416561294, + "learning_rate": 0.0001979229857321134, + "loss": 0.6889, + "step": 1041 + }, + { + "epoch": 0.09316881258941345, + "grad_norm": 0.09988389325768009, + "learning_rate": 0.00019791710986914051, + "loss": 0.7018, + "step": 1042 + }, + { + "epoch": 0.093258226037196, + "grad_norm": 0.10673148497657232, + "learning_rate": 0.00019791122579397789, + "loss": 0.7052, + "step": 1043 + }, + { + "epoch": 0.09334763948497854, + "grad_norm": 0.1078187452325507, + "learning_rate": 0.00019790533350711899, + "loss": 0.6916, + "step": 1044 + }, + { + "epoch": 0.09343705293276108, + "grad_norm": 0.10343722686480938, + "learning_rate": 0.000197899433009058, + "loss": 0.6862, + "step": 1045 + }, + { + "epoch": 0.09352646638054363, + "grad_norm": 0.12380204561723353, + "learning_rate": 0.0001978935243002898, + "loss": 0.743, + "step": 1046 + }, + { + "epoch": 0.09361587982832618, + "grad_norm": 0.12124712831772741, + "learning_rate": 0.00019788760738130993, + "loss": 0.7298, + "step": 1047 + }, + { + "epoch": 0.09370529327610873, + "grad_norm": 0.10652838832235821, + "learning_rate": 0.00019788168225261469, + "loss": 0.6953, + "step": 1048 + }, + { + "epoch": 0.09379470672389127, + "grad_norm": 0.12884430219883403, + "learning_rate": 0.00019787574891470095, + "loss": 0.7284, + "step": 1049 + }, + { + "epoch": 0.09388412017167382, + "grad_norm": 0.1119365545736051, + "learning_rate": 0.00019786980736806635, + "loss": 0.687, + "step": 1050 + }, + { + "epoch": 0.09397353361945637, + "grad_norm": 0.12620110754509287, + "learning_rate": 0.0001978638576132092, + "loss": 0.7096, + "step": 1051 + }, + { + "epoch": 0.09406294706723892, + "grad_norm": 0.11484515644050314, + "learning_rate": 0.00019785789965062848, + "loss": 0.7502, + "step": 1052 + }, + { + "epoch": 0.09415236051502146, + "grad_norm": 0.11486138202796431, + "learning_rate": 0.00019785193348082394, + "loss": 0.7549, + "step": 1053 + }, + { + "epoch": 0.094241773962804, + "grad_norm": 0.1208129417955592, + "learning_rate": 0.0001978459591042959, + "loss": 0.713, + "step": 1054 + }, + { + "epoch": 0.09433118741058655, + "grad_norm": 0.11982491426158631, + "learning_rate": 0.00019783997652154543, + "loss": 0.6981, + "step": 1055 + }, + { + "epoch": 0.0944206008583691, + "grad_norm": 0.11234952258349445, + "learning_rate": 0.00019783398573307428, + "loss": 0.6971, + "step": 1056 + }, + { + "epoch": 0.09451001430615165, + "grad_norm": 0.1276101489027681, + "learning_rate": 0.00019782798673938492, + "loss": 0.7145, + "step": 1057 + }, + { + "epoch": 0.09459942775393419, + "grad_norm": 0.10612986217102435, + "learning_rate": 0.00019782197954098046, + "loss": 0.7287, + "step": 1058 + }, + { + "epoch": 0.09468884120171674, + "grad_norm": 0.11976409091981452, + "learning_rate": 0.0001978159641383647, + "loss": 0.7461, + "step": 1059 + }, + { + "epoch": 0.09477825464949928, + "grad_norm": 0.13378580967209042, + "learning_rate": 0.00019780994053204216, + "loss": 0.729, + "step": 1060 + }, + { + "epoch": 0.09486766809728184, + "grad_norm": 0.11546798017371751, + "learning_rate": 0.00019780390872251803, + "loss": 0.7086, + "step": 1061 + }, + { + "epoch": 0.09495708154506438, + "grad_norm": 0.09179240846952949, + "learning_rate": 0.00019779786871029819, + "loss": 0.626, + "step": 1062 + }, + { + "epoch": 0.09504649499284692, + "grad_norm": 0.11278916789494807, + "learning_rate": 0.00019779182049588925, + "loss": 0.7141, + "step": 1063 + }, + { + "epoch": 0.09513590844062947, + "grad_norm": 0.12033536381273387, + "learning_rate": 0.0001977857640797984, + "loss": 0.6787, + "step": 1064 + }, + { + "epoch": 0.09522532188841201, + "grad_norm": 0.09947158564077767, + "learning_rate": 0.0001977796994625336, + "loss": 0.7047, + "step": 1065 + }, + { + "epoch": 0.09531473533619457, + "grad_norm": 0.11517842831854673, + "learning_rate": 0.0001977736266446035, + "loss": 0.7253, + "step": 1066 + }, + { + "epoch": 0.09540414878397711, + "grad_norm": 0.11154415786023304, + "learning_rate": 0.00019776754562651742, + "loss": 0.7068, + "step": 1067 + }, + { + "epoch": 0.09549356223175966, + "grad_norm": 0.11509622232544432, + "learning_rate": 0.00019776145640878538, + "loss": 0.6978, + "step": 1068 + }, + { + "epoch": 0.0955829756795422, + "grad_norm": 0.09804971227039021, + "learning_rate": 0.000197755358991918, + "loss": 0.691, + "step": 1069 + }, + { + "epoch": 0.09567238912732474, + "grad_norm": 0.10994092133438983, + "learning_rate": 0.00019774925337642677, + "loss": 0.7232, + "step": 1070 + }, + { + "epoch": 0.0957618025751073, + "grad_norm": 0.11131371542651788, + "learning_rate": 0.0001977431395628237, + "loss": 0.7346, + "step": 1071 + }, + { + "epoch": 0.09585121602288985, + "grad_norm": 0.12189412869651854, + "learning_rate": 0.00019773701755162158, + "loss": 0.6872, + "step": 1072 + }, + { + "epoch": 0.09594062947067239, + "grad_norm": 0.11546005091164117, + "learning_rate": 0.0001977308873433338, + "loss": 0.6934, + "step": 1073 + }, + { + "epoch": 0.09603004291845493, + "grad_norm": 0.12278994146337716, + "learning_rate": 0.00019772474893847456, + "loss": 0.7338, + "step": 1074 + }, + { + "epoch": 0.09611945636623748, + "grad_norm": 0.11871080886029332, + "learning_rate": 0.00019771860233755862, + "loss": 0.7126, + "step": 1075 + }, + { + "epoch": 0.09620886981402003, + "grad_norm": 0.11783747397027917, + "learning_rate": 0.0001977124475411015, + "loss": 0.6821, + "step": 1076 + }, + { + "epoch": 0.09629828326180258, + "grad_norm": 0.11845898635028043, + "learning_rate": 0.00019770628454961946, + "loss": 0.6301, + "step": 1077 + }, + { + "epoch": 0.09638769670958512, + "grad_norm": 0.13109764179000466, + "learning_rate": 0.00019770011336362928, + "loss": 0.7044, + "step": 1078 + }, + { + "epoch": 0.09647711015736767, + "grad_norm": 0.1115007041151102, + "learning_rate": 0.00019769393398364865, + "loss": 0.7225, + "step": 1079 + }, + { + "epoch": 0.09656652360515021, + "grad_norm": 0.14150701738760096, + "learning_rate": 0.0001976877464101957, + "loss": 0.7544, + "step": 1080 + }, + { + "epoch": 0.09665593705293277, + "grad_norm": 0.11912204909935731, + "learning_rate": 0.00019768155064378947, + "loss": 0.688, + "step": 1081 + }, + { + "epoch": 0.09674535050071531, + "grad_norm": 0.12801819796547825, + "learning_rate": 0.00019767534668494954, + "loss": 0.7402, + "step": 1082 + }, + { + "epoch": 0.09683476394849785, + "grad_norm": 0.11042564113309371, + "learning_rate": 0.00019766913453419624, + "loss": 0.7081, + "step": 1083 + }, + { + "epoch": 0.0969241773962804, + "grad_norm": 0.10687632630311861, + "learning_rate": 0.0001976629141920506, + "loss": 0.6888, + "step": 1084 + }, + { + "epoch": 0.09701359084406294, + "grad_norm": 0.115200332273902, + "learning_rate": 0.0001976566856590343, + "loss": 0.7064, + "step": 1085 + }, + { + "epoch": 0.0971030042918455, + "grad_norm": 0.11268781040181705, + "learning_rate": 0.00019765044893566968, + "loss": 0.6743, + "step": 1086 + }, + { + "epoch": 0.09719241773962804, + "grad_norm": 0.09358234124741771, + "learning_rate": 0.00019764420402247987, + "loss": 0.6742, + "step": 1087 + }, + { + "epoch": 0.09728183118741059, + "grad_norm": 0.12907264664570361, + "learning_rate": 0.00019763795091998858, + "loss": 0.7232, + "step": 1088 + }, + { + "epoch": 0.09737124463519313, + "grad_norm": 0.09012990509784854, + "learning_rate": 0.00019763168962872026, + "loss": 0.5696, + "step": 1089 + }, + { + "epoch": 0.09746065808297567, + "grad_norm": 0.11944197265722861, + "learning_rate": 0.00019762542014920004, + "loss": 0.7306, + "step": 1090 + }, + { + "epoch": 0.09755007153075823, + "grad_norm": 0.11454998393277539, + "learning_rate": 0.00019761914248195373, + "loss": 0.7187, + "step": 1091 + }, + { + "epoch": 0.09763948497854077, + "grad_norm": 0.1038352219436379, + "learning_rate": 0.00019761285662750787, + "loss": 0.6739, + "step": 1092 + }, + { + "epoch": 0.09772889842632332, + "grad_norm": 0.11170118871304813, + "learning_rate": 0.00019760656258638958, + "loss": 0.7171, + "step": 1093 + }, + { + "epoch": 0.09781831187410586, + "grad_norm": 0.11210667988817329, + "learning_rate": 0.0001976002603591268, + "loss": 0.7074, + "step": 1094 + }, + { + "epoch": 0.0979077253218884, + "grad_norm": 0.12656923042263052, + "learning_rate": 0.000197593949946248, + "loss": 0.7289, + "step": 1095 + }, + { + "epoch": 0.09799713876967096, + "grad_norm": 0.12429834595870645, + "learning_rate": 0.0001975876313482825, + "loss": 0.7046, + "step": 1096 + }, + { + "epoch": 0.09808655221745351, + "grad_norm": 0.12924117747074726, + "learning_rate": 0.00019758130456576023, + "loss": 0.7225, + "step": 1097 + }, + { + "epoch": 0.09817596566523605, + "grad_norm": 0.12326949302966331, + "learning_rate": 0.00019757496959921177, + "loss": 0.7142, + "step": 1098 + }, + { + "epoch": 0.0982653791130186, + "grad_norm": 0.12689208313631586, + "learning_rate": 0.00019756862644916846, + "loss": 0.7185, + "step": 1099 + }, + { + "epoch": 0.09835479256080114, + "grad_norm": 0.11946108174873706, + "learning_rate": 0.00019756227511616233, + "loss": 0.6753, + "step": 1100 + }, + { + "epoch": 0.0984442060085837, + "grad_norm": 0.11618249151096958, + "learning_rate": 0.00019755591560072596, + "loss": 0.7068, + "step": 1101 + }, + { + "epoch": 0.09853361945636624, + "grad_norm": 0.1262905448027378, + "learning_rate": 0.00019754954790339278, + "loss": 0.708, + "step": 1102 + }, + { + "epoch": 0.09862303290414878, + "grad_norm": 0.11241016034174682, + "learning_rate": 0.00019754317202469682, + "loss": 0.6684, + "step": 1103 + }, + { + "epoch": 0.09871244635193133, + "grad_norm": 0.11875454369929637, + "learning_rate": 0.00019753678796517282, + "loss": 0.7246, + "step": 1104 + }, + { + "epoch": 0.09880185979971387, + "grad_norm": 0.1055835370975195, + "learning_rate": 0.0001975303957253562, + "loss": 0.6861, + "step": 1105 + }, + { + "epoch": 0.09889127324749643, + "grad_norm": 0.13316089563456007, + "learning_rate": 0.00019752399530578312, + "loss": 0.7238, + "step": 1106 + }, + { + "epoch": 0.09898068669527897, + "grad_norm": 0.11898147261313487, + "learning_rate": 0.00019751758670699028, + "loss": 0.7019, + "step": 1107 + }, + { + "epoch": 0.09907010014306152, + "grad_norm": 0.11323252554660124, + "learning_rate": 0.00019751116992951527, + "loss": 0.6976, + "step": 1108 + }, + { + "epoch": 0.09915951359084406, + "grad_norm": 0.1122841969505758, + "learning_rate": 0.00019750474497389614, + "loss": 0.6996, + "step": 1109 + }, + { + "epoch": 0.0992489270386266, + "grad_norm": 0.1192135617088567, + "learning_rate": 0.00019749831184067185, + "loss": 0.7005, + "step": 1110 + }, + { + "epoch": 0.09933834048640916, + "grad_norm": 0.1315542574483895, + "learning_rate": 0.00019749187053038188, + "loss": 0.7446, + "step": 1111 + }, + { + "epoch": 0.0994277539341917, + "grad_norm": 0.11935486571594502, + "learning_rate": 0.00019748542104356648, + "loss": 0.7622, + "step": 1112 + }, + { + "epoch": 0.09951716738197425, + "grad_norm": 0.11516821304411134, + "learning_rate": 0.00019747896338076655, + "loss": 0.6932, + "step": 1113 + }, + { + "epoch": 0.09960658082975679, + "grad_norm": 0.11362426707198334, + "learning_rate": 0.00019747249754252367, + "loss": 0.7116, + "step": 1114 + }, + { + "epoch": 0.09969599427753935, + "grad_norm": 0.11912826901665655, + "learning_rate": 0.00019746602352938014, + "loss": 0.7328, + "step": 1115 + }, + { + "epoch": 0.09978540772532189, + "grad_norm": 0.1131851055909797, + "learning_rate": 0.00019745954134187894, + "loss": 0.7081, + "step": 1116 + }, + { + "epoch": 0.09987482117310444, + "grad_norm": 0.12449631289186344, + "learning_rate": 0.00019745305098056368, + "loss": 0.7039, + "step": 1117 + }, + { + "epoch": 0.09996423462088698, + "grad_norm": 0.12013800425262575, + "learning_rate": 0.00019744655244597877, + "loss": 0.7326, + "step": 1118 + }, + { + "epoch": 0.10005364806866952, + "grad_norm": 0.12377783571654773, + "learning_rate": 0.00019744004573866915, + "loss": 0.7559, + "step": 1119 + }, + { + "epoch": 0.10014306151645208, + "grad_norm": 0.11075874806178014, + "learning_rate": 0.0001974335308591806, + "loss": 0.7089, + "step": 1120 + }, + { + "epoch": 0.10023247496423462, + "grad_norm": 0.1115753101867663, + "learning_rate": 0.00019742700780805948, + "loss": 0.7233, + "step": 1121 + }, + { + "epoch": 0.10032188841201717, + "grad_norm": 0.10088811049845599, + "learning_rate": 0.00019742047658585286, + "loss": 0.6732, + "step": 1122 + }, + { + "epoch": 0.10041130185979971, + "grad_norm": 0.0904471601354607, + "learning_rate": 0.00019741393719310853, + "loss": 0.5634, + "step": 1123 + }, + { + "epoch": 0.10050071530758226, + "grad_norm": 0.11760363464319573, + "learning_rate": 0.00019740738963037495, + "loss": 0.6774, + "step": 1124 + }, + { + "epoch": 0.10059012875536481, + "grad_norm": 0.12682109207293454, + "learning_rate": 0.00019740083389820122, + "loss": 0.7251, + "step": 1125 + }, + { + "epoch": 0.10067954220314736, + "grad_norm": 0.12815444725376934, + "learning_rate": 0.0001973942699971372, + "loss": 0.6876, + "step": 1126 + }, + { + "epoch": 0.1007689556509299, + "grad_norm": 0.12222458157430982, + "learning_rate": 0.00019738769792773336, + "loss": 0.726, + "step": 1127 + }, + { + "epoch": 0.10085836909871244, + "grad_norm": 0.11160379188023413, + "learning_rate": 0.00019738111769054093, + "loss": 0.7016, + "step": 1128 + }, + { + "epoch": 0.10094778254649499, + "grad_norm": 0.10469395340491208, + "learning_rate": 0.00019737452928611176, + "loss": 0.6638, + "step": 1129 + }, + { + "epoch": 0.10103719599427755, + "grad_norm": 0.10580912017805619, + "learning_rate": 0.00019736793271499844, + "loss": 0.6839, + "step": 1130 + }, + { + "epoch": 0.10112660944206009, + "grad_norm": 0.11872948854764293, + "learning_rate": 0.00019736132797775415, + "loss": 0.6999, + "step": 1131 + }, + { + "epoch": 0.10121602288984263, + "grad_norm": 0.10746251379434452, + "learning_rate": 0.00019735471507493287, + "loss": 0.7289, + "step": 1132 + }, + { + "epoch": 0.10130543633762518, + "grad_norm": 0.11759399002547126, + "learning_rate": 0.00019734809400708922, + "loss": 0.698, + "step": 1133 + }, + { + "epoch": 0.10139484978540772, + "grad_norm": 0.10945961568320951, + "learning_rate": 0.0001973414647747785, + "loss": 0.7487, + "step": 1134 + }, + { + "epoch": 0.10148426323319028, + "grad_norm": 0.1130184241543984, + "learning_rate": 0.0001973348273785567, + "loss": 0.7196, + "step": 1135 + }, + { + "epoch": 0.10157367668097282, + "grad_norm": 0.12685079382203263, + "learning_rate": 0.00019732818181898045, + "loss": 0.704, + "step": 1136 + }, + { + "epoch": 0.10166309012875537, + "grad_norm": 0.11422763044798245, + "learning_rate": 0.00019732152809660716, + "loss": 0.7159, + "step": 1137 + }, + { + "epoch": 0.10175250357653791, + "grad_norm": 0.11068880305769356, + "learning_rate": 0.0001973148662119948, + "loss": 0.7218, + "step": 1138 + }, + { + "epoch": 0.10184191702432045, + "grad_norm": 0.10430944371229589, + "learning_rate": 0.0001973081961657022, + "loss": 0.713, + "step": 1139 + }, + { + "epoch": 0.10193133047210301, + "grad_norm": 0.10171908946456408, + "learning_rate": 0.00019730151795828866, + "loss": 0.6706, + "step": 1140 + }, + { + "epoch": 0.10202074391988555, + "grad_norm": 0.11607598327408344, + "learning_rate": 0.0001972948315903143, + "loss": 0.6976, + "step": 1141 + }, + { + "epoch": 0.1021101573676681, + "grad_norm": 0.12411466110208902, + "learning_rate": 0.00019728813706233997, + "loss": 0.7132, + "step": 1142 + }, + { + "epoch": 0.10219957081545064, + "grad_norm": 0.12778803355051968, + "learning_rate": 0.00019728143437492706, + "loss": 0.7135, + "step": 1143 + }, + { + "epoch": 0.10228898426323318, + "grad_norm": 0.11399506303693005, + "learning_rate": 0.00019727472352863774, + "loss": 0.7128, + "step": 1144 + }, + { + "epoch": 0.10237839771101574, + "grad_norm": 0.10943306077161287, + "learning_rate": 0.00019726800452403483, + "loss": 0.6544, + "step": 1145 + }, + { + "epoch": 0.10246781115879829, + "grad_norm": 0.11167959628021075, + "learning_rate": 0.00019726127736168186, + "loss": 0.6763, + "step": 1146 + }, + { + "epoch": 0.10255722460658083, + "grad_norm": 0.12192793667989196, + "learning_rate": 0.000197254542042143, + "loss": 0.7106, + "step": 1147 + }, + { + "epoch": 0.10264663805436337, + "grad_norm": 0.12485304910339907, + "learning_rate": 0.00019724779856598317, + "loss": 0.7255, + "step": 1148 + }, + { + "epoch": 0.10273605150214592, + "grad_norm": 0.11379199648208123, + "learning_rate": 0.0001972410469337679, + "loss": 0.6984, + "step": 1149 + }, + { + "epoch": 0.10282546494992847, + "grad_norm": 0.09667559060672343, + "learning_rate": 0.00019723428714606348, + "loss": 0.7048, + "step": 1150 + }, + { + "epoch": 0.10291487839771102, + "grad_norm": 0.11887340851820627, + "learning_rate": 0.0001972275192034368, + "loss": 0.6859, + "step": 1151 + }, + { + "epoch": 0.10300429184549356, + "grad_norm": 0.12159324578558547, + "learning_rate": 0.00019722074310645553, + "loss": 0.7333, + "step": 1152 + }, + { + "epoch": 0.1030937052932761, + "grad_norm": 0.11823758243301129, + "learning_rate": 0.00019721395885568795, + "loss": 0.7153, + "step": 1153 + }, + { + "epoch": 0.10318311874105865, + "grad_norm": 0.11870510620763434, + "learning_rate": 0.00019720716645170303, + "loss": 0.7158, + "step": 1154 + }, + { + "epoch": 0.1032725321888412, + "grad_norm": 0.10799942109390136, + "learning_rate": 0.00019720036589507048, + "loss": 0.6997, + "step": 1155 + }, + { + "epoch": 0.10336194563662375, + "grad_norm": 0.11282320984858696, + "learning_rate": 0.0001971935571863606, + "loss": 0.6578, + "step": 1156 + }, + { + "epoch": 0.1034513590844063, + "grad_norm": 0.10998140735127042, + "learning_rate": 0.00019718674032614448, + "loss": 0.7389, + "step": 1157 + }, + { + "epoch": 0.10354077253218884, + "grad_norm": 0.10777286832722431, + "learning_rate": 0.0001971799153149938, + "loss": 0.6978, + "step": 1158 + }, + { + "epoch": 0.10363018597997138, + "grad_norm": 0.12438142403192427, + "learning_rate": 0.00019717308215348102, + "loss": 0.7264, + "step": 1159 + }, + { + "epoch": 0.10371959942775394, + "grad_norm": 0.10909547786737701, + "learning_rate": 0.00019716624084217918, + "loss": 0.6742, + "step": 1160 + }, + { + "epoch": 0.10380901287553648, + "grad_norm": 0.11986425207317511, + "learning_rate": 0.00019715939138166205, + "loss": 0.7105, + "step": 1161 + }, + { + "epoch": 0.10389842632331903, + "grad_norm": 0.11866185605434645, + "learning_rate": 0.00019715253377250411, + "loss": 0.6776, + "step": 1162 + }, + { + "epoch": 0.10398783977110157, + "grad_norm": 0.11075586917193186, + "learning_rate": 0.0001971456680152805, + "loss": 0.6907, + "step": 1163 + }, + { + "epoch": 0.10407725321888411, + "grad_norm": 0.10213619251412855, + "learning_rate": 0.00019713879411056704, + "loss": 0.688, + "step": 1164 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.11591289541767451, + "learning_rate": 0.00019713191205894025, + "loss": 0.7341, + "step": 1165 + }, + { + "epoch": 0.10425608011444921, + "grad_norm": 0.10700237950523461, + "learning_rate": 0.00019712502186097726, + "loss": 0.7192, + "step": 1166 + }, + { + "epoch": 0.10434549356223176, + "grad_norm": 0.11044524070182057, + "learning_rate": 0.00019711812351725603, + "loss": 0.6823, + "step": 1167 + }, + { + "epoch": 0.1044349070100143, + "grad_norm": 0.1100544476219459, + "learning_rate": 0.00019711121702835504, + "loss": 0.7007, + "step": 1168 + }, + { + "epoch": 0.10452432045779685, + "grad_norm": 0.12244540704078435, + "learning_rate": 0.00019710430239485354, + "loss": 0.7108, + "step": 1169 + }, + { + "epoch": 0.1046137339055794, + "grad_norm": 0.1138028960034345, + "learning_rate": 0.0001970973796173315, + "loss": 0.7203, + "step": 1170 + }, + { + "epoch": 0.10470314735336195, + "grad_norm": 0.10875277787213324, + "learning_rate": 0.00019709044869636947, + "loss": 0.6888, + "step": 1171 + }, + { + "epoch": 0.10479256080114449, + "grad_norm": 0.11835785187137747, + "learning_rate": 0.00019708350963254878, + "loss": 0.7298, + "step": 1172 + }, + { + "epoch": 0.10488197424892703, + "grad_norm": 0.12430391033965968, + "learning_rate": 0.0001970765624264514, + "loss": 0.7306, + "step": 1173 + }, + { + "epoch": 0.10497138769670958, + "grad_norm": 0.1225907540383503, + "learning_rate": 0.00019706960707865996, + "loss": 0.6793, + "step": 1174 + }, + { + "epoch": 0.10506080114449214, + "grad_norm": 0.12042451400319758, + "learning_rate": 0.00019706264358975779, + "loss": 0.7217, + "step": 1175 + }, + { + "epoch": 0.10515021459227468, + "grad_norm": 0.11352705006337399, + "learning_rate": 0.00019705567196032892, + "loss": 0.6734, + "step": 1176 + }, + { + "epoch": 0.10523962804005722, + "grad_norm": 0.10920054334852212, + "learning_rate": 0.0001970486921909581, + "loss": 0.6961, + "step": 1177 + }, + { + "epoch": 0.10532904148783977, + "grad_norm": 0.11379128037746947, + "learning_rate": 0.0001970417042822306, + "loss": 0.7475, + "step": 1178 + }, + { + "epoch": 0.10541845493562232, + "grad_norm": 0.11292414705770648, + "learning_rate": 0.00019703470823473262, + "loss": 0.6891, + "step": 1179 + }, + { + "epoch": 0.10550786838340487, + "grad_norm": 0.12073085229117705, + "learning_rate": 0.0001970277040490508, + "loss": 0.7026, + "step": 1180 + }, + { + "epoch": 0.10559728183118741, + "grad_norm": 0.1085301972435544, + "learning_rate": 0.0001970206917257727, + "loss": 0.7059, + "step": 1181 + }, + { + "epoch": 0.10568669527896996, + "grad_norm": 0.1115418531341273, + "learning_rate": 0.0001970136712654863, + "loss": 0.6963, + "step": 1182 + }, + { + "epoch": 0.1057761087267525, + "grad_norm": 0.12911132223142893, + "learning_rate": 0.00019700664266878045, + "loss": 0.7199, + "step": 1183 + }, + { + "epoch": 0.10586552217453506, + "grad_norm": 0.11485092120930024, + "learning_rate": 0.00019699960593624462, + "loss": 0.7006, + "step": 1184 + }, + { + "epoch": 0.1059549356223176, + "grad_norm": 0.11789523245660187, + "learning_rate": 0.00019699256106846903, + "loss": 0.7042, + "step": 1185 + }, + { + "epoch": 0.10604434907010014, + "grad_norm": 0.11041354052714047, + "learning_rate": 0.00019698550806604445, + "loss": 0.716, + "step": 1186 + }, + { + "epoch": 0.10613376251788269, + "grad_norm": 0.1092044691942089, + "learning_rate": 0.00019697844692956245, + "loss": 0.6982, + "step": 1187 + }, + { + "epoch": 0.10622317596566523, + "grad_norm": 0.11845275209374778, + "learning_rate": 0.0001969713776596152, + "loss": 0.7253, + "step": 1188 + }, + { + "epoch": 0.10631258941344779, + "grad_norm": 0.12818958340665007, + "learning_rate": 0.00019696430025679566, + "loss": 0.738, + "step": 1189 + }, + { + "epoch": 0.10640200286123033, + "grad_norm": 0.11163347597343715, + "learning_rate": 0.00019695721472169734, + "loss": 0.6915, + "step": 1190 + }, + { + "epoch": 0.10649141630901288, + "grad_norm": 0.10868865397085237, + "learning_rate": 0.0001969501210549145, + "loss": 0.6855, + "step": 1191 + }, + { + "epoch": 0.10658082975679542, + "grad_norm": 0.10347424222146083, + "learning_rate": 0.0001969430192570421, + "loss": 0.7192, + "step": 1192 + }, + { + "epoch": 0.10667024320457796, + "grad_norm": 0.13694675268343828, + "learning_rate": 0.00019693590932867578, + "loss": 0.695, + "step": 1193 + }, + { + "epoch": 0.10675965665236052, + "grad_norm": 0.11245584038641218, + "learning_rate": 0.0001969287912704118, + "loss": 0.6639, + "step": 1194 + }, + { + "epoch": 0.10684907010014306, + "grad_norm": 0.12195559912898907, + "learning_rate": 0.00019692166508284716, + "loss": 0.7529, + "step": 1195 + }, + { + "epoch": 0.10693848354792561, + "grad_norm": 0.13143173548941212, + "learning_rate": 0.0001969145307665795, + "loss": 0.7585, + "step": 1196 + }, + { + "epoch": 0.10702789699570815, + "grad_norm": 0.12288819668375744, + "learning_rate": 0.0001969073883222072, + "loss": 0.7531, + "step": 1197 + }, + { + "epoch": 0.1071173104434907, + "grad_norm": 0.12267914679453815, + "learning_rate": 0.00019690023775032929, + "loss": 0.704, + "step": 1198 + }, + { + "epoch": 0.10720672389127325, + "grad_norm": 0.12155308950808667, + "learning_rate": 0.00019689307905154543, + "loss": 0.7106, + "step": 1199 + }, + { + "epoch": 0.1072961373390558, + "grad_norm": 0.12674879862354205, + "learning_rate": 0.00019688591222645607, + "loss": 0.7243, + "step": 1200 + }, + { + "epoch": 0.10738555078683834, + "grad_norm": 0.11146697098800162, + "learning_rate": 0.00019687873727566226, + "loss": 0.7102, + "step": 1201 + }, + { + "epoch": 0.10747496423462088, + "grad_norm": 0.14034450461544085, + "learning_rate": 0.00019687155419976574, + "loss": 0.7301, + "step": 1202 + }, + { + "epoch": 0.10756437768240343, + "grad_norm": 0.13980698425906393, + "learning_rate": 0.000196864362999369, + "loss": 0.6888, + "step": 1203 + }, + { + "epoch": 0.10765379113018599, + "grad_norm": 0.11929711793579836, + "learning_rate": 0.00019685716367507508, + "loss": 0.7084, + "step": 1204 + }, + { + "epoch": 0.10774320457796853, + "grad_norm": 0.10790996599068452, + "learning_rate": 0.00019684995622748784, + "loss": 0.7065, + "step": 1205 + }, + { + "epoch": 0.10783261802575107, + "grad_norm": 0.12809420520305334, + "learning_rate": 0.00019684274065721172, + "loss": 0.7274, + "step": 1206 + }, + { + "epoch": 0.10792203147353362, + "grad_norm": 0.10661652547740018, + "learning_rate": 0.00019683551696485192, + "loss": 0.6604, + "step": 1207 + }, + { + "epoch": 0.10801144492131616, + "grad_norm": 0.1303182112628572, + "learning_rate": 0.00019682828515101423, + "loss": 0.7622, + "step": 1208 + }, + { + "epoch": 0.10810085836909872, + "grad_norm": 0.11064374864561734, + "learning_rate": 0.0001968210452163052, + "loss": 0.6742, + "step": 1209 + }, + { + "epoch": 0.10819027181688126, + "grad_norm": 0.12007154952955891, + "learning_rate": 0.00019681379716133206, + "loss": 0.6915, + "step": 1210 + }, + { + "epoch": 0.1082796852646638, + "grad_norm": 0.11066601772968626, + "learning_rate": 0.00019680654098670267, + "loss": 0.67, + "step": 1211 + }, + { + "epoch": 0.10836909871244635, + "grad_norm": 0.1285481204340949, + "learning_rate": 0.0001967992766930256, + "loss": 0.7678, + "step": 1212 + }, + { + "epoch": 0.10845851216022889, + "grad_norm": 0.11964198913630358, + "learning_rate": 0.0001967920042809101, + "loss": 0.7329, + "step": 1213 + }, + { + "epoch": 0.10854792560801145, + "grad_norm": 0.10719210645772378, + "learning_rate": 0.0001967847237509661, + "loss": 0.7319, + "step": 1214 + }, + { + "epoch": 0.108637339055794, + "grad_norm": 0.11127263021703016, + "learning_rate": 0.00019677743510380417, + "loss": 0.719, + "step": 1215 + }, + { + "epoch": 0.10872675250357654, + "grad_norm": 0.12890897639094262, + "learning_rate": 0.0001967701383400357, + "loss": 0.6899, + "step": 1216 + }, + { + "epoch": 0.10881616595135908, + "grad_norm": 0.10087217192879011, + "learning_rate": 0.00019676283346027254, + "loss": 0.6976, + "step": 1217 + }, + { + "epoch": 0.10890557939914162, + "grad_norm": 0.11058243144107945, + "learning_rate": 0.0001967555204651274, + "loss": 0.6774, + "step": 1218 + }, + { + "epoch": 0.10899499284692418, + "grad_norm": 0.12538853182363183, + "learning_rate": 0.00019674819935521366, + "loss": 0.7246, + "step": 1219 + }, + { + "epoch": 0.10908440629470673, + "grad_norm": 0.12814244795048435, + "learning_rate": 0.0001967408701311452, + "loss": 0.7302, + "step": 1220 + }, + { + "epoch": 0.10917381974248927, + "grad_norm": 0.11812126523363348, + "learning_rate": 0.00019673353279353684, + "loss": 0.7236, + "step": 1221 + }, + { + "epoch": 0.10926323319027181, + "grad_norm": 0.11768828219993852, + "learning_rate": 0.00019672618734300392, + "loss": 0.6887, + "step": 1222 + }, + { + "epoch": 0.10935264663805436, + "grad_norm": 0.12376126141827815, + "learning_rate": 0.0001967188337801625, + "loss": 0.7053, + "step": 1223 + }, + { + "epoch": 0.10944206008583691, + "grad_norm": 0.10974169998686373, + "learning_rate": 0.00019671147210562927, + "loss": 0.6788, + "step": 1224 + }, + { + "epoch": 0.10953147353361946, + "grad_norm": 0.127307981407316, + "learning_rate": 0.00019670410232002164, + "loss": 0.7403, + "step": 1225 + }, + { + "epoch": 0.109620886981402, + "grad_norm": 0.1135810150123527, + "learning_rate": 0.00019669672442395778, + "loss": 0.6861, + "step": 1226 + }, + { + "epoch": 0.10971030042918455, + "grad_norm": 0.10281975090734723, + "learning_rate": 0.00019668933841805644, + "loss": 0.6769, + "step": 1227 + }, + { + "epoch": 0.10979971387696709, + "grad_norm": 0.09742365128303662, + "learning_rate": 0.00019668194430293702, + "loss": 0.6268, + "step": 1228 + }, + { + "epoch": 0.10988912732474965, + "grad_norm": 0.12826609305065398, + "learning_rate": 0.0001966745420792197, + "loss": 0.7555, + "step": 1229 + }, + { + "epoch": 0.10997854077253219, + "grad_norm": 0.10982509946407253, + "learning_rate": 0.0001966671317475253, + "loss": 0.666, + "step": 1230 + }, + { + "epoch": 0.11006795422031473, + "grad_norm": 0.11571148746096584, + "learning_rate": 0.00019665971330847532, + "loss": 0.7175, + "step": 1231 + }, + { + "epoch": 0.11015736766809728, + "grad_norm": 0.13053626847473304, + "learning_rate": 0.0001966522867626919, + "loss": 0.7014, + "step": 1232 + }, + { + "epoch": 0.11024678111587982, + "grad_norm": 0.13315868060330407, + "learning_rate": 0.00019664485211079793, + "loss": 0.7692, + "step": 1233 + }, + { + "epoch": 0.11033619456366238, + "grad_norm": 0.12008331268438739, + "learning_rate": 0.0001966374093534169, + "loss": 0.6969, + "step": 1234 + }, + { + "epoch": 0.11042560801144492, + "grad_norm": 0.11709410048434861, + "learning_rate": 0.00019662995849117307, + "loss": 0.6944, + "step": 1235 + }, + { + "epoch": 0.11051502145922747, + "grad_norm": 0.1112741502481919, + "learning_rate": 0.00019662249952469133, + "loss": 0.6798, + "step": 1236 + }, + { + "epoch": 0.11060443490701001, + "grad_norm": 0.11925062374039408, + "learning_rate": 0.00019661503245459722, + "loss": 0.6989, + "step": 1237 + }, + { + "epoch": 0.11069384835479255, + "grad_norm": 0.12687952973009659, + "learning_rate": 0.00019660755728151706, + "loss": 0.7587, + "step": 1238 + }, + { + "epoch": 0.11078326180257511, + "grad_norm": 0.11560303074378814, + "learning_rate": 0.00019660007400607772, + "loss": 0.7266, + "step": 1239 + }, + { + "epoch": 0.11087267525035766, + "grad_norm": 0.10143760478336973, + "learning_rate": 0.00019659258262890683, + "loss": 0.7158, + "step": 1240 + }, + { + "epoch": 0.1109620886981402, + "grad_norm": 0.09676932910683651, + "learning_rate": 0.00019658508315063272, + "loss": 0.6552, + "step": 1241 + }, + { + "epoch": 0.11105150214592274, + "grad_norm": 0.11360773205740188, + "learning_rate": 0.0001965775755718843, + "loss": 0.7324, + "step": 1242 + }, + { + "epoch": 0.1111409155937053, + "grad_norm": 0.10409982572549888, + "learning_rate": 0.00019657005989329128, + "loss": 0.7227, + "step": 1243 + }, + { + "epoch": 0.11123032904148784, + "grad_norm": 0.11281981265754351, + "learning_rate": 0.0001965625361154839, + "loss": 0.7345, + "step": 1244 + }, + { + "epoch": 0.11131974248927039, + "grad_norm": 0.11681110213534465, + "learning_rate": 0.0001965550042390933, + "loss": 0.6838, + "step": 1245 + }, + { + "epoch": 0.11140915593705293, + "grad_norm": 0.10695378312314789, + "learning_rate": 0.0001965474642647511, + "loss": 0.6976, + "step": 1246 + }, + { + "epoch": 0.11149856938483547, + "grad_norm": 0.09746705833467885, + "learning_rate": 0.00019653991619308965, + "loss": 0.6919, + "step": 1247 + }, + { + "epoch": 0.11158798283261803, + "grad_norm": 0.10558747795997307, + "learning_rate": 0.000196532360024742, + "loss": 0.6812, + "step": 1248 + }, + { + "epoch": 0.11167739628040058, + "grad_norm": 0.11023797468001287, + "learning_rate": 0.00019652479576034196, + "loss": 0.7002, + "step": 1249 + }, + { + "epoch": 0.11176680972818312, + "grad_norm": 0.11612595553452107, + "learning_rate": 0.0001965172234005238, + "loss": 0.6985, + "step": 1250 + }, + { + "epoch": 0.11185622317596566, + "grad_norm": 0.11530122602803362, + "learning_rate": 0.00019650964294592272, + "loss": 0.6845, + "step": 1251 + }, + { + "epoch": 0.11194563662374821, + "grad_norm": 0.12883784648034002, + "learning_rate": 0.0001965020543971744, + "loss": 0.7281, + "step": 1252 + }, + { + "epoch": 0.11203505007153076, + "grad_norm": 0.12550956717115067, + "learning_rate": 0.0001964944577549154, + "loss": 0.7112, + "step": 1253 + }, + { + "epoch": 0.11212446351931331, + "grad_norm": 0.13265346676108042, + "learning_rate": 0.00019648685301978271, + "loss": 0.7513, + "step": 1254 + }, + { + "epoch": 0.11221387696709585, + "grad_norm": 0.1386316928301978, + "learning_rate": 0.00019647924019241416, + "loss": 0.7439, + "step": 1255 + }, + { + "epoch": 0.1123032904148784, + "grad_norm": 0.12809036804119397, + "learning_rate": 0.00019647161927344831, + "loss": 0.6815, + "step": 1256 + }, + { + "epoch": 0.11239270386266094, + "grad_norm": 0.12398705368994496, + "learning_rate": 0.00019646399026352422, + "loss": 0.681, + "step": 1257 + }, + { + "epoch": 0.1124821173104435, + "grad_norm": 0.12080384598511641, + "learning_rate": 0.00019645635316328179, + "loss": 0.7253, + "step": 1258 + }, + { + "epoch": 0.11257153075822604, + "grad_norm": 0.12073359512139643, + "learning_rate": 0.0001964487079733615, + "loss": 0.7086, + "step": 1259 + }, + { + "epoch": 0.11266094420600858, + "grad_norm": 0.12073605794134877, + "learning_rate": 0.00019644105469440455, + "loss": 0.7111, + "step": 1260 + }, + { + "epoch": 0.11275035765379113, + "grad_norm": 0.13628052130366522, + "learning_rate": 0.00019643339332705282, + "loss": 0.7214, + "step": 1261 + }, + { + "epoch": 0.11283977110157367, + "grad_norm": 0.11806996807793622, + "learning_rate": 0.00019642572387194884, + "loss": 0.6829, + "step": 1262 + }, + { + "epoch": 0.11292918454935623, + "grad_norm": 0.12403435007450288, + "learning_rate": 0.00019641804632973585, + "loss": 0.7255, + "step": 1263 + }, + { + "epoch": 0.11301859799713877, + "grad_norm": 0.11592948067197362, + "learning_rate": 0.00019641036070105778, + "loss": 0.685, + "step": 1264 + }, + { + "epoch": 0.11310801144492132, + "grad_norm": 0.1168687611755306, + "learning_rate": 0.00019640266698655917, + "loss": 0.6588, + "step": 1265 + }, + { + "epoch": 0.11319742489270386, + "grad_norm": 0.13494860072712198, + "learning_rate": 0.00019639496518688532, + "loss": 0.7311, + "step": 1266 + }, + { + "epoch": 0.1132868383404864, + "grad_norm": 0.110560562109232, + "learning_rate": 0.00019638725530268217, + "loss": 0.6773, + "step": 1267 + }, + { + "epoch": 0.11337625178826896, + "grad_norm": 0.11146492876953297, + "learning_rate": 0.00019637953733459628, + "loss": 0.7088, + "step": 1268 + }, + { + "epoch": 0.1134656652360515, + "grad_norm": 0.1073513697256103, + "learning_rate": 0.00019637181128327505, + "loss": 0.6283, + "step": 1269 + }, + { + "epoch": 0.11355507868383405, + "grad_norm": 0.10925917078995857, + "learning_rate": 0.00019636407714936636, + "loss": 0.7074, + "step": 1270 + }, + { + "epoch": 0.11364449213161659, + "grad_norm": 0.11669105039202515, + "learning_rate": 0.00019635633493351887, + "loss": 0.7303, + "step": 1271 + }, + { + "epoch": 0.11373390557939914, + "grad_norm": 0.10684658646190368, + "learning_rate": 0.000196348584636382, + "loss": 0.6277, + "step": 1272 + }, + { + "epoch": 0.1138233190271817, + "grad_norm": 0.1466605395799567, + "learning_rate": 0.00019634082625860562, + "loss": 0.698, + "step": 1273 + }, + { + "epoch": 0.11391273247496424, + "grad_norm": 0.13161673049171493, + "learning_rate": 0.00019633305980084055, + "loss": 0.6556, + "step": 1274 + }, + { + "epoch": 0.11400214592274678, + "grad_norm": 0.11530544865291535, + "learning_rate": 0.0001963252852637381, + "loss": 0.6903, + "step": 1275 + }, + { + "epoch": 0.11409155937052932, + "grad_norm": 0.13420867716606605, + "learning_rate": 0.00019631750264795028, + "loss": 0.7296, + "step": 1276 + }, + { + "epoch": 0.11418097281831187, + "grad_norm": 0.11970188382908267, + "learning_rate": 0.00019630971195412983, + "loss": 0.7351, + "step": 1277 + }, + { + "epoch": 0.11427038626609443, + "grad_norm": 0.1174153605345003, + "learning_rate": 0.00019630191318293017, + "loss": 0.7018, + "step": 1278 + }, + { + "epoch": 0.11435979971387697, + "grad_norm": 0.12260033454965091, + "learning_rate": 0.00019629410633500535, + "loss": 0.7221, + "step": 1279 + }, + { + "epoch": 0.11444921316165951, + "grad_norm": 0.10585523417624144, + "learning_rate": 0.00019628629141101012, + "loss": 0.687, + "step": 1280 + }, + { + "epoch": 0.11453862660944206, + "grad_norm": 0.10656256760695416, + "learning_rate": 0.0001962784684115999, + "loss": 0.707, + "step": 1281 + }, + { + "epoch": 0.1146280400572246, + "grad_norm": 0.10651177852964859, + "learning_rate": 0.00019627063733743084, + "loss": 0.6855, + "step": 1282 + }, + { + "epoch": 0.11471745350500716, + "grad_norm": 0.10449448974022026, + "learning_rate": 0.0001962627981891597, + "loss": 0.6708, + "step": 1283 + }, + { + "epoch": 0.1148068669527897, + "grad_norm": 0.11015003305909725, + "learning_rate": 0.00019625495096744388, + "loss": 0.6663, + "step": 1284 + }, + { + "epoch": 0.11489628040057225, + "grad_norm": 0.12061514916165365, + "learning_rate": 0.00019624709567294158, + "loss": 0.7481, + "step": 1285 + }, + { + "epoch": 0.11498569384835479, + "grad_norm": 0.11801840400323987, + "learning_rate": 0.00019623923230631165, + "loss": 0.721, + "step": 1286 + }, + { + "epoch": 0.11507510729613733, + "grad_norm": 0.1042474731790967, + "learning_rate": 0.0001962313608682135, + "loss": 0.7154, + "step": 1287 + }, + { + "epoch": 0.11516452074391989, + "grad_norm": 0.1271077260779588, + "learning_rate": 0.00019622348135930735, + "loss": 0.7335, + "step": 1288 + }, + { + "epoch": 0.11525393419170243, + "grad_norm": 0.11648434266998377, + "learning_rate": 0.00019621559378025401, + "loss": 0.5591, + "step": 1289 + }, + { + "epoch": 0.11534334763948498, + "grad_norm": 0.12832590126915328, + "learning_rate": 0.00019620769813171504, + "loss": 0.6936, + "step": 1290 + }, + { + "epoch": 0.11543276108726752, + "grad_norm": 0.12387522898631366, + "learning_rate": 0.0001961997944143526, + "loss": 0.7029, + "step": 1291 + }, + { + "epoch": 0.11552217453505007, + "grad_norm": 0.10838975108266656, + "learning_rate": 0.00019619188262882958, + "loss": 0.698, + "step": 1292 + }, + { + "epoch": 0.11561158798283262, + "grad_norm": 0.11380757528893048, + "learning_rate": 0.00019618396277580952, + "loss": 0.7074, + "step": 1293 + }, + { + "epoch": 0.11570100143061517, + "grad_norm": 0.11616786293978235, + "learning_rate": 0.0001961760348559567, + "loss": 0.6841, + "step": 1294 + }, + { + "epoch": 0.11579041487839771, + "grad_norm": 0.1390902581587284, + "learning_rate": 0.00019616809886993596, + "loss": 0.7203, + "step": 1295 + }, + { + "epoch": 0.11587982832618025, + "grad_norm": 0.1285420528789538, + "learning_rate": 0.0001961601548184129, + "loss": 0.6924, + "step": 1296 + }, + { + "epoch": 0.1159692417739628, + "grad_norm": 0.11548900730608164, + "learning_rate": 0.00019615220270205382, + "loss": 0.6768, + "step": 1297 + }, + { + "epoch": 0.11605865522174535, + "grad_norm": 0.12259632545794921, + "learning_rate": 0.00019614424252152558, + "loss": 0.7168, + "step": 1298 + }, + { + "epoch": 0.1161480686695279, + "grad_norm": 0.11788013246932903, + "learning_rate": 0.0001961362742774959, + "loss": 0.7014, + "step": 1299 + }, + { + "epoch": 0.11623748211731044, + "grad_norm": 0.12142020665284659, + "learning_rate": 0.00019612829797063295, + "loss": 0.7121, + "step": 1300 + }, + { + "epoch": 0.11632689556509299, + "grad_norm": 0.1129896620931224, + "learning_rate": 0.00019612031360160574, + "loss": 0.7156, + "step": 1301 + }, + { + "epoch": 0.11641630901287553, + "grad_norm": 0.11412266186402904, + "learning_rate": 0.00019611232117108395, + "loss": 0.7254, + "step": 1302 + }, + { + "epoch": 0.11650572246065809, + "grad_norm": 0.10825804625970988, + "learning_rate": 0.00019610432067973781, + "loss": 0.6737, + "step": 1303 + }, + { + "epoch": 0.11659513590844063, + "grad_norm": 0.12665131431983026, + "learning_rate": 0.0001960963121282384, + "loss": 0.6727, + "step": 1304 + }, + { + "epoch": 0.11668454935622317, + "grad_norm": 0.11515346290036901, + "learning_rate": 0.0001960882955172573, + "loss": 0.6763, + "step": 1305 + }, + { + "epoch": 0.11677396280400572, + "grad_norm": 0.11167642658239972, + "learning_rate": 0.00019608027084746694, + "loss": 0.6787, + "step": 1306 + }, + { + "epoch": 0.11686337625178828, + "grad_norm": 0.12155640149931418, + "learning_rate": 0.0001960722381195403, + "loss": 0.7099, + "step": 1307 + }, + { + "epoch": 0.11695278969957082, + "grad_norm": 0.11063298002262108, + "learning_rate": 0.00019606419733415105, + "loss": 0.6756, + "step": 1308 + }, + { + "epoch": 0.11704220314735336, + "grad_norm": 0.11069396635123849, + "learning_rate": 0.00019605614849197358, + "loss": 0.7129, + "step": 1309 + }, + { + "epoch": 0.1171316165951359, + "grad_norm": 0.10439603742815884, + "learning_rate": 0.000196048091593683, + "loss": 0.6663, + "step": 1310 + }, + { + "epoch": 0.11722103004291845, + "grad_norm": 0.10371951374696756, + "learning_rate": 0.00019604002663995492, + "loss": 0.6946, + "step": 1311 + }, + { + "epoch": 0.11731044349070101, + "grad_norm": 0.11653013669799103, + "learning_rate": 0.0001960319536314658, + "loss": 0.6683, + "step": 1312 + }, + { + "epoch": 0.11739985693848355, + "grad_norm": 0.10742768901559895, + "learning_rate": 0.0001960238725688927, + "loss": 0.6883, + "step": 1313 + }, + { + "epoch": 0.1174892703862661, + "grad_norm": 0.1276439693077825, + "learning_rate": 0.0001960157834529134, + "loss": 0.7453, + "step": 1314 + }, + { + "epoch": 0.11757868383404864, + "grad_norm": 0.11219386807380298, + "learning_rate": 0.0001960076862842063, + "loss": 0.6665, + "step": 1315 + }, + { + "epoch": 0.11766809728183118, + "grad_norm": 0.11037858167623397, + "learning_rate": 0.00019599958106345045, + "loss": 0.6787, + "step": 1316 + }, + { + "epoch": 0.11775751072961374, + "grad_norm": 0.11582394474520591, + "learning_rate": 0.00019599146779132576, + "loss": 0.7064, + "step": 1317 + }, + { + "epoch": 0.11784692417739628, + "grad_norm": 0.12873091380053847, + "learning_rate": 0.00019598334646851254, + "loss": 0.6658, + "step": 1318 + }, + { + "epoch": 0.11793633762517883, + "grad_norm": 0.11432640411814078, + "learning_rate": 0.000195975217095692, + "loss": 0.6911, + "step": 1319 + }, + { + "epoch": 0.11802575107296137, + "grad_norm": 0.12226688265420023, + "learning_rate": 0.00019596707967354585, + "loss": 0.7121, + "step": 1320 + }, + { + "epoch": 0.11811516452074391, + "grad_norm": 0.13442885663668452, + "learning_rate": 0.00019595893420275667, + "loss": 0.7197, + "step": 1321 + }, + { + "epoch": 0.11820457796852647, + "grad_norm": 0.132504994237494, + "learning_rate": 0.00019595078068400756, + "loss": 0.7149, + "step": 1322 + }, + { + "epoch": 0.11829399141630902, + "grad_norm": 0.11855394161153315, + "learning_rate": 0.00019594261911798236, + "loss": 0.6671, + "step": 1323 + }, + { + "epoch": 0.11838340486409156, + "grad_norm": 0.12865715681440104, + "learning_rate": 0.00019593444950536556, + "loss": 0.7087, + "step": 1324 + }, + { + "epoch": 0.1184728183118741, + "grad_norm": 0.13400549590773525, + "learning_rate": 0.00019592627184684234, + "loss": 0.7524, + "step": 1325 + }, + { + "epoch": 0.11856223175965665, + "grad_norm": 0.12150140724107768, + "learning_rate": 0.00019591808614309854, + "loss": 0.6911, + "step": 1326 + }, + { + "epoch": 0.1186516452074392, + "grad_norm": 0.11874639771258365, + "learning_rate": 0.0001959098923948207, + "loss": 0.6353, + "step": 1327 + }, + { + "epoch": 0.11874105865522175, + "grad_norm": 0.11381344933661627, + "learning_rate": 0.00019590169060269602, + "loss": 0.7146, + "step": 1328 + }, + { + "epoch": 0.11883047210300429, + "grad_norm": 0.11904622471106398, + "learning_rate": 0.0001958934807674124, + "loss": 0.6991, + "step": 1329 + }, + { + "epoch": 0.11891988555078684, + "grad_norm": 0.12873332434194412, + "learning_rate": 0.00019588526288965828, + "loss": 0.7288, + "step": 1330 + }, + { + "epoch": 0.11900929899856938, + "grad_norm": 0.10686604922279774, + "learning_rate": 0.00019587703697012302, + "loss": 0.662, + "step": 1331 + }, + { + "epoch": 0.11909871244635194, + "grad_norm": 0.11914043323618877, + "learning_rate": 0.00019586880300949644, + "loss": 0.6922, + "step": 1332 + }, + { + "epoch": 0.11918812589413448, + "grad_norm": 0.10611821071926282, + "learning_rate": 0.00019586056100846916, + "loss": 0.7295, + "step": 1333 + }, + { + "epoch": 0.11927753934191702, + "grad_norm": 0.13176363239373892, + "learning_rate": 0.00019585231096773238, + "loss": 0.7025, + "step": 1334 + }, + { + "epoch": 0.11936695278969957, + "grad_norm": 0.11708486556818347, + "learning_rate": 0.00019584405288797802, + "loss": 0.6977, + "step": 1335 + }, + { + "epoch": 0.11945636623748211, + "grad_norm": 0.1298187343291837, + "learning_rate": 0.0001958357867698987, + "loss": 0.7523, + "step": 1336 + }, + { + "epoch": 0.11954577968526467, + "grad_norm": 0.10547235654047968, + "learning_rate": 0.0001958275126141877, + "loss": 0.6674, + "step": 1337 + }, + { + "epoch": 0.11963519313304721, + "grad_norm": 0.11085501599193334, + "learning_rate": 0.00019581923042153894, + "loss": 0.7015, + "step": 1338 + }, + { + "epoch": 0.11972460658082976, + "grad_norm": 0.11183646639923467, + "learning_rate": 0.00019581094019264705, + "loss": 0.712, + "step": 1339 + }, + { + "epoch": 0.1198140200286123, + "grad_norm": 0.12574170208602375, + "learning_rate": 0.00019580264192820733, + "loss": 0.7262, + "step": 1340 + }, + { + "epoch": 0.11990343347639484, + "grad_norm": 0.1296013498845716, + "learning_rate": 0.00019579433562891572, + "loss": 0.6783, + "step": 1341 + }, + { + "epoch": 0.1199928469241774, + "grad_norm": 0.12290868966235159, + "learning_rate": 0.00019578602129546885, + "loss": 0.6875, + "step": 1342 + }, + { + "epoch": 0.12008226037195995, + "grad_norm": 0.11577739590440027, + "learning_rate": 0.00019577769892856407, + "loss": 0.7272, + "step": 1343 + }, + { + "epoch": 0.12017167381974249, + "grad_norm": 0.12220425017564503, + "learning_rate": 0.00019576936852889936, + "loss": 0.6924, + "step": 1344 + }, + { + "epoch": 0.12026108726752503, + "grad_norm": 0.1374115673149061, + "learning_rate": 0.00019576103009717337, + "loss": 0.6804, + "step": 1345 + }, + { + "epoch": 0.12035050071530758, + "grad_norm": 0.13737306075129918, + "learning_rate": 0.00019575268363408542, + "loss": 0.7576, + "step": 1346 + }, + { + "epoch": 0.12043991416309013, + "grad_norm": 0.13472850925949686, + "learning_rate": 0.00019574432914033554, + "loss": 0.7246, + "step": 1347 + }, + { + "epoch": 0.12052932761087268, + "grad_norm": 0.12045714332051638, + "learning_rate": 0.00019573596661662438, + "loss": 0.7378, + "step": 1348 + }, + { + "epoch": 0.12061874105865522, + "grad_norm": 0.1170213281960471, + "learning_rate": 0.00019572759606365336, + "loss": 0.7292, + "step": 1349 + }, + { + "epoch": 0.12070815450643776, + "grad_norm": 0.10027984993773656, + "learning_rate": 0.00019571921748212447, + "loss": 0.6686, + "step": 1350 + }, + { + "epoch": 0.12079756795422031, + "grad_norm": 0.1300739754805338, + "learning_rate": 0.0001957108308727404, + "loss": 0.7397, + "step": 1351 + }, + { + "epoch": 0.12088698140200287, + "grad_norm": 0.11247430819573619, + "learning_rate": 0.0001957024362362045, + "loss": 0.7134, + "step": 1352 + }, + { + "epoch": 0.12097639484978541, + "grad_norm": 0.12227556564161822, + "learning_rate": 0.0001956940335732209, + "loss": 0.7143, + "step": 1353 + }, + { + "epoch": 0.12106580829756795, + "grad_norm": 0.11738305673393284, + "learning_rate": 0.00019568562288449422, + "loss": 0.7183, + "step": 1354 + }, + { + "epoch": 0.1211552217453505, + "grad_norm": 0.11155556193797557, + "learning_rate": 0.00019567720417072997, + "loss": 0.6857, + "step": 1355 + }, + { + "epoch": 0.12124463519313304, + "grad_norm": 0.14073339857340286, + "learning_rate": 0.00019566877743263414, + "loss": 0.7067, + "step": 1356 + }, + { + "epoch": 0.1213340486409156, + "grad_norm": 0.11464030544475967, + "learning_rate": 0.00019566034267091346, + "loss": 0.6863, + "step": 1357 + }, + { + "epoch": 0.12142346208869814, + "grad_norm": 0.12366928173397226, + "learning_rate": 0.0001956518998862754, + "loss": 0.7083, + "step": 1358 + }, + { + "epoch": 0.12151287553648069, + "grad_norm": 0.11892530452135087, + "learning_rate": 0.00019564344907942798, + "loss": 0.7148, + "step": 1359 + }, + { + "epoch": 0.12160228898426323, + "grad_norm": 0.11075271188057849, + "learning_rate": 0.00019563499025107998, + "loss": 0.7436, + "step": 1360 + }, + { + "epoch": 0.12169170243204577, + "grad_norm": 0.1294566940771805, + "learning_rate": 0.0001956265234019409, + "loss": 0.7634, + "step": 1361 + }, + { + "epoch": 0.12178111587982833, + "grad_norm": 0.1191704902119027, + "learning_rate": 0.00019561804853272075, + "loss": 0.7123, + "step": 1362 + }, + { + "epoch": 0.12187052932761087, + "grad_norm": 0.12368798289050359, + "learning_rate": 0.00019560956564413035, + "loss": 0.6959, + "step": 1363 + }, + { + "epoch": 0.12195994277539342, + "grad_norm": 0.11676018877934677, + "learning_rate": 0.00019560107473688118, + "loss": 0.7058, + "step": 1364 + }, + { + "epoch": 0.12204935622317596, + "grad_norm": 0.12663930420411879, + "learning_rate": 0.0001955925758116853, + "loss": 0.7309, + "step": 1365 + }, + { + "epoch": 0.1221387696709585, + "grad_norm": 0.09962992122745205, + "learning_rate": 0.00019558406886925554, + "loss": 0.6714, + "step": 1366 + }, + { + "epoch": 0.12222818311874106, + "grad_norm": 0.10056414982413514, + "learning_rate": 0.00019557555391030537, + "loss": 0.5782, + "step": 1367 + }, + { + "epoch": 0.1223175965665236, + "grad_norm": 0.0983970154318763, + "learning_rate": 0.0001955670309355489, + "loss": 0.5776, + "step": 1368 + }, + { + "epoch": 0.12240701001430615, + "grad_norm": 0.12447001295559146, + "learning_rate": 0.00019555849994570097, + "loss": 0.7085, + "step": 1369 + }, + { + "epoch": 0.1224964234620887, + "grad_norm": 0.12194278947221437, + "learning_rate": 0.00019554996094147707, + "loss": 0.6855, + "step": 1370 + }, + { + "epoch": 0.12258583690987125, + "grad_norm": 0.12620947616315883, + "learning_rate": 0.00019554141392359332, + "loss": 0.7329, + "step": 1371 + }, + { + "epoch": 0.1226752503576538, + "grad_norm": 0.1104643623405123, + "learning_rate": 0.00019553285889276656, + "loss": 0.6708, + "step": 1372 + }, + { + "epoch": 0.12276466380543634, + "grad_norm": 0.11188258510273322, + "learning_rate": 0.00019552429584971434, + "loss": 0.6958, + "step": 1373 + }, + { + "epoch": 0.12285407725321888, + "grad_norm": 0.10432167355066395, + "learning_rate": 0.00019551572479515478, + "loss": 0.5968, + "step": 1374 + }, + { + "epoch": 0.12294349070100143, + "grad_norm": 0.11768705599316626, + "learning_rate": 0.00019550714572980668, + "loss": 0.7032, + "step": 1375 + }, + { + "epoch": 0.12303290414878398, + "grad_norm": 0.12090310452164336, + "learning_rate": 0.00019549855865438965, + "loss": 0.7213, + "step": 1376 + }, + { + "epoch": 0.12312231759656653, + "grad_norm": 0.1187023768230625, + "learning_rate": 0.00019548996356962386, + "loss": 0.6985, + "step": 1377 + }, + { + "epoch": 0.12321173104434907, + "grad_norm": 0.10919762902284709, + "learning_rate": 0.00019548136047623015, + "loss": 0.7184, + "step": 1378 + }, + { + "epoch": 0.12330114449213161, + "grad_norm": 0.10666915347756019, + "learning_rate": 0.00019547274937492998, + "loss": 0.7135, + "step": 1379 + }, + { + "epoch": 0.12339055793991416, + "grad_norm": 0.12155913833891308, + "learning_rate": 0.00019546413026644567, + "loss": 0.7237, + "step": 1380 + }, + { + "epoch": 0.12347997138769672, + "grad_norm": 0.11004956973348659, + "learning_rate": 0.00019545550315150004, + "loss": 0.6786, + "step": 1381 + }, + { + "epoch": 0.12356938483547926, + "grad_norm": 0.1028474481005824, + "learning_rate": 0.00019544686803081666, + "loss": 0.6703, + "step": 1382 + }, + { + "epoch": 0.1236587982832618, + "grad_norm": 0.10476714522082088, + "learning_rate": 0.00019543822490511974, + "loss": 0.6652, + "step": 1383 + }, + { + "epoch": 0.12374821173104435, + "grad_norm": 0.11667027424798726, + "learning_rate": 0.00019542957377513412, + "loss": 0.7155, + "step": 1384 + }, + { + "epoch": 0.12383762517882689, + "grad_norm": 0.11729309700547232, + "learning_rate": 0.00019542091464158542, + "loss": 0.6603, + "step": 1385 + }, + { + "epoch": 0.12392703862660945, + "grad_norm": 0.1277188841778547, + "learning_rate": 0.00019541224750519983, + "loss": 0.7274, + "step": 1386 + }, + { + "epoch": 0.12401645207439199, + "grad_norm": 0.11810195961638592, + "learning_rate": 0.00019540357236670427, + "loss": 0.7032, + "step": 1387 + }, + { + "epoch": 0.12410586552217454, + "grad_norm": 0.1420518841681989, + "learning_rate": 0.00019539488922682633, + "loss": 0.7129, + "step": 1388 + }, + { + "epoch": 0.12419527896995708, + "grad_norm": 0.12091945203924717, + "learning_rate": 0.00019538619808629422, + "loss": 0.6886, + "step": 1389 + }, + { + "epoch": 0.12428469241773962, + "grad_norm": 0.13817160741898726, + "learning_rate": 0.0001953774989458369, + "loss": 0.727, + "step": 1390 + }, + { + "epoch": 0.12437410586552218, + "grad_norm": 0.10790253845395395, + "learning_rate": 0.00019536879180618392, + "loss": 0.7135, + "step": 1391 + }, + { + "epoch": 0.12446351931330472, + "grad_norm": 0.11205741863441573, + "learning_rate": 0.00019536007666806556, + "loss": 0.6561, + "step": 1392 + }, + { + "epoch": 0.12455293276108727, + "grad_norm": 0.12044754409471638, + "learning_rate": 0.00019535135353221272, + "loss": 0.7017, + "step": 1393 + }, + { + "epoch": 0.12464234620886981, + "grad_norm": 0.1357506525814231, + "learning_rate": 0.000195342622399357, + "loss": 0.729, + "step": 1394 + }, + { + "epoch": 0.12473175965665236, + "grad_norm": 0.11256024074691025, + "learning_rate": 0.0001953338832702307, + "loss": 0.6956, + "step": 1395 + }, + { + "epoch": 0.12482117310443491, + "grad_norm": 0.10613164166590644, + "learning_rate": 0.00019532513614556673, + "loss": 0.6963, + "step": 1396 + }, + { + "epoch": 0.12491058655221746, + "grad_norm": 0.12122018056169745, + "learning_rate": 0.00019531638102609873, + "loss": 0.6862, + "step": 1397 + }, + { + "epoch": 0.125, + "grad_norm": 0.1091948894452867, + "learning_rate": 0.00019530761791256097, + "loss": 0.6866, + "step": 1398 + }, + { + "epoch": 0.12508941344778254, + "grad_norm": 0.13004041282076712, + "learning_rate": 0.0001952988468056884, + "loss": 0.7298, + "step": 1399 + }, + { + "epoch": 0.1251788268955651, + "grad_norm": 0.11379688949702871, + "learning_rate": 0.00019529006770621662, + "loss": 0.6904, + "step": 1400 + }, + { + "epoch": 0.12526824034334763, + "grad_norm": 0.10976973116316134, + "learning_rate": 0.00019528128061488195, + "loss": 0.6924, + "step": 1401 + }, + { + "epoch": 0.12535765379113017, + "grad_norm": 0.12486503183585514, + "learning_rate": 0.00019527248553242137, + "loss": 0.7209, + "step": 1402 + }, + { + "epoch": 0.12544706723891275, + "grad_norm": 0.11627945957013704, + "learning_rate": 0.00019526368245957246, + "loss": 0.7221, + "step": 1403 + }, + { + "epoch": 0.1255364806866953, + "grad_norm": 0.11221866387999863, + "learning_rate": 0.00019525487139707357, + "loss": 0.7056, + "step": 1404 + }, + { + "epoch": 0.12562589413447783, + "grad_norm": 0.12777176992979314, + "learning_rate": 0.00019524605234566363, + "loss": 0.7336, + "step": 1405 + }, + { + "epoch": 0.12571530758226038, + "grad_norm": 0.11646299658244984, + "learning_rate": 0.00019523722530608232, + "loss": 0.6834, + "step": 1406 + }, + { + "epoch": 0.12580472103004292, + "grad_norm": 0.13352914949985237, + "learning_rate": 0.00019522839027906995, + "loss": 0.7083, + "step": 1407 + }, + { + "epoch": 0.12589413447782546, + "grad_norm": 0.13127104802419506, + "learning_rate": 0.0001952195472653675, + "loss": 0.7302, + "step": 1408 + }, + { + "epoch": 0.125983547925608, + "grad_norm": 0.11372869713857085, + "learning_rate": 0.0001952106962657166, + "loss": 0.7148, + "step": 1409 + }, + { + "epoch": 0.12607296137339055, + "grad_norm": 0.1299928595177995, + "learning_rate": 0.0001952018372808596, + "loss": 0.7003, + "step": 1410 + }, + { + "epoch": 0.1261623748211731, + "grad_norm": 0.13066068506550135, + "learning_rate": 0.00019519297031153946, + "loss": 0.7115, + "step": 1411 + }, + { + "epoch": 0.12625178826895564, + "grad_norm": 0.11068189410941494, + "learning_rate": 0.0001951840953584999, + "loss": 0.6931, + "step": 1412 + }, + { + "epoch": 0.1263412017167382, + "grad_norm": 0.10707564933849793, + "learning_rate": 0.0001951752124224852, + "loss": 0.7129, + "step": 1413 + }, + { + "epoch": 0.12643061516452075, + "grad_norm": 0.12869595193987082, + "learning_rate": 0.00019516632150424034, + "loss": 0.7126, + "step": 1414 + }, + { + "epoch": 0.1265200286123033, + "grad_norm": 0.12079566345869255, + "learning_rate": 0.00019515742260451107, + "loss": 0.7263, + "step": 1415 + }, + { + "epoch": 0.12660944206008584, + "grad_norm": 0.10310467795173718, + "learning_rate": 0.00019514851572404368, + "loss": 0.6652, + "step": 1416 + }, + { + "epoch": 0.12669885550786839, + "grad_norm": 0.09074334679275928, + "learning_rate": 0.0001951396008635852, + "loss": 0.5302, + "step": 1417 + }, + { + "epoch": 0.12678826895565093, + "grad_norm": 0.13700271783217333, + "learning_rate": 0.00019513067802388325, + "loss": 0.7043, + "step": 1418 + }, + { + "epoch": 0.12687768240343347, + "grad_norm": 0.1183406772501756, + "learning_rate": 0.00019512174720568627, + "loss": 0.6908, + "step": 1419 + }, + { + "epoch": 0.12696709585121602, + "grad_norm": 0.12419024773157154, + "learning_rate": 0.0001951128084097432, + "loss": 0.6885, + "step": 1420 + }, + { + "epoch": 0.12705650929899856, + "grad_norm": 0.10637907879411794, + "learning_rate": 0.00019510386163680375, + "loss": 0.6581, + "step": 1421 + }, + { + "epoch": 0.1271459227467811, + "grad_norm": 0.12761706650518745, + "learning_rate": 0.00019509490688761832, + "loss": 0.7094, + "step": 1422 + }, + { + "epoch": 0.12723533619456368, + "grad_norm": 0.1206806011551076, + "learning_rate": 0.0001950859441629379, + "loss": 0.6828, + "step": 1423 + }, + { + "epoch": 0.12732474964234622, + "grad_norm": 0.10817346118937556, + "learning_rate": 0.00019507697346351414, + "loss": 0.6517, + "step": 1424 + }, + { + "epoch": 0.12741416309012876, + "grad_norm": 0.12170143873190516, + "learning_rate": 0.00019506799479009944, + "loss": 0.6792, + "step": 1425 + }, + { + "epoch": 0.1275035765379113, + "grad_norm": 0.11406534107687887, + "learning_rate": 0.00019505900814344683, + "loss": 0.6808, + "step": 1426 + }, + { + "epoch": 0.12759298998569385, + "grad_norm": 0.1223131119291968, + "learning_rate": 0.00019505001352431003, + "loss": 0.7098, + "step": 1427 + }, + { + "epoch": 0.1276824034334764, + "grad_norm": 0.13023402864115616, + "learning_rate": 0.00019504101093344338, + "loss": 0.7236, + "step": 1428 + }, + { + "epoch": 0.12777181688125894, + "grad_norm": 0.11800960208057892, + "learning_rate": 0.00019503200037160193, + "loss": 0.7028, + "step": 1429 + }, + { + "epoch": 0.12786123032904148, + "grad_norm": 0.12125200157827958, + "learning_rate": 0.00019502298183954136, + "loss": 0.7348, + "step": 1430 + }, + { + "epoch": 0.12795064377682402, + "grad_norm": 0.12195976876025827, + "learning_rate": 0.00019501395533801807, + "loss": 0.6982, + "step": 1431 + }, + { + "epoch": 0.12804005722460657, + "grad_norm": 0.11240479703116116, + "learning_rate": 0.0001950049208677891, + "loss": 0.6637, + "step": 1432 + }, + { + "epoch": 0.12812947067238914, + "grad_norm": 0.10910191231711004, + "learning_rate": 0.00019499587842961214, + "loss": 0.6702, + "step": 1433 + }, + { + "epoch": 0.12821888412017168, + "grad_norm": 0.13072124614763608, + "learning_rate": 0.0001949868280242456, + "loss": 0.7166, + "step": 1434 + }, + { + "epoch": 0.12830829756795423, + "grad_norm": 0.12817617769998038, + "learning_rate": 0.0001949777696524485, + "loss": 0.7602, + "step": 1435 + }, + { + "epoch": 0.12839771101573677, + "grad_norm": 0.12370172615927699, + "learning_rate": 0.00019496870331498056, + "loss": 0.6975, + "step": 1436 + }, + { + "epoch": 0.12848712446351931, + "grad_norm": 0.11290657408941307, + "learning_rate": 0.00019495962901260215, + "loss": 0.7031, + "step": 1437 + }, + { + "epoch": 0.12857653791130186, + "grad_norm": 0.11327060372430829, + "learning_rate": 0.00019495054674607438, + "loss": 0.6971, + "step": 1438 + }, + { + "epoch": 0.1286659513590844, + "grad_norm": 0.12693011511449223, + "learning_rate": 0.00019494145651615888, + "loss": 0.6965, + "step": 1439 + }, + { + "epoch": 0.12875536480686695, + "grad_norm": 0.1151606803085685, + "learning_rate": 0.0001949323583236181, + "loss": 0.7125, + "step": 1440 + }, + { + "epoch": 0.1288447782546495, + "grad_norm": 0.12140108468388502, + "learning_rate": 0.00019492325216921506, + "loss": 0.732, + "step": 1441 + }, + { + "epoch": 0.12893419170243203, + "grad_norm": 0.1172759519776798, + "learning_rate": 0.00019491413805371356, + "loss": 0.6781, + "step": 1442 + }, + { + "epoch": 0.1290236051502146, + "grad_norm": 0.11739624055286309, + "learning_rate": 0.0001949050159778779, + "loss": 0.7384, + "step": 1443 + }, + { + "epoch": 0.12911301859799715, + "grad_norm": 0.12627830256368824, + "learning_rate": 0.00019489588594247313, + "loss": 0.7414, + "step": 1444 + }, + { + "epoch": 0.1292024320457797, + "grad_norm": 0.11740502307930578, + "learning_rate": 0.00019488674794826505, + "loss": 0.6856, + "step": 1445 + }, + { + "epoch": 0.12929184549356224, + "grad_norm": 0.10727765619569893, + "learning_rate": 0.00019487760199602, + "loss": 0.6587, + "step": 1446 + }, + { + "epoch": 0.12938125894134478, + "grad_norm": 0.11044771224643073, + "learning_rate": 0.00019486844808650503, + "loss": 0.6854, + "step": 1447 + }, + { + "epoch": 0.12947067238912732, + "grad_norm": 0.11895932987135162, + "learning_rate": 0.00019485928622048793, + "loss": 0.7077, + "step": 1448 + }, + { + "epoch": 0.12956008583690987, + "grad_norm": 0.11188661295676891, + "learning_rate": 0.00019485011639873702, + "loss": 0.6898, + "step": 1449 + }, + { + "epoch": 0.1296494992846924, + "grad_norm": 0.11637903990560167, + "learning_rate": 0.0001948409386220214, + "loss": 0.6916, + "step": 1450 + }, + { + "epoch": 0.12973891273247495, + "grad_norm": 0.10915182915660616, + "learning_rate": 0.00019483175289111083, + "loss": 0.6677, + "step": 1451 + }, + { + "epoch": 0.1298283261802575, + "grad_norm": 0.11618937831446567, + "learning_rate": 0.00019482255920677565, + "loss": 0.6864, + "step": 1452 + }, + { + "epoch": 0.12991773962804007, + "grad_norm": 0.1206500030092326, + "learning_rate": 0.00019481335756978696, + "loss": 0.6866, + "step": 1453 + }, + { + "epoch": 0.1300071530758226, + "grad_norm": 0.12645812482068586, + "learning_rate": 0.00019480414798091647, + "loss": 0.6862, + "step": 1454 + }, + { + "epoch": 0.13009656652360516, + "grad_norm": 0.12125982930800062, + "learning_rate": 0.00019479493044093657, + "loss": 0.7208, + "step": 1455 + }, + { + "epoch": 0.1301859799713877, + "grad_norm": 0.12258678143047046, + "learning_rate": 0.00019478570495062037, + "loss": 0.7375, + "step": 1456 + }, + { + "epoch": 0.13027539341917024, + "grad_norm": 0.11357095659102791, + "learning_rate": 0.00019477647151074155, + "loss": 0.7125, + "step": 1457 + }, + { + "epoch": 0.1303648068669528, + "grad_norm": 0.11182222652347619, + "learning_rate": 0.0001947672301220745, + "loss": 0.7062, + "step": 1458 + }, + { + "epoch": 0.13045422031473533, + "grad_norm": 0.1150097502595012, + "learning_rate": 0.00019475798078539433, + "loss": 0.7251, + "step": 1459 + }, + { + "epoch": 0.13054363376251787, + "grad_norm": 0.11967431006700122, + "learning_rate": 0.00019474872350147676, + "loss": 0.6931, + "step": 1460 + }, + { + "epoch": 0.13063304721030042, + "grad_norm": 0.14101365241412636, + "learning_rate": 0.0001947394582710982, + "loss": 0.7481, + "step": 1461 + }, + { + "epoch": 0.130722460658083, + "grad_norm": 0.10513355577992721, + "learning_rate": 0.00019473018509503565, + "loss": 0.6357, + "step": 1462 + }, + { + "epoch": 0.13081187410586553, + "grad_norm": 0.11612034524793406, + "learning_rate": 0.00019472090397406686, + "loss": 0.688, + "step": 1463 + }, + { + "epoch": 0.13090128755364808, + "grad_norm": 0.10694389947666129, + "learning_rate": 0.00019471161490897029, + "loss": 0.6945, + "step": 1464 + }, + { + "epoch": 0.13099070100143062, + "grad_norm": 0.11881934394623718, + "learning_rate": 0.00019470231790052496, + "loss": 0.698, + "step": 1465 + }, + { + "epoch": 0.13108011444921316, + "grad_norm": 0.11332664611157374, + "learning_rate": 0.0001946930129495106, + "loss": 0.6672, + "step": 1466 + }, + { + "epoch": 0.1311695278969957, + "grad_norm": 0.12866010836217706, + "learning_rate": 0.00019468370005670758, + "loss": 0.7254, + "step": 1467 + }, + { + "epoch": 0.13125894134477825, + "grad_norm": 0.12861668633952847, + "learning_rate": 0.00019467437922289697, + "loss": 0.7117, + "step": 1468 + }, + { + "epoch": 0.1313483547925608, + "grad_norm": 0.12204880043592947, + "learning_rate": 0.00019466505044886056, + "loss": 0.7001, + "step": 1469 + }, + { + "epoch": 0.13143776824034334, + "grad_norm": 0.1243946040036735, + "learning_rate": 0.00019465571373538068, + "loss": 0.6704, + "step": 1470 + }, + { + "epoch": 0.13152718168812588, + "grad_norm": 0.12245572482330055, + "learning_rate": 0.00019464636908324038, + "loss": 0.705, + "step": 1471 + }, + { + "epoch": 0.13161659513590845, + "grad_norm": 0.12752912914138015, + "learning_rate": 0.00019463701649322343, + "loss": 0.7139, + "step": 1472 + }, + { + "epoch": 0.131706008583691, + "grad_norm": 0.12944730732673557, + "learning_rate": 0.0001946276559661142, + "loss": 0.7203, + "step": 1473 + }, + { + "epoch": 0.13179542203147354, + "grad_norm": 0.10207656648022136, + "learning_rate": 0.00019461828750269775, + "loss": 0.6779, + "step": 1474 + }, + { + "epoch": 0.13188483547925609, + "grad_norm": 0.11558194159557372, + "learning_rate": 0.00019460891110375977, + "loss": 0.6945, + "step": 1475 + }, + { + "epoch": 0.13197424892703863, + "grad_norm": 0.11205472855810344, + "learning_rate": 0.00019459952677008672, + "loss": 0.6859, + "step": 1476 + }, + { + "epoch": 0.13206366237482117, + "grad_norm": 0.12645534593875923, + "learning_rate": 0.00019459013450246558, + "loss": 0.7127, + "step": 1477 + }, + { + "epoch": 0.13215307582260372, + "grad_norm": 0.11509057927981123, + "learning_rate": 0.0001945807343016841, + "loss": 0.674, + "step": 1478 + }, + { + "epoch": 0.13224248927038626, + "grad_norm": 0.11271868718612851, + "learning_rate": 0.00019457132616853065, + "loss": 0.6698, + "step": 1479 + }, + { + "epoch": 0.1323319027181688, + "grad_norm": 0.0956356746302997, + "learning_rate": 0.00019456191010379427, + "loss": 0.5825, + "step": 1480 + }, + { + "epoch": 0.13242131616595135, + "grad_norm": 0.1144281361529204, + "learning_rate": 0.00019455248610826474, + "loss": 0.6918, + "step": 1481 + }, + { + "epoch": 0.13251072961373392, + "grad_norm": 0.11213336572340664, + "learning_rate": 0.00019454305418273234, + "loss": 0.7191, + "step": 1482 + }, + { + "epoch": 0.13260014306151646, + "grad_norm": 0.1197964023523493, + "learning_rate": 0.0001945336143279882, + "loss": 0.7093, + "step": 1483 + }, + { + "epoch": 0.132689556509299, + "grad_norm": 0.11770260838080067, + "learning_rate": 0.000194524166544824, + "loss": 0.6944, + "step": 1484 + }, + { + "epoch": 0.13277896995708155, + "grad_norm": 0.13562042094711324, + "learning_rate": 0.00019451471083403209, + "loss": 0.7395, + "step": 1485 + }, + { + "epoch": 0.1328683834048641, + "grad_norm": 0.12681989579712713, + "learning_rate": 0.0001945052471964055, + "loss": 0.6699, + "step": 1486 + }, + { + "epoch": 0.13295779685264664, + "grad_norm": 0.10577017717010377, + "learning_rate": 0.000194495775632738, + "loss": 0.7036, + "step": 1487 + }, + { + "epoch": 0.13304721030042918, + "grad_norm": 0.09581537288375827, + "learning_rate": 0.0001944862961438239, + "loss": 0.5951, + "step": 1488 + }, + { + "epoch": 0.13313662374821172, + "grad_norm": 0.12718679487647716, + "learning_rate": 0.0001944768087304583, + "loss": 0.7093, + "step": 1489 + }, + { + "epoch": 0.13322603719599427, + "grad_norm": 0.1203161872525395, + "learning_rate": 0.0001944673133934368, + "loss": 0.744, + "step": 1490 + }, + { + "epoch": 0.1333154506437768, + "grad_norm": 0.10935375350744009, + "learning_rate": 0.00019445781013355582, + "loss": 0.71, + "step": 1491 + }, + { + "epoch": 0.13340486409155938, + "grad_norm": 0.11599941839749811, + "learning_rate": 0.00019444829895161239, + "loss": 0.679, + "step": 1492 + }, + { + "epoch": 0.13349427753934193, + "grad_norm": 0.13110444733165072, + "learning_rate": 0.0001944387798484042, + "loss": 0.7129, + "step": 1493 + }, + { + "epoch": 0.13358369098712447, + "grad_norm": 0.1410926463379429, + "learning_rate": 0.00019442925282472958, + "loss": 0.6866, + "step": 1494 + }, + { + "epoch": 0.13367310443490701, + "grad_norm": 0.11688059028360125, + "learning_rate": 0.00019441971788138756, + "loss": 0.7099, + "step": 1495 + }, + { + "epoch": 0.13376251788268956, + "grad_norm": 0.12205015611591997, + "learning_rate": 0.00019441017501917784, + "loss": 0.683, + "step": 1496 + }, + { + "epoch": 0.1338519313304721, + "grad_norm": 0.11996882620804877, + "learning_rate": 0.0001944006242389008, + "loss": 0.7206, + "step": 1497 + }, + { + "epoch": 0.13394134477825465, + "grad_norm": 0.1215595522065824, + "learning_rate": 0.00019439106554135736, + "loss": 0.7323, + "step": 1498 + }, + { + "epoch": 0.1340307582260372, + "grad_norm": 0.12113855916982219, + "learning_rate": 0.00019438149892734926, + "loss": 0.735, + "step": 1499 + }, + { + "epoch": 0.13412017167381973, + "grad_norm": 0.11108248491565995, + "learning_rate": 0.00019437192439767883, + "loss": 0.6729, + "step": 1500 + }, + { + "epoch": 0.13420958512160228, + "grad_norm": 0.12247853355360296, + "learning_rate": 0.00019436234195314907, + "loss": 0.7236, + "step": 1501 + }, + { + "epoch": 0.13429899856938485, + "grad_norm": 0.12264381719505135, + "learning_rate": 0.00019435275159456364, + "loss": 0.6772, + "step": 1502 + }, + { + "epoch": 0.1343884120171674, + "grad_norm": 0.11955963776740314, + "learning_rate": 0.00019434315332272692, + "loss": 0.7044, + "step": 1503 + }, + { + "epoch": 0.13447782546494993, + "grad_norm": 0.12351298291591183, + "learning_rate": 0.00019433354713844386, + "loss": 0.7016, + "step": 1504 + }, + { + "epoch": 0.13456723891273248, + "grad_norm": 0.13388229181642639, + "learning_rate": 0.00019432393304252013, + "loss": 0.7502, + "step": 1505 + }, + { + "epoch": 0.13465665236051502, + "grad_norm": 0.12634610948298444, + "learning_rate": 0.00019431431103576202, + "loss": 0.7162, + "step": 1506 + }, + { + "epoch": 0.13474606580829757, + "grad_norm": 0.11530316568716945, + "learning_rate": 0.00019430468111897656, + "loss": 0.704, + "step": 1507 + }, + { + "epoch": 0.1348354792560801, + "grad_norm": 0.12748419311219406, + "learning_rate": 0.0001942950432929714, + "loss": 0.7333, + "step": 1508 + }, + { + "epoch": 0.13492489270386265, + "grad_norm": 0.09532493109185716, + "learning_rate": 0.00019428539755855483, + "loss": 0.6244, + "step": 1509 + }, + { + "epoch": 0.1350143061516452, + "grad_norm": 0.12550066389905506, + "learning_rate": 0.00019427574391653581, + "loss": 0.7281, + "step": 1510 + }, + { + "epoch": 0.13510371959942774, + "grad_norm": 0.10384856711715422, + "learning_rate": 0.00019426608236772404, + "loss": 0.703, + "step": 1511 + }, + { + "epoch": 0.1351931330472103, + "grad_norm": 0.109143244904296, + "learning_rate": 0.00019425641291292978, + "loss": 0.7164, + "step": 1512 + }, + { + "epoch": 0.13528254649499286, + "grad_norm": 0.11320334844434872, + "learning_rate": 0.000194246735552964, + "loss": 0.6975, + "step": 1513 + }, + { + "epoch": 0.1353719599427754, + "grad_norm": 0.11392014378105597, + "learning_rate": 0.00019423705028863832, + "loss": 0.7045, + "step": 1514 + }, + { + "epoch": 0.13546137339055794, + "grad_norm": 0.1246190087750916, + "learning_rate": 0.00019422735712076506, + "loss": 0.7031, + "step": 1515 + }, + { + "epoch": 0.1355507868383405, + "grad_norm": 0.12596997630918536, + "learning_rate": 0.00019421765605015713, + "loss": 0.7017, + "step": 1516 + }, + { + "epoch": 0.13564020028612303, + "grad_norm": 0.129178958358627, + "learning_rate": 0.0001942079470776282, + "loss": 0.6966, + "step": 1517 + }, + { + "epoch": 0.13572961373390557, + "grad_norm": 0.12274470489213388, + "learning_rate": 0.0001941982302039925, + "loss": 0.697, + "step": 1518 + }, + { + "epoch": 0.13581902718168812, + "grad_norm": 0.12352956112287755, + "learning_rate": 0.000194188505430065, + "loss": 0.6871, + "step": 1519 + }, + { + "epoch": 0.13590844062947066, + "grad_norm": 0.11465088172515615, + "learning_rate": 0.0001941787727566613, + "loss": 0.685, + "step": 1520 + }, + { + "epoch": 0.1359978540772532, + "grad_norm": 0.11279900409144487, + "learning_rate": 0.0001941690321845977, + "loss": 0.6667, + "step": 1521 + }, + { + "epoch": 0.13608726752503578, + "grad_norm": 0.11874833111803525, + "learning_rate": 0.00019415928371469105, + "loss": 0.7106, + "step": 1522 + }, + { + "epoch": 0.13617668097281832, + "grad_norm": 0.12034225299257646, + "learning_rate": 0.000194149527347759, + "loss": 0.6835, + "step": 1523 + }, + { + "epoch": 0.13626609442060086, + "grad_norm": 0.12932581726217088, + "learning_rate": 0.00019413976308461982, + "loss": 0.6688, + "step": 1524 + }, + { + "epoch": 0.1363555078683834, + "grad_norm": 0.12894616356040833, + "learning_rate": 0.0001941299909260924, + "loss": 0.6951, + "step": 1525 + }, + { + "epoch": 0.13644492131616595, + "grad_norm": 0.12239408152824803, + "learning_rate": 0.0001941202108729963, + "loss": 0.7286, + "step": 1526 + }, + { + "epoch": 0.1365343347639485, + "grad_norm": 0.12826264342952617, + "learning_rate": 0.0001941104229261518, + "loss": 0.7808, + "step": 1527 + }, + { + "epoch": 0.13662374821173104, + "grad_norm": 0.11828097124686919, + "learning_rate": 0.0001941006270863798, + "loss": 0.665, + "step": 1528 + }, + { + "epoch": 0.13671316165951358, + "grad_norm": 0.12272564125776099, + "learning_rate": 0.0001940908233545018, + "loss": 0.7004, + "step": 1529 + }, + { + "epoch": 0.13680257510729613, + "grad_norm": 0.09899735099889202, + "learning_rate": 0.00019408101173134013, + "loss": 0.6746, + "step": 1530 + }, + { + "epoch": 0.1368919885550787, + "grad_norm": 0.12039437446651235, + "learning_rate": 0.00019407119221771758, + "loss": 0.7469, + "step": 1531 + }, + { + "epoch": 0.13698140200286124, + "grad_norm": 0.11915456344916048, + "learning_rate": 0.00019406136481445782, + "loss": 0.6668, + "step": 1532 + }, + { + "epoch": 0.13707081545064378, + "grad_norm": 0.11881609310022603, + "learning_rate": 0.0001940515295223849, + "loss": 0.6727, + "step": 1533 + }, + { + "epoch": 0.13716022889842633, + "grad_norm": 0.12136427879183528, + "learning_rate": 0.00019404168634232382, + "loss": 0.7264, + "step": 1534 + }, + { + "epoch": 0.13724964234620887, + "grad_norm": 0.10291382119274713, + "learning_rate": 0.0001940318352751001, + "loss": 0.6974, + "step": 1535 + }, + { + "epoch": 0.13733905579399142, + "grad_norm": 0.12076364569831737, + "learning_rate": 0.00019402197632153992, + "loss": 0.686, + "step": 1536 + }, + { + "epoch": 0.13742846924177396, + "grad_norm": 0.11905715648216032, + "learning_rate": 0.0001940121094824701, + "loss": 0.7104, + "step": 1537 + }, + { + "epoch": 0.1375178826895565, + "grad_norm": 0.11976400478076772, + "learning_rate": 0.00019400223475871825, + "loss": 0.7237, + "step": 1538 + }, + { + "epoch": 0.13760729613733905, + "grad_norm": 0.09990958404295362, + "learning_rate": 0.00019399235215111245, + "loss": 0.6712, + "step": 1539 + }, + { + "epoch": 0.1376967095851216, + "grad_norm": 0.11854040327212229, + "learning_rate": 0.00019398246166048159, + "loss": 0.7054, + "step": 1540 + }, + { + "epoch": 0.13778612303290416, + "grad_norm": 0.11733152912123158, + "learning_rate": 0.00019397256328765517, + "loss": 0.6939, + "step": 1541 + }, + { + "epoch": 0.1378755364806867, + "grad_norm": 0.10609076185352237, + "learning_rate": 0.00019396265703346339, + "loss": 0.6569, + "step": 1542 + }, + { + "epoch": 0.13796494992846925, + "grad_norm": 0.11611718149838696, + "learning_rate": 0.00019395274289873705, + "loss": 0.6945, + "step": 1543 + }, + { + "epoch": 0.1380543633762518, + "grad_norm": 0.10386654880879335, + "learning_rate": 0.00019394282088430758, + "loss": 0.6602, + "step": 1544 + }, + { + "epoch": 0.13814377682403434, + "grad_norm": 0.126655383093727, + "learning_rate": 0.0001939328909910072, + "loss": 0.7005, + "step": 1545 + }, + { + "epoch": 0.13823319027181688, + "grad_norm": 0.11099171925724545, + "learning_rate": 0.0001939229532196687, + "loss": 0.6762, + "step": 1546 + }, + { + "epoch": 0.13832260371959942, + "grad_norm": 0.12513473644346568, + "learning_rate": 0.00019391300757112557, + "loss": 0.7266, + "step": 1547 + }, + { + "epoch": 0.13841201716738197, + "grad_norm": 0.13546910715829705, + "learning_rate": 0.00019390305404621186, + "loss": 0.721, + "step": 1548 + }, + { + "epoch": 0.1385014306151645, + "grad_norm": 0.12826731793242044, + "learning_rate": 0.00019389309264576242, + "loss": 0.7226, + "step": 1549 + }, + { + "epoch": 0.13859084406294706, + "grad_norm": 0.12387680052340387, + "learning_rate": 0.00019388312337061274, + "loss": 0.677, + "step": 1550 + }, + { + "epoch": 0.13868025751072963, + "grad_norm": 0.10694276554814255, + "learning_rate": 0.00019387314622159885, + "loss": 0.7013, + "step": 1551 + }, + { + "epoch": 0.13876967095851217, + "grad_norm": 0.111538676867054, + "learning_rate": 0.00019386316119955756, + "loss": 0.639, + "step": 1552 + }, + { + "epoch": 0.1388590844062947, + "grad_norm": 0.11953820341551928, + "learning_rate": 0.0001938531683053263, + "loss": 0.7007, + "step": 1553 + }, + { + "epoch": 0.13894849785407726, + "grad_norm": 0.12128393622561187, + "learning_rate": 0.00019384316753974314, + "loss": 0.7166, + "step": 1554 + }, + { + "epoch": 0.1390379113018598, + "grad_norm": 0.1261192506790548, + "learning_rate": 0.00019383315890364689, + "loss": 0.7487, + "step": 1555 + }, + { + "epoch": 0.13912732474964234, + "grad_norm": 0.12456249297671067, + "learning_rate": 0.00019382314239787691, + "loss": 0.6668, + "step": 1556 + }, + { + "epoch": 0.1392167381974249, + "grad_norm": 0.12378777857461316, + "learning_rate": 0.00019381311802327327, + "loss": 0.6762, + "step": 1557 + }, + { + "epoch": 0.13930615164520743, + "grad_norm": 0.12918413297726436, + "learning_rate": 0.00019380308578067674, + "loss": 0.7157, + "step": 1558 + }, + { + "epoch": 0.13939556509298998, + "grad_norm": 0.14402636023932203, + "learning_rate": 0.00019379304567092867, + "loss": 0.7133, + "step": 1559 + }, + { + "epoch": 0.13948497854077252, + "grad_norm": 0.12156912348632973, + "learning_rate": 0.00019378299769487117, + "loss": 0.6849, + "step": 1560 + }, + { + "epoch": 0.1395743919885551, + "grad_norm": 0.12536628841092087, + "learning_rate": 0.0001937729418533469, + "loss": 0.6993, + "step": 1561 + }, + { + "epoch": 0.13966380543633763, + "grad_norm": 0.10858934999030478, + "learning_rate": 0.0001937628781471992, + "loss": 0.6779, + "step": 1562 + }, + { + "epoch": 0.13975321888412018, + "grad_norm": 0.13054033872306847, + "learning_rate": 0.0001937528065772722, + "loss": 0.7484, + "step": 1563 + }, + { + "epoch": 0.13984263233190272, + "grad_norm": 0.11872227598702915, + "learning_rate": 0.0001937427271444105, + "loss": 0.6565, + "step": 1564 + }, + { + "epoch": 0.13993204577968527, + "grad_norm": 0.12055559961118063, + "learning_rate": 0.00019373263984945953, + "loss": 0.7336, + "step": 1565 + }, + { + "epoch": 0.1400214592274678, + "grad_norm": 0.11169611578918413, + "learning_rate": 0.00019372254469326522, + "loss": 0.6788, + "step": 1566 + }, + { + "epoch": 0.14011087267525035, + "grad_norm": 0.1230753276787746, + "learning_rate": 0.0001937124416766743, + "loss": 0.7175, + "step": 1567 + }, + { + "epoch": 0.1402002861230329, + "grad_norm": 0.1283348903485352, + "learning_rate": 0.00019370233080053407, + "loss": 0.7117, + "step": 1568 + }, + { + "epoch": 0.14028969957081544, + "grad_norm": 0.1306624674995136, + "learning_rate": 0.0001936922120656925, + "loss": 0.6948, + "step": 1569 + }, + { + "epoch": 0.14037911301859798, + "grad_norm": 0.11740639224017851, + "learning_rate": 0.00019368208547299826, + "loss": 0.6915, + "step": 1570 + }, + { + "epoch": 0.14046852646638056, + "grad_norm": 0.12975484860742037, + "learning_rate": 0.00019367195102330066, + "loss": 0.7178, + "step": 1571 + }, + { + "epoch": 0.1405579399141631, + "grad_norm": 0.11863471232758367, + "learning_rate": 0.00019366180871744964, + "loss": 0.6657, + "step": 1572 + }, + { + "epoch": 0.14064735336194564, + "grad_norm": 0.11209013344219736, + "learning_rate": 0.00019365165855629587, + "loss": 0.6802, + "step": 1573 + }, + { + "epoch": 0.1407367668097282, + "grad_norm": 0.12849110299050995, + "learning_rate": 0.00019364150054069059, + "loss": 0.6716, + "step": 1574 + }, + { + "epoch": 0.14082618025751073, + "grad_norm": 0.11665785297077555, + "learning_rate": 0.00019363133467148572, + "loss": 0.6881, + "step": 1575 + }, + { + "epoch": 0.14091559370529327, + "grad_norm": 0.10809234825945116, + "learning_rate": 0.00019362116094953391, + "loss": 0.6764, + "step": 1576 + }, + { + "epoch": 0.14100500715307582, + "grad_norm": 0.1048040180332654, + "learning_rate": 0.0001936109793756884, + "loss": 0.6853, + "step": 1577 + }, + { + "epoch": 0.14109442060085836, + "grad_norm": 0.130499306836965, + "learning_rate": 0.00019360078995080308, + "loss": 0.7189, + "step": 1578 + }, + { + "epoch": 0.1411838340486409, + "grad_norm": 0.11323149329278673, + "learning_rate": 0.0001935905926757326, + "loss": 0.7187, + "step": 1579 + }, + { + "epoch": 0.14127324749642345, + "grad_norm": 0.11100696102638474, + "learning_rate": 0.0001935803875513321, + "loss": 0.6643, + "step": 1580 + }, + { + "epoch": 0.14136266094420602, + "grad_norm": 0.1272686542302985, + "learning_rate": 0.0001935701745784575, + "loss": 0.7649, + "step": 1581 + }, + { + "epoch": 0.14145207439198856, + "grad_norm": 0.12026149583223118, + "learning_rate": 0.0001935599537579654, + "loss": 0.6847, + "step": 1582 + }, + { + "epoch": 0.1415414878397711, + "grad_norm": 0.12846773962452568, + "learning_rate": 0.00019354972509071295, + "loss": 0.7225, + "step": 1583 + }, + { + "epoch": 0.14163090128755365, + "grad_norm": 0.10952695428409223, + "learning_rate": 0.00019353948857755803, + "loss": 0.691, + "step": 1584 + }, + { + "epoch": 0.1417203147353362, + "grad_norm": 0.10780587876325119, + "learning_rate": 0.00019352924421935916, + "loss": 0.6851, + "step": 1585 + }, + { + "epoch": 0.14180972818311874, + "grad_norm": 0.09757119879922824, + "learning_rate": 0.00019351899201697556, + "loss": 0.682, + "step": 1586 + }, + { + "epoch": 0.14189914163090128, + "grad_norm": 0.1127023079561386, + "learning_rate": 0.00019350873197126705, + "loss": 0.6952, + "step": 1587 + }, + { + "epoch": 0.14198855507868383, + "grad_norm": 0.11331940989614074, + "learning_rate": 0.0001934984640830941, + "loss": 0.6948, + "step": 1588 + }, + { + "epoch": 0.14207796852646637, + "grad_norm": 0.11095219845017015, + "learning_rate": 0.00019348818835331788, + "loss": 0.6634, + "step": 1589 + }, + { + "epoch": 0.14216738197424894, + "grad_norm": 0.1086090690440793, + "learning_rate": 0.0001934779047828002, + "loss": 0.7129, + "step": 1590 + }, + { + "epoch": 0.14225679542203148, + "grad_norm": 0.12551588355213317, + "learning_rate": 0.00019346761337240355, + "loss": 0.721, + "step": 1591 + }, + { + "epoch": 0.14234620886981403, + "grad_norm": 0.12189765819872321, + "learning_rate": 0.00019345731412299106, + "loss": 0.6845, + "step": 1592 + }, + { + "epoch": 0.14243562231759657, + "grad_norm": 0.10960407421667144, + "learning_rate": 0.0001934470070354265, + "loss": 0.671, + "step": 1593 + }, + { + "epoch": 0.14252503576537912, + "grad_norm": 0.13630819888364953, + "learning_rate": 0.00019343669211057432, + "loss": 0.7649, + "step": 1594 + }, + { + "epoch": 0.14261444921316166, + "grad_norm": 0.11819423873734783, + "learning_rate": 0.00019342636934929959, + "loss": 0.6427, + "step": 1595 + }, + { + "epoch": 0.1427038626609442, + "grad_norm": 0.11674774052977496, + "learning_rate": 0.0001934160387524681, + "loss": 0.7224, + "step": 1596 + }, + { + "epoch": 0.14279327610872675, + "grad_norm": 0.126576065775566, + "learning_rate": 0.00019340570032094626, + "loss": 0.6501, + "step": 1597 + }, + { + "epoch": 0.1428826895565093, + "grad_norm": 0.1265393889269436, + "learning_rate": 0.00019339535405560115, + "loss": 0.7119, + "step": 1598 + }, + { + "epoch": 0.14297210300429183, + "grad_norm": 0.12916930391825326, + "learning_rate": 0.00019338499995730048, + "loss": 0.723, + "step": 1599 + }, + { + "epoch": 0.1430615164520744, + "grad_norm": 0.12839223372728184, + "learning_rate": 0.00019337463802691264, + "loss": 0.7289, + "step": 1600 + }, + { + "epoch": 0.14315092989985695, + "grad_norm": 0.1351395311821613, + "learning_rate": 0.00019336426826530668, + "loss": 0.7207, + "step": 1601 + }, + { + "epoch": 0.1432403433476395, + "grad_norm": 0.13861061166939762, + "learning_rate": 0.0001933538906733523, + "loss": 0.7081, + "step": 1602 + }, + { + "epoch": 0.14332975679542204, + "grad_norm": 0.11795300103010421, + "learning_rate": 0.00019334350525191987, + "loss": 0.6383, + "step": 1603 + }, + { + "epoch": 0.14341917024320458, + "grad_norm": 0.12777781648429823, + "learning_rate": 0.00019333311200188036, + "loss": 0.6921, + "step": 1604 + }, + { + "epoch": 0.14350858369098712, + "grad_norm": 0.12846529674476784, + "learning_rate": 0.00019332271092410545, + "loss": 0.7397, + "step": 1605 + }, + { + "epoch": 0.14359799713876967, + "grad_norm": 0.12442371682285477, + "learning_rate": 0.0001933123020194675, + "loss": 0.7009, + "step": 1606 + }, + { + "epoch": 0.1436874105865522, + "grad_norm": 0.12212803020913632, + "learning_rate": 0.00019330188528883947, + "loss": 0.7181, + "step": 1607 + }, + { + "epoch": 0.14377682403433475, + "grad_norm": 0.12845763709741412, + "learning_rate": 0.00019329146073309504, + "loss": 0.7121, + "step": 1608 + }, + { + "epoch": 0.1438662374821173, + "grad_norm": 0.09179697110242584, + "learning_rate": 0.00019328102835310842, + "loss": 0.6679, + "step": 1609 + }, + { + "epoch": 0.14395565092989987, + "grad_norm": 0.11391978762600251, + "learning_rate": 0.00019327058814975462, + "loss": 0.7029, + "step": 1610 + }, + { + "epoch": 0.1440450643776824, + "grad_norm": 0.12235861091803552, + "learning_rate": 0.00019326014012390922, + "loss": 0.7103, + "step": 1611 + }, + { + "epoch": 0.14413447782546496, + "grad_norm": 0.11412878962862126, + "learning_rate": 0.00019324968427644848, + "loss": 0.7034, + "step": 1612 + }, + { + "epoch": 0.1442238912732475, + "grad_norm": 0.12196174552449712, + "learning_rate": 0.00019323922060824939, + "loss": 0.6917, + "step": 1613 + }, + { + "epoch": 0.14431330472103004, + "grad_norm": 0.11504340146440407, + "learning_rate": 0.00019322874912018945, + "loss": 0.6923, + "step": 1614 + }, + { + "epoch": 0.1444027181688126, + "grad_norm": 0.11729466740214188, + "learning_rate": 0.00019321826981314691, + "loss": 0.6937, + "step": 1615 + }, + { + "epoch": 0.14449213161659513, + "grad_norm": 0.1207856270652376, + "learning_rate": 0.00019320778268800066, + "loss": 0.6884, + "step": 1616 + }, + { + "epoch": 0.14458154506437768, + "grad_norm": 0.12019446351392715, + "learning_rate": 0.00019319728774563023, + "loss": 0.7003, + "step": 1617 + }, + { + "epoch": 0.14467095851216022, + "grad_norm": 0.10941217294832886, + "learning_rate": 0.00019318678498691586, + "loss": 0.6937, + "step": 1618 + }, + { + "epoch": 0.14476037195994276, + "grad_norm": 0.13495949986307673, + "learning_rate": 0.00019317627441273836, + "loss": 0.7373, + "step": 1619 + }, + { + "epoch": 0.14484978540772533, + "grad_norm": 0.140078432307456, + "learning_rate": 0.00019316575602397923, + "loss": 0.6901, + "step": 1620 + }, + { + "epoch": 0.14493919885550788, + "grad_norm": 0.12542893805071645, + "learning_rate": 0.0001931552298215207, + "loss": 0.718, + "step": 1621 + }, + { + "epoch": 0.14502861230329042, + "grad_norm": 0.10447347799954161, + "learning_rate": 0.0001931446958062455, + "loss": 0.6682, + "step": 1622 + }, + { + "epoch": 0.14511802575107297, + "grad_norm": 0.13378207539872483, + "learning_rate": 0.0001931341539790372, + "loss": 0.7367, + "step": 1623 + }, + { + "epoch": 0.1452074391988555, + "grad_norm": 0.11699104924726858, + "learning_rate": 0.00019312360434077985, + "loss": 0.7213, + "step": 1624 + }, + { + "epoch": 0.14529685264663805, + "grad_norm": 0.1012875768306472, + "learning_rate": 0.0001931130468923583, + "loss": 0.6457, + "step": 1625 + }, + { + "epoch": 0.1453862660944206, + "grad_norm": 0.13360448231519004, + "learning_rate": 0.00019310248163465795, + "loss": 0.7171, + "step": 1626 + }, + { + "epoch": 0.14547567954220314, + "grad_norm": 0.12432003963926412, + "learning_rate": 0.00019309190856856486, + "loss": 0.7174, + "step": 1627 + }, + { + "epoch": 0.14556509298998568, + "grad_norm": 0.1231976393330957, + "learning_rate": 0.0001930813276949659, + "loss": 0.6901, + "step": 1628 + }, + { + "epoch": 0.14565450643776823, + "grad_norm": 0.12249845162603117, + "learning_rate": 0.00019307073901474834, + "loss": 0.7246, + "step": 1629 + }, + { + "epoch": 0.1457439198855508, + "grad_norm": 0.1333023523865243, + "learning_rate": 0.00019306014252880034, + "loss": 0.7288, + "step": 1630 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 0.10197445934518298, + "learning_rate": 0.00019304953823801055, + "loss": 0.6995, + "step": 1631 + }, + { + "epoch": 0.1459227467811159, + "grad_norm": 0.11345321081703617, + "learning_rate": 0.00019303892614326836, + "loss": 0.6942, + "step": 1632 + }, + { + "epoch": 0.14601216022889843, + "grad_norm": 0.12470967887027148, + "learning_rate": 0.0001930283062454638, + "loss": 0.6779, + "step": 1633 + }, + { + "epoch": 0.14610157367668097, + "grad_norm": 0.11463663960105855, + "learning_rate": 0.00019301767854548756, + "loss": 0.682, + "step": 1634 + }, + { + "epoch": 0.14619098712446352, + "grad_norm": 0.11884053191794601, + "learning_rate": 0.00019300704304423094, + "loss": 0.685, + "step": 1635 + }, + { + "epoch": 0.14628040057224606, + "grad_norm": 0.11751888171328463, + "learning_rate": 0.00019299639974258598, + "loss": 0.6749, + "step": 1636 + }, + { + "epoch": 0.1463698140200286, + "grad_norm": 0.1214713773283145, + "learning_rate": 0.00019298574864144523, + "loss": 0.7135, + "step": 1637 + }, + { + "epoch": 0.14645922746781115, + "grad_norm": 0.1097293583814551, + "learning_rate": 0.00019297508974170207, + "loss": 0.6645, + "step": 1638 + }, + { + "epoch": 0.1465486409155937, + "grad_norm": 0.1044342584446312, + "learning_rate": 0.0001929644230442504, + "loss": 0.6386, + "step": 1639 + }, + { + "epoch": 0.14663805436337626, + "grad_norm": 0.12326562779139857, + "learning_rate": 0.00019295374854998488, + "loss": 0.7173, + "step": 1640 + }, + { + "epoch": 0.1467274678111588, + "grad_norm": 0.12577276764279058, + "learning_rate": 0.0001929430662598007, + "loss": 0.6866, + "step": 1641 + }, + { + "epoch": 0.14681688125894135, + "grad_norm": 0.11722065593107213, + "learning_rate": 0.00019293237617459382, + "loss": 0.6591, + "step": 1642 + }, + { + "epoch": 0.1469062947067239, + "grad_norm": 0.11837232163452166, + "learning_rate": 0.00019292167829526076, + "loss": 0.7121, + "step": 1643 + }, + { + "epoch": 0.14699570815450644, + "grad_norm": 0.12902086386497363, + "learning_rate": 0.00019291097262269874, + "loss": 0.7152, + "step": 1644 + }, + { + "epoch": 0.14708512160228898, + "grad_norm": 0.11229917283476151, + "learning_rate": 0.0001929002591578057, + "loss": 0.6865, + "step": 1645 + }, + { + "epoch": 0.14717453505007153, + "grad_norm": 0.12377752273697573, + "learning_rate": 0.00019288953790148013, + "loss": 0.7404, + "step": 1646 + }, + { + "epoch": 0.14726394849785407, + "grad_norm": 0.11389739556962722, + "learning_rate": 0.00019287880885462115, + "loss": 0.6855, + "step": 1647 + }, + { + "epoch": 0.1473533619456366, + "grad_norm": 0.11578392732617168, + "learning_rate": 0.00019286807201812867, + "loss": 0.6796, + "step": 1648 + }, + { + "epoch": 0.14744277539341916, + "grad_norm": 0.12240547772029536, + "learning_rate": 0.00019285732739290315, + "loss": 0.6877, + "step": 1649 + }, + { + "epoch": 0.14753218884120173, + "grad_norm": 0.10886484363412309, + "learning_rate": 0.0001928465749798457, + "loss": 0.7017, + "step": 1650 + }, + { + "epoch": 0.14762160228898427, + "grad_norm": 0.10752226879177729, + "learning_rate": 0.00019283581477985817, + "loss": 0.6909, + "step": 1651 + }, + { + "epoch": 0.14771101573676682, + "grad_norm": 0.11495802552303679, + "learning_rate": 0.00019282504679384293, + "loss": 0.7118, + "step": 1652 + }, + { + "epoch": 0.14780042918454936, + "grad_norm": 0.12434020555366797, + "learning_rate": 0.00019281427102270314, + "loss": 0.697, + "step": 1653 + }, + { + "epoch": 0.1478898426323319, + "grad_norm": 0.103498410941521, + "learning_rate": 0.00019280348746734255, + "loss": 0.6845, + "step": 1654 + }, + { + "epoch": 0.14797925608011445, + "grad_norm": 0.12026714414807169, + "learning_rate": 0.00019279269612866554, + "loss": 0.7084, + "step": 1655 + }, + { + "epoch": 0.148068669527897, + "grad_norm": 0.12172603193775365, + "learning_rate": 0.00019278189700757715, + "loss": 0.6641, + "step": 1656 + }, + { + "epoch": 0.14815808297567953, + "grad_norm": 0.11599521772896626, + "learning_rate": 0.0001927710901049831, + "loss": 0.7181, + "step": 1657 + }, + { + "epoch": 0.14824749642346208, + "grad_norm": 0.11709709284869493, + "learning_rate": 0.00019276027542178978, + "loss": 0.7549, + "step": 1658 + }, + { + "epoch": 0.14833690987124465, + "grad_norm": 0.13097095055621036, + "learning_rate": 0.0001927494529589042, + "loss": 0.69, + "step": 1659 + }, + { + "epoch": 0.1484263233190272, + "grad_norm": 0.1044242210905482, + "learning_rate": 0.000192738622717234, + "loss": 0.6684, + "step": 1660 + }, + { + "epoch": 0.14851573676680974, + "grad_norm": 0.1258380433188409, + "learning_rate": 0.0001927277846976875, + "loss": 0.7055, + "step": 1661 + }, + { + "epoch": 0.14860515021459228, + "grad_norm": 0.13700022420385707, + "learning_rate": 0.00019271693890117372, + "loss": 0.7006, + "step": 1662 + }, + { + "epoch": 0.14869456366237482, + "grad_norm": 0.11273735911082167, + "learning_rate": 0.00019270608532860224, + "loss": 0.7158, + "step": 1663 + }, + { + "epoch": 0.14878397711015737, + "grad_norm": 0.12232592510605972, + "learning_rate": 0.00019269522398088332, + "loss": 0.6956, + "step": 1664 + }, + { + "epoch": 0.1488733905579399, + "grad_norm": 0.11988891760912487, + "learning_rate": 0.0001926843548589279, + "loss": 0.7102, + "step": 1665 + }, + { + "epoch": 0.14896280400572245, + "grad_norm": 0.12940519526411878, + "learning_rate": 0.0001926734779636476, + "loss": 0.6962, + "step": 1666 + }, + { + "epoch": 0.149052217453505, + "grad_norm": 0.10469684210928501, + "learning_rate": 0.00019266259329595462, + "loss": 0.6752, + "step": 1667 + }, + { + "epoch": 0.14914163090128754, + "grad_norm": 0.10614685879470287, + "learning_rate": 0.00019265170085676185, + "loss": 0.6818, + "step": 1668 + }, + { + "epoch": 0.1492310443490701, + "grad_norm": 0.11399824788317532, + "learning_rate": 0.00019264080064698282, + "loss": 0.6967, + "step": 1669 + }, + { + "epoch": 0.14932045779685266, + "grad_norm": 0.11929639785021443, + "learning_rate": 0.00019262989266753173, + "loss": 0.7082, + "step": 1670 + }, + { + "epoch": 0.1494098712446352, + "grad_norm": 0.11077191694985, + "learning_rate": 0.0001926189769193234, + "loss": 0.6826, + "step": 1671 + }, + { + "epoch": 0.14949928469241774, + "grad_norm": 0.1000418413094613, + "learning_rate": 0.00019260805340327335, + "loss": 0.5946, + "step": 1672 + }, + { + "epoch": 0.1495886981402003, + "grad_norm": 0.12325264230685502, + "learning_rate": 0.00019259712212029765, + "loss": 0.72, + "step": 1673 + }, + { + "epoch": 0.14967811158798283, + "grad_norm": 0.11621208991135061, + "learning_rate": 0.0001925861830713132, + "loss": 0.6318, + "step": 1674 + }, + { + "epoch": 0.14976752503576538, + "grad_norm": 0.11781080282160951, + "learning_rate": 0.00019257523625723736, + "loss": 0.7417, + "step": 1675 + }, + { + "epoch": 0.14985693848354792, + "grad_norm": 0.1303920368993325, + "learning_rate": 0.0001925642816789883, + "loss": 0.7249, + "step": 1676 + }, + { + "epoch": 0.14994635193133046, + "grad_norm": 0.13344327098980438, + "learning_rate": 0.00019255331933748472, + "loss": 0.6936, + "step": 1677 + }, + { + "epoch": 0.150035765379113, + "grad_norm": 0.12012460974528424, + "learning_rate": 0.000192542349233646, + "loss": 0.7277, + "step": 1678 + }, + { + "epoch": 0.15012517882689558, + "grad_norm": 0.11239342477998611, + "learning_rate": 0.0001925313713683922, + "loss": 0.7093, + "step": 1679 + }, + { + "epoch": 0.15021459227467812, + "grad_norm": 0.11322058482551069, + "learning_rate": 0.00019252038574264405, + "loss": 0.6787, + "step": 1680 + }, + { + "epoch": 0.15030400572246067, + "grad_norm": 0.1135777040328576, + "learning_rate": 0.00019250939235732287, + "loss": 0.6632, + "step": 1681 + }, + { + "epoch": 0.1503934191702432, + "grad_norm": 0.11946746265398699, + "learning_rate": 0.00019249839121335068, + "loss": 0.662, + "step": 1682 + }, + { + "epoch": 0.15048283261802575, + "grad_norm": 0.11536610929581548, + "learning_rate": 0.00019248738231165017, + "loss": 0.6579, + "step": 1683 + }, + { + "epoch": 0.1505722460658083, + "grad_norm": 0.1294547586405375, + "learning_rate": 0.00019247636565314453, + "loss": 0.7391, + "step": 1684 + }, + { + "epoch": 0.15066165951359084, + "grad_norm": 0.11892374694685262, + "learning_rate": 0.00019246534123875783, + "loss": 0.678, + "step": 1685 + }, + { + "epoch": 0.15075107296137338, + "grad_norm": 0.11082256192444918, + "learning_rate": 0.00019245430906941464, + "loss": 0.682, + "step": 1686 + }, + { + "epoch": 0.15084048640915593, + "grad_norm": 0.10749721624078366, + "learning_rate": 0.00019244326914604019, + "loss": 0.663, + "step": 1687 + }, + { + "epoch": 0.15092989985693847, + "grad_norm": 0.10861832892604592, + "learning_rate": 0.00019243222146956039, + "loss": 0.6636, + "step": 1688 + }, + { + "epoch": 0.15101931330472104, + "grad_norm": 0.11512576658258632, + "learning_rate": 0.0001924211660409018, + "loss": 0.682, + "step": 1689 + }, + { + "epoch": 0.15110872675250359, + "grad_norm": 0.11489853143681697, + "learning_rate": 0.00019241010286099165, + "loss": 0.6819, + "step": 1690 + }, + { + "epoch": 0.15119814020028613, + "grad_norm": 0.11936090488148421, + "learning_rate": 0.00019239903193075776, + "loss": 0.6706, + "step": 1691 + }, + { + "epoch": 0.15128755364806867, + "grad_norm": 0.139215096937425, + "learning_rate": 0.0001923879532511287, + "loss": 0.745, + "step": 1692 + }, + { + "epoch": 0.15137696709585122, + "grad_norm": 0.11673546050027597, + "learning_rate": 0.0001923768668230335, + "loss": 0.5563, + "step": 1693 + }, + { + "epoch": 0.15146638054363376, + "grad_norm": 0.1237388300982024, + "learning_rate": 0.0001923657726474021, + "loss": 0.7083, + "step": 1694 + }, + { + "epoch": 0.1515557939914163, + "grad_norm": 0.12611894483884586, + "learning_rate": 0.00019235467072516488, + "loss": 0.7276, + "step": 1695 + }, + { + "epoch": 0.15164520743919885, + "grad_norm": 0.11170866686295845, + "learning_rate": 0.00019234356105725297, + "loss": 0.6665, + "step": 1696 + }, + { + "epoch": 0.1517346208869814, + "grad_norm": 0.11748063275111453, + "learning_rate": 0.00019233244364459814, + "loss": 0.6909, + "step": 1697 + }, + { + "epoch": 0.15182403433476394, + "grad_norm": 0.11916720406574662, + "learning_rate": 0.00019232131848813272, + "loss": 0.6687, + "step": 1698 + }, + { + "epoch": 0.1519134477825465, + "grad_norm": 0.12293810389451762, + "learning_rate": 0.00019231018558878984, + "loss": 0.7229, + "step": 1699 + }, + { + "epoch": 0.15200286123032905, + "grad_norm": 0.1190387957171869, + "learning_rate": 0.00019229904494750315, + "loss": 0.7088, + "step": 1700 + }, + { + "epoch": 0.1520922746781116, + "grad_norm": 0.11761151570552358, + "learning_rate": 0.00019228789656520708, + "loss": 0.7333, + "step": 1701 + }, + { + "epoch": 0.15218168812589414, + "grad_norm": 0.11645049665281915, + "learning_rate": 0.00019227674044283653, + "loss": 0.6662, + "step": 1702 + }, + { + "epoch": 0.15227110157367668, + "grad_norm": 0.11724806048982421, + "learning_rate": 0.00019226557658132723, + "loss": 0.6759, + "step": 1703 + }, + { + "epoch": 0.15236051502145923, + "grad_norm": 0.10937339912968146, + "learning_rate": 0.00019225440498161546, + "loss": 0.6736, + "step": 1704 + }, + { + "epoch": 0.15244992846924177, + "grad_norm": 0.1262085569114229, + "learning_rate": 0.00019224322564463813, + "loss": 0.7667, + "step": 1705 + }, + { + "epoch": 0.1525393419170243, + "grad_norm": 0.10874777052292407, + "learning_rate": 0.00019223203857133287, + "loss": 0.6824, + "step": 1706 + }, + { + "epoch": 0.15262875536480686, + "grad_norm": 0.11580643205195877, + "learning_rate": 0.00019222084376263794, + "loss": 0.6712, + "step": 1707 + }, + { + "epoch": 0.1527181688125894, + "grad_norm": 0.12559131073195087, + "learning_rate": 0.0001922096412194922, + "loss": 0.7365, + "step": 1708 + }, + { + "epoch": 0.15280758226037197, + "grad_norm": 0.10696148642720657, + "learning_rate": 0.00019219843094283524, + "loss": 0.649, + "step": 1709 + }, + { + "epoch": 0.15289699570815452, + "grad_norm": 0.1327527271556779, + "learning_rate": 0.00019218721293360718, + "loss": 0.7249, + "step": 1710 + }, + { + "epoch": 0.15298640915593706, + "grad_norm": 0.12085642810124567, + "learning_rate": 0.00019217598719274896, + "loss": 0.6821, + "step": 1711 + }, + { + "epoch": 0.1530758226037196, + "grad_norm": 0.11201157359460624, + "learning_rate": 0.00019216475372120197, + "loss": 0.6658, + "step": 1712 + }, + { + "epoch": 0.15316523605150215, + "grad_norm": 0.112553966164943, + "learning_rate": 0.0001921535125199084, + "loss": 0.581, + "step": 1713 + }, + { + "epoch": 0.1532546494992847, + "grad_norm": 0.13952095208449772, + "learning_rate": 0.00019214226358981105, + "loss": 0.7308, + "step": 1714 + }, + { + "epoch": 0.15334406294706723, + "grad_norm": 0.11172439220632353, + "learning_rate": 0.00019213100693185332, + "loss": 0.7055, + "step": 1715 + }, + { + "epoch": 0.15343347639484978, + "grad_norm": 0.11167859279337986, + "learning_rate": 0.00019211974254697932, + "loss": 0.6782, + "step": 1716 + }, + { + "epoch": 0.15352288984263232, + "grad_norm": 0.12870052087467918, + "learning_rate": 0.00019210847043613373, + "loss": 0.7138, + "step": 1717 + }, + { + "epoch": 0.1536123032904149, + "grad_norm": 0.1185709949249616, + "learning_rate": 0.000192097190600262, + "loss": 0.6928, + "step": 1718 + }, + { + "epoch": 0.15370171673819744, + "grad_norm": 0.11199027666443091, + "learning_rate": 0.0001920859030403101, + "loss": 0.6867, + "step": 1719 + }, + { + "epoch": 0.15379113018597998, + "grad_norm": 0.11836660429282532, + "learning_rate": 0.00019207460775722473, + "loss": 0.6855, + "step": 1720 + }, + { + "epoch": 0.15388054363376252, + "grad_norm": 0.12272420512959038, + "learning_rate": 0.00019206330475195319, + "loss": 0.6816, + "step": 1721 + }, + { + "epoch": 0.15396995708154507, + "grad_norm": 0.11752298448471363, + "learning_rate": 0.0001920519940254435, + "loss": 0.702, + "step": 1722 + }, + { + "epoch": 0.1540593705293276, + "grad_norm": 0.10416462966689188, + "learning_rate": 0.0001920406755786442, + "loss": 0.59, + "step": 1723 + }, + { + "epoch": 0.15414878397711015, + "grad_norm": 0.11079095766225425, + "learning_rate": 0.0001920293494125046, + "loss": 0.6631, + "step": 1724 + }, + { + "epoch": 0.1542381974248927, + "grad_norm": 0.12109227677090094, + "learning_rate": 0.00019201801552797462, + "loss": 0.7192, + "step": 1725 + }, + { + "epoch": 0.15432761087267524, + "grad_norm": 0.11599752106526165, + "learning_rate": 0.0001920066739260048, + "loss": 0.6524, + "step": 1726 + }, + { + "epoch": 0.15441702432045779, + "grad_norm": 0.11600106973236858, + "learning_rate": 0.0001919953246075464, + "loss": 0.7024, + "step": 1727 + }, + { + "epoch": 0.15450643776824036, + "grad_norm": 0.11127621619863687, + "learning_rate": 0.00019198396757355118, + "loss": 0.6982, + "step": 1728 + }, + { + "epoch": 0.1545958512160229, + "grad_norm": 0.11541321773644365, + "learning_rate": 0.00019197260282497171, + "loss": 0.6332, + "step": 1729 + }, + { + "epoch": 0.15468526466380544, + "grad_norm": 0.10385451969516919, + "learning_rate": 0.0001919612303627611, + "loss": 0.565, + "step": 1730 + }, + { + "epoch": 0.154774678111588, + "grad_norm": 0.12568076414423396, + "learning_rate": 0.00019194985018787316, + "loss": 0.6918, + "step": 1731 + }, + { + "epoch": 0.15486409155937053, + "grad_norm": 0.11979946760590564, + "learning_rate": 0.00019193846230126233, + "loss": 0.7079, + "step": 1732 + }, + { + "epoch": 0.15495350500715308, + "grad_norm": 0.1332603837450273, + "learning_rate": 0.00019192706670388373, + "loss": 0.7383, + "step": 1733 + }, + { + "epoch": 0.15504291845493562, + "grad_norm": 0.10477662973935474, + "learning_rate": 0.00019191566339669302, + "loss": 0.6777, + "step": 1734 + }, + { + "epoch": 0.15513233190271816, + "grad_norm": 0.1111576182052403, + "learning_rate": 0.00019190425238064667, + "loss": 0.7235, + "step": 1735 + }, + { + "epoch": 0.1552217453505007, + "grad_norm": 0.11303599982848261, + "learning_rate": 0.00019189283365670163, + "loss": 0.6636, + "step": 1736 + }, + { + "epoch": 0.15531115879828325, + "grad_norm": 0.11085130193562413, + "learning_rate": 0.00019188140722581562, + "loss": 0.7066, + "step": 1737 + }, + { + "epoch": 0.15540057224606582, + "grad_norm": 0.12809981302553508, + "learning_rate": 0.00019186997308894696, + "loss": 0.7, + "step": 1738 + }, + { + "epoch": 0.15548998569384836, + "grad_norm": 0.12370066690732448, + "learning_rate": 0.0001918585312470546, + "loss": 0.733, + "step": 1739 + }, + { + "epoch": 0.1555793991416309, + "grad_norm": 0.11555430269435388, + "learning_rate": 0.00019184708170109818, + "loss": 0.7167, + "step": 1740 + }, + { + "epoch": 0.15566881258941345, + "grad_norm": 0.12964805359172374, + "learning_rate": 0.00019183562445203794, + "loss": 0.7121, + "step": 1741 + }, + { + "epoch": 0.155758226037196, + "grad_norm": 0.10655911557651934, + "learning_rate": 0.00019182415950083477, + "loss": 0.6845, + "step": 1742 + }, + { + "epoch": 0.15584763948497854, + "grad_norm": 0.12157674536290992, + "learning_rate": 0.0001918126868484502, + "loss": 0.6979, + "step": 1743 + }, + { + "epoch": 0.15593705293276108, + "grad_norm": 0.12653074510801493, + "learning_rate": 0.00019180120649584653, + "loss": 0.7422, + "step": 1744 + }, + { + "epoch": 0.15602646638054363, + "grad_norm": 0.12082297138590385, + "learning_rate": 0.00019178971844398653, + "loss": 0.6864, + "step": 1745 + }, + { + "epoch": 0.15611587982832617, + "grad_norm": 0.11049917466409395, + "learning_rate": 0.00019177822269383368, + "loss": 0.689, + "step": 1746 + }, + { + "epoch": 0.15620529327610871, + "grad_norm": 0.12381519528569343, + "learning_rate": 0.00019176671924635215, + "loss": 0.7198, + "step": 1747 + }, + { + "epoch": 0.15629470672389129, + "grad_norm": 0.12375730288232585, + "learning_rate": 0.00019175520810250666, + "loss": 0.6916, + "step": 1748 + }, + { + "epoch": 0.15638412017167383, + "grad_norm": 0.12083429512587274, + "learning_rate": 0.00019174368926326273, + "loss": 0.6767, + "step": 1749 + }, + { + "epoch": 0.15647353361945637, + "grad_norm": 0.1070756472739994, + "learning_rate": 0.00019173216272958633, + "loss": 0.6801, + "step": 1750 + }, + { + "epoch": 0.15656294706723892, + "grad_norm": 0.11792892664354912, + "learning_rate": 0.00019172062850244425, + "loss": 0.7214, + "step": 1751 + }, + { + "epoch": 0.15665236051502146, + "grad_norm": 0.12537612835944786, + "learning_rate": 0.00019170908658280386, + "loss": 0.6849, + "step": 1752 + }, + { + "epoch": 0.156741773962804, + "grad_norm": 0.13075800575543442, + "learning_rate": 0.0001916975369716331, + "loss": 0.7772, + "step": 1753 + }, + { + "epoch": 0.15683118741058655, + "grad_norm": 0.11842882495181552, + "learning_rate": 0.00019168597966990065, + "loss": 0.6979, + "step": 1754 + }, + { + "epoch": 0.1569206008583691, + "grad_norm": 0.11053739146575595, + "learning_rate": 0.00019167441467857584, + "loss": 0.6787, + "step": 1755 + }, + { + "epoch": 0.15701001430615164, + "grad_norm": 0.11752083579943107, + "learning_rate": 0.00019166284199862856, + "loss": 0.6746, + "step": 1756 + }, + { + "epoch": 0.15709942775393418, + "grad_norm": 0.10146136899005755, + "learning_rate": 0.00019165126163102943, + "loss": 0.6852, + "step": 1757 + }, + { + "epoch": 0.15718884120171675, + "grad_norm": 0.1104632215475608, + "learning_rate": 0.0001916396735767497, + "loss": 0.6924, + "step": 1758 + }, + { + "epoch": 0.1572782546494993, + "grad_norm": 0.12657471679521223, + "learning_rate": 0.00019162807783676118, + "loss": 0.6963, + "step": 1759 + }, + { + "epoch": 0.15736766809728184, + "grad_norm": 0.10518860624953465, + "learning_rate": 0.00019161647441203646, + "loss": 0.6832, + "step": 1760 + }, + { + "epoch": 0.15745708154506438, + "grad_norm": 0.10995219767497283, + "learning_rate": 0.0001916048633035487, + "loss": 0.7, + "step": 1761 + }, + { + "epoch": 0.15754649499284692, + "grad_norm": 0.10256285773381195, + "learning_rate": 0.00019159324451227164, + "loss": 0.67, + "step": 1762 + }, + { + "epoch": 0.15763590844062947, + "grad_norm": 0.12647546533745316, + "learning_rate": 0.00019158161803917975, + "loss": 0.6978, + "step": 1763 + }, + { + "epoch": 0.157725321888412, + "grad_norm": 0.10619594574799715, + "learning_rate": 0.0001915699838852482, + "loss": 0.7277, + "step": 1764 + }, + { + "epoch": 0.15781473533619456, + "grad_norm": 0.11268841281314712, + "learning_rate": 0.0001915583420514527, + "loss": 0.6897, + "step": 1765 + }, + { + "epoch": 0.1579041487839771, + "grad_norm": 0.1161971682648604, + "learning_rate": 0.00019154669253876962, + "loss": 0.6842, + "step": 1766 + }, + { + "epoch": 0.15799356223175964, + "grad_norm": 0.11956401832735787, + "learning_rate": 0.000191535035348176, + "loss": 0.6991, + "step": 1767 + }, + { + "epoch": 0.15808297567954221, + "grad_norm": 0.1081790456960009, + "learning_rate": 0.00019152337048064947, + "loss": 0.6848, + "step": 1768 + }, + { + "epoch": 0.15817238912732476, + "grad_norm": 0.10779189565567947, + "learning_rate": 0.00019151169793716843, + "loss": 0.6626, + "step": 1769 + }, + { + "epoch": 0.1582618025751073, + "grad_norm": 0.12819939854347412, + "learning_rate": 0.0001915000177187118, + "loss": 0.7165, + "step": 1770 + }, + { + "epoch": 0.15835121602288985, + "grad_norm": 0.09866423804820092, + "learning_rate": 0.00019148832982625918, + "loss": 0.6422, + "step": 1771 + }, + { + "epoch": 0.1584406294706724, + "grad_norm": 0.12046423059151647, + "learning_rate": 0.00019147663426079083, + "loss": 0.7116, + "step": 1772 + }, + { + "epoch": 0.15853004291845493, + "grad_norm": 0.12658320147557708, + "learning_rate": 0.00019146493102328765, + "loss": 0.6448, + "step": 1773 + }, + { + "epoch": 0.15861945636623748, + "grad_norm": 0.12248268403140374, + "learning_rate": 0.00019145322011473117, + "loss": 0.6947, + "step": 1774 + }, + { + "epoch": 0.15870886981402002, + "grad_norm": 0.11001060092480061, + "learning_rate": 0.00019144150153610354, + "loss": 0.697, + "step": 1775 + }, + { + "epoch": 0.15879828326180256, + "grad_norm": 0.11448178061928242, + "learning_rate": 0.00019142977528838762, + "loss": 0.6902, + "step": 1776 + }, + { + "epoch": 0.1588876967095851, + "grad_norm": 0.11730330750473346, + "learning_rate": 0.00019141804137256686, + "loss": 0.6823, + "step": 1777 + }, + { + "epoch": 0.15897711015736768, + "grad_norm": 0.11012691471156934, + "learning_rate": 0.0001914062997896254, + "loss": 0.6748, + "step": 1778 + }, + { + "epoch": 0.15906652360515022, + "grad_norm": 0.10317254318028421, + "learning_rate": 0.00019139455054054794, + "loss": 0.6482, + "step": 1779 + }, + { + "epoch": 0.15915593705293277, + "grad_norm": 0.11197940529566985, + "learning_rate": 0.0001913827936263199, + "loss": 0.7096, + "step": 1780 + }, + { + "epoch": 0.1592453505007153, + "grad_norm": 0.11791780703292618, + "learning_rate": 0.00019137102904792736, + "loss": 0.7041, + "step": 1781 + }, + { + "epoch": 0.15933476394849785, + "grad_norm": 0.1399234777762615, + "learning_rate": 0.00019135925680635694, + "loss": 0.6948, + "step": 1782 + }, + { + "epoch": 0.1594241773962804, + "grad_norm": 0.10815806376872454, + "learning_rate": 0.00019134747690259597, + "loss": 0.5859, + "step": 1783 + }, + { + "epoch": 0.15951359084406294, + "grad_norm": 0.12700748447016358, + "learning_rate": 0.00019133568933763244, + "loss": 0.7364, + "step": 1784 + }, + { + "epoch": 0.15960300429184548, + "grad_norm": 0.12632117366084866, + "learning_rate": 0.00019132389411245497, + "loss": 0.7605, + "step": 1785 + }, + { + "epoch": 0.15969241773962803, + "grad_norm": 0.10484020354574303, + "learning_rate": 0.00019131209122805277, + "loss": 0.6618, + "step": 1786 + }, + { + "epoch": 0.1597818311874106, + "grad_norm": 0.12279508395353769, + "learning_rate": 0.00019130028068541576, + "loss": 0.6723, + "step": 1787 + }, + { + "epoch": 0.15987124463519314, + "grad_norm": 0.10627669234936658, + "learning_rate": 0.0001912884624855345, + "loss": 0.6788, + "step": 1788 + }, + { + "epoch": 0.1599606580829757, + "grad_norm": 0.11102350467664336, + "learning_rate": 0.0001912766366294001, + "loss": 0.7038, + "step": 1789 + }, + { + "epoch": 0.16005007153075823, + "grad_norm": 0.11496848162949178, + "learning_rate": 0.00019126480311800444, + "loss": 0.6573, + "step": 1790 + }, + { + "epoch": 0.16013948497854077, + "grad_norm": 0.11620212724085546, + "learning_rate": 0.00019125296195233996, + "loss": 0.7047, + "step": 1791 + }, + { + "epoch": 0.16022889842632332, + "grad_norm": 0.11441173843194777, + "learning_rate": 0.00019124111313339976, + "loss": 0.6609, + "step": 1792 + }, + { + "epoch": 0.16031831187410586, + "grad_norm": 0.10344257661914436, + "learning_rate": 0.0001912292566621776, + "loss": 0.6967, + "step": 1793 + }, + { + "epoch": 0.1604077253218884, + "grad_norm": 0.10919624219882058, + "learning_rate": 0.00019121739253966785, + "loss": 0.6374, + "step": 1794 + }, + { + "epoch": 0.16049713876967095, + "grad_norm": 0.11213024899292781, + "learning_rate": 0.00019120552076686554, + "loss": 0.6618, + "step": 1795 + }, + { + "epoch": 0.1605865522174535, + "grad_norm": 0.12560943259071713, + "learning_rate": 0.0001911936413447664, + "loss": 0.741, + "step": 1796 + }, + { + "epoch": 0.16067596566523606, + "grad_norm": 0.1247327454197769, + "learning_rate": 0.00019118175427436666, + "loss": 0.6842, + "step": 1797 + }, + { + "epoch": 0.1607653791130186, + "grad_norm": 0.10713057384576155, + "learning_rate": 0.0001911698595566633, + "loss": 0.696, + "step": 1798 + }, + { + "epoch": 0.16085479256080115, + "grad_norm": 0.14080760230827394, + "learning_rate": 0.00019115795719265395, + "loss": 0.7364, + "step": 1799 + }, + { + "epoch": 0.1609442060085837, + "grad_norm": 0.12161947546986525, + "learning_rate": 0.0001911460471833368, + "loss": 0.6811, + "step": 1800 + }, + { + "epoch": 0.16103361945636624, + "grad_norm": 0.12484661955728499, + "learning_rate": 0.00019113412952971077, + "loss": 0.7019, + "step": 1801 + }, + { + "epoch": 0.16112303290414878, + "grad_norm": 0.11374280199213958, + "learning_rate": 0.00019112220423277534, + "loss": 0.6549, + "step": 1802 + }, + { + "epoch": 0.16121244635193133, + "grad_norm": 0.12731574303922244, + "learning_rate": 0.0001911102712935307, + "loss": 0.7153, + "step": 1803 + }, + { + "epoch": 0.16130185979971387, + "grad_norm": 0.11265742087829825, + "learning_rate": 0.00019109833071297763, + "loss": 0.6779, + "step": 1804 + }, + { + "epoch": 0.16139127324749641, + "grad_norm": 0.11232090003094962, + "learning_rate": 0.00019108638249211758, + "loss": 0.7072, + "step": 1805 + }, + { + "epoch": 0.16148068669527896, + "grad_norm": 0.11849279762641216, + "learning_rate": 0.00019107442663195265, + "loss": 0.6841, + "step": 1806 + }, + { + "epoch": 0.16157010014306153, + "grad_norm": 0.11427919141427417, + "learning_rate": 0.00019106246313348554, + "loss": 0.6921, + "step": 1807 + }, + { + "epoch": 0.16165951359084407, + "grad_norm": 0.11479866915969765, + "learning_rate": 0.00019105049199771962, + "loss": 0.7142, + "step": 1808 + }, + { + "epoch": 0.16174892703862662, + "grad_norm": 0.1169163113704344, + "learning_rate": 0.00019103851322565892, + "loss": 0.6072, + "step": 1809 + }, + { + "epoch": 0.16183834048640916, + "grad_norm": 0.12512295314391567, + "learning_rate": 0.00019102652681830804, + "loss": 0.7124, + "step": 1810 + }, + { + "epoch": 0.1619277539341917, + "grad_norm": 0.12713858070265707, + "learning_rate": 0.00019101453277667226, + "loss": 0.7075, + "step": 1811 + }, + { + "epoch": 0.16201716738197425, + "grad_norm": 0.12213971300224032, + "learning_rate": 0.00019100253110175758, + "loss": 0.6817, + "step": 1812 + }, + { + "epoch": 0.1621065808297568, + "grad_norm": 0.12088868178250073, + "learning_rate": 0.00019099052179457054, + "loss": 0.7158, + "step": 1813 + }, + { + "epoch": 0.16219599427753933, + "grad_norm": 0.12183777724244794, + "learning_rate": 0.00019097850485611827, + "loss": 0.6911, + "step": 1814 + }, + { + "epoch": 0.16228540772532188, + "grad_norm": 0.12570988891845458, + "learning_rate": 0.00019096648028740868, + "loss": 0.701, + "step": 1815 + }, + { + "epoch": 0.16237482117310442, + "grad_norm": 0.12084006913696883, + "learning_rate": 0.00019095444808945027, + "loss": 0.6841, + "step": 1816 + }, + { + "epoch": 0.162464234620887, + "grad_norm": 0.10867161017468978, + "learning_rate": 0.00019094240826325213, + "loss": 0.645, + "step": 1817 + }, + { + "epoch": 0.16255364806866954, + "grad_norm": 0.11757547545514577, + "learning_rate": 0.00019093036080982404, + "loss": 0.7141, + "step": 1818 + }, + { + "epoch": 0.16264306151645208, + "grad_norm": 0.12448252137088664, + "learning_rate": 0.0001909183057301764, + "loss": 0.7058, + "step": 1819 + }, + { + "epoch": 0.16273247496423462, + "grad_norm": 0.12181782053155381, + "learning_rate": 0.0001909062430253203, + "loss": 0.7001, + "step": 1820 + }, + { + "epoch": 0.16282188841201717, + "grad_norm": 0.115080956091501, + "learning_rate": 0.00019089417269626733, + "loss": 0.7068, + "step": 1821 + }, + { + "epoch": 0.1629113018597997, + "grad_norm": 0.13023642281183986, + "learning_rate": 0.00019088209474402992, + "loss": 0.7108, + "step": 1822 + }, + { + "epoch": 0.16300071530758226, + "grad_norm": 0.12931263976517735, + "learning_rate": 0.00019087000916962095, + "loss": 0.699, + "step": 1823 + }, + { + "epoch": 0.1630901287553648, + "grad_norm": 0.11120989173144309, + "learning_rate": 0.00019085791597405404, + "loss": 0.6906, + "step": 1824 + }, + { + "epoch": 0.16317954220314734, + "grad_norm": 0.10447677194605309, + "learning_rate": 0.00019084581515834347, + "loss": 0.6637, + "step": 1825 + }, + { + "epoch": 0.1632689556509299, + "grad_norm": 0.10755714461784904, + "learning_rate": 0.00019083370672350408, + "loss": 0.704, + "step": 1826 + }, + { + "epoch": 0.16335836909871246, + "grad_norm": 0.1290069994628505, + "learning_rate": 0.0001908215906705514, + "loss": 0.6795, + "step": 1827 + }, + { + "epoch": 0.163447782546495, + "grad_norm": 0.10702350766178427, + "learning_rate": 0.00019080946700050162, + "loss": 0.6637, + "step": 1828 + }, + { + "epoch": 0.16353719599427755, + "grad_norm": 0.12413542756566581, + "learning_rate": 0.00019079733571437154, + "loss": 0.6932, + "step": 1829 + }, + { + "epoch": 0.1636266094420601, + "grad_norm": 0.13968419835157822, + "learning_rate": 0.0001907851968131785, + "loss": 0.7445, + "step": 1830 + }, + { + "epoch": 0.16371602288984263, + "grad_norm": 0.1305320073205767, + "learning_rate": 0.00019077305029794068, + "loss": 0.7055, + "step": 1831 + }, + { + "epoch": 0.16380543633762518, + "grad_norm": 0.12235981964054365, + "learning_rate": 0.00019076089616967677, + "loss": 0.6865, + "step": 1832 + }, + { + "epoch": 0.16389484978540772, + "grad_norm": 0.12011280909336101, + "learning_rate": 0.0001907487344294061, + "loss": 0.707, + "step": 1833 + }, + { + "epoch": 0.16398426323319026, + "grad_norm": 0.11964816039621318, + "learning_rate": 0.00019073656507814866, + "loss": 0.7061, + "step": 1834 + }, + { + "epoch": 0.1640736766809728, + "grad_norm": 0.11625904957339855, + "learning_rate": 0.00019072438811692507, + "loss": 0.6958, + "step": 1835 + }, + { + "epoch": 0.16416309012875535, + "grad_norm": 0.11571545343365594, + "learning_rate": 0.00019071220354675665, + "loss": 0.7039, + "step": 1836 + }, + { + "epoch": 0.16425250357653792, + "grad_norm": 0.1363780456755864, + "learning_rate": 0.00019070001136866526, + "loss": 0.6968, + "step": 1837 + }, + { + "epoch": 0.16434191702432047, + "grad_norm": 0.12573763612309227, + "learning_rate": 0.00019068781158367346, + "loss": 0.7217, + "step": 1838 + }, + { + "epoch": 0.164431330472103, + "grad_norm": 0.11518762934598382, + "learning_rate": 0.00019067560419280442, + "loss": 0.6868, + "step": 1839 + }, + { + "epoch": 0.16452074391988555, + "grad_norm": 0.13487998910449553, + "learning_rate": 0.00019066338919708197, + "loss": 0.6948, + "step": 1840 + }, + { + "epoch": 0.1646101573676681, + "grad_norm": 0.10853749986535581, + "learning_rate": 0.00019065116659753054, + "loss": 0.7022, + "step": 1841 + }, + { + "epoch": 0.16469957081545064, + "grad_norm": 0.11509724255590797, + "learning_rate": 0.00019063893639517527, + "loss": 0.68, + "step": 1842 + }, + { + "epoch": 0.16478898426323318, + "grad_norm": 0.10802904387437053, + "learning_rate": 0.00019062669859104187, + "loss": 0.7066, + "step": 1843 + }, + { + "epoch": 0.16487839771101573, + "grad_norm": 0.1231324170137931, + "learning_rate": 0.0001906144531861567, + "loss": 0.6728, + "step": 1844 + }, + { + "epoch": 0.16496781115879827, + "grad_norm": 0.11732838163753348, + "learning_rate": 0.00019060220018154677, + "loss": 0.6418, + "step": 1845 + }, + { + "epoch": 0.16505722460658084, + "grad_norm": 0.1087767858521221, + "learning_rate": 0.00019058993957823974, + "loss": 0.6771, + "step": 1846 + }, + { + "epoch": 0.1651466380543634, + "grad_norm": 0.109158590524288, + "learning_rate": 0.00019057767137726388, + "loss": 0.6583, + "step": 1847 + }, + { + "epoch": 0.16523605150214593, + "grad_norm": 0.11903775383097459, + "learning_rate": 0.00019056539557964813, + "loss": 0.6784, + "step": 1848 + }, + { + "epoch": 0.16532546494992847, + "grad_norm": 0.12007042697950616, + "learning_rate": 0.000190553112186422, + "loss": 0.7186, + "step": 1849 + }, + { + "epoch": 0.16541487839771102, + "grad_norm": 0.1347217715626391, + "learning_rate": 0.00019054082119861573, + "loss": 0.7195, + "step": 1850 + }, + { + "epoch": 0.16550429184549356, + "grad_norm": 0.1241008064704117, + "learning_rate": 0.0001905285226172601, + "loss": 0.6988, + "step": 1851 + }, + { + "epoch": 0.1655937052932761, + "grad_norm": 0.1159631756624994, + "learning_rate": 0.00019051621644338665, + "loss": 0.7179, + "step": 1852 + }, + { + "epoch": 0.16568311874105865, + "grad_norm": 0.11330285022344735, + "learning_rate": 0.0001905039026780274, + "loss": 0.7256, + "step": 1853 + }, + { + "epoch": 0.1657725321888412, + "grad_norm": 0.1373233214363452, + "learning_rate": 0.00019049158132221515, + "loss": 0.6931, + "step": 1854 + }, + { + "epoch": 0.16586194563662374, + "grad_norm": 0.11469817037481449, + "learning_rate": 0.0001904792523769833, + "loss": 0.716, + "step": 1855 + }, + { + "epoch": 0.1659513590844063, + "grad_norm": 0.11662227013562461, + "learning_rate": 0.00019046691584336577, + "loss": 0.6776, + "step": 1856 + }, + { + "epoch": 0.16604077253218885, + "grad_norm": 0.1174345419749616, + "learning_rate": 0.0001904545717223973, + "loss": 0.7107, + "step": 1857 + }, + { + "epoch": 0.1661301859799714, + "grad_norm": 0.11711337977516723, + "learning_rate": 0.00019044222001511312, + "loss": 0.7127, + "step": 1858 + }, + { + "epoch": 0.16621959942775394, + "grad_norm": 0.12780610984950197, + "learning_rate": 0.00019042986072254919, + "loss": 0.674, + "step": 1859 + }, + { + "epoch": 0.16630901287553648, + "grad_norm": 0.11195995644929864, + "learning_rate": 0.00019041749384574204, + "loss": 0.6921, + "step": 1860 + }, + { + "epoch": 0.16639842632331903, + "grad_norm": 0.13085403631674336, + "learning_rate": 0.0001904051193857289, + "loss": 0.7229, + "step": 1861 + }, + { + "epoch": 0.16648783977110157, + "grad_norm": 0.120199205365209, + "learning_rate": 0.00019039273734354755, + "loss": 0.6472, + "step": 1862 + }, + { + "epoch": 0.1665772532188841, + "grad_norm": 0.12858298668564497, + "learning_rate": 0.00019038034772023648, + "loss": 0.7416, + "step": 1863 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.11835490460174665, + "learning_rate": 0.00019036795051683483, + "loss": 0.6586, + "step": 1864 + }, + { + "epoch": 0.1667560801144492, + "grad_norm": 0.10649051584398966, + "learning_rate": 0.0001903555457343823, + "loss": 0.6581, + "step": 1865 + }, + { + "epoch": 0.16684549356223177, + "grad_norm": 0.11590091512074796, + "learning_rate": 0.00019034313337391924, + "loss": 0.6762, + "step": 1866 + }, + { + "epoch": 0.16693490701001432, + "grad_norm": 0.1272168061416584, + "learning_rate": 0.00019033071343648673, + "loss": 0.6959, + "step": 1867 + }, + { + "epoch": 0.16702432045779686, + "grad_norm": 0.1551655623020352, + "learning_rate": 0.00019031828592312635, + "loss": 0.6925, + "step": 1868 + }, + { + "epoch": 0.1671137339055794, + "grad_norm": 0.11421143795833792, + "learning_rate": 0.00019030585083488043, + "loss": 0.6883, + "step": 1869 + }, + { + "epoch": 0.16720314735336195, + "grad_norm": 0.11689255534343101, + "learning_rate": 0.00019029340817279183, + "loss": 0.6719, + "step": 1870 + }, + { + "epoch": 0.1672925608011445, + "grad_norm": 0.11554462553823172, + "learning_rate": 0.00019028095793790416, + "loss": 0.6794, + "step": 1871 + }, + { + "epoch": 0.16738197424892703, + "grad_norm": 0.11846678252310555, + "learning_rate": 0.00019026850013126157, + "loss": 0.7085, + "step": 1872 + }, + { + "epoch": 0.16747138769670958, + "grad_norm": 0.11463294263333043, + "learning_rate": 0.0001902560347539089, + "loss": 0.7021, + "step": 1873 + }, + { + "epoch": 0.16756080114449212, + "grad_norm": 0.12114170335725694, + "learning_rate": 0.00019024356180689158, + "loss": 0.6855, + "step": 1874 + }, + { + "epoch": 0.16765021459227467, + "grad_norm": 0.11597466383593678, + "learning_rate": 0.00019023108129125572, + "loss": 0.7254, + "step": 1875 + }, + { + "epoch": 0.16773962804005724, + "grad_norm": 0.13901185210727116, + "learning_rate": 0.00019021859320804806, + "loss": 0.7486, + "step": 1876 + }, + { + "epoch": 0.16782904148783978, + "grad_norm": 0.10162651304011529, + "learning_rate": 0.00019020609755831592, + "loss": 0.6939, + "step": 1877 + }, + { + "epoch": 0.16791845493562232, + "grad_norm": 0.11948216340519852, + "learning_rate": 0.00019019359434310738, + "loss": 0.6817, + "step": 1878 + }, + { + "epoch": 0.16800786838340487, + "grad_norm": 0.11732077946104558, + "learning_rate": 0.00019018108356347094, + "loss": 0.658, + "step": 1879 + }, + { + "epoch": 0.1680972818311874, + "grad_norm": 0.11732669674814819, + "learning_rate": 0.00019016856522045597, + "loss": 0.6604, + "step": 1880 + }, + { + "epoch": 0.16818669527896996, + "grad_norm": 0.10266784366427963, + "learning_rate": 0.0001901560393151123, + "loss": 0.6505, + "step": 1881 + }, + { + "epoch": 0.1682761087267525, + "grad_norm": 0.13325040125652118, + "learning_rate": 0.00019014350584849052, + "loss": 0.7131, + "step": 1882 + }, + { + "epoch": 0.16836552217453504, + "grad_norm": 0.13142516899864337, + "learning_rate": 0.00019013096482164177, + "loss": 0.7111, + "step": 1883 + }, + { + "epoch": 0.1684549356223176, + "grad_norm": 0.11349564692282199, + "learning_rate": 0.00019011841623561783, + "loss": 0.5713, + "step": 1884 + }, + { + "epoch": 0.16854434907010013, + "grad_norm": 0.11283770676256552, + "learning_rate": 0.00019010586009147117, + "loss": 0.5962, + "step": 1885 + }, + { + "epoch": 0.1686337625178827, + "grad_norm": 0.11948325561944464, + "learning_rate": 0.00019009329639025483, + "loss": 0.6906, + "step": 1886 + }, + { + "epoch": 0.16872317596566525, + "grad_norm": 0.14533023494589217, + "learning_rate": 0.00019008072513302255, + "loss": 0.7367, + "step": 1887 + }, + { + "epoch": 0.1688125894134478, + "grad_norm": 0.1211341791775917, + "learning_rate": 0.00019006814632082863, + "loss": 0.7252, + "step": 1888 + }, + { + "epoch": 0.16890200286123033, + "grad_norm": 0.1443332729660412, + "learning_rate": 0.00019005555995472805, + "loss": 0.7518, + "step": 1889 + }, + { + "epoch": 0.16899141630901288, + "grad_norm": 0.12871617769117885, + "learning_rate": 0.00019004296603577646, + "loss": 0.6681, + "step": 1890 + }, + { + "epoch": 0.16908082975679542, + "grad_norm": 0.14431234808607835, + "learning_rate": 0.00019003036456503, + "loss": 0.5987, + "step": 1891 + }, + { + "epoch": 0.16917024320457796, + "grad_norm": 0.1230231235054903, + "learning_rate": 0.0001900177555435456, + "loss": 0.6903, + "step": 1892 + }, + { + "epoch": 0.1692596566523605, + "grad_norm": 0.1145834456083459, + "learning_rate": 0.00019000513897238076, + "loss": 0.7363, + "step": 1893 + }, + { + "epoch": 0.16934907010014305, + "grad_norm": 0.10799767648318864, + "learning_rate": 0.00018999251485259363, + "loss": 0.6758, + "step": 1894 + }, + { + "epoch": 0.1694384835479256, + "grad_norm": 0.11007485917546535, + "learning_rate": 0.00018997988318524293, + "loss": 0.7184, + "step": 1895 + }, + { + "epoch": 0.16952789699570817, + "grad_norm": 0.11618241004262046, + "learning_rate": 0.00018996724397138813, + "loss": 0.7151, + "step": 1896 + }, + { + "epoch": 0.1696173104434907, + "grad_norm": 0.1242934015592723, + "learning_rate": 0.0001899545972120892, + "loss": 0.6635, + "step": 1897 + }, + { + "epoch": 0.16970672389127325, + "grad_norm": 0.12803575833146066, + "learning_rate": 0.00018994194290840687, + "loss": 0.7247, + "step": 1898 + }, + { + "epoch": 0.1697961373390558, + "grad_norm": 0.11113942088498296, + "learning_rate": 0.0001899292810614024, + "loss": 0.6798, + "step": 1899 + }, + { + "epoch": 0.16988555078683834, + "grad_norm": 0.12203122797252532, + "learning_rate": 0.00018991661167213773, + "loss": 0.7455, + "step": 1900 + }, + { + "epoch": 0.16997496423462088, + "grad_norm": 0.12156601922778879, + "learning_rate": 0.00018990393474167542, + "loss": 0.7159, + "step": 1901 + }, + { + "epoch": 0.17006437768240343, + "grad_norm": 0.11325620336394869, + "learning_rate": 0.0001898912502710787, + "loss": 0.7118, + "step": 1902 + }, + { + "epoch": 0.17015379113018597, + "grad_norm": 0.11541451813037597, + "learning_rate": 0.00018987855826141137, + "loss": 0.6949, + "step": 1903 + }, + { + "epoch": 0.17024320457796852, + "grad_norm": 0.11095930648915221, + "learning_rate": 0.0001898658587137379, + "loss": 0.6681, + "step": 1904 + }, + { + "epoch": 0.17033261802575106, + "grad_norm": 0.1312408315068312, + "learning_rate": 0.0001898531516291234, + "loss": 0.712, + "step": 1905 + }, + { + "epoch": 0.17042203147353363, + "grad_norm": 0.10738484544177196, + "learning_rate": 0.00018984043700863356, + "loss": 0.6543, + "step": 1906 + }, + { + "epoch": 0.17051144492131617, + "grad_norm": 0.12416423116071065, + "learning_rate": 0.0001898277148533348, + "loss": 0.7234, + "step": 1907 + }, + { + "epoch": 0.17060085836909872, + "grad_norm": 0.1250962278939358, + "learning_rate": 0.0001898149851642941, + "loss": 0.6587, + "step": 1908 + }, + { + "epoch": 0.17069027181688126, + "grad_norm": 0.1107248010608424, + "learning_rate": 0.00018980224794257905, + "loss": 0.7228, + "step": 1909 + }, + { + "epoch": 0.1707796852646638, + "grad_norm": 0.11471972554795704, + "learning_rate": 0.0001897895031892579, + "loss": 0.6124, + "step": 1910 + }, + { + "epoch": 0.17086909871244635, + "grad_norm": 0.11785745142026482, + "learning_rate": 0.00018977675090539955, + "loss": 0.7205, + "step": 1911 + }, + { + "epoch": 0.1709585121602289, + "grad_norm": 0.12610179498787155, + "learning_rate": 0.00018976399109207353, + "loss": 0.7001, + "step": 1912 + }, + { + "epoch": 0.17104792560801144, + "grad_norm": 0.1255445873932955, + "learning_rate": 0.00018975122375035, + "loss": 0.7229, + "step": 1913 + }, + { + "epoch": 0.17113733905579398, + "grad_norm": 0.11051776499173355, + "learning_rate": 0.0001897384488812997, + "loss": 0.7016, + "step": 1914 + }, + { + "epoch": 0.17122675250357655, + "grad_norm": 0.10912357376078598, + "learning_rate": 0.00018972566648599407, + "loss": 0.6882, + "step": 1915 + }, + { + "epoch": 0.1713161659513591, + "grad_norm": 0.10116900188520175, + "learning_rate": 0.0001897128765655052, + "loss": 0.6517, + "step": 1916 + }, + { + "epoch": 0.17140557939914164, + "grad_norm": 0.13750110873733729, + "learning_rate": 0.00018970007912090566, + "loss": 0.7103, + "step": 1917 + }, + { + "epoch": 0.17149499284692418, + "grad_norm": 0.12586296303949543, + "learning_rate": 0.00018968727415326884, + "loss": 0.717, + "step": 1918 + }, + { + "epoch": 0.17158440629470673, + "grad_norm": 0.1307129135896672, + "learning_rate": 0.00018967446166366867, + "loss": 0.7085, + "step": 1919 + }, + { + "epoch": 0.17167381974248927, + "grad_norm": 0.11865256735290701, + "learning_rate": 0.00018966164165317966, + "loss": 0.7136, + "step": 1920 + }, + { + "epoch": 0.1717632331902718, + "grad_norm": 0.111180149182751, + "learning_rate": 0.00018964881412287708, + "loss": 0.6804, + "step": 1921 + }, + { + "epoch": 0.17185264663805436, + "grad_norm": 0.12590551404734557, + "learning_rate": 0.00018963597907383672, + "loss": 0.7018, + "step": 1922 + }, + { + "epoch": 0.1719420600858369, + "grad_norm": 0.11284523123658492, + "learning_rate": 0.00018962313650713503, + "loss": 0.6732, + "step": 1923 + }, + { + "epoch": 0.17203147353361944, + "grad_norm": 0.13315849418373152, + "learning_rate": 0.00018961028642384915, + "loss": 0.6914, + "step": 1924 + }, + { + "epoch": 0.17212088698140202, + "grad_norm": 0.12074553807202176, + "learning_rate": 0.00018959742882505674, + "loss": 0.7006, + "step": 1925 + }, + { + "epoch": 0.17221030042918456, + "grad_norm": 0.11461020042781171, + "learning_rate": 0.00018958456371183618, + "loss": 0.6797, + "step": 1926 + }, + { + "epoch": 0.1722997138769671, + "grad_norm": 0.10896999754597349, + "learning_rate": 0.00018957169108526646, + "loss": 0.6668, + "step": 1927 + }, + { + "epoch": 0.17238912732474965, + "grad_norm": 0.12373749230783676, + "learning_rate": 0.00018955881094642721, + "loss": 0.6989, + "step": 1928 + }, + { + "epoch": 0.1724785407725322, + "grad_norm": 0.1325203522486443, + "learning_rate": 0.00018954592329639862, + "loss": 0.6888, + "step": 1929 + }, + { + "epoch": 0.17256795422031473, + "grad_norm": 0.12237903510544877, + "learning_rate": 0.00018953302813626158, + "loss": 0.683, + "step": 1930 + }, + { + "epoch": 0.17265736766809728, + "grad_norm": 0.11728031432891546, + "learning_rate": 0.00018952012546709764, + "loss": 0.6588, + "step": 1931 + }, + { + "epoch": 0.17274678111587982, + "grad_norm": 0.11308678381234699, + "learning_rate": 0.00018950721528998885, + "loss": 0.6465, + "step": 1932 + }, + { + "epoch": 0.17283619456366237, + "grad_norm": 0.1333834363537053, + "learning_rate": 0.00018949429760601802, + "loss": 0.7467, + "step": 1933 + }, + { + "epoch": 0.1729256080114449, + "grad_norm": 0.12394072098445534, + "learning_rate": 0.00018948137241626853, + "loss": 0.6874, + "step": 1934 + }, + { + "epoch": 0.17301502145922748, + "grad_norm": 0.11737875480922377, + "learning_rate": 0.0001894684397218244, + "loss": 0.6733, + "step": 1935 + }, + { + "epoch": 0.17310443490701002, + "grad_norm": 0.11097316652054559, + "learning_rate": 0.0001894554995237703, + "loss": 0.6682, + "step": 1936 + }, + { + "epoch": 0.17319384835479257, + "grad_norm": 0.1351305010346107, + "learning_rate": 0.00018944255182319148, + "loss": 0.7171, + "step": 1937 + }, + { + "epoch": 0.1732832618025751, + "grad_norm": 0.13181177028110552, + "learning_rate": 0.00018942959662117384, + "loss": 0.7508, + "step": 1938 + }, + { + "epoch": 0.17337267525035766, + "grad_norm": 0.12580782527274004, + "learning_rate": 0.00018941663391880396, + "loss": 0.6602, + "step": 1939 + }, + { + "epoch": 0.1734620886981402, + "grad_norm": 0.11570072067957521, + "learning_rate": 0.00018940366371716897, + "loss": 0.6892, + "step": 1940 + }, + { + "epoch": 0.17355150214592274, + "grad_norm": 0.12966638832713132, + "learning_rate": 0.00018939068601735666, + "loss": 0.7244, + "step": 1941 + }, + { + "epoch": 0.1736409155937053, + "grad_norm": 0.126821468726959, + "learning_rate": 0.0001893777008204555, + "loss": 0.6973, + "step": 1942 + }, + { + "epoch": 0.17373032904148783, + "grad_norm": 0.12152484549555127, + "learning_rate": 0.0001893647081275545, + "loss": 0.6808, + "step": 1943 + }, + { + "epoch": 0.17381974248927037, + "grad_norm": 0.09914175197643066, + "learning_rate": 0.00018935170793974335, + "loss": 0.676, + "step": 1944 + }, + { + "epoch": 0.17390915593705294, + "grad_norm": 0.11123441087004964, + "learning_rate": 0.00018933870025811237, + "loss": 0.6743, + "step": 1945 + }, + { + "epoch": 0.1739985693848355, + "grad_norm": 0.12401239494028553, + "learning_rate": 0.0001893256850837525, + "loss": 0.704, + "step": 1946 + }, + { + "epoch": 0.17408798283261803, + "grad_norm": 0.11349530789944214, + "learning_rate": 0.0001893126624177553, + "loss": 0.6797, + "step": 1947 + }, + { + "epoch": 0.17417739628040058, + "grad_norm": 0.11169588338641652, + "learning_rate": 0.00018929963226121295, + "loss": 0.7088, + "step": 1948 + }, + { + "epoch": 0.17426680972818312, + "grad_norm": 0.12061336395491205, + "learning_rate": 0.0001892865946152183, + "loss": 0.7065, + "step": 1949 + }, + { + "epoch": 0.17435622317596566, + "grad_norm": 0.10451195605315274, + "learning_rate": 0.0001892735494808648, + "loss": 0.7124, + "step": 1950 + }, + { + "epoch": 0.1744456366237482, + "grad_norm": 0.09809997428896626, + "learning_rate": 0.0001892604968592465, + "loss": 0.6372, + "step": 1951 + }, + { + "epoch": 0.17453505007153075, + "grad_norm": 0.10737070842516347, + "learning_rate": 0.00018924743675145813, + "loss": 0.6653, + "step": 1952 + }, + { + "epoch": 0.1746244635193133, + "grad_norm": 0.11396955936853741, + "learning_rate": 0.00018923436915859503, + "loss": 0.6426, + "step": 1953 + }, + { + "epoch": 0.17471387696709584, + "grad_norm": 0.11060929076703208, + "learning_rate": 0.00018922129408175314, + "loss": 0.6278, + "step": 1954 + }, + { + "epoch": 0.1748032904148784, + "grad_norm": 0.12093845896188483, + "learning_rate": 0.0001892082115220291, + "loss": 0.7112, + "step": 1955 + }, + { + "epoch": 0.17489270386266095, + "grad_norm": 0.12590877223249441, + "learning_rate": 0.00018919512148052005, + "loss": 0.7279, + "step": 1956 + }, + { + "epoch": 0.1749821173104435, + "grad_norm": 0.11788298294661363, + "learning_rate": 0.0001891820239583239, + "loss": 0.7041, + "step": 1957 + }, + { + "epoch": 0.17507153075822604, + "grad_norm": 0.12188826094578241, + "learning_rate": 0.00018916891895653915, + "loss": 0.7191, + "step": 1958 + }, + { + "epoch": 0.17516094420600858, + "grad_norm": 0.11522718668704668, + "learning_rate": 0.0001891558064762648, + "loss": 0.6814, + "step": 1959 + }, + { + "epoch": 0.17525035765379113, + "grad_norm": 0.12712554619086094, + "learning_rate": 0.00018914268651860067, + "loss": 0.6789, + "step": 1960 + }, + { + "epoch": 0.17533977110157367, + "grad_norm": 0.13319574710174706, + "learning_rate": 0.00018912955908464708, + "loss": 0.699, + "step": 1961 + }, + { + "epoch": 0.17542918454935622, + "grad_norm": 0.1455686886170692, + "learning_rate": 0.00018911642417550497, + "loss": 0.7323, + "step": 1962 + }, + { + "epoch": 0.17551859799713876, + "grad_norm": 0.10090837444002514, + "learning_rate": 0.00018910328179227605, + "loss": 0.5939, + "step": 1963 + }, + { + "epoch": 0.1756080114449213, + "grad_norm": 0.13135823579278724, + "learning_rate": 0.0001890901319360624, + "loss": 0.7222, + "step": 1964 + }, + { + "epoch": 0.17569742489270387, + "grad_norm": 0.13091414410136595, + "learning_rate": 0.00018907697460796707, + "loss": 0.689, + "step": 1965 + }, + { + "epoch": 0.17578683834048642, + "grad_norm": 0.11076451241047343, + "learning_rate": 0.00018906380980909343, + "loss": 0.694, + "step": 1966 + }, + { + "epoch": 0.17587625178826896, + "grad_norm": 0.12942628363997075, + "learning_rate": 0.00018905063754054563, + "loss": 0.6873, + "step": 1967 + }, + { + "epoch": 0.1759656652360515, + "grad_norm": 0.12478166170347577, + "learning_rate": 0.00018903745780342839, + "loss": 0.7111, + "step": 1968 + }, + { + "epoch": 0.17605507868383405, + "grad_norm": 0.11525834721651874, + "learning_rate": 0.00018902427059884708, + "loss": 0.6967, + "step": 1969 + }, + { + "epoch": 0.1761444921316166, + "grad_norm": 0.11752752679494409, + "learning_rate": 0.00018901107592790776, + "loss": 0.6798, + "step": 1970 + }, + { + "epoch": 0.17623390557939914, + "grad_norm": 0.11376179641568542, + "learning_rate": 0.00018899787379171693, + "loss": 0.6601, + "step": 1971 + }, + { + "epoch": 0.17632331902718168, + "grad_norm": 0.12697112316443981, + "learning_rate": 0.00018898466419138197, + "loss": 0.71, + "step": 1972 + }, + { + "epoch": 0.17641273247496422, + "grad_norm": 0.11354096797475285, + "learning_rate": 0.00018897144712801066, + "loss": 0.6929, + "step": 1973 + }, + { + "epoch": 0.1765021459227468, + "grad_norm": 0.12462814506160975, + "learning_rate": 0.00018895822260271152, + "loss": 0.6783, + "step": 1974 + }, + { + "epoch": 0.17659155937052934, + "grad_norm": 0.1230247174736318, + "learning_rate": 0.0001889449906165937, + "loss": 0.72, + "step": 1975 + }, + { + "epoch": 0.17668097281831188, + "grad_norm": 0.11821180958126917, + "learning_rate": 0.00018893175117076693, + "loss": 0.6787, + "step": 1976 + }, + { + "epoch": 0.17677038626609443, + "grad_norm": 0.13553368856735729, + "learning_rate": 0.00018891850426634162, + "loss": 0.7415, + "step": 1977 + }, + { + "epoch": 0.17685979971387697, + "grad_norm": 0.10998558247804874, + "learning_rate": 0.00018890524990442873, + "loss": 0.6567, + "step": 1978 + }, + { + "epoch": 0.1769492131616595, + "grad_norm": 0.12034869711257005, + "learning_rate": 0.00018889198808613985, + "loss": 0.6733, + "step": 1979 + }, + { + "epoch": 0.17703862660944206, + "grad_norm": 0.13233751110708233, + "learning_rate": 0.00018887871881258735, + "loss": 0.7428, + "step": 1980 + }, + { + "epoch": 0.1771280400572246, + "grad_norm": 0.11820976322856652, + "learning_rate": 0.000188865442084884, + "loss": 0.6618, + "step": 1981 + }, + { + "epoch": 0.17721745350500714, + "grad_norm": 0.11766617709681067, + "learning_rate": 0.00018885215790414336, + "loss": 0.6889, + "step": 1982 + }, + { + "epoch": 0.1773068669527897, + "grad_norm": 0.11799752320665635, + "learning_rate": 0.00018883886627147955, + "loss": 0.7033, + "step": 1983 + }, + { + "epoch": 0.17739628040057226, + "grad_norm": 0.1216472850962718, + "learning_rate": 0.0001888255671880073, + "loss": 0.6758, + "step": 1984 + }, + { + "epoch": 0.1774856938483548, + "grad_norm": 0.11624922382559535, + "learning_rate": 0.00018881226065484204, + "loss": 0.6727, + "step": 1985 + }, + { + "epoch": 0.17757510729613735, + "grad_norm": 0.12414958185492883, + "learning_rate": 0.0001887989466730997, + "loss": 0.7228, + "step": 1986 + }, + { + "epoch": 0.1776645207439199, + "grad_norm": 0.12838611738353606, + "learning_rate": 0.00018878562524389696, + "loss": 0.7201, + "step": 1987 + }, + { + "epoch": 0.17775393419170243, + "grad_norm": 0.12299815118816897, + "learning_rate": 0.00018877229636835106, + "loss": 0.6845, + "step": 1988 + }, + { + "epoch": 0.17784334763948498, + "grad_norm": 0.12832235061909852, + "learning_rate": 0.00018875896004757984, + "loss": 0.7314, + "step": 1989 + }, + { + "epoch": 0.17793276108726752, + "grad_norm": 0.11283019286601077, + "learning_rate": 0.00018874561628270188, + "loss": 0.6886, + "step": 1990 + }, + { + "epoch": 0.17802217453505007, + "grad_norm": 0.13504726851460228, + "learning_rate": 0.00018873226507483623, + "loss": 0.7586, + "step": 1991 + }, + { + "epoch": 0.1781115879828326, + "grad_norm": 0.11331794951464227, + "learning_rate": 0.0001887189064251027, + "loss": 0.6456, + "step": 1992 + }, + { + "epoch": 0.17820100143061515, + "grad_norm": 0.1155841993927725, + "learning_rate": 0.00018870554033462159, + "loss": 0.7142, + "step": 1993 + }, + { + "epoch": 0.17829041487839772, + "grad_norm": 0.13902794231285567, + "learning_rate": 0.00018869216680451398, + "loss": 0.7166, + "step": 1994 + }, + { + "epoch": 0.17837982832618027, + "grad_norm": 0.10923667486883037, + "learning_rate": 0.0001886787858359014, + "loss": 0.6808, + "step": 1995 + }, + { + "epoch": 0.1784692417739628, + "grad_norm": 0.12830818168433725, + "learning_rate": 0.0001886653974299062, + "loss": 0.7174, + "step": 1996 + }, + { + "epoch": 0.17855865522174535, + "grad_norm": 0.11308229666957095, + "learning_rate": 0.0001886520015876512, + "loss": 0.6519, + "step": 1997 + }, + { + "epoch": 0.1786480686695279, + "grad_norm": 0.10787324553547355, + "learning_rate": 0.00018863859831025988, + "loss": 0.656, + "step": 1998 + }, + { + "epoch": 0.17873748211731044, + "grad_norm": 0.11992473740363394, + "learning_rate": 0.00018862518759885636, + "loss": 0.6686, + "step": 1999 + }, + { + "epoch": 0.17882689556509299, + "grad_norm": 0.12412503727340538, + "learning_rate": 0.0001886117694545654, + "loss": 0.7061, + "step": 2000 + }, + { + "epoch": 0.17891630901287553, + "grad_norm": 0.10911402818899783, + "learning_rate": 0.00018859834387851233, + "loss": 0.6794, + "step": 2001 + }, + { + "epoch": 0.17900572246065807, + "grad_norm": 0.11045983644330752, + "learning_rate": 0.00018858491087182317, + "loss": 0.6899, + "step": 2002 + }, + { + "epoch": 0.17909513590844062, + "grad_norm": 0.12619786184980886, + "learning_rate": 0.00018857147043562452, + "loss": 0.699, + "step": 2003 + }, + { + "epoch": 0.1791845493562232, + "grad_norm": 0.10708963490147558, + "learning_rate": 0.00018855802257104363, + "loss": 0.6485, + "step": 2004 + }, + { + "epoch": 0.17927396280400573, + "grad_norm": 0.1246323332714693, + "learning_rate": 0.0001885445672792083, + "loss": 0.7131, + "step": 2005 + }, + { + "epoch": 0.17936337625178828, + "grad_norm": 0.11334792476655674, + "learning_rate": 0.00018853110456124709, + "loss": 0.6962, + "step": 2006 + }, + { + "epoch": 0.17945278969957082, + "grad_norm": 0.10451906464950485, + "learning_rate": 0.00018851763441828903, + "loss": 0.6798, + "step": 2007 + }, + { + "epoch": 0.17954220314735336, + "grad_norm": 0.11449354827026316, + "learning_rate": 0.00018850415685146387, + "loss": 0.6954, + "step": 2008 + }, + { + "epoch": 0.1796316165951359, + "grad_norm": 0.12515587978005246, + "learning_rate": 0.00018849067186190198, + "loss": 0.6813, + "step": 2009 + }, + { + "epoch": 0.17972103004291845, + "grad_norm": 0.10930919897756036, + "learning_rate": 0.0001884771794507343, + "loss": 0.7042, + "step": 2010 + }, + { + "epoch": 0.179810443490701, + "grad_norm": 0.13320321040707145, + "learning_rate": 0.00018846367961909244, + "loss": 0.6915, + "step": 2011 + }, + { + "epoch": 0.17989985693848354, + "grad_norm": 0.13704541962737565, + "learning_rate": 0.0001884501723681086, + "loss": 0.7107, + "step": 2012 + }, + { + "epoch": 0.17998927038626608, + "grad_norm": 0.10537813810267385, + "learning_rate": 0.00018843665769891562, + "loss": 0.6566, + "step": 2013 + }, + { + "epoch": 0.18007868383404865, + "grad_norm": 0.12716414031127338, + "learning_rate": 0.00018842313561264696, + "loss": 0.7027, + "step": 2014 + }, + { + "epoch": 0.1801680972818312, + "grad_norm": 0.10683603877142621, + "learning_rate": 0.0001884096061104367, + "loss": 0.6859, + "step": 2015 + }, + { + "epoch": 0.18025751072961374, + "grad_norm": 0.12200611069534043, + "learning_rate": 0.0001883960691934196, + "loss": 0.7487, + "step": 2016 + }, + { + "epoch": 0.18034692417739628, + "grad_norm": 0.11036473303759724, + "learning_rate": 0.00018838252486273087, + "loss": 0.6765, + "step": 2017 + }, + { + "epoch": 0.18043633762517883, + "grad_norm": 0.10490204371529484, + "learning_rate": 0.00018836897311950653, + "loss": 0.6744, + "step": 2018 + }, + { + "epoch": 0.18052575107296137, + "grad_norm": 0.11692085484139851, + "learning_rate": 0.00018835541396488315, + "loss": 0.6546, + "step": 2019 + }, + { + "epoch": 0.18061516452074391, + "grad_norm": 0.1036530394881881, + "learning_rate": 0.00018834184739999793, + "loss": 0.6455, + "step": 2020 + }, + { + "epoch": 0.18070457796852646, + "grad_norm": 0.11057738887266048, + "learning_rate": 0.00018832827342598861, + "loss": 0.6836, + "step": 2021 + }, + { + "epoch": 0.180793991416309, + "grad_norm": 0.12598917412226338, + "learning_rate": 0.0001883146920439937, + "loss": 0.6917, + "step": 2022 + }, + { + "epoch": 0.18088340486409155, + "grad_norm": 0.10194976952419027, + "learning_rate": 0.00018830110325515222, + "loss": 0.6705, + "step": 2023 + }, + { + "epoch": 0.18097281831187412, + "grad_norm": 0.1263572730919776, + "learning_rate": 0.00018828750706060385, + "loss": 0.6961, + "step": 2024 + }, + { + "epoch": 0.18106223175965666, + "grad_norm": 0.12234221464552172, + "learning_rate": 0.00018827390346148887, + "loss": 0.6769, + "step": 2025 + }, + { + "epoch": 0.1811516452074392, + "grad_norm": 0.12163239770562258, + "learning_rate": 0.00018826029245894827, + "loss": 0.7233, + "step": 2026 + }, + { + "epoch": 0.18124105865522175, + "grad_norm": 0.14223717538047242, + "learning_rate": 0.00018824667405412348, + "loss": 0.7197, + "step": 2027 + }, + { + "epoch": 0.1813304721030043, + "grad_norm": 0.1160309651284802, + "learning_rate": 0.00018823304824815672, + "loss": 0.7027, + "step": 2028 + }, + { + "epoch": 0.18141988555078684, + "grad_norm": 0.10978166578184342, + "learning_rate": 0.0001882194150421908, + "loss": 0.6269, + "step": 2029 + }, + { + "epoch": 0.18150929899856938, + "grad_norm": 0.1156755574270681, + "learning_rate": 0.00018820577443736904, + "loss": 0.6988, + "step": 2030 + }, + { + "epoch": 0.18159871244635192, + "grad_norm": 0.13234065521417787, + "learning_rate": 0.0001881921264348355, + "loss": 0.6969, + "step": 2031 + }, + { + "epoch": 0.18168812589413447, + "grad_norm": 0.1157114700047179, + "learning_rate": 0.00018817847103573486, + "loss": 0.6745, + "step": 2032 + }, + { + "epoch": 0.181777539341917, + "grad_norm": 0.11876271619987934, + "learning_rate": 0.00018816480824121232, + "loss": 0.6723, + "step": 2033 + }, + { + "epoch": 0.18186695278969958, + "grad_norm": 0.11861909256850428, + "learning_rate": 0.0001881511380524138, + "loss": 0.6744, + "step": 2034 + }, + { + "epoch": 0.18195636623748213, + "grad_norm": 0.12411995770202762, + "learning_rate": 0.0001881374604704858, + "loss": 0.7045, + "step": 2035 + }, + { + "epoch": 0.18204577968526467, + "grad_norm": 0.1143924892298148, + "learning_rate": 0.0001881237754965754, + "loss": 0.655, + "step": 2036 + }, + { + "epoch": 0.1821351931330472, + "grad_norm": 0.12003019831539187, + "learning_rate": 0.0001881100831318304, + "loss": 0.7241, + "step": 2037 + }, + { + "epoch": 0.18222460658082976, + "grad_norm": 0.12102949216365784, + "learning_rate": 0.00018809638337739915, + "loss": 0.7106, + "step": 2038 + }, + { + "epoch": 0.1823140200286123, + "grad_norm": 0.13536702624973912, + "learning_rate": 0.0001880826762344306, + "loss": 0.7005, + "step": 2039 + }, + { + "epoch": 0.18240343347639484, + "grad_norm": 0.11416067755985627, + "learning_rate": 0.00018806896170407437, + "loss": 0.6926, + "step": 2040 + }, + { + "epoch": 0.1824928469241774, + "grad_norm": 0.11646378265430969, + "learning_rate": 0.00018805523978748068, + "loss": 0.6962, + "step": 2041 + }, + { + "epoch": 0.18258226037195993, + "grad_norm": 0.11145734689552471, + "learning_rate": 0.0001880415104858004, + "loss": 0.677, + "step": 2042 + }, + { + "epoch": 0.1826716738197425, + "grad_norm": 0.10750192880643859, + "learning_rate": 0.00018802777380018496, + "loss": 0.7006, + "step": 2043 + }, + { + "epoch": 0.18276108726752505, + "grad_norm": 0.11743584753381822, + "learning_rate": 0.00018801402973178642, + "loss": 0.6663, + "step": 2044 + }, + { + "epoch": 0.1828505007153076, + "grad_norm": 0.11188032350138839, + "learning_rate": 0.0001880002782817575, + "loss": 0.6641, + "step": 2045 + }, + { + "epoch": 0.18293991416309013, + "grad_norm": 0.11501850512645463, + "learning_rate": 0.00018798651945125153, + "loss": 0.647, + "step": 2046 + }, + { + "epoch": 0.18302932761087268, + "grad_norm": 0.12416828492058143, + "learning_rate": 0.00018797275324142242, + "loss": 0.6982, + "step": 2047 + }, + { + "epoch": 0.18311874105865522, + "grad_norm": 0.10844323868759867, + "learning_rate": 0.00018795897965342474, + "loss": 0.6901, + "step": 2048 + }, + { + "epoch": 0.18320815450643776, + "grad_norm": 0.12128282846061603, + "learning_rate": 0.00018794519868841367, + "loss": 0.6954, + "step": 2049 + }, + { + "epoch": 0.1832975679542203, + "grad_norm": 0.1186363219167583, + "learning_rate": 0.000187931410347545, + "loss": 0.6759, + "step": 2050 + }, + { + "epoch": 0.18338698140200285, + "grad_norm": 0.11620477714094808, + "learning_rate": 0.00018791761463197513, + "loss": 0.6819, + "step": 2051 + }, + { + "epoch": 0.1834763948497854, + "grad_norm": 0.12160882217245526, + "learning_rate": 0.00018790381154286113, + "loss": 0.6699, + "step": 2052 + }, + { + "epoch": 0.18356580829756797, + "grad_norm": 0.12845371931774371, + "learning_rate": 0.00018789000108136058, + "loss": 0.705, + "step": 2053 + }, + { + "epoch": 0.1836552217453505, + "grad_norm": 0.11735233436148218, + "learning_rate": 0.0001878761832486318, + "loss": 0.691, + "step": 2054 + }, + { + "epoch": 0.18374463519313305, + "grad_norm": 0.11490119736626715, + "learning_rate": 0.00018786235804583366, + "loss": 0.6921, + "step": 2055 + }, + { + "epoch": 0.1838340486409156, + "grad_norm": 0.11443207239305431, + "learning_rate": 0.00018784852547412565, + "loss": 0.7299, + "step": 2056 + }, + { + "epoch": 0.18392346208869814, + "grad_norm": 0.1127106969922017, + "learning_rate": 0.0001878346855346679, + "loss": 0.7026, + "step": 2057 + }, + { + "epoch": 0.18401287553648069, + "grad_norm": 0.11042892999479735, + "learning_rate": 0.00018782083822862114, + "loss": 0.6669, + "step": 2058 + }, + { + "epoch": 0.18410228898426323, + "grad_norm": 0.12871783881158633, + "learning_rate": 0.0001878069835571468, + "loss": 0.684, + "step": 2059 + }, + { + "epoch": 0.18419170243204577, + "grad_norm": 0.10982815813649253, + "learning_rate": 0.00018779312152140674, + "loss": 0.6847, + "step": 2060 + }, + { + "epoch": 0.18428111587982832, + "grad_norm": 0.1104528957812208, + "learning_rate": 0.0001877792521225636, + "loss": 0.6786, + "step": 2061 + }, + { + "epoch": 0.18437052932761086, + "grad_norm": 0.13375552889307282, + "learning_rate": 0.00018776537536178064, + "loss": 0.6684, + "step": 2062 + }, + { + "epoch": 0.18445994277539343, + "grad_norm": 0.11988361920787713, + "learning_rate": 0.00018775149124022162, + "loss": 0.6905, + "step": 2063 + }, + { + "epoch": 0.18454935622317598, + "grad_norm": 0.12034287858638079, + "learning_rate": 0.00018773759975905098, + "loss": 0.672, + "step": 2064 + }, + { + "epoch": 0.18463876967095852, + "grad_norm": 0.1109431897707632, + "learning_rate": 0.00018772370091943384, + "loss": 0.6818, + "step": 2065 + }, + { + "epoch": 0.18472818311874106, + "grad_norm": 0.12273617331071669, + "learning_rate": 0.00018770979472253581, + "loss": 0.7004, + "step": 2066 + }, + { + "epoch": 0.1848175965665236, + "grad_norm": 0.11820812375247103, + "learning_rate": 0.0001876958811695233, + "loss": 0.6555, + "step": 2067 + }, + { + "epoch": 0.18490701001430615, + "grad_norm": 0.12547665777635184, + "learning_rate": 0.00018768196026156306, + "loss": 0.6732, + "step": 2068 + }, + { + "epoch": 0.1849964234620887, + "grad_norm": 0.11770621868322309, + "learning_rate": 0.00018766803199982273, + "loss": 0.6473, + "step": 2069 + }, + { + "epoch": 0.18508583690987124, + "grad_norm": 0.13903803222838246, + "learning_rate": 0.00018765409638547048, + "loss": 0.7056, + "step": 2070 + }, + { + "epoch": 0.18517525035765378, + "grad_norm": 0.12832016033261942, + "learning_rate": 0.00018764015341967498, + "loss": 0.6515, + "step": 2071 + }, + { + "epoch": 0.18526466380543632, + "grad_norm": 0.12103253199572962, + "learning_rate": 0.00018762620310360567, + "loss": 0.6821, + "step": 2072 + }, + { + "epoch": 0.1853540772532189, + "grad_norm": 0.13061424720244355, + "learning_rate": 0.00018761224543843255, + "loss": 0.7409, + "step": 2073 + }, + { + "epoch": 0.18544349070100144, + "grad_norm": 0.1277625885521388, + "learning_rate": 0.00018759828042532616, + "loss": 0.6384, + "step": 2074 + }, + { + "epoch": 0.18553290414878398, + "grad_norm": 0.11362524228709617, + "learning_rate": 0.00018758430806545783, + "loss": 0.7124, + "step": 2075 + }, + { + "epoch": 0.18562231759656653, + "grad_norm": 0.13086367078003752, + "learning_rate": 0.00018757032835999931, + "loss": 0.7121, + "step": 2076 + }, + { + "epoch": 0.18571173104434907, + "grad_norm": 0.12120408496197059, + "learning_rate": 0.00018755634131012317, + "loss": 0.7092, + "step": 2077 + }, + { + "epoch": 0.18580114449213161, + "grad_norm": 0.13288919808903055, + "learning_rate": 0.00018754234691700238, + "loss": 0.7479, + "step": 2078 + }, + { + "epoch": 0.18589055793991416, + "grad_norm": 0.1225856718603501, + "learning_rate": 0.00018752834518181072, + "loss": 0.6872, + "step": 2079 + }, + { + "epoch": 0.1859799713876967, + "grad_norm": 0.13153427484420113, + "learning_rate": 0.00018751433610572242, + "loss": 0.7216, + "step": 2080 + }, + { + "epoch": 0.18606938483547925, + "grad_norm": 0.12615912820348812, + "learning_rate": 0.00018750031968991243, + "loss": 0.7063, + "step": 2081 + }, + { + "epoch": 0.1861587982832618, + "grad_norm": 0.13967729986839328, + "learning_rate": 0.00018748629593555633, + "loss": 0.6988, + "step": 2082 + }, + { + "epoch": 0.18624821173104436, + "grad_norm": 0.11185386969861533, + "learning_rate": 0.00018747226484383024, + "loss": 0.6594, + "step": 2083 + }, + { + "epoch": 0.1863376251788269, + "grad_norm": 0.12003040693991801, + "learning_rate": 0.00018745822641591094, + "loss": 0.6995, + "step": 2084 + }, + { + "epoch": 0.18642703862660945, + "grad_norm": 0.11609469574527988, + "learning_rate": 0.00018744418065297583, + "loss": 0.6921, + "step": 2085 + }, + { + "epoch": 0.186516452074392, + "grad_norm": 0.11933863143986223, + "learning_rate": 0.00018743012755620286, + "loss": 0.6747, + "step": 2086 + }, + { + "epoch": 0.18660586552217454, + "grad_norm": 0.11786986474034065, + "learning_rate": 0.0001874160671267707, + "loss": 0.7025, + "step": 2087 + }, + { + "epoch": 0.18669527896995708, + "grad_norm": 0.1083664097898319, + "learning_rate": 0.00018740199936585853, + "loss": 0.685, + "step": 2088 + }, + { + "epoch": 0.18678469241773962, + "grad_norm": 0.11495376573259918, + "learning_rate": 0.00018738792427464625, + "loss": 0.6752, + "step": 2089 + }, + { + "epoch": 0.18687410586552217, + "grad_norm": 0.11184687359638107, + "learning_rate": 0.00018737384185431432, + "loss": 0.6738, + "step": 2090 + }, + { + "epoch": 0.1869635193133047, + "grad_norm": 0.1267832419504366, + "learning_rate": 0.00018735975210604376, + "loss": 0.7143, + "step": 2091 + }, + { + "epoch": 0.18705293276108725, + "grad_norm": 0.12540962397052063, + "learning_rate": 0.00018734565503101636, + "loss": 0.7293, + "step": 2092 + }, + { + "epoch": 0.18714234620886983, + "grad_norm": 0.1070991845772717, + "learning_rate": 0.0001873315506304143, + "loss": 0.642, + "step": 2093 + }, + { + "epoch": 0.18723175965665237, + "grad_norm": 0.10801694345445546, + "learning_rate": 0.00018731743890542058, + "loss": 0.6767, + "step": 2094 + }, + { + "epoch": 0.1873211731044349, + "grad_norm": 0.11792159562690895, + "learning_rate": 0.0001873033198572187, + "loss": 0.6791, + "step": 2095 + }, + { + "epoch": 0.18741058655221746, + "grad_norm": 0.1232789800701935, + "learning_rate": 0.00018728919348699283, + "loss": 0.7122, + "step": 2096 + }, + { + "epoch": 0.1875, + "grad_norm": 0.1183157436126485, + "learning_rate": 0.0001872750597959277, + "loss": 0.7149, + "step": 2097 + }, + { + "epoch": 0.18758941344778254, + "grad_norm": 0.1231493022171833, + "learning_rate": 0.00018726091878520871, + "loss": 0.6773, + "step": 2098 + }, + { + "epoch": 0.1876788268955651, + "grad_norm": 0.1119179330750041, + "learning_rate": 0.00018724677045602186, + "loss": 0.663, + "step": 2099 + }, + { + "epoch": 0.18776824034334763, + "grad_norm": 0.10928508617333373, + "learning_rate": 0.00018723261480955373, + "loss": 0.687, + "step": 2100 + }, + { + "epoch": 0.18785765379113017, + "grad_norm": 0.11339193021513519, + "learning_rate": 0.00018721845184699158, + "loss": 0.6501, + "step": 2101 + }, + { + "epoch": 0.18794706723891275, + "grad_norm": 0.11590360659883586, + "learning_rate": 0.00018720428156952316, + "loss": 0.6375, + "step": 2102 + }, + { + "epoch": 0.1880364806866953, + "grad_norm": 0.1306446613205442, + "learning_rate": 0.00018719010397833698, + "loss": 0.68, + "step": 2103 + }, + { + "epoch": 0.18812589413447783, + "grad_norm": 0.11422644780910532, + "learning_rate": 0.00018717591907462208, + "loss": 0.6481, + "step": 2104 + }, + { + "epoch": 0.18821530758226038, + "grad_norm": 0.12859042446754562, + "learning_rate": 0.00018716172685956815, + "loss": 0.6708, + "step": 2105 + }, + { + "epoch": 0.18830472103004292, + "grad_norm": 0.10251819247864878, + "learning_rate": 0.0001871475273343654, + "loss": 0.651, + "step": 2106 + }, + { + "epoch": 0.18839413447782546, + "grad_norm": 0.12647376732960208, + "learning_rate": 0.00018713332050020482, + "loss": 0.7055, + "step": 2107 + }, + { + "epoch": 0.188483547925608, + "grad_norm": 0.10559814106559, + "learning_rate": 0.00018711910635827787, + "loss": 0.6532, + "step": 2108 + }, + { + "epoch": 0.18857296137339055, + "grad_norm": 0.1303032157479212, + "learning_rate": 0.0001871048849097767, + "loss": 0.7237, + "step": 2109 + }, + { + "epoch": 0.1886623748211731, + "grad_norm": 0.11970076997546941, + "learning_rate": 0.000187090656155894, + "loss": 0.6877, + "step": 2110 + }, + { + "epoch": 0.18875178826895564, + "grad_norm": 0.11173992880884068, + "learning_rate": 0.00018707642009782317, + "loss": 0.7194, + "step": 2111 + }, + { + "epoch": 0.1888412017167382, + "grad_norm": 0.13165705552380683, + "learning_rate": 0.00018706217673675811, + "loss": 0.7307, + "step": 2112 + }, + { + "epoch": 0.18893061516452075, + "grad_norm": 0.12430856070682518, + "learning_rate": 0.00018704792607389346, + "loss": 0.7213, + "step": 2113 + }, + { + "epoch": 0.1890200286123033, + "grad_norm": 0.11568939162774261, + "learning_rate": 0.00018703366811042438, + "loss": 0.646, + "step": 2114 + }, + { + "epoch": 0.18910944206008584, + "grad_norm": 0.11781636231140726, + "learning_rate": 0.00018701940284754665, + "loss": 0.6913, + "step": 2115 + }, + { + "epoch": 0.18919885550786839, + "grad_norm": 0.1017758151514641, + "learning_rate": 0.00018700513028645672, + "loss": 0.6771, + "step": 2116 + }, + { + "epoch": 0.18928826895565093, + "grad_norm": 0.10735530182905544, + "learning_rate": 0.00018699085042835157, + "loss": 0.6662, + "step": 2117 + }, + { + "epoch": 0.18937768240343347, + "grad_norm": 0.1113008984510174, + "learning_rate": 0.00018697656327442888, + "loss": 0.5866, + "step": 2118 + }, + { + "epoch": 0.18946709585121602, + "grad_norm": 0.115767583274479, + "learning_rate": 0.00018696226882588683, + "loss": 0.6844, + "step": 2119 + }, + { + "epoch": 0.18955650929899856, + "grad_norm": 0.11971736209371409, + "learning_rate": 0.00018694796708392436, + "loss": 0.6707, + "step": 2120 + }, + { + "epoch": 0.1896459227467811, + "grad_norm": 0.12105342666095671, + "learning_rate": 0.00018693365804974086, + "loss": 0.6999, + "step": 2121 + }, + { + "epoch": 0.18973533619456368, + "grad_norm": 0.11583561757441754, + "learning_rate": 0.00018691934172453646, + "loss": 0.685, + "step": 2122 + }, + { + "epoch": 0.18982474964234622, + "grad_norm": 0.1281379619238494, + "learning_rate": 0.00018690501810951182, + "loss": 0.679, + "step": 2123 + }, + { + "epoch": 0.18991416309012876, + "grad_norm": 0.1262967759353257, + "learning_rate": 0.0001868906872058683, + "loss": 0.6557, + "step": 2124 + }, + { + "epoch": 0.1900035765379113, + "grad_norm": 0.1079695887753425, + "learning_rate": 0.00018687634901480777, + "loss": 0.6631, + "step": 2125 + }, + { + "epoch": 0.19009298998569385, + "grad_norm": 0.12862511664560072, + "learning_rate": 0.00018686200353753275, + "loss": 0.6854, + "step": 2126 + }, + { + "epoch": 0.1901824034334764, + "grad_norm": 0.10749867665508936, + "learning_rate": 0.00018684765077524643, + "loss": 0.6867, + "step": 2127 + }, + { + "epoch": 0.19027181688125894, + "grad_norm": 0.12075004434841523, + "learning_rate": 0.00018683329072915252, + "loss": 0.6871, + "step": 2128 + }, + { + "epoch": 0.19036123032904148, + "grad_norm": 0.1194117544966717, + "learning_rate": 0.00018681892340045538, + "loss": 0.7227, + "step": 2129 + }, + { + "epoch": 0.19045064377682402, + "grad_norm": 0.12061315034097078, + "learning_rate": 0.00018680454879035997, + "loss": 0.7022, + "step": 2130 + }, + { + "epoch": 0.19054005722460657, + "grad_norm": 0.1290954126968377, + "learning_rate": 0.0001867901669000719, + "loss": 0.652, + "step": 2131 + }, + { + "epoch": 0.19062947067238914, + "grad_norm": 0.12254065927991185, + "learning_rate": 0.00018677577773079733, + "loss": 0.6906, + "step": 2132 + }, + { + "epoch": 0.19071888412017168, + "grad_norm": 0.12344326557973458, + "learning_rate": 0.00018676138128374313, + "loss": 0.6944, + "step": 2133 + }, + { + "epoch": 0.19080829756795423, + "grad_norm": 0.1319494127712004, + "learning_rate": 0.0001867469775601166, + "loss": 0.7042, + "step": 2134 + }, + { + "epoch": 0.19089771101573677, + "grad_norm": 0.12264260751561602, + "learning_rate": 0.00018673256656112584, + "loss": 0.6845, + "step": 2135 + }, + { + "epoch": 0.19098712446351931, + "grad_norm": 0.12170861152194962, + "learning_rate": 0.0001867181482879795, + "loss": 0.7049, + "step": 2136 + }, + { + "epoch": 0.19107653791130186, + "grad_norm": 0.13017882312807985, + "learning_rate": 0.00018670372274188677, + "loss": 0.6233, + "step": 2137 + }, + { + "epoch": 0.1911659513590844, + "grad_norm": 0.12052699648264499, + "learning_rate": 0.00018668928992405755, + "loss": 0.6543, + "step": 2138 + }, + { + "epoch": 0.19125536480686695, + "grad_norm": 0.12917098401331858, + "learning_rate": 0.00018667484983570223, + "loss": 0.6984, + "step": 2139 + }, + { + "epoch": 0.1913447782546495, + "grad_norm": 0.12604623713005236, + "learning_rate": 0.00018666040247803195, + "loss": 0.7107, + "step": 2140 + }, + { + "epoch": 0.19143419170243203, + "grad_norm": 0.11759523290111637, + "learning_rate": 0.0001866459478522584, + "loss": 0.6264, + "step": 2141 + }, + { + "epoch": 0.1915236051502146, + "grad_norm": 0.10872270948193456, + "learning_rate": 0.0001866314859595938, + "loss": 0.6841, + "step": 2142 + }, + { + "epoch": 0.19161301859799715, + "grad_norm": 0.10984727434690364, + "learning_rate": 0.00018661701680125115, + "loss": 0.6653, + "step": 2143 + }, + { + "epoch": 0.1917024320457797, + "grad_norm": 0.12733787761182333, + "learning_rate": 0.00018660254037844388, + "loss": 0.73, + "step": 2144 + }, + { + "epoch": 0.19179184549356224, + "grad_norm": 0.13120876075848895, + "learning_rate": 0.00018658805669238612, + "loss": 0.7123, + "step": 2145 + }, + { + "epoch": 0.19188125894134478, + "grad_norm": 0.11524233520925732, + "learning_rate": 0.00018657356574429266, + "loss": 0.6679, + "step": 2146 + }, + { + "epoch": 0.19197067238912732, + "grad_norm": 0.12980913338348965, + "learning_rate": 0.00018655906753537878, + "loss": 0.6786, + "step": 2147 + }, + { + "epoch": 0.19206008583690987, + "grad_norm": 0.11897778543833908, + "learning_rate": 0.00018654456206686042, + "loss": 0.6781, + "step": 2148 + }, + { + "epoch": 0.1921494992846924, + "grad_norm": 0.11911374355221238, + "learning_rate": 0.00018653004933995418, + "loss": 0.7033, + "step": 2149 + }, + { + "epoch": 0.19223891273247495, + "grad_norm": 0.11991863669389116, + "learning_rate": 0.00018651552935587717, + "loss": 0.6885, + "step": 2150 + }, + { + "epoch": 0.1923283261802575, + "grad_norm": 0.12591960422387016, + "learning_rate": 0.00018650100211584723, + "loss": 0.6616, + "step": 2151 + }, + { + "epoch": 0.19241773962804007, + "grad_norm": 0.12953244840377454, + "learning_rate": 0.00018648646762108273, + "loss": 0.6186, + "step": 2152 + }, + { + "epoch": 0.1925071530758226, + "grad_norm": 0.11893142185678529, + "learning_rate": 0.0001864719258728026, + "loss": 0.6772, + "step": 2153 + }, + { + "epoch": 0.19259656652360516, + "grad_norm": 0.1304352364899234, + "learning_rate": 0.0001864573768722265, + "loss": 0.6844, + "step": 2154 + }, + { + "epoch": 0.1926859799713877, + "grad_norm": 0.11084699935200316, + "learning_rate": 0.0001864428206205746, + "loss": 0.6736, + "step": 2155 + }, + { + "epoch": 0.19277539341917024, + "grad_norm": 0.11445340194007211, + "learning_rate": 0.00018642825711906772, + "loss": 0.6831, + "step": 2156 + }, + { + "epoch": 0.1928648068669528, + "grad_norm": 0.11403287378857185, + "learning_rate": 0.00018641368636892734, + "loss": 0.652, + "step": 2157 + }, + { + "epoch": 0.19295422031473533, + "grad_norm": 0.13352877907024166, + "learning_rate": 0.00018639910837137542, + "loss": 0.7251, + "step": 2158 + }, + { + "epoch": 0.19304363376251787, + "grad_norm": 0.12855667584132138, + "learning_rate": 0.0001863845231276346, + "loss": 0.6491, + "step": 2159 + }, + { + "epoch": 0.19313304721030042, + "grad_norm": 0.1981265058057005, + "learning_rate": 0.0001863699306389282, + "loss": 0.7194, + "step": 2160 + }, + { + "epoch": 0.193222460658083, + "grad_norm": 0.11716420718609553, + "learning_rate": 0.00018635533090647998, + "loss": 0.6665, + "step": 2161 + }, + { + "epoch": 0.19331187410586553, + "grad_norm": 0.13286567639878333, + "learning_rate": 0.00018634072393151446, + "loss": 0.6934, + "step": 2162 + }, + { + "epoch": 0.19340128755364808, + "grad_norm": 0.1166301191588948, + "learning_rate": 0.00018632610971525671, + "loss": 0.6743, + "step": 2163 + }, + { + "epoch": 0.19349070100143062, + "grad_norm": 0.1361672067627895, + "learning_rate": 0.00018631148825893238, + "loss": 0.6977, + "step": 2164 + }, + { + "epoch": 0.19358011444921316, + "grad_norm": 0.1140843210102678, + "learning_rate": 0.00018629685956376779, + "loss": 0.6835, + "step": 2165 + }, + { + "epoch": 0.1936695278969957, + "grad_norm": 0.12901529810856366, + "learning_rate": 0.0001862822236309898, + "loss": 0.7229, + "step": 2166 + }, + { + "epoch": 0.19375894134477825, + "grad_norm": 0.13239274607165497, + "learning_rate": 0.0001862675804618259, + "loss": 0.6937, + "step": 2167 + }, + { + "epoch": 0.1938483547925608, + "grad_norm": 0.12632361295817945, + "learning_rate": 0.00018625293005750424, + "loss": 0.6847, + "step": 2168 + }, + { + "epoch": 0.19393776824034334, + "grad_norm": 0.12890659637508362, + "learning_rate": 0.00018623827241925347, + "loss": 0.6986, + "step": 2169 + }, + { + "epoch": 0.19402718168812588, + "grad_norm": 0.12122770631754301, + "learning_rate": 0.000186223607548303, + "loss": 0.7103, + "step": 2170 + }, + { + "epoch": 0.19411659513590845, + "grad_norm": 0.1310607223554355, + "learning_rate": 0.00018620893544588264, + "loss": 0.6845, + "step": 2171 + }, + { + "epoch": 0.194206008583691, + "grad_norm": 0.1211278342444622, + "learning_rate": 0.00018619425611322298, + "loss": 0.6575, + "step": 2172 + }, + { + "epoch": 0.19429542203147354, + "grad_norm": 0.1232510251758791, + "learning_rate": 0.0001861795695515552, + "loss": 0.7111, + "step": 2173 + }, + { + "epoch": 0.19438483547925609, + "grad_norm": 0.1224403284265881, + "learning_rate": 0.00018616487576211092, + "loss": 0.7298, + "step": 2174 + }, + { + "epoch": 0.19447424892703863, + "grad_norm": 0.11863386021251275, + "learning_rate": 0.00018615017474612265, + "loss": 0.7055, + "step": 2175 + }, + { + "epoch": 0.19456366237482117, + "grad_norm": 0.12455053735854373, + "learning_rate": 0.00018613546650482322, + "loss": 0.7058, + "step": 2176 + }, + { + "epoch": 0.19465307582260372, + "grad_norm": 0.13332922405725472, + "learning_rate": 0.00018612075103944625, + "loss": 0.7286, + "step": 2177 + }, + { + "epoch": 0.19474248927038626, + "grad_norm": 0.11945933212484482, + "learning_rate": 0.00018610602835122592, + "loss": 0.6662, + "step": 2178 + }, + { + "epoch": 0.1948319027181688, + "grad_norm": 0.1284481237750627, + "learning_rate": 0.00018609129844139697, + "loss": 0.7272, + "step": 2179 + }, + { + "epoch": 0.19492131616595135, + "grad_norm": 0.10778586056952096, + "learning_rate": 0.00018607656131119476, + "loss": 0.6725, + "step": 2180 + }, + { + "epoch": 0.19501072961373392, + "grad_norm": 0.11065975132299138, + "learning_rate": 0.00018606181696185535, + "loss": 0.6914, + "step": 2181 + }, + { + "epoch": 0.19510014306151646, + "grad_norm": 0.12306201971547163, + "learning_rate": 0.00018604706539461526, + "loss": 0.6792, + "step": 2182 + }, + { + "epoch": 0.195189556509299, + "grad_norm": 0.12436262785251939, + "learning_rate": 0.00018603230661071174, + "loss": 0.7062, + "step": 2183 + }, + { + "epoch": 0.19527896995708155, + "grad_norm": 0.1189131734136166, + "learning_rate": 0.00018601754061138256, + "loss": 0.695, + "step": 2184 + }, + { + "epoch": 0.1953683834048641, + "grad_norm": 0.12982708737694804, + "learning_rate": 0.00018600276739786612, + "loss": 0.6803, + "step": 2185 + }, + { + "epoch": 0.19545779685264664, + "grad_norm": 0.11395209983962551, + "learning_rate": 0.00018598798697140145, + "loss": 0.672, + "step": 2186 + }, + { + "epoch": 0.19554721030042918, + "grad_norm": 0.11921809430408047, + "learning_rate": 0.00018597319933322815, + "loss": 0.6652, + "step": 2187 + }, + { + "epoch": 0.19563662374821172, + "grad_norm": 0.10489284975438917, + "learning_rate": 0.0001859584044845865, + "loss": 0.6403, + "step": 2188 + }, + { + "epoch": 0.19572603719599427, + "grad_norm": 0.12649240044518012, + "learning_rate": 0.0001859436024267172, + "loss": 0.7052, + "step": 2189 + }, + { + "epoch": 0.1958154506437768, + "grad_norm": 0.1040900230497204, + "learning_rate": 0.0001859287931608618, + "loss": 0.5783, + "step": 2190 + }, + { + "epoch": 0.19590486409155938, + "grad_norm": 0.1316662203649631, + "learning_rate": 0.00018591397668826228, + "loss": 0.7138, + "step": 2191 + }, + { + "epoch": 0.19599427753934193, + "grad_norm": 0.11326440295103513, + "learning_rate": 0.0001858991530101613, + "loss": 0.6695, + "step": 2192 + }, + { + "epoch": 0.19608369098712447, + "grad_norm": 0.12273901986119538, + "learning_rate": 0.00018588432212780212, + "loss": 0.6668, + "step": 2193 + }, + { + "epoch": 0.19617310443490701, + "grad_norm": 0.11882168756101888, + "learning_rate": 0.00018586948404242853, + "loss": 0.6762, + "step": 2194 + }, + { + "epoch": 0.19626251788268956, + "grad_norm": 0.1428105748931433, + "learning_rate": 0.00018585463875528505, + "loss": 0.6817, + "step": 2195 + }, + { + "epoch": 0.1963519313304721, + "grad_norm": 0.11777067732843254, + "learning_rate": 0.00018583978626761667, + "loss": 0.7033, + "step": 2196 + }, + { + "epoch": 0.19644134477825465, + "grad_norm": 0.13092516140310675, + "learning_rate": 0.00018582492658066909, + "loss": 0.7344, + "step": 2197 + }, + { + "epoch": 0.1965307582260372, + "grad_norm": 0.11338738687221712, + "learning_rate": 0.00018581005969568856, + "loss": 0.6471, + "step": 2198 + }, + { + "epoch": 0.19662017167381973, + "grad_norm": 0.13220188455812143, + "learning_rate": 0.00018579518561392198, + "loss": 0.7314, + "step": 2199 + }, + { + "epoch": 0.19670958512160228, + "grad_norm": 0.13054044122770883, + "learning_rate": 0.00018578030433661678, + "loss": 0.7007, + "step": 2200 + }, + { + "epoch": 0.19679899856938485, + "grad_norm": 0.12928924887374674, + "learning_rate": 0.00018576541586502106, + "loss": 0.741, + "step": 2201 + }, + { + "epoch": 0.1968884120171674, + "grad_norm": 0.11371283379493236, + "learning_rate": 0.00018575052020038352, + "loss": 0.6886, + "step": 2202 + }, + { + "epoch": 0.19697782546494993, + "grad_norm": 0.10697451120217319, + "learning_rate": 0.00018573561734395338, + "loss": 0.5829, + "step": 2203 + }, + { + "epoch": 0.19706723891273248, + "grad_norm": 0.11466390792754241, + "learning_rate": 0.0001857207072969805, + "loss": 0.7, + "step": 2204 + }, + { + "epoch": 0.19715665236051502, + "grad_norm": 0.1212697210967295, + "learning_rate": 0.0001857057900607155, + "loss": 0.6906, + "step": 2205 + }, + { + "epoch": 0.19724606580829757, + "grad_norm": 0.1076747897601777, + "learning_rate": 0.0001856908656364094, + "loss": 0.6657, + "step": 2206 + }, + { + "epoch": 0.1973354792560801, + "grad_norm": 0.11097609401714933, + "learning_rate": 0.00018567593402531385, + "loss": 0.6713, + "step": 2207 + }, + { + "epoch": 0.19742489270386265, + "grad_norm": 0.12054158556864357, + "learning_rate": 0.00018566099522868119, + "loss": 0.705, + "step": 2208 + }, + { + "epoch": 0.1975143061516452, + "grad_norm": 0.13844712213595026, + "learning_rate": 0.00018564604924776432, + "loss": 0.7512, + "step": 2209 + }, + { + "epoch": 0.19760371959942774, + "grad_norm": 0.12145851049565923, + "learning_rate": 0.00018563109608381675, + "loss": 0.7021, + "step": 2210 + }, + { + "epoch": 0.1976931330472103, + "grad_norm": 0.12024061068919752, + "learning_rate": 0.00018561613573809253, + "loss": 0.6486, + "step": 2211 + }, + { + "epoch": 0.19778254649499286, + "grad_norm": 0.12800121256271968, + "learning_rate": 0.00018560116821184642, + "loss": 0.7374, + "step": 2212 + }, + { + "epoch": 0.1978719599427754, + "grad_norm": 0.11209671647668772, + "learning_rate": 0.0001855861935063337, + "loss": 0.6389, + "step": 2213 + }, + { + "epoch": 0.19796137339055794, + "grad_norm": 0.11154633083453865, + "learning_rate": 0.00018557121162281033, + "loss": 0.6549, + "step": 2214 + }, + { + "epoch": 0.1980507868383405, + "grad_norm": 0.1251854280047357, + "learning_rate": 0.00018555622256253274, + "loss": 0.6978, + "step": 2215 + }, + { + "epoch": 0.19814020028612303, + "grad_norm": 0.11192937243513484, + "learning_rate": 0.00018554122632675815, + "loss": 0.6662, + "step": 2216 + }, + { + "epoch": 0.19822961373390557, + "grad_norm": 0.10282887954071608, + "learning_rate": 0.00018552622291674416, + "loss": 0.6771, + "step": 2217 + }, + { + "epoch": 0.19831902718168812, + "grad_norm": 0.11795034738429434, + "learning_rate": 0.00018551121233374915, + "loss": 0.6968, + "step": 2218 + }, + { + "epoch": 0.19840844062947066, + "grad_norm": 0.10980069497823985, + "learning_rate": 0.00018549619457903206, + "loss": 0.6626, + "step": 2219 + }, + { + "epoch": 0.1984978540772532, + "grad_norm": 0.10845027343771943, + "learning_rate": 0.00018548116965385236, + "loss": 0.6776, + "step": 2220 + }, + { + "epoch": 0.19858726752503578, + "grad_norm": 0.12382614742084734, + "learning_rate": 0.0001854661375594702, + "loss": 0.7085, + "step": 2221 + }, + { + "epoch": 0.19867668097281832, + "grad_norm": 0.10587376567320682, + "learning_rate": 0.0001854510982971463, + "loss": 0.5587, + "step": 2222 + }, + { + "epoch": 0.19876609442060086, + "grad_norm": 0.10923098768363364, + "learning_rate": 0.000185436051868142, + "loss": 0.667, + "step": 2223 + }, + { + "epoch": 0.1988555078683834, + "grad_norm": 0.10277916253369847, + "learning_rate": 0.0001854209982737192, + "loss": 0.653, + "step": 2224 + }, + { + "epoch": 0.19894492131616595, + "grad_norm": 0.1251767936891544, + "learning_rate": 0.00018540593751514042, + "loss": 0.7031, + "step": 2225 + }, + { + "epoch": 0.1990343347639485, + "grad_norm": 0.12520653392159675, + "learning_rate": 0.00018539086959366881, + "loss": 0.6674, + "step": 2226 + }, + { + "epoch": 0.19912374821173104, + "grad_norm": 0.12945494766203605, + "learning_rate": 0.00018537579451056811, + "loss": 0.6652, + "step": 2227 + }, + { + "epoch": 0.19921316165951358, + "grad_norm": 0.09711565431225597, + "learning_rate": 0.00018536071226710267, + "loss": 0.5161, + "step": 2228 + }, + { + "epoch": 0.19930257510729613, + "grad_norm": 0.12094243213978034, + "learning_rate": 0.0001853456228645373, + "loss": 0.6822, + "step": 2229 + }, + { + "epoch": 0.1993919885550787, + "grad_norm": 0.1333396876582595, + "learning_rate": 0.00018533052630413766, + "loss": 0.6745, + "step": 2230 + }, + { + "epoch": 0.19948140200286124, + "grad_norm": 0.11061423534807724, + "learning_rate": 0.00018531542258716982, + "loss": 0.6496, + "step": 2231 + }, + { + "epoch": 0.19957081545064378, + "grad_norm": 0.09720926884432914, + "learning_rate": 0.00018530031171490053, + "loss": 0.6933, + "step": 2232 + }, + { + "epoch": 0.19966022889842633, + "grad_norm": 0.11968444875925964, + "learning_rate": 0.0001852851936885971, + "loss": 0.7342, + "step": 2233 + }, + { + "epoch": 0.19974964234620887, + "grad_norm": 0.10467349199719397, + "learning_rate": 0.00018527006850952747, + "loss": 0.6864, + "step": 2234 + }, + { + "epoch": 0.19983905579399142, + "grad_norm": 0.11885861143631692, + "learning_rate": 0.0001852549361789602, + "loss": 0.7198, + "step": 2235 + }, + { + "epoch": 0.19992846924177396, + "grad_norm": 0.13358014553808564, + "learning_rate": 0.00018523979669816438, + "loss": 0.6898, + "step": 2236 + }, + { + "epoch": 0.2000178826895565, + "grad_norm": 0.1273841426163249, + "learning_rate": 0.00018522465006840975, + "loss": 0.7246, + "step": 2237 + }, + { + "epoch": 0.20010729613733905, + "grad_norm": 0.12763970629235155, + "learning_rate": 0.00018520949629096664, + "loss": 0.721, + "step": 2238 + }, + { + "epoch": 0.2001967095851216, + "grad_norm": 0.1250682954398822, + "learning_rate": 0.000185194335367106, + "loss": 0.6858, + "step": 2239 + }, + { + "epoch": 0.20028612303290416, + "grad_norm": 0.11470945435995972, + "learning_rate": 0.0001851791672980993, + "loss": 0.7086, + "step": 2240 + }, + { + "epoch": 0.2003755364806867, + "grad_norm": 0.10643481549874267, + "learning_rate": 0.0001851639920852188, + "loss": 0.6439, + "step": 2241 + }, + { + "epoch": 0.20046494992846925, + "grad_norm": 0.1257467397438871, + "learning_rate": 0.00018514880972973706, + "loss": 0.7018, + "step": 2242 + }, + { + "epoch": 0.2005543633762518, + "grad_norm": 0.10304809439598402, + "learning_rate": 0.0001851336202329275, + "loss": 0.6622, + "step": 2243 + }, + { + "epoch": 0.20064377682403434, + "grad_norm": 0.11736087957428683, + "learning_rate": 0.00018511842359606403, + "loss": 0.731, + "step": 2244 + }, + { + "epoch": 0.20073319027181688, + "grad_norm": 0.13466054089444895, + "learning_rate": 0.00018510321982042116, + "loss": 0.7091, + "step": 2245 + }, + { + "epoch": 0.20082260371959942, + "grad_norm": 0.11404776714896206, + "learning_rate": 0.00018508800890727403, + "loss": 0.5986, + "step": 2246 + }, + { + "epoch": 0.20091201716738197, + "grad_norm": 0.1354416142217492, + "learning_rate": 0.00018507279085789834, + "loss": 0.7344, + "step": 2247 + }, + { + "epoch": 0.2010014306151645, + "grad_norm": 0.10646855377226452, + "learning_rate": 0.00018505756567357046, + "loss": 0.6897, + "step": 2248 + }, + { + "epoch": 0.20109084406294706, + "grad_norm": 0.12402904147675774, + "learning_rate": 0.00018504233335556723, + "loss": 0.7056, + "step": 2249 + }, + { + "epoch": 0.20118025751072963, + "grad_norm": 0.12217625490921491, + "learning_rate": 0.00018502709390516624, + "loss": 0.6864, + "step": 2250 + }, + { + "epoch": 0.20126967095851217, + "grad_norm": 0.135633380617973, + "learning_rate": 0.00018501184732364553, + "loss": 0.6833, + "step": 2251 + }, + { + "epoch": 0.2013590844062947, + "grad_norm": 0.12299833055929717, + "learning_rate": 0.0001849965936122839, + "loss": 0.6664, + "step": 2252 + }, + { + "epoch": 0.20144849785407726, + "grad_norm": 0.1288467344833323, + "learning_rate": 0.00018498133277236058, + "loss": 0.6725, + "step": 2253 + }, + { + "epoch": 0.2015379113018598, + "grad_norm": 0.14086292493110764, + "learning_rate": 0.00018496606480515552, + "loss": 0.7063, + "step": 2254 + }, + { + "epoch": 0.20162732474964234, + "grad_norm": 0.12188852989461166, + "learning_rate": 0.0001849507897119492, + "loss": 0.6795, + "step": 2255 + }, + { + "epoch": 0.2017167381974249, + "grad_norm": 0.11112178903286697, + "learning_rate": 0.00018493550749402278, + "loss": 0.646, + "step": 2256 + }, + { + "epoch": 0.20180615164520743, + "grad_norm": 0.115725379927264, + "learning_rate": 0.0001849202181526579, + "loss": 0.676, + "step": 2257 + }, + { + "epoch": 0.20189556509298998, + "grad_norm": 0.11606537341178427, + "learning_rate": 0.00018490492168913688, + "loss": 0.6813, + "step": 2258 + }, + { + "epoch": 0.20198497854077252, + "grad_norm": 0.1142667046561917, + "learning_rate": 0.00018488961810474264, + "loss": 0.6139, + "step": 2259 + }, + { + "epoch": 0.2020743919885551, + "grad_norm": 0.13338692903406313, + "learning_rate": 0.00018487430740075862, + "loss": 0.7254, + "step": 2260 + }, + { + "epoch": 0.20216380543633763, + "grad_norm": 0.12000185266526117, + "learning_rate": 0.00018485898957846896, + "loss": 0.6631, + "step": 2261 + }, + { + "epoch": 0.20225321888412018, + "grad_norm": 0.11533497830623912, + "learning_rate": 0.0001848436646391583, + "loss": 0.6754, + "step": 2262 + }, + { + "epoch": 0.20234263233190272, + "grad_norm": 0.1250666331927033, + "learning_rate": 0.000184828332584112, + "loss": 0.719, + "step": 2263 + }, + { + "epoch": 0.20243204577968527, + "grad_norm": 0.12524168331327062, + "learning_rate": 0.00018481299341461583, + "loss": 0.6641, + "step": 2264 + }, + { + "epoch": 0.2025214592274678, + "grad_norm": 0.11162413381473944, + "learning_rate": 0.0001847976471319564, + "loss": 0.6813, + "step": 2265 + }, + { + "epoch": 0.20261087267525035, + "grad_norm": 0.13410398273226032, + "learning_rate": 0.00018478229373742065, + "loss": 0.6914, + "step": 2266 + }, + { + "epoch": 0.2027002861230329, + "grad_norm": 0.12253175087338113, + "learning_rate": 0.00018476693323229637, + "loss": 0.687, + "step": 2267 + }, + { + "epoch": 0.20278969957081544, + "grad_norm": 0.1286210305239075, + "learning_rate": 0.00018475156561787172, + "loss": 0.7199, + "step": 2268 + }, + { + "epoch": 0.20287911301859798, + "grad_norm": 0.1147950192026047, + "learning_rate": 0.00018473619089543565, + "loss": 0.6637, + "step": 2269 + }, + { + "epoch": 0.20296852646638056, + "grad_norm": 0.11315987575925207, + "learning_rate": 0.00018472080906627758, + "loss": 0.7129, + "step": 2270 + }, + { + "epoch": 0.2030579399141631, + "grad_norm": 0.12439400014855477, + "learning_rate": 0.00018470542013168757, + "loss": 0.6826, + "step": 2271 + }, + { + "epoch": 0.20314735336194564, + "grad_norm": 0.12337539886680216, + "learning_rate": 0.00018469002409295628, + "loss": 0.741, + "step": 2272 + }, + { + "epoch": 0.2032367668097282, + "grad_norm": 0.11381110650882616, + "learning_rate": 0.00018467462095137494, + "loss": 0.6963, + "step": 2273 + }, + { + "epoch": 0.20332618025751073, + "grad_norm": 0.12091273771395086, + "learning_rate": 0.0001846592107082354, + "loss": 0.6926, + "step": 2274 + }, + { + "epoch": 0.20341559370529327, + "grad_norm": 0.1190495703258839, + "learning_rate": 0.0001846437933648301, + "loss": 0.709, + "step": 2275 + }, + { + "epoch": 0.20350500715307582, + "grad_norm": 0.10575052010104168, + "learning_rate": 0.00018462836892245207, + "loss": 0.704, + "step": 2276 + }, + { + "epoch": 0.20359442060085836, + "grad_norm": 0.10814016328449326, + "learning_rate": 0.00018461293738239495, + "loss": 0.7039, + "step": 2277 + }, + { + "epoch": 0.2036838340486409, + "grad_norm": 0.11096540672509403, + "learning_rate": 0.00018459749874595298, + "loss": 0.6429, + "step": 2278 + }, + { + "epoch": 0.20377324749642345, + "grad_norm": 0.12723868202185687, + "learning_rate": 0.00018458205301442093, + "loss": 0.6985, + "step": 2279 + }, + { + "epoch": 0.20386266094420602, + "grad_norm": 0.1156635709224768, + "learning_rate": 0.00018456660018909425, + "loss": 0.6663, + "step": 2280 + }, + { + "epoch": 0.20395207439198856, + "grad_norm": 0.12355070039830605, + "learning_rate": 0.0001845511402712689, + "loss": 0.735, + "step": 2281 + }, + { + "epoch": 0.2040414878397711, + "grad_norm": 0.10578687118317043, + "learning_rate": 0.0001845356732622416, + "loss": 0.6849, + "step": 2282 + }, + { + "epoch": 0.20413090128755365, + "grad_norm": 0.10660414748820685, + "learning_rate": 0.00018452019916330944, + "loss": 0.6744, + "step": 2283 + }, + { + "epoch": 0.2042203147353362, + "grad_norm": 0.12117556457647968, + "learning_rate": 0.00018450471797577028, + "loss": 0.7116, + "step": 2284 + }, + { + "epoch": 0.20430972818311874, + "grad_norm": 0.1106633795323447, + "learning_rate": 0.00018448922970092243, + "loss": 0.6625, + "step": 2285 + }, + { + "epoch": 0.20439914163090128, + "grad_norm": 0.10595802037176927, + "learning_rate": 0.00018447373434006496, + "loss": 0.7154, + "step": 2286 + }, + { + "epoch": 0.20448855507868383, + "grad_norm": 0.10789576364525215, + "learning_rate": 0.0001844582318944974, + "loss": 0.6629, + "step": 2287 + }, + { + "epoch": 0.20457796852646637, + "grad_norm": 0.1236019284196663, + "learning_rate": 0.0001844427223655199, + "loss": 0.6988, + "step": 2288 + }, + { + "epoch": 0.20466738197424894, + "grad_norm": 0.12676568518844972, + "learning_rate": 0.0001844272057544333, + "loss": 0.6915, + "step": 2289 + }, + { + "epoch": 0.20475679542203148, + "grad_norm": 0.10057235038293874, + "learning_rate": 0.00018441168206253893, + "loss": 0.6216, + "step": 2290 + }, + { + "epoch": 0.20484620886981403, + "grad_norm": 0.1354007895689485, + "learning_rate": 0.00018439615129113866, + "loss": 0.736, + "step": 2291 + }, + { + "epoch": 0.20493562231759657, + "grad_norm": 0.1120349339894616, + "learning_rate": 0.00018438061344153517, + "loss": 0.5784, + "step": 2292 + }, + { + "epoch": 0.20502503576537912, + "grad_norm": 0.13442689021813223, + "learning_rate": 0.0001843650685150315, + "loss": 0.7099, + "step": 2293 + }, + { + "epoch": 0.20511444921316166, + "grad_norm": 0.11466050054051122, + "learning_rate": 0.00018434951651293143, + "loss": 0.593, + "step": 2294 + }, + { + "epoch": 0.2052038626609442, + "grad_norm": 0.12227711287109685, + "learning_rate": 0.0001843339574365393, + "loss": 0.6596, + "step": 2295 + }, + { + "epoch": 0.20529327610872675, + "grad_norm": 0.10180183750037852, + "learning_rate": 0.00018431839128715997, + "loss": 0.6678, + "step": 2296 + }, + { + "epoch": 0.2053826895565093, + "grad_norm": 0.11509668700635703, + "learning_rate": 0.000184302818066099, + "loss": 0.6562, + "step": 2297 + }, + { + "epoch": 0.20547210300429183, + "grad_norm": 0.134321133673698, + "learning_rate": 0.00018428723777466253, + "loss": 0.7099, + "step": 2298 + }, + { + "epoch": 0.2055615164520744, + "grad_norm": 0.14016059371205308, + "learning_rate": 0.0001842716504141572, + "loss": 0.7165, + "step": 2299 + }, + { + "epoch": 0.20565092989985695, + "grad_norm": 0.1261276168882959, + "learning_rate": 0.00018425605598589031, + "loss": 0.6989, + "step": 2300 + }, + { + "epoch": 0.2057403433476395, + "grad_norm": 0.125858322508696, + "learning_rate": 0.00018424045449116978, + "loss": 0.7417, + "step": 2301 + }, + { + "epoch": 0.20582975679542204, + "grad_norm": 0.12399131344495389, + "learning_rate": 0.000184224845931304, + "loss": 0.6699, + "step": 2302 + }, + { + "epoch": 0.20591917024320458, + "grad_norm": 0.12476213423685688, + "learning_rate": 0.0001842092303076022, + "loss": 0.6865, + "step": 2303 + }, + { + "epoch": 0.20600858369098712, + "grad_norm": 0.13033223952942322, + "learning_rate": 0.00018419360762137395, + "loss": 0.7167, + "step": 2304 + }, + { + "epoch": 0.20609799713876967, + "grad_norm": 0.10810301150939355, + "learning_rate": 0.00018417797787392948, + "loss": 0.5837, + "step": 2305 + }, + { + "epoch": 0.2061874105865522, + "grad_norm": 0.10672096279721788, + "learning_rate": 0.00018416234106657963, + "loss": 0.5496, + "step": 2306 + }, + { + "epoch": 0.20627682403433475, + "grad_norm": 0.12746235993910315, + "learning_rate": 0.00018414669720063592, + "loss": 0.6717, + "step": 2307 + }, + { + "epoch": 0.2063662374821173, + "grad_norm": 0.12375293686369311, + "learning_rate": 0.00018413104627741035, + "loss": 0.6476, + "step": 2308 + }, + { + "epoch": 0.20645565092989987, + "grad_norm": 0.12617085474291997, + "learning_rate": 0.00018411538829821552, + "loss": 0.6759, + "step": 2309 + }, + { + "epoch": 0.2065450643776824, + "grad_norm": 0.1314659305705196, + "learning_rate": 0.00018409972326436465, + "loss": 0.7358, + "step": 2310 + }, + { + "epoch": 0.20663447782546496, + "grad_norm": 0.1366447291911751, + "learning_rate": 0.00018408405117717154, + "loss": 0.6966, + "step": 2311 + }, + { + "epoch": 0.2067238912732475, + "grad_norm": 0.1024272192278969, + "learning_rate": 0.00018406837203795067, + "loss": 0.5959, + "step": 2312 + }, + { + "epoch": 0.20681330472103004, + "grad_norm": 0.1218855632124876, + "learning_rate": 0.0001840526858480169, + "loss": 0.6746, + "step": 2313 + }, + { + "epoch": 0.2069027181688126, + "grad_norm": 0.12320917359553904, + "learning_rate": 0.0001840369926086859, + "loss": 0.6694, + "step": 2314 + }, + { + "epoch": 0.20699213161659513, + "grad_norm": 0.12437533646604598, + "learning_rate": 0.00018402129232127383, + "loss": 0.6461, + "step": 2315 + }, + { + "epoch": 0.20708154506437768, + "grad_norm": 0.1348212075494701, + "learning_rate": 0.00018400558498709744, + "loss": 0.6916, + "step": 2316 + }, + { + "epoch": 0.20717095851216022, + "grad_norm": 0.11335241028475351, + "learning_rate": 0.00018398987060747407, + "loss": 0.665, + "step": 2317 + }, + { + "epoch": 0.20726037195994276, + "grad_norm": 0.10310090416637142, + "learning_rate": 0.00018397414918372172, + "loss": 0.6822, + "step": 2318 + }, + { + "epoch": 0.20734978540772533, + "grad_norm": 0.13082438874654095, + "learning_rate": 0.00018395842071715888, + "loss": 0.7349, + "step": 2319 + }, + { + "epoch": 0.20743919885550788, + "grad_norm": 0.11183097052094479, + "learning_rate": 0.00018394268520910466, + "loss": 0.6275, + "step": 2320 + }, + { + "epoch": 0.20752861230329042, + "grad_norm": 0.11497129062762367, + "learning_rate": 0.00018392694266087885, + "loss": 0.6617, + "step": 2321 + }, + { + "epoch": 0.20761802575107297, + "grad_norm": 0.1197453312428705, + "learning_rate": 0.00018391119307380172, + "loss": 0.7073, + "step": 2322 + }, + { + "epoch": 0.2077074391988555, + "grad_norm": 0.12868718296013962, + "learning_rate": 0.00018389543644919414, + "loss": 0.7048, + "step": 2323 + }, + { + "epoch": 0.20779685264663805, + "grad_norm": 0.12479973284195929, + "learning_rate": 0.00018387967278837763, + "loss": 0.7201, + "step": 2324 + }, + { + "epoch": 0.2078862660944206, + "grad_norm": 0.1172714308918151, + "learning_rate": 0.00018386390209267428, + "loss": 0.6997, + "step": 2325 + }, + { + "epoch": 0.20797567954220314, + "grad_norm": 0.12095766454966925, + "learning_rate": 0.00018384812436340672, + "loss": 0.689, + "step": 2326 + }, + { + "epoch": 0.20806509298998568, + "grad_norm": 0.1073121250041038, + "learning_rate": 0.00018383233960189826, + "loss": 0.6621, + "step": 2327 + }, + { + "epoch": 0.20815450643776823, + "grad_norm": 0.10328648457592236, + "learning_rate": 0.0001838165478094727, + "loss": 0.6163, + "step": 2328 + }, + { + "epoch": 0.2082439198855508, + "grad_norm": 0.11697784115328086, + "learning_rate": 0.0001838007489874545, + "loss": 0.6747, + "step": 2329 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.12171024775096759, + "learning_rate": 0.0001837849431371687, + "loss": 0.6853, + "step": 2330 + }, + { + "epoch": 0.2084227467811159, + "grad_norm": 0.12730446292213862, + "learning_rate": 0.0001837691302599409, + "loss": 0.7024, + "step": 2331 + }, + { + "epoch": 0.20851216022889843, + "grad_norm": 0.12843385126860646, + "learning_rate": 0.0001837533103570973, + "loss": 0.7013, + "step": 2332 + }, + { + "epoch": 0.20860157367668097, + "grad_norm": 0.10832542111786186, + "learning_rate": 0.00018373748342996474, + "loss": 0.6892, + "step": 2333 + }, + { + "epoch": 0.20869098712446352, + "grad_norm": 0.1279941659384648, + "learning_rate": 0.00018372164947987054, + "loss": 0.6655, + "step": 2334 + }, + { + "epoch": 0.20878040057224606, + "grad_norm": 0.11091083360910703, + "learning_rate": 0.00018370580850814272, + "loss": 0.6716, + "step": 2335 + }, + { + "epoch": 0.2088698140200286, + "grad_norm": 0.11516059100552747, + "learning_rate": 0.00018368996051610986, + "loss": 0.6543, + "step": 2336 + }, + { + "epoch": 0.20895922746781115, + "grad_norm": 0.12114041633838007, + "learning_rate": 0.00018367410550510104, + "loss": 0.671, + "step": 2337 + }, + { + "epoch": 0.2090486409155937, + "grad_norm": 0.12148768794044165, + "learning_rate": 0.00018365824347644607, + "loss": 0.7015, + "step": 2338 + }, + { + "epoch": 0.20913805436337626, + "grad_norm": 0.1143712850602656, + "learning_rate": 0.00018364237443147525, + "loss": 0.6712, + "step": 2339 + }, + { + "epoch": 0.2092274678111588, + "grad_norm": 0.127845930557767, + "learning_rate": 0.00018362649837151947, + "loss": 0.6944, + "step": 2340 + }, + { + "epoch": 0.20931688125894135, + "grad_norm": 0.1238111229901775, + "learning_rate": 0.0001836106152979103, + "loss": 0.6773, + "step": 2341 + }, + { + "epoch": 0.2094062947067239, + "grad_norm": 0.12365983213538359, + "learning_rate": 0.0001835947252119798, + "loss": 0.6974, + "step": 2342 + }, + { + "epoch": 0.20949570815450644, + "grad_norm": 0.12214855499156109, + "learning_rate": 0.00018357882811506065, + "loss": 0.6681, + "step": 2343 + }, + { + "epoch": 0.20958512160228898, + "grad_norm": 0.12006940744212118, + "learning_rate": 0.00018356292400848611, + "loss": 0.7238, + "step": 2344 + }, + { + "epoch": 0.20967453505007153, + "grad_norm": 0.12406084451798326, + "learning_rate": 0.00018354701289359005, + "loss": 0.694, + "step": 2345 + }, + { + "epoch": 0.20976394849785407, + "grad_norm": 0.1166650848675392, + "learning_rate": 0.00018353109477170696, + "loss": 0.7042, + "step": 2346 + }, + { + "epoch": 0.2098533619456366, + "grad_norm": 0.09851048166617163, + "learning_rate": 0.0001835151696441718, + "loss": 0.6863, + "step": 2347 + }, + { + "epoch": 0.20994277539341916, + "grad_norm": 0.11757766872096803, + "learning_rate": 0.00018349923751232022, + "loss": 0.6675, + "step": 2348 + }, + { + "epoch": 0.21003218884120173, + "grad_norm": 0.13758470675253626, + "learning_rate": 0.00018348329837748843, + "loss": 0.6971, + "step": 2349 + }, + { + "epoch": 0.21012160228898427, + "grad_norm": 0.1260441396217009, + "learning_rate": 0.00018346735224101325, + "loss": 0.6705, + "step": 2350 + }, + { + "epoch": 0.21021101573676682, + "grad_norm": 0.11567868204752162, + "learning_rate": 0.000183451399104232, + "loss": 0.6947, + "step": 2351 + }, + { + "epoch": 0.21030042918454936, + "grad_norm": 0.1264280099343222, + "learning_rate": 0.00018343543896848273, + "loss": 0.675, + "step": 2352 + }, + { + "epoch": 0.2103898426323319, + "grad_norm": 0.11219046748192626, + "learning_rate": 0.00018341947183510393, + "loss": 0.6618, + "step": 2353 + }, + { + "epoch": 0.21047925608011445, + "grad_norm": 0.1305410161887326, + "learning_rate": 0.00018340349770543481, + "loss": 0.698, + "step": 2354 + }, + { + "epoch": 0.210568669527897, + "grad_norm": 0.12019762289187978, + "learning_rate": 0.00018338751658081504, + "loss": 0.6755, + "step": 2355 + }, + { + "epoch": 0.21065808297567953, + "grad_norm": 0.1250423393108627, + "learning_rate": 0.00018337152846258493, + "loss": 0.7154, + "step": 2356 + }, + { + "epoch": 0.21074749642346208, + "grad_norm": 0.11974683272316991, + "learning_rate": 0.00018335553335208546, + "loss": 0.6628, + "step": 2357 + }, + { + "epoch": 0.21083690987124465, + "grad_norm": 0.12535429295934808, + "learning_rate": 0.00018333953125065805, + "loss": 0.7049, + "step": 2358 + }, + { + "epoch": 0.2109263233190272, + "grad_norm": 0.12293006619703642, + "learning_rate": 0.0001833235221596448, + "loss": 0.6485, + "step": 2359 + }, + { + "epoch": 0.21101573676680974, + "grad_norm": 0.12254662338206097, + "learning_rate": 0.00018330750608038844, + "loss": 0.7087, + "step": 2360 + }, + { + "epoch": 0.21110515021459228, + "grad_norm": 0.1039766183776667, + "learning_rate": 0.0001832914830142321, + "loss": 0.6363, + "step": 2361 + }, + { + "epoch": 0.21119456366237482, + "grad_norm": 0.11074128312488234, + "learning_rate": 0.00018327545296251968, + "loss": 0.7097, + "step": 2362 + }, + { + "epoch": 0.21128397711015737, + "grad_norm": 0.11613529229910755, + "learning_rate": 0.00018325941592659553, + "loss": 0.6768, + "step": 2363 + }, + { + "epoch": 0.2113733905579399, + "grad_norm": 0.11542024480372291, + "learning_rate": 0.0001832433719078048, + "loss": 0.7019, + "step": 2364 + }, + { + "epoch": 0.21146280400572245, + "grad_norm": 0.11454927046092067, + "learning_rate": 0.00018322732090749296, + "loss": 0.6481, + "step": 2365 + }, + { + "epoch": 0.211552217453505, + "grad_norm": 0.13742805645693357, + "learning_rate": 0.00018321126292700628, + "loss": 0.6983, + "step": 2366 + }, + { + "epoch": 0.21164163090128754, + "grad_norm": 0.1420853583386238, + "learning_rate": 0.00018319519796769143, + "loss": 0.7251, + "step": 2367 + }, + { + "epoch": 0.2117310443490701, + "grad_norm": 0.12755229115100924, + "learning_rate": 0.0001831791260308958, + "loss": 0.7095, + "step": 2368 + }, + { + "epoch": 0.21182045779685266, + "grad_norm": 0.11204860981318053, + "learning_rate": 0.00018316304711796732, + "loss": 0.6728, + "step": 2369 + }, + { + "epoch": 0.2119098712446352, + "grad_norm": 0.11014426939317652, + "learning_rate": 0.00018314696123025454, + "loss": 0.7025, + "step": 2370 + }, + { + "epoch": 0.21199928469241774, + "grad_norm": 0.1146063431577136, + "learning_rate": 0.0001831308683691065, + "loss": 0.6887, + "step": 2371 + }, + { + "epoch": 0.2120886981402003, + "grad_norm": 0.11387430777939081, + "learning_rate": 0.00018311476853587297, + "loss": 0.7095, + "step": 2372 + }, + { + "epoch": 0.21217811158798283, + "grad_norm": 0.1039573095029593, + "learning_rate": 0.00018309866173190416, + "loss": 0.6391, + "step": 2373 + }, + { + "epoch": 0.21226752503576538, + "grad_norm": 0.13369515039053953, + "learning_rate": 0.00018308254795855095, + "loss": 0.7295, + "step": 2374 + }, + { + "epoch": 0.21235693848354792, + "grad_norm": 0.1095169611874911, + "learning_rate": 0.00018306642721716476, + "loss": 0.6667, + "step": 2375 + }, + { + "epoch": 0.21244635193133046, + "grad_norm": 0.11020240702553846, + "learning_rate": 0.00018305029950909768, + "loss": 0.6504, + "step": 2376 + }, + { + "epoch": 0.212535765379113, + "grad_norm": 0.12436811886569199, + "learning_rate": 0.00018303416483570227, + "loss": 0.6882, + "step": 2377 + }, + { + "epoch": 0.21262517882689558, + "grad_norm": 0.11663952218881564, + "learning_rate": 0.0001830180231983317, + "loss": 0.6814, + "step": 2378 + }, + { + "epoch": 0.21271459227467812, + "grad_norm": 0.12925426759974706, + "learning_rate": 0.00018300187459833981, + "loss": 0.6814, + "step": 2379 + }, + { + "epoch": 0.21280400572246067, + "grad_norm": 0.12590646307117562, + "learning_rate": 0.00018298571903708092, + "loss": 0.6399, + "step": 2380 + }, + { + "epoch": 0.2128934191702432, + "grad_norm": 0.11895921812461122, + "learning_rate": 0.00018296955651591002, + "loss": 0.7063, + "step": 2381 + }, + { + "epoch": 0.21298283261802575, + "grad_norm": 0.11189069938638561, + "learning_rate": 0.00018295338703618258, + "loss": 0.6655, + "step": 2382 + }, + { + "epoch": 0.2130722460658083, + "grad_norm": 0.11612551298286716, + "learning_rate": 0.0001829372105992548, + "loss": 0.655, + "step": 2383 + }, + { + "epoch": 0.21316165951359084, + "grad_norm": 0.12687711148056421, + "learning_rate": 0.00018292102720648333, + "loss": 0.6836, + "step": 2384 + }, + { + "epoch": 0.21325107296137338, + "grad_norm": 0.1270440836419016, + "learning_rate": 0.0001829048368592254, + "loss": 0.6759, + "step": 2385 + }, + { + "epoch": 0.21334048640915593, + "grad_norm": 0.15005223065987813, + "learning_rate": 0.00018288863955883897, + "loss": 0.6818, + "step": 2386 + }, + { + "epoch": 0.21342989985693847, + "grad_norm": 0.12888849816427791, + "learning_rate": 0.00018287243530668243, + "loss": 0.6847, + "step": 2387 + }, + { + "epoch": 0.21351931330472104, + "grad_norm": 0.1202840855905085, + "learning_rate": 0.00018285622410411484, + "loss": 0.6739, + "step": 2388 + }, + { + "epoch": 0.21360872675250359, + "grad_norm": 0.12264189048853992, + "learning_rate": 0.00018284000595249577, + "loss": 0.6705, + "step": 2389 + }, + { + "epoch": 0.21369814020028613, + "grad_norm": 0.12191808485652982, + "learning_rate": 0.00018282378085318545, + "loss": 0.685, + "step": 2390 + }, + { + "epoch": 0.21378755364806867, + "grad_norm": 0.12076581590560698, + "learning_rate": 0.00018280754880754468, + "loss": 0.6601, + "step": 2391 + }, + { + "epoch": 0.21387696709585122, + "grad_norm": 0.1205904843556458, + "learning_rate": 0.0001827913098169348, + "loss": 0.6421, + "step": 2392 + }, + { + "epoch": 0.21396638054363376, + "grad_norm": 0.12363129261487246, + "learning_rate": 0.00018277506388271773, + "loss": 0.7039, + "step": 2393 + }, + { + "epoch": 0.2140557939914163, + "grad_norm": 0.1271873364213158, + "learning_rate": 0.000182758811006256, + "loss": 0.7114, + "step": 2394 + }, + { + "epoch": 0.21414520743919885, + "grad_norm": 0.12261608534417892, + "learning_rate": 0.0001827425511889128, + "loss": 0.7136, + "step": 2395 + }, + { + "epoch": 0.2142346208869814, + "grad_norm": 0.10000610909155579, + "learning_rate": 0.00018272628443205172, + "loss": 0.6692, + "step": 2396 + }, + { + "epoch": 0.21432403433476394, + "grad_norm": 0.10132055693479024, + "learning_rate": 0.00018271001073703706, + "loss": 0.6577, + "step": 2397 + }, + { + "epoch": 0.2144134477825465, + "grad_norm": 0.11482134102283224, + "learning_rate": 0.0001826937301052337, + "loss": 0.6811, + "step": 2398 + }, + { + "epoch": 0.21450286123032905, + "grad_norm": 0.1334666606162785, + "learning_rate": 0.00018267744253800707, + "loss": 0.685, + "step": 2399 + }, + { + "epoch": 0.2145922746781116, + "grad_norm": 0.12866000476164674, + "learning_rate": 0.00018266114803672318, + "loss": 0.7122, + "step": 2400 + }, + { + "epoch": 0.21468168812589414, + "grad_norm": 0.12002960043226395, + "learning_rate": 0.00018264484660274866, + "loss": 0.7061, + "step": 2401 + }, + { + "epoch": 0.21477110157367668, + "grad_norm": 0.12256660290515131, + "learning_rate": 0.00018262853823745062, + "loss": 0.682, + "step": 2402 + }, + { + "epoch": 0.21486051502145923, + "grad_norm": 0.10244827785841933, + "learning_rate": 0.0001826122229421969, + "loss": 0.6318, + "step": 2403 + }, + { + "epoch": 0.21494992846924177, + "grad_norm": 0.12943900425385893, + "learning_rate": 0.0001825959007183558, + "loss": 0.7478, + "step": 2404 + }, + { + "epoch": 0.2150393419170243, + "grad_norm": 0.14554345022504614, + "learning_rate": 0.00018257957156729625, + "loss": 0.7136, + "step": 2405 + }, + { + "epoch": 0.21512875536480686, + "grad_norm": 0.10816605049740896, + "learning_rate": 0.00018256323549038778, + "loss": 0.6038, + "step": 2406 + }, + { + "epoch": 0.2152181688125894, + "grad_norm": 0.12148289810120382, + "learning_rate": 0.00018254689248900047, + "loss": 0.677, + "step": 2407 + }, + { + "epoch": 0.21530758226037197, + "grad_norm": 0.12175520484212216, + "learning_rate": 0.00018253054256450494, + "loss": 0.6693, + "step": 2408 + }, + { + "epoch": 0.21539699570815452, + "grad_norm": 0.1092939801088289, + "learning_rate": 0.0001825141857182725, + "loss": 0.685, + "step": 2409 + }, + { + "epoch": 0.21548640915593706, + "grad_norm": 0.12579318633028244, + "learning_rate": 0.00018249782195167496, + "loss": 0.6763, + "step": 2410 + }, + { + "epoch": 0.2155758226037196, + "grad_norm": 0.11603072173028872, + "learning_rate": 0.0001824814512660847, + "loss": 0.6993, + "step": 2411 + }, + { + "epoch": 0.21566523605150215, + "grad_norm": 0.13296276337300655, + "learning_rate": 0.00018246507366287475, + "loss": 0.7794, + "step": 2412 + }, + { + "epoch": 0.2157546494992847, + "grad_norm": 0.11705399085144558, + "learning_rate": 0.0001824486891434187, + "loss": 0.658, + "step": 2413 + }, + { + "epoch": 0.21584406294706723, + "grad_norm": 0.12185503958222238, + "learning_rate": 0.0001824322977090906, + "loss": 0.6971, + "step": 2414 + }, + { + "epoch": 0.21593347639484978, + "grad_norm": 0.11711758627824517, + "learning_rate": 0.0001824158993612653, + "loss": 0.6486, + "step": 2415 + }, + { + "epoch": 0.21602288984263232, + "grad_norm": 0.11339711052423673, + "learning_rate": 0.00018239949410131802, + "loss": 0.6673, + "step": 2416 + }, + { + "epoch": 0.2161123032904149, + "grad_norm": 0.12682847907868694, + "learning_rate": 0.0001823830819306247, + "loss": 0.6826, + "step": 2417 + }, + { + "epoch": 0.21620171673819744, + "grad_norm": 0.13779350586816078, + "learning_rate": 0.0001823666628505618, + "loss": 0.7122, + "step": 2418 + }, + { + "epoch": 0.21629113018597998, + "grad_norm": 0.11884244514858724, + "learning_rate": 0.00018235023686250635, + "loss": 0.6667, + "step": 2419 + }, + { + "epoch": 0.21638054363376252, + "grad_norm": 0.13176183246893178, + "learning_rate": 0.00018233380396783595, + "loss": 0.665, + "step": 2420 + }, + { + "epoch": 0.21646995708154507, + "grad_norm": 0.10297854571152268, + "learning_rate": 0.0001823173641679289, + "loss": 0.5722, + "step": 2421 + }, + { + "epoch": 0.2165593705293276, + "grad_norm": 0.1142248848640678, + "learning_rate": 0.0001823009174641639, + "loss": 0.7182, + "step": 2422 + }, + { + "epoch": 0.21664878397711015, + "grad_norm": 0.10736894788843718, + "learning_rate": 0.00018228446385792037, + "loss": 0.6928, + "step": 2423 + }, + { + "epoch": 0.2167381974248927, + "grad_norm": 0.1330010220129735, + "learning_rate": 0.00018226800335057822, + "loss": 0.7151, + "step": 2424 + }, + { + "epoch": 0.21682761087267524, + "grad_norm": 0.12148192991538834, + "learning_rate": 0.00018225153594351795, + "loss": 0.6795, + "step": 2425 + }, + { + "epoch": 0.21691702432045779, + "grad_norm": 0.12722721932155623, + "learning_rate": 0.00018223506163812076, + "loss": 0.6653, + "step": 2426 + }, + { + "epoch": 0.21700643776824036, + "grad_norm": 0.10714944533566115, + "learning_rate": 0.0001822185804357682, + "loss": 0.6693, + "step": 2427 + }, + { + "epoch": 0.2170958512160229, + "grad_norm": 0.13873816339390133, + "learning_rate": 0.00018220209233784266, + "loss": 0.7135, + "step": 2428 + }, + { + "epoch": 0.21718526466380544, + "grad_norm": 0.12221043564238239, + "learning_rate": 0.00018218559734572686, + "loss": 0.7011, + "step": 2429 + }, + { + "epoch": 0.217274678111588, + "grad_norm": 0.1031934460944037, + "learning_rate": 0.00018216909546080428, + "loss": 0.6894, + "step": 2430 + }, + { + "epoch": 0.21736409155937053, + "grad_norm": 0.11675840349678036, + "learning_rate": 0.00018215258668445892, + "loss": 0.6886, + "step": 2431 + }, + { + "epoch": 0.21745350500715308, + "grad_norm": 0.10523000050871707, + "learning_rate": 0.00018213607101807527, + "loss": 0.6699, + "step": 2432 + }, + { + "epoch": 0.21754291845493562, + "grad_norm": 0.13922577172853712, + "learning_rate": 0.0001821195484630386, + "loss": 0.7358, + "step": 2433 + }, + { + "epoch": 0.21763233190271816, + "grad_norm": 0.12188528365080646, + "learning_rate": 0.00018210301902073456, + "loss": 0.7002, + "step": 2434 + }, + { + "epoch": 0.2177217453505007, + "grad_norm": 0.11431364022348156, + "learning_rate": 0.00018208648269254946, + "loss": 0.6692, + "step": 2435 + }, + { + "epoch": 0.21781115879828325, + "grad_norm": 0.12157947919849972, + "learning_rate": 0.0001820699394798702, + "loss": 0.6598, + "step": 2436 + }, + { + "epoch": 0.21790057224606582, + "grad_norm": 0.11687901900010095, + "learning_rate": 0.00018205338938408425, + "loss": 0.674, + "step": 2437 + }, + { + "epoch": 0.21798998569384836, + "grad_norm": 0.13535365500292765, + "learning_rate": 0.0001820368324065796, + "loss": 0.7114, + "step": 2438 + }, + { + "epoch": 0.2180793991416309, + "grad_norm": 0.11739904294812979, + "learning_rate": 0.00018202026854874487, + "loss": 0.6799, + "step": 2439 + }, + { + "epoch": 0.21816881258941345, + "grad_norm": 0.12425988737124752, + "learning_rate": 0.00018200369781196934, + "loss": 0.6386, + "step": 2440 + }, + { + "epoch": 0.218258226037196, + "grad_norm": 0.14220077191986946, + "learning_rate": 0.00018198712019764266, + "loss": 0.6995, + "step": 2441 + }, + { + "epoch": 0.21834763948497854, + "grad_norm": 0.11641748523772547, + "learning_rate": 0.00018197053570715523, + "loss": 0.6739, + "step": 2442 + }, + { + "epoch": 0.21843705293276108, + "grad_norm": 0.12085240155799107, + "learning_rate": 0.00018195394434189797, + "loss": 0.6878, + "step": 2443 + }, + { + "epoch": 0.21852646638054363, + "grad_norm": 0.13239098361906107, + "learning_rate": 0.00018193734610326239, + "loss": 0.7155, + "step": 2444 + }, + { + "epoch": 0.21861587982832617, + "grad_norm": 0.12576962170505812, + "learning_rate": 0.0001819207409926405, + "loss": 0.6792, + "step": 2445 + }, + { + "epoch": 0.21870529327610871, + "grad_norm": 0.11441074346070441, + "learning_rate": 0.00018190412901142504, + "loss": 0.7122, + "step": 2446 + }, + { + "epoch": 0.21879470672389129, + "grad_norm": 0.12455981141394917, + "learning_rate": 0.00018188751016100918, + "loss": 0.6839, + "step": 2447 + }, + { + "epoch": 0.21888412017167383, + "grad_norm": 0.11193809086372787, + "learning_rate": 0.00018187088444278674, + "loss": 0.7075, + "step": 2448 + }, + { + "epoch": 0.21897353361945637, + "grad_norm": 0.12213317829705583, + "learning_rate": 0.0001818542518581521, + "loss": 0.7146, + "step": 2449 + }, + { + "epoch": 0.21906294706723892, + "grad_norm": 0.12789185271247433, + "learning_rate": 0.0001818376124085002, + "loss": 0.7019, + "step": 2450 + }, + { + "epoch": 0.21915236051502146, + "grad_norm": 0.11119368159655234, + "learning_rate": 0.0001818209660952266, + "loss": 0.6619, + "step": 2451 + }, + { + "epoch": 0.219241773962804, + "grad_norm": 0.10959875746145868, + "learning_rate": 0.00018180431291972738, + "loss": 0.6791, + "step": 2452 + }, + { + "epoch": 0.21933118741058655, + "grad_norm": 0.10013378309649511, + "learning_rate": 0.00018178765288339924, + "loss": 0.6308, + "step": 2453 + }, + { + "epoch": 0.2194206008583691, + "grad_norm": 0.11345576503749667, + "learning_rate": 0.00018177098598763942, + "loss": 0.682, + "step": 2454 + }, + { + "epoch": 0.21951001430615164, + "grad_norm": 0.12460490397183549, + "learning_rate": 0.00018175431223384575, + "loss": 0.6797, + "step": 2455 + }, + { + "epoch": 0.21959942775393418, + "grad_norm": 0.11364736396078652, + "learning_rate": 0.00018173763162341667, + "loss": 0.5752, + "step": 2456 + }, + { + "epoch": 0.21968884120171675, + "grad_norm": 0.13321983832192058, + "learning_rate": 0.00018172094415775113, + "loss": 0.7037, + "step": 2457 + }, + { + "epoch": 0.2197782546494993, + "grad_norm": 0.1281303459901519, + "learning_rate": 0.00018170424983824868, + "loss": 0.7006, + "step": 2458 + }, + { + "epoch": 0.21986766809728184, + "grad_norm": 0.1244063240055367, + "learning_rate": 0.00018168754866630947, + "loss": 0.6966, + "step": 2459 + }, + { + "epoch": 0.21995708154506438, + "grad_norm": 0.11147213919970468, + "learning_rate": 0.00018167084064333423, + "loss": 0.5907, + "step": 2460 + }, + { + "epoch": 0.22004649499284692, + "grad_norm": 0.13328420466705904, + "learning_rate": 0.0001816541257707242, + "loss": 0.6912, + "step": 2461 + }, + { + "epoch": 0.22013590844062947, + "grad_norm": 0.13078396192298328, + "learning_rate": 0.00018163740404988126, + "loss": 0.7103, + "step": 2462 + }, + { + "epoch": 0.220225321888412, + "grad_norm": 0.1250488373369605, + "learning_rate": 0.00018162067548220786, + "loss": 0.6772, + "step": 2463 + }, + { + "epoch": 0.22031473533619456, + "grad_norm": 0.11957798477120306, + "learning_rate": 0.00018160394006910694, + "loss": 0.6614, + "step": 2464 + }, + { + "epoch": 0.2204041487839771, + "grad_norm": 0.13040546192417715, + "learning_rate": 0.00018158719781198213, + "loss": 0.6614, + "step": 2465 + }, + { + "epoch": 0.22049356223175964, + "grad_norm": 0.13066626240334797, + "learning_rate": 0.00018157044871223757, + "loss": 0.7113, + "step": 2466 + }, + { + "epoch": 0.22058297567954221, + "grad_norm": 0.11776082641037798, + "learning_rate": 0.00018155369277127802, + "loss": 0.7058, + "step": 2467 + }, + { + "epoch": 0.22067238912732476, + "grad_norm": 0.1365690148013358, + "learning_rate": 0.00018153692999050872, + "loss": 0.6935, + "step": 2468 + }, + { + "epoch": 0.2207618025751073, + "grad_norm": 0.12119824194834289, + "learning_rate": 0.00018152016037133558, + "loss": 0.6975, + "step": 2469 + }, + { + "epoch": 0.22085121602288985, + "grad_norm": 0.11367098817118482, + "learning_rate": 0.00018150338391516505, + "loss": 0.6757, + "step": 2470 + }, + { + "epoch": 0.2209406294706724, + "grad_norm": 0.12833799906361518, + "learning_rate": 0.0001814866006234041, + "loss": 0.6714, + "step": 2471 + }, + { + "epoch": 0.22103004291845493, + "grad_norm": 0.11249811645613299, + "learning_rate": 0.00018146981049746043, + "loss": 0.683, + "step": 2472 + }, + { + "epoch": 0.22111945636623748, + "grad_norm": 0.12702473159770072, + "learning_rate": 0.0001814530135387421, + "loss": 0.686, + "step": 2473 + }, + { + "epoch": 0.22120886981402002, + "grad_norm": 0.11959082260469536, + "learning_rate": 0.0001814362097486579, + "loss": 0.6051, + "step": 2474 + }, + { + "epoch": 0.22129828326180256, + "grad_norm": 0.12201539626971733, + "learning_rate": 0.00018141939912861717, + "loss": 0.6719, + "step": 2475 + }, + { + "epoch": 0.2213876967095851, + "grad_norm": 0.11215519620398691, + "learning_rate": 0.00018140258168002971, + "loss": 0.6469, + "step": 2476 + }, + { + "epoch": 0.22147711015736768, + "grad_norm": 0.13368334386717667, + "learning_rate": 0.0001813857574043061, + "loss": 0.7112, + "step": 2477 + }, + { + "epoch": 0.22156652360515022, + "grad_norm": 0.1311516072584206, + "learning_rate": 0.00018136892630285726, + "loss": 0.6765, + "step": 2478 + }, + { + "epoch": 0.22165593705293277, + "grad_norm": 0.12560135626341787, + "learning_rate": 0.00018135208837709486, + "loss": 0.6847, + "step": 2479 + }, + { + "epoch": 0.2217453505007153, + "grad_norm": 0.11042878465194883, + "learning_rate": 0.00018133524362843104, + "loss": 0.6934, + "step": 2480 + }, + { + "epoch": 0.22183476394849785, + "grad_norm": 0.1101761501383691, + "learning_rate": 0.00018131839205827856, + "loss": 0.573, + "step": 2481 + }, + { + "epoch": 0.2219241773962804, + "grad_norm": 0.11747489887675296, + "learning_rate": 0.00018130153366805075, + "loss": 0.6747, + "step": 2482 + }, + { + "epoch": 0.22201359084406294, + "grad_norm": 0.13197461874506314, + "learning_rate": 0.00018128466845916154, + "loss": 0.683, + "step": 2483 + }, + { + "epoch": 0.22210300429184548, + "grad_norm": 0.11192874907483853, + "learning_rate": 0.00018126779643302528, + "loss": 0.639, + "step": 2484 + }, + { + "epoch": 0.22219241773962803, + "grad_norm": 0.11549528363267438, + "learning_rate": 0.00018125091759105713, + "loss": 0.6936, + "step": 2485 + }, + { + "epoch": 0.2222818311874106, + "grad_norm": 0.1141256781722901, + "learning_rate": 0.00018123403193467266, + "loss": 0.6541, + "step": 2486 + }, + { + "epoch": 0.22237124463519314, + "grad_norm": 0.13078153514927618, + "learning_rate": 0.000181217139465288, + "loss": 0.7155, + "step": 2487 + }, + { + "epoch": 0.2224606580829757, + "grad_norm": 0.1181709029937511, + "learning_rate": 0.00018120024018431998, + "loss": 0.6825, + "step": 2488 + }, + { + "epoch": 0.22255007153075823, + "grad_norm": 0.09911395489263596, + "learning_rate": 0.00018118333409318583, + "loss": 0.6304, + "step": 2489 + }, + { + "epoch": 0.22263948497854077, + "grad_norm": 0.12386538931165902, + "learning_rate": 0.00018116642119330354, + "loss": 0.6961, + "step": 2490 + }, + { + "epoch": 0.22272889842632332, + "grad_norm": 0.12221009623081998, + "learning_rate": 0.0001811495014860915, + "loss": 0.7069, + "step": 2491 + }, + { + "epoch": 0.22281831187410586, + "grad_norm": 0.11757239471000618, + "learning_rate": 0.00018113257497296879, + "loss": 0.7057, + "step": 2492 + }, + { + "epoch": 0.2229077253218884, + "grad_norm": 0.11361773979059415, + "learning_rate": 0.000181115641655355, + "loss": 0.6403, + "step": 2493 + }, + { + "epoch": 0.22299713876967095, + "grad_norm": 0.12189563233093258, + "learning_rate": 0.00018109870153467031, + "loss": 0.7203, + "step": 2494 + }, + { + "epoch": 0.2230865522174535, + "grad_norm": 0.1272779117722831, + "learning_rate": 0.00018108175461233544, + "loss": 0.7259, + "step": 2495 + }, + { + "epoch": 0.22317596566523606, + "grad_norm": 0.12432059406530276, + "learning_rate": 0.00018106480088977172, + "loss": 0.6851, + "step": 2496 + }, + { + "epoch": 0.2232653791130186, + "grad_norm": 0.12678474679451085, + "learning_rate": 0.0001810478403684011, + "loss": 0.6839, + "step": 2497 + }, + { + "epoch": 0.22335479256080115, + "grad_norm": 0.1309149999653169, + "learning_rate": 0.00018103087304964597, + "loss": 0.6913, + "step": 2498 + }, + { + "epoch": 0.2234442060085837, + "grad_norm": 0.1285327075161493, + "learning_rate": 0.00018101389893492937, + "loss": 0.7169, + "step": 2499 + }, + { + "epoch": 0.22353361945636624, + "grad_norm": 0.10689033408977718, + "learning_rate": 0.0001809969180256749, + "loss": 0.5564, + "step": 2500 + }, + { + "epoch": 0.22362303290414878, + "grad_norm": 0.12505819330286663, + "learning_rate": 0.00018097993032330676, + "loss": 0.7174, + "step": 2501 + }, + { + "epoch": 0.22371244635193133, + "grad_norm": 0.12294843011980605, + "learning_rate": 0.00018096293582924963, + "loss": 0.6765, + "step": 2502 + }, + { + "epoch": 0.22380185979971387, + "grad_norm": 0.141455854495452, + "learning_rate": 0.00018094593454492887, + "loss": 0.7204, + "step": 2503 + }, + { + "epoch": 0.22389127324749641, + "grad_norm": 0.1480992727767101, + "learning_rate": 0.00018092892647177035, + "loss": 0.7268, + "step": 2504 + }, + { + "epoch": 0.22398068669527896, + "grad_norm": 0.12928326452651182, + "learning_rate": 0.0001809119116112005, + "loss": 0.7049, + "step": 2505 + }, + { + "epoch": 0.22407010014306153, + "grad_norm": 0.13577266722062503, + "learning_rate": 0.00018089488996464632, + "loss": 0.6995, + "step": 2506 + }, + { + "epoch": 0.22415951359084407, + "grad_norm": 0.10956698938560827, + "learning_rate": 0.00018087786153353543, + "loss": 0.6566, + "step": 2507 + }, + { + "epoch": 0.22424892703862662, + "grad_norm": 0.13115967555552396, + "learning_rate": 0.00018086082631929595, + "loss": 0.7018, + "step": 2508 + }, + { + "epoch": 0.22433834048640916, + "grad_norm": 0.11994718065848636, + "learning_rate": 0.00018084378432335667, + "loss": 0.666, + "step": 2509 + }, + { + "epoch": 0.2244277539341917, + "grad_norm": 0.135437021922194, + "learning_rate": 0.00018082673554714677, + "loss": 0.6609, + "step": 2510 + }, + { + "epoch": 0.22451716738197425, + "grad_norm": 0.13240509113680357, + "learning_rate": 0.00018080967999209622, + "loss": 0.6822, + "step": 2511 + }, + { + "epoch": 0.2246065808297568, + "grad_norm": 0.12573062727194637, + "learning_rate": 0.00018079261765963537, + "loss": 0.7312, + "step": 2512 + }, + { + "epoch": 0.22469599427753933, + "grad_norm": 0.1213044053328111, + "learning_rate": 0.00018077554855119526, + "loss": 0.6946, + "step": 2513 + }, + { + "epoch": 0.22478540772532188, + "grad_norm": 0.13250323241411227, + "learning_rate": 0.00018075847266820746, + "loss": 0.673, + "step": 2514 + }, + { + "epoch": 0.22487482117310442, + "grad_norm": 0.11798998764761552, + "learning_rate": 0.0001807413900121041, + "loss": 0.653, + "step": 2515 + }, + { + "epoch": 0.224964234620887, + "grad_norm": 0.12402860683232099, + "learning_rate": 0.00018072430058431783, + "loss": 0.6963, + "step": 2516 + }, + { + "epoch": 0.22505364806866954, + "grad_norm": 0.11732206703620743, + "learning_rate": 0.000180707204386282, + "loss": 0.6385, + "step": 2517 + }, + { + "epoch": 0.22514306151645208, + "grad_norm": 0.12902605915262974, + "learning_rate": 0.00018069010141943037, + "loss": 0.698, + "step": 2518 + }, + { + "epoch": 0.22523247496423462, + "grad_norm": 0.1147396692259764, + "learning_rate": 0.00018067299168519741, + "loss": 0.6705, + "step": 2519 + }, + { + "epoch": 0.22532188841201717, + "grad_norm": 0.13193614468729056, + "learning_rate": 0.00018065587518501804, + "loss": 0.7134, + "step": 2520 + }, + { + "epoch": 0.2254113018597997, + "grad_norm": 0.127618620911135, + "learning_rate": 0.00018063875192032787, + "loss": 0.664, + "step": 2521 + }, + { + "epoch": 0.22550071530758226, + "grad_norm": 0.1130296067541865, + "learning_rate": 0.00018062162189256292, + "loss": 0.6371, + "step": 2522 + }, + { + "epoch": 0.2255901287553648, + "grad_norm": 0.11958803678969215, + "learning_rate": 0.00018060448510315993, + "loss": 0.6477, + "step": 2523 + }, + { + "epoch": 0.22567954220314734, + "grad_norm": 0.12222311382502539, + "learning_rate": 0.00018058734155355612, + "loss": 0.6925, + "step": 2524 + }, + { + "epoch": 0.2257689556509299, + "grad_norm": 0.1252992873772483, + "learning_rate": 0.00018057019124518927, + "loss": 0.6987, + "step": 2525 + }, + { + "epoch": 0.22585836909871246, + "grad_norm": 0.11046058814594148, + "learning_rate": 0.00018055303417949782, + "loss": 0.6837, + "step": 2526 + }, + { + "epoch": 0.225947782546495, + "grad_norm": 0.11642134068216387, + "learning_rate": 0.00018053587035792067, + "loss": 0.6626, + "step": 2527 + }, + { + "epoch": 0.22603719599427755, + "grad_norm": 0.11261873637795441, + "learning_rate": 0.00018051869978189731, + "loss": 0.6914, + "step": 2528 + }, + { + "epoch": 0.2261266094420601, + "grad_norm": 0.12485215421110364, + "learning_rate": 0.0001805015224528679, + "loss": 0.737, + "step": 2529 + }, + { + "epoch": 0.22621602288984263, + "grad_norm": 0.10661302654037097, + "learning_rate": 0.00018048433837227295, + "loss": 0.6234, + "step": 2530 + }, + { + "epoch": 0.22630543633762518, + "grad_norm": 0.11542844408198896, + "learning_rate": 0.0001804671475415538, + "loss": 0.6363, + "step": 2531 + }, + { + "epoch": 0.22639484978540772, + "grad_norm": 0.12190385802754976, + "learning_rate": 0.00018044994996215213, + "loss": 0.6887, + "step": 2532 + }, + { + "epoch": 0.22648426323319026, + "grad_norm": 0.12103790118078671, + "learning_rate": 0.00018043274563551035, + "loss": 0.6847, + "step": 2533 + }, + { + "epoch": 0.2265736766809728, + "grad_norm": 0.12970052094376483, + "learning_rate": 0.00018041553456307128, + "loss": 0.5871, + "step": 2534 + }, + { + "epoch": 0.22666309012875535, + "grad_norm": 0.12681366498628638, + "learning_rate": 0.00018039831674627847, + "loss": 0.688, + "step": 2535 + }, + { + "epoch": 0.22675250357653792, + "grad_norm": 0.122276278956141, + "learning_rate": 0.00018038109218657594, + "loss": 0.6764, + "step": 2536 + }, + { + "epoch": 0.22684191702432047, + "grad_norm": 0.13241887012387418, + "learning_rate": 0.00018036386088540827, + "loss": 0.7182, + "step": 2537 + }, + { + "epoch": 0.226931330472103, + "grad_norm": 0.12323221175234216, + "learning_rate": 0.00018034662284422065, + "loss": 0.6602, + "step": 2538 + }, + { + "epoch": 0.22702074391988555, + "grad_norm": 0.12150331890252536, + "learning_rate": 0.00018032937806445882, + "loss": 0.6912, + "step": 2539 + }, + { + "epoch": 0.2271101573676681, + "grad_norm": 0.13812238604585614, + "learning_rate": 0.00018031212654756905, + "loss": 0.6902, + "step": 2540 + }, + { + "epoch": 0.22719957081545064, + "grad_norm": 0.12389931048264034, + "learning_rate": 0.00018029486829499822, + "loss": 0.718, + "step": 2541 + }, + { + "epoch": 0.22728898426323318, + "grad_norm": 0.12904844636176716, + "learning_rate": 0.00018027760330819375, + "loss": 0.7022, + "step": 2542 + }, + { + "epoch": 0.22737839771101573, + "grad_norm": 0.11767738626453668, + "learning_rate": 0.00018026033158860365, + "loss": 0.7108, + "step": 2543 + }, + { + "epoch": 0.22746781115879827, + "grad_norm": 0.123644693447151, + "learning_rate": 0.00018024305313767646, + "loss": 0.6991, + "step": 2544 + }, + { + "epoch": 0.22755722460658084, + "grad_norm": 0.11409369982179492, + "learning_rate": 0.00018022576795686133, + "loss": 0.6521, + "step": 2545 + }, + { + "epoch": 0.2276466380543634, + "grad_norm": 0.11408800588994462, + "learning_rate": 0.00018020847604760794, + "loss": 0.6985, + "step": 2546 + }, + { + "epoch": 0.22773605150214593, + "grad_norm": 0.1250818530764666, + "learning_rate": 0.00018019117741136648, + "loss": 0.577, + "step": 2547 + }, + { + "epoch": 0.22782546494992847, + "grad_norm": 0.1254029861064449, + "learning_rate": 0.00018017387204958784, + "loss": 0.6724, + "step": 2548 + }, + { + "epoch": 0.22791487839771102, + "grad_norm": 0.12655641007465665, + "learning_rate": 0.0001801565599637234, + "loss": 0.6954, + "step": 2549 + }, + { + "epoch": 0.22800429184549356, + "grad_norm": 0.12603032626364974, + "learning_rate": 0.00018013924115522508, + "loss": 0.6995, + "step": 2550 + }, + { + "epoch": 0.2280937052932761, + "grad_norm": 0.12154159890152424, + "learning_rate": 0.00018012191562554537, + "loss": 0.6496, + "step": 2551 + }, + { + "epoch": 0.22818311874105865, + "grad_norm": 0.1154993507305846, + "learning_rate": 0.00018010458337613735, + "loss": 0.6844, + "step": 2552 + }, + { + "epoch": 0.2282725321888412, + "grad_norm": 0.1388374509873288, + "learning_rate": 0.00018008724440845468, + "loss": 0.7418, + "step": 2553 + }, + { + "epoch": 0.22836194563662374, + "grad_norm": 0.12941564538766712, + "learning_rate": 0.00018006989872395156, + "loss": 0.6968, + "step": 2554 + }, + { + "epoch": 0.2284513590844063, + "grad_norm": 0.11420016901864315, + "learning_rate": 0.0001800525463240827, + "loss": 0.6621, + "step": 2555 + }, + { + "epoch": 0.22854077253218885, + "grad_norm": 0.11939067189921171, + "learning_rate": 0.00018003518721030349, + "loss": 0.6321, + "step": 2556 + }, + { + "epoch": 0.2286301859799714, + "grad_norm": 0.13931824046723565, + "learning_rate": 0.00018001782138406976, + "loss": 0.6845, + "step": 2557 + }, + { + "epoch": 0.22871959942775394, + "grad_norm": 0.12020396909255554, + "learning_rate": 0.000180000448846838, + "loss": 0.6629, + "step": 2558 + }, + { + "epoch": 0.22880901287553648, + "grad_norm": 0.11522369356628086, + "learning_rate": 0.0001799830696000652, + "loss": 0.6876, + "step": 2559 + }, + { + "epoch": 0.22889842632331903, + "grad_norm": 0.12529886244618876, + "learning_rate": 0.00017996568364520897, + "loss": 0.7038, + "step": 2560 + }, + { + "epoch": 0.22898783977110157, + "grad_norm": 0.13084873141882838, + "learning_rate": 0.00017994829098372738, + "loss": 0.6882, + "step": 2561 + }, + { + "epoch": 0.2290772532188841, + "grad_norm": 0.12047894185704668, + "learning_rate": 0.0001799308916170792, + "loss": 0.7046, + "step": 2562 + }, + { + "epoch": 0.22916666666666666, + "grad_norm": 0.11146239137681381, + "learning_rate": 0.00017991348554672373, + "loss": 0.6852, + "step": 2563 + }, + { + "epoch": 0.2292560801144492, + "grad_norm": 0.14309636040868717, + "learning_rate": 0.00017989607277412066, + "loss": 0.715, + "step": 2564 + }, + { + "epoch": 0.22934549356223177, + "grad_norm": 0.12201770914929234, + "learning_rate": 0.00017987865330073048, + "loss": 0.6832, + "step": 2565 + }, + { + "epoch": 0.22943490701001432, + "grad_norm": 0.11289944666497513, + "learning_rate": 0.00017986122712801414, + "loss": 0.5648, + "step": 2566 + }, + { + "epoch": 0.22952432045779686, + "grad_norm": 0.12961752881668498, + "learning_rate": 0.0001798437942574331, + "loss": 0.6971, + "step": 2567 + }, + { + "epoch": 0.2296137339055794, + "grad_norm": 0.09880170030683273, + "learning_rate": 0.0001798263546904495, + "loss": 0.6177, + "step": 2568 + }, + { + "epoch": 0.22970314735336195, + "grad_norm": 0.11720980188303157, + "learning_rate": 0.0001798089084285259, + "loss": 0.661, + "step": 2569 + }, + { + "epoch": 0.2297925608011445, + "grad_norm": 0.12078799742564801, + "learning_rate": 0.00017979145547312555, + "loss": 0.7081, + "step": 2570 + }, + { + "epoch": 0.22988197424892703, + "grad_norm": 0.12618560544767665, + "learning_rate": 0.0001797739958257122, + "loss": 0.7183, + "step": 2571 + }, + { + "epoch": 0.22997138769670958, + "grad_norm": 0.1416799949632132, + "learning_rate": 0.00017975652948775013, + "loss": 0.7296, + "step": 2572 + }, + { + "epoch": 0.23006080114449212, + "grad_norm": 0.13642006193446857, + "learning_rate": 0.0001797390564607043, + "loss": 0.7298, + "step": 2573 + }, + { + "epoch": 0.23015021459227467, + "grad_norm": 0.13347498792000922, + "learning_rate": 0.00017972157674604007, + "loss": 0.7065, + "step": 2574 + }, + { + "epoch": 0.23023962804005724, + "grad_norm": 0.1089057543150937, + "learning_rate": 0.00017970409034522348, + "loss": 0.6958, + "step": 2575 + }, + { + "epoch": 0.23032904148783978, + "grad_norm": 0.11636166772894539, + "learning_rate": 0.00017968659725972112, + "loss": 0.6806, + "step": 2576 + }, + { + "epoch": 0.23041845493562232, + "grad_norm": 0.11300795455764832, + "learning_rate": 0.00017966909749100006, + "loss": 0.6746, + "step": 2577 + }, + { + "epoch": 0.23050786838340487, + "grad_norm": 0.12068165794296443, + "learning_rate": 0.00017965159104052803, + "loss": 0.6665, + "step": 2578 + }, + { + "epoch": 0.2305972818311874, + "grad_norm": 0.13245852128472882, + "learning_rate": 0.00017963407790977322, + "loss": 0.7316, + "step": 2579 + }, + { + "epoch": 0.23068669527896996, + "grad_norm": 0.11333157887661104, + "learning_rate": 0.00017961655810020452, + "loss": 0.5973, + "step": 2580 + }, + { + "epoch": 0.2307761087267525, + "grad_norm": 0.12288094522115454, + "learning_rate": 0.00017959903161329118, + "loss": 0.7105, + "step": 2581 + }, + { + "epoch": 0.23086552217453504, + "grad_norm": 0.1227703437846491, + "learning_rate": 0.00017958149845050323, + "loss": 0.692, + "step": 2582 + }, + { + "epoch": 0.2309549356223176, + "grad_norm": 0.11593262117514043, + "learning_rate": 0.0001795639586133111, + "loss": 0.6768, + "step": 2583 + }, + { + "epoch": 0.23104434907010013, + "grad_norm": 0.12871494026927136, + "learning_rate": 0.00017954641210318588, + "loss": 0.694, + "step": 2584 + }, + { + "epoch": 0.2311337625178827, + "grad_norm": 0.1207057690346058, + "learning_rate": 0.0001795288589215991, + "loss": 0.6831, + "step": 2585 + }, + { + "epoch": 0.23122317596566525, + "grad_norm": 0.1296675569774204, + "learning_rate": 0.000179511299070023, + "loss": 0.6842, + "step": 2586 + }, + { + "epoch": 0.2313125894134478, + "grad_norm": 0.10982002083244964, + "learning_rate": 0.00017949373254993027, + "loss": 0.676, + "step": 2587 + }, + { + "epoch": 0.23140200286123033, + "grad_norm": 0.12894290334403516, + "learning_rate": 0.00017947615936279417, + "loss": 0.6905, + "step": 2588 + }, + { + "epoch": 0.23149141630901288, + "grad_norm": 0.11751024817250628, + "learning_rate": 0.00017945857951008859, + "loss": 0.6688, + "step": 2589 + }, + { + "epoch": 0.23158082975679542, + "grad_norm": 0.14163246880388297, + "learning_rate": 0.00017944099299328791, + "loss": 0.6914, + "step": 2590 + }, + { + "epoch": 0.23167024320457796, + "grad_norm": 0.12908196188801513, + "learning_rate": 0.00017942339981386708, + "loss": 0.6768, + "step": 2591 + }, + { + "epoch": 0.2317596566523605, + "grad_norm": 0.1289160662072438, + "learning_rate": 0.00017940579997330165, + "loss": 0.6955, + "step": 2592 + }, + { + "epoch": 0.23184907010014305, + "grad_norm": 0.11704740328571622, + "learning_rate": 0.00017938819347306764, + "loss": 0.6797, + "step": 2593 + }, + { + "epoch": 0.2319384835479256, + "grad_norm": 0.12682532075879135, + "learning_rate": 0.00017937058031464173, + "loss": 0.7197, + "step": 2594 + }, + { + "epoch": 0.23202789699570817, + "grad_norm": 0.12593652419846468, + "learning_rate": 0.0001793529604995011, + "loss": 0.6834, + "step": 2595 + }, + { + "epoch": 0.2321173104434907, + "grad_norm": 0.13567147612610178, + "learning_rate": 0.00017933533402912354, + "loss": 0.7234, + "step": 2596 + }, + { + "epoch": 0.23220672389127325, + "grad_norm": 0.11962935558713048, + "learning_rate": 0.0001793177009049873, + "loss": 0.7165, + "step": 2597 + }, + { + "epoch": 0.2322961373390558, + "grad_norm": 0.12333078001651761, + "learning_rate": 0.00017930006112857127, + "loss": 0.6884, + "step": 2598 + }, + { + "epoch": 0.23238555078683834, + "grad_norm": 0.14057859314555438, + "learning_rate": 0.0001792824147013549, + "loss": 0.6846, + "step": 2599 + }, + { + "epoch": 0.23247496423462088, + "grad_norm": 0.11854416177123267, + "learning_rate": 0.00017926476162481817, + "loss": 0.6629, + "step": 2600 + }, + { + "epoch": 0.23256437768240343, + "grad_norm": 0.10901808671524629, + "learning_rate": 0.00017924710190044156, + "loss": 0.6941, + "step": 2601 + }, + { + "epoch": 0.23265379113018597, + "grad_norm": 0.13801374554410661, + "learning_rate": 0.00017922943552970625, + "loss": 0.6885, + "step": 2602 + }, + { + "epoch": 0.23274320457796852, + "grad_norm": 0.13835083409327892, + "learning_rate": 0.0001792117625140939, + "loss": 0.6728, + "step": 2603 + }, + { + "epoch": 0.23283261802575106, + "grad_norm": 0.13152663956600763, + "learning_rate": 0.00017919408285508662, + "loss": 0.6991, + "step": 2604 + }, + { + "epoch": 0.23292203147353363, + "grad_norm": 0.11846883637577915, + "learning_rate": 0.0001791763965541673, + "loss": 0.6409, + "step": 2605 + }, + { + "epoch": 0.23301144492131617, + "grad_norm": 0.1130716874310009, + "learning_rate": 0.00017915870361281922, + "loss": 0.6883, + "step": 2606 + }, + { + "epoch": 0.23310085836909872, + "grad_norm": 0.12391434185989712, + "learning_rate": 0.00017914100403252628, + "loss": 0.724, + "step": 2607 + }, + { + "epoch": 0.23319027181688126, + "grad_norm": 0.10980671115415136, + "learning_rate": 0.00017912329781477287, + "loss": 0.6895, + "step": 2608 + }, + { + "epoch": 0.2332796852646638, + "grad_norm": 0.10946954957734162, + "learning_rate": 0.00017910558496104403, + "loss": 0.6917, + "step": 2609 + }, + { + "epoch": 0.23336909871244635, + "grad_norm": 0.11752156034479594, + "learning_rate": 0.00017908786547282538, + "loss": 0.7027, + "step": 2610 + }, + { + "epoch": 0.2334585121602289, + "grad_norm": 0.1129136747193234, + "learning_rate": 0.0001790701393516029, + "loss": 0.6814, + "step": 2611 + }, + { + "epoch": 0.23354792560801144, + "grad_norm": 0.12495589534977443, + "learning_rate": 0.00017905240659886335, + "loss": 0.6707, + "step": 2612 + }, + { + "epoch": 0.23363733905579398, + "grad_norm": 0.11724594615876, + "learning_rate": 0.00017903466721609393, + "loss": 0.7204, + "step": 2613 + }, + { + "epoch": 0.23372675250357655, + "grad_norm": 0.13072650369462838, + "learning_rate": 0.0001790169212047824, + "loss": 0.6556, + "step": 2614 + }, + { + "epoch": 0.2338161659513591, + "grad_norm": 0.13226540658867522, + "learning_rate": 0.00017899916856641714, + "loss": 0.6982, + "step": 2615 + }, + { + "epoch": 0.23390557939914164, + "grad_norm": 0.13499899242689695, + "learning_rate": 0.00017898140930248704, + "loss": 0.7357, + "step": 2616 + }, + { + "epoch": 0.23399499284692418, + "grad_norm": 0.11488989073728205, + "learning_rate": 0.0001789636434144815, + "loss": 0.6432, + "step": 2617 + }, + { + "epoch": 0.23408440629470673, + "grad_norm": 0.11882952526602232, + "learning_rate": 0.00017894587090389052, + "loss": 0.673, + "step": 2618 + }, + { + "epoch": 0.23417381974248927, + "grad_norm": 0.12393706901795183, + "learning_rate": 0.00017892809177220474, + "loss": 0.698, + "step": 2619 + }, + { + "epoch": 0.2342632331902718, + "grad_norm": 0.13599376936394805, + "learning_rate": 0.00017891030602091519, + "loss": 0.7117, + "step": 2620 + }, + { + "epoch": 0.23435264663805436, + "grad_norm": 0.13809451704457748, + "learning_rate": 0.0001788925136515136, + "loss": 0.727, + "step": 2621 + }, + { + "epoch": 0.2344420600858369, + "grad_norm": 0.14382231627772973, + "learning_rate": 0.00017887471466549216, + "loss": 0.7202, + "step": 2622 + }, + { + "epoch": 0.23453147353361944, + "grad_norm": 0.1153470583451544, + "learning_rate": 0.00017885690906434365, + "loss": 0.671, + "step": 2623 + }, + { + "epoch": 0.23462088698140202, + "grad_norm": 0.12491781569218335, + "learning_rate": 0.0001788390968495614, + "loss": 0.6897, + "step": 2624 + }, + { + "epoch": 0.23471030042918456, + "grad_norm": 0.1231170347244725, + "learning_rate": 0.00017882127802263935, + "loss": 0.7061, + "step": 2625 + }, + { + "epoch": 0.2347997138769671, + "grad_norm": 0.10838623358104192, + "learning_rate": 0.00017880345258507188, + "loss": 0.6463, + "step": 2626 + }, + { + "epoch": 0.23488912732474965, + "grad_norm": 0.11989894457143903, + "learning_rate": 0.000178785620538354, + "loss": 0.6848, + "step": 2627 + }, + { + "epoch": 0.2349785407725322, + "grad_norm": 0.11801072846962975, + "learning_rate": 0.00017876778188398128, + "loss": 0.6706, + "step": 2628 + }, + { + "epoch": 0.23506795422031473, + "grad_norm": 0.11681641179007325, + "learning_rate": 0.00017874993662344983, + "loss": 0.652, + "step": 2629 + }, + { + "epoch": 0.23515736766809728, + "grad_norm": 0.11394742002237625, + "learning_rate": 0.00017873208475825632, + "loss": 0.663, + "step": 2630 + }, + { + "epoch": 0.23524678111587982, + "grad_norm": 0.11958021898135963, + "learning_rate": 0.0001787142262898979, + "loss": 0.6764, + "step": 2631 + }, + { + "epoch": 0.23533619456366237, + "grad_norm": 0.12460456693091672, + "learning_rate": 0.00017869636121987243, + "loss": 0.6756, + "step": 2632 + }, + { + "epoch": 0.2354256080114449, + "grad_norm": 0.1170437799150863, + "learning_rate": 0.00017867848954967815, + "loss": 0.6902, + "step": 2633 + }, + { + "epoch": 0.23551502145922748, + "grad_norm": 0.12638869053313723, + "learning_rate": 0.000178660611280814, + "loss": 0.6343, + "step": 2634 + }, + { + "epoch": 0.23560443490701002, + "grad_norm": 0.1172476744751421, + "learning_rate": 0.00017864272641477936, + "loss": 0.6692, + "step": 2635 + }, + { + "epoch": 0.23569384835479257, + "grad_norm": 0.1084807460456056, + "learning_rate": 0.00017862483495307424, + "loss": 0.6604, + "step": 2636 + }, + { + "epoch": 0.2357832618025751, + "grad_norm": 0.105340243893267, + "learning_rate": 0.00017860693689719916, + "loss": 0.6689, + "step": 2637 + }, + { + "epoch": 0.23587267525035766, + "grad_norm": 0.13087176284184313, + "learning_rate": 0.0001785890322486552, + "loss": 0.7206, + "step": 2638 + }, + { + "epoch": 0.2359620886981402, + "grad_norm": 0.13262787716294094, + "learning_rate": 0.00017857112100894406, + "loss": 0.7167, + "step": 2639 + }, + { + "epoch": 0.23605150214592274, + "grad_norm": 0.11699737544119877, + "learning_rate": 0.00017855320317956784, + "loss": 0.6843, + "step": 2640 + }, + { + "epoch": 0.2361409155937053, + "grad_norm": 0.12601155146815649, + "learning_rate": 0.0001785352787620294, + "loss": 0.6752, + "step": 2641 + }, + { + "epoch": 0.23623032904148783, + "grad_norm": 0.10619524051148092, + "learning_rate": 0.00017851734775783194, + "loss": 0.6723, + "step": 2642 + }, + { + "epoch": 0.23631974248927037, + "grad_norm": 0.12119828076221324, + "learning_rate": 0.00017849941016847933, + "loss": 0.6681, + "step": 2643 + }, + { + "epoch": 0.23640915593705294, + "grad_norm": 0.11655159896308738, + "learning_rate": 0.000178481465995476, + "loss": 0.6739, + "step": 2644 + }, + { + "epoch": 0.2364985693848355, + "grad_norm": 0.12929877827017308, + "learning_rate": 0.00017846351524032693, + "loss": 0.6671, + "step": 2645 + }, + { + "epoch": 0.23658798283261803, + "grad_norm": 0.1366852844041364, + "learning_rate": 0.0001784455579045376, + "loss": 0.707, + "step": 2646 + }, + { + "epoch": 0.23667739628040058, + "grad_norm": 0.13620255372296255, + "learning_rate": 0.00017842759398961405, + "loss": 0.7128, + "step": 2647 + }, + { + "epoch": 0.23676680972818312, + "grad_norm": 0.10805434397646932, + "learning_rate": 0.00017840962349706288, + "loss": 0.6694, + "step": 2648 + }, + { + "epoch": 0.23685622317596566, + "grad_norm": 0.12130890240437392, + "learning_rate": 0.00017839164642839133, + "loss": 0.6794, + "step": 2649 + }, + { + "epoch": 0.2369456366237482, + "grad_norm": 0.1144313156925462, + "learning_rate": 0.000178373662785107, + "loss": 0.6873, + "step": 2650 + }, + { + "epoch": 0.23703505007153075, + "grad_norm": 0.12266791656056864, + "learning_rate": 0.00017835567256871827, + "loss": 0.67, + "step": 2651 + }, + { + "epoch": 0.2371244635193133, + "grad_norm": 0.1154364368753262, + "learning_rate": 0.00017833767578073393, + "loss": 0.6596, + "step": 2652 + }, + { + "epoch": 0.23721387696709584, + "grad_norm": 0.1180755492219964, + "learning_rate": 0.0001783196724226633, + "loss": 0.6834, + "step": 2653 + }, + { + "epoch": 0.2373032904148784, + "grad_norm": 0.12064759759687882, + "learning_rate": 0.00017830166249601637, + "loss": 0.6983, + "step": 2654 + }, + { + "epoch": 0.23739270386266095, + "grad_norm": 0.13357795681500167, + "learning_rate": 0.00017828364600230352, + "loss": 0.6753, + "step": 2655 + }, + { + "epoch": 0.2374821173104435, + "grad_norm": 0.12807548695842105, + "learning_rate": 0.00017826562294303585, + "loss": 0.6959, + "step": 2656 + }, + { + "epoch": 0.23757153075822604, + "grad_norm": 0.11610388495857422, + "learning_rate": 0.0001782475933197249, + "loss": 0.7109, + "step": 2657 + }, + { + "epoch": 0.23766094420600858, + "grad_norm": 0.11980634751170804, + "learning_rate": 0.00017822955713388277, + "loss": 0.6587, + "step": 2658 + }, + { + "epoch": 0.23775035765379113, + "grad_norm": 0.13096557734405337, + "learning_rate": 0.0001782115143870222, + "loss": 0.6683, + "step": 2659 + }, + { + "epoch": 0.23783977110157367, + "grad_norm": 0.12359040389081283, + "learning_rate": 0.00017819346508065635, + "loss": 0.6915, + "step": 2660 + }, + { + "epoch": 0.23792918454935622, + "grad_norm": 0.12959861856180566, + "learning_rate": 0.00017817540921629904, + "loss": 0.7055, + "step": 2661 + }, + { + "epoch": 0.23801859799713876, + "grad_norm": 0.1172531349760748, + "learning_rate": 0.00017815734679546457, + "loss": 0.6655, + "step": 2662 + }, + { + "epoch": 0.2381080114449213, + "grad_norm": 0.12913217303705246, + "learning_rate": 0.00017813927781966778, + "loss": 0.6359, + "step": 2663 + }, + { + "epoch": 0.23819742489270387, + "grad_norm": 0.12289830038668613, + "learning_rate": 0.00017812120229042416, + "loss": 0.6649, + "step": 2664 + }, + { + "epoch": 0.23828683834048642, + "grad_norm": 0.12760718738724416, + "learning_rate": 0.00017810312020924963, + "loss": 0.6976, + "step": 2665 + }, + { + "epoch": 0.23837625178826896, + "grad_norm": 0.12255009502092373, + "learning_rate": 0.00017808503157766073, + "loss": 0.6905, + "step": 2666 + }, + { + "epoch": 0.2384656652360515, + "grad_norm": 0.12248969288379422, + "learning_rate": 0.00017806693639717456, + "loss": 0.7107, + "step": 2667 + }, + { + "epoch": 0.23855507868383405, + "grad_norm": 0.1169723177655417, + "learning_rate": 0.0001780488346693087, + "loss": 0.681, + "step": 2668 + }, + { + "epoch": 0.2386444921316166, + "grad_norm": 0.12318344961867721, + "learning_rate": 0.00017803072639558133, + "loss": 0.6754, + "step": 2669 + }, + { + "epoch": 0.23873390557939914, + "grad_norm": 0.11448928133489374, + "learning_rate": 0.0001780126115775112, + "loss": 0.6627, + "step": 2670 + }, + { + "epoch": 0.23882331902718168, + "grad_norm": 0.10634625475137893, + "learning_rate": 0.00017799449021661752, + "loss": 0.6618, + "step": 2671 + }, + { + "epoch": 0.23891273247496422, + "grad_norm": 0.10274333896500747, + "learning_rate": 0.00017797636231442016, + "loss": 0.632, + "step": 2672 + }, + { + "epoch": 0.2390021459227468, + "grad_norm": 0.12014457025084428, + "learning_rate": 0.00017795822787243946, + "loss": 0.6857, + "step": 2673 + }, + { + "epoch": 0.23909155937052934, + "grad_norm": 0.1186588643947202, + "learning_rate": 0.0001779400868921963, + "loss": 0.7105, + "step": 2674 + }, + { + "epoch": 0.23918097281831188, + "grad_norm": 0.13688160559161489, + "learning_rate": 0.00017792193937521224, + "loss": 0.7112, + "step": 2675 + }, + { + "epoch": 0.23927038626609443, + "grad_norm": 0.1392256064037472, + "learning_rate": 0.0001779037853230092, + "loss": 0.7243, + "step": 2676 + }, + { + "epoch": 0.23935979971387697, + "grad_norm": 0.11778369579336731, + "learning_rate": 0.00017788562473710978, + "loss": 0.6907, + "step": 2677 + }, + { + "epoch": 0.2394492131616595, + "grad_norm": 0.10716985707380798, + "learning_rate": 0.00017786745761903708, + "loss": 0.6621, + "step": 2678 + }, + { + "epoch": 0.23953862660944206, + "grad_norm": 0.11078949905772843, + "learning_rate": 0.00017784928397031476, + "loss": 0.6772, + "step": 2679 + }, + { + "epoch": 0.2396280400572246, + "grad_norm": 0.1155914150532143, + "learning_rate": 0.00017783110379246696, + "loss": 0.6285, + "step": 2680 + }, + { + "epoch": 0.23971745350500714, + "grad_norm": 0.1436813588001681, + "learning_rate": 0.00017781291708701853, + "loss": 0.7045, + "step": 2681 + }, + { + "epoch": 0.2398068669527897, + "grad_norm": 0.14125956880738438, + "learning_rate": 0.0001777947238554947, + "loss": 0.7164, + "step": 2682 + }, + { + "epoch": 0.23989628040057226, + "grad_norm": 0.1311718891048215, + "learning_rate": 0.00017777652409942132, + "loss": 0.6755, + "step": 2683 + }, + { + "epoch": 0.2399856938483548, + "grad_norm": 0.11463291240670873, + "learning_rate": 0.00017775831782032483, + "loss": 0.7117, + "step": 2684 + }, + { + "epoch": 0.24007510729613735, + "grad_norm": 0.1210656403025832, + "learning_rate": 0.00017774010501973208, + "loss": 0.6878, + "step": 2685 + }, + { + "epoch": 0.2401645207439199, + "grad_norm": 0.10906359363839563, + "learning_rate": 0.00017772188569917065, + "loss": 0.5797, + "step": 2686 + }, + { + "epoch": 0.24025393419170243, + "grad_norm": 0.11141808941345346, + "learning_rate": 0.00017770365986016852, + "loss": 0.6652, + "step": 2687 + }, + { + "epoch": 0.24034334763948498, + "grad_norm": 0.12862438343668947, + "learning_rate": 0.00017768542750425426, + "loss": 0.6797, + "step": 2688 + }, + { + "epoch": 0.24043276108726752, + "grad_norm": 0.13271846486890124, + "learning_rate": 0.00017766718863295705, + "loss": 0.6813, + "step": 2689 + }, + { + "epoch": 0.24052217453505007, + "grad_norm": 0.13175592784602377, + "learning_rate": 0.00017764894324780653, + "loss": 0.731, + "step": 2690 + }, + { + "epoch": 0.2406115879828326, + "grad_norm": 0.12399289245095027, + "learning_rate": 0.0001776306913503329, + "loss": 0.6759, + "step": 2691 + }, + { + "epoch": 0.24070100143061515, + "grad_norm": 0.12653983232351723, + "learning_rate": 0.00017761243294206694, + "loss": 0.6681, + "step": 2692 + }, + { + "epoch": 0.24079041487839772, + "grad_norm": 0.12464506997226699, + "learning_rate": 0.00017759416802453997, + "loss": 0.6668, + "step": 2693 + }, + { + "epoch": 0.24087982832618027, + "grad_norm": 0.13435010510667608, + "learning_rate": 0.0001775758965992838, + "loss": 0.7019, + "step": 2694 + }, + { + "epoch": 0.2409692417739628, + "grad_norm": 0.14205605557859757, + "learning_rate": 0.0001775576186678309, + "loss": 0.6924, + "step": 2695 + }, + { + "epoch": 0.24105865522174535, + "grad_norm": 0.1366438437517622, + "learning_rate": 0.00017753933423171421, + "loss": 0.7254, + "step": 2696 + }, + { + "epoch": 0.2411480686695279, + "grad_norm": 0.13246666695377485, + "learning_rate": 0.00017752104329246717, + "loss": 0.702, + "step": 2697 + }, + { + "epoch": 0.24123748211731044, + "grad_norm": 0.10870314232472417, + "learning_rate": 0.00017750274585162385, + "loss": 0.6657, + "step": 2698 + }, + { + "epoch": 0.24132689556509299, + "grad_norm": 0.1282326004228628, + "learning_rate": 0.00017748444191071884, + "loss": 0.6869, + "step": 2699 + }, + { + "epoch": 0.24141630901287553, + "grad_norm": 0.11132304239040978, + "learning_rate": 0.00017746613147128726, + "loss": 0.6807, + "step": 2700 + }, + { + "epoch": 0.24150572246065807, + "grad_norm": 0.12706547035312324, + "learning_rate": 0.0001774478145348648, + "loss": 0.596, + "step": 2701 + }, + { + "epoch": 0.24159513590844062, + "grad_norm": 0.12399077402328892, + "learning_rate": 0.00017742949110298767, + "loss": 0.6475, + "step": 2702 + }, + { + "epoch": 0.2416845493562232, + "grad_norm": 0.11627933216673314, + "learning_rate": 0.00017741116117719262, + "loss": 0.6674, + "step": 2703 + }, + { + "epoch": 0.24177396280400573, + "grad_norm": 0.1316602573077454, + "learning_rate": 0.000177392824759017, + "loss": 0.7343, + "step": 2704 + }, + { + "epoch": 0.24186337625178828, + "grad_norm": 0.10835580379747231, + "learning_rate": 0.0001773744818499986, + "loss": 0.6369, + "step": 2705 + }, + { + "epoch": 0.24195278969957082, + "grad_norm": 0.12022624725311418, + "learning_rate": 0.00017735613245167586, + "loss": 0.6852, + "step": 2706 + }, + { + "epoch": 0.24204220314735336, + "grad_norm": 0.12816410630667707, + "learning_rate": 0.00017733777656558773, + "loss": 0.6973, + "step": 2707 + }, + { + "epoch": 0.2421316165951359, + "grad_norm": 0.11745943868309061, + "learning_rate": 0.00017731941419327365, + "loss": 0.6726, + "step": 2708 + }, + { + "epoch": 0.24222103004291845, + "grad_norm": 0.12568352345191391, + "learning_rate": 0.0001773010453362737, + "loss": 0.6781, + "step": 2709 + }, + { + "epoch": 0.242310443490701, + "grad_norm": 0.1221585798310315, + "learning_rate": 0.00017728266999612844, + "loss": 0.671, + "step": 2710 + }, + { + "epoch": 0.24239985693848354, + "grad_norm": 0.11606191326183517, + "learning_rate": 0.000177264288174379, + "loss": 0.684, + "step": 2711 + }, + { + "epoch": 0.24248927038626608, + "grad_norm": 0.11834611245491597, + "learning_rate": 0.00017724589987256698, + "loss": 0.6744, + "step": 2712 + }, + { + "epoch": 0.24257868383404865, + "grad_norm": 0.12015478778908993, + "learning_rate": 0.00017722750509223465, + "loss": 0.6981, + "step": 2713 + }, + { + "epoch": 0.2426680972818312, + "grad_norm": 0.13657552192107594, + "learning_rate": 0.0001772091038349247, + "loss": 0.7064, + "step": 2714 + }, + { + "epoch": 0.24275751072961374, + "grad_norm": 0.13009759130692614, + "learning_rate": 0.00017719069610218048, + "loss": 0.6856, + "step": 2715 + }, + { + "epoch": 0.24284692417739628, + "grad_norm": 0.12753377063160667, + "learning_rate": 0.00017717228189554582, + "loss": 0.7101, + "step": 2716 + }, + { + "epoch": 0.24293633762517883, + "grad_norm": 0.11133518741675988, + "learning_rate": 0.00017715386121656507, + "loss": 0.6727, + "step": 2717 + }, + { + "epoch": 0.24302575107296137, + "grad_norm": 0.12205012344181053, + "learning_rate": 0.00017713543406678315, + "loss": 0.6646, + "step": 2718 + }, + { + "epoch": 0.24311516452074391, + "grad_norm": 0.13519008647377923, + "learning_rate": 0.0001771170004477455, + "loss": 0.6804, + "step": 2719 + }, + { + "epoch": 0.24320457796852646, + "grad_norm": 0.11315441282285923, + "learning_rate": 0.0001770985603609982, + "loss": 0.6712, + "step": 2720 + }, + { + "epoch": 0.243293991416309, + "grad_norm": 0.11711286647666133, + "learning_rate": 0.00017708011380808774, + "loss": 0.6726, + "step": 2721 + }, + { + "epoch": 0.24338340486409155, + "grad_norm": 0.12625725787512307, + "learning_rate": 0.00017706166079056124, + "loss": 0.6767, + "step": 2722 + }, + { + "epoch": 0.24347281831187412, + "grad_norm": 0.12771214178575688, + "learning_rate": 0.0001770432013099663, + "loss": 0.7485, + "step": 2723 + }, + { + "epoch": 0.24356223175965666, + "grad_norm": 0.11797301685309322, + "learning_rate": 0.0001770247353678511, + "loss": 0.691, + "step": 2724 + }, + { + "epoch": 0.2436516452074392, + "grad_norm": 0.12863509524734457, + "learning_rate": 0.0001770062629657644, + "loss": 0.7075, + "step": 2725 + }, + { + "epoch": 0.24374105865522175, + "grad_norm": 0.11649570546379877, + "learning_rate": 0.0001769877841052554, + "loss": 0.6909, + "step": 2726 + }, + { + "epoch": 0.2438304721030043, + "grad_norm": 0.13634015490207094, + "learning_rate": 0.00017696929878787394, + "loss": 0.6881, + "step": 2727 + }, + { + "epoch": 0.24391988555078684, + "grad_norm": 0.11646515938870226, + "learning_rate": 0.00017695080701517034, + "loss": 0.6741, + "step": 2728 + }, + { + "epoch": 0.24400929899856938, + "grad_norm": 0.12468194616172189, + "learning_rate": 0.00017693230878869547, + "loss": 0.6147, + "step": 2729 + }, + { + "epoch": 0.24409871244635192, + "grad_norm": 0.11725397311138396, + "learning_rate": 0.00017691380411000079, + "loss": 0.6782, + "step": 2730 + }, + { + "epoch": 0.24418812589413447, + "grad_norm": 0.1415862068113671, + "learning_rate": 0.00017689529298063822, + "loss": 0.673, + "step": 2731 + }, + { + "epoch": 0.244277539341917, + "grad_norm": 0.12234376612755296, + "learning_rate": 0.00017687677540216033, + "loss": 0.6835, + "step": 2732 + }, + { + "epoch": 0.24436695278969958, + "grad_norm": 0.12953931057430626, + "learning_rate": 0.00017685825137612012, + "loss": 0.685, + "step": 2733 + }, + { + "epoch": 0.24445636623748213, + "grad_norm": 0.12359741944835526, + "learning_rate": 0.00017683972090407123, + "loss": 0.6632, + "step": 2734 + }, + { + "epoch": 0.24454577968526467, + "grad_norm": 0.1251867121881118, + "learning_rate": 0.00017682118398756766, + "loss": 0.6418, + "step": 2735 + }, + { + "epoch": 0.2446351931330472, + "grad_norm": 0.12751910419042536, + "learning_rate": 0.0001768026406281642, + "loss": 0.7062, + "step": 2736 + }, + { + "epoch": 0.24472460658082976, + "grad_norm": 0.1332395011289846, + "learning_rate": 0.00017678409082741604, + "loss": 0.7018, + "step": 2737 + }, + { + "epoch": 0.2448140200286123, + "grad_norm": 0.11638106183483192, + "learning_rate": 0.00017676553458687892, + "loss": 0.6468, + "step": 2738 + }, + { + "epoch": 0.24490343347639484, + "grad_norm": 0.1154829083058072, + "learning_rate": 0.00017674697190810912, + "loss": 0.6928, + "step": 2739 + }, + { + "epoch": 0.2449928469241774, + "grad_norm": 0.1169503981830623, + "learning_rate": 0.00017672840279266345, + "loss": 0.57, + "step": 2740 + }, + { + "epoch": 0.24508226037195993, + "grad_norm": 0.1277551603459908, + "learning_rate": 0.00017670982724209933, + "loss": 0.6965, + "step": 2741 + }, + { + "epoch": 0.2451716738197425, + "grad_norm": 0.12591456234506496, + "learning_rate": 0.00017669124525797463, + "loss": 0.6997, + "step": 2742 + }, + { + "epoch": 0.24526108726752505, + "grad_norm": 0.13029245611214468, + "learning_rate": 0.0001766726568418478, + "loss": 0.6664, + "step": 2743 + }, + { + "epoch": 0.2453505007153076, + "grad_norm": 0.1283124145714901, + "learning_rate": 0.00017665406199527785, + "loss": 0.7062, + "step": 2744 + }, + { + "epoch": 0.24543991416309013, + "grad_norm": 0.12694901764263233, + "learning_rate": 0.00017663546071982432, + "loss": 0.6698, + "step": 2745 + }, + { + "epoch": 0.24552932761087268, + "grad_norm": 0.10610670503413401, + "learning_rate": 0.0001766168530170472, + "loss": 0.6446, + "step": 2746 + }, + { + "epoch": 0.24561874105865522, + "grad_norm": 0.12262011257457188, + "learning_rate": 0.00017659823888850715, + "loss": 0.7005, + "step": 2747 + }, + { + "epoch": 0.24570815450643776, + "grad_norm": 0.13461371245052228, + "learning_rate": 0.00017657961833576535, + "loss": 0.7033, + "step": 2748 + }, + { + "epoch": 0.2457975679542203, + "grad_norm": 0.13716620329559068, + "learning_rate": 0.0001765609913603834, + "loss": 0.7215, + "step": 2749 + }, + { + "epoch": 0.24588698140200285, + "grad_norm": 0.12636237254316562, + "learning_rate": 0.00017654235796392363, + "loss": 0.6664, + "step": 2750 + }, + { + "epoch": 0.2459763948497854, + "grad_norm": 0.13110079991873597, + "learning_rate": 0.0001765237181479487, + "loss": 0.6674, + "step": 2751 + }, + { + "epoch": 0.24606580829756797, + "grad_norm": 0.12221154275671281, + "learning_rate": 0.00017650507191402194, + "loss": 0.687, + "step": 2752 + }, + { + "epoch": 0.2461552217453505, + "grad_norm": 0.1303351438909407, + "learning_rate": 0.0001764864192637072, + "loss": 0.651, + "step": 2753 + }, + { + "epoch": 0.24624463519313305, + "grad_norm": 0.13486726734411822, + "learning_rate": 0.00017646776019856884, + "loss": 0.6817, + "step": 2754 + }, + { + "epoch": 0.2463340486409156, + "grad_norm": 0.14167527831082027, + "learning_rate": 0.0001764490947201718, + "loss": 0.7011, + "step": 2755 + }, + { + "epoch": 0.24642346208869814, + "grad_norm": 0.1286514482740099, + "learning_rate": 0.00017643042283008148, + "loss": 0.7298, + "step": 2756 + }, + { + "epoch": 0.24651287553648069, + "grad_norm": 0.10249144802015314, + "learning_rate": 0.00017641174452986396, + "loss": 0.6668, + "step": 2757 + }, + { + "epoch": 0.24660228898426323, + "grad_norm": 0.12230272410686598, + "learning_rate": 0.00017639305982108567, + "loss": 0.6747, + "step": 2758 + }, + { + "epoch": 0.24669170243204577, + "grad_norm": 0.1402252442752135, + "learning_rate": 0.0001763743687053137, + "loss": 0.7202, + "step": 2759 + }, + { + "epoch": 0.24678111587982832, + "grad_norm": 0.10825361576971364, + "learning_rate": 0.0001763556711841157, + "loss": 0.684, + "step": 2760 + }, + { + "epoch": 0.24687052932761086, + "grad_norm": 0.11255259336058915, + "learning_rate": 0.00017633696725905974, + "loss": 0.6434, + "step": 2761 + }, + { + "epoch": 0.24695994277539343, + "grad_norm": 0.11357367348678292, + "learning_rate": 0.00017631825693171453, + "loss": 0.592, + "step": 2762 + }, + { + "epoch": 0.24704935622317598, + "grad_norm": 0.1298073707620364, + "learning_rate": 0.0001762995402036493, + "loss": 0.6608, + "step": 2763 + }, + { + "epoch": 0.24713876967095852, + "grad_norm": 0.12623970118540787, + "learning_rate": 0.00017628081707643376, + "loss": 0.6618, + "step": 2764 + }, + { + "epoch": 0.24722818311874106, + "grad_norm": 0.12359385923337364, + "learning_rate": 0.00017626208755163822, + "loss": 0.699, + "step": 2765 + }, + { + "epoch": 0.2473175965665236, + "grad_norm": 0.11122245776121334, + "learning_rate": 0.0001762433516308335, + "loss": 0.6613, + "step": 2766 + }, + { + "epoch": 0.24740701001430615, + "grad_norm": 0.10665828129415425, + "learning_rate": 0.00017622460931559098, + "loss": 0.7009, + "step": 2767 + }, + { + "epoch": 0.2474964234620887, + "grad_norm": 0.12270853523384873, + "learning_rate": 0.00017620586060748252, + "loss": 0.7223, + "step": 2768 + }, + { + "epoch": 0.24758583690987124, + "grad_norm": 0.10405374470798437, + "learning_rate": 0.00017618710550808056, + "loss": 0.6521, + "step": 2769 + }, + { + "epoch": 0.24767525035765378, + "grad_norm": 0.11819931152186344, + "learning_rate": 0.00017616834401895805, + "loss": 0.6881, + "step": 2770 + }, + { + "epoch": 0.24776466380543632, + "grad_norm": 0.1066517297741388, + "learning_rate": 0.0001761495761416885, + "loss": 0.68, + "step": 2771 + }, + { + "epoch": 0.2478540772532189, + "grad_norm": 0.13129713103786103, + "learning_rate": 0.00017613080187784603, + "loss": 0.7164, + "step": 2772 + }, + { + "epoch": 0.24794349070100144, + "grad_norm": 0.11325094109253679, + "learning_rate": 0.00017611202122900512, + "loss": 0.6664, + "step": 2773 + }, + { + "epoch": 0.24803290414878398, + "grad_norm": 0.10320750316721528, + "learning_rate": 0.0001760932341967409, + "loss": 0.6585, + "step": 2774 + }, + { + "epoch": 0.24812231759656653, + "grad_norm": 0.12335943231349804, + "learning_rate": 0.00017607444078262903, + "loss": 0.6608, + "step": 2775 + }, + { + "epoch": 0.24821173104434907, + "grad_norm": 0.12068269072596903, + "learning_rate": 0.00017605564098824568, + "loss": 0.7097, + "step": 2776 + }, + { + "epoch": 0.24830114449213161, + "grad_norm": 0.11451436618613328, + "learning_rate": 0.00017603683481516762, + "loss": 0.662, + "step": 2777 + }, + { + "epoch": 0.24839055793991416, + "grad_norm": 0.11883137910353361, + "learning_rate": 0.000176018022264972, + "loss": 0.6551, + "step": 2778 + }, + { + "epoch": 0.2484799713876967, + "grad_norm": 0.1187696738661592, + "learning_rate": 0.00017599920333923668, + "loss": 0.6407, + "step": 2779 + }, + { + "epoch": 0.24856938483547925, + "grad_norm": 0.11756970276337292, + "learning_rate": 0.00017598037803953994, + "loss": 0.691, + "step": 2780 + }, + { + "epoch": 0.2486587982832618, + "grad_norm": 0.12140410823467547, + "learning_rate": 0.00017596154636746066, + "loss": 0.6621, + "step": 2781 + }, + { + "epoch": 0.24874821173104436, + "grad_norm": 0.14142111189996484, + "learning_rate": 0.00017594270832457825, + "loss": 0.7103, + "step": 2782 + }, + { + "epoch": 0.2488376251788269, + "grad_norm": 0.1413362783538041, + "learning_rate": 0.0001759238639124726, + "loss": 0.721, + "step": 2783 + }, + { + "epoch": 0.24892703862660945, + "grad_norm": 0.1104221057765417, + "learning_rate": 0.00017590501313272415, + "loss": 0.6918, + "step": 2784 + }, + { + "epoch": 0.249016452074392, + "grad_norm": 0.10815623105569236, + "learning_rate": 0.00017588615598691397, + "loss": 0.676, + "step": 2785 + }, + { + "epoch": 0.24910586552217454, + "grad_norm": 0.12048587521620334, + "learning_rate": 0.00017586729247662345, + "loss": 0.7029, + "step": 2786 + }, + { + "epoch": 0.24919527896995708, + "grad_norm": 0.13198249086963718, + "learning_rate": 0.00017584842260343482, + "loss": 0.7151, + "step": 2787 + }, + { + "epoch": 0.24928469241773962, + "grad_norm": 0.1120369425863664, + "learning_rate": 0.00017582954636893055, + "loss": 0.6622, + "step": 2788 + }, + { + "epoch": 0.24937410586552217, + "grad_norm": 0.11183946716621189, + "learning_rate": 0.0001758106637746938, + "loss": 0.6398, + "step": 2789 + }, + { + "epoch": 0.2494635193133047, + "grad_norm": 0.14061515821766413, + "learning_rate": 0.00017579177482230824, + "loss": 0.7233, + "step": 2790 + }, + { + "epoch": 0.24955293276108725, + "grad_norm": 0.1285397469814573, + "learning_rate": 0.00017577287951335807, + "loss": 0.6633, + "step": 2791 + }, + { + "epoch": 0.24964234620886983, + "grad_norm": 0.12730832321315447, + "learning_rate": 0.00017575397784942799, + "loss": 0.7024, + "step": 2792 + }, + { + "epoch": 0.24973175965665237, + "grad_norm": 0.12458823271372006, + "learning_rate": 0.00017573506983210329, + "loss": 0.645, + "step": 2793 + }, + { + "epoch": 0.2498211731044349, + "grad_norm": 0.14166602113361076, + "learning_rate": 0.00017571615546296972, + "loss": 0.6809, + "step": 2794 + }, + { + "epoch": 0.24991058655221746, + "grad_norm": 0.120423575723331, + "learning_rate": 0.00017569723474361365, + "loss": 0.6993, + "step": 2795 + }, + { + "epoch": 0.25, + "grad_norm": 0.10641894223694481, + "learning_rate": 0.00017567830767562198, + "loss": 0.684, + "step": 2796 + }, + { + "epoch": 0.25008941344778257, + "grad_norm": 0.12272558684723989, + "learning_rate": 0.00017565937426058196, + "loss": 0.716, + "step": 2797 + }, + { + "epoch": 0.2501788268955651, + "grad_norm": 0.119060516502856, + "learning_rate": 0.00017564043450008163, + "loss": 0.6812, + "step": 2798 + }, + { + "epoch": 0.25026824034334766, + "grad_norm": 0.12690846302318873, + "learning_rate": 0.0001756214883957094, + "loss": 0.6932, + "step": 2799 + }, + { + "epoch": 0.2503576537911302, + "grad_norm": 0.1351453459812743, + "learning_rate": 0.00017560253594905425, + "loss": 0.6286, + "step": 2800 + }, + { + "epoch": 0.25044706723891275, + "grad_norm": 0.10932687848266213, + "learning_rate": 0.00017558357716170573, + "loss": 0.6401, + "step": 2801 + }, + { + "epoch": 0.25053648068669526, + "grad_norm": 0.13362515952972057, + "learning_rate": 0.00017556461203525387, + "loss": 0.6629, + "step": 2802 + }, + { + "epoch": 0.25062589413447783, + "grad_norm": 0.13520502087717953, + "learning_rate": 0.00017554564057128928, + "loss": 0.6582, + "step": 2803 + }, + { + "epoch": 0.25071530758226035, + "grad_norm": 0.11243159461546676, + "learning_rate": 0.00017552666277140304, + "loss": 0.6721, + "step": 2804 + }, + { + "epoch": 0.2508047210300429, + "grad_norm": 0.12206579549272432, + "learning_rate": 0.0001755076786371868, + "loss": 0.657, + "step": 2805 + }, + { + "epoch": 0.2508941344778255, + "grad_norm": 0.1287783345075806, + "learning_rate": 0.00017548868817023275, + "loss": 0.6775, + "step": 2806 + }, + { + "epoch": 0.250983547925608, + "grad_norm": 0.12674617656195572, + "learning_rate": 0.00017546969137213357, + "loss": 0.6841, + "step": 2807 + }, + { + "epoch": 0.2510729613733906, + "grad_norm": 0.12227227525379879, + "learning_rate": 0.00017545068824448255, + "loss": 0.6765, + "step": 2808 + }, + { + "epoch": 0.2511623748211731, + "grad_norm": 0.12621206225872458, + "learning_rate": 0.0001754316787888734, + "loss": 0.6495, + "step": 2809 + }, + { + "epoch": 0.25125178826895567, + "grad_norm": 0.11354037110768449, + "learning_rate": 0.00017541266300690047, + "loss": 0.6607, + "step": 2810 + }, + { + "epoch": 0.2513412017167382, + "grad_norm": 0.11138009315964274, + "learning_rate": 0.00017539364090015855, + "loss": 0.7053, + "step": 2811 + }, + { + "epoch": 0.25143061516452075, + "grad_norm": 0.11380629390382611, + "learning_rate": 0.00017537461247024304, + "loss": 0.6349, + "step": 2812 + }, + { + "epoch": 0.25152002861230327, + "grad_norm": 0.12306345396201926, + "learning_rate": 0.0001753555777187498, + "loss": 0.7118, + "step": 2813 + }, + { + "epoch": 0.25160944206008584, + "grad_norm": 0.1361424942348374, + "learning_rate": 0.00017533653664727529, + "loss": 0.6758, + "step": 2814 + }, + { + "epoch": 0.25169885550786836, + "grad_norm": 0.12050916393413319, + "learning_rate": 0.0001753174892574164, + "loss": 0.7152, + "step": 2815 + }, + { + "epoch": 0.25178826895565093, + "grad_norm": 0.1442987777685469, + "learning_rate": 0.00017529843555077066, + "loss": 0.7231, + "step": 2816 + }, + { + "epoch": 0.2518776824034335, + "grad_norm": 0.10928855271795214, + "learning_rate": 0.00017527937552893605, + "loss": 0.636, + "step": 2817 + }, + { + "epoch": 0.251967095851216, + "grad_norm": 0.12047062035003647, + "learning_rate": 0.00017526030919351113, + "loss": 0.6985, + "step": 2818 + }, + { + "epoch": 0.2520565092989986, + "grad_norm": 0.13285937189187394, + "learning_rate": 0.000175241236546095, + "loss": 0.7078, + "step": 2819 + }, + { + "epoch": 0.2521459227467811, + "grad_norm": 0.1351525610070296, + "learning_rate": 0.00017522215758828722, + "loss": 0.6183, + "step": 2820 + }, + { + "epoch": 0.2522353361945637, + "grad_norm": 0.1283689868345731, + "learning_rate": 0.0001752030723216879, + "loss": 0.7174, + "step": 2821 + }, + { + "epoch": 0.2523247496423462, + "grad_norm": 0.12405442951910309, + "learning_rate": 0.00017518398074789775, + "loss": 0.6782, + "step": 2822 + }, + { + "epoch": 0.25241416309012876, + "grad_norm": 0.12712002000861877, + "learning_rate": 0.00017516488286851794, + "loss": 0.6797, + "step": 2823 + }, + { + "epoch": 0.2525035765379113, + "grad_norm": 0.10971297778559776, + "learning_rate": 0.00017514577868515016, + "loss": 0.6773, + "step": 2824 + }, + { + "epoch": 0.25259298998569385, + "grad_norm": 0.12511116114391133, + "learning_rate": 0.0001751266681993967, + "loss": 0.6894, + "step": 2825 + }, + { + "epoch": 0.2526824034334764, + "grad_norm": 0.13518027328142415, + "learning_rate": 0.00017510755141286028, + "loss": 0.6607, + "step": 2826 + }, + { + "epoch": 0.25277181688125894, + "grad_norm": 0.1209158520361801, + "learning_rate": 0.00017508842832714426, + "loss": 0.6655, + "step": 2827 + }, + { + "epoch": 0.2528612303290415, + "grad_norm": 0.11458648653455965, + "learning_rate": 0.0001750692989438524, + "loss": 0.7042, + "step": 2828 + }, + { + "epoch": 0.252950643776824, + "grad_norm": 0.12383849308536221, + "learning_rate": 0.00017505016326458913, + "loss": 0.7079, + "step": 2829 + }, + { + "epoch": 0.2530400572246066, + "grad_norm": 0.1271234976994649, + "learning_rate": 0.00017503102129095928, + "loss": 0.7168, + "step": 2830 + }, + { + "epoch": 0.2531294706723891, + "grad_norm": 0.10756419057408664, + "learning_rate": 0.0001750118730245683, + "loss": 0.6784, + "step": 2831 + }, + { + "epoch": 0.2532188841201717, + "grad_norm": 0.11942415574237361, + "learning_rate": 0.00017499271846702213, + "loss": 0.6665, + "step": 2832 + }, + { + "epoch": 0.2533082975679542, + "grad_norm": 0.12598807595759542, + "learning_rate": 0.00017497355761992724, + "loss": 0.6713, + "step": 2833 + }, + { + "epoch": 0.25339771101573677, + "grad_norm": 0.12780529826858703, + "learning_rate": 0.00017495439048489063, + "loss": 0.7114, + "step": 2834 + }, + { + "epoch": 0.2534871244635193, + "grad_norm": 0.13721079688519083, + "learning_rate": 0.00017493521706351975, + "loss": 0.7195, + "step": 2835 + }, + { + "epoch": 0.25357653791130186, + "grad_norm": 0.10938528505532004, + "learning_rate": 0.00017491603735742277, + "loss": 0.6519, + "step": 2836 + }, + { + "epoch": 0.25366595135908443, + "grad_norm": 0.12692732306085422, + "learning_rate": 0.0001748968513682082, + "loss": 0.6762, + "step": 2837 + }, + { + "epoch": 0.25375536480686695, + "grad_norm": 0.11273882680345479, + "learning_rate": 0.00017487765909748513, + "loss": 0.6526, + "step": 2838 + }, + { + "epoch": 0.2538447782546495, + "grad_norm": 0.14297251057047503, + "learning_rate": 0.00017485846054686324, + "loss": 0.7072, + "step": 2839 + }, + { + "epoch": 0.25393419170243203, + "grad_norm": 0.11425852478133094, + "learning_rate": 0.00017483925571795268, + "loss": 0.6484, + "step": 2840 + }, + { + "epoch": 0.2540236051502146, + "grad_norm": 0.12341038501136938, + "learning_rate": 0.00017482004461236413, + "loss": 0.7076, + "step": 2841 + }, + { + "epoch": 0.2541130185979971, + "grad_norm": 0.11523008315615849, + "learning_rate": 0.00017480082723170877, + "loss": 0.6759, + "step": 2842 + }, + { + "epoch": 0.2542024320457797, + "grad_norm": 0.11111880673554317, + "learning_rate": 0.00017478160357759838, + "loss": 0.6624, + "step": 2843 + }, + { + "epoch": 0.2542918454935622, + "grad_norm": 0.13798342685931697, + "learning_rate": 0.00017476237365164523, + "loss": 0.6897, + "step": 2844 + }, + { + "epoch": 0.2543812589413448, + "grad_norm": 0.1352151982754971, + "learning_rate": 0.00017474313745546204, + "loss": 0.6826, + "step": 2845 + }, + { + "epoch": 0.25447067238912735, + "grad_norm": 0.1121238070625693, + "learning_rate": 0.00017472389499066223, + "loss": 0.6803, + "step": 2846 + }, + { + "epoch": 0.25456008583690987, + "grad_norm": 0.1339670039962732, + "learning_rate": 0.00017470464625885958, + "loss": 0.6811, + "step": 2847 + }, + { + "epoch": 0.25464949928469244, + "grad_norm": 0.13012760502598975, + "learning_rate": 0.00017468539126166846, + "loss": 0.7048, + "step": 2848 + }, + { + "epoch": 0.25473891273247495, + "grad_norm": 0.13855693637974073, + "learning_rate": 0.0001746661300007038, + "loss": 0.6902, + "step": 2849 + }, + { + "epoch": 0.2548283261802575, + "grad_norm": 0.13266297242700362, + "learning_rate": 0.00017464686247758095, + "loss": 0.7019, + "step": 2850 + }, + { + "epoch": 0.25491773962804004, + "grad_norm": 0.13318677866995712, + "learning_rate": 0.00017462758869391591, + "loss": 0.6946, + "step": 2851 + }, + { + "epoch": 0.2550071530758226, + "grad_norm": 0.12155448594703719, + "learning_rate": 0.00017460830865132513, + "loss": 0.7113, + "step": 2852 + }, + { + "epoch": 0.25509656652360513, + "grad_norm": 0.12583155480617625, + "learning_rate": 0.00017458902235142562, + "loss": 0.699, + "step": 2853 + }, + { + "epoch": 0.2551859799713877, + "grad_norm": 0.12969034144275302, + "learning_rate": 0.00017456972979583486, + "loss": 0.6878, + "step": 2854 + }, + { + "epoch": 0.2552753934191702, + "grad_norm": 0.12101589058475389, + "learning_rate": 0.00017455043098617097, + "loss": 0.7104, + "step": 2855 + }, + { + "epoch": 0.2553648068669528, + "grad_norm": 0.1128502244893988, + "learning_rate": 0.00017453112592405242, + "loss": 0.6804, + "step": 2856 + }, + { + "epoch": 0.25545422031473536, + "grad_norm": 0.1300063916867911, + "learning_rate": 0.00017451181461109835, + "loss": 0.7004, + "step": 2857 + }, + { + "epoch": 0.2555436337625179, + "grad_norm": 0.10675149998067392, + "learning_rate": 0.0001744924970489284, + "loss": 0.6755, + "step": 2858 + }, + { + "epoch": 0.25563304721030045, + "grad_norm": 0.1263452830516575, + "learning_rate": 0.00017447317323916267, + "loss": 0.6877, + "step": 2859 + }, + { + "epoch": 0.25572246065808296, + "grad_norm": 0.12074348284006901, + "learning_rate": 0.00017445384318342185, + "loss": 0.6872, + "step": 2860 + }, + { + "epoch": 0.25581187410586553, + "grad_norm": 0.12611583608946186, + "learning_rate": 0.00017443450688332712, + "loss": 0.6532, + "step": 2861 + }, + { + "epoch": 0.25590128755364805, + "grad_norm": 0.12267353578527176, + "learning_rate": 0.00017441516434050017, + "loss": 0.6782, + "step": 2862 + }, + { + "epoch": 0.2559907010014306, + "grad_norm": 0.11167490289232593, + "learning_rate": 0.0001743958155565633, + "loss": 0.6659, + "step": 2863 + }, + { + "epoch": 0.25608011444921314, + "grad_norm": 0.11947529983704334, + "learning_rate": 0.0001743764605331392, + "loss": 0.6684, + "step": 2864 + }, + { + "epoch": 0.2561695278969957, + "grad_norm": 0.13734116783637976, + "learning_rate": 0.0001743570992718512, + "loss": 0.7539, + "step": 2865 + }, + { + "epoch": 0.2562589413447783, + "grad_norm": 0.12938179651243334, + "learning_rate": 0.00017433773177432307, + "loss": 0.6832, + "step": 2866 + }, + { + "epoch": 0.2563483547925608, + "grad_norm": 0.11604402078917336, + "learning_rate": 0.00017431835804217912, + "loss": 0.6576, + "step": 2867 + }, + { + "epoch": 0.25643776824034337, + "grad_norm": 0.1134655883152843, + "learning_rate": 0.00017429897807704427, + "loss": 0.6293, + "step": 2868 + }, + { + "epoch": 0.2565271816881259, + "grad_norm": 0.12694229161433138, + "learning_rate": 0.00017427959188054385, + "loss": 0.7077, + "step": 2869 + }, + { + "epoch": 0.25661659513590845, + "grad_norm": 0.11125188337497026, + "learning_rate": 0.0001742601994543038, + "loss": 0.6681, + "step": 2870 + }, + { + "epoch": 0.25670600858369097, + "grad_norm": 0.1216474195581543, + "learning_rate": 0.00017424080079995045, + "loss": 0.6722, + "step": 2871 + }, + { + "epoch": 0.25679542203147354, + "grad_norm": 0.13390079708282432, + "learning_rate": 0.00017422139591911085, + "loss": 0.7216, + "step": 2872 + }, + { + "epoch": 0.25688483547925606, + "grad_norm": 0.1253442641797318, + "learning_rate": 0.00017420198481341237, + "loss": 0.6941, + "step": 2873 + }, + { + "epoch": 0.25697424892703863, + "grad_norm": 0.10909427748089695, + "learning_rate": 0.00017418256748448304, + "loss": 0.6424, + "step": 2874 + }, + { + "epoch": 0.2570636623748212, + "grad_norm": 0.13879970902718491, + "learning_rate": 0.0001741631439339514, + "loss": 0.671, + "step": 2875 + }, + { + "epoch": 0.2571530758226037, + "grad_norm": 0.14066716043681635, + "learning_rate": 0.0001741437141634464, + "loss": 0.7061, + "step": 2876 + }, + { + "epoch": 0.2572424892703863, + "grad_norm": 0.12325390508043386, + "learning_rate": 0.00017412427817459767, + "loss": 0.669, + "step": 2877 + }, + { + "epoch": 0.2573319027181688, + "grad_norm": 0.12224138193030479, + "learning_rate": 0.00017410483596903525, + "loss": 0.5869, + "step": 2878 + }, + { + "epoch": 0.2574213161659514, + "grad_norm": 0.13550231199539764, + "learning_rate": 0.0001740853875483897, + "loss": 0.6842, + "step": 2879 + }, + { + "epoch": 0.2575107296137339, + "grad_norm": 0.1470928891858101, + "learning_rate": 0.00017406593291429217, + "loss": 0.6948, + "step": 2880 + }, + { + "epoch": 0.25760014306151646, + "grad_norm": 0.11414722992818128, + "learning_rate": 0.00017404647206837432, + "loss": 0.6704, + "step": 2881 + }, + { + "epoch": 0.257689556509299, + "grad_norm": 0.12303897053397717, + "learning_rate": 0.00017402700501226826, + "loss": 0.6941, + "step": 2882 + }, + { + "epoch": 0.25777896995708155, + "grad_norm": 0.1089323732708137, + "learning_rate": 0.00017400753174760672, + "loss": 0.6478, + "step": 2883 + }, + { + "epoch": 0.25786838340486407, + "grad_norm": 0.127750997084591, + "learning_rate": 0.0001739880522760229, + "loss": 0.6779, + "step": 2884 + }, + { + "epoch": 0.25795779685264664, + "grad_norm": 0.11518479276146407, + "learning_rate": 0.00017396856659915045, + "loss": 0.6429, + "step": 2885 + }, + { + "epoch": 0.2580472103004292, + "grad_norm": 0.10442794272510281, + "learning_rate": 0.00017394907471862363, + "loss": 0.6526, + "step": 2886 + }, + { + "epoch": 0.2581366237482117, + "grad_norm": 0.14464609761922073, + "learning_rate": 0.00017392957663607723, + "loss": 0.6973, + "step": 2887 + }, + { + "epoch": 0.2582260371959943, + "grad_norm": 0.12762659566561663, + "learning_rate": 0.00017391007235314655, + "loss": 0.7001, + "step": 2888 + }, + { + "epoch": 0.2583154506437768, + "grad_norm": 0.11688618242609337, + "learning_rate": 0.00017389056187146733, + "loss": 0.6839, + "step": 2889 + }, + { + "epoch": 0.2584048640915594, + "grad_norm": 0.12075476958806416, + "learning_rate": 0.00017387104519267594, + "loss": 0.7094, + "step": 2890 + }, + { + "epoch": 0.2584942775393419, + "grad_norm": 0.1165121077741563, + "learning_rate": 0.0001738515223184092, + "loss": 0.6453, + "step": 2891 + }, + { + "epoch": 0.25858369098712447, + "grad_norm": 0.1143249530632413, + "learning_rate": 0.00017383199325030448, + "loss": 0.6956, + "step": 2892 + }, + { + "epoch": 0.258673104434907, + "grad_norm": 0.11612704045966664, + "learning_rate": 0.00017381245798999965, + "loss": 0.6642, + "step": 2893 + }, + { + "epoch": 0.25876251788268956, + "grad_norm": 0.11157144718988989, + "learning_rate": 0.00017379291653913311, + "loss": 0.6696, + "step": 2894 + }, + { + "epoch": 0.25885193133047213, + "grad_norm": 0.130892677365369, + "learning_rate": 0.0001737733688993438, + "loss": 0.6578, + "step": 2895 + }, + { + "epoch": 0.25894134477825465, + "grad_norm": 0.11426721496914129, + "learning_rate": 0.00017375381507227108, + "loss": 0.6774, + "step": 2896 + }, + { + "epoch": 0.2590307582260372, + "grad_norm": 0.10970304064416798, + "learning_rate": 0.000173734255059555, + "loss": 0.6469, + "step": 2897 + }, + { + "epoch": 0.25912017167381973, + "grad_norm": 0.12540748636156357, + "learning_rate": 0.000173714688862836, + "loss": 0.7085, + "step": 2898 + }, + { + "epoch": 0.2592095851216023, + "grad_norm": 0.1196224166371979, + "learning_rate": 0.00017369511648375507, + "loss": 0.7046, + "step": 2899 + }, + { + "epoch": 0.2592989985693848, + "grad_norm": 0.12041666215508672, + "learning_rate": 0.00017367553792395373, + "loss": 0.6818, + "step": 2900 + }, + { + "epoch": 0.2593884120171674, + "grad_norm": 0.11847898687133898, + "learning_rate": 0.00017365595318507397, + "loss": 0.678, + "step": 2901 + }, + { + "epoch": 0.2594778254649499, + "grad_norm": 0.114961593536544, + "learning_rate": 0.00017363636226875836, + "loss": 0.6614, + "step": 2902 + }, + { + "epoch": 0.2595672389127325, + "grad_norm": 0.11389929576465166, + "learning_rate": 0.00017361676517665001, + "loss": 0.5873, + "step": 2903 + }, + { + "epoch": 0.259656652360515, + "grad_norm": 0.1327138756074871, + "learning_rate": 0.00017359716191039248, + "loss": 0.6814, + "step": 2904 + }, + { + "epoch": 0.25974606580829757, + "grad_norm": 0.12840059077055263, + "learning_rate": 0.00017357755247162984, + "loss": 0.7097, + "step": 2905 + }, + { + "epoch": 0.25983547925608014, + "grad_norm": 0.14387381360689347, + "learning_rate": 0.00017355793686200675, + "loss": 0.7345, + "step": 2906 + }, + { + "epoch": 0.25992489270386265, + "grad_norm": 0.13200671075013684, + "learning_rate": 0.00017353831508316834, + "loss": 0.6756, + "step": 2907 + }, + { + "epoch": 0.2600143061516452, + "grad_norm": 0.11963110543424885, + "learning_rate": 0.00017351868713676023, + "loss": 0.6742, + "step": 2908 + }, + { + "epoch": 0.26010371959942774, + "grad_norm": 0.13399200001076586, + "learning_rate": 0.00017349905302442863, + "loss": 0.6953, + "step": 2909 + }, + { + "epoch": 0.2601931330472103, + "grad_norm": 0.13555331437489485, + "learning_rate": 0.0001734794127478202, + "loss": 0.7159, + "step": 2910 + }, + { + "epoch": 0.26028254649499283, + "grad_norm": 0.12712387358266805, + "learning_rate": 0.00017345976630858218, + "loss": 0.6853, + "step": 2911 + }, + { + "epoch": 0.2603719599427754, + "grad_norm": 0.11464053044480661, + "learning_rate": 0.00017344011370836227, + "loss": 0.7102, + "step": 2912 + }, + { + "epoch": 0.2604613733905579, + "grad_norm": 0.1172579168266654, + "learning_rate": 0.00017342045494880872, + "loss": 0.6601, + "step": 2913 + }, + { + "epoch": 0.2605507868383405, + "grad_norm": 0.12136908274655187, + "learning_rate": 0.0001734007900315703, + "loss": 0.6875, + "step": 2914 + }, + { + "epoch": 0.26064020028612306, + "grad_norm": 0.11747674407477113, + "learning_rate": 0.00017338111895829624, + "loss": 0.6553, + "step": 2915 + }, + { + "epoch": 0.2607296137339056, + "grad_norm": 0.12935073346197554, + "learning_rate": 0.00017336144173063636, + "loss": 0.7043, + "step": 2916 + }, + { + "epoch": 0.26081902718168815, + "grad_norm": 0.12191312223750173, + "learning_rate": 0.00017334175835024095, + "loss": 0.7001, + "step": 2917 + }, + { + "epoch": 0.26090844062947066, + "grad_norm": 0.11985812027401757, + "learning_rate": 0.00017332206881876086, + "loss": 0.6932, + "step": 2918 + }, + { + "epoch": 0.26099785407725323, + "grad_norm": 0.13953600456764936, + "learning_rate": 0.0001733023731378474, + "loss": 0.7385, + "step": 2919 + }, + { + "epoch": 0.26108726752503575, + "grad_norm": 0.12414598213956338, + "learning_rate": 0.00017328267130915244, + "loss": 0.6373, + "step": 2920 + }, + { + "epoch": 0.2611766809728183, + "grad_norm": 0.11391532223021376, + "learning_rate": 0.00017326296333432833, + "loss": 0.6669, + "step": 2921 + }, + { + "epoch": 0.26126609442060084, + "grad_norm": 0.12390622520219004, + "learning_rate": 0.000173243249215028, + "loss": 0.6709, + "step": 2922 + }, + { + "epoch": 0.2613555078683834, + "grad_norm": 0.11384957099184759, + "learning_rate": 0.00017322352895290477, + "loss": 0.6575, + "step": 2923 + }, + { + "epoch": 0.261444921316166, + "grad_norm": 0.12214272919988266, + "learning_rate": 0.0001732038025496126, + "loss": 0.6748, + "step": 2924 + }, + { + "epoch": 0.2615343347639485, + "grad_norm": 0.12945140256594928, + "learning_rate": 0.0001731840700068059, + "loss": 0.6938, + "step": 2925 + }, + { + "epoch": 0.26162374821173107, + "grad_norm": 0.14216880090293757, + "learning_rate": 0.00017316433132613969, + "loss": 0.7065, + "step": 2926 + }, + { + "epoch": 0.2617131616595136, + "grad_norm": 0.11932454913001905, + "learning_rate": 0.00017314458650926934, + "loss": 0.6687, + "step": 2927 + }, + { + "epoch": 0.26180257510729615, + "grad_norm": 0.12387116332661967, + "learning_rate": 0.00017312483555785086, + "loss": 0.6832, + "step": 2928 + }, + { + "epoch": 0.26189198855507867, + "grad_norm": 0.11197566872361028, + "learning_rate": 0.00017310507847354077, + "loss": 0.673, + "step": 2929 + }, + { + "epoch": 0.26198140200286124, + "grad_norm": 0.10062733737559648, + "learning_rate": 0.00017308531525799597, + "loss": 0.6423, + "step": 2930 + }, + { + "epoch": 0.26207081545064376, + "grad_norm": 0.1294153685975203, + "learning_rate": 0.0001730655459128741, + "loss": 0.6612, + "step": 2931 + }, + { + "epoch": 0.26216022889842633, + "grad_norm": 0.1323014581471112, + "learning_rate": 0.0001730457704398331, + "loss": 0.6822, + "step": 2932 + }, + { + "epoch": 0.26224964234620884, + "grad_norm": 0.13330592450290757, + "learning_rate": 0.00017302598884053153, + "loss": 0.6815, + "step": 2933 + }, + { + "epoch": 0.2623390557939914, + "grad_norm": 0.11414517233349206, + "learning_rate": 0.00017300620111662852, + "loss": 0.6232, + "step": 2934 + }, + { + "epoch": 0.262428469241774, + "grad_norm": 0.12347340201624758, + "learning_rate": 0.00017298640726978357, + "loss": 0.6997, + "step": 2935 + }, + { + "epoch": 0.2625178826895565, + "grad_norm": 0.1300906303486969, + "learning_rate": 0.00017296660730165678, + "loss": 0.6741, + "step": 2936 + }, + { + "epoch": 0.2626072961373391, + "grad_norm": 0.12948852095284324, + "learning_rate": 0.00017294680121390877, + "loss": 0.6639, + "step": 2937 + }, + { + "epoch": 0.2626967095851216, + "grad_norm": 0.11272899693788493, + "learning_rate": 0.00017292698900820064, + "loss": 0.6744, + "step": 2938 + }, + { + "epoch": 0.26278612303290416, + "grad_norm": 0.11751128112263197, + "learning_rate": 0.00017290717068619402, + "loss": 0.6528, + "step": 2939 + }, + { + "epoch": 0.2628755364806867, + "grad_norm": 0.13295040171517178, + "learning_rate": 0.00017288734624955102, + "loss": 0.6951, + "step": 2940 + }, + { + "epoch": 0.26296494992846925, + "grad_norm": 0.139974351259309, + "learning_rate": 0.00017286751569993433, + "loss": 0.7009, + "step": 2941 + }, + { + "epoch": 0.26305436337625177, + "grad_norm": 0.11263311279614854, + "learning_rate": 0.0001728476790390071, + "loss": 0.7151, + "step": 2942 + }, + { + "epoch": 0.26314377682403434, + "grad_norm": 0.1275262671785296, + "learning_rate": 0.00017282783626843302, + "loss": 0.6829, + "step": 2943 + }, + { + "epoch": 0.2632331902718169, + "grad_norm": 0.1369813930774622, + "learning_rate": 0.00017280798738987624, + "loss": 0.6693, + "step": 2944 + }, + { + "epoch": 0.2633226037195994, + "grad_norm": 0.11328107145113063, + "learning_rate": 0.00017278813240500154, + "loss": 0.6195, + "step": 2945 + }, + { + "epoch": 0.263412017167382, + "grad_norm": 0.13878585148640568, + "learning_rate": 0.000172768271315474, + "loss": 0.6862, + "step": 2946 + }, + { + "epoch": 0.2635014306151645, + "grad_norm": 0.12084459594371182, + "learning_rate": 0.00017274840412295948, + "loss": 0.6897, + "step": 2947 + }, + { + "epoch": 0.2635908440629471, + "grad_norm": 0.11314017055186083, + "learning_rate": 0.00017272853082912418, + "loss": 0.6487, + "step": 2948 + }, + { + "epoch": 0.2636802575107296, + "grad_norm": 0.14322970339727814, + "learning_rate": 0.00017270865143563478, + "loss": 0.6916, + "step": 2949 + }, + { + "epoch": 0.26376967095851217, + "grad_norm": 0.14003512349419228, + "learning_rate": 0.00017268876594415863, + "loss": 0.6842, + "step": 2950 + }, + { + "epoch": 0.2638590844062947, + "grad_norm": 0.12044401630907915, + "learning_rate": 0.00017266887435636344, + "loss": 0.6557, + "step": 2951 + }, + { + "epoch": 0.26394849785407726, + "grad_norm": 0.11129636463813393, + "learning_rate": 0.00017264897667391754, + "loss": 0.6487, + "step": 2952 + }, + { + "epoch": 0.2640379113018598, + "grad_norm": 0.1478647314657479, + "learning_rate": 0.0001726290728984897, + "loss": 0.6829, + "step": 2953 + }, + { + "epoch": 0.26412732474964234, + "grad_norm": 0.1490750095268827, + "learning_rate": 0.00017260916303174923, + "loss": 0.7484, + "step": 2954 + }, + { + "epoch": 0.2642167381974249, + "grad_norm": 0.12478997297200903, + "learning_rate": 0.00017258924707536596, + "loss": 0.6872, + "step": 2955 + }, + { + "epoch": 0.26430615164520743, + "grad_norm": 0.11240191007704851, + "learning_rate": 0.00017256932503101018, + "loss": 0.6883, + "step": 2956 + }, + { + "epoch": 0.26439556509299, + "grad_norm": 0.11878410166563937, + "learning_rate": 0.00017254939690035276, + "loss": 0.6862, + "step": 2957 + }, + { + "epoch": 0.2644849785407725, + "grad_norm": 0.13609420186539273, + "learning_rate": 0.00017252946268506505, + "loss": 0.7617, + "step": 2958 + }, + { + "epoch": 0.2645743919885551, + "grad_norm": 0.14192915865363304, + "learning_rate": 0.00017250952238681889, + "loss": 0.706, + "step": 2959 + }, + { + "epoch": 0.2646638054363376, + "grad_norm": 0.12241682651190532, + "learning_rate": 0.00017248957600728664, + "loss": 0.6751, + "step": 2960 + }, + { + "epoch": 0.2647532188841202, + "grad_norm": 0.1417176337021574, + "learning_rate": 0.0001724696235481412, + "loss": 0.6816, + "step": 2961 + }, + { + "epoch": 0.2648426323319027, + "grad_norm": 0.11819657235512197, + "learning_rate": 0.00017244966501105596, + "loss": 0.681, + "step": 2962 + }, + { + "epoch": 0.26493204577968527, + "grad_norm": 0.14374267604215019, + "learning_rate": 0.0001724297003977048, + "loss": 0.7104, + "step": 2963 + }, + { + "epoch": 0.26502145922746784, + "grad_norm": 0.1317178798484022, + "learning_rate": 0.0001724097297097622, + "loss": 0.7343, + "step": 2964 + }, + { + "epoch": 0.26511087267525035, + "grad_norm": 0.10983104682683975, + "learning_rate": 0.00017238975294890297, + "loss": 0.6714, + "step": 2965 + }, + { + "epoch": 0.2652002861230329, + "grad_norm": 0.11489446180393947, + "learning_rate": 0.00017236977011680257, + "loss": 0.6492, + "step": 2966 + }, + { + "epoch": 0.26528969957081544, + "grad_norm": 0.12379052496951407, + "learning_rate": 0.00017234978121513699, + "loss": 0.6972, + "step": 2967 + }, + { + "epoch": 0.265379113018598, + "grad_norm": 0.12125050719732294, + "learning_rate": 0.0001723297862455826, + "loss": 0.6896, + "step": 2968 + }, + { + "epoch": 0.2654685264663805, + "grad_norm": 0.11930322640359649, + "learning_rate": 0.00017230978520981643, + "loss": 0.6741, + "step": 2969 + }, + { + "epoch": 0.2655579399141631, + "grad_norm": 0.11591634451221226, + "learning_rate": 0.00017228977810951584, + "loss": 0.6687, + "step": 2970 + }, + { + "epoch": 0.2656473533619456, + "grad_norm": 0.12470829845810544, + "learning_rate": 0.00017226976494635893, + "loss": 0.6503, + "step": 2971 + }, + { + "epoch": 0.2657367668097282, + "grad_norm": 0.12335745586531718, + "learning_rate": 0.00017224974572202409, + "loss": 0.6852, + "step": 2972 + }, + { + "epoch": 0.2658261802575107, + "grad_norm": 0.13647046517641576, + "learning_rate": 0.0001722297204381903, + "loss": 0.6537, + "step": 2973 + }, + { + "epoch": 0.2659155937052933, + "grad_norm": 0.11448037070961659, + "learning_rate": 0.00017220968909653715, + "loss": 0.6475, + "step": 2974 + }, + { + "epoch": 0.26600500715307585, + "grad_norm": 0.1170999791599391, + "learning_rate": 0.00017218965169874456, + "loss": 0.6955, + "step": 2975 + }, + { + "epoch": 0.26609442060085836, + "grad_norm": 0.13027842911934023, + "learning_rate": 0.00017216960824649303, + "loss": 0.68, + "step": 2976 + }, + { + "epoch": 0.26618383404864093, + "grad_norm": 0.11543504289335245, + "learning_rate": 0.00017214955874146363, + "loss": 0.7004, + "step": 2977 + }, + { + "epoch": 0.26627324749642345, + "grad_norm": 0.12403063114862434, + "learning_rate": 0.00017212950318533788, + "loss": 0.6935, + "step": 2978 + }, + { + "epoch": 0.266362660944206, + "grad_norm": 0.135385532758241, + "learning_rate": 0.00017210944157979783, + "loss": 0.6942, + "step": 2979 + }, + { + "epoch": 0.26645207439198854, + "grad_norm": 0.136729707999011, + "learning_rate": 0.00017208937392652594, + "loss": 0.7346, + "step": 2980 + }, + { + "epoch": 0.2665414878397711, + "grad_norm": 0.12521038908747523, + "learning_rate": 0.0001720693002272054, + "loss": 0.7031, + "step": 2981 + }, + { + "epoch": 0.2666309012875536, + "grad_norm": 0.13498048420727693, + "learning_rate": 0.00017204922048351964, + "loss": 0.6697, + "step": 2982 + }, + { + "epoch": 0.2667203147353362, + "grad_norm": 0.13004082895195868, + "learning_rate": 0.0001720291346971528, + "loss": 0.6912, + "step": 2983 + }, + { + "epoch": 0.26680972818311877, + "grad_norm": 0.12278026185206099, + "learning_rate": 0.0001720090428697894, + "loss": 0.7066, + "step": 2984 + }, + { + "epoch": 0.2668991416309013, + "grad_norm": 0.13998833299071592, + "learning_rate": 0.00017198894500311453, + "loss": 0.7021, + "step": 2985 + }, + { + "epoch": 0.26698855507868385, + "grad_norm": 0.11442621153613011, + "learning_rate": 0.0001719688410988138, + "loss": 0.6777, + "step": 2986 + }, + { + "epoch": 0.26707796852646637, + "grad_norm": 0.11594129617538977, + "learning_rate": 0.00017194873115857328, + "loss": 0.6878, + "step": 2987 + }, + { + "epoch": 0.26716738197424894, + "grad_norm": 0.11431426155178731, + "learning_rate": 0.00017192861518407958, + "loss": 0.6701, + "step": 2988 + }, + { + "epoch": 0.26725679542203146, + "grad_norm": 0.13416340779914301, + "learning_rate": 0.00017190849317701975, + "loss": 0.6825, + "step": 2989 + }, + { + "epoch": 0.26734620886981403, + "grad_norm": 0.11753550193819644, + "learning_rate": 0.00017188836513908152, + "loss": 0.6897, + "step": 2990 + }, + { + "epoch": 0.26743562231759654, + "grad_norm": 0.1160941072686251, + "learning_rate": 0.00017186823107195287, + "loss": 0.6899, + "step": 2991 + }, + { + "epoch": 0.2675250357653791, + "grad_norm": 0.1329352446373353, + "learning_rate": 0.00017184809097732246, + "loss": 0.7016, + "step": 2992 + }, + { + "epoch": 0.2676144492131617, + "grad_norm": 0.12344682771216706, + "learning_rate": 0.00017182794485687944, + "loss": 0.6837, + "step": 2993 + }, + { + "epoch": 0.2677038626609442, + "grad_norm": 0.11429498728925647, + "learning_rate": 0.00017180779271231344, + "loss": 0.6815, + "step": 2994 + }, + { + "epoch": 0.2677932761087268, + "grad_norm": 0.12062777831691417, + "learning_rate": 0.0001717876345453146, + "loss": 0.6855, + "step": 2995 + }, + { + "epoch": 0.2678826895565093, + "grad_norm": 0.11013275783190707, + "learning_rate": 0.00017176747035757355, + "loss": 0.6799, + "step": 2996 + }, + { + "epoch": 0.26797210300429186, + "grad_norm": 0.1351173202017434, + "learning_rate": 0.0001717473001507814, + "loss": 0.7041, + "step": 2997 + }, + { + "epoch": 0.2680615164520744, + "grad_norm": 0.1255519209142153, + "learning_rate": 0.00017172712392662988, + "loss": 0.7322, + "step": 2998 + }, + { + "epoch": 0.26815092989985695, + "grad_norm": 0.12131466800331854, + "learning_rate": 0.00017170694168681106, + "loss": 0.694, + "step": 2999 + }, + { + "epoch": 0.26824034334763946, + "grad_norm": 0.12584411122267894, + "learning_rate": 0.00017168675343301769, + "loss": 0.695, + "step": 3000 + }, + { + "epoch": 0.26832975679542204, + "grad_norm": 0.1398696793189521, + "learning_rate": 0.00017166655916694284, + "loss": 0.7082, + "step": 3001 + }, + { + "epoch": 0.26841917024320455, + "grad_norm": 0.13583865308851537, + "learning_rate": 0.00017164635889028025, + "loss": 0.6834, + "step": 3002 + }, + { + "epoch": 0.2685085836909871, + "grad_norm": 0.13156669645015448, + "learning_rate": 0.00017162615260472402, + "loss": 0.7137, + "step": 3003 + }, + { + "epoch": 0.2685979971387697, + "grad_norm": 0.1334523956162601, + "learning_rate": 0.00017160594031196894, + "loss": 0.6667, + "step": 3004 + }, + { + "epoch": 0.2686874105865522, + "grad_norm": 0.1525893275442804, + "learning_rate": 0.00017158572201371008, + "loss": 0.7143, + "step": 3005 + }, + { + "epoch": 0.2687768240343348, + "grad_norm": 0.112143896867621, + "learning_rate": 0.00017156549771164318, + "loss": 0.6827, + "step": 3006 + }, + { + "epoch": 0.2688662374821173, + "grad_norm": 0.11289741950511938, + "learning_rate": 0.00017154526740746442, + "loss": 0.6834, + "step": 3007 + }, + { + "epoch": 0.26895565092989987, + "grad_norm": 0.12392807067342705, + "learning_rate": 0.00017152503110287048, + "loss": 0.6831, + "step": 3008 + }, + { + "epoch": 0.2690450643776824, + "grad_norm": 0.12022323372674344, + "learning_rate": 0.00017150478879955858, + "loss": 0.6787, + "step": 3009 + }, + { + "epoch": 0.26913447782546496, + "grad_norm": 0.12820617638311152, + "learning_rate": 0.00017148454049922636, + "loss": 0.6692, + "step": 3010 + }, + { + "epoch": 0.2692238912732475, + "grad_norm": 0.11760890171424014, + "learning_rate": 0.00017146428620357212, + "loss": 0.7002, + "step": 3011 + }, + { + "epoch": 0.26931330472103004, + "grad_norm": 0.12442686939433761, + "learning_rate": 0.00017144402591429448, + "loss": 0.6631, + "step": 3012 + }, + { + "epoch": 0.2694027181688126, + "grad_norm": 0.110962889095034, + "learning_rate": 0.00017142375963309262, + "loss": 0.673, + "step": 3013 + }, + { + "epoch": 0.26949213161659513, + "grad_norm": 0.12462207708246682, + "learning_rate": 0.00017140348736166636, + "loss": 0.6861, + "step": 3014 + }, + { + "epoch": 0.2695815450643777, + "grad_norm": 0.11778301124332412, + "learning_rate": 0.00017138320910171584, + "loss": 0.6993, + "step": 3015 + }, + { + "epoch": 0.2696709585121602, + "grad_norm": 0.13973498002875306, + "learning_rate": 0.00017136292485494175, + "loss": 0.6895, + "step": 3016 + }, + { + "epoch": 0.2697603719599428, + "grad_norm": 0.10955401226912483, + "learning_rate": 0.00017134263462304533, + "loss": 0.6849, + "step": 3017 + }, + { + "epoch": 0.2698497854077253, + "grad_norm": 0.1354613937187827, + "learning_rate": 0.00017132233840772836, + "loss": 0.6762, + "step": 3018 + }, + { + "epoch": 0.2699391988555079, + "grad_norm": 0.11517637446353499, + "learning_rate": 0.00017130203621069297, + "loss": 0.6359, + "step": 3019 + }, + { + "epoch": 0.2700286123032904, + "grad_norm": 0.1336335295761634, + "learning_rate": 0.00017128172803364188, + "loss": 0.7072, + "step": 3020 + }, + { + "epoch": 0.27011802575107297, + "grad_norm": 0.1296659535883367, + "learning_rate": 0.0001712614138782784, + "loss": 0.6768, + "step": 3021 + }, + { + "epoch": 0.2702074391988555, + "grad_norm": 0.1325812266897293, + "learning_rate": 0.00017124109374630616, + "loss": 0.6873, + "step": 3022 + }, + { + "epoch": 0.27029685264663805, + "grad_norm": 0.12641752140659707, + "learning_rate": 0.00017122076763942946, + "loss": 0.6967, + "step": 3023 + }, + { + "epoch": 0.2703862660944206, + "grad_norm": 0.12492545529776433, + "learning_rate": 0.00017120043555935298, + "loss": 0.6628, + "step": 3024 + }, + { + "epoch": 0.27047567954220314, + "grad_norm": 0.13497017308013068, + "learning_rate": 0.00017118009750778196, + "loss": 0.6644, + "step": 3025 + }, + { + "epoch": 0.2705650929899857, + "grad_norm": 0.1151626731837942, + "learning_rate": 0.00017115975348642212, + "loss": 0.6799, + "step": 3026 + }, + { + "epoch": 0.2706545064377682, + "grad_norm": 0.11319553890182417, + "learning_rate": 0.00017113940349697967, + "loss": 0.6524, + "step": 3027 + }, + { + "epoch": 0.2707439198855508, + "grad_norm": 0.13673180605264978, + "learning_rate": 0.00017111904754116142, + "loss": 0.7174, + "step": 3028 + }, + { + "epoch": 0.2708333333333333, + "grad_norm": 0.09643668263702604, + "learning_rate": 0.0001710986856206745, + "loss": 0.666, + "step": 3029 + }, + { + "epoch": 0.2709227467811159, + "grad_norm": 0.12044319008425809, + "learning_rate": 0.00017107831773722668, + "loss": 0.685, + "step": 3030 + }, + { + "epoch": 0.2710121602288984, + "grad_norm": 0.1291903970544597, + "learning_rate": 0.00017105794389252622, + "loss": 0.6627, + "step": 3031 + }, + { + "epoch": 0.271101573676681, + "grad_norm": 0.14700376656713499, + "learning_rate": 0.00017103756408828183, + "loss": 0.7093, + "step": 3032 + }, + { + "epoch": 0.27119098712446355, + "grad_norm": 0.12490184129566334, + "learning_rate": 0.0001710171783262027, + "loss": 0.6884, + "step": 3033 + }, + { + "epoch": 0.27128040057224606, + "grad_norm": 0.11283723529571452, + "learning_rate": 0.00017099678660799857, + "loss": 0.6431, + "step": 3034 + }, + { + "epoch": 0.27136981402002863, + "grad_norm": 0.11521605811554873, + "learning_rate": 0.00017097638893537976, + "loss": 0.6788, + "step": 3035 + }, + { + "epoch": 0.27145922746781115, + "grad_norm": 0.11948854286056183, + "learning_rate": 0.00017095598531005688, + "loss": 0.6563, + "step": 3036 + }, + { + "epoch": 0.2715486409155937, + "grad_norm": 0.1238019137535881, + "learning_rate": 0.0001709355757337412, + "loss": 0.6697, + "step": 3037 + }, + { + "epoch": 0.27163805436337624, + "grad_norm": 0.1282769351238472, + "learning_rate": 0.00017091516020814447, + "loss": 0.7135, + "step": 3038 + }, + { + "epoch": 0.2717274678111588, + "grad_norm": 0.1240003033645979, + "learning_rate": 0.0001708947387349789, + "loss": 0.6964, + "step": 3039 + }, + { + "epoch": 0.2718168812589413, + "grad_norm": 0.1192532946628337, + "learning_rate": 0.0001708743113159572, + "loss": 0.6881, + "step": 3040 + }, + { + "epoch": 0.2719062947067239, + "grad_norm": 0.1211150932063476, + "learning_rate": 0.0001708538779527926, + "loss": 0.676, + "step": 3041 + }, + { + "epoch": 0.2719957081545064, + "grad_norm": 0.12515975346205807, + "learning_rate": 0.00017083343864719884, + "loss": 0.7368, + "step": 3042 + }, + { + "epoch": 0.272085121602289, + "grad_norm": 0.12645758561274, + "learning_rate": 0.00017081299340089012, + "loss": 0.6849, + "step": 3043 + }, + { + "epoch": 0.27217453505007155, + "grad_norm": 0.1006775206093279, + "learning_rate": 0.00017079254221558115, + "loss": 0.642, + "step": 3044 + }, + { + "epoch": 0.27226394849785407, + "grad_norm": 0.11973539647351869, + "learning_rate": 0.00017077208509298718, + "loss": 0.6936, + "step": 3045 + }, + { + "epoch": 0.27235336194563664, + "grad_norm": 0.12087870182633778, + "learning_rate": 0.0001707516220348239, + "loss": 0.705, + "step": 3046 + }, + { + "epoch": 0.27244277539341916, + "grad_norm": 0.12260783613075717, + "learning_rate": 0.00017073115304280754, + "loss": 0.631, + "step": 3047 + }, + { + "epoch": 0.27253218884120173, + "grad_norm": 0.12962974248827558, + "learning_rate": 0.00017071067811865476, + "loss": 0.7046, + "step": 3048 + }, + { + "epoch": 0.27262160228898424, + "grad_norm": 0.13051686238736993, + "learning_rate": 0.00017069019726408282, + "loss": 0.6836, + "step": 3049 + }, + { + "epoch": 0.2727110157367668, + "grad_norm": 0.13168194135530478, + "learning_rate": 0.0001706697104808094, + "loss": 0.7217, + "step": 3050 + }, + { + "epoch": 0.27280042918454933, + "grad_norm": 0.12668546184747242, + "learning_rate": 0.00017064921777055272, + "loss": 0.6493, + "step": 3051 + }, + { + "epoch": 0.2728898426323319, + "grad_norm": 0.14078632326404084, + "learning_rate": 0.00017062871913503148, + "loss": 0.6789, + "step": 3052 + }, + { + "epoch": 0.2729792560801145, + "grad_norm": 0.1006339878050429, + "learning_rate": 0.00017060821457596487, + "loss": 0.6496, + "step": 3053 + }, + { + "epoch": 0.273068669527897, + "grad_norm": 0.13710287717996508, + "learning_rate": 0.00017058770409507254, + "loss": 0.7124, + "step": 3054 + }, + { + "epoch": 0.27315808297567956, + "grad_norm": 0.11463069188262288, + "learning_rate": 0.00017056718769407474, + "loss": 0.6653, + "step": 3055 + }, + { + "epoch": 0.2732474964234621, + "grad_norm": 0.1293617866674779, + "learning_rate": 0.00017054666537469213, + "loss": 0.7257, + "step": 3056 + }, + { + "epoch": 0.27333690987124465, + "grad_norm": 0.1264909233789169, + "learning_rate": 0.00017052613713864587, + "loss": 0.7051, + "step": 3057 + }, + { + "epoch": 0.27342632331902716, + "grad_norm": 0.11411508958486136, + "learning_rate": 0.0001705056029876577, + "loss": 0.6802, + "step": 3058 + }, + { + "epoch": 0.27351573676680974, + "grad_norm": 0.11107124652452606, + "learning_rate": 0.00017048506292344974, + "loss": 0.6389, + "step": 3059 + }, + { + "epoch": 0.27360515021459225, + "grad_norm": 0.12038792779748145, + "learning_rate": 0.00017046451694774467, + "loss": 0.6743, + "step": 3060 + }, + { + "epoch": 0.2736945636623748, + "grad_norm": 0.11258094690927122, + "learning_rate": 0.00017044396506226566, + "loss": 0.6785, + "step": 3061 + }, + { + "epoch": 0.2737839771101574, + "grad_norm": 0.12166465526118078, + "learning_rate": 0.0001704234072687364, + "loss": 0.6685, + "step": 3062 + }, + { + "epoch": 0.2738733905579399, + "grad_norm": 0.1287811303561736, + "learning_rate": 0.000170402843568881, + "loss": 0.6819, + "step": 3063 + }, + { + "epoch": 0.2739628040057225, + "grad_norm": 0.1323166066010598, + "learning_rate": 0.00017038227396442415, + "loss": 0.7194, + "step": 3064 + }, + { + "epoch": 0.274052217453505, + "grad_norm": 0.1405779467436321, + "learning_rate": 0.00017036169845709097, + "loss": 0.6822, + "step": 3065 + }, + { + "epoch": 0.27414163090128757, + "grad_norm": 0.1312956438396766, + "learning_rate": 0.00017034111704860712, + "loss": 0.7538, + "step": 3066 + }, + { + "epoch": 0.2742310443490701, + "grad_norm": 0.12653420976445065, + "learning_rate": 0.00017032052974069874, + "loss": 0.6664, + "step": 3067 + }, + { + "epoch": 0.27432045779685266, + "grad_norm": 0.13076471425197367, + "learning_rate": 0.00017029993653509243, + "loss": 0.6794, + "step": 3068 + }, + { + "epoch": 0.2744098712446352, + "grad_norm": 0.10966859017916328, + "learning_rate": 0.0001702793374335154, + "loss": 0.6389, + "step": 3069 + }, + { + "epoch": 0.27449928469241774, + "grad_norm": 0.12032648073067811, + "learning_rate": 0.00017025873243769517, + "loss": 0.6582, + "step": 3070 + }, + { + "epoch": 0.27458869814020026, + "grad_norm": 0.14292271133953746, + "learning_rate": 0.0001702381215493599, + "loss": 0.7389, + "step": 3071 + }, + { + "epoch": 0.27467811158798283, + "grad_norm": 0.1293337678475759, + "learning_rate": 0.0001702175047702382, + "loss": 0.6922, + "step": 3072 + }, + { + "epoch": 0.2747675250357654, + "grad_norm": 0.14231852550247515, + "learning_rate": 0.00017019688210205918, + "loss": 0.6797, + "step": 3073 + }, + { + "epoch": 0.2748569384835479, + "grad_norm": 0.14717425262316786, + "learning_rate": 0.00017017625354655245, + "loss": 0.6817, + "step": 3074 + }, + { + "epoch": 0.2749463519313305, + "grad_norm": 0.12460326807909644, + "learning_rate": 0.00017015561910544807, + "loss": 0.6981, + "step": 3075 + }, + { + "epoch": 0.275035765379113, + "grad_norm": 0.1506201869961386, + "learning_rate": 0.00017013497878047668, + "loss": 0.7549, + "step": 3076 + }, + { + "epoch": 0.2751251788268956, + "grad_norm": 0.12442744128115747, + "learning_rate": 0.0001701143325733693, + "loss": 0.705, + "step": 3077 + }, + { + "epoch": 0.2752145922746781, + "grad_norm": 0.14022792464535794, + "learning_rate": 0.0001700936804858575, + "loss": 0.704, + "step": 3078 + }, + { + "epoch": 0.27530400572246067, + "grad_norm": 0.12538367785232357, + "learning_rate": 0.00017007302251967338, + "loss": 0.7155, + "step": 3079 + }, + { + "epoch": 0.2753934191702432, + "grad_norm": 0.13211250327176835, + "learning_rate": 0.0001700523586765495, + "loss": 0.6519, + "step": 3080 + }, + { + "epoch": 0.27548283261802575, + "grad_norm": 0.12105735837138341, + "learning_rate": 0.00017003168895821888, + "loss": 0.6385, + "step": 3081 + }, + { + "epoch": 0.2755722460658083, + "grad_norm": 0.11836751753782157, + "learning_rate": 0.00017001101336641512, + "loss": 0.6698, + "step": 3082 + }, + { + "epoch": 0.27566165951359084, + "grad_norm": 0.13040498400036452, + "learning_rate": 0.0001699903319028722, + "loss": 0.7352, + "step": 3083 + }, + { + "epoch": 0.2757510729613734, + "grad_norm": 0.10793068831517412, + "learning_rate": 0.00016996964456932466, + "loss": 0.6863, + "step": 3084 + }, + { + "epoch": 0.2758404864091559, + "grad_norm": 0.1122725117201091, + "learning_rate": 0.0001699489513675075, + "loss": 0.6692, + "step": 3085 + }, + { + "epoch": 0.2759298998569385, + "grad_norm": 0.10409384590800683, + "learning_rate": 0.00016992825229915636, + "loss": 0.6553, + "step": 3086 + }, + { + "epoch": 0.276019313304721, + "grad_norm": 0.11906432127415076, + "learning_rate": 0.0001699075473660071, + "loss": 0.682, + "step": 3087 + }, + { + "epoch": 0.2761087267525036, + "grad_norm": 0.1176137500914291, + "learning_rate": 0.00016988683656979624, + "loss": 0.6842, + "step": 3088 + }, + { + "epoch": 0.2761981402002861, + "grad_norm": 0.12056777486400541, + "learning_rate": 0.00016986611991226086, + "loss": 0.6489, + "step": 3089 + }, + { + "epoch": 0.2762875536480687, + "grad_norm": 0.12826131688311174, + "learning_rate": 0.00016984539739513835, + "loss": 0.678, + "step": 3090 + }, + { + "epoch": 0.2763769670958512, + "grad_norm": 0.11126238392220367, + "learning_rate": 0.0001698246690201667, + "loss": 0.6635, + "step": 3091 + }, + { + "epoch": 0.27646638054363376, + "grad_norm": 0.1212272692012133, + "learning_rate": 0.00016980393478908438, + "loss": 0.6691, + "step": 3092 + }, + { + "epoch": 0.27655579399141633, + "grad_norm": 0.13470159814217691, + "learning_rate": 0.00016978319470363035, + "loss": 0.7049, + "step": 3093 + }, + { + "epoch": 0.27664520743919885, + "grad_norm": 0.12681721105545268, + "learning_rate": 0.0001697624487655441, + "loss": 0.6503, + "step": 3094 + }, + { + "epoch": 0.2767346208869814, + "grad_norm": 0.11540125932491978, + "learning_rate": 0.0001697416969765655, + "loss": 0.7049, + "step": 3095 + }, + { + "epoch": 0.27682403433476394, + "grad_norm": 0.11084279178662383, + "learning_rate": 0.000169720939338435, + "loss": 0.6482, + "step": 3096 + }, + { + "epoch": 0.2769134477825465, + "grad_norm": 0.1291712410826834, + "learning_rate": 0.0001697001758528935, + "loss": 0.6667, + "step": 3097 + }, + { + "epoch": 0.277002861230329, + "grad_norm": 0.1217249152371253, + "learning_rate": 0.00016967940652168247, + "loss": 0.673, + "step": 3098 + }, + { + "epoch": 0.2770922746781116, + "grad_norm": 0.115412134435474, + "learning_rate": 0.00016965863134654372, + "loss": 0.6393, + "step": 3099 + }, + { + "epoch": 0.2771816881258941, + "grad_norm": 0.1325295853545433, + "learning_rate": 0.0001696378503292197, + "loss": 0.6847, + "step": 3100 + }, + { + "epoch": 0.2772711015736767, + "grad_norm": 0.1144436417451668, + "learning_rate": 0.0001696170634714533, + "loss": 0.6452, + "step": 3101 + }, + { + "epoch": 0.27736051502145925, + "grad_norm": 0.1251030601233092, + "learning_rate": 0.00016959627077498782, + "loss": 0.7041, + "step": 3102 + }, + { + "epoch": 0.27744992846924177, + "grad_norm": 0.12324808162929049, + "learning_rate": 0.00016957547224156718, + "loss": 0.6707, + "step": 3103 + }, + { + "epoch": 0.27753934191702434, + "grad_norm": 0.1146159287234878, + "learning_rate": 0.00016955466787293576, + "loss": 0.6451, + "step": 3104 + }, + { + "epoch": 0.27762875536480686, + "grad_norm": 0.10736314088043876, + "learning_rate": 0.00016953385767083827, + "loss": 0.658, + "step": 3105 + }, + { + "epoch": 0.2777181688125894, + "grad_norm": 0.137655589523494, + "learning_rate": 0.00016951304163702013, + "loss": 0.681, + "step": 3106 + }, + { + "epoch": 0.27780758226037194, + "grad_norm": 0.1417552557028932, + "learning_rate": 0.00016949221977322716, + "loss": 0.7359, + "step": 3107 + }, + { + "epoch": 0.2778969957081545, + "grad_norm": 0.14021102922713552, + "learning_rate": 0.00016947139208120564, + "loss": 0.6958, + "step": 3108 + }, + { + "epoch": 0.27798640915593703, + "grad_norm": 0.13238940666928944, + "learning_rate": 0.00016945055856270236, + "loss": 0.6956, + "step": 3109 + }, + { + "epoch": 0.2780758226037196, + "grad_norm": 0.1253356841380599, + "learning_rate": 0.0001694297192194646, + "loss": 0.582, + "step": 3110 + }, + { + "epoch": 0.2781652360515021, + "grad_norm": 0.12536615351861724, + "learning_rate": 0.00016940887405324015, + "loss": 0.6949, + "step": 3111 + }, + { + "epoch": 0.2782546494992847, + "grad_norm": 0.12999190789785556, + "learning_rate": 0.00016938802306577726, + "loss": 0.6732, + "step": 3112 + }, + { + "epoch": 0.27834406294706726, + "grad_norm": 0.12915104824465704, + "learning_rate": 0.00016936716625882468, + "loss": 0.722, + "step": 3113 + }, + { + "epoch": 0.2784334763948498, + "grad_norm": 0.11917261521670296, + "learning_rate": 0.00016934630363413163, + "loss": 0.6355, + "step": 3114 + }, + { + "epoch": 0.27852288984263235, + "grad_norm": 0.11269241277368969, + "learning_rate": 0.00016932543519344783, + "loss": 0.6643, + "step": 3115 + }, + { + "epoch": 0.27861230329041486, + "grad_norm": 0.12098983263045583, + "learning_rate": 0.00016930456093852353, + "loss": 0.698, + "step": 3116 + }, + { + "epoch": 0.27870171673819744, + "grad_norm": 0.11003175080483975, + "learning_rate": 0.00016928368087110938, + "loss": 0.6346, + "step": 3117 + }, + { + "epoch": 0.27879113018597995, + "grad_norm": 0.13467832681361064, + "learning_rate": 0.0001692627949929566, + "loss": 0.6931, + "step": 3118 + }, + { + "epoch": 0.2788805436337625, + "grad_norm": 0.12489851603044427, + "learning_rate": 0.00016924190330581685, + "loss": 0.6924, + "step": 3119 + }, + { + "epoch": 0.27896995708154504, + "grad_norm": 0.11017355464314356, + "learning_rate": 0.00016922100581144228, + "loss": 0.6147, + "step": 3120 + }, + { + "epoch": 0.2790593705293276, + "grad_norm": 0.10749986060511176, + "learning_rate": 0.0001692001025115856, + "loss": 0.6374, + "step": 3121 + }, + { + "epoch": 0.2791487839771102, + "grad_norm": 0.11222760216318778, + "learning_rate": 0.00016917919340799986, + "loss": 0.671, + "step": 3122 + }, + { + "epoch": 0.2792381974248927, + "grad_norm": 0.11207083689837134, + "learning_rate": 0.00016915827850243868, + "loss": 0.6433, + "step": 3123 + }, + { + "epoch": 0.27932761087267527, + "grad_norm": 0.13036628128846953, + "learning_rate": 0.00016913735779665627, + "loss": 0.671, + "step": 3124 + }, + { + "epoch": 0.2794170243204578, + "grad_norm": 0.12796866571861581, + "learning_rate": 0.00016911643129240714, + "loss": 0.7247, + "step": 3125 + }, + { + "epoch": 0.27950643776824036, + "grad_norm": 0.10667487749044752, + "learning_rate": 0.00016909549899144635, + "loss": 0.656, + "step": 3126 + }, + { + "epoch": 0.2795958512160229, + "grad_norm": 0.11787802057773997, + "learning_rate": 0.00016907456089552953, + "loss": 0.662, + "step": 3127 + }, + { + "epoch": 0.27968526466380544, + "grad_norm": 0.1025202674172835, + "learning_rate": 0.00016905361700641271, + "loss": 0.6568, + "step": 3128 + }, + { + "epoch": 0.27977467811158796, + "grad_norm": 0.11996679730812478, + "learning_rate": 0.00016903266732585243, + "loss": 0.6698, + "step": 3129 + }, + { + "epoch": 0.27986409155937053, + "grad_norm": 0.11795578692668651, + "learning_rate": 0.00016901171185560574, + "loss": 0.6692, + "step": 3130 + }, + { + "epoch": 0.2799535050071531, + "grad_norm": 0.12887877998895594, + "learning_rate": 0.00016899075059743007, + "loss": 0.6472, + "step": 3131 + }, + { + "epoch": 0.2800429184549356, + "grad_norm": 0.12328217053123841, + "learning_rate": 0.00016896978355308352, + "loss": 0.6585, + "step": 3132 + }, + { + "epoch": 0.2801323319027182, + "grad_norm": 0.10909259924807857, + "learning_rate": 0.00016894881072432443, + "loss": 0.6805, + "step": 3133 + }, + { + "epoch": 0.2802217453505007, + "grad_norm": 0.1116090957307472, + "learning_rate": 0.00016892783211291194, + "loss": 0.6459, + "step": 3134 + }, + { + "epoch": 0.2803111587982833, + "grad_norm": 0.11776012354471861, + "learning_rate": 0.00016890684772060538, + "loss": 0.6521, + "step": 3135 + }, + { + "epoch": 0.2804005722460658, + "grad_norm": 0.11640638385440091, + "learning_rate": 0.00016888585754916476, + "loss": 0.6743, + "step": 3136 + }, + { + "epoch": 0.28048998569384836, + "grad_norm": 0.12603740157050605, + "learning_rate": 0.0001688648616003504, + "loss": 0.6888, + "step": 3137 + }, + { + "epoch": 0.2805793991416309, + "grad_norm": 0.11567991042701484, + "learning_rate": 0.0001688438598759233, + "loss": 0.5629, + "step": 3138 + }, + { + "epoch": 0.28066881258941345, + "grad_norm": 0.13756715838226138, + "learning_rate": 0.00016882285237764482, + "loss": 0.6993, + "step": 3139 + }, + { + "epoch": 0.28075822603719597, + "grad_norm": 0.1291486009469929, + "learning_rate": 0.0001688018391072768, + "loss": 0.6737, + "step": 3140 + }, + { + "epoch": 0.28084763948497854, + "grad_norm": 0.1459302305836547, + "learning_rate": 0.00016878082006658164, + "loss": 0.7151, + "step": 3141 + }, + { + "epoch": 0.2809370529327611, + "grad_norm": 0.12283646916355727, + "learning_rate": 0.00016875979525732214, + "loss": 0.6772, + "step": 3142 + }, + { + "epoch": 0.2810264663805436, + "grad_norm": 0.14633326489233034, + "learning_rate": 0.0001687387646812617, + "loss": 0.7275, + "step": 3143 + }, + { + "epoch": 0.2811158798283262, + "grad_norm": 0.1334910679081192, + "learning_rate": 0.00016871772834016406, + "loss": 0.7106, + "step": 3144 + }, + { + "epoch": 0.2812052932761087, + "grad_norm": 0.10967857114906761, + "learning_rate": 0.00016869668623579353, + "loss": 0.645, + "step": 3145 + }, + { + "epoch": 0.2812947067238913, + "grad_norm": 0.13813479187443803, + "learning_rate": 0.00016867563836991492, + "loss": 0.689, + "step": 3146 + }, + { + "epoch": 0.2813841201716738, + "grad_norm": 0.11672673517190056, + "learning_rate": 0.00016865458474429342, + "loss": 0.7031, + "step": 3147 + }, + { + "epoch": 0.2814735336194564, + "grad_norm": 0.1312622356509223, + "learning_rate": 0.00016863352536069482, + "loss": 0.7312, + "step": 3148 + }, + { + "epoch": 0.2815629470672389, + "grad_norm": 0.1126233442609159, + "learning_rate": 0.00016861246022088536, + "loss": 0.6735, + "step": 3149 + }, + { + "epoch": 0.28165236051502146, + "grad_norm": 0.11606080322787969, + "learning_rate": 0.0001685913893266317, + "loss": 0.6676, + "step": 3150 + }, + { + "epoch": 0.28174177396280403, + "grad_norm": 0.11569092211620217, + "learning_rate": 0.00016857031267970105, + "loss": 0.6808, + "step": 3151 + }, + { + "epoch": 0.28183118741058655, + "grad_norm": 0.11906022978332957, + "learning_rate": 0.00016854923028186111, + "loss": 0.6739, + "step": 3152 + }, + { + "epoch": 0.2819206008583691, + "grad_norm": 0.14106882048073865, + "learning_rate": 0.00016852814213488, + "loss": 0.7179, + "step": 3153 + }, + { + "epoch": 0.28201001430615164, + "grad_norm": 0.13292784243450473, + "learning_rate": 0.00016850704824052635, + "loss": 0.6632, + "step": 3154 + }, + { + "epoch": 0.2820994277539342, + "grad_norm": 0.12299831761913435, + "learning_rate": 0.00016848594860056933, + "loss": 0.6454, + "step": 3155 + }, + { + "epoch": 0.2821888412017167, + "grad_norm": 0.14456044401609977, + "learning_rate": 0.00016846484321677852, + "loss": 0.7258, + "step": 3156 + }, + { + "epoch": 0.2822782546494993, + "grad_norm": 0.12791089368969807, + "learning_rate": 0.00016844373209092396, + "loss": 0.7147, + "step": 3157 + }, + { + "epoch": 0.2823676680972818, + "grad_norm": 0.12932952057195632, + "learning_rate": 0.00016842261522477628, + "loss": 0.6838, + "step": 3158 + }, + { + "epoch": 0.2824570815450644, + "grad_norm": 0.12047597166168035, + "learning_rate": 0.00016840149262010648, + "loss": 0.6391, + "step": 3159 + }, + { + "epoch": 0.2825464949928469, + "grad_norm": 0.11758129760002843, + "learning_rate": 0.00016838036427868608, + "loss": 0.6878, + "step": 3160 + }, + { + "epoch": 0.28263590844062947, + "grad_norm": 0.14672225778983927, + "learning_rate": 0.00016835923020228712, + "loss": 0.7226, + "step": 3161 + }, + { + "epoch": 0.28272532188841204, + "grad_norm": 0.13174369143996284, + "learning_rate": 0.0001683380903926821, + "loss": 0.7174, + "step": 3162 + }, + { + "epoch": 0.28281473533619456, + "grad_norm": 0.13664680688575664, + "learning_rate": 0.00016831694485164398, + "loss": 0.6975, + "step": 3163 + }, + { + "epoch": 0.2829041487839771, + "grad_norm": 0.13191176181359301, + "learning_rate": 0.00016829579358094616, + "loss": 0.6901, + "step": 3164 + }, + { + "epoch": 0.28299356223175964, + "grad_norm": 0.1319310057803636, + "learning_rate": 0.00016827463658236264, + "loss": 0.6552, + "step": 3165 + }, + { + "epoch": 0.2830829756795422, + "grad_norm": 0.13214197328830396, + "learning_rate": 0.0001682534738576678, + "loss": 0.6224, + "step": 3166 + }, + { + "epoch": 0.28317238912732473, + "grad_norm": 0.12704660112937147, + "learning_rate": 0.00016823230540863654, + "loss": 0.6489, + "step": 3167 + }, + { + "epoch": 0.2832618025751073, + "grad_norm": 0.11760919354271732, + "learning_rate": 0.00016821113123704424, + "loss": 0.6062, + "step": 3168 + }, + { + "epoch": 0.2833512160228898, + "grad_norm": 0.12296642674673766, + "learning_rate": 0.0001681899513446667, + "loss": 0.6525, + "step": 3169 + }, + { + "epoch": 0.2834406294706724, + "grad_norm": 0.13677712431551148, + "learning_rate": 0.00016816876573328037, + "loss": 0.6859, + "step": 3170 + }, + { + "epoch": 0.28353004291845496, + "grad_norm": 0.12532309935479696, + "learning_rate": 0.00016814757440466188, + "loss": 0.676, + "step": 3171 + }, + { + "epoch": 0.2836194563662375, + "grad_norm": 0.10936086806414129, + "learning_rate": 0.0001681263773605887, + "loss": 0.6727, + "step": 3172 + }, + { + "epoch": 0.28370886981402005, + "grad_norm": 0.11406758430690514, + "learning_rate": 0.00016810517460283853, + "loss": 0.6693, + "step": 3173 + }, + { + "epoch": 0.28379828326180256, + "grad_norm": 0.12401041592532154, + "learning_rate": 0.0001680839661331896, + "loss": 0.5882, + "step": 3174 + }, + { + "epoch": 0.28388769670958514, + "grad_norm": 0.12837192575512227, + "learning_rate": 0.00016806275195342064, + "loss": 0.6784, + "step": 3175 + }, + { + "epoch": 0.28397711015736765, + "grad_norm": 0.13841777450257262, + "learning_rate": 0.00016804153206531088, + "loss": 0.7193, + "step": 3176 + }, + { + "epoch": 0.2840665236051502, + "grad_norm": 0.13798839518038739, + "learning_rate": 0.00016802030647064, + "loss": 0.7114, + "step": 3177 + }, + { + "epoch": 0.28415593705293274, + "grad_norm": 0.13715135828569114, + "learning_rate": 0.00016799907517118818, + "loss": 0.6702, + "step": 3178 + }, + { + "epoch": 0.2842453505007153, + "grad_norm": 0.11001298109748851, + "learning_rate": 0.00016797783816873603, + "loss": 0.5552, + "step": 3179 + }, + { + "epoch": 0.2843347639484979, + "grad_norm": 0.14529507389542015, + "learning_rate": 0.00016795659546506468, + "loss": 0.7129, + "step": 3180 + }, + { + "epoch": 0.2844241773962804, + "grad_norm": 0.12343025612833182, + "learning_rate": 0.00016793534706195575, + "loss": 0.7081, + "step": 3181 + }, + { + "epoch": 0.28451359084406297, + "grad_norm": 0.11988624635172462, + "learning_rate": 0.0001679140929611913, + "loss": 0.665, + "step": 3182 + }, + { + "epoch": 0.2846030042918455, + "grad_norm": 0.14176678940456452, + "learning_rate": 0.00016789283316455392, + "loss": 0.7149, + "step": 3183 + }, + { + "epoch": 0.28469241773962806, + "grad_norm": 0.1303011525465853, + "learning_rate": 0.00016787156767382659, + "loss": 0.6937, + "step": 3184 + }, + { + "epoch": 0.2847818311874106, + "grad_norm": 0.1283323661607881, + "learning_rate": 0.00016785029649079287, + "loss": 0.691, + "step": 3185 + }, + { + "epoch": 0.28487124463519314, + "grad_norm": 0.11301104534017256, + "learning_rate": 0.0001678290196172367, + "loss": 0.6269, + "step": 3186 + }, + { + "epoch": 0.28496065808297566, + "grad_norm": 0.14349186603265626, + "learning_rate": 0.0001678077370549426, + "loss": 0.7245, + "step": 3187 + }, + { + "epoch": 0.28505007153075823, + "grad_norm": 0.12731003259428045, + "learning_rate": 0.00016778644880569544, + "loss": 0.6658, + "step": 3188 + }, + { + "epoch": 0.28513948497854075, + "grad_norm": 0.11356660478262473, + "learning_rate": 0.00016776515487128073, + "loss": 0.6229, + "step": 3189 + }, + { + "epoch": 0.2852288984263233, + "grad_norm": 0.1241929859416351, + "learning_rate": 0.00016774385525348428, + "loss": 0.6584, + "step": 3190 + }, + { + "epoch": 0.2853183118741059, + "grad_norm": 0.1288813989622598, + "learning_rate": 0.00016772254995409255, + "loss": 0.6667, + "step": 3191 + }, + { + "epoch": 0.2854077253218884, + "grad_norm": 0.11389424503939254, + "learning_rate": 0.00016770123897489228, + "loss": 0.6443, + "step": 3192 + }, + { + "epoch": 0.285497138769671, + "grad_norm": 0.11594762643338212, + "learning_rate": 0.00016767992231767092, + "loss": 0.6766, + "step": 3193 + }, + { + "epoch": 0.2855865522174535, + "grad_norm": 0.12847757888469863, + "learning_rate": 0.0001676585999842162, + "loss": 0.6677, + "step": 3194 + }, + { + "epoch": 0.28567596566523606, + "grad_norm": 0.1031334593363573, + "learning_rate": 0.0001676372719763164, + "loss": 0.6791, + "step": 3195 + }, + { + "epoch": 0.2857653791130186, + "grad_norm": 0.1241719167693109, + "learning_rate": 0.0001676159382957603, + "loss": 0.6972, + "step": 3196 + }, + { + "epoch": 0.28585479256080115, + "grad_norm": 0.11190731488820728, + "learning_rate": 0.0001675945989443371, + "loss": 0.6511, + "step": 3197 + }, + { + "epoch": 0.28594420600858367, + "grad_norm": 0.12096786602948434, + "learning_rate": 0.0001675732539238365, + "loss": 0.6684, + "step": 3198 + }, + { + "epoch": 0.28603361945636624, + "grad_norm": 0.10269409430391, + "learning_rate": 0.00016755190323604872, + "loss": 0.651, + "step": 3199 + }, + { + "epoch": 0.2861230329041488, + "grad_norm": 0.13095822769902268, + "learning_rate": 0.0001675305468827644, + "loss": 0.6955, + "step": 3200 + }, + { + "epoch": 0.2862124463519313, + "grad_norm": 0.12574409609208037, + "learning_rate": 0.00016750918486577466, + "loss": 0.6492, + "step": 3201 + }, + { + "epoch": 0.2863018597997139, + "grad_norm": 0.1255517293159677, + "learning_rate": 0.00016748781718687111, + "loss": 0.6876, + "step": 3202 + }, + { + "epoch": 0.2863912732474964, + "grad_norm": 0.12870211766638057, + "learning_rate": 0.00016746644384784586, + "loss": 0.6758, + "step": 3203 + }, + { + "epoch": 0.286480686695279, + "grad_norm": 0.14032182504811322, + "learning_rate": 0.00016744506485049144, + "loss": 0.6391, + "step": 3204 + }, + { + "epoch": 0.2865701001430615, + "grad_norm": 0.1377323938765238, + "learning_rate": 0.00016742368019660088, + "loss": 0.6637, + "step": 3205 + }, + { + "epoch": 0.2866595135908441, + "grad_norm": 0.12962215041510994, + "learning_rate": 0.0001674022898879677, + "loss": 0.6795, + "step": 3206 + }, + { + "epoch": 0.2867489270386266, + "grad_norm": 0.12069656441125647, + "learning_rate": 0.00016738089392638586, + "loss": 0.6344, + "step": 3207 + }, + { + "epoch": 0.28683834048640916, + "grad_norm": 0.127639646523941, + "learning_rate": 0.0001673594923136498, + "loss": 0.6581, + "step": 3208 + }, + { + "epoch": 0.2869277539341917, + "grad_norm": 0.13235533927249987, + "learning_rate": 0.00016733808505155448, + "loss": 0.7335, + "step": 3209 + }, + { + "epoch": 0.28701716738197425, + "grad_norm": 0.12302648716986188, + "learning_rate": 0.0001673166721418953, + "loss": 0.6719, + "step": 3210 + }, + { + "epoch": 0.2871065808297568, + "grad_norm": 0.13023358765880505, + "learning_rate": 0.00016729525358646813, + "loss": 0.6979, + "step": 3211 + }, + { + "epoch": 0.28719599427753933, + "grad_norm": 0.11712861863586144, + "learning_rate": 0.00016727382938706931, + "loss": 0.6714, + "step": 3212 + }, + { + "epoch": 0.2872854077253219, + "grad_norm": 0.12882538739845475, + "learning_rate": 0.00016725239954549565, + "loss": 0.6436, + "step": 3213 + }, + { + "epoch": 0.2873748211731044, + "grad_norm": 0.12479704633249106, + "learning_rate": 0.00016723096406354447, + "loss": 0.6766, + "step": 3214 + }, + { + "epoch": 0.287464234620887, + "grad_norm": 0.10932456383055032, + "learning_rate": 0.00016720952294301355, + "loss": 0.7006, + "step": 3215 + }, + { + "epoch": 0.2875536480686695, + "grad_norm": 0.11402755541892283, + "learning_rate": 0.00016718807618570106, + "loss": 0.6714, + "step": 3216 + }, + { + "epoch": 0.2876430615164521, + "grad_norm": 0.1350639758099237, + "learning_rate": 0.0001671666237934058, + "loss": 0.6959, + "step": 3217 + }, + { + "epoch": 0.2877324749642346, + "grad_norm": 0.12040457751467525, + "learning_rate": 0.00016714516576792692, + "loss": 0.6449, + "step": 3218 + }, + { + "epoch": 0.28782188841201717, + "grad_norm": 0.14685838260407624, + "learning_rate": 0.00016712370211106406, + "loss": 0.6779, + "step": 3219 + }, + { + "epoch": 0.28791130185979974, + "grad_norm": 0.13619883543036804, + "learning_rate": 0.0001671022328246174, + "loss": 0.7327, + "step": 3220 + }, + { + "epoch": 0.28800071530758226, + "grad_norm": 0.13170210359143586, + "learning_rate": 0.00016708075791038745, + "loss": 0.6591, + "step": 3221 + }, + { + "epoch": 0.2880901287553648, + "grad_norm": 0.13189245412798517, + "learning_rate": 0.00016705927737017544, + "loss": 0.6887, + "step": 3222 + }, + { + "epoch": 0.28817954220314734, + "grad_norm": 0.13033556167363622, + "learning_rate": 0.00016703779120578273, + "loss": 0.6779, + "step": 3223 + }, + { + "epoch": 0.2882689556509299, + "grad_norm": 0.12461053550169966, + "learning_rate": 0.00016701629941901148, + "loss": 0.6481, + "step": 3224 + }, + { + "epoch": 0.28835836909871243, + "grad_norm": 0.12816702278018377, + "learning_rate": 0.00016699480201166415, + "loss": 0.6783, + "step": 3225 + }, + { + "epoch": 0.288447782546495, + "grad_norm": 0.12381307537760308, + "learning_rate": 0.00016697329898554365, + "loss": 0.678, + "step": 3226 + }, + { + "epoch": 0.2885371959942775, + "grad_norm": 0.12495096722774952, + "learning_rate": 0.00016695179034245346, + "loss": 0.6755, + "step": 3227 + }, + { + "epoch": 0.2886266094420601, + "grad_norm": 0.1494370525963766, + "learning_rate": 0.00016693027608419747, + "loss": 0.6769, + "step": 3228 + }, + { + "epoch": 0.2887160228898426, + "grad_norm": 0.13293594651835794, + "learning_rate": 0.00016690875621258006, + "loss": 0.7176, + "step": 3229 + }, + { + "epoch": 0.2888054363376252, + "grad_norm": 0.13014780391374162, + "learning_rate": 0.00016688723072940607, + "loss": 0.6956, + "step": 3230 + }, + { + "epoch": 0.28889484978540775, + "grad_norm": 0.11690794612232408, + "learning_rate": 0.0001668656996364808, + "loss": 0.6802, + "step": 3231 + }, + { + "epoch": 0.28898426323319026, + "grad_norm": 0.12454662637262806, + "learning_rate": 0.0001668441629356101, + "loss": 0.6887, + "step": 3232 + }, + { + "epoch": 0.28907367668097284, + "grad_norm": 0.11899922175789016, + "learning_rate": 0.00016682262062860014, + "loss": 0.6189, + "step": 3233 + }, + { + "epoch": 0.28916309012875535, + "grad_norm": 0.13765869632230496, + "learning_rate": 0.0001668010727172577, + "loss": 0.6928, + "step": 3234 + }, + { + "epoch": 0.2892525035765379, + "grad_norm": 0.1305712267060371, + "learning_rate": 0.00016677951920338995, + "loss": 0.728, + "step": 3235 + }, + { + "epoch": 0.28934191702432044, + "grad_norm": 0.12996735565312958, + "learning_rate": 0.00016675796008880462, + "loss": 0.6803, + "step": 3236 + }, + { + "epoch": 0.289431330472103, + "grad_norm": 0.13382521929280528, + "learning_rate": 0.00016673639537530976, + "loss": 0.6413, + "step": 3237 + }, + { + "epoch": 0.2895207439198855, + "grad_norm": 0.11541016189730033, + "learning_rate": 0.00016671482506471402, + "loss": 0.6833, + "step": 3238 + }, + { + "epoch": 0.2896101573676681, + "grad_norm": 0.11891257853185692, + "learning_rate": 0.0001666932491588265, + "loss": 0.6514, + "step": 3239 + }, + { + "epoch": 0.28969957081545067, + "grad_norm": 0.12127876105352778, + "learning_rate": 0.00016667166765945668, + "loss": 0.6251, + "step": 3240 + }, + { + "epoch": 0.2897889842632332, + "grad_norm": 0.11844352686765064, + "learning_rate": 0.00016665008056841466, + "loss": 0.6476, + "step": 3241 + }, + { + "epoch": 0.28987839771101576, + "grad_norm": 0.13798368304071287, + "learning_rate": 0.00016662848788751085, + "loss": 0.6953, + "step": 3242 + }, + { + "epoch": 0.28996781115879827, + "grad_norm": 0.11708276250254207, + "learning_rate": 0.00016660688961855623, + "loss": 0.67, + "step": 3243 + }, + { + "epoch": 0.29005722460658084, + "grad_norm": 0.1251249446081542, + "learning_rate": 0.0001665852857633622, + "loss": 0.6935, + "step": 3244 + }, + { + "epoch": 0.29014663805436336, + "grad_norm": 0.12514515569839443, + "learning_rate": 0.0001665636763237407, + "loss": 0.6833, + "step": 3245 + }, + { + "epoch": 0.29023605150214593, + "grad_norm": 0.13778923170463203, + "learning_rate": 0.00016654206130150404, + "loss": 0.7057, + "step": 3246 + }, + { + "epoch": 0.29032546494992845, + "grad_norm": 0.11971110150894028, + "learning_rate": 0.00016652044069846505, + "loss": 0.7027, + "step": 3247 + }, + { + "epoch": 0.290414878397711, + "grad_norm": 0.1151784672295942, + "learning_rate": 0.00016649881451643705, + "loss": 0.6633, + "step": 3248 + }, + { + "epoch": 0.2905042918454936, + "grad_norm": 0.1274785588834341, + "learning_rate": 0.0001664771827572338, + "loss": 0.7082, + "step": 3249 + }, + { + "epoch": 0.2905937052932761, + "grad_norm": 0.136302836245176, + "learning_rate": 0.0001664555454226695, + "loss": 0.694, + "step": 3250 + }, + { + "epoch": 0.2906831187410587, + "grad_norm": 0.11067876499125179, + "learning_rate": 0.00016643390251455884, + "loss": 0.6364, + "step": 3251 + }, + { + "epoch": 0.2907725321888412, + "grad_norm": 0.12875632927334313, + "learning_rate": 0.00016641225403471701, + "loss": 0.6623, + "step": 3252 + }, + { + "epoch": 0.29086194563662376, + "grad_norm": 0.11082910709364135, + "learning_rate": 0.00016639059998495968, + "loss": 0.6683, + "step": 3253 + }, + { + "epoch": 0.2909513590844063, + "grad_norm": 0.12618065928611819, + "learning_rate": 0.00016636894036710286, + "loss": 0.6866, + "step": 3254 + }, + { + "epoch": 0.29104077253218885, + "grad_norm": 0.11634822079452188, + "learning_rate": 0.0001663472751829632, + "loss": 0.6951, + "step": 3255 + }, + { + "epoch": 0.29113018597997137, + "grad_norm": 0.11298636782072631, + "learning_rate": 0.0001663256044343577, + "loss": 0.6827, + "step": 3256 + }, + { + "epoch": 0.29121959942775394, + "grad_norm": 0.09935745556639404, + "learning_rate": 0.00016630392812310384, + "loss": 0.63, + "step": 3257 + }, + { + "epoch": 0.29130901287553645, + "grad_norm": 0.10885630808545652, + "learning_rate": 0.00016628224625101962, + "loss": 0.6471, + "step": 3258 + }, + { + "epoch": 0.291398426323319, + "grad_norm": 0.1189802181837886, + "learning_rate": 0.00016626055881992344, + "loss": 0.6652, + "step": 3259 + }, + { + "epoch": 0.2914878397711016, + "grad_norm": 0.12236704944577029, + "learning_rate": 0.00016623886583163423, + "loss": 0.6481, + "step": 3260 + }, + { + "epoch": 0.2915772532188841, + "grad_norm": 0.11513467895679957, + "learning_rate": 0.00016621716728797132, + "loss": 0.6632, + "step": 3261 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 0.12075674028430772, + "learning_rate": 0.00016619546319075455, + "loss": 0.7364, + "step": 3262 + }, + { + "epoch": 0.2917560801144492, + "grad_norm": 0.13295756501601425, + "learning_rate": 0.00016617375354180424, + "loss": 0.7026, + "step": 3263 + }, + { + "epoch": 0.2918454935622318, + "grad_norm": 0.13646708463061114, + "learning_rate": 0.00016615203834294119, + "loss": 0.693, + "step": 3264 + }, + { + "epoch": 0.2919349070100143, + "grad_norm": 0.11778111199941266, + "learning_rate": 0.0001661303175959865, + "loss": 0.6625, + "step": 3265 + }, + { + "epoch": 0.29202432045779686, + "grad_norm": 0.10252550339677374, + "learning_rate": 0.00016610859130276198, + "loss": 0.6631, + "step": 3266 + }, + { + "epoch": 0.2921137339055794, + "grad_norm": 0.10853258804378303, + "learning_rate": 0.00016608685946508972, + "loss": 0.6733, + "step": 3267 + }, + { + "epoch": 0.29220314735336195, + "grad_norm": 0.1310325006479362, + "learning_rate": 0.00016606512208479238, + "loss": 0.6176, + "step": 3268 + }, + { + "epoch": 0.2922925608011445, + "grad_norm": 0.13193157057771765, + "learning_rate": 0.00016604337916369306, + "loss": 0.6713, + "step": 3269 + }, + { + "epoch": 0.29238197424892703, + "grad_norm": 0.11110041685485347, + "learning_rate": 0.00016602163070361526, + "loss": 0.6426, + "step": 3270 + }, + { + "epoch": 0.2924713876967096, + "grad_norm": 0.12977783874318916, + "learning_rate": 0.00016599987670638304, + "loss": 0.6671, + "step": 3271 + }, + { + "epoch": 0.2925608011444921, + "grad_norm": 0.12627436067905712, + "learning_rate": 0.00016597811717382083, + "loss": 0.6742, + "step": 3272 + }, + { + "epoch": 0.2926502145922747, + "grad_norm": 0.1303060132341248, + "learning_rate": 0.00016595635210775366, + "loss": 0.6265, + "step": 3273 + }, + { + "epoch": 0.2927396280400572, + "grad_norm": 0.13409292978819765, + "learning_rate": 0.00016593458151000688, + "loss": 0.6613, + "step": 3274 + }, + { + "epoch": 0.2928290414878398, + "grad_norm": 0.13496761352766917, + "learning_rate": 0.0001659128053824064, + "loss": 0.6741, + "step": 3275 + }, + { + "epoch": 0.2929184549356223, + "grad_norm": 0.11749551807161791, + "learning_rate": 0.0001658910237267785, + "loss": 0.6728, + "step": 3276 + }, + { + "epoch": 0.29300786838340487, + "grad_norm": 0.10514287896813738, + "learning_rate": 0.00016586923654495004, + "loss": 0.6622, + "step": 3277 + }, + { + "epoch": 0.2930972818311874, + "grad_norm": 0.12743646444336595, + "learning_rate": 0.00016584744383874825, + "loss": 0.684, + "step": 3278 + }, + { + "epoch": 0.29318669527896996, + "grad_norm": 0.12504218006522705, + "learning_rate": 0.00016582564561000088, + "loss": 0.6785, + "step": 3279 + }, + { + "epoch": 0.2932761087267525, + "grad_norm": 0.12726057999789347, + "learning_rate": 0.0001658038418605361, + "loss": 0.6858, + "step": 3280 + }, + { + "epoch": 0.29336552217453504, + "grad_norm": 0.13584381029386244, + "learning_rate": 0.00016578203259218257, + "loss": 0.6844, + "step": 3281 + }, + { + "epoch": 0.2934549356223176, + "grad_norm": 0.12309335604705146, + "learning_rate": 0.00016576021780676943, + "loss": 0.6707, + "step": 3282 + }, + { + "epoch": 0.29354434907010013, + "grad_norm": 0.12628346526584816, + "learning_rate": 0.00016573839750612623, + "loss": 0.6875, + "step": 3283 + }, + { + "epoch": 0.2936337625178827, + "grad_norm": 0.117086681544865, + "learning_rate": 0.00016571657169208302, + "loss": 0.6567, + "step": 3284 + }, + { + "epoch": 0.2937231759656652, + "grad_norm": 0.11714025998446069, + "learning_rate": 0.00016569474036647028, + "loss": 0.636, + "step": 3285 + }, + { + "epoch": 0.2938125894134478, + "grad_norm": 0.1376111379370494, + "learning_rate": 0.00016567290353111905, + "loss": 0.6644, + "step": 3286 + }, + { + "epoch": 0.2939020028612303, + "grad_norm": 0.11806808639490163, + "learning_rate": 0.0001656510611878607, + "loss": 0.5492, + "step": 3287 + }, + { + "epoch": 0.2939914163090129, + "grad_norm": 0.1406671257261491, + "learning_rate": 0.00016562921333852714, + "loss": 0.7195, + "step": 3288 + }, + { + "epoch": 0.29408082975679545, + "grad_norm": 0.11073451170384549, + "learning_rate": 0.00016560735998495066, + "loss": 0.6138, + "step": 3289 + }, + { + "epoch": 0.29417024320457796, + "grad_norm": 0.12446376156805514, + "learning_rate": 0.0001655855011289642, + "loss": 0.6712, + "step": 3290 + }, + { + "epoch": 0.29425965665236054, + "grad_norm": 0.12758084736640832, + "learning_rate": 0.00016556363677240098, + "loss": 0.6728, + "step": 3291 + }, + { + "epoch": 0.29434907010014305, + "grad_norm": 0.12710965975360777, + "learning_rate": 0.00016554176691709467, + "loss": 0.6879, + "step": 3292 + }, + { + "epoch": 0.2944384835479256, + "grad_norm": 0.12837737130215188, + "learning_rate": 0.00016551989156487955, + "loss": 0.6996, + "step": 3293 + }, + { + "epoch": 0.29452789699570814, + "grad_norm": 0.11289886727532707, + "learning_rate": 0.00016549801071759026, + "loss": 0.6824, + "step": 3294 + }, + { + "epoch": 0.2946173104434907, + "grad_norm": 0.10190782101778326, + "learning_rate": 0.00016547612437706189, + "loss": 0.641, + "step": 3295 + }, + { + "epoch": 0.2947067238912732, + "grad_norm": 0.12815618188483913, + "learning_rate": 0.00016545423254513004, + "loss": 0.6575, + "step": 3296 + }, + { + "epoch": 0.2947961373390558, + "grad_norm": 0.12411709071283593, + "learning_rate": 0.00016543233522363078, + "loss": 0.6614, + "step": 3297 + }, + { + "epoch": 0.2948855507868383, + "grad_norm": 0.13580155095356308, + "learning_rate": 0.00016541043241440057, + "loss": 0.7038, + "step": 3298 + }, + { + "epoch": 0.2949749642346209, + "grad_norm": 0.10658182089473858, + "learning_rate": 0.0001653885241192764, + "loss": 0.6687, + "step": 3299 + }, + { + "epoch": 0.29506437768240346, + "grad_norm": 0.1205429795588404, + "learning_rate": 0.00016536661034009567, + "loss": 0.6891, + "step": 3300 + }, + { + "epoch": 0.29515379113018597, + "grad_norm": 0.10899070468345053, + "learning_rate": 0.00016534469107869627, + "loss": 0.6561, + "step": 3301 + }, + { + "epoch": 0.29524320457796854, + "grad_norm": 0.13554015321754917, + "learning_rate": 0.00016532276633691656, + "loss": 0.6761, + "step": 3302 + }, + { + "epoch": 0.29533261802575106, + "grad_norm": 0.13834189407424238, + "learning_rate": 0.00016530083611659532, + "loss": 0.7355, + "step": 3303 + }, + { + "epoch": 0.29542203147353363, + "grad_norm": 0.13245541842973638, + "learning_rate": 0.00016527890041957184, + "loss": 0.7257, + "step": 3304 + }, + { + "epoch": 0.29551144492131615, + "grad_norm": 0.13345927464352178, + "learning_rate": 0.0001652569592476858, + "loss": 0.7189, + "step": 3305 + }, + { + "epoch": 0.2956008583690987, + "grad_norm": 0.1250218719477767, + "learning_rate": 0.0001652350126027774, + "loss": 0.6581, + "step": 3306 + }, + { + "epoch": 0.29569027181688123, + "grad_norm": 0.1212078662009146, + "learning_rate": 0.00016521306048668727, + "loss": 0.6117, + "step": 3307 + }, + { + "epoch": 0.2957796852646638, + "grad_norm": 0.10184986021841869, + "learning_rate": 0.00016519110290125652, + "loss": 0.6561, + "step": 3308 + }, + { + "epoch": 0.2958690987124464, + "grad_norm": 0.13004339258655392, + "learning_rate": 0.0001651691398483267, + "loss": 0.662, + "step": 3309 + }, + { + "epoch": 0.2959585121602289, + "grad_norm": 0.12190325475813726, + "learning_rate": 0.00016514717132973982, + "loss": 0.6456, + "step": 3310 + }, + { + "epoch": 0.29604792560801146, + "grad_norm": 0.12650554671809044, + "learning_rate": 0.00016512519734733836, + "loss": 0.6874, + "step": 3311 + }, + { + "epoch": 0.296137339055794, + "grad_norm": 0.12202528510933723, + "learning_rate": 0.00016510321790296525, + "loss": 0.6888, + "step": 3312 + }, + { + "epoch": 0.29622675250357655, + "grad_norm": 0.11479080144028632, + "learning_rate": 0.0001650812329984639, + "loss": 0.5535, + "step": 3313 + }, + { + "epoch": 0.29631616595135907, + "grad_norm": 0.13555978019918838, + "learning_rate": 0.0001650592426356781, + "loss": 0.6987, + "step": 3314 + }, + { + "epoch": 0.29640557939914164, + "grad_norm": 0.12480957469593582, + "learning_rate": 0.00016503724681645222, + "loss": 0.6644, + "step": 3315 + }, + { + "epoch": 0.29649499284692415, + "grad_norm": 0.12372642717639255, + "learning_rate": 0.000165015245542631, + "loss": 0.6649, + "step": 3316 + }, + { + "epoch": 0.2965844062947067, + "grad_norm": 0.1416391552520613, + "learning_rate": 0.00016499323881605964, + "loss": 0.6968, + "step": 3317 + }, + { + "epoch": 0.2966738197424893, + "grad_norm": 0.12835352264605396, + "learning_rate": 0.00016497122663858385, + "loss": 0.7104, + "step": 3318 + }, + { + "epoch": 0.2967632331902718, + "grad_norm": 0.11792267587986545, + "learning_rate": 0.0001649492090120497, + "loss": 0.6769, + "step": 3319 + }, + { + "epoch": 0.2968526466380544, + "grad_norm": 0.14093590317065033, + "learning_rate": 0.00016492718593830389, + "loss": 0.7639, + "step": 3320 + }, + { + "epoch": 0.2969420600858369, + "grad_norm": 0.1338432209337858, + "learning_rate": 0.00016490515741919334, + "loss": 0.6749, + "step": 3321 + }, + { + "epoch": 0.2970314735336195, + "grad_norm": 0.1288430822372793, + "learning_rate": 0.00016488312345656566, + "loss": 0.6631, + "step": 3322 + }, + { + "epoch": 0.297120886981402, + "grad_norm": 0.12361050831108586, + "learning_rate": 0.0001648610840522688, + "loss": 0.69, + "step": 3323 + }, + { + "epoch": 0.29721030042918456, + "grad_norm": 0.12880136709165016, + "learning_rate": 0.00016483903920815111, + "loss": 0.6946, + "step": 3324 + }, + { + "epoch": 0.2972997138769671, + "grad_norm": 0.12543743778138997, + "learning_rate": 0.0001648169889260615, + "loss": 0.6827, + "step": 3325 + }, + { + "epoch": 0.29738912732474965, + "grad_norm": 0.11891513234264257, + "learning_rate": 0.00016479493320784938, + "loss": 0.5721, + "step": 3326 + }, + { + "epoch": 0.29747854077253216, + "grad_norm": 0.1245009870826884, + "learning_rate": 0.0001647728720553644, + "loss": 0.6433, + "step": 3327 + }, + { + "epoch": 0.29756795422031473, + "grad_norm": 0.1352675996707778, + "learning_rate": 0.00016475080547045687, + "loss": 0.6922, + "step": 3328 + }, + { + "epoch": 0.2976573676680973, + "grad_norm": 0.12990280975172214, + "learning_rate": 0.0001647287334549775, + "loss": 0.6891, + "step": 3329 + }, + { + "epoch": 0.2977467811158798, + "grad_norm": 0.11062733534165609, + "learning_rate": 0.00016470665601077742, + "loss": 0.6831, + "step": 3330 + }, + { + "epoch": 0.2978361945636624, + "grad_norm": 0.11656150016217433, + "learning_rate": 0.00016468457313970826, + "loss": 0.5844, + "step": 3331 + }, + { + "epoch": 0.2979256080114449, + "grad_norm": 0.13600492158573585, + "learning_rate": 0.00016466248484362208, + "loss": 0.6346, + "step": 3332 + }, + { + "epoch": 0.2980150214592275, + "grad_norm": 0.10784032144081082, + "learning_rate": 0.00016464039112437138, + "loss": 0.6599, + "step": 3333 + }, + { + "epoch": 0.29810443490701, + "grad_norm": 0.12360041661560128, + "learning_rate": 0.00016461829198380912, + "loss": 0.6853, + "step": 3334 + }, + { + "epoch": 0.29819384835479257, + "grad_norm": 0.13263134103888244, + "learning_rate": 0.00016459618742378876, + "loss": 0.6968, + "step": 3335 + }, + { + "epoch": 0.2982832618025751, + "grad_norm": 0.12301444150299923, + "learning_rate": 0.0001645740774461642, + "loss": 0.6676, + "step": 3336 + }, + { + "epoch": 0.29837267525035766, + "grad_norm": 0.12726996059579326, + "learning_rate": 0.00016455196205278968, + "loss": 0.7009, + "step": 3337 + }, + { + "epoch": 0.2984620886981402, + "grad_norm": 0.12551744181768054, + "learning_rate": 0.0001645298412455201, + "loss": 0.7083, + "step": 3338 + }, + { + "epoch": 0.29855150214592274, + "grad_norm": 0.13123124985177817, + "learning_rate": 0.0001645077150262107, + "loss": 0.7062, + "step": 3339 + }, + { + "epoch": 0.2986409155937053, + "grad_norm": 0.12187590544220947, + "learning_rate": 0.00016448558339671713, + "loss": 0.6649, + "step": 3340 + }, + { + "epoch": 0.29873032904148783, + "grad_norm": 0.13219987180956805, + "learning_rate": 0.00016446344635889554, + "loss": 0.7056, + "step": 3341 + }, + { + "epoch": 0.2988197424892704, + "grad_norm": 0.1676795987270043, + "learning_rate": 0.00016444130391460258, + "loss": 0.6847, + "step": 3342 + }, + { + "epoch": 0.2989091559370529, + "grad_norm": 0.13401594491709268, + "learning_rate": 0.00016441915606569526, + "loss": 0.6959, + "step": 3343 + }, + { + "epoch": 0.2989985693848355, + "grad_norm": 0.11201222349427026, + "learning_rate": 0.00016439700281403114, + "loss": 0.6754, + "step": 3344 + }, + { + "epoch": 0.299087982832618, + "grad_norm": 0.11155840042745455, + "learning_rate": 0.00016437484416146817, + "loss": 0.6283, + "step": 3345 + }, + { + "epoch": 0.2991773962804006, + "grad_norm": 0.11812191770592936, + "learning_rate": 0.00016435268010986476, + "loss": 0.7046, + "step": 3346 + }, + { + "epoch": 0.2992668097281831, + "grad_norm": 0.13402066237102891, + "learning_rate": 0.0001643305106610798, + "loss": 0.7346, + "step": 3347 + }, + { + "epoch": 0.29935622317596566, + "grad_norm": 0.1189452290894087, + "learning_rate": 0.00016430833581697254, + "loss": 0.6693, + "step": 3348 + }, + { + "epoch": 0.29944563662374823, + "grad_norm": 0.12931700878338384, + "learning_rate": 0.00016428615557940288, + "loss": 0.6958, + "step": 3349 + }, + { + "epoch": 0.29953505007153075, + "grad_norm": 0.11666521736788464, + "learning_rate": 0.000164263969950231, + "loss": 0.7209, + "step": 3350 + }, + { + "epoch": 0.2996244635193133, + "grad_norm": 0.11424011364364658, + "learning_rate": 0.0001642417789313175, + "loss": 0.6719, + "step": 3351 + }, + { + "epoch": 0.29971387696709584, + "grad_norm": 0.10797879619230094, + "learning_rate": 0.00016421958252452363, + "loss": 0.6623, + "step": 3352 + }, + { + "epoch": 0.2998032904148784, + "grad_norm": 0.13686101303783577, + "learning_rate": 0.00016419738073171093, + "loss": 0.6919, + "step": 3353 + }, + { + "epoch": 0.2998927038626609, + "grad_norm": 0.12132714653604808, + "learning_rate": 0.00016417517355474145, + "loss": 0.6876, + "step": 3354 + }, + { + "epoch": 0.2999821173104435, + "grad_norm": 0.11968609576327024, + "learning_rate": 0.00016415296099547765, + "loss": 0.6921, + "step": 3355 + }, + { + "epoch": 0.300071530758226, + "grad_norm": 0.13298007550313592, + "learning_rate": 0.0001641307430557825, + "loss": 0.7007, + "step": 3356 + }, + { + "epoch": 0.3001609442060086, + "grad_norm": 0.12623657248372716, + "learning_rate": 0.0001641085197375194, + "loss": 0.6773, + "step": 3357 + }, + { + "epoch": 0.30025035765379116, + "grad_norm": 0.13910488001162177, + "learning_rate": 0.00016408629104255212, + "loss": 0.6982, + "step": 3358 + }, + { + "epoch": 0.30033977110157367, + "grad_norm": 0.12648974588699965, + "learning_rate": 0.00016406405697274505, + "loss": 0.6691, + "step": 3359 + }, + { + "epoch": 0.30042918454935624, + "grad_norm": 0.11411201519474078, + "learning_rate": 0.00016404181752996289, + "loss": 0.6693, + "step": 3360 + }, + { + "epoch": 0.30051859799713876, + "grad_norm": 0.12838745534267332, + "learning_rate": 0.00016401957271607083, + "loss": 0.6828, + "step": 3361 + }, + { + "epoch": 0.30060801144492133, + "grad_norm": 0.14204916500740292, + "learning_rate": 0.0001639973225329345, + "loss": 0.713, + "step": 3362 + }, + { + "epoch": 0.30069742489270385, + "grad_norm": 0.12658714816812217, + "learning_rate": 0.00016397506698242003, + "loss": 0.692, + "step": 3363 + }, + { + "epoch": 0.3007868383404864, + "grad_norm": 0.12350755058075635, + "learning_rate": 0.00016395280606639395, + "loss": 0.6365, + "step": 3364 + }, + { + "epoch": 0.30087625178826893, + "grad_norm": 0.11096288793986957, + "learning_rate": 0.00016393053978672328, + "loss": 0.6487, + "step": 3365 + }, + { + "epoch": 0.3009656652360515, + "grad_norm": 0.12898818122567143, + "learning_rate": 0.00016390826814527545, + "loss": 0.7215, + "step": 3366 + }, + { + "epoch": 0.301055078683834, + "grad_norm": 0.11996134863047009, + "learning_rate": 0.00016388599114391833, + "loss": 0.6742, + "step": 3367 + }, + { + "epoch": 0.3011444921316166, + "grad_norm": 0.11422606865424266, + "learning_rate": 0.0001638637087845203, + "loss": 0.647, + "step": 3368 + }, + { + "epoch": 0.30123390557939916, + "grad_norm": 0.1471338644120097, + "learning_rate": 0.00016384142106895015, + "loss": 0.7126, + "step": 3369 + }, + { + "epoch": 0.3013233190271817, + "grad_norm": 0.12740047616763386, + "learning_rate": 0.0001638191279990771, + "loss": 0.6777, + "step": 3370 + }, + { + "epoch": 0.30141273247496425, + "grad_norm": 0.13936506221235923, + "learning_rate": 0.00016379682957677087, + "loss": 0.6892, + "step": 3371 + }, + { + "epoch": 0.30150214592274677, + "grad_norm": 0.1184797934856671, + "learning_rate": 0.00016377452580390158, + "loss": 0.6698, + "step": 3372 + }, + { + "epoch": 0.30159155937052934, + "grad_norm": 0.11999632476082969, + "learning_rate": 0.00016375221668233985, + "loss": 0.7021, + "step": 3373 + }, + { + "epoch": 0.30168097281831185, + "grad_norm": 0.12482317264539718, + "learning_rate": 0.00016372990221395666, + "loss": 0.6839, + "step": 3374 + }, + { + "epoch": 0.3017703862660944, + "grad_norm": 0.1348603745606632, + "learning_rate": 0.00016370758240062357, + "loss": 0.7124, + "step": 3375 + }, + { + "epoch": 0.30185979971387694, + "grad_norm": 0.1205853419800009, + "learning_rate": 0.00016368525724421248, + "loss": 0.6639, + "step": 3376 + }, + { + "epoch": 0.3019492131616595, + "grad_norm": 0.11941317976311948, + "learning_rate": 0.00016366292674659577, + "loss": 0.6835, + "step": 3377 + }, + { + "epoch": 0.3020386266094421, + "grad_norm": 0.13621840465276264, + "learning_rate": 0.0001636405909096463, + "loss": 0.6736, + "step": 3378 + }, + { + "epoch": 0.3021280400572246, + "grad_norm": 0.12473930862520939, + "learning_rate": 0.0001636182497352373, + "loss": 0.6497, + "step": 3379 + }, + { + "epoch": 0.30221745350500717, + "grad_norm": 0.1326896948634738, + "learning_rate": 0.00016359590322524253, + "loss": 0.7227, + "step": 3380 + }, + { + "epoch": 0.3023068669527897, + "grad_norm": 0.13972268872792856, + "learning_rate": 0.0001635735513815362, + "loss": 0.6662, + "step": 3381 + }, + { + "epoch": 0.30239628040057226, + "grad_norm": 0.13050214392721568, + "learning_rate": 0.00016355119420599282, + "loss": 0.6924, + "step": 3382 + }, + { + "epoch": 0.3024856938483548, + "grad_norm": 0.19842809345571827, + "learning_rate": 0.00016352883170048758, + "loss": 0.6859, + "step": 3383 + }, + { + "epoch": 0.30257510729613735, + "grad_norm": 0.12140652107758057, + "learning_rate": 0.00016350646386689593, + "loss": 0.6336, + "step": 3384 + }, + { + "epoch": 0.30266452074391986, + "grad_norm": 0.12927738045701329, + "learning_rate": 0.0001634840907070939, + "loss": 0.7002, + "step": 3385 + }, + { + "epoch": 0.30275393419170243, + "grad_norm": 0.15400740555411985, + "learning_rate": 0.0001634617122229578, + "loss": 0.6832, + "step": 3386 + }, + { + "epoch": 0.302843347639485, + "grad_norm": 0.12714644240088482, + "learning_rate": 0.00016343932841636456, + "loss": 0.7239, + "step": 3387 + }, + { + "epoch": 0.3029327610872675, + "grad_norm": 0.1286750177170258, + "learning_rate": 0.00016341693928919145, + "loss": 0.6758, + "step": 3388 + }, + { + "epoch": 0.3030221745350501, + "grad_norm": 0.12471364979726633, + "learning_rate": 0.00016339454484331624, + "loss": 0.6483, + "step": 3389 + }, + { + "epoch": 0.3031115879828326, + "grad_norm": 0.12385127043388686, + "learning_rate": 0.00016337214508061712, + "loss": 0.668, + "step": 3390 + }, + { + "epoch": 0.3032010014306152, + "grad_norm": 0.21156031484445456, + "learning_rate": 0.00016334974000297271, + "loss": 0.6858, + "step": 3391 + }, + { + "epoch": 0.3032904148783977, + "grad_norm": 0.11262192154672855, + "learning_rate": 0.0001633273296122621, + "loss": 0.6233, + "step": 3392 + }, + { + "epoch": 0.30337982832618027, + "grad_norm": 0.12619007601623022, + "learning_rate": 0.0001633049139103649, + "loss": 0.6867, + "step": 3393 + }, + { + "epoch": 0.3034692417739628, + "grad_norm": 0.11330247903015722, + "learning_rate": 0.00016328249289916097, + "loss": 0.6446, + "step": 3394 + }, + { + "epoch": 0.30355865522174535, + "grad_norm": 0.13373877330164524, + "learning_rate": 0.00016326006658053078, + "loss": 0.683, + "step": 3395 + }, + { + "epoch": 0.30364806866952787, + "grad_norm": 0.12535736227240038, + "learning_rate": 0.00016323763495635523, + "loss": 0.6613, + "step": 3396 + }, + { + "epoch": 0.30373748211731044, + "grad_norm": 0.1301687069358041, + "learning_rate": 0.00016321519802851557, + "loss": 0.6727, + "step": 3397 + }, + { + "epoch": 0.303826895565093, + "grad_norm": 0.12265789826524313, + "learning_rate": 0.00016319275579889365, + "loss": 0.6208, + "step": 3398 + }, + { + "epoch": 0.30391630901287553, + "grad_norm": 0.13150488856879597, + "learning_rate": 0.0001631703082693716, + "loss": 0.6419, + "step": 3399 + }, + { + "epoch": 0.3040057224606581, + "grad_norm": 0.1360730967979245, + "learning_rate": 0.00016314785544183208, + "loss": 0.7108, + "step": 3400 + }, + { + "epoch": 0.3040951359084406, + "grad_norm": 0.11518540996318953, + "learning_rate": 0.00016312539731815816, + "loss": 0.6441, + "step": 3401 + }, + { + "epoch": 0.3041845493562232, + "grad_norm": 0.13471367659085356, + "learning_rate": 0.00016310293390023344, + "loss": 0.6448, + "step": 3402 + }, + { + "epoch": 0.3042739628040057, + "grad_norm": 0.11864956676070604, + "learning_rate": 0.00016308046518994184, + "loss": 0.668, + "step": 3403 + }, + { + "epoch": 0.3043633762517883, + "grad_norm": 0.13514138746422796, + "learning_rate": 0.00016305799118916783, + "loss": 0.7081, + "step": 3404 + }, + { + "epoch": 0.3044527896995708, + "grad_norm": 0.1383845190996653, + "learning_rate": 0.00016303551189979625, + "loss": 0.7213, + "step": 3405 + }, + { + "epoch": 0.30454220314735336, + "grad_norm": 0.12203799847404735, + "learning_rate": 0.0001630130273237124, + "loss": 0.7019, + "step": 3406 + }, + { + "epoch": 0.30463161659513593, + "grad_norm": 0.1304064889339467, + "learning_rate": 0.00016299053746280206, + "loss": 0.6223, + "step": 3407 + }, + { + "epoch": 0.30472103004291845, + "grad_norm": 0.12971976496867849, + "learning_rate": 0.00016296804231895142, + "loss": 0.7002, + "step": 3408 + }, + { + "epoch": 0.304810443490701, + "grad_norm": 0.1295642812874813, + "learning_rate": 0.00016294554189404708, + "loss": 0.6779, + "step": 3409 + }, + { + "epoch": 0.30489985693848354, + "grad_norm": 0.1290207062561215, + "learning_rate": 0.00016292303618997619, + "loss": 0.6621, + "step": 3410 + }, + { + "epoch": 0.3049892703862661, + "grad_norm": 0.11859633503869571, + "learning_rate": 0.00016290052520862624, + "loss": 0.6822, + "step": 3411 + }, + { + "epoch": 0.3050786838340486, + "grad_norm": 0.12981288695392856, + "learning_rate": 0.00016287800895188522, + "loss": 0.671, + "step": 3412 + }, + { + "epoch": 0.3051680972818312, + "grad_norm": 0.14997052392667512, + "learning_rate": 0.0001628554874216415, + "loss": 0.7236, + "step": 3413 + }, + { + "epoch": 0.3052575107296137, + "grad_norm": 0.12881992234302725, + "learning_rate": 0.00016283296061978398, + "loss": 0.6945, + "step": 3414 + }, + { + "epoch": 0.3053469241773963, + "grad_norm": 0.14036757856828416, + "learning_rate": 0.00016281042854820194, + "loss": 0.6997, + "step": 3415 + }, + { + "epoch": 0.3054363376251788, + "grad_norm": 0.1369217743791982, + "learning_rate": 0.0001627878912087851, + "loss": 0.7182, + "step": 3416 + }, + { + "epoch": 0.30552575107296137, + "grad_norm": 0.13972776090347097, + "learning_rate": 0.00016276534860342368, + "loss": 0.7285, + "step": 3417 + }, + { + "epoch": 0.30561516452074394, + "grad_norm": 0.131927988357579, + "learning_rate": 0.00016274280073400824, + "loss": 0.6612, + "step": 3418 + }, + { + "epoch": 0.30570457796852646, + "grad_norm": 0.11737464606972436, + "learning_rate": 0.00016272024760242992, + "loss": 0.6551, + "step": 3419 + }, + { + "epoch": 0.30579399141630903, + "grad_norm": 0.15483270337112676, + "learning_rate": 0.00016269768921058013, + "loss": 0.7076, + "step": 3420 + }, + { + "epoch": 0.30588340486409155, + "grad_norm": 0.12002885972891218, + "learning_rate": 0.0001626751255603509, + "loss": 0.6433, + "step": 3421 + }, + { + "epoch": 0.3059728183118741, + "grad_norm": 0.11838138872588586, + "learning_rate": 0.00016265255665363454, + "loss": 0.6507, + "step": 3422 + }, + { + "epoch": 0.30606223175965663, + "grad_norm": 0.12269700046720604, + "learning_rate": 0.00016262998249232398, + "loss": 0.7122, + "step": 3423 + }, + { + "epoch": 0.3061516452074392, + "grad_norm": 0.1621559592573383, + "learning_rate": 0.00016260740307831237, + "loss": 0.6491, + "step": 3424 + }, + { + "epoch": 0.3062410586552217, + "grad_norm": 0.14262429715413166, + "learning_rate": 0.00016258481841349348, + "loss": 0.6794, + "step": 3425 + }, + { + "epoch": 0.3063304721030043, + "grad_norm": 0.1258902556344438, + "learning_rate": 0.0001625622284997615, + "loss": 0.7171, + "step": 3426 + }, + { + "epoch": 0.30641988555078686, + "grad_norm": 0.13648417856386616, + "learning_rate": 0.0001625396333390109, + "loss": 0.6716, + "step": 3427 + }, + { + "epoch": 0.3065092989985694, + "grad_norm": 0.12350690196082571, + "learning_rate": 0.00016251703293313687, + "loss": 0.6923, + "step": 3428 + }, + { + "epoch": 0.30659871244635195, + "grad_norm": 0.12927824682783925, + "learning_rate": 0.00016249442728403474, + "loss": 0.6081, + "step": 3429 + }, + { + "epoch": 0.30668812589413447, + "grad_norm": 0.11195499725919228, + "learning_rate": 0.00016247181639360045, + "loss": 0.6477, + "step": 3430 + }, + { + "epoch": 0.30677753934191704, + "grad_norm": 0.12975846844859154, + "learning_rate": 0.00016244920026373038, + "loss": 0.731, + "step": 3431 + }, + { + "epoch": 0.30686695278969955, + "grad_norm": 0.1402265163856159, + "learning_rate": 0.00016242657889632133, + "loss": 0.6634, + "step": 3432 + }, + { + "epoch": 0.3069563662374821, + "grad_norm": 0.1361654245335197, + "learning_rate": 0.0001624039522932705, + "loss": 0.652, + "step": 3433 + }, + { + "epoch": 0.30704577968526464, + "grad_norm": 0.1488150713136209, + "learning_rate": 0.00016238132045647553, + "loss": 0.7344, + "step": 3434 + }, + { + "epoch": 0.3071351931330472, + "grad_norm": 0.11875837529957414, + "learning_rate": 0.00016235868338783455, + "loss": 0.665, + "step": 3435 + }, + { + "epoch": 0.3072246065808298, + "grad_norm": 0.10611302796232193, + "learning_rate": 0.00016233604108924609, + "loss": 0.6699, + "step": 3436 + }, + { + "epoch": 0.3073140200286123, + "grad_norm": 0.13895919530464565, + "learning_rate": 0.0001623133935626092, + "loss": 0.6723, + "step": 3437 + }, + { + "epoch": 0.30740343347639487, + "grad_norm": 0.11075271105661412, + "learning_rate": 0.00016229074080982317, + "loss": 0.6499, + "step": 3438 + }, + { + "epoch": 0.3074928469241774, + "grad_norm": 0.10258032262165945, + "learning_rate": 0.000162268082832788, + "loss": 0.6616, + "step": 3439 + }, + { + "epoch": 0.30758226037195996, + "grad_norm": 0.12116132106753459, + "learning_rate": 0.00016224541963340391, + "loss": 0.6918, + "step": 3440 + }, + { + "epoch": 0.3076716738197425, + "grad_norm": 0.1163149626605016, + "learning_rate": 0.00016222275121357163, + "loss": 0.6801, + "step": 3441 + }, + { + "epoch": 0.30776108726752505, + "grad_norm": 0.11026364724519838, + "learning_rate": 0.00016220007757519238, + "loss": 0.6656, + "step": 3442 + }, + { + "epoch": 0.30785050071530756, + "grad_norm": 0.13104626645057457, + "learning_rate": 0.00016217739872016772, + "loss": 0.6702, + "step": 3443 + }, + { + "epoch": 0.30793991416309013, + "grad_norm": 0.11876657943067712, + "learning_rate": 0.00016215471465039975, + "loss": 0.669, + "step": 3444 + }, + { + "epoch": 0.30802932761087265, + "grad_norm": 0.1268751606547932, + "learning_rate": 0.00016213202536779087, + "loss": 0.7284, + "step": 3445 + }, + { + "epoch": 0.3081187410586552, + "grad_norm": 0.12690617560381828, + "learning_rate": 0.00016210933087424412, + "loss": 0.7135, + "step": 3446 + }, + { + "epoch": 0.3082081545064378, + "grad_norm": 0.13180192929032425, + "learning_rate": 0.00016208663117166277, + "loss": 0.6915, + "step": 3447 + }, + { + "epoch": 0.3082975679542203, + "grad_norm": 0.1178936623990904, + "learning_rate": 0.00016206392626195063, + "loss": 0.7019, + "step": 3448 + }, + { + "epoch": 0.3083869814020029, + "grad_norm": 0.12095731307918228, + "learning_rate": 0.00016204121614701197, + "loss": 0.6864, + "step": 3449 + }, + { + "epoch": 0.3084763948497854, + "grad_norm": 0.11486599041549994, + "learning_rate": 0.00016201850082875146, + "loss": 0.6669, + "step": 3450 + }, + { + "epoch": 0.30856580829756797, + "grad_norm": 0.10803265462354479, + "learning_rate": 0.00016199578030907415, + "loss": 0.6803, + "step": 3451 + }, + { + "epoch": 0.3086552217453505, + "grad_norm": 0.10654111289974834, + "learning_rate": 0.0001619730545898856, + "loss": 0.6923, + "step": 3452 + }, + { + "epoch": 0.30874463519313305, + "grad_norm": 0.10636402210574725, + "learning_rate": 0.00016195032367309183, + "loss": 0.7057, + "step": 3453 + }, + { + "epoch": 0.30883404864091557, + "grad_norm": 0.1336243508977306, + "learning_rate": 0.00016192758756059926, + "loss": 0.7259, + "step": 3454 + }, + { + "epoch": 0.30892346208869814, + "grad_norm": 0.1343513678393242, + "learning_rate": 0.00016190484625431468, + "loss": 0.6872, + "step": 3455 + }, + { + "epoch": 0.3090128755364807, + "grad_norm": 0.14871150010100675, + "learning_rate": 0.00016188209975614542, + "loss": 0.7009, + "step": 3456 + }, + { + "epoch": 0.30910228898426323, + "grad_norm": 0.1501022035681944, + "learning_rate": 0.00016185934806799916, + "loss": 0.7244, + "step": 3457 + }, + { + "epoch": 0.3091917024320458, + "grad_norm": 0.1278726859652057, + "learning_rate": 0.0001618365911917841, + "loss": 0.6924, + "step": 3458 + }, + { + "epoch": 0.3092811158798283, + "grad_norm": 0.13089932088179476, + "learning_rate": 0.00016181382912940884, + "loss": 0.6451, + "step": 3459 + }, + { + "epoch": 0.3093705293276109, + "grad_norm": 0.12223310805201001, + "learning_rate": 0.00016179106188278234, + "loss": 0.7058, + "step": 3460 + }, + { + "epoch": 0.3094599427753934, + "grad_norm": 0.12450956516584354, + "learning_rate": 0.00016176828945381415, + "loss": 0.6514, + "step": 3461 + }, + { + "epoch": 0.309549356223176, + "grad_norm": 0.14217125979233647, + "learning_rate": 0.00016174551184441408, + "loss": 0.6914, + "step": 3462 + }, + { + "epoch": 0.3096387696709585, + "grad_norm": 0.12048141985565185, + "learning_rate": 0.00016172272905649253, + "loss": 0.6605, + "step": 3463 + }, + { + "epoch": 0.30972818311874106, + "grad_norm": 0.12091555308875819, + "learning_rate": 0.00016169994109196023, + "loss": 0.6364, + "step": 3464 + }, + { + "epoch": 0.3098175965665236, + "grad_norm": 0.13018610921639331, + "learning_rate": 0.00016167714795272837, + "loss": 0.7279, + "step": 3465 + }, + { + "epoch": 0.30990701001430615, + "grad_norm": 0.13416681735157374, + "learning_rate": 0.00016165434964070862, + "loss": 0.6959, + "step": 3466 + }, + { + "epoch": 0.3099964234620887, + "grad_norm": 0.12810404243325316, + "learning_rate": 0.000161631546157813, + "loss": 0.6985, + "step": 3467 + }, + { + "epoch": 0.31008583690987124, + "grad_norm": 0.13114121581823684, + "learning_rate": 0.00016160873750595405, + "loss": 0.6533, + "step": 3468 + }, + { + "epoch": 0.3101752503576538, + "grad_norm": 0.12615106142551016, + "learning_rate": 0.00016158592368704472, + "loss": 0.6751, + "step": 3469 + }, + { + "epoch": 0.3102646638054363, + "grad_norm": 0.11251976343106306, + "learning_rate": 0.00016156310470299832, + "loss": 0.6498, + "step": 3470 + }, + { + "epoch": 0.3103540772532189, + "grad_norm": 0.1288890242347081, + "learning_rate": 0.00016154028055572866, + "loss": 0.7242, + "step": 3471 + }, + { + "epoch": 0.3104434907010014, + "grad_norm": 0.1491375646737293, + "learning_rate": 0.00016151745124715002, + "loss": 0.734, + "step": 3472 + }, + { + "epoch": 0.310532904148784, + "grad_norm": 0.11547531479428318, + "learning_rate": 0.000161494616779177, + "loss": 0.6385, + "step": 3473 + }, + { + "epoch": 0.3106223175965665, + "grad_norm": 0.11970833416791953, + "learning_rate": 0.00016147177715372476, + "loss": 0.6633, + "step": 3474 + }, + { + "epoch": 0.31071173104434907, + "grad_norm": 0.12620848260479992, + "learning_rate": 0.00016144893237270887, + "loss": 0.6569, + "step": 3475 + }, + { + "epoch": 0.31080114449213164, + "grad_norm": 0.12871919605189508, + "learning_rate": 0.00016142608243804513, + "loss": 0.6808, + "step": 3476 + }, + { + "epoch": 0.31089055793991416, + "grad_norm": 0.14892306382886092, + "learning_rate": 0.0001614032273516501, + "loss": 0.7101, + "step": 3477 + }, + { + "epoch": 0.31097997138769673, + "grad_norm": 0.12043263291240128, + "learning_rate": 0.00016138036711544054, + "loss": 0.6578, + "step": 3478 + }, + { + "epoch": 0.31106938483547925, + "grad_norm": 0.12693441318493037, + "learning_rate": 0.0001613575017313337, + "loss": 0.6425, + "step": 3479 + }, + { + "epoch": 0.3111587982832618, + "grad_norm": 0.1397704078187446, + "learning_rate": 0.00016133463120124731, + "loss": 0.7162, + "step": 3480 + }, + { + "epoch": 0.31124821173104433, + "grad_norm": 0.12671553028527652, + "learning_rate": 0.00016131175552709946, + "loss": 0.6774, + "step": 3481 + }, + { + "epoch": 0.3113376251788269, + "grad_norm": 0.12159179064868342, + "learning_rate": 0.00016128887471080874, + "loss": 0.7093, + "step": 3482 + }, + { + "epoch": 0.3114270386266094, + "grad_norm": 0.13464980236792726, + "learning_rate": 0.00016126598875429408, + "loss": 0.6734, + "step": 3483 + }, + { + "epoch": 0.311516452074392, + "grad_norm": 0.11205551087688274, + "learning_rate": 0.00016124309765947498, + "loss": 0.6461, + "step": 3484 + }, + { + "epoch": 0.3116058655221745, + "grad_norm": 0.11235091108204397, + "learning_rate": 0.00016122020142827123, + "loss": 0.6509, + "step": 3485 + }, + { + "epoch": 0.3116952789699571, + "grad_norm": 0.1328120400496156, + "learning_rate": 0.0001611973000626031, + "loss": 0.6788, + "step": 3486 + }, + { + "epoch": 0.31178469241773965, + "grad_norm": 0.13774854643845671, + "learning_rate": 0.00016117439356439132, + "loss": 0.7201, + "step": 3487 + }, + { + "epoch": 0.31187410586552217, + "grad_norm": 0.11941172538886091, + "learning_rate": 0.00016115148193555706, + "loss": 0.6525, + "step": 3488 + }, + { + "epoch": 0.31196351931330474, + "grad_norm": 0.1289227111362865, + "learning_rate": 0.00016112856517802183, + "loss": 0.6882, + "step": 3489 + }, + { + "epoch": 0.31205293276108725, + "grad_norm": 0.10730088656935113, + "learning_rate": 0.0001611056432937077, + "loss": 0.688, + "step": 3490 + }, + { + "epoch": 0.3121423462088698, + "grad_norm": 0.14155525967673396, + "learning_rate": 0.00016108271628453703, + "loss": 0.6896, + "step": 3491 + }, + { + "epoch": 0.31223175965665234, + "grad_norm": 0.12361260247153859, + "learning_rate": 0.00016105978415243276, + "loss": 0.6631, + "step": 3492 + }, + { + "epoch": 0.3123211731044349, + "grad_norm": 0.11917037204754492, + "learning_rate": 0.00016103684689931807, + "loss": 0.6503, + "step": 3493 + }, + { + "epoch": 0.31241058655221743, + "grad_norm": 0.12746219482238452, + "learning_rate": 0.0001610139045271168, + "loss": 0.5923, + "step": 3494 + }, + { + "epoch": 0.3125, + "grad_norm": 0.11482163218428473, + "learning_rate": 0.00016099095703775302, + "loss": 0.6711, + "step": 3495 + }, + { + "epoch": 0.31258941344778257, + "grad_norm": 0.11326622942751362, + "learning_rate": 0.00016096800443315132, + "loss": 0.6473, + "step": 3496 + }, + { + "epoch": 0.3126788268955651, + "grad_norm": 0.12601303200142608, + "learning_rate": 0.00016094504671523673, + "loss": 0.6619, + "step": 3497 + }, + { + "epoch": 0.31276824034334766, + "grad_norm": 0.11419751384329468, + "learning_rate": 0.00016092208388593469, + "loss": 0.6853, + "step": 3498 + }, + { + "epoch": 0.3128576537911302, + "grad_norm": 0.13949415217118308, + "learning_rate": 0.00016089911594717102, + "loss": 0.6752, + "step": 3499 + }, + { + "epoch": 0.31294706723891275, + "grad_norm": 0.12244174683779277, + "learning_rate": 0.00016087614290087208, + "loss": 0.701, + "step": 3500 + }, + { + "epoch": 0.31303648068669526, + "grad_norm": 0.11916666348124495, + "learning_rate": 0.00016085316474896452, + "loss": 0.6809, + "step": 3501 + }, + { + "epoch": 0.31312589413447783, + "grad_norm": 0.12553136846522908, + "learning_rate": 0.00016083018149337558, + "loss": 0.6754, + "step": 3502 + }, + { + "epoch": 0.31321530758226035, + "grad_norm": 0.11904741453727131, + "learning_rate": 0.0001608071931360327, + "loss": 0.6618, + "step": 3503 + }, + { + "epoch": 0.3133047210300429, + "grad_norm": 0.12074698944391209, + "learning_rate": 0.00016078419967886402, + "loss": 0.6519, + "step": 3504 + }, + { + "epoch": 0.3133941344778255, + "grad_norm": 0.1202524328249098, + "learning_rate": 0.00016076120112379792, + "loss": 0.6801, + "step": 3505 + }, + { + "epoch": 0.313483547925608, + "grad_norm": 0.11565959530957656, + "learning_rate": 0.00016073819747276327, + "loss": 0.7006, + "step": 3506 + }, + { + "epoch": 0.3135729613733906, + "grad_norm": 0.11936501386101663, + "learning_rate": 0.0001607151887276893, + "loss": 0.6574, + "step": 3507 + }, + { + "epoch": 0.3136623748211731, + "grad_norm": 0.11499534434460049, + "learning_rate": 0.00016069217489050584, + "loss": 0.6462, + "step": 3508 + }, + { + "epoch": 0.31375178826895567, + "grad_norm": 0.1303603285824176, + "learning_rate": 0.00016066915596314293, + "loss": 0.6791, + "step": 3509 + }, + { + "epoch": 0.3138412017167382, + "grad_norm": 0.12142522581342825, + "learning_rate": 0.00016064613194753118, + "loss": 0.6648, + "step": 3510 + }, + { + "epoch": 0.31393061516452075, + "grad_norm": 0.12857222976689728, + "learning_rate": 0.0001606231028456016, + "loss": 0.6932, + "step": 3511 + }, + { + "epoch": 0.31402002861230327, + "grad_norm": 0.11813228885612105, + "learning_rate": 0.0001606000686592856, + "loss": 0.6431, + "step": 3512 + }, + { + "epoch": 0.31410944206008584, + "grad_norm": 0.1299463407818482, + "learning_rate": 0.00016057702939051502, + "loss": 0.718, + "step": 3513 + }, + { + "epoch": 0.31419885550786836, + "grad_norm": 0.12247797694518549, + "learning_rate": 0.00016055398504122214, + "loss": 0.5433, + "step": 3514 + }, + { + "epoch": 0.31428826895565093, + "grad_norm": 0.13247575755654184, + "learning_rate": 0.00016053093561333966, + "loss": 0.7099, + "step": 3515 + }, + { + "epoch": 0.3143776824034335, + "grad_norm": 0.13068918167094912, + "learning_rate": 0.00016050788110880072, + "loss": 0.657, + "step": 3516 + }, + { + "epoch": 0.314467095851216, + "grad_norm": 0.13821566932420987, + "learning_rate": 0.00016048482152953889, + "loss": 0.68, + "step": 3517 + }, + { + "epoch": 0.3145565092989986, + "grad_norm": 0.12313878000477985, + "learning_rate": 0.0001604617568774881, + "loss": 0.6401, + "step": 3518 + }, + { + "epoch": 0.3146459227467811, + "grad_norm": 0.11793129600193888, + "learning_rate": 0.0001604386871545828, + "loss": 0.6785, + "step": 3519 + }, + { + "epoch": 0.3147353361945637, + "grad_norm": 0.11575916367376889, + "learning_rate": 0.00016041561236275777, + "loss": 0.6664, + "step": 3520 + }, + { + "epoch": 0.3148247496423462, + "grad_norm": 0.12486008991594322, + "learning_rate": 0.00016039253250394833, + "loss": 0.6746, + "step": 3521 + }, + { + "epoch": 0.31491416309012876, + "grad_norm": 0.12959124125937277, + "learning_rate": 0.0001603694475800901, + "loss": 0.6561, + "step": 3522 + }, + { + "epoch": 0.3150035765379113, + "grad_norm": 0.13457238253780196, + "learning_rate": 0.00016034635759311922, + "loss": 0.667, + "step": 3523 + }, + { + "epoch": 0.31509298998569385, + "grad_norm": 0.12313639527084864, + "learning_rate": 0.00016032326254497218, + "loss": 0.6275, + "step": 3524 + }, + { + "epoch": 0.3151824034334764, + "grad_norm": 0.14176570771297983, + "learning_rate": 0.000160300162437586, + "loss": 0.6766, + "step": 3525 + }, + { + "epoch": 0.31527181688125894, + "grad_norm": 0.1311369160256755, + "learning_rate": 0.00016027705727289802, + "loss": 0.7009, + "step": 3526 + }, + { + "epoch": 0.3153612303290415, + "grad_norm": 0.13195702926454667, + "learning_rate": 0.00016025394705284602, + "loss": 0.6165, + "step": 3527 + }, + { + "epoch": 0.315450643776824, + "grad_norm": 0.12029431146119038, + "learning_rate": 0.00016023083177936823, + "loss": 0.6525, + "step": 3528 + }, + { + "epoch": 0.3155400572246066, + "grad_norm": 0.12600457402793988, + "learning_rate": 0.00016020771145440336, + "loss": 0.6831, + "step": 3529 + }, + { + "epoch": 0.3156294706723891, + "grad_norm": 0.10712136673172547, + "learning_rate": 0.00016018458607989044, + "loss": 0.6773, + "step": 3530 + }, + { + "epoch": 0.3157188841201717, + "grad_norm": 0.11107244933556867, + "learning_rate": 0.00016016145565776895, + "loss": 0.6597, + "step": 3531 + }, + { + "epoch": 0.3158082975679542, + "grad_norm": 0.12427861402650961, + "learning_rate": 0.00016013832018997882, + "loss": 0.6671, + "step": 3532 + }, + { + "epoch": 0.31589771101573677, + "grad_norm": 0.11854564633014246, + "learning_rate": 0.00016011517967846043, + "loss": 0.6706, + "step": 3533 + }, + { + "epoch": 0.3159871244635193, + "grad_norm": 0.1337534018969435, + "learning_rate": 0.00016009203412515455, + "loss": 0.6634, + "step": 3534 + }, + { + "epoch": 0.31607653791130186, + "grad_norm": 0.12269336081424649, + "learning_rate": 0.00016006888353200228, + "loss": 0.6901, + "step": 3535 + }, + { + "epoch": 0.31616595135908443, + "grad_norm": 0.13186167296611173, + "learning_rate": 0.00016004572790094535, + "loss": 0.664, + "step": 3536 + }, + { + "epoch": 0.31625536480686695, + "grad_norm": 0.12310525626929243, + "learning_rate": 0.0001600225672339257, + "loss": 0.6692, + "step": 3537 + }, + { + "epoch": 0.3163447782546495, + "grad_norm": 0.13368463907169886, + "learning_rate": 0.00015999940153288582, + "loss": 0.7237, + "step": 3538 + }, + { + "epoch": 0.31643419170243203, + "grad_norm": 0.14176268435364694, + "learning_rate": 0.00015997623079976863, + "loss": 0.6373, + "step": 3539 + }, + { + "epoch": 0.3165236051502146, + "grad_norm": 0.12902545799554427, + "learning_rate": 0.00015995305503651737, + "loss": 0.6697, + "step": 3540 + }, + { + "epoch": 0.3166130185979971, + "grad_norm": 0.12945621420951076, + "learning_rate": 0.00015992987424507578, + "loss": 0.6525, + "step": 3541 + }, + { + "epoch": 0.3167024320457797, + "grad_norm": 0.11576212786295376, + "learning_rate": 0.000159906688427388, + "loss": 0.6608, + "step": 3542 + }, + { + "epoch": 0.3167918454935622, + "grad_norm": 0.12193502660502531, + "learning_rate": 0.00015988349758539868, + "loss": 0.658, + "step": 3543 + }, + { + "epoch": 0.3168812589413448, + "grad_norm": 0.1233645214383578, + "learning_rate": 0.00015986030172105266, + "loss": 0.6748, + "step": 3544 + }, + { + "epoch": 0.31697067238912735, + "grad_norm": 0.12100197840728102, + "learning_rate": 0.00015983710083629547, + "loss": 0.6703, + "step": 3545 + }, + { + "epoch": 0.31706008583690987, + "grad_norm": 0.13157030707159656, + "learning_rate": 0.00015981389493307288, + "loss": 0.6558, + "step": 3546 + }, + { + "epoch": 0.31714949928469244, + "grad_norm": 0.15331112682375914, + "learning_rate": 0.0001597906840133312, + "loss": 0.7085, + "step": 3547 + }, + { + "epoch": 0.31723891273247495, + "grad_norm": 0.14343929683655082, + "learning_rate": 0.000159767468079017, + "loss": 0.679, + "step": 3548 + }, + { + "epoch": 0.3173283261802575, + "grad_norm": 0.1427385270280673, + "learning_rate": 0.00015974424713207746, + "loss": 0.6936, + "step": 3549 + }, + { + "epoch": 0.31741773962804004, + "grad_norm": 0.1300573353139227, + "learning_rate": 0.0001597210211744601, + "loss": 0.6601, + "step": 3550 + }, + { + "epoch": 0.3175071530758226, + "grad_norm": 0.12719876682787332, + "learning_rate": 0.0001596977902081128, + "loss": 0.7018, + "step": 3551 + }, + { + "epoch": 0.31759656652360513, + "grad_norm": 0.11186511576169847, + "learning_rate": 0.00015967455423498387, + "loss": 0.6551, + "step": 3552 + }, + { + "epoch": 0.3176859799713877, + "grad_norm": 0.12199199497688164, + "learning_rate": 0.00015965131325702223, + "loss": 0.657, + "step": 3553 + }, + { + "epoch": 0.3177753934191702, + "grad_norm": 0.13609027909960755, + "learning_rate": 0.00015962806727617694, + "loss": 0.6735, + "step": 3554 + }, + { + "epoch": 0.3178648068669528, + "grad_norm": 0.12799509220617422, + "learning_rate": 0.00015960481629439768, + "loss": 0.6991, + "step": 3555 + }, + { + "epoch": 0.31795422031473536, + "grad_norm": 0.13338262139254298, + "learning_rate": 0.00015958156031363444, + "loss": 0.6874, + "step": 3556 + }, + { + "epoch": 0.3180436337625179, + "grad_norm": 0.12602290795016746, + "learning_rate": 0.0001595582993358377, + "loss": 0.6636, + "step": 3557 + }, + { + "epoch": 0.31813304721030045, + "grad_norm": 0.10906119589279145, + "learning_rate": 0.00015953503336295835, + "loss": 0.6844, + "step": 3558 + }, + { + "epoch": 0.31822246065808296, + "grad_norm": 0.12026948363076372, + "learning_rate": 0.00015951176239694764, + "loss": 0.6421, + "step": 3559 + }, + { + "epoch": 0.31831187410586553, + "grad_norm": 0.11966338176148296, + "learning_rate": 0.00015948848643975726, + "loss": 0.6728, + "step": 3560 + }, + { + "epoch": 0.31840128755364805, + "grad_norm": 0.12891404778378393, + "learning_rate": 0.00015946520549333938, + "loss": 0.6062, + "step": 3561 + }, + { + "epoch": 0.3184907010014306, + "grad_norm": 0.14185939698850974, + "learning_rate": 0.00015944191955964655, + "loss": 0.6702, + "step": 3562 + }, + { + "epoch": 0.31858011444921314, + "grad_norm": 0.12634351259213866, + "learning_rate": 0.0001594186286406317, + "loss": 0.6929, + "step": 3563 + }, + { + "epoch": 0.3186695278969957, + "grad_norm": 0.11659911256020757, + "learning_rate": 0.00015939533273824822, + "loss": 0.6542, + "step": 3564 + }, + { + "epoch": 0.3187589413447783, + "grad_norm": 0.1302899333031904, + "learning_rate": 0.00015937203185444992, + "loss": 0.7028, + "step": 3565 + }, + { + "epoch": 0.3188483547925608, + "grad_norm": 0.12647728708208203, + "learning_rate": 0.000159348725991191, + "loss": 0.6092, + "step": 3566 + }, + { + "epoch": 0.31893776824034337, + "grad_norm": 0.12170121039554378, + "learning_rate": 0.00015932541515042615, + "loss": 0.6862, + "step": 3567 + }, + { + "epoch": 0.3190271816881259, + "grad_norm": 0.11488799489917363, + "learning_rate": 0.00015930209933411036, + "loss": 0.6954, + "step": 3568 + }, + { + "epoch": 0.31911659513590845, + "grad_norm": 0.12493791401322063, + "learning_rate": 0.00015927877854419908, + "loss": 0.6535, + "step": 3569 + }, + { + "epoch": 0.31920600858369097, + "grad_norm": 0.1314434225865832, + "learning_rate": 0.00015925545278264828, + "loss": 0.688, + "step": 3570 + }, + { + "epoch": 0.31929542203147354, + "grad_norm": 0.13439877998478744, + "learning_rate": 0.00015923212205141418, + "loss": 0.6915, + "step": 3571 + }, + { + "epoch": 0.31938483547925606, + "grad_norm": 0.12390066830466678, + "learning_rate": 0.00015920878635245357, + "loss": 0.6825, + "step": 3572 + }, + { + "epoch": 0.31947424892703863, + "grad_norm": 0.12286666887338475, + "learning_rate": 0.00015918544568772354, + "loss": 0.6745, + "step": 3573 + }, + { + "epoch": 0.3195636623748212, + "grad_norm": 0.12379412858488495, + "learning_rate": 0.00015916210005918164, + "loss": 0.6554, + "step": 3574 + }, + { + "epoch": 0.3196530758226037, + "grad_norm": 0.1215784631928972, + "learning_rate": 0.00015913874946878588, + "loss": 0.6383, + "step": 3575 + }, + { + "epoch": 0.3197424892703863, + "grad_norm": 0.1250063996282882, + "learning_rate": 0.00015911539391849462, + "loss": 0.6904, + "step": 3576 + }, + { + "epoch": 0.3198319027181688, + "grad_norm": 0.13360345477291247, + "learning_rate": 0.00015909203341026666, + "loss": 0.6889, + "step": 3577 + }, + { + "epoch": 0.3199213161659514, + "grad_norm": 0.11815477294591573, + "learning_rate": 0.00015906866794606126, + "loss": 0.7041, + "step": 3578 + }, + { + "epoch": 0.3200107296137339, + "grad_norm": 0.11737045455183365, + "learning_rate": 0.00015904529752783794, + "loss": 0.6649, + "step": 3579 + }, + { + "epoch": 0.32010014306151646, + "grad_norm": 0.12158875318302063, + "learning_rate": 0.00015902192215755688, + "loss": 0.6686, + "step": 3580 + }, + { + "epoch": 0.320189556509299, + "grad_norm": 0.13063819259273002, + "learning_rate": 0.00015899854183717852, + "loss": 0.6286, + "step": 3581 + }, + { + "epoch": 0.32027896995708155, + "grad_norm": 0.1345184233743633, + "learning_rate": 0.00015897515656866363, + "loss": 0.6517, + "step": 3582 + }, + { + "epoch": 0.32036838340486407, + "grad_norm": 0.14988779867587795, + "learning_rate": 0.00015895176635397364, + "loss": 0.7235, + "step": 3583 + }, + { + "epoch": 0.32045779685264664, + "grad_norm": 0.11718471293190433, + "learning_rate": 0.00015892837119507014, + "loss": 0.6541, + "step": 3584 + }, + { + "epoch": 0.3205472103004292, + "grad_norm": 0.13829684332425235, + "learning_rate": 0.0001589049710939154, + "loss": 0.6831, + "step": 3585 + }, + { + "epoch": 0.3206366237482117, + "grad_norm": 0.12942731966518511, + "learning_rate": 0.0001588815660524718, + "loss": 0.6847, + "step": 3586 + }, + { + "epoch": 0.3207260371959943, + "grad_norm": 0.1332892109930575, + "learning_rate": 0.0001588581560727024, + "loss": 0.7229, + "step": 3587 + }, + { + "epoch": 0.3208154506437768, + "grad_norm": 0.13495749439344154, + "learning_rate": 0.00015883474115657056, + "loss": 0.5848, + "step": 3588 + }, + { + "epoch": 0.3209048640915594, + "grad_norm": 0.12031858680410126, + "learning_rate": 0.00015881132130603998, + "loss": 0.6854, + "step": 3589 + }, + { + "epoch": 0.3209942775393419, + "grad_norm": 0.1362494429165818, + "learning_rate": 0.00015878789652307496, + "loss": 0.697, + "step": 3590 + }, + { + "epoch": 0.32108369098712447, + "grad_norm": 0.11813753003627739, + "learning_rate": 0.00015876446680964, + "loss": 0.654, + "step": 3591 + }, + { + "epoch": 0.321173104434907, + "grad_norm": 0.11849154709601747, + "learning_rate": 0.00015874103216770023, + "loss": 0.6612, + "step": 3592 + }, + { + "epoch": 0.32126251788268956, + "grad_norm": 0.12567131054375, + "learning_rate": 0.00015871759259922097, + "loss": 0.6941, + "step": 3593 + }, + { + "epoch": 0.32135193133047213, + "grad_norm": 0.11967387066267704, + "learning_rate": 0.0001586941481061682, + "loss": 0.6815, + "step": 3594 + }, + { + "epoch": 0.32144134477825465, + "grad_norm": 0.12842047295121642, + "learning_rate": 0.0001586706986905081, + "loss": 0.6655, + "step": 3595 + }, + { + "epoch": 0.3215307582260372, + "grad_norm": 0.11770441173051092, + "learning_rate": 0.00015864724435420732, + "loss": 0.715, + "step": 3596 + }, + { + "epoch": 0.32162017167381973, + "grad_norm": 0.1341385108330511, + "learning_rate": 0.000158623785099233, + "loss": 0.6671, + "step": 3597 + }, + { + "epoch": 0.3217095851216023, + "grad_norm": 0.12814428375159748, + "learning_rate": 0.0001586003209275526, + "loss": 0.6859, + "step": 3598 + }, + { + "epoch": 0.3217989985693848, + "grad_norm": 0.13760324934896792, + "learning_rate": 0.00015857685184113412, + "loss": 0.6956, + "step": 3599 + }, + { + "epoch": 0.3218884120171674, + "grad_norm": 0.12909707287035233, + "learning_rate": 0.00015855337784194577, + "loss": 0.6945, + "step": 3600 + }, + { + "epoch": 0.3219778254649499, + "grad_norm": 0.1316572795756724, + "learning_rate": 0.00015852989893195635, + "loss": 0.7209, + "step": 3601 + }, + { + "epoch": 0.3220672389127325, + "grad_norm": 0.13420218289480848, + "learning_rate": 0.00015850641511313496, + "loss": 0.6973, + "step": 3602 + }, + { + "epoch": 0.322156652360515, + "grad_norm": 0.13170997264419798, + "learning_rate": 0.00015848292638745125, + "loss": 0.699, + "step": 3603 + }, + { + "epoch": 0.32224606580829757, + "grad_norm": 0.1254420583054684, + "learning_rate": 0.0001584594327568751, + "loss": 0.6693, + "step": 3604 + }, + { + "epoch": 0.32233547925608014, + "grad_norm": 0.15505729240032493, + "learning_rate": 0.00015843593422337695, + "loss": 0.7238, + "step": 3605 + }, + { + "epoch": 0.32242489270386265, + "grad_norm": 0.11213649228985481, + "learning_rate": 0.00015841243078892756, + "loss": 0.6571, + "step": 3606 + }, + { + "epoch": 0.3225143061516452, + "grad_norm": 0.1310301373852298, + "learning_rate": 0.0001583889224554981, + "loss": 0.6755, + "step": 3607 + }, + { + "epoch": 0.32260371959942774, + "grad_norm": 0.1415415231656143, + "learning_rate": 0.0001583654092250603, + "loss": 0.6911, + "step": 3608 + }, + { + "epoch": 0.3226931330472103, + "grad_norm": 0.13983124078344886, + "learning_rate": 0.00015834189109958607, + "loss": 0.6706, + "step": 3609 + }, + { + "epoch": 0.32278254649499283, + "grad_norm": 0.1306352695611395, + "learning_rate": 0.00015831836808104788, + "loss": 0.6947, + "step": 3610 + }, + { + "epoch": 0.3228719599427754, + "grad_norm": 0.11670260389078324, + "learning_rate": 0.0001582948401714186, + "loss": 0.6845, + "step": 3611 + }, + { + "epoch": 0.3229613733905579, + "grad_norm": 0.12288190203680249, + "learning_rate": 0.0001582713073726715, + "loss": 0.6759, + "step": 3612 + }, + { + "epoch": 0.3230507868383405, + "grad_norm": 0.13318121558805787, + "learning_rate": 0.00015824776968678024, + "loss": 0.6489, + "step": 3613 + }, + { + "epoch": 0.32314020028612306, + "grad_norm": 0.13615811422977894, + "learning_rate": 0.00015822422711571883, + "loss": 0.6899, + "step": 3614 + }, + { + "epoch": 0.3232296137339056, + "grad_norm": 0.11698618878981214, + "learning_rate": 0.00015820067966146185, + "loss": 0.6534, + "step": 3615 + }, + { + "epoch": 0.32331902718168815, + "grad_norm": 0.10873329716844542, + "learning_rate": 0.00015817712732598413, + "loss": 0.6527, + "step": 3616 + }, + { + "epoch": 0.32340844062947066, + "grad_norm": 0.11690221004407447, + "learning_rate": 0.00015815357011126103, + "loss": 0.6864, + "step": 3617 + }, + { + "epoch": 0.32349785407725323, + "grad_norm": 0.1220438075302209, + "learning_rate": 0.0001581300080192682, + "loss": 0.6778, + "step": 3618 + }, + { + "epoch": 0.32358726752503575, + "grad_norm": 0.1215214459557879, + "learning_rate": 0.00015810644105198184, + "loss": 0.6721, + "step": 3619 + }, + { + "epoch": 0.3236766809728183, + "grad_norm": 0.12583013100010362, + "learning_rate": 0.0001580828692113784, + "loss": 0.6705, + "step": 3620 + }, + { + "epoch": 0.32376609442060084, + "grad_norm": 0.1316784374125209, + "learning_rate": 0.0001580592924994349, + "loss": 0.6743, + "step": 3621 + }, + { + "epoch": 0.3238555078683834, + "grad_norm": 0.12375093294900319, + "learning_rate": 0.00015803571091812865, + "loss": 0.6751, + "step": 3622 + }, + { + "epoch": 0.323944921316166, + "grad_norm": 0.12816712010910128, + "learning_rate": 0.0001580121244694374, + "loss": 0.7092, + "step": 3623 + }, + { + "epoch": 0.3240343347639485, + "grad_norm": 0.12856841415624154, + "learning_rate": 0.00015798853315533931, + "loss": 0.693, + "step": 3624 + }, + { + "epoch": 0.32412374821173107, + "grad_norm": 0.12585201106572552, + "learning_rate": 0.00015796493697781304, + "loss": 0.6313, + "step": 3625 + }, + { + "epoch": 0.3242131616595136, + "grad_norm": 0.12056018413924063, + "learning_rate": 0.0001579413359388375, + "loss": 0.6503, + "step": 3626 + }, + { + "epoch": 0.32430257510729615, + "grad_norm": 0.13631240160743077, + "learning_rate": 0.00015791773004039206, + "loss": 0.668, + "step": 3627 + }, + { + "epoch": 0.32439198855507867, + "grad_norm": 0.13008498780546646, + "learning_rate": 0.00015789411928445653, + "loss": 0.6937, + "step": 3628 + }, + { + "epoch": 0.32448140200286124, + "grad_norm": 0.12654458621954706, + "learning_rate": 0.00015787050367301118, + "loss": 0.6621, + "step": 3629 + }, + { + "epoch": 0.32457081545064376, + "grad_norm": 0.12425201575194023, + "learning_rate": 0.00015784688320803655, + "loss": 0.6867, + "step": 3630 + }, + { + "epoch": 0.32466022889842633, + "grad_norm": 0.12129321029968362, + "learning_rate": 0.00015782325789151367, + "loss": 0.7088, + "step": 3631 + }, + { + "epoch": 0.32474964234620884, + "grad_norm": 0.12065377032275663, + "learning_rate": 0.00015779962772542402, + "loss": 0.6684, + "step": 3632 + }, + { + "epoch": 0.3248390557939914, + "grad_norm": 0.1303008165254246, + "learning_rate": 0.0001577759927117494, + "loss": 0.6746, + "step": 3633 + }, + { + "epoch": 0.324928469241774, + "grad_norm": 0.11886816348620084, + "learning_rate": 0.00015775235285247203, + "loss": 0.6527, + "step": 3634 + }, + { + "epoch": 0.3250178826895565, + "grad_norm": 0.1193560063131333, + "learning_rate": 0.00015772870814957453, + "loss": 0.651, + "step": 3635 + }, + { + "epoch": 0.3251072961373391, + "grad_norm": 0.12376918738604101, + "learning_rate": 0.00015770505860504005, + "loss": 0.6778, + "step": 3636 + }, + { + "epoch": 0.3251967095851216, + "grad_norm": 0.12347195020504466, + "learning_rate": 0.000157681404220852, + "loss": 0.6485, + "step": 3637 + }, + { + "epoch": 0.32528612303290416, + "grad_norm": 0.12526891079327965, + "learning_rate": 0.00015765774499899423, + "loss": 0.6635, + "step": 3638 + }, + { + "epoch": 0.3253755364806867, + "grad_norm": 0.12652393004646248, + "learning_rate": 0.00015763408094145103, + "loss": 0.7269, + "step": 3639 + }, + { + "epoch": 0.32546494992846925, + "grad_norm": 0.13514550709838097, + "learning_rate": 0.00015761041205020703, + "loss": 0.7042, + "step": 3640 + }, + { + "epoch": 0.32555436337625177, + "grad_norm": 0.1240094950609246, + "learning_rate": 0.00015758673832724738, + "loss": 0.6528, + "step": 3641 + }, + { + "epoch": 0.32564377682403434, + "grad_norm": 0.11647594057410697, + "learning_rate": 0.00015756305977455753, + "loss": 0.6785, + "step": 3642 + }, + { + "epoch": 0.3257331902718169, + "grad_norm": 0.12091024523834044, + "learning_rate": 0.00015753937639412336, + "loss": 0.6545, + "step": 3643 + }, + { + "epoch": 0.3258226037195994, + "grad_norm": 0.1442811242910447, + "learning_rate": 0.00015751568818793117, + "loss": 0.7038, + "step": 3644 + }, + { + "epoch": 0.325912017167382, + "grad_norm": 0.12678294378786134, + "learning_rate": 0.0001574919951579677, + "loss": 0.7104, + "step": 3645 + }, + { + "epoch": 0.3260014306151645, + "grad_norm": 0.12076333488876967, + "learning_rate": 0.00015746829730622, + "loss": 0.7038, + "step": 3646 + }, + { + "epoch": 0.3260908440629471, + "grad_norm": 0.13355431593020584, + "learning_rate": 0.00015744459463467564, + "loss": 0.7107, + "step": 3647 + }, + { + "epoch": 0.3261802575107296, + "grad_norm": 0.1304367713624583, + "learning_rate": 0.00015742088714532247, + "loss": 0.6943, + "step": 3648 + }, + { + "epoch": 0.32626967095851217, + "grad_norm": 0.12494527396597933, + "learning_rate": 0.00015739717484014888, + "loss": 0.6907, + "step": 3649 + }, + { + "epoch": 0.3263590844062947, + "grad_norm": 0.12902410442143408, + "learning_rate": 0.00015737345772114355, + "loss": 0.6873, + "step": 3650 + }, + { + "epoch": 0.32644849785407726, + "grad_norm": 0.11798516534465639, + "learning_rate": 0.0001573497357902956, + "loss": 0.6757, + "step": 3651 + }, + { + "epoch": 0.3265379113018598, + "grad_norm": 0.14011067216102874, + "learning_rate": 0.00015732600904959455, + "loss": 0.6471, + "step": 3652 + }, + { + "epoch": 0.32662732474964234, + "grad_norm": 0.12276544061255645, + "learning_rate": 0.00015730227750103038, + "loss": 0.678, + "step": 3653 + }, + { + "epoch": 0.3267167381974249, + "grad_norm": 0.14576548592819383, + "learning_rate": 0.0001572785411465934, + "loss": 0.7381, + "step": 3654 + }, + { + "epoch": 0.32680615164520743, + "grad_norm": 0.1273718517003604, + "learning_rate": 0.00015725479998827434, + "loss": 0.6758, + "step": 3655 + }, + { + "epoch": 0.32689556509299, + "grad_norm": 0.11760233441041211, + "learning_rate": 0.00015723105402806436, + "loss": 0.7075, + "step": 3656 + }, + { + "epoch": 0.3269849785407725, + "grad_norm": 0.1350963587821481, + "learning_rate": 0.000157207303267955, + "loss": 0.6704, + "step": 3657 + }, + { + "epoch": 0.3270743919885551, + "grad_norm": 0.12416661228202623, + "learning_rate": 0.00015718354770993817, + "loss": 0.7401, + "step": 3658 + }, + { + "epoch": 0.3271638054363376, + "grad_norm": 0.13233188172826674, + "learning_rate": 0.00015715978735600627, + "loss": 0.6987, + "step": 3659 + }, + { + "epoch": 0.3272532188841202, + "grad_norm": 0.11107448626159613, + "learning_rate": 0.00015713602220815203, + "loss": 0.6566, + "step": 3660 + }, + { + "epoch": 0.3273426323319027, + "grad_norm": 0.13185453667049873, + "learning_rate": 0.00015711225226836865, + "loss": 0.661, + "step": 3661 + }, + { + "epoch": 0.32743204577968527, + "grad_norm": 0.1384741262845939, + "learning_rate": 0.00015708847753864963, + "loss": 0.6663, + "step": 3662 + }, + { + "epoch": 0.32752145922746784, + "grad_norm": 0.11703980649685271, + "learning_rate": 0.0001570646980209889, + "loss": 0.6662, + "step": 3663 + }, + { + "epoch": 0.32761087267525035, + "grad_norm": 0.12578455243954464, + "learning_rate": 0.0001570409137173809, + "loss": 0.6919, + "step": 3664 + }, + { + "epoch": 0.3277002861230329, + "grad_norm": 0.11218173902918593, + "learning_rate": 0.00015701712462982037, + "loss": 0.6446, + "step": 3665 + }, + { + "epoch": 0.32778969957081544, + "grad_norm": 0.1251179185543268, + "learning_rate": 0.0001569933307603024, + "loss": 0.6753, + "step": 3666 + }, + { + "epoch": 0.327879113018598, + "grad_norm": 0.13176981693407572, + "learning_rate": 0.00015696953211082268, + "loss": 0.6757, + "step": 3667 + }, + { + "epoch": 0.3279685264663805, + "grad_norm": 0.12396474981731288, + "learning_rate": 0.00015694572868337706, + "loss": 0.6595, + "step": 3668 + }, + { + "epoch": 0.3280579399141631, + "grad_norm": 0.12861906562858375, + "learning_rate": 0.00015692192047996194, + "loss": 0.6552, + "step": 3669 + }, + { + "epoch": 0.3281473533619456, + "grad_norm": 0.11854584047720772, + "learning_rate": 0.00015689810750257413, + "loss": 0.6704, + "step": 3670 + }, + { + "epoch": 0.3282367668097282, + "grad_norm": 0.14234653916205103, + "learning_rate": 0.00015687428975321078, + "loss": 0.6968, + "step": 3671 + }, + { + "epoch": 0.3283261802575107, + "grad_norm": 0.129675021608209, + "learning_rate": 0.00015685046723386937, + "loss": 0.6887, + "step": 3672 + }, + { + "epoch": 0.3284155937052933, + "grad_norm": 0.1420561886064433, + "learning_rate": 0.00015682663994654795, + "loss": 0.6912, + "step": 3673 + }, + { + "epoch": 0.32850500715307585, + "grad_norm": 0.10573946693624853, + "learning_rate": 0.0001568028078932449, + "loss": 0.6254, + "step": 3674 + }, + { + "epoch": 0.32859442060085836, + "grad_norm": 0.1456570500658689, + "learning_rate": 0.00015677897107595892, + "loss": 0.6791, + "step": 3675 + }, + { + "epoch": 0.32868383404864093, + "grad_norm": 0.1301532001967832, + "learning_rate": 0.0001567551294966892, + "loss": 0.6619, + "step": 3676 + }, + { + "epoch": 0.32877324749642345, + "grad_norm": 0.1430672600420267, + "learning_rate": 0.00015673128315743534, + "loss": 0.6999, + "step": 3677 + }, + { + "epoch": 0.328862660944206, + "grad_norm": 0.12593167529768692, + "learning_rate": 0.00015670743206019723, + "loss": 0.6631, + "step": 3678 + }, + { + "epoch": 0.32895207439198854, + "grad_norm": 0.12915745243882476, + "learning_rate": 0.00015668357620697533, + "loss": 0.6734, + "step": 3679 + }, + { + "epoch": 0.3290414878397711, + "grad_norm": 0.12741340751315422, + "learning_rate": 0.00015665971559977035, + "loss": 0.6624, + "step": 3680 + }, + { + "epoch": 0.3291309012875536, + "grad_norm": 0.13047569628829872, + "learning_rate": 0.00015663585024058342, + "loss": 0.6617, + "step": 3681 + }, + { + "epoch": 0.3292203147353362, + "grad_norm": 0.13409275121238257, + "learning_rate": 0.00015661198013141613, + "loss": 0.7122, + "step": 3682 + }, + { + "epoch": 0.32930972818311877, + "grad_norm": 0.12348177278275783, + "learning_rate": 0.00015658810527427046, + "loss": 0.7075, + "step": 3683 + }, + { + "epoch": 0.3293991416309013, + "grad_norm": 0.13462212470858795, + "learning_rate": 0.00015656422567114872, + "loss": 0.6987, + "step": 3684 + }, + { + "epoch": 0.32948855507868385, + "grad_norm": 0.1191850656379932, + "learning_rate": 0.0001565403413240537, + "loss": 0.6646, + "step": 3685 + }, + { + "epoch": 0.32957796852646637, + "grad_norm": 0.12715580498087908, + "learning_rate": 0.00015651645223498854, + "loss": 0.5856, + "step": 3686 + }, + { + "epoch": 0.32966738197424894, + "grad_norm": 0.13750385222287967, + "learning_rate": 0.00015649255840595675, + "loss": 0.6938, + "step": 3687 + }, + { + "epoch": 0.32975679542203146, + "grad_norm": 0.13355694722080277, + "learning_rate": 0.00015646865983896238, + "loss": 0.5919, + "step": 3688 + }, + { + "epoch": 0.32984620886981403, + "grad_norm": 0.14654353621423816, + "learning_rate": 0.00015644475653600964, + "loss": 0.7022, + "step": 3689 + }, + { + "epoch": 0.32993562231759654, + "grad_norm": 0.13328776210442836, + "learning_rate": 0.00015642084849910336, + "loss": 0.6838, + "step": 3690 + }, + { + "epoch": 0.3300250357653791, + "grad_norm": 0.11560552633714115, + "learning_rate": 0.00015639693573024865, + "loss": 0.654, + "step": 3691 + }, + { + "epoch": 0.3301144492131617, + "grad_norm": 0.13621467537899862, + "learning_rate": 0.0001563730182314511, + "loss": 0.694, + "step": 3692 + }, + { + "epoch": 0.3302038626609442, + "grad_norm": 0.11958491090151628, + "learning_rate": 0.0001563490960047165, + "loss": 0.6687, + "step": 3693 + }, + { + "epoch": 0.3302932761087268, + "grad_norm": 0.12539638152445648, + "learning_rate": 0.00015632516905205135, + "loss": 0.6726, + "step": 3694 + }, + { + "epoch": 0.3303826895565093, + "grad_norm": 0.129178587214716, + "learning_rate": 0.00015630123737546224, + "loss": 0.6761, + "step": 3695 + }, + { + "epoch": 0.33047210300429186, + "grad_norm": 0.13461829492774874, + "learning_rate": 0.00015627730097695638, + "loss": 0.6822, + "step": 3696 + }, + { + "epoch": 0.3305615164520744, + "grad_norm": 0.12187812332208782, + "learning_rate": 0.00015625335985854126, + "loss": 0.647, + "step": 3697 + }, + { + "epoch": 0.33065092989985695, + "grad_norm": 0.12511332958503035, + "learning_rate": 0.00015622941402222479, + "loss": 0.6526, + "step": 3698 + }, + { + "epoch": 0.33074034334763946, + "grad_norm": 0.13351392732959314, + "learning_rate": 0.00015620546347001524, + "loss": 0.6724, + "step": 3699 + }, + { + "epoch": 0.33082975679542204, + "grad_norm": 0.14437423000827582, + "learning_rate": 0.00015618150820392136, + "loss": 0.663, + "step": 3700 + }, + { + "epoch": 0.33091917024320455, + "grad_norm": 0.11430736877646575, + "learning_rate": 0.00015615754822595224, + "loss": 0.6092, + "step": 3701 + }, + { + "epoch": 0.3310085836909871, + "grad_norm": 0.13614203890707155, + "learning_rate": 0.00015613358353811738, + "loss": 0.7033, + "step": 3702 + }, + { + "epoch": 0.3310979971387697, + "grad_norm": 0.1342899920580367, + "learning_rate": 0.00015610961414242664, + "loss": 0.6814, + "step": 3703 + }, + { + "epoch": 0.3311874105865522, + "grad_norm": 0.12356233741059527, + "learning_rate": 0.00015608564004089033, + "loss": 0.6656, + "step": 3704 + }, + { + "epoch": 0.3312768240343348, + "grad_norm": 0.14631917512939724, + "learning_rate": 0.00015606166123551912, + "loss": 0.6923, + "step": 3705 + }, + { + "epoch": 0.3313662374821173, + "grad_norm": 0.10226906348146185, + "learning_rate": 0.00015603767772832413, + "loss": 0.6468, + "step": 3706 + }, + { + "epoch": 0.33145565092989987, + "grad_norm": 0.11673031568138445, + "learning_rate": 0.0001560136895213167, + "loss": 0.6473, + "step": 3707 + }, + { + "epoch": 0.3315450643776824, + "grad_norm": 0.1279795854749729, + "learning_rate": 0.00015598969661650888, + "loss": 0.7224, + "step": 3708 + }, + { + "epoch": 0.33163447782546496, + "grad_norm": 0.10277969270827729, + "learning_rate": 0.00015596569901591277, + "loss": 0.6399, + "step": 3709 + }, + { + "epoch": 0.3317238912732475, + "grad_norm": 0.1073400096192666, + "learning_rate": 0.00015594169672154107, + "loss": 0.612, + "step": 3710 + }, + { + "epoch": 0.33181330472103004, + "grad_norm": 0.10833209569147004, + "learning_rate": 0.00015591768973540683, + "loss": 0.6434, + "step": 3711 + }, + { + "epoch": 0.3319027181688126, + "grad_norm": 0.10983842245285443, + "learning_rate": 0.00015589367805952348, + "loss": 0.6332, + "step": 3712 + }, + { + "epoch": 0.33199213161659513, + "grad_norm": 0.12375554884026481, + "learning_rate": 0.00015586966169590488, + "loss": 0.6756, + "step": 3713 + }, + { + "epoch": 0.3320815450643777, + "grad_norm": 0.1191690472471773, + "learning_rate": 0.0001558456406465652, + "loss": 0.6743, + "step": 3714 + }, + { + "epoch": 0.3321709585121602, + "grad_norm": 0.14872628380622402, + "learning_rate": 0.00015582161491351908, + "loss": 0.6015, + "step": 3715 + }, + { + "epoch": 0.3322603719599428, + "grad_norm": 0.12281483170137238, + "learning_rate": 0.00015579758449878157, + "loss": 0.6759, + "step": 3716 + }, + { + "epoch": 0.3323497854077253, + "grad_norm": 0.1252573313661572, + "learning_rate": 0.000155773549404368, + "loss": 0.6909, + "step": 3717 + }, + { + "epoch": 0.3324391988555079, + "grad_norm": 0.11753570548125256, + "learning_rate": 0.00015574950963229419, + "loss": 0.6714, + "step": 3718 + }, + { + "epoch": 0.3325286123032904, + "grad_norm": 0.13626089734829053, + "learning_rate": 0.00015572546518457636, + "loss": 0.7141, + "step": 3719 + }, + { + "epoch": 0.33261802575107297, + "grad_norm": 0.13685164470047387, + "learning_rate": 0.00015570141606323105, + "loss": 0.6795, + "step": 3720 + }, + { + "epoch": 0.3327074391988555, + "grad_norm": 0.138592434512266, + "learning_rate": 0.00015567736227027525, + "loss": 0.7148, + "step": 3721 + }, + { + "epoch": 0.33279685264663805, + "grad_norm": 0.10930194368713045, + "learning_rate": 0.00015565330380772633, + "loss": 0.6353, + "step": 3722 + }, + { + "epoch": 0.3328862660944206, + "grad_norm": 0.14116079567673714, + "learning_rate": 0.00015562924067760202, + "loss": 0.7019, + "step": 3723 + }, + { + "epoch": 0.33297567954220314, + "grad_norm": 0.14560321376329505, + "learning_rate": 0.00015560517288192046, + "loss": 0.6599, + "step": 3724 + }, + { + "epoch": 0.3330650929899857, + "grad_norm": 0.1254738644032804, + "learning_rate": 0.00015558110042270023, + "loss": 0.6311, + "step": 3725 + }, + { + "epoch": 0.3331545064377682, + "grad_norm": 0.11499208847151822, + "learning_rate": 0.00015555702330196023, + "loss": 0.6506, + "step": 3726 + }, + { + "epoch": 0.3332439198855508, + "grad_norm": 0.11605710692372752, + "learning_rate": 0.00015553294152171977, + "loss": 0.6621, + "step": 3727 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.12695887860863264, + "learning_rate": 0.00015550885508399856, + "loss": 0.6865, + "step": 3728 + }, + { + "epoch": 0.3334227467811159, + "grad_norm": 0.11709554545602324, + "learning_rate": 0.00015548476399081674, + "loss": 0.6442, + "step": 3729 + }, + { + "epoch": 0.3335121602288984, + "grad_norm": 0.13919864117804062, + "learning_rate": 0.0001554606682441948, + "loss": 0.6753, + "step": 3730 + }, + { + "epoch": 0.333601573676681, + "grad_norm": 0.13119309488962713, + "learning_rate": 0.00015543656784615354, + "loss": 0.6872, + "step": 3731 + }, + { + "epoch": 0.33369098712446355, + "grad_norm": 0.1495109950462978, + "learning_rate": 0.00015541246279871432, + "loss": 0.6299, + "step": 3732 + }, + { + "epoch": 0.33378040057224606, + "grad_norm": 0.1325816790210995, + "learning_rate": 0.00015538835310389875, + "loss": 0.7148, + "step": 3733 + }, + { + "epoch": 0.33386981402002863, + "grad_norm": 0.1413152489620935, + "learning_rate": 0.00015536423876372888, + "loss": 0.6157, + "step": 3734 + }, + { + "epoch": 0.33395922746781115, + "grad_norm": 0.1391012807829841, + "learning_rate": 0.00015534011978022717, + "loss": 0.6789, + "step": 3735 + }, + { + "epoch": 0.3340486409155937, + "grad_norm": 0.10569604483644256, + "learning_rate": 0.00015531599615541648, + "loss": 0.6192, + "step": 3736 + }, + { + "epoch": 0.33413805436337624, + "grad_norm": 0.1342963230400462, + "learning_rate": 0.00015529186789131996, + "loss": 0.6459, + "step": 3737 + }, + { + "epoch": 0.3342274678111588, + "grad_norm": 0.140373216929097, + "learning_rate": 0.0001552677349899613, + "loss": 0.6946, + "step": 3738 + }, + { + "epoch": 0.3343168812589413, + "grad_norm": 0.15080917793768903, + "learning_rate": 0.0001552435974533644, + "loss": 0.7192, + "step": 3739 + }, + { + "epoch": 0.3344062947067239, + "grad_norm": 0.11957220503146072, + "learning_rate": 0.00015521945528355376, + "loss": 0.6499, + "step": 3740 + }, + { + "epoch": 0.3344957081545064, + "grad_norm": 0.13546025862225403, + "learning_rate": 0.00015519530848255407, + "loss": 0.6268, + "step": 3741 + }, + { + "epoch": 0.334585121602289, + "grad_norm": 0.13619212238855036, + "learning_rate": 0.00015517115705239047, + "loss": 0.6436, + "step": 3742 + }, + { + "epoch": 0.33467453505007155, + "grad_norm": 0.12436884973418964, + "learning_rate": 0.0001551470009950886, + "loss": 0.6262, + "step": 3743 + }, + { + "epoch": 0.33476394849785407, + "grad_norm": 0.13108256891444803, + "learning_rate": 0.00015512284031267437, + "loss": 0.7051, + "step": 3744 + }, + { + "epoch": 0.33485336194563664, + "grad_norm": 0.1248335111505159, + "learning_rate": 0.00015509867500717407, + "loss": 0.6691, + "step": 3745 + }, + { + "epoch": 0.33494277539341916, + "grad_norm": 0.10702626666146411, + "learning_rate": 0.00015507450508061443, + "loss": 0.667, + "step": 3746 + }, + { + "epoch": 0.33503218884120173, + "grad_norm": 0.12222811097776855, + "learning_rate": 0.0001550503305350226, + "loss": 0.7215, + "step": 3747 + }, + { + "epoch": 0.33512160228898424, + "grad_norm": 0.13146761479940502, + "learning_rate": 0.000155026151372426, + "loss": 0.6798, + "step": 3748 + }, + { + "epoch": 0.3352110157367668, + "grad_norm": 0.14373025135897818, + "learning_rate": 0.00015500196759485254, + "loss": 0.6961, + "step": 3749 + }, + { + "epoch": 0.33530042918454933, + "grad_norm": 0.12399299732135012, + "learning_rate": 0.0001549777792043305, + "loss": 0.6997, + "step": 3750 + }, + { + "epoch": 0.3353898426323319, + "grad_norm": 0.13828130446184456, + "learning_rate": 0.0001549535862028885, + "loss": 0.6847, + "step": 3751 + }, + { + "epoch": 0.3354792560801145, + "grad_norm": 0.13422508163925442, + "learning_rate": 0.0001549293885925556, + "loss": 0.7204, + "step": 3752 + }, + { + "epoch": 0.335568669527897, + "grad_norm": 0.12198930848549673, + "learning_rate": 0.0001549051863753612, + "loss": 0.6672, + "step": 3753 + }, + { + "epoch": 0.33565808297567956, + "grad_norm": 0.11167492920953705, + "learning_rate": 0.00015488097955333515, + "loss": 0.6248, + "step": 3754 + }, + { + "epoch": 0.3357474964234621, + "grad_norm": 0.1252299019293565, + "learning_rate": 0.00015485676812850761, + "loss": 0.6516, + "step": 3755 + }, + { + "epoch": 0.33583690987124465, + "grad_norm": 0.12031489951732768, + "learning_rate": 0.0001548325521029092, + "loss": 0.6723, + "step": 3756 + }, + { + "epoch": 0.33592632331902716, + "grad_norm": 0.13269424788495374, + "learning_rate": 0.00015480833147857087, + "loss": 0.6655, + "step": 3757 + }, + { + "epoch": 0.33601573676680974, + "grad_norm": 0.13311319528004426, + "learning_rate": 0.00015478410625752393, + "loss": 0.6704, + "step": 3758 + }, + { + "epoch": 0.33610515021459225, + "grad_norm": 0.11957125012610288, + "learning_rate": 0.00015475987644180016, + "loss": 0.6532, + "step": 3759 + }, + { + "epoch": 0.3361945636623748, + "grad_norm": 0.13597184732142922, + "learning_rate": 0.00015473564203343174, + "loss": 0.6944, + "step": 3760 + }, + { + "epoch": 0.3362839771101574, + "grad_norm": 0.13520980280960898, + "learning_rate": 0.00015471140303445109, + "loss": 0.7022, + "step": 3761 + }, + { + "epoch": 0.3363733905579399, + "grad_norm": 0.14158118310711973, + "learning_rate": 0.00015468715944689113, + "loss": 0.7018, + "step": 3762 + }, + { + "epoch": 0.3364628040057225, + "grad_norm": 0.10662428600181362, + "learning_rate": 0.0001546629112727852, + "loss": 0.6294, + "step": 3763 + }, + { + "epoch": 0.336552217453505, + "grad_norm": 0.1132774033145064, + "learning_rate": 0.00015463865851416685, + "loss": 0.6777, + "step": 3764 + }, + { + "epoch": 0.33664163090128757, + "grad_norm": 0.125052904960351, + "learning_rate": 0.00015461440117307026, + "loss": 0.6806, + "step": 3765 + }, + { + "epoch": 0.3367310443490701, + "grad_norm": 0.14556912271332367, + "learning_rate": 0.00015459013925152976, + "loss": 0.7166, + "step": 3766 + }, + { + "epoch": 0.33682045779685266, + "grad_norm": 0.12685442427315102, + "learning_rate": 0.00015456587275158024, + "loss": 0.6544, + "step": 3767 + }, + { + "epoch": 0.3369098712446352, + "grad_norm": 0.13036308753193374, + "learning_rate": 0.00015454160167525685, + "loss": 0.6917, + "step": 3768 + }, + { + "epoch": 0.33699928469241774, + "grad_norm": 0.13902421699690165, + "learning_rate": 0.00015451732602459522, + "loss": 0.7091, + "step": 3769 + }, + { + "epoch": 0.33708869814020026, + "grad_norm": 0.12879139235664278, + "learning_rate": 0.00015449304580163125, + "loss": 0.6766, + "step": 3770 + }, + { + "epoch": 0.33717811158798283, + "grad_norm": 0.12875759293175992, + "learning_rate": 0.00015446876100840137, + "loss": 0.6763, + "step": 3771 + }, + { + "epoch": 0.3372675250357654, + "grad_norm": 0.13131104704858512, + "learning_rate": 0.0001544444716469423, + "loss": 0.6572, + "step": 3772 + }, + { + "epoch": 0.3373569384835479, + "grad_norm": 0.13301471114421795, + "learning_rate": 0.0001544201777192911, + "loss": 0.6689, + "step": 3773 + }, + { + "epoch": 0.3374463519313305, + "grad_norm": 0.10979224221394006, + "learning_rate": 0.00015439587922748537, + "loss": 0.6364, + "step": 3774 + }, + { + "epoch": 0.337535765379113, + "grad_norm": 0.11605493007588681, + "learning_rate": 0.00015437157617356292, + "loss": 0.6712, + "step": 3775 + }, + { + "epoch": 0.3376251788268956, + "grad_norm": 0.12157579495431349, + "learning_rate": 0.00015434726855956206, + "loss": 0.6511, + "step": 3776 + }, + { + "epoch": 0.3377145922746781, + "grad_norm": 0.11451795047393637, + "learning_rate": 0.0001543229563875214, + "loss": 0.6651, + "step": 3777 + }, + { + "epoch": 0.33780400572246067, + "grad_norm": 0.11700861605484116, + "learning_rate": 0.00015429863965947996, + "loss": 0.6467, + "step": 3778 + }, + { + "epoch": 0.3378934191702432, + "grad_norm": 0.12107940762277204, + "learning_rate": 0.00015427431837747725, + "loss": 0.6797, + "step": 3779 + }, + { + "epoch": 0.33798283261802575, + "grad_norm": 0.13928350662363562, + "learning_rate": 0.00015424999254355296, + "loss": 0.5752, + "step": 3780 + }, + { + "epoch": 0.3380722460658083, + "grad_norm": 0.13457373073136344, + "learning_rate": 0.00015422566215974733, + "loss": 0.6712, + "step": 3781 + }, + { + "epoch": 0.33816165951359084, + "grad_norm": 0.13080732274583112, + "learning_rate": 0.00015420132722810092, + "loss": 0.714, + "step": 3782 + }, + { + "epoch": 0.3382510729613734, + "grad_norm": 0.13688268468289053, + "learning_rate": 0.00015417698775065466, + "loss": 0.721, + "step": 3783 + }, + { + "epoch": 0.3383404864091559, + "grad_norm": 0.14769908094666262, + "learning_rate": 0.00015415264372944983, + "loss": 0.676, + "step": 3784 + }, + { + "epoch": 0.3384298998569385, + "grad_norm": 0.12724483581787535, + "learning_rate": 0.00015412829516652817, + "loss": 0.68, + "step": 3785 + }, + { + "epoch": 0.338519313304721, + "grad_norm": 0.11311244265718949, + "learning_rate": 0.00015410394206393177, + "loss": 0.6927, + "step": 3786 + }, + { + "epoch": 0.3386087267525036, + "grad_norm": 0.12140729288098363, + "learning_rate": 0.00015407958442370312, + "loss": 0.6598, + "step": 3787 + }, + { + "epoch": 0.3386981402002861, + "grad_norm": 0.12276411097176215, + "learning_rate": 0.000154055222247885, + "loss": 0.6719, + "step": 3788 + }, + { + "epoch": 0.3387875536480687, + "grad_norm": 0.15163664625801068, + "learning_rate": 0.00015403085553852068, + "loss": 0.6504, + "step": 3789 + }, + { + "epoch": 0.3388769670958512, + "grad_norm": 0.11243480785802643, + "learning_rate": 0.00015400648429765375, + "loss": 0.6317, + "step": 3790 + }, + { + "epoch": 0.33896638054363376, + "grad_norm": 0.1181313361685204, + "learning_rate": 0.00015398210852732825, + "loss": 0.6706, + "step": 3791 + }, + { + "epoch": 0.33905579399141633, + "grad_norm": 0.1294236103446988, + "learning_rate": 0.00015395772822958845, + "loss": 0.688, + "step": 3792 + }, + { + "epoch": 0.33914520743919885, + "grad_norm": 0.13643161341085464, + "learning_rate": 0.00015393334340647917, + "loss": 0.5632, + "step": 3793 + }, + { + "epoch": 0.3392346208869814, + "grad_norm": 0.13355688236988447, + "learning_rate": 0.00015390895406004553, + "loss": 0.684, + "step": 3794 + }, + { + "epoch": 0.33932403433476394, + "grad_norm": 0.1272047969674181, + "learning_rate": 0.00015388456019233302, + "loss": 0.6508, + "step": 3795 + }, + { + "epoch": 0.3394134477825465, + "grad_norm": 0.1269015710795402, + "learning_rate": 0.0001538601618053875, + "loss": 0.676, + "step": 3796 + }, + { + "epoch": 0.339502861230329, + "grad_norm": 0.12970970351779848, + "learning_rate": 0.00015383575890125527, + "loss": 0.6867, + "step": 3797 + }, + { + "epoch": 0.3395922746781116, + "grad_norm": 0.11222801551856067, + "learning_rate": 0.00015381135148198293, + "loss": 0.6955, + "step": 3798 + }, + { + "epoch": 0.3396816881258941, + "grad_norm": 0.1191752173508515, + "learning_rate": 0.00015378693954961754, + "loss": 0.6562, + "step": 3799 + }, + { + "epoch": 0.3397711015736767, + "grad_norm": 0.12769731715480995, + "learning_rate": 0.0001537625231062065, + "loss": 0.6609, + "step": 3800 + }, + { + "epoch": 0.33986051502145925, + "grad_norm": 0.1326410273283231, + "learning_rate": 0.00015373810215379757, + "loss": 0.7056, + "step": 3801 + }, + { + "epoch": 0.33994992846924177, + "grad_norm": 0.10674464448819193, + "learning_rate": 0.0001537136766944389, + "loss": 0.6531, + "step": 3802 + }, + { + "epoch": 0.34003934191702434, + "grad_norm": 0.12251302621452173, + "learning_rate": 0.00015368924673017905, + "loss": 0.6361, + "step": 3803 + }, + { + "epoch": 0.34012875536480686, + "grad_norm": 0.12003644507424674, + "learning_rate": 0.00015366481226306692, + "loss": 0.6583, + "step": 3804 + }, + { + "epoch": 0.3402181688125894, + "grad_norm": 0.12356261098626244, + "learning_rate": 0.00015364037329515182, + "loss": 0.6932, + "step": 3805 + }, + { + "epoch": 0.34030758226037194, + "grad_norm": 0.13500224331464156, + "learning_rate": 0.00015361592982848335, + "loss": 0.6975, + "step": 3806 + }, + { + "epoch": 0.3403969957081545, + "grad_norm": 0.13734424943766843, + "learning_rate": 0.00015359148186511163, + "loss": 0.7048, + "step": 3807 + }, + { + "epoch": 0.34048640915593703, + "grad_norm": 0.13055401251471263, + "learning_rate": 0.000153567029407087, + "loss": 0.694, + "step": 3808 + }, + { + "epoch": 0.3405758226037196, + "grad_norm": 0.13370251315737772, + "learning_rate": 0.00015354257245646036, + "loss": 0.5811, + "step": 3809 + }, + { + "epoch": 0.3406652360515021, + "grad_norm": 0.12551897108696222, + "learning_rate": 0.0001535181110152828, + "loss": 0.6761, + "step": 3810 + }, + { + "epoch": 0.3407546494992847, + "grad_norm": 0.11907956887069227, + "learning_rate": 0.00015349364508560588, + "loss": 0.6781, + "step": 3811 + }, + { + "epoch": 0.34084406294706726, + "grad_norm": 0.1196520067665802, + "learning_rate": 0.00015346917466948161, + "loss": 0.6601, + "step": 3812 + }, + { + "epoch": 0.3409334763948498, + "grad_norm": 0.12627043359700685, + "learning_rate": 0.0001534446997689622, + "loss": 0.6819, + "step": 3813 + }, + { + "epoch": 0.34102288984263235, + "grad_norm": 0.1210900246945103, + "learning_rate": 0.00015342022038610038, + "loss": 0.6808, + "step": 3814 + }, + { + "epoch": 0.34111230329041486, + "grad_norm": 0.1465693075423293, + "learning_rate": 0.00015339573652294917, + "loss": 0.723, + "step": 3815 + }, + { + "epoch": 0.34120171673819744, + "grad_norm": 0.12124524871185846, + "learning_rate": 0.00015337124818156205, + "loss": 0.6518, + "step": 3816 + }, + { + "epoch": 0.34129113018597995, + "grad_norm": 0.12068569957727836, + "learning_rate": 0.00015334675536399277, + "loss": 0.6852, + "step": 3817 + }, + { + "epoch": 0.3413805436337625, + "grad_norm": 0.12787303357097785, + "learning_rate": 0.00015332225807229556, + "loss": 0.7382, + "step": 3818 + }, + { + "epoch": 0.34146995708154504, + "grad_norm": 0.12268522343311872, + "learning_rate": 0.00015329775630852497, + "loss": 0.6914, + "step": 3819 + }, + { + "epoch": 0.3415593705293276, + "grad_norm": 0.13130729250385495, + "learning_rate": 0.00015327325007473592, + "loss": 0.6933, + "step": 3820 + }, + { + "epoch": 0.3416487839771102, + "grad_norm": 0.11865996575579867, + "learning_rate": 0.00015324873937298374, + "loss": 0.6497, + "step": 3821 + }, + { + "epoch": 0.3417381974248927, + "grad_norm": 0.12551432440664712, + "learning_rate": 0.00015322422420532407, + "loss": 0.6935, + "step": 3822 + }, + { + "epoch": 0.34182761087267527, + "grad_norm": 0.12543983790381685, + "learning_rate": 0.000153199704573813, + "loss": 0.6819, + "step": 3823 + }, + { + "epoch": 0.3419170243204578, + "grad_norm": 0.11860809444130453, + "learning_rate": 0.00015317518048050697, + "loss": 0.6822, + "step": 3824 + }, + { + "epoch": 0.34200643776824036, + "grad_norm": 0.12449646233582068, + "learning_rate": 0.00015315065192746276, + "loss": 0.662, + "step": 3825 + }, + { + "epoch": 0.3420958512160229, + "grad_norm": 0.11323235563711723, + "learning_rate": 0.00015312611891673752, + "loss": 0.6333, + "step": 3826 + }, + { + "epoch": 0.34218526466380544, + "grad_norm": 0.13310218841391572, + "learning_rate": 0.00015310158145038892, + "loss": 0.6514, + "step": 3827 + }, + { + "epoch": 0.34227467811158796, + "grad_norm": 0.1174365644851276, + "learning_rate": 0.0001530770395304748, + "loss": 0.654, + "step": 3828 + }, + { + "epoch": 0.34236409155937053, + "grad_norm": 0.1442415575573912, + "learning_rate": 0.00015305249315905348, + "loss": 0.6873, + "step": 3829 + }, + { + "epoch": 0.3424535050071531, + "grad_norm": 0.13197448885952073, + "learning_rate": 0.0001530279423381836, + "loss": 0.6028, + "step": 3830 + }, + { + "epoch": 0.3425429184549356, + "grad_norm": 0.12607524090092487, + "learning_rate": 0.00015300338706992426, + "loss": 0.6877, + "step": 3831 + }, + { + "epoch": 0.3426323319027182, + "grad_norm": 0.13158969039492657, + "learning_rate": 0.00015297882735633485, + "loss": 0.6811, + "step": 3832 + }, + { + "epoch": 0.3427217453505007, + "grad_norm": 0.13675667689178098, + "learning_rate": 0.00015295426319947514, + "loss": 0.6701, + "step": 3833 + }, + { + "epoch": 0.3428111587982833, + "grad_norm": 0.11401458903046316, + "learning_rate": 0.0001529296946014054, + "loss": 0.6414, + "step": 3834 + }, + { + "epoch": 0.3429005722460658, + "grad_norm": 0.10799203079158202, + "learning_rate": 0.00015290512156418602, + "loss": 0.6427, + "step": 3835 + }, + { + "epoch": 0.34298998569384836, + "grad_norm": 0.12604319014457763, + "learning_rate": 0.000152880544089878, + "loss": 0.6698, + "step": 3836 + }, + { + "epoch": 0.3430793991416309, + "grad_norm": 0.13280577562343057, + "learning_rate": 0.00015285596218054265, + "loss": 0.6833, + "step": 3837 + }, + { + "epoch": 0.34316881258941345, + "grad_norm": 0.12537656905720174, + "learning_rate": 0.00015283137583824158, + "loss": 0.6824, + "step": 3838 + }, + { + "epoch": 0.34325822603719597, + "grad_norm": 0.12409514287649692, + "learning_rate": 0.0001528067850650368, + "loss": 0.654, + "step": 3839 + }, + { + "epoch": 0.34334763948497854, + "grad_norm": 0.12399309436727995, + "learning_rate": 0.00015278218986299074, + "loss": 0.6433, + "step": 3840 + }, + { + "epoch": 0.3434370529327611, + "grad_norm": 0.1236694475769074, + "learning_rate": 0.00015275759023416618, + "loss": 0.6875, + "step": 3841 + }, + { + "epoch": 0.3435264663805436, + "grad_norm": 0.13020546784249315, + "learning_rate": 0.00015273298618062624, + "loss": 0.6885, + "step": 3842 + }, + { + "epoch": 0.3436158798283262, + "grad_norm": 0.13155860009424172, + "learning_rate": 0.00015270837770443437, + "loss": 0.6591, + "step": 3843 + }, + { + "epoch": 0.3437052932761087, + "grad_norm": 0.12541797572519825, + "learning_rate": 0.0001526837648076546, + "loss": 0.6908, + "step": 3844 + }, + { + "epoch": 0.3437947067238913, + "grad_norm": 0.1306095321248591, + "learning_rate": 0.00015265914749235107, + "loss": 0.664, + "step": 3845 + }, + { + "epoch": 0.3438841201716738, + "grad_norm": 0.13621964182664165, + "learning_rate": 0.00015263452576058843, + "loss": 0.6591, + "step": 3846 + }, + { + "epoch": 0.3439735336194564, + "grad_norm": 0.12152908102904858, + "learning_rate": 0.0001526098996144317, + "loss": 0.6306, + "step": 3847 + }, + { + "epoch": 0.3440629470672389, + "grad_norm": 0.12378591916742387, + "learning_rate": 0.0001525852690559462, + "loss": 0.6271, + "step": 3848 + }, + { + "epoch": 0.34415236051502146, + "grad_norm": 0.13708758118277667, + "learning_rate": 0.00015256063408719772, + "loss": 0.6709, + "step": 3849 + }, + { + "epoch": 0.34424177396280403, + "grad_norm": 0.15147110070376132, + "learning_rate": 0.0001525359947102523, + "loss": 0.6724, + "step": 3850 + }, + { + "epoch": 0.34433118741058655, + "grad_norm": 0.12346377097845922, + "learning_rate": 0.00015251135092717648, + "loss": 0.6579, + "step": 3851 + }, + { + "epoch": 0.3444206008583691, + "grad_norm": 0.1309192533189219, + "learning_rate": 0.00015248670274003708, + "loss": 0.7014, + "step": 3852 + }, + { + "epoch": 0.34451001430615164, + "grad_norm": 0.11903111756054861, + "learning_rate": 0.00015246205015090127, + "loss": 0.6703, + "step": 3853 + }, + { + "epoch": 0.3445994277539342, + "grad_norm": 0.12076066613642719, + "learning_rate": 0.0001524373931618367, + "loss": 0.6765, + "step": 3854 + }, + { + "epoch": 0.3446888412017167, + "grad_norm": 0.12529905742487202, + "learning_rate": 0.0001524127317749113, + "loss": 0.6803, + "step": 3855 + }, + { + "epoch": 0.3447782546494993, + "grad_norm": 0.12606021080138505, + "learning_rate": 0.00015238806599219336, + "loss": 0.6759, + "step": 3856 + }, + { + "epoch": 0.3448676680972818, + "grad_norm": 0.13724277061254786, + "learning_rate": 0.0001523633958157516, + "loss": 0.723, + "step": 3857 + }, + { + "epoch": 0.3449570815450644, + "grad_norm": 0.11282424208765766, + "learning_rate": 0.00015233872124765512, + "loss": 0.6746, + "step": 3858 + }, + { + "epoch": 0.3450464949928469, + "grad_norm": 0.12158951601307061, + "learning_rate": 0.00015231404228997325, + "loss": 0.7211, + "step": 3859 + }, + { + "epoch": 0.34513590844062947, + "grad_norm": 0.12940821193810198, + "learning_rate": 0.00015228935894477582, + "loss": 0.6436, + "step": 3860 + }, + { + "epoch": 0.34522532188841204, + "grad_norm": 0.1266031801877225, + "learning_rate": 0.00015226467121413304, + "loss": 0.6667, + "step": 3861 + }, + { + "epoch": 0.34531473533619456, + "grad_norm": 0.12755737891736751, + "learning_rate": 0.0001522399791001154, + "loss": 0.6968, + "step": 3862 + }, + { + "epoch": 0.3454041487839771, + "grad_norm": 0.11786467846151204, + "learning_rate": 0.00015221528260479377, + "loss": 0.6018, + "step": 3863 + }, + { + "epoch": 0.34549356223175964, + "grad_norm": 0.12016300596028717, + "learning_rate": 0.0001521905817302395, + "loss": 0.6788, + "step": 3864 + }, + { + "epoch": 0.3455829756795422, + "grad_norm": 0.1328743231213266, + "learning_rate": 0.00015216587647852415, + "loss": 0.6624, + "step": 3865 + }, + { + "epoch": 0.34567238912732473, + "grad_norm": 0.13146387594071124, + "learning_rate": 0.0001521411668517197, + "loss": 0.6993, + "step": 3866 + }, + { + "epoch": 0.3457618025751073, + "grad_norm": 0.10972337164694881, + "learning_rate": 0.00015211645285189858, + "loss": 0.6867, + "step": 3867 + }, + { + "epoch": 0.3458512160228898, + "grad_norm": 0.14224490568299517, + "learning_rate": 0.00015209173448113355, + "loss": 0.7152, + "step": 3868 + }, + { + "epoch": 0.3459406294706724, + "grad_norm": 0.13823671429575007, + "learning_rate": 0.0001520670117414976, + "loss": 0.6696, + "step": 3869 + }, + { + "epoch": 0.34603004291845496, + "grad_norm": 0.12632046320831408, + "learning_rate": 0.00015204228463506424, + "loss": 0.6703, + "step": 3870 + }, + { + "epoch": 0.3461194563662375, + "grad_norm": 0.13248605010257195, + "learning_rate": 0.00015201755316390737, + "loss": 0.7232, + "step": 3871 + }, + { + "epoch": 0.34620886981402005, + "grad_norm": 0.13528834409465426, + "learning_rate": 0.00015199281733010116, + "loss": 0.7004, + "step": 3872 + }, + { + "epoch": 0.34629828326180256, + "grad_norm": 0.13236291348242624, + "learning_rate": 0.0001519680771357201, + "loss": 0.6139, + "step": 3873 + }, + { + "epoch": 0.34638769670958514, + "grad_norm": 0.1276057607337956, + "learning_rate": 0.00015194333258283918, + "loss": 0.7216, + "step": 3874 + }, + { + "epoch": 0.34647711015736765, + "grad_norm": 0.1259060729860609, + "learning_rate": 0.00015191858367353368, + "loss": 0.6583, + "step": 3875 + }, + { + "epoch": 0.3465665236051502, + "grad_norm": 0.11433050265466861, + "learning_rate": 0.0001518938304098793, + "loss": 0.6404, + "step": 3876 + }, + { + "epoch": 0.34665593705293274, + "grad_norm": 0.13741839279664528, + "learning_rate": 0.00015186907279395202, + "loss": 0.6867, + "step": 3877 + }, + { + "epoch": 0.3467453505007153, + "grad_norm": 0.12991842704192264, + "learning_rate": 0.00015184431082782823, + "loss": 0.6885, + "step": 3878 + }, + { + "epoch": 0.3468347639484979, + "grad_norm": 0.12291119415895398, + "learning_rate": 0.00015181954451358473, + "loss": 0.7001, + "step": 3879 + }, + { + "epoch": 0.3469241773962804, + "grad_norm": 0.11768673820162685, + "learning_rate": 0.0001517947738532986, + "loss": 0.6465, + "step": 3880 + }, + { + "epoch": 0.34701359084406297, + "grad_norm": 0.14523804579729865, + "learning_rate": 0.00015176999884904734, + "loss": 0.6773, + "step": 3881 + }, + { + "epoch": 0.3471030042918455, + "grad_norm": 0.12593562153583002, + "learning_rate": 0.0001517452195029088, + "loss": 0.6788, + "step": 3882 + }, + { + "epoch": 0.34719241773962806, + "grad_norm": 0.13771308560045192, + "learning_rate": 0.00015172043581696118, + "loss": 0.6688, + "step": 3883 + }, + { + "epoch": 0.3472818311874106, + "grad_norm": 0.12134965089401555, + "learning_rate": 0.0001516956477932831, + "loss": 0.6475, + "step": 3884 + }, + { + "epoch": 0.34737124463519314, + "grad_norm": 0.12876653169019187, + "learning_rate": 0.00015167085543395348, + "loss": 0.6501, + "step": 3885 + }, + { + "epoch": 0.34746065808297566, + "grad_norm": 0.11669588287273361, + "learning_rate": 0.00015164605874105156, + "loss": 0.647, + "step": 3886 + }, + { + "epoch": 0.34755007153075823, + "grad_norm": 0.11580516363177697, + "learning_rate": 0.0001516212577166571, + "loss": 0.6642, + "step": 3887 + }, + { + "epoch": 0.34763948497854075, + "grad_norm": 0.11639650836532399, + "learning_rate": 0.0001515964523628501, + "loss": 0.6345, + "step": 3888 + }, + { + "epoch": 0.3477288984263233, + "grad_norm": 0.1267145296100165, + "learning_rate": 0.00015157164268171097, + "loss": 0.6664, + "step": 3889 + }, + { + "epoch": 0.3478183118741059, + "grad_norm": 0.14014913696166703, + "learning_rate": 0.0001515468286753204, + "loss": 0.7382, + "step": 3890 + }, + { + "epoch": 0.3479077253218884, + "grad_norm": 0.11736085532437321, + "learning_rate": 0.0001515220103457596, + "loss": 0.6662, + "step": 3891 + }, + { + "epoch": 0.347997138769671, + "grad_norm": 0.1146316248082257, + "learning_rate": 0.00015149718769511003, + "loss": 0.6665, + "step": 3892 + }, + { + "epoch": 0.3480865522174535, + "grad_norm": 0.11946893584047057, + "learning_rate": 0.00015147236072545348, + "loss": 0.654, + "step": 3893 + }, + { + "epoch": 0.34817596566523606, + "grad_norm": 0.11568224413875948, + "learning_rate": 0.00015144752943887222, + "loss": 0.6565, + "step": 3894 + }, + { + "epoch": 0.3482653791130186, + "grad_norm": 0.11946536171697053, + "learning_rate": 0.0001514226938374488, + "loss": 0.6547, + "step": 3895 + }, + { + "epoch": 0.34835479256080115, + "grad_norm": 0.10955644986582835, + "learning_rate": 0.00015139785392326616, + "loss": 0.684, + "step": 3896 + }, + { + "epoch": 0.34844420600858367, + "grad_norm": 0.13260268495384708, + "learning_rate": 0.00015137300969840758, + "loss": 0.689, + "step": 3897 + }, + { + "epoch": 0.34853361945636624, + "grad_norm": 0.1299212237259798, + "learning_rate": 0.0001513481611649567, + "loss": 0.671, + "step": 3898 + }, + { + "epoch": 0.3486230329041488, + "grad_norm": 0.1368450239411616, + "learning_rate": 0.00015132330832499756, + "loss": 0.6922, + "step": 3899 + }, + { + "epoch": 0.3487124463519313, + "grad_norm": 0.12588558587146514, + "learning_rate": 0.00015129845118061453, + "loss": 0.6907, + "step": 3900 + }, + { + "epoch": 0.3488018597997139, + "grad_norm": 0.13240264596629628, + "learning_rate": 0.00015127358973389236, + "loss": 0.6898, + "step": 3901 + }, + { + "epoch": 0.3488912732474964, + "grad_norm": 0.13388484469612083, + "learning_rate": 0.00015124872398691617, + "loss": 0.6806, + "step": 3902 + }, + { + "epoch": 0.348980686695279, + "grad_norm": 0.14809826231549547, + "learning_rate": 0.00015122385394177135, + "loss": 0.7452, + "step": 3903 + }, + { + "epoch": 0.3490701001430615, + "grad_norm": 0.1230324704392458, + "learning_rate": 0.0001511989796005438, + "loss": 0.674, + "step": 3904 + }, + { + "epoch": 0.3491595135908441, + "grad_norm": 0.12335373401758337, + "learning_rate": 0.00015117410096531964, + "loss": 0.6647, + "step": 3905 + }, + { + "epoch": 0.3492489270386266, + "grad_norm": 0.11368696161751456, + "learning_rate": 0.00015114921803818546, + "loss": 0.614, + "step": 3906 + }, + { + "epoch": 0.34933834048640916, + "grad_norm": 0.1494542644982469, + "learning_rate": 0.0001511243308212281, + "loss": 0.6732, + "step": 3907 + }, + { + "epoch": 0.3494277539341917, + "grad_norm": 0.11373923877039262, + "learning_rate": 0.00015109943931653486, + "loss": 0.6341, + "step": 3908 + }, + { + "epoch": 0.34951716738197425, + "grad_norm": 0.12521391916377428, + "learning_rate": 0.00015107454352619336, + "loss": 0.6896, + "step": 3909 + }, + { + "epoch": 0.3496065808297568, + "grad_norm": 0.12895650082749352, + "learning_rate": 0.00015104964345229158, + "loss": 0.6673, + "step": 3910 + }, + { + "epoch": 0.34969599427753933, + "grad_norm": 0.12689208273471464, + "learning_rate": 0.00015102473909691785, + "loss": 0.6586, + "step": 3911 + }, + { + "epoch": 0.3497854077253219, + "grad_norm": 0.12362966482603815, + "learning_rate": 0.0001509998304621609, + "loss": 0.6591, + "step": 3912 + }, + { + "epoch": 0.3498748211731044, + "grad_norm": 0.12436832770252408, + "learning_rate": 0.00015097491755010974, + "loss": 0.6493, + "step": 3913 + }, + { + "epoch": 0.349964234620887, + "grad_norm": 0.14847240503392128, + "learning_rate": 0.0001509500003628538, + "loss": 0.6805, + "step": 3914 + }, + { + "epoch": 0.3500536480686695, + "grad_norm": 0.13357728348823997, + "learning_rate": 0.00015092507890248288, + "loss": 0.6449, + "step": 3915 + }, + { + "epoch": 0.3501430615164521, + "grad_norm": 0.1419147979558426, + "learning_rate": 0.0001509001531710871, + "loss": 0.6823, + "step": 3916 + }, + { + "epoch": 0.3502324749642346, + "grad_norm": 0.12993625784081214, + "learning_rate": 0.00015087522317075693, + "loss": 0.6728, + "step": 3917 + }, + { + "epoch": 0.35032188841201717, + "grad_norm": 0.1279292898060999, + "learning_rate": 0.00015085028890358325, + "loss": 0.6463, + "step": 3918 + }, + { + "epoch": 0.35041130185979974, + "grad_norm": 0.12368679307397935, + "learning_rate": 0.00015082535037165724, + "loss": 0.6251, + "step": 3919 + }, + { + "epoch": 0.35050071530758226, + "grad_norm": 0.1539866308412901, + "learning_rate": 0.00015080040757707046, + "loss": 0.6801, + "step": 3920 + }, + { + "epoch": 0.3505901287553648, + "grad_norm": 0.1155255989417973, + "learning_rate": 0.0001507754605219149, + "loss": 0.6609, + "step": 3921 + }, + { + "epoch": 0.35067954220314734, + "grad_norm": 0.10638181377360578, + "learning_rate": 0.00015075050920828272, + "loss": 0.6514, + "step": 3922 + }, + { + "epoch": 0.3507689556509299, + "grad_norm": 0.14606299573412543, + "learning_rate": 0.00015072555363826665, + "loss": 0.7206, + "step": 3923 + }, + { + "epoch": 0.35085836909871243, + "grad_norm": 0.12009357323046205, + "learning_rate": 0.00015070059381395968, + "loss": 0.6626, + "step": 3924 + }, + { + "epoch": 0.350947782546495, + "grad_norm": 0.113442119298725, + "learning_rate": 0.0001506756297374551, + "loss": 0.6694, + "step": 3925 + }, + { + "epoch": 0.3510371959942775, + "grad_norm": 0.1184790979412401, + "learning_rate": 0.00015065066141084667, + "loss": 0.6609, + "step": 3926 + }, + { + "epoch": 0.3511266094420601, + "grad_norm": 0.12795921674190505, + "learning_rate": 0.00015062568883622844, + "loss": 0.6787, + "step": 3927 + }, + { + "epoch": 0.3512160228898426, + "grad_norm": 0.14523846836719922, + "learning_rate": 0.00015060071201569486, + "loss": 0.6993, + "step": 3928 + }, + { + "epoch": 0.3513054363376252, + "grad_norm": 0.1390442217411172, + "learning_rate": 0.00015057573095134062, + "loss": 0.7283, + "step": 3929 + }, + { + "epoch": 0.35139484978540775, + "grad_norm": 0.13950342527689452, + "learning_rate": 0.00015055074564526095, + "loss": 0.6965, + "step": 3930 + }, + { + "epoch": 0.35148426323319026, + "grad_norm": 0.1261524773803082, + "learning_rate": 0.00015052575609955125, + "loss": 0.7066, + "step": 3931 + }, + { + "epoch": 0.35157367668097284, + "grad_norm": 0.15647934381724243, + "learning_rate": 0.00015050076231630744, + "loss": 0.7333, + "step": 3932 + }, + { + "epoch": 0.35166309012875535, + "grad_norm": 0.12236554351148439, + "learning_rate": 0.00015047576429762566, + "loss": 0.6123, + "step": 3933 + }, + { + "epoch": 0.3517525035765379, + "grad_norm": 0.11867168640563125, + "learning_rate": 0.0001504507620456025, + "loss": 0.6464, + "step": 3934 + }, + { + "epoch": 0.35184191702432044, + "grad_norm": 0.1216081635336587, + "learning_rate": 0.00015042575556233488, + "loss": 0.6596, + "step": 3935 + }, + { + "epoch": 0.351931330472103, + "grad_norm": 0.1297208239524209, + "learning_rate": 0.00015040074484992, + "loss": 0.6972, + "step": 3936 + }, + { + "epoch": 0.3520207439198855, + "grad_norm": 0.12795294025789342, + "learning_rate": 0.00015037572991045552, + "loss": 0.6974, + "step": 3937 + }, + { + "epoch": 0.3521101573676681, + "grad_norm": 0.13622946514562176, + "learning_rate": 0.00015035071074603944, + "loss": 0.6715, + "step": 3938 + }, + { + "epoch": 0.35219957081545067, + "grad_norm": 0.15166871339021298, + "learning_rate": 0.00015032568735877003, + "loss": 0.7417, + "step": 3939 + }, + { + "epoch": 0.3522889842632332, + "grad_norm": 0.1137048802583555, + "learning_rate": 0.000150300659750746, + "loss": 0.6675, + "step": 3940 + }, + { + "epoch": 0.35237839771101576, + "grad_norm": 0.12604977885155497, + "learning_rate": 0.00015027562792406643, + "loss": 0.6409, + "step": 3941 + }, + { + "epoch": 0.35246781115879827, + "grad_norm": 0.11578274641602955, + "learning_rate": 0.0001502505918808306, + "loss": 0.6373, + "step": 3942 + }, + { + "epoch": 0.35255722460658084, + "grad_norm": 0.12206246722089217, + "learning_rate": 0.00015022555162313834, + "loss": 0.6789, + "step": 3943 + }, + { + "epoch": 0.35264663805436336, + "grad_norm": 0.127888905866457, + "learning_rate": 0.00015020050715308972, + "loss": 0.6558, + "step": 3944 + }, + { + "epoch": 0.35273605150214593, + "grad_norm": 0.13074893323617456, + "learning_rate": 0.0001501754584727852, + "loss": 0.6563, + "step": 3945 + }, + { + "epoch": 0.35282546494992845, + "grad_norm": 0.1356785813655046, + "learning_rate": 0.0001501504055843256, + "loss": 0.7251, + "step": 3946 + }, + { + "epoch": 0.352914878397711, + "grad_norm": 0.12785602826035464, + "learning_rate": 0.00015012534848981202, + "loss": 0.7092, + "step": 3947 + }, + { + "epoch": 0.3530042918454936, + "grad_norm": 0.1349896992051987, + "learning_rate": 0.000150100287191346, + "loss": 0.7376, + "step": 3948 + }, + { + "epoch": 0.3530937052932761, + "grad_norm": 0.12664578714823163, + "learning_rate": 0.00015007522169102941, + "loss": 0.6256, + "step": 3949 + }, + { + "epoch": 0.3531831187410587, + "grad_norm": 0.1332795541771807, + "learning_rate": 0.00015005015199096443, + "loss": 0.6786, + "step": 3950 + }, + { + "epoch": 0.3532725321888412, + "grad_norm": 0.12079130219533342, + "learning_rate": 0.00015002507809325365, + "loss": 0.6461, + "step": 3951 + }, + { + "epoch": 0.35336194563662376, + "grad_norm": 0.11368320151569691, + "learning_rate": 0.00015000000000000001, + "loss": 0.6475, + "step": 3952 + }, + { + "epoch": 0.3534513590844063, + "grad_norm": 0.13630969362258905, + "learning_rate": 0.00014997491771330675, + "loss": 0.6983, + "step": 3953 + }, + { + "epoch": 0.35354077253218885, + "grad_norm": 0.142312992557757, + "learning_rate": 0.0001499498312352775, + "loss": 0.7147, + "step": 3954 + }, + { + "epoch": 0.35363018597997137, + "grad_norm": 0.1355585165341386, + "learning_rate": 0.0001499247405680162, + "loss": 0.6876, + "step": 3955 + }, + { + "epoch": 0.35371959942775394, + "grad_norm": 0.13526088652283738, + "learning_rate": 0.00014989964571362723, + "loss": 0.7223, + "step": 3956 + }, + { + "epoch": 0.35380901287553645, + "grad_norm": 0.10757516649214754, + "learning_rate": 0.0001498745466742152, + "loss": 0.6403, + "step": 3957 + }, + { + "epoch": 0.353898426323319, + "grad_norm": 0.12383241596531859, + "learning_rate": 0.0001498494434518852, + "loss": 0.6553, + "step": 3958 + }, + { + "epoch": 0.3539878397711016, + "grad_norm": 0.12334334731801085, + "learning_rate": 0.0001498243360487426, + "loss": 0.6808, + "step": 3959 + }, + { + "epoch": 0.3540772532188841, + "grad_norm": 0.12457398563536182, + "learning_rate": 0.00014979922446689306, + "loss": 0.6953, + "step": 3960 + }, + { + "epoch": 0.3541666666666667, + "grad_norm": 0.1178617095997362, + "learning_rate": 0.00014977410870844273, + "loss": 0.6738, + "step": 3961 + }, + { + "epoch": 0.3542560801144492, + "grad_norm": 0.1141886459000753, + "learning_rate": 0.00014974898877549806, + "loss": 0.6, + "step": 3962 + }, + { + "epoch": 0.3543454935622318, + "grad_norm": 0.13352843614152138, + "learning_rate": 0.0001497238646701657, + "loss": 0.6249, + "step": 3963 + }, + { + "epoch": 0.3544349070100143, + "grad_norm": 0.12670434212597492, + "learning_rate": 0.00014969873639455292, + "loss": 0.6801, + "step": 3964 + }, + { + "epoch": 0.35452432045779686, + "grad_norm": 0.13547956612448073, + "learning_rate": 0.00014967360395076713, + "loss": 0.7073, + "step": 3965 + }, + { + "epoch": 0.3546137339055794, + "grad_norm": 0.12401712804269377, + "learning_rate": 0.00014964846734091616, + "loss": 0.6528, + "step": 3966 + }, + { + "epoch": 0.35470314735336195, + "grad_norm": 0.12549105608400743, + "learning_rate": 0.00014962332656710817, + "loss": 0.6416, + "step": 3967 + }, + { + "epoch": 0.3547925608011445, + "grad_norm": 0.11919075008647546, + "learning_rate": 0.00014959818163145174, + "loss": 0.6244, + "step": 3968 + }, + { + "epoch": 0.35488197424892703, + "grad_norm": 0.11668132636957239, + "learning_rate": 0.00014957303253605573, + "loss": 0.6402, + "step": 3969 + }, + { + "epoch": 0.3549713876967096, + "grad_norm": 0.12460259674734771, + "learning_rate": 0.00014954787928302935, + "loss": 0.6693, + "step": 3970 + }, + { + "epoch": 0.3550608011444921, + "grad_norm": 0.1340359063327917, + "learning_rate": 0.00014952272187448214, + "loss": 0.6841, + "step": 3971 + }, + { + "epoch": 0.3551502145922747, + "grad_norm": 0.13544926738951918, + "learning_rate": 0.00014949756031252406, + "loss": 0.7116, + "step": 3972 + }, + { + "epoch": 0.3552396280400572, + "grad_norm": 0.1297917193422058, + "learning_rate": 0.0001494723945992654, + "loss": 0.7047, + "step": 3973 + }, + { + "epoch": 0.3553290414878398, + "grad_norm": 0.13814095691660536, + "learning_rate": 0.00014944722473681673, + "loss": 0.6807, + "step": 3974 + }, + { + "epoch": 0.3554184549356223, + "grad_norm": 0.12895119649687137, + "learning_rate": 0.00014942205072728903, + "loss": 0.6974, + "step": 3975 + }, + { + "epoch": 0.35550786838340487, + "grad_norm": 0.1367836253103561, + "learning_rate": 0.00014939687257279363, + "loss": 0.7213, + "step": 3976 + }, + { + "epoch": 0.3555972818311874, + "grad_norm": 0.12617539966668467, + "learning_rate": 0.0001493716902754422, + "loss": 0.6585, + "step": 3977 + }, + { + "epoch": 0.35568669527896996, + "grad_norm": 0.11832616135536711, + "learning_rate": 0.00014934650383734672, + "loss": 0.6363, + "step": 3978 + }, + { + "epoch": 0.3557761087267525, + "grad_norm": 0.12045386713434578, + "learning_rate": 0.00014932131326061957, + "loss": 0.6647, + "step": 3979 + }, + { + "epoch": 0.35586552217453504, + "grad_norm": 0.12488666193680609, + "learning_rate": 0.00014929611854737343, + "loss": 0.6732, + "step": 3980 + }, + { + "epoch": 0.3559549356223176, + "grad_norm": 0.12487831860368381, + "learning_rate": 0.00014927091969972134, + "loss": 0.6446, + "step": 3981 + }, + { + "epoch": 0.35604434907010013, + "grad_norm": 0.12470256591769538, + "learning_rate": 0.00014924571671977676, + "loss": 0.6697, + "step": 3982 + }, + { + "epoch": 0.3561337625178827, + "grad_norm": 0.12432374393967323, + "learning_rate": 0.0001492205096096534, + "loss": 0.6535, + "step": 3983 + }, + { + "epoch": 0.3562231759656652, + "grad_norm": 0.11289888549520771, + "learning_rate": 0.00014919529837146528, + "loss": 0.6831, + "step": 3984 + }, + { + "epoch": 0.3563125894134478, + "grad_norm": 0.14201781954050316, + "learning_rate": 0.00014917008300732696, + "loss": 0.6832, + "step": 3985 + }, + { + "epoch": 0.3564020028612303, + "grad_norm": 0.13150182372599606, + "learning_rate": 0.00014914486351935312, + "loss": 0.6469, + "step": 3986 + }, + { + "epoch": 0.3564914163090129, + "grad_norm": 0.1083169176333587, + "learning_rate": 0.00014911963990965897, + "loss": 0.6349, + "step": 3987 + }, + { + "epoch": 0.35658082975679545, + "grad_norm": 0.12971959132513497, + "learning_rate": 0.00014909441218035992, + "loss": 0.6714, + "step": 3988 + }, + { + "epoch": 0.35667024320457796, + "grad_norm": 0.14424731481367428, + "learning_rate": 0.0001490691803335718, + "loss": 0.6703, + "step": 3989 + }, + { + "epoch": 0.35675965665236054, + "grad_norm": 0.11606492881802381, + "learning_rate": 0.0001490439443714108, + "loss": 0.674, + "step": 3990 + }, + { + "epoch": 0.35684907010014305, + "grad_norm": 0.10983186833970106, + "learning_rate": 0.0001490187042959934, + "loss": 0.6398, + "step": 3991 + }, + { + "epoch": 0.3569384835479256, + "grad_norm": 0.14404414146626715, + "learning_rate": 0.0001489934601094365, + "loss": 0.746, + "step": 3992 + }, + { + "epoch": 0.35702789699570814, + "grad_norm": 0.12825086442892572, + "learning_rate": 0.00014896821181385725, + "loss": 0.6553, + "step": 3993 + }, + { + "epoch": 0.3571173104434907, + "grad_norm": 0.14607921884787842, + "learning_rate": 0.0001489429594113732, + "loss": 0.7082, + "step": 3994 + }, + { + "epoch": 0.3572067238912732, + "grad_norm": 0.13043228044197896, + "learning_rate": 0.00014891770290410228, + "loss": 0.636, + "step": 3995 + }, + { + "epoch": 0.3572961373390558, + "grad_norm": 0.13886746132509173, + "learning_rate": 0.0001488924422941627, + "loss": 0.7017, + "step": 3996 + }, + { + "epoch": 0.3573855507868383, + "grad_norm": 0.13789837675970623, + "learning_rate": 0.000148867177583673, + "loss": 0.6767, + "step": 3997 + }, + { + "epoch": 0.3574749642346209, + "grad_norm": 0.12370223319089022, + "learning_rate": 0.00014884190877475213, + "loss": 0.7043, + "step": 3998 + }, + { + "epoch": 0.35756437768240346, + "grad_norm": 0.1279537701868889, + "learning_rate": 0.00014881663586951938, + "loss": 0.7067, + "step": 3999 + }, + { + "epoch": 0.35765379113018597, + "grad_norm": 0.12849395558008728, + "learning_rate": 0.00014879135887009435, + "loss": 0.6581, + "step": 4000 + }, + { + "epoch": 0.35774320457796854, + "grad_norm": 0.13501796999360846, + "learning_rate": 0.00014876607777859695, + "loss": 0.6706, + "step": 4001 + }, + { + "epoch": 0.35783261802575106, + "grad_norm": 0.12314563585962615, + "learning_rate": 0.0001487407925971475, + "loss": 0.6969, + "step": 4002 + }, + { + "epoch": 0.35792203147353363, + "grad_norm": 0.11385780131452768, + "learning_rate": 0.00014871550332786666, + "loss": 0.6223, + "step": 4003 + }, + { + "epoch": 0.35801144492131615, + "grad_norm": 0.13454185021619178, + "learning_rate": 0.00014869020997287536, + "loss": 0.6608, + "step": 4004 + }, + { + "epoch": 0.3581008583690987, + "grad_norm": 0.11680503899220669, + "learning_rate": 0.00014866491253429497, + "loss": 0.6609, + "step": 4005 + }, + { + "epoch": 0.35819027181688123, + "grad_norm": 0.12007219536443396, + "learning_rate": 0.00014863961101424712, + "loss": 0.6652, + "step": 4006 + }, + { + "epoch": 0.3582796852646638, + "grad_norm": 0.1238158924538288, + "learning_rate": 0.00014861430541485387, + "loss": 0.688, + "step": 4007 + }, + { + "epoch": 0.3583690987124464, + "grad_norm": 0.1274950235109175, + "learning_rate": 0.00014858899573823753, + "loss": 0.6724, + "step": 4008 + }, + { + "epoch": 0.3584585121602289, + "grad_norm": 0.11909963733911792, + "learning_rate": 0.00014856368198652077, + "loss": 0.6765, + "step": 4009 + }, + { + "epoch": 0.35854792560801146, + "grad_norm": 0.12463469082872992, + "learning_rate": 0.00014853836416182668, + "loss": 0.6435, + "step": 4010 + }, + { + "epoch": 0.358637339055794, + "grad_norm": 0.13092602292218636, + "learning_rate": 0.0001485130422662786, + "loss": 0.6506, + "step": 4011 + }, + { + "epoch": 0.35872675250357655, + "grad_norm": 0.13464163154893197, + "learning_rate": 0.00014848771630200023, + "loss": 0.6554, + "step": 4012 + }, + { + "epoch": 0.35881616595135907, + "grad_norm": 0.14829849032033912, + "learning_rate": 0.00014846238627111568, + "loss": 0.6735, + "step": 4013 + }, + { + "epoch": 0.35890557939914164, + "grad_norm": 0.14210436518818229, + "learning_rate": 0.00014843705217574933, + "loss": 0.7095, + "step": 4014 + }, + { + "epoch": 0.35899499284692415, + "grad_norm": 0.11696783235804893, + "learning_rate": 0.00014841171401802587, + "loss": 0.6505, + "step": 4015 + }, + { + "epoch": 0.3590844062947067, + "grad_norm": 0.12734422397851844, + "learning_rate": 0.00014838637180007047, + "loss": 0.6715, + "step": 4016 + }, + { + "epoch": 0.3591738197424893, + "grad_norm": 0.13482855205047098, + "learning_rate": 0.00014836102552400848, + "loss": 0.7091, + "step": 4017 + }, + { + "epoch": 0.3592632331902718, + "grad_norm": 0.1249151559488851, + "learning_rate": 0.00014833567519196566, + "loss": 0.693, + "step": 4018 + }, + { + "epoch": 0.3593526466380544, + "grad_norm": 0.12924905213221036, + "learning_rate": 0.00014831032080606817, + "loss": 0.6524, + "step": 4019 + }, + { + "epoch": 0.3594420600858369, + "grad_norm": 0.12467904153112855, + "learning_rate": 0.00014828496236844242, + "loss": 0.6722, + "step": 4020 + }, + { + "epoch": 0.3595314735336195, + "grad_norm": 0.13832029225076528, + "learning_rate": 0.00014825959988121515, + "loss": 0.6933, + "step": 4021 + }, + { + "epoch": 0.359620886981402, + "grad_norm": 0.1295545419828804, + "learning_rate": 0.00014823423334651357, + "loss": 0.6711, + "step": 4022 + }, + { + "epoch": 0.35971030042918456, + "grad_norm": 0.11591462915977295, + "learning_rate": 0.00014820886276646506, + "loss": 0.6732, + "step": 4023 + }, + { + "epoch": 0.3597997138769671, + "grad_norm": 0.13941370693098, + "learning_rate": 0.00014818348814319747, + "loss": 0.6624, + "step": 4024 + }, + { + "epoch": 0.35988912732474965, + "grad_norm": 0.1422814846767228, + "learning_rate": 0.0001481581094788389, + "loss": 0.7154, + "step": 4025 + }, + { + "epoch": 0.35997854077253216, + "grad_norm": 0.1375974492608014, + "learning_rate": 0.00014813272677551787, + "loss": 0.6534, + "step": 4026 + }, + { + "epoch": 0.36006795422031473, + "grad_norm": 0.13933808642830592, + "learning_rate": 0.00014810734003536317, + "loss": 0.7003, + "step": 4027 + }, + { + "epoch": 0.3601573676680973, + "grad_norm": 0.1536158813568295, + "learning_rate": 0.00014808194926050394, + "loss": 0.6752, + "step": 4028 + }, + { + "epoch": 0.3602467811158798, + "grad_norm": 0.11578360711567041, + "learning_rate": 0.0001480565544530697, + "loss": 0.6251, + "step": 4029 + }, + { + "epoch": 0.3603361945636624, + "grad_norm": 0.12889133805591146, + "learning_rate": 0.0001480311556151903, + "loss": 0.5866, + "step": 4030 + }, + { + "epoch": 0.3604256080114449, + "grad_norm": 0.13977854652469077, + "learning_rate": 0.00014800575274899588, + "loss": 0.6666, + "step": 4031 + }, + { + "epoch": 0.3605150214592275, + "grad_norm": 0.12962455234626918, + "learning_rate": 0.00014798034585661695, + "loss": 0.6496, + "step": 4032 + }, + { + "epoch": 0.36060443490701, + "grad_norm": 0.1379426465956055, + "learning_rate": 0.00014795493494018433, + "loss": 0.6977, + "step": 4033 + }, + { + "epoch": 0.36069384835479257, + "grad_norm": 0.14057064998877655, + "learning_rate": 0.00014792952000182926, + "loss": 0.6954, + "step": 4034 + }, + { + "epoch": 0.3607832618025751, + "grad_norm": 0.11713022112354356, + "learning_rate": 0.00014790410104368324, + "loss": 0.6856, + "step": 4035 + }, + { + "epoch": 0.36087267525035766, + "grad_norm": 0.1363717670975214, + "learning_rate": 0.00014787867806787807, + "loss": 0.719, + "step": 4036 + }, + { + "epoch": 0.3609620886981402, + "grad_norm": 0.12194719829045576, + "learning_rate": 0.00014785325107654606, + "loss": 0.6824, + "step": 4037 + }, + { + "epoch": 0.36105150214592274, + "grad_norm": 0.12482328608525517, + "learning_rate": 0.00014782782007181962, + "loss": 0.6625, + "step": 4038 + }, + { + "epoch": 0.3611409155937053, + "grad_norm": 0.12733517838398556, + "learning_rate": 0.0001478023850558317, + "loss": 0.6896, + "step": 4039 + }, + { + "epoch": 0.36123032904148783, + "grad_norm": 0.11843968768943941, + "learning_rate": 0.00014777694603071548, + "loss": 0.7074, + "step": 4040 + }, + { + "epoch": 0.3613197424892704, + "grad_norm": 0.12068421486740216, + "learning_rate": 0.0001477515029986045, + "loss": 0.6874, + "step": 4041 + }, + { + "epoch": 0.3614091559370529, + "grad_norm": 0.11760391520056201, + "learning_rate": 0.00014772605596163261, + "loss": 0.6488, + "step": 4042 + }, + { + "epoch": 0.3614985693848355, + "grad_norm": 0.12961441369230375, + "learning_rate": 0.00014770060492193406, + "loss": 0.6439, + "step": 4043 + }, + { + "epoch": 0.361587982832618, + "grad_norm": 0.14488541612251452, + "learning_rate": 0.00014767514988164336, + "loss": 0.6713, + "step": 4044 + }, + { + "epoch": 0.3616773962804006, + "grad_norm": 0.12068740502524888, + "learning_rate": 0.00014764969084289544, + "loss": 0.6653, + "step": 4045 + }, + { + "epoch": 0.3617668097281831, + "grad_norm": 0.1256167630921076, + "learning_rate": 0.00014762422780782548, + "loss": 0.6588, + "step": 4046 + }, + { + "epoch": 0.36185622317596566, + "grad_norm": 0.12093733017838243, + "learning_rate": 0.00014759876077856905, + "loss": 0.6493, + "step": 4047 + }, + { + "epoch": 0.36194563662374823, + "grad_norm": 0.1301745072766589, + "learning_rate": 0.00014757328975726207, + "loss": 0.686, + "step": 4048 + }, + { + "epoch": 0.36203505007153075, + "grad_norm": 0.13864492778315027, + "learning_rate": 0.0001475478147460407, + "loss": 0.6873, + "step": 4049 + }, + { + "epoch": 0.3621244635193133, + "grad_norm": 0.13481029104068792, + "learning_rate": 0.00014752233574704153, + "loss": 0.6864, + "step": 4050 + }, + { + "epoch": 0.36221387696709584, + "grad_norm": 0.13347914158778307, + "learning_rate": 0.0001474968527624015, + "loss": 0.6811, + "step": 4051 + }, + { + "epoch": 0.3623032904148784, + "grad_norm": 0.14715814299627122, + "learning_rate": 0.00014747136579425772, + "loss": 0.7152, + "step": 4052 + }, + { + "epoch": 0.3623927038626609, + "grad_norm": 0.11670821854081957, + "learning_rate": 0.00014744587484474784, + "loss": 0.6626, + "step": 4053 + }, + { + "epoch": 0.3624821173104435, + "grad_norm": 0.1284649164519899, + "learning_rate": 0.00014742037991600975, + "loss": 0.6667, + "step": 4054 + }, + { + "epoch": 0.362571530758226, + "grad_norm": 0.11551483462401384, + "learning_rate": 0.00014739488101018168, + "loss": 0.6468, + "step": 4055 + }, + { + "epoch": 0.3626609442060086, + "grad_norm": 0.14027114230340224, + "learning_rate": 0.00014736937812940217, + "loss": 0.7061, + "step": 4056 + }, + { + "epoch": 0.36275035765379116, + "grad_norm": 0.1350903031380968, + "learning_rate": 0.0001473438712758101, + "loss": 0.692, + "step": 4057 + }, + { + "epoch": 0.36283977110157367, + "grad_norm": 0.1165925047395569, + "learning_rate": 0.00014731836045154477, + "loss": 0.689, + "step": 4058 + }, + { + "epoch": 0.36292918454935624, + "grad_norm": 0.11955079161717422, + "learning_rate": 0.00014729284565874562, + "loss": 0.6781, + "step": 4059 + }, + { + "epoch": 0.36301859799713876, + "grad_norm": 0.1306040432178441, + "learning_rate": 0.0001472673268995527, + "loss": 0.703, + "step": 4060 + }, + { + "epoch": 0.36310801144492133, + "grad_norm": 0.11417248947698261, + "learning_rate": 0.0001472418041761061, + "loss": 0.6317, + "step": 4061 + }, + { + "epoch": 0.36319742489270385, + "grad_norm": 0.1352683990411234, + "learning_rate": 0.00014721627749054647, + "loss": 0.6586, + "step": 4062 + }, + { + "epoch": 0.3632868383404864, + "grad_norm": 0.1236547725876493, + "learning_rate": 0.00014719074684501468, + "loss": 0.7168, + "step": 4063 + }, + { + "epoch": 0.36337625178826893, + "grad_norm": 0.13184450676499743, + "learning_rate": 0.00014716521224165192, + "loss": 0.7029, + "step": 4064 + }, + { + "epoch": 0.3634656652360515, + "grad_norm": 0.14269051122166077, + "learning_rate": 0.0001471396736825998, + "loss": 0.6985, + "step": 4065 + }, + { + "epoch": 0.363555078683834, + "grad_norm": 0.13819774853720027, + "learning_rate": 0.00014711413117000013, + "loss": 0.6711, + "step": 4066 + }, + { + "epoch": 0.3636444921316166, + "grad_norm": 0.13092432574366128, + "learning_rate": 0.0001470885847059952, + "loss": 0.6717, + "step": 4067 + }, + { + "epoch": 0.36373390557939916, + "grad_norm": 0.12896958297874045, + "learning_rate": 0.00014706303429272755, + "loss": 0.68, + "step": 4068 + }, + { + "epoch": 0.3638233190271817, + "grad_norm": 0.11454028760274831, + "learning_rate": 0.00014703747993234003, + "loss": 0.6656, + "step": 4069 + }, + { + "epoch": 0.36391273247496425, + "grad_norm": 0.11035541062680722, + "learning_rate": 0.00014701192162697591, + "loss": 0.6223, + "step": 4070 + }, + { + "epoch": 0.36400214592274677, + "grad_norm": 0.129323931975019, + "learning_rate": 0.00014698635937877868, + "loss": 0.6922, + "step": 4071 + }, + { + "epoch": 0.36409155937052934, + "grad_norm": 0.1275399921519264, + "learning_rate": 0.0001469607931898922, + "loss": 0.6566, + "step": 4072 + }, + { + "epoch": 0.36418097281831185, + "grad_norm": 0.122227214642854, + "learning_rate": 0.00014693522306246076, + "loss": 0.6897, + "step": 4073 + }, + { + "epoch": 0.3642703862660944, + "grad_norm": 0.1323400321761279, + "learning_rate": 0.00014690964899862882, + "loss": 0.6986, + "step": 4074 + }, + { + "epoch": 0.36435979971387694, + "grad_norm": 0.13683237068483095, + "learning_rate": 0.0001468840710005413, + "loss": 0.6718, + "step": 4075 + }, + { + "epoch": 0.3644492131616595, + "grad_norm": 0.12760954651477482, + "learning_rate": 0.00014685848907034331, + "loss": 0.6671, + "step": 4076 + }, + { + "epoch": 0.3645386266094421, + "grad_norm": 0.11706247545053138, + "learning_rate": 0.00014683290321018048, + "loss": 0.6756, + "step": 4077 + }, + { + "epoch": 0.3646280400572246, + "grad_norm": 0.11862207474854723, + "learning_rate": 0.0001468073134221986, + "loss": 0.6684, + "step": 4078 + }, + { + "epoch": 0.36471745350500717, + "grad_norm": 0.1347292518859864, + "learning_rate": 0.0001467817197085439, + "loss": 0.6932, + "step": 4079 + }, + { + "epoch": 0.3648068669527897, + "grad_norm": 0.12767269837593562, + "learning_rate": 0.0001467561220713628, + "loss": 0.6809, + "step": 4080 + }, + { + "epoch": 0.36489628040057226, + "grad_norm": 0.1090151845458327, + "learning_rate": 0.00014673052051280227, + "loss": 0.6537, + "step": 4081 + }, + { + "epoch": 0.3649856938483548, + "grad_norm": 0.1156376761582797, + "learning_rate": 0.0001467049150350094, + "loss": 0.6757, + "step": 4082 + }, + { + "epoch": 0.36507510729613735, + "grad_norm": 0.12246693952998111, + "learning_rate": 0.00014667930564013173, + "loss": 0.645, + "step": 4083 + }, + { + "epoch": 0.36516452074391986, + "grad_norm": 0.11973810894601683, + "learning_rate": 0.00014665369233031705, + "loss": 0.6501, + "step": 4084 + }, + { + "epoch": 0.36525393419170243, + "grad_norm": 0.13339637714745173, + "learning_rate": 0.00014662807510771355, + "loss": 0.6713, + "step": 4085 + }, + { + "epoch": 0.365343347639485, + "grad_norm": 0.14227947515975473, + "learning_rate": 0.0001466024539744697, + "loss": 0.7, + "step": 4086 + }, + { + "epoch": 0.3654327610872675, + "grad_norm": 0.12641734175467018, + "learning_rate": 0.0001465768289327343, + "loss": 0.6708, + "step": 4087 + }, + { + "epoch": 0.3655221745350501, + "grad_norm": 0.13117448193732958, + "learning_rate": 0.00014655119998465652, + "loss": 0.6548, + "step": 4088 + }, + { + "epoch": 0.3656115879828326, + "grad_norm": 0.1296060429717204, + "learning_rate": 0.00014652556713238578, + "loss": 0.6833, + "step": 4089 + }, + { + "epoch": 0.3657010014306152, + "grad_norm": 0.1398500325699022, + "learning_rate": 0.000146499930378072, + "loss": 0.6905, + "step": 4090 + }, + { + "epoch": 0.3657904148783977, + "grad_norm": 0.13531419328803865, + "learning_rate": 0.00014647428972386513, + "loss": 0.6412, + "step": 4091 + }, + { + "epoch": 0.36587982832618027, + "grad_norm": 0.1312061629871141, + "learning_rate": 0.00014644864517191576, + "loss": 0.7101, + "step": 4092 + }, + { + "epoch": 0.3659692417739628, + "grad_norm": 0.14501032476152712, + "learning_rate": 0.00014642299672437461, + "loss": 0.6939, + "step": 4093 + }, + { + "epoch": 0.36605865522174535, + "grad_norm": 0.13893995236182333, + "learning_rate": 0.00014639734438339278, + "loss": 0.5845, + "step": 4094 + }, + { + "epoch": 0.36614806866952787, + "grad_norm": 0.14024405279759145, + "learning_rate": 0.0001463716881511217, + "loss": 0.7126, + "step": 4095 + }, + { + "epoch": 0.36623748211731044, + "grad_norm": 0.14057674755431737, + "learning_rate": 0.00014634602802971312, + "loss": 0.6864, + "step": 4096 + }, + { + "epoch": 0.366326895565093, + "grad_norm": 0.1175462355182313, + "learning_rate": 0.0001463203640213192, + "loss": 0.6492, + "step": 4097 + }, + { + "epoch": 0.36641630901287553, + "grad_norm": 0.11838133009455155, + "learning_rate": 0.00014629469612809223, + "loss": 0.681, + "step": 4098 + }, + { + "epoch": 0.3665057224606581, + "grad_norm": 0.11094914105625707, + "learning_rate": 0.00014626902435218504, + "loss": 0.6754, + "step": 4099 + }, + { + "epoch": 0.3665951359084406, + "grad_norm": 0.13331720700606067, + "learning_rate": 0.00014624334869575066, + "loss": 0.7172, + "step": 4100 + }, + { + "epoch": 0.3666845493562232, + "grad_norm": 0.14531171739702695, + "learning_rate": 0.00014621766916094248, + "loss": 0.6625, + "step": 4101 + }, + { + "epoch": 0.3667739628040057, + "grad_norm": 0.1242031058575496, + "learning_rate": 0.00014619198574991417, + "loss": 0.6577, + "step": 4102 + }, + { + "epoch": 0.3668633762517883, + "grad_norm": 0.11802488878993787, + "learning_rate": 0.00014616629846481982, + "loss": 0.6255, + "step": 4103 + }, + { + "epoch": 0.3669527896995708, + "grad_norm": 0.13194122569933503, + "learning_rate": 0.00014614060730781377, + "loss": 0.6572, + "step": 4104 + }, + { + "epoch": 0.36704220314735336, + "grad_norm": 0.12008906265881729, + "learning_rate": 0.0001461149122810507, + "loss": 0.6176, + "step": 4105 + }, + { + "epoch": 0.36713161659513593, + "grad_norm": 0.12783124125595918, + "learning_rate": 0.00014608921338668562, + "loss": 0.6843, + "step": 4106 + }, + { + "epoch": 0.36722103004291845, + "grad_norm": 0.13959514311193771, + "learning_rate": 0.00014606351062687391, + "loss": 0.6776, + "step": 4107 + }, + { + "epoch": 0.367310443490701, + "grad_norm": 0.1277488674920889, + "learning_rate": 0.00014603780400377118, + "loss": 0.6694, + "step": 4108 + }, + { + "epoch": 0.36739985693848354, + "grad_norm": 0.1362744516441585, + "learning_rate": 0.00014601209351953345, + "loss": 0.6961, + "step": 4109 + }, + { + "epoch": 0.3674892703862661, + "grad_norm": 0.1084405167167553, + "learning_rate": 0.00014598637917631697, + "loss": 0.6357, + "step": 4110 + }, + { + "epoch": 0.3675786838340486, + "grad_norm": 0.1298822729787, + "learning_rate": 0.00014596066097627842, + "loss": 0.6794, + "step": 4111 + }, + { + "epoch": 0.3676680972818312, + "grad_norm": 0.11699269855627306, + "learning_rate": 0.00014593493892157473, + "loss": 0.6606, + "step": 4112 + }, + { + "epoch": 0.3677575107296137, + "grad_norm": 0.13977839683177964, + "learning_rate": 0.00014590921301436318, + "loss": 0.7274, + "step": 4113 + }, + { + "epoch": 0.3678469241773963, + "grad_norm": 0.12253684355722297, + "learning_rate": 0.0001458834832568014, + "loss": 0.6763, + "step": 4114 + }, + { + "epoch": 0.3679363376251788, + "grad_norm": 0.11715882945823801, + "learning_rate": 0.00014585774965104732, + "loss": 0.6445, + "step": 4115 + }, + { + "epoch": 0.36802575107296137, + "grad_norm": 0.12058303282677975, + "learning_rate": 0.00014583201219925908, + "loss": 0.6874, + "step": 4116 + }, + { + "epoch": 0.36811516452074394, + "grad_norm": 0.13334913866325557, + "learning_rate": 0.0001458062709035954, + "loss": 0.7, + "step": 4117 + }, + { + "epoch": 0.36820457796852646, + "grad_norm": 0.13568674907308922, + "learning_rate": 0.00014578052576621507, + "loss": 0.7159, + "step": 4118 + }, + { + "epoch": 0.36829399141630903, + "grad_norm": 0.12395253629916911, + "learning_rate": 0.00014575477678927732, + "loss": 0.6916, + "step": 4119 + }, + { + "epoch": 0.36838340486409155, + "grad_norm": 0.13505481688994367, + "learning_rate": 0.00014572902397494173, + "loss": 0.6757, + "step": 4120 + }, + { + "epoch": 0.3684728183118741, + "grad_norm": 0.12622991476714457, + "learning_rate": 0.0001457032673253681, + "loss": 0.7001, + "step": 4121 + }, + { + "epoch": 0.36856223175965663, + "grad_norm": 0.13312477834886477, + "learning_rate": 0.00014567750684271665, + "loss": 0.6537, + "step": 4122 + }, + { + "epoch": 0.3686516452074392, + "grad_norm": 0.11242836714777812, + "learning_rate": 0.00014565174252914785, + "loss": 0.6501, + "step": 4123 + }, + { + "epoch": 0.3687410586552217, + "grad_norm": 0.12838390974640274, + "learning_rate": 0.00014562597438682256, + "loss": 0.6436, + "step": 4124 + }, + { + "epoch": 0.3688304721030043, + "grad_norm": 0.11700425130725674, + "learning_rate": 0.0001456002024179019, + "loss": 0.6867, + "step": 4125 + }, + { + "epoch": 0.36891988555078686, + "grad_norm": 0.13487128145073318, + "learning_rate": 0.0001455744266245473, + "loss": 0.6553, + "step": 4126 + }, + { + "epoch": 0.3690092989985694, + "grad_norm": 0.14053359701472531, + "learning_rate": 0.0001455486470089206, + "loss": 0.6757, + "step": 4127 + }, + { + "epoch": 0.36909871244635195, + "grad_norm": 0.13019770655567134, + "learning_rate": 0.0001455228635731839, + "loss": 0.6436, + "step": 4128 + }, + { + "epoch": 0.36918812589413447, + "grad_norm": 0.13676155571194243, + "learning_rate": 0.00014549707631949957, + "loss": 0.6579, + "step": 4129 + }, + { + "epoch": 0.36927753934191704, + "grad_norm": 0.12648262587808, + "learning_rate": 0.00014547128525003045, + "loss": 0.6603, + "step": 4130 + }, + { + "epoch": 0.36936695278969955, + "grad_norm": 0.144775979390438, + "learning_rate": 0.0001454454903669395, + "loss": 0.7066, + "step": 4131 + }, + { + "epoch": 0.3694563662374821, + "grad_norm": 0.13222737758250672, + "learning_rate": 0.0001454196916723902, + "loss": 0.6697, + "step": 4132 + }, + { + "epoch": 0.36954577968526464, + "grad_norm": 0.12038490938368185, + "learning_rate": 0.00014539388916854617, + "loss": 0.6205, + "step": 4133 + }, + { + "epoch": 0.3696351931330472, + "grad_norm": 0.13341474211347837, + "learning_rate": 0.00014536808285757152, + "loss": 0.6628, + "step": 4134 + }, + { + "epoch": 0.3697246065808298, + "grad_norm": 0.13038486727463355, + "learning_rate": 0.00014534227274163051, + "loss": 0.6442, + "step": 4135 + }, + { + "epoch": 0.3698140200286123, + "grad_norm": 0.1169714577260334, + "learning_rate": 0.00014531645882288788, + "loss": 0.7043, + "step": 4136 + }, + { + "epoch": 0.36990343347639487, + "grad_norm": 0.12042847316938962, + "learning_rate": 0.00014529064110350856, + "loss": 0.6677, + "step": 4137 + }, + { + "epoch": 0.3699928469241774, + "grad_norm": 0.13032985814323816, + "learning_rate": 0.00014526481958565787, + "loss": 0.7018, + "step": 4138 + }, + { + "epoch": 0.37008226037195996, + "grad_norm": 0.11744961213757403, + "learning_rate": 0.00014523899427150143, + "loss": 0.6628, + "step": 4139 + }, + { + "epoch": 0.3701716738197425, + "grad_norm": 0.12526477127255375, + "learning_rate": 0.00014521316516320515, + "loss": 0.6811, + "step": 4140 + }, + { + "epoch": 0.37026108726752505, + "grad_norm": 0.134441252291899, + "learning_rate": 0.00014518733226293534, + "loss": 0.6834, + "step": 4141 + }, + { + "epoch": 0.37035050071530756, + "grad_norm": 0.11933437745413629, + "learning_rate": 0.00014516149557285856, + "loss": 0.6632, + "step": 4142 + }, + { + "epoch": 0.37043991416309013, + "grad_norm": 0.12444559178832286, + "learning_rate": 0.00014513565509514167, + "loss": 0.6525, + "step": 4143 + }, + { + "epoch": 0.37052932761087265, + "grad_norm": 0.12256583378019338, + "learning_rate": 0.00014510981083195188, + "loss": 0.6514, + "step": 4144 + }, + { + "epoch": 0.3706187410586552, + "grad_norm": 0.13790497194768012, + "learning_rate": 0.00014508396278545678, + "loss": 0.6375, + "step": 4145 + }, + { + "epoch": 0.3707081545064378, + "grad_norm": 0.14722449284191563, + "learning_rate": 0.0001450581109578241, + "loss": 0.708, + "step": 4146 + }, + { + "epoch": 0.3707975679542203, + "grad_norm": 0.1393346813796873, + "learning_rate": 0.00014503225535122212, + "loss": 0.6921, + "step": 4147 + }, + { + "epoch": 0.3708869814020029, + "grad_norm": 0.1132897505407592, + "learning_rate": 0.00014500639596781926, + "loss": 0.6399, + "step": 4148 + }, + { + "epoch": 0.3709763948497854, + "grad_norm": 0.13771482250625655, + "learning_rate": 0.00014498053280978434, + "loss": 0.6802, + "step": 4149 + }, + { + "epoch": 0.37106580829756797, + "grad_norm": 0.1240307389041555, + "learning_rate": 0.00014495466587928642, + "loss": 0.6883, + "step": 4150 + }, + { + "epoch": 0.3711552217453505, + "grad_norm": 0.11784909186409762, + "learning_rate": 0.00014492879517849497, + "loss": 0.661, + "step": 4151 + }, + { + "epoch": 0.37124463519313305, + "grad_norm": 0.14025072407806705, + "learning_rate": 0.0001449029207095798, + "loss": 0.687, + "step": 4152 + }, + { + "epoch": 0.37133404864091557, + "grad_norm": 0.12115116630093693, + "learning_rate": 0.00014487704247471078, + "loss": 0.6928, + "step": 4153 + }, + { + "epoch": 0.37142346208869814, + "grad_norm": 0.1320514350490006, + "learning_rate": 0.00014485116047605848, + "loss": 0.6766, + "step": 4154 + }, + { + "epoch": 0.3715128755364807, + "grad_norm": 0.1289745516773074, + "learning_rate": 0.00014482527471579353, + "loss": 0.6103, + "step": 4155 + }, + { + "epoch": 0.37160228898426323, + "grad_norm": 0.1307280851318335, + "learning_rate": 0.00014479938519608687, + "loss": 0.6734, + "step": 4156 + }, + { + "epoch": 0.3716917024320458, + "grad_norm": 0.11879890155632798, + "learning_rate": 0.0001447734919191099, + "loss": 0.6666, + "step": 4157 + }, + { + "epoch": 0.3717811158798283, + "grad_norm": 0.12773105631458592, + "learning_rate": 0.00014474759488703425, + "loss": 0.679, + "step": 4158 + }, + { + "epoch": 0.3718705293276109, + "grad_norm": 0.11128208333004481, + "learning_rate": 0.00014472169410203187, + "loss": 0.6874, + "step": 4159 + }, + { + "epoch": 0.3719599427753934, + "grad_norm": 0.12634152224802206, + "learning_rate": 0.00014469578956627496, + "loss": 0.6511, + "step": 4160 + }, + { + "epoch": 0.372049356223176, + "grad_norm": 0.1048846782381673, + "learning_rate": 0.0001446698812819362, + "loss": 0.6356, + "step": 4161 + }, + { + "epoch": 0.3721387696709585, + "grad_norm": 0.11461313987918338, + "learning_rate": 0.00014464396925118847, + "loss": 0.6652, + "step": 4162 + }, + { + "epoch": 0.37222818311874106, + "grad_norm": 0.11129460679802627, + "learning_rate": 0.00014461805347620489, + "loss": 0.6324, + "step": 4163 + }, + { + "epoch": 0.3723175965665236, + "grad_norm": 0.1249240379991832, + "learning_rate": 0.00014459213395915906, + "loss": 0.6836, + "step": 4164 + }, + { + "epoch": 0.37240701001430615, + "grad_norm": 0.13475088403416463, + "learning_rate": 0.00014456621070222484, + "loss": 0.6856, + "step": 4165 + }, + { + "epoch": 0.3724964234620887, + "grad_norm": 0.13855387456122606, + "learning_rate": 0.00014454028370757636, + "loss": 0.6704, + "step": 4166 + }, + { + "epoch": 0.37258583690987124, + "grad_norm": 0.11820653405083897, + "learning_rate": 0.00014451435297738806, + "loss": 0.6689, + "step": 4167 + }, + { + "epoch": 0.3726752503576538, + "grad_norm": 0.12045649111164006, + "learning_rate": 0.00014448841851383472, + "loss": 0.6841, + "step": 4168 + }, + { + "epoch": 0.3727646638054363, + "grad_norm": 0.11345665768325547, + "learning_rate": 0.00014446248031909148, + "loss": 0.662, + "step": 4169 + }, + { + "epoch": 0.3728540772532189, + "grad_norm": 0.1290658375650302, + "learning_rate": 0.0001444365383953337, + "loss": 0.669, + "step": 4170 + }, + { + "epoch": 0.3729434907010014, + "grad_norm": 0.14736188051445948, + "learning_rate": 0.00014441059274473706, + "loss": 0.6922, + "step": 4171 + }, + { + "epoch": 0.373032904148784, + "grad_norm": 0.1390349243754118, + "learning_rate": 0.00014438464336947773, + "loss": 0.7337, + "step": 4172 + }, + { + "epoch": 0.3731223175965665, + "grad_norm": 0.14630378044263573, + "learning_rate": 0.0001443586902717319, + "loss": 0.6869, + "step": 4173 + }, + { + "epoch": 0.37321173104434907, + "grad_norm": 0.13759025580896392, + "learning_rate": 0.0001443327334536763, + "loss": 0.6889, + "step": 4174 + }, + { + "epoch": 0.37330114449213164, + "grad_norm": 0.1297000400643601, + "learning_rate": 0.00014430677291748788, + "loss": 0.667, + "step": 4175 + }, + { + "epoch": 0.37339055793991416, + "grad_norm": 0.1094534663156409, + "learning_rate": 0.00014428080866534396, + "loss": 0.6392, + "step": 4176 + }, + { + "epoch": 0.37347997138769673, + "grad_norm": 0.12264293189191881, + "learning_rate": 0.00014425484069942207, + "loss": 0.6123, + "step": 4177 + }, + { + "epoch": 0.37356938483547925, + "grad_norm": 0.12211681792099886, + "learning_rate": 0.00014422886902190014, + "loss": 0.6594, + "step": 4178 + }, + { + "epoch": 0.3736587982832618, + "grad_norm": 0.126506711271373, + "learning_rate": 0.00014420289363495638, + "loss": 0.6728, + "step": 4179 + }, + { + "epoch": 0.37374821173104433, + "grad_norm": 0.14274441538154645, + "learning_rate": 0.00014417691454076932, + "loss": 0.667, + "step": 4180 + }, + { + "epoch": 0.3738376251788269, + "grad_norm": 0.12979927546317316, + "learning_rate": 0.00014415093174151777, + "loss": 0.6451, + "step": 4181 + }, + { + "epoch": 0.3739270386266094, + "grad_norm": 0.14698951358125867, + "learning_rate": 0.0001441249452393809, + "loss": 0.6743, + "step": 4182 + }, + { + "epoch": 0.374016452074392, + "grad_norm": 0.12293309218000294, + "learning_rate": 0.0001440989550365382, + "loss": 0.5952, + "step": 4183 + }, + { + "epoch": 0.3741058655221745, + "grad_norm": 0.11318197702771918, + "learning_rate": 0.00014407296113516934, + "loss": 0.6488, + "step": 4184 + }, + { + "epoch": 0.3741952789699571, + "grad_norm": 0.1325085639168401, + "learning_rate": 0.00014404696353745452, + "loss": 0.6099, + "step": 4185 + }, + { + "epoch": 0.37428469241773965, + "grad_norm": 0.13741640222024556, + "learning_rate": 0.000144020962245574, + "loss": 0.6618, + "step": 4186 + }, + { + "epoch": 0.37437410586552217, + "grad_norm": 0.12281668185507229, + "learning_rate": 0.00014399495726170858, + "loss": 0.6534, + "step": 4187 + }, + { + "epoch": 0.37446351931330474, + "grad_norm": 0.11260222370508888, + "learning_rate": 0.0001439689485880392, + "loss": 0.6264, + "step": 4188 + }, + { + "epoch": 0.37455293276108725, + "grad_norm": 0.11582785632961101, + "learning_rate": 0.00014394293622674724, + "loss": 0.661, + "step": 4189 + }, + { + "epoch": 0.3746423462088698, + "grad_norm": 0.12002245764086601, + "learning_rate": 0.00014391692018001425, + "loss": 0.6831, + "step": 4190 + }, + { + "epoch": 0.37473175965665234, + "grad_norm": 0.14905632168323157, + "learning_rate": 0.00014389090045002225, + "loss": 0.6467, + "step": 4191 + }, + { + "epoch": 0.3748211731044349, + "grad_norm": 0.12001007021963467, + "learning_rate": 0.0001438648770389534, + "loss": 0.6457, + "step": 4192 + }, + { + "epoch": 0.37491058655221743, + "grad_norm": 0.12158965594976227, + "learning_rate": 0.0001438388499489903, + "loss": 0.6795, + "step": 4193 + }, + { + "epoch": 0.375, + "grad_norm": 0.11929882284269323, + "learning_rate": 0.00014381281918231578, + "loss": 0.677, + "step": 4194 + }, + { + "epoch": 0.37508941344778257, + "grad_norm": 0.13474609731754333, + "learning_rate": 0.00014378678474111304, + "loss": 0.6532, + "step": 4195 + }, + { + "epoch": 0.3751788268955651, + "grad_norm": 0.12888258303756764, + "learning_rate": 0.00014376074662756557, + "loss": 0.64, + "step": 4196 + }, + { + "epoch": 0.37526824034334766, + "grad_norm": 0.11514995177125069, + "learning_rate": 0.0001437347048438571, + "loss": 0.6813, + "step": 4197 + }, + { + "epoch": 0.3753576537911302, + "grad_norm": 0.12655452415739077, + "learning_rate": 0.00014370865939217176, + "loss": 0.7065, + "step": 4198 + }, + { + "epoch": 0.37544706723891275, + "grad_norm": 0.12634574000696103, + "learning_rate": 0.00014368261027469394, + "loss": 0.6809, + "step": 4199 + }, + { + "epoch": 0.37553648068669526, + "grad_norm": 0.12636691996068947, + "learning_rate": 0.00014365655749360833, + "loss": 0.6935, + "step": 4200 + }, + { + "epoch": 0.37562589413447783, + "grad_norm": 0.10953054449013125, + "learning_rate": 0.0001436305010511, + "loss": 0.6589, + "step": 4201 + }, + { + "epoch": 0.37571530758226035, + "grad_norm": 0.12596162656053772, + "learning_rate": 0.00014360444094935424, + "loss": 0.6796, + "step": 4202 + }, + { + "epoch": 0.3758047210300429, + "grad_norm": 0.1265110283227648, + "learning_rate": 0.00014357837719055667, + "loss": 0.6779, + "step": 4203 + }, + { + "epoch": 0.3758941344778255, + "grad_norm": 0.12545115347085914, + "learning_rate": 0.00014355230977689323, + "loss": 0.668, + "step": 4204 + }, + { + "epoch": 0.375983547925608, + "grad_norm": 0.13151077166990072, + "learning_rate": 0.00014352623871055018, + "loss": 0.6586, + "step": 4205 + }, + { + "epoch": 0.3760729613733906, + "grad_norm": 0.1418816081693946, + "learning_rate": 0.00014350016399371405, + "loss": 0.6947, + "step": 4206 + }, + { + "epoch": 0.3761623748211731, + "grad_norm": 0.11564131849105623, + "learning_rate": 0.00014347408562857169, + "loss": 0.6426, + "step": 4207 + }, + { + "epoch": 0.37625178826895567, + "grad_norm": 0.14058705105875652, + "learning_rate": 0.00014344800361731027, + "loss": 0.7101, + "step": 4208 + }, + { + "epoch": 0.3763412017167382, + "grad_norm": 0.132575380595712, + "learning_rate": 0.00014342191796211726, + "loss": 0.6845, + "step": 4209 + }, + { + "epoch": 0.37643061516452075, + "grad_norm": 0.14391975816591568, + "learning_rate": 0.00014339582866518044, + "loss": 0.7056, + "step": 4210 + }, + { + "epoch": 0.37652002861230327, + "grad_norm": 0.15648670056118608, + "learning_rate": 0.00014336973572868787, + "loss": 0.6928, + "step": 4211 + }, + { + "epoch": 0.37660944206008584, + "grad_norm": 0.14571317644439064, + "learning_rate": 0.00014334363915482795, + "loss": 0.7243, + "step": 4212 + }, + { + "epoch": 0.37669885550786836, + "grad_norm": 0.13289677336157293, + "learning_rate": 0.00014331753894578937, + "loss": 0.6701, + "step": 4213 + }, + { + "epoch": 0.37678826895565093, + "grad_norm": 0.11540353319651907, + "learning_rate": 0.00014329143510376108, + "loss": 0.6703, + "step": 4214 + }, + { + "epoch": 0.3768776824034335, + "grad_norm": 0.12070336812439329, + "learning_rate": 0.00014326532763093245, + "loss": 0.666, + "step": 4215 + }, + { + "epoch": 0.376967095851216, + "grad_norm": 0.13240346645789097, + "learning_rate": 0.00014323921652949301, + "loss": 0.6654, + "step": 4216 + }, + { + "epoch": 0.3770565092989986, + "grad_norm": 0.10691279631194729, + "learning_rate": 0.00014321310180163272, + "loss": 0.651, + "step": 4217 + }, + { + "epoch": 0.3771459227467811, + "grad_norm": 0.12393237608239396, + "learning_rate": 0.00014318698344954175, + "loss": 0.6501, + "step": 4218 + }, + { + "epoch": 0.3772353361945637, + "grad_norm": 0.1224418228418481, + "learning_rate": 0.00014316086147541065, + "loss": 0.6759, + "step": 4219 + }, + { + "epoch": 0.3773247496423462, + "grad_norm": 0.12284367531374941, + "learning_rate": 0.00014313473588143026, + "loss": 0.6855, + "step": 4220 + }, + { + "epoch": 0.37741416309012876, + "grad_norm": 0.12394926579556699, + "learning_rate": 0.0001431086066697916, + "loss": 0.6477, + "step": 4221 + }, + { + "epoch": 0.3775035765379113, + "grad_norm": 0.12037021037409226, + "learning_rate": 0.0001430824738426862, + "loss": 0.6335, + "step": 4222 + }, + { + "epoch": 0.37759298998569385, + "grad_norm": 0.12919165395173463, + "learning_rate": 0.00014305633740230574, + "loss": 0.6323, + "step": 4223 + }, + { + "epoch": 0.3776824034334764, + "grad_norm": 0.13983253795679024, + "learning_rate": 0.00014303019735084226, + "loss": 0.6537, + "step": 4224 + }, + { + "epoch": 0.37777181688125894, + "grad_norm": 0.13771160689592685, + "learning_rate": 0.00014300405369048808, + "loss": 0.6761, + "step": 4225 + }, + { + "epoch": 0.3778612303290415, + "grad_norm": 0.11737874689656026, + "learning_rate": 0.00014297790642343587, + "loss": 0.6441, + "step": 4226 + }, + { + "epoch": 0.377950643776824, + "grad_norm": 0.13791608920376686, + "learning_rate": 0.00014295175555187854, + "loss": 0.6677, + "step": 4227 + }, + { + "epoch": 0.3780400572246066, + "grad_norm": 0.13294001256142618, + "learning_rate": 0.00014292560107800935, + "loss": 0.6826, + "step": 4228 + }, + { + "epoch": 0.3781294706723891, + "grad_norm": 0.11520249710390984, + "learning_rate": 0.00014289944300402186, + "loss": 0.6511, + "step": 4229 + }, + { + "epoch": 0.3782188841201717, + "grad_norm": 0.1253065638653619, + "learning_rate": 0.00014287328133210986, + "loss": 0.6733, + "step": 4230 + }, + { + "epoch": 0.3783082975679542, + "grad_norm": 0.13334041942775296, + "learning_rate": 0.00014284711606446754, + "loss": 0.6867, + "step": 4231 + }, + { + "epoch": 0.37839771101573677, + "grad_norm": 0.13159166843246992, + "learning_rate": 0.00014282094720328937, + "loss": 0.6689, + "step": 4232 + }, + { + "epoch": 0.3784871244635193, + "grad_norm": 0.1283860894787853, + "learning_rate": 0.00014279477475077006, + "loss": 0.696, + "step": 4233 + }, + { + "epoch": 0.37857653791130186, + "grad_norm": 0.10914913866864902, + "learning_rate": 0.00014276859870910463, + "loss": 0.6451, + "step": 4234 + }, + { + "epoch": 0.37866595135908443, + "grad_norm": 0.0956018739508924, + "learning_rate": 0.00014274241908048856, + "loss": 0.6539, + "step": 4235 + }, + { + "epoch": 0.37875536480686695, + "grad_norm": 0.12489072876957448, + "learning_rate": 0.00014271623586711738, + "loss": 0.6406, + "step": 4236 + }, + { + "epoch": 0.3788447782546495, + "grad_norm": 0.12874128691997147, + "learning_rate": 0.00014269004907118706, + "loss": 0.6661, + "step": 4237 + }, + { + "epoch": 0.37893419170243203, + "grad_norm": 0.12187540908628142, + "learning_rate": 0.0001426638586948939, + "loss": 0.6761, + "step": 4238 + }, + { + "epoch": 0.3790236051502146, + "grad_norm": 0.13689715864117477, + "learning_rate": 0.00014263766474043445, + "loss": 0.629, + "step": 4239 + }, + { + "epoch": 0.3791130185979971, + "grad_norm": 0.1575810143424251, + "learning_rate": 0.00014261146721000553, + "loss": 0.6995, + "step": 4240 + }, + { + "epoch": 0.3792024320457797, + "grad_norm": 0.10462429086922004, + "learning_rate": 0.00014258526610580433, + "loss": 0.6309, + "step": 4241 + }, + { + "epoch": 0.3792918454935622, + "grad_norm": 0.1386335567762611, + "learning_rate": 0.0001425590614300283, + "loss": 0.713, + "step": 4242 + }, + { + "epoch": 0.3793812589413448, + "grad_norm": 0.1370156691370874, + "learning_rate": 0.0001425328531848752, + "loss": 0.6543, + "step": 4243 + }, + { + "epoch": 0.37947067238912735, + "grad_norm": 0.12110256242035367, + "learning_rate": 0.00014250664137254303, + "loss": 0.6587, + "step": 4244 + }, + { + "epoch": 0.37956008583690987, + "grad_norm": 0.14021860833448288, + "learning_rate": 0.0001424804259952302, + "loss": 0.6798, + "step": 4245 + }, + { + "epoch": 0.37964949928469244, + "grad_norm": 0.1461853700367031, + "learning_rate": 0.00014245420705513535, + "loss": 0.724, + "step": 4246 + }, + { + "epoch": 0.37973891273247495, + "grad_norm": 0.15677773805034056, + "learning_rate": 0.0001424279845544574, + "loss": 0.7223, + "step": 4247 + }, + { + "epoch": 0.3798283261802575, + "grad_norm": 0.12082724709012223, + "learning_rate": 0.00014240175849539565, + "loss": 0.5882, + "step": 4248 + }, + { + "epoch": 0.37991773962804004, + "grad_norm": 0.12643860613546637, + "learning_rate": 0.00014237552888014961, + "loss": 0.6719, + "step": 4249 + }, + { + "epoch": 0.3800071530758226, + "grad_norm": 0.1263496377489484, + "learning_rate": 0.00014234929571091916, + "loss": 0.6614, + "step": 4250 + }, + { + "epoch": 0.38009656652360513, + "grad_norm": 0.12229383833844724, + "learning_rate": 0.0001423230589899044, + "loss": 0.7014, + "step": 4251 + }, + { + "epoch": 0.3801859799713877, + "grad_norm": 0.12951184709302435, + "learning_rate": 0.00014229681871930582, + "loss": 0.6698, + "step": 4252 + }, + { + "epoch": 0.3802753934191702, + "grad_norm": 0.11321812510805015, + "learning_rate": 0.00014227057490132414, + "loss": 0.6194, + "step": 4253 + }, + { + "epoch": 0.3803648068669528, + "grad_norm": 0.12533056313562166, + "learning_rate": 0.00014224432753816036, + "loss": 0.6698, + "step": 4254 + }, + { + "epoch": 0.38045422031473536, + "grad_norm": 0.13166279188039223, + "learning_rate": 0.00014221807663201586, + "loss": 0.6689, + "step": 4255 + }, + { + "epoch": 0.3805436337625179, + "grad_norm": 0.1491836514207552, + "learning_rate": 0.0001421918221850923, + "loss": 0.7292, + "step": 4256 + }, + { + "epoch": 0.38063304721030045, + "grad_norm": 0.12880619123213374, + "learning_rate": 0.0001421655641995915, + "loss": 0.6615, + "step": 4257 + }, + { + "epoch": 0.38072246065808296, + "grad_norm": 0.12484786458076762, + "learning_rate": 0.0001421393026777158, + "loss": 0.6883, + "step": 4258 + }, + { + "epoch": 0.38081187410586553, + "grad_norm": 0.12785616197005864, + "learning_rate": 0.00014211303762166766, + "loss": 0.6459, + "step": 4259 + }, + { + "epoch": 0.38090128755364805, + "grad_norm": 0.140566805797323, + "learning_rate": 0.00014208676903364992, + "loss": 0.5822, + "step": 4260 + }, + { + "epoch": 0.3809907010014306, + "grad_norm": 0.12752772197111112, + "learning_rate": 0.00014206049691586564, + "loss": 0.6482, + "step": 4261 + }, + { + "epoch": 0.38108011444921314, + "grad_norm": 0.14157084275556642, + "learning_rate": 0.00014203422127051835, + "loss": 0.6865, + "step": 4262 + }, + { + "epoch": 0.3811695278969957, + "grad_norm": 0.12821141933320526, + "learning_rate": 0.00014200794209981167, + "loss": 0.6623, + "step": 4263 + }, + { + "epoch": 0.3812589413447783, + "grad_norm": 0.11924866600467326, + "learning_rate": 0.0001419816594059496, + "loss": 0.653, + "step": 4264 + }, + { + "epoch": 0.3813483547925608, + "grad_norm": 0.13159686086129854, + "learning_rate": 0.00014195537319113647, + "loss": 0.6486, + "step": 4265 + }, + { + "epoch": 0.38143776824034337, + "grad_norm": 0.1340469376770978, + "learning_rate": 0.00014192908345757687, + "loss": 0.6487, + "step": 4266 + }, + { + "epoch": 0.3815271816881259, + "grad_norm": 0.1352819118755341, + "learning_rate": 0.0001419027902074757, + "loss": 0.6789, + "step": 4267 + }, + { + "epoch": 0.38161659513590845, + "grad_norm": 0.11432461862818971, + "learning_rate": 0.000141876493443038, + "loss": 0.6531, + "step": 4268 + }, + { + "epoch": 0.38170600858369097, + "grad_norm": 0.12400528230601884, + "learning_rate": 0.0001418501931664695, + "loss": 0.6659, + "step": 4269 + }, + { + "epoch": 0.38179542203147354, + "grad_norm": 0.13406010595807985, + "learning_rate": 0.0001418238893799758, + "loss": 0.6524, + "step": 4270 + }, + { + "epoch": 0.38188483547925606, + "grad_norm": 0.12709542388336328, + "learning_rate": 0.00014179758208576298, + "loss": 0.6926, + "step": 4271 + }, + { + "epoch": 0.38197424892703863, + "grad_norm": 0.1348471410926399, + "learning_rate": 0.00014177127128603745, + "loss": 0.7096, + "step": 4272 + }, + { + "epoch": 0.3820636623748212, + "grad_norm": 0.12125484341460904, + "learning_rate": 0.00014174495698300588, + "loss": 0.647, + "step": 4273 + }, + { + "epoch": 0.3821530758226037, + "grad_norm": 0.13637584523915663, + "learning_rate": 0.00014171863917887513, + "loss": 0.6784, + "step": 4274 + }, + { + "epoch": 0.3822424892703863, + "grad_norm": 0.1361942449725186, + "learning_rate": 0.0001416923178758525, + "loss": 0.6227, + "step": 4275 + }, + { + "epoch": 0.3823319027181688, + "grad_norm": 0.13542476373949444, + "learning_rate": 0.00014166599307614556, + "loss": 0.726, + "step": 4276 + }, + { + "epoch": 0.3824213161659514, + "grad_norm": 0.13007847448932572, + "learning_rate": 0.00014163966478196208, + "loss": 0.6924, + "step": 4277 + }, + { + "epoch": 0.3825107296137339, + "grad_norm": 0.11719079297402707, + "learning_rate": 0.0001416133329955102, + "loss": 0.6437, + "step": 4278 + }, + { + "epoch": 0.38260014306151646, + "grad_norm": 0.12158549409421314, + "learning_rate": 0.00014158699771899832, + "loss": 0.6479, + "step": 4279 + }, + { + "epoch": 0.382689556509299, + "grad_norm": 0.118294159137216, + "learning_rate": 0.0001415606589546352, + "loss": 0.604, + "step": 4280 + }, + { + "epoch": 0.38277896995708155, + "grad_norm": 0.1396004376010757, + "learning_rate": 0.0001415343167046298, + "loss": 0.6826, + "step": 4281 + }, + { + "epoch": 0.38286838340486407, + "grad_norm": 0.12903085307147727, + "learning_rate": 0.0001415079709711914, + "loss": 0.6698, + "step": 4282 + }, + { + "epoch": 0.38295779685264664, + "grad_norm": 0.12341079222895503, + "learning_rate": 0.0001414816217565296, + "loss": 0.6467, + "step": 4283 + }, + { + "epoch": 0.3830472103004292, + "grad_norm": 0.14660431079587732, + "learning_rate": 0.00014145526906285432, + "loss": 0.6668, + "step": 4284 + }, + { + "epoch": 0.3831366237482117, + "grad_norm": 0.13429934178697997, + "learning_rate": 0.00014142891289237563, + "loss": 0.6654, + "step": 4285 + }, + { + "epoch": 0.3832260371959943, + "grad_norm": 0.12454798770777631, + "learning_rate": 0.0001414025532473041, + "loss": 0.6656, + "step": 4286 + }, + { + "epoch": 0.3833154506437768, + "grad_norm": 0.11439201925524083, + "learning_rate": 0.00014137619012985042, + "loss": 0.6423, + "step": 4287 + }, + { + "epoch": 0.3834048640915594, + "grad_norm": 0.11932399274808354, + "learning_rate": 0.00014134982354222563, + "loss": 0.6148, + "step": 4288 + }, + { + "epoch": 0.3834942775393419, + "grad_norm": 0.12192304474563279, + "learning_rate": 0.00014132345348664106, + "loss": 0.6607, + "step": 4289 + }, + { + "epoch": 0.38358369098712447, + "grad_norm": 0.12640529406312387, + "learning_rate": 0.00014129707996530838, + "loss": 0.6843, + "step": 4290 + }, + { + "epoch": 0.383673104434907, + "grad_norm": 0.13818409861063968, + "learning_rate": 0.00014127070298043947, + "loss": 0.6842, + "step": 4291 + }, + { + "epoch": 0.38376251788268956, + "grad_norm": 0.1307277254328571, + "learning_rate": 0.00014124432253424655, + "loss": 0.6833, + "step": 4292 + }, + { + "epoch": 0.38385193133047213, + "grad_norm": 0.11445403597932946, + "learning_rate": 0.0001412179386289421, + "loss": 0.6443, + "step": 4293 + }, + { + "epoch": 0.38394134477825465, + "grad_norm": 0.14046909012734046, + "learning_rate": 0.00014119155126673895, + "loss": 0.6468, + "step": 4294 + }, + { + "epoch": 0.3840307582260372, + "grad_norm": 0.13638170938782918, + "learning_rate": 0.0001411651604498501, + "loss": 0.6553, + "step": 4295 + }, + { + "epoch": 0.38412017167381973, + "grad_norm": 0.1471681519353006, + "learning_rate": 0.00014113876618048897, + "loss": 0.6532, + "step": 4296 + }, + { + "epoch": 0.3842095851216023, + "grad_norm": 0.12221227192009469, + "learning_rate": 0.00014111236846086922, + "loss": 0.6705, + "step": 4297 + }, + { + "epoch": 0.3842989985693848, + "grad_norm": 0.140495844378672, + "learning_rate": 0.00014108596729320473, + "loss": 0.7002, + "step": 4298 + }, + { + "epoch": 0.3843884120171674, + "grad_norm": 0.12929022637990908, + "learning_rate": 0.0001410595626797098, + "loss": 0.6574, + "step": 4299 + }, + { + "epoch": 0.3844778254649499, + "grad_norm": 0.13800314690893659, + "learning_rate": 0.00014103315462259898, + "loss": 0.6967, + "step": 4300 + }, + { + "epoch": 0.3845672389127325, + "grad_norm": 0.13523440039149753, + "learning_rate": 0.000141006743124087, + "loss": 0.6881, + "step": 4301 + }, + { + "epoch": 0.384656652360515, + "grad_norm": 0.12612969989611134, + "learning_rate": 0.000140980328186389, + "loss": 0.6363, + "step": 4302 + }, + { + "epoch": 0.38474606580829757, + "grad_norm": 0.11869412314269821, + "learning_rate": 0.00014095390981172038, + "loss": 0.692, + "step": 4303 + }, + { + "epoch": 0.38483547925608014, + "grad_norm": 0.12556729674937284, + "learning_rate": 0.00014092748800229683, + "loss": 0.6658, + "step": 4304 + }, + { + "epoch": 0.38492489270386265, + "grad_norm": 0.14644396972059368, + "learning_rate": 0.00014090106276033423, + "loss": 0.6724, + "step": 4305 + }, + { + "epoch": 0.3850143061516452, + "grad_norm": 0.1336403761460775, + "learning_rate": 0.00014087463408804892, + "loss": 0.6883, + "step": 4306 + }, + { + "epoch": 0.38510371959942774, + "grad_norm": 0.1209238451742378, + "learning_rate": 0.00014084820198765743, + "loss": 0.6564, + "step": 4307 + }, + { + "epoch": 0.3851931330472103, + "grad_norm": 0.1332100459647131, + "learning_rate": 0.00014082176646137653, + "loss": 0.5934, + "step": 4308 + }, + { + "epoch": 0.38528254649499283, + "grad_norm": 0.1298439612016147, + "learning_rate": 0.0001407953275114234, + "loss": 0.6172, + "step": 4309 + }, + { + "epoch": 0.3853719599427754, + "grad_norm": 0.11144778417322408, + "learning_rate": 0.00014076888514001542, + "loss": 0.644, + "step": 4310 + }, + { + "epoch": 0.3854613733905579, + "grad_norm": 0.12258365413110564, + "learning_rate": 0.0001407424393493703, + "loss": 0.6725, + "step": 4311 + }, + { + "epoch": 0.3855507868383405, + "grad_norm": 0.12877137906675534, + "learning_rate": 0.00014071599014170598, + "loss": 0.6386, + "step": 4312 + }, + { + "epoch": 0.38564020028612306, + "grad_norm": 0.12630386829035656, + "learning_rate": 0.0001406895375192407, + "loss": 0.6781, + "step": 4313 + }, + { + "epoch": 0.3857296137339056, + "grad_norm": 0.10220771849408977, + "learning_rate": 0.0001406630814841931, + "loss": 0.6199, + "step": 4314 + }, + { + "epoch": 0.38581902718168815, + "grad_norm": 0.1199492306128077, + "learning_rate": 0.00014063662203878195, + "loss": 0.669, + "step": 4315 + }, + { + "epoch": 0.38590844062947066, + "grad_norm": 0.11951866381067335, + "learning_rate": 0.00014061015918522639, + "loss": 0.6502, + "step": 4316 + }, + { + "epoch": 0.38599785407725323, + "grad_norm": 0.12688773138451998, + "learning_rate": 0.0001405836929257458, + "loss": 0.6502, + "step": 4317 + }, + { + "epoch": 0.38608726752503575, + "grad_norm": 0.11368920988255825, + "learning_rate": 0.00014055722326255992, + "loss": 0.6833, + "step": 4318 + }, + { + "epoch": 0.3861766809728183, + "grad_norm": 0.11389832640610745, + "learning_rate": 0.0001405307501978887, + "loss": 0.6555, + "step": 4319 + }, + { + "epoch": 0.38626609442060084, + "grad_norm": 0.13129300224198637, + "learning_rate": 0.0001405042737339524, + "loss": 0.6967, + "step": 4320 + }, + { + "epoch": 0.3863555078683834, + "grad_norm": 0.12565610203522984, + "learning_rate": 0.0001404777938729716, + "loss": 0.6403, + "step": 4321 + }, + { + "epoch": 0.386444921316166, + "grad_norm": 0.11716399972306178, + "learning_rate": 0.00014045131061716712, + "loss": 0.6268, + "step": 4322 + }, + { + "epoch": 0.3865343347639485, + "grad_norm": 0.1364141923680457, + "learning_rate": 0.00014042482396876005, + "loss": 0.6525, + "step": 4323 + }, + { + "epoch": 0.38662374821173107, + "grad_norm": 0.14533942974935415, + "learning_rate": 0.0001403983339299718, + "loss": 0.682, + "step": 4324 + }, + { + "epoch": 0.3867131616595136, + "grad_norm": 0.1297515348216203, + "learning_rate": 0.0001403718405030241, + "loss": 0.6484, + "step": 4325 + }, + { + "epoch": 0.38680257510729615, + "grad_norm": 0.1230538714443216, + "learning_rate": 0.00014034534369013887, + "loss": 0.6691, + "step": 4326 + }, + { + "epoch": 0.38689198855507867, + "grad_norm": 0.1319398692773667, + "learning_rate": 0.0001403188434935384, + "loss": 0.6518, + "step": 4327 + }, + { + "epoch": 0.38698140200286124, + "grad_norm": 0.12268397995331559, + "learning_rate": 0.00014029233991544527, + "loss": 0.6588, + "step": 4328 + }, + { + "epoch": 0.38707081545064376, + "grad_norm": 0.1357874329318544, + "learning_rate": 0.0001402658329580822, + "loss": 0.6877, + "step": 4329 + }, + { + "epoch": 0.38716022889842633, + "grad_norm": 0.12737599699659305, + "learning_rate": 0.0001402393226236723, + "loss": 0.6798, + "step": 4330 + }, + { + "epoch": 0.38724964234620884, + "grad_norm": 0.14071449318945065, + "learning_rate": 0.00014021280891443909, + "loss": 0.7076, + "step": 4331 + }, + { + "epoch": 0.3873390557939914, + "grad_norm": 0.11021221160360709, + "learning_rate": 0.0001401862918326061, + "loss": 0.6239, + "step": 4332 + }, + { + "epoch": 0.387428469241774, + "grad_norm": 0.1270358668573588, + "learning_rate": 0.0001401597713803974, + "loss": 0.6358, + "step": 4333 + }, + { + "epoch": 0.3875178826895565, + "grad_norm": 0.13037832050245748, + "learning_rate": 0.00014013324756003716, + "loss": 0.6682, + "step": 4334 + }, + { + "epoch": 0.3876072961373391, + "grad_norm": 0.14624778643221054, + "learning_rate": 0.0001401067203737499, + "loss": 0.7335, + "step": 4335 + }, + { + "epoch": 0.3876967095851216, + "grad_norm": 0.12446835048542841, + "learning_rate": 0.00014008018982376044, + "loss": 0.6929, + "step": 4336 + }, + { + "epoch": 0.38778612303290416, + "grad_norm": 0.12359768995049311, + "learning_rate": 0.0001400536559122939, + "loss": 0.6298, + "step": 4337 + }, + { + "epoch": 0.3878755364806867, + "grad_norm": 0.10363305505613889, + "learning_rate": 0.00014002711864157557, + "loss": 0.6497, + "step": 4338 + }, + { + "epoch": 0.38796494992846925, + "grad_norm": 0.12328724694727682, + "learning_rate": 0.00014000057801383115, + "loss": 0.6893, + "step": 4339 + }, + { + "epoch": 0.38805436337625177, + "grad_norm": 0.12686131382973276, + "learning_rate": 0.0001399740340312866, + "loss": 0.7157, + "step": 4340 + }, + { + "epoch": 0.38814377682403434, + "grad_norm": 0.1371697687499135, + "learning_rate": 0.00013994748669616803, + "loss": 0.6647, + "step": 4341 + }, + { + "epoch": 0.3882331902718169, + "grad_norm": 0.13151057384300097, + "learning_rate": 0.00013992093601070203, + "loss": 0.675, + "step": 4342 + }, + { + "epoch": 0.3883226037195994, + "grad_norm": 0.11798781453524898, + "learning_rate": 0.00013989438197711533, + "loss": 0.6146, + "step": 4343 + }, + { + "epoch": 0.388412017167382, + "grad_norm": 0.11345266137637586, + "learning_rate": 0.000139867824597635, + "loss": 0.6744, + "step": 4344 + }, + { + "epoch": 0.3885014306151645, + "grad_norm": 0.11913981664797499, + "learning_rate": 0.00013984126387448837, + "loss": 0.6327, + "step": 4345 + }, + { + "epoch": 0.3885908440629471, + "grad_norm": 0.13787190578201844, + "learning_rate": 0.00013981469980990302, + "loss": 0.6688, + "step": 4346 + }, + { + "epoch": 0.3886802575107296, + "grad_norm": 0.11767285839375125, + "learning_rate": 0.0001397881324061069, + "loss": 0.6651, + "step": 4347 + }, + { + "epoch": 0.38876967095851217, + "grad_norm": 0.13167600038596827, + "learning_rate": 0.0001397615616653282, + "loss": 0.6388, + "step": 4348 + }, + { + "epoch": 0.3888590844062947, + "grad_norm": 0.13504967789791883, + "learning_rate": 0.00013973498758979532, + "loss": 0.723, + "step": 4349 + }, + { + "epoch": 0.38894849785407726, + "grad_norm": 0.1429931286046482, + "learning_rate": 0.00013970841018173702, + "loss": 0.6741, + "step": 4350 + }, + { + "epoch": 0.3890379113018598, + "grad_norm": 0.1461668208515399, + "learning_rate": 0.0001396818294433823, + "loss": 0.6948, + "step": 4351 + }, + { + "epoch": 0.38912732474964234, + "grad_norm": 0.1352834790090561, + "learning_rate": 0.00013965524537696048, + "loss": 0.6785, + "step": 4352 + }, + { + "epoch": 0.3892167381974249, + "grad_norm": 0.14184611417882645, + "learning_rate": 0.00013962865798470113, + "loss": 0.6878, + "step": 4353 + }, + { + "epoch": 0.38930615164520743, + "grad_norm": 0.13422302190807503, + "learning_rate": 0.00013960206726883407, + "loss": 0.6518, + "step": 4354 + }, + { + "epoch": 0.38939556509299, + "grad_norm": 0.14370356755476904, + "learning_rate": 0.00013957547323158949, + "loss": 0.6852, + "step": 4355 + }, + { + "epoch": 0.3894849785407725, + "grad_norm": 0.1298640205465565, + "learning_rate": 0.00013954887587519773, + "loss": 0.6456, + "step": 4356 + }, + { + "epoch": 0.3895743919885551, + "grad_norm": 0.1350712600174289, + "learning_rate": 0.00013952227520188957, + "loss": 0.6648, + "step": 4357 + }, + { + "epoch": 0.3896638054363376, + "grad_norm": 0.1283241520902227, + "learning_rate": 0.00013949567121389586, + "loss": 0.6779, + "step": 4358 + }, + { + "epoch": 0.3897532188841202, + "grad_norm": 0.15002840297934444, + "learning_rate": 0.00013946906391344791, + "loss": 0.6672, + "step": 4359 + }, + { + "epoch": 0.3898426323319027, + "grad_norm": 0.11352119791922995, + "learning_rate": 0.00013944245330277724, + "loss": 0.6268, + "step": 4360 + }, + { + "epoch": 0.38993204577968527, + "grad_norm": 0.13622724087638166, + "learning_rate": 0.00013941583938411567, + "loss": 0.6931, + "step": 4361 + }, + { + "epoch": 0.39002145922746784, + "grad_norm": 0.13146901743200823, + "learning_rate": 0.00013938922215969523, + "loss": 0.6743, + "step": 4362 + }, + { + "epoch": 0.39011087267525035, + "grad_norm": 0.12636376693584572, + "learning_rate": 0.00013936260163174832, + "loss": 0.6487, + "step": 4363 + }, + { + "epoch": 0.3902002861230329, + "grad_norm": 0.11627070253074458, + "learning_rate": 0.00013933597780250753, + "loss": 0.6544, + "step": 4364 + }, + { + "epoch": 0.39028969957081544, + "grad_norm": 0.12127626770975165, + "learning_rate": 0.0001393093506742058, + "loss": 0.6582, + "step": 4365 + }, + { + "epoch": 0.390379113018598, + "grad_norm": 0.12861171114418085, + "learning_rate": 0.0001392827202490763, + "loss": 0.6798, + "step": 4366 + }, + { + "epoch": 0.3904685264663805, + "grad_norm": 0.1341027145709113, + "learning_rate": 0.00013925608652935249, + "loss": 0.6484, + "step": 4367 + }, + { + "epoch": 0.3905579399141631, + "grad_norm": 0.1441850232231463, + "learning_rate": 0.0001392294495172681, + "loss": 0.7131, + "step": 4368 + }, + { + "epoch": 0.3906473533619456, + "grad_norm": 0.13404751385035477, + "learning_rate": 0.00013920280921505716, + "loss": 0.6945, + "step": 4369 + }, + { + "epoch": 0.3907367668097282, + "grad_norm": 0.14253889633438555, + "learning_rate": 0.00013917616562495396, + "loss": 0.6625, + "step": 4370 + }, + { + "epoch": 0.3908261802575107, + "grad_norm": 0.13337551327580194, + "learning_rate": 0.00013914951874919308, + "loss": 0.6722, + "step": 4371 + }, + { + "epoch": 0.3909155937052933, + "grad_norm": 0.11978179158769166, + "learning_rate": 0.00013912286859000934, + "loss": 0.6295, + "step": 4372 + }, + { + "epoch": 0.39100500715307585, + "grad_norm": 0.14358978581184006, + "learning_rate": 0.00013909621514963784, + "loss": 0.6745, + "step": 4373 + }, + { + "epoch": 0.39109442060085836, + "grad_norm": 0.1421292798468606, + "learning_rate": 0.00013906955843031403, + "loss": 0.635, + "step": 4374 + }, + { + "epoch": 0.39118383404864093, + "grad_norm": 0.12249002768070154, + "learning_rate": 0.00013904289843427348, + "loss": 0.6963, + "step": 4375 + }, + { + "epoch": 0.39127324749642345, + "grad_norm": 0.11562462182370908, + "learning_rate": 0.00013901623516375219, + "loss": 0.6695, + "step": 4376 + }, + { + "epoch": 0.391362660944206, + "grad_norm": 0.15161206582961895, + "learning_rate": 0.00013898956862098643, + "loss": 0.6454, + "step": 4377 + }, + { + "epoch": 0.39145207439198854, + "grad_norm": 0.12012227333801902, + "learning_rate": 0.00013896289880821263, + "loss": 0.6439, + "step": 4378 + }, + { + "epoch": 0.3915414878397711, + "grad_norm": 0.12519669967508484, + "learning_rate": 0.0001389362257276675, + "loss": 0.6565, + "step": 4379 + }, + { + "epoch": 0.3916309012875536, + "grad_norm": 0.12011547063522328, + "learning_rate": 0.00013890954938158823, + "loss": 0.5678, + "step": 4380 + }, + { + "epoch": 0.3917203147353362, + "grad_norm": 0.12753579929345607, + "learning_rate": 0.000138882869772212, + "loss": 0.6721, + "step": 4381 + }, + { + "epoch": 0.39180972818311877, + "grad_norm": 0.13274946556804085, + "learning_rate": 0.00013885618690177642, + "loss": 0.6203, + "step": 4382 + }, + { + "epoch": 0.3918991416309013, + "grad_norm": 0.12470996574072084, + "learning_rate": 0.0001388295007725194, + "loss": 0.6441, + "step": 4383 + }, + { + "epoch": 0.39198855507868385, + "grad_norm": 0.13777274160652392, + "learning_rate": 0.00013880281138667905, + "loss": 0.6913, + "step": 4384 + }, + { + "epoch": 0.39207796852646637, + "grad_norm": 0.14954305897467618, + "learning_rate": 0.00013877611874649375, + "loss": 0.6754, + "step": 4385 + }, + { + "epoch": 0.39216738197424894, + "grad_norm": 0.13574878950235042, + "learning_rate": 0.0001387494228542022, + "loss": 0.6774, + "step": 4386 + }, + { + "epoch": 0.39225679542203146, + "grad_norm": 0.12480255710542454, + "learning_rate": 0.00013872272371204337, + "loss": 0.6617, + "step": 4387 + }, + { + "epoch": 0.39234620886981403, + "grad_norm": 0.12705774454934454, + "learning_rate": 0.00013869602132225646, + "loss": 0.678, + "step": 4388 + }, + { + "epoch": 0.39243562231759654, + "grad_norm": 0.1275144097062747, + "learning_rate": 0.00013866931568708098, + "loss": 0.6502, + "step": 4389 + }, + { + "epoch": 0.3925250357653791, + "grad_norm": 0.12373061616985173, + "learning_rate": 0.00013864260680875666, + "loss": 0.6513, + "step": 4390 + }, + { + "epoch": 0.3926144492131617, + "grad_norm": 0.13292255358181299, + "learning_rate": 0.00013861589468952364, + "loss": 0.6569, + "step": 4391 + }, + { + "epoch": 0.3927038626609442, + "grad_norm": 0.125493152103668, + "learning_rate": 0.0001385891793316221, + "loss": 0.6804, + "step": 4392 + }, + { + "epoch": 0.3927932761087268, + "grad_norm": 0.1344196566606878, + "learning_rate": 0.0001385624607372927, + "loss": 0.6865, + "step": 4393 + }, + { + "epoch": 0.3928826895565093, + "grad_norm": 0.13081718057484162, + "learning_rate": 0.00013853573890877633, + "loss": 0.6423, + "step": 4394 + }, + { + "epoch": 0.39297210300429186, + "grad_norm": 0.11837744752074682, + "learning_rate": 0.0001385090138483141, + "loss": 0.7012, + "step": 4395 + }, + { + "epoch": 0.3930615164520744, + "grad_norm": 0.13764107986294033, + "learning_rate": 0.0001384822855581473, + "loss": 0.6659, + "step": 4396 + }, + { + "epoch": 0.39315092989985695, + "grad_norm": 0.13393221033806427, + "learning_rate": 0.00013845555404051776, + "loss": 0.6329, + "step": 4397 + }, + { + "epoch": 0.39324034334763946, + "grad_norm": 0.11203600235149348, + "learning_rate": 0.00013842881929766732, + "loss": 0.6617, + "step": 4398 + }, + { + "epoch": 0.39332975679542204, + "grad_norm": 0.1228143110911558, + "learning_rate": 0.00013840208133183822, + "loss": 0.6474, + "step": 4399 + }, + { + "epoch": 0.39341917024320455, + "grad_norm": 0.1180649427159415, + "learning_rate": 0.0001383753401452729, + "loss": 0.6552, + "step": 4400 + }, + { + "epoch": 0.3935085836909871, + "grad_norm": 0.12553903538132583, + "learning_rate": 0.00013834859574021418, + "loss": 0.6818, + "step": 4401 + }, + { + "epoch": 0.3935979971387697, + "grad_norm": 0.14436215941970595, + "learning_rate": 0.00013832184811890508, + "loss": 0.6962, + "step": 4402 + }, + { + "epoch": 0.3936874105865522, + "grad_norm": 0.15337367049343553, + "learning_rate": 0.0001382950972835888, + "loss": 0.6903, + "step": 4403 + }, + { + "epoch": 0.3937768240343348, + "grad_norm": 0.12100799029958946, + "learning_rate": 0.000138268343236509, + "loss": 0.6404, + "step": 4404 + }, + { + "epoch": 0.3938662374821173, + "grad_norm": 0.1339960789460822, + "learning_rate": 0.00013824158597990947, + "loss": 0.6745, + "step": 4405 + }, + { + "epoch": 0.39395565092989987, + "grad_norm": 0.10985201677310562, + "learning_rate": 0.00013821482551603425, + "loss": 0.5898, + "step": 4406 + }, + { + "epoch": 0.3940450643776824, + "grad_norm": 0.13828552093867952, + "learning_rate": 0.00013818806184712781, + "loss": 0.7299, + "step": 4407 + }, + { + "epoch": 0.39413447782546496, + "grad_norm": 0.12444381175302155, + "learning_rate": 0.00013816129497543476, + "loss": 0.6415, + "step": 4408 + }, + { + "epoch": 0.3942238912732475, + "grad_norm": 0.12719520514078508, + "learning_rate": 0.00013813452490319997, + "loss": 0.7086, + "step": 4409 + }, + { + "epoch": 0.39431330472103004, + "grad_norm": 0.1250390338931844, + "learning_rate": 0.0001381077516326686, + "loss": 0.6543, + "step": 4410 + }, + { + "epoch": 0.3944027181688126, + "grad_norm": 0.11682242959003214, + "learning_rate": 0.00013808097516608618, + "loss": 0.6372, + "step": 4411 + }, + { + "epoch": 0.39449213161659513, + "grad_norm": 0.13185086332671997, + "learning_rate": 0.00013805419550569833, + "loss": 0.6989, + "step": 4412 + }, + { + "epoch": 0.3945815450643777, + "grad_norm": 0.11449745560580603, + "learning_rate": 0.00013802741265375105, + "loss": 0.6477, + "step": 4413 + }, + { + "epoch": 0.3946709585121602, + "grad_norm": 0.14034452072516285, + "learning_rate": 0.00013800062661249062, + "loss": 0.7328, + "step": 4414 + }, + { + "epoch": 0.3947603719599428, + "grad_norm": 0.13543363902495187, + "learning_rate": 0.00013797383738416353, + "loss": 0.6797, + "step": 4415 + }, + { + "epoch": 0.3948497854077253, + "grad_norm": 0.11477717540533336, + "learning_rate": 0.00013794704497101655, + "loss": 0.6573, + "step": 4416 + }, + { + "epoch": 0.3949391988555079, + "grad_norm": 0.12572894601755402, + "learning_rate": 0.00013792024937529673, + "loss": 0.6126, + "step": 4417 + }, + { + "epoch": 0.3950286123032904, + "grad_norm": 0.13871499949394148, + "learning_rate": 0.0001378934505992514, + "loss": 0.7023, + "step": 4418 + }, + { + "epoch": 0.39511802575107297, + "grad_norm": 0.12741533955136092, + "learning_rate": 0.00013786664864512814, + "loss": 0.6981, + "step": 4419 + }, + { + "epoch": 0.3952074391988555, + "grad_norm": 0.1368676150154264, + "learning_rate": 0.00013783984351517476, + "loss": 0.7206, + "step": 4420 + }, + { + "epoch": 0.39529685264663805, + "grad_norm": 0.13719625460562693, + "learning_rate": 0.00013781303521163943, + "loss": 0.6909, + "step": 4421 + }, + { + "epoch": 0.3953862660944206, + "grad_norm": 0.12730856431025042, + "learning_rate": 0.0001377862237367705, + "loss": 0.6786, + "step": 4422 + }, + { + "epoch": 0.39547567954220314, + "grad_norm": 0.13241592584055592, + "learning_rate": 0.0001377594090928166, + "loss": 0.684, + "step": 4423 + }, + { + "epoch": 0.3955650929899857, + "grad_norm": 0.1381391682908978, + "learning_rate": 0.00013773259128202668, + "loss": 0.6853, + "step": 4424 + }, + { + "epoch": 0.3956545064377682, + "grad_norm": 0.12020762842129264, + "learning_rate": 0.0001377057703066499, + "loss": 0.666, + "step": 4425 + }, + { + "epoch": 0.3957439198855508, + "grad_norm": 0.13306252481141304, + "learning_rate": 0.0001376789461689357, + "loss": 0.6645, + "step": 4426 + }, + { + "epoch": 0.3958333333333333, + "grad_norm": 0.14170883968874187, + "learning_rate": 0.0001376521188711338, + "loss": 0.6859, + "step": 4427 + }, + { + "epoch": 0.3959227467811159, + "grad_norm": 0.11880772074805111, + "learning_rate": 0.00013762528841549415, + "loss": 0.6555, + "step": 4428 + }, + { + "epoch": 0.3960121602288984, + "grad_norm": 0.10956070832182169, + "learning_rate": 0.000137598454804267, + "loss": 0.5978, + "step": 4429 + }, + { + "epoch": 0.396101573676681, + "grad_norm": 0.13659514667773368, + "learning_rate": 0.00013757161803970282, + "loss": 0.6643, + "step": 4430 + }, + { + "epoch": 0.39619098712446355, + "grad_norm": 0.13034491136626722, + "learning_rate": 0.00013754477812405247, + "loss": 0.6612, + "step": 4431 + }, + { + "epoch": 0.39628040057224606, + "grad_norm": 0.13535028322963083, + "learning_rate": 0.0001375179350595669, + "loss": 0.6631, + "step": 4432 + }, + { + "epoch": 0.39636981402002863, + "grad_norm": 0.1355657180340871, + "learning_rate": 0.0001374910888484974, + "loss": 0.6729, + "step": 4433 + }, + { + "epoch": 0.39645922746781115, + "grad_norm": 0.11139441364199804, + "learning_rate": 0.00013746423949309556, + "loss": 0.6545, + "step": 4434 + }, + { + "epoch": 0.3965486409155937, + "grad_norm": 0.14549355147749635, + "learning_rate": 0.00013743738699561323, + "loss": 0.7174, + "step": 4435 + }, + { + "epoch": 0.39663805436337624, + "grad_norm": 0.13530835555000992, + "learning_rate": 0.00013741053135830246, + "loss": 0.6874, + "step": 4436 + }, + { + "epoch": 0.3967274678111588, + "grad_norm": 0.1336256720658387, + "learning_rate": 0.00013738367258341557, + "loss": 0.6611, + "step": 4437 + }, + { + "epoch": 0.3968168812589413, + "grad_norm": 0.12663005615860032, + "learning_rate": 0.00013735681067320526, + "loss": 0.6392, + "step": 4438 + }, + { + "epoch": 0.3969062947067239, + "grad_norm": 0.13727073114115612, + "learning_rate": 0.00013732994562992433, + "loss": 0.6585, + "step": 4439 + }, + { + "epoch": 0.3969957081545064, + "grad_norm": 0.10358265587260367, + "learning_rate": 0.00013730307745582593, + "loss": 0.6079, + "step": 4440 + }, + { + "epoch": 0.397085121602289, + "grad_norm": 0.1332862643977081, + "learning_rate": 0.0001372762061531635, + "loss": 0.659, + "step": 4441 + }, + { + "epoch": 0.39717453505007155, + "grad_norm": 0.1365915588457181, + "learning_rate": 0.00013724933172419066, + "loss": 0.6925, + "step": 4442 + }, + { + "epoch": 0.39726394849785407, + "grad_norm": 0.1402456739611818, + "learning_rate": 0.00013722245417116134, + "loss": 0.6654, + "step": 4443 + }, + { + "epoch": 0.39735336194563664, + "grad_norm": 0.1333478691571548, + "learning_rate": 0.00013719557349632977, + "loss": 0.687, + "step": 4444 + }, + { + "epoch": 0.39744277539341916, + "grad_norm": 0.13412582854849334, + "learning_rate": 0.00013716868970195038, + "loss": 0.6758, + "step": 4445 + }, + { + "epoch": 0.39753218884120173, + "grad_norm": 0.11751719416068586, + "learning_rate": 0.00013714180279027785, + "loss": 0.654, + "step": 4446 + }, + { + "epoch": 0.39762160228898424, + "grad_norm": 0.14480146693085796, + "learning_rate": 0.00013711491276356718, + "loss": 0.7279, + "step": 4447 + }, + { + "epoch": 0.3977110157367668, + "grad_norm": 0.12525808125609345, + "learning_rate": 0.0001370880196240736, + "loss": 0.6496, + "step": 4448 + }, + { + "epoch": 0.39780042918454933, + "grad_norm": 0.11852882484606601, + "learning_rate": 0.00013706112337405263, + "loss": 0.6581, + "step": 4449 + }, + { + "epoch": 0.3978898426323319, + "grad_norm": 0.14321412541879358, + "learning_rate": 0.00013703422401575995, + "loss": 0.6642, + "step": 4450 + }, + { + "epoch": 0.3979792560801145, + "grad_norm": 0.13179978441217774, + "learning_rate": 0.00013700732155145167, + "loss": 0.6873, + "step": 4451 + }, + { + "epoch": 0.398068669527897, + "grad_norm": 0.12397554373503147, + "learning_rate": 0.00013698041598338403, + "loss": 0.7096, + "step": 4452 + }, + { + "epoch": 0.39815808297567956, + "grad_norm": 0.1306788281503666, + "learning_rate": 0.0001369535073138135, + "loss": 0.6615, + "step": 4453 + }, + { + "epoch": 0.3982474964234621, + "grad_norm": 0.1354294908851078, + "learning_rate": 0.00013692659554499702, + "loss": 0.6739, + "step": 4454 + }, + { + "epoch": 0.39833690987124465, + "grad_norm": 0.1308741655712689, + "learning_rate": 0.0001368996806791915, + "loss": 0.6546, + "step": 4455 + }, + { + "epoch": 0.39842632331902716, + "grad_norm": 0.14156404713408285, + "learning_rate": 0.0001368727627186544, + "loss": 0.5813, + "step": 4456 + }, + { + "epoch": 0.39851573676680974, + "grad_norm": 0.13722046164037918, + "learning_rate": 0.00013684584166564315, + "loss": 0.6722, + "step": 4457 + }, + { + "epoch": 0.39860515021459225, + "grad_norm": 0.1326845826407697, + "learning_rate": 0.0001368189175224157, + "loss": 0.6826, + "step": 4458 + }, + { + "epoch": 0.3986945636623748, + "grad_norm": 0.1306324240407259, + "learning_rate": 0.00013679199029123008, + "loss": 0.6916, + "step": 4459 + }, + { + "epoch": 0.3987839771101574, + "grad_norm": 0.13737634355304856, + "learning_rate": 0.00013676505997434467, + "loss": 0.696, + "step": 4460 + }, + { + "epoch": 0.3988733905579399, + "grad_norm": 0.12573748596118267, + "learning_rate": 0.0001367381265740181, + "loss": 0.654, + "step": 4461 + }, + { + "epoch": 0.3989628040057225, + "grad_norm": 0.12556712105830858, + "learning_rate": 0.00013671119009250922, + "loss": 0.6469, + "step": 4462 + }, + { + "epoch": 0.399052217453505, + "grad_norm": 0.12885984298559597, + "learning_rate": 0.00013668425053207713, + "loss": 0.6696, + "step": 4463 + }, + { + "epoch": 0.39914163090128757, + "grad_norm": 0.13682578189306838, + "learning_rate": 0.0001366573078949813, + "loss": 0.6879, + "step": 4464 + }, + { + "epoch": 0.3992310443490701, + "grad_norm": 0.12988857124342723, + "learning_rate": 0.00013663036218348128, + "loss": 0.6509, + "step": 4465 + }, + { + "epoch": 0.39932045779685266, + "grad_norm": 0.12272226499485578, + "learning_rate": 0.00013660341339983707, + "loss": 0.6569, + "step": 4466 + }, + { + "epoch": 0.3994098712446352, + "grad_norm": 0.11547617907394218, + "learning_rate": 0.00013657646154630876, + "loss": 0.6513, + "step": 4467 + }, + { + "epoch": 0.39949928469241774, + "grad_norm": 0.12670614990450693, + "learning_rate": 0.00013654950662515678, + "loss": 0.6283, + "step": 4468 + }, + { + "epoch": 0.39958869814020026, + "grad_norm": 0.13223305103804747, + "learning_rate": 0.00013652254863864185, + "loss": 0.6504, + "step": 4469 + }, + { + "epoch": 0.39967811158798283, + "grad_norm": 0.13579990408099704, + "learning_rate": 0.00013649558758902484, + "loss": 0.6679, + "step": 4470 + }, + { + "epoch": 0.3997675250357654, + "grad_norm": 0.12100230736345854, + "learning_rate": 0.000136468623478567, + "loss": 0.6833, + "step": 4471 + }, + { + "epoch": 0.3998569384835479, + "grad_norm": 0.12503689240293855, + "learning_rate": 0.00013644165630952973, + "loss": 0.6325, + "step": 4472 + }, + { + "epoch": 0.3999463519313305, + "grad_norm": 0.13200586467999986, + "learning_rate": 0.00013641468608417478, + "loss": 0.6806, + "step": 4473 + }, + { + "epoch": 0.400035765379113, + "grad_norm": 0.12756644724577287, + "learning_rate": 0.00013638771280476405, + "loss": 0.615, + "step": 4474 + }, + { + "epoch": 0.4001251788268956, + "grad_norm": 0.12147911562391936, + "learning_rate": 0.00013636073647355982, + "loss": 0.6288, + "step": 4475 + }, + { + "epoch": 0.4002145922746781, + "grad_norm": 0.1296315636972057, + "learning_rate": 0.00013633375709282453, + "loss": 0.678, + "step": 4476 + }, + { + "epoch": 0.40030400572246067, + "grad_norm": 0.11434212123952887, + "learning_rate": 0.00013630677466482092, + "loss": 0.6785, + "step": 4477 + }, + { + "epoch": 0.4003934191702432, + "grad_norm": 0.15182199560679646, + "learning_rate": 0.00013627978919181197, + "loss": 0.6983, + "step": 4478 + }, + { + "epoch": 0.40048283261802575, + "grad_norm": 0.11985734624844571, + "learning_rate": 0.0001362528006760609, + "loss": 0.6604, + "step": 4479 + }, + { + "epoch": 0.4005722460658083, + "grad_norm": 0.13580223801155694, + "learning_rate": 0.0001362258091198312, + "loss": 0.6838, + "step": 4480 + }, + { + "epoch": 0.40066165951359084, + "grad_norm": 0.13275803039784245, + "learning_rate": 0.0001361988145253867, + "loss": 0.6939, + "step": 4481 + }, + { + "epoch": 0.4007510729613734, + "grad_norm": 0.12937439278449042, + "learning_rate": 0.00013617181689499128, + "loss": 0.65, + "step": 4482 + }, + { + "epoch": 0.4008404864091559, + "grad_norm": 0.14336562814153367, + "learning_rate": 0.00013614481623090932, + "loss": 0.6922, + "step": 4483 + }, + { + "epoch": 0.4009298998569385, + "grad_norm": 0.10411860911242656, + "learning_rate": 0.00013611781253540522, + "loss": 0.6132, + "step": 4484 + }, + { + "epoch": 0.401019313304721, + "grad_norm": 0.12912803371193998, + "learning_rate": 0.00013609080581074382, + "loss": 0.6711, + "step": 4485 + }, + { + "epoch": 0.4011087267525036, + "grad_norm": 0.12045159723267715, + "learning_rate": 0.00013606379605919013, + "loss": 0.6489, + "step": 4486 + }, + { + "epoch": 0.4011981402002861, + "grad_norm": 0.12858544090721813, + "learning_rate": 0.00013603678328300939, + "loss": 0.6634, + "step": 4487 + }, + { + "epoch": 0.4012875536480687, + "grad_norm": 0.12977942216001578, + "learning_rate": 0.0001360097674844672, + "loss": 0.6358, + "step": 4488 + }, + { + "epoch": 0.4013769670958512, + "grad_norm": 0.13346205652636153, + "learning_rate": 0.0001359827486658293, + "loss": 0.6581, + "step": 4489 + }, + { + "epoch": 0.40146638054363376, + "grad_norm": 0.13643264539381597, + "learning_rate": 0.00013595572682936172, + "loss": 0.6942, + "step": 4490 + }, + { + "epoch": 0.40155579399141633, + "grad_norm": 0.12465810416086325, + "learning_rate": 0.00013592870197733073, + "loss": 0.6716, + "step": 4491 + }, + { + "epoch": 0.40164520743919885, + "grad_norm": 0.12879110996368243, + "learning_rate": 0.0001359016741120029, + "loss": 0.5658, + "step": 4492 + }, + { + "epoch": 0.4017346208869814, + "grad_norm": 0.1277335729417279, + "learning_rate": 0.00013587464323564503, + "loss": 0.6484, + "step": 4493 + }, + { + "epoch": 0.40182403433476394, + "grad_norm": 0.12682529760889327, + "learning_rate": 0.00013584760935052417, + "loss": 0.6659, + "step": 4494 + }, + { + "epoch": 0.4019134477825465, + "grad_norm": 0.11076972058094783, + "learning_rate": 0.00013582057245890757, + "loss": 0.6589, + "step": 4495 + }, + { + "epoch": 0.402002861230329, + "grad_norm": 0.1323560350066924, + "learning_rate": 0.00013579353256306287, + "loss": 0.6674, + "step": 4496 + }, + { + "epoch": 0.4020922746781116, + "grad_norm": 0.11272538031804434, + "learning_rate": 0.00013576648966525778, + "loss": 0.6639, + "step": 4497 + }, + { + "epoch": 0.4021816881258941, + "grad_norm": 0.12470930270965075, + "learning_rate": 0.00013573944376776042, + "loss": 0.6494, + "step": 4498 + }, + { + "epoch": 0.4022711015736767, + "grad_norm": 0.11996818311163339, + "learning_rate": 0.00013571239487283906, + "loss": 0.661, + "step": 4499 + }, + { + "epoch": 0.40236051502145925, + "grad_norm": 0.11887026676451637, + "learning_rate": 0.00013568534298276228, + "loss": 0.6519, + "step": 4500 + }, + { + "epoch": 0.40244992846924177, + "grad_norm": 0.11941758991887025, + "learning_rate": 0.00013565828809979885, + "loss": 0.6461, + "step": 4501 + }, + { + "epoch": 0.40253934191702434, + "grad_norm": 0.13336899711290046, + "learning_rate": 0.0001356312302262179, + "loss": 0.6733, + "step": 4502 + }, + { + "epoch": 0.40262875536480686, + "grad_norm": 0.11742624793673362, + "learning_rate": 0.0001356041693642887, + "loss": 0.6739, + "step": 4503 + }, + { + "epoch": 0.4027181688125894, + "grad_norm": 0.12452765609504134, + "learning_rate": 0.0001355771055162808, + "loss": 0.675, + "step": 4504 + }, + { + "epoch": 0.40280758226037194, + "grad_norm": 0.12231431551386465, + "learning_rate": 0.00013555003868446404, + "loss": 0.6317, + "step": 4505 + }, + { + "epoch": 0.4028969957081545, + "grad_norm": 0.11667821209484998, + "learning_rate": 0.00013552296887110846, + "loss": 0.6309, + "step": 4506 + }, + { + "epoch": 0.40298640915593703, + "grad_norm": 0.12238956233624956, + "learning_rate": 0.00013549589607848438, + "loss": 0.6882, + "step": 4507 + }, + { + "epoch": 0.4030758226037196, + "grad_norm": 0.1290029283575322, + "learning_rate": 0.00013546882030886237, + "loss": 0.6458, + "step": 4508 + }, + { + "epoch": 0.4031652360515021, + "grad_norm": 0.14526227746868733, + "learning_rate": 0.00013544174156451323, + "loss": 0.7108, + "step": 4509 + }, + { + "epoch": 0.4032546494992847, + "grad_norm": 0.1322502485387699, + "learning_rate": 0.00013541465984770804, + "loss": 0.7147, + "step": 4510 + }, + { + "epoch": 0.40334406294706726, + "grad_norm": 0.138584409738471, + "learning_rate": 0.00013538757516071807, + "loss": 0.6633, + "step": 4511 + }, + { + "epoch": 0.4034334763948498, + "grad_norm": 0.13251091251774627, + "learning_rate": 0.00013536048750581494, + "loss": 0.6816, + "step": 4512 + }, + { + "epoch": 0.40352288984263235, + "grad_norm": 0.15084494625865968, + "learning_rate": 0.0001353333968852704, + "loss": 0.679, + "step": 4513 + }, + { + "epoch": 0.40361230329041486, + "grad_norm": 0.12684200170622706, + "learning_rate": 0.00013530630330135655, + "loss": 0.6316, + "step": 4514 + }, + { + "epoch": 0.40370171673819744, + "grad_norm": 0.11754330707836588, + "learning_rate": 0.0001352792067563457, + "loss": 0.6519, + "step": 4515 + }, + { + "epoch": 0.40379113018597995, + "grad_norm": 0.12487123600791288, + "learning_rate": 0.00013525210725251035, + "loss": 0.6918, + "step": 4516 + }, + { + "epoch": 0.4038805436337625, + "grad_norm": 0.12739281960546583, + "learning_rate": 0.00013522500479212337, + "loss": 0.6486, + "step": 4517 + }, + { + "epoch": 0.40396995708154504, + "grad_norm": 0.11171454184503841, + "learning_rate": 0.00013519789937745775, + "loss": 0.6078, + "step": 4518 + }, + { + "epoch": 0.4040593705293276, + "grad_norm": 0.14147435606876615, + "learning_rate": 0.00013517079101078684, + "loss": 0.6844, + "step": 4519 + }, + { + "epoch": 0.4041487839771102, + "grad_norm": 0.12192366489477646, + "learning_rate": 0.00013514367969438414, + "loss": 0.6496, + "step": 4520 + }, + { + "epoch": 0.4042381974248927, + "grad_norm": 0.13580904422058163, + "learning_rate": 0.0001351165654305235, + "loss": 0.697, + "step": 4521 + }, + { + "epoch": 0.40432761087267527, + "grad_norm": 0.13207457934341582, + "learning_rate": 0.00013508944822147891, + "loss": 0.6828, + "step": 4522 + }, + { + "epoch": 0.4044170243204578, + "grad_norm": 0.12099209439439194, + "learning_rate": 0.00013506232806952467, + "loss": 0.655, + "step": 4523 + }, + { + "epoch": 0.40450643776824036, + "grad_norm": 0.13022814116949769, + "learning_rate": 0.0001350352049769353, + "loss": 0.6957, + "step": 4524 + }, + { + "epoch": 0.4045958512160229, + "grad_norm": 0.1366995788921072, + "learning_rate": 0.00013500807894598565, + "loss": 0.6836, + "step": 4525 + }, + { + "epoch": 0.40468526466380544, + "grad_norm": 0.11147159273146073, + "learning_rate": 0.00013498094997895069, + "loss": 0.6375, + "step": 4526 + }, + { + "epoch": 0.40477467811158796, + "grad_norm": 0.12068810205492021, + "learning_rate": 0.00013495381807810569, + "loss": 0.6964, + "step": 4527 + }, + { + "epoch": 0.40486409155937053, + "grad_norm": 0.14963537780581776, + "learning_rate": 0.00013492668324572614, + "loss": 0.7023, + "step": 4528 + }, + { + "epoch": 0.4049535050071531, + "grad_norm": 0.11435911750555718, + "learning_rate": 0.0001348995454840879, + "loss": 0.65, + "step": 4529 + }, + { + "epoch": 0.4050429184549356, + "grad_norm": 0.14626703080234407, + "learning_rate": 0.00013487240479546691, + "loss": 0.7102, + "step": 4530 + }, + { + "epoch": 0.4051323319027182, + "grad_norm": 0.11988386201224827, + "learning_rate": 0.00013484526118213942, + "loss": 0.6543, + "step": 4531 + }, + { + "epoch": 0.4052217453505007, + "grad_norm": 0.13908968856330683, + "learning_rate": 0.000134818114646382, + "loss": 0.6695, + "step": 4532 + }, + { + "epoch": 0.4053111587982833, + "grad_norm": 0.12785271634962553, + "learning_rate": 0.00013479096519047136, + "loss": 0.6673, + "step": 4533 + }, + { + "epoch": 0.4054005722460658, + "grad_norm": 0.1345949643167071, + "learning_rate": 0.00013476381281668447, + "loss": 0.6931, + "step": 4534 + }, + { + "epoch": 0.40548998569384836, + "grad_norm": 0.13125547469241697, + "learning_rate": 0.00013473665752729855, + "loss": 0.6648, + "step": 4535 + }, + { + "epoch": 0.4055793991416309, + "grad_norm": 0.14765948172672352, + "learning_rate": 0.00013470949932459117, + "loss": 0.6715, + "step": 4536 + }, + { + "epoch": 0.40566881258941345, + "grad_norm": 0.12861473677851679, + "learning_rate": 0.00013468233821083996, + "loss": 0.7026, + "step": 4537 + }, + { + "epoch": 0.40575822603719597, + "grad_norm": 0.13090709394105907, + "learning_rate": 0.0001346551741883229, + "loss": 0.6834, + "step": 4538 + }, + { + "epoch": 0.40584763948497854, + "grad_norm": 0.12680282607424437, + "learning_rate": 0.0001346280072593183, + "loss": 0.6988, + "step": 4539 + }, + { + "epoch": 0.4059370529327611, + "grad_norm": 0.1200467029857276, + "learning_rate": 0.00013460083742610455, + "loss": 0.6329, + "step": 4540 + }, + { + "epoch": 0.4060264663805436, + "grad_norm": 0.1421362081593003, + "learning_rate": 0.00013457366469096029, + "loss": 0.6956, + "step": 4541 + }, + { + "epoch": 0.4061158798283262, + "grad_norm": 0.14726909995571935, + "learning_rate": 0.00013454648905616458, + "loss": 0.7142, + "step": 4542 + }, + { + "epoch": 0.4062052932761087, + "grad_norm": 0.12256364480916368, + "learning_rate": 0.00013451931052399656, + "loss": 0.667, + "step": 4543 + }, + { + "epoch": 0.4062947067238913, + "grad_norm": 0.11322993604701337, + "learning_rate": 0.00013449212909673563, + "loss": 0.6341, + "step": 4544 + }, + { + "epoch": 0.4063841201716738, + "grad_norm": 0.11591391322954843, + "learning_rate": 0.00013446494477666146, + "loss": 0.6521, + "step": 4545 + }, + { + "epoch": 0.4064735336194564, + "grad_norm": 0.12163209808980437, + "learning_rate": 0.00013443775756605405, + "loss": 0.6369, + "step": 4546 + }, + { + "epoch": 0.4065629470672389, + "grad_norm": 0.12219730677410054, + "learning_rate": 0.0001344105674671935, + "loss": 0.6309, + "step": 4547 + }, + { + "epoch": 0.40665236051502146, + "grad_norm": 0.13234734150415617, + "learning_rate": 0.00013438337448236015, + "loss": 0.6845, + "step": 4548 + }, + { + "epoch": 0.40674177396280403, + "grad_norm": 0.12585742670239244, + "learning_rate": 0.0001343561786138348, + "loss": 0.6402, + "step": 4549 + }, + { + "epoch": 0.40683118741058655, + "grad_norm": 0.11852049313092848, + "learning_rate": 0.00013432897986389818, + "loss": 0.6718, + "step": 4550 + }, + { + "epoch": 0.4069206008583691, + "grad_norm": 0.13940886140538006, + "learning_rate": 0.00013430177823483148, + "loss": 0.6835, + "step": 4551 + }, + { + "epoch": 0.40701001430615164, + "grad_norm": 0.13788413248060913, + "learning_rate": 0.00013427457372891608, + "loss": 0.6515, + "step": 4552 + }, + { + "epoch": 0.4070994277539342, + "grad_norm": 0.13010004759361038, + "learning_rate": 0.00013424736634843357, + "loss": 0.6808, + "step": 4553 + }, + { + "epoch": 0.4071888412017167, + "grad_norm": 0.11831640973782806, + "learning_rate": 0.0001342201560956658, + "loss": 0.6676, + "step": 4554 + }, + { + "epoch": 0.4072782546494993, + "grad_norm": 0.12790268342958028, + "learning_rate": 0.00013419294297289486, + "loss": 0.6768, + "step": 4555 + }, + { + "epoch": 0.4073676680972818, + "grad_norm": 0.12911831759079223, + "learning_rate": 0.00013416572698240312, + "loss": 0.6895, + "step": 4556 + }, + { + "epoch": 0.4074570815450644, + "grad_norm": 0.13455385924360155, + "learning_rate": 0.00013413850812647312, + "loss": 0.6427, + "step": 4557 + }, + { + "epoch": 0.4075464949928469, + "grad_norm": 0.12633622378347512, + "learning_rate": 0.00013411128640738762, + "loss": 0.6699, + "step": 4558 + }, + { + "epoch": 0.40763590844062947, + "grad_norm": 0.14984019222572384, + "learning_rate": 0.00013408406182742976, + "loss": 0.6858, + "step": 4559 + }, + { + "epoch": 0.40772532188841204, + "grad_norm": 0.11873690528198927, + "learning_rate": 0.00013405683438888282, + "loss": 0.6858, + "step": 4560 + }, + { + "epoch": 0.40781473533619456, + "grad_norm": 0.11729393301798835, + "learning_rate": 0.00013402960409403028, + "loss": 0.6621, + "step": 4561 + }, + { + "epoch": 0.4079041487839771, + "grad_norm": 0.11595069941391899, + "learning_rate": 0.00013400237094515592, + "loss": 0.6233, + "step": 4562 + }, + { + "epoch": 0.40799356223175964, + "grad_norm": 0.13154012518395206, + "learning_rate": 0.0001339751349445438, + "loss": 0.6772, + "step": 4563 + }, + { + "epoch": 0.4080829756795422, + "grad_norm": 0.14142120308442827, + "learning_rate": 0.00013394789609447817, + "loss": 0.6876, + "step": 4564 + }, + { + "epoch": 0.40817238912732473, + "grad_norm": 0.1382185912438061, + "learning_rate": 0.00013392065439724344, + "loss": 0.6808, + "step": 4565 + }, + { + "epoch": 0.4082618025751073, + "grad_norm": 0.12632686551068983, + "learning_rate": 0.00013389340985512442, + "loss": 0.7175, + "step": 4566 + }, + { + "epoch": 0.4083512160228898, + "grad_norm": 0.12551575760013026, + "learning_rate": 0.00013386616247040606, + "loss": 0.6566, + "step": 4567 + }, + { + "epoch": 0.4084406294706724, + "grad_norm": 0.13843016098064426, + "learning_rate": 0.00013383891224537354, + "loss": 0.6345, + "step": 4568 + }, + { + "epoch": 0.40853004291845496, + "grad_norm": 0.15450570541395076, + "learning_rate": 0.0001338116591823123, + "loss": 0.7032, + "step": 4569 + }, + { + "epoch": 0.4086194563662375, + "grad_norm": 0.1253552719139386, + "learning_rate": 0.0001337844032835081, + "loss": 0.6632, + "step": 4570 + }, + { + "epoch": 0.40870886981402005, + "grad_norm": 0.13140811574378408, + "learning_rate": 0.0001337571445512467, + "loss": 0.6292, + "step": 4571 + }, + { + "epoch": 0.40879828326180256, + "grad_norm": 0.1307151908017851, + "learning_rate": 0.00013372988298781442, + "loss": 0.6444, + "step": 4572 + }, + { + "epoch": 0.40888769670958514, + "grad_norm": 0.13741467079755285, + "learning_rate": 0.00013370261859549758, + "loss": 0.7092, + "step": 4573 + }, + { + "epoch": 0.40897711015736765, + "grad_norm": 0.15506000490575345, + "learning_rate": 0.00013367535137658282, + "loss": 0.6497, + "step": 4574 + }, + { + "epoch": 0.4090665236051502, + "grad_norm": 0.13664693189802532, + "learning_rate": 0.00013364808133335703, + "loss": 0.6786, + "step": 4575 + }, + { + "epoch": 0.40915593705293274, + "grad_norm": 0.13110305565699848, + "learning_rate": 0.00013362080846810725, + "loss": 0.6595, + "step": 4576 + }, + { + "epoch": 0.4092453505007153, + "grad_norm": 0.12754561789658206, + "learning_rate": 0.0001335935327831209, + "loss": 0.6874, + "step": 4577 + }, + { + "epoch": 0.4093347639484979, + "grad_norm": 0.1405828113083196, + "learning_rate": 0.0001335662542806855, + "loss": 0.6832, + "step": 4578 + }, + { + "epoch": 0.4094241773962804, + "grad_norm": 0.11341096821966433, + "learning_rate": 0.00013353897296308892, + "loss": 0.6777, + "step": 4579 + }, + { + "epoch": 0.40951359084406297, + "grad_norm": 0.13440840429724657, + "learning_rate": 0.00013351168883261915, + "loss": 0.6756, + "step": 4580 + }, + { + "epoch": 0.4096030042918455, + "grad_norm": 0.11675876198745958, + "learning_rate": 0.00013348440189156455, + "loss": 0.6269, + "step": 4581 + }, + { + "epoch": 0.40969241773962806, + "grad_norm": 0.12110134921912409, + "learning_rate": 0.00013345711214221359, + "loss": 0.6916, + "step": 4582 + }, + { + "epoch": 0.4097818311874106, + "grad_norm": 0.11998740691707692, + "learning_rate": 0.00013342981958685502, + "loss": 0.6295, + "step": 4583 + }, + { + "epoch": 0.40987124463519314, + "grad_norm": 0.13319671542150247, + "learning_rate": 0.00013340252422777788, + "loss": 0.6909, + "step": 4584 + }, + { + "epoch": 0.40996065808297566, + "grad_norm": 0.13658153012462954, + "learning_rate": 0.00013337522606727132, + "loss": 0.7, + "step": 4585 + }, + { + "epoch": 0.41005007153075823, + "grad_norm": 0.13762977476830324, + "learning_rate": 0.00013334792510762491, + "loss": 0.6827, + "step": 4586 + }, + { + "epoch": 0.41013948497854075, + "grad_norm": 0.1259936834582774, + "learning_rate": 0.0001333206213511283, + "loss": 0.6644, + "step": 4587 + }, + { + "epoch": 0.4102288984263233, + "grad_norm": 0.12591027197083007, + "learning_rate": 0.00013329331480007139, + "loss": 0.6319, + "step": 4588 + }, + { + "epoch": 0.4103183118741059, + "grad_norm": 0.12765925724858876, + "learning_rate": 0.0001332660054567444, + "loss": 0.6738, + "step": 4589 + }, + { + "epoch": 0.4104077253218884, + "grad_norm": 0.11738155578911436, + "learning_rate": 0.00013323869332343768, + "loss": 0.6463, + "step": 4590 + }, + { + "epoch": 0.410497138769671, + "grad_norm": 0.1320381780458928, + "learning_rate": 0.00013321137840244192, + "loss": 0.6447, + "step": 4591 + }, + { + "epoch": 0.4105865522174535, + "grad_norm": 0.11151191132187124, + "learning_rate": 0.00013318406069604794, + "loss": 0.6306, + "step": 4592 + }, + { + "epoch": 0.41067596566523606, + "grad_norm": 0.12020898720001295, + "learning_rate": 0.00013315674020654688, + "loss": 0.6527, + "step": 4593 + }, + { + "epoch": 0.4107653791130186, + "grad_norm": 0.1195703347041719, + "learning_rate": 0.00013312941693623004, + "loss": 0.6908, + "step": 4594 + }, + { + "epoch": 0.41085479256080115, + "grad_norm": 0.15179854745406215, + "learning_rate": 0.00013310209088738902, + "loss": 0.6926, + "step": 4595 + }, + { + "epoch": 0.41094420600858367, + "grad_norm": 0.13270695649889228, + "learning_rate": 0.00013307476206231563, + "loss": 0.6799, + "step": 4596 + }, + { + "epoch": 0.41103361945636624, + "grad_norm": 0.1443282762088797, + "learning_rate": 0.0001330474304633019, + "loss": 0.6552, + "step": 4597 + }, + { + "epoch": 0.4111230329041488, + "grad_norm": 0.13953077130523275, + "learning_rate": 0.00013302009609264005, + "loss": 0.6959, + "step": 4598 + }, + { + "epoch": 0.4112124463519313, + "grad_norm": 0.12881819643247153, + "learning_rate": 0.00013299275895262266, + "loss": 0.7056, + "step": 4599 + }, + { + "epoch": 0.4113018597997139, + "grad_norm": 0.1294643841710929, + "learning_rate": 0.00013296541904554238, + "loss": 0.6902, + "step": 4600 + }, + { + "epoch": 0.4113912732474964, + "grad_norm": 0.13100955015084118, + "learning_rate": 0.00013293807637369226, + "loss": 0.6896, + "step": 4601 + }, + { + "epoch": 0.411480686695279, + "grad_norm": 0.14394414938682315, + "learning_rate": 0.00013291073093936543, + "loss": 0.717, + "step": 4602 + }, + { + "epoch": 0.4115701001430615, + "grad_norm": 0.13560625822920727, + "learning_rate": 0.00013288338274485532, + "loss": 0.7042, + "step": 4603 + }, + { + "epoch": 0.4116595135908441, + "grad_norm": 0.12643026491393913, + "learning_rate": 0.00013285603179245565, + "loss": 0.6433, + "step": 4604 + }, + { + "epoch": 0.4117489270386266, + "grad_norm": 0.1328046830965579, + "learning_rate": 0.00013282867808446025, + "loss": 0.6739, + "step": 4605 + }, + { + "epoch": 0.41183834048640916, + "grad_norm": 0.13732576691217646, + "learning_rate": 0.0001328013216231633, + "loss": 0.6791, + "step": 4606 + }, + { + "epoch": 0.4119277539341917, + "grad_norm": 0.1306295766312596, + "learning_rate": 0.00013277396241085908, + "loss": 0.6464, + "step": 4607 + }, + { + "epoch": 0.41201716738197425, + "grad_norm": 0.13282796903172744, + "learning_rate": 0.00013274660044984224, + "loss": 0.6498, + "step": 4608 + }, + { + "epoch": 0.4121065808297568, + "grad_norm": 0.12364921627288063, + "learning_rate": 0.00013271923574240756, + "loss": 0.6298, + "step": 4609 + }, + { + "epoch": 0.41219599427753933, + "grad_norm": 0.13234293105740488, + "learning_rate": 0.00013269186829085008, + "loss": 0.6723, + "step": 4610 + }, + { + "epoch": 0.4122854077253219, + "grad_norm": 0.1374584980193778, + "learning_rate": 0.0001326644980974651, + "loss": 0.6847, + "step": 4611 + }, + { + "epoch": 0.4123748211731044, + "grad_norm": 0.15223004723468173, + "learning_rate": 0.00013263712516454812, + "loss": 0.6687, + "step": 4612 + }, + { + "epoch": 0.412464234620887, + "grad_norm": 0.11410073339886681, + "learning_rate": 0.00013260974949439485, + "loss": 0.6603, + "step": 4613 + }, + { + "epoch": 0.4125536480686695, + "grad_norm": 0.12014897845230335, + "learning_rate": 0.00013258237108930128, + "loss": 0.668, + "step": 4614 + }, + { + "epoch": 0.4126430615164521, + "grad_norm": 0.16399909820694117, + "learning_rate": 0.0001325549899515636, + "loss": 0.6662, + "step": 4615 + }, + { + "epoch": 0.4127324749642346, + "grad_norm": 0.11383875037534798, + "learning_rate": 0.00013252760608347826, + "loss": 0.649, + "step": 4616 + }, + { + "epoch": 0.41282188841201717, + "grad_norm": 0.1275284781480193, + "learning_rate": 0.00013250021948734184, + "loss": 0.6746, + "step": 4617 + }, + { + "epoch": 0.41291130185979974, + "grad_norm": 0.13694646705343594, + "learning_rate": 0.00013247283016545126, + "loss": 0.6789, + "step": 4618 + }, + { + "epoch": 0.41300071530758226, + "grad_norm": 0.14141446197467297, + "learning_rate": 0.00013244543812010364, + "loss": 0.6924, + "step": 4619 + }, + { + "epoch": 0.4130901287553648, + "grad_norm": 0.12220512100701647, + "learning_rate": 0.00013241804335359633, + "loss": 0.652, + "step": 4620 + }, + { + "epoch": 0.41317954220314734, + "grad_norm": 0.14215533963686172, + "learning_rate": 0.00013239064586822685, + "loss": 0.6203, + "step": 4621 + }, + { + "epoch": 0.4132689556509299, + "grad_norm": 0.09534549479550836, + "learning_rate": 0.000132363245666293, + "loss": 0.6002, + "step": 4622 + }, + { + "epoch": 0.41335836909871243, + "grad_norm": 0.1233375795563227, + "learning_rate": 0.00013233584275009288, + "loss": 0.6781, + "step": 4623 + }, + { + "epoch": 0.413447782546495, + "grad_norm": 0.12976951637287318, + "learning_rate": 0.00013230843712192463, + "loss": 0.6456, + "step": 4624 + }, + { + "epoch": 0.4135371959942775, + "grad_norm": 0.14758396229326703, + "learning_rate": 0.0001322810287840868, + "loss": 0.7316, + "step": 4625 + }, + { + "epoch": 0.4136266094420601, + "grad_norm": 0.1187727152482722, + "learning_rate": 0.00013225361773887804, + "loss": 0.6375, + "step": 4626 + }, + { + "epoch": 0.4137160228898426, + "grad_norm": 0.11847527534316661, + "learning_rate": 0.00013222620398859738, + "loss": 0.6646, + "step": 4627 + }, + { + "epoch": 0.4138054363376252, + "grad_norm": 0.11625657323820472, + "learning_rate": 0.00013219878753554384, + "loss": 0.6425, + "step": 4628 + }, + { + "epoch": 0.41389484978540775, + "grad_norm": 0.12149720139396447, + "learning_rate": 0.0001321713683820169, + "loss": 0.6611, + "step": 4629 + }, + { + "epoch": 0.41398426323319026, + "grad_norm": 0.1283216448467327, + "learning_rate": 0.00013214394653031616, + "loss": 0.6457, + "step": 4630 + }, + { + "epoch": 0.41407367668097284, + "grad_norm": 0.13516260047480652, + "learning_rate": 0.00013211652198274145, + "loss": 0.6658, + "step": 4631 + }, + { + "epoch": 0.41416309012875535, + "grad_norm": 0.14283808935293513, + "learning_rate": 0.0001320890947415928, + "loss": 0.6753, + "step": 4632 + }, + { + "epoch": 0.4142525035765379, + "grad_norm": 0.1293170258948286, + "learning_rate": 0.00013206166480917055, + "loss": 0.6788, + "step": 4633 + }, + { + "epoch": 0.41434191702432044, + "grad_norm": 0.12407590116029225, + "learning_rate": 0.0001320342321877752, + "loss": 0.6592, + "step": 4634 + }, + { + "epoch": 0.414431330472103, + "grad_norm": 0.11844186836718075, + "learning_rate": 0.00013200679687970748, + "loss": 0.6746, + "step": 4635 + }, + { + "epoch": 0.4145207439198855, + "grad_norm": 0.15140350707070088, + "learning_rate": 0.00013197935888726832, + "loss": 0.6937, + "step": 4636 + }, + { + "epoch": 0.4146101573676681, + "grad_norm": 0.11777967982773467, + "learning_rate": 0.000131951918212759, + "loss": 0.6442, + "step": 4637 + }, + { + "epoch": 0.41469957081545067, + "grad_norm": 0.12452773693110882, + "learning_rate": 0.00013192447485848088, + "loss": 0.6541, + "step": 4638 + }, + { + "epoch": 0.4147889842632332, + "grad_norm": 0.13594087347444336, + "learning_rate": 0.00013189702882673556, + "loss": 0.6658, + "step": 4639 + }, + { + "epoch": 0.41487839771101576, + "grad_norm": 0.14093292276970895, + "learning_rate": 0.00013186958011982502, + "loss": 0.6789, + "step": 4640 + }, + { + "epoch": 0.41496781115879827, + "grad_norm": 0.13230142533243705, + "learning_rate": 0.00013184212874005124, + "loss": 0.594, + "step": 4641 + }, + { + "epoch": 0.41505722460658084, + "grad_norm": 0.12566956739361904, + "learning_rate": 0.0001318146746897166, + "loss": 0.7015, + "step": 4642 + }, + { + "epoch": 0.41514663805436336, + "grad_norm": 0.11888893471591223, + "learning_rate": 0.00013178721797112362, + "loss": 0.6825, + "step": 4643 + }, + { + "epoch": 0.41523605150214593, + "grad_norm": 0.10224246850906149, + "learning_rate": 0.00013175975858657505, + "loss": 0.6025, + "step": 4644 + }, + { + "epoch": 0.41532546494992845, + "grad_norm": 0.13240592109963398, + "learning_rate": 0.00013173229653837387, + "loss": 0.6576, + "step": 4645 + }, + { + "epoch": 0.415414878397711, + "grad_norm": 0.12551789772012492, + "learning_rate": 0.0001317048318288233, + "loss": 0.6666, + "step": 4646 + }, + { + "epoch": 0.4155042918454936, + "grad_norm": 0.13154381544439683, + "learning_rate": 0.0001316773644602268, + "loss": 0.6825, + "step": 4647 + }, + { + "epoch": 0.4155937052932761, + "grad_norm": 0.10725004480937225, + "learning_rate": 0.00013164989443488798, + "loss": 0.6218, + "step": 4648 + }, + { + "epoch": 0.4156831187410587, + "grad_norm": 0.10935065575404315, + "learning_rate": 0.00013162242175511076, + "loss": 0.6354, + "step": 4649 + }, + { + "epoch": 0.4157725321888412, + "grad_norm": 0.12459692723504932, + "learning_rate": 0.0001315949464231992, + "loss": 0.704, + "step": 4650 + }, + { + "epoch": 0.41586194563662376, + "grad_norm": 0.14117772407178192, + "learning_rate": 0.00013156746844145766, + "loss": 0.7029, + "step": 4651 + }, + { + "epoch": 0.4159513590844063, + "grad_norm": 0.12208944318210105, + "learning_rate": 0.00013153998781219062, + "loss": 0.6725, + "step": 4652 + }, + { + "epoch": 0.41604077253218885, + "grad_norm": 0.12907535251428928, + "learning_rate": 0.00013151250453770293, + "loss": 0.6432, + "step": 4653 + }, + { + "epoch": 0.41613018597997137, + "grad_norm": 0.13287966085386255, + "learning_rate": 0.00013148501862029954, + "loss": 0.6501, + "step": 4654 + }, + { + "epoch": 0.41621959942775394, + "grad_norm": 0.13773049966965414, + "learning_rate": 0.00013145753006228565, + "loss": 0.7091, + "step": 4655 + }, + { + "epoch": 0.41630901287553645, + "grad_norm": 0.14614555990110406, + "learning_rate": 0.00013143003886596669, + "loss": 0.7123, + "step": 4656 + }, + { + "epoch": 0.416398426323319, + "grad_norm": 0.11740201295837574, + "learning_rate": 0.00013140254503364837, + "loss": 0.6805, + "step": 4657 + }, + { + "epoch": 0.4164878397711016, + "grad_norm": 0.1254851554182412, + "learning_rate": 0.00013137504856763652, + "loss": 0.6787, + "step": 4658 + }, + { + "epoch": 0.4165772532188841, + "grad_norm": 0.12637411428773976, + "learning_rate": 0.0001313475494702372, + "loss": 0.6501, + "step": 4659 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.13237930850392313, + "learning_rate": 0.0001313200477437568, + "loss": 0.6724, + "step": 4660 + }, + { + "epoch": 0.4167560801144492, + "grad_norm": 0.14206653525363333, + "learning_rate": 0.00013129254339050181, + "loss": 0.6586, + "step": 4661 + }, + { + "epoch": 0.4168454935622318, + "grad_norm": 0.1364343421123656, + "learning_rate": 0.00013126503641277897, + "loss": 0.6489, + "step": 4662 + }, + { + "epoch": 0.4169349070100143, + "grad_norm": 0.12423933664184211, + "learning_rate": 0.00013123752681289529, + "loss": 0.6786, + "step": 4663 + }, + { + "epoch": 0.41702432045779686, + "grad_norm": 0.12609664760431014, + "learning_rate": 0.000131210014593158, + "loss": 0.6383, + "step": 4664 + }, + { + "epoch": 0.4171137339055794, + "grad_norm": 0.12090332447676035, + "learning_rate": 0.00013118249975587447, + "loss": 0.6472, + "step": 4665 + }, + { + "epoch": 0.41720314735336195, + "grad_norm": 0.12968532379171024, + "learning_rate": 0.0001311549823033523, + "loss": 0.6816, + "step": 4666 + }, + { + "epoch": 0.4172925608011445, + "grad_norm": 0.12908930072194383, + "learning_rate": 0.0001311274622378994, + "loss": 0.6681, + "step": 4667 + }, + { + "epoch": 0.41738197424892703, + "grad_norm": 0.1278705197508452, + "learning_rate": 0.0001310999395618239, + "loss": 0.6737, + "step": 4668 + }, + { + "epoch": 0.4174713876967096, + "grad_norm": 0.1316451430525128, + "learning_rate": 0.00013107241427743398, + "loss": 0.6816, + "step": 4669 + }, + { + "epoch": 0.4175608011444921, + "grad_norm": 0.14146887912197376, + "learning_rate": 0.0001310448863870382, + "loss": 0.7322, + "step": 4670 + }, + { + "epoch": 0.4176502145922747, + "grad_norm": 0.11546632676758165, + "learning_rate": 0.0001310173558929453, + "loss": 0.65, + "step": 4671 + }, + { + "epoch": 0.4177396280400572, + "grad_norm": 0.1253485407208305, + "learning_rate": 0.00013098982279746422, + "loss": 0.6753, + "step": 4672 + }, + { + "epoch": 0.4178290414878398, + "grad_norm": 0.13108934496025126, + "learning_rate": 0.0001309622871029041, + "loss": 0.653, + "step": 4673 + }, + { + "epoch": 0.4179184549356223, + "grad_norm": 0.12703884844794053, + "learning_rate": 0.00013093474881157438, + "loss": 0.657, + "step": 4674 + }, + { + "epoch": 0.41800786838340487, + "grad_norm": 0.12171133591189635, + "learning_rate": 0.00013090720792578465, + "loss": 0.6796, + "step": 4675 + }, + { + "epoch": 0.4180972818311874, + "grad_norm": 0.11895658593478595, + "learning_rate": 0.00013087966444784468, + "loss": 0.6228, + "step": 4676 + }, + { + "epoch": 0.41818669527896996, + "grad_norm": 0.13140497633604412, + "learning_rate": 0.00013085211838006458, + "loss": 0.6312, + "step": 4677 + }, + { + "epoch": 0.4182761087267525, + "grad_norm": 0.12520858284462502, + "learning_rate": 0.00013082456972475458, + "loss": 0.6731, + "step": 4678 + }, + { + "epoch": 0.41836552217453504, + "grad_norm": 0.13019253519021645, + "learning_rate": 0.0001307970184842251, + "loss": 0.6654, + "step": 4679 + }, + { + "epoch": 0.4184549356223176, + "grad_norm": 0.12073369315702971, + "learning_rate": 0.0001307694646607869, + "loss": 0.681, + "step": 4680 + }, + { + "epoch": 0.41854434907010013, + "grad_norm": 0.13245172580714867, + "learning_rate": 0.00013074190825675087, + "loss": 0.6373, + "step": 4681 + }, + { + "epoch": 0.4186337625178827, + "grad_norm": 0.14502787177752957, + "learning_rate": 0.00013071434927442813, + "loss": 0.6919, + "step": 4682 + }, + { + "epoch": 0.4187231759656652, + "grad_norm": 0.12877305330019426, + "learning_rate": 0.00013068678771612996, + "loss": 0.6615, + "step": 4683 + }, + { + "epoch": 0.4188125894134478, + "grad_norm": 0.1329190876902813, + "learning_rate": 0.00013065922358416798, + "loss": 0.7309, + "step": 4684 + }, + { + "epoch": 0.4189020028612303, + "grad_norm": 0.12132554712063964, + "learning_rate": 0.00013063165688085397, + "loss": 0.6691, + "step": 4685 + }, + { + "epoch": 0.4189914163090129, + "grad_norm": 0.11883820104186725, + "learning_rate": 0.00013060408760849987, + "loss": 0.6753, + "step": 4686 + }, + { + "epoch": 0.41908082975679545, + "grad_norm": 0.11997196930411903, + "learning_rate": 0.00013057651576941793, + "loss": 0.6631, + "step": 4687 + }, + { + "epoch": 0.41917024320457796, + "grad_norm": 0.13445048413815847, + "learning_rate": 0.00013054894136592052, + "loss": 0.6783, + "step": 4688 + }, + { + "epoch": 0.41925965665236054, + "grad_norm": 0.11696919892173126, + "learning_rate": 0.00013052136440032028, + "loss": 0.6277, + "step": 4689 + }, + { + "epoch": 0.41934907010014305, + "grad_norm": 0.12607516973929117, + "learning_rate": 0.00013049378487493008, + "loss": 0.7162, + "step": 4690 + }, + { + "epoch": 0.4194384835479256, + "grad_norm": 0.14247260100441755, + "learning_rate": 0.00013046620279206296, + "loss": 0.6664, + "step": 4691 + }, + { + "epoch": 0.41952789699570814, + "grad_norm": 0.12769313710003863, + "learning_rate": 0.00013043861815403225, + "loss": 0.6505, + "step": 4692 + }, + { + "epoch": 0.4196173104434907, + "grad_norm": 0.14001325247110644, + "learning_rate": 0.0001304110309631513, + "loss": 0.6883, + "step": 4693 + }, + { + "epoch": 0.4197067238912732, + "grad_norm": 0.13323258650828412, + "learning_rate": 0.000130383441221734, + "loss": 0.6934, + "step": 4694 + }, + { + "epoch": 0.4197961373390558, + "grad_norm": 0.14621216852033822, + "learning_rate": 0.00013035584893209416, + "loss": 0.5928, + "step": 4695 + }, + { + "epoch": 0.4198855507868383, + "grad_norm": 0.13321281785236008, + "learning_rate": 0.00013032825409654592, + "loss": 0.6514, + "step": 4696 + }, + { + "epoch": 0.4199749642346209, + "grad_norm": 0.14764701234322636, + "learning_rate": 0.00013030065671740363, + "loss": 0.6727, + "step": 4697 + }, + { + "epoch": 0.42006437768240346, + "grad_norm": 0.11566075531155486, + "learning_rate": 0.00013027305679698186, + "loss": 0.6685, + "step": 4698 + }, + { + "epoch": 0.42015379113018597, + "grad_norm": 0.1472693419765523, + "learning_rate": 0.00013024545433759538, + "loss": 0.7242, + "step": 4699 + }, + { + "epoch": 0.42024320457796854, + "grad_norm": 0.14776453669197206, + "learning_rate": 0.00013021784934155915, + "loss": 0.7042, + "step": 4700 + }, + { + "epoch": 0.42033261802575106, + "grad_norm": 0.12712891677359592, + "learning_rate": 0.00013019024181118845, + "loss": 0.6749, + "step": 4701 + }, + { + "epoch": 0.42042203147353363, + "grad_norm": 0.13053541358738843, + "learning_rate": 0.00013016263174879858, + "loss": 0.6883, + "step": 4702 + }, + { + "epoch": 0.42051144492131615, + "grad_norm": 0.14809146192655698, + "learning_rate": 0.00013013501915670522, + "loss": 0.682, + "step": 4703 + }, + { + "epoch": 0.4206008583690987, + "grad_norm": 0.11643448449885989, + "learning_rate": 0.0001301074040372242, + "loss": 0.6489, + "step": 4704 + }, + { + "epoch": 0.42069027181688123, + "grad_norm": 0.13542952946748535, + "learning_rate": 0.0001300797863926716, + "loss": 0.6632, + "step": 4705 + }, + { + "epoch": 0.4207796852646638, + "grad_norm": 0.10678892649451664, + "learning_rate": 0.00013005216622536355, + "loss": 0.6149, + "step": 4706 + }, + { + "epoch": 0.4208690987124464, + "grad_norm": 0.12630658550417329, + "learning_rate": 0.00013002454353761665, + "loss": 0.7099, + "step": 4707 + }, + { + "epoch": 0.4209585121602289, + "grad_norm": 0.1388847192624808, + "learning_rate": 0.0001299969183317476, + "loss": 0.66, + "step": 4708 + }, + { + "epoch": 0.42104792560801146, + "grad_norm": 0.1276225487748427, + "learning_rate": 0.0001299692906100732, + "loss": 0.6615, + "step": 4709 + }, + { + "epoch": 0.421137339055794, + "grad_norm": 0.1433559199854091, + "learning_rate": 0.00012994166037491058, + "loss": 0.6847, + "step": 4710 + }, + { + "epoch": 0.42122675250357655, + "grad_norm": 0.12156454579926214, + "learning_rate": 0.00012991402762857707, + "loss": 0.598, + "step": 4711 + }, + { + "epoch": 0.42131616595135907, + "grad_norm": 0.15150689994585934, + "learning_rate": 0.00012988639237339022, + "loss": 0.6735, + "step": 4712 + }, + { + "epoch": 0.42140557939914164, + "grad_norm": 0.13108601041934417, + "learning_rate": 0.0001298587546116677, + "loss": 0.6575, + "step": 4713 + }, + { + "epoch": 0.42149499284692415, + "grad_norm": 0.1267360078215547, + "learning_rate": 0.00012983111434572748, + "loss": 0.6177, + "step": 4714 + }, + { + "epoch": 0.4215844062947067, + "grad_norm": 0.13313728126666993, + "learning_rate": 0.00012980347157788777, + "loss": 0.5798, + "step": 4715 + }, + { + "epoch": 0.4216738197424893, + "grad_norm": 0.15071702710307008, + "learning_rate": 0.00012977582631046685, + "loss": 0.6698, + "step": 4716 + }, + { + "epoch": 0.4217632331902718, + "grad_norm": 0.13042867672592348, + "learning_rate": 0.0001297481785457834, + "loss": 0.6633, + "step": 4717 + }, + { + "epoch": 0.4218526466380544, + "grad_norm": 0.12362089804106056, + "learning_rate": 0.00012972052828615606, + "loss": 0.6168, + "step": 4718 + }, + { + "epoch": 0.4219420600858369, + "grad_norm": 0.1324998696174766, + "learning_rate": 0.00012969287553390397, + "loss": 0.6918, + "step": 4719 + }, + { + "epoch": 0.4220314735336195, + "grad_norm": 0.14016421330464368, + "learning_rate": 0.00012966522029134623, + "loss": 0.6643, + "step": 4720 + }, + { + "epoch": 0.422120886981402, + "grad_norm": 0.1318501089755161, + "learning_rate": 0.0001296375625608023, + "loss": 0.7063, + "step": 4721 + }, + { + "epoch": 0.42221030042918456, + "grad_norm": 0.11219651689668891, + "learning_rate": 0.0001296099023445918, + "loss": 0.6534, + "step": 4722 + }, + { + "epoch": 0.4222997138769671, + "grad_norm": 0.13844502481383067, + "learning_rate": 0.00012958223964503452, + "loss": 0.6972, + "step": 4723 + }, + { + "epoch": 0.42238912732474965, + "grad_norm": 0.12476814985220182, + "learning_rate": 0.00012955457446445055, + "loss": 0.6286, + "step": 4724 + }, + { + "epoch": 0.42247854077253216, + "grad_norm": 0.11178932219578289, + "learning_rate": 0.00012952690680516016, + "loss": 0.6563, + "step": 4725 + }, + { + "epoch": 0.42256795422031473, + "grad_norm": 0.13256658982990163, + "learning_rate": 0.0001294992366694837, + "loss": 0.6803, + "step": 4726 + }, + { + "epoch": 0.4226573676680973, + "grad_norm": 0.12275930484052602, + "learning_rate": 0.00012947156405974187, + "loss": 0.6534, + "step": 4727 + }, + { + "epoch": 0.4227467811158798, + "grad_norm": 0.14757840270126546, + "learning_rate": 0.0001294438889782556, + "loss": 0.6708, + "step": 4728 + }, + { + "epoch": 0.4228361945636624, + "grad_norm": 0.11853867122333704, + "learning_rate": 0.00012941621142734594, + "loss": 0.6397, + "step": 4729 + }, + { + "epoch": 0.4229256080114449, + "grad_norm": 0.11498428232024968, + "learning_rate": 0.00012938853140933407, + "loss": 0.6493, + "step": 4730 + }, + { + "epoch": 0.4230150214592275, + "grad_norm": 0.12059214116247166, + "learning_rate": 0.0001293608489265416, + "loss": 0.6642, + "step": 4731 + }, + { + "epoch": 0.42310443490701, + "grad_norm": 0.10888103307383526, + "learning_rate": 0.00012933316398129022, + "loss": 0.6178, + "step": 4732 + }, + { + "epoch": 0.42319384835479257, + "grad_norm": 0.13801984146680363, + "learning_rate": 0.00012930547657590179, + "loss": 0.6877, + "step": 4733 + }, + { + "epoch": 0.4232832618025751, + "grad_norm": 0.1277988330399816, + "learning_rate": 0.00012927778671269842, + "loss": 0.6498, + "step": 4734 + }, + { + "epoch": 0.42337267525035766, + "grad_norm": 0.1316746894731682, + "learning_rate": 0.00012925009439400243, + "loss": 0.6014, + "step": 4735 + }, + { + "epoch": 0.4234620886981402, + "grad_norm": 0.13845880808907443, + "learning_rate": 0.00012922239962213637, + "loss": 0.6939, + "step": 4736 + }, + { + "epoch": 0.42355150214592274, + "grad_norm": 0.12010818553668316, + "learning_rate": 0.00012919470239942292, + "loss": 0.6418, + "step": 4737 + }, + { + "epoch": 0.4236409155937053, + "grad_norm": 0.1381024835917395, + "learning_rate": 0.00012916700272818505, + "loss": 0.6452, + "step": 4738 + }, + { + "epoch": 0.42373032904148783, + "grad_norm": 0.11103495359076826, + "learning_rate": 0.00012913930061074592, + "loss": 0.6356, + "step": 4739 + }, + { + "epoch": 0.4238197424892704, + "grad_norm": 0.11709968895147835, + "learning_rate": 0.00012911159604942879, + "loss": 0.6479, + "step": 4740 + }, + { + "epoch": 0.4239091559370529, + "grad_norm": 0.1444997913280812, + "learning_rate": 0.0001290838890465573, + "loss": 0.7316, + "step": 4741 + }, + { + "epoch": 0.4239985693848355, + "grad_norm": 0.12939026840866694, + "learning_rate": 0.00012905617960445512, + "loss": 0.6915, + "step": 4742 + }, + { + "epoch": 0.424087982832618, + "grad_norm": 0.13611547714542604, + "learning_rate": 0.00012902846772544624, + "loss": 0.6754, + "step": 4743 + }, + { + "epoch": 0.4241773962804006, + "grad_norm": 0.11936596033033842, + "learning_rate": 0.00012900075341185487, + "loss": 0.6578, + "step": 4744 + }, + { + "epoch": 0.4242668097281831, + "grad_norm": 0.1407980705305285, + "learning_rate": 0.0001289730366660053, + "loss": 0.6723, + "step": 4745 + }, + { + "epoch": 0.42435622317596566, + "grad_norm": 0.11456485794116267, + "learning_rate": 0.00012894531749022217, + "loss": 0.6515, + "step": 4746 + }, + { + "epoch": 0.42444563662374823, + "grad_norm": 0.11152667761504123, + "learning_rate": 0.00012891759588683018, + "loss": 0.6219, + "step": 4747 + }, + { + "epoch": 0.42453505007153075, + "grad_norm": 0.11506137880768523, + "learning_rate": 0.0001288898718581544, + "loss": 0.633, + "step": 4748 + }, + { + "epoch": 0.4246244635193133, + "grad_norm": 0.12469541748654037, + "learning_rate": 0.0001288621454065199, + "loss": 0.6494, + "step": 4749 + }, + { + "epoch": 0.42471387696709584, + "grad_norm": 0.1397602810302089, + "learning_rate": 0.00012883441653425214, + "loss": 0.6704, + "step": 4750 + }, + { + "epoch": 0.4248032904148784, + "grad_norm": 0.1288092043372824, + "learning_rate": 0.00012880668524367672, + "loss": 0.6507, + "step": 4751 + }, + { + "epoch": 0.4248927038626609, + "grad_norm": 0.1521504862419311, + "learning_rate": 0.00012877895153711935, + "loss": 0.7128, + "step": 4752 + }, + { + "epoch": 0.4249821173104435, + "grad_norm": 0.1242618614207361, + "learning_rate": 0.0001287512154169061, + "loss": 0.6757, + "step": 4753 + }, + { + "epoch": 0.425071530758226, + "grad_norm": 0.14818754175312918, + "learning_rate": 0.00012872347688536312, + "loss": 0.6989, + "step": 4754 + }, + { + "epoch": 0.4251609442060086, + "grad_norm": 0.12669253891273233, + "learning_rate": 0.00012869573594481685, + "loss": 0.598, + "step": 4755 + }, + { + "epoch": 0.42525035765379116, + "grad_norm": 0.12937335427406602, + "learning_rate": 0.00012866799259759386, + "loss": 0.6078, + "step": 4756 + }, + { + "epoch": 0.42533977110157367, + "grad_norm": 0.11960043502891289, + "learning_rate": 0.0001286402468460209, + "loss": 0.6419, + "step": 4757 + }, + { + "epoch": 0.42542918454935624, + "grad_norm": 0.12979632404672545, + "learning_rate": 0.0001286124986924251, + "loss": 0.652, + "step": 4758 + }, + { + "epoch": 0.42551859799713876, + "grad_norm": 0.1346272655630689, + "learning_rate": 0.00012858474813913352, + "loss": 0.6918, + "step": 4759 + }, + { + "epoch": 0.42560801144492133, + "grad_norm": 0.13961785607281332, + "learning_rate": 0.00012855699518847367, + "loss": 0.6996, + "step": 4760 + }, + { + "epoch": 0.42569742489270385, + "grad_norm": 0.1228309451913087, + "learning_rate": 0.00012852923984277314, + "loss": 0.6442, + "step": 4761 + }, + { + "epoch": 0.4257868383404864, + "grad_norm": 0.1333249137712477, + "learning_rate": 0.0001285014821043597, + "loss": 0.6998, + "step": 4762 + }, + { + "epoch": 0.42587625178826893, + "grad_norm": 0.13851998554962805, + "learning_rate": 0.00012847372197556138, + "loss": 0.6668, + "step": 4763 + }, + { + "epoch": 0.4259656652360515, + "grad_norm": 0.12676470978145216, + "learning_rate": 0.00012844595945870637, + "loss": 0.6166, + "step": 4764 + }, + { + "epoch": 0.426055078683834, + "grad_norm": 0.13899482293850338, + "learning_rate": 0.00012841819455612313, + "loss": 0.6704, + "step": 4765 + }, + { + "epoch": 0.4261444921316166, + "grad_norm": 0.12054128119695805, + "learning_rate": 0.0001283904272701402, + "loss": 0.6659, + "step": 4766 + }, + { + "epoch": 0.42623390557939916, + "grad_norm": 0.1280958148866336, + "learning_rate": 0.0001283626576030864, + "loss": 0.6521, + "step": 4767 + }, + { + "epoch": 0.4263233190271817, + "grad_norm": 0.1353298640831068, + "learning_rate": 0.0001283348855572908, + "loss": 0.6643, + "step": 4768 + }, + { + "epoch": 0.42641273247496425, + "grad_norm": 0.1175895560469458, + "learning_rate": 0.00012830711113508256, + "loss": 0.6428, + "step": 4769 + }, + { + "epoch": 0.42650214592274677, + "grad_norm": 0.137488043374189, + "learning_rate": 0.0001282793343387911, + "loss": 0.6306, + "step": 4770 + }, + { + "epoch": 0.42659155937052934, + "grad_norm": 0.12288888803076158, + "learning_rate": 0.000128251555170746, + "loss": 0.5724, + "step": 4771 + }, + { + "epoch": 0.42668097281831185, + "grad_norm": 0.13647658650321542, + "learning_rate": 0.00012822377363327713, + "loss": 0.6893, + "step": 4772 + }, + { + "epoch": 0.4267703862660944, + "grad_norm": 0.1311022669688, + "learning_rate": 0.00012819598972871443, + "loss": 0.6425, + "step": 4773 + }, + { + "epoch": 0.42685979971387694, + "grad_norm": 0.13846526974693743, + "learning_rate": 0.0001281682034593881, + "loss": 0.6794, + "step": 4774 + }, + { + "epoch": 0.4269492131616595, + "grad_norm": 0.12743657725480573, + "learning_rate": 0.0001281404148276286, + "loss": 0.6533, + "step": 4775 + }, + { + "epoch": 0.4270386266094421, + "grad_norm": 0.1310248151166644, + "learning_rate": 0.00012811262383576646, + "loss": 0.6553, + "step": 4776 + }, + { + "epoch": 0.4271280400572246, + "grad_norm": 0.1394570807393252, + "learning_rate": 0.00012808483048613252, + "loss": 0.6382, + "step": 4777 + }, + { + "epoch": 0.42721745350500717, + "grad_norm": 0.15763029577034612, + "learning_rate": 0.00012805703478105778, + "loss": 0.7155, + "step": 4778 + }, + { + "epoch": 0.4273068669527897, + "grad_norm": 0.12531059101427086, + "learning_rate": 0.00012802923672287342, + "loss": 0.6581, + "step": 4779 + }, + { + "epoch": 0.42739628040057226, + "grad_norm": 0.12335562000823025, + "learning_rate": 0.00012800143631391082, + "loss": 0.6848, + "step": 4780 + }, + { + "epoch": 0.4274856938483548, + "grad_norm": 0.12942172160208648, + "learning_rate": 0.00012797363355650154, + "loss": 0.6217, + "step": 4781 + }, + { + "epoch": 0.42757510729613735, + "grad_norm": 0.1302665538033458, + "learning_rate": 0.00012794582845297744, + "loss": 0.6772, + "step": 4782 + }, + { + "epoch": 0.42766452074391986, + "grad_norm": 0.14447060699190648, + "learning_rate": 0.00012791802100567043, + "loss": 0.6654, + "step": 4783 + }, + { + "epoch": 0.42775393419170243, + "grad_norm": 0.13003071915226305, + "learning_rate": 0.00012789021121691274, + "loss": 0.6676, + "step": 4784 + }, + { + "epoch": 0.427843347639485, + "grad_norm": 0.12939217231411312, + "learning_rate": 0.0001278623990890367, + "loss": 0.6805, + "step": 4785 + }, + { + "epoch": 0.4279327610872675, + "grad_norm": 0.11718515934370032, + "learning_rate": 0.0001278345846243749, + "loss": 0.6609, + "step": 4786 + }, + { + "epoch": 0.4280221745350501, + "grad_norm": 0.12032144580323206, + "learning_rate": 0.00012780676782526014, + "loss": 0.6449, + "step": 4787 + }, + { + "epoch": 0.4281115879828326, + "grad_norm": 0.12522762784932467, + "learning_rate": 0.0001277789486940253, + "loss": 0.6397, + "step": 4788 + }, + { + "epoch": 0.4282010014306152, + "grad_norm": 0.11606477447025534, + "learning_rate": 0.0001277511272330036, + "loss": 0.6625, + "step": 4789 + }, + { + "epoch": 0.4282904148783977, + "grad_norm": 0.13223945959546357, + "learning_rate": 0.00012772330344452834, + "loss": 0.6081, + "step": 4790 + }, + { + "epoch": 0.42837982832618027, + "grad_norm": 0.13480009520141695, + "learning_rate": 0.00012769547733093312, + "loss": 0.6736, + "step": 4791 + }, + { + "epoch": 0.4284692417739628, + "grad_norm": 0.12356206744235122, + "learning_rate": 0.0001276676488945517, + "loss": 0.6283, + "step": 4792 + }, + { + "epoch": 0.42855865522174535, + "grad_norm": 0.12015502920545883, + "learning_rate": 0.00012763981813771795, + "loss": 0.6426, + "step": 4793 + }, + { + "epoch": 0.42864806866952787, + "grad_norm": 0.11689714982838988, + "learning_rate": 0.00012761198506276603, + "loss": 0.611, + "step": 4794 + }, + { + "epoch": 0.42873748211731044, + "grad_norm": 0.14350238338162744, + "learning_rate": 0.00012758414967203028, + "loss": 0.7084, + "step": 4795 + }, + { + "epoch": 0.428826895565093, + "grad_norm": 0.13446543392259847, + "learning_rate": 0.00012755631196784522, + "loss": 0.6193, + "step": 4796 + }, + { + "epoch": 0.42891630901287553, + "grad_norm": 0.1388934424666328, + "learning_rate": 0.00012752847195254553, + "loss": 0.6537, + "step": 4797 + }, + { + "epoch": 0.4290057224606581, + "grad_norm": 0.125370241808868, + "learning_rate": 0.00012750062962846613, + "loss": 0.6335, + "step": 4798 + }, + { + "epoch": 0.4290951359084406, + "grad_norm": 0.14461800535886243, + "learning_rate": 0.0001274727849979422, + "loss": 0.681, + "step": 4799 + }, + { + "epoch": 0.4291845493562232, + "grad_norm": 0.11979439882031082, + "learning_rate": 0.0001274449380633089, + "loss": 0.6386, + "step": 4800 + }, + { + "epoch": 0.4292739628040057, + "grad_norm": 0.1258467310039248, + "learning_rate": 0.0001274170888269018, + "loss": 0.6755, + "step": 4801 + }, + { + "epoch": 0.4293633762517883, + "grad_norm": 0.12750971434736647, + "learning_rate": 0.00012738923729105662, + "loss": 0.6731, + "step": 4802 + }, + { + "epoch": 0.4294527896995708, + "grad_norm": 0.12498465885231613, + "learning_rate": 0.00012736138345810917, + "loss": 0.6882, + "step": 4803 + }, + { + "epoch": 0.42954220314735336, + "grad_norm": 0.1152479345411343, + "learning_rate": 0.0001273335273303955, + "loss": 0.6471, + "step": 4804 + }, + { + "epoch": 0.42963161659513593, + "grad_norm": 0.11764669311637127, + "learning_rate": 0.00012730566891025195, + "loss": 0.6541, + "step": 4805 + }, + { + "epoch": 0.42972103004291845, + "grad_norm": 0.13271507136807867, + "learning_rate": 0.0001272778082000149, + "loss": 0.6734, + "step": 4806 + }, + { + "epoch": 0.429810443490701, + "grad_norm": 0.1106511212981975, + "learning_rate": 0.000127249945202021, + "loss": 0.6387, + "step": 4807 + }, + { + "epoch": 0.42989985693848354, + "grad_norm": 0.14411156192051094, + "learning_rate": 0.00012722207991860713, + "loss": 0.6788, + "step": 4808 + }, + { + "epoch": 0.4299892703862661, + "grad_norm": 0.11653932720834499, + "learning_rate": 0.0001271942123521103, + "loss": 0.67, + "step": 4809 + }, + { + "epoch": 0.4300786838340486, + "grad_norm": 0.13007710229096525, + "learning_rate": 0.0001271663425048677, + "loss": 0.6673, + "step": 4810 + }, + { + "epoch": 0.4301680972818312, + "grad_norm": 0.13989452659268978, + "learning_rate": 0.00012713847037921678, + "loss": 0.5981, + "step": 4811 + }, + { + "epoch": 0.4302575107296137, + "grad_norm": 0.15601307400677436, + "learning_rate": 0.00012711059597749513, + "loss": 0.7141, + "step": 4812 + }, + { + "epoch": 0.4303469241773963, + "grad_norm": 0.13850846091340233, + "learning_rate": 0.00012708271930204052, + "loss": 0.6685, + "step": 4813 + }, + { + "epoch": 0.4304363376251788, + "grad_norm": 0.13935389542627147, + "learning_rate": 0.00012705484035519096, + "loss": 0.6886, + "step": 4814 + }, + { + "epoch": 0.43052575107296137, + "grad_norm": 0.12755581819871656, + "learning_rate": 0.0001270269591392846, + "loss": 0.6706, + "step": 4815 + }, + { + "epoch": 0.43061516452074394, + "grad_norm": 0.12921829108798275, + "learning_rate": 0.00012699907565665982, + "loss": 0.6551, + "step": 4816 + }, + { + "epoch": 0.43070457796852646, + "grad_norm": 0.10088821712033602, + "learning_rate": 0.0001269711899096552, + "loss": 0.6109, + "step": 4817 + }, + { + "epoch": 0.43079399141630903, + "grad_norm": 0.1277923243909112, + "learning_rate": 0.0001269433019006094, + "loss": 0.672, + "step": 4818 + }, + { + "epoch": 0.43088340486409155, + "grad_norm": 0.11793540826116561, + "learning_rate": 0.00012691541163186148, + "loss": 0.6271, + "step": 4819 + }, + { + "epoch": 0.4309728183118741, + "grad_norm": 0.12121289043408502, + "learning_rate": 0.00012688751910575044, + "loss": 0.647, + "step": 4820 + }, + { + "epoch": 0.43106223175965663, + "grad_norm": 0.12166733264178499, + "learning_rate": 0.00012685962432461563, + "loss": 0.6705, + "step": 4821 + }, + { + "epoch": 0.4311516452074392, + "grad_norm": 0.12742741733884158, + "learning_rate": 0.00012683172729079662, + "loss": 0.6527, + "step": 4822 + }, + { + "epoch": 0.4312410586552217, + "grad_norm": 0.13853764531400017, + "learning_rate": 0.00012680382800663302, + "loss": 0.6949, + "step": 4823 + }, + { + "epoch": 0.4313304721030043, + "grad_norm": 0.13447729020101, + "learning_rate": 0.00012677592647446472, + "loss": 0.6887, + "step": 4824 + }, + { + "epoch": 0.43141988555078686, + "grad_norm": 0.12370286825905347, + "learning_rate": 0.0001267480226966318, + "loss": 0.6306, + "step": 4825 + }, + { + "epoch": 0.4315092989985694, + "grad_norm": 0.14052935973007985, + "learning_rate": 0.00012672011667547457, + "loss": 0.6684, + "step": 4826 + }, + { + "epoch": 0.43159871244635195, + "grad_norm": 0.14284900768116987, + "learning_rate": 0.0001266922084133334, + "loss": 0.6804, + "step": 4827 + }, + { + "epoch": 0.43168812589413447, + "grad_norm": 0.14048590938420055, + "learning_rate": 0.00012666429791254892, + "loss": 0.6804, + "step": 4828 + }, + { + "epoch": 0.43177753934191704, + "grad_norm": 0.11898578264137527, + "learning_rate": 0.000126636385175462, + "loss": 0.6397, + "step": 4829 + }, + { + "epoch": 0.43186695278969955, + "grad_norm": 0.12320309540054045, + "learning_rate": 0.00012660847020441363, + "loss": 0.6388, + "step": 4830 + }, + { + "epoch": 0.4319563662374821, + "grad_norm": 0.13140333129893542, + "learning_rate": 0.00012658055300174498, + "loss": 0.6604, + "step": 4831 + }, + { + "epoch": 0.43204577968526464, + "grad_norm": 0.13665556466581177, + "learning_rate": 0.00012655263356979747, + "loss": 0.6933, + "step": 4832 + }, + { + "epoch": 0.4321351931330472, + "grad_norm": 0.13596876853314302, + "learning_rate": 0.00012652471191091266, + "loss": 0.69, + "step": 4833 + }, + { + "epoch": 0.4322246065808298, + "grad_norm": 0.12921796987253548, + "learning_rate": 0.00012649678802743227, + "loss": 0.6549, + "step": 4834 + }, + { + "epoch": 0.4323140200286123, + "grad_norm": 0.11560399996413041, + "learning_rate": 0.00012646886192169826, + "loss": 0.6701, + "step": 4835 + }, + { + "epoch": 0.43240343347639487, + "grad_norm": 0.13166846323878995, + "learning_rate": 0.00012644093359605278, + "loss": 0.6784, + "step": 4836 + }, + { + "epoch": 0.4324928469241774, + "grad_norm": 0.12127456313749806, + "learning_rate": 0.00012641300305283814, + "loss": 0.6415, + "step": 4837 + }, + { + "epoch": 0.43258226037195996, + "grad_norm": 0.1307963989906328, + "learning_rate": 0.00012638507029439684, + "loss": 0.6717, + "step": 4838 + }, + { + "epoch": 0.4326716738197425, + "grad_norm": 0.1336063887896029, + "learning_rate": 0.00012635713532307152, + "loss": 0.6082, + "step": 4839 + }, + { + "epoch": 0.43276108726752505, + "grad_norm": 0.14070523002926538, + "learning_rate": 0.00012632919814120513, + "loss": 0.684, + "step": 4840 + }, + { + "epoch": 0.43285050071530756, + "grad_norm": 0.12881649623787575, + "learning_rate": 0.00012630125875114068, + "loss": 0.6714, + "step": 4841 + }, + { + "epoch": 0.43293991416309013, + "grad_norm": 0.1492934235039202, + "learning_rate": 0.00012627331715522143, + "loss": 0.6725, + "step": 4842 + }, + { + "epoch": 0.43302932761087265, + "grad_norm": 0.11320020851976305, + "learning_rate": 0.0001262453733557908, + "loss": 0.6504, + "step": 4843 + }, + { + "epoch": 0.4331187410586552, + "grad_norm": 0.1329096552860581, + "learning_rate": 0.00012621742735519239, + "loss": 0.6641, + "step": 4844 + }, + { + "epoch": 0.4332081545064378, + "grad_norm": 0.14080298306246472, + "learning_rate": 0.00012618947915577, + "loss": 0.6777, + "step": 4845 + }, + { + "epoch": 0.4332975679542203, + "grad_norm": 0.1316479498707785, + "learning_rate": 0.0001261615287598676, + "loss": 0.6695, + "step": 4846 + }, + { + "epoch": 0.4333869814020029, + "grad_norm": 0.14075425512783651, + "learning_rate": 0.00012613357616982946, + "loss": 0.6791, + "step": 4847 + }, + { + "epoch": 0.4334763948497854, + "grad_norm": 0.14767080016205547, + "learning_rate": 0.00012610562138799978, + "loss": 0.7009, + "step": 4848 + }, + { + "epoch": 0.43356580829756797, + "grad_norm": 0.12409097793445334, + "learning_rate": 0.00012607766441672318, + "loss": 0.6526, + "step": 4849 + }, + { + "epoch": 0.4336552217453505, + "grad_norm": 0.1329673533742714, + "learning_rate": 0.00012604970525834436, + "loss": 0.6846, + "step": 4850 + }, + { + "epoch": 0.43374463519313305, + "grad_norm": 0.12214033261987524, + "learning_rate": 0.00012602174391520822, + "loss": 0.6898, + "step": 4851 + }, + { + "epoch": 0.43383404864091557, + "grad_norm": 0.12053456872096158, + "learning_rate": 0.00012599378038965985, + "loss": 0.6544, + "step": 4852 + }, + { + "epoch": 0.43392346208869814, + "grad_norm": 0.1330337202310665, + "learning_rate": 0.00012596581468404453, + "loss": 0.6952, + "step": 4853 + }, + { + "epoch": 0.4340128755364807, + "grad_norm": 0.1413251164763547, + "learning_rate": 0.00012593784680070766, + "loss": 0.5925, + "step": 4854 + }, + { + "epoch": 0.43410228898426323, + "grad_norm": 0.1379487679029663, + "learning_rate": 0.0001259098767419949, + "loss": 0.6838, + "step": 4855 + }, + { + "epoch": 0.4341917024320458, + "grad_norm": 0.12019386240464854, + "learning_rate": 0.00012588190451025207, + "loss": 0.6782, + "step": 4856 + }, + { + "epoch": 0.4342811158798283, + "grad_norm": 0.12794185466037358, + "learning_rate": 0.0001258539301078252, + "loss": 0.6475, + "step": 4857 + }, + { + "epoch": 0.4343705293276109, + "grad_norm": 0.13747313514995113, + "learning_rate": 0.0001258259535370604, + "loss": 0.6832, + "step": 4858 + }, + { + "epoch": 0.4344599427753934, + "grad_norm": 0.12736673936870377, + "learning_rate": 0.00012579797480030406, + "loss": 0.6303, + "step": 4859 + }, + { + "epoch": 0.434549356223176, + "grad_norm": 0.15151350580272196, + "learning_rate": 0.00012576999389990278, + "loss": 0.6895, + "step": 4860 + }, + { + "epoch": 0.4346387696709585, + "grad_norm": 0.12998380744715904, + "learning_rate": 0.0001257420108382032, + "loss": 0.6892, + "step": 4861 + }, + { + "epoch": 0.43472818311874106, + "grad_norm": 0.1263913419156132, + "learning_rate": 0.00012571402561755227, + "loss": 0.6244, + "step": 4862 + }, + { + "epoch": 0.4348175965665236, + "grad_norm": 0.13020890943115704, + "learning_rate": 0.00012568603824029707, + "loss": 0.6614, + "step": 4863 + }, + { + "epoch": 0.43490701001430615, + "grad_norm": 0.11145908496746108, + "learning_rate": 0.00012565804870878484, + "loss": 0.6762, + "step": 4864 + }, + { + "epoch": 0.4349964234620887, + "grad_norm": 0.14187820193589507, + "learning_rate": 0.00012563005702536306, + "loss": 0.6964, + "step": 4865 + }, + { + "epoch": 0.43508583690987124, + "grad_norm": 0.1249170767105925, + "learning_rate": 0.00012560206319237936, + "loss": 0.6819, + "step": 4866 + }, + { + "epoch": 0.4351752503576538, + "grad_norm": 0.14431739549043757, + "learning_rate": 0.00012557406721218155, + "loss": 0.7061, + "step": 4867 + }, + { + "epoch": 0.4352646638054363, + "grad_norm": 0.12823891555924805, + "learning_rate": 0.00012554606908711757, + "loss": 0.7199, + "step": 4868 + }, + { + "epoch": 0.4353540772532189, + "grad_norm": 0.1297648405516651, + "learning_rate": 0.00012551806881953566, + "loss": 0.6632, + "step": 4869 + }, + { + "epoch": 0.4354434907010014, + "grad_norm": 0.11867990228692249, + "learning_rate": 0.0001254900664117841, + "loss": 0.6625, + "step": 4870 + }, + { + "epoch": 0.435532904148784, + "grad_norm": 0.13959797255688602, + "learning_rate": 0.0001254620618662115, + "loss": 0.7116, + "step": 4871 + }, + { + "epoch": 0.4356223175965665, + "grad_norm": 0.1220859112924097, + "learning_rate": 0.0001254340551851665, + "loss": 0.6695, + "step": 4872 + }, + { + "epoch": 0.43571173104434907, + "grad_norm": 0.1319589784987974, + "learning_rate": 0.000125406046370998, + "loss": 0.6662, + "step": 4873 + }, + { + "epoch": 0.43580114449213164, + "grad_norm": 0.13502124575338123, + "learning_rate": 0.00012537803542605512, + "loss": 0.6791, + "step": 4874 + }, + { + "epoch": 0.43589055793991416, + "grad_norm": 0.1258341639137474, + "learning_rate": 0.00012535002235268701, + "loss": 0.7025, + "step": 4875 + }, + { + "epoch": 0.43597997138769673, + "grad_norm": 0.14469384350358278, + "learning_rate": 0.00012532200715324317, + "loss": 0.6733, + "step": 4876 + }, + { + "epoch": 0.43606938483547925, + "grad_norm": 0.12893957679548151, + "learning_rate": 0.0001252939898300731, + "loss": 0.6136, + "step": 4877 + }, + { + "epoch": 0.4361587982832618, + "grad_norm": 0.13028409018422266, + "learning_rate": 0.0001252659703855267, + "loss": 0.664, + "step": 4878 + }, + { + "epoch": 0.43624821173104433, + "grad_norm": 0.13873813772027793, + "learning_rate": 0.00012523794882195391, + "loss": 0.6877, + "step": 4879 + }, + { + "epoch": 0.4363376251788269, + "grad_norm": 0.13536908488460145, + "learning_rate": 0.0001252099251417048, + "loss": 0.6619, + "step": 4880 + }, + { + "epoch": 0.4364270386266094, + "grad_norm": 0.11959204726701496, + "learning_rate": 0.00012518189934712973, + "loss": 0.6415, + "step": 4881 + }, + { + "epoch": 0.436516452074392, + "grad_norm": 0.11714636648528813, + "learning_rate": 0.00012515387144057915, + "loss": 0.6567, + "step": 4882 + }, + { + "epoch": 0.4366058655221745, + "grad_norm": 0.13273865466384815, + "learning_rate": 0.00012512584142440378, + "loss": 0.6521, + "step": 4883 + }, + { + "epoch": 0.4366952789699571, + "grad_norm": 0.1213972869918922, + "learning_rate": 0.00012509780930095442, + "loss": 0.6583, + "step": 4884 + }, + { + "epoch": 0.43678469241773965, + "grad_norm": 0.13417863141326117, + "learning_rate": 0.00012506977507258208, + "loss": 0.6494, + "step": 4885 + }, + { + "epoch": 0.43687410586552217, + "grad_norm": 0.1258016350087054, + "learning_rate": 0.000125041738741638, + "loss": 0.6592, + "step": 4886 + }, + { + "epoch": 0.43696351931330474, + "grad_norm": 0.11664078895031185, + "learning_rate": 0.00012501370031047356, + "loss": 0.6928, + "step": 4887 + }, + { + "epoch": 0.43705293276108725, + "grad_norm": 0.13839252628064588, + "learning_rate": 0.00012498565978144027, + "loss": 0.651, + "step": 4888 + }, + { + "epoch": 0.4371423462088698, + "grad_norm": 0.10918823383554112, + "learning_rate": 0.00012495761715688983, + "loss": 0.6261, + "step": 4889 + }, + { + "epoch": 0.43723175965665234, + "grad_norm": 0.13291851165686194, + "learning_rate": 0.00012492957243917424, + "loss": 0.6536, + "step": 4890 + }, + { + "epoch": 0.4373211731044349, + "grad_norm": 0.12732250474302234, + "learning_rate": 0.00012490152563064544, + "loss": 0.6738, + "step": 4891 + }, + { + "epoch": 0.43741058655221743, + "grad_norm": 0.1532877221463704, + "learning_rate": 0.00012487347673365582, + "loss": 0.6692, + "step": 4892 + }, + { + "epoch": 0.4375, + "grad_norm": 0.14610310762907458, + "learning_rate": 0.00012484542575055775, + "loss": 0.6689, + "step": 4893 + }, + { + "epoch": 0.43758941344778257, + "grad_norm": 0.12377531742788102, + "learning_rate": 0.0001248173726837038, + "loss": 0.651, + "step": 4894 + }, + { + "epoch": 0.4376788268955651, + "grad_norm": 0.12683698988459297, + "learning_rate": 0.00012478931753544676, + "loss": 0.6606, + "step": 4895 + }, + { + "epoch": 0.43776824034334766, + "grad_norm": 0.13230930957658918, + "learning_rate": 0.00012476126030813963, + "loss": 0.6623, + "step": 4896 + }, + { + "epoch": 0.4378576537911302, + "grad_norm": 0.1313158512177868, + "learning_rate": 0.0001247332010041355, + "loss": 0.668, + "step": 4897 + }, + { + "epoch": 0.43794706723891275, + "grad_norm": 0.13724034013619538, + "learning_rate": 0.00012470513962578768, + "loss": 0.6841, + "step": 4898 + }, + { + "epoch": 0.43803648068669526, + "grad_norm": 0.12513764866407598, + "learning_rate": 0.0001246770761754496, + "loss": 0.6422, + "step": 4899 + }, + { + "epoch": 0.43812589413447783, + "grad_norm": 0.1312970151912867, + "learning_rate": 0.000124649010655475, + "loss": 0.6692, + "step": 4900 + }, + { + "epoch": 0.43821530758226035, + "grad_norm": 0.14280568089848567, + "learning_rate": 0.00012462094306821758, + "loss": 0.6402, + "step": 4901 + }, + { + "epoch": 0.4383047210300429, + "grad_norm": 0.14129093301826706, + "learning_rate": 0.0001245928734160314, + "loss": 0.7209, + "step": 4902 + }, + { + "epoch": 0.4383941344778255, + "grad_norm": 0.12090864633348064, + "learning_rate": 0.00012456480170127069, + "loss": 0.6788, + "step": 4903 + }, + { + "epoch": 0.438483547925608, + "grad_norm": 0.13747895088407924, + "learning_rate": 0.00012453672792628968, + "loss": 0.6689, + "step": 4904 + }, + { + "epoch": 0.4385729613733906, + "grad_norm": 0.13626359593721726, + "learning_rate": 0.00012450865209344294, + "loss": 0.643, + "step": 4905 + }, + { + "epoch": 0.4386623748211731, + "grad_norm": 0.13030995619065555, + "learning_rate": 0.00012448057420508517, + "loss": 0.689, + "step": 4906 + }, + { + "epoch": 0.43875178826895567, + "grad_norm": 0.1186181048546069, + "learning_rate": 0.0001244524942635712, + "loss": 0.6558, + "step": 4907 + }, + { + "epoch": 0.4388412017167382, + "grad_norm": 0.12246663479779822, + "learning_rate": 0.00012442441227125602, + "loss": 0.6459, + "step": 4908 + }, + { + "epoch": 0.43893061516452075, + "grad_norm": 0.14636699712366527, + "learning_rate": 0.00012439632823049493, + "loss": 0.6998, + "step": 4909 + }, + { + "epoch": 0.43902002861230327, + "grad_norm": 0.1069480876500811, + "learning_rate": 0.00012436824214364324, + "loss": 0.6452, + "step": 4910 + }, + { + "epoch": 0.43910944206008584, + "grad_norm": 0.1418399979245449, + "learning_rate": 0.00012434015401305653, + "loss": 0.6959, + "step": 4911 + }, + { + "epoch": 0.43919885550786836, + "grad_norm": 0.14086988123444696, + "learning_rate": 0.00012431206384109044, + "loss": 0.6647, + "step": 4912 + }, + { + "epoch": 0.43928826895565093, + "grad_norm": 0.16125636032247243, + "learning_rate": 0.00012428397163010096, + "loss": 0.7038, + "step": 4913 + }, + { + "epoch": 0.4393776824034335, + "grad_norm": 0.13591032926410357, + "learning_rate": 0.00012425587738244413, + "loss": 0.6659, + "step": 4914 + }, + { + "epoch": 0.439467095851216, + "grad_norm": 0.13590137660225868, + "learning_rate": 0.00012422778110047613, + "loss": 0.672, + "step": 4915 + }, + { + "epoch": 0.4395565092989986, + "grad_norm": 0.13771993213079906, + "learning_rate": 0.0001241996827865534, + "loss": 0.6665, + "step": 4916 + }, + { + "epoch": 0.4396459227467811, + "grad_norm": 0.12424025049022144, + "learning_rate": 0.00012417158244303249, + "loss": 0.6617, + "step": 4917 + }, + { + "epoch": 0.4397353361945637, + "grad_norm": 0.1325041177846594, + "learning_rate": 0.00012414348007227014, + "loss": 0.6521, + "step": 4918 + }, + { + "epoch": 0.4398247496423462, + "grad_norm": 0.13671151694767777, + "learning_rate": 0.00012411537567662327, + "loss": 0.671, + "step": 4919 + }, + { + "epoch": 0.43991416309012876, + "grad_norm": 0.12365450933735757, + "learning_rate": 0.000124087269258449, + "loss": 0.6557, + "step": 4920 + }, + { + "epoch": 0.4400035765379113, + "grad_norm": 0.12942468110348002, + "learning_rate": 0.00012405916082010456, + "loss": 0.6366, + "step": 4921 + }, + { + "epoch": 0.44009298998569385, + "grad_norm": 0.1300257055420477, + "learning_rate": 0.00012403105036394728, + "loss": 0.6859, + "step": 4922 + }, + { + "epoch": 0.4401824034334764, + "grad_norm": 0.13171916445513815, + "learning_rate": 0.0001240029378923349, + "loss": 0.5821, + "step": 4923 + }, + { + "epoch": 0.44027181688125894, + "grad_norm": 0.13351022329776888, + "learning_rate": 0.0001239748234076251, + "loss": 0.6947, + "step": 4924 + }, + { + "epoch": 0.4403612303290415, + "grad_norm": 0.14088774444894311, + "learning_rate": 0.0001239467069121758, + "loss": 0.6608, + "step": 4925 + }, + { + "epoch": 0.440450643776824, + "grad_norm": 0.13959529877354276, + "learning_rate": 0.0001239185884083451, + "loss": 0.6578, + "step": 4926 + }, + { + "epoch": 0.4405400572246066, + "grad_norm": 0.13515230484509844, + "learning_rate": 0.00012389046789849128, + "loss": 0.6835, + "step": 4927 + }, + { + "epoch": 0.4406294706723891, + "grad_norm": 0.12178430814100254, + "learning_rate": 0.00012386234538497282, + "loss": 0.6336, + "step": 4928 + }, + { + "epoch": 0.4407188841201717, + "grad_norm": 0.1286100654936146, + "learning_rate": 0.00012383422087014817, + "loss": 0.6494, + "step": 4929 + }, + { + "epoch": 0.4408082975679542, + "grad_norm": 0.13939367994567323, + "learning_rate": 0.00012380609435637627, + "loss": 0.622, + "step": 4930 + }, + { + "epoch": 0.44089771101573677, + "grad_norm": 0.13622182395515944, + "learning_rate": 0.000123777965846016, + "loss": 0.6756, + "step": 4931 + }, + { + "epoch": 0.4409871244635193, + "grad_norm": 0.13134724502935, + "learning_rate": 0.0001237498353414264, + "loss": 0.6874, + "step": 4932 + }, + { + "epoch": 0.44107653791130186, + "grad_norm": 0.1401590766645218, + "learning_rate": 0.00012372170284496683, + "loss": 0.7163, + "step": 4933 + }, + { + "epoch": 0.44116595135908443, + "grad_norm": 0.1252956671401417, + "learning_rate": 0.0001236935683589967, + "loss": 0.6706, + "step": 4934 + }, + { + "epoch": 0.44125536480686695, + "grad_norm": 0.15470857677890718, + "learning_rate": 0.00012366543188587555, + "loss": 0.6721, + "step": 4935 + }, + { + "epoch": 0.4413447782546495, + "grad_norm": 0.14270526953077334, + "learning_rate": 0.00012363729342796325, + "loss": 0.7032, + "step": 4936 + }, + { + "epoch": 0.44143419170243203, + "grad_norm": 0.13058439849714298, + "learning_rate": 0.0001236091529876197, + "loss": 0.6493, + "step": 4937 + }, + { + "epoch": 0.4415236051502146, + "grad_norm": 0.14306573282731144, + "learning_rate": 0.000123581010567205, + "loss": 0.6867, + "step": 4938 + }, + { + "epoch": 0.4416130185979971, + "grad_norm": 0.11407641981474496, + "learning_rate": 0.00012355286616907939, + "loss": 0.6433, + "step": 4939 + }, + { + "epoch": 0.4417024320457797, + "grad_norm": 0.12777880182092521, + "learning_rate": 0.00012352471979560338, + "loss": 0.6827, + "step": 4940 + }, + { + "epoch": 0.4417918454935622, + "grad_norm": 0.14456987878023572, + "learning_rate": 0.00012349657144913753, + "loss": 0.671, + "step": 4941 + }, + { + "epoch": 0.4418812589413448, + "grad_norm": 0.12680286366109048, + "learning_rate": 0.00012346842113204257, + "loss": 0.6982, + "step": 4942 + }, + { + "epoch": 0.44197067238912735, + "grad_norm": 0.11776489410338127, + "learning_rate": 0.0001234402688466795, + "loss": 0.6703, + "step": 4943 + }, + { + "epoch": 0.44206008583690987, + "grad_norm": 0.1388534542484837, + "learning_rate": 0.0001234121145954094, + "loss": 0.6705, + "step": 4944 + }, + { + "epoch": 0.44214949928469244, + "grad_norm": 0.12758677871558605, + "learning_rate": 0.00012338395838059352, + "loss": 0.6504, + "step": 4945 + }, + { + "epoch": 0.44223891273247495, + "grad_norm": 0.11487574364565041, + "learning_rate": 0.00012335580020459325, + "loss": 0.6445, + "step": 4946 + }, + { + "epoch": 0.4423283261802575, + "grad_norm": 0.1342349678758515, + "learning_rate": 0.00012332764006977028, + "loss": 0.6694, + "step": 4947 + }, + { + "epoch": 0.44241773962804004, + "grad_norm": 0.1459095658884452, + "learning_rate": 0.0001232994779784863, + "loss": 0.6668, + "step": 4948 + }, + { + "epoch": 0.4425071530758226, + "grad_norm": 0.13692571935005532, + "learning_rate": 0.0001232713139331032, + "loss": 0.668, + "step": 4949 + }, + { + "epoch": 0.44259656652360513, + "grad_norm": 0.14601405364342293, + "learning_rate": 0.00012324314793598314, + "loss": 0.6732, + "step": 4950 + }, + { + "epoch": 0.4426859799713877, + "grad_norm": 0.13375039968441185, + "learning_rate": 0.00012321497998948834, + "loss": 0.6727, + "step": 4951 + }, + { + "epoch": 0.4427753934191702, + "grad_norm": 0.13441387959113996, + "learning_rate": 0.00012318681009598116, + "loss": 0.6496, + "step": 4952 + }, + { + "epoch": 0.4428648068669528, + "grad_norm": 0.11362058626348051, + "learning_rate": 0.00012315863825782425, + "loss": 0.6408, + "step": 4953 + }, + { + "epoch": 0.44295422031473536, + "grad_norm": 0.13158081217936965, + "learning_rate": 0.00012313046447738035, + "loss": 0.6934, + "step": 4954 + }, + { + "epoch": 0.4430436337625179, + "grad_norm": 0.13426443361721652, + "learning_rate": 0.0001231022887570123, + "loss": 0.6676, + "step": 4955 + }, + { + "epoch": 0.44313304721030045, + "grad_norm": 0.13413930547798902, + "learning_rate": 0.00012307411109908315, + "loss": 0.6759, + "step": 4956 + }, + { + "epoch": 0.44322246065808296, + "grad_norm": 0.11879127573369962, + "learning_rate": 0.00012304593150595623, + "loss": 0.6683, + "step": 4957 + }, + { + "epoch": 0.44331187410586553, + "grad_norm": 0.12096516068184467, + "learning_rate": 0.00012301774997999483, + "loss": 0.6566, + "step": 4958 + }, + { + "epoch": 0.44340128755364805, + "grad_norm": 0.13206331390750053, + "learning_rate": 0.00012298956652356257, + "loss": 0.6765, + "step": 4959 + }, + { + "epoch": 0.4434907010014306, + "grad_norm": 0.14105276981105525, + "learning_rate": 0.00012296138113902308, + "loss": 0.6838, + "step": 4960 + }, + { + "epoch": 0.44358011444921314, + "grad_norm": 0.13092052756358477, + "learning_rate": 0.00012293319382874037, + "loss": 0.6663, + "step": 4961 + }, + { + "epoch": 0.4436695278969957, + "grad_norm": 0.13318711690485863, + "learning_rate": 0.0001229050045950783, + "loss": 0.6599, + "step": 4962 + }, + { + "epoch": 0.4437589413447783, + "grad_norm": 0.13417430966407348, + "learning_rate": 0.00012287681344040117, + "loss": 0.6818, + "step": 4963 + }, + { + "epoch": 0.4438483547925608, + "grad_norm": 0.12407942944318716, + "learning_rate": 0.00012284862036707339, + "loss": 0.6598, + "step": 4964 + }, + { + "epoch": 0.44393776824034337, + "grad_norm": 0.12889593161196505, + "learning_rate": 0.00012282042537745938, + "loss": 0.6388, + "step": 4965 + }, + { + "epoch": 0.4440271816881259, + "grad_norm": 0.13827720295957704, + "learning_rate": 0.00012279222847392385, + "loss": 0.6552, + "step": 4966 + }, + { + "epoch": 0.44411659513590845, + "grad_norm": 0.1352952893170182, + "learning_rate": 0.0001227640296588316, + "loss": 0.6524, + "step": 4967 + }, + { + "epoch": 0.44420600858369097, + "grad_norm": 0.14369736596280594, + "learning_rate": 0.00012273582893454775, + "loss": 0.6809, + "step": 4968 + }, + { + "epoch": 0.44429542203147354, + "grad_norm": 0.1552364120897419, + "learning_rate": 0.00012270762630343734, + "loss": 0.6955, + "step": 4969 + }, + { + "epoch": 0.44438483547925606, + "grad_norm": 0.14438245051365242, + "learning_rate": 0.00012267942176786575, + "loss": 0.7011, + "step": 4970 + }, + { + "epoch": 0.44447424892703863, + "grad_norm": 0.1144107253172833, + "learning_rate": 0.00012265121533019843, + "loss": 0.6363, + "step": 4971 + }, + { + "epoch": 0.4445636623748212, + "grad_norm": 0.11896531587610609, + "learning_rate": 0.00012262300699280104, + "loss": 0.6335, + "step": 4972 + }, + { + "epoch": 0.4446530758226037, + "grad_norm": 0.13784512000568916, + "learning_rate": 0.00012259479675803939, + "loss": 0.6938, + "step": 4973 + }, + { + "epoch": 0.4447424892703863, + "grad_norm": 0.12373126237026302, + "learning_rate": 0.00012256658462827941, + "loss": 0.6634, + "step": 4974 + }, + { + "epoch": 0.4448319027181688, + "grad_norm": 0.13106547455593992, + "learning_rate": 0.00012253837060588723, + "loss": 0.6888, + "step": 4975 + }, + { + "epoch": 0.4449213161659514, + "grad_norm": 0.13122463532232756, + "learning_rate": 0.00012251015469322916, + "loss": 0.6771, + "step": 4976 + }, + { + "epoch": 0.4450107296137339, + "grad_norm": 0.12490298636751186, + "learning_rate": 0.00012248193689267157, + "loss": 0.6756, + "step": 4977 + }, + { + "epoch": 0.44510014306151646, + "grad_norm": 0.12766087416869795, + "learning_rate": 0.00012245371720658109, + "loss": 0.6618, + "step": 4978 + }, + { + "epoch": 0.445189556509299, + "grad_norm": 0.1212661524821833, + "learning_rate": 0.00012242549563732443, + "loss": 0.6454, + "step": 4979 + }, + { + "epoch": 0.44527896995708155, + "grad_norm": 0.12755297384134162, + "learning_rate": 0.00012239727218726857, + "loss": 0.5817, + "step": 4980 + }, + { + "epoch": 0.44536838340486407, + "grad_norm": 0.13306956747102566, + "learning_rate": 0.00012236904685878055, + "loss": 0.6641, + "step": 4981 + }, + { + "epoch": 0.44545779685264664, + "grad_norm": 0.1360807994511009, + "learning_rate": 0.0001223408196542276, + "loss": 0.6356, + "step": 4982 + }, + { + "epoch": 0.4455472103004292, + "grad_norm": 0.12824022597244383, + "learning_rate": 0.00012231259057597703, + "loss": 0.6829, + "step": 4983 + }, + { + "epoch": 0.4456366237482117, + "grad_norm": 0.13289436205956878, + "learning_rate": 0.00012228435962639646, + "loss": 0.6973, + "step": 4984 + }, + { + "epoch": 0.4457260371959943, + "grad_norm": 0.14149144425003898, + "learning_rate": 0.00012225612680785358, + "loss": 0.6711, + "step": 4985 + }, + { + "epoch": 0.4458154506437768, + "grad_norm": 0.11917130781010493, + "learning_rate": 0.0001222278921227162, + "loss": 0.6667, + "step": 4986 + }, + { + "epoch": 0.4459048640915594, + "grad_norm": 0.1254314925058195, + "learning_rate": 0.00012219965557335236, + "loss": 0.6618, + "step": 4987 + }, + { + "epoch": 0.4459942775393419, + "grad_norm": 0.146609341910149, + "learning_rate": 0.00012217141716213022, + "loss": 0.6825, + "step": 4988 + }, + { + "epoch": 0.44608369098712447, + "grad_norm": 0.1307676420939781, + "learning_rate": 0.0001221431768914181, + "loss": 0.6184, + "step": 4989 + }, + { + "epoch": 0.446173104434907, + "grad_norm": 0.14764031208420877, + "learning_rate": 0.00012211493476358448, + "loss": 0.6945, + "step": 4990 + }, + { + "epoch": 0.44626251788268956, + "grad_norm": 0.130133155788629, + "learning_rate": 0.00012208669078099798, + "loss": 0.6973, + "step": 4991 + }, + { + "epoch": 0.44635193133047213, + "grad_norm": 0.12687642583230097, + "learning_rate": 0.0001220584449460274, + "loss": 0.6588, + "step": 4992 + }, + { + "epoch": 0.44644134477825465, + "grad_norm": 0.13608231852075128, + "learning_rate": 0.00012203019726104168, + "loss": 0.6703, + "step": 4993 + }, + { + "epoch": 0.4465307582260372, + "grad_norm": 0.11214617471745364, + "learning_rate": 0.00012200194772840991, + "loss": 0.6363, + "step": 4994 + }, + { + "epoch": 0.44662017167381973, + "grad_norm": 0.13989175468146195, + "learning_rate": 0.0001219736963505014, + "loss": 0.681, + "step": 4995 + }, + { + "epoch": 0.4467095851216023, + "grad_norm": 0.13220894655094653, + "learning_rate": 0.00012194544312968548, + "loss": 0.6602, + "step": 4996 + }, + { + "epoch": 0.4467989985693848, + "grad_norm": 0.13900901599432516, + "learning_rate": 0.00012191718806833177, + "loss": 0.6544, + "step": 4997 + }, + { + "epoch": 0.4468884120171674, + "grad_norm": 0.13593494659362357, + "learning_rate": 0.00012188893116880993, + "loss": 0.6259, + "step": 4998 + }, + { + "epoch": 0.4469778254649499, + "grad_norm": 0.14712025949100294, + "learning_rate": 0.00012186067243348991, + "loss": 0.7124, + "step": 4999 + }, + { + "epoch": 0.4470672389127325, + "grad_norm": 0.13069870267970868, + "learning_rate": 0.00012183241186474166, + "loss": 0.6706, + "step": 5000 + }, + { + "epoch": 0.447156652360515, + "grad_norm": 0.16480613197526434, + "learning_rate": 0.00012180414946493538, + "loss": 0.6918, + "step": 5001 + }, + { + "epoch": 0.44724606580829757, + "grad_norm": 0.11625285171748366, + "learning_rate": 0.00012177588523644146, + "loss": 0.6599, + "step": 5002 + }, + { + "epoch": 0.44733547925608014, + "grad_norm": 0.11290024462702626, + "learning_rate": 0.00012174761918163034, + "loss": 0.6406, + "step": 5003 + }, + { + "epoch": 0.44742489270386265, + "grad_norm": 0.14301927537490716, + "learning_rate": 0.00012171935130287265, + "loss": 0.6493, + "step": 5004 + }, + { + "epoch": 0.4475143061516452, + "grad_norm": 0.13708327983310106, + "learning_rate": 0.00012169108160253919, + "loss": 0.645, + "step": 5005 + }, + { + "epoch": 0.44760371959942774, + "grad_norm": 0.131751849229147, + "learning_rate": 0.00012166281008300094, + "loss": 0.6927, + "step": 5006 + }, + { + "epoch": 0.4476931330472103, + "grad_norm": 0.11874532891622093, + "learning_rate": 0.00012163453674662892, + "loss": 0.6642, + "step": 5007 + }, + { + "epoch": 0.44778254649499283, + "grad_norm": 0.1259260125015255, + "learning_rate": 0.00012160626159579447, + "loss": 0.6422, + "step": 5008 + }, + { + "epoch": 0.4478719599427754, + "grad_norm": 0.12974269084831536, + "learning_rate": 0.00012157798463286894, + "loss": 0.6609, + "step": 5009 + }, + { + "epoch": 0.4479613733905579, + "grad_norm": 0.11798915943841709, + "learning_rate": 0.00012154970586022389, + "loss": 0.672, + "step": 5010 + }, + { + "epoch": 0.4480507868383405, + "grad_norm": 0.1223361591196809, + "learning_rate": 0.00012152142528023107, + "loss": 0.6428, + "step": 5011 + }, + { + "epoch": 0.44814020028612306, + "grad_norm": 0.13731190518097294, + "learning_rate": 0.00012149314289526228, + "loss": 0.6582, + "step": 5012 + }, + { + "epoch": 0.4482296137339056, + "grad_norm": 0.1344180500102291, + "learning_rate": 0.00012146485870768954, + "loss": 0.6328, + "step": 5013 + }, + { + "epoch": 0.44831902718168815, + "grad_norm": 0.13331642593828896, + "learning_rate": 0.00012143657271988505, + "loss": 0.6873, + "step": 5014 + }, + { + "epoch": 0.44840844062947066, + "grad_norm": 0.1243968441135307, + "learning_rate": 0.00012140828493422107, + "loss": 0.6492, + "step": 5015 + }, + { + "epoch": 0.44849785407725323, + "grad_norm": 0.1289625888385462, + "learning_rate": 0.0001213799953530701, + "loss": 0.6473, + "step": 5016 + }, + { + "epoch": 0.44858726752503575, + "grad_norm": 0.13682745656448242, + "learning_rate": 0.00012135170397880473, + "loss": 0.6802, + "step": 5017 + }, + { + "epoch": 0.4486766809728183, + "grad_norm": 0.13155459604541483, + "learning_rate": 0.00012132341081379776, + "loss": 0.5911, + "step": 5018 + }, + { + "epoch": 0.44876609442060084, + "grad_norm": 0.15407804732616684, + "learning_rate": 0.00012129511586042206, + "loss": 0.6937, + "step": 5019 + }, + { + "epoch": 0.4488555078683834, + "grad_norm": 0.1250985076218926, + "learning_rate": 0.00012126681912105069, + "loss": 0.5703, + "step": 5020 + }, + { + "epoch": 0.448944921316166, + "grad_norm": 0.14418577685884654, + "learning_rate": 0.00012123852059805691, + "loss": 0.6665, + "step": 5021 + }, + { + "epoch": 0.4490343347639485, + "grad_norm": 0.12715189277442193, + "learning_rate": 0.00012121022029381406, + "loss": 0.6431, + "step": 5022 + }, + { + "epoch": 0.44912374821173107, + "grad_norm": 0.12359634423035853, + "learning_rate": 0.00012118191821069565, + "loss": 0.6366, + "step": 5023 + }, + { + "epoch": 0.4492131616595136, + "grad_norm": 0.10718356968802029, + "learning_rate": 0.00012115361435107531, + "loss": 0.6462, + "step": 5024 + }, + { + "epoch": 0.44930257510729615, + "grad_norm": 0.12206766101601195, + "learning_rate": 0.0001211253087173269, + "loss": 0.6599, + "step": 5025 + }, + { + "epoch": 0.44939198855507867, + "grad_norm": 0.1381000991729019, + "learning_rate": 0.00012109700131182437, + "loss": 0.684, + "step": 5026 + }, + { + "epoch": 0.44948140200286124, + "grad_norm": 0.12109618485417616, + "learning_rate": 0.00012106869213694181, + "loss": 0.6635, + "step": 5027 + }, + { + "epoch": 0.44957081545064376, + "grad_norm": 0.11951278292030514, + "learning_rate": 0.0001210403811950535, + "loss": 0.6393, + "step": 5028 + }, + { + "epoch": 0.44966022889842633, + "grad_norm": 0.1369244485227592, + "learning_rate": 0.00012101206848853381, + "loss": 0.6598, + "step": 5029 + }, + { + "epoch": 0.44974964234620884, + "grad_norm": 0.11933125897410585, + "learning_rate": 0.00012098375401975731, + "loss": 0.6635, + "step": 5030 + }, + { + "epoch": 0.4498390557939914, + "grad_norm": 0.13093731570399783, + "learning_rate": 0.00012095543779109873, + "loss": 0.6575, + "step": 5031 + }, + { + "epoch": 0.449928469241774, + "grad_norm": 0.12031510720849566, + "learning_rate": 0.00012092711980493285, + "loss": 0.6396, + "step": 5032 + }, + { + "epoch": 0.4500178826895565, + "grad_norm": 0.12204711623761129, + "learning_rate": 0.00012089880006363475, + "loss": 0.6304, + "step": 5033 + }, + { + "epoch": 0.4501072961373391, + "grad_norm": 0.1380145493637176, + "learning_rate": 0.00012087047856957949, + "loss": 0.6957, + "step": 5034 + }, + { + "epoch": 0.4501967095851216, + "grad_norm": 0.1612661412138124, + "learning_rate": 0.00012084215532514243, + "loss": 0.7078, + "step": 5035 + }, + { + "epoch": 0.45028612303290416, + "grad_norm": 0.13887519608596313, + "learning_rate": 0.00012081383033269896, + "loss": 0.6683, + "step": 5036 + }, + { + "epoch": 0.4503755364806867, + "grad_norm": 0.1208736898874662, + "learning_rate": 0.00012078550359462464, + "loss": 0.6302, + "step": 5037 + }, + { + "epoch": 0.45046494992846925, + "grad_norm": 0.13426206635633342, + "learning_rate": 0.00012075717511329529, + "loss": 0.6849, + "step": 5038 + }, + { + "epoch": 0.45055436337625177, + "grad_norm": 0.13711906515401015, + "learning_rate": 0.00012072884489108669, + "loss": 0.6405, + "step": 5039 + }, + { + "epoch": 0.45064377682403434, + "grad_norm": 0.12305861809484434, + "learning_rate": 0.00012070051293037492, + "loss": 0.6396, + "step": 5040 + }, + { + "epoch": 0.4507331902718169, + "grad_norm": 0.1333256016912405, + "learning_rate": 0.00012067217923353615, + "loss": 0.6806, + "step": 5041 + }, + { + "epoch": 0.4508226037195994, + "grad_norm": 0.1241670254150715, + "learning_rate": 0.00012064384380294667, + "loss": 0.6785, + "step": 5042 + }, + { + "epoch": 0.450912017167382, + "grad_norm": 0.14024714083110765, + "learning_rate": 0.00012061550664098293, + "loss": 0.6463, + "step": 5043 + }, + { + "epoch": 0.4510014306151645, + "grad_norm": 0.12717851756158557, + "learning_rate": 0.00012058716775002152, + "loss": 0.6422, + "step": 5044 + }, + { + "epoch": 0.4510908440629471, + "grad_norm": 0.13212018380574977, + "learning_rate": 0.00012055882713243926, + "loss": 0.6441, + "step": 5045 + }, + { + "epoch": 0.4511802575107296, + "grad_norm": 0.11085667512363975, + "learning_rate": 0.00012053048479061298, + "loss": 0.6501, + "step": 5046 + }, + { + "epoch": 0.45126967095851217, + "grad_norm": 0.1618004252230536, + "learning_rate": 0.00012050214072691969, + "loss": 0.6616, + "step": 5047 + }, + { + "epoch": 0.4513590844062947, + "grad_norm": 0.12462302085039458, + "learning_rate": 0.0001204737949437367, + "loss": 0.7154, + "step": 5048 + }, + { + "epoch": 0.45144849785407726, + "grad_norm": 0.12030754349109561, + "learning_rate": 0.00012044544744344122, + "loss": 0.6191, + "step": 5049 + }, + { + "epoch": 0.4515379113018598, + "grad_norm": 0.1166839813018082, + "learning_rate": 0.00012041709822841074, + "loss": 0.6393, + "step": 5050 + }, + { + "epoch": 0.45162732474964234, + "grad_norm": 0.12943338013546263, + "learning_rate": 0.00012038874730102288, + "loss": 0.6401, + "step": 5051 + }, + { + "epoch": 0.4517167381974249, + "grad_norm": 0.14368759044354668, + "learning_rate": 0.00012036039466365543, + "loss": 0.7039, + "step": 5052 + }, + { + "epoch": 0.45180615164520743, + "grad_norm": 0.1233352511357767, + "learning_rate": 0.00012033204031868626, + "loss": 0.6822, + "step": 5053 + }, + { + "epoch": 0.45189556509299, + "grad_norm": 0.13089919343713485, + "learning_rate": 0.0001203036842684934, + "loss": 0.6722, + "step": 5054 + }, + { + "epoch": 0.4519849785407725, + "grad_norm": 0.1178495499918086, + "learning_rate": 0.00012027532651545512, + "loss": 0.6296, + "step": 5055 + }, + { + "epoch": 0.4520743919885551, + "grad_norm": 0.13627529987744513, + "learning_rate": 0.00012024696706194967, + "loss": 0.6591, + "step": 5056 + }, + { + "epoch": 0.4521638054363376, + "grad_norm": 0.12702606443330008, + "learning_rate": 0.00012021860591035549, + "loss": 0.6516, + "step": 5057 + }, + { + "epoch": 0.4522532188841202, + "grad_norm": 0.14469979201522867, + "learning_rate": 0.00012019024306305131, + "loss": 0.6832, + "step": 5058 + }, + { + "epoch": 0.4523426323319027, + "grad_norm": 0.13029606735510002, + "learning_rate": 0.00012016187852241583, + "loss": 0.6166, + "step": 5059 + }, + { + "epoch": 0.45243204577968527, + "grad_norm": 0.13420688269443212, + "learning_rate": 0.00012013351229082792, + "loss": 0.7094, + "step": 5060 + }, + { + "epoch": 0.45252145922746784, + "grad_norm": 0.1539623125875664, + "learning_rate": 0.00012010514437066664, + "loss": 0.7146, + "step": 5061 + }, + { + "epoch": 0.45261087267525035, + "grad_norm": 0.12313782401741918, + "learning_rate": 0.00012007677476431123, + "loss": 0.6598, + "step": 5062 + }, + { + "epoch": 0.4527002861230329, + "grad_norm": 0.11417079915381419, + "learning_rate": 0.00012004840347414092, + "loss": 0.67, + "step": 5063 + }, + { + "epoch": 0.45278969957081544, + "grad_norm": 0.1292991001899118, + "learning_rate": 0.00012002003050253522, + "loss": 0.6508, + "step": 5064 + }, + { + "epoch": 0.452879113018598, + "grad_norm": 0.11933562304905082, + "learning_rate": 0.0001199916558518738, + "loss": 0.6715, + "step": 5065 + }, + { + "epoch": 0.4529685264663805, + "grad_norm": 0.12912473231549898, + "learning_rate": 0.00011996327952453629, + "loss": 0.6547, + "step": 5066 + }, + { + "epoch": 0.4530579399141631, + "grad_norm": 0.11876435049376402, + "learning_rate": 0.00011993490152290266, + "loss": 0.6534, + "step": 5067 + }, + { + "epoch": 0.4531473533619456, + "grad_norm": 0.12034323372281064, + "learning_rate": 0.00011990652184935289, + "loss": 0.6487, + "step": 5068 + }, + { + "epoch": 0.4532367668097282, + "grad_norm": 0.14002624410207604, + "learning_rate": 0.00011987814050626722, + "loss": 0.7001, + "step": 5069 + }, + { + "epoch": 0.4533261802575107, + "grad_norm": 0.12692062218864136, + "learning_rate": 0.00011984975749602588, + "loss": 0.6421, + "step": 5070 + }, + { + "epoch": 0.4534155937052933, + "grad_norm": 0.11240565613180427, + "learning_rate": 0.00011982137282100934, + "loss": 0.6511, + "step": 5071 + }, + { + "epoch": 0.45350500715307585, + "grad_norm": 0.13477878974415325, + "learning_rate": 0.00011979298648359823, + "loss": 0.6622, + "step": 5072 + }, + { + "epoch": 0.45359442060085836, + "grad_norm": 0.1416336984880435, + "learning_rate": 0.00011976459848617323, + "loss": 0.7137, + "step": 5073 + }, + { + "epoch": 0.45368383404864093, + "grad_norm": 0.12578654792824037, + "learning_rate": 0.00011973620883111521, + "loss": 0.6511, + "step": 5074 + }, + { + "epoch": 0.45377324749642345, + "grad_norm": 0.12448197141196625, + "learning_rate": 0.00011970781752080523, + "loss": 0.6432, + "step": 5075 + }, + { + "epoch": 0.453862660944206, + "grad_norm": 0.12367705966986674, + "learning_rate": 0.00011967942455762437, + "loss": 0.6523, + "step": 5076 + }, + { + "epoch": 0.45395207439198854, + "grad_norm": 0.14472418239217374, + "learning_rate": 0.00011965102994395394, + "loss": 0.6946, + "step": 5077 + }, + { + "epoch": 0.4540414878397711, + "grad_norm": 0.12954155801882747, + "learning_rate": 0.00011962263368217535, + "loss": 0.6388, + "step": 5078 + }, + { + "epoch": 0.4541309012875536, + "grad_norm": 0.13411185923822244, + "learning_rate": 0.0001195942357746702, + "loss": 0.6468, + "step": 5079 + }, + { + "epoch": 0.4542203147353362, + "grad_norm": 0.1353648130492168, + "learning_rate": 0.00011956583622382015, + "loss": 0.6979, + "step": 5080 + }, + { + "epoch": 0.45430972818311877, + "grad_norm": 0.11998544625044429, + "learning_rate": 0.000119537435032007, + "loss": 0.6352, + "step": 5081 + }, + { + "epoch": 0.4543991416309013, + "grad_norm": 0.13413875634906494, + "learning_rate": 0.00011950903220161285, + "loss": 0.6668, + "step": 5082 + }, + { + "epoch": 0.45448855507868385, + "grad_norm": 0.1328520592384789, + "learning_rate": 0.00011948062773501969, + "loss": 0.6261, + "step": 5083 + }, + { + "epoch": 0.45457796852646637, + "grad_norm": 0.13168914729065084, + "learning_rate": 0.00011945222163460979, + "loss": 0.6707, + "step": 5084 + }, + { + "epoch": 0.45466738197424894, + "grad_norm": 0.14233874034341704, + "learning_rate": 0.00011942381390276556, + "loss": 0.6553, + "step": 5085 + }, + { + "epoch": 0.45475679542203146, + "grad_norm": 0.12268069690796748, + "learning_rate": 0.00011939540454186954, + "loss": 0.6563, + "step": 5086 + }, + { + "epoch": 0.45484620886981403, + "grad_norm": 0.12157338319077214, + "learning_rate": 0.00011936699355430436, + "loss": 0.6302, + "step": 5087 + }, + { + "epoch": 0.45493562231759654, + "grad_norm": 0.12596973808085923, + "learning_rate": 0.00011933858094245281, + "loss": 0.6816, + "step": 5088 + }, + { + "epoch": 0.4550250357653791, + "grad_norm": 0.1480461550684754, + "learning_rate": 0.00011931016670869784, + "loss": 0.6904, + "step": 5089 + }, + { + "epoch": 0.4551144492131617, + "grad_norm": 0.17596983208186526, + "learning_rate": 0.0001192817508554225, + "loss": 0.6573, + "step": 5090 + }, + { + "epoch": 0.4552038626609442, + "grad_norm": 0.13142193124515966, + "learning_rate": 0.00011925333338500999, + "loss": 0.6397, + "step": 5091 + }, + { + "epoch": 0.4552932761087268, + "grad_norm": 0.14341087136074712, + "learning_rate": 0.00011922491429984368, + "loss": 0.6333, + "step": 5092 + }, + { + "epoch": 0.4553826895565093, + "grad_norm": 0.1319825969891419, + "learning_rate": 0.00011919649360230702, + "loss": 0.6168, + "step": 5093 + }, + { + "epoch": 0.45547210300429186, + "grad_norm": 0.13757544357923088, + "learning_rate": 0.00011916807129478363, + "loss": 0.6586, + "step": 5094 + }, + { + "epoch": 0.4555615164520744, + "grad_norm": 0.11661248817500149, + "learning_rate": 0.00011913964737965723, + "loss": 0.6391, + "step": 5095 + }, + { + "epoch": 0.45565092989985695, + "grad_norm": 0.15062531339117932, + "learning_rate": 0.00011911122185931176, + "loss": 0.6705, + "step": 5096 + }, + { + "epoch": 0.45574034334763946, + "grad_norm": 0.12927064120533582, + "learning_rate": 0.00011908279473613115, + "loss": 0.6729, + "step": 5097 + }, + { + "epoch": 0.45582975679542204, + "grad_norm": 0.12023225284674513, + "learning_rate": 0.0001190543660124996, + "loss": 0.6439, + "step": 5098 + }, + { + "epoch": 0.45591917024320455, + "grad_norm": 0.1355879995879687, + "learning_rate": 0.0001190259356908014, + "loss": 0.6568, + "step": 5099 + }, + { + "epoch": 0.4560085836909871, + "grad_norm": 0.1334179159963381, + "learning_rate": 0.00011899750377342098, + "loss": 0.6464, + "step": 5100 + }, + { + "epoch": 0.4560979971387697, + "grad_norm": 0.1416583145914154, + "learning_rate": 0.0001189690702627428, + "loss": 0.7053, + "step": 5101 + }, + { + "epoch": 0.4561874105865522, + "grad_norm": 0.12423305058314806, + "learning_rate": 0.00011894063516115163, + "loss": 0.6943, + "step": 5102 + }, + { + "epoch": 0.4562768240343348, + "grad_norm": 0.11534141569872278, + "learning_rate": 0.00011891219847103228, + "loss": 0.6325, + "step": 5103 + }, + { + "epoch": 0.4563662374821173, + "grad_norm": 0.1172153536149293, + "learning_rate": 0.00011888376019476966, + "loss": 0.6309, + "step": 5104 + }, + { + "epoch": 0.45645565092989987, + "grad_norm": 0.1388118920981399, + "learning_rate": 0.00011885532033474889, + "loss": 0.7003, + "step": 5105 + }, + { + "epoch": 0.4565450643776824, + "grad_norm": 0.12858450258589962, + "learning_rate": 0.00011882687889335518, + "loss": 0.6815, + "step": 5106 + }, + { + "epoch": 0.45663447782546496, + "grad_norm": 0.13131901912358337, + "learning_rate": 0.00011879843587297387, + "loss": 0.6532, + "step": 5107 + }, + { + "epoch": 0.4567238912732475, + "grad_norm": 0.11661867589052494, + "learning_rate": 0.00011876999127599042, + "loss": 0.634, + "step": 5108 + }, + { + "epoch": 0.45681330472103004, + "grad_norm": 0.12849756701948076, + "learning_rate": 0.00011874154510479052, + "loss": 0.6058, + "step": 5109 + }, + { + "epoch": 0.4569027181688126, + "grad_norm": 0.11140385274718131, + "learning_rate": 0.00011871309736175984, + "loss": 0.5915, + "step": 5110 + }, + { + "epoch": 0.45699213161659513, + "grad_norm": 0.13028353895468997, + "learning_rate": 0.0001186846480492843, + "loss": 0.6501, + "step": 5111 + }, + { + "epoch": 0.4570815450643777, + "grad_norm": 0.11584291886026496, + "learning_rate": 0.00011865619716974984, + "loss": 0.6751, + "step": 5112 + }, + { + "epoch": 0.4571709585121602, + "grad_norm": 0.12254669760025981, + "learning_rate": 0.00011862774472554272, + "loss": 0.6725, + "step": 5113 + }, + { + "epoch": 0.4572603719599428, + "grad_norm": 0.12255414993169449, + "learning_rate": 0.00011859929071904912, + "loss": 0.6659, + "step": 5114 + }, + { + "epoch": 0.4573497854077253, + "grad_norm": 0.1318128252272289, + "learning_rate": 0.00011857083515265546, + "loss": 0.6794, + "step": 5115 + }, + { + "epoch": 0.4574391988555079, + "grad_norm": 0.13192467274115585, + "learning_rate": 0.0001185423780287483, + "loss": 0.6792, + "step": 5116 + }, + { + "epoch": 0.4575286123032904, + "grad_norm": 0.1348697693706375, + "learning_rate": 0.0001185139193497143, + "loss": 0.711, + "step": 5117 + }, + { + "epoch": 0.45761802575107297, + "grad_norm": 0.12701965099386395, + "learning_rate": 0.0001184854591179402, + "loss": 0.6566, + "step": 5118 + }, + { + "epoch": 0.4577074391988555, + "grad_norm": 0.12289573384993589, + "learning_rate": 0.000118456997335813, + "loss": 0.6451, + "step": 5119 + }, + { + "epoch": 0.45779685264663805, + "grad_norm": 0.12206928935739798, + "learning_rate": 0.00011842853400571971, + "loss": 0.6713, + "step": 5120 + }, + { + "epoch": 0.4578862660944206, + "grad_norm": 0.12795845363538128, + "learning_rate": 0.00011840006913004753, + "loss": 0.6563, + "step": 5121 + }, + { + "epoch": 0.45797567954220314, + "grad_norm": 0.12310839919097226, + "learning_rate": 0.00011837160271118377, + "loss": 0.6566, + "step": 5122 + }, + { + "epoch": 0.4580650929899857, + "grad_norm": 0.145921154162638, + "learning_rate": 0.00011834313475151591, + "loss": 0.6431, + "step": 5123 + }, + { + "epoch": 0.4581545064377682, + "grad_norm": 0.12799258571669975, + "learning_rate": 0.00011831466525343146, + "loss": 0.6502, + "step": 5124 + }, + { + "epoch": 0.4582439198855508, + "grad_norm": 0.1146106650312073, + "learning_rate": 0.00011828619421931817, + "loss": 0.6739, + "step": 5125 + }, + { + "epoch": 0.4583333333333333, + "grad_norm": 0.12709835250285287, + "learning_rate": 0.00011825772165156384, + "loss": 0.6391, + "step": 5126 + }, + { + "epoch": 0.4584227467811159, + "grad_norm": 0.12432006410542765, + "learning_rate": 0.00011822924755255647, + "loss": 0.6385, + "step": 5127 + }, + { + "epoch": 0.4585121602288984, + "grad_norm": 0.13639513611246393, + "learning_rate": 0.00011820077192468409, + "loss": 0.6683, + "step": 5128 + }, + { + "epoch": 0.458601573676681, + "grad_norm": 0.1272226759481579, + "learning_rate": 0.00011817229477033495, + "loss": 0.6238, + "step": 5129 + }, + { + "epoch": 0.45869098712446355, + "grad_norm": 0.12533774837613212, + "learning_rate": 0.00011814381609189741, + "loss": 0.6337, + "step": 5130 + }, + { + "epoch": 0.45878040057224606, + "grad_norm": 0.14342174039166647, + "learning_rate": 0.00011811533589175994, + "loss": 0.6278, + "step": 5131 + }, + { + "epoch": 0.45886981402002863, + "grad_norm": 0.15221053153974293, + "learning_rate": 0.00011808685417231111, + "loss": 0.6789, + "step": 5132 + }, + { + "epoch": 0.45895922746781115, + "grad_norm": 0.14263927217572508, + "learning_rate": 0.00011805837093593965, + "loss": 0.6903, + "step": 5133 + }, + { + "epoch": 0.4590486409155937, + "grad_norm": 0.14346313984479728, + "learning_rate": 0.00011802988618503447, + "loss": 0.6583, + "step": 5134 + }, + { + "epoch": 0.45913805436337624, + "grad_norm": 0.12617972377477257, + "learning_rate": 0.00011800139992198447, + "loss": 0.6387, + "step": 5135 + }, + { + "epoch": 0.4592274678111588, + "grad_norm": 0.12614234701179816, + "learning_rate": 0.00011797291214917881, + "loss": 0.6369, + "step": 5136 + }, + { + "epoch": 0.4593168812589413, + "grad_norm": 0.12600808254105006, + "learning_rate": 0.00011794442286900673, + "loss": 0.5847, + "step": 5137 + }, + { + "epoch": 0.4594062947067239, + "grad_norm": 0.14755903376849722, + "learning_rate": 0.00011791593208385756, + "loss": 0.639, + "step": 5138 + }, + { + "epoch": 0.4594957081545064, + "grad_norm": 0.13076330454881693, + "learning_rate": 0.0001178874397961208, + "loss": 0.6739, + "step": 5139 + }, + { + "epoch": 0.459585121602289, + "grad_norm": 0.12567307361386332, + "learning_rate": 0.00011785894600818608, + "loss": 0.6515, + "step": 5140 + }, + { + "epoch": 0.45967453505007155, + "grad_norm": 0.12358203236565002, + "learning_rate": 0.0001178304507224431, + "loss": 0.612, + "step": 5141 + }, + { + "epoch": 0.45976394849785407, + "grad_norm": 0.11287605888384954, + "learning_rate": 0.0001178019539412818, + "loss": 0.6268, + "step": 5142 + }, + { + "epoch": 0.45985336194563664, + "grad_norm": 0.12613430586290186, + "learning_rate": 0.00011777345566709206, + "loss": 0.6382, + "step": 5143 + }, + { + "epoch": 0.45994277539341916, + "grad_norm": 0.11950161604978742, + "learning_rate": 0.00011774495590226411, + "loss": 0.6582, + "step": 5144 + }, + { + "epoch": 0.46003218884120173, + "grad_norm": 0.14161382847386017, + "learning_rate": 0.00011771645464918813, + "loss": 0.6568, + "step": 5145 + }, + { + "epoch": 0.46012160228898424, + "grad_norm": 0.12293649665299329, + "learning_rate": 0.00011768795191025445, + "loss": 0.6658, + "step": 5146 + }, + { + "epoch": 0.4602110157367668, + "grad_norm": 0.13207234667797246, + "learning_rate": 0.00011765944768785366, + "loss": 0.6599, + "step": 5147 + }, + { + "epoch": 0.46030042918454933, + "grad_norm": 0.12710429696120956, + "learning_rate": 0.0001176309419843763, + "loss": 0.6394, + "step": 5148 + }, + { + "epoch": 0.4603898426323319, + "grad_norm": 0.1546941467971507, + "learning_rate": 0.00011760243480221313, + "loss": 0.6817, + "step": 5149 + }, + { + "epoch": 0.4604792560801145, + "grad_norm": 0.1338983721024879, + "learning_rate": 0.000117573926143755, + "loss": 0.6667, + "step": 5150 + }, + { + "epoch": 0.460568669527897, + "grad_norm": 0.11891556236097454, + "learning_rate": 0.00011754541601139292, + "loss": 0.6417, + "step": 5151 + }, + { + "epoch": 0.46065808297567956, + "grad_norm": 0.1366310893734596, + "learning_rate": 0.000117516904407518, + "loss": 0.6552, + "step": 5152 + }, + { + "epoch": 0.4607474964234621, + "grad_norm": 0.12910154928404902, + "learning_rate": 0.00011748839133452143, + "loss": 0.6612, + "step": 5153 + }, + { + "epoch": 0.46083690987124465, + "grad_norm": 0.12503752488259107, + "learning_rate": 0.00011745987679479462, + "loss": 0.6397, + "step": 5154 + }, + { + "epoch": 0.46092632331902716, + "grad_norm": 0.13747236090570705, + "learning_rate": 0.00011743136079072903, + "loss": 0.6496, + "step": 5155 + }, + { + "epoch": 0.46101573676680974, + "grad_norm": 0.13886870357927897, + "learning_rate": 0.00011740284332471628, + "loss": 0.6833, + "step": 5156 + }, + { + "epoch": 0.46110515021459225, + "grad_norm": 0.13167820233848782, + "learning_rate": 0.00011737432439914804, + "loss": 0.6866, + "step": 5157 + }, + { + "epoch": 0.4611945636623748, + "grad_norm": 0.13023056908754, + "learning_rate": 0.0001173458040164162, + "loss": 0.6816, + "step": 5158 + }, + { + "epoch": 0.4612839771101574, + "grad_norm": 0.14301949471496275, + "learning_rate": 0.00011731728217891275, + "loss": 0.6386, + "step": 5159 + }, + { + "epoch": 0.4613733905579399, + "grad_norm": 0.1505048444490811, + "learning_rate": 0.00011728875888902975, + "loss": 0.6957, + "step": 5160 + }, + { + "epoch": 0.4614628040057225, + "grad_norm": 0.13982823784523177, + "learning_rate": 0.00011726023414915941, + "loss": 0.6826, + "step": 5161 + }, + { + "epoch": 0.461552217453505, + "grad_norm": 0.14450333988772465, + "learning_rate": 0.00011723170796169409, + "loss": 0.6221, + "step": 5162 + }, + { + "epoch": 0.46164163090128757, + "grad_norm": 0.11765541218319442, + "learning_rate": 0.00011720318032902624, + "loss": 0.667, + "step": 5163 + }, + { + "epoch": 0.4617310443490701, + "grad_norm": 0.14308747166869615, + "learning_rate": 0.0001171746512535484, + "loss": 0.6401, + "step": 5164 + }, + { + "epoch": 0.46182045779685266, + "grad_norm": 0.12344354050739911, + "learning_rate": 0.00011714612073765332, + "loss": 0.6272, + "step": 5165 + }, + { + "epoch": 0.4619098712446352, + "grad_norm": 0.12671566016109415, + "learning_rate": 0.00011711758878373383, + "loss": 0.6785, + "step": 5166 + }, + { + "epoch": 0.46199928469241774, + "grad_norm": 0.129213376246325, + "learning_rate": 0.0001170890553941828, + "loss": 0.6659, + "step": 5167 + }, + { + "epoch": 0.46208869814020026, + "grad_norm": 0.14456193586389154, + "learning_rate": 0.00011706052057139335, + "loss": 0.6771, + "step": 5168 + }, + { + "epoch": 0.46217811158798283, + "grad_norm": 0.13769677350552428, + "learning_rate": 0.00011703198431775863, + "loss": 0.6543, + "step": 5169 + }, + { + "epoch": 0.4622675250357654, + "grad_norm": 0.12471629814997588, + "learning_rate": 0.00011700344663567197, + "loss": 0.6873, + "step": 5170 + }, + { + "epoch": 0.4623569384835479, + "grad_norm": 0.13457811972381128, + "learning_rate": 0.00011697490752752678, + "loss": 0.7078, + "step": 5171 + }, + { + "epoch": 0.4624463519313305, + "grad_norm": 0.13371617507090858, + "learning_rate": 0.00011694636699571657, + "loss": 0.6779, + "step": 5172 + }, + { + "epoch": 0.462535765379113, + "grad_norm": 0.13527342794121672, + "learning_rate": 0.00011691782504263505, + "loss": 0.6793, + "step": 5173 + }, + { + "epoch": 0.4626251788268956, + "grad_norm": 0.11472664174124836, + "learning_rate": 0.00011688928167067597, + "loss": 0.6274, + "step": 5174 + }, + { + "epoch": 0.4627145922746781, + "grad_norm": 0.11833710037397893, + "learning_rate": 0.0001168607368822332, + "loss": 0.6793, + "step": 5175 + }, + { + "epoch": 0.46280400572246067, + "grad_norm": 0.10467779940744937, + "learning_rate": 0.00011683219067970084, + "loss": 0.6606, + "step": 5176 + }, + { + "epoch": 0.4628934191702432, + "grad_norm": 0.13729210878741754, + "learning_rate": 0.00011680364306547298, + "loss": 0.6459, + "step": 5177 + }, + { + "epoch": 0.46298283261802575, + "grad_norm": 0.12265163369395216, + "learning_rate": 0.00011677509404194382, + "loss": 0.6471, + "step": 5178 + }, + { + "epoch": 0.4630722460658083, + "grad_norm": 0.15270582629527243, + "learning_rate": 0.0001167465436115078, + "loss": 0.7109, + "step": 5179 + }, + { + "epoch": 0.46316165951359084, + "grad_norm": 0.12417731163590265, + "learning_rate": 0.00011671799177655942, + "loss": 0.6709, + "step": 5180 + }, + { + "epoch": 0.4632510729613734, + "grad_norm": 0.11581969314958988, + "learning_rate": 0.00011668943853949323, + "loss": 0.6587, + "step": 5181 + }, + { + "epoch": 0.4633404864091559, + "grad_norm": 0.13213957871922424, + "learning_rate": 0.000116660883902704, + "loss": 0.6752, + "step": 5182 + }, + { + "epoch": 0.4634298998569385, + "grad_norm": 0.11894278498234959, + "learning_rate": 0.00011663232786858656, + "loss": 0.6233, + "step": 5183 + }, + { + "epoch": 0.463519313304721, + "grad_norm": 0.13896662192014375, + "learning_rate": 0.00011660377043953588, + "loss": 0.6594, + "step": 5184 + }, + { + "epoch": 0.4636087267525036, + "grad_norm": 0.14132286806026037, + "learning_rate": 0.000116575211617947, + "loss": 0.7061, + "step": 5185 + }, + { + "epoch": 0.4636981402002861, + "grad_norm": 0.1272131510184923, + "learning_rate": 0.00011654665140621515, + "loss": 0.6517, + "step": 5186 + }, + { + "epoch": 0.4637875536480687, + "grad_norm": 0.12823114973332161, + "learning_rate": 0.00011651808980673561, + "loss": 0.6714, + "step": 5187 + }, + { + "epoch": 0.4638769670958512, + "grad_norm": 0.14363858282902736, + "learning_rate": 0.00011648952682190387, + "loss": 0.6914, + "step": 5188 + }, + { + "epoch": 0.46396638054363376, + "grad_norm": 0.13094658642015178, + "learning_rate": 0.00011646096245411538, + "loss": 0.6562, + "step": 5189 + }, + { + "epoch": 0.46405579399141633, + "grad_norm": 0.12202991560896585, + "learning_rate": 0.00011643239670576589, + "loss": 0.6413, + "step": 5190 + }, + { + "epoch": 0.46414520743919885, + "grad_norm": 0.12577288817353163, + "learning_rate": 0.00011640382957925111, + "loss": 0.6673, + "step": 5191 + }, + { + "epoch": 0.4642346208869814, + "grad_norm": 0.13614707516193925, + "learning_rate": 0.00011637526107696694, + "loss": 0.6422, + "step": 5192 + }, + { + "epoch": 0.46432403433476394, + "grad_norm": 0.13763305318993538, + "learning_rate": 0.00011634669120130943, + "loss": 0.6675, + "step": 5193 + }, + { + "epoch": 0.4644134477825465, + "grad_norm": 0.13247732007396767, + "learning_rate": 0.00011631811995467467, + "loss": 0.6599, + "step": 5194 + }, + { + "epoch": 0.464502861230329, + "grad_norm": 0.14180506974132193, + "learning_rate": 0.0001162895473394589, + "loss": 0.6811, + "step": 5195 + }, + { + "epoch": 0.4645922746781116, + "grad_norm": 0.1170616251863554, + "learning_rate": 0.00011626097335805843, + "loss": 0.6099, + "step": 5196 + }, + { + "epoch": 0.4646816881258941, + "grad_norm": 0.12775310356871858, + "learning_rate": 0.00011623239801286981, + "loss": 0.6165, + "step": 5197 + }, + { + "epoch": 0.4647711015736767, + "grad_norm": 0.12820091895340366, + "learning_rate": 0.00011620382130628952, + "loss": 0.6727, + "step": 5198 + }, + { + "epoch": 0.46486051502145925, + "grad_norm": 0.12824293081672408, + "learning_rate": 0.00011617524324071433, + "loss": 0.6542, + "step": 5199 + }, + { + "epoch": 0.46494992846924177, + "grad_norm": 0.11459851549112658, + "learning_rate": 0.00011614666381854107, + "loss": 0.635, + "step": 5200 + }, + { + "epoch": 0.46503934191702434, + "grad_norm": 0.1056049658803562, + "learning_rate": 0.00011611808304216658, + "loss": 0.6386, + "step": 5201 + }, + { + "epoch": 0.46512875536480686, + "grad_norm": 0.1051696313237755, + "learning_rate": 0.0001160895009139879, + "loss": 0.6273, + "step": 5202 + }, + { + "epoch": 0.4652181688125894, + "grad_norm": 0.11879572885606486, + "learning_rate": 0.00011606091743640224, + "loss": 0.6258, + "step": 5203 + }, + { + "epoch": 0.46530758226037194, + "grad_norm": 0.13950740294485017, + "learning_rate": 0.00011603233261180683, + "loss": 0.6942, + "step": 5204 + }, + { + "epoch": 0.4653969957081545, + "grad_norm": 0.12915913718909963, + "learning_rate": 0.00011600374644259906, + "loss": 0.6381, + "step": 5205 + }, + { + "epoch": 0.46548640915593703, + "grad_norm": 0.13331302659120856, + "learning_rate": 0.00011597515893117637, + "loss": 0.6478, + "step": 5206 + }, + { + "epoch": 0.4655758226037196, + "grad_norm": 0.14006551208077178, + "learning_rate": 0.00011594657007993644, + "loss": 0.6385, + "step": 5207 + }, + { + "epoch": 0.4656652360515021, + "grad_norm": 0.14109467642528586, + "learning_rate": 0.0001159179798912769, + "loss": 0.6633, + "step": 5208 + }, + { + "epoch": 0.4657546494992847, + "grad_norm": 0.14103132665716347, + "learning_rate": 0.0001158893883675956, + "loss": 0.6929, + "step": 5209 + }, + { + "epoch": 0.46584406294706726, + "grad_norm": 0.15538884763097413, + "learning_rate": 0.00011586079551129053, + "loss": 0.6724, + "step": 5210 + }, + { + "epoch": 0.4659334763948498, + "grad_norm": 0.1375335902320319, + "learning_rate": 0.00011583220132475966, + "loss": 0.6557, + "step": 5211 + }, + { + "epoch": 0.46602288984263235, + "grad_norm": 0.1186265102941403, + "learning_rate": 0.0001158036058104012, + "loss": 0.6395, + "step": 5212 + }, + { + "epoch": 0.46611230329041486, + "grad_norm": 0.1145971743990579, + "learning_rate": 0.00011577500897061338, + "loss": 0.6331, + "step": 5213 + }, + { + "epoch": 0.46620171673819744, + "grad_norm": 0.13301876171777471, + "learning_rate": 0.00011574641080779464, + "loss": 0.6244, + "step": 5214 + }, + { + "epoch": 0.46629113018597995, + "grad_norm": 0.1364662854938477, + "learning_rate": 0.00011571781132434343, + "loss": 0.6442, + "step": 5215 + }, + { + "epoch": 0.4663805436337625, + "grad_norm": 0.137599854111675, + "learning_rate": 0.00011568921052265836, + "loss": 0.6665, + "step": 5216 + }, + { + "epoch": 0.46646995708154504, + "grad_norm": 0.13648534603224446, + "learning_rate": 0.00011566060840513817, + "loss": 0.6529, + "step": 5217 + }, + { + "epoch": 0.4665593705293276, + "grad_norm": 0.11912072735236146, + "learning_rate": 0.00011563200497418168, + "loss": 0.6445, + "step": 5218 + }, + { + "epoch": 0.4666487839771102, + "grad_norm": 0.14404711764704, + "learning_rate": 0.00011560340023218776, + "loss": 0.6776, + "step": 5219 + }, + { + "epoch": 0.4667381974248927, + "grad_norm": 0.13122424018985415, + "learning_rate": 0.00011557479418155555, + "loss": 0.6739, + "step": 5220 + }, + { + "epoch": 0.46682761087267527, + "grad_norm": 0.13757936337455112, + "learning_rate": 0.00011554618682468416, + "loss": 0.6357, + "step": 5221 + }, + { + "epoch": 0.4669170243204578, + "grad_norm": 0.13643084104961611, + "learning_rate": 0.00011551757816397285, + "loss": 0.6373, + "step": 5222 + }, + { + "epoch": 0.46700643776824036, + "grad_norm": 0.12193581972469661, + "learning_rate": 0.00011548896820182095, + "loss": 0.6241, + "step": 5223 + }, + { + "epoch": 0.4670958512160229, + "grad_norm": 0.13040391043834149, + "learning_rate": 0.00011546035694062806, + "loss": 0.6512, + "step": 5224 + }, + { + "epoch": 0.46718526466380544, + "grad_norm": 0.12245332438677232, + "learning_rate": 0.0001154317443827937, + "loss": 0.6588, + "step": 5225 + }, + { + "epoch": 0.46727467811158796, + "grad_norm": 0.15709976232709744, + "learning_rate": 0.00011540313053071752, + "loss": 0.6947, + "step": 5226 + }, + { + "epoch": 0.46736409155937053, + "grad_norm": 0.13261577315065218, + "learning_rate": 0.00011537451538679944, + "loss": 0.69, + "step": 5227 + }, + { + "epoch": 0.4674535050071531, + "grad_norm": 0.12122969816509134, + "learning_rate": 0.00011534589895343933, + "loss": 0.6569, + "step": 5228 + }, + { + "epoch": 0.4675429184549356, + "grad_norm": 0.13391436977043536, + "learning_rate": 0.00011531728123303715, + "loss": 0.6366, + "step": 5229 + }, + { + "epoch": 0.4676323319027182, + "grad_norm": 0.12321150031519942, + "learning_rate": 0.00011528866222799313, + "loss": 0.6639, + "step": 5230 + }, + { + "epoch": 0.4677217453505007, + "grad_norm": 0.14737590683827545, + "learning_rate": 0.00011526004194070748, + "loss": 0.7056, + "step": 5231 + }, + { + "epoch": 0.4678111587982833, + "grad_norm": 0.13546753343031487, + "learning_rate": 0.0001152314203735805, + "loss": 0.6873, + "step": 5232 + }, + { + "epoch": 0.4679005722460658, + "grad_norm": 0.13816273054956713, + "learning_rate": 0.00011520279752901273, + "loss": 0.6847, + "step": 5233 + }, + { + "epoch": 0.46798998569384836, + "grad_norm": 0.1481757676826205, + "learning_rate": 0.00011517417340940468, + "loss": 0.6525, + "step": 5234 + }, + { + "epoch": 0.4680793991416309, + "grad_norm": 0.11358194187476328, + "learning_rate": 0.00011514554801715704, + "loss": 0.6342, + "step": 5235 + }, + { + "epoch": 0.46816881258941345, + "grad_norm": 0.15725821480302798, + "learning_rate": 0.00011511692135467054, + "loss": 0.6344, + "step": 5236 + }, + { + "epoch": 0.46825822603719597, + "grad_norm": 0.13358190629595568, + "learning_rate": 0.00011508829342434615, + "loss": 0.681, + "step": 5237 + }, + { + "epoch": 0.46834763948497854, + "grad_norm": 0.1182839250259482, + "learning_rate": 0.00011505966422858481, + "loss": 0.6485, + "step": 5238 + }, + { + "epoch": 0.4684370529327611, + "grad_norm": 0.13543262352783766, + "learning_rate": 0.00011503103376978759, + "loss": 0.6468, + "step": 5239 + }, + { + "epoch": 0.4685264663805436, + "grad_norm": 0.1417368032502043, + "learning_rate": 0.00011500240205035573, + "loss": 0.6721, + "step": 5240 + }, + { + "epoch": 0.4686158798283262, + "grad_norm": 0.1356389835693045, + "learning_rate": 0.00011497376907269053, + "loss": 0.6812, + "step": 5241 + }, + { + "epoch": 0.4687052932761087, + "grad_norm": 0.13178825575743877, + "learning_rate": 0.00011494513483919342, + "loss": 0.6584, + "step": 5242 + }, + { + "epoch": 0.4687947067238913, + "grad_norm": 0.12134005844852974, + "learning_rate": 0.00011491649935226584, + "loss": 0.6288, + "step": 5243 + }, + { + "epoch": 0.4688841201716738, + "grad_norm": 0.12197121484596544, + "learning_rate": 0.00011488786261430954, + "loss": 0.6589, + "step": 5244 + }, + { + "epoch": 0.4689735336194564, + "grad_norm": 0.1308702775399463, + "learning_rate": 0.00011485922462772616, + "loss": 0.697, + "step": 5245 + }, + { + "epoch": 0.4690629470672389, + "grad_norm": 0.13307150729406492, + "learning_rate": 0.00011483058539491756, + "loss": 0.6914, + "step": 5246 + }, + { + "epoch": 0.46915236051502146, + "grad_norm": 0.14227977033921355, + "learning_rate": 0.00011480194491828567, + "loss": 0.682, + "step": 5247 + }, + { + "epoch": 0.46924177396280403, + "grad_norm": 0.12507600035221494, + "learning_rate": 0.00011477330320023255, + "loss": 0.6534, + "step": 5248 + }, + { + "epoch": 0.46933118741058655, + "grad_norm": 0.1529532001244547, + "learning_rate": 0.00011474466024316029, + "loss": 0.6937, + "step": 5249 + }, + { + "epoch": 0.4694206008583691, + "grad_norm": 0.12576852365622815, + "learning_rate": 0.0001147160160494712, + "loss": 0.6898, + "step": 5250 + }, + { + "epoch": 0.46951001430615164, + "grad_norm": 0.13268422619329884, + "learning_rate": 0.00011468737062156765, + "loss": 0.6146, + "step": 5251 + }, + { + "epoch": 0.4695994277539342, + "grad_norm": 0.15673221594717854, + "learning_rate": 0.00011465872396185204, + "loss": 0.6906, + "step": 5252 + }, + { + "epoch": 0.4696888412017167, + "grad_norm": 0.15311993475600566, + "learning_rate": 0.00011463007607272695, + "loss": 0.6745, + "step": 5253 + }, + { + "epoch": 0.4697782546494993, + "grad_norm": 0.14043093938908804, + "learning_rate": 0.00011460142695659503, + "loss": 0.6749, + "step": 5254 + }, + { + "epoch": 0.4698676680972818, + "grad_norm": 0.1383019169770105, + "learning_rate": 0.00011457277661585912, + "loss": 0.7343, + "step": 5255 + }, + { + "epoch": 0.4699570815450644, + "grad_norm": 0.1226826723018992, + "learning_rate": 0.000114544125052922, + "loss": 0.638, + "step": 5256 + }, + { + "epoch": 0.4700464949928469, + "grad_norm": 0.12443531167800716, + "learning_rate": 0.00011451547227018666, + "loss": 0.6459, + "step": 5257 + }, + { + "epoch": 0.47013590844062947, + "grad_norm": 0.13954665432084826, + "learning_rate": 0.00011448681827005623, + "loss": 0.6816, + "step": 5258 + }, + { + "epoch": 0.47022532188841204, + "grad_norm": 0.11869850848472603, + "learning_rate": 0.00011445816305493382, + "loss": 0.6456, + "step": 5259 + }, + { + "epoch": 0.47031473533619456, + "grad_norm": 0.14448646159149175, + "learning_rate": 0.00011442950662722274, + "loss": 0.6876, + "step": 5260 + }, + { + "epoch": 0.4704041487839771, + "grad_norm": 0.13222938330932849, + "learning_rate": 0.00011440084898932637, + "loss": 0.5878, + "step": 5261 + }, + { + "epoch": 0.47049356223175964, + "grad_norm": 0.1487001571089678, + "learning_rate": 0.00011437219014364819, + "loss": 0.6764, + "step": 5262 + }, + { + "epoch": 0.4705829756795422, + "grad_norm": 0.13348425893074195, + "learning_rate": 0.00011434353009259178, + "loss": 0.6361, + "step": 5263 + }, + { + "epoch": 0.47067238912732473, + "grad_norm": 0.14880667359879557, + "learning_rate": 0.00011431486883856082, + "loss": 0.6792, + "step": 5264 + }, + { + "epoch": 0.4707618025751073, + "grad_norm": 0.1368500442829497, + "learning_rate": 0.0001142862063839591, + "loss": 0.6504, + "step": 5265 + }, + { + "epoch": 0.4708512160228898, + "grad_norm": 0.13971723114044593, + "learning_rate": 0.00011425754273119049, + "loss": 0.695, + "step": 5266 + }, + { + "epoch": 0.4709406294706724, + "grad_norm": 0.13089144896978847, + "learning_rate": 0.00011422887788265901, + "loss": 0.6366, + "step": 5267 + }, + { + "epoch": 0.47103004291845496, + "grad_norm": 0.1205514178383958, + "learning_rate": 0.00011420021184076872, + "loss": 0.6339, + "step": 5268 + }, + { + "epoch": 0.4711194563662375, + "grad_norm": 0.12309856391546348, + "learning_rate": 0.00011417154460792381, + "loss": 0.6549, + "step": 5269 + }, + { + "epoch": 0.47120886981402005, + "grad_norm": 0.13103052544868246, + "learning_rate": 0.00011414287618652857, + "loss": 0.6388, + "step": 5270 + }, + { + "epoch": 0.47129828326180256, + "grad_norm": 0.12686370254345733, + "learning_rate": 0.00011411420657898737, + "loss": 0.6572, + "step": 5271 + }, + { + "epoch": 0.47138769670958514, + "grad_norm": 0.14440627918211926, + "learning_rate": 0.00011408553578770473, + "loss": 0.6978, + "step": 5272 + }, + { + "epoch": 0.47147711015736765, + "grad_norm": 0.1333812409843467, + "learning_rate": 0.0001140568638150852, + "loss": 0.6752, + "step": 5273 + }, + { + "epoch": 0.4715665236051502, + "grad_norm": 0.1384170721848416, + "learning_rate": 0.00011402819066353348, + "loss": 0.6791, + "step": 5274 + }, + { + "epoch": 0.47165593705293274, + "grad_norm": 0.12585914283183938, + "learning_rate": 0.00011399951633545438, + "loss": 0.6397, + "step": 5275 + }, + { + "epoch": 0.4717453505007153, + "grad_norm": 0.13024732510193177, + "learning_rate": 0.00011397084083325271, + "loss": 0.6686, + "step": 5276 + }, + { + "epoch": 0.4718347639484979, + "grad_norm": 0.11107383815286408, + "learning_rate": 0.00011394216415933355, + "loss": 0.6138, + "step": 5277 + }, + { + "epoch": 0.4719241773962804, + "grad_norm": 0.13253683871230681, + "learning_rate": 0.00011391348631610186, + "loss": 0.6457, + "step": 5278 + }, + { + "epoch": 0.47201359084406297, + "grad_norm": 0.1396243935745483, + "learning_rate": 0.0001138848073059629, + "loss": 0.6683, + "step": 5279 + }, + { + "epoch": 0.4721030042918455, + "grad_norm": 0.11308236214945032, + "learning_rate": 0.0001138561271313219, + "loss": 0.6628, + "step": 5280 + }, + { + "epoch": 0.47219241773962806, + "grad_norm": 0.1295578576953835, + "learning_rate": 0.00011382744579458426, + "loss": 0.6491, + "step": 5281 + }, + { + "epoch": 0.4722818311874106, + "grad_norm": 0.12026256523450116, + "learning_rate": 0.00011379876329815546, + "loss": 0.6396, + "step": 5282 + }, + { + "epoch": 0.47237124463519314, + "grad_norm": 0.12789533075251278, + "learning_rate": 0.00011377007964444104, + "loss": 0.6764, + "step": 5283 + }, + { + "epoch": 0.47246065808297566, + "grad_norm": 0.15251329876988942, + "learning_rate": 0.00011374139483584667, + "loss": 0.704, + "step": 5284 + }, + { + "epoch": 0.47255007153075823, + "grad_norm": 0.12849161285726543, + "learning_rate": 0.00011371270887477809, + "loss": 0.6532, + "step": 5285 + }, + { + "epoch": 0.47263948497854075, + "grad_norm": 0.12209907847121988, + "learning_rate": 0.00011368402176364121, + "loss": 0.6344, + "step": 5286 + }, + { + "epoch": 0.4727288984263233, + "grad_norm": 0.1420311177291364, + "learning_rate": 0.00011365533350484194, + "loss": 0.6598, + "step": 5287 + }, + { + "epoch": 0.4728183118741059, + "grad_norm": 0.12939345563447277, + "learning_rate": 0.00011362664410078632, + "loss": 0.6527, + "step": 5288 + }, + { + "epoch": 0.4729077253218884, + "grad_norm": 0.12300117627989515, + "learning_rate": 0.00011359795355388054, + "loss": 0.6832, + "step": 5289 + }, + { + "epoch": 0.472997138769671, + "grad_norm": 0.117586331450083, + "learning_rate": 0.0001135692618665308, + "loss": 0.6428, + "step": 5290 + }, + { + "epoch": 0.4730865522174535, + "grad_norm": 0.12492006178594192, + "learning_rate": 0.00011354056904114347, + "loss": 0.6388, + "step": 5291 + }, + { + "epoch": 0.47317596566523606, + "grad_norm": 0.12317803164884876, + "learning_rate": 0.00011351187508012496, + "loss": 0.6549, + "step": 5292 + }, + { + "epoch": 0.4732653791130186, + "grad_norm": 0.1147278219027161, + "learning_rate": 0.0001134831799858818, + "loss": 0.6489, + "step": 5293 + }, + { + "epoch": 0.47335479256080115, + "grad_norm": 0.12758636628120035, + "learning_rate": 0.00011345448376082064, + "loss": 0.6813, + "step": 5294 + }, + { + "epoch": 0.47344420600858367, + "grad_norm": 0.14536071962165453, + "learning_rate": 0.00011342578640734816, + "loss": 0.5936, + "step": 5295 + }, + { + "epoch": 0.47353361945636624, + "grad_norm": 0.134748953822481, + "learning_rate": 0.00011339708792787119, + "loss": 0.6785, + "step": 5296 + }, + { + "epoch": 0.4736230329041488, + "grad_norm": 0.12637513047551643, + "learning_rate": 0.00011336838832479661, + "loss": 0.6817, + "step": 5297 + }, + { + "epoch": 0.4737124463519313, + "grad_norm": 0.13244274053623387, + "learning_rate": 0.00011333968760053149, + "loss": 0.6639, + "step": 5298 + }, + { + "epoch": 0.4738018597997139, + "grad_norm": 0.1294674469775883, + "learning_rate": 0.00011331098575748284, + "loss": 0.6375, + "step": 5299 + }, + { + "epoch": 0.4738912732474964, + "grad_norm": 0.12996084935863694, + "learning_rate": 0.00011328228279805792, + "loss": 0.6681, + "step": 5300 + }, + { + "epoch": 0.473980686695279, + "grad_norm": 0.1263029715202414, + "learning_rate": 0.00011325357872466398, + "loss": 0.6708, + "step": 5301 + }, + { + "epoch": 0.4740701001430615, + "grad_norm": 0.11784263917423712, + "learning_rate": 0.00011322487353970838, + "loss": 0.657, + "step": 5302 + }, + { + "epoch": 0.4741595135908441, + "grad_norm": 0.1365460668786545, + "learning_rate": 0.00011319616724559866, + "loss": 0.6723, + "step": 5303 + }, + { + "epoch": 0.4742489270386266, + "grad_norm": 0.13136574571711168, + "learning_rate": 0.00011316745984474226, + "loss": 0.6565, + "step": 5304 + }, + { + "epoch": 0.47433834048640916, + "grad_norm": 0.12492891673069068, + "learning_rate": 0.00011313875133954695, + "loss": 0.6194, + "step": 5305 + }, + { + "epoch": 0.4744277539341917, + "grad_norm": 0.13120672988547324, + "learning_rate": 0.00011311004173242041, + "loss": 0.6778, + "step": 5306 + }, + { + "epoch": 0.47451716738197425, + "grad_norm": 0.10365606612177757, + "learning_rate": 0.0001130813310257705, + "loss": 0.6205, + "step": 5307 + }, + { + "epoch": 0.4746065808297568, + "grad_norm": 0.12209830229315587, + "learning_rate": 0.00011305261922200519, + "loss": 0.6695, + "step": 5308 + }, + { + "epoch": 0.47469599427753933, + "grad_norm": 0.13265929118911085, + "learning_rate": 0.00011302390632353241, + "loss": 0.5834, + "step": 5309 + }, + { + "epoch": 0.4747854077253219, + "grad_norm": 0.12902573977190834, + "learning_rate": 0.00011299519233276037, + "loss": 0.6985, + "step": 5310 + }, + { + "epoch": 0.4748748211731044, + "grad_norm": 0.139767700165402, + "learning_rate": 0.00011296647725209726, + "loss": 0.6626, + "step": 5311 + }, + { + "epoch": 0.474964234620887, + "grad_norm": 0.12880223235290286, + "learning_rate": 0.00011293776108395135, + "loss": 0.684, + "step": 5312 + }, + { + "epoch": 0.4750536480686695, + "grad_norm": 0.14958493972789694, + "learning_rate": 0.00011290904383073104, + "loss": 0.645, + "step": 5313 + }, + { + "epoch": 0.4751430615164521, + "grad_norm": 0.132637861978981, + "learning_rate": 0.0001128803254948448, + "loss": 0.6261, + "step": 5314 + }, + { + "epoch": 0.4752324749642346, + "grad_norm": 0.12889223232249952, + "learning_rate": 0.00011285160607870124, + "loss": 0.652, + "step": 5315 + }, + { + "epoch": 0.47532188841201717, + "grad_norm": 0.13187404884871698, + "learning_rate": 0.000112822885584709, + "loss": 0.6647, + "step": 5316 + }, + { + "epoch": 0.47541130185979974, + "grad_norm": 0.13467376346391996, + "learning_rate": 0.0001127941640152768, + "loss": 0.7013, + "step": 5317 + }, + { + "epoch": 0.47550071530758226, + "grad_norm": 0.13778346906820593, + "learning_rate": 0.00011276544137281355, + "loss": 0.6745, + "step": 5318 + }, + { + "epoch": 0.4755901287553648, + "grad_norm": 0.1715547977639135, + "learning_rate": 0.00011273671765972813, + "loss": 0.6947, + "step": 5319 + }, + { + "epoch": 0.47567954220314734, + "grad_norm": 0.1374616492502146, + "learning_rate": 0.00011270799287842957, + "loss": 0.6902, + "step": 5320 + }, + { + "epoch": 0.4757689556509299, + "grad_norm": 0.12928995925684086, + "learning_rate": 0.00011267926703132703, + "loss": 0.66, + "step": 5321 + }, + { + "epoch": 0.47585836909871243, + "grad_norm": 0.11754773872204531, + "learning_rate": 0.00011265054012082967, + "loss": 0.6325, + "step": 5322 + }, + { + "epoch": 0.475947782546495, + "grad_norm": 0.13189387246338904, + "learning_rate": 0.00011262181214934677, + "loss": 0.676, + "step": 5323 + }, + { + "epoch": 0.4760371959942775, + "grad_norm": 0.1251365711651304, + "learning_rate": 0.00011259308311928771, + "loss": 0.635, + "step": 5324 + }, + { + "epoch": 0.4761266094420601, + "grad_norm": 0.1346944097332207, + "learning_rate": 0.00011256435303306203, + "loss": 0.6347, + "step": 5325 + }, + { + "epoch": 0.4762160228898426, + "grad_norm": 0.13962450301537005, + "learning_rate": 0.00011253562189307921, + "loss": 0.6815, + "step": 5326 + }, + { + "epoch": 0.4763054363376252, + "grad_norm": 0.12933787371832905, + "learning_rate": 0.0001125068897017489, + "loss": 0.6869, + "step": 5327 + }, + { + "epoch": 0.47639484978540775, + "grad_norm": 0.14073722697020877, + "learning_rate": 0.00011247815646148087, + "loss": 0.6953, + "step": 5328 + }, + { + "epoch": 0.47648426323319026, + "grad_norm": 0.14626445675638106, + "learning_rate": 0.00011244942217468495, + "loss": 0.7051, + "step": 5329 + }, + { + "epoch": 0.47657367668097284, + "grad_norm": 0.10948919819863737, + "learning_rate": 0.00011242068684377101, + "loss": 0.6549, + "step": 5330 + }, + { + "epoch": 0.47666309012875535, + "grad_norm": 0.1445960666042506, + "learning_rate": 0.00011239195047114903, + "loss": 0.6627, + "step": 5331 + }, + { + "epoch": 0.4767525035765379, + "grad_norm": 0.11746978472755003, + "learning_rate": 0.00011236321305922919, + "loss": 0.6381, + "step": 5332 + }, + { + "epoch": 0.47684191702432044, + "grad_norm": 0.1285705365459697, + "learning_rate": 0.00011233447461042157, + "loss": 0.7055, + "step": 5333 + }, + { + "epoch": 0.476931330472103, + "grad_norm": 0.1265750675526299, + "learning_rate": 0.00011230573512713644, + "loss": 0.6526, + "step": 5334 + }, + { + "epoch": 0.4770207439198855, + "grad_norm": 0.13729504115342583, + "learning_rate": 0.00011227699461178423, + "loss": 0.6905, + "step": 5335 + }, + { + "epoch": 0.4771101573676681, + "grad_norm": 0.11962579378675413, + "learning_rate": 0.00011224825306677527, + "loss": 0.6539, + "step": 5336 + }, + { + "epoch": 0.47719957081545067, + "grad_norm": 0.1328755352223214, + "learning_rate": 0.00011221951049452009, + "loss": 0.6554, + "step": 5337 + }, + { + "epoch": 0.4772889842632332, + "grad_norm": 0.1288561037212802, + "learning_rate": 0.00011219076689742936, + "loss": 0.6561, + "step": 5338 + }, + { + "epoch": 0.47737839771101576, + "grad_norm": 0.13707670152329865, + "learning_rate": 0.00011216202227791373, + "loss": 0.6645, + "step": 5339 + }, + { + "epoch": 0.47746781115879827, + "grad_norm": 0.1199874322437719, + "learning_rate": 0.00011213327663838396, + "loss": 0.6773, + "step": 5340 + }, + { + "epoch": 0.47755722460658084, + "grad_norm": 0.1307047381083622, + "learning_rate": 0.00011210452998125094, + "loss": 0.7052, + "step": 5341 + }, + { + "epoch": 0.47764663805436336, + "grad_norm": 0.10963891409520675, + "learning_rate": 0.00011207578230892562, + "loss": 0.6434, + "step": 5342 + }, + { + "epoch": 0.47773605150214593, + "grad_norm": 0.1748958610112933, + "learning_rate": 0.00011204703362381903, + "loss": 0.6955, + "step": 5343 + }, + { + "epoch": 0.47782546494992845, + "grad_norm": 0.12020045318420901, + "learning_rate": 0.00011201828392834223, + "loss": 0.6659, + "step": 5344 + }, + { + "epoch": 0.477914878397711, + "grad_norm": 0.15049056272516195, + "learning_rate": 0.00011198953322490653, + "loss": 0.6772, + "step": 5345 + }, + { + "epoch": 0.4780042918454936, + "grad_norm": 0.12601604121955473, + "learning_rate": 0.00011196078151592314, + "loss": 0.6538, + "step": 5346 + }, + { + "epoch": 0.4780937052932761, + "grad_norm": 0.10650320965274676, + "learning_rate": 0.00011193202880380343, + "loss": 0.6586, + "step": 5347 + }, + { + "epoch": 0.4781831187410587, + "grad_norm": 0.14167786004775493, + "learning_rate": 0.00011190327509095889, + "loss": 0.6557, + "step": 5348 + }, + { + "epoch": 0.4782725321888412, + "grad_norm": 0.1311163427355405, + "learning_rate": 0.00011187452037980104, + "loss": 0.6536, + "step": 5349 + }, + { + "epoch": 0.47836194563662376, + "grad_norm": 0.144564509393965, + "learning_rate": 0.0001118457646727415, + "loss": 0.6679, + "step": 5350 + }, + { + "epoch": 0.4784513590844063, + "grad_norm": 0.13317986160954992, + "learning_rate": 0.00011181700797219199, + "loss": 0.6732, + "step": 5351 + }, + { + "epoch": 0.47854077253218885, + "grad_norm": 0.12483924543438527, + "learning_rate": 0.0001117882502805643, + "loss": 0.6088, + "step": 5352 + }, + { + "epoch": 0.47863018597997137, + "grad_norm": 0.13710237954413906, + "learning_rate": 0.00011175949160027031, + "loss": 0.691, + "step": 5353 + }, + { + "epoch": 0.47871959942775394, + "grad_norm": 0.10930426813116532, + "learning_rate": 0.0001117307319337219, + "loss": 0.6462, + "step": 5354 + }, + { + "epoch": 0.47880901287553645, + "grad_norm": 0.1294262428330674, + "learning_rate": 0.00011170197128333122, + "loss": 0.6823, + "step": 5355 + }, + { + "epoch": 0.478898426323319, + "grad_norm": 0.1218100631660575, + "learning_rate": 0.00011167320965151033, + "loss": 0.5755, + "step": 5356 + }, + { + "epoch": 0.4789878397711016, + "grad_norm": 0.13696958207722046, + "learning_rate": 0.00011164444704067145, + "loss": 0.6664, + "step": 5357 + }, + { + "epoch": 0.4790772532188841, + "grad_norm": 0.13542162997590967, + "learning_rate": 0.00011161568345322684, + "loss": 0.6249, + "step": 5358 + }, + { + "epoch": 0.4791666666666667, + "grad_norm": 0.11136999312165441, + "learning_rate": 0.00011158691889158892, + "loss": 0.6441, + "step": 5359 + }, + { + "epoch": 0.4792560801144492, + "grad_norm": 0.12633456686867234, + "learning_rate": 0.00011155815335817011, + "loss": 0.6502, + "step": 5360 + }, + { + "epoch": 0.4793454935622318, + "grad_norm": 0.1184546701582934, + "learning_rate": 0.00011152938685538287, + "loss": 0.656, + "step": 5361 + }, + { + "epoch": 0.4794349070100143, + "grad_norm": 0.125060891496937, + "learning_rate": 0.00011150061938563993, + "loss": 0.6289, + "step": 5362 + }, + { + "epoch": 0.47952432045779686, + "grad_norm": 0.13463555485319958, + "learning_rate": 0.00011147185095135395, + "loss": 0.671, + "step": 5363 + }, + { + "epoch": 0.4796137339055794, + "grad_norm": 0.13781265246403762, + "learning_rate": 0.00011144308155493763, + "loss": 0.6454, + "step": 5364 + }, + { + "epoch": 0.47970314735336195, + "grad_norm": 0.13072182618111364, + "learning_rate": 0.00011141431119880392, + "loss": 0.7069, + "step": 5365 + }, + { + "epoch": 0.4797925608011445, + "grad_norm": 0.12284082479471575, + "learning_rate": 0.00011138553988536571, + "loss": 0.6382, + "step": 5366 + }, + { + "epoch": 0.47988197424892703, + "grad_norm": 0.12839257541479113, + "learning_rate": 0.000111356767617036, + "loss": 0.6629, + "step": 5367 + }, + { + "epoch": 0.4799713876967096, + "grad_norm": 0.12070566057880491, + "learning_rate": 0.00011132799439622792, + "loss": 0.6524, + "step": 5368 + }, + { + "epoch": 0.4800608011444921, + "grad_norm": 0.13101732333545074, + "learning_rate": 0.00011129922022535464, + "loss": 0.5937, + "step": 5369 + }, + { + "epoch": 0.4801502145922747, + "grad_norm": 0.12647235058534834, + "learning_rate": 0.0001112704451068294, + "loss": 0.6582, + "step": 5370 + }, + { + "epoch": 0.4802396280400572, + "grad_norm": 0.14746679977971916, + "learning_rate": 0.0001112416690430655, + "loss": 0.6732, + "step": 5371 + }, + { + "epoch": 0.4803290414878398, + "grad_norm": 0.13423896448147926, + "learning_rate": 0.00011121289203647644, + "loss": 0.6399, + "step": 5372 + }, + { + "epoch": 0.4804184549356223, + "grad_norm": 0.14118613528749485, + "learning_rate": 0.00011118411408947567, + "loss": 0.6738, + "step": 5373 + }, + { + "epoch": 0.48050786838340487, + "grad_norm": 0.12561661299077143, + "learning_rate": 0.00011115533520447674, + "loss": 0.6815, + "step": 5374 + }, + { + "epoch": 0.4805972818311874, + "grad_norm": 0.13446331446564702, + "learning_rate": 0.00011112655538389331, + "loss": 0.6485, + "step": 5375 + }, + { + "epoch": 0.48068669527896996, + "grad_norm": 0.11571212621379494, + "learning_rate": 0.00011109777463013915, + "loss": 0.5999, + "step": 5376 + }, + { + "epoch": 0.4807761087267525, + "grad_norm": 0.13184496911988455, + "learning_rate": 0.000111068992945628, + "loss": 0.6578, + "step": 5377 + }, + { + "epoch": 0.48086552217453504, + "grad_norm": 0.15579959059422116, + "learning_rate": 0.00011104021033277379, + "loss": 0.7096, + "step": 5378 + }, + { + "epoch": 0.4809549356223176, + "grad_norm": 0.13723524346272664, + "learning_rate": 0.00011101142679399049, + "loss": 0.6425, + "step": 5379 + }, + { + "epoch": 0.48104434907010013, + "grad_norm": 0.12760041435079636, + "learning_rate": 0.00011098264233169211, + "loss": 0.6602, + "step": 5380 + }, + { + "epoch": 0.4811337625178827, + "grad_norm": 0.1492563212247847, + "learning_rate": 0.00011095385694829278, + "loss": 0.7168, + "step": 5381 + }, + { + "epoch": 0.4812231759656652, + "grad_norm": 0.12655654351906404, + "learning_rate": 0.0001109250706462067, + "loss": 0.6562, + "step": 5382 + }, + { + "epoch": 0.4813125894134478, + "grad_norm": 0.1371442959012665, + "learning_rate": 0.00011089628342784814, + "loss": 0.665, + "step": 5383 + }, + { + "epoch": 0.4814020028612303, + "grad_norm": 0.1391289195351156, + "learning_rate": 0.00011086749529563143, + "loss": 0.5678, + "step": 5384 + }, + { + "epoch": 0.4814914163090129, + "grad_norm": 0.14667934838023772, + "learning_rate": 0.00011083870625197103, + "loss": 0.6163, + "step": 5385 + }, + { + "epoch": 0.48158082975679545, + "grad_norm": 0.11996118950496738, + "learning_rate": 0.00011080991629928143, + "loss": 0.6405, + "step": 5386 + }, + { + "epoch": 0.48167024320457796, + "grad_norm": 0.1326665054138686, + "learning_rate": 0.00011078112543997723, + "loss": 0.6485, + "step": 5387 + }, + { + "epoch": 0.48175965665236054, + "grad_norm": 0.14744261407978546, + "learning_rate": 0.00011075233367647302, + "loss": 0.6819, + "step": 5388 + }, + { + "epoch": 0.48184907010014305, + "grad_norm": 0.13450959847980945, + "learning_rate": 0.00011072354101118357, + "loss": 0.6559, + "step": 5389 + }, + { + "epoch": 0.4819384835479256, + "grad_norm": 0.11971464105976246, + "learning_rate": 0.00011069474744652371, + "loss": 0.6082, + "step": 5390 + }, + { + "epoch": 0.48202789699570814, + "grad_norm": 0.12708538617222404, + "learning_rate": 0.00011066595298490827, + "loss": 0.6415, + "step": 5391 + }, + { + "epoch": 0.4821173104434907, + "grad_norm": 0.1362474021477294, + "learning_rate": 0.00011063715762875225, + "loss": 0.6139, + "step": 5392 + }, + { + "epoch": 0.4822067238912732, + "grad_norm": 0.15333476445106176, + "learning_rate": 0.00011060836138047066, + "loss": 0.7058, + "step": 5393 + }, + { + "epoch": 0.4822961373390558, + "grad_norm": 0.14293886031106537, + "learning_rate": 0.00011057956424247861, + "loss": 0.618, + "step": 5394 + }, + { + "epoch": 0.4823855507868383, + "grad_norm": 0.14610772258413096, + "learning_rate": 0.00011055076621719132, + "loss": 0.692, + "step": 5395 + }, + { + "epoch": 0.4824749642346209, + "grad_norm": 0.14031958797648816, + "learning_rate": 0.00011052196730702396, + "loss": 0.6882, + "step": 5396 + }, + { + "epoch": 0.48256437768240346, + "grad_norm": 0.13552908971298366, + "learning_rate": 0.00011049316751439194, + "loss": 0.6183, + "step": 5397 + }, + { + "epoch": 0.48265379113018597, + "grad_norm": 0.14419818454458688, + "learning_rate": 0.00011046436684171062, + "loss": 0.689, + "step": 5398 + }, + { + "epoch": 0.48274320457796854, + "grad_norm": 0.14155697100458312, + "learning_rate": 0.00011043556529139549, + "loss": 0.6403, + "step": 5399 + }, + { + "epoch": 0.48283261802575106, + "grad_norm": 0.1357177719331768, + "learning_rate": 0.00011040676286586211, + "loss": 0.6455, + "step": 5400 + }, + { + "epoch": 0.48292203147353363, + "grad_norm": 0.11811795678791975, + "learning_rate": 0.00011037795956752608, + "loss": 0.6644, + "step": 5401 + }, + { + "epoch": 0.48301144492131615, + "grad_norm": 0.1513057220263348, + "learning_rate": 0.00011034915539880313, + "loss": 0.7201, + "step": 5402 + }, + { + "epoch": 0.4831008583690987, + "grad_norm": 0.11306137909831149, + "learning_rate": 0.00011032035036210901, + "loss": 0.647, + "step": 5403 + }, + { + "epoch": 0.48319027181688123, + "grad_norm": 0.11851570902451654, + "learning_rate": 0.00011029154445985961, + "loss": 0.6642, + "step": 5404 + }, + { + "epoch": 0.4832796852646638, + "grad_norm": 0.14008941546271278, + "learning_rate": 0.00011026273769447076, + "loss": 0.6666, + "step": 5405 + }, + { + "epoch": 0.4833690987124464, + "grad_norm": 0.12860164598126458, + "learning_rate": 0.00011023393006835847, + "loss": 0.6627, + "step": 5406 + }, + { + "epoch": 0.4834585121602289, + "grad_norm": 0.13618443622806223, + "learning_rate": 0.00011020512158393887, + "loss": 0.62, + "step": 5407 + }, + { + "epoch": 0.48354792560801146, + "grad_norm": 0.13674861833996033, + "learning_rate": 0.00011017631224362803, + "loss": 0.6606, + "step": 5408 + }, + { + "epoch": 0.483637339055794, + "grad_norm": 0.1357307688301329, + "learning_rate": 0.00011014750204984217, + "loss": 0.6786, + "step": 5409 + }, + { + "epoch": 0.48372675250357655, + "grad_norm": 0.12239111643386232, + "learning_rate": 0.00011011869100499758, + "loss": 0.6691, + "step": 5410 + }, + { + "epoch": 0.48381616595135907, + "grad_norm": 0.11722402572213987, + "learning_rate": 0.00011008987911151058, + "loss": 0.6259, + "step": 5411 + }, + { + "epoch": 0.48390557939914164, + "grad_norm": 0.13978973221585977, + "learning_rate": 0.00011006106637179763, + "loss": 0.6494, + "step": 5412 + }, + { + "epoch": 0.48399499284692415, + "grad_norm": 0.13619036431593706, + "learning_rate": 0.00011003225278827515, + "loss": 0.6429, + "step": 5413 + }, + { + "epoch": 0.4840844062947067, + "grad_norm": 0.13146714805838108, + "learning_rate": 0.0001100034383633598, + "loss": 0.6675, + "step": 5414 + }, + { + "epoch": 0.4841738197424893, + "grad_norm": 0.12255078172499972, + "learning_rate": 0.00010997462309946811, + "loss": 0.6307, + "step": 5415 + }, + { + "epoch": 0.4842632331902718, + "grad_norm": 0.1193287966690048, + "learning_rate": 0.00010994580699901684, + "loss": 0.6318, + "step": 5416 + }, + { + "epoch": 0.4843526466380544, + "grad_norm": 0.1344913210596068, + "learning_rate": 0.00010991699006442275, + "loss": 0.6676, + "step": 5417 + }, + { + "epoch": 0.4844420600858369, + "grad_norm": 0.11698557266726868, + "learning_rate": 0.00010988817229810268, + "loss": 0.6464, + "step": 5418 + }, + { + "epoch": 0.4845314735336195, + "grad_norm": 0.13039502195286537, + "learning_rate": 0.00010985935370247355, + "loss": 0.6444, + "step": 5419 + }, + { + "epoch": 0.484620886981402, + "grad_norm": 0.1345573379687849, + "learning_rate": 0.00010983053427995234, + "loss": 0.6375, + "step": 5420 + }, + { + "epoch": 0.48471030042918456, + "grad_norm": 0.12295574826863741, + "learning_rate": 0.0001098017140329561, + "loss": 0.6217, + "step": 5421 + }, + { + "epoch": 0.4847997138769671, + "grad_norm": 0.1365713039766957, + "learning_rate": 0.0001097728929639019, + "loss": 0.6691, + "step": 5422 + }, + { + "epoch": 0.48488912732474965, + "grad_norm": 0.10683761451690867, + "learning_rate": 0.00010974407107520697, + "loss": 0.6428, + "step": 5423 + }, + { + "epoch": 0.48497854077253216, + "grad_norm": 0.14058037385754735, + "learning_rate": 0.0001097152483692886, + "loss": 0.7068, + "step": 5424 + }, + { + "epoch": 0.48506795422031473, + "grad_norm": 0.1329945033997565, + "learning_rate": 0.00010968642484856406, + "loss": 0.6859, + "step": 5425 + }, + { + "epoch": 0.4851573676680973, + "grad_norm": 0.14515382655172762, + "learning_rate": 0.0001096576005154508, + "loss": 0.632, + "step": 5426 + }, + { + "epoch": 0.4852467811158798, + "grad_norm": 0.1383871341277744, + "learning_rate": 0.0001096287753723662, + "loss": 0.5822, + "step": 5427 + }, + { + "epoch": 0.4853361945636624, + "grad_norm": 0.11542534054880266, + "learning_rate": 0.00010959994942172786, + "loss": 0.6542, + "step": 5428 + }, + { + "epoch": 0.4854256080114449, + "grad_norm": 0.13545728282830738, + "learning_rate": 0.00010957112266595338, + "loss": 0.6457, + "step": 5429 + }, + { + "epoch": 0.4855150214592275, + "grad_norm": 0.12484936468566324, + "learning_rate": 0.00010954229510746035, + "loss": 0.648, + "step": 5430 + }, + { + "epoch": 0.48560443490701, + "grad_norm": 0.1368124374385989, + "learning_rate": 0.0001095134667486666, + "loss": 0.6669, + "step": 5431 + }, + { + "epoch": 0.48569384835479257, + "grad_norm": 0.11479870051690283, + "learning_rate": 0.00010948463759198986, + "loss": 0.6504, + "step": 5432 + }, + { + "epoch": 0.4857832618025751, + "grad_norm": 0.12364058614781358, + "learning_rate": 0.00010945580763984801, + "loss": 0.6574, + "step": 5433 + }, + { + "epoch": 0.48587267525035766, + "grad_norm": 0.13790708215880512, + "learning_rate": 0.00010942697689465902, + "loss": 0.6969, + "step": 5434 + }, + { + "epoch": 0.4859620886981402, + "grad_norm": 0.10955126795751693, + "learning_rate": 0.00010939814535884083, + "loss": 0.6574, + "step": 5435 + }, + { + "epoch": 0.48605150214592274, + "grad_norm": 0.13882931649517358, + "learning_rate": 0.00010936931303481158, + "loss": 0.6735, + "step": 5436 + }, + { + "epoch": 0.4861409155937053, + "grad_norm": 0.1432954509021626, + "learning_rate": 0.00010934047992498932, + "loss": 0.6721, + "step": 5437 + }, + { + "epoch": 0.48623032904148783, + "grad_norm": 0.1578909492880853, + "learning_rate": 0.00010931164603179231, + "loss": 0.7043, + "step": 5438 + }, + { + "epoch": 0.4863197424892704, + "grad_norm": 0.13854532676911624, + "learning_rate": 0.0001092828113576388, + "loss": 0.6654, + "step": 5439 + }, + { + "epoch": 0.4864091559370529, + "grad_norm": 0.12276899644291821, + "learning_rate": 0.00010925397590494712, + "loss": 0.6293, + "step": 5440 + }, + { + "epoch": 0.4864985693848355, + "grad_norm": 0.13663313754119683, + "learning_rate": 0.00010922513967613563, + "loss": 0.6768, + "step": 5441 + }, + { + "epoch": 0.486587982832618, + "grad_norm": 0.1148026371487047, + "learning_rate": 0.00010919630267362282, + "loss": 0.6213, + "step": 5442 + }, + { + "epoch": 0.4866773962804006, + "grad_norm": 0.13418955825368326, + "learning_rate": 0.00010916746489982723, + "loss": 0.6367, + "step": 5443 + }, + { + "epoch": 0.4867668097281831, + "grad_norm": 0.13313202139912678, + "learning_rate": 0.00010913862635716741, + "loss": 0.6635, + "step": 5444 + }, + { + "epoch": 0.48685622317596566, + "grad_norm": 0.1288073203837162, + "learning_rate": 0.00010910978704806203, + "loss": 0.6509, + "step": 5445 + }, + { + "epoch": 0.48694563662374823, + "grad_norm": 0.13211201727784488, + "learning_rate": 0.00010908094697492983, + "loss": 0.6586, + "step": 5446 + }, + { + "epoch": 0.48703505007153075, + "grad_norm": 0.13612776853951625, + "learning_rate": 0.00010905210614018957, + "loss": 0.6615, + "step": 5447 + }, + { + "epoch": 0.4871244635193133, + "grad_norm": 0.12707669697206928, + "learning_rate": 0.0001090232645462601, + "loss": 0.6881, + "step": 5448 + }, + { + "epoch": 0.48721387696709584, + "grad_norm": 0.1484694856789169, + "learning_rate": 0.00010899442219556033, + "loss": 0.5953, + "step": 5449 + }, + { + "epoch": 0.4873032904148784, + "grad_norm": 0.11925538825827203, + "learning_rate": 0.00010896557909050927, + "loss": 0.6433, + "step": 5450 + }, + { + "epoch": 0.4873927038626609, + "grad_norm": 0.12213423016112743, + "learning_rate": 0.00010893673523352585, + "loss": 0.6549, + "step": 5451 + }, + { + "epoch": 0.4874821173104435, + "grad_norm": 0.14463513930176541, + "learning_rate": 0.00010890789062702926, + "loss": 0.7056, + "step": 5452 + }, + { + "epoch": 0.487571530758226, + "grad_norm": 0.13756161735556843, + "learning_rate": 0.00010887904527343866, + "loss": 0.6577, + "step": 5453 + }, + { + "epoch": 0.4876609442060086, + "grad_norm": 0.1303584388817195, + "learning_rate": 0.00010885019917517325, + "loss": 0.6683, + "step": 5454 + }, + { + "epoch": 0.48775035765379116, + "grad_norm": 0.12809405109773392, + "learning_rate": 0.00010882135233465232, + "loss": 0.6456, + "step": 5455 + }, + { + "epoch": 0.48783977110157367, + "grad_norm": 0.12687932819071543, + "learning_rate": 0.00010879250475429523, + "loss": 0.6211, + "step": 5456 + }, + { + "epoch": 0.48792918454935624, + "grad_norm": 0.12107142722881373, + "learning_rate": 0.0001087636564365214, + "loss": 0.6845, + "step": 5457 + }, + { + "epoch": 0.48801859799713876, + "grad_norm": 0.14712140445163582, + "learning_rate": 0.00010873480738375024, + "loss": 0.7039, + "step": 5458 + }, + { + "epoch": 0.48810801144492133, + "grad_norm": 0.1394398728124547, + "learning_rate": 0.00010870595759840137, + "loss": 0.6559, + "step": 5459 + }, + { + "epoch": 0.48819742489270385, + "grad_norm": 0.14151994710513477, + "learning_rate": 0.00010867710708289434, + "loss": 0.5912, + "step": 5460 + }, + { + "epoch": 0.4882868383404864, + "grad_norm": 0.1259888300046882, + "learning_rate": 0.00010864825583964882, + "loss": 0.6515, + "step": 5461 + }, + { + "epoch": 0.48837625178826893, + "grad_norm": 0.12269510761600465, + "learning_rate": 0.00010861940387108451, + "loss": 0.6134, + "step": 5462 + }, + { + "epoch": 0.4884656652360515, + "grad_norm": 0.12250459961201843, + "learning_rate": 0.00010859055117962125, + "loss": 0.6461, + "step": 5463 + }, + { + "epoch": 0.488555078683834, + "grad_norm": 0.12781324706102162, + "learning_rate": 0.00010856169776767882, + "loss": 0.6185, + "step": 5464 + }, + { + "epoch": 0.4886444921316166, + "grad_norm": 0.12127862632141892, + "learning_rate": 0.0001085328436376771, + "loss": 0.6139, + "step": 5465 + }, + { + "epoch": 0.48873390557939916, + "grad_norm": 0.14626349529626967, + "learning_rate": 0.00010850398879203611, + "loss": 0.6406, + "step": 5466 + }, + { + "epoch": 0.4888233190271817, + "grad_norm": 0.13538445278254715, + "learning_rate": 0.00010847513323317588, + "loss": 0.6501, + "step": 5467 + }, + { + "epoch": 0.48891273247496425, + "grad_norm": 0.13759099188135251, + "learning_rate": 0.00010844627696351644, + "loss": 0.648, + "step": 5468 + }, + { + "epoch": 0.48900214592274677, + "grad_norm": 0.12639849557999958, + "learning_rate": 0.00010841741998547794, + "loss": 0.6335, + "step": 5469 + }, + { + "epoch": 0.48909155937052934, + "grad_norm": 0.13999831466554363, + "learning_rate": 0.00010838856230148063, + "loss": 0.6885, + "step": 5470 + }, + { + "epoch": 0.48918097281831185, + "grad_norm": 0.12152430463577796, + "learning_rate": 0.0001083597039139447, + "loss": 0.674, + "step": 5471 + }, + { + "epoch": 0.4892703862660944, + "grad_norm": 0.13084550949260454, + "learning_rate": 0.00010833084482529048, + "loss": 0.67, + "step": 5472 + }, + { + "epoch": 0.48935979971387694, + "grad_norm": 0.13855592284197008, + "learning_rate": 0.0001083019850379384, + "loss": 0.6876, + "step": 5473 + }, + { + "epoch": 0.4894492131616595, + "grad_norm": 0.13018613004866342, + "learning_rate": 0.00010827312455430884, + "loss": 0.6825, + "step": 5474 + }, + { + "epoch": 0.4895386266094421, + "grad_norm": 0.12585664064473062, + "learning_rate": 0.00010824426337682235, + "loss": 0.6551, + "step": 5475 + }, + { + "epoch": 0.4896280400572246, + "grad_norm": 0.11513847111809768, + "learning_rate": 0.00010821540150789939, + "loss": 0.6412, + "step": 5476 + }, + { + "epoch": 0.48971745350500717, + "grad_norm": 0.12476122069294038, + "learning_rate": 0.00010818653894996067, + "loss": 0.6665, + "step": 5477 + }, + { + "epoch": 0.4898068669527897, + "grad_norm": 0.1337593630834426, + "learning_rate": 0.00010815767570542681, + "loss": 0.6564, + "step": 5478 + }, + { + "epoch": 0.48989628040057226, + "grad_norm": 0.12218291375684195, + "learning_rate": 0.00010812881177671852, + "loss": 0.646, + "step": 5479 + }, + { + "epoch": 0.4899856938483548, + "grad_norm": 0.13683436387482822, + "learning_rate": 0.00010809994716625662, + "loss": 0.6496, + "step": 5480 + }, + { + "epoch": 0.49007510729613735, + "grad_norm": 0.1431305018326274, + "learning_rate": 0.00010807108187646195, + "loss": 0.6941, + "step": 5481 + }, + { + "epoch": 0.49016452074391986, + "grad_norm": 0.12199530325707832, + "learning_rate": 0.00010804221590975535, + "loss": 0.6462, + "step": 5482 + }, + { + "epoch": 0.49025393419170243, + "grad_norm": 0.13361514120810955, + "learning_rate": 0.00010801334926855784, + "loss": 0.6788, + "step": 5483 + }, + { + "epoch": 0.490343347639485, + "grad_norm": 0.12653021527412417, + "learning_rate": 0.0001079844819552904, + "loss": 0.672, + "step": 5484 + }, + { + "epoch": 0.4904327610872675, + "grad_norm": 0.12210690822087214, + "learning_rate": 0.0001079556139723741, + "loss": 0.6232, + "step": 5485 + }, + { + "epoch": 0.4905221745350501, + "grad_norm": 0.11792429167163271, + "learning_rate": 0.00010792674532223006, + "loss": 0.6311, + "step": 5486 + }, + { + "epoch": 0.4906115879828326, + "grad_norm": 0.13294563929426012, + "learning_rate": 0.00010789787600727948, + "loss": 0.6858, + "step": 5487 + }, + { + "epoch": 0.4907010014306152, + "grad_norm": 0.13030446820418629, + "learning_rate": 0.00010786900602994359, + "loss": 0.6512, + "step": 5488 + }, + { + "epoch": 0.4907904148783977, + "grad_norm": 0.14102166093771554, + "learning_rate": 0.00010784013539264359, + "loss": 0.6931, + "step": 5489 + }, + { + "epoch": 0.49087982832618027, + "grad_norm": 0.15979621556622164, + "learning_rate": 0.00010781126409780098, + "loss": 0.6263, + "step": 5490 + }, + { + "epoch": 0.4909692417739628, + "grad_norm": 0.14514606836412391, + "learning_rate": 0.00010778239214783708, + "loss": 0.6701, + "step": 5491 + }, + { + "epoch": 0.49105865522174535, + "grad_norm": 0.1502374566213836, + "learning_rate": 0.00010775351954517332, + "loss": 0.6384, + "step": 5492 + }, + { + "epoch": 0.49114806866952787, + "grad_norm": 0.12920994534503993, + "learning_rate": 0.00010772464629223124, + "loss": 0.6605, + "step": 5493 + }, + { + "epoch": 0.49123748211731044, + "grad_norm": 0.12460014938616468, + "learning_rate": 0.00010769577239143242, + "loss": 0.6531, + "step": 5494 + }, + { + "epoch": 0.491326895565093, + "grad_norm": 0.14724016567669687, + "learning_rate": 0.00010766689784519845, + "loss": 0.6476, + "step": 5495 + }, + { + "epoch": 0.49141630901287553, + "grad_norm": 0.12151740246890641, + "learning_rate": 0.00010763802265595102, + "loss": 0.6443, + "step": 5496 + }, + { + "epoch": 0.4915057224606581, + "grad_norm": 0.12579785407171057, + "learning_rate": 0.00010760914682611188, + "loss": 0.6476, + "step": 5497 + }, + { + "epoch": 0.4915951359084406, + "grad_norm": 0.12118232277994448, + "learning_rate": 0.00010758027035810276, + "loss": 0.6527, + "step": 5498 + }, + { + "epoch": 0.4916845493562232, + "grad_norm": 0.11861581780107, + "learning_rate": 0.00010755139325434548, + "loss": 0.6584, + "step": 5499 + }, + { + "epoch": 0.4917739628040057, + "grad_norm": 0.1306248458128445, + "learning_rate": 0.00010752251551726205, + "loss": 0.6757, + "step": 5500 + }, + { + "epoch": 0.4918633762517883, + "grad_norm": 0.1316600963880277, + "learning_rate": 0.0001074936371492743, + "loss": 0.6332, + "step": 5501 + }, + { + "epoch": 0.4919527896995708, + "grad_norm": 0.1317985618981636, + "learning_rate": 0.00010746475815280424, + "loss": 0.6817, + "step": 5502 + }, + { + "epoch": 0.49204220314735336, + "grad_norm": 0.13274482480711783, + "learning_rate": 0.00010743587853027391, + "loss": 0.682, + "step": 5503 + }, + { + "epoch": 0.49213161659513593, + "grad_norm": 0.13712338176948466, + "learning_rate": 0.00010740699828410545, + "loss": 0.6331, + "step": 5504 + }, + { + "epoch": 0.49222103004291845, + "grad_norm": 0.14797039674830675, + "learning_rate": 0.00010737811741672101, + "loss": 0.6876, + "step": 5505 + }, + { + "epoch": 0.492310443490701, + "grad_norm": 0.12630215357440217, + "learning_rate": 0.00010734923593054271, + "loss": 0.6382, + "step": 5506 + }, + { + "epoch": 0.49239985693848354, + "grad_norm": 0.12177419896353968, + "learning_rate": 0.00010732035382799293, + "loss": 0.6124, + "step": 5507 + }, + { + "epoch": 0.4924892703862661, + "grad_norm": 0.1321879913502413, + "learning_rate": 0.00010729147111149392, + "loss": 0.6376, + "step": 5508 + }, + { + "epoch": 0.4925786838340486, + "grad_norm": 0.13664112967063682, + "learning_rate": 0.00010726258778346798, + "loss": 0.6517, + "step": 5509 + }, + { + "epoch": 0.4926680972818312, + "grad_norm": 0.13357395996139035, + "learning_rate": 0.0001072337038463376, + "loss": 0.6587, + "step": 5510 + }, + { + "epoch": 0.4927575107296137, + "grad_norm": 0.1362119807906883, + "learning_rate": 0.00010720481930252524, + "loss": 0.6391, + "step": 5511 + }, + { + "epoch": 0.4928469241773963, + "grad_norm": 0.12901053053356965, + "learning_rate": 0.00010717593415445335, + "loss": 0.6509, + "step": 5512 + }, + { + "epoch": 0.4929363376251788, + "grad_norm": 0.14424421117084388, + "learning_rate": 0.00010714704840454453, + "loss": 0.6429, + "step": 5513 + }, + { + "epoch": 0.49302575107296137, + "grad_norm": 0.15726164725723646, + "learning_rate": 0.0001071181620552214, + "loss": 0.6467, + "step": 5514 + }, + { + "epoch": 0.49311516452074394, + "grad_norm": 0.12193012556768378, + "learning_rate": 0.00010708927510890665, + "loss": 0.6196, + "step": 5515 + }, + { + "epoch": 0.49320457796852646, + "grad_norm": 0.1238244156435777, + "learning_rate": 0.0001070603875680229, + "loss": 0.571, + "step": 5516 + }, + { + "epoch": 0.49329399141630903, + "grad_norm": 0.14284641727492756, + "learning_rate": 0.000107031499434993, + "loss": 0.6563, + "step": 5517 + }, + { + "epoch": 0.49338340486409155, + "grad_norm": 0.12311780383022745, + "learning_rate": 0.00010700261071223973, + "loss": 0.6565, + "step": 5518 + }, + { + "epoch": 0.4934728183118741, + "grad_norm": 0.1303256069833742, + "learning_rate": 0.00010697372140218596, + "loss": 0.6534, + "step": 5519 + }, + { + "epoch": 0.49356223175965663, + "grad_norm": 0.13607032966293092, + "learning_rate": 0.00010694483150725458, + "loss": 0.6261, + "step": 5520 + }, + { + "epoch": 0.4936516452074392, + "grad_norm": 0.12970756941279593, + "learning_rate": 0.00010691594102986861, + "loss": 0.6757, + "step": 5521 + }, + { + "epoch": 0.4937410586552217, + "grad_norm": 0.12362374110702944, + "learning_rate": 0.000106887049972451, + "loss": 0.6704, + "step": 5522 + }, + { + "epoch": 0.4938304721030043, + "grad_norm": 0.13691186567827274, + "learning_rate": 0.00010685815833742481, + "loss": 0.682, + "step": 5523 + }, + { + "epoch": 0.49391988555078686, + "grad_norm": 0.1409275676233513, + "learning_rate": 0.00010682926612721315, + "loss": 0.6979, + "step": 5524 + }, + { + "epoch": 0.4940092989985694, + "grad_norm": 0.14330905068815947, + "learning_rate": 0.00010680037334423925, + "loss": 0.6514, + "step": 5525 + }, + { + "epoch": 0.49409871244635195, + "grad_norm": 0.13056516575828003, + "learning_rate": 0.00010677147999092618, + "loss": 0.6648, + "step": 5526 + }, + { + "epoch": 0.49418812589413447, + "grad_norm": 0.13590086536054727, + "learning_rate": 0.00010674258606969729, + "loss": 0.6552, + "step": 5527 + }, + { + "epoch": 0.49427753934191704, + "grad_norm": 0.11864038164287781, + "learning_rate": 0.00010671369158297586, + "loss": 0.6496, + "step": 5528 + }, + { + "epoch": 0.49436695278969955, + "grad_norm": 0.12623736879984396, + "learning_rate": 0.00010668479653318522, + "loss": 0.6323, + "step": 5529 + }, + { + "epoch": 0.4944563662374821, + "grad_norm": 0.1221105595375275, + "learning_rate": 0.00010665590092274876, + "loss": 0.6378, + "step": 5530 + }, + { + "epoch": 0.49454577968526464, + "grad_norm": 0.13757973478199473, + "learning_rate": 0.00010662700475408994, + "loss": 0.6719, + "step": 5531 + }, + { + "epoch": 0.4946351931330472, + "grad_norm": 0.14461587702745055, + "learning_rate": 0.00010659810802963224, + "loss": 0.6435, + "step": 5532 + }, + { + "epoch": 0.4947246065808298, + "grad_norm": 0.1167151529940778, + "learning_rate": 0.00010656921075179915, + "loss": 0.6766, + "step": 5533 + }, + { + "epoch": 0.4948140200286123, + "grad_norm": 0.14113607327612185, + "learning_rate": 0.00010654031292301432, + "loss": 0.6713, + "step": 5534 + }, + { + "epoch": 0.49490343347639487, + "grad_norm": 0.1344778748275434, + "learning_rate": 0.00010651141454570135, + "loss": 0.6791, + "step": 5535 + }, + { + "epoch": 0.4949928469241774, + "grad_norm": 0.1103540485217885, + "learning_rate": 0.00010648251562228386, + "loss": 0.6104, + "step": 5536 + }, + { + "epoch": 0.49508226037195996, + "grad_norm": 0.13835677138921249, + "learning_rate": 0.00010645361615518565, + "loss": 0.7036, + "step": 5537 + }, + { + "epoch": 0.4951716738197425, + "grad_norm": 0.11899165629963443, + "learning_rate": 0.00010642471614683045, + "loss": 0.6647, + "step": 5538 + }, + { + "epoch": 0.49526108726752505, + "grad_norm": 0.13575438120944763, + "learning_rate": 0.00010639581559964205, + "loss": 0.6625, + "step": 5539 + }, + { + "epoch": 0.49535050071530756, + "grad_norm": 0.11661246320830934, + "learning_rate": 0.00010636691451604434, + "loss": 0.5973, + "step": 5540 + }, + { + "epoch": 0.49543991416309013, + "grad_norm": 0.13749839683547815, + "learning_rate": 0.00010633801289846119, + "loss": 0.6582, + "step": 5541 + }, + { + "epoch": 0.49552932761087265, + "grad_norm": 0.12323919482626979, + "learning_rate": 0.00010630911074931655, + "loss": 0.6317, + "step": 5542 + }, + { + "epoch": 0.4956187410586552, + "grad_norm": 0.15691000439032493, + "learning_rate": 0.00010628020807103441, + "loss": 0.6099, + "step": 5543 + }, + { + "epoch": 0.4957081545064378, + "grad_norm": 0.15437429284294166, + "learning_rate": 0.00010625130486603878, + "loss": 0.6906, + "step": 5544 + }, + { + "epoch": 0.4957975679542203, + "grad_norm": 0.12461706557712436, + "learning_rate": 0.00010622240113675382, + "loss": 0.6589, + "step": 5545 + }, + { + "epoch": 0.4958869814020029, + "grad_norm": 0.1402701850054856, + "learning_rate": 0.00010619349688560354, + "loss": 0.6866, + "step": 5546 + }, + { + "epoch": 0.4959763948497854, + "grad_norm": 0.13498915914865106, + "learning_rate": 0.00010616459211501217, + "loss": 0.6669, + "step": 5547 + }, + { + "epoch": 0.49606580829756797, + "grad_norm": 0.12151141797319799, + "learning_rate": 0.00010613568682740391, + "loss": 0.6722, + "step": 5548 + }, + { + "epoch": 0.4961552217453505, + "grad_norm": 0.12600346014693659, + "learning_rate": 0.00010610678102520301, + "loss": 0.6361, + "step": 5549 + }, + { + "epoch": 0.49624463519313305, + "grad_norm": 0.12468068222306164, + "learning_rate": 0.00010607787471083375, + "loss": 0.6417, + "step": 5550 + }, + { + "epoch": 0.49633404864091557, + "grad_norm": 0.14292239847455898, + "learning_rate": 0.00010604896788672048, + "loss": 0.7467, + "step": 5551 + }, + { + "epoch": 0.49642346208869814, + "grad_norm": 0.1458937675512113, + "learning_rate": 0.0001060200605552876, + "loss": 0.6927, + "step": 5552 + }, + { + "epoch": 0.4965128755364807, + "grad_norm": 0.13004319565941097, + "learning_rate": 0.00010599115271895948, + "loss": 0.6429, + "step": 5553 + }, + { + "epoch": 0.49660228898426323, + "grad_norm": 0.13953632987884154, + "learning_rate": 0.00010596224438016063, + "loss": 0.6948, + "step": 5554 + }, + { + "epoch": 0.4966917024320458, + "grad_norm": 0.1280583664080116, + "learning_rate": 0.00010593333554131552, + "loss": 0.6553, + "step": 5555 + }, + { + "epoch": 0.4967811158798283, + "grad_norm": 0.1240038895629599, + "learning_rate": 0.00010590442620484875, + "loss": 0.6578, + "step": 5556 + }, + { + "epoch": 0.4968705293276109, + "grad_norm": 0.14008698593825353, + "learning_rate": 0.00010587551637318489, + "loss": 0.6908, + "step": 5557 + }, + { + "epoch": 0.4969599427753934, + "grad_norm": 0.11720064855699953, + "learning_rate": 0.00010584660604874857, + "loss": 0.6459, + "step": 5558 + }, + { + "epoch": 0.497049356223176, + "grad_norm": 0.14259292594079045, + "learning_rate": 0.00010581769523396445, + "loss": 0.6815, + "step": 5559 + }, + { + "epoch": 0.4971387696709585, + "grad_norm": 0.1487565053101425, + "learning_rate": 0.00010578878393125724, + "loss": 0.6768, + "step": 5560 + }, + { + "epoch": 0.49722818311874106, + "grad_norm": 0.15074611907707805, + "learning_rate": 0.00010575987214305174, + "loss": 0.6741, + "step": 5561 + }, + { + "epoch": 0.4973175965665236, + "grad_norm": 0.14244230243618816, + "learning_rate": 0.0001057309598717727, + "loss": 0.6294, + "step": 5562 + }, + { + "epoch": 0.49740701001430615, + "grad_norm": 0.1316887539206744, + "learning_rate": 0.000105702047119845, + "loss": 0.6986, + "step": 5563 + }, + { + "epoch": 0.4974964234620887, + "grad_norm": 0.1329886933342604, + "learning_rate": 0.00010567313388969348, + "loss": 0.663, + "step": 5564 + }, + { + "epoch": 0.49758583690987124, + "grad_norm": 0.12670835942130249, + "learning_rate": 0.00010564422018374307, + "loss": 0.6605, + "step": 5565 + }, + { + "epoch": 0.4976752503576538, + "grad_norm": 0.1342817510986497, + "learning_rate": 0.00010561530600441873, + "loss": 0.6884, + "step": 5566 + }, + { + "epoch": 0.4977646638054363, + "grad_norm": 0.14316622630835027, + "learning_rate": 0.00010558639135414545, + "loss": 0.6286, + "step": 5567 + }, + { + "epoch": 0.4978540772532189, + "grad_norm": 0.12273755144864545, + "learning_rate": 0.00010555747623534831, + "loss": 0.6351, + "step": 5568 + }, + { + "epoch": 0.4979434907010014, + "grad_norm": 0.1253316922074813, + "learning_rate": 0.00010552856065045232, + "loss": 0.5995, + "step": 5569 + }, + { + "epoch": 0.498032904148784, + "grad_norm": 0.1185731703323, + "learning_rate": 0.00010549964460188261, + "loss": 0.6493, + "step": 5570 + }, + { + "epoch": 0.4981223175965665, + "grad_norm": 0.12949536935953757, + "learning_rate": 0.00010547072809206437, + "loss": 0.6378, + "step": 5571 + }, + { + "epoch": 0.49821173104434907, + "grad_norm": 0.13662222079901132, + "learning_rate": 0.00010544181112342278, + "loss": 0.6647, + "step": 5572 + }, + { + "epoch": 0.49830114449213164, + "grad_norm": 0.13374701147849632, + "learning_rate": 0.00010541289369838302, + "loss": 0.6428, + "step": 5573 + }, + { + "epoch": 0.49839055793991416, + "grad_norm": 0.12411454496439683, + "learning_rate": 0.00010538397581937048, + "loss": 0.6632, + "step": 5574 + }, + { + "epoch": 0.49847997138769673, + "grad_norm": 0.12610487611254836, + "learning_rate": 0.00010535505748881031, + "loss": 0.6708, + "step": 5575 + }, + { + "epoch": 0.49856938483547925, + "grad_norm": 0.13829278824704186, + "learning_rate": 0.00010532613870912799, + "loss": 0.6947, + "step": 5576 + }, + { + "epoch": 0.4986587982832618, + "grad_norm": 0.12271454539278112, + "learning_rate": 0.00010529721948274882, + "loss": 0.655, + "step": 5577 + }, + { + "epoch": 0.49874821173104433, + "grad_norm": 0.13405218214464662, + "learning_rate": 0.00010526829981209827, + "loss": 0.6455, + "step": 5578 + }, + { + "epoch": 0.4988376251788269, + "grad_norm": 0.12467663361100509, + "learning_rate": 0.00010523937969960176, + "loss": 0.6349, + "step": 5579 + }, + { + "epoch": 0.4989270386266094, + "grad_norm": 0.1312224474805621, + "learning_rate": 0.00010521045914768482, + "loss": 0.6465, + "step": 5580 + }, + { + "epoch": 0.499016452074392, + "grad_norm": 0.1294923593043134, + "learning_rate": 0.00010518153815877294, + "loss": 0.6006, + "step": 5581 + }, + { + "epoch": 0.4991058655221745, + "grad_norm": 0.13329466272877527, + "learning_rate": 0.00010515261673529173, + "loss": 0.6766, + "step": 5582 + }, + { + "epoch": 0.4991952789699571, + "grad_norm": 0.142695712357687, + "learning_rate": 0.00010512369487966678, + "loss": 0.649, + "step": 5583 + }, + { + "epoch": 0.49928469241773965, + "grad_norm": 0.1132947346297427, + "learning_rate": 0.00010509477259432372, + "loss": 0.618, + "step": 5584 + }, + { + "epoch": 0.49937410586552217, + "grad_norm": 0.12746146551503132, + "learning_rate": 0.00010506584988168824, + "loss": 0.6669, + "step": 5585 + }, + { + "epoch": 0.49946351931330474, + "grad_norm": 0.12012879082877045, + "learning_rate": 0.00010503692674418603, + "loss": 0.6351, + "step": 5586 + }, + { + "epoch": 0.49955293276108725, + "grad_norm": 0.12328028336443093, + "learning_rate": 0.00010500800318424286, + "loss": 0.6701, + "step": 5587 + }, + { + "epoch": 0.4996423462088698, + "grad_norm": 0.1165072479540527, + "learning_rate": 0.00010497907920428454, + "loss": 0.624, + "step": 5588 + }, + { + "epoch": 0.49973175965665234, + "grad_norm": 0.13469163768848177, + "learning_rate": 0.00010495015480673685, + "loss": 0.6822, + "step": 5589 + }, + { + "epoch": 0.4998211731044349, + "grad_norm": 0.1186547494737121, + "learning_rate": 0.00010492122999402562, + "loss": 0.6114, + "step": 5590 + }, + { + "epoch": 0.49991058655221743, + "grad_norm": 0.11107053539066046, + "learning_rate": 0.00010489230476857681, + "loss": 0.6113, + "step": 5591 + }, + { + "epoch": 0.5, + "grad_norm": 0.138418554428364, + "learning_rate": 0.00010486337913281632, + "loss": 0.6691, + "step": 5592 + }, + { + "epoch": 0.5000894134477826, + "grad_norm": 0.13968730755701297, + "learning_rate": 0.00010483445308917006, + "loss": 0.6903, + "step": 5593 + }, + { + "epoch": 0.5001788268955651, + "grad_norm": 0.12597446059605757, + "learning_rate": 0.00010480552664006406, + "loss": 0.6408, + "step": 5594 + }, + { + "epoch": 0.5002682403433476, + "grad_norm": 0.10886602338445497, + "learning_rate": 0.00010477659978792438, + "loss": 0.6328, + "step": 5595 + }, + { + "epoch": 0.5003576537911302, + "grad_norm": 0.11274809590416329, + "learning_rate": 0.00010474767253517701, + "loss": 0.6693, + "step": 5596 + }, + { + "epoch": 0.5004470672389127, + "grad_norm": 0.125951002599933, + "learning_rate": 0.00010471874488424808, + "loss": 0.6362, + "step": 5597 + }, + { + "epoch": 0.5005364806866953, + "grad_norm": 0.1505565616081237, + "learning_rate": 0.00010468981683756373, + "loss": 0.6761, + "step": 5598 + }, + { + "epoch": 0.5006258941344778, + "grad_norm": 0.11425298437120984, + "learning_rate": 0.00010466088839755012, + "loss": 0.6358, + "step": 5599 + }, + { + "epoch": 0.5007153075822603, + "grad_norm": 0.12730112351116338, + "learning_rate": 0.00010463195956663338, + "loss": 0.6455, + "step": 5600 + }, + { + "epoch": 0.5008047210300429, + "grad_norm": 0.14098732888398507, + "learning_rate": 0.00010460303034723985, + "loss": 0.6709, + "step": 5601 + }, + { + "epoch": 0.5008941344778255, + "grad_norm": 0.11620401171044842, + "learning_rate": 0.00010457410074179568, + "loss": 0.6274, + "step": 5602 + }, + { + "epoch": 0.5009835479256081, + "grad_norm": 0.13938531260669315, + "learning_rate": 0.00010454517075272721, + "loss": 0.6013, + "step": 5603 + }, + { + "epoch": 0.5010729613733905, + "grad_norm": 0.13407371753647276, + "learning_rate": 0.00010451624038246075, + "loss": 0.6504, + "step": 5604 + }, + { + "epoch": 0.5011623748211731, + "grad_norm": 0.15213760185006886, + "learning_rate": 0.00010448730963342268, + "loss": 0.7035, + "step": 5605 + }, + { + "epoch": 0.5012517882689557, + "grad_norm": 0.12485320044860702, + "learning_rate": 0.00010445837850803939, + "loss": 0.691, + "step": 5606 + }, + { + "epoch": 0.5013412017167382, + "grad_norm": 0.13363048057908294, + "learning_rate": 0.00010442944700873722, + "loss": 0.6939, + "step": 5607 + }, + { + "epoch": 0.5014306151645207, + "grad_norm": 0.12370670750250415, + "learning_rate": 0.00010440051513794271, + "loss": 0.6608, + "step": 5608 + }, + { + "epoch": 0.5015200286123033, + "grad_norm": 0.1484051750235618, + "learning_rate": 0.00010437158289808233, + "loss": 0.697, + "step": 5609 + }, + { + "epoch": 0.5016094420600858, + "grad_norm": 0.13375400030282575, + "learning_rate": 0.00010434265029158254, + "loss": 0.6635, + "step": 5610 + }, + { + "epoch": 0.5016988555078684, + "grad_norm": 0.1354027853460153, + "learning_rate": 0.00010431371732086994, + "loss": 0.6528, + "step": 5611 + }, + { + "epoch": 0.501788268955651, + "grad_norm": 0.12565251460978963, + "learning_rate": 0.00010428478398837107, + "loss": 0.682, + "step": 5612 + }, + { + "epoch": 0.5018776824034334, + "grad_norm": 0.13438871456054244, + "learning_rate": 0.00010425585029651252, + "loss": 0.6736, + "step": 5613 + }, + { + "epoch": 0.501967095851216, + "grad_norm": 0.1364760537053795, + "learning_rate": 0.00010422691624772097, + "loss": 0.7, + "step": 5614 + }, + { + "epoch": 0.5020565092989986, + "grad_norm": 0.10990983547444878, + "learning_rate": 0.00010419798184442307, + "loss": 0.6127, + "step": 5615 + }, + { + "epoch": 0.5021459227467812, + "grad_norm": 0.1270280260962841, + "learning_rate": 0.00010416904708904548, + "loss": 0.6185, + "step": 5616 + }, + { + "epoch": 0.5022353361945636, + "grad_norm": 0.14216132268583284, + "learning_rate": 0.00010414011198401492, + "loss": 0.6765, + "step": 5617 + }, + { + "epoch": 0.5023247496423462, + "grad_norm": 0.11608789344315064, + "learning_rate": 0.00010411117653175821, + "loss": 0.6457, + "step": 5618 + }, + { + "epoch": 0.5024141630901288, + "grad_norm": 0.14367632601432895, + "learning_rate": 0.0001040822407347021, + "loss": 0.7068, + "step": 5619 + }, + { + "epoch": 0.5025035765379113, + "grad_norm": 0.13272040210616262, + "learning_rate": 0.00010405330459527336, + "loss": 0.6524, + "step": 5620 + }, + { + "epoch": 0.5025929899856938, + "grad_norm": 0.1533158937103631, + "learning_rate": 0.00010402436811589887, + "loss": 0.7084, + "step": 5621 + }, + { + "epoch": 0.5026824034334764, + "grad_norm": 0.14249099810816673, + "learning_rate": 0.00010399543129900549, + "loss": 0.6978, + "step": 5622 + }, + { + "epoch": 0.5027718168812589, + "grad_norm": 0.13867393041253873, + "learning_rate": 0.00010396649414702011, + "loss": 0.6555, + "step": 5623 + }, + { + "epoch": 0.5028612303290415, + "grad_norm": 0.1303600171599239, + "learning_rate": 0.00010393755666236962, + "loss": 0.6196, + "step": 5624 + }, + { + "epoch": 0.5029506437768241, + "grad_norm": 0.15839283762075157, + "learning_rate": 0.00010390861884748107, + "loss": 0.7019, + "step": 5625 + }, + { + "epoch": 0.5030400572246065, + "grad_norm": 0.12134587777685629, + "learning_rate": 0.00010387968070478136, + "loss": 0.6849, + "step": 5626 + }, + { + "epoch": 0.5031294706723891, + "grad_norm": 0.13038829060326107, + "learning_rate": 0.00010385074223669748, + "loss": 0.6123, + "step": 5627 + }, + { + "epoch": 0.5032188841201717, + "grad_norm": 0.14142776461511256, + "learning_rate": 0.0001038218034456565, + "loss": 0.6922, + "step": 5628 + }, + { + "epoch": 0.5033082975679543, + "grad_norm": 0.13744293271942506, + "learning_rate": 0.00010379286433408553, + "loss": 0.6784, + "step": 5629 + }, + { + "epoch": 0.5033977110157367, + "grad_norm": 0.1588363338941727, + "learning_rate": 0.00010376392490441158, + "loss": 0.6711, + "step": 5630 + }, + { + "epoch": 0.5034871244635193, + "grad_norm": 0.1416670206214228, + "learning_rate": 0.00010373498515906177, + "loss": 0.6895, + "step": 5631 + }, + { + "epoch": 0.5035765379113019, + "grad_norm": 0.14785871181247875, + "learning_rate": 0.00010370604510046331, + "loss": 0.6526, + "step": 5632 + }, + { + "epoch": 0.5036659513590844, + "grad_norm": 0.1365182912993476, + "learning_rate": 0.00010367710473104331, + "loss": 0.6858, + "step": 5633 + }, + { + "epoch": 0.503755364806867, + "grad_norm": 0.1378455347886249, + "learning_rate": 0.00010364816405322895, + "loss": 0.663, + "step": 5634 + }, + { + "epoch": 0.5038447782546495, + "grad_norm": 0.1269198958966511, + "learning_rate": 0.00010361922306944751, + "loss": 0.6304, + "step": 5635 + }, + { + "epoch": 0.503934191702432, + "grad_norm": 0.1245895184397009, + "learning_rate": 0.0001035902817821262, + "loss": 0.6524, + "step": 5636 + }, + { + "epoch": 0.5040236051502146, + "grad_norm": 0.13054984843884848, + "learning_rate": 0.00010356134019369227, + "loss": 0.6435, + "step": 5637 + }, + { + "epoch": 0.5041130185979972, + "grad_norm": 0.12160941436087076, + "learning_rate": 0.00010353239830657304, + "loss": 0.6053, + "step": 5638 + }, + { + "epoch": 0.5042024320457796, + "grad_norm": 0.1450629174147895, + "learning_rate": 0.00010350345612319586, + "loss": 0.6966, + "step": 5639 + }, + { + "epoch": 0.5042918454935622, + "grad_norm": 0.1470179024470026, + "learning_rate": 0.00010347451364598804, + "loss": 0.6863, + "step": 5640 + }, + { + "epoch": 0.5043812589413448, + "grad_norm": 0.10148936163845484, + "learning_rate": 0.00010344557087737692, + "loss": 0.6115, + "step": 5641 + }, + { + "epoch": 0.5044706723891274, + "grad_norm": 0.15255799051794444, + "learning_rate": 0.00010341662781978996, + "loss": 0.7148, + "step": 5642 + }, + { + "epoch": 0.5045600858369099, + "grad_norm": 0.12630005623276178, + "learning_rate": 0.00010338768447565457, + "loss": 0.6432, + "step": 5643 + }, + { + "epoch": 0.5046494992846924, + "grad_norm": 0.13240450342300275, + "learning_rate": 0.00010335874084739814, + "loss": 0.6247, + "step": 5644 + }, + { + "epoch": 0.504738912732475, + "grad_norm": 0.12568687039626622, + "learning_rate": 0.00010332979693744815, + "loss": 0.6144, + "step": 5645 + }, + { + "epoch": 0.5048283261802575, + "grad_norm": 0.12402227109266759, + "learning_rate": 0.00010330085274823218, + "loss": 0.678, + "step": 5646 + }, + { + "epoch": 0.5049177396280401, + "grad_norm": 0.13273512751551922, + "learning_rate": 0.00010327190828217763, + "loss": 0.6726, + "step": 5647 + }, + { + "epoch": 0.5050071530758226, + "grad_norm": 0.14972630789793723, + "learning_rate": 0.00010324296354171207, + "loss": 0.6468, + "step": 5648 + }, + { + "epoch": 0.5050965665236051, + "grad_norm": 0.1385819325899346, + "learning_rate": 0.00010321401852926312, + "loss": 0.675, + "step": 5649 + }, + { + "epoch": 0.5051859799713877, + "grad_norm": 0.13136527241070464, + "learning_rate": 0.0001031850732472583, + "loss": 0.6887, + "step": 5650 + }, + { + "epoch": 0.5052753934191703, + "grad_norm": 0.11936257593146918, + "learning_rate": 0.00010315612769812524, + "loss": 0.6449, + "step": 5651 + }, + { + "epoch": 0.5053648068669528, + "grad_norm": 0.12769011917968384, + "learning_rate": 0.00010312718188429154, + "loss": 0.6759, + "step": 5652 + }, + { + "epoch": 0.5054542203147353, + "grad_norm": 0.14515776497022206, + "learning_rate": 0.00010309823580818489, + "loss": 0.6992, + "step": 5653 + }, + { + "epoch": 0.5055436337625179, + "grad_norm": 0.12023176013429844, + "learning_rate": 0.00010306928947223294, + "loss": 0.6462, + "step": 5654 + }, + { + "epoch": 0.5056330472103004, + "grad_norm": 0.14050527634834828, + "learning_rate": 0.00010304034287886337, + "loss": 0.688, + "step": 5655 + }, + { + "epoch": 0.505722460658083, + "grad_norm": 0.13908487865671945, + "learning_rate": 0.00010301139603050394, + "loss": 0.6432, + "step": 5656 + }, + { + "epoch": 0.5058118741058655, + "grad_norm": 0.13439401604560708, + "learning_rate": 0.00010298244892958235, + "loss": 0.6541, + "step": 5657 + }, + { + "epoch": 0.505901287553648, + "grad_norm": 0.12289216875710846, + "learning_rate": 0.00010295350157852637, + "loss": 0.6571, + "step": 5658 + }, + { + "epoch": 0.5059907010014306, + "grad_norm": 0.12674684818828527, + "learning_rate": 0.00010292455397976379, + "loss": 0.6336, + "step": 5659 + }, + { + "epoch": 0.5060801144492132, + "grad_norm": 0.13378872189764918, + "learning_rate": 0.0001028956061357224, + "loss": 0.673, + "step": 5660 + }, + { + "epoch": 0.5061695278969958, + "grad_norm": 0.13944706824170108, + "learning_rate": 0.00010286665804883, + "loss": 0.671, + "step": 5661 + }, + { + "epoch": 0.5062589413447782, + "grad_norm": 0.1413270837006361, + "learning_rate": 0.00010283770972151445, + "loss": 0.6668, + "step": 5662 + }, + { + "epoch": 0.5063483547925608, + "grad_norm": 0.14385648072489973, + "learning_rate": 0.00010280876115620365, + "loss": 0.6756, + "step": 5663 + }, + { + "epoch": 0.5064377682403434, + "grad_norm": 0.1423845064819424, + "learning_rate": 0.00010277981235532541, + "loss": 0.6697, + "step": 5664 + }, + { + "epoch": 0.5065271816881259, + "grad_norm": 0.12232860017821855, + "learning_rate": 0.00010275086332130768, + "loss": 0.625, + "step": 5665 + }, + { + "epoch": 0.5066165951359084, + "grad_norm": 0.13002376795796314, + "learning_rate": 0.00010272191405657836, + "loss": 0.666, + "step": 5666 + }, + { + "epoch": 0.506706008583691, + "grad_norm": 0.13182469725000906, + "learning_rate": 0.00010269296456356541, + "loss": 0.5668, + "step": 5667 + }, + { + "epoch": 0.5067954220314735, + "grad_norm": 0.12545454604496586, + "learning_rate": 0.00010266401484469674, + "loss": 0.5976, + "step": 5668 + }, + { + "epoch": 0.5068848354792561, + "grad_norm": 0.1575567539052478, + "learning_rate": 0.00010263506490240038, + "loss": 0.6966, + "step": 5669 + }, + { + "epoch": 0.5069742489270386, + "grad_norm": 0.14298859159420615, + "learning_rate": 0.00010260611473910433, + "loss": 0.6072, + "step": 5670 + }, + { + "epoch": 0.5070636623748211, + "grad_norm": 0.12760686161082185, + "learning_rate": 0.00010257716435723656, + "loss": 0.6305, + "step": 5671 + }, + { + "epoch": 0.5071530758226037, + "grad_norm": 0.1209305652512357, + "learning_rate": 0.00010254821375922512, + "loss": 0.6191, + "step": 5672 + }, + { + "epoch": 0.5072424892703863, + "grad_norm": 0.13154616776333694, + "learning_rate": 0.0001025192629474981, + "loss": 0.6503, + "step": 5673 + }, + { + "epoch": 0.5073319027181689, + "grad_norm": 0.14950868653646324, + "learning_rate": 0.0001024903119244835, + "loss": 0.6797, + "step": 5674 + }, + { + "epoch": 0.5074213161659513, + "grad_norm": 0.1500474668950626, + "learning_rate": 0.0001024613606926095, + "loss": 0.6589, + "step": 5675 + }, + { + "epoch": 0.5075107296137339, + "grad_norm": 0.1419647113723438, + "learning_rate": 0.00010243240925430411, + "loss": 0.6818, + "step": 5676 + }, + { + "epoch": 0.5076001430615165, + "grad_norm": 0.11809342606065618, + "learning_rate": 0.00010240345761199553, + "loss": 0.6342, + "step": 5677 + }, + { + "epoch": 0.507689556509299, + "grad_norm": 0.12378495414320947, + "learning_rate": 0.0001023745057681118, + "loss": 0.6346, + "step": 5678 + }, + { + "epoch": 0.5077789699570815, + "grad_norm": 0.12237455123763111, + "learning_rate": 0.00010234555372508119, + "loss": 0.65, + "step": 5679 + }, + { + "epoch": 0.5078683834048641, + "grad_norm": 0.1334258581494126, + "learning_rate": 0.00010231660148533183, + "loss": 0.6815, + "step": 5680 + }, + { + "epoch": 0.5079577968526466, + "grad_norm": 0.13899344786621734, + "learning_rate": 0.00010228764905129184, + "loss": 0.5853, + "step": 5681 + }, + { + "epoch": 0.5080472103004292, + "grad_norm": 0.1324482764531228, + "learning_rate": 0.00010225869642538955, + "loss": 0.6634, + "step": 5682 + }, + { + "epoch": 0.5081366237482118, + "grad_norm": 0.13649032923383653, + "learning_rate": 0.00010222974361005309, + "loss": 0.6301, + "step": 5683 + }, + { + "epoch": 0.5082260371959942, + "grad_norm": 0.14305635053548946, + "learning_rate": 0.00010220079060771075, + "loss": 0.6644, + "step": 5684 + }, + { + "epoch": 0.5083154506437768, + "grad_norm": 0.12777968568580528, + "learning_rate": 0.00010217183742079073, + "loss": 0.6655, + "step": 5685 + }, + { + "epoch": 0.5084048640915594, + "grad_norm": 0.13825623150257724, + "learning_rate": 0.00010214288405172133, + "loss": 0.6668, + "step": 5686 + }, + { + "epoch": 0.508494277539342, + "grad_norm": 0.11452622307466262, + "learning_rate": 0.00010211393050293083, + "loss": 0.6517, + "step": 5687 + }, + { + "epoch": 0.5085836909871244, + "grad_norm": 0.12843716094482188, + "learning_rate": 0.00010208497677684754, + "loss": 0.6168, + "step": 5688 + }, + { + "epoch": 0.508673104434907, + "grad_norm": 0.12141978895904403, + "learning_rate": 0.0001020560228758998, + "loss": 0.6568, + "step": 5689 + }, + { + "epoch": 0.5087625178826896, + "grad_norm": 0.11711338629872386, + "learning_rate": 0.00010202706880251584, + "loss": 0.6468, + "step": 5690 + }, + { + "epoch": 0.5088519313304721, + "grad_norm": 0.13551993314136576, + "learning_rate": 0.00010199811455912412, + "loss": 0.7043, + "step": 5691 + }, + { + "epoch": 0.5089413447782547, + "grad_norm": 0.12352381555420987, + "learning_rate": 0.00010196916014815292, + "loss": 0.5893, + "step": 5692 + }, + { + "epoch": 0.5090307582260372, + "grad_norm": 0.11800944321387134, + "learning_rate": 0.00010194020557203063, + "loss": 0.637, + "step": 5693 + }, + { + "epoch": 0.5091201716738197, + "grad_norm": 0.12109934575777752, + "learning_rate": 0.00010191125083318566, + "loss": 0.6295, + "step": 5694 + }, + { + "epoch": 0.5092095851216023, + "grad_norm": 0.11928614908938776, + "learning_rate": 0.00010188229593404639, + "loss": 0.6547, + "step": 5695 + }, + { + "epoch": 0.5092989985693849, + "grad_norm": 0.12613911564989644, + "learning_rate": 0.00010185334087704124, + "loss": 0.644, + "step": 5696 + }, + { + "epoch": 0.5093884120171673, + "grad_norm": 0.12013678262816065, + "learning_rate": 0.0001018243856645986, + "loss": 0.6343, + "step": 5697 + }, + { + "epoch": 0.5094778254649499, + "grad_norm": 0.15576777189797453, + "learning_rate": 0.00010179543029914695, + "loss": 0.7092, + "step": 5698 + }, + { + "epoch": 0.5095672389127325, + "grad_norm": 0.13727766059308202, + "learning_rate": 0.00010176647478311473, + "loss": 0.653, + "step": 5699 + }, + { + "epoch": 0.509656652360515, + "grad_norm": 0.12050050772728157, + "learning_rate": 0.00010173751911893041, + "loss": 0.6294, + "step": 5700 + }, + { + "epoch": 0.5097460658082976, + "grad_norm": 0.13004494574623643, + "learning_rate": 0.00010170856330902247, + "loss": 0.671, + "step": 5701 + }, + { + "epoch": 0.5098354792560801, + "grad_norm": 0.15116691021791154, + "learning_rate": 0.00010167960735581936, + "loss": 0.6834, + "step": 5702 + }, + { + "epoch": 0.5099248927038627, + "grad_norm": 0.1403928773387079, + "learning_rate": 0.00010165065126174962, + "loss": 0.6292, + "step": 5703 + }, + { + "epoch": 0.5100143061516452, + "grad_norm": 0.13245927092248166, + "learning_rate": 0.00010162169502924177, + "loss": 0.6572, + "step": 5704 + }, + { + "epoch": 0.5101037195994278, + "grad_norm": 0.1311432350818624, + "learning_rate": 0.0001015927386607243, + "loss": 0.661, + "step": 5705 + }, + { + "epoch": 0.5101931330472103, + "grad_norm": 0.12713953972778216, + "learning_rate": 0.00010156378215862578, + "loss": 0.6637, + "step": 5706 + }, + { + "epoch": 0.5102825464949928, + "grad_norm": 0.1461746430686644, + "learning_rate": 0.00010153482552537472, + "loss": 0.7043, + "step": 5707 + }, + { + "epoch": 0.5103719599427754, + "grad_norm": 0.1401572050568088, + "learning_rate": 0.00010150586876339969, + "loss": 0.6529, + "step": 5708 + }, + { + "epoch": 0.510461373390558, + "grad_norm": 0.14507180230955163, + "learning_rate": 0.00010147691187512928, + "loss": 0.6484, + "step": 5709 + }, + { + "epoch": 0.5105507868383404, + "grad_norm": 0.1423228436685427, + "learning_rate": 0.00010144795486299205, + "loss": 0.6281, + "step": 5710 + }, + { + "epoch": 0.510640200286123, + "grad_norm": 0.1366799532574586, + "learning_rate": 0.0001014189977294166, + "loss": 0.6422, + "step": 5711 + }, + { + "epoch": 0.5107296137339056, + "grad_norm": 0.13170181716019003, + "learning_rate": 0.00010139004047683151, + "loss": 0.6531, + "step": 5712 + }, + { + "epoch": 0.5108190271816881, + "grad_norm": 0.11940734069417458, + "learning_rate": 0.00010136108310766544, + "loss": 0.6285, + "step": 5713 + }, + { + "epoch": 0.5109084406294707, + "grad_norm": 0.1366385897935516, + "learning_rate": 0.00010133212562434693, + "loss": 0.6595, + "step": 5714 + }, + { + "epoch": 0.5109978540772532, + "grad_norm": 0.15923515809910757, + "learning_rate": 0.00010130316802930467, + "loss": 0.6065, + "step": 5715 + }, + { + "epoch": 0.5110872675250357, + "grad_norm": 0.131010555963119, + "learning_rate": 0.00010127421032496729, + "loss": 0.6454, + "step": 5716 + }, + { + "epoch": 0.5111766809728183, + "grad_norm": 0.14375427405327904, + "learning_rate": 0.00010124525251376342, + "loss": 0.6721, + "step": 5717 + }, + { + "epoch": 0.5112660944206009, + "grad_norm": 0.13662797419725967, + "learning_rate": 0.00010121629459812172, + "loss": 0.6634, + "step": 5718 + }, + { + "epoch": 0.5113555078683834, + "grad_norm": 0.1403001987147886, + "learning_rate": 0.00010118733658047088, + "loss": 0.6439, + "step": 5719 + }, + { + "epoch": 0.5114449213161659, + "grad_norm": 0.1318948753350572, + "learning_rate": 0.00010115837846323954, + "loss": 0.6763, + "step": 5720 + }, + { + "epoch": 0.5115343347639485, + "grad_norm": 0.14557540969486638, + "learning_rate": 0.00010112942024885639, + "loss": 0.6842, + "step": 5721 + }, + { + "epoch": 0.5116237482117311, + "grad_norm": 0.1422064249205326, + "learning_rate": 0.00010110046193975014, + "loss": 0.6651, + "step": 5722 + }, + { + "epoch": 0.5117131616595136, + "grad_norm": 0.11654529710170293, + "learning_rate": 0.0001010715035383495, + "loss": 0.6416, + "step": 5723 + }, + { + "epoch": 0.5118025751072961, + "grad_norm": 0.12024831259373309, + "learning_rate": 0.00010104254504708311, + "loss": 0.6402, + "step": 5724 + }, + { + "epoch": 0.5118919885550787, + "grad_norm": 0.14506584891803387, + "learning_rate": 0.00010101358646837971, + "loss": 0.6604, + "step": 5725 + }, + { + "epoch": 0.5119814020028612, + "grad_norm": 0.1353143289524487, + "learning_rate": 0.00010098462780466808, + "loss": 0.6158, + "step": 5726 + }, + { + "epoch": 0.5120708154506438, + "grad_norm": 0.13153612322940855, + "learning_rate": 0.00010095566905837692, + "loss": 0.5876, + "step": 5727 + }, + { + "epoch": 0.5121602288984263, + "grad_norm": 0.12403798562211689, + "learning_rate": 0.00010092671023193491, + "loss": 0.6338, + "step": 5728 + }, + { + "epoch": 0.5122496423462088, + "grad_norm": 0.14712516440391887, + "learning_rate": 0.00010089775132777084, + "loss": 0.6822, + "step": 5729 + }, + { + "epoch": 0.5123390557939914, + "grad_norm": 0.11930439167419309, + "learning_rate": 0.00010086879234831345, + "loss": 0.64, + "step": 5730 + }, + { + "epoch": 0.512428469241774, + "grad_norm": 0.1202956236271754, + "learning_rate": 0.00010083983329599151, + "loss": 0.6173, + "step": 5731 + }, + { + "epoch": 0.5125178826895566, + "grad_norm": 0.11213967300627982, + "learning_rate": 0.00010081087417323374, + "loss": 0.6255, + "step": 5732 + }, + { + "epoch": 0.512607296137339, + "grad_norm": 0.1340512303159191, + "learning_rate": 0.00010078191498246897, + "loss": 0.6437, + "step": 5733 + }, + { + "epoch": 0.5126967095851216, + "grad_norm": 0.12295089935443664, + "learning_rate": 0.00010075295572612593, + "loss": 0.6474, + "step": 5734 + }, + { + "epoch": 0.5127861230329042, + "grad_norm": 0.13645705265458652, + "learning_rate": 0.00010072399640663334, + "loss": 0.6454, + "step": 5735 + }, + { + "epoch": 0.5128755364806867, + "grad_norm": 0.1210404605411335, + "learning_rate": 0.00010069503702642011, + "loss": 0.617, + "step": 5736 + }, + { + "epoch": 0.5129649499284692, + "grad_norm": 0.13297665307710382, + "learning_rate": 0.00010066607758791495, + "loss": 0.6372, + "step": 5737 + }, + { + "epoch": 0.5130543633762518, + "grad_norm": 0.16381323082102445, + "learning_rate": 0.00010063711809354665, + "loss": 0.6748, + "step": 5738 + }, + { + "epoch": 0.5131437768240343, + "grad_norm": 0.12833086794216408, + "learning_rate": 0.00010060815854574403, + "loss": 0.6756, + "step": 5739 + }, + { + "epoch": 0.5132331902718169, + "grad_norm": 0.12227450840519098, + "learning_rate": 0.00010057919894693593, + "loss": 0.6442, + "step": 5740 + }, + { + "epoch": 0.5133226037195995, + "grad_norm": 0.13864493342896786, + "learning_rate": 0.00010055023929955106, + "loss": 0.6935, + "step": 5741 + }, + { + "epoch": 0.5134120171673819, + "grad_norm": 0.1238572627804214, + "learning_rate": 0.00010052127960601829, + "loss": 0.6237, + "step": 5742 + }, + { + "epoch": 0.5135014306151645, + "grad_norm": 0.11968675020874933, + "learning_rate": 0.00010049231986876646, + "loss": 0.5836, + "step": 5743 + }, + { + "epoch": 0.5135908440629471, + "grad_norm": 0.14347055401354203, + "learning_rate": 0.00010046336009022435, + "loss": 0.6472, + "step": 5744 + }, + { + "epoch": 0.5136802575107297, + "grad_norm": 0.12554154769535367, + "learning_rate": 0.00010043440027282078, + "loss": 0.6645, + "step": 5745 + }, + { + "epoch": 0.5137696709585121, + "grad_norm": 0.1349415576513135, + "learning_rate": 0.00010040544041898456, + "loss": 0.6421, + "step": 5746 + }, + { + "epoch": 0.5138590844062947, + "grad_norm": 0.1305177045269254, + "learning_rate": 0.0001003764805311446, + "loss": 0.6779, + "step": 5747 + }, + { + "epoch": 0.5139484978540773, + "grad_norm": 0.13738435922372055, + "learning_rate": 0.00010034752061172961, + "loss": 0.6797, + "step": 5748 + }, + { + "epoch": 0.5140379113018598, + "grad_norm": 0.1312305534243374, + "learning_rate": 0.00010031856066316852, + "loss": 0.643, + "step": 5749 + }, + { + "epoch": 0.5141273247496424, + "grad_norm": 0.1252412553301454, + "learning_rate": 0.00010028960068789012, + "loss": 0.6099, + "step": 5750 + }, + { + "epoch": 0.5142167381974249, + "grad_norm": 0.147989511208777, + "learning_rate": 0.00010026064068832328, + "loss": 0.7084, + "step": 5751 + }, + { + "epoch": 0.5143061516452074, + "grad_norm": 0.15394860740558408, + "learning_rate": 0.00010023168066689677, + "loss": 0.669, + "step": 5752 + }, + { + "epoch": 0.51439556509299, + "grad_norm": 0.12551973591628965, + "learning_rate": 0.00010020272062603953, + "loss": 0.6423, + "step": 5753 + }, + { + "epoch": 0.5144849785407726, + "grad_norm": 0.12607224416348983, + "learning_rate": 0.00010017376056818035, + "loss": 0.6522, + "step": 5754 + }, + { + "epoch": 0.514574391988555, + "grad_norm": 0.13707356404081145, + "learning_rate": 0.00010014480049574808, + "loss": 0.6385, + "step": 5755 + }, + { + "epoch": 0.5146638054363376, + "grad_norm": 0.13052483974764537, + "learning_rate": 0.00010011584041117155, + "loss": 0.6547, + "step": 5756 + }, + { + "epoch": 0.5147532188841202, + "grad_norm": 0.1269539270133717, + "learning_rate": 0.00010008688031687964, + "loss": 0.6782, + "step": 5757 + }, + { + "epoch": 0.5148426323319027, + "grad_norm": 0.15259690931169234, + "learning_rate": 0.00010005792021530121, + "loss": 0.6453, + "step": 5758 + }, + { + "epoch": 0.5149320457796852, + "grad_norm": 0.13400835682675233, + "learning_rate": 0.000100028960108865, + "loss": 0.6496, + "step": 5759 + }, + { + "epoch": 0.5150214592274678, + "grad_norm": 0.11699659002239046, + "learning_rate": 0.0001, + "loss": 0.6095, + "step": 5760 + }, + { + "epoch": 0.5151108726752504, + "grad_norm": 0.12785032599767654, + "learning_rate": 9.997103989113501e-05, + "loss": 0.6516, + "step": 5761 + }, + { + "epoch": 0.5152002861230329, + "grad_norm": 0.12693043285411873, + "learning_rate": 9.994207978469885e-05, + "loss": 0.6723, + "step": 5762 + }, + { + "epoch": 0.5152896995708155, + "grad_norm": 0.11923839706560693, + "learning_rate": 9.991311968312039e-05, + "loss": 0.6488, + "step": 5763 + }, + { + "epoch": 0.515379113018598, + "grad_norm": 0.13041191412774528, + "learning_rate": 9.988415958882845e-05, + "loss": 0.6182, + "step": 5764 + }, + { + "epoch": 0.5154685264663805, + "grad_norm": 0.1250078722894622, + "learning_rate": 9.985519950425196e-05, + "loss": 0.6357, + "step": 5765 + }, + { + "epoch": 0.5155579399141631, + "grad_norm": 0.1278309376503686, + "learning_rate": 9.982623943181966e-05, + "loss": 0.6679, + "step": 5766 + }, + { + "epoch": 0.5156473533619457, + "grad_norm": 0.13112008568648725, + "learning_rate": 9.979727937396048e-05, + "loss": 0.6539, + "step": 5767 + }, + { + "epoch": 0.5157367668097281, + "grad_norm": 0.1347277350341562, + "learning_rate": 9.976831933310324e-05, + "loss": 0.5779, + "step": 5768 + }, + { + "epoch": 0.5158261802575107, + "grad_norm": 0.1391071019119728, + "learning_rate": 9.973935931167677e-05, + "loss": 0.6634, + "step": 5769 + }, + { + "epoch": 0.5159155937052933, + "grad_norm": 0.12942380498589745, + "learning_rate": 9.971039931210993e-05, + "loss": 0.6309, + "step": 5770 + }, + { + "epoch": 0.5160050071530758, + "grad_norm": 0.14294559425330408, + "learning_rate": 9.968143933683149e-05, + "loss": 0.6866, + "step": 5771 + }, + { + "epoch": 0.5160944206008584, + "grad_norm": 0.12999720253034822, + "learning_rate": 9.965247938827041e-05, + "loss": 0.6139, + "step": 5772 + }, + { + "epoch": 0.5161838340486409, + "grad_norm": 0.13983646434613176, + "learning_rate": 9.962351946885544e-05, + "loss": 0.6385, + "step": 5773 + }, + { + "epoch": 0.5162732474964234, + "grad_norm": 0.13716824109001144, + "learning_rate": 9.959455958101546e-05, + "loss": 0.7021, + "step": 5774 + }, + { + "epoch": 0.516362660944206, + "grad_norm": 0.13916276121749363, + "learning_rate": 9.956559972717925e-05, + "loss": 0.6891, + "step": 5775 + }, + { + "epoch": 0.5164520743919886, + "grad_norm": 0.1479530714943822, + "learning_rate": 9.953663990977568e-05, + "loss": 0.6667, + "step": 5776 + }, + { + "epoch": 0.516541487839771, + "grad_norm": 0.13758427234027898, + "learning_rate": 9.950768013123358e-05, + "loss": 0.6656, + "step": 5777 + }, + { + "epoch": 0.5166309012875536, + "grad_norm": 0.14710264261974598, + "learning_rate": 9.94787203939817e-05, + "loss": 0.6651, + "step": 5778 + }, + { + "epoch": 0.5167203147353362, + "grad_norm": 0.11742951472459653, + "learning_rate": 9.944976070044894e-05, + "loss": 0.644, + "step": 5779 + }, + { + "epoch": 0.5168097281831188, + "grad_norm": 0.13981407883280653, + "learning_rate": 9.94208010530641e-05, + "loss": 0.6637, + "step": 5780 + }, + { + "epoch": 0.5168991416309013, + "grad_norm": 0.1243526476812858, + "learning_rate": 9.939184145425598e-05, + "loss": 0.6938, + "step": 5781 + }, + { + "epoch": 0.5169885550786838, + "grad_norm": 0.12093203109025728, + "learning_rate": 9.936288190645336e-05, + "loss": 0.6476, + "step": 5782 + }, + { + "epoch": 0.5170779685264664, + "grad_norm": 0.12155855213391721, + "learning_rate": 9.933392241208507e-05, + "loss": 0.6409, + "step": 5783 + }, + { + "epoch": 0.5171673819742489, + "grad_norm": 0.14591097342669607, + "learning_rate": 9.930496297357993e-05, + "loss": 0.6625, + "step": 5784 + }, + { + "epoch": 0.5172567954220315, + "grad_norm": 0.17109781705082935, + "learning_rate": 9.927600359336666e-05, + "loss": 0.6818, + "step": 5785 + }, + { + "epoch": 0.517346208869814, + "grad_norm": 0.14863995720853704, + "learning_rate": 9.92470442738741e-05, + "loss": 0.6306, + "step": 5786 + }, + { + "epoch": 0.5174356223175965, + "grad_norm": 0.13502456493515252, + "learning_rate": 9.921808501753106e-05, + "loss": 0.6218, + "step": 5787 + }, + { + "epoch": 0.5175250357653791, + "grad_norm": 0.15251174496466519, + "learning_rate": 9.918912582676629e-05, + "loss": 0.6603, + "step": 5788 + }, + { + "epoch": 0.5176144492131617, + "grad_norm": 0.13305268040960738, + "learning_rate": 9.916016670400851e-05, + "loss": 0.6509, + "step": 5789 + }, + { + "epoch": 0.5177038626609443, + "grad_norm": 0.13069429677576092, + "learning_rate": 9.913120765168657e-05, + "loss": 0.6275, + "step": 5790 + }, + { + "epoch": 0.5177932761087267, + "grad_norm": 0.1336549847587575, + "learning_rate": 9.910224867222921e-05, + "loss": 0.666, + "step": 5791 + }, + { + "epoch": 0.5178826895565093, + "grad_norm": 0.12980834163914493, + "learning_rate": 9.907328976806511e-05, + "loss": 0.6543, + "step": 5792 + }, + { + "epoch": 0.5179721030042919, + "grad_norm": 0.139712801377907, + "learning_rate": 9.904433094162311e-05, + "loss": 0.6554, + "step": 5793 + }, + { + "epoch": 0.5180615164520744, + "grad_norm": 0.1337798960163812, + "learning_rate": 9.901537219533194e-05, + "loss": 0.631, + "step": 5794 + }, + { + "epoch": 0.5181509298998569, + "grad_norm": 0.1286425632269903, + "learning_rate": 9.89864135316203e-05, + "loss": 0.6588, + "step": 5795 + }, + { + "epoch": 0.5182403433476395, + "grad_norm": 0.14809938491091865, + "learning_rate": 9.895745495291693e-05, + "loss": 0.6857, + "step": 5796 + }, + { + "epoch": 0.518329756795422, + "grad_norm": 0.14505547243748165, + "learning_rate": 9.892849646165057e-05, + "loss": 0.6265, + "step": 5797 + }, + { + "epoch": 0.5184191702432046, + "grad_norm": 0.13445199113446774, + "learning_rate": 9.889953806024991e-05, + "loss": 0.6764, + "step": 5798 + }, + { + "epoch": 0.5185085836909872, + "grad_norm": 0.1379614069446391, + "learning_rate": 9.887057975114362e-05, + "loss": 0.6232, + "step": 5799 + }, + { + "epoch": 0.5185979971387696, + "grad_norm": 0.1413003757649285, + "learning_rate": 9.884162153676048e-05, + "loss": 0.6215, + "step": 5800 + }, + { + "epoch": 0.5186874105865522, + "grad_norm": 0.1313896812756031, + "learning_rate": 9.881266341952915e-05, + "loss": 0.6601, + "step": 5801 + }, + { + "epoch": 0.5187768240343348, + "grad_norm": 0.14154682286118606, + "learning_rate": 9.878370540187831e-05, + "loss": 0.6353, + "step": 5802 + }, + { + "epoch": 0.5188662374821174, + "grad_norm": 0.1297432376071427, + "learning_rate": 9.875474748623661e-05, + "loss": 0.6847, + "step": 5803 + }, + { + "epoch": 0.5189556509298998, + "grad_norm": 0.13863707241147316, + "learning_rate": 9.872578967503275e-05, + "loss": 0.7056, + "step": 5804 + }, + { + "epoch": 0.5190450643776824, + "grad_norm": 0.13428418394911795, + "learning_rate": 9.869683197069533e-05, + "loss": 0.6453, + "step": 5805 + }, + { + "epoch": 0.519134477825465, + "grad_norm": 0.13983576342746462, + "learning_rate": 9.866787437565308e-05, + "loss": 0.674, + "step": 5806 + }, + { + "epoch": 0.5192238912732475, + "grad_norm": 0.12358593195738844, + "learning_rate": 9.863891689233459e-05, + "loss": 0.6266, + "step": 5807 + }, + { + "epoch": 0.51931330472103, + "grad_norm": 0.12509861322787058, + "learning_rate": 9.860995952316851e-05, + "loss": 0.6217, + "step": 5808 + }, + { + "epoch": 0.5194027181688126, + "grad_norm": 0.14604098361447834, + "learning_rate": 9.858100227058342e-05, + "loss": 0.6818, + "step": 5809 + }, + { + "epoch": 0.5194921316165951, + "grad_norm": 0.1299788096448348, + "learning_rate": 9.855204513700797e-05, + "loss": 0.6698, + "step": 5810 + }, + { + "epoch": 0.5195815450643777, + "grad_norm": 0.14259588539680995, + "learning_rate": 9.852308812487075e-05, + "loss": 0.6725, + "step": 5811 + }, + { + "epoch": 0.5196709585121603, + "grad_norm": 0.12509871277323034, + "learning_rate": 9.84941312366003e-05, + "loss": 0.6486, + "step": 5812 + }, + { + "epoch": 0.5197603719599427, + "grad_norm": 0.14079996552182847, + "learning_rate": 9.846517447462527e-05, + "loss": 0.6761, + "step": 5813 + }, + { + "epoch": 0.5198497854077253, + "grad_norm": 0.13558708446201714, + "learning_rate": 9.843621784137424e-05, + "loss": 0.6338, + "step": 5814 + }, + { + "epoch": 0.5199391988555079, + "grad_norm": 0.14198558371875808, + "learning_rate": 9.840726133927571e-05, + "loss": 0.6667, + "step": 5815 + }, + { + "epoch": 0.5200286123032904, + "grad_norm": 0.14200286525005898, + "learning_rate": 9.837830497075824e-05, + "loss": 0.6501, + "step": 5816 + }, + { + "epoch": 0.5201180257510729, + "grad_norm": 0.1342918585585855, + "learning_rate": 9.834934873825038e-05, + "loss": 0.6543, + "step": 5817 + }, + { + "epoch": 0.5202074391988555, + "grad_norm": 0.11713539676435072, + "learning_rate": 9.832039264418067e-05, + "loss": 0.6558, + "step": 5818 + }, + { + "epoch": 0.520296852646638, + "grad_norm": 0.13607981191787316, + "learning_rate": 9.829143669097754e-05, + "loss": 0.6754, + "step": 5819 + }, + { + "epoch": 0.5203862660944206, + "grad_norm": 0.13904401863372817, + "learning_rate": 9.826248088106959e-05, + "loss": 0.6683, + "step": 5820 + }, + { + "epoch": 0.5204756795422032, + "grad_norm": 0.15563503906697354, + "learning_rate": 9.823352521688528e-05, + "loss": 0.6556, + "step": 5821 + }, + { + "epoch": 0.5205650929899857, + "grad_norm": 0.11423280978600411, + "learning_rate": 9.820456970085307e-05, + "loss": 0.6628, + "step": 5822 + }, + { + "epoch": 0.5206545064377682, + "grad_norm": 0.1468218200043724, + "learning_rate": 9.817561433540141e-05, + "loss": 0.6708, + "step": 5823 + }, + { + "epoch": 0.5207439198855508, + "grad_norm": 0.13627459655684093, + "learning_rate": 9.81466591229588e-05, + "loss": 0.6687, + "step": 5824 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.13312369364614152, + "learning_rate": 9.811770406595365e-05, + "loss": 0.6834, + "step": 5825 + }, + { + "epoch": 0.5209227467811158, + "grad_norm": 0.12075544375399884, + "learning_rate": 9.808874916681436e-05, + "loss": 0.6486, + "step": 5826 + }, + { + "epoch": 0.5210121602288984, + "grad_norm": 0.1452647188215051, + "learning_rate": 9.805979442796936e-05, + "loss": 0.6878, + "step": 5827 + }, + { + "epoch": 0.521101573676681, + "grad_norm": 0.13441630738237648, + "learning_rate": 9.80308398518471e-05, + "loss": 0.6928, + "step": 5828 + }, + { + "epoch": 0.5211909871244635, + "grad_norm": 0.13431865681804459, + "learning_rate": 9.800188544087592e-05, + "loss": 0.6726, + "step": 5829 + }, + { + "epoch": 0.5212804005722461, + "grad_norm": 0.14007381072543798, + "learning_rate": 9.797293119748417e-05, + "loss": 0.6448, + "step": 5830 + }, + { + "epoch": 0.5213698140200286, + "grad_norm": 0.11589594377856446, + "learning_rate": 9.794397712410025e-05, + "loss": 0.6284, + "step": 5831 + }, + { + "epoch": 0.5214592274678111, + "grad_norm": 0.12213890510104947, + "learning_rate": 9.791502322315249e-05, + "loss": 0.6527, + "step": 5832 + }, + { + "epoch": 0.5215486409155937, + "grad_norm": 0.13942130653253398, + "learning_rate": 9.788606949706918e-05, + "loss": 0.684, + "step": 5833 + }, + { + "epoch": 0.5216380543633763, + "grad_norm": 0.1352514648076847, + "learning_rate": 9.785711594827868e-05, + "loss": 0.6705, + "step": 5834 + }, + { + "epoch": 0.5217274678111588, + "grad_norm": 0.11457290881525248, + "learning_rate": 9.78281625792093e-05, + "loss": 0.653, + "step": 5835 + }, + { + "epoch": 0.5218168812589413, + "grad_norm": 0.12305949037524289, + "learning_rate": 9.779920939228928e-05, + "loss": 0.633, + "step": 5836 + }, + { + "epoch": 0.5219062947067239, + "grad_norm": 0.12114405181332617, + "learning_rate": 9.777025638994693e-05, + "loss": 0.6313, + "step": 5837 + }, + { + "epoch": 0.5219957081545065, + "grad_norm": 0.12812584561977217, + "learning_rate": 9.774130357461049e-05, + "loss": 0.6404, + "step": 5838 + }, + { + "epoch": 0.522085121602289, + "grad_norm": 0.15123484576674032, + "learning_rate": 9.771235094870817e-05, + "loss": 0.7244, + "step": 5839 + }, + { + "epoch": 0.5221745350500715, + "grad_norm": 0.14176612281927967, + "learning_rate": 9.768339851466818e-05, + "loss": 0.6854, + "step": 5840 + }, + { + "epoch": 0.5222639484978541, + "grad_norm": 0.123036212157166, + "learning_rate": 9.765444627491882e-05, + "loss": 0.6496, + "step": 5841 + }, + { + "epoch": 0.5223533619456366, + "grad_norm": 0.13241108871036722, + "learning_rate": 9.76254942318882e-05, + "loss": 0.6683, + "step": 5842 + }, + { + "epoch": 0.5224427753934192, + "grad_norm": 0.13660603720664413, + "learning_rate": 9.759654238800451e-05, + "loss": 0.6815, + "step": 5843 + }, + { + "epoch": 0.5225321888412017, + "grad_norm": 0.12024044249498762, + "learning_rate": 9.756759074569591e-05, + "loss": 0.6343, + "step": 5844 + }, + { + "epoch": 0.5226216022889842, + "grad_norm": 0.13944715681307318, + "learning_rate": 9.753863930739054e-05, + "loss": 0.6939, + "step": 5845 + }, + { + "epoch": 0.5227110157367668, + "grad_norm": 0.11791267853653584, + "learning_rate": 9.75096880755165e-05, + "loss": 0.6431, + "step": 5846 + }, + { + "epoch": 0.5228004291845494, + "grad_norm": 0.13280942387712164, + "learning_rate": 9.748073705250188e-05, + "loss": 0.6423, + "step": 5847 + }, + { + "epoch": 0.522889842632332, + "grad_norm": 0.1141858787568073, + "learning_rate": 9.745178624077488e-05, + "loss": 0.6568, + "step": 5848 + }, + { + "epoch": 0.5229792560801144, + "grad_norm": 0.12021816851266545, + "learning_rate": 9.742283564276347e-05, + "loss": 0.6303, + "step": 5849 + }, + { + "epoch": 0.523068669527897, + "grad_norm": 0.12225241539073664, + "learning_rate": 9.739388526089568e-05, + "loss": 0.6286, + "step": 5850 + }, + { + "epoch": 0.5231580829756796, + "grad_norm": 0.11244528718350376, + "learning_rate": 9.736493509759962e-05, + "loss": 0.6471, + "step": 5851 + }, + { + "epoch": 0.5232474964234621, + "grad_norm": 0.12265303502875498, + "learning_rate": 9.733598515530328e-05, + "loss": 0.6565, + "step": 5852 + }, + { + "epoch": 0.5233369098712446, + "grad_norm": 0.12097810066218677, + "learning_rate": 9.730703543643464e-05, + "loss": 0.6668, + "step": 5853 + }, + { + "epoch": 0.5234263233190272, + "grad_norm": 0.1313150479074142, + "learning_rate": 9.727808594342164e-05, + "loss": 0.673, + "step": 5854 + }, + { + "epoch": 0.5235157367668097, + "grad_norm": 0.12883847103725413, + "learning_rate": 9.724913667869233e-05, + "loss": 0.6598, + "step": 5855 + }, + { + "epoch": 0.5236051502145923, + "grad_norm": 0.12590759066128546, + "learning_rate": 9.722018764467461e-05, + "loss": 0.6599, + "step": 5856 + }, + { + "epoch": 0.5236945636623748, + "grad_norm": 0.13938594902509255, + "learning_rate": 9.719123884379637e-05, + "loss": 0.6681, + "step": 5857 + }, + { + "epoch": 0.5237839771101573, + "grad_norm": 0.132965363558116, + "learning_rate": 9.716229027848556e-05, + "loss": 0.6464, + "step": 5858 + }, + { + "epoch": 0.5238733905579399, + "grad_norm": 0.13740337865560182, + "learning_rate": 9.713334195117004e-05, + "loss": 0.6977, + "step": 5859 + }, + { + "epoch": 0.5239628040057225, + "grad_norm": 0.12338545266846031, + "learning_rate": 9.710439386427764e-05, + "loss": 0.6719, + "step": 5860 + }, + { + "epoch": 0.524052217453505, + "grad_norm": 0.12484212048976542, + "learning_rate": 9.707544602023622e-05, + "loss": 0.6581, + "step": 5861 + }, + { + "epoch": 0.5241416309012875, + "grad_norm": 0.14179645595548465, + "learning_rate": 9.704649842147364e-05, + "loss": 0.6823, + "step": 5862 + }, + { + "epoch": 0.5242310443490701, + "grad_norm": 0.14245455445686955, + "learning_rate": 9.701755107041767e-05, + "loss": 0.6279, + "step": 5863 + }, + { + "epoch": 0.5243204577968527, + "grad_norm": 0.13660192485359815, + "learning_rate": 9.698860396949608e-05, + "loss": 0.6966, + "step": 5864 + }, + { + "epoch": 0.5244098712446352, + "grad_norm": 0.13364583234486813, + "learning_rate": 9.695965712113666e-05, + "loss": 0.6373, + "step": 5865 + }, + { + "epoch": 0.5244992846924177, + "grad_norm": 0.14710723685724222, + "learning_rate": 9.69307105277671e-05, + "loss": 0.6874, + "step": 5866 + }, + { + "epoch": 0.5245886981402003, + "grad_norm": 0.13057893813092314, + "learning_rate": 9.690176419181516e-05, + "loss": 0.6847, + "step": 5867 + }, + { + "epoch": 0.5246781115879828, + "grad_norm": 0.13523984496531688, + "learning_rate": 9.687281811570847e-05, + "loss": 0.688, + "step": 5868 + }, + { + "epoch": 0.5247675250357654, + "grad_norm": 0.12410073500135488, + "learning_rate": 9.68438723018748e-05, + "loss": 0.6547, + "step": 5869 + }, + { + "epoch": 0.524856938483548, + "grad_norm": 0.1297892732516689, + "learning_rate": 9.681492675274171e-05, + "loss": 0.6442, + "step": 5870 + }, + { + "epoch": 0.5249463519313304, + "grad_norm": 0.13259587266631825, + "learning_rate": 9.678598147073689e-05, + "loss": 0.6822, + "step": 5871 + }, + { + "epoch": 0.525035765379113, + "grad_norm": 0.1255110544801174, + "learning_rate": 9.675703645828794e-05, + "loss": 0.6338, + "step": 5872 + }, + { + "epoch": 0.5251251788268956, + "grad_norm": 0.12096368515767861, + "learning_rate": 9.67280917178224e-05, + "loss": 0.6068, + "step": 5873 + }, + { + "epoch": 0.5252145922746781, + "grad_norm": 0.13798943880198436, + "learning_rate": 9.669914725176787e-05, + "loss": 0.6298, + "step": 5874 + }, + { + "epoch": 0.5253040057224606, + "grad_norm": 0.143054196005608, + "learning_rate": 9.667020306255183e-05, + "loss": 0.643, + "step": 5875 + }, + { + "epoch": 0.5253934191702432, + "grad_norm": 0.13347907125330644, + "learning_rate": 9.66412591526019e-05, + "loss": 0.6537, + "step": 5876 + }, + { + "epoch": 0.5254828326180258, + "grad_norm": 0.10935941098688524, + "learning_rate": 9.661231552434546e-05, + "loss": 0.6073, + "step": 5877 + }, + { + "epoch": 0.5255722460658083, + "grad_norm": 0.1272201987259928, + "learning_rate": 9.658337218021007e-05, + "loss": 0.675, + "step": 5878 + }, + { + "epoch": 0.5256616595135909, + "grad_norm": 0.1361259488428198, + "learning_rate": 9.655442912262311e-05, + "loss": 0.6853, + "step": 5879 + }, + { + "epoch": 0.5257510729613734, + "grad_norm": 0.13741077062954457, + "learning_rate": 9.652548635401201e-05, + "loss": 0.6478, + "step": 5880 + }, + { + "epoch": 0.5258404864091559, + "grad_norm": 0.14844328470433013, + "learning_rate": 9.64965438768042e-05, + "loss": 0.693, + "step": 5881 + }, + { + "epoch": 0.5259298998569385, + "grad_norm": 0.12791209182324687, + "learning_rate": 9.646760169342696e-05, + "loss": 0.6328, + "step": 5882 + }, + { + "epoch": 0.5260193133047211, + "grad_norm": 0.12048691674036789, + "learning_rate": 9.643865980630775e-05, + "loss": 0.6801, + "step": 5883 + }, + { + "epoch": 0.5261087267525035, + "grad_norm": 0.11658390331316033, + "learning_rate": 9.640971821787382e-05, + "loss": 0.6104, + "step": 5884 + }, + { + "epoch": 0.5261981402002861, + "grad_norm": 0.13106271993787277, + "learning_rate": 9.638077693055252e-05, + "loss": 0.6581, + "step": 5885 + }, + { + "epoch": 0.5262875536480687, + "grad_norm": 0.13942413369261009, + "learning_rate": 9.635183594677107e-05, + "loss": 0.6378, + "step": 5886 + }, + { + "epoch": 0.5263769670958512, + "grad_norm": 0.12696264587752623, + "learning_rate": 9.632289526895672e-05, + "loss": 0.6537, + "step": 5887 + }, + { + "epoch": 0.5264663805436338, + "grad_norm": 0.11664214783984798, + "learning_rate": 9.629395489953669e-05, + "loss": 0.6605, + "step": 5888 + }, + { + "epoch": 0.5265557939914163, + "grad_norm": 0.1363808681108209, + "learning_rate": 9.626501484093823e-05, + "loss": 0.6459, + "step": 5889 + }, + { + "epoch": 0.5266452074391988, + "grad_norm": 0.13585788483455116, + "learning_rate": 9.623607509558846e-05, + "loss": 0.6632, + "step": 5890 + }, + { + "epoch": 0.5267346208869814, + "grad_norm": 0.13374051103469364, + "learning_rate": 9.620713566591449e-05, + "loss": 0.6739, + "step": 5891 + }, + { + "epoch": 0.526824034334764, + "grad_norm": 0.159737405541114, + "learning_rate": 9.61781965543435e-05, + "loss": 0.7157, + "step": 5892 + }, + { + "epoch": 0.5269134477825465, + "grad_norm": 0.1331982295887237, + "learning_rate": 9.614925776330254e-05, + "loss": 0.613, + "step": 5893 + }, + { + "epoch": 0.527002861230329, + "grad_norm": 0.12105484833999025, + "learning_rate": 9.612031929521869e-05, + "loss": 0.6576, + "step": 5894 + }, + { + "epoch": 0.5270922746781116, + "grad_norm": 0.12299532956883417, + "learning_rate": 9.609138115251894e-05, + "loss": 0.6166, + "step": 5895 + }, + { + "epoch": 0.5271816881258942, + "grad_norm": 0.11875567822697579, + "learning_rate": 9.606244333763038e-05, + "loss": 0.6542, + "step": 5896 + }, + { + "epoch": 0.5272711015736766, + "grad_norm": 0.14508753712210026, + "learning_rate": 9.603350585297991e-05, + "loss": 0.6898, + "step": 5897 + }, + { + "epoch": 0.5273605150214592, + "grad_norm": 0.12224768624804348, + "learning_rate": 9.600456870099454e-05, + "loss": 0.6434, + "step": 5898 + }, + { + "epoch": 0.5274499284692418, + "grad_norm": 0.126968370338191, + "learning_rate": 9.597563188410116e-05, + "loss": 0.6623, + "step": 5899 + }, + { + "epoch": 0.5275393419170243, + "grad_norm": 0.1299320443939031, + "learning_rate": 9.594669540472666e-05, + "loss": 0.6269, + "step": 5900 + }, + { + "epoch": 0.5276287553648069, + "grad_norm": 0.12632417657140718, + "learning_rate": 9.591775926529793e-05, + "loss": 0.6466, + "step": 5901 + }, + { + "epoch": 0.5277181688125894, + "grad_norm": 0.13669015180235486, + "learning_rate": 9.588882346824177e-05, + "loss": 0.6099, + "step": 5902 + }, + { + "epoch": 0.5278075822603719, + "grad_norm": 0.1325885085688676, + "learning_rate": 9.585988801598506e-05, + "loss": 0.616, + "step": 5903 + }, + { + "epoch": 0.5278969957081545, + "grad_norm": 0.12489786347684399, + "learning_rate": 9.583095291095453e-05, + "loss": 0.6529, + "step": 5904 + }, + { + "epoch": 0.5279864091559371, + "grad_norm": 0.14330631639025396, + "learning_rate": 9.580201815557695e-05, + "loss": 0.6759, + "step": 5905 + }, + { + "epoch": 0.5280758226037195, + "grad_norm": 0.12388688047778612, + "learning_rate": 9.577308375227906e-05, + "loss": 0.6744, + "step": 5906 + }, + { + "epoch": 0.5281652360515021, + "grad_norm": 0.13140137240278968, + "learning_rate": 9.574414970348749e-05, + "loss": 0.6925, + "step": 5907 + }, + { + "epoch": 0.5282546494992847, + "grad_norm": 0.14492755151425193, + "learning_rate": 9.571521601162897e-05, + "loss": 0.6852, + "step": 5908 + }, + { + "epoch": 0.5283440629470673, + "grad_norm": 0.1216534286408746, + "learning_rate": 9.568628267913007e-05, + "loss": 0.6048, + "step": 5909 + }, + { + "epoch": 0.5284334763948498, + "grad_norm": 0.12778304506598753, + "learning_rate": 9.565734970841747e-05, + "loss": 0.634, + "step": 5910 + }, + { + "epoch": 0.5285228898426323, + "grad_norm": 0.11782953581893636, + "learning_rate": 9.562841710191769e-05, + "loss": 0.6213, + "step": 5911 + }, + { + "epoch": 0.5286123032904149, + "grad_norm": 0.12964632085958547, + "learning_rate": 9.55994848620573e-05, + "loss": 0.6651, + "step": 5912 + }, + { + "epoch": 0.5287017167381974, + "grad_norm": 0.14969817550594, + "learning_rate": 9.55705529912628e-05, + "loss": 0.6746, + "step": 5913 + }, + { + "epoch": 0.52879113018598, + "grad_norm": 0.13556856443624224, + "learning_rate": 9.554162149196066e-05, + "loss": 0.6543, + "step": 5914 + }, + { + "epoch": 0.5288805436337625, + "grad_norm": 0.12369934070636834, + "learning_rate": 9.551269036657736e-05, + "loss": 0.6278, + "step": 5915 + }, + { + "epoch": 0.528969957081545, + "grad_norm": 0.13294893525930448, + "learning_rate": 9.548375961753926e-05, + "loss": 0.5823, + "step": 5916 + }, + { + "epoch": 0.5290593705293276, + "grad_norm": 0.13764301062674708, + "learning_rate": 9.545482924727282e-05, + "loss": 0.6965, + "step": 5917 + }, + { + "epoch": 0.5291487839771102, + "grad_norm": 0.1458719736287598, + "learning_rate": 9.542589925820435e-05, + "loss": 0.6859, + "step": 5918 + }, + { + "epoch": 0.5292381974248928, + "grad_norm": 0.13605248389147864, + "learning_rate": 9.539696965276019e-05, + "loss": 0.6742, + "step": 5919 + }, + { + "epoch": 0.5293276108726752, + "grad_norm": 0.13362561651297095, + "learning_rate": 9.536804043336664e-05, + "loss": 0.6491, + "step": 5920 + }, + { + "epoch": 0.5294170243204578, + "grad_norm": 0.1308087665158935, + "learning_rate": 9.533911160244993e-05, + "loss": 0.6679, + "step": 5921 + }, + { + "epoch": 0.5295064377682404, + "grad_norm": 0.1657322997884045, + "learning_rate": 9.53101831624363e-05, + "loss": 0.6992, + "step": 5922 + }, + { + "epoch": 0.5295958512160229, + "grad_norm": 0.13782295976762635, + "learning_rate": 9.528125511575193e-05, + "loss": 0.6375, + "step": 5923 + }, + { + "epoch": 0.5296852646638054, + "grad_norm": 0.11812556725851668, + "learning_rate": 9.525232746482301e-05, + "loss": 0.6358, + "step": 5924 + }, + { + "epoch": 0.529774678111588, + "grad_norm": 0.12721362223175364, + "learning_rate": 9.522340021207564e-05, + "loss": 0.6353, + "step": 5925 + }, + { + "epoch": 0.5298640915593705, + "grad_norm": 0.12628975391706423, + "learning_rate": 9.519447335993595e-05, + "loss": 0.6504, + "step": 5926 + }, + { + "epoch": 0.5299535050071531, + "grad_norm": 0.1137922417267945, + "learning_rate": 9.516554691082995e-05, + "loss": 0.6062, + "step": 5927 + }, + { + "epoch": 0.5300429184549357, + "grad_norm": 0.11916175664634437, + "learning_rate": 9.513662086718372e-05, + "loss": 0.6371, + "step": 5928 + }, + { + "epoch": 0.5301323319027181, + "grad_norm": 0.13408382298559074, + "learning_rate": 9.510769523142322e-05, + "loss": 0.6437, + "step": 5929 + }, + { + "epoch": 0.5302217453505007, + "grad_norm": 0.14454383464202222, + "learning_rate": 9.507877000597437e-05, + "loss": 0.6566, + "step": 5930 + }, + { + "epoch": 0.5303111587982833, + "grad_norm": 0.1439688064189998, + "learning_rate": 9.504984519326316e-05, + "loss": 0.673, + "step": 5931 + }, + { + "epoch": 0.5304005722460658, + "grad_norm": 0.15520403789057322, + "learning_rate": 9.502092079571547e-05, + "loss": 0.6414, + "step": 5932 + }, + { + "epoch": 0.5304899856938483, + "grad_norm": 0.13904949982512077, + "learning_rate": 9.499199681575716e-05, + "loss": 0.6598, + "step": 5933 + }, + { + "epoch": 0.5305793991416309, + "grad_norm": 0.12331640577382375, + "learning_rate": 9.496307325581398e-05, + "loss": 0.6538, + "step": 5934 + }, + { + "epoch": 0.5306688125894135, + "grad_norm": 0.12925712556615615, + "learning_rate": 9.49341501183118e-05, + "loss": 0.6365, + "step": 5935 + }, + { + "epoch": 0.530758226037196, + "grad_norm": 0.1324294385896426, + "learning_rate": 9.490522740567633e-05, + "loss": 0.6477, + "step": 5936 + }, + { + "epoch": 0.5308476394849786, + "grad_norm": 0.15370624919243797, + "learning_rate": 9.487630512033325e-05, + "loss": 0.6625, + "step": 5937 + }, + { + "epoch": 0.530937052932761, + "grad_norm": 0.12536711585876398, + "learning_rate": 9.484738326470828e-05, + "loss": 0.6522, + "step": 5938 + }, + { + "epoch": 0.5310264663805436, + "grad_norm": 0.1266317821514982, + "learning_rate": 9.481846184122707e-05, + "loss": 0.6428, + "step": 5939 + }, + { + "epoch": 0.5311158798283262, + "grad_norm": 0.14098782156683468, + "learning_rate": 9.478954085231522e-05, + "loss": 0.6367, + "step": 5940 + }, + { + "epoch": 0.5312052932761088, + "grad_norm": 0.12978336847672695, + "learning_rate": 9.476062030039825e-05, + "loss": 0.627, + "step": 5941 + }, + { + "epoch": 0.5312947067238912, + "grad_norm": 0.12760317886390926, + "learning_rate": 9.473170018790176e-05, + "loss": 0.6452, + "step": 5942 + }, + { + "epoch": 0.5313841201716738, + "grad_norm": 0.14963462017125562, + "learning_rate": 9.470278051725122e-05, + "loss": 0.6751, + "step": 5943 + }, + { + "epoch": 0.5314735336194564, + "grad_norm": 0.1542744739819886, + "learning_rate": 9.467386129087202e-05, + "loss": 0.6866, + "step": 5944 + }, + { + "epoch": 0.531562947067239, + "grad_norm": 0.10933001017821431, + "learning_rate": 9.464494251118968e-05, + "loss": 0.614, + "step": 5945 + }, + { + "epoch": 0.5316523605150214, + "grad_norm": 0.12998587328298242, + "learning_rate": 9.461602418062956e-05, + "loss": 0.6524, + "step": 5946 + }, + { + "epoch": 0.531741773962804, + "grad_norm": 0.14593154423463156, + "learning_rate": 9.458710630161698e-05, + "loss": 0.6423, + "step": 5947 + }, + { + "epoch": 0.5318311874105865, + "grad_norm": 0.13414751329506489, + "learning_rate": 9.455818887657725e-05, + "loss": 0.6866, + "step": 5948 + }, + { + "epoch": 0.5319206008583691, + "grad_norm": 0.14742663965969394, + "learning_rate": 9.452927190793566e-05, + "loss": 0.6766, + "step": 5949 + }, + { + "epoch": 0.5320100143061517, + "grad_norm": 0.13928732641344527, + "learning_rate": 9.450035539811741e-05, + "loss": 0.667, + "step": 5950 + }, + { + "epoch": 0.5320994277539342, + "grad_norm": 0.13546947375759147, + "learning_rate": 9.447143934954771e-05, + "loss": 0.6307, + "step": 5951 + }, + { + "epoch": 0.5321888412017167, + "grad_norm": 0.11778644950428925, + "learning_rate": 9.444252376465171e-05, + "loss": 0.6444, + "step": 5952 + }, + { + "epoch": 0.5322782546494993, + "grad_norm": 0.13080302811381306, + "learning_rate": 9.441360864585456e-05, + "loss": 0.6749, + "step": 5953 + }, + { + "epoch": 0.5323676680972819, + "grad_norm": 0.12418149910791682, + "learning_rate": 9.438469399558128e-05, + "loss": 0.6372, + "step": 5954 + }, + { + "epoch": 0.5324570815450643, + "grad_norm": 0.1388898620348654, + "learning_rate": 9.435577981625697e-05, + "loss": 0.6662, + "step": 5955 + }, + { + "epoch": 0.5325464949928469, + "grad_norm": 0.1216464899100316, + "learning_rate": 9.432686611030657e-05, + "loss": 0.6284, + "step": 5956 + }, + { + "epoch": 0.5326359084406295, + "grad_norm": 0.12181944991000246, + "learning_rate": 9.429795288015504e-05, + "loss": 0.6466, + "step": 5957 + }, + { + "epoch": 0.532725321888412, + "grad_norm": 0.1299104963722252, + "learning_rate": 9.42690401282273e-05, + "loss": 0.6259, + "step": 5958 + }, + { + "epoch": 0.5328147353361946, + "grad_norm": 0.13714115749736627, + "learning_rate": 9.424012785694827e-05, + "loss": 0.6884, + "step": 5959 + }, + { + "epoch": 0.5329041487839771, + "grad_norm": 0.11646685441080991, + "learning_rate": 9.421121606874278e-05, + "loss": 0.623, + "step": 5960 + }, + { + "epoch": 0.5329935622317596, + "grad_norm": 0.1246512195573299, + "learning_rate": 9.418230476603558e-05, + "loss": 0.6294, + "step": 5961 + }, + { + "epoch": 0.5330829756795422, + "grad_norm": 0.13847037594895567, + "learning_rate": 9.415339395125147e-05, + "loss": 0.6381, + "step": 5962 + }, + { + "epoch": 0.5331723891273248, + "grad_norm": 0.12603685247079138, + "learning_rate": 9.412448362681516e-05, + "loss": 0.6377, + "step": 5963 + }, + { + "epoch": 0.5332618025751072, + "grad_norm": 0.12146590874296173, + "learning_rate": 9.409557379515127e-05, + "loss": 0.6444, + "step": 5964 + }, + { + "epoch": 0.5333512160228898, + "grad_norm": 0.1411948136086386, + "learning_rate": 9.406666445868448e-05, + "loss": 0.6595, + "step": 5965 + }, + { + "epoch": 0.5334406294706724, + "grad_norm": 0.13827040647279168, + "learning_rate": 9.40377556198394e-05, + "loss": 0.6554, + "step": 5966 + }, + { + "epoch": 0.533530042918455, + "grad_norm": 0.13163085527050908, + "learning_rate": 9.400884728104056e-05, + "loss": 0.6479, + "step": 5967 + }, + { + "epoch": 0.5336194563662375, + "grad_norm": 0.12722814926067721, + "learning_rate": 9.397993944471244e-05, + "loss": 0.6492, + "step": 5968 + }, + { + "epoch": 0.53370886981402, + "grad_norm": 0.12102472732410827, + "learning_rate": 9.395103211327955e-05, + "loss": 0.6201, + "step": 5969 + }, + { + "epoch": 0.5337982832618026, + "grad_norm": 0.11949016644711744, + "learning_rate": 9.39221252891663e-05, + "loss": 0.6446, + "step": 5970 + }, + { + "epoch": 0.5338876967095851, + "grad_norm": 0.13388046970757797, + "learning_rate": 9.389321897479703e-05, + "loss": 0.6249, + "step": 5971 + }, + { + "epoch": 0.5339771101573677, + "grad_norm": 0.1348418705983925, + "learning_rate": 9.386431317259609e-05, + "loss": 0.6388, + "step": 5972 + }, + { + "epoch": 0.5340665236051502, + "grad_norm": 0.13356903044534046, + "learning_rate": 9.383540788498784e-05, + "loss": 0.688, + "step": 5973 + }, + { + "epoch": 0.5341559370529327, + "grad_norm": 0.13140186834536058, + "learning_rate": 9.380650311439649e-05, + "loss": 0.6769, + "step": 5974 + }, + { + "epoch": 0.5342453505007153, + "grad_norm": 0.13196937284783863, + "learning_rate": 9.37775988632462e-05, + "loss": 0.6767, + "step": 5975 + }, + { + "epoch": 0.5343347639484979, + "grad_norm": 0.12331883244343765, + "learning_rate": 9.374869513396123e-05, + "loss": 0.6957, + "step": 5976 + }, + { + "epoch": 0.5344241773962805, + "grad_norm": 0.1210054994621941, + "learning_rate": 9.371979192896564e-05, + "loss": 0.6668, + "step": 5977 + }, + { + "epoch": 0.5345135908440629, + "grad_norm": 0.1300229048200453, + "learning_rate": 9.369088925068347e-05, + "loss": 0.6756, + "step": 5978 + }, + { + "epoch": 0.5346030042918455, + "grad_norm": 0.1222877782407902, + "learning_rate": 9.366198710153882e-05, + "loss": 0.656, + "step": 5979 + }, + { + "epoch": 0.5346924177396281, + "grad_norm": 0.14822834707096472, + "learning_rate": 9.363308548395568e-05, + "loss": 0.635, + "step": 5980 + }, + { + "epoch": 0.5347818311874106, + "grad_norm": 0.14027833788768485, + "learning_rate": 9.360418440035796e-05, + "loss": 0.6382, + "step": 5981 + }, + { + "epoch": 0.5348712446351931, + "grad_norm": 0.12306530329115237, + "learning_rate": 9.357528385316958e-05, + "loss": 0.6694, + "step": 5982 + }, + { + "epoch": 0.5349606580829757, + "grad_norm": 0.12707633091492407, + "learning_rate": 9.354638384481437e-05, + "loss": 0.5992, + "step": 5983 + }, + { + "epoch": 0.5350500715307582, + "grad_norm": 0.11921965483778194, + "learning_rate": 9.351748437771615e-05, + "loss": 0.6521, + "step": 5984 + }, + { + "epoch": 0.5351394849785408, + "grad_norm": 0.1301399030285673, + "learning_rate": 9.348858545429868e-05, + "loss": 0.6494, + "step": 5985 + }, + { + "epoch": 0.5352288984263234, + "grad_norm": 0.14516470374842358, + "learning_rate": 9.345968707698569e-05, + "loss": 0.6559, + "step": 5986 + }, + { + "epoch": 0.5353183118741058, + "grad_norm": 0.1372189457555013, + "learning_rate": 9.343078924820087e-05, + "loss": 0.6612, + "step": 5987 + }, + { + "epoch": 0.5354077253218884, + "grad_norm": 0.14015988115354758, + "learning_rate": 9.340189197036779e-05, + "loss": 0.6579, + "step": 5988 + }, + { + "epoch": 0.535497138769671, + "grad_norm": 0.1253915702358557, + "learning_rate": 9.337299524591009e-05, + "loss": 0.6212, + "step": 5989 + }, + { + "epoch": 0.5355865522174535, + "grad_norm": 0.14088408715394662, + "learning_rate": 9.334409907725128e-05, + "loss": 0.6417, + "step": 5990 + }, + { + "epoch": 0.535675965665236, + "grad_norm": 0.12087803924975038, + "learning_rate": 9.33152034668148e-05, + "loss": 0.6357, + "step": 5991 + }, + { + "epoch": 0.5357653791130186, + "grad_norm": 0.14470921390198188, + "learning_rate": 9.328630841702414e-05, + "loss": 0.6531, + "step": 5992 + }, + { + "epoch": 0.5358547925608012, + "grad_norm": 0.1392218110016408, + "learning_rate": 9.32574139303027e-05, + "loss": 0.6588, + "step": 5993 + }, + { + "epoch": 0.5359442060085837, + "grad_norm": 0.140264166925229, + "learning_rate": 9.322852000907383e-05, + "loss": 0.6594, + "step": 5994 + }, + { + "epoch": 0.5360336194563662, + "grad_norm": 0.1521741997406389, + "learning_rate": 9.319962665576078e-05, + "loss": 0.6612, + "step": 5995 + }, + { + "epoch": 0.5361230329041488, + "grad_norm": 0.1438765886381417, + "learning_rate": 9.317073387278686e-05, + "loss": 0.6715, + "step": 5996 + }, + { + "epoch": 0.5362124463519313, + "grad_norm": 0.13025745336382102, + "learning_rate": 9.314184166257524e-05, + "loss": 0.6238, + "step": 5997 + }, + { + "epoch": 0.5363018597997139, + "grad_norm": 0.1417420906602526, + "learning_rate": 9.311295002754905e-05, + "loss": 0.6775, + "step": 5998 + }, + { + "epoch": 0.5363912732474965, + "grad_norm": 0.13612083089166227, + "learning_rate": 9.30840589701314e-05, + "loss": 0.6801, + "step": 5999 + }, + { + "epoch": 0.5364806866952789, + "grad_norm": 0.14335265699257224, + "learning_rate": 9.305516849274541e-05, + "loss": 0.6178, + "step": 6000 + }, + { + "epoch": 0.5365701001430615, + "grad_norm": 0.12574177000136116, + "learning_rate": 9.302627859781406e-05, + "loss": 0.6375, + "step": 6001 + }, + { + "epoch": 0.5366595135908441, + "grad_norm": 0.12808620192041223, + "learning_rate": 9.299738928776029e-05, + "loss": 0.637, + "step": 6002 + }, + { + "epoch": 0.5367489270386266, + "grad_norm": 0.13861047267453042, + "learning_rate": 9.296850056500703e-05, + "loss": 0.6803, + "step": 6003 + }, + { + "epoch": 0.5368383404864091, + "grad_norm": 0.1545022857857963, + "learning_rate": 9.293961243197715e-05, + "loss": 0.6854, + "step": 6004 + }, + { + "epoch": 0.5369277539341917, + "grad_norm": 0.13831064135252769, + "learning_rate": 9.29107248910934e-05, + "loss": 0.6572, + "step": 6005 + }, + { + "epoch": 0.5370171673819742, + "grad_norm": 0.12689138795255953, + "learning_rate": 9.28818379447786e-05, + "loss": 0.6378, + "step": 6006 + }, + { + "epoch": 0.5371065808297568, + "grad_norm": 0.12459533390970742, + "learning_rate": 9.285295159545547e-05, + "loss": 0.6558, + "step": 6007 + }, + { + "epoch": 0.5371959942775394, + "grad_norm": 0.13699278817321378, + "learning_rate": 9.282406584554668e-05, + "loss": 0.665, + "step": 6008 + }, + { + "epoch": 0.5372854077253219, + "grad_norm": 0.13104289238902017, + "learning_rate": 9.279518069747479e-05, + "loss": 0.6743, + "step": 6009 + }, + { + "epoch": 0.5373748211731044, + "grad_norm": 0.1222110873955845, + "learning_rate": 9.276629615366242e-05, + "loss": 0.6203, + "step": 6010 + }, + { + "epoch": 0.537464234620887, + "grad_norm": 0.14251594204943505, + "learning_rate": 9.273741221653204e-05, + "loss": 0.6423, + "step": 6011 + }, + { + "epoch": 0.5375536480686696, + "grad_norm": 0.12905598824435777, + "learning_rate": 9.270852888850615e-05, + "loss": 0.6603, + "step": 6012 + }, + { + "epoch": 0.537643061516452, + "grad_norm": 0.12367198897060384, + "learning_rate": 9.267964617200707e-05, + "loss": 0.6391, + "step": 6013 + }, + { + "epoch": 0.5377324749642346, + "grad_norm": 0.13674835632860216, + "learning_rate": 9.265076406945727e-05, + "loss": 0.6454, + "step": 6014 + }, + { + "epoch": 0.5378218884120172, + "grad_norm": 0.14517621423179472, + "learning_rate": 9.262188258327901e-05, + "loss": 0.6773, + "step": 6015 + }, + { + "epoch": 0.5379113018597997, + "grad_norm": 0.1331727201224821, + "learning_rate": 9.259300171589456e-05, + "loss": 0.6449, + "step": 6016 + }, + { + "epoch": 0.5380007153075823, + "grad_norm": 0.12339396779565026, + "learning_rate": 9.256412146972611e-05, + "loss": 0.6194, + "step": 6017 + }, + { + "epoch": 0.5380901287553648, + "grad_norm": 0.12375725597657145, + "learning_rate": 9.25352418471958e-05, + "loss": 0.6597, + "step": 6018 + }, + { + "epoch": 0.5381795422031473, + "grad_norm": 0.11804467513630061, + "learning_rate": 9.250636285072574e-05, + "loss": 0.6447, + "step": 6019 + }, + { + "epoch": 0.5382689556509299, + "grad_norm": 0.1385914640418469, + "learning_rate": 9.247748448273796e-05, + "loss": 0.6559, + "step": 6020 + }, + { + "epoch": 0.5383583690987125, + "grad_norm": 0.13534272769744357, + "learning_rate": 9.24486067456545e-05, + "loss": 0.6459, + "step": 6021 + }, + { + "epoch": 0.538447782546495, + "grad_norm": 0.1270752685978577, + "learning_rate": 9.241972964189726e-05, + "loss": 0.6158, + "step": 6022 + }, + { + "epoch": 0.5385371959942775, + "grad_norm": 0.13816030724262637, + "learning_rate": 9.239085317388816e-05, + "loss": 0.6432, + "step": 6023 + }, + { + "epoch": 0.5386266094420601, + "grad_norm": 0.1273354306335186, + "learning_rate": 9.236197734404901e-05, + "loss": 0.6695, + "step": 6024 + }, + { + "epoch": 0.5387160228898427, + "grad_norm": 0.14206724257115239, + "learning_rate": 9.233310215480157e-05, + "loss": 0.6628, + "step": 6025 + }, + { + "epoch": 0.5388054363376252, + "grad_norm": 0.13659403927548916, + "learning_rate": 9.230422760856762e-05, + "loss": 0.6431, + "step": 6026 + }, + { + "epoch": 0.5388948497854077, + "grad_norm": 0.1318244050505706, + "learning_rate": 9.227535370776877e-05, + "loss": 0.6612, + "step": 6027 + }, + { + "epoch": 0.5389842632331903, + "grad_norm": 0.12233682174850423, + "learning_rate": 9.22464804548267e-05, + "loss": 0.6037, + "step": 6028 + }, + { + "epoch": 0.5390736766809728, + "grad_norm": 0.13124701639427344, + "learning_rate": 9.221760785216295e-05, + "loss": 0.6245, + "step": 6029 + }, + { + "epoch": 0.5391630901287554, + "grad_norm": 0.13803496092142387, + "learning_rate": 9.218873590219905e-05, + "loss": 0.683, + "step": 6030 + }, + { + "epoch": 0.5392525035765379, + "grad_norm": 0.14663810841900282, + "learning_rate": 9.215986460735642e-05, + "loss": 0.6531, + "step": 6031 + }, + { + "epoch": 0.5393419170243204, + "grad_norm": 0.11721475352884773, + "learning_rate": 9.213099397005646e-05, + "loss": 0.6619, + "step": 6032 + }, + { + "epoch": 0.539431330472103, + "grad_norm": 0.12758279905445732, + "learning_rate": 9.210212399272056e-05, + "loss": 0.6119, + "step": 6033 + }, + { + "epoch": 0.5395207439198856, + "grad_norm": 0.13157934857341688, + "learning_rate": 9.207325467776993e-05, + "loss": 0.6516, + "step": 6034 + }, + { + "epoch": 0.539610157367668, + "grad_norm": 0.14998076634880306, + "learning_rate": 9.204438602762592e-05, + "loss": 0.6767, + "step": 6035 + }, + { + "epoch": 0.5396995708154506, + "grad_norm": 0.15166562333330935, + "learning_rate": 9.201551804470962e-05, + "loss": 0.6521, + "step": 6036 + }, + { + "epoch": 0.5397889842632332, + "grad_norm": 0.12827929282470116, + "learning_rate": 9.198665073144218e-05, + "loss": 0.6354, + "step": 6037 + }, + { + "epoch": 0.5398783977110158, + "grad_norm": 0.12772003778627444, + "learning_rate": 9.195778409024468e-05, + "loss": 0.6694, + "step": 6038 + }, + { + "epoch": 0.5399678111587983, + "grad_norm": 0.12749866068132076, + "learning_rate": 9.19289181235381e-05, + "loss": 0.6155, + "step": 6039 + }, + { + "epoch": 0.5400572246065808, + "grad_norm": 0.1293084040681061, + "learning_rate": 9.190005283374343e-05, + "loss": 0.6631, + "step": 6040 + }, + { + "epoch": 0.5401466380543634, + "grad_norm": 0.13002353654128987, + "learning_rate": 9.187118822328149e-05, + "loss": 0.6775, + "step": 6041 + }, + { + "epoch": 0.5402360515021459, + "grad_norm": 0.1337149500211654, + "learning_rate": 9.184232429457323e-05, + "loss": 0.6714, + "step": 6042 + }, + { + "epoch": 0.5403254649499285, + "grad_norm": 0.13443543265421426, + "learning_rate": 9.181346105003936e-05, + "loss": 0.6255, + "step": 6043 + }, + { + "epoch": 0.540414878397711, + "grad_norm": 0.12347039831110761, + "learning_rate": 9.178459849210063e-05, + "loss": 0.6575, + "step": 6044 + }, + { + "epoch": 0.5405042918454935, + "grad_norm": 0.13250618293824215, + "learning_rate": 9.175573662317769e-05, + "loss": 0.6604, + "step": 6045 + }, + { + "epoch": 0.5405937052932761, + "grad_norm": 0.13196595234026853, + "learning_rate": 9.172687544569118e-05, + "loss": 0.6366, + "step": 6046 + }, + { + "epoch": 0.5406831187410587, + "grad_norm": 0.1320738718323303, + "learning_rate": 9.169801496206165e-05, + "loss": 0.6667, + "step": 6047 + }, + { + "epoch": 0.5407725321888412, + "grad_norm": 0.14863751225689567, + "learning_rate": 9.166915517470953e-05, + "loss": 0.717, + "step": 6048 + }, + { + "epoch": 0.5408619456366237, + "grad_norm": 0.12693158252386347, + "learning_rate": 9.164029608605531e-05, + "loss": 0.6211, + "step": 6049 + }, + { + "epoch": 0.5409513590844063, + "grad_norm": 0.12559282534248453, + "learning_rate": 9.161143769851941e-05, + "loss": 0.6415, + "step": 6050 + }, + { + "epoch": 0.5410407725321889, + "grad_norm": 0.137722406572653, + "learning_rate": 9.158258001452208e-05, + "loss": 0.6836, + "step": 6051 + }, + { + "epoch": 0.5411301859799714, + "grad_norm": 0.1209355033372269, + "learning_rate": 9.155372303648359e-05, + "loss": 0.6486, + "step": 6052 + }, + { + "epoch": 0.5412195994277539, + "grad_norm": 0.12864815467468818, + "learning_rate": 9.152486676682415e-05, + "loss": 0.6745, + "step": 6053 + }, + { + "epoch": 0.5413090128755365, + "grad_norm": 0.13676012570199986, + "learning_rate": 9.149601120796391e-05, + "loss": 0.6666, + "step": 6054 + }, + { + "epoch": 0.541398426323319, + "grad_norm": 0.13560856170687294, + "learning_rate": 9.146715636232291e-05, + "loss": 0.6742, + "step": 6055 + }, + { + "epoch": 0.5414878397711016, + "grad_norm": 0.12750494767508824, + "learning_rate": 9.14383022323212e-05, + "loss": 0.6836, + "step": 6056 + }, + { + "epoch": 0.5415772532188842, + "grad_norm": 0.12746632074749129, + "learning_rate": 9.140944882037879e-05, + "loss": 0.6311, + "step": 6057 + }, + { + "epoch": 0.5416666666666666, + "grad_norm": 0.12305564813843989, + "learning_rate": 9.138059612891551e-05, + "loss": 0.625, + "step": 6058 + }, + { + "epoch": 0.5417560801144492, + "grad_norm": 0.14150274905400725, + "learning_rate": 9.13517441603512e-05, + "loss": 0.6733, + "step": 6059 + }, + { + "epoch": 0.5418454935622318, + "grad_norm": 0.1325604571665017, + "learning_rate": 9.13228929171057e-05, + "loss": 0.691, + "step": 6060 + }, + { + "epoch": 0.5419349070100143, + "grad_norm": 0.11912464453228853, + "learning_rate": 9.129404240159864e-05, + "loss": 0.6372, + "step": 6061 + }, + { + "epoch": 0.5420243204577968, + "grad_norm": 0.1376784090552953, + "learning_rate": 9.126519261624977e-05, + "loss": 0.6798, + "step": 6062 + }, + { + "epoch": 0.5421137339055794, + "grad_norm": 0.11840678349990032, + "learning_rate": 9.123634356347863e-05, + "loss": 0.6153, + "step": 6063 + }, + { + "epoch": 0.542203147353362, + "grad_norm": 0.1312006892814954, + "learning_rate": 9.12074952457048e-05, + "loss": 0.6617, + "step": 6064 + }, + { + "epoch": 0.5422925608011445, + "grad_norm": 0.12751844778829982, + "learning_rate": 9.117864766534772e-05, + "loss": 0.6127, + "step": 6065 + }, + { + "epoch": 0.5423819742489271, + "grad_norm": 0.11755996114362079, + "learning_rate": 9.114980082482677e-05, + "loss": 0.6454, + "step": 6066 + }, + { + "epoch": 0.5424713876967096, + "grad_norm": 0.13474386433249055, + "learning_rate": 9.112095472656137e-05, + "loss": 0.6734, + "step": 6067 + }, + { + "epoch": 0.5425608011444921, + "grad_norm": 0.13282266053675085, + "learning_rate": 9.109210937297074e-05, + "loss": 0.6572, + "step": 6068 + }, + { + "epoch": 0.5426502145922747, + "grad_norm": 0.13546097601235296, + "learning_rate": 9.106326476647417e-05, + "loss": 0.6494, + "step": 6069 + }, + { + "epoch": 0.5427396280400573, + "grad_norm": 0.11969007192656739, + "learning_rate": 9.103442090949077e-05, + "loss": 0.611, + "step": 6070 + }, + { + "epoch": 0.5428290414878397, + "grad_norm": 0.13429955795058535, + "learning_rate": 9.100557780443968e-05, + "loss": 0.6715, + "step": 6071 + }, + { + "epoch": 0.5429184549356223, + "grad_norm": 0.1346262046228736, + "learning_rate": 9.09767354537399e-05, + "loss": 0.6238, + "step": 6072 + }, + { + "epoch": 0.5430078683834049, + "grad_norm": 0.13018370639149482, + "learning_rate": 9.094789385981045e-05, + "loss": 0.6756, + "step": 6073 + }, + { + "epoch": 0.5430972818311874, + "grad_norm": 0.12630433486228612, + "learning_rate": 9.09190530250702e-05, + "loss": 0.6347, + "step": 6074 + }, + { + "epoch": 0.54318669527897, + "grad_norm": 0.13620815238861692, + "learning_rate": 9.089021295193796e-05, + "loss": 0.6452, + "step": 6075 + }, + { + "epoch": 0.5432761087267525, + "grad_norm": 0.14147851620397964, + "learning_rate": 9.08613736428326e-05, + "loss": 0.6138, + "step": 6076 + }, + { + "epoch": 0.543365522174535, + "grad_norm": 0.1607794757680186, + "learning_rate": 9.083253510017279e-05, + "loss": 0.6693, + "step": 6077 + }, + { + "epoch": 0.5434549356223176, + "grad_norm": 0.12406050106110936, + "learning_rate": 9.08036973263772e-05, + "loss": 0.6195, + "step": 6078 + }, + { + "epoch": 0.5435443490701002, + "grad_norm": 0.12676939864673575, + "learning_rate": 9.077486032386439e-05, + "loss": 0.6499, + "step": 6079 + }, + { + "epoch": 0.5436337625178826, + "grad_norm": 0.13521415736482661, + "learning_rate": 9.074602409505293e-05, + "loss": 0.6568, + "step": 6080 + }, + { + "epoch": 0.5437231759656652, + "grad_norm": 0.13585979227222467, + "learning_rate": 9.071718864236125e-05, + "loss": 0.6749, + "step": 6081 + }, + { + "epoch": 0.5438125894134478, + "grad_norm": 0.1444851423710919, + "learning_rate": 9.06883539682077e-05, + "loss": 0.6682, + "step": 6082 + }, + { + "epoch": 0.5439020028612304, + "grad_norm": 0.13976395448742293, + "learning_rate": 9.065952007501067e-05, + "loss": 0.6358, + "step": 6083 + }, + { + "epoch": 0.5439914163090128, + "grad_norm": 0.14589758357290564, + "learning_rate": 9.063068696518843e-05, + "loss": 0.6754, + "step": 6084 + }, + { + "epoch": 0.5440808297567954, + "grad_norm": 0.12487707555443986, + "learning_rate": 9.060185464115918e-05, + "loss": 0.69, + "step": 6085 + }, + { + "epoch": 0.544170243204578, + "grad_norm": 0.13945020726707477, + "learning_rate": 9.0573023105341e-05, + "loss": 0.6533, + "step": 6086 + }, + { + "epoch": 0.5442596566523605, + "grad_norm": 0.14887534684093198, + "learning_rate": 9.054419236015201e-05, + "loss": 0.6521, + "step": 6087 + }, + { + "epoch": 0.5443490701001431, + "grad_norm": 0.13958284502156, + "learning_rate": 9.05153624080102e-05, + "loss": 0.6526, + "step": 6088 + }, + { + "epoch": 0.5444384835479256, + "grad_norm": 0.13987187040349844, + "learning_rate": 9.048653325133343e-05, + "loss": 0.6476, + "step": 6089 + }, + { + "epoch": 0.5445278969957081, + "grad_norm": 0.13645150777196952, + "learning_rate": 9.045770489253965e-05, + "loss": 0.665, + "step": 6090 + }, + { + "epoch": 0.5446173104434907, + "grad_norm": 0.12608214403120488, + "learning_rate": 9.042887733404666e-05, + "loss": 0.6355, + "step": 6091 + }, + { + "epoch": 0.5447067238912733, + "grad_norm": 0.12048994614601144, + "learning_rate": 9.040005057827216e-05, + "loss": 0.6349, + "step": 6092 + }, + { + "epoch": 0.5447961373390557, + "grad_norm": 0.13318664243477252, + "learning_rate": 9.037122462763383e-05, + "loss": 0.6669, + "step": 6093 + }, + { + "epoch": 0.5448855507868383, + "grad_norm": 0.13467178558036833, + "learning_rate": 9.034239948454925e-05, + "loss": 0.6444, + "step": 6094 + }, + { + "epoch": 0.5449749642346209, + "grad_norm": 0.12705644162083024, + "learning_rate": 9.031357515143599e-05, + "loss": 0.6249, + "step": 6095 + }, + { + "epoch": 0.5450643776824035, + "grad_norm": 0.12724067796797878, + "learning_rate": 9.028475163071141e-05, + "loss": 0.6328, + "step": 6096 + }, + { + "epoch": 0.545153791130186, + "grad_norm": 0.12260832571811696, + "learning_rate": 9.025592892479303e-05, + "loss": 0.6385, + "step": 6097 + }, + { + "epoch": 0.5452432045779685, + "grad_norm": 0.13841087128633403, + "learning_rate": 9.022710703609814e-05, + "loss": 0.6304, + "step": 6098 + }, + { + "epoch": 0.5453326180257511, + "grad_norm": 0.12139554619402279, + "learning_rate": 9.019828596704394e-05, + "loss": 0.6384, + "step": 6099 + }, + { + "epoch": 0.5454220314735336, + "grad_norm": 0.13319526162595993, + "learning_rate": 9.01694657200477e-05, + "loss": 0.6765, + "step": 6100 + }, + { + "epoch": 0.5455114449213162, + "grad_norm": 0.12523640522184137, + "learning_rate": 9.014064629752647e-05, + "loss": 0.6384, + "step": 6101 + }, + { + "epoch": 0.5456008583690987, + "grad_norm": 0.13607722547597742, + "learning_rate": 9.011182770189733e-05, + "loss": 0.6747, + "step": 6102 + }, + { + "epoch": 0.5456902718168812, + "grad_norm": 0.129815057246577, + "learning_rate": 9.008300993557723e-05, + "loss": 0.6589, + "step": 6103 + }, + { + "epoch": 0.5457796852646638, + "grad_norm": 0.1429741721119498, + "learning_rate": 9.005419300098316e-05, + "loss": 0.6453, + "step": 6104 + }, + { + "epoch": 0.5458690987124464, + "grad_norm": 0.12842309671157745, + "learning_rate": 9.002537690053191e-05, + "loss": 0.6523, + "step": 6105 + }, + { + "epoch": 0.545958512160229, + "grad_norm": 0.12367725175664367, + "learning_rate": 8.999656163664023e-05, + "loss": 0.6802, + "step": 6106 + }, + { + "epoch": 0.5460479256080114, + "grad_norm": 0.11638305808695787, + "learning_rate": 8.996774721172487e-05, + "loss": 0.6339, + "step": 6107 + }, + { + "epoch": 0.546137339055794, + "grad_norm": 0.12218061198539877, + "learning_rate": 8.993893362820241e-05, + "loss": 0.6584, + "step": 6108 + }, + { + "epoch": 0.5462267525035766, + "grad_norm": 0.12796896827209564, + "learning_rate": 8.991012088848944e-05, + "loss": 0.6401, + "step": 6109 + }, + { + "epoch": 0.5463161659513591, + "grad_norm": 0.133796670210342, + "learning_rate": 8.988130899500243e-05, + "loss": 0.6544, + "step": 6110 + }, + { + "epoch": 0.5464055793991416, + "grad_norm": 0.13657992192621485, + "learning_rate": 8.985249795015784e-05, + "loss": 0.6609, + "step": 6111 + }, + { + "epoch": 0.5464949928469242, + "grad_norm": 0.14997689145184281, + "learning_rate": 8.9823687756372e-05, + "loss": 0.6544, + "step": 6112 + }, + { + "epoch": 0.5465844062947067, + "grad_norm": 0.1436602943629485, + "learning_rate": 8.979487841606115e-05, + "loss": 0.6618, + "step": 6113 + }, + { + "epoch": 0.5466738197424893, + "grad_norm": 0.12781099115161007, + "learning_rate": 8.976606993164155e-05, + "loss": 0.6333, + "step": 6114 + }, + { + "epoch": 0.5467632331902719, + "grad_norm": 0.12487974573983612, + "learning_rate": 8.97372623055293e-05, + "loss": 0.6546, + "step": 6115 + }, + { + "epoch": 0.5468526466380543, + "grad_norm": 0.1183604990199323, + "learning_rate": 8.970845554014044e-05, + "loss": 0.6292, + "step": 6116 + }, + { + "epoch": 0.5469420600858369, + "grad_norm": 0.11427380096376574, + "learning_rate": 8.967964963789097e-05, + "loss": 0.65, + "step": 6117 + }, + { + "epoch": 0.5470314735336195, + "grad_norm": 0.1331217771609666, + "learning_rate": 8.965084460119687e-05, + "loss": 0.6489, + "step": 6118 + }, + { + "epoch": 0.547120886981402, + "grad_norm": 0.1429361659100269, + "learning_rate": 8.962204043247393e-05, + "loss": 0.6972, + "step": 6119 + }, + { + "epoch": 0.5472103004291845, + "grad_norm": 0.12498241875462669, + "learning_rate": 8.959323713413791e-05, + "loss": 0.6503, + "step": 6120 + }, + { + "epoch": 0.5472997138769671, + "grad_norm": 0.11895969211864314, + "learning_rate": 8.956443470860453e-05, + "loss": 0.6244, + "step": 6121 + }, + { + "epoch": 0.5473891273247496, + "grad_norm": 0.12633077251176317, + "learning_rate": 8.953563315828942e-05, + "loss": 0.6638, + "step": 6122 + }, + { + "epoch": 0.5474785407725322, + "grad_norm": 0.12920475534783707, + "learning_rate": 8.95068324856081e-05, + "loss": 0.621, + "step": 6123 + }, + { + "epoch": 0.5475679542203148, + "grad_norm": 0.128409263983758, + "learning_rate": 8.947803269297604e-05, + "loss": 0.6435, + "step": 6124 + }, + { + "epoch": 0.5476573676680973, + "grad_norm": 0.15582821650121437, + "learning_rate": 8.944923378280871e-05, + "loss": 0.7085, + "step": 6125 + }, + { + "epoch": 0.5477467811158798, + "grad_norm": 0.12899320902232886, + "learning_rate": 8.942043575752141e-05, + "loss": 0.6506, + "step": 6126 + }, + { + "epoch": 0.5478361945636624, + "grad_norm": 0.13136276679208067, + "learning_rate": 8.939163861952935e-05, + "loss": 0.594, + "step": 6127 + }, + { + "epoch": 0.547925608011445, + "grad_norm": 0.15289633543610667, + "learning_rate": 8.936284237124778e-05, + "loss": 0.6675, + "step": 6128 + }, + { + "epoch": 0.5480150214592274, + "grad_norm": 0.1303515971493517, + "learning_rate": 8.933404701509175e-05, + "loss": 0.6235, + "step": 6129 + }, + { + "epoch": 0.54810443490701, + "grad_norm": 0.13353923027350145, + "learning_rate": 8.930525255347634e-05, + "loss": 0.6793, + "step": 6130 + }, + { + "epoch": 0.5481938483547926, + "grad_norm": 0.1248377687752633, + "learning_rate": 8.927645898881644e-05, + "loss": 0.6314, + "step": 6131 + }, + { + "epoch": 0.5482832618025751, + "grad_norm": 0.11969918679464672, + "learning_rate": 8.924766632352702e-05, + "loss": 0.6468, + "step": 6132 + }, + { + "epoch": 0.5483726752503576, + "grad_norm": 0.13503105743062746, + "learning_rate": 8.92188745600228e-05, + "loss": 0.6586, + "step": 6133 + }, + { + "epoch": 0.5484620886981402, + "grad_norm": 0.11994488045658999, + "learning_rate": 8.919008370071859e-05, + "loss": 0.6455, + "step": 6134 + }, + { + "epoch": 0.5485515021459227, + "grad_norm": 0.13071709482931113, + "learning_rate": 8.916129374802899e-05, + "loss": 0.6331, + "step": 6135 + }, + { + "epoch": 0.5486409155937053, + "grad_norm": 0.13768327061890637, + "learning_rate": 8.913250470436858e-05, + "loss": 0.661, + "step": 6136 + }, + { + "epoch": 0.5487303290414879, + "grad_norm": 0.12388346099783504, + "learning_rate": 8.910371657215191e-05, + "loss": 0.6277, + "step": 6137 + }, + { + "epoch": 0.5488197424892703, + "grad_norm": 0.1273703802644318, + "learning_rate": 8.907492935379331e-05, + "loss": 0.6144, + "step": 6138 + }, + { + "epoch": 0.5489091559370529, + "grad_norm": 0.14522062500046634, + "learning_rate": 8.904614305170724e-05, + "loss": 0.6541, + "step": 6139 + }, + { + "epoch": 0.5489985693848355, + "grad_norm": 0.14172658989073716, + "learning_rate": 8.90173576683079e-05, + "loss": 0.681, + "step": 6140 + }, + { + "epoch": 0.5490879828326181, + "grad_norm": 0.13622957279570758, + "learning_rate": 8.898857320600952e-05, + "loss": 0.644, + "step": 6141 + }, + { + "epoch": 0.5491773962804005, + "grad_norm": 0.13010069676436933, + "learning_rate": 8.895978966722623e-05, + "loss": 0.6325, + "step": 6142 + }, + { + "epoch": 0.5492668097281831, + "grad_norm": 0.13727278491896416, + "learning_rate": 8.893100705437201e-05, + "loss": 0.6889, + "step": 6143 + }, + { + "epoch": 0.5493562231759657, + "grad_norm": 0.1407088388622779, + "learning_rate": 8.890222536986085e-05, + "loss": 0.6548, + "step": 6144 + }, + { + "epoch": 0.5494456366237482, + "grad_norm": 0.14021480089175026, + "learning_rate": 8.887344461610668e-05, + "loss": 0.6689, + "step": 6145 + }, + { + "epoch": 0.5495350500715308, + "grad_norm": 0.13472061795094306, + "learning_rate": 8.884466479552328e-05, + "loss": 0.6522, + "step": 6146 + }, + { + "epoch": 0.5496244635193133, + "grad_norm": 0.14288271697503455, + "learning_rate": 8.881588591052434e-05, + "loss": 0.6553, + "step": 6147 + }, + { + "epoch": 0.5497138769670958, + "grad_norm": 0.13822050261544358, + "learning_rate": 8.878710796352358e-05, + "loss": 0.5922, + "step": 6148 + }, + { + "epoch": 0.5498032904148784, + "grad_norm": 0.12530355833401102, + "learning_rate": 8.875833095693451e-05, + "loss": 0.629, + "step": 6149 + }, + { + "epoch": 0.549892703862661, + "grad_norm": 0.12123797912781156, + "learning_rate": 8.872955489317063e-05, + "loss": 0.6722, + "step": 6150 + }, + { + "epoch": 0.5499821173104434, + "grad_norm": 0.13257555302978566, + "learning_rate": 8.870077977464537e-05, + "loss": 0.662, + "step": 6151 + }, + { + "epoch": 0.550071530758226, + "grad_norm": 0.15244647314179025, + "learning_rate": 8.867200560377209e-05, + "loss": 0.6322, + "step": 6152 + }, + { + "epoch": 0.5501609442060086, + "grad_norm": 0.1160773820637136, + "learning_rate": 8.864323238296401e-05, + "loss": 0.6526, + "step": 6153 + }, + { + "epoch": 0.5502503576537912, + "grad_norm": 0.12613064408342006, + "learning_rate": 8.861446011463432e-05, + "loss": 0.6349, + "step": 6154 + }, + { + "epoch": 0.5503397711015737, + "grad_norm": 0.1261555107197043, + "learning_rate": 8.858568880119611e-05, + "loss": 0.6361, + "step": 6155 + }, + { + "epoch": 0.5504291845493562, + "grad_norm": 0.12946526921937374, + "learning_rate": 8.855691844506238e-05, + "loss": 0.6481, + "step": 6156 + }, + { + "epoch": 0.5505185979971388, + "grad_norm": 0.12477200072123804, + "learning_rate": 8.852814904864611e-05, + "loss": 0.6776, + "step": 6157 + }, + { + "epoch": 0.5506080114449213, + "grad_norm": 0.1325283623555409, + "learning_rate": 8.849938061436006e-05, + "loss": 0.6428, + "step": 6158 + }, + { + "epoch": 0.5506974248927039, + "grad_norm": 0.11926485368008286, + "learning_rate": 8.847061314461714e-05, + "loss": 0.6397, + "step": 6159 + }, + { + "epoch": 0.5507868383404864, + "grad_norm": 0.13596554929410415, + "learning_rate": 8.844184664182993e-05, + "loss": 0.6256, + "step": 6160 + }, + { + "epoch": 0.5508762517882689, + "grad_norm": 0.11784243448661325, + "learning_rate": 8.84130811084111e-05, + "loss": 0.6213, + "step": 6161 + }, + { + "epoch": 0.5509656652360515, + "grad_norm": 0.1298382289927204, + "learning_rate": 8.838431654677317e-05, + "loss": 0.6391, + "step": 6162 + }, + { + "epoch": 0.5510550786838341, + "grad_norm": 0.12024670374408544, + "learning_rate": 8.835555295932857e-05, + "loss": 0.6512, + "step": 6163 + }, + { + "epoch": 0.5511444921316166, + "grad_norm": 0.13221257210057993, + "learning_rate": 8.832679034848969e-05, + "loss": 0.6696, + "step": 6164 + }, + { + "epoch": 0.5512339055793991, + "grad_norm": 0.1376509097652368, + "learning_rate": 8.829802871666877e-05, + "loss": 0.6606, + "step": 6165 + }, + { + "epoch": 0.5513233190271817, + "grad_norm": 0.1350550500014722, + "learning_rate": 8.82692680662781e-05, + "loss": 0.6416, + "step": 6166 + }, + { + "epoch": 0.5514127324749643, + "grad_norm": 0.13121650248385364, + "learning_rate": 8.824050839972973e-05, + "loss": 0.6169, + "step": 6167 + }, + { + "epoch": 0.5515021459227468, + "grad_norm": 0.13175389395713943, + "learning_rate": 8.821174971943572e-05, + "loss": 0.6315, + "step": 6168 + }, + { + "epoch": 0.5515915593705293, + "grad_norm": 0.14563155682080187, + "learning_rate": 8.818299202780805e-05, + "loss": 0.6482, + "step": 6169 + }, + { + "epoch": 0.5516809728183119, + "grad_norm": 0.134569001275435, + "learning_rate": 8.815423532725852e-05, + "loss": 0.6498, + "step": 6170 + }, + { + "epoch": 0.5517703862660944, + "grad_norm": 0.1168713940392866, + "learning_rate": 8.8125479620199e-05, + "loss": 0.6248, + "step": 6171 + }, + { + "epoch": 0.551859799713877, + "grad_norm": 0.12003085958539716, + "learning_rate": 8.809672490904111e-05, + "loss": 0.6562, + "step": 6172 + }, + { + "epoch": 0.5519492131616596, + "grad_norm": 0.13526274663556337, + "learning_rate": 8.806797119619658e-05, + "loss": 0.616, + "step": 6173 + }, + { + "epoch": 0.552038626609442, + "grad_norm": 0.1393013506929957, + "learning_rate": 8.803921848407687e-05, + "loss": 0.6834, + "step": 6174 + }, + { + "epoch": 0.5521280400572246, + "grad_norm": 0.13725501844494098, + "learning_rate": 8.80104667750935e-05, + "loss": 0.6539, + "step": 6175 + }, + { + "epoch": 0.5522174535050072, + "grad_norm": 0.13708879998226922, + "learning_rate": 8.798171607165778e-05, + "loss": 0.7, + "step": 6176 + }, + { + "epoch": 0.5523068669527897, + "grad_norm": 0.1444153579445027, + "learning_rate": 8.795296637618101e-05, + "loss": 0.6593, + "step": 6177 + }, + { + "epoch": 0.5523962804005722, + "grad_norm": 0.13385930509865007, + "learning_rate": 8.792421769107442e-05, + "loss": 0.671, + "step": 6178 + }, + { + "epoch": 0.5524856938483548, + "grad_norm": 0.11750088947051665, + "learning_rate": 8.789547001874906e-05, + "loss": 0.5942, + "step": 6179 + }, + { + "epoch": 0.5525751072961373, + "grad_norm": 0.12595222970560274, + "learning_rate": 8.786672336161605e-05, + "loss": 0.6338, + "step": 6180 + }, + { + "epoch": 0.5526645207439199, + "grad_norm": 0.13574955702506397, + "learning_rate": 8.783797772208628e-05, + "loss": 0.6627, + "step": 6181 + }, + { + "epoch": 0.5527539341917024, + "grad_norm": 0.15278334165416071, + "learning_rate": 8.780923310257067e-05, + "loss": 0.6781, + "step": 6182 + }, + { + "epoch": 0.552843347639485, + "grad_norm": 0.16210169541869004, + "learning_rate": 8.778048950547994e-05, + "loss": 0.6875, + "step": 6183 + }, + { + "epoch": 0.5529327610872675, + "grad_norm": 0.13574977831979904, + "learning_rate": 8.775174693322478e-05, + "loss": 0.6792, + "step": 6184 + }, + { + "epoch": 0.5530221745350501, + "grad_norm": 0.12242573700064947, + "learning_rate": 8.772300538821583e-05, + "loss": 0.6304, + "step": 6185 + }, + { + "epoch": 0.5531115879828327, + "grad_norm": 0.15455017943273935, + "learning_rate": 8.769426487286356e-05, + "loss": 0.6568, + "step": 6186 + }, + { + "epoch": 0.5532010014306151, + "grad_norm": 0.1419386770771869, + "learning_rate": 8.766552538957846e-05, + "loss": 0.6387, + "step": 6187 + }, + { + "epoch": 0.5532904148783977, + "grad_norm": 0.14599532473664806, + "learning_rate": 8.763678694077083e-05, + "loss": 0.6938, + "step": 6188 + }, + { + "epoch": 0.5533798283261803, + "grad_norm": 0.12955303189112324, + "learning_rate": 8.760804952885098e-05, + "loss": 0.5709, + "step": 6189 + }, + { + "epoch": 0.5534692417739628, + "grad_norm": 0.13075060638042157, + "learning_rate": 8.757931315622903e-05, + "loss": 0.6358, + "step": 6190 + }, + { + "epoch": 0.5535586552217453, + "grad_norm": 0.13539442170042007, + "learning_rate": 8.755057782531509e-05, + "loss": 0.6512, + "step": 6191 + }, + { + "epoch": 0.5536480686695279, + "grad_norm": 0.13362182818815738, + "learning_rate": 8.752184353851916e-05, + "loss": 0.676, + "step": 6192 + }, + { + "epoch": 0.5537374821173104, + "grad_norm": 0.11524518176147769, + "learning_rate": 8.749311029825111e-05, + "loss": 0.625, + "step": 6193 + }, + { + "epoch": 0.553826895565093, + "grad_norm": 0.1318455173755058, + "learning_rate": 8.74643781069208e-05, + "loss": 0.6883, + "step": 6194 + }, + { + "epoch": 0.5539163090128756, + "grad_norm": 0.14292212713667302, + "learning_rate": 8.7435646966938e-05, + "loss": 0.6426, + "step": 6195 + }, + { + "epoch": 0.554005722460658, + "grad_norm": 0.12575796211186194, + "learning_rate": 8.74069168807123e-05, + "loss": 0.6263, + "step": 6196 + }, + { + "epoch": 0.5540951359084406, + "grad_norm": 0.13669852163997398, + "learning_rate": 8.737818785065326e-05, + "loss": 0.6229, + "step": 6197 + }, + { + "epoch": 0.5541845493562232, + "grad_norm": 0.1285287213413593, + "learning_rate": 8.734945987917038e-05, + "loss": 0.6366, + "step": 6198 + }, + { + "epoch": 0.5542739628040058, + "grad_norm": 0.13501565173126542, + "learning_rate": 8.732073296867303e-05, + "loss": 0.6583, + "step": 6199 + }, + { + "epoch": 0.5543633762517882, + "grad_norm": 0.14619647909973235, + "learning_rate": 8.729200712157043e-05, + "loss": 0.6652, + "step": 6200 + }, + { + "epoch": 0.5544527896995708, + "grad_norm": 0.12513023877835774, + "learning_rate": 8.726328234027188e-05, + "loss": 0.6092, + "step": 6201 + }, + { + "epoch": 0.5545422031473534, + "grad_norm": 0.14689146506457046, + "learning_rate": 8.723455862718649e-05, + "loss": 0.6687, + "step": 6202 + }, + { + "epoch": 0.5546316165951359, + "grad_norm": 0.14592508312589078, + "learning_rate": 8.720583598472322e-05, + "loss": 0.6415, + "step": 6203 + }, + { + "epoch": 0.5547210300429185, + "grad_norm": 0.13944011887540383, + "learning_rate": 8.717711441529104e-05, + "loss": 0.6766, + "step": 6204 + }, + { + "epoch": 0.554810443490701, + "grad_norm": 0.14185186226217084, + "learning_rate": 8.71483939212988e-05, + "loss": 0.6662, + "step": 6205 + }, + { + "epoch": 0.5548998569384835, + "grad_norm": 0.12108469373741804, + "learning_rate": 8.711967450515524e-05, + "loss": 0.6545, + "step": 6206 + }, + { + "epoch": 0.5549892703862661, + "grad_norm": 0.1295447556226329, + "learning_rate": 8.709095616926897e-05, + "loss": 0.644, + "step": 6207 + }, + { + "epoch": 0.5550786838340487, + "grad_norm": 0.12214712608580144, + "learning_rate": 8.706223891604866e-05, + "loss": 0.6427, + "step": 6208 + }, + { + "epoch": 0.5551680972818311, + "grad_norm": 0.13927568422481576, + "learning_rate": 8.703352274790276e-05, + "loss": 0.6632, + "step": 6209 + }, + { + "epoch": 0.5552575107296137, + "grad_norm": 0.14726250293635115, + "learning_rate": 8.700480766723964e-05, + "loss": 0.6703, + "step": 6210 + }, + { + "epoch": 0.5553469241773963, + "grad_norm": 0.13948813411455535, + "learning_rate": 8.69760936764676e-05, + "loss": 0.608, + "step": 6211 + }, + { + "epoch": 0.5554363376251789, + "grad_norm": 0.12766300992363566, + "learning_rate": 8.694738077799488e-05, + "loss": 0.6131, + "step": 6212 + }, + { + "epoch": 0.5555257510729614, + "grad_norm": 0.12053647502399296, + "learning_rate": 8.691866897422952e-05, + "loss": 0.6138, + "step": 6213 + }, + { + "epoch": 0.5556151645207439, + "grad_norm": 0.1329698002391906, + "learning_rate": 8.688995826757961e-05, + "loss": 0.6477, + "step": 6214 + }, + { + "epoch": 0.5557045779685265, + "grad_norm": 0.11806015946128043, + "learning_rate": 8.686124866045308e-05, + "loss": 0.6269, + "step": 6215 + }, + { + "epoch": 0.555793991416309, + "grad_norm": 0.12312957490087421, + "learning_rate": 8.683254015525776e-05, + "loss": 0.6439, + "step": 6216 + }, + { + "epoch": 0.5558834048640916, + "grad_norm": 0.13194420578577437, + "learning_rate": 8.680383275440138e-05, + "loss": 0.6574, + "step": 6217 + }, + { + "epoch": 0.5559728183118741, + "grad_norm": 0.1274325304722276, + "learning_rate": 8.677512646029163e-05, + "loss": 0.6734, + "step": 6218 + }, + { + "epoch": 0.5560622317596566, + "grad_norm": 0.13406067419216755, + "learning_rate": 8.674642127533605e-05, + "loss": 0.6428, + "step": 6219 + }, + { + "epoch": 0.5561516452074392, + "grad_norm": 0.12669731395580755, + "learning_rate": 8.671771720194211e-05, + "loss": 0.6453, + "step": 6220 + }, + { + "epoch": 0.5562410586552218, + "grad_norm": 0.12887130634298655, + "learning_rate": 8.668901424251714e-05, + "loss": 0.679, + "step": 6221 + }, + { + "epoch": 0.5563304721030042, + "grad_norm": 0.13489052087756123, + "learning_rate": 8.666031239946852e-05, + "loss": 0.6597, + "step": 6222 + }, + { + "epoch": 0.5564198855507868, + "grad_norm": 0.1388553485862018, + "learning_rate": 8.66316116752034e-05, + "loss": 0.6398, + "step": 6223 + }, + { + "epoch": 0.5565092989985694, + "grad_norm": 0.1258139657452108, + "learning_rate": 8.660291207212882e-05, + "loss": 0.6674, + "step": 6224 + }, + { + "epoch": 0.556598712446352, + "grad_norm": 0.134376988954353, + "learning_rate": 8.657421359265188e-05, + "loss": 0.6409, + "step": 6225 + }, + { + "epoch": 0.5566881258941345, + "grad_norm": 0.1202401311345221, + "learning_rate": 8.654551623917941e-05, + "loss": 0.6436, + "step": 6226 + }, + { + "epoch": 0.556777539341917, + "grad_norm": 0.13692510213934006, + "learning_rate": 8.651682001411821e-05, + "loss": 0.6318, + "step": 6227 + }, + { + "epoch": 0.5568669527896996, + "grad_norm": 0.1331583084659034, + "learning_rate": 8.648812491987504e-05, + "loss": 0.6906, + "step": 6228 + }, + { + "epoch": 0.5569563662374821, + "grad_norm": 0.13038616277176, + "learning_rate": 8.645943095885655e-05, + "loss": 0.6484, + "step": 6229 + }, + { + "epoch": 0.5570457796852647, + "grad_norm": 0.12263951115880795, + "learning_rate": 8.643073813346922e-05, + "loss": 0.6552, + "step": 6230 + }, + { + "epoch": 0.5571351931330472, + "grad_norm": 0.1368917750180441, + "learning_rate": 8.640204644611948e-05, + "loss": 0.662, + "step": 6231 + }, + { + "epoch": 0.5572246065808297, + "grad_norm": 0.12986232117858215, + "learning_rate": 8.63733558992137e-05, + "loss": 0.682, + "step": 6232 + }, + { + "epoch": 0.5573140200286123, + "grad_norm": 0.1392455579901547, + "learning_rate": 8.634466649515811e-05, + "loss": 0.6631, + "step": 6233 + }, + { + "epoch": 0.5574034334763949, + "grad_norm": 0.13030318952167166, + "learning_rate": 8.63159782363588e-05, + "loss": 0.6022, + "step": 6234 + }, + { + "epoch": 0.5574928469241774, + "grad_norm": 0.13184145214124932, + "learning_rate": 8.62872911252219e-05, + "loss": 0.6583, + "step": 6235 + }, + { + "epoch": 0.5575822603719599, + "grad_norm": 0.1423174161628551, + "learning_rate": 8.625860516415335e-05, + "loss": 0.6617, + "step": 6236 + }, + { + "epoch": 0.5576716738197425, + "grad_norm": 0.142788519642374, + "learning_rate": 8.6229920355559e-05, + "loss": 0.6671, + "step": 6237 + }, + { + "epoch": 0.557761087267525, + "grad_norm": 0.15394827005439143, + "learning_rate": 8.620123670184455e-05, + "loss": 0.6863, + "step": 6238 + }, + { + "epoch": 0.5578505007153076, + "grad_norm": 0.13977133780771472, + "learning_rate": 8.617255420541576e-05, + "loss": 0.6584, + "step": 6239 + }, + { + "epoch": 0.5579399141630901, + "grad_norm": 0.13654710070315826, + "learning_rate": 8.614387286867814e-05, + "loss": 0.6574, + "step": 6240 + }, + { + "epoch": 0.5580293276108726, + "grad_norm": 0.12384883724862314, + "learning_rate": 8.611519269403712e-05, + "loss": 0.614, + "step": 6241 + }, + { + "epoch": 0.5581187410586552, + "grad_norm": 0.14620631140420862, + "learning_rate": 8.608651368389815e-05, + "loss": 0.6884, + "step": 6242 + }, + { + "epoch": 0.5582081545064378, + "grad_norm": 0.12941213111832023, + "learning_rate": 8.605783584066649e-05, + "loss": 0.6434, + "step": 6243 + }, + { + "epoch": 0.5582975679542204, + "grad_norm": 0.1275810578553516, + "learning_rate": 8.602915916674731e-05, + "loss": 0.6165, + "step": 6244 + }, + { + "epoch": 0.5583869814020028, + "grad_norm": 0.12240351745785097, + "learning_rate": 8.600048366454565e-05, + "loss": 0.6436, + "step": 6245 + }, + { + "epoch": 0.5584763948497854, + "grad_norm": 0.1310733216864227, + "learning_rate": 8.597180933646653e-05, + "loss": 0.6432, + "step": 6246 + }, + { + "epoch": 0.558565808297568, + "grad_norm": 0.15716041586420681, + "learning_rate": 8.594313618491481e-05, + "loss": 0.6638, + "step": 6247 + }, + { + "epoch": 0.5586552217453505, + "grad_norm": 0.14961279292887616, + "learning_rate": 8.591446421229528e-05, + "loss": 0.6572, + "step": 6248 + }, + { + "epoch": 0.558744635193133, + "grad_norm": 0.13521686134758165, + "learning_rate": 8.588579342101263e-05, + "loss": 0.6285, + "step": 6249 + }, + { + "epoch": 0.5588340486409156, + "grad_norm": 0.12437661479740282, + "learning_rate": 8.585712381347145e-05, + "loss": 0.5686, + "step": 6250 + }, + { + "epoch": 0.5589234620886981, + "grad_norm": 0.12346256576184866, + "learning_rate": 8.58284553920762e-05, + "loss": 0.6633, + "step": 6251 + }, + { + "epoch": 0.5590128755364807, + "grad_norm": 0.1251025585467884, + "learning_rate": 8.57997881592313e-05, + "loss": 0.6312, + "step": 6252 + }, + { + "epoch": 0.5591022889842633, + "grad_norm": 0.1291028921071069, + "learning_rate": 8.577112211734104e-05, + "loss": 0.6486, + "step": 6253 + }, + { + "epoch": 0.5591917024320457, + "grad_norm": 0.1291835391614438, + "learning_rate": 8.574245726880953e-05, + "loss": 0.6506, + "step": 6254 + }, + { + "epoch": 0.5592811158798283, + "grad_norm": 0.13592336532306343, + "learning_rate": 8.571379361604091e-05, + "loss": 0.641, + "step": 6255 + }, + { + "epoch": 0.5593705293276109, + "grad_norm": 0.12248946296745382, + "learning_rate": 8.568513116143919e-05, + "loss": 0.6121, + "step": 6256 + }, + { + "epoch": 0.5594599427753935, + "grad_norm": 0.128961554955683, + "learning_rate": 8.565646990740824e-05, + "loss": 0.5874, + "step": 6257 + }, + { + "epoch": 0.5595493562231759, + "grad_norm": 0.13148960719244301, + "learning_rate": 8.562780985635183e-05, + "loss": 0.6261, + "step": 6258 + }, + { + "epoch": 0.5596387696709585, + "grad_norm": 0.13591012092623073, + "learning_rate": 8.559915101067366e-05, + "loss": 0.652, + "step": 6259 + }, + { + "epoch": 0.5597281831187411, + "grad_norm": 0.12354021101427452, + "learning_rate": 8.55704933727773e-05, + "loss": 0.6546, + "step": 6260 + }, + { + "epoch": 0.5598175965665236, + "grad_norm": 0.12975880325669176, + "learning_rate": 8.554183694506622e-05, + "loss": 0.6432, + "step": 6261 + }, + { + "epoch": 0.5599070100143062, + "grad_norm": 0.12017199712993705, + "learning_rate": 8.551318172994378e-05, + "loss": 0.6597, + "step": 6262 + }, + { + "epoch": 0.5599964234620887, + "grad_norm": 0.14523801704228145, + "learning_rate": 8.548452772981334e-05, + "loss": 0.6269, + "step": 6263 + }, + { + "epoch": 0.5600858369098712, + "grad_norm": 0.12393263392632564, + "learning_rate": 8.545587494707803e-05, + "loss": 0.6399, + "step": 6264 + }, + { + "epoch": 0.5601752503576538, + "grad_norm": 0.14141514800577584, + "learning_rate": 8.54272233841409e-05, + "loss": 0.6741, + "step": 6265 + }, + { + "epoch": 0.5602646638054364, + "grad_norm": 0.13414117806816217, + "learning_rate": 8.539857304340498e-05, + "loss": 0.6522, + "step": 6266 + }, + { + "epoch": 0.5603540772532188, + "grad_norm": 0.14498849789305393, + "learning_rate": 8.53699239272731e-05, + "loss": 0.6911, + "step": 6267 + }, + { + "epoch": 0.5604434907010014, + "grad_norm": 0.14286703568514245, + "learning_rate": 8.5341276038148e-05, + "loss": 0.6799, + "step": 6268 + }, + { + "epoch": 0.560532904148784, + "grad_norm": 0.12360242342163996, + "learning_rate": 8.531262937843236e-05, + "loss": 0.6152, + "step": 6269 + }, + { + "epoch": 0.5606223175965666, + "grad_norm": 0.1550631151516845, + "learning_rate": 8.528398395052879e-05, + "loss": 0.6803, + "step": 6270 + }, + { + "epoch": 0.560711731044349, + "grad_norm": 0.11979844321538216, + "learning_rate": 8.525533975683972e-05, + "loss": 0.6537, + "step": 6271 + }, + { + "epoch": 0.5608011444921316, + "grad_norm": 0.14138763655236866, + "learning_rate": 8.522669679976749e-05, + "loss": 0.7136, + "step": 6272 + }, + { + "epoch": 0.5608905579399142, + "grad_norm": 0.1364805908120512, + "learning_rate": 8.519805508171437e-05, + "loss": 0.6658, + "step": 6273 + }, + { + "epoch": 0.5609799713876967, + "grad_norm": 0.12134018141859801, + "learning_rate": 8.516941460508247e-05, + "loss": 0.6553, + "step": 6274 + }, + { + "epoch": 0.5610693848354793, + "grad_norm": 0.11941045700696216, + "learning_rate": 8.514077537227388e-05, + "loss": 0.632, + "step": 6275 + }, + { + "epoch": 0.5611587982832618, + "grad_norm": 0.14405108493301158, + "learning_rate": 8.511213738569046e-05, + "loss": 0.6547, + "step": 6276 + }, + { + "epoch": 0.5612482117310443, + "grad_norm": 0.13942261614993898, + "learning_rate": 8.508350064773415e-05, + "loss": 0.6964, + "step": 6277 + }, + { + "epoch": 0.5613376251788269, + "grad_norm": 0.15986846250255626, + "learning_rate": 8.50548651608066e-05, + "loss": 0.6565, + "step": 6278 + }, + { + "epoch": 0.5614270386266095, + "grad_norm": 0.13775510589906084, + "learning_rate": 8.50262309273095e-05, + "loss": 0.6264, + "step": 6279 + }, + { + "epoch": 0.5615164520743919, + "grad_norm": 0.13331952757610643, + "learning_rate": 8.49975979496443e-05, + "loss": 0.6435, + "step": 6280 + }, + { + "epoch": 0.5616058655221745, + "grad_norm": 0.13075482225882756, + "learning_rate": 8.496896623021245e-05, + "loss": 0.6638, + "step": 6281 + }, + { + "epoch": 0.5616952789699571, + "grad_norm": 0.12787056265521066, + "learning_rate": 8.494033577141525e-05, + "loss": 0.6367, + "step": 6282 + }, + { + "epoch": 0.5617846924177397, + "grad_norm": 0.1381922969561462, + "learning_rate": 8.491170657565386e-05, + "loss": 0.6234, + "step": 6283 + }, + { + "epoch": 0.5618741058655222, + "grad_norm": 0.12977137372983907, + "learning_rate": 8.488307864532946e-05, + "loss": 0.6431, + "step": 6284 + }, + { + "epoch": 0.5619635193133047, + "grad_norm": 0.13475678702082378, + "learning_rate": 8.485445198284298e-05, + "loss": 0.6525, + "step": 6285 + }, + { + "epoch": 0.5620529327610873, + "grad_norm": 0.13727632450407992, + "learning_rate": 8.482582659059534e-05, + "loss": 0.5919, + "step": 6286 + }, + { + "epoch": 0.5621423462088698, + "grad_norm": 0.13964817003368454, + "learning_rate": 8.47972024709873e-05, + "loss": 0.666, + "step": 6287 + }, + { + "epoch": 0.5622317596566524, + "grad_norm": 0.16188741201223605, + "learning_rate": 8.47685796264195e-05, + "loss": 0.678, + "step": 6288 + }, + { + "epoch": 0.5623211731044349, + "grad_norm": 0.1317754723562474, + "learning_rate": 8.473995805929257e-05, + "loss": 0.656, + "step": 6289 + }, + { + "epoch": 0.5624105865522174, + "grad_norm": 0.1359849127737354, + "learning_rate": 8.471133777200688e-05, + "loss": 0.6409, + "step": 6290 + }, + { + "epoch": 0.5625, + "grad_norm": 0.13110074448637601, + "learning_rate": 8.468271876696286e-05, + "loss": 0.6392, + "step": 6291 + }, + { + "epoch": 0.5625894134477826, + "grad_norm": 0.12198038262380735, + "learning_rate": 8.46541010465607e-05, + "loss": 0.6223, + "step": 6292 + }, + { + "epoch": 0.5626788268955651, + "grad_norm": 0.1251791304288294, + "learning_rate": 8.462548461320057e-05, + "loss": 0.6235, + "step": 6293 + }, + { + "epoch": 0.5627682403433476, + "grad_norm": 0.13920706363398197, + "learning_rate": 8.459686946928249e-05, + "loss": 0.6768, + "step": 6294 + }, + { + "epoch": 0.5628576537911302, + "grad_norm": 0.12343105894080995, + "learning_rate": 8.456825561720634e-05, + "loss": 0.6449, + "step": 6295 + }, + { + "epoch": 0.5629470672389127, + "grad_norm": 0.12695798048952245, + "learning_rate": 8.453964305937197e-05, + "loss": 0.6519, + "step": 6296 + }, + { + "epoch": 0.5630364806866953, + "grad_norm": 0.13984063849625675, + "learning_rate": 8.451103179817903e-05, + "loss": 0.6891, + "step": 6297 + }, + { + "epoch": 0.5631258941344778, + "grad_norm": 0.12483497384603978, + "learning_rate": 8.448242183602719e-05, + "loss": 0.6411, + "step": 6298 + }, + { + "epoch": 0.5632153075822603, + "grad_norm": 0.14632999593546245, + "learning_rate": 8.445381317531586e-05, + "loss": 0.6666, + "step": 6299 + }, + { + "epoch": 0.5633047210300429, + "grad_norm": 0.13610178199574932, + "learning_rate": 8.442520581844447e-05, + "loss": 0.6834, + "step": 6300 + }, + { + "epoch": 0.5633941344778255, + "grad_norm": 0.14013024815558142, + "learning_rate": 8.439659976781226e-05, + "loss": 0.6508, + "step": 6301 + }, + { + "epoch": 0.5634835479256081, + "grad_norm": 0.12859712782768876, + "learning_rate": 8.436799502581836e-05, + "loss": 0.662, + "step": 6302 + }, + { + "epoch": 0.5635729613733905, + "grad_norm": 0.14293031944153609, + "learning_rate": 8.433939159486186e-05, + "loss": 0.6306, + "step": 6303 + }, + { + "epoch": 0.5636623748211731, + "grad_norm": 0.13095263913901897, + "learning_rate": 8.431078947734164e-05, + "loss": 0.6478, + "step": 6304 + }, + { + "epoch": 0.5637517882689557, + "grad_norm": 0.143660070652813, + "learning_rate": 8.428218867565659e-05, + "loss": 0.6508, + "step": 6305 + }, + { + "epoch": 0.5638412017167382, + "grad_norm": 0.1430156974460853, + "learning_rate": 8.425358919220537e-05, + "loss": 0.6785, + "step": 6306 + }, + { + "epoch": 0.5639306151645207, + "grad_norm": 0.12875882460188587, + "learning_rate": 8.422499102938663e-05, + "loss": 0.6114, + "step": 6307 + }, + { + "epoch": 0.5640200286123033, + "grad_norm": 0.13909130620143886, + "learning_rate": 8.419639418959884e-05, + "loss": 0.6942, + "step": 6308 + }, + { + "epoch": 0.5641094420600858, + "grad_norm": 0.1488085483929436, + "learning_rate": 8.416779867524039e-05, + "loss": 0.6709, + "step": 6309 + }, + { + "epoch": 0.5641988555078684, + "grad_norm": 0.15490824226417713, + "learning_rate": 8.413920448870954e-05, + "loss": 0.6685, + "step": 6310 + }, + { + "epoch": 0.564288268955651, + "grad_norm": 0.13220082816921935, + "learning_rate": 8.411061163240441e-05, + "loss": 0.6175, + "step": 6311 + }, + { + "epoch": 0.5643776824034334, + "grad_norm": 0.14243571667520752, + "learning_rate": 8.408202010872312e-05, + "loss": 0.6405, + "step": 6312 + }, + { + "epoch": 0.564467095851216, + "grad_norm": 0.13471725145373462, + "learning_rate": 8.40534299200636e-05, + "loss": 0.6585, + "step": 6313 + }, + { + "epoch": 0.5645565092989986, + "grad_norm": 0.14780608816472587, + "learning_rate": 8.402484106882364e-05, + "loss": 0.6325, + "step": 6314 + }, + { + "epoch": 0.5646459227467812, + "grad_norm": 0.14556409234497653, + "learning_rate": 8.399625355740097e-05, + "loss": 0.656, + "step": 6315 + }, + { + "epoch": 0.5647353361945636, + "grad_norm": 0.13813004814053118, + "learning_rate": 8.396766738819319e-05, + "loss": 0.6456, + "step": 6316 + }, + { + "epoch": 0.5648247496423462, + "grad_norm": 0.11856364650992003, + "learning_rate": 8.393908256359776e-05, + "loss": 0.6562, + "step": 6317 + }, + { + "epoch": 0.5649141630901288, + "grad_norm": 0.1348266944106183, + "learning_rate": 8.39104990860121e-05, + "loss": 0.6438, + "step": 6318 + }, + { + "epoch": 0.5650035765379113, + "grad_norm": 0.1373314892908934, + "learning_rate": 8.388191695783345e-05, + "loss": 0.6883, + "step": 6319 + }, + { + "epoch": 0.5650929899856938, + "grad_norm": 0.1446087593710813, + "learning_rate": 8.385333618145896e-05, + "loss": 0.6931, + "step": 6320 + }, + { + "epoch": 0.5651824034334764, + "grad_norm": 0.1295357360972907, + "learning_rate": 8.382475675928568e-05, + "loss": 0.6473, + "step": 6321 + }, + { + "epoch": 0.5652718168812589, + "grad_norm": 0.14699903005759668, + "learning_rate": 8.379617869371049e-05, + "loss": 0.6738, + "step": 6322 + }, + { + "epoch": 0.5653612303290415, + "grad_norm": 0.12323104892892509, + "learning_rate": 8.376760198713024e-05, + "loss": 0.6129, + "step": 6323 + }, + { + "epoch": 0.5654506437768241, + "grad_norm": 0.13133320309274615, + "learning_rate": 8.373902664194156e-05, + "loss": 0.6536, + "step": 6324 + }, + { + "epoch": 0.5655400572246065, + "grad_norm": 0.13332248166707, + "learning_rate": 8.371045266054114e-05, + "loss": 0.6735, + "step": 6325 + }, + { + "epoch": 0.5656294706723891, + "grad_norm": 0.15026104135999102, + "learning_rate": 8.368188004532535e-05, + "loss": 0.647, + "step": 6326 + }, + { + "epoch": 0.5657188841201717, + "grad_norm": 0.13722298132581381, + "learning_rate": 8.365330879869059e-05, + "loss": 0.6487, + "step": 6327 + }, + { + "epoch": 0.5658082975679543, + "grad_norm": 0.1355719033924247, + "learning_rate": 8.362473892303308e-05, + "loss": 0.6521, + "step": 6328 + }, + { + "epoch": 0.5658977110157367, + "grad_norm": 0.13511255476941697, + "learning_rate": 8.359617042074891e-05, + "loss": 0.6655, + "step": 6329 + }, + { + "epoch": 0.5659871244635193, + "grad_norm": 0.13630014108836544, + "learning_rate": 8.356760329423417e-05, + "loss": 0.6392, + "step": 6330 + }, + { + "epoch": 0.5660765379113019, + "grad_norm": 0.12476051826874808, + "learning_rate": 8.353903754588463e-05, + "loss": 0.6745, + "step": 6331 + }, + { + "epoch": 0.5661659513590844, + "grad_norm": 0.11857967518339872, + "learning_rate": 8.351047317809617e-05, + "loss": 0.6658, + "step": 6332 + }, + { + "epoch": 0.566255364806867, + "grad_norm": 0.12858644353120874, + "learning_rate": 8.34819101932644e-05, + "loss": 0.6599, + "step": 6333 + }, + { + "epoch": 0.5663447782546495, + "grad_norm": 0.15101282566984583, + "learning_rate": 8.345334859378489e-05, + "loss": 0.6694, + "step": 6334 + }, + { + "epoch": 0.566434191702432, + "grad_norm": 0.11273157816893455, + "learning_rate": 8.342478838205302e-05, + "loss": 0.6243, + "step": 6335 + }, + { + "epoch": 0.5665236051502146, + "grad_norm": 0.1266420080952929, + "learning_rate": 8.339622956046417e-05, + "loss": 0.6588, + "step": 6336 + }, + { + "epoch": 0.5666130185979972, + "grad_norm": 0.11315014557093923, + "learning_rate": 8.336767213141348e-05, + "loss": 0.6221, + "step": 6337 + }, + { + "epoch": 0.5667024320457796, + "grad_norm": 0.11602450999040634, + "learning_rate": 8.333911609729601e-05, + "loss": 0.5984, + "step": 6338 + }, + { + "epoch": 0.5667918454935622, + "grad_norm": 0.13588034584993552, + "learning_rate": 8.331056146050676e-05, + "loss": 0.6275, + "step": 6339 + }, + { + "epoch": 0.5668812589413448, + "grad_norm": 0.12767099490288686, + "learning_rate": 8.328200822344058e-05, + "loss": 0.6131, + "step": 6340 + }, + { + "epoch": 0.5669706723891274, + "grad_norm": 0.14571980751292635, + "learning_rate": 8.325345638849221e-05, + "loss": 0.6716, + "step": 6341 + }, + { + "epoch": 0.5670600858369099, + "grad_norm": 0.1257875980943755, + "learning_rate": 8.322490595805619e-05, + "loss": 0.6652, + "step": 6342 + }, + { + "epoch": 0.5671494992846924, + "grad_norm": 0.10462204846802724, + "learning_rate": 8.319635693452707e-05, + "loss": 0.6271, + "step": 6343 + }, + { + "epoch": 0.567238912732475, + "grad_norm": 0.1369580145688548, + "learning_rate": 8.31678093202992e-05, + "loss": 0.6371, + "step": 6344 + }, + { + "epoch": 0.5673283261802575, + "grad_norm": 0.1373276253708858, + "learning_rate": 8.313926311776678e-05, + "loss": 0.6564, + "step": 6345 + }, + { + "epoch": 0.5674177396280401, + "grad_norm": 0.11763295123497114, + "learning_rate": 8.311071832932404e-05, + "loss": 0.6063, + "step": 6346 + }, + { + "epoch": 0.5675071530758226, + "grad_norm": 0.14337160398732257, + "learning_rate": 8.308217495736496e-05, + "loss": 0.6657, + "step": 6347 + }, + { + "epoch": 0.5675965665236051, + "grad_norm": 0.1303234245269845, + "learning_rate": 8.305363300428346e-05, + "loss": 0.6728, + "step": 6348 + }, + { + "epoch": 0.5676859799713877, + "grad_norm": 0.13031120567028392, + "learning_rate": 8.302509247247325e-05, + "loss": 0.6285, + "step": 6349 + }, + { + "epoch": 0.5677753934191703, + "grad_norm": 0.12879379341531716, + "learning_rate": 8.299655336432806e-05, + "loss": 0.6513, + "step": 6350 + }, + { + "epoch": 0.5678648068669528, + "grad_norm": 0.12906350506194292, + "learning_rate": 8.296801568224142e-05, + "loss": 0.6676, + "step": 6351 + }, + { + "epoch": 0.5679542203147353, + "grad_norm": 0.1603800142654801, + "learning_rate": 8.293947942860666e-05, + "loss": 0.7067, + "step": 6352 + }, + { + "epoch": 0.5680436337625179, + "grad_norm": 0.12690322730087933, + "learning_rate": 8.291094460581721e-05, + "loss": 0.6652, + "step": 6353 + }, + { + "epoch": 0.5681330472103004, + "grad_norm": 0.13908996531599935, + "learning_rate": 8.288241121626621e-05, + "loss": 0.6604, + "step": 6354 + }, + { + "epoch": 0.568222460658083, + "grad_norm": 0.11825625633576854, + "learning_rate": 8.28538792623467e-05, + "loss": 0.6248, + "step": 6355 + }, + { + "epoch": 0.5683118741058655, + "grad_norm": 0.14376877653451503, + "learning_rate": 8.282534874645162e-05, + "loss": 0.6788, + "step": 6356 + }, + { + "epoch": 0.568401287553648, + "grad_norm": 0.11990747540101587, + "learning_rate": 8.279681967097381e-05, + "loss": 0.6071, + "step": 6357 + }, + { + "epoch": 0.5684907010014306, + "grad_norm": 0.13800216888567648, + "learning_rate": 8.276829203830596e-05, + "loss": 0.6352, + "step": 6358 + }, + { + "epoch": 0.5685801144492132, + "grad_norm": 0.13321695203797707, + "learning_rate": 8.27397658508406e-05, + "loss": 0.6151, + "step": 6359 + }, + { + "epoch": 0.5686695278969958, + "grad_norm": 0.13297127849982007, + "learning_rate": 8.271124111097026e-05, + "loss": 0.6278, + "step": 6360 + }, + { + "epoch": 0.5687589413447782, + "grad_norm": 0.1561943769576219, + "learning_rate": 8.268271782108727e-05, + "loss": 0.6948, + "step": 6361 + }, + { + "epoch": 0.5688483547925608, + "grad_norm": 0.14193741890236986, + "learning_rate": 8.265419598358381e-05, + "loss": 0.6686, + "step": 6362 + }, + { + "epoch": 0.5689377682403434, + "grad_norm": 0.1459703933210491, + "learning_rate": 8.262567560085199e-05, + "loss": 0.6177, + "step": 6363 + }, + { + "epoch": 0.5690271816881259, + "grad_norm": 0.137583148104067, + "learning_rate": 8.259715667528377e-05, + "loss": 0.6417, + "step": 6364 + }, + { + "epoch": 0.5691165951359084, + "grad_norm": 0.13791042274712859, + "learning_rate": 8.256863920927099e-05, + "loss": 0.7062, + "step": 6365 + }, + { + "epoch": 0.569206008583691, + "grad_norm": 0.14647139175907947, + "learning_rate": 8.254012320520539e-05, + "loss": 0.6901, + "step": 6366 + }, + { + "epoch": 0.5692954220314735, + "grad_norm": 0.13338038459254248, + "learning_rate": 8.251160866547857e-05, + "loss": 0.6499, + "step": 6367 + }, + { + "epoch": 0.5693848354792561, + "grad_norm": 0.1383253270442963, + "learning_rate": 8.248309559248203e-05, + "loss": 0.6309, + "step": 6368 + }, + { + "epoch": 0.5694742489270386, + "grad_norm": 0.1508968623634846, + "learning_rate": 8.245458398860709e-05, + "loss": 0.6468, + "step": 6369 + }, + { + "epoch": 0.5695636623748211, + "grad_norm": 0.1358679617706846, + "learning_rate": 8.242607385624501e-05, + "loss": 0.6521, + "step": 6370 + }, + { + "epoch": 0.5696530758226037, + "grad_norm": 0.13374250646079253, + "learning_rate": 8.23975651977869e-05, + "loss": 0.6453, + "step": 6371 + }, + { + "epoch": 0.5697424892703863, + "grad_norm": 0.1360284979753717, + "learning_rate": 8.236905801562373e-05, + "loss": 0.6209, + "step": 6372 + }, + { + "epoch": 0.5698319027181689, + "grad_norm": 0.12020564759545163, + "learning_rate": 8.234055231214634e-05, + "loss": 0.6438, + "step": 6373 + }, + { + "epoch": 0.5699213161659513, + "grad_norm": 0.13651698184594105, + "learning_rate": 8.231204808974554e-05, + "loss": 0.6419, + "step": 6374 + }, + { + "epoch": 0.5700107296137339, + "grad_norm": 0.14255152245954542, + "learning_rate": 8.228354535081191e-05, + "loss": 0.6521, + "step": 6375 + }, + { + "epoch": 0.5701001430615165, + "grad_norm": 0.1383739917659407, + "learning_rate": 8.225504409773591e-05, + "loss": 0.652, + "step": 6376 + }, + { + "epoch": 0.570189556509299, + "grad_norm": 0.13403486809274318, + "learning_rate": 8.222654433290795e-05, + "loss": 0.6383, + "step": 6377 + }, + { + "epoch": 0.5702789699570815, + "grad_norm": 0.1354246481800512, + "learning_rate": 8.219804605871826e-05, + "loss": 0.6223, + "step": 6378 + }, + { + "epoch": 0.5703683834048641, + "grad_norm": 0.14411438833671644, + "learning_rate": 8.216954927755692e-05, + "loss": 0.6776, + "step": 6379 + }, + { + "epoch": 0.5704577968526466, + "grad_norm": 0.15012038370645583, + "learning_rate": 8.214105399181393e-05, + "loss": 0.7147, + "step": 6380 + }, + { + "epoch": 0.5705472103004292, + "grad_norm": 0.13599355701279411, + "learning_rate": 8.21125602038792e-05, + "loss": 0.6435, + "step": 6381 + }, + { + "epoch": 0.5706366237482118, + "grad_norm": 0.14293894036630272, + "learning_rate": 8.208406791614247e-05, + "loss": 0.6928, + "step": 6382 + }, + { + "epoch": 0.5707260371959942, + "grad_norm": 0.14183140782736545, + "learning_rate": 8.20555771309933e-05, + "loss": 0.6647, + "step": 6383 + }, + { + "epoch": 0.5708154506437768, + "grad_norm": 0.13694997754606578, + "learning_rate": 8.202708785082121e-05, + "loss": 0.6793, + "step": 6384 + }, + { + "epoch": 0.5709048640915594, + "grad_norm": 0.13470555553747443, + "learning_rate": 8.199860007801557e-05, + "loss": 0.6438, + "step": 6385 + }, + { + "epoch": 0.570994277539342, + "grad_norm": 0.14092936963493263, + "learning_rate": 8.197011381496558e-05, + "loss": 0.6277, + "step": 6386 + }, + { + "epoch": 0.5710836909871244, + "grad_norm": 0.13242102976853437, + "learning_rate": 8.194162906406033e-05, + "loss": 0.6411, + "step": 6387 + }, + { + "epoch": 0.571173104434907, + "grad_norm": 0.14127443273365498, + "learning_rate": 8.191314582768891e-05, + "loss": 0.6443, + "step": 6388 + }, + { + "epoch": 0.5712625178826896, + "grad_norm": 0.13834150463201042, + "learning_rate": 8.18846641082401e-05, + "loss": 0.6746, + "step": 6389 + }, + { + "epoch": 0.5713519313304721, + "grad_norm": 0.11623807518369798, + "learning_rate": 8.18561839081026e-05, + "loss": 0.617, + "step": 6390 + }, + { + "epoch": 0.5714413447782547, + "grad_norm": 0.12476602745747933, + "learning_rate": 8.182770522966507e-05, + "loss": 0.6391, + "step": 6391 + }, + { + "epoch": 0.5715307582260372, + "grad_norm": 0.1419202220619405, + "learning_rate": 8.179922807531594e-05, + "loss": 0.6415, + "step": 6392 + }, + { + "epoch": 0.5716201716738197, + "grad_norm": 0.1266933328882974, + "learning_rate": 8.177075244744358e-05, + "loss": 0.6304, + "step": 6393 + }, + { + "epoch": 0.5717095851216023, + "grad_norm": 0.1262856758826735, + "learning_rate": 8.174227834843617e-05, + "loss": 0.6494, + "step": 6394 + }, + { + "epoch": 0.5717989985693849, + "grad_norm": 0.1275997715014501, + "learning_rate": 8.171380578068185e-05, + "loss": 0.6297, + "step": 6395 + }, + { + "epoch": 0.5718884120171673, + "grad_norm": 0.15889109335101526, + "learning_rate": 8.168533474656855e-05, + "loss": 0.6456, + "step": 6396 + }, + { + "epoch": 0.5719778254649499, + "grad_norm": 0.13241668022674305, + "learning_rate": 8.165686524848411e-05, + "loss": 0.6714, + "step": 6397 + }, + { + "epoch": 0.5720672389127325, + "grad_norm": 0.14291575376989638, + "learning_rate": 8.162839728881625e-05, + "loss": 0.6222, + "step": 6398 + }, + { + "epoch": 0.572156652360515, + "grad_norm": 0.13137912011152242, + "learning_rate": 8.159993086995249e-05, + "loss": 0.6443, + "step": 6399 + }, + { + "epoch": 0.5722460658082976, + "grad_norm": 0.15385876845239452, + "learning_rate": 8.157146599428028e-05, + "loss": 0.6523, + "step": 6400 + }, + { + "epoch": 0.5723354792560801, + "grad_norm": 0.13404810206136095, + "learning_rate": 8.154300266418702e-05, + "loss": 0.6628, + "step": 6401 + }, + { + "epoch": 0.5724248927038627, + "grad_norm": 0.12974755387015705, + "learning_rate": 8.151454088205982e-05, + "loss": 0.6414, + "step": 6402 + }, + { + "epoch": 0.5725143061516452, + "grad_norm": 0.14066715115869624, + "learning_rate": 8.148608065028574e-05, + "loss": 0.6563, + "step": 6403 + }, + { + "epoch": 0.5726037195994278, + "grad_norm": 0.14003620557277266, + "learning_rate": 8.145762197125173e-05, + "loss": 0.6739, + "step": 6404 + }, + { + "epoch": 0.5726931330472103, + "grad_norm": 0.13397793780979875, + "learning_rate": 8.142916484734458e-05, + "loss": 0.6542, + "step": 6405 + }, + { + "epoch": 0.5727825464949928, + "grad_norm": 0.12850072112188052, + "learning_rate": 8.140070928095092e-05, + "loss": 0.6495, + "step": 6406 + }, + { + "epoch": 0.5728719599427754, + "grad_norm": 0.12665351576885978, + "learning_rate": 8.137225527445727e-05, + "loss": 0.673, + "step": 6407 + }, + { + "epoch": 0.572961373390558, + "grad_norm": 0.13766021647443494, + "learning_rate": 8.134380283025014e-05, + "loss": 0.6848, + "step": 6408 + }, + { + "epoch": 0.5730507868383404, + "grad_norm": 0.1296303494576804, + "learning_rate": 8.131535195071574e-05, + "loss": 0.6223, + "step": 6409 + }, + { + "epoch": 0.573140200286123, + "grad_norm": 0.12446988375620412, + "learning_rate": 8.128690263824017e-05, + "loss": 0.6394, + "step": 6410 + }, + { + "epoch": 0.5732296137339056, + "grad_norm": 0.12289007418336953, + "learning_rate": 8.12584548952095e-05, + "loss": 0.6227, + "step": 6411 + }, + { + "epoch": 0.5733190271816881, + "grad_norm": 0.14430542573970898, + "learning_rate": 8.123000872400959e-05, + "loss": 0.6974, + "step": 6412 + }, + { + "epoch": 0.5734084406294707, + "grad_norm": 0.13020421643659116, + "learning_rate": 8.120156412702615e-05, + "loss": 0.6632, + "step": 6413 + }, + { + "epoch": 0.5734978540772532, + "grad_norm": 0.12285313513371308, + "learning_rate": 8.117312110664482e-05, + "loss": 0.6185, + "step": 6414 + }, + { + "epoch": 0.5735872675250357, + "grad_norm": 0.12779280737864485, + "learning_rate": 8.114467966525112e-05, + "loss": 0.6399, + "step": 6415 + }, + { + "epoch": 0.5736766809728183, + "grad_norm": 0.12964563197022153, + "learning_rate": 8.111623980523035e-05, + "loss": 0.6691, + "step": 6416 + }, + { + "epoch": 0.5737660944206009, + "grad_norm": 0.14862828888789417, + "learning_rate": 8.108780152896773e-05, + "loss": 0.6366, + "step": 6417 + }, + { + "epoch": 0.5738555078683834, + "grad_norm": 0.14361861731491854, + "learning_rate": 8.105936483884838e-05, + "loss": 0.6889, + "step": 6418 + }, + { + "epoch": 0.5739449213161659, + "grad_norm": 0.11174857984378034, + "learning_rate": 8.103092973725724e-05, + "loss": 0.6506, + "step": 6419 + }, + { + "epoch": 0.5740343347639485, + "grad_norm": 0.137940741790518, + "learning_rate": 8.100249622657907e-05, + "loss": 0.6203, + "step": 6420 + }, + { + "epoch": 0.5741237482117311, + "grad_norm": 0.13978720456320756, + "learning_rate": 8.097406430919858e-05, + "loss": 0.6652, + "step": 6421 + }, + { + "epoch": 0.5742131616595136, + "grad_norm": 0.13872639649156587, + "learning_rate": 8.094563398750039e-05, + "loss": 0.67, + "step": 6422 + }, + { + "epoch": 0.5743025751072961, + "grad_norm": 0.13352445314999334, + "learning_rate": 8.091720526386886e-05, + "loss": 0.6499, + "step": 6423 + }, + { + "epoch": 0.5743919885550787, + "grad_norm": 0.14075968572906616, + "learning_rate": 8.088877814068827e-05, + "loss": 0.6656, + "step": 6424 + }, + { + "epoch": 0.5744814020028612, + "grad_norm": 0.1445251899978096, + "learning_rate": 8.086035262034278e-05, + "loss": 0.6738, + "step": 6425 + }, + { + "epoch": 0.5745708154506438, + "grad_norm": 0.15501607814944043, + "learning_rate": 8.083192870521638e-05, + "loss": 0.6572, + "step": 6426 + }, + { + "epoch": 0.5746602288984263, + "grad_norm": 0.14215018183857134, + "learning_rate": 8.0803506397693e-05, + "loss": 0.6542, + "step": 6427 + }, + { + "epoch": 0.5747496423462088, + "grad_norm": 0.12549600806461214, + "learning_rate": 8.077508570015632e-05, + "loss": 0.6361, + "step": 6428 + }, + { + "epoch": 0.5748390557939914, + "grad_norm": 0.12123155723841521, + "learning_rate": 8.074666661499002e-05, + "loss": 0.6391, + "step": 6429 + }, + { + "epoch": 0.574928469241774, + "grad_norm": 0.13008244222477902, + "learning_rate": 8.071824914457751e-05, + "loss": 0.659, + "step": 6430 + }, + { + "epoch": 0.5750178826895566, + "grad_norm": 0.15068446418192463, + "learning_rate": 8.068983329130218e-05, + "loss": 0.6185, + "step": 6431 + }, + { + "epoch": 0.575107296137339, + "grad_norm": 0.1344988931353772, + "learning_rate": 8.066141905754723e-05, + "loss": 0.6729, + "step": 6432 + }, + { + "epoch": 0.5751967095851216, + "grad_norm": 0.13704654765175728, + "learning_rate": 8.063300644569567e-05, + "loss": 0.6403, + "step": 6433 + }, + { + "epoch": 0.5752861230329042, + "grad_norm": 0.12950109736130655, + "learning_rate": 8.060459545813049e-05, + "loss": 0.5666, + "step": 6434 + }, + { + "epoch": 0.5753755364806867, + "grad_norm": 0.12217923692593148, + "learning_rate": 8.057618609723443e-05, + "loss": 0.6654, + "step": 6435 + }, + { + "epoch": 0.5754649499284692, + "grad_norm": 0.13405248153257707, + "learning_rate": 8.054777836539022e-05, + "loss": 0.6788, + "step": 6436 + }, + { + "epoch": 0.5755543633762518, + "grad_norm": 0.1443544465144163, + "learning_rate": 8.051937226498034e-05, + "loss": 0.6464, + "step": 6437 + }, + { + "epoch": 0.5756437768240343, + "grad_norm": 0.11221096488589254, + "learning_rate": 8.049096779838719e-05, + "loss": 0.6227, + "step": 6438 + }, + { + "epoch": 0.5757331902718169, + "grad_norm": 0.13767659820559797, + "learning_rate": 8.0462564967993e-05, + "loss": 0.616, + "step": 6439 + }, + { + "epoch": 0.5758226037195995, + "grad_norm": 0.1316460148509155, + "learning_rate": 8.043416377617988e-05, + "loss": 0.6603, + "step": 6440 + }, + { + "epoch": 0.5759120171673819, + "grad_norm": 0.155182119803568, + "learning_rate": 8.040576422532984e-05, + "loss": 0.6206, + "step": 6441 + }, + { + "epoch": 0.5760014306151645, + "grad_norm": 0.14248417751214462, + "learning_rate": 8.037736631782465e-05, + "loss": 0.6604, + "step": 6442 + }, + { + "epoch": 0.5760908440629471, + "grad_norm": 0.12275779693202887, + "learning_rate": 8.034897005604608e-05, + "loss": 0.6076, + "step": 6443 + }, + { + "epoch": 0.5761802575107297, + "grad_norm": 0.14851652417397795, + "learning_rate": 8.032057544237565e-05, + "loss": 0.6838, + "step": 6444 + }, + { + "epoch": 0.5762696709585121, + "grad_norm": 0.14364716564886812, + "learning_rate": 8.02921824791948e-05, + "loss": 0.6809, + "step": 6445 + }, + { + "epoch": 0.5763590844062947, + "grad_norm": 0.1473167929512374, + "learning_rate": 8.026379116888481e-05, + "loss": 0.6424, + "step": 6446 + }, + { + "epoch": 0.5764484978540773, + "grad_norm": 0.14904246508207045, + "learning_rate": 8.02354015138268e-05, + "loss": 0.6605, + "step": 6447 + }, + { + "epoch": 0.5765379113018598, + "grad_norm": 0.12378722327937304, + "learning_rate": 8.020701351640182e-05, + "loss": 0.6372, + "step": 6448 + }, + { + "epoch": 0.5766273247496424, + "grad_norm": 0.13940859016318044, + "learning_rate": 8.017862717899066e-05, + "loss": 0.675, + "step": 6449 + }, + { + "epoch": 0.5767167381974249, + "grad_norm": 0.14833698218280947, + "learning_rate": 8.015024250397415e-05, + "loss": 0.5907, + "step": 6450 + }, + { + "epoch": 0.5768061516452074, + "grad_norm": 0.12613545607467824, + "learning_rate": 8.01218594937328e-05, + "loss": 0.6144, + "step": 6451 + }, + { + "epoch": 0.57689556509299, + "grad_norm": 0.12414489240794298, + "learning_rate": 8.009347815064712e-05, + "loss": 0.654, + "step": 6452 + }, + { + "epoch": 0.5769849785407726, + "grad_norm": 0.1293020623458627, + "learning_rate": 8.006509847709735e-05, + "loss": 0.6792, + "step": 6453 + }, + { + "epoch": 0.577074391988555, + "grad_norm": 0.14706923724488205, + "learning_rate": 8.003672047546373e-05, + "loss": 0.6604, + "step": 6454 + }, + { + "epoch": 0.5771638054363376, + "grad_norm": 0.14268008741838076, + "learning_rate": 8.000834414812625e-05, + "loss": 0.645, + "step": 6455 + }, + { + "epoch": 0.5772532188841202, + "grad_norm": 0.1197505616950535, + "learning_rate": 7.997996949746477e-05, + "loss": 0.656, + "step": 6456 + }, + { + "epoch": 0.5773426323319027, + "grad_norm": 0.12022119802669175, + "learning_rate": 7.995159652585908e-05, + "loss": 0.6277, + "step": 6457 + }, + { + "epoch": 0.5774320457796852, + "grad_norm": 0.13719611402112977, + "learning_rate": 7.99232252356888e-05, + "loss": 0.6534, + "step": 6458 + }, + { + "epoch": 0.5775214592274678, + "grad_norm": 0.14057017188140428, + "learning_rate": 7.989485562933338e-05, + "loss": 0.6485, + "step": 6459 + }, + { + "epoch": 0.5776108726752504, + "grad_norm": 0.13269268707007398, + "learning_rate": 7.98664877091721e-05, + "loss": 0.6198, + "step": 6460 + }, + { + "epoch": 0.5777002861230329, + "grad_norm": 0.12275154613515299, + "learning_rate": 7.983812147758422e-05, + "loss": 0.6469, + "step": 6461 + }, + { + "epoch": 0.5777896995708155, + "grad_norm": 0.1317661685227848, + "learning_rate": 7.980975693694872e-05, + "loss": 0.6349, + "step": 6462 + }, + { + "epoch": 0.577879113018598, + "grad_norm": 0.14277905537470123, + "learning_rate": 7.97813940896445e-05, + "loss": 0.6635, + "step": 6463 + }, + { + "epoch": 0.5779685264663805, + "grad_norm": 0.14688913097610998, + "learning_rate": 7.975303293805035e-05, + "loss": 0.6689, + "step": 6464 + }, + { + "epoch": 0.5780579399141631, + "grad_norm": 0.147430923784116, + "learning_rate": 7.97246734845449e-05, + "loss": 0.6397, + "step": 6465 + }, + { + "epoch": 0.5781473533619457, + "grad_norm": 0.1412175553784543, + "learning_rate": 7.96963157315066e-05, + "loss": 0.703, + "step": 6466 + }, + { + "epoch": 0.5782367668097281, + "grad_norm": 0.12773818706368723, + "learning_rate": 7.966795968131377e-05, + "loss": 0.6523, + "step": 6467 + }, + { + "epoch": 0.5783261802575107, + "grad_norm": 0.13398521251499182, + "learning_rate": 7.963960533634461e-05, + "loss": 0.6571, + "step": 6468 + }, + { + "epoch": 0.5784155937052933, + "grad_norm": 0.1449805497931219, + "learning_rate": 7.961125269897716e-05, + "loss": 0.6134, + "step": 6469 + }, + { + "epoch": 0.5785050071530758, + "grad_norm": 0.1323171220814179, + "learning_rate": 7.95829017715893e-05, + "loss": 0.6153, + "step": 6470 + }, + { + "epoch": 0.5785944206008584, + "grad_norm": 0.1258197110749595, + "learning_rate": 7.955455255655881e-05, + "loss": 0.6293, + "step": 6471 + }, + { + "epoch": 0.5786838340486409, + "grad_norm": 0.12284514801831894, + "learning_rate": 7.952620505626333e-05, + "loss": 0.63, + "step": 6472 + }, + { + "epoch": 0.5787732474964234, + "grad_norm": 0.13644340485114007, + "learning_rate": 7.949785927308032e-05, + "loss": 0.6604, + "step": 6473 + }, + { + "epoch": 0.578862660944206, + "grad_norm": 0.12730393084263553, + "learning_rate": 7.946951520938706e-05, + "loss": 0.6544, + "step": 6474 + }, + { + "epoch": 0.5789520743919886, + "grad_norm": 0.1210464257897423, + "learning_rate": 7.944117286756079e-05, + "loss": 0.601, + "step": 6475 + }, + { + "epoch": 0.579041487839771, + "grad_norm": 0.11484169080637148, + "learning_rate": 7.94128322499785e-05, + "loss": 0.6456, + "step": 6476 + }, + { + "epoch": 0.5791309012875536, + "grad_norm": 0.12407181491957384, + "learning_rate": 7.93844933590171e-05, + "loss": 0.6247, + "step": 6477 + }, + { + "epoch": 0.5792203147353362, + "grad_norm": 0.15012148823421606, + "learning_rate": 7.935615619705334e-05, + "loss": 0.6517, + "step": 6478 + }, + { + "epoch": 0.5793097281831188, + "grad_norm": 0.1425169652319623, + "learning_rate": 7.932782076646386e-05, + "loss": 0.647, + "step": 6479 + }, + { + "epoch": 0.5793991416309013, + "grad_norm": 0.13826976724922568, + "learning_rate": 7.929948706962508e-05, + "loss": 0.6526, + "step": 6480 + }, + { + "epoch": 0.5794885550786838, + "grad_norm": 0.15095152414253507, + "learning_rate": 7.927115510891332e-05, + "loss": 0.6326, + "step": 6481 + }, + { + "epoch": 0.5795779685264664, + "grad_norm": 0.13828168390515574, + "learning_rate": 7.924282488670476e-05, + "loss": 0.6541, + "step": 6482 + }, + { + "epoch": 0.5796673819742489, + "grad_norm": 0.13069650685767448, + "learning_rate": 7.921449640537535e-05, + "loss": 0.5594, + "step": 6483 + }, + { + "epoch": 0.5797567954220315, + "grad_norm": 0.12168931374836714, + "learning_rate": 7.918616966730108e-05, + "loss": 0.6354, + "step": 6484 + }, + { + "epoch": 0.579846208869814, + "grad_norm": 0.15292985308426724, + "learning_rate": 7.91578446748576e-05, + "loss": 0.6386, + "step": 6485 + }, + { + "epoch": 0.5799356223175965, + "grad_norm": 0.14575881408999694, + "learning_rate": 7.912952143042052e-05, + "loss": 0.6745, + "step": 6486 + }, + { + "epoch": 0.5800250357653791, + "grad_norm": 0.12954171826798536, + "learning_rate": 7.910119993636528e-05, + "loss": 0.6472, + "step": 6487 + }, + { + "epoch": 0.5801144492131617, + "grad_norm": 0.15013011487650427, + "learning_rate": 7.907288019506717e-05, + "loss": 0.6995, + "step": 6488 + }, + { + "epoch": 0.5802038626609443, + "grad_norm": 0.1382253250907903, + "learning_rate": 7.904456220890132e-05, + "loss": 0.67, + "step": 6489 + }, + { + "epoch": 0.5802932761087267, + "grad_norm": 0.13238127416470813, + "learning_rate": 7.901624598024269e-05, + "loss": 0.6405, + "step": 6490 + }, + { + "epoch": 0.5803826895565093, + "grad_norm": 0.12583464117577065, + "learning_rate": 7.89879315114662e-05, + "loss": 0.6434, + "step": 6491 + }, + { + "epoch": 0.5804721030042919, + "grad_norm": 0.14379756668626867, + "learning_rate": 7.895961880494652e-05, + "loss": 0.6729, + "step": 6492 + }, + { + "epoch": 0.5805615164520744, + "grad_norm": 0.13590545053664022, + "learning_rate": 7.893130786305821e-05, + "loss": 0.6442, + "step": 6493 + }, + { + "epoch": 0.5806509298998569, + "grad_norm": 0.13305435435497548, + "learning_rate": 7.890299868817564e-05, + "loss": 0.6886, + "step": 6494 + }, + { + "epoch": 0.5807403433476395, + "grad_norm": 0.1213650936049263, + "learning_rate": 7.887469128267312e-05, + "loss": 0.6432, + "step": 6495 + }, + { + "epoch": 0.580829756795422, + "grad_norm": 0.11397232657278865, + "learning_rate": 7.884638564892472e-05, + "loss": 0.609, + "step": 6496 + }, + { + "epoch": 0.5809191702432046, + "grad_norm": 0.11596888988850225, + "learning_rate": 7.881808178930438e-05, + "loss": 0.6244, + "step": 6497 + }, + { + "epoch": 0.5810085836909872, + "grad_norm": 0.15336450338331897, + "learning_rate": 7.878977970618595e-05, + "loss": 0.6703, + "step": 6498 + }, + { + "epoch": 0.5810979971387696, + "grad_norm": 0.1226812593245226, + "learning_rate": 7.876147940194311e-05, + "loss": 0.6527, + "step": 6499 + }, + { + "epoch": 0.5811874105865522, + "grad_norm": 0.14264985162594362, + "learning_rate": 7.873318087894933e-05, + "loss": 0.6573, + "step": 6500 + }, + { + "epoch": 0.5812768240343348, + "grad_norm": 0.1301713452344042, + "learning_rate": 7.870488413957797e-05, + "loss": 0.6128, + "step": 6501 + }, + { + "epoch": 0.5813662374821174, + "grad_norm": 0.13189416169104257, + "learning_rate": 7.867658918620229e-05, + "loss": 0.6044, + "step": 6502 + }, + { + "epoch": 0.5814556509298998, + "grad_norm": 0.14691543785053499, + "learning_rate": 7.86482960211953e-05, + "loss": 0.6622, + "step": 6503 + }, + { + "epoch": 0.5815450643776824, + "grad_norm": 0.15564397574339828, + "learning_rate": 7.862000464692991e-05, + "loss": 0.6377, + "step": 6504 + }, + { + "epoch": 0.581634477825465, + "grad_norm": 0.12575608955553197, + "learning_rate": 7.859171506577893e-05, + "loss": 0.6354, + "step": 6505 + }, + { + "epoch": 0.5817238912732475, + "grad_norm": 0.13868314242374466, + "learning_rate": 7.856342728011498e-05, + "loss": 0.6406, + "step": 6506 + }, + { + "epoch": 0.58181330472103, + "grad_norm": 0.1462785129286169, + "learning_rate": 7.853514129231049e-05, + "loss": 0.601, + "step": 6507 + }, + { + "epoch": 0.5819027181688126, + "grad_norm": 0.13925380320135106, + "learning_rate": 7.850685710473775e-05, + "loss": 0.7076, + "step": 6508 + }, + { + "epoch": 0.5819921316165951, + "grad_norm": 0.14135747915815655, + "learning_rate": 7.847857471976897e-05, + "loss": 0.6093, + "step": 6509 + }, + { + "epoch": 0.5820815450643777, + "grad_norm": 0.1241507734105099, + "learning_rate": 7.845029413977613e-05, + "loss": 0.6619, + "step": 6510 + }, + { + "epoch": 0.5821709585121603, + "grad_norm": 0.13130946524024661, + "learning_rate": 7.842201536713107e-05, + "loss": 0.6555, + "step": 6511 + }, + { + "epoch": 0.5822603719599427, + "grad_norm": 0.13995781883769468, + "learning_rate": 7.839373840420554e-05, + "loss": 0.6375, + "step": 6512 + }, + { + "epoch": 0.5823497854077253, + "grad_norm": 0.12320900120413118, + "learning_rate": 7.83654632533711e-05, + "loss": 0.6308, + "step": 6513 + }, + { + "epoch": 0.5824391988555079, + "grad_norm": 0.12439193895142214, + "learning_rate": 7.83371899169991e-05, + "loss": 0.6432, + "step": 6514 + }, + { + "epoch": 0.5825286123032904, + "grad_norm": 0.13250086998963284, + "learning_rate": 7.830891839746083e-05, + "loss": 0.6429, + "step": 6515 + }, + { + "epoch": 0.5826180257510729, + "grad_norm": 0.13902446092191256, + "learning_rate": 7.828064869712739e-05, + "loss": 0.6206, + "step": 6516 + }, + { + "epoch": 0.5827074391988555, + "grad_norm": 0.13402968735954462, + "learning_rate": 7.82523808183697e-05, + "loss": 0.6823, + "step": 6517 + }, + { + "epoch": 0.582796852646638, + "grad_norm": 0.12667482318473797, + "learning_rate": 7.822411476355854e-05, + "loss": 0.6723, + "step": 6518 + }, + { + "epoch": 0.5828862660944206, + "grad_norm": 0.13987486825090545, + "learning_rate": 7.819585053506461e-05, + "loss": 0.6422, + "step": 6519 + }, + { + "epoch": 0.5829756795422032, + "grad_norm": 0.13241528448173542, + "learning_rate": 7.816758813525836e-05, + "loss": 0.6368, + "step": 6520 + }, + { + "epoch": 0.5830650929899857, + "grad_norm": 0.15121349929689465, + "learning_rate": 7.813932756651012e-05, + "loss": 0.6909, + "step": 6521 + }, + { + "epoch": 0.5831545064377682, + "grad_norm": 0.14086156070841244, + "learning_rate": 7.811106883119008e-05, + "loss": 0.6755, + "step": 6522 + }, + { + "epoch": 0.5832439198855508, + "grad_norm": 0.13250771920361082, + "learning_rate": 7.808281193166829e-05, + "loss": 0.6711, + "step": 6523 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.13657647797353417, + "learning_rate": 7.805455687031455e-05, + "loss": 0.6334, + "step": 6524 + }, + { + "epoch": 0.5834227467811158, + "grad_norm": 0.12674493663560257, + "learning_rate": 7.80263036494986e-05, + "loss": 0.6201, + "step": 6525 + }, + { + "epoch": 0.5835121602288984, + "grad_norm": 0.14024037204220932, + "learning_rate": 7.799805227159007e-05, + "loss": 0.5881, + "step": 6526 + }, + { + "epoch": 0.583601573676681, + "grad_norm": 0.1236540555198191, + "learning_rate": 7.796980273895833e-05, + "loss": 0.6113, + "step": 6527 + }, + { + "epoch": 0.5836909871244635, + "grad_norm": 0.12763526554280338, + "learning_rate": 7.794155505397261e-05, + "loss": 0.6493, + "step": 6528 + }, + { + "epoch": 0.5837804005722461, + "grad_norm": 0.12542020136594903, + "learning_rate": 7.791330921900205e-05, + "loss": 0.652, + "step": 6529 + }, + { + "epoch": 0.5838698140200286, + "grad_norm": 0.10899982315488752, + "learning_rate": 7.788506523641556e-05, + "loss": 0.6387, + "step": 6530 + }, + { + "epoch": 0.5839592274678111, + "grad_norm": 0.12034858332414634, + "learning_rate": 7.785682310858193e-05, + "loss": 0.643, + "step": 6531 + }, + { + "epoch": 0.5840486409155937, + "grad_norm": 0.14478664328306176, + "learning_rate": 7.782858283786976e-05, + "loss": 0.6357, + "step": 6532 + }, + { + "epoch": 0.5841380543633763, + "grad_norm": 0.13813537330800874, + "learning_rate": 7.780034442664764e-05, + "loss": 0.6311, + "step": 6533 + }, + { + "epoch": 0.5842274678111588, + "grad_norm": 0.1541889545622834, + "learning_rate": 7.777210787728382e-05, + "loss": 0.6478, + "step": 6534 + }, + { + "epoch": 0.5843168812589413, + "grad_norm": 0.18901259483488525, + "learning_rate": 7.774387319214643e-05, + "loss": 0.6212, + "step": 6535 + }, + { + "epoch": 0.5844062947067239, + "grad_norm": 0.14190324746847513, + "learning_rate": 7.771564037360355e-05, + "loss": 0.6573, + "step": 6536 + }, + { + "epoch": 0.5844957081545065, + "grad_norm": 0.1295584326896787, + "learning_rate": 7.768740942402301e-05, + "loss": 0.6492, + "step": 6537 + }, + { + "epoch": 0.584585121602289, + "grad_norm": 0.12930989007034474, + "learning_rate": 7.765918034577245e-05, + "loss": 0.7117, + "step": 6538 + }, + { + "epoch": 0.5846745350500715, + "grad_norm": 0.1359941703904831, + "learning_rate": 7.763095314121945e-05, + "loss": 0.6711, + "step": 6539 + }, + { + "epoch": 0.5847639484978541, + "grad_norm": 0.12689574418179397, + "learning_rate": 7.760272781273142e-05, + "loss": 0.6582, + "step": 6540 + }, + { + "epoch": 0.5848533619456366, + "grad_norm": 0.14223459594807478, + "learning_rate": 7.757450436267558e-05, + "loss": 0.6765, + "step": 6541 + }, + { + "epoch": 0.5849427753934192, + "grad_norm": 0.13407432977049563, + "learning_rate": 7.754628279341895e-05, + "loss": 0.6451, + "step": 6542 + }, + { + "epoch": 0.5850321888412017, + "grad_norm": 0.14068554350554272, + "learning_rate": 7.751806310732847e-05, + "loss": 0.653, + "step": 6543 + }, + { + "epoch": 0.5851216022889842, + "grad_norm": 0.13038818340622702, + "learning_rate": 7.748984530677089e-05, + "loss": 0.6422, + "step": 6544 + }, + { + "epoch": 0.5852110157367668, + "grad_norm": 0.12603808727753063, + "learning_rate": 7.746162939411279e-05, + "loss": 0.6342, + "step": 6545 + }, + { + "epoch": 0.5853004291845494, + "grad_norm": 0.13168790434304828, + "learning_rate": 7.74334153717206e-05, + "loss": 0.6187, + "step": 6546 + }, + { + "epoch": 0.585389842632332, + "grad_norm": 0.12273567897828885, + "learning_rate": 7.740520324196064e-05, + "loss": 0.6707, + "step": 6547 + }, + { + "epoch": 0.5854792560801144, + "grad_norm": 0.1304818230848891, + "learning_rate": 7.737699300719896e-05, + "loss": 0.6301, + "step": 6548 + }, + { + "epoch": 0.585568669527897, + "grad_norm": 0.12803156346475045, + "learning_rate": 7.734878466980159e-05, + "loss": 0.6751, + "step": 6549 + }, + { + "epoch": 0.5856580829756796, + "grad_norm": 0.1327547690389487, + "learning_rate": 7.73205782321343e-05, + "loss": 0.6532, + "step": 6550 + }, + { + "epoch": 0.5857474964234621, + "grad_norm": 0.1384006910354231, + "learning_rate": 7.729237369656269e-05, + "loss": 0.704, + "step": 6551 + }, + { + "epoch": 0.5858369098712446, + "grad_norm": 0.13019235385660524, + "learning_rate": 7.72641710654523e-05, + "loss": 0.6389, + "step": 6552 + }, + { + "epoch": 0.5859263233190272, + "grad_norm": 0.12686673045291366, + "learning_rate": 7.723597034116838e-05, + "loss": 0.6652, + "step": 6553 + }, + { + "epoch": 0.5860157367668097, + "grad_norm": 0.13280857044276836, + "learning_rate": 7.720777152607619e-05, + "loss": 0.688, + "step": 6554 + }, + { + "epoch": 0.5861051502145923, + "grad_norm": 0.13628464827276102, + "learning_rate": 7.717957462254065e-05, + "loss": 0.6398, + "step": 6555 + }, + { + "epoch": 0.5861945636623748, + "grad_norm": 0.12587134616101733, + "learning_rate": 7.715137963292665e-05, + "loss": 0.6389, + "step": 6556 + }, + { + "epoch": 0.5862839771101573, + "grad_norm": 0.15307243173945115, + "learning_rate": 7.712318655959884e-05, + "loss": 0.6719, + "step": 6557 + }, + { + "epoch": 0.5863733905579399, + "grad_norm": 0.13115722950244668, + "learning_rate": 7.709499540492171e-05, + "loss": 0.6573, + "step": 6558 + }, + { + "epoch": 0.5864628040057225, + "grad_norm": 0.14288454852651172, + "learning_rate": 7.70668061712597e-05, + "loss": 0.6228, + "step": 6559 + }, + { + "epoch": 0.586552217453505, + "grad_norm": 0.1420934555559685, + "learning_rate": 7.70386188609769e-05, + "loss": 0.6853, + "step": 6560 + }, + { + "epoch": 0.5866416309012875, + "grad_norm": 0.14446126866366316, + "learning_rate": 7.701043347643747e-05, + "loss": 0.6109, + "step": 6561 + }, + { + "epoch": 0.5867310443490701, + "grad_norm": 0.1309420122454243, + "learning_rate": 7.698225002000516e-05, + "loss": 0.6773, + "step": 6562 + }, + { + "epoch": 0.5868204577968527, + "grad_norm": 0.13497863586354958, + "learning_rate": 7.695406849404379e-05, + "loss": 0.6509, + "step": 6563 + }, + { + "epoch": 0.5869098712446352, + "grad_norm": 0.13777610856855588, + "learning_rate": 7.692588890091686e-05, + "loss": 0.6386, + "step": 6564 + }, + { + "epoch": 0.5869992846924177, + "grad_norm": 0.11210519464952531, + "learning_rate": 7.689771124298774e-05, + "loss": 0.6228, + "step": 6565 + }, + { + "epoch": 0.5870886981402003, + "grad_norm": 0.1207196812756651, + "learning_rate": 7.686953552261966e-05, + "loss": 0.6379, + "step": 6566 + }, + { + "epoch": 0.5871781115879828, + "grad_norm": 0.1334794870155124, + "learning_rate": 7.684136174217574e-05, + "loss": 0.6494, + "step": 6567 + }, + { + "epoch": 0.5872675250357654, + "grad_norm": 0.13516733425421065, + "learning_rate": 7.681318990401885e-05, + "loss": 0.6665, + "step": 6568 + }, + { + "epoch": 0.587356938483548, + "grad_norm": 0.1370494253896243, + "learning_rate": 7.678502001051168e-05, + "loss": 0.6496, + "step": 6569 + }, + { + "epoch": 0.5874463519313304, + "grad_norm": 0.12675104701021672, + "learning_rate": 7.675685206401689e-05, + "loss": 0.5995, + "step": 6570 + }, + { + "epoch": 0.587535765379113, + "grad_norm": 0.13441635239958144, + "learning_rate": 7.67286860668968e-05, + "loss": 0.6518, + "step": 6571 + }, + { + "epoch": 0.5876251788268956, + "grad_norm": 0.1398637635901542, + "learning_rate": 7.670052202151374e-05, + "loss": 0.649, + "step": 6572 + }, + { + "epoch": 0.5877145922746781, + "grad_norm": 0.14244032884899888, + "learning_rate": 7.667235993022972e-05, + "loss": 0.6849, + "step": 6573 + }, + { + "epoch": 0.5878040057224606, + "grad_norm": 0.12691336779079063, + "learning_rate": 7.664419979540673e-05, + "loss": 0.6335, + "step": 6574 + }, + { + "epoch": 0.5878934191702432, + "grad_norm": 0.1553006142210313, + "learning_rate": 7.66160416194065e-05, + "loss": 0.6582, + "step": 6575 + }, + { + "epoch": 0.5879828326180258, + "grad_norm": 0.14749842906534408, + "learning_rate": 7.658788540459062e-05, + "loss": 0.6697, + "step": 6576 + }, + { + "epoch": 0.5880722460658083, + "grad_norm": 0.1347369615940866, + "learning_rate": 7.655973115332052e-05, + "loss": 0.6084, + "step": 6577 + }, + { + "epoch": 0.5881616595135909, + "grad_norm": 0.1426084518641073, + "learning_rate": 7.653157886795744e-05, + "loss": 0.6667, + "step": 6578 + }, + { + "epoch": 0.5882510729613734, + "grad_norm": 0.12556610964840292, + "learning_rate": 7.65034285508625e-05, + "loss": 0.6332, + "step": 6579 + }, + { + "epoch": 0.5883404864091559, + "grad_norm": 0.14151999706349908, + "learning_rate": 7.647528020439662e-05, + "loss": 0.6417, + "step": 6580 + }, + { + "epoch": 0.5884298998569385, + "grad_norm": 0.1201056370572102, + "learning_rate": 7.64471338309206e-05, + "loss": 0.6448, + "step": 6581 + }, + { + "epoch": 0.5885193133047211, + "grad_norm": 0.14034079755547982, + "learning_rate": 7.641898943279501e-05, + "loss": 0.6819, + "step": 6582 + }, + { + "epoch": 0.5886087267525035, + "grad_norm": 0.12789072043388694, + "learning_rate": 7.639084701238032e-05, + "loss": 0.638, + "step": 6583 + }, + { + "epoch": 0.5886981402002861, + "grad_norm": 0.13911825970703806, + "learning_rate": 7.636270657203677e-05, + "loss": 0.6454, + "step": 6584 + }, + { + "epoch": 0.5887875536480687, + "grad_norm": 0.1275988625497568, + "learning_rate": 7.633456811412446e-05, + "loss": 0.6259, + "step": 6585 + }, + { + "epoch": 0.5888769670958512, + "grad_norm": 0.12894200569827535, + "learning_rate": 7.630643164100335e-05, + "loss": 0.6283, + "step": 6586 + }, + { + "epoch": 0.5889663805436338, + "grad_norm": 0.13426570044299502, + "learning_rate": 7.627829715503317e-05, + "loss": 0.6705, + "step": 6587 + }, + { + "epoch": 0.5890557939914163, + "grad_norm": 0.13320873759463236, + "learning_rate": 7.625016465857361e-05, + "loss": 0.6826, + "step": 6588 + }, + { + "epoch": 0.5891452074391988, + "grad_norm": 0.12515923026773806, + "learning_rate": 7.622203415398402e-05, + "loss": 0.654, + "step": 6589 + }, + { + "epoch": 0.5892346208869814, + "grad_norm": 0.15189080544730016, + "learning_rate": 7.619390564362374e-05, + "loss": 0.6896, + "step": 6590 + }, + { + "epoch": 0.589324034334764, + "grad_norm": 0.12693752877238304, + "learning_rate": 7.616577912985185e-05, + "loss": 0.5476, + "step": 6591 + }, + { + "epoch": 0.5894134477825465, + "grad_norm": 0.13967914080603988, + "learning_rate": 7.613765461502724e-05, + "loss": 0.6748, + "step": 6592 + }, + { + "epoch": 0.589502861230329, + "grad_norm": 0.13085344474771574, + "learning_rate": 7.610953210150875e-05, + "loss": 0.6431, + "step": 6593 + }, + { + "epoch": 0.5895922746781116, + "grad_norm": 0.13210109016279248, + "learning_rate": 7.608141159165492e-05, + "loss": 0.6748, + "step": 6594 + }, + { + "epoch": 0.5896816881258942, + "grad_norm": 0.1451738050675427, + "learning_rate": 7.605329308782423e-05, + "loss": 0.6715, + "step": 6595 + }, + { + "epoch": 0.5897711015736766, + "grad_norm": 0.12704417916891964, + "learning_rate": 7.602517659237492e-05, + "loss": 0.5576, + "step": 6596 + }, + { + "epoch": 0.5898605150214592, + "grad_norm": 0.13115428035861665, + "learning_rate": 7.599706210766513e-05, + "loss": 0.6297, + "step": 6597 + }, + { + "epoch": 0.5899499284692418, + "grad_norm": 0.14035478236267934, + "learning_rate": 7.596894963605274e-05, + "loss": 0.6501, + "step": 6598 + }, + { + "epoch": 0.5900393419170243, + "grad_norm": 0.12419659556190969, + "learning_rate": 7.594083917989549e-05, + "loss": 0.6312, + "step": 6599 + }, + { + "epoch": 0.5901287553648069, + "grad_norm": 0.11333401518901162, + "learning_rate": 7.591273074155104e-05, + "loss": 0.6588, + "step": 6600 + }, + { + "epoch": 0.5902181688125894, + "grad_norm": 0.11760765816321997, + "learning_rate": 7.588462432337672e-05, + "loss": 0.6568, + "step": 6601 + }, + { + "epoch": 0.5903075822603719, + "grad_norm": 0.13993598996663612, + "learning_rate": 7.585651992772988e-05, + "loss": 0.6459, + "step": 6602 + }, + { + "epoch": 0.5903969957081545, + "grad_norm": 0.1394814993481904, + "learning_rate": 7.582841755696754e-05, + "loss": 0.5755, + "step": 6603 + }, + { + "epoch": 0.5904864091559371, + "grad_norm": 0.1315341996918284, + "learning_rate": 7.580031721344663e-05, + "loss": 0.6279, + "step": 6604 + }, + { + "epoch": 0.5905758226037195, + "grad_norm": 0.12981374664950796, + "learning_rate": 7.577221889952389e-05, + "loss": 0.6214, + "step": 6605 + }, + { + "epoch": 0.5906652360515021, + "grad_norm": 0.12640198847723083, + "learning_rate": 7.57441226175559e-05, + "loss": 0.6573, + "step": 6606 + }, + { + "epoch": 0.5907546494992847, + "grad_norm": 0.12960965562577037, + "learning_rate": 7.571602836989906e-05, + "loss": 0.6419, + "step": 6607 + }, + { + "epoch": 0.5908440629470673, + "grad_norm": 0.1343730043850246, + "learning_rate": 7.568793615890954e-05, + "loss": 0.6141, + "step": 6608 + }, + { + "epoch": 0.5909334763948498, + "grad_norm": 0.13239245162312194, + "learning_rate": 7.565984598694349e-05, + "loss": 0.6534, + "step": 6609 + }, + { + "epoch": 0.5910228898426323, + "grad_norm": 0.13249442657170724, + "learning_rate": 7.563175785635678e-05, + "loss": 0.659, + "step": 6610 + }, + { + "epoch": 0.5911123032904149, + "grad_norm": 0.17614126501383007, + "learning_rate": 7.56036717695051e-05, + "loss": 0.6876, + "step": 6611 + }, + { + "epoch": 0.5912017167381974, + "grad_norm": 0.13690734611152086, + "learning_rate": 7.557558772874398e-05, + "loss": 0.6318, + "step": 6612 + }, + { + "epoch": 0.59129113018598, + "grad_norm": 0.13388464661276728, + "learning_rate": 7.554750573642886e-05, + "loss": 0.6775, + "step": 6613 + }, + { + "epoch": 0.5913805436337625, + "grad_norm": 0.1142522779250013, + "learning_rate": 7.551942579491489e-05, + "loss": 0.6643, + "step": 6614 + }, + { + "epoch": 0.591469957081545, + "grad_norm": 0.13600875510572868, + "learning_rate": 7.549134790655708e-05, + "loss": 0.6372, + "step": 6615 + }, + { + "epoch": 0.5915593705293276, + "grad_norm": 0.13163367599000572, + "learning_rate": 7.546327207371033e-05, + "loss": 0.6145, + "step": 6616 + }, + { + "epoch": 0.5916487839771102, + "grad_norm": 0.1348412480484771, + "learning_rate": 7.543519829872934e-05, + "loss": 0.6276, + "step": 6617 + }, + { + "epoch": 0.5917381974248928, + "grad_norm": 0.12644242264529348, + "learning_rate": 7.54071265839686e-05, + "loss": 0.6327, + "step": 6618 + }, + { + "epoch": 0.5918276108726752, + "grad_norm": 0.13736057801819654, + "learning_rate": 7.537905693178245e-05, + "loss": 0.6557, + "step": 6619 + }, + { + "epoch": 0.5919170243204578, + "grad_norm": 0.16263894800528936, + "learning_rate": 7.535098934452508e-05, + "loss": 0.6977, + "step": 6620 + }, + { + "epoch": 0.5920064377682404, + "grad_norm": 0.13004822971681437, + "learning_rate": 7.532292382455044e-05, + "loss": 0.6651, + "step": 6621 + }, + { + "epoch": 0.5920958512160229, + "grad_norm": 0.12900802301155306, + "learning_rate": 7.529486037421235e-05, + "loss": 0.637, + "step": 6622 + }, + { + "epoch": 0.5921852646638054, + "grad_norm": 0.11625507854542683, + "learning_rate": 7.52667989958645e-05, + "loss": 0.6396, + "step": 6623 + }, + { + "epoch": 0.592274678111588, + "grad_norm": 0.13887138391247666, + "learning_rate": 7.523873969186039e-05, + "loss": 0.6359, + "step": 6624 + }, + { + "epoch": 0.5923640915593705, + "grad_norm": 0.1301600568875736, + "learning_rate": 7.521068246455325e-05, + "loss": 0.6398, + "step": 6625 + }, + { + "epoch": 0.5924535050071531, + "grad_norm": 0.14520056060468184, + "learning_rate": 7.518262731629623e-05, + "loss": 0.6753, + "step": 6626 + }, + { + "epoch": 0.5925429184549357, + "grad_norm": 0.1335523684184755, + "learning_rate": 7.51545742494423e-05, + "loss": 0.5822, + "step": 6627 + }, + { + "epoch": 0.5926323319027181, + "grad_norm": 0.13834107564353595, + "learning_rate": 7.512652326634421e-05, + "loss": 0.6722, + "step": 6628 + }, + { + "epoch": 0.5927217453505007, + "grad_norm": 0.1255887094048903, + "learning_rate": 7.509847436935455e-05, + "loss": 0.6435, + "step": 6629 + }, + { + "epoch": 0.5928111587982833, + "grad_norm": 0.1238115975609334, + "learning_rate": 7.50704275608258e-05, + "loss": 0.6207, + "step": 6630 + }, + { + "epoch": 0.5929005722460658, + "grad_norm": 0.14184366275914723, + "learning_rate": 7.504238284311019e-05, + "loss": 0.6653, + "step": 6631 + }, + { + "epoch": 0.5929899856938483, + "grad_norm": 0.13642295443112898, + "learning_rate": 7.501434021855977e-05, + "loss": 0.6887, + "step": 6632 + }, + { + "epoch": 0.5930793991416309, + "grad_norm": 0.13485103751636135, + "learning_rate": 7.498629968952648e-05, + "loss": 0.639, + "step": 6633 + }, + { + "epoch": 0.5931688125894135, + "grad_norm": 0.1491573970310068, + "learning_rate": 7.495826125836203e-05, + "loss": 0.6464, + "step": 6634 + }, + { + "epoch": 0.593258226037196, + "grad_norm": 0.11750045857944395, + "learning_rate": 7.493022492741795e-05, + "loss": 0.5912, + "step": 6635 + }, + { + "epoch": 0.5933476394849786, + "grad_norm": 0.14014963487819915, + "learning_rate": 7.49021906990456e-05, + "loss": 0.6775, + "step": 6636 + }, + { + "epoch": 0.593437052932761, + "grad_norm": 0.10704895154611531, + "learning_rate": 7.487415857559625e-05, + "loss": 0.6426, + "step": 6637 + }, + { + "epoch": 0.5935264663805436, + "grad_norm": 0.1302767500690071, + "learning_rate": 7.484612855942088e-05, + "loss": 0.6651, + "step": 6638 + }, + { + "epoch": 0.5936158798283262, + "grad_norm": 0.13447943120635822, + "learning_rate": 7.481810065287029e-05, + "loss": 0.6122, + "step": 6639 + }, + { + "epoch": 0.5937052932761088, + "grad_norm": 0.14105487408114595, + "learning_rate": 7.479007485829523e-05, + "loss": 0.6064, + "step": 6640 + }, + { + "epoch": 0.5937947067238912, + "grad_norm": 0.11724204903310854, + "learning_rate": 7.476205117804614e-05, + "loss": 0.6661, + "step": 6641 + }, + { + "epoch": 0.5938841201716738, + "grad_norm": 0.1407585939427979, + "learning_rate": 7.47340296144733e-05, + "loss": 0.6467, + "step": 6642 + }, + { + "epoch": 0.5939735336194564, + "grad_norm": 0.1340757246580791, + "learning_rate": 7.470601016992687e-05, + "loss": 0.6343, + "step": 6643 + }, + { + "epoch": 0.594062947067239, + "grad_norm": 0.156118552985027, + "learning_rate": 7.467799284675687e-05, + "loss": 0.6517, + "step": 6644 + }, + { + "epoch": 0.5941523605150214, + "grad_norm": 0.14935463737191432, + "learning_rate": 7.464997764731304e-05, + "loss": 0.64, + "step": 6645 + }, + { + "epoch": 0.594241773962804, + "grad_norm": 0.11417047824101477, + "learning_rate": 7.462196457394493e-05, + "loss": 0.6637, + "step": 6646 + }, + { + "epoch": 0.5943311874105865, + "grad_norm": 0.11706978898006369, + "learning_rate": 7.459395362900201e-05, + "loss": 0.6334, + "step": 6647 + }, + { + "epoch": 0.5944206008583691, + "grad_norm": 0.14373537307705136, + "learning_rate": 7.456594481483355e-05, + "loss": 0.6426, + "step": 6648 + }, + { + "epoch": 0.5945100143061517, + "grad_norm": 0.13406134922764437, + "learning_rate": 7.453793813378853e-05, + "loss": 0.6526, + "step": 6649 + }, + { + "epoch": 0.5945994277539342, + "grad_norm": 0.13129816077821468, + "learning_rate": 7.450993358821589e-05, + "loss": 0.6578, + "step": 6650 + }, + { + "epoch": 0.5946888412017167, + "grad_norm": 0.1305197136659487, + "learning_rate": 7.448193118046435e-05, + "loss": 0.6609, + "step": 6651 + }, + { + "epoch": 0.5947782546494993, + "grad_norm": 0.15207846221047783, + "learning_rate": 7.445393091288247e-05, + "loss": 0.6693, + "step": 6652 + }, + { + "epoch": 0.5948676680972819, + "grad_norm": 0.13479761460680234, + "learning_rate": 7.442593278781848e-05, + "loss": 0.6701, + "step": 6653 + }, + { + "epoch": 0.5949570815450643, + "grad_norm": 0.10729117492898699, + "learning_rate": 7.439793680762068e-05, + "loss": 0.6228, + "step": 6654 + }, + { + "epoch": 0.5950464949928469, + "grad_norm": 0.11990729564235966, + "learning_rate": 7.436994297463698e-05, + "loss": 0.6155, + "step": 6655 + }, + { + "epoch": 0.5951359084406295, + "grad_norm": 0.14766359340253202, + "learning_rate": 7.434195129121518e-05, + "loss": 0.6481, + "step": 6656 + }, + { + "epoch": 0.595225321888412, + "grad_norm": 0.13762215401114955, + "learning_rate": 7.431396175970296e-05, + "loss": 0.665, + "step": 6657 + }, + { + "epoch": 0.5953147353361946, + "grad_norm": 0.1347679812873106, + "learning_rate": 7.428597438244776e-05, + "loss": 0.6983, + "step": 6658 + }, + { + "epoch": 0.5954041487839771, + "grad_norm": 0.12126844457931235, + "learning_rate": 7.425798916179683e-05, + "loss": 0.6623, + "step": 6659 + }, + { + "epoch": 0.5954935622317596, + "grad_norm": 0.1306035615591949, + "learning_rate": 7.423000610009725e-05, + "loss": 0.6226, + "step": 6660 + }, + { + "epoch": 0.5955829756795422, + "grad_norm": 0.133322604528304, + "learning_rate": 7.420202519969595e-05, + "loss": 0.6389, + "step": 6661 + }, + { + "epoch": 0.5956723891273248, + "grad_norm": 0.13421869788242421, + "learning_rate": 7.417404646293961e-05, + "loss": 0.6585, + "step": 6662 + }, + { + "epoch": 0.5957618025751072, + "grad_norm": 0.13246794297287448, + "learning_rate": 7.414606989217482e-05, + "loss": 0.5886, + "step": 6663 + }, + { + "epoch": 0.5958512160228898, + "grad_norm": 0.11799504164211921, + "learning_rate": 7.411809548974792e-05, + "loss": 0.6292, + "step": 6664 + }, + { + "epoch": 0.5959406294706724, + "grad_norm": 0.11482114108663258, + "learning_rate": 7.409012325800511e-05, + "loss": 0.6407, + "step": 6665 + }, + { + "epoch": 0.596030042918455, + "grad_norm": 0.12876538703668236, + "learning_rate": 7.406215319929235e-05, + "loss": 0.6482, + "step": 6666 + }, + { + "epoch": 0.5961194563662375, + "grad_norm": 0.12979520547204396, + "learning_rate": 7.403418531595551e-05, + "loss": 0.6551, + "step": 6667 + }, + { + "epoch": 0.59620886981402, + "grad_norm": 0.16147853504981097, + "learning_rate": 7.400621961034018e-05, + "loss": 0.6981, + "step": 6668 + }, + { + "epoch": 0.5962982832618026, + "grad_norm": 0.1344370882865375, + "learning_rate": 7.39782560847918e-05, + "loss": 0.6175, + "step": 6669 + }, + { + "epoch": 0.5963876967095851, + "grad_norm": 0.1305901048113766, + "learning_rate": 7.395029474165562e-05, + "loss": 0.6658, + "step": 6670 + }, + { + "epoch": 0.5964771101573677, + "grad_norm": 0.12014517891386958, + "learning_rate": 7.392233558327683e-05, + "loss": 0.6497, + "step": 6671 + }, + { + "epoch": 0.5965665236051502, + "grad_norm": 0.12615560232920364, + "learning_rate": 7.389437861200024e-05, + "loss": 0.6597, + "step": 6672 + }, + { + "epoch": 0.5966559370529327, + "grad_norm": 0.12415781013096652, + "learning_rate": 7.386642383017057e-05, + "loss": 0.6297, + "step": 6673 + }, + { + "epoch": 0.5967453505007153, + "grad_norm": 0.1205424647750184, + "learning_rate": 7.383847124013239e-05, + "loss": 0.6158, + "step": 6674 + }, + { + "epoch": 0.5968347639484979, + "grad_norm": 0.12574592324983375, + "learning_rate": 7.381052084423005e-05, + "loss": 0.6154, + "step": 6675 + }, + { + "epoch": 0.5969241773962805, + "grad_norm": 0.1453796269591205, + "learning_rate": 7.378257264480766e-05, + "loss": 0.6382, + "step": 6676 + }, + { + "epoch": 0.5970135908440629, + "grad_norm": 0.13857121969125855, + "learning_rate": 7.375462664420922e-05, + "loss": 0.6564, + "step": 6677 + }, + { + "epoch": 0.5971030042918455, + "grad_norm": 0.12058420240420482, + "learning_rate": 7.37266828447786e-05, + "loss": 0.6382, + "step": 6678 + }, + { + "epoch": 0.5971924177396281, + "grad_norm": 0.12295869318739311, + "learning_rate": 7.369874124885934e-05, + "loss": 0.624, + "step": 6679 + }, + { + "epoch": 0.5972818311874106, + "grad_norm": 0.12467079843687852, + "learning_rate": 7.367080185879489e-05, + "loss": 0.6322, + "step": 6680 + }, + { + "epoch": 0.5973712446351931, + "grad_norm": 0.13268787979506602, + "learning_rate": 7.364286467692848e-05, + "loss": 0.6807, + "step": 6681 + }, + { + "epoch": 0.5974606580829757, + "grad_norm": 0.13011736396831525, + "learning_rate": 7.361492970560322e-05, + "loss": 0.6874, + "step": 6682 + }, + { + "epoch": 0.5975500715307582, + "grad_norm": 0.1264536539969103, + "learning_rate": 7.358699694716189e-05, + "loss": 0.6295, + "step": 6683 + }, + { + "epoch": 0.5976394849785408, + "grad_norm": 0.13503190657192873, + "learning_rate": 7.35590664039472e-05, + "loss": 0.6374, + "step": 6684 + }, + { + "epoch": 0.5977288984263234, + "grad_norm": 0.13556377977005304, + "learning_rate": 7.353113807830175e-05, + "loss": 0.6234, + "step": 6685 + }, + { + "epoch": 0.5978183118741058, + "grad_norm": 0.12458016581445186, + "learning_rate": 7.350321197256777e-05, + "loss": 0.6624, + "step": 6686 + }, + { + "epoch": 0.5979077253218884, + "grad_norm": 0.1367585656843602, + "learning_rate": 7.347528808908737e-05, + "loss": 0.6566, + "step": 6687 + }, + { + "epoch": 0.597997138769671, + "grad_norm": 0.12539663902291107, + "learning_rate": 7.344736643020256e-05, + "loss": 0.6533, + "step": 6688 + }, + { + "epoch": 0.5980865522174535, + "grad_norm": 0.13584090472087887, + "learning_rate": 7.341944699825503e-05, + "loss": 0.6579, + "step": 6689 + }, + { + "epoch": 0.598175965665236, + "grad_norm": 0.12786472715399913, + "learning_rate": 7.33915297955864e-05, + "loss": 0.6354, + "step": 6690 + }, + { + "epoch": 0.5982653791130186, + "grad_norm": 0.1185664121909463, + "learning_rate": 7.3363614824538e-05, + "loss": 0.6236, + "step": 6691 + }, + { + "epoch": 0.5983547925608012, + "grad_norm": 0.11987618106143114, + "learning_rate": 7.333570208745109e-05, + "loss": 0.6256, + "step": 6692 + }, + { + "epoch": 0.5984442060085837, + "grad_norm": 0.13482031067046452, + "learning_rate": 7.330779158666661e-05, + "loss": 0.646, + "step": 6693 + }, + { + "epoch": 0.5985336194563662, + "grad_norm": 0.14043601264263011, + "learning_rate": 7.327988332452545e-05, + "loss": 0.6375, + "step": 6694 + }, + { + "epoch": 0.5986230329041488, + "grad_norm": 0.14918105610088753, + "learning_rate": 7.325197730336819e-05, + "loss": 0.6565, + "step": 6695 + }, + { + "epoch": 0.5987124463519313, + "grad_norm": 0.1352549342312558, + "learning_rate": 7.322407352553529e-05, + "loss": 0.6797, + "step": 6696 + }, + { + "epoch": 0.5988018597997139, + "grad_norm": 0.13940123239649496, + "learning_rate": 7.319617199336701e-05, + "loss": 0.6693, + "step": 6697 + }, + { + "epoch": 0.5988912732474965, + "grad_norm": 0.11631392277105737, + "learning_rate": 7.316827270920339e-05, + "loss": 0.6204, + "step": 6698 + }, + { + "epoch": 0.5989806866952789, + "grad_norm": 0.13679412300580324, + "learning_rate": 7.314037567538436e-05, + "loss": 0.6473, + "step": 6699 + }, + { + "epoch": 0.5990701001430615, + "grad_norm": 0.1282571018027092, + "learning_rate": 7.311248089424958e-05, + "loss": 0.6379, + "step": 6700 + }, + { + "epoch": 0.5991595135908441, + "grad_norm": 0.12479342765278205, + "learning_rate": 7.308458836813856e-05, + "loss": 0.6204, + "step": 6701 + }, + { + "epoch": 0.5992489270386266, + "grad_norm": 0.13832123178708233, + "learning_rate": 7.305669809939062e-05, + "loss": 0.6416, + "step": 6702 + }, + { + "epoch": 0.5993383404864091, + "grad_norm": 0.126378199326614, + "learning_rate": 7.302881009034484e-05, + "loss": 0.5341, + "step": 6703 + }, + { + "epoch": 0.5994277539341917, + "grad_norm": 0.15105797218830122, + "learning_rate": 7.30009243433402e-05, + "loss": 0.6679, + "step": 6704 + }, + { + "epoch": 0.5995171673819742, + "grad_norm": 0.12936508976914537, + "learning_rate": 7.29730408607154e-05, + "loss": 0.665, + "step": 6705 + }, + { + "epoch": 0.5996065808297568, + "grad_norm": 0.12963932044538817, + "learning_rate": 7.294515964480906e-05, + "loss": 0.6492, + "step": 6706 + }, + { + "epoch": 0.5996959942775394, + "grad_norm": 0.1293738254437735, + "learning_rate": 7.291728069795948e-05, + "loss": 0.6024, + "step": 6707 + }, + { + "epoch": 0.5997854077253219, + "grad_norm": 0.1280740904116155, + "learning_rate": 7.28894040225049e-05, + "loss": 0.6608, + "step": 6708 + }, + { + "epoch": 0.5998748211731044, + "grad_norm": 0.15012201639612438, + "learning_rate": 7.286152962078326e-05, + "loss": 0.6444, + "step": 6709 + }, + { + "epoch": 0.599964234620887, + "grad_norm": 0.13401413838316004, + "learning_rate": 7.283365749513231e-05, + "loss": 0.6732, + "step": 6710 + }, + { + "epoch": 0.6000536480686696, + "grad_norm": 0.13274889633958425, + "learning_rate": 7.280578764788975e-05, + "loss": 0.5908, + "step": 6711 + }, + { + "epoch": 0.600143061516452, + "grad_norm": 0.13669177874790114, + "learning_rate": 7.277792008139287e-05, + "loss": 0.6265, + "step": 6712 + }, + { + "epoch": 0.6002324749642346, + "grad_norm": 0.13124038803134483, + "learning_rate": 7.2750054797979e-05, + "loss": 0.6646, + "step": 6713 + }, + { + "epoch": 0.6003218884120172, + "grad_norm": 0.13974919957397597, + "learning_rate": 7.272219179998511e-05, + "loss": 0.6464, + "step": 6714 + }, + { + "epoch": 0.6004113018597997, + "grad_norm": 0.1389409644188988, + "learning_rate": 7.269433108974809e-05, + "loss": 0.6455, + "step": 6715 + }, + { + "epoch": 0.6005007153075823, + "grad_norm": 0.14788135815874828, + "learning_rate": 7.266647266960452e-05, + "loss": 0.7157, + "step": 6716 + }, + { + "epoch": 0.6005901287553648, + "grad_norm": 0.14124204511750635, + "learning_rate": 7.263861654189086e-05, + "loss": 0.6813, + "step": 6717 + }, + { + "epoch": 0.6006795422031473, + "grad_norm": 0.1411013001921186, + "learning_rate": 7.261076270894342e-05, + "loss": 0.6305, + "step": 6718 + }, + { + "epoch": 0.6007689556509299, + "grad_norm": 0.14229884257630376, + "learning_rate": 7.258291117309817e-05, + "loss": 0.6728, + "step": 6719 + }, + { + "epoch": 0.6008583690987125, + "grad_norm": 0.13040883206444762, + "learning_rate": 7.25550619366911e-05, + "loss": 0.647, + "step": 6720 + }, + { + "epoch": 0.600947782546495, + "grad_norm": 0.1532950729891049, + "learning_rate": 7.252721500205783e-05, + "loss": 0.6256, + "step": 6721 + }, + { + "epoch": 0.6010371959942775, + "grad_norm": 0.13980517313328109, + "learning_rate": 7.249937037153387e-05, + "loss": 0.6434, + "step": 6722 + }, + { + "epoch": 0.6011266094420601, + "grad_norm": 0.10672854620083042, + "learning_rate": 7.24715280474545e-05, + "loss": 0.6255, + "step": 6723 + }, + { + "epoch": 0.6012160228898427, + "grad_norm": 0.12795566928964855, + "learning_rate": 7.244368803215482e-05, + "loss": 0.6614, + "step": 6724 + }, + { + "epoch": 0.6013054363376252, + "grad_norm": 0.12898280194846246, + "learning_rate": 7.241585032796977e-05, + "loss": 0.6067, + "step": 6725 + }, + { + "epoch": 0.6013948497854077, + "grad_norm": 0.1290121231877142, + "learning_rate": 7.238801493723398e-05, + "loss": 0.6446, + "step": 6726 + }, + { + "epoch": 0.6014842632331903, + "grad_norm": 0.1365794325276368, + "learning_rate": 7.236018186228206e-05, + "loss": 0.6746, + "step": 6727 + }, + { + "epoch": 0.6015736766809728, + "grad_norm": 0.13268587075213073, + "learning_rate": 7.233235110544833e-05, + "loss": 0.6687, + "step": 6728 + }, + { + "epoch": 0.6016630901287554, + "grad_norm": 0.1290788868040094, + "learning_rate": 7.230452266906689e-05, + "loss": 0.6662, + "step": 6729 + }, + { + "epoch": 0.6017525035765379, + "grad_norm": 0.1366925838326267, + "learning_rate": 7.227669655547167e-05, + "loss": 0.6661, + "step": 6730 + }, + { + "epoch": 0.6018419170243204, + "grad_norm": 0.1186005599399236, + "learning_rate": 7.224887276699645e-05, + "loss": 0.6247, + "step": 6731 + }, + { + "epoch": 0.601931330472103, + "grad_norm": 0.12953193853894815, + "learning_rate": 7.222105130597477e-05, + "loss": 0.6777, + "step": 6732 + }, + { + "epoch": 0.6020207439198856, + "grad_norm": 0.1323869886778551, + "learning_rate": 7.21932321747399e-05, + "loss": 0.6296, + "step": 6733 + }, + { + "epoch": 0.602110157367668, + "grad_norm": 0.13540621477055076, + "learning_rate": 7.21654153756251e-05, + "loss": 0.6414, + "step": 6734 + }, + { + "epoch": 0.6021995708154506, + "grad_norm": 0.1252804489135302, + "learning_rate": 7.213760091096331e-05, + "loss": 0.6209, + "step": 6735 + }, + { + "epoch": 0.6022889842632332, + "grad_norm": 0.12439547253955051, + "learning_rate": 7.210978878308729e-05, + "loss": 0.6504, + "step": 6736 + }, + { + "epoch": 0.6023783977110158, + "grad_norm": 0.11685958885889472, + "learning_rate": 7.208197899432958e-05, + "loss": 0.6153, + "step": 6737 + }, + { + "epoch": 0.6024678111587983, + "grad_norm": 0.1280652030566039, + "learning_rate": 7.20541715470226e-05, + "loss": 0.6451, + "step": 6738 + }, + { + "epoch": 0.6025572246065808, + "grad_norm": 0.12067004828573544, + "learning_rate": 7.202636644349845e-05, + "loss": 0.6208, + "step": 6739 + }, + { + "epoch": 0.6026466380543634, + "grad_norm": 0.13922405867527854, + "learning_rate": 7.199856368608922e-05, + "loss": 0.689, + "step": 6740 + }, + { + "epoch": 0.6027360515021459, + "grad_norm": 0.14407592909406292, + "learning_rate": 7.197076327712659e-05, + "loss": 0.6463, + "step": 6741 + }, + { + "epoch": 0.6028254649499285, + "grad_norm": 0.1280191204240011, + "learning_rate": 7.194296521894223e-05, + "loss": 0.634, + "step": 6742 + }, + { + "epoch": 0.602914878397711, + "grad_norm": 0.13754088370349948, + "learning_rate": 7.191516951386751e-05, + "loss": 0.6394, + "step": 6743 + }, + { + "epoch": 0.6030042918454935, + "grad_norm": 0.13747082657956844, + "learning_rate": 7.188737616423356e-05, + "loss": 0.6515, + "step": 6744 + }, + { + "epoch": 0.6030937052932761, + "grad_norm": 0.1416490233345828, + "learning_rate": 7.185958517237146e-05, + "loss": 0.6287, + "step": 6745 + }, + { + "epoch": 0.6031831187410587, + "grad_norm": 0.13448230858013735, + "learning_rate": 7.183179654061191e-05, + "loss": 0.6481, + "step": 6746 + }, + { + "epoch": 0.6032725321888412, + "grad_norm": 0.13076124304234946, + "learning_rate": 7.18040102712856e-05, + "loss": 0.6153, + "step": 6747 + }, + { + "epoch": 0.6033619456366237, + "grad_norm": 0.1388694262391988, + "learning_rate": 7.17762263667229e-05, + "loss": 0.6697, + "step": 6748 + }, + { + "epoch": 0.6034513590844063, + "grad_norm": 0.1304180147078259, + "learning_rate": 7.1748444829254e-05, + "loss": 0.6571, + "step": 6749 + }, + { + "epoch": 0.6035407725321889, + "grad_norm": 0.11825171670613792, + "learning_rate": 7.172066566120892e-05, + "loss": 0.6346, + "step": 6750 + }, + { + "epoch": 0.6036301859799714, + "grad_norm": 0.14782207115119234, + "learning_rate": 7.169288886491746e-05, + "loss": 0.6518, + "step": 6751 + }, + { + "epoch": 0.6037195994277539, + "grad_norm": 0.13241502766543115, + "learning_rate": 7.166511444270924e-05, + "loss": 0.6613, + "step": 6752 + }, + { + "epoch": 0.6038090128755365, + "grad_norm": 0.13036574519294092, + "learning_rate": 7.16373423969136e-05, + "loss": 0.6116, + "step": 6753 + }, + { + "epoch": 0.603898426323319, + "grad_norm": 0.14504421547009763, + "learning_rate": 7.160957272985982e-05, + "loss": 0.6617, + "step": 6754 + }, + { + "epoch": 0.6039878397711016, + "grad_norm": 0.15265548400877332, + "learning_rate": 7.158180544387691e-05, + "loss": 0.631, + "step": 6755 + }, + { + "epoch": 0.6040772532188842, + "grad_norm": 0.1304703710410161, + "learning_rate": 7.155404054129366e-05, + "loss": 0.6685, + "step": 6756 + }, + { + "epoch": 0.6041666666666666, + "grad_norm": 0.13550432533237353, + "learning_rate": 7.152627802443866e-05, + "loss": 0.6419, + "step": 6757 + }, + { + "epoch": 0.6042560801144492, + "grad_norm": 0.11935719489017622, + "learning_rate": 7.149851789564034e-05, + "loss": 0.6184, + "step": 6758 + }, + { + "epoch": 0.6043454935622318, + "grad_norm": 0.13223013783741455, + "learning_rate": 7.147076015722691e-05, + "loss": 0.6292, + "step": 6759 + }, + { + "epoch": 0.6044349070100143, + "grad_norm": 0.14039432238328772, + "learning_rate": 7.144300481152633e-05, + "loss": 0.6409, + "step": 6760 + }, + { + "epoch": 0.6045243204577968, + "grad_norm": 0.13608329744599956, + "learning_rate": 7.141525186086647e-05, + "loss": 0.6515, + "step": 6761 + }, + { + "epoch": 0.6046137339055794, + "grad_norm": 0.13409240087665733, + "learning_rate": 7.138750130757493e-05, + "loss": 0.6272, + "step": 6762 + }, + { + "epoch": 0.604703147353362, + "grad_norm": 0.12956629378914328, + "learning_rate": 7.135975315397912e-05, + "loss": 0.6375, + "step": 6763 + }, + { + "epoch": 0.6047925608011445, + "grad_norm": 0.12603111188203686, + "learning_rate": 7.133200740240618e-05, + "loss": 0.6237, + "step": 6764 + }, + { + "epoch": 0.6048819742489271, + "grad_norm": 0.1344196954542709, + "learning_rate": 7.130426405518318e-05, + "loss": 0.6947, + "step": 6765 + }, + { + "epoch": 0.6049713876967096, + "grad_norm": 0.1487724195173802, + "learning_rate": 7.127652311463691e-05, + "loss": 0.6655, + "step": 6766 + }, + { + "epoch": 0.6050608011444921, + "grad_norm": 0.12601827988248424, + "learning_rate": 7.124878458309391e-05, + "loss": 0.6284, + "step": 6767 + }, + { + "epoch": 0.6051502145922747, + "grad_norm": 0.14842408140619806, + "learning_rate": 7.122104846288064e-05, + "loss": 0.6763, + "step": 6768 + }, + { + "epoch": 0.6052396280400573, + "grad_norm": 0.14607715384739187, + "learning_rate": 7.119331475632332e-05, + "loss": 0.65, + "step": 6769 + }, + { + "epoch": 0.6053290414878397, + "grad_norm": 0.13337611190735765, + "learning_rate": 7.116558346574788e-05, + "loss": 0.6681, + "step": 6770 + }, + { + "epoch": 0.6054184549356223, + "grad_norm": 0.12796838920222353, + "learning_rate": 7.113785459348012e-05, + "loss": 0.6412, + "step": 6771 + }, + { + "epoch": 0.6055078683834049, + "grad_norm": 0.12857120202064562, + "learning_rate": 7.111012814184566e-05, + "loss": 0.6415, + "step": 6772 + }, + { + "epoch": 0.6055972818311874, + "grad_norm": 0.14108878568078778, + "learning_rate": 7.108240411316986e-05, + "loss": 0.6379, + "step": 6773 + }, + { + "epoch": 0.60568669527897, + "grad_norm": 0.13434340537746628, + "learning_rate": 7.105468250977786e-05, + "loss": 0.6244, + "step": 6774 + }, + { + "epoch": 0.6057761087267525, + "grad_norm": 0.13777004946469953, + "learning_rate": 7.10269633339947e-05, + "loss": 0.67, + "step": 6775 + }, + { + "epoch": 0.605865522174535, + "grad_norm": 0.12269598417914898, + "learning_rate": 7.099924658814517e-05, + "loss": 0.6245, + "step": 6776 + }, + { + "epoch": 0.6059549356223176, + "grad_norm": 0.14683490941907276, + "learning_rate": 7.097153227455379e-05, + "loss": 0.6499, + "step": 6777 + }, + { + "epoch": 0.6060443490701002, + "grad_norm": 0.14121604223408468, + "learning_rate": 7.094382039554493e-05, + "loss": 0.6996, + "step": 6778 + }, + { + "epoch": 0.6061337625178826, + "grad_norm": 0.12025225668235402, + "learning_rate": 7.091611095344277e-05, + "loss": 0.6608, + "step": 6779 + }, + { + "epoch": 0.6062231759656652, + "grad_norm": 0.12887015530271834, + "learning_rate": 7.088840395057124e-05, + "loss": 0.6439, + "step": 6780 + }, + { + "epoch": 0.6063125894134478, + "grad_norm": 0.12703400556013303, + "learning_rate": 7.086069938925411e-05, + "loss": 0.644, + "step": 6781 + }, + { + "epoch": 0.6064020028612304, + "grad_norm": 0.13114643934009917, + "learning_rate": 7.083299727181495e-05, + "loss": 0.6278, + "step": 6782 + }, + { + "epoch": 0.6064914163090128, + "grad_norm": 0.13575262392540657, + "learning_rate": 7.080529760057709e-05, + "loss": 0.6437, + "step": 6783 + }, + { + "epoch": 0.6065808297567954, + "grad_norm": 0.1385751194689619, + "learning_rate": 7.077760037786365e-05, + "loss": 0.6439, + "step": 6784 + }, + { + "epoch": 0.606670243204578, + "grad_norm": 0.14792399558509006, + "learning_rate": 7.074990560599759e-05, + "loss": 0.642, + "step": 6785 + }, + { + "epoch": 0.6067596566523605, + "grad_norm": 0.14782069319168528, + "learning_rate": 7.072221328730162e-05, + "loss": 0.6835, + "step": 6786 + }, + { + "epoch": 0.6068490701001431, + "grad_norm": 0.13244515435319767, + "learning_rate": 7.069452342409825e-05, + "loss": 0.6688, + "step": 6787 + }, + { + "epoch": 0.6069384835479256, + "grad_norm": 0.13539910365822141, + "learning_rate": 7.066683601870978e-05, + "loss": 0.6349, + "step": 6788 + }, + { + "epoch": 0.6070278969957081, + "grad_norm": 0.12988907448745995, + "learning_rate": 7.063915107345839e-05, + "loss": 0.6563, + "step": 6789 + }, + { + "epoch": 0.6071173104434907, + "grad_norm": 0.15322840045272482, + "learning_rate": 7.061146859066594e-05, + "loss": 0.6441, + "step": 6790 + }, + { + "epoch": 0.6072067238912733, + "grad_norm": 0.12933574030599412, + "learning_rate": 7.058378857265411e-05, + "loss": 0.6506, + "step": 6791 + }, + { + "epoch": 0.6072961373390557, + "grad_norm": 0.11751957567039363, + "learning_rate": 7.055611102174442e-05, + "loss": 0.6206, + "step": 6792 + }, + { + "epoch": 0.6073855507868383, + "grad_norm": 0.12652419323158406, + "learning_rate": 7.052843594025815e-05, + "loss": 0.6222, + "step": 6793 + }, + { + "epoch": 0.6074749642346209, + "grad_norm": 0.15226102848107545, + "learning_rate": 7.050076333051634e-05, + "loss": 0.6646, + "step": 6794 + }, + { + "epoch": 0.6075643776824035, + "grad_norm": 0.14142435485581673, + "learning_rate": 7.047309319483985e-05, + "loss": 0.6759, + "step": 6795 + }, + { + "epoch": 0.607653791130186, + "grad_norm": 0.13756393175402745, + "learning_rate": 7.044542553554943e-05, + "loss": 0.6635, + "step": 6796 + }, + { + "epoch": 0.6077432045779685, + "grad_norm": 0.12841544070361685, + "learning_rate": 7.041776035496547e-05, + "loss": 0.6074, + "step": 6797 + }, + { + "epoch": 0.6078326180257511, + "grad_norm": 0.13766585526763522, + "learning_rate": 7.039009765540822e-05, + "loss": 0.6228, + "step": 6798 + }, + { + "epoch": 0.6079220314735336, + "grad_norm": 0.13261317558005284, + "learning_rate": 7.036243743919773e-05, + "loss": 0.6334, + "step": 6799 + }, + { + "epoch": 0.6080114449213162, + "grad_norm": 0.12680621863301686, + "learning_rate": 7.033477970865381e-05, + "loss": 0.6384, + "step": 6800 + }, + { + "epoch": 0.6081008583690987, + "grad_norm": 0.14458582545988422, + "learning_rate": 7.030712446609608e-05, + "loss": 0.6593, + "step": 6801 + }, + { + "epoch": 0.6081902718168812, + "grad_norm": 0.1346571992017083, + "learning_rate": 7.027947171384394e-05, + "loss": 0.6452, + "step": 6802 + }, + { + "epoch": 0.6082796852646638, + "grad_norm": 0.12726769230910734, + "learning_rate": 7.025182145421665e-05, + "loss": 0.6371, + "step": 6803 + }, + { + "epoch": 0.6083690987124464, + "grad_norm": 0.12936948520669775, + "learning_rate": 7.022417368953317e-05, + "loss": 0.6246, + "step": 6804 + }, + { + "epoch": 0.608458512160229, + "grad_norm": 0.1584273832855792, + "learning_rate": 7.019652842211226e-05, + "loss": 0.6927, + "step": 6805 + }, + { + "epoch": 0.6085479256080114, + "grad_norm": 0.12392233854963389, + "learning_rate": 7.016888565427253e-05, + "loss": 0.6217, + "step": 6806 + }, + { + "epoch": 0.608637339055794, + "grad_norm": 0.1257357035863358, + "learning_rate": 7.014124538833234e-05, + "loss": 0.635, + "step": 6807 + }, + { + "epoch": 0.6087267525035766, + "grad_norm": 0.122817570247331, + "learning_rate": 7.011360762660983e-05, + "loss": 0.6287, + "step": 6808 + }, + { + "epoch": 0.6088161659513591, + "grad_norm": 0.12866179829231494, + "learning_rate": 7.008597237142293e-05, + "loss": 0.6517, + "step": 6809 + }, + { + "epoch": 0.6089055793991416, + "grad_norm": 0.11811140119972027, + "learning_rate": 7.005833962508943e-05, + "loss": 0.6394, + "step": 6810 + }, + { + "epoch": 0.6089949928469242, + "grad_norm": 0.1284398674250895, + "learning_rate": 7.003070938992682e-05, + "loss": 0.6657, + "step": 6811 + }, + { + "epoch": 0.6090844062947067, + "grad_norm": 0.13839792920174163, + "learning_rate": 7.000308166825243e-05, + "loss": 0.6583, + "step": 6812 + }, + { + "epoch": 0.6091738197424893, + "grad_norm": 0.11928797197734893, + "learning_rate": 6.997545646238335e-05, + "loss": 0.626, + "step": 6813 + }, + { + "epoch": 0.6092632331902719, + "grad_norm": 0.1337919638709587, + "learning_rate": 6.994783377463645e-05, + "loss": 0.6531, + "step": 6814 + }, + { + "epoch": 0.6093526466380543, + "grad_norm": 0.12707526025648833, + "learning_rate": 6.992021360732848e-05, + "loss": 0.6128, + "step": 6815 + }, + { + "epoch": 0.6094420600858369, + "grad_norm": 0.11588649286186811, + "learning_rate": 6.989259596277582e-05, + "loss": 0.5906, + "step": 6816 + }, + { + "epoch": 0.6095314735336195, + "grad_norm": 0.15317626802407097, + "learning_rate": 6.98649808432948e-05, + "loss": 0.658, + "step": 6817 + }, + { + "epoch": 0.609620886981402, + "grad_norm": 0.15735088588094442, + "learning_rate": 6.983736825120144e-05, + "loss": 0.5952, + "step": 6818 + }, + { + "epoch": 0.6097103004291845, + "grad_norm": 0.1472875220509623, + "learning_rate": 6.980975818881159e-05, + "loss": 0.6587, + "step": 6819 + }, + { + "epoch": 0.6097997138769671, + "grad_norm": 0.12311575179362905, + "learning_rate": 6.978215065844087e-05, + "loss": 0.6192, + "step": 6820 + }, + { + "epoch": 0.6098891273247496, + "grad_norm": 0.11522764599943114, + "learning_rate": 6.975454566240465e-05, + "loss": 0.6384, + "step": 6821 + }, + { + "epoch": 0.6099785407725322, + "grad_norm": 0.1353695052618803, + "learning_rate": 6.972694320301813e-05, + "loss": 0.693, + "step": 6822 + }, + { + "epoch": 0.6100679542203148, + "grad_norm": 0.12795376330517896, + "learning_rate": 6.969934328259637e-05, + "loss": 0.6347, + "step": 6823 + }, + { + "epoch": 0.6101573676680973, + "grad_norm": 0.12037946101984662, + "learning_rate": 6.96717459034541e-05, + "loss": 0.6019, + "step": 6824 + }, + { + "epoch": 0.6102467811158798, + "grad_norm": 0.13061195551708255, + "learning_rate": 6.964415106790586e-05, + "loss": 0.6435, + "step": 6825 + }, + { + "epoch": 0.6103361945636624, + "grad_norm": 0.13070622658170228, + "learning_rate": 6.961655877826603e-05, + "loss": 0.6428, + "step": 6826 + }, + { + "epoch": 0.610425608011445, + "grad_norm": 0.13233603144193543, + "learning_rate": 6.95889690368487e-05, + "loss": 0.6528, + "step": 6827 + }, + { + "epoch": 0.6105150214592274, + "grad_norm": 0.14103336693449955, + "learning_rate": 6.956138184596782e-05, + "loss": 0.6868, + "step": 6828 + }, + { + "epoch": 0.61060443490701, + "grad_norm": 0.13798218967391424, + "learning_rate": 6.953379720793703e-05, + "loss": 0.6053, + "step": 6829 + }, + { + "epoch": 0.6106938483547926, + "grad_norm": 0.12758382713331393, + "learning_rate": 6.950621512506993e-05, + "loss": 0.6486, + "step": 6830 + }, + { + "epoch": 0.6107832618025751, + "grad_norm": 0.15087091749831885, + "learning_rate": 6.947863559967976e-05, + "loss": 0.6833, + "step": 6831 + }, + { + "epoch": 0.6108726752503576, + "grad_norm": 0.13447007642958472, + "learning_rate": 6.945105863407951e-05, + "loss": 0.6409, + "step": 6832 + }, + { + "epoch": 0.6109620886981402, + "grad_norm": 0.15150030034140846, + "learning_rate": 6.942348423058212e-05, + "loss": 0.6435, + "step": 6833 + }, + { + "epoch": 0.6110515021459227, + "grad_norm": 0.13650549315626367, + "learning_rate": 6.939591239150014e-05, + "loss": 0.6401, + "step": 6834 + }, + { + "epoch": 0.6111409155937053, + "grad_norm": 0.13649795573542042, + "learning_rate": 6.936834311914606e-05, + "loss": 0.639, + "step": 6835 + }, + { + "epoch": 0.6112303290414879, + "grad_norm": 0.14198394150391538, + "learning_rate": 6.934077641583201e-05, + "loss": 0.6759, + "step": 6836 + }, + { + "epoch": 0.6113197424892703, + "grad_norm": 0.14473780844392334, + "learning_rate": 6.931321228387005e-05, + "loss": 0.6308, + "step": 6837 + }, + { + "epoch": 0.6114091559370529, + "grad_norm": 0.1450715438384638, + "learning_rate": 6.928565072557191e-05, + "loss": 0.6393, + "step": 6838 + }, + { + "epoch": 0.6114985693848355, + "grad_norm": 0.13624932958005012, + "learning_rate": 6.925809174324915e-05, + "loss": 0.6469, + "step": 6839 + }, + { + "epoch": 0.6115879828326181, + "grad_norm": 0.14246751679987368, + "learning_rate": 6.923053533921312e-05, + "loss": 0.6828, + "step": 6840 + }, + { + "epoch": 0.6116773962804005, + "grad_norm": 0.1402336838445518, + "learning_rate": 6.920298151577491e-05, + "loss": 0.6418, + "step": 6841 + }, + { + "epoch": 0.6117668097281831, + "grad_norm": 0.15806463102346052, + "learning_rate": 6.917543027524546e-05, + "loss": 0.6504, + "step": 6842 + }, + { + "epoch": 0.6118562231759657, + "grad_norm": 0.1306280049721121, + "learning_rate": 6.914788161993542e-05, + "loss": 0.6625, + "step": 6843 + }, + { + "epoch": 0.6119456366237482, + "grad_norm": 0.13757678264506412, + "learning_rate": 6.912033555215532e-05, + "loss": 0.5936, + "step": 6844 + }, + { + "epoch": 0.6120350500715308, + "grad_norm": 0.14246294379705043, + "learning_rate": 6.909279207421536e-05, + "loss": 0.6191, + "step": 6845 + }, + { + "epoch": 0.6121244635193133, + "grad_norm": 0.13212400730797452, + "learning_rate": 6.906525118842563e-05, + "loss": 0.6613, + "step": 6846 + }, + { + "epoch": 0.6122138769670958, + "grad_norm": 0.12361089814452982, + "learning_rate": 6.903771289709591e-05, + "loss": 0.6217, + "step": 6847 + }, + { + "epoch": 0.6123032904148784, + "grad_norm": 0.13871778218688915, + "learning_rate": 6.901017720253583e-05, + "loss": 0.6792, + "step": 6848 + }, + { + "epoch": 0.612392703862661, + "grad_norm": 0.12464085819092928, + "learning_rate": 6.898264410705475e-05, + "loss": 0.6285, + "step": 6849 + }, + { + "epoch": 0.6124821173104434, + "grad_norm": 0.1425313046828018, + "learning_rate": 6.89551136129618e-05, + "loss": 0.6747, + "step": 6850 + }, + { + "epoch": 0.612571530758226, + "grad_norm": 0.12918477098312486, + "learning_rate": 6.892758572256604e-05, + "loss": 0.6415, + "step": 6851 + }, + { + "epoch": 0.6126609442060086, + "grad_norm": 0.12246539354480186, + "learning_rate": 6.890006043817612e-05, + "loss": 0.6511, + "step": 6852 + }, + { + "epoch": 0.6127503576537912, + "grad_norm": 0.13777148920018686, + "learning_rate": 6.887253776210058e-05, + "loss": 0.6855, + "step": 6853 + }, + { + "epoch": 0.6128397711015737, + "grad_norm": 0.14733081159559722, + "learning_rate": 6.884501769664773e-05, + "loss": 0.6866, + "step": 6854 + }, + { + "epoch": 0.6129291845493562, + "grad_norm": 0.12997108598669418, + "learning_rate": 6.881750024412557e-05, + "loss": 0.6477, + "step": 6855 + }, + { + "epoch": 0.6130185979971388, + "grad_norm": 0.12629940437392176, + "learning_rate": 6.878998540684206e-05, + "loss": 0.6447, + "step": 6856 + }, + { + "epoch": 0.6131080114449213, + "grad_norm": 0.13669655865274552, + "learning_rate": 6.876247318710471e-05, + "loss": 0.6827, + "step": 6857 + }, + { + "epoch": 0.6131974248927039, + "grad_norm": 0.13878949563599557, + "learning_rate": 6.873496358722105e-05, + "loss": 0.6483, + "step": 6858 + }, + { + "epoch": 0.6132868383404864, + "grad_norm": 0.12035983255691902, + "learning_rate": 6.870745660949822e-05, + "loss": 0.6239, + "step": 6859 + }, + { + "epoch": 0.6133762517882689, + "grad_norm": 0.14490742479472457, + "learning_rate": 6.867995225624324e-05, + "loss": 0.6498, + "step": 6860 + }, + { + "epoch": 0.6134656652360515, + "grad_norm": 0.1422778851581028, + "learning_rate": 6.865245052976284e-05, + "loss": 0.6553, + "step": 6861 + }, + { + "epoch": 0.6135550786838341, + "grad_norm": 0.13012127354182051, + "learning_rate": 6.862495143236353e-05, + "loss": 0.6238, + "step": 6862 + }, + { + "epoch": 0.6136444921316166, + "grad_norm": 0.14045780039030312, + "learning_rate": 6.859745496635167e-05, + "loss": 0.6328, + "step": 6863 + }, + { + "epoch": 0.6137339055793991, + "grad_norm": 0.13427657138396568, + "learning_rate": 6.85699611340333e-05, + "loss": 0.6309, + "step": 6864 + }, + { + "epoch": 0.6138233190271817, + "grad_norm": 0.14026970104283237, + "learning_rate": 6.854246993771438e-05, + "loss": 0.6713, + "step": 6865 + }, + { + "epoch": 0.6139127324749643, + "grad_norm": 0.1431980512319045, + "learning_rate": 6.851498137970049e-05, + "loss": 0.6959, + "step": 6866 + }, + { + "epoch": 0.6140021459227468, + "grad_norm": 0.12231426267587596, + "learning_rate": 6.84874954622971e-05, + "loss": 0.6376, + "step": 6867 + }, + { + "epoch": 0.6140915593705293, + "grad_norm": 0.1439476901801194, + "learning_rate": 6.84600121878094e-05, + "loss": 0.6757, + "step": 6868 + }, + { + "epoch": 0.6141809728183119, + "grad_norm": 0.1310641553420817, + "learning_rate": 6.843253155854239e-05, + "loss": 0.6299, + "step": 6869 + }, + { + "epoch": 0.6142703862660944, + "grad_norm": 0.11916151508146033, + "learning_rate": 6.840505357680085e-05, + "loss": 0.6571, + "step": 6870 + }, + { + "epoch": 0.614359799713877, + "grad_norm": 0.1340177014323877, + "learning_rate": 6.837757824488927e-05, + "loss": 0.5924, + "step": 6871 + }, + { + "epoch": 0.6144492131616596, + "grad_norm": 0.1330328148699806, + "learning_rate": 6.835010556511201e-05, + "loss": 0.6523, + "step": 6872 + }, + { + "epoch": 0.614538626609442, + "grad_norm": 0.13038654127569907, + "learning_rate": 6.832263553977321e-05, + "loss": 0.6434, + "step": 6873 + }, + { + "epoch": 0.6146280400572246, + "grad_norm": 0.12680760123420998, + "learning_rate": 6.829516817117671e-05, + "loss": 0.6719, + "step": 6874 + }, + { + "epoch": 0.6147174535050072, + "grad_norm": 0.14224199541566665, + "learning_rate": 6.826770346162614e-05, + "loss": 0.663, + "step": 6875 + }, + { + "epoch": 0.6148068669527897, + "grad_norm": 0.12472328394753844, + "learning_rate": 6.8240241413425e-05, + "loss": 0.6284, + "step": 6876 + }, + { + "epoch": 0.6148962804005722, + "grad_norm": 0.14864637202718525, + "learning_rate": 6.821278202887643e-05, + "loss": 0.6334, + "step": 6877 + }, + { + "epoch": 0.6149856938483548, + "grad_norm": 0.13437120671806888, + "learning_rate": 6.818532531028342e-05, + "loss": 0.6112, + "step": 6878 + }, + { + "epoch": 0.6150751072961373, + "grad_norm": 0.12437667937250668, + "learning_rate": 6.815787125994875e-05, + "loss": 0.5965, + "step": 6879 + }, + { + "epoch": 0.6151645207439199, + "grad_norm": 0.14331141899576383, + "learning_rate": 6.813041988017501e-05, + "loss": 0.6482, + "step": 6880 + }, + { + "epoch": 0.6152539341917024, + "grad_norm": 0.1283987083212686, + "learning_rate": 6.810297117326445e-05, + "loss": 0.637, + "step": 6881 + }, + { + "epoch": 0.615343347639485, + "grad_norm": 0.13829850998834003, + "learning_rate": 6.807552514151915e-05, + "loss": 0.6657, + "step": 6882 + }, + { + "epoch": 0.6154327610872675, + "grad_norm": 0.13501658954149767, + "learning_rate": 6.804808178724105e-05, + "loss": 0.6368, + "step": 6883 + }, + { + "epoch": 0.6155221745350501, + "grad_norm": 0.13395592338071954, + "learning_rate": 6.802064111273173e-05, + "loss": 0.6495, + "step": 6884 + }, + { + "epoch": 0.6156115879828327, + "grad_norm": 0.1386279068624879, + "learning_rate": 6.799320312029256e-05, + "loss": 0.6272, + "step": 6885 + }, + { + "epoch": 0.6157010014306151, + "grad_norm": 0.12891432846105008, + "learning_rate": 6.796576781222481e-05, + "loss": 0.6112, + "step": 6886 + }, + { + "epoch": 0.6157904148783977, + "grad_norm": 0.14597581475203733, + "learning_rate": 6.793833519082946e-05, + "loss": 0.6695, + "step": 6887 + }, + { + "epoch": 0.6158798283261803, + "grad_norm": 0.13517514172393377, + "learning_rate": 6.791090525840722e-05, + "loss": 0.6452, + "step": 6888 + }, + { + "epoch": 0.6159692417739628, + "grad_norm": 0.14813777639377512, + "learning_rate": 6.788347801725859e-05, + "loss": 0.6284, + "step": 6889 + }, + { + "epoch": 0.6160586552217453, + "grad_norm": 0.13295241527344193, + "learning_rate": 6.785605346968386e-05, + "loss": 0.6708, + "step": 6890 + }, + { + "epoch": 0.6161480686695279, + "grad_norm": 0.12396889889911204, + "learning_rate": 6.782863161798311e-05, + "loss": 0.621, + "step": 6891 + }, + { + "epoch": 0.6162374821173104, + "grad_norm": 0.122722009964724, + "learning_rate": 6.780121246445617e-05, + "loss": 0.6047, + "step": 6892 + }, + { + "epoch": 0.616326895565093, + "grad_norm": 0.1418771581366837, + "learning_rate": 6.777379601140264e-05, + "loss": 0.6355, + "step": 6893 + }, + { + "epoch": 0.6164163090128756, + "grad_norm": 0.13566318971297092, + "learning_rate": 6.774638226112195e-05, + "loss": 0.6209, + "step": 6894 + }, + { + "epoch": 0.616505722460658, + "grad_norm": 0.1415276318952624, + "learning_rate": 6.771897121591321e-05, + "loss": 0.6658, + "step": 6895 + }, + { + "epoch": 0.6165951359084406, + "grad_norm": 0.15466725266438763, + "learning_rate": 6.76915628780754e-05, + "loss": 0.6823, + "step": 6896 + }, + { + "epoch": 0.6166845493562232, + "grad_norm": 0.11580154767356832, + "learning_rate": 6.766415724990718e-05, + "loss": 0.6046, + "step": 6897 + }, + { + "epoch": 0.6167739628040058, + "grad_norm": 0.1323889052796042, + "learning_rate": 6.7636754333707e-05, + "loss": 0.5886, + "step": 6898 + }, + { + "epoch": 0.6168633762517882, + "grad_norm": 0.1377085776697591, + "learning_rate": 6.760935413177316e-05, + "loss": 0.6475, + "step": 6899 + }, + { + "epoch": 0.6169527896995708, + "grad_norm": 0.14212792750946962, + "learning_rate": 6.75819566464037e-05, + "loss": 0.6807, + "step": 6900 + }, + { + "epoch": 0.6170422031473534, + "grad_norm": 0.12117817140027312, + "learning_rate": 6.755456187989637e-05, + "loss": 0.6393, + "step": 6901 + }, + { + "epoch": 0.6171316165951359, + "grad_norm": 0.13012660150249603, + "learning_rate": 6.752716983454875e-05, + "loss": 0.5982, + "step": 6902 + }, + { + "epoch": 0.6172210300429185, + "grad_norm": 0.1414038653137403, + "learning_rate": 6.749978051265819e-05, + "loss": 0.65, + "step": 6903 + }, + { + "epoch": 0.617310443490701, + "grad_norm": 0.13013717105137887, + "learning_rate": 6.74723939165218e-05, + "loss": 0.5915, + "step": 6904 + }, + { + "epoch": 0.6173998569384835, + "grad_norm": 0.12455460862368453, + "learning_rate": 6.74450100484364e-05, + "loss": 0.646, + "step": 6905 + }, + { + "epoch": 0.6174892703862661, + "grad_norm": 0.13053237003263704, + "learning_rate": 6.741762891069871e-05, + "loss": 0.6147, + "step": 6906 + }, + { + "epoch": 0.6175786838340487, + "grad_norm": 0.13013705153519595, + "learning_rate": 6.739025050560514e-05, + "loss": 0.6417, + "step": 6907 + }, + { + "epoch": 0.6176680972818311, + "grad_norm": 0.12942392559204377, + "learning_rate": 6.736287483545191e-05, + "loss": 0.6243, + "step": 6908 + }, + { + "epoch": 0.6177575107296137, + "grad_norm": 0.13522419351708972, + "learning_rate": 6.73355019025349e-05, + "loss": 0.627, + "step": 6909 + }, + { + "epoch": 0.6178469241773963, + "grad_norm": 0.13760534824366777, + "learning_rate": 6.730813170914993e-05, + "loss": 0.6639, + "step": 6910 + }, + { + "epoch": 0.6179363376251789, + "grad_norm": 0.11990579611082618, + "learning_rate": 6.72807642575925e-05, + "loss": 0.628, + "step": 6911 + }, + { + "epoch": 0.6180257510729614, + "grad_norm": 0.13356459551302544, + "learning_rate": 6.725339955015777e-05, + "loss": 0.6728, + "step": 6912 + }, + { + "epoch": 0.6181151645207439, + "grad_norm": 0.13099439441963776, + "learning_rate": 6.722603758914092e-05, + "loss": 0.6444, + "step": 6913 + }, + { + "epoch": 0.6182045779685265, + "grad_norm": 0.14667920761090178, + "learning_rate": 6.719867837683672e-05, + "loss": 0.6507, + "step": 6914 + }, + { + "epoch": 0.618293991416309, + "grad_norm": 0.14171231581907712, + "learning_rate": 6.717132191553977e-05, + "loss": 0.6459, + "step": 6915 + }, + { + "epoch": 0.6183834048640916, + "grad_norm": 0.1402569385379344, + "learning_rate": 6.714396820754436e-05, + "loss": 0.6704, + "step": 6916 + }, + { + "epoch": 0.6184728183118741, + "grad_norm": 0.13748111157841816, + "learning_rate": 6.711661725514469e-05, + "loss": 0.6508, + "step": 6917 + }, + { + "epoch": 0.6185622317596566, + "grad_norm": 0.1305902783991508, + "learning_rate": 6.708926906063462e-05, + "loss": 0.6626, + "step": 6918 + }, + { + "epoch": 0.6186516452074392, + "grad_norm": 0.1359615095791053, + "learning_rate": 6.706192362630776e-05, + "loss": 0.633, + "step": 6919 + }, + { + "epoch": 0.6187410586552218, + "grad_norm": 0.14927004363107843, + "learning_rate": 6.70345809544576e-05, + "loss": 0.6881, + "step": 6920 + }, + { + "epoch": 0.6188304721030042, + "grad_norm": 0.13626088268551412, + "learning_rate": 6.700724104737736e-05, + "loss": 0.6413, + "step": 6921 + }, + { + "epoch": 0.6189198855507868, + "grad_norm": 0.1353785013098381, + "learning_rate": 6.697990390735997e-05, + "loss": 0.6639, + "step": 6922 + }, + { + "epoch": 0.6190092989985694, + "grad_norm": 0.13657041284157317, + "learning_rate": 6.695256953669812e-05, + "loss": 0.6441, + "step": 6923 + }, + { + "epoch": 0.619098712446352, + "grad_norm": 0.1357547259828247, + "learning_rate": 6.69252379376844e-05, + "loss": 0.6581, + "step": 6924 + }, + { + "epoch": 0.6191881258941345, + "grad_norm": 0.13369040218570616, + "learning_rate": 6.689790911261099e-05, + "loss": 0.6703, + "step": 6925 + }, + { + "epoch": 0.619277539341917, + "grad_norm": 0.1263209432750091, + "learning_rate": 6.687058306376997e-05, + "loss": 0.5986, + "step": 6926 + }, + { + "epoch": 0.6193669527896996, + "grad_norm": 0.1414502216649402, + "learning_rate": 6.684325979345315e-05, + "loss": 0.6621, + "step": 6927 + }, + { + "epoch": 0.6194563662374821, + "grad_norm": 0.13853820181015716, + "learning_rate": 6.681593930395209e-05, + "loss": 0.6629, + "step": 6928 + }, + { + "epoch": 0.6195457796852647, + "grad_norm": 0.1329569007153714, + "learning_rate": 6.678862159755809e-05, + "loss": 0.6442, + "step": 6929 + }, + { + "epoch": 0.6196351931330472, + "grad_norm": 0.1186371435980077, + "learning_rate": 6.676130667656235e-05, + "loss": 0.6376, + "step": 6930 + }, + { + "epoch": 0.6197246065808297, + "grad_norm": 0.1304505917065453, + "learning_rate": 6.673399454325565e-05, + "loss": 0.6644, + "step": 6931 + }, + { + "epoch": 0.6198140200286123, + "grad_norm": 0.13289890593641351, + "learning_rate": 6.670668519992864e-05, + "loss": 0.6487, + "step": 6932 + }, + { + "epoch": 0.6199034334763949, + "grad_norm": 0.1392659251259473, + "learning_rate": 6.66793786488717e-05, + "loss": 0.6329, + "step": 6933 + }, + { + "epoch": 0.6199928469241774, + "grad_norm": 0.12012838757204235, + "learning_rate": 6.66520748923751e-05, + "loss": 0.6615, + "step": 6934 + }, + { + "epoch": 0.6200822603719599, + "grad_norm": 0.14665869736759615, + "learning_rate": 6.662477393272869e-05, + "loss": 0.6774, + "step": 6935 + }, + { + "epoch": 0.6201716738197425, + "grad_norm": 0.1286945178900153, + "learning_rate": 6.659747577222216e-05, + "loss": 0.6383, + "step": 6936 + }, + { + "epoch": 0.620261087267525, + "grad_norm": 0.11778186284628892, + "learning_rate": 6.657018041314502e-05, + "loss": 0.6196, + "step": 6937 + }, + { + "epoch": 0.6203505007153076, + "grad_norm": 0.12164864303527072, + "learning_rate": 6.654288785778646e-05, + "loss": 0.6542, + "step": 6938 + }, + { + "epoch": 0.6204399141630901, + "grad_norm": 0.14324242495860692, + "learning_rate": 6.651559810843548e-05, + "loss": 0.6409, + "step": 6939 + }, + { + "epoch": 0.6205293276108726, + "grad_norm": 0.14296539269311087, + "learning_rate": 6.648831116738083e-05, + "loss": 0.6804, + "step": 6940 + }, + { + "epoch": 0.6206187410586552, + "grad_norm": 0.13686139321469118, + "learning_rate": 6.646102703691111e-05, + "loss": 0.6425, + "step": 6941 + }, + { + "epoch": 0.6207081545064378, + "grad_norm": 0.1471419557333173, + "learning_rate": 6.643374571931451e-05, + "loss": 0.6324, + "step": 6942 + }, + { + "epoch": 0.6207975679542204, + "grad_norm": 0.13183946295773155, + "learning_rate": 6.640646721687913e-05, + "loss": 0.6619, + "step": 6943 + }, + { + "epoch": 0.6208869814020028, + "grad_norm": 0.14836971711851402, + "learning_rate": 6.637919153189279e-05, + "loss": 0.6798, + "step": 6944 + }, + { + "epoch": 0.6209763948497854, + "grad_norm": 0.15517026560525093, + "learning_rate": 6.635191866664303e-05, + "loss": 0.6942, + "step": 6945 + }, + { + "epoch": 0.621065808297568, + "grad_norm": 0.1332004251621028, + "learning_rate": 6.632464862341721e-05, + "loss": 0.634, + "step": 6946 + }, + { + "epoch": 0.6211552217453505, + "grad_norm": 0.1401294726865475, + "learning_rate": 6.629738140450241e-05, + "loss": 0.6441, + "step": 6947 + }, + { + "epoch": 0.621244635193133, + "grad_norm": 0.1338197887959721, + "learning_rate": 6.62701170121856e-05, + "loss": 0.6034, + "step": 6948 + }, + { + "epoch": 0.6213340486409156, + "grad_norm": 0.13055599987592081, + "learning_rate": 6.62428554487533e-05, + "loss": 0.6449, + "step": 6949 + }, + { + "epoch": 0.6214234620886981, + "grad_norm": 0.1526935445578119, + "learning_rate": 6.621559671649196e-05, + "loss": 0.6359, + "step": 6950 + }, + { + "epoch": 0.6215128755364807, + "grad_norm": 0.13273407084536537, + "learning_rate": 6.618834081768772e-05, + "loss": 0.6207, + "step": 6951 + }, + { + "epoch": 0.6216022889842633, + "grad_norm": 0.13338550844242042, + "learning_rate": 6.616108775462649e-05, + "loss": 0.6226, + "step": 6952 + }, + { + "epoch": 0.6216917024320457, + "grad_norm": 0.12537830936739924, + "learning_rate": 6.613383752959398e-05, + "loss": 0.6424, + "step": 6953 + }, + { + "epoch": 0.6217811158798283, + "grad_norm": 0.12827697568448285, + "learning_rate": 6.610659014487557e-05, + "loss": 0.6682, + "step": 6954 + }, + { + "epoch": 0.6218705293276109, + "grad_norm": 0.13763857077947397, + "learning_rate": 6.607934560275657e-05, + "loss": 0.7004, + "step": 6955 + }, + { + "epoch": 0.6219599427753935, + "grad_norm": 0.13595463898975096, + "learning_rate": 6.605210390552185e-05, + "loss": 0.6494, + "step": 6956 + }, + { + "epoch": 0.6220493562231759, + "grad_norm": 0.15531374993926214, + "learning_rate": 6.602486505545621e-05, + "loss": 0.6594, + "step": 6957 + }, + { + "epoch": 0.6221387696709585, + "grad_norm": 0.13374010572580258, + "learning_rate": 6.59976290548441e-05, + "loss": 0.5924, + "step": 6958 + }, + { + "epoch": 0.6222281831187411, + "grad_norm": 0.13513160280376987, + "learning_rate": 6.597039590596976e-05, + "loss": 0.6374, + "step": 6959 + }, + { + "epoch": 0.6223175965665236, + "grad_norm": 0.1349278914419548, + "learning_rate": 6.594316561111724e-05, + "loss": 0.6615, + "step": 6960 + }, + { + "epoch": 0.6224070100143062, + "grad_norm": 0.1316123053707709, + "learning_rate": 6.591593817257025e-05, + "loss": 0.5893, + "step": 6961 + }, + { + "epoch": 0.6224964234620887, + "grad_norm": 0.14540278052267291, + "learning_rate": 6.58887135926124e-05, + "loss": 0.6447, + "step": 6962 + }, + { + "epoch": 0.6225858369098712, + "grad_norm": 0.14918822790717884, + "learning_rate": 6.58614918735269e-05, + "loss": 0.6383, + "step": 6963 + }, + { + "epoch": 0.6226752503576538, + "grad_norm": 0.14065079708250564, + "learning_rate": 6.58342730175969e-05, + "loss": 0.6495, + "step": 6964 + }, + { + "epoch": 0.6227646638054364, + "grad_norm": 0.13661667564784596, + "learning_rate": 6.580705702710514e-05, + "loss": 0.6636, + "step": 6965 + }, + { + "epoch": 0.6228540772532188, + "grad_norm": 0.14400999671884798, + "learning_rate": 6.577984390433421e-05, + "loss": 0.6679, + "step": 6966 + }, + { + "epoch": 0.6229434907010014, + "grad_norm": 0.15241461130187545, + "learning_rate": 6.575263365156647e-05, + "loss": 0.6491, + "step": 6967 + }, + { + "epoch": 0.623032904148784, + "grad_norm": 0.14389122566040694, + "learning_rate": 6.572542627108393e-05, + "loss": 0.6377, + "step": 6968 + }, + { + "epoch": 0.6231223175965666, + "grad_norm": 0.12730539287461587, + "learning_rate": 6.569822176516853e-05, + "loss": 0.6477, + "step": 6969 + }, + { + "epoch": 0.623211731044349, + "grad_norm": 0.11576715116732246, + "learning_rate": 6.567102013610184e-05, + "loss": 0.589, + "step": 6970 + }, + { + "epoch": 0.6233011444921316, + "grad_norm": 0.13939212015661417, + "learning_rate": 6.564382138616526e-05, + "loss": 0.6745, + "step": 6971 + }, + { + "epoch": 0.6233905579399142, + "grad_norm": 0.16133376174961261, + "learning_rate": 6.561662551763984e-05, + "loss": 0.6206, + "step": 6972 + }, + { + "epoch": 0.6234799713876967, + "grad_norm": 0.13102852019317565, + "learning_rate": 6.558943253280654e-05, + "loss": 0.6032, + "step": 6973 + }, + { + "epoch": 0.6235693848354793, + "grad_norm": 0.12977267565236, + "learning_rate": 6.556224243394599e-05, + "loss": 0.6574, + "step": 6974 + }, + { + "epoch": 0.6236587982832618, + "grad_norm": 0.12117690183097048, + "learning_rate": 6.553505522333853e-05, + "loss": 0.6208, + "step": 6975 + }, + { + "epoch": 0.6237482117310443, + "grad_norm": 0.15210906016817358, + "learning_rate": 6.55078709032644e-05, + "loss": 0.6553, + "step": 6976 + }, + { + "epoch": 0.6238376251788269, + "grad_norm": 0.12364933924145365, + "learning_rate": 6.548068947600346e-05, + "loss": 0.6336, + "step": 6977 + }, + { + "epoch": 0.6239270386266095, + "grad_norm": 0.13033364501900999, + "learning_rate": 6.545351094383544e-05, + "loss": 0.615, + "step": 6978 + }, + { + "epoch": 0.6240164520743919, + "grad_norm": 0.12394420879957867, + "learning_rate": 6.542633530903972e-05, + "loss": 0.6569, + "step": 6979 + }, + { + "epoch": 0.6241058655221745, + "grad_norm": 0.15907711319148013, + "learning_rate": 6.53991625738955e-05, + "loss": 0.7105, + "step": 6980 + }, + { + "epoch": 0.6241952789699571, + "grad_norm": 0.11699639996756858, + "learning_rate": 6.537199274068173e-05, + "loss": 0.6436, + "step": 6981 + }, + { + "epoch": 0.6242846924177397, + "grad_norm": 0.14762526267921994, + "learning_rate": 6.534482581167707e-05, + "loss": 0.6984, + "step": 6982 + }, + { + "epoch": 0.6243741058655222, + "grad_norm": 0.13335397352286515, + "learning_rate": 6.531766178916008e-05, + "loss": 0.6781, + "step": 6983 + }, + { + "epoch": 0.6244635193133047, + "grad_norm": 0.12499222699060293, + "learning_rate": 6.529050067540887e-05, + "loss": 0.6211, + "step": 6984 + }, + { + "epoch": 0.6245529327610873, + "grad_norm": 0.12227144461926796, + "learning_rate": 6.526334247270147e-05, + "loss": 0.6499, + "step": 6985 + }, + { + "epoch": 0.6246423462088698, + "grad_norm": 0.12635776683569372, + "learning_rate": 6.523618718331557e-05, + "loss": 0.6644, + "step": 6986 + }, + { + "epoch": 0.6247317596566524, + "grad_norm": 0.14768207744285142, + "learning_rate": 6.520903480952869e-05, + "loss": 0.6661, + "step": 6987 + }, + { + "epoch": 0.6248211731044349, + "grad_norm": 0.128905885680924, + "learning_rate": 6.518188535361803e-05, + "loss": 0.6418, + "step": 6988 + }, + { + "epoch": 0.6249105865522174, + "grad_norm": 0.1273176574465011, + "learning_rate": 6.515473881786056e-05, + "loss": 0.6273, + "step": 6989 + }, + { + "epoch": 0.625, + "grad_norm": 0.14540098018304268, + "learning_rate": 6.512759520453308e-05, + "loss": 0.6413, + "step": 6990 + }, + { + "epoch": 0.6250894134477826, + "grad_norm": 0.13855769896215778, + "learning_rate": 6.510045451591211e-05, + "loss": 0.5724, + "step": 6991 + }, + { + "epoch": 0.6251788268955651, + "grad_norm": 0.13251689948085546, + "learning_rate": 6.507331675427387e-05, + "loss": 0.6428, + "step": 6992 + }, + { + "epoch": 0.6252682403433476, + "grad_norm": 0.13917885944080757, + "learning_rate": 6.504618192189435e-05, + "loss": 0.6439, + "step": 6993 + }, + { + "epoch": 0.6253576537911302, + "grad_norm": 0.14108062953936149, + "learning_rate": 6.501905002104935e-05, + "loss": 0.6227, + "step": 6994 + }, + { + "epoch": 0.6254470672389127, + "grad_norm": 0.14410271753978746, + "learning_rate": 6.499192105401435e-05, + "loss": 0.6384, + "step": 6995 + }, + { + "epoch": 0.6255364806866953, + "grad_norm": 0.12733470129629804, + "learning_rate": 6.49647950230647e-05, + "loss": 0.6131, + "step": 6996 + }, + { + "epoch": 0.6256258941344778, + "grad_norm": 0.13744463346539448, + "learning_rate": 6.493767193047534e-05, + "loss": 0.6279, + "step": 6997 + }, + { + "epoch": 0.6257153075822603, + "grad_norm": 0.12724089408187877, + "learning_rate": 6.491055177852111e-05, + "loss": 0.6583, + "step": 6998 + }, + { + "epoch": 0.6258047210300429, + "grad_norm": 0.14475768314290255, + "learning_rate": 6.488343456947654e-05, + "loss": 0.6494, + "step": 6999 + }, + { + "epoch": 0.6258941344778255, + "grad_norm": 0.13395985396712123, + "learning_rate": 6.485632030561587e-05, + "loss": 0.6425, + "step": 7000 + }, + { + "epoch": 0.6259835479256081, + "grad_norm": 0.1388598930655007, + "learning_rate": 6.48292089892132e-05, + "loss": 0.6489, + "step": 7001 + }, + { + "epoch": 0.6260729613733905, + "grad_norm": 0.13580978641645763, + "learning_rate": 6.480210062254225e-05, + "loss": 0.656, + "step": 7002 + }, + { + "epoch": 0.6261623748211731, + "grad_norm": 0.1500437886560929, + "learning_rate": 6.477499520787665e-05, + "loss": 0.6115, + "step": 7003 + }, + { + "epoch": 0.6262517882689557, + "grad_norm": 0.13841149657057927, + "learning_rate": 6.474789274748964e-05, + "loss": 0.6644, + "step": 7004 + }, + { + "epoch": 0.6263412017167382, + "grad_norm": 0.13973731618355625, + "learning_rate": 6.472079324365433e-05, + "loss": 0.646, + "step": 7005 + }, + { + "epoch": 0.6264306151645207, + "grad_norm": 0.14489181533527465, + "learning_rate": 6.469369669864346e-05, + "loss": 0.6522, + "step": 7006 + }, + { + "epoch": 0.6265200286123033, + "grad_norm": 0.12442643034394417, + "learning_rate": 6.466660311472962e-05, + "loss": 0.5812, + "step": 7007 + }, + { + "epoch": 0.6266094420600858, + "grad_norm": 0.14294674330834853, + "learning_rate": 6.46395124941851e-05, + "loss": 0.6862, + "step": 7008 + }, + { + "epoch": 0.6266988555078684, + "grad_norm": 0.1373416997433859, + "learning_rate": 6.461242483928194e-05, + "loss": 0.6202, + "step": 7009 + }, + { + "epoch": 0.626788268955651, + "grad_norm": 0.12461728458975072, + "learning_rate": 6.4585340152292e-05, + "loss": 0.6204, + "step": 7010 + }, + { + "epoch": 0.6268776824034334, + "grad_norm": 0.14696715111524986, + "learning_rate": 6.455825843548678e-05, + "loss": 0.6509, + "step": 7011 + }, + { + "epoch": 0.626967095851216, + "grad_norm": 0.11897542237560298, + "learning_rate": 6.453117969113767e-05, + "loss": 0.6214, + "step": 7012 + }, + { + "epoch": 0.6270565092989986, + "grad_norm": 0.14820329589038242, + "learning_rate": 6.450410392151564e-05, + "loss": 0.6676, + "step": 7013 + }, + { + "epoch": 0.6271459227467812, + "grad_norm": 0.13667938602391624, + "learning_rate": 6.447703112889158e-05, + "loss": 0.6079, + "step": 7014 + }, + { + "epoch": 0.6272353361945636, + "grad_norm": 0.12443399048231749, + "learning_rate": 6.4449961315536e-05, + "loss": 0.6534, + "step": 7015 + }, + { + "epoch": 0.6273247496423462, + "grad_norm": 0.11968610079124739, + "learning_rate": 6.44228944837192e-05, + "loss": 0.6321, + "step": 7016 + }, + { + "epoch": 0.6274141630901288, + "grad_norm": 0.1416588408323073, + "learning_rate": 6.43958306357113e-05, + "loss": 0.6544, + "step": 7017 + }, + { + "epoch": 0.6275035765379113, + "grad_norm": 0.14883133631924447, + "learning_rate": 6.43687697737821e-05, + "loss": 0.7111, + "step": 7018 + }, + { + "epoch": 0.6275929899856938, + "grad_norm": 0.12348480588747432, + "learning_rate": 6.434171190020116e-05, + "loss": 0.5883, + "step": 7019 + }, + { + "epoch": 0.6276824034334764, + "grad_norm": 0.12040714266509764, + "learning_rate": 6.431465701723774e-05, + "loss": 0.6107, + "step": 7020 + }, + { + "epoch": 0.6277718168812589, + "grad_norm": 0.13484230381842258, + "learning_rate": 6.428760512716096e-05, + "loss": 0.6087, + "step": 7021 + }, + { + "epoch": 0.6278612303290415, + "grad_norm": 0.1357788043747333, + "learning_rate": 6.426055623223963e-05, + "loss": 0.635, + "step": 7022 + }, + { + "epoch": 0.6279506437768241, + "grad_norm": 0.13712036484651552, + "learning_rate": 6.423351033474223e-05, + "loss": 0.6353, + "step": 7023 + }, + { + "epoch": 0.6280400572246065, + "grad_norm": 0.1371366214976662, + "learning_rate": 6.420646743693714e-05, + "loss": 0.6545, + "step": 7024 + }, + { + "epoch": 0.6281294706723891, + "grad_norm": 0.13340042936453764, + "learning_rate": 6.417942754109242e-05, + "loss": 0.6198, + "step": 7025 + }, + { + "epoch": 0.6282188841201717, + "grad_norm": 0.13474045398397982, + "learning_rate": 6.415239064947587e-05, + "loss": 0.6593, + "step": 7026 + }, + { + "epoch": 0.6283082975679543, + "grad_norm": 0.13399093650160823, + "learning_rate": 6.412535676435498e-05, + "loss": 0.5858, + "step": 7027 + }, + { + "epoch": 0.6283977110157367, + "grad_norm": 0.12118840307857452, + "learning_rate": 6.409832588799713e-05, + "loss": 0.6208, + "step": 7028 + }, + { + "epoch": 0.6284871244635193, + "grad_norm": 0.13285501989124657, + "learning_rate": 6.407129802266932e-05, + "loss": 0.6501, + "step": 7029 + }, + { + "epoch": 0.6285765379113019, + "grad_norm": 0.14287653944047396, + "learning_rate": 6.404427317063832e-05, + "loss": 0.661, + "step": 7030 + }, + { + "epoch": 0.6286659513590844, + "grad_norm": 0.1465075142338195, + "learning_rate": 6.401725133417071e-05, + "loss": 0.6698, + "step": 7031 + }, + { + "epoch": 0.628755364806867, + "grad_norm": 0.13853548380767886, + "learning_rate": 6.39902325155328e-05, + "loss": 0.664, + "step": 7032 + }, + { + "epoch": 0.6288447782546495, + "grad_norm": 0.1313836583310731, + "learning_rate": 6.396321671699061e-05, + "loss": 0.6107, + "step": 7033 + }, + { + "epoch": 0.628934191702432, + "grad_norm": 0.13531492444368343, + "learning_rate": 6.39362039408099e-05, + "loss": 0.6343, + "step": 7034 + }, + { + "epoch": 0.6290236051502146, + "grad_norm": 0.14132904393495033, + "learning_rate": 6.39091941892562e-05, + "loss": 0.6607, + "step": 7035 + }, + { + "epoch": 0.6291130185979972, + "grad_norm": 0.15470777526452625, + "learning_rate": 6.388218746459483e-05, + "loss": 0.6685, + "step": 7036 + }, + { + "epoch": 0.6292024320457796, + "grad_norm": 0.13693917241045417, + "learning_rate": 6.385518376909072e-05, + "loss": 0.6662, + "step": 7037 + }, + { + "epoch": 0.6292918454935622, + "grad_norm": 0.12541554480681646, + "learning_rate": 6.382818310500871e-05, + "loss": 0.6524, + "step": 7038 + }, + { + "epoch": 0.6293812589413448, + "grad_norm": 0.14140436892924804, + "learning_rate": 6.380118547461334e-05, + "loss": 0.6622, + "step": 7039 + }, + { + "epoch": 0.6294706723891274, + "grad_norm": 0.1355373095077777, + "learning_rate": 6.377419088016881e-05, + "loss": 0.65, + "step": 7040 + }, + { + "epoch": 0.6295600858369099, + "grad_norm": 0.1375643408072905, + "learning_rate": 6.374719932393913e-05, + "loss": 0.6446, + "step": 7041 + }, + { + "epoch": 0.6296494992846924, + "grad_norm": 0.144670729641796, + "learning_rate": 6.372021080818808e-05, + "loss": 0.6612, + "step": 7042 + }, + { + "epoch": 0.629738912732475, + "grad_norm": 0.15783127205288888, + "learning_rate": 6.36932253351791e-05, + "loss": 0.6196, + "step": 7043 + }, + { + "epoch": 0.6298283261802575, + "grad_norm": 0.12984201639950443, + "learning_rate": 6.366624290717548e-05, + "loss": 0.6465, + "step": 7044 + }, + { + "epoch": 0.6299177396280401, + "grad_norm": 0.12838244522002307, + "learning_rate": 6.363926352644019e-05, + "loss": 0.667, + "step": 7045 + }, + { + "epoch": 0.6300071530758226, + "grad_norm": 0.15300603187821205, + "learning_rate": 6.361228719523595e-05, + "loss": 0.6432, + "step": 7046 + }, + { + "epoch": 0.6300965665236051, + "grad_norm": 0.12662976722422017, + "learning_rate": 6.358531391582525e-05, + "loss": 0.6287, + "step": 7047 + }, + { + "epoch": 0.6301859799713877, + "grad_norm": 0.12801301751078395, + "learning_rate": 6.355834369047029e-05, + "loss": 0.6523, + "step": 7048 + }, + { + "epoch": 0.6302753934191703, + "grad_norm": 0.14003350742171541, + "learning_rate": 6.353137652143305e-05, + "loss": 0.6334, + "step": 7049 + }, + { + "epoch": 0.6303648068669528, + "grad_norm": 0.14183624618276122, + "learning_rate": 6.350441241097518e-05, + "loss": 0.6166, + "step": 7050 + }, + { + "epoch": 0.6304542203147353, + "grad_norm": 0.13474024923801972, + "learning_rate": 6.347745136135816e-05, + "loss": 0.6601, + "step": 7051 + }, + { + "epoch": 0.6305436337625179, + "grad_norm": 0.16956877057980482, + "learning_rate": 6.345049337484323e-05, + "loss": 0.694, + "step": 7052 + }, + { + "epoch": 0.6306330472103004, + "grad_norm": 0.14183543084129646, + "learning_rate": 6.342353845369127e-05, + "loss": 0.6413, + "step": 7053 + }, + { + "epoch": 0.630722460658083, + "grad_norm": 0.12779675350274117, + "learning_rate": 6.339658660016295e-05, + "loss": 0.5579, + "step": 7054 + }, + { + "epoch": 0.6308118741058655, + "grad_norm": 0.13732378196138123, + "learning_rate": 6.336963781651873e-05, + "loss": 0.5915, + "step": 7055 + }, + { + "epoch": 0.630901287553648, + "grad_norm": 0.12840571286934818, + "learning_rate": 6.334269210501875e-05, + "loss": 0.6376, + "step": 7056 + }, + { + "epoch": 0.6309907010014306, + "grad_norm": 0.1350294809186858, + "learning_rate": 6.331574946792288e-05, + "loss": 0.6434, + "step": 7057 + }, + { + "epoch": 0.6310801144492132, + "grad_norm": 0.12116452576079499, + "learning_rate": 6.328880990749079e-05, + "loss": 0.6255, + "step": 7058 + }, + { + "epoch": 0.6311695278969958, + "grad_norm": 0.14614860672802302, + "learning_rate": 6.32618734259819e-05, + "loss": 0.7261, + "step": 7059 + }, + { + "epoch": 0.6312589413447782, + "grad_norm": 0.13477807543147804, + "learning_rate": 6.323494002565534e-05, + "loss": 0.681, + "step": 7060 + }, + { + "epoch": 0.6313483547925608, + "grad_norm": 0.13390803557089825, + "learning_rate": 6.320800970876992e-05, + "loss": 0.6413, + "step": 7061 + }, + { + "epoch": 0.6314377682403434, + "grad_norm": 0.12693626089793586, + "learning_rate": 6.318108247758434e-05, + "loss": 0.6422, + "step": 7062 + }, + { + "epoch": 0.6315271816881259, + "grad_norm": 0.13195426634573862, + "learning_rate": 6.315415833435687e-05, + "loss": 0.6786, + "step": 7063 + }, + { + "epoch": 0.6316165951359084, + "grad_norm": 0.12889229240417438, + "learning_rate": 6.312723728134565e-05, + "loss": 0.566, + "step": 7064 + }, + { + "epoch": 0.631706008583691, + "grad_norm": 0.12795868652199296, + "learning_rate": 6.310031932080847e-05, + "loss": 0.6352, + "step": 7065 + }, + { + "epoch": 0.6317954220314735, + "grad_norm": 0.14029402741287472, + "learning_rate": 6.3073404455003e-05, + "loss": 0.6252, + "step": 7066 + }, + { + "epoch": 0.6318848354792561, + "grad_norm": 0.14764678554488397, + "learning_rate": 6.304649268618649e-05, + "loss": 0.6471, + "step": 7067 + }, + { + "epoch": 0.6319742489270386, + "grad_norm": 0.13428591034728915, + "learning_rate": 6.3019584016616e-05, + "loss": 0.6616, + "step": 7068 + }, + { + "epoch": 0.6320636623748211, + "grad_norm": 0.12312071103731628, + "learning_rate": 6.299267844854835e-05, + "loss": 0.6502, + "step": 7069 + }, + { + "epoch": 0.6321530758226037, + "grad_norm": 0.1336774427161807, + "learning_rate": 6.296577598424004e-05, + "loss": 0.6613, + "step": 7070 + }, + { + "epoch": 0.6322424892703863, + "grad_norm": 0.15655657645874843, + "learning_rate": 6.293887662594742e-05, + "loss": 0.672, + "step": 7071 + }, + { + "epoch": 0.6323319027181689, + "grad_norm": 0.12272923481364813, + "learning_rate": 6.291198037592639e-05, + "loss": 0.6132, + "step": 7072 + }, + { + "epoch": 0.6324213161659513, + "grad_norm": 0.13574396997430996, + "learning_rate": 6.288508723643283e-05, + "loss": 0.6618, + "step": 7073 + }, + { + "epoch": 0.6325107296137339, + "grad_norm": 0.13447736442555577, + "learning_rate": 6.285819720972214e-05, + "loss": 0.6341, + "step": 7074 + }, + { + "epoch": 0.6326001430615165, + "grad_norm": 0.12564702765220076, + "learning_rate": 6.283131029804963e-05, + "loss": 0.6281, + "step": 7075 + }, + { + "epoch": 0.632689556509299, + "grad_norm": 0.13932606294604566, + "learning_rate": 6.280442650367025e-05, + "loss": 0.6272, + "step": 7076 + }, + { + "epoch": 0.6327789699570815, + "grad_norm": 0.11244980832296855, + "learning_rate": 6.277754582883866e-05, + "loss": 0.6115, + "step": 7077 + }, + { + "epoch": 0.6328683834048641, + "grad_norm": 0.13262770452931644, + "learning_rate": 6.275066827580933e-05, + "loss": 0.6384, + "step": 7078 + }, + { + "epoch": 0.6329577968526466, + "grad_norm": 0.15740654842564097, + "learning_rate": 6.272379384683651e-05, + "loss": 0.6606, + "step": 7079 + }, + { + "epoch": 0.6330472103004292, + "grad_norm": 0.14335779930346898, + "learning_rate": 6.269692254417408e-05, + "loss": 0.6943, + "step": 7080 + }, + { + "epoch": 0.6331366237482118, + "grad_norm": 0.13523721269304206, + "learning_rate": 6.267005437007567e-05, + "loss": 0.596, + "step": 7081 + }, + { + "epoch": 0.6332260371959942, + "grad_norm": 0.11830498654647598, + "learning_rate": 6.264318932679476e-05, + "loss": 0.6616, + "step": 7082 + }, + { + "epoch": 0.6333154506437768, + "grad_norm": 0.1388906827255159, + "learning_rate": 6.261632741658443e-05, + "loss": 0.6589, + "step": 7083 + }, + { + "epoch": 0.6334048640915594, + "grad_norm": 0.11768383144344507, + "learning_rate": 6.258946864169757e-05, + "loss": 0.604, + "step": 7084 + }, + { + "epoch": 0.633494277539342, + "grad_norm": 0.13285185776806974, + "learning_rate": 6.256261300438676e-05, + "loss": 0.6218, + "step": 7085 + }, + { + "epoch": 0.6335836909871244, + "grad_norm": 0.12478713973726323, + "learning_rate": 6.253576050690442e-05, + "loss": 0.6243, + "step": 7086 + }, + { + "epoch": 0.633673104434907, + "grad_norm": 0.13394443911380272, + "learning_rate": 6.250891115150261e-05, + "loss": 0.6371, + "step": 7087 + }, + { + "epoch": 0.6337625178826896, + "grad_norm": 0.1585687923060831, + "learning_rate": 6.248206494043313e-05, + "loss": 0.6629, + "step": 7088 + }, + { + "epoch": 0.6338519313304721, + "grad_norm": 0.13179580415786557, + "learning_rate": 6.245522187594757e-05, + "loss": 0.6317, + "step": 7089 + }, + { + "epoch": 0.6339413447782547, + "grad_norm": 0.12324356219056913, + "learning_rate": 6.242838196029719e-05, + "loss": 0.6148, + "step": 7090 + }, + { + "epoch": 0.6340307582260372, + "grad_norm": 0.14086531189467985, + "learning_rate": 6.240154519573304e-05, + "loss": 0.6723, + "step": 7091 + }, + { + "epoch": 0.6341201716738197, + "grad_norm": 0.15624163554909393, + "learning_rate": 6.237471158450585e-05, + "loss": 0.6673, + "step": 7092 + }, + { + "epoch": 0.6342095851216023, + "grad_norm": 0.121004254260364, + "learning_rate": 6.234788112886623e-05, + "loss": 0.6098, + "step": 7093 + }, + { + "epoch": 0.6342989985693849, + "grad_norm": 0.13089172568647453, + "learning_rate": 6.232105383106432e-05, + "loss": 0.6495, + "step": 7094 + }, + { + "epoch": 0.6343884120171673, + "grad_norm": 0.1369335832820001, + "learning_rate": 6.22942296933501e-05, + "loss": 0.6632, + "step": 7095 + }, + { + "epoch": 0.6344778254649499, + "grad_norm": 0.1422499075382307, + "learning_rate": 6.226740871797334e-05, + "loss": 0.6489, + "step": 7096 + }, + { + "epoch": 0.6345672389127325, + "grad_norm": 0.13785680097888486, + "learning_rate": 6.224059090718341e-05, + "loss": 0.6754, + "step": 7097 + }, + { + "epoch": 0.634656652360515, + "grad_norm": 0.14121870570389408, + "learning_rate": 6.221377626322953e-05, + "loss": 0.6395, + "step": 7098 + }, + { + "epoch": 0.6347460658082976, + "grad_norm": 0.14440392286502607, + "learning_rate": 6.218696478836058e-05, + "loss": 0.6058, + "step": 7099 + }, + { + "epoch": 0.6348354792560801, + "grad_norm": 0.1380916199428975, + "learning_rate": 6.216015648482525e-05, + "loss": 0.6369, + "step": 7100 + }, + { + "epoch": 0.6349248927038627, + "grad_norm": 0.12130820604892859, + "learning_rate": 6.21333513548719e-05, + "loss": 0.6083, + "step": 7101 + }, + { + "epoch": 0.6350143061516452, + "grad_norm": 0.1358771852032607, + "learning_rate": 6.210654940074861e-05, + "loss": 0.636, + "step": 7102 + }, + { + "epoch": 0.6351037195994278, + "grad_norm": 0.12492470993170414, + "learning_rate": 6.20797506247033e-05, + "loss": 0.6074, + "step": 7103 + }, + { + "epoch": 0.6351931330472103, + "grad_norm": 0.14359310491555521, + "learning_rate": 6.205295502898348e-05, + "loss": 0.6881, + "step": 7104 + }, + { + "epoch": 0.6352825464949928, + "grad_norm": 0.12896300409097752, + "learning_rate": 6.202616261583652e-05, + "loss": 0.6667, + "step": 7105 + }, + { + "epoch": 0.6353719599427754, + "grad_norm": 0.12117510178459727, + "learning_rate": 6.199937338750939e-05, + "loss": 0.603, + "step": 7106 + }, + { + "epoch": 0.635461373390558, + "grad_norm": 0.13545644030225898, + "learning_rate": 6.197258734624896e-05, + "loss": 0.6598, + "step": 7107 + }, + { + "epoch": 0.6355507868383404, + "grad_norm": 0.14632232211337723, + "learning_rate": 6.194580449430168e-05, + "loss": 0.6181, + "step": 7108 + }, + { + "epoch": 0.635640200286123, + "grad_norm": 0.13465469997594753, + "learning_rate": 6.191902483391386e-05, + "loss": 0.6398, + "step": 7109 + }, + { + "epoch": 0.6357296137339056, + "grad_norm": 0.14179834742190653, + "learning_rate": 6.18922483673314e-05, + "loss": 0.6028, + "step": 7110 + }, + { + "epoch": 0.6358190271816881, + "grad_norm": 0.11849739025306075, + "learning_rate": 6.186547509680007e-05, + "loss": 0.6275, + "step": 7111 + }, + { + "epoch": 0.6359084406294707, + "grad_norm": 0.14252245516829343, + "learning_rate": 6.183870502456529e-05, + "loss": 0.6601, + "step": 7112 + }, + { + "epoch": 0.6359978540772532, + "grad_norm": 0.13166349254454549, + "learning_rate": 6.181193815287218e-05, + "loss": 0.6556, + "step": 7113 + }, + { + "epoch": 0.6360872675250357, + "grad_norm": 0.13255403093469453, + "learning_rate": 6.178517448396575e-05, + "loss": 0.6797, + "step": 7114 + }, + { + "epoch": 0.6361766809728183, + "grad_norm": 0.13213896421458776, + "learning_rate": 6.175841402009058e-05, + "loss": 0.6541, + "step": 7115 + }, + { + "epoch": 0.6362660944206009, + "grad_norm": 0.13683523769070954, + "learning_rate": 6.173165676349103e-05, + "loss": 0.663, + "step": 7116 + }, + { + "epoch": 0.6363555078683834, + "grad_norm": 0.12703564819069876, + "learning_rate": 6.170490271641123e-05, + "loss": 0.6402, + "step": 7117 + }, + { + "epoch": 0.6364449213161659, + "grad_norm": 0.13110922356875537, + "learning_rate": 6.167815188109496e-05, + "loss": 0.6332, + "step": 7118 + }, + { + "epoch": 0.6365343347639485, + "grad_norm": 0.14366394507362432, + "learning_rate": 6.165140425978584e-05, + "loss": 0.6139, + "step": 7119 + }, + { + "epoch": 0.6366237482117311, + "grad_norm": 0.1333052636061236, + "learning_rate": 6.16246598547271e-05, + "loss": 0.631, + "step": 7120 + }, + { + "epoch": 0.6367131616595136, + "grad_norm": 0.15243185444094332, + "learning_rate": 6.159791866816182e-05, + "loss": 0.6587, + "step": 7121 + }, + { + "epoch": 0.6368025751072961, + "grad_norm": 0.14612720832536702, + "learning_rate": 6.157118070233269e-05, + "loss": 0.669, + "step": 7122 + }, + { + "epoch": 0.6368919885550787, + "grad_norm": 0.13911642520768966, + "learning_rate": 6.154444595948227e-05, + "loss": 0.6564, + "step": 7123 + }, + { + "epoch": 0.6369814020028612, + "grad_norm": 0.12172385392793951, + "learning_rate": 6.15177144418527e-05, + "loss": 0.6346, + "step": 7124 + }, + { + "epoch": 0.6370708154506438, + "grad_norm": 0.139311784091653, + "learning_rate": 6.149098615168594e-05, + "loss": 0.6472, + "step": 7125 + }, + { + "epoch": 0.6371602288984263, + "grad_norm": 0.13348761528475045, + "learning_rate": 6.14642610912237e-05, + "loss": 0.6357, + "step": 7126 + }, + { + "epoch": 0.6372496423462088, + "grad_norm": 0.13538996217745233, + "learning_rate": 6.143753926270727e-05, + "loss": 0.6394, + "step": 7127 + }, + { + "epoch": 0.6373390557939914, + "grad_norm": 0.14434576481852876, + "learning_rate": 6.141082066837791e-05, + "loss": 0.673, + "step": 7128 + }, + { + "epoch": 0.637428469241774, + "grad_norm": 0.1461574612892332, + "learning_rate": 6.13841053104764e-05, + "loss": 0.6762, + "step": 7129 + }, + { + "epoch": 0.6375178826895566, + "grad_norm": 0.13436725514253944, + "learning_rate": 6.135739319124335e-05, + "loss": 0.6166, + "step": 7130 + }, + { + "epoch": 0.637607296137339, + "grad_norm": 0.14579512191725874, + "learning_rate": 6.133068431291904e-05, + "loss": 0.6314, + "step": 7131 + }, + { + "epoch": 0.6376967095851216, + "grad_norm": 0.13859671886265057, + "learning_rate": 6.130397867774357e-05, + "loss": 0.6862, + "step": 7132 + }, + { + "epoch": 0.6377861230329042, + "grad_norm": 0.14555021862507211, + "learning_rate": 6.127727628795668e-05, + "loss": 0.6412, + "step": 7133 + }, + { + "epoch": 0.6378755364806867, + "grad_norm": 0.1432750132255979, + "learning_rate": 6.12505771457978e-05, + "loss": 0.6739, + "step": 7134 + }, + { + "epoch": 0.6379649499284692, + "grad_norm": 0.13763115262408168, + "learning_rate": 6.122388125350625e-05, + "loss": 0.6441, + "step": 7135 + }, + { + "epoch": 0.6380543633762518, + "grad_norm": 0.13710317442617187, + "learning_rate": 6.119718861332098e-05, + "loss": 0.6425, + "step": 7136 + }, + { + "epoch": 0.6381437768240343, + "grad_norm": 0.14401443031500186, + "learning_rate": 6.117049922748063e-05, + "loss": 0.6286, + "step": 7137 + }, + { + "epoch": 0.6382331902718169, + "grad_norm": 0.12957227871411772, + "learning_rate": 6.114381309822359e-05, + "loss": 0.6382, + "step": 7138 + }, + { + "epoch": 0.6383226037195995, + "grad_norm": 0.12403399667967459, + "learning_rate": 6.111713022778804e-05, + "loss": 0.6252, + "step": 7139 + }, + { + "epoch": 0.6384120171673819, + "grad_norm": 0.1445788909492414, + "learning_rate": 6.109045061841183e-05, + "loss": 0.6568, + "step": 7140 + }, + { + "epoch": 0.6385014306151645, + "grad_norm": 0.15188614192862115, + "learning_rate": 6.106377427233247e-05, + "loss": 0.6509, + "step": 7141 + }, + { + "epoch": 0.6385908440629471, + "grad_norm": 0.1320657003845947, + "learning_rate": 6.103710119178738e-05, + "loss": 0.5948, + "step": 7142 + }, + { + "epoch": 0.6386802575107297, + "grad_norm": 0.12747226523486183, + "learning_rate": 6.1010431379013585e-05, + "loss": 0.6563, + "step": 7143 + }, + { + "epoch": 0.6387696709585121, + "grad_norm": 0.12879895883686465, + "learning_rate": 6.098376483624781e-05, + "loss": 0.6309, + "step": 7144 + }, + { + "epoch": 0.6388590844062947, + "grad_norm": 0.12853693238413155, + "learning_rate": 6.095710156572654e-05, + "loss": 0.6237, + "step": 7145 + }, + { + "epoch": 0.6389484978540773, + "grad_norm": 0.14612752554204878, + "learning_rate": 6.0930441569686036e-05, + "loss": 0.6252, + "step": 7146 + }, + { + "epoch": 0.6390379113018598, + "grad_norm": 0.13055041257838354, + "learning_rate": 6.090378485036221e-05, + "loss": 0.6557, + "step": 7147 + }, + { + "epoch": 0.6391273247496424, + "grad_norm": 0.12325842229566554, + "learning_rate": 6.0877131409990684e-05, + "loss": 0.6428, + "step": 7148 + }, + { + "epoch": 0.6392167381974249, + "grad_norm": 0.14361640853275476, + "learning_rate": 6.085048125080692e-05, + "loss": 0.659, + "step": 7149 + }, + { + "epoch": 0.6393061516452074, + "grad_norm": 0.12596999468085465, + "learning_rate": 6.082383437504604e-05, + "loss": 0.6265, + "step": 7150 + }, + { + "epoch": 0.63939556509299, + "grad_norm": 0.12766229886039965, + "learning_rate": 6.079719078494286e-05, + "loss": 0.5668, + "step": 7151 + }, + { + "epoch": 0.6394849785407726, + "grad_norm": 0.1382924186783482, + "learning_rate": 6.0770550482731924e-05, + "loss": 0.6161, + "step": 7152 + }, + { + "epoch": 0.639574391988555, + "grad_norm": 0.13804803349676434, + "learning_rate": 6.0743913470647564e-05, + "loss": 0.6389, + "step": 7153 + }, + { + "epoch": 0.6396638054363376, + "grad_norm": 0.1311209198262294, + "learning_rate": 6.071727975092376e-05, + "loss": 0.6622, + "step": 7154 + }, + { + "epoch": 0.6397532188841202, + "grad_norm": 0.12621842336942304, + "learning_rate": 6.069064932579423e-05, + "loss": 0.6543, + "step": 7155 + }, + { + "epoch": 0.6398426323319027, + "grad_norm": 0.14203010945869818, + "learning_rate": 6.0664022197492475e-05, + "loss": 0.6479, + "step": 7156 + }, + { + "epoch": 0.6399320457796852, + "grad_norm": 0.13625534039555673, + "learning_rate": 6.0637398368251705e-05, + "loss": 0.6482, + "step": 7157 + }, + { + "epoch": 0.6400214592274678, + "grad_norm": 0.11881306698816589, + "learning_rate": 6.06107778403048e-05, + "loss": 0.5985, + "step": 7158 + }, + { + "epoch": 0.6401108726752504, + "grad_norm": 0.14292181829990447, + "learning_rate": 6.058416061588434e-05, + "loss": 0.6169, + "step": 7159 + }, + { + "epoch": 0.6402002861230329, + "grad_norm": 0.1241049662783293, + "learning_rate": 6.055754669722278e-05, + "loss": 0.6266, + "step": 7160 + }, + { + "epoch": 0.6402896995708155, + "grad_norm": 0.14218595435994746, + "learning_rate": 6.0530936086552095e-05, + "loss": 0.6545, + "step": 7161 + }, + { + "epoch": 0.640379113018598, + "grad_norm": 0.14035655063231015, + "learning_rate": 6.050432878610417e-05, + "loss": 0.6573, + "step": 7162 + }, + { + "epoch": 0.6404685264663805, + "grad_norm": 0.1476915257374427, + "learning_rate": 6.047772479811047e-05, + "loss": 0.6794, + "step": 7163 + }, + { + "epoch": 0.6405579399141631, + "grad_norm": 0.13694187112895162, + "learning_rate": 6.0451124124802275e-05, + "loss": 0.6356, + "step": 7164 + }, + { + "epoch": 0.6406473533619457, + "grad_norm": 0.12186014026446358, + "learning_rate": 6.042452676841053e-05, + "loss": 0.6267, + "step": 7165 + }, + { + "epoch": 0.6407367668097281, + "grad_norm": 0.12900057552459124, + "learning_rate": 6.039793273116594e-05, + "loss": 0.6457, + "step": 7166 + }, + { + "epoch": 0.6408261802575107, + "grad_norm": 0.1384629146044161, + "learning_rate": 6.03713420152989e-05, + "loss": 0.6276, + "step": 7167 + }, + { + "epoch": 0.6409155937052933, + "grad_norm": 0.1407723256788448, + "learning_rate": 6.034475462303952e-05, + "loss": 0.6576, + "step": 7168 + }, + { + "epoch": 0.6410050071530758, + "grad_norm": 0.1317208825163511, + "learning_rate": 6.031817055661769e-05, + "loss": 0.638, + "step": 7169 + }, + { + "epoch": 0.6410944206008584, + "grad_norm": 0.13699795915366092, + "learning_rate": 6.029158981826299e-05, + "loss": 0.6383, + "step": 7170 + }, + { + "epoch": 0.6411838340486409, + "grad_norm": 0.13802868990157724, + "learning_rate": 6.02650124102047e-05, + "loss": 0.6325, + "step": 7171 + }, + { + "epoch": 0.6412732474964234, + "grad_norm": 0.1437288614501165, + "learning_rate": 6.023843833467182e-05, + "loss": 0.6802, + "step": 7172 + }, + { + "epoch": 0.641362660944206, + "grad_norm": 0.13863701140539583, + "learning_rate": 6.02118675938931e-05, + "loss": 0.6461, + "step": 7173 + }, + { + "epoch": 0.6414520743919886, + "grad_norm": 0.14879224941612787, + "learning_rate": 6.0185300190097004e-05, + "loss": 0.6372, + "step": 7174 + }, + { + "epoch": 0.641541487839771, + "grad_norm": 0.1361334291241788, + "learning_rate": 6.0158736125511664e-05, + "loss": 0.6581, + "step": 7175 + }, + { + "epoch": 0.6416309012875536, + "grad_norm": 0.1439306077897712, + "learning_rate": 6.013217540236502e-05, + "loss": 0.635, + "step": 7176 + }, + { + "epoch": 0.6417203147353362, + "grad_norm": 0.13575647576559025, + "learning_rate": 6.0105618022884694e-05, + "loss": 0.6782, + "step": 7177 + }, + { + "epoch": 0.6418097281831188, + "grad_norm": 0.12530332275468323, + "learning_rate": 6.0079063989298e-05, + "loss": 0.6431, + "step": 7178 + }, + { + "epoch": 0.6418991416309013, + "grad_norm": 0.13567025019941878, + "learning_rate": 6.005251330383199e-05, + "loss": 0.6754, + "step": 7179 + }, + { + "epoch": 0.6419885550786838, + "grad_norm": 0.13975783789785057, + "learning_rate": 6.002596596871346e-05, + "loss": 0.6482, + "step": 7180 + }, + { + "epoch": 0.6420779685264664, + "grad_norm": 0.12606085516522267, + "learning_rate": 5.999942198616888e-05, + "loss": 0.6475, + "step": 7181 + }, + { + "epoch": 0.6421673819742489, + "grad_norm": 0.13238863544317372, + "learning_rate": 5.9972881358424436e-05, + "loss": 0.6203, + "step": 7182 + }, + { + "epoch": 0.6422567954220315, + "grad_norm": 0.147782737487516, + "learning_rate": 5.994634408770612e-05, + "loss": 0.6731, + "step": 7183 + }, + { + "epoch": 0.642346208869814, + "grad_norm": 0.12997049200063981, + "learning_rate": 5.991981017623955e-05, + "loss": 0.6561, + "step": 7184 + }, + { + "epoch": 0.6424356223175965, + "grad_norm": 0.15537969626539433, + "learning_rate": 5.9893279626250124e-05, + "loss": 0.6592, + "step": 7185 + }, + { + "epoch": 0.6425250357653791, + "grad_norm": 0.1247337051688294, + "learning_rate": 5.986675243996286e-05, + "loss": 0.623, + "step": 7186 + }, + { + "epoch": 0.6426144492131617, + "grad_norm": 0.13195517437392748, + "learning_rate": 5.9840228619602636e-05, + "loss": 0.6697, + "step": 7187 + }, + { + "epoch": 0.6427038626609443, + "grad_norm": 0.1395683930003352, + "learning_rate": 5.981370816739389e-05, + "loss": 0.6722, + "step": 7188 + }, + { + "epoch": 0.6427932761087267, + "grad_norm": 0.1263093430225843, + "learning_rate": 5.978719108556094e-05, + "loss": 0.6114, + "step": 7189 + }, + { + "epoch": 0.6428826895565093, + "grad_norm": 0.11624962183600371, + "learning_rate": 5.976067737632769e-05, + "loss": 0.6222, + "step": 7190 + }, + { + "epoch": 0.6429721030042919, + "grad_norm": 0.12444403465561979, + "learning_rate": 5.9734167041917856e-05, + "loss": 0.6274, + "step": 7191 + }, + { + "epoch": 0.6430615164520744, + "grad_norm": 0.1209997635813248, + "learning_rate": 5.9707660084554774e-05, + "loss": 0.6159, + "step": 7192 + }, + { + "epoch": 0.6431509298998569, + "grad_norm": 0.14052588481865172, + "learning_rate": 5.968115650646161e-05, + "loss": 0.6236, + "step": 7193 + }, + { + "epoch": 0.6432403433476395, + "grad_norm": 0.13019152931972375, + "learning_rate": 5.9654656309861155e-05, + "loss": 0.6424, + "step": 7194 + }, + { + "epoch": 0.643329756795422, + "grad_norm": 0.13664054923188415, + "learning_rate": 5.9628159496975935e-05, + "loss": 0.6515, + "step": 7195 + }, + { + "epoch": 0.6434191702432046, + "grad_norm": 0.13057792838173857, + "learning_rate": 5.9601666070028194e-05, + "loss": 0.6454, + "step": 7196 + }, + { + "epoch": 0.6435085836909872, + "grad_norm": 0.14125931604015537, + "learning_rate": 5.9575176031239964e-05, + "loss": 0.6133, + "step": 7197 + }, + { + "epoch": 0.6435979971387696, + "grad_norm": 0.13197508918990455, + "learning_rate": 5.954868938283291e-05, + "loss": 0.6624, + "step": 7198 + }, + { + "epoch": 0.6436874105865522, + "grad_norm": 0.12616911287424182, + "learning_rate": 5.9522206127028414e-05, + "loss": 0.6385, + "step": 7199 + }, + { + "epoch": 0.6437768240343348, + "grad_norm": 0.13059199808160438, + "learning_rate": 5.9495726266047605e-05, + "loss": 0.6423, + "step": 7200 + }, + { + "epoch": 0.6438662374821174, + "grad_norm": 0.12739478723258224, + "learning_rate": 5.9469249802111324e-05, + "loss": 0.6297, + "step": 7201 + }, + { + "epoch": 0.6439556509298998, + "grad_norm": 0.1324779058912722, + "learning_rate": 5.94427767374401e-05, + "loss": 0.6473, + "step": 7202 + }, + { + "epoch": 0.6440450643776824, + "grad_norm": 0.1491263248210106, + "learning_rate": 5.941630707425418e-05, + "loss": 0.7003, + "step": 7203 + }, + { + "epoch": 0.644134477825465, + "grad_norm": 0.1363635222464881, + "learning_rate": 5.938984081477363e-05, + "loss": 0.6282, + "step": 7204 + }, + { + "epoch": 0.6442238912732475, + "grad_norm": 0.13797732228916945, + "learning_rate": 5.936337796121807e-05, + "loss": 0.7065, + "step": 7205 + }, + { + "epoch": 0.64431330472103, + "grad_norm": 0.13428634052299454, + "learning_rate": 5.9336918515806914e-05, + "loss": 0.6326, + "step": 7206 + }, + { + "epoch": 0.6444027181688126, + "grad_norm": 0.15124779904279978, + "learning_rate": 5.931046248075931e-05, + "loss": 0.6666, + "step": 7207 + }, + { + "epoch": 0.6444921316165951, + "grad_norm": 0.12038624725374786, + "learning_rate": 5.9284009858294076e-05, + "loss": 0.6231, + "step": 7208 + }, + { + "epoch": 0.6445815450643777, + "grad_norm": 0.130972805257028, + "learning_rate": 5.925756065062975e-05, + "loss": 0.6459, + "step": 7209 + }, + { + "epoch": 0.6446709585121603, + "grad_norm": 0.15840684798872312, + "learning_rate": 5.9231114859984584e-05, + "loss": 0.7074, + "step": 7210 + }, + { + "epoch": 0.6447603719599427, + "grad_norm": 0.12685057400829644, + "learning_rate": 5.920467248857661e-05, + "loss": 0.6248, + "step": 7211 + }, + { + "epoch": 0.6448497854077253, + "grad_norm": 0.13586158101216517, + "learning_rate": 5.9178233538623486e-05, + "loss": 0.64, + "step": 7212 + }, + { + "epoch": 0.6449391988555079, + "grad_norm": 0.13874839206131745, + "learning_rate": 5.9151798012342605e-05, + "loss": 0.6506, + "step": 7213 + }, + { + "epoch": 0.6450286123032904, + "grad_norm": 0.1303361027527483, + "learning_rate": 5.91253659119511e-05, + "loss": 0.6318, + "step": 7214 + }, + { + "epoch": 0.6451180257510729, + "grad_norm": 0.13963400939622891, + "learning_rate": 5.9098937239665796e-05, + "loss": 0.6155, + "step": 7215 + }, + { + "epoch": 0.6452074391988555, + "grad_norm": 0.1360817583631532, + "learning_rate": 5.9072511997703226e-05, + "loss": 0.6423, + "step": 7216 + }, + { + "epoch": 0.645296852646638, + "grad_norm": 0.12928298256138146, + "learning_rate": 5.904609018827961e-05, + "loss": 0.6428, + "step": 7217 + }, + { + "epoch": 0.6453862660944206, + "grad_norm": 0.14291762549265832, + "learning_rate": 5.9019671813610986e-05, + "loss": 0.6544, + "step": 7218 + }, + { + "epoch": 0.6454756795422032, + "grad_norm": 0.13529352844348932, + "learning_rate": 5.899325687591302e-05, + "loss": 0.6325, + "step": 7219 + }, + { + "epoch": 0.6455650929899857, + "grad_norm": 0.13365314849643223, + "learning_rate": 5.896684537740103e-05, + "loss": 0.6217, + "step": 7220 + }, + { + "epoch": 0.6456545064377682, + "grad_norm": 0.137982059823902, + "learning_rate": 5.89404373202902e-05, + "loss": 0.6453, + "step": 7221 + }, + { + "epoch": 0.6457439198855508, + "grad_norm": 0.14037320408206344, + "learning_rate": 5.891403270679527e-05, + "loss": 0.6671, + "step": 7222 + }, + { + "epoch": 0.6458333333333334, + "grad_norm": 0.14383993331300926, + "learning_rate": 5.8887631539130826e-05, + "loss": 0.6796, + "step": 7223 + }, + { + "epoch": 0.6459227467811158, + "grad_norm": 0.12654807093395248, + "learning_rate": 5.886123381951103e-05, + "loss": 0.6643, + "step": 7224 + }, + { + "epoch": 0.6460121602288984, + "grad_norm": 0.15228443055122515, + "learning_rate": 5.883483955014992e-05, + "loss": 0.665, + "step": 7225 + }, + { + "epoch": 0.646101573676681, + "grad_norm": 0.129758194813115, + "learning_rate": 5.8808448733261076e-05, + "loss": 0.6455, + "step": 7226 + }, + { + "epoch": 0.6461909871244635, + "grad_norm": 0.14065252600857242, + "learning_rate": 5.878206137105791e-05, + "loss": 0.6703, + "step": 7227 + }, + { + "epoch": 0.6462804005722461, + "grad_norm": 0.14380862163243807, + "learning_rate": 5.875567746575348e-05, + "loss": 0.6299, + "step": 7228 + }, + { + "epoch": 0.6463698140200286, + "grad_norm": 0.12160424124427852, + "learning_rate": 5.872929701956054e-05, + "loss": 0.6383, + "step": 7229 + }, + { + "epoch": 0.6464592274678111, + "grad_norm": 0.1402636991622797, + "learning_rate": 5.870292003469164e-05, + "loss": 0.6367, + "step": 7230 + }, + { + "epoch": 0.6465486409155937, + "grad_norm": 0.13201856516730445, + "learning_rate": 5.867654651335893e-05, + "loss": 0.6174, + "step": 7231 + }, + { + "epoch": 0.6466380543633763, + "grad_norm": 0.12399705365838443, + "learning_rate": 5.86501764577744e-05, + "loss": 0.6403, + "step": 7232 + }, + { + "epoch": 0.6467274678111588, + "grad_norm": 0.1452232125049458, + "learning_rate": 5.862380987014959e-05, + "loss": 0.6868, + "step": 7233 + }, + { + "epoch": 0.6468168812589413, + "grad_norm": 0.12324317207602094, + "learning_rate": 5.8597446752695915e-05, + "loss": 0.6159, + "step": 7234 + }, + { + "epoch": 0.6469062947067239, + "grad_norm": 0.14169616435704202, + "learning_rate": 5.857108710762439e-05, + "loss": 0.63, + "step": 7235 + }, + { + "epoch": 0.6469957081545065, + "grad_norm": 0.1542573159369756, + "learning_rate": 5.854473093714572e-05, + "loss": 0.6679, + "step": 7236 + }, + { + "epoch": 0.647085121602289, + "grad_norm": 0.13207709248845972, + "learning_rate": 5.851837824347042e-05, + "loss": 0.6343, + "step": 7237 + }, + { + "epoch": 0.6471745350500715, + "grad_norm": 0.1598071289501819, + "learning_rate": 5.8492029028808615e-05, + "loss": 0.6884, + "step": 7238 + }, + { + "epoch": 0.6472639484978541, + "grad_norm": 0.13545655366358875, + "learning_rate": 5.846568329537023e-05, + "loss": 0.657, + "step": 7239 + }, + { + "epoch": 0.6473533619456366, + "grad_norm": 0.1260197219007553, + "learning_rate": 5.8439341045364815e-05, + "loss": 0.6507, + "step": 7240 + }, + { + "epoch": 0.6474427753934192, + "grad_norm": 0.13763162932755985, + "learning_rate": 5.8413002281001686e-05, + "loss": 0.6397, + "step": 7241 + }, + { + "epoch": 0.6475321888412017, + "grad_norm": 0.1446754918429185, + "learning_rate": 5.8386667004489835e-05, + "loss": 0.6726, + "step": 7242 + }, + { + "epoch": 0.6476216022889842, + "grad_norm": 0.1334665341246519, + "learning_rate": 5.836033521803796e-05, + "loss": 0.629, + "step": 7243 + }, + { + "epoch": 0.6477110157367668, + "grad_norm": 0.12908488968498297, + "learning_rate": 5.833400692385444e-05, + "loss": 0.6489, + "step": 7244 + }, + { + "epoch": 0.6478004291845494, + "grad_norm": 0.12455354370474843, + "learning_rate": 5.8307682124147466e-05, + "loss": 0.6441, + "step": 7245 + }, + { + "epoch": 0.647889842632332, + "grad_norm": 0.14751152166836623, + "learning_rate": 5.8281360821124884e-05, + "loss": 0.6385, + "step": 7246 + }, + { + "epoch": 0.6479792560801144, + "grad_norm": 0.14334034209270596, + "learning_rate": 5.8255043016994145e-05, + "loss": 0.6273, + "step": 7247 + }, + { + "epoch": 0.648068669527897, + "grad_norm": 0.1331278374885474, + "learning_rate": 5.8228728713962543e-05, + "loss": 0.636, + "step": 7248 + }, + { + "epoch": 0.6481580829756796, + "grad_norm": 0.14237620582587718, + "learning_rate": 5.820241791423704e-05, + "loss": 0.6491, + "step": 7249 + }, + { + "epoch": 0.6482474964234621, + "grad_norm": 0.12179246049976136, + "learning_rate": 5.8176110620024236e-05, + "loss": 0.6315, + "step": 7250 + }, + { + "epoch": 0.6483369098712446, + "grad_norm": 0.13013358104350164, + "learning_rate": 5.814980683353053e-05, + "loss": 0.6037, + "step": 7251 + }, + { + "epoch": 0.6484263233190272, + "grad_norm": 0.12571764194948928, + "learning_rate": 5.812350655696197e-05, + "loss": 0.6274, + "step": 7252 + }, + { + "epoch": 0.6485157367668097, + "grad_norm": 0.14961191701807347, + "learning_rate": 5.809720979252435e-05, + "loss": 0.6944, + "step": 7253 + }, + { + "epoch": 0.6486051502145923, + "grad_norm": 0.14822184420562118, + "learning_rate": 5.807091654242318e-05, + "loss": 0.6564, + "step": 7254 + }, + { + "epoch": 0.6486945636623748, + "grad_norm": 0.13584608236626536, + "learning_rate": 5.8044626808863557e-05, + "loss": 0.6027, + "step": 7255 + }, + { + "epoch": 0.6487839771101573, + "grad_norm": 0.13191664566066305, + "learning_rate": 5.801834059405041e-05, + "loss": 0.6308, + "step": 7256 + }, + { + "epoch": 0.6488733905579399, + "grad_norm": 0.1321554931904972, + "learning_rate": 5.799205790018838e-05, + "loss": 0.6375, + "step": 7257 + }, + { + "epoch": 0.6489628040057225, + "grad_norm": 0.1389236923744714, + "learning_rate": 5.796577872948165e-05, + "loss": 0.6774, + "step": 7258 + }, + { + "epoch": 0.649052217453505, + "grad_norm": 0.11223341678042463, + "learning_rate": 5.793950308413432e-05, + "loss": 0.6437, + "step": 7259 + }, + { + "epoch": 0.6491416309012875, + "grad_norm": 0.13000492145161613, + "learning_rate": 5.7913230966350116e-05, + "loss": 0.6487, + "step": 7260 + }, + { + "epoch": 0.6492310443490701, + "grad_norm": 0.13932789270856555, + "learning_rate": 5.788696237833237e-05, + "loss": 0.6186, + "step": 7261 + }, + { + "epoch": 0.6493204577968527, + "grad_norm": 0.1527566928297609, + "learning_rate": 5.786069732228423e-05, + "loss": 0.6314, + "step": 7262 + }, + { + "epoch": 0.6494098712446352, + "grad_norm": 0.13843020852278054, + "learning_rate": 5.783443580040854e-05, + "loss": 0.6471, + "step": 7263 + }, + { + "epoch": 0.6494992846924177, + "grad_norm": 0.14061406810033292, + "learning_rate": 5.780817781490777e-05, + "loss": 0.6496, + "step": 7264 + }, + { + "epoch": 0.6495886981402003, + "grad_norm": 0.16212912105859437, + "learning_rate": 5.778192336798416e-05, + "loss": 0.707, + "step": 7265 + }, + { + "epoch": 0.6496781115879828, + "grad_norm": 0.12696013315121532, + "learning_rate": 5.775567246183966e-05, + "loss": 0.6705, + "step": 7266 + }, + { + "epoch": 0.6497675250357654, + "grad_norm": 0.14472221327448898, + "learning_rate": 5.772942509867588e-05, + "loss": 0.6326, + "step": 7267 + }, + { + "epoch": 0.649856938483548, + "grad_norm": 0.1354972549998005, + "learning_rate": 5.7703181280694184e-05, + "loss": 0.6561, + "step": 7268 + }, + { + "epoch": 0.6499463519313304, + "grad_norm": 0.12598710469729588, + "learning_rate": 5.767694101009562e-05, + "loss": 0.6646, + "step": 7269 + }, + { + "epoch": 0.650035765379113, + "grad_norm": 0.14296097550376086, + "learning_rate": 5.765070428908086e-05, + "loss": 0.6587, + "step": 7270 + }, + { + "epoch": 0.6501251788268956, + "grad_norm": 0.14229960030359748, + "learning_rate": 5.762447111985039e-05, + "loss": 0.5888, + "step": 7271 + }, + { + "epoch": 0.6502145922746781, + "grad_norm": 0.1382968349973054, + "learning_rate": 5.759824150460435e-05, + "loss": 0.6644, + "step": 7272 + }, + { + "epoch": 0.6503040057224606, + "grad_norm": 0.11681203368516016, + "learning_rate": 5.7572015445542594e-05, + "loss": 0.638, + "step": 7273 + }, + { + "epoch": 0.6503934191702432, + "grad_norm": 0.13836711817477781, + "learning_rate": 5.7545792944864696e-05, + "loss": 0.6354, + "step": 7274 + }, + { + "epoch": 0.6504828326180258, + "grad_norm": 0.12562250354809473, + "learning_rate": 5.751957400476984e-05, + "loss": 0.6635, + "step": 7275 + }, + { + "epoch": 0.6505722460658083, + "grad_norm": 0.12311175725230762, + "learning_rate": 5.7493358627456995e-05, + "loss": 0.6244, + "step": 7276 + }, + { + "epoch": 0.6506616595135909, + "grad_norm": 0.13447733849944912, + "learning_rate": 5.7467146815124874e-05, + "loss": 0.6076, + "step": 7277 + }, + { + "epoch": 0.6507510729613734, + "grad_norm": 0.13541005650017313, + "learning_rate": 5.744093856997175e-05, + "loss": 0.6911, + "step": 7278 + }, + { + "epoch": 0.6508404864091559, + "grad_norm": 0.14117998141843746, + "learning_rate": 5.741473389419565e-05, + "loss": 0.665, + "step": 7279 + }, + { + "epoch": 0.6509298998569385, + "grad_norm": 0.15429669554552525, + "learning_rate": 5.7388532789994476e-05, + "loss": 0.6433, + "step": 7280 + }, + { + "epoch": 0.6510193133047211, + "grad_norm": 0.1454913325997393, + "learning_rate": 5.7362335259565556e-05, + "loss": 0.6417, + "step": 7281 + }, + { + "epoch": 0.6511087267525035, + "grad_norm": 0.12187146025625745, + "learning_rate": 5.733614130510609e-05, + "loss": 0.6431, + "step": 7282 + }, + { + "epoch": 0.6511981402002861, + "grad_norm": 0.13684540681567886, + "learning_rate": 5.730995092881297e-05, + "loss": 0.6446, + "step": 7283 + }, + { + "epoch": 0.6512875536480687, + "grad_norm": 0.13220876367157264, + "learning_rate": 5.728376413288267e-05, + "loss": 0.6843, + "step": 7284 + }, + { + "epoch": 0.6513769670958512, + "grad_norm": 0.13038788216657396, + "learning_rate": 5.725758091951148e-05, + "loss": 0.6184, + "step": 7285 + }, + { + "epoch": 0.6514663805436338, + "grad_norm": 0.12895057476977606, + "learning_rate": 5.723140129089535e-05, + "loss": 0.6431, + "step": 7286 + }, + { + "epoch": 0.6515557939914163, + "grad_norm": 0.13136615703707427, + "learning_rate": 5.720522524922995e-05, + "loss": 0.5619, + "step": 7287 + }, + { + "epoch": 0.6516452074391988, + "grad_norm": 0.11809902180680425, + "learning_rate": 5.717905279671068e-05, + "loss": 0.6221, + "step": 7288 + }, + { + "epoch": 0.6517346208869814, + "grad_norm": 0.15245052415689403, + "learning_rate": 5.715288393553247e-05, + "loss": 0.6758, + "step": 7289 + }, + { + "epoch": 0.651824034334764, + "grad_norm": 0.1311383028236272, + "learning_rate": 5.712671866789015e-05, + "loss": 0.5987, + "step": 7290 + }, + { + "epoch": 0.6519134477825465, + "grad_norm": 0.13253439926080063, + "learning_rate": 5.710055699597816e-05, + "loss": 0.6455, + "step": 7291 + }, + { + "epoch": 0.652002861230329, + "grad_norm": 0.14178669265669305, + "learning_rate": 5.707439892199068e-05, + "loss": 0.6499, + "step": 7292 + }, + { + "epoch": 0.6520922746781116, + "grad_norm": 0.14143948038558174, + "learning_rate": 5.7048244448121447e-05, + "loss": 0.6861, + "step": 7293 + }, + { + "epoch": 0.6521816881258942, + "grad_norm": 0.13828586691109537, + "learning_rate": 5.7022093576564165e-05, + "loss": 0.6768, + "step": 7294 + }, + { + "epoch": 0.6522711015736766, + "grad_norm": 0.1428950976208985, + "learning_rate": 5.6995946309511924e-05, + "loss": 0.6372, + "step": 7295 + }, + { + "epoch": 0.6523605150214592, + "grad_norm": 0.13495186454373925, + "learning_rate": 5.696980264915777e-05, + "loss": 0.6377, + "step": 7296 + }, + { + "epoch": 0.6524499284692418, + "grad_norm": 0.14794881733309018, + "learning_rate": 5.69436625976943e-05, + "loss": 0.6792, + "step": 7297 + }, + { + "epoch": 0.6525393419170243, + "grad_norm": 0.1311064692053035, + "learning_rate": 5.691752615731384e-05, + "loss": 0.6402, + "step": 7298 + }, + { + "epoch": 0.6526287553648069, + "grad_norm": 0.12802885713171228, + "learning_rate": 5.689139333020842e-05, + "loss": 0.6253, + "step": 7299 + }, + { + "epoch": 0.6527181688125894, + "grad_norm": 0.14167899656376756, + "learning_rate": 5.686526411856978e-05, + "loss": 0.6114, + "step": 7300 + }, + { + "epoch": 0.6528075822603719, + "grad_norm": 0.13607943211740553, + "learning_rate": 5.6839138524589344e-05, + "loss": 0.5652, + "step": 7301 + }, + { + "epoch": 0.6528969957081545, + "grad_norm": 0.14524032530672998, + "learning_rate": 5.681301655045823e-05, + "loss": 0.5954, + "step": 7302 + }, + { + "epoch": 0.6529864091559371, + "grad_norm": 0.132039615502774, + "learning_rate": 5.678689819836731e-05, + "loss": 0.6322, + "step": 7303 + }, + { + "epoch": 0.6530758226037195, + "grad_norm": 0.13633302005667097, + "learning_rate": 5.6760783470506996e-05, + "loss": 0.6429, + "step": 7304 + }, + { + "epoch": 0.6531652360515021, + "grad_norm": 0.14234118149378142, + "learning_rate": 5.673467236906758e-05, + "loss": 0.6353, + "step": 7305 + }, + { + "epoch": 0.6532546494992847, + "grad_norm": 0.1435518697080957, + "learning_rate": 5.6708564896238944e-05, + "loss": 0.6544, + "step": 7306 + }, + { + "epoch": 0.6533440629470673, + "grad_norm": 0.11236634230883104, + "learning_rate": 5.6682461054210635e-05, + "loss": 0.6313, + "step": 7307 + }, + { + "epoch": 0.6534334763948498, + "grad_norm": 0.1285260002456921, + "learning_rate": 5.6656360845172076e-05, + "loss": 0.6411, + "step": 7308 + }, + { + "epoch": 0.6535228898426323, + "grad_norm": 0.13676389797738064, + "learning_rate": 5.663026427131215e-05, + "loss": 0.6541, + "step": 7309 + }, + { + "epoch": 0.6536123032904149, + "grad_norm": 0.1262684987595396, + "learning_rate": 5.6604171334819564e-05, + "loss": 0.6364, + "step": 7310 + }, + { + "epoch": 0.6537017167381974, + "grad_norm": 0.1467555758319018, + "learning_rate": 5.657808203788277e-05, + "loss": 0.6418, + "step": 7311 + }, + { + "epoch": 0.65379113018598, + "grad_norm": 0.13539220388186124, + "learning_rate": 5.6551996382689776e-05, + "loss": 0.6149, + "step": 7312 + }, + { + "epoch": 0.6538805436337625, + "grad_norm": 0.1367305652869935, + "learning_rate": 5.6525914371428344e-05, + "loss": 0.5821, + "step": 7313 + }, + { + "epoch": 0.653969957081545, + "grad_norm": 0.13257450641283333, + "learning_rate": 5.649983600628599e-05, + "loss": 0.6317, + "step": 7314 + }, + { + "epoch": 0.6540593705293276, + "grad_norm": 0.12568850151936886, + "learning_rate": 5.647376128944984e-05, + "loss": 0.6243, + "step": 7315 + }, + { + "epoch": 0.6541487839771102, + "grad_norm": 0.12337931355139099, + "learning_rate": 5.6447690223106775e-05, + "loss": 0.6288, + "step": 7316 + }, + { + "epoch": 0.6542381974248928, + "grad_norm": 0.13433939201154887, + "learning_rate": 5.642162280944336e-05, + "loss": 0.591, + "step": 7317 + }, + { + "epoch": 0.6543276108726752, + "grad_norm": 0.14440111846443024, + "learning_rate": 5.6395559050645794e-05, + "loss": 0.6736, + "step": 7318 + }, + { + "epoch": 0.6544170243204578, + "grad_norm": 0.13193619945645377, + "learning_rate": 5.6369498948900014e-05, + "loss": 0.6379, + "step": 7319 + }, + { + "epoch": 0.6545064377682404, + "grad_norm": 0.12728603447005563, + "learning_rate": 5.63434425063917e-05, + "loss": 0.6593, + "step": 7320 + }, + { + "epoch": 0.6545958512160229, + "grad_norm": 0.15194079390244153, + "learning_rate": 5.6317389725306066e-05, + "loss": 0.6394, + "step": 7321 + }, + { + "epoch": 0.6546852646638054, + "grad_norm": 0.13237676655065905, + "learning_rate": 5.629134060782828e-05, + "loss": 0.623, + "step": 7322 + }, + { + "epoch": 0.654774678111588, + "grad_norm": 0.1549090400707122, + "learning_rate": 5.626529515614294e-05, + "loss": 0.7018, + "step": 7323 + }, + { + "epoch": 0.6548640915593705, + "grad_norm": 0.1475210986702467, + "learning_rate": 5.6239253372434465e-05, + "loss": 0.6747, + "step": 7324 + }, + { + "epoch": 0.6549535050071531, + "grad_norm": 0.12475955879467823, + "learning_rate": 5.621321525888697e-05, + "loss": 0.6587, + "step": 7325 + }, + { + "epoch": 0.6550429184549357, + "grad_norm": 0.15633539602057617, + "learning_rate": 5.618718081768426e-05, + "loss": 0.676, + "step": 7326 + }, + { + "epoch": 0.6551323319027181, + "grad_norm": 0.13117447053541328, + "learning_rate": 5.616115005100975e-05, + "loss": 0.5785, + "step": 7327 + }, + { + "epoch": 0.6552217453505007, + "grad_norm": 0.13460213285151773, + "learning_rate": 5.613512296104663e-05, + "loss": 0.6361, + "step": 7328 + }, + { + "epoch": 0.6553111587982833, + "grad_norm": 0.1285219043531816, + "learning_rate": 5.6109099549977786e-05, + "loss": 0.6426, + "step": 7329 + }, + { + "epoch": 0.6554005722460658, + "grad_norm": 0.14427188959312795, + "learning_rate": 5.608307981998574e-05, + "loss": 0.6395, + "step": 7330 + }, + { + "epoch": 0.6554899856938483, + "grad_norm": 0.1316641906483586, + "learning_rate": 5.6057063773252794e-05, + "loss": 0.6477, + "step": 7331 + }, + { + "epoch": 0.6555793991416309, + "grad_norm": 0.14986416238594696, + "learning_rate": 5.603105141196081e-05, + "loss": 0.7018, + "step": 7332 + }, + { + "epoch": 0.6556688125894135, + "grad_norm": 0.1543817230090694, + "learning_rate": 5.600504273829144e-05, + "loss": 0.6813, + "step": 7333 + }, + { + "epoch": 0.655758226037196, + "grad_norm": 0.12282354344555865, + "learning_rate": 5.5979037754426003e-05, + "loss": 0.608, + "step": 7334 + }, + { + "epoch": 0.6558476394849786, + "grad_norm": 0.13589792377676402, + "learning_rate": 5.5953036462545505e-05, + "loss": 0.6041, + "step": 7335 + }, + { + "epoch": 0.655937052932761, + "grad_norm": 0.16032633479335298, + "learning_rate": 5.592703886483064e-05, + "loss": 0.6354, + "step": 7336 + }, + { + "epoch": 0.6560264663805436, + "grad_norm": 0.1513985774450874, + "learning_rate": 5.590104496346185e-05, + "loss": 0.6558, + "step": 7337 + }, + { + "epoch": 0.6561158798283262, + "grad_norm": 0.11607644579441986, + "learning_rate": 5.5875054760619104e-05, + "loss": 0.6342, + "step": 7338 + }, + { + "epoch": 0.6562052932761088, + "grad_norm": 0.13837753793646485, + "learning_rate": 5.584906825848224e-05, + "loss": 0.6505, + "step": 7339 + }, + { + "epoch": 0.6562947067238912, + "grad_norm": 0.14054814728365314, + "learning_rate": 5.582308545923074e-05, + "loss": 0.6096, + "step": 7340 + }, + { + "epoch": 0.6563841201716738, + "grad_norm": 0.16004259939120763, + "learning_rate": 5.579710636504362e-05, + "loss": 0.6704, + "step": 7341 + }, + { + "epoch": 0.6564735336194564, + "grad_norm": 0.1541722910566911, + "learning_rate": 5.577113097809989e-05, + "loss": 0.6517, + "step": 7342 + }, + { + "epoch": 0.656562947067239, + "grad_norm": 0.1397734913518676, + "learning_rate": 5.574515930057795e-05, + "loss": 0.6367, + "step": 7343 + }, + { + "epoch": 0.6566523605150214, + "grad_norm": 0.1306702968770121, + "learning_rate": 5.571919133465605e-05, + "loss": 0.6155, + "step": 7344 + }, + { + "epoch": 0.656741773962804, + "grad_norm": 0.14129117485045362, + "learning_rate": 5.569322708251215e-05, + "loss": 0.6429, + "step": 7345 + }, + { + "epoch": 0.6568311874105865, + "grad_norm": 0.12846885046024253, + "learning_rate": 5.5667266546323723e-05, + "loss": 0.6362, + "step": 7346 + }, + { + "epoch": 0.6569206008583691, + "grad_norm": 0.15400455905807087, + "learning_rate": 5.564130972826813e-05, + "loss": 0.6801, + "step": 7347 + }, + { + "epoch": 0.6570100143061517, + "grad_norm": 0.1503101374502484, + "learning_rate": 5.561535663052231e-05, + "loss": 0.6242, + "step": 7348 + }, + { + "epoch": 0.6570994277539342, + "grad_norm": 0.14625006740070784, + "learning_rate": 5.558940725526291e-05, + "loss": 0.6556, + "step": 7349 + }, + { + "epoch": 0.6571888412017167, + "grad_norm": 0.12533034214178268, + "learning_rate": 5.5563461604666325e-05, + "loss": 0.6628, + "step": 7350 + }, + { + "epoch": 0.6572782546494993, + "grad_norm": 0.14204309182870564, + "learning_rate": 5.553751968090857e-05, + "loss": 0.6191, + "step": 7351 + }, + { + "epoch": 0.6573676680972819, + "grad_norm": 0.12800828502827527, + "learning_rate": 5.55115814861653e-05, + "loss": 0.6585, + "step": 7352 + }, + { + "epoch": 0.6574570815450643, + "grad_norm": 0.1272311602167524, + "learning_rate": 5.548564702261196e-05, + "loss": 0.6311, + "step": 7353 + }, + { + "epoch": 0.6575464949928469, + "grad_norm": 0.13360936529867853, + "learning_rate": 5.545971629242369e-05, + "loss": 0.6615, + "step": 7354 + }, + { + "epoch": 0.6576359084406295, + "grad_norm": 0.14335343158626704, + "learning_rate": 5.543378929777514e-05, + "loss": 0.6442, + "step": 7355 + }, + { + "epoch": 0.657725321888412, + "grad_norm": 0.12846406658683007, + "learning_rate": 5.540786604084091e-05, + "loss": 0.6252, + "step": 7356 + }, + { + "epoch": 0.6578147353361946, + "grad_norm": 0.15307292339886105, + "learning_rate": 5.538194652379514e-05, + "loss": 0.6647, + "step": 7357 + }, + { + "epoch": 0.6579041487839771, + "grad_norm": 0.14372504765655295, + "learning_rate": 5.5356030748811575e-05, + "loss": 0.6347, + "step": 7358 + }, + { + "epoch": 0.6579935622317596, + "grad_norm": 0.1257553376750081, + "learning_rate": 5.5330118718063795e-05, + "loss": 0.6543, + "step": 7359 + }, + { + "epoch": 0.6580829756795422, + "grad_norm": 0.13468922171160702, + "learning_rate": 5.530421043372507e-05, + "loss": 0.6355, + "step": 7360 + }, + { + "epoch": 0.6581723891273248, + "grad_norm": 0.12365151993588651, + "learning_rate": 5.5278305897968185e-05, + "loss": 0.6088, + "step": 7361 + }, + { + "epoch": 0.6582618025751072, + "grad_norm": 0.16757854757436025, + "learning_rate": 5.525240511296577e-05, + "loss": 0.6764, + "step": 7362 + }, + { + "epoch": 0.6583512160228898, + "grad_norm": 0.11465665463313218, + "learning_rate": 5.522650808089011e-05, + "loss": 0.6191, + "step": 7363 + }, + { + "epoch": 0.6584406294706724, + "grad_norm": 0.12794405218544705, + "learning_rate": 5.520061480391313e-05, + "loss": 0.6268, + "step": 7364 + }, + { + "epoch": 0.658530042918455, + "grad_norm": 0.13797666150910007, + "learning_rate": 5.517472528420653e-05, + "loss": 0.6469, + "step": 7365 + }, + { + "epoch": 0.6586194563662375, + "grad_norm": 0.16143332254442244, + "learning_rate": 5.514883952394154e-05, + "loss": 0.6374, + "step": 7366 + }, + { + "epoch": 0.65870886981402, + "grad_norm": 0.12295265575105252, + "learning_rate": 5.512295752528922e-05, + "loss": 0.6138, + "step": 7367 + }, + { + "epoch": 0.6587982832618026, + "grad_norm": 0.13459412407127885, + "learning_rate": 5.50970792904203e-05, + "loss": 0.6103, + "step": 7368 + }, + { + "epoch": 0.6588876967095851, + "grad_norm": 0.1415023104116087, + "learning_rate": 5.507120482150501e-05, + "loss": 0.6783, + "step": 7369 + }, + { + "epoch": 0.6589771101573677, + "grad_norm": 0.12685383449545787, + "learning_rate": 5.5045334120713565e-05, + "loss": 0.632, + "step": 7370 + }, + { + "epoch": 0.6590665236051502, + "grad_norm": 0.13041403696985832, + "learning_rate": 5.501946719021569e-05, + "loss": 0.6587, + "step": 7371 + }, + { + "epoch": 0.6591559370529327, + "grad_norm": 0.14208772275777923, + "learning_rate": 5.4993604032180746e-05, + "loss": 0.6435, + "step": 7372 + }, + { + "epoch": 0.6592453505007153, + "grad_norm": 0.1297579137944134, + "learning_rate": 5.496774464877787e-05, + "loss": 0.6636, + "step": 7373 + }, + { + "epoch": 0.6593347639484979, + "grad_norm": 0.13351298931030803, + "learning_rate": 5.494188904217592e-05, + "loss": 0.6669, + "step": 7374 + }, + { + "epoch": 0.6594241773962805, + "grad_norm": 0.1276682361592793, + "learning_rate": 5.491603721454327e-05, + "loss": 0.6528, + "step": 7375 + }, + { + "epoch": 0.6595135908440629, + "grad_norm": 0.125702879079446, + "learning_rate": 5.489018916804813e-05, + "loss": 0.6275, + "step": 7376 + }, + { + "epoch": 0.6596030042918455, + "grad_norm": 0.15932476969725803, + "learning_rate": 5.4864344904858345e-05, + "loss": 0.6255, + "step": 7377 + }, + { + "epoch": 0.6596924177396281, + "grad_norm": 0.12685075937761442, + "learning_rate": 5.483850442714145e-05, + "loss": 0.6127, + "step": 7378 + }, + { + "epoch": 0.6597818311874106, + "grad_norm": 0.14634200788501409, + "learning_rate": 5.481266773706468e-05, + "loss": 0.6337, + "step": 7379 + }, + { + "epoch": 0.6598712446351931, + "grad_norm": 0.13688031805616027, + "learning_rate": 5.4786834836794855e-05, + "loss": 0.6719, + "step": 7380 + }, + { + "epoch": 0.6599606580829757, + "grad_norm": 0.13053365653718596, + "learning_rate": 5.4761005728498594e-05, + "loss": 0.6211, + "step": 7381 + }, + { + "epoch": 0.6600500715307582, + "grad_norm": 0.13064720847655426, + "learning_rate": 5.4735180414342134e-05, + "loss": 0.6322, + "step": 7382 + }, + { + "epoch": 0.6601394849785408, + "grad_norm": 0.13736315198592086, + "learning_rate": 5.4709358896491445e-05, + "loss": 0.6678, + "step": 7383 + }, + { + "epoch": 0.6602288984263234, + "grad_norm": 0.11924137743622205, + "learning_rate": 5.468354117711212e-05, + "loss": 0.6278, + "step": 7384 + }, + { + "epoch": 0.6603183118741058, + "grad_norm": 0.14000150700180594, + "learning_rate": 5.465772725836951e-05, + "loss": 0.6547, + "step": 7385 + }, + { + "epoch": 0.6604077253218884, + "grad_norm": 0.1388155168293871, + "learning_rate": 5.463191714242851e-05, + "loss": 0.6384, + "step": 7386 + }, + { + "epoch": 0.660497138769671, + "grad_norm": 0.15064210529505742, + "learning_rate": 5.4606110831453836e-05, + "loss": 0.6357, + "step": 7387 + }, + { + "epoch": 0.6605865522174535, + "grad_norm": 0.13066222503300776, + "learning_rate": 5.458030832760985e-05, + "loss": 0.6102, + "step": 7388 + }, + { + "epoch": 0.660675965665236, + "grad_norm": 0.1483404673104829, + "learning_rate": 5.4554509633060524e-05, + "loss": 0.636, + "step": 7389 + }, + { + "epoch": 0.6607653791130186, + "grad_norm": 0.1498164687358436, + "learning_rate": 5.452871474996955e-05, + "loss": 0.675, + "step": 7390 + }, + { + "epoch": 0.6608547925608012, + "grad_norm": 0.12565504331522062, + "learning_rate": 5.450292368050043e-05, + "loss": 0.6155, + "step": 7391 + }, + { + "epoch": 0.6609442060085837, + "grad_norm": 0.1397876705142112, + "learning_rate": 5.447713642681612e-05, + "loss": 0.6498, + "step": 7392 + }, + { + "epoch": 0.6610336194563662, + "grad_norm": 0.11534722159629948, + "learning_rate": 5.44513529910794e-05, + "loss": 0.5809, + "step": 7393 + }, + { + "epoch": 0.6611230329041488, + "grad_norm": 0.13708794314635928, + "learning_rate": 5.442557337545273e-05, + "loss": 0.6874, + "step": 7394 + }, + { + "epoch": 0.6612124463519313, + "grad_norm": 0.1510696118150404, + "learning_rate": 5.4399797582098144e-05, + "loss": 0.6466, + "step": 7395 + }, + { + "epoch": 0.6613018597997139, + "grad_norm": 0.11643221317861302, + "learning_rate": 5.437402561317746e-05, + "loss": 0.6056, + "step": 7396 + }, + { + "epoch": 0.6613912732474965, + "grad_norm": 0.1421694757655734, + "learning_rate": 5.434825747085215e-05, + "loss": 0.6576, + "step": 7397 + }, + { + "epoch": 0.6614806866952789, + "grad_norm": 0.13550064847661533, + "learning_rate": 5.432249315728336e-05, + "loss": 0.6472, + "step": 7398 + }, + { + "epoch": 0.6615701001430615, + "grad_norm": 0.11184769271929551, + "learning_rate": 5.429673267463193e-05, + "loss": 0.6229, + "step": 7399 + }, + { + "epoch": 0.6616595135908441, + "grad_norm": 0.14846942251425965, + "learning_rate": 5.427097602505831e-05, + "loss": 0.6314, + "step": 7400 + }, + { + "epoch": 0.6617489270386266, + "grad_norm": 0.12450298189583202, + "learning_rate": 5.42452232107227e-05, + "loss": 0.6046, + "step": 7401 + }, + { + "epoch": 0.6618383404864091, + "grad_norm": 0.13418050353720248, + "learning_rate": 5.4219474233785e-05, + "loss": 0.655, + "step": 7402 + }, + { + "epoch": 0.6619277539341917, + "grad_norm": 0.15004673069013374, + "learning_rate": 5.419372909640466e-05, + "loss": 0.6598, + "step": 7403 + }, + { + "epoch": 0.6620171673819742, + "grad_norm": 0.12266372144541278, + "learning_rate": 5.416798780074091e-05, + "loss": 0.6556, + "step": 7404 + }, + { + "epoch": 0.6621065808297568, + "grad_norm": 0.13292069650969235, + "learning_rate": 5.414225034895273e-05, + "loss": 0.6451, + "step": 7405 + }, + { + "epoch": 0.6621959942775394, + "grad_norm": 0.1467416892786956, + "learning_rate": 5.411651674319862e-05, + "loss": 0.6815, + "step": 7406 + }, + { + "epoch": 0.6622854077253219, + "grad_norm": 0.14403335285892585, + "learning_rate": 5.409078698563682e-05, + "loss": 0.6498, + "step": 7407 + }, + { + "epoch": 0.6623748211731044, + "grad_norm": 0.13693577686302022, + "learning_rate": 5.4065061078425315e-05, + "loss": 0.6455, + "step": 7408 + }, + { + "epoch": 0.662464234620887, + "grad_norm": 0.13635547739950413, + "learning_rate": 5.403933902372162e-05, + "loss": 0.6446, + "step": 7409 + }, + { + "epoch": 0.6625536480686696, + "grad_norm": 0.13214291772362632, + "learning_rate": 5.401362082368306e-05, + "loss": 0.6546, + "step": 7410 + }, + { + "epoch": 0.662643061516452, + "grad_norm": 0.1358119606491704, + "learning_rate": 5.3987906480466586e-05, + "loss": 0.6398, + "step": 7411 + }, + { + "epoch": 0.6627324749642346, + "grad_norm": 0.13260568144236448, + "learning_rate": 5.3962195996228825e-05, + "loss": 0.6289, + "step": 7412 + }, + { + "epoch": 0.6628218884120172, + "grad_norm": 0.13591557751547656, + "learning_rate": 5.3936489373126075e-05, + "loss": 0.6211, + "step": 7413 + }, + { + "epoch": 0.6629113018597997, + "grad_norm": 0.12137438653760874, + "learning_rate": 5.391078661331439e-05, + "loss": 0.6358, + "step": 7414 + }, + { + "epoch": 0.6630007153075823, + "grad_norm": 0.13160181955870387, + "learning_rate": 5.388508771894931e-05, + "loss": 0.6323, + "step": 7415 + }, + { + "epoch": 0.6630901287553648, + "grad_norm": 0.14560778628745077, + "learning_rate": 5.385939269218625e-05, + "loss": 0.6743, + "step": 7416 + }, + { + "epoch": 0.6631795422031473, + "grad_norm": 0.13020027013791133, + "learning_rate": 5.383370153518019e-05, + "loss": 0.5914, + "step": 7417 + }, + { + "epoch": 0.6632689556509299, + "grad_norm": 0.12196318168060201, + "learning_rate": 5.3808014250085836e-05, + "loss": 0.6167, + "step": 7418 + }, + { + "epoch": 0.6633583690987125, + "grad_norm": 0.1376915831897266, + "learning_rate": 5.3782330839057573e-05, + "loss": 0.6556, + "step": 7419 + }, + { + "epoch": 0.663447782546495, + "grad_norm": 0.1268412250049469, + "learning_rate": 5.375665130424936e-05, + "loss": 0.6411, + "step": 7420 + }, + { + "epoch": 0.6635371959942775, + "grad_norm": 0.12017186362887294, + "learning_rate": 5.373097564781496e-05, + "loss": 0.6156, + "step": 7421 + }, + { + "epoch": 0.6636266094420601, + "grad_norm": 0.13386324921531198, + "learning_rate": 5.3705303871907795e-05, + "loss": 0.5526, + "step": 7422 + }, + { + "epoch": 0.6637160228898427, + "grad_norm": 0.1378725343761233, + "learning_rate": 5.3679635978680843e-05, + "loss": 0.6505, + "step": 7423 + }, + { + "epoch": 0.6638054363376252, + "grad_norm": 0.14200970346190708, + "learning_rate": 5.365397197028685e-05, + "loss": 0.6111, + "step": 7424 + }, + { + "epoch": 0.6638948497854077, + "grad_norm": 0.13799747019421654, + "learning_rate": 5.3628311848878333e-05, + "loss": 0.635, + "step": 7425 + }, + { + "epoch": 0.6639842632331903, + "grad_norm": 0.14956890911600837, + "learning_rate": 5.360265561660725e-05, + "loss": 0.6533, + "step": 7426 + }, + { + "epoch": 0.6640736766809728, + "grad_norm": 0.15584444128033376, + "learning_rate": 5.35770032756254e-05, + "loss": 0.6119, + "step": 7427 + }, + { + "epoch": 0.6641630901287554, + "grad_norm": 0.13952317155370966, + "learning_rate": 5.3551354828084276e-05, + "loss": 0.6384, + "step": 7428 + }, + { + "epoch": 0.6642525035765379, + "grad_norm": 0.10736455912438986, + "learning_rate": 5.352571027613489e-05, + "loss": 0.6196, + "step": 7429 + }, + { + "epoch": 0.6643419170243204, + "grad_norm": 0.13541464193408828, + "learning_rate": 5.350006962192804e-05, + "loss": 0.6467, + "step": 7430 + }, + { + "epoch": 0.664431330472103, + "grad_norm": 0.1298301693293065, + "learning_rate": 5.34744328676142e-05, + "loss": 0.6348, + "step": 7431 + }, + { + "epoch": 0.6645207439198856, + "grad_norm": 0.11315720681224807, + "learning_rate": 5.344880001534349e-05, + "loss": 0.6656, + "step": 7432 + }, + { + "epoch": 0.664610157367668, + "grad_norm": 0.12054496795825348, + "learning_rate": 5.342317106726574e-05, + "loss": 0.6122, + "step": 7433 + }, + { + "epoch": 0.6646995708154506, + "grad_norm": 0.11274687599990578, + "learning_rate": 5.339754602553034e-05, + "loss": 0.6133, + "step": 7434 + }, + { + "epoch": 0.6647889842632332, + "grad_norm": 0.14337097397681428, + "learning_rate": 5.3371924892286484e-05, + "loss": 0.6365, + "step": 7435 + }, + { + "epoch": 0.6648783977110158, + "grad_norm": 0.11896371195620362, + "learning_rate": 5.3346307669683005e-05, + "loss": 0.6121, + "step": 7436 + }, + { + "epoch": 0.6649678111587983, + "grad_norm": 0.14912410272261095, + "learning_rate": 5.332069435986832e-05, + "loss": 0.7022, + "step": 7437 + }, + { + "epoch": 0.6650572246065808, + "grad_norm": 0.13852921934158788, + "learning_rate": 5.329508496499058e-05, + "loss": 0.6214, + "step": 7438 + }, + { + "epoch": 0.6651466380543634, + "grad_norm": 0.12392149142953361, + "learning_rate": 5.326947948719775e-05, + "loss": 0.6585, + "step": 7439 + }, + { + "epoch": 0.6652360515021459, + "grad_norm": 0.1490058594903041, + "learning_rate": 5.324387792863719e-05, + "loss": 0.6581, + "step": 7440 + }, + { + "epoch": 0.6653254649499285, + "grad_norm": 0.1148904120065793, + "learning_rate": 5.3218280291456126e-05, + "loss": 0.6199, + "step": 7441 + }, + { + "epoch": 0.665414878397711, + "grad_norm": 0.13421146211190454, + "learning_rate": 5.319268657780143e-05, + "loss": 0.5729, + "step": 7442 + }, + { + "epoch": 0.6655042918454935, + "grad_norm": 0.13492439128248093, + "learning_rate": 5.316709678981955e-05, + "loss": 0.6608, + "step": 7443 + }, + { + "epoch": 0.6655937052932761, + "grad_norm": 0.12776219587318377, + "learning_rate": 5.314151092965669e-05, + "loss": 0.6281, + "step": 7444 + }, + { + "epoch": 0.6656831187410587, + "grad_norm": 0.14460312308786522, + "learning_rate": 5.311592899945873e-05, + "loss": 0.6702, + "step": 7445 + }, + { + "epoch": 0.6657725321888412, + "grad_norm": 0.12459630599055216, + "learning_rate": 5.3090351001371185e-05, + "loss": 0.5908, + "step": 7446 + }, + { + "epoch": 0.6658619456366237, + "grad_norm": 0.1283267836321984, + "learning_rate": 5.306477693753924e-05, + "loss": 0.648, + "step": 7447 + }, + { + "epoch": 0.6659513590844063, + "grad_norm": 0.13098222791995487, + "learning_rate": 5.303920681010781e-05, + "loss": 0.6113, + "step": 7448 + }, + { + "epoch": 0.6660407725321889, + "grad_norm": 0.13105832422363167, + "learning_rate": 5.301364062122136e-05, + "loss": 0.632, + "step": 7449 + }, + { + "epoch": 0.6661301859799714, + "grad_norm": 0.11942024852259422, + "learning_rate": 5.298807837302411e-05, + "loss": 0.6374, + "step": 7450 + }, + { + "epoch": 0.6662195994277539, + "grad_norm": 0.13302238003341135, + "learning_rate": 5.2962520067660004e-05, + "loss": 0.6423, + "step": 7451 + }, + { + "epoch": 0.6663090128755365, + "grad_norm": 0.13111152167018036, + "learning_rate": 5.2936965707272446e-05, + "loss": 0.6387, + "step": 7452 + }, + { + "epoch": 0.666398426323319, + "grad_norm": 0.12931187412694944, + "learning_rate": 5.291141529400483e-05, + "loss": 0.6182, + "step": 7453 + }, + { + "epoch": 0.6664878397711016, + "grad_norm": 0.13207553417760975, + "learning_rate": 5.288586882999989e-05, + "loss": 0.6476, + "step": 7454 + }, + { + "epoch": 0.6665772532188842, + "grad_norm": 0.1435672503572563, + "learning_rate": 5.286032631740023e-05, + "loss": 0.6287, + "step": 7455 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.13937317806330213, + "learning_rate": 5.283478775834811e-05, + "loss": 0.6616, + "step": 7456 + }, + { + "epoch": 0.6667560801144492, + "grad_norm": 0.1239790181254793, + "learning_rate": 5.280925315498536e-05, + "loss": 0.6395, + "step": 7457 + }, + { + "epoch": 0.6668454935622318, + "grad_norm": 0.11355430013777336, + "learning_rate": 5.278372250945354e-05, + "loss": 0.6206, + "step": 7458 + }, + { + "epoch": 0.6669349070100143, + "grad_norm": 0.1465736574741933, + "learning_rate": 5.2758195823893896e-05, + "loss": 0.6269, + "step": 7459 + }, + { + "epoch": 0.6670243204577968, + "grad_norm": 0.14896702463199615, + "learning_rate": 5.273267310044732e-05, + "loss": 0.6527, + "step": 7460 + }, + { + "epoch": 0.6671137339055794, + "grad_norm": 0.14228508856905717, + "learning_rate": 5.270715434125435e-05, + "loss": 0.6431, + "step": 7461 + }, + { + "epoch": 0.667203147353362, + "grad_norm": 0.1498024077791319, + "learning_rate": 5.2681639548455284e-05, + "loss": 0.6551, + "step": 7462 + }, + { + "epoch": 0.6672925608011445, + "grad_norm": 0.14237008673908225, + "learning_rate": 5.2656128724189916e-05, + "loss": 0.6605, + "step": 7463 + }, + { + "epoch": 0.6673819742489271, + "grad_norm": 0.12467226630326808, + "learning_rate": 5.263062187059785e-05, + "loss": 0.5997, + "step": 7464 + }, + { + "epoch": 0.6674713876967096, + "grad_norm": 0.15247658893220048, + "learning_rate": 5.260511898981837e-05, + "loss": 0.6595, + "step": 7465 + }, + { + "epoch": 0.6675608011444921, + "grad_norm": 0.12567135135387958, + "learning_rate": 5.2579620083990244e-05, + "loss": 0.5977, + "step": 7466 + }, + { + "epoch": 0.6676502145922747, + "grad_norm": 0.12844502869584584, + "learning_rate": 5.2554125155252175e-05, + "loss": 0.6601, + "step": 7467 + }, + { + "epoch": 0.6677396280400573, + "grad_norm": 0.12824601122947543, + "learning_rate": 5.25286342057423e-05, + "loss": 0.6528, + "step": 7468 + }, + { + "epoch": 0.6678290414878397, + "grad_norm": 0.14386210393873272, + "learning_rate": 5.2503147237598546e-05, + "loss": 0.6435, + "step": 7469 + }, + { + "epoch": 0.6679184549356223, + "grad_norm": 0.14551990770821374, + "learning_rate": 5.247766425295848e-05, + "loss": 0.6204, + "step": 7470 + }, + { + "epoch": 0.6680078683834049, + "grad_norm": 0.13363182747751537, + "learning_rate": 5.245218525395934e-05, + "loss": 0.6283, + "step": 7471 + }, + { + "epoch": 0.6680972818311874, + "grad_norm": 0.1426949124789881, + "learning_rate": 5.242671024273798e-05, + "loss": 0.686, + "step": 7472 + }, + { + "epoch": 0.66818669527897, + "grad_norm": 0.13606339858146194, + "learning_rate": 5.240123922143096e-05, + "loss": 0.6291, + "step": 7473 + }, + { + "epoch": 0.6682761087267525, + "grad_norm": 0.14351267461897582, + "learning_rate": 5.2375772192174534e-05, + "loss": 0.641, + "step": 7474 + }, + { + "epoch": 0.668365522174535, + "grad_norm": 0.11788224027293127, + "learning_rate": 5.235030915710457e-05, + "loss": 0.6281, + "step": 7475 + }, + { + "epoch": 0.6684549356223176, + "grad_norm": 0.12064804892788977, + "learning_rate": 5.2324850118356674e-05, + "loss": 0.6231, + "step": 7476 + }, + { + "epoch": 0.6685443490701002, + "grad_norm": 0.1306484594035293, + "learning_rate": 5.229939507806598e-05, + "loss": 0.6161, + "step": 7477 + }, + { + "epoch": 0.6686337625178826, + "grad_norm": 0.14141408846947204, + "learning_rate": 5.2273944038367416e-05, + "loss": 0.6331, + "step": 7478 + }, + { + "epoch": 0.6687231759656652, + "grad_norm": 0.1339866166207199, + "learning_rate": 5.224849700139557e-05, + "loss": 0.6113, + "step": 7479 + }, + { + "epoch": 0.6688125894134478, + "grad_norm": 0.15019466226743297, + "learning_rate": 5.222305396928453e-05, + "loss": 0.6173, + "step": 7480 + }, + { + "epoch": 0.6689020028612304, + "grad_norm": 0.150081473921078, + "learning_rate": 5.219761494416828e-05, + "loss": 0.6776, + "step": 7481 + }, + { + "epoch": 0.6689914163090128, + "grad_norm": 0.13440387593062464, + "learning_rate": 5.2172179928180395e-05, + "loss": 0.6547, + "step": 7482 + }, + { + "epoch": 0.6690808297567954, + "grad_norm": 0.13573144217332325, + "learning_rate": 5.214674892345397e-05, + "loss": 0.6608, + "step": 7483 + }, + { + "epoch": 0.669170243204578, + "grad_norm": 0.13235220951864338, + "learning_rate": 5.2121321932121916e-05, + "loss": 0.6199, + "step": 7484 + }, + { + "epoch": 0.6692596566523605, + "grad_norm": 0.12144869577488757, + "learning_rate": 5.209589895631681e-05, + "loss": 0.6524, + "step": 7485 + }, + { + "epoch": 0.6693490701001431, + "grad_norm": 0.13202973185318975, + "learning_rate": 5.207047999817076e-05, + "loss": 0.6588, + "step": 7486 + }, + { + "epoch": 0.6694384835479256, + "grad_norm": 0.11664643961870809, + "learning_rate": 5.2045065059815676e-05, + "loss": 0.6214, + "step": 7487 + }, + { + "epoch": 0.6695278969957081, + "grad_norm": 0.13827879714071517, + "learning_rate": 5.201965414338308e-05, + "loss": 0.686, + "step": 7488 + }, + { + "epoch": 0.6696173104434907, + "grad_norm": 0.13499961373370603, + "learning_rate": 5.199424725100413e-05, + "loss": 0.6263, + "step": 7489 + }, + { + "epoch": 0.6697067238912733, + "grad_norm": 0.13578770298764645, + "learning_rate": 5.1968844384809734e-05, + "loss": 0.6601, + "step": 7490 + }, + { + "epoch": 0.6697961373390557, + "grad_norm": 0.13130728767451982, + "learning_rate": 5.194344554693032e-05, + "loss": 0.6475, + "step": 7491 + }, + { + "epoch": 0.6698855507868383, + "grad_norm": 0.13403826752720413, + "learning_rate": 5.1918050739496074e-05, + "loss": 0.6404, + "step": 7492 + }, + { + "epoch": 0.6699749642346209, + "grad_norm": 0.15221811409598016, + "learning_rate": 5.189265996463689e-05, + "loss": 0.629, + "step": 7493 + }, + { + "epoch": 0.6700643776824035, + "grad_norm": 0.1574238145683084, + "learning_rate": 5.186727322448214e-05, + "loss": 0.6475, + "step": 7494 + }, + { + "epoch": 0.670153791130186, + "grad_norm": 0.1507131504703269, + "learning_rate": 5.1841890521161085e-05, + "loss": 0.6693, + "step": 7495 + }, + { + "epoch": 0.6702432045779685, + "grad_norm": 0.13038182002518972, + "learning_rate": 5.181651185680256e-05, + "loss": 0.6443, + "step": 7496 + }, + { + "epoch": 0.6703326180257511, + "grad_norm": 0.12797292254634487, + "learning_rate": 5.1791137233534946e-05, + "loss": 0.6184, + "step": 7497 + }, + { + "epoch": 0.6704220314735336, + "grad_norm": 0.13698670782597053, + "learning_rate": 5.1765766653486446e-05, + "loss": 0.6623, + "step": 7498 + }, + { + "epoch": 0.6705114449213162, + "grad_norm": 0.13144446713794242, + "learning_rate": 5.174040011878487e-05, + "loss": 0.6236, + "step": 7499 + }, + { + "epoch": 0.6706008583690987, + "grad_norm": 0.13763792062637192, + "learning_rate": 5.171503763155758e-05, + "loss": 0.657, + "step": 7500 + }, + { + "epoch": 0.6706902718168812, + "grad_norm": 0.13376368149033713, + "learning_rate": 5.168967919393186e-05, + "loss": 0.6471, + "step": 7501 + }, + { + "epoch": 0.6707796852646638, + "grad_norm": 0.129312812690993, + "learning_rate": 5.166432480803435e-05, + "loss": 0.6319, + "step": 7502 + }, + { + "epoch": 0.6708690987124464, + "grad_norm": 0.15272565207654723, + "learning_rate": 5.1638974475991554e-05, + "loss": 0.6726, + "step": 7503 + }, + { + "epoch": 0.670958512160229, + "grad_norm": 0.1375536517773976, + "learning_rate": 5.1613628199929544e-05, + "loss": 0.6607, + "step": 7504 + }, + { + "epoch": 0.6710479256080114, + "grad_norm": 0.1374698685563626, + "learning_rate": 5.158828598197416e-05, + "loss": 0.6427, + "step": 7505 + }, + { + "epoch": 0.671137339055794, + "grad_norm": 0.13297532164683817, + "learning_rate": 5.1562947824250704e-05, + "loss": 0.6404, + "step": 7506 + }, + { + "epoch": 0.6712267525035766, + "grad_norm": 0.14399331120382747, + "learning_rate": 5.1537613728884335e-05, + "loss": 0.6465, + "step": 7507 + }, + { + "epoch": 0.6713161659513591, + "grad_norm": 0.14734275093391183, + "learning_rate": 5.151228369799976e-05, + "loss": 0.6352, + "step": 7508 + }, + { + "epoch": 0.6714055793991416, + "grad_norm": 0.14317724876510923, + "learning_rate": 5.1486957733721405e-05, + "loss": 0.6879, + "step": 7509 + }, + { + "epoch": 0.6714949928469242, + "grad_norm": 0.13172775914183724, + "learning_rate": 5.146163583817336e-05, + "loss": 0.6102, + "step": 7510 + }, + { + "epoch": 0.6715844062947067, + "grad_norm": 0.12093639399179697, + "learning_rate": 5.143631801347926e-05, + "loss": 0.6109, + "step": 7511 + }, + { + "epoch": 0.6716738197424893, + "grad_norm": 0.1397740165337291, + "learning_rate": 5.14110042617625e-05, + "loss": 0.6527, + "step": 7512 + }, + { + "epoch": 0.6717632331902719, + "grad_norm": 0.1338867566587551, + "learning_rate": 5.138569458514617e-05, + "loss": 0.6766, + "step": 7513 + }, + { + "epoch": 0.6718526466380543, + "grad_norm": 0.16090864426810053, + "learning_rate": 5.136038898575286e-05, + "loss": 0.6528, + "step": 7514 + }, + { + "epoch": 0.6719420600858369, + "grad_norm": 0.14596975846531898, + "learning_rate": 5.133508746570502e-05, + "loss": 0.6608, + "step": 7515 + }, + { + "epoch": 0.6720314735336195, + "grad_norm": 0.1435793519619562, + "learning_rate": 5.130979002712466e-05, + "loss": 0.6549, + "step": 7516 + }, + { + "epoch": 0.672120886981402, + "grad_norm": 0.13812191753365877, + "learning_rate": 5.128449667213337e-05, + "loss": 0.6385, + "step": 7517 + }, + { + "epoch": 0.6722103004291845, + "grad_norm": 0.1415467960770542, + "learning_rate": 5.1259207402852506e-05, + "loss": 0.6359, + "step": 7518 + }, + { + "epoch": 0.6722997138769671, + "grad_norm": 0.1365284021237749, + "learning_rate": 5.1233922221403094e-05, + "loss": 0.6394, + "step": 7519 + }, + { + "epoch": 0.6723891273247496, + "grad_norm": 0.12980861145384243, + "learning_rate": 5.120864112990569e-05, + "loss": 0.6149, + "step": 7520 + }, + { + "epoch": 0.6724785407725322, + "grad_norm": 0.15636645913754288, + "learning_rate": 5.118336413048064e-05, + "loss": 0.6613, + "step": 7521 + }, + { + "epoch": 0.6725679542203148, + "grad_norm": 0.1367514751745414, + "learning_rate": 5.115809122524787e-05, + "loss": 0.6411, + "step": 7522 + }, + { + "epoch": 0.6726573676680973, + "grad_norm": 0.13565032769538965, + "learning_rate": 5.113282241632702e-05, + "loss": 0.6546, + "step": 7523 + }, + { + "epoch": 0.6727467811158798, + "grad_norm": 0.12601680751759403, + "learning_rate": 5.110755770583736e-05, + "loss": 0.6754, + "step": 7524 + }, + { + "epoch": 0.6728361945636624, + "grad_norm": 0.1408539375602835, + "learning_rate": 5.108229709589776e-05, + "loss": 0.6457, + "step": 7525 + }, + { + "epoch": 0.672925608011445, + "grad_norm": 0.1675695110680141, + "learning_rate": 5.1057040588626816e-05, + "loss": 0.6529, + "step": 7526 + }, + { + "epoch": 0.6730150214592274, + "grad_norm": 0.1337426554818667, + "learning_rate": 5.103178818614277e-05, + "loss": 0.6388, + "step": 7527 + }, + { + "epoch": 0.67310443490701, + "grad_norm": 0.1537680420404493, + "learning_rate": 5.100653989056352e-05, + "loss": 0.6346, + "step": 7528 + }, + { + "epoch": 0.6731938483547926, + "grad_norm": 0.1346408369217896, + "learning_rate": 5.098129570400658e-05, + "loss": 0.6052, + "step": 7529 + }, + { + "epoch": 0.6732832618025751, + "grad_norm": 0.13487803613609695, + "learning_rate": 5.095605562858923e-05, + "loss": 0.6675, + "step": 7530 + }, + { + "epoch": 0.6733726752503576, + "grad_norm": 0.11958532516196098, + "learning_rate": 5.093081966642822e-05, + "loss": 0.6214, + "step": 7531 + }, + { + "epoch": 0.6734620886981402, + "grad_norm": 0.12844485337056108, + "learning_rate": 5.09055878196401e-05, + "loss": 0.6396, + "step": 7532 + }, + { + "epoch": 0.6735515021459227, + "grad_norm": 0.14199724714371006, + "learning_rate": 5.0880360090341084e-05, + "loss": 0.6677, + "step": 7533 + }, + { + "epoch": 0.6736409155937053, + "grad_norm": 0.1381054656317306, + "learning_rate": 5.08551364806469e-05, + "loss": 0.678, + "step": 7534 + }, + { + "epoch": 0.6737303290414879, + "grad_norm": 0.14174413357084867, + "learning_rate": 5.0829916992673035e-05, + "loss": 0.6237, + "step": 7535 + }, + { + "epoch": 0.6738197424892703, + "grad_norm": 0.11747148911861711, + "learning_rate": 5.080470162853472e-05, + "loss": 0.633, + "step": 7536 + }, + { + "epoch": 0.6739091559370529, + "grad_norm": 0.11075614724250112, + "learning_rate": 5.0779490390346626e-05, + "loss": 0.5818, + "step": 7537 + }, + { + "epoch": 0.6739985693848355, + "grad_norm": 0.13459605976444597, + "learning_rate": 5.075428328022325e-05, + "loss": 0.6539, + "step": 7538 + }, + { + "epoch": 0.6740879828326181, + "grad_norm": 0.1295011303596294, + "learning_rate": 5.0729080300278676e-05, + "loss": 0.6319, + "step": 7539 + }, + { + "epoch": 0.6741773962804005, + "grad_norm": 0.13754257458303956, + "learning_rate": 5.07038814526266e-05, + "loss": 0.606, + "step": 7540 + }, + { + "epoch": 0.6742668097281831, + "grad_norm": 0.14223682944247362, + "learning_rate": 5.0678686739380455e-05, + "loss": 0.6573, + "step": 7541 + }, + { + "epoch": 0.6743562231759657, + "grad_norm": 0.12239198869404785, + "learning_rate": 5.065349616265329e-05, + "loss": 0.6077, + "step": 7542 + }, + { + "epoch": 0.6744456366237482, + "grad_norm": 0.1394648887144935, + "learning_rate": 5.062830972455781e-05, + "loss": 0.6435, + "step": 7543 + }, + { + "epoch": 0.6745350500715308, + "grad_norm": 0.13718203551405891, + "learning_rate": 5.060312742720639e-05, + "loss": 0.5963, + "step": 7544 + }, + { + "epoch": 0.6746244635193133, + "grad_norm": 0.15412949542944462, + "learning_rate": 5.0577949272711e-05, + "loss": 0.6445, + "step": 7545 + }, + { + "epoch": 0.6747138769670958, + "grad_norm": 0.12942097963263352, + "learning_rate": 5.0552775263183294e-05, + "loss": 0.6575, + "step": 7546 + }, + { + "epoch": 0.6748032904148784, + "grad_norm": 0.13052261065547627, + "learning_rate": 5.052760540073467e-05, + "loss": 0.6014, + "step": 7547 + }, + { + "epoch": 0.674892703862661, + "grad_norm": 0.1366122375692267, + "learning_rate": 5.050243968747599e-05, + "loss": 0.6408, + "step": 7548 + }, + { + "epoch": 0.6749821173104434, + "grad_norm": 0.12932829229062096, + "learning_rate": 5.047727812551786e-05, + "loss": 0.6165, + "step": 7549 + }, + { + "epoch": 0.675071530758226, + "grad_norm": 0.13077757072017285, + "learning_rate": 5.04521207169707e-05, + "loss": 0.652, + "step": 7550 + }, + { + "epoch": 0.6751609442060086, + "grad_norm": 0.14151050652804453, + "learning_rate": 5.0426967463944285e-05, + "loss": 0.6192, + "step": 7551 + }, + { + "epoch": 0.6752503576537912, + "grad_norm": 0.17067815859133836, + "learning_rate": 5.040181836854825e-05, + "loss": 0.7104, + "step": 7552 + }, + { + "epoch": 0.6753397711015737, + "grad_norm": 0.13083731972390583, + "learning_rate": 5.037667343289185e-05, + "loss": 0.619, + "step": 7553 + }, + { + "epoch": 0.6754291845493562, + "grad_norm": 0.13804980187456012, + "learning_rate": 5.035153265908388e-05, + "loss": 0.6606, + "step": 7554 + }, + { + "epoch": 0.6755185979971388, + "grad_norm": 0.12342347711549677, + "learning_rate": 5.032639604923289e-05, + "loss": 0.6375, + "step": 7555 + }, + { + "epoch": 0.6756080114449213, + "grad_norm": 0.1289888884613407, + "learning_rate": 5.0301263605447093e-05, + "loss": 0.6672, + "step": 7556 + }, + { + "epoch": 0.6756974248927039, + "grad_norm": 0.11999048182673894, + "learning_rate": 5.0276135329834284e-05, + "loss": 0.6133, + "step": 7557 + }, + { + "epoch": 0.6757868383404864, + "grad_norm": 0.12433439472426792, + "learning_rate": 5.0251011224502e-05, + "loss": 0.622, + "step": 7558 + }, + { + "epoch": 0.6758762517882689, + "grad_norm": 0.15506899637306734, + "learning_rate": 5.0225891291557284e-05, + "loss": 0.6832, + "step": 7559 + }, + { + "epoch": 0.6759656652360515, + "grad_norm": 0.13503479298134888, + "learning_rate": 5.020077553310694e-05, + "loss": 0.6493, + "step": 7560 + }, + { + "epoch": 0.6760550786838341, + "grad_norm": 0.13878900645854933, + "learning_rate": 5.0175663951257424e-05, + "loss": 0.6473, + "step": 7561 + }, + { + "epoch": 0.6761444921316166, + "grad_norm": 0.11947686377020954, + "learning_rate": 5.015055654811484e-05, + "loss": 0.6156, + "step": 7562 + }, + { + "epoch": 0.6762339055793991, + "grad_norm": 0.12696287567428297, + "learning_rate": 5.012545332578479e-05, + "loss": 0.621, + "step": 7563 + }, + { + "epoch": 0.6763233190271817, + "grad_norm": 0.13705439541372358, + "learning_rate": 5.0100354286372806e-05, + "loss": 0.6279, + "step": 7564 + }, + { + "epoch": 0.6764127324749643, + "grad_norm": 0.1338083743514038, + "learning_rate": 5.007525943198382e-05, + "loss": 0.6517, + "step": 7565 + }, + { + "epoch": 0.6765021459227468, + "grad_norm": 0.14703946980037375, + "learning_rate": 5.0050168764722524e-05, + "loss": 0.6425, + "step": 7566 + }, + { + "epoch": 0.6765915593705293, + "grad_norm": 0.1374351789826795, + "learning_rate": 5.002508228669329e-05, + "loss": 0.6029, + "step": 7567 + }, + { + "epoch": 0.6766809728183119, + "grad_norm": 0.147285357013439, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6429, + "step": 7568 + }, + { + "epoch": 0.6767703862660944, + "grad_norm": 0.13480240966685772, + "learning_rate": 4.9974921906746363e-05, + "loss": 0.6471, + "step": 7569 + }, + { + "epoch": 0.676859799713877, + "grad_norm": 0.14766898272129605, + "learning_rate": 4.9949848009035584e-05, + "loss": 0.6558, + "step": 7570 + }, + { + "epoch": 0.6769492131616596, + "grad_norm": 0.14490710509634985, + "learning_rate": 4.992477830897061e-05, + "loss": 0.6847, + "step": 7571 + }, + { + "epoch": 0.677038626609442, + "grad_norm": 0.1494481971716273, + "learning_rate": 4.989971280865401e-05, + "loss": 0.628, + "step": 7572 + }, + { + "epoch": 0.6771280400572246, + "grad_norm": 0.12722523521688728, + "learning_rate": 4.987465151018802e-05, + "loss": 0.653, + "step": 7573 + }, + { + "epoch": 0.6772174535050072, + "grad_norm": 0.12727779027141656, + "learning_rate": 4.984959441567443e-05, + "loss": 0.6443, + "step": 7574 + }, + { + "epoch": 0.6773068669527897, + "grad_norm": 0.12411360906347513, + "learning_rate": 4.9824541527214797e-05, + "loss": 0.5818, + "step": 7575 + }, + { + "epoch": 0.6773962804005722, + "grad_norm": 0.13049536069346257, + "learning_rate": 4.979949284691031e-05, + "loss": 0.6369, + "step": 7576 + }, + { + "epoch": 0.6774856938483548, + "grad_norm": 0.12230988043152684, + "learning_rate": 4.977444837686165e-05, + "loss": 0.6565, + "step": 7577 + }, + { + "epoch": 0.6775751072961373, + "grad_norm": 0.13191468070162995, + "learning_rate": 4.974940811916943e-05, + "loss": 0.6364, + "step": 7578 + }, + { + "epoch": 0.6776645207439199, + "grad_norm": 0.13013719899382525, + "learning_rate": 4.9724372075933615e-05, + "loss": 0.6347, + "step": 7579 + }, + { + "epoch": 0.6777539341917024, + "grad_norm": 0.13987502453104178, + "learning_rate": 4.9699340249254e-05, + "loss": 0.6293, + "step": 7580 + }, + { + "epoch": 0.677843347639485, + "grad_norm": 0.1498720428350692, + "learning_rate": 4.9674312641230015e-05, + "loss": 0.6273, + "step": 7581 + }, + { + "epoch": 0.6779327610872675, + "grad_norm": 0.13067439553871316, + "learning_rate": 4.9649289253960606e-05, + "loss": 0.6565, + "step": 7582 + }, + { + "epoch": 0.6780221745350501, + "grad_norm": 0.14199998445738285, + "learning_rate": 4.9624270089544464e-05, + "loss": 0.662, + "step": 7583 + }, + { + "epoch": 0.6781115879828327, + "grad_norm": 0.13401768910889458, + "learning_rate": 4.959925515008002e-05, + "loss": 0.6144, + "step": 7584 + }, + { + "epoch": 0.6782010014306151, + "grad_norm": 0.15462371433605887, + "learning_rate": 4.9574244437665154e-05, + "loss": 0.6792, + "step": 7585 + }, + { + "epoch": 0.6782904148783977, + "grad_norm": 0.14164753285502354, + "learning_rate": 4.9549237954397495e-05, + "loss": 0.6312, + "step": 7586 + }, + { + "epoch": 0.6783798283261803, + "grad_norm": 0.12432767712993933, + "learning_rate": 4.952423570237437e-05, + "loss": 0.6084, + "step": 7587 + }, + { + "epoch": 0.6784692417739628, + "grad_norm": 0.14054880422558558, + "learning_rate": 4.949923768369259e-05, + "loss": 0.6966, + "step": 7588 + }, + { + "epoch": 0.6785586552217453, + "grad_norm": 0.13279146628360008, + "learning_rate": 4.9474243900448755e-05, + "loss": 0.6494, + "step": 7589 + }, + { + "epoch": 0.6786480686695279, + "grad_norm": 0.13330024890937914, + "learning_rate": 4.9449254354739074e-05, + "loss": 0.639, + "step": 7590 + }, + { + "epoch": 0.6787374821173104, + "grad_norm": 0.12390259826614841, + "learning_rate": 4.9424269048659375e-05, + "loss": 0.5883, + "step": 7591 + }, + { + "epoch": 0.678826895565093, + "grad_norm": 0.13845737193155477, + "learning_rate": 4.939928798430515e-05, + "loss": 0.6583, + "step": 7592 + }, + { + "epoch": 0.6789163090128756, + "grad_norm": 0.14777995350038003, + "learning_rate": 4.9374311163771567e-05, + "loss": 0.67, + "step": 7593 + }, + { + "epoch": 0.679005722460658, + "grad_norm": 0.12826823929445397, + "learning_rate": 4.9349338589153335e-05, + "loss": 0.6442, + "step": 7594 + }, + { + "epoch": 0.6790951359084406, + "grad_norm": 0.12484339786580029, + "learning_rate": 4.9324370262544905e-05, + "loss": 0.6269, + "step": 7595 + }, + { + "epoch": 0.6791845493562232, + "grad_norm": 0.11608380270230918, + "learning_rate": 4.929940618604037e-05, + "loss": 0.6403, + "step": 7596 + }, + { + "epoch": 0.6792739628040058, + "grad_norm": 0.14378722656892345, + "learning_rate": 4.927444636173334e-05, + "loss": 0.6858, + "step": 7597 + }, + { + "epoch": 0.6793633762517882, + "grad_norm": 0.12101422926350675, + "learning_rate": 4.92494907917173e-05, + "loss": 0.6286, + "step": 7598 + }, + { + "epoch": 0.6794527896995708, + "grad_norm": 0.1276538522918897, + "learning_rate": 4.9224539478085144e-05, + "loss": 0.5419, + "step": 7599 + }, + { + "epoch": 0.6795422031473534, + "grad_norm": 0.13302938195931532, + "learning_rate": 4.919959242292954e-05, + "loss": 0.6591, + "step": 7600 + }, + { + "epoch": 0.6796316165951359, + "grad_norm": 0.13307424874432175, + "learning_rate": 4.9174649628342805e-05, + "loss": 0.5935, + "step": 7601 + }, + { + "epoch": 0.6797210300429185, + "grad_norm": 0.1348062208297238, + "learning_rate": 4.914971109641678e-05, + "loss": 0.6648, + "step": 7602 + }, + { + "epoch": 0.679810443490701, + "grad_norm": 0.11919993033754232, + "learning_rate": 4.912477682924309e-05, + "loss": 0.6181, + "step": 7603 + }, + { + "epoch": 0.6798998569384835, + "grad_norm": 0.14118996417135463, + "learning_rate": 4.909984682891291e-05, + "loss": 0.6599, + "step": 7604 + }, + { + "epoch": 0.6799892703862661, + "grad_norm": 0.1368966413519689, + "learning_rate": 4.907492109751711e-05, + "loss": 0.6534, + "step": 7605 + }, + { + "epoch": 0.6800786838340487, + "grad_norm": 0.13139466063795338, + "learning_rate": 4.904999963714618e-05, + "loss": 0.6322, + "step": 7606 + }, + { + "epoch": 0.6801680972818311, + "grad_norm": 0.12672873032814197, + "learning_rate": 4.902508244989028e-05, + "loss": 0.642, + "step": 7607 + }, + { + "epoch": 0.6802575107296137, + "grad_norm": 0.14794384320014956, + "learning_rate": 4.900016953783912e-05, + "loss": 0.6568, + "step": 7608 + }, + { + "epoch": 0.6803469241773963, + "grad_norm": 0.13756061896210836, + "learning_rate": 4.8975260903082157e-05, + "loss": 0.6377, + "step": 7609 + }, + { + "epoch": 0.6804363376251789, + "grad_norm": 0.15086717147305984, + "learning_rate": 4.895035654770846e-05, + "loss": 0.6486, + "step": 7610 + }, + { + "epoch": 0.6805257510729614, + "grad_norm": 0.14981857162418022, + "learning_rate": 4.892545647380664e-05, + "loss": 0.627, + "step": 7611 + }, + { + "epoch": 0.6806151645207439, + "grad_norm": 0.14687415313222812, + "learning_rate": 4.890056068346518e-05, + "loss": 0.6461, + "step": 7612 + }, + { + "epoch": 0.6807045779685265, + "grad_norm": 0.1295099625825689, + "learning_rate": 4.887566917877194e-05, + "loss": 0.652, + "step": 7613 + }, + { + "epoch": 0.680793991416309, + "grad_norm": 0.16170619612839565, + "learning_rate": 4.885078196181458e-05, + "loss": 0.656, + "step": 7614 + }, + { + "epoch": 0.6808834048640916, + "grad_norm": 0.14303864688901508, + "learning_rate": 4.882589903468041e-05, + "loss": 0.6415, + "step": 7615 + }, + { + "epoch": 0.6809728183118741, + "grad_norm": 0.1650612779274405, + "learning_rate": 4.880102039945624e-05, + "loss": 0.6608, + "step": 7616 + }, + { + "epoch": 0.6810622317596566, + "grad_norm": 0.13873985718733467, + "learning_rate": 4.8776146058228665e-05, + "loss": 0.6139, + "step": 7617 + }, + { + "epoch": 0.6811516452074392, + "grad_norm": 0.13218749963109036, + "learning_rate": 4.875127601308386e-05, + "loss": 0.6373, + "step": 7618 + }, + { + "epoch": 0.6812410586552218, + "grad_norm": 0.12793941946181708, + "learning_rate": 4.8726410266107634e-05, + "loss": 0.6137, + "step": 7619 + }, + { + "epoch": 0.6813304721030042, + "grad_norm": 0.13156310858178966, + "learning_rate": 4.870154881938546e-05, + "loss": 0.6249, + "step": 7620 + }, + { + "epoch": 0.6814198855507868, + "grad_norm": 0.1542109330717543, + "learning_rate": 4.867669167500247e-05, + "loss": 0.6349, + "step": 7621 + }, + { + "epoch": 0.6815092989985694, + "grad_norm": 0.12797127614722567, + "learning_rate": 4.865183883504333e-05, + "loss": 0.6179, + "step": 7622 + }, + { + "epoch": 0.681598712446352, + "grad_norm": 0.14308441632639915, + "learning_rate": 4.862699030159246e-05, + "loss": 0.6483, + "step": 7623 + }, + { + "epoch": 0.6816881258941345, + "grad_norm": 0.1429901446411135, + "learning_rate": 4.86021460767339e-05, + "loss": 0.6565, + "step": 7624 + }, + { + "epoch": 0.681777539341917, + "grad_norm": 0.14514288076342635, + "learning_rate": 4.8577306162551196e-05, + "loss": 0.6262, + "step": 7625 + }, + { + "epoch": 0.6818669527896996, + "grad_norm": 0.125255946254158, + "learning_rate": 4.8552470561127775e-05, + "loss": 0.6155, + "step": 7626 + }, + { + "epoch": 0.6819563662374821, + "grad_norm": 0.12457753161158261, + "learning_rate": 4.852763927454653e-05, + "loss": 0.6484, + "step": 7627 + }, + { + "epoch": 0.6820457796852647, + "grad_norm": 0.14065728315683085, + "learning_rate": 4.850281230489e-05, + "loss": 0.6224, + "step": 7628 + }, + { + "epoch": 0.6821351931330472, + "grad_norm": 0.11753120668231969, + "learning_rate": 4.84779896542404e-05, + "loss": 0.6204, + "step": 7629 + }, + { + "epoch": 0.6822246065808297, + "grad_norm": 0.14438483852751777, + "learning_rate": 4.845317132467963e-05, + "loss": 0.6484, + "step": 7630 + }, + { + "epoch": 0.6823140200286123, + "grad_norm": 0.13896752406601082, + "learning_rate": 4.842835731828908e-05, + "loss": 0.6183, + "step": 7631 + }, + { + "epoch": 0.6824034334763949, + "grad_norm": 0.1458864516132642, + "learning_rate": 4.840354763714991e-05, + "loss": 0.6546, + "step": 7632 + }, + { + "epoch": 0.6824928469241774, + "grad_norm": 0.1523517250648181, + "learning_rate": 4.83787422833429e-05, + "loss": 0.6799, + "step": 7633 + }, + { + "epoch": 0.6825822603719599, + "grad_norm": 0.13344021345060122, + "learning_rate": 4.835394125894843e-05, + "loss": 0.6041, + "step": 7634 + }, + { + "epoch": 0.6826716738197425, + "grad_norm": 0.1437442874446841, + "learning_rate": 4.832914456604658e-05, + "loss": 0.6345, + "step": 7635 + }, + { + "epoch": 0.682761087267525, + "grad_norm": 0.14738888328691513, + "learning_rate": 4.830435220671693e-05, + "loss": 0.6074, + "step": 7636 + }, + { + "epoch": 0.6828505007153076, + "grad_norm": 0.1329315038052847, + "learning_rate": 4.8279564183038825e-05, + "loss": 0.61, + "step": 7637 + }, + { + "epoch": 0.6829399141630901, + "grad_norm": 0.14563802551156824, + "learning_rate": 4.825478049709124e-05, + "loss": 0.6689, + "step": 7638 + }, + { + "epoch": 0.6830293276108726, + "grad_norm": 0.13114073575838286, + "learning_rate": 4.823000115095266e-05, + "loss": 0.6288, + "step": 7639 + }, + { + "epoch": 0.6831187410586552, + "grad_norm": 0.13657531106826215, + "learning_rate": 4.82052261467014e-05, + "loss": 0.6436, + "step": 7640 + }, + { + "epoch": 0.6832081545064378, + "grad_norm": 0.13092962041328338, + "learning_rate": 4.81804554864153e-05, + "loss": 0.6454, + "step": 7641 + }, + { + "epoch": 0.6832975679542204, + "grad_norm": 0.12930910087940686, + "learning_rate": 4.815568917217178e-05, + "loss": 0.6195, + "step": 7642 + }, + { + "epoch": 0.6833869814020028, + "grad_norm": 0.14893260068483707, + "learning_rate": 4.813092720604799e-05, + "loss": 0.6127, + "step": 7643 + }, + { + "epoch": 0.6834763948497854, + "grad_norm": 0.13277735429337442, + "learning_rate": 4.8106169590120745e-05, + "loss": 0.5985, + "step": 7644 + }, + { + "epoch": 0.683565808297568, + "grad_norm": 0.16757221901737498, + "learning_rate": 4.8081416326466346e-05, + "loss": 0.6701, + "step": 7645 + }, + { + "epoch": 0.6836552217453505, + "grad_norm": 0.14482697058668975, + "learning_rate": 4.805666741716085e-05, + "loss": 0.6431, + "step": 7646 + }, + { + "epoch": 0.683744635193133, + "grad_norm": 0.1338077561936149, + "learning_rate": 4.8031922864279924e-05, + "loss": 0.6703, + "step": 7647 + }, + { + "epoch": 0.6838340486409156, + "grad_norm": 0.130994499884751, + "learning_rate": 4.800718266989888e-05, + "loss": 0.6602, + "step": 7648 + }, + { + "epoch": 0.6839234620886981, + "grad_norm": 0.13486818833802225, + "learning_rate": 4.798244683609262e-05, + "loss": 0.6298, + "step": 7649 + }, + { + "epoch": 0.6840128755364807, + "grad_norm": 0.14385699867165538, + "learning_rate": 4.795771536493576e-05, + "loss": 0.6716, + "step": 7650 + }, + { + "epoch": 0.6841022889842633, + "grad_norm": 0.14400931838818007, + "learning_rate": 4.793298825850243e-05, + "loss": 0.6526, + "step": 7651 + }, + { + "epoch": 0.6841917024320457, + "grad_norm": 0.11912572634490098, + "learning_rate": 4.790826551886649e-05, + "loss": 0.6216, + "step": 7652 + }, + { + "epoch": 0.6842811158798283, + "grad_norm": 0.13006558360772724, + "learning_rate": 4.788354714810141e-05, + "loss": 0.6462, + "step": 7653 + }, + { + "epoch": 0.6843705293276109, + "grad_norm": 0.14229511863522384, + "learning_rate": 4.7858833148280294e-05, + "loss": 0.6473, + "step": 7654 + }, + { + "epoch": 0.6844599427753935, + "grad_norm": 0.12801359939439658, + "learning_rate": 4.78341235214759e-05, + "loss": 0.6334, + "step": 7655 + }, + { + "epoch": 0.6845493562231759, + "grad_norm": 0.12783740882387157, + "learning_rate": 4.7809418269760545e-05, + "loss": 0.6382, + "step": 7656 + }, + { + "epoch": 0.6846387696709585, + "grad_norm": 0.1293884089104937, + "learning_rate": 4.778471739520624e-05, + "loss": 0.6376, + "step": 7657 + }, + { + "epoch": 0.6847281831187411, + "grad_norm": 0.1553890180663556, + "learning_rate": 4.7760020899884664e-05, + "loss": 0.6209, + "step": 7658 + }, + { + "epoch": 0.6848175965665236, + "grad_norm": 0.14825489123400692, + "learning_rate": 4.7735328785867004e-05, + "loss": 0.6332, + "step": 7659 + }, + { + "epoch": 0.6849070100143062, + "grad_norm": 0.13620190381542452, + "learning_rate": 4.771064105522417e-05, + "loss": 0.6375, + "step": 7660 + }, + { + "epoch": 0.6849964234620887, + "grad_norm": 0.12946341217078458, + "learning_rate": 4.7685957710026784e-05, + "loss": 0.6252, + "step": 7661 + }, + { + "epoch": 0.6850858369098712, + "grad_norm": 0.14334133083580458, + "learning_rate": 4.766127875234492e-05, + "loss": 0.6146, + "step": 7662 + }, + { + "epoch": 0.6851752503576538, + "grad_norm": 0.14148442323505006, + "learning_rate": 4.763660418424839e-05, + "loss": 0.6664, + "step": 7663 + }, + { + "epoch": 0.6852646638054364, + "grad_norm": 0.15897428113654657, + "learning_rate": 4.7611934007806666e-05, + "loss": 0.677, + "step": 7664 + }, + { + "epoch": 0.6853540772532188, + "grad_norm": 0.1565252975130032, + "learning_rate": 4.758726822508874e-05, + "loss": 0.6569, + "step": 7665 + }, + { + "epoch": 0.6854434907010014, + "grad_norm": 0.12627210110669027, + "learning_rate": 4.756260683816333e-05, + "loss": 0.6159, + "step": 7666 + }, + { + "epoch": 0.685532904148784, + "grad_norm": 0.15087297621881085, + "learning_rate": 4.753794984909874e-05, + "loss": 0.6419, + "step": 7667 + }, + { + "epoch": 0.6856223175965666, + "grad_norm": 0.13325075731740432, + "learning_rate": 4.751329725996295e-05, + "loss": 0.63, + "step": 7668 + }, + { + "epoch": 0.685711731044349, + "grad_norm": 0.13057446070969267, + "learning_rate": 4.748864907282357e-05, + "loss": 0.6304, + "step": 7669 + }, + { + "epoch": 0.6858011444921316, + "grad_norm": 0.1568860430603029, + "learning_rate": 4.746400528974772e-05, + "loss": 0.6443, + "step": 7670 + }, + { + "epoch": 0.6858905579399142, + "grad_norm": 0.13231232875558926, + "learning_rate": 4.7439365912802314e-05, + "loss": 0.6142, + "step": 7671 + }, + { + "epoch": 0.6859799713876967, + "grad_norm": 0.13419407620101828, + "learning_rate": 4.741473094405386e-05, + "loss": 0.6472, + "step": 7672 + }, + { + "epoch": 0.6860693848354793, + "grad_norm": 0.13507059048960554, + "learning_rate": 4.739010038556831e-05, + "loss": 0.6401, + "step": 7673 + }, + { + "epoch": 0.6861587982832618, + "grad_norm": 0.15657176052092242, + "learning_rate": 4.736547423941157e-05, + "loss": 0.6399, + "step": 7674 + }, + { + "epoch": 0.6862482117310443, + "grad_norm": 0.13742086323987238, + "learning_rate": 4.734085250764896e-05, + "loss": 0.6743, + "step": 7675 + }, + { + "epoch": 0.6863376251788269, + "grad_norm": 0.11849038455062204, + "learning_rate": 4.7316235192345416e-05, + "loss": 0.5784, + "step": 7676 + }, + { + "epoch": 0.6864270386266095, + "grad_norm": 0.1338943446724349, + "learning_rate": 4.729162229556561e-05, + "loss": 0.6358, + "step": 7677 + }, + { + "epoch": 0.6865164520743919, + "grad_norm": 0.12660779183450943, + "learning_rate": 4.726701381937382e-05, + "loss": 0.5829, + "step": 7678 + }, + { + "epoch": 0.6866058655221745, + "grad_norm": 0.16753446537107974, + "learning_rate": 4.724240976583386e-05, + "loss": 0.6359, + "step": 7679 + }, + { + "epoch": 0.6866952789699571, + "grad_norm": 0.14098268645001552, + "learning_rate": 4.7217810137009274e-05, + "loss": 0.6525, + "step": 7680 + }, + { + "epoch": 0.6867846924177397, + "grad_norm": 0.14354215999708989, + "learning_rate": 4.7193214934963206e-05, + "loss": 0.6637, + "step": 7681 + }, + { + "epoch": 0.6868741058655222, + "grad_norm": 0.143221087036459, + "learning_rate": 4.716862416175844e-05, + "loss": 0.6267, + "step": 7682 + }, + { + "epoch": 0.6869635193133047, + "grad_norm": 0.14128317215796954, + "learning_rate": 4.7144037819457345e-05, + "loss": 0.6617, + "step": 7683 + }, + { + "epoch": 0.6870529327610873, + "grad_norm": 0.15016833092107984, + "learning_rate": 4.7119455910122e-05, + "loss": 0.6665, + "step": 7684 + }, + { + "epoch": 0.6871423462088698, + "grad_norm": 0.12336261251650447, + "learning_rate": 4.709487843581399e-05, + "loss": 0.6189, + "step": 7685 + }, + { + "epoch": 0.6872317596566524, + "grad_norm": 0.12768118942729342, + "learning_rate": 4.707030539859465e-05, + "loss": 0.6195, + "step": 7686 + }, + { + "epoch": 0.6873211731044349, + "grad_norm": 0.13735794135128604, + "learning_rate": 4.7045736800524856e-05, + "loss": 0.6136, + "step": 7687 + }, + { + "epoch": 0.6874105865522174, + "grad_norm": 0.11711810237962413, + "learning_rate": 4.702117264366517e-05, + "loss": 0.6143, + "step": 7688 + }, + { + "epoch": 0.6875, + "grad_norm": 0.13764125597754798, + "learning_rate": 4.699661293007579e-05, + "loss": 0.639, + "step": 7689 + }, + { + "epoch": 0.6875894134477826, + "grad_norm": 0.14848012963984908, + "learning_rate": 4.6972057661816426e-05, + "loss": 0.691, + "step": 7690 + }, + { + "epoch": 0.6876788268955651, + "grad_norm": 0.13253128170575892, + "learning_rate": 4.6947506840946555e-05, + "loss": 0.6013, + "step": 7691 + }, + { + "epoch": 0.6877682403433476, + "grad_norm": 0.12855061636433457, + "learning_rate": 4.6922960469525245e-05, + "loss": 0.6287, + "step": 7692 + }, + { + "epoch": 0.6878576537911302, + "grad_norm": 0.13665071098034481, + "learning_rate": 4.68984185496111e-05, + "loss": 0.6662, + "step": 7693 + }, + { + "epoch": 0.6879470672389127, + "grad_norm": 0.13027591345418998, + "learning_rate": 4.687388108326243e-05, + "loss": 0.6376, + "step": 7694 + }, + { + "epoch": 0.6880364806866953, + "grad_norm": 0.1387572627951427, + "learning_rate": 4.684934807253727e-05, + "loss": 0.6253, + "step": 7695 + }, + { + "epoch": 0.6881258941344778, + "grad_norm": 0.14255377003891415, + "learning_rate": 4.6824819519493057e-05, + "loss": 0.6691, + "step": 7696 + }, + { + "epoch": 0.6882153075822603, + "grad_norm": 0.12903949349494614, + "learning_rate": 4.6800295426187e-05, + "loss": 0.6573, + "step": 7697 + }, + { + "epoch": 0.6883047210300429, + "grad_norm": 0.13411277626525675, + "learning_rate": 4.677577579467597e-05, + "loss": 0.6692, + "step": 7698 + }, + { + "epoch": 0.6883941344778255, + "grad_norm": 0.1550201866272274, + "learning_rate": 4.67512606270163e-05, + "loss": 0.6411, + "step": 7699 + }, + { + "epoch": 0.6884835479256081, + "grad_norm": 0.14067356921524993, + "learning_rate": 4.67267499252641e-05, + "loss": 0.6241, + "step": 7700 + }, + { + "epoch": 0.6885729613733905, + "grad_norm": 0.14504031912813223, + "learning_rate": 4.670224369147505e-05, + "loss": 0.6804, + "step": 7701 + }, + { + "epoch": 0.6886623748211731, + "grad_norm": 0.11961684458657632, + "learning_rate": 4.6677741927704434e-05, + "loss": 0.6173, + "step": 7702 + }, + { + "epoch": 0.6887517882689557, + "grad_norm": 0.15686399932271877, + "learning_rate": 4.6653244636007255e-05, + "loss": 0.6617, + "step": 7703 + }, + { + "epoch": 0.6888412017167382, + "grad_norm": 0.1335848255095892, + "learning_rate": 4.6628751818437985e-05, + "loss": 0.6468, + "step": 7704 + }, + { + "epoch": 0.6889306151645207, + "grad_norm": 0.13651152195850455, + "learning_rate": 4.660426347705085e-05, + "loss": 0.6095, + "step": 7705 + }, + { + "epoch": 0.6890200286123033, + "grad_norm": 0.14888459775585294, + "learning_rate": 4.6579779613899644e-05, + "loss": 0.6501, + "step": 7706 + }, + { + "epoch": 0.6891094420600858, + "grad_norm": 0.13130795635465833, + "learning_rate": 4.6555300231037836e-05, + "loss": 0.6517, + "step": 7707 + }, + { + "epoch": 0.6891988555078684, + "grad_norm": 0.14092420486126928, + "learning_rate": 4.653082533051839e-05, + "loss": 0.6618, + "step": 7708 + }, + { + "epoch": 0.689288268955651, + "grad_norm": 0.12713413769275225, + "learning_rate": 4.650635491439412e-05, + "loss": 0.6337, + "step": 7709 + }, + { + "epoch": 0.6893776824034334, + "grad_norm": 0.14792721701722858, + "learning_rate": 4.6481888984717225e-05, + "loss": 0.657, + "step": 7710 + }, + { + "epoch": 0.689467095851216, + "grad_norm": 0.14085279569405285, + "learning_rate": 4.6457427543539654e-05, + "loss": 0.6618, + "step": 7711 + }, + { + "epoch": 0.6895565092989986, + "grad_norm": 0.1360373696359322, + "learning_rate": 4.6432970592913026e-05, + "loss": 0.6614, + "step": 7712 + }, + { + "epoch": 0.6896459227467812, + "grad_norm": 0.13447062575290084, + "learning_rate": 4.640851813488842e-05, + "loss": 0.6154, + "step": 7713 + }, + { + "epoch": 0.6897353361945636, + "grad_norm": 0.13828871614172691, + "learning_rate": 4.638407017151667e-05, + "loss": 0.6409, + "step": 7714 + }, + { + "epoch": 0.6898247496423462, + "grad_norm": 0.13430994318633402, + "learning_rate": 4.6359626704848215e-05, + "loss": 0.6521, + "step": 7715 + }, + { + "epoch": 0.6899141630901288, + "grad_norm": 0.13438612522165447, + "learning_rate": 4.633518773693307e-05, + "loss": 0.6315, + "step": 7716 + }, + { + "epoch": 0.6900035765379113, + "grad_norm": 0.14116922852211009, + "learning_rate": 4.631075326982093e-05, + "loss": 0.6702, + "step": 7717 + }, + { + "epoch": 0.6900929899856938, + "grad_norm": 0.1493875108657804, + "learning_rate": 4.6286323305561105e-05, + "loss": 0.6406, + "step": 7718 + }, + { + "epoch": 0.6901824034334764, + "grad_norm": 0.1347953355109415, + "learning_rate": 4.626189784620245e-05, + "loss": 0.641, + "step": 7719 + }, + { + "epoch": 0.6902718168812589, + "grad_norm": 0.15068833077282692, + "learning_rate": 4.623747689379351e-05, + "loss": 0.6452, + "step": 7720 + }, + { + "epoch": 0.6903612303290415, + "grad_norm": 0.13539637934797644, + "learning_rate": 4.621306045038249e-05, + "loss": 0.6923, + "step": 7721 + }, + { + "epoch": 0.6904506437768241, + "grad_norm": 0.14586753136178882, + "learning_rate": 4.618864851801707e-05, + "loss": 0.6654, + "step": 7722 + }, + { + "epoch": 0.6905400572246065, + "grad_norm": 0.13236371450535375, + "learning_rate": 4.6164241098744776e-05, + "loss": 0.641, + "step": 7723 + }, + { + "epoch": 0.6906294706723891, + "grad_norm": 0.13215922092832388, + "learning_rate": 4.613983819461253e-05, + "loss": 0.6707, + "step": 7724 + }, + { + "epoch": 0.6907188841201717, + "grad_norm": 0.14229304606389967, + "learning_rate": 4.6115439807667005e-05, + "loss": 0.663, + "step": 7725 + }, + { + "epoch": 0.6908082975679543, + "grad_norm": 0.14492944743194264, + "learning_rate": 4.6091045939954514e-05, + "loss": 0.6274, + "step": 7726 + }, + { + "epoch": 0.6908977110157367, + "grad_norm": 0.13340092990395222, + "learning_rate": 4.606665659352085e-05, + "loss": 0.6309, + "step": 7727 + }, + { + "epoch": 0.6909871244635193, + "grad_norm": 0.1377046104256255, + "learning_rate": 4.604227177041156e-05, + "loss": 0.6246, + "step": 7728 + }, + { + "epoch": 0.6910765379113019, + "grad_norm": 0.1414323982917771, + "learning_rate": 4.601789147267177e-05, + "loss": 0.6376, + "step": 7729 + }, + { + "epoch": 0.6911659513590844, + "grad_norm": 0.12038921172298793, + "learning_rate": 4.5993515702346235e-05, + "loss": 0.6489, + "step": 7730 + }, + { + "epoch": 0.691255364806867, + "grad_norm": 0.1272000394145778, + "learning_rate": 4.596914446147932e-05, + "loss": 0.6406, + "step": 7731 + }, + { + "epoch": 0.6913447782546495, + "grad_norm": 0.13518282892051728, + "learning_rate": 4.594477775211503e-05, + "loss": 0.6004, + "step": 7732 + }, + { + "epoch": 0.691434191702432, + "grad_norm": 0.15112646110419053, + "learning_rate": 4.5920415576296914e-05, + "loss": 0.6389, + "step": 7733 + }, + { + "epoch": 0.6915236051502146, + "grad_norm": 0.14406121900089744, + "learning_rate": 4.589605793606824e-05, + "loss": 0.6195, + "step": 7734 + }, + { + "epoch": 0.6916130185979972, + "grad_norm": 0.13137095087346065, + "learning_rate": 4.5871704833471876e-05, + "loss": 0.6516, + "step": 7735 + }, + { + "epoch": 0.6917024320457796, + "grad_norm": 0.11588552096569081, + "learning_rate": 4.584735627055019e-05, + "loss": 0.6058, + "step": 7736 + }, + { + "epoch": 0.6917918454935622, + "grad_norm": 0.13731967012507962, + "learning_rate": 4.5823012249345396e-05, + "loss": 0.6395, + "step": 7737 + }, + { + "epoch": 0.6918812589413448, + "grad_norm": 0.146100086747959, + "learning_rate": 4.579867277189911e-05, + "loss": 0.6692, + "step": 7738 + }, + { + "epoch": 0.6919706723891274, + "grad_norm": 0.15551021670872647, + "learning_rate": 4.5774337840252666e-05, + "loss": 0.6766, + "step": 7739 + }, + { + "epoch": 0.6920600858369099, + "grad_norm": 0.14840563695762787, + "learning_rate": 4.575000745644703e-05, + "loss": 0.6671, + "step": 7740 + }, + { + "epoch": 0.6921494992846924, + "grad_norm": 0.14454443304798004, + "learning_rate": 4.5725681622522795e-05, + "loss": 0.6728, + "step": 7741 + }, + { + "epoch": 0.692238912732475, + "grad_norm": 0.11549748657013202, + "learning_rate": 4.570136034052005e-05, + "loss": 0.6018, + "step": 7742 + }, + { + "epoch": 0.6923283261802575, + "grad_norm": 0.14732551249815512, + "learning_rate": 4.567704361247863e-05, + "loss": 0.6661, + "step": 7743 + }, + { + "epoch": 0.6924177396280401, + "grad_norm": 0.13190727165175792, + "learning_rate": 4.5652731440437965e-05, + "loss": 0.636, + "step": 7744 + }, + { + "epoch": 0.6925071530758226, + "grad_norm": 0.14066623843793613, + "learning_rate": 4.5628423826437085e-05, + "loss": 0.6334, + "step": 7745 + }, + { + "epoch": 0.6925965665236051, + "grad_norm": 0.140210390631529, + "learning_rate": 4.5604120772514655e-05, + "loss": 0.6423, + "step": 7746 + }, + { + "epoch": 0.6926859799713877, + "grad_norm": 0.14292271334713014, + "learning_rate": 4.557982228070891e-05, + "loss": 0.669, + "step": 7747 + }, + { + "epoch": 0.6927753934191703, + "grad_norm": 0.1640312140607293, + "learning_rate": 4.5555528353057716e-05, + "loss": 0.7093, + "step": 7748 + }, + { + "epoch": 0.6928648068669528, + "grad_norm": 0.13372337444887109, + "learning_rate": 4.553123899159867e-05, + "loss": 0.595, + "step": 7749 + }, + { + "epoch": 0.6929542203147353, + "grad_norm": 0.13934470458152556, + "learning_rate": 4.5506954198368744e-05, + "loss": 0.6407, + "step": 7750 + }, + { + "epoch": 0.6930436337625179, + "grad_norm": 0.1437911858616344, + "learning_rate": 4.54826739754048e-05, + "loss": 0.663, + "step": 7751 + }, + { + "epoch": 0.6931330472103004, + "grad_norm": 0.13251170795000677, + "learning_rate": 4.545839832474318e-05, + "loss": 0.63, + "step": 7752 + }, + { + "epoch": 0.693222460658083, + "grad_norm": 0.14528235403485362, + "learning_rate": 4.543412724841979e-05, + "loss": 0.6446, + "step": 7753 + }, + { + "epoch": 0.6933118741058655, + "grad_norm": 0.12267733899604713, + "learning_rate": 4.5409860748470246e-05, + "loss": 0.5938, + "step": 7754 + }, + { + "epoch": 0.693401287553648, + "grad_norm": 0.12401119344412348, + "learning_rate": 4.538559882692979e-05, + "loss": 0.6421, + "step": 7755 + }, + { + "epoch": 0.6934907010014306, + "grad_norm": 0.15320854640442808, + "learning_rate": 4.536134148583313e-05, + "loss": 0.6433, + "step": 7756 + }, + { + "epoch": 0.6935801144492132, + "grad_norm": 0.13940302058593582, + "learning_rate": 4.5337088727214835e-05, + "loss": 0.6366, + "step": 7757 + }, + { + "epoch": 0.6936695278969958, + "grad_norm": 0.13753478355801627, + "learning_rate": 4.531284055310887e-05, + "loss": 0.6513, + "step": 7758 + }, + { + "epoch": 0.6937589413447782, + "grad_norm": 0.1432737438666615, + "learning_rate": 4.5288596965548924e-05, + "loss": 0.6583, + "step": 7759 + }, + { + "epoch": 0.6938483547925608, + "grad_norm": 0.12470179320628522, + "learning_rate": 4.5264357966568306e-05, + "loss": 0.6378, + "step": 7760 + }, + { + "epoch": 0.6939377682403434, + "grad_norm": 0.13794268272750856, + "learning_rate": 4.5240123558199846e-05, + "loss": 0.6658, + "step": 7761 + }, + { + "epoch": 0.6940271816881259, + "grad_norm": 0.1335952048059035, + "learning_rate": 4.521589374247609e-05, + "loss": 0.6339, + "step": 7762 + }, + { + "epoch": 0.6941165951359084, + "grad_norm": 0.13727249707451655, + "learning_rate": 4.519166852142917e-05, + "loss": 0.6655, + "step": 7763 + }, + { + "epoch": 0.694206008583691, + "grad_norm": 0.13150130188676823, + "learning_rate": 4.516744789709081e-05, + "loss": 0.607, + "step": 7764 + }, + { + "epoch": 0.6942954220314735, + "grad_norm": 0.12262843529906135, + "learning_rate": 4.5143231871492375e-05, + "loss": 0.6577, + "step": 7765 + }, + { + "epoch": 0.6943848354792561, + "grad_norm": 0.12854951786071336, + "learning_rate": 4.5119020446664875e-05, + "loss": 0.6371, + "step": 7766 + }, + { + "epoch": 0.6944742489270386, + "grad_norm": 0.13720657607158715, + "learning_rate": 4.509481362463881e-05, + "loss": 0.6285, + "step": 7767 + }, + { + "epoch": 0.6945636623748211, + "grad_norm": 0.13965301878082378, + "learning_rate": 4.507061140744442e-05, + "loss": 0.6346, + "step": 7768 + }, + { + "epoch": 0.6946530758226037, + "grad_norm": 0.13554689237844325, + "learning_rate": 4.504641379711154e-05, + "loss": 0.627, + "step": 7769 + }, + { + "epoch": 0.6947424892703863, + "grad_norm": 0.12304403921667317, + "learning_rate": 4.502222079566951e-05, + "loss": 0.6319, + "step": 7770 + }, + { + "epoch": 0.6948319027181689, + "grad_norm": 0.13305450014014386, + "learning_rate": 4.499803240514745e-05, + "loss": 0.6393, + "step": 7771 + }, + { + "epoch": 0.6949213161659513, + "grad_norm": 0.13691160906530297, + "learning_rate": 4.497384862757403e-05, + "loss": 0.6427, + "step": 7772 + }, + { + "epoch": 0.6950107296137339, + "grad_norm": 0.1208565275881048, + "learning_rate": 4.494966946497743e-05, + "loss": 0.6594, + "step": 7773 + }, + { + "epoch": 0.6951001430615165, + "grad_norm": 0.13597341343395253, + "learning_rate": 4.492549491938557e-05, + "loss": 0.6324, + "step": 7774 + }, + { + "epoch": 0.695189556509299, + "grad_norm": 0.15977931810491183, + "learning_rate": 4.4901324992825975e-05, + "loss": 0.7005, + "step": 7775 + }, + { + "epoch": 0.6952789699570815, + "grad_norm": 0.13741612638183878, + "learning_rate": 4.487715968732568e-05, + "loss": 0.6608, + "step": 7776 + }, + { + "epoch": 0.6953683834048641, + "grad_norm": 0.1380513315498678, + "learning_rate": 4.4852999004911425e-05, + "loss": 0.6421, + "step": 7777 + }, + { + "epoch": 0.6954577968526466, + "grad_norm": 0.1408336920481102, + "learning_rate": 4.482884294760954e-05, + "loss": 0.6656, + "step": 7778 + }, + { + "epoch": 0.6955472103004292, + "grad_norm": 0.130968700357961, + "learning_rate": 4.480469151744596e-05, + "loss": 0.635, + "step": 7779 + }, + { + "epoch": 0.6956366237482118, + "grad_norm": 0.1441334858380902, + "learning_rate": 4.4780544716446294e-05, + "loss": 0.6182, + "step": 7780 + }, + { + "epoch": 0.6957260371959942, + "grad_norm": 0.1309294155771469, + "learning_rate": 4.475640254663561e-05, + "loss": 0.6375, + "step": 7781 + }, + { + "epoch": 0.6958154506437768, + "grad_norm": 0.1283928086708547, + "learning_rate": 4.473226501003873e-05, + "loss": 0.6014, + "step": 7782 + }, + { + "epoch": 0.6959048640915594, + "grad_norm": 0.154600446490118, + "learning_rate": 4.470813210868008e-05, + "loss": 0.6379, + "step": 7783 + }, + { + "epoch": 0.695994277539342, + "grad_norm": 0.13553512297061987, + "learning_rate": 4.4684003844583534e-05, + "loss": 0.5595, + "step": 7784 + }, + { + "epoch": 0.6960836909871244, + "grad_norm": 0.13249448480475468, + "learning_rate": 4.465988021977282e-05, + "loss": 0.6153, + "step": 7785 + }, + { + "epoch": 0.696173104434907, + "grad_norm": 0.15330747660832053, + "learning_rate": 4.4635761236271144e-05, + "loss": 0.6376, + "step": 7786 + }, + { + "epoch": 0.6962625178826896, + "grad_norm": 0.15805484163667438, + "learning_rate": 4.461164689610129e-05, + "loss": 0.6499, + "step": 7787 + }, + { + "epoch": 0.6963519313304721, + "grad_norm": 0.14149853629982595, + "learning_rate": 4.458753720128571e-05, + "loss": 0.6084, + "step": 7788 + }, + { + "epoch": 0.6964413447782547, + "grad_norm": 0.14082337516898177, + "learning_rate": 4.4563432153846494e-05, + "loss": 0.6699, + "step": 7789 + }, + { + "epoch": 0.6965307582260372, + "grad_norm": 0.15506728758911614, + "learning_rate": 4.453933175580525e-05, + "loss": 0.6467, + "step": 7790 + }, + { + "epoch": 0.6966201716738197, + "grad_norm": 0.15930743853761553, + "learning_rate": 4.451523600918327e-05, + "loss": 0.6739, + "step": 7791 + }, + { + "epoch": 0.6967095851216023, + "grad_norm": 0.15769406573409012, + "learning_rate": 4.4491144916001425e-05, + "loss": 0.678, + "step": 7792 + }, + { + "epoch": 0.6967989985693849, + "grad_norm": 0.1419303094517262, + "learning_rate": 4.4467058478280235e-05, + "loss": 0.6586, + "step": 7793 + }, + { + "epoch": 0.6968884120171673, + "grad_norm": 0.15113841174285755, + "learning_rate": 4.444297669803981e-05, + "loss": 0.6721, + "step": 7794 + }, + { + "epoch": 0.6969778254649499, + "grad_norm": 0.12068573789366102, + "learning_rate": 4.441889957729979e-05, + "loss": 0.6267, + "step": 7795 + }, + { + "epoch": 0.6970672389127325, + "grad_norm": 0.1496765551041931, + "learning_rate": 4.439482711807955e-05, + "loss": 0.6642, + "step": 7796 + }, + { + "epoch": 0.697156652360515, + "grad_norm": 0.12115537502121869, + "learning_rate": 4.4370759322398006e-05, + "loss": 0.6055, + "step": 7797 + }, + { + "epoch": 0.6972460658082976, + "grad_norm": 0.13504584471361233, + "learning_rate": 4.434669619227368e-05, + "loss": 0.5547, + "step": 7798 + }, + { + "epoch": 0.6973354792560801, + "grad_norm": 0.13836887300178174, + "learning_rate": 4.432263772972475e-05, + "loss": 0.6135, + "step": 7799 + }, + { + "epoch": 0.6974248927038627, + "grad_norm": 0.13336297723423193, + "learning_rate": 4.4298583936768976e-05, + "loss": 0.5629, + "step": 7800 + }, + { + "epoch": 0.6975143061516452, + "grad_norm": 0.1347667632047081, + "learning_rate": 4.427453481542366e-05, + "loss": 0.6646, + "step": 7801 + }, + { + "epoch": 0.6976037195994278, + "grad_norm": 0.14101885090346902, + "learning_rate": 4.4250490367705824e-05, + "loss": 0.6334, + "step": 7802 + }, + { + "epoch": 0.6976931330472103, + "grad_norm": 0.12967699276029598, + "learning_rate": 4.4226450595632055e-05, + "loss": 0.6435, + "step": 7803 + }, + { + "epoch": 0.6977825464949928, + "grad_norm": 0.13621024728051948, + "learning_rate": 4.420241550121849e-05, + "loss": 0.59, + "step": 7804 + }, + { + "epoch": 0.6978719599427754, + "grad_norm": 0.11421358376150005, + "learning_rate": 4.41783850864809e-05, + "loss": 0.6576, + "step": 7805 + }, + { + "epoch": 0.697961373390558, + "grad_norm": 0.15307584862968926, + "learning_rate": 4.4154359353434824e-05, + "loss": 0.6725, + "step": 7806 + }, + { + "epoch": 0.6980507868383404, + "grad_norm": 0.14627660550188828, + "learning_rate": 4.4130338304095146e-05, + "loss": 0.6481, + "step": 7807 + }, + { + "epoch": 0.698140200286123, + "grad_norm": 0.12517899192630808, + "learning_rate": 4.4106321940476516e-05, + "loss": 0.6533, + "step": 7808 + }, + { + "epoch": 0.6982296137339056, + "grad_norm": 0.12647551064334028, + "learning_rate": 4.408231026459321e-05, + "loss": 0.5983, + "step": 7809 + }, + { + "epoch": 0.6983190271816881, + "grad_norm": 0.12955720192191858, + "learning_rate": 4.405830327845896e-05, + "loss": 0.6499, + "step": 7810 + }, + { + "epoch": 0.6984084406294707, + "grad_norm": 0.13294854202354714, + "learning_rate": 4.403430098408726e-05, + "loss": 0.6498, + "step": 7811 + }, + { + "epoch": 0.6984978540772532, + "grad_norm": 0.12990424662797628, + "learning_rate": 4.401030338349115e-05, + "loss": 0.6115, + "step": 7812 + }, + { + "epoch": 0.6985872675250357, + "grad_norm": 0.13459635993027771, + "learning_rate": 4.3986310478683265e-05, + "loss": 0.6486, + "step": 7813 + }, + { + "epoch": 0.6986766809728183, + "grad_norm": 0.13995776229295212, + "learning_rate": 4.3962322271675915e-05, + "loss": 0.6523, + "step": 7814 + }, + { + "epoch": 0.6987660944206009, + "grad_norm": 0.13865861287039377, + "learning_rate": 4.393833876448089e-05, + "loss": 0.6391, + "step": 7815 + }, + { + "epoch": 0.6988555078683834, + "grad_norm": 0.13966749823177457, + "learning_rate": 4.3914359959109686e-05, + "loss": 0.625, + "step": 7816 + }, + { + "epoch": 0.6989449213161659, + "grad_norm": 0.11999398978254208, + "learning_rate": 4.389038585757341e-05, + "loss": 0.6375, + "step": 7817 + }, + { + "epoch": 0.6990343347639485, + "grad_norm": 0.12548201909417958, + "learning_rate": 4.3866416461882676e-05, + "loss": 0.6178, + "step": 7818 + }, + { + "epoch": 0.6991237482117311, + "grad_norm": 0.139774721220158, + "learning_rate": 4.3842451774047755e-05, + "loss": 0.5991, + "step": 7819 + }, + { + "epoch": 0.6992131616595136, + "grad_norm": 0.13821138686717177, + "learning_rate": 4.381849179607867e-05, + "loss": 0.6519, + "step": 7820 + }, + { + "epoch": 0.6993025751072961, + "grad_norm": 0.1377244906499029, + "learning_rate": 4.379453652998479e-05, + "loss": 0.6053, + "step": 7821 + }, + { + "epoch": 0.6993919885550787, + "grad_norm": 0.12915151303917402, + "learning_rate": 4.377058597777524e-05, + "loss": 0.6507, + "step": 7822 + }, + { + "epoch": 0.6994814020028612, + "grad_norm": 0.12864404310140376, + "learning_rate": 4.3746640141458786e-05, + "loss": 0.6218, + "step": 7823 + }, + { + "epoch": 0.6995708154506438, + "grad_norm": 0.1370206965501129, + "learning_rate": 4.372269902304363e-05, + "loss": 0.6344, + "step": 7824 + }, + { + "epoch": 0.6996602288984263, + "grad_norm": 0.1356630619022783, + "learning_rate": 4.369876262453776e-05, + "loss": 0.6396, + "step": 7825 + }, + { + "epoch": 0.6997496423462088, + "grad_norm": 0.11747657928612533, + "learning_rate": 4.367483094794866e-05, + "loss": 0.6062, + "step": 7826 + }, + { + "epoch": 0.6998390557939914, + "grad_norm": 0.15883463993872193, + "learning_rate": 4.365090399528349e-05, + "loss": 0.6529, + "step": 7827 + }, + { + "epoch": 0.699928469241774, + "grad_norm": 0.14448911647165594, + "learning_rate": 4.362698176854892e-05, + "loss": 0.6703, + "step": 7828 + }, + { + "epoch": 0.7000178826895566, + "grad_norm": 0.13385878244313354, + "learning_rate": 4.360306426975136e-05, + "loss": 0.6452, + "step": 7829 + }, + { + "epoch": 0.700107296137339, + "grad_norm": 0.14130634351539798, + "learning_rate": 4.357915150089665e-05, + "loss": 0.6179, + "step": 7830 + }, + { + "epoch": 0.7001967095851216, + "grad_norm": 0.13342824628432268, + "learning_rate": 4.355524346399037e-05, + "loss": 0.6723, + "step": 7831 + }, + { + "epoch": 0.7002861230329042, + "grad_norm": 0.13742149494294525, + "learning_rate": 4.3531340161037684e-05, + "loss": 0.6156, + "step": 7832 + }, + { + "epoch": 0.7003755364806867, + "grad_norm": 0.1384800809452946, + "learning_rate": 4.350744159404323e-05, + "loss": 0.6463, + "step": 7833 + }, + { + "epoch": 0.7004649499284692, + "grad_norm": 0.12019979324898575, + "learning_rate": 4.348354776501149e-05, + "loss": 0.613, + "step": 7834 + }, + { + "epoch": 0.7005543633762518, + "grad_norm": 0.13526531801445946, + "learning_rate": 4.345965867594631e-05, + "loss": 0.6565, + "step": 7835 + }, + { + "epoch": 0.7006437768240343, + "grad_norm": 0.13997656177864162, + "learning_rate": 4.3435774328851276e-05, + "loss": 0.6556, + "step": 7836 + }, + { + "epoch": 0.7007331902718169, + "grad_norm": 0.12955988179712483, + "learning_rate": 4.3411894725729576e-05, + "loss": 0.6379, + "step": 7837 + }, + { + "epoch": 0.7008226037195995, + "grad_norm": 0.12611285307084366, + "learning_rate": 4.338801986858388e-05, + "loss": 0.6001, + "step": 7838 + }, + { + "epoch": 0.7009120171673819, + "grad_norm": 0.12632549779515526, + "learning_rate": 4.336414975941656e-05, + "loss": 0.6089, + "step": 7839 + }, + { + "epoch": 0.7010014306151645, + "grad_norm": 0.1355140516403797, + "learning_rate": 4.3340284400229666e-05, + "loss": 0.6343, + "step": 7840 + }, + { + "epoch": 0.7010908440629471, + "grad_norm": 0.1320626569888383, + "learning_rate": 4.331642379302466e-05, + "loss": 0.6182, + "step": 7841 + }, + { + "epoch": 0.7011802575107297, + "grad_norm": 0.15350465471870794, + "learning_rate": 4.329256793980274e-05, + "loss": 0.6798, + "step": 7842 + }, + { + "epoch": 0.7012696709585121, + "grad_norm": 0.14151918611833822, + "learning_rate": 4.326871684256469e-05, + "loss": 0.6535, + "step": 7843 + }, + { + "epoch": 0.7013590844062947, + "grad_norm": 0.13555122939697753, + "learning_rate": 4.324487050331082e-05, + "loss": 0.6216, + "step": 7844 + }, + { + "epoch": 0.7014484978540773, + "grad_norm": 0.1406776681238905, + "learning_rate": 4.3221028924041105e-05, + "loss": 0.6544, + "step": 7845 + }, + { + "epoch": 0.7015379113018598, + "grad_norm": 0.1477243119070577, + "learning_rate": 4.3197192106755125e-05, + "loss": 0.6536, + "step": 7846 + }, + { + "epoch": 0.7016273247496424, + "grad_norm": 0.12038106320865895, + "learning_rate": 4.317336005345204e-05, + "loss": 0.5702, + "step": 7847 + }, + { + "epoch": 0.7017167381974249, + "grad_norm": 0.1237652017901301, + "learning_rate": 4.314953276613066e-05, + "loss": 0.6032, + "step": 7848 + }, + { + "epoch": 0.7018061516452074, + "grad_norm": 0.13286105219388983, + "learning_rate": 4.312571024678926e-05, + "loss": 0.5995, + "step": 7849 + }, + { + "epoch": 0.70189556509299, + "grad_norm": 0.14591802681983665, + "learning_rate": 4.310189249742588e-05, + "loss": 0.6595, + "step": 7850 + }, + { + "epoch": 0.7019849785407726, + "grad_norm": 0.14059499702660797, + "learning_rate": 4.307807952003804e-05, + "loss": 0.6363, + "step": 7851 + }, + { + "epoch": 0.702074391988555, + "grad_norm": 0.14248061500790696, + "learning_rate": 4.305427131662296e-05, + "loss": 0.6572, + "step": 7852 + }, + { + "epoch": 0.7021638054363376, + "grad_norm": 0.131317131432564, + "learning_rate": 4.303046788917732e-05, + "loss": 0.6516, + "step": 7853 + }, + { + "epoch": 0.7022532188841202, + "grad_norm": 0.1321949138067592, + "learning_rate": 4.3006669239697596e-05, + "loss": 0.5891, + "step": 7854 + }, + { + "epoch": 0.7023426323319027, + "grad_norm": 0.13895042610853217, + "learning_rate": 4.298287537017965e-05, + "loss": 0.6252, + "step": 7855 + }, + { + "epoch": 0.7024320457796852, + "grad_norm": 0.1399728409252533, + "learning_rate": 4.29590862826191e-05, + "loss": 0.6291, + "step": 7856 + }, + { + "epoch": 0.7025214592274678, + "grad_norm": 0.12661740663578164, + "learning_rate": 4.293530197901112e-05, + "loss": 0.6077, + "step": 7857 + }, + { + "epoch": 0.7026108726752504, + "grad_norm": 0.1444132655569641, + "learning_rate": 4.291152246135042e-05, + "loss": 0.6222, + "step": 7858 + }, + { + "epoch": 0.7027002861230329, + "grad_norm": 0.1418358577675981, + "learning_rate": 4.288774773163138e-05, + "loss": 0.5942, + "step": 7859 + }, + { + "epoch": 0.7027896995708155, + "grad_norm": 0.13505474949404883, + "learning_rate": 4.286397779184796e-05, + "loss": 0.6132, + "step": 7860 + }, + { + "epoch": 0.702879113018598, + "grad_norm": 0.136044653800351, + "learning_rate": 4.2840212643993725e-05, + "loss": 0.6529, + "step": 7861 + }, + { + "epoch": 0.7029685264663805, + "grad_norm": 0.123148392309492, + "learning_rate": 4.2816452290061826e-05, + "loss": 0.5552, + "step": 7862 + }, + { + "epoch": 0.7030579399141631, + "grad_norm": 0.1700069652492953, + "learning_rate": 4.279269673204504e-05, + "loss": 0.7022, + "step": 7863 + }, + { + "epoch": 0.7031473533619457, + "grad_norm": 0.14932967094411684, + "learning_rate": 4.276894597193567e-05, + "loss": 0.6268, + "step": 7864 + }, + { + "epoch": 0.7032367668097281, + "grad_norm": 0.13204368392073415, + "learning_rate": 4.274520001172567e-05, + "loss": 0.612, + "step": 7865 + }, + { + "epoch": 0.7033261802575107, + "grad_norm": 0.13081870153474615, + "learning_rate": 4.2721458853406646e-05, + "loss": 0.6218, + "step": 7866 + }, + { + "epoch": 0.7034155937052933, + "grad_norm": 0.14490210117589142, + "learning_rate": 4.2697722498969616e-05, + "loss": 0.6345, + "step": 7867 + }, + { + "epoch": 0.7035050071530758, + "grad_norm": 0.1431059534890679, + "learning_rate": 4.267399095040546e-05, + "loss": 0.6521, + "step": 7868 + }, + { + "epoch": 0.7035944206008584, + "grad_norm": 0.11171167453726656, + "learning_rate": 4.265026420970443e-05, + "loss": 0.5882, + "step": 7869 + }, + { + "epoch": 0.7036838340486409, + "grad_norm": 0.12867753340050805, + "learning_rate": 4.2626542278856464e-05, + "loss": 0.6571, + "step": 7870 + }, + { + "epoch": 0.7037732474964234, + "grad_norm": 0.136651534726768, + "learning_rate": 4.2602825159851156e-05, + "loss": 0.6381, + "step": 7871 + }, + { + "epoch": 0.703862660944206, + "grad_norm": 0.14225276311182358, + "learning_rate": 4.257911285467754e-05, + "loss": 0.6347, + "step": 7872 + }, + { + "epoch": 0.7039520743919886, + "grad_norm": 0.12668260836456757, + "learning_rate": 4.2555405365324385e-05, + "loss": 0.6448, + "step": 7873 + }, + { + "epoch": 0.704041487839771, + "grad_norm": 0.1327695865328808, + "learning_rate": 4.2531702693780005e-05, + "loss": 0.6182, + "step": 7874 + }, + { + "epoch": 0.7041309012875536, + "grad_norm": 0.14534919482385897, + "learning_rate": 4.250800484203232e-05, + "loss": 0.6039, + "step": 7875 + }, + { + "epoch": 0.7042203147353362, + "grad_norm": 0.13750836984976836, + "learning_rate": 4.2484311812068836e-05, + "loss": 0.6723, + "step": 7876 + }, + { + "epoch": 0.7043097281831188, + "grad_norm": 0.16064242987051638, + "learning_rate": 4.246062360587669e-05, + "loss": 0.6636, + "step": 7877 + }, + { + "epoch": 0.7043991416309013, + "grad_norm": 0.15693456372742748, + "learning_rate": 4.243694022544251e-05, + "loss": 0.6186, + "step": 7878 + }, + { + "epoch": 0.7044885550786838, + "grad_norm": 0.14027298213589692, + "learning_rate": 4.241326167275265e-05, + "loss": 0.6349, + "step": 7879 + }, + { + "epoch": 0.7045779685264664, + "grad_norm": 0.13770678231122663, + "learning_rate": 4.238958794979302e-05, + "loss": 0.6413, + "step": 7880 + }, + { + "epoch": 0.7046673819742489, + "grad_norm": 0.13553953514867972, + "learning_rate": 4.236591905854898e-05, + "loss": 0.6517, + "step": 7881 + }, + { + "epoch": 0.7047567954220315, + "grad_norm": 0.14866894265937872, + "learning_rate": 4.23422550010058e-05, + "loss": 0.6246, + "step": 7882 + }, + { + "epoch": 0.704846208869814, + "grad_norm": 0.1363041540785378, + "learning_rate": 4.231859577914802e-05, + "loss": 0.604, + "step": 7883 + }, + { + "epoch": 0.7049356223175965, + "grad_norm": 0.13149855513031353, + "learning_rate": 4.229494139495995e-05, + "loss": 0.6294, + "step": 7884 + }, + { + "epoch": 0.7050250357653791, + "grad_norm": 0.1413065226933613, + "learning_rate": 4.2271291850425455e-05, + "loss": 0.658, + "step": 7885 + }, + { + "epoch": 0.7051144492131617, + "grad_norm": 0.1310650886788592, + "learning_rate": 4.224764714752803e-05, + "loss": 0.624, + "step": 7886 + }, + { + "epoch": 0.7052038626609443, + "grad_norm": 0.1505840581853425, + "learning_rate": 4.2224007288250645e-05, + "loss": 0.6663, + "step": 7887 + }, + { + "epoch": 0.7052932761087267, + "grad_norm": 0.14421656238992173, + "learning_rate": 4.2200372274576e-05, + "loss": 0.6596, + "step": 7888 + }, + { + "epoch": 0.7053826895565093, + "grad_norm": 0.1243029098774041, + "learning_rate": 4.2176742108486334e-05, + "loss": 0.6068, + "step": 7889 + }, + { + "epoch": 0.7054721030042919, + "grad_norm": 0.12311488882734246, + "learning_rate": 4.2153116791963465e-05, + "loss": 0.6207, + "step": 7890 + }, + { + "epoch": 0.7055615164520744, + "grad_norm": 0.13539754711099722, + "learning_rate": 4.212949632698887e-05, + "loss": 0.6339, + "step": 7891 + }, + { + "epoch": 0.7056509298998569, + "grad_norm": 0.1251093240533797, + "learning_rate": 4.210588071554349e-05, + "loss": 0.6207, + "step": 7892 + }, + { + "epoch": 0.7057403433476395, + "grad_norm": 0.1299619665359375, + "learning_rate": 4.208226995960798e-05, + "loss": 0.6401, + "step": 7893 + }, + { + "epoch": 0.705829756795422, + "grad_norm": 0.13046475348123315, + "learning_rate": 4.205866406116258e-05, + "loss": 0.6138, + "step": 7894 + }, + { + "epoch": 0.7059191702432046, + "grad_norm": 0.15337352059833376, + "learning_rate": 4.203506302218697e-05, + "loss": 0.6956, + "step": 7895 + }, + { + "epoch": 0.7060085836909872, + "grad_norm": 0.14808111079873668, + "learning_rate": 4.2011466844660655e-05, + "loss": 0.6311, + "step": 7896 + }, + { + "epoch": 0.7060979971387696, + "grad_norm": 0.14032069073834458, + "learning_rate": 4.1987875530562624e-05, + "loss": 0.6379, + "step": 7897 + }, + { + "epoch": 0.7061874105865522, + "grad_norm": 0.15347188040758206, + "learning_rate": 4.1964289081871376e-05, + "loss": 0.6572, + "step": 7898 + }, + { + "epoch": 0.7062768240343348, + "grad_norm": 0.13779617076535472, + "learning_rate": 4.1940707500565114e-05, + "loss": 0.6524, + "step": 7899 + }, + { + "epoch": 0.7063662374821174, + "grad_norm": 0.1327945258509338, + "learning_rate": 4.191713078862163e-05, + "loss": 0.667, + "step": 7900 + }, + { + "epoch": 0.7064556509298998, + "grad_norm": 0.13800920708283865, + "learning_rate": 4.189355894801821e-05, + "loss": 0.6268, + "step": 7901 + }, + { + "epoch": 0.7065450643776824, + "grad_norm": 0.13863351907944824, + "learning_rate": 4.186999198073182e-05, + "loss": 0.6116, + "step": 7902 + }, + { + "epoch": 0.706634477825465, + "grad_norm": 0.11441423303699348, + "learning_rate": 4.1846429888739005e-05, + "loss": 0.6295, + "step": 7903 + }, + { + "epoch": 0.7067238912732475, + "grad_norm": 0.13172061561020687, + "learning_rate": 4.182287267401587e-05, + "loss": 0.6714, + "step": 7904 + }, + { + "epoch": 0.70681330472103, + "grad_norm": 0.11779378476231456, + "learning_rate": 4.17993203385382e-05, + "loss": 0.6116, + "step": 7905 + }, + { + "epoch": 0.7069027181688126, + "grad_norm": 0.12591357054345292, + "learning_rate": 4.1775772884281185e-05, + "loss": 0.598, + "step": 7906 + }, + { + "epoch": 0.7069921316165951, + "grad_norm": 0.1444273069448731, + "learning_rate": 4.17522303132198e-05, + "loss": 0.6533, + "step": 7907 + }, + { + "epoch": 0.7070815450643777, + "grad_norm": 0.14782998292481134, + "learning_rate": 4.17286926273285e-05, + "loss": 0.6274, + "step": 7908 + }, + { + "epoch": 0.7071709585121603, + "grad_norm": 0.1326420180763455, + "learning_rate": 4.170515982858139e-05, + "loss": 0.6351, + "step": 7909 + }, + { + "epoch": 0.7072603719599427, + "grad_norm": 0.13995426883627762, + "learning_rate": 4.168163191895211e-05, + "loss": 0.6479, + "step": 7910 + }, + { + "epoch": 0.7073497854077253, + "grad_norm": 0.13937696422946497, + "learning_rate": 4.1658108900413975e-05, + "loss": 0.6021, + "step": 7911 + }, + { + "epoch": 0.7074391988555079, + "grad_norm": 0.1241925352514068, + "learning_rate": 4.163459077493974e-05, + "loss": 0.5994, + "step": 7912 + }, + { + "epoch": 0.7075286123032904, + "grad_norm": 0.12919315316453528, + "learning_rate": 4.16110775445019e-05, + "loss": 0.6016, + "step": 7913 + }, + { + "epoch": 0.7076180257510729, + "grad_norm": 0.1298631336637558, + "learning_rate": 4.158756921107251e-05, + "loss": 0.5844, + "step": 7914 + }, + { + "epoch": 0.7077074391988555, + "grad_norm": 0.1320393277715213, + "learning_rate": 4.15640657766231e-05, + "loss": 0.6396, + "step": 7915 + }, + { + "epoch": 0.707796852646638, + "grad_norm": 0.13442961776136345, + "learning_rate": 4.1540567243124886e-05, + "loss": 0.6219, + "step": 7916 + }, + { + "epoch": 0.7078862660944206, + "grad_norm": 0.11333319229906702, + "learning_rate": 4.1517073612548764e-05, + "loss": 0.6383, + "step": 7917 + }, + { + "epoch": 0.7079756795422032, + "grad_norm": 0.13758956158866636, + "learning_rate": 4.1493584886865026e-05, + "loss": 0.6741, + "step": 7918 + }, + { + "epoch": 0.7080650929899857, + "grad_norm": 0.13805923442141443, + "learning_rate": 4.147010106804365e-05, + "loss": 0.6447, + "step": 7919 + }, + { + "epoch": 0.7081545064377682, + "grad_norm": 0.12431412304406073, + "learning_rate": 4.144662215805426e-05, + "loss": 0.6555, + "step": 7920 + }, + { + "epoch": 0.7082439198855508, + "grad_norm": 0.13748002324958378, + "learning_rate": 4.142314815886591e-05, + "loss": 0.6193, + "step": 7921 + }, + { + "epoch": 0.7083333333333334, + "grad_norm": 0.15404909514555654, + "learning_rate": 4.1399679072447384e-05, + "loss": 0.6772, + "step": 7922 + }, + { + "epoch": 0.7084227467811158, + "grad_norm": 0.12718097035530024, + "learning_rate": 4.137621490076701e-05, + "loss": 0.6256, + "step": 7923 + }, + { + "epoch": 0.7085121602288984, + "grad_norm": 0.1489437109348167, + "learning_rate": 4.135275564579268e-05, + "loss": 0.6501, + "step": 7924 + }, + { + "epoch": 0.708601573676681, + "grad_norm": 0.14067531440949466, + "learning_rate": 4.1329301309491955e-05, + "loss": 0.658, + "step": 7925 + }, + { + "epoch": 0.7086909871244635, + "grad_norm": 0.15101918370314216, + "learning_rate": 4.130585189383183e-05, + "loss": 0.6413, + "step": 7926 + }, + { + "epoch": 0.7087804005722461, + "grad_norm": 0.13631540402822576, + "learning_rate": 4.128240740077902e-05, + "loss": 0.6384, + "step": 7927 + }, + { + "epoch": 0.7088698140200286, + "grad_norm": 0.1417726108470389, + "learning_rate": 4.1258967832299835e-05, + "loss": 0.6573, + "step": 7928 + }, + { + "epoch": 0.7089592274678111, + "grad_norm": 0.1183951563645486, + "learning_rate": 4.123553319035999e-05, + "loss": 0.6266, + "step": 7929 + }, + { + "epoch": 0.7090486409155937, + "grad_norm": 0.1328570299950386, + "learning_rate": 4.121210347692506e-05, + "loss": 0.6636, + "step": 7930 + }, + { + "epoch": 0.7091380543633763, + "grad_norm": 0.1368689023818568, + "learning_rate": 4.1188678693960034e-05, + "loss": 0.6297, + "step": 7931 + }, + { + "epoch": 0.7092274678111588, + "grad_norm": 0.13989209352391396, + "learning_rate": 4.116525884342947e-05, + "loss": 0.642, + "step": 7932 + }, + { + "epoch": 0.7093168812589413, + "grad_norm": 0.1335939425984829, + "learning_rate": 4.114184392729758e-05, + "loss": 0.6343, + "step": 7933 + }, + { + "epoch": 0.7094062947067239, + "grad_norm": 0.13339032312790414, + "learning_rate": 4.1118433947528215e-05, + "loss": 0.5762, + "step": 7934 + }, + { + "epoch": 0.7094957081545065, + "grad_norm": 0.14501010321908084, + "learning_rate": 4.109502890608463e-05, + "loss": 0.6343, + "step": 7935 + }, + { + "epoch": 0.709585121602289, + "grad_norm": 0.15306351097657278, + "learning_rate": 4.107162880492984e-05, + "loss": 0.6431, + "step": 7936 + }, + { + "epoch": 0.7096745350500715, + "grad_norm": 0.14340393464877677, + "learning_rate": 4.104823364602638e-05, + "loss": 0.6352, + "step": 7937 + }, + { + "epoch": 0.7097639484978541, + "grad_norm": 0.12761784138373297, + "learning_rate": 4.1024843431336355e-05, + "loss": 0.6563, + "step": 7938 + }, + { + "epoch": 0.7098533619456366, + "grad_norm": 0.1374162070722928, + "learning_rate": 4.100145816282154e-05, + "loss": 0.6605, + "step": 7939 + }, + { + "epoch": 0.7099427753934192, + "grad_norm": 0.12356729926328057, + "learning_rate": 4.097807784244313e-05, + "loss": 0.6301, + "step": 7940 + }, + { + "epoch": 0.7100321888412017, + "grad_norm": 0.13398341988621185, + "learning_rate": 4.095470247216205e-05, + "loss": 0.6404, + "step": 7941 + }, + { + "epoch": 0.7101216022889842, + "grad_norm": 0.13516818728810648, + "learning_rate": 4.0931332053938766e-05, + "loss": 0.6743, + "step": 7942 + }, + { + "epoch": 0.7102110157367668, + "grad_norm": 0.13743914512632413, + "learning_rate": 4.090796658973333e-05, + "loss": 0.68, + "step": 7943 + }, + { + "epoch": 0.7103004291845494, + "grad_norm": 0.1540947242705522, + "learning_rate": 4.0884606081505374e-05, + "loss": 0.6705, + "step": 7944 + }, + { + "epoch": 0.710389842632332, + "grad_norm": 0.124025662774602, + "learning_rate": 4.0861250531214136e-05, + "loss": 0.6375, + "step": 7945 + }, + { + "epoch": 0.7104792560801144, + "grad_norm": 0.1196765161691345, + "learning_rate": 4.083789994081837e-05, + "loss": 0.6319, + "step": 7946 + }, + { + "epoch": 0.710568669527897, + "grad_norm": 0.14084728652825645, + "learning_rate": 4.081455431227648e-05, + "loss": 0.6709, + "step": 7947 + }, + { + "epoch": 0.7106580829756796, + "grad_norm": 0.13233407373034664, + "learning_rate": 4.0791213647546475e-05, + "loss": 0.6826, + "step": 7948 + }, + { + "epoch": 0.7107474964234621, + "grad_norm": 0.13451081071308377, + "learning_rate": 4.0767877948585845e-05, + "loss": 0.6437, + "step": 7949 + }, + { + "epoch": 0.7108369098712446, + "grad_norm": 0.12787431408101813, + "learning_rate": 4.0744547217351715e-05, + "loss": 0.6368, + "step": 7950 + }, + { + "epoch": 0.7109263233190272, + "grad_norm": 0.13420910384645013, + "learning_rate": 4.072122145580093e-05, + "loss": 0.6306, + "step": 7951 + }, + { + "epoch": 0.7110157367668097, + "grad_norm": 0.1333522589890752, + "learning_rate": 4.069790066588967e-05, + "loss": 0.589, + "step": 7952 + }, + { + "epoch": 0.7111051502145923, + "grad_norm": 0.15118538550840047, + "learning_rate": 4.067458484957386e-05, + "loss": 0.6312, + "step": 7953 + }, + { + "epoch": 0.7111945636623748, + "grad_norm": 0.12640673092201607, + "learning_rate": 4.0651274008809004e-05, + "loss": 0.5718, + "step": 7954 + }, + { + "epoch": 0.7112839771101573, + "grad_norm": 0.12827276161897383, + "learning_rate": 4.06279681455501e-05, + "loss": 0.6283, + "step": 7955 + }, + { + "epoch": 0.7113733905579399, + "grad_norm": 0.13907830122372092, + "learning_rate": 4.060466726175179e-05, + "loss": 0.6302, + "step": 7956 + }, + { + "epoch": 0.7114628040057225, + "grad_norm": 0.11703155885477771, + "learning_rate": 4.0581371359368315e-05, + "loss": 0.6131, + "step": 7957 + }, + { + "epoch": 0.711552217453505, + "grad_norm": 0.12921164123048226, + "learning_rate": 4.0558080440353455e-05, + "loss": 0.592, + "step": 7958 + }, + { + "epoch": 0.7116416309012875, + "grad_norm": 0.13406922715877098, + "learning_rate": 4.0534794506660645e-05, + "loss": 0.6241, + "step": 7959 + }, + { + "epoch": 0.7117310443490701, + "grad_norm": 0.12287740905748337, + "learning_rate": 4.0511513560242766e-05, + "loss": 0.6399, + "step": 7960 + }, + { + "epoch": 0.7118204577968527, + "grad_norm": 0.13890631503402853, + "learning_rate": 4.0488237603052396e-05, + "loss": 0.6581, + "step": 7961 + }, + { + "epoch": 0.7119098712446352, + "grad_norm": 0.14004641881716892, + "learning_rate": 4.04649666370417e-05, + "loss": 0.6788, + "step": 7962 + }, + { + "epoch": 0.7119992846924177, + "grad_norm": 0.12775677323859128, + "learning_rate": 4.044170066416233e-05, + "loss": 0.6296, + "step": 7963 + }, + { + "epoch": 0.7120886981402003, + "grad_norm": 0.12378645156450958, + "learning_rate": 4.041843968636555e-05, + "loss": 0.6148, + "step": 7964 + }, + { + "epoch": 0.7121781115879828, + "grad_norm": 0.12931891817450886, + "learning_rate": 4.0395183705602354e-05, + "loss": 0.6009, + "step": 7965 + }, + { + "epoch": 0.7122675250357654, + "grad_norm": 0.12925370210939455, + "learning_rate": 4.037193272382308e-05, + "loss": 0.6201, + "step": 7966 + }, + { + "epoch": 0.712356938483548, + "grad_norm": 0.139321878196048, + "learning_rate": 4.034868674297779e-05, + "loss": 0.6387, + "step": 7967 + }, + { + "epoch": 0.7124463519313304, + "grad_norm": 0.14684811425645378, + "learning_rate": 4.0325445765016145e-05, + "loss": 0.6703, + "step": 7968 + }, + { + "epoch": 0.712535765379113, + "grad_norm": 0.1247964465323734, + "learning_rate": 4.030220979188726e-05, + "loss": 0.6345, + "step": 7969 + }, + { + "epoch": 0.7126251788268956, + "grad_norm": 0.17324115182601402, + "learning_rate": 4.027897882553994e-05, + "loss": 0.6364, + "step": 7970 + }, + { + "epoch": 0.7127145922746781, + "grad_norm": 0.120928125080889, + "learning_rate": 4.025575286792254e-05, + "loss": 0.6307, + "step": 7971 + }, + { + "epoch": 0.7128040057224606, + "grad_norm": 0.12641759043236814, + "learning_rate": 4.0232531920983e-05, + "loss": 0.6253, + "step": 7972 + }, + { + "epoch": 0.7128934191702432, + "grad_norm": 0.1457810361409623, + "learning_rate": 4.020931598666882e-05, + "loss": 0.619, + "step": 7973 + }, + { + "epoch": 0.7129828326180258, + "grad_norm": 0.12770687032397693, + "learning_rate": 4.018610506692713e-05, + "loss": 0.6197, + "step": 7974 + }, + { + "epoch": 0.7130722460658083, + "grad_norm": 0.14531052128825736, + "learning_rate": 4.0162899163704545e-05, + "loss": 0.6695, + "step": 7975 + }, + { + "epoch": 0.7131616595135909, + "grad_norm": 0.14512806373800033, + "learning_rate": 4.0139698278947336e-05, + "loss": 0.5913, + "step": 7976 + }, + { + "epoch": 0.7132510729613734, + "grad_norm": 0.1359125318528725, + "learning_rate": 4.0116502414601384e-05, + "loss": 0.6361, + "step": 7977 + }, + { + "epoch": 0.7133404864091559, + "grad_norm": 0.1390495746688531, + "learning_rate": 4.009331157261198e-05, + "loss": 0.6171, + "step": 7978 + }, + { + "epoch": 0.7134298998569385, + "grad_norm": 0.1305631473448773, + "learning_rate": 4.007012575492425e-05, + "loss": 0.6824, + "step": 7979 + }, + { + "epoch": 0.7135193133047211, + "grad_norm": 0.14418430510424216, + "learning_rate": 4.004694496348267e-05, + "loss": 0.6354, + "step": 7980 + }, + { + "epoch": 0.7136087267525035, + "grad_norm": 0.16552359882762688, + "learning_rate": 4.0023769200231395e-05, + "loss": 0.699, + "step": 7981 + }, + { + "epoch": 0.7136981402002861, + "grad_norm": 0.15290565526930694, + "learning_rate": 4.0000598467114214e-05, + "loss": 0.6474, + "step": 7982 + }, + { + "epoch": 0.7137875536480687, + "grad_norm": 0.13347756741106592, + "learning_rate": 3.997743276607434e-05, + "loss": 0.6222, + "step": 7983 + }, + { + "epoch": 0.7138769670958512, + "grad_norm": 0.16085528410260586, + "learning_rate": 3.995427209905469e-05, + "loss": 0.6419, + "step": 7984 + }, + { + "epoch": 0.7139663805436338, + "grad_norm": 0.1463642707377673, + "learning_rate": 3.993111646799772e-05, + "loss": 0.652, + "step": 7985 + }, + { + "epoch": 0.7140557939914163, + "grad_norm": 0.1442081457963185, + "learning_rate": 3.990796587484548e-05, + "loss": 0.6331, + "step": 7986 + }, + { + "epoch": 0.7141452074391988, + "grad_norm": 0.12909362150736398, + "learning_rate": 3.988482032153955e-05, + "loss": 0.6399, + "step": 7987 + }, + { + "epoch": 0.7142346208869814, + "grad_norm": 0.16082646695726502, + "learning_rate": 3.986167981002118e-05, + "loss": 0.6617, + "step": 7988 + }, + { + "epoch": 0.714324034334764, + "grad_norm": 0.14065675531830604, + "learning_rate": 3.983854434223107e-05, + "loss": 0.6165, + "step": 7989 + }, + { + "epoch": 0.7144134477825465, + "grad_norm": 0.11465012243865763, + "learning_rate": 3.981541392010958e-05, + "loss": 0.6084, + "step": 7990 + }, + { + "epoch": 0.714502861230329, + "grad_norm": 0.1321812150431216, + "learning_rate": 3.979228854559668e-05, + "loss": 0.6311, + "step": 7991 + }, + { + "epoch": 0.7145922746781116, + "grad_norm": 0.1381325604125119, + "learning_rate": 3.9769168220631745e-05, + "loss": 0.6141, + "step": 7992 + }, + { + "epoch": 0.7146816881258942, + "grad_norm": 0.13878809202237338, + "learning_rate": 3.974605294715402e-05, + "loss": 0.6668, + "step": 7993 + }, + { + "epoch": 0.7147711015736766, + "grad_norm": 0.12929104490843998, + "learning_rate": 3.972294272710202e-05, + "loss": 0.6423, + "step": 7994 + }, + { + "epoch": 0.7148605150214592, + "grad_norm": 0.1303861448457182, + "learning_rate": 3.9699837562414024e-05, + "loss": 0.6249, + "step": 7995 + }, + { + "epoch": 0.7149499284692418, + "grad_norm": 0.137119984443254, + "learning_rate": 3.967673745502785e-05, + "loss": 0.6365, + "step": 7996 + }, + { + "epoch": 0.7150393419170243, + "grad_norm": 0.12711369999392763, + "learning_rate": 3.965364240688083e-05, + "loss": 0.6019, + "step": 7997 + }, + { + "epoch": 0.7151287553648069, + "grad_norm": 0.1441262212511791, + "learning_rate": 3.963055241990994e-05, + "loss": 0.6643, + "step": 7998 + }, + { + "epoch": 0.7152181688125894, + "grad_norm": 0.11800634187394983, + "learning_rate": 3.96074674960517e-05, + "loss": 0.6253, + "step": 7999 + }, + { + "epoch": 0.7153075822603719, + "grad_norm": 0.12773167804973493, + "learning_rate": 3.958438763724224e-05, + "loss": 0.5956, + "step": 8000 + }, + { + "epoch": 0.7153969957081545, + "grad_norm": 0.13729229228762801, + "learning_rate": 3.956131284541722e-05, + "loss": 0.6507, + "step": 8001 + }, + { + "epoch": 0.7154864091559371, + "grad_norm": 0.14406002812582705, + "learning_rate": 3.953824312251193e-05, + "loss": 0.6496, + "step": 8002 + }, + { + "epoch": 0.7155758226037195, + "grad_norm": 0.14199019384929898, + "learning_rate": 3.951517847046113e-05, + "loss": 0.6426, + "step": 8003 + }, + { + "epoch": 0.7156652360515021, + "grad_norm": 0.13512728468061438, + "learning_rate": 3.949211889119928e-05, + "loss": 0.6372, + "step": 8004 + }, + { + "epoch": 0.7157546494992847, + "grad_norm": 0.15131327450773865, + "learning_rate": 3.946906438666037e-05, + "loss": 0.6661, + "step": 8005 + }, + { + "epoch": 0.7158440629470673, + "grad_norm": 0.12737876216570335, + "learning_rate": 3.9446014958777863e-05, + "loss": 0.6415, + "step": 8006 + }, + { + "epoch": 0.7159334763948498, + "grad_norm": 0.14028011792931802, + "learning_rate": 3.942297060948498e-05, + "loss": 0.6155, + "step": 8007 + }, + { + "epoch": 0.7160228898426323, + "grad_norm": 0.1313253003227874, + "learning_rate": 3.9399931340714436e-05, + "loss": 0.6056, + "step": 8008 + }, + { + "epoch": 0.7161123032904149, + "grad_norm": 0.12750999934325669, + "learning_rate": 3.937689715439842e-05, + "loss": 0.5722, + "step": 8009 + }, + { + "epoch": 0.7162017167381974, + "grad_norm": 0.14005729044783566, + "learning_rate": 3.935386805246882e-05, + "loss": 0.6461, + "step": 8010 + }, + { + "epoch": 0.71629113018598, + "grad_norm": 0.146782318866456, + "learning_rate": 3.933084403685712e-05, + "loss": 0.6631, + "step": 8011 + }, + { + "epoch": 0.7163805436337625, + "grad_norm": 0.12673420557264573, + "learning_rate": 3.930782510949418e-05, + "loss": 0.6532, + "step": 8012 + }, + { + "epoch": 0.716469957081545, + "grad_norm": 0.11988738779399595, + "learning_rate": 3.9284811272310715e-05, + "loss": 0.6269, + "step": 8013 + }, + { + "epoch": 0.7165593705293276, + "grad_norm": 0.1368495655116035, + "learning_rate": 3.9261802527236765e-05, + "loss": 0.6314, + "step": 8014 + }, + { + "epoch": 0.7166487839771102, + "grad_norm": 0.14435630812810446, + "learning_rate": 3.92387988762021e-05, + "loss": 0.665, + "step": 8015 + }, + { + "epoch": 0.7167381974248928, + "grad_norm": 0.13815339425430287, + "learning_rate": 3.921580032113602e-05, + "loss": 0.6024, + "step": 8016 + }, + { + "epoch": 0.7168276108726752, + "grad_norm": 0.12905125990556993, + "learning_rate": 3.919280686396732e-05, + "loss": 0.595, + "step": 8017 + }, + { + "epoch": 0.7169170243204578, + "grad_norm": 0.12529596407862134, + "learning_rate": 3.916981850662448e-05, + "loss": 0.5827, + "step": 8018 + }, + { + "epoch": 0.7170064377682404, + "grad_norm": 0.13869372049496406, + "learning_rate": 3.9146835251035485e-05, + "loss": 0.6289, + "step": 8019 + }, + { + "epoch": 0.7170958512160229, + "grad_norm": 0.12847938069945775, + "learning_rate": 3.9123857099127936e-05, + "loss": 0.6324, + "step": 8020 + }, + { + "epoch": 0.7171852646638054, + "grad_norm": 0.14025815221322177, + "learning_rate": 3.910088405282897e-05, + "loss": 0.6469, + "step": 8021 + }, + { + "epoch": 0.717274678111588, + "grad_norm": 0.13519676147974494, + "learning_rate": 3.907791611406534e-05, + "loss": 0.6477, + "step": 8022 + }, + { + "epoch": 0.7173640915593705, + "grad_norm": 0.14021700386520866, + "learning_rate": 3.9054953284763284e-05, + "loss": 0.5844, + "step": 8023 + }, + { + "epoch": 0.7174535050071531, + "grad_norm": 0.14040495087855423, + "learning_rate": 3.9031995566848687e-05, + "loss": 0.6287, + "step": 8024 + }, + { + "epoch": 0.7175429184549357, + "grad_norm": 0.14780411248217737, + "learning_rate": 3.900904296224702e-05, + "loss": 0.6034, + "step": 8025 + }, + { + "epoch": 0.7176323319027181, + "grad_norm": 0.13080568829802955, + "learning_rate": 3.89860954728832e-05, + "loss": 0.6531, + "step": 8026 + }, + { + "epoch": 0.7177217453505007, + "grad_norm": 0.1284889361620211, + "learning_rate": 3.896315310068194e-05, + "loss": 0.6491, + "step": 8027 + }, + { + "epoch": 0.7178111587982833, + "grad_norm": 0.11392405424430553, + "learning_rate": 3.8940215847567274e-05, + "loss": 0.5608, + "step": 8028 + }, + { + "epoch": 0.7179005722460658, + "grad_norm": 0.13344950805273506, + "learning_rate": 3.891728371546297e-05, + "loss": 0.6074, + "step": 8029 + }, + { + "epoch": 0.7179899856938483, + "grad_norm": 0.13793147830125935, + "learning_rate": 3.88943567062923e-05, + "loss": 0.5863, + "step": 8030 + }, + { + "epoch": 0.7180793991416309, + "grad_norm": 0.13960327374205891, + "learning_rate": 3.887143482197818e-05, + "loss": 0.6273, + "step": 8031 + }, + { + "epoch": 0.7181688125894135, + "grad_norm": 0.11937085547889992, + "learning_rate": 3.884851806444296e-05, + "loss": 0.598, + "step": 8032 + }, + { + "epoch": 0.718258226037196, + "grad_norm": 0.13332716324070273, + "learning_rate": 3.882560643560869e-05, + "loss": 0.6398, + "step": 8033 + }, + { + "epoch": 0.7183476394849786, + "grad_norm": 0.12947586746973366, + "learning_rate": 3.880269993739691e-05, + "loss": 0.6613, + "step": 8034 + }, + { + "epoch": 0.718437052932761, + "grad_norm": 0.1337117057637897, + "learning_rate": 3.8779798571728786e-05, + "loss": 0.6227, + "step": 8035 + }, + { + "epoch": 0.7185264663805436, + "grad_norm": 0.12823252357916698, + "learning_rate": 3.8756902340525046e-05, + "loss": 0.5806, + "step": 8036 + }, + { + "epoch": 0.7186158798283262, + "grad_norm": 0.14152144918815304, + "learning_rate": 3.8734011245705924e-05, + "loss": 0.6415, + "step": 8037 + }, + { + "epoch": 0.7187052932761088, + "grad_norm": 0.136791333824304, + "learning_rate": 3.871112528919128e-05, + "loss": 0.6545, + "step": 8038 + }, + { + "epoch": 0.7187947067238912, + "grad_norm": 0.12296311765569005, + "learning_rate": 3.868824447290058e-05, + "loss": 0.6039, + "step": 8039 + }, + { + "epoch": 0.7188841201716738, + "grad_norm": 0.13495440035053263, + "learning_rate": 3.866536879875269e-05, + "loss": 0.5894, + "step": 8040 + }, + { + "epoch": 0.7189735336194564, + "grad_norm": 0.1316458130257091, + "learning_rate": 3.86424982686663e-05, + "loss": 0.6233, + "step": 8041 + }, + { + "epoch": 0.719062947067239, + "grad_norm": 0.12909250781409395, + "learning_rate": 3.861963288455949e-05, + "loss": 0.6367, + "step": 8042 + }, + { + "epoch": 0.7191523605150214, + "grad_norm": 0.1534112751309429, + "learning_rate": 3.8596772648349924e-05, + "loss": 0.6264, + "step": 8043 + }, + { + "epoch": 0.719241773962804, + "grad_norm": 0.14045991922752318, + "learning_rate": 3.857391756195487e-05, + "loss": 0.6756, + "step": 8044 + }, + { + "epoch": 0.7193311874105865, + "grad_norm": 0.1309112277797971, + "learning_rate": 3.85510676272912e-05, + "loss": 0.6308, + "step": 8045 + }, + { + "epoch": 0.7194206008583691, + "grad_norm": 0.13718693135684062, + "learning_rate": 3.852822284627524e-05, + "loss": 0.6221, + "step": 8046 + }, + { + "epoch": 0.7195100143061517, + "grad_norm": 0.14743293279953823, + "learning_rate": 3.8505383220823e-05, + "loss": 0.6213, + "step": 8047 + }, + { + "epoch": 0.7195994277539342, + "grad_norm": 0.13503362243459954, + "learning_rate": 3.848254875285e-05, + "loss": 0.6328, + "step": 8048 + }, + { + "epoch": 0.7196888412017167, + "grad_norm": 0.14610205013413186, + "learning_rate": 3.845971944427135e-05, + "loss": 0.6511, + "step": 8049 + }, + { + "epoch": 0.7197782546494993, + "grad_norm": 0.15504696453328798, + "learning_rate": 3.8436895297001726e-05, + "loss": 0.594, + "step": 8050 + }, + { + "epoch": 0.7198676680972819, + "grad_norm": 0.1422039815217685, + "learning_rate": 3.841407631295532e-05, + "loss": 0.6466, + "step": 8051 + }, + { + "epoch": 0.7199570815450643, + "grad_norm": 0.1351351863363672, + "learning_rate": 3.8391262494045955e-05, + "loss": 0.6306, + "step": 8052 + }, + { + "epoch": 0.7200464949928469, + "grad_norm": 0.14165541677091284, + "learning_rate": 3.8368453842187026e-05, + "loss": 0.5904, + "step": 8053 + }, + { + "epoch": 0.7201359084406295, + "grad_norm": 0.1539794034781221, + "learning_rate": 3.8345650359291384e-05, + "loss": 0.6498, + "step": 8054 + }, + { + "epoch": 0.720225321888412, + "grad_norm": 0.15179506632953219, + "learning_rate": 3.8322852047271615e-05, + "loss": 0.665, + "step": 8055 + }, + { + "epoch": 0.7203147353361946, + "grad_norm": 0.14371907328816028, + "learning_rate": 3.830005890803979e-05, + "loss": 0.6353, + "step": 8056 + }, + { + "epoch": 0.7204041487839771, + "grad_norm": 0.13813647311999888, + "learning_rate": 3.8277270943507484e-05, + "loss": 0.6413, + "step": 8057 + }, + { + "epoch": 0.7204935622317596, + "grad_norm": 0.1708724562564604, + "learning_rate": 3.8254488155585924e-05, + "loss": 0.6145, + "step": 8058 + }, + { + "epoch": 0.7205829756795422, + "grad_norm": 0.13298627567278218, + "learning_rate": 3.8231710546185895e-05, + "loss": 0.6654, + "step": 8059 + }, + { + "epoch": 0.7206723891273248, + "grad_norm": 0.1378618581236898, + "learning_rate": 3.8208938117217674e-05, + "loss": 0.6444, + "step": 8060 + }, + { + "epoch": 0.7207618025751072, + "grad_norm": 0.13105043745306427, + "learning_rate": 3.8186170870591185e-05, + "loss": 0.6375, + "step": 8061 + }, + { + "epoch": 0.7208512160228898, + "grad_norm": 0.13827255254254067, + "learning_rate": 3.8163408808215904e-05, + "loss": 0.5636, + "step": 8062 + }, + { + "epoch": 0.7209406294706724, + "grad_norm": 0.1576260376205048, + "learning_rate": 3.814065193200084e-05, + "loss": 0.6658, + "step": 8063 + }, + { + "epoch": 0.721030042918455, + "grad_norm": 0.1439086477438008, + "learning_rate": 3.8117900243854595e-05, + "loss": 0.6332, + "step": 8064 + }, + { + "epoch": 0.7211194563662375, + "grad_norm": 0.136220019051813, + "learning_rate": 3.809515374568535e-05, + "loss": 0.6534, + "step": 8065 + }, + { + "epoch": 0.72120886981402, + "grad_norm": 0.13089178665995954, + "learning_rate": 3.807241243940077e-05, + "loss": 0.6046, + "step": 8066 + }, + { + "epoch": 0.7212982832618026, + "grad_norm": 0.1339766499272589, + "learning_rate": 3.804967632690817e-05, + "loss": 0.6602, + "step": 8067 + }, + { + "epoch": 0.7213876967095851, + "grad_norm": 0.14847648519778162, + "learning_rate": 3.802694541011439e-05, + "loss": 0.6717, + "step": 8068 + }, + { + "epoch": 0.7214771101573677, + "grad_norm": 0.1396070665402114, + "learning_rate": 3.8004219690925856e-05, + "loss": 0.6099, + "step": 8069 + }, + { + "epoch": 0.7215665236051502, + "grad_norm": 0.13020402890554555, + "learning_rate": 3.7981499171248594e-05, + "loss": 0.6312, + "step": 8070 + }, + { + "epoch": 0.7216559370529327, + "grad_norm": 0.14671465917996684, + "learning_rate": 3.795878385298804e-05, + "loss": 0.6548, + "step": 8071 + }, + { + "epoch": 0.7217453505007153, + "grad_norm": 0.12841775521824075, + "learning_rate": 3.793607373804937e-05, + "loss": 0.6264, + "step": 8072 + }, + { + "epoch": 0.7218347639484979, + "grad_norm": 0.11961834256722564, + "learning_rate": 3.7913368828337285e-05, + "loss": 0.6081, + "step": 8073 + }, + { + "epoch": 0.7219241773962805, + "grad_norm": 0.12660443513283487, + "learning_rate": 3.789066912575593e-05, + "loss": 0.6246, + "step": 8074 + }, + { + "epoch": 0.7220135908440629, + "grad_norm": 0.12625533385242033, + "learning_rate": 3.78679746322091e-05, + "loss": 0.6275, + "step": 8075 + }, + { + "epoch": 0.7221030042918455, + "grad_norm": 0.14428417527200973, + "learning_rate": 3.784528534960029e-05, + "loss": 0.6107, + "step": 8076 + }, + { + "epoch": 0.7221924177396281, + "grad_norm": 0.14547582491055108, + "learning_rate": 3.782260127983229e-05, + "loss": 0.6409, + "step": 8077 + }, + { + "epoch": 0.7222818311874106, + "grad_norm": 0.14683545118894636, + "learning_rate": 3.7799922424807634e-05, + "loss": 0.6484, + "step": 8078 + }, + { + "epoch": 0.7223712446351931, + "grad_norm": 0.13432852176645843, + "learning_rate": 3.777724878642839e-05, + "loss": 0.626, + "step": 8079 + }, + { + "epoch": 0.7224606580829757, + "grad_norm": 0.12824532904414535, + "learning_rate": 3.7754580366596115e-05, + "loss": 0.6332, + "step": 8080 + }, + { + "epoch": 0.7225500715307582, + "grad_norm": 0.13336170089722582, + "learning_rate": 3.773191716721202e-05, + "loss": 0.6732, + "step": 8081 + }, + { + "epoch": 0.7226394849785408, + "grad_norm": 0.1489880531066494, + "learning_rate": 3.7709259190176816e-05, + "loss": 0.6981, + "step": 8082 + }, + { + "epoch": 0.7227288984263234, + "grad_norm": 0.14642819179946004, + "learning_rate": 3.768660643739083e-05, + "loss": 0.6134, + "step": 8083 + }, + { + "epoch": 0.7228183118741058, + "grad_norm": 0.11440176445280363, + "learning_rate": 3.766395891075394e-05, + "loss": 0.6149, + "step": 8084 + }, + { + "epoch": 0.7229077253218884, + "grad_norm": 0.13883617891735314, + "learning_rate": 3.764131661216549e-05, + "loss": 0.6467, + "step": 8085 + }, + { + "epoch": 0.722997138769671, + "grad_norm": 0.13194606422522873, + "learning_rate": 3.7618679543524503e-05, + "loss": 0.5683, + "step": 8086 + }, + { + "epoch": 0.7230865522174535, + "grad_norm": 0.13829902148154505, + "learning_rate": 3.759604770672953e-05, + "loss": 0.6079, + "step": 8087 + }, + { + "epoch": 0.723175965665236, + "grad_norm": 0.1508315730758705, + "learning_rate": 3.757342110367871e-05, + "loss": 0.637, + "step": 8088 + }, + { + "epoch": 0.7232653791130186, + "grad_norm": 0.12588185348764905, + "learning_rate": 3.755079973626959e-05, + "loss": 0.6156, + "step": 8089 + }, + { + "epoch": 0.7233547925608012, + "grad_norm": 0.12706607720061647, + "learning_rate": 3.752818360639956e-05, + "loss": 0.6071, + "step": 8090 + }, + { + "epoch": 0.7234442060085837, + "grad_norm": 0.14831825048333697, + "learning_rate": 3.7505572715965284e-05, + "loss": 0.6705, + "step": 8091 + }, + { + "epoch": 0.7235336194563662, + "grad_norm": 0.1304065350599283, + "learning_rate": 3.748296706686315e-05, + "loss": 0.6312, + "step": 8092 + }, + { + "epoch": 0.7236230329041488, + "grad_norm": 0.16509771631897088, + "learning_rate": 3.74603666609891e-05, + "loss": 0.6504, + "step": 8093 + }, + { + "epoch": 0.7237124463519313, + "grad_norm": 0.15340294263532644, + "learning_rate": 3.7437771500238526e-05, + "loss": 0.6712, + "step": 8094 + }, + { + "epoch": 0.7238018597997139, + "grad_norm": 0.12507501484957606, + "learning_rate": 3.741518158650648e-05, + "loss": 0.6286, + "step": 8095 + }, + { + "epoch": 0.7238912732474965, + "grad_norm": 0.1426750933904862, + "learning_rate": 3.739259692168764e-05, + "loss": 0.6234, + "step": 8096 + }, + { + "epoch": 0.7239806866952789, + "grad_norm": 0.12660811125017038, + "learning_rate": 3.737001750767604e-05, + "loss": 0.6245, + "step": 8097 + }, + { + "epoch": 0.7240701001430615, + "grad_norm": 0.14147012119580563, + "learning_rate": 3.734744334636544e-05, + "loss": 0.6644, + "step": 8098 + }, + { + "epoch": 0.7241595135908441, + "grad_norm": 0.1342934530798288, + "learning_rate": 3.732487443964914e-05, + "loss": 0.6278, + "step": 8099 + }, + { + "epoch": 0.7242489270386266, + "grad_norm": 0.15291068146078157, + "learning_rate": 3.730231078941988e-05, + "loss": 0.6723, + "step": 8100 + }, + { + "epoch": 0.7243383404864091, + "grad_norm": 0.1380376058468556, + "learning_rate": 3.727975239757011e-05, + "loss": 0.6421, + "step": 8101 + }, + { + "epoch": 0.7244277539341917, + "grad_norm": 0.11712452002074557, + "learning_rate": 3.725719926599175e-05, + "loss": 0.6535, + "step": 8102 + }, + { + "epoch": 0.7245171673819742, + "grad_norm": 0.13052583041407778, + "learning_rate": 3.723465139657632e-05, + "loss": 0.6305, + "step": 8103 + }, + { + "epoch": 0.7246065808297568, + "grad_norm": 0.13156134239827003, + "learning_rate": 3.72121087912149e-05, + "loss": 0.6396, + "step": 8104 + }, + { + "epoch": 0.7246959942775394, + "grad_norm": 0.1290040351733166, + "learning_rate": 3.7189571451798065e-05, + "loss": 0.5968, + "step": 8105 + }, + { + "epoch": 0.7247854077253219, + "grad_norm": 0.15598336457113382, + "learning_rate": 3.7167039380216005e-05, + "loss": 0.6823, + "step": 8106 + }, + { + "epoch": 0.7248748211731044, + "grad_norm": 0.13869847147862083, + "learning_rate": 3.714451257835852e-05, + "loss": 0.646, + "step": 8107 + }, + { + "epoch": 0.724964234620887, + "grad_norm": 0.14222680609662666, + "learning_rate": 3.71219910481148e-05, + "loss": 0.6611, + "step": 8108 + }, + { + "epoch": 0.7250536480686696, + "grad_norm": 0.14814708099525745, + "learning_rate": 3.7099474791373736e-05, + "loss": 0.6826, + "step": 8109 + }, + { + "epoch": 0.725143061516452, + "grad_norm": 0.15343465349589133, + "learning_rate": 3.707696381002381e-05, + "loss": 0.6588, + "step": 8110 + }, + { + "epoch": 0.7252324749642346, + "grad_norm": 0.15273453545902257, + "learning_rate": 3.705445810595291e-05, + "loss": 0.6581, + "step": 8111 + }, + { + "epoch": 0.7253218884120172, + "grad_norm": 0.1250862359225949, + "learning_rate": 3.7031957681048604e-05, + "loss": 0.6559, + "step": 8112 + }, + { + "epoch": 0.7254113018597997, + "grad_norm": 0.15787074785985372, + "learning_rate": 3.700946253719798e-05, + "loss": 0.6605, + "step": 8113 + }, + { + "epoch": 0.7255007153075823, + "grad_norm": 0.1213822187114857, + "learning_rate": 3.6986972676287626e-05, + "loss": 0.6036, + "step": 8114 + }, + { + "epoch": 0.7255901287553648, + "grad_norm": 0.13274833653501936, + "learning_rate": 3.6964488100203776e-05, + "loss": 0.6675, + "step": 8115 + }, + { + "epoch": 0.7256795422031473, + "grad_norm": 0.14239873941323086, + "learning_rate": 3.6942008810832184e-05, + "loss": 0.6102, + "step": 8116 + }, + { + "epoch": 0.7257689556509299, + "grad_norm": 0.12367776195214875, + "learning_rate": 3.6919534810058154e-05, + "loss": 0.6076, + "step": 8117 + }, + { + "epoch": 0.7258583690987125, + "grad_norm": 0.1228257647693858, + "learning_rate": 3.68970660997666e-05, + "loss": 0.6281, + "step": 8118 + }, + { + "epoch": 0.725947782546495, + "grad_norm": 0.12613094219277718, + "learning_rate": 3.687460268184185e-05, + "loss": 0.6241, + "step": 8119 + }, + { + "epoch": 0.7260371959942775, + "grad_norm": 0.14367529445338822, + "learning_rate": 3.685214455816796e-05, + "loss": 0.6299, + "step": 8120 + }, + { + "epoch": 0.7261266094420601, + "grad_norm": 0.13992148984339453, + "learning_rate": 3.682969173062842e-05, + "loss": 0.6186, + "step": 8121 + }, + { + "epoch": 0.7262160228898427, + "grad_norm": 0.13294768180453664, + "learning_rate": 3.6807244201106394e-05, + "loss": 0.6229, + "step": 8122 + }, + { + "epoch": 0.7263054363376252, + "grad_norm": 0.14501855462885616, + "learning_rate": 3.6784801971484405e-05, + "loss": 0.6127, + "step": 8123 + }, + { + "epoch": 0.7263948497854077, + "grad_norm": 0.1305348002760169, + "learning_rate": 3.6762365043644806e-05, + "loss": 0.6462, + "step": 8124 + }, + { + "epoch": 0.7264842632331903, + "grad_norm": 0.13269008228179585, + "learning_rate": 3.673993341946924e-05, + "loss": 0.6118, + "step": 8125 + }, + { + "epoch": 0.7265736766809728, + "grad_norm": 0.13676560845714922, + "learning_rate": 3.671750710083906e-05, + "loss": 0.6437, + "step": 8126 + }, + { + "epoch": 0.7266630901287554, + "grad_norm": 0.14397996884003436, + "learning_rate": 3.6695086089635156e-05, + "loss": 0.6531, + "step": 8127 + }, + { + "epoch": 0.7267525035765379, + "grad_norm": 0.1309420062217295, + "learning_rate": 3.667267038773791e-05, + "loss": 0.6198, + "step": 8128 + }, + { + "epoch": 0.7268419170243204, + "grad_norm": 0.13942889106097797, + "learning_rate": 3.6650259997027315e-05, + "loss": 0.6655, + "step": 8129 + }, + { + "epoch": 0.726931330472103, + "grad_norm": 0.16351411064105983, + "learning_rate": 3.66278549193829e-05, + "loss": 0.6644, + "step": 8130 + }, + { + "epoch": 0.7270207439198856, + "grad_norm": 0.13562286939122284, + "learning_rate": 3.6605455156683766e-05, + "loss": 0.6734, + "step": 8131 + }, + { + "epoch": 0.727110157367668, + "grad_norm": 0.1371071703522805, + "learning_rate": 3.658306071080855e-05, + "loss": 0.6412, + "step": 8132 + }, + { + "epoch": 0.7271995708154506, + "grad_norm": 0.14585080077366516, + "learning_rate": 3.6560671583635467e-05, + "loss": 0.6556, + "step": 8133 + }, + { + "epoch": 0.7272889842632332, + "grad_norm": 0.13760665006689968, + "learning_rate": 3.6538287777042215e-05, + "loss": 0.6396, + "step": 8134 + }, + { + "epoch": 0.7273783977110158, + "grad_norm": 0.1357648225631361, + "learning_rate": 3.6515909292906126e-05, + "loss": 0.6282, + "step": 8135 + }, + { + "epoch": 0.7274678111587983, + "grad_norm": 0.1289839534221374, + "learning_rate": 3.649353613310409e-05, + "loss": 0.6257, + "step": 8136 + }, + { + "epoch": 0.7275572246065808, + "grad_norm": 0.13696789673323312, + "learning_rate": 3.6471168299512405e-05, + "loss": 0.6207, + "step": 8137 + }, + { + "epoch": 0.7276466380543634, + "grad_norm": 0.13935380632220368, + "learning_rate": 3.644880579400719e-05, + "loss": 0.6638, + "step": 8138 + }, + { + "epoch": 0.7277360515021459, + "grad_norm": 0.1492787348731475, + "learning_rate": 3.6426448618463836e-05, + "loss": 0.6709, + "step": 8139 + }, + { + "epoch": 0.7278254649499285, + "grad_norm": 0.11117388434801423, + "learning_rate": 3.640409677475748e-05, + "loss": 0.6351, + "step": 8140 + }, + { + "epoch": 0.727914878397711, + "grad_norm": 0.12415206467138871, + "learning_rate": 3.6381750264762734e-05, + "loss": 0.6185, + "step": 8141 + }, + { + "epoch": 0.7280042918454935, + "grad_norm": 0.12931922383507735, + "learning_rate": 3.6359409090353744e-05, + "loss": 0.6312, + "step": 8142 + }, + { + "epoch": 0.7280937052932761, + "grad_norm": 0.13412868275524406, + "learning_rate": 3.633707325340425e-05, + "loss": 0.6612, + "step": 8143 + }, + { + "epoch": 0.7281831187410587, + "grad_norm": 0.1378179531105397, + "learning_rate": 3.631474275578754e-05, + "loss": 0.6198, + "step": 8144 + }, + { + "epoch": 0.7282725321888412, + "grad_norm": 0.12843883200215345, + "learning_rate": 3.6292417599376436e-05, + "loss": 0.5955, + "step": 8145 + }, + { + "epoch": 0.7283619456366237, + "grad_norm": 0.11682711975909932, + "learning_rate": 3.627009778604333e-05, + "loss": 0.6375, + "step": 8146 + }, + { + "epoch": 0.7284513590844063, + "grad_norm": 0.13170741438255548, + "learning_rate": 3.624778331766019e-05, + "loss": 0.6298, + "step": 8147 + }, + { + "epoch": 0.7285407725321889, + "grad_norm": 0.1329167954045852, + "learning_rate": 3.6225474196098444e-05, + "loss": 0.6539, + "step": 8148 + }, + { + "epoch": 0.7286301859799714, + "grad_norm": 0.13170069506036172, + "learning_rate": 3.620317042322915e-05, + "loss": 0.5981, + "step": 8149 + }, + { + "epoch": 0.7287195994277539, + "grad_norm": 0.1309700547500036, + "learning_rate": 3.6180872000922935e-05, + "loss": 0.6399, + "step": 8150 + }, + { + "epoch": 0.7288090128755365, + "grad_norm": 0.12799938851148912, + "learning_rate": 3.615857893104986e-05, + "loss": 0.6389, + "step": 8151 + }, + { + "epoch": 0.728898426323319, + "grad_norm": 0.14367170879642874, + "learning_rate": 3.613629121547969e-05, + "loss": 0.6431, + "step": 8152 + }, + { + "epoch": 0.7289878397711016, + "grad_norm": 0.13104395749693273, + "learning_rate": 3.611400885608168e-05, + "loss": 0.6334, + "step": 8153 + }, + { + "epoch": 0.7290772532188842, + "grad_norm": 0.1509785437074301, + "learning_rate": 3.6091731854724566e-05, + "loss": 0.6447, + "step": 8154 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.13815042193117663, + "learning_rate": 3.606946021327672e-05, + "loss": 0.5985, + "step": 8155 + }, + { + "epoch": 0.7292560801144492, + "grad_norm": 0.142765332705094, + "learning_rate": 3.604719393360606e-05, + "loss": 0.6327, + "step": 8156 + }, + { + "epoch": 0.7293454935622318, + "grad_norm": 0.16873316052242393, + "learning_rate": 3.6024933017579984e-05, + "loss": 0.6354, + "step": 8157 + }, + { + "epoch": 0.7294349070100143, + "grad_norm": 0.13652212379551962, + "learning_rate": 3.600267746706552e-05, + "loss": 0.6623, + "step": 8158 + }, + { + "epoch": 0.7295243204577968, + "grad_norm": 0.12964963222238335, + "learning_rate": 3.59804272839292e-05, + "loss": 0.6093, + "step": 8159 + }, + { + "epoch": 0.7296137339055794, + "grad_norm": 0.13537032250728182, + "learning_rate": 3.595818247003713e-05, + "loss": 0.6204, + "step": 8160 + }, + { + "epoch": 0.729703147353362, + "grad_norm": 0.14323126839108397, + "learning_rate": 3.593594302725498e-05, + "loss": 0.6391, + "step": 8161 + }, + { + "epoch": 0.7297925608011445, + "grad_norm": 0.12410763939516536, + "learning_rate": 3.591370895744789e-05, + "loss": 0.6185, + "step": 8162 + }, + { + "epoch": 0.7298819742489271, + "grad_norm": 0.13329691811505848, + "learning_rate": 3.5891480262480635e-05, + "loss": 0.6238, + "step": 8163 + }, + { + "epoch": 0.7299713876967096, + "grad_norm": 0.1257210760876514, + "learning_rate": 3.5869256944217535e-05, + "loss": 0.5829, + "step": 8164 + }, + { + "epoch": 0.7300608011444921, + "grad_norm": 0.12195256386270548, + "learning_rate": 3.584703900452234e-05, + "loss": 0.5794, + "step": 8165 + }, + { + "epoch": 0.7301502145922747, + "grad_norm": 0.125167369883187, + "learning_rate": 3.582482644525854e-05, + "loss": 0.6107, + "step": 8166 + }, + { + "epoch": 0.7302396280400573, + "grad_norm": 0.12991104080204838, + "learning_rate": 3.580261926828908e-05, + "loss": 0.5694, + "step": 8167 + }, + { + "epoch": 0.7303290414878397, + "grad_norm": 0.14884535973818191, + "learning_rate": 3.578041747547638e-05, + "loss": 0.6244, + "step": 8168 + }, + { + "epoch": 0.7304184549356223, + "grad_norm": 0.13933886839924828, + "learning_rate": 3.57582210686825e-05, + "loss": 0.6704, + "step": 8169 + }, + { + "epoch": 0.7305078683834049, + "grad_norm": 0.13987730403311366, + "learning_rate": 3.5736030049769074e-05, + "loss": 0.6604, + "step": 8170 + }, + { + "epoch": 0.7305972818311874, + "grad_norm": 0.13139760273295592, + "learning_rate": 3.5713844420597155e-05, + "loss": 0.6489, + "step": 8171 + }, + { + "epoch": 0.73068669527897, + "grad_norm": 0.1298458245982437, + "learning_rate": 3.569166418302747e-05, + "loss": 0.5948, + "step": 8172 + }, + { + "epoch": 0.7307761087267525, + "grad_norm": 0.11874504648097822, + "learning_rate": 3.566948933892025e-05, + "loss": 0.6464, + "step": 8173 + }, + { + "epoch": 0.730865522174535, + "grad_norm": 0.14015162745937476, + "learning_rate": 3.564731989013527e-05, + "loss": 0.618, + "step": 8174 + }, + { + "epoch": 0.7309549356223176, + "grad_norm": 0.13797961388973046, + "learning_rate": 3.5625155838531877e-05, + "loss": 0.6185, + "step": 8175 + }, + { + "epoch": 0.7310443490701002, + "grad_norm": 0.11692042470579059, + "learning_rate": 3.560299718596889e-05, + "loss": 0.6227, + "step": 8176 + }, + { + "epoch": 0.7311337625178826, + "grad_norm": 0.15506165579850292, + "learning_rate": 3.558084393430475e-05, + "loss": 0.6692, + "step": 8177 + }, + { + "epoch": 0.7312231759656652, + "grad_norm": 0.13766190929690855, + "learning_rate": 3.555869608539743e-05, + "loss": 0.6567, + "step": 8178 + }, + { + "epoch": 0.7313125894134478, + "grad_norm": 0.12489575849388837, + "learning_rate": 3.5536553641104465e-05, + "loss": 0.5951, + "step": 8179 + }, + { + "epoch": 0.7314020028612304, + "grad_norm": 0.1495888873672835, + "learning_rate": 3.5514416603282876e-05, + "loss": 0.6382, + "step": 8180 + }, + { + "epoch": 0.7314914163090128, + "grad_norm": 0.14522216011622693, + "learning_rate": 3.549228497378932e-05, + "loss": 0.6309, + "step": 8181 + }, + { + "epoch": 0.7315808297567954, + "grad_norm": 0.1296501325918048, + "learning_rate": 3.547015875447989e-05, + "loss": 0.6458, + "step": 8182 + }, + { + "epoch": 0.731670243204578, + "grad_norm": 0.11768105121167932, + "learning_rate": 3.544803794721031e-05, + "loss": 0.6324, + "step": 8183 + }, + { + "epoch": 0.7317596566523605, + "grad_norm": 0.15132311898828282, + "learning_rate": 3.542592255383586e-05, + "loss": 0.6601, + "step": 8184 + }, + { + "epoch": 0.7318490701001431, + "grad_norm": 0.14495809384121683, + "learning_rate": 3.5403812576211246e-05, + "loss": 0.6201, + "step": 8185 + }, + { + "epoch": 0.7319384835479256, + "grad_norm": 0.1371409530143683, + "learning_rate": 3.538170801619088e-05, + "loss": 0.6161, + "step": 8186 + }, + { + "epoch": 0.7320278969957081, + "grad_norm": 0.12447192146784954, + "learning_rate": 3.535960887562866e-05, + "loss": 0.5842, + "step": 8187 + }, + { + "epoch": 0.7321173104434907, + "grad_norm": 0.13258774844253457, + "learning_rate": 3.533751515637794e-05, + "loss": 0.6103, + "step": 8188 + }, + { + "epoch": 0.7322067238912733, + "grad_norm": 0.13610971652648535, + "learning_rate": 3.531542686029173e-05, + "loss": 0.612, + "step": 8189 + }, + { + "epoch": 0.7322961373390557, + "grad_norm": 0.14511156185839647, + "learning_rate": 3.5293343989222593e-05, + "loss": 0.621, + "step": 8190 + }, + { + "epoch": 0.7323855507868383, + "grad_norm": 0.13024921683543556, + "learning_rate": 3.527126654502252e-05, + "loss": 0.6202, + "step": 8191 + }, + { + "epoch": 0.7324749642346209, + "grad_norm": 0.1363826799353108, + "learning_rate": 3.5249194529543137e-05, + "loss": 0.6247, + "step": 8192 + }, + { + "epoch": 0.7325643776824035, + "grad_norm": 0.15453892341611036, + "learning_rate": 3.5227127944635606e-05, + "loss": 0.7021, + "step": 8193 + }, + { + "epoch": 0.732653791130186, + "grad_norm": 0.13822317511756124, + "learning_rate": 3.520506679215064e-05, + "loss": 0.6481, + "step": 8194 + }, + { + "epoch": 0.7327432045779685, + "grad_norm": 0.14200890747842718, + "learning_rate": 3.51830110739385e-05, + "loss": 0.6222, + "step": 8195 + }, + { + "epoch": 0.7328326180257511, + "grad_norm": 0.1268575436711969, + "learning_rate": 3.516096079184891e-05, + "loss": 0.624, + "step": 8196 + }, + { + "epoch": 0.7329220314735336, + "grad_norm": 0.13751148887706685, + "learning_rate": 3.513891594773123e-05, + "loss": 0.611, + "step": 8197 + }, + { + "epoch": 0.7330114449213162, + "grad_norm": 0.11962581451453774, + "learning_rate": 3.5116876543434374e-05, + "loss": 0.5983, + "step": 8198 + }, + { + "epoch": 0.7331008583690987, + "grad_norm": 0.13369683208682454, + "learning_rate": 3.509484258080665e-05, + "loss": 0.5953, + "step": 8199 + }, + { + "epoch": 0.7331902718168812, + "grad_norm": 0.1378567236851521, + "learning_rate": 3.507281406169614e-05, + "loss": 0.6715, + "step": 8200 + }, + { + "epoch": 0.7332796852646638, + "grad_norm": 0.12269563940272611, + "learning_rate": 3.505079098795032e-05, + "loss": 0.6088, + "step": 8201 + }, + { + "epoch": 0.7333690987124464, + "grad_norm": 0.12811795938809056, + "learning_rate": 3.502877336141619e-05, + "loss": 0.6683, + "step": 8202 + }, + { + "epoch": 0.733458512160229, + "grad_norm": 0.14452773582552336, + "learning_rate": 3.5006761183940386e-05, + "loss": 0.6191, + "step": 8203 + }, + { + "epoch": 0.7335479256080114, + "grad_norm": 0.13917980960625387, + "learning_rate": 3.498475445736905e-05, + "loss": 0.6606, + "step": 8204 + }, + { + "epoch": 0.733637339055794, + "grad_norm": 0.14196141074849394, + "learning_rate": 3.4962753183547806e-05, + "loss": 0.623, + "step": 8205 + }, + { + "epoch": 0.7337267525035766, + "grad_norm": 0.14551367105274055, + "learning_rate": 3.4940757364321906e-05, + "loss": 0.6628, + "step": 8206 + }, + { + "epoch": 0.7338161659513591, + "grad_norm": 0.15390696672672333, + "learning_rate": 3.491876700153612e-05, + "loss": 0.6339, + "step": 8207 + }, + { + "epoch": 0.7339055793991416, + "grad_norm": 0.13671099250232802, + "learning_rate": 3.489678209703475e-05, + "loss": 0.6003, + "step": 8208 + }, + { + "epoch": 0.7339949928469242, + "grad_norm": 0.1378086209724288, + "learning_rate": 3.487480265266164e-05, + "loss": 0.6568, + "step": 8209 + }, + { + "epoch": 0.7340844062947067, + "grad_norm": 0.1653125459862054, + "learning_rate": 3.485282867026021e-05, + "loss": 0.6251, + "step": 8210 + }, + { + "epoch": 0.7341738197424893, + "grad_norm": 0.13021293011040228, + "learning_rate": 3.483086015167333e-05, + "loss": 0.6516, + "step": 8211 + }, + { + "epoch": 0.7342632331902719, + "grad_norm": 0.1354856589563358, + "learning_rate": 3.48088970987435e-05, + "loss": 0.6423, + "step": 8212 + }, + { + "epoch": 0.7343526466380543, + "grad_norm": 0.12655257386821472, + "learning_rate": 3.4786939513312744e-05, + "loss": 0.657, + "step": 8213 + }, + { + "epoch": 0.7344420600858369, + "grad_norm": 0.12482569128996195, + "learning_rate": 3.4764987397222614e-05, + "loss": 0.5938, + "step": 8214 + }, + { + "epoch": 0.7345314735336195, + "grad_norm": 0.15697451229300635, + "learning_rate": 3.474304075231424e-05, + "loss": 0.6607, + "step": 8215 + }, + { + "epoch": 0.734620886981402, + "grad_norm": 0.15648477737355246, + "learning_rate": 3.472109958042819e-05, + "loss": 0.71, + "step": 8216 + }, + { + "epoch": 0.7347103004291845, + "grad_norm": 0.1306608915481164, + "learning_rate": 3.4699163883404685e-05, + "loss": 0.5754, + "step": 8217 + }, + { + "epoch": 0.7347997138769671, + "grad_norm": 0.1434619535860872, + "learning_rate": 3.467723366308348e-05, + "loss": 0.6902, + "step": 8218 + }, + { + "epoch": 0.7348891273247496, + "grad_norm": 0.14372155439325582, + "learning_rate": 3.465530892130375e-05, + "loss": 0.6744, + "step": 8219 + }, + { + "epoch": 0.7349785407725322, + "grad_norm": 0.1297456885823888, + "learning_rate": 3.4633389659904324e-05, + "loss": 0.6284, + "step": 8220 + }, + { + "epoch": 0.7350679542203148, + "grad_norm": 0.1515399908421105, + "learning_rate": 3.461147588072362e-05, + "loss": 0.6353, + "step": 8221 + }, + { + "epoch": 0.7351573676680973, + "grad_norm": 0.13729588241304558, + "learning_rate": 3.458956758559945e-05, + "loss": 0.6216, + "step": 8222 + }, + { + "epoch": 0.7352467811158798, + "grad_norm": 0.14043376158704682, + "learning_rate": 3.4567664776369236e-05, + "loss": 0.6399, + "step": 8223 + }, + { + "epoch": 0.7353361945636624, + "grad_norm": 0.144361717701065, + "learning_rate": 3.4545767454869995e-05, + "loss": 0.6278, + "step": 8224 + }, + { + "epoch": 0.735425608011445, + "grad_norm": 0.16352436059515074, + "learning_rate": 3.452387562293814e-05, + "loss": 0.6533, + "step": 8225 + }, + { + "epoch": 0.7355150214592274, + "grad_norm": 0.14190698312139804, + "learning_rate": 3.4501989282409776e-05, + "loss": 0.6285, + "step": 8226 + }, + { + "epoch": 0.73560443490701, + "grad_norm": 0.14641269949344482, + "learning_rate": 3.448010843512046e-05, + "loss": 0.6389, + "step": 8227 + }, + { + "epoch": 0.7356938483547926, + "grad_norm": 0.13829651351600608, + "learning_rate": 3.4458233082905334e-05, + "loss": 0.6415, + "step": 8228 + }, + { + "epoch": 0.7357832618025751, + "grad_norm": 0.13674756388272644, + "learning_rate": 3.443636322759908e-05, + "loss": 0.6139, + "step": 8229 + }, + { + "epoch": 0.7358726752503576, + "grad_norm": 0.1338260855859196, + "learning_rate": 3.4414498871035816e-05, + "loss": 0.5714, + "step": 8230 + }, + { + "epoch": 0.7359620886981402, + "grad_norm": 0.13314054973946557, + "learning_rate": 3.4392640015049325e-05, + "loss": 0.6756, + "step": 8231 + }, + { + "epoch": 0.7360515021459227, + "grad_norm": 0.16358120893941144, + "learning_rate": 3.437078666147292e-05, + "loss": 0.6152, + "step": 8232 + }, + { + "epoch": 0.7361409155937053, + "grad_norm": 0.13154564926088774, + "learning_rate": 3.434893881213934e-05, + "loss": 0.5769, + "step": 8233 + }, + { + "epoch": 0.7362303290414879, + "grad_norm": 0.12763274813000075, + "learning_rate": 3.432709646888095e-05, + "loss": 0.6425, + "step": 8234 + }, + { + "epoch": 0.7363197424892703, + "grad_norm": 0.1275856456915021, + "learning_rate": 3.430525963352973e-05, + "loss": 0.6556, + "step": 8235 + }, + { + "epoch": 0.7364091559370529, + "grad_norm": 0.13645982850954355, + "learning_rate": 3.428342830791701e-05, + "loss": 0.6225, + "step": 8236 + }, + { + "epoch": 0.7364985693848355, + "grad_norm": 0.13596275560697166, + "learning_rate": 3.426160249387379e-05, + "loss": 0.6024, + "step": 8237 + }, + { + "epoch": 0.7365879828326181, + "grad_norm": 0.13779983259510667, + "learning_rate": 3.423978219323062e-05, + "loss": 0.6646, + "step": 8238 + }, + { + "epoch": 0.7366773962804005, + "grad_norm": 0.1231520197110808, + "learning_rate": 3.421796740781745e-05, + "loss": 0.6259, + "step": 8239 + }, + { + "epoch": 0.7367668097281831, + "grad_norm": 0.12523942380380992, + "learning_rate": 3.4196158139463915e-05, + "loss": 0.6172, + "step": 8240 + }, + { + "epoch": 0.7368562231759657, + "grad_norm": 0.15413785550288386, + "learning_rate": 3.417435438999914e-05, + "loss": 0.6591, + "step": 8241 + }, + { + "epoch": 0.7369456366237482, + "grad_norm": 0.1588520504980296, + "learning_rate": 3.4152556161251744e-05, + "loss": 0.6028, + "step": 8242 + }, + { + "epoch": 0.7370350500715308, + "grad_norm": 0.14566141484066006, + "learning_rate": 3.413076345504995e-05, + "loss": 0.5985, + "step": 8243 + }, + { + "epoch": 0.7371244635193133, + "grad_norm": 0.13584494879735765, + "learning_rate": 3.410897627322152e-05, + "loss": 0.6483, + "step": 8244 + }, + { + "epoch": 0.7372138769670958, + "grad_norm": 0.1349900713312298, + "learning_rate": 3.408719461759362e-05, + "loss": 0.6422, + "step": 8245 + }, + { + "epoch": 0.7373032904148784, + "grad_norm": 0.1477450454104541, + "learning_rate": 3.406541848999312e-05, + "loss": 0.6289, + "step": 8246 + }, + { + "epoch": 0.737392703862661, + "grad_norm": 0.1320950113681908, + "learning_rate": 3.404364789224637e-05, + "loss": 0.5969, + "step": 8247 + }, + { + "epoch": 0.7374821173104434, + "grad_norm": 0.14392637290862037, + "learning_rate": 3.402188282617914e-05, + "loss": 0.6804, + "step": 8248 + }, + { + "epoch": 0.737571530758226, + "grad_norm": 0.1402443743902852, + "learning_rate": 3.4000123293616995e-05, + "loss": 0.6432, + "step": 8249 + }, + { + "epoch": 0.7376609442060086, + "grad_norm": 0.13378073807391708, + "learning_rate": 3.397836929638476e-05, + "loss": 0.6562, + "step": 8250 + }, + { + "epoch": 0.7377503576537912, + "grad_norm": 0.1347474611713913, + "learning_rate": 3.395662083630696e-05, + "loss": 0.5874, + "step": 8251 + }, + { + "epoch": 0.7378397711015737, + "grad_norm": 0.12500460154862142, + "learning_rate": 3.393487791520765e-05, + "loss": 0.608, + "step": 8252 + }, + { + "epoch": 0.7379291845493562, + "grad_norm": 0.1326444183379459, + "learning_rate": 3.391314053491031e-05, + "loss": 0.5987, + "step": 8253 + }, + { + "epoch": 0.7380185979971388, + "grad_norm": 0.15760680096243446, + "learning_rate": 3.389140869723806e-05, + "loss": 0.6139, + "step": 8254 + }, + { + "epoch": 0.7381080114449213, + "grad_norm": 0.15690130990747722, + "learning_rate": 3.3869682404013516e-05, + "loss": 0.6278, + "step": 8255 + }, + { + "epoch": 0.7381974248927039, + "grad_norm": 0.12813864676603606, + "learning_rate": 3.3847961657058845e-05, + "loss": 0.631, + "step": 8256 + }, + { + "epoch": 0.7382868383404864, + "grad_norm": 0.14024449958322063, + "learning_rate": 3.382624645819574e-05, + "loss": 0.6546, + "step": 8257 + }, + { + "epoch": 0.7383762517882689, + "grad_norm": 0.15087438861420627, + "learning_rate": 3.3804536809245455e-05, + "loss": 0.6182, + "step": 8258 + }, + { + "epoch": 0.7384656652360515, + "grad_norm": 0.13226501569685026, + "learning_rate": 3.37828327120287e-05, + "loss": 0.6044, + "step": 8259 + }, + { + "epoch": 0.7385550786838341, + "grad_norm": 0.14098775456100496, + "learning_rate": 3.376113416836579e-05, + "loss": 0.6338, + "step": 8260 + }, + { + "epoch": 0.7386444921316166, + "grad_norm": 0.13554032424769635, + "learning_rate": 3.373944118007657e-05, + "loss": 0.6488, + "step": 8261 + }, + { + "epoch": 0.7387339055793991, + "grad_norm": 0.1383706669784219, + "learning_rate": 3.371775374898038e-05, + "loss": 0.6605, + "step": 8262 + }, + { + "epoch": 0.7388233190271817, + "grad_norm": 0.13910511453455277, + "learning_rate": 3.369607187689618e-05, + "loss": 0.6369, + "step": 8263 + }, + { + "epoch": 0.7389127324749643, + "grad_norm": 0.12793150047164845, + "learning_rate": 3.3674395565642324e-05, + "loss": 0.6377, + "step": 8264 + }, + { + "epoch": 0.7390021459227468, + "grad_norm": 0.13531511051616, + "learning_rate": 3.365272481703681e-05, + "loss": 0.6074, + "step": 8265 + }, + { + "epoch": 0.7390915593705293, + "grad_norm": 0.13440193435922793, + "learning_rate": 3.3631059632897135e-05, + "loss": 0.6646, + "step": 8266 + }, + { + "epoch": 0.7391809728183119, + "grad_norm": 0.1282434565670089, + "learning_rate": 3.360940001504037e-05, + "loss": 0.6231, + "step": 8267 + }, + { + "epoch": 0.7392703862660944, + "grad_norm": 0.1304523032555015, + "learning_rate": 3.358774596528298e-05, + "loss": 0.619, + "step": 8268 + }, + { + "epoch": 0.739359799713877, + "grad_norm": 0.13175844086309843, + "learning_rate": 3.35660974854412e-05, + "loss": 0.6245, + "step": 8269 + }, + { + "epoch": 0.7394492131616596, + "grad_norm": 0.14263092734502017, + "learning_rate": 3.354445457733054e-05, + "loss": 0.6346, + "step": 8270 + }, + { + "epoch": 0.739538626609442, + "grad_norm": 0.1448735089950577, + "learning_rate": 3.352281724276623e-05, + "loss": 0.6697, + "step": 8271 + }, + { + "epoch": 0.7396280400572246, + "grad_norm": 0.1594659026397231, + "learning_rate": 3.3501185483562994e-05, + "loss": 0.6659, + "step": 8272 + }, + { + "epoch": 0.7397174535050072, + "grad_norm": 0.1360702018559247, + "learning_rate": 3.347955930153498e-05, + "loss": 0.6317, + "step": 8273 + }, + { + "epoch": 0.7398068669527897, + "grad_norm": 0.13853114430294713, + "learning_rate": 3.3457938698496e-05, + "loss": 0.6326, + "step": 8274 + }, + { + "epoch": 0.7398962804005722, + "grad_norm": 0.12815445455504973, + "learning_rate": 3.343632367625932e-05, + "loss": 0.6243, + "step": 8275 + }, + { + "epoch": 0.7399856938483548, + "grad_norm": 0.11415315652146894, + "learning_rate": 3.34147142366378e-05, + "loss": 0.6306, + "step": 8276 + }, + { + "epoch": 0.7400751072961373, + "grad_norm": 0.15968960318249648, + "learning_rate": 3.339311038144378e-05, + "loss": 0.6707, + "step": 8277 + }, + { + "epoch": 0.7401645207439199, + "grad_norm": 0.14566307955550314, + "learning_rate": 3.337151211248918e-05, + "loss": 0.6132, + "step": 8278 + }, + { + "epoch": 0.7402539341917024, + "grad_norm": 0.12458148662656011, + "learning_rate": 3.3349919431585366e-05, + "loss": 0.6022, + "step": 8279 + }, + { + "epoch": 0.740343347639485, + "grad_norm": 0.14891754297911433, + "learning_rate": 3.332833234054331e-05, + "loss": 0.6837, + "step": 8280 + }, + { + "epoch": 0.7404327610872675, + "grad_norm": 0.13645035295682856, + "learning_rate": 3.330675084117354e-05, + "loss": 0.5884, + "step": 8281 + }, + { + "epoch": 0.7405221745350501, + "grad_norm": 0.14120134892216776, + "learning_rate": 3.3285174935285954e-05, + "loss": 0.6391, + "step": 8282 + }, + { + "epoch": 0.7406115879828327, + "grad_norm": 0.14167364780303793, + "learning_rate": 3.3263604624690257e-05, + "loss": 0.657, + "step": 8283 + }, + { + "epoch": 0.7407010014306151, + "grad_norm": 0.12981773015749293, + "learning_rate": 3.32420399111954e-05, + "loss": 0.6529, + "step": 8284 + }, + { + "epoch": 0.7407904148783977, + "grad_norm": 0.14740046981844668, + "learning_rate": 3.322048079661004e-05, + "loss": 0.634, + "step": 8285 + }, + { + "epoch": 0.7408798283261803, + "grad_norm": 0.1268528852523494, + "learning_rate": 3.3198927282742334e-05, + "loss": 0.6313, + "step": 8286 + }, + { + "epoch": 0.7409692417739628, + "grad_norm": 0.13200564336706197, + "learning_rate": 3.3177379371399886e-05, + "loss": 0.6285, + "step": 8287 + }, + { + "epoch": 0.7410586552217453, + "grad_norm": 0.12406264004579282, + "learning_rate": 3.315583706438994e-05, + "loss": 0.6239, + "step": 8288 + }, + { + "epoch": 0.7411480686695279, + "grad_norm": 0.14673674196870853, + "learning_rate": 3.31343003635192e-05, + "loss": 0.6364, + "step": 8289 + }, + { + "epoch": 0.7412374821173104, + "grad_norm": 0.14498151863656727, + "learning_rate": 3.311276927059393e-05, + "loss": 0.6551, + "step": 8290 + }, + { + "epoch": 0.741326895565093, + "grad_norm": 0.1364072388575471, + "learning_rate": 3.3091243787419944e-05, + "loss": 0.6124, + "step": 8291 + }, + { + "epoch": 0.7414163090128756, + "grad_norm": 0.12694866513867994, + "learning_rate": 3.306972391580255e-05, + "loss": 0.627, + "step": 8292 + }, + { + "epoch": 0.741505722460658, + "grad_norm": 0.14524449045497378, + "learning_rate": 3.304820965754656e-05, + "loss": 0.6226, + "step": 8293 + }, + { + "epoch": 0.7415951359084406, + "grad_norm": 0.14362797245197365, + "learning_rate": 3.302670101445636e-05, + "loss": 0.661, + "step": 8294 + }, + { + "epoch": 0.7416845493562232, + "grad_norm": 0.1421426797741436, + "learning_rate": 3.3005197988335904e-05, + "loss": 0.6681, + "step": 8295 + }, + { + "epoch": 0.7417739628040058, + "grad_norm": 0.1376190248276928, + "learning_rate": 3.2983700580988505e-05, + "loss": 0.6154, + "step": 8296 + }, + { + "epoch": 0.7418633762517882, + "grad_norm": 0.1368084315833809, + "learning_rate": 3.2962208794217275e-05, + "loss": 0.6223, + "step": 8297 + }, + { + "epoch": 0.7419527896995708, + "grad_norm": 0.14765915867773316, + "learning_rate": 3.2940722629824604e-05, + "loss": 0.6498, + "step": 8298 + }, + { + "epoch": 0.7420422031473534, + "grad_norm": 0.13685524654558198, + "learning_rate": 3.291924208961253e-05, + "loss": 0.626, + "step": 8299 + }, + { + "epoch": 0.7421316165951359, + "grad_norm": 0.15898442750654626, + "learning_rate": 3.289776717538262e-05, + "loss": 0.6921, + "step": 8300 + }, + { + "epoch": 0.7422210300429185, + "grad_norm": 0.12583406388885046, + "learning_rate": 3.287629788893596e-05, + "loss": 0.6352, + "step": 8301 + }, + { + "epoch": 0.742310443490701, + "grad_norm": 0.1428209558495062, + "learning_rate": 3.2854834232073105e-05, + "loss": 0.662, + "step": 8302 + }, + { + "epoch": 0.7423998569384835, + "grad_norm": 0.13617867706422185, + "learning_rate": 3.283337620659421e-05, + "loss": 0.6563, + "step": 8303 + }, + { + "epoch": 0.7424892703862661, + "grad_norm": 0.14086910590209778, + "learning_rate": 3.281192381429894e-05, + "loss": 0.6259, + "step": 8304 + }, + { + "epoch": 0.7425786838340487, + "grad_norm": 0.14477527233128987, + "learning_rate": 3.279047705698647e-05, + "loss": 0.6703, + "step": 8305 + }, + { + "epoch": 0.7426680972818311, + "grad_norm": 0.13328219794835103, + "learning_rate": 3.276903593645555e-05, + "loss": 0.6649, + "step": 8306 + }, + { + "epoch": 0.7427575107296137, + "grad_norm": 0.12253939078739592, + "learning_rate": 3.2747600454504366e-05, + "loss": 0.6168, + "step": 8307 + }, + { + "epoch": 0.7428469241773963, + "grad_norm": 0.13515299453399687, + "learning_rate": 3.2726170612930716e-05, + "loss": 0.6057, + "step": 8308 + }, + { + "epoch": 0.7429363376251789, + "grad_norm": 0.15338231221267642, + "learning_rate": 3.270474641353192e-05, + "loss": 0.6667, + "step": 8309 + }, + { + "epoch": 0.7430257510729614, + "grad_norm": 0.11584491126956649, + "learning_rate": 3.26833278581047e-05, + "loss": 0.6316, + "step": 8310 + }, + { + "epoch": 0.7431151645207439, + "grad_norm": 0.1494787228822513, + "learning_rate": 3.266191494844552e-05, + "loss": 0.6661, + "step": 8311 + }, + { + "epoch": 0.7432045779685265, + "grad_norm": 0.1302508021312901, + "learning_rate": 3.264050768635022e-05, + "loss": 0.6687, + "step": 8312 + }, + { + "epoch": 0.743293991416309, + "grad_norm": 0.15481504251472283, + "learning_rate": 3.261910607361417e-05, + "loss": 0.6702, + "step": 8313 + }, + { + "epoch": 0.7433834048640916, + "grad_norm": 0.14355524744798118, + "learning_rate": 3.259771011203232e-05, + "loss": 0.5928, + "step": 8314 + }, + { + "epoch": 0.7434728183118741, + "grad_norm": 0.13844539784550425, + "learning_rate": 3.257631980339916e-05, + "loss": 0.6894, + "step": 8315 + }, + { + "epoch": 0.7435622317596566, + "grad_norm": 0.13027485848812945, + "learning_rate": 3.2554935149508584e-05, + "loss": 0.6161, + "step": 8316 + }, + { + "epoch": 0.7436516452074392, + "grad_norm": 0.1304972630537859, + "learning_rate": 3.253355615215416e-05, + "loss": 0.6366, + "step": 8317 + }, + { + "epoch": 0.7437410586552218, + "grad_norm": 0.1560378095443397, + "learning_rate": 3.251218281312889e-05, + "loss": 0.6676, + "step": 8318 + }, + { + "epoch": 0.7438304721030042, + "grad_norm": 0.12588751518269986, + "learning_rate": 3.249081513422534e-05, + "loss": 0.6602, + "step": 8319 + }, + { + "epoch": 0.7439198855507868, + "grad_norm": 0.13305331587897734, + "learning_rate": 3.246945311723564e-05, + "loss": 0.6293, + "step": 8320 + }, + { + "epoch": 0.7440092989985694, + "grad_norm": 0.15656660613566645, + "learning_rate": 3.244809676395131e-05, + "loss": 0.662, + "step": 8321 + }, + { + "epoch": 0.744098712446352, + "grad_norm": 0.1242004545019204, + "learning_rate": 3.2426746076163514e-05, + "loss": 0.624, + "step": 8322 + }, + { + "epoch": 0.7441881258941345, + "grad_norm": 0.13738043408355718, + "learning_rate": 3.240540105566293e-05, + "loss": 0.6172, + "step": 8323 + }, + { + "epoch": 0.744277539341917, + "grad_norm": 0.1344907296623002, + "learning_rate": 3.238406170423972e-05, + "loss": 0.6265, + "step": 8324 + }, + { + "epoch": 0.7443669527896996, + "grad_norm": 0.15681805700434134, + "learning_rate": 3.2362728023683594e-05, + "loss": 0.6768, + "step": 8325 + }, + { + "epoch": 0.7444563662374821, + "grad_norm": 0.13548494896475138, + "learning_rate": 3.234140001578383e-05, + "loss": 0.6144, + "step": 8326 + }, + { + "epoch": 0.7445457796852647, + "grad_norm": 0.12918775209157426, + "learning_rate": 3.23200776823291e-05, + "loss": 0.6426, + "step": 8327 + }, + { + "epoch": 0.7446351931330472, + "grad_norm": 0.13064960935788023, + "learning_rate": 3.2298761025107706e-05, + "loss": 0.6297, + "step": 8328 + }, + { + "epoch": 0.7447246065808297, + "grad_norm": 0.1436321100873248, + "learning_rate": 3.22774500459075e-05, + "loss": 0.6654, + "step": 8329 + }, + { + "epoch": 0.7448140200286123, + "grad_norm": 0.13811143541011586, + "learning_rate": 3.2256144746515735e-05, + "loss": 0.6381, + "step": 8330 + }, + { + "epoch": 0.7449034334763949, + "grad_norm": 0.14226084540096234, + "learning_rate": 3.223484512871927e-05, + "loss": 0.6323, + "step": 8331 + }, + { + "epoch": 0.7449928469241774, + "grad_norm": 0.1516579277992277, + "learning_rate": 3.221355119430456e-05, + "loss": 0.621, + "step": 8332 + }, + { + "epoch": 0.7450822603719599, + "grad_norm": 0.12470825005869005, + "learning_rate": 3.219226294505743e-05, + "loss": 0.6424, + "step": 8333 + }, + { + "epoch": 0.7451716738197425, + "grad_norm": 0.12483979525055759, + "learning_rate": 3.2170980382763306e-05, + "loss": 0.6075, + "step": 8334 + }, + { + "epoch": 0.745261087267525, + "grad_norm": 0.13691773310716687, + "learning_rate": 3.214970350920716e-05, + "loss": 0.6476, + "step": 8335 + }, + { + "epoch": 0.7453505007153076, + "grad_norm": 0.14030005427068895, + "learning_rate": 3.212843232617343e-05, + "loss": 0.6186, + "step": 8336 + }, + { + "epoch": 0.7454399141630901, + "grad_norm": 0.13509400367677804, + "learning_rate": 3.21071668354461e-05, + "loss": 0.6486, + "step": 8337 + }, + { + "epoch": 0.7455293276108726, + "grad_norm": 0.1304896800453022, + "learning_rate": 3.2085907038808695e-05, + "loss": 0.6388, + "step": 8338 + }, + { + "epoch": 0.7456187410586552, + "grad_norm": 0.1494190191755718, + "learning_rate": 3.2064652938044246e-05, + "loss": 0.6821, + "step": 8339 + }, + { + "epoch": 0.7457081545064378, + "grad_norm": 0.13787504100259784, + "learning_rate": 3.204340453493534e-05, + "loss": 0.6807, + "step": 8340 + }, + { + "epoch": 0.7457975679542204, + "grad_norm": 0.14584248500425284, + "learning_rate": 3.2022161831264e-05, + "loss": 0.6416, + "step": 8341 + }, + { + "epoch": 0.7458869814020028, + "grad_norm": 0.14523834565892454, + "learning_rate": 3.200092482881184e-05, + "loss": 0.6466, + "step": 8342 + }, + { + "epoch": 0.7459763948497854, + "grad_norm": 0.13688808239136802, + "learning_rate": 3.197969352936003e-05, + "loss": 0.6194, + "step": 8343 + }, + { + "epoch": 0.746065808297568, + "grad_norm": 0.1314915706913259, + "learning_rate": 3.1958467934689153e-05, + "loss": 0.6522, + "step": 8344 + }, + { + "epoch": 0.7461552217453505, + "grad_norm": 0.1411661418043349, + "learning_rate": 3.193724804657936e-05, + "loss": 0.616, + "step": 8345 + }, + { + "epoch": 0.746244635193133, + "grad_norm": 0.15239658568106476, + "learning_rate": 3.1916033866810436e-05, + "loss": 0.688, + "step": 8346 + }, + { + "epoch": 0.7463340486409156, + "grad_norm": 0.13726878126036376, + "learning_rate": 3.189482539716149e-05, + "loss": 0.6106, + "step": 8347 + }, + { + "epoch": 0.7464234620886981, + "grad_norm": 0.136514997738685, + "learning_rate": 3.1873622639411293e-05, + "loss": 0.6517, + "step": 8348 + }, + { + "epoch": 0.7465128755364807, + "grad_norm": 0.12903447294348794, + "learning_rate": 3.185242559533812e-05, + "loss": 0.6431, + "step": 8349 + }, + { + "epoch": 0.7466022889842633, + "grad_norm": 0.11916828329415736, + "learning_rate": 3.183123426671968e-05, + "loss": 0.5308, + "step": 8350 + }, + { + "epoch": 0.7466917024320457, + "grad_norm": 0.1341827648682724, + "learning_rate": 3.181004865533329e-05, + "loss": 0.6033, + "step": 8351 + }, + { + "epoch": 0.7467811158798283, + "grad_norm": 0.14640365627611776, + "learning_rate": 3.178886876295578e-05, + "loss": 0.6783, + "step": 8352 + }, + { + "epoch": 0.7468705293276109, + "grad_norm": 0.1320759861766741, + "learning_rate": 3.176769459136346e-05, + "loss": 0.6207, + "step": 8353 + }, + { + "epoch": 0.7469599427753935, + "grad_norm": 0.13434088742006078, + "learning_rate": 3.174652614233222e-05, + "loss": 0.6531, + "step": 8354 + }, + { + "epoch": 0.7470493562231759, + "grad_norm": 0.13291199998760408, + "learning_rate": 3.172536341763738e-05, + "loss": 0.6441, + "step": 8355 + }, + { + "epoch": 0.7471387696709585, + "grad_norm": 0.1371515156391429, + "learning_rate": 3.170420641905384e-05, + "loss": 0.6434, + "step": 8356 + }, + { + "epoch": 0.7472281831187411, + "grad_norm": 0.14896159476690127, + "learning_rate": 3.1683055148356044e-05, + "loss": 0.6683, + "step": 8357 + }, + { + "epoch": 0.7473175965665236, + "grad_norm": 0.13381685107979166, + "learning_rate": 3.1661909607317894e-05, + "loss": 0.6111, + "step": 8358 + }, + { + "epoch": 0.7474070100143062, + "grad_norm": 0.14535507779677415, + "learning_rate": 3.164076979771287e-05, + "loss": 0.6454, + "step": 8359 + }, + { + "epoch": 0.7474964234620887, + "grad_norm": 0.1385791009235831, + "learning_rate": 3.161963572131393e-05, + "loss": 0.6088, + "step": 8360 + }, + { + "epoch": 0.7475858369098712, + "grad_norm": 0.13042981460105604, + "learning_rate": 3.159850737989355e-05, + "loss": 0.6292, + "step": 8361 + }, + { + "epoch": 0.7476752503576538, + "grad_norm": 0.1362914498497284, + "learning_rate": 3.1577384775223754e-05, + "loss": 0.6557, + "step": 8362 + }, + { + "epoch": 0.7477646638054364, + "grad_norm": 0.13778937943546224, + "learning_rate": 3.1556267909076076e-05, + "loss": 0.6232, + "step": 8363 + }, + { + "epoch": 0.7478540772532188, + "grad_norm": 0.13370258396216825, + "learning_rate": 3.153515678322152e-05, + "loss": 0.6504, + "step": 8364 + }, + { + "epoch": 0.7479434907010014, + "grad_norm": 0.13926309139503548, + "learning_rate": 3.1514051399430654e-05, + "loss": 0.6398, + "step": 8365 + }, + { + "epoch": 0.748032904148784, + "grad_norm": 0.13908700109014532, + "learning_rate": 3.149295175947365e-05, + "loss": 0.6451, + "step": 8366 + }, + { + "epoch": 0.7481223175965666, + "grad_norm": 0.13503173470079077, + "learning_rate": 3.1471857865120016e-05, + "loss": 0.5906, + "step": 8367 + }, + { + "epoch": 0.748211731044349, + "grad_norm": 0.1492308745151141, + "learning_rate": 3.145076971813891e-05, + "loss": 0.6197, + "step": 8368 + }, + { + "epoch": 0.7483011444921316, + "grad_norm": 0.1360228422981601, + "learning_rate": 3.1429687320298976e-05, + "loss": 0.6218, + "step": 8369 + }, + { + "epoch": 0.7483905579399142, + "grad_norm": 0.1365085998132044, + "learning_rate": 3.1408610673368333e-05, + "loss": 0.6559, + "step": 8370 + }, + { + "epoch": 0.7484799713876967, + "grad_norm": 0.12802930462012, + "learning_rate": 3.138753977911467e-05, + "loss": 0.6387, + "step": 8371 + }, + { + "epoch": 0.7485693848354793, + "grad_norm": 0.1186966034920185, + "learning_rate": 3.1366474639305185e-05, + "loss": 0.6203, + "step": 8372 + }, + { + "epoch": 0.7486587982832618, + "grad_norm": 0.13158349654100973, + "learning_rate": 3.134541525570659e-05, + "loss": 0.6181, + "step": 8373 + }, + { + "epoch": 0.7487482117310443, + "grad_norm": 0.15310968934306715, + "learning_rate": 3.132436163008512e-05, + "loss": 0.6289, + "step": 8374 + }, + { + "epoch": 0.7488376251788269, + "grad_norm": 0.12390915134541125, + "learning_rate": 3.1303313764206486e-05, + "loss": 0.6624, + "step": 8375 + }, + { + "epoch": 0.7489270386266095, + "grad_norm": 0.14653869824277868, + "learning_rate": 3.1282271659835946e-05, + "loss": 0.6702, + "step": 8376 + }, + { + "epoch": 0.7490164520743919, + "grad_norm": 0.13506831358022292, + "learning_rate": 3.1261235318738336e-05, + "loss": 0.6351, + "step": 8377 + }, + { + "epoch": 0.7491058655221745, + "grad_norm": 0.13255585187076757, + "learning_rate": 3.124020474267787e-05, + "loss": 0.6674, + "step": 8378 + }, + { + "epoch": 0.7491952789699571, + "grad_norm": 0.14475350063421524, + "learning_rate": 3.1219179933418365e-05, + "loss": 0.6454, + "step": 8379 + }, + { + "epoch": 0.7492846924177397, + "grad_norm": 0.12352370454110773, + "learning_rate": 3.1198160892723225e-05, + "loss": 0.604, + "step": 8380 + }, + { + "epoch": 0.7493741058655222, + "grad_norm": 0.1337431000870207, + "learning_rate": 3.117714762235522e-05, + "loss": 0.6128, + "step": 8381 + }, + { + "epoch": 0.7494635193133047, + "grad_norm": 0.1517316733559755, + "learning_rate": 3.1156140124076714e-05, + "loss": 0.6643, + "step": 8382 + }, + { + "epoch": 0.7495529327610873, + "grad_norm": 0.12447567079293041, + "learning_rate": 3.113513839964963e-05, + "loss": 0.6436, + "step": 8383 + }, + { + "epoch": 0.7496423462088698, + "grad_norm": 0.13077665936448324, + "learning_rate": 3.1114142450835294e-05, + "loss": 0.6204, + "step": 8384 + }, + { + "epoch": 0.7497317596566524, + "grad_norm": 0.14615056998192816, + "learning_rate": 3.1093152279394635e-05, + "loss": 0.6589, + "step": 8385 + }, + { + "epoch": 0.7498211731044349, + "grad_norm": 0.13546860873587732, + "learning_rate": 3.1072167887088065e-05, + "loss": 0.6359, + "step": 8386 + }, + { + "epoch": 0.7499105865522174, + "grad_norm": 0.1256062313305739, + "learning_rate": 3.105118927567554e-05, + "loss": 0.6073, + "step": 8387 + }, + { + "epoch": 0.75, + "grad_norm": 0.13533049496738297, + "learning_rate": 3.103021644691651e-05, + "loss": 0.6316, + "step": 8388 + }, + { + "epoch": 0.7500894134477826, + "grad_norm": 0.13589505912804087, + "learning_rate": 3.1009249402569954e-05, + "loss": 0.6754, + "step": 8389 + }, + { + "epoch": 0.7501788268955651, + "grad_norm": 0.14167095496003754, + "learning_rate": 3.098828814439429e-05, + "loss": 0.6398, + "step": 8390 + }, + { + "epoch": 0.7502682403433476, + "grad_norm": 0.14757660006336057, + "learning_rate": 3.096733267414758e-05, + "loss": 0.6216, + "step": 8391 + }, + { + "epoch": 0.7503576537911302, + "grad_norm": 0.13420474686008527, + "learning_rate": 3.094638299358732e-05, + "loss": 0.6593, + "step": 8392 + }, + { + "epoch": 0.7504470672389127, + "grad_norm": 0.14634070489976994, + "learning_rate": 3.0925439104470456e-05, + "loss": 0.6593, + "step": 8393 + }, + { + "epoch": 0.7505364806866953, + "grad_norm": 0.14165656283231995, + "learning_rate": 3.090450100855367e-05, + "loss": 0.6658, + "step": 8394 + }, + { + "epoch": 0.7506258941344778, + "grad_norm": 0.1176144977150531, + "learning_rate": 3.08835687075929e-05, + "loss": 0.608, + "step": 8395 + }, + { + "epoch": 0.7507153075822603, + "grad_norm": 0.1418634498329435, + "learning_rate": 3.086264220334375e-05, + "loss": 0.6316, + "step": 8396 + }, + { + "epoch": 0.7508047210300429, + "grad_norm": 0.13747478408149208, + "learning_rate": 3.084172149756134e-05, + "loss": 0.5986, + "step": 8397 + }, + { + "epoch": 0.7508941344778255, + "grad_norm": 0.12377637845383696, + "learning_rate": 3.082080659200018e-05, + "loss": 0.634, + "step": 8398 + }, + { + "epoch": 0.7509835479256081, + "grad_norm": 0.1359849586506848, + "learning_rate": 3.079989748841444e-05, + "loss": 0.619, + "step": 8399 + }, + { + "epoch": 0.7510729613733905, + "grad_norm": 0.13249165244572467, + "learning_rate": 3.077899418855772e-05, + "loss": 0.612, + "step": 8400 + }, + { + "epoch": 0.7511623748211731, + "grad_norm": 0.1332162882446546, + "learning_rate": 3.075809669418316e-05, + "loss": 0.5909, + "step": 8401 + }, + { + "epoch": 0.7512517882689557, + "grad_norm": 0.14898093619480832, + "learning_rate": 3.07372050070434e-05, + "loss": 0.656, + "step": 8402 + }, + { + "epoch": 0.7513412017167382, + "grad_norm": 0.15537077400719948, + "learning_rate": 3.071631912889065e-05, + "loss": 0.624, + "step": 8403 + }, + { + "epoch": 0.7514306151645207, + "grad_norm": 0.14719287773856818, + "learning_rate": 3.0695439061476504e-05, + "loss": 0.6432, + "step": 8404 + }, + { + "epoch": 0.7515200286123033, + "grad_norm": 0.13379065047692063, + "learning_rate": 3.0674564806552187e-05, + "loss": 0.6015, + "step": 8405 + }, + { + "epoch": 0.7516094420600858, + "grad_norm": 0.13665410178542972, + "learning_rate": 3.0653696365868424e-05, + "loss": 0.6186, + "step": 8406 + }, + { + "epoch": 0.7516988555078684, + "grad_norm": 0.12547032728027385, + "learning_rate": 3.0632833741175336e-05, + "loss": 0.6412, + "step": 8407 + }, + { + "epoch": 0.751788268955651, + "grad_norm": 0.14088880559402217, + "learning_rate": 3.061197693422278e-05, + "loss": 0.6445, + "step": 8408 + }, + { + "epoch": 0.7518776824034334, + "grad_norm": 0.1465853444998385, + "learning_rate": 3.059112594675987e-05, + "loss": 0.6732, + "step": 8409 + }, + { + "epoch": 0.751967095851216, + "grad_norm": 0.13715090779487027, + "learning_rate": 3.057028078053541e-05, + "loss": 0.6844, + "step": 8410 + }, + { + "epoch": 0.7520565092989986, + "grad_norm": 0.14644742164836663, + "learning_rate": 3.054944143729769e-05, + "loss": 0.629, + "step": 8411 + }, + { + "epoch": 0.7521459227467812, + "grad_norm": 0.135299043523951, + "learning_rate": 3.0528607918794395e-05, + "loss": 0.6635, + "step": 8412 + }, + { + "epoch": 0.7522353361945636, + "grad_norm": 0.12890292129883346, + "learning_rate": 3.0507780226772863e-05, + "loss": 0.5648, + "step": 8413 + }, + { + "epoch": 0.7523247496423462, + "grad_norm": 0.15096839045621593, + "learning_rate": 3.048695836297988e-05, + "loss": 0.6071, + "step": 8414 + }, + { + "epoch": 0.7524141630901288, + "grad_norm": 0.16155380524021268, + "learning_rate": 3.046614232916174e-05, + "loss": 0.6362, + "step": 8415 + }, + { + "epoch": 0.7525035765379113, + "grad_norm": 0.14199738861274835, + "learning_rate": 3.0445332127064275e-05, + "loss": 0.645, + "step": 8416 + }, + { + "epoch": 0.7525929899856938, + "grad_norm": 0.13666206425373825, + "learning_rate": 3.042452775843284e-05, + "loss": 0.6367, + "step": 8417 + }, + { + "epoch": 0.7526824034334764, + "grad_norm": 0.14060979612090657, + "learning_rate": 3.040372922501219e-05, + "loss": 0.6384, + "step": 8418 + }, + { + "epoch": 0.7527718168812589, + "grad_norm": 0.13880547579984778, + "learning_rate": 3.0382936528546735e-05, + "loss": 0.6313, + "step": 8419 + }, + { + "epoch": 0.7528612303290415, + "grad_norm": 0.1510832574827047, + "learning_rate": 3.036214967078034e-05, + "loss": 0.6366, + "step": 8420 + }, + { + "epoch": 0.7529506437768241, + "grad_norm": 0.14322866021331798, + "learning_rate": 3.0341368653456283e-05, + "loss": 0.5795, + "step": 8421 + }, + { + "epoch": 0.7530400572246065, + "grad_norm": 0.14138900414475453, + "learning_rate": 3.032059347831755e-05, + "loss": 0.639, + "step": 8422 + }, + { + "epoch": 0.7531294706723891, + "grad_norm": 0.14732626888583866, + "learning_rate": 3.0299824147106516e-05, + "loss": 0.6436, + "step": 8423 + }, + { + "epoch": 0.7532188841201717, + "grad_norm": 0.12473674511994357, + "learning_rate": 3.0279060661565028e-05, + "loss": 0.6121, + "step": 8424 + }, + { + "epoch": 0.7533082975679543, + "grad_norm": 0.12836244793533116, + "learning_rate": 3.025830302343452e-05, + "loss": 0.6192, + "step": 8425 + }, + { + "epoch": 0.7533977110157367, + "grad_norm": 0.1485023528898284, + "learning_rate": 3.023755123445594e-05, + "loss": 0.6555, + "step": 8426 + }, + { + "epoch": 0.7534871244635193, + "grad_norm": 0.13003819342967202, + "learning_rate": 3.0216805296369654e-05, + "loss": 0.6207, + "step": 8427 + }, + { + "epoch": 0.7535765379113019, + "grad_norm": 0.14870641793576797, + "learning_rate": 3.0196065210915637e-05, + "loss": 0.6709, + "step": 8428 + }, + { + "epoch": 0.7536659513590844, + "grad_norm": 0.14405586631211828, + "learning_rate": 3.017533097983333e-05, + "loss": 0.6316, + "step": 8429 + }, + { + "epoch": 0.753755364806867, + "grad_norm": 0.16277406500052957, + "learning_rate": 3.015460260486168e-05, + "loss": 0.6908, + "step": 8430 + }, + { + "epoch": 0.7538447782546495, + "grad_norm": 0.12775954273746365, + "learning_rate": 3.0133880087739184e-05, + "loss": 0.6493, + "step": 8431 + }, + { + "epoch": 0.753934191702432, + "grad_norm": 0.13626194017883184, + "learning_rate": 3.0113163430203772e-05, + "loss": 0.6421, + "step": 8432 + }, + { + "epoch": 0.7540236051502146, + "grad_norm": 0.1350700981872835, + "learning_rate": 3.009245263399293e-05, + "loss": 0.6259, + "step": 8433 + }, + { + "epoch": 0.7541130185979972, + "grad_norm": 0.14619180007448315, + "learning_rate": 3.0071747700843667e-05, + "loss": 0.6401, + "step": 8434 + }, + { + "epoch": 0.7542024320457796, + "grad_norm": 0.13258870411224613, + "learning_rate": 3.0051048632492463e-05, + "loss": 0.6375, + "step": 8435 + }, + { + "epoch": 0.7542918454935622, + "grad_norm": 0.14146092073797895, + "learning_rate": 3.003035543067534e-05, + "loss": 0.6749, + "step": 8436 + }, + { + "epoch": 0.7543812589413448, + "grad_norm": 0.1383071748138223, + "learning_rate": 3.000966809712783e-05, + "loss": 0.6346, + "step": 8437 + }, + { + "epoch": 0.7544706723891274, + "grad_norm": 0.15088556229701697, + "learning_rate": 2.9988986633584902e-05, + "loss": 0.6601, + "step": 8438 + }, + { + "epoch": 0.7545600858369099, + "grad_norm": 0.12495162836911887, + "learning_rate": 2.9968311041781116e-05, + "loss": 0.609, + "step": 8439 + }, + { + "epoch": 0.7546494992846924, + "grad_norm": 0.1302020509062777, + "learning_rate": 2.9947641323450535e-05, + "loss": 0.6396, + "step": 8440 + }, + { + "epoch": 0.754738912732475, + "grad_norm": 0.1274979188722935, + "learning_rate": 2.992697748032661e-05, + "loss": 0.6068, + "step": 8441 + }, + { + "epoch": 0.7548283261802575, + "grad_norm": 0.12383401032605809, + "learning_rate": 2.990631951414252e-05, + "loss": 0.6108, + "step": 8442 + }, + { + "epoch": 0.7549177396280401, + "grad_norm": 0.14403173017133253, + "learning_rate": 2.9885667426630737e-05, + "loss": 0.6568, + "step": 8443 + }, + { + "epoch": 0.7550071530758226, + "grad_norm": 0.12354226386864993, + "learning_rate": 2.9865021219523337e-05, + "loss": 0.6465, + "step": 8444 + }, + { + "epoch": 0.7550965665236051, + "grad_norm": 0.15481252013450525, + "learning_rate": 2.9844380894551916e-05, + "loss": 0.6745, + "step": 8445 + }, + { + "epoch": 0.7551859799713877, + "grad_norm": 0.14313761824463675, + "learning_rate": 2.9823746453447565e-05, + "loss": 0.6272, + "step": 8446 + }, + { + "epoch": 0.7552753934191703, + "grad_norm": 0.14674251667881913, + "learning_rate": 2.9803117897940826e-05, + "loss": 0.6567, + "step": 8447 + }, + { + "epoch": 0.7553648068669528, + "grad_norm": 0.12072732500806005, + "learning_rate": 2.9782495229761808e-05, + "loss": 0.6382, + "step": 8448 + }, + { + "epoch": 0.7554542203147353, + "grad_norm": 0.14638002263654706, + "learning_rate": 2.9761878450640112e-05, + "loss": 0.6196, + "step": 8449 + }, + { + "epoch": 0.7555436337625179, + "grad_norm": 0.15776308463774252, + "learning_rate": 2.9741267562304854e-05, + "loss": 0.6542, + "step": 8450 + }, + { + "epoch": 0.7556330472103004, + "grad_norm": 0.133985014404011, + "learning_rate": 2.972066256648465e-05, + "loss": 0.6175, + "step": 8451 + }, + { + "epoch": 0.755722460658083, + "grad_norm": 0.14175611454802167, + "learning_rate": 2.9700063464907578e-05, + "loss": 0.6421, + "step": 8452 + }, + { + "epoch": 0.7558118741058655, + "grad_norm": 0.12455452914780793, + "learning_rate": 2.967947025930128e-05, + "loss": 0.599, + "step": 8453 + }, + { + "epoch": 0.755901287553648, + "grad_norm": 0.14208658821886408, + "learning_rate": 2.9658882951392918e-05, + "loss": 0.6255, + "step": 8454 + }, + { + "epoch": 0.7559907010014306, + "grad_norm": 0.1501475076918662, + "learning_rate": 2.963830154290903e-05, + "loss": 0.6533, + "step": 8455 + }, + { + "epoch": 0.7560801144492132, + "grad_norm": 0.14051132577608003, + "learning_rate": 2.9617726035575855e-05, + "loss": 0.6598, + "step": 8456 + }, + { + "epoch": 0.7561695278969958, + "grad_norm": 0.1317902069061588, + "learning_rate": 2.9597156431119023e-05, + "loss": 0.6184, + "step": 8457 + }, + { + "epoch": 0.7562589413447782, + "grad_norm": 0.13992393006798337, + "learning_rate": 2.957659273126362e-05, + "loss": 0.6299, + "step": 8458 + }, + { + "epoch": 0.7563483547925608, + "grad_norm": 0.1415924741602186, + "learning_rate": 2.9556034937734332e-05, + "loss": 0.6322, + "step": 8459 + }, + { + "epoch": 0.7564377682403434, + "grad_norm": 0.13252258476266268, + "learning_rate": 2.9535483052255365e-05, + "loss": 0.6285, + "step": 8460 + }, + { + "epoch": 0.7565271816881259, + "grad_norm": 0.15564741846677424, + "learning_rate": 2.9514937076550286e-05, + "loss": 0.6265, + "step": 8461 + }, + { + "epoch": 0.7566165951359084, + "grad_norm": 0.14946688690165683, + "learning_rate": 2.9494397012342322e-05, + "loss": 0.6336, + "step": 8462 + }, + { + "epoch": 0.756706008583691, + "grad_norm": 0.13108008002663846, + "learning_rate": 2.9473862861354128e-05, + "loss": 0.6447, + "step": 8463 + }, + { + "epoch": 0.7567954220314735, + "grad_norm": 0.12803540495702567, + "learning_rate": 2.945333462530788e-05, + "loss": 0.6235, + "step": 8464 + }, + { + "epoch": 0.7568848354792561, + "grad_norm": 0.15181416762425856, + "learning_rate": 2.9432812305925295e-05, + "loss": 0.6949, + "step": 8465 + }, + { + "epoch": 0.7569742489270386, + "grad_norm": 0.1385324258619831, + "learning_rate": 2.941229590492748e-05, + "loss": 0.6485, + "step": 8466 + }, + { + "epoch": 0.7570636623748211, + "grad_norm": 0.15479472379325837, + "learning_rate": 2.9391785424035167e-05, + "loss": 0.6511, + "step": 8467 + }, + { + "epoch": 0.7571530758226037, + "grad_norm": 0.13447214669354526, + "learning_rate": 2.9371280864968565e-05, + "loss": 0.6019, + "step": 8468 + }, + { + "epoch": 0.7572424892703863, + "grad_norm": 0.13019222862025853, + "learning_rate": 2.935078222944727e-05, + "loss": 0.6203, + "step": 8469 + }, + { + "epoch": 0.7573319027181689, + "grad_norm": 0.14639824415865604, + "learning_rate": 2.933028951919058e-05, + "loss": 0.6204, + "step": 8470 + }, + { + "epoch": 0.7574213161659513, + "grad_norm": 0.14568980665282033, + "learning_rate": 2.93098027359172e-05, + "loss": 0.6458, + "step": 8471 + }, + { + "epoch": 0.7575107296137339, + "grad_norm": 0.13562642341678088, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.598, + "step": 8472 + }, + { + "epoch": 0.7576001430615165, + "grad_norm": 0.1386433622946008, + "learning_rate": 2.9268846957192485e-05, + "loss": 0.6487, + "step": 8473 + }, + { + "epoch": 0.757689556509299, + "grad_norm": 0.12047196144468103, + "learning_rate": 2.9248377965176134e-05, + "loss": 0.6097, + "step": 8474 + }, + { + "epoch": 0.7577789699570815, + "grad_norm": 0.16035028505088988, + "learning_rate": 2.9227914907012845e-05, + "loss": 0.613, + "step": 8475 + }, + { + "epoch": 0.7578683834048641, + "grad_norm": 0.13767410470675642, + "learning_rate": 2.9207457784418835e-05, + "loss": 0.5738, + "step": 8476 + }, + { + "epoch": 0.7579577968526466, + "grad_norm": 0.1341558744059813, + "learning_rate": 2.91870065991099e-05, + "loss": 0.622, + "step": 8477 + }, + { + "epoch": 0.7580472103004292, + "grad_norm": 0.15184698108471112, + "learning_rate": 2.9166561352801182e-05, + "loss": 0.6395, + "step": 8478 + }, + { + "epoch": 0.7581366237482118, + "grad_norm": 0.12182840556021018, + "learning_rate": 2.91461220472074e-05, + "loss": 0.6076, + "step": 8479 + }, + { + "epoch": 0.7582260371959942, + "grad_norm": 0.1367987892200234, + "learning_rate": 2.912568868404284e-05, + "loss": 0.6493, + "step": 8480 + }, + { + "epoch": 0.7583154506437768, + "grad_norm": 0.13826134634293058, + "learning_rate": 2.9105261265021133e-05, + "loss": 0.6647, + "step": 8481 + }, + { + "epoch": 0.7584048640915594, + "grad_norm": 0.13977410888631922, + "learning_rate": 2.9084839791855544e-05, + "loss": 0.6591, + "step": 8482 + }, + { + "epoch": 0.758494277539342, + "grad_norm": 0.1488380396183352, + "learning_rate": 2.9064424266258805e-05, + "loss": 0.6454, + "step": 8483 + }, + { + "epoch": 0.7585836909871244, + "grad_norm": 0.14923204417745498, + "learning_rate": 2.9044014689943132e-05, + "loss": 0.6576, + "step": 8484 + }, + { + "epoch": 0.758673104434907, + "grad_norm": 0.15255057404893513, + "learning_rate": 2.902361106462028e-05, + "loss": 0.6584, + "step": 8485 + }, + { + "epoch": 0.7587625178826896, + "grad_norm": 0.13150695611640253, + "learning_rate": 2.9003213392001426e-05, + "loss": 0.5945, + "step": 8486 + }, + { + "epoch": 0.7588519313304721, + "grad_norm": 0.1377678182589257, + "learning_rate": 2.8982821673797322e-05, + "loss": 0.6308, + "step": 8487 + }, + { + "epoch": 0.7589413447782547, + "grad_norm": 0.15748486600160472, + "learning_rate": 2.8962435911718222e-05, + "loss": 0.6358, + "step": 8488 + }, + { + "epoch": 0.7590307582260372, + "grad_norm": 0.1267603575580078, + "learning_rate": 2.8942056107473802e-05, + "loss": 0.6502, + "step": 8489 + }, + { + "epoch": 0.7591201716738197, + "grad_norm": 0.12415584887851629, + "learning_rate": 2.89216822627733e-05, + "loss": 0.6213, + "step": 8490 + }, + { + "epoch": 0.7592095851216023, + "grad_norm": 0.14708604555870444, + "learning_rate": 2.8901314379325517e-05, + "loss": 0.6305, + "step": 8491 + }, + { + "epoch": 0.7592989985693849, + "grad_norm": 0.1465396609334341, + "learning_rate": 2.8880952458838593e-05, + "loss": 0.6454, + "step": 8492 + }, + { + "epoch": 0.7593884120171673, + "grad_norm": 0.1305036324397261, + "learning_rate": 2.886059650302031e-05, + "loss": 0.6376, + "step": 8493 + }, + { + "epoch": 0.7594778254649499, + "grad_norm": 0.13564746685247434, + "learning_rate": 2.8840246513577907e-05, + "loss": 0.6237, + "step": 8494 + }, + { + "epoch": 0.7595672389127325, + "grad_norm": 0.1521063758181458, + "learning_rate": 2.8819902492218066e-05, + "loss": 0.6754, + "step": 8495 + }, + { + "epoch": 0.759656652360515, + "grad_norm": 0.12801967192117, + "learning_rate": 2.879956444064703e-05, + "loss": 0.622, + "step": 8496 + }, + { + "epoch": 0.7597460658082976, + "grad_norm": 0.13507313888236924, + "learning_rate": 2.877923236057054e-05, + "loss": 0.5873, + "step": 8497 + }, + { + "epoch": 0.7598354792560801, + "grad_norm": 0.14105019492974435, + "learning_rate": 2.8758906253693818e-05, + "loss": 0.6337, + "step": 8498 + }, + { + "epoch": 0.7599248927038627, + "grad_norm": 0.13922347265936114, + "learning_rate": 2.8738586121721634e-05, + "loss": 0.5994, + "step": 8499 + }, + { + "epoch": 0.7600143061516452, + "grad_norm": 0.13593358949879372, + "learning_rate": 2.8718271966358124e-05, + "loss": 0.5978, + "step": 8500 + }, + { + "epoch": 0.7601037195994278, + "grad_norm": 0.13579523483604716, + "learning_rate": 2.869796378930706e-05, + "loss": 0.6529, + "step": 8501 + }, + { + "epoch": 0.7601931330472103, + "grad_norm": 0.12806006457108696, + "learning_rate": 2.8677661592271666e-05, + "loss": 0.6379, + "step": 8502 + }, + { + "epoch": 0.7602825464949928, + "grad_norm": 0.15351530448194323, + "learning_rate": 2.8657365376954692e-05, + "loss": 0.6218, + "step": 8503 + }, + { + "epoch": 0.7603719599427754, + "grad_norm": 0.1560654475202268, + "learning_rate": 2.8637075145058257e-05, + "loss": 0.6687, + "step": 8504 + }, + { + "epoch": 0.760461373390558, + "grad_norm": 0.1509299170153201, + "learning_rate": 2.8616790898284207e-05, + "loss": 0.6594, + "step": 8505 + }, + { + "epoch": 0.7605507868383404, + "grad_norm": 0.13744854601994155, + "learning_rate": 2.859651263833366e-05, + "loss": 0.6412, + "step": 8506 + }, + { + "epoch": 0.760640200286123, + "grad_norm": 0.13687947639479098, + "learning_rate": 2.857624036690737e-05, + "loss": 0.6472, + "step": 8507 + }, + { + "epoch": 0.7607296137339056, + "grad_norm": 0.14569977280960647, + "learning_rate": 2.8555974085705573e-05, + "loss": 0.6297, + "step": 8508 + }, + { + "epoch": 0.7608190271816881, + "grad_norm": 0.12762285239660454, + "learning_rate": 2.853571379642792e-05, + "loss": 0.6484, + "step": 8509 + }, + { + "epoch": 0.7609084406294707, + "grad_norm": 0.13999037602018044, + "learning_rate": 2.8515459500773633e-05, + "loss": 0.6226, + "step": 8510 + }, + { + "epoch": 0.7609978540772532, + "grad_norm": 0.12446369210247626, + "learning_rate": 2.849521120044144e-05, + "loss": 0.605, + "step": 8511 + }, + { + "epoch": 0.7610872675250357, + "grad_norm": 0.17886269877321354, + "learning_rate": 2.847496889712952e-05, + "loss": 0.6422, + "step": 8512 + }, + { + "epoch": 0.7611766809728183, + "grad_norm": 0.13494777659965795, + "learning_rate": 2.845473259253557e-05, + "loss": 0.6183, + "step": 8513 + }, + { + "epoch": 0.7612660944206009, + "grad_norm": 0.15574176833643988, + "learning_rate": 2.8434502288356835e-05, + "loss": 0.6625, + "step": 8514 + }, + { + "epoch": 0.7613555078683834, + "grad_norm": 0.13244902848433365, + "learning_rate": 2.8414277986289928e-05, + "loss": 0.6414, + "step": 8515 + }, + { + "epoch": 0.7614449213161659, + "grad_norm": 0.14973420692956999, + "learning_rate": 2.839405968803108e-05, + "loss": 0.6241, + "step": 8516 + }, + { + "epoch": 0.7615343347639485, + "grad_norm": 0.13648097171677304, + "learning_rate": 2.8373847395275966e-05, + "loss": 0.6217, + "step": 8517 + }, + { + "epoch": 0.7616237482117311, + "grad_norm": 0.1429967452393891, + "learning_rate": 2.8353641109719764e-05, + "loss": 0.6429, + "step": 8518 + }, + { + "epoch": 0.7617131616595136, + "grad_norm": 0.12792721930864767, + "learning_rate": 2.833344083305719e-05, + "loss": 0.6584, + "step": 8519 + }, + { + "epoch": 0.7618025751072961, + "grad_norm": 0.14206598781132868, + "learning_rate": 2.8313246566982345e-05, + "loss": 0.6659, + "step": 8520 + }, + { + "epoch": 0.7618919885550787, + "grad_norm": 0.1464265666479497, + "learning_rate": 2.8293058313188935e-05, + "loss": 0.593, + "step": 8521 + }, + { + "epoch": 0.7619814020028612, + "grad_norm": 0.12894630120671385, + "learning_rate": 2.827287607337016e-05, + "loss": 0.554, + "step": 8522 + }, + { + "epoch": 0.7620708154506438, + "grad_norm": 0.15151476194452707, + "learning_rate": 2.8252699849218613e-05, + "loss": 0.5948, + "step": 8523 + }, + { + "epoch": 0.7621602288984263, + "grad_norm": 0.13408661186053827, + "learning_rate": 2.823252964242644e-05, + "loss": 0.625, + "step": 8524 + }, + { + "epoch": 0.7622496423462088, + "grad_norm": 0.1525584601719681, + "learning_rate": 2.8212365454685408e-05, + "loss": 0.6462, + "step": 8525 + }, + { + "epoch": 0.7623390557939914, + "grad_norm": 0.14973398395664408, + "learning_rate": 2.8192207287686555e-05, + "loss": 0.643, + "step": 8526 + }, + { + "epoch": 0.762428469241774, + "grad_norm": 0.1305037482335428, + "learning_rate": 2.8172055143120546e-05, + "loss": 0.6173, + "step": 8527 + }, + { + "epoch": 0.7625178826895566, + "grad_norm": 0.13699376910399963, + "learning_rate": 2.815190902267757e-05, + "loss": 0.602, + "step": 8528 + }, + { + "epoch": 0.762607296137339, + "grad_norm": 0.13177428548877185, + "learning_rate": 2.8131768928047176e-05, + "loss": 0.6324, + "step": 8529 + }, + { + "epoch": 0.7626967095851216, + "grad_norm": 0.13865488155355368, + "learning_rate": 2.8111634860918524e-05, + "loss": 0.6261, + "step": 8530 + }, + { + "epoch": 0.7627861230329042, + "grad_norm": 0.14092116693311757, + "learning_rate": 2.809150682298024e-05, + "loss": 0.633, + "step": 8531 + }, + { + "epoch": 0.7628755364806867, + "grad_norm": 0.13838045582315395, + "learning_rate": 2.807138481592043e-05, + "loss": 0.6011, + "step": 8532 + }, + { + "epoch": 0.7629649499284692, + "grad_norm": 0.14618231319426045, + "learning_rate": 2.8051268841426713e-05, + "loss": 0.6555, + "step": 8533 + }, + { + "epoch": 0.7630543633762518, + "grad_norm": 0.14770069500279992, + "learning_rate": 2.803115890118623e-05, + "loss": 0.6401, + "step": 8534 + }, + { + "epoch": 0.7631437768240343, + "grad_norm": 0.12752646430457812, + "learning_rate": 2.8011054996885477e-05, + "loss": 0.6319, + "step": 8535 + }, + { + "epoch": 0.7632331902718169, + "grad_norm": 0.15668756587726002, + "learning_rate": 2.7990957130210617e-05, + "loss": 0.6158, + "step": 8536 + }, + { + "epoch": 0.7633226037195995, + "grad_norm": 0.1345555795829279, + "learning_rate": 2.797086530284725e-05, + "loss": 0.6478, + "step": 8537 + }, + { + "epoch": 0.7634120171673819, + "grad_norm": 0.15186930203333643, + "learning_rate": 2.795077951648035e-05, + "loss": 0.6247, + "step": 8538 + }, + { + "epoch": 0.7635014306151645, + "grad_norm": 0.14138320856151104, + "learning_rate": 2.7930699772794623e-05, + "loss": 0.6498, + "step": 8539 + }, + { + "epoch": 0.7635908440629471, + "grad_norm": 0.13894205150656153, + "learning_rate": 2.7910626073474045e-05, + "loss": 0.6527, + "step": 8540 + }, + { + "epoch": 0.7636802575107297, + "grad_norm": 0.14741903002411796, + "learning_rate": 2.7890558420202185e-05, + "loss": 0.6436, + "step": 8541 + }, + { + "epoch": 0.7637696709585121, + "grad_norm": 0.13258833650054985, + "learning_rate": 2.787049681466214e-05, + "loss": 0.6351, + "step": 8542 + }, + { + "epoch": 0.7638590844062947, + "grad_norm": 0.13083172739297652, + "learning_rate": 2.7850441258536386e-05, + "loss": 0.6038, + "step": 8543 + }, + { + "epoch": 0.7639484978540773, + "grad_norm": 0.1479994962055604, + "learning_rate": 2.783039175350699e-05, + "loss": 0.6629, + "step": 8544 + }, + { + "epoch": 0.7640379113018598, + "grad_norm": 0.13414238567796394, + "learning_rate": 2.7810348301255486e-05, + "loss": 0.6128, + "step": 8545 + }, + { + "epoch": 0.7641273247496424, + "grad_norm": 0.12984572963178265, + "learning_rate": 2.779031090346287e-05, + "loss": 0.6173, + "step": 8546 + }, + { + "epoch": 0.7642167381974249, + "grad_norm": 0.1459530464896426, + "learning_rate": 2.7770279561809686e-05, + "loss": 0.6315, + "step": 8547 + }, + { + "epoch": 0.7643061516452074, + "grad_norm": 0.1427006441389468, + "learning_rate": 2.775025427797594e-05, + "loss": 0.6179, + "step": 8548 + }, + { + "epoch": 0.76439556509299, + "grad_norm": 0.13847628459614683, + "learning_rate": 2.7730235053641096e-05, + "loss": 0.5991, + "step": 8549 + }, + { + "epoch": 0.7644849785407726, + "grad_norm": 0.1555306504427351, + "learning_rate": 2.7710221890484157e-05, + "loss": 0.6559, + "step": 8550 + }, + { + "epoch": 0.764574391988555, + "grad_norm": 0.13773639787679876, + "learning_rate": 2.7690214790183622e-05, + "loss": 0.588, + "step": 8551 + }, + { + "epoch": 0.7646638054363376, + "grad_norm": 0.13643489682933438, + "learning_rate": 2.7670213754417396e-05, + "loss": 0.6472, + "step": 8552 + }, + { + "epoch": 0.7647532188841202, + "grad_norm": 0.135449366152864, + "learning_rate": 2.7650218784863047e-05, + "loss": 0.6384, + "step": 8553 + }, + { + "epoch": 0.7648426323319027, + "grad_norm": 0.17809177356175548, + "learning_rate": 2.7630229883197433e-05, + "loss": 0.7118, + "step": 8554 + }, + { + "epoch": 0.7649320457796852, + "grad_norm": 0.14864662736606346, + "learning_rate": 2.761024705109705e-05, + "loss": 0.5888, + "step": 8555 + }, + { + "epoch": 0.7650214592274678, + "grad_norm": 0.14352325703924726, + "learning_rate": 2.7590270290237852e-05, + "loss": 0.6315, + "step": 8556 + }, + { + "epoch": 0.7651108726752504, + "grad_norm": 0.14355151162864851, + "learning_rate": 2.75702996022952e-05, + "loss": 0.6596, + "step": 8557 + }, + { + "epoch": 0.7652002861230329, + "grad_norm": 0.15143680394820902, + "learning_rate": 2.755033498894405e-05, + "loss": 0.6089, + "step": 8558 + }, + { + "epoch": 0.7652896995708155, + "grad_norm": 0.14182999086065867, + "learning_rate": 2.7530376451858807e-05, + "loss": 0.592, + "step": 8559 + }, + { + "epoch": 0.765379113018598, + "grad_norm": 0.1275919854406637, + "learning_rate": 2.7510423992713374e-05, + "loss": 0.6468, + "step": 8560 + }, + { + "epoch": 0.7654685264663805, + "grad_norm": 0.1410214294162784, + "learning_rate": 2.749047761318113e-05, + "loss": 0.6147, + "step": 8561 + }, + { + "epoch": 0.7655579399141631, + "grad_norm": 0.1187093355076287, + "learning_rate": 2.7470537314934997e-05, + "loss": 0.6048, + "step": 8562 + }, + { + "epoch": 0.7656473533619457, + "grad_norm": 0.13047783629001994, + "learning_rate": 2.7450603099647266e-05, + "loss": 0.5953, + "step": 8563 + }, + { + "epoch": 0.7657367668097281, + "grad_norm": 0.1539158316057903, + "learning_rate": 2.7430674968989832e-05, + "loss": 0.6564, + "step": 8564 + }, + { + "epoch": 0.7658261802575107, + "grad_norm": 0.1246216633718656, + "learning_rate": 2.7410752924634088e-05, + "loss": 0.6337, + "step": 8565 + }, + { + "epoch": 0.7659155937052933, + "grad_norm": 0.13966351306836725, + "learning_rate": 2.7390836968250766e-05, + "loss": 0.6814, + "step": 8566 + }, + { + "epoch": 0.7660050071530758, + "grad_norm": 0.1435010285564901, + "learning_rate": 2.737092710151029e-05, + "loss": 0.6726, + "step": 8567 + }, + { + "epoch": 0.7660944206008584, + "grad_norm": 0.14266353235627172, + "learning_rate": 2.735102332608247e-05, + "loss": 0.6443, + "step": 8568 + }, + { + "epoch": 0.7661838340486409, + "grad_norm": 0.14911025281715284, + "learning_rate": 2.7331125643636567e-05, + "loss": 0.654, + "step": 8569 + }, + { + "epoch": 0.7662732474964234, + "grad_norm": 0.14495655411392525, + "learning_rate": 2.7311234055841382e-05, + "loss": 0.6529, + "step": 8570 + }, + { + "epoch": 0.766362660944206, + "grad_norm": 0.11922359586838059, + "learning_rate": 2.7291348564365248e-05, + "loss": 0.6425, + "step": 8571 + }, + { + "epoch": 0.7664520743919886, + "grad_norm": 0.13338410554478233, + "learning_rate": 2.7271469170875863e-05, + "loss": 0.6198, + "step": 8572 + }, + { + "epoch": 0.766541487839771, + "grad_norm": 0.14045788073902132, + "learning_rate": 2.7251595877040538e-05, + "loss": 0.6132, + "step": 8573 + }, + { + "epoch": 0.7666309012875536, + "grad_norm": 0.13529558407806458, + "learning_rate": 2.7231728684525992e-05, + "loss": 0.6119, + "step": 8574 + }, + { + "epoch": 0.7667203147353362, + "grad_norm": 0.1413871763877911, + "learning_rate": 2.7211867594998486e-05, + "loss": 0.6752, + "step": 8575 + }, + { + "epoch": 0.7668097281831188, + "grad_norm": 0.1353188619242593, + "learning_rate": 2.7192012610123774e-05, + "loss": 0.575, + "step": 8576 + }, + { + "epoch": 0.7668991416309013, + "grad_norm": 0.13997898786061536, + "learning_rate": 2.7172163731567e-05, + "loss": 0.6197, + "step": 8577 + }, + { + "epoch": 0.7669885550786838, + "grad_norm": 0.1309651338045419, + "learning_rate": 2.7152320960992905e-05, + "loss": 0.6434, + "step": 8578 + }, + { + "epoch": 0.7670779685264664, + "grad_norm": 0.124578696466795, + "learning_rate": 2.71324843000657e-05, + "loss": 0.6374, + "step": 8579 + }, + { + "epoch": 0.7671673819742489, + "grad_norm": 0.13944880339414562, + "learning_rate": 2.711265375044897e-05, + "loss": 0.6302, + "step": 8580 + }, + { + "epoch": 0.7672567954220315, + "grad_norm": 0.1387634040533031, + "learning_rate": 2.709282931380598e-05, + "loss": 0.6553, + "step": 8581 + }, + { + "epoch": 0.767346208869814, + "grad_norm": 0.14434502767206883, + "learning_rate": 2.7073010991799376e-05, + "loss": 0.6465, + "step": 8582 + }, + { + "epoch": 0.7674356223175965, + "grad_norm": 0.13875819665963232, + "learning_rate": 2.705319878609124e-05, + "loss": 0.637, + "step": 8583 + }, + { + "epoch": 0.7675250357653791, + "grad_norm": 0.1325967101443536, + "learning_rate": 2.7033392698343218e-05, + "loss": 0.6575, + "step": 8584 + }, + { + "epoch": 0.7676144492131617, + "grad_norm": 0.14698949050789614, + "learning_rate": 2.7013592730216465e-05, + "loss": 0.6264, + "step": 8585 + }, + { + "epoch": 0.7677038626609443, + "grad_norm": 0.14178791141952596, + "learning_rate": 2.69937988833715e-05, + "loss": 0.6549, + "step": 8586 + }, + { + "epoch": 0.7677932761087267, + "grad_norm": 0.16195401390921751, + "learning_rate": 2.697401115946847e-05, + "loss": 0.6762, + "step": 8587 + }, + { + "epoch": 0.7678826895565093, + "grad_norm": 0.14560143619755758, + "learning_rate": 2.6954229560166923e-05, + "loss": 0.6564, + "step": 8588 + }, + { + "epoch": 0.7679721030042919, + "grad_norm": 0.12425157042803052, + "learning_rate": 2.6934454087125926e-05, + "loss": 0.6413, + "step": 8589 + }, + { + "epoch": 0.7680615164520744, + "grad_norm": 0.14509382067621218, + "learning_rate": 2.6914684742004028e-05, + "loss": 0.6857, + "step": 8590 + }, + { + "epoch": 0.7681509298998569, + "grad_norm": 0.13759596078565348, + "learning_rate": 2.689492152645928e-05, + "loss": 0.6593, + "step": 8591 + }, + { + "epoch": 0.7682403433476395, + "grad_norm": 0.16621289340319043, + "learning_rate": 2.6875164442149147e-05, + "loss": 0.6441, + "step": 8592 + }, + { + "epoch": 0.768329756795422, + "grad_norm": 0.14054020305309503, + "learning_rate": 2.685541349073066e-05, + "loss": 0.6411, + "step": 8593 + }, + { + "epoch": 0.7684191702432046, + "grad_norm": 0.1256413035036636, + "learning_rate": 2.6835668673860314e-05, + "loss": 0.6446, + "step": 8594 + }, + { + "epoch": 0.7685085836909872, + "grad_norm": 0.14510703022288954, + "learning_rate": 2.6815929993194067e-05, + "loss": 0.633, + "step": 8595 + }, + { + "epoch": 0.7685979971387696, + "grad_norm": 0.15314324895262862, + "learning_rate": 2.679619745038743e-05, + "loss": 0.6082, + "step": 8596 + }, + { + "epoch": 0.7686874105865522, + "grad_norm": 0.1326214432014111, + "learning_rate": 2.6776471047095263e-05, + "loss": 0.6323, + "step": 8597 + }, + { + "epoch": 0.7687768240343348, + "grad_norm": 0.13586531481333197, + "learning_rate": 2.675675078497204e-05, + "loss": 0.6498, + "step": 8598 + }, + { + "epoch": 0.7688662374821174, + "grad_norm": 0.13003168521068229, + "learning_rate": 2.67370366656717e-05, + "loss": 0.6136, + "step": 8599 + }, + { + "epoch": 0.7689556509298998, + "grad_norm": 0.14064536208470876, + "learning_rate": 2.6717328690847565e-05, + "loss": 0.6078, + "step": 8600 + }, + { + "epoch": 0.7690450643776824, + "grad_norm": 0.1352924550893056, + "learning_rate": 2.669762686215259e-05, + "loss": 0.6436, + "step": 8601 + }, + { + "epoch": 0.769134477825465, + "grad_norm": 0.13326568496367128, + "learning_rate": 2.6677931181239158e-05, + "loss": 0.5695, + "step": 8602 + }, + { + "epoch": 0.7692238912732475, + "grad_norm": 0.13669487758819993, + "learning_rate": 2.6658241649759062e-05, + "loss": 0.595, + "step": 8603 + }, + { + "epoch": 0.76931330472103, + "grad_norm": 0.14431063655455875, + "learning_rate": 2.6638558269363654e-05, + "loss": 0.6391, + "step": 8604 + }, + { + "epoch": 0.7694027181688126, + "grad_norm": 0.13578087600891825, + "learning_rate": 2.6618881041703804e-05, + "loss": 0.6314, + "step": 8605 + }, + { + "epoch": 0.7694921316165951, + "grad_norm": 0.13165443178171865, + "learning_rate": 2.659920996842975e-05, + "loss": 0.6127, + "step": 8606 + }, + { + "epoch": 0.7695815450643777, + "grad_norm": 0.14191514993104185, + "learning_rate": 2.6579545051191302e-05, + "loss": 0.6456, + "step": 8607 + }, + { + "epoch": 0.7696709585121603, + "grad_norm": 0.13162328499808099, + "learning_rate": 2.6559886291637748e-05, + "loss": 0.6347, + "step": 8608 + }, + { + "epoch": 0.7697603719599427, + "grad_norm": 0.17146672465714594, + "learning_rate": 2.6540233691417837e-05, + "loss": 0.6249, + "step": 8609 + }, + { + "epoch": 0.7698497854077253, + "grad_norm": 0.1429788753804966, + "learning_rate": 2.652058725217983e-05, + "loss": 0.6568, + "step": 8610 + }, + { + "epoch": 0.7699391988555079, + "grad_norm": 0.14638907499112885, + "learning_rate": 2.6500946975571405e-05, + "loss": 0.6158, + "step": 8611 + }, + { + "epoch": 0.7700286123032904, + "grad_norm": 0.1421859402729888, + "learning_rate": 2.6481312863239804e-05, + "loss": 0.6365, + "step": 8612 + }, + { + "epoch": 0.7701180257510729, + "grad_norm": 0.1373617002856777, + "learning_rate": 2.646168491683172e-05, + "loss": 0.6228, + "step": 8613 + }, + { + "epoch": 0.7702074391988555, + "grad_norm": 0.14234687896447815, + "learning_rate": 2.6442063137993255e-05, + "loss": 0.6223, + "step": 8614 + }, + { + "epoch": 0.770296852646638, + "grad_norm": 0.13631220262444008, + "learning_rate": 2.6422447528370152e-05, + "loss": 0.6286, + "step": 8615 + }, + { + "epoch": 0.7703862660944206, + "grad_norm": 0.12728964448199018, + "learning_rate": 2.640283808960754e-05, + "loss": 0.6382, + "step": 8616 + }, + { + "epoch": 0.7704756795422032, + "grad_norm": 0.15262077540918645, + "learning_rate": 2.638323482334999e-05, + "loss": 0.6741, + "step": 8617 + }, + { + "epoch": 0.7705650929899857, + "grad_norm": 0.14836139962512906, + "learning_rate": 2.636363773124163e-05, + "loss": 0.6587, + "step": 8618 + }, + { + "epoch": 0.7706545064377682, + "grad_norm": 0.1318420336078078, + "learning_rate": 2.634404681492607e-05, + "loss": 0.6319, + "step": 8619 + }, + { + "epoch": 0.7707439198855508, + "grad_norm": 0.13280334175542835, + "learning_rate": 2.6324462076046318e-05, + "loss": 0.6256, + "step": 8620 + }, + { + "epoch": 0.7708333333333334, + "grad_norm": 0.1305053152868271, + "learning_rate": 2.630488351624496e-05, + "loss": 0.6286, + "step": 8621 + }, + { + "epoch": 0.7709227467811158, + "grad_norm": 0.1519931202426578, + "learning_rate": 2.6285311137164013e-05, + "loss": 0.63, + "step": 8622 + }, + { + "epoch": 0.7710121602288984, + "grad_norm": 0.1329298551080484, + "learning_rate": 2.6265744940445003e-05, + "loss": 0.6328, + "step": 8623 + }, + { + "epoch": 0.771101573676681, + "grad_norm": 0.1647868007900284, + "learning_rate": 2.624618492772891e-05, + "loss": 0.6359, + "step": 8624 + }, + { + "epoch": 0.7711909871244635, + "grad_norm": 0.12212146476528936, + "learning_rate": 2.622663110065625e-05, + "loss": 0.6463, + "step": 8625 + }, + { + "epoch": 0.7712804005722461, + "grad_norm": 0.13178001887756036, + "learning_rate": 2.6207083460866912e-05, + "loss": 0.5948, + "step": 8626 + }, + { + "epoch": 0.7713698140200286, + "grad_norm": 0.13640508599755147, + "learning_rate": 2.6187542010000367e-05, + "loss": 0.6394, + "step": 8627 + }, + { + "epoch": 0.7714592274678111, + "grad_norm": 0.1273783256736575, + "learning_rate": 2.616800674969553e-05, + "loss": 0.6157, + "step": 8628 + }, + { + "epoch": 0.7715486409155937, + "grad_norm": 0.13512277763388525, + "learning_rate": 2.61484776815908e-05, + "loss": 0.649, + "step": 8629 + }, + { + "epoch": 0.7716380543633763, + "grad_norm": 0.14505291111774749, + "learning_rate": 2.612895480732408e-05, + "loss": 0.6472, + "step": 8630 + }, + { + "epoch": 0.7717274678111588, + "grad_norm": 0.14413064364496422, + "learning_rate": 2.610943812853268e-05, + "loss": 0.6613, + "step": 8631 + }, + { + "epoch": 0.7718168812589413, + "grad_norm": 0.15782071563949934, + "learning_rate": 2.6089927646853474e-05, + "loss": 0.6714, + "step": 8632 + }, + { + "epoch": 0.7719062947067239, + "grad_norm": 0.1432916146144606, + "learning_rate": 2.6070423363922803e-05, + "loss": 0.6183, + "step": 8633 + }, + { + "epoch": 0.7719957081545065, + "grad_norm": 0.14906216468164402, + "learning_rate": 2.6050925281376403e-05, + "loss": 0.6595, + "step": 8634 + }, + { + "epoch": 0.772085121602289, + "grad_norm": 0.12884400602743756, + "learning_rate": 2.603143340084957e-05, + "loss": 0.6403, + "step": 8635 + }, + { + "epoch": 0.7721745350500715, + "grad_norm": 0.14369290845794605, + "learning_rate": 2.601194772397715e-05, + "loss": 0.6528, + "step": 8636 + }, + { + "epoch": 0.7722639484978541, + "grad_norm": 0.12974819028899418, + "learning_rate": 2.5992468252393275e-05, + "loss": 0.5948, + "step": 8637 + }, + { + "epoch": 0.7723533619456366, + "grad_norm": 0.1503579377981833, + "learning_rate": 2.5972994987731714e-05, + "loss": 0.6555, + "step": 8638 + }, + { + "epoch": 0.7724427753934192, + "grad_norm": 0.13913976605416561, + "learning_rate": 2.59535279316257e-05, + "loss": 0.6156, + "step": 8639 + }, + { + "epoch": 0.7725321888412017, + "grad_norm": 0.13377659945953985, + "learning_rate": 2.5934067085707834e-05, + "loss": 0.6218, + "step": 8640 + }, + { + "epoch": 0.7726216022889842, + "grad_norm": 0.14643949507249437, + "learning_rate": 2.591461245161032e-05, + "loss": 0.6283, + "step": 8641 + }, + { + "epoch": 0.7727110157367668, + "grad_norm": 0.13314131463284162, + "learning_rate": 2.589516403096478e-05, + "loss": 0.6126, + "step": 8642 + }, + { + "epoch": 0.7728004291845494, + "grad_norm": 0.12650459134059336, + "learning_rate": 2.5875721825402342e-05, + "loss": 0.615, + "step": 8643 + }, + { + "epoch": 0.772889842632332, + "grad_norm": 0.1385134725786321, + "learning_rate": 2.585628583655362e-05, + "loss": 0.6243, + "step": 8644 + }, + { + "epoch": 0.7729792560801144, + "grad_norm": 0.14576089395758787, + "learning_rate": 2.583685606604863e-05, + "loss": 0.6668, + "step": 8645 + }, + { + "epoch": 0.773068669527897, + "grad_norm": 0.1290429205496534, + "learning_rate": 2.581743251551697e-05, + "loss": 0.6128, + "step": 8646 + }, + { + "epoch": 0.7731580829756796, + "grad_norm": 0.13564919330226474, + "learning_rate": 2.5798015186587643e-05, + "loss": 0.6311, + "step": 8647 + }, + { + "epoch": 0.7732474964234621, + "grad_norm": 0.13426994647609888, + "learning_rate": 2.5778604080889202e-05, + "loss": 0.63, + "step": 8648 + }, + { + "epoch": 0.7733369098712446, + "grad_norm": 0.14001425986708163, + "learning_rate": 2.5759199200049534e-05, + "loss": 0.5691, + "step": 8649 + }, + { + "epoch": 0.7734263233190272, + "grad_norm": 0.13318312734677842, + "learning_rate": 2.5739800545696237e-05, + "loss": 0.6511, + "step": 8650 + }, + { + "epoch": 0.7735157367668097, + "grad_norm": 0.13676261272638107, + "learning_rate": 2.5720408119456152e-05, + "loss": 0.6271, + "step": 8651 + }, + { + "epoch": 0.7736051502145923, + "grad_norm": 0.14125371022010874, + "learning_rate": 2.5701021922955727e-05, + "loss": 0.6427, + "step": 8652 + }, + { + "epoch": 0.7736945636623748, + "grad_norm": 0.1422523918849518, + "learning_rate": 2.56816419578209e-05, + "loss": 0.6272, + "step": 8653 + }, + { + "epoch": 0.7737839771101573, + "grad_norm": 0.13748229803377848, + "learning_rate": 2.5662268225676976e-05, + "loss": 0.6135, + "step": 8654 + }, + { + "epoch": 0.7738733905579399, + "grad_norm": 0.13955712449666827, + "learning_rate": 2.5642900728148832e-05, + "loss": 0.6397, + "step": 8655 + }, + { + "epoch": 0.7739628040057225, + "grad_norm": 0.14413457443610886, + "learning_rate": 2.5623539466860813e-05, + "loss": 0.6405, + "step": 8656 + }, + { + "epoch": 0.774052217453505, + "grad_norm": 0.13925389314307576, + "learning_rate": 2.5604184443436707e-05, + "loss": 0.6433, + "step": 8657 + }, + { + "epoch": 0.7741416309012875, + "grad_norm": 0.14037464513304396, + "learning_rate": 2.5584835659499807e-05, + "loss": 0.6156, + "step": 8658 + }, + { + "epoch": 0.7742310443490701, + "grad_norm": 0.13999159834513747, + "learning_rate": 2.5565493116672902e-05, + "loss": 0.6479, + "step": 8659 + }, + { + "epoch": 0.7743204577968527, + "grad_norm": 0.13389079835517875, + "learning_rate": 2.5546156816578158e-05, + "loss": 0.6615, + "step": 8660 + }, + { + "epoch": 0.7744098712446352, + "grad_norm": 0.13922764602643176, + "learning_rate": 2.552682676083733e-05, + "loss": 0.6306, + "step": 8661 + }, + { + "epoch": 0.7744992846924177, + "grad_norm": 0.15487333822587754, + "learning_rate": 2.5507502951071637e-05, + "loss": 0.6538, + "step": 8662 + }, + { + "epoch": 0.7745886981402003, + "grad_norm": 0.1417031469659411, + "learning_rate": 2.5488185388901642e-05, + "loss": 0.6213, + "step": 8663 + }, + { + "epoch": 0.7746781115879828, + "grad_norm": 0.13035297822573091, + "learning_rate": 2.54688740759476e-05, + "loss": 0.6095, + "step": 8664 + }, + { + "epoch": 0.7747675250357654, + "grad_norm": 0.14464318386243957, + "learning_rate": 2.5449569013829066e-05, + "loss": 0.6495, + "step": 8665 + }, + { + "epoch": 0.774856938483548, + "grad_norm": 0.13805804415966233, + "learning_rate": 2.543027020416514e-05, + "loss": 0.6406, + "step": 8666 + }, + { + "epoch": 0.7749463519313304, + "grad_norm": 0.1378179045892024, + "learning_rate": 2.541097764857442e-05, + "loss": 0.6665, + "step": 8667 + }, + { + "epoch": 0.775035765379113, + "grad_norm": 0.12911905915826585, + "learning_rate": 2.5391691348674894e-05, + "loss": 0.5576, + "step": 8668 + }, + { + "epoch": 0.7751251788268956, + "grad_norm": 0.1324236172494627, + "learning_rate": 2.537241130608411e-05, + "loss": 0.612, + "step": 8669 + }, + { + "epoch": 0.7752145922746781, + "grad_norm": 0.13317287228789496, + "learning_rate": 2.5353137522419067e-05, + "loss": 0.629, + "step": 8670 + }, + { + "epoch": 0.7753040057224606, + "grad_norm": 0.1268662722937612, + "learning_rate": 2.5333869999296223e-05, + "loss": 0.6318, + "step": 8671 + }, + { + "epoch": 0.7753934191702432, + "grad_norm": 0.13965884382710478, + "learning_rate": 2.5314608738331537e-05, + "loss": 0.63, + "step": 8672 + }, + { + "epoch": 0.7754828326180258, + "grad_norm": 0.14328044480199623, + "learning_rate": 2.529535374114044e-05, + "loss": 0.6701, + "step": 8673 + }, + { + "epoch": 0.7755722460658083, + "grad_norm": 0.15828060666702773, + "learning_rate": 2.527610500933778e-05, + "loss": 0.6313, + "step": 8674 + }, + { + "epoch": 0.7756616595135909, + "grad_norm": 0.1491940672511399, + "learning_rate": 2.525686254453795e-05, + "loss": 0.643, + "step": 8675 + }, + { + "epoch": 0.7757510729613734, + "grad_norm": 0.14662093016989206, + "learning_rate": 2.5237626348354813e-05, + "loss": 0.598, + "step": 8676 + }, + { + "epoch": 0.7758404864091559, + "grad_norm": 0.13047959558842406, + "learning_rate": 2.5218396422401614e-05, + "loss": 0.6221, + "step": 8677 + }, + { + "epoch": 0.7759298998569385, + "grad_norm": 0.15715011836274181, + "learning_rate": 2.5199172768291248e-05, + "loss": 0.6831, + "step": 8678 + }, + { + "epoch": 0.7760193133047211, + "grad_norm": 0.13328352932292264, + "learning_rate": 2.51799553876359e-05, + "loss": 0.6325, + "step": 8679 + }, + { + "epoch": 0.7761087267525035, + "grad_norm": 0.15660865221431855, + "learning_rate": 2.5160744282047333e-05, + "loss": 0.6735, + "step": 8680 + }, + { + "epoch": 0.7761981402002861, + "grad_norm": 0.14493749893882876, + "learning_rate": 2.5141539453136755e-05, + "loss": 0.6061, + "step": 8681 + }, + { + "epoch": 0.7762875536480687, + "grad_norm": 0.1426462229240539, + "learning_rate": 2.5122340902514897e-05, + "loss": 0.6121, + "step": 8682 + }, + { + "epoch": 0.7763769670958512, + "grad_norm": 0.13101975898723564, + "learning_rate": 2.510314863179184e-05, + "loss": 0.5969, + "step": 8683 + }, + { + "epoch": 0.7764663805436338, + "grad_norm": 0.15707576241862534, + "learning_rate": 2.508396264257725e-05, + "loss": 0.617, + "step": 8684 + }, + { + "epoch": 0.7765557939914163, + "grad_norm": 0.1360090389928564, + "learning_rate": 2.5064782936480248e-05, + "loss": 0.6517, + "step": 8685 + }, + { + "epoch": 0.7766452074391988, + "grad_norm": 0.1400889063757569, + "learning_rate": 2.5045609515109403e-05, + "loss": 0.6334, + "step": 8686 + }, + { + "epoch": 0.7767346208869814, + "grad_norm": 0.1396156493453935, + "learning_rate": 2.502644238007279e-05, + "loss": 0.6322, + "step": 8687 + }, + { + "epoch": 0.776824034334764, + "grad_norm": 0.12987903176461954, + "learning_rate": 2.500728153297788e-05, + "loss": 0.603, + "step": 8688 + }, + { + "epoch": 0.7769134477825465, + "grad_norm": 0.13967890599021177, + "learning_rate": 2.498812697543169e-05, + "loss": 0.6534, + "step": 8689 + }, + { + "epoch": 0.777002861230329, + "grad_norm": 0.16678617254242042, + "learning_rate": 2.4968978709040713e-05, + "loss": 0.6384, + "step": 8690 + }, + { + "epoch": 0.7770922746781116, + "grad_norm": 0.13099729732835608, + "learning_rate": 2.4949836735410882e-05, + "loss": 0.6065, + "step": 8691 + }, + { + "epoch": 0.7771816881258942, + "grad_norm": 0.13335381460959359, + "learning_rate": 2.4930701056147586e-05, + "loss": 0.6129, + "step": 8692 + }, + { + "epoch": 0.7772711015736766, + "grad_norm": 0.13649741865539536, + "learning_rate": 2.491157167285578e-05, + "loss": 0.5991, + "step": 8693 + }, + { + "epoch": 0.7773605150214592, + "grad_norm": 0.13681832895154583, + "learning_rate": 2.489244858713974e-05, + "loss": 0.6001, + "step": 8694 + }, + { + "epoch": 0.7774499284692418, + "grad_norm": 0.12632667644521037, + "learning_rate": 2.4873331800603327e-05, + "loss": 0.5979, + "step": 8695 + }, + { + "epoch": 0.7775393419170243, + "grad_norm": 0.13541091030164706, + "learning_rate": 2.485422131484987e-05, + "loss": 0.6293, + "step": 8696 + }, + { + "epoch": 0.7776287553648069, + "grad_norm": 0.14818599085477863, + "learning_rate": 2.4835117131482067e-05, + "loss": 0.6587, + "step": 8697 + }, + { + "epoch": 0.7777181688125894, + "grad_norm": 0.11386268951738922, + "learning_rate": 2.4816019252102273e-05, + "loss": 0.633, + "step": 8698 + }, + { + "epoch": 0.7778075822603719, + "grad_norm": 0.13793469937048863, + "learning_rate": 2.479692767831211e-05, + "loss": 0.637, + "step": 8699 + }, + { + "epoch": 0.7778969957081545, + "grad_norm": 0.15879551594114713, + "learning_rate": 2.4777842411712805e-05, + "loss": 0.6483, + "step": 8700 + }, + { + "epoch": 0.7779864091559371, + "grad_norm": 0.13518953169071393, + "learning_rate": 2.4758763453905044e-05, + "loss": 0.6032, + "step": 8701 + }, + { + "epoch": 0.7780758226037195, + "grad_norm": 0.13664548419405873, + "learning_rate": 2.473969080648889e-05, + "loss": 0.6517, + "step": 8702 + }, + { + "epoch": 0.7781652360515021, + "grad_norm": 0.1328737986312152, + "learning_rate": 2.472062447106398e-05, + "loss": 0.6277, + "step": 8703 + }, + { + "epoch": 0.7782546494992847, + "grad_norm": 0.13064191851232038, + "learning_rate": 2.4701564449229374e-05, + "loss": 0.6001, + "step": 8704 + }, + { + "epoch": 0.7783440629470673, + "grad_norm": 0.13222868441115013, + "learning_rate": 2.468251074258362e-05, + "loss": 0.6284, + "step": 8705 + }, + { + "epoch": 0.7784334763948498, + "grad_norm": 0.14385528543682977, + "learning_rate": 2.4663463352724737e-05, + "loss": 0.631, + "step": 8706 + }, + { + "epoch": 0.7785228898426323, + "grad_norm": 0.13917157700154287, + "learning_rate": 2.4644422281250223e-05, + "loss": 0.6037, + "step": 8707 + }, + { + "epoch": 0.7786123032904149, + "grad_norm": 0.14401054616156397, + "learning_rate": 2.462538752975698e-05, + "loss": 0.6457, + "step": 8708 + }, + { + "epoch": 0.7787017167381974, + "grad_norm": 0.1368945508944011, + "learning_rate": 2.4606359099841457e-05, + "loss": 0.6366, + "step": 8709 + }, + { + "epoch": 0.77879113018598, + "grad_norm": 0.1405441896667928, + "learning_rate": 2.4587336993099574e-05, + "loss": 0.6743, + "step": 8710 + }, + { + "epoch": 0.7788805436337625, + "grad_norm": 0.1476638771983185, + "learning_rate": 2.4568321211126598e-05, + "loss": 0.6437, + "step": 8711 + }, + { + "epoch": 0.778969957081545, + "grad_norm": 0.1461762012776933, + "learning_rate": 2.4549311755517457e-05, + "loss": 0.6508, + "step": 8712 + }, + { + "epoch": 0.7790593705293276, + "grad_norm": 0.12512509677756647, + "learning_rate": 2.4530308627866438e-05, + "loss": 0.6427, + "step": 8713 + }, + { + "epoch": 0.7791487839771102, + "grad_norm": 0.1269450860142132, + "learning_rate": 2.451131182976727e-05, + "loss": 0.6219, + "step": 8714 + }, + { + "epoch": 0.7792381974248928, + "grad_norm": 0.14017370575255736, + "learning_rate": 2.4492321362813207e-05, + "loss": 0.6283, + "step": 8715 + }, + { + "epoch": 0.7793276108726752, + "grad_norm": 0.12959865864180364, + "learning_rate": 2.4473337228596994e-05, + "loss": 0.6412, + "step": 8716 + }, + { + "epoch": 0.7794170243204578, + "grad_norm": 0.14975553081429663, + "learning_rate": 2.445435942871074e-05, + "loss": 0.631, + "step": 8717 + }, + { + "epoch": 0.7795064377682404, + "grad_norm": 0.14879351535409555, + "learning_rate": 2.4435387964746127e-05, + "loss": 0.6821, + "step": 8718 + }, + { + "epoch": 0.7795958512160229, + "grad_norm": 0.15833765688022716, + "learning_rate": 2.4416422838294273e-05, + "loss": 0.6846, + "step": 8719 + }, + { + "epoch": 0.7796852646638054, + "grad_norm": 0.13230456238692986, + "learning_rate": 2.439746405094575e-05, + "loss": 0.6261, + "step": 8720 + }, + { + "epoch": 0.779774678111588, + "grad_norm": 0.14507320774533938, + "learning_rate": 2.4378511604290632e-05, + "loss": 0.6459, + "step": 8721 + }, + { + "epoch": 0.7798640915593705, + "grad_norm": 0.15688951503598852, + "learning_rate": 2.4359565499918402e-05, + "loss": 0.6844, + "step": 8722 + }, + { + "epoch": 0.7799535050071531, + "grad_norm": 0.16379680107029587, + "learning_rate": 2.4340625739418055e-05, + "loss": 0.6348, + "step": 8723 + }, + { + "epoch": 0.7800429184549357, + "grad_norm": 0.150291574826323, + "learning_rate": 2.4321692324378087e-05, + "loss": 0.6339, + "step": 8724 + }, + { + "epoch": 0.7801323319027181, + "grad_norm": 0.13259040388762938, + "learning_rate": 2.4302765256386327e-05, + "loss": 0.6356, + "step": 8725 + }, + { + "epoch": 0.7802217453505007, + "grad_norm": 0.142135679413225, + "learning_rate": 2.4283844537030252e-05, + "loss": 0.6669, + "step": 8726 + }, + { + "epoch": 0.7803111587982833, + "grad_norm": 0.14222559553942304, + "learning_rate": 2.4264930167896727e-05, + "loss": 0.6294, + "step": 8727 + }, + { + "epoch": 0.7804005722460658, + "grad_norm": 0.17630161294091998, + "learning_rate": 2.4246022150572024e-05, + "loss": 0.6351, + "step": 8728 + }, + { + "epoch": 0.7804899856938483, + "grad_norm": 0.12210281177940731, + "learning_rate": 2.422712048664194e-05, + "loss": 0.6277, + "step": 8729 + }, + { + "epoch": 0.7805793991416309, + "grad_norm": 0.1452253930973785, + "learning_rate": 2.420822517769179e-05, + "loss": 0.6297, + "step": 8730 + }, + { + "epoch": 0.7806688125894135, + "grad_norm": 0.13895981736038834, + "learning_rate": 2.4189336225306225e-05, + "loss": 0.6583, + "step": 8731 + }, + { + "epoch": 0.780758226037196, + "grad_norm": 0.14353198578947865, + "learning_rate": 2.417045363106948e-05, + "loss": 0.6514, + "step": 8732 + }, + { + "epoch": 0.7808476394849786, + "grad_norm": 0.13437776230296497, + "learning_rate": 2.4151577396565205e-05, + "loss": 0.6238, + "step": 8733 + }, + { + "epoch": 0.780937052932761, + "grad_norm": 0.1283401423377132, + "learning_rate": 2.413270752337653e-05, + "loss": 0.6286, + "step": 8734 + }, + { + "epoch": 0.7810264663805436, + "grad_norm": 0.13959034915136329, + "learning_rate": 2.4113844013086083e-05, + "loss": 0.6112, + "step": 8735 + }, + { + "epoch": 0.7811158798283262, + "grad_norm": 0.12422403027267578, + "learning_rate": 2.409498686727587e-05, + "loss": 0.6296, + "step": 8736 + }, + { + "epoch": 0.7812052932761088, + "grad_norm": 0.12426842433118428, + "learning_rate": 2.4076136087527435e-05, + "loss": 0.6119, + "step": 8737 + }, + { + "epoch": 0.7812947067238912, + "grad_norm": 0.1293712021394388, + "learning_rate": 2.4057291675421768e-05, + "loss": 0.6122, + "step": 8738 + }, + { + "epoch": 0.7813841201716738, + "grad_norm": 0.13376413937454326, + "learning_rate": 2.4038453632539338e-05, + "loss": 0.6563, + "step": 8739 + }, + { + "epoch": 0.7814735336194564, + "grad_norm": 0.13420784445005873, + "learning_rate": 2.4019621960460058e-05, + "loss": 0.6408, + "step": 8740 + }, + { + "epoch": 0.781562947067239, + "grad_norm": 0.14005678805670604, + "learning_rate": 2.4000796660763346e-05, + "loss": 0.6638, + "step": 8741 + }, + { + "epoch": 0.7816523605150214, + "grad_norm": 0.13403911377228053, + "learning_rate": 2.3981977735028018e-05, + "loss": 0.6262, + "step": 8742 + }, + { + "epoch": 0.781741773962804, + "grad_norm": 0.1357578946431819, + "learning_rate": 2.3963165184832403e-05, + "loss": 0.6264, + "step": 8743 + }, + { + "epoch": 0.7818311874105865, + "grad_norm": 0.13950002967930228, + "learning_rate": 2.3944359011754336e-05, + "loss": 0.6739, + "step": 8744 + }, + { + "epoch": 0.7819206008583691, + "grad_norm": 0.14263948399553833, + "learning_rate": 2.3925559217370987e-05, + "loss": 0.6422, + "step": 8745 + }, + { + "epoch": 0.7820100143061517, + "grad_norm": 0.16085760501482813, + "learning_rate": 2.3906765803259078e-05, + "loss": 0.6297, + "step": 8746 + }, + { + "epoch": 0.7820994277539342, + "grad_norm": 0.13600594397756768, + "learning_rate": 2.388797877099489e-05, + "loss": 0.652, + "step": 8747 + }, + { + "epoch": 0.7821888412017167, + "grad_norm": 0.1389317070283114, + "learning_rate": 2.386919812215398e-05, + "loss": 0.6374, + "step": 8748 + }, + { + "epoch": 0.7822782546494993, + "grad_norm": 0.13697017481990892, + "learning_rate": 2.3850423858311466e-05, + "loss": 0.6426, + "step": 8749 + }, + { + "epoch": 0.7823676680972819, + "grad_norm": 0.13232068834777802, + "learning_rate": 2.3831655981041977e-05, + "loss": 0.619, + "step": 8750 + }, + { + "epoch": 0.7824570815450643, + "grad_norm": 0.15210169132748222, + "learning_rate": 2.381289449191948e-05, + "loss": 0.6423, + "step": 8751 + }, + { + "epoch": 0.7825464949928469, + "grad_norm": 0.13601763340183806, + "learning_rate": 2.379413939251751e-05, + "loss": 0.6594, + "step": 8752 + }, + { + "epoch": 0.7826359084406295, + "grad_norm": 0.13636113843981387, + "learning_rate": 2.3775390684409037e-05, + "loss": 0.6475, + "step": 8753 + }, + { + "epoch": 0.782725321888412, + "grad_norm": 0.14149113969588853, + "learning_rate": 2.375664836916649e-05, + "loss": 0.575, + "step": 8754 + }, + { + "epoch": 0.7828147353361946, + "grad_norm": 0.2074799416023357, + "learning_rate": 2.3737912448361798e-05, + "loss": 0.6821, + "step": 8755 + }, + { + "epoch": 0.7829041487839771, + "grad_norm": 0.13794867861152907, + "learning_rate": 2.3719182923566263e-05, + "loss": 0.6638, + "step": 8756 + }, + { + "epoch": 0.7829935622317596, + "grad_norm": 0.16151795438107358, + "learning_rate": 2.3700459796350726e-05, + "loss": 0.6311, + "step": 8757 + }, + { + "epoch": 0.7830829756795422, + "grad_norm": 0.1216100195865001, + "learning_rate": 2.36817430682855e-05, + "loss": 0.6129, + "step": 8758 + }, + { + "epoch": 0.7831723891273248, + "grad_norm": 0.14220514982074686, + "learning_rate": 2.3663032740940293e-05, + "loss": 0.589, + "step": 8759 + }, + { + "epoch": 0.7832618025751072, + "grad_norm": 0.15479752419538303, + "learning_rate": 2.364432881588431e-05, + "loss": 0.6569, + "step": 8760 + }, + { + "epoch": 0.7833512160228898, + "grad_norm": 0.15596073867122506, + "learning_rate": 2.362563129468631e-05, + "loss": 0.5939, + "step": 8761 + }, + { + "epoch": 0.7834406294706724, + "grad_norm": 0.1451926489921474, + "learning_rate": 2.360694017891436e-05, + "loss": 0.6825, + "step": 8762 + }, + { + "epoch": 0.783530042918455, + "grad_norm": 0.13730469117415453, + "learning_rate": 2.358825547013607e-05, + "loss": 0.6556, + "step": 8763 + }, + { + "epoch": 0.7836194563662375, + "grad_norm": 0.13761973138136438, + "learning_rate": 2.3569577169918532e-05, + "loss": 0.6245, + "step": 8764 + }, + { + "epoch": 0.78370886981402, + "grad_norm": 0.15024237580645386, + "learning_rate": 2.355090527982823e-05, + "loss": 0.6892, + "step": 8765 + }, + { + "epoch": 0.7837982832618026, + "grad_norm": 0.13250349040759316, + "learning_rate": 2.353223980143118e-05, + "loss": 0.6449, + "step": 8766 + }, + { + "epoch": 0.7838876967095851, + "grad_norm": 0.1517066320193147, + "learning_rate": 2.351358073629282e-05, + "loss": 0.6348, + "step": 8767 + }, + { + "epoch": 0.7839771101573677, + "grad_norm": 0.1485146840522493, + "learning_rate": 2.3494928085978073e-05, + "loss": 0.651, + "step": 8768 + }, + { + "epoch": 0.7840665236051502, + "grad_norm": 0.1324943649167474, + "learning_rate": 2.3476281852051308e-05, + "loss": 0.5925, + "step": 8769 + }, + { + "epoch": 0.7841559370529327, + "grad_norm": 0.14797799081641377, + "learning_rate": 2.345764203607641e-05, + "loss": 0.6311, + "step": 8770 + }, + { + "epoch": 0.7842453505007153, + "grad_norm": 0.1297876907794252, + "learning_rate": 2.343900863961659e-05, + "loss": 0.6048, + "step": 8771 + }, + { + "epoch": 0.7843347639484979, + "grad_norm": 0.12510274026678442, + "learning_rate": 2.342038166423466e-05, + "loss": 0.6318, + "step": 8772 + }, + { + "epoch": 0.7844241773962805, + "grad_norm": 0.13694329709803796, + "learning_rate": 2.3401761111492836e-05, + "loss": 0.6088, + "step": 8773 + }, + { + "epoch": 0.7845135908440629, + "grad_norm": 0.1393415836248589, + "learning_rate": 2.338314698295281e-05, + "loss": 0.631, + "step": 8774 + }, + { + "epoch": 0.7846030042918455, + "grad_norm": 0.12734855721230134, + "learning_rate": 2.3364539280175734e-05, + "loss": 0.6267, + "step": 8775 + }, + { + "epoch": 0.7846924177396281, + "grad_norm": 0.11855230883618961, + "learning_rate": 2.3345938004722168e-05, + "loss": 0.6024, + "step": 8776 + }, + { + "epoch": 0.7847818311874106, + "grad_norm": 0.14227604068889424, + "learning_rate": 2.3327343158152205e-05, + "loss": 0.6128, + "step": 8777 + }, + { + "epoch": 0.7848712446351931, + "grad_norm": 0.13946561713893324, + "learning_rate": 2.3308754742025406e-05, + "loss": 0.6692, + "step": 8778 + }, + { + "epoch": 0.7849606580829757, + "grad_norm": 0.14342202568157994, + "learning_rate": 2.3290172757900696e-05, + "loss": 0.6721, + "step": 8779 + }, + { + "epoch": 0.7850500715307582, + "grad_norm": 0.141518216498899, + "learning_rate": 2.3271597207336526e-05, + "loss": 0.6253, + "step": 8780 + }, + { + "epoch": 0.7851394849785408, + "grad_norm": 0.12443918607027549, + "learning_rate": 2.3253028091890893e-05, + "loss": 0.6207, + "step": 8781 + }, + { + "epoch": 0.7852288984263234, + "grad_norm": 0.1531988567877942, + "learning_rate": 2.3234465413121086e-05, + "loss": 0.6982, + "step": 8782 + }, + { + "epoch": 0.7853183118741058, + "grad_norm": 0.12240218020692126, + "learning_rate": 2.321590917258395e-05, + "loss": 0.6022, + "step": 8783 + }, + { + "epoch": 0.7854077253218884, + "grad_norm": 0.15761233245835676, + "learning_rate": 2.3197359371835802e-05, + "loss": 0.6345, + "step": 8784 + }, + { + "epoch": 0.785497138769671, + "grad_norm": 0.12774087702504439, + "learning_rate": 2.3178816012432346e-05, + "loss": 0.6564, + "step": 8785 + }, + { + "epoch": 0.7855865522174535, + "grad_norm": 0.1332528747819422, + "learning_rate": 2.3160279095928817e-05, + "loss": 0.6061, + "step": 8786 + }, + { + "epoch": 0.785675965665236, + "grad_norm": 0.12827534923433512, + "learning_rate": 2.3141748623879878e-05, + "loss": 0.5869, + "step": 8787 + }, + { + "epoch": 0.7857653791130186, + "grad_norm": 0.12659210284585506, + "learning_rate": 2.3123224597839664e-05, + "loss": 0.6748, + "step": 8788 + }, + { + "epoch": 0.7858547925608012, + "grad_norm": 0.1505007496694598, + "learning_rate": 2.3104707019361782e-05, + "loss": 0.6897, + "step": 8789 + }, + { + "epoch": 0.7859442060085837, + "grad_norm": 0.1390927578132446, + "learning_rate": 2.3086195889999228e-05, + "loss": 0.6531, + "step": 8790 + }, + { + "epoch": 0.7860336194563662, + "grad_norm": 0.15119889333316094, + "learning_rate": 2.3067691211304544e-05, + "loss": 0.643, + "step": 8791 + }, + { + "epoch": 0.7861230329041488, + "grad_norm": 0.13282209045031418, + "learning_rate": 2.3049192984829715e-05, + "loss": 0.6259, + "step": 8792 + }, + { + "epoch": 0.7862124463519313, + "grad_norm": 0.13439287647799153, + "learning_rate": 2.3030701212126106e-05, + "loss": 0.6228, + "step": 8793 + }, + { + "epoch": 0.7863018597997139, + "grad_norm": 0.14360716404969473, + "learning_rate": 2.3012215894744593e-05, + "loss": 0.6411, + "step": 8794 + }, + { + "epoch": 0.7863912732474965, + "grad_norm": 0.14119549019941158, + "learning_rate": 2.299373703423563e-05, + "loss": 0.6664, + "step": 8795 + }, + { + "epoch": 0.7864806866952789, + "grad_norm": 0.13061125343172938, + "learning_rate": 2.2975264632148896e-05, + "loss": 0.6205, + "step": 8796 + }, + { + "epoch": 0.7865701001430615, + "grad_norm": 0.13199700437843315, + "learning_rate": 2.2956798690033708e-05, + "loss": 0.5826, + "step": 8797 + }, + { + "epoch": 0.7866595135908441, + "grad_norm": 0.1396529497123937, + "learning_rate": 2.2938339209438797e-05, + "loss": 0.6026, + "step": 8798 + }, + { + "epoch": 0.7867489270386266, + "grad_norm": 0.13012727039701302, + "learning_rate": 2.2919886191912277e-05, + "loss": 0.6032, + "step": 8799 + }, + { + "epoch": 0.7868383404864091, + "grad_norm": 0.15097937480678722, + "learning_rate": 2.290143963900181e-05, + "loss": 0.6762, + "step": 8800 + }, + { + "epoch": 0.7869277539341917, + "grad_norm": 0.1350116725824394, + "learning_rate": 2.2882999552254492e-05, + "loss": 0.6213, + "step": 8801 + }, + { + "epoch": 0.7870171673819742, + "grad_norm": 0.1431986176042137, + "learning_rate": 2.2864565933216865e-05, + "loss": 0.6943, + "step": 8802 + }, + { + "epoch": 0.7871065808297568, + "grad_norm": 0.13154483591122595, + "learning_rate": 2.2846138783434944e-05, + "loss": 0.637, + "step": 8803 + }, + { + "epoch": 0.7871959942775394, + "grad_norm": 0.15437063466408757, + "learning_rate": 2.282771810445421e-05, + "loss": 0.6568, + "step": 8804 + }, + { + "epoch": 0.7872854077253219, + "grad_norm": 0.13944192767922112, + "learning_rate": 2.280930389781952e-05, + "loss": 0.6556, + "step": 8805 + }, + { + "epoch": 0.7873748211731044, + "grad_norm": 0.1297316229331341, + "learning_rate": 2.2790896165075305e-05, + "loss": 0.6332, + "step": 8806 + }, + { + "epoch": 0.787464234620887, + "grad_norm": 0.14241776442102036, + "learning_rate": 2.2772494907765406e-05, + "loss": 0.6072, + "step": 8807 + }, + { + "epoch": 0.7875536480686696, + "grad_norm": 0.1492702643228061, + "learning_rate": 2.275410012743303e-05, + "loss": 0.6287, + "step": 8808 + }, + { + "epoch": 0.787643061516452, + "grad_norm": 0.1477398100103291, + "learning_rate": 2.2735711825621052e-05, + "loss": 0.6441, + "step": 8809 + }, + { + "epoch": 0.7877324749642346, + "grad_norm": 0.13756196965200257, + "learning_rate": 2.2717330003871573e-05, + "loss": 0.6135, + "step": 8810 + }, + { + "epoch": 0.7878218884120172, + "grad_norm": 0.1431284044125976, + "learning_rate": 2.26989546637263e-05, + "loss": 0.6, + "step": 8811 + }, + { + "epoch": 0.7879113018597997, + "grad_norm": 0.12685368610387585, + "learning_rate": 2.2680585806726373e-05, + "loss": 0.6152, + "step": 8812 + }, + { + "epoch": 0.7880007153075823, + "grad_norm": 0.13492063224542306, + "learning_rate": 2.266222343441231e-05, + "loss": 0.6159, + "step": 8813 + }, + { + "epoch": 0.7880901287553648, + "grad_norm": 0.13636103095460464, + "learning_rate": 2.264386754832416e-05, + "loss": 0.6292, + "step": 8814 + }, + { + "epoch": 0.7881795422031473, + "grad_norm": 0.14136403730111957, + "learning_rate": 2.2625518150001425e-05, + "loss": 0.6321, + "step": 8815 + }, + { + "epoch": 0.7882689556509299, + "grad_norm": 0.1368617284851829, + "learning_rate": 2.2607175240983026e-05, + "loss": 0.6268, + "step": 8816 + }, + { + "epoch": 0.7883583690987125, + "grad_norm": 0.14995967558230996, + "learning_rate": 2.2588838822807378e-05, + "loss": 0.6381, + "step": 8817 + }, + { + "epoch": 0.788447782546495, + "grad_norm": 0.12975737196632775, + "learning_rate": 2.2570508897012355e-05, + "loss": 0.6075, + "step": 8818 + }, + { + "epoch": 0.7885371959942775, + "grad_norm": 0.16234571700605938, + "learning_rate": 2.2552185465135224e-05, + "loss": 0.6506, + "step": 8819 + }, + { + "epoch": 0.7886266094420601, + "grad_norm": 0.13927898507178954, + "learning_rate": 2.2533868528712755e-05, + "loss": 0.6289, + "step": 8820 + }, + { + "epoch": 0.7887160228898427, + "grad_norm": 0.13705933127678982, + "learning_rate": 2.2515558089281196e-05, + "loss": 0.642, + "step": 8821 + }, + { + "epoch": 0.7888054363376252, + "grad_norm": 0.13903672671244388, + "learning_rate": 2.2497254148376157e-05, + "loss": 0.6576, + "step": 8822 + }, + { + "epoch": 0.7888948497854077, + "grad_norm": 0.1372765605946722, + "learning_rate": 2.247895670753287e-05, + "loss": 0.6402, + "step": 8823 + }, + { + "epoch": 0.7889842632331903, + "grad_norm": 0.14472117309720145, + "learning_rate": 2.2460665768285826e-05, + "loss": 0.6386, + "step": 8824 + }, + { + "epoch": 0.7890736766809728, + "grad_norm": 0.14580727086608516, + "learning_rate": 2.2442381332169115e-05, + "loss": 0.6582, + "step": 8825 + }, + { + "epoch": 0.7891630901287554, + "grad_norm": 0.14900499021473046, + "learning_rate": 2.2424103400716203e-05, + "loss": 0.6481, + "step": 8826 + }, + { + "epoch": 0.7892525035765379, + "grad_norm": 0.1431627528234741, + "learning_rate": 2.240583197546008e-05, + "loss": 0.6159, + "step": 8827 + }, + { + "epoch": 0.7893419170243204, + "grad_norm": 0.15110310309150224, + "learning_rate": 2.23875670579331e-05, + "loss": 0.6322, + "step": 8828 + }, + { + "epoch": 0.789431330472103, + "grad_norm": 0.14141834212293797, + "learning_rate": 2.236930864966713e-05, + "loss": 0.6349, + "step": 8829 + }, + { + "epoch": 0.7895207439198856, + "grad_norm": 0.15606486738954262, + "learning_rate": 2.235105675219349e-05, + "loss": 0.664, + "step": 8830 + }, + { + "epoch": 0.789610157367668, + "grad_norm": 0.15706198553021417, + "learning_rate": 2.2332811367042948e-05, + "loss": 0.6035, + "step": 8831 + }, + { + "epoch": 0.7896995708154506, + "grad_norm": 0.13879822073166817, + "learning_rate": 2.2314572495745746e-05, + "loss": 0.6369, + "step": 8832 + }, + { + "epoch": 0.7897889842632332, + "grad_norm": 0.15236155912204935, + "learning_rate": 2.2296340139831494e-05, + "loss": 0.6585, + "step": 8833 + }, + { + "epoch": 0.7898783977110158, + "grad_norm": 0.1502596430501082, + "learning_rate": 2.2278114300829356e-05, + "loss": 0.6376, + "step": 8834 + }, + { + "epoch": 0.7899678111587983, + "grad_norm": 0.14026416174022954, + "learning_rate": 2.2259894980267937e-05, + "loss": 0.6335, + "step": 8835 + }, + { + "epoch": 0.7900572246065808, + "grad_norm": 0.14212428324065057, + "learning_rate": 2.224168217967518e-05, + "loss": 0.6182, + "step": 8836 + }, + { + "epoch": 0.7901466380543634, + "grad_norm": 0.14430232445871488, + "learning_rate": 2.2223475900578674e-05, + "loss": 0.6436, + "step": 8837 + }, + { + "epoch": 0.7902360515021459, + "grad_norm": 0.14552761974408926, + "learning_rate": 2.220527614450533e-05, + "loss": 0.6079, + "step": 8838 + }, + { + "epoch": 0.7903254649499285, + "grad_norm": 0.12852716841171646, + "learning_rate": 2.2187082912981493e-05, + "loss": 0.6158, + "step": 8839 + }, + { + "epoch": 0.790414878397711, + "grad_norm": 0.1399198631261727, + "learning_rate": 2.216889620753304e-05, + "loss": 0.6369, + "step": 8840 + }, + { + "epoch": 0.7905042918454935, + "grad_norm": 0.13451993696843997, + "learning_rate": 2.215071602968529e-05, + "loss": 0.6242, + "step": 8841 + }, + { + "epoch": 0.7905937052932761, + "grad_norm": 0.12653710350439212, + "learning_rate": 2.213254238096295e-05, + "loss": 0.6007, + "step": 8842 + }, + { + "epoch": 0.7906831187410587, + "grad_norm": 0.13459510957454138, + "learning_rate": 2.211437526289023e-05, + "loss": 0.6176, + "step": 8843 + }, + { + "epoch": 0.7907725321888412, + "grad_norm": 0.13574906846859736, + "learning_rate": 2.20962146769908e-05, + "loss": 0.5965, + "step": 8844 + }, + { + "epoch": 0.7908619456366237, + "grad_norm": 0.14616009894546306, + "learning_rate": 2.2078060624787757e-05, + "loss": 0.6377, + "step": 8845 + }, + { + "epoch": 0.7909513590844063, + "grad_norm": 0.13787226908227596, + "learning_rate": 2.2059913107803697e-05, + "loss": 0.6136, + "step": 8846 + }, + { + "epoch": 0.7910407725321889, + "grad_norm": 0.11950639377811874, + "learning_rate": 2.2041772127560566e-05, + "loss": 0.6057, + "step": 8847 + }, + { + "epoch": 0.7911301859799714, + "grad_norm": 0.13553588571880826, + "learning_rate": 2.2023637685579856e-05, + "loss": 0.6574, + "step": 8848 + }, + { + "epoch": 0.7912195994277539, + "grad_norm": 0.12661898577401745, + "learning_rate": 2.2005509783382517e-05, + "loss": 0.5967, + "step": 8849 + }, + { + "epoch": 0.7913090128755365, + "grad_norm": 0.14814021444056555, + "learning_rate": 2.198738842248882e-05, + "loss": 0.6497, + "step": 8850 + }, + { + "epoch": 0.791398426323319, + "grad_norm": 0.1338729367077627, + "learning_rate": 2.196927360441866e-05, + "loss": 0.6078, + "step": 8851 + }, + { + "epoch": 0.7914878397711016, + "grad_norm": 0.14136565668379955, + "learning_rate": 2.1951165330691324e-05, + "loss": 0.659, + "step": 8852 + }, + { + "epoch": 0.7915772532188842, + "grad_norm": 0.1422371361553859, + "learning_rate": 2.1933063602825455e-05, + "loss": 0.6584, + "step": 8853 + }, + { + "epoch": 0.7916666666666666, + "grad_norm": 0.14038719416024895, + "learning_rate": 2.1914968422339266e-05, + "loss": 0.667, + "step": 8854 + }, + { + "epoch": 0.7917560801144492, + "grad_norm": 0.1525913675013719, + "learning_rate": 2.1896879790750403e-05, + "loss": 0.6421, + "step": 8855 + }, + { + "epoch": 0.7918454935622318, + "grad_norm": 0.134644997961094, + "learning_rate": 2.1878797709575847e-05, + "loss": 0.6423, + "step": 8856 + }, + { + "epoch": 0.7919349070100143, + "grad_norm": 0.13595171362123906, + "learning_rate": 2.186072218033224e-05, + "loss": 0.656, + "step": 8857 + }, + { + "epoch": 0.7920243204577968, + "grad_norm": 0.14822854493989823, + "learning_rate": 2.1842653204535466e-05, + "loss": 0.6053, + "step": 8858 + }, + { + "epoch": 0.7921137339055794, + "grad_norm": 0.15002011750815586, + "learning_rate": 2.1824590783700982e-05, + "loss": 0.6265, + "step": 8859 + }, + { + "epoch": 0.792203147353362, + "grad_norm": 0.12623702416113175, + "learning_rate": 2.1806534919343647e-05, + "loss": 0.6336, + "step": 8860 + }, + { + "epoch": 0.7922925608011445, + "grad_norm": 0.13495675727805678, + "learning_rate": 2.1788485612977827e-05, + "loss": 0.6414, + "step": 8861 + }, + { + "epoch": 0.7923819742489271, + "grad_norm": 0.1377857486051365, + "learning_rate": 2.1770442866117236e-05, + "loss": 0.6351, + "step": 8862 + }, + { + "epoch": 0.7924713876967096, + "grad_norm": 0.13498555553007954, + "learning_rate": 2.1752406680275126e-05, + "loss": 0.6079, + "step": 8863 + }, + { + "epoch": 0.7925608011444921, + "grad_norm": 0.12832196905230525, + "learning_rate": 2.1734377056964172e-05, + "loss": 0.605, + "step": 8864 + }, + { + "epoch": 0.7926502145922747, + "grad_norm": 0.13814037303617335, + "learning_rate": 2.1716353997696482e-05, + "loss": 0.6334, + "step": 8865 + }, + { + "epoch": 0.7927396280400573, + "grad_norm": 0.13617776763588163, + "learning_rate": 2.169833750398368e-05, + "loss": 0.6237, + "step": 8866 + }, + { + "epoch": 0.7928290414878397, + "grad_norm": 0.13279560977230268, + "learning_rate": 2.1680327577336712e-05, + "loss": 0.5847, + "step": 8867 + }, + { + "epoch": 0.7929184549356223, + "grad_norm": 0.1394273484027934, + "learning_rate": 2.1662324219266083e-05, + "loss": 0.6372, + "step": 8868 + }, + { + "epoch": 0.7930078683834049, + "grad_norm": 0.14316755166353523, + "learning_rate": 2.1644327431281742e-05, + "loss": 0.637, + "step": 8869 + }, + { + "epoch": 0.7930972818311874, + "grad_norm": 0.1497051561542668, + "learning_rate": 2.1626337214892978e-05, + "loss": 0.6561, + "step": 8870 + }, + { + "epoch": 0.79318669527897, + "grad_norm": 0.1284206201748203, + "learning_rate": 2.1608353571608685e-05, + "loss": 0.6133, + "step": 8871 + }, + { + "epoch": 0.7932761087267525, + "grad_norm": 0.13580218511974193, + "learning_rate": 2.1590376502937136e-05, + "loss": 0.6423, + "step": 8872 + }, + { + "epoch": 0.793365522174535, + "grad_norm": 0.15143903917525203, + "learning_rate": 2.1572406010385983e-05, + "loss": 0.6585, + "step": 8873 + }, + { + "epoch": 0.7934549356223176, + "grad_norm": 0.15030199863697596, + "learning_rate": 2.1554442095462422e-05, + "loss": 0.6407, + "step": 8874 + }, + { + "epoch": 0.7935443490701002, + "grad_norm": 0.13858301806232864, + "learning_rate": 2.1536484759673092e-05, + "loss": 0.6258, + "step": 8875 + }, + { + "epoch": 0.7936337625178826, + "grad_norm": 0.1295345827805824, + "learning_rate": 2.1518534004523993e-05, + "loss": 0.6118, + "step": 8876 + }, + { + "epoch": 0.7937231759656652, + "grad_norm": 0.13100219081413783, + "learning_rate": 2.150058983152068e-05, + "loss": 0.6684, + "step": 8877 + }, + { + "epoch": 0.7938125894134478, + "grad_norm": 0.14086253681363148, + "learning_rate": 2.1482652242168077e-05, + "loss": 0.6267, + "step": 8878 + }, + { + "epoch": 0.7939020028612304, + "grad_norm": 0.13997409841354058, + "learning_rate": 2.146472123797062e-05, + "loss": 0.6768, + "step": 8879 + }, + { + "epoch": 0.7939914163090128, + "grad_norm": 0.12581886750123678, + "learning_rate": 2.1446796820432167e-05, + "loss": 0.6113, + "step": 8880 + }, + { + "epoch": 0.7940808297567954, + "grad_norm": 0.1561245727544637, + "learning_rate": 2.1428878991055966e-05, + "loss": 0.6871, + "step": 8881 + }, + { + "epoch": 0.794170243204578, + "grad_norm": 0.13378963197394994, + "learning_rate": 2.1410967751344803e-05, + "loss": 0.5947, + "step": 8882 + }, + { + "epoch": 0.7942596566523605, + "grad_norm": 0.12993203602791353, + "learning_rate": 2.1393063102800847e-05, + "loss": 0.626, + "step": 8883 + }, + { + "epoch": 0.7943490701001431, + "grad_norm": 0.13607135921310964, + "learning_rate": 2.137516504692577e-05, + "loss": 0.6085, + "step": 8884 + }, + { + "epoch": 0.7944384835479256, + "grad_norm": 0.11775609093608475, + "learning_rate": 2.135727358522064e-05, + "loss": 0.6104, + "step": 8885 + }, + { + "epoch": 0.7945278969957081, + "grad_norm": 0.1387427586009419, + "learning_rate": 2.1339388719186028e-05, + "loss": 0.6662, + "step": 8886 + }, + { + "epoch": 0.7946173104434907, + "grad_norm": 0.15496127404660098, + "learning_rate": 2.1321510450321858e-05, + "loss": 0.6511, + "step": 8887 + }, + { + "epoch": 0.7947067238912733, + "grad_norm": 0.14660621473131608, + "learning_rate": 2.1303638780127588e-05, + "loss": 0.7032, + "step": 8888 + }, + { + "epoch": 0.7947961373390557, + "grad_norm": 0.14668574910787954, + "learning_rate": 2.128577371010212e-05, + "loss": 0.6403, + "step": 8889 + }, + { + "epoch": 0.7948855507868383, + "grad_norm": 0.12548404325539747, + "learning_rate": 2.126791524174372e-05, + "loss": 0.597, + "step": 8890 + }, + { + "epoch": 0.7949749642346209, + "grad_norm": 0.1333436356386548, + "learning_rate": 2.1250063376550154e-05, + "loss": 0.6471, + "step": 8891 + }, + { + "epoch": 0.7950643776824035, + "grad_norm": 0.1397911263289146, + "learning_rate": 2.1232218116018722e-05, + "loss": 0.6397, + "step": 8892 + }, + { + "epoch": 0.795153791130186, + "grad_norm": 0.14136232158912457, + "learning_rate": 2.1214379461646005e-05, + "loss": 0.6653, + "step": 8893 + }, + { + "epoch": 0.7952432045779685, + "grad_norm": 0.1470068746020934, + "learning_rate": 2.1196547414928137e-05, + "loss": 0.6611, + "step": 8894 + }, + { + "epoch": 0.7953326180257511, + "grad_norm": 0.14317920814107626, + "learning_rate": 2.1178721977360684e-05, + "loss": 0.6267, + "step": 8895 + }, + { + "epoch": 0.7954220314735336, + "grad_norm": 0.14454833682806567, + "learning_rate": 2.1160903150438605e-05, + "loss": 0.6171, + "step": 8896 + }, + { + "epoch": 0.7955114449213162, + "grad_norm": 0.1524023740512294, + "learning_rate": 2.114309093565637e-05, + "loss": 0.6531, + "step": 8897 + }, + { + "epoch": 0.7956008583690987, + "grad_norm": 0.14069640708951528, + "learning_rate": 2.112528533450786e-05, + "loss": 0.6206, + "step": 8898 + }, + { + "epoch": 0.7956902718168812, + "grad_norm": 0.14736725375067353, + "learning_rate": 2.1107486348486406e-05, + "loss": 0.6042, + "step": 8899 + }, + { + "epoch": 0.7957796852646638, + "grad_norm": 0.12327346792789697, + "learning_rate": 2.1089693979084825e-05, + "loss": 0.595, + "step": 8900 + }, + { + "epoch": 0.7958690987124464, + "grad_norm": 0.15755428836309074, + "learning_rate": 2.107190822779529e-05, + "loss": 0.6645, + "step": 8901 + }, + { + "epoch": 0.795958512160229, + "grad_norm": 0.140684312387852, + "learning_rate": 2.1054129096109486e-05, + "loss": 0.6302, + "step": 8902 + }, + { + "epoch": 0.7960479256080114, + "grad_norm": 0.13520300439882796, + "learning_rate": 2.103635658551856e-05, + "loss": 0.6284, + "step": 8903 + }, + { + "epoch": 0.796137339055794, + "grad_norm": 0.14550185742406743, + "learning_rate": 2.101859069751301e-05, + "loss": 0.6703, + "step": 8904 + }, + { + "epoch": 0.7962267525035766, + "grad_norm": 0.11811857681260582, + "learning_rate": 2.1000831433582856e-05, + "loss": 0.6147, + "step": 8905 + }, + { + "epoch": 0.7963161659513591, + "grad_norm": 0.1337238024507231, + "learning_rate": 2.0983078795217603e-05, + "loss": 0.6349, + "step": 8906 + }, + { + "epoch": 0.7964055793991416, + "grad_norm": 0.12635686876104693, + "learning_rate": 2.0965332783906087e-05, + "loss": 0.6257, + "step": 8907 + }, + { + "epoch": 0.7964949928469242, + "grad_norm": 0.13611861674733922, + "learning_rate": 2.0947593401136657e-05, + "loss": 0.6276, + "step": 8908 + }, + { + "epoch": 0.7965844062947067, + "grad_norm": 0.13974169089495384, + "learning_rate": 2.0929860648397126e-05, + "loss": 0.6382, + "step": 8909 + }, + { + "epoch": 0.7966738197424893, + "grad_norm": 0.12885986879485165, + "learning_rate": 2.0912134527174664e-05, + "loss": 0.6075, + "step": 8910 + }, + { + "epoch": 0.7967632331902719, + "grad_norm": 0.12938343296046265, + "learning_rate": 2.0894415038955962e-05, + "loss": 0.6513, + "step": 8911 + }, + { + "epoch": 0.7968526466380543, + "grad_norm": 0.15000842289941257, + "learning_rate": 2.0876702185227137e-05, + "loss": 0.6645, + "step": 8912 + }, + { + "epoch": 0.7969420600858369, + "grad_norm": 0.13953453524700982, + "learning_rate": 2.085899596747375e-05, + "loss": 0.6364, + "step": 8913 + }, + { + "epoch": 0.7970314735336195, + "grad_norm": 0.1305387064636346, + "learning_rate": 2.084129638718081e-05, + "loss": 0.6186, + "step": 8914 + }, + { + "epoch": 0.797120886981402, + "grad_norm": 0.12760345028486064, + "learning_rate": 2.082360344583272e-05, + "loss": 0.6045, + "step": 8915 + }, + { + "epoch": 0.7972103004291845, + "grad_norm": 0.1311733613114143, + "learning_rate": 2.080591714491339e-05, + "loss": 0.5936, + "step": 8916 + }, + { + "epoch": 0.7972997138769671, + "grad_norm": 0.136485493540591, + "learning_rate": 2.0788237485906135e-05, + "loss": 0.641, + "step": 8917 + }, + { + "epoch": 0.7973891273247496, + "grad_norm": 0.1368671357997628, + "learning_rate": 2.0770564470293775e-05, + "loss": 0.5914, + "step": 8918 + }, + { + "epoch": 0.7974785407725322, + "grad_norm": 0.1320595835525534, + "learning_rate": 2.0752898099558437e-05, + "loss": 0.6332, + "step": 8919 + }, + { + "epoch": 0.7975679542203148, + "grad_norm": 0.1273751010876373, + "learning_rate": 2.0735238375181875e-05, + "loss": 0.6308, + "step": 8920 + }, + { + "epoch": 0.7976573676680973, + "grad_norm": 0.14901250419448206, + "learning_rate": 2.0717585298645127e-05, + "loss": 0.6606, + "step": 8921 + }, + { + "epoch": 0.7977467811158798, + "grad_norm": 0.12633514671378496, + "learning_rate": 2.069993887142874e-05, + "loss": 0.5876, + "step": 8922 + }, + { + "epoch": 0.7978361945636624, + "grad_norm": 0.1289636042206656, + "learning_rate": 2.0682299095012747e-05, + "loss": 0.6283, + "step": 8923 + }, + { + "epoch": 0.797925608011445, + "grad_norm": 0.14249442484121075, + "learning_rate": 2.0664665970876496e-05, + "loss": 0.6739, + "step": 8924 + }, + { + "epoch": 0.7980150214592274, + "grad_norm": 0.13629761241369076, + "learning_rate": 2.064703950049891e-05, + "loss": 0.6189, + "step": 8925 + }, + { + "epoch": 0.79810443490701, + "grad_norm": 0.1396926611219298, + "learning_rate": 2.0629419685358286e-05, + "loss": 0.6403, + "step": 8926 + }, + { + "epoch": 0.7981938483547926, + "grad_norm": 0.1301983239285953, + "learning_rate": 2.0611806526932364e-05, + "loss": 0.6177, + "step": 8927 + }, + { + "epoch": 0.7982832618025751, + "grad_norm": 0.14271665694721208, + "learning_rate": 2.0594200026698363e-05, + "loss": 0.6412, + "step": 8928 + }, + { + "epoch": 0.7983726752503576, + "grad_norm": 0.14207348270652417, + "learning_rate": 2.0576600186132934e-05, + "loss": 0.6615, + "step": 8929 + }, + { + "epoch": 0.7984620886981402, + "grad_norm": 0.14104669731796635, + "learning_rate": 2.0559007006712106e-05, + "loss": 0.6286, + "step": 8930 + }, + { + "epoch": 0.7985515021459227, + "grad_norm": 0.15221169101352683, + "learning_rate": 2.0541420489911413e-05, + "loss": 0.6268, + "step": 8931 + }, + { + "epoch": 0.7986409155937053, + "grad_norm": 0.13405651926134665, + "learning_rate": 2.052384063720585e-05, + "loss": 0.6287, + "step": 8932 + }, + { + "epoch": 0.7987303290414879, + "grad_norm": 0.14616003799884925, + "learning_rate": 2.0506267450069737e-05, + "loss": 0.6243, + "step": 8933 + }, + { + "epoch": 0.7988197424892703, + "grad_norm": 0.12526211585384842, + "learning_rate": 2.048870092997702e-05, + "loss": 0.6197, + "step": 8934 + }, + { + "epoch": 0.7989091559370529, + "grad_norm": 0.15812459814355245, + "learning_rate": 2.0471141078400912e-05, + "loss": 0.6464, + "step": 8935 + }, + { + "epoch": 0.7989985693848355, + "grad_norm": 0.136224390587284, + "learning_rate": 2.0453587896814142e-05, + "loss": 0.6258, + "step": 8936 + }, + { + "epoch": 0.7990879828326181, + "grad_norm": 0.16329859493308782, + "learning_rate": 2.0436041386688932e-05, + "loss": 0.6345, + "step": 8937 + }, + { + "epoch": 0.7991773962804005, + "grad_norm": 0.13527454663647046, + "learning_rate": 2.0418501549496792e-05, + "loss": 0.6472, + "step": 8938 + }, + { + "epoch": 0.7992668097281831, + "grad_norm": 0.1302647649158334, + "learning_rate": 2.040096838670881e-05, + "loss": 0.6277, + "step": 8939 + }, + { + "epoch": 0.7993562231759657, + "grad_norm": 0.16969001187320398, + "learning_rate": 2.0383441899795518e-05, + "loss": 0.6499, + "step": 8940 + }, + { + "epoch": 0.7994456366237482, + "grad_norm": 0.12872724206434952, + "learning_rate": 2.0365922090226784e-05, + "loss": 0.6322, + "step": 8941 + }, + { + "epoch": 0.7995350500715308, + "grad_norm": 0.15478615491499936, + "learning_rate": 2.034840895947199e-05, + "loss": 0.6727, + "step": 8942 + }, + { + "epoch": 0.7996244635193133, + "grad_norm": 0.1424367461125504, + "learning_rate": 2.033090250899997e-05, + "loss": 0.6382, + "step": 8943 + }, + { + "epoch": 0.7997138769670958, + "grad_norm": 0.13622534425013363, + "learning_rate": 2.0313402740278908e-05, + "loss": 0.6477, + "step": 8944 + }, + { + "epoch": 0.7998032904148784, + "grad_norm": 0.1459575924153453, + "learning_rate": 2.0295909654776524e-05, + "loss": 0.6306, + "step": 8945 + }, + { + "epoch": 0.799892703862661, + "grad_norm": 0.145421897659671, + "learning_rate": 2.0278423253959934e-05, + "loss": 0.6841, + "step": 8946 + }, + { + "epoch": 0.7999821173104434, + "grad_norm": 0.140685279432563, + "learning_rate": 2.026094353929572e-05, + "loss": 0.634, + "step": 8947 + }, + { + "epoch": 0.800071530758226, + "grad_norm": 0.1321441169575951, + "learning_rate": 2.024347051224985e-05, + "loss": 0.6142, + "step": 8948 + }, + { + "epoch": 0.8001609442060086, + "grad_norm": 0.14192560792250822, + "learning_rate": 2.0226004174287827e-05, + "loss": 0.6407, + "step": 8949 + }, + { + "epoch": 0.8002503576537912, + "grad_norm": 0.1397839074791712, + "learning_rate": 2.0208544526874475e-05, + "loss": 0.6364, + "step": 8950 + }, + { + "epoch": 0.8003397711015737, + "grad_norm": 0.13106354039323298, + "learning_rate": 2.0191091571474108e-05, + "loss": 0.6008, + "step": 8951 + }, + { + "epoch": 0.8004291845493562, + "grad_norm": 0.11622261392075374, + "learning_rate": 2.0173645309550548e-05, + "loss": 0.631, + "step": 8952 + }, + { + "epoch": 0.8005185979971388, + "grad_norm": 0.12737679153208425, + "learning_rate": 2.0156205742566892e-05, + "loss": 0.613, + "step": 8953 + }, + { + "epoch": 0.8006080114449213, + "grad_norm": 0.13939748045401143, + "learning_rate": 2.013877287198588e-05, + "loss": 0.6424, + "step": 8954 + }, + { + "epoch": 0.8006974248927039, + "grad_norm": 0.13175203037021244, + "learning_rate": 2.0121346699269516e-05, + "loss": 0.6351, + "step": 8955 + }, + { + "epoch": 0.8007868383404864, + "grad_norm": 0.13379963909740963, + "learning_rate": 2.0103927225879336e-05, + "loss": 0.6486, + "step": 8956 + }, + { + "epoch": 0.8008762517882689, + "grad_norm": 0.14152348523212038, + "learning_rate": 2.008651445327633e-05, + "loss": 0.6527, + "step": 8957 + }, + { + "epoch": 0.8009656652360515, + "grad_norm": 0.1378393004593277, + "learning_rate": 2.00691083829208e-05, + "loss": 0.6421, + "step": 8958 + }, + { + "epoch": 0.8010550786838341, + "grad_norm": 0.14571857080193973, + "learning_rate": 2.0051709016272625e-05, + "loss": 0.6285, + "step": 8959 + }, + { + "epoch": 0.8011444921316166, + "grad_norm": 0.1360490580319798, + "learning_rate": 2.0034316354791062e-05, + "loss": 0.6045, + "step": 8960 + }, + { + "epoch": 0.8012339055793991, + "grad_norm": 0.1662177255506221, + "learning_rate": 2.001693039993482e-05, + "loss": 0.6546, + "step": 8961 + }, + { + "epoch": 0.8013233190271817, + "grad_norm": 0.13330374734914094, + "learning_rate": 1.9999551153162022e-05, + "loss": 0.6207, + "step": 8962 + }, + { + "epoch": 0.8014127324749643, + "grad_norm": 0.1388858229620465, + "learning_rate": 1.998217861593028e-05, + "loss": 0.6725, + "step": 8963 + }, + { + "epoch": 0.8015021459227468, + "grad_norm": 0.12868156756871582, + "learning_rate": 1.996481278969655e-05, + "loss": 0.639, + "step": 8964 + }, + { + "epoch": 0.8015915593705293, + "grad_norm": 0.1269993315171469, + "learning_rate": 1.9947453675917316e-05, + "loss": 0.6018, + "step": 8965 + }, + { + "epoch": 0.8016809728183119, + "grad_norm": 0.1616563685245582, + "learning_rate": 1.9930101276048485e-05, + "loss": 0.679, + "step": 8966 + }, + { + "epoch": 0.8017703862660944, + "grad_norm": 0.13289272160921123, + "learning_rate": 1.9912755591545317e-05, + "loss": 0.6185, + "step": 8967 + }, + { + "epoch": 0.801859799713877, + "grad_norm": 0.12529681270528553, + "learning_rate": 1.9895416623862662e-05, + "loss": 0.6237, + "step": 8968 + }, + { + "epoch": 0.8019492131616596, + "grad_norm": 0.1444101716396247, + "learning_rate": 1.9878084374454653e-05, + "loss": 0.6515, + "step": 8969 + }, + { + "epoch": 0.802038626609442, + "grad_norm": 0.15201767603852312, + "learning_rate": 1.986075884477494e-05, + "loss": 0.6265, + "step": 8970 + }, + { + "epoch": 0.8021280400572246, + "grad_norm": 0.16524375748454523, + "learning_rate": 1.984344003627663e-05, + "loss": 0.6286, + "step": 8971 + }, + { + "epoch": 0.8022174535050072, + "grad_norm": 0.15143784298272472, + "learning_rate": 1.9826127950412167e-05, + "loss": 0.6653, + "step": 8972 + }, + { + "epoch": 0.8023068669527897, + "grad_norm": 0.14850262575597925, + "learning_rate": 1.9808822588633535e-05, + "loss": 0.6322, + "step": 8973 + }, + { + "epoch": 0.8023962804005722, + "grad_norm": 0.14053996483785122, + "learning_rate": 1.97915239523921e-05, + "loss": 0.6565, + "step": 8974 + }, + { + "epoch": 0.8024856938483548, + "grad_norm": 0.1397770849944675, + "learning_rate": 1.9774232043138685e-05, + "loss": 0.6531, + "step": 8975 + }, + { + "epoch": 0.8025751072961373, + "grad_norm": 0.16595569165329732, + "learning_rate": 1.9756946862323535e-05, + "loss": 0.5997, + "step": 8976 + }, + { + "epoch": 0.8026645207439199, + "grad_norm": 0.15758103573184717, + "learning_rate": 1.9739668411396383e-05, + "loss": 0.6482, + "step": 8977 + }, + { + "epoch": 0.8027539341917024, + "grad_norm": 0.15284664449382845, + "learning_rate": 1.9722396691806267e-05, + "loss": 0.6507, + "step": 8978 + }, + { + "epoch": 0.802843347639485, + "grad_norm": 0.14550718603140675, + "learning_rate": 1.97051317050018e-05, + "loss": 0.6358, + "step": 8979 + }, + { + "epoch": 0.8029327610872675, + "grad_norm": 0.14394440705961709, + "learning_rate": 1.9687873452430995e-05, + "loss": 0.6619, + "step": 8980 + }, + { + "epoch": 0.8030221745350501, + "grad_norm": 0.13691405481441266, + "learning_rate": 1.967062193554119e-05, + "loss": 0.6361, + "step": 8981 + }, + { + "epoch": 0.8031115879828327, + "grad_norm": 0.140322206286646, + "learning_rate": 1.965337715577934e-05, + "loss": 0.6699, + "step": 8982 + }, + { + "epoch": 0.8032010014306151, + "grad_norm": 0.1530362290222312, + "learning_rate": 1.9636139114591747e-05, + "loss": 0.6292, + "step": 8983 + }, + { + "epoch": 0.8032904148783977, + "grad_norm": 0.14815952002921118, + "learning_rate": 1.961890781342408e-05, + "loss": 0.6375, + "step": 8984 + }, + { + "epoch": 0.8033798283261803, + "grad_norm": 0.12855816488820768, + "learning_rate": 1.9601683253721536e-05, + "loss": 0.6554, + "step": 8985 + }, + { + "epoch": 0.8034692417739628, + "grad_norm": 0.15016233090526296, + "learning_rate": 1.9584465436928745e-05, + "loss": 0.6503, + "step": 8986 + }, + { + "epoch": 0.8035586552217453, + "grad_norm": 0.14017396799982865, + "learning_rate": 1.9567254364489694e-05, + "loss": 0.6303, + "step": 8987 + }, + { + "epoch": 0.8036480686695279, + "grad_norm": 0.12420340260600207, + "learning_rate": 1.955005003784789e-05, + "loss": 0.6371, + "step": 8988 + }, + { + "epoch": 0.8037374821173104, + "grad_norm": 0.15076854671520723, + "learning_rate": 1.9532852458446228e-05, + "loss": 0.6519, + "step": 8989 + }, + { + "epoch": 0.803826895565093, + "grad_norm": 0.13212001439849358, + "learning_rate": 1.9515661627727044e-05, + "loss": 0.6091, + "step": 8990 + }, + { + "epoch": 0.8039163090128756, + "grad_norm": 0.1521705679931749, + "learning_rate": 1.9498477547132154e-05, + "loss": 0.6552, + "step": 8991 + }, + { + "epoch": 0.804005722460658, + "grad_norm": 0.14579984744546945, + "learning_rate": 1.9481300218102692e-05, + "loss": 0.6766, + "step": 8992 + }, + { + "epoch": 0.8040951359084406, + "grad_norm": 0.14066303977971073, + "learning_rate": 1.9464129642079355e-05, + "loss": 0.6391, + "step": 8993 + }, + { + "epoch": 0.8041845493562232, + "grad_norm": 0.14735501079069405, + "learning_rate": 1.9446965820502218e-05, + "loss": 0.6646, + "step": 8994 + }, + { + "epoch": 0.8042739628040058, + "grad_norm": 0.1297222157737908, + "learning_rate": 1.9429808754810717e-05, + "loss": 0.6182, + "step": 8995 + }, + { + "epoch": 0.8043633762517882, + "grad_norm": 0.13511718349031024, + "learning_rate": 1.9412658446443887e-05, + "loss": 0.6322, + "step": 8996 + }, + { + "epoch": 0.8044527896995708, + "grad_norm": 0.12125912273592782, + "learning_rate": 1.9395514896840093e-05, + "loss": 0.6214, + "step": 8997 + }, + { + "epoch": 0.8045422031473534, + "grad_norm": 0.13659975873152574, + "learning_rate": 1.93783781074371e-05, + "loss": 0.6147, + "step": 8998 + }, + { + "epoch": 0.8046316165951359, + "grad_norm": 0.1426103920772694, + "learning_rate": 1.9361248079672158e-05, + "loss": 0.6687, + "step": 8999 + }, + { + "epoch": 0.8047210300429185, + "grad_norm": 0.13125327605565823, + "learning_rate": 1.934412481498198e-05, + "loss": 0.6133, + "step": 9000 + }, + { + "epoch": 0.804810443490701, + "grad_norm": 0.12836997213879028, + "learning_rate": 1.932700831480262e-05, + "loss": 0.6365, + "step": 9001 + }, + { + "epoch": 0.8048998569384835, + "grad_norm": 0.13264701181232022, + "learning_rate": 1.930989858056965e-05, + "loss": 0.6497, + "step": 9002 + }, + { + "epoch": 0.8049892703862661, + "grad_norm": 0.13289530831244617, + "learning_rate": 1.929279561371803e-05, + "loss": 0.5988, + "step": 9003 + }, + { + "epoch": 0.8050786838340487, + "grad_norm": 0.14491121436584228, + "learning_rate": 1.927569941568218e-05, + "loss": 0.6578, + "step": 9004 + }, + { + "epoch": 0.8051680972818311, + "grad_norm": 0.13302130799803422, + "learning_rate": 1.9258609987895926e-05, + "loss": 0.6444, + "step": 9005 + }, + { + "epoch": 0.8052575107296137, + "grad_norm": 0.1322886147793534, + "learning_rate": 1.9241527331792562e-05, + "loss": 0.6165, + "step": 9006 + }, + { + "epoch": 0.8053469241773963, + "grad_norm": 0.13648523622511904, + "learning_rate": 1.922445144880475e-05, + "loss": 0.6634, + "step": 9007 + }, + { + "epoch": 0.8054363376251789, + "grad_norm": 0.13011123676420072, + "learning_rate": 1.9207382340364634e-05, + "loss": 0.6402, + "step": 9008 + }, + { + "epoch": 0.8055257510729614, + "grad_norm": 0.14455733317680594, + "learning_rate": 1.9190320007903796e-05, + "loss": 0.6555, + "step": 9009 + }, + { + "epoch": 0.8056151645207439, + "grad_norm": 0.11997076620809287, + "learning_rate": 1.9173264452853222e-05, + "loss": 0.604, + "step": 9010 + }, + { + "epoch": 0.8057045779685265, + "grad_norm": 0.12506813646087178, + "learning_rate": 1.9156215676643375e-05, + "loss": 0.6236, + "step": 9011 + }, + { + "epoch": 0.805793991416309, + "grad_norm": 0.14793020736718726, + "learning_rate": 1.913917368070406e-05, + "loss": 0.6065, + "step": 9012 + }, + { + "epoch": 0.8058834048640916, + "grad_norm": 0.13040534289372732, + "learning_rate": 1.912213846646459e-05, + "loss": 0.6139, + "step": 9013 + }, + { + "epoch": 0.8059728183118741, + "grad_norm": 0.1362408450954967, + "learning_rate": 1.9105110035353714e-05, + "loss": 0.6318, + "step": 9014 + }, + { + "epoch": 0.8060622317596566, + "grad_norm": 0.1307411702007902, + "learning_rate": 1.9088088388799542e-05, + "loss": 0.642, + "step": 9015 + }, + { + "epoch": 0.8061516452074392, + "grad_norm": 0.15253589536127893, + "learning_rate": 1.9071073528229655e-05, + "loss": 0.646, + "step": 9016 + }, + { + "epoch": 0.8062410586552218, + "grad_norm": 0.15018404599146246, + "learning_rate": 1.9054065455071136e-05, + "loss": 0.6427, + "step": 9017 + }, + { + "epoch": 0.8063304721030042, + "grad_norm": 0.14029748190255667, + "learning_rate": 1.9037064170750373e-05, + "loss": 0.6544, + "step": 9018 + }, + { + "epoch": 0.8064198855507868, + "grad_norm": 0.13957360577497127, + "learning_rate": 1.9020069676693252e-05, + "loss": 0.6785, + "step": 9019 + }, + { + "epoch": 0.8065092989985694, + "grad_norm": 0.13829873450066227, + "learning_rate": 1.9003081974325122e-05, + "loss": 0.6572, + "step": 9020 + }, + { + "epoch": 0.806598712446352, + "grad_norm": 0.13670640469265516, + "learning_rate": 1.898610106507066e-05, + "loss": 0.6167, + "step": 9021 + }, + { + "epoch": 0.8066881258941345, + "grad_norm": 0.12497107316049365, + "learning_rate": 1.8969126950354055e-05, + "loss": 0.6138, + "step": 9022 + }, + { + "epoch": 0.806777539341917, + "grad_norm": 0.12919984199350867, + "learning_rate": 1.8952159631598922e-05, + "loss": 0.5824, + "step": 9023 + }, + { + "epoch": 0.8068669527896996, + "grad_norm": 0.1268022191972343, + "learning_rate": 1.8935199110228275e-05, + "loss": 0.6336, + "step": 9024 + }, + { + "epoch": 0.8069563662374821, + "grad_norm": 0.14165865218247412, + "learning_rate": 1.8918245387664602e-05, + "loss": 0.6463, + "step": 9025 + }, + { + "epoch": 0.8070457796852647, + "grad_norm": 0.13720488224088984, + "learning_rate": 1.8901298465329743e-05, + "loss": 0.6191, + "step": 9026 + }, + { + "epoch": 0.8071351931330472, + "grad_norm": 0.1352000162868002, + "learning_rate": 1.8884358344645025e-05, + "loss": 0.6307, + "step": 9027 + }, + { + "epoch": 0.8072246065808297, + "grad_norm": 0.1429862450824215, + "learning_rate": 1.886742502703125e-05, + "loss": 0.5914, + "step": 9028 + }, + { + "epoch": 0.8073140200286123, + "grad_norm": 0.1382213010559851, + "learning_rate": 1.88504985139085e-05, + "loss": 0.6445, + "step": 9029 + }, + { + "epoch": 0.8074034334763949, + "grad_norm": 0.144986713345629, + "learning_rate": 1.883357880669646e-05, + "loss": 0.6027, + "step": 9030 + }, + { + "epoch": 0.8074928469241774, + "grad_norm": 0.15733557133322612, + "learning_rate": 1.8816665906814178e-05, + "loss": 0.6193, + "step": 9031 + }, + { + "epoch": 0.8075822603719599, + "grad_norm": 0.12311328939112473, + "learning_rate": 1.879975981568004e-05, + "loss": 0.6352, + "step": 9032 + }, + { + "epoch": 0.8076716738197425, + "grad_norm": 0.15828648622331862, + "learning_rate": 1.8782860534711998e-05, + "loss": 0.6355, + "step": 9033 + }, + { + "epoch": 0.807761087267525, + "grad_norm": 0.1448148856941083, + "learning_rate": 1.8765968065327367e-05, + "loss": 0.6205, + "step": 9034 + }, + { + "epoch": 0.8078505007153076, + "grad_norm": 0.13104634308988622, + "learning_rate": 1.8749082408942876e-05, + "loss": 0.5923, + "step": 9035 + }, + { + "epoch": 0.8079399141630901, + "grad_norm": 0.13362069208492422, + "learning_rate": 1.8732203566974705e-05, + "loss": 0.6028, + "step": 9036 + }, + { + "epoch": 0.8080293276108726, + "grad_norm": 0.1418037939260459, + "learning_rate": 1.8715331540838487e-05, + "loss": 0.6107, + "step": 9037 + }, + { + "epoch": 0.8081187410586552, + "grad_norm": 0.13229432729216936, + "learning_rate": 1.8698466331949238e-05, + "loss": 0.6087, + "step": 9038 + }, + { + "epoch": 0.8082081545064378, + "grad_norm": 0.13606680677072572, + "learning_rate": 1.8681607941721425e-05, + "loss": 0.6288, + "step": 9039 + }, + { + "epoch": 0.8082975679542204, + "grad_norm": 0.14224327550069185, + "learning_rate": 1.866475637156898e-05, + "loss": 0.6561, + "step": 9040 + }, + { + "epoch": 0.8083869814020028, + "grad_norm": 0.1430197878865557, + "learning_rate": 1.8647911622905168e-05, + "loss": 0.5984, + "step": 9041 + }, + { + "epoch": 0.8084763948497854, + "grad_norm": 0.13649744214257528, + "learning_rate": 1.8631073697142754e-05, + "loss": 0.6503, + "step": 9042 + }, + { + "epoch": 0.808565808297568, + "grad_norm": 0.13425975494802472, + "learning_rate": 1.8614242595693908e-05, + "loss": 0.6133, + "step": 9043 + }, + { + "epoch": 0.8086552217453505, + "grad_norm": 0.13301870069836566, + "learning_rate": 1.8597418319970262e-05, + "loss": 0.6318, + "step": 9044 + }, + { + "epoch": 0.808744635193133, + "grad_norm": 0.12586608305466954, + "learning_rate": 1.8580600871382857e-05, + "loss": 0.6231, + "step": 9045 + }, + { + "epoch": 0.8088340486409156, + "grad_norm": 0.1369659641148421, + "learning_rate": 1.8563790251342095e-05, + "loss": 0.6139, + "step": 9046 + }, + { + "epoch": 0.8089234620886981, + "grad_norm": 0.13158139411098183, + "learning_rate": 1.85469864612579e-05, + "loss": 0.5998, + "step": 9047 + }, + { + "epoch": 0.8090128755364807, + "grad_norm": 0.11457829286834816, + "learning_rate": 1.8530189502539607e-05, + "loss": 0.5812, + "step": 9048 + }, + { + "epoch": 0.8091022889842633, + "grad_norm": 0.13250986447335897, + "learning_rate": 1.8513399376595895e-05, + "loss": 0.6329, + "step": 9049 + }, + { + "epoch": 0.8091917024320457, + "grad_norm": 0.13212231738038815, + "learning_rate": 1.849661608483495e-05, + "loss": 0.6216, + "step": 9050 + }, + { + "epoch": 0.8092811158798283, + "grad_norm": 0.133895399163031, + "learning_rate": 1.847983962866443e-05, + "loss": 0.668, + "step": 9051 + }, + { + "epoch": 0.8093705293276109, + "grad_norm": 0.13236589809406873, + "learning_rate": 1.846307000949129e-05, + "loss": 0.5916, + "step": 9052 + }, + { + "epoch": 0.8094599427753935, + "grad_norm": 0.14000956543544513, + "learning_rate": 1.844630722872199e-05, + "loss": 0.6293, + "step": 9053 + }, + { + "epoch": 0.8095493562231759, + "grad_norm": 0.1322007183137412, + "learning_rate": 1.8429551287762435e-05, + "loss": 0.6289, + "step": 9054 + }, + { + "epoch": 0.8096387696709585, + "grad_norm": 0.14579339614551592, + "learning_rate": 1.8412802188017885e-05, + "loss": 0.5987, + "step": 9055 + }, + { + "epoch": 0.8097281831187411, + "grad_norm": 0.14912144754419948, + "learning_rate": 1.839605993089307e-05, + "loss": 0.6151, + "step": 9056 + }, + { + "epoch": 0.8098175965665236, + "grad_norm": 0.1320997563016008, + "learning_rate": 1.8379324517792163e-05, + "loss": 0.6081, + "step": 9057 + }, + { + "epoch": 0.8099070100143062, + "grad_norm": 0.13030774254710623, + "learning_rate": 1.8362595950118733e-05, + "loss": 0.6413, + "step": 9058 + }, + { + "epoch": 0.8099964234620887, + "grad_norm": 0.12933913975641845, + "learning_rate": 1.8345874229275816e-05, + "loss": 0.6107, + "step": 9059 + }, + { + "epoch": 0.8100858369098712, + "grad_norm": 0.13936149095461878, + "learning_rate": 1.8329159356665793e-05, + "loss": 0.6279, + "step": 9060 + }, + { + "epoch": 0.8101752503576538, + "grad_norm": 0.13352781719819742, + "learning_rate": 1.8312451333690538e-05, + "loss": 0.6167, + "step": 9061 + }, + { + "epoch": 0.8102646638054364, + "grad_norm": 0.13827243239783613, + "learning_rate": 1.8295750161751334e-05, + "loss": 0.6409, + "step": 9062 + }, + { + "epoch": 0.8103540772532188, + "grad_norm": 0.1335538903281557, + "learning_rate": 1.8279055842248915e-05, + "loss": 0.6313, + "step": 9063 + }, + { + "epoch": 0.8104434907010014, + "grad_norm": 0.15389334024792112, + "learning_rate": 1.826236837658334e-05, + "loss": 0.6351, + "step": 9064 + }, + { + "epoch": 0.810532904148784, + "grad_norm": 0.12939698335716823, + "learning_rate": 1.8245687766154262e-05, + "loss": 0.6092, + "step": 9065 + }, + { + "epoch": 0.8106223175965666, + "grad_norm": 0.13872455725076044, + "learning_rate": 1.822901401236059e-05, + "loss": 0.6383, + "step": 9066 + }, + { + "epoch": 0.810711731044349, + "grad_norm": 0.14633729037717982, + "learning_rate": 1.821234711660077e-05, + "loss": 0.6638, + "step": 9067 + }, + { + "epoch": 0.8108011444921316, + "grad_norm": 0.13962482478605173, + "learning_rate": 1.819568708027264e-05, + "loss": 0.6691, + "step": 9068 + }, + { + "epoch": 0.8108905579399142, + "grad_norm": 0.12724737636339598, + "learning_rate": 1.817903390477341e-05, + "loss": 0.5885, + "step": 9069 + }, + { + "epoch": 0.8109799713876967, + "grad_norm": 0.1385470855901187, + "learning_rate": 1.8162387591499796e-05, + "loss": 0.6332, + "step": 9070 + }, + { + "epoch": 0.8110693848354793, + "grad_norm": 0.13519474763412712, + "learning_rate": 1.8145748141847908e-05, + "loss": 0.6048, + "step": 9071 + }, + { + "epoch": 0.8111587982832618, + "grad_norm": 0.1437534133425213, + "learning_rate": 1.8129115557213262e-05, + "loss": 0.6448, + "step": 9072 + }, + { + "epoch": 0.8112482117310443, + "grad_norm": 0.14683779656723364, + "learning_rate": 1.811248983899082e-05, + "loss": 0.6201, + "step": 9073 + }, + { + "epoch": 0.8113376251788269, + "grad_norm": 0.14571007762864374, + "learning_rate": 1.809587098857498e-05, + "loss": 0.6382, + "step": 9074 + }, + { + "epoch": 0.8114270386266095, + "grad_norm": 0.14498317040144362, + "learning_rate": 1.8079259007359506e-05, + "loss": 0.6582, + "step": 9075 + }, + { + "epoch": 0.8115164520743919, + "grad_norm": 0.15078826094706013, + "learning_rate": 1.8062653896737647e-05, + "loss": 0.6203, + "step": 9076 + }, + { + "epoch": 0.8116058655221745, + "grad_norm": 0.14523467395570736, + "learning_rate": 1.804605565810207e-05, + "loss": 0.6122, + "step": 9077 + }, + { + "epoch": 0.8116952789699571, + "grad_norm": 0.13821151305131674, + "learning_rate": 1.8029464292844778e-05, + "loss": 0.593, + "step": 9078 + }, + { + "epoch": 0.8117846924177397, + "grad_norm": 0.1460573018253066, + "learning_rate": 1.8012879802357374e-05, + "loss": 0.6846, + "step": 9079 + }, + { + "epoch": 0.8118741058655222, + "grad_norm": 0.1351781139491187, + "learning_rate": 1.79963021880307e-05, + "loss": 0.6251, + "step": 9080 + }, + { + "epoch": 0.8119635193133047, + "grad_norm": 0.1323688910575515, + "learning_rate": 1.797973145125512e-05, + "loss": 0.6218, + "step": 9081 + }, + { + "epoch": 0.8120529327610873, + "grad_norm": 0.1534668780241013, + "learning_rate": 1.7963167593420438e-05, + "loss": 0.6832, + "step": 9082 + }, + { + "epoch": 0.8121423462088698, + "grad_norm": 0.1508384777109147, + "learning_rate": 1.7946610615915792e-05, + "loss": 0.6435, + "step": 9083 + }, + { + "epoch": 0.8122317596566524, + "grad_norm": 0.1388413130439061, + "learning_rate": 1.793006052012981e-05, + "loss": 0.6491, + "step": 9084 + }, + { + "epoch": 0.8123211731044349, + "grad_norm": 0.15413862780247764, + "learning_rate": 1.7913517307450544e-05, + "loss": 0.6433, + "step": 9085 + }, + { + "epoch": 0.8124105865522174, + "grad_norm": 0.1703838182082585, + "learning_rate": 1.7896980979265443e-05, + "loss": 0.6582, + "step": 9086 + }, + { + "epoch": 0.8125, + "grad_norm": 0.12425867057987688, + "learning_rate": 1.7880451536961394e-05, + "loss": 0.6181, + "step": 9087 + }, + { + "epoch": 0.8125894134477826, + "grad_norm": 0.12789588701871987, + "learning_rate": 1.7863928981924726e-05, + "loss": 0.6453, + "step": 9088 + }, + { + "epoch": 0.8126788268955651, + "grad_norm": 0.1272748569312676, + "learning_rate": 1.7847413315541118e-05, + "loss": 0.6413, + "step": 9089 + }, + { + "epoch": 0.8127682403433476, + "grad_norm": 0.12892667532320123, + "learning_rate": 1.7830904539195726e-05, + "loss": 0.6039, + "step": 9090 + }, + { + "epoch": 0.8128576537911302, + "grad_norm": 0.15071215849473862, + "learning_rate": 1.7814402654273167e-05, + "loss": 0.6629, + "step": 9091 + }, + { + "epoch": 0.8129470672389127, + "grad_norm": 0.1396084200585551, + "learning_rate": 1.7797907662157355e-05, + "loss": 0.6429, + "step": 9092 + }, + { + "epoch": 0.8130364806866953, + "grad_norm": 0.1491930508686805, + "learning_rate": 1.7781419564231805e-05, + "loss": 0.6193, + "step": 9093 + }, + { + "epoch": 0.8131258941344778, + "grad_norm": 0.15220165634056634, + "learning_rate": 1.776493836187927e-05, + "loss": 0.666, + "step": 9094 + }, + { + "epoch": 0.8132153075822603, + "grad_norm": 0.1538897771345248, + "learning_rate": 1.774846405648204e-05, + "loss": 0.6413, + "step": 9095 + }, + { + "epoch": 0.8133047210300429, + "grad_norm": 0.133073846942661, + "learning_rate": 1.7731996649421802e-05, + "loss": 0.5852, + "step": 9096 + }, + { + "epoch": 0.8133941344778255, + "grad_norm": 0.14470554827040727, + "learning_rate": 1.771553614207967e-05, + "loss": 0.6314, + "step": 9097 + }, + { + "epoch": 0.8134835479256081, + "grad_norm": 0.15210965174650826, + "learning_rate": 1.769908253583612e-05, + "loss": 0.6416, + "step": 9098 + }, + { + "epoch": 0.8135729613733905, + "grad_norm": 0.1340994968397552, + "learning_rate": 1.7682635832071125e-05, + "loss": 0.616, + "step": 9099 + }, + { + "epoch": 0.8136623748211731, + "grad_norm": 0.13200873803322286, + "learning_rate": 1.766619603216405e-05, + "loss": 0.6363, + "step": 9100 + }, + { + "epoch": 0.8137517882689557, + "grad_norm": 0.12073587471185145, + "learning_rate": 1.7649763137493682e-05, + "loss": 0.6239, + "step": 9101 + }, + { + "epoch": 0.8138412017167382, + "grad_norm": 0.13757811317310134, + "learning_rate": 1.7633337149438246e-05, + "loss": 0.6134, + "step": 9102 + }, + { + "epoch": 0.8139306151645207, + "grad_norm": 0.14984036476852305, + "learning_rate": 1.7616918069375322e-05, + "loss": 0.6441, + "step": 9103 + }, + { + "epoch": 0.8140200286123033, + "grad_norm": 0.1332014759059025, + "learning_rate": 1.7600505898681997e-05, + "loss": 0.6371, + "step": 9104 + }, + { + "epoch": 0.8141094420600858, + "grad_norm": 0.13333374937432715, + "learning_rate": 1.7584100638734745e-05, + "loss": 0.6044, + "step": 9105 + }, + { + "epoch": 0.8141988555078684, + "grad_norm": 0.1544803829511909, + "learning_rate": 1.7567702290909393e-05, + "loss": 0.665, + "step": 9106 + }, + { + "epoch": 0.814288268955651, + "grad_norm": 0.1480655617579994, + "learning_rate": 1.7551310856581316e-05, + "loss": 0.6367, + "step": 9107 + }, + { + "epoch": 0.8143776824034334, + "grad_norm": 0.16772160194606578, + "learning_rate": 1.7534926337125257e-05, + "loss": 0.684, + "step": 9108 + }, + { + "epoch": 0.814467095851216, + "grad_norm": 0.14218984651107047, + "learning_rate": 1.751854873391531e-05, + "loss": 0.6156, + "step": 9109 + }, + { + "epoch": 0.8145565092989986, + "grad_norm": 0.12820570146389126, + "learning_rate": 1.750217804832506e-05, + "loss": 0.6382, + "step": 9110 + }, + { + "epoch": 0.8146459227467812, + "grad_norm": 0.132894828029278, + "learning_rate": 1.7485814281727532e-05, + "loss": 0.5845, + "step": 9111 + }, + { + "epoch": 0.8147353361945636, + "grad_norm": 0.14747143144497796, + "learning_rate": 1.7469457435495063e-05, + "loss": 0.6587, + "step": 9112 + }, + { + "epoch": 0.8148247496423462, + "grad_norm": 0.13960341603921864, + "learning_rate": 1.7453107510999568e-05, + "loss": 0.6735, + "step": 9113 + }, + { + "epoch": 0.8149141630901288, + "grad_norm": 0.13592670437995077, + "learning_rate": 1.7436764509612237e-05, + "loss": 0.6637, + "step": 9114 + }, + { + "epoch": 0.8150035765379113, + "grad_norm": 0.1578862792428007, + "learning_rate": 1.742042843270375e-05, + "loss": 0.6707, + "step": 9115 + }, + { + "epoch": 0.8150929899856938, + "grad_norm": 0.13897147811354202, + "learning_rate": 1.7404099281644237e-05, + "loss": 0.6466, + "step": 9116 + }, + { + "epoch": 0.8151824034334764, + "grad_norm": 0.14160145595565493, + "learning_rate": 1.7387777057803134e-05, + "loss": 0.637, + "step": 9117 + }, + { + "epoch": 0.8152718168812589, + "grad_norm": 0.15026886842958534, + "learning_rate": 1.737146176254939e-05, + "loss": 0.6184, + "step": 9118 + }, + { + "epoch": 0.8153612303290415, + "grad_norm": 0.14262709059465506, + "learning_rate": 1.735515339725137e-05, + "loss": 0.6388, + "step": 9119 + }, + { + "epoch": 0.8154506437768241, + "grad_norm": 0.1251687481214913, + "learning_rate": 1.7338851963276825e-05, + "loss": 0.6494, + "step": 9120 + }, + { + "epoch": 0.8155400572246065, + "grad_norm": 0.14549669147742847, + "learning_rate": 1.7322557461992926e-05, + "loss": 0.6338, + "step": 9121 + }, + { + "epoch": 0.8156294706723891, + "grad_norm": 0.1560014679770541, + "learning_rate": 1.7306269894766312e-05, + "loss": 0.6429, + "step": 9122 + }, + { + "epoch": 0.8157188841201717, + "grad_norm": 0.13413173132238648, + "learning_rate": 1.728998926296296e-05, + "loss": 0.5982, + "step": 9123 + }, + { + "epoch": 0.8158082975679543, + "grad_norm": 0.14395203935738188, + "learning_rate": 1.727371556794831e-05, + "loss": 0.6211, + "step": 9124 + }, + { + "epoch": 0.8158977110157367, + "grad_norm": 0.14502879695089618, + "learning_rate": 1.725744881108725e-05, + "loss": 0.6342, + "step": 9125 + }, + { + "epoch": 0.8159871244635193, + "grad_norm": 0.15103028532900944, + "learning_rate": 1.7241188993743984e-05, + "loss": 0.6397, + "step": 9126 + }, + { + "epoch": 0.8160765379113019, + "grad_norm": 0.1350753444879515, + "learning_rate": 1.7224936117282276e-05, + "loss": 0.6404, + "step": 9127 + }, + { + "epoch": 0.8161659513590844, + "grad_norm": 0.12771153263504587, + "learning_rate": 1.7208690183065236e-05, + "loss": 0.6148, + "step": 9128 + }, + { + "epoch": 0.816255364806867, + "grad_norm": 0.14781781360706353, + "learning_rate": 1.719245119245534e-05, + "loss": 0.6452, + "step": 9129 + }, + { + "epoch": 0.8163447782546495, + "grad_norm": 0.1266229307148906, + "learning_rate": 1.7176219146814542e-05, + "loss": 0.6366, + "step": 9130 + }, + { + "epoch": 0.816434191702432, + "grad_norm": 0.15210270659239536, + "learning_rate": 1.715999404750426e-05, + "loss": 0.6504, + "step": 9131 + }, + { + "epoch": 0.8165236051502146, + "grad_norm": 0.12177724994759628, + "learning_rate": 1.7143775895885195e-05, + "loss": 0.6172, + "step": 9132 + }, + { + "epoch": 0.8166130185979972, + "grad_norm": 0.14578710939629558, + "learning_rate": 1.712756469331759e-05, + "loss": 0.6483, + "step": 9133 + }, + { + "epoch": 0.8167024320457796, + "grad_norm": 0.14167348738554628, + "learning_rate": 1.7111360441161038e-05, + "loss": 0.6325, + "step": 9134 + }, + { + "epoch": 0.8167918454935622, + "grad_norm": 0.14390732036870124, + "learning_rate": 1.7095163140774596e-05, + "loss": 0.5916, + "step": 9135 + }, + { + "epoch": 0.8168812589413448, + "grad_norm": 0.147428413538118, + "learning_rate": 1.707897279351671e-05, + "loss": 0.6244, + "step": 9136 + }, + { + "epoch": 0.8169706723891274, + "grad_norm": 0.12908151540655433, + "learning_rate": 1.7062789400745215e-05, + "loss": 0.5514, + "step": 9137 + }, + { + "epoch": 0.8170600858369099, + "grad_norm": 0.16225325687278053, + "learning_rate": 1.704661296381741e-05, + "loss": 0.6462, + "step": 9138 + }, + { + "epoch": 0.8171494992846924, + "grad_norm": 0.13212814388951294, + "learning_rate": 1.703044348409002e-05, + "loss": 0.6341, + "step": 9139 + }, + { + "epoch": 0.817238912732475, + "grad_norm": 0.13605033386919488, + "learning_rate": 1.701428096291908e-05, + "loss": 0.6652, + "step": 9140 + }, + { + "epoch": 0.8173283261802575, + "grad_norm": 0.1243348754179093, + "learning_rate": 1.6998125401660202e-05, + "loss": 0.5697, + "step": 9141 + }, + { + "epoch": 0.8174177396280401, + "grad_norm": 0.14375264493035286, + "learning_rate": 1.698197680166832e-05, + "loss": 0.607, + "step": 9142 + }, + { + "epoch": 0.8175071530758226, + "grad_norm": 0.1349447470632931, + "learning_rate": 1.6965835164297773e-05, + "loss": 0.606, + "step": 9143 + }, + { + "epoch": 0.8175965665236051, + "grad_norm": 0.14221653811034396, + "learning_rate": 1.6949700490902344e-05, + "loss": 0.6449, + "step": 9144 + }, + { + "epoch": 0.8176859799713877, + "grad_norm": 0.14765238104409353, + "learning_rate": 1.693357278283526e-05, + "loss": 0.6861, + "step": 9145 + }, + { + "epoch": 0.8177753934191703, + "grad_norm": 0.15505525654422456, + "learning_rate": 1.6917452041449077e-05, + "loss": 0.6462, + "step": 9146 + }, + { + "epoch": 0.8178648068669528, + "grad_norm": 0.14199079962880856, + "learning_rate": 1.6901338268095866e-05, + "loss": 0.6334, + "step": 9147 + }, + { + "epoch": 0.8179542203147353, + "grad_norm": 0.13928787015504204, + "learning_rate": 1.688523146412705e-05, + "loss": 0.6369, + "step": 9148 + }, + { + "epoch": 0.8180436337625179, + "grad_norm": 0.13545347689577655, + "learning_rate": 1.68691316308935e-05, + "loss": 0.599, + "step": 9149 + }, + { + "epoch": 0.8181330472103004, + "grad_norm": 0.13233539722204118, + "learning_rate": 1.6853038769745467e-05, + "loss": 0.5984, + "step": 9150 + }, + { + "epoch": 0.818222460658083, + "grad_norm": 0.14581743121117322, + "learning_rate": 1.6836952882032698e-05, + "loss": 0.6314, + "step": 9151 + }, + { + "epoch": 0.8183118741058655, + "grad_norm": 0.13929770397680394, + "learning_rate": 1.682087396910422e-05, + "loss": 0.618, + "step": 9152 + }, + { + "epoch": 0.818401287553648, + "grad_norm": 0.15861689874697626, + "learning_rate": 1.68048020323086e-05, + "loss": 0.6436, + "step": 9153 + }, + { + "epoch": 0.8184907010014306, + "grad_norm": 0.1630523653796517, + "learning_rate": 1.6788737072993744e-05, + "loss": 0.6106, + "step": 9154 + }, + { + "epoch": 0.8185801144492132, + "grad_norm": 0.12170151352897449, + "learning_rate": 1.6772679092507025e-05, + "loss": 0.6287, + "step": 9155 + }, + { + "epoch": 0.8186695278969958, + "grad_norm": 0.1403737008212397, + "learning_rate": 1.6756628092195214e-05, + "loss": 0.6323, + "step": 9156 + }, + { + "epoch": 0.8187589413447782, + "grad_norm": 0.13796778565577592, + "learning_rate": 1.6740584073404454e-05, + "loss": 0.6348, + "step": 9157 + }, + { + "epoch": 0.8188483547925608, + "grad_norm": 0.13029511299862193, + "learning_rate": 1.6724547037480355e-05, + "loss": 0.6386, + "step": 9158 + }, + { + "epoch": 0.8189377682403434, + "grad_norm": 0.1487424802607062, + "learning_rate": 1.6708516985767953e-05, + "loss": 0.6187, + "step": 9159 + }, + { + "epoch": 0.8190271816881259, + "grad_norm": 0.14573505236585885, + "learning_rate": 1.6692493919611606e-05, + "loss": 0.6151, + "step": 9160 + }, + { + "epoch": 0.8191165951359084, + "grad_norm": 0.1454410137797063, + "learning_rate": 1.6676477840355166e-05, + "loss": 0.6496, + "step": 9161 + }, + { + "epoch": 0.819206008583691, + "grad_norm": 0.14080434523420582, + "learning_rate": 1.666046874934195e-05, + "loss": 0.6159, + "step": 9162 + }, + { + "epoch": 0.8192954220314735, + "grad_norm": 0.13234001756226582, + "learning_rate": 1.6644466647914546e-05, + "loss": 0.6499, + "step": 9163 + }, + { + "epoch": 0.8193848354792561, + "grad_norm": 0.14695314800393408, + "learning_rate": 1.662847153741506e-05, + "loss": 0.6326, + "step": 9164 + }, + { + "epoch": 0.8194742489270386, + "grad_norm": 0.12249945038542502, + "learning_rate": 1.6612483419185e-05, + "loss": 0.5822, + "step": 9165 + }, + { + "epoch": 0.8195636623748211, + "grad_norm": 0.1515535469194704, + "learning_rate": 1.659650229456522e-05, + "loss": 0.6061, + "step": 9166 + }, + { + "epoch": 0.8196530758226037, + "grad_norm": 0.13474120469141584, + "learning_rate": 1.658052816489607e-05, + "loss": 0.5995, + "step": 9167 + }, + { + "epoch": 0.8197424892703863, + "grad_norm": 0.14099272347318545, + "learning_rate": 1.656456103151728e-05, + "loss": 0.6189, + "step": 9168 + }, + { + "epoch": 0.8198319027181689, + "grad_norm": 0.13529129805652232, + "learning_rate": 1.6548600895767997e-05, + "loss": 0.6195, + "step": 9169 + }, + { + "epoch": 0.8199213161659513, + "grad_norm": 0.14059241951254375, + "learning_rate": 1.6532647758986786e-05, + "loss": 0.6308, + "step": 9170 + }, + { + "epoch": 0.8200107296137339, + "grad_norm": 0.14445883736995643, + "learning_rate": 1.6516701622511588e-05, + "loss": 0.6077, + "step": 9171 + }, + { + "epoch": 0.8201001430615165, + "grad_norm": 0.14465201442395997, + "learning_rate": 1.65007624876798e-05, + "loss": 0.6549, + "step": 9172 + }, + { + "epoch": 0.820189556509299, + "grad_norm": 0.1399531339951657, + "learning_rate": 1.6484830355828242e-05, + "loss": 0.6047, + "step": 9173 + }, + { + "epoch": 0.8202789699570815, + "grad_norm": 0.1503192544689178, + "learning_rate": 1.6468905228293073e-05, + "loss": 0.6479, + "step": 9174 + }, + { + "epoch": 0.8203683834048641, + "grad_norm": 0.137470568587822, + "learning_rate": 1.6452987106409935e-05, + "loss": 0.6472, + "step": 9175 + }, + { + "epoch": 0.8204577968526466, + "grad_norm": 0.1422972750333396, + "learning_rate": 1.6437075991513905e-05, + "loss": 0.6829, + "step": 9176 + }, + { + "epoch": 0.8205472103004292, + "grad_norm": 0.15961459458193808, + "learning_rate": 1.6421171884939368e-05, + "loss": 0.6408, + "step": 9177 + }, + { + "epoch": 0.8206366237482118, + "grad_norm": 0.15742003883138558, + "learning_rate": 1.640527478802021e-05, + "loss": 0.6516, + "step": 9178 + }, + { + "epoch": 0.8207260371959942, + "grad_norm": 0.12366881594982768, + "learning_rate": 1.638938470208973e-05, + "loss": 0.645, + "step": 9179 + }, + { + "epoch": 0.8208154506437768, + "grad_norm": 0.14582432810543494, + "learning_rate": 1.6373501628480535e-05, + "loss": 0.6509, + "step": 9180 + }, + { + "epoch": 0.8209048640915594, + "grad_norm": 0.14294681673510193, + "learning_rate": 1.6357625568524783e-05, + "loss": 0.5954, + "step": 9181 + }, + { + "epoch": 0.820994277539342, + "grad_norm": 0.13431985404489824, + "learning_rate": 1.6341756523553954e-05, + "loss": 0.6192, + "step": 9182 + }, + { + "epoch": 0.8210836909871244, + "grad_norm": 0.13951523452967687, + "learning_rate": 1.6325894494898975e-05, + "loss": 0.6148, + "step": 9183 + }, + { + "epoch": 0.821173104434907, + "grad_norm": 0.1518120509418134, + "learning_rate": 1.631003948389016e-05, + "loss": 0.6267, + "step": 9184 + }, + { + "epoch": 0.8212625178826896, + "grad_norm": 0.152792124524215, + "learning_rate": 1.629419149185729e-05, + "loss": 0.6751, + "step": 9185 + }, + { + "epoch": 0.8213519313304721, + "grad_norm": 0.13690269926234047, + "learning_rate": 1.627835052012947e-05, + "loss": 0.6171, + "step": 9186 + }, + { + "epoch": 0.8214413447782547, + "grad_norm": 0.1298890787469681, + "learning_rate": 1.626251657003528e-05, + "loss": 0.6031, + "step": 9187 + }, + { + "epoch": 0.8215307582260372, + "grad_norm": 0.1714819567854761, + "learning_rate": 1.6246689642902725e-05, + "loss": 0.6572, + "step": 9188 + }, + { + "epoch": 0.8216201716738197, + "grad_norm": 0.13299368837585196, + "learning_rate": 1.6230869740059106e-05, + "loss": 0.6226, + "step": 9189 + }, + { + "epoch": 0.8217095851216023, + "grad_norm": 0.1331837950415521, + "learning_rate": 1.6215056862831324e-05, + "loss": 0.6192, + "step": 9190 + }, + { + "epoch": 0.8217989985693849, + "grad_norm": 0.14492024048304192, + "learning_rate": 1.6199251012545512e-05, + "loss": 0.6487, + "step": 9191 + }, + { + "epoch": 0.8218884120171673, + "grad_norm": 0.1569828152726075, + "learning_rate": 1.6183452190527316e-05, + "loss": 0.6602, + "step": 9192 + }, + { + "epoch": 0.8219778254649499, + "grad_norm": 0.14577944937408585, + "learning_rate": 1.616766039810178e-05, + "loss": 0.6262, + "step": 9193 + }, + { + "epoch": 0.8220672389127325, + "grad_norm": 0.12848741413469683, + "learning_rate": 1.6151875636593306e-05, + "loss": 0.612, + "step": 9194 + }, + { + "epoch": 0.822156652360515, + "grad_norm": 0.1466313276900892, + "learning_rate": 1.613609790732572e-05, + "loss": 0.6432, + "step": 9195 + }, + { + "epoch": 0.8222460658082976, + "grad_norm": 0.1430773498721736, + "learning_rate": 1.6120327211622375e-05, + "loss": 0.5849, + "step": 9196 + }, + { + "epoch": 0.8223354792560801, + "grad_norm": 0.1503040950686577, + "learning_rate": 1.6104563550805875e-05, + "loss": 0.6507, + "step": 9197 + }, + { + "epoch": 0.8224248927038627, + "grad_norm": 0.1431803275283148, + "learning_rate": 1.6088806926198297e-05, + "loss": 0.6414, + "step": 9198 + }, + { + "epoch": 0.8225143061516452, + "grad_norm": 0.14204572638735352, + "learning_rate": 1.6073057339121166e-05, + "loss": 0.5878, + "step": 9199 + }, + { + "epoch": 0.8226037195994278, + "grad_norm": 0.14577459515655505, + "learning_rate": 1.605731479089534e-05, + "loss": 0.6029, + "step": 9200 + }, + { + "epoch": 0.8226931330472103, + "grad_norm": 0.13999170808490674, + "learning_rate": 1.6041579282841145e-05, + "loss": 0.6144, + "step": 9201 + }, + { + "epoch": 0.8227825464949928, + "grad_norm": 0.14238847275024796, + "learning_rate": 1.6025850816278297e-05, + "loss": 0.6223, + "step": 9202 + }, + { + "epoch": 0.8228719599427754, + "grad_norm": 0.14099027726881969, + "learning_rate": 1.601012939252592e-05, + "loss": 0.6295, + "step": 9203 + }, + { + "epoch": 0.822961373390558, + "grad_norm": 0.14125994686837204, + "learning_rate": 1.5994415012902587e-05, + "loss": 0.6561, + "step": 9204 + }, + { + "epoch": 0.8230507868383404, + "grad_norm": 0.14921550733143069, + "learning_rate": 1.597870767872619e-05, + "loss": 0.6213, + "step": 9205 + }, + { + "epoch": 0.823140200286123, + "grad_norm": 0.1466824266256055, + "learning_rate": 1.5963007391314113e-05, + "loss": 0.6303, + "step": 9206 + }, + { + "epoch": 0.8232296137339056, + "grad_norm": 0.14414803427972692, + "learning_rate": 1.5947314151983105e-05, + "loss": 0.6472, + "step": 9207 + }, + { + "epoch": 0.8233190271816881, + "grad_norm": 0.1408329234093684, + "learning_rate": 1.5931627962049378e-05, + "loss": 0.6372, + "step": 9208 + }, + { + "epoch": 0.8234084406294707, + "grad_norm": 0.14245968351376234, + "learning_rate": 1.591594882282844e-05, + "loss": 0.6066, + "step": 9209 + }, + { + "epoch": 0.8234978540772532, + "grad_norm": 0.14230006044237425, + "learning_rate": 1.5900276735635367e-05, + "loss": 0.6284, + "step": 9210 + }, + { + "epoch": 0.8235872675250357, + "grad_norm": 0.13498099912107125, + "learning_rate": 1.5884611701784504e-05, + "loss": 0.6205, + "step": 9211 + }, + { + "epoch": 0.8236766809728183, + "grad_norm": 0.14989903363114715, + "learning_rate": 1.5868953722589663e-05, + "loss": 0.6264, + "step": 9212 + }, + { + "epoch": 0.8237660944206009, + "grad_norm": 0.1490930412932949, + "learning_rate": 1.585330279936409e-05, + "loss": 0.6502, + "step": 9213 + }, + { + "epoch": 0.8238555078683834, + "grad_norm": 0.1481915417641587, + "learning_rate": 1.5837658933420375e-05, + "loss": 0.6328, + "step": 9214 + }, + { + "epoch": 0.8239449213161659, + "grad_norm": 0.1274884521704355, + "learning_rate": 1.5822022126070556e-05, + "loss": 0.6108, + "step": 9215 + }, + { + "epoch": 0.8240343347639485, + "grad_norm": 0.15313182845942366, + "learning_rate": 1.580639237862608e-05, + "loss": 0.6547, + "step": 9216 + }, + { + "epoch": 0.8241237482117311, + "grad_norm": 0.13439695365016818, + "learning_rate": 1.57907696923978e-05, + "loss": 0.6322, + "step": 9217 + }, + { + "epoch": 0.8242131616595136, + "grad_norm": 0.13646148474521788, + "learning_rate": 1.5775154068695963e-05, + "loss": 0.5857, + "step": 9218 + }, + { + "epoch": 0.8243025751072961, + "grad_norm": 0.14989739341648536, + "learning_rate": 1.5759545508830252e-05, + "loss": 0.6025, + "step": 9219 + }, + { + "epoch": 0.8243919885550787, + "grad_norm": 0.15427034240364312, + "learning_rate": 1.5743944014109713e-05, + "loss": 0.6282, + "step": 9220 + }, + { + "epoch": 0.8244814020028612, + "grad_norm": 0.12746393587591637, + "learning_rate": 1.5728349585842827e-05, + "loss": 0.6123, + "step": 9221 + }, + { + "epoch": 0.8245708154506438, + "grad_norm": 0.130409377973294, + "learning_rate": 1.571276222533751e-05, + "loss": 0.6257, + "step": 9222 + }, + { + "epoch": 0.8246602288984263, + "grad_norm": 0.12495449302459312, + "learning_rate": 1.5697181933900985e-05, + "loss": 0.6202, + "step": 9223 + }, + { + "epoch": 0.8247496423462088, + "grad_norm": 0.14622808579392949, + "learning_rate": 1.5681608712840046e-05, + "loss": 0.6291, + "step": 9224 + }, + { + "epoch": 0.8248390557939914, + "grad_norm": 0.14757689701792026, + "learning_rate": 1.5666042563460737e-05, + "loss": 0.6192, + "step": 9225 + }, + { + "epoch": 0.824928469241774, + "grad_norm": 0.12842300832682205, + "learning_rate": 1.565048348706858e-05, + "loss": 0.5753, + "step": 9226 + }, + { + "epoch": 0.8250178826895566, + "grad_norm": 0.12082335117096342, + "learning_rate": 1.563493148496853e-05, + "loss": 0.5756, + "step": 9227 + }, + { + "epoch": 0.825107296137339, + "grad_norm": 0.13832333481060735, + "learning_rate": 1.5619386558464865e-05, + "loss": 0.5815, + "step": 9228 + }, + { + "epoch": 0.8251967095851216, + "grad_norm": 0.13754476787004993, + "learning_rate": 1.5603848708861347e-05, + "loss": 0.5946, + "step": 9229 + }, + { + "epoch": 0.8252861230329042, + "grad_norm": 0.14419899059449198, + "learning_rate": 1.5588317937461105e-05, + "loss": 0.633, + "step": 9230 + }, + { + "epoch": 0.8253755364806867, + "grad_norm": 0.1446216382741417, + "learning_rate": 1.55727942455667e-05, + "loss": 0.612, + "step": 9231 + }, + { + "epoch": 0.8254649499284692, + "grad_norm": 0.128395976428335, + "learning_rate": 1.5557277634480083e-05, + "loss": 0.6059, + "step": 9232 + }, + { + "epoch": 0.8255543633762518, + "grad_norm": 0.14175144545297969, + "learning_rate": 1.554176810550263e-05, + "loss": 0.6587, + "step": 9233 + }, + { + "epoch": 0.8256437768240343, + "grad_norm": 0.1314536650164941, + "learning_rate": 1.552626565993507e-05, + "loss": 0.6572, + "step": 9234 + }, + { + "epoch": 0.8257331902718169, + "grad_norm": 0.13482335980440682, + "learning_rate": 1.551077029907758e-05, + "loss": 0.5885, + "step": 9235 + }, + { + "epoch": 0.8258226037195995, + "grad_norm": 0.15182349669985207, + "learning_rate": 1.5495282024229775e-05, + "loss": 0.6561, + "step": 9236 + }, + { + "epoch": 0.8259120171673819, + "grad_norm": 0.14851062882076851, + "learning_rate": 1.547980083669056e-05, + "loss": 0.6294, + "step": 9237 + }, + { + "epoch": 0.8260014306151645, + "grad_norm": 0.13872255477614537, + "learning_rate": 1.5464326737758428e-05, + "loss": 0.6297, + "step": 9238 + }, + { + "epoch": 0.8260908440629471, + "grad_norm": 0.12914295295781555, + "learning_rate": 1.544885972873109e-05, + "loss": 0.6065, + "step": 9239 + }, + { + "epoch": 0.8261802575107297, + "grad_norm": 0.11500518527558805, + "learning_rate": 1.543339981090578e-05, + "loss": 0.5843, + "step": 9240 + }, + { + "epoch": 0.8262696709585121, + "grad_norm": 0.14640414724269482, + "learning_rate": 1.541794698557909e-05, + "loss": 0.6299, + "step": 9241 + }, + { + "epoch": 0.8263590844062947, + "grad_norm": 0.13622940878724546, + "learning_rate": 1.5402501254047065e-05, + "loss": 0.6525, + "step": 9242 + }, + { + "epoch": 0.8264484978540773, + "grad_norm": 0.1404667004618345, + "learning_rate": 1.5387062617605064e-05, + "loss": 0.6352, + "step": 9243 + }, + { + "epoch": 0.8265379113018598, + "grad_norm": 0.1397999154185353, + "learning_rate": 1.5371631077547942e-05, + "loss": 0.6244, + "step": 9244 + }, + { + "epoch": 0.8266273247496424, + "grad_norm": 0.13525378889019937, + "learning_rate": 1.5356206635169912e-05, + "loss": 0.6503, + "step": 9245 + }, + { + "epoch": 0.8267167381974249, + "grad_norm": 0.14738534680276624, + "learning_rate": 1.5340789291764612e-05, + "loss": 0.6276, + "step": 9246 + }, + { + "epoch": 0.8268061516452074, + "grad_norm": 0.15259900975094826, + "learning_rate": 1.532537904862509e-05, + "loss": 0.6534, + "step": 9247 + }, + { + "epoch": 0.82689556509299, + "grad_norm": 0.1351139307568911, + "learning_rate": 1.530997590704375e-05, + "loss": 0.5714, + "step": 9248 + }, + { + "epoch": 0.8269849785407726, + "grad_norm": 0.14799743481100625, + "learning_rate": 1.529457986831244e-05, + "loss": 0.6621, + "step": 9249 + }, + { + "epoch": 0.827074391988555, + "grad_norm": 0.14337722549318432, + "learning_rate": 1.5279190933722443e-05, + "loss": 0.6435, + "step": 9250 + }, + { + "epoch": 0.8271638054363376, + "grad_norm": 0.1650303804478055, + "learning_rate": 1.5263809104564353e-05, + "loss": 0.6817, + "step": 9251 + }, + { + "epoch": 0.8272532188841202, + "grad_norm": 0.13737014548648446, + "learning_rate": 1.5248434382128263e-05, + "loss": 0.6217, + "step": 9252 + }, + { + "epoch": 0.8273426323319027, + "grad_norm": 0.1424876656296603, + "learning_rate": 1.5233066767703663e-05, + "loss": 0.6371, + "step": 9253 + }, + { + "epoch": 0.8274320457796852, + "grad_norm": 0.14319889030629837, + "learning_rate": 1.5217706262579356e-05, + "loss": 0.6161, + "step": 9254 + }, + { + "epoch": 0.8275214592274678, + "grad_norm": 0.12095806616654464, + "learning_rate": 1.5202352868043624e-05, + "loss": 0.6492, + "step": 9255 + }, + { + "epoch": 0.8276108726752504, + "grad_norm": 0.13875164669645468, + "learning_rate": 1.5187006585384179e-05, + "loss": 0.6362, + "step": 9256 + }, + { + "epoch": 0.8277002861230329, + "grad_norm": 0.1237404466541059, + "learning_rate": 1.5171667415888046e-05, + "loss": 0.6198, + "step": 9257 + }, + { + "epoch": 0.8277896995708155, + "grad_norm": 0.14187785311329842, + "learning_rate": 1.515633536084171e-05, + "loss": 0.6507, + "step": 9258 + }, + { + "epoch": 0.827879113018598, + "grad_norm": 0.1574587295271861, + "learning_rate": 1.5141010421531066e-05, + "loss": 0.6046, + "step": 9259 + }, + { + "epoch": 0.8279685264663805, + "grad_norm": 0.13794309844757296, + "learning_rate": 1.5125692599241391e-05, + "loss": 0.6319, + "step": 9260 + }, + { + "epoch": 0.8280579399141631, + "grad_norm": 0.14049761192990204, + "learning_rate": 1.5110381895257408e-05, + "loss": 0.6378, + "step": 9261 + }, + { + "epoch": 0.8281473533619457, + "grad_norm": 0.13733394907635985, + "learning_rate": 1.5095078310863142e-05, + "loss": 0.6327, + "step": 9262 + }, + { + "epoch": 0.8282367668097281, + "grad_norm": 0.13473802644413496, + "learning_rate": 1.5079781847342123e-05, + "loss": 0.6361, + "step": 9263 + }, + { + "epoch": 0.8283261802575107, + "grad_norm": 0.14765776247583545, + "learning_rate": 1.5064492505977234e-05, + "loss": 0.6623, + "step": 9264 + }, + { + "epoch": 0.8284155937052933, + "grad_norm": 0.14699553213070438, + "learning_rate": 1.5049210288050796e-05, + "loss": 0.6336, + "step": 9265 + }, + { + "epoch": 0.8285050071530758, + "grad_norm": 0.12235070501278668, + "learning_rate": 1.5033935194844484e-05, + "loss": 0.6129, + "step": 9266 + }, + { + "epoch": 0.8285944206008584, + "grad_norm": 0.13440803704002152, + "learning_rate": 1.501866722763945e-05, + "loss": 0.6492, + "step": 9267 + }, + { + "epoch": 0.8286838340486409, + "grad_norm": 0.14004681375536382, + "learning_rate": 1.5003406387716134e-05, + "loss": 0.6395, + "step": 9268 + }, + { + "epoch": 0.8287732474964234, + "grad_norm": 0.1318886025808485, + "learning_rate": 1.4988152676354472e-05, + "loss": 0.5982, + "step": 9269 + }, + { + "epoch": 0.828862660944206, + "grad_norm": 0.1325569342029391, + "learning_rate": 1.4972906094833805e-05, + "loss": 0.6392, + "step": 9270 + }, + { + "epoch": 0.8289520743919886, + "grad_norm": 0.15163579695416776, + "learning_rate": 1.4957666644432788e-05, + "loss": 0.6604, + "step": 9271 + }, + { + "epoch": 0.829041487839771, + "grad_norm": 0.1329460403222576, + "learning_rate": 1.4942434326429544e-05, + "loss": 0.6209, + "step": 9272 + }, + { + "epoch": 0.8291309012875536, + "grad_norm": 0.13888989610599856, + "learning_rate": 1.4927209142101662e-05, + "loss": 0.6297, + "step": 9273 + }, + { + "epoch": 0.8292203147353362, + "grad_norm": 0.1431176172141615, + "learning_rate": 1.4911991092725985e-05, + "loss": 0.6332, + "step": 9274 + }, + { + "epoch": 0.8293097281831188, + "grad_norm": 0.13262763042979236, + "learning_rate": 1.489678017957884e-05, + "loss": 0.6189, + "step": 9275 + }, + { + "epoch": 0.8293991416309013, + "grad_norm": 0.14951443445546678, + "learning_rate": 1.4881576403936004e-05, + "loss": 0.6278, + "step": 9276 + }, + { + "epoch": 0.8294885550786838, + "grad_norm": 0.13989841371386477, + "learning_rate": 1.4866379767072525e-05, + "loss": 0.6462, + "step": 9277 + }, + { + "epoch": 0.8295779685264664, + "grad_norm": 0.12579105218064376, + "learning_rate": 1.485119027026296e-05, + "loss": 0.6219, + "step": 9278 + }, + { + "epoch": 0.8296673819742489, + "grad_norm": 0.13804825880569882, + "learning_rate": 1.4836007914781225e-05, + "loss": 0.6619, + "step": 9279 + }, + { + "epoch": 0.8297567954220315, + "grad_norm": 0.14726904449325415, + "learning_rate": 1.4820832701900667e-05, + "loss": 0.641, + "step": 9280 + }, + { + "epoch": 0.829846208869814, + "grad_norm": 0.13784511078821346, + "learning_rate": 1.4805664632894024e-05, + "loss": 0.6369, + "step": 9281 + }, + { + "epoch": 0.8299356223175965, + "grad_norm": 0.12108431036585524, + "learning_rate": 1.4790503709033365e-05, + "loss": 0.6365, + "step": 9282 + }, + { + "epoch": 0.8300250357653791, + "grad_norm": 0.1369036729082032, + "learning_rate": 1.4775349931590266e-05, + "loss": 0.6424, + "step": 9283 + }, + { + "epoch": 0.8301144492131617, + "grad_norm": 0.131206184739468, + "learning_rate": 1.4760203301835652e-05, + "loss": 0.6317, + "step": 9284 + }, + { + "epoch": 0.8302038626609443, + "grad_norm": 0.13101290689664483, + "learning_rate": 1.4745063821039806e-05, + "loss": 0.6162, + "step": 9285 + }, + { + "epoch": 0.8302932761087267, + "grad_norm": 0.15074854393142795, + "learning_rate": 1.4729931490472515e-05, + "loss": 0.6512, + "step": 9286 + }, + { + "epoch": 0.8303826895565093, + "grad_norm": 0.14876694453842482, + "learning_rate": 1.4714806311402918e-05, + "loss": 0.6668, + "step": 9287 + }, + { + "epoch": 0.8304721030042919, + "grad_norm": 0.12357863211114138, + "learning_rate": 1.4699688285099489e-05, + "loss": 0.6245, + "step": 9288 + }, + { + "epoch": 0.8305615164520744, + "grad_norm": 0.15402617413674988, + "learning_rate": 1.4684577412830191e-05, + "loss": 0.6536, + "step": 9289 + }, + { + "epoch": 0.8306509298998569, + "grad_norm": 0.14798913069106284, + "learning_rate": 1.4669473695862368e-05, + "loss": 0.643, + "step": 9290 + }, + { + "epoch": 0.8307403433476395, + "grad_norm": 0.12801875719729358, + "learning_rate": 1.4654377135462715e-05, + "loss": 0.6188, + "step": 9291 + }, + { + "epoch": 0.830829756795422, + "grad_norm": 0.12840880671219287, + "learning_rate": 1.4639287732897377e-05, + "loss": 0.5918, + "step": 9292 + }, + { + "epoch": 0.8309191702432046, + "grad_norm": 0.13114847645967237, + "learning_rate": 1.4624205489431886e-05, + "loss": 0.6256, + "step": 9293 + }, + { + "epoch": 0.8310085836909872, + "grad_norm": 0.14587201262579577, + "learning_rate": 1.4609130406331172e-05, + "loss": 0.6303, + "step": 9294 + }, + { + "epoch": 0.8310979971387696, + "grad_norm": 0.12718893600368453, + "learning_rate": 1.4594062484859595e-05, + "loss": 0.6038, + "step": 9295 + }, + { + "epoch": 0.8311874105865522, + "grad_norm": 0.14345042349141868, + "learning_rate": 1.4579001726280828e-05, + "loss": 0.6616, + "step": 9296 + }, + { + "epoch": 0.8312768240343348, + "grad_norm": 0.13583144337252603, + "learning_rate": 1.4563948131858018e-05, + "loss": 0.6198, + "step": 9297 + }, + { + "epoch": 0.8313662374821174, + "grad_norm": 0.1425346434909573, + "learning_rate": 1.4548901702853701e-05, + "loss": 0.617, + "step": 9298 + }, + { + "epoch": 0.8314556509298998, + "grad_norm": 0.1379827817511024, + "learning_rate": 1.4533862440529799e-05, + "loss": 0.6372, + "step": 9299 + }, + { + "epoch": 0.8315450643776824, + "grad_norm": 0.14240553196396144, + "learning_rate": 1.4518830346147638e-05, + "loss": 0.6433, + "step": 9300 + }, + { + "epoch": 0.831634477825465, + "grad_norm": 0.12921497580527036, + "learning_rate": 1.4503805420967964e-05, + "loss": 0.6086, + "step": 9301 + }, + { + "epoch": 0.8317238912732475, + "grad_norm": 0.13939968020892474, + "learning_rate": 1.4488787666250858e-05, + "loss": 0.632, + "step": 9302 + }, + { + "epoch": 0.83181330472103, + "grad_norm": 0.1287240753605672, + "learning_rate": 1.4473777083255857e-05, + "loss": 0.6283, + "step": 9303 + }, + { + "epoch": 0.8319027181688126, + "grad_norm": 0.1350560266235124, + "learning_rate": 1.4458773673241899e-05, + "loss": 0.6483, + "step": 9304 + }, + { + "epoch": 0.8319921316165951, + "grad_norm": 0.11601639398445113, + "learning_rate": 1.4443777437467265e-05, + "loss": 0.6029, + "step": 9305 + }, + { + "epoch": 0.8320815450643777, + "grad_norm": 0.13654271697338535, + "learning_rate": 1.4428788377189672e-05, + "loss": 0.6342, + "step": 9306 + }, + { + "epoch": 0.8321709585121603, + "grad_norm": 0.15603522954891588, + "learning_rate": 1.4413806493666293e-05, + "loss": 0.6142, + "step": 9307 + }, + { + "epoch": 0.8322603719599427, + "grad_norm": 0.1307896714729219, + "learning_rate": 1.4398831788153588e-05, + "loss": 0.5949, + "step": 9308 + }, + { + "epoch": 0.8323497854077253, + "grad_norm": 0.138896602640025, + "learning_rate": 1.4383864261907476e-05, + "loss": 0.6394, + "step": 9309 + }, + { + "epoch": 0.8324391988555079, + "grad_norm": 0.14882654159137873, + "learning_rate": 1.4368903916183296e-05, + "loss": 0.65, + "step": 9310 + }, + { + "epoch": 0.8325286123032904, + "grad_norm": 0.13172313538697714, + "learning_rate": 1.4353950752235702e-05, + "loss": 0.602, + "step": 9311 + }, + { + "epoch": 0.8326180257510729, + "grad_norm": 0.1496357799160131, + "learning_rate": 1.433900477131882e-05, + "loss": 0.6273, + "step": 9312 + }, + { + "epoch": 0.8327074391988555, + "grad_norm": 0.13415974468108058, + "learning_rate": 1.4324065974686162e-05, + "loss": 0.6591, + "step": 9313 + }, + { + "epoch": 0.832796852646638, + "grad_norm": 0.14325187252588772, + "learning_rate": 1.4309134363590615e-05, + "loss": 0.6039, + "step": 9314 + }, + { + "epoch": 0.8328862660944206, + "grad_norm": 0.1483099381835713, + "learning_rate": 1.4294209939284509e-05, + "loss": 0.6541, + "step": 9315 + }, + { + "epoch": 0.8329756795422032, + "grad_norm": 0.13399379011196286, + "learning_rate": 1.4279292703019486e-05, + "loss": 0.6344, + "step": 9316 + }, + { + "epoch": 0.8330650929899857, + "grad_norm": 0.14124393166140653, + "learning_rate": 1.426438265604666e-05, + "loss": 0.629, + "step": 9317 + }, + { + "epoch": 0.8331545064377682, + "grad_norm": 0.15454103904875935, + "learning_rate": 1.4249479799616538e-05, + "loss": 0.6559, + "step": 9318 + }, + { + "epoch": 0.8332439198855508, + "grad_norm": 0.13755359580479637, + "learning_rate": 1.4234584134978956e-05, + "loss": 0.637, + "step": 9319 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.1240567580176253, + "learning_rate": 1.4219695663383214e-05, + "loss": 0.5304, + "step": 9320 + }, + { + "epoch": 0.8334227467811158, + "grad_norm": 0.14273870109093686, + "learning_rate": 1.4204814386078036e-05, + "loss": 0.6935, + "step": 9321 + }, + { + "epoch": 0.8335121602288984, + "grad_norm": 0.13417821081188425, + "learning_rate": 1.4189940304311444e-05, + "loss": 0.5845, + "step": 9322 + }, + { + "epoch": 0.833601573676681, + "grad_norm": 0.13069009499207151, + "learning_rate": 1.417507341933092e-05, + "loss": 0.6035, + "step": 9323 + }, + { + "epoch": 0.8336909871244635, + "grad_norm": 0.1329866414839239, + "learning_rate": 1.4160213732383364e-05, + "loss": 0.6089, + "step": 9324 + }, + { + "epoch": 0.8337804005722461, + "grad_norm": 0.14428379790371484, + "learning_rate": 1.4145361244714995e-05, + "loss": 0.5949, + "step": 9325 + }, + { + "epoch": 0.8338698140200286, + "grad_norm": 0.11850327690388361, + "learning_rate": 1.4130515957571488e-05, + "loss": 0.6136, + "step": 9326 + }, + { + "epoch": 0.8339592274678111, + "grad_norm": 0.1404409920667865, + "learning_rate": 1.4115677872197908e-05, + "loss": 0.6364, + "step": 9327 + }, + { + "epoch": 0.8340486409155937, + "grad_norm": 0.14321261918408926, + "learning_rate": 1.41008469898387e-05, + "loss": 0.6066, + "step": 9328 + }, + { + "epoch": 0.8341380543633763, + "grad_norm": 0.13939576561711098, + "learning_rate": 1.4086023311737716e-05, + "loss": 0.6241, + "step": 9329 + }, + { + "epoch": 0.8342274678111588, + "grad_norm": 0.13531601228275492, + "learning_rate": 1.4071206839138217e-05, + "loss": 0.6344, + "step": 9330 + }, + { + "epoch": 0.8343168812589413, + "grad_norm": 0.12470892510996848, + "learning_rate": 1.4056397573282808e-05, + "loss": 0.6149, + "step": 9331 + }, + { + "epoch": 0.8344062947067239, + "grad_norm": 0.13840540791417513, + "learning_rate": 1.4041595515413542e-05, + "loss": 0.6344, + "step": 9332 + }, + { + "epoch": 0.8344957081545065, + "grad_norm": 0.12285207388742775, + "learning_rate": 1.4026800666771867e-05, + "loss": 0.6068, + "step": 9333 + }, + { + "epoch": 0.834585121602289, + "grad_norm": 0.12873727025644718, + "learning_rate": 1.4012013028598547e-05, + "loss": 0.5611, + "step": 9334 + }, + { + "epoch": 0.8346745350500715, + "grad_norm": 0.1475912752449455, + "learning_rate": 1.3997232602133892e-05, + "loss": 0.6688, + "step": 9335 + }, + { + "epoch": 0.8347639484978541, + "grad_norm": 0.1267504338278307, + "learning_rate": 1.3982459388617452e-05, + "loss": 0.6253, + "step": 9336 + }, + { + "epoch": 0.8348533619456366, + "grad_norm": 0.13843926603179899, + "learning_rate": 1.3967693389288261e-05, + "loss": 0.6828, + "step": 9337 + }, + { + "epoch": 0.8349427753934192, + "grad_norm": 0.13421410412225565, + "learning_rate": 1.3952934605384749e-05, + "loss": 0.5826, + "step": 9338 + }, + { + "epoch": 0.8350321888412017, + "grad_norm": 0.13668378076683269, + "learning_rate": 1.393818303814467e-05, + "loss": 0.6689, + "step": 9339 + }, + { + "epoch": 0.8351216022889842, + "grad_norm": 0.14142208268273757, + "learning_rate": 1.3923438688805235e-05, + "loss": 0.6302, + "step": 9340 + }, + { + "epoch": 0.8352110157367668, + "grad_norm": 0.14722145627671074, + "learning_rate": 1.3908701558603054e-05, + "loss": 0.6148, + "step": 9341 + }, + { + "epoch": 0.8353004291845494, + "grad_norm": 0.1710616440231489, + "learning_rate": 1.3893971648774095e-05, + "loss": 0.6325, + "step": 9342 + }, + { + "epoch": 0.835389842632332, + "grad_norm": 0.13794604019499548, + "learning_rate": 1.3879248960553737e-05, + "loss": 0.6299, + "step": 9343 + }, + { + "epoch": 0.8354792560801144, + "grad_norm": 0.1479635578434282, + "learning_rate": 1.386453349517679e-05, + "loss": 0.6408, + "step": 9344 + }, + { + "epoch": 0.835568669527897, + "grad_norm": 0.1515066578024089, + "learning_rate": 1.384982525387738e-05, + "loss": 0.691, + "step": 9345 + }, + { + "epoch": 0.8356580829756796, + "grad_norm": 0.13530991597379446, + "learning_rate": 1.3835124237889074e-05, + "loss": 0.6231, + "step": 9346 + }, + { + "epoch": 0.8357474964234621, + "grad_norm": 0.13320059317658753, + "learning_rate": 1.3820430448444866e-05, + "loss": 0.576, + "step": 9347 + }, + { + "epoch": 0.8358369098712446, + "grad_norm": 0.14757065711262962, + "learning_rate": 1.3805743886777022e-05, + "loss": 0.6527, + "step": 9348 + }, + { + "epoch": 0.8359263233190272, + "grad_norm": 0.14484761837475332, + "learning_rate": 1.3791064554117394e-05, + "loss": 0.6184, + "step": 9349 + }, + { + "epoch": 0.8360157367668097, + "grad_norm": 0.14742972340405658, + "learning_rate": 1.377639245169704e-05, + "loss": 0.6453, + "step": 9350 + }, + { + "epoch": 0.8361051502145923, + "grad_norm": 0.13080925578156796, + "learning_rate": 1.376172758074653e-05, + "loss": 0.6272, + "step": 9351 + }, + { + "epoch": 0.8361945636623748, + "grad_norm": 0.14347949903800802, + "learning_rate": 1.3747069942495794e-05, + "loss": 0.6473, + "step": 9352 + }, + { + "epoch": 0.8362839771101573, + "grad_norm": 0.15547959587105684, + "learning_rate": 1.3732419538174112e-05, + "loss": 0.6587, + "step": 9353 + }, + { + "epoch": 0.8363733905579399, + "grad_norm": 0.14559869643073825, + "learning_rate": 1.3717776369010216e-05, + "loss": 0.5906, + "step": 9354 + }, + { + "epoch": 0.8364628040057225, + "grad_norm": 0.1477144583869734, + "learning_rate": 1.370314043623222e-05, + "loss": 0.6231, + "step": 9355 + }, + { + "epoch": 0.836552217453505, + "grad_norm": 0.12535902352515213, + "learning_rate": 1.368851174106761e-05, + "loss": 0.6288, + "step": 9356 + }, + { + "epoch": 0.8366416309012875, + "grad_norm": 0.13669681754191965, + "learning_rate": 1.3673890284743285e-05, + "loss": 0.5992, + "step": 9357 + }, + { + "epoch": 0.8367310443490701, + "grad_norm": 0.14366855881251633, + "learning_rate": 1.3659276068485549e-05, + "loss": 0.6651, + "step": 9358 + }, + { + "epoch": 0.8368204577968527, + "grad_norm": 0.15273548716599983, + "learning_rate": 1.3644669093520035e-05, + "loss": 0.6206, + "step": 9359 + }, + { + "epoch": 0.8369098712446352, + "grad_norm": 0.1406516373089417, + "learning_rate": 1.363006936107183e-05, + "loss": 0.6356, + "step": 9360 + }, + { + "epoch": 0.8369992846924177, + "grad_norm": 0.14076502080604017, + "learning_rate": 1.3615476872365419e-05, + "loss": 0.6197, + "step": 9361 + }, + { + "epoch": 0.8370886981402003, + "grad_norm": 0.12812020150029357, + "learning_rate": 1.3600891628624601e-05, + "loss": 0.5661, + "step": 9362 + }, + { + "epoch": 0.8371781115879828, + "grad_norm": 0.14113614904847852, + "learning_rate": 1.3586313631072668e-05, + "loss": 0.6319, + "step": 9363 + }, + { + "epoch": 0.8372675250357654, + "grad_norm": 0.13270593058954236, + "learning_rate": 1.357174288093228e-05, + "loss": 0.6355, + "step": 9364 + }, + { + "epoch": 0.837356938483548, + "grad_norm": 0.15452032802127627, + "learning_rate": 1.355717937942541e-05, + "loss": 0.626, + "step": 9365 + }, + { + "epoch": 0.8374463519313304, + "grad_norm": 0.13612342748231182, + "learning_rate": 1.3542623127773523e-05, + "loss": 0.6538, + "step": 9366 + }, + { + "epoch": 0.837535765379113, + "grad_norm": 0.15320590446978977, + "learning_rate": 1.3528074127197432e-05, + "loss": 0.5886, + "step": 9367 + }, + { + "epoch": 0.8376251788268956, + "grad_norm": 0.1440113318178802, + "learning_rate": 1.3513532378917281e-05, + "loss": 0.605, + "step": 9368 + }, + { + "epoch": 0.8377145922746781, + "grad_norm": 0.15261349588571868, + "learning_rate": 1.3498997884152776e-05, + "loss": 0.6354, + "step": 9369 + }, + { + "epoch": 0.8378040057224606, + "grad_norm": 0.1326683286717112, + "learning_rate": 1.3484470644122826e-05, + "loss": 0.63, + "step": 9370 + }, + { + "epoch": 0.8378934191702432, + "grad_norm": 0.14537990158687647, + "learning_rate": 1.3469950660045838e-05, + "loss": 0.6255, + "step": 9371 + }, + { + "epoch": 0.8379828326180258, + "grad_norm": 0.13052316258500107, + "learning_rate": 1.3455437933139614e-05, + "loss": 0.6257, + "step": 9372 + }, + { + "epoch": 0.8380722460658083, + "grad_norm": 0.1434912981495475, + "learning_rate": 1.344093246462126e-05, + "loss": 0.6183, + "step": 9373 + }, + { + "epoch": 0.8381616595135909, + "grad_norm": 0.14301575332342875, + "learning_rate": 1.3426434255707365e-05, + "loss": 0.6597, + "step": 9374 + }, + { + "epoch": 0.8382510729613734, + "grad_norm": 0.13753028539361908, + "learning_rate": 1.3411943307613883e-05, + "loss": 0.6243, + "step": 9375 + }, + { + "epoch": 0.8383404864091559, + "grad_norm": 0.15236690955774837, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6112, + "step": 9376 + }, + { + "epoch": 0.8384298998569385, + "grad_norm": 0.13631113203239403, + "learning_rate": 1.3382983198748855e-05, + "loss": 0.6273, + "step": 9377 + }, + { + "epoch": 0.8385193133047211, + "grad_norm": 0.11884389383413334, + "learning_rate": 1.33685140404062e-05, + "loss": 0.6256, + "step": 9378 + }, + { + "epoch": 0.8386087267525035, + "grad_norm": 0.16337640225973316, + "learning_rate": 1.3354052147741625e-05, + "loss": 0.645, + "step": 9379 + }, + { + "epoch": 0.8386981402002861, + "grad_norm": 0.1443417344839206, + "learning_rate": 1.333959752196805e-05, + "loss": 0.5841, + "step": 9380 + }, + { + "epoch": 0.8387875536480687, + "grad_norm": 0.14244008496294958, + "learning_rate": 1.3325150164297796e-05, + "loss": 0.6289, + "step": 9381 + }, + { + "epoch": 0.8388769670958512, + "grad_norm": 0.13727762283472703, + "learning_rate": 1.3310710075942479e-05, + "loss": 0.6414, + "step": 9382 + }, + { + "epoch": 0.8389663805436338, + "grad_norm": 0.15290338422298352, + "learning_rate": 1.3296277258113254e-05, + "loss": 0.6251, + "step": 9383 + }, + { + "epoch": 0.8390557939914163, + "grad_norm": 0.15258782663293285, + "learning_rate": 1.328185171202052e-05, + "loss": 0.6737, + "step": 9384 + }, + { + "epoch": 0.8391452074391988, + "grad_norm": 0.1462129660616898, + "learning_rate": 1.3267433438874155e-05, + "loss": 0.6259, + "step": 9385 + }, + { + "epoch": 0.8392346208869814, + "grad_norm": 0.1534515334563192, + "learning_rate": 1.3253022439883412e-05, + "loss": 0.6546, + "step": 9386 + }, + { + "epoch": 0.839324034334764, + "grad_norm": 0.13770212157589803, + "learning_rate": 1.3238618716256923e-05, + "loss": 0.6522, + "step": 9387 + }, + { + "epoch": 0.8394134477825465, + "grad_norm": 0.1446281132798228, + "learning_rate": 1.322422226920268e-05, + "loss": 0.648, + "step": 9388 + }, + { + "epoch": 0.839502861230329, + "grad_norm": 0.15318370322750768, + "learning_rate": 1.3209833099928114e-05, + "loss": 0.6648, + "step": 9389 + }, + { + "epoch": 0.8395922746781116, + "grad_norm": 0.1442475887969915, + "learning_rate": 1.3195451209640041e-05, + "loss": 0.6465, + "step": 9390 + }, + { + "epoch": 0.8396816881258942, + "grad_norm": 0.1339127156304286, + "learning_rate": 1.3181076599544629e-05, + "loss": 0.6297, + "step": 9391 + }, + { + "epoch": 0.8397711015736766, + "grad_norm": 0.14048921798163258, + "learning_rate": 1.3166709270847511e-05, + "loss": 0.6398, + "step": 9392 + }, + { + "epoch": 0.8398605150214592, + "grad_norm": 0.14084966477686037, + "learning_rate": 1.3152349224753579e-05, + "loss": 0.6595, + "step": 9393 + }, + { + "epoch": 0.8399499284692418, + "grad_norm": 0.15662381480982526, + "learning_rate": 1.3137996462467239e-05, + "loss": 0.6192, + "step": 9394 + }, + { + "epoch": 0.8400393419170243, + "grad_norm": 0.13847550678980636, + "learning_rate": 1.312365098519226e-05, + "loss": 0.6546, + "step": 9395 + }, + { + "epoch": 0.8401287553648069, + "grad_norm": 0.140751347053482, + "learning_rate": 1.31093127941317e-05, + "loss": 0.6517, + "step": 9396 + }, + { + "epoch": 0.8402181688125894, + "grad_norm": 0.13299599907756027, + "learning_rate": 1.3094981890488167e-05, + "loss": 0.5821, + "step": 9397 + }, + { + "epoch": 0.8403075822603719, + "grad_norm": 0.14374044736429661, + "learning_rate": 1.3080658275463565e-05, + "loss": 0.6025, + "step": 9398 + }, + { + "epoch": 0.8403969957081545, + "grad_norm": 0.1432159826336771, + "learning_rate": 1.3066341950259165e-05, + "loss": 0.6313, + "step": 9399 + }, + { + "epoch": 0.8404864091559371, + "grad_norm": 0.13624949970895725, + "learning_rate": 1.3052032916075674e-05, + "loss": 0.6402, + "step": 9400 + }, + { + "epoch": 0.8405758226037195, + "grad_norm": 0.15990408705952563, + "learning_rate": 1.3037731174113188e-05, + "loss": 0.6889, + "step": 9401 + }, + { + "epoch": 0.8406652360515021, + "grad_norm": 0.1393985694803436, + "learning_rate": 1.3023436725571158e-05, + "loss": 0.6134, + "step": 9402 + }, + { + "epoch": 0.8407546494992847, + "grad_norm": 0.1302555310448898, + "learning_rate": 1.3009149571648438e-05, + "loss": 0.6302, + "step": 9403 + }, + { + "epoch": 0.8408440629470673, + "grad_norm": 0.13648837397391272, + "learning_rate": 1.2994869713543289e-05, + "loss": 0.6575, + "step": 9404 + }, + { + "epoch": 0.8409334763948498, + "grad_norm": 0.1390214613471304, + "learning_rate": 1.2980597152453344e-05, + "loss": 0.5891, + "step": 9405 + }, + { + "epoch": 0.8410228898426323, + "grad_norm": 0.12655235941486567, + "learning_rate": 1.2966331889575644e-05, + "loss": 0.6249, + "step": 9406 + }, + { + "epoch": 0.8411123032904149, + "grad_norm": 0.11208962015873908, + "learning_rate": 1.2952073926106556e-05, + "loss": 0.6038, + "step": 9407 + }, + { + "epoch": 0.8412017167381974, + "grad_norm": 0.1317953338948652, + "learning_rate": 1.29378232632419e-05, + "loss": 0.6377, + "step": 9408 + }, + { + "epoch": 0.84129113018598, + "grad_norm": 0.14423802837805313, + "learning_rate": 1.2923579902176886e-05, + "loss": 0.6414, + "step": 9409 + }, + { + "epoch": 0.8413805436337625, + "grad_norm": 0.1411338613059139, + "learning_rate": 1.2909343844106014e-05, + "loss": 0.6335, + "step": 9410 + }, + { + "epoch": 0.841469957081545, + "grad_norm": 0.15466533409673117, + "learning_rate": 1.289511509022332e-05, + "loss": 0.6267, + "step": 9411 + }, + { + "epoch": 0.8415593705293276, + "grad_norm": 0.14330308528508673, + "learning_rate": 1.2880893641722147e-05, + "loss": 0.6407, + "step": 9412 + }, + { + "epoch": 0.8416487839771102, + "grad_norm": 0.13562943070747338, + "learning_rate": 1.2866679499795198e-05, + "loss": 0.6333, + "step": 9413 + }, + { + "epoch": 0.8417381974248928, + "grad_norm": 0.12365060602606824, + "learning_rate": 1.2852472665634607e-05, + "loss": 0.6451, + "step": 9414 + }, + { + "epoch": 0.8418276108726752, + "grad_norm": 0.14648778146984903, + "learning_rate": 1.28382731404319e-05, + "loss": 0.6388, + "step": 9415 + }, + { + "epoch": 0.8419170243204578, + "grad_norm": 0.13950307734114475, + "learning_rate": 1.2824080925377945e-05, + "loss": 0.6458, + "step": 9416 + }, + { + "epoch": 0.8420064377682404, + "grad_norm": 0.13457905014884544, + "learning_rate": 1.2809896021663037e-05, + "loss": 0.6449, + "step": 9417 + }, + { + "epoch": 0.8420958512160229, + "grad_norm": 0.13861496322947658, + "learning_rate": 1.2795718430476854e-05, + "loss": 0.6391, + "step": 9418 + }, + { + "epoch": 0.8421852646638054, + "grad_norm": 0.14053555103217322, + "learning_rate": 1.278154815300845e-05, + "loss": 0.6229, + "step": 9419 + }, + { + "epoch": 0.842274678111588, + "grad_norm": 0.14463411001619297, + "learning_rate": 1.2767385190446257e-05, + "loss": 0.6494, + "step": 9420 + }, + { + "epoch": 0.8423640915593705, + "grad_norm": 0.14335091824409826, + "learning_rate": 1.2753229543978151e-05, + "loss": 0.6437, + "step": 9421 + }, + { + "epoch": 0.8424535050071531, + "grad_norm": 0.1613520078456706, + "learning_rate": 1.2739081214791293e-05, + "loss": 0.6429, + "step": 9422 + }, + { + "epoch": 0.8425429184549357, + "grad_norm": 0.12468129135720436, + "learning_rate": 1.2724940204072311e-05, + "loss": 0.6397, + "step": 9423 + }, + { + "epoch": 0.8426323319027181, + "grad_norm": 0.12172361467910689, + "learning_rate": 1.271080651300719e-05, + "loss": 0.5921, + "step": 9424 + }, + { + "epoch": 0.8427217453505007, + "grad_norm": 0.15583338122047016, + "learning_rate": 1.2696680142781313e-05, + "loss": 0.6608, + "step": 9425 + }, + { + "epoch": 0.8428111587982833, + "grad_norm": 0.13532641792076583, + "learning_rate": 1.2682561094579448e-05, + "loss": 0.6605, + "step": 9426 + }, + { + "epoch": 0.8429005722460658, + "grad_norm": 0.14366296327350253, + "learning_rate": 1.2668449369585723e-05, + "loss": 0.6502, + "step": 9427 + }, + { + "epoch": 0.8429899856938483, + "grad_norm": 0.14968358213790248, + "learning_rate": 1.2654344968983668e-05, + "loss": 0.6265, + "step": 9428 + }, + { + "epoch": 0.8430793991416309, + "grad_norm": 0.12398530668283139, + "learning_rate": 1.2640247893956236e-05, + "loss": 0.6176, + "step": 9429 + }, + { + "epoch": 0.8431688125894135, + "grad_norm": 0.13041108853795, + "learning_rate": 1.2626158145685696e-05, + "loss": 0.5894, + "step": 9430 + }, + { + "epoch": 0.843258226037196, + "grad_norm": 0.13714776048156455, + "learning_rate": 1.2612075725353722e-05, + "loss": 0.6346, + "step": 9431 + }, + { + "epoch": 0.8433476394849786, + "grad_norm": 0.15181814807371766, + "learning_rate": 1.259800063414146e-05, + "loss": 0.6365, + "step": 9432 + }, + { + "epoch": 0.843437052932761, + "grad_norm": 0.13378752198198587, + "learning_rate": 1.258393287322932e-05, + "loss": 0.6417, + "step": 9433 + }, + { + "epoch": 0.8435264663805436, + "grad_norm": 0.14854432074726287, + "learning_rate": 1.2569872443797148e-05, + "loss": 0.6025, + "step": 9434 + }, + { + "epoch": 0.8436158798283262, + "grad_norm": 0.12579900278188777, + "learning_rate": 1.2555819347024211e-05, + "loss": 0.6019, + "step": 9435 + }, + { + "epoch": 0.8437052932761088, + "grad_norm": 0.13513735412431016, + "learning_rate": 1.2541773584089079e-05, + "loss": 0.6518, + "step": 9436 + }, + { + "epoch": 0.8437947067238912, + "grad_norm": 0.12958738939048528, + "learning_rate": 1.2527735156169773e-05, + "loss": 0.6513, + "step": 9437 + }, + { + "epoch": 0.8438841201716738, + "grad_norm": 0.15376563519605052, + "learning_rate": 1.2513704064443677e-05, + "loss": 0.6347, + "step": 9438 + }, + { + "epoch": 0.8439735336194564, + "grad_norm": 0.13889745856810604, + "learning_rate": 1.249968031008757e-05, + "loss": 0.6116, + "step": 9439 + }, + { + "epoch": 0.844062947067239, + "grad_norm": 0.15381860647657775, + "learning_rate": 1.2485663894277611e-05, + "loss": 0.6631, + "step": 9440 + }, + { + "epoch": 0.8441523605150214, + "grad_norm": 0.1359952493156896, + "learning_rate": 1.2471654818189316e-05, + "loss": 0.6155, + "step": 9441 + }, + { + "epoch": 0.844241773962804, + "grad_norm": 0.14096665924363203, + "learning_rate": 1.2457653082997634e-05, + "loss": 0.6488, + "step": 9442 + }, + { + "epoch": 0.8443311874105865, + "grad_norm": 0.13610205033366948, + "learning_rate": 1.2443658689876847e-05, + "loss": 0.6549, + "step": 9443 + }, + { + "epoch": 0.8444206008583691, + "grad_norm": 0.14907187849040898, + "learning_rate": 1.2429671640000695e-05, + "loss": 0.6428, + "step": 9444 + }, + { + "epoch": 0.8445100143061517, + "grad_norm": 0.14928306033008934, + "learning_rate": 1.2415691934542183e-05, + "loss": 0.6643, + "step": 9445 + }, + { + "epoch": 0.8445994277539342, + "grad_norm": 0.1313939593501914, + "learning_rate": 1.2401719574673854e-05, + "loss": 0.6274, + "step": 9446 + }, + { + "epoch": 0.8446888412017167, + "grad_norm": 0.12900125393430878, + "learning_rate": 1.2387754561567488e-05, + "loss": 0.6068, + "step": 9447 + }, + { + "epoch": 0.8447782546494993, + "grad_norm": 0.14681373388062932, + "learning_rate": 1.237379689639434e-05, + "loss": 0.6066, + "step": 9448 + }, + { + "epoch": 0.8448676680972819, + "grad_norm": 0.14062624882923777, + "learning_rate": 1.2359846580325041e-05, + "loss": 0.6117, + "step": 9449 + }, + { + "epoch": 0.8449570815450643, + "grad_norm": 0.15477200610512182, + "learning_rate": 1.2345903614529552e-05, + "loss": 0.6588, + "step": 9450 + }, + { + "epoch": 0.8450464949928469, + "grad_norm": 0.12023625575253223, + "learning_rate": 1.233196800017724e-05, + "loss": 0.6088, + "step": 9451 + }, + { + "epoch": 0.8451359084406295, + "grad_norm": 0.13387065720261565, + "learning_rate": 1.2318039738436936e-05, + "loss": 0.6591, + "step": 9452 + }, + { + "epoch": 0.845225321888412, + "grad_norm": 0.1352442114004467, + "learning_rate": 1.230411883047673e-05, + "loss": 0.6165, + "step": 9453 + }, + { + "epoch": 0.8453147353361946, + "grad_norm": 0.1526756510447786, + "learning_rate": 1.2290205277464161e-05, + "loss": 0.6283, + "step": 9454 + }, + { + "epoch": 0.8454041487839771, + "grad_norm": 0.13632652584282873, + "learning_rate": 1.2276299080566178e-05, + "loss": 0.6286, + "step": 9455 + }, + { + "epoch": 0.8454935622317596, + "grad_norm": 0.1353055574655835, + "learning_rate": 1.2262400240949023e-05, + "loss": 0.6486, + "step": 9456 + }, + { + "epoch": 0.8455829756795422, + "grad_norm": 0.12459038208301534, + "learning_rate": 1.22485087597784e-05, + "loss": 0.6061, + "step": 9457 + }, + { + "epoch": 0.8456723891273248, + "grad_norm": 0.13126863788078375, + "learning_rate": 1.2234624638219372e-05, + "loss": 0.5946, + "step": 9458 + }, + { + "epoch": 0.8457618025751072, + "grad_norm": 0.1443012923498258, + "learning_rate": 1.2220747877436378e-05, + "loss": 0.6735, + "step": 9459 + }, + { + "epoch": 0.8458512160228898, + "grad_norm": 0.14462965409389197, + "learning_rate": 1.2206878478593276e-05, + "loss": 0.6399, + "step": 9460 + }, + { + "epoch": 0.8459406294706724, + "grad_norm": 0.13530427354333732, + "learning_rate": 1.2193016442853221e-05, + "loss": 0.6101, + "step": 9461 + }, + { + "epoch": 0.846030042918455, + "grad_norm": 0.15013549024982967, + "learning_rate": 1.2179161771378845e-05, + "loss": 0.6228, + "step": 9462 + }, + { + "epoch": 0.8461194563662375, + "grad_norm": 0.1499664275458647, + "learning_rate": 1.2165314465332122e-05, + "loss": 0.6275, + "step": 9463 + }, + { + "epoch": 0.84620886981402, + "grad_norm": 0.15134913452862053, + "learning_rate": 1.2151474525874374e-05, + "loss": 0.6386, + "step": 9464 + }, + { + "epoch": 0.8462982832618026, + "grad_norm": 0.15401721851261282, + "learning_rate": 1.2137641954166346e-05, + "loss": 0.6805, + "step": 9465 + }, + { + "epoch": 0.8463876967095851, + "grad_norm": 0.13322228434030958, + "learning_rate": 1.212381675136821e-05, + "loss": 0.6314, + "step": 9466 + }, + { + "epoch": 0.8464771101573677, + "grad_norm": 0.14307652232821041, + "learning_rate": 1.2109998918639431e-05, + "loss": 0.6488, + "step": 9467 + }, + { + "epoch": 0.8465665236051502, + "grad_norm": 0.13477669918205665, + "learning_rate": 1.209618845713889e-05, + "loss": 0.5958, + "step": 9468 + }, + { + "epoch": 0.8466559370529327, + "grad_norm": 0.15052956016702956, + "learning_rate": 1.2082385368024884e-05, + "loss": 0.6453, + "step": 9469 + }, + { + "epoch": 0.8467453505007153, + "grad_norm": 0.14250661146715304, + "learning_rate": 1.2068589652455008e-05, + "loss": 0.6173, + "step": 9470 + }, + { + "epoch": 0.8468347639484979, + "grad_norm": 0.14688385621937955, + "learning_rate": 1.205480131158634e-05, + "loss": 0.6035, + "step": 9471 + }, + { + "epoch": 0.8469241773962805, + "grad_norm": 0.12861885952430852, + "learning_rate": 1.2041020346575272e-05, + "loss": 0.5979, + "step": 9472 + }, + { + "epoch": 0.8470135908440629, + "grad_norm": 0.1321989417271016, + "learning_rate": 1.2027246758577593e-05, + "loss": 0.6434, + "step": 9473 + }, + { + "epoch": 0.8471030042918455, + "grad_norm": 0.1490006220895603, + "learning_rate": 1.2013480548748512e-05, + "loss": 0.7007, + "step": 9474 + }, + { + "epoch": 0.8471924177396281, + "grad_norm": 0.14808831346179643, + "learning_rate": 1.199972171824253e-05, + "loss": 0.6259, + "step": 9475 + }, + { + "epoch": 0.8472818311874106, + "grad_norm": 0.13505379795869904, + "learning_rate": 1.198597026821361e-05, + "loss": 0.6142, + "step": 9476 + }, + { + "epoch": 0.8473712446351931, + "grad_norm": 0.14835615771242017, + "learning_rate": 1.1972226199815074e-05, + "loss": 0.6397, + "step": 9477 + }, + { + "epoch": 0.8474606580829757, + "grad_norm": 0.14663100825053818, + "learning_rate": 1.1958489514199634e-05, + "loss": 0.6533, + "step": 9478 + }, + { + "epoch": 0.8475500715307582, + "grad_norm": 0.15274340572025658, + "learning_rate": 1.1944760212519313e-05, + "loss": 0.6272, + "step": 9479 + }, + { + "epoch": 0.8476394849785408, + "grad_norm": 0.14695154301419977, + "learning_rate": 1.1931038295925645e-05, + "loss": 0.6265, + "step": 9480 + }, + { + "epoch": 0.8477288984263234, + "grad_norm": 0.13703021360726544, + "learning_rate": 1.1917323765569411e-05, + "loss": 0.5762, + "step": 9481 + }, + { + "epoch": 0.8478183118741058, + "grad_norm": 0.1381752169617845, + "learning_rate": 1.1903616622600866e-05, + "loss": 0.6536, + "step": 9482 + }, + { + "epoch": 0.8479077253218884, + "grad_norm": 0.13328783617591455, + "learning_rate": 1.1889916868169614e-05, + "loss": 0.6074, + "step": 9483 + }, + { + "epoch": 0.847997138769671, + "grad_norm": 0.1340410492267393, + "learning_rate": 1.1876224503424615e-05, + "loss": 0.6331, + "step": 9484 + }, + { + "epoch": 0.8480865522174535, + "grad_norm": 0.1409389466542306, + "learning_rate": 1.1862539529514228e-05, + "loss": 0.6278, + "step": 9485 + }, + { + "epoch": 0.848175965665236, + "grad_norm": 0.14067807928583653, + "learning_rate": 1.184886194758621e-05, + "loss": 0.6185, + "step": 9486 + }, + { + "epoch": 0.8482653791130186, + "grad_norm": 0.13444410233204765, + "learning_rate": 1.183519175878769e-05, + "loss": 0.6335, + "step": 9487 + }, + { + "epoch": 0.8483547925608012, + "grad_norm": 0.15420341647760383, + "learning_rate": 1.182152896426515e-05, + "loss": 0.6606, + "step": 9488 + }, + { + "epoch": 0.8484442060085837, + "grad_norm": 0.16066811155603974, + "learning_rate": 1.1807873565164506e-05, + "loss": 0.6596, + "step": 9489 + }, + { + "epoch": 0.8485336194563662, + "grad_norm": 0.14625952215036936, + "learning_rate": 1.1794225562630978e-05, + "loss": 0.6692, + "step": 9490 + }, + { + "epoch": 0.8486230329041488, + "grad_norm": 0.1347728216114412, + "learning_rate": 1.1780584957809227e-05, + "loss": 0.6026, + "step": 9491 + }, + { + "epoch": 0.8487124463519313, + "grad_norm": 0.13260647580570273, + "learning_rate": 1.1766951751843292e-05, + "loss": 0.6245, + "step": 9492 + }, + { + "epoch": 0.8488018597997139, + "grad_norm": 0.14872504289667027, + "learning_rate": 1.1753325945876515e-05, + "loss": 0.5746, + "step": 9493 + }, + { + "epoch": 0.8488912732474965, + "grad_norm": 0.1250195656728929, + "learning_rate": 1.173970754105176e-05, + "loss": 0.5944, + "step": 9494 + }, + { + "epoch": 0.8489806866952789, + "grad_norm": 0.1291460306634856, + "learning_rate": 1.1726096538511122e-05, + "loss": 0.6203, + "step": 9495 + }, + { + "epoch": 0.8490701001430615, + "grad_norm": 0.14241788471383093, + "learning_rate": 1.1712492939396157e-05, + "loss": 0.631, + "step": 9496 + }, + { + "epoch": 0.8491595135908441, + "grad_norm": 0.14105262870792354, + "learning_rate": 1.1698896744847809e-05, + "loss": 0.6287, + "step": 9497 + }, + { + "epoch": 0.8492489270386266, + "grad_norm": 0.13463655653591072, + "learning_rate": 1.168530795600632e-05, + "loss": 0.6054, + "step": 9498 + }, + { + "epoch": 0.8493383404864091, + "grad_norm": 0.1337646390857531, + "learning_rate": 1.1671726574011399e-05, + "loss": 0.6202, + "step": 9499 + }, + { + "epoch": 0.8494277539341917, + "grad_norm": 0.1410852781215582, + "learning_rate": 1.1658152600002104e-05, + "loss": 0.6307, + "step": 9500 + }, + { + "epoch": 0.8495171673819742, + "grad_norm": 0.13817000992909803, + "learning_rate": 1.1644586035116856e-05, + "loss": 0.6058, + "step": 9501 + }, + { + "epoch": 0.8496065808297568, + "grad_norm": 0.14884662176240518, + "learning_rate": 1.1631026880493468e-05, + "loss": 0.6089, + "step": 9502 + }, + { + "epoch": 0.8496959942775394, + "grad_norm": 0.1442495682665767, + "learning_rate": 1.1617475137269152e-05, + "loss": 0.6368, + "step": 9503 + }, + { + "epoch": 0.8497854077253219, + "grad_norm": 0.1282521409608485, + "learning_rate": 1.1603930806580444e-05, + "loss": 0.6317, + "step": 9504 + }, + { + "epoch": 0.8498748211731044, + "grad_norm": 0.11845242161086715, + "learning_rate": 1.15903938895633e-05, + "loss": 0.6161, + "step": 9505 + }, + { + "epoch": 0.849964234620887, + "grad_norm": 0.12745637765954168, + "learning_rate": 1.157686438735307e-05, + "loss": 0.6465, + "step": 9506 + }, + { + "epoch": 0.8500536480686696, + "grad_norm": 0.1293473507381607, + "learning_rate": 1.156334230108439e-05, + "loss": 0.6392, + "step": 9507 + }, + { + "epoch": 0.850143061516452, + "grad_norm": 0.13415728203659363, + "learning_rate": 1.1549827631891418e-05, + "loss": 0.6392, + "step": 9508 + }, + { + "epoch": 0.8502324749642346, + "grad_norm": 0.15280696934476143, + "learning_rate": 1.1536320380907596e-05, + "loss": 0.6662, + "step": 9509 + }, + { + "epoch": 0.8503218884120172, + "grad_norm": 0.14992200700073555, + "learning_rate": 1.1522820549265723e-05, + "loss": 0.6115, + "step": 9510 + }, + { + "epoch": 0.8504113018597997, + "grad_norm": 0.12519194063925518, + "learning_rate": 1.1509328138098041e-05, + "loss": 0.6187, + "step": 9511 + }, + { + "epoch": 0.8505007153075823, + "grad_norm": 0.1336799850900787, + "learning_rate": 1.1495843148536157e-05, + "loss": 0.6095, + "step": 9512 + }, + { + "epoch": 0.8505901287553648, + "grad_norm": 0.16001661530509273, + "learning_rate": 1.1482365581711008e-05, + "loss": 0.6764, + "step": 9513 + }, + { + "epoch": 0.8506795422031473, + "grad_norm": 0.14834814794002227, + "learning_rate": 1.1468895438752947e-05, + "loss": 0.6718, + "step": 9514 + }, + { + "epoch": 0.8507689556509299, + "grad_norm": 0.13020698592948832, + "learning_rate": 1.1455432720791714e-05, + "loss": 0.6255, + "step": 9515 + }, + { + "epoch": 0.8508583690987125, + "grad_norm": 0.12470246842403074, + "learning_rate": 1.1441977428956396e-05, + "loss": 0.6466, + "step": 9516 + }, + { + "epoch": 0.850947782546495, + "grad_norm": 0.12362640294802713, + "learning_rate": 1.1428529564375502e-05, + "loss": 0.5775, + "step": 9517 + }, + { + "epoch": 0.8510371959942775, + "grad_norm": 0.16403796126578246, + "learning_rate": 1.1415089128176847e-05, + "loss": 0.6151, + "step": 9518 + }, + { + "epoch": 0.8511266094420601, + "grad_norm": 0.1538952541041677, + "learning_rate": 1.1401656121487692e-05, + "loss": 0.6478, + "step": 9519 + }, + { + "epoch": 0.8512160228898427, + "grad_norm": 0.14426132996219782, + "learning_rate": 1.1388230545434653e-05, + "loss": 0.6076, + "step": 9520 + }, + { + "epoch": 0.8513054363376252, + "grad_norm": 0.15501191279211515, + "learning_rate": 1.1374812401143653e-05, + "loss": 0.6568, + "step": 9521 + }, + { + "epoch": 0.8513948497854077, + "grad_norm": 0.14192932704531672, + "learning_rate": 1.1361401689740137e-05, + "loss": 0.5988, + "step": 9522 + }, + { + "epoch": 0.8514842632331903, + "grad_norm": 0.15341273384582646, + "learning_rate": 1.1347998412348825e-05, + "loss": 0.6479, + "step": 9523 + }, + { + "epoch": 0.8515736766809728, + "grad_norm": 0.14697374653178727, + "learning_rate": 1.13346025700938e-05, + "loss": 0.662, + "step": 9524 + }, + { + "epoch": 0.8516630901287554, + "grad_norm": 0.15733014824834562, + "learning_rate": 1.1321214164098582e-05, + "loss": 0.6164, + "step": 9525 + }, + { + "epoch": 0.8517525035765379, + "grad_norm": 0.1657542958999634, + "learning_rate": 1.1307833195486062e-05, + "loss": 0.6394, + "step": 9526 + }, + { + "epoch": 0.8518419170243204, + "grad_norm": 0.14250412226029438, + "learning_rate": 1.1294459665378432e-05, + "loss": 0.6422, + "step": 9527 + }, + { + "epoch": 0.851931330472103, + "grad_norm": 0.12639986972682807, + "learning_rate": 1.1281093574897338e-05, + "loss": 0.6356, + "step": 9528 + }, + { + "epoch": 0.8520207439198856, + "grad_norm": 0.14937332198500472, + "learning_rate": 1.1267734925163787e-05, + "loss": 0.6497, + "step": 9529 + }, + { + "epoch": 0.852110157367668, + "grad_norm": 0.13853651253374427, + "learning_rate": 1.1254383717298134e-05, + "loss": 0.657, + "step": 9530 + }, + { + "epoch": 0.8521995708154506, + "grad_norm": 0.1265152433116898, + "learning_rate": 1.1241039952420173e-05, + "loss": 0.6029, + "step": 9531 + }, + { + "epoch": 0.8522889842632332, + "grad_norm": 0.13129807194436993, + "learning_rate": 1.1227703631648978e-05, + "loss": 0.6242, + "step": 9532 + }, + { + "epoch": 0.8523783977110158, + "grad_norm": 0.12341416000974238, + "learning_rate": 1.1214374756103064e-05, + "loss": 0.5931, + "step": 9533 + }, + { + "epoch": 0.8524678111587983, + "grad_norm": 0.1582704661929461, + "learning_rate": 1.1201053326900313e-05, + "loss": 0.603, + "step": 9534 + }, + { + "epoch": 0.8525572246065808, + "grad_norm": 0.14051154947089736, + "learning_rate": 1.1187739345157977e-05, + "loss": 0.5953, + "step": 9535 + }, + { + "epoch": 0.8526466380543634, + "grad_norm": 0.12173898005338343, + "learning_rate": 1.1174432811992685e-05, + "loss": 0.6063, + "step": 9536 + }, + { + "epoch": 0.8527360515021459, + "grad_norm": 0.13231956191198901, + "learning_rate": 1.1161133728520467e-05, + "loss": 0.6183, + "step": 9537 + }, + { + "epoch": 0.8528254649499285, + "grad_norm": 0.1343928480252713, + "learning_rate": 1.1147842095856642e-05, + "loss": 0.6177, + "step": 9538 + }, + { + "epoch": 0.852914878397711, + "grad_norm": 0.1207820243051129, + "learning_rate": 1.1134557915115994e-05, + "loss": 0.656, + "step": 9539 + }, + { + "epoch": 0.8530042918454935, + "grad_norm": 0.12359301324592682, + "learning_rate": 1.112128118741268e-05, + "loss": 0.5779, + "step": 9540 + }, + { + "epoch": 0.8530937052932761, + "grad_norm": 0.13757751851710523, + "learning_rate": 1.1108011913860128e-05, + "loss": 0.6318, + "step": 9541 + }, + { + "epoch": 0.8531831187410587, + "grad_norm": 0.15674853574529576, + "learning_rate": 1.1094750095571282e-05, + "loss": 0.654, + "step": 9542 + }, + { + "epoch": 0.8532725321888412, + "grad_norm": 0.14582006803360514, + "learning_rate": 1.1081495733658409e-05, + "loss": 0.6541, + "step": 9543 + }, + { + "epoch": 0.8533619456366237, + "grad_norm": 0.13682584472368256, + "learning_rate": 1.1068248829233063e-05, + "loss": 0.632, + "step": 9544 + }, + { + "epoch": 0.8534513590844063, + "grad_norm": 0.13500803894082866, + "learning_rate": 1.10550093834063e-05, + "loss": 0.6431, + "step": 9545 + }, + { + "epoch": 0.8535407725321889, + "grad_norm": 0.14999542509865216, + "learning_rate": 1.1041777397288488e-05, + "loss": 0.6443, + "step": 9546 + }, + { + "epoch": 0.8536301859799714, + "grad_norm": 0.12825542920625674, + "learning_rate": 1.1028552871989362e-05, + "loss": 0.6112, + "step": 9547 + }, + { + "epoch": 0.8537195994277539, + "grad_norm": 0.14480038599499928, + "learning_rate": 1.1015335808618055e-05, + "loss": 0.6353, + "step": 9548 + }, + { + "epoch": 0.8538090128755365, + "grad_norm": 0.14583780013900247, + "learning_rate": 1.100212620828307e-05, + "loss": 0.6477, + "step": 9549 + }, + { + "epoch": 0.853898426323319, + "grad_norm": 0.1761125884921413, + "learning_rate": 1.0988924072092266e-05, + "loss": 0.6571, + "step": 9550 + }, + { + "epoch": 0.8539878397711016, + "grad_norm": 0.14116293337347993, + "learning_rate": 1.0975729401152934e-05, + "loss": 0.6161, + "step": 9551 + }, + { + "epoch": 0.8540772532188842, + "grad_norm": 0.1495261469528404, + "learning_rate": 1.0962542196571634e-05, + "loss": 0.6533, + "step": 9552 + }, + { + "epoch": 0.8541666666666666, + "grad_norm": 0.13449828881422243, + "learning_rate": 1.0949362459454393e-05, + "loss": 0.6357, + "step": 9553 + }, + { + "epoch": 0.8542560801144492, + "grad_norm": 0.12059637972650358, + "learning_rate": 1.0936190190906603e-05, + "loss": 0.6106, + "step": 9554 + }, + { + "epoch": 0.8543454935622318, + "grad_norm": 0.1357009421808618, + "learning_rate": 1.0923025392032937e-05, + "loss": 0.6338, + "step": 9555 + }, + { + "epoch": 0.8544349070100143, + "grad_norm": 0.14121291415936751, + "learning_rate": 1.0909868063937567e-05, + "loss": 0.6394, + "step": 9556 + }, + { + "epoch": 0.8545243204577968, + "grad_norm": 0.13365137707528546, + "learning_rate": 1.0896718207723988e-05, + "loss": 0.6648, + "step": 9557 + }, + { + "epoch": 0.8546137339055794, + "grad_norm": 0.13598892857860165, + "learning_rate": 1.0883575824495029e-05, + "loss": 0.6405, + "step": 9558 + }, + { + "epoch": 0.854703147353362, + "grad_norm": 0.1468138395564599, + "learning_rate": 1.0870440915352942e-05, + "loss": 0.6461, + "step": 9559 + }, + { + "epoch": 0.8547925608011445, + "grad_norm": 0.1371689106078361, + "learning_rate": 1.0857313481399355e-05, + "loss": 0.6129, + "step": 9560 + }, + { + "epoch": 0.8548819742489271, + "grad_norm": 0.14529752290777737, + "learning_rate": 1.0844193523735202e-05, + "loss": 0.6732, + "step": 9561 + }, + { + "epoch": 0.8549713876967096, + "grad_norm": 0.12415756291503456, + "learning_rate": 1.0831081043460868e-05, + "loss": 0.632, + "step": 9562 + }, + { + "epoch": 0.8550608011444921, + "grad_norm": 0.15169830366901488, + "learning_rate": 1.081797604167608e-05, + "loss": 0.6478, + "step": 9563 + }, + { + "epoch": 0.8551502145922747, + "grad_norm": 0.14185213918696435, + "learning_rate": 1.0804878519479943e-05, + "loss": 0.6583, + "step": 9564 + }, + { + "epoch": 0.8552396280400573, + "grad_norm": 0.1486797338587544, + "learning_rate": 1.079178847797091e-05, + "loss": 0.5945, + "step": 9565 + }, + { + "epoch": 0.8553290414878397, + "grad_norm": 0.15760663619009954, + "learning_rate": 1.0778705918246867e-05, + "loss": 0.6282, + "step": 9566 + }, + { + "epoch": 0.8554184549356223, + "grad_norm": 0.15181396528574936, + "learning_rate": 1.0765630841404994e-05, + "loss": 0.6566, + "step": 9567 + }, + { + "epoch": 0.8555078683834049, + "grad_norm": 0.13047729463304042, + "learning_rate": 1.0752563248541891e-05, + "loss": 0.6674, + "step": 9568 + }, + { + "epoch": 0.8555972818311874, + "grad_norm": 0.14638880782473246, + "learning_rate": 1.0739503140753516e-05, + "loss": 0.6561, + "step": 9569 + }, + { + "epoch": 0.85568669527897, + "grad_norm": 0.13196907664276378, + "learning_rate": 1.0726450519135222e-05, + "loss": 0.6183, + "step": 9570 + }, + { + "epoch": 0.8557761087267525, + "grad_norm": 0.13826799465997472, + "learning_rate": 1.0713405384781727e-05, + "loss": 0.641, + "step": 9571 + }, + { + "epoch": 0.855865522174535, + "grad_norm": 0.15038848949507136, + "learning_rate": 1.0700367738787064e-05, + "loss": 0.6323, + "step": 9572 + }, + { + "epoch": 0.8559549356223176, + "grad_norm": 0.14151286055153536, + "learning_rate": 1.0687337582244727e-05, + "loss": 0.6556, + "step": 9573 + }, + { + "epoch": 0.8560443490701002, + "grad_norm": 0.13136093439986204, + "learning_rate": 1.067431491624753e-05, + "loss": 0.5886, + "step": 9574 + }, + { + "epoch": 0.8561337625178826, + "grad_norm": 0.14658049614631313, + "learning_rate": 1.0661299741887654e-05, + "loss": 0.6105, + "step": 9575 + }, + { + "epoch": 0.8562231759656652, + "grad_norm": 0.1406074567191104, + "learning_rate": 1.0648292060256649e-05, + "loss": 0.606, + "step": 9576 + }, + { + "epoch": 0.8563125894134478, + "grad_norm": 0.14170968905638134, + "learning_rate": 1.0635291872445518e-05, + "loss": 0.6173, + "step": 9577 + }, + { + "epoch": 0.8564020028612304, + "grad_norm": 0.14306360126266882, + "learning_rate": 1.0622299179544516e-05, + "loss": 0.6378, + "step": 9578 + }, + { + "epoch": 0.8564914163090128, + "grad_norm": 0.1613354410025655, + "learning_rate": 1.0609313982643331e-05, + "loss": 0.6903, + "step": 9579 + }, + { + "epoch": 0.8565808297567954, + "grad_norm": 0.13004104132594094, + "learning_rate": 1.0596336282831054e-05, + "loss": 0.5616, + "step": 9580 + }, + { + "epoch": 0.856670243204578, + "grad_norm": 0.12925801089694794, + "learning_rate": 1.0583366081196066e-05, + "loss": 0.5759, + "step": 9581 + }, + { + "epoch": 0.8567596566523605, + "grad_norm": 0.14336944608288663, + "learning_rate": 1.0570403378826166e-05, + "loss": 0.643, + "step": 9582 + }, + { + "epoch": 0.8568490701001431, + "grad_norm": 0.13837819518972622, + "learning_rate": 1.0557448176808537e-05, + "loss": 0.6295, + "step": 9583 + }, + { + "epoch": 0.8569384835479256, + "grad_norm": 0.14453907317034234, + "learning_rate": 1.0544500476229713e-05, + "loss": 0.6549, + "step": 9584 + }, + { + "epoch": 0.8570278969957081, + "grad_norm": 0.14736690355089005, + "learning_rate": 1.0531560278175611e-05, + "loss": 0.6145, + "step": 9585 + }, + { + "epoch": 0.8571173104434907, + "grad_norm": 0.15027778314662754, + "learning_rate": 1.0518627583731477e-05, + "loss": 0.6167, + "step": 9586 + }, + { + "epoch": 0.8572067238912733, + "grad_norm": 0.1325061922265638, + "learning_rate": 1.0505702393981987e-05, + "loss": 0.598, + "step": 9587 + }, + { + "epoch": 0.8572961373390557, + "grad_norm": 0.13807679431921782, + "learning_rate": 1.0492784710011184e-05, + "loss": 0.6188, + "step": 9588 + }, + { + "epoch": 0.8573855507868383, + "grad_norm": 0.14911986731672747, + "learning_rate": 1.04798745329024e-05, + "loss": 0.65, + "step": 9589 + }, + { + "epoch": 0.8574749642346209, + "grad_norm": 0.14241527356980507, + "learning_rate": 1.0466971863738406e-05, + "loss": 0.672, + "step": 9590 + }, + { + "epoch": 0.8575643776824035, + "grad_norm": 0.14722650212814775, + "learning_rate": 1.045407670360139e-05, + "loss": 0.6365, + "step": 9591 + }, + { + "epoch": 0.857653791130186, + "grad_norm": 0.12687708212646304, + "learning_rate": 1.0441189053572809e-05, + "loss": 0.6192, + "step": 9592 + }, + { + "epoch": 0.8577432045779685, + "grad_norm": 0.1468415356290416, + "learning_rate": 1.0428308914733531e-05, + "loss": 0.6423, + "step": 9593 + }, + { + "epoch": 0.8578326180257511, + "grad_norm": 0.15349347301472263, + "learning_rate": 1.0415436288163826e-05, + "loss": 0.6725, + "step": 9594 + }, + { + "epoch": 0.8579220314735336, + "grad_norm": 0.14526941835350268, + "learning_rate": 1.0402571174943276e-05, + "loss": 0.5948, + "step": 9595 + }, + { + "epoch": 0.8580114449213162, + "grad_norm": 0.1303209911129903, + "learning_rate": 1.0389713576150883e-05, + "loss": 0.6015, + "step": 9596 + }, + { + "epoch": 0.8581008583690987, + "grad_norm": 0.16131293653857057, + "learning_rate": 1.0376863492864975e-05, + "loss": 0.6783, + "step": 9597 + }, + { + "epoch": 0.8581902718168812, + "grad_norm": 0.138021328788474, + "learning_rate": 1.0364020926163298e-05, + "loss": 0.6557, + "step": 9598 + }, + { + "epoch": 0.8582796852646638, + "grad_norm": 0.13811803436125442, + "learning_rate": 1.0351185877122938e-05, + "loss": 0.619, + "step": 9599 + }, + { + "epoch": 0.8583690987124464, + "grad_norm": 0.1373080599470215, + "learning_rate": 1.0338358346820353e-05, + "loss": 0.5968, + "step": 9600 + }, + { + "epoch": 0.858458512160229, + "grad_norm": 0.1496205355078144, + "learning_rate": 1.0325538336331364e-05, + "loss": 0.617, + "step": 9601 + }, + { + "epoch": 0.8585479256080114, + "grad_norm": 0.14137709328468437, + "learning_rate": 1.0312725846731175e-05, + "loss": 0.6113, + "step": 9602 + }, + { + "epoch": 0.858637339055794, + "grad_norm": 0.14928850033663332, + "learning_rate": 1.0299920879094372e-05, + "loss": 0.6189, + "step": 9603 + }, + { + "epoch": 0.8587267525035766, + "grad_norm": 0.12957562597846, + "learning_rate": 1.0287123434494827e-05, + "loss": 0.6063, + "step": 9604 + }, + { + "epoch": 0.8588161659513591, + "grad_norm": 0.13380940153929896, + "learning_rate": 1.027433351400594e-05, + "loss": 0.6071, + "step": 9605 + }, + { + "epoch": 0.8589055793991416, + "grad_norm": 0.15316840372630894, + "learning_rate": 1.0261551118700318e-05, + "loss": 0.6236, + "step": 9606 + }, + { + "epoch": 0.8589949928469242, + "grad_norm": 0.14641107627021452, + "learning_rate": 1.0248776249650027e-05, + "loss": 0.6133, + "step": 9607 + }, + { + "epoch": 0.8590844062947067, + "grad_norm": 0.1481290158878531, + "learning_rate": 1.0236008907926508e-05, + "loss": 0.6329, + "step": 9608 + }, + { + "epoch": 0.8591738197424893, + "grad_norm": 0.12287523583155525, + "learning_rate": 1.0223249094600485e-05, + "loss": 0.6278, + "step": 9609 + }, + { + "epoch": 0.8592632331902719, + "grad_norm": 0.12281625341056461, + "learning_rate": 1.0210496810742143e-05, + "loss": 0.6292, + "step": 9610 + }, + { + "epoch": 0.8593526466380543, + "grad_norm": 0.14339636999012786, + "learning_rate": 1.0197752057420995e-05, + "loss": 0.6436, + "step": 9611 + }, + { + "epoch": 0.8594420600858369, + "grad_norm": 0.1390544070008987, + "learning_rate": 1.018501483570592e-05, + "loss": 0.5737, + "step": 9612 + }, + { + "epoch": 0.8595314735336195, + "grad_norm": 0.13349170246455186, + "learning_rate": 1.0172285146665195e-05, + "loss": 0.6358, + "step": 9613 + }, + { + "epoch": 0.859620886981402, + "grad_norm": 0.13855056373387212, + "learning_rate": 1.0159562991366444e-05, + "loss": 0.6453, + "step": 9614 + }, + { + "epoch": 0.8597103004291845, + "grad_norm": 0.12462377972203594, + "learning_rate": 1.0146848370876627e-05, + "loss": 0.6153, + "step": 9615 + }, + { + "epoch": 0.8597997138769671, + "grad_norm": 0.15367895023097716, + "learning_rate": 1.013414128626211e-05, + "loss": 0.6673, + "step": 9616 + }, + { + "epoch": 0.8598891273247496, + "grad_norm": 0.13407293015522112, + "learning_rate": 1.0121441738588644e-05, + "loss": 0.6176, + "step": 9617 + }, + { + "epoch": 0.8599785407725322, + "grad_norm": 0.12931505522307085, + "learning_rate": 1.0108749728921319e-05, + "loss": 0.5632, + "step": 9618 + }, + { + "epoch": 0.8600679542203148, + "grad_norm": 0.11817709446932649, + "learning_rate": 1.0096065258324606e-05, + "loss": 0.6186, + "step": 9619 + }, + { + "epoch": 0.8601573676680973, + "grad_norm": 0.1461781646238627, + "learning_rate": 1.0083388327862298e-05, + "loss": 0.6483, + "step": 9620 + }, + { + "epoch": 0.8602467811158798, + "grad_norm": 0.1335268594409244, + "learning_rate": 1.0070718938597623e-05, + "loss": 0.5952, + "step": 9621 + }, + { + "epoch": 0.8603361945636624, + "grad_norm": 0.12838810701146222, + "learning_rate": 1.0058057091593154e-05, + "loss": 0.5654, + "step": 9622 + }, + { + "epoch": 0.860425608011445, + "grad_norm": 0.14986579179607448, + "learning_rate": 1.0045402787910818e-05, + "loss": 0.6717, + "step": 9623 + }, + { + "epoch": 0.8605150214592274, + "grad_norm": 0.15359794529527201, + "learning_rate": 1.0032756028611878e-05, + "loss": 0.6546, + "step": 9624 + }, + { + "epoch": 0.86060443490701, + "grad_norm": 0.12293106513774903, + "learning_rate": 1.0020116814757085e-05, + "loss": 0.5834, + "step": 9625 + }, + { + "epoch": 0.8606938483547926, + "grad_norm": 0.15363880327983617, + "learning_rate": 1.0007485147406404e-05, + "loss": 0.6211, + "step": 9626 + }, + { + "epoch": 0.8607832618025751, + "grad_norm": 0.1291240390318175, + "learning_rate": 9.99486102761925e-06, + "loss": 0.5879, + "step": 9627 + }, + { + "epoch": 0.8608726752503576, + "grad_norm": 0.136646327865932, + "learning_rate": 9.982244456454427e-06, + "loss": 0.6629, + "step": 9628 + }, + { + "epoch": 0.8609620886981402, + "grad_norm": 0.14661559034759997, + "learning_rate": 9.969635434970037e-06, + "loss": 0.6594, + "step": 9629 + }, + { + "epoch": 0.8610515021459227, + "grad_norm": 0.1434010510766151, + "learning_rate": 9.957033964223582e-06, + "loss": 0.6651, + "step": 9630 + }, + { + "epoch": 0.8611409155937053, + "grad_norm": 0.13402894386152656, + "learning_rate": 9.944440045271953e-06, + "loss": 0.6307, + "step": 9631 + }, + { + "epoch": 0.8612303290414879, + "grad_norm": 0.1436598317758283, + "learning_rate": 9.931853679171377e-06, + "loss": 0.6264, + "step": 9632 + }, + { + "epoch": 0.8613197424892703, + "grad_norm": 0.1478312007492944, + "learning_rate": 9.919274866977457e-06, + "loss": 0.6214, + "step": 9633 + }, + { + "epoch": 0.8614091559370529, + "grad_norm": 0.1567691089874801, + "learning_rate": 9.90670360974517e-06, + "loss": 0.6672, + "step": 9634 + }, + { + "epoch": 0.8614985693848355, + "grad_norm": 0.15111528993529638, + "learning_rate": 9.894139908528843e-06, + "loss": 0.6431, + "step": 9635 + }, + { + "epoch": 0.8615879828326181, + "grad_norm": 0.13019448780186468, + "learning_rate": 9.881583764382175e-06, + "loss": 0.6139, + "step": 9636 + }, + { + "epoch": 0.8616773962804005, + "grad_norm": 0.13889091529338232, + "learning_rate": 9.869035178358266e-06, + "loss": 0.6085, + "step": 9637 + }, + { + "epoch": 0.8617668097281831, + "grad_norm": 0.13934261106547308, + "learning_rate": 9.856494151509488e-06, + "loss": 0.578, + "step": 9638 + }, + { + "epoch": 0.8618562231759657, + "grad_norm": 0.1436323356223267, + "learning_rate": 9.84396068488771e-06, + "loss": 0.618, + "step": 9639 + }, + { + "epoch": 0.8619456366237482, + "grad_norm": 0.1459815532240673, + "learning_rate": 9.831434779544057e-06, + "loss": 0.6207, + "step": 9640 + }, + { + "epoch": 0.8620350500715308, + "grad_norm": 0.13945396077730784, + "learning_rate": 9.818916436529069e-06, + "loss": 0.6087, + "step": 9641 + }, + { + "epoch": 0.8621244635193133, + "grad_norm": 0.1496911644830662, + "learning_rate": 9.80640565689267e-06, + "loss": 0.6421, + "step": 9642 + }, + { + "epoch": 0.8622138769670958, + "grad_norm": 0.15755586920494194, + "learning_rate": 9.793902441684077e-06, + "loss": 0.6606, + "step": 9643 + }, + { + "epoch": 0.8623032904148784, + "grad_norm": 0.14976010535962825, + "learning_rate": 9.781406791951952e-06, + "loss": 0.6145, + "step": 9644 + }, + { + "epoch": 0.862392703862661, + "grad_norm": 0.14181363867370117, + "learning_rate": 9.76891870874428e-06, + "loss": 0.6092, + "step": 9645 + }, + { + "epoch": 0.8624821173104434, + "grad_norm": 0.1361793613487305, + "learning_rate": 9.756438193108419e-06, + "loss": 0.6159, + "step": 9646 + }, + { + "epoch": 0.862571530758226, + "grad_norm": 0.15622601516681028, + "learning_rate": 9.743965246091102e-06, + "loss": 0.683, + "step": 9647 + }, + { + "epoch": 0.8626609442060086, + "grad_norm": 0.13604445153849443, + "learning_rate": 9.731499868738447e-06, + "loss": 0.5767, + "step": 9648 + }, + { + "epoch": 0.8627503576537912, + "grad_norm": 0.12987592471794893, + "learning_rate": 9.719042062095851e-06, + "loss": 0.613, + "step": 9649 + }, + { + "epoch": 0.8628397711015737, + "grad_norm": 0.1423929193317585, + "learning_rate": 9.706591827208166e-06, + "loss": 0.6311, + "step": 9650 + }, + { + "epoch": 0.8629291845493562, + "grad_norm": 0.144947386334915, + "learning_rate": 9.694149165119603e-06, + "loss": 0.5857, + "step": 9651 + }, + { + "epoch": 0.8630185979971388, + "grad_norm": 0.12501755888898572, + "learning_rate": 9.68171407687365e-06, + "loss": 0.5863, + "step": 9652 + }, + { + "epoch": 0.8631080114449213, + "grad_norm": 0.15134092146355294, + "learning_rate": 9.66928656351329e-06, + "loss": 0.5953, + "step": 9653 + }, + { + "epoch": 0.8631974248927039, + "grad_norm": 0.158918798569772, + "learning_rate": 9.656866626080763e-06, + "loss": 0.6169, + "step": 9654 + }, + { + "epoch": 0.8632868383404864, + "grad_norm": 0.13173740808880402, + "learning_rate": 9.644454265617731e-06, + "loss": 0.63, + "step": 9655 + }, + { + "epoch": 0.8633762517882689, + "grad_norm": 0.14624285176714452, + "learning_rate": 9.632049483165184e-06, + "loss": 0.6305, + "step": 9656 + }, + { + "epoch": 0.8634656652360515, + "grad_norm": 0.13662079074162004, + "learning_rate": 9.619652279763536e-06, + "loss": 0.6258, + "step": 9657 + }, + { + "epoch": 0.8635550786838341, + "grad_norm": 0.15297651390826372, + "learning_rate": 9.607262656452475e-06, + "loss": 0.6365, + "step": 9658 + }, + { + "epoch": 0.8636444921316166, + "grad_norm": 0.16189432643257967, + "learning_rate": 9.59488061427114e-06, + "loss": 0.6658, + "step": 9659 + }, + { + "epoch": 0.8637339055793991, + "grad_norm": 0.13592171750050971, + "learning_rate": 9.582506154257976e-06, + "loss": 0.6163, + "step": 9660 + }, + { + "epoch": 0.8638233190271817, + "grad_norm": 0.13447568464203388, + "learning_rate": 9.57013927745083e-06, + "loss": 0.6547, + "step": 9661 + }, + { + "epoch": 0.8639127324749643, + "grad_norm": 0.14176548550200987, + "learning_rate": 9.557779984886905e-06, + "loss": 0.6643, + "step": 9662 + }, + { + "epoch": 0.8640021459227468, + "grad_norm": 0.13218121716518202, + "learning_rate": 9.545428277602731e-06, + "loss": 0.6595, + "step": 9663 + }, + { + "epoch": 0.8640915593705293, + "grad_norm": 0.1567173272475877, + "learning_rate": 9.533084156634242e-06, + "loss": 0.6431, + "step": 9664 + }, + { + "epoch": 0.8641809728183119, + "grad_norm": 0.14151214495372347, + "learning_rate": 9.520747623016747e-06, + "loss": 0.6213, + "step": 9665 + }, + { + "epoch": 0.8642703862660944, + "grad_norm": 0.13529176717620825, + "learning_rate": 9.508418677784847e-06, + "loss": 0.6177, + "step": 9666 + }, + { + "epoch": 0.864359799713877, + "grad_norm": 0.14362897678743577, + "learning_rate": 9.496097321972597e-06, + "loss": 0.625, + "step": 9667 + }, + { + "epoch": 0.8644492131616596, + "grad_norm": 0.1437034715568311, + "learning_rate": 9.48378355661339e-06, + "loss": 0.633, + "step": 9668 + }, + { + "epoch": 0.864538626609442, + "grad_norm": 0.14268452926602185, + "learning_rate": 9.471477382739912e-06, + "loss": 0.6227, + "step": 9669 + }, + { + "epoch": 0.8646280400572246, + "grad_norm": 0.13950295927968842, + "learning_rate": 9.459178801384304e-06, + "loss": 0.6352, + "step": 9670 + }, + { + "epoch": 0.8647174535050072, + "grad_norm": 0.1475607147018964, + "learning_rate": 9.446887813578031e-06, + "loss": 0.6103, + "step": 9671 + }, + { + "epoch": 0.8648068669527897, + "grad_norm": 0.16588012120268114, + "learning_rate": 9.434604420351911e-06, + "loss": 0.633, + "step": 9672 + }, + { + "epoch": 0.8648962804005722, + "grad_norm": 0.146552926626491, + "learning_rate": 9.422328622736142e-06, + "loss": 0.6394, + "step": 9673 + }, + { + "epoch": 0.8649856938483548, + "grad_norm": 0.1284164096531227, + "learning_rate": 9.41006042176027e-06, + "loss": 0.5898, + "step": 9674 + }, + { + "epoch": 0.8650751072961373, + "grad_norm": 0.15078948222499855, + "learning_rate": 9.397799818453235e-06, + "loss": 0.6524, + "step": 9675 + }, + { + "epoch": 0.8651645207439199, + "grad_norm": 0.13792390899401566, + "learning_rate": 9.385546813843326e-06, + "loss": 0.641, + "step": 9676 + }, + { + "epoch": 0.8652539341917024, + "grad_norm": 0.16139164795234542, + "learning_rate": 9.373301408958157e-06, + "loss": 0.6313, + "step": 9677 + }, + { + "epoch": 0.865343347639485, + "grad_norm": 0.1363482821213603, + "learning_rate": 9.361063604824738e-06, + "loss": 0.6254, + "step": 9678 + }, + { + "epoch": 0.8654327610872675, + "grad_norm": 0.13204699792940494, + "learning_rate": 9.34883340246946e-06, + "loss": 0.6519, + "step": 9679 + }, + { + "epoch": 0.8655221745350501, + "grad_norm": 0.1374960267669322, + "learning_rate": 9.336610802918044e-06, + "loss": 0.6334, + "step": 9680 + }, + { + "epoch": 0.8656115879828327, + "grad_norm": 0.15206809647399222, + "learning_rate": 9.324395807195585e-06, + "loss": 0.6558, + "step": 9681 + }, + { + "epoch": 0.8657010014306151, + "grad_norm": 0.14422787178625407, + "learning_rate": 9.312188416326562e-06, + "loss": 0.6335, + "step": 9682 + }, + { + "epoch": 0.8657904148783977, + "grad_norm": 0.12205396786236666, + "learning_rate": 9.299988631334755e-06, + "loss": 0.5795, + "step": 9683 + }, + { + "epoch": 0.8658798283261803, + "grad_norm": 0.13944863708270475, + "learning_rate": 9.287796453243358e-06, + "loss": 0.6497, + "step": 9684 + }, + { + "epoch": 0.8659692417739628, + "grad_norm": 0.1295428428940114, + "learning_rate": 9.275611883074941e-06, + "loss": 0.646, + "step": 9685 + }, + { + "epoch": 0.8660586552217453, + "grad_norm": 0.13028926548097883, + "learning_rate": 9.263434921851377e-06, + "loss": 0.6638, + "step": 9686 + }, + { + "epoch": 0.8661480686695279, + "grad_norm": 0.13659506394553292, + "learning_rate": 9.251265570593914e-06, + "loss": 0.5898, + "step": 9687 + }, + { + "epoch": 0.8662374821173104, + "grad_norm": 0.16098779513536451, + "learning_rate": 9.23910383032326e-06, + "loss": 0.683, + "step": 9688 + }, + { + "epoch": 0.866326895565093, + "grad_norm": 0.1628977665778722, + "learning_rate": 9.226949702059329e-06, + "loss": 0.6589, + "step": 9689 + }, + { + "epoch": 0.8664163090128756, + "grad_norm": 0.14744166350319296, + "learning_rate": 9.214803186821497e-06, + "loss": 0.6395, + "step": 9690 + }, + { + "epoch": 0.866505722460658, + "grad_norm": 0.15806127625355354, + "learning_rate": 9.202664285628504e-06, + "loss": 0.6116, + "step": 9691 + }, + { + "epoch": 0.8665951359084406, + "grad_norm": 0.15037584149588645, + "learning_rate": 9.190532999498392e-06, + "loss": 0.6121, + "step": 9692 + }, + { + "epoch": 0.8666845493562232, + "grad_norm": 0.15373226830766845, + "learning_rate": 9.178409329448601e-06, + "loss": 0.6702, + "step": 9693 + }, + { + "epoch": 0.8667739628040058, + "grad_norm": 0.14259956690841907, + "learning_rate": 9.16629327649593e-06, + "loss": 0.6371, + "step": 9694 + }, + { + "epoch": 0.8668633762517882, + "grad_norm": 0.15447266354286876, + "learning_rate": 9.154184841656544e-06, + "loss": 0.6447, + "step": 9695 + }, + { + "epoch": 0.8669527896995708, + "grad_norm": 0.1320964589025961, + "learning_rate": 9.142084025945984e-06, + "loss": 0.5897, + "step": 9696 + }, + { + "epoch": 0.8670422031473534, + "grad_norm": 0.1343844812328797, + "learning_rate": 9.129990830379087e-06, + "loss": 0.6607, + "step": 9697 + }, + { + "epoch": 0.8671316165951359, + "grad_norm": 0.13381392313968446, + "learning_rate": 9.117905255970116e-06, + "loss": 0.6214, + "step": 9698 + }, + { + "epoch": 0.8672210300429185, + "grad_norm": 0.14086086643203904, + "learning_rate": 9.105827303732695e-06, + "loss": 0.6385, + "step": 9699 + }, + { + "epoch": 0.867310443490701, + "grad_norm": 0.13945498852968882, + "learning_rate": 9.093756974679746e-06, + "loss": 0.6501, + "step": 9700 + }, + { + "epoch": 0.8673998569384835, + "grad_norm": 0.1392424073358879, + "learning_rate": 9.081694269823582e-06, + "loss": 0.6153, + "step": 9701 + }, + { + "epoch": 0.8674892703862661, + "grad_norm": 0.14513657319375695, + "learning_rate": 9.069639190175972e-06, + "loss": 0.6487, + "step": 9702 + }, + { + "epoch": 0.8675786838340487, + "grad_norm": 0.15706042038201093, + "learning_rate": 9.057591736747883e-06, + "loss": 0.686, + "step": 9703 + }, + { + "epoch": 0.8676680972818311, + "grad_norm": 0.14986629851158267, + "learning_rate": 9.045551910549744e-06, + "loss": 0.6336, + "step": 9704 + }, + { + "epoch": 0.8677575107296137, + "grad_norm": 0.14620385254036233, + "learning_rate": 9.033519712591332e-06, + "loss": 0.6452, + "step": 9705 + }, + { + "epoch": 0.8678469241773963, + "grad_norm": 0.1394680472961103, + "learning_rate": 9.021495143881753e-06, + "loss": 0.6553, + "step": 9706 + }, + { + "epoch": 0.8679363376251789, + "grad_norm": 0.13936142808883367, + "learning_rate": 9.0094782054295e-06, + "loss": 0.629, + "step": 9707 + }, + { + "epoch": 0.8680257510729614, + "grad_norm": 0.1262691951664836, + "learning_rate": 8.997468898242422e-06, + "loss": 0.6392, + "step": 9708 + }, + { + "epoch": 0.8681151645207439, + "grad_norm": 0.13843024414261682, + "learning_rate": 8.985467223327726e-06, + "loss": 0.6304, + "step": 9709 + }, + { + "epoch": 0.8682045779685265, + "grad_norm": 0.1502000820834651, + "learning_rate": 8.973473181691993e-06, + "loss": 0.6476, + "step": 9710 + }, + { + "epoch": 0.868293991416309, + "grad_norm": 0.12138428626100427, + "learning_rate": 8.96148677434111e-06, + "loss": 0.6125, + "step": 9711 + }, + { + "epoch": 0.8683834048640916, + "grad_norm": 0.15506666070616074, + "learning_rate": 8.949508002280382e-06, + "loss": 0.642, + "step": 9712 + }, + { + "epoch": 0.8684728183118741, + "grad_norm": 0.1468210261424248, + "learning_rate": 8.937536866514462e-06, + "loss": 0.6151, + "step": 9713 + }, + { + "epoch": 0.8685622317596566, + "grad_norm": 0.13083409985123262, + "learning_rate": 8.925573368047358e-06, + "loss": 0.6165, + "step": 9714 + }, + { + "epoch": 0.8686516452074392, + "grad_norm": 0.14939806585595347, + "learning_rate": 8.91361750788241e-06, + "loss": 0.6458, + "step": 9715 + }, + { + "epoch": 0.8687410586552218, + "grad_norm": 0.12654622087884204, + "learning_rate": 8.901669287022384e-06, + "loss": 0.6072, + "step": 9716 + }, + { + "epoch": 0.8688304721030042, + "grad_norm": 0.14149579278298208, + "learning_rate": 8.889728706469314e-06, + "loss": 0.6297, + "step": 9717 + }, + { + "epoch": 0.8689198855507868, + "grad_norm": 0.1245290094622808, + "learning_rate": 8.877795767224672e-06, + "loss": 0.6455, + "step": 9718 + }, + { + "epoch": 0.8690092989985694, + "grad_norm": 0.1507818422527502, + "learning_rate": 8.86587047028926e-06, + "loss": 0.6184, + "step": 9719 + }, + { + "epoch": 0.869098712446352, + "grad_norm": 0.13124389789342558, + "learning_rate": 8.853952816663213e-06, + "loss": 0.6342, + "step": 9720 + }, + { + "epoch": 0.8691881258941345, + "grad_norm": 0.15101860219862928, + "learning_rate": 8.842042807346051e-06, + "loss": 0.6306, + "step": 9721 + }, + { + "epoch": 0.869277539341917, + "grad_norm": 0.164321018695269, + "learning_rate": 8.830140443336699e-06, + "loss": 0.6422, + "step": 9722 + }, + { + "epoch": 0.8693669527896996, + "grad_norm": 0.12980997726154342, + "learning_rate": 8.818245725633356e-06, + "loss": 0.6416, + "step": 9723 + }, + { + "epoch": 0.8694563662374821, + "grad_norm": 0.14127162695649975, + "learning_rate": 8.806358655233615e-06, + "loss": 0.6746, + "step": 9724 + }, + { + "epoch": 0.8695457796852647, + "grad_norm": 0.1371462448739538, + "learning_rate": 8.794479233134456e-06, + "loss": 0.5879, + "step": 9725 + }, + { + "epoch": 0.8696351931330472, + "grad_norm": 0.1459660611025314, + "learning_rate": 8.78260746033217e-06, + "loss": 0.6569, + "step": 9726 + }, + { + "epoch": 0.8697246065808297, + "grad_norm": 0.15228345824405362, + "learning_rate": 8.770743337822418e-06, + "loss": 0.6122, + "step": 9727 + }, + { + "epoch": 0.8698140200286123, + "grad_norm": 0.14209754285986606, + "learning_rate": 8.758886866600257e-06, + "loss": 0.6145, + "step": 9728 + }, + { + "epoch": 0.8699034334763949, + "grad_norm": 0.16057438414468111, + "learning_rate": 8.74703804766005e-06, + "loss": 0.64, + "step": 9729 + }, + { + "epoch": 0.8699928469241774, + "grad_norm": 0.14074370090263866, + "learning_rate": 8.735196881995589e-06, + "loss": 0.6319, + "step": 9730 + }, + { + "epoch": 0.8700822603719599, + "grad_norm": 0.1130465803578544, + "learning_rate": 8.723363370599924e-06, + "loss": 0.6043, + "step": 9731 + }, + { + "epoch": 0.8701716738197425, + "grad_norm": 0.139020252393108, + "learning_rate": 8.71153751446553e-06, + "loss": 0.6107, + "step": 9732 + }, + { + "epoch": 0.870261087267525, + "grad_norm": 0.14104296352927567, + "learning_rate": 8.699719314584265e-06, + "loss": 0.6363, + "step": 9733 + }, + { + "epoch": 0.8703505007153076, + "grad_norm": 0.13601567586872682, + "learning_rate": 8.687908771947251e-06, + "loss": 0.6156, + "step": 9734 + }, + { + "epoch": 0.8704399141630901, + "grad_norm": 0.15016750687751346, + "learning_rate": 8.676105887545039e-06, + "loss": 0.6452, + "step": 9735 + }, + { + "epoch": 0.8705293276108726, + "grad_norm": 0.14354852996763076, + "learning_rate": 8.66431066236757e-06, + "loss": 0.6295, + "step": 9736 + }, + { + "epoch": 0.8706187410586552, + "grad_norm": 0.15564022849923698, + "learning_rate": 8.652523097404042e-06, + "loss": 0.6241, + "step": 9737 + }, + { + "epoch": 0.8707081545064378, + "grad_norm": 0.13423328323092182, + "learning_rate": 8.640743193643075e-06, + "loss": 0.6451, + "step": 9738 + }, + { + "epoch": 0.8707975679542204, + "grad_norm": 0.1301058114551674, + "learning_rate": 8.628970952072667e-06, + "loss": 0.6116, + "step": 9739 + }, + { + "epoch": 0.8708869814020028, + "grad_norm": 0.13966878007281855, + "learning_rate": 8.617206373680098e-06, + "loss": 0.6083, + "step": 9740 + }, + { + "epoch": 0.8709763948497854, + "grad_norm": 0.13624837781425528, + "learning_rate": 8.605449459452075e-06, + "loss": 0.6075, + "step": 9741 + }, + { + "epoch": 0.871065808297568, + "grad_norm": 0.13956738318114711, + "learning_rate": 8.593700210374622e-06, + "loss": 0.6216, + "step": 9742 + }, + { + "epoch": 0.8711552217453505, + "grad_norm": 0.14745756342619556, + "learning_rate": 8.58195862743314e-06, + "loss": 0.6066, + "step": 9743 + }, + { + "epoch": 0.871244635193133, + "grad_norm": 0.11922713342184216, + "learning_rate": 8.570224711612385e-06, + "loss": 0.5806, + "step": 9744 + }, + { + "epoch": 0.8713340486409156, + "grad_norm": 0.14978679455828006, + "learning_rate": 8.55849846389648e-06, + "loss": 0.6145, + "step": 9745 + }, + { + "epoch": 0.8714234620886981, + "grad_norm": 0.147300021289893, + "learning_rate": 8.546779885268863e-06, + "loss": 0.6231, + "step": 9746 + }, + { + "epoch": 0.8715128755364807, + "grad_norm": 0.14044295728635428, + "learning_rate": 8.535068976712368e-06, + "loss": 0.6504, + "step": 9747 + }, + { + "epoch": 0.8716022889842633, + "grad_norm": 0.1337258360262597, + "learning_rate": 8.523365739209188e-06, + "loss": 0.6523, + "step": 9748 + }, + { + "epoch": 0.8716917024320457, + "grad_norm": 0.13748580478102293, + "learning_rate": 8.511670173740816e-06, + "loss": 0.6271, + "step": 9749 + }, + { + "epoch": 0.8717811158798283, + "grad_norm": 0.13924829877746653, + "learning_rate": 8.499982281288221e-06, + "loss": 0.6188, + "step": 9750 + }, + { + "epoch": 0.8718705293276109, + "grad_norm": 0.14408836054116136, + "learning_rate": 8.488302062831576e-06, + "loss": 0.635, + "step": 9751 + }, + { + "epoch": 0.8719599427753935, + "grad_norm": 0.14719932061679533, + "learning_rate": 8.476629519350532e-06, + "loss": 0.6263, + "step": 9752 + }, + { + "epoch": 0.8720493562231759, + "grad_norm": 0.13914484855077083, + "learning_rate": 8.464964651824048e-06, + "loss": 0.6875, + "step": 9753 + }, + { + "epoch": 0.8721387696709585, + "grad_norm": 0.14620448896114324, + "learning_rate": 8.453307461230409e-06, + "loss": 0.6255, + "step": 9754 + }, + { + "epoch": 0.8722281831187411, + "grad_norm": 0.13027259811900935, + "learning_rate": 8.441657948547322e-06, + "loss": 0.6184, + "step": 9755 + }, + { + "epoch": 0.8723175965665236, + "grad_norm": 0.12947899179696026, + "learning_rate": 8.430016114751805e-06, + "loss": 0.6242, + "step": 9756 + }, + { + "epoch": 0.8724070100143062, + "grad_norm": 0.13582175226566084, + "learning_rate": 8.418381960820243e-06, + "loss": 0.6033, + "step": 9757 + }, + { + "epoch": 0.8724964234620887, + "grad_norm": 0.12331225991433198, + "learning_rate": 8.40675548772839e-06, + "loss": 0.6023, + "step": 9758 + }, + { + "epoch": 0.8725858369098712, + "grad_norm": 0.15107338303650775, + "learning_rate": 8.395136696451355e-06, + "loss": 0.6288, + "step": 9759 + }, + { + "epoch": 0.8726752503576538, + "grad_norm": 0.13808066087830467, + "learning_rate": 8.383525587963558e-06, + "loss": 0.6338, + "step": 9760 + }, + { + "epoch": 0.8727646638054364, + "grad_norm": 0.1608302538935996, + "learning_rate": 8.371922163238821e-06, + "loss": 0.6377, + "step": 9761 + }, + { + "epoch": 0.8728540772532188, + "grad_norm": 0.14450920073034598, + "learning_rate": 8.36032642325033e-06, + "loss": 0.6067, + "step": 9762 + }, + { + "epoch": 0.8729434907010014, + "grad_norm": 0.13134329024264482, + "learning_rate": 8.348738368970566e-06, + "loss": 0.6279, + "step": 9763 + }, + { + "epoch": 0.873032904148784, + "grad_norm": 0.13168295887477477, + "learning_rate": 8.337158001371449e-06, + "loss": 0.6377, + "step": 9764 + }, + { + "epoch": 0.8731223175965666, + "grad_norm": 0.1548259120161206, + "learning_rate": 8.325585321424178e-06, + "loss": 0.6301, + "step": 9765 + }, + { + "epoch": 0.873211731044349, + "grad_norm": 0.14592608674164653, + "learning_rate": 8.314020330099348e-06, + "loss": 0.6228, + "step": 9766 + }, + { + "epoch": 0.8733011444921316, + "grad_norm": 0.12913491501352367, + "learning_rate": 8.302463028366924e-06, + "loss": 0.6408, + "step": 9767 + }, + { + "epoch": 0.8733905579399142, + "grad_norm": 0.1310804398971361, + "learning_rate": 8.290913417196177e-06, + "loss": 0.6339, + "step": 9768 + }, + { + "epoch": 0.8734799713876967, + "grad_norm": 0.1545017972543344, + "learning_rate": 8.279371497555755e-06, + "loss": 0.6229, + "step": 9769 + }, + { + "epoch": 0.8735693848354793, + "grad_norm": 0.15922466769648888, + "learning_rate": 8.26783727041367e-06, + "loss": 0.6348, + "step": 9770 + }, + { + "epoch": 0.8736587982832618, + "grad_norm": 0.13578998377593116, + "learning_rate": 8.256310736737294e-06, + "loss": 0.5852, + "step": 9771 + }, + { + "epoch": 0.8737482117310443, + "grad_norm": 0.13026628204204987, + "learning_rate": 8.244791897493342e-06, + "loss": 0.6182, + "step": 9772 + }, + { + "epoch": 0.8738376251788269, + "grad_norm": 0.152471456971664, + "learning_rate": 8.233280753647887e-06, + "loss": 0.647, + "step": 9773 + }, + { + "epoch": 0.8739270386266095, + "grad_norm": 0.14177865760123082, + "learning_rate": 8.221777306166346e-06, + "loss": 0.6086, + "step": 9774 + }, + { + "epoch": 0.8740164520743919, + "grad_norm": 0.13290011550276765, + "learning_rate": 8.210281556013489e-06, + "loss": 0.6123, + "step": 9775 + }, + { + "epoch": 0.8741058655221745, + "grad_norm": 0.13704155070724477, + "learning_rate": 8.19879350415349e-06, + "loss": 0.6215, + "step": 9776 + }, + { + "epoch": 0.8741952789699571, + "grad_norm": 0.12959441236907088, + "learning_rate": 8.18731315154978e-06, + "loss": 0.6182, + "step": 9777 + }, + { + "epoch": 0.8742846924177397, + "grad_norm": 0.13714430968133862, + "learning_rate": 8.175840499165244e-06, + "loss": 0.6234, + "step": 9778 + }, + { + "epoch": 0.8743741058655222, + "grad_norm": 0.12650239124980941, + "learning_rate": 8.16437554796209e-06, + "loss": 0.6181, + "step": 9779 + }, + { + "epoch": 0.8744635193133047, + "grad_norm": 0.1263569111866498, + "learning_rate": 8.152918298901836e-06, + "loss": 0.5519, + "step": 9780 + }, + { + "epoch": 0.8745529327610873, + "grad_norm": 0.13068483204969492, + "learning_rate": 8.141468752945392e-06, + "loss": 0.6156, + "step": 9781 + }, + { + "epoch": 0.8746423462088698, + "grad_norm": 0.13776141118270874, + "learning_rate": 8.130026911053045e-06, + "loss": 0.6114, + "step": 9782 + }, + { + "epoch": 0.8747317596566524, + "grad_norm": 0.1330644297053598, + "learning_rate": 8.118592774184385e-06, + "loss": 0.6096, + "step": 9783 + }, + { + "epoch": 0.8748211731044349, + "grad_norm": 0.15107272241720252, + "learning_rate": 8.107166343298377e-06, + "loss": 0.6573, + "step": 9784 + }, + { + "epoch": 0.8749105865522174, + "grad_norm": 0.1348068859557533, + "learning_rate": 8.095747619353345e-06, + "loss": 0.6359, + "step": 9785 + }, + { + "epoch": 0.875, + "grad_norm": 0.1300739083147111, + "learning_rate": 8.084336603306974e-06, + "loss": 0.635, + "step": 9786 + }, + { + "epoch": 0.8750894134477826, + "grad_norm": 0.139275091301925, + "learning_rate": 8.072933296116303e-06, + "loss": 0.6404, + "step": 9787 + }, + { + "epoch": 0.8751788268955651, + "grad_norm": 0.15228356358697842, + "learning_rate": 8.061537698737675e-06, + "loss": 0.6592, + "step": 9788 + }, + { + "epoch": 0.8752682403433476, + "grad_norm": 0.14931974308941778, + "learning_rate": 8.05014981212685e-06, + "loss": 0.6368, + "step": 9789 + }, + { + "epoch": 0.8753576537911302, + "grad_norm": 0.15752863371019396, + "learning_rate": 8.038769637238907e-06, + "loss": 0.6205, + "step": 9790 + }, + { + "epoch": 0.8754470672389127, + "grad_norm": 0.13689078795490028, + "learning_rate": 8.027397175028305e-06, + "loss": 0.6533, + "step": 9791 + }, + { + "epoch": 0.8755364806866953, + "grad_norm": 0.1394730271496094, + "learning_rate": 8.016032426448817e-06, + "loss": 0.6545, + "step": 9792 + }, + { + "epoch": 0.8756258941344778, + "grad_norm": 0.12124227792988489, + "learning_rate": 8.00467539245362e-06, + "loss": 0.5916, + "step": 9793 + }, + { + "epoch": 0.8757153075822603, + "grad_norm": 0.15604174294733886, + "learning_rate": 7.993326073995189e-06, + "loss": 0.6382, + "step": 9794 + }, + { + "epoch": 0.8758047210300429, + "grad_norm": 0.13524051249752647, + "learning_rate": 7.981984472025372e-06, + "loss": 0.6327, + "step": 9795 + }, + { + "epoch": 0.8758941344778255, + "grad_norm": 0.15078814569551685, + "learning_rate": 7.97065058749541e-06, + "loss": 0.6759, + "step": 9796 + }, + { + "epoch": 0.8759835479256081, + "grad_norm": 0.13634696605330435, + "learning_rate": 7.959324421355797e-06, + "loss": 0.6041, + "step": 9797 + }, + { + "epoch": 0.8760729613733905, + "grad_norm": 0.15768511686041736, + "learning_rate": 7.948005974556539e-06, + "loss": 0.6506, + "step": 9798 + }, + { + "epoch": 0.8761623748211731, + "grad_norm": 0.15352608302145862, + "learning_rate": 7.936695248046822e-06, + "loss": 0.6586, + "step": 9799 + }, + { + "epoch": 0.8762517882689557, + "grad_norm": 0.14381470446314285, + "learning_rate": 7.925392242775288e-06, + "loss": 0.6203, + "step": 9800 + }, + { + "epoch": 0.8763412017167382, + "grad_norm": 0.14504493502913912, + "learning_rate": 7.91409695968991e-06, + "loss": 0.6277, + "step": 9801 + }, + { + "epoch": 0.8764306151645207, + "grad_norm": 0.147200147830882, + "learning_rate": 7.90280939973802e-06, + "loss": 0.6173, + "step": 9802 + }, + { + "epoch": 0.8765200286123033, + "grad_norm": 0.1317729213875328, + "learning_rate": 7.891529563866274e-06, + "loss": 0.5805, + "step": 9803 + }, + { + "epoch": 0.8766094420600858, + "grad_norm": 0.1507832648660823, + "learning_rate": 7.8802574530207e-06, + "loss": 0.6379, + "step": 9804 + }, + { + "epoch": 0.8766988555078684, + "grad_norm": 0.1292752895238864, + "learning_rate": 7.86899306814668e-06, + "loss": 0.6182, + "step": 9805 + }, + { + "epoch": 0.876788268955651, + "grad_norm": 0.12493310491046589, + "learning_rate": 7.857736410188953e-06, + "loss": 0.6171, + "step": 9806 + }, + { + "epoch": 0.8768776824034334, + "grad_norm": 0.13533224442449002, + "learning_rate": 7.846487480091603e-06, + "loss": 0.6224, + "step": 9807 + }, + { + "epoch": 0.876967095851216, + "grad_norm": 0.12385478299185966, + "learning_rate": 7.835246278798037e-06, + "loss": 0.6242, + "step": 9808 + }, + { + "epoch": 0.8770565092989986, + "grad_norm": 0.1521340816503355, + "learning_rate": 7.824012807251058e-06, + "loss": 0.6418, + "step": 9809 + }, + { + "epoch": 0.8771459227467812, + "grad_norm": 0.1446924163547891, + "learning_rate": 7.812787066392825e-06, + "loss": 0.62, + "step": 9810 + }, + { + "epoch": 0.8772353361945636, + "grad_norm": 0.15544800687067964, + "learning_rate": 7.80156905716477e-06, + "loss": 0.6609, + "step": 9811 + }, + { + "epoch": 0.8773247496423462, + "grad_norm": 0.1353439072903216, + "learning_rate": 7.790358780507789e-06, + "loss": 0.6376, + "step": 9812 + }, + { + "epoch": 0.8774141630901288, + "grad_norm": 0.15351847049948522, + "learning_rate": 7.779156237362084e-06, + "loss": 0.6437, + "step": 9813 + }, + { + "epoch": 0.8775035765379113, + "grad_norm": 0.13826660043669783, + "learning_rate": 7.767961428667136e-06, + "loss": 0.6292, + "step": 9814 + }, + { + "epoch": 0.8775929899856938, + "grad_norm": 0.13648485816492092, + "learning_rate": 7.756774355361884e-06, + "loss": 0.6026, + "step": 9815 + }, + { + "epoch": 0.8776824034334764, + "grad_norm": 0.14552867774655154, + "learning_rate": 7.745595018384578e-06, + "loss": 0.6143, + "step": 9816 + }, + { + "epoch": 0.8777718168812589, + "grad_norm": 0.12187910266240148, + "learning_rate": 7.734423418672786e-06, + "loss": 0.6049, + "step": 9817 + }, + { + "epoch": 0.8778612303290415, + "grad_norm": 0.1431824392983821, + "learning_rate": 7.723259557163487e-06, + "loss": 0.6359, + "step": 9818 + }, + { + "epoch": 0.8779506437768241, + "grad_norm": 0.16162538445965705, + "learning_rate": 7.71210343479295e-06, + "loss": 0.6533, + "step": 9819 + }, + { + "epoch": 0.8780400572246065, + "grad_norm": 0.15198325575683053, + "learning_rate": 7.70095505249685e-06, + "loss": 0.6609, + "step": 9820 + }, + { + "epoch": 0.8781294706723891, + "grad_norm": 0.12829269095394774, + "learning_rate": 7.689814411210195e-06, + "loss": 0.5925, + "step": 9821 + }, + { + "epoch": 0.8782188841201717, + "grad_norm": 0.15073440368827587, + "learning_rate": 7.678681511867304e-06, + "loss": 0.6221, + "step": 9822 + }, + { + "epoch": 0.8783082975679543, + "grad_norm": 0.15142674005833695, + "learning_rate": 7.667556355401906e-06, + "loss": 0.6754, + "step": 9823 + }, + { + "epoch": 0.8783977110157367, + "grad_norm": 0.15249841706734898, + "learning_rate": 7.656438942747058e-06, + "loss": 0.6839, + "step": 9824 + }, + { + "epoch": 0.8784871244635193, + "grad_norm": 0.15511428959777424, + "learning_rate": 7.645329274835122e-06, + "loss": 0.6567, + "step": 9825 + }, + { + "epoch": 0.8785765379113019, + "grad_norm": 0.14310124472713265, + "learning_rate": 7.634227352597901e-06, + "loss": 0.632, + "step": 9826 + }, + { + "epoch": 0.8786659513590844, + "grad_norm": 0.13337316466614416, + "learning_rate": 7.623133176966491e-06, + "loss": 0.6153, + "step": 9827 + }, + { + "epoch": 0.878755364806867, + "grad_norm": 0.14241715699275811, + "learning_rate": 7.612046748871327e-06, + "loss": 0.6362, + "step": 9828 + }, + { + "epoch": 0.8788447782546495, + "grad_norm": 0.14891425292531937, + "learning_rate": 7.600968069242232e-06, + "loss": 0.6628, + "step": 9829 + }, + { + "epoch": 0.878934191702432, + "grad_norm": 0.1431355627668148, + "learning_rate": 7.589897139008362e-06, + "loss": 0.6477, + "step": 9830 + }, + { + "epoch": 0.8790236051502146, + "grad_norm": 0.14017120265550798, + "learning_rate": 7.578833959098209e-06, + "loss": 0.6174, + "step": 9831 + }, + { + "epoch": 0.8791130185979972, + "grad_norm": 0.1401079284422178, + "learning_rate": 7.567778530439606e-06, + "loss": 0.6317, + "step": 9832 + }, + { + "epoch": 0.8792024320457796, + "grad_norm": 0.14572718812110286, + "learning_rate": 7.5567308539598256e-06, + "loss": 0.6335, + "step": 9833 + }, + { + "epoch": 0.8792918454935622, + "grad_norm": 0.12644438267543978, + "learning_rate": 7.545690930585381e-06, + "loss": 0.5571, + "step": 9834 + }, + { + "epoch": 0.8793812589413448, + "grad_norm": 0.13784345939758416, + "learning_rate": 7.534658761242164e-06, + "loss": 0.6774, + "step": 9835 + }, + { + "epoch": 0.8794706723891274, + "grad_norm": 0.13646567974147106, + "learning_rate": 7.52363434685548e-06, + "loss": 0.6446, + "step": 9836 + }, + { + "epoch": 0.8795600858369099, + "grad_norm": 0.12429393809205243, + "learning_rate": 7.512617688349866e-06, + "loss": 0.603, + "step": 9837 + }, + { + "epoch": 0.8796494992846924, + "grad_norm": 0.12650620710655633, + "learning_rate": 7.501608786649328e-06, + "loss": 0.6239, + "step": 9838 + }, + { + "epoch": 0.879738912732475, + "grad_norm": 0.15745543668526368, + "learning_rate": 7.490607642677139e-06, + "loss": 0.6419, + "step": 9839 + }, + { + "epoch": 0.8798283261802575, + "grad_norm": 0.14452221813017016, + "learning_rate": 7.479614257355971e-06, + "loss": 0.6448, + "step": 9840 + }, + { + "epoch": 0.8799177396280401, + "grad_norm": 0.15031026725129126, + "learning_rate": 7.468628631607822e-06, + "loss": 0.6422, + "step": 9841 + }, + { + "epoch": 0.8800071530758226, + "grad_norm": 0.14277729019241395, + "learning_rate": 7.45765076635404e-06, + "loss": 0.6439, + "step": 9842 + }, + { + "epoch": 0.8800965665236051, + "grad_norm": 0.12906341447289088, + "learning_rate": 7.446680662515315e-06, + "loss": 0.6285, + "step": 9843 + }, + { + "epoch": 0.8801859799713877, + "grad_norm": 0.13907701853235166, + "learning_rate": 7.435718321011731e-06, + "loss": 0.5644, + "step": 9844 + }, + { + "epoch": 0.8802753934191703, + "grad_norm": 0.13121306017067472, + "learning_rate": 7.424763742762642e-06, + "loss": 0.6598, + "step": 9845 + }, + { + "epoch": 0.8803648068669528, + "grad_norm": 0.15110035377709868, + "learning_rate": 7.41381692868679e-06, + "loss": 0.6649, + "step": 9846 + }, + { + "epoch": 0.8804542203147353, + "grad_norm": 0.13067173892075173, + "learning_rate": 7.402877879702341e-06, + "loss": 0.6179, + "step": 9847 + }, + { + "epoch": 0.8805436337625179, + "grad_norm": 0.14560576172852874, + "learning_rate": 7.391946596726673e-06, + "loss": 0.6348, + "step": 9848 + }, + { + "epoch": 0.8806330472103004, + "grad_norm": 0.15126451072657984, + "learning_rate": 7.381023080676608e-06, + "loss": 0.5889, + "step": 9849 + }, + { + "epoch": 0.880722460658083, + "grad_norm": 0.13751243017932654, + "learning_rate": 7.3701073324682905e-06, + "loss": 0.6297, + "step": 9850 + }, + { + "epoch": 0.8808118741058655, + "grad_norm": 0.13360662847689198, + "learning_rate": 7.3591993530171984e-06, + "loss": 0.5989, + "step": 9851 + }, + { + "epoch": 0.880901287553648, + "grad_norm": 0.14587770228739072, + "learning_rate": 7.348299143238157e-06, + "loss": 0.6129, + "step": 9852 + }, + { + "epoch": 0.8809907010014306, + "grad_norm": 0.1467778481546591, + "learning_rate": 7.33740670404538e-06, + "loss": 0.6103, + "step": 9853 + }, + { + "epoch": 0.8810801144492132, + "grad_norm": 0.13353080958716732, + "learning_rate": 7.326522036352401e-06, + "loss": 0.6221, + "step": 9854 + }, + { + "epoch": 0.8811695278969958, + "grad_norm": 0.1504306224938409, + "learning_rate": 7.315645141072103e-06, + "loss": 0.6173, + "step": 9855 + }, + { + "epoch": 0.8812589413447782, + "grad_norm": 0.13702226506738088, + "learning_rate": 7.30477601911671e-06, + "loss": 0.6386, + "step": 9856 + }, + { + "epoch": 0.8813483547925608, + "grad_norm": 0.11561731260896949, + "learning_rate": 7.293914671397795e-06, + "loss": 0.6019, + "step": 9857 + }, + { + "epoch": 0.8814377682403434, + "grad_norm": 0.14155005804859763, + "learning_rate": 7.283061098826294e-06, + "loss": 0.6247, + "step": 9858 + }, + { + "epoch": 0.8815271816881259, + "grad_norm": 0.15383248871518604, + "learning_rate": 7.272215302312502e-06, + "loss": 0.6324, + "step": 9859 + }, + { + "epoch": 0.8816165951359084, + "grad_norm": 0.13544421294699804, + "learning_rate": 7.261377282766002e-06, + "loss": 0.6598, + "step": 9860 + }, + { + "epoch": 0.881706008583691, + "grad_norm": 0.1445065973161384, + "learning_rate": 7.250547041095812e-06, + "loss": 0.6081, + "step": 9861 + }, + { + "epoch": 0.8817954220314735, + "grad_norm": 0.1422046152593404, + "learning_rate": 7.239724578210216e-06, + "loss": 0.6156, + "step": 9862 + }, + { + "epoch": 0.8818848354792561, + "grad_norm": 0.15113943989064857, + "learning_rate": 7.2289098950168995e-06, + "loss": 0.6003, + "step": 9863 + }, + { + "epoch": 0.8819742489270386, + "grad_norm": 0.12293511477778155, + "learning_rate": 7.2181029924228814e-06, + "loss": 0.6238, + "step": 9864 + }, + { + "epoch": 0.8820636623748211, + "grad_norm": 0.1420829526982219, + "learning_rate": 7.207303871334492e-06, + "loss": 0.6523, + "step": 9865 + }, + { + "epoch": 0.8821530758226037, + "grad_norm": 0.1502945935621625, + "learning_rate": 7.1965125326574735e-06, + "loss": 0.6371, + "step": 9866 + }, + { + "epoch": 0.8822424892703863, + "grad_norm": 0.17663615020005555, + "learning_rate": 7.185728977296857e-06, + "loss": 0.6301, + "step": 9867 + }, + { + "epoch": 0.8823319027181689, + "grad_norm": 0.12427820252600605, + "learning_rate": 7.174953206157064e-06, + "loss": 0.6173, + "step": 9868 + }, + { + "epoch": 0.8824213161659513, + "grad_norm": 0.1407830094516203, + "learning_rate": 7.16418522014185e-06, + "loss": 0.6156, + "step": 9869 + }, + { + "epoch": 0.8825107296137339, + "grad_norm": 0.1437912817615683, + "learning_rate": 7.153425020154314e-06, + "loss": 0.6373, + "step": 9870 + }, + { + "epoch": 0.8826001430615165, + "grad_norm": 0.1388025944730624, + "learning_rate": 7.142672607096878e-06, + "loss": 0.5914, + "step": 9871 + }, + { + "epoch": 0.882689556509299, + "grad_norm": 0.13144116528812075, + "learning_rate": 7.1319279818713445e-06, + "loss": 0.6026, + "step": 9872 + }, + { + "epoch": 0.8827789699570815, + "grad_norm": 0.14430553310467428, + "learning_rate": 7.121191145378858e-06, + "loss": 0.6654, + "step": 9873 + }, + { + "epoch": 0.8828683834048641, + "grad_norm": 0.14065362298232872, + "learning_rate": 7.110462098519899e-06, + "loss": 0.6139, + "step": 9874 + }, + { + "epoch": 0.8829577968526466, + "grad_norm": 0.14325466171558726, + "learning_rate": 7.099740842194313e-06, + "loss": 0.611, + "step": 9875 + }, + { + "epoch": 0.8830472103004292, + "grad_norm": 0.14702730026116598, + "learning_rate": 7.08902737730125e-06, + "loss": 0.6602, + "step": 9876 + }, + { + "epoch": 0.8831366237482118, + "grad_norm": 0.12956594639975647, + "learning_rate": 7.078321704739266e-06, + "loss": 0.5685, + "step": 9877 + }, + { + "epoch": 0.8832260371959942, + "grad_norm": 0.15375594960594713, + "learning_rate": 7.067623825406222e-06, + "loss": 0.6509, + "step": 9878 + }, + { + "epoch": 0.8833154506437768, + "grad_norm": 0.13813249473367104, + "learning_rate": 7.056933740199323e-06, + "loss": 0.6365, + "step": 9879 + }, + { + "epoch": 0.8834048640915594, + "grad_norm": 0.1402769096010447, + "learning_rate": 7.0462514500151285e-06, + "loss": 0.6389, + "step": 9880 + }, + { + "epoch": 0.883494277539342, + "grad_norm": 0.14138892385005572, + "learning_rate": 7.035576955749601e-06, + "loss": 0.6237, + "step": 9881 + }, + { + "epoch": 0.8835836909871244, + "grad_norm": 0.1493160958572658, + "learning_rate": 7.0249102582979455e-06, + "loss": 0.6385, + "step": 9882 + }, + { + "epoch": 0.883673104434907, + "grad_norm": 0.13276063528496418, + "learning_rate": 7.01425135855478e-06, + "loss": 0.6519, + "step": 9883 + }, + { + "epoch": 0.8837625178826896, + "grad_norm": 0.12320955372268079, + "learning_rate": 7.003600257414067e-06, + "loss": 0.6135, + "step": 9884 + }, + { + "epoch": 0.8838519313304721, + "grad_norm": 0.15038240273764042, + "learning_rate": 6.99295695576907e-06, + "loss": 0.631, + "step": 9885 + }, + { + "epoch": 0.8839413447782547, + "grad_norm": 0.1501591527357852, + "learning_rate": 6.9823214545124525e-06, + "loss": 0.6564, + "step": 9886 + }, + { + "epoch": 0.8840307582260372, + "grad_norm": 0.1347837790892716, + "learning_rate": 6.971693754536201e-06, + "loss": 0.6472, + "step": 9887 + }, + { + "epoch": 0.8841201716738197, + "grad_norm": 0.1372087251402663, + "learning_rate": 6.961073856731648e-06, + "loss": 0.6253, + "step": 9888 + }, + { + "epoch": 0.8842095851216023, + "grad_norm": 0.1463763924682415, + "learning_rate": 6.950461761989458e-06, + "loss": 0.6343, + "step": 9889 + }, + { + "epoch": 0.8842989985693849, + "grad_norm": 0.13957241357511851, + "learning_rate": 6.9398574711996844e-06, + "loss": 0.6419, + "step": 9890 + }, + { + "epoch": 0.8843884120171673, + "grad_norm": 0.12973835028357156, + "learning_rate": 6.929260985251662e-06, + "loss": 0.5968, + "step": 9891 + }, + { + "epoch": 0.8844778254649499, + "grad_norm": 0.14570902702287608, + "learning_rate": 6.918672305034124e-06, + "loss": 0.6561, + "step": 9892 + }, + { + "epoch": 0.8845672389127325, + "grad_norm": 0.15374508123113245, + "learning_rate": 6.908091431435138e-06, + "loss": 0.6551, + "step": 9893 + }, + { + "epoch": 0.884656652360515, + "grad_norm": 0.1487712808668464, + "learning_rate": 6.897518365342059e-06, + "loss": 0.6568, + "step": 9894 + }, + { + "epoch": 0.8847460658082976, + "grad_norm": 0.12276473787154399, + "learning_rate": 6.8869531076417136e-06, + "loss": 0.613, + "step": 9895 + }, + { + "epoch": 0.8848354792560801, + "grad_norm": 0.14486498900535033, + "learning_rate": 6.876395659220148e-06, + "loss": 0.6133, + "step": 9896 + }, + { + "epoch": 0.8849248927038627, + "grad_norm": 0.1426347596000164, + "learning_rate": 6.865846020962807e-06, + "loss": 0.6193, + "step": 9897 + }, + { + "epoch": 0.8850143061516452, + "grad_norm": 0.13603224960212396, + "learning_rate": 6.855304193754497e-06, + "loss": 0.6258, + "step": 9898 + }, + { + "epoch": 0.8851037195994278, + "grad_norm": 0.13505907677072462, + "learning_rate": 6.844770178479321e-06, + "loss": 0.6143, + "step": 9899 + }, + { + "epoch": 0.8851931330472103, + "grad_norm": 0.15690251462637242, + "learning_rate": 6.834243976020771e-06, + "loss": 0.6472, + "step": 9900 + }, + { + "epoch": 0.8852825464949928, + "grad_norm": 0.14091879294526172, + "learning_rate": 6.823725587261654e-06, + "loss": 0.6412, + "step": 9901 + }, + { + "epoch": 0.8853719599427754, + "grad_norm": 0.1394408194952058, + "learning_rate": 6.813215013084151e-06, + "loss": 0.6225, + "step": 9902 + }, + { + "epoch": 0.885461373390558, + "grad_norm": 0.14000921781521306, + "learning_rate": 6.8027122543697586e-06, + "loss": 0.6059, + "step": 9903 + }, + { + "epoch": 0.8855507868383404, + "grad_norm": 0.13958400395134793, + "learning_rate": 6.7922173119993606e-06, + "loss": 0.625, + "step": 9904 + }, + { + "epoch": 0.885640200286123, + "grad_norm": 0.13557458513854342, + "learning_rate": 6.781730186853108e-06, + "loss": 0.5991, + "step": 9905 + }, + { + "epoch": 0.8857296137339056, + "grad_norm": 0.12653968598255372, + "learning_rate": 6.771250879810565e-06, + "loss": 0.613, + "step": 9906 + }, + { + "epoch": 0.8858190271816881, + "grad_norm": 0.13927636230149082, + "learning_rate": 6.760779391750627e-06, + "loss": 0.635, + "step": 9907 + }, + { + "epoch": 0.8859084406294707, + "grad_norm": 0.1388262922382443, + "learning_rate": 6.750315723551492e-06, + "loss": 0.6407, + "step": 9908 + }, + { + "epoch": 0.8859978540772532, + "grad_norm": 0.11970907361537644, + "learning_rate": 6.739859876090793e-06, + "loss": 0.5928, + "step": 9909 + }, + { + "epoch": 0.8860872675250357, + "grad_norm": 0.141086873348868, + "learning_rate": 6.729411850245404e-06, + "loss": 0.601, + "step": 9910 + }, + { + "epoch": 0.8861766809728183, + "grad_norm": 0.12757366592154076, + "learning_rate": 6.718971646891603e-06, + "loss": 0.5971, + "step": 9911 + }, + { + "epoch": 0.8862660944206009, + "grad_norm": 0.1405606746193058, + "learning_rate": 6.708539266905001e-06, + "loss": 0.5957, + "step": 9912 + }, + { + "epoch": 0.8863555078683834, + "grad_norm": 0.13141262397861192, + "learning_rate": 6.6981147111605305e-06, + "loss": 0.6207, + "step": 9913 + }, + { + "epoch": 0.8864449213161659, + "grad_norm": 0.13674051325647074, + "learning_rate": 6.687697980532504e-06, + "loss": 0.6203, + "step": 9914 + }, + { + "epoch": 0.8865343347639485, + "grad_norm": 0.12087198574251254, + "learning_rate": 6.677289075894544e-06, + "loss": 0.6132, + "step": 9915 + }, + { + "epoch": 0.8866237482117311, + "grad_norm": 0.15017118765335524, + "learning_rate": 6.666887998119653e-06, + "loss": 0.6531, + "step": 9916 + }, + { + "epoch": 0.8867131616595136, + "grad_norm": 0.13143706056134177, + "learning_rate": 6.656494748080144e-06, + "loss": 0.6207, + "step": 9917 + }, + { + "epoch": 0.8868025751072961, + "grad_norm": 0.15743158308953295, + "learning_rate": 6.646109326647709e-06, + "loss": 0.6758, + "step": 9918 + }, + { + "epoch": 0.8868919885550787, + "grad_norm": 0.13042880474110025, + "learning_rate": 6.635731734693329e-06, + "loss": 0.631, + "step": 9919 + }, + { + "epoch": 0.8869814020028612, + "grad_norm": 0.12253968547499208, + "learning_rate": 6.625361973087363e-06, + "loss": 0.6047, + "step": 9920 + }, + { + "epoch": 0.8870708154506438, + "grad_norm": 0.13443225738080303, + "learning_rate": 6.6150000426995486e-06, + "loss": 0.6169, + "step": 9921 + }, + { + "epoch": 0.8871602288984263, + "grad_norm": 0.14852647814171768, + "learning_rate": 6.604645944398858e-06, + "loss": 0.671, + "step": 9922 + }, + { + "epoch": 0.8872496423462088, + "grad_norm": 0.1317316186814846, + "learning_rate": 6.594299679053739e-06, + "loss": 0.6168, + "step": 9923 + }, + { + "epoch": 0.8873390557939914, + "grad_norm": 0.14701190599754482, + "learning_rate": 6.583961247531911e-06, + "loss": 0.6246, + "step": 9924 + }, + { + "epoch": 0.887428469241774, + "grad_norm": 0.12541620065727488, + "learning_rate": 6.573630650700424e-06, + "loss": 0.6072, + "step": 9925 + }, + { + "epoch": 0.8875178826895566, + "grad_norm": 0.13009519879109666, + "learning_rate": 6.563307889425707e-06, + "loss": 0.6042, + "step": 9926 + }, + { + "epoch": 0.887607296137339, + "grad_norm": 0.163986988952759, + "learning_rate": 6.5529929645735235e-06, + "loss": 0.6136, + "step": 9927 + }, + { + "epoch": 0.8876967095851216, + "grad_norm": 0.14967455347848083, + "learning_rate": 6.542685877008959e-06, + "loss": 0.6359, + "step": 9928 + }, + { + "epoch": 0.8877861230329042, + "grad_norm": 0.1508011361780604, + "learning_rate": 6.532386627596454e-06, + "loss": 0.6607, + "step": 9929 + }, + { + "epoch": 0.8878755364806867, + "grad_norm": 0.12701468806119925, + "learning_rate": 6.522095217199797e-06, + "loss": 0.6368, + "step": 9930 + }, + { + "epoch": 0.8879649499284692, + "grad_norm": 0.13966596918934138, + "learning_rate": 6.511811646682131e-06, + "loss": 0.6447, + "step": 9931 + }, + { + "epoch": 0.8880543633762518, + "grad_norm": 0.14177162550997716, + "learning_rate": 6.501535916905932e-06, + "loss": 0.6142, + "step": 9932 + }, + { + "epoch": 0.8881437768240343, + "grad_norm": 0.13528902785495736, + "learning_rate": 6.491268028732977e-06, + "loss": 0.635, + "step": 9933 + }, + { + "epoch": 0.8882331902718169, + "grad_norm": 0.1380170688982303, + "learning_rate": 6.4810079830244455e-06, + "loss": 0.6141, + "step": 9934 + }, + { + "epoch": 0.8883226037195995, + "grad_norm": 0.14397226942926739, + "learning_rate": 6.470755780640847e-06, + "loss": 0.6554, + "step": 9935 + }, + { + "epoch": 0.8884120171673819, + "grad_norm": 0.1446569401112015, + "learning_rate": 6.460511422441984e-06, + "loss": 0.62, + "step": 9936 + }, + { + "epoch": 0.8885014306151645, + "grad_norm": 0.1506806844006069, + "learning_rate": 6.450274909287068e-06, + "loss": 0.6507, + "step": 9937 + }, + { + "epoch": 0.8885908440629471, + "grad_norm": 0.1382735211551489, + "learning_rate": 6.440046242034625e-06, + "loss": 0.6337, + "step": 9938 + }, + { + "epoch": 0.8886802575107297, + "grad_norm": 0.15243789152190063, + "learning_rate": 6.429825421542512e-06, + "loss": 0.6177, + "step": 9939 + }, + { + "epoch": 0.8887696709585121, + "grad_norm": 0.15379838267548118, + "learning_rate": 6.4196124486679225e-06, + "loss": 0.6355, + "step": 9940 + }, + { + "epoch": 0.8888590844062947, + "grad_norm": 0.1442399739879629, + "learning_rate": 6.409407324267447e-06, + "loss": 0.6249, + "step": 9941 + }, + { + "epoch": 0.8889484978540773, + "grad_norm": 0.133761713073746, + "learning_rate": 6.399210049196924e-06, + "loss": 0.6067, + "step": 9942 + }, + { + "epoch": 0.8890379113018598, + "grad_norm": 0.12361206672402802, + "learning_rate": 6.3890206243116255e-06, + "loss": 0.6156, + "step": 9943 + }, + { + "epoch": 0.8891273247496424, + "grad_norm": 0.14243693252907713, + "learning_rate": 6.378839050466101e-06, + "loss": 0.653, + "step": 9944 + }, + { + "epoch": 0.8892167381974249, + "grad_norm": 0.13180699648379227, + "learning_rate": 6.36866532851429e-06, + "loss": 0.6166, + "step": 9945 + }, + { + "epoch": 0.8893061516452074, + "grad_norm": 0.1619657922488336, + "learning_rate": 6.3584994593094305e-06, + "loss": 0.6811, + "step": 9946 + }, + { + "epoch": 0.88939556509299, + "grad_norm": 0.1418735870866135, + "learning_rate": 6.348341443704153e-06, + "loss": 0.6505, + "step": 9947 + }, + { + "epoch": 0.8894849785407726, + "grad_norm": 0.13835961980981895, + "learning_rate": 6.338191282550354e-06, + "loss": 0.6517, + "step": 9948 + }, + { + "epoch": 0.889574391988555, + "grad_norm": 0.1554838660642156, + "learning_rate": 6.328048976699352e-06, + "loss": 0.6694, + "step": 9949 + }, + { + "epoch": 0.8896638054363376, + "grad_norm": 0.1318978827499976, + "learning_rate": 6.317914527001745e-06, + "loss": 0.6066, + "step": 9950 + }, + { + "epoch": 0.8897532188841202, + "grad_norm": 0.15123368260664743, + "learning_rate": 6.307787934307507e-06, + "loss": 0.6649, + "step": 9951 + }, + { + "epoch": 0.8898426323319027, + "grad_norm": 0.15139991969576883, + "learning_rate": 6.297669199465961e-06, + "loss": 0.6265, + "step": 9952 + }, + { + "epoch": 0.8899320457796852, + "grad_norm": 0.13330802891983612, + "learning_rate": 6.287558323325715e-06, + "loss": 0.5994, + "step": 9953 + }, + { + "epoch": 0.8900214592274678, + "grad_norm": 0.13811951836413594, + "learning_rate": 6.277455306734781e-06, + "loss": 0.6254, + "step": 9954 + }, + { + "epoch": 0.8901108726752504, + "grad_norm": 0.14937928932227737, + "learning_rate": 6.267360150540491e-06, + "loss": 0.6214, + "step": 9955 + }, + { + "epoch": 0.8902002861230329, + "grad_norm": 0.14286247396892562, + "learning_rate": 6.2572728555894796e-06, + "loss": 0.6136, + "step": 9956 + }, + { + "epoch": 0.8902896995708155, + "grad_norm": 0.14706417423883283, + "learning_rate": 6.247193422727804e-06, + "loss": 0.6344, + "step": 9957 + }, + { + "epoch": 0.890379113018598, + "grad_norm": 0.13585534590331427, + "learning_rate": 6.237121852800798e-06, + "loss": 0.6377, + "step": 9958 + }, + { + "epoch": 0.8904685264663805, + "grad_norm": 0.12875000662515915, + "learning_rate": 6.227058146653131e-06, + "loss": 0.6011, + "step": 9959 + }, + { + "epoch": 0.8905579399141631, + "grad_norm": 0.14893836110006964, + "learning_rate": 6.217002305128849e-06, + "loss": 0.6236, + "step": 9960 + }, + { + "epoch": 0.8906473533619457, + "grad_norm": 0.137493072835442, + "learning_rate": 6.206954329071335e-06, + "loss": 0.6428, + "step": 9961 + }, + { + "epoch": 0.8907367668097281, + "grad_norm": 0.14058008297339838, + "learning_rate": 6.19691421932328e-06, + "loss": 0.593, + "step": 9962 + }, + { + "epoch": 0.8908261802575107, + "grad_norm": 0.1402640673887719, + "learning_rate": 6.186881976726733e-06, + "loss": 0.6628, + "step": 9963 + }, + { + "epoch": 0.8909155937052933, + "grad_norm": 0.15118858244926978, + "learning_rate": 6.17685760212311e-06, + "loss": 0.6623, + "step": 9964 + }, + { + "epoch": 0.8910050071530758, + "grad_norm": 0.12738790846822004, + "learning_rate": 6.166841096353126e-06, + "loss": 0.609, + "step": 9965 + }, + { + "epoch": 0.8910944206008584, + "grad_norm": 0.12975947796840273, + "learning_rate": 6.1568324602568675e-06, + "loss": 0.6123, + "step": 9966 + }, + { + "epoch": 0.8911838340486409, + "grad_norm": 0.13251268560303375, + "learning_rate": 6.146831694673727e-06, + "loss": 0.6067, + "step": 9967 + }, + { + "epoch": 0.8912732474964234, + "grad_norm": 0.14565874068319135, + "learning_rate": 6.136838800442457e-06, + "loss": 0.6336, + "step": 9968 + }, + { + "epoch": 0.891362660944206, + "grad_norm": 0.14172047356251433, + "learning_rate": 6.126853778401187e-06, + "loss": 0.612, + "step": 9969 + }, + { + "epoch": 0.8914520743919886, + "grad_norm": 0.13024080306006233, + "learning_rate": 6.11687662938728e-06, + "loss": 0.6001, + "step": 9970 + }, + { + "epoch": 0.891541487839771, + "grad_norm": 0.14319002195591654, + "learning_rate": 6.1069073542375675e-06, + "loss": 0.6375, + "step": 9971 + }, + { + "epoch": 0.8916309012875536, + "grad_norm": 0.1255876513534593, + "learning_rate": 6.0969459537881575e-06, + "loss": 0.5924, + "step": 9972 + }, + { + "epoch": 0.8917203147353362, + "grad_norm": 0.13893704340587734, + "learning_rate": 6.086992428874472e-06, + "loss": 0.62, + "step": 9973 + }, + { + "epoch": 0.8918097281831188, + "grad_norm": 0.1521577685628036, + "learning_rate": 6.077046780331308e-06, + "loss": 0.6604, + "step": 9974 + }, + { + "epoch": 0.8918991416309013, + "grad_norm": 0.1383353144006529, + "learning_rate": 6.06710900899281e-06, + "loss": 0.621, + "step": 9975 + }, + { + "epoch": 0.8919885550786838, + "grad_norm": 0.1364112019433754, + "learning_rate": 6.057179115692435e-06, + "loss": 0.643, + "step": 9976 + }, + { + "epoch": 0.8920779685264664, + "grad_norm": 0.13944973471211033, + "learning_rate": 6.047257101262982e-06, + "loss": 0.634, + "step": 9977 + }, + { + "epoch": 0.8921673819742489, + "grad_norm": 0.1504666142731708, + "learning_rate": 6.037342966536619e-06, + "loss": 0.633, + "step": 9978 + }, + { + "epoch": 0.8922567954220315, + "grad_norm": 0.1317343259816466, + "learning_rate": 6.027436712344814e-06, + "loss": 0.65, + "step": 9979 + }, + { + "epoch": 0.892346208869814, + "grad_norm": 0.13020219005702105, + "learning_rate": 6.017538339518403e-06, + "loss": 0.6534, + "step": 9980 + }, + { + "epoch": 0.8924356223175965, + "grad_norm": 0.13233222729391836, + "learning_rate": 6.007647848887565e-06, + "loss": 0.6202, + "step": 9981 + }, + { + "epoch": 0.8925250357653791, + "grad_norm": 0.1457275895237314, + "learning_rate": 5.997765241281783e-06, + "loss": 0.565, + "step": 9982 + }, + { + "epoch": 0.8926144492131617, + "grad_norm": 0.14495458159959584, + "learning_rate": 5.987890517529893e-06, + "loss": 0.6711, + "step": 9983 + }, + { + "epoch": 0.8927038626609443, + "grad_norm": 0.1309520182721307, + "learning_rate": 5.978023678460099e-06, + "loss": 0.6069, + "step": 9984 + }, + { + "epoch": 0.8927932761087267, + "grad_norm": 0.1385299530893253, + "learning_rate": 5.968164724899894e-06, + "loss": 0.6006, + "step": 9985 + }, + { + "epoch": 0.8928826895565093, + "grad_norm": 0.1371325591106538, + "learning_rate": 5.958313657676173e-06, + "loss": 0.6407, + "step": 9986 + }, + { + "epoch": 0.8929721030042919, + "grad_norm": 0.14508785203246835, + "learning_rate": 5.948470477615098e-06, + "loss": 0.647, + "step": 9987 + }, + { + "epoch": 0.8930615164520744, + "grad_norm": 0.15215930321583546, + "learning_rate": 5.938635185542218e-06, + "loss": 0.6226, + "step": 9988 + }, + { + "epoch": 0.8931509298998569, + "grad_norm": 0.1479025161733409, + "learning_rate": 5.928807782282431e-06, + "loss": 0.6258, + "step": 9989 + }, + { + "epoch": 0.8932403433476395, + "grad_norm": 0.1544124457243786, + "learning_rate": 5.918988268659898e-06, + "loss": 0.6492, + "step": 9990 + }, + { + "epoch": 0.893329756795422, + "grad_norm": 0.13198622886141675, + "learning_rate": 5.909176645498193e-06, + "loss": 0.6133, + "step": 9991 + }, + { + "epoch": 0.8934191702432046, + "grad_norm": 0.14858836228937336, + "learning_rate": 5.899372913620238e-06, + "loss": 0.6875, + "step": 9992 + }, + { + "epoch": 0.8935085836909872, + "grad_norm": 0.16345214966985644, + "learning_rate": 5.889577073848207e-06, + "loss": 0.6552, + "step": 9993 + }, + { + "epoch": 0.8935979971387696, + "grad_norm": 0.1546567849485975, + "learning_rate": 5.879789127003699e-06, + "loss": 0.6441, + "step": 9994 + }, + { + "epoch": 0.8936874105865522, + "grad_norm": 0.12987785209156472, + "learning_rate": 5.870009073907623e-06, + "loss": 0.6176, + "step": 9995 + }, + { + "epoch": 0.8937768240343348, + "grad_norm": 0.1466341297707959, + "learning_rate": 5.86023691538019e-06, + "loss": 0.6362, + "step": 9996 + }, + { + "epoch": 0.8938662374821174, + "grad_norm": 0.14546268022063005, + "learning_rate": 5.850472652240991e-06, + "loss": 0.6402, + "step": 9997 + }, + { + "epoch": 0.8939556509298998, + "grad_norm": 0.13468646548518576, + "learning_rate": 5.840716285308956e-06, + "loss": 0.6229, + "step": 9998 + }, + { + "epoch": 0.8940450643776824, + "grad_norm": 0.15083276460498563, + "learning_rate": 5.8309678154023216e-06, + "loss": 0.6186, + "step": 9999 + }, + { + "epoch": 0.894134477825465, + "grad_norm": 0.14594019177478537, + "learning_rate": 5.821227243338712e-06, + "loss": 0.6453, + "step": 10000 + }, + { + "epoch": 0.8942238912732475, + "grad_norm": 0.1578030686353113, + "learning_rate": 5.811494569935016e-06, + "loss": 0.6447, + "step": 10001 + }, + { + "epoch": 0.89431330472103, + "grad_norm": 0.1380294281287521, + "learning_rate": 5.801769796007517e-06, + "loss": 0.6292, + "step": 10002 + }, + { + "epoch": 0.8944027181688126, + "grad_norm": 0.1365272559370871, + "learning_rate": 5.792052922371826e-06, + "loss": 0.647, + "step": 10003 + }, + { + "epoch": 0.8944921316165951, + "grad_norm": 0.15448449700749622, + "learning_rate": 5.782343949842894e-06, + "loss": 0.6532, + "step": 10004 + }, + { + "epoch": 0.8945815450643777, + "grad_norm": 0.14590238124391514, + "learning_rate": 5.7726428792349574e-06, + "loss": 0.6482, + "step": 10005 + }, + { + "epoch": 0.8946709585121603, + "grad_norm": 0.14650584827713617, + "learning_rate": 5.762949711361698e-06, + "loss": 0.6457, + "step": 10006 + }, + { + "epoch": 0.8947603719599427, + "grad_norm": 0.14827048566389064, + "learning_rate": 5.753264447036022e-06, + "loss": 0.625, + "step": 10007 + }, + { + "epoch": 0.8948497854077253, + "grad_norm": 0.14696152271939578, + "learning_rate": 5.743587087070235e-06, + "loss": 0.6311, + "step": 10008 + }, + { + "epoch": 0.8949391988555079, + "grad_norm": 0.1480913695551408, + "learning_rate": 5.733917632275976e-06, + "loss": 0.6591, + "step": 10009 + }, + { + "epoch": 0.8950286123032904, + "grad_norm": 0.12367461653420708, + "learning_rate": 5.7242560834641855e-06, + "loss": 0.6234, + "step": 10010 + }, + { + "epoch": 0.8951180257510729, + "grad_norm": 0.12605105992097218, + "learning_rate": 5.714602441445194e-06, + "loss": 0.5779, + "step": 10011 + }, + { + "epoch": 0.8952074391988555, + "grad_norm": 0.13347967432940422, + "learning_rate": 5.704956707028619e-06, + "loss": 0.6198, + "step": 10012 + }, + { + "epoch": 0.895296852646638, + "grad_norm": 0.1381036225287256, + "learning_rate": 5.695318881023437e-06, + "loss": 0.6081, + "step": 10013 + }, + { + "epoch": 0.8953862660944206, + "grad_norm": 0.1219375436131841, + "learning_rate": 5.685688964237979e-06, + "loss": 0.6326, + "step": 10014 + }, + { + "epoch": 0.8954756795422032, + "grad_norm": 0.13436196544589823, + "learning_rate": 5.676066957479898e-06, + "loss": 0.6042, + "step": 10015 + }, + { + "epoch": 0.8955650929899857, + "grad_norm": 0.15679423802167808, + "learning_rate": 5.66645286155616e-06, + "loss": 0.6329, + "step": 10016 + }, + { + "epoch": 0.8956545064377682, + "grad_norm": 0.13038897037841557, + "learning_rate": 5.656846677273086e-06, + "loss": 0.6152, + "step": 10017 + }, + { + "epoch": 0.8957439198855508, + "grad_norm": 0.12805397231328125, + "learning_rate": 5.647248405436356e-06, + "loss": 0.5718, + "step": 10018 + }, + { + "epoch": 0.8958333333333334, + "grad_norm": 0.12844791759034335, + "learning_rate": 5.637658046850924e-06, + "loss": 0.616, + "step": 10019 + }, + { + "epoch": 0.8959227467811158, + "grad_norm": 0.14329419507896102, + "learning_rate": 5.628075602321181e-06, + "loss": 0.6087, + "step": 10020 + }, + { + "epoch": 0.8960121602288984, + "grad_norm": 0.14263303219035037, + "learning_rate": 5.618501072650761e-06, + "loss": 0.6111, + "step": 10021 + }, + { + "epoch": 0.896101573676681, + "grad_norm": 0.12618455238068885, + "learning_rate": 5.608934458642656e-06, + "loss": 0.5987, + "step": 10022 + }, + { + "epoch": 0.8961909871244635, + "grad_norm": 0.14171079486569413, + "learning_rate": 5.599375761099246e-06, + "loss": 0.6472, + "step": 10023 + }, + { + "epoch": 0.8962804005722461, + "grad_norm": 0.12292888835934496, + "learning_rate": 5.589824980822167e-06, + "loss": 0.6361, + "step": 10024 + }, + { + "epoch": 0.8963698140200286, + "grad_norm": 0.149048259614827, + "learning_rate": 5.580282118612446e-06, + "loss": 0.6652, + "step": 10025 + }, + { + "epoch": 0.8964592274678111, + "grad_norm": 0.14181833031171845, + "learning_rate": 5.570747175270441e-06, + "loss": 0.6635, + "step": 10026 + }, + { + "epoch": 0.8965486409155937, + "grad_norm": 0.14963265544926102, + "learning_rate": 5.561220151595825e-06, + "loss": 0.6068, + "step": 10027 + }, + { + "epoch": 0.8966380543633763, + "grad_norm": 0.14498131097439756, + "learning_rate": 5.551701048387614e-06, + "loss": 0.6107, + "step": 10028 + }, + { + "epoch": 0.8967274678111588, + "grad_norm": 0.14542574055319377, + "learning_rate": 5.542189866444203e-06, + "loss": 0.5977, + "step": 10029 + }, + { + "epoch": 0.8968168812589413, + "grad_norm": 0.13151994271275286, + "learning_rate": 5.53268660656322e-06, + "loss": 0.5767, + "step": 10030 + }, + { + "epoch": 0.8969062947067239, + "grad_norm": 0.14171000864237462, + "learning_rate": 5.523191269541728e-06, + "loss": 0.6302, + "step": 10031 + }, + { + "epoch": 0.8969957081545065, + "grad_norm": 0.14451075332481486, + "learning_rate": 5.5137038561761115e-06, + "loss": 0.6059, + "step": 10032 + }, + { + "epoch": 0.897085121602289, + "grad_norm": 0.13610473533710082, + "learning_rate": 5.5042243672620006e-06, + "loss": 0.6464, + "step": 10033 + }, + { + "epoch": 0.8971745350500715, + "grad_norm": 0.14857125037737434, + "learning_rate": 5.494752803594505e-06, + "loss": 0.6368, + "step": 10034 + }, + { + "epoch": 0.8972639484978541, + "grad_norm": 0.13876504329870445, + "learning_rate": 5.485289165967933e-06, + "loss": 0.6139, + "step": 10035 + }, + { + "epoch": 0.8973533619456366, + "grad_norm": 0.11787561659035686, + "learning_rate": 5.475833455176027e-06, + "loss": 0.6098, + "step": 10036 + }, + { + "epoch": 0.8974427753934192, + "grad_norm": 0.15878638174195223, + "learning_rate": 5.466385672011809e-06, + "loss": 0.6436, + "step": 10037 + }, + { + "epoch": 0.8975321888412017, + "grad_norm": 0.1532848358057374, + "learning_rate": 5.4569458172676665e-06, + "loss": 0.692, + "step": 10038 + }, + { + "epoch": 0.8976216022889842, + "grad_norm": 0.15585425914019985, + "learning_rate": 5.4475138917352894e-06, + "loss": 0.6581, + "step": 10039 + }, + { + "epoch": 0.8977110157367668, + "grad_norm": 0.1322096310047298, + "learning_rate": 5.4380898962057336e-06, + "loss": 0.6071, + "step": 10040 + }, + { + "epoch": 0.8978004291845494, + "grad_norm": 0.1446732082853203, + "learning_rate": 5.428673831469366e-06, + "loss": 0.6039, + "step": 10041 + }, + { + "epoch": 0.897889842632332, + "grad_norm": 0.15187257765782644, + "learning_rate": 5.419265698315923e-06, + "loss": 0.6284, + "step": 10042 + }, + { + "epoch": 0.8979792560801144, + "grad_norm": 0.13713749059637456, + "learning_rate": 5.40986549753445e-06, + "loss": 0.6537, + "step": 10043 + }, + { + "epoch": 0.898068669527897, + "grad_norm": 0.13500862190525464, + "learning_rate": 5.400473229913305e-06, + "loss": 0.6194, + "step": 10044 + }, + { + "epoch": 0.8981580829756796, + "grad_norm": 0.13852383986303754, + "learning_rate": 5.3910888962402265e-06, + "loss": 0.6772, + "step": 10045 + }, + { + "epoch": 0.8982474964234621, + "grad_norm": 0.14870123856920794, + "learning_rate": 5.381712497302261e-06, + "loss": 0.6491, + "step": 10046 + }, + { + "epoch": 0.8983369098712446, + "grad_norm": 0.12730445041102056, + "learning_rate": 5.372344033885801e-06, + "loss": 0.6458, + "step": 10047 + }, + { + "epoch": 0.8984263233190272, + "grad_norm": 0.14557958190210063, + "learning_rate": 5.362983506776564e-06, + "loss": 0.5775, + "step": 10048 + }, + { + "epoch": 0.8985157367668097, + "grad_norm": 0.15808212890520046, + "learning_rate": 5.353630916759622e-06, + "loss": 0.66, + "step": 10049 + }, + { + "epoch": 0.8986051502145923, + "grad_norm": 0.13539592395361527, + "learning_rate": 5.344286264619347e-06, + "loss": 0.6079, + "step": 10050 + }, + { + "epoch": 0.8986945636623748, + "grad_norm": 0.1496799966840158, + "learning_rate": 5.334949551139457e-06, + "loss": 0.6137, + "step": 10051 + }, + { + "epoch": 0.8987839771101573, + "grad_norm": 0.12696746859928276, + "learning_rate": 5.325620777103035e-06, + "loss": 0.5802, + "step": 10052 + }, + { + "epoch": 0.8988733905579399, + "grad_norm": 0.13925273051996773, + "learning_rate": 5.316299943292435e-06, + "loss": 0.6169, + "step": 10053 + }, + { + "epoch": 0.8989628040057225, + "grad_norm": 0.12716146619230762, + "learning_rate": 5.306987050489442e-06, + "loss": 0.6079, + "step": 10054 + }, + { + "epoch": 0.899052217453505, + "grad_norm": 0.13703914966966158, + "learning_rate": 5.297682099475066e-06, + "loss": 0.6523, + "step": 10055 + }, + { + "epoch": 0.8991416309012875, + "grad_norm": 0.1486714277770928, + "learning_rate": 5.2883850910297235e-06, + "loss": 0.6238, + "step": 10056 + }, + { + "epoch": 0.8992310443490701, + "grad_norm": 0.141299260297237, + "learning_rate": 5.27909602593315e-06, + "loss": 0.6325, + "step": 10057 + }, + { + "epoch": 0.8993204577968527, + "grad_norm": 0.1330907954399222, + "learning_rate": 5.2698149049643874e-06, + "loss": 0.6135, + "step": 10058 + }, + { + "epoch": 0.8994098712446352, + "grad_norm": 0.13447393130554597, + "learning_rate": 5.260541728901847e-06, + "loss": 0.6042, + "step": 10059 + }, + { + "epoch": 0.8994992846924177, + "grad_norm": 0.12491231525465435, + "learning_rate": 5.25127649852325e-06, + "loss": 0.6049, + "step": 10060 + }, + { + "epoch": 0.8995886981402003, + "grad_norm": 0.13700263498277745, + "learning_rate": 5.2420192146056645e-06, + "loss": 0.6312, + "step": 10061 + }, + { + "epoch": 0.8996781115879828, + "grad_norm": 0.13858626917277028, + "learning_rate": 5.232769877925503e-06, + "loss": 0.6781, + "step": 10062 + }, + { + "epoch": 0.8997675250357654, + "grad_norm": 0.1405548981216729, + "learning_rate": 5.2235284892584776e-06, + "loss": 0.6154, + "step": 10063 + }, + { + "epoch": 0.899856938483548, + "grad_norm": 0.1503785866954745, + "learning_rate": 5.214295049379658e-06, + "loss": 0.6242, + "step": 10064 + }, + { + "epoch": 0.8999463519313304, + "grad_norm": 0.13501492191581904, + "learning_rate": 5.205069559063425e-06, + "loss": 0.6105, + "step": 10065 + }, + { + "epoch": 0.900035765379113, + "grad_norm": 0.13772720160420662, + "learning_rate": 5.195852019083558e-06, + "loss": 0.621, + "step": 10066 + }, + { + "epoch": 0.9001251788268956, + "grad_norm": 0.16055109593620873, + "learning_rate": 5.18664243021304e-06, + "loss": 0.6329, + "step": 10067 + }, + { + "epoch": 0.9002145922746781, + "grad_norm": 0.14441997011606628, + "learning_rate": 5.177440793224342e-06, + "loss": 0.6732, + "step": 10068 + }, + { + "epoch": 0.9003040057224606, + "grad_norm": 0.1507711244734226, + "learning_rate": 5.168247108889179e-06, + "loss": 0.6085, + "step": 10069 + }, + { + "epoch": 0.9003934191702432, + "grad_norm": 0.13759249140611926, + "learning_rate": 5.159061377978591e-06, + "loss": 0.6283, + "step": 10070 + }, + { + "epoch": 0.9004828326180258, + "grad_norm": 0.14171118700560512, + "learning_rate": 5.149883601262984e-06, + "loss": 0.6314, + "step": 10071 + }, + { + "epoch": 0.9005722460658083, + "grad_norm": 0.138737686149614, + "learning_rate": 5.1407137795121075e-06, + "loss": 0.6326, + "step": 10072 + }, + { + "epoch": 0.9006616595135909, + "grad_norm": 0.14244916838937924, + "learning_rate": 5.131551913494981e-06, + "loss": 0.623, + "step": 10073 + }, + { + "epoch": 0.9007510729613734, + "grad_norm": 0.1412999951226116, + "learning_rate": 5.122398003980033e-06, + "loss": 0.6102, + "step": 10074 + }, + { + "epoch": 0.9008404864091559, + "grad_norm": 0.1425642618265906, + "learning_rate": 5.1132520517349735e-06, + "loss": 0.623, + "step": 10075 + }, + { + "epoch": 0.9009298998569385, + "grad_norm": 0.14948074788168986, + "learning_rate": 5.104114057526876e-06, + "loss": 0.659, + "step": 10076 + }, + { + "epoch": 0.9010193133047211, + "grad_norm": 0.1513117347870027, + "learning_rate": 5.09498402212214e-06, + "loss": 0.604, + "step": 10077 + }, + { + "epoch": 0.9011087267525035, + "grad_norm": 0.1468415143692298, + "learning_rate": 5.085861946286463e-06, + "loss": 0.6514, + "step": 10078 + }, + { + "epoch": 0.9011981402002861, + "grad_norm": 0.14647827695925758, + "learning_rate": 5.076747830784923e-06, + "loss": 0.5986, + "step": 10079 + }, + { + "epoch": 0.9012875536480687, + "grad_norm": 0.14681799159559444, + "learning_rate": 5.067641676381918e-06, + "loss": 0.6532, + "step": 10080 + }, + { + "epoch": 0.9013769670958512, + "grad_norm": 0.14870976995290613, + "learning_rate": 5.058543483841116e-06, + "loss": 0.6264, + "step": 10081 + }, + { + "epoch": 0.9014663805436338, + "grad_norm": 0.1223217717048362, + "learning_rate": 5.04945325392564e-06, + "loss": 0.6176, + "step": 10082 + }, + { + "epoch": 0.9015557939914163, + "grad_norm": 0.15324255485678173, + "learning_rate": 5.040370987397858e-06, + "loss": 0.6156, + "step": 10083 + }, + { + "epoch": 0.9016452074391988, + "grad_norm": 0.144018376035172, + "learning_rate": 5.03129668501946e-06, + "loss": 0.6412, + "step": 10084 + }, + { + "epoch": 0.9017346208869814, + "grad_norm": 0.1427126310873349, + "learning_rate": 5.022230347551515e-06, + "loss": 0.5979, + "step": 10085 + }, + { + "epoch": 0.901824034334764, + "grad_norm": 0.13208652788255304, + "learning_rate": 5.013171975754427e-06, + "loss": 0.6095, + "step": 10086 + }, + { + "epoch": 0.9019134477825465, + "grad_norm": 0.1221677290128273, + "learning_rate": 5.004121570387876e-06, + "loss": 0.6127, + "step": 10087 + }, + { + "epoch": 0.902002861230329, + "grad_norm": 0.12800485516211935, + "learning_rate": 4.995079132210922e-06, + "loss": 0.6116, + "step": 10088 + }, + { + "epoch": 0.9020922746781116, + "grad_norm": 0.13580732385660066, + "learning_rate": 4.986044661981948e-06, + "loss": 0.631, + "step": 10089 + }, + { + "epoch": 0.9021816881258942, + "grad_norm": 0.1443508622137229, + "learning_rate": 4.977018160458646e-06, + "loss": 0.6362, + "step": 10090 + }, + { + "epoch": 0.9022711015736766, + "grad_norm": 0.14372714472169146, + "learning_rate": 4.967999628398101e-06, + "loss": 0.641, + "step": 10091 + }, + { + "epoch": 0.9023605150214592, + "grad_norm": 0.1599234401324081, + "learning_rate": 4.958989066556641e-06, + "loss": 0.6531, + "step": 10092 + }, + { + "epoch": 0.9024499284692418, + "grad_norm": 0.1484690597062878, + "learning_rate": 4.949986475689983e-06, + "loss": 0.643, + "step": 10093 + }, + { + "epoch": 0.9025393419170243, + "grad_norm": 0.16406299410466751, + "learning_rate": 4.9409918565531675e-06, + "loss": 0.6482, + "step": 10094 + }, + { + "epoch": 0.9026287553648069, + "grad_norm": 0.1422186101756866, + "learning_rate": 4.93200520990057e-06, + "loss": 0.6317, + "step": 10095 + }, + { + "epoch": 0.9027181688125894, + "grad_norm": 0.13011463612449958, + "learning_rate": 4.923026536485875e-06, + "loss": 0.5895, + "step": 10096 + }, + { + "epoch": 0.9028075822603719, + "grad_norm": 0.14639872062345766, + "learning_rate": 4.914055837062137e-06, + "loss": 0.618, + "step": 10097 + }, + { + "epoch": 0.9028969957081545, + "grad_norm": 0.11991779662469676, + "learning_rate": 4.905093112381687e-06, + "loss": 0.625, + "step": 10098 + }, + { + "epoch": 0.9029864091559371, + "grad_norm": 0.13749966972143493, + "learning_rate": 4.896138363196235e-06, + "loss": 0.6567, + "step": 10099 + }, + { + "epoch": 0.9030758226037195, + "grad_norm": 0.1429639755990895, + "learning_rate": 4.8871915902568125e-06, + "loss": 0.6616, + "step": 10100 + }, + { + "epoch": 0.9031652360515021, + "grad_norm": 0.14286077688659915, + "learning_rate": 4.878252794313754e-06, + "loss": 0.6255, + "step": 10101 + }, + { + "epoch": 0.9032546494992847, + "grad_norm": 0.13929354896625204, + "learning_rate": 4.869321976116737e-06, + "loss": 0.6371, + "step": 10102 + }, + { + "epoch": 0.9033440629470673, + "grad_norm": 0.15621425613842985, + "learning_rate": 4.860399136414828e-06, + "loss": 0.6324, + "step": 10103 + }, + { + "epoch": 0.9034334763948498, + "grad_norm": 0.13577074187742516, + "learning_rate": 4.8514842759563306e-06, + "loss": 0.6017, + "step": 10104 + }, + { + "epoch": 0.9035228898426323, + "grad_norm": 0.15061824434960414, + "learning_rate": 4.842577395488934e-06, + "loss": 0.6114, + "step": 10105 + }, + { + "epoch": 0.9036123032904149, + "grad_norm": 0.1375928234482085, + "learning_rate": 4.833678495759664e-06, + "loss": 0.6009, + "step": 10106 + }, + { + "epoch": 0.9037017167381974, + "grad_norm": 0.1570845050161065, + "learning_rate": 4.8247875775148335e-06, + "loss": 0.6274, + "step": 10107 + }, + { + "epoch": 0.90379113018598, + "grad_norm": 0.1440815951189564, + "learning_rate": 4.815904641500124e-06, + "loss": 0.5858, + "step": 10108 + }, + { + "epoch": 0.9038805436337625, + "grad_norm": 0.1386972234080062, + "learning_rate": 4.80702968846054e-06, + "loss": 0.6072, + "step": 10109 + }, + { + "epoch": 0.903969957081545, + "grad_norm": 0.1390961903638521, + "learning_rate": 4.79816271914042e-06, + "loss": 0.616, + "step": 10110 + }, + { + "epoch": 0.9040593705293276, + "grad_norm": 0.14383212669188597, + "learning_rate": 4.789303734283423e-06, + "loss": 0.6127, + "step": 10111 + }, + { + "epoch": 0.9041487839771102, + "grad_norm": 0.14114685059242385, + "learning_rate": 4.780452734632524e-06, + "loss": 0.645, + "step": 10112 + }, + { + "epoch": 0.9042381974248928, + "grad_norm": 0.1534394890833055, + "learning_rate": 4.771609720930059e-06, + "loss": 0.5966, + "step": 10113 + }, + { + "epoch": 0.9043276108726752, + "grad_norm": 0.15988467510434914, + "learning_rate": 4.762774693917693e-06, + "loss": 0.6653, + "step": 10114 + }, + { + "epoch": 0.9044170243204578, + "grad_norm": 0.12166332486353514, + "learning_rate": 4.753947654336388e-06, + "loss": 0.6097, + "step": 10115 + }, + { + "epoch": 0.9045064377682404, + "grad_norm": 0.1422533118164979, + "learning_rate": 4.7451286029264405e-06, + "loss": 0.5957, + "step": 10116 + }, + { + "epoch": 0.9045958512160229, + "grad_norm": 0.14497644884486288, + "learning_rate": 4.73631754042756e-06, + "loss": 0.6488, + "step": 10117 + }, + { + "epoch": 0.9046852646638054, + "grad_norm": 0.12567704777116037, + "learning_rate": 4.727514467578653e-06, + "loss": 0.6048, + "step": 10118 + }, + { + "epoch": 0.904774678111588, + "grad_norm": 0.16024582864546452, + "learning_rate": 4.718719385118053e-06, + "loss": 0.6618, + "step": 10119 + }, + { + "epoch": 0.9048640915593705, + "grad_norm": 0.1644354996339735, + "learning_rate": 4.7099322937833925e-06, + "loss": 0.5647, + "step": 10120 + }, + { + "epoch": 0.9049535050071531, + "grad_norm": 0.13124078906635814, + "learning_rate": 4.701153194311625e-06, + "loss": 0.6291, + "step": 10121 + }, + { + "epoch": 0.9050429184549357, + "grad_norm": 0.13706610580622045, + "learning_rate": 4.69238208743904e-06, + "loss": 0.6062, + "step": 10122 + }, + { + "epoch": 0.9051323319027181, + "grad_norm": 0.14735870469132195, + "learning_rate": 4.6836189739012715e-06, + "loss": 0.677, + "step": 10123 + }, + { + "epoch": 0.9052217453505007, + "grad_norm": 0.1270770013982934, + "learning_rate": 4.6748638544332644e-06, + "loss": 0.651, + "step": 10124 + }, + { + "epoch": 0.9053111587982833, + "grad_norm": 0.1555727815685006, + "learning_rate": 4.66611672976931e-06, + "loss": 0.5992, + "step": 10125 + }, + { + "epoch": 0.9054005722460658, + "grad_norm": 0.14182628241122872, + "learning_rate": 4.6573776006430205e-06, + "loss": 0.6531, + "step": 10126 + }, + { + "epoch": 0.9054899856938483, + "grad_norm": 0.13802434580479123, + "learning_rate": 4.6486464677873094e-06, + "loss": 0.6237, + "step": 10127 + }, + { + "epoch": 0.9055793991416309, + "grad_norm": 0.13339477657657298, + "learning_rate": 4.639923331934471e-06, + "loss": 0.6013, + "step": 10128 + }, + { + "epoch": 0.9056688125894135, + "grad_norm": 0.15193291763382416, + "learning_rate": 4.631208193816083e-06, + "loss": 0.5953, + "step": 10129 + }, + { + "epoch": 0.905758226037196, + "grad_norm": 0.1449877862418937, + "learning_rate": 4.622501054163098e-06, + "loss": 0.6169, + "step": 10130 + }, + { + "epoch": 0.9058476394849786, + "grad_norm": 0.14818474540489995, + "learning_rate": 4.613801913705773e-06, + "loss": 0.624, + "step": 10131 + }, + { + "epoch": 0.905937052932761, + "grad_norm": 0.14858261440888462, + "learning_rate": 4.605110773173682e-06, + "loss": 0.6167, + "step": 10132 + }, + { + "epoch": 0.9060264663805436, + "grad_norm": 0.14008981844760773, + "learning_rate": 4.59642763329573e-06, + "loss": 0.6001, + "step": 10133 + }, + { + "epoch": 0.9061158798283262, + "grad_norm": 0.13154398711236953, + "learning_rate": 4.5877524948001905e-06, + "loss": 0.6035, + "step": 10134 + }, + { + "epoch": 0.9062052932761088, + "grad_norm": 0.1373429148646205, + "learning_rate": 4.5790853584146035e-06, + "loss": 0.6229, + "step": 10135 + }, + { + "epoch": 0.9062947067238912, + "grad_norm": 0.1464231236098808, + "learning_rate": 4.570426224865876e-06, + "loss": 0.6327, + "step": 10136 + }, + { + "epoch": 0.9063841201716738, + "grad_norm": 0.16438609748561153, + "learning_rate": 4.561775094880283e-06, + "loss": 0.6314, + "step": 10137 + }, + { + "epoch": 0.9064735336194564, + "grad_norm": 0.14817996503186653, + "learning_rate": 4.5531319691833326e-06, + "loss": 0.6138, + "step": 10138 + }, + { + "epoch": 0.906562947067239, + "grad_norm": 0.1456378081755216, + "learning_rate": 4.544496848499946e-06, + "loss": 0.6341, + "step": 10139 + }, + { + "epoch": 0.9066523605150214, + "grad_norm": 0.13682117499904284, + "learning_rate": 4.535869733554332e-06, + "loss": 0.5934, + "step": 10140 + }, + { + "epoch": 0.906741773962804, + "grad_norm": 0.12256700624333275, + "learning_rate": 4.527250625070012e-06, + "loss": 0.6219, + "step": 10141 + }, + { + "epoch": 0.9068311874105865, + "grad_norm": 0.1349980876444065, + "learning_rate": 4.518639523769897e-06, + "loss": 0.6069, + "step": 10142 + }, + { + "epoch": 0.9069206008583691, + "grad_norm": 0.15838729961397285, + "learning_rate": 4.510036430376152e-06, + "loss": 0.6655, + "step": 10143 + }, + { + "epoch": 0.9070100143061517, + "grad_norm": 0.14052759133644857, + "learning_rate": 4.501441345610347e-06, + "loss": 0.6295, + "step": 10144 + }, + { + "epoch": 0.9070994277539342, + "grad_norm": 0.1380684001642858, + "learning_rate": 4.492854270193325e-06, + "loss": 0.6422, + "step": 10145 + }, + { + "epoch": 0.9071888412017167, + "grad_norm": 0.1293740703007173, + "learning_rate": 4.4842752048452676e-06, + "loss": 0.5613, + "step": 10146 + }, + { + "epoch": 0.9072782546494993, + "grad_norm": 0.14590857876235244, + "learning_rate": 4.475704150285687e-06, + "loss": 0.6507, + "step": 10147 + }, + { + "epoch": 0.9073676680972819, + "grad_norm": 0.12296326773008869, + "learning_rate": 4.4671411072334526e-06, + "loss": 0.5741, + "step": 10148 + }, + { + "epoch": 0.9074570815450643, + "grad_norm": 0.15211965486059398, + "learning_rate": 4.458586076406701e-06, + "loss": 0.6657, + "step": 10149 + }, + { + "epoch": 0.9075464949928469, + "grad_norm": 0.13825699986021614, + "learning_rate": 4.450039058522948e-06, + "loss": 0.6097, + "step": 10150 + }, + { + "epoch": 0.9076359084406295, + "grad_norm": 0.13084681592095865, + "learning_rate": 4.441500054299042e-06, + "loss": 0.6087, + "step": 10151 + }, + { + "epoch": 0.907725321888412, + "grad_norm": 0.1430977302963023, + "learning_rate": 4.432969064451109e-06, + "loss": 0.6063, + "step": 10152 + }, + { + "epoch": 0.9078147353361946, + "grad_norm": 0.15425491436534214, + "learning_rate": 4.424446089694645e-06, + "loss": 0.6703, + "step": 10153 + }, + { + "epoch": 0.9079041487839771, + "grad_norm": 0.134178672976284, + "learning_rate": 4.415931130744477e-06, + "loss": 0.6216, + "step": 10154 + }, + { + "epoch": 0.9079935622317596, + "grad_norm": 0.14573001951228526, + "learning_rate": 4.407424188314713e-06, + "loss": 0.5762, + "step": 10155 + }, + { + "epoch": 0.9080829756795422, + "grad_norm": 0.13313326767979697, + "learning_rate": 4.398925263118836e-06, + "loss": 0.5837, + "step": 10156 + }, + { + "epoch": 0.9081723891273248, + "grad_norm": 0.1343469868989919, + "learning_rate": 4.390434355869643e-06, + "loss": 0.6279, + "step": 10157 + }, + { + "epoch": 0.9082618025751072, + "grad_norm": 0.13491472087806633, + "learning_rate": 4.381951467279244e-06, + "loss": 0.6118, + "step": 10158 + }, + { + "epoch": 0.9083512160228898, + "grad_norm": 0.15919834532065683, + "learning_rate": 4.373476598059112e-06, + "loss": 0.6143, + "step": 10159 + }, + { + "epoch": 0.9084406294706724, + "grad_norm": 0.15390826839678637, + "learning_rate": 4.365009748920012e-06, + "loss": 0.627, + "step": 10160 + }, + { + "epoch": 0.908530042918455, + "grad_norm": 0.1463567990751427, + "learning_rate": 4.356550920572044e-06, + "loss": 0.6595, + "step": 10161 + }, + { + "epoch": 0.9086194563662375, + "grad_norm": 0.16594408275689, + "learning_rate": 4.348100113724629e-06, + "loss": 0.6875, + "step": 10162 + }, + { + "epoch": 0.90870886981402, + "grad_norm": 0.12899712085166937, + "learning_rate": 4.339657329086566e-06, + "loss": 0.6089, + "step": 10163 + }, + { + "epoch": 0.9087982832618026, + "grad_norm": 0.1358009857819722, + "learning_rate": 4.331222567365878e-06, + "loss": 0.5982, + "step": 10164 + }, + { + "epoch": 0.9088876967095851, + "grad_norm": 0.1370912956188444, + "learning_rate": 4.322795829270043e-06, + "loss": 0.622, + "step": 10165 + }, + { + "epoch": 0.9089771101573677, + "grad_norm": 0.13603540604153277, + "learning_rate": 4.314377115505763e-06, + "loss": 0.624, + "step": 10166 + }, + { + "epoch": 0.9090665236051502, + "grad_norm": 0.13587009973016936, + "learning_rate": 4.305966426779118e-06, + "loss": 0.6191, + "step": 10167 + }, + { + "epoch": 0.9091559370529327, + "grad_norm": 0.14933291296590206, + "learning_rate": 4.297563763795509e-06, + "loss": 0.6447, + "step": 10168 + }, + { + "epoch": 0.9092453505007153, + "grad_norm": 0.14214262865839894, + "learning_rate": 4.289169127259629e-06, + "loss": 0.6098, + "step": 10169 + }, + { + "epoch": 0.9093347639484979, + "grad_norm": 0.1378833996095821, + "learning_rate": 4.280782517875548e-06, + "loss": 0.6181, + "step": 10170 + }, + { + "epoch": 0.9094241773962805, + "grad_norm": 0.1508223327772943, + "learning_rate": 4.272403936346647e-06, + "loss": 0.6255, + "step": 10171 + }, + { + "epoch": 0.9095135908440629, + "grad_norm": 0.13402117252664317, + "learning_rate": 4.26403338337561e-06, + "loss": 0.6226, + "step": 10172 + }, + { + "epoch": 0.9096030042918455, + "grad_norm": 0.13557081172204458, + "learning_rate": 4.255670859664474e-06, + "loss": 0.6089, + "step": 10173 + }, + { + "epoch": 0.9096924177396281, + "grad_norm": 0.14559652302827963, + "learning_rate": 4.2473163659146e-06, + "loss": 0.6278, + "step": 10174 + }, + { + "epoch": 0.9097818311874106, + "grad_norm": 0.14908797367202736, + "learning_rate": 4.238969902826662e-06, + "loss": 0.6502, + "step": 10175 + }, + { + "epoch": 0.9098712446351931, + "grad_norm": 0.141449793246254, + "learning_rate": 4.230631471100655e-06, + "loss": 0.5967, + "step": 10176 + }, + { + "epoch": 0.9099606580829757, + "grad_norm": 0.1476482490630792, + "learning_rate": 4.222301071435952e-06, + "loss": 0.6367, + "step": 10177 + }, + { + "epoch": 0.9100500715307582, + "grad_norm": 0.14157814034590468, + "learning_rate": 4.213978704531152e-06, + "loss": 0.6764, + "step": 10178 + }, + { + "epoch": 0.9101394849785408, + "grad_norm": 0.1387797809816435, + "learning_rate": 4.205664371084306e-06, + "loss": 0.6134, + "step": 10179 + }, + { + "epoch": 0.9102288984263234, + "grad_norm": 0.1316642971005902, + "learning_rate": 4.19735807179269e-06, + "loss": 0.6334, + "step": 10180 + }, + { + "epoch": 0.9103183118741058, + "grad_norm": 0.14310234606562422, + "learning_rate": 4.189059807352958e-06, + "loss": 0.6265, + "step": 10181 + }, + { + "epoch": 0.9104077253218884, + "grad_norm": 0.13722534911341583, + "learning_rate": 4.180769578461063e-06, + "loss": 0.6505, + "step": 10182 + }, + { + "epoch": 0.910497138769671, + "grad_norm": 0.1456555521179743, + "learning_rate": 4.172487385812307e-06, + "loss": 0.6264, + "step": 10183 + }, + { + "epoch": 0.9105865522174535, + "grad_norm": 0.15404101507455703, + "learning_rate": 4.164213230101299e-06, + "loss": 0.6394, + "step": 10184 + }, + { + "epoch": 0.910675965665236, + "grad_norm": 0.14678900730179356, + "learning_rate": 4.155947112021985e-06, + "loss": 0.5946, + "step": 10185 + }, + { + "epoch": 0.9107653791130186, + "grad_norm": 0.14517322524343126, + "learning_rate": 4.147689032267643e-06, + "loss": 0.6598, + "step": 10186 + }, + { + "epoch": 0.9108547925608012, + "grad_norm": 0.12707686897454906, + "learning_rate": 4.139438991530853e-06, + "loss": 0.6116, + "step": 10187 + }, + { + "epoch": 0.9109442060085837, + "grad_norm": 0.16159009104461738, + "learning_rate": 4.131196990503561e-06, + "loss": 0.6541, + "step": 10188 + }, + { + "epoch": 0.9110336194563662, + "grad_norm": 0.1341303119722451, + "learning_rate": 4.1229630298769914e-06, + "loss": 0.6285, + "step": 10189 + }, + { + "epoch": 0.9111230329041488, + "grad_norm": 0.14273343837228744, + "learning_rate": 4.114737110341715e-06, + "loss": 0.6257, + "step": 10190 + }, + { + "epoch": 0.9112124463519313, + "grad_norm": 0.15571642450409828, + "learning_rate": 4.106519232587647e-06, + "loss": 0.6656, + "step": 10191 + }, + { + "epoch": 0.9113018597997139, + "grad_norm": 0.14287318708312097, + "learning_rate": 4.098309397303978e-06, + "loss": 0.5929, + "step": 10192 + }, + { + "epoch": 0.9113912732474965, + "grad_norm": 0.1264203485677019, + "learning_rate": 4.090107605179294e-06, + "loss": 0.6542, + "step": 10193 + }, + { + "epoch": 0.9114806866952789, + "grad_norm": 0.13166470449220924, + "learning_rate": 4.081913856901476e-06, + "loss": 0.6455, + "step": 10194 + }, + { + "epoch": 0.9115701001430615, + "grad_norm": 0.14030698653512685, + "learning_rate": 4.073728153157674e-06, + "loss": 0.6386, + "step": 10195 + }, + { + "epoch": 0.9116595135908441, + "grad_norm": 0.1379930811651166, + "learning_rate": 4.065550494634451e-06, + "loss": 0.5966, + "step": 10196 + }, + { + "epoch": 0.9117489270386266, + "grad_norm": 0.1472932444520674, + "learning_rate": 4.057380882017658e-06, + "loss": 0.6177, + "step": 10197 + }, + { + "epoch": 0.9118383404864091, + "grad_norm": 0.1449834100571808, + "learning_rate": 4.049219315992458e-06, + "loss": 0.6324, + "step": 10198 + }, + { + "epoch": 0.9119277539341917, + "grad_norm": 0.1486463933647249, + "learning_rate": 4.041065797243349e-06, + "loss": 0.6518, + "step": 10199 + }, + { + "epoch": 0.9120171673819742, + "grad_norm": 0.12430480370101285, + "learning_rate": 4.032920326454159e-06, + "loss": 0.5938, + "step": 10200 + }, + { + "epoch": 0.9121065808297568, + "grad_norm": 0.13582606670869754, + "learning_rate": 4.0247829043080445e-06, + "loss": 0.623, + "step": 10201 + }, + { + "epoch": 0.9121959942775394, + "grad_norm": 0.12835835485016958, + "learning_rate": 4.016653531487491e-06, + "loss": 0.6024, + "step": 10202 + }, + { + "epoch": 0.9122854077253219, + "grad_norm": 0.14022576513105298, + "learning_rate": 4.008532208674276e-06, + "loss": 0.6089, + "step": 10203 + }, + { + "epoch": 0.9123748211731044, + "grad_norm": 0.14540768763743242, + "learning_rate": 4.000418936549533e-06, + "loss": 0.6317, + "step": 10204 + }, + { + "epoch": 0.912464234620887, + "grad_norm": 0.13366524684109335, + "learning_rate": 3.992313715793727e-06, + "loss": 0.5782, + "step": 10205 + }, + { + "epoch": 0.9125536480686696, + "grad_norm": 0.13435805620896824, + "learning_rate": 3.984216547086606e-06, + "loss": 0.6107, + "step": 10206 + }, + { + "epoch": 0.912643061516452, + "grad_norm": 0.13529904742929535, + "learning_rate": 3.97612743110729e-06, + "loss": 0.6011, + "step": 10207 + }, + { + "epoch": 0.9127324749642346, + "grad_norm": 0.1284023555875822, + "learning_rate": 3.968046368534217e-06, + "loss": 0.6092, + "step": 10208 + }, + { + "epoch": 0.9128218884120172, + "grad_norm": 0.15664533681157847, + "learning_rate": 3.9599733600450995e-06, + "loss": 0.6579, + "step": 10209 + }, + { + "epoch": 0.9129113018597997, + "grad_norm": 0.1575282508871656, + "learning_rate": 3.95190840631704e-06, + "loss": 0.6164, + "step": 10210 + }, + { + "epoch": 0.9130007153075823, + "grad_norm": 0.14350187793388794, + "learning_rate": 3.94385150802643e-06, + "loss": 0.6533, + "step": 10211 + }, + { + "epoch": 0.9130901287553648, + "grad_norm": 0.1453847261585334, + "learning_rate": 3.9358026658489535e-06, + "loss": 0.6174, + "step": 10212 + }, + { + "epoch": 0.9131795422031473, + "grad_norm": 0.14659894850952043, + "learning_rate": 3.927761880459735e-06, + "loss": 0.6294, + "step": 10213 + }, + { + "epoch": 0.9132689556509299, + "grad_norm": 0.16141159930842208, + "learning_rate": 3.91972915253308e-06, + "loss": 0.6124, + "step": 10214 + }, + { + "epoch": 0.9133583690987125, + "grad_norm": 0.13790358597510852, + "learning_rate": 3.9117044827427066e-06, + "loss": 0.6515, + "step": 10215 + }, + { + "epoch": 0.913447782546495, + "grad_norm": 0.1362909905263737, + "learning_rate": 3.90368787176163e-06, + "loss": 0.6468, + "step": 10216 + }, + { + "epoch": 0.9135371959942775, + "grad_norm": 0.13931362444374906, + "learning_rate": 3.895679320262202e-06, + "loss": 0.646, + "step": 10217 + }, + { + "epoch": 0.9136266094420601, + "grad_norm": 0.1336217009796931, + "learning_rate": 3.8876788289160855e-06, + "loss": 0.6232, + "step": 10218 + }, + { + "epoch": 0.9137160228898427, + "grad_norm": 0.146421456320253, + "learning_rate": 3.879686398394267e-06, + "loss": 0.6847, + "step": 10219 + }, + { + "epoch": 0.9138054363376252, + "grad_norm": 0.15151288001882857, + "learning_rate": 3.871702029367064e-06, + "loss": 0.5899, + "step": 10220 + }, + { + "epoch": 0.9138948497854077, + "grad_norm": 0.14649242387049383, + "learning_rate": 3.863725722504119e-06, + "loss": 0.6779, + "step": 10221 + }, + { + "epoch": 0.9139842632331903, + "grad_norm": 0.13261979958618705, + "learning_rate": 3.8557574784744085e-06, + "loss": 0.6206, + "step": 10222 + }, + { + "epoch": 0.9140736766809728, + "grad_norm": 0.14729459655842814, + "learning_rate": 3.847797297946198e-06, + "loss": 0.5642, + "step": 10223 + }, + { + "epoch": 0.9141630901287554, + "grad_norm": 0.12650826926849454, + "learning_rate": 3.839845181587098e-06, + "loss": 0.5854, + "step": 10224 + }, + { + "epoch": 0.9142525035765379, + "grad_norm": 0.13820437979324557, + "learning_rate": 3.831901130064064e-06, + "loss": 0.6341, + "step": 10225 + }, + { + "epoch": 0.9143419170243204, + "grad_norm": 0.11916205393157507, + "learning_rate": 3.823965144043318e-06, + "loss": 0.621, + "step": 10226 + }, + { + "epoch": 0.914431330472103, + "grad_norm": 0.15644800775184295, + "learning_rate": 3.816037224190483e-06, + "loss": 0.6441, + "step": 10227 + }, + { + "epoch": 0.9145207439198856, + "grad_norm": 0.13168600537486963, + "learning_rate": 3.8081173711704497e-06, + "loss": 0.6121, + "step": 10228 + }, + { + "epoch": 0.914610157367668, + "grad_norm": 0.1339926626792496, + "learning_rate": 3.8002055856474206e-06, + "loss": 0.6065, + "step": 10229 + }, + { + "epoch": 0.9146995708154506, + "grad_norm": 0.1369298339294501, + "learning_rate": 3.7923018682849864e-06, + "loss": 0.6113, + "step": 10230 + }, + { + "epoch": 0.9147889842632332, + "grad_norm": 0.14980107582470986, + "learning_rate": 3.784406219746006e-06, + "loss": 0.6189, + "step": 10231 + }, + { + "epoch": 0.9148783977110158, + "grad_norm": 0.12997956102239938, + "learning_rate": 3.7765186406926722e-06, + "loss": 0.6253, + "step": 10232 + }, + { + "epoch": 0.9149678111587983, + "grad_norm": 0.14368929181624893, + "learning_rate": 3.768639131786511e-06, + "loss": 0.618, + "step": 10233 + }, + { + "epoch": 0.9150572246065808, + "grad_norm": 0.1506744006932612, + "learning_rate": 3.760767693688361e-06, + "loss": 0.6245, + "step": 10234 + }, + { + "epoch": 0.9151466380543634, + "grad_norm": 0.14958026249944625, + "learning_rate": 3.752904327058404e-06, + "loss": 0.6815, + "step": 10235 + }, + { + "epoch": 0.9152360515021459, + "grad_norm": 0.13331332243835337, + "learning_rate": 3.745049032556125e-06, + "loss": 0.6446, + "step": 10236 + }, + { + "epoch": 0.9153254649499285, + "grad_norm": 0.14047388581217107, + "learning_rate": 3.7372018108403405e-06, + "loss": 0.6503, + "step": 10237 + }, + { + "epoch": 0.915414878397711, + "grad_norm": 0.1385597609463663, + "learning_rate": 3.729362662569169e-06, + "loss": 0.6239, + "step": 10238 + }, + { + "epoch": 0.9155042918454935, + "grad_norm": 0.13498988686904528, + "learning_rate": 3.7215315884000957e-06, + "loss": 0.6496, + "step": 10239 + }, + { + "epoch": 0.9155937052932761, + "grad_norm": 0.13108289445730675, + "learning_rate": 3.7137085889898947e-06, + "loss": 0.6173, + "step": 10240 + }, + { + "epoch": 0.9156831187410587, + "grad_norm": 0.14882068274971344, + "learning_rate": 3.705893664994664e-06, + "loss": 0.6355, + "step": 10241 + }, + { + "epoch": 0.9157725321888412, + "grad_norm": 0.13234899264368477, + "learning_rate": 3.6980868170698456e-06, + "loss": 0.633, + "step": 10242 + }, + { + "epoch": 0.9158619456366237, + "grad_norm": 0.15647293501890308, + "learning_rate": 3.6902880458701826e-06, + "loss": 0.6253, + "step": 10243 + }, + { + "epoch": 0.9159513590844063, + "grad_norm": 0.13744991495312955, + "learning_rate": 3.6824973520497408e-06, + "loss": 0.6226, + "step": 10244 + }, + { + "epoch": 0.9160407725321889, + "grad_norm": 0.14019208587959653, + "learning_rate": 3.6747147362619304e-06, + "loss": 0.6225, + "step": 10245 + }, + { + "epoch": 0.9161301859799714, + "grad_norm": 0.1401903981260308, + "learning_rate": 3.666940199159463e-06, + "loss": 0.6167, + "step": 10246 + }, + { + "epoch": 0.9162195994277539, + "grad_norm": 0.13040092148530105, + "learning_rate": 3.6591737413943616e-06, + "loss": 0.604, + "step": 10247 + }, + { + "epoch": 0.9163090128755365, + "grad_norm": 0.13252898814076625, + "learning_rate": 3.6514153636180383e-06, + "loss": 0.6312, + "step": 10248 + }, + { + "epoch": 0.916398426323319, + "grad_norm": 0.13130037471228637, + "learning_rate": 3.643665066481128e-06, + "loss": 0.6104, + "step": 10249 + }, + { + "epoch": 0.9164878397711016, + "grad_norm": 0.14861872545524374, + "learning_rate": 3.635922850633666e-06, + "loss": 0.6075, + "step": 10250 + }, + { + "epoch": 0.9165772532188842, + "grad_norm": 0.14401499315486918, + "learning_rate": 3.6281887167249895e-06, + "loss": 0.6269, + "step": 10251 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.13374746673660537, + "learning_rate": 3.6204626654037233e-06, + "loss": 0.635, + "step": 10252 + }, + { + "epoch": 0.9167560801144492, + "grad_norm": 0.14940379050182795, + "learning_rate": 3.612744697317849e-06, + "loss": 0.6494, + "step": 10253 + }, + { + "epoch": 0.9168454935622318, + "grad_norm": 0.1418279914487777, + "learning_rate": 3.6050348131146825e-06, + "loss": 0.6211, + "step": 10254 + }, + { + "epoch": 0.9169349070100143, + "grad_norm": 0.1433400782902062, + "learning_rate": 3.597333013440829e-06, + "loss": 0.6416, + "step": 10255 + }, + { + "epoch": 0.9170243204577968, + "grad_norm": 0.13213803622822445, + "learning_rate": 3.5896392989422377e-06, + "loss": 0.6103, + "step": 10256 + }, + { + "epoch": 0.9171137339055794, + "grad_norm": 0.13542863275471584, + "learning_rate": 3.5819536702641485e-06, + "loss": 0.6301, + "step": 10257 + }, + { + "epoch": 0.917203147353362, + "grad_norm": 0.1353254650586485, + "learning_rate": 3.5742761280511685e-06, + "loss": 0.6341, + "step": 10258 + }, + { + "epoch": 0.9172925608011445, + "grad_norm": 0.14766505368534985, + "learning_rate": 3.566606672947204e-06, + "loss": 0.6249, + "step": 10259 + }, + { + "epoch": 0.9173819742489271, + "grad_norm": 0.1367622222396889, + "learning_rate": 3.5589453055954737e-06, + "loss": 0.619, + "step": 10260 + }, + { + "epoch": 0.9174713876967096, + "grad_norm": 0.13622775538026546, + "learning_rate": 3.5512920266385085e-06, + "loss": 0.602, + "step": 10261 + }, + { + "epoch": 0.9175608011444921, + "grad_norm": 0.12720639472138331, + "learning_rate": 3.5436468367182284e-06, + "loss": 0.6315, + "step": 10262 + }, + { + "epoch": 0.9176502145922747, + "grad_norm": 0.1390095320860383, + "learning_rate": 3.536009736475787e-06, + "loss": 0.6216, + "step": 10263 + }, + { + "epoch": 0.9177396280400573, + "grad_norm": 0.1508572564919802, + "learning_rate": 3.5283807265517053e-06, + "loss": 0.6544, + "step": 10264 + }, + { + "epoch": 0.9178290414878397, + "grad_norm": 0.13786232177061516, + "learning_rate": 3.5207598075858383e-06, + "loss": 0.6037, + "step": 10265 + }, + { + "epoch": 0.9179184549356223, + "grad_norm": 0.14092316909646782, + "learning_rate": 3.5131469802173076e-06, + "loss": 0.6212, + "step": 10266 + }, + { + "epoch": 0.9180078683834049, + "grad_norm": 0.16428595209918917, + "learning_rate": 3.5055422450846253e-06, + "loss": 0.6937, + "step": 10267 + }, + { + "epoch": 0.9180972818311874, + "grad_norm": 0.13784377552735236, + "learning_rate": 3.4979456028255806e-06, + "loss": 0.6054, + "step": 10268 + }, + { + "epoch": 0.91818669527897, + "grad_norm": 0.1484208849598634, + "learning_rate": 3.4903570540772866e-06, + "loss": 0.6489, + "step": 10269 + }, + { + "epoch": 0.9182761087267525, + "grad_norm": 0.14028445918788335, + "learning_rate": 3.482776599476201e-06, + "loss": 0.6498, + "step": 10270 + }, + { + "epoch": 0.918365522174535, + "grad_norm": 0.15030549230036525, + "learning_rate": 3.4752042396580807e-06, + "loss": 0.614, + "step": 10271 + }, + { + "epoch": 0.9184549356223176, + "grad_norm": 0.15753630704478583, + "learning_rate": 3.467639975257997e-06, + "loss": 0.6427, + "step": 10272 + }, + { + "epoch": 0.9185443490701002, + "grad_norm": 0.13209357221586673, + "learning_rate": 3.4600838069103635e-06, + "loss": 0.6252, + "step": 10273 + }, + { + "epoch": 0.9186337625178826, + "grad_norm": 0.15306171709295402, + "learning_rate": 3.4525357352489295e-06, + "loss": 0.642, + "step": 10274 + }, + { + "epoch": 0.9187231759656652, + "grad_norm": 0.13106097446373652, + "learning_rate": 3.4449957609066996e-06, + "loss": 0.6379, + "step": 10275 + }, + { + "epoch": 0.9188125894134478, + "grad_norm": 0.1494792143643428, + "learning_rate": 3.43746388451609e-06, + "loss": 0.6336, + "step": 10276 + }, + { + "epoch": 0.9189020028612304, + "grad_norm": 0.15184801822003127, + "learning_rate": 3.429940106708751e-06, + "loss": 0.6389, + "step": 10277 + }, + { + "epoch": 0.9189914163090128, + "grad_norm": 0.14835619170249773, + "learning_rate": 3.422424428115711e-06, + "loss": 0.6344, + "step": 10278 + }, + { + "epoch": 0.9190808297567954, + "grad_norm": 0.13764775226054538, + "learning_rate": 3.4149168493673113e-06, + "loss": 0.646, + "step": 10279 + }, + { + "epoch": 0.919170243204578, + "grad_norm": 0.14233267940473598, + "learning_rate": 3.40741737109318e-06, + "loss": 0.5802, + "step": 10280 + }, + { + "epoch": 0.9192596566523605, + "grad_norm": 0.15609278309696953, + "learning_rate": 3.3999259939222927e-06, + "loss": 0.6156, + "step": 10281 + }, + { + "epoch": 0.9193490701001431, + "grad_norm": 0.13260257751893523, + "learning_rate": 3.3924427184829575e-06, + "loss": 0.5784, + "step": 10282 + }, + { + "epoch": 0.9194384835479256, + "grad_norm": 0.11786711016666011, + "learning_rate": 3.3849675454027727e-06, + "loss": 0.5672, + "step": 10283 + }, + { + "epoch": 0.9195278969957081, + "grad_norm": 0.1485280509779639, + "learning_rate": 3.3775004753086812e-06, + "loss": 0.6173, + "step": 10284 + }, + { + "epoch": 0.9196173104434907, + "grad_norm": 0.14075399750564851, + "learning_rate": 3.3700415088269377e-06, + "loss": 0.5467, + "step": 10285 + }, + { + "epoch": 0.9197067238912733, + "grad_norm": 0.15771461992703034, + "learning_rate": 3.362590646583108e-06, + "loss": 0.6129, + "step": 10286 + }, + { + "epoch": 0.9197961373390557, + "grad_norm": 0.1343620551276853, + "learning_rate": 3.3551478892020926e-06, + "loss": 0.6153, + "step": 10287 + }, + { + "epoch": 0.9198855507868383, + "grad_norm": 0.15177112240482832, + "learning_rate": 3.3477132373081254e-06, + "loss": 0.6548, + "step": 10288 + }, + { + "epoch": 0.9199749642346209, + "grad_norm": 0.1513805555675183, + "learning_rate": 3.3402866915246854e-06, + "loss": 0.6565, + "step": 10289 + }, + { + "epoch": 0.9200643776824035, + "grad_norm": 0.1482197481903374, + "learning_rate": 3.3328682524746967e-06, + "loss": 0.6288, + "step": 10290 + }, + { + "epoch": 0.920153791130186, + "grad_norm": 0.14097668878973868, + "learning_rate": 3.325457920780295e-06, + "loss": 0.6227, + "step": 10291 + }, + { + "epoch": 0.9202432045779685, + "grad_norm": 0.14084148918547612, + "learning_rate": 3.318055697062983e-06, + "loss": 0.6238, + "step": 10292 + }, + { + "epoch": 0.9203326180257511, + "grad_norm": 0.1326460062627357, + "learning_rate": 3.310661581943586e-06, + "loss": 0.6079, + "step": 10293 + }, + { + "epoch": 0.9204220314735336, + "grad_norm": 0.1440023612167947, + "learning_rate": 3.3032755760422196e-06, + "loss": 0.6132, + "step": 10294 + }, + { + "epoch": 0.9205114449213162, + "grad_norm": 0.16214516962458822, + "learning_rate": 3.2958976799783326e-06, + "loss": 0.7046, + "step": 10295 + }, + { + "epoch": 0.9206008583690987, + "grad_norm": 0.1324631187732574, + "learning_rate": 3.288527894370752e-06, + "loss": 0.5842, + "step": 10296 + }, + { + "epoch": 0.9206902718168812, + "grad_norm": 0.15024388255057153, + "learning_rate": 3.281166219837517e-06, + "loss": 0.6347, + "step": 10297 + }, + { + "epoch": 0.9207796852646638, + "grad_norm": 0.1380508134979402, + "learning_rate": 3.273812656996067e-06, + "loss": 0.6323, + "step": 10298 + }, + { + "epoch": 0.9208690987124464, + "grad_norm": 0.1409234648202746, + "learning_rate": 3.2664672064631528e-06, + "loss": 0.6354, + "step": 10299 + }, + { + "epoch": 0.920958512160229, + "grad_norm": 0.1309871909171701, + "learning_rate": 3.2591298688547932e-06, + "loss": 0.5927, + "step": 10300 + }, + { + "epoch": 0.9210479256080114, + "grad_norm": 0.14713417956165845, + "learning_rate": 3.2518006447863847e-06, + "loss": 0.6158, + "step": 10301 + }, + { + "epoch": 0.921137339055794, + "grad_norm": 0.14374533718878774, + "learning_rate": 3.244479534872602e-06, + "loss": 0.6642, + "step": 10302 + }, + { + "epoch": 0.9212267525035766, + "grad_norm": 0.14870872426108867, + "learning_rate": 3.2371665397274763e-06, + "loss": 0.6443, + "step": 10303 + }, + { + "epoch": 0.9213161659513591, + "grad_norm": 0.13821877560972842, + "learning_rate": 3.2298616599643285e-06, + "loss": 0.6604, + "step": 10304 + }, + { + "epoch": 0.9214055793991416, + "grad_norm": 0.14072183585698636, + "learning_rate": 3.2225648961958344e-06, + "loss": 0.6286, + "step": 10305 + }, + { + "epoch": 0.9214949928469242, + "grad_norm": 0.14704599830737242, + "learning_rate": 3.215276249033927e-06, + "loss": 0.6299, + "step": 10306 + }, + { + "epoch": 0.9215844062947067, + "grad_norm": 0.11905022698311854, + "learning_rate": 3.207995719089918e-06, + "loss": 0.5879, + "step": 10307 + }, + { + "epoch": 0.9216738197424893, + "grad_norm": 0.12222029692761968, + "learning_rate": 3.200723306974418e-06, + "loss": 0.5987, + "step": 10308 + }, + { + "epoch": 0.9217632331902719, + "grad_norm": 0.15841450510068975, + "learning_rate": 3.1934590132973283e-06, + "loss": 0.6579, + "step": 10309 + }, + { + "epoch": 0.9218526466380543, + "grad_norm": 0.13283086537367972, + "learning_rate": 3.186202838667951e-06, + "loss": 0.6467, + "step": 10310 + }, + { + "epoch": 0.9219420600858369, + "grad_norm": 0.14345178858572363, + "learning_rate": 3.1789547836947986e-06, + "loss": 0.6364, + "step": 10311 + }, + { + "epoch": 0.9220314735336195, + "grad_norm": 0.13020844171922102, + "learning_rate": 3.171714848985785e-06, + "loss": 0.6349, + "step": 10312 + }, + { + "epoch": 0.922120886981402, + "grad_norm": 0.1422309501804655, + "learning_rate": 3.164483035148114e-06, + "loss": 0.6247, + "step": 10313 + }, + { + "epoch": 0.9222103004291845, + "grad_norm": 0.14240330892905095, + "learning_rate": 3.157259342788299e-06, + "loss": 0.6567, + "step": 10314 + }, + { + "epoch": 0.9222997138769671, + "grad_norm": 0.14053708597825465, + "learning_rate": 3.150043772512179e-06, + "loss": 0.6475, + "step": 10315 + }, + { + "epoch": 0.9223891273247496, + "grad_norm": 0.1323551722432141, + "learning_rate": 3.1428363249249247e-06, + "loss": 0.6113, + "step": 10316 + }, + { + "epoch": 0.9224785407725322, + "grad_norm": 0.13878176808353965, + "learning_rate": 3.1356370006310197e-06, + "loss": 0.6222, + "step": 10317 + }, + { + "epoch": 0.9225679542203148, + "grad_norm": 0.11801032742252321, + "learning_rate": 3.1284458002342475e-06, + "loss": 0.6021, + "step": 10318 + }, + { + "epoch": 0.9226573676680973, + "grad_norm": 0.13498860342774174, + "learning_rate": 3.121262724337748e-06, + "loss": 0.6569, + "step": 10319 + }, + { + "epoch": 0.9227467811158798, + "grad_norm": 0.135122123912213, + "learning_rate": 3.1140877735439387e-06, + "loss": 0.588, + "step": 10320 + }, + { + "epoch": 0.9228361945636624, + "grad_norm": 0.1609279530114907, + "learning_rate": 3.1069209484545725e-06, + "loss": 0.7021, + "step": 10321 + }, + { + "epoch": 0.922925608011445, + "grad_norm": 0.1407491198733395, + "learning_rate": 3.0997622496707456e-06, + "loss": 0.6282, + "step": 10322 + }, + { + "epoch": 0.9230150214592274, + "grad_norm": 0.15327200589577658, + "learning_rate": 3.0926116777928116e-06, + "loss": 0.6169, + "step": 10323 + }, + { + "epoch": 0.92310443490701, + "grad_norm": 0.14691699444808537, + "learning_rate": 3.0854692334205125e-06, + "loss": 0.647, + "step": 10324 + }, + { + "epoch": 0.9231938483547926, + "grad_norm": 0.13626145795476569, + "learning_rate": 3.0783349171528697e-06, + "loss": 0.6141, + "step": 10325 + }, + { + "epoch": 0.9232832618025751, + "grad_norm": 0.14823350119560458, + "learning_rate": 3.0712087295882154e-06, + "loss": 0.6353, + "step": 10326 + }, + { + "epoch": 0.9233726752503576, + "grad_norm": 0.14336430783956916, + "learning_rate": 3.064090671324238e-06, + "loss": 0.6567, + "step": 10327 + }, + { + "epoch": 0.9234620886981402, + "grad_norm": 0.15619919682692746, + "learning_rate": 3.0569807429579044e-06, + "loss": 0.6765, + "step": 10328 + }, + { + "epoch": 0.9235515021459227, + "grad_norm": 0.13117773099510974, + "learning_rate": 3.0498789450855046e-06, + "loss": 0.5913, + "step": 10329 + }, + { + "epoch": 0.9236409155937053, + "grad_norm": 0.12246381155643049, + "learning_rate": 3.0427852783026843e-06, + "loss": 0.5672, + "step": 10330 + }, + { + "epoch": 0.9237303290414879, + "grad_norm": 0.15079068390572556, + "learning_rate": 3.0356997432043565e-06, + "loss": 0.6682, + "step": 10331 + }, + { + "epoch": 0.9238197424892703, + "grad_norm": 0.16871141024505823, + "learning_rate": 3.0286223403848014e-06, + "loss": 0.6816, + "step": 10332 + }, + { + "epoch": 0.9239091559370529, + "grad_norm": 0.13899446490912415, + "learning_rate": 3.021553070437577e-06, + "loss": 0.6057, + "step": 10333 + }, + { + "epoch": 0.9239985693848355, + "grad_norm": 0.1281259080292813, + "learning_rate": 3.0144919339555654e-06, + "loss": 0.5908, + "step": 10334 + }, + { + "epoch": 0.9240879828326181, + "grad_norm": 0.1483746248510366, + "learning_rate": 3.0074389315309928e-06, + "loss": 0.6095, + "step": 10335 + }, + { + "epoch": 0.9241773962804005, + "grad_norm": 0.14382742223801548, + "learning_rate": 3.000394063755396e-06, + "loss": 0.6331, + "step": 10336 + }, + { + "epoch": 0.9242668097281831, + "grad_norm": 0.15049861116284374, + "learning_rate": 2.9933573312195708e-06, + "loss": 0.6237, + "step": 10337 + }, + { + "epoch": 0.9243562231759657, + "grad_norm": 0.12652699332596873, + "learning_rate": 2.9863287345137216e-06, + "loss": 0.6285, + "step": 10338 + }, + { + "epoch": 0.9244456366237482, + "grad_norm": 0.1380708600000526, + "learning_rate": 2.979308274227344e-06, + "loss": 0.6392, + "step": 10339 + }, + { + "epoch": 0.9245350500715308, + "grad_norm": 0.13848596121435913, + "learning_rate": 2.9722959509491888e-06, + "loss": 0.6273, + "step": 10340 + }, + { + "epoch": 0.9246244635193133, + "grad_norm": 0.14414520918963752, + "learning_rate": 2.965291765267386e-06, + "loss": 0.6246, + "step": 10341 + }, + { + "epoch": 0.9247138769670958, + "grad_norm": 0.13628443291402934, + "learning_rate": 2.958295717769399e-06, + "loss": 0.634, + "step": 10342 + }, + { + "epoch": 0.9248032904148784, + "grad_norm": 0.12679128806534995, + "learning_rate": 2.9513078090419365e-06, + "loss": 0.6267, + "step": 10343 + }, + { + "epoch": 0.924892703862661, + "grad_norm": 0.16306407608412063, + "learning_rate": 2.944328039671085e-06, + "loss": 0.6768, + "step": 10344 + }, + { + "epoch": 0.9249821173104434, + "grad_norm": 0.15212811996552217, + "learning_rate": 2.93735641024222e-06, + "loss": 0.5703, + "step": 10345 + }, + { + "epoch": 0.925071530758226, + "grad_norm": 0.14611136715599693, + "learning_rate": 2.930392921340053e-06, + "loss": 0.6769, + "step": 10346 + }, + { + "epoch": 0.9251609442060086, + "grad_norm": 0.13388197900353105, + "learning_rate": 2.9234375735486153e-06, + "loss": 0.6411, + "step": 10347 + }, + { + "epoch": 0.9252503576537912, + "grad_norm": 0.14705504205373401, + "learning_rate": 2.916490367451219e-06, + "loss": 0.6683, + "step": 10348 + }, + { + "epoch": 0.9253397711015737, + "grad_norm": 0.13194944461751246, + "learning_rate": 2.909551303630531e-06, + "loss": 0.6274, + "step": 10349 + }, + { + "epoch": 0.9254291845493562, + "grad_norm": 0.14034691530255627, + "learning_rate": 2.9026203826685195e-06, + "loss": 0.6394, + "step": 10350 + }, + { + "epoch": 0.9255185979971388, + "grad_norm": 0.15516808365291337, + "learning_rate": 2.8956976051464636e-06, + "loss": 0.6216, + "step": 10351 + }, + { + "epoch": 0.9256080114449213, + "grad_norm": 0.13725861047074536, + "learning_rate": 2.8887829716449876e-06, + "loss": 0.6344, + "step": 10352 + }, + { + "epoch": 0.9256974248927039, + "grad_norm": 0.13742201877890087, + "learning_rate": 2.8818764827440057e-06, + "loss": 0.6616, + "step": 10353 + }, + { + "epoch": 0.9257868383404864, + "grad_norm": 0.16270671827882388, + "learning_rate": 2.8749781390227437e-06, + "loss": 0.6027, + "step": 10354 + }, + { + "epoch": 0.9258762517882689, + "grad_norm": 0.13538180650129253, + "learning_rate": 2.8680879410597716e-06, + "loss": 0.647, + "step": 10355 + }, + { + "epoch": 0.9259656652360515, + "grad_norm": 0.1374523535203862, + "learning_rate": 2.861205889432972e-06, + "loss": 0.6325, + "step": 10356 + }, + { + "epoch": 0.9260550786838341, + "grad_norm": 0.1406951880317815, + "learning_rate": 2.854331984719505e-06, + "loss": 0.6597, + "step": 10357 + }, + { + "epoch": 0.9261444921316166, + "grad_norm": 0.1487749412570289, + "learning_rate": 2.8474662274958987e-06, + "loss": 0.6416, + "step": 10358 + }, + { + "epoch": 0.9262339055793991, + "grad_norm": 0.137335441943473, + "learning_rate": 2.8406086183379586e-06, + "loss": 0.6343, + "step": 10359 + }, + { + "epoch": 0.9263233190271817, + "grad_norm": 0.13394857515101083, + "learning_rate": 2.8337591578208366e-06, + "loss": 0.6405, + "step": 10360 + }, + { + "epoch": 0.9264127324749643, + "grad_norm": 0.14221171853067627, + "learning_rate": 2.826917846518995e-06, + "loss": 0.6366, + "step": 10361 + }, + { + "epoch": 0.9265021459227468, + "grad_norm": 0.13236447535492243, + "learning_rate": 2.820084685006208e-06, + "loss": 0.5888, + "step": 10362 + }, + { + "epoch": 0.9265915593705293, + "grad_norm": 0.15206188346490637, + "learning_rate": 2.8132596738555397e-06, + "loss": 0.6145, + "step": 10363 + }, + { + "epoch": 0.9266809728183119, + "grad_norm": 0.1500396927062451, + "learning_rate": 2.8064428136394096e-06, + "loss": 0.6685, + "step": 10364 + }, + { + "epoch": 0.9267703862660944, + "grad_norm": 0.13061800133056434, + "learning_rate": 2.799634104929538e-06, + "loss": 0.5542, + "step": 10365 + }, + { + "epoch": 0.926859799713877, + "grad_norm": 0.14363354667096132, + "learning_rate": 2.7928335482969802e-06, + "loss": 0.6335, + "step": 10366 + }, + { + "epoch": 0.9269492131616596, + "grad_norm": 0.13451584238735997, + "learning_rate": 2.7860411443120684e-06, + "loss": 0.6118, + "step": 10367 + }, + { + "epoch": 0.927038626609442, + "grad_norm": 0.13491889296484272, + "learning_rate": 2.7792568935444796e-06, + "loss": 0.6336, + "step": 10368 + }, + { + "epoch": 0.9271280400572246, + "grad_norm": 0.15490595174041422, + "learning_rate": 2.772480796563204e-06, + "loss": 0.5897, + "step": 10369 + }, + { + "epoch": 0.9272174535050072, + "grad_norm": 0.14184062628176788, + "learning_rate": 2.765712853936553e-06, + "loss": 0.6476, + "step": 10370 + }, + { + "epoch": 0.9273068669527897, + "grad_norm": 0.14131853416566953, + "learning_rate": 2.7589530662321285e-06, + "loss": 0.5905, + "step": 10371 + }, + { + "epoch": 0.9273962804005722, + "grad_norm": 0.12877908564272886, + "learning_rate": 2.7522014340168547e-06, + "loss": 0.6046, + "step": 10372 + }, + { + "epoch": 0.9274856938483548, + "grad_norm": 0.12937727004921545, + "learning_rate": 2.745457957857023e-06, + "loss": 0.619, + "step": 10373 + }, + { + "epoch": 0.9275751072961373, + "grad_norm": 0.1350695383186381, + "learning_rate": 2.7387226383181696e-06, + "loss": 0.6119, + "step": 10374 + }, + { + "epoch": 0.9276645207439199, + "grad_norm": 0.14527146683009126, + "learning_rate": 2.7319954759651877e-06, + "loss": 0.628, + "step": 10375 + }, + { + "epoch": 0.9277539341917024, + "grad_norm": 0.1425598826616786, + "learning_rate": 2.7252764713622814e-06, + "loss": 0.6245, + "step": 10376 + }, + { + "epoch": 0.927843347639485, + "grad_norm": 0.14532641169737132, + "learning_rate": 2.718565625072955e-06, + "loss": 0.6165, + "step": 10377 + }, + { + "epoch": 0.9279327610872675, + "grad_norm": 0.1527360808369613, + "learning_rate": 2.711862937660037e-06, + "loss": 0.6283, + "step": 10378 + }, + { + "epoch": 0.9280221745350501, + "grad_norm": 0.15063418941290244, + "learning_rate": 2.7051684096856876e-06, + "loss": 0.6454, + "step": 10379 + }, + { + "epoch": 0.9281115879828327, + "grad_norm": 0.13843455035426208, + "learning_rate": 2.6984820417113587e-06, + "loss": 0.6524, + "step": 10380 + }, + { + "epoch": 0.9282010014306151, + "grad_norm": 0.12318044748001901, + "learning_rate": 2.6918038342978345e-06, + "loss": 0.6049, + "step": 10381 + }, + { + "epoch": 0.9282904148783977, + "grad_norm": 0.14073892192686863, + "learning_rate": 2.685133788005201e-06, + "loss": 0.6462, + "step": 10382 + }, + { + "epoch": 0.9283798283261803, + "grad_norm": 0.14158431696670973, + "learning_rate": 2.678471903392865e-06, + "loss": 0.6347, + "step": 10383 + }, + { + "epoch": 0.9284692417739628, + "grad_norm": 0.1414928727217873, + "learning_rate": 2.6718181810195696e-06, + "loss": 0.641, + "step": 10384 + }, + { + "epoch": 0.9285586552217453, + "grad_norm": 0.14380161359277505, + "learning_rate": 2.6651726214433235e-06, + "loss": 0.6259, + "step": 10385 + }, + { + "epoch": 0.9286480686695279, + "grad_norm": 0.1501384401610469, + "learning_rate": 2.6585352252215036e-06, + "loss": 0.6005, + "step": 10386 + }, + { + "epoch": 0.9287374821173104, + "grad_norm": 0.13906182066881254, + "learning_rate": 2.651905992910786e-06, + "loss": 0.6489, + "step": 10387 + }, + { + "epoch": 0.928826895565093, + "grad_norm": 0.17337510141925888, + "learning_rate": 2.6452849250671373e-06, + "loss": 0.6308, + "step": 10388 + }, + { + "epoch": 0.9289163090128756, + "grad_norm": 0.14300780761066784, + "learning_rate": 2.6386720222458693e-06, + "loss": 0.6348, + "step": 10389 + }, + { + "epoch": 0.929005722460658, + "grad_norm": 0.14045497935154086, + "learning_rate": 2.6320672850016047e-06, + "loss": 0.58, + "step": 10390 + }, + { + "epoch": 0.9290951359084406, + "grad_norm": 0.15465314322405957, + "learning_rate": 2.625470713888256e-06, + "loss": 0.6401, + "step": 10391 + }, + { + "epoch": 0.9291845493562232, + "grad_norm": 0.1375129505742144, + "learning_rate": 2.618882309459081e-06, + "loss": 0.6639, + "step": 10392 + }, + { + "epoch": 0.9292739628040058, + "grad_norm": 0.14480008499548783, + "learning_rate": 2.612302072266637e-06, + "loss": 0.6254, + "step": 10393 + }, + { + "epoch": 0.9293633762517882, + "grad_norm": 0.13780699646236705, + "learning_rate": 2.605730002862805e-06, + "loss": 0.6186, + "step": 10394 + }, + { + "epoch": 0.9294527896995708, + "grad_norm": 0.13995894375424786, + "learning_rate": 2.5991661017987777e-06, + "loss": 0.6016, + "step": 10395 + }, + { + "epoch": 0.9295422031473534, + "grad_norm": 0.12803474469253506, + "learning_rate": 2.5926103696250703e-06, + "loss": 0.6147, + "step": 10396 + }, + { + "epoch": 0.9296316165951359, + "grad_norm": 0.12992421725398362, + "learning_rate": 2.586062806891476e-06, + "loss": 0.6045, + "step": 10397 + }, + { + "epoch": 0.9297210300429185, + "grad_norm": 0.1449170271745022, + "learning_rate": 2.5795234141471445e-06, + "loss": 0.6418, + "step": 10398 + }, + { + "epoch": 0.929810443490701, + "grad_norm": 0.13914309111861362, + "learning_rate": 2.5729921919405377e-06, + "loss": 0.6096, + "step": 10399 + }, + { + "epoch": 0.9298998569384835, + "grad_norm": 0.15995117124645894, + "learning_rate": 2.5664691408194165e-06, + "loss": 0.6415, + "step": 10400 + }, + { + "epoch": 0.9299892703862661, + "grad_norm": 0.13641151733759507, + "learning_rate": 2.559954261330866e-06, + "loss": 0.6066, + "step": 10401 + }, + { + "epoch": 0.9300786838340487, + "grad_norm": 0.13905547098318186, + "learning_rate": 2.55344755402126e-06, + "loss": 0.619, + "step": 10402 + }, + { + "epoch": 0.9301680972818311, + "grad_norm": 0.1511738739825206, + "learning_rate": 2.546949019436329e-06, + "loss": 0.6494, + "step": 10403 + }, + { + "epoch": 0.9302575107296137, + "grad_norm": 0.13515836572055107, + "learning_rate": 2.540458658121092e-06, + "loss": 0.631, + "step": 10404 + }, + { + "epoch": 0.9303469241773963, + "grad_norm": 0.1358235785276537, + "learning_rate": 2.533976470619881e-06, + "loss": 0.6319, + "step": 10405 + }, + { + "epoch": 0.9304363376251789, + "grad_norm": 0.14770025572952156, + "learning_rate": 2.5275024574763496e-06, + "loss": 0.6392, + "step": 10406 + }, + { + "epoch": 0.9305257510729614, + "grad_norm": 0.1500542087934994, + "learning_rate": 2.5210366192334745e-06, + "loss": 0.6406, + "step": 10407 + }, + { + "epoch": 0.9306151645207439, + "grad_norm": 0.15542719364222596, + "learning_rate": 2.514578956433533e-06, + "loss": 0.6447, + "step": 10408 + }, + { + "epoch": 0.9307045779685265, + "grad_norm": 0.15516676618995376, + "learning_rate": 2.5081294696181255e-06, + "loss": 0.6259, + "step": 10409 + }, + { + "epoch": 0.930793991416309, + "grad_norm": 0.13229215951851317, + "learning_rate": 2.501688159328164e-06, + "loss": 0.5995, + "step": 10410 + }, + { + "epoch": 0.9308834048640916, + "grad_norm": 0.14631399771315334, + "learning_rate": 2.49525502610386e-06, + "loss": 0.6045, + "step": 10411 + }, + { + "epoch": 0.9309728183118741, + "grad_norm": 0.13754799097383877, + "learning_rate": 2.48883007048476e-06, + "loss": 0.6035, + "step": 10412 + }, + { + "epoch": 0.9310622317596566, + "grad_norm": 0.14830183553711537, + "learning_rate": 2.4824132930097222e-06, + "loss": 0.63, + "step": 10413 + }, + { + "epoch": 0.9311516452074392, + "grad_norm": 0.14135317009768053, + "learning_rate": 2.4760046942169048e-06, + "loss": 0.6219, + "step": 10414 + }, + { + "epoch": 0.9312410586552218, + "grad_norm": 0.13922046020054862, + "learning_rate": 2.4696042746438108e-06, + "loss": 0.6616, + "step": 10415 + }, + { + "epoch": 0.9313304721030042, + "grad_norm": 0.14215053954284682, + "learning_rate": 2.4632120348272003e-06, + "loss": 0.6413, + "step": 10416 + }, + { + "epoch": 0.9314198855507868, + "grad_norm": 0.1391554836875864, + "learning_rate": 2.456827975303211e-06, + "loss": 0.6455, + "step": 10417 + }, + { + "epoch": 0.9315092989985694, + "grad_norm": 0.13689222580406976, + "learning_rate": 2.4504520966072476e-06, + "loss": 0.6335, + "step": 10418 + }, + { + "epoch": 0.931598712446352, + "grad_norm": 0.13784291632620832, + "learning_rate": 2.4440843992740714e-06, + "loss": 0.6365, + "step": 10419 + }, + { + "epoch": 0.9316881258941345, + "grad_norm": 0.13626719643172996, + "learning_rate": 2.4377248838376996e-06, + "loss": 0.5932, + "step": 10420 + }, + { + "epoch": 0.931777539341917, + "grad_norm": 0.15618609850608672, + "learning_rate": 2.4313735508315396e-06, + "loss": 0.6326, + "step": 10421 + }, + { + "epoch": 0.9318669527896996, + "grad_norm": 0.15331429287765083, + "learning_rate": 2.425030400788231e-06, + "loss": 0.6278, + "step": 10422 + }, + { + "epoch": 0.9319563662374821, + "grad_norm": 0.1372096320204776, + "learning_rate": 2.4186954342397815e-06, + "loss": 0.607, + "step": 10423 + }, + { + "epoch": 0.9320457796852647, + "grad_norm": 0.14560351418399953, + "learning_rate": 2.4123686517175113e-06, + "loss": 0.6039, + "step": 10424 + }, + { + "epoch": 0.9321351931330472, + "grad_norm": 0.1439751064470908, + "learning_rate": 2.406050053752018e-06, + "loss": 0.6282, + "step": 10425 + }, + { + "epoch": 0.9322246065808297, + "grad_norm": 0.12937946888505436, + "learning_rate": 2.3997396408732443e-06, + "loss": 0.601, + "step": 10426 + }, + { + "epoch": 0.9323140200286123, + "grad_norm": 0.13604514484103855, + "learning_rate": 2.3934374136104222e-06, + "loss": 0.6386, + "step": 10427 + }, + { + "epoch": 0.9324034334763949, + "grad_norm": 0.1433290985961292, + "learning_rate": 2.38714337249214e-06, + "loss": 0.5894, + "step": 10428 + }, + { + "epoch": 0.9324928469241774, + "grad_norm": 0.15766494036605447, + "learning_rate": 2.3808575180462533e-06, + "loss": 0.6673, + "step": 10429 + }, + { + "epoch": 0.9325822603719599, + "grad_norm": 0.1388415656246375, + "learning_rate": 2.374579850799963e-06, + "loss": 0.5772, + "step": 10430 + }, + { + "epoch": 0.9326716738197425, + "grad_norm": 0.12961751522115222, + "learning_rate": 2.3683103712797473e-06, + "loss": 0.6355, + "step": 10431 + }, + { + "epoch": 0.932761087267525, + "grad_norm": 0.1459794405897821, + "learning_rate": 2.3620490800114304e-06, + "loss": 0.6281, + "step": 10432 + }, + { + "epoch": 0.9328505007153076, + "grad_norm": 0.13624945096875749, + "learning_rate": 2.3557959775201478e-06, + "loss": 0.6329, + "step": 10433 + }, + { + "epoch": 0.9329399141630901, + "grad_norm": 0.1466858249906393, + "learning_rate": 2.349551064330313e-06, + "loss": 0.6126, + "step": 10434 + }, + { + "epoch": 0.9330293276108726, + "grad_norm": 0.13520488035308065, + "learning_rate": 2.3433143409657188e-06, + "loss": 0.609, + "step": 10435 + }, + { + "epoch": 0.9331187410586552, + "grad_norm": 0.134333347438292, + "learning_rate": 2.337085807949413e-06, + "loss": 0.5856, + "step": 10436 + }, + { + "epoch": 0.9332081545064378, + "grad_norm": 0.1428559102439688, + "learning_rate": 2.3308654658037555e-06, + "loss": 0.6178, + "step": 10437 + }, + { + "epoch": 0.9332975679542204, + "grad_norm": 0.1409987888479926, + "learning_rate": 2.3246533150504735e-06, + "loss": 0.582, + "step": 10438 + }, + { + "epoch": 0.9333869814020028, + "grad_norm": 0.14002157577646884, + "learning_rate": 2.3184493562105504e-06, + "loss": 0.6155, + "step": 10439 + }, + { + "epoch": 0.9334763948497854, + "grad_norm": 0.14074146216448621, + "learning_rate": 2.312253589804314e-06, + "loss": 0.6069, + "step": 10440 + }, + { + "epoch": 0.933565808297568, + "grad_norm": 0.1478275881782856, + "learning_rate": 2.3060660163513825e-06, + "loss": 0.6553, + "step": 10441 + }, + { + "epoch": 0.9336552217453505, + "grad_norm": 0.15357091784601162, + "learning_rate": 2.2998866363707184e-06, + "loss": 0.6099, + "step": 10442 + }, + { + "epoch": 0.933744635193133, + "grad_norm": 0.1287639905952841, + "learning_rate": 2.2937154503805623e-06, + "loss": 0.6559, + "step": 10443 + }, + { + "epoch": 0.9338340486409156, + "grad_norm": 0.14221901397815084, + "learning_rate": 2.287552458898501e-06, + "loss": 0.6377, + "step": 10444 + }, + { + "epoch": 0.9339234620886981, + "grad_norm": 0.13357615935464867, + "learning_rate": 2.2813976624414093e-06, + "loss": 0.591, + "step": 10445 + }, + { + "epoch": 0.9340128755364807, + "grad_norm": 0.1676186601962572, + "learning_rate": 2.275251061525474e-06, + "loss": 0.6661, + "step": 10446 + }, + { + "epoch": 0.9341022889842633, + "grad_norm": 0.14466271114540008, + "learning_rate": 2.269112656666217e-06, + "loss": 0.6475, + "step": 10447 + }, + { + "epoch": 0.9341917024320457, + "grad_norm": 0.1531444726955703, + "learning_rate": 2.2629824483784366e-06, + "loss": 0.653, + "step": 10448 + }, + { + "epoch": 0.9342811158798283, + "grad_norm": 0.16325635367574182, + "learning_rate": 2.2568604371763e-06, + "loss": 0.6491, + "step": 10449 + }, + { + "epoch": 0.9343705293276109, + "grad_norm": 0.12701637373955538, + "learning_rate": 2.25074662357323e-06, + "loss": 0.5953, + "step": 10450 + }, + { + "epoch": 0.9344599427753935, + "grad_norm": 0.14211444992099245, + "learning_rate": 2.2446410080819824e-06, + "loss": 0.6259, + "step": 10451 + }, + { + "epoch": 0.9345493562231759, + "grad_norm": 0.13102849670092276, + "learning_rate": 2.238543591214637e-06, + "loss": 0.6289, + "step": 10452 + }, + { + "epoch": 0.9346387696709585, + "grad_norm": 0.15371355900625558, + "learning_rate": 2.232454373482584e-06, + "loss": 0.6401, + "step": 10453 + }, + { + "epoch": 0.9347281831187411, + "grad_norm": 0.12478667032748689, + "learning_rate": 2.226373355396505e-06, + "loss": 0.5788, + "step": 10454 + }, + { + "epoch": 0.9348175965665236, + "grad_norm": 0.13099529935799908, + "learning_rate": 2.220300537466413e-06, + "loss": 0.5717, + "step": 10455 + }, + { + "epoch": 0.9349070100143062, + "grad_norm": 0.12450038212288013, + "learning_rate": 2.2142359202016237e-06, + "loss": 0.5944, + "step": 10456 + }, + { + "epoch": 0.9349964234620887, + "grad_norm": 0.13514698230712038, + "learning_rate": 2.208179504110763e-06, + "loss": 0.6015, + "step": 10457 + }, + { + "epoch": 0.9350858369098712, + "grad_norm": 0.14195616672416433, + "learning_rate": 2.202131289701803e-06, + "loss": 0.6627, + "step": 10458 + }, + { + "epoch": 0.9351752503576538, + "grad_norm": 0.12833898353951398, + "learning_rate": 2.1960912774819707e-06, + "loss": 0.5983, + "step": 10459 + }, + { + "epoch": 0.9352646638054364, + "grad_norm": 0.13025871612484063, + "learning_rate": 2.1900594679578503e-06, + "loss": 0.637, + "step": 10460 + }, + { + "epoch": 0.9353540772532188, + "grad_norm": 0.13561486540891496, + "learning_rate": 2.1840358616353252e-06, + "loss": 0.6016, + "step": 10461 + }, + { + "epoch": 0.9354434907010014, + "grad_norm": 0.13947725711681158, + "learning_rate": 2.1780204590195583e-06, + "loss": 0.633, + "step": 10462 + }, + { + "epoch": 0.935532904148784, + "grad_norm": 0.14759692448559855, + "learning_rate": 2.172013260615091e-06, + "loss": 0.6451, + "step": 10463 + }, + { + "epoch": 0.9356223175965666, + "grad_norm": 0.13129419437472148, + "learning_rate": 2.166014266925731e-06, + "loss": 0.5973, + "step": 10464 + }, + { + "epoch": 0.935711731044349, + "grad_norm": 0.13882594713364893, + "learning_rate": 2.160023478454587e-06, + "loss": 0.6301, + "step": 10465 + }, + { + "epoch": 0.9358011444921316, + "grad_norm": 0.14195030646863666, + "learning_rate": 2.1540408957041235e-06, + "loss": 0.641, + "step": 10466 + }, + { + "epoch": 0.9358905579399142, + "grad_norm": 0.15577647569309294, + "learning_rate": 2.148066519176084e-06, + "loss": 0.6372, + "step": 10467 + }, + { + "epoch": 0.9359799713876967, + "grad_norm": 0.14885023309302772, + "learning_rate": 2.142100349371512e-06, + "loss": 0.6381, + "step": 10468 + }, + { + "epoch": 0.9360693848354793, + "grad_norm": 0.1315543614706417, + "learning_rate": 2.1361423867908293e-06, + "loss": 0.5667, + "step": 10469 + }, + { + "epoch": 0.9361587982832618, + "grad_norm": 0.151444621069262, + "learning_rate": 2.1301926319336696e-06, + "loss": 0.6504, + "step": 10470 + }, + { + "epoch": 0.9362482117310443, + "grad_norm": 0.12742552220972317, + "learning_rate": 2.124251085299067e-06, + "loss": 0.6286, + "step": 10471 + }, + { + "epoch": 0.9363376251788269, + "grad_norm": 0.15245372224610435, + "learning_rate": 2.1183177473853346e-06, + "loss": 0.6458, + "step": 10472 + }, + { + "epoch": 0.9364270386266095, + "grad_norm": 0.1359283862914503, + "learning_rate": 2.112392618690062e-06, + "loss": 0.6104, + "step": 10473 + }, + { + "epoch": 0.9365164520743919, + "grad_norm": 0.15061148921159856, + "learning_rate": 2.1064756997102084e-06, + "loss": 0.6269, + "step": 10474 + }, + { + "epoch": 0.9366058655221745, + "grad_norm": 0.1285692465113556, + "learning_rate": 2.10056699094201e-06, + "loss": 0.6307, + "step": 10475 + }, + { + "epoch": 0.9366952789699571, + "grad_norm": 0.1514314599735264, + "learning_rate": 2.0946664928810367e-06, + "loss": 0.649, + "step": 10476 + }, + { + "epoch": 0.9367846924177397, + "grad_norm": 0.13164289990824116, + "learning_rate": 2.0887742060221262e-06, + "loss": 0.6258, + "step": 10477 + }, + { + "epoch": 0.9368741058655222, + "grad_norm": 0.1543195702214682, + "learning_rate": 2.082890130859505e-06, + "loss": 0.6312, + "step": 10478 + }, + { + "epoch": 0.9369635193133047, + "grad_norm": 0.1288957385797999, + "learning_rate": 2.077014267886612e-06, + "loss": 0.617, + "step": 10479 + }, + { + "epoch": 0.9370529327610873, + "grad_norm": 0.1328282990450325, + "learning_rate": 2.0711466175962756e-06, + "loss": 0.621, + "step": 10480 + }, + { + "epoch": 0.9371423462088698, + "grad_norm": 0.1398055815839979, + "learning_rate": 2.065287180480613e-06, + "loss": 0.6535, + "step": 10481 + }, + { + "epoch": 0.9372317596566524, + "grad_norm": 0.14081811990395662, + "learning_rate": 2.0594359570310196e-06, + "loss": 0.6576, + "step": 10482 + }, + { + "epoch": 0.9373211731044349, + "grad_norm": 0.1313524784603632, + "learning_rate": 2.0535929477382587e-06, + "loss": 0.6195, + "step": 10483 + }, + { + "epoch": 0.9374105865522174, + "grad_norm": 0.15747934202314956, + "learning_rate": 2.0477581530923717e-06, + "loss": 0.6203, + "step": 10484 + }, + { + "epoch": 0.9375, + "grad_norm": 0.133269245764959, + "learning_rate": 2.0419315735827116e-06, + "loss": 0.619, + "step": 10485 + }, + { + "epoch": 0.9375894134477826, + "grad_norm": 0.14914040194344344, + "learning_rate": 2.036113209697943e-06, + "loss": 0.6385, + "step": 10486 + }, + { + "epoch": 0.9376788268955651, + "grad_norm": 0.12902620957332026, + "learning_rate": 2.0303030619260644e-06, + "loss": 0.6359, + "step": 10487 + }, + { + "epoch": 0.9377682403433476, + "grad_norm": 0.14947411964431792, + "learning_rate": 2.0245011307543416e-06, + "loss": 0.6106, + "step": 10488 + }, + { + "epoch": 0.9378576537911302, + "grad_norm": 0.15140814259073343, + "learning_rate": 2.018707416669374e-06, + "loss": 0.6173, + "step": 10489 + }, + { + "epoch": 0.9379470672389127, + "grad_norm": 0.1441971442418902, + "learning_rate": 2.012921920157096e-06, + "loss": 0.6384, + "step": 10490 + }, + { + "epoch": 0.9380364806866953, + "grad_norm": 0.14680347148639059, + "learning_rate": 2.0071446417027073e-06, + "loss": 0.6434, + "step": 10491 + }, + { + "epoch": 0.9381258941344778, + "grad_norm": 0.15689044311233227, + "learning_rate": 2.0013755817907652e-06, + "loss": 0.6701, + "step": 10492 + }, + { + "epoch": 0.9382153075822603, + "grad_norm": 0.1332133567318598, + "learning_rate": 1.995614740905094e-06, + "loss": 0.6106, + "step": 10493 + }, + { + "epoch": 0.9383047210300429, + "grad_norm": 0.1289080754962145, + "learning_rate": 1.9898621195288515e-06, + "loss": 0.6136, + "step": 10494 + }, + { + "epoch": 0.9383941344778255, + "grad_norm": 0.14727731209976178, + "learning_rate": 1.984117718144518e-06, + "loss": 0.6125, + "step": 10495 + }, + { + "epoch": 0.9384835479256081, + "grad_norm": 0.14481893708906332, + "learning_rate": 1.9783815372338423e-06, + "loss": 0.6443, + "step": 10496 + }, + { + "epoch": 0.9385729613733905, + "grad_norm": 0.14585666447918674, + "learning_rate": 1.972653577277939e-06, + "loss": 0.5985, + "step": 10497 + }, + { + "epoch": 0.9386623748211731, + "grad_norm": 0.14448344557686138, + "learning_rate": 1.96693383875719e-06, + "loss": 0.6117, + "step": 10498 + }, + { + "epoch": 0.9387517882689557, + "grad_norm": 0.14583271703998907, + "learning_rate": 1.9612223221513125e-06, + "loss": 0.643, + "step": 10499 + }, + { + "epoch": 0.9388412017167382, + "grad_norm": 0.1376801129167129, + "learning_rate": 1.955519027939301e-06, + "loss": 0.6337, + "step": 10500 + }, + { + "epoch": 0.9389306151645207, + "grad_norm": 0.14418464407578724, + "learning_rate": 1.949823956599528e-06, + "loss": 0.6371, + "step": 10501 + }, + { + "epoch": 0.9390200286123033, + "grad_norm": 0.13871692650321438, + "learning_rate": 1.9441371086095784e-06, + "loss": 0.656, + "step": 10502 + }, + { + "epoch": 0.9391094420600858, + "grad_norm": 0.14776364327433403, + "learning_rate": 1.938458484446437e-06, + "loss": 0.6623, + "step": 10503 + }, + { + "epoch": 0.9391988555078684, + "grad_norm": 0.13505388978554542, + "learning_rate": 1.9327880845863568e-06, + "loss": 0.6329, + "step": 10504 + }, + { + "epoch": 0.939288268955651, + "grad_norm": 0.1372914598443534, + "learning_rate": 1.927125909504901e-06, + "loss": 0.6415, + "step": 10505 + }, + { + "epoch": 0.9393776824034334, + "grad_norm": 0.13371510310427576, + "learning_rate": 1.921471959676957e-06, + "loss": 0.5927, + "step": 10506 + }, + { + "epoch": 0.939467095851216, + "grad_norm": 0.12108117117457218, + "learning_rate": 1.915826235576712e-06, + "loss": 0.5867, + "step": 10507 + }, + { + "epoch": 0.9395565092989986, + "grad_norm": 0.14435339118386212, + "learning_rate": 1.910188737677665e-06, + "loss": 0.613, + "step": 10508 + }, + { + "epoch": 0.9396459227467812, + "grad_norm": 0.1365070409998258, + "learning_rate": 1.9045594664526155e-06, + "loss": 0.6271, + "step": 10509 + }, + { + "epoch": 0.9397353361945636, + "grad_norm": 0.1460217125028139, + "learning_rate": 1.8989384223736971e-06, + "loss": 0.6257, + "step": 10510 + }, + { + "epoch": 0.9398247496423462, + "grad_norm": 0.14684301074327705, + "learning_rate": 1.8933256059123438e-06, + "loss": 0.6445, + "step": 10511 + }, + { + "epoch": 0.9399141630901288, + "grad_norm": 0.1347541953468775, + "learning_rate": 1.88772101753929e-06, + "loss": 0.6328, + "step": 10512 + }, + { + "epoch": 0.9400035765379113, + "grad_norm": 0.11905440605596006, + "learning_rate": 1.8821246577245822e-06, + "loss": 0.603, + "step": 10513 + }, + { + "epoch": 0.9400929899856938, + "grad_norm": 0.15405809922590874, + "learning_rate": 1.876536526937589e-06, + "loss": 0.6663, + "step": 10514 + }, + { + "epoch": 0.9401824034334764, + "grad_norm": 0.13245502743488421, + "learning_rate": 1.8709566256469691e-06, + "loss": 0.5806, + "step": 10515 + }, + { + "epoch": 0.9402718168812589, + "grad_norm": 0.13756503163717693, + "learning_rate": 1.8653849543207036e-06, + "loss": 0.6169, + "step": 10516 + }, + { + "epoch": 0.9403612303290415, + "grad_norm": 0.15100863400239317, + "learning_rate": 1.8598215134260743e-06, + "loss": 0.5987, + "step": 10517 + }, + { + "epoch": 0.9404506437768241, + "grad_norm": 0.15937045868995103, + "learning_rate": 1.8542663034297191e-06, + "loss": 0.6075, + "step": 10518 + }, + { + "epoch": 0.9405400572246065, + "grad_norm": 0.14682639905472167, + "learning_rate": 1.8487193247974989e-06, + "loss": 0.6441, + "step": 10519 + }, + { + "epoch": 0.9406294706723891, + "grad_norm": 0.13012132218140166, + "learning_rate": 1.843180577994652e-06, + "loss": 0.6351, + "step": 10520 + }, + { + "epoch": 0.9407188841201717, + "grad_norm": 0.1534553358408929, + "learning_rate": 1.8376500634857296e-06, + "loss": 0.6086, + "step": 10521 + }, + { + "epoch": 0.9408082975679543, + "grad_norm": 0.14510669540573257, + "learning_rate": 1.8321277817345274e-06, + "loss": 0.6259, + "step": 10522 + }, + { + "epoch": 0.9408977110157367, + "grad_norm": 0.1409204250928062, + "learning_rate": 1.8266137332042077e-06, + "loss": 0.6437, + "step": 10523 + }, + { + "epoch": 0.9409871244635193, + "grad_norm": 0.13936399183990378, + "learning_rate": 1.8211079183572344e-06, + "loss": 0.6306, + "step": 10524 + }, + { + "epoch": 0.9410765379113019, + "grad_norm": 0.15244905213587256, + "learning_rate": 1.8156103376553714e-06, + "loss": 0.6384, + "step": 10525 + }, + { + "epoch": 0.9411659513590844, + "grad_norm": 0.14220936525841088, + "learning_rate": 1.810120991559694e-06, + "loss": 0.6227, + "step": 10526 + }, + { + "epoch": 0.941255364806867, + "grad_norm": 0.1404272475036829, + "learning_rate": 1.8046398805305898e-06, + "loss": 0.6213, + "step": 10527 + }, + { + "epoch": 0.9413447782546495, + "grad_norm": 0.12673193746308892, + "learning_rate": 1.7991670050277354e-06, + "loss": 0.6437, + "step": 10528 + }, + { + "epoch": 0.941434191702432, + "grad_norm": 0.1356134684053029, + "learning_rate": 1.7937023655101636e-06, + "loss": 0.6437, + "step": 10529 + }, + { + "epoch": 0.9415236051502146, + "grad_norm": 0.13050180088667432, + "learning_rate": 1.7882459624361637e-06, + "loss": 0.626, + "step": 10530 + }, + { + "epoch": 0.9416130185979972, + "grad_norm": 0.14271701406970108, + "learning_rate": 1.782797796263358e-06, + "loss": 0.6591, + "step": 10531 + }, + { + "epoch": 0.9417024320457796, + "grad_norm": 0.12666628508339256, + "learning_rate": 1.7773578674486923e-06, + "loss": 0.6008, + "step": 10532 + }, + { + "epoch": 0.9417918454935622, + "grad_norm": 0.14764266513005775, + "learning_rate": 1.7719261764484019e-06, + "loss": 0.624, + "step": 10533 + }, + { + "epoch": 0.9418812589413448, + "grad_norm": 0.1404086747557525, + "learning_rate": 1.7665027237180332e-06, + "loss": 0.6394, + "step": 10534 + }, + { + "epoch": 0.9419706723891274, + "grad_norm": 0.13773303303197934, + "learning_rate": 1.7610875097124446e-06, + "loss": 0.5959, + "step": 10535 + }, + { + "epoch": 0.9420600858369099, + "grad_norm": 0.15439798083693204, + "learning_rate": 1.7556805348858064e-06, + "loss": 0.6848, + "step": 10536 + }, + { + "epoch": 0.9421494992846924, + "grad_norm": 0.13894174445522536, + "learning_rate": 1.7502817996915778e-06, + "loss": 0.6823, + "step": 10537 + }, + { + "epoch": 0.942238912732475, + "grad_norm": 0.14627829538729673, + "learning_rate": 1.7448913045825742e-06, + "loss": 0.6354, + "step": 10538 + }, + { + "epoch": 0.9423283261802575, + "grad_norm": 0.12942673074147545, + "learning_rate": 1.739509050010868e-06, + "loss": 0.5873, + "step": 10539 + }, + { + "epoch": 0.9424177396280401, + "grad_norm": 0.1373821316916452, + "learning_rate": 1.7341350364278642e-06, + "loss": 0.5954, + "step": 10540 + }, + { + "epoch": 0.9425071530758226, + "grad_norm": 0.13680270791703877, + "learning_rate": 1.7287692642842911e-06, + "loss": 0.6364, + "step": 10541 + }, + { + "epoch": 0.9425965665236051, + "grad_norm": 0.12726432819598027, + "learning_rate": 1.723411734030156e-06, + "loss": 0.6216, + "step": 10542 + }, + { + "epoch": 0.9426859799713877, + "grad_norm": 0.14240190900304558, + "learning_rate": 1.7180624461147876e-06, + "loss": 0.6556, + "step": 10543 + }, + { + "epoch": 0.9427753934191703, + "grad_norm": 0.14261152567402458, + "learning_rate": 1.7127214009868385e-06, + "loss": 0.6229, + "step": 10544 + }, + { + "epoch": 0.9428648068669528, + "grad_norm": 0.14516137401536972, + "learning_rate": 1.7073885990942174e-06, + "loss": 0.608, + "step": 10545 + }, + { + "epoch": 0.9429542203147353, + "grad_norm": 0.12494865812070141, + "learning_rate": 1.7020640408842325e-06, + "loss": 0.6277, + "step": 10546 + }, + { + "epoch": 0.9430436337625179, + "grad_norm": 0.14223889305050924, + "learning_rate": 1.696747726803416e-06, + "loss": 0.6806, + "step": 10547 + }, + { + "epoch": 0.9431330472103004, + "grad_norm": 0.1470047991694468, + "learning_rate": 1.6914396572976444e-06, + "loss": 0.6524, + "step": 10548 + }, + { + "epoch": 0.943222460658083, + "grad_norm": 0.13720621461542945, + "learning_rate": 1.6861398328121059e-06, + "loss": 0.6227, + "step": 10549 + }, + { + "epoch": 0.9433118741058655, + "grad_norm": 0.14610524103512995, + "learning_rate": 1.6808482537912896e-06, + "loss": 0.6178, + "step": 10550 + }, + { + "epoch": 0.943401287553648, + "grad_norm": 0.12846597081661348, + "learning_rate": 1.6755649206789737e-06, + "loss": 0.6034, + "step": 10551 + }, + { + "epoch": 0.9434907010014306, + "grad_norm": 0.13887970547694606, + "learning_rate": 1.6702898339182925e-06, + "loss": 0.6399, + "step": 10552 + }, + { + "epoch": 0.9435801144492132, + "grad_norm": 0.13300517566380993, + "learning_rate": 1.6650229939516593e-06, + "loss": 0.6617, + "step": 10553 + }, + { + "epoch": 0.9436695278969958, + "grad_norm": 0.12800562898608184, + "learning_rate": 1.6597644012207759e-06, + "loss": 0.5844, + "step": 10554 + }, + { + "epoch": 0.9437589413447782, + "grad_norm": 0.12077789013873906, + "learning_rate": 1.6545140561667005e-06, + "loss": 0.6309, + "step": 10555 + }, + { + "epoch": 0.9438483547925608, + "grad_norm": 0.13968113455017458, + "learning_rate": 1.6492719592297478e-06, + "loss": 0.6204, + "step": 10556 + }, + { + "epoch": 0.9439377682403434, + "grad_norm": 0.12746271080380414, + "learning_rate": 1.6440381108495772e-06, + "loss": 0.6474, + "step": 10557 + }, + { + "epoch": 0.9440271816881259, + "grad_norm": 0.13470633364242207, + "learning_rate": 1.6388125114651486e-06, + "loss": 0.6356, + "step": 10558 + }, + { + "epoch": 0.9441165951359084, + "grad_norm": 0.1350096637693293, + "learning_rate": 1.6335951615147337e-06, + "loss": 0.6183, + "step": 10559 + }, + { + "epoch": 0.944206008583691, + "grad_norm": 0.1447801836994116, + "learning_rate": 1.6283860614358936e-06, + "loss": 0.6064, + "step": 10560 + }, + { + "epoch": 0.9442954220314735, + "grad_norm": 0.136823698549754, + "learning_rate": 1.623185211665501e-06, + "loss": 0.6581, + "step": 10561 + }, + { + "epoch": 0.9443848354792561, + "grad_norm": 0.13161536171401234, + "learning_rate": 1.6179926126397626e-06, + "loss": 0.5684, + "step": 10562 + }, + { + "epoch": 0.9444742489270386, + "grad_norm": 0.15727564478317577, + "learning_rate": 1.6128082647941744e-06, + "loss": 0.6216, + "step": 10563 + }, + { + "epoch": 0.9445636623748211, + "grad_norm": 0.1343705205670405, + "learning_rate": 1.6076321685635332e-06, + "loss": 0.6173, + "step": 10564 + }, + { + "epoch": 0.9446530758226037, + "grad_norm": 0.15438524087482985, + "learning_rate": 1.602464324381936e-06, + "loss": 0.5973, + "step": 10565 + }, + { + "epoch": 0.9447424892703863, + "grad_norm": 0.13582243990967144, + "learning_rate": 1.5973047326828472e-06, + "loss": 0.5991, + "step": 10566 + }, + { + "epoch": 0.9448319027181689, + "grad_norm": 0.13248974414336648, + "learning_rate": 1.5921533938989542e-06, + "loss": 0.652, + "step": 10567 + }, + { + "epoch": 0.9449213161659513, + "grad_norm": 0.1388150783058193, + "learning_rate": 1.5870103084623111e-06, + "loss": 0.6092, + "step": 10568 + }, + { + "epoch": 0.9450107296137339, + "grad_norm": 0.14345468581154996, + "learning_rate": 1.5818754768042733e-06, + "loss": 0.6406, + "step": 10569 + }, + { + "epoch": 0.9451001430615165, + "grad_norm": 0.1484951617693342, + "learning_rate": 1.5767488993554736e-06, + "loss": 0.6435, + "step": 10570 + }, + { + "epoch": 0.945189556509299, + "grad_norm": 0.14742498566068454, + "learning_rate": 1.5716305765458683e-06, + "loss": 0.6391, + "step": 10571 + }, + { + "epoch": 0.9452789699570815, + "grad_norm": 0.1326698583002011, + "learning_rate": 1.5665205088047474e-06, + "loss": 0.6188, + "step": 10572 + }, + { + "epoch": 0.9453683834048641, + "grad_norm": 0.13867652072834752, + "learning_rate": 1.561418696560668e-06, + "loss": 0.6297, + "step": 10573 + }, + { + "epoch": 0.9454577968526466, + "grad_norm": 0.140563380475072, + "learning_rate": 1.5563251402415102e-06, + "loss": 0.6265, + "step": 10574 + }, + { + "epoch": 0.9455472103004292, + "grad_norm": 0.14896612721092467, + "learning_rate": 1.5512398402744876e-06, + "loss": 0.6402, + "step": 10575 + }, + { + "epoch": 0.9456366237482118, + "grad_norm": 0.12892571171260578, + "learning_rate": 1.5461627970860814e-06, + "loss": 0.5904, + "step": 10576 + }, + { + "epoch": 0.9457260371959942, + "grad_norm": 0.13984320275848264, + "learning_rate": 1.5410940111020956e-06, + "loss": 0.6054, + "step": 10577 + }, + { + "epoch": 0.9458154506437768, + "grad_norm": 0.13295301136828044, + "learning_rate": 1.5360334827476564e-06, + "loss": 0.622, + "step": 10578 + }, + { + "epoch": 0.9459048640915594, + "grad_norm": 0.1294127650984172, + "learning_rate": 1.5309812124471579e-06, + "loss": 0.626, + "step": 10579 + }, + { + "epoch": 0.945994277539342, + "grad_norm": 0.14588186778588266, + "learning_rate": 1.52593720062435e-06, + "loss": 0.6029, + "step": 10580 + }, + { + "epoch": 0.9460836909871244, + "grad_norm": 0.1497990031453329, + "learning_rate": 1.520901447702272e-06, + "loss": 0.644, + "step": 10581 + }, + { + "epoch": 0.946173104434907, + "grad_norm": 0.16035147702903932, + "learning_rate": 1.5158739541032418e-06, + "loss": 0.6735, + "step": 10582 + }, + { + "epoch": 0.9462625178826896, + "grad_norm": 0.14050972622931768, + "learning_rate": 1.5108547202489443e-06, + "loss": 0.6533, + "step": 10583 + }, + { + "epoch": 0.9463519313304721, + "grad_norm": 0.14562315297216877, + "learning_rate": 1.5058437465602982e-06, + "loss": 0.6517, + "step": 10584 + }, + { + "epoch": 0.9464413447782547, + "grad_norm": 0.12731717603273202, + "learning_rate": 1.5008410334576006e-06, + "loss": 0.6202, + "step": 10585 + }, + { + "epoch": 0.9465307582260372, + "grad_norm": 0.14753144234787882, + "learning_rate": 1.495846581360394e-06, + "loss": 0.627, + "step": 10586 + }, + { + "epoch": 0.9466201716738197, + "grad_norm": 0.15524576577729687, + "learning_rate": 1.4908603906875761e-06, + "loss": 0.6471, + "step": 10587 + }, + { + "epoch": 0.9467095851216023, + "grad_norm": 0.16035208649327562, + "learning_rate": 1.4858824618573352e-06, + "loss": 0.6646, + "step": 10588 + }, + { + "epoch": 0.9467989985693849, + "grad_norm": 0.12682115809886366, + "learning_rate": 1.4809127952871592e-06, + "loss": 0.5554, + "step": 10589 + }, + { + "epoch": 0.9468884120171673, + "grad_norm": 0.1369721422046803, + "learning_rate": 1.4759513913938372e-06, + "loss": 0.6117, + "step": 10590 + }, + { + "epoch": 0.9469778254649499, + "grad_norm": 0.13619607332310987, + "learning_rate": 1.4709982505934806e-06, + "loss": 0.6426, + "step": 10591 + }, + { + "epoch": 0.9470672389127325, + "grad_norm": 0.1464598243939692, + "learning_rate": 1.4660533733015236e-06, + "loss": 0.6385, + "step": 10592 + }, + { + "epoch": 0.947156652360515, + "grad_norm": 0.1342812062516323, + "learning_rate": 1.461116759932657e-06, + "loss": 0.5911, + "step": 10593 + }, + { + "epoch": 0.9472460658082976, + "grad_norm": 0.14288135929828108, + "learning_rate": 1.4561884109009384e-06, + "loss": 0.6187, + "step": 10594 + }, + { + "epoch": 0.9473354792560801, + "grad_norm": 0.1442658981285425, + "learning_rate": 1.4512683266196703e-06, + "loss": 0.641, + "step": 10595 + }, + { + "epoch": 0.9474248927038627, + "grad_norm": 0.12661260254390205, + "learning_rate": 1.4463565075015228e-06, + "loss": 0.5034, + "step": 10596 + }, + { + "epoch": 0.9475143061516452, + "grad_norm": 0.13987459307367428, + "learning_rate": 1.441452953958422e-06, + "loss": 0.637, + "step": 10597 + }, + { + "epoch": 0.9476037195994278, + "grad_norm": 0.15084880821767355, + "learning_rate": 1.43655766640165e-06, + "loss": 0.6053, + "step": 10598 + }, + { + "epoch": 0.9476931330472103, + "grad_norm": 0.15263583053631, + "learning_rate": 1.4316706452417338e-06, + "loss": 0.6536, + "step": 10599 + }, + { + "epoch": 0.9477825464949928, + "grad_norm": 0.14699022266521833, + "learning_rate": 1.4267918908885681e-06, + "loss": 0.6419, + "step": 10600 + }, + { + "epoch": 0.9478719599427754, + "grad_norm": 0.1360017902143352, + "learning_rate": 1.421921403751314e-06, + "loss": 0.6261, + "step": 10601 + }, + { + "epoch": 0.947961373390558, + "grad_norm": 0.14547498096158032, + "learning_rate": 1.4170591842384672e-06, + "loss": 0.6511, + "step": 10602 + }, + { + "epoch": 0.9480507868383404, + "grad_norm": 0.1514533987985747, + "learning_rate": 1.4122052327578128e-06, + "loss": 0.6416, + "step": 10603 + }, + { + "epoch": 0.948140200286123, + "grad_norm": 0.15157634283925725, + "learning_rate": 1.4073595497164361e-06, + "loss": 0.6359, + "step": 10604 + }, + { + "epoch": 0.9482296137339056, + "grad_norm": 0.1321203874287434, + "learning_rate": 1.402522135520734e-06, + "loss": 0.5973, + "step": 10605 + }, + { + "epoch": 0.9483190271816881, + "grad_norm": 0.1346461104900089, + "learning_rate": 1.397692990576449e-06, + "loss": 0.6065, + "step": 10606 + }, + { + "epoch": 0.9484084406294707, + "grad_norm": 0.11651530433510153, + "learning_rate": 1.392872115288546e-06, + "loss": 0.6147, + "step": 10607 + }, + { + "epoch": 0.9484978540772532, + "grad_norm": 0.13566026992536764, + "learning_rate": 1.3880595100613792e-06, + "loss": 0.6085, + "step": 10608 + }, + { + "epoch": 0.9485872675250357, + "grad_norm": 0.151316331091179, + "learning_rate": 1.3832551752985811e-06, + "loss": 0.6355, + "step": 10609 + }, + { + "epoch": 0.9486766809728183, + "grad_norm": 0.11929046923808413, + "learning_rate": 1.378459111403052e-06, + "loss": 0.611, + "step": 10610 + }, + { + "epoch": 0.9487660944206009, + "grad_norm": 0.1484686973549508, + "learning_rate": 1.37367131877707e-06, + "loss": 0.6573, + "step": 10611 + }, + { + "epoch": 0.9488555078683834, + "grad_norm": 0.12445753724169632, + "learning_rate": 1.3688917978221583e-06, + "loss": 0.6171, + "step": 10612 + }, + { + "epoch": 0.9489449213161659, + "grad_norm": 0.13670280483615016, + "learning_rate": 1.364120548939174e-06, + "loss": 0.6267, + "step": 10613 + }, + { + "epoch": 0.9490343347639485, + "grad_norm": 0.1311649166881929, + "learning_rate": 1.3593575725282749e-06, + "loss": 0.6135, + "step": 10614 + }, + { + "epoch": 0.9491237482117311, + "grad_norm": 0.1337983642892334, + "learning_rate": 1.3546028689889302e-06, + "loss": 0.619, + "step": 10615 + }, + { + "epoch": 0.9492131616595136, + "grad_norm": 0.13269645061604407, + "learning_rate": 1.3498564387199098e-06, + "loss": 0.6076, + "step": 10616 + }, + { + "epoch": 0.9493025751072961, + "grad_norm": 0.14726405291891756, + "learning_rate": 1.3451182821192954e-06, + "loss": 0.6237, + "step": 10617 + }, + { + "epoch": 0.9493919885550787, + "grad_norm": 0.1579156852448733, + "learning_rate": 1.3403883995844579e-06, + "loss": 0.6015, + "step": 10618 + }, + { + "epoch": 0.9494814020028612, + "grad_norm": 0.13857908194001542, + "learning_rate": 1.3356667915121025e-06, + "loss": 0.5985, + "step": 10619 + }, + { + "epoch": 0.9495708154506438, + "grad_norm": 0.1430255441731802, + "learning_rate": 1.330953458298212e-06, + "loss": 0.614, + "step": 10620 + }, + { + "epoch": 0.9496602288984263, + "grad_norm": 0.1470585094520141, + "learning_rate": 1.3262484003380927e-06, + "loss": 0.6159, + "step": 10621 + }, + { + "epoch": 0.9497496423462088, + "grad_norm": 0.13610519787040495, + "learning_rate": 1.321551618026351e-06, + "loss": 0.6469, + "step": 10622 + }, + { + "epoch": 0.9498390557939914, + "grad_norm": 0.14638426954194556, + "learning_rate": 1.3168631117569052e-06, + "loss": 0.6277, + "step": 10623 + }, + { + "epoch": 0.949928469241774, + "grad_norm": 0.14213259115510887, + "learning_rate": 1.3121828819229743e-06, + "loss": 0.6376, + "step": 10624 + }, + { + "epoch": 0.9500178826895566, + "grad_norm": 0.14610650094438854, + "learning_rate": 1.3075109289170773e-06, + "loss": 0.6211, + "step": 10625 + }, + { + "epoch": 0.950107296137339, + "grad_norm": 0.14357980459463, + "learning_rate": 1.3028472531310454e-06, + "loss": 0.6198, + "step": 10626 + }, + { + "epoch": 0.9501967095851216, + "grad_norm": 0.14747863893613775, + "learning_rate": 1.2981918549560213e-06, + "loss": 0.5991, + "step": 10627 + }, + { + "epoch": 0.9502861230329042, + "grad_norm": 0.1550828940829764, + "learning_rate": 1.293544734782437e-06, + "loss": 0.6574, + "step": 10628 + }, + { + "epoch": 0.9503755364806867, + "grad_norm": 0.14331430960965127, + "learning_rate": 1.2889058930000586e-06, + "loss": 0.6155, + "step": 10629 + }, + { + "epoch": 0.9504649499284692, + "grad_norm": 0.13812865035622504, + "learning_rate": 1.2842753299979305e-06, + "loss": 0.6461, + "step": 10630 + }, + { + "epoch": 0.9505543633762518, + "grad_norm": 0.1360618981705308, + "learning_rate": 1.2796530461644086e-06, + "loss": 0.58, + "step": 10631 + }, + { + "epoch": 0.9506437768240343, + "grad_norm": 0.12716722169665307, + "learning_rate": 1.2750390418871604e-06, + "loss": 0.6097, + "step": 10632 + }, + { + "epoch": 0.9507331902718169, + "grad_norm": 0.14534365619318299, + "learning_rate": 1.2704333175531546e-06, + "loss": 0.5977, + "step": 10633 + }, + { + "epoch": 0.9508226037195995, + "grad_norm": 0.1595345082938002, + "learning_rate": 1.265835873548682e-06, + "loss": 0.6531, + "step": 10634 + }, + { + "epoch": 0.9509120171673819, + "grad_norm": 0.13583731073517133, + "learning_rate": 1.2612467102593006e-06, + "loss": 0.6527, + "step": 10635 + }, + { + "epoch": 0.9510014306151645, + "grad_norm": 0.15162765990589577, + "learning_rate": 1.256665828069925e-06, + "loss": 0.6278, + "step": 10636 + }, + { + "epoch": 0.9510908440629471, + "grad_norm": 0.13785721301822343, + "learning_rate": 1.2520932273647258e-06, + "loss": 0.6157, + "step": 10637 + }, + { + "epoch": 0.9511802575107297, + "grad_norm": 0.14008865000817308, + "learning_rate": 1.2475289085272178e-06, + "loss": 0.622, + "step": 10638 + }, + { + "epoch": 0.9512696709585121, + "grad_norm": 0.12750484935325793, + "learning_rate": 1.2429728719401845e-06, + "loss": 0.6043, + "step": 10639 + }, + { + "epoch": 0.9513590844062947, + "grad_norm": 0.12730462810052995, + "learning_rate": 1.2384251179857643e-06, + "loss": 0.65, + "step": 10640 + }, + { + "epoch": 0.9514484978540773, + "grad_norm": 0.15449477264629874, + "learning_rate": 1.233885647045341e-06, + "loss": 0.6261, + "step": 10641 + }, + { + "epoch": 0.9515379113018598, + "grad_norm": 0.14868745771418745, + "learning_rate": 1.2293544594996543e-06, + "loss": 0.6222, + "step": 10642 + }, + { + "epoch": 0.9516273247496424, + "grad_norm": 0.14575310671964087, + "learning_rate": 1.2248315557287337e-06, + "loss": 0.6163, + "step": 10643 + }, + { + "epoch": 0.9517167381974249, + "grad_norm": 0.1280493708065406, + "learning_rate": 1.2203169361118871e-06, + "loss": 0.6388, + "step": 10644 + }, + { + "epoch": 0.9518061516452074, + "grad_norm": 0.1348540404086411, + "learning_rate": 1.215810601027767e-06, + "loss": 0.6282, + "step": 10645 + }, + { + "epoch": 0.95189556509299, + "grad_norm": 0.1374195337035723, + "learning_rate": 1.2113125508543267e-06, + "loss": 0.6233, + "step": 10646 + }, + { + "epoch": 0.9519849785407726, + "grad_norm": 0.15400911094940453, + "learning_rate": 1.2068227859687753e-06, + "loss": 0.6387, + "step": 10647 + }, + { + "epoch": 0.952074391988555, + "grad_norm": 0.13137843587990689, + "learning_rate": 1.2023413067476896e-06, + "loss": 0.6216, + "step": 10648 + }, + { + "epoch": 0.9521638054363376, + "grad_norm": 0.15015380675800047, + "learning_rate": 1.1978681135669245e-06, + "loss": 0.6447, + "step": 10649 + }, + { + "epoch": 0.9522532188841202, + "grad_norm": 0.13986319799308253, + "learning_rate": 1.1934032068016354e-06, + "loss": 0.621, + "step": 10650 + }, + { + "epoch": 0.9523426323319027, + "grad_norm": 0.14030137416175073, + "learning_rate": 1.1889465868263005e-06, + "loss": 0.636, + "step": 10651 + }, + { + "epoch": 0.9524320457796852, + "grad_norm": 0.13605600636273907, + "learning_rate": 1.1844982540146654e-06, + "loss": 0.6262, + "step": 10652 + }, + { + "epoch": 0.9525214592274678, + "grad_norm": 0.1408027119508627, + "learning_rate": 1.1800582087398316e-06, + "loss": 0.6155, + "step": 10653 + }, + { + "epoch": 0.9526108726752504, + "grad_norm": 0.13511731134811725, + "learning_rate": 1.1756264513741676e-06, + "loss": 0.6197, + "step": 10654 + }, + { + "epoch": 0.9527002861230329, + "grad_norm": 0.130233536177793, + "learning_rate": 1.1712029822893654e-06, + "loss": 0.6014, + "step": 10655 + }, + { + "epoch": 0.9527896995708155, + "grad_norm": 0.14852613083363175, + "learning_rate": 1.1667878018564171e-06, + "loss": 0.6108, + "step": 10656 + }, + { + "epoch": 0.952879113018598, + "grad_norm": 0.1313458498748765, + "learning_rate": 1.1623809104456262e-06, + "loss": 0.6064, + "step": 10657 + }, + { + "epoch": 0.9529685264663805, + "grad_norm": 0.14708028279274404, + "learning_rate": 1.157982308426564e-06, + "loss": 0.658, + "step": 10658 + }, + { + "epoch": 0.9530579399141631, + "grad_norm": 0.1437477803723859, + "learning_rate": 1.1535919961681575e-06, + "loss": 0.6058, + "step": 10659 + }, + { + "epoch": 0.9531473533619457, + "grad_norm": 0.12574506889777512, + "learning_rate": 1.1492099740386231e-06, + "loss": 0.6353, + "step": 10660 + }, + { + "epoch": 0.9532367668097281, + "grad_norm": 0.1432691910740884, + "learning_rate": 1.144836242405467e-06, + "loss": 0.6419, + "step": 10661 + }, + { + "epoch": 0.9533261802575107, + "grad_norm": 0.13943192781380492, + "learning_rate": 1.140470801635496e-06, + "loss": 0.6424, + "step": 10662 + }, + { + "epoch": 0.9534155937052933, + "grad_norm": 0.14576030410557986, + "learning_rate": 1.13611365209485e-06, + "loss": 0.6342, + "step": 10663 + }, + { + "epoch": 0.9535050071530758, + "grad_norm": 0.15128157598321595, + "learning_rate": 1.1317647941489595e-06, + "loss": 0.6628, + "step": 10664 + }, + { + "epoch": 0.9535944206008584, + "grad_norm": 0.14502584788018386, + "learning_rate": 1.1274242281625547e-06, + "loss": 0.6411, + "step": 10665 + }, + { + "epoch": 0.9536838340486409, + "grad_norm": 0.1584293354850336, + "learning_rate": 1.1230919544996776e-06, + "loss": 0.6407, + "step": 10666 + }, + { + "epoch": 0.9537732474964234, + "grad_norm": 0.13895356660047709, + "learning_rate": 1.1187679735236489e-06, + "loss": 0.6372, + "step": 10667 + }, + { + "epoch": 0.953862660944206, + "grad_norm": 0.14524980060737325, + "learning_rate": 1.114452285597145e-06, + "loss": 0.6232, + "step": 10668 + }, + { + "epoch": 0.9539520743919886, + "grad_norm": 0.14170283678199141, + "learning_rate": 1.110144891082099e-06, + "loss": 0.6114, + "step": 10669 + }, + { + "epoch": 0.954041487839771, + "grad_norm": 0.12686316686942156, + "learning_rate": 1.1058457903397656e-06, + "loss": 0.5662, + "step": 10670 + }, + { + "epoch": 0.9541309012875536, + "grad_norm": 0.1453871392666989, + "learning_rate": 1.1015549837307237e-06, + "loss": 0.6208, + "step": 10671 + }, + { + "epoch": 0.9542203147353362, + "grad_norm": 0.1424845650072512, + "learning_rate": 1.0972724716148187e-06, + "loss": 0.6468, + "step": 10672 + }, + { + "epoch": 0.9543097281831188, + "grad_norm": 0.14452042654348135, + "learning_rate": 1.0929982543512296e-06, + "loss": 0.6447, + "step": 10673 + }, + { + "epoch": 0.9543991416309013, + "grad_norm": 0.15124236469558108, + "learning_rate": 1.0887323322984366e-06, + "loss": 0.6641, + "step": 10674 + }, + { + "epoch": 0.9544885550786838, + "grad_norm": 0.14176543516269077, + "learning_rate": 1.084474705814198e-06, + "loss": 0.6157, + "step": 10675 + }, + { + "epoch": 0.9545779685264664, + "grad_norm": 0.1484778294302716, + "learning_rate": 1.0802253752556058e-06, + "loss": 0.6374, + "step": 10676 + }, + { + "epoch": 0.9546673819742489, + "grad_norm": 0.13974157633439518, + "learning_rate": 1.0759843409790527e-06, + "loss": 0.6046, + "step": 10677 + }, + { + "epoch": 0.9547567954220315, + "grad_norm": 0.15020386511724362, + "learning_rate": 1.0717516033402097e-06, + "loss": 0.6223, + "step": 10678 + }, + { + "epoch": 0.954846208869814, + "grad_norm": 0.13473159371031712, + "learning_rate": 1.0675271626940931e-06, + "loss": 0.6253, + "step": 10679 + }, + { + "epoch": 0.9549356223175965, + "grad_norm": 0.13943532854415217, + "learning_rate": 1.063311019395008e-06, + "loss": 0.6004, + "step": 10680 + }, + { + "epoch": 0.9550250357653791, + "grad_norm": 0.13413228967122398, + "learning_rate": 1.0591031737965273e-06, + "loss": 0.6267, + "step": 10681 + }, + { + "epoch": 0.9551144492131617, + "grad_norm": 0.13489492364058756, + "learning_rate": 1.0549036262515689e-06, + "loss": 0.6405, + "step": 10682 + }, + { + "epoch": 0.9552038626609443, + "grad_norm": 0.12803805020846662, + "learning_rate": 1.0507123771123505e-06, + "loss": 0.6197, + "step": 10683 + }, + { + "epoch": 0.9552932761087267, + "grad_norm": 0.14810207711972084, + "learning_rate": 1.0465294267303915e-06, + "loss": 0.655, + "step": 10684 + }, + { + "epoch": 0.9553826895565093, + "grad_norm": 0.13409751023923597, + "learning_rate": 1.0423547754564888e-06, + "loss": 0.6257, + "step": 10685 + }, + { + "epoch": 0.9554721030042919, + "grad_norm": 0.1468355286318428, + "learning_rate": 1.0381884236407958e-06, + "loss": 0.6338, + "step": 10686 + }, + { + "epoch": 0.9555615164520744, + "grad_norm": 0.12775271015290687, + "learning_rate": 1.0340303716327215e-06, + "loss": 0.6244, + "step": 10687 + }, + { + "epoch": 0.9556509298998569, + "grad_norm": 0.14600561038865023, + "learning_rate": 1.0298806197809984e-06, + "loss": 0.6168, + "step": 10688 + }, + { + "epoch": 0.9557403433476395, + "grad_norm": 0.12966125460152392, + "learning_rate": 1.0257391684336703e-06, + "loss": 0.5984, + "step": 10689 + }, + { + "epoch": 0.955829756795422, + "grad_norm": 0.15160627156984896, + "learning_rate": 1.0216060179380481e-06, + "loss": 0.6519, + "step": 10690 + }, + { + "epoch": 0.9559191702432046, + "grad_norm": 0.12097243385173657, + "learning_rate": 1.0174811686408104e-06, + "loss": 0.6356, + "step": 10691 + }, + { + "epoch": 0.9560085836909872, + "grad_norm": 0.15886319412395883, + "learning_rate": 1.01336462088788e-06, + "loss": 0.6734, + "step": 10692 + }, + { + "epoch": 0.9560979971387696, + "grad_norm": 0.14484628328899185, + "learning_rate": 1.0092563750245032e-06, + "loss": 0.6431, + "step": 10693 + }, + { + "epoch": 0.9561874105865522, + "grad_norm": 0.14585432599354575, + "learning_rate": 1.00515643139526e-06, + "loss": 0.6607, + "step": 10694 + }, + { + "epoch": 0.9562768240343348, + "grad_norm": 0.15107446179203285, + "learning_rate": 1.0010647903439862e-06, + "loss": 0.6677, + "step": 10695 + }, + { + "epoch": 0.9563662374821174, + "grad_norm": 0.14659083592985309, + "learning_rate": 9.96981452213852e-07, + "loss": 0.6147, + "step": 10696 + }, + { + "epoch": 0.9564556509298998, + "grad_norm": 0.1512402039245558, + "learning_rate": 9.929064173473057e-07, + "loss": 0.6321, + "step": 10697 + }, + { + "epoch": 0.9565450643776824, + "grad_norm": 0.1490721801487878, + "learning_rate": 9.888396860861404e-07, + "loss": 0.6347, + "step": 10698 + }, + { + "epoch": 0.956634477825465, + "grad_norm": 0.13417896642225566, + "learning_rate": 9.847812587714057e-07, + "loss": 0.5672, + "step": 10699 + }, + { + "epoch": 0.9567238912732475, + "grad_norm": 0.13537350793658984, + "learning_rate": 9.807311357434956e-07, + "loss": 0.6099, + "step": 10700 + }, + { + "epoch": 0.95681330472103, + "grad_norm": 0.1353508088658358, + "learning_rate": 9.766893173420721e-07, + "loss": 0.6356, + "step": 10701 + }, + { + "epoch": 0.9569027181688126, + "grad_norm": 0.11967850108841793, + "learning_rate": 9.726558039061308e-07, + "loss": 0.6287, + "step": 10702 + }, + { + "epoch": 0.9569921316165951, + "grad_norm": 0.15314910507632545, + "learning_rate": 9.68630595773956e-07, + "loss": 0.654, + "step": 10703 + }, + { + "epoch": 0.9570815450643777, + "grad_norm": 0.15529786470408452, + "learning_rate": 9.64613693283123e-07, + "loss": 0.6689, + "step": 10704 + }, + { + "epoch": 0.9571709585121603, + "grad_norm": 0.1432276711089892, + "learning_rate": 9.606050967705393e-07, + "loss": 0.6508, + "step": 10705 + }, + { + "epoch": 0.9572603719599427, + "grad_norm": 0.13907940591779347, + "learning_rate": 9.566048065724032e-07, + "loss": 0.6307, + "step": 10706 + }, + { + "epoch": 0.9573497854077253, + "grad_norm": 0.1474673770098785, + "learning_rate": 9.526128230242016e-07, + "loss": 0.6257, + "step": 10707 + }, + { + "epoch": 0.9574391988555079, + "grad_norm": 0.12641635471923926, + "learning_rate": 9.486291464607444e-07, + "loss": 0.6224, + "step": 10708 + }, + { + "epoch": 0.9575286123032904, + "grad_norm": 0.1265725721048859, + "learning_rate": 9.446537772161423e-07, + "loss": 0.6071, + "step": 10709 + }, + { + "epoch": 0.9576180257510729, + "grad_norm": 0.1382449865038119, + "learning_rate": 9.406867156237842e-07, + "loss": 0.6563, + "step": 10710 + }, + { + "epoch": 0.9577074391988555, + "grad_norm": 0.1442747743125284, + "learning_rate": 9.367279620164149e-07, + "loss": 0.6527, + "step": 10711 + }, + { + "epoch": 0.957796852646638, + "grad_norm": 0.14572004274252254, + "learning_rate": 9.327775167260244e-07, + "loss": 0.6184, + "step": 10712 + }, + { + "epoch": 0.9578862660944206, + "grad_norm": 0.14834963205612522, + "learning_rate": 9.288353800839366e-07, + "loss": 0.6358, + "step": 10713 + }, + { + "epoch": 0.9579756795422032, + "grad_norm": 0.14499000051215485, + "learning_rate": 9.249015524207872e-07, + "loss": 0.6168, + "step": 10714 + }, + { + "epoch": 0.9580650929899857, + "grad_norm": 0.14671959856320374, + "learning_rate": 9.209760340664897e-07, + "loss": 0.6234, + "step": 10715 + }, + { + "epoch": 0.9581545064377682, + "grad_norm": 0.14359791257028243, + "learning_rate": 9.170588253502698e-07, + "loss": 0.6715, + "step": 10716 + }, + { + "epoch": 0.9582439198855508, + "grad_norm": 0.12895091961327895, + "learning_rate": 9.13149926600676e-07, + "loss": 0.6149, + "step": 10717 + }, + { + "epoch": 0.9583333333333334, + "grad_norm": 0.13216904345545402, + "learning_rate": 9.092493381455236e-07, + "loss": 0.6206, + "step": 10718 + }, + { + "epoch": 0.9584227467811158, + "grad_norm": 0.13832505812763313, + "learning_rate": 9.05357060311951e-07, + "loss": 0.6267, + "step": 10719 + }, + { + "epoch": 0.9585121602288984, + "grad_norm": 0.1456798659928948, + "learning_rate": 9.014730934264192e-07, + "loss": 0.6412, + "step": 10720 + }, + { + "epoch": 0.958601573676681, + "grad_norm": 0.1457395417331456, + "learning_rate": 8.975974378146457e-07, + "loss": 0.6578, + "step": 10721 + }, + { + "epoch": 0.9586909871244635, + "grad_norm": 0.14592548116809087, + "learning_rate": 8.937300938017035e-07, + "loss": 0.6205, + "step": 10722 + }, + { + "epoch": 0.9587804005722461, + "grad_norm": 0.13429141517801124, + "learning_rate": 8.898710617119222e-07, + "loss": 0.6083, + "step": 10723 + }, + { + "epoch": 0.9588698140200286, + "grad_norm": 0.14312898737939825, + "learning_rate": 8.860203418689539e-07, + "loss": 0.6474, + "step": 10724 + }, + { + "epoch": 0.9589592274678111, + "grad_norm": 0.13595479411027087, + "learning_rate": 8.821779345957626e-07, + "loss": 0.6295, + "step": 10725 + }, + { + "epoch": 0.9590486409155937, + "grad_norm": 0.14363592032096748, + "learning_rate": 8.783438402146127e-07, + "loss": 0.5846, + "step": 10726 + }, + { + "epoch": 0.9591380543633763, + "grad_norm": 0.1453953004374146, + "learning_rate": 8.74518059047047e-07, + "loss": 0.655, + "step": 10727 + }, + { + "epoch": 0.9592274678111588, + "grad_norm": 0.1431923366215784, + "learning_rate": 8.707005914139422e-07, + "loss": 0.6341, + "step": 10728 + }, + { + "epoch": 0.9593168812589413, + "grad_norm": 0.13835472270636834, + "learning_rate": 8.668914376354642e-07, + "loss": 0.632, + "step": 10729 + }, + { + "epoch": 0.9594062947067239, + "grad_norm": 0.15325515559607214, + "learning_rate": 8.630905980310689e-07, + "loss": 0.6328, + "step": 10730 + }, + { + "epoch": 0.9594957081545065, + "grad_norm": 0.16577319455953152, + "learning_rate": 8.592980729195455e-07, + "loss": 0.6824, + "step": 10731 + }, + { + "epoch": 0.959585121602289, + "grad_norm": 0.13352702426463148, + "learning_rate": 8.555138626189618e-07, + "loss": 0.634, + "step": 10732 + }, + { + "epoch": 0.9596745350500715, + "grad_norm": 0.13766268500969794, + "learning_rate": 8.517379674466863e-07, + "loss": 0.5889, + "step": 10733 + }, + { + "epoch": 0.9597639484978541, + "grad_norm": 0.14597827778237688, + "learning_rate": 8.479703877194212e-07, + "loss": 0.6116, + "step": 10734 + }, + { + "epoch": 0.9598533619456366, + "grad_norm": 0.12939897099226674, + "learning_rate": 8.442111237531247e-07, + "loss": 0.6301, + "step": 10735 + }, + { + "epoch": 0.9599427753934192, + "grad_norm": 0.1393414048586491, + "learning_rate": 8.404601758630892e-07, + "loss": 0.6421, + "step": 10736 + }, + { + "epoch": 0.9600321888412017, + "grad_norm": 0.14057423330062144, + "learning_rate": 8.367175443639075e-07, + "loss": 0.6771, + "step": 10737 + }, + { + "epoch": 0.9601216022889842, + "grad_norm": 0.1504925519634741, + "learning_rate": 8.329832295694618e-07, + "loss": 0.5976, + "step": 10738 + }, + { + "epoch": 0.9602110157367668, + "grad_norm": 0.1461632395378375, + "learning_rate": 8.29257231792957e-07, + "loss": 0.6263, + "step": 10739 + }, + { + "epoch": 0.9603004291845494, + "grad_norm": 0.1386983400644285, + "learning_rate": 8.255395513468767e-07, + "loss": 0.64, + "step": 10740 + }, + { + "epoch": 0.960389842632332, + "grad_norm": 0.13438267464004683, + "learning_rate": 8.218301885430268e-07, + "loss": 0.6247, + "step": 10741 + }, + { + "epoch": 0.9604792560801144, + "grad_norm": 0.13458102182897613, + "learning_rate": 8.181291436924921e-07, + "loss": 0.6194, + "step": 10742 + }, + { + "epoch": 0.960568669527897, + "grad_norm": 0.14068356067018295, + "learning_rate": 8.144364171056906e-07, + "loss": 0.6208, + "step": 10743 + }, + { + "epoch": 0.9606580829756796, + "grad_norm": 0.1525164841189012, + "learning_rate": 8.107520090923193e-07, + "loss": 0.6422, + "step": 10744 + }, + { + "epoch": 0.9607474964234621, + "grad_norm": 0.13406033422575844, + "learning_rate": 8.070759199613864e-07, + "loss": 0.621, + "step": 10745 + }, + { + "epoch": 0.9608369098712446, + "grad_norm": 0.14546246050597506, + "learning_rate": 8.03408150021201e-07, + "loss": 0.6114, + "step": 10746 + }, + { + "epoch": 0.9609263233190272, + "grad_norm": 0.1222018056951582, + "learning_rate": 7.997486995793834e-07, + "loss": 0.5553, + "step": 10747 + }, + { + "epoch": 0.9610157367668097, + "grad_norm": 0.14199886555744184, + "learning_rate": 7.96097568942833e-07, + "loss": 0.6301, + "step": 10748 + }, + { + "epoch": 0.9611051502145923, + "grad_norm": 0.14835716079004774, + "learning_rate": 7.924547584177711e-07, + "loss": 0.6338, + "step": 10749 + }, + { + "epoch": 0.9611945636623748, + "grad_norm": 0.14145591047852496, + "learning_rate": 7.88820268309709e-07, + "loss": 0.5931, + "step": 10750 + }, + { + "epoch": 0.9612839771101573, + "grad_norm": 0.1466797559516681, + "learning_rate": 7.851940989234919e-07, + "loss": 0.6106, + "step": 10751 + }, + { + "epoch": 0.9613733905579399, + "grad_norm": 0.14248479208194678, + "learning_rate": 7.815762505632096e-07, + "loss": 0.6276, + "step": 10752 + }, + { + "epoch": 0.9614628040057225, + "grad_norm": 0.1431102943913707, + "learning_rate": 7.779667235322974e-07, + "loss": 0.6178, + "step": 10753 + }, + { + "epoch": 0.961552217453505, + "grad_norm": 0.13911390644912952, + "learning_rate": 7.743655181335019e-07, + "loss": 0.6581, + "step": 10754 + }, + { + "epoch": 0.9616416309012875, + "grad_norm": 0.11884495497586257, + "learning_rate": 7.707726346688259e-07, + "loss": 0.6108, + "step": 10755 + }, + { + "epoch": 0.9617310443490701, + "grad_norm": 0.13514906894410023, + "learning_rate": 7.671880734396175e-07, + "loss": 0.6607, + "step": 10756 + }, + { + "epoch": 0.9618204577968527, + "grad_norm": 0.1495357442576066, + "learning_rate": 7.636118347465027e-07, + "loss": 0.6476, + "step": 10757 + }, + { + "epoch": 0.9619098712446352, + "grad_norm": 0.1537216710671733, + "learning_rate": 7.600439188894082e-07, + "loss": 0.6469, + "step": 10758 + }, + { + "epoch": 0.9619992846924177, + "grad_norm": 0.1431716810915022, + "learning_rate": 7.564843261675835e-07, + "loss": 0.6238, + "step": 10759 + }, + { + "epoch": 0.9620886981402003, + "grad_norm": 0.15072011081888434, + "learning_rate": 7.529330568795568e-07, + "loss": 0.6644, + "step": 10760 + }, + { + "epoch": 0.9621781115879828, + "grad_norm": 0.1420130081951305, + "learning_rate": 7.493901113231782e-07, + "loss": 0.6784, + "step": 10761 + }, + { + "epoch": 0.9622675250357654, + "grad_norm": 0.13217916345318106, + "learning_rate": 7.458554897955883e-07, + "loss": 0.6029, + "step": 10762 + }, + { + "epoch": 0.962356938483548, + "grad_norm": 0.14413626277024347, + "learning_rate": 7.423291925932275e-07, + "loss": 0.6227, + "step": 10763 + }, + { + "epoch": 0.9624463519313304, + "grad_norm": 0.15350862527745254, + "learning_rate": 7.388112200118479e-07, + "loss": 0.6502, + "step": 10764 + }, + { + "epoch": 0.962535765379113, + "grad_norm": 0.13425442973220772, + "learning_rate": 7.353015723464918e-07, + "loss": 0.6275, + "step": 10765 + }, + { + "epoch": 0.9626251788268956, + "grad_norm": 0.14541703463763997, + "learning_rate": 7.318002498915122e-07, + "loss": 0.6048, + "step": 10766 + }, + { + "epoch": 0.9627145922746781, + "grad_norm": 0.12662942568488209, + "learning_rate": 7.283072529405521e-07, + "loss": 0.624, + "step": 10767 + }, + { + "epoch": 0.9628040057224606, + "grad_norm": 0.14683101019029876, + "learning_rate": 7.248225817865884e-07, + "loss": 0.6176, + "step": 10768 + }, + { + "epoch": 0.9628934191702432, + "grad_norm": 0.12899134152808361, + "learning_rate": 7.213462367218537e-07, + "loss": 0.589, + "step": 10769 + }, + { + "epoch": 0.9629828326180258, + "grad_norm": 0.1266412580112084, + "learning_rate": 7.17878218037904e-07, + "loss": 0.6156, + "step": 10770 + }, + { + "epoch": 0.9630722460658083, + "grad_norm": 0.14516091019161043, + "learning_rate": 7.144185260256175e-07, + "loss": 0.6181, + "step": 10771 + }, + { + "epoch": 0.9631616595135909, + "grad_norm": 0.14942466667201, + "learning_rate": 7.1096716097514e-07, + "loss": 0.6347, + "step": 10772 + }, + { + "epoch": 0.9632510729613734, + "grad_norm": 0.14047200028862195, + "learning_rate": 7.075241231759289e-07, + "loss": 0.6245, + "step": 10773 + }, + { + "epoch": 0.9633404864091559, + "grad_norm": 0.1491385227347996, + "learning_rate": 7.040894129167641e-07, + "loss": 0.6467, + "step": 10774 + }, + { + "epoch": 0.9634298998569385, + "grad_norm": 0.11439202877278962, + "learning_rate": 7.006630304856932e-07, + "loss": 0.6238, + "step": 10775 + }, + { + "epoch": 0.9635193133047211, + "grad_norm": 0.14188656548770298, + "learning_rate": 6.972449761700861e-07, + "loss": 0.5589, + "step": 10776 + }, + { + "epoch": 0.9636087267525035, + "grad_norm": 0.15369062960808139, + "learning_rate": 6.938352502566358e-07, + "loss": 0.6693, + "step": 10777 + }, + { + "epoch": 0.9636981402002861, + "grad_norm": 0.1319775795275505, + "learning_rate": 6.904338530312693e-07, + "loss": 0.6636, + "step": 10778 + }, + { + "epoch": 0.9637875536480687, + "grad_norm": 0.14239918935421378, + "learning_rate": 6.870407847792915e-07, + "loss": 0.6241, + "step": 10779 + }, + { + "epoch": 0.9638769670958512, + "grad_norm": 0.1351184558173469, + "learning_rate": 6.836560457852636e-07, + "loss": 0.5873, + "step": 10780 + }, + { + "epoch": 0.9639663805436338, + "grad_norm": 0.14618993904529862, + "learning_rate": 6.802796363330588e-07, + "loss": 0.6233, + "step": 10781 + }, + { + "epoch": 0.9640557939914163, + "grad_norm": 0.14506035545924198, + "learning_rate": 6.769115567058504e-07, + "loss": 0.6196, + "step": 10782 + }, + { + "epoch": 0.9641452074391988, + "grad_norm": 0.12795968430817964, + "learning_rate": 6.735518071861235e-07, + "loss": 0.5937, + "step": 10783 + }, + { + "epoch": 0.9642346208869814, + "grad_norm": 0.12369569756100574, + "learning_rate": 6.702003880556418e-07, + "loss": 0.6167, + "step": 10784 + }, + { + "epoch": 0.964324034334764, + "grad_norm": 0.12703143255559005, + "learning_rate": 6.668572995955025e-07, + "loss": 0.5979, + "step": 10785 + }, + { + "epoch": 0.9644134477825465, + "grad_norm": 0.15209063594954697, + "learning_rate": 6.635225420860702e-07, + "loss": 0.6066, + "step": 10786 + }, + { + "epoch": 0.964502861230329, + "grad_norm": 0.14843165257590638, + "learning_rate": 6.601961158070325e-07, + "loss": 0.6217, + "step": 10787 + }, + { + "epoch": 0.9645922746781116, + "grad_norm": 0.1568543999192746, + "learning_rate": 6.56878021037377e-07, + "loss": 0.6157, + "step": 10788 + }, + { + "epoch": 0.9646816881258942, + "grad_norm": 0.13204623753929523, + "learning_rate": 6.535682580553926e-07, + "loss": 0.6378, + "step": 10789 + }, + { + "epoch": 0.9647711015736766, + "grad_norm": 0.12983296066000924, + "learning_rate": 6.502668271386458e-07, + "loss": 0.6181, + "step": 10790 + }, + { + "epoch": 0.9648605150214592, + "grad_norm": 0.14852936608381676, + "learning_rate": 6.469737285640487e-07, + "loss": 0.5965, + "step": 10791 + }, + { + "epoch": 0.9649499284692418, + "grad_norm": 0.14282646373701388, + "learning_rate": 6.436889626077691e-07, + "loss": 0.6816, + "step": 10792 + }, + { + "epoch": 0.9650393419170243, + "grad_norm": 0.14416688278578652, + "learning_rate": 6.40412529545309e-07, + "loss": 0.6454, + "step": 10793 + }, + { + "epoch": 0.9651287553648069, + "grad_norm": 0.14633248831377638, + "learning_rate": 6.371444296514484e-07, + "loss": 0.627, + "step": 10794 + }, + { + "epoch": 0.9652181688125894, + "grad_norm": 0.13257908030492302, + "learning_rate": 6.338846632002904e-07, + "loss": 0.6329, + "step": 10795 + }, + { + "epoch": 0.9653075822603719, + "grad_norm": 0.14211446959987925, + "learning_rate": 6.306332304652273e-07, + "loss": 0.6204, + "step": 10796 + }, + { + "epoch": 0.9653969957081545, + "grad_norm": 0.14855319738253903, + "learning_rate": 6.273901317189301e-07, + "loss": 0.6277, + "step": 10797 + }, + { + "epoch": 0.9654864091559371, + "grad_norm": 0.12964619446942005, + "learning_rate": 6.241553672334255e-07, + "loss": 0.6407, + "step": 10798 + }, + { + "epoch": 0.9655758226037195, + "grad_norm": 0.13752687173577216, + "learning_rate": 6.209289372799854e-07, + "loss": 0.6422, + "step": 10799 + }, + { + "epoch": 0.9656652360515021, + "grad_norm": 0.14166299991030404, + "learning_rate": 6.177108421292266e-07, + "loss": 0.5994, + "step": 10800 + }, + { + "epoch": 0.9657546494992847, + "grad_norm": 0.1372895971671765, + "learning_rate": 6.145010820510222e-07, + "loss": 0.6526, + "step": 10801 + }, + { + "epoch": 0.9658440629470673, + "grad_norm": 0.13678976107910804, + "learning_rate": 6.112996573145902e-07, + "loss": 0.6148, + "step": 10802 + }, + { + "epoch": 0.9659334763948498, + "grad_norm": 0.14191361679909312, + "learning_rate": 6.081065681884268e-07, + "loss": 0.6244, + "step": 10803 + }, + { + "epoch": 0.9660228898426323, + "grad_norm": 0.1372186092943853, + "learning_rate": 6.04921814940329e-07, + "loss": 0.6318, + "step": 10804 + }, + { + "epoch": 0.9661123032904149, + "grad_norm": 0.14096238938652186, + "learning_rate": 6.017453978374055e-07, + "loss": 0.651, + "step": 10805 + }, + { + "epoch": 0.9662017167381974, + "grad_norm": 0.14440876485490708, + "learning_rate": 5.985773171460429e-07, + "loss": 0.6368, + "step": 10806 + }, + { + "epoch": 0.96629113018598, + "grad_norm": 0.1318072222667001, + "learning_rate": 5.954175731319622e-07, + "loss": 0.5948, + "step": 10807 + }, + { + "epoch": 0.9663805436337625, + "grad_norm": 0.15776028169244816, + "learning_rate": 5.922661660601514e-07, + "loss": 0.6553, + "step": 10808 + }, + { + "epoch": 0.966469957081545, + "grad_norm": 0.1463154930791866, + "learning_rate": 5.891230961949324e-07, + "loss": 0.6403, + "step": 10809 + }, + { + "epoch": 0.9665593705293276, + "grad_norm": 0.13430157439065388, + "learning_rate": 5.859883637998942e-07, + "loss": 0.639, + "step": 10810 + }, + { + "epoch": 0.9666487839771102, + "grad_norm": 0.13905740738066688, + "learning_rate": 5.8286196913796e-07, + "loss": 0.6141, + "step": 10811 + }, + { + "epoch": 0.9667381974248928, + "grad_norm": 0.12199142344558762, + "learning_rate": 5.7974391247132e-07, + "loss": 0.6026, + "step": 10812 + }, + { + "epoch": 0.9668276108726752, + "grad_norm": 0.13777995309894864, + "learning_rate": 5.766341940614872e-07, + "loss": 0.6527, + "step": 10813 + }, + { + "epoch": 0.9669170243204578, + "grad_norm": 0.13145351903199806, + "learning_rate": 5.735328141692642e-07, + "loss": 0.6036, + "step": 10814 + }, + { + "epoch": 0.9670064377682404, + "grad_norm": 0.13044389030115836, + "learning_rate": 5.704397730547762e-07, + "loss": 0.6289, + "step": 10815 + }, + { + "epoch": 0.9670958512160229, + "grad_norm": 0.13844763093806803, + "learning_rate": 5.673550709774267e-07, + "loss": 0.626, + "step": 10816 + }, + { + "epoch": 0.9671852646638054, + "grad_norm": 0.146677989213386, + "learning_rate": 5.6427870819592e-07, + "loss": 0.6412, + "step": 10817 + }, + { + "epoch": 0.967274678111588, + "grad_norm": 0.1471175496531894, + "learning_rate": 5.612106849682719e-07, + "loss": 0.6478, + "step": 10818 + }, + { + "epoch": 0.9673640915593705, + "grad_norm": 0.1403563651403121, + "learning_rate": 5.581510015517988e-07, + "loss": 0.5737, + "step": 10819 + }, + { + "epoch": 0.9674535050071531, + "grad_norm": 0.13438959624671176, + "learning_rate": 5.550996582030954e-07, + "loss": 0.5857, + "step": 10820 + }, + { + "epoch": 0.9675429184549357, + "grad_norm": 0.13940746632342682, + "learning_rate": 5.520566551780792e-07, + "loss": 0.62, + "step": 10821 + }, + { + "epoch": 0.9676323319027181, + "grad_norm": 0.13010859794697158, + "learning_rate": 5.490219927319795e-07, + "loss": 0.5882, + "step": 10822 + }, + { + "epoch": 0.9677217453505007, + "grad_norm": 0.14265362616991986, + "learning_rate": 5.459956711192926e-07, + "loss": 0.6364, + "step": 10823 + }, + { + "epoch": 0.9678111587982833, + "grad_norm": 0.12405430071786905, + "learning_rate": 5.429776905938489e-07, + "loss": 0.6127, + "step": 10824 + }, + { + "epoch": 0.9679005722460658, + "grad_norm": 0.13633924496480926, + "learning_rate": 5.399680514087458e-07, + "loss": 0.6353, + "step": 10825 + }, + { + "epoch": 0.9679899856938483, + "grad_norm": 0.13801960136188396, + "learning_rate": 5.369667538164036e-07, + "loss": 0.6272, + "step": 10826 + }, + { + "epoch": 0.9680793991416309, + "grad_norm": 0.14049103731815582, + "learning_rate": 5.339737980685433e-07, + "loss": 0.5996, + "step": 10827 + }, + { + "epoch": 0.9681688125894135, + "grad_norm": 0.1412880913016755, + "learning_rate": 5.30989184416164e-07, + "loss": 0.6583, + "step": 10828 + }, + { + "epoch": 0.968258226037196, + "grad_norm": 0.14443911123801825, + "learning_rate": 5.28012913109599e-07, + "loss": 0.5948, + "step": 10829 + }, + { + "epoch": 0.9683476394849786, + "grad_norm": 0.1359137796144237, + "learning_rate": 5.250449843984706e-07, + "loss": 0.6507, + "step": 10830 + }, + { + "epoch": 0.968437052932761, + "grad_norm": 0.1405201367620344, + "learning_rate": 5.220853985316798e-07, + "loss": 0.6265, + "step": 10831 + }, + { + "epoch": 0.9685264663805436, + "grad_norm": 0.14194951307738937, + "learning_rate": 5.191341557574392e-07, + "loss": 0.6234, + "step": 10832 + }, + { + "epoch": 0.9686158798283262, + "grad_norm": 0.14474320988943543, + "learning_rate": 5.16191256323273e-07, + "loss": 0.6138, + "step": 10833 + }, + { + "epoch": 0.9687052932761088, + "grad_norm": 0.13077341957879648, + "learning_rate": 5.132567004760169e-07, + "loss": 0.6428, + "step": 10834 + }, + { + "epoch": 0.9687947067238912, + "grad_norm": 0.14130533976723092, + "learning_rate": 5.103304884617521e-07, + "loss": 0.6183, + "step": 10835 + }, + { + "epoch": 0.9688841201716738, + "grad_norm": 0.13622263223787087, + "learning_rate": 5.074126205259266e-07, + "loss": 0.6244, + "step": 10836 + }, + { + "epoch": 0.9689735336194564, + "grad_norm": 0.1512207406240718, + "learning_rate": 5.045030969132447e-07, + "loss": 0.5897, + "step": 10837 + }, + { + "epoch": 0.969062947067239, + "grad_norm": 0.12147427345607692, + "learning_rate": 5.016019178677333e-07, + "loss": 0.5452, + "step": 10838 + }, + { + "epoch": 0.9691523605150214, + "grad_norm": 0.13662612470512459, + "learning_rate": 4.987090836327091e-07, + "loss": 0.6156, + "step": 10839 + }, + { + "epoch": 0.969241773962804, + "grad_norm": 0.1342217773992938, + "learning_rate": 4.958245944507777e-07, + "loss": 0.6189, + "step": 10840 + }, + { + "epoch": 0.9693311874105865, + "grad_norm": 0.13349215141473975, + "learning_rate": 4.929484505638682e-07, + "loss": 0.6076, + "step": 10841 + }, + { + "epoch": 0.9694206008583691, + "grad_norm": 0.15643682511515813, + "learning_rate": 4.900806522131984e-07, + "loss": 0.6837, + "step": 10842 + }, + { + "epoch": 0.9695100143061517, + "grad_norm": 0.14710903755660049, + "learning_rate": 4.872211996392872e-07, + "loss": 0.6469, + "step": 10843 + }, + { + "epoch": 0.9695994277539342, + "grad_norm": 0.1574240699901595, + "learning_rate": 4.843700930819539e-07, + "loss": 0.6258, + "step": 10844 + }, + { + "epoch": 0.9696888412017167, + "grad_norm": 0.15451151599756802, + "learning_rate": 4.815273327803182e-07, + "loss": 0.6308, + "step": 10845 + }, + { + "epoch": 0.9697782546494993, + "grad_norm": 0.14424902574085025, + "learning_rate": 4.786929189727896e-07, + "loss": 0.5905, + "step": 10846 + }, + { + "epoch": 0.9698676680972819, + "grad_norm": 0.13882032503855196, + "learning_rate": 4.758668518970999e-07, + "loss": 0.6549, + "step": 10847 + }, + { + "epoch": 0.9699570815450643, + "grad_norm": 0.15431973663563125, + "learning_rate": 4.7304913179025965e-07, + "loss": 0.6033, + "step": 10848 + }, + { + "epoch": 0.9700464949928469, + "grad_norm": 0.1360501810000153, + "learning_rate": 4.7023975888859095e-07, + "loss": 0.5973, + "step": 10849 + }, + { + "epoch": 0.9701359084406295, + "grad_norm": 0.142072003173717, + "learning_rate": 4.674387334277164e-07, + "loss": 0.6353, + "step": 10850 + }, + { + "epoch": 0.970225321888412, + "grad_norm": 0.1586131631712153, + "learning_rate": 4.6464605564254803e-07, + "loss": 0.6492, + "step": 10851 + }, + { + "epoch": 0.9703147353361946, + "grad_norm": 0.1329919062183331, + "learning_rate": 4.6186172576730967e-07, + "loss": 0.6387, + "step": 10852 + }, + { + "epoch": 0.9704041487839771, + "grad_norm": 0.13791458813134763, + "learning_rate": 4.5908574403551454e-07, + "loss": 0.6561, + "step": 10853 + }, + { + "epoch": 0.9704935622317596, + "grad_norm": 0.1362812238863832, + "learning_rate": 4.5631811067998743e-07, + "loss": 0.6416, + "step": 10854 + }, + { + "epoch": 0.9705829756795422, + "grad_norm": 0.13697462881858766, + "learning_rate": 4.5355882593283163e-07, + "loss": 0.6147, + "step": 10855 + }, + { + "epoch": 0.9706723891273248, + "grad_norm": 0.1411152778193052, + "learning_rate": 4.5080789002548417e-07, + "loss": 0.6262, + "step": 10856 + }, + { + "epoch": 0.9707618025751072, + "grad_norm": 0.141632817669852, + "learning_rate": 4.4806530318864945e-07, + "loss": 0.6375, + "step": 10857 + }, + { + "epoch": 0.9708512160228898, + "grad_norm": 0.14340406356335006, + "learning_rate": 4.453310656523435e-07, + "loss": 0.5871, + "step": 10858 + }, + { + "epoch": 0.9709406294706724, + "grad_norm": 0.1377292352125211, + "learning_rate": 4.42605177645905e-07, + "loss": 0.645, + "step": 10859 + }, + { + "epoch": 0.971030042918455, + "grad_norm": 0.13846378268835757, + "learning_rate": 4.39887639397929e-07, + "loss": 0.6223, + "step": 10860 + }, + { + "epoch": 0.9711194563662375, + "grad_norm": 0.16778221105471, + "learning_rate": 4.3717845113633307e-07, + "loss": 0.6312, + "step": 10861 + }, + { + "epoch": 0.97120886981402, + "grad_norm": 0.14293496816058107, + "learning_rate": 4.344776130883466e-07, + "loss": 0.6527, + "step": 10862 + }, + { + "epoch": 0.9712982832618026, + "grad_norm": 0.14863606944329202, + "learning_rate": 4.3178512548046613e-07, + "loss": 0.6034, + "step": 10863 + }, + { + "epoch": 0.9713876967095851, + "grad_norm": 0.13374244454157436, + "learning_rate": 4.291009885385333e-07, + "loss": 0.6793, + "step": 10864 + }, + { + "epoch": 0.9714771101573677, + "grad_norm": 0.15819474462056493, + "learning_rate": 4.264252024876458e-07, + "loss": 0.6269, + "step": 10865 + }, + { + "epoch": 0.9715665236051502, + "grad_norm": 0.13717667451956164, + "learning_rate": 4.237577675522131e-07, + "loss": 0.628, + "step": 10866 + }, + { + "epoch": 0.9716559370529327, + "grad_norm": 0.13247855517784063, + "learning_rate": 4.210986839559672e-07, + "loss": 0.6079, + "step": 10867 + }, + { + "epoch": 0.9717453505007153, + "grad_norm": 0.1426065728265448, + "learning_rate": 4.184479519219187e-07, + "loss": 0.6455, + "step": 10868 + }, + { + "epoch": 0.9718347639484979, + "grad_norm": 0.13443940433691512, + "learning_rate": 4.1580557167236744e-07, + "loss": 0.6132, + "step": 10869 + }, + { + "epoch": 0.9719241773962805, + "grad_norm": 0.15253454016975582, + "learning_rate": 4.131715434289363e-07, + "loss": 0.6464, + "step": 10870 + }, + { + "epoch": 0.9720135908440629, + "grad_norm": 0.12918213897197078, + "learning_rate": 4.105458674125373e-07, + "loss": 0.6215, + "step": 10871 + }, + { + "epoch": 0.9721030042918455, + "grad_norm": 0.1340428300111658, + "learning_rate": 4.0792854384338333e-07, + "loss": 0.5843, + "step": 10872 + }, + { + "epoch": 0.9721924177396281, + "grad_norm": 0.12590352426102372, + "learning_rate": 4.0531957294098755e-07, + "loss": 0.6228, + "step": 10873 + }, + { + "epoch": 0.9722818311874106, + "grad_norm": 0.13704576140609906, + "learning_rate": 4.027189549241639e-07, + "loss": 0.6391, + "step": 10874 + }, + { + "epoch": 0.9723712446351931, + "grad_norm": 0.14128858757806428, + "learning_rate": 4.001266900110046e-07, + "loss": 0.653, + "step": 10875 + }, + { + "epoch": 0.9724606580829757, + "grad_norm": 0.1511129269509432, + "learning_rate": 3.975427784189467e-07, + "loss": 0.6462, + "step": 10876 + }, + { + "epoch": 0.9725500715307582, + "grad_norm": 0.11191916737742284, + "learning_rate": 3.949672203646837e-07, + "loss": 0.6151, + "step": 10877 + }, + { + "epoch": 0.9726394849785408, + "grad_norm": 0.14016697985707174, + "learning_rate": 3.924000160642205e-07, + "loss": 0.6253, + "step": 10878 + }, + { + "epoch": 0.9727288984263234, + "grad_norm": 0.12353538015670586, + "learning_rate": 3.898411657328849e-07, + "loss": 0.6041, + "step": 10879 + }, + { + "epoch": 0.9728183118741058, + "grad_norm": 0.1491621733616844, + "learning_rate": 3.872906695852607e-07, + "loss": 0.6233, + "step": 10880 + }, + { + "epoch": 0.9729077253218884, + "grad_norm": 0.13155713337409256, + "learning_rate": 3.847485278352658e-07, + "loss": 0.625, + "step": 10881 + }, + { + "epoch": 0.972997138769671, + "grad_norm": 0.14032984415061384, + "learning_rate": 3.8221474069611854e-07, + "loss": 0.6496, + "step": 10882 + }, + { + "epoch": 0.9730865522174535, + "grad_norm": 0.12427664130846743, + "learning_rate": 3.7968930838030436e-07, + "loss": 0.6044, + "step": 10883 + }, + { + "epoch": 0.973175965665236, + "grad_norm": 0.1498035933330122, + "learning_rate": 3.771722310996428e-07, + "loss": 0.6487, + "step": 10884 + }, + { + "epoch": 0.9732653791130186, + "grad_norm": 0.1428449126046639, + "learning_rate": 3.7466350906522065e-07, + "loss": 0.6458, + "step": 10885 + }, + { + "epoch": 0.9733547925608012, + "grad_norm": 0.15242969036391343, + "learning_rate": 3.721631424874694e-07, + "loss": 0.6122, + "step": 10886 + }, + { + "epoch": 0.9734442060085837, + "grad_norm": 0.1349284038750613, + "learning_rate": 3.696711315760659e-07, + "loss": 0.5898, + "step": 10887 + }, + { + "epoch": 0.9735336194563662, + "grad_norm": 0.1445604848409513, + "learning_rate": 3.671874765400207e-07, + "loss": 0.6369, + "step": 10888 + }, + { + "epoch": 0.9736230329041488, + "grad_norm": 0.12886727912235793, + "learning_rate": 3.6471217758763387e-07, + "loss": 0.5858, + "step": 10889 + }, + { + "epoch": 0.9737124463519313, + "grad_norm": 0.1301705052258413, + "learning_rate": 3.6224523492651706e-07, + "loss": 0.5853, + "step": 10890 + }, + { + "epoch": 0.9738018597997139, + "grad_norm": 0.1256679255850902, + "learning_rate": 3.5978664876354926e-07, + "loss": 0.561, + "step": 10891 + }, + { + "epoch": 0.9738912732474965, + "grad_norm": 0.14624826950909603, + "learning_rate": 3.573364193049433e-07, + "loss": 0.6651, + "step": 10892 + }, + { + "epoch": 0.9739806866952789, + "grad_norm": 0.13895427045896952, + "learning_rate": 3.5489454675620147e-07, + "loss": 0.6405, + "step": 10893 + }, + { + "epoch": 0.9740701001430615, + "grad_norm": 0.14221560393096946, + "learning_rate": 3.524610313221155e-07, + "loss": 0.657, + "step": 10894 + }, + { + "epoch": 0.9741595135908441, + "grad_norm": 0.1291621580798373, + "learning_rate": 3.5003587320676655e-07, + "loss": 0.6314, + "step": 10895 + }, + { + "epoch": 0.9742489270386266, + "grad_norm": 0.12861567048442488, + "learning_rate": 3.4761907261356976e-07, + "loss": 0.6197, + "step": 10896 + }, + { + "epoch": 0.9743383404864091, + "grad_norm": 0.13798600089264587, + "learning_rate": 3.4521062974520737e-07, + "loss": 0.6164, + "step": 10897 + }, + { + "epoch": 0.9744277539341917, + "grad_norm": 0.12963826087505612, + "learning_rate": 3.4281054480368445e-07, + "loss": 0.5839, + "step": 10898 + }, + { + "epoch": 0.9745171673819742, + "grad_norm": 0.12828116925044522, + "learning_rate": 3.404188179902845e-07, + "loss": 0.5728, + "step": 10899 + }, + { + "epoch": 0.9746065808297568, + "grad_norm": 0.1508773989426875, + "learning_rate": 3.380354495055915e-07, + "loss": 0.5955, + "step": 10900 + }, + { + "epoch": 0.9746959942775394, + "grad_norm": 0.14636940356748246, + "learning_rate": 3.356604395495122e-07, + "loss": 0.6593, + "step": 10901 + }, + { + "epoch": 0.9747854077253219, + "grad_norm": 0.12945490270643828, + "learning_rate": 3.332937883212206e-07, + "loss": 0.6129, + "step": 10902 + }, + { + "epoch": 0.9748748211731044, + "grad_norm": 0.13199765724656726, + "learning_rate": 3.3093549601921345e-07, + "loss": 0.6072, + "step": 10903 + }, + { + "epoch": 0.974964234620887, + "grad_norm": 0.13707522792740662, + "learning_rate": 3.2858556284127704e-07, + "loss": 0.6043, + "step": 10904 + }, + { + "epoch": 0.9750536480686696, + "grad_norm": 0.14245542379702827, + "learning_rate": 3.2624398898449814e-07, + "loss": 0.583, + "step": 10905 + }, + { + "epoch": 0.975143061516452, + "grad_norm": 0.15620824396504338, + "learning_rate": 3.239107746452641e-07, + "loss": 0.6538, + "step": 10906 + }, + { + "epoch": 0.9752324749642346, + "grad_norm": 0.14319068489559134, + "learning_rate": 3.215859200192517e-07, + "loss": 0.6509, + "step": 10907 + }, + { + "epoch": 0.9753218884120172, + "grad_norm": 0.12905049266798055, + "learning_rate": 3.1926942530144945e-07, + "loss": 0.6047, + "step": 10908 + }, + { + "epoch": 0.9754113018597997, + "grad_norm": 0.12821367969481404, + "learning_rate": 3.1696129068613525e-07, + "loss": 0.6112, + "step": 10909 + }, + { + "epoch": 0.9755007153075823, + "grad_norm": 0.13190238658467845, + "learning_rate": 3.1466151636689865e-07, + "loss": 0.6376, + "step": 10910 + }, + { + "epoch": 0.9755901287553648, + "grad_norm": 0.1472412701603036, + "learning_rate": 3.1237010253659657e-07, + "loss": 0.6453, + "step": 10911 + }, + { + "epoch": 0.9756795422031473, + "grad_norm": 0.13541277257533255, + "learning_rate": 3.1008704938743084e-07, + "loss": 0.6249, + "step": 10912 + }, + { + "epoch": 0.9757689556509299, + "grad_norm": 0.13751563366634867, + "learning_rate": 3.078123571108704e-07, + "loss": 0.5899, + "step": 10913 + }, + { + "epoch": 0.9758583690987125, + "grad_norm": 0.1413129087261499, + "learning_rate": 3.05546025897685e-07, + "loss": 0.624, + "step": 10914 + }, + { + "epoch": 0.975947782546495, + "grad_norm": 0.13754898813626273, + "learning_rate": 3.0328805593795584e-07, + "loss": 0.62, + "step": 10915 + }, + { + "epoch": 0.9760371959942775, + "grad_norm": 0.14287873949582977, + "learning_rate": 3.010384474210537e-07, + "loss": 0.6315, + "step": 10916 + }, + { + "epoch": 0.9761266094420601, + "grad_norm": 0.14543411361867972, + "learning_rate": 2.987972005356499e-07, + "loss": 0.6169, + "step": 10917 + }, + { + "epoch": 0.9762160228898427, + "grad_norm": 0.14336834876482926, + "learning_rate": 2.965643154697162e-07, + "loss": 0.6374, + "step": 10918 + }, + { + "epoch": 0.9763054363376252, + "grad_norm": 0.1313159319666108, + "learning_rate": 2.943397924105251e-07, + "loss": 0.5621, + "step": 10919 + }, + { + "epoch": 0.9763948497854077, + "grad_norm": 0.14356759095525723, + "learning_rate": 2.921236315446385e-07, + "loss": 0.5884, + "step": 10920 + }, + { + "epoch": 0.9764842632331903, + "grad_norm": 0.12748661493129435, + "learning_rate": 2.899158330579299e-07, + "loss": 0.5788, + "step": 10921 + }, + { + "epoch": 0.9765736766809728, + "grad_norm": 0.14046566118924111, + "learning_rate": 2.877163971355623e-07, + "loss": 0.6339, + "step": 10922 + }, + { + "epoch": 0.9766630901287554, + "grad_norm": 0.1358682778508838, + "learning_rate": 2.8552532396198815e-07, + "loss": 0.6428, + "step": 10923 + }, + { + "epoch": 0.9767525035765379, + "grad_norm": 0.14153668349208792, + "learning_rate": 2.833426137209938e-07, + "loss": 0.6511, + "step": 10924 + }, + { + "epoch": 0.9768419170243204, + "grad_norm": 0.1373192429026649, + "learning_rate": 2.811682665956217e-07, + "loss": 0.6551, + "step": 10925 + }, + { + "epoch": 0.976931330472103, + "grad_norm": 0.1377241777795198, + "learning_rate": 2.7900228276823704e-07, + "loss": 0.6698, + "step": 10926 + }, + { + "epoch": 0.9770207439198856, + "grad_norm": 0.14974523287389419, + "learning_rate": 2.768446624204946e-07, + "loss": 0.6051, + "step": 10927 + }, + { + "epoch": 0.977110157367668, + "grad_norm": 0.17098781483340716, + "learning_rate": 2.746954057333606e-07, + "loss": 0.682, + "step": 10928 + }, + { + "epoch": 0.9771995708154506, + "grad_norm": 0.14667726885692764, + "learning_rate": 2.7255451288707987e-07, + "loss": 0.6086, + "step": 10929 + }, + { + "epoch": 0.9772889842632332, + "grad_norm": 0.1466177584092556, + "learning_rate": 2.704219840612199e-07, + "loss": 0.634, + "step": 10930 + }, + { + "epoch": 0.9773783977110158, + "grad_norm": 0.1438949872240827, + "learning_rate": 2.682978194346264e-07, + "loss": 0.6196, + "step": 10931 + }, + { + "epoch": 0.9774678111587983, + "grad_norm": 0.1391379953535412, + "learning_rate": 2.661820191854347e-07, + "loss": 0.6428, + "step": 10932 + }, + { + "epoch": 0.9775572246065808, + "grad_norm": 0.15041358915831315, + "learning_rate": 2.640745834911251e-07, + "loss": 0.6738, + "step": 10933 + }, + { + "epoch": 0.9776466380543634, + "grad_norm": 0.16129007280440594, + "learning_rate": 2.6197551252842287e-07, + "loss": 0.6279, + "step": 10934 + }, + { + "epoch": 0.9777360515021459, + "grad_norm": 0.15389832209715285, + "learning_rate": 2.598848064733761e-07, + "loss": 0.6503, + "step": 10935 + }, + { + "epoch": 0.9778254649499285, + "grad_norm": 0.15378843550495785, + "learning_rate": 2.5780246550134444e-07, + "loss": 0.6138, + "step": 10936 + }, + { + "epoch": 0.977914878397711, + "grad_norm": 0.14170439131403822, + "learning_rate": 2.5572848978695496e-07, + "loss": 0.6649, + "step": 10937 + }, + { + "epoch": 0.9780042918454935, + "grad_norm": 0.12705294430303662, + "learning_rate": 2.5366287950415737e-07, + "loss": 0.6175, + "step": 10938 + }, + { + "epoch": 0.9780937052932761, + "grad_norm": 0.14677354080742452, + "learning_rate": 2.516056348261908e-07, + "loss": 0.6586, + "step": 10939 + }, + { + "epoch": 0.9781831187410587, + "grad_norm": 0.1442821664405584, + "learning_rate": 2.495567559256062e-07, + "loss": 0.6367, + "step": 10940 + }, + { + "epoch": 0.9782725321888412, + "grad_norm": 0.16325855516793844, + "learning_rate": 2.475162429742106e-07, + "loss": 0.6421, + "step": 10941 + }, + { + "epoch": 0.9783619456366237, + "grad_norm": 0.14400958487133408, + "learning_rate": 2.45484096143167e-07, + "loss": 0.6532, + "step": 10942 + }, + { + "epoch": 0.9784513590844063, + "grad_norm": 0.14647448517411882, + "learning_rate": 2.434603156028947e-07, + "loss": 0.6568, + "step": 10943 + }, + { + "epoch": 0.9785407725321889, + "grad_norm": 0.14343991926088664, + "learning_rate": 2.414449015231357e-07, + "loss": 0.6044, + "step": 10944 + }, + { + "epoch": 0.9786301859799714, + "grad_norm": 0.13704485063370245, + "learning_rate": 2.394378540729214e-07, + "loss": 0.6402, + "step": 10945 + }, + { + "epoch": 0.9787195994277539, + "grad_norm": 0.1508169154595322, + "learning_rate": 2.3743917342056166e-07, + "loss": 0.6456, + "step": 10946 + }, + { + "epoch": 0.9788090128755365, + "grad_norm": 0.14301098241763635, + "learning_rate": 2.3544885973370012e-07, + "loss": 0.6219, + "step": 10947 + }, + { + "epoch": 0.978898426323319, + "grad_norm": 0.13473627841876537, + "learning_rate": 2.3346691317924775e-07, + "loss": 0.6279, + "step": 10948 + }, + { + "epoch": 0.9789878397711016, + "grad_norm": 0.13145142747595664, + "learning_rate": 2.314933339234493e-07, + "loss": 0.6346, + "step": 10949 + }, + { + "epoch": 0.9790772532188842, + "grad_norm": 0.1441432373939503, + "learning_rate": 2.2952812213181684e-07, + "loss": 0.6429, + "step": 10950 + }, + { + "epoch": 0.9791666666666666, + "grad_norm": 0.13653264329800818, + "learning_rate": 2.27571277969163e-07, + "loss": 0.6235, + "step": 10951 + }, + { + "epoch": 0.9792560801144492, + "grad_norm": 0.12002351021453565, + "learning_rate": 2.2562280159961203e-07, + "loss": 0.5486, + "step": 10952 + }, + { + "epoch": 0.9793454935622318, + "grad_norm": 0.12232601269744238, + "learning_rate": 2.2368269318657764e-07, + "loss": 0.5459, + "step": 10953 + }, + { + "epoch": 0.9794349070100143, + "grad_norm": 0.14497904878757023, + "learning_rate": 2.2175095289278524e-07, + "loss": 0.6485, + "step": 10954 + }, + { + "epoch": 0.9795243204577968, + "grad_norm": 0.13986142730445134, + "learning_rate": 2.1982758088022747e-07, + "loss": 0.5836, + "step": 10955 + }, + { + "epoch": 0.9796137339055794, + "grad_norm": 0.14423093900910153, + "learning_rate": 2.1791257731024194e-07, + "loss": 0.6427, + "step": 10956 + }, + { + "epoch": 0.979703147353362, + "grad_norm": 0.1255359859537519, + "learning_rate": 2.160059423434113e-07, + "loss": 0.6286, + "step": 10957 + }, + { + "epoch": 0.9797925608011445, + "grad_norm": 0.12297649130140494, + "learning_rate": 2.141076761396521e-07, + "loss": 0.6272, + "step": 10958 + }, + { + "epoch": 0.9798819742489271, + "grad_norm": 0.1311930074483231, + "learning_rate": 2.1221777885817028e-07, + "loss": 0.6104, + "step": 10959 + }, + { + "epoch": 0.9799713876967096, + "grad_norm": 0.14056244490371755, + "learning_rate": 2.1033625065747242e-07, + "loss": 0.6104, + "step": 10960 + }, + { + "epoch": 0.9800608011444921, + "grad_norm": 0.15847007978319946, + "learning_rate": 2.084630916953656e-07, + "loss": 0.6635, + "step": 10961 + }, + { + "epoch": 0.9801502145922747, + "grad_norm": 0.150639458487789, + "learning_rate": 2.0659830212893527e-07, + "loss": 0.6321, + "step": 10962 + }, + { + "epoch": 0.9802396280400573, + "grad_norm": 0.15747791551898174, + "learning_rate": 2.0474188211457856e-07, + "loss": 0.6212, + "step": 10963 + }, + { + "epoch": 0.9803290414878397, + "grad_norm": 0.14377149559917962, + "learning_rate": 2.0289383180801537e-07, + "loss": 0.5816, + "step": 10964 + }, + { + "epoch": 0.9804184549356223, + "grad_norm": 0.14477990931532178, + "learning_rate": 2.0105415136421058e-07, + "loss": 0.6404, + "step": 10965 + }, + { + "epoch": 0.9805078683834049, + "grad_norm": 0.15392336483074173, + "learning_rate": 1.9922284093746302e-07, + "loss": 0.593, + "step": 10966 + }, + { + "epoch": 0.9805972818311874, + "grad_norm": 0.1379016611223456, + "learning_rate": 1.9739990068137203e-07, + "loss": 0.6221, + "step": 10967 + }, + { + "epoch": 0.98068669527897, + "grad_norm": 0.1459793044367406, + "learning_rate": 1.9558533074882646e-07, + "loss": 0.5968, + "step": 10968 + }, + { + "epoch": 0.9807761087267525, + "grad_norm": 0.15059990952720645, + "learning_rate": 1.9377913129199344e-07, + "loss": 0.6356, + "step": 10969 + }, + { + "epoch": 0.980865522174535, + "grad_norm": 0.14995509566943208, + "learning_rate": 1.919813024623851e-07, + "loss": 0.6508, + "step": 10970 + }, + { + "epoch": 0.9809549356223176, + "grad_norm": 0.13239066722797796, + "learning_rate": 1.9019184441075865e-07, + "loss": 0.6047, + "step": 10971 + }, + { + "epoch": 0.9810443490701002, + "grad_norm": 0.12458278983124756, + "learning_rate": 1.8841075728719404e-07, + "loss": 0.6287, + "step": 10972 + }, + { + "epoch": 0.9811337625178826, + "grad_norm": 0.15857703850833915, + "learning_rate": 1.8663804124108286e-07, + "loss": 0.6254, + "step": 10973 + }, + { + "epoch": 0.9812231759656652, + "grad_norm": 0.13764088200357746, + "learning_rate": 1.848736964211062e-07, + "loss": 0.627, + "step": 10974 + }, + { + "epoch": 0.9813125894134478, + "grad_norm": 0.12130589550853661, + "learning_rate": 1.8311772297521234e-07, + "loss": 0.677, + "step": 10975 + }, + { + "epoch": 0.9814020028612304, + "grad_norm": 0.13851955697833748, + "learning_rate": 1.813701210506946e-07, + "loss": 0.6311, + "step": 10976 + }, + { + "epoch": 0.9814914163090128, + "grad_norm": 0.130283971993881, + "learning_rate": 1.7963089079411356e-07, + "loss": 0.5954, + "step": 10977 + }, + { + "epoch": 0.9815808297567954, + "grad_norm": 0.13958019962746362, + "learning_rate": 1.7790003235134133e-07, + "loss": 0.6082, + "step": 10978 + }, + { + "epoch": 0.981670243204578, + "grad_norm": 0.1554133351556136, + "learning_rate": 1.7617754586752855e-07, + "loss": 0.6329, + "step": 10979 + }, + { + "epoch": 0.9817596566523605, + "grad_norm": 0.14121203705952898, + "learning_rate": 1.744634314871485e-07, + "loss": 0.5968, + "step": 10980 + }, + { + "epoch": 0.9818490701001431, + "grad_norm": 0.12665744681896152, + "learning_rate": 1.7275768935397507e-07, + "loss": 0.5534, + "step": 10981 + }, + { + "epoch": 0.9819384835479256, + "grad_norm": 0.13366590340312445, + "learning_rate": 1.710603196110383e-07, + "loss": 0.6338, + "step": 10982 + }, + { + "epoch": 0.9820278969957081, + "grad_norm": 0.14643566324505375, + "learning_rate": 1.693713224007243e-07, + "loss": 0.6374, + "step": 10983 + }, + { + "epoch": 0.9821173104434907, + "grad_norm": 0.15350046877741003, + "learning_rate": 1.6769069786466418e-07, + "loss": 0.665, + "step": 10984 + }, + { + "epoch": 0.9822067238912733, + "grad_norm": 0.1455986161344369, + "learning_rate": 1.66018446143823e-07, + "loss": 0.686, + "step": 10985 + }, + { + "epoch": 0.9822961373390557, + "grad_norm": 0.15973173539240237, + "learning_rate": 1.6435456737843302e-07, + "loss": 0.6588, + "step": 10986 + }, + { + "epoch": 0.9823855507868383, + "grad_norm": 0.15970110840446874, + "learning_rate": 1.6269906170807148e-07, + "loss": 0.6515, + "step": 10987 + }, + { + "epoch": 0.9824749642346209, + "grad_norm": 0.12729366192337116, + "learning_rate": 1.6105192927154956e-07, + "loss": 0.6124, + "step": 10988 + }, + { + "epoch": 0.9825643776824035, + "grad_norm": 0.1475003322631453, + "learning_rate": 1.594131702070345e-07, + "loss": 0.6369, + "step": 10989 + }, + { + "epoch": 0.982653791130186, + "grad_norm": 0.13621527536160571, + "learning_rate": 1.5778278465197194e-07, + "loss": 0.6292, + "step": 10990 + }, + { + "epoch": 0.9827432045779685, + "grad_norm": 0.13265504511421747, + "learning_rate": 1.5616077274307473e-07, + "loss": 0.5941, + "step": 10991 + }, + { + "epoch": 0.9828326180257511, + "grad_norm": 0.15287028808408526, + "learning_rate": 1.545471346164007e-07, + "loss": 0.6184, + "step": 10992 + }, + { + "epoch": 0.9829220314735336, + "grad_norm": 0.14399897516433424, + "learning_rate": 1.5294187040726382e-07, + "loss": 0.6733, + "step": 10993 + }, + { + "epoch": 0.9830114449213162, + "grad_norm": 0.1397742816586139, + "learning_rate": 1.5134498025031196e-07, + "loss": 0.6588, + "step": 10994 + }, + { + "epoch": 0.9831008583690987, + "grad_norm": 0.12055834727283507, + "learning_rate": 1.4975646427948244e-07, + "loss": 0.6275, + "step": 10995 + }, + { + "epoch": 0.9831902718168812, + "grad_norm": 0.14729234095662894, + "learning_rate": 1.4817632262797976e-07, + "loss": 0.6451, + "step": 10996 + }, + { + "epoch": 0.9832796852646638, + "grad_norm": 0.12248725278641369, + "learning_rate": 1.4660455542833128e-07, + "loss": 0.5986, + "step": 10997 + }, + { + "epoch": 0.9833690987124464, + "grad_norm": 0.1416876217077464, + "learning_rate": 1.45041162812376e-07, + "loss": 0.5934, + "step": 10998 + }, + { + "epoch": 0.983458512160229, + "grad_norm": 0.1513597570943676, + "learning_rate": 1.4348614491123125e-07, + "loss": 0.642, + "step": 10999 + }, + { + "epoch": 0.9835479256080114, + "grad_norm": 0.1474172904318185, + "learning_rate": 1.419395018552927e-07, + "loss": 0.6142, + "step": 11000 + }, + { + "epoch": 0.983637339055794, + "grad_norm": 0.1352842911782199, + "learning_rate": 1.4040123377428993e-07, + "loss": 0.6097, + "step": 11001 + }, + { + "epoch": 0.9837267525035766, + "grad_norm": 0.13752097837608912, + "learning_rate": 1.3887134079724196e-07, + "loss": 0.6307, + "step": 11002 + }, + { + "epoch": 0.9838161659513591, + "grad_norm": 0.15334318915516298, + "learning_rate": 1.3734982305245724e-07, + "loss": 0.6214, + "step": 11003 + }, + { + "epoch": 0.9839055793991416, + "grad_norm": 0.12981844815474233, + "learning_rate": 1.3583668066753375e-07, + "loss": 0.5808, + "step": 11004 + }, + { + "epoch": 0.9839949928469242, + "grad_norm": 0.13488926873141552, + "learning_rate": 1.3433191376938103e-07, + "loss": 0.6518, + "step": 11005 + }, + { + "epoch": 0.9840844062947067, + "grad_norm": 0.1353027090130752, + "learning_rate": 1.3283552248420927e-07, + "loss": 0.6189, + "step": 11006 + }, + { + "epoch": 0.9841738197424893, + "grad_norm": 0.1322584439165575, + "learning_rate": 1.3134750693751806e-07, + "loss": 0.6428, + "step": 11007 + }, + { + "epoch": 0.9842632331902719, + "grad_norm": 0.14474888104353406, + "learning_rate": 1.298678672540854e-07, + "loss": 0.6296, + "step": 11008 + }, + { + "epoch": 0.9843526466380543, + "grad_norm": 0.15387243757343413, + "learning_rate": 1.2839660355803417e-07, + "loss": 0.6113, + "step": 11009 + }, + { + "epoch": 0.9844420600858369, + "grad_norm": 0.14200327887467712, + "learning_rate": 1.2693371597273241e-07, + "loss": 0.649, + "step": 11010 + }, + { + "epoch": 0.9845314735336195, + "grad_norm": 0.13452873000663665, + "learning_rate": 1.2547920462089302e-07, + "loss": 0.61, + "step": 11011 + }, + { + "epoch": 0.984620886981402, + "grad_norm": 0.1408483424152301, + "learning_rate": 1.2403306962449624e-07, + "loss": 0.6392, + "step": 11012 + }, + { + "epoch": 0.9847103004291845, + "grad_norm": 0.13276535692250227, + "learning_rate": 1.225953111048228e-07, + "loss": 0.6105, + "step": 11013 + }, + { + "epoch": 0.9847997138769671, + "grad_norm": 0.12985795686921728, + "learning_rate": 1.2116592918246516e-07, + "loss": 0.607, + "step": 11014 + }, + { + "epoch": 0.9848891273247496, + "grad_norm": 0.13135264974886693, + "learning_rate": 1.197449239772941e-07, + "loss": 0.6258, + "step": 11015 + }, + { + "epoch": 0.9849785407725322, + "grad_norm": 0.15945761841017464, + "learning_rate": 1.1833229560848092e-07, + "loss": 0.6336, + "step": 11016 + }, + { + "epoch": 0.9850679542203148, + "grad_norm": 0.1403699027834479, + "learning_rate": 1.1692804419451975e-07, + "loss": 0.6463, + "step": 11017 + }, + { + "epoch": 0.9851573676680973, + "grad_norm": 0.14083081045409954, + "learning_rate": 1.1553216985318305e-07, + "loss": 0.605, + "step": 11018 + }, + { + "epoch": 0.9852467811158798, + "grad_norm": 0.14903696924355214, + "learning_rate": 1.1414467270152163e-07, + "loss": 0.637, + "step": 11019 + }, + { + "epoch": 0.9853361945636624, + "grad_norm": 0.14910249014930835, + "learning_rate": 1.1276555285592017e-07, + "loss": 0.6204, + "step": 11020 + }, + { + "epoch": 0.985425608011445, + "grad_norm": 0.14136176792356786, + "learning_rate": 1.113948104320417e-07, + "loss": 0.6151, + "step": 11021 + }, + { + "epoch": 0.9855150214592274, + "grad_norm": 0.12568387496363487, + "learning_rate": 1.1003244554483871e-07, + "loss": 0.6189, + "step": 11022 + }, + { + "epoch": 0.98560443490701, + "grad_norm": 0.1503645311541827, + "learning_rate": 1.0867845830858647e-07, + "loss": 0.6673, + "step": 11023 + }, + { + "epoch": 0.9856938483547926, + "grad_norm": 0.12648280311984128, + "learning_rate": 1.0733284883682749e-07, + "loss": 0.602, + "step": 11024 + }, + { + "epoch": 0.9857832618025751, + "grad_norm": 0.13763835129779692, + "learning_rate": 1.0599561724242702e-07, + "loss": 0.6581, + "step": 11025 + }, + { + "epoch": 0.9858726752503576, + "grad_norm": 0.1386786252707929, + "learning_rate": 1.046667636375287e-07, + "loss": 0.6351, + "step": 11026 + }, + { + "epoch": 0.9859620886981402, + "grad_norm": 0.13053205717057803, + "learning_rate": 1.0334628813358782e-07, + "loss": 0.6013, + "step": 11027 + }, + { + "epoch": 0.9860515021459227, + "grad_norm": 0.1368874196835543, + "learning_rate": 1.0203419084134913e-07, + "loss": 0.6099, + "step": 11028 + }, + { + "epoch": 0.9861409155937053, + "grad_norm": 0.1343391325189306, + "learning_rate": 1.0073047187085794e-07, + "loss": 0.6243, + "step": 11029 + }, + { + "epoch": 0.9862303290414879, + "grad_norm": 0.13724404623287156, + "learning_rate": 9.9435131331449e-08, + "loss": 0.6055, + "step": 11030 + }, + { + "epoch": 0.9863197424892703, + "grad_norm": 0.12595195262602588, + "learning_rate": 9.814816933176874e-08, + "loss": 0.5907, + "step": 11031 + }, + { + "epoch": 0.9864091559370529, + "grad_norm": 0.13919998187886115, + "learning_rate": 9.686958597975304e-08, + "loss": 0.6482, + "step": 11032 + }, + { + "epoch": 0.9864985693848355, + "grad_norm": 0.14324584787064445, + "learning_rate": 9.559938138263836e-08, + "loss": 0.6145, + "step": 11033 + }, + { + "epoch": 0.9865879828326181, + "grad_norm": 0.14313538115175964, + "learning_rate": 9.433755564693947e-08, + "loss": 0.6254, + "step": 11034 + }, + { + "epoch": 0.9866773962804005, + "grad_norm": 0.1514850285720087, + "learning_rate": 9.308410887849394e-08, + "loss": 0.6475, + "step": 11035 + }, + { + "epoch": 0.9867668097281831, + "grad_norm": 0.1380578887660379, + "learning_rate": 9.18390411824288e-08, + "loss": 0.5743, + "step": 11036 + }, + { + "epoch": 0.9868562231759657, + "grad_norm": 0.14240785803730513, + "learning_rate": 9.060235266317163e-08, + "loss": 0.619, + "step": 11037 + }, + { + "epoch": 0.9869456366237482, + "grad_norm": 0.12836971029866467, + "learning_rate": 8.937404342442834e-08, + "loss": 0.631, + "step": 11038 + }, + { + "epoch": 0.9870350500715308, + "grad_norm": 0.1333568641345491, + "learning_rate": 8.815411356922764e-08, + "loss": 0.631, + "step": 11039 + }, + { + "epoch": 0.9871244635193133, + "grad_norm": 0.13324819359496964, + "learning_rate": 8.694256319987659e-08, + "loss": 0.611, + "step": 11040 + }, + { + "epoch": 0.9872138769670958, + "grad_norm": 0.15981001516379592, + "learning_rate": 8.573939241798278e-08, + "loss": 0.6278, + "step": 11041 + }, + { + "epoch": 0.9873032904148784, + "grad_norm": 0.12675135842549598, + "learning_rate": 8.454460132446552e-08, + "loss": 0.6363, + "step": 11042 + }, + { + "epoch": 0.987392703862661, + "grad_norm": 0.13567373902304083, + "learning_rate": 8.335819001952239e-08, + "loss": 0.6604, + "step": 11043 + }, + { + "epoch": 0.9874821173104434, + "grad_norm": 0.14676345348766562, + "learning_rate": 8.21801586026627e-08, + "loss": 0.6019, + "step": 11044 + }, + { + "epoch": 0.987571530758226, + "grad_norm": 0.15349146868971528, + "learning_rate": 8.101050717267411e-08, + "loss": 0.6607, + "step": 11045 + }, + { + "epoch": 0.9876609442060086, + "grad_norm": 0.14055814265613303, + "learning_rate": 7.984923582767812e-08, + "loss": 0.6353, + "step": 11046 + }, + { + "epoch": 0.9877503576537912, + "grad_norm": 0.13071765228425036, + "learning_rate": 7.869634466504128e-08, + "loss": 0.6136, + "step": 11047 + }, + { + "epoch": 0.9878397711015737, + "grad_norm": 0.14255295053915593, + "learning_rate": 7.755183378147512e-08, + "loss": 0.6638, + "step": 11048 + }, + { + "epoch": 0.9879291845493562, + "grad_norm": 0.14246053194136973, + "learning_rate": 7.641570327295844e-08, + "loss": 0.6119, + "step": 11049 + }, + { + "epoch": 0.9880185979971388, + "grad_norm": 0.14239372345524423, + "learning_rate": 7.528795323477055e-08, + "loss": 0.6323, + "step": 11050 + }, + { + "epoch": 0.9881080114449213, + "grad_norm": 0.138603642999248, + "learning_rate": 7.416858376151359e-08, + "loss": 0.6238, + "step": 11051 + }, + { + "epoch": 0.9881974248927039, + "grad_norm": 0.13986737997289012, + "learning_rate": 7.305759494705689e-08, + "loss": 0.62, + "step": 11052 + }, + { + "epoch": 0.9882868383404864, + "grad_norm": 0.13820442072789307, + "learning_rate": 7.195498688458147e-08, + "loss": 0.6523, + "step": 11053 + }, + { + "epoch": 0.9883762517882689, + "grad_norm": 0.13694622168631446, + "learning_rate": 7.08607596665467e-08, + "loss": 0.5897, + "step": 11054 + }, + { + "epoch": 0.9884656652360515, + "grad_norm": 0.13940957475162216, + "learning_rate": 6.977491338474585e-08, + "loss": 0.6526, + "step": 11055 + }, + { + "epoch": 0.9885550786838341, + "grad_norm": 0.14439332945996955, + "learning_rate": 6.869744813023937e-08, + "loss": 0.5864, + "step": 11056 + }, + { + "epoch": 0.9886444921316166, + "grad_norm": 0.15762872291886934, + "learning_rate": 6.762836399338834e-08, + "loss": 0.6677, + "step": 11057 + }, + { + "epoch": 0.9887339055793991, + "grad_norm": 0.13482489288597807, + "learning_rate": 6.656766106385436e-08, + "loss": 0.6275, + "step": 11058 + }, + { + "epoch": 0.9888233190271817, + "grad_norm": 0.13056362174786953, + "learning_rate": 6.551533943061072e-08, + "loss": 0.6425, + "step": 11059 + }, + { + "epoch": 0.9889127324749643, + "grad_norm": 0.13452954561997812, + "learning_rate": 6.447139918189793e-08, + "loss": 0.6549, + "step": 11060 + }, + { + "epoch": 0.9890021459227468, + "grad_norm": 0.14479760998114333, + "learning_rate": 6.343584040527927e-08, + "loss": 0.6109, + "step": 11061 + }, + { + "epoch": 0.9890915593705293, + "grad_norm": 0.14533543130567875, + "learning_rate": 6.240866318760752e-08, + "loss": 0.6081, + "step": 11062 + }, + { + "epoch": 0.9891809728183119, + "grad_norm": 0.12833459078624976, + "learning_rate": 6.138986761502486e-08, + "loss": 0.6268, + "step": 11063 + }, + { + "epoch": 0.9892703862660944, + "grad_norm": 0.13674477761160492, + "learning_rate": 6.037945377297405e-08, + "loss": 0.6655, + "step": 11064 + }, + { + "epoch": 0.989359799713877, + "grad_norm": 0.13247768312743788, + "learning_rate": 5.9377421746209525e-08, + "loss": 0.6073, + "step": 11065 + }, + { + "epoch": 0.9894492131616596, + "grad_norm": 0.13431824786821106, + "learning_rate": 5.838377161875297e-08, + "loss": 0.6153, + "step": 11066 + }, + { + "epoch": 0.989538626609442, + "grad_norm": 0.14678224226145814, + "learning_rate": 5.739850347395992e-08, + "loss": 0.6478, + "step": 11067 + }, + { + "epoch": 0.9896280400572246, + "grad_norm": 0.16035621402779565, + "learning_rate": 5.642161739445317e-08, + "loss": 0.6391, + "step": 11068 + }, + { + "epoch": 0.9897174535050072, + "grad_norm": 0.14940289952916397, + "learning_rate": 5.545311346215609e-08, + "loss": 0.6206, + "step": 11069 + }, + { + "epoch": 0.9898068669527897, + "grad_norm": 0.15013991097564922, + "learning_rate": 5.449299175831479e-08, + "loss": 0.6221, + "step": 11070 + }, + { + "epoch": 0.9898962804005722, + "grad_norm": 0.13341934664510485, + "learning_rate": 5.354125236343155e-08, + "loss": 0.6191, + "step": 11071 + }, + { + "epoch": 0.9899856938483548, + "grad_norm": 0.1534466290890448, + "learning_rate": 5.25978953573536e-08, + "loss": 0.6012, + "step": 11072 + }, + { + "epoch": 0.9900751072961373, + "grad_norm": 0.15278601029828584, + "learning_rate": 5.166292081917323e-08, + "loss": 0.666, + "step": 11073 + }, + { + "epoch": 0.9901645207439199, + "grad_norm": 0.16551727144004486, + "learning_rate": 5.0736328827316605e-08, + "loss": 0.6262, + "step": 11074 + }, + { + "epoch": 0.9902539341917024, + "grad_norm": 0.14097920686468676, + "learning_rate": 4.9818119459499325e-08, + "loss": 0.6146, + "step": 11075 + }, + { + "epoch": 0.990343347639485, + "grad_norm": 0.13790385838919408, + "learning_rate": 4.890829279272646e-08, + "loss": 0.6228, + "step": 11076 + }, + { + "epoch": 0.9904327610872675, + "grad_norm": 0.13484826364234925, + "learning_rate": 4.800684890330365e-08, + "loss": 0.5723, + "step": 11077 + }, + { + "epoch": 0.9905221745350501, + "grad_norm": 0.139799623641682, + "learning_rate": 4.711378786683707e-08, + "loss": 0.6473, + "step": 11078 + }, + { + "epoch": 0.9906115879828327, + "grad_norm": 0.14695425592846686, + "learning_rate": 4.6229109758222365e-08, + "loss": 0.6429, + "step": 11079 + }, + { + "epoch": 0.9907010014306151, + "grad_norm": 0.13961793079720125, + "learning_rate": 4.535281465165575e-08, + "loss": 0.651, + "step": 11080 + }, + { + "epoch": 0.9907904148783977, + "grad_norm": 0.15511007611524716, + "learning_rate": 4.448490262064509e-08, + "loss": 0.6247, + "step": 11081 + }, + { + "epoch": 0.9908798283261803, + "grad_norm": 0.14123929490561252, + "learning_rate": 4.362537373795439e-08, + "loss": 0.6264, + "step": 11082 + }, + { + "epoch": 0.9909692417739628, + "grad_norm": 0.1512411742926425, + "learning_rate": 4.277422807570375e-08, + "loss": 0.6555, + "step": 11083 + }, + { + "epoch": 0.9910586552217453, + "grad_norm": 0.14010278378118687, + "learning_rate": 4.1931465705247195e-08, + "loss": 0.6224, + "step": 11084 + }, + { + "epoch": 0.9911480686695279, + "grad_norm": 0.14674214655703544, + "learning_rate": 4.109708669728374e-08, + "loss": 0.6565, + "step": 11085 + }, + { + "epoch": 0.9912374821173104, + "grad_norm": 0.13507641018395972, + "learning_rate": 4.027109112179073e-08, + "loss": 0.6197, + "step": 11086 + }, + { + "epoch": 0.991326895565093, + "grad_norm": 0.159099111313133, + "learning_rate": 3.945347904803498e-08, + "loss": 0.6536, + "step": 11087 + }, + { + "epoch": 0.9914163090128756, + "grad_norm": 0.14683095727295262, + "learning_rate": 3.8644250544594975e-08, + "loss": 0.6358, + "step": 11088 + }, + { + "epoch": 0.991505722460658, + "grad_norm": 0.16301149120894076, + "learning_rate": 3.784340567934974e-08, + "loss": 0.6806, + "step": 11089 + }, + { + "epoch": 0.9915951359084406, + "grad_norm": 0.13619424919624104, + "learning_rate": 3.7050944519445576e-08, + "loss": 0.6569, + "step": 11090 + }, + { + "epoch": 0.9916845493562232, + "grad_norm": 0.134064275330277, + "learning_rate": 3.626686713135152e-08, + "loss": 0.6152, + "step": 11091 + }, + { + "epoch": 0.9917739628040058, + "grad_norm": 0.1439177979297162, + "learning_rate": 3.54911735808372e-08, + "loss": 0.6236, + "step": 11092 + }, + { + "epoch": 0.9918633762517882, + "grad_norm": 0.15260062425335874, + "learning_rate": 3.472386393293947e-08, + "loss": 0.651, + "step": 11093 + }, + { + "epoch": 0.9919527896995708, + "grad_norm": 0.1358363088712499, + "learning_rate": 3.3964938252040166e-08, + "loss": 0.6229, + "step": 11094 + }, + { + "epoch": 0.9920422031473534, + "grad_norm": 0.1442853188937492, + "learning_rate": 3.3214396601766176e-08, + "loss": 0.6299, + "step": 11095 + }, + { + "epoch": 0.9921316165951359, + "grad_norm": 0.13756288965082258, + "learning_rate": 3.247223904506713e-08, + "loss": 0.6338, + "step": 11096 + }, + { + "epoch": 0.9922210300429185, + "grad_norm": 0.13759593445143073, + "learning_rate": 3.173846564419325e-08, + "loss": 0.626, + "step": 11097 + }, + { + "epoch": 0.992310443490701, + "grad_norm": 0.15612741374945577, + "learning_rate": 3.1013076460684186e-08, + "loss": 0.6279, + "step": 11098 + }, + { + "epoch": 0.9923998569384835, + "grad_norm": 0.15784008622163856, + "learning_rate": 3.0296071555369065e-08, + "loss": 0.656, + "step": 11099 + }, + { + "epoch": 0.9924892703862661, + "grad_norm": 0.13526965040854289, + "learning_rate": 2.9587450988399768e-08, + "loss": 0.5748, + "step": 11100 + }, + { + "epoch": 0.9925786838340487, + "grad_norm": 0.13672768479745478, + "learning_rate": 2.888721481919543e-08, + "loss": 0.6284, + "step": 11101 + }, + { + "epoch": 0.9926680972818311, + "grad_norm": 0.13338809376101307, + "learning_rate": 2.819536310648685e-08, + "loss": 0.6261, + "step": 11102 + }, + { + "epoch": 0.9927575107296137, + "grad_norm": 0.14079715268091167, + "learning_rate": 2.7511895908294282e-08, + "loss": 0.6222, + "step": 11103 + }, + { + "epoch": 0.9928469241773963, + "grad_norm": 0.13645873845540077, + "learning_rate": 2.6836813281938543e-08, + "loss": 0.627, + "step": 11104 + }, + { + "epoch": 0.9929363376251789, + "grad_norm": 0.12839142656109423, + "learning_rate": 2.617011528405211e-08, + "loss": 0.6413, + "step": 11105 + }, + { + "epoch": 0.9930257510729614, + "grad_norm": 0.14175383127517902, + "learning_rate": 2.551180197053471e-08, + "loss": 0.5794, + "step": 11106 + }, + { + "epoch": 0.9931151645207439, + "grad_norm": 0.14806770298909605, + "learning_rate": 2.4861873396608838e-08, + "loss": 0.6599, + "step": 11107 + }, + { + "epoch": 0.9932045779685265, + "grad_norm": 0.14932124012548684, + "learning_rate": 2.422032961677534e-08, + "loss": 0.6445, + "step": 11108 + }, + { + "epoch": 0.993293991416309, + "grad_norm": 0.14905814558312871, + "learning_rate": 2.3587170684835623e-08, + "loss": 0.6497, + "step": 11109 + }, + { + "epoch": 0.9933834048640916, + "grad_norm": 0.15082195813834753, + "learning_rate": 2.2962396653913864e-08, + "loss": 0.6582, + "step": 11110 + }, + { + "epoch": 0.9934728183118741, + "grad_norm": 0.14814696212711873, + "learning_rate": 2.234600757637928e-08, + "loss": 0.6192, + "step": 11111 + }, + { + "epoch": 0.9935622317596566, + "grad_norm": 0.13921055095114268, + "learning_rate": 2.1738003503946057e-08, + "loss": 0.6305, + "step": 11112 + }, + { + "epoch": 0.9936516452074392, + "grad_norm": 0.15166055514244783, + "learning_rate": 2.1138384487606742e-08, + "loss": 0.616, + "step": 11113 + }, + { + "epoch": 0.9937410586552218, + "grad_norm": 0.14350176983390192, + "learning_rate": 2.054715057765444e-08, + "loss": 0.652, + "step": 11114 + }, + { + "epoch": 0.9938304721030042, + "grad_norm": 0.14978100019495327, + "learning_rate": 1.9964301823660604e-08, + "loss": 0.6363, + "step": 11115 + }, + { + "epoch": 0.9939198855507868, + "grad_norm": 0.1455188929200548, + "learning_rate": 1.9389838274508355e-08, + "loss": 0.6068, + "step": 11116 + }, + { + "epoch": 0.9940092989985694, + "grad_norm": 0.13138914030148563, + "learning_rate": 1.8823759978392474e-08, + "loss": 0.6281, + "step": 11117 + }, + { + "epoch": 0.994098712446352, + "grad_norm": 0.14122290939417653, + "learning_rate": 1.8266066982774997e-08, + "loss": 0.6301, + "step": 11118 + }, + { + "epoch": 0.9941881258941345, + "grad_norm": 0.12513305731790816, + "learning_rate": 1.7716759334440724e-08, + "loss": 0.5732, + "step": 11119 + }, + { + "epoch": 0.994277539341917, + "grad_norm": 0.14930123029523548, + "learning_rate": 1.7175837079452804e-08, + "loss": 0.6329, + "step": 11120 + }, + { + "epoch": 0.9943669527896996, + "grad_norm": 0.1401996277632383, + "learning_rate": 1.6643300263186056e-08, + "loss": 0.6397, + "step": 11121 + }, + { + "epoch": 0.9944563662374821, + "grad_norm": 0.16188827485508434, + "learning_rate": 1.6119148930282546e-08, + "loss": 0.6167, + "step": 11122 + }, + { + "epoch": 0.9945457796852647, + "grad_norm": 0.15520271832599186, + "learning_rate": 1.560338312472931e-08, + "loss": 0.5582, + "step": 11123 + }, + { + "epoch": 0.9946351931330472, + "grad_norm": 0.13024461657324254, + "learning_rate": 1.5096002889758433e-08, + "loss": 0.5685, + "step": 11124 + }, + { + "epoch": 0.9947246065808297, + "grad_norm": 0.13733280266027492, + "learning_rate": 1.4597008267935863e-08, + "loss": 0.6163, + "step": 11125 + }, + { + "epoch": 0.9948140200286123, + "grad_norm": 0.13195066675172015, + "learning_rate": 1.4106399301117012e-08, + "loss": 0.6186, + "step": 11126 + }, + { + "epoch": 0.9949034334763949, + "grad_norm": 0.13480142506374482, + "learning_rate": 1.3624176030435642e-08, + "loss": 0.6253, + "step": 11127 + }, + { + "epoch": 0.9949928469241774, + "grad_norm": 0.13618183264248804, + "learning_rate": 1.315033849634828e-08, + "loss": 0.6328, + "step": 11128 + }, + { + "epoch": 0.9950822603719599, + "grad_norm": 0.13559038144055138, + "learning_rate": 1.2684886738589808e-08, + "loss": 0.6147, + "step": 11129 + }, + { + "epoch": 0.9951716738197425, + "grad_norm": 0.13573104886425244, + "learning_rate": 1.2227820796184564e-08, + "loss": 0.6259, + "step": 11130 + }, + { + "epoch": 0.995261087267525, + "grad_norm": 0.14532576052265697, + "learning_rate": 1.1779140707490755e-08, + "loss": 0.6089, + "step": 11131 + }, + { + "epoch": 0.9953505007153076, + "grad_norm": 0.14740360186924886, + "learning_rate": 1.1338846510111633e-08, + "loss": 0.6512, + "step": 11132 + }, + { + "epoch": 0.9954399141630901, + "grad_norm": 0.1562581113905965, + "learning_rate": 1.0906938240995423e-08, + "loss": 0.6643, + "step": 11133 + }, + { + "epoch": 0.9955293276108726, + "grad_norm": 0.16128239323453805, + "learning_rate": 1.04834159363576e-08, + "loss": 0.6846, + "step": 11134 + }, + { + "epoch": 0.9956187410586552, + "grad_norm": 0.14905053987709882, + "learning_rate": 1.0068279631725297e-08, + "loss": 0.6316, + "step": 11135 + }, + { + "epoch": 0.9957081545064378, + "grad_norm": 0.13143489119120583, + "learning_rate": 9.661529361892907e-09, + "loss": 0.6192, + "step": 11136 + }, + { + "epoch": 0.9957975679542204, + "grad_norm": 0.15502627011889877, + "learning_rate": 9.263165160999787e-09, + "loss": 0.6802, + "step": 11137 + }, + { + "epoch": 0.9958869814020028, + "grad_norm": 0.13070511918067315, + "learning_rate": 8.873187062452548e-09, + "loss": 0.625, + "step": 11138 + }, + { + "epoch": 0.9959763948497854, + "grad_norm": 0.14774349670351397, + "learning_rate": 8.491595098947258e-09, + "loss": 0.6447, + "step": 11139 + }, + { + "epoch": 0.996065808297568, + "grad_norm": 0.15005585249181325, + "learning_rate": 8.118389302491647e-09, + "loss": 0.6663, + "step": 11140 + }, + { + "epoch": 0.9961552217453505, + "grad_norm": 0.14096721254500363, + "learning_rate": 7.753569704382902e-09, + "loss": 0.5814, + "step": 11141 + }, + { + "epoch": 0.996244635193133, + "grad_norm": 0.11673291599721247, + "learning_rate": 7.397136335229871e-09, + "loss": 0.6173, + "step": 11142 + }, + { + "epoch": 0.9963340486409156, + "grad_norm": 0.14275596273477761, + "learning_rate": 7.049089224919758e-09, + "loss": 0.6353, + "step": 11143 + }, + { + "epoch": 0.9964234620886981, + "grad_norm": 0.13169428245646578, + "learning_rate": 6.709428402629225e-09, + "loss": 0.6261, + "step": 11144 + }, + { + "epoch": 0.9965128755364807, + "grad_norm": 0.15346931686678889, + "learning_rate": 6.378153896868799e-09, + "loss": 0.6157, + "step": 11145 + }, + { + "epoch": 0.9966022889842633, + "grad_norm": 0.13334600947428904, + "learning_rate": 6.055265735405158e-09, + "loss": 0.6652, + "step": 11146 + }, + { + "epoch": 0.9966917024320457, + "grad_norm": 0.14585066688107184, + "learning_rate": 5.740763945327743e-09, + "loss": 0.6398, + "step": 11147 + }, + { + "epoch": 0.9967811158798283, + "grad_norm": 0.1581844405043901, + "learning_rate": 5.434648553015453e-09, + "loss": 0.5884, + "step": 11148 + }, + { + "epoch": 0.9968705293276109, + "grad_norm": 0.15438234049099286, + "learning_rate": 5.136919584125544e-09, + "loss": 0.6257, + "step": 11149 + }, + { + "epoch": 0.9969599427753935, + "grad_norm": 0.12120290493952918, + "learning_rate": 4.847577063649133e-09, + "loss": 0.6149, + "step": 11150 + }, + { + "epoch": 0.9970493562231759, + "grad_norm": 0.13321177912878313, + "learning_rate": 4.566621015833495e-09, + "loss": 0.625, + "step": 11151 + }, + { + "epoch": 0.9971387696709585, + "grad_norm": 0.14366194791316714, + "learning_rate": 4.2940514642597626e-09, + "loss": 0.6033, + "step": 11152 + }, + { + "epoch": 0.9972281831187411, + "grad_norm": 0.14433849616709904, + "learning_rate": 4.029868431765227e-09, + "loss": 0.6429, + "step": 11153 + }, + { + "epoch": 0.9973175965665236, + "grad_norm": 0.13317389805506602, + "learning_rate": 3.774071940532142e-09, + "loss": 0.5826, + "step": 11154 + }, + { + "epoch": 0.9974070100143062, + "grad_norm": 0.16661739733787356, + "learning_rate": 3.526662012010018e-09, + "loss": 0.5957, + "step": 11155 + }, + { + "epoch": 0.9974964234620887, + "grad_norm": 0.1387639707822905, + "learning_rate": 3.2876386669267177e-09, + "loss": 0.6324, + "step": 11156 + }, + { + "epoch": 0.9975858369098712, + "grad_norm": 0.13647878438622368, + "learning_rate": 3.057001925355074e-09, + "loss": 0.6309, + "step": 11157 + }, + { + "epoch": 0.9976752503576538, + "grad_norm": 0.12992268979946228, + "learning_rate": 2.8347518066129675e-09, + "loss": 0.6096, + "step": 11158 + }, + { + "epoch": 0.9977646638054364, + "grad_norm": 0.13493939285332018, + "learning_rate": 2.620888329363247e-09, + "loss": 0.6227, + "step": 11159 + }, + { + "epoch": 0.9978540772532188, + "grad_norm": 0.14962545237051078, + "learning_rate": 2.4154115115360144e-09, + "loss": 0.6526, + "step": 11160 + }, + { + "epoch": 0.9979434907010014, + "grad_norm": 0.14068555784123588, + "learning_rate": 2.218321370361931e-09, + "loss": 0.649, + "step": 11161 + }, + { + "epoch": 0.998032904148784, + "grad_norm": 0.14224286812554895, + "learning_rate": 2.0296179223722176e-09, + "loss": 0.6212, + "step": 11162 + }, + { + "epoch": 0.9981223175965666, + "grad_norm": 0.1287999772360952, + "learning_rate": 1.8493011833875529e-09, + "loss": 0.6134, + "step": 11163 + }, + { + "epoch": 0.998211731044349, + "grad_norm": 0.1401461542702303, + "learning_rate": 1.6773711685291738e-09, + "loss": 0.6505, + "step": 11164 + }, + { + "epoch": 0.9983011444921316, + "grad_norm": 0.1385618683721195, + "learning_rate": 1.5138278922299797e-09, + "loss": 0.6186, + "step": 11165 + }, + { + "epoch": 0.9983905579399142, + "grad_norm": 0.13293551721967828, + "learning_rate": 1.3586713681901232e-09, + "loss": 0.5941, + "step": 11166 + }, + { + "epoch": 0.9984799713876967, + "grad_norm": 0.16392920465647726, + "learning_rate": 1.211901609443622e-09, + "loss": 0.6651, + "step": 11167 + }, + { + "epoch": 0.9985693848354793, + "grad_norm": 0.1331595655382339, + "learning_rate": 1.073518628269543e-09, + "loss": 0.6178, + "step": 11168 + }, + { + "epoch": 0.9986587982832618, + "grad_norm": 0.14527431557075152, + "learning_rate": 9.435224363030238e-10, + "loss": 0.6269, + "step": 11169 + }, + { + "epoch": 0.9987482117310443, + "grad_norm": 0.13456107626737132, + "learning_rate": 8.219130444353518e-10, + "loss": 0.6046, + "step": 11170 + }, + { + "epoch": 0.9988376251788269, + "grad_norm": 0.13649829414230996, + "learning_rate": 7.086904628694769e-10, + "loss": 0.6738, + "step": 11171 + }, + { + "epoch": 0.9989270386266095, + "grad_norm": 0.14056407165783377, + "learning_rate": 6.038547010867035e-10, + "loss": 0.6206, + "step": 11172 + }, + { + "epoch": 0.9990164520743919, + "grad_norm": 0.13547108233580843, + "learning_rate": 5.074057678911004e-10, + "loss": 0.6606, + "step": 11173 + }, + { + "epoch": 0.9991058655221745, + "grad_norm": 0.1349109006287252, + "learning_rate": 4.1934367137619334e-10, + "loss": 0.6011, + "step": 11174 + }, + { + "epoch": 0.9991952789699571, + "grad_norm": 0.1465288001889633, + "learning_rate": 3.396684189249655e-10, + "loss": 0.6264, + "step": 11175 + }, + { + "epoch": 0.9992846924177397, + "grad_norm": 0.1509064477341791, + "learning_rate": 2.683800172098572e-10, + "loss": 0.6499, + "step": 11176 + }, + { + "epoch": 0.9993741058655222, + "grad_norm": 0.12281048992955401, + "learning_rate": 2.054784722149705e-10, + "loss": 0.5563, + "step": 11177 + }, + { + "epoch": 0.9994635193133047, + "grad_norm": 0.15031065286142145, + "learning_rate": 1.5096378922496712e-10, + "loss": 0.6348, + "step": 11178 + }, + { + "epoch": 0.9995529327610873, + "grad_norm": 0.14048436998292235, + "learning_rate": 1.0483597280286361e-10, + "loss": 0.6536, + "step": 11179 + }, + { + "epoch": 0.9996423462088698, + "grad_norm": 0.13144751379354727, + "learning_rate": 6.709502681223611e-11, + "loss": 0.5698, + "step": 11180 + }, + { + "epoch": 0.9997317596566524, + "grad_norm": 0.13268878435349393, + "learning_rate": 3.774095442832248e-11, + "loss": 0.6337, + "step": 11181 + }, + { + "epoch": 0.9998211731044349, + "grad_norm": 0.14046018099211374, + "learning_rate": 1.6773758104715597e-11, + "loss": 0.6155, + "step": 11182 + }, + { + "epoch": 0.9999105865522174, + "grad_norm": 0.15228027576912703, + "learning_rate": 4.193439617772299e-12, + "loss": 0.6301, + "step": 11183 + }, + { + "epoch": 1.0, + "grad_norm": 0.13481671828933445, + "learning_rate": 0.0, + "loss": 0.5608, + "step": 11184 + }, + { + "epoch": 1.0, + "step": 11184, + "total_flos": 4525840170385408.0, + "train_loss": 0.021487695587337954, + "train_runtime": 5038.0572, + "train_samples_per_second": 284.141, + "train_steps_per_second": 2.22 + } + ], + "logging_steps": 1.0, + "max_steps": 11184, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4525840170385408.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}