{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6640484848484849, "eval_steps": 16, "global_step": 10272, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.464646464646465e-05, "grad_norm": null, "learning_rate": 0.0, "loss": 29.6213, "step": 1 }, { "epoch": 0.0001292929292929293, "grad_norm": null, "learning_rate": 0.0, "loss": 29.6208, "step": 2 }, { "epoch": 0.00019393939393939395, "grad_norm": 129.40235900878906, "learning_rate": 4.3010752688172043e-07, "loss": 29.5846, "step": 3 }, { "epoch": 0.0002585858585858586, "grad_norm": null, "learning_rate": 4.3010752688172043e-07, "loss": 29.7161, "step": 4 }, { "epoch": 0.00032323232323232324, "grad_norm": 130.79031372070312, "learning_rate": 8.602150537634409e-07, "loss": 29.7196, "step": 5 }, { "epoch": 0.0003878787878787879, "grad_norm": 123.62369537353516, "learning_rate": 1.2903225806451614e-06, "loss": 29.2487, "step": 6 }, { "epoch": 0.0004525252525252525, "grad_norm": 135.1348876953125, "learning_rate": 1.7204301075268817e-06, "loss": 29.6055, "step": 7 }, { "epoch": 0.0005171717171717172, "grad_norm": null, "learning_rate": 1.7204301075268817e-06, "loss": 28.568, "step": 8 }, { "epoch": 0.0005818181818181818, "grad_norm": 270.385498046875, "learning_rate": 2.1505376344086023e-06, "loss": 29.7127, "step": 9 }, { "epoch": 0.0006464646464646465, "grad_norm": 109.7217788696289, "learning_rate": 2.580645161290323e-06, "loss": 29.2279, "step": 10 }, { "epoch": 0.0007111111111111111, "grad_norm": 403.46337890625, "learning_rate": 3.0107526881720433e-06, "loss": 28.7925, "step": 11 }, { "epoch": 0.0007757575757575758, "grad_norm": 114.81087493896484, "learning_rate": 3.4408602150537635e-06, "loss": 28.0663, "step": 12 }, { "epoch": 0.0008404040404040404, "grad_norm": 240.21282958984375, "learning_rate": 3.870967741935484e-06, "loss": 27.3475, "step": 13 }, { "epoch": 0.000905050505050505, "grad_norm": 
236.48675537109375, "learning_rate": 4.3010752688172045e-06, "loss": 26.9302, "step": 14 }, { "epoch": 0.0009696969696969697, "grad_norm": 203.8461456298828, "learning_rate": 4.731182795698925e-06, "loss": 24.0135, "step": 15 }, { "epoch": 0.0010343434343434343, "grad_norm": 156.15663146972656, "learning_rate": 5.161290322580646e-06, "loss": 22.7445, "step": 16 }, { "epoch": 0.0010343434343434343, "eval_bleu": 0.11919568898736486, "eval_loss": 21.227251052856445, "eval_runtime": 2.9098, "eval_samples_per_second": 10.997, "eval_steps_per_second": 1.375, "step": 16 }, { "epoch": 0.0010989898989898989, "grad_norm": 202.9501495361328, "learning_rate": 5.591397849462366e-06, "loss": 20.6978, "step": 17 }, { "epoch": 0.0011636363636363637, "grad_norm": 100.57891082763672, "learning_rate": 6.021505376344087e-06, "loss": 19.4305, "step": 18 }, { "epoch": 0.0012282828282828282, "grad_norm": null, "learning_rate": 6.021505376344087e-06, "loss": 17.1684, "step": 19 }, { "epoch": 0.001292929292929293, "grad_norm": 108.986572265625, "learning_rate": 6.451612903225806e-06, "loss": 16.4328, "step": 20 }, { "epoch": 0.0013575757575757575, "grad_norm": 118.8450927734375, "learning_rate": 6.881720430107527e-06, "loss": 16.6908, "step": 21 }, { "epoch": 0.0014222222222222223, "grad_norm": 251.05662536621094, "learning_rate": 7.3118279569892475e-06, "loss": 15.8439, "step": 22 }, { "epoch": 0.0014868686868686868, "grad_norm": 79.44746398925781, "learning_rate": 7.741935483870968e-06, "loss": 14.1431, "step": 23 }, { "epoch": 0.0015515151515151516, "grad_norm": 170.30389404296875, "learning_rate": 8.172043010752689e-06, "loss": 13.6728, "step": 24 }, { "epoch": 0.0016161616161616162, "grad_norm": 72.50897216796875, "learning_rate": 8.602150537634409e-06, "loss": 11.2049, "step": 25 }, { "epoch": 0.0016808080808080807, "grad_norm": 65.9330825805664, "learning_rate": 9.03225806451613e-06, "loss": 9.7004, "step": 26 }, { "epoch": 0.0017454545454545455, "grad_norm": 60.561038970947266, 
"learning_rate": 9.46236559139785e-06, "loss": 8.3309, "step": 27 }, { "epoch": 0.00181010101010101, "grad_norm": 69.28730010986328, "learning_rate": 9.89247311827957e-06, "loss": 9.5886, "step": 28 }, { "epoch": 0.0018747474747474748, "grad_norm": 61.073875427246094, "learning_rate": 1.0322580645161291e-05, "loss": 7.9738, "step": 29 }, { "epoch": 0.0019393939393939393, "grad_norm": 49.478477478027344, "learning_rate": 1.0752688172043012e-05, "loss": 5.8632, "step": 30 }, { "epoch": 0.002004040404040404, "grad_norm": 50.41263198852539, "learning_rate": 1.1182795698924732e-05, "loss": 5.2403, "step": 31 }, { "epoch": 0.0020686868686868687, "grad_norm": 49.01531982421875, "learning_rate": 1.1612903225806453e-05, "loss": 4.9331, "step": 32 }, { "epoch": 0.0020686868686868687, "eval_bleu": 0.17225555175409368, "eval_loss": 2.72825288772583, "eval_runtime": 2.9075, "eval_samples_per_second": 11.006, "eval_steps_per_second": 1.376, "step": 32 }, { "epoch": 0.0021333333333333334, "grad_norm": 52.26249694824219, "learning_rate": 1.2043010752688173e-05, "loss": 4.3053, "step": 33 }, { "epoch": 0.0021979797979797978, "grad_norm": 52.2663688659668, "learning_rate": 1.2473118279569892e-05, "loss": 3.687, "step": 34 }, { "epoch": 0.0022626262626262625, "grad_norm": 28.177982330322266, "learning_rate": 1.2903225806451613e-05, "loss": 2.4059, "step": 35 }, { "epoch": 0.0023272727272727273, "grad_norm": 19.620092391967773, "learning_rate": 1.3333333333333333e-05, "loss": 1.8274, "step": 36 }, { "epoch": 0.002391919191919192, "grad_norm": 11.509565353393555, "learning_rate": 1.3763440860215054e-05, "loss": 1.4432, "step": 37 }, { "epoch": 0.0024565656565656564, "grad_norm": 8.501072883605957, "learning_rate": 1.4193548387096774e-05, "loss": 1.2904, "step": 38 }, { "epoch": 0.002521212121212121, "grad_norm": 6.885224342346191, "learning_rate": 1.4623655913978495e-05, "loss": 1.2263, "step": 39 }, { "epoch": 0.002585858585858586, "grad_norm": 5.4081645011901855, "learning_rate": 
1.5053763440860215e-05, "loss": 1.0505, "step": 40 }, { "epoch": 0.0026505050505050507, "grad_norm": 4.560916900634766, "learning_rate": 1.5483870967741936e-05, "loss": 1.0065, "step": 41 }, { "epoch": 0.002715151515151515, "grad_norm": 3.7118706703186035, "learning_rate": 1.5913978494623657e-05, "loss": 0.9423, "step": 42 }, { "epoch": 0.00277979797979798, "grad_norm": 3.2654550075531006, "learning_rate": 1.6344086021505377e-05, "loss": 0.817, "step": 43 }, { "epoch": 0.0028444444444444446, "grad_norm": 2.726564645767212, "learning_rate": 1.6774193548387098e-05, "loss": 0.7942, "step": 44 }, { "epoch": 0.002909090909090909, "grad_norm": 2.5318379402160645, "learning_rate": 1.7204301075268818e-05, "loss": 0.6455, "step": 45 }, { "epoch": 0.0029737373737373737, "grad_norm": 2.1529147624969482, "learning_rate": 1.763440860215054e-05, "loss": 0.642, "step": 46 }, { "epoch": 0.0030383838383838385, "grad_norm": 1.7998542785644531, "learning_rate": 1.806451612903226e-05, "loss": 0.5299, "step": 47 }, { "epoch": 0.0031030303030303032, "grad_norm": 1.2775508165359497, "learning_rate": 1.849462365591398e-05, "loss": 0.4811, "step": 48 }, { "epoch": 0.0031030303030303032, "eval_bleu": 0.0, "eval_loss": 0.36655399203300476, "eval_runtime": 2.9074, "eval_samples_per_second": 11.006, "eval_steps_per_second": 1.376, "step": 48 }, { "epoch": 0.0031676767676767676, "grad_norm": 0.896143913269043, "learning_rate": 1.89247311827957e-05, "loss": 0.4713, "step": 49 }, { "epoch": 0.0032323232323232323, "grad_norm": 0.6520284414291382, "learning_rate": 1.935483870967742e-05, "loss": 0.3871, "step": 50 }, { "epoch": 0.003296969696969697, "grad_norm": 0.5051870942115784, "learning_rate": 1.978494623655914e-05, "loss": 0.3739, "step": 51 }, { "epoch": 0.0033616161616161614, "grad_norm": 0.43156668543815613, "learning_rate": 2.0215053763440862e-05, "loss": 0.3706, "step": 52 }, { "epoch": 0.003426262626262626, "grad_norm": 0.45535755157470703, "learning_rate": 2.0645161290322582e-05, 
"loss": 0.3435, "step": 53 }, { "epoch": 0.003490909090909091, "grad_norm": 0.560636043548584, "learning_rate": 2.1075268817204303e-05, "loss": 0.3588, "step": 54 }, { "epoch": 0.0035555555555555557, "grad_norm": 0.33022740483283997, "learning_rate": 2.1505376344086024e-05, "loss": 0.3293, "step": 55 }, { "epoch": 0.00362020202020202, "grad_norm": 0.5199302434921265, "learning_rate": 2.1935483870967744e-05, "loss": 0.373, "step": 56 }, { "epoch": 0.003684848484848485, "grad_norm": 0.4684373438358307, "learning_rate": 2.2365591397849465e-05, "loss": 0.3395, "step": 57 }, { "epoch": 0.0037494949494949496, "grad_norm": 0.3800906836986542, "learning_rate": 2.2795698924731185e-05, "loss": 0.3123, "step": 58 }, { "epoch": 0.003814141414141414, "grad_norm": 0.3400656282901764, "learning_rate": 2.3225806451612906e-05, "loss": 0.3263, "step": 59 }, { "epoch": 0.0038787878787878787, "grad_norm": 0.25802895426750183, "learning_rate": 2.3655913978494626e-05, "loss": 0.3034, "step": 60 }, { "epoch": 0.0039434343434343435, "grad_norm": 0.2955683767795563, "learning_rate": 2.4086021505376347e-05, "loss": 0.2894, "step": 61 }, { "epoch": 0.004008080808080808, "grad_norm": 0.3214697241783142, "learning_rate": 2.4516129032258064e-05, "loss": 0.2861, "step": 62 }, { "epoch": 0.004072727272727273, "grad_norm": 0.21337643265724182, "learning_rate": 2.4946236559139784e-05, "loss": 0.2621, "step": 63 }, { "epoch": 0.004137373737373737, "grad_norm": 0.20859983563423157, "learning_rate": 2.537634408602151e-05, "loss": 0.244, "step": 64 }, { "epoch": 0.004137373737373737, "eval_bleu": 0.0, "eval_loss": 0.2603618800640106, "eval_runtime": 2.8237, "eval_samples_per_second": 11.333, "eval_steps_per_second": 1.417, "step": 64 }, { "epoch": 0.004202020202020202, "grad_norm": 0.2817174196243286, "learning_rate": 2.5806451612903226e-05, "loss": 0.3045, "step": 65 }, { "epoch": 0.004266666666666667, "grad_norm": 0.3275797665119171, "learning_rate": 2.623655913978495e-05, "loss": 0.3167, "step": 66 
}, { "epoch": 0.004331313131313131, "grad_norm": 0.2828182280063629, "learning_rate": 2.6666666666666667e-05, "loss": 0.2951, "step": 67 }, { "epoch": 0.0043959595959595955, "grad_norm": 0.3147842288017273, "learning_rate": 2.709677419354839e-05, "loss": 0.2565, "step": 68 }, { "epoch": 0.004460606060606061, "grad_norm": 0.2775740325450897, "learning_rate": 2.7526881720430108e-05, "loss": 0.2515, "step": 69 }, { "epoch": 0.004525252525252525, "grad_norm": 0.22093918919563293, "learning_rate": 2.7956989247311828e-05, "loss": 0.2277, "step": 70 }, { "epoch": 0.00458989898989899, "grad_norm": 0.2860693633556366, "learning_rate": 2.838709677419355e-05, "loss": 0.2625, "step": 71 }, { "epoch": 0.004654545454545455, "grad_norm": 0.2432243973016739, "learning_rate": 2.881720430107527e-05, "loss": 0.2475, "step": 72 }, { "epoch": 0.004719191919191919, "grad_norm": 0.2548949122428894, "learning_rate": 2.924731182795699e-05, "loss": 0.2591, "step": 73 }, { "epoch": 0.004783838383838384, "grad_norm": 0.2704143226146698, "learning_rate": 2.967741935483871e-05, "loss": 0.2398, "step": 74 }, { "epoch": 0.0048484848484848485, "grad_norm": 0.288873553276062, "learning_rate": 3.010752688172043e-05, "loss": 0.2207, "step": 75 }, { "epoch": 0.004913131313131313, "grad_norm": 0.43518224358558655, "learning_rate": 3.053763440860215e-05, "loss": 0.1955, "step": 76 }, { "epoch": 0.004977777777777778, "grad_norm": 0.23072396218776703, "learning_rate": 3.096774193548387e-05, "loss": 0.228, "step": 77 }, { "epoch": 0.005042424242424242, "grad_norm": 0.31609800457954407, "learning_rate": 3.139784946236559e-05, "loss": 0.2218, "step": 78 }, { "epoch": 0.005107070707070707, "grad_norm": 0.2203676551580429, "learning_rate": 3.182795698924731e-05, "loss": 0.1978, "step": 79 }, { "epoch": 0.005171717171717172, "grad_norm": 0.22825172543525696, "learning_rate": 3.2258064516129034e-05, "loss": 0.1938, "step": 80 }, { "epoch": 0.005171717171717172, "eval_bleu": 0.0, "eval_loss": 0.19310392439365387, 
"eval_runtime": 2.8471, "eval_samples_per_second": 11.24, "eval_steps_per_second": 1.405, "step": 80 }, { "epoch": 0.005236363636363636, "grad_norm": 0.24557648599147797, "learning_rate": 3.2688172043010754e-05, "loss": 0.2094, "step": 81 }, { "epoch": 0.005301010101010101, "grad_norm": 0.22608253359794617, "learning_rate": 3.3118279569892475e-05, "loss": 0.1994, "step": 82 }, { "epoch": 0.005365656565656566, "grad_norm": 0.23522968590259552, "learning_rate": 3.3548387096774195e-05, "loss": 0.2014, "step": 83 }, { "epoch": 0.00543030303030303, "grad_norm": 0.23294328153133392, "learning_rate": 3.3978494623655916e-05, "loss": 0.2068, "step": 84 }, { "epoch": 0.005494949494949495, "grad_norm": 0.23539653420448303, "learning_rate": 3.4408602150537636e-05, "loss": 0.203, "step": 85 }, { "epoch": 0.00555959595959596, "grad_norm": 0.32274749875068665, "learning_rate": 3.483870967741936e-05, "loss": 0.2463, "step": 86 }, { "epoch": 0.005624242424242424, "grad_norm": 0.25700676441192627, "learning_rate": 3.526881720430108e-05, "loss": 0.2063, "step": 87 }, { "epoch": 0.005688888888888889, "grad_norm": 0.2676103413105011, "learning_rate": 3.56989247311828e-05, "loss": 0.2091, "step": 88 }, { "epoch": 0.0057535353535353535, "grad_norm": 0.2309941202402115, "learning_rate": 3.612903225806452e-05, "loss": 0.1992, "step": 89 }, { "epoch": 0.005818181818181818, "grad_norm": 0.21144503355026245, "learning_rate": 3.655913978494624e-05, "loss": 0.1971, "step": 90 }, { "epoch": 0.005882828282828283, "grad_norm": 0.2694931924343109, "learning_rate": 3.698924731182796e-05, "loss": 0.216, "step": 91 }, { "epoch": 0.005947474747474747, "grad_norm": 0.39304476976394653, "learning_rate": 3.741935483870968e-05, "loss": 0.2672, "step": 92 }, { "epoch": 0.006012121212121212, "grad_norm": 0.2071070820093155, "learning_rate": 3.78494623655914e-05, "loss": 0.1754, "step": 93 }, { "epoch": 0.006076767676767677, "grad_norm": 0.23533540964126587, "learning_rate": 3.827956989247312e-05, "loss": 
0.198, "step": 94 }, { "epoch": 0.006141414141414141, "grad_norm": 0.23313389718532562, "learning_rate": 3.870967741935484e-05, "loss": 0.1926, "step": 95 }, { "epoch": 0.0062060606060606064, "grad_norm": 0.2327883541584015, "learning_rate": 3.913978494623656e-05, "loss": 0.1932, "step": 96 }, { "epoch": 0.0062060606060606064, "eval_bleu": 0.0, "eval_loss": 0.1656179279088974, "eval_runtime": 2.8794, "eval_samples_per_second": 11.113, "eval_steps_per_second": 1.389, "step": 96 }, { "epoch": 0.006270707070707071, "grad_norm": 0.24508926272392273, "learning_rate": 3.956989247311828e-05, "loss": 0.1954, "step": 97 }, { "epoch": 0.006335353535353535, "grad_norm": 0.27829626202583313, "learning_rate": 4e-05, "loss": 0.2349, "step": 98 }, { "epoch": 0.0064, "grad_norm": 0.2639622390270233, "learning_rate": 4.0430107526881724e-05, "loss": 0.2216, "step": 99 }, { "epoch": 0.006464646464646465, "grad_norm": 0.1994614154100418, "learning_rate": 4.0860215053763444e-05, "loss": 0.1629, "step": 100 }, { "epoch": 0.006529292929292929, "grad_norm": 0.2456357777118683, "learning_rate": 4.1290322580645165e-05, "loss": 0.183, "step": 101 }, { "epoch": 0.006593939393939394, "grad_norm": 0.253951758146286, "learning_rate": 4.172043010752688e-05, "loss": 0.1886, "step": 102 }, { "epoch": 0.0066585858585858585, "grad_norm": 0.2239103466272354, "learning_rate": 4.2150537634408606e-05, "loss": 0.1814, "step": 103 }, { "epoch": 0.006723232323232323, "grad_norm": 0.25032392144203186, "learning_rate": 4.258064516129032e-05, "loss": 0.1802, "step": 104 }, { "epoch": 0.006787878787878788, "grad_norm": 0.2761096656322479, "learning_rate": 4.301075268817205e-05, "loss": 0.1888, "step": 105 }, { "epoch": 0.006852525252525252, "grad_norm": 0.2294616997241974, "learning_rate": 4.344086021505376e-05, "loss": 0.1636, "step": 106 }, { "epoch": 0.006917171717171718, "grad_norm": 0.23688875138759613, "learning_rate": 4.387096774193549e-05, "loss": 0.1546, "step": 107 }, { "epoch": 0.006981818181818182, 
"grad_norm": 0.27356773614883423, "learning_rate": 4.43010752688172e-05, "loss": 0.1611, "step": 108 }, { "epoch": 0.007046464646464646, "grad_norm": 0.24604901671409607, "learning_rate": 4.473118279569893e-05, "loss": 0.1598, "step": 109 }, { "epoch": 0.0071111111111111115, "grad_norm": 0.22883355617523193, "learning_rate": 4.516129032258064e-05, "loss": 0.1334, "step": 110 }, { "epoch": 0.007175757575757576, "grad_norm": 0.3058576285839081, "learning_rate": 4.559139784946237e-05, "loss": 0.1583, "step": 111 }, { "epoch": 0.00724040404040404, "grad_norm": 0.25770094990730286, "learning_rate": 4.6021505376344084e-05, "loss": 0.1449, "step": 112 }, { "epoch": 0.00724040404040404, "eval_bleu": 0.004012177831303324, "eval_loss": 0.1327122449874878, "eval_runtime": 2.8585, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 112 }, { "epoch": 0.007305050505050505, "grad_norm": 0.24761152267456055, "learning_rate": 4.645161290322581e-05, "loss": 0.1485, "step": 113 }, { "epoch": 0.00736969696969697, "grad_norm": 0.24573925137519836, "learning_rate": 4.688172043010753e-05, "loss": 0.1406, "step": 114 }, { "epoch": 0.007434343434343434, "grad_norm": 0.276619553565979, "learning_rate": 4.731182795698925e-05, "loss": 0.1494, "step": 115 }, { "epoch": 0.007498989898989899, "grad_norm": 0.2808314561843872, "learning_rate": 4.774193548387097e-05, "loss": 0.1685, "step": 116 }, { "epoch": 0.0075636363636363635, "grad_norm": 0.2550530731678009, "learning_rate": 4.8172043010752693e-05, "loss": 0.1445, "step": 117 }, { "epoch": 0.007628282828282828, "grad_norm": 0.2385779619216919, "learning_rate": 4.8602150537634414e-05, "loss": 0.1473, "step": 118 }, { "epoch": 0.007692929292929293, "grad_norm": 0.22435127198696136, "learning_rate": 4.903225806451613e-05, "loss": 0.1424, "step": 119 }, { "epoch": 0.007757575757575757, "grad_norm": 0.21921704709529877, "learning_rate": 4.9462365591397855e-05, "loss": 0.1494, "step": 120 }, { "epoch": 0.007822222222222222, 
"grad_norm": 0.19863726198673248, "learning_rate": 4.989247311827957e-05, "loss": 0.1431, "step": 121 }, { "epoch": 0.007886868686868687, "grad_norm": 0.2568920850753784, "learning_rate": 5.032258064516129e-05, "loss": 0.1389, "step": 122 }, { "epoch": 0.007951515151515152, "grad_norm": 0.24171364307403564, "learning_rate": 5.075268817204302e-05, "loss": 0.1616, "step": 123 }, { "epoch": 0.008016161616161616, "grad_norm": 0.1914776861667633, "learning_rate": 5.118279569892474e-05, "loss": 0.1371, "step": 124 }, { "epoch": 0.00808080808080808, "grad_norm": 0.20348064601421356, "learning_rate": 5.161290322580645e-05, "loss": 0.1361, "step": 125 }, { "epoch": 0.008145454545454546, "grad_norm": 0.19211728870868683, "learning_rate": 5.204301075268817e-05, "loss": 0.1381, "step": 126 }, { "epoch": 0.00821010101010101, "grad_norm": 0.2039393037557602, "learning_rate": 5.24731182795699e-05, "loss": 0.1473, "step": 127 }, { "epoch": 0.008274747474747475, "grad_norm": 0.17385540902614594, "learning_rate": 5.290322580645162e-05, "loss": 0.1229, "step": 128 }, { "epoch": 0.008274747474747475, "eval_bleu": 6.231980354669859, "eval_loss": 0.12336916476488113, "eval_runtime": 2.4144, "eval_samples_per_second": 13.254, "eval_steps_per_second": 1.657, "step": 128 }, { "epoch": 0.00833939393939394, "grad_norm": 0.2055896371603012, "learning_rate": 5.333333333333333e-05, "loss": 0.1519, "step": 129 }, { "epoch": 0.008404040404040403, "grad_norm": 0.1943008005619049, "learning_rate": 5.3763440860215054e-05, "loss": 0.127, "step": 130 }, { "epoch": 0.008468686868686869, "grad_norm": 0.20179802179336548, "learning_rate": 5.419354838709678e-05, "loss": 0.1548, "step": 131 }, { "epoch": 0.008533333333333334, "grad_norm": 0.20027868449687958, "learning_rate": 5.46236559139785e-05, "loss": 0.1469, "step": 132 }, { "epoch": 0.008597979797979797, "grad_norm": 0.2637341320514679, "learning_rate": 5.5053763440860215e-05, "loss": 0.1439, "step": 133 }, { "epoch": 0.008662626262626262, 
"grad_norm": 0.2555563747882843, "learning_rate": 5.5483870967741936e-05, "loss": 0.124, "step": 134 }, { "epoch": 0.008727272727272728, "grad_norm": 0.21842581033706665, "learning_rate": 5.5913978494623656e-05, "loss": 0.1436, "step": 135 }, { "epoch": 0.008791919191919191, "grad_norm": 0.21638083457946777, "learning_rate": 5.6344086021505384e-05, "loss": 0.1476, "step": 136 }, { "epoch": 0.008856565656565656, "grad_norm": 0.2058630734682083, "learning_rate": 5.67741935483871e-05, "loss": 0.1488, "step": 137 }, { "epoch": 0.008921212121212121, "grad_norm": 0.178453728556633, "learning_rate": 5.720430107526882e-05, "loss": 0.1342, "step": 138 }, { "epoch": 0.008985858585858587, "grad_norm": 0.22210922837257385, "learning_rate": 5.763440860215054e-05, "loss": 0.1497, "step": 139 }, { "epoch": 0.00905050505050505, "grad_norm": 0.1818244308233261, "learning_rate": 5.8064516129032266e-05, "loss": 0.1297, "step": 140 }, { "epoch": 0.009115151515151515, "grad_norm": 0.19282251596450806, "learning_rate": 5.849462365591398e-05, "loss": 0.1225, "step": 141 }, { "epoch": 0.00917979797979798, "grad_norm": 0.2010301947593689, "learning_rate": 5.89247311827957e-05, "loss": 0.1489, "step": 142 }, { "epoch": 0.009244444444444444, "grad_norm": 0.21354353427886963, "learning_rate": 5.935483870967742e-05, "loss": 0.1614, "step": 143 }, { "epoch": 0.00930909090909091, "grad_norm": 0.2229049801826477, "learning_rate": 5.978494623655915e-05, "loss": 0.1265, "step": 144 }, { "epoch": 0.00930909090909091, "eval_bleu": 4.760007484239427, "eval_loss": 0.1183546632528305, "eval_runtime": 2.5139, "eval_samples_per_second": 12.729, "eval_steps_per_second": 1.591, "step": 144 }, { "epoch": 0.009373737373737374, "grad_norm": 0.18631191551685333, "learning_rate": 6.021505376344086e-05, "loss": 0.1467, "step": 145 }, { "epoch": 0.009438383838383838, "grad_norm": 0.19207659363746643, "learning_rate": 6.064516129032258e-05, "loss": 0.134, "step": 146 }, { "epoch": 0.009503030303030303, "grad_norm": 
0.18229234218597412, "learning_rate": 6.10752688172043e-05, "loss": 0.1294, "step": 147 }, { "epoch": 0.009567676767676768, "grad_norm": 0.21776947379112244, "learning_rate": 6.150537634408602e-05, "loss": 0.1417, "step": 148 }, { "epoch": 0.009632323232323232, "grad_norm": 0.1958850473165512, "learning_rate": 6.193548387096774e-05, "loss": 0.1239, "step": 149 }, { "epoch": 0.009696969696969697, "grad_norm": 0.2348705679178238, "learning_rate": 6.236559139784946e-05, "loss": 0.136, "step": 150 }, { "epoch": 0.009761616161616162, "grad_norm": 0.18097767233848572, "learning_rate": 6.279569892473119e-05, "loss": 0.1377, "step": 151 }, { "epoch": 0.009826262626262626, "grad_norm": 0.17325381934642792, "learning_rate": 6.32258064516129e-05, "loss": 0.1348, "step": 152 }, { "epoch": 0.00989090909090909, "grad_norm": 0.17179083824157715, "learning_rate": 6.365591397849463e-05, "loss": 0.1241, "step": 153 }, { "epoch": 0.009955555555555556, "grad_norm": 0.18480801582336426, "learning_rate": 6.408602150537635e-05, "loss": 0.1319, "step": 154 }, { "epoch": 0.01002020202020202, "grad_norm": 0.2286769598722458, "learning_rate": 6.451612903225807e-05, "loss": 0.152, "step": 155 }, { "epoch": 0.010084848484848485, "grad_norm": 0.17827528715133667, "learning_rate": 6.494623655913979e-05, "loss": 0.1287, "step": 156 }, { "epoch": 0.01014949494949495, "grad_norm": 0.22023025155067444, "learning_rate": 6.537634408602151e-05, "loss": 0.157, "step": 157 }, { "epoch": 0.010214141414141413, "grad_norm": 0.16185691952705383, "learning_rate": 6.580645161290323e-05, "loss": 0.1068, "step": 158 }, { "epoch": 0.010278787878787879, "grad_norm": 0.1795404702425003, "learning_rate": 6.623655913978495e-05, "loss": 0.1281, "step": 159 }, { "epoch": 0.010343434343434344, "grad_norm": 0.20915783941745758, "learning_rate": 6.666666666666667e-05, "loss": 0.1614, "step": 160 }, { "epoch": 0.010343434343434344, "eval_bleu": 6.592458544293429, "eval_loss": 0.11603529751300812, "eval_runtime": 2.5802, 
"eval_samples_per_second": 12.402, "eval_steps_per_second": 1.55, "step": 160 }, { "epoch": 0.010408080808080807, "grad_norm": 0.16381213068962097, "learning_rate": 6.709677419354839e-05, "loss": 0.1225, "step": 161 }, { "epoch": 0.010472727272727272, "grad_norm": 0.17188864946365356, "learning_rate": 6.752688172043011e-05, "loss": 0.1224, "step": 162 }, { "epoch": 0.010537373737373738, "grad_norm": 0.2340584695339203, "learning_rate": 6.795698924731183e-05, "loss": 0.1327, "step": 163 }, { "epoch": 0.010602020202020203, "grad_norm": 0.18974143266677856, "learning_rate": 6.838709677419355e-05, "loss": 0.1401, "step": 164 }, { "epoch": 0.010666666666666666, "grad_norm": 0.16689695417881012, "learning_rate": 6.881720430107527e-05, "loss": 0.1304, "step": 165 }, { "epoch": 0.010731313131313132, "grad_norm": 0.17095914483070374, "learning_rate": 6.924731182795699e-05, "loss": 0.1386, "step": 166 }, { "epoch": 0.010795959595959597, "grad_norm": 0.17770056426525116, "learning_rate": 6.967741935483871e-05, "loss": 0.1336, "step": 167 }, { "epoch": 0.01086060606060606, "grad_norm": 0.18237479031085968, "learning_rate": 7.010752688172043e-05, "loss": 0.136, "step": 168 }, { "epoch": 0.010925252525252525, "grad_norm": 0.16110308468341827, "learning_rate": 7.053763440860215e-05, "loss": 0.1138, "step": 169 }, { "epoch": 0.01098989898989899, "grad_norm": 0.18047171831130981, "learning_rate": 7.096774193548388e-05, "loss": 0.1378, "step": 170 }, { "epoch": 0.011054545454545454, "grad_norm": 0.16840171813964844, "learning_rate": 7.13978494623656e-05, "loss": 0.1399, "step": 171 }, { "epoch": 0.01111919191919192, "grad_norm": 0.17557452619075775, "learning_rate": 7.182795698924732e-05, "loss": 0.1335, "step": 172 }, { "epoch": 0.011183838383838384, "grad_norm": 0.15990616381168365, "learning_rate": 7.225806451612904e-05, "loss": 0.1152, "step": 173 }, { "epoch": 0.011248484848484848, "grad_norm": 0.19198541343212128, "learning_rate": 7.268817204301076e-05, "loss": 0.1474, "step": 
174 }, { "epoch": 0.011313131313131313, "grad_norm": 0.1823148876428604, "learning_rate": 7.311827956989248e-05, "loss": 0.1399, "step": 175 }, { "epoch": 0.011377777777777778, "grad_norm": 0.17804116010665894, "learning_rate": 7.35483870967742e-05, "loss": 0.1154, "step": 176 }, { "epoch": 0.011377777777777778, "eval_bleu": 8.868003981640207, "eval_loss": 0.11387009918689728, "eval_runtime": 2.8192, "eval_samples_per_second": 11.351, "eval_steps_per_second": 1.419, "step": 176 }, { "epoch": 0.011442424242424242, "grad_norm": 0.18475455045700073, "learning_rate": 7.397849462365592e-05, "loss": 0.1395, "step": 177 }, { "epoch": 0.011507070707070707, "grad_norm": 0.18610043823719025, "learning_rate": 7.440860215053764e-05, "loss": 0.133, "step": 178 }, { "epoch": 0.011571717171717172, "grad_norm": 0.19519630074501038, "learning_rate": 7.483870967741936e-05, "loss": 0.1533, "step": 179 }, { "epoch": 0.011636363636363636, "grad_norm": 0.2074822634458542, "learning_rate": 7.526881720430108e-05, "loss": 0.117, "step": 180 }, { "epoch": 0.0117010101010101, "grad_norm": 0.189409539103508, "learning_rate": 7.56989247311828e-05, "loss": 0.1244, "step": 181 }, { "epoch": 0.011765656565656566, "grad_norm": 0.16102752089500427, "learning_rate": 7.612903225806451e-05, "loss": 0.1172, "step": 182 }, { "epoch": 0.01183030303030303, "grad_norm": 0.1906556636095047, "learning_rate": 7.655913978494624e-05, "loss": 0.1062, "step": 183 }, { "epoch": 0.011894949494949495, "grad_norm": 0.17854231595993042, "learning_rate": 7.698924731182796e-05, "loss": 0.1232, "step": 184 }, { "epoch": 0.01195959595959596, "grad_norm": 0.22238144278526306, "learning_rate": 7.741935483870968e-05, "loss": 0.1436, "step": 185 }, { "epoch": 0.012024242424242423, "grad_norm": 0.18775483965873718, "learning_rate": 7.784946236559139e-05, "loss": 0.1272, "step": 186 }, { "epoch": 0.012088888888888889, "grad_norm": 0.1833924651145935, "learning_rate": 7.827956989247312e-05, "loss": 0.1434, "step": 187 }, { 
"epoch": 0.012153535353535354, "grad_norm": 0.19351021945476532, "learning_rate": 7.870967741935484e-05, "loss": 0.1387, "step": 188 }, { "epoch": 0.012218181818181819, "grad_norm": 0.1965048760175705, "learning_rate": 7.913978494623657e-05, "loss": 0.1233, "step": 189 }, { "epoch": 0.012282828282828282, "grad_norm": 0.20669583976268768, "learning_rate": 7.956989247311829e-05, "loss": 0.1371, "step": 190 }, { "epoch": 0.012347474747474748, "grad_norm": 0.17665086686611176, "learning_rate": 8e-05, "loss": 0.1303, "step": 191 }, { "epoch": 0.012412121212121213, "grad_norm": 0.16998814046382904, "learning_rate": 8.043010752688173e-05, "loss": 0.1269, "step": 192 }, { "epoch": 0.012412121212121213, "eval_bleu": 4.975597595079175, "eval_loss": 0.1205844059586525, "eval_runtime": 2.4161, "eval_samples_per_second": 13.245, "eval_steps_per_second": 1.656, "step": 192 }, { "epoch": 0.012476767676767676, "grad_norm": 0.1767059713602066, "learning_rate": 8.086021505376345e-05, "loss": 0.1318, "step": 193 }, { "epoch": 0.012541414141414142, "grad_norm": 0.15831409394741058, "learning_rate": 8.129032258064517e-05, "loss": 0.1129, "step": 194 }, { "epoch": 0.012606060606060607, "grad_norm": 0.22060465812683105, "learning_rate": 8.172043010752689e-05, "loss": 0.1389, "step": 195 }, { "epoch": 0.01267070707070707, "grad_norm": 0.15754492580890656, "learning_rate": 8.215053763440861e-05, "loss": 0.1112, "step": 196 }, { "epoch": 0.012735353535353535, "grad_norm": 0.17196407914161682, "learning_rate": 8.258064516129033e-05, "loss": 0.1213, "step": 197 }, { "epoch": 0.0128, "grad_norm": 0.195866659283638, "learning_rate": 8.301075268817205e-05, "loss": 0.1517, "step": 198 }, { "epoch": 0.012864646464646464, "grad_norm": 0.1495412290096283, "learning_rate": 8.344086021505376e-05, "loss": 0.1067, "step": 199 }, { "epoch": 0.01292929292929293, "grad_norm": 0.22708700597286224, "learning_rate": 8.387096774193549e-05, "loss": 0.1478, "step": 200 }, { "epoch": 0.012993939393939394, 
"grad_norm": 0.18813180923461914, "learning_rate": 8.430107526881721e-05, "loss": 0.1474, "step": 201 }, { "epoch": 0.013058585858585858, "grad_norm": 0.212677463889122, "learning_rate": 8.473118279569893e-05, "loss": 0.1276, "step": 202 }, { "epoch": 0.013123232323232323, "grad_norm": 0.19477659463882446, "learning_rate": 8.516129032258064e-05, "loss": 0.1407, "step": 203 }, { "epoch": 0.013187878787878788, "grad_norm": 0.17175044119358063, "learning_rate": 8.559139784946237e-05, "loss": 0.1357, "step": 204 }, { "epoch": 0.013252525252525252, "grad_norm": 0.18147721886634827, "learning_rate": 8.60215053763441e-05, "loss": 0.1111, "step": 205 }, { "epoch": 0.013317171717171717, "grad_norm": 0.18739114701747894, "learning_rate": 8.645161290322581e-05, "loss": 0.1306, "step": 206 }, { "epoch": 0.013381818181818182, "grad_norm": 0.18932896852493286, "learning_rate": 8.688172043010752e-05, "loss": 0.1441, "step": 207 }, { "epoch": 0.013446464646464646, "grad_norm": 0.16915710270404816, "learning_rate": 8.731182795698926e-05, "loss": 0.1175, "step": 208 }, { "epoch": 0.013446464646464646, "eval_bleu": 7.276512799060872, "eval_loss": 0.11653009057044983, "eval_runtime": 2.4967, "eval_samples_per_second": 12.817, "eval_steps_per_second": 1.602, "step": 208 }, { "epoch": 0.013511111111111111, "grad_norm": 0.17084503173828125, "learning_rate": 8.774193548387098e-05, "loss": 0.1093, "step": 209 }, { "epoch": 0.013575757575757576, "grad_norm": 0.16955283284187317, "learning_rate": 8.81720430107527e-05, "loss": 0.1178, "step": 210 }, { "epoch": 0.01364040404040404, "grad_norm": 0.18181230127811432, "learning_rate": 8.86021505376344e-05, "loss": 0.1293, "step": 211 }, { "epoch": 0.013705050505050505, "grad_norm": 0.1908804327249527, "learning_rate": 8.903225806451614e-05, "loss": 0.139, "step": 212 }, { "epoch": 0.01376969696969697, "grad_norm": 0.19779594242572784, "learning_rate": 8.946236559139786e-05, "loss": 0.101, "step": 213 }, { "epoch": 0.013834343434343435, 
"grad_norm": 0.14478574693202972, "learning_rate": 8.989247311827958e-05, "loss": 0.1066, "step": 214 }, { "epoch": 0.013898989898989899, "grad_norm": 0.28901779651641846, "learning_rate": 9.032258064516129e-05, "loss": 0.1184, "step": 215 }, { "epoch": 0.013963636363636364, "grad_norm": 0.15521469712257385, "learning_rate": 9.0752688172043e-05, "loss": 0.1077, "step": 216 }, { "epoch": 0.014028282828282829, "grad_norm": 0.20941248536109924, "learning_rate": 9.118279569892474e-05, "loss": 0.1559, "step": 217 }, { "epoch": 0.014092929292929293, "grad_norm": 0.18314620852470398, "learning_rate": 9.161290322580646e-05, "loss": 0.1388, "step": 218 }, { "epoch": 0.014157575757575758, "grad_norm": 0.17776654660701752, "learning_rate": 9.204301075268817e-05, "loss": 0.117, "step": 219 }, { "epoch": 0.014222222222222223, "grad_norm": 0.2089313566684723, "learning_rate": 9.247311827956989e-05, "loss": 0.1436, "step": 220 }, { "epoch": 0.014286868686868686, "grad_norm": 0.1833810657262802, "learning_rate": 9.290322580645162e-05, "loss": 0.13, "step": 221 }, { "epoch": 0.014351515151515152, "grad_norm": 0.18056225776672363, "learning_rate": 9.333333333333334e-05, "loss": 0.1171, "step": 222 }, { "epoch": 0.014416161616161617, "grad_norm": 0.22030754387378693, "learning_rate": 9.376344086021506e-05, "loss": 0.1385, "step": 223 }, { "epoch": 0.01448080808080808, "grad_norm": 0.17485859990119934, "learning_rate": 9.419354838709677e-05, "loss": 0.1092, "step": 224 }, { "epoch": 0.01448080808080808, "eval_bleu": 7.809131634642059, "eval_loss": 0.1150018498301506, "eval_runtime": 2.5848, "eval_samples_per_second": 12.38, "eval_steps_per_second": 1.547, "step": 224 }, { "epoch": 0.014545454545454545, "grad_norm": 0.1666616052389145, "learning_rate": 9.46236559139785e-05, "loss": 0.1136, "step": 225 }, { "epoch": 0.01461010101010101, "grad_norm": 0.18403193354606628, "learning_rate": 9.505376344086023e-05, "loss": 0.1392, "step": 226 }, { "epoch": 0.014674747474747474, "grad_norm": 
0.17291735112667084, "learning_rate": 9.548387096774195e-05, "loss": 0.109, "step": 227 }, { "epoch": 0.01473939393939394, "grad_norm": 0.17544101178646088, "learning_rate": 9.591397849462365e-05, "loss": 0.1334, "step": 228 }, { "epoch": 0.014804040404040405, "grad_norm": 0.17171847820281982, "learning_rate": 9.634408602150539e-05, "loss": 0.1254, "step": 229 }, { "epoch": 0.014868686868686868, "grad_norm": 0.17139862477779388, "learning_rate": 9.677419354838711e-05, "loss": 0.1071, "step": 230 }, { "epoch": 0.014933333333333333, "grad_norm": 0.17982217669487, "learning_rate": 9.720430107526883e-05, "loss": 0.097, "step": 231 }, { "epoch": 0.014997979797979798, "grad_norm": 0.1756395548582077, "learning_rate": 9.763440860215054e-05, "loss": 0.1341, "step": 232 }, { "epoch": 0.015062626262626262, "grad_norm": 0.1753191500902176, "learning_rate": 9.806451612903226e-05, "loss": 0.128, "step": 233 }, { "epoch": 0.015127272727272727, "grad_norm": 0.16588540375232697, "learning_rate": 9.849462365591399e-05, "loss": 0.1171, "step": 234 }, { "epoch": 0.015191919191919192, "grad_norm": 0.17201820015907288, "learning_rate": 9.892473118279571e-05, "loss": 0.1312, "step": 235 }, { "epoch": 0.015256565656565656, "grad_norm": 0.1373736709356308, "learning_rate": 9.935483870967742e-05, "loss": 0.0967, "step": 236 }, { "epoch": 0.015321212121212121, "grad_norm": 0.19025053083896637, "learning_rate": 9.978494623655914e-05, "loss": 0.1205, "step": 237 }, { "epoch": 0.015385858585858586, "grad_norm": 0.17612391710281372, "learning_rate": 0.00010021505376344087, "loss": 0.131, "step": 238 }, { "epoch": 0.015450505050505051, "grad_norm": 0.17722564935684204, "learning_rate": 0.00010064516129032258, "loss": 0.1275, "step": 239 }, { "epoch": 0.015515151515151515, "grad_norm": 0.157410129904747, "learning_rate": 0.0001010752688172043, "loss": 0.1163, "step": 240 }, { "epoch": 0.015515151515151515, "eval_bleu": 7.879718785339313, "eval_loss": 0.11526702344417572, "eval_runtime": 2.6169, 
"eval_samples_per_second": 12.228, "eval_steps_per_second": 1.529, "step": 240 }, { "epoch": 0.01557979797979798, "grad_norm": 0.15548042953014374, "learning_rate": 0.00010150537634408603, "loss": 0.113, "step": 241 }, { "epoch": 0.015644444444444443, "grad_norm": 0.16928356885910034, "learning_rate": 0.00010193548387096774, "loss": 0.1227, "step": 242 }, { "epoch": 0.01570909090909091, "grad_norm": 0.16685104370117188, "learning_rate": 0.00010236559139784947, "loss": 0.1269, "step": 243 }, { "epoch": 0.015773737373737374, "grad_norm": 0.18364295363426208, "learning_rate": 0.0001027956989247312, "loss": 0.1318, "step": 244 }, { "epoch": 0.01583838383838384, "grad_norm": 0.14628556370735168, "learning_rate": 0.0001032258064516129, "loss": 0.109, "step": 245 }, { "epoch": 0.015903030303030304, "grad_norm": 0.16133062541484833, "learning_rate": 0.00010365591397849464, "loss": 0.1171, "step": 246 }, { "epoch": 0.015967676767676766, "grad_norm": 0.2608477771282196, "learning_rate": 0.00010408602150537634, "loss": 0.1279, "step": 247 }, { "epoch": 0.01603232323232323, "grad_norm": 0.16215598583221436, "learning_rate": 0.00010451612903225806, "loss": 0.1191, "step": 248 }, { "epoch": 0.016096969696969696, "grad_norm": 0.15406298637390137, "learning_rate": 0.0001049462365591398, "loss": 0.1197, "step": 249 }, { "epoch": 0.01616161616161616, "grad_norm": 0.15008051693439484, "learning_rate": 0.0001053763440860215, "loss": 0.1099, "step": 250 }, { "epoch": 0.016226262626262627, "grad_norm": 0.1588115245103836, "learning_rate": 0.00010580645161290324, "loss": 0.1269, "step": 251 }, { "epoch": 0.016290909090909092, "grad_norm": 0.1505240648984909, "learning_rate": 0.00010623655913978495, "loss": 0.117, "step": 252 }, { "epoch": 0.016355555555555557, "grad_norm": 0.1762067973613739, "learning_rate": 0.00010666666666666667, "loss": 0.1432, "step": 253 }, { "epoch": 0.01642020202020202, "grad_norm": 0.1506965160369873, "learning_rate": 0.0001070967741935484, "loss": 0.1075, 
"step": 254 }, { "epoch": 0.016484848484848484, "grad_norm": 0.16768166422843933, "learning_rate": 0.00010752688172043011, "loss": 0.1211, "step": 255 }, { "epoch": 0.01654949494949495, "grad_norm": 0.18663646280765533, "learning_rate": 0.00010795698924731184, "loss": 0.1161, "step": 256 }, { "epoch": 0.01654949494949495, "eval_bleu": 7.040350332130717, "eval_loss": 0.1121298223733902, "eval_runtime": 2.4273, "eval_samples_per_second": 13.183, "eval_steps_per_second": 1.648, "step": 256 }, { "epoch": 0.016614141414141415, "grad_norm": 0.17525675892829895, "learning_rate": 0.00010838709677419356, "loss": 0.1269, "step": 257 }, { "epoch": 0.01667878787878788, "grad_norm": 0.15843874216079712, "learning_rate": 0.00010881720430107527, "loss": 0.1191, "step": 258 }, { "epoch": 0.016743434343434345, "grad_norm": 0.1823432892560959, "learning_rate": 0.000109247311827957, "loss": 0.1353, "step": 259 }, { "epoch": 0.016808080808080807, "grad_norm": 0.18377064168453217, "learning_rate": 0.00010967741935483871, "loss": 0.1424, "step": 260 }, { "epoch": 0.016872727272727272, "grad_norm": 0.1670287549495697, "learning_rate": 0.00011010752688172043, "loss": 0.1167, "step": 261 }, { "epoch": 0.016937373737373737, "grad_norm": 0.17760691046714783, "learning_rate": 0.00011053763440860216, "loss": 0.1371, "step": 262 }, { "epoch": 0.017002020202020202, "grad_norm": 0.15814171731472015, "learning_rate": 0.00011096774193548387, "loss": 0.1083, "step": 263 }, { "epoch": 0.017066666666666667, "grad_norm": 0.168021097779274, "learning_rate": 0.0001113978494623656, "loss": 0.1212, "step": 264 }, { "epoch": 0.017131313131313133, "grad_norm": 0.1638791710138321, "learning_rate": 0.00011182795698924731, "loss": 0.1227, "step": 265 }, { "epoch": 0.017195959595959594, "grad_norm": 0.16392038762569427, "learning_rate": 0.00011225806451612903, "loss": 0.1123, "step": 266 }, { "epoch": 0.01726060606060606, "grad_norm": 0.1560571789741516, "learning_rate": 0.00011268817204301077, "loss": 0.1039, 
"step": 267 }, { "epoch": 0.017325252525252525, "grad_norm": 0.15675808489322662, "learning_rate": 0.00011311827956989247, "loss": 0.1192, "step": 268 }, { "epoch": 0.01738989898989899, "grad_norm": 0.1586473435163498, "learning_rate": 0.0001135483870967742, "loss": 0.115, "step": 269 }, { "epoch": 0.017454545454545455, "grad_norm": 0.23141983151435852, "learning_rate": 0.00011397849462365593, "loss": 0.1105, "step": 270 }, { "epoch": 0.01751919191919192, "grad_norm": 0.17948609590530396, "learning_rate": 0.00011440860215053764, "loss": 0.1238, "step": 271 }, { "epoch": 0.017583838383838382, "grad_norm": 0.1589084267616272, "learning_rate": 0.00011483870967741937, "loss": 0.1142, "step": 272 }, { "epoch": 0.017583838383838382, "eval_bleu": 8.868664561614475, "eval_loss": 0.11044229567050934, "eval_runtime": 2.6707, "eval_samples_per_second": 11.982, "eval_steps_per_second": 1.498, "step": 272 }, { "epoch": 0.017648484848484847, "grad_norm": 0.17362943291664124, "learning_rate": 0.00011526881720430108, "loss": 0.1427, "step": 273 }, { "epoch": 0.017713131313131313, "grad_norm": 0.16138732433319092, "learning_rate": 0.0001156989247311828, "loss": 0.1143, "step": 274 }, { "epoch": 0.017777777777777778, "grad_norm": 0.15941482782363892, "learning_rate": 0.00011612903225806453, "loss": 0.1088, "step": 275 }, { "epoch": 0.017842424242424243, "grad_norm": 0.15772004425525665, "learning_rate": 0.00011655913978494624, "loss": 0.12, "step": 276 }, { "epoch": 0.017907070707070708, "grad_norm": 0.16370543837547302, "learning_rate": 0.00011698924731182796, "loss": 0.1172, "step": 277 }, { "epoch": 0.017971717171717173, "grad_norm": 0.1775922328233719, "learning_rate": 0.00011741935483870967, "loss": 0.1211, "step": 278 }, { "epoch": 0.018036363636363635, "grad_norm": 0.17987173795700073, "learning_rate": 0.0001178494623655914, "loss": 0.1354, "step": 279 }, { "epoch": 0.0181010101010101, "grad_norm": 0.1710910052061081, "learning_rate": 0.00011827956989247313, "loss": 0.1128, 
"step": 280 }, { "epoch": 0.018165656565656566, "grad_norm": 0.15026962757110596, "learning_rate": 0.00011870967741935484, "loss": 0.1001, "step": 281 }, { "epoch": 0.01823030303030303, "grad_norm": 0.16633881628513336, "learning_rate": 0.00011913978494623656, "loss": 0.1117, "step": 282 }, { "epoch": 0.018294949494949496, "grad_norm": 0.18486183881759644, "learning_rate": 0.0001195698924731183, "loss": 0.1416, "step": 283 }, { "epoch": 0.01835959595959596, "grad_norm": 0.15950919687747955, "learning_rate": 0.00012, "loss": 0.1145, "step": 284 }, { "epoch": 0.018424242424242423, "grad_norm": 0.17504741251468658, "learning_rate": 0.00012043010752688172, "loss": 0.1325, "step": 285 }, { "epoch": 0.018488888888888888, "grad_norm": 0.17341840267181396, "learning_rate": 0.00012086021505376343, "loss": 0.1324, "step": 286 }, { "epoch": 0.018553535353535353, "grad_norm": 0.16707849502563477, "learning_rate": 0.00012129032258064516, "loss": 0.0998, "step": 287 }, { "epoch": 0.01861818181818182, "grad_norm": 0.2094668447971344, "learning_rate": 0.0001217204301075269, "loss": 0.1306, "step": 288 }, { "epoch": 0.01861818181818182, "eval_bleu": 8.091362015630345, "eval_loss": 0.11228324472904205, "eval_runtime": 2.3875, "eval_samples_per_second": 13.403, "eval_steps_per_second": 1.675, "step": 288 }, { "epoch": 0.018682828282828284, "grad_norm": 0.15353377163410187, "learning_rate": 0.0001221505376344086, "loss": 0.1039, "step": 289 }, { "epoch": 0.01874747474747475, "grad_norm": 0.14731451869010925, "learning_rate": 0.00012258064516129034, "loss": 0.1, "step": 290 }, { "epoch": 0.01881212121212121, "grad_norm": 0.170234814286232, "learning_rate": 0.00012301075268817205, "loss": 0.1249, "step": 291 }, { "epoch": 0.018876767676767676, "grad_norm": 0.16308999061584473, "learning_rate": 0.00012344086021505375, "loss": 0.1317, "step": 292 }, { "epoch": 0.01894141414141414, "grad_norm": 0.15882478654384613, "learning_rate": 0.0001238709677419355, "loss": 0.105, "step": 293 }, { 
"epoch": 0.019006060606060606, "grad_norm": 0.17380963265895844, "learning_rate": 0.0001243010752688172, "loss": 0.1343, "step": 294 }, { "epoch": 0.01907070707070707, "grad_norm": 0.1621273010969162, "learning_rate": 0.00012473118279569893, "loss": 0.1197, "step": 295 }, { "epoch": 0.019135353535353537, "grad_norm": 0.16630950570106506, "learning_rate": 0.00012516129032258066, "loss": 0.1219, "step": 296 }, { "epoch": 0.0192, "grad_norm": 0.18303367495536804, "learning_rate": 0.00012559139784946237, "loss": 0.1093, "step": 297 }, { "epoch": 0.019264646464646464, "grad_norm": 0.20401039719581604, "learning_rate": 0.0001260215053763441, "loss": 0.1268, "step": 298 }, { "epoch": 0.01932929292929293, "grad_norm": 0.19388994574546814, "learning_rate": 0.0001264516129032258, "loss": 0.1371, "step": 299 }, { "epoch": 0.019393939393939394, "grad_norm": 0.16796191036701202, "learning_rate": 0.00012688172043010752, "loss": 0.1222, "step": 300 }, { "epoch": 0.01945858585858586, "grad_norm": 0.17263269424438477, "learning_rate": 0.00012731182795698925, "loss": 0.1103, "step": 301 }, { "epoch": 0.019523232323232324, "grad_norm": 0.15675833821296692, "learning_rate": 0.00012774193548387096, "loss": 0.1128, "step": 302 }, { "epoch": 0.01958787878787879, "grad_norm": 0.174618199467659, "learning_rate": 0.0001281720430107527, "loss": 0.1342, "step": 303 }, { "epoch": 0.01965252525252525, "grad_norm": 0.1889706254005432, "learning_rate": 0.00012860215053763443, "loss": 0.1336, "step": 304 }, { "epoch": 0.01965252525252525, "eval_bleu": 10.705749789759375, "eval_loss": 0.11087147146463394, "eval_runtime": 2.9216, "eval_samples_per_second": 10.953, "eval_steps_per_second": 1.369, "step": 304 }, { "epoch": 0.019717171717171716, "grad_norm": 0.1739739030599594, "learning_rate": 0.00012903225806451613, "loss": 0.1166, "step": 305 }, { "epoch": 0.01978181818181818, "grad_norm": 0.14401200413703918, "learning_rate": 0.00012946236559139787, "loss": 0.0986, "step": 306 }, { "epoch": 
0.019846464646464647, "grad_norm": 0.1703801155090332, "learning_rate": 0.00012989247311827958, "loss": 0.1257, "step": 307 }, { "epoch": 0.019911111111111112, "grad_norm": 0.13767610490322113, "learning_rate": 0.0001303225806451613, "loss": 0.096, "step": 308 }, { "epoch": 0.019975757575757577, "grad_norm": 0.15267498791217804, "learning_rate": 0.00013075268817204302, "loss": 0.11, "step": 309 }, { "epoch": 0.02004040404040404, "grad_norm": 0.13670149445533752, "learning_rate": 0.00013118279569892472, "loss": 0.0986, "step": 310 }, { "epoch": 0.020105050505050504, "grad_norm": 0.1716107279062271, "learning_rate": 0.00013161290322580646, "loss": 0.1348, "step": 311 }, { "epoch": 0.02016969696969697, "grad_norm": 0.1574842482805252, "learning_rate": 0.00013204301075268816, "loss": 0.1177, "step": 312 }, { "epoch": 0.020234343434343435, "grad_norm": 0.16496491432189941, "learning_rate": 0.0001324731182795699, "loss": 0.1257, "step": 313 }, { "epoch": 0.0202989898989899, "grad_norm": 0.14341674745082855, "learning_rate": 0.00013290322580645163, "loss": 0.1009, "step": 314 }, { "epoch": 0.020363636363636365, "grad_norm": 0.16001056134700775, "learning_rate": 0.00013333333333333334, "loss": 0.1206, "step": 315 }, { "epoch": 0.020428282828282827, "grad_norm": 0.16191469132900238, "learning_rate": 0.00013376344086021507, "loss": 0.1209, "step": 316 }, { "epoch": 0.020492929292929292, "grad_norm": 0.1328209638595581, "learning_rate": 0.00013419354838709678, "loss": 0.0868, "step": 317 }, { "epoch": 0.020557575757575757, "grad_norm": 0.16641101241111755, "learning_rate": 0.0001346236559139785, "loss": 0.1241, "step": 318 }, { "epoch": 0.020622222222222222, "grad_norm": 0.15474484860897064, "learning_rate": 0.00013505376344086022, "loss": 0.1055, "step": 319 }, { "epoch": 0.020686868686868688, "grad_norm": 0.16334268450737, "learning_rate": 0.00013548387096774193, "loss": 0.111, "step": 320 }, { "epoch": 0.020686868686868688, "eval_bleu": 8.121325828182469, "eval_loss": 
0.11115045845508575, "eval_runtime": 2.3728, "eval_samples_per_second": 13.486, "eval_steps_per_second": 1.686, "step": 320 }, { "epoch": 0.020751515151515153, "grad_norm": 0.129131481051445, "learning_rate": 0.00013591397849462366, "loss": 0.0885, "step": 321 }, { "epoch": 0.020816161616161614, "grad_norm": 0.1746491640806198, "learning_rate": 0.0001363440860215054, "loss": 0.1311, "step": 322 }, { "epoch": 0.02088080808080808, "grad_norm": 0.15188440680503845, "learning_rate": 0.0001367741935483871, "loss": 0.1223, "step": 323 }, { "epoch": 0.020945454545454545, "grad_norm": 0.15379559993743896, "learning_rate": 0.00013720430107526884, "loss": 0.1156, "step": 324 }, { "epoch": 0.02101010101010101, "grad_norm": 0.16021427512168884, "learning_rate": 0.00013763440860215055, "loss": 0.1242, "step": 325 }, { "epoch": 0.021074747474747475, "grad_norm": 0.16248944401741028, "learning_rate": 0.00013806451612903225, "loss": 0.109, "step": 326 }, { "epoch": 0.02113939393939394, "grad_norm": 0.1452634632587433, "learning_rate": 0.00013849462365591399, "loss": 0.1041, "step": 327 }, { "epoch": 0.021204040404040406, "grad_norm": 0.1568138748407364, "learning_rate": 0.0001389247311827957, "loss": 0.1139, "step": 328 }, { "epoch": 0.021268686868686867, "grad_norm": 0.15839411318302155, "learning_rate": 0.00013935483870967743, "loss": 0.1102, "step": 329 }, { "epoch": 0.021333333333333333, "grad_norm": 0.15745337307453156, "learning_rate": 0.00013978494623655916, "loss": 0.1223, "step": 330 }, { "epoch": 0.021397979797979798, "grad_norm": 0.1718154102563858, "learning_rate": 0.00014021505376344087, "loss": 0.1301, "step": 331 }, { "epoch": 0.021462626262626263, "grad_norm": 0.16774027049541473, "learning_rate": 0.0001406451612903226, "loss": 0.1301, "step": 332 }, { "epoch": 0.021527272727272728, "grad_norm": 0.1731647104024887, "learning_rate": 0.0001410752688172043, "loss": 0.1336, "step": 333 }, { "epoch": 0.021591919191919193, "grad_norm": 0.15440630912780762, 
"learning_rate": 0.00014150537634408602, "loss": 0.1105, "step": 334 }, { "epoch": 0.021656565656565655, "grad_norm": 0.16279609501361847, "learning_rate": 0.00014193548387096775, "loss": 0.1217, "step": 335 }, { "epoch": 0.02172121212121212, "grad_norm": 0.143400177359581, "learning_rate": 0.00014236559139784946, "loss": 0.0959, "step": 336 }, { "epoch": 0.02172121212121212, "eval_bleu": 9.318219961493885, "eval_loss": 0.10899758338928223, "eval_runtime": 2.8131, "eval_samples_per_second": 11.375, "eval_steps_per_second": 1.422, "step": 336 }, { "epoch": 0.021785858585858586, "grad_norm": 0.14454936981201172, "learning_rate": 0.0001427956989247312, "loss": 0.1034, "step": 337 }, { "epoch": 0.02185050505050505, "grad_norm": 0.15263137221336365, "learning_rate": 0.00014322580645161293, "loss": 0.1184, "step": 338 }, { "epoch": 0.021915151515151516, "grad_norm": 0.16419324278831482, "learning_rate": 0.00014365591397849463, "loss": 0.112, "step": 339 }, { "epoch": 0.02197979797979798, "grad_norm": 0.16806355118751526, "learning_rate": 0.00014408602150537637, "loss": 0.1249, "step": 340 }, { "epoch": 0.022044444444444443, "grad_norm": 0.1690777838230133, "learning_rate": 0.00014451612903225807, "loss": 0.1272, "step": 341 }, { "epoch": 0.022109090909090908, "grad_norm": 0.16707856953144073, "learning_rate": 0.00014494623655913978, "loss": 0.1139, "step": 342 }, { "epoch": 0.022173737373737373, "grad_norm": 0.17025162279605865, "learning_rate": 0.00014537634408602151, "loss": 0.1201, "step": 343 }, { "epoch": 0.02223838383838384, "grad_norm": 0.14075659215450287, "learning_rate": 0.00014580645161290322, "loss": 0.1028, "step": 344 }, { "epoch": 0.022303030303030304, "grad_norm": 0.17504535615444183, "learning_rate": 0.00014623655913978496, "loss": 0.1383, "step": 345 }, { "epoch": 0.02236767676767677, "grad_norm": 0.16352058947086334, "learning_rate": 0.00014666666666666666, "loss": 0.1229, "step": 346 }, { "epoch": 0.02243232323232323, "grad_norm": 0.16439294815063477, 
"learning_rate": 0.0001470967741935484, "loss": 0.1181, "step": 347 }, { "epoch": 0.022496969696969696, "grad_norm": 0.16326642036437988, "learning_rate": 0.00014752688172043013, "loss": 0.1095, "step": 348 }, { "epoch": 0.02256161616161616, "grad_norm": 0.1376395970582962, "learning_rate": 0.00014795698924731184, "loss": 0.0921, "step": 349 }, { "epoch": 0.022626262626262626, "grad_norm": 0.14896328747272491, "learning_rate": 0.00014838709677419355, "loss": 0.1139, "step": 350 }, { "epoch": 0.02269090909090909, "grad_norm": 0.1763393133878708, "learning_rate": 0.00014881720430107528, "loss": 0.1084, "step": 351 }, { "epoch": 0.022755555555555557, "grad_norm": 0.13971780240535736, "learning_rate": 0.00014924731182795699, "loss": 0.0977, "step": 352 }, { "epoch": 0.022755555555555557, "eval_bleu": 9.32524427394953, "eval_loss": 0.10860705375671387, "eval_runtime": 2.6408, "eval_samples_per_second": 12.118, "eval_steps_per_second": 1.515, "step": 352 }, { "epoch": 0.022820202020202022, "grad_norm": 0.1523587703704834, "learning_rate": 0.00014967741935483872, "loss": 0.1175, "step": 353 }, { "epoch": 0.022884848484848484, "grad_norm": 0.18091051280498505, "learning_rate": 0.00015010752688172043, "loss": 0.1047, "step": 354 }, { "epoch": 0.02294949494949495, "grad_norm": 0.18230056762695312, "learning_rate": 0.00015053763440860216, "loss": 0.1293, "step": 355 }, { "epoch": 0.023014141414141414, "grad_norm": 0.17838574945926666, "learning_rate": 0.0001509677419354839, "loss": 0.1225, "step": 356 }, { "epoch": 0.02307878787878788, "grad_norm": 0.18112388253211975, "learning_rate": 0.0001513978494623656, "loss": 0.1322, "step": 357 }, { "epoch": 0.023143434343434344, "grad_norm": 0.16122449934482574, "learning_rate": 0.0001518279569892473, "loss": 0.1133, "step": 358 }, { "epoch": 0.02320808080808081, "grad_norm": 0.14952917397022247, "learning_rate": 0.00015225806451612902, "loss": 0.1026, "step": 359 }, { "epoch": 0.02327272727272727, "grad_norm": 0.16230295598506927, 
"learning_rate": 0.00015268817204301075, "loss": 0.1224, "step": 360 }, { "epoch": 0.023337373737373737, "grad_norm": 0.14395084977149963, "learning_rate": 0.00015311827956989248, "loss": 0.1149, "step": 361 }, { "epoch": 0.0234020202020202, "grad_norm": 0.17280416190624237, "learning_rate": 0.0001535483870967742, "loss": 0.1217, "step": 362 }, { "epoch": 0.023466666666666667, "grad_norm": 0.15629975497722626, "learning_rate": 0.00015397849462365593, "loss": 0.1207, "step": 363 }, { "epoch": 0.023531313131313132, "grad_norm": 0.15779243409633636, "learning_rate": 0.00015440860215053766, "loss": 0.1247, "step": 364 }, { "epoch": 0.023595959595959597, "grad_norm": 0.16891762614250183, "learning_rate": 0.00015483870967741937, "loss": 0.1444, "step": 365 }, { "epoch": 0.02366060606060606, "grad_norm": 0.13558053970336914, "learning_rate": 0.00015526881720430107, "loss": 0.1034, "step": 366 }, { "epoch": 0.023725252525252524, "grad_norm": 0.1764685958623886, "learning_rate": 0.00015569892473118278, "loss": 0.1052, "step": 367 }, { "epoch": 0.02378989898989899, "grad_norm": 0.1592056155204773, "learning_rate": 0.00015612903225806451, "loss": 0.1225, "step": 368 }, { "epoch": 0.02378989898989899, "eval_bleu": 9.103427970211893, "eval_loss": 0.10848551988601685, "eval_runtime": 2.7711, "eval_samples_per_second": 11.548, "eval_steps_per_second": 1.443, "step": 368 }, { "epoch": 0.023854545454545455, "grad_norm": 0.13903047144412994, "learning_rate": 0.00015655913978494625, "loss": 0.1061, "step": 369 }, { "epoch": 0.02391919191919192, "grad_norm": 0.18397219479084015, "learning_rate": 0.00015698924731182796, "loss": 0.132, "step": 370 }, { "epoch": 0.023983838383838385, "grad_norm": 0.14793841540813446, "learning_rate": 0.0001574193548387097, "loss": 0.1124, "step": 371 }, { "epoch": 0.024048484848484847, "grad_norm": 0.15606354176998138, "learning_rate": 0.00015784946236559142, "loss": 0.1161, "step": 372 }, { "epoch": 0.024113131313131312, "grad_norm": 
0.15853872895240784, "learning_rate": 0.00015827956989247313, "loss": 0.1166, "step": 373 }, { "epoch": 0.024177777777777777, "grad_norm": 0.15579386055469513, "learning_rate": 0.00015870967741935487, "loss": 0.1163, "step": 374 }, { "epoch": 0.024242424242424242, "grad_norm": 0.17541849613189697, "learning_rate": 0.00015913978494623657, "loss": 0.1143, "step": 375 }, { "epoch": 0.024307070707070708, "grad_norm": 0.22376100718975067, "learning_rate": 0.00015956989247311828, "loss": 0.1227, "step": 376 }, { "epoch": 0.024371717171717173, "grad_norm": 0.16659735143184662, "learning_rate": 0.00016, "loss": 0.1174, "step": 377 }, { "epoch": 0.024436363636363638, "grad_norm": 0.14780963957309723, "learning_rate": 0.00016043010752688172, "loss": 0.1116, "step": 378 }, { "epoch": 0.0245010101010101, "grad_norm": 0.16586990654468536, "learning_rate": 0.00016086021505376345, "loss": 0.1279, "step": 379 }, { "epoch": 0.024565656565656565, "grad_norm": 0.13963653147220612, "learning_rate": 0.00016129032258064516, "loss": 0.1099, "step": 380 }, { "epoch": 0.02463030303030303, "grad_norm": 0.16145245730876923, "learning_rate": 0.0001617204301075269, "loss": 0.1121, "step": 381 }, { "epoch": 0.024694949494949495, "grad_norm": 0.13311390578746796, "learning_rate": 0.00016215053763440863, "loss": 0.106, "step": 382 }, { "epoch": 0.02475959595959596, "grad_norm": 0.16619884967803955, "learning_rate": 0.00016258064516129034, "loss": 0.1158, "step": 383 }, { "epoch": 0.024824242424242426, "grad_norm": 0.15011127293109894, "learning_rate": 0.00016301075268817204, "loss": 0.1119, "step": 384 }, { "epoch": 0.024824242424242426, "eval_bleu": 10.882295300737969, "eval_loss": 0.10799592733383179, "eval_runtime": 2.601, "eval_samples_per_second": 12.303, "eval_steps_per_second": 1.538, "step": 384 }, { "epoch": 0.024888888888888887, "grad_norm": 0.11864225566387177, "learning_rate": 0.00016344086021505378, "loss": 0.0956, "step": 385 }, { "epoch": 0.024953535353535353, "grad_norm": 
0.13069860637187958, "learning_rate": 0.00016387096774193548, "loss": 0.0954, "step": 386 }, { "epoch": 0.025018181818181818, "grad_norm": 0.13603512942790985, "learning_rate": 0.00016430107526881722, "loss": 0.1018, "step": 387 }, { "epoch": 0.025082828282828283, "grad_norm": 0.1649290770292282, "learning_rate": 0.00016473118279569893, "loss": 0.1242, "step": 388 }, { "epoch": 0.02514747474747475, "grad_norm": 0.16284801065921783, "learning_rate": 0.00016516129032258066, "loss": 0.1085, "step": 389 }, { "epoch": 0.025212121212121213, "grad_norm": 0.13699135184288025, "learning_rate": 0.0001655913978494624, "loss": 0.1021, "step": 390 }, { "epoch": 0.025276767676767675, "grad_norm": 0.13586992025375366, "learning_rate": 0.0001660215053763441, "loss": 0.0973, "step": 391 }, { "epoch": 0.02534141414141414, "grad_norm": 0.1439896821975708, "learning_rate": 0.0001664516129032258, "loss": 0.1043, "step": 392 }, { "epoch": 0.025406060606060606, "grad_norm": 0.18616266548633575, "learning_rate": 0.00016688172043010751, "loss": 0.1456, "step": 393 }, { "epoch": 0.02547070707070707, "grad_norm": 0.28124067187309265, "learning_rate": 0.00016731182795698925, "loss": 0.1111, "step": 394 }, { "epoch": 0.025535353535353536, "grad_norm": 0.14582084119319916, "learning_rate": 0.00016774193548387098, "loss": 0.1121, "step": 395 }, { "epoch": 0.0256, "grad_norm": 0.14482928812503815, "learning_rate": 0.0001681720430107527, "loss": 0.114, "step": 396 }, { "epoch": 0.025664646464646463, "grad_norm": 0.13774289190769196, "learning_rate": 0.00016860215053763442, "loss": 0.1123, "step": 397 }, { "epoch": 0.025729292929292928, "grad_norm": 0.14453327655792236, "learning_rate": 0.00016903225806451616, "loss": 0.1207, "step": 398 }, { "epoch": 0.025793939393939393, "grad_norm": 0.12729491293430328, "learning_rate": 0.00016946236559139786, "loss": 0.0952, "step": 399 }, { "epoch": 0.02585858585858586, "grad_norm": 0.14126811921596527, "learning_rate": 0.00016989247311827957, "loss": 0.12, 
"step": 400 }, { "epoch": 0.02585858585858586, "eval_bleu": 8.985186608284193, "eval_loss": 0.10655423253774643, "eval_runtime": 2.6462, "eval_samples_per_second": 12.093, "eval_steps_per_second": 1.512, "step": 400 }, { "epoch": 0.025923232323232324, "grad_norm": 0.143293097615242, "learning_rate": 0.00017032258064516128, "loss": 0.13, "step": 401 }, { "epoch": 0.02598787878787879, "grad_norm": 0.14245979487895966, "learning_rate": 0.000170752688172043, "loss": 0.1118, "step": 402 }, { "epoch": 0.026052525252525254, "grad_norm": 0.1558937132358551, "learning_rate": 0.00017118279569892475, "loss": 0.1152, "step": 403 }, { "epoch": 0.026117171717171716, "grad_norm": 0.1501445472240448, "learning_rate": 0.00017161290322580645, "loss": 0.1156, "step": 404 }, { "epoch": 0.02618181818181818, "grad_norm": 0.1374216228723526, "learning_rate": 0.0001720430107526882, "loss": 0.1002, "step": 405 }, { "epoch": 0.026246464646464646, "grad_norm": 0.1564859300851822, "learning_rate": 0.0001724731182795699, "loss": 0.1215, "step": 406 }, { "epoch": 0.02631111111111111, "grad_norm": 0.16713081300258636, "learning_rate": 0.00017290322580645163, "loss": 0.1305, "step": 407 }, { "epoch": 0.026375757575757577, "grad_norm": 0.13711397349834442, "learning_rate": 0.00017333333333333334, "loss": 0.098, "step": 408 }, { "epoch": 0.026440404040404042, "grad_norm": 0.13817580044269562, "learning_rate": 0.00017376344086021504, "loss": 0.0974, "step": 409 }, { "epoch": 0.026505050505050504, "grad_norm": 0.1366218775510788, "learning_rate": 0.00017419354838709678, "loss": 0.1044, "step": 410 }, { "epoch": 0.02656969696969697, "grad_norm": 0.19042985141277313, "learning_rate": 0.0001746236559139785, "loss": 0.1298, "step": 411 }, { "epoch": 0.026634343434343434, "grad_norm": 0.14615961909294128, "learning_rate": 0.00017505376344086022, "loss": 0.0993, "step": 412 }, { "epoch": 0.0266989898989899, "grad_norm": 0.15030674636363983, "learning_rate": 0.00017548387096774195, "loss": 0.1209, "step": 
413 }, { "epoch": 0.026763636363636364, "grad_norm": 0.147713765501976, "learning_rate": 0.00017591397849462366, "loss": 0.1207, "step": 414 }, { "epoch": 0.02682828282828283, "grad_norm": 0.15125833451747894, "learning_rate": 0.0001763440860215054, "loss": 0.1223, "step": 415 }, { "epoch": 0.02689292929292929, "grad_norm": 0.12598833441734314, "learning_rate": 0.0001767741935483871, "loss": 0.0899, "step": 416 }, { "epoch": 0.02689292929292929, "eval_bleu": 10.147141304417094, "eval_loss": 0.10775532573461533, "eval_runtime": 2.6245, "eval_samples_per_second": 12.193, "eval_steps_per_second": 1.524, "step": 416 }, { "epoch": 0.026957575757575757, "grad_norm": 0.16561436653137207, "learning_rate": 0.0001772043010752688, "loss": 0.1182, "step": 417 }, { "epoch": 0.027022222222222222, "grad_norm": 0.12544482946395874, "learning_rate": 0.00017763440860215054, "loss": 0.0974, "step": 418 }, { "epoch": 0.027086868686868687, "grad_norm": 0.1391218900680542, "learning_rate": 0.00017806451612903228, "loss": 0.103, "step": 419 }, { "epoch": 0.027151515151515152, "grad_norm": 0.15751127898693085, "learning_rate": 0.00017849462365591398, "loss": 0.1109, "step": 420 }, { "epoch": 0.027216161616161617, "grad_norm": 0.13848909735679626, "learning_rate": 0.00017892473118279572, "loss": 0.1024, "step": 421 }, { "epoch": 0.02728080808080808, "grad_norm": 0.15021491050720215, "learning_rate": 0.00017935483870967742, "loss": 0.1216, "step": 422 }, { "epoch": 0.027345454545454544, "grad_norm": 0.13931679725646973, "learning_rate": 0.00017978494623655916, "loss": 0.1174, "step": 423 }, { "epoch": 0.02741010101010101, "grad_norm": 0.16443245112895966, "learning_rate": 0.00018021505376344086, "loss": 0.1227, "step": 424 }, { "epoch": 0.027474747474747475, "grad_norm": 0.13369695842266083, "learning_rate": 0.00018064516129032257, "loss": 0.0986, "step": 425 }, { "epoch": 0.02753939393939394, "grad_norm": 0.16605685651302338, "learning_rate": 0.0001810752688172043, "loss": 0.1275, "step": 
426 }, { "epoch": 0.027604040404040405, "grad_norm": 0.1480400115251541, "learning_rate": 0.000181505376344086, "loss": 0.1105, "step": 427 }, { "epoch": 0.02766868686868687, "grad_norm": 0.17037196457386017, "learning_rate": 0.00018193548387096775, "loss": 0.1484, "step": 428 }, { "epoch": 0.027733333333333332, "grad_norm": 0.16051368415355682, "learning_rate": 0.00018236559139784948, "loss": 0.1316, "step": 429 }, { "epoch": 0.027797979797979797, "grad_norm": 0.16002199053764343, "learning_rate": 0.0001827956989247312, "loss": 0.1267, "step": 430 }, { "epoch": 0.027862626262626262, "grad_norm": 0.11067473888397217, "learning_rate": 0.00018322580645161292, "loss": 0.0765, "step": 431 }, { "epoch": 0.027927272727272728, "grad_norm": 0.14831194281578064, "learning_rate": 0.00018365591397849463, "loss": 0.129, "step": 432 }, { "epoch": 0.027927272727272728, "eval_bleu": 11.054277549790548, "eval_loss": 0.10615125298500061, "eval_runtime": 2.6125, "eval_samples_per_second": 12.249, "eval_steps_per_second": 1.531, "step": 432 }, { "epoch": 0.027991919191919193, "grad_norm": 0.14561083912849426, "learning_rate": 0.00018408602150537634, "loss": 0.0983, "step": 433 }, { "epoch": 0.028056565656565658, "grad_norm": 0.12259099632501602, "learning_rate": 0.00018451612903225807, "loss": 0.0862, "step": 434 }, { "epoch": 0.02812121212121212, "grad_norm": 0.1275368481874466, "learning_rate": 0.00018494623655913978, "loss": 0.0992, "step": 435 }, { "epoch": 0.028185858585858585, "grad_norm": 0.14508667588233948, "learning_rate": 0.0001853763440860215, "loss": 0.1126, "step": 436 }, { "epoch": 0.02825050505050505, "grad_norm": 0.14779242873191833, "learning_rate": 0.00018580645161290325, "loss": 0.1104, "step": 437 }, { "epoch": 0.028315151515151515, "grad_norm": 0.14809027314186096, "learning_rate": 0.00018623655913978495, "loss": 0.1158, "step": 438 }, { "epoch": 0.02837979797979798, "grad_norm": 0.18058310449123383, "learning_rate": 0.0001866666666666667, "loss": 0.1252, 
"step": 439 }, { "epoch": 0.028444444444444446, "grad_norm": 0.1464747041463852, "learning_rate": 0.0001870967741935484, "loss": 0.122, "step": 440 }, { "epoch": 0.028509090909090908, "grad_norm": 0.12998485565185547, "learning_rate": 0.00018752688172043013, "loss": 0.1166, "step": 441 }, { "epoch": 0.028573737373737373, "grad_norm": 0.12510083615779877, "learning_rate": 0.00018795698924731183, "loss": 0.1, "step": 442 }, { "epoch": 0.028638383838383838, "grad_norm": 0.14600686728954315, "learning_rate": 0.00018838709677419354, "loss": 0.119, "step": 443 }, { "epoch": 0.028703030303030303, "grad_norm": 0.16463568806648254, "learning_rate": 0.00018881720430107528, "loss": 0.1468, "step": 444 }, { "epoch": 0.02876767676767677, "grad_norm": 0.14338329434394836, "learning_rate": 0.000189247311827957, "loss": 0.11, "step": 445 }, { "epoch": 0.028832323232323234, "grad_norm": 0.12141603231430054, "learning_rate": 0.00018967741935483872, "loss": 0.0941, "step": 446 }, { "epoch": 0.028896969696969695, "grad_norm": 0.14474593102931976, "learning_rate": 0.00019010752688172045, "loss": 0.1278, "step": 447 }, { "epoch": 0.02896161616161616, "grad_norm": 0.13601356744766235, "learning_rate": 0.00019053763440860216, "loss": 0.1015, "step": 448 }, { "epoch": 0.02896161616161616, "eval_bleu": 10.693190110406718, "eval_loss": 0.10794844478368759, "eval_runtime": 2.5169, "eval_samples_per_second": 12.714, "eval_steps_per_second": 1.589, "step": 448 }, { "epoch": 0.029026262626262626, "grad_norm": 0.12670452892780304, "learning_rate": 0.0001909677419354839, "loss": 0.1091, "step": 449 }, { "epoch": 0.02909090909090909, "grad_norm": 0.16578026115894318, "learning_rate": 0.0001913978494623656, "loss": 0.118, "step": 450 }, { "epoch": 0.029155555555555556, "grad_norm": 0.15042001008987427, "learning_rate": 0.0001918279569892473, "loss": 0.1228, "step": 451 }, { "epoch": 0.02922020202020202, "grad_norm": 0.14987945556640625, "learning_rate": 0.00019225806451612904, "loss": 0.1228, 
"step": 452 }, { "epoch": 0.029284848484848486, "grad_norm": 0.1526365578174591, "learning_rate": 0.00019268817204301077, "loss": 0.1284, "step": 453 }, { "epoch": 0.029349494949494948, "grad_norm": 0.1318255364894867, "learning_rate": 0.00019311827956989248, "loss": 0.1039, "step": 454 }, { "epoch": 0.029414141414141413, "grad_norm": 0.1383906453847885, "learning_rate": 0.00019354838709677422, "loss": 0.1212, "step": 455 }, { "epoch": 0.02947878787878788, "grad_norm": 0.1535595953464508, "learning_rate": 0.00019397849462365592, "loss": 0.1155, "step": 456 }, { "epoch": 0.029543434343434344, "grad_norm": 0.1335660219192505, "learning_rate": 0.00019440860215053766, "loss": 0.1001, "step": 457 }, { "epoch": 0.02960808080808081, "grad_norm": 0.2634216248989105, "learning_rate": 0.00019483870967741936, "loss": 0.1192, "step": 458 }, { "epoch": 0.029672727272727274, "grad_norm": 0.13985425233840942, "learning_rate": 0.00019526881720430107, "loss": 0.1214, "step": 459 }, { "epoch": 0.029737373737373736, "grad_norm": 0.14772897958755493, "learning_rate": 0.0001956989247311828, "loss": 0.1125, "step": 460 }, { "epoch": 0.0298020202020202, "grad_norm": 0.1297396719455719, "learning_rate": 0.0001961290322580645, "loss": 0.0984, "step": 461 }, { "epoch": 0.029866666666666666, "grad_norm": 0.1519833654165268, "learning_rate": 0.00019655913978494625, "loss": 0.123, "step": 462 }, { "epoch": 0.02993131313131313, "grad_norm": 0.13482460379600525, "learning_rate": 0.00019698924731182798, "loss": 0.1069, "step": 463 }, { "epoch": 0.029995959595959597, "grad_norm": 0.13658182322978973, "learning_rate": 0.00019741935483870969, "loss": 0.1177, "step": 464 }, { "epoch": 0.029995959595959597, "eval_bleu": 12.572843086082546, "eval_loss": 0.10561101138591766, "eval_runtime": 2.7021, "eval_samples_per_second": 11.843, "eval_steps_per_second": 1.48, "step": 464 }, { "epoch": 0.030060606060606062, "grad_norm": 0.12853117287158966, "learning_rate": 0.00019784946236559142, "loss": 0.1016, 
"step": 465 }, { "epoch": 0.030125252525252524, "grad_norm": 0.14732927083969116, "learning_rate": 0.00019827956989247313, "loss": 0.128, "step": 466 }, { "epoch": 0.03018989898989899, "grad_norm": 0.1329525113105774, "learning_rate": 0.00019870967741935483, "loss": 0.1102, "step": 467 }, { "epoch": 0.030254545454545454, "grad_norm": 0.14983846247196198, "learning_rate": 0.00019913978494623657, "loss": 0.1311, "step": 468 }, { "epoch": 0.03031919191919192, "grad_norm": 0.12656426429748535, "learning_rate": 0.00019956989247311828, "loss": 0.1038, "step": 469 }, { "epoch": 0.030383838383838385, "grad_norm": 0.14365172386169434, "learning_rate": 0.0002, "loss": 0.1147, "step": 470 }, { "epoch": 0.03044848484848485, "grad_norm": 0.13733601570129395, "learning_rate": 0.00019999999976616652, "loss": 0.1123, "step": 471 }, { "epoch": 0.03051313131313131, "grad_norm": 0.1368652582168579, "learning_rate": 0.00019999999906466614, "loss": 0.1094, "step": 472 }, { "epoch": 0.030577777777777777, "grad_norm": 0.15315213799476624, "learning_rate": 0.00019999999789549876, "loss": 0.1239, "step": 473 }, { "epoch": 0.030642424242424242, "grad_norm": 0.14755412936210632, "learning_rate": 0.0001999999962586645, "loss": 0.115, "step": 474 }, { "epoch": 0.030707070707070707, "grad_norm": 0.14244329929351807, "learning_rate": 0.0001999999941541633, "loss": 0.1069, "step": 475 }, { "epoch": 0.030771717171717172, "grad_norm": 0.1433442085981369, "learning_rate": 0.0001999999915819952, "loss": 0.1073, "step": 476 }, { "epoch": 0.030836363636363637, "grad_norm": 0.14364516735076904, "learning_rate": 0.00019999998854216018, "loss": 0.1131, "step": 477 }, { "epoch": 0.030901010101010103, "grad_norm": 0.15006142854690552, "learning_rate": 0.0001999999850346583, "loss": 0.12, "step": 478 }, { "epoch": 0.030965656565656564, "grad_norm": 0.12459193915128708, "learning_rate": 0.00019999998105948953, "loss": 0.1, "step": 479 }, { "epoch": 0.03103030303030303, "grad_norm": 0.14505188167095184, 
"learning_rate": 0.0001999999766166539, "loss": 0.1126, "step": 480 }, { "epoch": 0.03103030303030303, "eval_bleu": 12.168736331249336, "eval_loss": 0.10556380450725555, "eval_runtime": 2.52, "eval_samples_per_second": 12.698, "eval_steps_per_second": 1.587, "step": 480 }, { "epoch": 0.031094949494949495, "grad_norm": 0.15447013080120087, "learning_rate": 0.0001999999717061514, "loss": 0.1132, "step": 481 }, { "epoch": 0.03115959595959596, "grad_norm": 0.11705537140369415, "learning_rate": 0.00019999996632798217, "loss": 0.0976, "step": 482 }, { "epoch": 0.031224242424242425, "grad_norm": 0.15111412107944489, "learning_rate": 0.00019999996048214612, "loss": 0.1139, "step": 483 }, { "epoch": 0.03128888888888889, "grad_norm": 0.14592544734477997, "learning_rate": 0.0001999999541686433, "loss": 0.1221, "step": 484 }, { "epoch": 0.031353535353535356, "grad_norm": 0.18990279734134674, "learning_rate": 0.00019999994738747378, "loss": 0.1517, "step": 485 }, { "epoch": 0.03141818181818182, "grad_norm": 0.14252693951129913, "learning_rate": 0.00019999994013863756, "loss": 0.1186, "step": 486 }, { "epoch": 0.031482828282828286, "grad_norm": 0.13235680758953094, "learning_rate": 0.00019999993242213467, "loss": 0.1163, "step": 487 }, { "epoch": 0.03154747474747475, "grad_norm": 0.21530663967132568, "learning_rate": 0.00019999992423796515, "loss": 0.1037, "step": 488 }, { "epoch": 0.03161212121212121, "grad_norm": 0.12231055647134781, "learning_rate": 0.00019999991558612904, "loss": 0.0925, "step": 489 }, { "epoch": 0.03167676767676768, "grad_norm": 0.146692156791687, "learning_rate": 0.00019999990646662642, "loss": 0.1056, "step": 490 }, { "epoch": 0.03174141414141414, "grad_norm": 0.16732384264469147, "learning_rate": 0.00019999989687945728, "loss": 0.1051, "step": 491 }, { "epoch": 0.03180606060606061, "grad_norm": 0.14350025355815887, "learning_rate": 0.00019999988682462168, "loss": 0.1193, "step": 492 }, { "epoch": 0.03187070707070707, "grad_norm": 0.21280625462532043, 
"learning_rate": 0.00019999987630211967, "loss": 0.1272, "step": 493 }, { "epoch": 0.03193535353535353, "grad_norm": 0.23272041976451874, "learning_rate": 0.0001999998653119513, "loss": 0.0916, "step": 494 }, { "epoch": 0.032, "grad_norm": 0.1862674206495285, "learning_rate": 0.0001999998538541166, "loss": 0.125, "step": 495 }, { "epoch": 0.03206464646464646, "grad_norm": 0.200235515832901, "learning_rate": 0.00019999984192861566, "loss": 0.1175, "step": 496 }, { "epoch": 0.03206464646464646, "eval_bleu": 11.162771160551632, "eval_loss": 0.1055934727191925, "eval_runtime": 2.5819, "eval_samples_per_second": 12.394, "eval_steps_per_second": 1.549, "step": 496 }, { "epoch": 0.03212929292929293, "grad_norm": 0.1706344038248062, "learning_rate": 0.00019999982953544852, "loss": 0.1156, "step": 497 }, { "epoch": 0.03219393939393939, "grad_norm": 0.1841149479150772, "learning_rate": 0.00019999981667461522, "loss": 0.1265, "step": 498 }, { "epoch": 0.03225858585858586, "grad_norm": 0.14012755453586578, "learning_rate": 0.00019999980334611586, "loss": 0.1173, "step": 499 }, { "epoch": 0.03232323232323232, "grad_norm": 0.1379678100347519, "learning_rate": 0.00019999978954995045, "loss": 0.1065, "step": 500 }, { "epoch": 0.032387878787878785, "grad_norm": 0.14016778767108917, "learning_rate": 0.0001999997752861191, "loss": 0.1152, "step": 501 }, { "epoch": 0.032452525252525254, "grad_norm": 0.13358506560325623, "learning_rate": 0.00019999976055462185, "loss": 0.1135, "step": 502 }, { "epoch": 0.032517171717171715, "grad_norm": 0.14606399834156036, "learning_rate": 0.0001999997453554588, "loss": 0.1066, "step": 503 }, { "epoch": 0.032581818181818184, "grad_norm": 0.14392736554145813, "learning_rate": 0.00019999972968863, "loss": 0.1052, "step": 504 }, { "epoch": 0.032646464646464646, "grad_norm": 0.14780963957309723, "learning_rate": 0.0001999997135541355, "loss": 0.1095, "step": 505 }, { "epoch": 0.032711111111111114, "grad_norm": 0.12461218237876892, "learning_rate": 
0.00019999969695197543, "loss": 0.1026, "step": 506 }, { "epoch": 0.032775757575757576, "grad_norm": 0.1564641147851944, "learning_rate": 0.0001999996798821498, "loss": 0.1276, "step": 507 }, { "epoch": 0.03284040404040404, "grad_norm": 0.1745409071445465, "learning_rate": 0.00019999966234465877, "loss": 0.1014, "step": 508 }, { "epoch": 0.03290505050505051, "grad_norm": 0.1282573640346527, "learning_rate": 0.00019999964433950235, "loss": 0.1099, "step": 509 }, { "epoch": 0.03296969696969697, "grad_norm": 0.12653900682926178, "learning_rate": 0.00019999962586668063, "loss": 0.106, "step": 510 }, { "epoch": 0.03303434343434344, "grad_norm": 0.13360770046710968, "learning_rate": 0.00019999960692619376, "loss": 0.1171, "step": 511 }, { "epoch": 0.0330989898989899, "grad_norm": 0.1208031103014946, "learning_rate": 0.00019999958751804178, "loss": 0.1115, "step": 512 }, { "epoch": 0.0330989898989899, "eval_bleu": 9.488414092761792, "eval_loss": 0.10857859253883362, "eval_runtime": 2.6232, "eval_samples_per_second": 12.199, "eval_steps_per_second": 1.525, "step": 512 }, { "epoch": 0.03316363636363636, "grad_norm": 0.13872727751731873, "learning_rate": 0.00019999956764222478, "loss": 0.1228, "step": 513 }, { "epoch": 0.03322828282828283, "grad_norm": 0.25268110632896423, "learning_rate": 0.00019999954729874286, "loss": 0.1478, "step": 514 }, { "epoch": 0.03329292929292929, "grad_norm": 0.13848553597927094, "learning_rate": 0.0001999995264875961, "loss": 0.1286, "step": 515 }, { "epoch": 0.03335757575757576, "grad_norm": 0.13639119267463684, "learning_rate": 0.00019999950520878463, "loss": 0.1246, "step": 516 }, { "epoch": 0.03342222222222222, "grad_norm": 0.12548309564590454, "learning_rate": 0.00019999948346230854, "loss": 0.1087, "step": 517 }, { "epoch": 0.03348686868686869, "grad_norm": 0.13334284722805023, "learning_rate": 0.00019999946124816794, "loss": 0.1077, "step": 518 }, { "epoch": 0.03355151515151515, "grad_norm": 0.12922510504722595, "learning_rate": 
0.0001999994385663629, "loss": 0.1034, "step": 519 }, { "epoch": 0.03361616161616161, "grad_norm": 0.15493009984493256, "learning_rate": 0.00019999941541689356, "loss": 0.1416, "step": 520 }, { "epoch": 0.03368080808080808, "grad_norm": 0.1641789972782135, "learning_rate": 0.00019999939179975997, "loss": 0.136, "step": 521 }, { "epoch": 0.033745454545454544, "grad_norm": 0.11691408604383469, "learning_rate": 0.00019999936771496231, "loss": 0.0844, "step": 522 }, { "epoch": 0.03381010101010101, "grad_norm": 0.13989783823490143, "learning_rate": 0.0001999993431625007, "loss": 0.1195, "step": 523 }, { "epoch": 0.033874747474747474, "grad_norm": 0.13525332510471344, "learning_rate": 0.00019999931814237515, "loss": 0.113, "step": 524 }, { "epoch": 0.03393939393939394, "grad_norm": 0.13060380518436432, "learning_rate": 0.0001999992926545859, "loss": 0.1094, "step": 525 }, { "epoch": 0.034004040404040405, "grad_norm": 0.14437176287174225, "learning_rate": 0.00019999926669913301, "loss": 0.1232, "step": 526 }, { "epoch": 0.034068686868686866, "grad_norm": 0.13507899641990662, "learning_rate": 0.0001999992402760166, "loss": 0.1078, "step": 527 }, { "epoch": 0.034133333333333335, "grad_norm": 0.13182367384433746, "learning_rate": 0.00019999921338523683, "loss": 0.1093, "step": 528 }, { "epoch": 0.034133333333333335, "eval_bleu": 10.682708795923162, "eval_loss": 0.10908666253089905, "eval_runtime": 2.9928, "eval_samples_per_second": 10.692, "eval_steps_per_second": 1.337, "step": 528 }, { "epoch": 0.0341979797979798, "grad_norm": 0.12510529160499573, "learning_rate": 0.00019999918602679376, "loss": 0.1053, "step": 529 }, { "epoch": 0.034262626262626265, "grad_norm": 0.12579026818275452, "learning_rate": 0.00019999915820068757, "loss": 0.1062, "step": 530 }, { "epoch": 0.03432727272727273, "grad_norm": 0.13125565648078918, "learning_rate": 0.0001999991299069184, "loss": 0.1243, "step": 531 }, { "epoch": 0.03439191919191919, "grad_norm": 0.13875770568847656, "learning_rate": 
0.0001999991011454863, "loss": 0.1348, "step": 532 }, { "epoch": 0.03445656565656566, "grad_norm": 0.14722971618175507, "learning_rate": 0.0001999990719163915, "loss": 0.1129, "step": 533 }, { "epoch": 0.03452121212121212, "grad_norm": 0.1381601244211197, "learning_rate": 0.00019999904221963411, "loss": 0.1251, "step": 534 }, { "epoch": 0.03458585858585859, "grad_norm": 0.13787423074245453, "learning_rate": 0.00019999901205521424, "loss": 0.1298, "step": 535 }, { "epoch": 0.03465050505050505, "grad_norm": 0.12954244017601013, "learning_rate": 0.00019999898142313206, "loss": 0.1048, "step": 536 }, { "epoch": 0.03471515151515152, "grad_norm": 0.14057686924934387, "learning_rate": 0.0001999989503233877, "loss": 0.117, "step": 537 }, { "epoch": 0.03477979797979798, "grad_norm": 0.12244465202093124, "learning_rate": 0.0001999989187559813, "loss": 0.1087, "step": 538 }, { "epoch": 0.03484444444444444, "grad_norm": 0.13722139596939087, "learning_rate": 0.00019999888672091304, "loss": 0.1226, "step": 539 }, { "epoch": 0.03490909090909091, "grad_norm": 0.13450995087623596, "learning_rate": 0.00019999885421818304, "loss": 0.1092, "step": 540 }, { "epoch": 0.03497373737373737, "grad_norm": 0.11251247674226761, "learning_rate": 0.0001999988212477914, "loss": 0.1046, "step": 541 }, { "epoch": 0.03503838383838384, "grad_norm": 0.11316199600696564, "learning_rate": 0.0001999987878097384, "loss": 0.1031, "step": 542 }, { "epoch": 0.0351030303030303, "grad_norm": 0.12963828444480896, "learning_rate": 0.0001999987539040241, "loss": 0.1065, "step": 543 }, { "epoch": 0.035167676767676764, "grad_norm": 0.13680677115917206, "learning_rate": 0.0001999987195306487, "loss": 0.1242, "step": 544 }, { "epoch": 0.035167676767676764, "eval_bleu": 9.798230940457769, "eval_loss": 0.10932165384292603, "eval_runtime": 2.6972, "eval_samples_per_second": 11.864, "eval_steps_per_second": 1.483, "step": 544 }, { "epoch": 0.03523232323232323, "grad_norm": 0.14490805566310883, "learning_rate": 
0.00019999868468961233, "loss": 0.1029, "step": 545 }, { "epoch": 0.035296969696969695, "grad_norm": 0.36747074127197266, "learning_rate": 0.0001999986493809152, "loss": 0.1511, "step": 546 }, { "epoch": 0.03536161616161616, "grad_norm": 0.11181472986936569, "learning_rate": 0.00019999861360455741, "loss": 0.0992, "step": 547 }, { "epoch": 0.035426262626262625, "grad_norm": 0.11787353456020355, "learning_rate": 0.00019999857736053918, "loss": 0.1168, "step": 548 }, { "epoch": 0.035490909090909094, "grad_norm": 0.11531051248311996, "learning_rate": 0.00019999854064886067, "loss": 0.1054, "step": 549 }, { "epoch": 0.035555555555555556, "grad_norm": 0.10686899721622467, "learning_rate": 0.00019999850346952205, "loss": 0.0853, "step": 550 }, { "epoch": 0.03562020202020202, "grad_norm": 0.12576760351657867, "learning_rate": 0.0001999984658225235, "loss": 0.1012, "step": 551 }, { "epoch": 0.035684848484848486, "grad_norm": 0.12727631628513336, "learning_rate": 0.00019999842770786512, "loss": 0.1039, "step": 552 }, { "epoch": 0.03574949494949495, "grad_norm": 0.150522381067276, "learning_rate": 0.0001999983891255472, "loss": 0.1128, "step": 553 }, { "epoch": 0.035814141414141416, "grad_norm": 0.11746193468570709, "learning_rate": 0.00019999835007556986, "loss": 0.0902, "step": 554 }, { "epoch": 0.03587878787878788, "grad_norm": 0.142499178647995, "learning_rate": 0.00019999831055793332, "loss": 0.1066, "step": 555 }, { "epoch": 0.03594343434343435, "grad_norm": 0.1304892897605896, "learning_rate": 0.0001999982705726377, "loss": 0.1136, "step": 556 }, { "epoch": 0.03600808080808081, "grad_norm": 0.13161268830299377, "learning_rate": 0.00019999823011968327, "loss": 0.1054, "step": 557 }, { "epoch": 0.03607272727272727, "grad_norm": 0.13755886256694794, "learning_rate": 0.00019999818919907015, "loss": 0.1147, "step": 558 }, { "epoch": 0.03613737373737374, "grad_norm": 0.11605346202850342, "learning_rate": 0.00019999814781079857, "loss": 0.0991, "step": 559 }, { "epoch": 
0.0362020202020202, "grad_norm": 0.18530824780464172, "learning_rate": 0.0001999981059548687, "loss": 0.1162, "step": 560 }, { "epoch": 0.0362020202020202, "eval_bleu": 11.287920881552383, "eval_loss": 0.10759274661540985, "eval_runtime": 2.7656, "eval_samples_per_second": 11.571, "eval_steps_per_second": 1.446, "step": 560 }, { "epoch": 0.03626666666666667, "grad_norm": 0.1463087946176529, "learning_rate": 0.00019999806363128075, "loss": 0.1459, "step": 561 }, { "epoch": 0.03633131313131313, "grad_norm": 0.12586872279644012, "learning_rate": 0.00019999802084003492, "loss": 0.1187, "step": 562 }, { "epoch": 0.03639595959595959, "grad_norm": 0.15147972106933594, "learning_rate": 0.0001999979775811314, "loss": 0.1236, "step": 563 }, { "epoch": 0.03646060606060606, "grad_norm": 0.11987727135419846, "learning_rate": 0.0001999979338545704, "loss": 0.1016, "step": 564 }, { "epoch": 0.03652525252525252, "grad_norm": 0.12765085697174072, "learning_rate": 0.00019999788966035213, "loss": 0.105, "step": 565 }, { "epoch": 0.03658989898989899, "grad_norm": 0.131104975938797, "learning_rate": 0.00019999784499847678, "loss": 0.1074, "step": 566 }, { "epoch": 0.036654545454545454, "grad_norm": 0.12428110837936401, "learning_rate": 0.00019999779986894456, "loss": 0.1101, "step": 567 }, { "epoch": 0.03671919191919192, "grad_norm": 0.13196514546871185, "learning_rate": 0.00019999775427175572, "loss": 0.1157, "step": 568 }, { "epoch": 0.036783838383838384, "grad_norm": 0.11181005835533142, "learning_rate": 0.0001999977082069104, "loss": 0.0908, "step": 569 }, { "epoch": 0.036848484848484846, "grad_norm": 0.1224859431385994, "learning_rate": 0.00019999766167440886, "loss": 0.1105, "step": 570 }, { "epoch": 0.036913131313131314, "grad_norm": 0.09960032999515533, "learning_rate": 0.00019999761467425135, "loss": 0.0892, "step": 571 }, { "epoch": 0.036977777777777776, "grad_norm": 0.12447663396596909, "learning_rate": 0.00019999756720643803, "loss": 0.1115, "step": 572 }, { "epoch": 
0.037042424242424245, "grad_norm": 0.12504985928535461, "learning_rate": 0.00019999751927096915, "loss": 0.1118, "step": 573 }, { "epoch": 0.037107070707070706, "grad_norm": 0.11158134788274765, "learning_rate": 0.00019999747086784492, "loss": 0.1056, "step": 574 }, { "epoch": 0.037171717171717175, "grad_norm": 0.11234010756015778, "learning_rate": 0.0001999974219970656, "loss": 0.1031, "step": 575 }, { "epoch": 0.03723636363636364, "grad_norm": 0.12932424247264862, "learning_rate": 0.0001999973726586314, "loss": 0.1098, "step": 576 }, { "epoch": 0.03723636363636364, "eval_bleu": 11.07863404655768, "eval_loss": 0.10550281405448914, "eval_runtime": 2.7115, "eval_samples_per_second": 11.802, "eval_steps_per_second": 1.475, "step": 576 }, { "epoch": 0.0373010101010101, "grad_norm": 0.13537174463272095, "learning_rate": 0.00019999732285254251, "loss": 0.1224, "step": 577 }, { "epoch": 0.03736565656565657, "grad_norm": 0.10807958990335464, "learning_rate": 0.00019999727257879923, "loss": 0.0932, "step": 578 }, { "epoch": 0.03743030303030303, "grad_norm": 0.09941566735506058, "learning_rate": 0.00019999722183740176, "loss": 0.0787, "step": 579 }, { "epoch": 0.0374949494949495, "grad_norm": 0.12469415366649628, "learning_rate": 0.00019999717062835033, "loss": 0.1173, "step": 580 }, { "epoch": 0.03755959595959596, "grad_norm": 0.12502068281173706, "learning_rate": 0.0001999971189516452, "loss": 0.0975, "step": 581 }, { "epoch": 0.03762424242424242, "grad_norm": 0.13000090420246124, "learning_rate": 0.00019999706680728663, "loss": 0.1208, "step": 582 }, { "epoch": 0.03768888888888889, "grad_norm": 0.1340045928955078, "learning_rate": 0.0001999970141952748, "loss": 0.117, "step": 583 }, { "epoch": 0.03775353535353535, "grad_norm": 0.11511174589395523, "learning_rate": 0.00019999696111561, "loss": 0.0847, "step": 584 }, { "epoch": 0.03781818181818182, "grad_norm": 0.12176530063152313, "learning_rate": 0.00019999690756829246, "loss": 0.0956, "step": 585 }, { "epoch": 
0.03788282828282828, "grad_norm": 0.1807343065738678, "learning_rate": 0.00019999685355332248, "loss": 0.1061, "step": 586 }, { "epoch": 0.03794747474747475, "grad_norm": 0.14554469287395477, "learning_rate": 0.00019999679907070023, "loss": 0.1322, "step": 587 }, { "epoch": 0.03801212121212121, "grad_norm": 0.13668417930603027, "learning_rate": 0.00019999674412042603, "loss": 0.1249, "step": 588 }, { "epoch": 0.038076767676767674, "grad_norm": 0.13603554666042328, "learning_rate": 0.0001999966887025001, "loss": 0.1169, "step": 589 }, { "epoch": 0.03814141414141414, "grad_norm": 0.13147543370723724, "learning_rate": 0.00019999663281692275, "loss": 0.1123, "step": 590 }, { "epoch": 0.038206060606060605, "grad_norm": 0.26900359988212585, "learning_rate": 0.0001999965764636942, "loss": 0.1253, "step": 591 }, { "epoch": 0.03827070707070707, "grad_norm": 0.1478128433227539, "learning_rate": 0.0001999965196428147, "loss": 0.1333, "step": 592 }, { "epoch": 0.03827070707070707, "eval_bleu": 14.099159337385808, "eval_loss": 0.10521923750638962, "eval_runtime": 2.7142, "eval_samples_per_second": 11.79, "eval_steps_per_second": 1.474, "step": 592 }, { "epoch": 0.038335353535353535, "grad_norm": 0.1270078718662262, "learning_rate": 0.00019999646235428452, "loss": 0.0976, "step": 593 }, { "epoch": 0.0384, "grad_norm": 0.138322114944458, "learning_rate": 0.000199996404598104, "loss": 0.1121, "step": 594 }, { "epoch": 0.038464646464646465, "grad_norm": 0.12432148307561874, "learning_rate": 0.0001999963463742733, "loss": 0.0971, "step": 595 }, { "epoch": 0.03852929292929293, "grad_norm": 0.12516441941261292, "learning_rate": 0.00019999628768279276, "loss": 0.1074, "step": 596 }, { "epoch": 0.038593939393939396, "grad_norm": 0.13323353230953217, "learning_rate": 0.00019999622852366267, "loss": 0.1144, "step": 597 }, { "epoch": 0.03865858585858586, "grad_norm": 0.11388403177261353, "learning_rate": 0.00019999616889688327, "loss": 0.1017, "step": 598 }, { "epoch": 
0.038723232323232326, "grad_norm": 0.13637560606002808, "learning_rate": 0.0001999961088024548, "loss": 0.1216, "step": 599 }, { "epoch": 0.03878787878787879, "grad_norm": 0.12641260027885437, "learning_rate": 0.00019999604824037762, "loss": 0.088, "step": 600 }, { "epoch": 0.03885252525252525, "grad_norm": 0.10515261441469193, "learning_rate": 0.00019999598721065197, "loss": 0.0901, "step": 601 }, { "epoch": 0.03891717171717172, "grad_norm": 0.1189311146736145, "learning_rate": 0.00019999592571327815, "loss": 0.1001, "step": 602 }, { "epoch": 0.03898181818181818, "grad_norm": 0.12847928702831268, "learning_rate": 0.00019999586374825644, "loss": 0.1073, "step": 603 }, { "epoch": 0.03904646464646465, "grad_norm": 0.13329587876796722, "learning_rate": 0.00019999580131558717, "loss": 0.1066, "step": 604 }, { "epoch": 0.03911111111111111, "grad_norm": 0.1321302056312561, "learning_rate": 0.00019999573841527054, "loss": 0.1219, "step": 605 }, { "epoch": 0.03917575757575758, "grad_norm": 0.15070821344852448, "learning_rate": 0.00019999567504730696, "loss": 0.1117, "step": 606 }, { "epoch": 0.03924040404040404, "grad_norm": 0.13868500292301178, "learning_rate": 0.0001999956112116966, "loss": 0.1273, "step": 607 }, { "epoch": 0.0393050505050505, "grad_norm": 0.13437072932720184, "learning_rate": 0.00019999554690843988, "loss": 0.1284, "step": 608 }, { "epoch": 0.0393050505050505, "eval_bleu": 11.33287803667904, "eval_loss": 0.10529904067516327, "eval_runtime": 2.6624, "eval_samples_per_second": 12.019, "eval_steps_per_second": 1.502, "step": 608 }, { "epoch": 0.03936969696969697, "grad_norm": 0.11254505813121796, "learning_rate": 0.00019999548213753702, "loss": 0.1005, "step": 609 }, { "epoch": 0.03943434343434343, "grad_norm": 0.1814531534910202, "learning_rate": 0.00019999541689898835, "loss": 0.1312, "step": 610 }, { "epoch": 0.0394989898989899, "grad_norm": 0.1422121226787567, "learning_rate": 0.00019999535119279415, "loss": 0.1048, "step": 611 }, { "epoch": 
0.03956363636363636, "grad_norm": 0.1460379958152771, "learning_rate": 0.0001999952850189548, "loss": 0.1276, "step": 612 }, { "epoch": 0.039628282828282825, "grad_norm": 0.13203667104244232, "learning_rate": 0.00019999521837747052, "loss": 0.1288, "step": 613 }, { "epoch": 0.039692929292929294, "grad_norm": 0.1199788972735405, "learning_rate": 0.00019999515126834167, "loss": 0.1122, "step": 614 }, { "epoch": 0.039757575757575755, "grad_norm": 0.10937829315662384, "learning_rate": 0.00019999508369156855, "loss": 0.1001, "step": 615 }, { "epoch": 0.039822222222222224, "grad_norm": 0.14558622241020203, "learning_rate": 0.0001999950156471515, "loss": 0.1505, "step": 616 }, { "epoch": 0.039886868686868686, "grad_norm": 0.11247015744447708, "learning_rate": 0.0001999949471350908, "loss": 0.1009, "step": 617 }, { "epoch": 0.039951515151515155, "grad_norm": 0.13116182386875153, "learning_rate": 0.0001999948781553868, "loss": 0.1002, "step": 618 }, { "epoch": 0.040016161616161616, "grad_norm": 0.11855407804250717, "learning_rate": 0.00019999480870803985, "loss": 0.1069, "step": 619 }, { "epoch": 0.04008080808080808, "grad_norm": 0.11876469105482101, "learning_rate": 0.00019999473879305017, "loss": 0.1206, "step": 620 }, { "epoch": 0.04014545454545455, "grad_norm": 0.1143917664885521, "learning_rate": 0.00019999466841041818, "loss": 0.1061, "step": 621 }, { "epoch": 0.04021010101010101, "grad_norm": 0.11280103027820587, "learning_rate": 0.00019999459756014419, "loss": 0.0898, "step": 622 }, { "epoch": 0.04027474747474748, "grad_norm": 0.14063239097595215, "learning_rate": 0.00019999452624222853, "loss": 0.1294, "step": 623 }, { "epoch": 0.04033939393939394, "grad_norm": 0.133212149143219, "learning_rate": 0.0001999944544566715, "loss": 0.1117, "step": 624 }, { "epoch": 0.04033939393939394, "eval_bleu": 10.425502094549062, "eval_loss": 0.10544434189796448, "eval_runtime": 2.8121, "eval_samples_per_second": 11.379, "eval_steps_per_second": 1.422, "step": 624 }, { "epoch": 
0.04040404040404041, "grad_norm": 0.1216183751821518, "learning_rate": 0.0001999943822034735, "loss": 0.0987, "step": 625 }, { "epoch": 0.04046868686868687, "grad_norm": 0.13168206810951233, "learning_rate": 0.00019999430948263483, "loss": 0.115, "step": 626 }, { "epoch": 0.04053333333333333, "grad_norm": 0.20381559431552887, "learning_rate": 0.00019999423629415582, "loss": 0.11, "step": 627 }, { "epoch": 0.0405979797979798, "grad_norm": 0.13594192266464233, "learning_rate": 0.0001999941626380368, "loss": 0.1132, "step": 628 }, { "epoch": 0.04066262626262626, "grad_norm": 0.1459536999464035, "learning_rate": 0.00019999408851427818, "loss": 0.1051, "step": 629 }, { "epoch": 0.04072727272727273, "grad_norm": 0.15588319301605225, "learning_rate": 0.00019999401392288023, "loss": 0.1216, "step": 630 }, { "epoch": 0.04079191919191919, "grad_norm": 0.14037740230560303, "learning_rate": 0.00019999393886384334, "loss": 0.1236, "step": 631 }, { "epoch": 0.040856565656565653, "grad_norm": 0.12617136538028717, "learning_rate": 0.00019999386333716788, "loss": 0.1009, "step": 632 }, { "epoch": 0.04092121212121212, "grad_norm": 0.1461799591779709, "learning_rate": 0.00019999378734285417, "loss": 0.1411, "step": 633 }, { "epoch": 0.040985858585858584, "grad_norm": 0.1203555092215538, "learning_rate": 0.00019999371088090255, "loss": 0.1049, "step": 634 }, { "epoch": 0.04105050505050505, "grad_norm": 0.1366298496723175, "learning_rate": 0.00019999363395131344, "loss": 0.1287, "step": 635 }, { "epoch": 0.041115151515151514, "grad_norm": 0.10302355140447617, "learning_rate": 0.00019999355655408714, "loss": 0.0862, "step": 636 }, { "epoch": 0.04117979797979798, "grad_norm": 0.1196269616484642, "learning_rate": 0.00019999347868922404, "loss": 0.1247, "step": 637 }, { "epoch": 0.041244444444444445, "grad_norm": 0.1320931762456894, "learning_rate": 0.00019999340035672448, "loss": 0.1124, "step": 638 }, { "epoch": 0.041309090909090906, "grad_norm": 0.12277361005544662, "learning_rate": 
0.00019999332155658885, "loss": 0.1057, "step": 639 }, { "epoch": 0.041373737373737375, "grad_norm": 0.12430281192064285, "learning_rate": 0.00019999324228881752, "loss": 0.1164, "step": 640 }, { "epoch": 0.041373737373737375, "eval_bleu": 11.288036767724059, "eval_loss": 0.10528124868869781, "eval_runtime": 2.5817, "eval_samples_per_second": 12.395, "eval_steps_per_second": 1.549, "step": 640 }, { "epoch": 0.04143838383838384, "grad_norm": 0.11824125796556473, "learning_rate": 0.00019999316255341084, "loss": 0.1076, "step": 641 }, { "epoch": 0.041503030303030305, "grad_norm": 0.10635879635810852, "learning_rate": 0.0001999930823503692, "loss": 0.0939, "step": 642 }, { "epoch": 0.04156767676767677, "grad_norm": 0.1288403421640396, "learning_rate": 0.000199993001679693, "loss": 0.1094, "step": 643 }, { "epoch": 0.04163232323232323, "grad_norm": 0.13538114726543427, "learning_rate": 0.00019999292054138253, "loss": 0.1249, "step": 644 }, { "epoch": 0.0416969696969697, "grad_norm": 0.1185263842344284, "learning_rate": 0.00019999283893543828, "loss": 0.1004, "step": 645 }, { "epoch": 0.04176161616161616, "grad_norm": 0.1532231867313385, "learning_rate": 0.00019999275686186056, "loss": 0.1178, "step": 646 }, { "epoch": 0.04182626262626263, "grad_norm": 0.10361472517251968, "learning_rate": 0.0001999926743206498, "loss": 0.0738, "step": 647 }, { "epoch": 0.04189090909090909, "grad_norm": 0.13335101306438446, "learning_rate": 0.00019999259131180631, "loss": 0.1221, "step": 648 }, { "epoch": 0.04195555555555556, "grad_norm": 0.13752682507038116, "learning_rate": 0.00019999250783533056, "loss": 0.1448, "step": 649 }, { "epoch": 0.04202020202020202, "grad_norm": 0.11338730156421661, "learning_rate": 0.0001999924238912229, "loss": 0.099, "step": 650 }, { "epoch": 0.04208484848484848, "grad_norm": 0.15735499560832977, "learning_rate": 0.00019999233947948371, "loss": 0.1045, "step": 651 }, { "epoch": 0.04214949494949495, "grad_norm": 0.12788867950439453, "learning_rate": 
0.00019999225460011344, "loss": 0.1246, "step": 652 }, { "epoch": 0.04221414141414141, "grad_norm": 0.12864696979522705, "learning_rate": 0.00019999216925311244, "loss": 0.113, "step": 653 }, { "epoch": 0.04227878787878788, "grad_norm": 0.11846766620874405, "learning_rate": 0.00019999208343848113, "loss": 0.0895, "step": 654 }, { "epoch": 0.04234343434343434, "grad_norm": 0.1231776550412178, "learning_rate": 0.00019999199715621988, "loss": 0.101, "step": 655 }, { "epoch": 0.04240808080808081, "grad_norm": 0.1102028340101242, "learning_rate": 0.00019999191040632913, "loss": 0.099, "step": 656 }, { "epoch": 0.04240808080808081, "eval_bleu": 12.270452227963574, "eval_loss": 0.10357742756605148, "eval_runtime": 2.8138, "eval_samples_per_second": 11.373, "eval_steps_per_second": 1.422, "step": 656 }, { "epoch": 0.04247272727272727, "grad_norm": 0.12100549042224884, "learning_rate": 0.00019999182318880928, "loss": 0.1089, "step": 657 }, { "epoch": 0.042537373737373735, "grad_norm": 0.13474537432193756, "learning_rate": 0.0001999917355036607, "loss": 0.1083, "step": 658 }, { "epoch": 0.042602020202020204, "grad_norm": 0.10728071630001068, "learning_rate": 0.00019999164735088384, "loss": 0.1059, "step": 659 }, { "epoch": 0.042666666666666665, "grad_norm": 0.10053954273462296, "learning_rate": 0.00019999155873047912, "loss": 0.0937, "step": 660 }, { "epoch": 0.042731313131313134, "grad_norm": 0.1097431629896164, "learning_rate": 0.00019999146964244692, "loss": 0.1004, "step": 661 }, { "epoch": 0.042795959595959596, "grad_norm": 0.1231168583035469, "learning_rate": 0.00019999138008678768, "loss": 0.0999, "step": 662 }, { "epoch": 0.04286060606060606, "grad_norm": 0.11319750547409058, "learning_rate": 0.0001999912900635018, "loss": 0.0939, "step": 663 }, { "epoch": 0.042925252525252526, "grad_norm": 0.1253257542848587, "learning_rate": 0.00019999119957258974, "loss": 0.1108, "step": 664 }, { "epoch": 0.04298989898989899, "grad_norm": 0.15099355578422546, "learning_rate": 
0.0001999911086140519, "loss": 0.121, "step": 665 }, { "epoch": 0.043054545454545456, "grad_norm": 0.0973721593618393, "learning_rate": 0.00019999101718788868, "loss": 0.0905, "step": 666 }, { "epoch": 0.04311919191919192, "grad_norm": 0.12029827386140823, "learning_rate": 0.0001999909252941005, "loss": 0.1145, "step": 667 }, { "epoch": 0.04318383838383839, "grad_norm": 0.12910278141498566, "learning_rate": 0.00019999083293268784, "loss": 0.1043, "step": 668 }, { "epoch": 0.04324848484848485, "grad_norm": 0.16909056901931763, "learning_rate": 0.00019999074010365115, "loss": 0.1041, "step": 669 }, { "epoch": 0.04331313131313131, "grad_norm": 0.11273932456970215, "learning_rate": 0.0001999906468069908, "loss": 0.0946, "step": 670 }, { "epoch": 0.04337777777777778, "grad_norm": 0.1234969049692154, "learning_rate": 0.0001999905530427072, "loss": 0.0982, "step": 671 }, { "epoch": 0.04344242424242424, "grad_norm": 0.14719463884830475, "learning_rate": 0.00019999045881080092, "loss": 0.1332, "step": 672 }, { "epoch": 0.04344242424242424, "eval_bleu": 13.043288789593035, "eval_loss": 0.10298320651054382, "eval_runtime": 2.6171, "eval_samples_per_second": 12.227, "eval_steps_per_second": 1.528, "step": 672 }, { "epoch": 0.04350707070707071, "grad_norm": 0.1803755909204483, "learning_rate": 0.0001999903641112723, "loss": 0.132, "step": 673 }, { "epoch": 0.04357171717171717, "grad_norm": 0.11416936665773392, "learning_rate": 0.00019999026894412176, "loss": 0.104, "step": 674 }, { "epoch": 0.04363636363636364, "grad_norm": 0.1184062734246254, "learning_rate": 0.00019999017330934985, "loss": 0.1008, "step": 675 }, { "epoch": 0.0437010101010101, "grad_norm": 0.11440252512693405, "learning_rate": 0.0001999900772069569, "loss": 0.1008, "step": 676 }, { "epoch": 0.04376565656565656, "grad_norm": 0.13120384514331818, "learning_rate": 0.00019998998063694345, "loss": 0.0986, "step": 677 }, { "epoch": 0.04383030303030303, "grad_norm": 0.11450443416833878, "learning_rate": 
0.00019998988359930988, "loss": 0.1069, "step": 678 }, { "epoch": 0.043894949494949494, "grad_norm": 0.10805507004261017, "learning_rate": 0.0001999897860940567, "loss": 0.0884, "step": 679 }, { "epoch": 0.04395959595959596, "grad_norm": 0.17670485377311707, "learning_rate": 0.00019998968812118438, "loss": 0.1046, "step": 680 }, { "epoch": 0.044024242424242424, "grad_norm": 0.13816799223423004, "learning_rate": 0.0001999895896806933, "loss": 0.1196, "step": 681 }, { "epoch": 0.044088888888888886, "grad_norm": 0.11267176270484924, "learning_rate": 0.00019998949077258398, "loss": 0.1019, "step": 682 }, { "epoch": 0.044153535353535354, "grad_norm": 0.14338566362857819, "learning_rate": 0.00019998939139685687, "loss": 0.1143, "step": 683 }, { "epoch": 0.044218181818181816, "grad_norm": 0.10855672508478165, "learning_rate": 0.00019998929155351242, "loss": 0.0893, "step": 684 }, { "epoch": 0.044282828282828285, "grad_norm": 0.11952214688062668, "learning_rate": 0.00019998919124255115, "loss": 0.1113, "step": 685 }, { "epoch": 0.04434747474747475, "grad_norm": 0.1171790063381195, "learning_rate": 0.00019998909046397344, "loss": 0.0988, "step": 686 }, { "epoch": 0.044412121212121215, "grad_norm": 0.11785004287958145, "learning_rate": 0.00019998898921777983, "loss": 0.1082, "step": 687 }, { "epoch": 0.04447676767676768, "grad_norm": 0.10744521021842957, "learning_rate": 0.00019998888750397077, "loss": 0.0962, "step": 688 }, { "epoch": 0.04447676767676768, "eval_bleu": 12.487478905348521, "eval_loss": 0.10292381793260574, "eval_runtime": 2.7172, "eval_samples_per_second": 11.777, "eval_steps_per_second": 1.472, "step": 688 }, { "epoch": 0.04454141414141414, "grad_norm": 0.12902088463306427, "learning_rate": 0.00019998878532254675, "loss": 0.1222, "step": 689 }, { "epoch": 0.04460606060606061, "grad_norm": 0.11698608845472336, "learning_rate": 0.0001999886826735082, "loss": 0.1087, "step": 690 }, { "epoch": 0.04467070707070707, "grad_norm": 0.11035232245922089, 
"learning_rate": 0.00019998857955685567, "loss": 0.111, "step": 691 }, { "epoch": 0.04473535353535354, "grad_norm": 0.11041948199272156, "learning_rate": 0.0001999884759725896, "loss": 0.1182, "step": 692 }, { "epoch": 0.0448, "grad_norm": 0.15364226698875427, "learning_rate": 0.0001999883719207105, "loss": 0.1015, "step": 693 }, { "epoch": 0.04486464646464646, "grad_norm": 0.11859642714262009, "learning_rate": 0.00019998826740121883, "loss": 0.0975, "step": 694 }, { "epoch": 0.04492929292929293, "grad_norm": 0.14098550379276276, "learning_rate": 0.00019998816241411507, "loss": 0.1169, "step": 695 }, { "epoch": 0.04499393939393939, "grad_norm": 0.11364419013261795, "learning_rate": 0.00019998805695939975, "loss": 0.1002, "step": 696 }, { "epoch": 0.04505858585858586, "grad_norm": 0.11612164229154587, "learning_rate": 0.00019998795103707333, "loss": 0.1018, "step": 697 }, { "epoch": 0.04512323232323232, "grad_norm": 0.13067692518234253, "learning_rate": 0.00019998784464713633, "loss": 0.1159, "step": 698 }, { "epoch": 0.04518787878787879, "grad_norm": 0.11234342306852341, "learning_rate": 0.0001999877377895892, "loss": 0.1024, "step": 699 }, { "epoch": 0.04525252525252525, "grad_norm": 0.11350338906049728, "learning_rate": 0.00019998763046443253, "loss": 0.1048, "step": 700 }, { "epoch": 0.045317171717171714, "grad_norm": 0.12426406145095825, "learning_rate": 0.00019998752267166677, "loss": 0.1105, "step": 701 }, { "epoch": 0.04538181818181818, "grad_norm": 0.12319961935281754, "learning_rate": 0.00019998741441129236, "loss": 0.1108, "step": 702 }, { "epoch": 0.045446464646464645, "grad_norm": 0.13607670366764069, "learning_rate": 0.00019998730568330993, "loss": 0.1106, "step": 703 }, { "epoch": 0.04551111111111111, "grad_norm": 0.1059730127453804, "learning_rate": 0.0001999871964877199, "loss": 0.0925, "step": 704 }, { "epoch": 0.04551111111111111, "eval_bleu": 12.392402411338868, "eval_loss": 0.10216230154037476, "eval_runtime": 2.6054, "eval_samples_per_second": 
12.282, "eval_steps_per_second": 1.535, "step": 704 }, { "epoch": 0.045575757575757575, "grad_norm": 0.10643964260816574, "learning_rate": 0.00019998708682452277, "loss": 0.0972, "step": 705 }, { "epoch": 0.045640404040404044, "grad_norm": 0.14168602228164673, "learning_rate": 0.00019998697669371915, "loss": 0.1239, "step": 706 }, { "epoch": 0.045705050505050505, "grad_norm": 0.12365109473466873, "learning_rate": 0.0001999868660953095, "loss": 0.1113, "step": 707 }, { "epoch": 0.04576969696969697, "grad_norm": 0.14012353122234344, "learning_rate": 0.0001999867550292943, "loss": 0.1286, "step": 708 }, { "epoch": 0.045834343434343436, "grad_norm": 0.17314791679382324, "learning_rate": 0.0001999866434956741, "loss": 0.1505, "step": 709 }, { "epoch": 0.0458989898989899, "grad_norm": 0.12496153265237808, "learning_rate": 0.00019998653149444942, "loss": 0.1042, "step": 710 }, { "epoch": 0.045963636363636366, "grad_norm": 0.12631718814373016, "learning_rate": 0.0001999864190256208, "loss": 0.1138, "step": 711 }, { "epoch": 0.04602828282828283, "grad_norm": 0.10696897655725479, "learning_rate": 0.00019998630608918875, "loss": 0.1017, "step": 712 }, { "epoch": 0.04609292929292929, "grad_norm": 0.12372178584337234, "learning_rate": 0.00019998619268515378, "loss": 0.1163, "step": 713 }, { "epoch": 0.04615757575757576, "grad_norm": 0.11347978562116623, "learning_rate": 0.00019998607881351648, "loss": 0.1012, "step": 714 }, { "epoch": 0.04622222222222222, "grad_norm": 0.1179923266172409, "learning_rate": 0.00019998596447427734, "loss": 0.0999, "step": 715 }, { "epoch": 0.04628686868686869, "grad_norm": 0.10583227127790451, "learning_rate": 0.00019998584966743688, "loss": 0.1054, "step": 716 }, { "epoch": 0.04635151515151515, "grad_norm": 0.11413531750440598, "learning_rate": 0.00019998573439299565, "loss": 0.1194, "step": 717 }, { "epoch": 0.04641616161616162, "grad_norm": 0.11270076781511307, "learning_rate": 0.0001999856186509542, "loss": 0.1122, "step": 718 }, { "epoch": 
0.04648080808080808, "grad_norm": 0.10701815783977509, "learning_rate": 0.0001999855024413131, "loss": 0.0987, "step": 719 }, { "epoch": 0.04654545454545454, "grad_norm": 0.11094026267528534, "learning_rate": 0.00019998538576407283, "loss": 0.1103, "step": 720 }, { "epoch": 0.04654545454545454, "eval_bleu": 14.192791910220867, "eval_loss": 0.103324294090271, "eval_runtime": 2.799, "eval_samples_per_second": 11.433, "eval_steps_per_second": 1.429, "step": 720 }, { "epoch": 0.04661010101010101, "grad_norm": 0.12587034702301025, "learning_rate": 0.00019998526861923397, "loss": 0.121, "step": 721 }, { "epoch": 0.04667474747474747, "grad_norm": 0.1284613162279129, "learning_rate": 0.00019998515100679706, "loss": 0.0983, "step": 722 }, { "epoch": 0.04673939393939394, "grad_norm": 0.09873173385858536, "learning_rate": 0.00019998503292676265, "loss": 0.085, "step": 723 }, { "epoch": 0.0468040404040404, "grad_norm": 0.10213294625282288, "learning_rate": 0.0001999849143791313, "loss": 0.0955, "step": 724 }, { "epoch": 0.04686868686868687, "grad_norm": 0.13524073362350464, "learning_rate": 0.00019998479536390356, "loss": 0.1133, "step": 725 }, { "epoch": 0.046933333333333334, "grad_norm": 0.12266495078802109, "learning_rate": 0.00019998467588107997, "loss": 0.1119, "step": 726 }, { "epoch": 0.046997979797979796, "grad_norm": 0.1434646099805832, "learning_rate": 0.00019998455593066115, "loss": 0.1013, "step": 727 }, { "epoch": 0.047062626262626264, "grad_norm": 0.15254801511764526, "learning_rate": 0.0001999844355126476, "loss": 0.0967, "step": 728 }, { "epoch": 0.047127272727272726, "grad_norm": 0.1079547330737114, "learning_rate": 0.00019998431462703986, "loss": 0.097, "step": 729 }, { "epoch": 0.047191919191919195, "grad_norm": 0.1393647938966751, "learning_rate": 0.00019998419327383856, "loss": 0.1102, "step": 730 }, { "epoch": 0.047256565656565656, "grad_norm": 0.1209694892168045, "learning_rate": 0.00019998407145304422, "loss": 0.1135, "step": 731 }, { "epoch": 
0.04732121212121212, "grad_norm": 0.14806681871414185, "learning_rate": 0.00019998394916465747, "loss": 0.1047, "step": 732 }, { "epoch": 0.04738585858585859, "grad_norm": 0.12639784812927246, "learning_rate": 0.00019998382640867886, "loss": 0.1171, "step": 733 }, { "epoch": 0.04745050505050505, "grad_norm": 0.12218527495861053, "learning_rate": 0.0001999837031851089, "loss": 0.1011, "step": 734 }, { "epoch": 0.04751515151515152, "grad_norm": 0.10220067948102951, "learning_rate": 0.00019998357949394823, "loss": 0.0935, "step": 735 }, { "epoch": 0.04757979797979798, "grad_norm": 0.11254847049713135, "learning_rate": 0.0001999834553351974, "loss": 0.093, "step": 736 }, { "epoch": 0.04757979797979798, "eval_bleu": 11.49276421723662, "eval_loss": 0.10434786975383759, "eval_runtime": 2.606, "eval_samples_per_second": 12.279, "eval_steps_per_second": 1.535, "step": 736 }, { "epoch": 0.04764444444444445, "grad_norm": 0.12570251524448395, "learning_rate": 0.00019998333070885704, "loss": 0.09, "step": 737 }, { "epoch": 0.04770909090909091, "grad_norm": 0.12122748792171478, "learning_rate": 0.00019998320561492766, "loss": 0.0916, "step": 738 }, { "epoch": 0.04777373737373737, "grad_norm": 0.15788482129573822, "learning_rate": 0.00019998308005340988, "loss": 0.1126, "step": 739 }, { "epoch": 0.04783838383838384, "grad_norm": 0.13124330341815948, "learning_rate": 0.00019998295402430432, "loss": 0.1164, "step": 740 }, { "epoch": 0.0479030303030303, "grad_norm": 0.11488284915685654, "learning_rate": 0.00019998282752761154, "loss": 0.0928, "step": 741 }, { "epoch": 0.04796767676767677, "grad_norm": 0.13330914080142975, "learning_rate": 0.0001999827005633321, "loss": 0.1232, "step": 742 }, { "epoch": 0.04803232323232323, "grad_norm": 0.10554460436105728, "learning_rate": 0.00019998257313146663, "loss": 0.0868, "step": 743 }, { "epoch": 0.048096969696969694, "grad_norm": 0.11329661309719086, "learning_rate": 0.00019998244523201572, "loss": 0.1064, "step": 744 }, { "epoch": 
0.04816161616161616, "grad_norm": 0.12269411981105804, "learning_rate": 0.00019998231686497997, "loss": 0.107, "step": 745 }, { "epoch": 0.048226262626262624, "grad_norm": 0.12911410629749298, "learning_rate": 0.00019998218803035997, "loss": 0.1201, "step": 746 }, { "epoch": 0.04829090909090909, "grad_norm": 0.12091823667287827, "learning_rate": 0.00019998205872815634, "loss": 0.1144, "step": 747 }, { "epoch": 0.048355555555555554, "grad_norm": 0.11663118749856949, "learning_rate": 0.00019998192895836968, "loss": 0.1089, "step": 748 }, { "epoch": 0.04842020202020202, "grad_norm": 0.1018366813659668, "learning_rate": 0.0001999817987210006, "loss": 0.089, "step": 749 }, { "epoch": 0.048484848484848485, "grad_norm": 0.10414136946201324, "learning_rate": 0.00019998166801604966, "loss": 0.0902, "step": 750 }, { "epoch": 0.04854949494949495, "grad_norm": 0.1267605870962143, "learning_rate": 0.00019998153684351754, "loss": 0.1109, "step": 751 }, { "epoch": 0.048614141414141415, "grad_norm": 0.13840921223163605, "learning_rate": 0.0001999814052034048, "loss": 0.1052, "step": 752 }, { "epoch": 0.048614141414141415, "eval_bleu": 12.124744245468259, "eval_loss": 0.10355029255151749, "eval_runtime": 2.7658, "eval_samples_per_second": 11.57, "eval_steps_per_second": 1.446, "step": 752 }, { "epoch": 0.04867878787878788, "grad_norm": 0.11638985574245453, "learning_rate": 0.0001999812730957121, "loss": 0.1025, "step": 753 }, { "epoch": 0.048743434343434346, "grad_norm": 0.16252492368221283, "learning_rate": 0.00019998114052044005, "loss": 0.1246, "step": 754 }, { "epoch": 0.04880808080808081, "grad_norm": 0.11385884881019592, "learning_rate": 0.00019998100747758925, "loss": 0.0987, "step": 755 }, { "epoch": 0.048872727272727276, "grad_norm": 0.12251380831003189, "learning_rate": 0.00019998087396716035, "loss": 0.1089, "step": 756 }, { "epoch": 0.04893737373737374, "grad_norm": 0.10436839610338211, "learning_rate": 0.00019998073998915393, "loss": 0.0919, "step": 757 }, { "epoch": 
0.0490020202020202, "grad_norm": 0.12865278124809265, "learning_rate": 0.00019998060554357063, "loss": 0.1078, "step": 758 }, { "epoch": 0.04906666666666667, "grad_norm": 0.1327090561389923, "learning_rate": 0.00019998047063041115, "loss": 0.1163, "step": 759 }, { "epoch": 0.04913131313131313, "grad_norm": 0.11831310391426086, "learning_rate": 0.000199980335249676, "loss": 0.1122, "step": 760 }, { "epoch": 0.0491959595959596, "grad_norm": 0.10785401612520218, "learning_rate": 0.0001999801994013659, "loss": 0.0972, "step": 761 }, { "epoch": 0.04926060606060606, "grad_norm": 0.11466260999441147, "learning_rate": 0.00019998006308548144, "loss": 0.0958, "step": 762 }, { "epoch": 0.04932525252525252, "grad_norm": 0.11845093220472336, "learning_rate": 0.00019997992630202332, "loss": 0.1237, "step": 763 }, { "epoch": 0.04938989898989899, "grad_norm": 0.11316904425621033, "learning_rate": 0.0001999797890509921, "loss": 0.1053, "step": 764 }, { "epoch": 0.04945454545454545, "grad_norm": 0.11070220172405243, "learning_rate": 0.00019997965133238847, "loss": 0.1165, "step": 765 }, { "epoch": 0.04951919191919192, "grad_norm": 0.11167492717504501, "learning_rate": 0.00019997951314621305, "loss": 0.1135, "step": 766 }, { "epoch": 0.04958383838383838, "grad_norm": 0.13088814914226532, "learning_rate": 0.00019997937449246653, "loss": 0.129, "step": 767 }, { "epoch": 0.04964848484848485, "grad_norm": 0.11039507389068604, "learning_rate": 0.00019997923537114952, "loss": 0.1092, "step": 768 }, { "epoch": 0.04964848484848485, "eval_bleu": 11.15731612475645, "eval_loss": 0.10469337552785873, "eval_runtime": 2.5966, "eval_samples_per_second": 12.324, "eval_steps_per_second": 1.54, "step": 768 }, { "epoch": 0.04971313131313131, "grad_norm": 0.10391898453235626, "learning_rate": 0.00019997909578226266, "loss": 0.1037, "step": 769 }, { "epoch": 0.049777777777777775, "grad_norm": 0.1021718829870224, "learning_rate": 0.00019997895572580662, "loss": 0.0969, "step": 770 }, { "epoch": 
0.049842424242424244, "grad_norm": 0.11407826095819473, "learning_rate": 0.00019997881520178207, "loss": 0.1031, "step": 771 }, { "epoch": 0.049907070707070705, "grad_norm": 0.1182626411318779, "learning_rate": 0.00019997867421018967, "loss": 0.1051, "step": 772 }, { "epoch": 0.049971717171717174, "grad_norm": 0.13272640109062195, "learning_rate": 0.00019997853275103005, "loss": 0.1205, "step": 773 }, { "epoch": 0.050036363636363636, "grad_norm": 0.12423010170459747, "learning_rate": 0.00019997839082430384, "loss": 0.1053, "step": 774 }, { "epoch": 0.050101010101010104, "grad_norm": 0.11508378386497498, "learning_rate": 0.0001999782484300118, "loss": 0.0813, "step": 775 }, { "epoch": 0.050165656565656566, "grad_norm": 0.11342547833919525, "learning_rate": 0.00019997810556815455, "loss": 0.1069, "step": 776 }, { "epoch": 0.05023030303030303, "grad_norm": 0.1191805750131607, "learning_rate": 0.00019997796223873273, "loss": 0.094, "step": 777 }, { "epoch": 0.0502949494949495, "grad_norm": 0.11147759854793549, "learning_rate": 0.00019997781844174705, "loss": 0.0943, "step": 778 }, { "epoch": 0.05035959595959596, "grad_norm": 0.11758267134428024, "learning_rate": 0.00019997767417719814, "loss": 0.106, "step": 779 }, { "epoch": 0.05042424242424243, "grad_norm": 0.11845671385526657, "learning_rate": 0.00019997752944508673, "loss": 0.102, "step": 780 }, { "epoch": 0.05048888888888889, "grad_norm": 0.12012017518281937, "learning_rate": 0.00019997738424541343, "loss": 0.1097, "step": 781 }, { "epoch": 0.05055353535353535, "grad_norm": 0.1275402158498764, "learning_rate": 0.000199977238578179, "loss": 0.1004, "step": 782 }, { "epoch": 0.05061818181818182, "grad_norm": 0.13106386363506317, "learning_rate": 0.00019997709244338403, "loss": 0.1073, "step": 783 }, { "epoch": 0.05068282828282828, "grad_norm": 0.10580660402774811, "learning_rate": 0.00019997694584102926, "loss": 0.0877, "step": 784 }, { "epoch": 0.05068282828282828, "eval_bleu": 14.201445493145247, "eval_loss": 
0.10319557785987854, "eval_runtime": 2.6913, "eval_samples_per_second": 11.89, "eval_steps_per_second": 1.486, "step": 784 }, { "epoch": 0.05074747474747475, "grad_norm": 0.12241457402706146, "learning_rate": 0.0001999767987711154, "loss": 0.1254, "step": 785 }, { "epoch": 0.05081212121212121, "grad_norm": 0.13357685506343842, "learning_rate": 0.0001999766512336431, "loss": 0.1279, "step": 786 }, { "epoch": 0.05087676767676768, "grad_norm": 0.09599798172712326, "learning_rate": 0.000199976503228613, "loss": 0.0891, "step": 787 }, { "epoch": 0.05094141414141414, "grad_norm": 0.11776507645845413, "learning_rate": 0.0001999763547560259, "loss": 0.1192, "step": 788 }, { "epoch": 0.0510060606060606, "grad_norm": 0.11095654964447021, "learning_rate": 0.00019997620581588241, "loss": 0.0827, "step": 789 }, { "epoch": 0.05107070707070707, "grad_norm": 0.11389493942260742, "learning_rate": 0.00019997605640818326, "loss": 0.1184, "step": 790 }, { "epoch": 0.051135353535353534, "grad_norm": 0.12444175779819489, "learning_rate": 0.00019997590653292918, "loss": 0.1104, "step": 791 }, { "epoch": 0.0512, "grad_norm": 0.10522261261940002, "learning_rate": 0.00019997575619012082, "loss": 0.0849, "step": 792 }, { "epoch": 0.051264646464646464, "grad_norm": 0.13184869289398193, "learning_rate": 0.00019997560537975888, "loss": 0.1221, "step": 793 }, { "epoch": 0.051329292929292926, "grad_norm": 0.13569733500480652, "learning_rate": 0.0001999754541018441, "loss": 0.1338, "step": 794 }, { "epoch": 0.051393939393939395, "grad_norm": 0.12633666396141052, "learning_rate": 0.00019997530235637715, "loss": 0.126, "step": 795 }, { "epoch": 0.051458585858585856, "grad_norm": 0.10019071400165558, "learning_rate": 0.0001999751501433588, "loss": 0.0983, "step": 796 }, { "epoch": 0.051523232323232325, "grad_norm": 0.1268041431903839, "learning_rate": 0.0001999749974627897, "loss": 0.1249, "step": 797 }, { "epoch": 0.05158787878787879, "grad_norm": 0.11632261425256729, "learning_rate": 
0.0001999748443146706, "loss": 0.1071, "step": 798 }, { "epoch": 0.051652525252525255, "grad_norm": 0.11214227974414825, "learning_rate": 0.00019997469069900218, "loss": 0.1137, "step": 799 }, { "epoch": 0.05171717171717172, "grad_norm": 0.10913074761629105, "learning_rate": 0.0001999745366157852, "loss": 0.1053, "step": 800 }, { "epoch": 0.05171717171717172, "eval_bleu": 13.90970352692978, "eval_loss": 0.10533279925584793, "eval_runtime": 2.5881, "eval_samples_per_second": 12.364, "eval_steps_per_second": 1.546, "step": 800 }, { "epoch": 0.05178181818181818, "grad_norm": 0.10523483157157898, "learning_rate": 0.00019997438206502036, "loss": 0.1066, "step": 801 }, { "epoch": 0.05184646464646465, "grad_norm": 0.10975678265094757, "learning_rate": 0.00019997422704670837, "loss": 0.1183, "step": 802 }, { "epoch": 0.05191111111111111, "grad_norm": 0.11845553666353226, "learning_rate": 0.00019997407156085003, "loss": 0.1073, "step": 803 }, { "epoch": 0.05197575757575758, "grad_norm": 0.13536454737186432, "learning_rate": 0.00019997391560744597, "loss": 0.1044, "step": 804 }, { "epoch": 0.05204040404040404, "grad_norm": 0.11444460600614548, "learning_rate": 0.00019997375918649692, "loss": 0.0978, "step": 805 }, { "epoch": 0.05210505050505051, "grad_norm": 0.0963529422879219, "learning_rate": 0.0001999736022980037, "loss": 0.0892, "step": 806 }, { "epoch": 0.05216969696969697, "grad_norm": 0.09845588356256485, "learning_rate": 0.00019997344494196697, "loss": 0.0776, "step": 807 }, { "epoch": 0.05223434343434343, "grad_norm": 0.12654970586299896, "learning_rate": 0.0001999732871183875, "loss": 0.1017, "step": 808 }, { "epoch": 0.0522989898989899, "grad_norm": 0.12078473716974258, "learning_rate": 0.000199973128827266, "loss": 0.1066, "step": 809 }, { "epoch": 0.05236363636363636, "grad_norm": 0.0995335727930069, "learning_rate": 0.00019997297006860325, "loss": 0.089, "step": 810 }, { "epoch": 0.05242828282828283, "grad_norm": 0.10869870334863663, "learning_rate": 
0.00019997281084239993, "loss": 0.1024, "step": 811 }, { "epoch": 0.05249292929292929, "grad_norm": 0.10635609179735184, "learning_rate": 0.00019997265114865686, "loss": 0.0859, "step": 812 }, { "epoch": 0.052557575757575754, "grad_norm": 0.13891276717185974, "learning_rate": 0.00019997249098737476, "loss": 0.124, "step": 813 }, { "epoch": 0.05262222222222222, "grad_norm": 0.12540379166603088, "learning_rate": 0.00019997233035855434, "loss": 0.1067, "step": 814 }, { "epoch": 0.052686868686868685, "grad_norm": 0.1086144894361496, "learning_rate": 0.00019997216926219638, "loss": 0.0948, "step": 815 }, { "epoch": 0.05275151515151515, "grad_norm": 0.11935828626155853, "learning_rate": 0.0001999720076983017, "loss": 0.107, "step": 816 }, { "epoch": 0.05275151515151515, "eval_bleu": 12.039794100997185, "eval_loss": 0.10444250702857971, "eval_runtime": 2.72, "eval_samples_per_second": 11.765, "eval_steps_per_second": 1.471, "step": 816 }, { "epoch": 0.052816161616161615, "grad_norm": 0.11445945501327515, "learning_rate": 0.00019997184566687094, "loss": 0.1088, "step": 817 }, { "epoch": 0.052880808080808084, "grad_norm": 0.11164270341396332, "learning_rate": 0.00019997168316790486, "loss": 0.1012, "step": 818 }, { "epoch": 0.052945454545454546, "grad_norm": 0.10718589276075363, "learning_rate": 0.00019997152020140432, "loss": 0.1003, "step": 819 }, { "epoch": 0.05301010101010101, "grad_norm": 0.12261474877595901, "learning_rate": 0.00019997135676737005, "loss": 0.1371, "step": 820 }, { "epoch": 0.053074747474747476, "grad_norm": 0.11439112573862076, "learning_rate": 0.00019997119286580274, "loss": 0.1111, "step": 821 }, { "epoch": 0.05313939393939394, "grad_norm": 0.2269362509250641, "learning_rate": 0.00019997102849670325, "loss": 0.1125, "step": 822 }, { "epoch": 0.053204040404040406, "grad_norm": 0.09734158217906952, "learning_rate": 0.0001999708636600723, "loss": 0.0942, "step": 823 }, { "epoch": 0.05326868686868687, "grad_norm": 0.11116871237754822, "learning_rate": 
0.00019997069835591067, "loss": 0.1077, "step": 824 }, { "epoch": 0.05333333333333334, "grad_norm": 0.15197673439979553, "learning_rate": 0.00019997053258421913, "loss": 0.0768, "step": 825 }, { "epoch": 0.0533979797979798, "grad_norm": 0.10549704730510712, "learning_rate": 0.00019997036634499847, "loss": 0.1088, "step": 826 }, { "epoch": 0.05346262626262626, "grad_norm": 0.10435540229082108, "learning_rate": 0.00019997019963824942, "loss": 0.0955, "step": 827 }, { "epoch": 0.05352727272727273, "grad_norm": 0.10653238743543625, "learning_rate": 0.00019997003246397283, "loss": 0.0927, "step": 828 }, { "epoch": 0.05359191919191919, "grad_norm": 0.12135177105665207, "learning_rate": 0.00019996986482216945, "loss": 0.1141, "step": 829 }, { "epoch": 0.05365656565656566, "grad_norm": 0.11117222905158997, "learning_rate": 0.00019996969671284007, "loss": 0.0996, "step": 830 }, { "epoch": 0.05372121212121212, "grad_norm": 0.1145494282245636, "learning_rate": 0.00019996952813598545, "loss": 0.1232, "step": 831 }, { "epoch": 0.05378585858585858, "grad_norm": 0.10023811459541321, "learning_rate": 0.0001999693590916064, "loss": 0.0984, "step": 832 }, { "epoch": 0.05378585858585858, "eval_bleu": 12.826223091065069, "eval_loss": 0.10393930971622467, "eval_runtime": 2.6105, "eval_samples_per_second": 12.258, "eval_steps_per_second": 1.532, "step": 832 }, { "epoch": 0.05385050505050505, "grad_norm": 0.11759473383426666, "learning_rate": 0.0001999691895797037, "loss": 0.1151, "step": 833 }, { "epoch": 0.05391515151515151, "grad_norm": 0.12444781512022018, "learning_rate": 0.00019996901960027814, "loss": 0.1104, "step": 834 }, { "epoch": 0.05397979797979798, "grad_norm": 0.12180424481630325, "learning_rate": 0.00019996884915333054, "loss": 0.1172, "step": 835 }, { "epoch": 0.054044444444444444, "grad_norm": 0.1297658234834671, "learning_rate": 0.00019996867823886166, "loss": 0.1334, "step": 836 }, { "epoch": 0.05410909090909091, "grad_norm": 0.13237729668617249, "learning_rate": 
0.00019996850685687236, "loss": 0.1126, "step": 837 }, { "epoch": 0.054173737373737374, "grad_norm": 0.12256555259227753, "learning_rate": 0.0001999683350073634, "loss": 0.133, "step": 838 }, { "epoch": 0.054238383838383836, "grad_norm": 0.10652507841587067, "learning_rate": 0.00019996816269033554, "loss": 0.0963, "step": 839 }, { "epoch": 0.054303030303030304, "grad_norm": 0.13831791281700134, "learning_rate": 0.00019996798990578967, "loss": 0.1291, "step": 840 }, { "epoch": 0.054367676767676766, "grad_norm": 0.11334650218486786, "learning_rate": 0.00019996781665372655, "loss": 0.1073, "step": 841 }, { "epoch": 0.054432323232323235, "grad_norm": 0.10309616476297379, "learning_rate": 0.000199967642934147, "loss": 0.0952, "step": 842 }, { "epoch": 0.054496969696969697, "grad_norm": 0.12420094013214111, "learning_rate": 0.00019996746874705184, "loss": 0.1075, "step": 843 }, { "epoch": 0.05456161616161616, "grad_norm": 0.10948662459850311, "learning_rate": 0.00019996729409244186, "loss": 0.1137, "step": 844 }, { "epoch": 0.05462626262626263, "grad_norm": 0.1168128028512001, "learning_rate": 0.0001999671189703179, "loss": 0.1016, "step": 845 }, { "epoch": 0.05469090909090909, "grad_norm": 0.09760251641273499, "learning_rate": 0.00019996694338068077, "loss": 0.0934, "step": 846 }, { "epoch": 0.05475555555555556, "grad_norm": 0.12694862484931946, "learning_rate": 0.0001999667673235313, "loss": 0.1207, "step": 847 }, { "epoch": 0.05482020202020202, "grad_norm": 0.11269383877515793, "learning_rate": 0.00019996659079887032, "loss": 0.1044, "step": 848 }, { "epoch": 0.05482020202020202, "eval_bleu": 11.33978220393837, "eval_loss": 0.10266672819852829, "eval_runtime": 2.7659, "eval_samples_per_second": 11.57, "eval_steps_per_second": 1.446, "step": 848 }, { "epoch": 0.05488484848484849, "grad_norm": 0.11263251304626465, "learning_rate": 0.00019996641380669864, "loss": 0.1079, "step": 849 }, { "epoch": 0.05494949494949495, "grad_norm": 0.12025243043899536, "learning_rate": 
0.0001999662363470171, "loss": 0.1043, "step": 850 }, { "epoch": 0.05501414141414141, "grad_norm": 0.1157556027173996, "learning_rate": 0.0001999660584198265, "loss": 0.105, "step": 851 }, { "epoch": 0.05507878787878788, "grad_norm": 0.10801595449447632, "learning_rate": 0.0001999658800251277, "loss": 0.0986, "step": 852 }, { "epoch": 0.05514343434343434, "grad_norm": 0.11887183785438538, "learning_rate": 0.0001999657011629215, "loss": 0.1167, "step": 853 }, { "epoch": 0.05520808080808081, "grad_norm": 0.11737223714590073, "learning_rate": 0.00019996552183320878, "loss": 0.1159, "step": 854 }, { "epoch": 0.05527272727272727, "grad_norm": 0.1060580387711525, "learning_rate": 0.00019996534203599038, "loss": 0.101, "step": 855 }, { "epoch": 0.05533737373737374, "grad_norm": 0.13838988542556763, "learning_rate": 0.00019996516177126712, "loss": 0.1187, "step": 856 }, { "epoch": 0.0554020202020202, "grad_norm": 0.10509878396987915, "learning_rate": 0.0001999649810390398, "loss": 0.1015, "step": 857 }, { "epoch": 0.055466666666666664, "grad_norm": 0.10692878067493439, "learning_rate": 0.0001999647998393094, "loss": 0.1015, "step": 858 }, { "epoch": 0.05553131313131313, "grad_norm": 0.09788931161165237, "learning_rate": 0.0001999646181720766, "loss": 0.0986, "step": 859 }, { "epoch": 0.055595959595959595, "grad_norm": 0.10287925601005554, "learning_rate": 0.00019996443603734237, "loss": 0.1001, "step": 860 }, { "epoch": 0.05566060606060606, "grad_norm": 0.10221549868583679, "learning_rate": 0.00019996425343510752, "loss": 0.0983, "step": 861 }, { "epoch": 0.055725252525252525, "grad_norm": 0.10853669792413712, "learning_rate": 0.0001999640703653729, "loss": 0.0985, "step": 862 }, { "epoch": 0.05578989898989899, "grad_norm": 0.10268131643533707, "learning_rate": 0.00019996388682813935, "loss": 0.1034, "step": 863 }, { "epoch": 0.055854545454545455, "grad_norm": 0.11127810180187225, "learning_rate": 0.00019996370282340779, "loss": 0.1109, "step": 864 }, { "epoch": 
0.055854545454545455, "eval_bleu": 17.771942553667554, "eval_loss": 0.10232952237129211, "eval_runtime": 2.6745, "eval_samples_per_second": 11.965, "eval_steps_per_second": 1.496, "step": 864 }, { "epoch": 0.05591919191919192, "grad_norm": 0.08473800122737885, "learning_rate": 0.00019996351835117898, "loss": 0.08, "step": 865 }, { "epoch": 0.055983838383838386, "grad_norm": 0.11169324070215225, "learning_rate": 0.00019996333341145388, "loss": 0.1089, "step": 866 }, { "epoch": 0.05604848484848485, "grad_norm": 0.1174512729048729, "learning_rate": 0.0001999631480042333, "loss": 0.1061, "step": 867 }, { "epoch": 0.056113131313131316, "grad_norm": 0.117987722158432, "learning_rate": 0.00019996296212951813, "loss": 0.1097, "step": 868 }, { "epoch": 0.05617777777777778, "grad_norm": 0.10855425149202347, "learning_rate": 0.00019996277578730924, "loss": 0.1098, "step": 869 }, { "epoch": 0.05624242424242424, "grad_norm": 0.15426157414913177, "learning_rate": 0.00019996258897760752, "loss": 0.1092, "step": 870 }, { "epoch": 0.05630707070707071, "grad_norm": 0.09436088800430298, "learning_rate": 0.00019996240170041377, "loss": 0.0988, "step": 871 }, { "epoch": 0.05637171717171717, "grad_norm": 0.09706581383943558, "learning_rate": 0.00019996221395572893, "loss": 0.0962, "step": 872 }, { "epoch": 0.05643636363636364, "grad_norm": 0.09941123425960541, "learning_rate": 0.00019996202574355386, "loss": 0.0879, "step": 873 }, { "epoch": 0.0565010101010101, "grad_norm": 0.10898749530315399, "learning_rate": 0.00019996183706388947, "loss": 0.1095, "step": 874 }, { "epoch": 0.05656565656565657, "grad_norm": 0.12166240811347961, "learning_rate": 0.00019996164791673658, "loss": 0.0861, "step": 875 }, { "epoch": 0.05663030303030303, "grad_norm": 0.10835130512714386, "learning_rate": 0.0001999614583020961, "loss": 0.1087, "step": 876 }, { "epoch": 0.05669494949494949, "grad_norm": 0.10567472130060196, "learning_rate": 0.00019996126821996896, "loss": 0.1111, "step": 877 }, { "epoch": 
0.05675959595959596, "grad_norm": 0.10107319802045822, "learning_rate": 0.00019996107767035603, "loss": 0.0893, "step": 878 }, { "epoch": 0.05682424242424242, "grad_norm": 0.10014646500349045, "learning_rate": 0.00019996088665325817, "loss": 0.0911, "step": 879 }, { "epoch": 0.05688888888888889, "grad_norm": 0.11912678927183151, "learning_rate": 0.00019996069516867626, "loss": 0.1132, "step": 880 }, { "epoch": 0.05688888888888889, "eval_bleu": 16.553064467689797, "eval_loss": 0.10099372267723083, "eval_runtime": 2.6971, "eval_samples_per_second": 11.865, "eval_steps_per_second": 1.483, "step": 880 }, { "epoch": 0.05695353535353535, "grad_norm": 0.11043732613325119, "learning_rate": 0.00019996050321661123, "loss": 0.0971, "step": 881 }, { "epoch": 0.057018181818181815, "grad_norm": 0.11203518509864807, "learning_rate": 0.000199960310797064, "loss": 0.1047, "step": 882 }, { "epoch": 0.057082828282828284, "grad_norm": 0.1297028362751007, "learning_rate": 0.00019996011791003544, "loss": 0.1084, "step": 883 }, { "epoch": 0.057147474747474745, "grad_norm": 0.11635738611221313, "learning_rate": 0.00019995992455552645, "loss": 0.097, "step": 884 }, { "epoch": 0.057212121212121214, "grad_norm": 0.11968257278203964, "learning_rate": 0.0001999597307335379, "loss": 0.1013, "step": 885 }, { "epoch": 0.057276767676767676, "grad_norm": 0.10897650569677353, "learning_rate": 0.0001999595364440708, "loss": 0.0966, "step": 886 }, { "epoch": 0.057341414141414145, "grad_norm": 0.1232411116361618, "learning_rate": 0.00019995934168712595, "loss": 0.1207, "step": 887 }, { "epoch": 0.057406060606060606, "grad_norm": 0.09599161893129349, "learning_rate": 0.00019995914646270434, "loss": 0.0935, "step": 888 }, { "epoch": 0.05747070707070707, "grad_norm": 0.110834501683712, "learning_rate": 0.0001999589507708068, "loss": 0.1069, "step": 889 }, { "epoch": 0.05753535353535354, "grad_norm": 0.12236505001783371, "learning_rate": 0.00019995875461143432, "loss": 0.1101, "step": 890 }, { "epoch": 
0.0576, "grad_norm": 0.10326965898275375, "learning_rate": 0.00019995855798458781, "loss": 0.1045, "step": 891 }, { "epoch": 0.05766464646464647, "grad_norm": 0.13237006962299347, "learning_rate": 0.00019995836089026813, "loss": 0.1122, "step": 892 }, { "epoch": 0.05772929292929293, "grad_norm": 0.09868495911359787, "learning_rate": 0.00019995816332847626, "loss": 0.0962, "step": 893 }, { "epoch": 0.05779393939393939, "grad_norm": 0.11295129358768463, "learning_rate": 0.0001999579652992131, "loss": 0.1072, "step": 894 }, { "epoch": 0.05785858585858586, "grad_norm": 0.1221550777554512, "learning_rate": 0.0001999577668024796, "loss": 0.1192, "step": 895 }, { "epoch": 0.05792323232323232, "grad_norm": 0.11230450123548508, "learning_rate": 0.00019995756783827665, "loss": 0.1196, "step": 896 }, { "epoch": 0.05792323232323232, "eval_bleu": 14.791945394215508, "eval_loss": 0.10085465759038925, "eval_runtime": 2.7005, "eval_samples_per_second": 11.85, "eval_steps_per_second": 1.481, "step": 896 }, { "epoch": 0.05798787878787879, "grad_norm": 0.10296222567558289, "learning_rate": 0.00019995736840660523, "loss": 0.1048, "step": 897 }, { "epoch": 0.05805252525252525, "grad_norm": 0.1342710256576538, "learning_rate": 0.0001999571685074662, "loss": 0.1099, "step": 898 }, { "epoch": 0.05811717171717172, "grad_norm": 0.11080357432365417, "learning_rate": 0.00019995696814086058, "loss": 0.092, "step": 899 }, { "epoch": 0.05818181818181818, "grad_norm": 0.11223907023668289, "learning_rate": 0.00019995676730678925, "loss": 0.1031, "step": 900 }, { "epoch": 0.058246464646464643, "grad_norm": 0.11462075263261795, "learning_rate": 0.00019995656600525313, "loss": 0.1105, "step": 901 }, { "epoch": 0.05831111111111111, "grad_norm": 0.11297249794006348, "learning_rate": 0.00019995636423625324, "loss": 0.1002, "step": 902 }, { "epoch": 0.058375757575757574, "grad_norm": 0.10576991736888885, "learning_rate": 0.00019995616199979045, "loss": 0.0918, "step": 903 }, { "epoch": 
0.05844040404040404, "grad_norm": 0.10222340375185013, "learning_rate": 0.00019995595929586576, "loss": 0.0939, "step": 904 }, { "epoch": 0.058505050505050504, "grad_norm": 0.1218513771891594, "learning_rate": 0.0001999557561244801, "loss": 0.1116, "step": 905 }, { "epoch": 0.05856969696969697, "grad_norm": 0.11331094801425934, "learning_rate": 0.00019995555248563437, "loss": 0.1126, "step": 906 }, { "epoch": 0.058634343434343435, "grad_norm": 0.1779399812221527, "learning_rate": 0.00019995534837932962, "loss": 0.104, "step": 907 }, { "epoch": 0.058698989898989896, "grad_norm": 0.11177841573953629, "learning_rate": 0.00019995514380556672, "loss": 0.1008, "step": 908 }, { "epoch": 0.058763636363636365, "grad_norm": 0.10509387403726578, "learning_rate": 0.00019995493876434666, "loss": 0.0946, "step": 909 }, { "epoch": 0.05882828282828283, "grad_norm": 0.11490184813737869, "learning_rate": 0.0001999547332556704, "loss": 0.108, "step": 910 }, { "epoch": 0.058892929292929296, "grad_norm": 0.10376391559839249, "learning_rate": 0.0001999545272795389, "loss": 0.0925, "step": 911 }, { "epoch": 0.05895757575757576, "grad_norm": 0.10855154693126678, "learning_rate": 0.00019995432083595312, "loss": 0.0989, "step": 912 }, { "epoch": 0.05895757575757576, "eval_bleu": 13.503646460032721, "eval_loss": 0.09989695250988007, "eval_runtime": 2.7356, "eval_samples_per_second": 11.698, "eval_steps_per_second": 1.462, "step": 912 }, { "epoch": 0.05902222222222222, "grad_norm": 0.1056453138589859, "learning_rate": 0.000199954113924914, "loss": 0.1062, "step": 913 }, { "epoch": 0.05908686868686869, "grad_norm": 0.1238904744386673, "learning_rate": 0.00019995390654642257, "loss": 0.1257, "step": 914 }, { "epoch": 0.05915151515151515, "grad_norm": 0.10526388138532639, "learning_rate": 0.00019995369870047972, "loss": 0.1051, "step": 915 }, { "epoch": 0.05921616161616162, "grad_norm": 0.11506146192550659, "learning_rate": 0.00019995349038708652, "loss": 0.1201, "step": 916 }, { "epoch": 
0.05928080808080808, "grad_norm": 0.10063769668340683, "learning_rate": 0.00019995328160624386, "loss": 0.1043, "step": 917 }, { "epoch": 0.05934545454545455, "grad_norm": 0.11473876982927322, "learning_rate": 0.00019995307235795272, "loss": 0.0935, "step": 918 }, { "epoch": 0.05941010101010101, "grad_norm": 0.1102418303489685, "learning_rate": 0.00019995286264221413, "loss": 0.1061, "step": 919 }, { "epoch": 0.05947474747474747, "grad_norm": 0.11224579066038132, "learning_rate": 0.00019995265245902904, "loss": 0.1013, "step": 920 }, { "epoch": 0.05953939393939394, "grad_norm": 0.15346866846084595, "learning_rate": 0.00019995244180839845, "loss": 0.1065, "step": 921 }, { "epoch": 0.0596040404040404, "grad_norm": 0.16856330633163452, "learning_rate": 0.00019995223069032333, "loss": 0.107, "step": 922 }, { "epoch": 0.05966868686868687, "grad_norm": 0.13238894939422607, "learning_rate": 0.00019995201910480467, "loss": 0.1194, "step": 923 }, { "epoch": 0.05973333333333333, "grad_norm": 0.12689128518104553, "learning_rate": 0.00019995180705184346, "loss": 0.1197, "step": 924 }, { "epoch": 0.0597979797979798, "grad_norm": 0.1079198494553566, "learning_rate": 0.00019995159453144068, "loss": 0.1059, "step": 925 }, { "epoch": 0.05986262626262626, "grad_norm": 0.09220168739557266, "learning_rate": 0.00019995138154359733, "loss": 0.0818, "step": 926 }, { "epoch": 0.059927272727272725, "grad_norm": 0.11543834209442139, "learning_rate": 0.00019995116808831445, "loss": 0.1127, "step": 927 }, { "epoch": 0.059991919191919194, "grad_norm": 0.131541907787323, "learning_rate": 0.00019995095416559298, "loss": 0.109, "step": 928 }, { "epoch": 0.059991919191919194, "eval_bleu": 13.146405815577472, "eval_loss": 0.1025841161608696, "eval_runtime": 2.6904, "eval_samples_per_second": 11.894, "eval_steps_per_second": 1.487, "step": 928 }, { "epoch": 0.060056565656565655, "grad_norm": 0.10599727928638458, "learning_rate": 0.00019995073977543393, "loss": 0.0928, "step": 929 }, { "epoch": 
0.060121212121212124, "grad_norm": 0.10286258906126022, "learning_rate": 0.00019995052491783832, "loss": 0.105, "step": 930 }, { "epoch": 0.060185858585858586, "grad_norm": 0.09632156789302826, "learning_rate": 0.00019995030959280716, "loss": 0.0999, "step": 931 }, { "epoch": 0.06025050505050505, "grad_norm": 0.11808812618255615, "learning_rate": 0.00019995009380034142, "loss": 0.119, "step": 932 }, { "epoch": 0.060315151515151516, "grad_norm": 0.11463737487792969, "learning_rate": 0.00019994987754044216, "loss": 0.1167, "step": 933 }, { "epoch": 0.06037979797979798, "grad_norm": 0.10428722202777863, "learning_rate": 0.00019994966081311036, "loss": 0.1012, "step": 934 }, { "epoch": 0.060444444444444446, "grad_norm": 0.09879586100578308, "learning_rate": 0.00019994944361834702, "loss": 0.0976, "step": 935 }, { "epoch": 0.06050909090909091, "grad_norm": 0.11271627247333527, "learning_rate": 0.0001999492259561532, "loss": 0.1105, "step": 936 }, { "epoch": 0.06057373737373738, "grad_norm": 0.11347619444131851, "learning_rate": 0.00019994900782652986, "loss": 0.1203, "step": 937 }, { "epoch": 0.06063838383838384, "grad_norm": 0.10328079760074615, "learning_rate": 0.00019994878922947805, "loss": 0.1068, "step": 938 }, { "epoch": 0.0607030303030303, "grad_norm": 0.10391739010810852, "learning_rate": 0.00019994857016499885, "loss": 0.1066, "step": 939 }, { "epoch": 0.06076767676767677, "grad_norm": 0.11093475669622421, "learning_rate": 0.00019994835063309318, "loss": 0.1028, "step": 940 }, { "epoch": 0.06083232323232323, "grad_norm": 0.11336583644151688, "learning_rate": 0.00019994813063376214, "loss": 0.1106, "step": 941 }, { "epoch": 0.0608969696969697, "grad_norm": 0.11894485354423523, "learning_rate": 0.00019994791016700675, "loss": 0.1108, "step": 942 }, { "epoch": 0.06096161616161616, "grad_norm": 0.10436300188302994, "learning_rate": 0.00019994768923282797, "loss": 0.0967, "step": 943 }, { "epoch": 0.06102626262626262, "grad_norm": 0.1109478771686554, 
"learning_rate": 0.00019994746783122693, "loss": 0.1039, "step": 944 }, { "epoch": 0.06102626262626262, "eval_bleu": 14.703895748705321, "eval_loss": 0.1030469536781311, "eval_runtime": 2.8892, "eval_samples_per_second": 11.076, "eval_steps_per_second": 1.384, "step": 944 }, { "epoch": 0.06109090909090909, "grad_norm": 0.10315844416618347, "learning_rate": 0.0001999472459622046, "loss": 0.1, "step": 945 }, { "epoch": 0.06115555555555555, "grad_norm": 0.12184132635593414, "learning_rate": 0.0001999470236257621, "loss": 0.1056, "step": 946 }, { "epoch": 0.06122020202020202, "grad_norm": 0.10203199833631516, "learning_rate": 0.00019994680082190036, "loss": 0.1004, "step": 947 }, { "epoch": 0.061284848484848484, "grad_norm": 0.10957950353622437, "learning_rate": 0.0001999465775506205, "loss": 0.1001, "step": 948 }, { "epoch": 0.06134949494949495, "grad_norm": 0.09947719424962997, "learning_rate": 0.00019994635381192353, "loss": 0.0981, "step": 949 }, { "epoch": 0.061414141414141414, "grad_norm": 0.10668642818927765, "learning_rate": 0.00019994612960581049, "loss": 0.1043, "step": 950 }, { "epoch": 0.061478787878787876, "grad_norm": 0.09808440506458282, "learning_rate": 0.00019994590493228247, "loss": 0.0849, "step": 951 }, { "epoch": 0.061543434343434344, "grad_norm": 0.1252501755952835, "learning_rate": 0.00019994567979134047, "loss": 0.1027, "step": 952 }, { "epoch": 0.061608080808080806, "grad_norm": 0.09008917212486267, "learning_rate": 0.00019994545418298558, "loss": 0.0809, "step": 953 }, { "epoch": 0.061672727272727275, "grad_norm": 0.09205963462591171, "learning_rate": 0.00019994522810721886, "loss": 0.0942, "step": 954 }, { "epoch": 0.06173737373737374, "grad_norm": 0.09121670573949814, "learning_rate": 0.0001999450015640413, "loss": 0.0854, "step": 955 }, { "epoch": 0.061802020202020205, "grad_norm": 0.1023370772600174, "learning_rate": 0.00019994477455345403, "loss": 0.0989, "step": 956 }, { "epoch": 0.06186666666666667, "grad_norm": 0.11290039122104645, 
"learning_rate": 0.00019994454707545815, "loss": 0.1109, "step": 957 }, { "epoch": 0.06193131313131313, "grad_norm": 0.0942951962351799, "learning_rate": 0.00019994431913005457, "loss": 0.0863, "step": 958 }, { "epoch": 0.0619959595959596, "grad_norm": 0.09380186349153519, "learning_rate": 0.0001999440907172445, "loss": 0.0879, "step": 959 }, { "epoch": 0.06206060606060606, "grad_norm": 0.10406679660081863, "learning_rate": 0.00019994386183702897, "loss": 0.082, "step": 960 }, { "epoch": 0.06206060606060606, "eval_bleu": 13.504000765398503, "eval_loss": 0.1035585030913353, "eval_runtime": 2.7527, "eval_samples_per_second": 11.625, "eval_steps_per_second": 1.453, "step": 960 }, { "epoch": 0.06212525252525253, "grad_norm": 0.09908974915742874, "learning_rate": 0.00019994363248940903, "loss": 0.0949, "step": 961 }, { "epoch": 0.06218989898989899, "grad_norm": 0.12424175441265106, "learning_rate": 0.00019994340267438573, "loss": 0.0976, "step": 962 }, { "epoch": 0.06225454545454545, "grad_norm": 0.10670045763254166, "learning_rate": 0.0001999431723919602, "loss": 0.1003, "step": 963 }, { "epoch": 0.06231919191919192, "grad_norm": 0.1120072603225708, "learning_rate": 0.0001999429416421335, "loss": 0.1041, "step": 964 }, { "epoch": 0.06238383838383838, "grad_norm": 0.14401409029960632, "learning_rate": 0.00019994271042490666, "loss": 0.1196, "step": 965 }, { "epoch": 0.06244848484848485, "grad_norm": 0.11253448575735092, "learning_rate": 0.00019994247874028086, "loss": 0.1138, "step": 966 }, { "epoch": 0.06251313131313131, "grad_norm": 0.10873868316411972, "learning_rate": 0.00019994224658825707, "loss": 0.108, "step": 967 }, { "epoch": 0.06257777777777777, "grad_norm": 0.12138410657644272, "learning_rate": 0.0001999420139688365, "loss": 0.1013, "step": 968 }, { "epoch": 0.06264242424242424, "grad_norm": 0.11792376637458801, "learning_rate": 0.00019994178088202013, "loss": 0.1078, "step": 969 }, { "epoch": 0.06270707070707071, "grad_norm": 0.09426609426736832, 
"learning_rate": 0.0001999415473278091, "loss": 0.0881, "step": 970 }, { "epoch": 0.06277171717171717, "grad_norm": 0.14864352345466614, "learning_rate": 0.00019994131330620452, "loss": 0.0993, "step": 971 }, { "epoch": 0.06283636363636363, "grad_norm": 0.09219281375408173, "learning_rate": 0.0001999410788172074, "loss": 0.0896, "step": 972 }, { "epoch": 0.0629010101010101, "grad_norm": 0.09379587322473526, "learning_rate": 0.00019994084386081894, "loss": 0.0865, "step": 973 }, { "epoch": 0.06296565656565657, "grad_norm": 0.12673808634281158, "learning_rate": 0.00019994060843704016, "loss": 0.1086, "step": 974 }, { "epoch": 0.06303030303030303, "grad_norm": 0.1037813052535057, "learning_rate": 0.0001999403725458722, "loss": 0.0975, "step": 975 }, { "epoch": 0.0630949494949495, "grad_norm": 0.14156074821949005, "learning_rate": 0.00019994013618731618, "loss": 0.106, "step": 976 }, { "epoch": 0.0630949494949495, "eval_bleu": 16.63288810316848, "eval_loss": 0.10185457766056061, "eval_runtime": 2.7356, "eval_samples_per_second": 11.698, "eval_steps_per_second": 1.462, "step": 976 }, { "epoch": 0.06315959595959596, "grad_norm": 0.1073005273938179, "learning_rate": 0.0001999398993613732, "loss": 0.1096, "step": 977 }, { "epoch": 0.06322424242424242, "grad_norm": 0.09988738596439362, "learning_rate": 0.00019993966206804432, "loss": 0.0997, "step": 978 }, { "epoch": 0.0632888888888889, "grad_norm": 0.10770740360021591, "learning_rate": 0.00019993942430733067, "loss": 0.1121, "step": 979 }, { "epoch": 0.06335353535353536, "grad_norm": 0.10458149015903473, "learning_rate": 0.00019993918607923337, "loss": 0.0975, "step": 980 }, { "epoch": 0.06341818181818182, "grad_norm": 0.10584570467472076, "learning_rate": 0.00019993894738375357, "loss": 0.1065, "step": 981 }, { "epoch": 0.06348282828282828, "grad_norm": 0.10891502350568771, "learning_rate": 0.00019993870822089234, "loss": 0.097, "step": 982 }, { "epoch": 0.06354747474747474, "grad_norm": 0.11425327509641647, 
"learning_rate": 0.00019993846859065077, "loss": 0.1051, "step": 983 }, { "epoch": 0.06361212121212122, "grad_norm": 0.09814655035734177, "learning_rate": 0.00019993822849303006, "loss": 0.0944, "step": 984 }, { "epoch": 0.06367676767676768, "grad_norm": 0.10303620249032974, "learning_rate": 0.00019993798792803129, "loss": 0.0977, "step": 985 }, { "epoch": 0.06374141414141414, "grad_norm": 0.10379397869110107, "learning_rate": 0.0001999377468956556, "loss": 0.0918, "step": 986 }, { "epoch": 0.0638060606060606, "grad_norm": 0.11233863234519958, "learning_rate": 0.0001999375053959041, "loss": 0.1014, "step": 987 }, { "epoch": 0.06387070707070706, "grad_norm": 0.09550675749778748, "learning_rate": 0.0001999372634287779, "loss": 0.1007, "step": 988 }, { "epoch": 0.06393535353535354, "grad_norm": 0.1209508553147316, "learning_rate": 0.00019993702099427817, "loss": 0.1224, "step": 989 }, { "epoch": 0.064, "grad_norm": 0.10136724263429642, "learning_rate": 0.00019993677809240605, "loss": 0.1126, "step": 990 }, { "epoch": 0.06406464646464646, "grad_norm": 0.09928274154663086, "learning_rate": 0.00019993653472316264, "loss": 0.1028, "step": 991 }, { "epoch": 0.06412929292929292, "grad_norm": 0.10656436532735825, "learning_rate": 0.0001999362908865491, "loss": 0.1068, "step": 992 }, { "epoch": 0.06412929292929292, "eval_bleu": 12.73347699399549, "eval_loss": 0.10065907984972, "eval_runtime": 2.6363, "eval_samples_per_second": 12.138, "eval_steps_per_second": 1.517, "step": 992 }, { "epoch": 0.0641939393939394, "grad_norm": 0.1007160171866417, "learning_rate": 0.00019993604658256658, "loss": 0.094, "step": 993 }, { "epoch": 0.06425858585858586, "grad_norm": 0.09793444722890854, "learning_rate": 0.0001999358018112162, "loss": 0.1046, "step": 994 }, { "epoch": 0.06432323232323232, "grad_norm": 0.10793797671794891, "learning_rate": 0.0001999355565724991, "loss": 0.1077, "step": 995 }, { "epoch": 0.06438787878787879, "grad_norm": 0.0944027304649353, "learning_rate": 
0.00019993531086641645, "loss": 0.0976, "step": 996 }, { "epoch": 0.06445252525252525, "grad_norm": 0.14579692482948303, "learning_rate": 0.0001999350646929694, "loss": 0.1281, "step": 997 }, { "epoch": 0.06451717171717172, "grad_norm": 0.10719353705644608, "learning_rate": 0.00019993481805215905, "loss": 0.1067, "step": 998 }, { "epoch": 0.06458181818181818, "grad_norm": 0.08898865431547165, "learning_rate": 0.00019993457094398664, "loss": 0.0863, "step": 999 }, { "epoch": 0.06464646464646465, "grad_norm": 0.10586211085319519, "learning_rate": 0.00019993432336845327, "loss": 0.0946, "step": 1000 }, { "epoch": 0.06471111111111111, "grad_norm": 0.09605011343955994, "learning_rate": 0.00019993407532556008, "loss": 0.0895, "step": 1001 }, { "epoch": 0.06477575757575757, "grad_norm": 0.10589330643415451, "learning_rate": 0.0001999338268153083, "loss": 0.1004, "step": 1002 }, { "epoch": 0.06484040404040405, "grad_norm": 0.09274252504110336, "learning_rate": 0.000199933577837699, "loss": 0.0773, "step": 1003 }, { "epoch": 0.06490505050505051, "grad_norm": 0.10222328454256058, "learning_rate": 0.00019993332839273343, "loss": 0.1018, "step": 1004 }, { "epoch": 0.06496969696969697, "grad_norm": 0.1029796153306961, "learning_rate": 0.00019993307848041272, "loss": 0.087, "step": 1005 }, { "epoch": 0.06503434343434343, "grad_norm": 0.12443310767412186, "learning_rate": 0.00019993282810073804, "loss": 0.1269, "step": 1006 }, { "epoch": 0.06509898989898989, "grad_norm": 0.10611167550086975, "learning_rate": 0.00019993257725371054, "loss": 0.0957, "step": 1007 }, { "epoch": 0.06516363636363637, "grad_norm": 0.10289434343576431, "learning_rate": 0.0001999323259393314, "loss": 0.1098, "step": 1008 }, { "epoch": 0.06516363636363637, "eval_bleu": 15.062234492126294, "eval_loss": 0.09961952269077301, "eval_runtime": 2.8104, "eval_samples_per_second": 11.386, "eval_steps_per_second": 1.423, "step": 1008 }, { "epoch": 0.06522828282828283, "grad_norm": 0.09957101196050644, 
"learning_rate": 0.00019993207415760185, "loss": 0.099, "step": 1009 }, { "epoch": 0.06529292929292929, "grad_norm": 0.09299284219741821, "learning_rate": 0.000199931821908523, "loss": 0.0876, "step": 1010 }, { "epoch": 0.06535757575757575, "grad_norm": 0.10813038796186447, "learning_rate": 0.00019993156919209605, "loss": 0.1054, "step": 1011 }, { "epoch": 0.06542222222222223, "grad_norm": 0.08390337228775024, "learning_rate": 0.00019993131600832222, "loss": 0.0787, "step": 1012 }, { "epoch": 0.06548686868686869, "grad_norm": 0.10573827475309372, "learning_rate": 0.00019993106235720266, "loss": 0.1102, "step": 1013 }, { "epoch": 0.06555151515151515, "grad_norm": 0.12195960432291031, "learning_rate": 0.00019993080823873852, "loss": 0.1098, "step": 1014 }, { "epoch": 0.06561616161616161, "grad_norm": 0.1130300983786583, "learning_rate": 0.00019993055365293106, "loss": 0.1087, "step": 1015 }, { "epoch": 0.06568080808080808, "grad_norm": 0.10060533881187439, "learning_rate": 0.0001999302985997814, "loss": 0.097, "step": 1016 }, { "epoch": 0.06574545454545455, "grad_norm": 0.12042611837387085, "learning_rate": 0.0001999300430792908, "loss": 0.0994, "step": 1017 }, { "epoch": 0.06581010101010101, "grad_norm": 0.11390791088342667, "learning_rate": 0.00019992978709146042, "loss": 0.1009, "step": 1018 }, { "epoch": 0.06587474747474747, "grad_norm": 0.09423738718032837, "learning_rate": 0.00019992953063629145, "loss": 0.096, "step": 1019 }, { "epoch": 0.06593939393939394, "grad_norm": 0.11149628460407257, "learning_rate": 0.00019992927371378512, "loss": 0.1008, "step": 1020 }, { "epoch": 0.0660040404040404, "grad_norm": 0.10044733434915543, "learning_rate": 0.00019992901632394258, "loss": 0.1091, "step": 1021 }, { "epoch": 0.06606868686868687, "grad_norm": 0.14813345670700073, "learning_rate": 0.0001999287584667651, "loss": 0.0985, "step": 1022 }, { "epoch": 0.06613333333333334, "grad_norm": 0.10394751280546188, "learning_rate": 0.00019992850014225385, "loss": 0.1082, 
"step": 1023 }, { "epoch": 0.0661979797979798, "grad_norm": 0.10756181180477142, "learning_rate": 0.00019992824135041005, "loss": 0.1082, "step": 1024 }, { "epoch": 0.0661979797979798, "eval_bleu": 15.96788675592142, "eval_loss": 0.10119746625423431, "eval_runtime": 2.7218, "eval_samples_per_second": 11.757, "eval_steps_per_second": 1.47, "step": 1024 }, { "epoch": 0.06626262626262626, "grad_norm": 0.09936445951461792, "learning_rate": 0.00019992798209123486, "loss": 0.0895, "step": 1025 }, { "epoch": 0.06632727272727272, "grad_norm": 0.1018339991569519, "learning_rate": 0.00019992772236472955, "loss": 0.1046, "step": 1026 }, { "epoch": 0.0663919191919192, "grad_norm": 0.10769277065992355, "learning_rate": 0.00019992746217089536, "loss": 0.1102, "step": 1027 }, { "epoch": 0.06645656565656566, "grad_norm": 0.12098774313926697, "learning_rate": 0.0001999272015097334, "loss": 0.0921, "step": 1028 }, { "epoch": 0.06652121212121212, "grad_norm": 0.12512466311454773, "learning_rate": 0.000199926940381245, "loss": 0.1399, "step": 1029 }, { "epoch": 0.06658585858585858, "grad_norm": 0.10815372318029404, "learning_rate": 0.0001999266787854313, "loss": 0.107, "step": 1030 }, { "epoch": 0.06665050505050506, "grad_norm": 0.09212598949670792, "learning_rate": 0.0001999264167222936, "loss": 0.0812, "step": 1031 }, { "epoch": 0.06671515151515152, "grad_norm": 0.11654563993215561, "learning_rate": 0.00019992615419183306, "loss": 0.1214, "step": 1032 }, { "epoch": 0.06677979797979798, "grad_norm": 0.11384245753288269, "learning_rate": 0.00019992589119405092, "loss": 0.0962, "step": 1033 }, { "epoch": 0.06684444444444444, "grad_norm": 0.10957711935043335, "learning_rate": 0.00019992562772894843, "loss": 0.0967, "step": 1034 }, { "epoch": 0.0669090909090909, "grad_norm": 0.12031755596399307, "learning_rate": 0.00019992536379652683, "loss": 0.108, "step": 1035 }, { "epoch": 0.06697373737373738, "grad_norm": 0.10495950281620026, "learning_rate": 0.00019992509939678734, "loss": 0.103, 
"step": 1036 }, { "epoch": 0.06703838383838384, "grad_norm": 0.10424365103244781, "learning_rate": 0.00019992483452973116, "loss": 0.0944, "step": 1037 }, { "epoch": 0.0671030303030303, "grad_norm": 0.09796426445245743, "learning_rate": 0.0001999245691953596, "loss": 0.0886, "step": 1038 }, { "epoch": 0.06716767676767676, "grad_norm": 0.11019867658615112, "learning_rate": 0.00019992430339367382, "loss": 0.0839, "step": 1039 }, { "epoch": 0.06723232323232323, "grad_norm": 0.1157185360789299, "learning_rate": 0.00019992403712467517, "loss": 0.1119, "step": 1040 }, { "epoch": 0.06723232323232323, "eval_bleu": 12.040628198130733, "eval_loss": 0.1027548760175705, "eval_runtime": 2.7972, "eval_samples_per_second": 11.44, "eval_steps_per_second": 1.43, "step": 1040 }, { "epoch": 0.0672969696969697, "grad_norm": 0.09878209233283997, "learning_rate": 0.00019992377038836477, "loss": 0.0923, "step": 1041 }, { "epoch": 0.06736161616161616, "grad_norm": 0.13967718183994293, "learning_rate": 0.00019992350318474397, "loss": 0.1151, "step": 1042 }, { "epoch": 0.06742626262626263, "grad_norm": 0.13182571530342102, "learning_rate": 0.00019992323551381398, "loss": 0.1015, "step": 1043 }, { "epoch": 0.06749090909090909, "grad_norm": 0.10362154245376587, "learning_rate": 0.00019992296737557602, "loss": 0.096, "step": 1044 }, { "epoch": 0.06755555555555555, "grad_norm": 0.11503683775663376, "learning_rate": 0.00019992269877003142, "loss": 0.1191, "step": 1045 }, { "epoch": 0.06762020202020202, "grad_norm": 0.100889191031456, "learning_rate": 0.00019992242969718135, "loss": 0.095, "step": 1046 }, { "epoch": 0.06768484848484849, "grad_norm": 0.1143028512597084, "learning_rate": 0.00019992216015702713, "loss": 0.1063, "step": 1047 }, { "epoch": 0.06774949494949495, "grad_norm": 0.113642618060112, "learning_rate": 0.00019992189014957, "loss": 0.114, "step": 1048 }, { "epoch": 0.06781414141414141, "grad_norm": 0.09080003947019577, "learning_rate": 0.00019992161967481123, "loss": 0.0842, 
"step": 1049 }, { "epoch": 0.06787878787878789, "grad_norm": 0.10620587319135666, "learning_rate": 0.00019992134873275208, "loss": 0.1081, "step": 1050 }, { "epoch": 0.06794343434343435, "grad_norm": 0.10006501525640488, "learning_rate": 0.0001999210773233938, "loss": 0.1003, "step": 1051 }, { "epoch": 0.06800808080808081, "grad_norm": 0.09275925904512405, "learning_rate": 0.00019992080544673768, "loss": 0.0981, "step": 1052 }, { "epoch": 0.06807272727272727, "grad_norm": 0.08958086371421814, "learning_rate": 0.00019992053310278498, "loss": 0.0919, "step": 1053 }, { "epoch": 0.06813737373737373, "grad_norm": 0.09127036482095718, "learning_rate": 0.000199920260291537, "loss": 0.0906, "step": 1054 }, { "epoch": 0.06820202020202021, "grad_norm": 0.11914127320051193, "learning_rate": 0.00019991998701299497, "loss": 0.1418, "step": 1055 }, { "epoch": 0.06826666666666667, "grad_norm": 0.0990588590502739, "learning_rate": 0.0001999197132671602, "loss": 0.0944, "step": 1056 }, { "epoch": 0.06826666666666667, "eval_bleu": 13.203528345008845, "eval_loss": 0.10272073745727539, "eval_runtime": 2.7338, "eval_samples_per_second": 11.705, "eval_steps_per_second": 1.463, "step": 1056 }, { "epoch": 0.06833131313131313, "grad_norm": 0.10000976175069809, "learning_rate": 0.000199919439054034, "loss": 0.0847, "step": 1057 }, { "epoch": 0.0683959595959596, "grad_norm": 0.09563630074262619, "learning_rate": 0.00019991916437361756, "loss": 0.0933, "step": 1058 }, { "epoch": 0.06846060606060606, "grad_norm": 0.09445653855800629, "learning_rate": 0.00019991888922591224, "loss": 0.0866, "step": 1059 }, { "epoch": 0.06852525252525253, "grad_norm": 0.10733193904161453, "learning_rate": 0.00019991861361091934, "loss": 0.1126, "step": 1060 }, { "epoch": 0.06858989898989899, "grad_norm": 0.10559269785881042, "learning_rate": 0.00019991833752864007, "loss": 0.1137, "step": 1061 }, { "epoch": 0.06865454545454545, "grad_norm": 0.09451444447040558, "learning_rate": 0.00019991806097907579, "loss": 
0.1016, "step": 1062 }, { "epoch": 0.06871919191919192, "grad_norm": 0.09219640493392944, "learning_rate": 0.00019991778396222777, "loss": 0.1019, "step": 1063 }, { "epoch": 0.06878383838383838, "grad_norm": 0.10495129227638245, "learning_rate": 0.00019991750647809729, "loss": 0.1007, "step": 1064 }, { "epoch": 0.06884848484848485, "grad_norm": 0.10733859241008759, "learning_rate": 0.0001999172285266857, "loss": 0.0973, "step": 1065 }, { "epoch": 0.06891313131313132, "grad_norm": 0.15368856489658356, "learning_rate": 0.00019991695010799422, "loss": 0.0881, "step": 1066 }, { "epoch": 0.06897777777777778, "grad_norm": 0.10988730937242508, "learning_rate": 0.0001999166712220242, "loss": 0.116, "step": 1067 }, { "epoch": 0.06904242424242424, "grad_norm": 0.10612314194440842, "learning_rate": 0.00019991639186877697, "loss": 0.1007, "step": 1068 }, { "epoch": 0.0691070707070707, "grad_norm": 0.10792101919651031, "learning_rate": 0.00019991611204825383, "loss": 0.1067, "step": 1069 }, { "epoch": 0.06917171717171718, "grad_norm": 0.09953682869672775, "learning_rate": 0.000199915831760456, "loss": 0.1052, "step": 1070 }, { "epoch": 0.06923636363636364, "grad_norm": 0.09427595883607864, "learning_rate": 0.0001999155510053849, "loss": 0.0935, "step": 1071 }, { "epoch": 0.0693010101010101, "grad_norm": 0.11791904270648956, "learning_rate": 0.0001999152697830418, "loss": 0.1209, "step": 1072 }, { "epoch": 0.0693010101010101, "eval_bleu": 12.398441773724599, "eval_loss": 0.10355356335639954, "eval_runtime": 2.7704, "eval_samples_per_second": 11.551, "eval_steps_per_second": 1.444, "step": 1072 }, { "epoch": 0.06936565656565656, "grad_norm": 0.1010703518986702, "learning_rate": 0.000199914988093428, "loss": 0.1115, "step": 1073 }, { "epoch": 0.06943030303030304, "grad_norm": 0.19438670575618744, "learning_rate": 0.00019991470593654485, "loss": 0.1419, "step": 1074 }, { "epoch": 0.0694949494949495, "grad_norm": 0.10723185539245605, "learning_rate": 0.00019991442331239364, "loss": 
0.115, "step": 1075 }, { "epoch": 0.06955959595959596, "grad_norm": 0.10031235963106155, "learning_rate": 0.0001999141402209757, "loss": 0.0865, "step": 1076 }, { "epoch": 0.06962424242424242, "grad_norm": 0.11666223406791687, "learning_rate": 0.00019991385666229234, "loss": 0.1149, "step": 1077 }, { "epoch": 0.06968888888888888, "grad_norm": 0.10806740820407867, "learning_rate": 0.00019991357263634494, "loss": 0.098, "step": 1078 }, { "epoch": 0.06975353535353536, "grad_norm": 0.11003516614437103, "learning_rate": 0.00019991328814313475, "loss": 0.0887, "step": 1079 }, { "epoch": 0.06981818181818182, "grad_norm": 0.10464249551296234, "learning_rate": 0.00019991300318266317, "loss": 0.1023, "step": 1080 }, { "epoch": 0.06988282828282828, "grad_norm": 0.14229588210582733, "learning_rate": 0.0001999127177549315, "loss": 0.1038, "step": 1081 }, { "epoch": 0.06994747474747474, "grad_norm": 0.11465063691139221, "learning_rate": 0.00019991243185994107, "loss": 0.1041, "step": 1082 }, { "epoch": 0.0700121212121212, "grad_norm": 0.09853720664978027, "learning_rate": 0.00019991214549769325, "loss": 0.0836, "step": 1083 }, { "epoch": 0.07007676767676768, "grad_norm": 0.12108904123306274, "learning_rate": 0.00019991185866818933, "loss": 0.1054, "step": 1084 }, { "epoch": 0.07014141414141414, "grad_norm": 0.11559389531612396, "learning_rate": 0.0001999115713714307, "loss": 0.1064, "step": 1085 }, { "epoch": 0.0702060606060606, "grad_norm": 0.10156375914812088, "learning_rate": 0.00019991128360741865, "loss": 0.0898, "step": 1086 }, { "epoch": 0.07027070707070707, "grad_norm": 0.10436543822288513, "learning_rate": 0.00019991099537615458, "loss": 0.0991, "step": 1087 }, { "epoch": 0.07033535353535353, "grad_norm": 0.11197343468666077, "learning_rate": 0.00019991070667763983, "loss": 0.0948, "step": 1088 }, { "epoch": 0.07033535353535353, "eval_bleu": 16.438974162564314, "eval_loss": 0.10288131982088089, "eval_runtime": 2.5554, "eval_samples_per_second": 12.523, 
"eval_steps_per_second": 1.565, "step": 1088 }, { "epoch": 0.0704, "grad_norm": 0.09361731261014938, "learning_rate": 0.0001999104175118757, "loss": 0.0855, "step": 1089 }, { "epoch": 0.07046464646464647, "grad_norm": 0.09633186459541321, "learning_rate": 0.0001999101278788636, "loss": 0.0957, "step": 1090 }, { "epoch": 0.07052929292929293, "grad_norm": 0.10059010982513428, "learning_rate": 0.00019990983777860483, "loss": 0.1068, "step": 1091 }, { "epoch": 0.07059393939393939, "grad_norm": 0.0951557457447052, "learning_rate": 0.00019990954721110083, "loss": 0.0974, "step": 1092 }, { "epoch": 0.07065858585858587, "grad_norm": 0.1154366135597229, "learning_rate": 0.00019990925617635288, "loss": 0.0958, "step": 1093 }, { "epoch": 0.07072323232323233, "grad_norm": 0.16366399824619293, "learning_rate": 0.00019990896467436236, "loss": 0.1013, "step": 1094 }, { "epoch": 0.07078787878787879, "grad_norm": 0.10861239582300186, "learning_rate": 0.00019990867270513064, "loss": 0.1055, "step": 1095 }, { "epoch": 0.07085252525252525, "grad_norm": 0.0996587797999382, "learning_rate": 0.0001999083802686591, "loss": 0.0875, "step": 1096 }, { "epoch": 0.07091717171717171, "grad_norm": 0.27355629205703735, "learning_rate": 0.0001999080873649491, "loss": 0.1403, "step": 1097 }, { "epoch": 0.07098181818181819, "grad_norm": 0.10840930044651031, "learning_rate": 0.00019990779399400198, "loss": 0.103, "step": 1098 }, { "epoch": 0.07104646464646465, "grad_norm": 0.10754979401826859, "learning_rate": 0.00019990750015581914, "loss": 0.1061, "step": 1099 }, { "epoch": 0.07111111111111111, "grad_norm": 0.10941679775714874, "learning_rate": 0.00019990720585040197, "loss": 0.1095, "step": 1100 }, { "epoch": 0.07117575757575757, "grad_norm": 0.09383228421211243, "learning_rate": 0.00019990691107775184, "loss": 0.1, "step": 1101 }, { "epoch": 0.07124040404040403, "grad_norm": 0.10095507651567459, "learning_rate": 0.00019990661583787006, "loss": 0.0958, "step": 1102 }, { "epoch": 
0.07130505050505051, "grad_norm": 0.1256381869316101, "learning_rate": 0.0001999063201307581, "loss": 0.1067, "step": 1103 }, { "epoch": 0.07136969696969697, "grad_norm": 0.09743168950080872, "learning_rate": 0.0001999060239564173, "loss": 0.0973, "step": 1104 }, { "epoch": 0.07136969696969697, "eval_bleu": 14.919981827351906, "eval_loss": 0.10219694674015045, "eval_runtime": 2.7215, "eval_samples_per_second": 11.758, "eval_steps_per_second": 1.47, "step": 1104 }, { "epoch": 0.07143434343434343, "grad_norm": 0.09388572722673416, "learning_rate": 0.00019990572731484909, "loss": 0.0963, "step": 1105 }, { "epoch": 0.0714989898989899, "grad_norm": 0.08546385169029236, "learning_rate": 0.00019990543020605479, "loss": 0.0934, "step": 1106 }, { "epoch": 0.07156363636363636, "grad_norm": 0.09720250219106674, "learning_rate": 0.00019990513263003584, "loss": 0.0939, "step": 1107 }, { "epoch": 0.07162828282828283, "grad_norm": 0.1317378431558609, "learning_rate": 0.0001999048345867936, "loss": 0.1225, "step": 1108 }, { "epoch": 0.0716929292929293, "grad_norm": 0.08493581414222717, "learning_rate": 0.00019990453607632947, "loss": 0.0835, "step": 1109 }, { "epoch": 0.07175757575757576, "grad_norm": 0.11314329504966736, "learning_rate": 0.00019990423709864485, "loss": 0.1107, "step": 1110 }, { "epoch": 0.07182222222222222, "grad_norm": 0.08751076459884644, "learning_rate": 0.00019990393765374114, "loss": 0.0776, "step": 1111 }, { "epoch": 0.0718868686868687, "grad_norm": 0.11137157678604126, "learning_rate": 0.00019990363774161974, "loss": 0.1002, "step": 1112 }, { "epoch": 0.07195151515151516, "grad_norm": 0.11782591044902802, "learning_rate": 0.00019990333736228207, "loss": 0.0975, "step": 1113 }, { "epoch": 0.07201616161616162, "grad_norm": 0.10530432313680649, "learning_rate": 0.00019990303651572951, "loss": 0.091, "step": 1114 }, { "epoch": 0.07208080808080808, "grad_norm": 0.11624302715063095, "learning_rate": 0.00019990273520196348, "loss": 0.1116, "step": 1115 }, { 
"epoch": 0.07214545454545454, "grad_norm": 0.10502476990222931, "learning_rate": 0.0001999024334209854, "loss": 0.0945, "step": 1116 }, { "epoch": 0.07221010101010102, "grad_norm": 0.11220627278089523, "learning_rate": 0.00019990213117279665, "loss": 0.1011, "step": 1117 }, { "epoch": 0.07227474747474748, "grad_norm": 0.1221824437379837, "learning_rate": 0.00019990182845739865, "loss": 0.1201, "step": 1118 }, { "epoch": 0.07233939393939394, "grad_norm": 0.10572541505098343, "learning_rate": 0.00019990152527479284, "loss": 0.1233, "step": 1119 }, { "epoch": 0.0724040404040404, "grad_norm": 0.10655274987220764, "learning_rate": 0.00019990122162498062, "loss": 0.1308, "step": 1120 }, { "epoch": 0.0724040404040404, "eval_bleu": 13.866672753758753, "eval_loss": 0.10140915215015411, "eval_runtime": 2.6783, "eval_samples_per_second": 11.948, "eval_steps_per_second": 1.493, "step": 1120 }, { "epoch": 0.07246868686868686, "grad_norm": 0.08577724546194077, "learning_rate": 0.0001999009175079634, "loss": 0.0898, "step": 1121 }, { "epoch": 0.07253333333333334, "grad_norm": 0.08553668856620789, "learning_rate": 0.00019990061292374264, "loss": 0.0922, "step": 1122 }, { "epoch": 0.0725979797979798, "grad_norm": 0.09185883402824402, "learning_rate": 0.0001999003078723197, "loss": 0.0968, "step": 1123 }, { "epoch": 0.07266262626262626, "grad_norm": 0.10707148909568787, "learning_rate": 0.00019990000235369605, "loss": 0.1122, "step": 1124 }, { "epoch": 0.07272727272727272, "grad_norm": 0.09191740304231644, "learning_rate": 0.00019989969636787316, "loss": 0.1019, "step": 1125 }, { "epoch": 0.07279191919191919, "grad_norm": 0.08890045434236526, "learning_rate": 0.00019989938991485238, "loss": 0.0877, "step": 1126 }, { "epoch": 0.07285656565656566, "grad_norm": 0.11036644130945206, "learning_rate": 0.0001998990829946352, "loss": 0.108, "step": 1127 }, { "epoch": 0.07292121212121212, "grad_norm": 0.1215878278017044, "learning_rate": 0.00019989877560722304, "loss": 0.1214, "step": 1128 
}, { "epoch": 0.07298585858585858, "grad_norm": 0.11315099149942398, "learning_rate": 0.00019989846775261732, "loss": 0.1155, "step": 1129 }, { "epoch": 0.07305050505050505, "grad_norm": 0.10078810155391693, "learning_rate": 0.0001998981594308195, "loss": 0.0859, "step": 1130 }, { "epoch": 0.07311515151515152, "grad_norm": 0.12651541829109192, "learning_rate": 0.00019989785064183097, "loss": 0.1133, "step": 1131 }, { "epoch": 0.07317979797979798, "grad_norm": 0.13573110103607178, "learning_rate": 0.00019989754138565325, "loss": 0.1033, "step": 1132 }, { "epoch": 0.07324444444444445, "grad_norm": 0.10439104586839676, "learning_rate": 0.00019989723166228778, "loss": 0.103, "step": 1133 }, { "epoch": 0.07330909090909091, "grad_norm": 0.12181776016950607, "learning_rate": 0.00019989692147173596, "loss": 0.1228, "step": 1134 }, { "epoch": 0.07337373737373737, "grad_norm": 0.09804438054561615, "learning_rate": 0.00019989661081399926, "loss": 0.0918, "step": 1135 }, { "epoch": 0.07343838383838384, "grad_norm": 0.11062714457511902, "learning_rate": 0.00019989629968907913, "loss": 0.1051, "step": 1136 }, { "epoch": 0.07343838383838384, "eval_bleu": 13.482048021626605, "eval_loss": 0.10033686459064484, "eval_runtime": 2.772, "eval_samples_per_second": 11.544, "eval_steps_per_second": 1.443, "step": 1136 }, { "epoch": 0.0735030303030303, "grad_norm": 0.10200586915016174, "learning_rate": 0.00019989598809697705, "loss": 0.1001, "step": 1137 }, { "epoch": 0.07356767676767677, "grad_norm": 0.09004790335893631, "learning_rate": 0.00019989567603769445, "loss": 0.0858, "step": 1138 }, { "epoch": 0.07363232323232323, "grad_norm": 0.09158658981323242, "learning_rate": 0.0001998953635112328, "loss": 0.0987, "step": 1139 }, { "epoch": 0.07369696969696969, "grad_norm": 0.09695807844400406, "learning_rate": 0.00019989505051759356, "loss": 0.1097, "step": 1140 }, { "epoch": 0.07376161616161617, "grad_norm": 0.09070958197116852, "learning_rate": 0.00019989473705677818, "loss": 0.094, 
"step": 1141 }, { "epoch": 0.07382626262626263, "grad_norm": 0.1058708131313324, "learning_rate": 0.00019989442312878816, "loss": 0.0928, "step": 1142 }, { "epoch": 0.07389090909090909, "grad_norm": 0.1012599840760231, "learning_rate": 0.00019989410873362494, "loss": 0.091, "step": 1143 }, { "epoch": 0.07395555555555555, "grad_norm": 0.10348769277334213, "learning_rate": 0.00019989379387129, "loss": 0.0966, "step": 1144 }, { "epoch": 0.07402020202020201, "grad_norm": 0.08224254846572876, "learning_rate": 0.0001998934785417848, "loss": 0.0869, "step": 1145 }, { "epoch": 0.07408484848484849, "grad_norm": 0.12119830399751663, "learning_rate": 0.00019989316274511082, "loss": 0.1277, "step": 1146 }, { "epoch": 0.07414949494949495, "grad_norm": 0.14378836750984192, "learning_rate": 0.0001998928464812696, "loss": 0.1247, "step": 1147 }, { "epoch": 0.07421414141414141, "grad_norm": 0.08215869963169098, "learning_rate": 0.00019989252975026247, "loss": 0.0809, "step": 1148 }, { "epoch": 0.07427878787878787, "grad_norm": 0.1078762337565422, "learning_rate": 0.00019989221255209104, "loss": 0.1205, "step": 1149 }, { "epoch": 0.07434343434343435, "grad_norm": 0.10174203664064407, "learning_rate": 0.0001998918948867568, "loss": 0.1066, "step": 1150 }, { "epoch": 0.07440808080808081, "grad_norm": 0.12836866080760956, "learning_rate": 0.00019989157675426113, "loss": 0.1303, "step": 1151 }, { "epoch": 0.07447272727272727, "grad_norm": 0.10035769641399384, "learning_rate": 0.0001998912581546056, "loss": 0.106, "step": 1152 }, { "epoch": 0.07447272727272727, "eval_bleu": 15.257446961470782, "eval_loss": 0.10000753402709961, "eval_runtime": 2.649, "eval_samples_per_second": 12.08, "eval_steps_per_second": 1.51, "step": 1152 }, { "epoch": 0.07453737373737374, "grad_norm": 0.10295552015304565, "learning_rate": 0.00019989093908779167, "loss": 0.1117, "step": 1153 }, { "epoch": 0.0746020202020202, "grad_norm": 0.10999424755573273, "learning_rate": 0.00019989061955382086, "loss": 0.0892, 
"step": 1154 }, { "epoch": 0.07466666666666667, "grad_norm": 0.13196702301502228, "learning_rate": 0.00019989029955269465, "loss": 0.1094, "step": 1155 }, { "epoch": 0.07473131313131313, "grad_norm": 0.09767697006464005, "learning_rate": 0.0001998899790844145, "loss": 0.1097, "step": 1156 }, { "epoch": 0.0747959595959596, "grad_norm": 0.10190394520759583, "learning_rate": 0.00019988965814898198, "loss": 0.0882, "step": 1157 }, { "epoch": 0.07486060606060606, "grad_norm": 0.10192656517028809, "learning_rate": 0.00019988933674639853, "loss": 0.1029, "step": 1158 }, { "epoch": 0.07492525252525252, "grad_norm": 0.10996544361114502, "learning_rate": 0.00019988901487666568, "loss": 0.1043, "step": 1159 }, { "epoch": 0.074989898989899, "grad_norm": 0.11330875009298325, "learning_rate": 0.00019988869253978494, "loss": 0.1029, "step": 1160 }, { "epoch": 0.07505454545454546, "grad_norm": 0.11957574635744095, "learning_rate": 0.00019988836973575778, "loss": 0.103, "step": 1161 }, { "epoch": 0.07511919191919192, "grad_norm": 0.1305283159017563, "learning_rate": 0.00019988804646458577, "loss": 0.0986, "step": 1162 }, { "epoch": 0.07518383838383838, "grad_norm": 0.10644561052322388, "learning_rate": 0.00019988772272627037, "loss": 0.1002, "step": 1163 }, { "epoch": 0.07524848484848484, "grad_norm": 0.09223178774118423, "learning_rate": 0.00019988739852081308, "loss": 0.0899, "step": 1164 }, { "epoch": 0.07531313131313132, "grad_norm": 0.11759480088949203, "learning_rate": 0.0001998870738482155, "loss": 0.1, "step": 1165 }, { "epoch": 0.07537777777777778, "grad_norm": 0.08548406511545181, "learning_rate": 0.0001998867487084791, "loss": 0.0872, "step": 1166 }, { "epoch": 0.07544242424242424, "grad_norm": 0.10207363218069077, "learning_rate": 0.00019988642310160538, "loss": 0.1015, "step": 1167 }, { "epoch": 0.0755070707070707, "grad_norm": 0.11605361104011536, "learning_rate": 0.00019988609702759587, "loss": 0.1045, "step": 1168 }, { "epoch": 0.0755070707070707, "eval_bleu": 
11.914871423195398, "eval_loss": 0.09854108095169067, "eval_runtime": 2.7721, "eval_samples_per_second": 11.544, "eval_steps_per_second": 1.443, "step": 1168 }, { "epoch": 0.07557171717171716, "grad_norm": 0.12160315364599228, "learning_rate": 0.0001998857704864521, "loss": 0.1128, "step": 1169 }, { "epoch": 0.07563636363636364, "grad_norm": 0.08649091422557831, "learning_rate": 0.0001998854434781756, "loss": 0.0992, "step": 1170 }, { "epoch": 0.0757010101010101, "grad_norm": 0.09241735190153122, "learning_rate": 0.00019988511600276793, "loss": 0.0984, "step": 1171 }, { "epoch": 0.07576565656565656, "grad_norm": 0.09227190166711807, "learning_rate": 0.0001998847880602306, "loss": 0.1002, "step": 1172 }, { "epoch": 0.07583030303030303, "grad_norm": 0.09272097796201706, "learning_rate": 0.0001998844596505651, "loss": 0.0951, "step": 1173 }, { "epoch": 0.0758949494949495, "grad_norm": 0.09505018591880798, "learning_rate": 0.00019988413077377305, "loss": 0.1011, "step": 1174 }, { "epoch": 0.07595959595959596, "grad_norm": 0.08822530508041382, "learning_rate": 0.00019988380142985592, "loss": 0.0968, "step": 1175 }, { "epoch": 0.07602424242424242, "grad_norm": 0.12706732749938965, "learning_rate": 0.0001998834716188153, "loss": 0.1065, "step": 1176 }, { "epoch": 0.07608888888888889, "grad_norm": 0.09266221523284912, "learning_rate": 0.00019988314134065265, "loss": 0.097, "step": 1177 }, { "epoch": 0.07615353535353535, "grad_norm": 0.09255720674991608, "learning_rate": 0.00019988281059536958, "loss": 0.0822, "step": 1178 }, { "epoch": 0.07621818181818182, "grad_norm": 0.09482455253601074, "learning_rate": 0.00019988247938296764, "loss": 0.0998, "step": 1179 }, { "epoch": 0.07628282828282829, "grad_norm": 0.09541239589452744, "learning_rate": 0.00019988214770344834, "loss": 0.1011, "step": 1180 }, { "epoch": 0.07634747474747475, "grad_norm": 0.10172248631715775, "learning_rate": 0.0001998818155568133, "loss": 0.0932, "step": 1181 }, { "epoch": 0.07641212121212121, 
"grad_norm": 0.0817946046590805, "learning_rate": 0.00019988148294306402, "loss": 0.0823, "step": 1182 }, { "epoch": 0.07647676767676767, "grad_norm": 0.10149791091680527, "learning_rate": 0.00019988114986220205, "loss": 0.0867, "step": 1183 }, { "epoch": 0.07654141414141415, "grad_norm": 0.10169646888971329, "learning_rate": 0.00019988081631422896, "loss": 0.1007, "step": 1184 }, { "epoch": 0.07654141414141415, "eval_bleu": 13.500695208523451, "eval_loss": 0.10087230801582336, "eval_runtime": 2.5616, "eval_samples_per_second": 12.492, "eval_steps_per_second": 1.562, "step": 1184 }, { "epoch": 0.07660606060606061, "grad_norm": 0.0874757468700409, "learning_rate": 0.0001998804822991463, "loss": 0.0897, "step": 1185 }, { "epoch": 0.07667070707070707, "grad_norm": 0.0955202654004097, "learning_rate": 0.0001998801478169557, "loss": 0.0965, "step": 1186 }, { "epoch": 0.07673535353535353, "grad_norm": 0.09903624653816223, "learning_rate": 0.0001998798128676586, "loss": 0.099, "step": 1187 }, { "epoch": 0.0768, "grad_norm": 0.09248454123735428, "learning_rate": 0.00019987947745125665, "loss": 0.0802, "step": 1188 }, { "epoch": 0.07686464646464647, "grad_norm": 0.10043051838874817, "learning_rate": 0.00019987914156775142, "loss": 0.0948, "step": 1189 }, { "epoch": 0.07692929292929293, "grad_norm": 0.10855479538440704, "learning_rate": 0.0001998788052171445, "loss": 0.1128, "step": 1190 }, { "epoch": 0.07699393939393939, "grad_norm": 0.09830132126808167, "learning_rate": 0.00019987846839943736, "loss": 0.1057, "step": 1191 }, { "epoch": 0.07705858585858585, "grad_norm": 0.09701931476593018, "learning_rate": 0.00019987813111463164, "loss": 0.0995, "step": 1192 }, { "epoch": 0.07712323232323233, "grad_norm": 0.10256762057542801, "learning_rate": 0.00019987779336272897, "loss": 0.1072, "step": 1193 }, { "epoch": 0.07718787878787879, "grad_norm": 0.1158410981297493, "learning_rate": 0.00019987745514373086, "loss": 0.1247, "step": 1194 }, { "epoch": 0.07725252525252525, 
"grad_norm": 0.10586290806531906, "learning_rate": 0.0001998771164576389, "loss": 0.1074, "step": 1195 }, { "epoch": 0.07731717171717171, "grad_norm": 0.11251550912857056, "learning_rate": 0.0001998767773044547, "loss": 0.1164, "step": 1196 }, { "epoch": 0.07738181818181818, "grad_norm": 0.09646680951118469, "learning_rate": 0.00019987643768417984, "loss": 0.0859, "step": 1197 }, { "epoch": 0.07744646464646465, "grad_norm": 0.09115240722894669, "learning_rate": 0.00019987609759681586, "loss": 0.0931, "step": 1198 }, { "epoch": 0.07751111111111111, "grad_norm": 0.09376112371683121, "learning_rate": 0.00019987575704236442, "loss": 0.0883, "step": 1199 }, { "epoch": 0.07757575757575758, "grad_norm": 0.11306974291801453, "learning_rate": 0.0001998754160208271, "loss": 0.1093, "step": 1200 }, { "epoch": 0.07757575757575758, "eval_bleu": 12.277632424523997, "eval_loss": 0.10065107047557831, "eval_runtime": 2.7447, "eval_samples_per_second": 11.659, "eval_steps_per_second": 1.457, "step": 1200 }, { "epoch": 0.07764040404040404, "grad_norm": 0.09991087764501572, "learning_rate": 0.00019987507453220546, "loss": 0.1045, "step": 1201 }, { "epoch": 0.0777050505050505, "grad_norm": 0.0946974903345108, "learning_rate": 0.00019987473257650112, "loss": 0.0932, "step": 1202 }, { "epoch": 0.07776969696969697, "grad_norm": 0.10396780073642731, "learning_rate": 0.00019987439015371565, "loss": 0.1, "step": 1203 }, { "epoch": 0.07783434343434344, "grad_norm": 0.11985617130994797, "learning_rate": 0.0001998740472638507, "loss": 0.1178, "step": 1204 }, { "epoch": 0.0778989898989899, "grad_norm": 0.10046354681253433, "learning_rate": 0.00019987370390690782, "loss": 0.0969, "step": 1205 }, { "epoch": 0.07796363636363636, "grad_norm": 0.1067626103758812, "learning_rate": 0.00019987336008288868, "loss": 0.0967, "step": 1206 }, { "epoch": 0.07802828282828282, "grad_norm": 0.11509537696838379, "learning_rate": 0.00019987301579179484, "loss": 0.1144, "step": 1207 }, { "epoch": 
0.0780929292929293, "grad_norm": 0.10435677319765091, "learning_rate": 0.0001998726710336279, "loss": 0.1082, "step": 1208 }, { "epoch": 0.07815757575757576, "grad_norm": 0.10428476333618164, "learning_rate": 0.00019987232580838952, "loss": 0.1084, "step": 1209 }, { "epoch": 0.07822222222222222, "grad_norm": 0.1169833317399025, "learning_rate": 0.00019987198011608127, "loss": 0.1082, "step": 1210 }, { "epoch": 0.07828686868686868, "grad_norm": 0.11864268779754639, "learning_rate": 0.0001998716339567048, "loss": 0.1157, "step": 1211 }, { "epoch": 0.07835151515151516, "grad_norm": 0.11021723598241806, "learning_rate": 0.00019987128733026173, "loss": 0.1081, "step": 1212 }, { "epoch": 0.07841616161616162, "grad_norm": 0.10279525816440582, "learning_rate": 0.00019987094023675363, "loss": 0.1036, "step": 1213 }, { "epoch": 0.07848080808080808, "grad_norm": 0.10111741721630096, "learning_rate": 0.0001998705926761822, "loss": 0.0902, "step": 1214 }, { "epoch": 0.07854545454545454, "grad_norm": 0.09448467940092087, "learning_rate": 0.00019987024464854897, "loss": 0.0966, "step": 1215 }, { "epoch": 0.078610101010101, "grad_norm": 0.09504568576812744, "learning_rate": 0.00019986989615385567, "loss": 0.1017, "step": 1216 }, { "epoch": 0.078610101010101, "eval_bleu": 14.99782490957994, "eval_loss": 0.0987180843949318, "eval_runtime": 2.6258, "eval_samples_per_second": 12.187, "eval_steps_per_second": 1.523, "step": 1216 }, { "epoch": 0.07867474747474748, "grad_norm": 0.09722236543893814, "learning_rate": 0.00019986954719210386, "loss": 0.0873, "step": 1217 }, { "epoch": 0.07873939393939394, "grad_norm": 0.1531490683555603, "learning_rate": 0.0001998691977632952, "loss": 0.0999, "step": 1218 }, { "epoch": 0.0788040404040404, "grad_norm": 0.11008467525243759, "learning_rate": 0.0001998688478674313, "loss": 0.1231, "step": 1219 }, { "epoch": 0.07886868686868687, "grad_norm": 0.10244037955999374, "learning_rate": 0.00019986849750451387, "loss": 0.1175, "step": 1220 }, { "epoch": 
0.07893333333333333, "grad_norm": 0.09780837595462799, "learning_rate": 0.00019986814667454446, "loss": 0.106, "step": 1221 }, { "epoch": 0.0789979797979798, "grad_norm": 0.09185977280139923, "learning_rate": 0.00019986779537752472, "loss": 0.0999, "step": 1222 }, { "epoch": 0.07906262626262626, "grad_norm": 0.10435672104358673, "learning_rate": 0.00019986744361345636, "loss": 0.1092, "step": 1223 }, { "epoch": 0.07912727272727273, "grad_norm": 0.09911638498306274, "learning_rate": 0.00019986709138234093, "loss": 0.092, "step": 1224 }, { "epoch": 0.07919191919191919, "grad_norm": 0.08804962784051895, "learning_rate": 0.00019986673868418016, "loss": 0.0915, "step": 1225 }, { "epoch": 0.07925656565656565, "grad_norm": 0.09933444112539291, "learning_rate": 0.00019986638551897567, "loss": 0.1046, "step": 1226 }, { "epoch": 0.07932121212121213, "grad_norm": 0.10138771682977676, "learning_rate": 0.0001998660318867291, "loss": 0.0963, "step": 1227 }, { "epoch": 0.07938585858585859, "grad_norm": 0.1047845110297203, "learning_rate": 0.00019986567778744214, "loss": 0.1146, "step": 1228 }, { "epoch": 0.07945050505050505, "grad_norm": 0.1062421202659607, "learning_rate": 0.00019986532322111637, "loss": 0.0775, "step": 1229 }, { "epoch": 0.07951515151515151, "grad_norm": 0.09797623008489609, "learning_rate": 0.00019986496818775353, "loss": 0.087, "step": 1230 }, { "epoch": 0.07957979797979799, "grad_norm": 0.11880829185247421, "learning_rate": 0.00019986461268735526, "loss": 0.1046, "step": 1231 }, { "epoch": 0.07964444444444445, "grad_norm": 0.10233789682388306, "learning_rate": 0.0001998642567199232, "loss": 0.101, "step": 1232 }, { "epoch": 0.07964444444444445, "eval_bleu": 14.30630858105872, "eval_loss": 0.09911099076271057, "eval_runtime": 2.6934, "eval_samples_per_second": 11.881, "eval_steps_per_second": 1.485, "step": 1232 }, { "epoch": 0.07970909090909091, "grad_norm": 0.11157096922397614, "learning_rate": 0.00019986390028545902, "loss": 0.0879, "step": 1233 }, { 
"epoch": 0.07977373737373737, "grad_norm": 0.0990234911441803, "learning_rate": 0.0001998635433839644, "loss": 0.0952, "step": 1234 }, { "epoch": 0.07983838383838383, "grad_norm": 0.12317034602165222, "learning_rate": 0.000199863186015441, "loss": 0.1093, "step": 1235 }, { "epoch": 0.07990303030303031, "grad_norm": 0.11035049706697464, "learning_rate": 0.00019986282817989048, "loss": 0.1136, "step": 1236 }, { "epoch": 0.07996767676767677, "grad_norm": 0.08728849142789841, "learning_rate": 0.00019986246987731455, "loss": 0.0916, "step": 1237 }, { "epoch": 0.08003232323232323, "grad_norm": 0.10465919971466064, "learning_rate": 0.00019986211110771486, "loss": 0.1043, "step": 1238 }, { "epoch": 0.0800969696969697, "grad_norm": 0.11658327281475067, "learning_rate": 0.00019986175187109307, "loss": 0.1197, "step": 1239 }, { "epoch": 0.08016161616161616, "grad_norm": 0.0924682766199112, "learning_rate": 0.0001998613921674509, "loss": 0.093, "step": 1240 }, { "epoch": 0.08022626262626263, "grad_norm": 0.09621009975671768, "learning_rate": 0.00019986103199679, "loss": 0.0873, "step": 1241 }, { "epoch": 0.0802909090909091, "grad_norm": 0.10269203037023544, "learning_rate": 0.00019986067135911205, "loss": 0.1012, "step": 1242 }, { "epoch": 0.08035555555555556, "grad_norm": 0.1048085018992424, "learning_rate": 0.00019986031025441878, "loss": 0.107, "step": 1243 }, { "epoch": 0.08042020202020202, "grad_norm": 0.11960818618535995, "learning_rate": 0.00019985994868271185, "loss": 0.1058, "step": 1244 }, { "epoch": 0.08048484848484848, "grad_norm": 0.1293594241142273, "learning_rate": 0.00019985958664399294, "loss": 0.1104, "step": 1245 }, { "epoch": 0.08054949494949495, "grad_norm": 0.10614243894815445, "learning_rate": 0.00019985922413826376, "loss": 0.1, "step": 1246 }, { "epoch": 0.08061414141414142, "grad_norm": 0.09893834590911865, "learning_rate": 0.000199858861165526, "loss": 0.0981, "step": 1247 }, { "epoch": 0.08067878787878788, "grad_norm": 0.08690205961465836, 
"learning_rate": 0.00019985849772578138, "loss": 0.0757, "step": 1248 }, { "epoch": 0.08067878787878788, "eval_bleu": 10.116267882407662, "eval_loss": 0.09995153546333313, "eval_runtime": 2.6797, "eval_samples_per_second": 11.941, "eval_steps_per_second": 1.493, "step": 1248 }, { "epoch": 0.08074343434343434, "grad_norm": 0.11978496611118317, "learning_rate": 0.00019985813381903156, "loss": 0.1203, "step": 1249 }, { "epoch": 0.08080808080808081, "grad_norm": 0.10559460520744324, "learning_rate": 0.00019985776944527825, "loss": 0.1063, "step": 1250 }, { "epoch": 0.08087272727272728, "grad_norm": 0.09509187936782837, "learning_rate": 0.00019985740460452318, "loss": 0.104, "step": 1251 }, { "epoch": 0.08093737373737374, "grad_norm": 0.10089214891195297, "learning_rate": 0.00019985703929676808, "loss": 0.1007, "step": 1252 }, { "epoch": 0.0810020202020202, "grad_norm": 0.10321296006441116, "learning_rate": 0.00019985667352201455, "loss": 0.1085, "step": 1253 }, { "epoch": 0.08106666666666666, "grad_norm": 0.08723357319831848, "learning_rate": 0.00019985630728026438, "loss": 0.0891, "step": 1254 }, { "epoch": 0.08113131313131314, "grad_norm": 0.09296173602342606, "learning_rate": 0.00019985594057151933, "loss": 0.1047, "step": 1255 }, { "epoch": 0.0811959595959596, "grad_norm": 0.08660700917243958, "learning_rate": 0.000199855573395781, "loss": 0.0883, "step": 1256 }, { "epoch": 0.08126060606060606, "grad_norm": 0.10300800949335098, "learning_rate": 0.00019985520575305118, "loss": 0.1201, "step": 1257 }, { "epoch": 0.08132525252525252, "grad_norm": 0.09489543735980988, "learning_rate": 0.00019985483764333158, "loss": 0.1009, "step": 1258 }, { "epoch": 0.08138989898989898, "grad_norm": 0.0899474024772644, "learning_rate": 0.00019985446906662394, "loss": 0.0959, "step": 1259 }, { "epoch": 0.08145454545454546, "grad_norm": 0.10913682729005814, "learning_rate": 0.00019985410002292992, "loss": 0.1086, "step": 1260 }, { "epoch": 0.08151919191919192, "grad_norm": 
0.09105635434389114, "learning_rate": 0.0001998537305122513, "loss": 0.109, "step": 1261 }, { "epoch": 0.08158383838383838, "grad_norm": 0.08313827961683273, "learning_rate": 0.00019985336053458978, "loss": 0.0854, "step": 1262 }, { "epoch": 0.08164848484848485, "grad_norm": 0.10245882719755173, "learning_rate": 0.0001998529900899471, "loss": 0.0954, "step": 1263 }, { "epoch": 0.08171313131313131, "grad_norm": 0.11280017346143723, "learning_rate": 0.00019985261917832502, "loss": 0.1153, "step": 1264 }, { "epoch": 0.08171313131313131, "eval_bleu": 11.527914224001549, "eval_loss": 0.09990820288658142, "eval_runtime": 2.766, "eval_samples_per_second": 11.569, "eval_steps_per_second": 1.446, "step": 1264 }, { "epoch": 0.08177777777777778, "grad_norm": 0.10616433620452881, "learning_rate": 0.00019985224779972525, "loss": 0.101, "step": 1265 }, { "epoch": 0.08184242424242424, "grad_norm": 0.10047123581171036, "learning_rate": 0.00019985187595414954, "loss": 0.1006, "step": 1266 }, { "epoch": 0.0819070707070707, "grad_norm": 0.1606845110654831, "learning_rate": 0.00019985150364159958, "loss": 0.1179, "step": 1267 }, { "epoch": 0.08197171717171717, "grad_norm": 0.0824814885854721, "learning_rate": 0.00019985113086207716, "loss": 0.0735, "step": 1268 }, { "epoch": 0.08203636363636363, "grad_norm": 0.10125270485877991, "learning_rate": 0.00019985075761558402, "loss": 0.1048, "step": 1269 }, { "epoch": 0.0821010101010101, "grad_norm": 0.09428473562002182, "learning_rate": 0.00019985038390212188, "loss": 0.105, "step": 1270 }, { "epoch": 0.08216565656565657, "grad_norm": 0.09866265207529068, "learning_rate": 0.00019985000972169253, "loss": 0.0863, "step": 1271 }, { "epoch": 0.08223030303030303, "grad_norm": 0.09355733543634415, "learning_rate": 0.0001998496350742977, "loss": 0.0871, "step": 1272 }, { "epoch": 0.08229494949494949, "grad_norm": 0.10245216637849808, "learning_rate": 0.0001998492599599391, "loss": 0.1087, "step": 1273 }, { "epoch": 0.08235959595959597, 
"grad_norm": 0.09834680706262589, "learning_rate": 0.00019984888437861852, "loss": 0.1007, "step": 1274 }, { "epoch": 0.08242424242424243, "grad_norm": 0.08705113083124161, "learning_rate": 0.00019984850833033776, "loss": 0.096, "step": 1275 }, { "epoch": 0.08248888888888889, "grad_norm": 0.1057414636015892, "learning_rate": 0.0001998481318150985, "loss": 0.118, "step": 1276 }, { "epoch": 0.08255353535353535, "grad_norm": 0.08798019587993622, "learning_rate": 0.00019984775483290255, "loss": 0.0995, "step": 1277 }, { "epoch": 0.08261818181818181, "grad_norm": 0.08680333942174911, "learning_rate": 0.00019984737738375165, "loss": 0.0895, "step": 1278 }, { "epoch": 0.08268282828282829, "grad_norm": 0.12362273782491684, "learning_rate": 0.0001998469994676476, "loss": 0.1078, "step": 1279 }, { "epoch": 0.08274747474747475, "grad_norm": 0.10130532830953598, "learning_rate": 0.00019984662108459212, "loss": 0.098, "step": 1280 }, { "epoch": 0.08274747474747475, "eval_bleu": 9.529024371172472, "eval_loss": 0.09953488409519196, "eval_runtime": 2.6735, "eval_samples_per_second": 11.969, "eval_steps_per_second": 1.496, "step": 1280 }, { "epoch": 0.08281212121212121, "grad_norm": 0.10659637302160263, "learning_rate": 0.000199846242234587, "loss": 0.1015, "step": 1281 }, { "epoch": 0.08287676767676767, "grad_norm": 0.0832698866724968, "learning_rate": 0.000199845862917634, "loss": 0.0911, "step": 1282 }, { "epoch": 0.08294141414141414, "grad_norm": 0.11082912236452103, "learning_rate": 0.00019984548313373496, "loss": 0.1159, "step": 1283 }, { "epoch": 0.08300606060606061, "grad_norm": 0.10170575976371765, "learning_rate": 0.00019984510288289156, "loss": 0.1044, "step": 1284 }, { "epoch": 0.08307070707070707, "grad_norm": 0.0846438929438591, "learning_rate": 0.00019984472216510565, "loss": 0.0841, "step": 1285 }, { "epoch": 0.08313535353535353, "grad_norm": 0.10545945912599564, "learning_rate": 0.00019984434098037893, "loss": 0.0812, "step": 1286 }, { "epoch": 0.0832, "grad_norm": 
0.09282685816287994, "learning_rate": 0.00019984395932871326, "loss": 0.0962, "step": 1287 }, { "epoch": 0.08326464646464646, "grad_norm": 0.1083919107913971, "learning_rate": 0.00019984357721011041, "loss": 0.1098, "step": 1288 }, { "epoch": 0.08332929292929293, "grad_norm": 0.10446532070636749, "learning_rate": 0.00019984319462457216, "loss": 0.1109, "step": 1289 }, { "epoch": 0.0833939393939394, "grad_norm": 0.09216875582933426, "learning_rate": 0.0001998428115721003, "loss": 0.1045, "step": 1290 }, { "epoch": 0.08345858585858586, "grad_norm": 0.1036292091012001, "learning_rate": 0.00019984242805269663, "loss": 0.0988, "step": 1291 }, { "epoch": 0.08352323232323232, "grad_norm": 0.10119486600160599, "learning_rate": 0.0001998420440663629, "loss": 0.0965, "step": 1292 }, { "epoch": 0.0835878787878788, "grad_norm": 0.0946914479136467, "learning_rate": 0.00019984165961310096, "loss": 0.104, "step": 1293 }, { "epoch": 0.08365252525252526, "grad_norm": 0.08623967319726944, "learning_rate": 0.0001998412746929126, "loss": 0.0933, "step": 1294 }, { "epoch": 0.08371717171717172, "grad_norm": 0.08169128745794296, "learning_rate": 0.00019984088930579956, "loss": 0.0845, "step": 1295 }, { "epoch": 0.08378181818181818, "grad_norm": 0.09612638503313065, "learning_rate": 0.0001998405034517637, "loss": 0.105, "step": 1296 }, { "epoch": 0.08378181818181818, "eval_bleu": 13.001526120588284, "eval_loss": 0.0968184769153595, "eval_runtime": 2.6676, "eval_samples_per_second": 11.996, "eval_steps_per_second": 1.499, "step": 1296 }, { "epoch": 0.08384646464646464, "grad_norm": 0.09524210542440414, "learning_rate": 0.0001998401171308068, "loss": 0.0943, "step": 1297 }, { "epoch": 0.08391111111111112, "grad_norm": 0.09286917001008987, "learning_rate": 0.0001998397303429307, "loss": 0.0954, "step": 1298 }, { "epoch": 0.08397575757575758, "grad_norm": 0.11236073076725006, "learning_rate": 0.00019983934308813721, "loss": 0.1064, "step": 1299 }, { "epoch": 0.08404040404040404, "grad_norm": 
0.08568739891052246, "learning_rate": 0.00019983895536642806, "loss": 0.0824, "step": 1300 }, { "epoch": 0.0841050505050505, "grad_norm": 0.10929793119430542, "learning_rate": 0.0001998385671778052, "loss": 0.0905, "step": 1301 }, { "epoch": 0.08416969696969696, "grad_norm": 0.08664591610431671, "learning_rate": 0.00019983817852227032, "loss": 0.0847, "step": 1302 }, { "epoch": 0.08423434343434344, "grad_norm": 0.09793335944414139, "learning_rate": 0.00019983778939982528, "loss": 0.0921, "step": 1303 }, { "epoch": 0.0842989898989899, "grad_norm": 0.08782773464918137, "learning_rate": 0.00019983739981047188, "loss": 0.0834, "step": 1304 }, { "epoch": 0.08436363636363636, "grad_norm": 0.09974594414234161, "learning_rate": 0.00019983700975421202, "loss": 0.0961, "step": 1305 }, { "epoch": 0.08442828282828282, "grad_norm": 0.09977371245622635, "learning_rate": 0.00019983661923104746, "loss": 0.0979, "step": 1306 }, { "epoch": 0.08449292929292929, "grad_norm": 0.10015544295310974, "learning_rate": 0.00019983622824098002, "loss": 0.0984, "step": 1307 }, { "epoch": 0.08455757575757576, "grad_norm": 0.1517401784658432, "learning_rate": 0.00019983583678401153, "loss": 0.1205, "step": 1308 }, { "epoch": 0.08462222222222222, "grad_norm": 0.09720490127801895, "learning_rate": 0.00019983544486014388, "loss": 0.1031, "step": 1309 }, { "epoch": 0.08468686868686869, "grad_norm": 0.08698497712612152, "learning_rate": 0.00019983505246937884, "loss": 0.0989, "step": 1310 }, { "epoch": 0.08475151515151515, "grad_norm": 0.09529600292444229, "learning_rate": 0.00019983465961171824, "loss": 0.0981, "step": 1311 }, { "epoch": 0.08481616161616162, "grad_norm": 0.10238347202539444, "learning_rate": 0.000199834266287164, "loss": 0.0958, "step": 1312 }, { "epoch": 0.08481616161616162, "eval_bleu": 13.969026288273707, "eval_loss": 0.09935599565505981, "eval_runtime": 2.6161, "eval_samples_per_second": 12.232, "eval_steps_per_second": 1.529, "step": 1312 }, { "epoch": 0.08488080808080808, 
"grad_norm": 0.09444960951805115, "learning_rate": 0.00019983387249571785, "loss": 0.096, "step": 1313 }, { "epoch": 0.08494545454545455, "grad_norm": 0.1125500276684761, "learning_rate": 0.0001998334782373817, "loss": 0.097, "step": 1314 }, { "epoch": 0.08501010101010101, "grad_norm": 0.10057497769594193, "learning_rate": 0.0001998330835121574, "loss": 0.1104, "step": 1315 }, { "epoch": 0.08507474747474747, "grad_norm": 0.1008632481098175, "learning_rate": 0.0001998326883200467, "loss": 0.0958, "step": 1316 }, { "epoch": 0.08513939393939395, "grad_norm": 0.08887092769145966, "learning_rate": 0.00019983229266105158, "loss": 0.0836, "step": 1317 }, { "epoch": 0.08520404040404041, "grad_norm": 0.1062830463051796, "learning_rate": 0.00019983189653517385, "loss": 0.109, "step": 1318 }, { "epoch": 0.08526868686868687, "grad_norm": 0.10037776082754135, "learning_rate": 0.0001998314999424153, "loss": 0.0939, "step": 1319 }, { "epoch": 0.08533333333333333, "grad_norm": 0.08978555351495743, "learning_rate": 0.00019983110288277785, "loss": 0.0857, "step": 1320 }, { "epoch": 0.08539797979797979, "grad_norm": 0.09538564085960388, "learning_rate": 0.00019983070535626332, "loss": 0.1015, "step": 1321 }, { "epoch": 0.08546262626262627, "grad_norm": 0.10329007357358932, "learning_rate": 0.0001998303073628736, "loss": 0.1108, "step": 1322 }, { "epoch": 0.08552727272727273, "grad_norm": 0.12846879661083221, "learning_rate": 0.0001998299089026105, "loss": 0.1332, "step": 1323 }, { "epoch": 0.08559191919191919, "grad_norm": 0.08949021995067596, "learning_rate": 0.000199829509975476, "loss": 0.0994, "step": 1324 }, { "epoch": 0.08565656565656565, "grad_norm": 0.09153349697589874, "learning_rate": 0.0001998291105814718, "loss": 0.0952, "step": 1325 }, { "epoch": 0.08572121212121211, "grad_norm": 0.11797761172056198, "learning_rate": 0.00019982871072059987, "loss": 0.0915, "step": 1326 }, { "epoch": 0.08578585858585859, "grad_norm": 0.09567425400018692, "learning_rate": 
0.0001998283103928621, "loss": 0.1055, "step": 1327 }, { "epoch": 0.08585050505050505, "grad_norm": 0.15089303255081177, "learning_rate": 0.0001998279095982603, "loss": 0.0975, "step": 1328 }, { "epoch": 0.08585050505050505, "eval_bleu": 12.865755999664628, "eval_loss": 0.09942425042390823, "eval_runtime": 2.7438, "eval_samples_per_second": 11.663, "eval_steps_per_second": 1.458, "step": 1328 }, { "epoch": 0.08591515151515151, "grad_norm": 0.11189599335193634, "learning_rate": 0.00019982750833679637, "loss": 0.1214, "step": 1329 }, { "epoch": 0.08597979797979798, "grad_norm": 0.09316188097000122, "learning_rate": 0.00019982710660847218, "loss": 0.0938, "step": 1330 }, { "epoch": 0.08604444444444445, "grad_norm": 0.11655101925134659, "learning_rate": 0.00019982670441328964, "loss": 0.1083, "step": 1331 }, { "epoch": 0.08610909090909091, "grad_norm": 0.13540121912956238, "learning_rate": 0.00019982630175125057, "loss": 0.1236, "step": 1332 }, { "epoch": 0.08617373737373737, "grad_norm": 0.08572167158126831, "learning_rate": 0.0001998258986223569, "loss": 0.0759, "step": 1333 }, { "epoch": 0.08623838383838384, "grad_norm": 0.11115605384111404, "learning_rate": 0.00019982549502661052, "loss": 0.1032, "step": 1334 }, { "epoch": 0.0863030303030303, "grad_norm": 0.10030196607112885, "learning_rate": 0.00019982509096401328, "loss": 0.1033, "step": 1335 }, { "epoch": 0.08636767676767677, "grad_norm": 0.10309617221355438, "learning_rate": 0.00019982468643456712, "loss": 0.1124, "step": 1336 }, { "epoch": 0.08643232323232324, "grad_norm": 0.10604587197303772, "learning_rate": 0.00019982428143827387, "loss": 0.1063, "step": 1337 }, { "epoch": 0.0864969696969697, "grad_norm": 0.08428341895341873, "learning_rate": 0.0001998238759751355, "loss": 0.0789, "step": 1338 }, { "epoch": 0.08656161616161616, "grad_norm": 0.0930686891078949, "learning_rate": 0.00019982347004515383, "loss": 0.104, "step": 1339 }, { "epoch": 0.08662626262626262, "grad_norm": 0.09911807626485825, 
"learning_rate": 0.0001998230636483308, "loss": 0.0996, "step": 1340 }, { "epoch": 0.0866909090909091, "grad_norm": 0.10296230018138885, "learning_rate": 0.0001998226567846683, "loss": 0.1006, "step": 1341 }, { "epoch": 0.08675555555555556, "grad_norm": 0.10285492986440659, "learning_rate": 0.0001998222494541682, "loss": 0.1122, "step": 1342 }, { "epoch": 0.08682020202020202, "grad_norm": 0.09906865656375885, "learning_rate": 0.00019982184165683248, "loss": 0.0887, "step": 1343 }, { "epoch": 0.08688484848484848, "grad_norm": 0.10971717536449432, "learning_rate": 0.000199821433392663, "loss": 0.1056, "step": 1344 }, { "epoch": 0.08688484848484848, "eval_bleu": 11.484851180167276, "eval_loss": 0.09886029362678528, "eval_runtime": 2.6457, "eval_samples_per_second": 12.095, "eval_steps_per_second": 1.512, "step": 1344 }, { "epoch": 0.08694949494949494, "grad_norm": 0.08866874128580093, "learning_rate": 0.0001998210246616617, "loss": 0.0882, "step": 1345 }, { "epoch": 0.08701414141414142, "grad_norm": 0.09244007617235184, "learning_rate": 0.00019982061546383042, "loss": 0.0932, "step": 1346 }, { "epoch": 0.08707878787878788, "grad_norm": 0.08623712509870529, "learning_rate": 0.00019982020579917116, "loss": 0.0873, "step": 1347 }, { "epoch": 0.08714343434343434, "grad_norm": 0.1054779589176178, "learning_rate": 0.0001998197956676858, "loss": 0.1143, "step": 1348 }, { "epoch": 0.0872080808080808, "grad_norm": 0.0864868238568306, "learning_rate": 0.00019981938506937624, "loss": 0.0946, "step": 1349 }, { "epoch": 0.08727272727272728, "grad_norm": 0.20312117040157318, "learning_rate": 0.00019981897400424438, "loss": 0.119, "step": 1350 }, { "epoch": 0.08733737373737374, "grad_norm": 0.09270636737346649, "learning_rate": 0.00019981856247229223, "loss": 0.1021, "step": 1351 }, { "epoch": 0.0874020202020202, "grad_norm": 0.11795508116483688, "learning_rate": 0.00019981815047352165, "loss": 0.1129, "step": 1352 }, { "epoch": 0.08746666666666666, "grad_norm": 0.08348415046930313, 
"learning_rate": 0.00019981773800793458, "loss": 0.0867, "step": 1353 }, { "epoch": 0.08753131313131313, "grad_norm": 0.10120968520641327, "learning_rate": 0.00019981732507553295, "loss": 0.1116, "step": 1354 }, { "epoch": 0.0875959595959596, "grad_norm": 0.09838369488716125, "learning_rate": 0.00019981691167631865, "loss": 0.1061, "step": 1355 }, { "epoch": 0.08766060606060606, "grad_norm": 0.0971892848610878, "learning_rate": 0.0001998164978102937, "loss": 0.1132, "step": 1356 }, { "epoch": 0.08772525252525253, "grad_norm": 0.07860702276229858, "learning_rate": 0.00019981608347745998, "loss": 0.0875, "step": 1357 }, { "epoch": 0.08778989898989899, "grad_norm": 0.09088286757469177, "learning_rate": 0.0001998156686778194, "loss": 0.0924, "step": 1358 }, { "epoch": 0.08785454545454545, "grad_norm": 0.08988488465547562, "learning_rate": 0.000199815253411374, "loss": 0.0954, "step": 1359 }, { "epoch": 0.08791919191919192, "grad_norm": 0.09467092156410217, "learning_rate": 0.0001998148376781256, "loss": 0.086, "step": 1360 }, { "epoch": 0.08791919191919192, "eval_bleu": 12.53385025553159, "eval_loss": 0.09877762943506241, "eval_runtime": 2.8351, "eval_samples_per_second": 11.287, "eval_steps_per_second": 1.411, "step": 1360 }, { "epoch": 0.08798383838383839, "grad_norm": 0.09027474373579025, "learning_rate": 0.00019981442147807624, "loss": 0.0891, "step": 1361 }, { "epoch": 0.08804848484848485, "grad_norm": 0.12696439027786255, "learning_rate": 0.00019981400481122784, "loss": 0.1192, "step": 1362 }, { "epoch": 0.08811313131313131, "grad_norm": 0.10165084153413773, "learning_rate": 0.00019981358767758232, "loss": 0.0935, "step": 1363 }, { "epoch": 0.08817777777777777, "grad_norm": 0.09301851689815521, "learning_rate": 0.00019981317007714163, "loss": 0.1, "step": 1364 }, { "epoch": 0.08824242424242425, "grad_norm": 0.09047587215900421, "learning_rate": 0.00019981275200990775, "loss": 0.0918, "step": 1365 }, { "epoch": 0.08830707070707071, "grad_norm": 
0.09839123487472534, "learning_rate": 0.00019981233347588263, "loss": 0.1012, "step": 1366 }, { "epoch": 0.08837171717171717, "grad_norm": 0.1176568940281868, "learning_rate": 0.00019981191447506822, "loss": 0.1286, "step": 1367 }, { "epoch": 0.08843636363636363, "grad_norm": 0.10879629105329514, "learning_rate": 0.0001998114950074665, "loss": 0.1052, "step": 1368 }, { "epoch": 0.0885010101010101, "grad_norm": 0.09475315362215042, "learning_rate": 0.00019981107507307936, "loss": 0.0999, "step": 1369 }, { "epoch": 0.08856565656565657, "grad_norm": 0.27396082878112793, "learning_rate": 0.00019981065467190886, "loss": 0.1161, "step": 1370 }, { "epoch": 0.08863030303030303, "grad_norm": 0.18335482478141785, "learning_rate": 0.00019981023380395696, "loss": 0.1163, "step": 1371 }, { "epoch": 0.0886949494949495, "grad_norm": 0.13550560176372528, "learning_rate": 0.00019980981246922553, "loss": 0.1259, "step": 1372 }, { "epoch": 0.08875959595959595, "grad_norm": 0.09698712825775146, "learning_rate": 0.00019980939066771664, "loss": 0.0964, "step": 1373 }, { "epoch": 0.08882424242424243, "grad_norm": 0.10698266327381134, "learning_rate": 0.00019980896839943223, "loss": 0.0932, "step": 1374 }, { "epoch": 0.08888888888888889, "grad_norm": 0.09666618704795837, "learning_rate": 0.00019980854566437425, "loss": 0.1002, "step": 1375 }, { "epoch": 0.08895353535353535, "grad_norm": 0.10566040873527527, "learning_rate": 0.0001998081224625447, "loss": 0.1023, "step": 1376 }, { "epoch": 0.08895353535353535, "eval_bleu": 12.276032833228713, "eval_loss": 0.09839345514774323, "eval_runtime": 2.6591, "eval_samples_per_second": 12.034, "eval_steps_per_second": 1.504, "step": 1376 }, { "epoch": 0.08901818181818182, "grad_norm": 0.10848584771156311, "learning_rate": 0.0001998076987939456, "loss": 0.1169, "step": 1377 }, { "epoch": 0.08908282828282828, "grad_norm": 0.11011257022619247, "learning_rate": 0.00019980727465857882, "loss": 0.0948, "step": 1378 }, { "epoch": 0.08914747474747475, 
"grad_norm": 0.08588409423828125, "learning_rate": 0.00019980685005644645, "loss": 0.0915, "step": 1379 }, { "epoch": 0.08921212121212121, "grad_norm": 0.11844003945589066, "learning_rate": 0.00019980642498755043, "loss": 0.1217, "step": 1380 }, { "epoch": 0.08927676767676768, "grad_norm": 0.09860694408416748, "learning_rate": 0.00019980599945189275, "loss": 0.109, "step": 1381 }, { "epoch": 0.08934141414141414, "grad_norm": 0.10013259947299957, "learning_rate": 0.00019980557344947543, "loss": 0.0992, "step": 1382 }, { "epoch": 0.0894060606060606, "grad_norm": 0.09079885482788086, "learning_rate": 0.0001998051469803004, "loss": 0.0984, "step": 1383 }, { "epoch": 0.08947070707070708, "grad_norm": 0.0816333070397377, "learning_rate": 0.00019980472004436974, "loss": 0.0826, "step": 1384 }, { "epoch": 0.08953535353535354, "grad_norm": 0.08877380937337875, "learning_rate": 0.00019980429264168534, "loss": 0.087, "step": 1385 }, { "epoch": 0.0896, "grad_norm": 0.09078330546617508, "learning_rate": 0.0001998038647722493, "loss": 0.1023, "step": 1386 }, { "epoch": 0.08966464646464646, "grad_norm": 0.09360656142234802, "learning_rate": 0.00019980343643606358, "loss": 0.0952, "step": 1387 }, { "epoch": 0.08972929292929292, "grad_norm": 0.08943704515695572, "learning_rate": 0.00019980300763313018, "loss": 0.09, "step": 1388 }, { "epoch": 0.0897939393939394, "grad_norm": 0.08633750677108765, "learning_rate": 0.00019980257836345108, "loss": 0.0948, "step": 1389 }, { "epoch": 0.08985858585858586, "grad_norm": 0.10041945427656174, "learning_rate": 0.00019980214862702836, "loss": 0.1075, "step": 1390 }, { "epoch": 0.08992323232323232, "grad_norm": 0.10538869351148605, "learning_rate": 0.00019980171842386396, "loss": 0.0805, "step": 1391 }, { "epoch": 0.08998787878787878, "grad_norm": 0.09101137518882751, "learning_rate": 0.0001998012877539599, "loss": 0.1053, "step": 1392 }, { "epoch": 0.08998787878787878, "eval_bleu": 12.953432942015944, "eval_loss": 0.09817688167095184, 
"eval_runtime": 2.6914, "eval_samples_per_second": 11.89, "eval_steps_per_second": 1.486, "step": 1392 }, { "epoch": 0.09005252525252526, "grad_norm": 0.1076117604970932, "learning_rate": 0.00019980085661731823, "loss": 0.1028, "step": 1393 }, { "epoch": 0.09011717171717172, "grad_norm": 0.10198492556810379, "learning_rate": 0.00019980042501394093, "loss": 0.0919, "step": 1394 }, { "epoch": 0.09018181818181818, "grad_norm": 0.08665958046913147, "learning_rate": 0.00019979999294383007, "loss": 0.0877, "step": 1395 }, { "epoch": 0.09024646464646464, "grad_norm": 0.0990280881524086, "learning_rate": 0.0001997995604069876, "loss": 0.1077, "step": 1396 }, { "epoch": 0.0903111111111111, "grad_norm": 0.09020534157752991, "learning_rate": 0.00019979912740341563, "loss": 0.0835, "step": 1397 }, { "epoch": 0.09037575757575758, "grad_norm": 0.09370144456624985, "learning_rate": 0.00019979869393311607, "loss": 0.0938, "step": 1398 }, { "epoch": 0.09044040404040404, "grad_norm": 0.09408402442932129, "learning_rate": 0.00019979825999609104, "loss": 0.0879, "step": 1399 }, { "epoch": 0.0905050505050505, "grad_norm": 0.09311624616384506, "learning_rate": 0.00019979782559234255, "loss": 0.0871, "step": 1400 }, { "epoch": 0.09056969696969697, "grad_norm": 0.08725301176309586, "learning_rate": 0.0001997973907218726, "loss": 0.079, "step": 1401 }, { "epoch": 0.09063434343434343, "grad_norm": 0.13033892214298248, "learning_rate": 0.00019979695538468328, "loss": 0.1025, "step": 1402 }, { "epoch": 0.0906989898989899, "grad_norm": 0.099240243434906, "learning_rate": 0.00019979651958077657, "loss": 0.0974, "step": 1403 }, { "epoch": 0.09076363636363637, "grad_norm": 0.11006799340248108, "learning_rate": 0.0001997960833101545, "loss": 0.1192, "step": 1404 }, { "epoch": 0.09082828282828283, "grad_norm": 0.17573943734169006, "learning_rate": 0.0001997956465728192, "loss": 0.0939, "step": 1405 }, { "epoch": 0.09089292929292929, "grad_norm": 0.12288129329681396, "learning_rate": 
0.00019979520936877262, "loss": 0.0874, "step": 1406 }, { "epoch": 0.09095757575757575, "grad_norm": 0.08029817044734955, "learning_rate": 0.00019979477169801684, "loss": 0.0826, "step": 1407 }, { "epoch": 0.09102222222222223, "grad_norm": 0.1330159604549408, "learning_rate": 0.0001997943335605539, "loss": 0.118, "step": 1408 }, { "epoch": 0.09102222222222223, "eval_bleu": 13.737769396611698, "eval_loss": 0.09946625679731369, "eval_runtime": 2.63, "eval_samples_per_second": 12.167, "eval_steps_per_second": 1.521, "step": 1408 }, { "epoch": 0.09108686868686869, "grad_norm": 0.09597237408161163, "learning_rate": 0.00019979389495638589, "loss": 0.0965, "step": 1409 }, { "epoch": 0.09115151515151515, "grad_norm": 0.09238039702177048, "learning_rate": 0.00019979345588551478, "loss": 0.0892, "step": 1410 }, { "epoch": 0.09121616161616161, "grad_norm": 0.09983370453119278, "learning_rate": 0.00019979301634794267, "loss": 0.0837, "step": 1411 }, { "epoch": 0.09128080808080809, "grad_norm": 0.09874019026756287, "learning_rate": 0.00019979257634367161, "loss": 0.0992, "step": 1412 }, { "epoch": 0.09134545454545455, "grad_norm": 0.08935726433992386, "learning_rate": 0.0001997921358727037, "loss": 0.0873, "step": 1413 }, { "epoch": 0.09141010101010101, "grad_norm": 0.11794383823871613, "learning_rate": 0.00019979169493504093, "loss": 0.0959, "step": 1414 }, { "epoch": 0.09147474747474747, "grad_norm": 0.08632034063339233, "learning_rate": 0.0001997912535306854, "loss": 0.0862, "step": 1415 }, { "epoch": 0.09153939393939393, "grad_norm": 0.08654046058654785, "learning_rate": 0.00019979081165963917, "loss": 0.105, "step": 1416 }, { "epoch": 0.09160404040404041, "grad_norm": 0.08890501409769058, "learning_rate": 0.0001997903693219043, "loss": 0.0945, "step": 1417 }, { "epoch": 0.09166868686868687, "grad_norm": 0.09299993515014648, "learning_rate": 0.00019978992651748287, "loss": 0.0934, "step": 1418 }, { "epoch": 0.09173333333333333, "grad_norm": 0.09371999651193619, 
"learning_rate": 0.0001997894832463769, "loss": 0.1027, "step": 1419 }, { "epoch": 0.0917979797979798, "grad_norm": 0.097237229347229, "learning_rate": 0.0001997890395085886, "loss": 0.0915, "step": 1420 }, { "epoch": 0.09186262626262626, "grad_norm": 0.11031143367290497, "learning_rate": 0.00019978859530411986, "loss": 0.1069, "step": 1421 }, { "epoch": 0.09192727272727273, "grad_norm": 0.0882914811372757, "learning_rate": 0.0001997881506329729, "loss": 0.0902, "step": 1422 }, { "epoch": 0.0919919191919192, "grad_norm": 0.09184537082910538, "learning_rate": 0.00019978770549514973, "loss": 0.1043, "step": 1423 }, { "epoch": 0.09205656565656566, "grad_norm": 0.09339790046215057, "learning_rate": 0.00019978725989065245, "loss": 0.1136, "step": 1424 }, { "epoch": 0.09205656565656566, "eval_bleu": 12.207677834612236, "eval_loss": 0.09881220012903214, "eval_runtime": 2.7918, "eval_samples_per_second": 11.462, "eval_steps_per_second": 1.433, "step": 1424 }, { "epoch": 0.09212121212121212, "grad_norm": 0.12844525277614594, "learning_rate": 0.00019978681381948316, "loss": 0.1296, "step": 1425 }, { "epoch": 0.09218585858585858, "grad_norm": 0.1130497008562088, "learning_rate": 0.00019978636728164393, "loss": 0.1234, "step": 1426 }, { "epoch": 0.09225050505050506, "grad_norm": 0.10930045694112778, "learning_rate": 0.00019978592027713682, "loss": 0.099, "step": 1427 }, { "epoch": 0.09231515151515152, "grad_norm": 0.0970892682671547, "learning_rate": 0.000199785472805964, "loss": 0.0941, "step": 1428 }, { "epoch": 0.09237979797979798, "grad_norm": 0.09091490507125854, "learning_rate": 0.00019978502486812748, "loss": 0.099, "step": 1429 }, { "epoch": 0.09244444444444444, "grad_norm": 0.08599131554365158, "learning_rate": 0.00019978457646362938, "loss": 0.0876, "step": 1430 }, { "epoch": 0.09250909090909092, "grad_norm": 0.08314812928438187, "learning_rate": 0.0001997841275924718, "loss": 0.085, "step": 1431 }, { "epoch": 0.09257373737373738, "grad_norm": 0.08717819303274155, 
"learning_rate": 0.00019978367825465687, "loss": 0.0965, "step": 1432 }, { "epoch": 0.09263838383838384, "grad_norm": 0.09867282211780548, "learning_rate": 0.00019978322845018665, "loss": 0.1104, "step": 1433 }, { "epoch": 0.0927030303030303, "grad_norm": 0.09576225280761719, "learning_rate": 0.00019978277817906325, "loss": 0.1086, "step": 1434 }, { "epoch": 0.09276767676767676, "grad_norm": 0.09598365426063538, "learning_rate": 0.00019978232744128878, "loss": 0.0942, "step": 1435 }, { "epoch": 0.09283232323232324, "grad_norm": 0.1044580489397049, "learning_rate": 0.00019978187623686538, "loss": 0.1113, "step": 1436 }, { "epoch": 0.0928969696969697, "grad_norm": 0.09776109457015991, "learning_rate": 0.00019978142456579512, "loss": 0.1001, "step": 1437 }, { "epoch": 0.09296161616161616, "grad_norm": 0.08872736245393753, "learning_rate": 0.0001997809724280801, "loss": 0.0815, "step": 1438 }, { "epoch": 0.09302626262626262, "grad_norm": 0.09161276370286942, "learning_rate": 0.0001997805198237225, "loss": 0.0859, "step": 1439 }, { "epoch": 0.09309090909090909, "grad_norm": 0.10025312751531601, "learning_rate": 0.00019978006675272435, "loss": 0.1076, "step": 1440 }, { "epoch": 0.09309090909090909, "eval_bleu": 11.38449438389522, "eval_loss": 0.09994243830442429, "eval_runtime": 2.7267, "eval_samples_per_second": 11.736, "eval_steps_per_second": 1.467, "step": 1440 }, { "epoch": 0.09315555555555556, "grad_norm": 0.09451809525489807, "learning_rate": 0.00019977961321508787, "loss": 0.1075, "step": 1441 }, { "epoch": 0.09322020202020202, "grad_norm": 0.09404416382312775, "learning_rate": 0.0001997791592108151, "loss": 0.0815, "step": 1442 }, { "epoch": 0.09328484848484848, "grad_norm": 0.09629783034324646, "learning_rate": 0.00019977870473990818, "loss": 0.102, "step": 1443 }, { "epoch": 0.09334949494949495, "grad_norm": 0.08409852534532547, "learning_rate": 0.00019977824980236926, "loss": 0.0852, "step": 1444 }, { "epoch": 0.09341414141414141, "grad_norm": 
0.09894314408302307, "learning_rate": 0.00019977779439820043, "loss": 0.0957, "step": 1445 }, { "epoch": 0.09347878787878788, "grad_norm": 0.09826365858316422, "learning_rate": 0.00019977733852740386, "loss": 0.1001, "step": 1446 }, { "epoch": 0.09354343434343435, "grad_norm": 0.1059250757098198, "learning_rate": 0.00019977688218998166, "loss": 0.1067, "step": 1447 }, { "epoch": 0.0936080808080808, "grad_norm": 0.08950673788785934, "learning_rate": 0.00019977642538593595, "loss": 0.09, "step": 1448 }, { "epoch": 0.09367272727272727, "grad_norm": 0.12125827372074127, "learning_rate": 0.00019977596811526888, "loss": 0.0718, "step": 1449 }, { "epoch": 0.09373737373737374, "grad_norm": 0.10540442168712616, "learning_rate": 0.00019977551037798262, "loss": 0.1089, "step": 1450 }, { "epoch": 0.0938020202020202, "grad_norm": 0.09844257682561874, "learning_rate": 0.00019977505217407928, "loss": 0.1054, "step": 1451 }, { "epoch": 0.09386666666666667, "grad_norm": 0.09119300544261932, "learning_rate": 0.000199774593503561, "loss": 0.1038, "step": 1452 }, { "epoch": 0.09393131313131313, "grad_norm": 0.09256444126367569, "learning_rate": 0.00019977413436642993, "loss": 0.1069, "step": 1453 }, { "epoch": 0.09399595959595959, "grad_norm": 0.26908043026924133, "learning_rate": 0.0001997736747626882, "loss": 0.0795, "step": 1454 }, { "epoch": 0.09406060606060607, "grad_norm": 0.10583196580410004, "learning_rate": 0.00019977321469233798, "loss": 0.0907, "step": 1455 }, { "epoch": 0.09412525252525253, "grad_norm": 0.10335783660411835, "learning_rate": 0.00019977275415538145, "loss": 0.0934, "step": 1456 }, { "epoch": 0.09412525252525253, "eval_bleu": 14.092461605337165, "eval_loss": 0.10060583800077438, "eval_runtime": 2.8017, "eval_samples_per_second": 11.422, "eval_steps_per_second": 1.428, "step": 1456 }, { "epoch": 0.09418989898989899, "grad_norm": 0.09173998236656189, "learning_rate": 0.00019977229315182071, "loss": 0.0956, "step": 1457 }, { "epoch": 0.09425454545454545, 
"grad_norm": 0.11639188230037689, "learning_rate": 0.00019977183168165796, "loss": 0.1012, "step": 1458 }, { "epoch": 0.09431919191919191, "grad_norm": 0.09920790791511536, "learning_rate": 0.00019977136974489533, "loss": 0.1105, "step": 1459 }, { "epoch": 0.09438383838383839, "grad_norm": 0.10236836969852448, "learning_rate": 0.000199770907341535, "loss": 0.1106, "step": 1460 }, { "epoch": 0.09444848484848485, "grad_norm": 0.09884092211723328, "learning_rate": 0.00019977044447157906, "loss": 0.1051, "step": 1461 }, { "epoch": 0.09451313131313131, "grad_norm": 0.08512425422668457, "learning_rate": 0.00019976998113502978, "loss": 0.0921, "step": 1462 }, { "epoch": 0.09457777777777777, "grad_norm": 0.09624971449375153, "learning_rate": 0.0001997695173318893, "loss": 0.108, "step": 1463 }, { "epoch": 0.09464242424242424, "grad_norm": 0.09836854785680771, "learning_rate": 0.00019976905306215973, "loss": 0.1009, "step": 1464 }, { "epoch": 0.09470707070707071, "grad_norm": 0.08996815979480743, "learning_rate": 0.0001997685883258433, "loss": 0.0815, "step": 1465 }, { "epoch": 0.09477171717171717, "grad_norm": 0.09514082223176956, "learning_rate": 0.00019976812312294214, "loss": 0.107, "step": 1466 }, { "epoch": 0.09483636363636364, "grad_norm": 0.09628879278898239, "learning_rate": 0.0001997676574534585, "loss": 0.1107, "step": 1467 }, { "epoch": 0.0949010101010101, "grad_norm": 0.08442001044750214, "learning_rate": 0.0001997671913173945, "loss": 0.0888, "step": 1468 }, { "epoch": 0.09496565656565656, "grad_norm": 0.10036958754062653, "learning_rate": 0.00019976672471475228, "loss": 0.1086, "step": 1469 }, { "epoch": 0.09503030303030303, "grad_norm": 0.08386826515197754, "learning_rate": 0.00019976625764553415, "loss": 0.1001, "step": 1470 }, { "epoch": 0.0950949494949495, "grad_norm": 0.08392481505870819, "learning_rate": 0.00019976579010974214, "loss": 0.0877, "step": 1471 }, { "epoch": 0.09515959595959596, "grad_norm": 0.12759946286678314, "learning_rate": 
0.00019976532210737853, "loss": 0.1093, "step": 1472 }, { "epoch": 0.09515959595959596, "eval_bleu": 14.280043781319975, "eval_loss": 0.10036460310220718, "eval_runtime": 2.7194, "eval_samples_per_second": 11.767, "eval_steps_per_second": 1.471, "step": 1472 }, { "epoch": 0.09522424242424242, "grad_norm": 0.09618207067251205, "learning_rate": 0.0001997648536384455, "loss": 0.0848, "step": 1473 }, { "epoch": 0.0952888888888889, "grad_norm": 0.09130462259054184, "learning_rate": 0.00019976438470294526, "loss": 0.0837, "step": 1474 }, { "epoch": 0.09535353535353536, "grad_norm": 0.10890621691942215, "learning_rate": 0.00019976391530087995, "loss": 0.1212, "step": 1475 }, { "epoch": 0.09541818181818182, "grad_norm": 0.10262034833431244, "learning_rate": 0.00019976344543225176, "loss": 0.1027, "step": 1476 }, { "epoch": 0.09548282828282828, "grad_norm": 0.08642224222421646, "learning_rate": 0.00019976297509706294, "loss": 0.0917, "step": 1477 }, { "epoch": 0.09554747474747474, "grad_norm": 0.09260547906160355, "learning_rate": 0.0001997625042953157, "loss": 0.09, "step": 1478 }, { "epoch": 0.09561212121212122, "grad_norm": 0.10141746699810028, "learning_rate": 0.00019976203302701214, "loss": 0.1063, "step": 1479 }, { "epoch": 0.09567676767676768, "grad_norm": 0.10753446072340012, "learning_rate": 0.0001997615612921546, "loss": 0.0958, "step": 1480 }, { "epoch": 0.09574141414141414, "grad_norm": 0.09839066863059998, "learning_rate": 0.00019976108909074518, "loss": 0.0788, "step": 1481 }, { "epoch": 0.0958060606060606, "grad_norm": 0.10989544540643692, "learning_rate": 0.0001997606164227861, "loss": 0.0916, "step": 1482 }, { "epoch": 0.09587070707070706, "grad_norm": 0.09842634946107864, "learning_rate": 0.00019976014328827965, "loss": 0.1012, "step": 1483 }, { "epoch": 0.09593535353535354, "grad_norm": 0.0982094332575798, "learning_rate": 0.00019975966968722796, "loss": 0.0947, "step": 1484 }, { "epoch": 0.096, "grad_norm": 0.09330565482378006, "learning_rate": 
0.0001997591956196333, "loss": 0.0947, "step": 1485 }, { "epoch": 0.09606464646464646, "grad_norm": 0.07657897472381592, "learning_rate": 0.00019975872108549784, "loss": 0.0744, "step": 1486 }, { "epoch": 0.09612929292929293, "grad_norm": 0.0957622081041336, "learning_rate": 0.00019975824608482382, "loss": 0.0997, "step": 1487 }, { "epoch": 0.09619393939393939, "grad_norm": 0.08672267198562622, "learning_rate": 0.00019975777061761348, "loss": 0.0919, "step": 1488 }, { "epoch": 0.09619393939393939, "eval_bleu": 14.729118721791473, "eval_loss": 0.0992056280374527, "eval_runtime": 2.9026, "eval_samples_per_second": 11.025, "eval_steps_per_second": 1.378, "step": 1488 }, { "epoch": 0.09625858585858586, "grad_norm": 0.11096522957086563, "learning_rate": 0.000199757294683869, "loss": 0.1092, "step": 1489 }, { "epoch": 0.09632323232323232, "grad_norm": 0.08589675277471542, "learning_rate": 0.00019975681828359268, "loss": 0.0932, "step": 1490 }, { "epoch": 0.09638787878787879, "grad_norm": 0.09943711757659912, "learning_rate": 0.00019975634141678668, "loss": 0.1117, "step": 1491 }, { "epoch": 0.09645252525252525, "grad_norm": 0.09538694471120834, "learning_rate": 0.00019975586408345323, "loss": 0.1201, "step": 1492 }, { "epoch": 0.09651717171717172, "grad_norm": 0.07913587987422943, "learning_rate": 0.00019975538628359458, "loss": 0.0867, "step": 1493 }, { "epoch": 0.09658181818181819, "grad_norm": 0.07704527676105499, "learning_rate": 0.00019975490801721297, "loss": 0.0681, "step": 1494 }, { "epoch": 0.09664646464646465, "grad_norm": 0.08875605463981628, "learning_rate": 0.00019975442928431067, "loss": 0.1012, "step": 1495 }, { "epoch": 0.09671111111111111, "grad_norm": 0.09561815857887268, "learning_rate": 0.00019975395008488983, "loss": 0.0936, "step": 1496 }, { "epoch": 0.09677575757575757, "grad_norm": 0.10696236044168472, "learning_rate": 0.00019975347041895278, "loss": 0.1038, "step": 1497 }, { "epoch": 0.09684040404040405, "grad_norm": 0.07971223443746567, 
"learning_rate": 0.0001997529902865017, "loss": 0.0843, "step": 1498 }, { "epoch": 0.09690505050505051, "grad_norm": 0.0859697014093399, "learning_rate": 0.0001997525096875389, "loss": 0.0947, "step": 1499 }, { "epoch": 0.09696969696969697, "grad_norm": 0.09678880870342255, "learning_rate": 0.00019975202862206656, "loss": 0.1047, "step": 1500 }, { "epoch": 0.09703434343434343, "grad_norm": 0.088010773062706, "learning_rate": 0.00019975154709008696, "loss": 0.0856, "step": 1501 }, { "epoch": 0.0970989898989899, "grad_norm": 0.10703625530004501, "learning_rate": 0.00019975106509160235, "loss": 0.0953, "step": 1502 }, { "epoch": 0.09716363636363637, "grad_norm": 0.1084941178560257, "learning_rate": 0.000199750582626615, "loss": 0.1101, "step": 1503 }, { "epoch": 0.09722828282828283, "grad_norm": 0.09701517224311829, "learning_rate": 0.00019975009969512716, "loss": 0.1011, "step": 1504 }, { "epoch": 0.09722828282828283, "eval_bleu": 12.832692589841608, "eval_loss": 0.10086671262979507, "eval_runtime": 2.693, "eval_samples_per_second": 11.883, "eval_steps_per_second": 1.485, "step": 1504 }, { "epoch": 0.09729292929292929, "grad_norm": 0.09981659799814224, "learning_rate": 0.00019974961629714108, "loss": 0.1042, "step": 1505 }, { "epoch": 0.09735757575757575, "grad_norm": 0.09844482690095901, "learning_rate": 0.00019974913243265898, "loss": 0.1028, "step": 1506 }, { "epoch": 0.09742222222222222, "grad_norm": 0.0844825804233551, "learning_rate": 0.0001997486481016832, "loss": 0.0887, "step": 1507 }, { "epoch": 0.09748686868686869, "grad_norm": 0.0880080834031105, "learning_rate": 0.00019974816330421596, "loss": 0.0951, "step": 1508 }, { "epoch": 0.09755151515151515, "grad_norm": 0.0978274941444397, "learning_rate": 0.00019974767804025953, "loss": 0.1026, "step": 1509 }, { "epoch": 0.09761616161616161, "grad_norm": 0.07957588881254196, "learning_rate": 0.0001997471923098162, "loss": 0.0865, "step": 1510 }, { "epoch": 0.09768080808080808, "grad_norm": 0.09161031246185303, 
"learning_rate": 0.00019974670611288818, "loss": 0.0918, "step": 1511 }, { "epoch": 0.09774545454545455, "grad_norm": 0.08318771421909332, "learning_rate": 0.00019974621944947787, "loss": 0.0724, "step": 1512 }, { "epoch": 0.09781010101010101, "grad_norm": 0.09956041723489761, "learning_rate": 0.0001997457323195874, "loss": 0.0936, "step": 1513 }, { "epoch": 0.09787474747474748, "grad_norm": 0.09289422631263733, "learning_rate": 0.00019974524472321915, "loss": 0.0937, "step": 1514 }, { "epoch": 0.09793939393939394, "grad_norm": 0.08256366103887558, "learning_rate": 0.00019974475666037535, "loss": 0.0846, "step": 1515 }, { "epoch": 0.0980040404040404, "grad_norm": 0.0928649753332138, "learning_rate": 0.0001997442681310583, "loss": 0.0879, "step": 1516 }, { "epoch": 0.09806868686868687, "grad_norm": 0.09999298304319382, "learning_rate": 0.00019974377913527033, "loss": 0.0793, "step": 1517 }, { "epoch": 0.09813333333333334, "grad_norm": 0.09429288655519485, "learning_rate": 0.00019974328967301363, "loss": 0.0932, "step": 1518 }, { "epoch": 0.0981979797979798, "grad_norm": 0.10184887051582336, "learning_rate": 0.00019974279974429053, "loss": 0.0999, "step": 1519 }, { "epoch": 0.09826262626262626, "grad_norm": 0.10147322714328766, "learning_rate": 0.00019974230934910337, "loss": 0.1039, "step": 1520 }, { "epoch": 0.09826262626262626, "eval_bleu": 11.785813470017036, "eval_loss": 0.09970571100711823, "eval_runtime": 2.8096, "eval_samples_per_second": 11.39, "eval_steps_per_second": 1.424, "step": 1520 }, { "epoch": 0.09832727272727272, "grad_norm": 0.10534729808568954, "learning_rate": 0.00019974181848745438, "loss": 0.0985, "step": 1521 }, { "epoch": 0.0983919191919192, "grad_norm": 0.09720037132501602, "learning_rate": 0.00019974132715934586, "loss": 0.0892, "step": 1522 }, { "epoch": 0.09845656565656566, "grad_norm": 0.09086208790540695, "learning_rate": 0.00019974083536478015, "loss": 0.0903, "step": 1523 }, { "epoch": 0.09852121212121212, "grad_norm": 
0.11158297955989838, "learning_rate": 0.00019974034310375952, "loss": 0.1191, "step": 1524 }, { "epoch": 0.09858585858585858, "grad_norm": 0.08970746397972107, "learning_rate": 0.00019973985037628628, "loss": 0.0878, "step": 1525 }, { "epoch": 0.09865050505050504, "grad_norm": 0.08245444297790527, "learning_rate": 0.00019973935718236275, "loss": 0.0979, "step": 1526 }, { "epoch": 0.09871515151515152, "grad_norm": 0.09014393389225006, "learning_rate": 0.0001997388635219912, "loss": 0.1019, "step": 1527 }, { "epoch": 0.09877979797979798, "grad_norm": 0.08501767367124557, "learning_rate": 0.00019973836939517393, "loss": 0.0883, "step": 1528 }, { "epoch": 0.09884444444444444, "grad_norm": 0.09111899882555008, "learning_rate": 0.00019973787480191332, "loss": 0.1117, "step": 1529 }, { "epoch": 0.0989090909090909, "grad_norm": 0.11369483172893524, "learning_rate": 0.00019973737974221165, "loss": 0.1131, "step": 1530 }, { "epoch": 0.09897373737373738, "grad_norm": 0.10262712836265564, "learning_rate": 0.0001997368842160712, "loss": 0.0999, "step": 1531 }, { "epoch": 0.09903838383838384, "grad_norm": 0.09883233904838562, "learning_rate": 0.00019973638822349434, "loss": 0.101, "step": 1532 }, { "epoch": 0.0991030303030303, "grad_norm": 0.09912887960672379, "learning_rate": 0.00019973589176448334, "loss": 0.1049, "step": 1533 }, { "epoch": 0.09916767676767677, "grad_norm": 0.12348176538944244, "learning_rate": 0.00019973539483904057, "loss": 0.0978, "step": 1534 }, { "epoch": 0.09923232323232323, "grad_norm": 0.09499776363372803, "learning_rate": 0.0001997348974471683, "loss": 0.0902, "step": 1535 }, { "epoch": 0.0992969696969697, "grad_norm": 0.09011103212833405, "learning_rate": 0.00019973439958886893, "loss": 0.0901, "step": 1536 }, { "epoch": 0.0992969696969697, "eval_bleu": 13.668856052191725, "eval_loss": 0.09831744432449341, "eval_runtime": 2.7926, "eval_samples_per_second": 11.459, "eval_steps_per_second": 1.432, "step": 1536 }, { "epoch": 0.09936161616161616, 
"grad_norm": 0.10506445169448853, "learning_rate": 0.0001997339012641447, "loss": 0.1105, "step": 1537 }, { "epoch": 0.09942626262626263, "grad_norm": 0.10616018623113632, "learning_rate": 0.000199733402472998, "loss": 0.1043, "step": 1538 }, { "epoch": 0.09949090909090909, "grad_norm": 0.10897423326969147, "learning_rate": 0.00019973290321543118, "loss": 0.1137, "step": 1539 }, { "epoch": 0.09955555555555555, "grad_norm": 0.11066459119319916, "learning_rate": 0.00019973240349144652, "loss": 0.1057, "step": 1540 }, { "epoch": 0.09962020202020203, "grad_norm": 0.1022406741976738, "learning_rate": 0.00019973190330104635, "loss": 0.108, "step": 1541 }, { "epoch": 0.09968484848484849, "grad_norm": 0.08966987580060959, "learning_rate": 0.00019973140264423306, "loss": 0.0938, "step": 1542 }, { "epoch": 0.09974949494949495, "grad_norm": 0.09685828536748886, "learning_rate": 0.00019973090152100898, "loss": 0.089, "step": 1543 }, { "epoch": 0.09981414141414141, "grad_norm": 0.09710504114627838, "learning_rate": 0.00019973039993137644, "loss": 0.1164, "step": 1544 }, { "epoch": 0.09987878787878787, "grad_norm": 0.07680148631334305, "learning_rate": 0.00019972989787533777, "loss": 0.0814, "step": 1545 }, { "epoch": 0.09994343434343435, "grad_norm": 0.0986066609621048, "learning_rate": 0.00019972939535289536, "loss": 0.1021, "step": 1546 }, { "epoch": 0.10000808080808081, "grad_norm": 0.09149985760450363, "learning_rate": 0.00019972889236405156, "loss": 0.0929, "step": 1547 }, { "epoch": 0.10007272727272727, "grad_norm": 0.08575946092605591, "learning_rate": 0.00019972838890880865, "loss": 0.0814, "step": 1548 }, { "epoch": 0.10013737373737373, "grad_norm": 0.08150371164083481, "learning_rate": 0.00019972788498716904, "loss": 0.0849, "step": 1549 }, { "epoch": 0.10020202020202021, "grad_norm": 0.10836182534694672, "learning_rate": 0.00019972738059913513, "loss": 0.115, "step": 1550 }, { "epoch": 0.10026666666666667, "grad_norm": 0.08745045214891434, "learning_rate": 
0.00019972687574470918, "loss": 0.0877, "step": 1551 }, { "epoch": 0.10033131313131313, "grad_norm": 0.10776417702436447, "learning_rate": 0.00019972637042389363, "loss": 0.1174, "step": 1552 }, { "epoch": 0.10033131313131313, "eval_bleu": 15.192125634192314, "eval_loss": 0.09673939645290375, "eval_runtime": 2.6619, "eval_samples_per_second": 12.022, "eval_steps_per_second": 1.503, "step": 1552 }, { "epoch": 0.1003959595959596, "grad_norm": 0.08816003054380417, "learning_rate": 0.00019972586463669082, "loss": 0.0871, "step": 1553 }, { "epoch": 0.10046060606060606, "grad_norm": 0.0874270498752594, "learning_rate": 0.00019972535838310308, "loss": 0.0861, "step": 1554 }, { "epoch": 0.10052525252525253, "grad_norm": 0.09039713442325592, "learning_rate": 0.0001997248516631328, "loss": 0.0991, "step": 1555 }, { "epoch": 0.100589898989899, "grad_norm": 0.09131505340337753, "learning_rate": 0.0001997243444767824, "loss": 0.0932, "step": 1556 }, { "epoch": 0.10065454545454545, "grad_norm": 0.08119247108697891, "learning_rate": 0.0001997238368240542, "loss": 0.08, "step": 1557 }, { "epoch": 0.10071919191919192, "grad_norm": 0.10036002099514008, "learning_rate": 0.00019972332870495056, "loss": 0.115, "step": 1558 }, { "epoch": 0.10078383838383838, "grad_norm": 0.07870175689458847, "learning_rate": 0.0001997228201194739, "loss": 0.0859, "step": 1559 }, { "epoch": 0.10084848484848485, "grad_norm": 0.10069957375526428, "learning_rate": 0.00019972231106762654, "loss": 0.107, "step": 1560 }, { "epoch": 0.10091313131313132, "grad_norm": 0.1176270842552185, "learning_rate": 0.00019972180154941095, "loss": 0.0955, "step": 1561 }, { "epoch": 0.10097777777777778, "grad_norm": 0.07788178324699402, "learning_rate": 0.00019972129156482944, "loss": 0.0933, "step": 1562 }, { "epoch": 0.10104242424242424, "grad_norm": 0.08061239868402481, "learning_rate": 0.00019972078111388442, "loss": 0.0925, "step": 1563 }, { "epoch": 0.1011070707070707, "grad_norm": 0.0940290093421936, "learning_rate": 
0.00019972027019657827, "loss": 0.1138, "step": 1564 }, { "epoch": 0.10117171717171718, "grad_norm": 0.08142008632421494, "learning_rate": 0.00019971975881291339, "loss": 0.0962, "step": 1565 }, { "epoch": 0.10123636363636364, "grad_norm": 0.1068977490067482, "learning_rate": 0.00019971924696289212, "loss": 0.1147, "step": 1566 }, { "epoch": 0.1013010101010101, "grad_norm": 0.08470705896615982, "learning_rate": 0.00019971873464651695, "loss": 0.0888, "step": 1567 }, { "epoch": 0.10136565656565656, "grad_norm": 0.09118818491697311, "learning_rate": 0.0001997182218637902, "loss": 0.0956, "step": 1568 }, { "epoch": 0.10136565656565656, "eval_bleu": 18.25939569531262, "eval_loss": 0.09652234613895416, "eval_runtime": 2.8391, "eval_samples_per_second": 11.271, "eval_steps_per_second": 1.409, "step": 1568 }, { "epoch": 0.10143030303030302, "grad_norm": 0.09397991001605988, "learning_rate": 0.0001997177086147143, "loss": 0.1147, "step": 1569 }, { "epoch": 0.1014949494949495, "grad_norm": 0.09927749633789062, "learning_rate": 0.00019971719489929166, "loss": 0.0985, "step": 1570 }, { "epoch": 0.10155959595959596, "grad_norm": 0.09375564008951187, "learning_rate": 0.00019971668071752468, "loss": 0.104, "step": 1571 }, { "epoch": 0.10162424242424242, "grad_norm": 0.09379369765520096, "learning_rate": 0.0001997161660694157, "loss": 0.0923, "step": 1572 }, { "epoch": 0.10168888888888888, "grad_norm": 0.11959432065486908, "learning_rate": 0.00019971565095496717, "loss": 0.1204, "step": 1573 }, { "epoch": 0.10175353535353536, "grad_norm": 0.09277225285768509, "learning_rate": 0.00019971513537418156, "loss": 0.0879, "step": 1574 }, { "epoch": 0.10181818181818182, "grad_norm": 0.09895878285169601, "learning_rate": 0.00019971461932706119, "loss": 0.115, "step": 1575 }, { "epoch": 0.10188282828282828, "grad_norm": 0.08483145385980606, "learning_rate": 0.00019971410281360852, "loss": 0.0885, "step": 1576 }, { "epoch": 0.10194747474747475, "grad_norm": 0.09375562518835068, 
"learning_rate": 0.00019971358583382597, "loss": 0.0944, "step": 1577 }, { "epoch": 0.1020121212121212, "grad_norm": 0.10022222995758057, "learning_rate": 0.00019971306838771592, "loss": 0.0895, "step": 1578 }, { "epoch": 0.10207676767676768, "grad_norm": 0.09891146421432495, "learning_rate": 0.00019971255047528084, "loss": 0.1159, "step": 1579 }, { "epoch": 0.10214141414141414, "grad_norm": 0.09443075954914093, "learning_rate": 0.0001997120320965231, "loss": 0.1014, "step": 1580 }, { "epoch": 0.1022060606060606, "grad_norm": 0.09817148000001907, "learning_rate": 0.00019971151325144516, "loss": 0.1053, "step": 1581 }, { "epoch": 0.10227070707070707, "grad_norm": 0.0886181890964508, "learning_rate": 0.00019971099394004943, "loss": 0.0818, "step": 1582 }, { "epoch": 0.10233535353535353, "grad_norm": 0.09783749282360077, "learning_rate": 0.00019971047416233838, "loss": 0.1, "step": 1583 }, { "epoch": 0.1024, "grad_norm": 0.09405406564474106, "learning_rate": 0.00019970995391831436, "loss": 0.0947, "step": 1584 }, { "epoch": 0.1024, "eval_bleu": 13.274058950570861, "eval_loss": 0.09767289459705353, "eval_runtime": 2.7555, "eval_samples_per_second": 11.613, "eval_steps_per_second": 1.452, "step": 1584 }, { "epoch": 0.10246464646464647, "grad_norm": 0.10955529659986496, "learning_rate": 0.00019970943320797986, "loss": 0.1174, "step": 1585 }, { "epoch": 0.10252929292929293, "grad_norm": 0.10316221415996552, "learning_rate": 0.0001997089120313373, "loss": 0.1123, "step": 1586 }, { "epoch": 0.10259393939393939, "grad_norm": 0.10168009251356125, "learning_rate": 0.00019970839038838914, "loss": 0.1066, "step": 1587 }, { "epoch": 0.10265858585858585, "grad_norm": 0.09406615793704987, "learning_rate": 0.0001997078682791378, "loss": 0.0853, "step": 1588 }, { "epoch": 0.10272323232323233, "grad_norm": 0.09565608203411102, "learning_rate": 0.00019970734570358572, "loss": 0.1041, "step": 1589 }, { "epoch": 0.10278787878787879, "grad_norm": 0.10402078926563263, "learning_rate": 
0.00019970682266173535, "loss": 0.1128, "step": 1590 }, { "epoch": 0.10285252525252525, "grad_norm": 0.09411383420228958, "learning_rate": 0.00019970629915358912, "loss": 0.1001, "step": 1591 }, { "epoch": 0.10291717171717171, "grad_norm": 0.08286409080028534, "learning_rate": 0.0001997057751791495, "loss": 0.0827, "step": 1592 }, { "epoch": 0.10298181818181819, "grad_norm": 0.1055469736456871, "learning_rate": 0.00019970525073841893, "loss": 0.1196, "step": 1593 }, { "epoch": 0.10304646464646465, "grad_norm": 0.1027616485953331, "learning_rate": 0.00019970472583139985, "loss": 0.1099, "step": 1594 }, { "epoch": 0.10311111111111111, "grad_norm": 0.09544102102518082, "learning_rate": 0.00019970420045809474, "loss": 0.1027, "step": 1595 }, { "epoch": 0.10317575757575757, "grad_norm": 0.11706390976905823, "learning_rate": 0.00019970367461850605, "loss": 0.1052, "step": 1596 }, { "epoch": 0.10324040404040404, "grad_norm": 0.09325215965509415, "learning_rate": 0.00019970314831263623, "loss": 0.0922, "step": 1597 }, { "epoch": 0.10330505050505051, "grad_norm": 0.08890800923109055, "learning_rate": 0.00019970262154048776, "loss": 0.0975, "step": 1598 }, { "epoch": 0.10336969696969697, "grad_norm": 0.08755331486463547, "learning_rate": 0.00019970209430206307, "loss": 0.1068, "step": 1599 }, { "epoch": 0.10343434343434343, "grad_norm": 0.08750922977924347, "learning_rate": 0.00019970156659736467, "loss": 0.0973, "step": 1600 }, { "epoch": 0.10343434343434343, "eval_bleu": 15.997045804511021, "eval_loss": 0.09796808660030365, "eval_runtime": 2.6698, "eval_samples_per_second": 11.986, "eval_steps_per_second": 1.498, "step": 1600 }, { "epoch": 0.1034989898989899, "grad_norm": 0.08491747081279755, "learning_rate": 0.00019970103842639498, "loss": 0.0867, "step": 1601 }, { "epoch": 0.10356363636363636, "grad_norm": 0.0872446745634079, "learning_rate": 0.0001997005097891565, "loss": 0.0904, "step": 1602 }, { "epoch": 0.10362828282828283, "grad_norm": 0.09314920753240585, 
"learning_rate": 0.0001996999806856517, "loss": 0.0808, "step": 1603 }, { "epoch": 0.1036929292929293, "grad_norm": 0.10297390073537827, "learning_rate": 0.00019969945111588305, "loss": 0.1062, "step": 1604 }, { "epoch": 0.10375757575757576, "grad_norm": 0.09452049434185028, "learning_rate": 0.00019969892107985304, "loss": 0.1032, "step": 1605 }, { "epoch": 0.10382222222222222, "grad_norm": 0.08197373151779175, "learning_rate": 0.00019969839057756413, "loss": 0.0903, "step": 1606 }, { "epoch": 0.10388686868686868, "grad_norm": 0.08604747802019119, "learning_rate": 0.0001996978596090188, "loss": 0.0909, "step": 1607 }, { "epoch": 0.10395151515151516, "grad_norm": 0.09293990582227707, "learning_rate": 0.00019969732817421953, "loss": 0.0978, "step": 1608 }, { "epoch": 0.10401616161616162, "grad_norm": 0.07438100874423981, "learning_rate": 0.00019969679627316884, "loss": 0.076, "step": 1609 }, { "epoch": 0.10408080808080808, "grad_norm": 0.09678399562835693, "learning_rate": 0.00019969626390586918, "loss": 0.0971, "step": 1610 }, { "epoch": 0.10414545454545454, "grad_norm": 0.11207693815231323, "learning_rate": 0.00019969573107232305, "loss": 0.1046, "step": 1611 }, { "epoch": 0.10421010101010102, "grad_norm": 0.08398399502038956, "learning_rate": 0.00019969519777253295, "loss": 0.0827, "step": 1612 }, { "epoch": 0.10427474747474748, "grad_norm": 0.09463801234960556, "learning_rate": 0.00019969466400650133, "loss": 0.1048, "step": 1613 }, { "epoch": 0.10433939393939394, "grad_norm": 0.08767811954021454, "learning_rate": 0.0001996941297742308, "loss": 0.091, "step": 1614 }, { "epoch": 0.1044040404040404, "grad_norm": 0.09328879415988922, "learning_rate": 0.00019969359507572372, "loss": 0.0975, "step": 1615 }, { "epoch": 0.10446868686868686, "grad_norm": 0.08879215270280838, "learning_rate": 0.00019969305991098267, "loss": 0.0935, "step": 1616 }, { "epoch": 0.10446868686868686, "eval_bleu": 15.18489137276308, "eval_loss": 0.09929370135068893, "eval_runtime": 2.8226, 
"eval_samples_per_second": 11.337, "eval_steps_per_second": 1.417, "step": 1616 }, { "epoch": 0.10453333333333334, "grad_norm": 0.08979222923517227, "learning_rate": 0.00019969252428001015, "loss": 0.095, "step": 1617 }, { "epoch": 0.1045979797979798, "grad_norm": 0.09343419969081879, "learning_rate": 0.00019969198818280863, "loss": 0.0968, "step": 1618 }, { "epoch": 0.10466262626262626, "grad_norm": 0.09061822295188904, "learning_rate": 0.00019969145161938067, "loss": 0.0994, "step": 1619 }, { "epoch": 0.10472727272727272, "grad_norm": 0.10156168788671494, "learning_rate": 0.00019969091458972872, "loss": 0.0983, "step": 1620 }, { "epoch": 0.10479191919191919, "grad_norm": 0.09300708770751953, "learning_rate": 0.0001996903770938553, "loss": 0.1091, "step": 1621 }, { "epoch": 0.10485656565656566, "grad_norm": 0.0945386216044426, "learning_rate": 0.000199689839131763, "loss": 0.1012, "step": 1622 }, { "epoch": 0.10492121212121212, "grad_norm": 0.0789511650800705, "learning_rate": 0.00019968930070345426, "loss": 0.095, "step": 1623 }, { "epoch": 0.10498585858585859, "grad_norm": 0.09683208912611008, "learning_rate": 0.00019968876180893161, "loss": 0.0954, "step": 1624 }, { "epoch": 0.10505050505050505, "grad_norm": 0.10096745938062668, "learning_rate": 0.0001996882224481976, "loss": 0.1123, "step": 1625 }, { "epoch": 0.10511515151515151, "grad_norm": 0.09884243458509445, "learning_rate": 0.00019968768262125468, "loss": 0.0912, "step": 1626 }, { "epoch": 0.10517979797979798, "grad_norm": 0.09217636287212372, "learning_rate": 0.00019968714232810545, "loss": 0.0977, "step": 1627 }, { "epoch": 0.10524444444444445, "grad_norm": 0.09520828723907471, "learning_rate": 0.0001996866015687524, "loss": 0.0916, "step": 1628 }, { "epoch": 0.10530909090909091, "grad_norm": 0.10780610889196396, "learning_rate": 0.00019968606034319813, "loss": 0.1254, "step": 1629 }, { "epoch": 0.10537373737373737, "grad_norm": 0.08303985744714737, "learning_rate": 0.00019968551865144504, "loss": 
0.0953, "step": 1630 }, { "epoch": 0.10543838383838385, "grad_norm": 0.08853544294834137, "learning_rate": 0.00019968497649349579, "loss": 0.1016, "step": 1631 }, { "epoch": 0.1055030303030303, "grad_norm": 0.08793741464614868, "learning_rate": 0.0001996844338693528, "loss": 0.0928, "step": 1632 }, { "epoch": 0.1055030303030303, "eval_bleu": 12.336581675143183, "eval_loss": 0.09897857904434204, "eval_runtime": 2.6274, "eval_samples_per_second": 12.179, "eval_steps_per_second": 1.522, "step": 1632 }, { "epoch": 0.10556767676767677, "grad_norm": 0.09308361262083054, "learning_rate": 0.0001996838907790187, "loss": 0.099, "step": 1633 }, { "epoch": 0.10563232323232323, "grad_norm": 0.10176729410886765, "learning_rate": 0.000199683347222496, "loss": 0.1139, "step": 1634 }, { "epoch": 0.10569696969696969, "grad_norm": 0.0855538547039032, "learning_rate": 0.00019968280319978722, "loss": 0.0976, "step": 1635 }, { "epoch": 0.10576161616161617, "grad_norm": 0.08286743611097336, "learning_rate": 0.00019968225871089495, "loss": 0.0907, "step": 1636 }, { "epoch": 0.10582626262626263, "grad_norm": 0.07679275423288345, "learning_rate": 0.0001996817137558217, "loss": 0.0728, "step": 1637 }, { "epoch": 0.10589090909090909, "grad_norm": 0.1014551892876625, "learning_rate": 0.00019968116833457003, "loss": 0.0972, "step": 1638 }, { "epoch": 0.10595555555555555, "grad_norm": 0.17819735407829285, "learning_rate": 0.0001996806224471425, "loss": 0.1213, "step": 1639 }, { "epoch": 0.10602020202020201, "grad_norm": 0.0888531506061554, "learning_rate": 0.00019968007609354164, "loss": 0.1023, "step": 1640 }, { "epoch": 0.10608484848484849, "grad_norm": 0.10027861595153809, "learning_rate": 0.00019967952927377002, "loss": 0.1159, "step": 1641 }, { "epoch": 0.10614949494949495, "grad_norm": 0.08541226387023926, "learning_rate": 0.0001996789819878302, "loss": 0.0831, "step": 1642 }, { "epoch": 0.10621414141414141, "grad_norm": 0.08947043120861053, "learning_rate": 0.0001996784342357247, "loss": 
0.1035, "step": 1643 }, { "epoch": 0.10627878787878788, "grad_norm": 0.09411673247814178, "learning_rate": 0.00019967788601745615, "loss": 0.1005, "step": 1644 }, { "epoch": 0.10634343434343434, "grad_norm": 0.0875181034207344, "learning_rate": 0.00019967733733302706, "loss": 0.0955, "step": 1645 }, { "epoch": 0.10640808080808081, "grad_norm": 0.09653908759355545, "learning_rate": 0.00019967678818244005, "loss": 0.1167, "step": 1646 }, { "epoch": 0.10647272727272727, "grad_norm": 0.07128980755805969, "learning_rate": 0.0001996762385656976, "loss": 0.0706, "step": 1647 }, { "epoch": 0.10653737373737374, "grad_norm": 0.08450567722320557, "learning_rate": 0.00019967568848280241, "loss": 0.0813, "step": 1648 }, { "epoch": 0.10653737373737374, "eval_bleu": 13.922604876199163, "eval_loss": 0.09967168420553207, "eval_runtime": 2.8357, "eval_samples_per_second": 11.285, "eval_steps_per_second": 1.411, "step": 1648 }, { "epoch": 0.1066020202020202, "grad_norm": 0.08985462039709091, "learning_rate": 0.0001996751379337569, "loss": 0.1019, "step": 1649 }, { "epoch": 0.10666666666666667, "grad_norm": 0.09184836596250534, "learning_rate": 0.00019967458691856377, "loss": 0.1143, "step": 1650 }, { "epoch": 0.10673131313131314, "grad_norm": 0.09489242732524872, "learning_rate": 0.00019967403543722555, "loss": 0.1108, "step": 1651 }, { "epoch": 0.1067959595959596, "grad_norm": 0.10834208130836487, "learning_rate": 0.0001996734834897448, "loss": 0.0969, "step": 1652 }, { "epoch": 0.10686060606060606, "grad_norm": 0.0975426658987999, "learning_rate": 0.00019967293107612413, "loss": 0.1138, "step": 1653 }, { "epoch": 0.10692525252525252, "grad_norm": 0.10958606749773026, "learning_rate": 0.0001996723781963661, "loss": 0.1293, "step": 1654 }, { "epoch": 0.106989898989899, "grad_norm": 0.0845121368765831, "learning_rate": 0.00019967182485047333, "loss": 0.1006, "step": 1655 }, { "epoch": 0.10705454545454546, "grad_norm": 0.09743615239858627, "learning_rate": 0.00019967127103844836, 
"loss": 0.1045, "step": 1656 }, { "epoch": 0.10711919191919192, "grad_norm": 0.09257999807596207, "learning_rate": 0.0001996707167602938, "loss": 0.0977, "step": 1657 }, { "epoch": 0.10718383838383838, "grad_norm": 0.07423005253076553, "learning_rate": 0.00019967016201601228, "loss": 0.072, "step": 1658 }, { "epoch": 0.10724848484848484, "grad_norm": 0.09771029651165009, "learning_rate": 0.00019966960680560636, "loss": 0.1036, "step": 1659 }, { "epoch": 0.10731313131313132, "grad_norm": 0.1146690845489502, "learning_rate": 0.00019966905112907862, "loss": 0.1162, "step": 1660 }, { "epoch": 0.10737777777777778, "grad_norm": 0.08793772757053375, "learning_rate": 0.00019966849498643168, "loss": 0.0951, "step": 1661 }, { "epoch": 0.10744242424242424, "grad_norm": 0.10198479890823364, "learning_rate": 0.00019966793837766816, "loss": 0.1174, "step": 1662 }, { "epoch": 0.1075070707070707, "grad_norm": 0.08196443319320679, "learning_rate": 0.00019966738130279058, "loss": 0.0865, "step": 1663 }, { "epoch": 0.10757171717171717, "grad_norm": 0.0996987447142601, "learning_rate": 0.00019966682376180165, "loss": 0.1096, "step": 1664 }, { "epoch": 0.10757171717171717, "eval_bleu": 13.528937742100071, "eval_loss": 0.10002744197845459, "eval_runtime": 2.6868, "eval_samples_per_second": 11.91, "eval_steps_per_second": 1.489, "step": 1664 }, { "epoch": 0.10763636363636364, "grad_norm": 0.08952216804027557, "learning_rate": 0.00019966626575470398, "loss": 0.1019, "step": 1665 }, { "epoch": 0.1077010101010101, "grad_norm": 0.10055726021528244, "learning_rate": 0.00019966570728150007, "loss": 0.1075, "step": 1666 }, { "epoch": 0.10776565656565656, "grad_norm": 0.09140234440565109, "learning_rate": 0.0001996651483421926, "loss": 0.1015, "step": 1667 }, { "epoch": 0.10783030303030303, "grad_norm": 0.08178149908781052, "learning_rate": 0.00019966458893678422, "loss": 0.0917, "step": 1668 }, { "epoch": 0.10789494949494949, "grad_norm": 0.09599588066339493, "learning_rate": 
0.00019966402906527745, "loss": 0.1041, "step": 1669 }, { "epoch": 0.10795959595959596, "grad_norm": 0.10193531960248947, "learning_rate": 0.00019966346872767502, "loss": 0.1055, "step": 1670 }, { "epoch": 0.10802424242424243, "grad_norm": 0.09694015234708786, "learning_rate": 0.0001996629079239795, "loss": 0.1032, "step": 1671 }, { "epoch": 0.10808888888888889, "grad_norm": 0.08435283601284027, "learning_rate": 0.00019966234665419344, "loss": 0.0893, "step": 1672 }, { "epoch": 0.10815353535353535, "grad_norm": 0.10433562844991684, "learning_rate": 0.0001996617849183196, "loss": 0.1104, "step": 1673 }, { "epoch": 0.10821818181818182, "grad_norm": 0.08608686178922653, "learning_rate": 0.00019966122271636048, "loss": 0.1055, "step": 1674 }, { "epoch": 0.10828282828282829, "grad_norm": 0.1521894633769989, "learning_rate": 0.0001996606600483188, "loss": 0.0906, "step": 1675 }, { "epoch": 0.10834747474747475, "grad_norm": 0.08421637862920761, "learning_rate": 0.00019966009691419715, "loss": 0.0917, "step": 1676 }, { "epoch": 0.10841212121212121, "grad_norm": 0.08783233910799026, "learning_rate": 0.0001996595333139982, "loss": 0.0909, "step": 1677 }, { "epoch": 0.10847676767676767, "grad_norm": 0.08784070611000061, "learning_rate": 0.00019965896924772455, "loss": 0.0975, "step": 1678 }, { "epoch": 0.10854141414141415, "grad_norm": 0.08449574559926987, "learning_rate": 0.00019965840471537885, "loss": 0.0871, "step": 1679 }, { "epoch": 0.10860606060606061, "grad_norm": 0.11489264667034149, "learning_rate": 0.00019965783971696372, "loss": 0.1342, "step": 1680 }, { "epoch": 0.10860606060606061, "eval_bleu": 15.383758350508417, "eval_loss": 0.09991168975830078, "eval_runtime": 2.7517, "eval_samples_per_second": 11.629, "eval_steps_per_second": 1.454, "step": 1680 }, { "epoch": 0.10867070707070707, "grad_norm": 0.0800926461815834, "learning_rate": 0.0001996572742524818, "loss": 0.0921, "step": 1681 }, { "epoch": 0.10873535353535353, "grad_norm": 0.09851634502410889, 
"learning_rate": 0.0001996567083219358, "loss": 0.1101, "step": 1682 }, { "epoch": 0.1088, "grad_norm": 0.08068210631608963, "learning_rate": 0.0001996561419253283, "loss": 0.0922, "step": 1683 }, { "epoch": 0.10886464646464647, "grad_norm": 0.08367714285850525, "learning_rate": 0.00019965557506266196, "loss": 0.0755, "step": 1684 }, { "epoch": 0.10892929292929293, "grad_norm": 0.08664995431900024, "learning_rate": 0.00019965500773393946, "loss": 0.0919, "step": 1685 }, { "epoch": 0.10899393939393939, "grad_norm": 0.09484238922595978, "learning_rate": 0.00019965443993916345, "loss": 0.0975, "step": 1686 }, { "epoch": 0.10905858585858585, "grad_norm": 0.10192329436540604, "learning_rate": 0.00019965387167833655, "loss": 0.1226, "step": 1687 }, { "epoch": 0.10912323232323232, "grad_norm": 0.07847811281681061, "learning_rate": 0.00019965330295146144, "loss": 0.0858, "step": 1688 }, { "epoch": 0.10918787878787879, "grad_norm": 0.1330711990594864, "learning_rate": 0.00019965273375854075, "loss": 0.0995, "step": 1689 }, { "epoch": 0.10925252525252525, "grad_norm": 0.09547499567270279, "learning_rate": 0.0001996521640995772, "loss": 0.0953, "step": 1690 }, { "epoch": 0.10931717171717172, "grad_norm": 0.10386989265680313, "learning_rate": 0.0001996515939745734, "loss": 0.1098, "step": 1691 }, { "epoch": 0.10938181818181818, "grad_norm": 0.09227786213159561, "learning_rate": 0.00019965102338353205, "loss": 0.0999, "step": 1692 }, { "epoch": 0.10944646464646465, "grad_norm": 0.11024253070354462, "learning_rate": 0.00019965045232645583, "loss": 0.104, "step": 1693 }, { "epoch": 0.10951111111111111, "grad_norm": 0.11003511399030685, "learning_rate": 0.00019964988080334734, "loss": 0.1147, "step": 1694 }, { "epoch": 0.10957575757575758, "grad_norm": 0.07205240428447723, "learning_rate": 0.00019964930881420932, "loss": 0.0756, "step": 1695 }, { "epoch": 0.10964040404040404, "grad_norm": 0.09523680806159973, "learning_rate": 0.00019964873635904446, "loss": 0.0961, "step": 1696 }, 
{ "epoch": 0.10964040404040404, "eval_bleu": 12.824950677653414, "eval_loss": 0.09803938865661621, "eval_runtime": 2.6212, "eval_samples_per_second": 12.208, "eval_steps_per_second": 1.526, "step": 1696 }, { "epoch": 0.1097050505050505, "grad_norm": 0.08599366992712021, "learning_rate": 0.00019964816343785537, "loss": 0.0931, "step": 1697 }, { "epoch": 0.10976969696969698, "grad_norm": 0.09303499013185501, "learning_rate": 0.00019964759005064477, "loss": 0.0986, "step": 1698 }, { "epoch": 0.10983434343434344, "grad_norm": 0.0944119468331337, "learning_rate": 0.00019964701619741532, "loss": 0.1082, "step": 1699 }, { "epoch": 0.1098989898989899, "grad_norm": 0.07310118526220322, "learning_rate": 0.00019964644187816973, "loss": 0.0734, "step": 1700 }, { "epoch": 0.10996363636363636, "grad_norm": 0.09287518262863159, "learning_rate": 0.00019964586709291071, "loss": 0.1085, "step": 1701 }, { "epoch": 0.11002828282828282, "grad_norm": 0.08038844913244247, "learning_rate": 0.00019964529184164086, "loss": 0.0931, "step": 1702 }, { "epoch": 0.1100929292929293, "grad_norm": 0.09042944759130478, "learning_rate": 0.00019964471612436295, "loss": 0.1064, "step": 1703 }, { "epoch": 0.11015757575757576, "grad_norm": 0.08298671990633011, "learning_rate": 0.00019964413994107965, "loss": 0.0919, "step": 1704 }, { "epoch": 0.11022222222222222, "grad_norm": 0.08193987607955933, "learning_rate": 0.00019964356329179363, "loss": 0.0883, "step": 1705 }, { "epoch": 0.11028686868686868, "grad_norm": 0.07536068558692932, "learning_rate": 0.00019964298617650757, "loss": 0.0806, "step": 1706 }, { "epoch": 0.11035151515151514, "grad_norm": 0.0773344412446022, "learning_rate": 0.00019964240859522426, "loss": 0.0763, "step": 1707 }, { "epoch": 0.11041616161616162, "grad_norm": 0.08499639481306076, "learning_rate": 0.00019964183054794633, "loss": 0.1026, "step": 1708 }, { "epoch": 0.11048080808080808, "grad_norm": 0.08847616612911224, "learning_rate": 0.00019964125203467652, "loss": 0.0971, "step": 
1709 }, { "epoch": 0.11054545454545454, "grad_norm": 0.07781341671943665, "learning_rate": 0.0001996406730554175, "loss": 0.0829, "step": 1710 }, { "epoch": 0.110610101010101, "grad_norm": 0.09329476952552795, "learning_rate": 0.00019964009361017197, "loss": 0.0983, "step": 1711 }, { "epoch": 0.11067474747474748, "grad_norm": 0.08138521760702133, "learning_rate": 0.0001996395136989427, "loss": 0.0834, "step": 1712 }, { "epoch": 0.11067474747474748, "eval_bleu": 16.31455048457529, "eval_loss": 0.09661374986171722, "eval_runtime": 2.7928, "eval_samples_per_second": 11.458, "eval_steps_per_second": 1.432, "step": 1712 }, { "epoch": 0.11073939393939394, "grad_norm": 0.09928783029317856, "learning_rate": 0.00019963893332173235, "loss": 0.107, "step": 1713 }, { "epoch": 0.1108040404040404, "grad_norm": 0.10143221914768219, "learning_rate": 0.00019963835247854364, "loss": 0.1207, "step": 1714 }, { "epoch": 0.11086868686868687, "grad_norm": 0.08343500643968582, "learning_rate": 0.0001996377711693793, "loss": 0.0916, "step": 1715 }, { "epoch": 0.11093333333333333, "grad_norm": 0.0842021107673645, "learning_rate": 0.00019963718939424206, "loss": 0.093, "step": 1716 }, { "epoch": 0.1109979797979798, "grad_norm": 0.09642371535301208, "learning_rate": 0.0001996366071531346, "loss": 0.0875, "step": 1717 }, { "epoch": 0.11106262626262627, "grad_norm": 0.09394519776105881, "learning_rate": 0.00019963602444605968, "loss": 0.0931, "step": 1718 }, { "epoch": 0.11112727272727273, "grad_norm": 0.09390253573656082, "learning_rate": 0.00019963544127302, "loss": 0.095, "step": 1719 }, { "epoch": 0.11119191919191919, "grad_norm": 0.1078769862651825, "learning_rate": 0.0001996348576340183, "loss": 0.1018, "step": 1720 }, { "epoch": 0.11125656565656565, "grad_norm": 0.08828653395175934, "learning_rate": 0.00019963427352905733, "loss": 0.096, "step": 1721 }, { "epoch": 0.11132121212121213, "grad_norm": 0.08584805577993393, "learning_rate": 0.00019963368895813978, "loss": 0.098, "step": 1722 
}, { "epoch": 0.11138585858585859, "grad_norm": 0.0930759534239769, "learning_rate": 0.0001996331039212684, "loss": 0.1044, "step": 1723 }, { "epoch": 0.11145050505050505, "grad_norm": 0.1555097997188568, "learning_rate": 0.00019963251841844594, "loss": 0.1034, "step": 1724 }, { "epoch": 0.11151515151515151, "grad_norm": 0.10394205898046494, "learning_rate": 0.00019963193244967514, "loss": 0.1021, "step": 1725 }, { "epoch": 0.11157979797979797, "grad_norm": 0.1084916889667511, "learning_rate": 0.00019963134601495874, "loss": 0.1026, "step": 1726 }, { "epoch": 0.11164444444444445, "grad_norm": 0.08258991688489914, "learning_rate": 0.00019963075911429945, "loss": 0.1024, "step": 1727 }, { "epoch": 0.11170909090909091, "grad_norm": 0.08256273716688156, "learning_rate": 0.00019963017174770006, "loss": 0.0855, "step": 1728 }, { "epoch": 0.11170909090909091, "eval_bleu": 16.68002773176488, "eval_loss": 0.09654008597135544, "eval_runtime": 2.9008, "eval_samples_per_second": 11.031, "eval_steps_per_second": 1.379, "step": 1728 }, { "epoch": 0.11177373737373737, "grad_norm": 0.0940549299120903, "learning_rate": 0.00019962958391516326, "loss": 0.1068, "step": 1729 }, { "epoch": 0.11183838383838383, "grad_norm": 0.09226085990667343, "learning_rate": 0.00019962899561669185, "loss": 0.0858, "step": 1730 }, { "epoch": 0.11190303030303031, "grad_norm": 0.08729394525289536, "learning_rate": 0.00019962840685228857, "loss": 0.1006, "step": 1731 }, { "epoch": 0.11196767676767677, "grad_norm": 0.11189916729927063, "learning_rate": 0.00019962781762195616, "loss": 0.0992, "step": 1732 }, { "epoch": 0.11203232323232323, "grad_norm": 0.08743728697299957, "learning_rate": 0.0001996272279256974, "loss": 0.0861, "step": 1733 }, { "epoch": 0.1120969696969697, "grad_norm": 0.0881812646985054, "learning_rate": 0.00019962663776351502, "loss": 0.08, "step": 1734 }, { "epoch": 0.11216161616161616, "grad_norm": 0.08333799988031387, "learning_rate": 0.0001996260471354118, "loss": 0.0873, "step": 
1735 }, { "epoch": 0.11222626262626263, "grad_norm": 0.07822366803884506, "learning_rate": 0.00019962545604139046, "loss": 0.088, "step": 1736 }, { "epoch": 0.1122909090909091, "grad_norm": 0.12780636548995972, "learning_rate": 0.00019962486448145381, "loss": 0.1047, "step": 1737 }, { "epoch": 0.11235555555555556, "grad_norm": 0.08140812069177628, "learning_rate": 0.00019962427245560463, "loss": 0.0912, "step": 1738 }, { "epoch": 0.11242020202020202, "grad_norm": 0.08540894091129303, "learning_rate": 0.00019962367996384563, "loss": 0.1009, "step": 1739 }, { "epoch": 0.11248484848484848, "grad_norm": 0.10637736320495605, "learning_rate": 0.00019962308700617961, "loss": 0.1045, "step": 1740 }, { "epoch": 0.11254949494949495, "grad_norm": 0.1078633964061737, "learning_rate": 0.00019962249358260938, "loss": 0.0949, "step": 1741 }, { "epoch": 0.11261414141414142, "grad_norm": 0.08576861768960953, "learning_rate": 0.00019962189969313768, "loss": 0.0966, "step": 1742 }, { "epoch": 0.11267878787878788, "grad_norm": 0.07337532192468643, "learning_rate": 0.00019962130533776726, "loss": 0.075, "step": 1743 }, { "epoch": 0.11274343434343434, "grad_norm": 0.08354844152927399, "learning_rate": 0.00019962071051650098, "loss": 0.0872, "step": 1744 }, { "epoch": 0.11274343434343434, "eval_bleu": 15.466178365357706, "eval_loss": 0.09811587631702423, "eval_runtime": 2.8782, "eval_samples_per_second": 11.118, "eval_steps_per_second": 1.39, "step": 1744 }, { "epoch": 0.1128080808080808, "grad_norm": 0.08509067445993423, "learning_rate": 0.00019962011522934152, "loss": 0.0913, "step": 1745 }, { "epoch": 0.11287272727272728, "grad_norm": 0.1303311437368393, "learning_rate": 0.0001996195194762917, "loss": 0.1034, "step": 1746 }, { "epoch": 0.11293737373737374, "grad_norm": 0.09086159616708755, "learning_rate": 0.0001996189232573544, "loss": 0.0871, "step": 1747 }, { "epoch": 0.1130020202020202, "grad_norm": 0.09347430616617203, "learning_rate": 0.00019961832657253227, "loss": 0.1027, 
"step": 1748 }, { "epoch": 0.11306666666666666, "grad_norm": 0.08479762077331543, "learning_rate": 0.00019961772942182816, "loss": 0.0953, "step": 1749 }, { "epoch": 0.11313131313131314, "grad_norm": 0.08756949007511139, "learning_rate": 0.00019961713180524488, "loss": 0.0883, "step": 1750 }, { "epoch": 0.1131959595959596, "grad_norm": 0.10438496619462967, "learning_rate": 0.0001996165337227852, "loss": 0.1132, "step": 1751 }, { "epoch": 0.11326060606060606, "grad_norm": 0.09598714113235474, "learning_rate": 0.00019961593517445195, "loss": 0.1014, "step": 1752 }, { "epoch": 0.11332525252525252, "grad_norm": 0.1013709232211113, "learning_rate": 0.00019961533616024788, "loss": 0.1004, "step": 1753 }, { "epoch": 0.11338989898989899, "grad_norm": 0.10158463567495346, "learning_rate": 0.0001996147366801758, "loss": 0.0976, "step": 1754 }, { "epoch": 0.11345454545454546, "grad_norm": 0.08664679527282715, "learning_rate": 0.00019961413673423855, "loss": 0.0858, "step": 1755 }, { "epoch": 0.11351919191919192, "grad_norm": 0.08453410118818283, "learning_rate": 0.00019961353632243892, "loss": 0.0914, "step": 1756 }, { "epoch": 0.11358383838383838, "grad_norm": 0.10597959160804749, "learning_rate": 0.0001996129354447797, "loss": 0.0831, "step": 1757 }, { "epoch": 0.11364848484848485, "grad_norm": 0.0837024599313736, "learning_rate": 0.0001996123341012637, "loss": 0.0969, "step": 1758 }, { "epoch": 0.11371313131313131, "grad_norm": 0.07992946356534958, "learning_rate": 0.00019961173229189375, "loss": 0.0885, "step": 1759 }, { "epoch": 0.11377777777777778, "grad_norm": 0.0935758501291275, "learning_rate": 0.00019961113001667268, "loss": 0.0998, "step": 1760 }, { "epoch": 0.11377777777777778, "eval_bleu": 17.203350793717163, "eval_loss": 0.09911344200372696, "eval_runtime": 2.7632, "eval_samples_per_second": 11.581, "eval_steps_per_second": 1.448, "step": 1760 }, { "epoch": 0.11384242424242424, "grad_norm": 0.09442166984081268, "learning_rate": 0.00019961052727560325, "loss": 
0.1094, "step": 1761 }, { "epoch": 0.1139070707070707, "grad_norm": 0.0889282375574112, "learning_rate": 0.00019960992406868835, "loss": 0.0879, "step": 1762 }, { "epoch": 0.11397171717171717, "grad_norm": 0.11973419785499573, "learning_rate": 0.00019960932039593074, "loss": 0.0984, "step": 1763 }, { "epoch": 0.11403636363636363, "grad_norm": 0.10106372088193893, "learning_rate": 0.00019960871625733327, "loss": 0.0905, "step": 1764 }, { "epoch": 0.1141010101010101, "grad_norm": 0.11689893156290054, "learning_rate": 0.00019960811165289878, "loss": 0.1284, "step": 1765 }, { "epoch": 0.11416565656565657, "grad_norm": 0.1037783995270729, "learning_rate": 0.00019960750658263007, "loss": 0.1111, "step": 1766 }, { "epoch": 0.11423030303030303, "grad_norm": 0.08097214996814728, "learning_rate": 0.00019960690104653002, "loss": 0.0837, "step": 1767 }, { "epoch": 0.11429494949494949, "grad_norm": 0.08323007076978683, "learning_rate": 0.00019960629504460137, "loss": 0.0905, "step": 1768 }, { "epoch": 0.11435959595959595, "grad_norm": 0.08763524144887924, "learning_rate": 0.00019960568857684704, "loss": 0.0965, "step": 1769 }, { "epoch": 0.11442424242424243, "grad_norm": 0.08229734748601913, "learning_rate": 0.00019960508164326983, "loss": 0.0888, "step": 1770 }, { "epoch": 0.11448888888888889, "grad_norm": 0.0831150934100151, "learning_rate": 0.00019960447424387256, "loss": 0.081, "step": 1771 }, { "epoch": 0.11455353535353535, "grad_norm": 0.09266072511672974, "learning_rate": 0.0001996038663786581, "loss": 0.1176, "step": 1772 }, { "epoch": 0.11461818181818181, "grad_norm": 0.08846625685691833, "learning_rate": 0.0001996032580476293, "loss": 0.0973, "step": 1773 }, { "epoch": 0.11468282828282829, "grad_norm": 0.09097465872764587, "learning_rate": 0.00019960264925078899, "loss": 0.1031, "step": 1774 }, { "epoch": 0.11474747474747475, "grad_norm": 0.08059729635715485, "learning_rate": 0.00019960203998814, "loss": 0.0842, "step": 1775 }, { "epoch": 0.11481212121212121, 
"grad_norm": 0.09724695980548859, "learning_rate": 0.0001996014302596852, "loss": 0.1119, "step": 1776 }, { "epoch": 0.11481212121212121, "eval_bleu": 12.563658772797544, "eval_loss": 0.1017531007528305, "eval_runtime": 2.8573, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 1776 }, { "epoch": 0.11487676767676767, "grad_norm": 0.09084711223840714, "learning_rate": 0.00019960082006542743, "loss": 0.0879, "step": 1777 }, { "epoch": 0.11494141414141414, "grad_norm": 0.10123063623905182, "learning_rate": 0.0001996002094053696, "loss": 0.1046, "step": 1778 }, { "epoch": 0.11500606060606061, "grad_norm": 0.08672968298196793, "learning_rate": 0.00019959959827951446, "loss": 0.0877, "step": 1779 }, { "epoch": 0.11507070707070707, "grad_norm": 0.09039455652236938, "learning_rate": 0.00019959898668786495, "loss": 0.1058, "step": 1780 }, { "epoch": 0.11513535353535354, "grad_norm": 0.08296473324298859, "learning_rate": 0.00019959837463042393, "loss": 0.1029, "step": 1781 }, { "epoch": 0.1152, "grad_norm": 0.08520353585481644, "learning_rate": 0.0001995977621071942, "loss": 0.0906, "step": 1782 }, { "epoch": 0.11526464646464646, "grad_norm": 0.0956898182630539, "learning_rate": 0.0001995971491181787, "loss": 0.1013, "step": 1783 }, { "epoch": 0.11532929292929293, "grad_norm": 0.09413713961839676, "learning_rate": 0.0001995965356633802, "loss": 0.0962, "step": 1784 }, { "epoch": 0.1153939393939394, "grad_norm": 0.09771838039159775, "learning_rate": 0.0001995959217428017, "loss": 0.1062, "step": 1785 }, { "epoch": 0.11545858585858586, "grad_norm": 0.0850255936384201, "learning_rate": 0.00019959530735644596, "loss": 0.085, "step": 1786 }, { "epoch": 0.11552323232323232, "grad_norm": 0.08241092413663864, "learning_rate": 0.0001995946925043159, "loss": 0.0787, "step": 1787 }, { "epoch": 0.11558787878787878, "grad_norm": 0.09113946557044983, "learning_rate": 0.0001995940771864144, "loss": 0.0978, "step": 1788 }, { "epoch": 0.11565252525252526, "grad_norm": 
0.09338917583227158, "learning_rate": 0.0001995934614027443, "loss": 0.1163, "step": 1789 }, { "epoch": 0.11571717171717172, "grad_norm": 0.09320656210184097, "learning_rate": 0.00019959284515330849, "loss": 0.0995, "step": 1790 }, { "epoch": 0.11578181818181818, "grad_norm": 0.09453748911619186, "learning_rate": 0.0001995922284381099, "loss": 0.1212, "step": 1791 }, { "epoch": 0.11584646464646464, "grad_norm": 0.08749670535326004, "learning_rate": 0.0001995916112571514, "loss": 0.0924, "step": 1792 }, { "epoch": 0.11584646464646464, "eval_bleu": 16.02098396083046, "eval_loss": 0.10107424110174179, "eval_runtime": 2.7008, "eval_samples_per_second": 11.848, "eval_steps_per_second": 1.481, "step": 1792 }, { "epoch": 0.11591111111111112, "grad_norm": 0.09330089390277863, "learning_rate": 0.00019959099361043582, "loss": 0.1074, "step": 1793 }, { "epoch": 0.11597575757575758, "grad_norm": 0.08451402187347412, "learning_rate": 0.00019959037549796614, "loss": 0.0838, "step": 1794 }, { "epoch": 0.11604040404040404, "grad_norm": 0.0749412477016449, "learning_rate": 0.00019958975691974513, "loss": 0.0806, "step": 1795 }, { "epoch": 0.1161050505050505, "grad_norm": 0.08658935129642487, "learning_rate": 0.0001995891378757758, "loss": 0.0851, "step": 1796 }, { "epoch": 0.11616969696969696, "grad_norm": 0.09365957230329514, "learning_rate": 0.00019958851836606099, "loss": 0.1087, "step": 1797 }, { "epoch": 0.11623434343434344, "grad_norm": 0.11999083310365677, "learning_rate": 0.00019958789839060357, "loss": 0.1114, "step": 1798 }, { "epoch": 0.1162989898989899, "grad_norm": 0.09224677085876465, "learning_rate": 0.00019958727794940648, "loss": 0.1095, "step": 1799 }, { "epoch": 0.11636363636363636, "grad_norm": 0.08700484037399292, "learning_rate": 0.00019958665704247264, "loss": 0.1066, "step": 1800 }, { "epoch": 0.11642828282828283, "grad_norm": 0.0821424275636673, "learning_rate": 0.00019958603566980492, "loss": 0.1015, "step": 1801 }, { "epoch": 0.11649292929292929, 
"grad_norm": 0.08366188406944275, "learning_rate": 0.00019958541383140624, "loss": 0.1045, "step": 1802 }, { "epoch": 0.11655757575757576, "grad_norm": 0.08512281626462936, "learning_rate": 0.0001995847915272795, "loss": 0.0841, "step": 1803 }, { "epoch": 0.11662222222222222, "grad_norm": 0.08686674386262894, "learning_rate": 0.0001995841687574276, "loss": 0.1075, "step": 1804 }, { "epoch": 0.11668686868686869, "grad_norm": 0.08095437288284302, "learning_rate": 0.00019958354552185344, "loss": 0.0938, "step": 1805 }, { "epoch": 0.11675151515151515, "grad_norm": 0.09457031637430191, "learning_rate": 0.00019958292182056, "loss": 0.1097, "step": 1806 }, { "epoch": 0.11681616161616161, "grad_norm": 0.08220674842596054, "learning_rate": 0.00019958229765355015, "loss": 0.0833, "step": 1807 }, { "epoch": 0.11688080808080809, "grad_norm": 0.08338411152362823, "learning_rate": 0.00019958167302082678, "loss": 0.0837, "step": 1808 }, { "epoch": 0.11688080808080809, "eval_bleu": 15.42548785792838, "eval_loss": 0.09879328310489655, "eval_runtime": 2.7946, "eval_samples_per_second": 11.451, "eval_steps_per_second": 1.431, "step": 1808 }, { "epoch": 0.11694545454545455, "grad_norm": 0.08020108193159103, "learning_rate": 0.0001995810479223929, "loss": 0.0899, "step": 1809 }, { "epoch": 0.11701010101010101, "grad_norm": 0.09933420270681381, "learning_rate": 0.00019958042235825136, "loss": 0.1217, "step": 1810 }, { "epoch": 0.11707474747474747, "grad_norm": 0.09501846134662628, "learning_rate": 0.0001995797963284051, "loss": 0.0839, "step": 1811 }, { "epoch": 0.11713939393939395, "grad_norm": 0.11073479801416397, "learning_rate": 0.00019957916983285705, "loss": 0.1158, "step": 1812 }, { "epoch": 0.11720404040404041, "grad_norm": 0.08117087185382843, "learning_rate": 0.00019957854287161017, "loss": 0.0988, "step": 1813 }, { "epoch": 0.11726868686868687, "grad_norm": 0.08467129617929459, "learning_rate": 0.0001995779154446673, "loss": 0.09, "step": 1814 }, { "epoch": 
0.11733333333333333, "grad_norm": 0.09511814266443253, "learning_rate": 0.0001995772875520315, "loss": 0.1003, "step": 1815 }, { "epoch": 0.11739797979797979, "grad_norm": 0.09553271532058716, "learning_rate": 0.00019957665919370562, "loss": 0.1031, "step": 1816 }, { "epoch": 0.11746262626262627, "grad_norm": 0.07528268545866013, "learning_rate": 0.0001995760303696926, "loss": 0.0745, "step": 1817 }, { "epoch": 0.11752727272727273, "grad_norm": 0.08599118143320084, "learning_rate": 0.00019957540107999545, "loss": 0.0913, "step": 1818 }, { "epoch": 0.11759191919191919, "grad_norm": 0.09785564988851547, "learning_rate": 0.00019957477132461708, "loss": 0.1053, "step": 1819 }, { "epoch": 0.11765656565656565, "grad_norm": 0.08028493076562881, "learning_rate": 0.0001995741411035604, "loss": 0.0889, "step": 1820 }, { "epoch": 0.11772121212121212, "grad_norm": 0.0831385925412178, "learning_rate": 0.00019957351041682836, "loss": 0.0946, "step": 1821 }, { "epoch": 0.11778585858585859, "grad_norm": 0.08381468057632446, "learning_rate": 0.00019957287926442393, "loss": 0.0975, "step": 1822 }, { "epoch": 0.11785050505050505, "grad_norm": 0.0780530571937561, "learning_rate": 0.0001995722476463501, "loss": 0.0811, "step": 1823 }, { "epoch": 0.11791515151515151, "grad_norm": 0.09475967288017273, "learning_rate": 0.00019957161556260976, "loss": 0.1071, "step": 1824 }, { "epoch": 0.11791515151515151, "eval_bleu": 12.836995387579664, "eval_loss": 0.10115996748209, "eval_runtime": 2.6106, "eval_samples_per_second": 12.257, "eval_steps_per_second": 1.532, "step": 1824 }, { "epoch": 0.11797979797979798, "grad_norm": 0.06906456500291824, "learning_rate": 0.0001995709830132059, "loss": 0.0708, "step": 1825 }, { "epoch": 0.11804444444444444, "grad_norm": 0.07418100535869598, "learning_rate": 0.0001995703499981415, "loss": 0.0772, "step": 1826 }, { "epoch": 0.11810909090909091, "grad_norm": 0.08834165334701538, "learning_rate": 0.00019956971651741943, "loss": 0.0994, "step": 1827 }, { 
"epoch": 0.11817373737373738, "grad_norm": 0.08954574912786484, "learning_rate": 0.00019956908257104275, "loss": 0.092, "step": 1828 }, { "epoch": 0.11823838383838384, "grad_norm": 0.09749346226453781, "learning_rate": 0.00019956844815901436, "loss": 0.1063, "step": 1829 }, { "epoch": 0.1183030303030303, "grad_norm": 0.09436722844839096, "learning_rate": 0.00019956781328133726, "loss": 0.0965, "step": 1830 }, { "epoch": 0.11836767676767677, "grad_norm": 0.08959711343050003, "learning_rate": 0.00019956717793801442, "loss": 0.1066, "step": 1831 }, { "epoch": 0.11843232323232324, "grad_norm": 0.07377462834119797, "learning_rate": 0.00019956654212904883, "loss": 0.0805, "step": 1832 }, { "epoch": 0.1184969696969697, "grad_norm": 0.09313827008008957, "learning_rate": 0.00019956590585444342, "loss": 0.0936, "step": 1833 }, { "epoch": 0.11856161616161616, "grad_norm": 0.08020653575658798, "learning_rate": 0.00019956526911420118, "loss": 0.0844, "step": 1834 }, { "epoch": 0.11862626262626262, "grad_norm": 0.0924125388264656, "learning_rate": 0.0001995646319083251, "loss": 0.1011, "step": 1835 }, { "epoch": 0.1186909090909091, "grad_norm": 0.08196383714675903, "learning_rate": 0.00019956399423681816, "loss": 0.0915, "step": 1836 }, { "epoch": 0.11875555555555556, "grad_norm": 0.09715278446674347, "learning_rate": 0.0001995633560996833, "loss": 0.1125, "step": 1837 }, { "epoch": 0.11882020202020202, "grad_norm": 0.0908689796924591, "learning_rate": 0.00019956271749692358, "loss": 0.1001, "step": 1838 }, { "epoch": 0.11888484848484848, "grad_norm": 0.08144357800483704, "learning_rate": 0.0001995620784285419, "loss": 0.0999, "step": 1839 }, { "epoch": 0.11894949494949494, "grad_norm": 0.08233830332756042, "learning_rate": 0.00019956143889454135, "loss": 0.0867, "step": 1840 }, { "epoch": 0.11894949494949494, "eval_bleu": 13.495300645389168, "eval_loss": 0.09907495975494385, "eval_runtime": 2.7916, "eval_samples_per_second": 11.463, "eval_steps_per_second": 1.433, "step": 1840 
}, { "epoch": 0.11901414141414142, "grad_norm": 0.08961008489131927, "learning_rate": 0.00019956079889492482, "loss": 0.0952, "step": 1841 }, { "epoch": 0.11907878787878788, "grad_norm": 0.10404586046934128, "learning_rate": 0.00019956015842969538, "loss": 0.0921, "step": 1842 }, { "epoch": 0.11914343434343434, "grad_norm": 0.09377691894769669, "learning_rate": 0.00019955951749885595, "loss": 0.1117, "step": 1843 }, { "epoch": 0.1192080808080808, "grad_norm": 0.07946504652500153, "learning_rate": 0.0001995588761024096, "loss": 0.0759, "step": 1844 }, { "epoch": 0.11927272727272727, "grad_norm": 0.09573984146118164, "learning_rate": 0.00019955823424035928, "loss": 0.1129, "step": 1845 }, { "epoch": 0.11933737373737374, "grad_norm": 0.0993974357843399, "learning_rate": 0.00019955759191270803, "loss": 0.1197, "step": 1846 }, { "epoch": 0.1194020202020202, "grad_norm": 0.08959382772445679, "learning_rate": 0.00019955694911945885, "loss": 0.0824, "step": 1847 }, { "epoch": 0.11946666666666667, "grad_norm": 0.07440722733736038, "learning_rate": 0.0001995563058606147, "loss": 0.0794, "step": 1848 }, { "epoch": 0.11953131313131313, "grad_norm": 0.09743315726518631, "learning_rate": 0.00019955566213617865, "loss": 0.0849, "step": 1849 }, { "epoch": 0.1195959595959596, "grad_norm": 0.08127295225858688, "learning_rate": 0.00019955501794615365, "loss": 0.0868, "step": 1850 }, { "epoch": 0.11966060606060606, "grad_norm": 0.08320796489715576, "learning_rate": 0.00019955437329054277, "loss": 0.1018, "step": 1851 }, { "epoch": 0.11972525252525253, "grad_norm": 0.09335729479789734, "learning_rate": 0.00019955372816934897, "loss": 0.0924, "step": 1852 }, { "epoch": 0.11978989898989899, "grad_norm": 0.0801360085606575, "learning_rate": 0.00019955308258257532, "loss": 0.0781, "step": 1853 }, { "epoch": 0.11985454545454545, "grad_norm": 0.0855412632226944, "learning_rate": 0.0001995524365302248, "loss": 0.0866, "step": 1854 }, { "epoch": 0.11991919191919193, "grad_norm": 
0.10311412066221237, "learning_rate": 0.00019955179001230047, "loss": 0.1183, "step": 1855 }, { "epoch": 0.11998383838383839, "grad_norm": 0.08917050808668137, "learning_rate": 0.0001995511430288053, "loss": 0.0966, "step": 1856 }, { "epoch": 0.11998383838383839, "eval_bleu": 15.815607722549192, "eval_loss": 0.09831130504608154, "eval_runtime": 2.7986, "eval_samples_per_second": 11.434, "eval_steps_per_second": 1.429, "step": 1856 }, { "epoch": 0.12004848484848485, "grad_norm": 0.08762361109256744, "learning_rate": 0.00019955049557974236, "loss": 0.0914, "step": 1857 }, { "epoch": 0.12011313131313131, "grad_norm": 0.0820266604423523, "learning_rate": 0.00019954984766511465, "loss": 0.0854, "step": 1858 }, { "epoch": 0.12017777777777777, "grad_norm": 0.08227520436048508, "learning_rate": 0.00019954919928492524, "loss": 0.0931, "step": 1859 }, { "epoch": 0.12024242424242425, "grad_norm": 0.08380527049303055, "learning_rate": 0.00019954855043917712, "loss": 0.0852, "step": 1860 }, { "epoch": 0.12030707070707071, "grad_norm": 0.07569682598114014, "learning_rate": 0.00019954790112787334, "loss": 0.0835, "step": 1861 }, { "epoch": 0.12037171717171717, "grad_norm": 0.08127695322036743, "learning_rate": 0.00019954725135101694, "loss": 0.0843, "step": 1862 }, { "epoch": 0.12043636363636363, "grad_norm": 0.09327790886163712, "learning_rate": 0.00019954660110861093, "loss": 0.1031, "step": 1863 }, { "epoch": 0.1205010101010101, "grad_norm": 0.11711227148771286, "learning_rate": 0.0001995459504006584, "loss": 0.1151, "step": 1864 }, { "epoch": 0.12056565656565657, "grad_norm": 0.09174749255180359, "learning_rate": 0.00019954529922716236, "loss": 0.0901, "step": 1865 }, { "epoch": 0.12063030303030303, "grad_norm": 0.0782327950000763, "learning_rate": 0.00019954464758812588, "loss": 0.086, "step": 1866 }, { "epoch": 0.1206949494949495, "grad_norm": 0.10070976614952087, "learning_rate": 0.00019954399548355198, "loss": 0.097, "step": 1867 }, { "epoch": 0.12075959595959596, 
"grad_norm": 0.08702991902828217, "learning_rate": 0.00019954334291344373, "loss": 0.0857, "step": 1868 }, { "epoch": 0.12082424242424242, "grad_norm": 0.08011528849601746, "learning_rate": 0.00019954268987780417, "loss": 0.0955, "step": 1869 }, { "epoch": 0.12088888888888889, "grad_norm": 0.07657444477081299, "learning_rate": 0.00019954203637663636, "loss": 0.0851, "step": 1870 }, { "epoch": 0.12095353535353535, "grad_norm": 0.22027158737182617, "learning_rate": 0.00019954138240994333, "loss": 0.1058, "step": 1871 }, { "epoch": 0.12101818181818182, "grad_norm": 0.10446283221244812, "learning_rate": 0.00019954072797772815, "loss": 0.121, "step": 1872 }, { "epoch": 0.12101818181818182, "eval_bleu": 13.164717037230142, "eval_loss": 0.09834155440330505, "eval_runtime": 2.742, "eval_samples_per_second": 11.67, "eval_steps_per_second": 1.459, "step": 1872 }, { "epoch": 0.12108282828282828, "grad_norm": 0.09554079174995422, "learning_rate": 0.00019954007307999394, "loss": 0.1075, "step": 1873 }, { "epoch": 0.12114747474747475, "grad_norm": 0.09154904633760452, "learning_rate": 0.0001995394177167437, "loss": 0.0973, "step": 1874 }, { "epoch": 0.12121212121212122, "grad_norm": 0.08791657537221909, "learning_rate": 0.00019953876188798052, "loss": 0.0909, "step": 1875 }, { "epoch": 0.12127676767676768, "grad_norm": 0.08171246200799942, "learning_rate": 0.00019953810559370742, "loss": 0.088, "step": 1876 }, { "epoch": 0.12134141414141414, "grad_norm": 0.0776485726237297, "learning_rate": 0.00019953744883392755, "loss": 0.0797, "step": 1877 }, { "epoch": 0.1214060606060606, "grad_norm": 0.09048549830913544, "learning_rate": 0.00019953679160864392, "loss": 0.0991, "step": 1878 }, { "epoch": 0.12147070707070708, "grad_norm": 0.09326526522636414, "learning_rate": 0.00019953613391785963, "loss": 0.1003, "step": 1879 }, { "epoch": 0.12153535353535354, "grad_norm": 0.08852504193782806, "learning_rate": 0.00019953547576157774, "loss": 0.0978, "step": 1880 }, { "epoch": 0.1216, 
"grad_norm": 0.10462523996829987, "learning_rate": 0.00019953481713980134, "loss": 0.1207, "step": 1881 }, { "epoch": 0.12166464646464646, "grad_norm": 0.08366268873214722, "learning_rate": 0.00019953415805253348, "loss": 0.0867, "step": 1882 }, { "epoch": 0.12172929292929292, "grad_norm": 0.08039695024490356, "learning_rate": 0.0001995334984997773, "loss": 0.0809, "step": 1883 }, { "epoch": 0.1217939393939394, "grad_norm": 0.0831957682967186, "learning_rate": 0.00019953283848153584, "loss": 0.0831, "step": 1884 }, { "epoch": 0.12185858585858586, "grad_norm": 0.08359136432409286, "learning_rate": 0.0001995321779978122, "loss": 0.0813, "step": 1885 }, { "epoch": 0.12192323232323232, "grad_norm": 0.11489012092351913, "learning_rate": 0.0001995315170486095, "loss": 0.1025, "step": 1886 }, { "epoch": 0.12198787878787878, "grad_norm": 0.09697767347097397, "learning_rate": 0.00019953085563393074, "loss": 0.1042, "step": 1887 }, { "epoch": 0.12205252525252525, "grad_norm": 0.07990733534097672, "learning_rate": 0.00019953019375377912, "loss": 0.086, "step": 1888 }, { "epoch": 0.12205252525252525, "eval_bleu": 12.424350258259723, "eval_loss": 0.10116302967071533, "eval_runtime": 2.7286, "eval_samples_per_second": 11.728, "eval_steps_per_second": 1.466, "step": 1888 }, { "epoch": 0.12211717171717172, "grad_norm": 0.0896470919251442, "learning_rate": 0.00019952953140815765, "loss": 0.0883, "step": 1889 }, { "epoch": 0.12218181818181818, "grad_norm": 0.08684299886226654, "learning_rate": 0.0001995288685970695, "loss": 0.0877, "step": 1890 }, { "epoch": 0.12224646464646464, "grad_norm": 0.09075053781270981, "learning_rate": 0.00019952820532051773, "loss": 0.0958, "step": 1891 }, { "epoch": 0.1223111111111111, "grad_norm": 0.0927472859621048, "learning_rate": 0.00019952754157850545, "loss": 0.1054, "step": 1892 }, { "epoch": 0.12237575757575758, "grad_norm": 0.08077292144298553, "learning_rate": 0.00019952687737103571, "loss": 0.0852, "step": 1893 }, { "epoch": 
0.12244040404040404, "grad_norm": 0.07959099858999252, "learning_rate": 0.0001995262126981117, "loss": 0.0858, "step": 1894 }, { "epoch": 0.1225050505050505, "grad_norm": 0.08473529666662216, "learning_rate": 0.00019952554755973652, "loss": 0.0901, "step": 1895 }, { "epoch": 0.12256969696969697, "grad_norm": 0.0825471356511116, "learning_rate": 0.00019952488195591324, "loss": 0.082, "step": 1896 }, { "epoch": 0.12263434343434343, "grad_norm": 0.0848509669303894, "learning_rate": 0.00019952421588664498, "loss": 0.0941, "step": 1897 }, { "epoch": 0.1226989898989899, "grad_norm": 0.08734121918678284, "learning_rate": 0.00019952354935193488, "loss": 0.0972, "step": 1898 }, { "epoch": 0.12276363636363637, "grad_norm": 0.09622342884540558, "learning_rate": 0.00019952288235178603, "loss": 0.0947, "step": 1899 }, { "epoch": 0.12282828282828283, "grad_norm": 0.08091770857572556, "learning_rate": 0.00019952221488620157, "loss": 0.0918, "step": 1900 }, { "epoch": 0.12289292929292929, "grad_norm": 0.09355922788381577, "learning_rate": 0.0001995215469551846, "loss": 0.0998, "step": 1901 }, { "epoch": 0.12295757575757575, "grad_norm": 0.08547891676425934, "learning_rate": 0.00019952087855873828, "loss": 0.0874, "step": 1902 }, { "epoch": 0.12302222222222223, "grad_norm": 0.09887318313121796, "learning_rate": 0.00019952020969686568, "loss": 0.1125, "step": 1903 }, { "epoch": 0.12308686868686869, "grad_norm": 0.1272737979888916, "learning_rate": 0.00019951954036957, "loss": 0.1213, "step": 1904 }, { "epoch": 0.12308686868686869, "eval_bleu": 15.256617319812614, "eval_loss": 0.09971657395362854, "eval_runtime": 2.6871, "eval_samples_per_second": 11.909, "eval_steps_per_second": 1.489, "step": 1904 }, { "epoch": 0.12315151515151515, "grad_norm": 0.07746249437332153, "learning_rate": 0.0001995188705768543, "loss": 0.0772, "step": 1905 }, { "epoch": 0.12321616161616161, "grad_norm": 0.09049322456121445, "learning_rate": 0.00019951820031872172, "loss": 0.1078, "step": 1906 }, { 
"epoch": 0.12328080808080807, "grad_norm": 0.09215855598449707, "learning_rate": 0.00019951752959517543, "loss": 0.0859, "step": 1907 }, { "epoch": 0.12334545454545455, "grad_norm": 0.075960673391819, "learning_rate": 0.0001995168584062186, "loss": 0.086, "step": 1908 }, { "epoch": 0.12341010101010101, "grad_norm": 0.2640346586704254, "learning_rate": 0.00019951618675185427, "loss": 0.1581, "step": 1909 }, { "epoch": 0.12347474747474747, "grad_norm": 0.08572307229042053, "learning_rate": 0.0001995155146320857, "loss": 0.0891, "step": 1910 }, { "epoch": 0.12353939393939393, "grad_norm": 0.09943751245737076, "learning_rate": 0.00019951484204691592, "loss": 0.1025, "step": 1911 }, { "epoch": 0.12360404040404041, "grad_norm": 0.09284835308790207, "learning_rate": 0.00019951416899634809, "loss": 0.0939, "step": 1912 }, { "epoch": 0.12366868686868687, "grad_norm": 0.10654886811971664, "learning_rate": 0.00019951349548038544, "loss": 0.1059, "step": 1913 }, { "epoch": 0.12373333333333333, "grad_norm": 0.12765763700008392, "learning_rate": 0.00019951282149903104, "loss": 0.0882, "step": 1914 }, { "epoch": 0.1237979797979798, "grad_norm": 0.08594781160354614, "learning_rate": 0.0001995121470522881, "loss": 0.0895, "step": 1915 }, { "epoch": 0.12386262626262626, "grad_norm": 0.09550819545984268, "learning_rate": 0.00019951147214015974, "loss": 0.0962, "step": 1916 }, { "epoch": 0.12392727272727273, "grad_norm": 0.09613265097141266, "learning_rate": 0.00019951079676264912, "loss": 0.1156, "step": 1917 }, { "epoch": 0.1239919191919192, "grad_norm": 0.08825942873954773, "learning_rate": 0.00019951012091975941, "loss": 0.0944, "step": 1918 }, { "epoch": 0.12405656565656566, "grad_norm": 0.09429129213094711, "learning_rate": 0.00019950944461149376, "loss": 0.0963, "step": 1919 }, { "epoch": 0.12412121212121212, "grad_norm": 0.08821070194244385, "learning_rate": 0.0001995087678378553, "loss": 0.1003, "step": 1920 }, { "epoch": 0.12412121212121212, "eval_bleu": 15.988525128879363, 
"eval_loss": 0.09864120185375214, "eval_runtime": 2.9596, "eval_samples_per_second": 10.812, "eval_steps_per_second": 1.352, "step": 1920 }, { "epoch": 0.12418585858585858, "grad_norm": 0.08690501749515533, "learning_rate": 0.00019950809059884726, "loss": 0.0891, "step": 1921 }, { "epoch": 0.12425050505050506, "grad_norm": 0.08644890785217285, "learning_rate": 0.00019950741289447276, "loss": 0.1049, "step": 1922 }, { "epoch": 0.12431515151515152, "grad_norm": 0.07946109026670456, "learning_rate": 0.000199506734724735, "loss": 0.089, "step": 1923 }, { "epoch": 0.12437979797979798, "grad_norm": 0.07116784900426865, "learning_rate": 0.00019950605608963712, "loss": 0.0789, "step": 1924 }, { "epoch": 0.12444444444444444, "grad_norm": 0.07321831583976746, "learning_rate": 0.00019950537698918235, "loss": 0.0811, "step": 1925 }, { "epoch": 0.1245090909090909, "grad_norm": 0.0844600722193718, "learning_rate": 0.0001995046974233738, "loss": 0.0953, "step": 1926 }, { "epoch": 0.12457373737373738, "grad_norm": 0.08651264011859894, "learning_rate": 0.00019950401739221468, "loss": 0.0974, "step": 1927 }, { "epoch": 0.12463838383838384, "grad_norm": 0.11407974362373352, "learning_rate": 0.00019950333689570818, "loss": 0.1042, "step": 1928 }, { "epoch": 0.1247030303030303, "grad_norm": 0.08301576972007751, "learning_rate": 0.00019950265593385744, "loss": 0.0831, "step": 1929 }, { "epoch": 0.12476767676767676, "grad_norm": 0.09083203971385956, "learning_rate": 0.0001995019745066657, "loss": 0.0955, "step": 1930 }, { "epoch": 0.12483232323232324, "grad_norm": 0.08654728531837463, "learning_rate": 0.00019950129261413611, "loss": 0.1089, "step": 1931 }, { "epoch": 0.1248969696969697, "grad_norm": 0.10016711801290512, "learning_rate": 0.00019950061025627186, "loss": 0.1182, "step": 1932 }, { "epoch": 0.12496161616161616, "grad_norm": 0.11128496378660202, "learning_rate": 0.00019949992743307617, "loss": 0.0971, "step": 1933 }, { "epoch": 0.12502626262626262, "grad_norm": 
0.09567607939243317, "learning_rate": 0.00019949924414455219, "loss": 0.1029, "step": 1934 }, { "epoch": 0.12509090909090909, "grad_norm": 0.08960685133934021, "learning_rate": 0.00019949856039070315, "loss": 0.0936, "step": 1935 }, { "epoch": 0.12515555555555555, "grad_norm": 0.08176688104867935, "learning_rate": 0.00019949787617153225, "loss": 0.0958, "step": 1936 }, { "epoch": 0.12515555555555555, "eval_bleu": 15.18812551196839, "eval_loss": 0.09950312227010727, "eval_runtime": 2.733, "eval_samples_per_second": 11.709, "eval_steps_per_second": 1.464, "step": 1936 }, { "epoch": 0.125220202020202, "grad_norm": 0.08809114247560501, "learning_rate": 0.00019949719148704267, "loss": 0.1027, "step": 1937 }, { "epoch": 0.12528484848484847, "grad_norm": 0.08383925259113312, "learning_rate": 0.0001994965063372376, "loss": 0.1019, "step": 1938 }, { "epoch": 0.12534949494949496, "grad_norm": 0.08531201630830765, "learning_rate": 0.00019949582072212028, "loss": 0.0866, "step": 1939 }, { "epoch": 0.12541414141414142, "grad_norm": 0.09667918086051941, "learning_rate": 0.0001994951346416939, "loss": 0.0991, "step": 1940 }, { "epoch": 0.12547878787878788, "grad_norm": 0.08536456525325775, "learning_rate": 0.00019949444809596166, "loss": 0.0877, "step": 1941 }, { "epoch": 0.12554343434343435, "grad_norm": 0.09186483919620514, "learning_rate": 0.0001994937610849268, "loss": 0.1128, "step": 1942 }, { "epoch": 0.1256080808080808, "grad_norm": 0.0914599746465683, "learning_rate": 0.00019949307360859247, "loss": 0.1009, "step": 1943 }, { "epoch": 0.12567272727272727, "grad_norm": 0.08961135149002075, "learning_rate": 0.00019949238566696194, "loss": 0.0964, "step": 1944 }, { "epoch": 0.12573737373737373, "grad_norm": 0.07311463356018066, "learning_rate": 0.00019949169726003844, "loss": 0.0716, "step": 1945 }, { "epoch": 0.1258020202020202, "grad_norm": 0.097776859998703, "learning_rate": 0.00019949100838782512, "loss": 0.0958, "step": 1946 }, { "epoch": 0.12586666666666665, 
"grad_norm": 0.10604903101921082, "learning_rate": 0.00019949031905032528, "loss": 0.0898, "step": 1947 }, { "epoch": 0.12593131313131314, "grad_norm": 0.0812811553478241, "learning_rate": 0.0001994896292475421, "loss": 0.0793, "step": 1948 }, { "epoch": 0.1259959595959596, "grad_norm": 0.08305008709430695, "learning_rate": 0.00019948893897947883, "loss": 0.094, "step": 1949 }, { "epoch": 0.12606060606060607, "grad_norm": 0.10829906910657883, "learning_rate": 0.00019948824824613865, "loss": 0.1085, "step": 1950 }, { "epoch": 0.12612525252525253, "grad_norm": 0.08572188019752502, "learning_rate": 0.00019948755704752484, "loss": 0.0941, "step": 1951 }, { "epoch": 0.126189898989899, "grad_norm": 0.07729079574346542, "learning_rate": 0.0001994868653836406, "loss": 0.0893, "step": 1952 }, { "epoch": 0.126189898989899, "eval_bleu": 15.25016748426295, "eval_loss": 0.09827081859111786, "eval_runtime": 2.8563, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 1952 }, { "epoch": 0.12625454545454545, "grad_norm": 0.09723882377147675, "learning_rate": 0.00019948617325448923, "loss": 0.1134, "step": 1953 }, { "epoch": 0.12631919191919191, "grad_norm": 0.08374866098165512, "learning_rate": 0.00019948548066007387, "loss": 0.0843, "step": 1954 }, { "epoch": 0.12638383838383838, "grad_norm": 0.10691636800765991, "learning_rate": 0.00019948478760039779, "loss": 0.1068, "step": 1955 }, { "epoch": 0.12644848484848484, "grad_norm": 0.08607611060142517, "learning_rate": 0.00019948409407546429, "loss": 0.0942, "step": 1956 }, { "epoch": 0.1265131313131313, "grad_norm": 0.07911746203899384, "learning_rate": 0.00019948340008527654, "loss": 0.0922, "step": 1957 }, { "epoch": 0.1265777777777778, "grad_norm": 0.08027563244104385, "learning_rate": 0.00019948270562983783, "loss": 0.086, "step": 1958 }, { "epoch": 0.12664242424242425, "grad_norm": 0.10056855529546738, "learning_rate": 0.00019948201070915143, "loss": 0.0764, "step": 1959 }, { "epoch": 0.1267070707070707, 
"grad_norm": 0.09231962263584137, "learning_rate": 0.00019948131532322052, "loss": 0.1007, "step": 1960 }, { "epoch": 0.12677171717171717, "grad_norm": 0.12780733406543732, "learning_rate": 0.0001994806194720484, "loss": 0.1499, "step": 1961 }, { "epoch": 0.12683636363636364, "grad_norm": 0.0825180783867836, "learning_rate": 0.00019947992315563827, "loss": 0.0938, "step": 1962 }, { "epoch": 0.1269010101010101, "grad_norm": 0.08289946615695953, "learning_rate": 0.0001994792263739935, "loss": 0.0887, "step": 1963 }, { "epoch": 0.12696565656565656, "grad_norm": 0.07675183564424515, "learning_rate": 0.0001994785291271172, "loss": 0.0804, "step": 1964 }, { "epoch": 0.12703030303030302, "grad_norm": 0.0937013030052185, "learning_rate": 0.00019947783141501272, "loss": 0.1091, "step": 1965 }, { "epoch": 0.12709494949494948, "grad_norm": 0.0827571302652359, "learning_rate": 0.00019947713323768333, "loss": 0.1006, "step": 1966 }, { "epoch": 0.12715959595959597, "grad_norm": 0.08069519698619843, "learning_rate": 0.0001994764345951323, "loss": 0.0981, "step": 1967 }, { "epoch": 0.12722424242424243, "grad_norm": 0.08248139172792435, "learning_rate": 0.00019947573548736284, "loss": 0.1071, "step": 1968 }, { "epoch": 0.12722424242424243, "eval_bleu": 13.49704611250518, "eval_loss": 0.09819333255290985, "eval_runtime": 2.6274, "eval_samples_per_second": 12.179, "eval_steps_per_second": 1.522, "step": 1968 }, { "epoch": 0.1272888888888889, "grad_norm": 0.10741985589265823, "learning_rate": 0.00019947503591437823, "loss": 0.1265, "step": 1969 }, { "epoch": 0.12735353535353536, "grad_norm": 0.07623306661844254, "learning_rate": 0.0001994743358761818, "loss": 0.0883, "step": 1970 }, { "epoch": 0.12741818181818182, "grad_norm": 0.06979891657829285, "learning_rate": 0.00019947363537277678, "loss": 0.0855, "step": 1971 }, { "epoch": 0.12748282828282828, "grad_norm": 0.11560800671577454, "learning_rate": 0.00019947293440416642, "loss": 0.1162, "step": 1972 }, { "epoch": 
0.12754747474747474, "grad_norm": 0.09833060950040817, "learning_rate": 0.00019947223297035404, "loss": 0.1134, "step": 1973 }, { "epoch": 0.1276121212121212, "grad_norm": 0.07940519601106644, "learning_rate": 0.00019947153107134296, "loss": 0.0831, "step": 1974 }, { "epoch": 0.12767676767676767, "grad_norm": 0.07775519043207169, "learning_rate": 0.00019947082870713637, "loss": 0.0823, "step": 1975 }, { "epoch": 0.12774141414141413, "grad_norm": 0.08320373296737671, "learning_rate": 0.0001994701258777376, "loss": 0.087, "step": 1976 }, { "epoch": 0.12780606060606062, "grad_norm": 0.07904225587844849, "learning_rate": 0.00019946942258314995, "loss": 0.0939, "step": 1977 }, { "epoch": 0.12787070707070708, "grad_norm": 0.08934198319911957, "learning_rate": 0.00019946871882337667, "loss": 0.098, "step": 1978 }, { "epoch": 0.12793535353535354, "grad_norm": 0.08127976208925247, "learning_rate": 0.0001994680145984211, "loss": 0.0926, "step": 1979 }, { "epoch": 0.128, "grad_norm": 0.09419949352741241, "learning_rate": 0.00019946730990828653, "loss": 0.1129, "step": 1980 }, { "epoch": 0.12806464646464646, "grad_norm": 0.08693555742502213, "learning_rate": 0.00019946660475297622, "loss": 0.0894, "step": 1981 }, { "epoch": 0.12812929292929293, "grad_norm": 0.06886342912912369, "learning_rate": 0.00019946589913249344, "loss": 0.0736, "step": 1982 }, { "epoch": 0.1281939393939394, "grad_norm": 0.086369588971138, "learning_rate": 0.0001994651930468416, "loss": 0.0859, "step": 1983 }, { "epoch": 0.12825858585858585, "grad_norm": 0.08950663357973099, "learning_rate": 0.00019946448649602389, "loss": 0.1025, "step": 1984 }, { "epoch": 0.12825858585858585, "eval_bleu": 14.589686856672825, "eval_loss": 0.09829030930995941, "eval_runtime": 2.8228, "eval_samples_per_second": 11.336, "eval_steps_per_second": 1.417, "step": 1984 }, { "epoch": 0.1283232323232323, "grad_norm": 0.07537104934453964, "learning_rate": 0.00019946377948004367, "loss": 0.0733, "step": 1985 }, { "epoch": 
0.1283878787878788, "grad_norm": 0.08883395045995712, "learning_rate": 0.00019946307199890422, "loss": 0.1034, "step": 1986 }, { "epoch": 0.12845252525252526, "grad_norm": 0.09857851266860962, "learning_rate": 0.00019946236405260888, "loss": 0.1067, "step": 1987 }, { "epoch": 0.12851717171717172, "grad_norm": 0.08807091414928436, "learning_rate": 0.00019946165564116094, "loss": 0.0979, "step": 1988 }, { "epoch": 0.12858181818181819, "grad_norm": 0.07067893445491791, "learning_rate": 0.0001994609467645637, "loss": 0.071, "step": 1989 }, { "epoch": 0.12864646464646465, "grad_norm": 0.08889427036046982, "learning_rate": 0.00019946023742282054, "loss": 0.1053, "step": 1990 }, { "epoch": 0.1287111111111111, "grad_norm": 0.07630407065153122, "learning_rate": 0.0001994595276159347, "loss": 0.0898, "step": 1991 }, { "epoch": 0.12877575757575757, "grad_norm": 0.09465549886226654, "learning_rate": 0.00019945881734390955, "loss": 0.1044, "step": 1992 }, { "epoch": 0.12884040404040403, "grad_norm": 0.09524498134851456, "learning_rate": 0.00019945810660674835, "loss": 0.1052, "step": 1993 }, { "epoch": 0.1289050505050505, "grad_norm": 0.09550344198942184, "learning_rate": 0.00019945739540445448, "loss": 0.1032, "step": 1994 }, { "epoch": 0.12896969696969696, "grad_norm": 0.07416670024394989, "learning_rate": 0.00019945668373703128, "loss": 0.0789, "step": 1995 }, { "epoch": 0.12903434343434345, "grad_norm": 0.0895688608288765, "learning_rate": 0.00019945597160448203, "loss": 0.0949, "step": 1996 }, { "epoch": 0.1290989898989899, "grad_norm": 0.08989269286394119, "learning_rate": 0.00019945525900681008, "loss": 0.1098, "step": 1997 }, { "epoch": 0.12916363636363637, "grad_norm": 0.08079128712415695, "learning_rate": 0.00019945454594401878, "loss": 0.0886, "step": 1998 }, { "epoch": 0.12922828282828283, "grad_norm": 0.08059177547693253, "learning_rate": 0.00019945383241611145, "loss": 0.1012, "step": 1999 }, { "epoch": 0.1292929292929293, "grad_norm": 0.10066372901201248, 
"learning_rate": 0.0001994531184230914, "loss": 0.1121, "step": 2000 }, { "epoch": 0.1292929292929293, "eval_bleu": 13.839869148332909, "eval_loss": 0.09522367268800735, "eval_runtime": 2.7627, "eval_samples_per_second": 11.583, "eval_steps_per_second": 1.448, "step": 2000 }, { "epoch": 0.12935757575757575, "grad_norm": 0.09417274594306946, "learning_rate": 0.000199452403964962, "loss": 0.1071, "step": 2001 }, { "epoch": 0.12942222222222222, "grad_norm": 0.09200766682624817, "learning_rate": 0.00019945168904172657, "loss": 0.1009, "step": 2002 }, { "epoch": 0.12948686868686868, "grad_norm": 0.08576171100139618, "learning_rate": 0.0001994509736533885, "loss": 0.101, "step": 2003 }, { "epoch": 0.12955151515151514, "grad_norm": 0.09033402055501938, "learning_rate": 0.0001994502577999511, "loss": 0.0966, "step": 2004 }, { "epoch": 0.12961616161616163, "grad_norm": 0.07994651049375534, "learning_rate": 0.0001994495414814177, "loss": 0.0835, "step": 2005 }, { "epoch": 0.1296808080808081, "grad_norm": 0.09723614156246185, "learning_rate": 0.00019944882469779166, "loss": 0.1017, "step": 2006 }, { "epoch": 0.12974545454545455, "grad_norm": 0.08721049875020981, "learning_rate": 0.00019944810744907638, "loss": 0.0835, "step": 2007 }, { "epoch": 0.12981010101010101, "grad_norm": 0.0764656737446785, "learning_rate": 0.00019944738973527517, "loss": 0.0846, "step": 2008 }, { "epoch": 0.12987474747474748, "grad_norm": 0.07630682736635208, "learning_rate": 0.00019944667155639138, "loss": 0.0904, "step": 2009 }, { "epoch": 0.12993939393939394, "grad_norm": 0.08141928166151047, "learning_rate": 0.0001994459529124284, "loss": 0.0916, "step": 2010 }, { "epoch": 0.1300040404040404, "grad_norm": 0.08433999866247177, "learning_rate": 0.00019944523380338957, "loss": 0.0899, "step": 2011 }, { "epoch": 0.13006868686868686, "grad_norm": 0.0846228152513504, "learning_rate": 0.00019944451422927826, "loss": 0.1039, "step": 2012 }, { "epoch": 0.13013333333333332, "grad_norm": 0.10189270973205566, 
"learning_rate": 0.00019944379419009782, "loss": 0.107, "step": 2013 }, { "epoch": 0.13019797979797978, "grad_norm": 0.09998765587806702, "learning_rate": 0.00019944307368585163, "loss": 0.1171, "step": 2014 }, { "epoch": 0.13026262626262627, "grad_norm": 0.10510484129190445, "learning_rate": 0.00019944235271654307, "loss": 0.1121, "step": 2015 }, { "epoch": 0.13032727272727274, "grad_norm": 0.07617301493883133, "learning_rate": 0.0001994416312821755, "loss": 0.0823, "step": 2016 }, { "epoch": 0.13032727272727274, "eval_bleu": 15.509957147824409, "eval_loss": 0.09738773107528687, "eval_runtime": 3.0456, "eval_samples_per_second": 10.507, "eval_steps_per_second": 1.313, "step": 2016 }, { "epoch": 0.1303919191919192, "grad_norm": 0.09673111140727997, "learning_rate": 0.00019944090938275232, "loss": 0.1015, "step": 2017 }, { "epoch": 0.13045656565656566, "grad_norm": 0.10245068371295929, "learning_rate": 0.00019944018701827684, "loss": 0.1079, "step": 2018 }, { "epoch": 0.13052121212121212, "grad_norm": 0.07984739542007446, "learning_rate": 0.0001994394641887525, "loss": 0.0892, "step": 2019 }, { "epoch": 0.13058585858585858, "grad_norm": 0.07822220027446747, "learning_rate": 0.00019943874089418264, "loss": 0.0837, "step": 2020 }, { "epoch": 0.13065050505050504, "grad_norm": 0.08840775489807129, "learning_rate": 0.0001994380171345707, "loss": 0.0951, "step": 2021 }, { "epoch": 0.1307151515151515, "grad_norm": 0.09588046371936798, "learning_rate": 0.00019943729290992001, "loss": 0.0983, "step": 2022 }, { "epoch": 0.13077979797979797, "grad_norm": 0.08750557154417038, "learning_rate": 0.000199436568220234, "loss": 0.0888, "step": 2023 }, { "epoch": 0.13084444444444446, "grad_norm": 0.0954146534204483, "learning_rate": 0.00019943584306551598, "loss": 0.1087, "step": 2024 }, { "epoch": 0.13090909090909092, "grad_norm": 0.07021438330411911, "learning_rate": 0.00019943511744576942, "loss": 0.069, "step": 2025 }, { "epoch": 0.13097373737373738, "grad_norm": 
0.11486995220184326, "learning_rate": 0.0001994343913609977, "loss": 0.1011, "step": 2026 }, { "epoch": 0.13103838383838384, "grad_norm": 0.10010086745023727, "learning_rate": 0.00019943366481120416, "loss": 0.102, "step": 2027 }, { "epoch": 0.1311030303030303, "grad_norm": 0.09551717340946198, "learning_rate": 0.00019943293779639228, "loss": 0.0978, "step": 2028 }, { "epoch": 0.13116767676767677, "grad_norm": 0.07197084277868271, "learning_rate": 0.00019943221031656542, "loss": 0.0746, "step": 2029 }, { "epoch": 0.13123232323232323, "grad_norm": 0.08440660685300827, "learning_rate": 0.00019943148237172698, "loss": 0.0959, "step": 2030 }, { "epoch": 0.1312969696969697, "grad_norm": 0.10483204573392868, "learning_rate": 0.00019943075396188035, "loss": 0.1275, "step": 2031 }, { "epoch": 0.13136161616161615, "grad_norm": 0.08914661407470703, "learning_rate": 0.000199430025087029, "loss": 0.0957, "step": 2032 }, { "epoch": 0.13136161616161615, "eval_bleu": 14.10015424273172, "eval_loss": 0.0969596803188324, "eval_runtime": 2.7952, "eval_samples_per_second": 11.448, "eval_steps_per_second": 1.431, "step": 2032 }, { "epoch": 0.1314262626262626, "grad_norm": 0.07776562124490738, "learning_rate": 0.00019942929574717625, "loss": 0.0782, "step": 2033 }, { "epoch": 0.1314909090909091, "grad_norm": 0.08268604427576065, "learning_rate": 0.00019942856594232558, "loss": 0.0894, "step": 2034 }, { "epoch": 0.13155555555555556, "grad_norm": 0.09471847116947174, "learning_rate": 0.00019942783567248037, "loss": 0.0955, "step": 2035 }, { "epoch": 0.13162020202020203, "grad_norm": 0.0754428282380104, "learning_rate": 0.00019942710493764404, "loss": 0.088, "step": 2036 }, { "epoch": 0.1316848484848485, "grad_norm": 0.08042635768651962, "learning_rate": 0.00019942637373782001, "loss": 0.0812, "step": 2037 }, { "epoch": 0.13174949494949495, "grad_norm": 0.09189002215862274, "learning_rate": 0.00019942564207301168, "loss": 0.1033, "step": 2038 }, { "epoch": 0.1318141414141414, "grad_norm": 
0.08115535229444504, "learning_rate": 0.00019942490994322252, "loss": 0.0881, "step": 2039 }, { "epoch": 0.13187878787878787, "grad_norm": 0.08375845849514008, "learning_rate": 0.0001994241773484559, "loss": 0.0987, "step": 2040 }, { "epoch": 0.13194343434343433, "grad_norm": 0.10288199037313461, "learning_rate": 0.00019942344428871528, "loss": 0.1096, "step": 2041 }, { "epoch": 0.1320080808080808, "grad_norm": 0.08917142450809479, "learning_rate": 0.0001994227107640041, "loss": 0.0968, "step": 2042 }, { "epoch": 0.13207272727272729, "grad_norm": 0.08875776827335358, "learning_rate": 0.00019942197677432576, "loss": 0.1, "step": 2043 }, { "epoch": 0.13213737373737375, "grad_norm": 0.07881813496351242, "learning_rate": 0.00019942124231968371, "loss": 0.0903, "step": 2044 }, { "epoch": 0.1322020202020202, "grad_norm": 0.0783696249127388, "learning_rate": 0.00019942050740008135, "loss": 0.0791, "step": 2045 }, { "epoch": 0.13226666666666667, "grad_norm": 0.0984339788556099, "learning_rate": 0.0001994197720155222, "loss": 0.1148, "step": 2046 }, { "epoch": 0.13233131313131313, "grad_norm": 0.09149181842803955, "learning_rate": 0.00019941903616600958, "loss": 0.1006, "step": 2047 }, { "epoch": 0.1323959595959596, "grad_norm": 0.10625866800546646, "learning_rate": 0.00019941829985154703, "loss": 0.1237, "step": 2048 }, { "epoch": 0.1323959595959596, "eval_bleu": 11.790794773106036, "eval_loss": 0.09782740473747253, "eval_runtime": 2.7028, "eval_samples_per_second": 11.84, "eval_steps_per_second": 1.48, "step": 2048 }, { "epoch": 0.13246060606060606, "grad_norm": 0.10007159411907196, "learning_rate": 0.00019941756307213795, "loss": 0.102, "step": 2049 }, { "epoch": 0.13252525252525252, "grad_norm": 0.07241909950971603, "learning_rate": 0.0001994168258277858, "loss": 0.0774, "step": 2050 }, { "epoch": 0.13258989898989898, "grad_norm": 0.07422494143247604, "learning_rate": 0.000199416088118494, "loss": 0.0832, "step": 2051 }, { "epoch": 0.13265454545454544, "grad_norm": 
0.0820273905992508, "learning_rate": 0.00019941534994426604, "loss": 0.0885, "step": 2052 }, { "epoch": 0.13271919191919193, "grad_norm": 0.0825883001089096, "learning_rate": 0.00019941461130510536, "loss": 0.088, "step": 2053 }, { "epoch": 0.1327838383838384, "grad_norm": 0.0916120707988739, "learning_rate": 0.00019941387220101541, "loss": 0.0998, "step": 2054 }, { "epoch": 0.13284848484848485, "grad_norm": 0.09313715249300003, "learning_rate": 0.00019941313263199963, "loss": 0.1029, "step": 2055 }, { "epoch": 0.13291313131313132, "grad_norm": 0.08485183119773865, "learning_rate": 0.0001994123925980615, "loss": 0.1002, "step": 2056 }, { "epoch": 0.13297777777777778, "grad_norm": 0.08435779809951782, "learning_rate": 0.0001994116520992045, "loss": 0.1026, "step": 2057 }, { "epoch": 0.13304242424242424, "grad_norm": 0.08964058011770248, "learning_rate": 0.00019941091113543204, "loss": 0.0982, "step": 2058 }, { "epoch": 0.1331070707070707, "grad_norm": 0.08899747580289841, "learning_rate": 0.00019941016970674761, "loss": 0.1018, "step": 2059 }, { "epoch": 0.13317171717171716, "grad_norm": 0.07826949656009674, "learning_rate": 0.0001994094278131547, "loss": 0.0952, "step": 2060 }, { "epoch": 0.13323636363636363, "grad_norm": 0.09215851128101349, "learning_rate": 0.00019940868545465675, "loss": 0.1037, "step": 2061 }, { "epoch": 0.13330101010101011, "grad_norm": 0.08982773125171661, "learning_rate": 0.00019940794263125723, "loss": 0.105, "step": 2062 }, { "epoch": 0.13336565656565658, "grad_norm": 0.0929267480969429, "learning_rate": 0.00019940719934295964, "loss": 0.1085, "step": 2063 }, { "epoch": 0.13343030303030304, "grad_norm": 0.10089726001024246, "learning_rate": 0.00019940645558976744, "loss": 0.1217, "step": 2064 }, { "epoch": 0.13343030303030304, "eval_bleu": 13.068304622729366, "eval_loss": 0.09774366021156311, "eval_runtime": 2.8263, "eval_samples_per_second": 11.322, "eval_steps_per_second": 1.415, "step": 2064 }, { "epoch": 0.1334949494949495, 
"grad_norm": 0.0794333666563034, "learning_rate": 0.0001994057113716841, "loss": 0.0833, "step": 2065 }, { "epoch": 0.13355959595959596, "grad_norm": 0.0743979662656784, "learning_rate": 0.00019940496668871313, "loss": 0.079, "step": 2066 }, { "epoch": 0.13362424242424242, "grad_norm": 0.07490761578083038, "learning_rate": 0.00019940422154085798, "loss": 0.0826, "step": 2067 }, { "epoch": 0.13368888888888888, "grad_norm": 0.07392819225788116, "learning_rate": 0.00019940347592812215, "loss": 0.0833, "step": 2068 }, { "epoch": 0.13375353535353535, "grad_norm": 0.07630061358213425, "learning_rate": 0.00019940272985050913, "loss": 0.0794, "step": 2069 }, { "epoch": 0.1338181818181818, "grad_norm": 0.09349751472473145, "learning_rate": 0.00019940198330802242, "loss": 0.0824, "step": 2070 }, { "epoch": 0.13388282828282827, "grad_norm": 0.08001204580068588, "learning_rate": 0.00019940123630066546, "loss": 0.0878, "step": 2071 }, { "epoch": 0.13394747474747476, "grad_norm": 0.0950464978814125, "learning_rate": 0.0001994004888284418, "loss": 0.0953, "step": 2072 }, { "epoch": 0.13401212121212122, "grad_norm": 0.09755884855985641, "learning_rate": 0.00019939974089135492, "loss": 0.1086, "step": 2073 }, { "epoch": 0.13407676767676768, "grad_norm": 0.08843854814767838, "learning_rate": 0.00019939899248940833, "loss": 0.0955, "step": 2074 }, { "epoch": 0.13414141414141414, "grad_norm": 0.09414809197187424, "learning_rate": 0.00019939824362260545, "loss": 0.1062, "step": 2075 }, { "epoch": 0.1342060606060606, "grad_norm": 0.09230902791023254, "learning_rate": 0.00019939749429094992, "loss": 0.112, "step": 2076 }, { "epoch": 0.13427070707070707, "grad_norm": 0.07946252077817917, "learning_rate": 0.0001993967444944451, "loss": 0.0777, "step": 2077 }, { "epoch": 0.13433535353535353, "grad_norm": 0.07552774995565414, "learning_rate": 0.00019939599423309461, "loss": 0.0879, "step": 2078 }, { "epoch": 0.1344, "grad_norm": 0.12982022762298584, "learning_rate": 0.0001993952435069019, 
"loss": 0.0953, "step": 2079 }, { "epoch": 0.13446464646464645, "grad_norm": 0.08005748689174652, "learning_rate": 0.0001993944923158705, "loss": 0.0805, "step": 2080 }, { "epoch": 0.13446464646464645, "eval_bleu": 16.284477901282663, "eval_loss": 0.09954658150672913, "eval_runtime": 2.7672, "eval_samples_per_second": 11.564, "eval_steps_per_second": 1.445, "step": 2080 }, { "epoch": 0.13452929292929294, "grad_norm": 0.1251658797264099, "learning_rate": 0.00019939374066000392, "loss": 0.1077, "step": 2081 }, { "epoch": 0.1345939393939394, "grad_norm": 0.10091298073530197, "learning_rate": 0.00019939298853930567, "loss": 0.1289, "step": 2082 }, { "epoch": 0.13465858585858587, "grad_norm": 0.07252799719572067, "learning_rate": 0.00019939223595377928, "loss": 0.0832, "step": 2083 }, { "epoch": 0.13472323232323233, "grad_norm": 0.07752204686403275, "learning_rate": 0.00019939148290342825, "loss": 0.0869, "step": 2084 }, { "epoch": 0.1347878787878788, "grad_norm": 0.07700987160205841, "learning_rate": 0.00019939072938825612, "loss": 0.079, "step": 2085 }, { "epoch": 0.13485252525252525, "grad_norm": 0.0797998309135437, "learning_rate": 0.00019938997540826638, "loss": 0.0904, "step": 2086 }, { "epoch": 0.1349171717171717, "grad_norm": 0.09481562674045563, "learning_rate": 0.0001993892209634626, "loss": 0.0845, "step": 2087 }, { "epoch": 0.13498181818181818, "grad_norm": 0.08537639677524567, "learning_rate": 0.00019938846605384831, "loss": 0.1051, "step": 2088 }, { "epoch": 0.13504646464646464, "grad_norm": 0.07488562911748886, "learning_rate": 0.00019938771067942702, "loss": 0.0841, "step": 2089 }, { "epoch": 0.1351111111111111, "grad_norm": 0.075688935816288, "learning_rate": 0.00019938695484020223, "loss": 0.0777, "step": 2090 }, { "epoch": 0.1351757575757576, "grad_norm": 0.0655268207192421, "learning_rate": 0.00019938619853617753, "loss": 0.073, "step": 2091 }, { "epoch": 0.13524040404040405, "grad_norm": 0.08922410756349564, "learning_rate": 0.00019938544176735645, 
"loss": 0.1101, "step": 2092 }, { "epoch": 0.1353050505050505, "grad_norm": 0.08207284659147263, "learning_rate": 0.0001993846845337425, "loss": 0.0959, "step": 2093 }, { "epoch": 0.13536969696969697, "grad_norm": 0.09461072832345963, "learning_rate": 0.00019938392683533924, "loss": 0.1096, "step": 2094 }, { "epoch": 0.13543434343434343, "grad_norm": 0.08169141411781311, "learning_rate": 0.00019938316867215022, "loss": 0.086, "step": 2095 }, { "epoch": 0.1354989898989899, "grad_norm": 0.08676749467849731, "learning_rate": 0.00019938241004417893, "loss": 0.1172, "step": 2096 }, { "epoch": 0.1354989898989899, "eval_bleu": 15.876729664676253, "eval_loss": 0.09793879091739655, "eval_runtime": 2.7421, "eval_samples_per_second": 11.67, "eval_steps_per_second": 1.459, "step": 2096 }, { "epoch": 0.13556363636363636, "grad_norm": 0.08110977709293365, "learning_rate": 0.000199381650951429, "loss": 0.0902, "step": 2097 }, { "epoch": 0.13562828282828282, "grad_norm": 0.08314534276723862, "learning_rate": 0.00019938089139390395, "loss": 0.0953, "step": 2098 }, { "epoch": 0.13569292929292928, "grad_norm": 0.07887057960033417, "learning_rate": 0.0001993801313716073, "loss": 0.0876, "step": 2099 }, { "epoch": 0.13575757575757577, "grad_norm": 0.10084131360054016, "learning_rate": 0.00019937937088454266, "loss": 0.1064, "step": 2100 }, { "epoch": 0.13582222222222223, "grad_norm": 0.10975068807601929, "learning_rate": 0.00019937860993271352, "loss": 0.1413, "step": 2101 }, { "epoch": 0.1358868686868687, "grad_norm": 0.0831703469157219, "learning_rate": 0.0001993778485161235, "loss": 0.0953, "step": 2102 }, { "epoch": 0.13595151515151516, "grad_norm": 0.07815272361040115, "learning_rate": 0.00019937708663477613, "loss": 0.1048, "step": 2103 }, { "epoch": 0.13601616161616162, "grad_norm": 0.07690869271755219, "learning_rate": 0.00019937632428867498, "loss": 0.0915, "step": 2104 }, { "epoch": 0.13608080808080808, "grad_norm": 0.08577138930559158, "learning_rate": 
0.00019937556147782363, "loss": 0.0865, "step": 2105 }, { "epoch": 0.13614545454545454, "grad_norm": 0.07861332595348358, "learning_rate": 0.00019937479820222564, "loss": 0.0942, "step": 2106 }, { "epoch": 0.136210101010101, "grad_norm": 0.08143707364797592, "learning_rate": 0.00019937403446188454, "loss": 0.0918, "step": 2107 }, { "epoch": 0.13627474747474747, "grad_norm": 0.09098243713378906, "learning_rate": 0.00019937327025680395, "loss": 0.1031, "step": 2108 }, { "epoch": 0.13633939393939393, "grad_norm": 0.09335984289646149, "learning_rate": 0.0001993725055869874, "loss": 0.0978, "step": 2109 }, { "epoch": 0.13640404040404042, "grad_norm": 0.08311004936695099, "learning_rate": 0.00019937174045243854, "loss": 0.0898, "step": 2110 }, { "epoch": 0.13646868686868688, "grad_norm": 0.07999549806118011, "learning_rate": 0.00019937097485316088, "loss": 0.0828, "step": 2111 }, { "epoch": 0.13653333333333334, "grad_norm": 0.09194918721914291, "learning_rate": 0.00019937020878915798, "loss": 0.093, "step": 2112 }, { "epoch": 0.13653333333333334, "eval_bleu": 15.07902042115171, "eval_loss": 0.09937527775764465, "eval_runtime": 2.769, "eval_samples_per_second": 11.557, "eval_steps_per_second": 1.445, "step": 2112 }, { "epoch": 0.1365979797979798, "grad_norm": 0.11082398891448975, "learning_rate": 0.00019936944226043354, "loss": 0.1164, "step": 2113 }, { "epoch": 0.13666262626262626, "grad_norm": 0.09152469038963318, "learning_rate": 0.000199368675266991, "loss": 0.1022, "step": 2114 }, { "epoch": 0.13672727272727273, "grad_norm": 0.07528580725193024, "learning_rate": 0.00019936790780883406, "loss": 0.071, "step": 2115 }, { "epoch": 0.1367919191919192, "grad_norm": 0.08660375326871872, "learning_rate": 0.00019936713988596627, "loss": 0.1003, "step": 2116 }, { "epoch": 0.13685656565656565, "grad_norm": 0.10427609086036682, "learning_rate": 0.0001993663714983912, "loss": 0.118, "step": 2117 }, { "epoch": 0.1369212121212121, "grad_norm": 0.0905638113617897, "learning_rate": 
0.00019936560264611247, "loss": 0.1005, "step": 2118 }, { "epoch": 0.13698585858585857, "grad_norm": 0.08334964513778687, "learning_rate": 0.00019936483332913361, "loss": 0.1102, "step": 2119 }, { "epoch": 0.13705050505050506, "grad_norm": 0.07560577988624573, "learning_rate": 0.00019936406354745834, "loss": 0.0834, "step": 2120 }, { "epoch": 0.13711515151515152, "grad_norm": 0.08507279306650162, "learning_rate": 0.00019936329330109018, "loss": 0.0962, "step": 2121 }, { "epoch": 0.13717979797979798, "grad_norm": 0.08278714865446091, "learning_rate": 0.00019936252259003274, "loss": 0.1054, "step": 2122 }, { "epoch": 0.13724444444444445, "grad_norm": 0.08289074897766113, "learning_rate": 0.00019936175141428963, "loss": 0.1094, "step": 2123 }, { "epoch": 0.1373090909090909, "grad_norm": 0.10430148988962173, "learning_rate": 0.00019936097977386446, "loss": 0.1242, "step": 2124 }, { "epoch": 0.13737373737373737, "grad_norm": 0.09121454507112503, "learning_rate": 0.00019936020766876084, "loss": 0.1315, "step": 2125 }, { "epoch": 0.13743838383838383, "grad_norm": 0.07568488270044327, "learning_rate": 0.00019935943509898237, "loss": 0.0796, "step": 2126 }, { "epoch": 0.1375030303030303, "grad_norm": 0.09529861062765121, "learning_rate": 0.00019935866206453266, "loss": 0.1151, "step": 2127 }, { "epoch": 0.13756767676767676, "grad_norm": 0.0712660551071167, "learning_rate": 0.00019935788856541536, "loss": 0.0755, "step": 2128 }, { "epoch": 0.13756767676767676, "eval_bleu": 12.6309035502833, "eval_loss": 0.09649898111820221, "eval_runtime": 2.6672, "eval_samples_per_second": 11.998, "eval_steps_per_second": 1.5, "step": 2128 }, { "epoch": 0.13763232323232324, "grad_norm": 0.09537266939878464, "learning_rate": 0.00019935711460163403, "loss": 0.1132, "step": 2129 }, { "epoch": 0.1376969696969697, "grad_norm": 0.08489110320806503, "learning_rate": 0.00019935634017319233, "loss": 0.0955, "step": 2130 }, { "epoch": 0.13776161616161617, "grad_norm": 0.14424313604831696, 
"learning_rate": 0.00019935556528009388, "loss": 0.1255, "step": 2131 }, { "epoch": 0.13782626262626263, "grad_norm": 0.08503853529691696, "learning_rate": 0.00019935478992234233, "loss": 0.0932, "step": 2132 }, { "epoch": 0.1378909090909091, "grad_norm": 0.08866728097200394, "learning_rate": 0.00019935401409994124, "loss": 0.0993, "step": 2133 }, { "epoch": 0.13795555555555555, "grad_norm": 0.08791413903236389, "learning_rate": 0.00019935323781289426, "loss": 0.0996, "step": 2134 }, { "epoch": 0.13802020202020202, "grad_norm": 0.08543447405099869, "learning_rate": 0.00019935246106120506, "loss": 0.0868, "step": 2135 }, { "epoch": 0.13808484848484848, "grad_norm": 0.0702652856707573, "learning_rate": 0.00019935168384487725, "loss": 0.0817, "step": 2136 }, { "epoch": 0.13814949494949494, "grad_norm": 0.08029930293560028, "learning_rate": 0.0001993509061639144, "loss": 0.0967, "step": 2137 }, { "epoch": 0.1382141414141414, "grad_norm": 0.08459579199552536, "learning_rate": 0.00019935012801832028, "loss": 0.1036, "step": 2138 }, { "epoch": 0.1382787878787879, "grad_norm": 0.07462736964225769, "learning_rate": 0.0001993493494080984, "loss": 0.0785, "step": 2139 }, { "epoch": 0.13834343434343435, "grad_norm": 0.0938514843583107, "learning_rate": 0.00019934857033325248, "loss": 0.0974, "step": 2140 }, { "epoch": 0.1384080808080808, "grad_norm": 0.07491198927164078, "learning_rate": 0.00019934779079378617, "loss": 0.0917, "step": 2141 }, { "epoch": 0.13847272727272728, "grad_norm": 0.10061255097389221, "learning_rate": 0.00019934701078970303, "loss": 0.1067, "step": 2142 }, { "epoch": 0.13853737373737374, "grad_norm": 0.0729975551366806, "learning_rate": 0.0001993462303210068, "loss": 0.0829, "step": 2143 }, { "epoch": 0.1386020202020202, "grad_norm": 0.09039177000522614, "learning_rate": 0.0001993454493877011, "loss": 0.0996, "step": 2144 }, { "epoch": 0.1386020202020202, "eval_bleu": 13.316991181706612, "eval_loss": 0.098295658826828, "eval_runtime": 2.6935, 
"eval_samples_per_second": 11.88, "eval_steps_per_second": 1.485, "step": 2144 }, { "epoch": 0.13866666666666666, "grad_norm": 0.08515696227550507, "learning_rate": 0.00019934466798978955, "loss": 0.0944, "step": 2145 }, { "epoch": 0.13873131313131312, "grad_norm": 0.08764570951461792, "learning_rate": 0.00019934388612727585, "loss": 0.1015, "step": 2146 }, { "epoch": 0.13879595959595958, "grad_norm": 0.09567011892795563, "learning_rate": 0.00019934310380016363, "loss": 0.1082, "step": 2147 }, { "epoch": 0.13886060606060607, "grad_norm": 0.0907915011048317, "learning_rate": 0.00019934232100845655, "loss": 0.0934, "step": 2148 }, { "epoch": 0.13892525252525253, "grad_norm": 0.07852301001548767, "learning_rate": 0.0001993415377521583, "loss": 0.0948, "step": 2149 }, { "epoch": 0.138989898989899, "grad_norm": 0.09274768084287643, "learning_rate": 0.00019934075403127248, "loss": 0.1176, "step": 2150 }, { "epoch": 0.13905454545454546, "grad_norm": 0.07157626748085022, "learning_rate": 0.00019933996984580283, "loss": 0.0847, "step": 2151 }, { "epoch": 0.13911919191919192, "grad_norm": 0.07254094630479813, "learning_rate": 0.00019933918519575298, "loss": 0.0767, "step": 2152 }, { "epoch": 0.13918383838383838, "grad_norm": 0.07105102390050888, "learning_rate": 0.0001993384000811266, "loss": 0.0853, "step": 2153 }, { "epoch": 0.13924848484848484, "grad_norm": 0.09649059176445007, "learning_rate": 0.00019933761450192735, "loss": 0.0951, "step": 2154 }, { "epoch": 0.1393131313131313, "grad_norm": 0.09500347077846527, "learning_rate": 0.00019933682845815892, "loss": 0.1087, "step": 2155 }, { "epoch": 0.13937777777777777, "grad_norm": 0.06282282620668411, "learning_rate": 0.00019933604194982495, "loss": 0.0694, "step": 2156 }, { "epoch": 0.13944242424242423, "grad_norm": 0.08335459977388382, "learning_rate": 0.00019933525497692922, "loss": 0.093, "step": 2157 }, { "epoch": 0.13950707070707072, "grad_norm": 0.0766918733716011, "learning_rate": 0.0001993344675394753, "loss": 
0.0796, "step": 2158 }, { "epoch": 0.13957171717171718, "grad_norm": 0.07622380554676056, "learning_rate": 0.0001993336796374669, "loss": 0.0776, "step": 2159 }, { "epoch": 0.13963636363636364, "grad_norm": 0.08085879683494568, "learning_rate": 0.00019933289127090778, "loss": 0.0835, "step": 2160 }, { "epoch": 0.13963636363636364, "eval_bleu": 15.763808862362175, "eval_loss": 0.09635349363088608, "eval_runtime": 2.765, "eval_samples_per_second": 11.573, "eval_steps_per_second": 1.447, "step": 2160 }, { "epoch": 0.1397010101010101, "grad_norm": 0.08228936046361923, "learning_rate": 0.00019933210243980152, "loss": 0.0802, "step": 2161 }, { "epoch": 0.13976565656565657, "grad_norm": 0.10941604524850845, "learning_rate": 0.00019933131314415188, "loss": 0.1319, "step": 2162 }, { "epoch": 0.13983030303030303, "grad_norm": 0.08717752248048782, "learning_rate": 0.0001993305233839625, "loss": 0.0875, "step": 2163 }, { "epoch": 0.1398949494949495, "grad_norm": 0.08118139207363129, "learning_rate": 0.0001993297331592371, "loss": 0.0942, "step": 2164 }, { "epoch": 0.13995959595959595, "grad_norm": 0.08656337857246399, "learning_rate": 0.0001993289424699794, "loss": 0.1041, "step": 2165 }, { "epoch": 0.1400242424242424, "grad_norm": 0.09879984706640244, "learning_rate": 0.00019932815131619306, "loss": 0.0999, "step": 2166 }, { "epoch": 0.1400888888888889, "grad_norm": 0.08160009980201721, "learning_rate": 0.0001993273596978818, "loss": 0.0891, "step": 2167 }, { "epoch": 0.14015353535353536, "grad_norm": 0.07659734785556793, "learning_rate": 0.0001993265676150493, "loss": 0.0886, "step": 2168 }, { "epoch": 0.14021818181818183, "grad_norm": 0.07596676796674728, "learning_rate": 0.0001993257750676993, "loss": 0.087, "step": 2169 }, { "epoch": 0.1402828282828283, "grad_norm": 0.08992719650268555, "learning_rate": 0.0001993249820558355, "loss": 0.094, "step": 2170 }, { "epoch": 0.14034747474747475, "grad_norm": 0.07521230727434158, "learning_rate": 0.00019932418857946153, "loss": 
0.096, "step": 2171 }, { "epoch": 0.1404121212121212, "grad_norm": 0.08257531374692917, "learning_rate": 0.0001993233946385812, "loss": 0.0947, "step": 2172 }, { "epoch": 0.14047676767676767, "grad_norm": 0.07799696177244186, "learning_rate": 0.00019932260023319823, "loss": 0.0807, "step": 2173 }, { "epoch": 0.14054141414141413, "grad_norm": 0.08241602033376694, "learning_rate": 0.00019932180536331624, "loss": 0.104, "step": 2174 }, { "epoch": 0.1406060606060606, "grad_norm": 0.0690343827009201, "learning_rate": 0.00019932101002893902, "loss": 0.075, "step": 2175 }, { "epoch": 0.14067070707070706, "grad_norm": 0.10477066040039062, "learning_rate": 0.00019932021423007027, "loss": 0.1208, "step": 2176 }, { "epoch": 0.14067070707070706, "eval_bleu": 12.266963111138233, "eval_loss": 0.09701789915561676, "eval_runtime": 2.7285, "eval_samples_per_second": 11.728, "eval_steps_per_second": 1.466, "step": 2176 }, { "epoch": 0.14073535353535355, "grad_norm": 0.07689818739891052, "learning_rate": 0.00019931941796671372, "loss": 0.096, "step": 2177 }, { "epoch": 0.1408, "grad_norm": 0.079193614423275, "learning_rate": 0.0001993186212388731, "loss": 0.1006, "step": 2178 }, { "epoch": 0.14086464646464647, "grad_norm": 0.0728984996676445, "learning_rate": 0.0001993178240465521, "loss": 0.0764, "step": 2179 }, { "epoch": 0.14092929292929293, "grad_norm": 0.1090875118970871, "learning_rate": 0.00019931702638975447, "loss": 0.0846, "step": 2180 }, { "epoch": 0.1409939393939394, "grad_norm": 0.09084334969520569, "learning_rate": 0.00019931622826848395, "loss": 0.1047, "step": 2181 }, { "epoch": 0.14105858585858586, "grad_norm": 0.0822165384888649, "learning_rate": 0.00019931542968274426, "loss": 0.0981, "step": 2182 }, { "epoch": 0.14112323232323232, "grad_norm": 0.08956528455018997, "learning_rate": 0.00019931463063253913, "loss": 0.1027, "step": 2183 }, { "epoch": 0.14118787878787878, "grad_norm": 0.0748949944972992, "learning_rate": 0.0001993138311178723, "loss": 0.0794, "step": 
2184 }, { "epoch": 0.14125252525252524, "grad_norm": 0.07600361853837967, "learning_rate": 0.00019931303113874754, "loss": 0.0759, "step": 2185 }, { "epoch": 0.14131717171717173, "grad_norm": 0.07164584845304489, "learning_rate": 0.00019931223069516855, "loss": 0.072, "step": 2186 }, { "epoch": 0.1413818181818182, "grad_norm": 0.10086806118488312, "learning_rate": 0.0001993114297871391, "loss": 0.1071, "step": 2187 }, { "epoch": 0.14144646464646465, "grad_norm": 0.08601544052362442, "learning_rate": 0.00019931062841466293, "loss": 0.1053, "step": 2188 }, { "epoch": 0.14151111111111112, "grad_norm": 0.08842509984970093, "learning_rate": 0.00019930982657774378, "loss": 0.0925, "step": 2189 }, { "epoch": 0.14157575757575758, "grad_norm": 0.0895439088344574, "learning_rate": 0.00019930902427638537, "loss": 0.1041, "step": 2190 }, { "epoch": 0.14164040404040404, "grad_norm": 0.08682627975940704, "learning_rate": 0.0001993082215105915, "loss": 0.0983, "step": 2191 }, { "epoch": 0.1417050505050505, "grad_norm": 0.09832212328910828, "learning_rate": 0.00019930741828036593, "loss": 0.0895, "step": 2192 }, { "epoch": 0.1417050505050505, "eval_bleu": 15.835709145695853, "eval_loss": 0.09852661192417145, "eval_runtime": 2.6642, "eval_samples_per_second": 12.011, "eval_steps_per_second": 1.501, "step": 2192 }, { "epoch": 0.14176969696969696, "grad_norm": 0.07735937833786011, "learning_rate": 0.00019930661458571238, "loss": 0.0697, "step": 2193 }, { "epoch": 0.14183434343434342, "grad_norm": 0.0891536995768547, "learning_rate": 0.00019930581042663465, "loss": 0.0907, "step": 2194 }, { "epoch": 0.14189898989898989, "grad_norm": 0.09226737171411514, "learning_rate": 0.00019930500580313642, "loss": 0.0965, "step": 2195 }, { "epoch": 0.14196363636363638, "grad_norm": 0.09302262216806412, "learning_rate": 0.00019930420071522154, "loss": 0.1054, "step": 2196 }, { "epoch": 0.14202828282828284, "grad_norm": 0.086602583527565, "learning_rate": 0.00019930339516289374, "loss": 0.0949, 
"step": 2197 }, { "epoch": 0.1420929292929293, "grad_norm": 0.09134562313556671, "learning_rate": 0.0001993025891461568, "loss": 0.0969, "step": 2198 }, { "epoch": 0.14215757575757576, "grad_norm": 0.08391424268484116, "learning_rate": 0.00019930178266501446, "loss": 0.1002, "step": 2199 }, { "epoch": 0.14222222222222222, "grad_norm": 0.0886627584695816, "learning_rate": 0.00019930097571947052, "loss": 0.1011, "step": 2200 }, { "epoch": 0.14228686868686868, "grad_norm": 0.0875149518251419, "learning_rate": 0.00019930016830952877, "loss": 0.1038, "step": 2201 }, { "epoch": 0.14235151515151515, "grad_norm": 0.08375229686498642, "learning_rate": 0.0001992993604351929, "loss": 0.0983, "step": 2202 }, { "epoch": 0.1424161616161616, "grad_norm": 0.07363390177488327, "learning_rate": 0.00019929855209646678, "loss": 0.0881, "step": 2203 }, { "epoch": 0.14248080808080807, "grad_norm": 0.07777576148509979, "learning_rate": 0.00019929774329335417, "loss": 0.0937, "step": 2204 }, { "epoch": 0.14254545454545456, "grad_norm": 0.07619834691286087, "learning_rate": 0.00019929693402585884, "loss": 0.0966, "step": 2205 }, { "epoch": 0.14261010101010102, "grad_norm": 0.07290953397750854, "learning_rate": 0.00019929612429398457, "loss": 0.0809, "step": 2206 }, { "epoch": 0.14267474747474748, "grad_norm": 0.0811164379119873, "learning_rate": 0.00019929531409773518, "loss": 0.1013, "step": 2207 }, { "epoch": 0.14273939393939394, "grad_norm": 0.08142521232366562, "learning_rate": 0.00019929450343711438, "loss": 0.1118, "step": 2208 }, { "epoch": 0.14273939393939394, "eval_bleu": 14.90355245620719, "eval_loss": 0.09795285761356354, "eval_runtime": 2.9077, "eval_samples_per_second": 11.005, "eval_steps_per_second": 1.376, "step": 2208 }, { "epoch": 0.1428040404040404, "grad_norm": 0.0814114362001419, "learning_rate": 0.00019929369231212605, "loss": 0.0887, "step": 2209 }, { "epoch": 0.14286868686868687, "grad_norm": 0.08488152921199799, "learning_rate": 0.00019929288072277396, "loss": 
0.0995, "step": 2210 }, { "epoch": 0.14293333333333333, "grad_norm": 0.07185027003288269, "learning_rate": 0.00019929206866906184, "loss": 0.0773, "step": 2211 }, { "epoch": 0.1429979797979798, "grad_norm": 0.07965119928121567, "learning_rate": 0.00019929125615099355, "loss": 0.0997, "step": 2212 }, { "epoch": 0.14306262626262625, "grad_norm": 0.08428077399730682, "learning_rate": 0.00019929044316857294, "loss": 0.1, "step": 2213 }, { "epoch": 0.14312727272727271, "grad_norm": 0.07027176767587662, "learning_rate": 0.00019928962972180368, "loss": 0.0781, "step": 2214 }, { "epoch": 0.1431919191919192, "grad_norm": 0.07408403605222702, "learning_rate": 0.00019928881581068967, "loss": 0.0838, "step": 2215 }, { "epoch": 0.14325656565656567, "grad_norm": 0.09210987389087677, "learning_rate": 0.0001992880014352347, "loss": 0.0934, "step": 2216 }, { "epoch": 0.14332121212121213, "grad_norm": 0.08894040435552597, "learning_rate": 0.0001992871865954426, "loss": 0.1007, "step": 2217 }, { "epoch": 0.1433858585858586, "grad_norm": 0.0793905034661293, "learning_rate": 0.0001992863712913171, "loss": 0.0999, "step": 2218 }, { "epoch": 0.14345050505050505, "grad_norm": 0.0838514119386673, "learning_rate": 0.0001992855555228621, "loss": 0.102, "step": 2219 }, { "epoch": 0.1435151515151515, "grad_norm": 0.08446792513132095, "learning_rate": 0.00019928473929008137, "loss": 0.0974, "step": 2220 }, { "epoch": 0.14357979797979797, "grad_norm": 0.07814288884401321, "learning_rate": 0.00019928392259297873, "loss": 0.1031, "step": 2221 }, { "epoch": 0.14364444444444444, "grad_norm": 0.08398514240980148, "learning_rate": 0.00019928310543155804, "loss": 0.0833, "step": 2222 }, { "epoch": 0.1437090909090909, "grad_norm": 0.0768071711063385, "learning_rate": 0.00019928228780582305, "loss": 0.0853, "step": 2223 }, { "epoch": 0.1437737373737374, "grad_norm": 0.10781671851873398, "learning_rate": 0.00019928146971577763, "loss": 0.1233, "step": 2224 }, { "epoch": 0.1437737373737374, "eval_bleu": 
16.856761244755475, "eval_loss": 0.09815460443496704, "eval_runtime": 2.7804, "eval_samples_per_second": 11.509, "eval_steps_per_second": 1.439, "step": 2224 }, { "epoch": 0.14383838383838385, "grad_norm": 0.08463025838136673, "learning_rate": 0.0001992806511614256, "loss": 0.0993, "step": 2225 }, { "epoch": 0.1439030303030303, "grad_norm": 0.07293100655078888, "learning_rate": 0.0001992798321427708, "loss": 0.0821, "step": 2226 }, { "epoch": 0.14396767676767677, "grad_norm": 0.08723324537277222, "learning_rate": 0.000199279012659817, "loss": 0.1043, "step": 2227 }, { "epoch": 0.14403232323232323, "grad_norm": 0.07860496640205383, "learning_rate": 0.00019927819271256812, "loss": 0.0872, "step": 2228 }, { "epoch": 0.1440969696969697, "grad_norm": 0.0815499946475029, "learning_rate": 0.00019927737230102796, "loss": 0.0895, "step": 2229 }, { "epoch": 0.14416161616161616, "grad_norm": 0.0952635332942009, "learning_rate": 0.00019927655142520034, "loss": 0.1085, "step": 2230 }, { "epoch": 0.14422626262626262, "grad_norm": 0.07156683504581451, "learning_rate": 0.0001992757300850891, "loss": 0.085, "step": 2231 }, { "epoch": 0.14429090909090908, "grad_norm": 0.08956246823072433, "learning_rate": 0.0001992749082806981, "loss": 0.1136, "step": 2232 }, { "epoch": 0.14435555555555554, "grad_norm": 0.09046854823827744, "learning_rate": 0.00019927408601203117, "loss": 0.1046, "step": 2233 }, { "epoch": 0.14442020202020203, "grad_norm": 0.10297202318906784, "learning_rate": 0.00019927326327909216, "loss": 0.0935, "step": 2234 }, { "epoch": 0.1444848484848485, "grad_norm": 0.0739424079656601, "learning_rate": 0.00019927244008188493, "loss": 0.0953, "step": 2235 }, { "epoch": 0.14454949494949496, "grad_norm": 0.069561667740345, "learning_rate": 0.00019927161642041327, "loss": 0.0699, "step": 2236 }, { "epoch": 0.14461414141414142, "grad_norm": 0.09780288487672806, "learning_rate": 0.00019927079229468112, "loss": 0.1027, "step": 2237 }, { "epoch": 0.14467878787878788, "grad_norm": 
0.0815785750746727, "learning_rate": 0.0001992699677046923, "loss": 0.1058, "step": 2238 }, { "epoch": 0.14474343434343434, "grad_norm": 0.085743747651577, "learning_rate": 0.00019926914265045063, "loss": 0.0974, "step": 2239 }, { "epoch": 0.1448080808080808, "grad_norm": 0.0680752694606781, "learning_rate": 0.00019926831713196002, "loss": 0.075, "step": 2240 }, { "epoch": 0.1448080808080808, "eval_bleu": 14.144843246403564, "eval_loss": 0.09855931252241135, "eval_runtime": 2.7165, "eval_samples_per_second": 11.78, "eval_steps_per_second": 1.472, "step": 2240 }, { "epoch": 0.14487272727272726, "grad_norm": 0.09069699794054031, "learning_rate": 0.0001992674911492243, "loss": 0.1214, "step": 2241 }, { "epoch": 0.14493737373737373, "grad_norm": 0.09225036203861237, "learning_rate": 0.00019926666470224732, "loss": 0.1023, "step": 2242 }, { "epoch": 0.14500202020202022, "grad_norm": 0.08832424879074097, "learning_rate": 0.00019926583779103298, "loss": 0.0962, "step": 2243 }, { "epoch": 0.14506666666666668, "grad_norm": 0.09969259053468704, "learning_rate": 0.00019926501041558515, "loss": 0.0887, "step": 2244 }, { "epoch": 0.14513131313131314, "grad_norm": 0.0939222201704979, "learning_rate": 0.00019926418257590768, "loss": 0.1039, "step": 2245 }, { "epoch": 0.1451959595959596, "grad_norm": 0.07626113295555115, "learning_rate": 0.00019926335427200443, "loss": 0.0905, "step": 2246 }, { "epoch": 0.14526060606060606, "grad_norm": 0.08937086164951324, "learning_rate": 0.00019926252550387928, "loss": 0.0928, "step": 2247 }, { "epoch": 0.14532525252525252, "grad_norm": 0.09782461822032928, "learning_rate": 0.00019926169627153614, "loss": 0.1179, "step": 2248 }, { "epoch": 0.14538989898989899, "grad_norm": 0.10443700850009918, "learning_rate": 0.00019926086657497883, "loss": 0.1022, "step": 2249 }, { "epoch": 0.14545454545454545, "grad_norm": 0.09932852536439896, "learning_rate": 0.00019926003641421129, "loss": 0.13, "step": 2250 }, { "epoch": 0.1455191919191919, "grad_norm": 
0.08329958468675613, "learning_rate": 0.00019925920578923737, "loss": 0.0886, "step": 2251 }, { "epoch": 0.14558383838383837, "grad_norm": 0.07198482751846313, "learning_rate": 0.00019925837470006097, "loss": 0.0782, "step": 2252 }, { "epoch": 0.14564848484848486, "grad_norm": 0.0805986225605011, "learning_rate": 0.00019925754314668593, "loss": 0.0938, "step": 2253 }, { "epoch": 0.14571313131313132, "grad_norm": 0.08215127885341644, "learning_rate": 0.00019925671112911618, "loss": 0.0893, "step": 2254 }, { "epoch": 0.14577777777777778, "grad_norm": 0.09303683042526245, "learning_rate": 0.00019925587864735565, "loss": 0.1279, "step": 2255 }, { "epoch": 0.14584242424242425, "grad_norm": 0.07702606171369553, "learning_rate": 0.00019925504570140814, "loss": 0.0876, "step": 2256 }, { "epoch": 0.14584242424242425, "eval_bleu": 14.761206778905008, "eval_loss": 0.09789718687534332, "eval_runtime": 2.7354, "eval_samples_per_second": 11.698, "eval_steps_per_second": 1.462, "step": 2256 }, { "epoch": 0.1459070707070707, "grad_norm": 0.07773461937904358, "learning_rate": 0.00019925421229127763, "loss": 0.0904, "step": 2257 }, { "epoch": 0.14597171717171717, "grad_norm": 0.09010069072246552, "learning_rate": 0.00019925337841696797, "loss": 0.1151, "step": 2258 }, { "epoch": 0.14603636363636363, "grad_norm": 0.09520852565765381, "learning_rate": 0.00019925254407848305, "loss": 0.1268, "step": 2259 }, { "epoch": 0.1461010101010101, "grad_norm": 0.08797336369752884, "learning_rate": 0.00019925170927582683, "loss": 0.1135, "step": 2260 }, { "epoch": 0.14616565656565655, "grad_norm": 0.09137532860040665, "learning_rate": 0.00019925087400900316, "loss": 0.0998, "step": 2261 }, { "epoch": 0.14623030303030304, "grad_norm": 0.08424366265535355, "learning_rate": 0.00019925003827801595, "loss": 0.0994, "step": 2262 }, { "epoch": 0.1462949494949495, "grad_norm": 0.09606316685676575, "learning_rate": 0.00019924920208286913, "loss": 0.1081, "step": 2263 }, { "epoch": 0.14635959595959597, 
"grad_norm": 0.08903534710407257, "learning_rate": 0.0001992483654235666, "loss": 0.1084, "step": 2264 }, { "epoch": 0.14642424242424243, "grad_norm": 0.08161941170692444, "learning_rate": 0.0001992475283001123, "loss": 0.0923, "step": 2265 }, { "epoch": 0.1464888888888889, "grad_norm": 0.08217356353998184, "learning_rate": 0.0001992466907125101, "loss": 0.0992, "step": 2266 }, { "epoch": 0.14655353535353535, "grad_norm": 0.0858437567949295, "learning_rate": 0.00019924585266076396, "loss": 0.1012, "step": 2267 }, { "epoch": 0.14661818181818181, "grad_norm": 0.0778830498456955, "learning_rate": 0.00019924501414487775, "loss": 0.0952, "step": 2268 }, { "epoch": 0.14668282828282828, "grad_norm": 0.08928383886814117, "learning_rate": 0.00019924417516485542, "loss": 0.0894, "step": 2269 }, { "epoch": 0.14674747474747474, "grad_norm": 0.0894358828663826, "learning_rate": 0.0001992433357207009, "loss": 0.1123, "step": 2270 }, { "epoch": 0.1468121212121212, "grad_norm": 0.08054732531309128, "learning_rate": 0.00019924249581241811, "loss": 0.0969, "step": 2271 }, { "epoch": 0.1468767676767677, "grad_norm": 0.08566389232873917, "learning_rate": 0.00019924165544001098, "loss": 0.105, "step": 2272 }, { "epoch": 0.1468767676767677, "eval_bleu": 17.043232217430802, "eval_loss": 0.09771702438592911, "eval_runtime": 2.6978, "eval_samples_per_second": 11.862, "eval_steps_per_second": 1.483, "step": 2272 }, { "epoch": 0.14694141414141415, "grad_norm": 0.10433823615312576, "learning_rate": 0.00019924081460348343, "loss": 0.087, "step": 2273 }, { "epoch": 0.1470060606060606, "grad_norm": 0.0779954344034195, "learning_rate": 0.0001992399733028394, "loss": 0.085, "step": 2274 }, { "epoch": 0.14707070707070707, "grad_norm": 0.14118874073028564, "learning_rate": 0.00019923913153808282, "loss": 0.1357, "step": 2275 }, { "epoch": 0.14713535353535354, "grad_norm": 0.08732448518276215, "learning_rate": 0.00019923828930921763, "loss": 0.1043, "step": 2276 }, { "epoch": 0.1472, "grad_norm": 
0.07829619199037552, "learning_rate": 0.00019923744661624776, "loss": 0.0777, "step": 2277 }, { "epoch": 0.14726464646464646, "grad_norm": 0.09334984421730042, "learning_rate": 0.00019923660345917717, "loss": 0.0994, "step": 2278 }, { "epoch": 0.14732929292929292, "grad_norm": 0.09675560146570206, "learning_rate": 0.00019923575983800979, "loss": 0.1274, "step": 2279 }, { "epoch": 0.14739393939393938, "grad_norm": 0.09379332512617111, "learning_rate": 0.00019923491575274957, "loss": 0.1112, "step": 2280 }, { "epoch": 0.14745858585858587, "grad_norm": 0.17280007898807526, "learning_rate": 0.00019923407120340047, "loss": 0.1035, "step": 2281 }, { "epoch": 0.14752323232323233, "grad_norm": 0.07100223749876022, "learning_rate": 0.0001992332261899664, "loss": 0.0787, "step": 2282 }, { "epoch": 0.1475878787878788, "grad_norm": 0.07255946844816208, "learning_rate": 0.00019923238071245134, "loss": 0.0811, "step": 2283 }, { "epoch": 0.14765252525252526, "grad_norm": 0.07578455656766891, "learning_rate": 0.00019923153477085929, "loss": 0.0857, "step": 2284 }, { "epoch": 0.14771717171717172, "grad_norm": 0.07674919068813324, "learning_rate": 0.00019923068836519408, "loss": 0.084, "step": 2285 }, { "epoch": 0.14778181818181818, "grad_norm": 0.07491938769817352, "learning_rate": 0.00019922984149545978, "loss": 0.093, "step": 2286 }, { "epoch": 0.14784646464646464, "grad_norm": 0.08271020650863647, "learning_rate": 0.00019922899416166033, "loss": 0.092, "step": 2287 }, { "epoch": 0.1479111111111111, "grad_norm": 0.06983164697885513, "learning_rate": 0.00019922814636379969, "loss": 0.0747, "step": 2288 }, { "epoch": 0.1479111111111111, "eval_bleu": 15.892574320268269, "eval_loss": 0.09509193897247314, "eval_runtime": 2.8163, "eval_samples_per_second": 11.362, "eval_steps_per_second": 1.42, "step": 2288 }, { "epoch": 0.14797575757575757, "grad_norm": 0.07476863265037537, "learning_rate": 0.00019922729810188176, "loss": 0.0874, "step": 2289 }, { "epoch": 0.14804040404040403, 
"grad_norm": 0.07053195685148239, "learning_rate": 0.00019922644937591058, "loss": 0.0796, "step": 2290 }, { "epoch": 0.14810505050505052, "grad_norm": 0.07486848533153534, "learning_rate": 0.00019922560018589011, "loss": 0.088, "step": 2291 }, { "epoch": 0.14816969696969698, "grad_norm": 0.07849009335041046, "learning_rate": 0.00019922475053182431, "loss": 0.0831, "step": 2292 }, { "epoch": 0.14823434343434344, "grad_norm": 0.07709590345621109, "learning_rate": 0.00019922390041371716, "loss": 0.0928, "step": 2293 }, { "epoch": 0.1482989898989899, "grad_norm": 0.08308565616607666, "learning_rate": 0.00019922304983157262, "loss": 0.0841, "step": 2294 }, { "epoch": 0.14836363636363636, "grad_norm": 0.09292647242546082, "learning_rate": 0.0001992221987853947, "loss": 0.0859, "step": 2295 }, { "epoch": 0.14842828282828283, "grad_norm": 0.0787341445684433, "learning_rate": 0.00019922134727518733, "loss": 0.0874, "step": 2296 }, { "epoch": 0.1484929292929293, "grad_norm": 0.07518694549798965, "learning_rate": 0.00019922049530095455, "loss": 0.084, "step": 2297 }, { "epoch": 0.14855757575757575, "grad_norm": 0.07190399616956711, "learning_rate": 0.0001992196428627003, "loss": 0.0723, "step": 2298 }, { "epoch": 0.1486222222222222, "grad_norm": 0.10043215751647949, "learning_rate": 0.00019921878996042856, "loss": 0.0991, "step": 2299 }, { "epoch": 0.1486868686868687, "grad_norm": 0.0940486490726471, "learning_rate": 0.00019921793659414337, "loss": 0.111, "step": 2300 }, { "epoch": 0.14875151515151516, "grad_norm": 0.12624093890190125, "learning_rate": 0.00019921708276384869, "loss": 0.1139, "step": 2301 }, { "epoch": 0.14881616161616162, "grad_norm": 0.08641842752695084, "learning_rate": 0.0001992162284695485, "loss": 0.1203, "step": 2302 }, { "epoch": 0.14888080808080809, "grad_norm": 0.07899843901395798, "learning_rate": 0.0001992153737112468, "loss": 0.0898, "step": 2303 }, { "epoch": 0.14894545454545455, "grad_norm": 0.10435502976179123, "learning_rate": 
0.00019921451848894765, "loss": 0.0884, "step": 2304 }, { "epoch": 0.14894545454545455, "eval_bleu": 14.088385449194126, "eval_loss": 0.09721770882606506, "eval_runtime": 2.812, "eval_samples_per_second": 11.38, "eval_steps_per_second": 1.422, "step": 2304 }, { "epoch": 0.149010101010101, "grad_norm": 0.08355221152305603, "learning_rate": 0.00019921366280265494, "loss": 0.0821, "step": 2305 }, { "epoch": 0.14907474747474747, "grad_norm": 0.06774589419364929, "learning_rate": 0.00019921280665237276, "loss": 0.0763, "step": 2306 }, { "epoch": 0.14913939393939393, "grad_norm": 0.08457525819540024, "learning_rate": 0.00019921195003810507, "loss": 0.1103, "step": 2307 }, { "epoch": 0.1492040404040404, "grad_norm": 0.0811379924416542, "learning_rate": 0.0001992110929598559, "loss": 0.0844, "step": 2308 }, { "epoch": 0.14926868686868686, "grad_norm": 0.08011262118816376, "learning_rate": 0.00019921023541762925, "loss": 0.1078, "step": 2309 }, { "epoch": 0.14933333333333335, "grad_norm": 0.10300824791193008, "learning_rate": 0.0001992093774114291, "loss": 0.1033, "step": 2310 }, { "epoch": 0.1493979797979798, "grad_norm": 0.0863417536020279, "learning_rate": 0.0001992085189412595, "loss": 0.1033, "step": 2311 }, { "epoch": 0.14946262626262627, "grad_norm": 0.10561800003051758, "learning_rate": 0.00019920766000712444, "loss": 0.1145, "step": 2312 }, { "epoch": 0.14952727272727273, "grad_norm": 0.07916280627250671, "learning_rate": 0.000199206800609028, "loss": 0.0875, "step": 2313 }, { "epoch": 0.1495919191919192, "grad_norm": 0.10506631433963776, "learning_rate": 0.0001992059407469741, "loss": 0.1229, "step": 2314 }, { "epoch": 0.14965656565656565, "grad_norm": 0.07248834520578384, "learning_rate": 0.00019920508042096683, "loss": 0.0755, "step": 2315 }, { "epoch": 0.14972121212121212, "grad_norm": 0.08458910137414932, "learning_rate": 0.0001992042196310102, "loss": 0.0989, "step": 2316 }, { "epoch": 0.14978585858585858, "grad_norm": 0.08612111210823059, "learning_rate": 
0.00019920335837710825, "loss": 0.0903, "step": 2317 }, { "epoch": 0.14985050505050504, "grad_norm": 0.08405128866434097, "learning_rate": 0.00019920249665926495, "loss": 0.1118, "step": 2318 }, { "epoch": 0.1499151515151515, "grad_norm": 0.07918649911880493, "learning_rate": 0.0001992016344774844, "loss": 0.1057, "step": 2319 }, { "epoch": 0.149979797979798, "grad_norm": 0.0907558798789978, "learning_rate": 0.0001992007718317706, "loss": 0.1175, "step": 2320 }, { "epoch": 0.149979797979798, "eval_bleu": 13.222980814557614, "eval_loss": 0.09897366166114807, "eval_runtime": 2.6835, "eval_samples_per_second": 11.925, "eval_steps_per_second": 1.491, "step": 2320 }, { "epoch": 0.15004444444444445, "grad_norm": 0.0756349265575409, "learning_rate": 0.00019919990872212755, "loss": 0.0882, "step": 2321 }, { "epoch": 0.15010909090909091, "grad_norm": 0.08692969381809235, "learning_rate": 0.00019919904514855934, "loss": 0.0997, "step": 2322 }, { "epoch": 0.15017373737373738, "grad_norm": 0.0826423391699791, "learning_rate": 0.00019919818111107, "loss": 0.0902, "step": 2323 }, { "epoch": 0.15023838383838384, "grad_norm": 0.10737424343824387, "learning_rate": 0.00019919731660966356, "loss": 0.0865, "step": 2324 }, { "epoch": 0.1503030303030303, "grad_norm": 0.07574409991502762, "learning_rate": 0.00019919645164434406, "loss": 0.0914, "step": 2325 }, { "epoch": 0.15036767676767676, "grad_norm": 0.08156408369541168, "learning_rate": 0.00019919558621511553, "loss": 0.1002, "step": 2326 }, { "epoch": 0.15043232323232322, "grad_norm": 0.08333850651979446, "learning_rate": 0.00019919472032198208, "loss": 0.0918, "step": 2327 }, { "epoch": 0.15049696969696968, "grad_norm": 0.08153364062309265, "learning_rate": 0.0001991938539649477, "loss": 0.0936, "step": 2328 }, { "epoch": 0.15056161616161617, "grad_norm": 0.08397586643695831, "learning_rate": 0.00019919298714401643, "loss": 0.0921, "step": 2329 }, { "epoch": 0.15062626262626264, "grad_norm": 0.07793214917182922, "learning_rate": 
0.0001991921198591924, "loss": 0.0929, "step": 2330 }, { "epoch": 0.1506909090909091, "grad_norm": 0.07486416399478912, "learning_rate": 0.00019919125211047958, "loss": 0.0972, "step": 2331 }, { "epoch": 0.15075555555555556, "grad_norm": 0.08046216517686844, "learning_rate": 0.00019919038389788207, "loss": 0.0935, "step": 2332 }, { "epoch": 0.15082020202020202, "grad_norm": 0.07738444954156876, "learning_rate": 0.00019918951522140393, "loss": 0.0887, "step": 2333 }, { "epoch": 0.15088484848484848, "grad_norm": 0.1287994086742401, "learning_rate": 0.00019918864608104922, "loss": 0.0929, "step": 2334 }, { "epoch": 0.15094949494949494, "grad_norm": 0.07731743156909943, "learning_rate": 0.000199187776476822, "loss": 0.0902, "step": 2335 }, { "epoch": 0.1510141414141414, "grad_norm": 0.080807164311409, "learning_rate": 0.00019918690640872636, "loss": 0.0942, "step": 2336 }, { "epoch": 0.1510141414141414, "eval_bleu": 15.973732924570047, "eval_loss": 0.09978494793176651, "eval_runtime": 2.7055, "eval_samples_per_second": 11.828, "eval_steps_per_second": 1.478, "step": 2336 }, { "epoch": 0.15107878787878787, "grad_norm": 0.07488775998353958, "learning_rate": 0.0001991860358767663, "loss": 0.0896, "step": 2337 }, { "epoch": 0.15114343434343433, "grad_norm": 0.0880126804113388, "learning_rate": 0.000199185164880946, "loss": 0.0995, "step": 2338 }, { "epoch": 0.15120808080808082, "grad_norm": 0.07517234236001968, "learning_rate": 0.00019918429342126944, "loss": 0.0842, "step": 2339 }, { "epoch": 0.15127272727272728, "grad_norm": 0.09195252507925034, "learning_rate": 0.00019918342149774073, "loss": 0.1023, "step": 2340 }, { "epoch": 0.15133737373737374, "grad_norm": 0.08316605538129807, "learning_rate": 0.00019918254911036398, "loss": 0.105, "step": 2341 }, { "epoch": 0.1514020202020202, "grad_norm": 0.08410564810037613, "learning_rate": 0.0001991816762591432, "loss": 0.0925, "step": 2342 }, { "epoch": 0.15146666666666667, "grad_norm": 0.08235607296228409, "learning_rate": 
0.00019918080294408253, "loss": 0.0881, "step": 2343 }, { "epoch": 0.15153131313131313, "grad_norm": 0.06468655169010162, "learning_rate": 0.00019917992916518602, "loss": 0.0826, "step": 2344 }, { "epoch": 0.1515959595959596, "grad_norm": 0.08290201425552368, "learning_rate": 0.0001991790549224578, "loss": 0.0997, "step": 2345 }, { "epoch": 0.15166060606060605, "grad_norm": 0.07575234770774841, "learning_rate": 0.00019917818021590188, "loss": 0.0949, "step": 2346 }, { "epoch": 0.1517252525252525, "grad_norm": 0.0716208666563034, "learning_rate": 0.00019917730504552243, "loss": 0.077, "step": 2347 }, { "epoch": 0.151789898989899, "grad_norm": 0.06823279708623886, "learning_rate": 0.0001991764294113235, "loss": 0.0904, "step": 2348 }, { "epoch": 0.15185454545454546, "grad_norm": 0.10471434891223907, "learning_rate": 0.00019917555331330918, "loss": 0.1055, "step": 2349 }, { "epoch": 0.15191919191919193, "grad_norm": 0.0712771788239479, "learning_rate": 0.00019917467675148362, "loss": 0.0892, "step": 2350 }, { "epoch": 0.1519838383838384, "grad_norm": 0.0834423080086708, "learning_rate": 0.00019917379972585086, "loss": 0.089, "step": 2351 }, { "epoch": 0.15204848484848485, "grad_norm": 0.07527291774749756, "learning_rate": 0.000199172922236415, "loss": 0.0957, "step": 2352 }, { "epoch": 0.15204848484848485, "eval_bleu": 13.92591718541453, "eval_loss": 0.09829393774271011, "eval_runtime": 2.7052, "eval_samples_per_second": 11.829, "eval_steps_per_second": 1.479, "step": 2352 }, { "epoch": 0.1521131313131313, "grad_norm": 0.0684351772069931, "learning_rate": 0.00019917204428318024, "loss": 0.0894, "step": 2353 }, { "epoch": 0.15217777777777777, "grad_norm": 0.07119576632976532, "learning_rate": 0.00019917116586615056, "loss": 0.0854, "step": 2354 }, { "epoch": 0.15224242424242423, "grad_norm": 0.0755360871553421, "learning_rate": 0.00019917028698533015, "loss": 0.0862, "step": 2355 }, { "epoch": 0.1523070707070707, "grad_norm": 0.07296937704086304, "learning_rate": 
0.00019916940764072306, "loss": 0.1027, "step": 2356 }, { "epoch": 0.15237171717171716, "grad_norm": 0.07805594801902771, "learning_rate": 0.00019916852783233345, "loss": 0.0986, "step": 2357 }, { "epoch": 0.15243636363636365, "grad_norm": 0.0746074840426445, "learning_rate": 0.00019916764756016544, "loss": 0.094, "step": 2358 }, { "epoch": 0.1525010101010101, "grad_norm": 0.0822427049279213, "learning_rate": 0.0001991667668242231, "loss": 0.1101, "step": 2359 }, { "epoch": 0.15256565656565657, "grad_norm": 0.08411096036434174, "learning_rate": 0.00019916588562451058, "loss": 0.0974, "step": 2360 }, { "epoch": 0.15263030303030303, "grad_norm": 0.06998459249734879, "learning_rate": 0.000199165003961032, "loss": 0.0864, "step": 2361 }, { "epoch": 0.1526949494949495, "grad_norm": 0.08847485482692719, "learning_rate": 0.0001991641218337915, "loss": 0.0968, "step": 2362 }, { "epoch": 0.15275959595959596, "grad_norm": 0.08485672622919083, "learning_rate": 0.00019916323924279317, "loss": 0.1038, "step": 2363 }, { "epoch": 0.15282424242424242, "grad_norm": 0.08982007950544357, "learning_rate": 0.00019916235618804115, "loss": 0.1092, "step": 2364 }, { "epoch": 0.15288888888888888, "grad_norm": 0.08455164730548859, "learning_rate": 0.00019916147266953958, "loss": 0.0912, "step": 2365 }, { "epoch": 0.15295353535353534, "grad_norm": 0.0744146853685379, "learning_rate": 0.00019916058868729258, "loss": 0.0874, "step": 2366 }, { "epoch": 0.15301818181818183, "grad_norm": 0.07572925835847855, "learning_rate": 0.0001991597042413043, "loss": 0.0921, "step": 2367 }, { "epoch": 0.1530828282828283, "grad_norm": 0.07488899677991867, "learning_rate": 0.00019915881933157883, "loss": 0.0956, "step": 2368 }, { "epoch": 0.1530828282828283, "eval_bleu": 13.510725812985678, "eval_loss": 0.09653285145759583, "eval_runtime": 2.8565, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 2368 }, { "epoch": 0.15314747474747475, "grad_norm": 0.07667747884988785, "learning_rate": 
0.00019915793395812036, "loss": 0.0975, "step": 2369 }, { "epoch": 0.15321212121212122, "grad_norm": 0.1020771786570549, "learning_rate": 0.000199157048120933, "loss": 0.109, "step": 2370 }, { "epoch": 0.15327676767676768, "grad_norm": 0.07340958714485168, "learning_rate": 0.00019915616182002095, "loss": 0.0953, "step": 2371 }, { "epoch": 0.15334141414141414, "grad_norm": 0.08042582124471664, "learning_rate": 0.00019915527505538826, "loss": 0.0953, "step": 2372 }, { "epoch": 0.1534060606060606, "grad_norm": 0.07423064112663269, "learning_rate": 0.00019915438782703917, "loss": 0.0932, "step": 2373 }, { "epoch": 0.15347070707070706, "grad_norm": 0.0813387781381607, "learning_rate": 0.0001991535001349778, "loss": 0.0773, "step": 2374 }, { "epoch": 0.15353535353535352, "grad_norm": 0.07985939085483551, "learning_rate": 0.00019915261197920824, "loss": 0.0816, "step": 2375 }, { "epoch": 0.1536, "grad_norm": 0.0904230922460556, "learning_rate": 0.00019915172335973472, "loss": 0.1005, "step": 2376 }, { "epoch": 0.15366464646464648, "grad_norm": 0.06688009947538376, "learning_rate": 0.00019915083427656137, "loss": 0.0858, "step": 2377 }, { "epoch": 0.15372929292929294, "grad_norm": 0.07655452936887741, "learning_rate": 0.00019914994472969232, "loss": 0.0943, "step": 2378 }, { "epoch": 0.1537939393939394, "grad_norm": 0.09423457086086273, "learning_rate": 0.0001991490547191318, "loss": 0.0997, "step": 2379 }, { "epoch": 0.15385858585858586, "grad_norm": 0.09763972461223602, "learning_rate": 0.0001991481642448839, "loss": 0.0953, "step": 2380 }, { "epoch": 0.15392323232323232, "grad_norm": 0.07487978041172028, "learning_rate": 0.00019914727330695283, "loss": 0.0888, "step": 2381 }, { "epoch": 0.15398787878787878, "grad_norm": 0.070538729429245, "learning_rate": 0.00019914638190534273, "loss": 0.0821, "step": 2382 }, { "epoch": 0.15405252525252525, "grad_norm": 0.07963468879461288, "learning_rate": 0.00019914549004005779, "loss": 0.0917, "step": 2383 }, { "epoch": 
0.1541171717171717, "grad_norm": 0.08227356523275375, "learning_rate": 0.00019914459771110213, "loss": 0.0907, "step": 2384 }, { "epoch": 0.1541171717171717, "eval_bleu": 16.018994930559277, "eval_loss": 0.09477431327104568, "eval_runtime": 2.6915, "eval_samples_per_second": 11.889, "eval_steps_per_second": 1.486, "step": 2384 }, { "epoch": 0.15418181818181817, "grad_norm": 0.1069755107164383, "learning_rate": 0.00019914370491848, "loss": 0.1135, "step": 2385 }, { "epoch": 0.15424646464646466, "grad_norm": 0.09064731001853943, "learning_rate": 0.00019914281166219552, "loss": 0.0945, "step": 2386 }, { "epoch": 0.15431111111111112, "grad_norm": 0.08304800093173981, "learning_rate": 0.0001991419179422529, "loss": 0.0817, "step": 2387 }, { "epoch": 0.15437575757575758, "grad_norm": 0.07548732310533524, "learning_rate": 0.00019914102375865632, "loss": 0.0804, "step": 2388 }, { "epoch": 0.15444040404040404, "grad_norm": 0.07957342267036438, "learning_rate": 0.00019914012911140993, "loss": 0.0901, "step": 2389 }, { "epoch": 0.1545050505050505, "grad_norm": 0.08715780824422836, "learning_rate": 0.00019913923400051792, "loss": 0.0964, "step": 2390 }, { "epoch": 0.15456969696969697, "grad_norm": 0.07704520225524902, "learning_rate": 0.00019913833842598452, "loss": 0.0948, "step": 2391 }, { "epoch": 0.15463434343434343, "grad_norm": 0.08232387900352478, "learning_rate": 0.00019913744238781386, "loss": 0.1048, "step": 2392 }, { "epoch": 0.1546989898989899, "grad_norm": 0.08324352651834488, "learning_rate": 0.00019913654588601015, "loss": 0.0921, "step": 2393 }, { "epoch": 0.15476363636363635, "grad_norm": 0.0733690932393074, "learning_rate": 0.00019913564892057762, "loss": 0.0804, "step": 2394 }, { "epoch": 0.15482828282828281, "grad_norm": 0.07485280185937881, "learning_rate": 0.0001991347514915204, "loss": 0.0862, "step": 2395 }, { "epoch": 0.1548929292929293, "grad_norm": 0.07740943878889084, "learning_rate": 0.00019913385359884272, "loss": 0.0848, "step": 2396 }, { 
"epoch": 0.15495757575757577, "grad_norm": 0.10380509495735168, "learning_rate": 0.0001991329552425488, "loss": 0.1076, "step": 2397 }, { "epoch": 0.15502222222222223, "grad_norm": 0.08265800774097443, "learning_rate": 0.00019913205642264282, "loss": 0.1025, "step": 2398 }, { "epoch": 0.1550868686868687, "grad_norm": 0.0772845521569252, "learning_rate": 0.00019913115713912895, "loss": 0.094, "step": 2399 }, { "epoch": 0.15515151515151515, "grad_norm": 0.07430519908666611, "learning_rate": 0.00019913025739201149, "loss": 0.0855, "step": 2400 }, { "epoch": 0.15515151515151515, "eval_bleu": 14.101293051777796, "eval_loss": 0.09552083909511566, "eval_runtime": 2.6825, "eval_samples_per_second": 11.929, "eval_steps_per_second": 1.491, "step": 2400 }, { "epoch": 0.1552161616161616, "grad_norm": 0.07156430929899216, "learning_rate": 0.00019912935718129453, "loss": 0.0879, "step": 2401 }, { "epoch": 0.15528080808080807, "grad_norm": 0.09990596026182175, "learning_rate": 0.00019912845650698238, "loss": 0.1339, "step": 2402 }, { "epoch": 0.15534545454545454, "grad_norm": 0.06521645933389664, "learning_rate": 0.00019912755536907918, "loss": 0.0693, "step": 2403 }, { "epoch": 0.155410101010101, "grad_norm": 0.0766676813364029, "learning_rate": 0.00019912665376758918, "loss": 0.0904, "step": 2404 }, { "epoch": 0.1554747474747475, "grad_norm": 0.0709398090839386, "learning_rate": 0.00019912575170251661, "loss": 0.0757, "step": 2405 }, { "epoch": 0.15553939393939395, "grad_norm": 0.07931448519229889, "learning_rate": 0.00019912484917386568, "loss": 0.105, "step": 2406 }, { "epoch": 0.1556040404040404, "grad_norm": 0.07410561293363571, "learning_rate": 0.00019912394618164058, "loss": 0.0835, "step": 2407 }, { "epoch": 0.15566868686868687, "grad_norm": 0.08293446898460388, "learning_rate": 0.00019912304272584554, "loss": 0.1038, "step": 2408 }, { "epoch": 0.15573333333333333, "grad_norm": 0.09968655556440353, "learning_rate": 0.00019912213880648482, "loss": 0.1311, "step": 2409 }, 
{ "epoch": 0.1557979797979798, "grad_norm": 0.09109925478696823, "learning_rate": 0.0001991212344235626, "loss": 0.1124, "step": 2410 }, { "epoch": 0.15586262626262626, "grad_norm": 0.09481918811798096, "learning_rate": 0.00019912032957708316, "loss": 0.0966, "step": 2411 }, { "epoch": 0.15592727272727272, "grad_norm": 0.08684804290533066, "learning_rate": 0.0001991194242670507, "loss": 0.1042, "step": 2412 }, { "epoch": 0.15599191919191918, "grad_norm": 0.08067158609628677, "learning_rate": 0.00019911851849346946, "loss": 0.1054, "step": 2413 }, { "epoch": 0.15605656565656564, "grad_norm": 0.08107931911945343, "learning_rate": 0.0001991176122563437, "loss": 0.1061, "step": 2414 }, { "epoch": 0.15612121212121213, "grad_norm": 0.07803688198328018, "learning_rate": 0.0001991167055556776, "loss": 0.1077, "step": 2415 }, { "epoch": 0.1561858585858586, "grad_norm": 0.08813579380512238, "learning_rate": 0.00019911579839147544, "loss": 0.1147, "step": 2416 }, { "epoch": 0.1561858585858586, "eval_bleu": 15.979361638317176, "eval_loss": 0.09620603919029236, "eval_runtime": 2.6589, "eval_samples_per_second": 12.035, "eval_steps_per_second": 1.504, "step": 2416 }, { "epoch": 0.15625050505050506, "grad_norm": 0.1072230115532875, "learning_rate": 0.00019911489076374149, "loss": 0.1056, "step": 2417 }, { "epoch": 0.15631515151515152, "grad_norm": 0.08445640653371811, "learning_rate": 0.00019911398267247993, "loss": 0.0989, "step": 2418 }, { "epoch": 0.15637979797979798, "grad_norm": 0.07640232890844345, "learning_rate": 0.00019911307411769503, "loss": 0.0924, "step": 2419 }, { "epoch": 0.15644444444444444, "grad_norm": 0.08556509763002396, "learning_rate": 0.00019911216509939107, "loss": 0.097, "step": 2420 }, { "epoch": 0.1565090909090909, "grad_norm": 0.08633379638195038, "learning_rate": 0.0001991112556175723, "loss": 0.0976, "step": 2421 }, { "epoch": 0.15657373737373736, "grad_norm": 0.08954688906669617, "learning_rate": 0.00019911034567224293, "loss": 0.0916, "step": 2422 
}, { "epoch": 0.15663838383838383, "grad_norm": 0.08552480489015579, "learning_rate": 0.0001991094352634072, "loss": 0.1142, "step": 2423 }, { "epoch": 0.15670303030303032, "grad_norm": 0.08744465559720993, "learning_rate": 0.00019910852439106947, "loss": 0.0995, "step": 2424 }, { "epoch": 0.15676767676767678, "grad_norm": 0.08486898243427277, "learning_rate": 0.00019910761305523393, "loss": 0.0908, "step": 2425 }, { "epoch": 0.15683232323232324, "grad_norm": 0.10686268657445908, "learning_rate": 0.00019910670125590483, "loss": 0.0879, "step": 2426 }, { "epoch": 0.1568969696969697, "grad_norm": 0.09920540452003479, "learning_rate": 0.00019910578899308643, "loss": 0.1157, "step": 2427 }, { "epoch": 0.15696161616161616, "grad_norm": 0.09235216677188873, "learning_rate": 0.00019910487626678303, "loss": 0.1015, "step": 2428 }, { "epoch": 0.15702626262626262, "grad_norm": 0.07505187392234802, "learning_rate": 0.0001991039630769989, "loss": 0.0914, "step": 2429 }, { "epoch": 0.1570909090909091, "grad_norm": 0.07139051705598831, "learning_rate": 0.0001991030494237383, "loss": 0.0777, "step": 2430 }, { "epoch": 0.15715555555555555, "grad_norm": 0.09772682189941406, "learning_rate": 0.0001991021353070055, "loss": 0.1183, "step": 2431 }, { "epoch": 0.157220202020202, "grad_norm": 0.08875758200883865, "learning_rate": 0.00019910122072680478, "loss": 0.1186, "step": 2432 }, { "epoch": 0.157220202020202, "eval_bleu": 16.458234427598295, "eval_loss": 0.09614560008049011, "eval_runtime": 2.7877, "eval_samples_per_second": 11.479, "eval_steps_per_second": 1.435, "step": 2432 }, { "epoch": 0.15728484848484847, "grad_norm": 0.09600205719470978, "learning_rate": 0.0001991003056831404, "loss": 0.1105, "step": 2433 }, { "epoch": 0.15734949494949496, "grad_norm": 0.09605379402637482, "learning_rate": 0.00019909939017601666, "loss": 0.1156, "step": 2434 }, { "epoch": 0.15741414141414142, "grad_norm": 0.07681706547737122, "learning_rate": 0.0001990984742054378, "loss": 0.0809, "step": 
2435 }, { "epoch": 0.15747878787878788, "grad_norm": 0.08103141188621521, "learning_rate": 0.00019909755777140818, "loss": 0.097, "step": 2436 }, { "epoch": 0.15754343434343435, "grad_norm": 0.0800071582198143, "learning_rate": 0.00019909664087393205, "loss": 0.1025, "step": 2437 }, { "epoch": 0.1576080808080808, "grad_norm": 0.09360525012016296, "learning_rate": 0.00019909572351301368, "loss": 0.0931, "step": 2438 }, { "epoch": 0.15767272727272727, "grad_norm": 0.08019673824310303, "learning_rate": 0.00019909480568865734, "loss": 0.1003, "step": 2439 }, { "epoch": 0.15773737373737373, "grad_norm": 0.08331821113824844, "learning_rate": 0.00019909388740086737, "loss": 0.0885, "step": 2440 }, { "epoch": 0.1578020202020202, "grad_norm": 0.08785795420408249, "learning_rate": 0.0001990929686496481, "loss": 0.1012, "step": 2441 }, { "epoch": 0.15786666666666666, "grad_norm": 0.07840538769960403, "learning_rate": 0.0001990920494350037, "loss": 0.0773, "step": 2442 }, { "epoch": 0.15793131313131314, "grad_norm": 0.07582083344459534, "learning_rate": 0.00019909112975693857, "loss": 0.081, "step": 2443 }, { "epoch": 0.1579959595959596, "grad_norm": 0.09787586331367493, "learning_rate": 0.00019909020961545698, "loss": 0.1002, "step": 2444 }, { "epoch": 0.15806060606060607, "grad_norm": 0.08296768367290497, "learning_rate": 0.00019908928901056326, "loss": 0.0928, "step": 2445 }, { "epoch": 0.15812525252525253, "grad_norm": 0.0977582335472107, "learning_rate": 0.00019908836794226172, "loss": 0.1044, "step": 2446 }, { "epoch": 0.158189898989899, "grad_norm": 0.09254743903875351, "learning_rate": 0.00019908744641055658, "loss": 0.0875, "step": 2447 }, { "epoch": 0.15825454545454545, "grad_norm": 0.07885557413101196, "learning_rate": 0.00019908652441545224, "loss": 0.0909, "step": 2448 }, { "epoch": 0.15825454545454545, "eval_bleu": 13.406405586738554, "eval_loss": 0.09485719352960587, "eval_runtime": 2.7328, "eval_samples_per_second": 11.71, "eval_steps_per_second": 1.464, 
"step": 2448 }, { "epoch": 0.15831919191919192, "grad_norm": 0.10246507078409195, "learning_rate": 0.00019908560195695295, "loss": 0.09, "step": 2449 }, { "epoch": 0.15838383838383838, "grad_norm": 0.07869229465723038, "learning_rate": 0.0001990846790350631, "loss": 0.0931, "step": 2450 }, { "epoch": 0.15844848484848484, "grad_norm": 0.08455470949411392, "learning_rate": 0.00019908375564978696, "loss": 0.0949, "step": 2451 }, { "epoch": 0.1585131313131313, "grad_norm": 0.07824108749628067, "learning_rate": 0.00019908283180112885, "loss": 0.0927, "step": 2452 }, { "epoch": 0.1585777777777778, "grad_norm": 0.09662675112485886, "learning_rate": 0.00019908190748909305, "loss": 0.0804, "step": 2453 }, { "epoch": 0.15864242424242425, "grad_norm": 0.08414657413959503, "learning_rate": 0.00019908098271368397, "loss": 0.1116, "step": 2454 }, { "epoch": 0.1587070707070707, "grad_norm": 0.07943543791770935, "learning_rate": 0.00019908005747490588, "loss": 0.1144, "step": 2455 }, { "epoch": 0.15877171717171717, "grad_norm": 0.07298710942268372, "learning_rate": 0.0001990791317727631, "loss": 0.0965, "step": 2456 }, { "epoch": 0.15883636363636364, "grad_norm": 0.07060336321592331, "learning_rate": 0.00019907820560725997, "loss": 0.084, "step": 2457 }, { "epoch": 0.1589010101010101, "grad_norm": 0.08114949613809586, "learning_rate": 0.00019907727897840085, "loss": 0.0974, "step": 2458 }, { "epoch": 0.15896565656565656, "grad_norm": 0.07823418825864792, "learning_rate": 0.00019907635188619004, "loss": 0.0944, "step": 2459 }, { "epoch": 0.15903030303030302, "grad_norm": 0.07994591444730759, "learning_rate": 0.0001990754243306319, "loss": 0.0918, "step": 2460 }, { "epoch": 0.15909494949494948, "grad_norm": 0.07660511881113052, "learning_rate": 0.00019907449631173074, "loss": 0.0852, "step": 2461 }, { "epoch": 0.15915959595959597, "grad_norm": 0.08481775969266891, "learning_rate": 0.00019907356782949092, "loss": 0.1019, "step": 2462 }, { "epoch": 0.15922424242424243, "grad_norm": 
0.08431138098239899, "learning_rate": 0.00019907263888391675, "loss": 0.0905, "step": 2463 }, { "epoch": 0.1592888888888889, "grad_norm": 0.08255553990602493, "learning_rate": 0.00019907170947501267, "loss": 0.0949, "step": 2464 }, { "epoch": 0.1592888888888889, "eval_bleu": 17.949476684371227, "eval_loss": 0.0979250818490982, "eval_runtime": 2.7504, "eval_samples_per_second": 11.634, "eval_steps_per_second": 1.454, "step": 2464 }, { "epoch": 0.15935353535353536, "grad_norm": 0.08562088012695312, "learning_rate": 0.0001990707796027829, "loss": 0.097, "step": 2465 }, { "epoch": 0.15941818181818182, "grad_norm": 0.09004244208335876, "learning_rate": 0.00019906984926723186, "loss": 0.0995, "step": 2466 }, { "epoch": 0.15948282828282828, "grad_norm": 0.08006538450717926, "learning_rate": 0.0001990689184683639, "loss": 0.0904, "step": 2467 }, { "epoch": 0.15954747474747474, "grad_norm": 0.08049822598695755, "learning_rate": 0.00019906798720618332, "loss": 0.0936, "step": 2468 }, { "epoch": 0.1596121212121212, "grad_norm": 0.0833514928817749, "learning_rate": 0.00019906705548069454, "loss": 0.0958, "step": 2469 }, { "epoch": 0.15967676767676767, "grad_norm": 0.07492423057556152, "learning_rate": 0.00019906612329190188, "loss": 0.0897, "step": 2470 }, { "epoch": 0.15974141414141413, "grad_norm": 0.06778661161661148, "learning_rate": 0.00019906519063980972, "loss": 0.0861, "step": 2471 }, { "epoch": 0.15980606060606062, "grad_norm": 0.08098343759775162, "learning_rate": 0.0001990642575244224, "loss": 0.1008, "step": 2472 }, { "epoch": 0.15987070707070708, "grad_norm": 0.07971610128879547, "learning_rate": 0.00019906332394574434, "loss": 0.1022, "step": 2473 }, { "epoch": 0.15993535353535354, "grad_norm": 0.08234554529190063, "learning_rate": 0.00019906238990377982, "loss": 0.1039, "step": 2474 }, { "epoch": 0.16, "grad_norm": 0.09035717695951462, "learning_rate": 0.00019906145539853328, "loss": 0.126, "step": 2475 }, { "epoch": 0.16006464646464647, "grad_norm": 
0.08886728435754776, "learning_rate": 0.00019906052043000907, "loss": 0.0941, "step": 2476 }, { "epoch": 0.16012929292929293, "grad_norm": 0.07969742268323898, "learning_rate": 0.00019905958499821154, "loss": 0.1, "step": 2477 }, { "epoch": 0.1601939393939394, "grad_norm": 0.08392511308193207, "learning_rate": 0.00019905864910314507, "loss": 0.1009, "step": 2478 }, { "epoch": 0.16025858585858585, "grad_norm": 0.06872080266475677, "learning_rate": 0.00019905771274481406, "loss": 0.0763, "step": 2479 }, { "epoch": 0.1603232323232323, "grad_norm": 0.08104293793439865, "learning_rate": 0.00019905677592322286, "loss": 0.0871, "step": 2480 }, { "epoch": 0.1603232323232323, "eval_bleu": 16.479710093346444, "eval_loss": 0.09787783771753311, "eval_runtime": 2.633, "eval_samples_per_second": 12.154, "eval_steps_per_second": 1.519, "step": 2480 }, { "epoch": 0.1603878787878788, "grad_norm": 0.08231018483638763, "learning_rate": 0.0001990558386383759, "loss": 0.0735, "step": 2481 }, { "epoch": 0.16045252525252526, "grad_norm": 0.07051029801368713, "learning_rate": 0.00019905490089027747, "loss": 0.091, "step": 2482 }, { "epoch": 0.16051717171717172, "grad_norm": 0.06911779940128326, "learning_rate": 0.00019905396267893205, "loss": 0.0788, "step": 2483 }, { "epoch": 0.1605818181818182, "grad_norm": 0.0746067687869072, "learning_rate": 0.000199053024004344, "loss": 0.0826, "step": 2484 }, { "epoch": 0.16064646464646465, "grad_norm": 0.09021712839603424, "learning_rate": 0.0001990520848665177, "loss": 0.1117, "step": 2485 }, { "epoch": 0.1607111111111111, "grad_norm": 0.08775275200605392, "learning_rate": 0.00019905114526545754, "loss": 0.0955, "step": 2486 }, { "epoch": 0.16077575757575757, "grad_norm": 0.06989617645740509, "learning_rate": 0.00019905020520116792, "loss": 0.074, "step": 2487 }, { "epoch": 0.16084040404040403, "grad_norm": 0.08801909536123276, "learning_rate": 0.00019904926467365324, "loss": 0.0985, "step": 2488 }, { "epoch": 0.1609050505050505, "grad_norm": 
0.0718747079372406, "learning_rate": 0.00019904832368291788, "loss": 0.0846, "step": 2489 }, { "epoch": 0.16096969696969696, "grad_norm": 0.06711117923259735, "learning_rate": 0.00019904738222896627, "loss": 0.0668, "step": 2490 }, { "epoch": 0.16103434343434345, "grad_norm": 0.0798947736620903, "learning_rate": 0.00019904644031180278, "loss": 0.0782, "step": 2491 }, { "epoch": 0.1610989898989899, "grad_norm": 0.09056241065263748, "learning_rate": 0.0001990454979314319, "loss": 0.1003, "step": 2492 }, { "epoch": 0.16116363636363637, "grad_norm": 0.08069320023059845, "learning_rate": 0.0001990445550878579, "loss": 0.0909, "step": 2493 }, { "epoch": 0.16122828282828283, "grad_norm": 0.07866625487804413, "learning_rate": 0.00019904361178108525, "loss": 0.0889, "step": 2494 }, { "epoch": 0.1612929292929293, "grad_norm": 0.08749701827764511, "learning_rate": 0.00019904266801111842, "loss": 0.082, "step": 2495 }, { "epoch": 0.16135757575757576, "grad_norm": 0.08497153967618942, "learning_rate": 0.00019904172377796173, "loss": 0.0883, "step": 2496 }, { "epoch": 0.16135757575757576, "eval_bleu": 13.421687733459187, "eval_loss": 0.09912532567977905, "eval_runtime": 2.9212, "eval_samples_per_second": 10.954, "eval_steps_per_second": 1.369, "step": 2496 }, { "epoch": 0.16142222222222222, "grad_norm": 0.13220928609371185, "learning_rate": 0.00019904077908161966, "loss": 0.1012, "step": 2497 }, { "epoch": 0.16148686868686868, "grad_norm": 0.08096387982368469, "learning_rate": 0.0001990398339220966, "loss": 0.0958, "step": 2498 }, { "epoch": 0.16155151515151514, "grad_norm": 0.07711213827133179, "learning_rate": 0.000199038888299397, "loss": 0.0914, "step": 2499 }, { "epoch": 0.16161616161616163, "grad_norm": 0.07851159572601318, "learning_rate": 0.00019903794221352524, "loss": 0.088, "step": 2500 }, { "epoch": 0.1616808080808081, "grad_norm": 0.07726718485355377, "learning_rate": 0.0001990369956644858, "loss": 0.0838, "step": 2501 }, { "epoch": 0.16174545454545455, "grad_norm": 
0.09092199057340622, "learning_rate": 0.00019903604865228306, "loss": 0.1121, "step": 2502 }, { "epoch": 0.16181010101010102, "grad_norm": 0.07309889793395996, "learning_rate": 0.0001990351011769214, "loss": 0.0871, "step": 2503 }, { "epoch": 0.16187474747474748, "grad_norm": 0.1902703493833542, "learning_rate": 0.00019903415323840537, "loss": 0.0854, "step": 2504 }, { "epoch": 0.16193939393939394, "grad_norm": 0.09022161364555359, "learning_rate": 0.00019903320483673936, "loss": 0.0939, "step": 2505 }, { "epoch": 0.1620040404040404, "grad_norm": 0.07873562723398209, "learning_rate": 0.00019903225597192776, "loss": 0.0979, "step": 2506 }, { "epoch": 0.16206868686868686, "grad_norm": 0.07803434878587723, "learning_rate": 0.00019903130664397504, "loss": 0.0961, "step": 2507 }, { "epoch": 0.16213333333333332, "grad_norm": 0.07549719512462616, "learning_rate": 0.00019903035685288564, "loss": 0.0813, "step": 2508 }, { "epoch": 0.16219797979797979, "grad_norm": 0.07908180356025696, "learning_rate": 0.000199029406598664, "loss": 0.1072, "step": 2509 }, { "epoch": 0.16226262626262627, "grad_norm": 0.0819828063249588, "learning_rate": 0.00019902845588131457, "loss": 0.0901, "step": 2510 }, { "epoch": 0.16232727272727274, "grad_norm": 0.10002878308296204, "learning_rate": 0.0001990275047008418, "loss": 0.1271, "step": 2511 }, { "epoch": 0.1623919191919192, "grad_norm": 0.09526874125003815, "learning_rate": 0.0001990265530572501, "loss": 0.1011, "step": 2512 }, { "epoch": 0.1623919191919192, "eval_bleu": 14.323994324456736, "eval_loss": 0.09852258861064911, "eval_runtime": 2.6961, "eval_samples_per_second": 11.869, "eval_steps_per_second": 1.484, "step": 2512 }, { "epoch": 0.16245656565656566, "grad_norm": 0.084198959171772, "learning_rate": 0.00019902560095054397, "loss": 0.0908, "step": 2513 }, { "epoch": 0.16252121212121212, "grad_norm": 0.0899416133761406, "learning_rate": 0.00019902464838072784, "loss": 0.1136, "step": 2514 }, { "epoch": 0.16258585858585858, "grad_norm": 
0.08481559157371521, "learning_rate": 0.00019902369534780616, "loss": 0.0902, "step": 2515 }, { "epoch": 0.16265050505050505, "grad_norm": 0.07603602856397629, "learning_rate": 0.00019902274185178338, "loss": 0.0795, "step": 2516 }, { "epoch": 0.1627151515151515, "grad_norm": 0.0834091529250145, "learning_rate": 0.000199021787892664, "loss": 0.092, "step": 2517 }, { "epoch": 0.16277979797979797, "grad_norm": 0.10004030913114548, "learning_rate": 0.00019902083347045243, "loss": 0.0981, "step": 2518 }, { "epoch": 0.16284444444444446, "grad_norm": 0.08966339379549026, "learning_rate": 0.00019901987858515318, "loss": 0.0785, "step": 2519 }, { "epoch": 0.16290909090909092, "grad_norm": 0.10092730820178986, "learning_rate": 0.0001990189232367707, "loss": 0.1036, "step": 2520 }, { "epoch": 0.16297373737373738, "grad_norm": 0.09682455658912659, "learning_rate": 0.0001990179674253094, "loss": 0.086, "step": 2521 }, { "epoch": 0.16303838383838384, "grad_norm": 0.07699234038591385, "learning_rate": 0.00019901701115077386, "loss": 0.0901, "step": 2522 }, { "epoch": 0.1631030303030303, "grad_norm": 0.07940739393234253, "learning_rate": 0.00019901605441316846, "loss": 0.0905, "step": 2523 }, { "epoch": 0.16316767676767677, "grad_norm": 0.08646399527788162, "learning_rate": 0.0001990150972124977, "loss": 0.0968, "step": 2524 }, { "epoch": 0.16323232323232323, "grad_norm": 0.08938555419445038, "learning_rate": 0.00019901413954876609, "loss": 0.0707, "step": 2525 }, { "epoch": 0.1632969696969697, "grad_norm": 0.08455876260995865, "learning_rate": 0.00019901318142197808, "loss": 0.092, "step": 2526 }, { "epoch": 0.16336161616161615, "grad_norm": 0.10977445542812347, "learning_rate": 0.00019901222283213814, "loss": 0.0902, "step": 2527 }, { "epoch": 0.16342626262626261, "grad_norm": 0.06731515377759933, "learning_rate": 0.00019901126377925077, "loss": 0.0754, "step": 2528 }, { "epoch": 0.16342626262626261, "eval_bleu": 16.363837430237744, "eval_loss": 0.10056555271148682, 
"eval_runtime": 2.7592, "eval_samples_per_second": 11.598, "eval_steps_per_second": 1.45, "step": 2528 }, { "epoch": 0.1634909090909091, "grad_norm": 0.08187739551067352, "learning_rate": 0.00019901030426332046, "loss": 0.1005, "step": 2529 }, { "epoch": 0.16355555555555557, "grad_norm": 0.06836195290088654, "learning_rate": 0.0001990093442843517, "loss": 0.0772, "step": 2530 }, { "epoch": 0.16362020202020203, "grad_norm": 0.0767069086432457, "learning_rate": 0.0001990083838423489, "loss": 0.0891, "step": 2531 }, { "epoch": 0.1636848484848485, "grad_norm": 0.08812432736158371, "learning_rate": 0.00019900742293731672, "loss": 0.0919, "step": 2532 }, { "epoch": 0.16374949494949495, "grad_norm": 0.0799088403582573, "learning_rate": 0.0001990064615692595, "loss": 0.087, "step": 2533 }, { "epoch": 0.1638141414141414, "grad_norm": 0.08472167700529099, "learning_rate": 0.00019900549973818178, "loss": 0.0977, "step": 2534 }, { "epoch": 0.16387878787878787, "grad_norm": 0.07687152177095413, "learning_rate": 0.0001990045374440881, "loss": 0.0895, "step": 2535 }, { "epoch": 0.16394343434343434, "grad_norm": 0.08221257477998734, "learning_rate": 0.0001990035746869829, "loss": 0.0903, "step": 2536 }, { "epoch": 0.1640080808080808, "grad_norm": 0.07737286388874054, "learning_rate": 0.00019900261146687072, "loss": 0.0935, "step": 2537 }, { "epoch": 0.16407272727272726, "grad_norm": 0.08981961756944656, "learning_rate": 0.00019900164778375608, "loss": 0.1196, "step": 2538 }, { "epoch": 0.16413737373737375, "grad_norm": 0.0863732099533081, "learning_rate": 0.00019900068363764344, "loss": 0.0993, "step": 2539 }, { "epoch": 0.1642020202020202, "grad_norm": 0.08438657224178314, "learning_rate": 0.00019899971902853734, "loss": 0.1012, "step": 2540 }, { "epoch": 0.16426666666666667, "grad_norm": 0.07388683408498764, "learning_rate": 0.00019899875395644226, "loss": 0.0796, "step": 2541 }, { "epoch": 0.16433131313131313, "grad_norm": 0.07076214253902435, "learning_rate": 
0.00019899778842136278, "loss": 0.0699, "step": 2542 }, { "epoch": 0.1643959595959596, "grad_norm": 0.08369190990924835, "learning_rate": 0.00019899682242330332, "loss": 0.1134, "step": 2543 }, { "epoch": 0.16446060606060606, "grad_norm": 0.08069916814565659, "learning_rate": 0.00019899585596226847, "loss": 0.0893, "step": 2544 }, { "epoch": 0.16446060606060606, "eval_bleu": 14.397395841843759, "eval_loss": 0.09635031223297119, "eval_runtime": 2.666, "eval_samples_per_second": 12.003, "eval_steps_per_second": 1.5, "step": 2544 }, { "epoch": 0.16452525252525252, "grad_norm": 0.09031973779201508, "learning_rate": 0.00019899488903826274, "loss": 0.0901, "step": 2545 }, { "epoch": 0.16458989898989898, "grad_norm": 0.08417774736881256, "learning_rate": 0.00019899392165129064, "loss": 0.0898, "step": 2546 }, { "epoch": 0.16465454545454544, "grad_norm": 0.09317639470100403, "learning_rate": 0.00019899295380135668, "loss": 0.1036, "step": 2547 }, { "epoch": 0.16471919191919193, "grad_norm": 0.0843297690153122, "learning_rate": 0.00019899198548846542, "loss": 0.1095, "step": 2548 }, { "epoch": 0.1647838383838384, "grad_norm": 0.0838332399725914, "learning_rate": 0.00019899101671262136, "loss": 0.087, "step": 2549 }, { "epoch": 0.16484848484848486, "grad_norm": 0.08582288771867752, "learning_rate": 0.00019899004747382906, "loss": 0.1032, "step": 2550 }, { "epoch": 0.16491313131313132, "grad_norm": 0.1154182031750679, "learning_rate": 0.000198989077772093, "loss": 0.1073, "step": 2551 }, { "epoch": 0.16497777777777778, "grad_norm": 0.08523426949977875, "learning_rate": 0.00019898810760741778, "loss": 0.1185, "step": 2552 }, { "epoch": 0.16504242424242424, "grad_norm": 0.0796506404876709, "learning_rate": 0.0001989871369798079, "loss": 0.1016, "step": 2553 }, { "epoch": 0.1651070707070707, "grad_norm": 0.07208889722824097, "learning_rate": 0.00019898616588926785, "loss": 0.1019, "step": 2554 }, { "epoch": 0.16517171717171716, "grad_norm": 0.07410484552383423, "learning_rate": 
0.0001989851943358023, "loss": 0.0839, "step": 2555 }, { "epoch": 0.16523636363636363, "grad_norm": 0.09314025938510895, "learning_rate": 0.0001989842223194157, "loss": 0.1069, "step": 2556 }, { "epoch": 0.1653010101010101, "grad_norm": 0.07474405318498611, "learning_rate": 0.0001989832498401126, "loss": 0.105, "step": 2557 }, { "epoch": 0.16536565656565658, "grad_norm": 0.07362939417362213, "learning_rate": 0.00019898227689789754, "loss": 0.0819, "step": 2558 }, { "epoch": 0.16543030303030304, "grad_norm": 0.07677435129880905, "learning_rate": 0.00019898130349277514, "loss": 0.0991, "step": 2559 }, { "epoch": 0.1654949494949495, "grad_norm": 0.08707185089588165, "learning_rate": 0.00019898032962474988, "loss": 0.0934, "step": 2560 }, { "epoch": 0.1654949494949495, "eval_bleu": 15.90855580314918, "eval_loss": 0.09577429294586182, "eval_runtime": 2.7514, "eval_samples_per_second": 11.631, "eval_steps_per_second": 1.454, "step": 2560 }, { "epoch": 0.16555959595959596, "grad_norm": 0.06546158343553543, "learning_rate": 0.00019897935529382638, "loss": 0.0666, "step": 2561 }, { "epoch": 0.16562424242424242, "grad_norm": 0.0820000022649765, "learning_rate": 0.00019897838050000912, "loss": 0.0931, "step": 2562 }, { "epoch": 0.16568888888888889, "grad_norm": 0.07797069102525711, "learning_rate": 0.0001989774052433027, "loss": 0.0736, "step": 2563 }, { "epoch": 0.16575353535353535, "grad_norm": 0.07704420387744904, "learning_rate": 0.00019897642952371167, "loss": 0.0954, "step": 2564 }, { "epoch": 0.1658181818181818, "grad_norm": 0.08196604251861572, "learning_rate": 0.0001989754533412406, "loss": 0.0856, "step": 2565 }, { "epoch": 0.16588282828282827, "grad_norm": 0.08397327363491058, "learning_rate": 0.00019897447669589409, "loss": 0.0977, "step": 2566 }, { "epoch": 0.16594747474747476, "grad_norm": 0.07992056012153625, "learning_rate": 0.00019897349958767664, "loss": 0.0955, "step": 2567 }, { "epoch": 0.16601212121212122, "grad_norm": 0.10287249088287354, 
"learning_rate": 0.00019897252201659285, "loss": 0.1129, "step": 2568 }, { "epoch": 0.16607676767676768, "grad_norm": 0.08574705570936203, "learning_rate": 0.0001989715439826473, "loss": 0.0883, "step": 2569 }, { "epoch": 0.16614141414141415, "grad_norm": 0.0904935672879219, "learning_rate": 0.00019897056548584457, "loss": 0.0906, "step": 2570 }, { "epoch": 0.1662060606060606, "grad_norm": 0.08727099746465683, "learning_rate": 0.00019896958652618923, "loss": 0.0972, "step": 2571 }, { "epoch": 0.16627070707070707, "grad_norm": 0.0833154246211052, "learning_rate": 0.00019896860710368584, "loss": 0.0936, "step": 2572 }, { "epoch": 0.16633535353535353, "grad_norm": 0.06849158555269241, "learning_rate": 0.000198967627218339, "loss": 0.0843, "step": 2573 }, { "epoch": 0.1664, "grad_norm": 0.09881257265806198, "learning_rate": 0.00019896664687015327, "loss": 0.0923, "step": 2574 }, { "epoch": 0.16646464646464645, "grad_norm": 0.08209142833948135, "learning_rate": 0.00019896566605913325, "loss": 0.0936, "step": 2575 }, { "epoch": 0.16652929292929292, "grad_norm": 0.0810588151216507, "learning_rate": 0.00019896468478528356, "loss": 0.0929, "step": 2576 }, { "epoch": 0.16652929292929292, "eval_bleu": 12.835440513750095, "eval_loss": 0.09694577753543854, "eval_runtime": 2.7036, "eval_samples_per_second": 11.836, "eval_steps_per_second": 1.48, "step": 2576 }, { "epoch": 0.1665939393939394, "grad_norm": 0.07667204737663269, "learning_rate": 0.00019896370304860872, "loss": 0.0954, "step": 2577 }, { "epoch": 0.16665858585858587, "grad_norm": 0.06912059336900711, "learning_rate": 0.00019896272084911338, "loss": 0.0715, "step": 2578 }, { "epoch": 0.16672323232323233, "grad_norm": 0.06380753964185715, "learning_rate": 0.00019896173818680207, "loss": 0.0746, "step": 2579 }, { "epoch": 0.1667878787878788, "grad_norm": 0.0782894492149353, "learning_rate": 0.00019896075506167947, "loss": 0.0995, "step": 2580 }, { "epoch": 0.16685252525252525, "grad_norm": 0.0894986093044281, 
"learning_rate": 0.00019895977147375013, "loss": 0.1154, "step": 2581 }, { "epoch": 0.16691717171717171, "grad_norm": 0.0772000104188919, "learning_rate": 0.00019895878742301864, "loss": 0.093, "step": 2582 }, { "epoch": 0.16698181818181818, "grad_norm": 0.07464070618152618, "learning_rate": 0.0001989578029094896, "loss": 0.0702, "step": 2583 }, { "epoch": 0.16704646464646464, "grad_norm": 0.07156574726104736, "learning_rate": 0.0001989568179331677, "loss": 0.0781, "step": 2584 }, { "epoch": 0.1671111111111111, "grad_norm": 0.07308657467365265, "learning_rate": 0.00019895583249405742, "loss": 0.0728, "step": 2585 }, { "epoch": 0.1671757575757576, "grad_norm": 0.06771235167980194, "learning_rate": 0.00019895484659216344, "loss": 0.0923, "step": 2586 }, { "epoch": 0.16724040404040405, "grad_norm": 0.12589362263679504, "learning_rate": 0.00019895386022749035, "loss": 0.0999, "step": 2587 }, { "epoch": 0.1673050505050505, "grad_norm": 0.07527434825897217, "learning_rate": 0.00019895287340004276, "loss": 0.0884, "step": 2588 }, { "epoch": 0.16736969696969697, "grad_norm": 0.09707389026880264, "learning_rate": 0.00019895188610982533, "loss": 0.1048, "step": 2589 }, { "epoch": 0.16743434343434344, "grad_norm": 0.07965560257434845, "learning_rate": 0.0001989508983568426, "loss": 0.0828, "step": 2590 }, { "epoch": 0.1674989898989899, "grad_norm": 0.0849994346499443, "learning_rate": 0.00019894991014109925, "loss": 0.1028, "step": 2591 }, { "epoch": 0.16756363636363636, "grad_norm": 0.061546456068754196, "learning_rate": 0.00019894892146259986, "loss": 0.0572, "step": 2592 }, { "epoch": 0.16756363636363636, "eval_bleu": 11.527252105818075, "eval_loss": 0.09925778955221176, "eval_runtime": 2.7989, "eval_samples_per_second": 11.433, "eval_steps_per_second": 1.429, "step": 2592 }, { "epoch": 0.16762828282828282, "grad_norm": 0.12852871417999268, "learning_rate": 0.00019894793232134913, "loss": 0.1222, "step": 2593 }, { "epoch": 0.16769292929292928, "grad_norm": 
0.07851792871952057, "learning_rate": 0.00019894694271735159, "loss": 0.0937, "step": 2594 }, { "epoch": 0.16775757575757574, "grad_norm": 0.08756095916032791, "learning_rate": 0.00019894595265061192, "loss": 0.0778, "step": 2595 }, { "epoch": 0.16782222222222223, "grad_norm": 0.07558748126029968, "learning_rate": 0.00019894496212113474, "loss": 0.0981, "step": 2596 }, { "epoch": 0.1678868686868687, "grad_norm": 0.07501440495252609, "learning_rate": 0.00019894397112892465, "loss": 0.0964, "step": 2597 }, { "epoch": 0.16795151515151516, "grad_norm": 0.13666491210460663, "learning_rate": 0.00019894297967398638, "loss": 0.1389, "step": 2598 }, { "epoch": 0.16801616161616162, "grad_norm": 0.07506051659584045, "learning_rate": 0.00019894198775632446, "loss": 0.0923, "step": 2599 }, { "epoch": 0.16808080808080808, "grad_norm": 0.07255323231220245, "learning_rate": 0.0001989409953759436, "loss": 0.0832, "step": 2600 }, { "epoch": 0.16814545454545454, "grad_norm": 0.07257086783647537, "learning_rate": 0.0001989400025328484, "loss": 0.0854, "step": 2601 }, { "epoch": 0.168210101010101, "grad_norm": 0.0797615498304367, "learning_rate": 0.00019893900922704353, "loss": 0.0986, "step": 2602 }, { "epoch": 0.16827474747474747, "grad_norm": 0.08850092440843582, "learning_rate": 0.00019893801545853358, "loss": 0.1085, "step": 2603 }, { "epoch": 0.16833939393939393, "grad_norm": 0.09193852543830872, "learning_rate": 0.0001989370212273233, "loss": 0.0788, "step": 2604 }, { "epoch": 0.16840404040404042, "grad_norm": 0.09792643040418625, "learning_rate": 0.00019893602653341726, "loss": 0.1048, "step": 2605 }, { "epoch": 0.16846868686868688, "grad_norm": 0.08455830812454224, "learning_rate": 0.0001989350313768201, "loss": 0.1017, "step": 2606 }, { "epoch": 0.16853333333333334, "grad_norm": 0.07427718490362167, "learning_rate": 0.00019893403575753653, "loss": 0.0883, "step": 2607 }, { "epoch": 0.1685979797979798, "grad_norm": 0.07758526504039764, "learning_rate": 0.00019893303967557117, 
"loss": 0.1015, "step": 2608 }, { "epoch": 0.1685979797979798, "eval_bleu": 13.913496869616926, "eval_loss": 0.09879723936319351, "eval_runtime": 2.7196, "eval_samples_per_second": 11.766, "eval_steps_per_second": 1.471, "step": 2608 }, { "epoch": 0.16866262626262626, "grad_norm": 0.06988576054573059, "learning_rate": 0.0001989320431309287, "loss": 0.082, "step": 2609 }, { "epoch": 0.16872727272727273, "grad_norm": 0.07797277718782425, "learning_rate": 0.00019893104612361378, "loss": 0.0915, "step": 2610 }, { "epoch": 0.1687919191919192, "grad_norm": 0.07000397890806198, "learning_rate": 0.00019893004865363106, "loss": 0.0882, "step": 2611 }, { "epoch": 0.16885656565656565, "grad_norm": 0.07160747051239014, "learning_rate": 0.0001989290507209852, "loss": 0.0839, "step": 2612 }, { "epoch": 0.1689212121212121, "grad_norm": 0.08234173059463501, "learning_rate": 0.00019892805232568086, "loss": 0.096, "step": 2613 }, { "epoch": 0.16898585858585857, "grad_norm": 0.07149218767881393, "learning_rate": 0.00019892705346772274, "loss": 0.0919, "step": 2614 }, { "epoch": 0.16905050505050506, "grad_norm": 0.08471017330884933, "learning_rate": 0.0001989260541471155, "loss": 0.0955, "step": 2615 }, { "epoch": 0.16911515151515152, "grad_norm": 0.08480434864759445, "learning_rate": 0.0001989250543638638, "loss": 0.0818, "step": 2616 }, { "epoch": 0.16917979797979799, "grad_norm": 0.07190948724746704, "learning_rate": 0.00019892405411797232, "loss": 0.0702, "step": 2617 }, { "epoch": 0.16924444444444445, "grad_norm": 0.07594552636146545, "learning_rate": 0.00019892305340944578, "loss": 0.0875, "step": 2618 }, { "epoch": 0.1693090909090909, "grad_norm": 0.07243770360946655, "learning_rate": 0.00019892205223828876, "loss": 0.0922, "step": 2619 }, { "epoch": 0.16937373737373737, "grad_norm": 0.0926441103219986, "learning_rate": 0.00019892105060450606, "loss": 0.1028, "step": 2620 }, { "epoch": 0.16943838383838383, "grad_norm": 0.08102330565452576, "learning_rate": 
0.0001989200485081023, "loss": 0.1076, "step": 2621 }, { "epoch": 0.1695030303030303, "grad_norm": 0.08745792508125305, "learning_rate": 0.00019891904594908212, "loss": 0.1199, "step": 2622 }, { "epoch": 0.16956767676767676, "grad_norm": 0.0709044560790062, "learning_rate": 0.00019891804292745033, "loss": 0.0765, "step": 2623 }, { "epoch": 0.16963232323232325, "grad_norm": 0.07396527379751205, "learning_rate": 0.00019891703944321153, "loss": 0.1011, "step": 2624 }, { "epoch": 0.16963232323232325, "eval_bleu": 11.959962870075357, "eval_loss": 0.09875558316707611, "eval_runtime": 2.9192, "eval_samples_per_second": 10.962, "eval_steps_per_second": 1.37, "step": 2624 }, { "epoch": 0.1696969696969697, "grad_norm": 0.09100839495658875, "learning_rate": 0.00019891603549637043, "loss": 0.1102, "step": 2625 }, { "epoch": 0.16976161616161617, "grad_norm": 0.07390713691711426, "learning_rate": 0.00019891503108693175, "loss": 0.0983, "step": 2626 }, { "epoch": 0.16982626262626263, "grad_norm": 0.0841887816786766, "learning_rate": 0.00019891402621490016, "loss": 0.1207, "step": 2627 }, { "epoch": 0.1698909090909091, "grad_norm": 0.0684683546423912, "learning_rate": 0.00019891302088028039, "loss": 0.0804, "step": 2628 }, { "epoch": 0.16995555555555555, "grad_norm": 0.07695214450359344, "learning_rate": 0.00019891201508307708, "loss": 0.0961, "step": 2629 }, { "epoch": 0.17002020202020202, "grad_norm": 0.1141047403216362, "learning_rate": 0.000198911008823295, "loss": 0.1129, "step": 2630 }, { "epoch": 0.17008484848484848, "grad_norm": 0.08545970916748047, "learning_rate": 0.00019891000210093887, "loss": 0.1025, "step": 2631 }, { "epoch": 0.17014949494949494, "grad_norm": 0.08067265152931213, "learning_rate": 0.00019890899491601332, "loss": 0.0873, "step": 2632 }, { "epoch": 0.1702141414141414, "grad_norm": 0.06937942653894424, "learning_rate": 0.00019890798726852308, "loss": 0.0813, "step": 2633 }, { "epoch": 0.1702787878787879, "grad_norm": 0.07903715968132019, "learning_rate": 
0.00019890697915847289, "loss": 0.0909, "step": 2634 }, { "epoch": 0.17034343434343435, "grad_norm": 0.08862186223268509, "learning_rate": 0.0001989059705858675, "loss": 0.1001, "step": 2635 }, { "epoch": 0.17040808080808081, "grad_norm": 0.07973022758960724, "learning_rate": 0.00019890496155071152, "loss": 0.0891, "step": 2636 }, { "epoch": 0.17047272727272728, "grad_norm": 0.077756866812706, "learning_rate": 0.00019890395205300978, "loss": 0.0941, "step": 2637 }, { "epoch": 0.17053737373737374, "grad_norm": 0.09999898076057434, "learning_rate": 0.00019890294209276693, "loss": 0.1058, "step": 2638 }, { "epoch": 0.1706020202020202, "grad_norm": 0.08280091732740402, "learning_rate": 0.0001989019316699877, "loss": 0.0991, "step": 2639 }, { "epoch": 0.17066666666666666, "grad_norm": 0.0946156457066536, "learning_rate": 0.00019890092078467687, "loss": 0.1022, "step": 2640 }, { "epoch": 0.17066666666666666, "eval_bleu": 13.899835500734325, "eval_loss": 0.09693938493728638, "eval_runtime": 2.6787, "eval_samples_per_second": 11.946, "eval_steps_per_second": 1.493, "step": 2640 }, { "epoch": 0.17073131313131312, "grad_norm": 0.07308728992938995, "learning_rate": 0.00019889990943683912, "loss": 0.0867, "step": 2641 }, { "epoch": 0.17079595959595958, "grad_norm": 0.08340651541948318, "learning_rate": 0.00019889889762647917, "loss": 0.0973, "step": 2642 }, { "epoch": 0.17086060606060607, "grad_norm": 0.06602702289819717, "learning_rate": 0.00019889788535360178, "loss": 0.08, "step": 2643 }, { "epoch": 0.17092525252525254, "grad_norm": 0.08125805854797363, "learning_rate": 0.0001988968726182117, "loss": 0.0898, "step": 2644 }, { "epoch": 0.170989898989899, "grad_norm": 0.08202677220106125, "learning_rate": 0.00019889585942031363, "loss": 0.1065, "step": 2645 }, { "epoch": 0.17105454545454546, "grad_norm": 0.06917142122983932, "learning_rate": 0.0001988948457599123, "loss": 0.0808, "step": 2646 }, { "epoch": 0.17111919191919192, "grad_norm": 0.08310998231172562, 
"learning_rate": 0.0001988938316370125, "loss": 0.0974, "step": 2647 }, { "epoch": 0.17118383838383838, "grad_norm": 0.08297887444496155, "learning_rate": 0.0001988928170516189, "loss": 0.113, "step": 2648 }, { "epoch": 0.17124848484848484, "grad_norm": 0.08804168552160263, "learning_rate": 0.00019889180200373632, "loss": 0.0992, "step": 2649 }, { "epoch": 0.1713131313131313, "grad_norm": 0.1213785782456398, "learning_rate": 0.00019889078649336947, "loss": 0.0911, "step": 2650 }, { "epoch": 0.17137777777777777, "grad_norm": 0.08714232593774796, "learning_rate": 0.0001988897705205231, "loss": 0.0938, "step": 2651 }, { "epoch": 0.17144242424242423, "grad_norm": 0.0836285948753357, "learning_rate": 0.00019888875408520198, "loss": 0.0916, "step": 2652 }, { "epoch": 0.17150707070707072, "grad_norm": 0.08034289628267288, "learning_rate": 0.0001988877371874108, "loss": 0.0957, "step": 2653 }, { "epoch": 0.17157171717171718, "grad_norm": 0.12397786229848862, "learning_rate": 0.0001988867198271544, "loss": 0.1057, "step": 2654 }, { "epoch": 0.17163636363636364, "grad_norm": 0.07458718866109848, "learning_rate": 0.00019888570200443754, "loss": 0.0928, "step": 2655 }, { "epoch": 0.1717010101010101, "grad_norm": 0.07918529212474823, "learning_rate": 0.0001988846837192649, "loss": 0.0927, "step": 2656 }, { "epoch": 0.1717010101010101, "eval_bleu": 12.301804817046529, "eval_loss": 0.09886422753334045, "eval_runtime": 2.7667, "eval_samples_per_second": 11.566, "eval_steps_per_second": 1.446, "step": 2656 }, { "epoch": 0.17176565656565657, "grad_norm": 0.0783635675907135, "learning_rate": 0.00019888366497164127, "loss": 0.0862, "step": 2657 }, { "epoch": 0.17183030303030303, "grad_norm": 0.07257837057113647, "learning_rate": 0.00019888264576157147, "loss": 0.0906, "step": 2658 }, { "epoch": 0.1718949494949495, "grad_norm": 0.08375464379787445, "learning_rate": 0.0001988816260890602, "loss": 0.0955, "step": 2659 }, { "epoch": 0.17195959595959595, "grad_norm": 0.07692074030637741, 
"learning_rate": 0.00019888060595411227, "loss": 0.087, "step": 2660 }, { "epoch": 0.1720242424242424, "grad_norm": 0.08843611925840378, "learning_rate": 0.0001988795853567324, "loss": 0.0972, "step": 2661 }, { "epoch": 0.1720888888888889, "grad_norm": 0.10938207805156708, "learning_rate": 0.00019887856429692545, "loss": 0.1112, "step": 2662 }, { "epoch": 0.17215353535353536, "grad_norm": 0.08177085220813751, "learning_rate": 0.0001988775427746961, "loss": 0.0925, "step": 2663 }, { "epoch": 0.17221818181818183, "grad_norm": 0.08051607757806778, "learning_rate": 0.0001988765207900492, "loss": 0.0894, "step": 2664 }, { "epoch": 0.1722828282828283, "grad_norm": 0.0673099160194397, "learning_rate": 0.00019887549834298948, "loss": 0.0808, "step": 2665 }, { "epoch": 0.17234747474747475, "grad_norm": 0.07069753855466843, "learning_rate": 0.00019887447543352177, "loss": 0.0828, "step": 2666 }, { "epoch": 0.1724121212121212, "grad_norm": 0.0805334821343422, "learning_rate": 0.0001988734520616508, "loss": 0.1047, "step": 2667 }, { "epoch": 0.17247676767676767, "grad_norm": 0.0879552885890007, "learning_rate": 0.00019887242822738137, "loss": 0.0935, "step": 2668 }, { "epoch": 0.17254141414141413, "grad_norm": 0.07935614138841629, "learning_rate": 0.00019887140393071831, "loss": 0.0922, "step": 2669 }, { "epoch": 0.1726060606060606, "grad_norm": 0.09668657183647156, "learning_rate": 0.00019887037917166637, "loss": 0.1058, "step": 2670 }, { "epoch": 0.17267070707070706, "grad_norm": 0.07810390740633011, "learning_rate": 0.00019886935395023035, "loss": 0.0972, "step": 2671 }, { "epoch": 0.17273535353535355, "grad_norm": 0.16058142483234406, "learning_rate": 0.00019886832826641505, "loss": 0.1046, "step": 2672 }, { "epoch": 0.17273535353535355, "eval_bleu": 12.522160034978606, "eval_loss": 0.09890874475240707, "eval_runtime": 2.6074, "eval_samples_per_second": 12.273, "eval_steps_per_second": 1.534, "step": 2672 }, { "epoch": 0.1728, "grad_norm": 0.10017416626214981, 
"learning_rate": 0.00019886730212022527, "loss": 0.0864, "step": 2673 }, { "epoch": 0.17286464646464647, "grad_norm": 0.0689869299530983, "learning_rate": 0.0001988662755116658, "loss": 0.0846, "step": 2674 }, { "epoch": 0.17292929292929293, "grad_norm": 0.06987910717725754, "learning_rate": 0.00019886524844074142, "loss": 0.0834, "step": 2675 }, { "epoch": 0.1729939393939394, "grad_norm": 0.08495312929153442, "learning_rate": 0.00019886422090745697, "loss": 0.0997, "step": 2676 }, { "epoch": 0.17305858585858586, "grad_norm": 0.06730899959802628, "learning_rate": 0.00019886319291181724, "loss": 0.0752, "step": 2677 }, { "epoch": 0.17312323232323232, "grad_norm": 0.0818965882062912, "learning_rate": 0.00019886216445382706, "loss": 0.0982, "step": 2678 }, { "epoch": 0.17318787878787878, "grad_norm": 0.06921021640300751, "learning_rate": 0.00019886113553349121, "loss": 0.0815, "step": 2679 }, { "epoch": 0.17325252525252524, "grad_norm": 0.07900305837392807, "learning_rate": 0.00019886010615081451, "loss": 0.0872, "step": 2680 }, { "epoch": 0.17331717171717173, "grad_norm": 0.06973530352115631, "learning_rate": 0.00019885907630580178, "loss": 0.0825, "step": 2681 }, { "epoch": 0.1733818181818182, "grad_norm": 0.07682498544454575, "learning_rate": 0.00019885804599845784, "loss": 0.0777, "step": 2682 }, { "epoch": 0.17344646464646465, "grad_norm": 0.07817425578832626, "learning_rate": 0.00019885701522878748, "loss": 0.1032, "step": 2683 }, { "epoch": 0.17351111111111112, "grad_norm": 0.0742013081908226, "learning_rate": 0.00019885598399679554, "loss": 0.0985, "step": 2684 }, { "epoch": 0.17357575757575758, "grad_norm": 0.07024482637643814, "learning_rate": 0.00019885495230248688, "loss": 0.0844, "step": 2685 }, { "epoch": 0.17364040404040404, "grad_norm": 0.07693494856357574, "learning_rate": 0.00019885392014586627, "loss": 0.0931, "step": 2686 }, { "epoch": 0.1737050505050505, "grad_norm": 0.06686500459909439, "learning_rate": 0.0001988528875269385, "loss": 0.0788, 
"step": 2687 }, { "epoch": 0.17376969696969696, "grad_norm": 0.06792235374450684, "learning_rate": 0.00019885185444570852, "loss": 0.0861, "step": 2688 }, { "epoch": 0.17376969696969696, "eval_bleu": 13.350449363620942, "eval_loss": 0.09692772477865219, "eval_runtime": 2.911, "eval_samples_per_second": 10.993, "eval_steps_per_second": 1.374, "step": 2688 }, { "epoch": 0.17383434343434342, "grad_norm": 0.07090412080287933, "learning_rate": 0.0001988508209021811, "loss": 0.0805, "step": 2689 }, { "epoch": 0.1738989898989899, "grad_norm": 0.07681573927402496, "learning_rate": 0.00019884978689636105, "loss": 0.1042, "step": 2690 }, { "epoch": 0.17396363636363638, "grad_norm": 0.07938114553689957, "learning_rate": 0.00019884875242825323, "loss": 0.1041, "step": 2691 }, { "epoch": 0.17402828282828284, "grad_norm": 0.07724738121032715, "learning_rate": 0.00019884771749786246, "loss": 0.1072, "step": 2692 }, { "epoch": 0.1740929292929293, "grad_norm": 0.11972406506538391, "learning_rate": 0.00019884668210519362, "loss": 0.0902, "step": 2693 }, { "epoch": 0.17415757575757576, "grad_norm": 0.080725759267807, "learning_rate": 0.00019884564625025148, "loss": 0.1103, "step": 2694 }, { "epoch": 0.17422222222222222, "grad_norm": 0.06825319677591324, "learning_rate": 0.000198844609933041, "loss": 0.0743, "step": 2695 }, { "epoch": 0.17428686868686868, "grad_norm": 0.06799200922250748, "learning_rate": 0.0001988435731535669, "loss": 0.077, "step": 2696 }, { "epoch": 0.17435151515151515, "grad_norm": 0.07976427674293518, "learning_rate": 0.00019884253591183408, "loss": 0.0924, "step": 2697 }, { "epoch": 0.1744161616161616, "grad_norm": 0.06222952902317047, "learning_rate": 0.00019884149820784743, "loss": 0.0706, "step": 2698 }, { "epoch": 0.17448080808080807, "grad_norm": 0.07017695903778076, "learning_rate": 0.00019884046004161175, "loss": 0.0789, "step": 2699 }, { "epoch": 0.17454545454545456, "grad_norm": 0.0755394771695137, "learning_rate": 0.00019883942141313195, "loss": 
0.0961, "step": 2700 }, { "epoch": 0.17461010101010102, "grad_norm": 0.08314016461372375, "learning_rate": 0.0001988383823224128, "loss": 0.0994, "step": 2701 }, { "epoch": 0.17467474747474748, "grad_norm": 0.079079769551754, "learning_rate": 0.00019883734276945924, "loss": 0.0885, "step": 2702 }, { "epoch": 0.17473939393939394, "grad_norm": 0.08789471536874771, "learning_rate": 0.0001988363027542761, "loss": 0.1044, "step": 2703 }, { "epoch": 0.1748040404040404, "grad_norm": 0.08137782663106918, "learning_rate": 0.00019883526227686824, "loss": 0.0897, "step": 2704 }, { "epoch": 0.1748040404040404, "eval_bleu": 15.714732085423615, "eval_loss": 0.09762945771217346, "eval_runtime": 2.7307, "eval_samples_per_second": 11.719, "eval_steps_per_second": 1.465, "step": 2704 }, { "epoch": 0.17486868686868687, "grad_norm": 0.07812388241291046, "learning_rate": 0.00019883422133724056, "loss": 0.0925, "step": 2705 }, { "epoch": 0.17493333333333333, "grad_norm": 0.06671834737062454, "learning_rate": 0.00019883317993539787, "loss": 0.0777, "step": 2706 }, { "epoch": 0.1749979797979798, "grad_norm": 0.07645226269960403, "learning_rate": 0.00019883213807134507, "loss": 0.0892, "step": 2707 }, { "epoch": 0.17506262626262625, "grad_norm": 0.07224711030721664, "learning_rate": 0.00019883109574508705, "loss": 0.0757, "step": 2708 }, { "epoch": 0.17512727272727271, "grad_norm": 0.07484664022922516, "learning_rate": 0.00019883005295662866, "loss": 0.0854, "step": 2709 }, { "epoch": 0.1751919191919192, "grad_norm": 0.07461053878068924, "learning_rate": 0.0001988290097059748, "loss": 0.0843, "step": 2710 }, { "epoch": 0.17525656565656567, "grad_norm": 0.08061341941356659, "learning_rate": 0.0001988279659931303, "loss": 0.0841, "step": 2711 }, { "epoch": 0.17532121212121213, "grad_norm": 0.0966501384973526, "learning_rate": 0.0001988269218181001, "loss": 0.1058, "step": 2712 }, { "epoch": 0.1753858585858586, "grad_norm": 0.0789634957909584, "learning_rate": 0.00019882587718088906, "loss": 
0.1, "step": 2713 }, { "epoch": 0.17545050505050505, "grad_norm": 0.08129527419805527, "learning_rate": 0.00019882483208150204, "loss": 0.0998, "step": 2714 }, { "epoch": 0.1755151515151515, "grad_norm": 0.08067364245653152, "learning_rate": 0.00019882378651994397, "loss": 0.0834, "step": 2715 }, { "epoch": 0.17557979797979797, "grad_norm": 0.09063126891851425, "learning_rate": 0.00019882274049621974, "loss": 0.0875, "step": 2716 }, { "epoch": 0.17564444444444444, "grad_norm": 0.08005023747682571, "learning_rate": 0.00019882169401033419, "loss": 0.1037, "step": 2717 }, { "epoch": 0.1757090909090909, "grad_norm": 0.0804174467921257, "learning_rate": 0.00019882064706229225, "loss": 0.0916, "step": 2718 }, { "epoch": 0.1757737373737374, "grad_norm": 0.07512813061475754, "learning_rate": 0.00019881959965209882, "loss": 0.0936, "step": 2719 }, { "epoch": 0.17583838383838385, "grad_norm": 0.06768741458654404, "learning_rate": 0.0001988185517797588, "loss": 0.092, "step": 2720 }, { "epoch": 0.17583838383838385, "eval_bleu": 18.492274221283008, "eval_loss": 0.09604879468679428, "eval_runtime": 2.9265, "eval_samples_per_second": 10.935, "eval_steps_per_second": 1.367, "step": 2720 }, { "epoch": 0.1759030303030303, "grad_norm": 0.07086288183927536, "learning_rate": 0.0001988175034452771, "loss": 0.0907, "step": 2721 }, { "epoch": 0.17596767676767677, "grad_norm": 0.07924073934555054, "learning_rate": 0.00019881645464865858, "loss": 0.1034, "step": 2722 }, { "epoch": 0.17603232323232323, "grad_norm": 0.07363121211528778, "learning_rate": 0.00019881540538990814, "loss": 0.0906, "step": 2723 }, { "epoch": 0.1760969696969697, "grad_norm": 0.09316180646419525, "learning_rate": 0.00019881435566903078, "loss": 0.1154, "step": 2724 }, { "epoch": 0.17616161616161616, "grad_norm": 0.09084955602884293, "learning_rate": 0.00019881330548603127, "loss": 0.1133, "step": 2725 }, { "epoch": 0.17622626262626262, "grad_norm": 0.08407731354236603, "learning_rate": 0.00019881225484091465, 
"loss": 0.0916, "step": 2726 }, { "epoch": 0.17629090909090908, "grad_norm": 0.08583284914493561, "learning_rate": 0.00019881120373368578, "loss": 0.0987, "step": 2727 }, { "epoch": 0.17635555555555554, "grad_norm": 0.08164938539266586, "learning_rate": 0.00019881015216434956, "loss": 0.1065, "step": 2728 }, { "epoch": 0.17642020202020203, "grad_norm": 0.09231586009263992, "learning_rate": 0.00019880910013291093, "loss": 0.0985, "step": 2729 }, { "epoch": 0.1764848484848485, "grad_norm": 0.07795228064060211, "learning_rate": 0.0001988080476393748, "loss": 0.0898, "step": 2730 }, { "epoch": 0.17654949494949496, "grad_norm": 0.06641272455453873, "learning_rate": 0.00019880699468374612, "loss": 0.0797, "step": 2731 }, { "epoch": 0.17661414141414142, "grad_norm": 0.07230307906866074, "learning_rate": 0.00019880594126602978, "loss": 0.0974, "step": 2732 }, { "epoch": 0.17667878787878788, "grad_norm": 0.07935388386249542, "learning_rate": 0.0001988048873862307, "loss": 0.0928, "step": 2733 }, { "epoch": 0.17674343434343434, "grad_norm": 0.07873757928609848, "learning_rate": 0.00019880383304435385, "loss": 0.097, "step": 2734 }, { "epoch": 0.1768080808080808, "grad_norm": 0.07702283561229706, "learning_rate": 0.0001988027782404041, "loss": 0.0874, "step": 2735 }, { "epoch": 0.17687272727272726, "grad_norm": 0.08321177959442139, "learning_rate": 0.00019880172297438646, "loss": 0.1041, "step": 2736 }, { "epoch": 0.17687272727272726, "eval_bleu": 12.285483389687151, "eval_loss": 0.09538325667381287, "eval_runtime": 2.6616, "eval_samples_per_second": 12.023, "eval_steps_per_second": 1.503, "step": 2736 }, { "epoch": 0.17693737373737373, "grad_norm": 0.07662086188793182, "learning_rate": 0.0001988006672463058, "loss": 0.0908, "step": 2737 }, { "epoch": 0.1770020202020202, "grad_norm": 0.0868905708193779, "learning_rate": 0.00019879961105616708, "loss": 0.0984, "step": 2738 }, { "epoch": 0.17706666666666668, "grad_norm": 0.06956250220537186, "learning_rate": 
0.00019879855440397526, "loss": 0.0857, "step": 2739 }, { "epoch": 0.17713131313131314, "grad_norm": 0.07216699421405792, "learning_rate": 0.00019879749728973526, "loss": 0.0866, "step": 2740 }, { "epoch": 0.1771959595959596, "grad_norm": 0.08254578709602356, "learning_rate": 0.00019879643971345204, "loss": 0.0986, "step": 2741 }, { "epoch": 0.17726060606060606, "grad_norm": 0.07362942397594452, "learning_rate": 0.0001987953816751305, "loss": 0.1042, "step": 2742 }, { "epoch": 0.17732525252525252, "grad_norm": 0.06621770560741425, "learning_rate": 0.00019879432317477562, "loss": 0.0885, "step": 2743 }, { "epoch": 0.177389898989899, "grad_norm": 0.08045028150081635, "learning_rate": 0.00019879326421239237, "loss": 0.0918, "step": 2744 }, { "epoch": 0.17745454545454545, "grad_norm": 0.071648508310318, "learning_rate": 0.00019879220478798569, "loss": 0.0896, "step": 2745 }, { "epoch": 0.1775191919191919, "grad_norm": 0.0943872258067131, "learning_rate": 0.00019879114490156053, "loss": 0.1169, "step": 2746 }, { "epoch": 0.17758383838383837, "grad_norm": 0.06958484649658203, "learning_rate": 0.00019879008455312181, "loss": 0.096, "step": 2747 }, { "epoch": 0.17764848484848486, "grad_norm": 0.07820151001214981, "learning_rate": 0.00019878902374267457, "loss": 0.0851, "step": 2748 }, { "epoch": 0.17771313131313132, "grad_norm": 0.07022452354431152, "learning_rate": 0.00019878796247022368, "loss": 0.0773, "step": 2749 }, { "epoch": 0.17777777777777778, "grad_norm": 0.08070288598537445, "learning_rate": 0.00019878690073577417, "loss": 0.0946, "step": 2750 }, { "epoch": 0.17784242424242425, "grad_norm": 0.0653889998793602, "learning_rate": 0.000198785838539331, "loss": 0.0764, "step": 2751 }, { "epoch": 0.1779070707070707, "grad_norm": 0.08429327607154846, "learning_rate": 0.0001987847758808991, "loss": 0.0886, "step": 2752 }, { "epoch": 0.1779070707070707, "eval_bleu": 11.99463844629718, "eval_loss": 0.09583473950624466, "eval_runtime": 2.7464, "eval_samples_per_second": 
11.651, "eval_steps_per_second": 1.456, "step": 2752 }, { "epoch": 0.17797171717171717, "grad_norm": 0.08609875291585922, "learning_rate": 0.00019878371276048346, "loss": 0.0955, "step": 2753 }, { "epoch": 0.17803636363636363, "grad_norm": 0.06654812395572662, "learning_rate": 0.00019878264917808907, "loss": 0.0755, "step": 2754 }, { "epoch": 0.1781010101010101, "grad_norm": 0.16480769217014313, "learning_rate": 0.00019878158513372086, "loss": 0.1009, "step": 2755 }, { "epoch": 0.17816565656565655, "grad_norm": 0.08210154622793198, "learning_rate": 0.00019878052062738386, "loss": 0.0983, "step": 2756 }, { "epoch": 0.17823030303030302, "grad_norm": 0.0810609981417656, "learning_rate": 0.000198779455659083, "loss": 0.0879, "step": 2757 }, { "epoch": 0.1782949494949495, "grad_norm": 0.07702967524528503, "learning_rate": 0.0001987783902288233, "loss": 0.0945, "step": 2758 }, { "epoch": 0.17835959595959597, "grad_norm": 0.07689771056175232, "learning_rate": 0.0001987773243366097, "loss": 0.0874, "step": 2759 }, { "epoch": 0.17842424242424243, "grad_norm": 0.08530224114656448, "learning_rate": 0.00019877625798244726, "loss": 0.1071, "step": 2760 }, { "epoch": 0.1784888888888889, "grad_norm": 0.08010605722665787, "learning_rate": 0.00019877519116634086, "loss": 0.0919, "step": 2761 }, { "epoch": 0.17855353535353535, "grad_norm": 0.09572355449199677, "learning_rate": 0.00019877412388829557, "loss": 0.1053, "step": 2762 }, { "epoch": 0.17861818181818181, "grad_norm": 0.07287169992923737, "learning_rate": 0.00019877305614831637, "loss": 0.0858, "step": 2763 }, { "epoch": 0.17868282828282828, "grad_norm": 0.07264987379312515, "learning_rate": 0.0001987719879464082, "loss": 0.0937, "step": 2764 }, { "epoch": 0.17874747474747474, "grad_norm": 0.09174101799726486, "learning_rate": 0.00019877091928257614, "loss": 0.0825, "step": 2765 }, { "epoch": 0.1788121212121212, "grad_norm": 0.07856519520282745, "learning_rate": 0.0001987698501568251, "loss": 0.0926, "step": 2766 }, { 
"epoch": 0.1788767676767677, "grad_norm": 0.07437831908464432, "learning_rate": 0.00019876878056916016, "loss": 0.0876, "step": 2767 }, { "epoch": 0.17894141414141415, "grad_norm": 0.07657970488071442, "learning_rate": 0.00019876771051958626, "loss": 0.0951, "step": 2768 }, { "epoch": 0.17894141414141415, "eval_bleu": 12.501776313075313, "eval_loss": 0.096172034740448, "eval_runtime": 2.5926, "eval_samples_per_second": 12.343, "eval_steps_per_second": 1.543, "step": 2768 }, { "epoch": 0.1790060606060606, "grad_norm": 0.07658912986516953, "learning_rate": 0.00019876664000810845, "loss": 0.0856, "step": 2769 }, { "epoch": 0.17907070707070707, "grad_norm": 0.0717838853597641, "learning_rate": 0.00019876556903473171, "loss": 0.0791, "step": 2770 }, { "epoch": 0.17913535353535354, "grad_norm": 0.08511864393949509, "learning_rate": 0.00019876449759946105, "loss": 0.1022, "step": 2771 }, { "epoch": 0.1792, "grad_norm": 0.07913220673799515, "learning_rate": 0.00019876342570230147, "loss": 0.0921, "step": 2772 }, { "epoch": 0.17926464646464646, "grad_norm": 0.07598750293254852, "learning_rate": 0.00019876235334325803, "loss": 0.0921, "step": 2773 }, { "epoch": 0.17932929292929292, "grad_norm": 0.11733365803956985, "learning_rate": 0.00019876128052233568, "loss": 0.1033, "step": 2774 }, { "epoch": 0.17939393939393938, "grad_norm": 0.07843891531229019, "learning_rate": 0.00019876020723953952, "loss": 0.0839, "step": 2775 }, { "epoch": 0.17945858585858585, "grad_norm": 0.06839077919721603, "learning_rate": 0.00019875913349487448, "loss": 0.0745, "step": 2776 }, { "epoch": 0.17952323232323233, "grad_norm": 0.07148474454879761, "learning_rate": 0.00019875805928834566, "loss": 0.0944, "step": 2777 }, { "epoch": 0.1795878787878788, "grad_norm": 0.07561158388853073, "learning_rate": 0.000198756984619958, "loss": 0.089, "step": 2778 }, { "epoch": 0.17965252525252526, "grad_norm": 0.07801327109336853, "learning_rate": 0.0001987559094897166, "loss": 0.0989, "step": 2779 }, { "epoch": 
0.17971717171717172, "grad_norm": 0.07930078357458115, "learning_rate": 0.00019875483389762645, "loss": 0.0961, "step": 2780 }, { "epoch": 0.17978181818181818, "grad_norm": 0.07779914885759354, "learning_rate": 0.00019875375784369258, "loss": 0.1055, "step": 2781 }, { "epoch": 0.17984646464646464, "grad_norm": 0.07417116314172745, "learning_rate": 0.00019875268132792004, "loss": 0.0738, "step": 2782 }, { "epoch": 0.1799111111111111, "grad_norm": 0.07325158268213272, "learning_rate": 0.00019875160435031385, "loss": 0.08, "step": 2783 }, { "epoch": 0.17997575757575757, "grad_norm": 0.07600921392440796, "learning_rate": 0.00019875052691087908, "loss": 0.0897, "step": 2784 }, { "epoch": 0.17997575757575757, "eval_bleu": 12.612213977287793, "eval_loss": 0.09436735510826111, "eval_runtime": 2.7371, "eval_samples_per_second": 11.691, "eval_steps_per_second": 1.461, "step": 2784 }, { "epoch": 0.18004040404040403, "grad_norm": 0.09855237603187561, "learning_rate": 0.0001987494490096207, "loss": 0.1204, "step": 2785 }, { "epoch": 0.18010505050505052, "grad_norm": 0.06247742101550102, "learning_rate": 0.00019874837064654384, "loss": 0.0803, "step": 2786 }, { "epoch": 0.18016969696969698, "grad_norm": 0.07029539346694946, "learning_rate": 0.00019874729182165347, "loss": 0.0777, "step": 2787 }, { "epoch": 0.18023434343434344, "grad_norm": 0.07934654504060745, "learning_rate": 0.00019874621253495467, "loss": 0.0855, "step": 2788 }, { "epoch": 0.1802989898989899, "grad_norm": 0.0715228021144867, "learning_rate": 0.00019874513278645247, "loss": 0.0802, "step": 2789 }, { "epoch": 0.18036363636363636, "grad_norm": 0.0898372009396553, "learning_rate": 0.00019874405257615192, "loss": 0.1072, "step": 2790 }, { "epoch": 0.18042828282828283, "grad_norm": 0.0665682777762413, "learning_rate": 0.00019874297190405812, "loss": 0.0727, "step": 2791 }, { "epoch": 0.1804929292929293, "grad_norm": 0.08710990846157074, "learning_rate": 0.00019874189077017605, "loss": 0.0818, "step": 2792 }, { 
"epoch": 0.18055757575757575, "grad_norm": 0.0810735747218132, "learning_rate": 0.0001987408091745108, "loss": 0.1106, "step": 2793 }, { "epoch": 0.1806222222222222, "grad_norm": 0.09424632787704468, "learning_rate": 0.0001987397271170674, "loss": 0.1085, "step": 2794 }, { "epoch": 0.18068686868686867, "grad_norm": 0.08035526424646378, "learning_rate": 0.000198738644597851, "loss": 0.1019, "step": 2795 }, { "epoch": 0.18075151515151516, "grad_norm": 0.08920923620462418, "learning_rate": 0.00019873756161686656, "loss": 0.0944, "step": 2796 }, { "epoch": 0.18081616161616162, "grad_norm": 0.07438452541828156, "learning_rate": 0.0001987364781741192, "loss": 0.092, "step": 2797 }, { "epoch": 0.1808808080808081, "grad_norm": 0.07915820181369781, "learning_rate": 0.00019873539426961396, "loss": 0.0944, "step": 2798 }, { "epoch": 0.18094545454545455, "grad_norm": 0.06792636215686798, "learning_rate": 0.00019873430990335596, "loss": 0.0795, "step": 2799 }, { "epoch": 0.181010101010101, "grad_norm": 0.08413238078355789, "learning_rate": 0.00019873322507535019, "loss": 0.1005, "step": 2800 }, { "epoch": 0.181010101010101, "eval_bleu": 11.47564656157881, "eval_loss": 0.09421389549970627, "eval_runtime": 2.7802, "eval_samples_per_second": 11.51, "eval_steps_per_second": 1.439, "step": 2800 }, { "epoch": 0.18107474747474747, "grad_norm": 0.07722476869821548, "learning_rate": 0.00019873213978560182, "loss": 0.0952, "step": 2801 }, { "epoch": 0.18113939393939393, "grad_norm": 0.0825035348534584, "learning_rate": 0.0001987310540341158, "loss": 0.109, "step": 2802 }, { "epoch": 0.1812040404040404, "grad_norm": 0.0754779577255249, "learning_rate": 0.00019872996782089732, "loss": 0.0866, "step": 2803 }, { "epoch": 0.18126868686868686, "grad_norm": 0.08096551150083542, "learning_rate": 0.00019872888114595144, "loss": 0.0844, "step": 2804 }, { "epoch": 0.18133333333333335, "grad_norm": 0.09417939931154251, "learning_rate": 0.00019872779400928318, "loss": 0.1022, "step": 2805 }, { 
"epoch": 0.1813979797979798, "grad_norm": 0.07801815122365952, "learning_rate": 0.0001987267064108977, "loss": 0.0912, "step": 2806 }, { "epoch": 0.18146262626262627, "grad_norm": 0.08048546314239502, "learning_rate": 0.00019872561835080003, "loss": 0.1013, "step": 2807 }, { "epoch": 0.18152727272727273, "grad_norm": 0.08277661353349686, "learning_rate": 0.0001987245298289953, "loss": 0.0949, "step": 2808 }, { "epoch": 0.1815919191919192, "grad_norm": 0.09248380362987518, "learning_rate": 0.00019872344084548857, "loss": 0.091, "step": 2809 }, { "epoch": 0.18165656565656566, "grad_norm": 0.07722607254981995, "learning_rate": 0.00019872235140028495, "loss": 0.0879, "step": 2810 }, { "epoch": 0.18172121212121212, "grad_norm": 0.06787624955177307, "learning_rate": 0.00019872126149338953, "loss": 0.0845, "step": 2811 }, { "epoch": 0.18178585858585858, "grad_norm": 0.08037717640399933, "learning_rate": 0.0001987201711248074, "loss": 0.0801, "step": 2812 }, { "epoch": 0.18185050505050504, "grad_norm": 0.06995908915996552, "learning_rate": 0.00019871908029454367, "loss": 0.0875, "step": 2813 }, { "epoch": 0.1819151515151515, "grad_norm": 0.07385554164648056, "learning_rate": 0.00019871798900260345, "loss": 0.0826, "step": 2814 }, { "epoch": 0.181979797979798, "grad_norm": 0.06955590099096298, "learning_rate": 0.0001987168972489918, "loss": 0.0864, "step": 2815 }, { "epoch": 0.18204444444444445, "grad_norm": 0.07817044854164124, "learning_rate": 0.0001987158050337139, "loss": 0.0906, "step": 2816 }, { "epoch": 0.18204444444444445, "eval_bleu": 14.96304468250526, "eval_loss": 0.0962601900100708, "eval_runtime": 2.6835, "eval_samples_per_second": 11.925, "eval_steps_per_second": 1.491, "step": 2816 }, { "epoch": 0.18210909090909091, "grad_norm": 0.0857403576374054, "learning_rate": 0.00019871471235677476, "loss": 0.0925, "step": 2817 }, { "epoch": 0.18217373737373738, "grad_norm": 0.07382909208536148, "learning_rate": 0.0001987136192181796, "loss": 0.0942, "step": 2818 }, { 
"epoch": 0.18223838383838384, "grad_norm": 0.1192716509103775, "learning_rate": 0.00019871252561793343, "loss": 0.1067, "step": 2819 }, { "epoch": 0.1823030303030303, "grad_norm": 0.08581046015024185, "learning_rate": 0.00019871143155604143, "loss": 0.0954, "step": 2820 }, { "epoch": 0.18236767676767676, "grad_norm": 0.07248219102621078, "learning_rate": 0.0001987103370325087, "loss": 0.0825, "step": 2821 }, { "epoch": 0.18243232323232322, "grad_norm": 0.0770423635840416, "learning_rate": 0.00019870924204734035, "loss": 0.097, "step": 2822 }, { "epoch": 0.18249696969696969, "grad_norm": 0.08159273117780685, "learning_rate": 0.00019870814660054152, "loss": 0.098, "step": 2823 }, { "epoch": 0.18256161616161617, "grad_norm": 0.08246093988418579, "learning_rate": 0.0001987070506921173, "loss": 0.0976, "step": 2824 }, { "epoch": 0.18262626262626264, "grad_norm": 0.08934545516967773, "learning_rate": 0.0001987059543220729, "loss": 0.1179, "step": 2825 }, { "epoch": 0.1826909090909091, "grad_norm": 0.07338359206914902, "learning_rate": 0.0001987048574904133, "loss": 0.087, "step": 2826 }, { "epoch": 0.18275555555555556, "grad_norm": 0.08218204975128174, "learning_rate": 0.00019870376019714376, "loss": 0.12, "step": 2827 }, { "epoch": 0.18282020202020202, "grad_norm": 0.07678299397230148, "learning_rate": 0.00019870266244226934, "loss": 0.0883, "step": 2828 }, { "epoch": 0.18288484848484848, "grad_norm": 0.07436902821063995, "learning_rate": 0.00019870156422579523, "loss": 0.0896, "step": 2829 }, { "epoch": 0.18294949494949495, "grad_norm": 0.06455118954181671, "learning_rate": 0.0001987004655477265, "loss": 0.0749, "step": 2830 }, { "epoch": 0.1830141414141414, "grad_norm": 0.0726434513926506, "learning_rate": 0.00019869936640806835, "loss": 0.0917, "step": 2831 }, { "epoch": 0.18307878787878787, "grad_norm": 0.0812985897064209, "learning_rate": 0.00019869826680682587, "loss": 0.1051, "step": 2832 }, { "epoch": 0.18307878787878787, "eval_bleu": 13.207528272580483, 
"eval_loss": 0.09708814322948456, "eval_runtime": 2.9038, "eval_samples_per_second": 11.02, "eval_steps_per_second": 1.377, "step": 2832 }, { "epoch": 0.18314343434343433, "grad_norm": 0.06964149326086044, "learning_rate": 0.00019869716674400422, "loss": 0.0799, "step": 2833 }, { "epoch": 0.18320808080808082, "grad_norm": 0.0735994428396225, "learning_rate": 0.00019869606621960857, "loss": 0.0909, "step": 2834 }, { "epoch": 0.18327272727272728, "grad_norm": 0.07230139523744583, "learning_rate": 0.00019869496523364404, "loss": 0.0873, "step": 2835 }, { "epoch": 0.18333737373737374, "grad_norm": 0.07027056813240051, "learning_rate": 0.0001986938637861158, "loss": 0.0885, "step": 2836 }, { "epoch": 0.1834020202020202, "grad_norm": 0.07894861698150635, "learning_rate": 0.00019869276187702895, "loss": 0.0887, "step": 2837 }, { "epoch": 0.18346666666666667, "grad_norm": 0.07709158957004547, "learning_rate": 0.0001986916595063887, "loss": 0.0998, "step": 2838 }, { "epoch": 0.18353131313131313, "grad_norm": 0.08857876062393188, "learning_rate": 0.0001986905566742002, "loss": 0.107, "step": 2839 }, { "epoch": 0.1835959595959596, "grad_norm": 0.0786396712064743, "learning_rate": 0.00019868945338046858, "loss": 0.094, "step": 2840 }, { "epoch": 0.18366060606060605, "grad_norm": 0.0806623250246048, "learning_rate": 0.000198688349625199, "loss": 0.1022, "step": 2841 }, { "epoch": 0.1837252525252525, "grad_norm": 0.075035959482193, "learning_rate": 0.00019868724540839664, "loss": 0.0816, "step": 2842 }, { "epoch": 0.183789898989899, "grad_norm": 0.06769943982362747, "learning_rate": 0.00019868614073006668, "loss": 0.0789, "step": 2843 }, { "epoch": 0.18385454545454546, "grad_norm": 0.07184632867574692, "learning_rate": 0.00019868503559021425, "loss": 0.0942, "step": 2844 }, { "epoch": 0.18391919191919193, "grad_norm": 0.0876123458147049, "learning_rate": 0.00019868392998884454, "loss": 0.0945, "step": 2845 }, { "epoch": 0.1839838383838384, "grad_norm": 0.07806850969791412, 
"learning_rate": 0.0001986828239259627, "loss": 0.0931, "step": 2846 }, { "epoch": 0.18404848484848485, "grad_norm": 0.07110225409269333, "learning_rate": 0.00019868171740157394, "loss": 0.0827, "step": 2847 }, { "epoch": 0.1841131313131313, "grad_norm": 0.0839366614818573, "learning_rate": 0.00019868061041568337, "loss": 0.0969, "step": 2848 }, { "epoch": 0.1841131313131313, "eval_bleu": 12.64759814692508, "eval_loss": 0.09852884709835052, "eval_runtime": 2.7421, "eval_samples_per_second": 11.67, "eval_steps_per_second": 1.459, "step": 2848 }, { "epoch": 0.18417777777777777, "grad_norm": 0.08227841556072235, "learning_rate": 0.00019867950296829624, "loss": 0.0961, "step": 2849 }, { "epoch": 0.18424242424242424, "grad_norm": 0.07454248517751694, "learning_rate": 0.0001986783950594177, "loss": 0.0988, "step": 2850 }, { "epoch": 0.1843070707070707, "grad_norm": 0.07863860577344894, "learning_rate": 0.0001986772866890529, "loss": 0.0807, "step": 2851 }, { "epoch": 0.18437171717171716, "grad_norm": 0.0771082192659378, "learning_rate": 0.00019867617785720708, "loss": 0.0974, "step": 2852 }, { "epoch": 0.18443636363636365, "grad_norm": 0.08415652811527252, "learning_rate": 0.0001986750685638854, "loss": 0.0968, "step": 2853 }, { "epoch": 0.1845010101010101, "grad_norm": 0.10221920162439346, "learning_rate": 0.00019867395880909303, "loss": 0.1171, "step": 2854 }, { "epoch": 0.18456565656565657, "grad_norm": 0.08217688649892807, "learning_rate": 0.00019867284859283516, "loss": 0.089, "step": 2855 }, { "epoch": 0.18463030303030303, "grad_norm": 0.08122526854276657, "learning_rate": 0.00019867173791511704, "loss": 0.1027, "step": 2856 }, { "epoch": 0.1846949494949495, "grad_norm": 0.07600749284029007, "learning_rate": 0.0001986706267759438, "loss": 0.0921, "step": 2857 }, { "epoch": 0.18475959595959596, "grad_norm": 0.08221013844013214, "learning_rate": 0.00019866951517532068, "loss": 0.0834, "step": 2858 }, { "epoch": 0.18482424242424242, "grad_norm": 0.07795519381761551, 
"learning_rate": 0.0001986684031132528, "loss": 0.1004, "step": 2859 }, { "epoch": 0.18488888888888888, "grad_norm": 0.11812471598386765, "learning_rate": 0.00019866729058974546, "loss": 0.1266, "step": 2860 }, { "epoch": 0.18495353535353534, "grad_norm": 0.08074431121349335, "learning_rate": 0.00019866617760480381, "loss": 0.0966, "step": 2861 }, { "epoch": 0.18501818181818183, "grad_norm": 0.0667763203382492, "learning_rate": 0.0001986650641584331, "loss": 0.081, "step": 2862 }, { "epoch": 0.1850828282828283, "grad_norm": 0.08093970268964767, "learning_rate": 0.00019866395025063848, "loss": 0.0946, "step": 2863 }, { "epoch": 0.18514747474747476, "grad_norm": 0.0754845142364502, "learning_rate": 0.00019866283588142517, "loss": 0.0903, "step": 2864 }, { "epoch": 0.18514747474747476, "eval_bleu": 14.564012214557106, "eval_loss": 0.09604822099208832, "eval_runtime": 2.7469, "eval_samples_per_second": 11.649, "eval_steps_per_second": 1.456, "step": 2864 }, { "epoch": 0.18521212121212122, "grad_norm": 0.07853715866804123, "learning_rate": 0.00019866172105079837, "loss": 0.0909, "step": 2865 }, { "epoch": 0.18527676767676768, "grad_norm": 0.06989241391420364, "learning_rate": 0.00019866060575876335, "loss": 0.0882, "step": 2866 }, { "epoch": 0.18534141414141414, "grad_norm": 0.07622192054986954, "learning_rate": 0.0001986594900053253, "loss": 0.0936, "step": 2867 }, { "epoch": 0.1854060606060606, "grad_norm": 0.08440738171339035, "learning_rate": 0.0001986583737904894, "loss": 0.1065, "step": 2868 }, { "epoch": 0.18547070707070706, "grad_norm": 0.07761204242706299, "learning_rate": 0.00019865725711426096, "loss": 0.0953, "step": 2869 }, { "epoch": 0.18553535353535353, "grad_norm": 0.09092800319194794, "learning_rate": 0.0001986561399766451, "loss": 0.1025, "step": 2870 }, { "epoch": 0.1856, "grad_norm": 0.08442335575819016, "learning_rate": 0.0001986550223776471, "loss": 0.0889, "step": 2871 }, { "epoch": 0.18566464646464648, "grad_norm": 0.08239573985338211, 
"learning_rate": 0.00019865390431727216, "loss": 0.0896, "step": 2872 }, { "epoch": 0.18572929292929294, "grad_norm": 0.08330517262220383, "learning_rate": 0.00019865278579552555, "loss": 0.0953, "step": 2873 }, { "epoch": 0.1857939393939394, "grad_norm": 0.0879674181342125, "learning_rate": 0.00019865166681241246, "loss": 0.1113, "step": 2874 }, { "epoch": 0.18585858585858586, "grad_norm": 0.07764395326375961, "learning_rate": 0.00019865054736793814, "loss": 0.0936, "step": 2875 }, { "epoch": 0.18592323232323232, "grad_norm": 0.07007759809494019, "learning_rate": 0.0001986494274621078, "loss": 0.076, "step": 2876 }, { "epoch": 0.18598787878787879, "grad_norm": 0.08036404103040695, "learning_rate": 0.00019864830709492672, "loss": 0.0971, "step": 2877 }, { "epoch": 0.18605252525252525, "grad_norm": 0.08120427280664444, "learning_rate": 0.00019864718626640013, "loss": 0.0982, "step": 2878 }, { "epoch": 0.1861171717171717, "grad_norm": 0.0838732123374939, "learning_rate": 0.00019864606497653324, "loss": 0.1185, "step": 2879 }, { "epoch": 0.18618181818181817, "grad_norm": 0.07545242458581924, "learning_rate": 0.00019864494322533135, "loss": 0.0928, "step": 2880 }, { "epoch": 0.18618181818181817, "eval_bleu": 11.666511494804512, "eval_loss": 0.09546297043561935, "eval_runtime": 2.8352, "eval_samples_per_second": 11.287, "eval_steps_per_second": 1.411, "step": 2880 }, { "epoch": 0.18624646464646466, "grad_norm": 0.07434926927089691, "learning_rate": 0.00019864382101279966, "loss": 0.0801, "step": 2881 }, { "epoch": 0.18631111111111112, "grad_norm": 0.06940607726573944, "learning_rate": 0.00019864269833894343, "loss": 0.0854, "step": 2882 }, { "epoch": 0.18637575757575758, "grad_norm": 0.06883041560649872, "learning_rate": 0.0001986415752037679, "loss": 0.0879, "step": 2883 }, { "epoch": 0.18644040404040405, "grad_norm": 0.0735919252038002, "learning_rate": 0.0001986404516072783, "loss": 0.0837, "step": 2884 }, { "epoch": 0.1865050505050505, "grad_norm": 
0.07767898589372635, "learning_rate": 0.00019863932754947996, "loss": 0.1079, "step": 2885 }, { "epoch": 0.18656969696969697, "grad_norm": 0.09822847694158554, "learning_rate": 0.0001986382030303781, "loss": 0.1166, "step": 2886 }, { "epoch": 0.18663434343434343, "grad_norm": 0.09037579596042633, "learning_rate": 0.00019863707804997796, "loss": 0.101, "step": 2887 }, { "epoch": 0.1866989898989899, "grad_norm": 0.07880650460720062, "learning_rate": 0.00019863595260828483, "loss": 0.091, "step": 2888 }, { "epoch": 0.18676363636363635, "grad_norm": 0.07453230768442154, "learning_rate": 0.00019863482670530393, "loss": 0.0833, "step": 2889 }, { "epoch": 0.18682828282828282, "grad_norm": 0.07326208800077438, "learning_rate": 0.00019863370034104057, "loss": 0.0791, "step": 2890 }, { "epoch": 0.1868929292929293, "grad_norm": 0.06749226152896881, "learning_rate": 0.0001986325735155, "loss": 0.0915, "step": 2891 }, { "epoch": 0.18695757575757577, "grad_norm": 0.08843649178743362, "learning_rate": 0.00019863144622868747, "loss": 0.0949, "step": 2892 }, { "epoch": 0.18702222222222223, "grad_norm": 0.06979414820671082, "learning_rate": 0.0001986303184806083, "loss": 0.0881, "step": 2893 }, { "epoch": 0.1870868686868687, "grad_norm": 0.082852803170681, "learning_rate": 0.0001986291902712677, "loss": 0.0875, "step": 2894 }, { "epoch": 0.18715151515151515, "grad_norm": 0.08005732297897339, "learning_rate": 0.00019862806160067105, "loss": 0.0937, "step": 2895 }, { "epoch": 0.1872161616161616, "grad_norm": 0.09265148639678955, "learning_rate": 0.00019862693246882352, "loss": 0.1149, "step": 2896 }, { "epoch": 0.1872161616161616, "eval_bleu": 14.385859501465685, "eval_loss": 0.09779681265354156, "eval_runtime": 2.854, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.402, "step": 2896 }, { "epoch": 0.18728080808080808, "grad_norm": 0.07319992035627365, "learning_rate": 0.00019862580287573046, "loss": 0.0792, "step": 2897 }, { "epoch": 0.18734545454545454, "grad_norm": 
0.08132781088352203, "learning_rate": 0.0001986246728213971, "loss": 0.0905, "step": 2898 }, { "epoch": 0.187410101010101, "grad_norm": 0.07200151681900024, "learning_rate": 0.00019862354230582873, "loss": 0.0813, "step": 2899 }, { "epoch": 0.1874747474747475, "grad_norm": 0.09434063732624054, "learning_rate": 0.00019862241132903067, "loss": 0.0936, "step": 2900 }, { "epoch": 0.18753939393939395, "grad_norm": 0.07864663749933243, "learning_rate": 0.00019862127989100822, "loss": 0.0916, "step": 2901 }, { "epoch": 0.1876040404040404, "grad_norm": 0.07996901869773865, "learning_rate": 0.00019862014799176662, "loss": 0.0978, "step": 2902 }, { "epoch": 0.18766868686868687, "grad_norm": 0.07707536965608597, "learning_rate": 0.0001986190156313112, "loss": 0.0928, "step": 2903 }, { "epoch": 0.18773333333333334, "grad_norm": 0.0690661370754242, "learning_rate": 0.00019861788280964727, "loss": 0.0931, "step": 2904 }, { "epoch": 0.1877979797979798, "grad_norm": 0.08122185617685318, "learning_rate": 0.00019861674952678006, "loss": 0.098, "step": 2905 }, { "epoch": 0.18786262626262626, "grad_norm": 0.08451220393180847, "learning_rate": 0.00019861561578271493, "loss": 0.1063, "step": 2906 }, { "epoch": 0.18792727272727272, "grad_norm": 0.06468889862298965, "learning_rate": 0.0001986144815774572, "loss": 0.0817, "step": 2907 }, { "epoch": 0.18799191919191918, "grad_norm": 0.06681172549724579, "learning_rate": 0.00019861334691101211, "loss": 0.0885, "step": 2908 }, { "epoch": 0.18805656565656564, "grad_norm": 0.07753611356019974, "learning_rate": 0.000198612211783385, "loss": 0.0997, "step": 2909 }, { "epoch": 0.18812121212121213, "grad_norm": 0.07176484167575836, "learning_rate": 0.0001986110761945812, "loss": 0.083, "step": 2910 }, { "epoch": 0.1881858585858586, "grad_norm": 0.07666324079036713, "learning_rate": 0.00019860994014460597, "loss": 0.0967, "step": 2911 }, { "epoch": 0.18825050505050506, "grad_norm": 0.0668957382440567, "learning_rate": 0.00019860880363346464, "loss": 
0.0828, "step": 2912 }, { "epoch": 0.18825050505050506, "eval_bleu": 14.763794760543348, "eval_loss": 0.09515655785799026, "eval_runtime": 2.685, "eval_samples_per_second": 11.918, "eval_steps_per_second": 1.49, "step": 2912 }, { "epoch": 0.18831515151515152, "grad_norm": 0.07317197322845459, "learning_rate": 0.00019860766666116258, "loss": 0.0815, "step": 2913 }, { "epoch": 0.18837979797979798, "grad_norm": 0.0779421254992485, "learning_rate": 0.00019860652922770502, "loss": 0.0945, "step": 2914 }, { "epoch": 0.18844444444444444, "grad_norm": 0.0642285868525505, "learning_rate": 0.00019860539133309733, "loss": 0.0704, "step": 2915 }, { "epoch": 0.1885090909090909, "grad_norm": 0.07154625654220581, "learning_rate": 0.0001986042529773448, "loss": 0.0781, "step": 2916 }, { "epoch": 0.18857373737373737, "grad_norm": 0.07037678360939026, "learning_rate": 0.00019860311416045284, "loss": 0.0779, "step": 2917 }, { "epoch": 0.18863838383838383, "grad_norm": 0.08008299767971039, "learning_rate": 0.00019860197488242668, "loss": 0.0966, "step": 2918 }, { "epoch": 0.18870303030303032, "grad_norm": 0.08718068152666092, "learning_rate": 0.00019860083514327168, "loss": 0.0927, "step": 2919 }, { "epoch": 0.18876767676767678, "grad_norm": 0.07753336429595947, "learning_rate": 0.00019859969494299317, "loss": 0.0844, "step": 2920 }, { "epoch": 0.18883232323232324, "grad_norm": 0.08531400561332703, "learning_rate": 0.00019859855428159645, "loss": 0.1067, "step": 2921 }, { "epoch": 0.1888969696969697, "grad_norm": 0.07498552650213242, "learning_rate": 0.00019859741315908695, "loss": 0.0845, "step": 2922 }, { "epoch": 0.18896161616161616, "grad_norm": 0.07353661209344864, "learning_rate": 0.0001985962715754699, "loss": 0.0937, "step": 2923 }, { "epoch": 0.18902626262626263, "grad_norm": 0.07552067935466766, "learning_rate": 0.00019859512953075071, "loss": 0.0928, "step": 2924 }, { "epoch": 0.1890909090909091, "grad_norm": 0.07916353642940521, "learning_rate": 0.00019859398702493466, 
"loss": 0.1088, "step": 2925 }, { "epoch": 0.18915555555555555, "grad_norm": 0.07933656871318817, "learning_rate": 0.00019859284405802715, "loss": 0.1058, "step": 2926 }, { "epoch": 0.189220202020202, "grad_norm": 0.0765988752245903, "learning_rate": 0.0001985917006300335, "loss": 0.0918, "step": 2927 }, { "epoch": 0.18928484848484847, "grad_norm": 0.07666276395320892, "learning_rate": 0.0001985905567409591, "loss": 0.0964, "step": 2928 }, { "epoch": 0.18928484848484847, "eval_bleu": 12.737464917734656, "eval_loss": 0.09495145827531815, "eval_runtime": 2.894, "eval_samples_per_second": 11.057, "eval_steps_per_second": 1.382, "step": 2928 }, { "epoch": 0.18934949494949496, "grad_norm": 0.08585377037525177, "learning_rate": 0.00019858941239080918, "loss": 0.1116, "step": 2929 }, { "epoch": 0.18941414141414142, "grad_norm": 0.08172925561666489, "learning_rate": 0.00019858826757958924, "loss": 0.1024, "step": 2930 }, { "epoch": 0.18947878787878789, "grad_norm": 0.07743149250745773, "learning_rate": 0.00019858712230730452, "loss": 0.1038, "step": 2931 }, { "epoch": 0.18954343434343435, "grad_norm": 0.07631954550743103, "learning_rate": 0.00019858597657396043, "loss": 0.0993, "step": 2932 }, { "epoch": 0.1896080808080808, "grad_norm": 0.07849955558776855, "learning_rate": 0.00019858483037956233, "loss": 0.0976, "step": 2933 }, { "epoch": 0.18967272727272727, "grad_norm": 0.0782211497426033, "learning_rate": 0.00019858368372411558, "loss": 0.0894, "step": 2934 }, { "epoch": 0.18973737373737373, "grad_norm": 0.06850609928369522, "learning_rate": 0.0001985825366076255, "loss": 0.0851, "step": 2935 }, { "epoch": 0.1898020202020202, "grad_norm": 0.07977654784917831, "learning_rate": 0.0001985813890300975, "loss": 0.0882, "step": 2936 }, { "epoch": 0.18986666666666666, "grad_norm": 0.07474221289157867, "learning_rate": 0.000198580240991537, "loss": 0.0843, "step": 2937 }, { "epoch": 0.18993131313131312, "grad_norm": 0.09841342270374298, "learning_rate": 0.00019857909249194918, 
"loss": 0.1046, "step": 2938 }, { "epoch": 0.1899959595959596, "grad_norm": 0.08717047423124313, "learning_rate": 0.00019857794353133964, "loss": 0.1034, "step": 2939 }, { "epoch": 0.19006060606060607, "grad_norm": 0.06535878032445908, "learning_rate": 0.0001985767941097136, "loss": 0.0828, "step": 2940 }, { "epoch": 0.19012525252525253, "grad_norm": 0.08358795940876007, "learning_rate": 0.00019857564422707649, "loss": 0.1115, "step": 2941 }, { "epoch": 0.190189898989899, "grad_norm": 0.06361483037471771, "learning_rate": 0.00019857449388343366, "loss": 0.0767, "step": 2942 }, { "epoch": 0.19025454545454545, "grad_norm": 0.08360166102647781, "learning_rate": 0.0001985733430787905, "loss": 0.0961, "step": 2943 }, { "epoch": 0.19031919191919192, "grad_norm": 0.07721679657697678, "learning_rate": 0.00019857219181315246, "loss": 0.099, "step": 2944 }, { "epoch": 0.19031919191919192, "eval_bleu": 13.57269188628614, "eval_loss": 0.09461914002895355, "eval_runtime": 2.7117, "eval_samples_per_second": 11.801, "eval_steps_per_second": 1.475, "step": 2944 }, { "epoch": 0.19038383838383838, "grad_norm": 0.07514526695013046, "learning_rate": 0.00019857104008652482, "loss": 0.1006, "step": 2945 }, { "epoch": 0.19044848484848484, "grad_norm": 0.08099225908517838, "learning_rate": 0.00019856988789891306, "loss": 0.1009, "step": 2946 }, { "epoch": 0.1905131313131313, "grad_norm": 0.067985400557518, "learning_rate": 0.0001985687352503225, "loss": 0.0934, "step": 2947 }, { "epoch": 0.1905777777777778, "grad_norm": 0.0791618674993515, "learning_rate": 0.00019856758214075853, "loss": 0.0964, "step": 2948 }, { "epoch": 0.19064242424242425, "grad_norm": 0.08023680746555328, "learning_rate": 0.00019856642857022657, "loss": 0.0998, "step": 2949 }, { "epoch": 0.1907070707070707, "grad_norm": 0.06242063641548157, "learning_rate": 0.00019856527453873202, "loss": 0.0643, "step": 2950 }, { "epoch": 0.19077171717171718, "grad_norm": 0.08916376531124115, "learning_rate": 0.00019856412004628027, 
"loss": 0.1108, "step": 2951 }, { "epoch": 0.19083636363636364, "grad_norm": 0.15652132034301758, "learning_rate": 0.00019856296509287668, "loss": 0.1063, "step": 2952 }, { "epoch": 0.1909010101010101, "grad_norm": 0.07384748011827469, "learning_rate": 0.0001985618096785267, "loss": 0.086, "step": 2953 }, { "epoch": 0.19096565656565656, "grad_norm": 0.0817989632487297, "learning_rate": 0.00019856065380323578, "loss": 0.0847, "step": 2954 }, { "epoch": 0.19103030303030302, "grad_norm": 0.08566930145025253, "learning_rate": 0.0001985594974670092, "loss": 0.1169, "step": 2955 }, { "epoch": 0.19109494949494948, "grad_norm": 0.07700645178556442, "learning_rate": 0.00019855834066985243, "loss": 0.0943, "step": 2956 }, { "epoch": 0.19115959595959595, "grad_norm": 0.08066175132989883, "learning_rate": 0.0001985571834117709, "loss": 0.0991, "step": 2957 }, { "epoch": 0.19122424242424244, "grad_norm": 0.08218420296907425, "learning_rate": 0.00019855602569277003, "loss": 0.0992, "step": 2958 }, { "epoch": 0.1912888888888889, "grad_norm": 0.06755941361188889, "learning_rate": 0.00019855486751285518, "loss": 0.079, "step": 2959 }, { "epoch": 0.19135353535353536, "grad_norm": 0.07037235796451569, "learning_rate": 0.00019855370887203181, "loss": 0.0829, "step": 2960 }, { "epoch": 0.19135353535353536, "eval_bleu": 15.749835137967121, "eval_loss": 0.09402447938919067, "eval_runtime": 2.7406, "eval_samples_per_second": 11.676, "eval_steps_per_second": 1.46, "step": 2960 }, { "epoch": 0.19141818181818182, "grad_norm": 0.06689716130495071, "learning_rate": 0.00019855254977030532, "loss": 0.0852, "step": 2961 }, { "epoch": 0.19148282828282828, "grad_norm": 0.07843676954507828, "learning_rate": 0.00019855139020768115, "loss": 0.1105, "step": 2962 }, { "epoch": 0.19154747474747474, "grad_norm": 0.07698420435190201, "learning_rate": 0.00019855023018416467, "loss": 0.0908, "step": 2963 }, { "epoch": 0.1916121212121212, "grad_norm": 0.07270284742116928, "learning_rate": 
0.00019854906969976138, "loss": 0.095, "step": 2964 }, { "epoch": 0.19167676767676767, "grad_norm": 0.0765790343284607, "learning_rate": 0.00019854790875447668, "loss": 0.0909, "step": 2965 }, { "epoch": 0.19174141414141413, "grad_norm": 0.08057883381843567, "learning_rate": 0.00019854674734831596, "loss": 0.0867, "step": 2966 }, { "epoch": 0.19180606060606062, "grad_norm": 0.0739097148180008, "learning_rate": 0.0001985455854812847, "loss": 0.0948, "step": 2967 }, { "epoch": 0.19187070707070708, "grad_norm": 0.07631795853376389, "learning_rate": 0.0001985444231533883, "loss": 0.1026, "step": 2968 }, { "epoch": 0.19193535353535354, "grad_norm": 0.07131054252386093, "learning_rate": 0.00019854326036463222, "loss": 0.0856, "step": 2969 }, { "epoch": 0.192, "grad_norm": 0.09894807636737823, "learning_rate": 0.0001985420971150219, "loss": 0.1251, "step": 2970 }, { "epoch": 0.19206464646464647, "grad_norm": 0.07754914462566376, "learning_rate": 0.00019854093340456274, "loss": 0.0812, "step": 2971 }, { "epoch": 0.19212929292929293, "grad_norm": 0.08262521028518677, "learning_rate": 0.00019853976923326022, "loss": 0.1088, "step": 2972 }, { "epoch": 0.1921939393939394, "grad_norm": 0.06875734031200409, "learning_rate": 0.00019853860460111977, "loss": 0.085, "step": 2973 }, { "epoch": 0.19225858585858585, "grad_norm": 0.06864924728870392, "learning_rate": 0.00019853743950814688, "loss": 0.0819, "step": 2974 }, { "epoch": 0.1923232323232323, "grad_norm": 0.07909615337848663, "learning_rate": 0.00019853627395434692, "loss": 0.0901, "step": 2975 }, { "epoch": 0.19238787878787877, "grad_norm": 0.0733780786395073, "learning_rate": 0.00019853510793972542, "loss": 0.0854, "step": 2976 }, { "epoch": 0.19238787878787877, "eval_bleu": 12.267704260117915, "eval_loss": 0.0959477573633194, "eval_runtime": 2.8234, "eval_samples_per_second": 11.334, "eval_steps_per_second": 1.417, "step": 2976 }, { "epoch": 0.19245252525252526, "grad_norm": 0.062204692512750626, "learning_rate": 
0.00019853394146428777, "loss": 0.0668, "step": 2977 }, { "epoch": 0.19251717171717173, "grad_norm": 0.08668186515569687, "learning_rate": 0.00019853277452803943, "loss": 0.1122, "step": 2978 }, { "epoch": 0.1925818181818182, "grad_norm": 0.07729778438806534, "learning_rate": 0.00019853160713098592, "loss": 0.0972, "step": 2979 }, { "epoch": 0.19264646464646465, "grad_norm": 0.07843222469091415, "learning_rate": 0.00019853043927313264, "loss": 0.09, "step": 2980 }, { "epoch": 0.1927111111111111, "grad_norm": 0.07792270183563232, "learning_rate": 0.00019852927095448508, "loss": 0.097, "step": 2981 }, { "epoch": 0.19277575757575757, "grad_norm": 0.07337860018014908, "learning_rate": 0.00019852810217504868, "loss": 0.0861, "step": 2982 }, { "epoch": 0.19284040404040403, "grad_norm": 0.08088894933462143, "learning_rate": 0.00019852693293482892, "loss": 0.0987, "step": 2983 }, { "epoch": 0.1929050505050505, "grad_norm": 0.07351638376712799, "learning_rate": 0.00019852576323383127, "loss": 0.1036, "step": 2984 }, { "epoch": 0.19296969696969696, "grad_norm": 0.07280708849430084, "learning_rate": 0.00019852459307206116, "loss": 0.0872, "step": 2985 }, { "epoch": 0.19303434343434345, "grad_norm": 0.0681147426366806, "learning_rate": 0.00019852342244952418, "loss": 0.0948, "step": 2986 }, { "epoch": 0.1930989898989899, "grad_norm": 0.08333176374435425, "learning_rate": 0.00019852225136622565, "loss": 0.1075, "step": 2987 }, { "epoch": 0.19316363636363637, "grad_norm": 0.08377640694379807, "learning_rate": 0.00019852107982217118, "loss": 0.0972, "step": 2988 }, { "epoch": 0.19322828282828283, "grad_norm": 0.08644973486661911, "learning_rate": 0.00019851990781736615, "loss": 0.1019, "step": 2989 }, { "epoch": 0.1932929292929293, "grad_norm": 0.06872477382421494, "learning_rate": 0.0001985187353518161, "loss": 0.0796, "step": 2990 }, { "epoch": 0.19335757575757576, "grad_norm": 0.06575116515159607, "learning_rate": 0.00019851756242552648, "loss": 0.0908, "step": 2991 }, { 
"epoch": 0.19342222222222222, "grad_norm": 0.0682578757405281, "learning_rate": 0.00019851638903850278, "loss": 0.0821, "step": 2992 }, { "epoch": 0.19342222222222222, "eval_bleu": 15.121167264177965, "eval_loss": 0.09445605427026749, "eval_runtime": 2.7557, "eval_samples_per_second": 11.612, "eval_steps_per_second": 1.452, "step": 2992 }, { "epoch": 0.19348686868686868, "grad_norm": 0.0742427334189415, "learning_rate": 0.00019851521519075052, "loss": 0.0835, "step": 2993 }, { "epoch": 0.19355151515151514, "grad_norm": 0.08182668685913086, "learning_rate": 0.00019851404088227516, "loss": 0.0999, "step": 2994 }, { "epoch": 0.1936161616161616, "grad_norm": 0.08826828002929688, "learning_rate": 0.0001985128661130822, "loss": 0.1019, "step": 2995 }, { "epoch": 0.1936808080808081, "grad_norm": 0.08324848115444183, "learning_rate": 0.00019851169088317715, "loss": 0.0945, "step": 2996 }, { "epoch": 0.19374545454545455, "grad_norm": 0.07801444083452225, "learning_rate": 0.00019851051519256544, "loss": 0.1026, "step": 2997 }, { "epoch": 0.19381010101010102, "grad_norm": 0.09497161954641342, "learning_rate": 0.00019850933904125265, "loss": 0.1218, "step": 2998 }, { "epoch": 0.19387474747474748, "grad_norm": 0.07506946474313736, "learning_rate": 0.00019850816242924425, "loss": 0.095, "step": 2999 }, { "epoch": 0.19393939393939394, "grad_norm": 0.08126571774482727, "learning_rate": 0.00019850698535654575, "loss": 0.0998, "step": 3000 }, { "epoch": 0.1940040404040404, "grad_norm": 0.07637549191713333, "learning_rate": 0.0001985058078231626, "loss": 0.0836, "step": 3001 }, { "epoch": 0.19406868686868686, "grad_norm": 0.10559996962547302, "learning_rate": 0.0001985046298291004, "loss": 0.1286, "step": 3002 }, { "epoch": 0.19413333333333332, "grad_norm": 0.07943985611200333, "learning_rate": 0.0001985034513743646, "loss": 0.0862, "step": 3003 }, { "epoch": 0.1941979797979798, "grad_norm": 0.08651348203420639, "learning_rate": 0.00019850227245896073, "loss": 0.091, "step": 3004 }, 
{ "epoch": 0.19426262626262628, "grad_norm": 0.07314924895763397, "learning_rate": 0.00019850109308289427, "loss": 0.0937, "step": 3005 }, { "epoch": 0.19432727272727274, "grad_norm": 0.07651598751544952, "learning_rate": 0.00019849991324617078, "loss": 0.0804, "step": 3006 }, { "epoch": 0.1943919191919192, "grad_norm": 0.07446742057800293, "learning_rate": 0.00019849873294879578, "loss": 0.0865, "step": 3007 }, { "epoch": 0.19445656565656566, "grad_norm": 0.07507487386465073, "learning_rate": 0.00019849755219077472, "loss": 0.0811, "step": 3008 }, { "epoch": 0.19445656565656566, "eval_bleu": 14.356516575103246, "eval_loss": 0.09638126194477081, "eval_runtime": 2.7859, "eval_samples_per_second": 11.486, "eval_steps_per_second": 1.436, "step": 3008 }, { "epoch": 0.19452121212121212, "grad_norm": 0.07590612769126892, "learning_rate": 0.0001984963709721132, "loss": 0.0894, "step": 3009 }, { "epoch": 0.19458585858585858, "grad_norm": 0.07424801588058472, "learning_rate": 0.00019849518929281672, "loss": 0.0886, "step": 3010 }, { "epoch": 0.19465050505050505, "grad_norm": 0.0704929530620575, "learning_rate": 0.00019849400715289076, "loss": 0.0905, "step": 3011 }, { "epoch": 0.1947151515151515, "grad_norm": 0.07022596895694733, "learning_rate": 0.00019849282455234094, "loss": 0.0864, "step": 3012 }, { "epoch": 0.19477979797979797, "grad_norm": 0.07476391643285751, "learning_rate": 0.00019849164149117273, "loss": 0.086, "step": 3013 }, { "epoch": 0.19484444444444443, "grad_norm": 0.08668769150972366, "learning_rate": 0.00019849045796939166, "loss": 0.1178, "step": 3014 }, { "epoch": 0.19490909090909092, "grad_norm": 0.0779046043753624, "learning_rate": 0.00019848927398700327, "loss": 0.1039, "step": 3015 }, { "epoch": 0.19497373737373738, "grad_norm": 0.07888432592153549, "learning_rate": 0.00019848808954401316, "loss": 0.1003, "step": 3016 }, { "epoch": 0.19503838383838384, "grad_norm": 0.07456845045089722, "learning_rate": 0.00019848690464042675, "loss": 0.088, "step": 
3017 }, { "epoch": 0.1951030303030303, "grad_norm": 0.07968100160360336, "learning_rate": 0.00019848571927624965, "loss": 0.1003, "step": 3018 }, { "epoch": 0.19516767676767677, "grad_norm": 0.08136183768510818, "learning_rate": 0.00019848453345148746, "loss": 0.1011, "step": 3019 }, { "epoch": 0.19523232323232323, "grad_norm": 0.07760115712881088, "learning_rate": 0.0001984833471661456, "loss": 0.0919, "step": 3020 }, { "epoch": 0.1952969696969697, "grad_norm": 0.0735422819852829, "learning_rate": 0.00019848216042022971, "loss": 0.0908, "step": 3021 }, { "epoch": 0.19536161616161615, "grad_norm": 0.07440479099750519, "learning_rate": 0.00019848097321374533, "loss": 0.0911, "step": 3022 }, { "epoch": 0.19542626262626261, "grad_norm": 0.07027467340230942, "learning_rate": 0.00019847978554669797, "loss": 0.089, "step": 3023 }, { "epoch": 0.1954909090909091, "grad_norm": 0.07545223832130432, "learning_rate": 0.0001984785974190932, "loss": 0.0897, "step": 3024 }, { "epoch": 0.1954909090909091, "eval_bleu": 12.349293801942022, "eval_loss": 0.09381656348705292, "eval_runtime": 2.7765, "eval_samples_per_second": 11.525, "eval_steps_per_second": 1.441, "step": 3024 }, { "epoch": 0.19555555555555557, "grad_norm": 0.07429604232311249, "learning_rate": 0.00019847740883093662, "loss": 0.0726, "step": 3025 }, { "epoch": 0.19562020202020203, "grad_norm": 0.06766149401664734, "learning_rate": 0.00019847621978223373, "loss": 0.0879, "step": 3026 }, { "epoch": 0.1956848484848485, "grad_norm": 0.07333466410636902, "learning_rate": 0.00019847503027299013, "loss": 0.0891, "step": 3027 }, { "epoch": 0.19574949494949495, "grad_norm": 0.0650382786989212, "learning_rate": 0.00019847384030321135, "loss": 0.0757, "step": 3028 }, { "epoch": 0.1958141414141414, "grad_norm": 0.13894008100032806, "learning_rate": 0.000198472649872903, "loss": 0.1036, "step": 3029 }, { "epoch": 0.19587878787878787, "grad_norm": 0.06723253428936005, "learning_rate": 0.0001984714589820706, "loss": 0.0853, "step": 
3030 }, { "epoch": 0.19594343434343434, "grad_norm": 0.09912490099668503, "learning_rate": 0.00019847026763071974, "loss": 0.0996, "step": 3031 }, { "epoch": 0.1960080808080808, "grad_norm": 0.06248555704951286, "learning_rate": 0.00019846907581885602, "loss": 0.0675, "step": 3032 }, { "epoch": 0.19607272727272726, "grad_norm": 0.08128565549850464, "learning_rate": 0.00019846788354648497, "loss": 0.095, "step": 3033 }, { "epoch": 0.19613737373737375, "grad_norm": 0.07953748852014542, "learning_rate": 0.00019846669081361219, "loss": 0.089, "step": 3034 }, { "epoch": 0.1962020202020202, "grad_norm": 0.07747144997119904, "learning_rate": 0.00019846549762024323, "loss": 0.0957, "step": 3035 }, { "epoch": 0.19626666666666667, "grad_norm": 0.07384207844734192, "learning_rate": 0.0001984643039663837, "loss": 0.0933, "step": 3036 }, { "epoch": 0.19633131313131313, "grad_norm": 0.08308795094490051, "learning_rate": 0.00019846310985203915, "loss": 0.1029, "step": 3037 }, { "epoch": 0.1963959595959596, "grad_norm": 0.0817706510424614, "learning_rate": 0.0001984619152772152, "loss": 0.0787, "step": 3038 }, { "epoch": 0.19646060606060606, "grad_norm": 0.0723562240600586, "learning_rate": 0.00019846072024191745, "loss": 0.0877, "step": 3039 }, { "epoch": 0.19652525252525252, "grad_norm": 0.08890534937381744, "learning_rate": 0.0001984595247461514, "loss": 0.1082, "step": 3040 }, { "epoch": 0.19652525252525252, "eval_bleu": 14.590501184362248, "eval_loss": 0.09402894973754883, "eval_runtime": 2.7031, "eval_samples_per_second": 11.838, "eval_steps_per_second": 1.48, "step": 3040 }, { "epoch": 0.19658989898989898, "grad_norm": 0.08016083389520645, "learning_rate": 0.00019845832878992277, "loss": 0.0986, "step": 3041 }, { "epoch": 0.19665454545454544, "grad_norm": 0.07454028725624084, "learning_rate": 0.00019845713237323707, "loss": 0.0917, "step": 3042 }, { "epoch": 0.19671919191919193, "grad_norm": 0.06026134639978409, "learning_rate": 0.00019845593549609988, "loss": 0.0707, 
"step": 3043 }, { "epoch": 0.1967838383838384, "grad_norm": 0.06678041815757751, "learning_rate": 0.00019845473815851685, "loss": 0.0847, "step": 3044 }, { "epoch": 0.19684848484848486, "grad_norm": 0.0710722804069519, "learning_rate": 0.00019845354036049354, "loss": 0.0905, "step": 3045 }, { "epoch": 0.19691313131313132, "grad_norm": 0.0787622481584549, "learning_rate": 0.00019845234210203561, "loss": 0.0996, "step": 3046 }, { "epoch": 0.19697777777777778, "grad_norm": 0.07835431396961212, "learning_rate": 0.00019845114338314858, "loss": 0.1082, "step": 3047 }, { "epoch": 0.19704242424242424, "grad_norm": 0.07426532357931137, "learning_rate": 0.00019844994420383815, "loss": 0.0908, "step": 3048 }, { "epoch": 0.1971070707070707, "grad_norm": 0.06754591315984726, "learning_rate": 0.00019844874456410985, "loss": 0.0977, "step": 3049 }, { "epoch": 0.19717171717171716, "grad_norm": 0.07153677195310593, "learning_rate": 0.0001984475444639693, "loss": 0.0852, "step": 3050 }, { "epoch": 0.19723636363636363, "grad_norm": 0.0679018422961235, "learning_rate": 0.00019844634390342214, "loss": 0.0785, "step": 3051 }, { "epoch": 0.1973010101010101, "grad_norm": 0.07492662966251373, "learning_rate": 0.00019844514288247397, "loss": 0.0839, "step": 3052 }, { "epoch": 0.19736565656565658, "grad_norm": 0.1103171706199646, "learning_rate": 0.00019844394140113044, "loss": 0.1144, "step": 3053 }, { "epoch": 0.19743030303030304, "grad_norm": 0.0835791751742363, "learning_rate": 0.0001984427394593971, "loss": 0.113, "step": 3054 }, { "epoch": 0.1974949494949495, "grad_norm": 0.08254282921552658, "learning_rate": 0.00019844153705727966, "loss": 0.0984, "step": 3055 }, { "epoch": 0.19755959595959596, "grad_norm": 0.07681182771921158, "learning_rate": 0.00019844033419478367, "loss": 0.1031, "step": 3056 }, { "epoch": 0.19755959595959596, "eval_bleu": 16.18089585965196, "eval_loss": 0.09275349229574203, "eval_runtime": 2.802, "eval_samples_per_second": 11.42, "eval_steps_per_second": 1.428, 
"step": 3056 }, { "epoch": 0.19762424242424242, "grad_norm": 0.08036350458860397, "learning_rate": 0.0001984391308719148, "loss": 0.0937, "step": 3057 }, { "epoch": 0.1976888888888889, "grad_norm": 0.0633590966463089, "learning_rate": 0.00019843792708867862, "loss": 0.0749, "step": 3058 }, { "epoch": 0.19775353535353535, "grad_norm": 0.06803199648857117, "learning_rate": 0.00019843672284508082, "loss": 0.0846, "step": 3059 }, { "epoch": 0.1978181818181818, "grad_norm": 0.07061391323804855, "learning_rate": 0.00019843551814112702, "loss": 0.0918, "step": 3060 }, { "epoch": 0.19788282828282827, "grad_norm": 0.07257424294948578, "learning_rate": 0.00019843431297682283, "loss": 0.0906, "step": 3061 }, { "epoch": 0.19794747474747476, "grad_norm": 0.07157407701015472, "learning_rate": 0.00019843310735217392, "loss": 0.0751, "step": 3062 }, { "epoch": 0.19801212121212122, "grad_norm": 0.06550973653793335, "learning_rate": 0.00019843190126718588, "loss": 0.0792, "step": 3063 }, { "epoch": 0.19807676767676768, "grad_norm": 0.07210759818553925, "learning_rate": 0.00019843069472186438, "loss": 0.0919, "step": 3064 }, { "epoch": 0.19814141414141415, "grad_norm": 0.06413321942090988, "learning_rate": 0.0001984294877162151, "loss": 0.0729, "step": 3065 }, { "epoch": 0.1982060606060606, "grad_norm": 0.06826533377170563, "learning_rate": 0.00019842828025024362, "loss": 0.0857, "step": 3066 }, { "epoch": 0.19827070707070707, "grad_norm": 0.08026416599750519, "learning_rate": 0.0001984270723239556, "loss": 0.0908, "step": 3067 }, { "epoch": 0.19833535353535353, "grad_norm": 0.07231787592172623, "learning_rate": 0.0001984258639373567, "loss": 0.086, "step": 3068 }, { "epoch": 0.1984, "grad_norm": 0.10786189883947372, "learning_rate": 0.00019842465509045258, "loss": 0.1132, "step": 3069 }, { "epoch": 0.19846464646464645, "grad_norm": 0.08069875091314316, "learning_rate": 0.0001984234457832489, "loss": 0.0987, "step": 3070 }, { "epoch": 0.19852929292929292, "grad_norm": 
0.06611949950456619, "learning_rate": 0.00019842223601575126, "loss": 0.0776, "step": 3071 }, { "epoch": 0.1985939393939394, "grad_norm": 0.0697125494480133, "learning_rate": 0.0001984210257879654, "loss": 0.0896, "step": 3072 }, { "epoch": 0.1985939393939394, "eval_bleu": 15.348511609101799, "eval_loss": 0.09288185089826584, "eval_runtime": 2.6637, "eval_samples_per_second": 12.014, "eval_steps_per_second": 1.502, "step": 3072 }, { "epoch": 0.19865858585858587, "grad_norm": 0.07403448224067688, "learning_rate": 0.00019841981509989695, "loss": 0.093, "step": 3073 }, { "epoch": 0.19872323232323233, "grad_norm": 0.0917670950293541, "learning_rate": 0.00019841860395155157, "loss": 0.1015, "step": 3074 }, { "epoch": 0.1987878787878788, "grad_norm": 0.08433215320110321, "learning_rate": 0.00019841739234293488, "loss": 0.0926, "step": 3075 }, { "epoch": 0.19885252525252525, "grad_norm": 0.07214537262916565, "learning_rate": 0.0001984161802740526, "loss": 0.0947, "step": 3076 }, { "epoch": 0.19891717171717171, "grad_norm": 0.07228455692529678, "learning_rate": 0.00019841496774491036, "loss": 0.0894, "step": 3077 }, { "epoch": 0.19898181818181818, "grad_norm": 0.08608461916446686, "learning_rate": 0.00019841375475551388, "loss": 0.1185, "step": 3078 }, { "epoch": 0.19904646464646464, "grad_norm": 0.07759758085012436, "learning_rate": 0.0001984125413058688, "loss": 0.0949, "step": 3079 }, { "epoch": 0.1991111111111111, "grad_norm": 0.08019310981035233, "learning_rate": 0.00019841132739598075, "loss": 0.0975, "step": 3080 }, { "epoch": 0.1991757575757576, "grad_norm": 0.07815665751695633, "learning_rate": 0.0001984101130258555, "loss": 0.0919, "step": 3081 }, { "epoch": 0.19924040404040405, "grad_norm": 0.1941106617450714, "learning_rate": 0.0001984088981954987, "loss": 0.0944, "step": 3082 }, { "epoch": 0.1993050505050505, "grad_norm": 0.14127443730831146, "learning_rate": 0.000198407682904916, "loss": 0.0818, "step": 3083 }, { "epoch": 0.19936969696969697, "grad_norm": 
0.09154678136110306, "learning_rate": 0.0001984064671541131, "loss": 0.1087, "step": 3084 }, { "epoch": 0.19943434343434344, "grad_norm": 0.1412719339132309, "learning_rate": 0.00019840525094309567, "loss": 0.0978, "step": 3085 }, { "epoch": 0.1994989898989899, "grad_norm": 0.07700436562299728, "learning_rate": 0.00019840403427186942, "loss": 0.095, "step": 3086 }, { "epoch": 0.19956363636363636, "grad_norm": 0.06923733651638031, "learning_rate": 0.00019840281714044003, "loss": 0.0782, "step": 3087 }, { "epoch": 0.19962828282828282, "grad_norm": 0.0664527639746666, "learning_rate": 0.0001984015995488132, "loss": 0.0841, "step": 3088 }, { "epoch": 0.19962828282828282, "eval_bleu": 15.489048741095534, "eval_loss": 0.09590156376361847, "eval_runtime": 2.8428, "eval_samples_per_second": 11.257, "eval_steps_per_second": 1.407, "step": 3088 }, { "epoch": 0.19969292929292928, "grad_norm": 0.07878103852272034, "learning_rate": 0.00019840038149699464, "loss": 0.089, "step": 3089 }, { "epoch": 0.19975757575757574, "grad_norm": 0.09234321117401123, "learning_rate": 0.00019839916298499, "loss": 0.1009, "step": 3090 }, { "epoch": 0.19982222222222223, "grad_norm": 0.07252255827188492, "learning_rate": 0.00019839794401280502, "loss": 0.0809, "step": 3091 }, { "epoch": 0.1998868686868687, "grad_norm": 0.08006826788187027, "learning_rate": 0.00019839672458044538, "loss": 0.0682, "step": 3092 }, { "epoch": 0.19995151515151516, "grad_norm": 0.08270895481109619, "learning_rate": 0.00019839550468791678, "loss": 0.0864, "step": 3093 }, { "epoch": 0.20001616161616162, "grad_norm": 0.06841226667165756, "learning_rate": 0.00019839428433522495, "loss": 0.0775, "step": 3094 }, { "epoch": 0.20008080808080808, "grad_norm": 0.08888465166091919, "learning_rate": 0.0001983930635223756, "loss": 0.1031, "step": 3095 }, { "epoch": 0.20014545454545454, "grad_norm": 0.06844443082809448, "learning_rate": 0.00019839184224937438, "loss": 0.0904, "step": 3096 }, { "epoch": 0.200210101010101, "grad_norm": 
0.07276881486177444, "learning_rate": 0.00019839062051622705, "loss": 0.0858, "step": 3097 }, { "epoch": 0.20027474747474747, "grad_norm": 0.0789208635687828, "learning_rate": 0.00019838939832293932, "loss": 0.0944, "step": 3098 }, { "epoch": 0.20033939393939393, "grad_norm": 0.07845215499401093, "learning_rate": 0.0001983881756695169, "loss": 0.0953, "step": 3099 }, { "epoch": 0.20040404040404042, "grad_norm": 0.08761543780565262, "learning_rate": 0.00019838695255596552, "loss": 0.1, "step": 3100 }, { "epoch": 0.20046868686868688, "grad_norm": 0.09423337876796722, "learning_rate": 0.00019838572898229085, "loss": 0.1231, "step": 3101 }, { "epoch": 0.20053333333333334, "grad_norm": 0.08081380277872086, "learning_rate": 0.0001983845049484987, "loss": 0.1081, "step": 3102 }, { "epoch": 0.2005979797979798, "grad_norm": 0.21056683361530304, "learning_rate": 0.00019838328045459474, "loss": 0.0955, "step": 3103 }, { "epoch": 0.20066262626262626, "grad_norm": 0.07333079725503922, "learning_rate": 0.0001983820555005847, "loss": 0.0888, "step": 3104 }, { "epoch": 0.20066262626262626, "eval_bleu": 15.040086182106487, "eval_loss": 0.0959915816783905, "eval_runtime": 2.7261, "eval_samples_per_second": 11.738, "eval_steps_per_second": 1.467, "step": 3104 }, { "epoch": 0.20072727272727273, "grad_norm": 0.08100400865077972, "learning_rate": 0.00019838083008647428, "loss": 0.1079, "step": 3105 }, { "epoch": 0.2007919191919192, "grad_norm": 0.0793466567993164, "learning_rate": 0.00019837960421226924, "loss": 0.1054, "step": 3106 }, { "epoch": 0.20085656565656565, "grad_norm": 0.06841523200273514, "learning_rate": 0.00019837837787797536, "loss": 0.0965, "step": 3107 }, { "epoch": 0.2009212121212121, "grad_norm": 0.08950537443161011, "learning_rate": 0.00019837715108359828, "loss": 0.1034, "step": 3108 }, { "epoch": 0.20098585858585857, "grad_norm": 0.0957818478345871, "learning_rate": 0.00019837592382914384, "loss": 0.089, "step": 3109 }, { "epoch": 0.20105050505050506, "grad_norm": 
0.06846385449171066, "learning_rate": 0.00019837469611461769, "loss": 0.0816, "step": 3110 }, { "epoch": 0.20111515151515152, "grad_norm": 0.07595576345920563, "learning_rate": 0.00019837346794002563, "loss": 0.1032, "step": 3111 }, { "epoch": 0.201179797979798, "grad_norm": 0.08023486286401749, "learning_rate": 0.00019837223930537333, "loss": 0.0953, "step": 3112 }, { "epoch": 0.20124444444444445, "grad_norm": 0.07304797321557999, "learning_rate": 0.00019837101021066665, "loss": 0.0972, "step": 3113 }, { "epoch": 0.2013090909090909, "grad_norm": 0.061266034841537476, "learning_rate": 0.00019836978065591125, "loss": 0.0753, "step": 3114 }, { "epoch": 0.20137373737373737, "grad_norm": 0.07018446177244186, "learning_rate": 0.0001983685506411129, "loss": 0.0978, "step": 3115 }, { "epoch": 0.20143838383838383, "grad_norm": 0.13731630146503448, "learning_rate": 0.00019836732016627738, "loss": 0.0797, "step": 3116 }, { "epoch": 0.2015030303030303, "grad_norm": 0.07203269004821777, "learning_rate": 0.0001983660892314104, "loss": 0.0939, "step": 3117 }, { "epoch": 0.20156767676767676, "grad_norm": 0.0736599862575531, "learning_rate": 0.00019836485783651778, "loss": 0.0871, "step": 3118 }, { "epoch": 0.20163232323232325, "grad_norm": 0.0717538520693779, "learning_rate": 0.0001983636259816052, "loss": 0.0725, "step": 3119 }, { "epoch": 0.2016969696969697, "grad_norm": 0.07141770422458649, "learning_rate": 0.00019836239366667846, "loss": 0.0915, "step": 3120 }, { "epoch": 0.2016969696969697, "eval_bleu": 14.59868330010741, "eval_loss": 0.09569938480854034, "eval_runtime": 2.7744, "eval_samples_per_second": 11.534, "eval_steps_per_second": 1.442, "step": 3120 }, { "epoch": 0.20176161616161617, "grad_norm": 0.0767352357506752, "learning_rate": 0.00019836116089174332, "loss": 0.1003, "step": 3121 }, { "epoch": 0.20182626262626263, "grad_norm": 0.13903386890888214, "learning_rate": 0.00019835992765680557, "loss": 0.1079, "step": 3122 }, { "epoch": 0.2018909090909091, "grad_norm": 
0.07657284289598465, "learning_rate": 0.00019835869396187095, "loss": 0.0927, "step": 3123 }, { "epoch": 0.20195555555555555, "grad_norm": 0.07481386512517929, "learning_rate": 0.00019835745980694523, "loss": 0.0917, "step": 3124 }, { "epoch": 0.20202020202020202, "grad_norm": 0.08460117131471634, "learning_rate": 0.00019835622519203422, "loss": 0.1096, "step": 3125 }, { "epoch": 0.20208484848484848, "grad_norm": 0.07209576666355133, "learning_rate": 0.0001983549901171436, "loss": 0.0833, "step": 3126 }, { "epoch": 0.20214949494949494, "grad_norm": 0.06695141643285751, "learning_rate": 0.00019835375458227925, "loss": 0.0902, "step": 3127 }, { "epoch": 0.2022141414141414, "grad_norm": 0.10109949856996536, "learning_rate": 0.00019835251858744688, "loss": 0.0752, "step": 3128 }, { "epoch": 0.2022787878787879, "grad_norm": 0.06644662469625473, "learning_rate": 0.0001983512821326523, "loss": 0.0814, "step": 3129 }, { "epoch": 0.20234343434343435, "grad_norm": 0.07694841921329498, "learning_rate": 0.0001983500452179013, "loss": 0.0964, "step": 3130 }, { "epoch": 0.20240808080808081, "grad_norm": 0.17065557837486267, "learning_rate": 0.00019834880784319964, "loss": 0.1117, "step": 3131 }, { "epoch": 0.20247272727272728, "grad_norm": 0.09423643350601196, "learning_rate": 0.00019834757000855314, "loss": 0.0773, "step": 3132 }, { "epoch": 0.20253737373737374, "grad_norm": 0.09630367904901505, "learning_rate": 0.00019834633171396756, "loss": 0.1148, "step": 3133 }, { "epoch": 0.2026020202020202, "grad_norm": 0.06729527562856674, "learning_rate": 0.00019834509295944868, "loss": 0.0807, "step": 3134 }, { "epoch": 0.20266666666666666, "grad_norm": 0.08414700627326965, "learning_rate": 0.00019834385374500234, "loss": 0.1041, "step": 3135 }, { "epoch": 0.20273131313131312, "grad_norm": 0.07156471163034439, "learning_rate": 0.00019834261407063427, "loss": 0.0927, "step": 3136 }, { "epoch": 0.20273131313131312, "eval_bleu": 11.91406607261676, "eval_loss": 0.09549685567617416, 
"eval_runtime": 2.7445, "eval_samples_per_second": 11.66, "eval_steps_per_second": 1.457, "step": 3136 }, { "epoch": 0.20279595959595959, "grad_norm": 0.06717123091220856, "learning_rate": 0.00019834137393635035, "loss": 0.0889, "step": 3137 }, { "epoch": 0.20286060606060605, "grad_norm": 0.08477167040109634, "learning_rate": 0.0001983401333421563, "loss": 0.1125, "step": 3138 }, { "epoch": 0.20292525252525254, "grad_norm": 0.07160009443759918, "learning_rate": 0.00019833889228805797, "loss": 0.0964, "step": 3139 }, { "epoch": 0.202989898989899, "grad_norm": 0.07411563396453857, "learning_rate": 0.00019833765077406114, "loss": 0.0865, "step": 3140 }, { "epoch": 0.20305454545454546, "grad_norm": 0.0777866467833519, "learning_rate": 0.00019833640880017166, "loss": 0.1114, "step": 3141 }, { "epoch": 0.20311919191919192, "grad_norm": 0.09419663995504379, "learning_rate": 0.00019833516636639527, "loss": 0.0852, "step": 3142 }, { "epoch": 0.20318383838383838, "grad_norm": 0.08539903163909912, "learning_rate": 0.00019833392347273784, "loss": 0.1093, "step": 3143 }, { "epoch": 0.20324848484848484, "grad_norm": 0.07317009568214417, "learning_rate": 0.00019833268011920513, "loss": 0.0869, "step": 3144 }, { "epoch": 0.2033131313131313, "grad_norm": 0.08566869050264359, "learning_rate": 0.000198331436305803, "loss": 0.0943, "step": 3145 }, { "epoch": 0.20337777777777777, "grad_norm": 0.0702219232916832, "learning_rate": 0.00019833019203253726, "loss": 0.0955, "step": 3146 }, { "epoch": 0.20344242424242423, "grad_norm": 0.07991989701986313, "learning_rate": 0.00019832894729941372, "loss": 0.0893, "step": 3147 }, { "epoch": 0.20350707070707072, "grad_norm": 0.07923533767461777, "learning_rate": 0.0001983277021064382, "loss": 0.1063, "step": 3148 }, { "epoch": 0.20357171717171718, "grad_norm": 0.0759860947728157, "learning_rate": 0.00019832645645361652, "loss": 0.0895, "step": 3149 }, { "epoch": 0.20363636363636364, "grad_norm": 0.080452561378479, "learning_rate": 
0.0001983252103409545, "loss": 0.1026, "step": 3150 }, { "epoch": 0.2037010101010101, "grad_norm": 0.06442708522081375, "learning_rate": 0.00019832396376845798, "loss": 0.0714, "step": 3151 }, { "epoch": 0.20376565656565657, "grad_norm": 0.0781233012676239, "learning_rate": 0.00019832271673613278, "loss": 0.1001, "step": 3152 }, { "epoch": 0.20376565656565657, "eval_bleu": 13.69806246538477, "eval_loss": 0.09530284255743027, "eval_runtime": 2.7923, "eval_samples_per_second": 11.46, "eval_steps_per_second": 1.433, "step": 3152 }, { "epoch": 0.20383030303030303, "grad_norm": 0.07101649045944214, "learning_rate": 0.00019832146924398476, "loss": 0.0925, "step": 3153 }, { "epoch": 0.2038949494949495, "grad_norm": 0.07431681454181671, "learning_rate": 0.00019832022129201971, "loss": 0.1021, "step": 3154 }, { "epoch": 0.20395959595959595, "grad_norm": 0.07920096069574356, "learning_rate": 0.00019831897288024347, "loss": 0.0933, "step": 3155 }, { "epoch": 0.2040242424242424, "grad_norm": 0.0672849789261818, "learning_rate": 0.00019831772400866196, "loss": 0.0792, "step": 3156 }, { "epoch": 0.20408888888888888, "grad_norm": 0.09281349927186966, "learning_rate": 0.00019831647467728088, "loss": 0.1255, "step": 3157 }, { "epoch": 0.20415353535353536, "grad_norm": 0.116844043135643, "learning_rate": 0.0001983152248861062, "loss": 0.1077, "step": 3158 }, { "epoch": 0.20421818181818183, "grad_norm": 0.07704758644104004, "learning_rate": 0.00019831397463514373, "loss": 0.105, "step": 3159 }, { "epoch": 0.2042828282828283, "grad_norm": 0.07215922325849533, "learning_rate": 0.0001983127239243993, "loss": 0.0956, "step": 3160 }, { "epoch": 0.20434747474747475, "grad_norm": 0.08894345164299011, "learning_rate": 0.00019831147275387875, "loss": 0.1024, "step": 3161 }, { "epoch": 0.2044121212121212, "grad_norm": 0.09421950578689575, "learning_rate": 0.00019831022112358794, "loss": 0.0984, "step": 3162 }, { "epoch": 0.20447676767676767, "grad_norm": 0.07571349292993546, "learning_rate": 
0.0001983089690335327, "loss": 0.1047, "step": 3163 }, { "epoch": 0.20454141414141414, "grad_norm": 0.08159632980823517, "learning_rate": 0.00019830771648371895, "loss": 0.11, "step": 3164 }, { "epoch": 0.2046060606060606, "grad_norm": 0.08643662929534912, "learning_rate": 0.00019830646347415248, "loss": 0.1243, "step": 3165 }, { "epoch": 0.20467070707070706, "grad_norm": 0.0670056864619255, "learning_rate": 0.00019830521000483922, "loss": 0.0953, "step": 3166 }, { "epoch": 0.20473535353535355, "grad_norm": 0.06531786918640137, "learning_rate": 0.00019830395607578496, "loss": 0.0878, "step": 3167 }, { "epoch": 0.2048, "grad_norm": 0.06567156314849854, "learning_rate": 0.00019830270168699562, "loss": 0.09, "step": 3168 }, { "epoch": 0.2048, "eval_bleu": 13.082742353089657, "eval_loss": 0.09588369727134705, "eval_runtime": 2.7131, "eval_samples_per_second": 11.795, "eval_steps_per_second": 1.474, "step": 3168 }, { "epoch": 0.20486464646464647, "grad_norm": 0.06386405229568481, "learning_rate": 0.000198301446838477, "loss": 0.0798, "step": 3169 }, { "epoch": 0.20492929292929293, "grad_norm": 0.0673353523015976, "learning_rate": 0.00019830019153023505, "loss": 0.0885, "step": 3170 }, { "epoch": 0.2049939393939394, "grad_norm": 0.07388516515493393, "learning_rate": 0.00019829893576227557, "loss": 0.1063, "step": 3171 }, { "epoch": 0.20505858585858586, "grad_norm": 0.06915147602558136, "learning_rate": 0.0001982976795346045, "loss": 0.0864, "step": 3172 }, { "epoch": 0.20512323232323232, "grad_norm": 0.07721282541751862, "learning_rate": 0.00019829642284722768, "loss": 0.0945, "step": 3173 }, { "epoch": 0.20518787878787878, "grad_norm": 0.06398101150989532, "learning_rate": 0.00019829516570015095, "loss": 0.082, "step": 3174 }, { "epoch": 0.20525252525252524, "grad_norm": 0.11499542742967606, "learning_rate": 0.00019829390809338024, "loss": 0.0941, "step": 3175 }, { "epoch": 0.2053171717171717, "grad_norm": 0.0740969181060791, "learning_rate": 0.0001982926500269214, 
"loss": 0.0942, "step": 3176 }, { "epoch": 0.2053818181818182, "grad_norm": 0.07350771874189377, "learning_rate": 0.00019829139150078038, "loss": 0.091, "step": 3177 }, { "epoch": 0.20544646464646465, "grad_norm": 0.07321209460496902, "learning_rate": 0.000198290132514963, "loss": 0.094, "step": 3178 }, { "epoch": 0.20551111111111112, "grad_norm": 0.08422143757343292, "learning_rate": 0.00019828887306947514, "loss": 0.0917, "step": 3179 }, { "epoch": 0.20557575757575758, "grad_norm": 0.07574667781591415, "learning_rate": 0.0001982876131643227, "loss": 0.0934, "step": 3180 }, { "epoch": 0.20564040404040404, "grad_norm": 0.07455884665250778, "learning_rate": 0.00019828635279951166, "loss": 0.0948, "step": 3181 }, { "epoch": 0.2057050505050505, "grad_norm": 0.08029003441333771, "learning_rate": 0.00019828509197504775, "loss": 0.0862, "step": 3182 }, { "epoch": 0.20576969696969696, "grad_norm": 0.08099525421857834, "learning_rate": 0.00019828383069093704, "loss": 0.0979, "step": 3183 }, { "epoch": 0.20583434343434343, "grad_norm": 0.0789312869310379, "learning_rate": 0.0001982825689471853, "loss": 0.0956, "step": 3184 }, { "epoch": 0.20583434343434343, "eval_bleu": 11.35523981699744, "eval_loss": 0.0962069034576416, "eval_runtime": 2.8599, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 3184 }, { "epoch": 0.2058989898989899, "grad_norm": 0.0795975774526596, "learning_rate": 0.0001982813067437985, "loss": 0.0877, "step": 3185 }, { "epoch": 0.20596363636363638, "grad_norm": 0.0632048100233078, "learning_rate": 0.0001982800440807825, "loss": 0.0804, "step": 3186 }, { "epoch": 0.20602828282828284, "grad_norm": 0.07772943377494812, "learning_rate": 0.00019827878095814323, "loss": 0.0965, "step": 3187 }, { "epoch": 0.2060929292929293, "grad_norm": 0.0854567214846611, "learning_rate": 0.00019827751737588658, "loss": 0.0952, "step": 3188 }, { "epoch": 0.20615757575757576, "grad_norm": 0.07375089079141617, "learning_rate": 0.00019827625333401847, 
"loss": 0.0963, "step": 3189 }, { "epoch": 0.20622222222222222, "grad_norm": 0.07229287922382355, "learning_rate": 0.00019827498883254484, "loss": 0.0847, "step": 3190 }, { "epoch": 0.20628686868686869, "grad_norm": 0.060634031891822815, "learning_rate": 0.00019827372387147156, "loss": 0.0675, "step": 3191 }, { "epoch": 0.20635151515151515, "grad_norm": 0.07454415410757065, "learning_rate": 0.00019827245845080457, "loss": 0.0959, "step": 3192 }, { "epoch": 0.2064161616161616, "grad_norm": 0.07481205463409424, "learning_rate": 0.00019827119257054978, "loss": 0.0967, "step": 3193 }, { "epoch": 0.20648080808080807, "grad_norm": 0.07120965421199799, "learning_rate": 0.0001982699262307131, "loss": 0.084, "step": 3194 }, { "epoch": 0.20654545454545453, "grad_norm": 0.07946518808603287, "learning_rate": 0.00019826865943130045, "loss": 0.1094, "step": 3195 }, { "epoch": 0.20661010101010102, "grad_norm": 0.08382433652877808, "learning_rate": 0.0001982673921723178, "loss": 0.1121, "step": 3196 }, { "epoch": 0.20667474747474748, "grad_norm": 0.07565141469240189, "learning_rate": 0.00019826612445377102, "loss": 0.092, "step": 3197 }, { "epoch": 0.20673939393939395, "grad_norm": 0.07011779397726059, "learning_rate": 0.00019826485627566607, "loss": 0.1006, "step": 3198 }, { "epoch": 0.2068040404040404, "grad_norm": 0.10079753398895264, "learning_rate": 0.00019826358763800887, "loss": 0.0924, "step": 3199 }, { "epoch": 0.20686868686868687, "grad_norm": 0.07440459728240967, "learning_rate": 0.00019826231854080535, "loss": 0.0913, "step": 3200 }, { "epoch": 0.20686868686868687, "eval_bleu": 13.84797986886752, "eval_loss": 0.09659598767757416, "eval_runtime": 2.7037, "eval_samples_per_second": 11.836, "eval_steps_per_second": 1.479, "step": 3200 }, { "epoch": 0.20693333333333333, "grad_norm": 0.08571907877922058, "learning_rate": 0.00019826104898406147, "loss": 0.0994, "step": 3201 }, { "epoch": 0.2069979797979798, "grad_norm": 0.1361107975244522, "learning_rate": 
0.00019825977896778313, "loss": 0.134, "step": 3202 }, { "epoch": 0.20706262626262625, "grad_norm": 0.06828378885984421, "learning_rate": 0.0001982585084919763, "loss": 0.0887, "step": 3203 }, { "epoch": 0.20712727272727272, "grad_norm": 0.08962441980838776, "learning_rate": 0.0001982572375566469, "loss": 0.0859, "step": 3204 }, { "epoch": 0.2071919191919192, "grad_norm": 0.07443254441022873, "learning_rate": 0.0001982559661618009, "loss": 0.0868, "step": 3205 }, { "epoch": 0.20725656565656567, "grad_norm": 0.06895461678504944, "learning_rate": 0.0001982546943074442, "loss": 0.0937, "step": 3206 }, { "epoch": 0.20732121212121213, "grad_norm": 0.07234080880880356, "learning_rate": 0.00019825342199358278, "loss": 0.0876, "step": 3207 }, { "epoch": 0.2073858585858586, "grad_norm": 0.07005325704813004, "learning_rate": 0.0001982521492202226, "loss": 0.0936, "step": 3208 }, { "epoch": 0.20745050505050505, "grad_norm": 0.06727655231952667, "learning_rate": 0.00019825087598736963, "loss": 0.0744, "step": 3209 }, { "epoch": 0.2075151515151515, "grad_norm": 0.06495391577482224, "learning_rate": 0.00019824960229502975, "loss": 0.0764, "step": 3210 }, { "epoch": 0.20757979797979798, "grad_norm": 0.06698035448789597, "learning_rate": 0.00019824832814320897, "loss": 0.0867, "step": 3211 }, { "epoch": 0.20764444444444444, "grad_norm": 0.07150883227586746, "learning_rate": 0.00019824705353191325, "loss": 0.0876, "step": 3212 }, { "epoch": 0.2077090909090909, "grad_norm": 0.0784783810377121, "learning_rate": 0.00019824577846114854, "loss": 0.0953, "step": 3213 }, { "epoch": 0.20777373737373736, "grad_norm": 0.08331015706062317, "learning_rate": 0.0001982445029309208, "loss": 0.1073, "step": 3214 }, { "epoch": 0.20783838383838385, "grad_norm": 0.07842043042182922, "learning_rate": 0.00019824322694123598, "loss": 0.0961, "step": 3215 }, { "epoch": 0.2079030303030303, "grad_norm": 0.0864470899105072, "learning_rate": 0.00019824195049210008, "loss": 0.1135, "step": 3216 }, { "epoch": 
0.2079030303030303, "eval_bleu": 13.403972714553266, "eval_loss": 0.09692306071519852, "eval_runtime": 2.7803, "eval_samples_per_second": 11.509, "eval_steps_per_second": 1.439, "step": 3216 }, { "epoch": 0.20796767676767677, "grad_norm": 0.07396278530359268, "learning_rate": 0.00019824067358351907, "loss": 0.0964, "step": 3217 }, { "epoch": 0.20803232323232324, "grad_norm": 0.10630341619253159, "learning_rate": 0.0001982393962154989, "loss": 0.1053, "step": 3218 }, { "epoch": 0.2080969696969697, "grad_norm": 0.06560712307691574, "learning_rate": 0.00019823811838804556, "loss": 0.0776, "step": 3219 }, { "epoch": 0.20816161616161616, "grad_norm": 0.06602195650339127, "learning_rate": 0.000198236840101165, "loss": 0.0913, "step": 3220 }, { "epoch": 0.20822626262626262, "grad_norm": 0.07872724533081055, "learning_rate": 0.0001982355613548632, "loss": 0.1024, "step": 3221 }, { "epoch": 0.20829090909090908, "grad_norm": 0.07390357553958893, "learning_rate": 0.00019823428214914618, "loss": 0.1014, "step": 3222 }, { "epoch": 0.20835555555555554, "grad_norm": 0.07097990810871124, "learning_rate": 0.0001982330024840199, "loss": 0.0851, "step": 3223 }, { "epoch": 0.20842020202020203, "grad_norm": 0.07431790232658386, "learning_rate": 0.00019823172235949033, "loss": 0.091, "step": 3224 }, { "epoch": 0.2084848484848485, "grad_norm": 0.07419148087501526, "learning_rate": 0.00019823044177556346, "loss": 0.0892, "step": 3225 }, { "epoch": 0.20854949494949496, "grad_norm": 0.09415031224489212, "learning_rate": 0.00019822916073224535, "loss": 0.1084, "step": 3226 }, { "epoch": 0.20861414141414142, "grad_norm": 0.07401181757450104, "learning_rate": 0.00019822787922954186, "loss": 0.0861, "step": 3227 }, { "epoch": 0.20867878787878788, "grad_norm": 0.08296767622232437, "learning_rate": 0.00019822659726745906, "loss": 0.1022, "step": 3228 }, { "epoch": 0.20874343434343434, "grad_norm": 0.08823227137327194, "learning_rate": 0.00019822531484600298, "loss": 0.1139, "step": 3229 }, { 
"epoch": 0.2088080808080808, "grad_norm": 0.07112350314855576, "learning_rate": 0.00019822403196517955, "loss": 0.0839, "step": 3230 }, { "epoch": 0.20887272727272727, "grad_norm": 0.065652035176754, "learning_rate": 0.0001982227486249948, "loss": 0.0814, "step": 3231 }, { "epoch": 0.20893737373737373, "grad_norm": 0.07163041830062866, "learning_rate": 0.00019822146482545468, "loss": 0.0882, "step": 3232 }, { "epoch": 0.20893737373737373, "eval_bleu": 12.093881575725476, "eval_loss": 0.09620682895183563, "eval_runtime": 2.9348, "eval_samples_per_second": 10.904, "eval_steps_per_second": 1.363, "step": 3232 }, { "epoch": 0.2090020202020202, "grad_norm": 0.0594286248087883, "learning_rate": 0.0001982201805665653, "loss": 0.0625, "step": 3233 }, { "epoch": 0.20906666666666668, "grad_norm": 0.0808313861489296, "learning_rate": 0.0001982188958483326, "loss": 0.0914, "step": 3234 }, { "epoch": 0.20913131313131314, "grad_norm": 0.07525745034217834, "learning_rate": 0.00019821761067076256, "loss": 0.0941, "step": 3235 }, { "epoch": 0.2091959595959596, "grad_norm": 0.18140995502471924, "learning_rate": 0.00019821632503386124, "loss": 0.0958, "step": 3236 }, { "epoch": 0.20926060606060606, "grad_norm": 0.07752675563097, "learning_rate": 0.00019821503893763468, "loss": 0.0958, "step": 3237 }, { "epoch": 0.20932525252525253, "grad_norm": 0.06594157963991165, "learning_rate": 0.0001982137523820888, "loss": 0.0787, "step": 3238 }, { "epoch": 0.209389898989899, "grad_norm": 0.06697786599397659, "learning_rate": 0.00019821246536722966, "loss": 0.081, "step": 3239 }, { "epoch": 0.20945454545454545, "grad_norm": 0.08294513821601868, "learning_rate": 0.00019821117789306332, "loss": 0.0887, "step": 3240 }, { "epoch": 0.2095191919191919, "grad_norm": 0.060510385781526566, "learning_rate": 0.00019820988995959576, "loss": 0.0827, "step": 3241 }, { "epoch": 0.20958383838383837, "grad_norm": 0.07210643589496613, "learning_rate": 0.000198208601566833, "loss": 0.0806, "step": 3242 }, { 
"epoch": 0.20964848484848486, "grad_norm": 0.07757668197154999, "learning_rate": 0.00019820731271478112, "loss": 0.0966, "step": 3243 }, { "epoch": 0.20971313131313132, "grad_norm": 0.08799994736909866, "learning_rate": 0.00019820602340344607, "loss": 0.0956, "step": 3244 }, { "epoch": 0.20977777777777779, "grad_norm": 0.07386535406112671, "learning_rate": 0.00019820473363283393, "loss": 0.1012, "step": 3245 }, { "epoch": 0.20984242424242425, "grad_norm": 0.07808306813240051, "learning_rate": 0.0001982034434029507, "loss": 0.0931, "step": 3246 }, { "epoch": 0.2099070707070707, "grad_norm": 0.07190465927124023, "learning_rate": 0.00019820215271380246, "loss": 0.102, "step": 3247 }, { "epoch": 0.20997171717171717, "grad_norm": 0.09075876325368881, "learning_rate": 0.00019820086156539516, "loss": 0.0922, "step": 3248 }, { "epoch": 0.20997171717171717, "eval_bleu": 11.75159459131704, "eval_loss": 0.0980924516916275, "eval_runtime": 2.7078, "eval_samples_per_second": 11.818, "eval_steps_per_second": 1.477, "step": 3248 }, { "epoch": 0.21003636363636363, "grad_norm": 0.0819455087184906, "learning_rate": 0.00019819956995773493, "loss": 0.1016, "step": 3249 }, { "epoch": 0.2101010101010101, "grad_norm": 0.07833933085203171, "learning_rate": 0.0001981982778908278, "loss": 0.0877, "step": 3250 }, { "epoch": 0.21016565656565656, "grad_norm": 0.0952921211719513, "learning_rate": 0.00019819698536467975, "loss": 0.1179, "step": 3251 }, { "epoch": 0.21023030303030302, "grad_norm": 0.07919947803020477, "learning_rate": 0.00019819569237929688, "loss": 0.0971, "step": 3252 }, { "epoch": 0.2102949494949495, "grad_norm": 0.07089057564735413, "learning_rate": 0.0001981943989346852, "loss": 0.0956, "step": 3253 }, { "epoch": 0.21035959595959597, "grad_norm": 0.09709557890892029, "learning_rate": 0.0001981931050308508, "loss": 0.1366, "step": 3254 }, { "epoch": 0.21042424242424243, "grad_norm": 0.08185214549303055, "learning_rate": 0.0001981918106677997, "loss": 0.088, "step": 3255 }, { 
"epoch": 0.2104888888888889, "grad_norm": 0.0734316036105156, "learning_rate": 0.000198190515845538, "loss": 0.0938, "step": 3256 }, { "epoch": 0.21055353535353535, "grad_norm": 0.06370724737644196, "learning_rate": 0.00019818922056407168, "loss": 0.0771, "step": 3257 }, { "epoch": 0.21061818181818182, "grad_norm": 0.059855539351701736, "learning_rate": 0.00019818792482340683, "loss": 0.0716, "step": 3258 }, { "epoch": 0.21068282828282828, "grad_norm": 0.06486648321151733, "learning_rate": 0.0001981866286235495, "loss": 0.0767, "step": 3259 }, { "epoch": 0.21074747474747474, "grad_norm": 0.07044275104999542, "learning_rate": 0.00019818533196450583, "loss": 0.0833, "step": 3260 }, { "epoch": 0.2108121212121212, "grad_norm": 0.06139122694730759, "learning_rate": 0.0001981840348462818, "loss": 0.0843, "step": 3261 }, { "epoch": 0.2108767676767677, "grad_norm": 0.07323456555604935, "learning_rate": 0.00019818273726888346, "loss": 0.0901, "step": 3262 }, { "epoch": 0.21094141414141415, "grad_norm": 0.07193174213171005, "learning_rate": 0.00019818143923231696, "loss": 0.1033, "step": 3263 }, { "epoch": 0.2110060606060606, "grad_norm": 0.08211849629878998, "learning_rate": 0.0001981801407365883, "loss": 0.1117, "step": 3264 }, { "epoch": 0.2110060606060606, "eval_bleu": 12.226759341908128, "eval_loss": 0.09752113372087479, "eval_runtime": 2.886, "eval_samples_per_second": 11.088, "eval_steps_per_second": 1.386, "step": 3264 }, { "epoch": 0.21107070707070708, "grad_norm": 0.06628923863172531, "learning_rate": 0.00019817884178170359, "loss": 0.0889, "step": 3265 }, { "epoch": 0.21113535353535354, "grad_norm": 0.0785524770617485, "learning_rate": 0.00019817754236766888, "loss": 0.0869, "step": 3266 }, { "epoch": 0.2112, "grad_norm": 0.06970620900392532, "learning_rate": 0.00019817624249449028, "loss": 0.089, "step": 3267 }, { "epoch": 0.21126464646464646, "grad_norm": 0.08422599732875824, "learning_rate": 0.00019817494216217384, "loss": 0.099, "step": 3268 }, { "epoch": 
0.21132929292929292, "grad_norm": 0.06703177094459534, "learning_rate": 0.00019817364137072568, "loss": 0.077, "step": 3269 }, { "epoch": 0.21139393939393938, "grad_norm": 0.0788150280714035, "learning_rate": 0.0001981723401201518, "loss": 0.0976, "step": 3270 }, { "epoch": 0.21145858585858585, "grad_norm": 0.07000916451215744, "learning_rate": 0.00019817103841045838, "loss": 0.0885, "step": 3271 }, { "epoch": 0.21152323232323234, "grad_norm": 0.08158749341964722, "learning_rate": 0.00019816973624165146, "loss": 0.0917, "step": 3272 }, { "epoch": 0.2115878787878788, "grad_norm": 0.07905953377485275, "learning_rate": 0.00019816843361373712, "loss": 0.1021, "step": 3273 }, { "epoch": 0.21165252525252526, "grad_norm": 0.14402255415916443, "learning_rate": 0.0001981671305267215, "loss": 0.1106, "step": 3274 }, { "epoch": 0.21171717171717172, "grad_norm": 0.08556491881608963, "learning_rate": 0.00019816582698061066, "loss": 0.1042, "step": 3275 }, { "epoch": 0.21178181818181818, "grad_norm": 0.06387800723314285, "learning_rate": 0.0001981645229754107, "loss": 0.0794, "step": 3276 }, { "epoch": 0.21184646464646464, "grad_norm": 0.06925465166568756, "learning_rate": 0.00019816321851112768, "loss": 0.0942, "step": 3277 }, { "epoch": 0.2119111111111111, "grad_norm": 0.08130539953708649, "learning_rate": 0.00019816191358776778, "loss": 0.0927, "step": 3278 }, { "epoch": 0.21197575757575757, "grad_norm": 0.07872293144464493, "learning_rate": 0.00019816060820533707, "loss": 0.0925, "step": 3279 }, { "epoch": 0.21204040404040403, "grad_norm": 0.06228552758693695, "learning_rate": 0.00019815930236384162, "loss": 0.081, "step": 3280 }, { "epoch": 0.21204040404040403, "eval_bleu": 15.055489145493217, "eval_loss": 0.09649857878684998, "eval_runtime": 2.6538, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 3280 }, { "epoch": 0.21210505050505052, "grad_norm": 0.0830526128411293, "learning_rate": 0.00019815799606328756, "loss": 0.0986, "step": 3281 }, { 
"epoch": 0.21216969696969698, "grad_norm": 0.07102509588003159, "learning_rate": 0.00019815668930368103, "loss": 0.0933, "step": 3282 }, { "epoch": 0.21223434343434344, "grad_norm": 0.08107369393110275, "learning_rate": 0.00019815538208502808, "loss": 0.1022, "step": 3283 }, { "epoch": 0.2122989898989899, "grad_norm": 0.08084149658679962, "learning_rate": 0.00019815407440733487, "loss": 0.0922, "step": 3284 }, { "epoch": 0.21236363636363637, "grad_norm": 0.07209715247154236, "learning_rate": 0.00019815276627060754, "loss": 0.0943, "step": 3285 }, { "epoch": 0.21242828282828283, "grad_norm": 0.06801539659500122, "learning_rate": 0.00019815145767485213, "loss": 0.0785, "step": 3286 }, { "epoch": 0.2124929292929293, "grad_norm": 0.07762764394283295, "learning_rate": 0.0001981501486200748, "loss": 0.0983, "step": 3287 }, { "epoch": 0.21255757575757575, "grad_norm": 0.08190947026014328, "learning_rate": 0.00019814883910628168, "loss": 0.1041, "step": 3288 }, { "epoch": 0.2126222222222222, "grad_norm": 0.07338094711303711, "learning_rate": 0.0001981475291334789, "loss": 0.0862, "step": 3289 }, { "epoch": 0.21268686868686867, "grad_norm": 0.07414393126964569, "learning_rate": 0.00019814621870167253, "loss": 0.0757, "step": 3290 }, { "epoch": 0.21275151515151516, "grad_norm": 0.06347116082906723, "learning_rate": 0.0001981449078108688, "loss": 0.0812, "step": 3291 }, { "epoch": 0.21281616161616163, "grad_norm": 0.07594288885593414, "learning_rate": 0.00019814359646107375, "loss": 0.0991, "step": 3292 }, { "epoch": 0.2128808080808081, "grad_norm": 0.0660187229514122, "learning_rate": 0.00019814228465229357, "loss": 0.0773, "step": 3293 }, { "epoch": 0.21294545454545455, "grad_norm": 0.07161509990692139, "learning_rate": 0.00019814097238453435, "loss": 0.0976, "step": 3294 }, { "epoch": 0.213010101010101, "grad_norm": 0.0784306526184082, "learning_rate": 0.00019813965965780224, "loss": 0.0975, "step": 3295 }, { "epoch": 0.21307474747474747, "grad_norm": 0.07063211500644684, 
"learning_rate": 0.0001981383464721034, "loss": 0.0835, "step": 3296 }, { "epoch": 0.21307474747474747, "eval_bleu": 14.526641689724539, "eval_loss": 0.09730221331119537, "eval_runtime": 2.7492, "eval_samples_per_second": 11.64, "eval_steps_per_second": 1.455, "step": 3296 }, { "epoch": 0.21313939393939393, "grad_norm": 0.06933317333459854, "learning_rate": 0.00019813703282744395, "loss": 0.0875, "step": 3297 }, { "epoch": 0.2132040404040404, "grad_norm": 0.06673501431941986, "learning_rate": 0.00019813571872383004, "loss": 0.0852, "step": 3298 }, { "epoch": 0.21326868686868686, "grad_norm": 0.07761738449335098, "learning_rate": 0.00019813440416126783, "loss": 0.1004, "step": 3299 }, { "epoch": 0.21333333333333335, "grad_norm": 0.07827828824520111, "learning_rate": 0.00019813308913976344, "loss": 0.1021, "step": 3300 }, { "epoch": 0.2133979797979798, "grad_norm": 0.0728059634566307, "learning_rate": 0.00019813177365932304, "loss": 0.0941, "step": 3301 }, { "epoch": 0.21346262626262627, "grad_norm": 0.06538718193769455, "learning_rate": 0.00019813045771995276, "loss": 0.0753, "step": 3302 }, { "epoch": 0.21352727272727273, "grad_norm": 0.09158114343881607, "learning_rate": 0.0001981291413216588, "loss": 0.1154, "step": 3303 }, { "epoch": 0.2135919191919192, "grad_norm": 0.08771733939647675, "learning_rate": 0.00019812782446444724, "loss": 0.1041, "step": 3304 }, { "epoch": 0.21365656565656566, "grad_norm": 0.07040257751941681, "learning_rate": 0.00019812650714832436, "loss": 0.0863, "step": 3305 }, { "epoch": 0.21372121212121212, "grad_norm": 0.06416306644678116, "learning_rate": 0.00019812518937329618, "loss": 0.0834, "step": 3306 }, { "epoch": 0.21378585858585858, "grad_norm": 0.07003892958164215, "learning_rate": 0.00019812387113936895, "loss": 0.0837, "step": 3307 }, { "epoch": 0.21385050505050504, "grad_norm": 0.0632302537560463, "learning_rate": 0.00019812255244654883, "loss": 0.0784, "step": 3308 }, { "epoch": 0.2139151515151515, "grad_norm": 
0.07356846332550049, "learning_rate": 0.00019812123329484194, "loss": 0.0988, "step": 3309 }, { "epoch": 0.213979797979798, "grad_norm": 0.07500423491001129, "learning_rate": 0.00019811991368425452, "loss": 0.085, "step": 3310 }, { "epoch": 0.21404444444444445, "grad_norm": 0.07073833048343658, "learning_rate": 0.00019811859361479266, "loss": 0.0931, "step": 3311 }, { "epoch": 0.21410909090909092, "grad_norm": 0.06631512194871902, "learning_rate": 0.00019811727308646258, "loss": 0.0843, "step": 3312 }, { "epoch": 0.21410909090909092, "eval_bleu": 13.579684149902931, "eval_loss": 0.09733209013938904, "eval_runtime": 2.7003, "eval_samples_per_second": 11.851, "eval_steps_per_second": 1.481, "step": 3312 }, { "epoch": 0.21417373737373738, "grad_norm": 0.06393156200647354, "learning_rate": 0.00019811595209927047, "loss": 0.0722, "step": 3313 }, { "epoch": 0.21423838383838384, "grad_norm": 0.07341096550226212, "learning_rate": 0.00019811463065322248, "loss": 0.0918, "step": 3314 }, { "epoch": 0.2143030303030303, "grad_norm": 0.06938652694225311, "learning_rate": 0.00019811330874832482, "loss": 0.093, "step": 3315 }, { "epoch": 0.21436767676767676, "grad_norm": 0.07169376313686371, "learning_rate": 0.0001981119863845836, "loss": 0.0994, "step": 3316 }, { "epoch": 0.21443232323232322, "grad_norm": 0.09105397015810013, "learning_rate": 0.0001981106635620051, "loss": 0.0932, "step": 3317 }, { "epoch": 0.21449696969696969, "grad_norm": 0.0699797123670578, "learning_rate": 0.00019810934028059545, "loss": 0.0896, "step": 3318 }, { "epoch": 0.21456161616161618, "grad_norm": 0.058354903012514114, "learning_rate": 0.00019810801654036086, "loss": 0.0668, "step": 3319 }, { "epoch": 0.21462626262626264, "grad_norm": 0.07138828933238983, "learning_rate": 0.00019810669234130747, "loss": 0.0917, "step": 3320 }, { "epoch": 0.2146909090909091, "grad_norm": 0.07746605575084686, "learning_rate": 0.00019810536768344157, "loss": 0.0921, "step": 3321 }, { "epoch": 0.21475555555555556, 
"grad_norm": 0.07738307863473892, "learning_rate": 0.00019810404256676925, "loss": 0.0866, "step": 3322 }, { "epoch": 0.21482020202020202, "grad_norm": 0.07437840104103088, "learning_rate": 0.0001981027169912968, "loss": 0.0925, "step": 3323 }, { "epoch": 0.21488484848484848, "grad_norm": 0.06851816177368164, "learning_rate": 0.00019810139095703035, "loss": 0.0827, "step": 3324 }, { "epoch": 0.21494949494949495, "grad_norm": 0.06500723212957382, "learning_rate": 0.0001981000644639761, "loss": 0.0778, "step": 3325 }, { "epoch": 0.2150141414141414, "grad_norm": 0.06486549228429794, "learning_rate": 0.00019809873751214032, "loss": 0.072, "step": 3326 }, { "epoch": 0.21507878787878787, "grad_norm": 0.07577551901340485, "learning_rate": 0.00019809741010152915, "loss": 0.0811, "step": 3327 }, { "epoch": 0.21514343434343433, "grad_norm": 0.08082138746976852, "learning_rate": 0.00019809608223214883, "loss": 0.1028, "step": 3328 }, { "epoch": 0.21514343434343433, "eval_bleu": 13.53935335450092, "eval_loss": 0.09745529294013977, "eval_runtime": 2.8652, "eval_samples_per_second": 11.169, "eval_steps_per_second": 1.396, "step": 3328 }, { "epoch": 0.21520808080808082, "grad_norm": 0.06792891025543213, "learning_rate": 0.00019809475390400558, "loss": 0.0807, "step": 3329 }, { "epoch": 0.21527272727272728, "grad_norm": 0.08012599498033524, "learning_rate": 0.0001980934251171056, "loss": 0.0822, "step": 3330 }, { "epoch": 0.21533737373737374, "grad_norm": 0.07291116565465927, "learning_rate": 0.00019809209587145506, "loss": 0.0908, "step": 3331 }, { "epoch": 0.2154020202020202, "grad_norm": 0.07829274237155914, "learning_rate": 0.00019809076616706027, "loss": 0.1032, "step": 3332 }, { "epoch": 0.21546666666666667, "grad_norm": 0.06904233247041702, "learning_rate": 0.00019808943600392733, "loss": 0.0839, "step": 3333 }, { "epoch": 0.21553131313131313, "grad_norm": 0.0646386370062828, "learning_rate": 0.00019808810538206258, "loss": 0.0736, "step": 3334 }, { "epoch": 
0.2155959595959596, "grad_norm": 0.07214377075433731, "learning_rate": 0.00019808677430147218, "loss": 0.0874, "step": 3335 }, { "epoch": 0.21566060606060605, "grad_norm": 0.08438138663768768, "learning_rate": 0.00019808544276216235, "loss": 0.0931, "step": 3336 }, { "epoch": 0.21572525252525251, "grad_norm": 0.07673601061105728, "learning_rate": 0.0001980841107641393, "loss": 0.0983, "step": 3337 }, { "epoch": 0.21578989898989898, "grad_norm": 0.06860648840665817, "learning_rate": 0.00019808277830740933, "loss": 0.0836, "step": 3338 }, { "epoch": 0.21585454545454547, "grad_norm": 0.07794602960348129, "learning_rate": 0.00019808144539197863, "loss": 0.0922, "step": 3339 }, { "epoch": 0.21591919191919193, "grad_norm": 0.0965222492814064, "learning_rate": 0.00019808011201785344, "loss": 0.1341, "step": 3340 }, { "epoch": 0.2159838383838384, "grad_norm": 0.08122780919075012, "learning_rate": 0.00019807877818503998, "loss": 0.104, "step": 3341 }, { "epoch": 0.21604848484848485, "grad_norm": 0.07839896529912949, "learning_rate": 0.0001980774438935445, "loss": 0.0946, "step": 3342 }, { "epoch": 0.2161131313131313, "grad_norm": 0.0766155794262886, "learning_rate": 0.00019807610914337323, "loss": 0.0837, "step": 3343 }, { "epoch": 0.21617777777777777, "grad_norm": 0.0776563212275505, "learning_rate": 0.00019807477393453242, "loss": 0.085, "step": 3344 }, { "epoch": 0.21617777777777777, "eval_bleu": 14.494493824501257, "eval_loss": 0.09570226073265076, "eval_runtime": 2.6666, "eval_samples_per_second": 12.0, "eval_steps_per_second": 1.5, "step": 3344 }, { "epoch": 0.21624242424242424, "grad_norm": 0.06095238775014877, "learning_rate": 0.00019807343826702833, "loss": 0.0811, "step": 3345 }, { "epoch": 0.2163070707070707, "grad_norm": 0.07335347682237625, "learning_rate": 0.00019807210214086717, "loss": 0.0891, "step": 3346 }, { "epoch": 0.21637171717171716, "grad_norm": 0.08615083992481232, "learning_rate": 0.00019807076555605524, "loss": 0.1087, "step": 3347 }, { "epoch": 
0.21643636363636365, "grad_norm": 0.07660216838121414, "learning_rate": 0.00019806942851259874, "loss": 0.0961, "step": 3348 }, { "epoch": 0.2165010101010101, "grad_norm": 0.08800068497657776, "learning_rate": 0.00019806809101050393, "loss": 0.0939, "step": 3349 }, { "epoch": 0.21656565656565657, "grad_norm": 0.06695350259542465, "learning_rate": 0.00019806675304977713, "loss": 0.069, "step": 3350 }, { "epoch": 0.21663030303030303, "grad_norm": 0.08028466999530792, "learning_rate": 0.0001980654146304245, "loss": 0.0935, "step": 3351 }, { "epoch": 0.2166949494949495, "grad_norm": 0.0833858922123909, "learning_rate": 0.00019806407575245237, "loss": 0.0848, "step": 3352 }, { "epoch": 0.21675959595959596, "grad_norm": 0.07134957611560822, "learning_rate": 0.00019806273641586697, "loss": 0.0805, "step": 3353 }, { "epoch": 0.21682424242424242, "grad_norm": 0.0945541113615036, "learning_rate": 0.0001980613966206746, "loss": 0.0935, "step": 3354 }, { "epoch": 0.21688888888888888, "grad_norm": 0.07783107459545135, "learning_rate": 0.00019806005636688144, "loss": 0.0905, "step": 3355 }, { "epoch": 0.21695353535353534, "grad_norm": 0.07399491220712662, "learning_rate": 0.00019805871565449384, "loss": 0.0999, "step": 3356 }, { "epoch": 0.2170181818181818, "grad_norm": 0.0729394257068634, "learning_rate": 0.00019805737448351805, "loss": 0.0872, "step": 3357 }, { "epoch": 0.2170828282828283, "grad_norm": 0.07971515506505966, "learning_rate": 0.00019805603285396033, "loss": 0.0987, "step": 3358 }, { "epoch": 0.21714747474747476, "grad_norm": 0.09197946637868881, "learning_rate": 0.00019805469076582694, "loss": 0.1106, "step": 3359 }, { "epoch": 0.21721212121212122, "grad_norm": 0.06486957520246506, "learning_rate": 0.0001980533482191242, "loss": 0.0834, "step": 3360 }, { "epoch": 0.21721212121212122, "eval_bleu": 13.108251226700098, "eval_loss": 0.09510880708694458, "eval_runtime": 2.7968, "eval_samples_per_second": 11.442, "eval_steps_per_second": 1.43, "step": 3360 }, { 
"epoch": 0.21727676767676768, "grad_norm": 0.09355279058218002, "learning_rate": 0.00019805200521385836, "loss": 0.0995, "step": 3361 }, { "epoch": 0.21734141414141414, "grad_norm": 0.06528133153915405, "learning_rate": 0.00019805066175003573, "loss": 0.0812, "step": 3362 }, { "epoch": 0.2174060606060606, "grad_norm": 0.06726797670125961, "learning_rate": 0.00019804931782766257, "loss": 0.0897, "step": 3363 }, { "epoch": 0.21747070707070706, "grad_norm": 0.06881358474493027, "learning_rate": 0.00019804797344674514, "loss": 0.092, "step": 3364 }, { "epoch": 0.21753535353535353, "grad_norm": 0.07933942973613739, "learning_rate": 0.00019804662860728974, "loss": 0.0951, "step": 3365 }, { "epoch": 0.2176, "grad_norm": 0.07912680506706238, "learning_rate": 0.00019804528330930269, "loss": 0.1034, "step": 3366 }, { "epoch": 0.21766464646464648, "grad_norm": 0.0710979476571083, "learning_rate": 0.00019804393755279027, "loss": 0.0876, "step": 3367 }, { "epoch": 0.21772929292929294, "grad_norm": 0.07970493286848068, "learning_rate": 0.0001980425913377588, "loss": 0.1, "step": 3368 }, { "epoch": 0.2177939393939394, "grad_norm": 0.058857958763837814, "learning_rate": 0.00019804124466421446, "loss": 0.0716, "step": 3369 }, { "epoch": 0.21785858585858586, "grad_norm": 0.1202416941523552, "learning_rate": 0.00019803989753216367, "loss": 0.1624, "step": 3370 }, { "epoch": 0.21792323232323232, "grad_norm": 0.07926231622695923, "learning_rate": 0.0001980385499416127, "loss": 0.0903, "step": 3371 }, { "epoch": 0.21798787878787879, "grad_norm": 0.07087825238704681, "learning_rate": 0.00019803720189256785, "loss": 0.0906, "step": 3372 }, { "epoch": 0.21805252525252525, "grad_norm": 0.07070551812648773, "learning_rate": 0.0001980358533850354, "loss": 0.0889, "step": 3373 }, { "epoch": 0.2181171717171717, "grad_norm": 0.06965470314025879, "learning_rate": 0.00019803450441902167, "loss": 0.084, "step": 3374 }, { "epoch": 0.21818181818181817, "grad_norm": 0.07521123439073563, 
"learning_rate": 0.000198033154994533, "loss": 0.0998, "step": 3375 }, { "epoch": 0.21824646464646463, "grad_norm": 0.07839778065681458, "learning_rate": 0.00019803180511157565, "loss": 0.0941, "step": 3376 }, { "epoch": 0.21824646464646463, "eval_bleu": 14.012770241737655, "eval_loss": 0.09544821828603745, "eval_runtime": 2.6637, "eval_samples_per_second": 12.014, "eval_steps_per_second": 1.502, "step": 3376 }, { "epoch": 0.21831111111111112, "grad_norm": 0.07042742520570755, "learning_rate": 0.00019803045477015597, "loss": 0.0941, "step": 3377 }, { "epoch": 0.21837575757575758, "grad_norm": 0.08788799494504929, "learning_rate": 0.00019802910397028023, "loss": 0.1137, "step": 3378 }, { "epoch": 0.21844040404040405, "grad_norm": 0.06507635116577148, "learning_rate": 0.00019802775271195478, "loss": 0.0769, "step": 3379 }, { "epoch": 0.2185050505050505, "grad_norm": 0.0776074007153511, "learning_rate": 0.000198026400995186, "loss": 0.095, "step": 3380 }, { "epoch": 0.21856969696969697, "grad_norm": 0.07036030292510986, "learning_rate": 0.0001980250488199801, "loss": 0.1011, "step": 3381 }, { "epoch": 0.21863434343434343, "grad_norm": 0.0770777016878128, "learning_rate": 0.00019802369618634344, "loss": 0.0974, "step": 3382 }, { "epoch": 0.2186989898989899, "grad_norm": 0.06756532192230225, "learning_rate": 0.00019802234309428238, "loss": 0.0931, "step": 3383 }, { "epoch": 0.21876363636363635, "grad_norm": 0.0934842973947525, "learning_rate": 0.00019802098954380324, "loss": 0.1144, "step": 3384 }, { "epoch": 0.21882828282828282, "grad_norm": 0.0736488550901413, "learning_rate": 0.0001980196355349123, "loss": 0.0851, "step": 3385 }, { "epoch": 0.2188929292929293, "grad_norm": 0.07394425570964813, "learning_rate": 0.00019801828106761599, "loss": 0.0917, "step": 3386 }, { "epoch": 0.21895757575757577, "grad_norm": 0.07060705125331879, "learning_rate": 0.0001980169261419205, "loss": 0.0992, "step": 3387 }, { "epoch": 0.21902222222222223, "grad_norm": 0.07473021000623703, 
"learning_rate": 0.00019801557075783233, "loss": 0.094, "step": 3388 }, { "epoch": 0.2190868686868687, "grad_norm": 0.08129066228866577, "learning_rate": 0.0001980142149153577, "loss": 0.1071, "step": 3389 }, { "epoch": 0.21915151515151515, "grad_norm": 0.0649336650967598, "learning_rate": 0.000198012858614503, "loss": 0.0855, "step": 3390 }, { "epoch": 0.21921616161616161, "grad_norm": 0.06824880838394165, "learning_rate": 0.00019801150185527454, "loss": 0.0837, "step": 3391 }, { "epoch": 0.21928080808080808, "grad_norm": 0.0683906301856041, "learning_rate": 0.00019801014463767872, "loss": 0.0796, "step": 3392 }, { "epoch": 0.21928080808080808, "eval_bleu": 10.845328859708639, "eval_loss": 0.09526924788951874, "eval_runtime": 2.8712, "eval_samples_per_second": 11.145, "eval_steps_per_second": 1.393, "step": 3392 }, { "epoch": 0.21934545454545454, "grad_norm": 0.06590454280376434, "learning_rate": 0.00019800878696172184, "loss": 0.0775, "step": 3393 }, { "epoch": 0.219410101010101, "grad_norm": 0.08065328747034073, "learning_rate": 0.00019800742882741026, "loss": 0.0879, "step": 3394 }, { "epoch": 0.21947474747474746, "grad_norm": 0.08675577491521835, "learning_rate": 0.00019800607023475034, "loss": 0.0953, "step": 3395 }, { "epoch": 0.21953939393939395, "grad_norm": 0.07061854749917984, "learning_rate": 0.00019800471118374842, "loss": 0.0921, "step": 3396 }, { "epoch": 0.2196040404040404, "grad_norm": 0.07788094133138657, "learning_rate": 0.00019800335167441087, "loss": 0.0829, "step": 3397 }, { "epoch": 0.21966868686868687, "grad_norm": 0.07903806120157242, "learning_rate": 0.00019800199170674407, "loss": 0.0983, "step": 3398 }, { "epoch": 0.21973333333333334, "grad_norm": 0.07318644970655441, "learning_rate": 0.00019800063128075436, "loss": 0.0873, "step": 3399 }, { "epoch": 0.2197979797979798, "grad_norm": 0.07462291419506073, "learning_rate": 0.00019799927039644804, "loss": 0.0947, "step": 3400 }, { "epoch": 0.21986262626262626, "grad_norm": 
0.07499099522829056, "learning_rate": 0.00019799790905383155, "loss": 0.0954, "step": 3401 }, { "epoch": 0.21992727272727272, "grad_norm": 0.06945568323135376, "learning_rate": 0.00019799654725291126, "loss": 0.0704, "step": 3402 }, { "epoch": 0.21999191919191918, "grad_norm": 0.10157816857099533, "learning_rate": 0.0001979951849936935, "loss": 0.113, "step": 3403 }, { "epoch": 0.22005656565656564, "grad_norm": 0.08681383728981018, "learning_rate": 0.00019799382227618466, "loss": 0.0871, "step": 3404 }, { "epoch": 0.22012121212121213, "grad_norm": 0.08701842278242111, "learning_rate": 0.0001979924591003911, "loss": 0.1039, "step": 3405 }, { "epoch": 0.2201858585858586, "grad_norm": 0.059475112706422806, "learning_rate": 0.00019799109546631919, "loss": 0.0723, "step": 3406 }, { "epoch": 0.22025050505050506, "grad_norm": 0.06547842919826508, "learning_rate": 0.00019798973137397536, "loss": 0.0803, "step": 3407 }, { "epoch": 0.22031515151515152, "grad_norm": 0.06663373857736588, "learning_rate": 0.00019798836682336594, "loss": 0.0825, "step": 3408 }, { "epoch": 0.22031515151515152, "eval_bleu": 13.261001125659474, "eval_loss": 0.09563431888818741, "eval_runtime": 2.7567, "eval_samples_per_second": 11.608, "eval_steps_per_second": 1.451, "step": 3408 }, { "epoch": 0.22037979797979798, "grad_norm": 0.0813746452331543, "learning_rate": 0.0001979870018144973, "loss": 0.1197, "step": 3409 }, { "epoch": 0.22044444444444444, "grad_norm": 0.08512023091316223, "learning_rate": 0.00019798563634737588, "loss": 0.1099, "step": 3410 }, { "epoch": 0.2205090909090909, "grad_norm": 0.08940357714891434, "learning_rate": 0.00019798427042200804, "loss": 0.1139, "step": 3411 }, { "epoch": 0.22057373737373737, "grad_norm": 0.07359014451503754, "learning_rate": 0.0001979829040384001, "loss": 0.0998, "step": 3412 }, { "epoch": 0.22063838383838383, "grad_norm": 0.060967642813920975, "learning_rate": 0.00019798153719655857, "loss": 0.0774, "step": 3413 }, { "epoch": 0.2207030303030303, 
"grad_norm": 0.054646387696266174, "learning_rate": 0.00019798016989648975, "loss": 0.0687, "step": 3414 }, { "epoch": 0.22076767676767678, "grad_norm": 0.06471384316682816, "learning_rate": 0.0001979788021382001, "loss": 0.0877, "step": 3415 }, { "epoch": 0.22083232323232324, "grad_norm": 0.07227566093206406, "learning_rate": 0.00019797743392169594, "loss": 0.0913, "step": 3416 }, { "epoch": 0.2208969696969697, "grad_norm": 0.07610689103603363, "learning_rate": 0.00019797606524698372, "loss": 0.0909, "step": 3417 }, { "epoch": 0.22096161616161616, "grad_norm": 0.06669628620147705, "learning_rate": 0.00019797469611406986, "loss": 0.082, "step": 3418 }, { "epoch": 0.22102626262626263, "grad_norm": 0.06636548787355423, "learning_rate": 0.0001979733265229607, "loss": 0.0812, "step": 3419 }, { "epoch": 0.2210909090909091, "grad_norm": 0.06676790118217468, "learning_rate": 0.00019797195647366272, "loss": 0.08, "step": 3420 }, { "epoch": 0.22115555555555555, "grad_norm": 0.07399611175060272, "learning_rate": 0.00019797058596618226, "loss": 0.0933, "step": 3421 }, { "epoch": 0.221220202020202, "grad_norm": 0.07517257332801819, "learning_rate": 0.00019796921500052575, "loss": 0.1008, "step": 3422 }, { "epoch": 0.22128484848484847, "grad_norm": 0.0758170410990715, "learning_rate": 0.00019796784357669966, "loss": 0.0977, "step": 3423 }, { "epoch": 0.22134949494949496, "grad_norm": 0.07462489604949951, "learning_rate": 0.00019796647169471033, "loss": 0.0963, "step": 3424 }, { "epoch": 0.22134949494949496, "eval_bleu": 15.5883034251279, "eval_loss": 0.09513704478740692, "eval_runtime": 2.7633, "eval_samples_per_second": 11.58, "eval_steps_per_second": 1.448, "step": 3424 }, { "epoch": 0.22141414141414142, "grad_norm": 0.06577882915735245, "learning_rate": 0.0001979650993545642, "loss": 0.0867, "step": 3425 }, { "epoch": 0.22147878787878789, "grad_norm": 0.07720796763896942, "learning_rate": 0.00019796372655626768, "loss": 0.0954, "step": 3426 }, { "epoch": 0.22154343434343435, 
"grad_norm": 0.06880860030651093, "learning_rate": 0.00019796235329982715, "loss": 0.0837, "step": 3427 }, { "epoch": 0.2216080808080808, "grad_norm": 0.07525089383125305, "learning_rate": 0.00019796097958524916, "loss": 0.08, "step": 3428 }, { "epoch": 0.22167272727272727, "grad_norm": 0.07998102903366089, "learning_rate": 0.00019795960541254003, "loss": 0.0945, "step": 3429 }, { "epoch": 0.22173737373737373, "grad_norm": 0.06905552744865417, "learning_rate": 0.00019795823078170617, "loss": 0.0772, "step": 3430 }, { "epoch": 0.2218020202020202, "grad_norm": 0.06332649290561676, "learning_rate": 0.00019795685569275407, "loss": 0.0751, "step": 3431 }, { "epoch": 0.22186666666666666, "grad_norm": 0.0635744258761406, "learning_rate": 0.00019795548014569017, "loss": 0.0743, "step": 3432 }, { "epoch": 0.22193131313131312, "grad_norm": 0.07523475587368011, "learning_rate": 0.00019795410414052086, "loss": 0.0978, "step": 3433 }, { "epoch": 0.2219959595959596, "grad_norm": 0.06728078424930573, "learning_rate": 0.00019795272767725254, "loss": 0.0853, "step": 3434 }, { "epoch": 0.22206060606060607, "grad_norm": 0.0777457058429718, "learning_rate": 0.00019795135075589175, "loss": 0.102, "step": 3435 }, { "epoch": 0.22212525252525253, "grad_norm": 0.07914719730615616, "learning_rate": 0.00019794997337644485, "loss": 0.0904, "step": 3436 }, { "epoch": 0.222189898989899, "grad_norm": 0.06530313193798065, "learning_rate": 0.00019794859553891832, "loss": 0.0801, "step": 3437 }, { "epoch": 0.22225454545454545, "grad_norm": 0.06777159124612808, "learning_rate": 0.00019794721724331857, "loss": 0.0934, "step": 3438 }, { "epoch": 0.22231919191919192, "grad_norm": 0.07087704539299011, "learning_rate": 0.00019794583848965208, "loss": 0.0917, "step": 3439 }, { "epoch": 0.22238383838383838, "grad_norm": 0.07231912016868591, "learning_rate": 0.0001979444592779253, "loss": 0.0825, "step": 3440 }, { "epoch": 0.22238383838383838, "eval_bleu": 17.468634021598287, "eval_loss": 
0.09492091089487076, "eval_runtime": 2.6686, "eval_samples_per_second": 11.991, "eval_steps_per_second": 1.499, "step": 3440 }, { "epoch": 0.22244848484848484, "grad_norm": 0.09987115859985352, "learning_rate": 0.00019794307960814463, "loss": 0.1112, "step": 3441 }, { "epoch": 0.2225131313131313, "grad_norm": 0.07208310812711716, "learning_rate": 0.00019794169948031659, "loss": 0.0906, "step": 3442 }, { "epoch": 0.2225777777777778, "grad_norm": 0.07307442277669907, "learning_rate": 0.00019794031889444757, "loss": 0.0929, "step": 3443 }, { "epoch": 0.22264242424242425, "grad_norm": 0.07023077458143234, "learning_rate": 0.00019793893785054407, "loss": 0.0849, "step": 3444 }, { "epoch": 0.22270707070707071, "grad_norm": 0.07088685780763626, "learning_rate": 0.00019793755634861252, "loss": 0.0842, "step": 3445 }, { "epoch": 0.22277171717171718, "grad_norm": 0.07246281951665878, "learning_rate": 0.00019793617438865942, "loss": 0.1002, "step": 3446 }, { "epoch": 0.22283636363636364, "grad_norm": 0.07885023206472397, "learning_rate": 0.0001979347919706912, "loss": 0.0841, "step": 3447 }, { "epoch": 0.2229010101010101, "grad_norm": 0.06938537955284119, "learning_rate": 0.00019793340909471434, "loss": 0.0815, "step": 3448 }, { "epoch": 0.22296565656565656, "grad_norm": 0.07416706532239914, "learning_rate": 0.0001979320257607353, "loss": 0.101, "step": 3449 }, { "epoch": 0.22303030303030302, "grad_norm": 0.08499825745820999, "learning_rate": 0.00019793064196876054, "loss": 0.0846, "step": 3450 }, { "epoch": 0.22309494949494948, "grad_norm": 0.07244010269641876, "learning_rate": 0.00019792925771879656, "loss": 0.0869, "step": 3451 }, { "epoch": 0.22315959595959595, "grad_norm": 0.0722435936331749, "learning_rate": 0.0001979278730108498, "loss": 0.0948, "step": 3452 }, { "epoch": 0.22322424242424244, "grad_norm": 0.07287343591451645, "learning_rate": 0.00019792648784492677, "loss": 0.0945, "step": 3453 }, { "epoch": 0.2232888888888889, "grad_norm": 0.06159506365656853, 
"learning_rate": 0.0001979251022210339, "loss": 0.0763, "step": 3454 }, { "epoch": 0.22335353535353536, "grad_norm": 0.07969187200069427, "learning_rate": 0.00019792371613917774, "loss": 0.0998, "step": 3455 }, { "epoch": 0.22341818181818182, "grad_norm": 0.0770021602511406, "learning_rate": 0.0001979223295993647, "loss": 0.0828, "step": 3456 }, { "epoch": 0.22341818181818182, "eval_bleu": 13.031251092939797, "eval_loss": 0.09550341963768005, "eval_runtime": 2.8554, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 3456 }, { "epoch": 0.22348282828282828, "grad_norm": 0.0780697911977768, "learning_rate": 0.00019792094260160132, "loss": 0.0893, "step": 3457 }, { "epoch": 0.22354747474747474, "grad_norm": 0.06838620454072952, "learning_rate": 0.00019791955514589406, "loss": 0.0888, "step": 3458 }, { "epoch": 0.2236121212121212, "grad_norm": 0.07091367989778519, "learning_rate": 0.0001979181672322494, "loss": 0.0837, "step": 3459 }, { "epoch": 0.22367676767676767, "grad_norm": 0.07299026101827621, "learning_rate": 0.00019791677886067387, "loss": 0.0796, "step": 3460 }, { "epoch": 0.22374141414141413, "grad_norm": 0.07575448602437973, "learning_rate": 0.00019791539003117387, "loss": 0.0919, "step": 3461 }, { "epoch": 0.22380606060606062, "grad_norm": 0.06466193497180939, "learning_rate": 0.000197914000743756, "loss": 0.0835, "step": 3462 }, { "epoch": 0.22387070707070708, "grad_norm": 0.06787852197885513, "learning_rate": 0.00019791261099842675, "loss": 0.0918, "step": 3463 }, { "epoch": 0.22393535353535354, "grad_norm": 0.07614251971244812, "learning_rate": 0.00019791122079519256, "loss": 0.0944, "step": 3464 }, { "epoch": 0.224, "grad_norm": 0.07440026104450226, "learning_rate": 0.00019790983013405998, "loss": 0.1008, "step": 3465 }, { "epoch": 0.22406464646464647, "grad_norm": 0.07013943046331406, "learning_rate": 0.00019790843901503546, "loss": 0.0924, "step": 3466 }, { "epoch": 0.22412929292929293, "grad_norm": 0.09238637983798981, 
"learning_rate": 0.00019790704743812555, "loss": 0.1108, "step": 3467 }, { "epoch": 0.2241939393939394, "grad_norm": 0.07162702828645706, "learning_rate": 0.00019790565540333676, "loss": 0.0884, "step": 3468 }, { "epoch": 0.22425858585858585, "grad_norm": 0.07698137313127518, "learning_rate": 0.00019790426291067557, "loss": 0.1059, "step": 3469 }, { "epoch": 0.2243232323232323, "grad_norm": 0.06773233413696289, "learning_rate": 0.0001979028699601485, "loss": 0.0822, "step": 3470 }, { "epoch": 0.22438787878787878, "grad_norm": 0.07118669152259827, "learning_rate": 0.0001979014765517621, "loss": 0.0927, "step": 3471 }, { "epoch": 0.22445252525252526, "grad_norm": 0.08188159018754959, "learning_rate": 0.00019790008268552286, "loss": 0.1113, "step": 3472 }, { "epoch": 0.22445252525252526, "eval_bleu": 14.413165384938702, "eval_loss": 0.09582473337650299, "eval_runtime": 2.7649, "eval_samples_per_second": 11.574, "eval_steps_per_second": 1.447, "step": 3472 }, { "epoch": 0.22451717171717173, "grad_norm": 0.05508796125650406, "learning_rate": 0.00019789868836143728, "loss": 0.0728, "step": 3473 }, { "epoch": 0.2245818181818182, "grad_norm": 0.062334559857845306, "learning_rate": 0.0001978972935795119, "loss": 0.0807, "step": 3474 }, { "epoch": 0.22464646464646465, "grad_norm": 0.10257519036531448, "learning_rate": 0.00019789589833975324, "loss": 0.0855, "step": 3475 }, { "epoch": 0.2247111111111111, "grad_norm": 0.07012440264225006, "learning_rate": 0.00019789450264216782, "loss": 0.0894, "step": 3476 }, { "epoch": 0.22477575757575757, "grad_norm": 0.08081506192684174, "learning_rate": 0.0001978931064867622, "loss": 0.1035, "step": 3477 }, { "epoch": 0.22484040404040403, "grad_norm": 0.08611779659986496, "learning_rate": 0.00019789170987354288, "loss": 0.1049, "step": 3478 }, { "epoch": 0.2249050505050505, "grad_norm": 0.08547607809305191, "learning_rate": 0.00019789031280251638, "loss": 0.1157, "step": 3479 }, { "epoch": 0.22496969696969696, "grad_norm": 
0.07521193474531174, "learning_rate": 0.00019788891527368926, "loss": 0.0762, "step": 3480 }, { "epoch": 0.22503434343434345, "grad_norm": 0.07498076558113098, "learning_rate": 0.00019788751728706805, "loss": 0.0972, "step": 3481 }, { "epoch": 0.2250989898989899, "grad_norm": 0.07329077273607254, "learning_rate": 0.00019788611884265927, "loss": 0.0759, "step": 3482 }, { "epoch": 0.22516363636363637, "grad_norm": 0.1099148690700531, "learning_rate": 0.00019788471994046947, "loss": 0.0995, "step": 3483 }, { "epoch": 0.22522828282828283, "grad_norm": 0.06628064811229706, "learning_rate": 0.0001978833205805052, "loss": 0.0811, "step": 3484 }, { "epoch": 0.2252929292929293, "grad_norm": 0.07237538695335388, "learning_rate": 0.00019788192076277298, "loss": 0.0932, "step": 3485 }, { "epoch": 0.22535757575757576, "grad_norm": 0.0674620047211647, "learning_rate": 0.00019788052048727944, "loss": 0.0847, "step": 3486 }, { "epoch": 0.22542222222222222, "grad_norm": 0.07274454087018967, "learning_rate": 0.00019787911975403103, "loss": 0.0928, "step": 3487 }, { "epoch": 0.22548686868686868, "grad_norm": 0.09663775563240051, "learning_rate": 0.0001978777185630343, "loss": 0.1234, "step": 3488 }, { "epoch": 0.22548686868686868, "eval_bleu": 11.761584550655929, "eval_loss": 0.09453203529119492, "eval_runtime": 2.7702, "eval_samples_per_second": 11.551, "eval_steps_per_second": 1.444, "step": 3488 }, { "epoch": 0.22555151515151514, "grad_norm": 0.08604975789785385, "learning_rate": 0.00019787631691429587, "loss": 0.108, "step": 3489 }, { "epoch": 0.2256161616161616, "grad_norm": 0.07414434850215912, "learning_rate": 0.00019787491480782225, "loss": 0.0984, "step": 3490 }, { "epoch": 0.2256808080808081, "grad_norm": 0.07036276161670685, "learning_rate": 0.00019787351224362002, "loss": 0.093, "step": 3491 }, { "epoch": 0.22574545454545455, "grad_norm": 0.08143153041601181, "learning_rate": 0.0001978721092216957, "loss": 0.0881, "step": 3492 }, { "epoch": 0.22581010101010102, 
"grad_norm": 0.07141514122486115, "learning_rate": 0.00019787070574205592, "loss": 0.0873, "step": 3493 }, { "epoch": 0.22587474747474748, "grad_norm": 0.08046414703130722, "learning_rate": 0.00019786930180470721, "loss": 0.1154, "step": 3494 }, { "epoch": 0.22593939393939394, "grad_norm": 0.0730963721871376, "learning_rate": 0.00019786789740965612, "loss": 0.0968, "step": 3495 }, { "epoch": 0.2260040404040404, "grad_norm": 0.07489965856075287, "learning_rate": 0.0001978664925569092, "loss": 0.1047, "step": 3496 }, { "epoch": 0.22606868686868686, "grad_norm": 0.06927904486656189, "learning_rate": 0.00019786508724647307, "loss": 0.0713, "step": 3497 }, { "epoch": 0.22613333333333333, "grad_norm": 0.08132988959550858, "learning_rate": 0.00019786368147835427, "loss": 0.0933, "step": 3498 }, { "epoch": 0.2261979797979798, "grad_norm": 0.06674129515886307, "learning_rate": 0.00019786227525255942, "loss": 0.0937, "step": 3499 }, { "epoch": 0.22626262626262628, "grad_norm": 0.06554894894361496, "learning_rate": 0.00019786086856909502, "loss": 0.0855, "step": 3500 }, { "epoch": 0.22632727272727274, "grad_norm": 0.06557810306549072, "learning_rate": 0.0001978594614279677, "loss": 0.0863, "step": 3501 }, { "epoch": 0.2263919191919192, "grad_norm": 0.07035014033317566, "learning_rate": 0.00019785805382918406, "loss": 0.0884, "step": 3502 }, { "epoch": 0.22645656565656566, "grad_norm": 0.08184259384870529, "learning_rate": 0.00019785664577275062, "loss": 0.1097, "step": 3503 }, { "epoch": 0.22652121212121212, "grad_norm": 0.07167940586805344, "learning_rate": 0.000197855237258674, "loss": 0.0746, "step": 3504 }, { "epoch": 0.22652121212121212, "eval_bleu": 9.913851586833534, "eval_loss": 0.09565990418195724, "eval_runtime": 2.719, "eval_samples_per_second": 11.769, "eval_steps_per_second": 1.471, "step": 3504 }, { "epoch": 0.22658585858585858, "grad_norm": 0.0847000852227211, "learning_rate": 0.0001978538282869608, "loss": 0.1167, "step": 3505 }, { "epoch": 
0.22665050505050505, "grad_norm": 0.09329701215028763, "learning_rate": 0.00019785241885761758, "loss": 0.0818, "step": 3506 }, { "epoch": 0.2267151515151515, "grad_norm": 0.06925345957279205, "learning_rate": 0.000197851008970651, "loss": 0.089, "step": 3507 }, { "epoch": 0.22677979797979797, "grad_norm": 0.07653499394655228, "learning_rate": 0.00019784959862606752, "loss": 0.0968, "step": 3508 }, { "epoch": 0.22684444444444443, "grad_norm": 0.06895699352025986, "learning_rate": 0.00019784818782387387, "loss": 0.0775, "step": 3509 }, { "epoch": 0.22690909090909092, "grad_norm": 0.07260050624608994, "learning_rate": 0.00019784677656407658, "loss": 0.0965, "step": 3510 }, { "epoch": 0.22697373737373738, "grad_norm": 0.11276265978813171, "learning_rate": 0.00019784536484668226, "loss": 0.0853, "step": 3511 }, { "epoch": 0.22703838383838384, "grad_norm": 0.08519134670495987, "learning_rate": 0.00019784395267169748, "loss": 0.1117, "step": 3512 }, { "epoch": 0.2271030303030303, "grad_norm": 0.0735049694776535, "learning_rate": 0.00019784254003912892, "loss": 0.0968, "step": 3513 }, { "epoch": 0.22716767676767677, "grad_norm": 0.07618353515863419, "learning_rate": 0.00019784112694898315, "loss": 0.0846, "step": 3514 }, { "epoch": 0.22723232323232323, "grad_norm": 0.0764879509806633, "learning_rate": 0.00019783971340126677, "loss": 0.1017, "step": 3515 }, { "epoch": 0.2272969696969697, "grad_norm": 0.05671005696058273, "learning_rate": 0.0001978382993959864, "loss": 0.0709, "step": 3516 }, { "epoch": 0.22736161616161615, "grad_norm": 0.0625818520784378, "learning_rate": 0.00019783688493314863, "loss": 0.0862, "step": 3517 }, { "epoch": 0.22742626262626262, "grad_norm": 0.07804487645626068, "learning_rate": 0.0001978354700127601, "loss": 0.102, "step": 3518 }, { "epoch": 0.2274909090909091, "grad_norm": 0.06514550745487213, "learning_rate": 0.00019783405463482743, "loss": 0.0847, "step": 3519 }, { "epoch": 0.22755555555555557, "grad_norm": 0.056930817663669586, 
"learning_rate": 0.00019783263879935721, "loss": 0.0708, "step": 3520 }, { "epoch": 0.22755555555555557, "eval_bleu": 15.892232265484392, "eval_loss": 0.09597373008728027, "eval_runtime": 2.8643, "eval_samples_per_second": 11.172, "eval_steps_per_second": 1.396, "step": 3520 }, { "epoch": 0.22762020202020203, "grad_norm": 0.07230743020772934, "learning_rate": 0.0001978312225063561, "loss": 0.0854, "step": 3521 }, { "epoch": 0.2276848484848485, "grad_norm": 0.08224190771579742, "learning_rate": 0.0001978298057558307, "loss": 0.1051, "step": 3522 }, { "epoch": 0.22774949494949495, "grad_norm": 0.07027490437030792, "learning_rate": 0.00019782838854778766, "loss": 0.0966, "step": 3523 }, { "epoch": 0.2278141414141414, "grad_norm": 0.0777537077665329, "learning_rate": 0.00019782697088223356, "loss": 0.0929, "step": 3524 }, { "epoch": 0.22787878787878788, "grad_norm": 0.07830680161714554, "learning_rate": 0.00019782555275917507, "loss": 0.0973, "step": 3525 }, { "epoch": 0.22794343434343434, "grad_norm": 0.06828243285417557, "learning_rate": 0.0001978241341786188, "loss": 0.0876, "step": 3526 }, { "epoch": 0.2280080808080808, "grad_norm": 0.07176431268453598, "learning_rate": 0.00019782271514057139, "loss": 0.0926, "step": 3527 }, { "epoch": 0.22807272727272726, "grad_norm": 0.059330422431230545, "learning_rate": 0.0001978212956450395, "loss": 0.077, "step": 3528 }, { "epoch": 0.22813737373737375, "grad_norm": 0.067512147128582, "learning_rate": 0.00019781987569202972, "loss": 0.0959, "step": 3529 }, { "epoch": 0.2282020202020202, "grad_norm": 0.09087305516004562, "learning_rate": 0.00019781845528154873, "loss": 0.1233, "step": 3530 }, { "epoch": 0.22826666666666667, "grad_norm": 0.065886490046978, "learning_rate": 0.00019781703441360319, "loss": 0.0725, "step": 3531 }, { "epoch": 0.22833131313131314, "grad_norm": 0.06722399592399597, "learning_rate": 0.0001978156130881997, "loss": 0.0865, "step": 3532 }, { "epoch": 0.2283959595959596, "grad_norm": 0.060964904725551605, 
"learning_rate": 0.00019781419130534492, "loss": 0.0789, "step": 3533 }, { "epoch": 0.22846060606060606, "grad_norm": 0.11481403559446335, "learning_rate": 0.00019781276906504554, "loss": 0.1105, "step": 3534 }, { "epoch": 0.22852525252525252, "grad_norm": 0.07899876683950424, "learning_rate": 0.00019781134636730815, "loss": 0.0993, "step": 3535 }, { "epoch": 0.22858989898989898, "grad_norm": 0.06887764483690262, "learning_rate": 0.0001978099232121394, "loss": 0.0903, "step": 3536 }, { "epoch": 0.22858989898989898, "eval_bleu": 15.616825101250626, "eval_loss": 0.09498222172260284, "eval_runtime": 2.6981, "eval_samples_per_second": 11.86, "eval_steps_per_second": 1.483, "step": 3536 }, { "epoch": 0.22865454545454544, "grad_norm": 0.06531254202127457, "learning_rate": 0.000197808499599546, "loss": 0.0812, "step": 3537 }, { "epoch": 0.2287191919191919, "grad_norm": 0.0698668360710144, "learning_rate": 0.00019780707552953457, "loss": 0.0775, "step": 3538 }, { "epoch": 0.2287838383838384, "grad_norm": 0.06194218993186951, "learning_rate": 0.0001978056510021118, "loss": 0.0815, "step": 3539 }, { "epoch": 0.22884848484848486, "grad_norm": 0.0733053982257843, "learning_rate": 0.00019780422601728433, "loss": 0.0805, "step": 3540 }, { "epoch": 0.22891313131313132, "grad_norm": 0.07327724993228912, "learning_rate": 0.00019780280057505882, "loss": 0.0808, "step": 3541 }, { "epoch": 0.22897777777777778, "grad_norm": 0.07861881703138351, "learning_rate": 0.00019780137467544196, "loss": 0.0957, "step": 3542 }, { "epoch": 0.22904242424242424, "grad_norm": 0.08001697063446045, "learning_rate": 0.0001977999483184404, "loss": 0.0867, "step": 3543 }, { "epoch": 0.2291070707070707, "grad_norm": 0.07066141068935394, "learning_rate": 0.0001977985215040608, "loss": 0.086, "step": 3544 }, { "epoch": 0.22917171717171717, "grad_norm": 0.07036232948303223, "learning_rate": 0.00019779709423230985, "loss": 0.0858, "step": 3545 }, { "epoch": 0.22923636363636363, "grad_norm": 0.07283928990364075, 
"learning_rate": 0.00019779566650319423, "loss": 0.0904, "step": 3546 }, { "epoch": 0.2293010101010101, "grad_norm": 0.08753010630607605, "learning_rate": 0.0001977942383167206, "loss": 0.0975, "step": 3547 }, { "epoch": 0.22936565656565658, "grad_norm": 0.07472671568393707, "learning_rate": 0.00019779280967289564, "loss": 0.0958, "step": 3548 }, { "epoch": 0.22943030303030304, "grad_norm": 0.07429744303226471, "learning_rate": 0.00019779138057172605, "loss": 0.0957, "step": 3549 }, { "epoch": 0.2294949494949495, "grad_norm": 0.07320588827133179, "learning_rate": 0.0001977899510132185, "loss": 0.0815, "step": 3550 }, { "epoch": 0.22955959595959596, "grad_norm": 0.06866639107465744, "learning_rate": 0.0001977885209973797, "loss": 0.0889, "step": 3551 }, { "epoch": 0.22962424242424243, "grad_norm": 0.08037056773900986, "learning_rate": 0.0001977870905242163, "loss": 0.102, "step": 3552 }, { "epoch": 0.22962424242424243, "eval_bleu": 13.410676053624968, "eval_loss": 0.09431174397468567, "eval_runtime": 2.827, "eval_samples_per_second": 11.32, "eval_steps_per_second": 1.415, "step": 3552 }, { "epoch": 0.2296888888888889, "grad_norm": 0.07771310955286026, "learning_rate": 0.00019778565959373498, "loss": 0.0973, "step": 3553 }, { "epoch": 0.22975353535353535, "grad_norm": 0.06742063909769058, "learning_rate": 0.00019778422820594248, "loss": 0.0853, "step": 3554 }, { "epoch": 0.2298181818181818, "grad_norm": 0.06602583080530167, "learning_rate": 0.00019778279636084548, "loss": 0.1, "step": 3555 }, { "epoch": 0.22988282828282827, "grad_norm": 0.08072684705257416, "learning_rate": 0.00019778136405845066, "loss": 0.1173, "step": 3556 }, { "epoch": 0.22994747474747473, "grad_norm": 0.06625766307115555, "learning_rate": 0.0001977799312987647, "loss": 0.0845, "step": 3557 }, { "epoch": 0.23001212121212122, "grad_norm": 0.08137887716293335, "learning_rate": 0.00019777849808179436, "loss": 0.1027, "step": 3558 }, { "epoch": 0.23007676767676769, "grad_norm": 0.07646719366312027, 
"learning_rate": 0.0001977770644075463, "loss": 0.1111, "step": 3559 }, { "epoch": 0.23014141414141415, "grad_norm": 0.08114475756883621, "learning_rate": 0.00019777563027602725, "loss": 0.0729, "step": 3560 }, { "epoch": 0.2302060606060606, "grad_norm": 0.06272187829017639, "learning_rate": 0.00019777419568724387, "loss": 0.0854, "step": 3561 }, { "epoch": 0.23027070707070707, "grad_norm": 0.09199637174606323, "learning_rate": 0.00019777276064120294, "loss": 0.1105, "step": 3562 }, { "epoch": 0.23033535353535353, "grad_norm": 0.06557215750217438, "learning_rate": 0.0001977713251379111, "loss": 0.0832, "step": 3563 }, { "epoch": 0.2304, "grad_norm": 0.07381037622690201, "learning_rate": 0.0001977698891773751, "loss": 0.0949, "step": 3564 }, { "epoch": 0.23046464646464646, "grad_norm": 0.06555640697479248, "learning_rate": 0.00019776845275960164, "loss": 0.0838, "step": 3565 }, { "epoch": 0.23052929292929292, "grad_norm": 0.06982017308473587, "learning_rate": 0.00019776701588459746, "loss": 0.0943, "step": 3566 }, { "epoch": 0.2305939393939394, "grad_norm": 0.07609958946704865, "learning_rate": 0.0001977655785523693, "loss": 0.1074, "step": 3567 }, { "epoch": 0.23065858585858587, "grad_norm": 0.07699093222618103, "learning_rate": 0.0001977641407629238, "loss": 0.1091, "step": 3568 }, { "epoch": 0.23065858585858587, "eval_bleu": 14.150156222515426, "eval_loss": 0.093361034989357, "eval_runtime": 2.8557, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 3568 }, { "epoch": 0.23072323232323233, "grad_norm": 0.08846042305231094, "learning_rate": 0.00019776270251626774, "loss": 0.1029, "step": 3569 }, { "epoch": 0.2307878787878788, "grad_norm": 0.06828977912664413, "learning_rate": 0.00019776126381240787, "loss": 0.0937, "step": 3570 }, { "epoch": 0.23085252525252525, "grad_norm": 0.07474086433649063, "learning_rate": 0.00019775982465135086, "loss": 0.0944, "step": 3571 }, { "epoch": 0.23091717171717172, "grad_norm": 0.06815038621425629, 
"learning_rate": 0.00019775838503310346, "loss": 0.0901, "step": 3572 }, { "epoch": 0.23098181818181818, "grad_norm": 0.06756641715765, "learning_rate": 0.00019775694495767246, "loss": 0.0817, "step": 3573 }, { "epoch": 0.23104646464646464, "grad_norm": 0.07005354762077332, "learning_rate": 0.00019775550442506452, "loss": 0.0896, "step": 3574 }, { "epoch": 0.2311111111111111, "grad_norm": 0.08190398663282394, "learning_rate": 0.0001977540634352864, "loss": 0.0967, "step": 3575 }, { "epoch": 0.23117575757575756, "grad_norm": 0.07702455669641495, "learning_rate": 0.00019775262198834482, "loss": 0.0904, "step": 3576 }, { "epoch": 0.23124040404040405, "grad_norm": 0.08336713910102844, "learning_rate": 0.00019775118008424656, "loss": 0.0997, "step": 3577 }, { "epoch": 0.2313050505050505, "grad_norm": 0.07796916365623474, "learning_rate": 0.00019774973772299835, "loss": 0.103, "step": 3578 }, { "epoch": 0.23136969696969698, "grad_norm": 0.07120048254728317, "learning_rate": 0.00019774829490460695, "loss": 0.0834, "step": 3579 }, { "epoch": 0.23143434343434344, "grad_norm": 0.07230277359485626, "learning_rate": 0.00019774685162907908, "loss": 0.0942, "step": 3580 }, { "epoch": 0.2314989898989899, "grad_norm": 0.06850824505090714, "learning_rate": 0.00019774540789642148, "loss": 0.0824, "step": 3581 }, { "epoch": 0.23156363636363636, "grad_norm": 0.0780918151140213, "learning_rate": 0.00019774396370664094, "loss": 0.0979, "step": 3582 }, { "epoch": 0.23162828282828282, "grad_norm": 0.07548228651285172, "learning_rate": 0.00019774251905974418, "loss": 0.0837, "step": 3583 }, { "epoch": 0.23169292929292928, "grad_norm": 0.06877511739730835, "learning_rate": 0.000197741073955738, "loss": 0.0901, "step": 3584 }, { "epoch": 0.23169292929292928, "eval_bleu": 11.340700999418932, "eval_loss": 0.09412933886051178, "eval_runtime": 2.711, "eval_samples_per_second": 11.804, "eval_steps_per_second": 1.475, "step": 3584 }, { "epoch": 0.23175757575757575, "grad_norm": 
0.10319288820028305, "learning_rate": 0.0001977396283946291, "loss": 0.1133, "step": 3585 }, { "epoch": 0.23182222222222224, "grad_norm": 0.09334880858659744, "learning_rate": 0.0001977381823764243, "loss": 0.0972, "step": 3586 }, { "epoch": 0.2318868686868687, "grad_norm": 0.0849304124712944, "learning_rate": 0.00019773673590113032, "loss": 0.1144, "step": 3587 }, { "epoch": 0.23195151515151516, "grad_norm": 0.07076696306467056, "learning_rate": 0.00019773528896875392, "loss": 0.0971, "step": 3588 }, { "epoch": 0.23201616161616162, "grad_norm": 0.07317604124546051, "learning_rate": 0.0001977338415793019, "loss": 0.0904, "step": 3589 }, { "epoch": 0.23208080808080808, "grad_norm": 0.06971902400255203, "learning_rate": 0.000197732393732781, "loss": 0.0863, "step": 3590 }, { "epoch": 0.23214545454545454, "grad_norm": 0.08120656758546829, "learning_rate": 0.000197730945429198, "loss": 0.105, "step": 3591 }, { "epoch": 0.232210101010101, "grad_norm": 0.0819096565246582, "learning_rate": 0.00019772949666855972, "loss": 0.0994, "step": 3592 }, { "epoch": 0.23227474747474747, "grad_norm": 0.060589749366045, "learning_rate": 0.00019772804745087284, "loss": 0.0834, "step": 3593 }, { "epoch": 0.23233939393939393, "grad_norm": 0.07656625658273697, "learning_rate": 0.00019772659777614421, "loss": 0.0943, "step": 3594 }, { "epoch": 0.2324040404040404, "grad_norm": 0.06892391294240952, "learning_rate": 0.0001977251476443806, "loss": 0.0896, "step": 3595 }, { "epoch": 0.23246868686868688, "grad_norm": 0.0657782256603241, "learning_rate": 0.00019772369705558878, "loss": 0.0866, "step": 3596 }, { "epoch": 0.23253333333333334, "grad_norm": 0.08330164104700089, "learning_rate": 0.00019772224600977554, "loss": 0.1073, "step": 3597 }, { "epoch": 0.2325979797979798, "grad_norm": 0.06975338608026505, "learning_rate": 0.00019772079450694764, "loss": 0.0906, "step": 3598 }, { "epoch": 0.23266262626262627, "grad_norm": 0.06488228589296341, "learning_rate": 0.00019771934254711191, "loss": 
0.0796, "step": 3599 }, { "epoch": 0.23272727272727273, "grad_norm": 0.0792783722281456, "learning_rate": 0.0001977178901302751, "loss": 0.0935, "step": 3600 }, { "epoch": 0.23272727272727273, "eval_bleu": 14.084225994172748, "eval_loss": 0.0942840650677681, "eval_runtime": 2.8737, "eval_samples_per_second": 11.136, "eval_steps_per_second": 1.392, "step": 3600 }, { "epoch": 0.2327919191919192, "grad_norm": 0.07156125456094742, "learning_rate": 0.00019771643725644404, "loss": 0.0955, "step": 3601 }, { "epoch": 0.23285656565656565, "grad_norm": 0.09084650874137878, "learning_rate": 0.0001977149839256255, "loss": 0.0954, "step": 3602 }, { "epoch": 0.2329212121212121, "grad_norm": 0.08035772293806076, "learning_rate": 0.0001977135301378263, "loss": 0.0873, "step": 3603 }, { "epoch": 0.23298585858585857, "grad_norm": 0.06201196834445, "learning_rate": 0.00019771207589305322, "loss": 0.0712, "step": 3604 }, { "epoch": 0.23305050505050506, "grad_norm": 0.0698280930519104, "learning_rate": 0.00019771062119131302, "loss": 0.0814, "step": 3605 }, { "epoch": 0.23311515151515153, "grad_norm": 0.06953717023134232, "learning_rate": 0.0001977091660326126, "loss": 0.0907, "step": 3606 }, { "epoch": 0.233179797979798, "grad_norm": 0.0626969188451767, "learning_rate": 0.00019770771041695867, "loss": 0.0811, "step": 3607 }, { "epoch": 0.23324444444444445, "grad_norm": 0.07701700180768967, "learning_rate": 0.00019770625434435812, "loss": 0.0972, "step": 3608 }, { "epoch": 0.2333090909090909, "grad_norm": 0.06883645802736282, "learning_rate": 0.0001977047978148177, "loss": 0.0827, "step": 3609 }, { "epoch": 0.23337373737373737, "grad_norm": 0.0721801370382309, "learning_rate": 0.00019770334082834423, "loss": 0.1051, "step": 3610 }, { "epoch": 0.23343838383838383, "grad_norm": 0.07021838426589966, "learning_rate": 0.00019770188338494456, "loss": 0.0928, "step": 3611 }, { "epoch": 0.2335030303030303, "grad_norm": 0.07905546575784683, "learning_rate": 0.00019770042548462549, "loss": 
0.092, "step": 3612 }, { "epoch": 0.23356767676767676, "grad_norm": 0.06617802381515503, "learning_rate": 0.0001976989671273938, "loss": 0.0774, "step": 3613 }, { "epoch": 0.23363232323232322, "grad_norm": 0.06899509578943253, "learning_rate": 0.00019769750831325636, "loss": 0.0915, "step": 3614 }, { "epoch": 0.2336969696969697, "grad_norm": 0.0789894387125969, "learning_rate": 0.00019769604904221994, "loss": 0.0944, "step": 3615 }, { "epoch": 0.23376161616161617, "grad_norm": 0.0746842548251152, "learning_rate": 0.00019769458931429143, "loss": 0.0967, "step": 3616 }, { "epoch": 0.23376161616161617, "eval_bleu": 11.203320568217007, "eval_loss": 0.09357260167598724, "eval_runtime": 2.7282, "eval_samples_per_second": 11.729, "eval_steps_per_second": 1.466, "step": 3616 }, { "epoch": 0.23382626262626263, "grad_norm": 0.08139018714427948, "learning_rate": 0.0001976931291294776, "loss": 0.0945, "step": 3617 }, { "epoch": 0.2338909090909091, "grad_norm": 0.06343588978052139, "learning_rate": 0.0001976916684877853, "loss": 0.0885, "step": 3618 }, { "epoch": 0.23395555555555556, "grad_norm": 0.08190052956342697, "learning_rate": 0.00019769020738922137, "loss": 0.0884, "step": 3619 }, { "epoch": 0.23402020202020202, "grad_norm": 0.08500027656555176, "learning_rate": 0.00019768874583379266, "loss": 0.1037, "step": 3620 }, { "epoch": 0.23408484848484848, "grad_norm": 0.07637444138526917, "learning_rate": 0.00019768728382150595, "loss": 0.0916, "step": 3621 }, { "epoch": 0.23414949494949494, "grad_norm": 0.06660114973783493, "learning_rate": 0.00019768582135236813, "loss": 0.0768, "step": 3622 }, { "epoch": 0.2342141414141414, "grad_norm": 0.06850486248731613, "learning_rate": 0.000197684358426386, "loss": 0.0915, "step": 3623 }, { "epoch": 0.2342787878787879, "grad_norm": 0.06507407873868942, "learning_rate": 0.00019768289504356644, "loss": 0.0856, "step": 3624 }, { "epoch": 0.23434343434343435, "grad_norm": 0.06116962805390358, "learning_rate": 0.00019768143120391625, 
"loss": 0.0775, "step": 3625 }, { "epoch": 0.23440808080808082, "grad_norm": 0.07707644999027252, "learning_rate": 0.0001976799669074423, "loss": 0.0984, "step": 3626 }, { "epoch": 0.23447272727272728, "grad_norm": 0.07304911315441132, "learning_rate": 0.00019767850215415144, "loss": 0.094, "step": 3627 }, { "epoch": 0.23453737373737374, "grad_norm": 0.08153322339057922, "learning_rate": 0.00019767703694405055, "loss": 0.1062, "step": 3628 }, { "epoch": 0.2346020202020202, "grad_norm": 0.07076204568147659, "learning_rate": 0.0001976755712771464, "loss": 0.0979, "step": 3629 }, { "epoch": 0.23466666666666666, "grad_norm": 0.06623207032680511, "learning_rate": 0.00019767410515344593, "loss": 0.0854, "step": 3630 }, { "epoch": 0.23473131313131312, "grad_norm": 0.07027088105678558, "learning_rate": 0.00019767263857295596, "loss": 0.0829, "step": 3631 }, { "epoch": 0.23479595959595959, "grad_norm": 0.08217540383338928, "learning_rate": 0.00019767117153568332, "loss": 0.0844, "step": 3632 }, { "epoch": 0.23479595959595959, "eval_bleu": 13.975432203529325, "eval_loss": 0.09312742948532104, "eval_runtime": 2.7876, "eval_samples_per_second": 11.48, "eval_steps_per_second": 1.435, "step": 3632 }, { "epoch": 0.23486060606060605, "grad_norm": 0.07628919929265976, "learning_rate": 0.00019766970404163492, "loss": 0.097, "step": 3633 }, { "epoch": 0.23492525252525254, "grad_norm": 0.1570323258638382, "learning_rate": 0.00019766823609081762, "loss": 0.0788, "step": 3634 }, { "epoch": 0.234989898989899, "grad_norm": 0.07840665429830551, "learning_rate": 0.00019766676768323823, "loss": 0.0859, "step": 3635 }, { "epoch": 0.23505454545454546, "grad_norm": 0.07596184313297272, "learning_rate": 0.00019766529881890368, "loss": 0.1035, "step": 3636 }, { "epoch": 0.23511919191919192, "grad_norm": 0.08474904298782349, "learning_rate": 0.0001976638294978208, "loss": 0.0985, "step": 3637 }, { "epoch": 0.23518383838383838, "grad_norm": 0.07941339910030365, "learning_rate": 
0.0001976623597199965, "loss": 0.1058, "step": 3638 }, { "epoch": 0.23524848484848485, "grad_norm": 0.0766904279589653, "learning_rate": 0.00019766088948543762, "loss": 0.0946, "step": 3639 }, { "epoch": 0.2353131313131313, "grad_norm": 0.06431964784860611, "learning_rate": 0.00019765941879415104, "loss": 0.0703, "step": 3640 }, { "epoch": 0.23537777777777777, "grad_norm": 0.07155662775039673, "learning_rate": 0.00019765794764614364, "loss": 0.0867, "step": 3641 }, { "epoch": 0.23544242424242423, "grad_norm": 0.08403602987527847, "learning_rate": 0.0001976564760414223, "loss": 0.0915, "step": 3642 }, { "epoch": 0.23550707070707072, "grad_norm": 0.1112782284617424, "learning_rate": 0.00019765500397999394, "loss": 0.0886, "step": 3643 }, { "epoch": 0.23557171717171718, "grad_norm": 0.08049486577510834, "learning_rate": 0.00019765353146186535, "loss": 0.1019, "step": 3644 }, { "epoch": 0.23563636363636364, "grad_norm": 0.059750527143478394, "learning_rate": 0.00019765205848704353, "loss": 0.0625, "step": 3645 }, { "epoch": 0.2357010101010101, "grad_norm": 0.0703006312251091, "learning_rate": 0.00019765058505553527, "loss": 0.0887, "step": 3646 }, { "epoch": 0.23576565656565657, "grad_norm": 0.07186093926429749, "learning_rate": 0.00019764911116734756, "loss": 0.1051, "step": 3647 }, { "epoch": 0.23583030303030303, "grad_norm": 0.08068804442882538, "learning_rate": 0.00019764763682248723, "loss": 0.1087, "step": 3648 }, { "epoch": 0.23583030303030303, "eval_bleu": 12.768984312443392, "eval_loss": 0.09455497562885284, "eval_runtime": 2.661, "eval_samples_per_second": 12.025, "eval_steps_per_second": 1.503, "step": 3648 }, { "epoch": 0.2358949494949495, "grad_norm": 0.07385983318090439, "learning_rate": 0.00019764616202096115, "loss": 0.0876, "step": 3649 }, { "epoch": 0.23595959595959595, "grad_norm": 0.07834664732217789, "learning_rate": 0.00019764468676277628, "loss": 0.0959, "step": 3650 }, { "epoch": 0.23602424242424241, "grad_norm": 0.06544079631567001, 
"learning_rate": 0.0001976432110479395, "loss": 0.0856, "step": 3651 }, { "epoch": 0.23608888888888888, "grad_norm": 0.06250978261232376, "learning_rate": 0.00019764173487645765, "loss": 0.0819, "step": 3652 }, { "epoch": 0.23615353535353537, "grad_norm": 0.0783507451415062, "learning_rate": 0.00019764025824833774, "loss": 0.0966, "step": 3653 }, { "epoch": 0.23621818181818183, "grad_norm": 0.06845230609178543, "learning_rate": 0.0001976387811635866, "loss": 0.0834, "step": 3654 }, { "epoch": 0.2362828282828283, "grad_norm": 0.06890866160392761, "learning_rate": 0.00019763730362221116, "loss": 0.0819, "step": 3655 }, { "epoch": 0.23634747474747475, "grad_norm": 0.07947429269552231, "learning_rate": 0.0001976358256242183, "loss": 0.1124, "step": 3656 }, { "epoch": 0.2364121212121212, "grad_norm": 0.0641227439045906, "learning_rate": 0.00019763434716961502, "loss": 0.078, "step": 3657 }, { "epoch": 0.23647676767676767, "grad_norm": 0.08110956102609634, "learning_rate": 0.00019763286825840814, "loss": 0.097, "step": 3658 }, { "epoch": 0.23654141414141414, "grad_norm": 0.06283240020275116, "learning_rate": 0.0001976313888906046, "loss": 0.0858, "step": 3659 }, { "epoch": 0.2366060606060606, "grad_norm": 0.06079525500535965, "learning_rate": 0.00019762990906621136, "loss": 0.0758, "step": 3660 }, { "epoch": 0.23667070707070706, "grad_norm": 0.06878741830587387, "learning_rate": 0.00019762842878523528, "loss": 0.0933, "step": 3661 }, { "epoch": 0.23673535353535355, "grad_norm": 0.07569827884435654, "learning_rate": 0.00019762694804768333, "loss": 0.1065, "step": 3662 }, { "epoch": 0.2368, "grad_norm": 0.07727054506540298, "learning_rate": 0.00019762546685356245, "loss": 0.0873, "step": 3663 }, { "epoch": 0.23686464646464647, "grad_norm": 0.06651638448238373, "learning_rate": 0.0001976239852028795, "loss": 0.0828, "step": 3664 }, { "epoch": 0.23686464646464647, "eval_bleu": 12.938585454638043, "eval_loss": 0.09288673102855682, "eval_runtime": 2.8034, 
"eval_samples_per_second": 11.415, "eval_steps_per_second": 1.427, "step": 3664 }, { "epoch": 0.23692929292929293, "grad_norm": 0.08869357407093048, "learning_rate": 0.00019762250309564145, "loss": 0.0954, "step": 3665 }, { "epoch": 0.2369939393939394, "grad_norm": 0.07288382947444916, "learning_rate": 0.00019762102053185525, "loss": 0.0933, "step": 3666 }, { "epoch": 0.23705858585858586, "grad_norm": 0.07795406132936478, "learning_rate": 0.00019761953751152778, "loss": 0.115, "step": 3667 }, { "epoch": 0.23712323232323232, "grad_norm": 0.074552021920681, "learning_rate": 0.00019761805403466603, "loss": 0.1013, "step": 3668 }, { "epoch": 0.23718787878787878, "grad_norm": 0.056436918675899506, "learning_rate": 0.00019761657010127688, "loss": 0.0777, "step": 3669 }, { "epoch": 0.23725252525252524, "grad_norm": 0.07594527304172516, "learning_rate": 0.0001976150857113673, "loss": 0.1091, "step": 3670 }, { "epoch": 0.2373171717171717, "grad_norm": 0.07751249521970749, "learning_rate": 0.00019761360086494427, "loss": 0.0935, "step": 3671 }, { "epoch": 0.2373818181818182, "grad_norm": 0.07881151884794235, "learning_rate": 0.0001976121155620147, "loss": 0.0835, "step": 3672 }, { "epoch": 0.23744646464646466, "grad_norm": 0.06939269602298737, "learning_rate": 0.00019761062980258552, "loss": 0.0926, "step": 3673 }, { "epoch": 0.23751111111111112, "grad_norm": 0.06641973555088043, "learning_rate": 0.0001976091435866637, "loss": 0.0899, "step": 3674 }, { "epoch": 0.23757575757575758, "grad_norm": 0.06272200495004654, "learning_rate": 0.00019760765691425615, "loss": 0.0891, "step": 3675 }, { "epoch": 0.23764040404040404, "grad_norm": 0.08037877827882767, "learning_rate": 0.0001976061697853699, "loss": 0.0865, "step": 3676 }, { "epoch": 0.2377050505050505, "grad_norm": 0.09082460403442383, "learning_rate": 0.00019760468220001185, "loss": 0.0827, "step": 3677 }, { "epoch": 0.23776969696969696, "grad_norm": 0.07090919464826584, "learning_rate": 0.00019760319415818898, "loss": 
0.0874, "step": 3678 }, { "epoch": 0.23783434343434343, "grad_norm": 0.0651535913348198, "learning_rate": 0.0001976017056599082, "loss": 0.0967, "step": 3679 }, { "epoch": 0.2378989898989899, "grad_norm": 0.06856171786785126, "learning_rate": 0.00019760021670517651, "loss": 0.0982, "step": 3680 }, { "epoch": 0.2378989898989899, "eval_bleu": 12.430582261603249, "eval_loss": 0.09336743503808975, "eval_runtime": 2.7156, "eval_samples_per_second": 11.784, "eval_steps_per_second": 1.473, "step": 3680 }, { "epoch": 0.23796363636363638, "grad_norm": 0.07141885161399841, "learning_rate": 0.00019759872729400093, "loss": 0.1056, "step": 3681 }, { "epoch": 0.23802828282828284, "grad_norm": 0.07335217297077179, "learning_rate": 0.00019759723742638832, "loss": 0.098, "step": 3682 }, { "epoch": 0.2380929292929293, "grad_norm": 0.059730514883995056, "learning_rate": 0.0001975957471023457, "loss": 0.0815, "step": 3683 }, { "epoch": 0.23815757575757576, "grad_norm": 0.0720212310552597, "learning_rate": 0.00019759425632188005, "loss": 0.0968, "step": 3684 }, { "epoch": 0.23822222222222222, "grad_norm": 0.08765530586242676, "learning_rate": 0.00019759276508499833, "loss": 0.0827, "step": 3685 }, { "epoch": 0.23828686868686869, "grad_norm": 0.07773616164922714, "learning_rate": 0.00019759127339170752, "loss": 0.0963, "step": 3686 }, { "epoch": 0.23835151515151515, "grad_norm": 0.08143138885498047, "learning_rate": 0.00019758978124201457, "loss": 0.0947, "step": 3687 }, { "epoch": 0.2384161616161616, "grad_norm": 0.06535112857818604, "learning_rate": 0.00019758828863592647, "loss": 0.0753, "step": 3688 }, { "epoch": 0.23848080808080807, "grad_norm": 0.07925257831811905, "learning_rate": 0.0001975867955734502, "loss": 0.0929, "step": 3689 }, { "epoch": 0.23854545454545453, "grad_norm": 0.06724110245704651, "learning_rate": 0.00019758530205459275, "loss": 0.0865, "step": 3690 }, { "epoch": 0.23861010101010102, "grad_norm": 0.06838397681713104, "learning_rate": 0.00019758380807936114, 
"loss": 0.084, "step": 3691 }, { "epoch": 0.23867474747474748, "grad_norm": 0.06694263964891434, "learning_rate": 0.00019758231364776227, "loss": 0.0807, "step": 3692 }, { "epoch": 0.23873939393939395, "grad_norm": 0.07084178924560547, "learning_rate": 0.00019758081875980322, "loss": 0.0859, "step": 3693 }, { "epoch": 0.2388040404040404, "grad_norm": 0.07186637818813324, "learning_rate": 0.00019757932341549092, "loss": 0.0836, "step": 3694 }, { "epoch": 0.23886868686868687, "grad_norm": 0.0787421241402626, "learning_rate": 0.0001975778276148324, "loss": 0.0987, "step": 3695 }, { "epoch": 0.23893333333333333, "grad_norm": 0.07246874272823334, "learning_rate": 0.00019757633135783462, "loss": 0.088, "step": 3696 }, { "epoch": 0.23893333333333333, "eval_bleu": 16.48330870602483, "eval_loss": 0.09360860288143158, "eval_runtime": 2.6711, "eval_samples_per_second": 11.98, "eval_steps_per_second": 1.497, "step": 3696 }, { "epoch": 0.2389979797979798, "grad_norm": 0.0702347606420517, "learning_rate": 0.00019757483464450458, "loss": 0.084, "step": 3697 }, { "epoch": 0.23906262626262625, "grad_norm": 0.06494635343551636, "learning_rate": 0.00019757333747484935, "loss": 0.0842, "step": 3698 }, { "epoch": 0.23912727272727272, "grad_norm": 0.06721460819244385, "learning_rate": 0.00019757183984887584, "loss": 0.0821, "step": 3699 }, { "epoch": 0.2391919191919192, "grad_norm": 0.06768114119768143, "learning_rate": 0.0001975703417665911, "loss": 0.0912, "step": 3700 }, { "epoch": 0.23925656565656567, "grad_norm": 0.07227031886577606, "learning_rate": 0.00019756884322800212, "loss": 0.0931, "step": 3701 }, { "epoch": 0.23932121212121213, "grad_norm": 0.06979217380285263, "learning_rate": 0.00019756734423311592, "loss": 0.0902, "step": 3702 }, { "epoch": 0.2393858585858586, "grad_norm": 0.06806640326976776, "learning_rate": 0.00019756584478193952, "loss": 0.0911, "step": 3703 }, { "epoch": 0.23945050505050505, "grad_norm": 0.0762626975774765, "learning_rate": 0.0001975643448744799, 
"loss": 0.0903, "step": 3704 }, { "epoch": 0.23951515151515151, "grad_norm": 0.0667550340294838, "learning_rate": 0.00019756284451074408, "loss": 0.0858, "step": 3705 }, { "epoch": 0.23957979797979798, "grad_norm": 0.06587424129247665, "learning_rate": 0.00019756134369073913, "loss": 0.0792, "step": 3706 }, { "epoch": 0.23964444444444444, "grad_norm": 0.09357219934463501, "learning_rate": 0.000197559842414472, "loss": 0.0813, "step": 3707 }, { "epoch": 0.2397090909090909, "grad_norm": 0.07269969582557678, "learning_rate": 0.00019755834068194977, "loss": 0.0879, "step": 3708 }, { "epoch": 0.23977373737373736, "grad_norm": 0.0814739465713501, "learning_rate": 0.0001975568384931794, "loss": 0.0893, "step": 3709 }, { "epoch": 0.23983838383838385, "grad_norm": 0.0665619745850563, "learning_rate": 0.00019755533584816794, "loss": 0.0863, "step": 3710 }, { "epoch": 0.2399030303030303, "grad_norm": 0.09224192053079605, "learning_rate": 0.00019755383274692244, "loss": 0.1177, "step": 3711 }, { "epoch": 0.23996767676767677, "grad_norm": 0.08250459283590317, "learning_rate": 0.00019755232918944993, "loss": 0.0994, "step": 3712 }, { "epoch": 0.23996767676767677, "eval_bleu": 17.509389435750645, "eval_loss": 0.09335353970527649, "eval_runtime": 2.8097, "eval_samples_per_second": 11.389, "eval_steps_per_second": 1.424, "step": 3712 }, { "epoch": 0.24003232323232324, "grad_norm": 0.262030690908432, "learning_rate": 0.0001975508251757574, "loss": 0.1423, "step": 3713 }, { "epoch": 0.2400969696969697, "grad_norm": 0.09390987455844879, "learning_rate": 0.00019754932070585195, "loss": 0.0976, "step": 3714 }, { "epoch": 0.24016161616161616, "grad_norm": 0.06546153873205185, "learning_rate": 0.00019754781577974054, "loss": 0.0846, "step": 3715 }, { "epoch": 0.24022626262626262, "grad_norm": 0.07396513223648071, "learning_rate": 0.00019754631039743025, "loss": 0.1083, "step": 3716 }, { "epoch": 0.24029090909090908, "grad_norm": 0.07573252171278, "learning_rate": 0.00019754480455892815, 
"loss": 0.1012, "step": 3717 }, { "epoch": 0.24035555555555554, "grad_norm": 0.06593775749206543, "learning_rate": 0.00019754329826424122, "loss": 0.0908, "step": 3718 }, { "epoch": 0.24042020202020203, "grad_norm": 0.07340795546770096, "learning_rate": 0.00019754179151337651, "loss": 0.0937, "step": 3719 }, { "epoch": 0.2404848484848485, "grad_norm": 0.06906809657812119, "learning_rate": 0.0001975402843063411, "loss": 0.0982, "step": 3720 }, { "epoch": 0.24054949494949496, "grad_norm": 0.06268132477998734, "learning_rate": 0.00019753877664314203, "loss": 0.084, "step": 3721 }, { "epoch": 0.24061414141414142, "grad_norm": 0.07329512387514114, "learning_rate": 0.00019753726852378637, "loss": 0.0995, "step": 3722 }, { "epoch": 0.24067878787878788, "grad_norm": 0.06691072881221771, "learning_rate": 0.00019753575994828113, "loss": 0.082, "step": 3723 }, { "epoch": 0.24074343434343434, "grad_norm": 0.08100910484790802, "learning_rate": 0.00019753425091663336, "loss": 0.1071, "step": 3724 }, { "epoch": 0.2408080808080808, "grad_norm": 0.07027315348386765, "learning_rate": 0.0001975327414288502, "loss": 0.091, "step": 3725 }, { "epoch": 0.24087272727272727, "grad_norm": 0.06942614912986755, "learning_rate": 0.0001975312314849386, "loss": 0.0879, "step": 3726 }, { "epoch": 0.24093737373737373, "grad_norm": 0.07370313256978989, "learning_rate": 0.00019752972108490568, "loss": 0.0972, "step": 3727 }, { "epoch": 0.2410020202020202, "grad_norm": 0.07130023837089539, "learning_rate": 0.0001975282102287585, "loss": 0.0911, "step": 3728 }, { "epoch": 0.2410020202020202, "eval_bleu": 14.016115681577919, "eval_loss": 0.09258325397968292, "eval_runtime": 2.7042, "eval_samples_per_second": 11.833, "eval_steps_per_second": 1.479, "step": 3728 }, { "epoch": 0.24106666666666668, "grad_norm": 0.08074845373630524, "learning_rate": 0.00019752669891650416, "loss": 0.1003, "step": 3729 }, { "epoch": 0.24113131313131314, "grad_norm": 0.08117370307445526, "learning_rate": 
0.00019752518714814967, "loss": 0.1081, "step": 3730 }, { "epoch": 0.2411959595959596, "grad_norm": 0.06793665885925293, "learning_rate": 0.00019752367492370212, "loss": 0.0797, "step": 3731 }, { "epoch": 0.24126060606060606, "grad_norm": 0.07058891654014587, "learning_rate": 0.00019752216224316858, "loss": 0.0888, "step": 3732 }, { "epoch": 0.24132525252525253, "grad_norm": 0.08567992597818375, "learning_rate": 0.00019752064910655612, "loss": 0.1102, "step": 3733 }, { "epoch": 0.241389898989899, "grad_norm": 0.07798433303833008, "learning_rate": 0.00019751913551387183, "loss": 0.0827, "step": 3734 }, { "epoch": 0.24145454545454545, "grad_norm": 0.06837143748998642, "learning_rate": 0.00019751762146512277, "loss": 0.0807, "step": 3735 }, { "epoch": 0.2415191919191919, "grad_norm": 0.07509053498506546, "learning_rate": 0.00019751610696031605, "loss": 0.0949, "step": 3736 }, { "epoch": 0.24158383838383837, "grad_norm": 0.07728492468595505, "learning_rate": 0.00019751459199945874, "loss": 0.1012, "step": 3737 }, { "epoch": 0.24164848484848483, "grad_norm": 0.07385671138763428, "learning_rate": 0.0001975130765825579, "loss": 0.0889, "step": 3738 }, { "epoch": 0.24171313131313132, "grad_norm": 0.06853918731212616, "learning_rate": 0.00019751156070962067, "loss": 0.0813, "step": 3739 }, { "epoch": 0.24177777777777779, "grad_norm": 0.07051879912614822, "learning_rate": 0.00019751004438065407, "loss": 0.0936, "step": 3740 }, { "epoch": 0.24184242424242425, "grad_norm": 0.06729122251272202, "learning_rate": 0.00019750852759566528, "loss": 0.0808, "step": 3741 }, { "epoch": 0.2419070707070707, "grad_norm": 0.07112760841846466, "learning_rate": 0.00019750701035466128, "loss": 0.1082, "step": 3742 }, { "epoch": 0.24197171717171717, "grad_norm": 0.08199884742498398, "learning_rate": 0.00019750549265764927, "loss": 0.1006, "step": 3743 }, { "epoch": 0.24203636363636363, "grad_norm": 0.06376795470714569, "learning_rate": 0.00019750397450463627, "loss": 0.0758, "step": 3744 }, { 
"epoch": 0.24203636363636363, "eval_bleu": 13.941046495240542, "eval_loss": 0.09321649372577667, "eval_runtime": 2.7927, "eval_samples_per_second": 11.458, "eval_steps_per_second": 1.432, "step": 3744 }, { "epoch": 0.2421010101010101, "grad_norm": 0.0710856094956398, "learning_rate": 0.00019750245589562947, "loss": 0.0886, "step": 3745 }, { "epoch": 0.24216565656565656, "grad_norm": 0.08524394780397415, "learning_rate": 0.00019750093683063586, "loss": 0.1366, "step": 3746 }, { "epoch": 0.24223030303030302, "grad_norm": 0.06681890040636063, "learning_rate": 0.00019749941730966264, "loss": 0.0845, "step": 3747 }, { "epoch": 0.2422949494949495, "grad_norm": 0.061822980642318726, "learning_rate": 0.00019749789733271688, "loss": 0.0872, "step": 3748 }, { "epoch": 0.24235959595959597, "grad_norm": 0.06491713225841522, "learning_rate": 0.00019749637689980566, "loss": 0.0959, "step": 3749 }, { "epoch": 0.24242424242424243, "grad_norm": 0.06295552104711533, "learning_rate": 0.00019749485601093612, "loss": 0.0775, "step": 3750 }, { "epoch": 0.2424888888888889, "grad_norm": 0.07560793310403824, "learning_rate": 0.00019749333466611538, "loss": 0.1077, "step": 3751 }, { "epoch": 0.24255353535353535, "grad_norm": 0.08111906051635742, "learning_rate": 0.00019749181286535055, "loss": 0.107, "step": 3752 }, { "epoch": 0.24261818181818182, "grad_norm": 0.06592545658349991, "learning_rate": 0.00019749029060864873, "loss": 0.0782, "step": 3753 }, { "epoch": 0.24268282828282828, "grad_norm": 0.08493833988904953, "learning_rate": 0.00019748876789601704, "loss": 0.1018, "step": 3754 }, { "epoch": 0.24274747474747474, "grad_norm": 0.06522996723651886, "learning_rate": 0.00019748724472746262, "loss": 0.0932, "step": 3755 }, { "epoch": 0.2428121212121212, "grad_norm": 0.07433473318815231, "learning_rate": 0.00019748572110299262, "loss": 0.094, "step": 3756 }, { "epoch": 0.24287676767676766, "grad_norm": 0.06947848945856094, "learning_rate": 0.0001974841970226141, "loss": 0.0795, "step": 
3757 }, { "epoch": 0.24294141414141415, "grad_norm": 0.07081591337919235, "learning_rate": 0.00019748267248633421, "loss": 0.0904, "step": 3758 }, { "epoch": 0.24300606060606061, "grad_norm": 0.08732277154922485, "learning_rate": 0.0001974811474941601, "loss": 0.0965, "step": 3759 }, { "epoch": 0.24307070707070708, "grad_norm": 0.0738610252737999, "learning_rate": 0.00019747962204609887, "loss": 0.0836, "step": 3760 }, { "epoch": 0.24307070707070708, "eval_bleu": 13.614584913945766, "eval_loss": 0.0931699350476265, "eval_runtime": 2.6369, "eval_samples_per_second": 12.136, "eval_steps_per_second": 1.517, "step": 3760 }, { "epoch": 0.24313535353535354, "grad_norm": 0.08599385619163513, "learning_rate": 0.00019747809614215772, "loss": 0.0991, "step": 3761 }, { "epoch": 0.2432, "grad_norm": 0.06696201115846634, "learning_rate": 0.0001974765697823437, "loss": 0.075, "step": 3762 }, { "epoch": 0.24326464646464646, "grad_norm": 0.06407614052295685, "learning_rate": 0.000197475042966664, "loss": 0.084, "step": 3763 }, { "epoch": 0.24332929292929292, "grad_norm": 0.1326744556427002, "learning_rate": 0.00019747351569512572, "loss": 0.1064, "step": 3764 }, { "epoch": 0.24339393939393938, "grad_norm": 0.06654291599988937, "learning_rate": 0.00019747198796773608, "loss": 0.0884, "step": 3765 }, { "epoch": 0.24345858585858585, "grad_norm": 0.06948009133338928, "learning_rate": 0.00019747045978450216, "loss": 0.0855, "step": 3766 }, { "epoch": 0.24352323232323234, "grad_norm": 0.06581073254346848, "learning_rate": 0.0001974689311454311, "loss": 0.0775, "step": 3767 }, { "epoch": 0.2435878787878788, "grad_norm": 0.07536555826663971, "learning_rate": 0.0001974674020505301, "loss": 0.0941, "step": 3768 }, { "epoch": 0.24365252525252526, "grad_norm": 0.06603272259235382, "learning_rate": 0.00019746587249980626, "loss": 0.0785, "step": 3769 }, { "epoch": 0.24371717171717172, "grad_norm": 0.06903275847434998, "learning_rate": 0.00019746434249326677, "loss": 0.0906, "step": 3770 }, { 
"epoch": 0.24378181818181818, "grad_norm": 0.07458031922578812, "learning_rate": 0.00019746281203091877, "loss": 0.0859, "step": 3771 }, { "epoch": 0.24384646464646464, "grad_norm": 0.08659809082746506, "learning_rate": 0.00019746128111276941, "loss": 0.1148, "step": 3772 }, { "epoch": 0.2439111111111111, "grad_norm": 0.07417561858892441, "learning_rate": 0.00019745974973882588, "loss": 0.0873, "step": 3773 }, { "epoch": 0.24397575757575757, "grad_norm": 0.0825277641415596, "learning_rate": 0.0001974582179090953, "loss": 0.0929, "step": 3774 }, { "epoch": 0.24404040404040403, "grad_norm": 0.07200012356042862, "learning_rate": 0.00019745668562358486, "loss": 0.0985, "step": 3775 }, { "epoch": 0.2441050505050505, "grad_norm": 0.06135423108935356, "learning_rate": 0.0001974551528823017, "loss": 0.0881, "step": 3776 }, { "epoch": 0.2441050505050505, "eval_bleu": 14.072435219353501, "eval_loss": 0.0930660218000412, "eval_runtime": 2.6662, "eval_samples_per_second": 12.002, "eval_steps_per_second": 1.5, "step": 3776 }, { "epoch": 0.24416969696969698, "grad_norm": 0.062058717012405396, "learning_rate": 0.00019745361968525303, "loss": 0.0691, "step": 3777 }, { "epoch": 0.24423434343434344, "grad_norm": 0.0643954947590828, "learning_rate": 0.00019745208603244604, "loss": 0.0884, "step": 3778 }, { "epoch": 0.2442989898989899, "grad_norm": 0.07990649342536926, "learning_rate": 0.0001974505519238878, "loss": 0.0913, "step": 3779 }, { "epoch": 0.24436363636363637, "grad_norm": 0.08154579252004623, "learning_rate": 0.00019744901735958554, "loss": 0.1189, "step": 3780 }, { "epoch": 0.24442828282828283, "grad_norm": 0.0726609155535698, "learning_rate": 0.00019744748233954646, "loss": 0.0958, "step": 3781 }, { "epoch": 0.2444929292929293, "grad_norm": 0.06956779211759567, "learning_rate": 0.00019744594686377776, "loss": 0.1058, "step": 3782 }, { "epoch": 0.24455757575757575, "grad_norm": 0.07045941799879074, "learning_rate": 0.0001974444109322865, "loss": 0.1002, "step": 3783 }, { 
"epoch": 0.2446222222222222, "grad_norm": 0.06140134856104851, "learning_rate": 0.00019744287454508, "loss": 0.0758, "step": 3784 }, { "epoch": 0.24468686868686867, "grad_norm": 0.06358760595321655, "learning_rate": 0.0001974413377021654, "loss": 0.0937, "step": 3785 }, { "epoch": 0.24475151515151516, "grad_norm": 0.0716613382101059, "learning_rate": 0.00019743980040354985, "loss": 0.0855, "step": 3786 }, { "epoch": 0.24481616161616163, "grad_norm": 0.06060607358813286, "learning_rate": 0.0001974382626492406, "loss": 0.076, "step": 3787 }, { "epoch": 0.2448808080808081, "grad_norm": 0.06469915807247162, "learning_rate": 0.00019743672443924476, "loss": 0.0889, "step": 3788 }, { "epoch": 0.24494545454545455, "grad_norm": 0.0663461834192276, "learning_rate": 0.00019743518577356958, "loss": 0.0853, "step": 3789 }, { "epoch": 0.245010101010101, "grad_norm": 0.07696621865034103, "learning_rate": 0.00019743364665222228, "loss": 0.0763, "step": 3790 }, { "epoch": 0.24507474747474747, "grad_norm": 0.06791282445192337, "learning_rate": 0.00019743210707521002, "loss": 0.0937, "step": 3791 }, { "epoch": 0.24513939393939393, "grad_norm": 0.059839557856321335, "learning_rate": 0.00019743056704253997, "loss": 0.0859, "step": 3792 }, { "epoch": 0.24513939393939393, "eval_bleu": 14.056218074665326, "eval_loss": 0.09398597478866577, "eval_runtime": 2.7769, "eval_samples_per_second": 11.524, "eval_steps_per_second": 1.44, "step": 3792 }, { "epoch": 0.2452040404040404, "grad_norm": 0.07247661799192429, "learning_rate": 0.0001974290265542194, "loss": 0.0971, "step": 3793 }, { "epoch": 0.24526868686868686, "grad_norm": 0.06797537207603455, "learning_rate": 0.00019742748561025545, "loss": 0.0989, "step": 3794 }, { "epoch": 0.24533333333333332, "grad_norm": 0.061104148626327515, "learning_rate": 0.0001974259442106554, "loss": 0.0712, "step": 3795 }, { "epoch": 0.2453979797979798, "grad_norm": 0.06417061388492584, "learning_rate": 0.0001974244023554264, "loss": 0.0883, "step": 3796 }, { 
"epoch": 0.24546262626262627, "grad_norm": 0.07916979491710663, "learning_rate": 0.00019742286004457567, "loss": 0.1117, "step": 3797 }, { "epoch": 0.24552727272727273, "grad_norm": 0.0759257897734642, "learning_rate": 0.00019742131727811045, "loss": 0.0892, "step": 3798 }, { "epoch": 0.2455919191919192, "grad_norm": 0.06910717487335205, "learning_rate": 0.0001974197740560379, "loss": 0.0919, "step": 3799 }, { "epoch": 0.24565656565656566, "grad_norm": 0.0624660924077034, "learning_rate": 0.0001974182303783653, "loss": 0.0787, "step": 3800 }, { "epoch": 0.24572121212121212, "grad_norm": 0.07715356349945068, "learning_rate": 0.00019741668624509987, "loss": 0.0915, "step": 3801 }, { "epoch": 0.24578585858585858, "grad_norm": 0.06638652086257935, "learning_rate": 0.00019741514165624874, "loss": 0.0942, "step": 3802 }, { "epoch": 0.24585050505050504, "grad_norm": 0.06455226242542267, "learning_rate": 0.00019741359661181924, "loss": 0.0922, "step": 3803 }, { "epoch": 0.2459151515151515, "grad_norm": 0.06638920307159424, "learning_rate": 0.00019741205111181853, "loss": 0.0941, "step": 3804 }, { "epoch": 0.245979797979798, "grad_norm": 0.05518640950322151, "learning_rate": 0.00019741050515625387, "loss": 0.0737, "step": 3805 }, { "epoch": 0.24604444444444445, "grad_norm": 0.06490351259708405, "learning_rate": 0.0001974089587451325, "loss": 0.1007, "step": 3806 }, { "epoch": 0.24610909090909092, "grad_norm": 0.06110585108399391, "learning_rate": 0.00019740741187846162, "loss": 0.0802, "step": 3807 }, { "epoch": 0.24617373737373738, "grad_norm": 0.07082351297140121, "learning_rate": 0.00019740586455624848, "loss": 0.0813, "step": 3808 }, { "epoch": 0.24617373737373738, "eval_bleu": 16.764868008949694, "eval_loss": 0.0933462530374527, "eval_runtime": 2.7501, "eval_samples_per_second": 11.636, "eval_steps_per_second": 1.455, "step": 3808 }, { "epoch": 0.24623838383838384, "grad_norm": 0.06844015419483185, "learning_rate": 0.00019740431677850028, "loss": 0.0903, "step": 3809 
}, { "epoch": 0.2463030303030303, "grad_norm": 0.06428299099206924, "learning_rate": 0.00019740276854522435, "loss": 0.0825, "step": 3810 }, { "epoch": 0.24636767676767676, "grad_norm": 0.05986114218831062, "learning_rate": 0.0001974012198564278, "loss": 0.0807, "step": 3811 }, { "epoch": 0.24643232323232322, "grad_norm": 0.08208808302879333, "learning_rate": 0.000197399670712118, "loss": 0.1083, "step": 3812 }, { "epoch": 0.2464969696969697, "grad_norm": 0.061327412724494934, "learning_rate": 0.00019739812111230215, "loss": 0.0751, "step": 3813 }, { "epoch": 0.24656161616161615, "grad_norm": 0.06825974583625793, "learning_rate": 0.00019739657105698744, "loss": 0.0769, "step": 3814 }, { "epoch": 0.24662626262626264, "grad_norm": 0.08726572245359421, "learning_rate": 0.0001973950205461812, "loss": 0.1148, "step": 3815 }, { "epoch": 0.2466909090909091, "grad_norm": 0.07514621317386627, "learning_rate": 0.00019739346957989065, "loss": 0.091, "step": 3816 }, { "epoch": 0.24675555555555556, "grad_norm": 0.06860413402318954, "learning_rate": 0.000197391918158123, "loss": 0.089, "step": 3817 }, { "epoch": 0.24682020202020202, "grad_norm": 0.0798850879073143, "learning_rate": 0.0001973903662808856, "loss": 0.0757, "step": 3818 }, { "epoch": 0.24688484848484848, "grad_norm": 0.0821099504828453, "learning_rate": 0.0001973888139481856, "loss": 0.0949, "step": 3819 }, { "epoch": 0.24694949494949495, "grad_norm": 0.07128286361694336, "learning_rate": 0.00019738726116003035, "loss": 0.0928, "step": 3820 }, { "epoch": 0.2470141414141414, "grad_norm": 0.06745872646570206, "learning_rate": 0.00019738570791642707, "loss": 0.0858, "step": 3821 }, { "epoch": 0.24707878787878787, "grad_norm": 0.06696398556232452, "learning_rate": 0.000197384154217383, "loss": 0.0821, "step": 3822 }, { "epoch": 0.24714343434343433, "grad_norm": 0.06421520560979843, "learning_rate": 0.00019738260006290547, "loss": 0.082, "step": 3823 }, { "epoch": 0.24720808080808082, "grad_norm": 0.06522439420223236, 
"learning_rate": 0.00019738104545300171, "loss": 0.0892, "step": 3824 }, { "epoch": 0.24720808080808082, "eval_bleu": 16.27614087403725, "eval_loss": 0.09310175478458405, "eval_runtime": 2.8713, "eval_samples_per_second": 11.145, "eval_steps_per_second": 1.393, "step": 3824 }, { "epoch": 0.24727272727272728, "grad_norm": 0.06867222487926483, "learning_rate": 0.00019737949038767897, "loss": 0.0918, "step": 3825 }, { "epoch": 0.24733737373737374, "grad_norm": 0.07688181102275848, "learning_rate": 0.00019737793486694456, "loss": 0.1024, "step": 3826 }, { "epoch": 0.2474020202020202, "grad_norm": 0.07875890284776688, "learning_rate": 0.00019737637889080575, "loss": 0.1141, "step": 3827 }, { "epoch": 0.24746666666666667, "grad_norm": 0.07064365595579147, "learning_rate": 0.0001973748224592698, "loss": 0.0893, "step": 3828 }, { "epoch": 0.24753131313131313, "grad_norm": 0.06773609668016434, "learning_rate": 0.000197373265572344, "loss": 0.0858, "step": 3829 }, { "epoch": 0.2475959595959596, "grad_norm": 0.07183733582496643, "learning_rate": 0.0001973717082300356, "loss": 0.0897, "step": 3830 }, { "epoch": 0.24766060606060605, "grad_norm": 0.07189106941223145, "learning_rate": 0.00019737015043235198, "loss": 0.1023, "step": 3831 }, { "epoch": 0.24772525252525252, "grad_norm": 0.07007001340389252, "learning_rate": 0.0001973685921793003, "loss": 0.0882, "step": 3832 }, { "epoch": 0.24778989898989898, "grad_norm": 0.07396364212036133, "learning_rate": 0.00019736703347088792, "loss": 0.0962, "step": 3833 }, { "epoch": 0.24785454545454547, "grad_norm": 0.0649465024471283, "learning_rate": 0.00019736547430712208, "loss": 0.0875, "step": 3834 }, { "epoch": 0.24791919191919193, "grad_norm": 0.06593985855579376, "learning_rate": 0.00019736391468801014, "loss": 0.0814, "step": 3835 }, { "epoch": 0.2479838383838384, "grad_norm": 0.06916589289903641, "learning_rate": 0.00019736235461355935, "loss": 0.096, "step": 3836 }, { "epoch": 0.24804848484848485, "grad_norm": 
0.06952930241823196, "learning_rate": 0.00019736079408377703, "loss": 0.0825, "step": 3837 }, { "epoch": 0.2481131313131313, "grad_norm": 0.07936926931142807, "learning_rate": 0.00019735923309867047, "loss": 0.1119, "step": 3838 }, { "epoch": 0.24817777777777777, "grad_norm": 0.07878732681274414, "learning_rate": 0.00019735767165824695, "loss": 0.0969, "step": 3839 }, { "epoch": 0.24824242424242424, "grad_norm": 0.07381368428468704, "learning_rate": 0.00019735610976251376, "loss": 0.0924, "step": 3840 }, { "epoch": 0.24824242424242424, "eval_bleu": 14.521381565332044, "eval_loss": 0.09357903152704239, "eval_runtime": 2.7037, "eval_samples_per_second": 11.836, "eval_steps_per_second": 1.479, "step": 3840 }, { "epoch": 0.2483070707070707, "grad_norm": 0.07202629745006561, "learning_rate": 0.00019735454741147824, "loss": 0.0928, "step": 3841 }, { "epoch": 0.24837171717171716, "grad_norm": 0.07414745539426804, "learning_rate": 0.00019735298460514772, "loss": 0.1056, "step": 3842 }, { "epoch": 0.24843636363636365, "grad_norm": 0.06099332496523857, "learning_rate": 0.00019735142134352944, "loss": 0.0732, "step": 3843 }, { "epoch": 0.2485010101010101, "grad_norm": 0.06688322126865387, "learning_rate": 0.00019734985762663077, "loss": 0.0846, "step": 3844 }, { "epoch": 0.24856565656565657, "grad_norm": 0.07159404456615448, "learning_rate": 0.000197348293454459, "loss": 0.1065, "step": 3845 }, { "epoch": 0.24863030303030303, "grad_norm": 0.05792752653360367, "learning_rate": 0.0001973467288270214, "loss": 0.0744, "step": 3846 }, { "epoch": 0.2486949494949495, "grad_norm": 0.065129853785038, "learning_rate": 0.00019734516374432537, "loss": 0.0808, "step": 3847 }, { "epoch": 0.24875959595959596, "grad_norm": 0.07136716693639755, "learning_rate": 0.00019734359820637818, "loss": 0.0813, "step": 3848 }, { "epoch": 0.24882424242424242, "grad_norm": 0.071194589138031, "learning_rate": 0.00019734203221318718, "loss": 0.0959, "step": 3849 }, { "epoch": 0.24888888888888888, 
"grad_norm": 0.0713515505194664, "learning_rate": 0.00019734046576475966, "loss": 0.0942, "step": 3850 }, { "epoch": 0.24895353535353534, "grad_norm": 0.06720596551895142, "learning_rate": 0.00019733889886110295, "loss": 0.0818, "step": 3851 }, { "epoch": 0.2490181818181818, "grad_norm": 0.06579820811748505, "learning_rate": 0.00019733733150222442, "loss": 0.0846, "step": 3852 }, { "epoch": 0.2490828282828283, "grad_norm": 0.11488955467939377, "learning_rate": 0.00019733576368813135, "loss": 0.1035, "step": 3853 }, { "epoch": 0.24914747474747476, "grad_norm": 0.06219153106212616, "learning_rate": 0.00019733419541883112, "loss": 0.0808, "step": 3854 }, { "epoch": 0.24921212121212122, "grad_norm": 0.09871772676706314, "learning_rate": 0.000197332626694331, "loss": 0.1053, "step": 3855 }, { "epoch": 0.24927676767676768, "grad_norm": 0.07146307826042175, "learning_rate": 0.00019733105751463837, "loss": 0.0895, "step": 3856 }, { "epoch": 0.24927676767676768, "eval_bleu": 17.299049814793168, "eval_loss": 0.0918586328625679, "eval_runtime": 2.7495, "eval_samples_per_second": 11.638, "eval_steps_per_second": 1.455, "step": 3856 }, { "epoch": 0.24934141414141414, "grad_norm": 0.06900150328874588, "learning_rate": 0.00019732948787976057, "loss": 0.0877, "step": 3857 }, { "epoch": 0.2494060606060606, "grad_norm": 0.07269001752138138, "learning_rate": 0.00019732791778970493, "loss": 0.0908, "step": 3858 }, { "epoch": 0.24947070707070707, "grad_norm": 0.06268926709890366, "learning_rate": 0.00019732634724447878, "loss": 0.0792, "step": 3859 }, { "epoch": 0.24953535353535353, "grad_norm": 0.0876120999455452, "learning_rate": 0.00019732477624408948, "loss": 0.1067, "step": 3860 }, { "epoch": 0.2496, "grad_norm": 0.08949094265699387, "learning_rate": 0.0001973232047885444, "loss": 0.0944, "step": 3861 }, { "epoch": 0.24966464646464648, "grad_norm": 0.1021556407213211, "learning_rate": 0.00019732163287785082, "loss": 0.0927, "step": 3862 }, { "epoch": 0.24972929292929294, 
"grad_norm": 0.06641527265310287, "learning_rate": 0.0001973200605120162, "loss": 0.0848, "step": 3863 }, { "epoch": 0.2497939393939394, "grad_norm": 0.06752959638834, "learning_rate": 0.0001973184876910478, "loss": 0.0814, "step": 3864 }, { "epoch": 0.24985858585858586, "grad_norm": 0.07963059097528458, "learning_rate": 0.00019731691441495302, "loss": 0.1066, "step": 3865 }, { "epoch": 0.24992323232323232, "grad_norm": 0.07975966483354568, "learning_rate": 0.00019731534068373919, "loss": 0.0985, "step": 3866 }, { "epoch": 0.2499878787878788, "grad_norm": 0.07607954740524292, "learning_rate": 0.00019731376649741368, "loss": 0.1036, "step": 3867 }, { "epoch": 0.25005252525252525, "grad_norm": 0.11136452853679657, "learning_rate": 0.00019731219185598384, "loss": 0.0922, "step": 3868 }, { "epoch": 0.25011717171717174, "grad_norm": 0.07405475527048111, "learning_rate": 0.00019731061675945708, "loss": 0.0874, "step": 3869 }, { "epoch": 0.25018181818181817, "grad_norm": 0.07023259252309799, "learning_rate": 0.0001973090412078407, "loss": 0.0917, "step": 3870 }, { "epoch": 0.25024646464646466, "grad_norm": 0.06776626408100128, "learning_rate": 0.00019730746520114215, "loss": 0.0926, "step": 3871 }, { "epoch": 0.2503111111111111, "grad_norm": 0.10249544680118561, "learning_rate": 0.00019730588873936872, "loss": 0.1073, "step": 3872 }, { "epoch": 0.2503111111111111, "eval_bleu": 14.989571546850634, "eval_loss": 0.09359989315271378, "eval_runtime": 2.7169, "eval_samples_per_second": 11.778, "eval_steps_per_second": 1.472, "step": 3872 }, { "epoch": 0.2503757575757576, "grad_norm": 0.08641575276851654, "learning_rate": 0.00019730431182252782, "loss": 0.1127, "step": 3873 }, { "epoch": 0.250440404040404, "grad_norm": 0.06100583076477051, "learning_rate": 0.00019730273445062686, "loss": 0.0717, "step": 3874 }, { "epoch": 0.2505050505050505, "grad_norm": 0.07123076170682907, "learning_rate": 0.00019730115662367314, "loss": 0.087, "step": 3875 }, { "epoch": 0.25056969696969694, 
"grad_norm": 0.06142003461718559, "learning_rate": 0.0001972995783416741, "loss": 0.0719, "step": 3876 }, { "epoch": 0.25063434343434343, "grad_norm": 0.07932982593774796, "learning_rate": 0.00019729799960463705, "loss": 0.114, "step": 3877 }, { "epoch": 0.2506989898989899, "grad_norm": 0.09041208028793335, "learning_rate": 0.0001972964204125695, "loss": 0.093, "step": 3878 }, { "epoch": 0.25076363636363636, "grad_norm": 0.06294432282447815, "learning_rate": 0.0001972948407654787, "loss": 0.0842, "step": 3879 }, { "epoch": 0.25082828282828284, "grad_norm": 0.0731189027428627, "learning_rate": 0.0001972932606633721, "loss": 0.0979, "step": 3880 }, { "epoch": 0.2508929292929293, "grad_norm": 0.08556569367647171, "learning_rate": 0.00019729168010625708, "loss": 0.114, "step": 3881 }, { "epoch": 0.25095757575757577, "grad_norm": 0.08083625137805939, "learning_rate": 0.00019729009909414107, "loss": 0.1116, "step": 3882 }, { "epoch": 0.2510222222222222, "grad_norm": 0.07771521061658859, "learning_rate": 0.0001972885176270314, "loss": 0.0993, "step": 3883 }, { "epoch": 0.2510868686868687, "grad_norm": 0.09448112547397614, "learning_rate": 0.00019728693570493554, "loss": 0.1075, "step": 3884 }, { "epoch": 0.2511515151515151, "grad_norm": 0.06368444114923477, "learning_rate": 0.0001972853533278608, "loss": 0.087, "step": 3885 }, { "epoch": 0.2512161616161616, "grad_norm": 0.07448848336935043, "learning_rate": 0.0001972837704958146, "loss": 0.0977, "step": 3886 }, { "epoch": 0.2512808080808081, "grad_norm": 0.06747115403413773, "learning_rate": 0.00019728218720880443, "loss": 0.0856, "step": 3887 }, { "epoch": 0.25134545454545454, "grad_norm": 0.06890641897916794, "learning_rate": 0.0001972806034668376, "loss": 0.1013, "step": 3888 }, { "epoch": 0.25134545454545454, "eval_bleu": 14.539726475818581, "eval_loss": 0.09322687983512878, "eval_runtime": 2.8119, "eval_samples_per_second": 11.38, "eval_steps_per_second": 1.423, "step": 3888 }, { "epoch": 0.25141010101010103, 
"grad_norm": 0.06661111116409302, "learning_rate": 0.00019727901926992153, "loss": 0.0952, "step": 3889 }, { "epoch": 0.25147474747474746, "grad_norm": 0.06179904565215111, "learning_rate": 0.0001972774346180637, "loss": 0.091, "step": 3890 }, { "epoch": 0.25153939393939395, "grad_norm": 0.07435615360736847, "learning_rate": 0.00019727584951127142, "loss": 0.1097, "step": 3891 }, { "epoch": 0.2516040404040404, "grad_norm": 0.06801638752222061, "learning_rate": 0.00019727426394955218, "loss": 0.0981, "step": 3892 }, { "epoch": 0.2516686868686869, "grad_norm": 0.06010456010699272, "learning_rate": 0.00019727267793291333, "loss": 0.0774, "step": 3893 }, { "epoch": 0.2517333333333333, "grad_norm": 0.06850286573171616, "learning_rate": 0.00019727109146136233, "loss": 0.0914, "step": 3894 }, { "epoch": 0.2517979797979798, "grad_norm": 0.070985347032547, "learning_rate": 0.00019726950453490664, "loss": 0.0914, "step": 3895 }, { "epoch": 0.2518626262626263, "grad_norm": 0.06558531522750854, "learning_rate": 0.0001972679171535536, "loss": 0.0821, "step": 3896 }, { "epoch": 0.2519272727272727, "grad_norm": 0.07455248385667801, "learning_rate": 0.00019726632931731065, "loss": 0.1009, "step": 3897 }, { "epoch": 0.2519919191919192, "grad_norm": 0.06499814987182617, "learning_rate": 0.00019726474102618524, "loss": 0.0712, "step": 3898 }, { "epoch": 0.25205656565656565, "grad_norm": 0.07061312347650528, "learning_rate": 0.0001972631522801848, "loss": 0.1069, "step": 3899 }, { "epoch": 0.25212121212121213, "grad_norm": 0.07644709944725037, "learning_rate": 0.00019726156307931677, "loss": 0.1125, "step": 3900 }, { "epoch": 0.25218585858585857, "grad_norm": 0.0785532146692276, "learning_rate": 0.00019725997342358856, "loss": 0.0992, "step": 3901 }, { "epoch": 0.25225050505050506, "grad_norm": 0.06581787765026093, "learning_rate": 0.0001972583833130076, "loss": 0.0861, "step": 3902 }, { "epoch": 0.2523151515151515, "grad_norm": 0.07853641360998154, "learning_rate": 
0.00019725679274758132, "loss": 0.1136, "step": 3903 }, { "epoch": 0.252379797979798, "grad_norm": 0.06795958429574966, "learning_rate": 0.00019725520172731716, "loss": 0.0885, "step": 3904 }, { "epoch": 0.252379797979798, "eval_bleu": 14.895046897536865, "eval_loss": 0.09238015115261078, "eval_runtime": 2.7203, "eval_samples_per_second": 11.764, "eval_steps_per_second": 1.47, "step": 3904 }, { "epoch": 0.25244444444444447, "grad_norm": 0.07393664121627808, "learning_rate": 0.00019725361025222263, "loss": 0.0984, "step": 3905 }, { "epoch": 0.2525090909090909, "grad_norm": 0.07062132656574249, "learning_rate": 0.00019725201832230507, "loss": 0.0887, "step": 3906 }, { "epoch": 0.2525737373737374, "grad_norm": 0.11406850069761276, "learning_rate": 0.000197250425937572, "loss": 0.0955, "step": 3907 }, { "epoch": 0.25263838383838383, "grad_norm": 0.07079752534627914, "learning_rate": 0.0001972488330980308, "loss": 0.0952, "step": 3908 }, { "epoch": 0.2527030303030303, "grad_norm": 0.07221031188964844, "learning_rate": 0.000197247239803689, "loss": 0.0977, "step": 3909 }, { "epoch": 0.25276767676767675, "grad_norm": 0.06931409984827042, "learning_rate": 0.00019724564605455398, "loss": 0.093, "step": 3910 }, { "epoch": 0.25283232323232324, "grad_norm": 0.07004189491271973, "learning_rate": 0.00019724405185063323, "loss": 0.095, "step": 3911 }, { "epoch": 0.2528969696969697, "grad_norm": 0.05664917081594467, "learning_rate": 0.0001972424571919342, "loss": 0.0714, "step": 3912 }, { "epoch": 0.25296161616161617, "grad_norm": 0.06721418350934982, "learning_rate": 0.00019724086207846436, "loss": 0.0823, "step": 3913 }, { "epoch": 0.2530262626262626, "grad_norm": 0.06993572413921356, "learning_rate": 0.00019723926651023113, "loss": 0.1066, "step": 3914 }, { "epoch": 0.2530909090909091, "grad_norm": 0.11163713783025742, "learning_rate": 0.000197237670487242, "loss": 0.0785, "step": 3915 }, { "epoch": 0.2531555555555556, "grad_norm": 0.07591798901557922, "learning_rate": 
0.00019723607400950444, "loss": 0.0987, "step": 3916 }, { "epoch": 0.253220202020202, "grad_norm": 0.13017378747463226, "learning_rate": 0.0001972344770770259, "loss": 0.0993, "step": 3917 }, { "epoch": 0.2532848484848485, "grad_norm": 0.06332392245531082, "learning_rate": 0.00019723287968981384, "loss": 0.0947, "step": 3918 }, { "epoch": 0.25334949494949494, "grad_norm": 0.06982926279306412, "learning_rate": 0.0001972312818478758, "loss": 0.1006, "step": 3919 }, { "epoch": 0.2534141414141414, "grad_norm": 0.06936702132225037, "learning_rate": 0.00019722968355121915, "loss": 0.0947, "step": 3920 }, { "epoch": 0.2534141414141414, "eval_bleu": 15.983094597991043, "eval_loss": 0.09143301844596863, "eval_runtime": 2.8465, "eval_samples_per_second": 11.242, "eval_steps_per_second": 1.405, "step": 3920 }, { "epoch": 0.25347878787878786, "grad_norm": 0.07879718393087387, "learning_rate": 0.00019722808479985142, "loss": 0.0927, "step": 3921 }, { "epoch": 0.25354343434343435, "grad_norm": 0.05832449346780777, "learning_rate": 0.0001972264855937801, "loss": 0.0818, "step": 3922 }, { "epoch": 0.2536080808080808, "grad_norm": 0.07388099282979965, "learning_rate": 0.00019722488593301263, "loss": 0.1038, "step": 3923 }, { "epoch": 0.25367272727272727, "grad_norm": 0.059918127954006195, "learning_rate": 0.00019722328581755653, "loss": 0.0811, "step": 3924 }, { "epoch": 0.25373737373737376, "grad_norm": 0.058182474225759506, "learning_rate": 0.00019722168524741927, "loss": 0.0733, "step": 3925 }, { "epoch": 0.2538020202020202, "grad_norm": 0.0874689444899559, "learning_rate": 0.00019722008422260828, "loss": 0.0924, "step": 3926 }, { "epoch": 0.2538666666666667, "grad_norm": 0.06599444150924683, "learning_rate": 0.00019721848274313115, "loss": 0.0788, "step": 3927 }, { "epoch": 0.2539313131313131, "grad_norm": 0.06382293999195099, "learning_rate": 0.00019721688080899527, "loss": 0.0773, "step": 3928 }, { "epoch": 0.2539959595959596, "grad_norm": 0.06593070924282074, 
"learning_rate": 0.0001972152784202082, "loss": 0.0903, "step": 3929 }, { "epoch": 0.25406060606060604, "grad_norm": 0.06831151247024536, "learning_rate": 0.00019721367557677745, "loss": 0.0818, "step": 3930 }, { "epoch": 0.25412525252525253, "grad_norm": 0.07421889156103134, "learning_rate": 0.00019721207227871044, "loss": 0.0888, "step": 3931 }, { "epoch": 0.25418989898989897, "grad_norm": 0.07989794760942459, "learning_rate": 0.0001972104685260147, "loss": 0.1121, "step": 3932 }, { "epoch": 0.25425454545454546, "grad_norm": 0.06599510461091995, "learning_rate": 0.00019720886431869774, "loss": 0.0864, "step": 3933 }, { "epoch": 0.25431919191919194, "grad_norm": 0.07128570228815079, "learning_rate": 0.00019720725965676706, "loss": 0.0932, "step": 3934 }, { "epoch": 0.2543838383838384, "grad_norm": 0.07657445967197418, "learning_rate": 0.00019720565454023014, "loss": 0.1068, "step": 3935 }, { "epoch": 0.25444848484848487, "grad_norm": 0.08500547707080841, "learning_rate": 0.00019720404896909454, "loss": 0.1065, "step": 3936 }, { "epoch": 0.25444848484848487, "eval_bleu": 14.247262018083113, "eval_loss": 0.09327036142349243, "eval_runtime": 2.7759, "eval_samples_per_second": 11.528, "eval_steps_per_second": 1.441, "step": 3936 }, { "epoch": 0.2545131313131313, "grad_norm": 0.0704958513379097, "learning_rate": 0.00019720244294336775, "loss": 0.088, "step": 3937 }, { "epoch": 0.2545777777777778, "grad_norm": 0.07374102622270584, "learning_rate": 0.00019720083646305723, "loss": 0.0955, "step": 3938 }, { "epoch": 0.2546424242424242, "grad_norm": 0.06841718405485153, "learning_rate": 0.00019719922952817057, "loss": 0.0973, "step": 3939 }, { "epoch": 0.2547070707070707, "grad_norm": 0.06780092418193817, "learning_rate": 0.00019719762213871522, "loss": 0.0966, "step": 3940 }, { "epoch": 0.25477171717171715, "grad_norm": 0.0709770992398262, "learning_rate": 0.00019719601429469874, "loss": 0.095, "step": 3941 }, { "epoch": 0.25483636363636364, "grad_norm": 
0.07099176943302155, "learning_rate": 0.00019719440599612863, "loss": 0.1004, "step": 3942 }, { "epoch": 0.25490101010101013, "grad_norm": 0.06256180256605148, "learning_rate": 0.00019719279724301242, "loss": 0.0956, "step": 3943 }, { "epoch": 0.25496565656565656, "grad_norm": 0.06627906858921051, "learning_rate": 0.0001971911880353576, "loss": 0.0847, "step": 3944 }, { "epoch": 0.25503030303030305, "grad_norm": 0.06458727270364761, "learning_rate": 0.00019718957837317174, "loss": 0.0715, "step": 3945 }, { "epoch": 0.2550949494949495, "grad_norm": 0.054595280438661575, "learning_rate": 0.00019718796825646235, "loss": 0.0759, "step": 3946 }, { "epoch": 0.255159595959596, "grad_norm": 0.07755506783723831, "learning_rate": 0.000197186357685237, "loss": 0.0781, "step": 3947 }, { "epoch": 0.2552242424242424, "grad_norm": 0.0669684037566185, "learning_rate": 0.00019718474665950315, "loss": 0.0928, "step": 3948 }, { "epoch": 0.2552888888888889, "grad_norm": 0.06139666214585304, "learning_rate": 0.00019718313517926833, "loss": 0.0768, "step": 3949 }, { "epoch": 0.25535353535353533, "grad_norm": 0.0804172083735466, "learning_rate": 0.00019718152324454017, "loss": 0.0989, "step": 3950 }, { "epoch": 0.2554181818181818, "grad_norm": 0.0711892694234848, "learning_rate": 0.00019717991085532613, "loss": 0.0907, "step": 3951 }, { "epoch": 0.25548282828282826, "grad_norm": 0.07749702781438828, "learning_rate": 0.00019717829801163382, "loss": 0.1004, "step": 3952 }, { "epoch": 0.25548282828282826, "eval_bleu": 15.335787811375432, "eval_loss": 0.09287138283252716, "eval_runtime": 2.8448, "eval_samples_per_second": 11.249, "eval_steps_per_second": 1.406, "step": 3952 }, { "epoch": 0.25554747474747475, "grad_norm": 0.0781218633055687, "learning_rate": 0.00019717668471347068, "loss": 0.1028, "step": 3953 }, { "epoch": 0.25561212121212123, "grad_norm": 0.06841059774160385, "learning_rate": 0.00019717507096084433, "loss": 0.0777, "step": 3954 }, { "epoch": 0.25567676767676767, 
"grad_norm": 0.08416417986154556, "learning_rate": 0.0001971734567537623, "loss": 0.1029, "step": 3955 }, { "epoch": 0.25574141414141416, "grad_norm": 0.08974970877170563, "learning_rate": 0.00019717184209223215, "loss": 0.114, "step": 3956 }, { "epoch": 0.2558060606060606, "grad_norm": 0.0668891966342926, "learning_rate": 0.00019717022697626142, "loss": 0.0712, "step": 3957 }, { "epoch": 0.2558707070707071, "grad_norm": 0.07214487344026566, "learning_rate": 0.00019716861140585762, "loss": 0.0829, "step": 3958 }, { "epoch": 0.2559353535353535, "grad_norm": 0.07436203211545944, "learning_rate": 0.0001971669953810284, "loss": 0.1015, "step": 3959 }, { "epoch": 0.256, "grad_norm": 0.06146183982491493, "learning_rate": 0.00019716537890178125, "loss": 0.0836, "step": 3960 }, { "epoch": 0.25606464646464644, "grad_norm": 0.06979256123304367, "learning_rate": 0.00019716376196812372, "loss": 0.0947, "step": 3961 }, { "epoch": 0.25612929292929293, "grad_norm": 0.06719477474689484, "learning_rate": 0.0001971621445800634, "loss": 0.0728, "step": 3962 }, { "epoch": 0.2561939393939394, "grad_norm": 0.08400076627731323, "learning_rate": 0.00019716052673760787, "loss": 0.1037, "step": 3963 }, { "epoch": 0.25625858585858585, "grad_norm": 0.0729844942688942, "learning_rate": 0.00019715890844076468, "loss": 0.0961, "step": 3964 }, { "epoch": 0.25632323232323234, "grad_norm": 0.06257636845111847, "learning_rate": 0.0001971572896895414, "loss": 0.0833, "step": 3965 }, { "epoch": 0.2563878787878788, "grad_norm": 0.07040037214756012, "learning_rate": 0.0001971556704839456, "loss": 0.0957, "step": 3966 }, { "epoch": 0.25645252525252527, "grad_norm": 0.06887052208185196, "learning_rate": 0.0001971540508239848, "loss": 0.0981, "step": 3967 }, { "epoch": 0.2565171717171717, "grad_norm": 0.06848348677158356, "learning_rate": 0.00019715243070966663, "loss": 0.0892, "step": 3968 }, { "epoch": 0.2565171717171717, "eval_bleu": 15.6840452912209, "eval_loss": 0.09552548825740814, "eval_runtime": 
2.7638, "eval_samples_per_second": 11.578, "eval_steps_per_second": 1.447, "step": 3968 }, { "epoch": 0.2565818181818182, "grad_norm": 0.0749221071600914, "learning_rate": 0.00019715081014099871, "loss": 0.0962, "step": 3969 }, { "epoch": 0.2566464646464646, "grad_norm": 0.0686810314655304, "learning_rate": 0.00019714918911798855, "loss": 0.0905, "step": 3970 }, { "epoch": 0.2567111111111111, "grad_norm": 0.0811566561460495, "learning_rate": 0.00019714756764064375, "loss": 0.1034, "step": 3971 }, { "epoch": 0.2567757575757576, "grad_norm": 0.0615408793091774, "learning_rate": 0.00019714594570897188, "loss": 0.0797, "step": 3972 }, { "epoch": 0.25684040404040404, "grad_norm": 0.07170801609754562, "learning_rate": 0.00019714432332298054, "loss": 0.0857, "step": 3973 }, { "epoch": 0.2569050505050505, "grad_norm": 0.06380462646484375, "learning_rate": 0.00019714270048267732, "loss": 0.0769, "step": 3974 }, { "epoch": 0.25696969696969696, "grad_norm": 0.07562926411628723, "learning_rate": 0.0001971410771880698, "loss": 0.0881, "step": 3975 }, { "epoch": 0.25703434343434345, "grad_norm": 0.07395053654909134, "learning_rate": 0.00019713945343916557, "loss": 0.0895, "step": 3976 }, { "epoch": 0.2570989898989899, "grad_norm": 0.06585757434368134, "learning_rate": 0.00019713782923597224, "loss": 0.0803, "step": 3977 }, { "epoch": 0.25716363636363637, "grad_norm": 0.06458459049463272, "learning_rate": 0.0001971362045784974, "loss": 0.0836, "step": 3978 }, { "epoch": 0.2572282828282828, "grad_norm": 0.06941258907318115, "learning_rate": 0.00019713457946674866, "loss": 0.0914, "step": 3979 }, { "epoch": 0.2572929292929293, "grad_norm": 0.07414700835943222, "learning_rate": 0.00019713295390073356, "loss": 0.0966, "step": 3980 }, { "epoch": 0.2573575757575758, "grad_norm": 0.06744233518838882, "learning_rate": 0.00019713132788045976, "loss": 0.0832, "step": 3981 }, { "epoch": 0.2574222222222222, "grad_norm": 0.06691445410251617, "learning_rate": 0.00019712970140593487, "loss": 
0.0978, "step": 3982 }, { "epoch": 0.2574868686868687, "grad_norm": 0.08676749467849731, "learning_rate": 0.00019712807447716646, "loss": 0.1332, "step": 3983 }, { "epoch": 0.25755151515151514, "grad_norm": 0.09434791654348373, "learning_rate": 0.00019712644709416214, "loss": 0.0971, "step": 3984 }, { "epoch": 0.25755151515151514, "eval_bleu": 16.65200315890472, "eval_loss": 0.0925353541970253, "eval_runtime": 2.8006, "eval_samples_per_second": 11.426, "eval_steps_per_second": 1.428, "step": 3984 }, { "epoch": 0.25761616161616163, "grad_norm": 0.08148640394210815, "learning_rate": 0.0001971248192569296, "loss": 0.1068, "step": 3985 }, { "epoch": 0.25768080808080807, "grad_norm": 0.0725681483745575, "learning_rate": 0.00019712319096547632, "loss": 0.0873, "step": 3986 }, { "epoch": 0.25774545454545456, "grad_norm": 0.06095732003450394, "learning_rate": 0.00019712156221981, "loss": 0.0788, "step": 3987 }, { "epoch": 0.257810101010101, "grad_norm": 0.06826002895832062, "learning_rate": 0.00019711993301993824, "loss": 0.1022, "step": 3988 }, { "epoch": 0.2578747474747475, "grad_norm": 0.07453065365552902, "learning_rate": 0.0001971183033658687, "loss": 0.0957, "step": 3989 }, { "epoch": 0.2579393939393939, "grad_norm": 0.06986382603645325, "learning_rate": 0.00019711667325760895, "loss": 0.0939, "step": 3990 }, { "epoch": 0.2580040404040404, "grad_norm": 0.07774855196475983, "learning_rate": 0.0001971150426951666, "loss": 0.1147, "step": 3991 }, { "epoch": 0.2580686868686869, "grad_norm": 0.08281876146793365, "learning_rate": 0.0001971134116785493, "loss": 0.104, "step": 3992 }, { "epoch": 0.2581333333333333, "grad_norm": 0.06474122405052185, "learning_rate": 0.0001971117802077647, "loss": 0.0773, "step": 3993 }, { "epoch": 0.2581979797979798, "grad_norm": 0.10273505002260208, "learning_rate": 0.00019711014828282036, "loss": 0.0994, "step": 3994 }, { "epoch": 0.25826262626262625, "grad_norm": 0.07526735216379166, "learning_rate": 0.00019710851590372402, "loss": 0.0975, 
"step": 3995 }, { "epoch": 0.25832727272727274, "grad_norm": 0.07041803002357483, "learning_rate": 0.00019710688307048323, "loss": 0.0788, "step": 3996 }, { "epoch": 0.2583919191919192, "grad_norm": 0.08080800622701645, "learning_rate": 0.00019710524978310568, "loss": 0.1069, "step": 3997 }, { "epoch": 0.25845656565656566, "grad_norm": 0.06575463712215424, "learning_rate": 0.00019710361604159893, "loss": 0.0772, "step": 3998 }, { "epoch": 0.2585212121212121, "grad_norm": 0.06528215110301971, "learning_rate": 0.0001971019818459707, "loss": 0.0785, "step": 3999 }, { "epoch": 0.2585858585858586, "grad_norm": 0.07088376581668854, "learning_rate": 0.00019710034719622857, "loss": 0.0856, "step": 4000 }, { "epoch": 0.2585858585858586, "eval_bleu": 13.79800200866905, "eval_loss": 0.09347105771303177, "eval_runtime": 2.7798, "eval_samples_per_second": 11.512, "eval_steps_per_second": 1.439, "step": 4000 }, { "epoch": 0.2586505050505051, "grad_norm": 0.06831807643175125, "learning_rate": 0.00019709871209238028, "loss": 0.0935, "step": 4001 }, { "epoch": 0.2587151515151515, "grad_norm": 0.08285486698150635, "learning_rate": 0.00019709707653443336, "loss": 0.1203, "step": 4002 }, { "epoch": 0.258779797979798, "grad_norm": 0.06411155313253403, "learning_rate": 0.00019709544052239552, "loss": 0.0853, "step": 4003 }, { "epoch": 0.25884444444444443, "grad_norm": 0.06480775028467178, "learning_rate": 0.00019709380405627443, "loss": 0.0886, "step": 4004 }, { "epoch": 0.2589090909090909, "grad_norm": 0.07566830515861511, "learning_rate": 0.00019709216713607769, "loss": 0.102, "step": 4005 }, { "epoch": 0.25897373737373736, "grad_norm": 0.06415920704603195, "learning_rate": 0.000197090529761813, "loss": 0.077, "step": 4006 }, { "epoch": 0.25903838383838385, "grad_norm": 0.07580181956291199, "learning_rate": 0.00019708889193348797, "loss": 0.0992, "step": 4007 }, { "epoch": 0.2591030303030303, "grad_norm": 0.07114388793706894, "learning_rate": 0.00019708725365111032, "loss": 0.0858, 
"step": 4008 }, { "epoch": 0.25916767676767677, "grad_norm": 0.06733395159244537, "learning_rate": 0.00019708561491468766, "loss": 0.0815, "step": 4009 }, { "epoch": 0.25923232323232326, "grad_norm": 0.06934670358896255, "learning_rate": 0.00019708397572422768, "loss": 0.0946, "step": 4010 }, { "epoch": 0.2592969696969697, "grad_norm": 0.0639420673251152, "learning_rate": 0.00019708233607973808, "loss": 0.0874, "step": 4011 }, { "epoch": 0.2593616161616162, "grad_norm": 0.07199052721261978, "learning_rate": 0.00019708069598122646, "loss": 0.0924, "step": 4012 }, { "epoch": 0.2594262626262626, "grad_norm": 0.07812098413705826, "learning_rate": 0.0001970790554287005, "loss": 0.1182, "step": 4013 }, { "epoch": 0.2594909090909091, "grad_norm": 0.061200935393571854, "learning_rate": 0.0001970774144221679, "loss": 0.0866, "step": 4014 }, { "epoch": 0.25955555555555554, "grad_norm": 0.08956077694892883, "learning_rate": 0.00019707577296163633, "loss": 0.1054, "step": 4015 }, { "epoch": 0.25962020202020203, "grad_norm": 0.06875993311405182, "learning_rate": 0.00019707413104711345, "loss": 0.1028, "step": 4016 }, { "epoch": 0.25962020202020203, "eval_bleu": 16.56071065349125, "eval_loss": 0.0921887457370758, "eval_runtime": 2.8312, "eval_samples_per_second": 11.303, "eval_steps_per_second": 1.413, "step": 4016 }, { "epoch": 0.25968484848484846, "grad_norm": 0.07921290397644043, "learning_rate": 0.00019707248867860698, "loss": 0.0978, "step": 4017 }, { "epoch": 0.25974949494949495, "grad_norm": 0.06352989375591278, "learning_rate": 0.00019707084585612457, "loss": 0.0876, "step": 4018 }, { "epoch": 0.25981414141414144, "grad_norm": 0.06229955330491066, "learning_rate": 0.00019706920257967387, "loss": 0.0885, "step": 4019 }, { "epoch": 0.2598787878787879, "grad_norm": 0.05789710953831673, "learning_rate": 0.00019706755884926262, "loss": 0.0699, "step": 4020 }, { "epoch": 0.25994343434343437, "grad_norm": 0.07082050293684006, "learning_rate": 0.0001970659146648985, "loss": 
0.0927, "step": 4021 }, { "epoch": 0.2600080808080808, "grad_norm": 0.06347637623548508, "learning_rate": 0.0001970642700265892, "loss": 0.0834, "step": 4022 }, { "epoch": 0.2600727272727273, "grad_norm": 0.0721856951713562, "learning_rate": 0.00019706262493434236, "loss": 0.1107, "step": 4023 }, { "epoch": 0.2601373737373737, "grad_norm": 0.07536016404628754, "learning_rate": 0.00019706097938816571, "loss": 0.0793, "step": 4024 }, { "epoch": 0.2602020202020202, "grad_norm": 0.07097842544317245, "learning_rate": 0.00019705933338806696, "loss": 0.0873, "step": 4025 }, { "epoch": 0.26026666666666665, "grad_norm": 0.06801895797252655, "learning_rate": 0.00019705768693405382, "loss": 0.078, "step": 4026 }, { "epoch": 0.26033131313131314, "grad_norm": 0.07171180099248886, "learning_rate": 0.00019705604002613394, "loss": 0.0889, "step": 4027 }, { "epoch": 0.26039595959595957, "grad_norm": 0.0710098072886467, "learning_rate": 0.00019705439266431505, "loss": 0.0854, "step": 4028 }, { "epoch": 0.26046060606060606, "grad_norm": 0.06959763914346695, "learning_rate": 0.00019705274484860486, "loss": 0.0832, "step": 4029 }, { "epoch": 0.26052525252525255, "grad_norm": 0.0706619843840599, "learning_rate": 0.00019705109657901105, "loss": 0.0925, "step": 4030 }, { "epoch": 0.260589898989899, "grad_norm": 0.06845133006572723, "learning_rate": 0.00019704944785554137, "loss": 0.078, "step": 4031 }, { "epoch": 0.26065454545454547, "grad_norm": 0.06288308650255203, "learning_rate": 0.00019704779867820346, "loss": 0.0732, "step": 4032 }, { "epoch": 0.26065454545454547, "eval_bleu": 15.595858446806032, "eval_loss": 0.0905570536851883, "eval_runtime": 2.6962, "eval_samples_per_second": 11.869, "eval_steps_per_second": 1.484, "step": 4032 }, { "epoch": 0.2607191919191919, "grad_norm": 0.06585483253002167, "learning_rate": 0.0001970461490470051, "loss": 0.0848, "step": 4033 }, { "epoch": 0.2607838383838384, "grad_norm": 0.08028880506753922, "learning_rate": 0.000197044498961954, "loss": 
0.1066, "step": 4034 }, { "epoch": 0.26084848484848483, "grad_norm": 0.0670945793390274, "learning_rate": 0.00019704284842305786, "loss": 0.09, "step": 4035 }, { "epoch": 0.2609131313131313, "grad_norm": 0.054228901863098145, "learning_rate": 0.00019704119743032438, "loss": 0.0719, "step": 4036 }, { "epoch": 0.26097777777777775, "grad_norm": 0.05931509658694267, "learning_rate": 0.00019703954598376133, "loss": 0.0765, "step": 4037 }, { "epoch": 0.26104242424242424, "grad_norm": 0.07083901017904282, "learning_rate": 0.00019703789408337638, "loss": 0.0911, "step": 4038 }, { "epoch": 0.26110707070707073, "grad_norm": 0.055335883051157, "learning_rate": 0.0001970362417291773, "loss": 0.0821, "step": 4039 }, { "epoch": 0.26117171717171717, "grad_norm": 0.06416014581918716, "learning_rate": 0.00019703458892117177, "loss": 0.0933, "step": 4040 }, { "epoch": 0.26123636363636366, "grad_norm": 0.07569090276956558, "learning_rate": 0.00019703293565936756, "loss": 0.0944, "step": 4041 }, { "epoch": 0.2613010101010101, "grad_norm": 0.07058663666248322, "learning_rate": 0.0001970312819437724, "loss": 0.0998, "step": 4042 }, { "epoch": 0.2613656565656566, "grad_norm": 0.26307740807533264, "learning_rate": 0.000197029627774394, "loss": 0.1567, "step": 4043 }, { "epoch": 0.261430303030303, "grad_norm": 0.06781303137540817, "learning_rate": 0.0001970279731512401, "loss": 0.0986, "step": 4044 }, { "epoch": 0.2614949494949495, "grad_norm": 0.06772926449775696, "learning_rate": 0.00019702631807431845, "loss": 0.0921, "step": 4045 }, { "epoch": 0.26155959595959594, "grad_norm": 0.07634842395782471, "learning_rate": 0.0001970246625436368, "loss": 0.1125, "step": 4046 }, { "epoch": 0.2616242424242424, "grad_norm": 0.06506653130054474, "learning_rate": 0.00019702300655920286, "loss": 0.08, "step": 4047 }, { "epoch": 0.2616888888888889, "grad_norm": 0.06255129724740982, "learning_rate": 0.00019702135012102443, "loss": 0.0848, "step": 4048 }, { "epoch": 0.2616888888888889, "eval_bleu": 
13.59986281323152, "eval_loss": 0.09147222340106964, "eval_runtime": 2.7494, "eval_samples_per_second": 11.639, "eval_steps_per_second": 1.455, "step": 4048 }, { "epoch": 0.26175353535353535, "grad_norm": 0.06937336921691895, "learning_rate": 0.0001970196932291092, "loss": 0.1003, "step": 4049 }, { "epoch": 0.26181818181818184, "grad_norm": 0.07006597518920898, "learning_rate": 0.00019701803588346493, "loss": 0.0874, "step": 4050 }, { "epoch": 0.2618828282828283, "grad_norm": 0.07093960791826248, "learning_rate": 0.00019701637808409937, "loss": 0.0852, "step": 4051 }, { "epoch": 0.26194747474747476, "grad_norm": 0.07252484560012817, "learning_rate": 0.0001970147198310203, "loss": 0.086, "step": 4052 }, { "epoch": 0.2620121212121212, "grad_norm": 0.07307127863168716, "learning_rate": 0.00019701306112423548, "loss": 0.1025, "step": 4053 }, { "epoch": 0.2620767676767677, "grad_norm": 0.07132066041231155, "learning_rate": 0.0001970114019637526, "loss": 0.0927, "step": 4054 }, { "epoch": 0.2621414141414141, "grad_norm": 0.07602684944868088, "learning_rate": 0.00019700974234957948, "loss": 0.0948, "step": 4055 }, { "epoch": 0.2622060606060606, "grad_norm": 0.06550707668066025, "learning_rate": 0.00019700808228172387, "loss": 0.0768, "step": 4056 }, { "epoch": 0.26227070707070704, "grad_norm": 0.06882788985967636, "learning_rate": 0.00019700642176019356, "loss": 0.0918, "step": 4057 }, { "epoch": 0.26233535353535353, "grad_norm": 0.06616871058940887, "learning_rate": 0.00019700476078499623, "loss": 0.084, "step": 4058 }, { "epoch": 0.2624, "grad_norm": 0.08565612137317657, "learning_rate": 0.00019700309935613973, "loss": 0.1056, "step": 4059 }, { "epoch": 0.26246464646464646, "grad_norm": 0.06691957265138626, "learning_rate": 0.0001970014374736318, "loss": 0.0949, "step": 4060 }, { "epoch": 0.26252929292929295, "grad_norm": 0.0686483085155487, "learning_rate": 0.00019699977513748022, "loss": 0.0941, "step": 4061 }, { "epoch": 0.2625939393939394, "grad_norm": 
0.06843416392803192, "learning_rate": 0.00019699811234769277, "loss": 0.0939, "step": 4062 }, { "epoch": 0.26265858585858587, "grad_norm": 0.06703022122383118, "learning_rate": 0.0001969964491042772, "loss": 0.0957, "step": 4063 }, { "epoch": 0.2627232323232323, "grad_norm": 0.0545797273516655, "learning_rate": 0.00019699478540724128, "loss": 0.0752, "step": 4064 }, { "epoch": 0.2627232323232323, "eval_bleu": 15.72931112639416, "eval_loss": 0.0923495963215828, "eval_runtime": 2.7634, "eval_samples_per_second": 11.58, "eval_steps_per_second": 1.447, "step": 4064 }, { "epoch": 0.2627878787878788, "grad_norm": 0.07546567916870117, "learning_rate": 0.00019699312125659284, "loss": 0.1028, "step": 4065 }, { "epoch": 0.2628525252525252, "grad_norm": 0.07956601679325104, "learning_rate": 0.00019699145665233965, "loss": 0.1, "step": 4066 }, { "epoch": 0.2629171717171717, "grad_norm": 0.06858038902282715, "learning_rate": 0.00019698979159448948, "loss": 0.0858, "step": 4067 }, { "epoch": 0.2629818181818182, "grad_norm": 0.05877428874373436, "learning_rate": 0.00019698812608305008, "loss": 0.0656, "step": 4068 }, { "epoch": 0.26304646464646464, "grad_norm": 0.08447999507188797, "learning_rate": 0.0001969864601180293, "loss": 0.0942, "step": 4069 }, { "epoch": 0.26311111111111113, "grad_norm": 0.07059522718191147, "learning_rate": 0.00019698479369943493, "loss": 0.0884, "step": 4070 }, { "epoch": 0.26317575757575756, "grad_norm": 0.06848535686731339, "learning_rate": 0.00019698312682727472, "loss": 0.0907, "step": 4071 }, { "epoch": 0.26324040404040405, "grad_norm": 0.08033183962106705, "learning_rate": 0.00019698145950155648, "loss": 0.1056, "step": 4072 }, { "epoch": 0.2633050505050505, "grad_norm": 0.09901220351457596, "learning_rate": 0.00019697979172228803, "loss": 0.1062, "step": 4073 }, { "epoch": 0.263369696969697, "grad_norm": 0.09226621687412262, "learning_rate": 0.00019697812348947716, "loss": 0.1293, "step": 4074 }, { "epoch": 0.2634343434343434, "grad_norm": 
0.07036930322647095, "learning_rate": 0.00019697645480313167, "loss": 0.0874, "step": 4075 }, { "epoch": 0.2634989898989899, "grad_norm": 0.06986100226640701, "learning_rate": 0.00019697478566325935, "loss": 0.086, "step": 4076 }, { "epoch": 0.2635636363636364, "grad_norm": 0.06769050657749176, "learning_rate": 0.00019697311606986802, "loss": 0.0902, "step": 4077 }, { "epoch": 0.2636282828282828, "grad_norm": 0.07077699154615402, "learning_rate": 0.00019697144602296546, "loss": 0.1002, "step": 4078 }, { "epoch": 0.2636929292929293, "grad_norm": 0.07546249777078629, "learning_rate": 0.00019696977552255955, "loss": 0.1043, "step": 4079 }, { "epoch": 0.26375757575757575, "grad_norm": 0.07374468445777893, "learning_rate": 0.00019696810456865804, "loss": 0.1015, "step": 4080 }, { "epoch": 0.26375757575757575, "eval_bleu": 17.54633083888607, "eval_loss": 0.09299148619174957, "eval_runtime": 2.6569, "eval_samples_per_second": 12.044, "eval_steps_per_second": 1.505, "step": 4080 }, { "epoch": 0.26382222222222224, "grad_norm": 0.08618441224098206, "learning_rate": 0.00019696643316126876, "loss": 0.0947, "step": 4081 }, { "epoch": 0.26388686868686867, "grad_norm": 0.061502184718847275, "learning_rate": 0.0001969647613003995, "loss": 0.0876, "step": 4082 }, { "epoch": 0.26395151515151516, "grad_norm": 0.05714648962020874, "learning_rate": 0.00019696308898605816, "loss": 0.0704, "step": 4083 }, { "epoch": 0.2640161616161616, "grad_norm": 0.07035606354475021, "learning_rate": 0.0001969614162182525, "loss": 0.0988, "step": 4084 }, { "epoch": 0.2640808080808081, "grad_norm": 0.061043307185173035, "learning_rate": 0.00019695974299699032, "loss": 0.0856, "step": 4085 }, { "epoch": 0.26414545454545457, "grad_norm": 0.07307825982570648, "learning_rate": 0.0001969580693222795, "loss": 0.1049, "step": 4086 }, { "epoch": 0.264210101010101, "grad_norm": 0.06520330160856247, "learning_rate": 0.0001969563951941278, "loss": 0.0981, "step": 4087 }, { "epoch": 0.2642747474747475, "grad_norm": 
0.07047823816537857, "learning_rate": 0.00019695472061254313, "loss": 0.0928, "step": 4088 }, { "epoch": 0.26433939393939393, "grad_norm": 0.06555815786123276, "learning_rate": 0.0001969530455775333, "loss": 0.0756, "step": 4089 }, { "epoch": 0.2644040404040404, "grad_norm": 0.06840948760509491, "learning_rate": 0.0001969513700891061, "loss": 0.0815, "step": 4090 }, { "epoch": 0.26446868686868685, "grad_norm": 0.06679616868495941, "learning_rate": 0.0001969496941472694, "loss": 0.0864, "step": 4091 }, { "epoch": 0.26453333333333334, "grad_norm": 0.07676058262586594, "learning_rate": 0.00019694801775203103, "loss": 0.0937, "step": 4092 }, { "epoch": 0.2645979797979798, "grad_norm": 0.07315148413181305, "learning_rate": 0.00019694634090339883, "loss": 0.1035, "step": 4093 }, { "epoch": 0.26466262626262627, "grad_norm": 0.07252030074596405, "learning_rate": 0.00019694466360138066, "loss": 0.0923, "step": 4094 }, { "epoch": 0.2647272727272727, "grad_norm": 0.06573876738548279, "learning_rate": 0.00019694298584598433, "loss": 0.0882, "step": 4095 }, { "epoch": 0.2647919191919192, "grad_norm": 0.1000770628452301, "learning_rate": 0.00019694130763721769, "loss": 0.1121, "step": 4096 }, { "epoch": 0.2647919191919192, "eval_bleu": 12.816742705015884, "eval_loss": 0.09407724440097809, "eval_runtime": 2.6931, "eval_samples_per_second": 11.882, "eval_steps_per_second": 1.485, "step": 4096 }, { "epoch": 0.2648565656565657, "grad_norm": 0.09010086208581924, "learning_rate": 0.00019693962897508863, "loss": 0.1212, "step": 4097 }, { "epoch": 0.2649212121212121, "grad_norm": 0.06527906656265259, "learning_rate": 0.00019693794985960495, "loss": 0.0828, "step": 4098 }, { "epoch": 0.2649858585858586, "grad_norm": 0.06824596971273422, "learning_rate": 0.00019693627029077458, "loss": 0.0846, "step": 4099 }, { "epoch": 0.26505050505050504, "grad_norm": 0.06638594716787338, "learning_rate": 0.00019693459026860527, "loss": 0.0841, "step": 4100 }, { "epoch": 0.2651151515151515, "grad_norm": 
0.06276428699493408, "learning_rate": 0.00019693290979310493, "loss": 0.0863, "step": 4101 }, { "epoch": 0.26517979797979796, "grad_norm": 0.06336116790771484, "learning_rate": 0.0001969312288642814, "loss": 0.0791, "step": 4102 }, { "epoch": 0.26524444444444445, "grad_norm": 0.07096125185489655, "learning_rate": 0.0001969295474821426, "loss": 0.0934, "step": 4103 }, { "epoch": 0.2653090909090909, "grad_norm": 0.061906930059194565, "learning_rate": 0.0001969278656466963, "loss": 0.0821, "step": 4104 }, { "epoch": 0.2653737373737374, "grad_norm": 0.07907071709632874, "learning_rate": 0.00019692618335795048, "loss": 0.1106, "step": 4105 }, { "epoch": 0.26543838383838386, "grad_norm": 0.08103641867637634, "learning_rate": 0.00019692450061591291, "loss": 0.1143, "step": 4106 }, { "epoch": 0.2655030303030303, "grad_norm": 0.06572283804416656, "learning_rate": 0.00019692281742059148, "loss": 0.092, "step": 4107 }, { "epoch": 0.2655676767676768, "grad_norm": 0.05516389012336731, "learning_rate": 0.0001969211337719941, "loss": 0.0709, "step": 4108 }, { "epoch": 0.2656323232323232, "grad_norm": 0.06728114187717438, "learning_rate": 0.00019691944967012862, "loss": 0.0812, "step": 4109 }, { "epoch": 0.2656969696969697, "grad_norm": 0.06304038316011429, "learning_rate": 0.0001969177651150029, "loss": 0.0811, "step": 4110 }, { "epoch": 0.26576161616161614, "grad_norm": 0.06709577888250351, "learning_rate": 0.00019691608010662483, "loss": 0.089, "step": 4111 }, { "epoch": 0.26582626262626263, "grad_norm": 0.07005739957094193, "learning_rate": 0.00019691439464500232, "loss": 0.0997, "step": 4112 }, { "epoch": 0.26582626262626263, "eval_bleu": 11.942859156509565, "eval_loss": 0.09186488389968872, "eval_runtime": 2.8176, "eval_samples_per_second": 11.357, "eval_steps_per_second": 1.42, "step": 4112 }, { "epoch": 0.26589090909090907, "grad_norm": 0.07307527959346771, "learning_rate": 0.0001969127087301432, "loss": 0.0844, "step": 4113 }, { "epoch": 0.26595555555555556, "grad_norm": 
0.07943221181631088, "learning_rate": 0.00019691102236205538, "loss": 0.0869, "step": 4114 }, { "epoch": 0.26602020202020205, "grad_norm": 0.06850095093250275, "learning_rate": 0.00019690933554074677, "loss": 0.0804, "step": 4115 }, { "epoch": 0.2660848484848485, "grad_norm": 0.06646157056093216, "learning_rate": 0.0001969076482662252, "loss": 0.0833, "step": 4116 }, { "epoch": 0.26614949494949497, "grad_norm": 0.06631097197532654, "learning_rate": 0.0001969059605384986, "loss": 0.0881, "step": 4117 }, { "epoch": 0.2662141414141414, "grad_norm": 0.06769176572561264, "learning_rate": 0.00019690427235757486, "loss": 0.0838, "step": 4118 }, { "epoch": 0.2662787878787879, "grad_norm": 0.07445267587900162, "learning_rate": 0.00019690258372346192, "loss": 0.0922, "step": 4119 }, { "epoch": 0.2663434343434343, "grad_norm": 0.07536054402589798, "learning_rate": 0.0001969008946361676, "loss": 0.0929, "step": 4120 }, { "epoch": 0.2664080808080808, "grad_norm": 0.08731085062026978, "learning_rate": 0.0001968992050956998, "loss": 0.1109, "step": 4121 }, { "epoch": 0.26647272727272725, "grad_norm": 0.07295013964176178, "learning_rate": 0.0001968975151020665, "loss": 0.0904, "step": 4122 }, { "epoch": 0.26653737373737374, "grad_norm": 0.07037265598773956, "learning_rate": 0.00019689582465527554, "loss": 0.0814, "step": 4123 }, { "epoch": 0.26660202020202023, "grad_norm": 0.06497006118297577, "learning_rate": 0.00019689413375533483, "loss": 0.0859, "step": 4124 }, { "epoch": 0.26666666666666666, "grad_norm": 0.06783854961395264, "learning_rate": 0.00019689244240225228, "loss": 0.0887, "step": 4125 }, { "epoch": 0.26673131313131315, "grad_norm": 0.06034316495060921, "learning_rate": 0.00019689075059603584, "loss": 0.085, "step": 4126 }, { "epoch": 0.2667959595959596, "grad_norm": 0.07142467051744461, "learning_rate": 0.00019688905833669336, "loss": 0.0974, "step": 4127 }, { "epoch": 0.2668606060606061, "grad_norm": 0.07669708877801895, "learning_rate": 0.0001968873656242328, 
"loss": 0.105, "step": 4128 }, { "epoch": 0.2668606060606061, "eval_bleu": 12.597572317323614, "eval_loss": 0.09188902378082275, "eval_runtime": 2.6064, "eval_samples_per_second": 12.278, "eval_steps_per_second": 1.535, "step": 4128 }, { "epoch": 0.2669252525252525, "grad_norm": 0.07436151057481766, "learning_rate": 0.00019688567245866207, "loss": 0.1007, "step": 4129 }, { "epoch": 0.266989898989899, "grad_norm": 0.07025950402021408, "learning_rate": 0.0001968839788399891, "loss": 0.0951, "step": 4130 }, { "epoch": 0.26705454545454543, "grad_norm": 0.07281804829835892, "learning_rate": 0.00019688228476822173, "loss": 0.0927, "step": 4131 }, { "epoch": 0.2671191919191919, "grad_norm": 0.07857491821050644, "learning_rate": 0.000196880590243368, "loss": 0.118, "step": 4132 }, { "epoch": 0.26718383838383836, "grad_norm": 0.0814911499619484, "learning_rate": 0.00019687889526543573, "loss": 0.1245, "step": 4133 }, { "epoch": 0.26724848484848485, "grad_norm": 0.0695725679397583, "learning_rate": 0.00019687719983443294, "loss": 0.1037, "step": 4134 }, { "epoch": 0.26731313131313134, "grad_norm": 0.07808953523635864, "learning_rate": 0.0001968755039503675, "loss": 0.1111, "step": 4135 }, { "epoch": 0.26737777777777777, "grad_norm": 0.10729251056909561, "learning_rate": 0.00019687380761324733, "loss": 0.1094, "step": 4136 }, { "epoch": 0.26744242424242426, "grad_norm": 0.06723996996879578, "learning_rate": 0.0001968721108230804, "loss": 0.1074, "step": 4137 }, { "epoch": 0.2675070707070707, "grad_norm": 0.06742935627698898, "learning_rate": 0.00019687041357987464, "loss": 0.101, "step": 4138 }, { "epoch": 0.2675717171717172, "grad_norm": 0.0687510296702385, "learning_rate": 0.000196868715883638, "loss": 0.0991, "step": 4139 }, { "epoch": 0.2676363636363636, "grad_norm": 0.06135712191462517, "learning_rate": 0.00019686701773437832, "loss": 0.0768, "step": 4140 }, { "epoch": 0.2677010101010101, "grad_norm": 0.08449401706457138, "learning_rate": 0.0001968653191321037, "loss": 
0.1067, "step": 4141 }, { "epoch": 0.26776565656565654, "grad_norm": 0.07005251199007034, "learning_rate": 0.00019686362007682198, "loss": 0.095, "step": 4142 }, { "epoch": 0.26783030303030303, "grad_norm": 0.06596967577934265, "learning_rate": 0.00019686192056854113, "loss": 0.0865, "step": 4143 }, { "epoch": 0.2678949494949495, "grad_norm": 0.07281509041786194, "learning_rate": 0.00019686022060726912, "loss": 0.0987, "step": 4144 }, { "epoch": 0.2678949494949495, "eval_bleu": 14.480730080839121, "eval_loss": 0.09211601316928864, "eval_runtime": 2.9376, "eval_samples_per_second": 10.893, "eval_steps_per_second": 1.362, "step": 4144 }, { "epoch": 0.26795959595959595, "grad_norm": 0.0651697888970375, "learning_rate": 0.00019685852019301386, "loss": 0.0816, "step": 4145 }, { "epoch": 0.26802424242424244, "grad_norm": 0.08566612005233765, "learning_rate": 0.00019685681932578333, "loss": 0.1194, "step": 4146 }, { "epoch": 0.2680888888888889, "grad_norm": 0.06304305791854858, "learning_rate": 0.00019685511800558546, "loss": 0.0806, "step": 4147 }, { "epoch": 0.26815353535353537, "grad_norm": 0.07262536138296127, "learning_rate": 0.00019685341623242822, "loss": 0.102, "step": 4148 }, { "epoch": 0.2682181818181818, "grad_norm": 0.07394552230834961, "learning_rate": 0.00019685171400631962, "loss": 0.0941, "step": 4149 }, { "epoch": 0.2682828282828283, "grad_norm": 0.08123172074556351, "learning_rate": 0.0001968500113272675, "loss": 0.1118, "step": 4150 }, { "epoch": 0.2683474747474747, "grad_norm": 0.07228460907936096, "learning_rate": 0.00019684830819527993, "loss": 0.0809, "step": 4151 }, { "epoch": 0.2684121212121212, "grad_norm": 0.07674358785152435, "learning_rate": 0.00019684660461036487, "loss": 0.0953, "step": 4152 }, { "epoch": 0.2684767676767677, "grad_norm": 0.0802217498421669, "learning_rate": 0.00019684490057253023, "loss": 0.1075, "step": 4153 }, { "epoch": 0.26854141414141414, "grad_norm": 0.07244308292865753, "learning_rate": 0.000196843196081784, "loss": 
0.1015, "step": 4154 }, { "epoch": 0.2686060606060606, "grad_norm": 0.06194354221224785, "learning_rate": 0.00019684149113813416, "loss": 0.0699, "step": 4155 }, { "epoch": 0.26867070707070706, "grad_norm": 0.07043841481208801, "learning_rate": 0.0001968397857415887, "loss": 0.0988, "step": 4156 }, { "epoch": 0.26873535353535355, "grad_norm": 0.07158217579126358, "learning_rate": 0.00019683807989215554, "loss": 0.0969, "step": 4157 }, { "epoch": 0.2688, "grad_norm": 0.06150287017226219, "learning_rate": 0.00019683637358984272, "loss": 0.078, "step": 4158 }, { "epoch": 0.2688646464646465, "grad_norm": 0.06979337334632874, "learning_rate": 0.0001968346668346582, "loss": 0.0944, "step": 4159 }, { "epoch": 0.2689292929292929, "grad_norm": 0.06354733556509018, "learning_rate": 0.00019683295962660995, "loss": 0.0865, "step": 4160 }, { "epoch": 0.2689292929292929, "eval_bleu": 12.0324787057154, "eval_loss": 0.09236061573028564, "eval_runtime": 2.7182, "eval_samples_per_second": 11.773, "eval_steps_per_second": 1.472, "step": 4160 }, { "epoch": 0.2689939393939394, "grad_norm": 0.07068794220685959, "learning_rate": 0.00019683125196570594, "loss": 0.0892, "step": 4161 }, { "epoch": 0.2690585858585859, "grad_norm": 0.06503379344940186, "learning_rate": 0.0001968295438519542, "loss": 0.0836, "step": 4162 }, { "epoch": 0.2691232323232323, "grad_norm": 0.06750360131263733, "learning_rate": 0.0001968278352853627, "loss": 0.087, "step": 4163 }, { "epoch": 0.2691878787878788, "grad_norm": 0.06069406867027283, "learning_rate": 0.0001968261262659394, "loss": 0.0807, "step": 4164 }, { "epoch": 0.26925252525252524, "grad_norm": 0.06780527532100677, "learning_rate": 0.00019682441679369232, "loss": 0.0729, "step": 4165 }, { "epoch": 0.26931717171717173, "grad_norm": 0.07256273925304413, "learning_rate": 0.00019682270686862947, "loss": 0.0977, "step": 4166 }, { "epoch": 0.26938181818181817, "grad_norm": 0.0687905102968216, "learning_rate": 0.0001968209964907588, "loss": 0.0993, "step": 
4167 }, { "epoch": 0.26944646464646466, "grad_norm": 0.06238539516925812, "learning_rate": 0.00019681928566008837, "loss": 0.0735, "step": 4168 }, { "epoch": 0.2695111111111111, "grad_norm": 0.07545731216669083, "learning_rate": 0.00019681757437662616, "loss": 0.1009, "step": 4169 }, { "epoch": 0.2695757575757576, "grad_norm": 0.0640399381518364, "learning_rate": 0.0001968158626403801, "loss": 0.0823, "step": 4170 }, { "epoch": 0.269640404040404, "grad_norm": 0.06572475284337997, "learning_rate": 0.0001968141504513583, "loss": 0.0844, "step": 4171 }, { "epoch": 0.2697050505050505, "grad_norm": 0.07045052200555801, "learning_rate": 0.00019681243780956872, "loss": 0.0975, "step": 4172 }, { "epoch": 0.269769696969697, "grad_norm": 0.07127857953310013, "learning_rate": 0.0001968107247150194, "loss": 0.1157, "step": 4173 }, { "epoch": 0.2698343434343434, "grad_norm": 0.07238567620515823, "learning_rate": 0.00019680901116771827, "loss": 0.0889, "step": 4174 }, { "epoch": 0.2698989898989899, "grad_norm": 0.07276687771081924, "learning_rate": 0.00019680729716767346, "loss": 0.0933, "step": 4175 }, { "epoch": 0.26996363636363635, "grad_norm": 0.07266431301832199, "learning_rate": 0.0001968055827148929, "loss": 0.0957, "step": 4176 }, { "epoch": 0.26996363636363635, "eval_bleu": 13.410844823593065, "eval_loss": 0.09154077619314194, "eval_runtime": 2.8797, "eval_samples_per_second": 11.112, "eval_steps_per_second": 1.389, "step": 4176 }, { "epoch": 0.27002828282828284, "grad_norm": 0.06851734966039658, "learning_rate": 0.0001968038678093846, "loss": 0.0918, "step": 4177 }, { "epoch": 0.2700929292929293, "grad_norm": 0.07407873868942261, "learning_rate": 0.00019680215245115664, "loss": 0.1107, "step": 4178 }, { "epoch": 0.27015757575757576, "grad_norm": 0.06345862150192261, "learning_rate": 0.00019680043664021702, "loss": 0.0877, "step": 4179 }, { "epoch": 0.2702222222222222, "grad_norm": 0.06200195103883743, "learning_rate": 0.00019679872037657378, "loss": 0.0856, "step": 
4180 }, { "epoch": 0.2702868686868687, "grad_norm": 0.06358761340379715, "learning_rate": 0.0001967970036602349, "loss": 0.0868, "step": 4181 }, { "epoch": 0.2703515151515152, "grad_norm": 0.08329736441373825, "learning_rate": 0.00019679528649120843, "loss": 0.0924, "step": 4182 }, { "epoch": 0.2704161616161616, "grad_norm": 0.07827464491128922, "learning_rate": 0.0001967935688695024, "loss": 0.0906, "step": 4183 }, { "epoch": 0.2704808080808081, "grad_norm": 0.05834691971540451, "learning_rate": 0.00019679185079512487, "loss": 0.0743, "step": 4184 }, { "epoch": 0.27054545454545453, "grad_norm": 0.06297656148672104, "learning_rate": 0.00019679013226808385, "loss": 0.079, "step": 4185 }, { "epoch": 0.270610101010101, "grad_norm": 0.06820444762706757, "learning_rate": 0.00019678841328838736, "loss": 0.0846, "step": 4186 }, { "epoch": 0.27067474747474746, "grad_norm": 0.0740358978509903, "learning_rate": 0.00019678669385604346, "loss": 0.0984, "step": 4187 }, { "epoch": 0.27073939393939395, "grad_norm": 0.07049202919006348, "learning_rate": 0.0001967849739710602, "loss": 0.0921, "step": 4188 }, { "epoch": 0.2708040404040404, "grad_norm": 0.06729531288146973, "learning_rate": 0.00019678325363344562, "loss": 0.0932, "step": 4189 }, { "epoch": 0.27086868686868687, "grad_norm": 0.08443735539913177, "learning_rate": 0.00019678153284320775, "loss": 0.0954, "step": 4190 }, { "epoch": 0.27093333333333336, "grad_norm": 0.06944546103477478, "learning_rate": 0.00019677981160035467, "loss": 0.0959, "step": 4191 }, { "epoch": 0.2709979797979798, "grad_norm": 0.07118839770555496, "learning_rate": 0.00019677808990489437, "loss": 0.0995, "step": 4192 }, { "epoch": 0.2709979797979798, "eval_bleu": 13.054764479958287, "eval_loss": 0.09221772849559784, "eval_runtime": 2.6813, "eval_samples_per_second": 11.935, "eval_steps_per_second": 1.492, "step": 4192 }, { "epoch": 0.2710626262626263, "grad_norm": 0.07209117710590363, "learning_rate": 0.00019677636775683493, "loss": 0.0971, "step": 
4193 }, { "epoch": 0.2711272727272727, "grad_norm": 0.07843372970819473, "learning_rate": 0.00019677464515618445, "loss": 0.0993, "step": 4194 }, { "epoch": 0.2711919191919192, "grad_norm": 0.06937476992607117, "learning_rate": 0.00019677292210295094, "loss": 0.0894, "step": 4195 }, { "epoch": 0.27125656565656564, "grad_norm": 0.07377466559410095, "learning_rate": 0.00019677119859714244, "loss": 0.102, "step": 4196 }, { "epoch": 0.27132121212121213, "grad_norm": 0.06878658384084702, "learning_rate": 0.00019676947463876705, "loss": 0.0849, "step": 4197 }, { "epoch": 0.27138585858585856, "grad_norm": 0.06169632449746132, "learning_rate": 0.0001967677502278328, "loss": 0.072, "step": 4198 }, { "epoch": 0.27145050505050505, "grad_norm": 0.06099303811788559, "learning_rate": 0.0001967660253643478, "loss": 0.0817, "step": 4199 }, { "epoch": 0.27151515151515154, "grad_norm": 0.06536420434713364, "learning_rate": 0.00019676430004832008, "loss": 0.094, "step": 4200 }, { "epoch": 0.271579797979798, "grad_norm": 0.06275838613510132, "learning_rate": 0.00019676257427975774, "loss": 0.0773, "step": 4201 }, { "epoch": 0.27164444444444447, "grad_norm": 0.08492186665534973, "learning_rate": 0.00019676084805866882, "loss": 0.1222, "step": 4202 }, { "epoch": 0.2717090909090909, "grad_norm": 0.08464498817920685, "learning_rate": 0.0001967591213850614, "loss": 0.0994, "step": 4203 }, { "epoch": 0.2717737373737374, "grad_norm": 0.06961265951395035, "learning_rate": 0.00019675739425894354, "loss": 0.0928, "step": 4204 }, { "epoch": 0.2718383838383838, "grad_norm": 0.06475110352039337, "learning_rate": 0.00019675566668032332, "loss": 0.0972, "step": 4205 }, { "epoch": 0.2719030303030303, "grad_norm": 0.0936918705701828, "learning_rate": 0.00019675393864920885, "loss": 0.105, "step": 4206 }, { "epoch": 0.27196767676767675, "grad_norm": 0.05950174108147621, "learning_rate": 0.0001967522101656082, "loss": 0.0822, "step": 4207 }, { "epoch": 0.27203232323232324, "grad_norm": 
0.06783674657344818, "learning_rate": 0.00019675048122952946, "loss": 0.0862, "step": 4208 }, { "epoch": 0.27203232323232324, "eval_bleu": 12.37491224465256, "eval_loss": 0.09030617773532867, "eval_runtime": 2.7019, "eval_samples_per_second": 11.844, "eval_steps_per_second": 1.48, "step": 4208 }, { "epoch": 0.27209696969696967, "grad_norm": 0.07009829580783844, "learning_rate": 0.00019674875184098073, "loss": 0.099, "step": 4209 }, { "epoch": 0.27216161616161616, "grad_norm": 0.07490433752536774, "learning_rate": 0.00019674702199997, "loss": 0.1026, "step": 4210 }, { "epoch": 0.27222626262626265, "grad_norm": 0.06616954505443573, "learning_rate": 0.00019674529170650548, "loss": 0.0874, "step": 4211 }, { "epoch": 0.2722909090909091, "grad_norm": 0.08078459650278091, "learning_rate": 0.0001967435609605952, "loss": 0.1052, "step": 4212 }, { "epoch": 0.2723555555555556, "grad_norm": 0.07755498588085175, "learning_rate": 0.00019674182976224728, "loss": 0.0922, "step": 4213 }, { "epoch": 0.272420202020202, "grad_norm": 0.067042276263237, "learning_rate": 0.0001967400981114698, "loss": 0.0847, "step": 4214 }, { "epoch": 0.2724848484848485, "grad_norm": 0.0760740265250206, "learning_rate": 0.00019673836600827085, "loss": 0.0953, "step": 4215 }, { "epoch": 0.27254949494949493, "grad_norm": 0.06655211746692657, "learning_rate": 0.00019673663345265858, "loss": 0.0951, "step": 4216 }, { "epoch": 0.2726141414141414, "grad_norm": 0.06735111773014069, "learning_rate": 0.000196734900444641, "loss": 0.0995, "step": 4217 }, { "epoch": 0.27267878787878785, "grad_norm": 0.06664618104696274, "learning_rate": 0.00019673316698422632, "loss": 0.0897, "step": 4218 }, { "epoch": 0.27274343434343434, "grad_norm": 0.0812290832400322, "learning_rate": 0.0001967314330714226, "loss": 0.1149, "step": 4219 }, { "epoch": 0.27280808080808083, "grad_norm": 0.0654434934258461, "learning_rate": 0.0001967296987062379, "loss": 0.0942, "step": 4220 }, { "epoch": 0.27287272727272727, "grad_norm": 
0.06732401251792908, "learning_rate": 0.0001967279638886804, "loss": 0.0977, "step": 4221 }, { "epoch": 0.27293737373737376, "grad_norm": 0.08123961091041565, "learning_rate": 0.0001967262286187582, "loss": 0.0893, "step": 4222 }, { "epoch": 0.2730020202020202, "grad_norm": 0.06223050132393837, "learning_rate": 0.0001967244928964794, "loss": 0.0963, "step": 4223 }, { "epoch": 0.2730666666666667, "grad_norm": 0.0640723705291748, "learning_rate": 0.0001967227567218521, "loss": 0.0831, "step": 4224 }, { "epoch": 0.2730666666666667, "eval_bleu": 11.359121908479228, "eval_loss": 0.09155243635177612, "eval_runtime": 2.7331, "eval_samples_per_second": 11.708, "eval_steps_per_second": 1.464, "step": 4224 }, { "epoch": 0.2731313131313131, "grad_norm": 0.06645813584327698, "learning_rate": 0.00019672102009488446, "loss": 0.0807, "step": 4225 }, { "epoch": 0.2731959595959596, "grad_norm": 0.06468161195516586, "learning_rate": 0.0001967192830155846, "loss": 0.0788, "step": 4226 }, { "epoch": 0.27326060606060604, "grad_norm": 0.06649275869131088, "learning_rate": 0.0001967175454839606, "loss": 0.0853, "step": 4227 }, { "epoch": 0.2733252525252525, "grad_norm": 0.06614337116479874, "learning_rate": 0.00019671580750002066, "loss": 0.0897, "step": 4228 }, { "epoch": 0.273389898989899, "grad_norm": 0.07503994554281235, "learning_rate": 0.0001967140690637728, "loss": 0.1082, "step": 4229 }, { "epoch": 0.27345454545454545, "grad_norm": 0.07090439647436142, "learning_rate": 0.00019671233017522525, "loss": 0.0936, "step": 4230 }, { "epoch": 0.27351919191919194, "grad_norm": 0.0631272941827774, "learning_rate": 0.0001967105908343861, "loss": 0.0883, "step": 4231 }, { "epoch": 0.2735838383838384, "grad_norm": 0.0636717900633812, "learning_rate": 0.00019670885104126348, "loss": 0.0846, "step": 4232 }, { "epoch": 0.27364848484848486, "grad_norm": 0.06532414257526398, "learning_rate": 0.00019670711079586555, "loss": 0.0868, "step": 4233 }, { "epoch": 0.2737131313131313, "grad_norm": 
0.07801687717437744, "learning_rate": 0.0001967053700982004, "loss": 0.1081, "step": 4234 }, { "epoch": 0.2737777777777778, "grad_norm": 0.07180683314800262, "learning_rate": 0.00019670362894827625, "loss": 0.0866, "step": 4235 }, { "epoch": 0.2738424242424242, "grad_norm": 0.06685402989387512, "learning_rate": 0.00019670188734610116, "loss": 0.0985, "step": 4236 }, { "epoch": 0.2739070707070707, "grad_norm": 0.09973353892564774, "learning_rate": 0.00019670014529168334, "loss": 0.0859, "step": 4237 }, { "epoch": 0.27397171717171714, "grad_norm": 0.1273084133863449, "learning_rate": 0.0001966984027850309, "loss": 0.0935, "step": 4238 }, { "epoch": 0.27403636363636363, "grad_norm": 0.06016869097948074, "learning_rate": 0.000196696659826152, "loss": 0.0791, "step": 4239 }, { "epoch": 0.2741010101010101, "grad_norm": 0.08196477591991425, "learning_rate": 0.0001966949164150548, "loss": 0.1009, "step": 4240 }, { "epoch": 0.2741010101010101, "eval_bleu": 13.639647915974061, "eval_loss": 0.09162388741970062, "eval_runtime": 2.8332, "eval_samples_per_second": 11.295, "eval_steps_per_second": 1.412, "step": 4240 }, { "epoch": 0.27416565656565656, "grad_norm": 0.07231196016073227, "learning_rate": 0.0001966931725517474, "loss": 0.0955, "step": 4241 }, { "epoch": 0.27423030303030305, "grad_norm": 0.061195503920316696, "learning_rate": 0.00019669142823623803, "loss": 0.0774, "step": 4242 }, { "epoch": 0.2742949494949495, "grad_norm": 0.06734532862901688, "learning_rate": 0.00019668968346853483, "loss": 0.0924, "step": 4243 }, { "epoch": 0.27435959595959597, "grad_norm": 0.06743648648262024, "learning_rate": 0.0001966879382486459, "loss": 0.0928, "step": 4244 }, { "epoch": 0.2744242424242424, "grad_norm": 0.06553462147712708, "learning_rate": 0.0001966861925765795, "loss": 0.0802, "step": 4245 }, { "epoch": 0.2744888888888889, "grad_norm": 0.07010167837142944, "learning_rate": 0.00019668444645234368, "loss": 0.085, "step": 4246 }, { "epoch": 0.2745535353535353, "grad_norm": 
0.10386139899492264, "learning_rate": 0.00019668269987594672, "loss": 0.0772, "step": 4247 }, { "epoch": 0.2746181818181818, "grad_norm": 0.07533600926399231, "learning_rate": 0.0001966809528473967, "loss": 0.1038, "step": 4248 }, { "epoch": 0.2746828282828283, "grad_norm": 0.05863481014966965, "learning_rate": 0.00019667920536670187, "loss": 0.0829, "step": 4249 }, { "epoch": 0.27474747474747474, "grad_norm": 0.06582505255937576, "learning_rate": 0.00019667745743387032, "loss": 0.0917, "step": 4250 }, { "epoch": 0.27481212121212123, "grad_norm": 0.06666736304759979, "learning_rate": 0.0001966757090489103, "loss": 0.0927, "step": 4251 }, { "epoch": 0.27487676767676766, "grad_norm": 0.06843499094247818, "learning_rate": 0.00019667396021182988, "loss": 0.0825, "step": 4252 }, { "epoch": 0.27494141414141415, "grad_norm": 0.06754453480243683, "learning_rate": 0.00019667221092263738, "loss": 0.0861, "step": 4253 }, { "epoch": 0.2750060606060606, "grad_norm": 0.06907545030117035, "learning_rate": 0.00019667046118134084, "loss": 0.0897, "step": 4254 }, { "epoch": 0.2750707070707071, "grad_norm": 0.08253129571676254, "learning_rate": 0.00019666871098794858, "loss": 0.0896, "step": 4255 }, { "epoch": 0.2751353535353535, "grad_norm": 0.07836899906396866, "learning_rate": 0.00019666696034246868, "loss": 0.1165, "step": 4256 }, { "epoch": 0.2751353535353535, "eval_bleu": 13.948044391220451, "eval_loss": 0.09246565401554108, "eval_runtime": 2.7025, "eval_samples_per_second": 11.841, "eval_steps_per_second": 1.48, "step": 4256 }, { "epoch": 0.2752, "grad_norm": 0.05887774005532265, "learning_rate": 0.00019666520924490938, "loss": 0.0791, "step": 4257 }, { "epoch": 0.2752646464646465, "grad_norm": 0.07451488077640533, "learning_rate": 0.00019666345769527884, "loss": 0.115, "step": 4258 }, { "epoch": 0.2753292929292929, "grad_norm": 0.07511168718338013, "learning_rate": 0.00019666170569358527, "loss": 0.1021, "step": 4259 }, { "epoch": 0.2753939393939394, "grad_norm": 
0.0797128900885582, "learning_rate": 0.00019665995323983685, "loss": 0.1086, "step": 4260 }, { "epoch": 0.27545858585858585, "grad_norm": 0.09208791702985764, "learning_rate": 0.0001966582003340418, "loss": 0.1192, "step": 4261 }, { "epoch": 0.27552323232323234, "grad_norm": 0.0739036276936531, "learning_rate": 0.0001966564469762083, "loss": 0.0949, "step": 4262 }, { "epoch": 0.27558787878787877, "grad_norm": 0.08899285644292831, "learning_rate": 0.00019665469316634452, "loss": 0.0847, "step": 4263 }, { "epoch": 0.27565252525252526, "grad_norm": 0.07904451340436935, "learning_rate": 0.00019665293890445872, "loss": 0.0983, "step": 4264 }, { "epoch": 0.2757171717171717, "grad_norm": 0.07048787921667099, "learning_rate": 0.00019665118419055907, "loss": 0.0984, "step": 4265 }, { "epoch": 0.2757818181818182, "grad_norm": 0.07577227056026459, "learning_rate": 0.0001966494290246538, "loss": 0.077, "step": 4266 }, { "epoch": 0.2758464646464647, "grad_norm": 0.059506893157958984, "learning_rate": 0.0001966476734067511, "loss": 0.0818, "step": 4267 }, { "epoch": 0.2759111111111111, "grad_norm": 0.0668000876903534, "learning_rate": 0.00019664591733685916, "loss": 0.0936, "step": 4268 }, { "epoch": 0.2759757575757576, "grad_norm": 0.06816105544567108, "learning_rate": 0.00019664416081498622, "loss": 0.0985, "step": 4269 }, { "epoch": 0.27604040404040403, "grad_norm": 0.07102940231561661, "learning_rate": 0.00019664240384114053, "loss": 0.0963, "step": 4270 }, { "epoch": 0.2761050505050505, "grad_norm": 0.061725977808237076, "learning_rate": 0.0001966406464153302, "loss": 0.0749, "step": 4271 }, { "epoch": 0.27616969696969695, "grad_norm": 0.08572505414485931, "learning_rate": 0.00019663888853756358, "loss": 0.1197, "step": 4272 }, { "epoch": 0.27616969696969695, "eval_bleu": 14.99990202706812, "eval_loss": 0.09235581755638123, "eval_runtime": 2.7835, "eval_samples_per_second": 11.496, "eval_steps_per_second": 1.437, "step": 4272 }, { "epoch": 0.27623434343434344, "grad_norm": 
0.06754416227340698, "learning_rate": 0.00019663713020784878, "loss": 0.0937, "step": 4273 }, { "epoch": 0.2762989898989899, "grad_norm": 0.06085435673594475, "learning_rate": 0.0001966353714261941, "loss": 0.0917, "step": 4274 }, { "epoch": 0.27636363636363637, "grad_norm": 0.05978894606232643, "learning_rate": 0.0001966336121926077, "loss": 0.0821, "step": 4275 }, { "epoch": 0.2764282828282828, "grad_norm": 0.07794878631830215, "learning_rate": 0.00019663185250709788, "loss": 0.1126, "step": 4276 }, { "epoch": 0.2764929292929293, "grad_norm": 0.06527364999055862, "learning_rate": 0.0001966300923696728, "loss": 0.0767, "step": 4277 }, { "epoch": 0.2765575757575758, "grad_norm": 0.06329480558633804, "learning_rate": 0.00019662833178034074, "loss": 0.093, "step": 4278 }, { "epoch": 0.2766222222222222, "grad_norm": 0.0752754807472229, "learning_rate": 0.00019662657073910993, "loss": 0.1161, "step": 4279 }, { "epoch": 0.2766868686868687, "grad_norm": 0.060222890228033066, "learning_rate": 0.00019662480924598859, "loss": 0.0818, "step": 4280 }, { "epoch": 0.27675151515151514, "grad_norm": 0.06183062493801117, "learning_rate": 0.00019662304730098495, "loss": 0.0805, "step": 4281 }, { "epoch": 0.2768161616161616, "grad_norm": 0.08070927858352661, "learning_rate": 0.00019662128490410724, "loss": 0.1063, "step": 4282 }, { "epoch": 0.27688080808080806, "grad_norm": 0.06460665166378021, "learning_rate": 0.00019661952205536376, "loss": 0.0852, "step": 4283 }, { "epoch": 0.27694545454545455, "grad_norm": 0.05833077058196068, "learning_rate": 0.0001966177587547627, "loss": 0.0882, "step": 4284 }, { "epoch": 0.277010101010101, "grad_norm": 0.059798646718263626, "learning_rate": 0.00019661599500231231, "loss": 0.0902, "step": 4285 }, { "epoch": 0.2770747474747475, "grad_norm": 0.06571953743696213, "learning_rate": 0.00019661423079802087, "loss": 0.097, "step": 4286 }, { "epoch": 0.27713939393939396, "grad_norm": 0.06831257045269012, "learning_rate": 0.0001966124661418966, "loss": 
0.0878, "step": 4287 }, { "epoch": 0.2772040404040404, "grad_norm": 0.06506771594285965, "learning_rate": 0.00019661070103394777, "loss": 0.0773, "step": 4288 }, { "epoch": 0.2772040404040404, "eval_bleu": 17.090568784230705, "eval_loss": 0.09156984090805054, "eval_runtime": 2.6853, "eval_samples_per_second": 11.917, "eval_steps_per_second": 1.49, "step": 4288 }, { "epoch": 0.2772686868686869, "grad_norm": 0.05979195237159729, "learning_rate": 0.00019660893547418265, "loss": 0.0774, "step": 4289 }, { "epoch": 0.2773333333333333, "grad_norm": 0.06204604357481003, "learning_rate": 0.00019660716946260946, "loss": 0.0854, "step": 4290 }, { "epoch": 0.2773979797979798, "grad_norm": 0.06890398263931274, "learning_rate": 0.0001966054029992365, "loss": 0.0954, "step": 4291 }, { "epoch": 0.27746262626262624, "grad_norm": 0.06365709751844406, "learning_rate": 0.00019660363608407194, "loss": 0.0691, "step": 4292 }, { "epoch": 0.27752727272727273, "grad_norm": 0.07831145077943802, "learning_rate": 0.0001966018687171242, "loss": 0.1112, "step": 4293 }, { "epoch": 0.27759191919191917, "grad_norm": 0.06532642990350723, "learning_rate": 0.0001966001008984014, "loss": 0.081, "step": 4294 }, { "epoch": 0.27765656565656566, "grad_norm": 0.06608685106039047, "learning_rate": 0.00019659833262791184, "loss": 0.088, "step": 4295 }, { "epoch": 0.27772121212121215, "grad_norm": 0.09675134718418121, "learning_rate": 0.00019659656390566386, "loss": 0.0829, "step": 4296 }, { "epoch": 0.2777858585858586, "grad_norm": 0.0656266063451767, "learning_rate": 0.00019659479473166567, "loss": 0.0914, "step": 4297 }, { "epoch": 0.27785050505050507, "grad_norm": 0.07426586002111435, "learning_rate": 0.00019659302510592556, "loss": 0.1001, "step": 4298 }, { "epoch": 0.2779151515151515, "grad_norm": 0.09900396317243576, "learning_rate": 0.0001965912550284518, "loss": 0.093, "step": 4299 }, { "epoch": 0.277979797979798, "grad_norm": 0.06313488632440567, "learning_rate": 0.0001965894844992527, "loss": 
0.0967, "step": 4300 }, { "epoch": 0.2780444444444444, "grad_norm": 0.06991353631019592, "learning_rate": 0.0001965877135183365, "loss": 0.0971, "step": 4301 }, { "epoch": 0.2781090909090909, "grad_norm": 0.06899707019329071, "learning_rate": 0.00019658594208571146, "loss": 0.0987, "step": 4302 }, { "epoch": 0.27817373737373735, "grad_norm": 0.06870637089014053, "learning_rate": 0.00019658417020138594, "loss": 0.0927, "step": 4303 }, { "epoch": 0.27823838383838384, "grad_norm": 0.06643574684858322, "learning_rate": 0.00019658239786536817, "loss": 0.0857, "step": 4304 }, { "epoch": 0.27823838383838384, "eval_bleu": 14.586639612431172, "eval_loss": 0.09181059896945953, "eval_runtime": 2.8757, "eval_samples_per_second": 11.128, "eval_steps_per_second": 1.391, "step": 4304 }, { "epoch": 0.27830303030303033, "grad_norm": 0.07510837912559509, "learning_rate": 0.00019658062507766646, "loss": 0.0895, "step": 4305 }, { "epoch": 0.27836767676767676, "grad_norm": 0.06540877372026443, "learning_rate": 0.00019657885183828908, "loss": 0.0818, "step": 4306 }, { "epoch": 0.27843232323232325, "grad_norm": 0.07625401020050049, "learning_rate": 0.00019657707814724434, "loss": 0.1041, "step": 4307 }, { "epoch": 0.2784969696969697, "grad_norm": 0.08106216788291931, "learning_rate": 0.00019657530400454056, "loss": 0.1121, "step": 4308 }, { "epoch": 0.2785616161616162, "grad_norm": 0.06568999588489532, "learning_rate": 0.00019657352941018598, "loss": 0.0871, "step": 4309 }, { "epoch": 0.2786262626262626, "grad_norm": 0.06949364393949509, "learning_rate": 0.00019657175436418897, "loss": 0.0984, "step": 4310 }, { "epoch": 0.2786909090909091, "grad_norm": 0.06937297433614731, "learning_rate": 0.00019656997886655778, "loss": 0.0922, "step": 4311 }, { "epoch": 0.27875555555555553, "grad_norm": 0.06498976796865463, "learning_rate": 0.00019656820291730068, "loss": 0.0924, "step": 4312 }, { "epoch": 0.278820202020202, "grad_norm": 0.058843135833740234, "learning_rate": 0.00019656642651642606, 
"loss": 0.0763, "step": 4313 }, { "epoch": 0.27888484848484846, "grad_norm": 0.09469771385192871, "learning_rate": 0.00019656464966394217, "loss": 0.0878, "step": 4314 }, { "epoch": 0.27894949494949495, "grad_norm": 0.05855628475546837, "learning_rate": 0.00019656287235985732, "loss": 0.0769, "step": 4315 }, { "epoch": 0.27901414141414144, "grad_norm": 0.06659188121557236, "learning_rate": 0.00019656109460417987, "loss": 0.0856, "step": 4316 }, { "epoch": 0.27907878787878787, "grad_norm": 0.06598351150751114, "learning_rate": 0.00019655931639691812, "loss": 0.0969, "step": 4317 }, { "epoch": 0.27914343434343436, "grad_norm": 0.0817653238773346, "learning_rate": 0.0001965575377380803, "loss": 0.0967, "step": 4318 }, { "epoch": 0.2792080808080808, "grad_norm": 0.06330971419811249, "learning_rate": 0.00019655575862767484, "loss": 0.0899, "step": 4319 }, { "epoch": 0.2792727272727273, "grad_norm": 0.06465650349855423, "learning_rate": 0.00019655397906571004, "loss": 0.0936, "step": 4320 }, { "epoch": 0.2792727272727273, "eval_bleu": 14.333354007505713, "eval_loss": 0.09306371212005615, "eval_runtime": 2.7214, "eval_samples_per_second": 11.759, "eval_steps_per_second": 1.47, "step": 4320 }, { "epoch": 0.2793373737373737, "grad_norm": 0.07256720215082169, "learning_rate": 0.00019655219905219417, "loss": 0.0987, "step": 4321 }, { "epoch": 0.2794020202020202, "grad_norm": 0.06694379448890686, "learning_rate": 0.0001965504185871356, "loss": 0.0888, "step": 4322 }, { "epoch": 0.27946666666666664, "grad_norm": 0.06541089713573456, "learning_rate": 0.00019654863767054265, "loss": 0.0968, "step": 4323 }, { "epoch": 0.27953131313131313, "grad_norm": 0.07408475130796432, "learning_rate": 0.00019654685630242363, "loss": 0.0955, "step": 4324 }, { "epoch": 0.2795959595959596, "grad_norm": 0.06434882432222366, "learning_rate": 0.00019654507448278686, "loss": 0.0933, "step": 4325 }, { "epoch": 0.27966060606060605, "grad_norm": 0.057125017046928406, "learning_rate": 
0.0001965432922116407, "loss": 0.0791, "step": 4326 }, { "epoch": 0.27972525252525254, "grad_norm": 0.06498373299837112, "learning_rate": 0.0001965415094889935, "loss": 0.0828, "step": 4327 }, { "epoch": 0.279789898989899, "grad_norm": 0.06335562467575073, "learning_rate": 0.00019653972631485358, "loss": 0.0796, "step": 4328 }, { "epoch": 0.27985454545454547, "grad_norm": 0.057698626071214676, "learning_rate": 0.00019653794268922924, "loss": 0.077, "step": 4329 }, { "epoch": 0.2799191919191919, "grad_norm": 0.061878301203250885, "learning_rate": 0.0001965361586121289, "loss": 0.0842, "step": 4330 }, { "epoch": 0.2799838383838384, "grad_norm": 0.06939241290092468, "learning_rate": 0.00019653437408356084, "loss": 0.097, "step": 4331 }, { "epoch": 0.2800484848484848, "grad_norm": 0.05771104618906975, "learning_rate": 0.0001965325891035334, "loss": 0.0787, "step": 4332 }, { "epoch": 0.2801131313131313, "grad_norm": 0.07069814205169678, "learning_rate": 0.00019653080367205498, "loss": 0.105, "step": 4333 }, { "epoch": 0.2801777777777778, "grad_norm": 0.07531116157770157, "learning_rate": 0.0001965290177891339, "loss": 0.1061, "step": 4334 }, { "epoch": 0.28024242424242424, "grad_norm": 0.06086823344230652, "learning_rate": 0.00019652723145477852, "loss": 0.0898, "step": 4335 }, { "epoch": 0.2803070707070707, "grad_norm": 0.0644456297159195, "learning_rate": 0.00019652544466899717, "loss": 0.09, "step": 4336 }, { "epoch": 0.2803070707070707, "eval_bleu": 14.48172711037433, "eval_loss": 0.09253813326358795, "eval_runtime": 2.7143, "eval_samples_per_second": 11.789, "eval_steps_per_second": 1.474, "step": 4336 }, { "epoch": 0.28037171717171716, "grad_norm": 0.061860889196395874, "learning_rate": 0.00019652365743179825, "loss": 0.0912, "step": 4337 }, { "epoch": 0.28043636363636365, "grad_norm": 0.08942409604787827, "learning_rate": 0.00019652186974319008, "loss": 0.1097, "step": 4338 }, { "epoch": 0.2805010101010101, "grad_norm": 0.055456697940826416, "learning_rate": 
0.00019652008160318105, "loss": 0.0789, "step": 4339 }, { "epoch": 0.2805656565656566, "grad_norm": 0.06779647618532181, "learning_rate": 0.0001965182930117795, "loss": 0.0901, "step": 4340 }, { "epoch": 0.280630303030303, "grad_norm": 0.07024422287940979, "learning_rate": 0.0001965165039689938, "loss": 0.0984, "step": 4341 }, { "epoch": 0.2806949494949495, "grad_norm": 0.08629406988620758, "learning_rate": 0.0001965147144748323, "loss": 0.081, "step": 4342 }, { "epoch": 0.280759595959596, "grad_norm": 0.06911172717809677, "learning_rate": 0.0001965129245293034, "loss": 0.0906, "step": 4343 }, { "epoch": 0.2808242424242424, "grad_norm": 0.06524579226970673, "learning_rate": 0.00019651113413241548, "loss": 0.094, "step": 4344 }, { "epoch": 0.2808888888888889, "grad_norm": 0.06509728729724884, "learning_rate": 0.00019650934328417687, "loss": 0.0821, "step": 4345 }, { "epoch": 0.28095353535353534, "grad_norm": 0.065675288438797, "learning_rate": 0.00019650755198459598, "loss": 0.0848, "step": 4346 }, { "epoch": 0.28101818181818183, "grad_norm": 0.06912438571453094, "learning_rate": 0.00019650576023368119, "loss": 0.0975, "step": 4347 }, { "epoch": 0.28108282828282827, "grad_norm": 0.058166082948446274, "learning_rate": 0.00019650396803144084, "loss": 0.0801, "step": 4348 }, { "epoch": 0.28114747474747476, "grad_norm": 0.07102544605731964, "learning_rate": 0.00019650217537788334, "loss": 0.0953, "step": 4349 }, { "epoch": 0.2812121212121212, "grad_norm": 0.08068510890007019, "learning_rate": 0.00019650038227301707, "loss": 0.105, "step": 4350 }, { "epoch": 0.2812767676767677, "grad_norm": 0.07778412848711014, "learning_rate": 0.0001964985887168504, "loss": 0.0838, "step": 4351 }, { "epoch": 0.2813414141414141, "grad_norm": 0.06701838970184326, "learning_rate": 0.00019649679470939174, "loss": 0.0883, "step": 4352 }, { "epoch": 0.2813414141414141, "eval_bleu": 14.10159700192039, "eval_loss": 0.0937669649720192, "eval_runtime": 2.6934, "eval_samples_per_second": 11.881, 
"eval_steps_per_second": 1.485, "step": 4352 }, { "epoch": 0.2814060606060606, "grad_norm": 0.06856022030115128, "learning_rate": 0.00019649500025064947, "loss": 0.0919, "step": 4353 }, { "epoch": 0.2814707070707071, "grad_norm": 0.06348799169063568, "learning_rate": 0.00019649320534063198, "loss": 0.0811, "step": 4354 }, { "epoch": 0.2815353535353535, "grad_norm": 0.0736413300037384, "learning_rate": 0.0001964914099793477, "loss": 0.108, "step": 4355 }, { "epoch": 0.2816, "grad_norm": 0.07365409284830093, "learning_rate": 0.00019648961416680495, "loss": 0.1012, "step": 4356 }, { "epoch": 0.28166464646464645, "grad_norm": 0.07459267228841782, "learning_rate": 0.0001964878179030122, "loss": 0.0858, "step": 4357 }, { "epoch": 0.28172929292929294, "grad_norm": 0.0640048235654831, "learning_rate": 0.0001964860211879778, "loss": 0.0876, "step": 4358 }, { "epoch": 0.2817939393939394, "grad_norm": 0.07002885639667511, "learning_rate": 0.00019648422402171022, "loss": 0.093, "step": 4359 }, { "epoch": 0.28185858585858586, "grad_norm": 0.07678961008787155, "learning_rate": 0.00019648242640421778, "loss": 0.1066, "step": 4360 }, { "epoch": 0.2819232323232323, "grad_norm": 0.06830359995365143, "learning_rate": 0.00019648062833550896, "loss": 0.0966, "step": 4361 }, { "epoch": 0.2819878787878788, "grad_norm": 0.06086006760597229, "learning_rate": 0.00019647882981559212, "loss": 0.0797, "step": 4362 }, { "epoch": 0.2820525252525253, "grad_norm": 0.06256676465272903, "learning_rate": 0.00019647703084447568, "loss": 0.0887, "step": 4363 }, { "epoch": 0.2821171717171717, "grad_norm": 0.05955759808421135, "learning_rate": 0.0001964752314221681, "loss": 0.0736, "step": 4364 }, { "epoch": 0.2821818181818182, "grad_norm": 0.05913713946938515, "learning_rate": 0.00019647343154867772, "loss": 0.0758, "step": 4365 }, { "epoch": 0.28224646464646463, "grad_norm": 0.05970035493373871, "learning_rate": 0.000196471631224013, "loss": 0.0858, "step": 4366 }, { "epoch": 0.2823111111111111, 
"grad_norm": 0.06839437782764435, "learning_rate": 0.00019646983044818237, "loss": 0.0885, "step": 4367 }, { "epoch": 0.28237575757575756, "grad_norm": 0.07102259248495102, "learning_rate": 0.0001964680292211942, "loss": 0.0935, "step": 4368 }, { "epoch": 0.28237575757575756, "eval_bleu": 13.240661033997364, "eval_loss": 0.09453465789556503, "eval_runtime": 2.7499, "eval_samples_per_second": 11.637, "eval_steps_per_second": 1.455, "step": 4368 }, { "epoch": 0.28244040404040405, "grad_norm": 0.07567720115184784, "learning_rate": 0.00019646622754305697, "loss": 0.0991, "step": 4369 }, { "epoch": 0.2825050505050505, "grad_norm": 0.06866313517093658, "learning_rate": 0.00019646442541377908, "loss": 0.0897, "step": 4370 }, { "epoch": 0.28256969696969697, "grad_norm": 0.06221785768866539, "learning_rate": 0.00019646262283336898, "loss": 0.0914, "step": 4371 }, { "epoch": 0.28263434343434346, "grad_norm": 0.07376563549041748, "learning_rate": 0.00019646081980183509, "loss": 0.1005, "step": 4372 }, { "epoch": 0.2826989898989899, "grad_norm": 0.07264561951160431, "learning_rate": 0.0001964590163191858, "loss": 0.104, "step": 4373 }, { "epoch": 0.2827636363636364, "grad_norm": 0.0632186159491539, "learning_rate": 0.0001964572123854296, "loss": 0.078, "step": 4374 }, { "epoch": 0.2828282828282828, "grad_norm": 0.06842995434999466, "learning_rate": 0.0001964554080005749, "loss": 0.0998, "step": 4375 }, { "epoch": 0.2828929292929293, "grad_norm": 0.062108393758535385, "learning_rate": 0.00019645360316463016, "loss": 0.0752, "step": 4376 }, { "epoch": 0.28295757575757574, "grad_norm": 0.07050774991512299, "learning_rate": 0.00019645179787760377, "loss": 0.085, "step": 4377 }, { "epoch": 0.28302222222222223, "grad_norm": 0.06702041625976562, "learning_rate": 0.00019644999213950425, "loss": 0.0928, "step": 4378 }, { "epoch": 0.28308686868686866, "grad_norm": 0.06477727741003036, "learning_rate": 0.00019644818595033998, "loss": 0.0907, "step": 4379 }, { "epoch": 
0.28315151515151515, "grad_norm": 0.05960860103368759, "learning_rate": 0.00019644637931011942, "loss": 0.0876, "step": 4380 }, { "epoch": 0.28321616161616164, "grad_norm": 0.07777737081050873, "learning_rate": 0.00019644457221885104, "loss": 0.1025, "step": 4381 }, { "epoch": 0.2832808080808081, "grad_norm": 0.06811051815748215, "learning_rate": 0.0001964427646765433, "loss": 0.0935, "step": 4382 }, { "epoch": 0.28334545454545457, "grad_norm": 0.06178711727261543, "learning_rate": 0.0001964409566832046, "loss": 0.0801, "step": 4383 }, { "epoch": 0.283410101010101, "grad_norm": 0.06660346686840057, "learning_rate": 0.00019643914823884342, "loss": 0.093, "step": 4384 }, { "epoch": 0.283410101010101, "eval_bleu": 13.503905398349396, "eval_loss": 0.09409144520759583, "eval_runtime": 2.6955, "eval_samples_per_second": 11.872, "eval_steps_per_second": 1.484, "step": 4384 }, { "epoch": 0.2834747474747475, "grad_norm": 0.07667375355958939, "learning_rate": 0.00019643733934346825, "loss": 0.0918, "step": 4385 }, { "epoch": 0.2835393939393939, "grad_norm": 0.0823507308959961, "learning_rate": 0.00019643552999708752, "loss": 0.0943, "step": 4386 }, { "epoch": 0.2836040404040404, "grad_norm": 0.0671856701374054, "learning_rate": 0.0001964337201997097, "loss": 0.0764, "step": 4387 }, { "epoch": 0.28366868686868685, "grad_norm": 0.0804782509803772, "learning_rate": 0.00019643190995134323, "loss": 0.1021, "step": 4388 }, { "epoch": 0.28373333333333334, "grad_norm": 0.07721947133541107, "learning_rate": 0.00019643009925199664, "loss": 0.1041, "step": 4389 }, { "epoch": 0.28379797979797977, "grad_norm": 0.07023712247610092, "learning_rate": 0.00019642828810167835, "loss": 0.0951, "step": 4390 }, { "epoch": 0.28386262626262626, "grad_norm": 0.06833957880735397, "learning_rate": 0.00019642647650039682, "loss": 0.104, "step": 4391 }, { "epoch": 0.28392727272727275, "grad_norm": 0.07807779312133789, "learning_rate": 0.00019642466444816053, "loss": 0.1184, "step": 4392 }, { "epoch": 
0.2839919191919192, "grad_norm": 0.06609499454498291, "learning_rate": 0.00019642285194497797, "loss": 0.0804, "step": 4393 }, { "epoch": 0.2840565656565657, "grad_norm": 0.07198118418455124, "learning_rate": 0.00019642103899085756, "loss": 0.1014, "step": 4394 }, { "epoch": 0.2841212121212121, "grad_norm": 0.07774338871240616, "learning_rate": 0.0001964192255858079, "loss": 0.0968, "step": 4395 }, { "epoch": 0.2841858585858586, "grad_norm": 0.06879827380180359, "learning_rate": 0.00019641741172983737, "loss": 0.0944, "step": 4396 }, { "epoch": 0.28425050505050503, "grad_norm": 0.05825714394450188, "learning_rate": 0.00019641559742295447, "loss": 0.0841, "step": 4397 }, { "epoch": 0.2843151515151515, "grad_norm": 0.07056107372045517, "learning_rate": 0.0001964137826651677, "loss": 0.0914, "step": 4398 }, { "epoch": 0.28437979797979795, "grad_norm": 0.06801684945821762, "learning_rate": 0.00019641196745648557, "loss": 0.0967, "step": 4399 }, { "epoch": 0.28444444444444444, "grad_norm": 0.0754716619849205, "learning_rate": 0.0001964101517969165, "loss": 0.0923, "step": 4400 }, { "epoch": 0.28444444444444444, "eval_bleu": 13.84754353500129, "eval_loss": 0.09553204476833344, "eval_runtime": 2.7286, "eval_samples_per_second": 11.728, "eval_steps_per_second": 1.466, "step": 4400 }, { "epoch": 0.28450909090909093, "grad_norm": 0.08046947419643402, "learning_rate": 0.00019640833568646904, "loss": 0.0999, "step": 4401 }, { "epoch": 0.28457373737373737, "grad_norm": 0.0713997632265091, "learning_rate": 0.00019640651912515165, "loss": 0.0956, "step": 4402 }, { "epoch": 0.28463838383838386, "grad_norm": 0.0697353333234787, "learning_rate": 0.0001964047021129729, "loss": 0.0933, "step": 4403 }, { "epoch": 0.2847030303030303, "grad_norm": 0.07646799087524414, "learning_rate": 0.00019640288464994117, "loss": 0.1084, "step": 4404 }, { "epoch": 0.2847676767676768, "grad_norm": 0.06223762780427933, "learning_rate": 0.00019640106673606504, "loss": 0.0757, "step": 4405 }, { "epoch": 
0.2848323232323232, "grad_norm": 0.08348898589611053, "learning_rate": 0.00019639924837135299, "loss": 0.0979, "step": 4406 }, { "epoch": 0.2848969696969697, "grad_norm": 0.08296437561511993, "learning_rate": 0.0001963974295558135, "loss": 0.0816, "step": 4407 }, { "epoch": 0.28496161616161614, "grad_norm": 0.06300361454486847, "learning_rate": 0.00019639561028945513, "loss": 0.0974, "step": 4408 }, { "epoch": 0.2850262626262626, "grad_norm": 0.07703543454408646, "learning_rate": 0.00019639379057228635, "loss": 0.09, "step": 4409 }, { "epoch": 0.2850909090909091, "grad_norm": 0.06007029861211777, "learning_rate": 0.00019639197040431568, "loss": 0.0684, "step": 4410 }, { "epoch": 0.28515555555555555, "grad_norm": 0.06998424232006073, "learning_rate": 0.00019639014978555165, "loss": 0.1021, "step": 4411 }, { "epoch": 0.28522020202020204, "grad_norm": 0.06395740061998367, "learning_rate": 0.0001963883287160027, "loss": 0.0878, "step": 4412 }, { "epoch": 0.2852848484848485, "grad_norm": 0.05911776423454285, "learning_rate": 0.00019638650719567747, "loss": 0.0719, "step": 4413 }, { "epoch": 0.28534949494949496, "grad_norm": 0.07202507555484772, "learning_rate": 0.00019638468522458434, "loss": 0.0884, "step": 4414 }, { "epoch": 0.2854141414141414, "grad_norm": 0.0732078030705452, "learning_rate": 0.00019638286280273195, "loss": 0.1066, "step": 4415 }, { "epoch": 0.2854787878787879, "grad_norm": 0.06537657976150513, "learning_rate": 0.00019638103993012879, "loss": 0.075, "step": 4416 }, { "epoch": 0.2854787878787879, "eval_bleu": 14.100920826305225, "eval_loss": 0.0930790975689888, "eval_runtime": 2.8647, "eval_samples_per_second": 11.171, "eval_steps_per_second": 1.396, "step": 4416 }, { "epoch": 0.2855434343434343, "grad_norm": 0.07066395878791809, "learning_rate": 0.00019637921660678332, "loss": 0.082, "step": 4417 }, { "epoch": 0.2856080808080808, "grad_norm": 0.06402572989463806, "learning_rate": 0.00019637739283270413, "loss": 0.0834, "step": 4418 }, { "epoch": 
0.2856727272727273, "grad_norm": 0.06865181773900986, "learning_rate": 0.00019637556860789974, "loss": 0.1013, "step": 4419 }, { "epoch": 0.28573737373737373, "grad_norm": 0.07499369978904724, "learning_rate": 0.00019637374393237868, "loss": 0.1034, "step": 4420 }, { "epoch": 0.2858020202020202, "grad_norm": 0.07298306375741959, "learning_rate": 0.0001963719188061495, "loss": 0.0932, "step": 4421 }, { "epoch": 0.28586666666666666, "grad_norm": 0.057881537824869156, "learning_rate": 0.0001963700932292207, "loss": 0.0676, "step": 4422 }, { "epoch": 0.28593131313131315, "grad_norm": 0.060981057584285736, "learning_rate": 0.0001963682672016008, "loss": 0.0888, "step": 4423 }, { "epoch": 0.2859959595959596, "grad_norm": 0.06692329049110413, "learning_rate": 0.00019636644072329842, "loss": 0.0888, "step": 4424 }, { "epoch": 0.28606060606060607, "grad_norm": 0.08159921318292618, "learning_rate": 0.00019636461379432201, "loss": 0.1023, "step": 4425 }, { "epoch": 0.2861252525252525, "grad_norm": 0.0881388932466507, "learning_rate": 0.0001963627864146802, "loss": 0.1158, "step": 4426 }, { "epoch": 0.286189898989899, "grad_norm": 0.06777847558259964, "learning_rate": 0.0001963609585843815, "loss": 0.0921, "step": 4427 }, { "epoch": 0.28625454545454543, "grad_norm": 0.06777142733335495, "learning_rate": 0.00019635913030343442, "loss": 0.0937, "step": 4428 }, { "epoch": 0.2863191919191919, "grad_norm": 0.060988642275333405, "learning_rate": 0.00019635730157184758, "loss": 0.0873, "step": 4429 }, { "epoch": 0.2863838383838384, "grad_norm": 0.054307758808135986, "learning_rate": 0.00019635547238962945, "loss": 0.0685, "step": 4430 }, { "epoch": 0.28644848484848484, "grad_norm": 0.06294847279787064, "learning_rate": 0.00019635364275678865, "loss": 0.0904, "step": 4431 }, { "epoch": 0.28651313131313133, "grad_norm": 0.06527037173509598, "learning_rate": 0.00019635181267333373, "loss": 0.0842, "step": 4432 }, { "epoch": 0.28651313131313133, "eval_bleu": 13.80523211003128, 
"eval_loss": 0.09447641670703888, "eval_runtime": 2.7065, "eval_samples_per_second": 11.824, "eval_steps_per_second": 1.478, "step": 4432 }, { "epoch": 0.28657777777777776, "grad_norm": 0.0796068087220192, "learning_rate": 0.00019634998213927322, "loss": 0.1319, "step": 4433 }, { "epoch": 0.28664242424242425, "grad_norm": 0.06002984941005707, "learning_rate": 0.00019634815115461568, "loss": 0.087, "step": 4434 }, { "epoch": 0.2867070707070707, "grad_norm": 0.09939252585172653, "learning_rate": 0.00019634631971936973, "loss": 0.1247, "step": 4435 }, { "epoch": 0.2867717171717172, "grad_norm": 0.0634773001074791, "learning_rate": 0.00019634448783354387, "loss": 0.0832, "step": 4436 }, { "epoch": 0.2868363636363636, "grad_norm": 0.07285909354686737, "learning_rate": 0.0001963426554971467, "loss": 0.0955, "step": 4437 }, { "epoch": 0.2869010101010101, "grad_norm": 0.06427568197250366, "learning_rate": 0.0001963408227101868, "loss": 0.0816, "step": 4438 }, { "epoch": 0.2869656565656566, "grad_norm": 0.0669107660651207, "learning_rate": 0.0001963389894726727, "loss": 0.0962, "step": 4439 }, { "epoch": 0.287030303030303, "grad_norm": 0.07926435768604279, "learning_rate": 0.00019633715578461298, "loss": 0.099, "step": 4440 }, { "epoch": 0.2870949494949495, "grad_norm": 0.061039771884679794, "learning_rate": 0.00019633532164601625, "loss": 0.0835, "step": 4441 }, { "epoch": 0.28715959595959595, "grad_norm": 0.06401325017213821, "learning_rate": 0.00019633348705689105, "loss": 0.0908, "step": 4442 }, { "epoch": 0.28722424242424244, "grad_norm": 0.05776556208729744, "learning_rate": 0.00019633165201724597, "loss": 0.0649, "step": 4443 }, { "epoch": 0.28728888888888887, "grad_norm": 0.07252118736505508, "learning_rate": 0.00019632981652708965, "loss": 0.0922, "step": 4444 }, { "epoch": 0.28735353535353536, "grad_norm": 0.07046280801296234, "learning_rate": 0.00019632798058643057, "loss": 0.0909, "step": 4445 }, { "epoch": 0.2874181818181818, "grad_norm": 0.06610973924398422, 
"learning_rate": 0.0001963261441952774, "loss": 0.0893, "step": 4446 }, { "epoch": 0.2874828282828283, "grad_norm": 0.06593465059995651, "learning_rate": 0.00019632430735363871, "loss": 0.0853, "step": 4447 }, { "epoch": 0.2875474747474748, "grad_norm": 0.0564500167965889, "learning_rate": 0.00019632247006152307, "loss": 0.0747, "step": 4448 }, { "epoch": 0.2875474747474748, "eval_bleu": 16.61669271709447, "eval_loss": 0.0922250747680664, "eval_runtime": 2.8013, "eval_samples_per_second": 11.423, "eval_steps_per_second": 1.428, "step": 4448 }, { "epoch": 0.2876121212121212, "grad_norm": 0.06765074282884598, "learning_rate": 0.00019632063231893903, "loss": 0.0879, "step": 4449 }, { "epoch": 0.2876767676767677, "grad_norm": 0.05686028674244881, "learning_rate": 0.00019631879412589532, "loss": 0.0869, "step": 4450 }, { "epoch": 0.28774141414141413, "grad_norm": 0.0712445080280304, "learning_rate": 0.00019631695548240038, "loss": 0.1143, "step": 4451 }, { "epoch": 0.2878060606060606, "grad_norm": 0.06979040056467056, "learning_rate": 0.00019631511638846293, "loss": 0.1033, "step": 4452 }, { "epoch": 0.28787070707070705, "grad_norm": 0.06922651082277298, "learning_rate": 0.00019631327684409148, "loss": 0.089, "step": 4453 }, { "epoch": 0.28793535353535354, "grad_norm": 0.06405875086784363, "learning_rate": 0.0001963114368492947, "loss": 0.0887, "step": 4454 }, { "epoch": 0.288, "grad_norm": 0.06777805089950562, "learning_rate": 0.00019630959640408118, "loss": 0.0821, "step": 4455 }, { "epoch": 0.28806464646464647, "grad_norm": 0.06872130185365677, "learning_rate": 0.0001963077555084595, "loss": 0.0959, "step": 4456 }, { "epoch": 0.2881292929292929, "grad_norm": 0.06281452625989914, "learning_rate": 0.00019630591416243832, "loss": 0.0891, "step": 4457 }, { "epoch": 0.2881939393939394, "grad_norm": 0.05928211659193039, "learning_rate": 0.00019630407236602619, "loss": 0.0871, "step": 4458 }, { "epoch": 0.2882585858585859, "grad_norm": 0.05711644887924194, "learning_rate": 
0.00019630223011923175, "loss": 0.0777, "step": 4459 }, { "epoch": 0.2883232323232323, "grad_norm": 0.08966274559497833, "learning_rate": 0.00019630038742206362, "loss": 0.0908, "step": 4460 }, { "epoch": 0.2883878787878788, "grad_norm": 0.07473871111869812, "learning_rate": 0.0001962985442745304, "loss": 0.0938, "step": 4461 }, { "epoch": 0.28845252525252524, "grad_norm": 0.0743502676486969, "learning_rate": 0.00019629670067664075, "loss": 0.1059, "step": 4462 }, { "epoch": 0.2885171717171717, "grad_norm": 0.068879134953022, "learning_rate": 0.00019629485662840324, "loss": 0.0868, "step": 4463 }, { "epoch": 0.28858181818181816, "grad_norm": 0.059188731014728546, "learning_rate": 0.00019629301212982655, "loss": 0.0834, "step": 4464 }, { "epoch": 0.28858181818181816, "eval_bleu": 15.642684770728016, "eval_loss": 0.09318391978740692, "eval_runtime": 2.8318, "eval_samples_per_second": 11.3, "eval_steps_per_second": 1.413, "step": 4464 }, { "epoch": 0.28864646464646465, "grad_norm": 0.07137424498796463, "learning_rate": 0.00019629116718091925, "loss": 0.1055, "step": 4465 }, { "epoch": 0.2887111111111111, "grad_norm": 0.061770759522914886, "learning_rate": 0.00019628932178169, "loss": 0.08, "step": 4466 }, { "epoch": 0.2887757575757576, "grad_norm": 0.07105202972888947, "learning_rate": 0.0001962874759321474, "loss": 0.1008, "step": 4467 }, { "epoch": 0.28884040404040406, "grad_norm": 0.0716095119714737, "learning_rate": 0.00019628562963230014, "loss": 0.0935, "step": 4468 }, { "epoch": 0.2889050505050505, "grad_norm": 0.07197891920804977, "learning_rate": 0.00019628378288215684, "loss": 0.0944, "step": 4469 }, { "epoch": 0.288969696969697, "grad_norm": 0.06876778602600098, "learning_rate": 0.00019628193568172608, "loss": 0.0809, "step": 4470 }, { "epoch": 0.2890343434343434, "grad_norm": 0.06736275553703308, "learning_rate": 0.00019628008803101654, "loss": 0.0846, "step": 4471 }, { "epoch": 0.2890989898989899, "grad_norm": 0.06846721470355988, "learning_rate": 
0.00019627823993003686, "loss": 0.0922, "step": 4472 }, { "epoch": 0.28916363636363634, "grad_norm": 0.07255737483501434, "learning_rate": 0.0001962763913787957, "loss": 0.0789, "step": 4473 }, { "epoch": 0.28922828282828283, "grad_norm": 0.06241066753864288, "learning_rate": 0.00019627454237730168, "loss": 0.0843, "step": 4474 }, { "epoch": 0.28929292929292927, "grad_norm": 0.07016219943761826, "learning_rate": 0.00019627269292556344, "loss": 0.1055, "step": 4475 }, { "epoch": 0.28935757575757576, "grad_norm": 0.0690203458070755, "learning_rate": 0.00019627084302358965, "loss": 0.0945, "step": 4476 }, { "epoch": 0.28942222222222225, "grad_norm": 0.06753858923912048, "learning_rate": 0.00019626899267138898, "loss": 0.0889, "step": 4477 }, { "epoch": 0.2894868686868687, "grad_norm": 0.07447067648172379, "learning_rate": 0.00019626714186897002, "loss": 0.0981, "step": 4478 }, { "epoch": 0.28955151515151517, "grad_norm": 0.06929203122854233, "learning_rate": 0.00019626529061634148, "loss": 0.0843, "step": 4479 }, { "epoch": 0.2896161616161616, "grad_norm": 0.07303477078676224, "learning_rate": 0.00019626343891351197, "loss": 0.1005, "step": 4480 }, { "epoch": 0.2896161616161616, "eval_bleu": 13.938958960526056, "eval_loss": 0.09255760163068771, "eval_runtime": 2.7465, "eval_samples_per_second": 11.651, "eval_steps_per_second": 1.456, "step": 4480 }, { "epoch": 0.2896808080808081, "grad_norm": 0.07850413024425507, "learning_rate": 0.00019626158676049022, "loss": 0.1163, "step": 4481 }, { "epoch": 0.28974545454545453, "grad_norm": 0.07225518673658371, "learning_rate": 0.00019625973415728486, "loss": 0.0835, "step": 4482 }, { "epoch": 0.289810101010101, "grad_norm": 0.06698223203420639, "learning_rate": 0.0001962578811039045, "loss": 0.0815, "step": 4483 }, { "epoch": 0.28987474747474745, "grad_norm": 0.06548656523227692, "learning_rate": 0.00019625602760035786, "loss": 0.0832, "step": 4484 }, { "epoch": 0.28993939393939394, "grad_norm": 0.06506742537021637, 
"learning_rate": 0.00019625417364665362, "loss": 0.0913, "step": 4485 }, { "epoch": 0.29000404040404043, "grad_norm": 0.06188466399908066, "learning_rate": 0.00019625231924280044, "loss": 0.0865, "step": 4486 }, { "epoch": 0.29006868686868686, "grad_norm": 0.06359705328941345, "learning_rate": 0.00019625046438880698, "loss": 0.0872, "step": 4487 }, { "epoch": 0.29013333333333335, "grad_norm": 0.05754384398460388, "learning_rate": 0.00019624860908468188, "loss": 0.0831, "step": 4488 }, { "epoch": 0.2901979797979798, "grad_norm": 0.11809027940034866, "learning_rate": 0.00019624675333043389, "loss": 0.0974, "step": 4489 }, { "epoch": 0.2902626262626263, "grad_norm": 0.06136356666684151, "learning_rate": 0.00019624489712607166, "loss": 0.0793, "step": 4490 }, { "epoch": 0.2903272727272727, "grad_norm": 0.05404827371239662, "learning_rate": 0.00019624304047160385, "loss": 0.0766, "step": 4491 }, { "epoch": 0.2903919191919192, "grad_norm": 0.07475696504116058, "learning_rate": 0.00019624118336703913, "loss": 0.0974, "step": 4492 }, { "epoch": 0.29045656565656564, "grad_norm": 0.059779733419418335, "learning_rate": 0.00019623932581238626, "loss": 0.0855, "step": 4493 }, { "epoch": 0.2905212121212121, "grad_norm": 0.06780587881803513, "learning_rate": 0.00019623746780765387, "loss": 0.0884, "step": 4494 }, { "epoch": 0.29058585858585856, "grad_norm": 0.06374438852071762, "learning_rate": 0.00019623560935285063, "loss": 0.0952, "step": 4495 }, { "epoch": 0.29065050505050505, "grad_norm": 0.08313052356243134, "learning_rate": 0.00019623375044798528, "loss": 0.1256, "step": 4496 }, { "epoch": 0.29065050505050505, "eval_bleu": 14.767739999072903, "eval_loss": 0.09278881549835205, "eval_runtime": 2.8711, "eval_samples_per_second": 11.146, "eval_steps_per_second": 1.393, "step": 4496 }, { "epoch": 0.29071515151515154, "grad_norm": 0.0713055208325386, "learning_rate": 0.0001962318910930665, "loss": 0.0965, "step": 4497 }, { "epoch": 0.29077979797979797, "grad_norm": 
0.06727954745292664, "learning_rate": 0.00019623003128810295, "loss": 0.0965, "step": 4498 }, { "epoch": 0.29084444444444446, "grad_norm": 0.06295987218618393, "learning_rate": 0.0001962281710331034, "loss": 0.0669, "step": 4499 }, { "epoch": 0.2909090909090909, "grad_norm": 0.06775689125061035, "learning_rate": 0.00019622631032807647, "loss": 0.0909, "step": 4500 }, { "epoch": 0.2909737373737374, "grad_norm": 0.06560097634792328, "learning_rate": 0.0001962244491730309, "loss": 0.0941, "step": 4501 }, { "epoch": 0.2910383838383838, "grad_norm": 0.05855448544025421, "learning_rate": 0.00019622258756797541, "loss": 0.0808, "step": 4502 }, { "epoch": 0.2911030303030303, "grad_norm": 0.06965626776218414, "learning_rate": 0.0001962207255129187, "loss": 0.1066, "step": 4503 }, { "epoch": 0.29116767676767674, "grad_norm": 0.07694855332374573, "learning_rate": 0.00019621886300786944, "loss": 0.1066, "step": 4504 }, { "epoch": 0.29123232323232323, "grad_norm": 0.07424485683441162, "learning_rate": 0.00019621700005283637, "loss": 0.0749, "step": 4505 }, { "epoch": 0.2912969696969697, "grad_norm": 0.07173068821430206, "learning_rate": 0.0001962151366478282, "loss": 0.0906, "step": 4506 }, { "epoch": 0.29136161616161615, "grad_norm": 0.06268858164548874, "learning_rate": 0.00019621327279285367, "loss": 0.084, "step": 4507 }, { "epoch": 0.29142626262626264, "grad_norm": 0.06644877046346664, "learning_rate": 0.00019621140848792142, "loss": 0.1096, "step": 4508 }, { "epoch": 0.2914909090909091, "grad_norm": 0.06466013938188553, "learning_rate": 0.00019620954373304026, "loss": 0.0885, "step": 4509 }, { "epoch": 0.29155555555555557, "grad_norm": 0.08469715714454651, "learning_rate": 0.00019620767852821885, "loss": 0.1143, "step": 4510 }, { "epoch": 0.291620202020202, "grad_norm": 0.05712594464421272, "learning_rate": 0.00019620581287346596, "loss": 0.0915, "step": 4511 }, { "epoch": 0.2916848484848485, "grad_norm": 0.06825772672891617, "learning_rate": 0.00019620394676879026, 
"loss": 0.0928, "step": 4512 }, { "epoch": 0.2916848484848485, "eval_bleu": 17.286629611609083, "eval_loss": 0.09411090612411499, "eval_runtime": 2.7429, "eval_samples_per_second": 11.667, "eval_steps_per_second": 1.458, "step": 4512 }, { "epoch": 0.2917494949494949, "grad_norm": 0.0751626268029213, "learning_rate": 0.00019620208021420052, "loss": 0.125, "step": 4513 }, { "epoch": 0.2918141414141414, "grad_norm": 0.10134122520685196, "learning_rate": 0.00019620021320970545, "loss": 0.0795, "step": 4514 }, { "epoch": 0.2918787878787879, "grad_norm": 0.06724418699741364, "learning_rate": 0.00019619834575531378, "loss": 0.1045, "step": 4515 }, { "epoch": 0.29194343434343434, "grad_norm": 0.05141282081604004, "learning_rate": 0.00019619647785103426, "loss": 0.066, "step": 4516 }, { "epoch": 0.2920080808080808, "grad_norm": 0.061379093676805496, "learning_rate": 0.00019619460949687562, "loss": 0.0874, "step": 4517 }, { "epoch": 0.29207272727272726, "grad_norm": 0.060092389583587646, "learning_rate": 0.00019619274069284658, "loss": 0.0767, "step": 4518 }, { "epoch": 0.29213737373737375, "grad_norm": 0.06881596893072128, "learning_rate": 0.00019619087143895588, "loss": 0.0974, "step": 4519 }, { "epoch": 0.2922020202020202, "grad_norm": 0.062164973467588425, "learning_rate": 0.0001961890017352123, "loss": 0.088, "step": 4520 }, { "epoch": 0.2922666666666667, "grad_norm": 0.07124926149845123, "learning_rate": 0.00019618713158162457, "loss": 0.0942, "step": 4521 }, { "epoch": 0.2923313131313131, "grad_norm": 0.07605507224798203, "learning_rate": 0.00019618526097820138, "loss": 0.0996, "step": 4522 }, { "epoch": 0.2923959595959596, "grad_norm": 0.09574977308511734, "learning_rate": 0.00019618338992495157, "loss": 0.0942, "step": 4523 }, { "epoch": 0.2924606060606061, "grad_norm": 0.0649600401520729, "learning_rate": 0.00019618151842188382, "loss": 0.0952, "step": 4524 }, { "epoch": 0.2925252525252525, "grad_norm": 0.1014251634478569, "learning_rate": 0.00019617964646900687, 
"loss": 0.0861, "step": 4525 }, { "epoch": 0.292589898989899, "grad_norm": 0.04904953017830849, "learning_rate": 0.00019617777406632955, "loss": 0.0632, "step": 4526 }, { "epoch": 0.29265454545454544, "grad_norm": 0.06114598363637924, "learning_rate": 0.00019617590121386058, "loss": 0.0842, "step": 4527 }, { "epoch": 0.29271919191919193, "grad_norm": 0.0655544251203537, "learning_rate": 0.0001961740279116087, "loss": 0.0811, "step": 4528 }, { "epoch": 0.29271919191919193, "eval_bleu": 18.58902217675777, "eval_loss": 0.09360867738723755, "eval_runtime": 2.7774, "eval_samples_per_second": 11.521, "eval_steps_per_second": 1.44, "step": 4528 }, { "epoch": 0.29278383838383837, "grad_norm": 0.06083182245492935, "learning_rate": 0.0001961721541595827, "loss": 0.0793, "step": 4529 }, { "epoch": 0.29284848484848486, "grad_norm": 0.08451911062002182, "learning_rate": 0.0001961702799577913, "loss": 0.1057, "step": 4530 }, { "epoch": 0.2929131313131313, "grad_norm": 0.06429535150527954, "learning_rate": 0.00019616840530624333, "loss": 0.0913, "step": 4531 }, { "epoch": 0.2929777777777778, "grad_norm": 0.06467705965042114, "learning_rate": 0.0001961665302049475, "loss": 0.0883, "step": 4532 }, { "epoch": 0.2930424242424242, "grad_norm": 0.07456497848033905, "learning_rate": 0.0001961646546539126, "loss": 0.1085, "step": 4533 }, { "epoch": 0.2931070707070707, "grad_norm": 0.07370718568563461, "learning_rate": 0.00019616277865314744, "loss": 0.0974, "step": 4534 }, { "epoch": 0.2931717171717172, "grad_norm": 0.07023972272872925, "learning_rate": 0.0001961609022026607, "loss": 0.0918, "step": 4535 }, { "epoch": 0.29323636363636363, "grad_norm": 0.06576091796159744, "learning_rate": 0.00019615902530246125, "loss": 0.0892, "step": 4536 }, { "epoch": 0.2933010101010101, "grad_norm": 0.10609713196754456, "learning_rate": 0.00019615714795255778, "loss": 0.1272, "step": 4537 }, { "epoch": 0.29336565656565655, "grad_norm": 0.06828856468200684, "learning_rate": 0.00019615527015295916, 
"loss": 0.0862, "step": 4538 }, { "epoch": 0.29343030303030304, "grad_norm": 0.05812883749604225, "learning_rate": 0.00019615339190367414, "loss": 0.0782, "step": 4539 }, { "epoch": 0.2934949494949495, "grad_norm": 0.09120214730501175, "learning_rate": 0.00019615151320471146, "loss": 0.11, "step": 4540 }, { "epoch": 0.29355959595959596, "grad_norm": 0.0586036778986454, "learning_rate": 0.00019614963405607995, "loss": 0.0738, "step": 4541 }, { "epoch": 0.2936242424242424, "grad_norm": 0.06411392986774445, "learning_rate": 0.0001961477544577884, "loss": 0.0754, "step": 4542 }, { "epoch": 0.2936888888888889, "grad_norm": 0.058797746896743774, "learning_rate": 0.00019614587440984558, "loss": 0.0762, "step": 4543 }, { "epoch": 0.2937535353535354, "grad_norm": 0.06301523000001907, "learning_rate": 0.00019614399391226027, "loss": 0.0976, "step": 4544 }, { "epoch": 0.2937535353535354, "eval_bleu": 17.05628275009619, "eval_loss": 0.09224610030651093, "eval_runtime": 2.6654, "eval_samples_per_second": 12.006, "eval_steps_per_second": 1.501, "step": 4544 }, { "epoch": 0.2938181818181818, "grad_norm": 0.06774929910898209, "learning_rate": 0.00019614211296504133, "loss": 0.0975, "step": 4545 }, { "epoch": 0.2938828282828283, "grad_norm": 0.07908044010400772, "learning_rate": 0.00019614023156819748, "loss": 0.0853, "step": 4546 }, { "epoch": 0.29394747474747474, "grad_norm": 0.06701932102441788, "learning_rate": 0.00019613834972173754, "loss": 0.0878, "step": 4547 }, { "epoch": 0.2940121212121212, "grad_norm": 0.06733550131320953, "learning_rate": 0.00019613646742567035, "loss": 0.098, "step": 4548 }, { "epoch": 0.29407676767676766, "grad_norm": 0.07471128553152084, "learning_rate": 0.00019613458468000468, "loss": 0.0917, "step": 4549 }, { "epoch": 0.29414141414141415, "grad_norm": 0.058018557727336884, "learning_rate": 0.00019613270148474931, "loss": 0.0786, "step": 4550 }, { "epoch": 0.2942060606060606, "grad_norm": 0.08356033265590668, "learning_rate": 0.0001961308178399131, 
"loss": 0.0853, "step": 4551 }, { "epoch": 0.29427070707070707, "grad_norm": 0.07202444225549698, "learning_rate": 0.00019612893374550483, "loss": 0.0939, "step": 4552 }, { "epoch": 0.29433535353535356, "grad_norm": 0.06706680357456207, "learning_rate": 0.00019612704920153332, "loss": 0.1023, "step": 4553 }, { "epoch": 0.2944, "grad_norm": 0.07510372996330261, "learning_rate": 0.0001961251642080074, "loss": 0.1065, "step": 4554 }, { "epoch": 0.2944646464646465, "grad_norm": 0.05924130231142044, "learning_rate": 0.0001961232787649358, "loss": 0.0764, "step": 4555 }, { "epoch": 0.2945292929292929, "grad_norm": 0.06250309944152832, "learning_rate": 0.00019612139287232747, "loss": 0.0809, "step": 4556 }, { "epoch": 0.2945939393939394, "grad_norm": 0.0651947483420372, "learning_rate": 0.00019611950653019113, "loss": 0.0852, "step": 4557 }, { "epoch": 0.29465858585858584, "grad_norm": 0.06940531730651855, "learning_rate": 0.00019611761973853566, "loss": 0.0825, "step": 4558 }, { "epoch": 0.29472323232323233, "grad_norm": 0.06890036910772324, "learning_rate": 0.00019611573249736982, "loss": 0.0871, "step": 4559 }, { "epoch": 0.29478787878787877, "grad_norm": 0.06867414712905884, "learning_rate": 0.0001961138448067025, "loss": 0.0949, "step": 4560 }, { "epoch": 0.29478787878787877, "eval_bleu": 15.953762655402342, "eval_loss": 0.09117183089256287, "eval_runtime": 2.6845, "eval_samples_per_second": 11.92, "eval_steps_per_second": 1.49, "step": 4560 }, { "epoch": 0.29485252525252525, "grad_norm": 0.06641063839197159, "learning_rate": 0.00019611195666654249, "loss": 0.0918, "step": 4561 }, { "epoch": 0.29491717171717174, "grad_norm": 0.07135960459709167, "learning_rate": 0.00019611006807689866, "loss": 0.0918, "step": 4562 }, { "epoch": 0.2949818181818182, "grad_norm": 0.06634759157896042, "learning_rate": 0.00019610817903777981, "loss": 0.0789, "step": 4563 }, { "epoch": 0.29504646464646467, "grad_norm": 0.08527235686779022, "learning_rate": 0.00019610628954919474, "loss": 
0.1055, "step": 4564 }, { "epoch": 0.2951111111111111, "grad_norm": 0.08668415248394012, "learning_rate": 0.00019610439961115236, "loss": 0.1067, "step": 4565 }, { "epoch": 0.2951757575757576, "grad_norm": 0.06364767998456955, "learning_rate": 0.00019610250922366148, "loss": 0.0833, "step": 4566 }, { "epoch": 0.295240404040404, "grad_norm": 0.07508257031440735, "learning_rate": 0.0001961006183867309, "loss": 0.1044, "step": 4567 }, { "epoch": 0.2953050505050505, "grad_norm": 0.05892026796936989, "learning_rate": 0.00019609872710036954, "loss": 0.0795, "step": 4568 }, { "epoch": 0.29536969696969695, "grad_norm": 0.06696969270706177, "learning_rate": 0.00019609683536458617, "loss": 0.0906, "step": 4569 }, { "epoch": 0.29543434343434344, "grad_norm": 0.0762874111533165, "learning_rate": 0.0001960949431793897, "loss": 0.0985, "step": 4570 }, { "epoch": 0.29549898989898987, "grad_norm": 0.06808783113956451, "learning_rate": 0.00019609305054478892, "loss": 0.0855, "step": 4571 }, { "epoch": 0.29556363636363636, "grad_norm": 0.06726190447807312, "learning_rate": 0.00019609115746079269, "loss": 0.1051, "step": 4572 }, { "epoch": 0.29562828282828285, "grad_norm": 0.05773718282580376, "learning_rate": 0.00019608926392740994, "loss": 0.0765, "step": 4573 }, { "epoch": 0.2956929292929293, "grad_norm": 0.07584203034639359, "learning_rate": 0.00019608736994464944, "loss": 0.1132, "step": 4574 }, { "epoch": 0.2957575757575758, "grad_norm": 0.06285490840673447, "learning_rate": 0.00019608547551252007, "loss": 0.0832, "step": 4575 }, { "epoch": 0.2958222222222222, "grad_norm": 0.06950333714485168, "learning_rate": 0.00019608358063103072, "loss": 0.0871, "step": 4576 }, { "epoch": 0.2958222222222222, "eval_bleu": 16.400183046369527, "eval_loss": 0.09258623421192169, "eval_runtime": 2.6873, "eval_samples_per_second": 11.908, "eval_steps_per_second": 1.488, "step": 4576 }, { "epoch": 0.2958868686868687, "grad_norm": 0.06954176723957062, "learning_rate": 0.0001960816853001902, "loss": 
0.0964, "step": 4577 }, { "epoch": 0.29595151515151513, "grad_norm": 0.07281685620546341, "learning_rate": 0.00019607978952000744, "loss": 0.1175, "step": 4578 }, { "epoch": 0.2960161616161616, "grad_norm": 0.06163204088807106, "learning_rate": 0.00019607789329049123, "loss": 0.0824, "step": 4579 }, { "epoch": 0.29608080808080806, "grad_norm": 0.06183109059929848, "learning_rate": 0.0001960759966116505, "loss": 0.1033, "step": 4580 }, { "epoch": 0.29614545454545455, "grad_norm": 0.06978536397218704, "learning_rate": 0.00019607409948349405, "loss": 0.0843, "step": 4581 }, { "epoch": 0.29621010101010103, "grad_norm": 0.06802777200937271, "learning_rate": 0.00019607220190603086, "loss": 0.0962, "step": 4582 }, { "epoch": 0.29627474747474747, "grad_norm": 0.07489711791276932, "learning_rate": 0.00019607030387926971, "loss": 0.0946, "step": 4583 }, { "epoch": 0.29633939393939396, "grad_norm": 0.05409941077232361, "learning_rate": 0.00019606840540321955, "loss": 0.0723, "step": 4584 }, { "epoch": 0.2964040404040404, "grad_norm": 0.05896497145295143, "learning_rate": 0.00019606650647788917, "loss": 0.0814, "step": 4585 }, { "epoch": 0.2964686868686869, "grad_norm": 0.061247263103723526, "learning_rate": 0.00019606460710328748, "loss": 0.0882, "step": 4586 }, { "epoch": 0.2965333333333333, "grad_norm": 0.07093510031700134, "learning_rate": 0.00019606270727942345, "loss": 0.1093, "step": 4587 }, { "epoch": 0.2965979797979798, "grad_norm": 0.06910679489374161, "learning_rate": 0.00019606080700630585, "loss": 0.1, "step": 4588 }, { "epoch": 0.29666262626262624, "grad_norm": 0.05494191125035286, "learning_rate": 0.0001960589062839436, "loss": 0.0809, "step": 4589 }, { "epoch": 0.29672727272727273, "grad_norm": 0.06923728436231613, "learning_rate": 0.00019605700511234563, "loss": 0.0823, "step": 4590 }, { "epoch": 0.2967919191919192, "grad_norm": 0.07791271060705185, "learning_rate": 0.00019605510349152082, "loss": 0.1124, "step": 4591 }, { "epoch": 0.29685656565656565, 
"grad_norm": 0.0683489739894867, "learning_rate": 0.000196053201421478, "loss": 0.0849, "step": 4592 }, { "epoch": 0.29685656565656565, "eval_bleu": 14.771845035022036, "eval_loss": 0.09264057129621506, "eval_runtime": 2.753, "eval_samples_per_second": 11.624, "eval_steps_per_second": 1.453, "step": 4592 }, { "epoch": 0.29692121212121214, "grad_norm": 0.0798630639910698, "learning_rate": 0.00019605129890222614, "loss": 0.1065, "step": 4593 }, { "epoch": 0.2969858585858586, "grad_norm": 0.05915911868214607, "learning_rate": 0.00019604939593377408, "loss": 0.0802, "step": 4594 }, { "epoch": 0.29705050505050506, "grad_norm": 0.07560048997402191, "learning_rate": 0.0001960474925161308, "loss": 0.09, "step": 4595 }, { "epoch": 0.2971151515151515, "grad_norm": 0.05218242108821869, "learning_rate": 0.0001960455886493051, "loss": 0.0619, "step": 4596 }, { "epoch": 0.297179797979798, "grad_norm": 0.07150762528181076, "learning_rate": 0.00019604368433330597, "loss": 0.1023, "step": 4597 }, { "epoch": 0.2972444444444444, "grad_norm": 0.07948549091815948, "learning_rate": 0.00019604177956814226, "loss": 0.1009, "step": 4598 }, { "epoch": 0.2973090909090909, "grad_norm": 0.12578381597995758, "learning_rate": 0.00019603987435382292, "loss": 0.0879, "step": 4599 }, { "epoch": 0.2973737373737374, "grad_norm": 0.07147827744483948, "learning_rate": 0.0001960379686903568, "loss": 0.0834, "step": 4600 }, { "epoch": 0.29743838383838384, "grad_norm": 0.0709298625588417, "learning_rate": 0.00019603606257775287, "loss": 0.0922, "step": 4601 }, { "epoch": 0.2975030303030303, "grad_norm": 0.07045287638902664, "learning_rate": 0.00019603415601602002, "loss": 0.097, "step": 4602 }, { "epoch": 0.29756767676767676, "grad_norm": 0.06109197810292244, "learning_rate": 0.0001960322490051672, "loss": 0.0776, "step": 4603 }, { "epoch": 0.29763232323232325, "grad_norm": 0.06679968535900116, "learning_rate": 0.00019603034154520326, "loss": 0.0885, "step": 4604 }, { "epoch": 0.2976969696969697, 
"grad_norm": 0.06881806254386902, "learning_rate": 0.0001960284336361372, "loss": 0.0845, "step": 4605 }, { "epoch": 0.29776161616161617, "grad_norm": 0.06673866510391235, "learning_rate": 0.00019602652527797788, "loss": 0.0801, "step": 4606 }, { "epoch": 0.2978262626262626, "grad_norm": 0.07082986831665039, "learning_rate": 0.00019602461647073423, "loss": 0.0854, "step": 4607 }, { "epoch": 0.2978909090909091, "grad_norm": 0.0699472576379776, "learning_rate": 0.0001960227072144152, "loss": 0.0922, "step": 4608 }, { "epoch": 0.2978909090909091, "eval_bleu": 14.474955979377038, "eval_loss": 0.09158635884523392, "eval_runtime": 2.6993, "eval_samples_per_second": 11.855, "eval_steps_per_second": 1.482, "step": 4608 }, { "epoch": 0.29795555555555553, "grad_norm": 0.08134989440441132, "learning_rate": 0.00019602079750902972, "loss": 0.0712, "step": 4609 }, { "epoch": 0.298020202020202, "grad_norm": 0.06048596277832985, "learning_rate": 0.0001960188873545867, "loss": 0.0772, "step": 4610 }, { "epoch": 0.2980848484848485, "grad_norm": 0.08579091727733612, "learning_rate": 0.00019601697675109513, "loss": 0.0932, "step": 4611 }, { "epoch": 0.29814949494949494, "grad_norm": 0.06653161346912384, "learning_rate": 0.00019601506569856384, "loss": 0.0766, "step": 4612 }, { "epoch": 0.29821414141414143, "grad_norm": 0.06581110507249832, "learning_rate": 0.00019601315419700188, "loss": 0.0909, "step": 4613 }, { "epoch": 0.29827878787878787, "grad_norm": 0.05982992798089981, "learning_rate": 0.0001960112422464181, "loss": 0.0836, "step": 4614 }, { "epoch": 0.29834343434343435, "grad_norm": 0.0542968325316906, "learning_rate": 0.00019600932984682151, "loss": 0.0773, "step": 4615 }, { "epoch": 0.2984080808080808, "grad_norm": 0.1185133159160614, "learning_rate": 0.00019600741699822102, "loss": 0.112, "step": 4616 }, { "epoch": 0.2984727272727273, "grad_norm": 0.06345734000205994, "learning_rate": 0.0001960055037006256, "loss": 0.0819, "step": 4617 }, { "epoch": 0.2985373737373737, 
"grad_norm": 0.07264839112758636, "learning_rate": 0.00019600358995404413, "loss": 0.0926, "step": 4618 }, { "epoch": 0.2986020202020202, "grad_norm": 0.0693463459610939, "learning_rate": 0.00019600167575848564, "loss": 0.0869, "step": 4619 }, { "epoch": 0.2986666666666667, "grad_norm": 0.09668222069740295, "learning_rate": 0.00019599976111395905, "loss": 0.1131, "step": 4620 }, { "epoch": 0.2987313131313131, "grad_norm": 0.06024109944701195, "learning_rate": 0.0001959978460204733, "loss": 0.0738, "step": 4621 }, { "epoch": 0.2987959595959596, "grad_norm": 0.057824525982141495, "learning_rate": 0.0001959959304780374, "loss": 0.0851, "step": 4622 }, { "epoch": 0.29886060606060605, "grad_norm": 0.0650748461484909, "learning_rate": 0.00019599401448666022, "loss": 0.0869, "step": 4623 }, { "epoch": 0.29892525252525254, "grad_norm": 0.06472223252058029, "learning_rate": 0.00019599209804635077, "loss": 0.0982, "step": 4624 }, { "epoch": 0.29892525252525254, "eval_bleu": 15.133697997813023, "eval_loss": 0.09448139369487762, "eval_runtime": 2.7323, "eval_samples_per_second": 11.712, "eval_steps_per_second": 1.464, "step": 4624 }, { "epoch": 0.298989898989899, "grad_norm": 0.06019340828061104, "learning_rate": 0.00019599018115711806, "loss": 0.088, "step": 4625 }, { "epoch": 0.29905454545454546, "grad_norm": 0.08765371143817902, "learning_rate": 0.00019598826381897095, "loss": 0.095, "step": 4626 }, { "epoch": 0.2991191919191919, "grad_norm": 0.07066169381141663, "learning_rate": 0.0001959863460319185, "loss": 0.0828, "step": 4627 }, { "epoch": 0.2991838383838384, "grad_norm": 0.06294021755456924, "learning_rate": 0.00019598442779596961, "loss": 0.0945, "step": 4628 }, { "epoch": 0.2992484848484849, "grad_norm": 0.05835629627108574, "learning_rate": 0.0001959825091111333, "loss": 0.0916, "step": 4629 }, { "epoch": 0.2993131313131313, "grad_norm": 0.06739974766969681, "learning_rate": 0.00019598058997741854, "loss": 0.091, "step": 4630 }, { "epoch": 0.2993777777777778, 
"grad_norm": 0.06979475915431976, "learning_rate": 0.00019597867039483426, "loss": 0.1017, "step": 4631 }, { "epoch": 0.29944242424242423, "grad_norm": 0.062409088015556335, "learning_rate": 0.0001959767503633895, "loss": 0.0872, "step": 4632 }, { "epoch": 0.2995070707070707, "grad_norm": 0.06673561781644821, "learning_rate": 0.0001959748298830932, "loss": 0.1009, "step": 4633 }, { "epoch": 0.29957171717171716, "grad_norm": 0.06147103011608124, "learning_rate": 0.00019597290895395435, "loss": 0.0906, "step": 4634 }, { "epoch": 0.29963636363636365, "grad_norm": 0.06785107403993607, "learning_rate": 0.00019597098757598194, "loss": 0.0843, "step": 4635 }, { "epoch": 0.2997010101010101, "grad_norm": 0.0652463361620903, "learning_rate": 0.00019596906574918493, "loss": 0.0875, "step": 4636 }, { "epoch": 0.29976565656565657, "grad_norm": 0.06536533683538437, "learning_rate": 0.0001959671434735723, "loss": 0.0955, "step": 4637 }, { "epoch": 0.299830303030303, "grad_norm": 0.05749241262674332, "learning_rate": 0.00019596522074915313, "loss": 0.077, "step": 4638 }, { "epoch": 0.2998949494949495, "grad_norm": 0.06882575154304504, "learning_rate": 0.0001959632975759363, "loss": 0.0959, "step": 4639 }, { "epoch": 0.299959595959596, "grad_norm": 0.06407667696475983, "learning_rate": 0.0001959613739539309, "loss": 0.0911, "step": 4640 }, { "epoch": 0.299959595959596, "eval_bleu": 13.214919627610731, "eval_loss": 0.09465215355157852, "eval_runtime": 2.7415, "eval_samples_per_second": 11.672, "eval_steps_per_second": 1.459, "step": 4640 }, { "epoch": 0.3000242424242424, "grad_norm": 0.07222005724906921, "learning_rate": 0.00019595944988314582, "loss": 0.0962, "step": 4641 }, { "epoch": 0.3000888888888889, "grad_norm": 0.08042989671230316, "learning_rate": 0.00019595752536359016, "loss": 0.1114, "step": 4642 }, { "epoch": 0.30015353535353534, "grad_norm": 0.06781364977359772, "learning_rate": 0.00019595560039527285, "loss": 0.0948, "step": 4643 }, { "epoch": 0.30021818181818183, 
"grad_norm": 0.06815223395824432, "learning_rate": 0.00019595367497820293, "loss": 0.0861, "step": 4644 }, { "epoch": 0.30028282828282826, "grad_norm": 0.06212089955806732, "learning_rate": 0.0001959517491123894, "loss": 0.0883, "step": 4645 }, { "epoch": 0.30034747474747475, "grad_norm": 0.05732403323054314, "learning_rate": 0.00019594982279784126, "loss": 0.0755, "step": 4646 }, { "epoch": 0.3004121212121212, "grad_norm": 0.06334793567657471, "learning_rate": 0.0001959478960345675, "loss": 0.0882, "step": 4647 }, { "epoch": 0.3004767676767677, "grad_norm": 0.08146253973245621, "learning_rate": 0.0001959459688225772, "loss": 0.1079, "step": 4648 }, { "epoch": 0.30054141414141416, "grad_norm": 0.06087877228856087, "learning_rate": 0.00019594404116187925, "loss": 0.0771, "step": 4649 }, { "epoch": 0.3006060606060606, "grad_norm": 0.0601593479514122, "learning_rate": 0.00019594211305248277, "loss": 0.0703, "step": 4650 }, { "epoch": 0.3006707070707071, "grad_norm": 0.07142464816570282, "learning_rate": 0.00019594018449439675, "loss": 0.0983, "step": 4651 }, { "epoch": 0.3007353535353535, "grad_norm": 0.06425745785236359, "learning_rate": 0.0001959382554876302, "loss": 0.0863, "step": 4652 }, { "epoch": 0.3008, "grad_norm": 0.06959160417318344, "learning_rate": 0.00019593632603219217, "loss": 0.0889, "step": 4653 }, { "epoch": 0.30086464646464645, "grad_norm": 0.06369398534297943, "learning_rate": 0.00019593439612809163, "loss": 0.0862, "step": 4654 }, { "epoch": 0.30092929292929294, "grad_norm": 0.07400577515363693, "learning_rate": 0.00019593246577533763, "loss": 0.1027, "step": 4655 }, { "epoch": 0.30099393939393937, "grad_norm": 0.07124436646699905, "learning_rate": 0.00019593053497393923, "loss": 0.0959, "step": 4656 }, { "epoch": 0.30099393939393937, "eval_bleu": 12.94114265878201, "eval_loss": 0.09464993327856064, "eval_runtime": 2.6959, "eval_samples_per_second": 11.87, "eval_steps_per_second": 1.484, "step": 4656 }, { "epoch": 0.30105858585858586, 
"grad_norm": 0.06135867163538933, "learning_rate": 0.00019592860372390542, "loss": 0.0925, "step": 4657 }, { "epoch": 0.30112323232323235, "grad_norm": 0.12385788559913635, "learning_rate": 0.00019592667202524523, "loss": 0.0886, "step": 4658 }, { "epoch": 0.3011878787878788, "grad_norm": 0.08610004186630249, "learning_rate": 0.00019592473987796772, "loss": 0.1036, "step": 4659 }, { "epoch": 0.30125252525252527, "grad_norm": 0.06922711431980133, "learning_rate": 0.00019592280728208193, "loss": 0.0859, "step": 4660 }, { "epoch": 0.3013171717171717, "grad_norm": 0.0694604441523552, "learning_rate": 0.00019592087423759687, "loss": 0.105, "step": 4661 }, { "epoch": 0.3013818181818182, "grad_norm": 0.06808525323867798, "learning_rate": 0.00019591894074452157, "loss": 0.0891, "step": 4662 }, { "epoch": 0.30144646464646463, "grad_norm": 0.07607869058847427, "learning_rate": 0.00019591700680286512, "loss": 0.1001, "step": 4663 }, { "epoch": 0.3015111111111111, "grad_norm": 0.07043629884719849, "learning_rate": 0.00019591507241263653, "loss": 0.0927, "step": 4664 }, { "epoch": 0.30157575757575755, "grad_norm": 0.05468107387423515, "learning_rate": 0.00019591313757384487, "loss": 0.082, "step": 4665 }, { "epoch": 0.30164040404040404, "grad_norm": 0.07286622375249863, "learning_rate": 0.00019591120228649916, "loss": 0.0962, "step": 4666 }, { "epoch": 0.30170505050505053, "grad_norm": 0.06715156137943268, "learning_rate": 0.00019590926655060843, "loss": 0.0899, "step": 4667 }, { "epoch": 0.30176969696969697, "grad_norm": 0.09657955914735794, "learning_rate": 0.0001959073303661818, "loss": 0.1081, "step": 4668 }, { "epoch": 0.30183434343434346, "grad_norm": 0.07730289548635483, "learning_rate": 0.0001959053937332283, "loss": 0.1133, "step": 4669 }, { "epoch": 0.3018989898989899, "grad_norm": 0.05900173634290695, "learning_rate": 0.00019590345665175697, "loss": 0.0786, "step": 4670 }, { "epoch": 0.3019636363636364, "grad_norm": 0.06607168167829514, "learning_rate": 
0.00019590151912177688, "loss": 0.0878, "step": 4671 }, { "epoch": 0.3020282828282828, "grad_norm": 0.07803653925657272, "learning_rate": 0.0001958995811432971, "loss": 0.0969, "step": 4672 }, { "epoch": 0.3020282828282828, "eval_bleu": 13.620256756175307, "eval_loss": 0.09350509196519852, "eval_runtime": 2.8482, "eval_samples_per_second": 11.235, "eval_steps_per_second": 1.404, "step": 4672 }, { "epoch": 0.3020929292929293, "grad_norm": 0.06570162624120712, "learning_rate": 0.00019589764271632666, "loss": 0.0886, "step": 4673 }, { "epoch": 0.30215757575757574, "grad_norm": 0.07181063294410706, "learning_rate": 0.00019589570384087468, "loss": 0.0783, "step": 4674 }, { "epoch": 0.3022222222222222, "grad_norm": 0.06506641954183578, "learning_rate": 0.00019589376451695017, "loss": 0.0915, "step": 4675 }, { "epoch": 0.30228686868686866, "grad_norm": 0.06903707981109619, "learning_rate": 0.00019589182474456223, "loss": 0.0959, "step": 4676 }, { "epoch": 0.30235151515151515, "grad_norm": 0.06432800740003586, "learning_rate": 0.00019588988452371993, "loss": 0.1011, "step": 4677 }, { "epoch": 0.30241616161616164, "grad_norm": 0.06585089862346649, "learning_rate": 0.00019588794385443233, "loss": 0.0987, "step": 4678 }, { "epoch": 0.3024808080808081, "grad_norm": 0.06807604432106018, "learning_rate": 0.00019588600273670853, "loss": 0.093, "step": 4679 }, { "epoch": 0.30254545454545456, "grad_norm": 0.06365378946065903, "learning_rate": 0.0001958840611705576, "loss": 0.0969, "step": 4680 }, { "epoch": 0.302610101010101, "grad_norm": 0.06447930634021759, "learning_rate": 0.00019588211915598858, "loss": 0.0846, "step": 4681 }, { "epoch": 0.3026747474747475, "grad_norm": 0.07107406109571457, "learning_rate": 0.00019588017669301062, "loss": 0.0904, "step": 4682 }, { "epoch": 0.3027393939393939, "grad_norm": 0.06212260574102402, "learning_rate": 0.00019587823378163278, "loss": 0.0846, "step": 4683 }, { "epoch": 0.3028040404040404, "grad_norm": 0.07482921332120895, "learning_rate": 
0.0001958762904218641, "loss": 0.119, "step": 4684 }, { "epoch": 0.30286868686868684, "grad_norm": 0.06667369604110718, "learning_rate": 0.0001958743466137137, "loss": 0.0931, "step": 4685 }, { "epoch": 0.30293333333333333, "grad_norm": 0.06501346081495285, "learning_rate": 0.00019587240235719074, "loss": 0.0971, "step": 4686 }, { "epoch": 0.3029979797979798, "grad_norm": 0.0637492686510086, "learning_rate": 0.0001958704576523042, "loss": 0.0765, "step": 4687 }, { "epoch": 0.30306262626262626, "grad_norm": 0.06262986361980438, "learning_rate": 0.0001958685124990632, "loss": 0.09, "step": 4688 }, { "epoch": 0.30306262626262626, "eval_bleu": 16.012961307451263, "eval_loss": 0.09376657754182816, "eval_runtime": 2.7259, "eval_samples_per_second": 11.739, "eval_steps_per_second": 1.467, "step": 4688 }, { "epoch": 0.30312727272727275, "grad_norm": 0.08209948241710663, "learning_rate": 0.00019586656689747693, "loss": 0.1065, "step": 4689 }, { "epoch": 0.3031919191919192, "grad_norm": 0.06138643994927406, "learning_rate": 0.0001958646208475544, "loss": 0.0835, "step": 4690 }, { "epoch": 0.30325656565656567, "grad_norm": 0.0823470875620842, "learning_rate": 0.00019586267434930468, "loss": 0.125, "step": 4691 }, { "epoch": 0.3033212121212121, "grad_norm": 0.060678817331790924, "learning_rate": 0.000195860727402737, "loss": 0.0832, "step": 4692 }, { "epoch": 0.3033858585858586, "grad_norm": 0.08106742799282074, "learning_rate": 0.0001958587800078603, "loss": 0.1017, "step": 4693 }, { "epoch": 0.303450505050505, "grad_norm": 0.07476026564836502, "learning_rate": 0.00019585683216468384, "loss": 0.1118, "step": 4694 }, { "epoch": 0.3035151515151515, "grad_norm": 0.05703451484441757, "learning_rate": 0.00019585488387321664, "loss": 0.0724, "step": 4695 }, { "epoch": 0.303579797979798, "grad_norm": 0.06587185710668564, "learning_rate": 0.00019585293513346786, "loss": 0.0922, "step": 4696 }, { "epoch": 0.30364444444444444, "grad_norm": 0.06442519277334213, "learning_rate": 
0.00019585098594544659, "loss": 0.0709, "step": 4697 }, { "epoch": 0.30370909090909093, "grad_norm": 0.08398552238941193, "learning_rate": 0.00019584903630916195, "loss": 0.1026, "step": 4698 }, { "epoch": 0.30377373737373736, "grad_norm": 0.056892458349466324, "learning_rate": 0.00019584708622462303, "loss": 0.0802, "step": 4699 }, { "epoch": 0.30383838383838385, "grad_norm": 0.07192113995552063, "learning_rate": 0.00019584513569183897, "loss": 0.0975, "step": 4700 }, { "epoch": 0.3039030303030303, "grad_norm": 0.06274757534265518, "learning_rate": 0.0001958431847108189, "loss": 0.0861, "step": 4701 }, { "epoch": 0.3039676767676768, "grad_norm": 0.0750383585691452, "learning_rate": 0.00019584123328157197, "loss": 0.0863, "step": 4702 }, { "epoch": 0.3040323232323232, "grad_norm": 0.05529523640871048, "learning_rate": 0.00019583928140410724, "loss": 0.0759, "step": 4703 }, { "epoch": 0.3040969696969697, "grad_norm": 0.06533263623714447, "learning_rate": 0.00019583732907843388, "loss": 0.072, "step": 4704 }, { "epoch": 0.3040969696969697, "eval_bleu": 13.073941613645246, "eval_loss": 0.09473906457424164, "eval_runtime": 2.805, "eval_samples_per_second": 11.408, "eval_steps_per_second": 1.426, "step": 4704 }, { "epoch": 0.3041616161616162, "grad_norm": 0.07780378311872482, "learning_rate": 0.00019583537630456102, "loss": 0.1279, "step": 4705 }, { "epoch": 0.3042262626262626, "grad_norm": 0.0677536204457283, "learning_rate": 0.00019583342308249782, "loss": 0.1005, "step": 4706 }, { "epoch": 0.3042909090909091, "grad_norm": 0.06919342279434204, "learning_rate": 0.00019583146941225333, "loss": 0.0946, "step": 4707 }, { "epoch": 0.30435555555555555, "grad_norm": 0.10964915156364441, "learning_rate": 0.00019582951529383675, "loss": 0.0815, "step": 4708 }, { "epoch": 0.30442020202020204, "grad_norm": 0.06256582587957382, "learning_rate": 0.00019582756072725722, "loss": 0.0785, "step": 4709 }, { "epoch": 0.30448484848484847, "grad_norm": 0.05990685150027275, 
"learning_rate": 0.0001958256057125239, "loss": 0.0849, "step": 4710 }, { "epoch": 0.30454949494949496, "grad_norm": 0.06409945338964462, "learning_rate": 0.00019582365024964587, "loss": 0.1001, "step": 4711 }, { "epoch": 0.3046141414141414, "grad_norm": 0.06273867934942245, "learning_rate": 0.0001958216943386323, "loss": 0.0803, "step": 4712 }, { "epoch": 0.3046787878787879, "grad_norm": 0.07001735270023346, "learning_rate": 0.00019581973797949234, "loss": 0.0959, "step": 4713 }, { "epoch": 0.3047434343434343, "grad_norm": 0.05816273018717766, "learning_rate": 0.00019581778117223517, "loss": 0.0762, "step": 4714 }, { "epoch": 0.3048080808080808, "grad_norm": 0.06626967340707779, "learning_rate": 0.00019581582391686988, "loss": 0.0946, "step": 4715 }, { "epoch": 0.3048727272727273, "grad_norm": 0.061270006000995636, "learning_rate": 0.0001958138662134057, "loss": 0.0736, "step": 4716 }, { "epoch": 0.30493737373737373, "grad_norm": 0.05927988141775131, "learning_rate": 0.0001958119080618517, "loss": 0.0769, "step": 4717 }, { "epoch": 0.3050020202020202, "grad_norm": 0.06026887148618698, "learning_rate": 0.00019580994946221712, "loss": 0.0794, "step": 4718 }, { "epoch": 0.30506666666666665, "grad_norm": 0.07352053374052048, "learning_rate": 0.00019580799041451103, "loss": 0.0985, "step": 4719 }, { "epoch": 0.30513131313131314, "grad_norm": 0.07282546907663345, "learning_rate": 0.00019580603091874267, "loss": 0.0933, "step": 4720 }, { "epoch": 0.30513131313131314, "eval_bleu": 16.774989595062156, "eval_loss": 0.09336063265800476, "eval_runtime": 2.7165, "eval_samples_per_second": 11.78, "eval_steps_per_second": 1.472, "step": 4720 }, { "epoch": 0.3051959595959596, "grad_norm": 0.05864718183875084, "learning_rate": 0.0001958040709749212, "loss": 0.0762, "step": 4721 }, { "epoch": 0.30526060606060607, "grad_norm": 0.07190041244029999, "learning_rate": 0.00019580211058305573, "loss": 0.0954, "step": 4722 }, { "epoch": 0.3053252525252525, "grad_norm": 0.0585903562605381, 
"learning_rate": 0.00019580014974315545, "loss": 0.074, "step": 4723 }, { "epoch": 0.305389898989899, "grad_norm": 0.06493786722421646, "learning_rate": 0.00019579818845522957, "loss": 0.0871, "step": 4724 }, { "epoch": 0.3054545454545455, "grad_norm": 0.07651641964912415, "learning_rate": 0.00019579622671928723, "loss": 0.105, "step": 4725 }, { "epoch": 0.3055191919191919, "grad_norm": 0.06296389549970627, "learning_rate": 0.0001957942645353376, "loss": 0.0906, "step": 4726 }, { "epoch": 0.3055838383838384, "grad_norm": 0.0711960420012474, "learning_rate": 0.00019579230190338987, "loss": 0.1005, "step": 4727 }, { "epoch": 0.30564848484848484, "grad_norm": 0.06564712524414062, "learning_rate": 0.0001957903388234532, "loss": 0.0871, "step": 4728 }, { "epoch": 0.3057131313131313, "grad_norm": 0.06702519953250885, "learning_rate": 0.0001957883752955368, "loss": 0.0887, "step": 4729 }, { "epoch": 0.30577777777777776, "grad_norm": 0.06969790905714035, "learning_rate": 0.00019578641131964982, "loss": 0.107, "step": 4730 }, { "epoch": 0.30584242424242425, "grad_norm": 0.061255622655153275, "learning_rate": 0.0001957844468958015, "loss": 0.0896, "step": 4731 }, { "epoch": 0.3059070707070707, "grad_norm": 0.08252348750829697, "learning_rate": 0.00019578248202400093, "loss": 0.095, "step": 4732 }, { "epoch": 0.3059717171717172, "grad_norm": 0.06545408815145493, "learning_rate": 0.00019578051670425737, "loss": 0.0856, "step": 4733 }, { "epoch": 0.30603636363636366, "grad_norm": 0.06466548144817352, "learning_rate": 0.00019577855093658002, "loss": 0.0931, "step": 4734 }, { "epoch": 0.3061010101010101, "grad_norm": 0.0644887238740921, "learning_rate": 0.00019577658472097803, "loss": 0.0942, "step": 4735 }, { "epoch": 0.3061656565656566, "grad_norm": 0.06501445919275284, "learning_rate": 0.0001957746180574606, "loss": 0.0938, "step": 4736 }, { "epoch": 0.3061656565656566, "eval_bleu": 14.989629406095618, "eval_loss": 0.09555211663246155, "eval_runtime": 2.6007, 
"eval_samples_per_second": 12.304, "eval_steps_per_second": 1.538, "step": 4736 }, { "epoch": 0.306230303030303, "grad_norm": 0.06556830555200577, "learning_rate": 0.00019577265094603702, "loss": 0.0856, "step": 4737 }, { "epoch": 0.3062949494949495, "grad_norm": 0.10416121780872345, "learning_rate": 0.00019577068338671633, "loss": 0.1236, "step": 4738 }, { "epoch": 0.30635959595959594, "grad_norm": 0.05639476701617241, "learning_rate": 0.00019576871537950786, "loss": 0.0812, "step": 4739 }, { "epoch": 0.30642424242424243, "grad_norm": 0.06095866858959198, "learning_rate": 0.00019576674692442077, "loss": 0.0763, "step": 4740 }, { "epoch": 0.30648888888888887, "grad_norm": 0.07491468638181686, "learning_rate": 0.00019576477802146425, "loss": 0.1069, "step": 4741 }, { "epoch": 0.30655353535353536, "grad_norm": 0.07674787193536758, "learning_rate": 0.00019576280867064752, "loss": 0.104, "step": 4742 }, { "epoch": 0.30661818181818185, "grad_norm": 0.06851784884929657, "learning_rate": 0.00019576083887197978, "loss": 0.0865, "step": 4743 }, { "epoch": 0.3066828282828283, "grad_norm": 0.06821895390748978, "learning_rate": 0.00019575886862547028, "loss": 0.0939, "step": 4744 }, { "epoch": 0.30674747474747477, "grad_norm": 0.07297416031360626, "learning_rate": 0.00019575689793112822, "loss": 0.1185, "step": 4745 }, { "epoch": 0.3068121212121212, "grad_norm": 0.055229298770427704, "learning_rate": 0.00019575492678896276, "loss": 0.072, "step": 4746 }, { "epoch": 0.3068767676767677, "grad_norm": 0.055858418345451355, "learning_rate": 0.0001957529551989832, "loss": 0.0714, "step": 4747 }, { "epoch": 0.3069414141414141, "grad_norm": 0.07207828760147095, "learning_rate": 0.0001957509831611987, "loss": 0.0992, "step": 4748 }, { "epoch": 0.3070060606060606, "grad_norm": 0.0785556212067604, "learning_rate": 0.00019574901067561854, "loss": 0.1036, "step": 4749 }, { "epoch": 0.30707070707070705, "grad_norm": 0.05766552686691284, "learning_rate": 0.0001957470377422519, "loss": 
0.0824, "step": 4750 }, { "epoch": 0.30713535353535354, "grad_norm": 0.08286474645137787, "learning_rate": 0.00019574506436110799, "loss": 0.12, "step": 4751 }, { "epoch": 0.3072, "grad_norm": 0.05716665834188461, "learning_rate": 0.0001957430905321961, "loss": 0.0753, "step": 4752 }, { "epoch": 0.3072, "eval_bleu": 13.38680526728827, "eval_loss": 0.09434428811073303, "eval_runtime": 2.6643, "eval_samples_per_second": 12.011, "eval_steps_per_second": 1.501, "step": 4752 }, { "epoch": 0.30726464646464646, "grad_norm": 0.07537861913442612, "learning_rate": 0.00019574111625552537, "loss": 0.1003, "step": 4753 }, { "epoch": 0.30732929292929295, "grad_norm": 0.0637507438659668, "learning_rate": 0.00019573914153110514, "loss": 0.0876, "step": 4754 }, { "epoch": 0.3073939393939394, "grad_norm": 0.06305477023124695, "learning_rate": 0.00019573716635894458, "loss": 0.0879, "step": 4755 }, { "epoch": 0.3074585858585859, "grad_norm": 0.06771845370531082, "learning_rate": 0.00019573519073905297, "loss": 0.0919, "step": 4756 }, { "epoch": 0.3075232323232323, "grad_norm": 0.0642908588051796, "learning_rate": 0.0001957332146714395, "loss": 0.0782, "step": 4757 }, { "epoch": 0.3075878787878788, "grad_norm": 0.06687585264444351, "learning_rate": 0.00019573123815611344, "loss": 0.0855, "step": 4758 }, { "epoch": 0.30765252525252523, "grad_norm": 0.06200970709323883, "learning_rate": 0.00019572926119308404, "loss": 0.0725, "step": 4759 }, { "epoch": 0.3077171717171717, "grad_norm": 0.06523467600345612, "learning_rate": 0.0001957272837823605, "loss": 0.0816, "step": 4760 }, { "epoch": 0.30778181818181816, "grad_norm": 0.07990944385528564, "learning_rate": 0.00019572530592395213, "loss": 0.0998, "step": 4761 }, { "epoch": 0.30784646464646465, "grad_norm": 0.06983045488595963, "learning_rate": 0.00019572332761786813, "loss": 0.0972, "step": 4762 }, { "epoch": 0.30791111111111114, "grad_norm": 0.06714247912168503, "learning_rate": 0.00019572134886411776, "loss": 0.0891, "step": 4763 }, { 
"epoch": 0.30797575757575757, "grad_norm": 0.08239343762397766, "learning_rate": 0.00019571936966271034, "loss": 0.0821, "step": 4764 }, { "epoch": 0.30804040404040406, "grad_norm": 0.057748764753341675, "learning_rate": 0.00019571739001365504, "loss": 0.0769, "step": 4765 }, { "epoch": 0.3081050505050505, "grad_norm": 0.0627964586019516, "learning_rate": 0.00019571540991696114, "loss": 0.0862, "step": 4766 }, { "epoch": 0.308169696969697, "grad_norm": 0.06751590967178345, "learning_rate": 0.00019571342937263792, "loss": 0.089, "step": 4767 }, { "epoch": 0.3082343434343434, "grad_norm": 0.07098577916622162, "learning_rate": 0.00019571144838069463, "loss": 0.0855, "step": 4768 }, { "epoch": 0.3082343434343434, "eval_bleu": 11.205770360479216, "eval_loss": 0.09519843757152557, "eval_runtime": 2.8521, "eval_samples_per_second": 11.22, "eval_steps_per_second": 1.402, "step": 4768 }, { "epoch": 0.3082989898989899, "grad_norm": 0.06362498551607132, "learning_rate": 0.00019570946694114053, "loss": 0.0843, "step": 4769 }, { "epoch": 0.30836363636363634, "grad_norm": 0.06605575978755951, "learning_rate": 0.00019570748505398492, "loss": 0.0881, "step": 4770 }, { "epoch": 0.30842828282828283, "grad_norm": 0.061822086572647095, "learning_rate": 0.00019570550271923702, "loss": 0.0938, "step": 4771 }, { "epoch": 0.3084929292929293, "grad_norm": 0.06998146325349808, "learning_rate": 0.00019570351993690612, "loss": 0.0966, "step": 4772 }, { "epoch": 0.30855757575757575, "grad_norm": 0.059741441160440445, "learning_rate": 0.00019570153670700152, "loss": 0.0821, "step": 4773 }, { "epoch": 0.30862222222222224, "grad_norm": 0.05550830811262131, "learning_rate": 0.00019569955302953246, "loss": 0.0679, "step": 4774 }, { "epoch": 0.3086868686868687, "grad_norm": 0.05758580192923546, "learning_rate": 0.00019569756890450824, "loss": 0.0681, "step": 4775 }, { "epoch": 0.30875151515151517, "grad_norm": 0.08599646389484406, "learning_rate": 0.00019569558433193808, "loss": 0.1019, "step": 4776 
}, { "epoch": 0.3088161616161616, "grad_norm": 0.10221553593873978, "learning_rate": 0.00019569359931183135, "loss": 0.0969, "step": 4777 }, { "epoch": 0.3088808080808081, "grad_norm": 0.07317476719617844, "learning_rate": 0.0001956916138441973, "loss": 0.0976, "step": 4778 }, { "epoch": 0.3089454545454545, "grad_norm": 0.06070096790790558, "learning_rate": 0.0001956896279290452, "loss": 0.0862, "step": 4779 }, { "epoch": 0.309010101010101, "grad_norm": 0.06562337279319763, "learning_rate": 0.00019568764156638433, "loss": 0.0878, "step": 4780 }, { "epoch": 0.3090747474747475, "grad_norm": 0.07425237447023392, "learning_rate": 0.00019568565475622398, "loss": 0.0926, "step": 4781 }, { "epoch": 0.30913939393939394, "grad_norm": 0.0745721161365509, "learning_rate": 0.0001956836674985735, "loss": 0.1009, "step": 4782 }, { "epoch": 0.3092040404040404, "grad_norm": 0.059794675558805466, "learning_rate": 0.00019568167979344212, "loss": 0.0851, "step": 4783 }, { "epoch": 0.30926868686868686, "grad_norm": 0.053508102893829346, "learning_rate": 0.00019567969164083912, "loss": 0.0688, "step": 4784 }, { "epoch": 0.30926868686868686, "eval_bleu": 15.035717079060975, "eval_loss": 0.09469657391309738, "eval_runtime": 2.7942, "eval_samples_per_second": 11.452, "eval_steps_per_second": 1.432, "step": 4784 }, { "epoch": 0.30933333333333335, "grad_norm": 0.0803999975323677, "learning_rate": 0.00019567770304077388, "loss": 0.0947, "step": 4785 }, { "epoch": 0.3093979797979798, "grad_norm": 0.08259466290473938, "learning_rate": 0.00019567571399325563, "loss": 0.1044, "step": 4786 }, { "epoch": 0.3094626262626263, "grad_norm": 0.06777393072843552, "learning_rate": 0.0001956737244982937, "loss": 0.1108, "step": 4787 }, { "epoch": 0.3095272727272727, "grad_norm": 0.059062596410512924, "learning_rate": 0.0001956717345558974, "loss": 0.0888, "step": 4788 }, { "epoch": 0.3095919191919192, "grad_norm": 0.05767020583152771, "learning_rate": 0.00019566974416607602, "loss": 0.0789, "step": 4789 
}, { "epoch": 0.30965656565656563, "grad_norm": 0.06474613398313522, "learning_rate": 0.00019566775332883885, "loss": 0.0996, "step": 4790 }, { "epoch": 0.3097212121212121, "grad_norm": 0.0566738024353981, "learning_rate": 0.00019566576204419527, "loss": 0.0817, "step": 4791 }, { "epoch": 0.3097858585858586, "grad_norm": 0.05534113571047783, "learning_rate": 0.00019566377031215453, "loss": 0.0754, "step": 4792 }, { "epoch": 0.30985050505050504, "grad_norm": 0.06616433709859848, "learning_rate": 0.00019566177813272595, "loss": 0.1053, "step": 4793 }, { "epoch": 0.30991515151515153, "grad_norm": 0.05442001670598984, "learning_rate": 0.00019565978550591885, "loss": 0.0716, "step": 4794 }, { "epoch": 0.30997979797979797, "grad_norm": 0.06467068195343018, "learning_rate": 0.00019565779243174258, "loss": 0.0871, "step": 4795 }, { "epoch": 0.31004444444444446, "grad_norm": 0.06228816881775856, "learning_rate": 0.00019565579891020645, "loss": 0.0911, "step": 4796 }, { "epoch": 0.3101090909090909, "grad_norm": 0.07857277989387512, "learning_rate": 0.00019565380494131974, "loss": 0.1219, "step": 4797 }, { "epoch": 0.3101737373737374, "grad_norm": 0.06959903985261917, "learning_rate": 0.00019565181052509181, "loss": 0.101, "step": 4798 }, { "epoch": 0.3102383838383838, "grad_norm": 0.07208540290594101, "learning_rate": 0.000195649815661532, "loss": 0.0949, "step": 4799 }, { "epoch": 0.3103030303030303, "grad_norm": 0.05932864174246788, "learning_rate": 0.00019564782035064963, "loss": 0.085, "step": 4800 }, { "epoch": 0.3103030303030303, "eval_bleu": 17.41184961237193, "eval_loss": 0.09466177970170975, "eval_runtime": 2.7229, "eval_samples_per_second": 11.752, "eval_steps_per_second": 1.469, "step": 4800 }, { "epoch": 0.3103676767676768, "grad_norm": 0.07277443259954453, "learning_rate": 0.00019564582459245399, "loss": 0.1045, "step": 4801 }, { "epoch": 0.3104323232323232, "grad_norm": 0.06777191907167435, "learning_rate": 0.00019564382838695447, "loss": 0.1013, "step": 4802 
}, { "epoch": 0.3104969696969697, "grad_norm": 0.06147941201925278, "learning_rate": 0.0001956418317341604, "loss": 0.0893, "step": 4803 }, { "epoch": 0.31056161616161615, "grad_norm": 0.07320127636194229, "learning_rate": 0.0001956398346340811, "loss": 0.1013, "step": 4804 }, { "epoch": 0.31062626262626264, "grad_norm": 0.06448844820261002, "learning_rate": 0.00019563783708672586, "loss": 0.0835, "step": 4805 }, { "epoch": 0.3106909090909091, "grad_norm": 0.06548895686864853, "learning_rate": 0.00019563583909210411, "loss": 0.0859, "step": 4806 }, { "epoch": 0.31075555555555556, "grad_norm": 0.06265900284051895, "learning_rate": 0.00019563384065022517, "loss": 0.0802, "step": 4807 }, { "epoch": 0.310820202020202, "grad_norm": 0.0711962878704071, "learning_rate": 0.00019563184176109836, "loss": 0.0859, "step": 4808 }, { "epoch": 0.3108848484848485, "grad_norm": 0.06866409629583359, "learning_rate": 0.00019562984242473306, "loss": 0.0851, "step": 4809 }, { "epoch": 0.310949494949495, "grad_norm": 0.06327687948942184, "learning_rate": 0.0001956278426411386, "loss": 0.0874, "step": 4810 }, { "epoch": 0.3110141414141414, "grad_norm": 0.11567649245262146, "learning_rate": 0.00019562584241032428, "loss": 0.0916, "step": 4811 }, { "epoch": 0.3110787878787879, "grad_norm": 0.06246242672204971, "learning_rate": 0.00019562384173229958, "loss": 0.0866, "step": 4812 }, { "epoch": 0.31114343434343433, "grad_norm": 0.07560346275568008, "learning_rate": 0.00019562184060707374, "loss": 0.0973, "step": 4813 }, { "epoch": 0.3112080808080808, "grad_norm": 0.06402620673179626, "learning_rate": 0.00019561983903465616, "loss": 0.0968, "step": 4814 }, { "epoch": 0.31127272727272726, "grad_norm": 0.07922115921974182, "learning_rate": 0.00019561783701505623, "loss": 0.1115, "step": 4815 }, { "epoch": 0.31133737373737375, "grad_norm": 0.062015239149332047, "learning_rate": 0.00019561583454828327, "loss": 0.0861, "step": 4816 }, { "epoch": 0.31133737373737375, "eval_bleu": 12.5886452578275, 
"eval_loss": 0.0951358824968338, "eval_runtime": 2.8554, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 4816 }, { "epoch": 0.3114020202020202, "grad_norm": 0.058490097522735596, "learning_rate": 0.00019561383163434667, "loss": 0.0764, "step": 4817 }, { "epoch": 0.31146666666666667, "grad_norm": 0.07096497714519501, "learning_rate": 0.0001956118282732558, "loss": 0.106, "step": 4818 }, { "epoch": 0.31153131313131316, "grad_norm": 0.059846751391887665, "learning_rate": 0.00019560982446501998, "loss": 0.0857, "step": 4819 }, { "epoch": 0.3115959595959596, "grad_norm": 0.06803713738918304, "learning_rate": 0.00019560782020964864, "loss": 0.0925, "step": 4820 }, { "epoch": 0.3116606060606061, "grad_norm": 0.06713566929101944, "learning_rate": 0.00019560581550715113, "loss": 0.0892, "step": 4821 }, { "epoch": 0.3117252525252525, "grad_norm": 0.07706134766340256, "learning_rate": 0.00019560381035753683, "loss": 0.1105, "step": 4822 }, { "epoch": 0.311789898989899, "grad_norm": 0.06967047601938248, "learning_rate": 0.00019560180476081514, "loss": 0.0773, "step": 4823 }, { "epoch": 0.31185454545454544, "grad_norm": 0.06331364065408707, "learning_rate": 0.0001955997987169954, "loss": 0.0823, "step": 4824 }, { "epoch": 0.31191919191919193, "grad_norm": 0.06123747304081917, "learning_rate": 0.000195597792226087, "loss": 0.081, "step": 4825 }, { "epoch": 0.31198383838383836, "grad_norm": 0.07577239722013474, "learning_rate": 0.00019559578528809932, "loss": 0.1067, "step": 4826 }, { "epoch": 0.31204848484848485, "grad_norm": 0.06580839306116104, "learning_rate": 0.00019559377790304177, "loss": 0.0913, "step": 4827 }, { "epoch": 0.3121131313131313, "grad_norm": 0.07268828898668289, "learning_rate": 0.00019559177007092367, "loss": 0.0835, "step": 4828 }, { "epoch": 0.3121777777777778, "grad_norm": 0.0747152790427208, "learning_rate": 0.00019558976179175454, "loss": 0.1046, "step": 4829 }, { "epoch": 0.31224242424242427, "grad_norm": 0.06427829712629318, 
"learning_rate": 0.00019558775306554368, "loss": 0.0716, "step": 4830 }, { "epoch": 0.3123070707070707, "grad_norm": 0.07013077288866043, "learning_rate": 0.00019558574389230048, "loss": 0.0884, "step": 4831 }, { "epoch": 0.3123717171717172, "grad_norm": 0.05118946731090546, "learning_rate": 0.00019558373427203436, "loss": 0.069, "step": 4832 }, { "epoch": 0.3123717171717172, "eval_bleu": 15.852056043665936, "eval_loss": 0.09288105368614197, "eval_runtime": 2.6122, "eval_samples_per_second": 12.25, "eval_steps_per_second": 1.531, "step": 4832 }, { "epoch": 0.3124363636363636, "grad_norm": 0.0632585808634758, "learning_rate": 0.00019558172420475471, "loss": 0.0894, "step": 4833 }, { "epoch": 0.3125010101010101, "grad_norm": 0.059064168483018875, "learning_rate": 0.00019557971369047096, "loss": 0.0764, "step": 4834 }, { "epoch": 0.31256565656565655, "grad_norm": 0.06656047701835632, "learning_rate": 0.00019557770272919244, "loss": 0.1028, "step": 4835 }, { "epoch": 0.31263030303030304, "grad_norm": 0.0714767724275589, "learning_rate": 0.00019557569132092864, "loss": 0.1004, "step": 4836 }, { "epoch": 0.31269494949494947, "grad_norm": 0.0642804428935051, "learning_rate": 0.00019557367946568892, "loss": 0.089, "step": 4837 }, { "epoch": 0.31275959595959596, "grad_norm": 0.07364820688962936, "learning_rate": 0.00019557166716348268, "loss": 0.0984, "step": 4838 }, { "epoch": 0.31282424242424245, "grad_norm": 0.0542878694832325, "learning_rate": 0.00019556965441431938, "loss": 0.0639, "step": 4839 }, { "epoch": 0.3128888888888889, "grad_norm": 0.06015294790267944, "learning_rate": 0.00019556764121820837, "loss": 0.0764, "step": 4840 }, { "epoch": 0.3129535353535354, "grad_norm": 0.06800837814807892, "learning_rate": 0.00019556562757515913, "loss": 0.0994, "step": 4841 }, { "epoch": 0.3130181818181818, "grad_norm": 0.061244990676641464, "learning_rate": 0.000195563613485181, "loss": 0.0854, "step": 4842 }, { "epoch": 0.3130828282828283, "grad_norm": 0.06450098007917404, 
"learning_rate": 0.0001955615989482835, "loss": 0.0759, "step": 4843 }, { "epoch": 0.31314747474747473, "grad_norm": 0.05966275930404663, "learning_rate": 0.00019555958396447594, "loss": 0.0785, "step": 4844 }, { "epoch": 0.3132121212121212, "grad_norm": 0.06054462492465973, "learning_rate": 0.0001955575685337678, "loss": 0.086, "step": 4845 }, { "epoch": 0.31327676767676765, "grad_norm": 0.07119578868150711, "learning_rate": 0.00019555555265616853, "loss": 0.092, "step": 4846 }, { "epoch": 0.31334141414141414, "grad_norm": 0.07607617229223251, "learning_rate": 0.0001955535363316875, "loss": 0.1032, "step": 4847 }, { "epoch": 0.31340606060606063, "grad_norm": 0.06838122010231018, "learning_rate": 0.0001955515195603342, "loss": 0.0932, "step": 4848 }, { "epoch": 0.31340606060606063, "eval_bleu": 14.165919280145383, "eval_loss": 0.09376853704452515, "eval_runtime": 2.7251, "eval_samples_per_second": 11.743, "eval_steps_per_second": 1.468, "step": 4848 }, { "epoch": 0.31347070707070707, "grad_norm": 0.06580153852701187, "learning_rate": 0.000195549502342118, "loss": 0.084, "step": 4849 }, { "epoch": 0.31353535353535356, "grad_norm": 0.06856783479452133, "learning_rate": 0.00019554748467704843, "loss": 0.0874, "step": 4850 }, { "epoch": 0.3136, "grad_norm": 0.06505312770605087, "learning_rate": 0.0001955454665651348, "loss": 0.0918, "step": 4851 }, { "epoch": 0.3136646464646465, "grad_norm": 0.06147868186235428, "learning_rate": 0.0001955434480063866, "loss": 0.0844, "step": 4852 }, { "epoch": 0.3137292929292929, "grad_norm": 0.06927355378866196, "learning_rate": 0.00019554142900081334, "loss": 0.0943, "step": 4853 }, { "epoch": 0.3137939393939394, "grad_norm": 0.07105158269405365, "learning_rate": 0.00019553940954842436, "loss": 0.1024, "step": 4854 }, { "epoch": 0.31385858585858584, "grad_norm": 0.06368059664964676, "learning_rate": 0.00019553738964922914, "loss": 0.1013, "step": 4855 }, { "epoch": 0.3139232323232323, "grad_norm": 0.06101905554533005, 
"learning_rate": 0.00019553536930323718, "loss": 0.0831, "step": 4856 }, { "epoch": 0.31398787878787876, "grad_norm": 0.05785023048520088, "learning_rate": 0.00019553334851045784, "loss": 0.0852, "step": 4857 }, { "epoch": 0.31405252525252525, "grad_norm": 0.06406204402446747, "learning_rate": 0.0001955313272709006, "loss": 0.0851, "step": 4858 }, { "epoch": 0.31411717171717174, "grad_norm": 0.07250870019197464, "learning_rate": 0.00019552930558457496, "loss": 0.0991, "step": 4859 }, { "epoch": 0.3141818181818182, "grad_norm": 0.06991460919380188, "learning_rate": 0.0001955272834514903, "loss": 0.0921, "step": 4860 }, { "epoch": 0.31424646464646466, "grad_norm": 0.0643521249294281, "learning_rate": 0.00019552526087165614, "loss": 0.1065, "step": 4861 }, { "epoch": 0.3143111111111111, "grad_norm": 0.059560347348451614, "learning_rate": 0.00019552323784508192, "loss": 0.0873, "step": 4862 }, { "epoch": 0.3143757575757576, "grad_norm": 0.05429724231362343, "learning_rate": 0.0001955212143717771, "loss": 0.0695, "step": 4863 }, { "epoch": 0.314440404040404, "grad_norm": 0.05925700441002846, "learning_rate": 0.00019551919045175114, "loss": 0.0784, "step": 4864 }, { "epoch": 0.314440404040404, "eval_bleu": 17.800953675232073, "eval_loss": 0.09244292229413986, "eval_runtime": 2.7021, "eval_samples_per_second": 11.843, "eval_steps_per_second": 1.48, "step": 4864 }, { "epoch": 0.3145050505050505, "grad_norm": 0.05828540027141571, "learning_rate": 0.0001955171660850135, "loss": 0.0814, "step": 4865 }, { "epoch": 0.31456969696969694, "grad_norm": 0.07010854035615921, "learning_rate": 0.00019551514127157362, "loss": 0.1048, "step": 4866 }, { "epoch": 0.31463434343434343, "grad_norm": 0.0630769357085228, "learning_rate": 0.00019551311601144104, "loss": 0.0826, "step": 4867 }, { "epoch": 0.3146989898989899, "grad_norm": 0.06398430466651917, "learning_rate": 0.0001955110903046252, "loss": 0.0823, "step": 4868 }, { "epoch": 0.31476363636363636, "grad_norm": 0.06699857115745544, 
"learning_rate": 0.00019550906415113554, "loss": 0.0711, "step": 4869 }, { "epoch": 0.31482828282828285, "grad_norm": 0.05718837305903435, "learning_rate": 0.00019550703755098154, "loss": 0.0677, "step": 4870 }, { "epoch": 0.3148929292929293, "grad_norm": 0.07908740639686584, "learning_rate": 0.00019550501050417273, "loss": 0.102, "step": 4871 }, { "epoch": 0.31495757575757577, "grad_norm": 0.0655069574713707, "learning_rate": 0.00019550298301071857, "loss": 0.0802, "step": 4872 }, { "epoch": 0.3150222222222222, "grad_norm": 0.07686731964349747, "learning_rate": 0.00019550095507062852, "loss": 0.0956, "step": 4873 }, { "epoch": 0.3150868686868687, "grad_norm": 0.07690903544425964, "learning_rate": 0.00019549892668391206, "loss": 0.0959, "step": 4874 }, { "epoch": 0.3151515151515151, "grad_norm": 0.06656327843666077, "learning_rate": 0.00019549689785057872, "loss": 0.1028, "step": 4875 }, { "epoch": 0.3152161616161616, "grad_norm": 0.06390737742185593, "learning_rate": 0.00019549486857063793, "loss": 0.0778, "step": 4876 }, { "epoch": 0.3152808080808081, "grad_norm": 0.07323991507291794, "learning_rate": 0.0001954928388440992, "loss": 0.1061, "step": 4877 }, { "epoch": 0.31534545454545454, "grad_norm": 0.06591487675905228, "learning_rate": 0.0001954908086709721, "loss": 0.0972, "step": 4878 }, { "epoch": 0.31541010101010103, "grad_norm": 0.0657346174120903, "learning_rate": 0.00019548877805126598, "loss": 0.0851, "step": 4879 }, { "epoch": 0.31547474747474746, "grad_norm": 0.07673000544309616, "learning_rate": 0.00019548674698499044, "loss": 0.1101, "step": 4880 }, { "epoch": 0.31547474747474746, "eval_bleu": 13.40367618882521, "eval_loss": 0.09336511045694351, "eval_runtime": 2.8707, "eval_samples_per_second": 11.147, "eval_steps_per_second": 1.393, "step": 4880 }, { "epoch": 0.31553939393939395, "grad_norm": 0.05856234207749367, "learning_rate": 0.00019548471547215497, "loss": 0.0744, "step": 4881 }, { "epoch": 0.3156040404040404, "grad_norm": 0.10287497937679291, 
"learning_rate": 0.00019548268351276903, "loss": 0.0995, "step": 4882 }, { "epoch": 0.3156686868686869, "grad_norm": 0.06215628236532211, "learning_rate": 0.00019548065110684215, "loss": 0.0834, "step": 4883 }, { "epoch": 0.3157333333333333, "grad_norm": 0.062197282910346985, "learning_rate": 0.00019547861825438385, "loss": 0.0865, "step": 4884 }, { "epoch": 0.3157979797979798, "grad_norm": 0.06098649278283119, "learning_rate": 0.0001954765849554036, "loss": 0.0873, "step": 4885 }, { "epoch": 0.3158626262626263, "grad_norm": 0.057393934577703476, "learning_rate": 0.00019547455120991095, "loss": 0.0873, "step": 4886 }, { "epoch": 0.3159272727272727, "grad_norm": 0.05440773814916611, "learning_rate": 0.00019547251701791533, "loss": 0.0782, "step": 4887 }, { "epoch": 0.3159919191919192, "grad_norm": 0.06450515240430832, "learning_rate": 0.00019547048237942636, "loss": 0.1002, "step": 4888 }, { "epoch": 0.31605656565656565, "grad_norm": 0.05709666386246681, "learning_rate": 0.00019546844729445348, "loss": 0.0888, "step": 4889 }, { "epoch": 0.31612121212121214, "grad_norm": 0.06465613842010498, "learning_rate": 0.00019546641176300625, "loss": 0.0896, "step": 4890 }, { "epoch": 0.31618585858585857, "grad_norm": 0.06351575255393982, "learning_rate": 0.00019546437578509416, "loss": 0.0831, "step": 4891 }, { "epoch": 0.31625050505050506, "grad_norm": 0.06688931584358215, "learning_rate": 0.00019546233936072676, "loss": 0.1025, "step": 4892 }, { "epoch": 0.3163151515151515, "grad_norm": 0.058955784887075424, "learning_rate": 0.00019546030248991354, "loss": 0.0773, "step": 4893 }, { "epoch": 0.316379797979798, "grad_norm": 0.0577361173927784, "learning_rate": 0.00019545826517266404, "loss": 0.0815, "step": 4894 }, { "epoch": 0.3164444444444444, "grad_norm": 0.0661374032497406, "learning_rate": 0.00019545622740898782, "loss": 0.0963, "step": 4895 }, { "epoch": 0.3165090909090909, "grad_norm": 0.06921987980604172, "learning_rate": 0.00019545418919889436, "loss": 0.0891, "step": 
4896 }, { "epoch": 0.3165090909090909, "eval_bleu": 12.683546899011779, "eval_loss": 0.0946357250213623, "eval_runtime": 2.6926, "eval_samples_per_second": 11.884, "eval_steps_per_second": 1.486, "step": 4896 }, { "epoch": 0.3165737373737374, "grad_norm": 0.0733335018157959, "learning_rate": 0.00019545215054239323, "loss": 0.1108, "step": 4897 }, { "epoch": 0.31663838383838383, "grad_norm": 0.06435233354568481, "learning_rate": 0.00019545011143949392, "loss": 0.0871, "step": 4898 }, { "epoch": 0.3167030303030303, "grad_norm": 0.06435911357402802, "learning_rate": 0.00019544807189020603, "loss": 0.0883, "step": 4899 }, { "epoch": 0.31676767676767675, "grad_norm": 0.07331840693950653, "learning_rate": 0.00019544603189453904, "loss": 0.1151, "step": 4900 }, { "epoch": 0.31683232323232324, "grad_norm": 0.06468986719846725, "learning_rate": 0.00019544399145250249, "loss": 0.093, "step": 4901 }, { "epoch": 0.3168969696969697, "grad_norm": 0.07084932923316956, "learning_rate": 0.000195441950564106, "loss": 0.089, "step": 4902 }, { "epoch": 0.31696161616161617, "grad_norm": 0.07533324509859085, "learning_rate": 0.000195439909229359, "loss": 0.1045, "step": 4903 }, { "epoch": 0.3170262626262626, "grad_norm": 0.06669335067272186, "learning_rate": 0.00019543786744827114, "loss": 0.0858, "step": 4904 }, { "epoch": 0.3170909090909091, "grad_norm": 0.06391675025224686, "learning_rate": 0.0001954358252208519, "loss": 0.0902, "step": 4905 }, { "epoch": 0.3171555555555556, "grad_norm": 0.06471959501504898, "learning_rate": 0.0001954337825471109, "loss": 0.0825, "step": 4906 }, { "epoch": 0.317220202020202, "grad_norm": 0.07275552302598953, "learning_rate": 0.0001954317394270576, "loss": 0.1025, "step": 4907 }, { "epoch": 0.3172848484848485, "grad_norm": 0.062462080270051956, "learning_rate": 0.00019542969586070164, "loss": 0.0964, "step": 4908 }, { "epoch": 0.31734949494949494, "grad_norm": 0.06980860978364944, "learning_rate": 0.0001954276518480525, "loss": 0.0823, "step": 4909 }, 
{ "epoch": 0.3174141414141414, "grad_norm": 0.07418327033519745, "learning_rate": 0.00019542560738911981, "loss": 0.1083, "step": 4910 }, { "epoch": 0.31747878787878786, "grad_norm": 0.06741069257259369, "learning_rate": 0.0001954235624839131, "loss": 0.0874, "step": 4911 }, { "epoch": 0.31754343434343435, "grad_norm": 0.06400905549526215, "learning_rate": 0.00019542151713244191, "loss": 0.0838, "step": 4912 }, { "epoch": 0.31754343434343435, "eval_bleu": 13.657495976169672, "eval_loss": 0.09310087561607361, "eval_runtime": 2.7443, "eval_samples_per_second": 11.661, "eval_steps_per_second": 1.458, "step": 4912 }, { "epoch": 0.3176080808080808, "grad_norm": 0.0613933689892292, "learning_rate": 0.00019541947133471587, "loss": 0.0812, "step": 4913 }, { "epoch": 0.3176727272727273, "grad_norm": 0.06032070517539978, "learning_rate": 0.00019541742509074448, "loss": 0.0744, "step": 4914 }, { "epoch": 0.31773737373737376, "grad_norm": 0.07494694739580154, "learning_rate": 0.00019541537840053733, "loss": 0.1026, "step": 4915 }, { "epoch": 0.3178020202020202, "grad_norm": 0.061808351427316666, "learning_rate": 0.000195413331264104, "loss": 0.0854, "step": 4916 }, { "epoch": 0.3178666666666667, "grad_norm": 0.05917581915855408, "learning_rate": 0.00019541128368145408, "loss": 0.079, "step": 4917 }, { "epoch": 0.3179313131313131, "grad_norm": 0.07325282692909241, "learning_rate": 0.0001954092356525971, "loss": 0.1025, "step": 4918 }, { "epoch": 0.3179959595959596, "grad_norm": 0.06206470727920532, "learning_rate": 0.0001954071871775427, "loss": 0.0951, "step": 4919 }, { "epoch": 0.31806060606060604, "grad_norm": 0.06596078723669052, "learning_rate": 0.00019540513825630043, "loss": 0.0986, "step": 4920 }, { "epoch": 0.31812525252525253, "grad_norm": 0.06132863834500313, "learning_rate": 0.00019540308888887987, "loss": 0.0883, "step": 4921 }, { "epoch": 0.31818989898989897, "grad_norm": 0.06334802508354187, "learning_rate": 0.0001954010390752906, "loss": 0.0945, "step": 4922 }, 
{ "epoch": 0.31825454545454546, "grad_norm": 0.0976632758975029, "learning_rate": 0.00019539898881554217, "loss": 0.0809, "step": 4923 }, { "epoch": 0.31831919191919195, "grad_norm": 0.0650988519191742, "learning_rate": 0.00019539693810964424, "loss": 0.0926, "step": 4924 }, { "epoch": 0.3183838383838384, "grad_norm": 0.06815777719020844, "learning_rate": 0.00019539488695760637, "loss": 0.0912, "step": 4925 }, { "epoch": 0.31844848484848487, "grad_norm": 0.06816314160823822, "learning_rate": 0.00019539283535943813, "loss": 0.0883, "step": 4926 }, { "epoch": 0.3185131313131313, "grad_norm": 0.06158888712525368, "learning_rate": 0.00019539078331514914, "loss": 0.0914, "step": 4927 }, { "epoch": 0.3185777777777778, "grad_norm": 0.0751294270157814, "learning_rate": 0.00019538873082474903, "loss": 0.1189, "step": 4928 }, { "epoch": 0.3185777777777778, "eval_bleu": 14.969845234431299, "eval_loss": 0.09190497547388077, "eval_runtime": 2.6879, "eval_samples_per_second": 11.905, "eval_steps_per_second": 1.488, "step": 4928 }, { "epoch": 0.3186424242424242, "grad_norm": 0.060045670717954636, "learning_rate": 0.00019538667788824733, "loss": 0.0875, "step": 4929 }, { "epoch": 0.3187070707070707, "grad_norm": 0.06228606030344963, "learning_rate": 0.00019538462450565365, "loss": 0.0927, "step": 4930 }, { "epoch": 0.31877171717171715, "grad_norm": 0.06696313619613647, "learning_rate": 0.00019538257067697765, "loss": 0.0964, "step": 4931 }, { "epoch": 0.31883636363636364, "grad_norm": 0.05998144671320915, "learning_rate": 0.00019538051640222888, "loss": 0.0817, "step": 4932 }, { "epoch": 0.3189010101010101, "grad_norm": 0.06348895281553268, "learning_rate": 0.00019537846168141699, "loss": 0.086, "step": 4933 }, { "epoch": 0.31896565656565656, "grad_norm": 0.06780138611793518, "learning_rate": 0.00019537640651455155, "loss": 0.0935, "step": 4934 }, { "epoch": 0.31903030303030305, "grad_norm": 0.0802244022488594, "learning_rate": 0.00019537435090164217, "loss": 0.0878, "step": 4935 
}, { "epoch": 0.3190949494949495, "grad_norm": 0.06998592615127563, "learning_rate": 0.00019537229484269851, "loss": 0.0838, "step": 4936 }, { "epoch": 0.319159595959596, "grad_norm": 0.054281946271657944, "learning_rate": 0.00019537023833773013, "loss": 0.0691, "step": 4937 }, { "epoch": 0.3192242424242424, "grad_norm": 0.06317893415689468, "learning_rate": 0.00019536818138674668, "loss": 0.0788, "step": 4938 }, { "epoch": 0.3192888888888889, "grad_norm": 0.057587601244449615, "learning_rate": 0.0001953661239897578, "loss": 0.0851, "step": 4939 }, { "epoch": 0.31935353535353533, "grad_norm": 0.06710262596607208, "learning_rate": 0.00019536406614677307, "loss": 0.09, "step": 4940 }, { "epoch": 0.3194181818181818, "grad_norm": 0.06977008283138275, "learning_rate": 0.00019536200785780214, "loss": 0.0875, "step": 4941 }, { "epoch": 0.31948282828282826, "grad_norm": 0.06874189525842667, "learning_rate": 0.0001953599491228546, "loss": 0.0866, "step": 4942 }, { "epoch": 0.31954747474747475, "grad_norm": 0.06986114382743835, "learning_rate": 0.0001953578899419401, "loss": 0.094, "step": 4943 }, { "epoch": 0.31961212121212124, "grad_norm": 0.07599589228630066, "learning_rate": 0.0001953558303150683, "loss": 0.1091, "step": 4944 }, { "epoch": 0.31961212121212124, "eval_bleu": 12.465814519403771, "eval_loss": 0.0939895510673523, "eval_runtime": 2.7312, "eval_samples_per_second": 11.716, "eval_steps_per_second": 1.465, "step": 4944 }, { "epoch": 0.31967676767676767, "grad_norm": 0.07070202380418777, "learning_rate": 0.0001953537702422488, "loss": 0.0944, "step": 4945 }, { "epoch": 0.31974141414141416, "grad_norm": 0.06438890099525452, "learning_rate": 0.00019535170972349123, "loss": 0.0808, "step": 4946 }, { "epoch": 0.3198060606060606, "grad_norm": 0.05122537165880203, "learning_rate": 0.00019534964875880527, "loss": 0.0655, "step": 4947 }, { "epoch": 0.3198707070707071, "grad_norm": 0.06715032458305359, "learning_rate": 0.00019534758734820047, "loss": 0.0954, "step": 4948 
}, { "epoch": 0.3199353535353535, "grad_norm": 0.06961732357740402, "learning_rate": 0.00019534552549168658, "loss": 0.0747, "step": 4949 }, { "epoch": 0.32, "grad_norm": 0.06812962889671326, "learning_rate": 0.00019534346318927315, "loss": 0.0955, "step": 4950 }, { "epoch": 0.32006464646464644, "grad_norm": 0.06949734687805176, "learning_rate": 0.00019534140044096988, "loss": 0.0892, "step": 4951 }, { "epoch": 0.32012929292929293, "grad_norm": 0.0696600005030632, "learning_rate": 0.0001953393372467864, "loss": 0.0983, "step": 4952 }, { "epoch": 0.3201939393939394, "grad_norm": 0.05432440713047981, "learning_rate": 0.00019533727360673234, "loss": 0.0658, "step": 4953 }, { "epoch": 0.32025858585858585, "grad_norm": 0.06622884422540665, "learning_rate": 0.00019533520952081738, "loss": 0.0929, "step": 4954 }, { "epoch": 0.32032323232323234, "grad_norm": 0.06357545405626297, "learning_rate": 0.00019533314498905116, "loss": 0.0905, "step": 4955 }, { "epoch": 0.3203878787878788, "grad_norm": 0.070916548371315, "learning_rate": 0.00019533108001144333, "loss": 0.0968, "step": 4956 }, { "epoch": 0.32045252525252527, "grad_norm": 0.0630759745836258, "learning_rate": 0.00019532901458800357, "loss": 0.0865, "step": 4957 }, { "epoch": 0.3205171717171717, "grad_norm": 0.06789695471525192, "learning_rate": 0.0001953269487187415, "loss": 0.0863, "step": 4958 }, { "epoch": 0.3205818181818182, "grad_norm": 0.07558803260326385, "learning_rate": 0.00019532488240366685, "loss": 0.1069, "step": 4959 }, { "epoch": 0.3206464646464646, "grad_norm": 0.06685559451580048, "learning_rate": 0.0001953228156427892, "loss": 0.0962, "step": 4960 }, { "epoch": 0.3206464646464646, "eval_bleu": 12.772305385473597, "eval_loss": 0.09306998550891876, "eval_runtime": 2.8422, "eval_samples_per_second": 11.259, "eval_steps_per_second": 1.407, "step": 4960 }, { "epoch": 0.3207111111111111, "grad_norm": 0.061944857239723206, "learning_rate": 0.00019532074843611828, "loss": 0.0872, "step": 4961 }, { "epoch": 
0.3207757575757576, "grad_norm": 0.056623827666044235, "learning_rate": 0.0001953186807836637, "loss": 0.0802, "step": 4962 }, { "epoch": 0.32084040404040404, "grad_norm": 0.08329882472753525, "learning_rate": 0.0001953166126854352, "loss": 0.1031, "step": 4963 }, { "epoch": 0.3209050505050505, "grad_norm": 0.05859116464853287, "learning_rate": 0.0001953145441414424, "loss": 0.0758, "step": 4964 }, { "epoch": 0.32096969696969696, "grad_norm": 0.0700867548584938, "learning_rate": 0.00019531247515169496, "loss": 0.1055, "step": 4965 }, { "epoch": 0.32103434343434345, "grad_norm": 0.08815409243106842, "learning_rate": 0.0001953104057162026, "loss": 0.0773, "step": 4966 }, { "epoch": 0.3210989898989899, "grad_norm": 0.06450144201517105, "learning_rate": 0.00019530833583497498, "loss": 0.0866, "step": 4967 }, { "epoch": 0.3211636363636364, "grad_norm": 0.07723083347082138, "learning_rate": 0.0001953062655080218, "loss": 0.1056, "step": 4968 }, { "epoch": 0.3212282828282828, "grad_norm": 0.0583147332072258, "learning_rate": 0.00019530419473535272, "loss": 0.0774, "step": 4969 }, { "epoch": 0.3212929292929293, "grad_norm": 0.059548504650592804, "learning_rate": 0.00019530212351697742, "loss": 0.0675, "step": 4970 }, { "epoch": 0.32135757575757573, "grad_norm": 0.06677448749542236, "learning_rate": 0.00019530005185290557, "loss": 0.0881, "step": 4971 }, { "epoch": 0.3214222222222222, "grad_norm": 0.07151346653699875, "learning_rate": 0.00019529797974314692, "loss": 0.103, "step": 4972 }, { "epoch": 0.3214868686868687, "grad_norm": 0.07726530730724335, "learning_rate": 0.0001952959071877111, "loss": 0.1003, "step": 4973 }, { "epoch": 0.32155151515151514, "grad_norm": 0.05737035349011421, "learning_rate": 0.00019529383418660784, "loss": 0.0811, "step": 4974 }, { "epoch": 0.32161616161616163, "grad_norm": 0.07651960849761963, "learning_rate": 0.00019529176073984682, "loss": 0.0986, "step": 4975 }, { "epoch": 0.32168080808080807, "grad_norm": 0.06654155254364014, 
"learning_rate": 0.00019528968684743772, "loss": 0.0809, "step": 4976 }, { "epoch": 0.32168080808080807, "eval_bleu": 13.39792781488739, "eval_loss": 0.09400483965873718, "eval_runtime": 2.7493, "eval_samples_per_second": 11.639, "eval_steps_per_second": 1.455, "step": 4976 }, { "epoch": 0.32174545454545456, "grad_norm": 0.06384915113449097, "learning_rate": 0.00019528761250939028, "loss": 0.1004, "step": 4977 }, { "epoch": 0.321810101010101, "grad_norm": 0.07585534453392029, "learning_rate": 0.00019528553772571417, "loss": 0.0859, "step": 4978 }, { "epoch": 0.3218747474747475, "grad_norm": 0.0623493455350399, "learning_rate": 0.00019528346249641913, "loss": 0.0817, "step": 4979 }, { "epoch": 0.3219393939393939, "grad_norm": 0.06571008265018463, "learning_rate": 0.0001952813868215148, "loss": 0.0918, "step": 4980 }, { "epoch": 0.3220040404040404, "grad_norm": 0.057330675423145294, "learning_rate": 0.00019527931070101092, "loss": 0.0704, "step": 4981 }, { "epoch": 0.3220686868686869, "grad_norm": 0.07009575515985489, "learning_rate": 0.0001952772341349172, "loss": 0.0998, "step": 4982 }, { "epoch": 0.3221333333333333, "grad_norm": 0.05783507972955704, "learning_rate": 0.0001952751571232434, "loss": 0.0817, "step": 4983 }, { "epoch": 0.3221979797979798, "grad_norm": 0.0648999735713005, "learning_rate": 0.00019527307966599912, "loss": 0.1036, "step": 4984 }, { "epoch": 0.32226262626262625, "grad_norm": 0.059820324182510376, "learning_rate": 0.0001952710017631942, "loss": 0.0822, "step": 4985 }, { "epoch": 0.32232727272727274, "grad_norm": 0.06529063731431961, "learning_rate": 0.0001952689234148383, "loss": 0.1029, "step": 4986 }, { "epoch": 0.3223919191919192, "grad_norm": 0.0750807449221611, "learning_rate": 0.0001952668446209411, "loss": 0.1171, "step": 4987 }, { "epoch": 0.32245656565656566, "grad_norm": 0.08445903658866882, "learning_rate": 0.00019526476538151238, "loss": 0.1071, "step": 4988 }, { "epoch": 0.3225212121212121, "grad_norm": 0.05898655205965042, 
"learning_rate": 0.00019526268569656184, "loss": 0.0819, "step": 4989 }, { "epoch": 0.3225858585858586, "grad_norm": 0.052940040826797485, "learning_rate": 0.00019526060556609922, "loss": 0.0706, "step": 4990 }, { "epoch": 0.3226505050505051, "grad_norm": 0.061189912259578705, "learning_rate": 0.00019525852499013423, "loss": 0.0868, "step": 4991 }, { "epoch": 0.3227151515151515, "grad_norm": 0.07143127918243408, "learning_rate": 0.00019525644396867664, "loss": 0.0843, "step": 4992 }, { "epoch": 0.3227151515151515, "eval_bleu": 15.672153208444069, "eval_loss": 0.09378588944673538, "eval_runtime": 2.7914, "eval_samples_per_second": 11.464, "eval_steps_per_second": 1.433, "step": 4992 }, { "epoch": 0.322779797979798, "grad_norm": 0.060790855437517166, "learning_rate": 0.00019525436250173613, "loss": 0.0834, "step": 4993 }, { "epoch": 0.32284444444444443, "grad_norm": 0.05707736685872078, "learning_rate": 0.00019525228058932245, "loss": 0.0802, "step": 4994 }, { "epoch": 0.3229090909090909, "grad_norm": 0.06695713102817535, "learning_rate": 0.00019525019823144537, "loss": 0.0811, "step": 4995 }, { "epoch": 0.32297373737373736, "grad_norm": 0.08092916756868362, "learning_rate": 0.00019524811542811457, "loss": 0.093, "step": 4996 }, { "epoch": 0.32303838383838385, "grad_norm": 0.0689164400100708, "learning_rate": 0.00019524603217933986, "loss": 0.098, "step": 4997 }, { "epoch": 0.3231030303030303, "grad_norm": 0.06591492891311646, "learning_rate": 0.0001952439484851309, "loss": 0.0966, "step": 4998 }, { "epoch": 0.32316767676767677, "grad_norm": 0.06822417676448822, "learning_rate": 0.00019524186434549752, "loss": 0.1026, "step": 4999 }, { "epoch": 0.32323232323232326, "grad_norm": 0.05683770403265953, "learning_rate": 0.0001952397797604494, "loss": 0.0806, "step": 5000 }, { "epoch": 0.3232969696969697, "grad_norm": 0.06668691337108612, "learning_rate": 0.00019523769472999634, "loss": 0.1039, "step": 5001 }, { "epoch": 0.3233616161616162, "grad_norm": 
0.062347084283828735, "learning_rate": 0.00019523560925414804, "loss": 0.1027, "step": 5002 }, { "epoch": 0.3234262626262626, "grad_norm": 0.062487971037626266, "learning_rate": 0.00019523352333291428, "loss": 0.0896, "step": 5003 }, { "epoch": 0.3234909090909091, "grad_norm": 0.05898349732160568, "learning_rate": 0.00019523143696630486, "loss": 0.0982, "step": 5004 }, { "epoch": 0.32355555555555554, "grad_norm": 0.06970492750406265, "learning_rate": 0.00019522935015432944, "loss": 0.1061, "step": 5005 }, { "epoch": 0.32362020202020203, "grad_norm": 0.06570102274417877, "learning_rate": 0.00019522726289699787, "loss": 0.0934, "step": 5006 }, { "epoch": 0.32368484848484846, "grad_norm": 0.06419141590595245, "learning_rate": 0.00019522517519431984, "loss": 0.0996, "step": 5007 }, { "epoch": 0.32374949494949495, "grad_norm": 0.06400182843208313, "learning_rate": 0.00019522308704630515, "loss": 0.0893, "step": 5008 }, { "epoch": 0.32374949494949495, "eval_bleu": 16.083217913553234, "eval_loss": 0.09427288174629211, "eval_runtime": 2.9016, "eval_samples_per_second": 11.028, "eval_steps_per_second": 1.379, "step": 5008 }, { "epoch": 0.3238141414141414, "grad_norm": 0.07289660722017288, "learning_rate": 0.00019522099845296358, "loss": 0.11, "step": 5009 }, { "epoch": 0.3238787878787879, "grad_norm": 0.05868663266301155, "learning_rate": 0.00019521890941430486, "loss": 0.072, "step": 5010 }, { "epoch": 0.32394343434343437, "grad_norm": 0.06341391801834106, "learning_rate": 0.0001952168199303388, "loss": 0.0918, "step": 5011 }, { "epoch": 0.3240080808080808, "grad_norm": 0.06332357972860336, "learning_rate": 0.00019521473000107516, "loss": 0.0908, "step": 5012 }, { "epoch": 0.3240727272727273, "grad_norm": 0.05746617540717125, "learning_rate": 0.00019521263962652368, "loss": 0.0809, "step": 5013 }, { "epoch": 0.3241373737373737, "grad_norm": 0.06302088499069214, "learning_rate": 0.00019521054880669415, "loss": 0.0842, "step": 5014 }, { "epoch": 0.3242020202020202, 
"grad_norm": 0.05614808201789856, "learning_rate": 0.0001952084575415964, "loss": 0.0714, "step": 5015 }, { "epoch": 0.32426666666666665, "grad_norm": 0.08165333420038223, "learning_rate": 0.00019520636583124015, "loss": 0.1204, "step": 5016 }, { "epoch": 0.32433131313131314, "grad_norm": 0.06666591018438339, "learning_rate": 0.0001952042736756352, "loss": 0.0843, "step": 5017 }, { "epoch": 0.32439595959595957, "grad_norm": 0.0809096246957779, "learning_rate": 0.00019520218107479132, "loss": 0.1043, "step": 5018 }, { "epoch": 0.32446060606060606, "grad_norm": 0.08147800713777542, "learning_rate": 0.00019520008802871832, "loss": 0.1007, "step": 5019 }, { "epoch": 0.32452525252525255, "grad_norm": 0.06933315098285675, "learning_rate": 0.000195197994537426, "loss": 0.1058, "step": 5020 }, { "epoch": 0.324589898989899, "grad_norm": 0.0682990625500679, "learning_rate": 0.0001951959006009241, "loss": 0.1006, "step": 5021 }, { "epoch": 0.3246545454545455, "grad_norm": 0.10208892822265625, "learning_rate": 0.00019519380621922249, "loss": 0.0916, "step": 5022 }, { "epoch": 0.3247191919191919, "grad_norm": 0.06210468336939812, "learning_rate": 0.0001951917113923309, "loss": 0.0868, "step": 5023 }, { "epoch": 0.3247838383838384, "grad_norm": 0.06577180325984955, "learning_rate": 0.00019518961612025913, "loss": 0.0864, "step": 5024 }, { "epoch": 0.3247838383838384, "eval_bleu": 12.472514122238822, "eval_loss": 0.0954444408416748, "eval_runtime": 2.714, "eval_samples_per_second": 11.791, "eval_steps_per_second": 1.474, "step": 5024 }, { "epoch": 0.32484848484848483, "grad_norm": 0.072667196393013, "learning_rate": 0.000195187520403017, "loss": 0.1103, "step": 5025 }, { "epoch": 0.3249131313131313, "grad_norm": 0.06775262206792831, "learning_rate": 0.00019518542424061433, "loss": 0.0843, "step": 5026 }, { "epoch": 0.32497777777777775, "grad_norm": 0.08101565390825272, "learning_rate": 0.00019518332763306085, "loss": 0.1159, "step": 5027 }, { "epoch": 0.32504242424242424, 
"grad_norm": 0.0728599950671196, "learning_rate": 0.00019518123058036646, "loss": 0.1021, "step": 5028 }, { "epoch": 0.32510707070707073, "grad_norm": 0.06834661215543747, "learning_rate": 0.0001951791330825409, "loss": 0.0975, "step": 5029 }, { "epoch": 0.32517171717171717, "grad_norm": 0.060497451573610306, "learning_rate": 0.00019517703513959397, "loss": 0.085, "step": 5030 }, { "epoch": 0.32523636363636366, "grad_norm": 0.06539740413427353, "learning_rate": 0.00019517493675153555, "loss": 0.0863, "step": 5031 }, { "epoch": 0.3253010101010101, "grad_norm": 0.07119203358888626, "learning_rate": 0.0001951728379183754, "loss": 0.0973, "step": 5032 }, { "epoch": 0.3253656565656566, "grad_norm": 0.06744899600744247, "learning_rate": 0.0001951707386401234, "loss": 0.0915, "step": 5033 }, { "epoch": 0.325430303030303, "grad_norm": 0.06295450776815414, "learning_rate": 0.00019516863891678928, "loss": 0.0767, "step": 5034 }, { "epoch": 0.3254949494949495, "grad_norm": 0.07959683239459991, "learning_rate": 0.00019516653874838288, "loss": 0.1101, "step": 5035 }, { "epoch": 0.32555959595959594, "grad_norm": 0.059050001204013824, "learning_rate": 0.00019516443813491404, "loss": 0.0836, "step": 5036 }, { "epoch": 0.3256242424242424, "grad_norm": 0.06866364181041718, "learning_rate": 0.0001951623370763926, "loss": 0.1026, "step": 5037 }, { "epoch": 0.3256888888888889, "grad_norm": 0.05746452137827873, "learning_rate": 0.00019516023557282836, "loss": 0.0717, "step": 5038 }, { "epoch": 0.32575353535353535, "grad_norm": 0.06633614748716354, "learning_rate": 0.0001951581336242312, "loss": 0.092, "step": 5039 }, { "epoch": 0.32581818181818184, "grad_norm": 0.06402871012687683, "learning_rate": 0.00019515603123061083, "loss": 0.0855, "step": 5040 }, { "epoch": 0.32581818181818184, "eval_bleu": 17.5241996538824, "eval_loss": 0.09342949092388153, "eval_runtime": 2.7274, "eval_samples_per_second": 11.733, "eval_steps_per_second": 1.467, "step": 5040 }, { "epoch": 0.3258828282828283, 
"grad_norm": 0.06713303178548813, "learning_rate": 0.00019515392839197722, "loss": 0.0993, "step": 5041 }, { "epoch": 0.32594747474747476, "grad_norm": 0.0637454017996788, "learning_rate": 0.00019515182510834015, "loss": 0.0811, "step": 5042 }, { "epoch": 0.3260121212121212, "grad_norm": 0.0662960335612297, "learning_rate": 0.00019514972137970942, "loss": 0.0882, "step": 5043 }, { "epoch": 0.3260767676767677, "grad_norm": 0.06274198740720749, "learning_rate": 0.00019514761720609492, "loss": 0.0855, "step": 5044 }, { "epoch": 0.3261414141414141, "grad_norm": 0.0655633732676506, "learning_rate": 0.00019514551258750643, "loss": 0.096, "step": 5045 }, { "epoch": 0.3262060606060606, "grad_norm": 0.07143282145261765, "learning_rate": 0.00019514340752395387, "loss": 0.1056, "step": 5046 }, { "epoch": 0.32627070707070704, "grad_norm": 0.05767039954662323, "learning_rate": 0.00019514130201544701, "loss": 0.0798, "step": 5047 }, { "epoch": 0.32633535353535353, "grad_norm": 0.07465171068906784, "learning_rate": 0.00019513919606199578, "loss": 0.0908, "step": 5048 }, { "epoch": 0.3264, "grad_norm": 0.07277978211641312, "learning_rate": 0.00019513708966360996, "loss": 0.1049, "step": 5049 }, { "epoch": 0.32646464646464646, "grad_norm": 0.06794938445091248, "learning_rate": 0.00019513498282029942, "loss": 0.0943, "step": 5050 }, { "epoch": 0.32652929292929295, "grad_norm": 0.07003027200698853, "learning_rate": 0.00019513287553207402, "loss": 0.0775, "step": 5051 }, { "epoch": 0.3265939393939394, "grad_norm": 0.06780106574296951, "learning_rate": 0.0001951307677989436, "loss": 0.094, "step": 5052 }, { "epoch": 0.32665858585858587, "grad_norm": 0.06895982474088669, "learning_rate": 0.00019512865962091803, "loss": 0.0862, "step": 5053 }, { "epoch": 0.3267232323232323, "grad_norm": 0.06453032046556473, "learning_rate": 0.0001951265509980072, "loss": 0.0901, "step": 5054 }, { "epoch": 0.3267878787878788, "grad_norm": 0.06938889622688293, "learning_rate": 0.00019512444193022093, 
"loss": 0.0947, "step": 5055 }, { "epoch": 0.32685252525252523, "grad_norm": 0.07029138505458832, "learning_rate": 0.00019512233241756908, "loss": 0.1069, "step": 5056 }, { "epoch": 0.32685252525252523, "eval_bleu": 18.00563460780774, "eval_loss": 0.09389109909534454, "eval_runtime": 2.6714, "eval_samples_per_second": 11.979, "eval_steps_per_second": 1.497, "step": 5056 }, { "epoch": 0.3269171717171717, "grad_norm": 0.06267280876636505, "learning_rate": 0.0001951202224600615, "loss": 0.0923, "step": 5057 }, { "epoch": 0.3269818181818182, "grad_norm": 0.0786575898528099, "learning_rate": 0.00019511811205770815, "loss": 0.1096, "step": 5058 }, { "epoch": 0.32704646464646464, "grad_norm": 0.05854830890893936, "learning_rate": 0.00019511600121051878, "loss": 0.083, "step": 5059 }, { "epoch": 0.32711111111111113, "grad_norm": 0.06258115917444229, "learning_rate": 0.00019511388991850335, "loss": 0.0891, "step": 5060 }, { "epoch": 0.32717575757575756, "grad_norm": 0.06314460933208466, "learning_rate": 0.00019511177818167167, "loss": 0.094, "step": 5061 }, { "epoch": 0.32724040404040405, "grad_norm": 0.06478629261255264, "learning_rate": 0.0001951096660000337, "loss": 0.1056, "step": 5062 }, { "epoch": 0.3273050505050505, "grad_norm": 0.0707220509648323, "learning_rate": 0.0001951075533735992, "loss": 0.1125, "step": 5063 }, { "epoch": 0.327369696969697, "grad_norm": 0.06447993963956833, "learning_rate": 0.00019510544030237816, "loss": 0.093, "step": 5064 }, { "epoch": 0.3274343434343434, "grad_norm": 0.05793168768286705, "learning_rate": 0.0001951033267863804, "loss": 0.0945, "step": 5065 }, { "epoch": 0.3274989898989899, "grad_norm": 0.0653253123164177, "learning_rate": 0.00019510121282561582, "loss": 0.0928, "step": 5066 }, { "epoch": 0.3275636363636364, "grad_norm": 0.06183865666389465, "learning_rate": 0.0001950990984200943, "loss": 0.0943, "step": 5067 }, { "epoch": 0.3276282828282828, "grad_norm": 0.06048472225666046, "learning_rate": 0.00019509698356982575, "loss": 
0.0802, "step": 5068 }, { "epoch": 0.3276929292929293, "grad_norm": 0.06875723600387573, "learning_rate": 0.00019509486827482003, "loss": 0.098, "step": 5069 }, { "epoch": 0.32775757575757575, "grad_norm": 0.06606148928403854, "learning_rate": 0.00019509275253508704, "loss": 0.0858, "step": 5070 }, { "epoch": 0.32782222222222224, "grad_norm": 0.07038073241710663, "learning_rate": 0.00019509063635063674, "loss": 0.0867, "step": 5071 }, { "epoch": 0.32788686868686867, "grad_norm": 0.07425768673419952, "learning_rate": 0.00019508851972147893, "loss": 0.0878, "step": 5072 }, { "epoch": 0.32788686868686867, "eval_bleu": 14.867180770181093, "eval_loss": 0.09418272972106934, "eval_runtime": 2.67, "eval_samples_per_second": 11.985, "eval_steps_per_second": 1.498, "step": 5072 }, { "epoch": 0.32795151515151516, "grad_norm": 0.06115090101957321, "learning_rate": 0.00019508640264762354, "loss": 0.0787, "step": 5073 }, { "epoch": 0.3280161616161616, "grad_norm": 0.07193652540445328, "learning_rate": 0.0001950842851290805, "loss": 0.0884, "step": 5074 }, { "epoch": 0.3280808080808081, "grad_norm": 0.06728909909725189, "learning_rate": 0.00019508216716585967, "loss": 0.079, "step": 5075 }, { "epoch": 0.3281454545454545, "grad_norm": 0.061412930488586426, "learning_rate": 0.000195080048757971, "loss": 0.0869, "step": 5076 }, { "epoch": 0.328210101010101, "grad_norm": 0.06911016255617142, "learning_rate": 0.00019507792990542438, "loss": 0.0877, "step": 5077 }, { "epoch": 0.3282747474747475, "grad_norm": 0.08196385949850082, "learning_rate": 0.0001950758106082297, "loss": 0.1193, "step": 5078 }, { "epoch": 0.32833939393939393, "grad_norm": 0.0695338249206543, "learning_rate": 0.0001950736908663969, "loss": 0.0923, "step": 5079 }, { "epoch": 0.3284040404040404, "grad_norm": 0.06336675584316254, "learning_rate": 0.00019507157067993585, "loss": 0.0874, "step": 5080 }, { "epoch": 0.32846868686868685, "grad_norm": 0.07306136190891266, "learning_rate": 0.00019506945004885652, "loss": 
0.0967, "step": 5081 }, { "epoch": 0.32853333333333334, "grad_norm": 0.0652216300368309, "learning_rate": 0.00019506732897316877, "loss": 0.0912, "step": 5082 }, { "epoch": 0.3285979797979798, "grad_norm": 0.06382980942726135, "learning_rate": 0.00019506520745288258, "loss": 0.0884, "step": 5083 }, { "epoch": 0.32866262626262627, "grad_norm": 0.06619251519441605, "learning_rate": 0.00019506308548800785, "loss": 0.0918, "step": 5084 }, { "epoch": 0.3287272727272727, "grad_norm": 0.06469406932592392, "learning_rate": 0.00019506096307855448, "loss": 0.1021, "step": 5085 }, { "epoch": 0.3287919191919192, "grad_norm": 0.07647312432527542, "learning_rate": 0.0001950588402245324, "loss": 0.1155, "step": 5086 }, { "epoch": 0.3288565656565657, "grad_norm": 0.06618805229663849, "learning_rate": 0.00019505671692595159, "loss": 0.0964, "step": 5087 }, { "epoch": 0.3289212121212121, "grad_norm": 0.05884721130132675, "learning_rate": 0.0001950545931828219, "loss": 0.0825, "step": 5088 }, { "epoch": 0.3289212121212121, "eval_bleu": 20.92322999700363, "eval_loss": 0.09511638432741165, "eval_runtime": 2.7073, "eval_samples_per_second": 11.82, "eval_steps_per_second": 1.478, "step": 5088 }, { "epoch": 0.3289858585858586, "grad_norm": 0.08250955492258072, "learning_rate": 0.00019505246899515332, "loss": 0.0885, "step": 5089 }, { "epoch": 0.32905050505050504, "grad_norm": 0.06676743924617767, "learning_rate": 0.00019505034436295574, "loss": 0.0991, "step": 5090 }, { "epoch": 0.3291151515151515, "grad_norm": 0.06790205091238022, "learning_rate": 0.00019504821928623914, "loss": 0.0957, "step": 5091 }, { "epoch": 0.32917979797979796, "grad_norm": 0.056500665843486786, "learning_rate": 0.00019504609376501346, "loss": 0.0754, "step": 5092 }, { "epoch": 0.32924444444444445, "grad_norm": 0.06154552102088928, "learning_rate": 0.00019504396779928862, "loss": 0.0892, "step": 5093 }, { "epoch": 0.3293090909090909, "grad_norm": 0.0593990683555603, "learning_rate": 0.00019504184138907453, "loss": 
0.0935, "step": 5094 }, { "epoch": 0.3293737373737374, "grad_norm": 0.05421570688486099, "learning_rate": 0.0001950397145343812, "loss": 0.0738, "step": 5095 }, { "epoch": 0.32943838383838386, "grad_norm": 0.06184278056025505, "learning_rate": 0.00019503758723521855, "loss": 0.0843, "step": 5096 }, { "epoch": 0.3295030303030303, "grad_norm": 0.06964139640331268, "learning_rate": 0.0001950354594915965, "loss": 0.0894, "step": 5097 }, { "epoch": 0.3295676767676768, "grad_norm": 0.06350988894701004, "learning_rate": 0.000195033331303525, "loss": 0.0932, "step": 5098 }, { "epoch": 0.3296323232323232, "grad_norm": 0.06691033393144608, "learning_rate": 0.00019503120267101406, "loss": 0.088, "step": 5099 }, { "epoch": 0.3296969696969697, "grad_norm": 0.06917152553796768, "learning_rate": 0.00019502907359407362, "loss": 0.1005, "step": 5100 }, { "epoch": 0.32976161616161614, "grad_norm": 0.0704154521226883, "learning_rate": 0.00019502694407271359, "loss": 0.0953, "step": 5101 }, { "epoch": 0.32982626262626263, "grad_norm": 0.06082426384091377, "learning_rate": 0.00019502481410694396, "loss": 0.0676, "step": 5102 }, { "epoch": 0.32989090909090907, "grad_norm": 0.061995577067136765, "learning_rate": 0.0001950226836967747, "loss": 0.0823, "step": 5103 }, { "epoch": 0.32995555555555556, "grad_norm": 0.0694991797208786, "learning_rate": 0.00019502055284221576, "loss": 0.0883, "step": 5104 }, { "epoch": 0.32995555555555556, "eval_bleu": 14.581788892973183, "eval_loss": 0.0953206941485405, "eval_runtime": 2.8137, "eval_samples_per_second": 11.373, "eval_steps_per_second": 1.422, "step": 5104 }, { "epoch": 0.33002020202020205, "grad_norm": 0.05407920479774475, "learning_rate": 0.0001950184215432771, "loss": 0.0689, "step": 5105 }, { "epoch": 0.3300848484848485, "grad_norm": 0.07092015445232391, "learning_rate": 0.0001950162897999687, "loss": 0.1005, "step": 5106 }, { "epoch": 0.33014949494949497, "grad_norm": 0.07112333923578262, "learning_rate": 0.00019501415761230052, "loss": 
0.1015, "step": 5107 }, { "epoch": 0.3302141414141414, "grad_norm": 0.07231850922107697, "learning_rate": 0.00019501202498028251, "loss": 0.091, "step": 5108 }, { "epoch": 0.3302787878787879, "grad_norm": 0.06771163642406464, "learning_rate": 0.00019500989190392474, "loss": 0.0907, "step": 5109 }, { "epoch": 0.33034343434343433, "grad_norm": 0.06725805252790451, "learning_rate": 0.00019500775838323707, "loss": 0.0808, "step": 5110 }, { "epoch": 0.3304080808080808, "grad_norm": 0.06543751060962677, "learning_rate": 0.00019500562441822955, "loss": 0.0941, "step": 5111 }, { "epoch": 0.33047272727272725, "grad_norm": 0.06468949466943741, "learning_rate": 0.00019500349000891207, "loss": 0.0861, "step": 5112 }, { "epoch": 0.33053737373737374, "grad_norm": 0.07135318964719772, "learning_rate": 0.00019500135515529473, "loss": 0.0985, "step": 5113 }, { "epoch": 0.3306020202020202, "grad_norm": 0.07334398478269577, "learning_rate": 0.00019499921985738743, "loss": 0.1032, "step": 5114 }, { "epoch": 0.33066666666666666, "grad_norm": 0.06897985935211182, "learning_rate": 0.00019499708411520021, "loss": 0.0902, "step": 5115 }, { "epoch": 0.33073131313131315, "grad_norm": 0.0717405378818512, "learning_rate": 0.00019499494792874302, "loss": 0.1036, "step": 5116 }, { "epoch": 0.3307959595959596, "grad_norm": 0.057063259184360504, "learning_rate": 0.00019499281129802586, "loss": 0.0759, "step": 5117 }, { "epoch": 0.3308606060606061, "grad_norm": 0.0608498677611351, "learning_rate": 0.00019499067422305873, "loss": 0.0789, "step": 5118 }, { "epoch": 0.3309252525252525, "grad_norm": 0.08243165910243988, "learning_rate": 0.00019498853670385163, "loss": 0.1021, "step": 5119 }, { "epoch": 0.330989898989899, "grad_norm": 0.06876850873231888, "learning_rate": 0.00019498639874041454, "loss": 0.0989, "step": 5120 }, { "epoch": 0.330989898989899, "eval_bleu": 14.253365311508597, "eval_loss": 0.0932714119553566, "eval_runtime": 2.7139, "eval_samples_per_second": 11.791, "eval_steps_per_second": 
1.474, "step": 5120 }, { "epoch": 0.33105454545454543, "grad_norm": 0.06286542862653732, "learning_rate": 0.00019498426033275746, "loss": 0.088, "step": 5121 }, { "epoch": 0.3311191919191919, "grad_norm": 0.058825310319662094, "learning_rate": 0.00019498212148089038, "loss": 0.0868, "step": 5122 }, { "epoch": 0.33118383838383836, "grad_norm": 0.05647850036621094, "learning_rate": 0.00019497998218482337, "loss": 0.0789, "step": 5123 }, { "epoch": 0.33124848484848485, "grad_norm": 0.06813567876815796, "learning_rate": 0.00019497784244456635, "loss": 0.0967, "step": 5124 }, { "epoch": 0.33131313131313134, "grad_norm": 0.07070878893136978, "learning_rate": 0.00019497570226012934, "loss": 0.0998, "step": 5125 }, { "epoch": 0.33137777777777777, "grad_norm": 0.06068276986479759, "learning_rate": 0.00019497356163152238, "loss": 0.0816, "step": 5126 }, { "epoch": 0.33144242424242426, "grad_norm": 0.07000171393156052, "learning_rate": 0.0001949714205587555, "loss": 0.111, "step": 5127 }, { "epoch": 0.3315070707070707, "grad_norm": 0.05142157897353172, "learning_rate": 0.00019496927904183865, "loss": 0.0651, "step": 5128 }, { "epoch": 0.3315717171717172, "grad_norm": 0.06334732472896576, "learning_rate": 0.00019496713708078186, "loss": 0.088, "step": 5129 }, { "epoch": 0.3316363636363636, "grad_norm": 0.0659305527806282, "learning_rate": 0.00019496499467559522, "loss": 0.1042, "step": 5130 }, { "epoch": 0.3317010101010101, "grad_norm": 0.05799352005124092, "learning_rate": 0.00019496285182628867, "loss": 0.0813, "step": 5131 }, { "epoch": 0.33176565656565654, "grad_norm": 0.060741059482097626, "learning_rate": 0.00019496070853287226, "loss": 0.0965, "step": 5132 }, { "epoch": 0.33183030303030303, "grad_norm": 0.07164571434259415, "learning_rate": 0.000194958564795356, "loss": 0.0819, "step": 5133 }, { "epoch": 0.3318949494949495, "grad_norm": 0.06385243684053421, "learning_rate": 0.00019495642061374992, "loss": 0.0811, "step": 5134 }, { "epoch": 0.33195959595959595, 
"grad_norm": 0.060044482350349426, "learning_rate": 0.00019495427598806406, "loss": 0.0878, "step": 5135 }, { "epoch": 0.33202424242424244, "grad_norm": 0.06265515089035034, "learning_rate": 0.00019495213091830845, "loss": 0.0962, "step": 5136 }, { "epoch": 0.33202424242424244, "eval_bleu": 11.448756728010238, "eval_loss": 0.095070980489254, "eval_runtime": 2.7304, "eval_samples_per_second": 11.72, "eval_steps_per_second": 1.465, "step": 5136 }, { "epoch": 0.3320888888888889, "grad_norm": 0.07399669289588928, "learning_rate": 0.00019494998540449312, "loss": 0.0827, "step": 5137 }, { "epoch": 0.33215353535353537, "grad_norm": 0.08994245529174805, "learning_rate": 0.00019494783944662807, "loss": 0.1051, "step": 5138 }, { "epoch": 0.3322181818181818, "grad_norm": 0.067339688539505, "learning_rate": 0.00019494569304472336, "loss": 0.0947, "step": 5139 }, { "epoch": 0.3322828282828283, "grad_norm": 0.06321671605110168, "learning_rate": 0.00019494354619878907, "loss": 0.0902, "step": 5140 }, { "epoch": 0.3323474747474747, "grad_norm": 0.059462081640958786, "learning_rate": 0.0001949413989088352, "loss": 0.0908, "step": 5141 }, { "epoch": 0.3324121212121212, "grad_norm": 0.06893975287675858, "learning_rate": 0.00019493925117487177, "loss": 0.0967, "step": 5142 }, { "epoch": 0.3324767676767677, "grad_norm": 0.07452491670846939, "learning_rate": 0.00019493710299690886, "loss": 0.1017, "step": 5143 }, { "epoch": 0.33254141414141414, "grad_norm": 0.06371667981147766, "learning_rate": 0.0001949349543749565, "loss": 0.096, "step": 5144 }, { "epoch": 0.3326060606060606, "grad_norm": 0.06552218645811081, "learning_rate": 0.00019493280530902474, "loss": 0.0931, "step": 5145 }, { "epoch": 0.33267070707070706, "grad_norm": 0.07016447186470032, "learning_rate": 0.00019493065579912364, "loss": 0.107, "step": 5146 }, { "epoch": 0.33273535353535355, "grad_norm": 0.0675966739654541, "learning_rate": 0.00019492850584526325, "loss": 0.1009, "step": 5147 }, { "epoch": 0.3328, "grad_norm": 
0.0669718086719513, "learning_rate": 0.00019492635544745364, "loss": 0.1027, "step": 5148 }, { "epoch": 0.3328646464646465, "grad_norm": 0.07508904486894608, "learning_rate": 0.0001949242046057048, "loss": 0.0938, "step": 5149 }, { "epoch": 0.3329292929292929, "grad_norm": 0.06506823003292084, "learning_rate": 0.00019492205332002688, "loss": 0.0835, "step": 5150 }, { "epoch": 0.3329939393939394, "grad_norm": 0.06267925351858139, "learning_rate": 0.00019491990159042986, "loss": 0.0816, "step": 5151 }, { "epoch": 0.33305858585858583, "grad_norm": 0.0638953223824501, "learning_rate": 0.00019491774941692388, "loss": 0.0875, "step": 5152 }, { "epoch": 0.33305858585858583, "eval_bleu": 16.664173136053467, "eval_loss": 0.09464696049690247, "eval_runtime": 2.8599, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 5152 }, { "epoch": 0.3331232323232323, "grad_norm": 0.07019636780023575, "learning_rate": 0.00019491559679951892, "loss": 0.1021, "step": 5153 }, { "epoch": 0.3331878787878788, "grad_norm": 0.06573473662137985, "learning_rate": 0.00019491344373822513, "loss": 0.0882, "step": 5154 }, { "epoch": 0.33325252525252524, "grad_norm": 0.06170053035020828, "learning_rate": 0.00019491129023305252, "loss": 0.0917, "step": 5155 }, { "epoch": 0.33331717171717173, "grad_norm": 0.06148397549986839, "learning_rate": 0.0001949091362840112, "loss": 0.0905, "step": 5156 }, { "epoch": 0.33338181818181817, "grad_norm": 0.05336204171180725, "learning_rate": 0.0001949069818911112, "loss": 0.0838, "step": 5157 }, { "epoch": 0.33344646464646466, "grad_norm": 0.05824108049273491, "learning_rate": 0.00019490482705436266, "loss": 0.0852, "step": 5158 }, { "epoch": 0.3335111111111111, "grad_norm": 0.06073099002242088, "learning_rate": 0.0001949026717737756, "loss": 0.0941, "step": 5159 }, { "epoch": 0.3335757575757576, "grad_norm": 0.0639898031949997, "learning_rate": 0.00019490051604936009, "loss": 0.0948, "step": 5160 }, { "epoch": 0.333640404040404, "grad_norm": 
0.05767025053501129, "learning_rate": 0.00019489835988112625, "loss": 0.0815, "step": 5161 }, { "epoch": 0.3337050505050505, "grad_norm": 0.06693828105926514, "learning_rate": 0.0001948962032690842, "loss": 0.1062, "step": 5162 }, { "epoch": 0.333769696969697, "grad_norm": 0.05790676549077034, "learning_rate": 0.00019489404621324393, "loss": 0.0816, "step": 5163 }, { "epoch": 0.33383434343434343, "grad_norm": 0.06812599301338196, "learning_rate": 0.0001948918887136156, "loss": 0.0893, "step": 5164 }, { "epoch": 0.3338989898989899, "grad_norm": 0.06080705672502518, "learning_rate": 0.00019488973077020928, "loss": 0.0802, "step": 5165 }, { "epoch": 0.33396363636363635, "grad_norm": 0.06341090053319931, "learning_rate": 0.00019488757238303505, "loss": 0.0908, "step": 5166 }, { "epoch": 0.33402828282828284, "grad_norm": 0.06379783153533936, "learning_rate": 0.00019488541355210302, "loss": 0.0923, "step": 5167 }, { "epoch": 0.3340929292929293, "grad_norm": 0.06463515758514404, "learning_rate": 0.00019488325427742328, "loss": 0.0933, "step": 5168 }, { "epoch": 0.3340929292929293, "eval_bleu": 14.276964004148109, "eval_loss": 0.0936284065246582, "eval_runtime": 2.7323, "eval_samples_per_second": 11.712, "eval_steps_per_second": 1.464, "step": 5168 }, { "epoch": 0.33415757575757576, "grad_norm": 0.06666377931833267, "learning_rate": 0.0001948810945590059, "loss": 0.092, "step": 5169 }, { "epoch": 0.3342222222222222, "grad_norm": 0.05954353138804436, "learning_rate": 0.00019487893439686102, "loss": 0.079, "step": 5170 }, { "epoch": 0.3342868686868687, "grad_norm": 0.06045042350888252, "learning_rate": 0.00019487677379099875, "loss": 0.0854, "step": 5171 }, { "epoch": 0.3343515151515152, "grad_norm": 0.06336052715778351, "learning_rate": 0.00019487461274142915, "loss": 0.0898, "step": 5172 }, { "epoch": 0.3344161616161616, "grad_norm": 0.06168041378259659, "learning_rate": 0.00019487245124816239, "loss": 0.0917, "step": 5173 }, { "epoch": 0.3344808080808081, "grad_norm": 
0.07327160984277725, "learning_rate": 0.00019487028931120852, "loss": 0.1058, "step": 5174 }, { "epoch": 0.33454545454545453, "grad_norm": 0.07012353837490082, "learning_rate": 0.00019486812693057765, "loss": 0.0874, "step": 5175 }, { "epoch": 0.334610101010101, "grad_norm": 0.06728319823741913, "learning_rate": 0.00019486596410627993, "loss": 0.0904, "step": 5176 }, { "epoch": 0.33467474747474746, "grad_norm": 0.061720218509435654, "learning_rate": 0.00019486380083832546, "loss": 0.0991, "step": 5177 }, { "epoch": 0.33473939393939395, "grad_norm": 0.08499522507190704, "learning_rate": 0.00019486163712672435, "loss": 0.0969, "step": 5178 }, { "epoch": 0.3348040404040404, "grad_norm": 0.09189268201589584, "learning_rate": 0.00019485947297148674, "loss": 0.0913, "step": 5179 }, { "epoch": 0.33486868686868687, "grad_norm": 0.057125989347696304, "learning_rate": 0.0001948573083726227, "loss": 0.0924, "step": 5180 }, { "epoch": 0.33493333333333336, "grad_norm": 0.0634625107049942, "learning_rate": 0.0001948551433301424, "loss": 0.0922, "step": 5181 }, { "epoch": 0.3349979797979798, "grad_norm": 0.06872695684432983, "learning_rate": 0.00019485297784405597, "loss": 0.1013, "step": 5182 }, { "epoch": 0.3350626262626263, "grad_norm": 0.059110675007104874, "learning_rate": 0.0001948508119143735, "loss": 0.0808, "step": 5183 }, { "epoch": 0.3351272727272727, "grad_norm": 0.08530732989311218, "learning_rate": 0.00019484864554110515, "loss": 0.0816, "step": 5184 }, { "epoch": 0.3351272727272727, "eval_bleu": 13.516917873060715, "eval_loss": 0.09367828071117401, "eval_runtime": 2.9274, "eval_samples_per_second": 10.931, "eval_steps_per_second": 1.366, "step": 5184 }, { "epoch": 0.3351919191919192, "grad_norm": 0.06850428879261017, "learning_rate": 0.00019484647872426106, "loss": 0.1009, "step": 5185 }, { "epoch": 0.33525656565656564, "grad_norm": 0.06608329713344574, "learning_rate": 0.00019484431146385133, "loss": 0.0988, "step": 5186 }, { "epoch": 0.33532121212121213, 
"grad_norm": 0.061334338039159775, "learning_rate": 0.0001948421437598861, "loss": 0.0869, "step": 5187 }, { "epoch": 0.33538585858585857, "grad_norm": 0.059113241732120514, "learning_rate": 0.0001948399756123755, "loss": 0.0812, "step": 5188 }, { "epoch": 0.33545050505050505, "grad_norm": 0.07684799283742905, "learning_rate": 0.00019483780702132973, "loss": 0.1306, "step": 5189 }, { "epoch": 0.3355151515151515, "grad_norm": 0.06829513609409332, "learning_rate": 0.00019483563798675885, "loss": 0.0916, "step": 5190 }, { "epoch": 0.335579797979798, "grad_norm": 0.06430917978286743, "learning_rate": 0.0001948334685086731, "loss": 0.0878, "step": 5191 }, { "epoch": 0.33564444444444447, "grad_norm": 0.06298217177391052, "learning_rate": 0.00019483129858708251, "loss": 0.0983, "step": 5192 }, { "epoch": 0.3357090909090909, "grad_norm": 0.05535086616873741, "learning_rate": 0.00019482912822199732, "loss": 0.0815, "step": 5193 }, { "epoch": 0.3357737373737374, "grad_norm": 0.0719243511557579, "learning_rate": 0.00019482695741342764, "loss": 0.1071, "step": 5194 }, { "epoch": 0.3358383838383838, "grad_norm": 0.05423068627715111, "learning_rate": 0.00019482478616138362, "loss": 0.0683, "step": 5195 }, { "epoch": 0.3359030303030303, "grad_norm": 0.062150537967681885, "learning_rate": 0.00019482261446587544, "loss": 0.0929, "step": 5196 }, { "epoch": 0.33596767676767675, "grad_norm": 0.07422970235347748, "learning_rate": 0.00019482044232691322, "loss": 0.1092, "step": 5197 }, { "epoch": 0.33603232323232324, "grad_norm": 0.06694428622722626, "learning_rate": 0.00019481826974450717, "loss": 0.106, "step": 5198 }, { "epoch": 0.33609696969696967, "grad_norm": 0.06134125590324402, "learning_rate": 0.0001948160967186674, "loss": 0.0939, "step": 5199 }, { "epoch": 0.33616161616161616, "grad_norm": 0.0593220479786396, "learning_rate": 0.00019481392324940407, "loss": 0.0801, "step": 5200 }, { "epoch": 0.33616161616161616, "eval_bleu": 11.728057412943766, "eval_loss": 
0.09482674300670624, "eval_runtime": 2.7335, "eval_samples_per_second": 11.707, "eval_steps_per_second": 1.463, "step": 5200 }, { "epoch": 0.33622626262626265, "grad_norm": 0.059335388243198395, "learning_rate": 0.00019481174933672738, "loss": 0.0758, "step": 5201 }, { "epoch": 0.3362909090909091, "grad_norm": 0.054792359471321106, "learning_rate": 0.00019480957498064748, "loss": 0.0745, "step": 5202 }, { "epoch": 0.3363555555555556, "grad_norm": 0.05974455550312996, "learning_rate": 0.00019480740018117457, "loss": 0.086, "step": 5203 }, { "epoch": 0.336420202020202, "grad_norm": 0.06615287810564041, "learning_rate": 0.00019480522493831877, "loss": 0.0849, "step": 5204 }, { "epoch": 0.3364848484848485, "grad_norm": 0.08033490180969238, "learning_rate": 0.00019480304925209027, "loss": 0.0967, "step": 5205 }, { "epoch": 0.33654949494949493, "grad_norm": 0.05968150123953819, "learning_rate": 0.00019480087312249926, "loss": 0.0823, "step": 5206 }, { "epoch": 0.3366141414141414, "grad_norm": 0.06405745446681976, "learning_rate": 0.0001947986965495559, "loss": 0.1034, "step": 5207 }, { "epoch": 0.33667878787878786, "grad_norm": 0.06672141700983047, "learning_rate": 0.0001947965195332704, "loss": 0.0938, "step": 5208 }, { "epoch": 0.33674343434343434, "grad_norm": 0.0601082518696785, "learning_rate": 0.00019479434207365288, "loss": 0.0892, "step": 5209 }, { "epoch": 0.33680808080808083, "grad_norm": 0.059394244104623795, "learning_rate": 0.00019479216417071356, "loss": 0.087, "step": 5210 }, { "epoch": 0.33687272727272727, "grad_norm": 0.06454958021640778, "learning_rate": 0.00019478998582446265, "loss": 0.0839, "step": 5211 }, { "epoch": 0.33693737373737376, "grad_norm": 0.06793256103992462, "learning_rate": 0.0001947878070349103, "loss": 0.0721, "step": 5212 }, { "epoch": 0.3370020202020202, "grad_norm": 0.0711396262049675, "learning_rate": 0.00019478562780206673, "loss": 0.1046, "step": 5213 }, { "epoch": 0.3370666666666667, "grad_norm": 0.07256491482257843, 
"learning_rate": 0.0001947834481259421, "loss": 0.1001, "step": 5214 }, { "epoch": 0.3371313131313131, "grad_norm": 0.0646984875202179, "learning_rate": 0.0001947812680065466, "loss": 0.0942, "step": 5215 }, { "epoch": 0.3371959595959596, "grad_norm": 0.05478161573410034, "learning_rate": 0.0001947790874438905, "loss": 0.0785, "step": 5216 }, { "epoch": 0.3371959595959596, "eval_bleu": 12.803245476791671, "eval_loss": 0.09418702125549316, "eval_runtime": 2.8351, "eval_samples_per_second": 11.287, "eval_steps_per_second": 1.411, "step": 5216 }, { "epoch": 0.33726060606060604, "grad_norm": 0.06974922120571136, "learning_rate": 0.0001947769064379839, "loss": 0.0973, "step": 5217 }, { "epoch": 0.33732525252525253, "grad_norm": 0.07079936563968658, "learning_rate": 0.00019477472498883702, "loss": 0.0864, "step": 5218 }, { "epoch": 0.337389898989899, "grad_norm": 0.06208309158682823, "learning_rate": 0.00019477254309646012, "loss": 0.0759, "step": 5219 }, { "epoch": 0.33745454545454545, "grad_norm": 0.07577164471149445, "learning_rate": 0.00019477036076086336, "loss": 0.0919, "step": 5220 }, { "epoch": 0.33751919191919194, "grad_norm": 0.07931704074144363, "learning_rate": 0.00019476817798205697, "loss": 0.1095, "step": 5221 }, { "epoch": 0.3375838383838384, "grad_norm": 0.06813856214284897, "learning_rate": 0.00019476599476005112, "loss": 0.0899, "step": 5222 }, { "epoch": 0.33764848484848486, "grad_norm": 0.06303229182958603, "learning_rate": 0.00019476381109485603, "loss": 0.0855, "step": 5223 }, { "epoch": 0.3377131313131313, "grad_norm": 0.06302332133054733, "learning_rate": 0.00019476162698648194, "loss": 0.0955, "step": 5224 }, { "epoch": 0.3377777777777778, "grad_norm": 0.06261551380157471, "learning_rate": 0.00019475944243493905, "loss": 0.0949, "step": 5225 }, { "epoch": 0.3378424242424242, "grad_norm": 0.056157827377319336, "learning_rate": 0.00019475725744023755, "loss": 0.077, "step": 5226 }, { "epoch": 0.3379070707070707, "grad_norm": 0.06013309955596924, 
"learning_rate": 0.0001947550720023877, "loss": 0.0758, "step": 5227 }, { "epoch": 0.33797171717171715, "grad_norm": 0.061196219176054, "learning_rate": 0.00019475288612139972, "loss": 0.0821, "step": 5228 }, { "epoch": 0.33803636363636363, "grad_norm": 0.0713590756058693, "learning_rate": 0.0001947506997972838, "loss": 0.0937, "step": 5229 }, { "epoch": 0.3381010101010101, "grad_norm": 0.0648992732167244, "learning_rate": 0.0001947485130300502, "loss": 0.0788, "step": 5230 }, { "epoch": 0.33816565656565656, "grad_norm": 0.06307211518287659, "learning_rate": 0.00019474632581970908, "loss": 0.0836, "step": 5231 }, { "epoch": 0.33823030303030305, "grad_norm": 0.08481273055076599, "learning_rate": 0.00019474413816627077, "loss": 0.099, "step": 5232 }, { "epoch": 0.33823030303030305, "eval_bleu": 13.761151269058102, "eval_loss": 0.09390333294868469, "eval_runtime": 2.8015, "eval_samples_per_second": 11.422, "eval_steps_per_second": 1.428, "step": 5232 }, { "epoch": 0.3382949494949495, "grad_norm": 0.05996965616941452, "learning_rate": 0.00019474195006974543, "loss": 0.0866, "step": 5233 }, { "epoch": 0.33835959595959597, "grad_norm": 0.060087621212005615, "learning_rate": 0.00019473976153014331, "loss": 0.069, "step": 5234 }, { "epoch": 0.3384242424242424, "grad_norm": 0.059721577912569046, "learning_rate": 0.00019473757254747463, "loss": 0.0804, "step": 5235 }, { "epoch": 0.3384888888888889, "grad_norm": 0.06644900143146515, "learning_rate": 0.00019473538312174963, "loss": 0.1078, "step": 5236 }, { "epoch": 0.33855353535353533, "grad_norm": 0.05856110900640488, "learning_rate": 0.0001947331932529786, "loss": 0.0782, "step": 5237 }, { "epoch": 0.3386181818181818, "grad_norm": 0.06397783756256104, "learning_rate": 0.00019473100294117174, "loss": 0.0943, "step": 5238 }, { "epoch": 0.3386828282828283, "grad_norm": 0.06593852490186691, "learning_rate": 0.00019472881218633928, "loss": 0.0987, "step": 5239 }, { "epoch": 0.33874747474747474, "grad_norm": 0.06024523451924324, 
"learning_rate": 0.00019472662098849147, "loss": 0.0888, "step": 5240 }, { "epoch": 0.33881212121212123, "grad_norm": 0.066012904047966, "learning_rate": 0.00019472442934763862, "loss": 0.0949, "step": 5241 }, { "epoch": 0.33887676767676767, "grad_norm": 0.0693393349647522, "learning_rate": 0.0001947222372637909, "loss": 0.1041, "step": 5242 }, { "epoch": 0.33894141414141415, "grad_norm": 0.07089252769947052, "learning_rate": 0.00019472004473695858, "loss": 0.1106, "step": 5243 }, { "epoch": 0.3390060606060606, "grad_norm": 0.06830541789531708, "learning_rate": 0.00019471785176715194, "loss": 0.1073, "step": 5244 }, { "epoch": 0.3390707070707071, "grad_norm": 0.06370512396097183, "learning_rate": 0.00019471565835438122, "loss": 0.0839, "step": 5245 }, { "epoch": 0.3391353535353535, "grad_norm": 0.052051398903131485, "learning_rate": 0.00019471346449865666, "loss": 0.0699, "step": 5246 }, { "epoch": 0.3392, "grad_norm": 0.06528504192829132, "learning_rate": 0.00019471127019998855, "loss": 0.0924, "step": 5247 }, { "epoch": 0.3392646464646465, "grad_norm": 0.0579901747405529, "learning_rate": 0.00019470907545838715, "loss": 0.0912, "step": 5248 }, { "epoch": 0.3392646464646465, "eval_bleu": 13.231524313588503, "eval_loss": 0.0944802537560463, "eval_runtime": 2.8616, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 5248 }, { "epoch": 0.3393292929292929, "grad_norm": 0.05912841111421585, "learning_rate": 0.0001947068802738627, "loss": 0.0764, "step": 5249 }, { "epoch": 0.3393939393939394, "grad_norm": 0.06294563412666321, "learning_rate": 0.0001947046846464255, "loss": 0.098, "step": 5250 }, { "epoch": 0.33945858585858585, "grad_norm": 0.046023931354284286, "learning_rate": 0.00019470248857608575, "loss": 0.0663, "step": 5251 }, { "epoch": 0.33952323232323234, "grad_norm": 0.06932958960533142, "learning_rate": 0.00019470029206285382, "loss": 0.1127, "step": 5252 }, { "epoch": 0.33958787878787877, "grad_norm": 0.06204749643802643, 
"learning_rate": 0.0001946980951067399, "loss": 0.0907, "step": 5253 }, { "epoch": 0.33965252525252526, "grad_norm": 0.06371843069791794, "learning_rate": 0.00019469589770775434, "loss": 0.0965, "step": 5254 }, { "epoch": 0.3397171717171717, "grad_norm": 0.05734868347644806, "learning_rate": 0.00019469369986590732, "loss": 0.0775, "step": 5255 }, { "epoch": 0.3397818181818182, "grad_norm": 0.06512448936700821, "learning_rate": 0.0001946915015812092, "loss": 0.0748, "step": 5256 }, { "epoch": 0.3398464646464646, "grad_norm": 0.05566174536943436, "learning_rate": 0.0001946893028536702, "loss": 0.0743, "step": 5257 }, { "epoch": 0.3399111111111111, "grad_norm": 0.06467824429273605, "learning_rate": 0.0001946871036833007, "loss": 0.0785, "step": 5258 }, { "epoch": 0.3399757575757576, "grad_norm": 0.07455947995185852, "learning_rate": 0.00019468490407011086, "loss": 0.1108, "step": 5259 }, { "epoch": 0.34004040404040403, "grad_norm": 0.2361825853586197, "learning_rate": 0.00019468270401411104, "loss": 0.1275, "step": 5260 }, { "epoch": 0.3401050505050505, "grad_norm": 0.05676371976733208, "learning_rate": 0.0001946805035153115, "loss": 0.0748, "step": 5261 }, { "epoch": 0.34016969696969696, "grad_norm": 0.06662259995937347, "learning_rate": 0.00019467830257372258, "loss": 0.0903, "step": 5262 }, { "epoch": 0.34023434343434344, "grad_norm": 0.06699920445680618, "learning_rate": 0.00019467610118935452, "loss": 0.0967, "step": 5263 }, { "epoch": 0.3402989898989899, "grad_norm": 0.07223822921514511, "learning_rate": 0.00019467389936221764, "loss": 0.1111, "step": 5264 }, { "epoch": 0.3402989898989899, "eval_bleu": 13.43408585263464, "eval_loss": 0.09522901475429535, "eval_runtime": 2.7676, "eval_samples_per_second": 11.562, "eval_steps_per_second": 1.445, "step": 5264 }, { "epoch": 0.34036363636363637, "grad_norm": 0.47410309314727783, "learning_rate": 0.00019467169709232223, "loss": 0.1506, "step": 5265 }, { "epoch": 0.3404282828282828, "grad_norm": 0.05834919586777687, 
"learning_rate": 0.0001946694943796786, "loss": 0.086, "step": 5266 }, { "epoch": 0.3404929292929293, "grad_norm": 0.059020742774009705, "learning_rate": 0.00019466729122429702, "loss": 0.0868, "step": 5267 }, { "epoch": 0.3405575757575758, "grad_norm": 0.05215485021471977, "learning_rate": 0.00019466508762618783, "loss": 0.0758, "step": 5268 }, { "epoch": 0.3406222222222222, "grad_norm": 0.055135779082775116, "learning_rate": 0.0001946628835853613, "loss": 0.0847, "step": 5269 }, { "epoch": 0.3406868686868687, "grad_norm": 0.057472482323646545, "learning_rate": 0.0001946606791018278, "loss": 0.0806, "step": 5270 }, { "epoch": 0.34075151515151514, "grad_norm": 0.05483287572860718, "learning_rate": 0.00019465847417559758, "loss": 0.0842, "step": 5271 }, { "epoch": 0.34081616161616163, "grad_norm": 0.05201990529894829, "learning_rate": 0.00019465626880668096, "loss": 0.0774, "step": 5272 }, { "epoch": 0.34088080808080806, "grad_norm": 0.06047874316573143, "learning_rate": 0.00019465406299508825, "loss": 0.0905, "step": 5273 }, { "epoch": 0.34094545454545455, "grad_norm": 0.05382782965898514, "learning_rate": 0.0001946518567408298, "loss": 0.0734, "step": 5274 }, { "epoch": 0.341010101010101, "grad_norm": 0.056603722274303436, "learning_rate": 0.0001946496500439159, "loss": 0.0879, "step": 5275 }, { "epoch": 0.3410747474747475, "grad_norm": 0.06553099304437637, "learning_rate": 0.0001946474429043569, "loss": 0.0985, "step": 5276 }, { "epoch": 0.34113939393939396, "grad_norm": 0.07809021323919296, "learning_rate": 0.00019464523532216306, "loss": 0.1017, "step": 5277 }, { "epoch": 0.3412040404040404, "grad_norm": 0.07104026526212692, "learning_rate": 0.0001946430272973448, "loss": 0.0932, "step": 5278 }, { "epoch": 0.3412686868686869, "grad_norm": 0.06565055251121521, "learning_rate": 0.00019464081882991234, "loss": 0.0918, "step": 5279 }, { "epoch": 0.3413333333333333, "grad_norm": 0.07072681933641434, "learning_rate": 0.00019463860991987607, "loss": 0.0974, "step": 
5280 }, { "epoch": 0.3413333333333333, "eval_bleu": 12.433872544318636, "eval_loss": 0.09418949484825134, "eval_runtime": 2.7248, "eval_samples_per_second": 11.744, "eval_steps_per_second": 1.468, "step": 5280 }, { "epoch": 0.3413979797979798, "grad_norm": 0.07309387624263763, "learning_rate": 0.0001946364005672463, "loss": 0.1164, "step": 5281 }, { "epoch": 0.34146262626262625, "grad_norm": 0.06455183774232864, "learning_rate": 0.00019463419077203338, "loss": 0.0914, "step": 5282 }, { "epoch": 0.34152727272727273, "grad_norm": 0.06871473789215088, "learning_rate": 0.00019463198053424761, "loss": 0.0856, "step": 5283 }, { "epoch": 0.34159191919191917, "grad_norm": 0.07536285370588303, "learning_rate": 0.0001946297698538994, "loss": 0.1153, "step": 5284 }, { "epoch": 0.34165656565656566, "grad_norm": 0.06737662106752396, "learning_rate": 0.000194627558730999, "loss": 0.0918, "step": 5285 }, { "epoch": 0.34172121212121215, "grad_norm": 0.0628575012087822, "learning_rate": 0.00019462534716555683, "loss": 0.0777, "step": 5286 }, { "epoch": 0.3417858585858586, "grad_norm": 0.06593616306781769, "learning_rate": 0.00019462313515758317, "loss": 0.0802, "step": 5287 }, { "epoch": 0.34185050505050507, "grad_norm": 0.06449372321367264, "learning_rate": 0.0001946209227070884, "loss": 0.0898, "step": 5288 }, { "epoch": 0.3419151515151515, "grad_norm": 0.06949684023857117, "learning_rate": 0.00019461870981408286, "loss": 0.0769, "step": 5289 }, { "epoch": 0.341979797979798, "grad_norm": 0.06232437863945961, "learning_rate": 0.00019461649647857686, "loss": 0.0811, "step": 5290 }, { "epoch": 0.34204444444444443, "grad_norm": 0.06185486167669296, "learning_rate": 0.0001946142827005808, "loss": 0.0948, "step": 5291 }, { "epoch": 0.3421090909090909, "grad_norm": 0.06736941635608673, "learning_rate": 0.00019461206848010504, "loss": 0.078, "step": 5292 }, { "epoch": 0.34217373737373735, "grad_norm": 0.07362458109855652, "learning_rate": 0.0001946098538171599, "loss": 0.0963, "step": 
5293 }, { "epoch": 0.34223838383838384, "grad_norm": 0.06348512321710587, "learning_rate": 0.00019460763871175573, "loss": 0.0952, "step": 5294 }, { "epoch": 0.3423030303030303, "grad_norm": 0.057083532214164734, "learning_rate": 0.00019460542316390296, "loss": 0.0794, "step": 5295 }, { "epoch": 0.34236767676767677, "grad_norm": 0.05776918679475784, "learning_rate": 0.00019460320717361188, "loss": 0.0784, "step": 5296 }, { "epoch": 0.34236767676767677, "eval_bleu": 11.69131504905, "eval_loss": 0.09222513437271118, "eval_runtime": 2.7522, "eval_samples_per_second": 11.627, "eval_steps_per_second": 1.453, "step": 5296 }, { "epoch": 0.34243232323232325, "grad_norm": 0.07010868936777115, "learning_rate": 0.00019460099074089288, "loss": 0.0918, "step": 5297 }, { "epoch": 0.3424969696969697, "grad_norm": 0.06908310949802399, "learning_rate": 0.0001945987738657563, "loss": 0.0959, "step": 5298 }, { "epoch": 0.3425616161616162, "grad_norm": 0.06377406418323517, "learning_rate": 0.00019459655654821252, "loss": 0.0849, "step": 5299 }, { "epoch": 0.3426262626262626, "grad_norm": 0.0693165510892868, "learning_rate": 0.0001945943387882719, "loss": 0.1027, "step": 5300 }, { "epoch": 0.3426909090909091, "grad_norm": 0.06693820655345917, "learning_rate": 0.0001945921205859449, "loss": 0.0902, "step": 5301 }, { "epoch": 0.34275555555555554, "grad_norm": 0.07123447209596634, "learning_rate": 0.00019458990194124178, "loss": 0.111, "step": 5302 }, { "epoch": 0.342820202020202, "grad_norm": 0.06732272356748581, "learning_rate": 0.00019458768285417297, "loss": 0.0923, "step": 5303 }, { "epoch": 0.34288484848484846, "grad_norm": 0.06970855593681335, "learning_rate": 0.00019458546332474884, "loss": 0.0867, "step": 5304 }, { "epoch": 0.34294949494949495, "grad_norm": 0.06890291720628738, "learning_rate": 0.00019458324335297977, "loss": 0.1075, "step": 5305 }, { "epoch": 0.34301414141414144, "grad_norm": 0.061176273971796036, "learning_rate": 0.00019458102293887613, "loss": 0.0841, "step": 
5306 }, { "epoch": 0.34307878787878787, "grad_norm": 0.061422199010849, "learning_rate": 0.0001945788020824483, "loss": 0.0868, "step": 5307 }, { "epoch": 0.34314343434343436, "grad_norm": 0.05408511310815811, "learning_rate": 0.0001945765807837067, "loss": 0.0757, "step": 5308 }, { "epoch": 0.3432080808080808, "grad_norm": 0.07778436690568924, "learning_rate": 0.00019457435904266172, "loss": 0.111, "step": 5309 }, { "epoch": 0.3432727272727273, "grad_norm": 0.05752675235271454, "learning_rate": 0.00019457213685932369, "loss": 0.0836, "step": 5310 }, { "epoch": 0.3433373737373737, "grad_norm": 0.0678258165717125, "learning_rate": 0.00019456991423370305, "loss": 0.0845, "step": 5311 }, { "epoch": 0.3434020202020202, "grad_norm": 0.06808333098888397, "learning_rate": 0.00019456769116581017, "loss": 0.0871, "step": 5312 }, { "epoch": 0.3434020202020202, "eval_bleu": 12.656713778127095, "eval_loss": 0.09386211633682251, "eval_runtime": 2.7997, "eval_samples_per_second": 11.43, "eval_steps_per_second": 1.429, "step": 5312 }, { "epoch": 0.34346666666666664, "grad_norm": 0.0648265928030014, "learning_rate": 0.0001945654676556555, "loss": 0.0839, "step": 5313 }, { "epoch": 0.34353131313131313, "grad_norm": 0.06835661828517914, "learning_rate": 0.00019456324370324937, "loss": 0.094, "step": 5314 }, { "epoch": 0.3435959595959596, "grad_norm": 0.07128773629665375, "learning_rate": 0.0001945610193086022, "loss": 0.0988, "step": 5315 }, { "epoch": 0.34366060606060606, "grad_norm": 0.06076571345329285, "learning_rate": 0.00019455879447172444, "loss": 0.086, "step": 5316 }, { "epoch": 0.34372525252525254, "grad_norm": 0.0822317898273468, "learning_rate": 0.00019455656919262646, "loss": 0.1219, "step": 5317 }, { "epoch": 0.343789898989899, "grad_norm": 0.06159370392560959, "learning_rate": 0.00019455434347131866, "loss": 0.0878, "step": 5318 }, { "epoch": 0.34385454545454547, "grad_norm": 0.0648675262928009, "learning_rate": 0.00019455211730781143, "loss": 0.0946, "step": 5319 }, 
{ "epoch": 0.3439191919191919, "grad_norm": 0.05378996580839157, "learning_rate": 0.00019454989070211522, "loss": 0.0743, "step": 5320 }, { "epoch": 0.3439838383838384, "grad_norm": 0.06676554679870605, "learning_rate": 0.00019454766365424044, "loss": 0.0842, "step": 5321 }, { "epoch": 0.3440484848484848, "grad_norm": 0.08499591797590256, "learning_rate": 0.0001945454361641975, "loss": 0.1261, "step": 5322 }, { "epoch": 0.3441131313131313, "grad_norm": 0.05680030584335327, "learning_rate": 0.00019454320823199676, "loss": 0.0771, "step": 5323 }, { "epoch": 0.3441777777777778, "grad_norm": 0.06661302596330643, "learning_rate": 0.00019454097985764873, "loss": 0.0951, "step": 5324 }, { "epoch": 0.34424242424242424, "grad_norm": 0.06658933311700821, "learning_rate": 0.0001945387510411638, "loss": 0.0977, "step": 5325 }, { "epoch": 0.34430707070707073, "grad_norm": 0.06929687410593033, "learning_rate": 0.00019453652178255237, "loss": 0.0811, "step": 5326 }, { "epoch": 0.34437171717171716, "grad_norm": 0.06370438635349274, "learning_rate": 0.00019453429208182488, "loss": 0.0772, "step": 5327 }, { "epoch": 0.34443636363636365, "grad_norm": 0.06585221737623215, "learning_rate": 0.00019453206193899175, "loss": 0.0971, "step": 5328 }, { "epoch": 0.34443636363636365, "eval_bleu": 12.990024539645304, "eval_loss": 0.09530485421419144, "eval_runtime": 2.7507, "eval_samples_per_second": 11.633, "eval_steps_per_second": 1.454, "step": 5328 }, { "epoch": 0.3445010101010101, "grad_norm": 0.07433836162090302, "learning_rate": 0.00019452983135406345, "loss": 0.088, "step": 5329 }, { "epoch": 0.3445656565656566, "grad_norm": 0.07311110198497772, "learning_rate": 0.00019452760032705036, "loss": 0.1066, "step": 5330 }, { "epoch": 0.344630303030303, "grad_norm": 0.0717409998178482, "learning_rate": 0.00019452536885796292, "loss": 0.0963, "step": 5331 }, { "epoch": 0.3446949494949495, "grad_norm": 0.06740804761648178, "learning_rate": 0.00019452313694681158, "loss": 0.0924, "step": 5332 }, 
{ "epoch": 0.34475959595959593, "grad_norm": 0.06627276539802551, "learning_rate": 0.0001945209045936068, "loss": 0.0898, "step": 5333 }, { "epoch": 0.3448242424242424, "grad_norm": 0.06454344093799591, "learning_rate": 0.000194518671798359, "loss": 0.0908, "step": 5334 }, { "epoch": 0.3448888888888889, "grad_norm": 0.06541433185338974, "learning_rate": 0.00019451643856107859, "loss": 0.0854, "step": 5335 }, { "epoch": 0.34495353535353535, "grad_norm": 0.06708386540412903, "learning_rate": 0.00019451420488177608, "loss": 0.0836, "step": 5336 }, { "epoch": 0.34501818181818183, "grad_norm": 0.06929788738489151, "learning_rate": 0.00019451197076046184, "loss": 0.0875, "step": 5337 }, { "epoch": 0.34508282828282827, "grad_norm": 0.0669521912932396, "learning_rate": 0.00019450973619714637, "loss": 0.0775, "step": 5338 }, { "epoch": 0.34514747474747476, "grad_norm": 0.07428229600191116, "learning_rate": 0.0001945075011918401, "loss": 0.0943, "step": 5339 }, { "epoch": 0.3452121212121212, "grad_norm": 0.0791178047657013, "learning_rate": 0.00019450526574455354, "loss": 0.0958, "step": 5340 }, { "epoch": 0.3452767676767677, "grad_norm": 0.07511323690414429, "learning_rate": 0.00019450302985529706, "loss": 0.1071, "step": 5341 }, { "epoch": 0.3453414141414141, "grad_norm": 0.05654502660036087, "learning_rate": 0.00019450079352408115, "loss": 0.0829, "step": 5342 }, { "epoch": 0.3454060606060606, "grad_norm": 0.07276176661252975, "learning_rate": 0.00019449855675091626, "loss": 0.1154, "step": 5343 }, { "epoch": 0.3454707070707071, "grad_norm": 0.06119151785969734, "learning_rate": 0.00019449631953581288, "loss": 0.0814, "step": 5344 }, { "epoch": 0.3454707070707071, "eval_bleu": 15.206516793387227, "eval_loss": 0.09421934187412262, "eval_runtime": 2.6773, "eval_samples_per_second": 11.952, "eval_steps_per_second": 1.494, "step": 5344 }, { "epoch": 0.34553535353535353, "grad_norm": 0.05989248678088188, "learning_rate": 0.00019449408187878145, "loss": 0.0867, "step": 5345 }, 
{ "epoch": 0.3456, "grad_norm": 0.06290517747402191, "learning_rate": 0.00019449184377983244, "loss": 0.0917, "step": 5346 }, { "epoch": 0.34566464646464645, "grad_norm": 0.06920047104358673, "learning_rate": 0.0001944896052389763, "loss": 0.0833, "step": 5347 }, { "epoch": 0.34572929292929294, "grad_norm": 0.06957939267158508, "learning_rate": 0.00019448736625622353, "loss": 0.0913, "step": 5348 }, { "epoch": 0.3457939393939394, "grad_norm": 0.06561220437288284, "learning_rate": 0.00019448512683158456, "loss": 0.0866, "step": 5349 }, { "epoch": 0.34585858585858587, "grad_norm": 0.05995111167430878, "learning_rate": 0.0001944828869650699, "loss": 0.08, "step": 5350 }, { "epoch": 0.3459232323232323, "grad_norm": 0.06013043597340584, "learning_rate": 0.00019448064665669002, "loss": 0.0881, "step": 5351 }, { "epoch": 0.3459878787878788, "grad_norm": 0.07432939112186432, "learning_rate": 0.0001944784059064554, "loss": 0.0956, "step": 5352 }, { "epoch": 0.3460525252525253, "grad_norm": 0.06742550432682037, "learning_rate": 0.0001944761647143765, "loss": 0.0889, "step": 5353 }, { "epoch": 0.3461171717171717, "grad_norm": 0.058633383363485336, "learning_rate": 0.00019447392308046378, "loss": 0.0826, "step": 5354 }, { "epoch": 0.3461818181818182, "grad_norm": 0.05573435127735138, "learning_rate": 0.00019447168100472778, "loss": 0.0791, "step": 5355 }, { "epoch": 0.34624646464646464, "grad_norm": 0.06877442449331284, "learning_rate": 0.00019446943848717898, "loss": 0.0924, "step": 5356 }, { "epoch": 0.3463111111111111, "grad_norm": 0.056518591940402985, "learning_rate": 0.00019446719552782785, "loss": 0.0793, "step": 5357 }, { "epoch": 0.34637575757575756, "grad_norm": 0.07469072192907333, "learning_rate": 0.00019446495212668485, "loss": 0.1142, "step": 5358 }, { "epoch": 0.34644040404040405, "grad_norm": 0.06297776848077774, "learning_rate": 0.0001944627082837605, "loss": 0.0864, "step": 5359 }, { "epoch": 0.3465050505050505, "grad_norm": 0.060830045491456985, 
"learning_rate": 0.0001944604639990653, "loss": 0.0776, "step": 5360 }, { "epoch": 0.3465050505050505, "eval_bleu": 14.47072287750069, "eval_loss": 0.09344345331192017, "eval_runtime": 2.7694, "eval_samples_per_second": 11.555, "eval_steps_per_second": 1.444, "step": 5360 }, { "epoch": 0.34656969696969697, "grad_norm": 0.0666361004114151, "learning_rate": 0.00019445821927260975, "loss": 0.0985, "step": 5361 }, { "epoch": 0.34663434343434346, "grad_norm": 0.0619664341211319, "learning_rate": 0.0001944559741044043, "loss": 0.0795, "step": 5362 }, { "epoch": 0.3466989898989899, "grad_norm": 0.07481850683689117, "learning_rate": 0.0001944537284944595, "loss": 0.0889, "step": 5363 }, { "epoch": 0.3467636363636364, "grad_norm": 0.10630080848932266, "learning_rate": 0.00019445148244278586, "loss": 0.1363, "step": 5364 }, { "epoch": 0.3468282828282828, "grad_norm": 0.06847912818193436, "learning_rate": 0.00019444923594939386, "loss": 0.0961, "step": 5365 }, { "epoch": 0.3468929292929293, "grad_norm": 0.06266526877880096, "learning_rate": 0.00019444698901429397, "loss": 0.0847, "step": 5366 }, { "epoch": 0.34695757575757574, "grad_norm": 0.058274924755096436, "learning_rate": 0.00019444474163749677, "loss": 0.0782, "step": 5367 }, { "epoch": 0.34702222222222223, "grad_norm": 0.06570993363857269, "learning_rate": 0.00019444249381901272, "loss": 0.093, "step": 5368 }, { "epoch": 0.34708686868686867, "grad_norm": 0.06814686208963394, "learning_rate": 0.00019444024555885237, "loss": 0.0896, "step": 5369 }, { "epoch": 0.34715151515151516, "grad_norm": 0.06851497292518616, "learning_rate": 0.0001944379968570262, "loss": 0.1058, "step": 5370 }, { "epoch": 0.3472161616161616, "grad_norm": 0.058960504829883575, "learning_rate": 0.00019443574771354474, "loss": 0.0793, "step": 5371 }, { "epoch": 0.3472808080808081, "grad_norm": 0.06058274954557419, "learning_rate": 0.0001944334981284185, "loss": 0.0858, "step": 5372 }, { "epoch": 0.34734545454545457, "grad_norm": 0.06411627680063248, 
"learning_rate": 0.00019443124810165802, "loss": 0.0831, "step": 5373 }, { "epoch": 0.347410101010101, "grad_norm": 0.06605079025030136, "learning_rate": 0.00019442899763327378, "loss": 0.0885, "step": 5374 }, { "epoch": 0.3474747474747475, "grad_norm": 0.058634303510189056, "learning_rate": 0.00019442674672327638, "loss": 0.0788, "step": 5375 }, { "epoch": 0.3475393939393939, "grad_norm": 0.06674228608608246, "learning_rate": 0.00019442449537167628, "loss": 0.1054, "step": 5376 }, { "epoch": 0.3475393939393939, "eval_bleu": 15.692248886076214, "eval_loss": 0.09231525659561157, "eval_runtime": 2.8228, "eval_samples_per_second": 11.336, "eval_steps_per_second": 1.417, "step": 5376 }, { "epoch": 0.3476040404040404, "grad_norm": 0.06481784582138062, "learning_rate": 0.00019442224357848402, "loss": 0.0972, "step": 5377 }, { "epoch": 0.34766868686868685, "grad_norm": 0.06606926023960114, "learning_rate": 0.00019441999134371015, "loss": 0.0868, "step": 5378 }, { "epoch": 0.34773333333333334, "grad_norm": 0.054367631673812866, "learning_rate": 0.0001944177386673652, "loss": 0.0822, "step": 5379 }, { "epoch": 0.3477979797979798, "grad_norm": 0.0662309005856514, "learning_rate": 0.00019441548554945972, "loss": 0.0874, "step": 5380 }, { "epoch": 0.34786262626262626, "grad_norm": 0.06305788457393646, "learning_rate": 0.0001944132319900042, "loss": 0.0818, "step": 5381 }, { "epoch": 0.34792727272727275, "grad_norm": 0.06198299676179886, "learning_rate": 0.00019441097798900922, "loss": 0.0885, "step": 5382 }, { "epoch": 0.3479919191919192, "grad_norm": 0.05868459865450859, "learning_rate": 0.00019440872354648529, "loss": 0.0783, "step": 5383 }, { "epoch": 0.3480565656565657, "grad_norm": 0.0710318386554718, "learning_rate": 0.000194406468662443, "loss": 0.095, "step": 5384 }, { "epoch": 0.3481212121212121, "grad_norm": 0.07270139455795288, "learning_rate": 0.00019440421333689285, "loss": 0.1007, "step": 5385 }, { "epoch": 0.3481858585858586, "grad_norm": 0.06582971662282944, 
"learning_rate": 0.00019440195756984538, "loss": 0.0902, "step": 5386 }, { "epoch": 0.34825050505050503, "grad_norm": 0.06729482114315033, "learning_rate": 0.0001943997013613112, "loss": 0.0854, "step": 5387 }, { "epoch": 0.3483151515151515, "grad_norm": 0.06232403591275215, "learning_rate": 0.0001943974447113008, "loss": 0.0749, "step": 5388 }, { "epoch": 0.34837979797979796, "grad_norm": 0.08777357637882233, "learning_rate": 0.00019439518761982477, "loss": 0.0951, "step": 5389 }, { "epoch": 0.34844444444444445, "grad_norm": 0.057237379252910614, "learning_rate": 0.00019439293008689364, "loss": 0.0753, "step": 5390 }, { "epoch": 0.34850909090909094, "grad_norm": 0.06308239698410034, "learning_rate": 0.00019439067211251803, "loss": 0.0908, "step": 5391 }, { "epoch": 0.34857373737373737, "grad_norm": 0.052329737693071365, "learning_rate": 0.00019438841369670838, "loss": 0.0662, "step": 5392 }, { "epoch": 0.34857373737373737, "eval_bleu": 13.954759479769704, "eval_loss": 0.09024699032306671, "eval_runtime": 2.8264, "eval_samples_per_second": 11.322, "eval_steps_per_second": 1.415, "step": 5392 }, { "epoch": 0.34863838383838386, "grad_norm": 0.060405947268009186, "learning_rate": 0.00019438615483947537, "loss": 0.0792, "step": 5393 }, { "epoch": 0.3487030303030303, "grad_norm": 0.07349822670221329, "learning_rate": 0.00019438389554082952, "loss": 0.0931, "step": 5394 }, { "epoch": 0.3487676767676768, "grad_norm": 0.06358102709054947, "learning_rate": 0.00019438163580078136, "loss": 0.0823, "step": 5395 }, { "epoch": 0.3488323232323232, "grad_norm": 0.06621633470058441, "learning_rate": 0.00019437937561934152, "loss": 0.0906, "step": 5396 }, { "epoch": 0.3488969696969697, "grad_norm": 0.060286086052656174, "learning_rate": 0.00019437711499652052, "loss": 0.0905, "step": 5397 }, { "epoch": 0.34896161616161614, "grad_norm": 0.06547947227954865, "learning_rate": 0.000194374853932329, "loss": 0.0906, "step": 5398 }, { "epoch": 0.34902626262626263, "grad_norm": 
0.04992571845650673, "learning_rate": 0.00019437259242677742, "loss": 0.0681, "step": 5399 }, { "epoch": 0.3490909090909091, "grad_norm": 0.05971140414476395, "learning_rate": 0.00019437033047987648, "loss": 0.0853, "step": 5400 }, { "epoch": 0.34915555555555555, "grad_norm": 0.06232995167374611, "learning_rate": 0.00019436806809163668, "loss": 0.0867, "step": 5401 }, { "epoch": 0.34922020202020204, "grad_norm": 0.07090754806995392, "learning_rate": 0.00019436580526206861, "loss": 0.0852, "step": 5402 }, { "epoch": 0.3492848484848485, "grad_norm": 0.0710451751947403, "learning_rate": 0.0001943635419911829, "loss": 0.1016, "step": 5403 }, { "epoch": 0.34934949494949497, "grad_norm": 0.059071823954582214, "learning_rate": 0.00019436127827899007, "loss": 0.0817, "step": 5404 }, { "epoch": 0.3494141414141414, "grad_norm": 0.0821671262383461, "learning_rate": 0.00019435901412550075, "loss": 0.0731, "step": 5405 }, { "epoch": 0.3494787878787879, "grad_norm": 0.0648055151104927, "learning_rate": 0.0001943567495307255, "loss": 0.0814, "step": 5406 }, { "epoch": 0.3495434343434343, "grad_norm": 0.06616731733083725, "learning_rate": 0.00019435448449467495, "loss": 0.0821, "step": 5407 }, { "epoch": 0.3496080808080808, "grad_norm": 0.14790239930152893, "learning_rate": 0.00019435221901735965, "loss": 0.1196, "step": 5408 }, { "epoch": 0.3496080808080808, "eval_bleu": 18.225478958988877, "eval_loss": 0.08874686062335968, "eval_runtime": 2.7698, "eval_samples_per_second": 11.553, "eval_steps_per_second": 1.444, "step": 5408 }, { "epoch": 0.34967272727272725, "grad_norm": 0.06771877408027649, "learning_rate": 0.00019434995309879023, "loss": 0.0775, "step": 5409 }, { "epoch": 0.34973737373737374, "grad_norm": 0.06328757852315903, "learning_rate": 0.00019434768673897725, "loss": 0.0831, "step": 5410 }, { "epoch": 0.3498020202020202, "grad_norm": 0.06440027058124542, "learning_rate": 0.00019434541993793134, "loss": 0.0854, "step": 5411 }, { "epoch": 0.34986666666666666, 
"grad_norm": 0.05807211622595787, "learning_rate": 0.0001943431526956631, "loss": 0.0774, "step": 5412 }, { "epoch": 0.34993131313131315, "grad_norm": 0.06489991396665573, "learning_rate": 0.0001943408850121831, "loss": 0.0881, "step": 5413 }, { "epoch": 0.3499959595959596, "grad_norm": 0.06556063890457153, "learning_rate": 0.000194338616887502, "loss": 0.0867, "step": 5414 }, { "epoch": 0.35006060606060607, "grad_norm": 0.06696070730686188, "learning_rate": 0.00019433634832163036, "loss": 0.0893, "step": 5415 }, { "epoch": 0.3501252525252525, "grad_norm": 0.06481732428073883, "learning_rate": 0.0001943340793145788, "loss": 0.0856, "step": 5416 }, { "epoch": 0.350189898989899, "grad_norm": 0.07792875915765762, "learning_rate": 0.00019433180986635796, "loss": 0.0961, "step": 5417 }, { "epoch": 0.35025454545454543, "grad_norm": 0.059600379317998886, "learning_rate": 0.00019432953997697842, "loss": 0.0743, "step": 5418 }, { "epoch": 0.3503191919191919, "grad_norm": 0.06155688688158989, "learning_rate": 0.00019432726964645078, "loss": 0.0901, "step": 5419 }, { "epoch": 0.3503838383838384, "grad_norm": 0.07287827134132385, "learning_rate": 0.00019432499887478573, "loss": 0.091, "step": 5420 }, { "epoch": 0.35044848484848484, "grad_norm": 0.05637204647064209, "learning_rate": 0.00019432272766199378, "loss": 0.0801, "step": 5421 }, { "epoch": 0.35051313131313133, "grad_norm": 0.059877995401620865, "learning_rate": 0.00019432045600808567, "loss": 0.0729, "step": 5422 }, { "epoch": 0.35057777777777777, "grad_norm": 0.06987854093313217, "learning_rate": 0.00019431818391307197, "loss": 0.0937, "step": 5423 }, { "epoch": 0.35064242424242426, "grad_norm": 0.06323510408401489, "learning_rate": 0.0001943159113769633, "loss": 0.0873, "step": 5424 }, { "epoch": 0.35064242424242426, "eval_bleu": 19.101953031511187, "eval_loss": 0.08730112016201019, "eval_runtime": 2.7011, "eval_samples_per_second": 11.847, "eval_steps_per_second": 1.481, "step": 5424 }, { "epoch": 
0.3507070707070707, "grad_norm": 0.07902935147285461, "learning_rate": 0.00019431363839977027, "loss": 0.0997, "step": 5425 }, { "epoch": 0.3507717171717172, "grad_norm": 0.06195440888404846, "learning_rate": 0.00019431136498150356, "loss": 0.0854, "step": 5426 }, { "epoch": 0.3508363636363636, "grad_norm": 0.06629517674446106, "learning_rate": 0.00019430909112217376, "loss": 0.093, "step": 5427 }, { "epoch": 0.3509010101010101, "grad_norm": 0.06025164946913719, "learning_rate": 0.0001943068168217915, "loss": 0.0776, "step": 5428 }, { "epoch": 0.3509656565656566, "grad_norm": 0.05882531404495239, "learning_rate": 0.00019430454208036743, "loss": 0.0832, "step": 5429 }, { "epoch": 0.351030303030303, "grad_norm": 0.05807064473628998, "learning_rate": 0.00019430226689791226, "loss": 0.0827, "step": 5430 }, { "epoch": 0.3510949494949495, "grad_norm": 0.06380753964185715, "learning_rate": 0.0001942999912744365, "loss": 0.084, "step": 5431 }, { "epoch": 0.35115959595959595, "grad_norm": 0.06198057532310486, "learning_rate": 0.00019429771520995085, "loss": 0.0817, "step": 5432 }, { "epoch": 0.35122424242424244, "grad_norm": 0.07838543504476547, "learning_rate": 0.000194295438704466, "loss": 0.1013, "step": 5433 }, { "epoch": 0.3512888888888889, "grad_norm": 0.05862465500831604, "learning_rate": 0.00019429316175799256, "loss": 0.0801, "step": 5434 }, { "epoch": 0.35135353535353536, "grad_norm": 0.0653543472290039, "learning_rate": 0.00019429088437054114, "loss": 0.0865, "step": 5435 }, { "epoch": 0.3514181818181818, "grad_norm": 0.06240657716989517, "learning_rate": 0.00019428860654212244, "loss": 0.0848, "step": 5436 }, { "epoch": 0.3514828282828283, "grad_norm": 0.062219396233558655, "learning_rate": 0.0001942863282727471, "loss": 0.0878, "step": 5437 }, { "epoch": 0.3515474747474748, "grad_norm": 0.06554533541202545, "learning_rate": 0.00019428404956242578, "loss": 0.0927, "step": 5438 }, { "epoch": 0.3516121212121212, "grad_norm": 0.09633299708366394, "learning_rate": 
0.00019428177041116914, "loss": 0.1182, "step": 5439 }, { "epoch": 0.3516767676767677, "grad_norm": 0.06733645498752594, "learning_rate": 0.0001942794908189878, "loss": 0.0902, "step": 5440 }, { "epoch": 0.3516767676767677, "eval_bleu": 18.872025551070507, "eval_loss": 0.08681249618530273, "eval_runtime": 2.76, "eval_samples_per_second": 11.594, "eval_steps_per_second": 1.449, "step": 5440 }, { "epoch": 0.35174141414141413, "grad_norm": 0.06773844361305237, "learning_rate": 0.00019427721078589248, "loss": 0.0801, "step": 5441 }, { "epoch": 0.3518060606060606, "grad_norm": 0.08086249232292175, "learning_rate": 0.00019427493031189377, "loss": 0.1055, "step": 5442 }, { "epoch": 0.35187070707070706, "grad_norm": 0.06608313322067261, "learning_rate": 0.0001942726493970024, "loss": 0.0933, "step": 5443 }, { "epoch": 0.35193535353535355, "grad_norm": 0.06522857397794724, "learning_rate": 0.00019427036804122903, "loss": 0.0657, "step": 5444 }, { "epoch": 0.352, "grad_norm": 0.06952293962240219, "learning_rate": 0.0001942680862445843, "loss": 0.083, "step": 5445 }, { "epoch": 0.35206464646464647, "grad_norm": 0.0664205327630043, "learning_rate": 0.0001942658040070789, "loss": 0.1018, "step": 5446 }, { "epoch": 0.3521292929292929, "grad_norm": 0.06659642606973648, "learning_rate": 0.00019426352132872348, "loss": 0.0894, "step": 5447 }, { "epoch": 0.3521939393939394, "grad_norm": 0.11545795202255249, "learning_rate": 0.00019426123820952875, "loss": 0.0842, "step": 5448 }, { "epoch": 0.3522585858585859, "grad_norm": 0.05927438661456108, "learning_rate": 0.00019425895464950536, "loss": 0.0724, "step": 5449 }, { "epoch": 0.3523232323232323, "grad_norm": 0.0797869861125946, "learning_rate": 0.000194256670648664, "loss": 0.1062, "step": 5450 }, { "epoch": 0.3523878787878788, "grad_norm": 0.06599069386720657, "learning_rate": 0.00019425438620701538, "loss": 0.097, "step": 5451 }, { "epoch": 0.35245252525252524, "grad_norm": 0.06288129091262817, "learning_rate": 
0.00019425210132457014, "loss": 0.0903, "step": 5452 }, { "epoch": 0.35251717171717173, "grad_norm": 0.07003501802682877, "learning_rate": 0.00019424981600133897, "loss": 0.0869, "step": 5453 }, { "epoch": 0.35258181818181816, "grad_norm": 0.07866848260164261, "learning_rate": 0.0001942475302373326, "loss": 0.1039, "step": 5454 }, { "epoch": 0.35264646464646465, "grad_norm": 0.0650615468621254, "learning_rate": 0.00019424524403256166, "loss": 0.0852, "step": 5455 }, { "epoch": 0.3527111111111111, "grad_norm": 0.06603934615850449, "learning_rate": 0.00019424295738703688, "loss": 0.0868, "step": 5456 }, { "epoch": 0.3527111111111111, "eval_bleu": 19.03451031854968, "eval_loss": 0.08721117675304413, "eval_runtime": 2.7569, "eval_samples_per_second": 11.607, "eval_steps_per_second": 1.451, "step": 5456 }, { "epoch": 0.3527757575757576, "grad_norm": 0.06235654652118683, "learning_rate": 0.00019424067030076892, "loss": 0.0903, "step": 5457 }, { "epoch": 0.35284040404040407, "grad_norm": 0.07522473484277725, "learning_rate": 0.00019423838277376853, "loss": 0.0853, "step": 5458 }, { "epoch": 0.3529050505050505, "grad_norm": 0.05301303043961525, "learning_rate": 0.0001942360948060464, "loss": 0.0799, "step": 5459 }, { "epoch": 0.352969696969697, "grad_norm": 0.0651654452085495, "learning_rate": 0.00019423380639761316, "loss": 0.0891, "step": 5460 }, { "epoch": 0.3530343434343434, "grad_norm": 0.07389476150274277, "learning_rate": 0.00019423151754847958, "loss": 0.0707, "step": 5461 }, { "epoch": 0.3530989898989899, "grad_norm": 0.059685174375772476, "learning_rate": 0.00019422922825865633, "loss": 0.0796, "step": 5462 }, { "epoch": 0.35316363636363635, "grad_norm": 0.06934983283281326, "learning_rate": 0.00019422693852815415, "loss": 0.0891, "step": 5463 }, { "epoch": 0.35322828282828284, "grad_norm": 0.07016516476869583, "learning_rate": 0.00019422464835698372, "loss": 0.0991, "step": 5464 }, { "epoch": 0.35329292929292927, "grad_norm": 0.06078748404979706, 
"learning_rate": 0.00019422235774515576, "loss": 0.0847, "step": 5465 }, { "epoch": 0.35335757575757576, "grad_norm": 0.06731028854846954, "learning_rate": 0.000194220066692681, "loss": 0.0824, "step": 5466 }, { "epoch": 0.35342222222222225, "grad_norm": 0.06212714686989784, "learning_rate": 0.00019421777519957012, "loss": 0.0851, "step": 5467 }, { "epoch": 0.3534868686868687, "grad_norm": 0.06787095218896866, "learning_rate": 0.00019421548326583384, "loss": 0.0772, "step": 5468 }, { "epoch": 0.35355151515151517, "grad_norm": 0.06840930879116058, "learning_rate": 0.00019421319089148292, "loss": 0.0963, "step": 5469 }, { "epoch": 0.3536161616161616, "grad_norm": 0.06353110074996948, "learning_rate": 0.00019421089807652805, "loss": 0.0828, "step": 5470 }, { "epoch": 0.3536808080808081, "grad_norm": 0.07223305851221085, "learning_rate": 0.00019420860482097997, "loss": 0.0979, "step": 5471 }, { "epoch": 0.35374545454545453, "grad_norm": 0.06589917093515396, "learning_rate": 0.00019420631112484932, "loss": 0.0926, "step": 5472 }, { "epoch": 0.35374545454545453, "eval_bleu": 17.926759810338705, "eval_loss": 0.08764360845088959, "eval_runtime": 2.6915, "eval_samples_per_second": 11.889, "eval_steps_per_second": 1.486, "step": 5472 }, { "epoch": 0.353810101010101, "grad_norm": 0.06246504187583923, "learning_rate": 0.00019420401698814693, "loss": 0.0822, "step": 5473 }, { "epoch": 0.35387474747474745, "grad_norm": 0.06094297021627426, "learning_rate": 0.00019420172241088355, "loss": 0.0706, "step": 5474 }, { "epoch": 0.35393939393939394, "grad_norm": 0.06563259661197662, "learning_rate": 0.0001941994273930698, "loss": 0.0969, "step": 5475 }, { "epoch": 0.3540040404040404, "grad_norm": 0.05963452160358429, "learning_rate": 0.0001941971319347165, "loss": 0.0839, "step": 5476 }, { "epoch": 0.35406868686868687, "grad_norm": 0.06400375068187714, "learning_rate": 0.00019419483603583433, "loss": 0.089, "step": 5477 }, { "epoch": 0.35413333333333336, "grad_norm": 
0.06819913536310196, "learning_rate": 0.00019419253969643404, "loss": 0.0919, "step": 5478 }, { "epoch": 0.3541979797979798, "grad_norm": 0.06493015587329865, "learning_rate": 0.00019419024291652642, "loss": 0.0889, "step": 5479 }, { "epoch": 0.3542626262626263, "grad_norm": 0.0692656859755516, "learning_rate": 0.00019418794569612215, "loss": 0.0792, "step": 5480 }, { "epoch": 0.3543272727272727, "grad_norm": 0.06938996911048889, "learning_rate": 0.00019418564803523201, "loss": 0.0866, "step": 5481 }, { "epoch": 0.3543919191919192, "grad_norm": 0.07000883668661118, "learning_rate": 0.00019418334993386672, "loss": 0.1013, "step": 5482 }, { "epoch": 0.35445656565656564, "grad_norm": 0.06489119678735733, "learning_rate": 0.00019418105139203703, "loss": 0.0865, "step": 5483 }, { "epoch": 0.3545212121212121, "grad_norm": 0.07101009786128998, "learning_rate": 0.0001941787524097537, "loss": 0.1024, "step": 5484 }, { "epoch": 0.35458585858585856, "grad_norm": 0.07344099879264832, "learning_rate": 0.0001941764529870275, "loss": 0.097, "step": 5485 }, { "epoch": 0.35465050505050505, "grad_norm": 0.060479436069726944, "learning_rate": 0.00019417415312386916, "loss": 0.0773, "step": 5486 }, { "epoch": 0.35471515151515154, "grad_norm": 0.07052353024482727, "learning_rate": 0.0001941718528202894, "loss": 0.091, "step": 5487 }, { "epoch": 0.354779797979798, "grad_norm": 0.06919814646244049, "learning_rate": 0.00019416955207629907, "loss": 0.087, "step": 5488 }, { "epoch": 0.354779797979798, "eval_bleu": 19.488286470696313, "eval_loss": 0.08756741881370544, "eval_runtime": 2.7773, "eval_samples_per_second": 11.522, "eval_steps_per_second": 1.44, "step": 5488 }, { "epoch": 0.35484444444444446, "grad_norm": 0.0689169391989708, "learning_rate": 0.0001941672508919089, "loss": 0.0942, "step": 5489 }, { "epoch": 0.3549090909090909, "grad_norm": 0.08936396986246109, "learning_rate": 0.00019416494926712956, "loss": 0.0952, "step": 5490 }, { "epoch": 0.3549737373737374, "grad_norm": 
0.06837047636508942, "learning_rate": 0.0001941626472019719, "loss": 0.0942, "step": 5491 }, { "epoch": 0.3550383838383838, "grad_norm": 0.059870097786188126, "learning_rate": 0.00019416034469644665, "loss": 0.0753, "step": 5492 }, { "epoch": 0.3551030303030303, "grad_norm": 0.07562560588121414, "learning_rate": 0.00019415804175056464, "loss": 0.1007, "step": 5493 }, { "epoch": 0.35516767676767674, "grad_norm": 0.0628628358244896, "learning_rate": 0.0001941557383643366, "loss": 0.0805, "step": 5494 }, { "epoch": 0.35523232323232323, "grad_norm": 0.07205650955438614, "learning_rate": 0.00019415343453777327, "loss": 0.0898, "step": 5495 }, { "epoch": 0.3552969696969697, "grad_norm": 0.06748083978891373, "learning_rate": 0.00019415113027088546, "loss": 0.0843, "step": 5496 }, { "epoch": 0.35536161616161616, "grad_norm": 0.07686375081539154, "learning_rate": 0.00019414882556368396, "loss": 0.1022, "step": 5497 }, { "epoch": 0.35542626262626265, "grad_norm": 0.06201009079813957, "learning_rate": 0.0001941465204161795, "loss": 0.0829, "step": 5498 }, { "epoch": 0.3554909090909091, "grad_norm": 0.06823250651359558, "learning_rate": 0.00019414421482838288, "loss": 0.0877, "step": 5499 }, { "epoch": 0.35555555555555557, "grad_norm": 0.06517871469259262, "learning_rate": 0.00019414190880030492, "loss": 0.0972, "step": 5500 }, { "epoch": 0.355620202020202, "grad_norm": 0.07965034991502762, "learning_rate": 0.00019413960233195633, "loss": 0.0867, "step": 5501 }, { "epoch": 0.3556848484848485, "grad_norm": 0.07561900466680527, "learning_rate": 0.000194137295423348, "loss": 0.0972, "step": 5502 }, { "epoch": 0.3557494949494949, "grad_norm": 0.0664999783039093, "learning_rate": 0.00019413498807449063, "loss": 0.077, "step": 5503 }, { "epoch": 0.3558141414141414, "grad_norm": 0.07070228457450867, "learning_rate": 0.00019413268028539506, "loss": 0.0843, "step": 5504 }, { "epoch": 0.3558141414141414, "eval_bleu": 17.377045009150795, "eval_loss": 0.08750738203525543, "eval_runtime": 
2.7082, "eval_samples_per_second": 11.816, "eval_steps_per_second": 1.477, "step": 5504 }, { "epoch": 0.3558787878787879, "grad_norm": 0.0830061212182045, "learning_rate": 0.00019413037205607205, "loss": 0.1143, "step": 5505 }, { "epoch": 0.35594343434343434, "grad_norm": 0.06037771701812744, "learning_rate": 0.0001941280633865324, "loss": 0.0858, "step": 5506 }, { "epoch": 0.35600808080808083, "grad_norm": 0.06625869870185852, "learning_rate": 0.00019412575427678693, "loss": 0.0885, "step": 5507 }, { "epoch": 0.35607272727272726, "grad_norm": 0.05788028612732887, "learning_rate": 0.00019412344472684643, "loss": 0.0663, "step": 5508 }, { "epoch": 0.35613737373737375, "grad_norm": 0.05700523778796196, "learning_rate": 0.00019412113473672168, "loss": 0.0797, "step": 5509 }, { "epoch": 0.3562020202020202, "grad_norm": 0.06845907866954803, "learning_rate": 0.0001941188243064235, "loss": 0.0868, "step": 5510 }, { "epoch": 0.3562666666666667, "grad_norm": 0.07724369317293167, "learning_rate": 0.0001941165134359627, "loss": 0.0973, "step": 5511 }, { "epoch": 0.3563313131313131, "grad_norm": 0.060447417199611664, "learning_rate": 0.00019411420212535005, "loss": 0.0779, "step": 5512 }, { "epoch": 0.3563959595959596, "grad_norm": 0.06893841177225113, "learning_rate": 0.00019411189037459644, "loss": 0.0904, "step": 5513 }, { "epoch": 0.35646060606060603, "grad_norm": 0.07802202552556992, "learning_rate": 0.0001941095781837126, "loss": 0.094, "step": 5514 }, { "epoch": 0.3565252525252525, "grad_norm": 0.07639066874980927, "learning_rate": 0.00019410726555270938, "loss": 0.0876, "step": 5515 }, { "epoch": 0.356589898989899, "grad_norm": 0.07376755774021149, "learning_rate": 0.00019410495248159762, "loss": 0.0998, "step": 5516 }, { "epoch": 0.35665454545454545, "grad_norm": 0.07825583219528198, "learning_rate": 0.00019410263897038808, "loss": 0.0966, "step": 5517 }, { "epoch": 0.35671919191919194, "grad_norm": 0.07115195691585541, "learning_rate": 0.00019410032501909158, "loss": 
0.0831, "step": 5518 }, { "epoch": 0.35678383838383837, "grad_norm": 0.06978830695152283, "learning_rate": 0.00019409801062771903, "loss": 0.0933, "step": 5519 }, { "epoch": 0.35684848484848486, "grad_norm": 0.0728168934583664, "learning_rate": 0.00019409569579628116, "loss": 0.0985, "step": 5520 }, { "epoch": 0.35684848484848486, "eval_bleu": 20.311779341908164, "eval_loss": 0.0883578211069107, "eval_runtime": 2.6889, "eval_samples_per_second": 11.901, "eval_steps_per_second": 1.488, "step": 5520 }, { "epoch": 0.3569131313131313, "grad_norm": 0.07625593990087509, "learning_rate": 0.00019409338052478882, "loss": 0.0929, "step": 5521 }, { "epoch": 0.3569777777777778, "grad_norm": 0.07069002836942673, "learning_rate": 0.00019409106481325285, "loss": 0.0906, "step": 5522 }, { "epoch": 0.3570424242424242, "grad_norm": 0.0653168335556984, "learning_rate": 0.00019408874866168408, "loss": 0.0888, "step": 5523 }, { "epoch": 0.3571070707070707, "grad_norm": 0.06413861364126205, "learning_rate": 0.00019408643207009332, "loss": 0.0847, "step": 5524 }, { "epoch": 0.3571717171717172, "grad_norm": 0.06684031337499619, "learning_rate": 0.00019408411503849148, "loss": 0.098, "step": 5525 }, { "epoch": 0.35723636363636363, "grad_norm": 0.05365851894021034, "learning_rate": 0.0001940817975668893, "loss": 0.0731, "step": 5526 }, { "epoch": 0.3573010101010101, "grad_norm": 0.05229535326361656, "learning_rate": 0.00019407947965529765, "loss": 0.0755, "step": 5527 }, { "epoch": 0.35736565656565655, "grad_norm": 0.0634867325425148, "learning_rate": 0.00019407716130372737, "loss": 0.0949, "step": 5528 }, { "epoch": 0.35743030303030304, "grad_norm": 0.06671031564474106, "learning_rate": 0.0001940748425121893, "loss": 0.0928, "step": 5529 }, { "epoch": 0.3574949494949495, "grad_norm": 0.06657414883375168, "learning_rate": 0.00019407252328069432, "loss": 0.0918, "step": 5530 }, { "epoch": 0.35755959595959597, "grad_norm": 0.07135016471147537, "learning_rate": 0.00019407020360925325, "loss": 
0.086, "step": 5531 }, { "epoch": 0.3576242424242424, "grad_norm": 0.06159621849656105, "learning_rate": 0.00019406788349787693, "loss": 0.0867, "step": 5532 }, { "epoch": 0.3576888888888889, "grad_norm": 0.07762881368398666, "learning_rate": 0.0001940655629465762, "loss": 0.0989, "step": 5533 }, { "epoch": 0.3577535353535354, "grad_norm": 0.06145229563117027, "learning_rate": 0.00019406324195536193, "loss": 0.0812, "step": 5534 }, { "epoch": 0.3578181818181818, "grad_norm": 0.07012417912483215, "learning_rate": 0.00019406092052424501, "loss": 0.096, "step": 5535 }, { "epoch": 0.3578828282828283, "grad_norm": 0.05990273877978325, "learning_rate": 0.00019405859865323621, "loss": 0.0821, "step": 5536 }, { "epoch": 0.3578828282828283, "eval_bleu": 20.31956883010998, "eval_loss": 0.08905403316020966, "eval_runtime": 2.7132, "eval_samples_per_second": 11.794, "eval_steps_per_second": 1.474, "step": 5536 }, { "epoch": 0.35794747474747474, "grad_norm": 0.06196853518486023, "learning_rate": 0.00019405627634234648, "loss": 0.0833, "step": 5537 }, { "epoch": 0.3580121212121212, "grad_norm": 0.09519463032484055, "learning_rate": 0.00019405395359158663, "loss": 0.0875, "step": 5538 }, { "epoch": 0.35807676767676766, "grad_norm": 0.06967026740312576, "learning_rate": 0.00019405163040096753, "loss": 0.089, "step": 5539 }, { "epoch": 0.35814141414141415, "grad_norm": 0.06125793606042862, "learning_rate": 0.00019404930677050005, "loss": 0.0912, "step": 5540 }, { "epoch": 0.3582060606060606, "grad_norm": 0.058435507118701935, "learning_rate": 0.00019404698270019503, "loss": 0.0767, "step": 5541 }, { "epoch": 0.3582707070707071, "grad_norm": 0.06635106354951859, "learning_rate": 0.0001940446581900634, "loss": 0.0977, "step": 5542 }, { "epoch": 0.35833535353535356, "grad_norm": 0.06896837800741196, "learning_rate": 0.00019404233324011596, "loss": 0.0923, "step": 5543 }, { "epoch": 0.3584, "grad_norm": 0.06676781922578812, "learning_rate": 0.00019404000785036365, "loss": 0.0801, 
"step": 5544 }, { "epoch": 0.3584646464646465, "grad_norm": 0.05789535120129585, "learning_rate": 0.0001940376820208173, "loss": 0.0772, "step": 5545 }, { "epoch": 0.3585292929292929, "grad_norm": 0.07204492390155792, "learning_rate": 0.0001940353557514878, "loss": 0.0952, "step": 5546 }, { "epoch": 0.3585939393939394, "grad_norm": 0.061950404196977615, "learning_rate": 0.000194033029042386, "loss": 0.0923, "step": 5547 }, { "epoch": 0.35865858585858584, "grad_norm": 0.07145895063877106, "learning_rate": 0.00019403070189352283, "loss": 0.0983, "step": 5548 }, { "epoch": 0.35872323232323233, "grad_norm": 0.07266911119222641, "learning_rate": 0.00019402837430490918, "loss": 0.0965, "step": 5549 }, { "epoch": 0.35878787878787877, "grad_norm": 0.07719830423593521, "learning_rate": 0.00019402604627655589, "loss": 0.1027, "step": 5550 }, { "epoch": 0.35885252525252526, "grad_norm": 0.06412225216627121, "learning_rate": 0.00019402371780847386, "loss": 0.0765, "step": 5551 }, { "epoch": 0.3589171717171717, "grad_norm": 0.061375316232442856, "learning_rate": 0.00019402138890067398, "loss": 0.081, "step": 5552 }, { "epoch": 0.3589171717171717, "eval_bleu": 17.761025023398123, "eval_loss": 0.08948462456464767, "eval_runtime": 2.8886, "eval_samples_per_second": 11.078, "eval_steps_per_second": 1.385, "step": 5552 }, { "epoch": 0.3589818181818182, "grad_norm": 0.06001342833042145, "learning_rate": 0.00019401905955316714, "loss": 0.077, "step": 5553 }, { "epoch": 0.35904646464646467, "grad_norm": 0.0778547152876854, "learning_rate": 0.00019401672976596427, "loss": 0.109, "step": 5554 }, { "epoch": 0.3591111111111111, "grad_norm": 0.06440649926662445, "learning_rate": 0.00019401439953907622, "loss": 0.0846, "step": 5555 }, { "epoch": 0.3591757575757576, "grad_norm": 0.0722447857260704, "learning_rate": 0.00019401206887251392, "loss": 0.098, "step": 5556 }, { "epoch": 0.359240404040404, "grad_norm": 0.06823404133319855, "learning_rate": 0.00019400973776628823, "loss": 0.0872, 
"step": 5557 }, { "epoch": 0.3593050505050505, "grad_norm": 0.0654689222574234, "learning_rate": 0.00019400740622041007, "loss": 0.0825, "step": 5558 }, { "epoch": 0.35936969696969695, "grad_norm": 0.06266242265701294, "learning_rate": 0.00019400507423489037, "loss": 0.0909, "step": 5559 }, { "epoch": 0.35943434343434344, "grad_norm": 0.06566400825977325, "learning_rate": 0.00019400274180974, "loss": 0.084, "step": 5560 }, { "epoch": 0.3594989898989899, "grad_norm": 0.06436508148908615, "learning_rate": 0.0001940004089449699, "loss": 0.0874, "step": 5561 }, { "epoch": 0.35956363636363636, "grad_norm": 0.06844916939735413, "learning_rate": 0.00019399807564059098, "loss": 0.0932, "step": 5562 }, { "epoch": 0.35962828282828285, "grad_norm": 0.058435652405023575, "learning_rate": 0.0001939957418966141, "loss": 0.0822, "step": 5563 }, { "epoch": 0.3596929292929293, "grad_norm": 0.06185316666960716, "learning_rate": 0.00019399340771305022, "loss": 0.086, "step": 5564 }, { "epoch": 0.3597575757575758, "grad_norm": 0.06736771017313004, "learning_rate": 0.00019399107308991026, "loss": 0.1004, "step": 5565 }, { "epoch": 0.3598222222222222, "grad_norm": 0.059196654707193375, "learning_rate": 0.0001939887380272051, "loss": 0.0762, "step": 5566 }, { "epoch": 0.3598868686868687, "grad_norm": 0.06443962454795837, "learning_rate": 0.0001939864025249457, "loss": 0.0905, "step": 5567 }, { "epoch": 0.35995151515151513, "grad_norm": 0.068876713514328, "learning_rate": 0.00019398406658314293, "loss": 0.1035, "step": 5568 }, { "epoch": 0.35995151515151513, "eval_bleu": 19.771463951881678, "eval_loss": 0.08929599821567535, "eval_runtime": 2.5713, "eval_samples_per_second": 12.445, "eval_steps_per_second": 1.556, "step": 5568 }, { "epoch": 0.3600161616161616, "grad_norm": 0.05447925254702568, "learning_rate": 0.0001939817302018078, "loss": 0.0722, "step": 5569 }, { "epoch": 0.36008080808080806, "grad_norm": 0.05446496605873108, "learning_rate": 0.00019397939338095118, "loss": 0.0628, 
"step": 5570 }, { "epoch": 0.36014545454545455, "grad_norm": 0.0639272928237915, "learning_rate": 0.000193977056120584, "loss": 0.0853, "step": 5571 }, { "epoch": 0.36021010101010104, "grad_norm": 0.061772074550390244, "learning_rate": 0.0001939747184207172, "loss": 0.0883, "step": 5572 }, { "epoch": 0.36027474747474747, "grad_norm": 0.0636611133813858, "learning_rate": 0.00019397238028136168, "loss": 0.0823, "step": 5573 }, { "epoch": 0.36033939393939396, "grad_norm": 0.0672069862484932, "learning_rate": 0.00019397004170252844, "loss": 0.0894, "step": 5574 }, { "epoch": 0.3604040404040404, "grad_norm": 0.058522388339042664, "learning_rate": 0.00019396770268422837, "loss": 0.0778, "step": 5575 }, { "epoch": 0.3604686868686869, "grad_norm": 0.07961131632328033, "learning_rate": 0.0001939653632264724, "loss": 0.0932, "step": 5576 }, { "epoch": 0.3605333333333333, "grad_norm": 0.0683751031756401, "learning_rate": 0.00019396302332927152, "loss": 0.0934, "step": 5577 }, { "epoch": 0.3605979797979798, "grad_norm": 0.11096221953630447, "learning_rate": 0.00019396068299263664, "loss": 0.0972, "step": 5578 }, { "epoch": 0.36066262626262624, "grad_norm": 0.08672254532575607, "learning_rate": 0.00019395834221657866, "loss": 0.0994, "step": 5579 }, { "epoch": 0.36072727272727273, "grad_norm": 0.06957241147756577, "learning_rate": 0.00019395600100110866, "loss": 0.0975, "step": 5580 }, { "epoch": 0.3607919191919192, "grad_norm": 0.07424795627593994, "learning_rate": 0.00019395365934623743, "loss": 0.0881, "step": 5581 }, { "epoch": 0.36085656565656565, "grad_norm": 0.06000884994864464, "learning_rate": 0.00019395131725197603, "loss": 0.0811, "step": 5582 }, { "epoch": 0.36092121212121214, "grad_norm": 0.06227359175682068, "learning_rate": 0.0001939489747183354, "loss": 0.0813, "step": 5583 }, { "epoch": 0.3609858585858586, "grad_norm": 0.05674334615468979, "learning_rate": 0.00019394663174532642, "loss": 0.0802, "step": 5584 }, { "epoch": 0.3609858585858586, "eval_bleu": 
17.128924144916997, "eval_loss": 0.08911283314228058, "eval_runtime": 2.7417, "eval_samples_per_second": 11.671, "eval_steps_per_second": 1.459, "step": 5584 }, { "epoch": 0.36105050505050507, "grad_norm": 0.05663229152560234, "learning_rate": 0.0001939442883329601, "loss": 0.0679, "step": 5585 }, { "epoch": 0.3611151515151515, "grad_norm": 0.0633869618177414, "learning_rate": 0.00019394194448124746, "loss": 0.0826, "step": 5586 }, { "epoch": 0.361179797979798, "grad_norm": 0.08226557075977325, "learning_rate": 0.00019393960019019937, "loss": 0.0843, "step": 5587 }, { "epoch": 0.3612444444444444, "grad_norm": 0.056243862956762314, "learning_rate": 0.00019393725545982683, "loss": 0.0768, "step": 5588 }, { "epoch": 0.3613090909090909, "grad_norm": 0.06582501530647278, "learning_rate": 0.0001939349102901408, "loss": 0.1003, "step": 5589 }, { "epoch": 0.36137373737373735, "grad_norm": 0.06641250103712082, "learning_rate": 0.00019393256468115227, "loss": 0.1033, "step": 5590 }, { "epoch": 0.36143838383838384, "grad_norm": 0.058906279504299164, "learning_rate": 0.00019393021863287215, "loss": 0.0822, "step": 5591 }, { "epoch": 0.3615030303030303, "grad_norm": 0.06652873009443283, "learning_rate": 0.00019392787214531147, "loss": 0.0885, "step": 5592 }, { "epoch": 0.36156767676767676, "grad_norm": 0.06152445450425148, "learning_rate": 0.00019392552521848119, "loss": 0.0807, "step": 5593 }, { "epoch": 0.36163232323232325, "grad_norm": 0.06259158998727798, "learning_rate": 0.00019392317785239227, "loss": 0.0885, "step": 5594 }, { "epoch": 0.3616969696969697, "grad_norm": 0.05345216393470764, "learning_rate": 0.00019392083004705568, "loss": 0.0679, "step": 5595 }, { "epoch": 0.3617616161616162, "grad_norm": 0.06571955978870392, "learning_rate": 0.00019391848180248244, "loss": 0.0926, "step": 5596 }, { "epoch": 0.3618262626262626, "grad_norm": 0.07138384133577347, "learning_rate": 0.00019391613311868353, "loss": 0.0981, "step": 5597 }, { "epoch": 0.3618909090909091, 
"grad_norm": 0.05178312212228775, "learning_rate": 0.00019391378399566986, "loss": 0.0646, "step": 5598 }, { "epoch": 0.36195555555555553, "grad_norm": 0.05839163437485695, "learning_rate": 0.0001939114344334525, "loss": 0.0768, "step": 5599 }, { "epoch": 0.362020202020202, "grad_norm": 0.06297080963850021, "learning_rate": 0.00019390908443204242, "loss": 0.0781, "step": 5600 }, { "epoch": 0.362020202020202, "eval_bleu": 18.02319233638908, "eval_loss": 0.09004559367895126, "eval_runtime": 2.7153, "eval_samples_per_second": 11.785, "eval_steps_per_second": 1.473, "step": 5600 }, { "epoch": 0.3620848484848485, "grad_norm": 0.07369516789913177, "learning_rate": 0.00019390673399145058, "loss": 0.0972, "step": 5601 }, { "epoch": 0.36214949494949494, "grad_norm": 0.05596242845058441, "learning_rate": 0.00019390438311168802, "loss": 0.0803, "step": 5602 }, { "epoch": 0.36221414141414143, "grad_norm": 0.0625949278473854, "learning_rate": 0.00019390203179276566, "loss": 0.0926, "step": 5603 }, { "epoch": 0.36227878787878787, "grad_norm": 0.07207189500331879, "learning_rate": 0.00019389968003469455, "loss": 0.092, "step": 5604 }, { "epoch": 0.36234343434343436, "grad_norm": 0.0705079436302185, "learning_rate": 0.00019389732783748568, "loss": 0.0954, "step": 5605 }, { "epoch": 0.3624080808080808, "grad_norm": 0.07419601827859879, "learning_rate": 0.00019389497520115007, "loss": 0.1048, "step": 5606 }, { "epoch": 0.3624727272727273, "grad_norm": 0.07409415394067764, "learning_rate": 0.0001938926221256987, "loss": 0.0804, "step": 5607 }, { "epoch": 0.3625373737373737, "grad_norm": 0.06886249035596848, "learning_rate": 0.00019389026861114255, "loss": 0.0857, "step": 5608 }, { "epoch": 0.3626020202020202, "grad_norm": 0.08513378351926804, "learning_rate": 0.00019388791465749269, "loss": 0.0847, "step": 5609 }, { "epoch": 0.3626666666666667, "grad_norm": 0.06550447642803192, "learning_rate": 0.00019388556026476005, "loss": 0.0909, "step": 5610 }, { "epoch": 0.3627313131313131, 
"grad_norm": 0.0711509957909584, "learning_rate": 0.0001938832054329557, "loss": 0.1032, "step": 5611 }, { "epoch": 0.3627959595959596, "grad_norm": 0.062147341668605804, "learning_rate": 0.00019388085016209066, "loss": 0.0843, "step": 5612 }, { "epoch": 0.36286060606060605, "grad_norm": 0.05619615688920021, "learning_rate": 0.0001938784944521759, "loss": 0.0696, "step": 5613 }, { "epoch": 0.36292525252525254, "grad_norm": 0.0628916546702385, "learning_rate": 0.00019387613830322247, "loss": 0.09, "step": 5614 }, { "epoch": 0.362989898989899, "grad_norm": 0.0602811835706234, "learning_rate": 0.00019387378171524134, "loss": 0.076, "step": 5615 }, { "epoch": 0.36305454545454546, "grad_norm": 0.06386128067970276, "learning_rate": 0.00019387142468824358, "loss": 0.1054, "step": 5616 }, { "epoch": 0.36305454545454546, "eval_bleu": 17.598700409560603, "eval_loss": 0.09069300442934036, "eval_runtime": 2.814, "eval_samples_per_second": 11.372, "eval_steps_per_second": 1.421, "step": 5616 }, { "epoch": 0.3631191919191919, "grad_norm": 0.06871649622917175, "learning_rate": 0.0001938690672222402, "loss": 0.1127, "step": 5617 }, { "epoch": 0.3631838383838384, "grad_norm": 0.06262233853340149, "learning_rate": 0.00019386670931724223, "loss": 0.0862, "step": 5618 }, { "epoch": 0.3632484848484849, "grad_norm": 0.05834294483065605, "learning_rate": 0.00019386435097326073, "loss": 0.0873, "step": 5619 }, { "epoch": 0.3633131313131313, "grad_norm": 0.05475153028964996, "learning_rate": 0.00019386199219030665, "loss": 0.0732, "step": 5620 }, { "epoch": 0.3633777777777778, "grad_norm": 0.05791473016142845, "learning_rate": 0.00019385963296839108, "loss": 0.0899, "step": 5621 }, { "epoch": 0.36344242424242423, "grad_norm": 0.05937204137444496, "learning_rate": 0.00019385727330752502, "loss": 0.0791, "step": 5622 }, { "epoch": 0.3635070707070707, "grad_norm": 0.05843328312039375, "learning_rate": 0.00019385491320771954, "loss": 0.0725, "step": 5623 }, { "epoch": 0.36357171717171716, 
"grad_norm": 0.06062363460659981, "learning_rate": 0.00019385255266898566, "loss": 0.0831, "step": 5624 }, { "epoch": 0.36363636363636365, "grad_norm": 0.06332118064165115, "learning_rate": 0.00019385019169133438, "loss": 0.0872, "step": 5625 }, { "epoch": 0.3637010101010101, "grad_norm": 0.059690430760383606, "learning_rate": 0.00019384783027477683, "loss": 0.0801, "step": 5626 }, { "epoch": 0.36376565656565657, "grad_norm": 0.07130637019872665, "learning_rate": 0.00019384546841932397, "loss": 0.0981, "step": 5627 }, { "epoch": 0.363830303030303, "grad_norm": 0.060029856860637665, "learning_rate": 0.00019384310612498688, "loss": 0.0829, "step": 5628 }, { "epoch": 0.3638949494949495, "grad_norm": 0.08032119274139404, "learning_rate": 0.0001938407433917766, "loss": 0.1235, "step": 5629 }, { "epoch": 0.363959595959596, "grad_norm": 0.08778087794780731, "learning_rate": 0.00019383838021970421, "loss": 0.0995, "step": 5630 }, { "epoch": 0.3640242424242424, "grad_norm": 0.06337865442037582, "learning_rate": 0.00019383601660878075, "loss": 0.0962, "step": 5631 }, { "epoch": 0.3640888888888889, "grad_norm": 0.05765356868505478, "learning_rate": 0.0001938336525590172, "loss": 0.0703, "step": 5632 }, { "epoch": 0.3640888888888889, "eval_bleu": 19.77775186044897, "eval_loss": 0.09059233218431473, "eval_runtime": 2.7937, "eval_samples_per_second": 11.454, "eval_steps_per_second": 1.432, "step": 5632 }, { "epoch": 0.36415353535353534, "grad_norm": 0.055351532995700836, "learning_rate": 0.00019383128807042474, "loss": 0.0701, "step": 5633 }, { "epoch": 0.36421818181818183, "grad_norm": 0.06478390842676163, "learning_rate": 0.00019382892314301432, "loss": 0.0974, "step": 5634 }, { "epoch": 0.36428282828282826, "grad_norm": 0.06738635152578354, "learning_rate": 0.00019382655777679708, "loss": 0.0956, "step": 5635 }, { "epoch": 0.36434747474747475, "grad_norm": 0.05697416514158249, "learning_rate": 0.00019382419197178406, "loss": 0.083, "step": 5636 }, { "epoch": 
0.3644121212121212, "grad_norm": 0.15879900753498077, "learning_rate": 0.00019382182572798628, "loss": 0.1376, "step": 5637 }, { "epoch": 0.3644767676767677, "grad_norm": 0.07478918135166168, "learning_rate": 0.00019381945904541485, "loss": 0.0892, "step": 5638 }, { "epoch": 0.36454141414141417, "grad_norm": 0.052984561771154404, "learning_rate": 0.0001938170919240808, "loss": 0.0689, "step": 5639 }, { "epoch": 0.3646060606060606, "grad_norm": 0.06066984310746193, "learning_rate": 0.00019381472436399527, "loss": 0.0729, "step": 5640 }, { "epoch": 0.3646707070707071, "grad_norm": 0.060536809265613556, "learning_rate": 0.00019381235636516925, "loss": 0.0833, "step": 5641 }, { "epoch": 0.3647353535353535, "grad_norm": 0.07288076728582382, "learning_rate": 0.0001938099879276139, "loss": 0.0813, "step": 5642 }, { "epoch": 0.3648, "grad_norm": 0.07207860797643661, "learning_rate": 0.00019380761905134021, "loss": 0.1003, "step": 5643 }, { "epoch": 0.36486464646464645, "grad_norm": 0.06924431025981903, "learning_rate": 0.00019380524973635932, "loss": 0.0884, "step": 5644 }, { "epoch": 0.36492929292929294, "grad_norm": 0.06536194682121277, "learning_rate": 0.00019380287998268226, "loss": 0.0931, "step": 5645 }, { "epoch": 0.36499393939393937, "grad_norm": 0.07337052375078201, "learning_rate": 0.00019380050979032017, "loss": 0.0992, "step": 5646 }, { "epoch": 0.36505858585858586, "grad_norm": 0.054563749581575394, "learning_rate": 0.0001937981391592841, "loss": 0.0665, "step": 5647 }, { "epoch": 0.36512323232323235, "grad_norm": 0.06677048653364182, "learning_rate": 0.00019379576808958512, "loss": 0.0886, "step": 5648 }, { "epoch": 0.36512323232323235, "eval_bleu": 18.43849823340465, "eval_loss": 0.08969748020172119, "eval_runtime": 2.719, "eval_samples_per_second": 11.769, "eval_steps_per_second": 1.471, "step": 5648 }, { "epoch": 0.3651878787878788, "grad_norm": 0.08680686354637146, "learning_rate": 0.00019379339658123437, "loss": 0.0946, "step": 5649 }, { "epoch": 
0.3652525252525253, "grad_norm": 0.056637972593307495, "learning_rate": 0.00019379102463424288, "loss": 0.0742, "step": 5650 }, { "epoch": 0.3653171717171717, "grad_norm": 0.06648175418376923, "learning_rate": 0.0001937886522486218, "loss": 0.0939, "step": 5651 }, { "epoch": 0.3653818181818182, "grad_norm": 0.06271092593669891, "learning_rate": 0.0001937862794243822, "loss": 0.0781, "step": 5652 }, { "epoch": 0.36544646464646463, "grad_norm": 0.06412040442228317, "learning_rate": 0.00019378390616153516, "loss": 0.0944, "step": 5653 }, { "epoch": 0.3655111111111111, "grad_norm": 0.05281197652220726, "learning_rate": 0.0001937815324600918, "loss": 0.0714, "step": 5654 }, { "epoch": 0.36557575757575755, "grad_norm": 0.06602725386619568, "learning_rate": 0.00019377915832006324, "loss": 0.0973, "step": 5655 }, { "epoch": 0.36564040404040404, "grad_norm": 0.07501750439405441, "learning_rate": 0.00019377678374146053, "loss": 0.1037, "step": 5656 }, { "epoch": 0.3657050505050505, "grad_norm": 0.06797129660844803, "learning_rate": 0.00019377440872429483, "loss": 0.0826, "step": 5657 }, { "epoch": 0.36576969696969697, "grad_norm": 0.1106073334813118, "learning_rate": 0.0001937720332685772, "loss": 0.0854, "step": 5658 }, { "epoch": 0.36583434343434346, "grad_norm": 0.07007279247045517, "learning_rate": 0.00019376965737431877, "loss": 0.1047, "step": 5659 }, { "epoch": 0.3658989898989899, "grad_norm": 0.0631527379155159, "learning_rate": 0.00019376728104153067, "loss": 0.0826, "step": 5660 }, { "epoch": 0.3659636363636364, "grad_norm": 0.05771954730153084, "learning_rate": 0.00019376490427022396, "loss": 0.0749, "step": 5661 }, { "epoch": 0.3660282828282828, "grad_norm": 0.0732756033539772, "learning_rate": 0.00019376252706040982, "loss": 0.0856, "step": 5662 }, { "epoch": 0.3660929292929293, "grad_norm": 0.05554167181253433, "learning_rate": 0.00019376014941209935, "loss": 0.071, "step": 5663 }, { "epoch": 0.36615757575757574, "grad_norm": 0.0592421293258667, 
"learning_rate": 0.00019375777132530364, "loss": 0.0867, "step": 5664 }, { "epoch": 0.36615757575757574, "eval_bleu": 20.177642083134224, "eval_loss": 0.08883252739906311, "eval_runtime": 2.5295, "eval_samples_per_second": 12.651, "eval_steps_per_second": 1.581, "step": 5664 }, { "epoch": 0.3662222222222222, "grad_norm": 0.07217991352081299, "learning_rate": 0.00019375539280003384, "loss": 0.0896, "step": 5665 }, { "epoch": 0.36628686868686866, "grad_norm": 0.06669741868972778, "learning_rate": 0.00019375301383630101, "loss": 0.0902, "step": 5666 }, { "epoch": 0.36635151515151515, "grad_norm": 0.07359939068555832, "learning_rate": 0.0001937506344341164, "loss": 0.1017, "step": 5667 }, { "epoch": 0.36641616161616164, "grad_norm": 0.06427470594644547, "learning_rate": 0.00019374825459349105, "loss": 0.0849, "step": 5668 }, { "epoch": 0.3664808080808081, "grad_norm": 0.06490667164325714, "learning_rate": 0.00019374587431443606, "loss": 0.0869, "step": 5669 }, { "epoch": 0.36654545454545456, "grad_norm": 0.05909444019198418, "learning_rate": 0.00019374349359696264, "loss": 0.0854, "step": 5670 }, { "epoch": 0.366610101010101, "grad_norm": 0.061240822076797485, "learning_rate": 0.0001937411124410819, "loss": 0.0706, "step": 5671 }, { "epoch": 0.3666747474747475, "grad_norm": 0.07173585146665573, "learning_rate": 0.00019373873084680497, "loss": 0.0926, "step": 5672 }, { "epoch": 0.3667393939393939, "grad_norm": 0.0640130341053009, "learning_rate": 0.00019373634881414295, "loss": 0.0933, "step": 5673 }, { "epoch": 0.3668040404040404, "grad_norm": 0.059220850467681885, "learning_rate": 0.00019373396634310707, "loss": 0.0834, "step": 5674 }, { "epoch": 0.36686868686868684, "grad_norm": 0.058035317808389664, "learning_rate": 0.00019373158343370837, "loss": 0.0749, "step": 5675 }, { "epoch": 0.36693333333333333, "grad_norm": 0.06940995901823044, "learning_rate": 0.00019372920008595807, "loss": 0.0832, "step": 5676 }, { "epoch": 0.3669979797979798, "grad_norm": 
0.06371055543422699, "learning_rate": 0.0001937268162998673, "loss": 0.0876, "step": 5677 }, { "epoch": 0.36706262626262626, "grad_norm": 0.06505129486322403, "learning_rate": 0.00019372443207544716, "loss": 0.085, "step": 5678 }, { "epoch": 0.36712727272727275, "grad_norm": 0.06256277859210968, "learning_rate": 0.00019372204741270884, "loss": 0.0853, "step": 5679 }, { "epoch": 0.3671919191919192, "grad_norm": 0.06402939558029175, "learning_rate": 0.0001937196623116635, "loss": 0.0781, "step": 5680 }, { "epoch": 0.3671919191919192, "eval_bleu": 18.419506617171407, "eval_loss": 0.08876651525497437, "eval_runtime": 2.6259, "eval_samples_per_second": 12.186, "eval_steps_per_second": 1.523, "step": 5680 }, { "epoch": 0.36725656565656567, "grad_norm": 0.05896603688597679, "learning_rate": 0.00019371727677232228, "loss": 0.0798, "step": 5681 }, { "epoch": 0.3673212121212121, "grad_norm": 0.08227799087762833, "learning_rate": 0.0001937148907946963, "loss": 0.0844, "step": 5682 }, { "epoch": 0.3673858585858586, "grad_norm": 0.05968752130866051, "learning_rate": 0.0001937125043787968, "loss": 0.0787, "step": 5683 }, { "epoch": 0.367450505050505, "grad_norm": 0.0672236755490303, "learning_rate": 0.0001937101175246349, "loss": 0.0946, "step": 5684 }, { "epoch": 0.3675151515151515, "grad_norm": 0.06951837241649628, "learning_rate": 0.00019370773023222174, "loss": 0.1043, "step": 5685 }, { "epoch": 0.367579797979798, "grad_norm": 0.06972093135118484, "learning_rate": 0.00019370534250156852, "loss": 0.1108, "step": 5686 }, { "epoch": 0.36764444444444444, "grad_norm": 0.060509372502565384, "learning_rate": 0.00019370295433268639, "loss": 0.0843, "step": 5687 }, { "epoch": 0.36770909090909093, "grad_norm": 0.06284403800964355, "learning_rate": 0.00019370056572558654, "loss": 0.0717, "step": 5688 }, { "epoch": 0.36777373737373736, "grad_norm": 0.07253480702638626, "learning_rate": 0.0001936981766802801, "loss": 0.0898, "step": 5689 }, { "epoch": 0.36783838383838385, "grad_norm": 
0.06688518077135086, "learning_rate": 0.00019369578719677824, "loss": 0.0848, "step": 5690 }, { "epoch": 0.3679030303030303, "grad_norm": 0.06630069762468338, "learning_rate": 0.00019369339727509218, "loss": 0.0811, "step": 5691 }, { "epoch": 0.3679676767676768, "grad_norm": 0.06217240169644356, "learning_rate": 0.0001936910069152331, "loss": 0.0784, "step": 5692 }, { "epoch": 0.3680323232323232, "grad_norm": 0.07182607054710388, "learning_rate": 0.00019368861611721215, "loss": 0.0933, "step": 5693 }, { "epoch": 0.3680969696969697, "grad_norm": 0.06653238832950592, "learning_rate": 0.00019368622488104048, "loss": 0.0948, "step": 5694 }, { "epoch": 0.36816161616161613, "grad_norm": 0.06470491737127304, "learning_rate": 0.0001936838332067293, "loss": 0.0879, "step": 5695 }, { "epoch": 0.3682262626262626, "grad_norm": 0.05736684799194336, "learning_rate": 0.00019368144109428986, "loss": 0.0821, "step": 5696 }, { "epoch": 0.3682262626262626, "eval_bleu": 18.71993980098357, "eval_loss": 0.08756851404905319, "eval_runtime": 2.7125, "eval_samples_per_second": 11.797, "eval_steps_per_second": 1.475, "step": 5696 }, { "epoch": 0.3682909090909091, "grad_norm": 0.07400652766227722, "learning_rate": 0.00019367904854373326, "loss": 0.0918, "step": 5697 }, { "epoch": 0.36835555555555555, "grad_norm": 0.05849096551537514, "learning_rate": 0.0001936766555550707, "loss": 0.0797, "step": 5698 }, { "epoch": 0.36842020202020204, "grad_norm": 0.06075263023376465, "learning_rate": 0.0001936742621283134, "loss": 0.0757, "step": 5699 }, { "epoch": 0.36848484848484847, "grad_norm": 0.09375002235174179, "learning_rate": 0.00019367186826347256, "loss": 0.0952, "step": 5700 }, { "epoch": 0.36854949494949496, "grad_norm": 0.05690156668424606, "learning_rate": 0.00019366947396055938, "loss": 0.0778, "step": 5701 }, { "epoch": 0.3686141414141414, "grad_norm": 0.07727015018463135, "learning_rate": 0.00019366707921958498, "loss": 0.0825, "step": 5702 }, { "epoch": 0.3686787878787879, "grad_norm": 
0.07139743119478226, "learning_rate": 0.00019366468404056065, "loss": 0.0913, "step": 5703 }, { "epoch": 0.3687434343434343, "grad_norm": 0.07198897749185562, "learning_rate": 0.00019366228842349756, "loss": 0.1058, "step": 5704 }, { "epoch": 0.3688080808080808, "grad_norm": 0.05929672345519066, "learning_rate": 0.00019365989236840692, "loss": 0.0869, "step": 5705 }, { "epoch": 0.3688727272727273, "grad_norm": 0.10010726004838943, "learning_rate": 0.0001936574958752999, "loss": 0.0923, "step": 5706 }, { "epoch": 0.36893737373737373, "grad_norm": 0.07636507600545883, "learning_rate": 0.00019365509894418774, "loss": 0.0989, "step": 5707 }, { "epoch": 0.3690020202020202, "grad_norm": 0.064690500497818, "learning_rate": 0.00019365270157508166, "loss": 0.0959, "step": 5708 }, { "epoch": 0.36906666666666665, "grad_norm": 0.05702101066708565, "learning_rate": 0.00019365030376799285, "loss": 0.0734, "step": 5709 }, { "epoch": 0.36913131313131314, "grad_norm": 0.055124878883361816, "learning_rate": 0.00019364790552293251, "loss": 0.0736, "step": 5710 }, { "epoch": 0.3691959595959596, "grad_norm": 0.057364728301763535, "learning_rate": 0.00019364550683991188, "loss": 0.0812, "step": 5711 }, { "epoch": 0.36926060606060607, "grad_norm": 0.06574060022830963, "learning_rate": 0.0001936431077189422, "loss": 0.0774, "step": 5712 }, { "epoch": 0.36926060606060607, "eval_bleu": 20.004144801624868, "eval_loss": 0.08843722939491272, "eval_runtime": 2.7111, "eval_samples_per_second": 11.803, "eval_steps_per_second": 1.475, "step": 5712 }, { "epoch": 0.3693252525252525, "grad_norm": 0.061844974756240845, "learning_rate": 0.00019364070816003465, "loss": 0.0867, "step": 5713 }, { "epoch": 0.369389898989899, "grad_norm": 0.06636860966682434, "learning_rate": 0.00019363830816320046, "loss": 0.0975, "step": 5714 }, { "epoch": 0.3694545454545455, "grad_norm": 0.05617674067616463, "learning_rate": 0.00019363590772845086, "loss": 0.0778, "step": 5715 }, { "epoch": 0.3695191919191919, 
"grad_norm": 0.062299225479364395, "learning_rate": 0.00019363350685579707, "loss": 0.0972, "step": 5716 }, { "epoch": 0.3695838383838384, "grad_norm": 0.061435747891664505, "learning_rate": 0.00019363110554525035, "loss": 0.0774, "step": 5717 }, { "epoch": 0.36964848484848484, "grad_norm": 0.06668554991483688, "learning_rate": 0.00019362870379682184, "loss": 0.0944, "step": 5718 }, { "epoch": 0.3697131313131313, "grad_norm": 0.059491679072380066, "learning_rate": 0.0001936263016105229, "loss": 0.0859, "step": 5719 }, { "epoch": 0.36977777777777776, "grad_norm": 0.059779755771160126, "learning_rate": 0.00019362389898636468, "loss": 0.0773, "step": 5720 }, { "epoch": 0.36984242424242425, "grad_norm": 0.06789460778236389, "learning_rate": 0.00019362149592435843, "loss": 0.0892, "step": 5721 }, { "epoch": 0.3699070707070707, "grad_norm": 0.06469476222991943, "learning_rate": 0.0001936190924245154, "loss": 0.0936, "step": 5722 }, { "epoch": 0.3699717171717172, "grad_norm": 0.0696030706167221, "learning_rate": 0.00019361668848684682, "loss": 0.1007, "step": 5723 }, { "epoch": 0.37003636363636366, "grad_norm": 0.06550107151269913, "learning_rate": 0.00019361428411136396, "loss": 0.093, "step": 5724 }, { "epoch": 0.3701010101010101, "grad_norm": 0.07053610682487488, "learning_rate": 0.000193611879298078, "loss": 0.0993, "step": 5725 }, { "epoch": 0.3701656565656566, "grad_norm": 0.055764224380254745, "learning_rate": 0.00019360947404700024, "loss": 0.0623, "step": 5726 }, { "epoch": 0.370230303030303, "grad_norm": 0.0642828717827797, "learning_rate": 0.00019360706835814192, "loss": 0.0848, "step": 5727 }, { "epoch": 0.3702949494949495, "grad_norm": 0.06760226935148239, "learning_rate": 0.00019360466223151433, "loss": 0.0883, "step": 5728 }, { "epoch": 0.3702949494949495, "eval_bleu": 18.215200675976664, "eval_loss": 0.08861472457647324, "eval_runtime": 2.841, "eval_samples_per_second": 11.264, "eval_steps_per_second": 1.408, "step": 5728 }, { "epoch": 0.37035959595959594, 
"grad_norm": 0.05753682553768158, "learning_rate": 0.00019360225566712863, "loss": 0.0921, "step": 5729 }, { "epoch": 0.37042424242424243, "grad_norm": 0.05487910285592079, "learning_rate": 0.00019359984866499613, "loss": 0.072, "step": 5730 }, { "epoch": 0.37048888888888887, "grad_norm": 0.05709809437394142, "learning_rate": 0.00019359744122512807, "loss": 0.0749, "step": 5731 }, { "epoch": 0.37055353535353536, "grad_norm": 0.057215142995119095, "learning_rate": 0.00019359503334753574, "loss": 0.0899, "step": 5732 }, { "epoch": 0.3706181818181818, "grad_norm": 0.04492242634296417, "learning_rate": 0.0001935926250322304, "loss": 0.0599, "step": 5733 }, { "epoch": 0.3706828282828283, "grad_norm": 0.06557269394397736, "learning_rate": 0.00019359021627922326, "loss": 0.1056, "step": 5734 }, { "epoch": 0.37074747474747477, "grad_norm": 0.06246566027402878, "learning_rate": 0.00019358780708852562, "loss": 0.0681, "step": 5735 }, { "epoch": 0.3708121212121212, "grad_norm": 0.06089061498641968, "learning_rate": 0.00019358539746014877, "loss": 0.0896, "step": 5736 }, { "epoch": 0.3708767676767677, "grad_norm": 0.05685008689761162, "learning_rate": 0.00019358298739410393, "loss": 0.0748, "step": 5737 }, { "epoch": 0.3709414141414141, "grad_norm": 0.05879157781600952, "learning_rate": 0.00019358057689040242, "loss": 0.0843, "step": 5738 }, { "epoch": 0.3710060606060606, "grad_norm": 0.06788202375173569, "learning_rate": 0.00019357816594905546, "loss": 0.0854, "step": 5739 }, { "epoch": 0.37107070707070705, "grad_norm": 0.06666477769613266, "learning_rate": 0.0001935757545700744, "loss": 0.0993, "step": 5740 }, { "epoch": 0.37113535353535354, "grad_norm": 0.07352785021066666, "learning_rate": 0.00019357334275347043, "loss": 0.0739, "step": 5741 }, { "epoch": 0.3712, "grad_norm": 0.061948638409376144, "learning_rate": 0.00019357093049925488, "loss": 0.0811, "step": 5742 }, { "epoch": 0.37126464646464646, "grad_norm": 0.06688946485519409, "learning_rate": 
0.00019356851780743904, "loss": 0.0949, "step": 5743 }, { "epoch": 0.37132929292929295, "grad_norm": 0.06729866564273834, "learning_rate": 0.00019356610467803417, "loss": 0.0826, "step": 5744 }, { "epoch": 0.37132929292929295, "eval_bleu": 20.212971790459353, "eval_loss": 0.08940662443637848, "eval_runtime": 2.6845, "eval_samples_per_second": 11.92, "eval_steps_per_second": 1.49, "step": 5744 }, { "epoch": 0.3713939393939394, "grad_norm": 0.06624142825603485, "learning_rate": 0.00019356369111105153, "loss": 0.0917, "step": 5745 }, { "epoch": 0.3714585858585859, "grad_norm": 0.05723577365279198, "learning_rate": 0.00019356127710650246, "loss": 0.0818, "step": 5746 }, { "epoch": 0.3715232323232323, "grad_norm": 0.06462448835372925, "learning_rate": 0.00019355886266439823, "loss": 0.0972, "step": 5747 }, { "epoch": 0.3715878787878788, "grad_norm": 0.05594117194414139, "learning_rate": 0.00019355644778475012, "loss": 0.0713, "step": 5748 }, { "epoch": 0.37165252525252523, "grad_norm": 0.0599655844271183, "learning_rate": 0.00019355403246756946, "loss": 0.0819, "step": 5749 }, { "epoch": 0.3717171717171717, "grad_norm": 0.08354416489601135, "learning_rate": 0.0001935516167128675, "loss": 0.1157, "step": 5750 }, { "epoch": 0.37178181818181816, "grad_norm": 0.05443254113197327, "learning_rate": 0.00019354920052065553, "loss": 0.0685, "step": 5751 }, { "epoch": 0.37184646464646465, "grad_norm": 0.07267367839813232, "learning_rate": 0.00019354678389094492, "loss": 0.113, "step": 5752 }, { "epoch": 0.37191111111111114, "grad_norm": 0.08180012553930283, "learning_rate": 0.0001935443668237469, "loss": 0.0851, "step": 5753 }, { "epoch": 0.37197575757575757, "grad_norm": 0.06112773343920708, "learning_rate": 0.0001935419493190728, "loss": 0.0761, "step": 5754 }, { "epoch": 0.37204040404040406, "grad_norm": 0.06651456654071808, "learning_rate": 0.00019353953137693391, "loss": 0.082, "step": 5755 }, { "epoch": 0.3721050505050505, "grad_norm": 0.05613433197140694, "learning_rate": 
0.0001935371129973416, "loss": 0.0763, "step": 5756 }, { "epoch": 0.372169696969697, "grad_norm": 0.06606724858283997, "learning_rate": 0.00019353469418030713, "loss": 0.1005, "step": 5757 }, { "epoch": 0.3722343434343434, "grad_norm": 0.06023230776190758, "learning_rate": 0.00019353227492584178, "loss": 0.0864, "step": 5758 }, { "epoch": 0.3722989898989899, "grad_norm": 0.0646812692284584, "learning_rate": 0.00019352985523395696, "loss": 0.0989, "step": 5759 }, { "epoch": 0.37236363636363634, "grad_norm": 0.05664282292127609, "learning_rate": 0.00019352743510466387, "loss": 0.0766, "step": 5760 }, { "epoch": 0.37236363636363634, "eval_bleu": 18.698020856397463, "eval_loss": 0.08932101726531982, "eval_runtime": 2.8059, "eval_samples_per_second": 11.404, "eval_steps_per_second": 1.426, "step": 5760 }, { "epoch": 0.37242828282828283, "grad_norm": 0.060624562203884125, "learning_rate": 0.0001935250145379739, "loss": 0.0776, "step": 5761 }, { "epoch": 0.3724929292929293, "grad_norm": 0.0767112523317337, "learning_rate": 0.00019352259353389836, "loss": 0.0924, "step": 5762 }, { "epoch": 0.37255757575757575, "grad_norm": 0.06148288771510124, "learning_rate": 0.00019352017209244859, "loss": 0.0836, "step": 5763 }, { "epoch": 0.37262222222222224, "grad_norm": 0.06981132179498672, "learning_rate": 0.00019351775021363588, "loss": 0.1021, "step": 5764 }, { "epoch": 0.3726868686868687, "grad_norm": 0.06642654538154602, "learning_rate": 0.00019351532789747155, "loss": 0.0893, "step": 5765 }, { "epoch": 0.37275151515151517, "grad_norm": 0.07178309559822083, "learning_rate": 0.00019351290514396698, "loss": 0.0938, "step": 5766 }, { "epoch": 0.3728161616161616, "grad_norm": 0.05435482785105705, "learning_rate": 0.00019351048195313344, "loss": 0.0763, "step": 5767 }, { "epoch": 0.3728808080808081, "grad_norm": 0.06991015374660492, "learning_rate": 0.0001935080583249823, "loss": 0.1069, "step": 5768 }, { "epoch": 0.3729454545454545, "grad_norm": 0.05615806207060814, "learning_rate": 
0.00019350563425952488, "loss": 0.0752, "step": 5769 }, { "epoch": 0.373010101010101, "grad_norm": 0.06691889464855194, "learning_rate": 0.0001935032097567725, "loss": 0.1053, "step": 5770 }, { "epoch": 0.37307474747474745, "grad_norm": 0.06327222287654877, "learning_rate": 0.00019350078481673657, "loss": 0.0936, "step": 5771 }, { "epoch": 0.37313939393939394, "grad_norm": 0.054999105632305145, "learning_rate": 0.00019349835943942835, "loss": 0.0656, "step": 5772 }, { "epoch": 0.3732040404040404, "grad_norm": 0.0533236488699913, "learning_rate": 0.00019349593362485922, "loss": 0.0817, "step": 5773 }, { "epoch": 0.37326868686868686, "grad_norm": 0.056997254490852356, "learning_rate": 0.0001934935073730405, "loss": 0.0752, "step": 5774 }, { "epoch": 0.37333333333333335, "grad_norm": 0.05967039242386818, "learning_rate": 0.0001934910806839836, "loss": 0.0711, "step": 5775 }, { "epoch": 0.3733979797979798, "grad_norm": 0.06396369636058807, "learning_rate": 0.00019348865355769975, "loss": 0.0906, "step": 5776 }, { "epoch": 0.3733979797979798, "eval_bleu": 17.36647784659119, "eval_loss": 0.08873437345027924, "eval_runtime": 2.6506, "eval_samples_per_second": 12.073, "eval_steps_per_second": 1.509, "step": 5776 }, { "epoch": 0.3734626262626263, "grad_norm": 0.04841857776045799, "learning_rate": 0.00019348622599420044, "loss": 0.0649, "step": 5777 }, { "epoch": 0.3735272727272727, "grad_norm": 0.06399985402822495, "learning_rate": 0.00019348379799349693, "loss": 0.104, "step": 5778 }, { "epoch": 0.3735919191919192, "grad_norm": 0.05569527670741081, "learning_rate": 0.0001934813695556006, "loss": 0.0724, "step": 5779 }, { "epoch": 0.37365656565656563, "grad_norm": 0.09700875729322433, "learning_rate": 0.0001934789406805228, "loss": 0.0882, "step": 5780 }, { "epoch": 0.3737212121212121, "grad_norm": 0.058848850429058075, "learning_rate": 0.00019347651136827492, "loss": 0.0795, "step": 5781 }, { "epoch": 0.3737858585858586, "grad_norm": 0.07069579511880875, "learning_rate": 
0.00019347408161886828, "loss": 0.0933, "step": 5782 }, { "epoch": 0.37385050505050504, "grad_norm": 0.08152256906032562, "learning_rate": 0.00019347165143231428, "loss": 0.1032, "step": 5783 }, { "epoch": 0.37391515151515153, "grad_norm": 0.06210129335522652, "learning_rate": 0.00019346922080862427, "loss": 0.0822, "step": 5784 }, { "epoch": 0.37397979797979797, "grad_norm": 0.06792610138654709, "learning_rate": 0.0001934667897478096, "loss": 0.0894, "step": 5785 }, { "epoch": 0.37404444444444446, "grad_norm": 0.0622699148952961, "learning_rate": 0.00019346435824988165, "loss": 0.0845, "step": 5786 }, { "epoch": 0.3741090909090909, "grad_norm": 0.05856948345899582, "learning_rate": 0.0001934619263148518, "loss": 0.0762, "step": 5787 }, { "epoch": 0.3741737373737374, "grad_norm": 0.07709803432226181, "learning_rate": 0.0001934594939427314, "loss": 0.108, "step": 5788 }, { "epoch": 0.3742383838383838, "grad_norm": 0.07581794261932373, "learning_rate": 0.00019345706113353187, "loss": 0.0776, "step": 5789 }, { "epoch": 0.3743030303030303, "grad_norm": 0.06480509787797928, "learning_rate": 0.00019345462788726456, "loss": 0.0829, "step": 5790 }, { "epoch": 0.3743676767676768, "grad_norm": 0.06293786317110062, "learning_rate": 0.00019345219420394084, "loss": 0.0814, "step": 5791 }, { "epoch": 0.3744323232323232, "grad_norm": 0.06842003017663956, "learning_rate": 0.00019344976008357212, "loss": 0.0884, "step": 5792 }, { "epoch": 0.3744323232323232, "eval_bleu": 16.65872316132256, "eval_loss": 0.08977718651294708, "eval_runtime": 2.6607, "eval_samples_per_second": 12.027, "eval_steps_per_second": 1.503, "step": 5792 }, { "epoch": 0.3744969696969697, "grad_norm": 0.06950081139802933, "learning_rate": 0.0001934473255261698, "loss": 0.088, "step": 5793 }, { "epoch": 0.37456161616161615, "grad_norm": 0.06178673356771469, "learning_rate": 0.00019344489053174516, "loss": 0.083, "step": 5794 }, { "epoch": 0.37462626262626264, "grad_norm": 0.06558817625045776, "learning_rate": 
0.0001934424551003097, "loss": 0.0903, "step": 5795 }, { "epoch": 0.3746909090909091, "grad_norm": 0.05085135996341705, "learning_rate": 0.00019344001923187474, "loss": 0.0675, "step": 5796 }, { "epoch": 0.37475555555555556, "grad_norm": 0.0635736733675003, "learning_rate": 0.0001934375829264517, "loss": 0.0857, "step": 5797 }, { "epoch": 0.374820202020202, "grad_norm": 0.08168654888868332, "learning_rate": 0.000193435146184052, "loss": 0.116, "step": 5798 }, { "epoch": 0.3748848484848485, "grad_norm": 0.06893505901098251, "learning_rate": 0.00019343270900468702, "loss": 0.0925, "step": 5799 }, { "epoch": 0.374949494949495, "grad_norm": 0.05676533281803131, "learning_rate": 0.0001934302713883681, "loss": 0.0777, "step": 5800 }, { "epoch": 0.3750141414141414, "grad_norm": 0.08576533198356628, "learning_rate": 0.00019342783333510673, "loss": 0.0909, "step": 5801 }, { "epoch": 0.3750787878787879, "grad_norm": 0.05678299441933632, "learning_rate": 0.00019342539484491428, "loss": 0.0785, "step": 5802 }, { "epoch": 0.37514343434343433, "grad_norm": 0.05132952705025673, "learning_rate": 0.00019342295591780211, "loss": 0.07, "step": 5803 }, { "epoch": 0.3752080808080808, "grad_norm": 0.06154884770512581, "learning_rate": 0.00019342051655378166, "loss": 0.0773, "step": 5804 }, { "epoch": 0.37527272727272726, "grad_norm": 0.0669637992978096, "learning_rate": 0.00019341807675286434, "loss": 0.0868, "step": 5805 }, { "epoch": 0.37533737373737375, "grad_norm": 0.0652894601225853, "learning_rate": 0.00019341563651506158, "loss": 0.0847, "step": 5806 }, { "epoch": 0.3754020202020202, "grad_norm": 0.10236239433288574, "learning_rate": 0.0001934131958403848, "loss": 0.0953, "step": 5807 }, { "epoch": 0.37546666666666667, "grad_norm": 0.07050836086273193, "learning_rate": 0.00019341075472884532, "loss": 0.1023, "step": 5808 }, { "epoch": 0.37546666666666667, "eval_bleu": 19.786877756887645, "eval_loss": 0.08971795439720154, "eval_runtime": 2.7094, "eval_samples_per_second": 11.811, 
"eval_steps_per_second": 1.476, "step": 5808 }, { "epoch": 0.3755313131313131, "grad_norm": 0.06533868610858917, "learning_rate": 0.00019340831318045464, "loss": 0.0913, "step": 5809 }, { "epoch": 0.3755959595959596, "grad_norm": 0.05758924409747124, "learning_rate": 0.00019340587119522415, "loss": 0.0901, "step": 5810 }, { "epoch": 0.3756606060606061, "grad_norm": 0.07891985028982162, "learning_rate": 0.0001934034287731653, "loss": 0.1003, "step": 5811 }, { "epoch": 0.3757252525252525, "grad_norm": 0.06657496839761734, "learning_rate": 0.00019340098591428948, "loss": 0.0974, "step": 5812 }, { "epoch": 0.375789898989899, "grad_norm": 0.06803524494171143, "learning_rate": 0.00019339854261860815, "loss": 0.095, "step": 5813 }, { "epoch": 0.37585454545454544, "grad_norm": 0.05579148977994919, "learning_rate": 0.0001933960988861327, "loss": 0.0771, "step": 5814 }, { "epoch": 0.37591919191919193, "grad_norm": 0.07585206627845764, "learning_rate": 0.00019339365471687456, "loss": 0.1005, "step": 5815 }, { "epoch": 0.37598383838383836, "grad_norm": 0.06652296334505081, "learning_rate": 0.00019339121011084518, "loss": 0.0924, "step": 5816 }, { "epoch": 0.37604848484848485, "grad_norm": 0.07032231241464615, "learning_rate": 0.00019338876506805599, "loss": 0.1028, "step": 5817 }, { "epoch": 0.3761131313131313, "grad_norm": 0.07463331520557404, "learning_rate": 0.00019338631958851842, "loss": 0.1147, "step": 5818 }, { "epoch": 0.3761777777777778, "grad_norm": 0.05088530480861664, "learning_rate": 0.0001933838736722439, "loss": 0.0727, "step": 5819 }, { "epoch": 0.37624242424242427, "grad_norm": 0.05978494510054588, "learning_rate": 0.00019338142731924386, "loss": 0.0848, "step": 5820 }, { "epoch": 0.3763070707070707, "grad_norm": 0.05681803822517395, "learning_rate": 0.0001933789805295298, "loss": 0.0816, "step": 5821 }, { "epoch": 0.3763717171717172, "grad_norm": 0.057426176965236664, "learning_rate": 0.00019337653330311308, "loss": 0.0783, "step": 5822 }, { "epoch": 
0.3764363636363636, "grad_norm": 0.05782713368535042, "learning_rate": 0.0001933740856400052, "loss": 0.0634, "step": 5823 }, { "epoch": 0.3765010101010101, "grad_norm": 0.06340282410383224, "learning_rate": 0.00019337163754021757, "loss": 0.1095, "step": 5824 }, { "epoch": 0.3765010101010101, "eval_bleu": 19.29138705285159, "eval_loss": 0.09002288430929184, "eval_runtime": 2.6832, "eval_samples_per_second": 11.926, "eval_steps_per_second": 1.491, "step": 5824 }, { "epoch": 0.37656565656565655, "grad_norm": 0.0595083124935627, "learning_rate": 0.00019336918900376165, "loss": 0.0791, "step": 5825 }, { "epoch": 0.37663030303030304, "grad_norm": 0.07301098108291626, "learning_rate": 0.00019336674003064895, "loss": 0.0982, "step": 5826 }, { "epoch": 0.37669494949494947, "grad_norm": 0.06341791898012161, "learning_rate": 0.00019336429062089084, "loss": 0.0832, "step": 5827 }, { "epoch": 0.37675959595959596, "grad_norm": 0.062067922204732895, "learning_rate": 0.00019336184077449882, "loss": 0.0911, "step": 5828 }, { "epoch": 0.37682424242424245, "grad_norm": 0.05801470950245857, "learning_rate": 0.00019335939049148433, "loss": 0.0864, "step": 5829 }, { "epoch": 0.3768888888888889, "grad_norm": 0.06212368234992027, "learning_rate": 0.00019335693977185883, "loss": 0.0917, "step": 5830 }, { "epoch": 0.3769535353535354, "grad_norm": 0.05730974301695824, "learning_rate": 0.0001933544886156338, "loss": 0.0698, "step": 5831 }, { "epoch": 0.3770181818181818, "grad_norm": 0.05190833657979965, "learning_rate": 0.00019335203702282067, "loss": 0.0763, "step": 5832 }, { "epoch": 0.3770828282828283, "grad_norm": 0.05367479845881462, "learning_rate": 0.00019334958499343094, "loss": 0.0637, "step": 5833 }, { "epoch": 0.37714747474747473, "grad_norm": 0.06430768221616745, "learning_rate": 0.00019334713252747604, "loss": 0.0878, "step": 5834 }, { "epoch": 0.3772121212121212, "grad_norm": 0.06163816526532173, "learning_rate": 0.0001933446796249675, "loss": 0.0834, "step": 5835 }, { 
"epoch": 0.37727676767676765, "grad_norm": 0.07156426459550858, "learning_rate": 0.00019334222628591673, "loss": 0.0948, "step": 5836 }, { "epoch": 0.37734141414141414, "grad_norm": 0.0613996759057045, "learning_rate": 0.00019333977251033521, "loss": 0.0768, "step": 5837 }, { "epoch": 0.37740606060606063, "grad_norm": 0.06612159311771393, "learning_rate": 0.00019333731829823444, "loss": 0.1067, "step": 5838 }, { "epoch": 0.37747070707070707, "grad_norm": 0.06677252799272537, "learning_rate": 0.0001933348636496259, "loss": 0.0984, "step": 5839 }, { "epoch": 0.37753535353535356, "grad_norm": 0.06668251752853394, "learning_rate": 0.00019333240856452108, "loss": 0.0968, "step": 5840 }, { "epoch": 0.37753535353535356, "eval_bleu": 19.70328271302324, "eval_loss": 0.08894934505224228, "eval_runtime": 2.8618, "eval_samples_per_second": 11.182, "eval_steps_per_second": 1.398, "step": 5840 }, { "epoch": 0.3776, "grad_norm": 0.07043835520744324, "learning_rate": 0.00019332995304293142, "loss": 0.0927, "step": 5841 }, { "epoch": 0.3776646464646465, "grad_norm": 0.06534705311059952, "learning_rate": 0.0001933274970848684, "loss": 0.0936, "step": 5842 }, { "epoch": 0.3777292929292929, "grad_norm": 0.06567228585481644, "learning_rate": 0.00019332504069034356, "loss": 0.091, "step": 5843 }, { "epoch": 0.3777939393939394, "grad_norm": 0.07715821266174316, "learning_rate": 0.00019332258385936835, "loss": 0.1147, "step": 5844 }, { "epoch": 0.37785858585858584, "grad_norm": 0.05756711959838867, "learning_rate": 0.00019332012659195423, "loss": 0.0765, "step": 5845 }, { "epoch": 0.3779232323232323, "grad_norm": 0.056365206837654114, "learning_rate": 0.0001933176688881128, "loss": 0.0871, "step": 5846 }, { "epoch": 0.37798787878787876, "grad_norm": 0.061277203261852264, "learning_rate": 0.00019331521074785545, "loss": 0.0789, "step": 5847 }, { "epoch": 0.37805252525252525, "grad_norm": 0.059099357575178146, "learning_rate": 0.0001933127521711937, "loss": 0.0832, "step": 5848 }, { 
"epoch": 0.37811717171717174, "grad_norm": 0.07364581525325775, "learning_rate": 0.00019331029315813907, "loss": 0.0945, "step": 5849 }, { "epoch": 0.3781818181818182, "grad_norm": 0.06112939491868019, "learning_rate": 0.00019330783370870302, "loss": 0.0772, "step": 5850 }, { "epoch": 0.37824646464646466, "grad_norm": 0.05803108587861061, "learning_rate": 0.0001933053738228971, "loss": 0.0856, "step": 5851 }, { "epoch": 0.3783111111111111, "grad_norm": 0.06474054604768753, "learning_rate": 0.0001933029135007328, "loss": 0.0854, "step": 5852 }, { "epoch": 0.3783757575757576, "grad_norm": 0.05687845125794411, "learning_rate": 0.0001933004527422216, "loss": 0.0882, "step": 5853 }, { "epoch": 0.378440404040404, "grad_norm": 0.06047268956899643, "learning_rate": 0.000193297991547375, "loss": 0.0821, "step": 5854 }, { "epoch": 0.3785050505050505, "grad_norm": 0.058367736637592316, "learning_rate": 0.0001932955299162046, "loss": 0.0853, "step": 5855 }, { "epoch": 0.37856969696969694, "grad_norm": 0.06440664082765579, "learning_rate": 0.00019329306784872182, "loss": 0.0821, "step": 5856 }, { "epoch": 0.37856969696969694, "eval_bleu": 18.92859557028654, "eval_loss": 0.08906275033950806, "eval_runtime": 2.7435, "eval_samples_per_second": 11.664, "eval_steps_per_second": 1.458, "step": 5856 }, { "epoch": 0.37863434343434343, "grad_norm": 0.06415311247110367, "learning_rate": 0.0001932906053449382, "loss": 0.1014, "step": 5857 }, { "epoch": 0.3786989898989899, "grad_norm": 0.06276305764913559, "learning_rate": 0.00019328814240486528, "loss": 0.0879, "step": 5858 }, { "epoch": 0.37876363636363636, "grad_norm": 0.06249580159783363, "learning_rate": 0.00019328567902851453, "loss": 0.0878, "step": 5859 }, { "epoch": 0.37882828282828285, "grad_norm": 0.05503752455115318, "learning_rate": 0.00019328321521589752, "loss": 0.0768, "step": 5860 }, { "epoch": 0.3788929292929293, "grad_norm": 0.06796092540025711, "learning_rate": 0.00019328075096702575, "loss": 0.084, "step": 5861 }, { 
"epoch": 0.37895757575757577, "grad_norm": 0.10633626580238342, "learning_rate": 0.00019327828628191074, "loss": 0.1431, "step": 5862 }, { "epoch": 0.3790222222222222, "grad_norm": 0.06786064058542252, "learning_rate": 0.00019327582116056402, "loss": 0.0906, "step": 5863 }, { "epoch": 0.3790868686868687, "grad_norm": 0.059549443423748016, "learning_rate": 0.00019327335560299713, "loss": 0.0928, "step": 5864 }, { "epoch": 0.37915151515151513, "grad_norm": 0.048274535685777664, "learning_rate": 0.00019327088960922157, "loss": 0.0679, "step": 5865 }, { "epoch": 0.3792161616161616, "grad_norm": 0.06289258599281311, "learning_rate": 0.0001932684231792489, "loss": 0.0912, "step": 5866 }, { "epoch": 0.3792808080808081, "grad_norm": 0.05349578335881233, "learning_rate": 0.00019326595631309065, "loss": 0.0715, "step": 5867 }, { "epoch": 0.37934545454545454, "grad_norm": 0.061729513108730316, "learning_rate": 0.0001932634890107584, "loss": 0.082, "step": 5868 }, { "epoch": 0.37941010101010103, "grad_norm": 0.07772201299667358, "learning_rate": 0.0001932610212722636, "loss": 0.1079, "step": 5869 }, { "epoch": 0.37947474747474746, "grad_norm": 0.062417272478342056, "learning_rate": 0.00019325855309761783, "loss": 0.0823, "step": 5870 }, { "epoch": 0.37953939393939395, "grad_norm": 0.0542578250169754, "learning_rate": 0.00019325608448683264, "loss": 0.072, "step": 5871 }, { "epoch": 0.3796040404040404, "grad_norm": 0.05923018604516983, "learning_rate": 0.0001932536154399196, "loss": 0.0799, "step": 5872 }, { "epoch": 0.3796040404040404, "eval_bleu": 19.224453417729546, "eval_loss": 0.08945882320404053, "eval_runtime": 2.6797, "eval_samples_per_second": 11.941, "eval_steps_per_second": 1.493, "step": 5872 }, { "epoch": 0.3796686868686869, "grad_norm": 0.0626467615365982, "learning_rate": 0.0001932511459568902, "loss": 0.0822, "step": 5873 }, { "epoch": 0.3797333333333333, "grad_norm": 0.06282750517129898, "learning_rate": 0.00019324867603775605, "loss": 0.0924, "step": 5874 }, { 
"epoch": 0.3797979797979798, "grad_norm": 0.06549464166164398, "learning_rate": 0.00019324620568252865, "loss": 0.0946, "step": 5875 }, { "epoch": 0.37986262626262624, "grad_norm": 0.060933854430913925, "learning_rate": 0.00019324373489121955, "loss": 0.0842, "step": 5876 }, { "epoch": 0.3799272727272727, "grad_norm": 0.07058953493833542, "learning_rate": 0.00019324126366384036, "loss": 0.0966, "step": 5877 }, { "epoch": 0.3799919191919192, "grad_norm": 0.06128419190645218, "learning_rate": 0.00019323879200040258, "loss": 0.0806, "step": 5878 }, { "epoch": 0.38005656565656565, "grad_norm": 0.06935785710811615, "learning_rate": 0.00019323631990091783, "loss": 0.09, "step": 5879 }, { "epoch": 0.38012121212121214, "grad_norm": 0.10325240343809128, "learning_rate": 0.00019323384736539761, "loss": 0.1078, "step": 5880 }, { "epoch": 0.38018585858585857, "grad_norm": 0.0593874529004097, "learning_rate": 0.0001932313743938535, "loss": 0.0683, "step": 5881 }, { "epoch": 0.38025050505050506, "grad_norm": 0.06472853571176529, "learning_rate": 0.00019322890098629709, "loss": 0.0876, "step": 5882 }, { "epoch": 0.3803151515151515, "grad_norm": 0.06487087905406952, "learning_rate": 0.00019322642714273992, "loss": 0.0853, "step": 5883 }, { "epoch": 0.380379797979798, "grad_norm": 0.05638899654150009, "learning_rate": 0.00019322395286319358, "loss": 0.0721, "step": 5884 }, { "epoch": 0.3804444444444444, "grad_norm": 0.06853563338518143, "learning_rate": 0.00019322147814766964, "loss": 0.0983, "step": 5885 }, { "epoch": 0.3805090909090909, "grad_norm": 0.06330393999814987, "learning_rate": 0.00019321900299617965, "loss": 0.0934, "step": 5886 }, { "epoch": 0.3805737373737374, "grad_norm": 0.06395245343446732, "learning_rate": 0.00019321652740873522, "loss": 0.0748, "step": 5887 }, { "epoch": 0.38063838383838383, "grad_norm": 0.0521356500685215, "learning_rate": 0.0001932140513853479, "loss": 0.0702, "step": 5888 }, { "epoch": 0.38063838383838383, "eval_bleu": 19.29094520835175, 
"eval_loss": 0.08955836296081543, "eval_runtime": 2.6484, "eval_samples_per_second": 12.083, "eval_steps_per_second": 1.51, "step": 5888 }, { "epoch": 0.3807030303030303, "grad_norm": 0.061421703547239304, "learning_rate": 0.00019321157492602925, "loss": 0.0919, "step": 5889 }, { "epoch": 0.38076767676767675, "grad_norm": 0.056196682155132294, "learning_rate": 0.0001932090980307909, "loss": 0.083, "step": 5890 }, { "epoch": 0.38083232323232324, "grad_norm": 0.0650380402803421, "learning_rate": 0.00019320662069964443, "loss": 0.1032, "step": 5891 }, { "epoch": 0.3808969696969697, "grad_norm": 0.06290214508771896, "learning_rate": 0.00019320414293260137, "loss": 0.0955, "step": 5892 }, { "epoch": 0.38096161616161617, "grad_norm": 0.06186475604772568, "learning_rate": 0.00019320166472967337, "loss": 0.0902, "step": 5893 }, { "epoch": 0.3810262626262626, "grad_norm": 0.06888004392385483, "learning_rate": 0.000193199186090872, "loss": 0.0895, "step": 5894 }, { "epoch": 0.3810909090909091, "grad_norm": 0.06361524760723114, "learning_rate": 0.00019319670701620884, "loss": 0.0866, "step": 5895 }, { "epoch": 0.3811555555555556, "grad_norm": 0.06526106595993042, "learning_rate": 0.00019319422750569548, "loss": 0.0997, "step": 5896 }, { "epoch": 0.381220202020202, "grad_norm": 0.06459544599056244, "learning_rate": 0.00019319174755934358, "loss": 0.0718, "step": 5897 }, { "epoch": 0.3812848484848485, "grad_norm": 0.07102049887180328, "learning_rate": 0.00019318926717716464, "loss": 0.107, "step": 5898 }, { "epoch": 0.38134949494949494, "grad_norm": 0.05709415674209595, "learning_rate": 0.0001931867863591703, "loss": 0.0694, "step": 5899 }, { "epoch": 0.3814141414141414, "grad_norm": 0.0635734349489212, "learning_rate": 0.00019318430510537215, "loss": 0.0815, "step": 5900 }, { "epoch": 0.38147878787878786, "grad_norm": 0.06522025913000107, "learning_rate": 0.00019318182341578185, "loss": 0.0861, "step": 5901 }, { "epoch": 0.38154343434343435, "grad_norm": 0.06441979110240936, 
"learning_rate": 0.00019317934129041095, "loss": 0.0633, "step": 5902 }, { "epoch": 0.3816080808080808, "grad_norm": 0.05937814339995384, "learning_rate": 0.0001931768587292711, "loss": 0.0791, "step": 5903 }, { "epoch": 0.3816727272727273, "grad_norm": 0.05709347501397133, "learning_rate": 0.00019317437573237387, "loss": 0.0673, "step": 5904 }, { "epoch": 0.3816727272727273, "eval_bleu": 15.08792209872895, "eval_loss": 0.09161631017923355, "eval_runtime": 2.6269, "eval_samples_per_second": 12.182, "eval_steps_per_second": 1.523, "step": 5904 }, { "epoch": 0.38173737373737376, "grad_norm": 0.06024442985653877, "learning_rate": 0.0001931718922997309, "loss": 0.0762, "step": 5905 }, { "epoch": 0.3818020202020202, "grad_norm": 0.05908792465925217, "learning_rate": 0.00019316940843135377, "loss": 0.0766, "step": 5906 }, { "epoch": 0.3818666666666667, "grad_norm": 0.09573209285736084, "learning_rate": 0.0001931669241272541, "loss": 0.0882, "step": 5907 }, { "epoch": 0.3819313131313131, "grad_norm": 0.06501085311174393, "learning_rate": 0.00019316443938744354, "loss": 0.0896, "step": 5908 }, { "epoch": 0.3819959595959596, "grad_norm": 0.052993617951869965, "learning_rate": 0.00019316195421193371, "loss": 0.0656, "step": 5909 }, { "epoch": 0.38206060606060605, "grad_norm": 0.05874817445874214, "learning_rate": 0.00019315946860073622, "loss": 0.0813, "step": 5910 }, { "epoch": 0.38212525252525253, "grad_norm": 0.06430738419294357, "learning_rate": 0.0001931569825538627, "loss": 0.0794, "step": 5911 }, { "epoch": 0.38218989898989897, "grad_norm": 0.09087119996547699, "learning_rate": 0.00019315449607132474, "loss": 0.0828, "step": 5912 }, { "epoch": 0.38225454545454546, "grad_norm": 0.06586827337741852, "learning_rate": 0.00019315200915313402, "loss": 0.0782, "step": 5913 }, { "epoch": 0.3823191919191919, "grad_norm": 0.06572496145963669, "learning_rate": 0.00019314952179930213, "loss": 0.0874, "step": 5914 }, { "epoch": 0.3823838383838384, "grad_norm": 0.06472916156053543, 
"learning_rate": 0.00019314703400984077, "loss": 0.0762, "step": 5915 }, { "epoch": 0.38244848484848487, "grad_norm": 0.06276363134384155, "learning_rate": 0.00019314454578476148, "loss": 0.0825, "step": 5916 }, { "epoch": 0.3825131313131313, "grad_norm": 0.06366869062185287, "learning_rate": 0.00019314205712407597, "loss": 0.0851, "step": 5917 }, { "epoch": 0.3825777777777778, "grad_norm": 0.07730231434106827, "learning_rate": 0.00019313956802779583, "loss": 0.092, "step": 5918 }, { "epoch": 0.38264242424242423, "grad_norm": 0.06338377296924591, "learning_rate": 0.00019313707849593273, "loss": 0.0937, "step": 5919 }, { "epoch": 0.3827070707070707, "grad_norm": 0.06261949986219406, "learning_rate": 0.0001931345885284983, "loss": 0.0875, "step": 5920 }, { "epoch": 0.3827070707070707, "eval_bleu": 15.596461818097046, "eval_loss": 0.0897599384188652, "eval_runtime": 2.5791, "eval_samples_per_second": 12.407, "eval_steps_per_second": 1.551, "step": 5920 }, { "epoch": 0.38277171717171715, "grad_norm": 0.06569968909025192, "learning_rate": 0.0001931320981255042, "loss": 0.0828, "step": 5921 }, { "epoch": 0.38283636363636364, "grad_norm": 0.06669800728559494, "learning_rate": 0.00019312960728696204, "loss": 0.0906, "step": 5922 }, { "epoch": 0.3829010101010101, "grad_norm": 0.10412409901618958, "learning_rate": 0.0001931271160128835, "loss": 0.0818, "step": 5923 }, { "epoch": 0.38296565656565656, "grad_norm": 0.07826493680477142, "learning_rate": 0.00019312462430328027, "loss": 0.1052, "step": 5924 }, { "epoch": 0.38303030303030305, "grad_norm": 0.06671993434429169, "learning_rate": 0.00019312213215816391, "loss": 0.0932, "step": 5925 }, { "epoch": 0.3830949494949495, "grad_norm": 0.0844091847538948, "learning_rate": 0.00019311963957754612, "loss": 0.0912, "step": 5926 }, { "epoch": 0.383159595959596, "grad_norm": 0.05956432223320007, "learning_rate": 0.0001931171465614386, "loss": 0.0829, "step": 5927 }, { "epoch": 0.3832242424242424, "grad_norm": 0.05547641962766647, 
"learning_rate": 0.00019311465310985294, "loss": 0.0843, "step": 5928 }, { "epoch": 0.3832888888888889, "grad_norm": 0.061287086457014084, "learning_rate": 0.00019311215922280085, "loss": 0.0799, "step": 5929 }, { "epoch": 0.38335353535353534, "grad_norm": 0.05399525910615921, "learning_rate": 0.00019310966490029396, "loss": 0.0772, "step": 5930 }, { "epoch": 0.3834181818181818, "grad_norm": 0.06502603739500046, "learning_rate": 0.00019310717014234396, "loss": 0.097, "step": 5931 }, { "epoch": 0.38348282828282826, "grad_norm": 0.05542701855301857, "learning_rate": 0.00019310467494896248, "loss": 0.0753, "step": 5932 }, { "epoch": 0.38354747474747475, "grad_norm": 0.0684909075498581, "learning_rate": 0.00019310217932016124, "loss": 0.0917, "step": 5933 }, { "epoch": 0.38361212121212124, "grad_norm": 0.05494304001331329, "learning_rate": 0.00019309968325595186, "loss": 0.0774, "step": 5934 }, { "epoch": 0.38367676767676767, "grad_norm": 0.06756287813186646, "learning_rate": 0.00019309718675634608, "loss": 0.0996, "step": 5935 }, { "epoch": 0.38374141414141416, "grad_norm": 0.06140131130814552, "learning_rate": 0.00019309468982135552, "loss": 0.0871, "step": 5936 }, { "epoch": 0.38374141414141416, "eval_bleu": 16.395480754867883, "eval_loss": 0.08998902887105942, "eval_runtime": 2.6357, "eval_samples_per_second": 12.141, "eval_steps_per_second": 1.518, "step": 5936 }, { "epoch": 0.3838060606060606, "grad_norm": 0.06154841184616089, "learning_rate": 0.00019309219245099187, "loss": 0.0819, "step": 5937 }, { "epoch": 0.3838707070707071, "grad_norm": 0.059064023196697235, "learning_rate": 0.0001930896946452668, "loss": 0.0817, "step": 5938 }, { "epoch": 0.3839353535353535, "grad_norm": 0.05709843710064888, "learning_rate": 0.000193087196404192, "loss": 0.0682, "step": 5939 }, { "epoch": 0.384, "grad_norm": 0.0656856894493103, "learning_rate": 0.00019308469772777917, "loss": 0.088, "step": 5940 }, { "epoch": 0.38406464646464644, "grad_norm": 0.07637253403663635, 
"learning_rate": 0.00019308219861604, "loss": 0.1141, "step": 5941 }, { "epoch": 0.38412929292929293, "grad_norm": 0.06196242570877075, "learning_rate": 0.00019307969906898613, "loss": 0.0843, "step": 5942 }, { "epoch": 0.3841939393939394, "grad_norm": 0.058013685047626495, "learning_rate": 0.00019307719908662926, "loss": 0.0749, "step": 5943 }, { "epoch": 0.38425858585858585, "grad_norm": 0.06150985136628151, "learning_rate": 0.00019307469866898112, "loss": 0.0847, "step": 5944 }, { "epoch": 0.38432323232323234, "grad_norm": 0.06176202371716499, "learning_rate": 0.0001930721978160534, "loss": 0.0807, "step": 5945 }, { "epoch": 0.3843878787878788, "grad_norm": 0.06526836007833481, "learning_rate": 0.00019306969652785777, "loss": 0.0919, "step": 5946 }, { "epoch": 0.38445252525252527, "grad_norm": 0.06633245944976807, "learning_rate": 0.00019306719480440595, "loss": 0.0774, "step": 5947 }, { "epoch": 0.3845171717171717, "grad_norm": 0.06313831359148026, "learning_rate": 0.00019306469264570962, "loss": 0.0799, "step": 5948 }, { "epoch": 0.3845818181818182, "grad_norm": 0.0567036010324955, "learning_rate": 0.00019306219005178047, "loss": 0.0746, "step": 5949 }, { "epoch": 0.3846464646464646, "grad_norm": 0.06819740682840347, "learning_rate": 0.00019305968702263026, "loss": 0.0983, "step": 5950 }, { "epoch": 0.3847111111111111, "grad_norm": 0.05071788653731346, "learning_rate": 0.00019305718355827064, "loss": 0.0643, "step": 5951 }, { "epoch": 0.38477575757575755, "grad_norm": 0.06182543933391571, "learning_rate": 0.00019305467965871333, "loss": 0.0852, "step": 5952 }, { "epoch": 0.38477575757575755, "eval_bleu": 16.97963456929848, "eval_loss": 0.09149065613746643, "eval_runtime": 2.5528, "eval_samples_per_second": 12.535, "eval_steps_per_second": 1.567, "step": 5952 }, { "epoch": 0.38484040404040404, "grad_norm": 0.05626741051673889, "learning_rate": 0.00019305217532397006, "loss": 0.0686, "step": 5953 }, { "epoch": 0.3849050505050505, "grad_norm": 
0.06510366499423981, "learning_rate": 0.0001930496705540525, "loss": 0.084, "step": 5954 }, { "epoch": 0.38496969696969696, "grad_norm": 0.06261736899614334, "learning_rate": 0.00019304716534897243, "loss": 0.0799, "step": 5955 }, { "epoch": 0.38503434343434345, "grad_norm": 0.05738934129476547, "learning_rate": 0.0001930446597087415, "loss": 0.0833, "step": 5956 }, { "epoch": 0.3850989898989899, "grad_norm": 0.07102226465940475, "learning_rate": 0.00019304215363337147, "loss": 0.0792, "step": 5957 }, { "epoch": 0.3851636363636364, "grad_norm": 0.06456318497657776, "learning_rate": 0.00019303964712287405, "loss": 0.0932, "step": 5958 }, { "epoch": 0.3852282828282828, "grad_norm": 0.057906873524188995, "learning_rate": 0.00019303714017726095, "loss": 0.0828, "step": 5959 }, { "epoch": 0.3852929292929293, "grad_norm": 0.07138396799564362, "learning_rate": 0.0001930346327965439, "loss": 0.102, "step": 5960 }, { "epoch": 0.38535757575757573, "grad_norm": 0.0671912431716919, "learning_rate": 0.00019303212498073463, "loss": 0.0963, "step": 5961 }, { "epoch": 0.3854222222222222, "grad_norm": 0.0661538764834404, "learning_rate": 0.00019302961672984487, "loss": 0.0883, "step": 5962 }, { "epoch": 0.3854868686868687, "grad_norm": 0.054787375032901764, "learning_rate": 0.00019302710804388636, "loss": 0.0728, "step": 5963 }, { "epoch": 0.38555151515151515, "grad_norm": 0.06632887572050095, "learning_rate": 0.00019302459892287083, "loss": 0.103, "step": 5964 }, { "epoch": 0.38561616161616163, "grad_norm": 0.07166798412799835, "learning_rate": 0.00019302208936680996, "loss": 0.0744, "step": 5965 }, { "epoch": 0.38568080808080807, "grad_norm": 0.06268935650587082, "learning_rate": 0.00019301957937571557, "loss": 0.0875, "step": 5966 }, { "epoch": 0.38574545454545456, "grad_norm": 0.06309354305267334, "learning_rate": 0.0001930170689495993, "loss": 0.0855, "step": 5967 }, { "epoch": 0.385810101010101, "grad_norm": 0.05968724936246872, "learning_rate": 0.000193014558088473, "loss": 
0.0772, "step": 5968 }, { "epoch": 0.385810101010101, "eval_bleu": 17.190146877912643, "eval_loss": 0.09064122289419174, "eval_runtime": 2.6713, "eval_samples_per_second": 11.979, "eval_steps_per_second": 1.497, "step": 5968 }, { "epoch": 0.3858747474747475, "grad_norm": 0.058623090386390686, "learning_rate": 0.00019301204679234837, "loss": 0.0806, "step": 5969 }, { "epoch": 0.3859393939393939, "grad_norm": 0.06855589896440506, "learning_rate": 0.00019300953506123713, "loss": 0.0901, "step": 5970 }, { "epoch": 0.3860040404040404, "grad_norm": 0.05541743338108063, "learning_rate": 0.00019300702289515106, "loss": 0.0829, "step": 5971 }, { "epoch": 0.3860686868686869, "grad_norm": 0.06281839311122894, "learning_rate": 0.00019300451029410185, "loss": 0.0935, "step": 5972 }, { "epoch": 0.38613333333333333, "grad_norm": 0.06831640750169754, "learning_rate": 0.00019300199725810134, "loss": 0.0856, "step": 5973 }, { "epoch": 0.3861979797979798, "grad_norm": 0.05924155190587044, "learning_rate": 0.0001929994837871612, "loss": 0.0828, "step": 5974 }, { "epoch": 0.38626262626262625, "grad_norm": 0.06677243858575821, "learning_rate": 0.00019299696988129325, "loss": 0.0956, "step": 5975 }, { "epoch": 0.38632727272727274, "grad_norm": 0.0653674528002739, "learning_rate": 0.00019299445554050917, "loss": 0.0925, "step": 5976 }, { "epoch": 0.3863919191919192, "grad_norm": 0.06494759768247604, "learning_rate": 0.00019299194076482078, "loss": 0.083, "step": 5977 }, { "epoch": 0.38645656565656566, "grad_norm": 0.062020443379879, "learning_rate": 0.00019298942555423984, "loss": 0.0813, "step": 5978 }, { "epoch": 0.3865212121212121, "grad_norm": 0.059867922216653824, "learning_rate": 0.0001929869099087781, "loss": 0.0839, "step": 5979 }, { "epoch": 0.3865858585858586, "grad_norm": 0.05497795715928078, "learning_rate": 0.00019298439382844733, "loss": 0.0648, "step": 5980 }, { "epoch": 0.3866505050505051, "grad_norm": 0.057126834988594055, "learning_rate": 0.00019298187731325929, "loss": 
0.0892, "step": 5981 }, { "epoch": 0.3867151515151515, "grad_norm": 0.05660795420408249, "learning_rate": 0.00019297936036322572, "loss": 0.0746, "step": 5982 }, { "epoch": 0.386779797979798, "grad_norm": 0.05114062875509262, "learning_rate": 0.00019297684297835843, "loss": 0.0722, "step": 5983 }, { "epoch": 0.38684444444444444, "grad_norm": 0.05914100632071495, "learning_rate": 0.00019297432515866917, "loss": 0.0841, "step": 5984 }, { "epoch": 0.38684444444444444, "eval_bleu": 14.763636705141554, "eval_loss": 0.09031134843826294, "eval_runtime": 2.6331, "eval_samples_per_second": 12.153, "eval_steps_per_second": 1.519, "step": 5984 }, { "epoch": 0.3869090909090909, "grad_norm": 0.05303841084241867, "learning_rate": 0.00019297180690416976, "loss": 0.0723, "step": 5985 }, { "epoch": 0.38697373737373736, "grad_norm": 0.05911511182785034, "learning_rate": 0.00019296928821487192, "loss": 0.0798, "step": 5986 }, { "epoch": 0.38703838383838385, "grad_norm": 0.08924005180597305, "learning_rate": 0.00019296676909078748, "loss": 0.0804, "step": 5987 }, { "epoch": 0.3871030303030303, "grad_norm": 0.05793623998761177, "learning_rate": 0.0001929642495319282, "loss": 0.0795, "step": 5988 }, { "epoch": 0.38716767676767677, "grad_norm": 0.08372315019369125, "learning_rate": 0.00019296172953830584, "loss": 0.0946, "step": 5989 }, { "epoch": 0.3872323232323232, "grad_norm": 0.06422155350446701, "learning_rate": 0.0001929592091099322, "loss": 0.081, "step": 5990 }, { "epoch": 0.3872969696969697, "grad_norm": 0.0653742179274559, "learning_rate": 0.00019295668824681906, "loss": 0.0791, "step": 5991 }, { "epoch": 0.3873616161616162, "grad_norm": 0.0696403980255127, "learning_rate": 0.00019295416694897822, "loss": 0.0991, "step": 5992 }, { "epoch": 0.3874262626262626, "grad_norm": 0.07294734567403793, "learning_rate": 0.0001929516452164215, "loss": 0.0901, "step": 5993 }, { "epoch": 0.3874909090909091, "grad_norm": 0.06059959903359413, "learning_rate": 0.00019294912304916064, "loss": 
0.0843, "step": 5994 }, { "epoch": 0.38755555555555554, "grad_norm": 0.05576810985803604, "learning_rate": 0.0001929466004472075, "loss": 0.0861, "step": 5995 }, { "epoch": 0.38762020202020203, "grad_norm": 0.0697091743350029, "learning_rate": 0.0001929440774105738, "loss": 0.1037, "step": 5996 }, { "epoch": 0.38768484848484847, "grad_norm": 0.05455729365348816, "learning_rate": 0.00019294155393927137, "loss": 0.0647, "step": 5997 }, { "epoch": 0.38774949494949495, "grad_norm": 0.06997755169868469, "learning_rate": 0.00019293903003331204, "loss": 0.0975, "step": 5998 }, { "epoch": 0.3878141414141414, "grad_norm": 0.06272505968809128, "learning_rate": 0.00019293650569270757, "loss": 0.0923, "step": 5999 }, { "epoch": 0.3878787878787879, "grad_norm": 0.0641670897603035, "learning_rate": 0.0001929339809174698, "loss": 0.0972, "step": 6000 }, { "epoch": 0.3878787878787879, "eval_bleu": 16.64779883323713, "eval_loss": 0.09066077321767807, "eval_runtime": 2.5534, "eval_samples_per_second": 12.532, "eval_steps_per_second": 1.567, "step": 6000 }, { "epoch": 0.38794343434343437, "grad_norm": 0.08192732185125351, "learning_rate": 0.00019293145570761056, "loss": 0.1061, "step": 6001 }, { "epoch": 0.3880080808080808, "grad_norm": 0.0647520199418068, "learning_rate": 0.00019292893006314155, "loss": 0.0996, "step": 6002 }, { "epoch": 0.3880727272727273, "grad_norm": 0.09492307156324387, "learning_rate": 0.0001929264039840747, "loss": 0.097, "step": 6003 }, { "epoch": 0.3881373737373737, "grad_norm": 0.05757579579949379, "learning_rate": 0.0001929238774704218, "loss": 0.0761, "step": 6004 }, { "epoch": 0.3882020202020202, "grad_norm": 0.06319024413824081, "learning_rate": 0.0001929213505221946, "loss": 0.0878, "step": 6005 }, { "epoch": 0.38826666666666665, "grad_norm": 0.06147363409399986, "learning_rate": 0.00019291882313940498, "loss": 0.0976, "step": 6006 }, { "epoch": 0.38833131313131314, "grad_norm": 0.0736456885933876, "learning_rate": 0.00019291629532206477, "loss": 
0.0884, "step": 6007 }, { "epoch": 0.3883959595959596, "grad_norm": 0.06618665158748627, "learning_rate": 0.00019291376707018573, "loss": 0.0893, "step": 6008 }, { "epoch": 0.38846060606060606, "grad_norm": 0.07238055765628815, "learning_rate": 0.0001929112383837797, "loss": 0.0986, "step": 6009 }, { "epoch": 0.38852525252525255, "grad_norm": 0.06013907119631767, "learning_rate": 0.00019290870926285859, "loss": 0.0801, "step": 6010 }, { "epoch": 0.388589898989899, "grad_norm": 0.06661374866962433, "learning_rate": 0.00019290617970743411, "loss": 0.0914, "step": 6011 }, { "epoch": 0.3886545454545455, "grad_norm": 0.06436292827129364, "learning_rate": 0.00019290364971751814, "loss": 0.0808, "step": 6012 }, { "epoch": 0.3887191919191919, "grad_norm": 0.06876916438341141, "learning_rate": 0.00019290111929312254, "loss": 0.0981, "step": 6013 }, { "epoch": 0.3887838383838384, "grad_norm": 0.05472567304968834, "learning_rate": 0.00019289858843425913, "loss": 0.0786, "step": 6014 }, { "epoch": 0.38884848484848483, "grad_norm": 0.06865181028842926, "learning_rate": 0.0001928960571409397, "loss": 0.092, "step": 6015 }, { "epoch": 0.3889131313131313, "grad_norm": 0.06956758350133896, "learning_rate": 0.0001928935254131761, "loss": 0.116, "step": 6016 }, { "epoch": 0.3889131313131313, "eval_bleu": 20.11321701248186, "eval_loss": 0.08902894705533981, "eval_runtime": 2.6863, "eval_samples_per_second": 11.912, "eval_steps_per_second": 1.489, "step": 6016 }, { "epoch": 0.38897777777777776, "grad_norm": 0.06409254670143127, "learning_rate": 0.00019289099325098025, "loss": 0.0912, "step": 6017 }, { "epoch": 0.38904242424242425, "grad_norm": 0.07922434061765671, "learning_rate": 0.0001928884606543639, "loss": 0.0862, "step": 6018 }, { "epoch": 0.38910707070707073, "grad_norm": 0.05166606977581978, "learning_rate": 0.00019288592762333894, "loss": 0.0687, "step": 6019 }, { "epoch": 0.38917171717171717, "grad_norm": 0.061574503779411316, "learning_rate": 0.00019288339415791722, "loss": 
0.092, "step": 6020 }, { "epoch": 0.38923636363636366, "grad_norm": 0.06548233330249786, "learning_rate": 0.00019288086025811052, "loss": 0.0917, "step": 6021 }, { "epoch": 0.3893010101010101, "grad_norm": 0.056036677211523056, "learning_rate": 0.0001928783259239308, "loss": 0.0805, "step": 6022 }, { "epoch": 0.3893656565656566, "grad_norm": 0.06381651014089584, "learning_rate": 0.00019287579115538983, "loss": 0.0873, "step": 6023 }, { "epoch": 0.389430303030303, "grad_norm": 0.054582156240940094, "learning_rate": 0.00019287325595249946, "loss": 0.0777, "step": 6024 }, { "epoch": 0.3894949494949495, "grad_norm": 0.06828862428665161, "learning_rate": 0.00019287072031527164, "loss": 0.0985, "step": 6025 }, { "epoch": 0.38955959595959594, "grad_norm": 0.05916346237063408, "learning_rate": 0.0001928681842437181, "loss": 0.0884, "step": 6026 }, { "epoch": 0.38962424242424243, "grad_norm": 0.059243343770504, "learning_rate": 0.0001928656477378508, "loss": 0.0868, "step": 6027 }, { "epoch": 0.38968888888888886, "grad_norm": 0.06233545020222664, "learning_rate": 0.00019286311079768154, "loss": 0.0901, "step": 6028 }, { "epoch": 0.38975353535353535, "grad_norm": 0.07882121950387955, "learning_rate": 0.00019286057342322223, "loss": 0.1036, "step": 6029 }, { "epoch": 0.38981818181818184, "grad_norm": 0.06462258100509644, "learning_rate": 0.0001928580356144847, "loss": 0.0887, "step": 6030 }, { "epoch": 0.3898828282828283, "grad_norm": 0.05648363381624222, "learning_rate": 0.00019285549737148086, "loss": 0.0908, "step": 6031 }, { "epoch": 0.38994747474747476, "grad_norm": 0.07134097814559937, "learning_rate": 0.00019285295869422256, "loss": 0.0923, "step": 6032 }, { "epoch": 0.38994747474747476, "eval_bleu": 18.81064541429817, "eval_loss": 0.08911585807800293, "eval_runtime": 2.5649, "eval_samples_per_second": 12.476, "eval_steps_per_second": 1.559, "step": 6032 }, { "epoch": 0.3900121212121212, "grad_norm": 0.05907868221402168, "learning_rate": 0.00019285041958272165, "loss": 
0.0838, "step": 6033 }, { "epoch": 0.3900767676767677, "grad_norm": 0.06450419127941132, "learning_rate": 0.00019284788003699004, "loss": 0.0842, "step": 6034 }, { "epoch": 0.3901414141414141, "grad_norm": 0.0629051998257637, "learning_rate": 0.00019284534005703955, "loss": 0.087, "step": 6035 }, { "epoch": 0.3902060606060606, "grad_norm": 0.06331878155469894, "learning_rate": 0.00019284279964288214, "loss": 0.0781, "step": 6036 }, { "epoch": 0.39027070707070705, "grad_norm": 0.07342950999736786, "learning_rate": 0.00019284025879452962, "loss": 0.11, "step": 6037 }, { "epoch": 0.39033535353535354, "grad_norm": 0.05961890518665314, "learning_rate": 0.00019283771751199392, "loss": 0.0853, "step": 6038 }, { "epoch": 0.3904, "grad_norm": 0.05609097704291344, "learning_rate": 0.00019283517579528692, "loss": 0.0856, "step": 6039 }, { "epoch": 0.39046464646464646, "grad_norm": 0.056484878063201904, "learning_rate": 0.00019283263364442048, "loss": 0.0807, "step": 6040 }, { "epoch": 0.39052929292929295, "grad_norm": 0.05688197910785675, "learning_rate": 0.00019283009105940647, "loss": 0.0792, "step": 6041 }, { "epoch": 0.3905939393939394, "grad_norm": 0.055903274565935135, "learning_rate": 0.00019282754804025685, "loss": 0.0798, "step": 6042 }, { "epoch": 0.39065858585858587, "grad_norm": 0.05977209657430649, "learning_rate": 0.0001928250045869835, "loss": 0.0859, "step": 6043 }, { "epoch": 0.3907232323232323, "grad_norm": 0.06016681343317032, "learning_rate": 0.00019282246069959825, "loss": 0.0787, "step": 6044 }, { "epoch": 0.3907878787878788, "grad_norm": 0.06368362903594971, "learning_rate": 0.00019281991637811304, "loss": 0.0935, "step": 6045 }, { "epoch": 0.39085252525252523, "grad_norm": 0.06332768499851227, "learning_rate": 0.00019281737162253978, "loss": 0.0963, "step": 6046 }, { "epoch": 0.3909171717171717, "grad_norm": 0.058141279965639114, "learning_rate": 0.00019281482643289036, "loss": 0.0851, "step": 6047 }, { "epoch": 0.3909818181818182, "grad_norm": 
0.07057707756757736, "learning_rate": 0.00019281228080917666, "loss": 0.0986, "step": 6048 }, { "epoch": 0.3909818181818182, "eval_bleu": 20.7666139936406, "eval_loss": 0.08835171908140182, "eval_runtime": 2.8235, "eval_samples_per_second": 11.333, "eval_steps_per_second": 1.417, "step": 6048 }, { "epoch": 0.39104646464646464, "grad_norm": 0.06375104933977127, "learning_rate": 0.00019280973475141062, "loss": 0.0748, "step": 6049 }, { "epoch": 0.39111111111111113, "grad_norm": 0.06070958077907562, "learning_rate": 0.00019280718825960414, "loss": 0.0933, "step": 6050 }, { "epoch": 0.39117575757575757, "grad_norm": 0.07892641425132751, "learning_rate": 0.0001928046413337691, "loss": 0.0889, "step": 6051 }, { "epoch": 0.39124040404040406, "grad_norm": 0.06481632590293884, "learning_rate": 0.00019280209397391746, "loss": 0.0931, "step": 6052 }, { "epoch": 0.3913050505050505, "grad_norm": 0.054232407361269, "learning_rate": 0.0001927995461800611, "loss": 0.0752, "step": 6053 }, { "epoch": 0.391369696969697, "grad_norm": 0.059178683906793594, "learning_rate": 0.00019279699795221193, "loss": 0.0935, "step": 6054 }, { "epoch": 0.3914343434343434, "grad_norm": 0.061701737344264984, "learning_rate": 0.00019279444929038187, "loss": 0.0802, "step": 6055 }, { "epoch": 0.3914989898989899, "grad_norm": 0.11909772455692291, "learning_rate": 0.00019279190019458287, "loss": 0.0896, "step": 6056 }, { "epoch": 0.39156363636363634, "grad_norm": 0.06124318018555641, "learning_rate": 0.00019278935066482683, "loss": 0.0817, "step": 6057 }, { "epoch": 0.3916282828282828, "grad_norm": 0.0573464035987854, "learning_rate": 0.00019278680070112568, "loss": 0.0776, "step": 6058 }, { "epoch": 0.3916929292929293, "grad_norm": 0.06631136685609818, "learning_rate": 0.0001927842503034913, "loss": 0.0974, "step": 6059 }, { "epoch": 0.39175757575757575, "grad_norm": 0.05881990119814873, "learning_rate": 0.00019278169947193566, "loss": 0.0856, "step": 6060 }, { "epoch": 0.39182222222222224, "grad_norm": 
0.08466936647891998, "learning_rate": 0.00019277914820647072, "loss": 0.0949, "step": 6061 }, { "epoch": 0.3918868686868687, "grad_norm": 0.06451308727264404, "learning_rate": 0.00019277659650710836, "loss": 0.0927, "step": 6062 }, { "epoch": 0.39195151515151516, "grad_norm": 0.05932280421257019, "learning_rate": 0.0001927740443738605, "loss": 0.0743, "step": 6063 }, { "epoch": 0.3920161616161616, "grad_norm": 0.057283125817775726, "learning_rate": 0.00019277149180673913, "loss": 0.0816, "step": 6064 }, { "epoch": 0.3920161616161616, "eval_bleu": 18.449428264436357, "eval_loss": 0.08747922629117966, "eval_runtime": 2.5475, "eval_samples_per_second": 12.561, "eval_steps_per_second": 1.57, "step": 6064 }, { "epoch": 0.3920808080808081, "grad_norm": 0.06371239572763443, "learning_rate": 0.00019276893880575618, "loss": 0.0988, "step": 6065 }, { "epoch": 0.3921454545454545, "grad_norm": 0.06605517119169235, "learning_rate": 0.00019276638537092357, "loss": 0.0789, "step": 6066 }, { "epoch": 0.392210101010101, "grad_norm": 0.062010277062654495, "learning_rate": 0.0001927638315022532, "loss": 0.0763, "step": 6067 }, { "epoch": 0.3922747474747475, "grad_norm": 0.06356547772884369, "learning_rate": 0.0001927612771997571, "loss": 0.0826, "step": 6068 }, { "epoch": 0.39233939393939393, "grad_norm": 0.06292371451854706, "learning_rate": 0.00019275872246344715, "loss": 0.0831, "step": 6069 }, { "epoch": 0.3924040404040404, "grad_norm": 0.07313656061887741, "learning_rate": 0.00019275616729333534, "loss": 0.0986, "step": 6070 }, { "epoch": 0.39246868686868686, "grad_norm": 0.06289214640855789, "learning_rate": 0.0001927536116894336, "loss": 0.0811, "step": 6071 }, { "epoch": 0.39253333333333335, "grad_norm": 0.056175246834754944, "learning_rate": 0.00019275105565175386, "loss": 0.0717, "step": 6072 }, { "epoch": 0.3925979797979798, "grad_norm": 0.05695553869009018, "learning_rate": 0.00019274849918030814, "loss": 0.07, "step": 6073 }, { "epoch": 0.39266262626262627, "grad_norm": 
0.053650856018066406, "learning_rate": 0.00019274594227510828, "loss": 0.0703, "step": 6074 }, { "epoch": 0.3927272727272727, "grad_norm": 0.06531906872987747, "learning_rate": 0.00019274338493616635, "loss": 0.0854, "step": 6075 }, { "epoch": 0.3927919191919192, "grad_norm": 0.05249672383069992, "learning_rate": 0.00019274082716349428, "loss": 0.0748, "step": 6076 }, { "epoch": 0.3928565656565657, "grad_norm": 0.05744104087352753, "learning_rate": 0.000192738268957104, "loss": 0.0869, "step": 6077 }, { "epoch": 0.3929212121212121, "grad_norm": 0.06996724009513855, "learning_rate": 0.0001927357103170075, "loss": 0.0942, "step": 6078 }, { "epoch": 0.3929858585858586, "grad_norm": 0.061627741903066635, "learning_rate": 0.00019273315124321676, "loss": 0.082, "step": 6079 }, { "epoch": 0.39305050505050504, "grad_norm": 0.06431727111339569, "learning_rate": 0.00019273059173574372, "loss": 0.0957, "step": 6080 }, { "epoch": 0.39305050505050504, "eval_bleu": 18.772917224993495, "eval_loss": 0.08863607794046402, "eval_runtime": 2.5728, "eval_samples_per_second": 12.438, "eval_steps_per_second": 1.555, "step": 6080 }, { "epoch": 0.39311515151515153, "grad_norm": 0.06454973667860031, "learning_rate": 0.00019272803179460035, "loss": 0.0831, "step": 6081 }, { "epoch": 0.39317979797979796, "grad_norm": 0.053508177399635315, "learning_rate": 0.00019272547141979863, "loss": 0.0796, "step": 6082 }, { "epoch": 0.39324444444444445, "grad_norm": 0.06300534307956696, "learning_rate": 0.00019272291061135053, "loss": 0.0782, "step": 6083 }, { "epoch": 0.3933090909090909, "grad_norm": 0.06497126072645187, "learning_rate": 0.00019272034936926804, "loss": 0.0971, "step": 6084 }, { "epoch": 0.3933737373737374, "grad_norm": 0.06300681829452515, "learning_rate": 0.00019271778769356312, "loss": 0.0895, "step": 6085 }, { "epoch": 0.39343838383838386, "grad_norm": 0.06522613763809204, "learning_rate": 0.00019271522558424775, "loss": 0.0831, "step": 6086 }, { "epoch": 0.3935030303030303, 
"grad_norm": 0.0627240315079689, "learning_rate": 0.00019271266304133393, "loss": 0.0852, "step": 6087 }, { "epoch": 0.3935676767676768, "grad_norm": 0.06329286843538284, "learning_rate": 0.00019271010006483366, "loss": 0.0946, "step": 6088 }, { "epoch": 0.3936323232323232, "grad_norm": 0.056741196662187576, "learning_rate": 0.00019270753665475888, "loss": 0.0833, "step": 6089 }, { "epoch": 0.3936969696969697, "grad_norm": 0.06135709956288338, "learning_rate": 0.00019270497281112162, "loss": 0.0951, "step": 6090 }, { "epoch": 0.39376161616161615, "grad_norm": 0.059422288089990616, "learning_rate": 0.00019270240853393381, "loss": 0.0804, "step": 6091 }, { "epoch": 0.39382626262626264, "grad_norm": 0.05891941115260124, "learning_rate": 0.0001926998438232075, "loss": 0.0899, "step": 6092 }, { "epoch": 0.39389090909090907, "grad_norm": 0.06666530668735504, "learning_rate": 0.00019269727867895467, "loss": 0.0928, "step": 6093 }, { "epoch": 0.39395555555555556, "grad_norm": 0.0608372837305069, "learning_rate": 0.00019269471310118732, "loss": 0.0929, "step": 6094 }, { "epoch": 0.394020202020202, "grad_norm": 0.0903770923614502, "learning_rate": 0.00019269214708991743, "loss": 0.0813, "step": 6095 }, { "epoch": 0.3940848484848485, "grad_norm": 0.06366690248250961, "learning_rate": 0.00019268958064515702, "loss": 0.0959, "step": 6096 }, { "epoch": 0.3940848484848485, "eval_bleu": 16.06869768325984, "eval_loss": 0.08956421166658401, "eval_runtime": 2.5437, "eval_samples_per_second": 12.58, "eval_steps_per_second": 1.572, "step": 6096 }, { "epoch": 0.39414949494949497, "grad_norm": 0.05655135586857796, "learning_rate": 0.00019268701376691806, "loss": 0.0846, "step": 6097 }, { "epoch": 0.3942141414141414, "grad_norm": 0.05576255917549133, "learning_rate": 0.00019268444645521263, "loss": 0.0768, "step": 6098 }, { "epoch": 0.3942787878787879, "grad_norm": 0.0664646178483963, "learning_rate": 0.00019268187871005263, "loss": 0.0986, "step": 6099 }, { "epoch": 0.39434343434343433, 
"grad_norm": 0.06384041160345078, "learning_rate": 0.00019267931053145015, "loss": 0.0814, "step": 6100 }, { "epoch": 0.3944080808080808, "grad_norm": 0.05733420327305794, "learning_rate": 0.00019267674191941717, "loss": 0.0842, "step": 6101 }, { "epoch": 0.39447272727272725, "grad_norm": 0.05670323222875595, "learning_rate": 0.0001926741728739657, "loss": 0.0783, "step": 6102 }, { "epoch": 0.39453737373737374, "grad_norm": 0.0671645924448967, "learning_rate": 0.00019267160339510778, "loss": 0.0895, "step": 6103 }, { "epoch": 0.3946020202020202, "grad_norm": 0.054618608206510544, "learning_rate": 0.0001926690334828554, "loss": 0.0757, "step": 6104 }, { "epoch": 0.39466666666666667, "grad_norm": 0.06375063210725784, "learning_rate": 0.00019266646313722055, "loss": 0.0916, "step": 6105 }, { "epoch": 0.39473131313131316, "grad_norm": 0.06112280115485191, "learning_rate": 0.00019266389235821532, "loss": 0.0673, "step": 6106 }, { "epoch": 0.3947959595959596, "grad_norm": 0.07012294977903366, "learning_rate": 0.00019266132114585172, "loss": 0.1069, "step": 6107 }, { "epoch": 0.3948606060606061, "grad_norm": 0.06437741219997406, "learning_rate": 0.00019265874950014173, "loss": 0.0954, "step": 6108 }, { "epoch": 0.3949252525252525, "grad_norm": 0.06138409301638603, "learning_rate": 0.0001926561774210974, "loss": 0.082, "step": 6109 }, { "epoch": 0.394989898989899, "grad_norm": 0.1665375530719757, "learning_rate": 0.00019265360490873076, "loss": 0.0962, "step": 6110 }, { "epoch": 0.39505454545454544, "grad_norm": 0.05991007015109062, "learning_rate": 0.00019265103196305382, "loss": 0.0781, "step": 6111 }, { "epoch": 0.3951191919191919, "grad_norm": 0.07785572111606598, "learning_rate": 0.00019264845858407869, "loss": 0.0772, "step": 6112 }, { "epoch": 0.3951191919191919, "eval_bleu": 17.96786411140525, "eval_loss": 0.08971194922924042, "eval_runtime": 2.6265, "eval_samples_per_second": 12.183, "eval_steps_per_second": 1.523, "step": 6112 }, { "epoch": 0.39518383838383836, 
"grad_norm": 0.06312666833400726, "learning_rate": 0.00019264588477181728, "loss": 0.0885, "step": 6113 }, { "epoch": 0.39524848484848485, "grad_norm": 0.07118730247020721, "learning_rate": 0.00019264331052628176, "loss": 0.0748, "step": 6114 }, { "epoch": 0.39531313131313134, "grad_norm": 0.06368167698383331, "learning_rate": 0.00019264073584748405, "loss": 0.0806, "step": 6115 }, { "epoch": 0.3953777777777778, "grad_norm": 0.15761902928352356, "learning_rate": 0.00019263816073543627, "loss": 0.1022, "step": 6116 }, { "epoch": 0.39544242424242426, "grad_norm": 0.058614760637283325, "learning_rate": 0.00019263558519015043, "loss": 0.0789, "step": 6117 }, { "epoch": 0.3955070707070707, "grad_norm": 0.06035598739981651, "learning_rate": 0.00019263300921163858, "loss": 0.0885, "step": 6118 }, { "epoch": 0.3955717171717172, "grad_norm": 0.061089128255844116, "learning_rate": 0.0001926304327999128, "loss": 0.0853, "step": 6119 }, { "epoch": 0.3956363636363636, "grad_norm": 0.06116747111082077, "learning_rate": 0.0001926278559549851, "loss": 0.0821, "step": 6120 }, { "epoch": 0.3957010101010101, "grad_norm": 0.06051664799451828, "learning_rate": 0.0001926252786768675, "loss": 0.0847, "step": 6121 }, { "epoch": 0.39576565656565654, "grad_norm": 0.06509357690811157, "learning_rate": 0.0001926227009655721, "loss": 0.0926, "step": 6122 }, { "epoch": 0.39583030303030303, "grad_norm": 0.06741667538881302, "learning_rate": 0.00019262012282111098, "loss": 0.0813, "step": 6123 }, { "epoch": 0.3958949494949495, "grad_norm": 0.06134733930230141, "learning_rate": 0.00019261754424349615, "loss": 0.0935, "step": 6124 }, { "epoch": 0.39595959595959596, "grad_norm": 0.06497685611248016, "learning_rate": 0.0001926149652327397, "loss": 0.1015, "step": 6125 }, { "epoch": 0.39602424242424245, "grad_norm": 0.0677296370267868, "learning_rate": 0.00019261238578885368, "loss": 0.0993, "step": 6126 }, { "epoch": 0.3960888888888889, "grad_norm": 0.07027005404233932, "learning_rate": 
0.0001926098059118501, "loss": 0.0989, "step": 6127 }, { "epoch": 0.39615353535353537, "grad_norm": 0.06183847412467003, "learning_rate": 0.0001926072256017411, "loss": 0.0922, "step": 6128 }, { "epoch": 0.39615353535353537, "eval_bleu": 16.157141017139292, "eval_loss": 0.0901956856250763, "eval_runtime": 2.5159, "eval_samples_per_second": 12.719, "eval_steps_per_second": 1.59, "step": 6128 }, { "epoch": 0.3962181818181818, "grad_norm": 0.06653418391942978, "learning_rate": 0.00019260464485853875, "loss": 0.0898, "step": 6129 }, { "epoch": 0.3962828282828283, "grad_norm": 0.06713660061359406, "learning_rate": 0.00019260206368225505, "loss": 0.0957, "step": 6130 }, { "epoch": 0.3963474747474747, "grad_norm": 0.061203304678201675, "learning_rate": 0.00019259948207290214, "loss": 0.0925, "step": 6131 }, { "epoch": 0.3964121212121212, "grad_norm": 0.07382739335298538, "learning_rate": 0.00019259690003049204, "loss": 0.1029, "step": 6132 }, { "epoch": 0.39647676767676765, "grad_norm": 0.06369546800851822, "learning_rate": 0.00019259431755503688, "loss": 0.089, "step": 6133 }, { "epoch": 0.39654141414141414, "grad_norm": 0.06193177029490471, "learning_rate": 0.00019259173464654869, "loss": 0.0811, "step": 6134 }, { "epoch": 0.39660606060606063, "grad_norm": 0.06687217205762863, "learning_rate": 0.00019258915130503955, "loss": 0.1009, "step": 6135 }, { "epoch": 0.39667070707070706, "grad_norm": 0.061863914132118225, "learning_rate": 0.0001925865675305216, "loss": 0.0877, "step": 6136 }, { "epoch": 0.39673535353535355, "grad_norm": 0.07034894824028015, "learning_rate": 0.00019258398332300684, "loss": 0.0944, "step": 6137 }, { "epoch": 0.3968, "grad_norm": 0.06272067874670029, "learning_rate": 0.00019258139868250744, "loss": 0.0849, "step": 6138 }, { "epoch": 0.3968646464646465, "grad_norm": 0.0666089653968811, "learning_rate": 0.00019257881360903542, "loss": 0.0861, "step": 6139 }, { "epoch": 0.3969292929292929, "grad_norm": 0.06596490740776062, "learning_rate": 
0.0001925762281026029, "loss": 0.0966, "step": 6140 }, { "epoch": 0.3969939393939394, "grad_norm": 0.0683215856552124, "learning_rate": 0.00019257364216322196, "loss": 0.0907, "step": 6141 }, { "epoch": 0.39705858585858583, "grad_norm": 0.05809183046221733, "learning_rate": 0.00019257105579090472, "loss": 0.0834, "step": 6142 }, { "epoch": 0.3971232323232323, "grad_norm": 0.06948152929544449, "learning_rate": 0.00019256846898566324, "loss": 0.0865, "step": 6143 }, { "epoch": 0.3971878787878788, "grad_norm": 0.06265562772750854, "learning_rate": 0.00019256588174750963, "loss": 0.0868, "step": 6144 }, { "epoch": 0.3971878787878788, "eval_bleu": 17.756317937636037, "eval_loss": 0.09105764329433441, "eval_runtime": 2.6961, "eval_samples_per_second": 11.869, "eval_steps_per_second": 1.484, "step": 6144 }, { "epoch": 0.39725252525252525, "grad_norm": 0.0601598396897316, "learning_rate": 0.00019256329407645598, "loss": 0.0833, "step": 6145 }, { "epoch": 0.39731717171717174, "grad_norm": 0.06053571775555611, "learning_rate": 0.00019256070597251443, "loss": 0.0913, "step": 6146 }, { "epoch": 0.39738181818181817, "grad_norm": 0.07328733056783676, "learning_rate": 0.00019255811743569705, "loss": 0.0903, "step": 6147 }, { "epoch": 0.39744646464646466, "grad_norm": 0.06082964688539505, "learning_rate": 0.00019255552846601594, "loss": 0.0703, "step": 6148 }, { "epoch": 0.3975111111111111, "grad_norm": 0.061511293053627014, "learning_rate": 0.00019255293906348325, "loss": 0.0782, "step": 6149 }, { "epoch": 0.3975757575757576, "grad_norm": 0.06249752268195152, "learning_rate": 0.00019255034922811105, "loss": 0.0867, "step": 6150 }, { "epoch": 0.397640404040404, "grad_norm": 0.07012129575014114, "learning_rate": 0.00019254775895991144, "loss": 0.0935, "step": 6151 }, { "epoch": 0.3977050505050505, "grad_norm": 0.06293827295303345, "learning_rate": 0.00019254516825889656, "loss": 0.0791, "step": 6152 }, { "epoch": 0.397769696969697, "grad_norm": 0.06124801188707352, "learning_rate": 
0.00019254257712507855, "loss": 0.09, "step": 6153 }, { "epoch": 0.39783434343434343, "grad_norm": 0.06056206673383713, "learning_rate": 0.00019253998555846948, "loss": 0.082, "step": 6154 }, { "epoch": 0.3978989898989899, "grad_norm": 0.06283480674028397, "learning_rate": 0.0001925373935590815, "loss": 0.0854, "step": 6155 }, { "epoch": 0.39796363636363635, "grad_norm": 0.06635025143623352, "learning_rate": 0.00019253480112692674, "loss": 0.0963, "step": 6156 }, { "epoch": 0.39802828282828284, "grad_norm": 0.0628061443567276, "learning_rate": 0.0001925322082620173, "loss": 0.0902, "step": 6157 }, { "epoch": 0.3980929292929293, "grad_norm": 0.05936161056160927, "learning_rate": 0.00019252961496436528, "loss": 0.0871, "step": 6158 }, { "epoch": 0.39815757575757577, "grad_norm": 0.06391508877277374, "learning_rate": 0.00019252702123398287, "loss": 0.0931, "step": 6159 }, { "epoch": 0.3982222222222222, "grad_norm": 0.061019327491521835, "learning_rate": 0.00019252442707088215, "loss": 0.0804, "step": 6160 }, { "epoch": 0.3982222222222222, "eval_bleu": 16.624976612956026, "eval_loss": 0.0881526991724968, "eval_runtime": 2.6422, "eval_samples_per_second": 12.111, "eval_steps_per_second": 1.514, "step": 6160 }, { "epoch": 0.3982868686868687, "grad_norm": 0.05570165812969208, "learning_rate": 0.00019252183247507528, "loss": 0.076, "step": 6161 }, { "epoch": 0.3983515151515152, "grad_norm": 0.06894594430923462, "learning_rate": 0.0001925192374465744, "loss": 0.0979, "step": 6162 }, { "epoch": 0.3984161616161616, "grad_norm": 0.06463257223367691, "learning_rate": 0.00019251664198539163, "loss": 0.1012, "step": 6163 }, { "epoch": 0.3984808080808081, "grad_norm": 0.05916406214237213, "learning_rate": 0.0001925140460915391, "loss": 0.086, "step": 6164 }, { "epoch": 0.39854545454545454, "grad_norm": 0.06443562358617783, "learning_rate": 0.00019251144976502899, "loss": 0.0898, "step": 6165 }, { "epoch": 0.398610101010101, "grad_norm": 0.07367242127656937, "learning_rate": 
0.00019250885300587335, "loss": 0.1068, "step": 6166 }, { "epoch": 0.39867474747474746, "grad_norm": 0.06023339927196503, "learning_rate": 0.00019250625581408444, "loss": 0.0913, "step": 6167 }, { "epoch": 0.39873939393939395, "grad_norm": 0.055634014308452606, "learning_rate": 0.00019250365818967432, "loss": 0.0749, "step": 6168 }, { "epoch": 0.3988040404040404, "grad_norm": 0.054479584097862244, "learning_rate": 0.0001925010601326552, "loss": 0.0754, "step": 6169 }, { "epoch": 0.3988686868686869, "grad_norm": 0.05892808735370636, "learning_rate": 0.00019249846164303918, "loss": 0.1018, "step": 6170 }, { "epoch": 0.3989333333333333, "grad_norm": 0.05191429331898689, "learning_rate": 0.00019249586272083847, "loss": 0.0747, "step": 6171 }, { "epoch": 0.3989979797979798, "grad_norm": 0.05970599502325058, "learning_rate": 0.00019249326336606516, "loss": 0.0897, "step": 6172 }, { "epoch": 0.3990626262626263, "grad_norm": 0.06581877917051315, "learning_rate": 0.00019249066357873146, "loss": 0.0953, "step": 6173 }, { "epoch": 0.3991272727272727, "grad_norm": 0.06686174869537354, "learning_rate": 0.0001924880633588495, "loss": 0.1019, "step": 6174 }, { "epoch": 0.3991919191919192, "grad_norm": 0.06059034913778305, "learning_rate": 0.0001924854627064314, "loss": 0.0836, "step": 6175 }, { "epoch": 0.39925656565656564, "grad_norm": 0.054030392318964005, "learning_rate": 0.0001924828616214894, "loss": 0.0782, "step": 6176 }, { "epoch": 0.39925656565656564, "eval_bleu": 18.935031774692757, "eval_loss": 0.09005576372146606, "eval_runtime": 2.6169, "eval_samples_per_second": 12.228, "eval_steps_per_second": 1.529, "step": 6176 }, { "epoch": 0.39932121212121213, "grad_norm": 0.07084312289953232, "learning_rate": 0.00019248026010403564, "loss": 0.1076, "step": 6177 }, { "epoch": 0.39938585858585857, "grad_norm": 0.057744115591049194, "learning_rate": 0.00019247765815408224, "loss": 0.0869, "step": 6178 }, { "epoch": 0.39945050505050506, "grad_norm": 0.054169099777936935, 
"learning_rate": 0.00019247505577164143, "loss": 0.076, "step": 6179 }, { "epoch": 0.3995151515151515, "grad_norm": 0.05713645741343498, "learning_rate": 0.00019247245295672536, "loss": 0.085, "step": 6180 }, { "epoch": 0.399579797979798, "grad_norm": 0.07999927550554276, "learning_rate": 0.0001924698497093462, "loss": 0.0816, "step": 6181 }, { "epoch": 0.39964444444444447, "grad_norm": 0.05664883181452751, "learning_rate": 0.00019246724602951608, "loss": 0.0776, "step": 6182 }, { "epoch": 0.3997090909090909, "grad_norm": 0.07353712618350983, "learning_rate": 0.0001924646419172473, "loss": 0.1023, "step": 6183 }, { "epoch": 0.3997737373737374, "grad_norm": 0.061008743941783905, "learning_rate": 0.00019246203737255187, "loss": 0.0925, "step": 6184 }, { "epoch": 0.3998383838383838, "grad_norm": 0.06014643982052803, "learning_rate": 0.00019245943239544212, "loss": 0.0899, "step": 6185 }, { "epoch": 0.3999030303030303, "grad_norm": 0.1315092295408249, "learning_rate": 0.00019245682698593015, "loss": 0.0787, "step": 6186 }, { "epoch": 0.39996767676767675, "grad_norm": 0.0656527429819107, "learning_rate": 0.00019245422114402816, "loss": 0.0899, "step": 6187 }, { "epoch": 0.40003232323232324, "grad_norm": 0.05592891201376915, "learning_rate": 0.00019245161486974836, "loss": 0.0886, "step": 6188 }, { "epoch": 0.4000969696969697, "grad_norm": 0.07177361845970154, "learning_rate": 0.00019244900816310292, "loss": 0.1052, "step": 6189 }, { "epoch": 0.40016161616161616, "grad_norm": 0.07556518167257309, "learning_rate": 0.00019244640102410401, "loss": 0.1091, "step": 6190 }, { "epoch": 0.40022626262626265, "grad_norm": 0.06207931041717529, "learning_rate": 0.00019244379345276387, "loss": 0.0888, "step": 6191 }, { "epoch": 0.4002909090909091, "grad_norm": 0.06689703464508057, "learning_rate": 0.00019244118544909465, "loss": 0.0922, "step": 6192 }, { "epoch": 0.4002909090909091, "eval_bleu": 16.32911757834027, "eval_loss": 0.09045293927192688, "eval_runtime": 2.6452, 
"eval_samples_per_second": 12.098, "eval_steps_per_second": 1.512, "step": 6192 }, { "epoch": 0.4003555555555556, "grad_norm": 0.06317510455846786, "learning_rate": 0.00019243857701310856, "loss": 0.1, "step": 6193 }, { "epoch": 0.400420202020202, "grad_norm": 0.07511056214570999, "learning_rate": 0.00019243596814481784, "loss": 0.0978, "step": 6194 }, { "epoch": 0.4004848484848485, "grad_norm": 0.06433052569627762, "learning_rate": 0.00019243335884423462, "loss": 0.0876, "step": 6195 }, { "epoch": 0.40054949494949493, "grad_norm": 0.06150728464126587, "learning_rate": 0.00019243074911137113, "loss": 0.0853, "step": 6196 }, { "epoch": 0.4006141414141414, "grad_norm": 0.06031908467411995, "learning_rate": 0.00019242813894623963, "loss": 0.0893, "step": 6197 }, { "epoch": 0.40067878787878786, "grad_norm": 0.05643102526664734, "learning_rate": 0.00019242552834885225, "loss": 0.0818, "step": 6198 }, { "epoch": 0.40074343434343435, "grad_norm": 0.06167101487517357, "learning_rate": 0.00019242291731922122, "loss": 0.082, "step": 6199 }, { "epoch": 0.40080808080808084, "grad_norm": 0.0652003288269043, "learning_rate": 0.00019242030585735879, "loss": 0.0812, "step": 6200 }, { "epoch": 0.40087272727272727, "grad_norm": 0.057824887335300446, "learning_rate": 0.00019241769396327713, "loss": 0.0823, "step": 6201 }, { "epoch": 0.40093737373737376, "grad_norm": 0.06495670974254608, "learning_rate": 0.00019241508163698846, "loss": 0.0982, "step": 6202 }, { "epoch": 0.4010020202020202, "grad_norm": 0.06464248150587082, "learning_rate": 0.000192412468878505, "loss": 0.0804, "step": 6203 }, { "epoch": 0.4010666666666667, "grad_norm": 0.06587665528059006, "learning_rate": 0.000192409855687839, "loss": 0.0881, "step": 6204 }, { "epoch": 0.4011313131313131, "grad_norm": 0.06167178973555565, "learning_rate": 0.00019240724206500263, "loss": 0.0877, "step": 6205 }, { "epoch": 0.4011959595959596, "grad_norm": 0.061472125351428986, "learning_rate": 0.00019240462801000817, "loss": 0.082, 
"step": 6206 }, { "epoch": 0.40126060606060604, "grad_norm": 0.06439856439828873, "learning_rate": 0.00019240201352286777, "loss": 0.0803, "step": 6207 }, { "epoch": 0.40132525252525253, "grad_norm": 0.05771096795797348, "learning_rate": 0.00019239939860359373, "loss": 0.086, "step": 6208 }, { "epoch": 0.40132525252525253, "eval_bleu": 16.88973803320502, "eval_loss": 0.08955128490924835, "eval_runtime": 2.6001, "eval_samples_per_second": 12.307, "eval_steps_per_second": 1.538, "step": 6208 }, { "epoch": 0.40138989898989896, "grad_norm": 0.06266828626394272, "learning_rate": 0.00019239678325219824, "loss": 0.0835, "step": 6209 }, { "epoch": 0.40145454545454545, "grad_norm": 0.059250589460134506, "learning_rate": 0.00019239416746869354, "loss": 0.0816, "step": 6210 }, { "epoch": 0.40151919191919194, "grad_norm": 0.05307573452591896, "learning_rate": 0.00019239155125309187, "loss": 0.0714, "step": 6211 }, { "epoch": 0.4015838383838384, "grad_norm": 0.06897120177745819, "learning_rate": 0.00019238893460540543, "loss": 0.1121, "step": 6212 }, { "epoch": 0.40164848484848487, "grad_norm": 0.06551732867956161, "learning_rate": 0.00019238631752564652, "loss": 0.0939, "step": 6213 }, { "epoch": 0.4017131313131313, "grad_norm": 0.05918030068278313, "learning_rate": 0.00019238370001382733, "loss": 0.0898, "step": 6214 }, { "epoch": 0.4017777777777778, "grad_norm": 0.05898888409137726, "learning_rate": 0.00019238108206996014, "loss": 0.0917, "step": 6215 }, { "epoch": 0.4018424242424242, "grad_norm": 0.060877665877342224, "learning_rate": 0.00019237846369405713, "loss": 0.0829, "step": 6216 }, { "epoch": 0.4019070707070707, "grad_norm": 0.0632459744811058, "learning_rate": 0.00019237584488613062, "loss": 0.0911, "step": 6217 }, { "epoch": 0.40197171717171715, "grad_norm": 0.05619823560118675, "learning_rate": 0.0001923732256461928, "loss": 0.0796, "step": 6218 }, { "epoch": 0.40203636363636364, "grad_norm": 0.056191008538007736, "learning_rate": 0.00019237060597425597, "loss": 
0.0773, "step": 6219 }, { "epoch": 0.4021010101010101, "grad_norm": 0.05745304375886917, "learning_rate": 0.00019236798587033232, "loss": 0.0833, "step": 6220 }, { "epoch": 0.40216565656565656, "grad_norm": 0.05724504217505455, "learning_rate": 0.00019236536533443415, "loss": 0.0877, "step": 6221 }, { "epoch": 0.40223030303030305, "grad_norm": 0.06284992396831512, "learning_rate": 0.00019236274436657368, "loss": 0.0919, "step": 6222 }, { "epoch": 0.4022949494949495, "grad_norm": 0.05888194218277931, "learning_rate": 0.00019236012296676322, "loss": 0.0778, "step": 6223 }, { "epoch": 0.402359595959596, "grad_norm": 0.06453243643045425, "learning_rate": 0.00019235750113501498, "loss": 0.0904, "step": 6224 }, { "epoch": 0.402359595959596, "eval_bleu": 17.185766498227068, "eval_loss": 0.09100686758756638, "eval_runtime": 2.6702, "eval_samples_per_second": 11.984, "eval_steps_per_second": 1.498, "step": 6224 }, { "epoch": 0.4024242424242424, "grad_norm": 0.05837666615843773, "learning_rate": 0.00019235487887134126, "loss": 0.0948, "step": 6225 }, { "epoch": 0.4024888888888889, "grad_norm": 0.05736033245921135, "learning_rate": 0.0001923522561757543, "loss": 0.0753, "step": 6226 }, { "epoch": 0.40255353535353533, "grad_norm": 0.04817437380552292, "learning_rate": 0.00019234963304826637, "loss": 0.0685, "step": 6227 }, { "epoch": 0.4026181818181818, "grad_norm": 0.06265473365783691, "learning_rate": 0.00019234700948888973, "loss": 0.0913, "step": 6228 }, { "epoch": 0.4026828282828283, "grad_norm": 0.05184859782457352, "learning_rate": 0.00019234438549763666, "loss": 0.0722, "step": 6229 }, { "epoch": 0.40274747474747474, "grad_norm": 0.05518975481390953, "learning_rate": 0.0001923417610745194, "loss": 0.0796, "step": 6230 }, { "epoch": 0.40281212121212123, "grad_norm": 0.06509093940258026, "learning_rate": 0.00019233913621955026, "loss": 0.0817, "step": 6231 }, { "epoch": 0.40287676767676767, "grad_norm": 0.0605676993727684, "learning_rate": 0.00019233651093274152, "loss": 
0.0886, "step": 6232 }, { "epoch": 0.40294141414141416, "grad_norm": 0.06798075139522552, "learning_rate": 0.00019233388521410546, "loss": 0.0893, "step": 6233 }, { "epoch": 0.4030060606060606, "grad_norm": 0.0585663765668869, "learning_rate": 0.00019233125906365433, "loss": 0.0908, "step": 6234 }, { "epoch": 0.4030707070707071, "grad_norm": 0.06047147884964943, "learning_rate": 0.00019232863248140042, "loss": 0.088, "step": 6235 }, { "epoch": 0.4031353535353535, "grad_norm": 0.05858450010418892, "learning_rate": 0.00019232600546735602, "loss": 0.0748, "step": 6236 }, { "epoch": 0.4032, "grad_norm": 0.061827633529901505, "learning_rate": 0.00019232337802153342, "loss": 0.0864, "step": 6237 }, { "epoch": 0.4032646464646465, "grad_norm": 0.06853217631578445, "learning_rate": 0.00019232075014394487, "loss": 0.0914, "step": 6238 }, { "epoch": 0.4033292929292929, "grad_norm": 0.09749385714530945, "learning_rate": 0.00019231812183460273, "loss": 0.1103, "step": 6239 }, { "epoch": 0.4033939393939394, "grad_norm": 0.05868695676326752, "learning_rate": 0.00019231549309351925, "loss": 0.0745, "step": 6240 }, { "epoch": 0.4033939393939394, "eval_bleu": 17.106908992888926, "eval_loss": 0.09054448455572128, "eval_runtime": 2.595, "eval_samples_per_second": 12.331, "eval_steps_per_second": 1.541, "step": 6240 }, { "epoch": 0.40345858585858585, "grad_norm": 0.06214582920074463, "learning_rate": 0.00019231286392070674, "loss": 0.0775, "step": 6241 }, { "epoch": 0.40352323232323234, "grad_norm": 0.05925198644399643, "learning_rate": 0.00019231023431617744, "loss": 0.0845, "step": 6242 }, { "epoch": 0.4035878787878788, "grad_norm": 0.060223836451768875, "learning_rate": 0.00019230760427994375, "loss": 0.0814, "step": 6243 }, { "epoch": 0.40365252525252526, "grad_norm": 0.06870649755001068, "learning_rate": 0.00019230497381201786, "loss": 0.1054, "step": 6244 }, { "epoch": 0.4037171717171717, "grad_norm": 0.06820739805698395, "learning_rate": 0.00019230234291241213, "loss": 0.1066, 
"step": 6245 }, { "epoch": 0.4037818181818182, "grad_norm": 0.07218804955482483, "learning_rate": 0.0001922997115811389, "loss": 0.0906, "step": 6246 }, { "epoch": 0.4038464646464646, "grad_norm": 0.052760858088731766, "learning_rate": 0.00019229707981821038, "loss": 0.0775, "step": 6247 }, { "epoch": 0.4039111111111111, "grad_norm": 0.05905124917626381, "learning_rate": 0.00019229444762363895, "loss": 0.0884, "step": 6248 }, { "epoch": 0.4039757575757576, "grad_norm": 0.05837773159146309, "learning_rate": 0.00019229181499743692, "loss": 0.0971, "step": 6249 }, { "epoch": 0.40404040404040403, "grad_norm": 0.06331276893615723, "learning_rate": 0.00019228918193961656, "loss": 0.0888, "step": 6250 }, { "epoch": 0.4041050505050505, "grad_norm": 0.0862065926194191, "learning_rate": 0.00019228654845019022, "loss": 0.0973, "step": 6251 }, { "epoch": 0.40416969696969696, "grad_norm": 0.06020095571875572, "learning_rate": 0.00019228391452917015, "loss": 0.0865, "step": 6252 }, { "epoch": 0.40423434343434345, "grad_norm": 0.0543849878013134, "learning_rate": 0.00019228128017656878, "loss": 0.0829, "step": 6253 }, { "epoch": 0.4042989898989899, "grad_norm": 0.0637732669711113, "learning_rate": 0.00019227864539239837, "loss": 0.0942, "step": 6254 }, { "epoch": 0.40436363636363637, "grad_norm": 0.057019270956516266, "learning_rate": 0.00019227601017667123, "loss": 0.0862, "step": 6255 }, { "epoch": 0.4044282828282828, "grad_norm": 0.06783878803253174, "learning_rate": 0.00019227337452939967, "loss": 0.1026, "step": 6256 }, { "epoch": 0.4044282828282828, "eval_bleu": 16.20386205847146, "eval_loss": 0.08927594870328903, "eval_runtime": 2.6451, "eval_samples_per_second": 12.098, "eval_steps_per_second": 1.512, "step": 6256 }, { "epoch": 0.4044929292929293, "grad_norm": 0.06168825179338455, "learning_rate": 0.00019227073845059606, "loss": 0.094, "step": 6257 }, { "epoch": 0.4045575757575758, "grad_norm": 0.06450296938419342, "learning_rate": 0.00019226810194027273, "loss": 0.0886, 
"step": 6258 }, { "epoch": 0.4046222222222222, "grad_norm": 0.05807584151625633, "learning_rate": 0.000192265464998442, "loss": 0.0925, "step": 6259 }, { "epoch": 0.4046868686868687, "grad_norm": 0.06425441056489944, "learning_rate": 0.00019226282762511613, "loss": 0.089, "step": 6260 }, { "epoch": 0.40475151515151514, "grad_norm": 0.060222528874874115, "learning_rate": 0.00019226018982030756, "loss": 0.0901, "step": 6261 }, { "epoch": 0.40481616161616163, "grad_norm": 0.06360756605863571, "learning_rate": 0.00019225755158402857, "loss": 0.0815, "step": 6262 }, { "epoch": 0.40488080808080806, "grad_norm": 0.05671854317188263, "learning_rate": 0.00019225491291629152, "loss": 0.0751, "step": 6263 }, { "epoch": 0.40494545454545455, "grad_norm": 0.0711979866027832, "learning_rate": 0.00019225227381710873, "loss": 0.0732, "step": 6264 }, { "epoch": 0.405010101010101, "grad_norm": 0.06642993539571762, "learning_rate": 0.00019224963428649257, "loss": 0.091, "step": 6265 }, { "epoch": 0.4050747474747475, "grad_norm": 0.06928640604019165, "learning_rate": 0.00019224699432445538, "loss": 0.0942, "step": 6266 }, { "epoch": 0.40513939393939397, "grad_norm": 0.0544753335416317, "learning_rate": 0.00019224435393100947, "loss": 0.0712, "step": 6267 }, { "epoch": 0.4052040404040404, "grad_norm": 0.06972908973693848, "learning_rate": 0.00019224171310616722, "loss": 0.0963, "step": 6268 }, { "epoch": 0.4052686868686869, "grad_norm": 0.05216776952147484, "learning_rate": 0.00019223907184994095, "loss": 0.0691, "step": 6269 }, { "epoch": 0.4053333333333333, "grad_norm": 0.06261812895536423, "learning_rate": 0.00019223643016234306, "loss": 0.0893, "step": 6270 }, { "epoch": 0.4053979797979798, "grad_norm": 0.06896913796663284, "learning_rate": 0.00019223378804338585, "loss": 0.0902, "step": 6271 }, { "epoch": 0.40546262626262625, "grad_norm": 0.0558202862739563, "learning_rate": 0.00019223114549308173, "loss": 0.0764, "step": 6272 }, { "epoch": 0.40546262626262625, "eval_bleu": 
17.275602444192018, "eval_loss": 0.08909797668457031, "eval_runtime": 2.6695, "eval_samples_per_second": 11.987, "eval_steps_per_second": 1.498, "step": 6272 }, { "epoch": 0.40552727272727274, "grad_norm": 0.05707972124218941, "learning_rate": 0.00019222850251144304, "loss": 0.0714, "step": 6273 }, { "epoch": 0.40559191919191917, "grad_norm": 0.06521407514810562, "learning_rate": 0.00019222585909848213, "loss": 0.1014, "step": 6274 }, { "epoch": 0.40565656565656566, "grad_norm": 0.06540525704622269, "learning_rate": 0.00019222321525421134, "loss": 0.0848, "step": 6275 }, { "epoch": 0.4057212121212121, "grad_norm": 0.060662321746349335, "learning_rate": 0.0001922205709786431, "loss": 0.0764, "step": 6276 }, { "epoch": 0.4057858585858586, "grad_norm": 0.06789975613355637, "learning_rate": 0.00019221792627178972, "loss": 0.0927, "step": 6277 }, { "epoch": 0.4058505050505051, "grad_norm": 0.062314216047525406, "learning_rate": 0.00019221528113366358, "loss": 0.083, "step": 6278 }, { "epoch": 0.4059151515151515, "grad_norm": 0.06416933983564377, "learning_rate": 0.00019221263556427705, "loss": 0.0831, "step": 6279 }, { "epoch": 0.405979797979798, "grad_norm": 0.07141841948032379, "learning_rate": 0.00019220998956364252, "loss": 0.1063, "step": 6280 }, { "epoch": 0.40604444444444443, "grad_norm": 0.05750516057014465, "learning_rate": 0.00019220734313177236, "loss": 0.0832, "step": 6281 }, { "epoch": 0.4061090909090909, "grad_norm": 0.056281477212905884, "learning_rate": 0.00019220469626867893, "loss": 0.0842, "step": 6282 }, { "epoch": 0.40617373737373735, "grad_norm": 0.06443186104297638, "learning_rate": 0.00019220204897437462, "loss": 0.0786, "step": 6283 }, { "epoch": 0.40623838383838384, "grad_norm": 0.05903633311390877, "learning_rate": 0.0001921994012488718, "loss": 0.0716, "step": 6284 }, { "epoch": 0.4063030303030303, "grad_norm": 0.05655737966299057, "learning_rate": 0.00019219675309218288, "loss": 0.0797, "step": 6285 }, { "epoch": 0.40636767676767677, 
"grad_norm": 0.06483691930770874, "learning_rate": 0.0001921941045043202, "loss": 0.0866, "step": 6286 }, { "epoch": 0.40643232323232326, "grad_norm": 0.06114526465535164, "learning_rate": 0.0001921914554852962, "loss": 0.0801, "step": 6287 }, { "epoch": 0.4064969696969697, "grad_norm": 0.051034342497587204, "learning_rate": 0.0001921888060351232, "loss": 0.0596, "step": 6288 }, { "epoch": 0.4064969696969697, "eval_bleu": 19.183211362316765, "eval_loss": 0.08993978798389435, "eval_runtime": 2.5818, "eval_samples_per_second": 12.395, "eval_steps_per_second": 1.549, "step": 6288 }, { "epoch": 0.4065616161616162, "grad_norm": 0.0594867579638958, "learning_rate": 0.00019218615615381368, "loss": 0.0802, "step": 6289 }, { "epoch": 0.4066262626262626, "grad_norm": 0.06512024253606796, "learning_rate": 0.00019218350584137995, "loss": 0.0972, "step": 6290 }, { "epoch": 0.4066909090909091, "grad_norm": 0.07817979156970978, "learning_rate": 0.00019218085509783444, "loss": 0.0894, "step": 6291 }, { "epoch": 0.40675555555555554, "grad_norm": 0.07366369664669037, "learning_rate": 0.00019217820392318955, "loss": 0.1158, "step": 6292 }, { "epoch": 0.406820202020202, "grad_norm": 0.07197237014770508, "learning_rate": 0.00019217555231745766, "loss": 0.0964, "step": 6293 }, { "epoch": 0.40688484848484846, "grad_norm": 0.05749428644776344, "learning_rate": 0.00019217290028065122, "loss": 0.0765, "step": 6294 }, { "epoch": 0.40694949494949495, "grad_norm": 0.06849973648786545, "learning_rate": 0.00019217024781278258, "loss": 0.0904, "step": 6295 }, { "epoch": 0.40701414141414144, "grad_norm": 0.0741352066397667, "learning_rate": 0.00019216759491386416, "loss": 0.0967, "step": 6296 }, { "epoch": 0.4070787878787879, "grad_norm": 0.055018339306116104, "learning_rate": 0.00019216494158390838, "loss": 0.0766, "step": 6297 }, { "epoch": 0.40714343434343436, "grad_norm": 0.06522677093744278, "learning_rate": 0.00019216228782292764, "loss": 0.0776, "step": 6298 }, { "epoch": 
0.4072080808080808, "grad_norm": 0.06749577075242996, "learning_rate": 0.00019215963363093433, "loss": 0.0934, "step": 6299 }, { "epoch": 0.4072727272727273, "grad_norm": 0.05969200283288956, "learning_rate": 0.00019215697900794087, "loss": 0.0857, "step": 6300 }, { "epoch": 0.4073373737373737, "grad_norm": 0.06710877269506454, "learning_rate": 0.0001921543239539597, "loss": 0.0983, "step": 6301 }, { "epoch": 0.4074020202020202, "grad_norm": 0.09139304608106613, "learning_rate": 0.00019215166846900324, "loss": 0.0829, "step": 6302 }, { "epoch": 0.40746666666666664, "grad_norm": 0.07918092608451843, "learning_rate": 0.00019214901255308386, "loss": 0.117, "step": 6303 }, { "epoch": 0.40753131313131313, "grad_norm": 0.05858078598976135, "learning_rate": 0.00019214635620621402, "loss": 0.0725, "step": 6304 }, { "epoch": 0.40753131313131313, "eval_bleu": 16.843545655212985, "eval_loss": 0.08849084377288818, "eval_runtime": 2.5356, "eval_samples_per_second": 12.62, "eval_steps_per_second": 1.578, "step": 6304 }, { "epoch": 0.4075959595959596, "grad_norm": 0.05311514809727669, "learning_rate": 0.00019214369942840615, "loss": 0.0723, "step": 6305 }, { "epoch": 0.40766060606060606, "grad_norm": 0.06641102582216263, "learning_rate": 0.00019214104221967267, "loss": 0.0909, "step": 6306 }, { "epoch": 0.40772525252525255, "grad_norm": 0.06198639050126076, "learning_rate": 0.00019213838458002598, "loss": 0.0759, "step": 6307 }, { "epoch": 0.407789898989899, "grad_norm": 0.05539361760020256, "learning_rate": 0.00019213572650947852, "loss": 0.0737, "step": 6308 }, { "epoch": 0.40785454545454547, "grad_norm": 0.05804649367928505, "learning_rate": 0.00019213306800804273, "loss": 0.0818, "step": 6309 }, { "epoch": 0.4079191919191919, "grad_norm": 0.06841015815734863, "learning_rate": 0.00019213040907573104, "loss": 0.085, "step": 6310 }, { "epoch": 0.4079838383838384, "grad_norm": 0.06494832038879395, "learning_rate": 0.0001921277497125559, "loss": 0.0863, "step": 6311 }, { "epoch": 
0.4080484848484848, "grad_norm": 0.060785941779613495, "learning_rate": 0.0001921250899185297, "loss": 0.0907, "step": 6312 }, { "epoch": 0.4081131313131313, "grad_norm": 0.07769668847322464, "learning_rate": 0.00019212242969366493, "loss": 0.1017, "step": 6313 }, { "epoch": 0.40817777777777775, "grad_norm": 0.06647679209709167, "learning_rate": 0.000192119769037974, "loss": 0.1054, "step": 6314 }, { "epoch": 0.40824242424242424, "grad_norm": 0.0642043799161911, "learning_rate": 0.00019211710795146938, "loss": 0.0979, "step": 6315 }, { "epoch": 0.40830707070707073, "grad_norm": 0.06753615289926529, "learning_rate": 0.0001921144464341635, "loss": 0.094, "step": 6316 }, { "epoch": 0.40837171717171716, "grad_norm": 0.06585294753313065, "learning_rate": 0.0001921117844860688, "loss": 0.1001, "step": 6317 }, { "epoch": 0.40843636363636365, "grad_norm": 0.0661025196313858, "learning_rate": 0.00019210912210719774, "loss": 0.0802, "step": 6318 }, { "epoch": 0.4085010101010101, "grad_norm": 0.06268145889043808, "learning_rate": 0.00019210645929756276, "loss": 0.0844, "step": 6319 }, { "epoch": 0.4085656565656566, "grad_norm": 0.06500232964754105, "learning_rate": 0.0001921037960571763, "loss": 0.0896, "step": 6320 }, { "epoch": 0.4085656565656566, "eval_bleu": 17.086134668705583, "eval_loss": 0.08889850974082947, "eval_runtime": 2.5801, "eval_samples_per_second": 12.403, "eval_steps_per_second": 1.55, "step": 6320 }, { "epoch": 0.408630303030303, "grad_norm": 0.06656234711408615, "learning_rate": 0.00019210113238605088, "loss": 0.0876, "step": 6321 }, { "epoch": 0.4086949494949495, "grad_norm": 0.06568959355354309, "learning_rate": 0.00019209846828419886, "loss": 0.1035, "step": 6322 }, { "epoch": 0.40875959595959593, "grad_norm": 0.05929458141326904, "learning_rate": 0.00019209580375163278, "loss": 0.0906, "step": 6323 }, { "epoch": 0.4088242424242424, "grad_norm": 0.06133441999554634, "learning_rate": 0.00019209313878836508, "loss": 0.0781, "step": 6324 }, { "epoch": 
0.4088888888888889, "grad_norm": 0.06407757103443146, "learning_rate": 0.00019209047339440819, "loss": 0.1023, "step": 6325 }, { "epoch": 0.40895353535353535, "grad_norm": 0.06653454154729843, "learning_rate": 0.00019208780756977464, "loss": 0.0972, "step": 6326 }, { "epoch": 0.40901818181818184, "grad_norm": 0.06694474816322327, "learning_rate": 0.00019208514131447682, "loss": 0.105, "step": 6327 }, { "epoch": 0.40908282828282827, "grad_norm": 0.07815913110971451, "learning_rate": 0.00019208247462852723, "loss": 0.0743, "step": 6328 }, { "epoch": 0.40914747474747476, "grad_norm": 0.06447961926460266, "learning_rate": 0.0001920798075119384, "loss": 0.1044, "step": 6329 }, { "epoch": 0.4092121212121212, "grad_norm": 0.055926863104104996, "learning_rate": 0.00019207713996472272, "loss": 0.0684, "step": 6330 }, { "epoch": 0.4092767676767677, "grad_norm": 0.06656002998352051, "learning_rate": 0.0001920744719868927, "loss": 0.0856, "step": 6331 }, { "epoch": 0.4093414141414141, "grad_norm": 0.059816598892211914, "learning_rate": 0.00019207180357846078, "loss": 0.0938, "step": 6332 }, { "epoch": 0.4094060606060606, "grad_norm": 0.06265003979206085, "learning_rate": 0.00019206913473943953, "loss": 0.0887, "step": 6333 }, { "epoch": 0.4094707070707071, "grad_norm": 0.05381350964307785, "learning_rate": 0.00019206646546984131, "loss": 0.0695, "step": 6334 }, { "epoch": 0.40953535353535353, "grad_norm": 0.06391909718513489, "learning_rate": 0.00019206379576967874, "loss": 0.0916, "step": 6335 }, { "epoch": 0.4096, "grad_norm": 0.04978372901678085, "learning_rate": 0.0001920611256389642, "loss": 0.067, "step": 6336 }, { "epoch": 0.4096, "eval_bleu": 16.416107609879965, "eval_loss": 0.08910023421049118, "eval_runtime": 2.6837, "eval_samples_per_second": 11.924, "eval_steps_per_second": 1.49, "step": 6336 }, { "epoch": 0.40966464646464645, "grad_norm": 0.05836545675992966, "learning_rate": 0.0001920584550777102, "loss": 0.0843, "step": 6337 }, { "epoch": 0.40972929292929294, 
"grad_norm": 0.06360521912574768, "learning_rate": 0.00019205578408592927, "loss": 0.0765, "step": 6338 }, { "epoch": 0.4097939393939394, "grad_norm": 0.06930802017450333, "learning_rate": 0.00019205311266363385, "loss": 0.0997, "step": 6339 }, { "epoch": 0.40985858585858587, "grad_norm": 0.0698675587773323, "learning_rate": 0.00019205044081083643, "loss": 0.1017, "step": 6340 }, { "epoch": 0.4099232323232323, "grad_norm": 0.05524763464927673, "learning_rate": 0.00019204776852754956, "loss": 0.0746, "step": 6341 }, { "epoch": 0.4099878787878788, "grad_norm": 0.0626600906252861, "learning_rate": 0.00019204509581378568, "loss": 0.0878, "step": 6342 }, { "epoch": 0.4100525252525253, "grad_norm": 0.05989215150475502, "learning_rate": 0.00019204242266955736, "loss": 0.091, "step": 6343 }, { "epoch": 0.4101171717171717, "grad_norm": 0.04702543094754219, "learning_rate": 0.000192039749094877, "loss": 0.0642, "step": 6344 }, { "epoch": 0.4101818181818182, "grad_norm": 0.05482880026102066, "learning_rate": 0.0001920370750897572, "loss": 0.0739, "step": 6345 }, { "epoch": 0.41024646464646464, "grad_norm": 0.0598100982606411, "learning_rate": 0.00019203440065421044, "loss": 0.087, "step": 6346 }, { "epoch": 0.4103111111111111, "grad_norm": 0.0614834800362587, "learning_rate": 0.0001920317257882492, "loss": 0.0881, "step": 6347 }, { "epoch": 0.41037575757575756, "grad_norm": 0.06558398902416229, "learning_rate": 0.000192029050491886, "loss": 0.0931, "step": 6348 }, { "epoch": 0.41044040404040405, "grad_norm": 0.06954683363437653, "learning_rate": 0.00019202637476513335, "loss": 0.1097, "step": 6349 }, { "epoch": 0.4105050505050505, "grad_norm": 0.05398554727435112, "learning_rate": 0.00019202369860800376, "loss": 0.0717, "step": 6350 }, { "epoch": 0.410569696969697, "grad_norm": 0.08577829599380493, "learning_rate": 0.00019202102202050977, "loss": 0.1099, "step": 6351 }, { "epoch": 0.4106343434343434, "grad_norm": 0.07028111070394516, "learning_rate": 0.00019201834500266388, 
"loss": 0.1045, "step": 6352 }, { "epoch": 0.4106343434343434, "eval_bleu": 16.741791454813715, "eval_loss": 0.08946040272712708, "eval_runtime": 2.6234, "eval_samples_per_second": 12.198, "eval_steps_per_second": 1.525, "step": 6352 }, { "epoch": 0.4106989898989899, "grad_norm": 0.057074498385190964, "learning_rate": 0.00019201566755447862, "loss": 0.0837, "step": 6353 }, { "epoch": 0.4107636363636364, "grad_norm": 0.05977410078048706, "learning_rate": 0.00019201298967596648, "loss": 0.0907, "step": 6354 }, { "epoch": 0.4108282828282828, "grad_norm": 0.06141617149114609, "learning_rate": 0.00019201031136714002, "loss": 0.0903, "step": 6355 }, { "epoch": 0.4108929292929293, "grad_norm": 0.05897141993045807, "learning_rate": 0.00019200763262801175, "loss": 0.0753, "step": 6356 }, { "epoch": 0.41095757575757574, "grad_norm": 0.06495220214128494, "learning_rate": 0.00019200495345859418, "loss": 0.0943, "step": 6357 }, { "epoch": 0.41102222222222223, "grad_norm": 0.0590679794549942, "learning_rate": 0.0001920022738588999, "loss": 0.0874, "step": 6358 }, { "epoch": 0.41108686868686867, "grad_norm": 0.06751900166273117, "learning_rate": 0.00019199959382894139, "loss": 0.1022, "step": 6359 }, { "epoch": 0.41115151515151516, "grad_norm": 0.05833285301923752, "learning_rate": 0.00019199691336873118, "loss": 0.0785, "step": 6360 }, { "epoch": 0.4112161616161616, "grad_norm": 0.05916067212820053, "learning_rate": 0.00019199423247828184, "loss": 0.0856, "step": 6361 }, { "epoch": 0.4112808080808081, "grad_norm": 0.064625084400177, "learning_rate": 0.00019199155115760585, "loss": 0.0968, "step": 6362 }, { "epoch": 0.41134545454545457, "grad_norm": 0.058268602937459946, "learning_rate": 0.00019198886940671582, "loss": 0.088, "step": 6363 }, { "epoch": 0.411410101010101, "grad_norm": 0.06716199964284897, "learning_rate": 0.00019198618722562422, "loss": 0.0929, "step": 6364 }, { "epoch": 0.4114747474747475, "grad_norm": 0.05781184136867523, "learning_rate": 0.00019198350461434367, 
"loss": 0.0851, "step": 6365 }, { "epoch": 0.4115393939393939, "grad_norm": 0.061366304755210876, "learning_rate": 0.00019198082157288667, "loss": 0.0916, "step": 6366 }, { "epoch": 0.4116040404040404, "grad_norm": 0.06434094905853271, "learning_rate": 0.00019197813810126576, "loss": 0.0904, "step": 6367 }, { "epoch": 0.41166868686868685, "grad_norm": 0.05944591388106346, "learning_rate": 0.00019197545419949354, "loss": 0.09, "step": 6368 }, { "epoch": 0.41166868686868685, "eval_bleu": 18.36413751316706, "eval_loss": 0.0892038568854332, "eval_runtime": 2.7142, "eval_samples_per_second": 11.79, "eval_steps_per_second": 1.474, "step": 6368 }, { "epoch": 0.41173333333333334, "grad_norm": 0.0821535661816597, "learning_rate": 0.00019197276986758247, "loss": 0.1029, "step": 6369 }, { "epoch": 0.4117979797979798, "grad_norm": 0.056876007467508316, "learning_rate": 0.00019197008510554522, "loss": 0.0844, "step": 6370 }, { "epoch": 0.41186262626262626, "grad_norm": 0.051439255475997925, "learning_rate": 0.00019196739991339428, "loss": 0.0755, "step": 6371 }, { "epoch": 0.41192727272727275, "grad_norm": 0.06323958933353424, "learning_rate": 0.00019196471429114216, "loss": 0.0911, "step": 6372 }, { "epoch": 0.4119919191919192, "grad_norm": 0.0582243949174881, "learning_rate": 0.00019196202823880155, "loss": 0.082, "step": 6373 }, { "epoch": 0.4120565656565657, "grad_norm": 0.07831689715385437, "learning_rate": 0.0001919593417563849, "loss": 0.1041, "step": 6374 }, { "epoch": 0.4121212121212121, "grad_norm": 0.05952618643641472, "learning_rate": 0.00019195665484390476, "loss": 0.0888, "step": 6375 }, { "epoch": 0.4121858585858586, "grad_norm": 0.07081044465303421, "learning_rate": 0.00019195396750137383, "loss": 0.0938, "step": 6376 }, { "epoch": 0.41225050505050503, "grad_norm": 0.0565808080136776, "learning_rate": 0.00019195127972880454, "loss": 0.0707, "step": 6377 }, { "epoch": 0.4123151515151515, "grad_norm": 0.07491331547498703, "learning_rate": 0.00019194859152620955, 
"loss": 0.1155, "step": 6378 }, { "epoch": 0.41237979797979796, "grad_norm": 0.06329019367694855, "learning_rate": 0.00019194590289360133, "loss": 0.0824, "step": 6379 }, { "epoch": 0.41244444444444445, "grad_norm": 0.060611430555582047, "learning_rate": 0.00019194321383099257, "loss": 0.0856, "step": 6380 }, { "epoch": 0.41250909090909094, "grad_norm": 0.06252176314592361, "learning_rate": 0.0001919405243383958, "loss": 0.0948, "step": 6381 }, { "epoch": 0.41257373737373737, "grad_norm": 0.08497213572263718, "learning_rate": 0.0001919378344158236, "loss": 0.1222, "step": 6382 }, { "epoch": 0.41263838383838386, "grad_norm": 0.06513378024101257, "learning_rate": 0.00019193514406328852, "loss": 0.094, "step": 6383 }, { "epoch": 0.4127030303030303, "grad_norm": 0.06319907307624817, "learning_rate": 0.00019193245328080316, "loss": 0.0917, "step": 6384 }, { "epoch": 0.4127030303030303, "eval_bleu": 16.903419356526683, "eval_loss": 0.0892833024263382, "eval_runtime": 2.7974, "eval_samples_per_second": 11.439, "eval_steps_per_second": 1.43, "step": 6384 }, { "epoch": 0.4127676767676768, "grad_norm": 0.054133374243974686, "learning_rate": 0.0001919297620683801, "loss": 0.0721, "step": 6385 }, { "epoch": 0.4128323232323232, "grad_norm": 0.06098697707056999, "learning_rate": 0.00019192707042603194, "loss": 0.0797, "step": 6386 }, { "epoch": 0.4128969696969697, "grad_norm": 0.056566476821899414, "learning_rate": 0.00019192437835377127, "loss": 0.0764, "step": 6387 }, { "epoch": 0.41296161616161614, "grad_norm": 0.051914699375629425, "learning_rate": 0.0001919216858516107, "loss": 0.0606, "step": 6388 }, { "epoch": 0.41302626262626263, "grad_norm": 0.0664343535900116, "learning_rate": 0.00019191899291956277, "loss": 0.1051, "step": 6389 }, { "epoch": 0.41309090909090906, "grad_norm": 0.06311753392219543, "learning_rate": 0.0001919162995576401, "loss": 0.0897, "step": 6390 }, { "epoch": 0.41315555555555555, "grad_norm": 0.06985709816217422, "learning_rate": 
0.00019191360576585527, "loss": 0.1029, "step": 6391 }, { "epoch": 0.41322020202020204, "grad_norm": 0.0682913064956665, "learning_rate": 0.0001919109115442209, "loss": 0.0995, "step": 6392 }, { "epoch": 0.4132848484848485, "grad_norm": 0.05438131466507912, "learning_rate": 0.00019190821689274958, "loss": 0.0791, "step": 6393 }, { "epoch": 0.41334949494949497, "grad_norm": 0.05767160654067993, "learning_rate": 0.0001919055218114539, "loss": 0.0692, "step": 6394 }, { "epoch": 0.4134141414141414, "grad_norm": 0.062300026416778564, "learning_rate": 0.00019190282630034654, "loss": 0.0856, "step": 6395 }, { "epoch": 0.4134787878787879, "grad_norm": 0.062168676406145096, "learning_rate": 0.00019190013035943996, "loss": 0.0781, "step": 6396 }, { "epoch": 0.4135434343434343, "grad_norm": 0.06109360605478287, "learning_rate": 0.00019189743398874689, "loss": 0.0978, "step": 6397 }, { "epoch": 0.4136080808080808, "grad_norm": 0.06369418650865555, "learning_rate": 0.0001918947371882799, "loss": 0.089, "step": 6398 }, { "epoch": 0.41367272727272725, "grad_norm": 0.06408020853996277, "learning_rate": 0.00019189203995805164, "loss": 0.0747, "step": 6399 }, { "epoch": 0.41373737373737374, "grad_norm": 0.06681995838880539, "learning_rate": 0.00019188934229807462, "loss": 0.1025, "step": 6400 }, { "epoch": 0.41373737373737374, "eval_bleu": 18.509243373561908, "eval_loss": 0.08940853923559189, "eval_runtime": 2.6979, "eval_samples_per_second": 11.861, "eval_steps_per_second": 1.483, "step": 6400 }, { "epoch": 0.4138020202020202, "grad_norm": 0.06439770758152008, "learning_rate": 0.00019188664420836157, "loss": 0.0753, "step": 6401 }, { "epoch": 0.41386666666666666, "grad_norm": 0.0602901466190815, "learning_rate": 0.00019188394568892507, "loss": 0.0825, "step": 6402 }, { "epoch": 0.41393131313131315, "grad_norm": 0.05976952984929085, "learning_rate": 0.0001918812467397777, "loss": 0.0783, "step": 6403 }, { "epoch": 0.4139959595959596, "grad_norm": 0.05685512721538544, 
"learning_rate": 0.00019187854736093214, "loss": 0.0761, "step": 6404 }, { "epoch": 0.4140606060606061, "grad_norm": 0.05796041339635849, "learning_rate": 0.00019187584755240094, "loss": 0.0825, "step": 6405 }, { "epoch": 0.4141252525252525, "grad_norm": 0.061388250440359116, "learning_rate": 0.00019187314731419682, "loss": 0.0817, "step": 6406 }, { "epoch": 0.414189898989899, "grad_norm": 0.052921127527952194, "learning_rate": 0.00019187044664633235, "loss": 0.0725, "step": 6407 }, { "epoch": 0.41425454545454543, "grad_norm": 0.06646957248449326, "learning_rate": 0.00019186774554882021, "loss": 0.0935, "step": 6408 }, { "epoch": 0.4143191919191919, "grad_norm": 0.053579334169626236, "learning_rate": 0.00019186504402167294, "loss": 0.0798, "step": 6409 }, { "epoch": 0.4143838383838384, "grad_norm": 0.05927285552024841, "learning_rate": 0.00019186234206490326, "loss": 0.1005, "step": 6410 }, { "epoch": 0.41444848484848484, "grad_norm": 0.05721013620495796, "learning_rate": 0.00019185963967852375, "loss": 0.0813, "step": 6411 }, { "epoch": 0.41451313131313133, "grad_norm": 0.06477970629930496, "learning_rate": 0.00019185693686254708, "loss": 0.0944, "step": 6412 }, { "epoch": 0.41457777777777777, "grad_norm": 0.06658260524272919, "learning_rate": 0.0001918542336169859, "loss": 0.0861, "step": 6413 }, { "epoch": 0.41464242424242426, "grad_norm": 0.06425566226243973, "learning_rate": 0.00019185152994185284, "loss": 0.1137, "step": 6414 }, { "epoch": 0.4147070707070707, "grad_norm": 0.06000189483165741, "learning_rate": 0.00019184882583716053, "loss": 0.0846, "step": 6415 }, { "epoch": 0.4147717171717172, "grad_norm": 0.05721919611096382, "learning_rate": 0.00019184612130292164, "loss": 0.086, "step": 6416 }, { "epoch": 0.4147717171717172, "eval_bleu": 18.644971330016272, "eval_loss": 0.08915051072835922, "eval_runtime": 2.7446, "eval_samples_per_second": 11.659, "eval_steps_per_second": 1.457, "step": 6416 }, { "epoch": 0.4148363636363636, "grad_norm": 
0.0618654303252697, "learning_rate": 0.00019184341633914878, "loss": 0.096, "step": 6417 }, { "epoch": 0.4149010101010101, "grad_norm": 0.055395886301994324, "learning_rate": 0.00019184071094585463, "loss": 0.0787, "step": 6418 }, { "epoch": 0.4149656565656566, "grad_norm": 0.06444157660007477, "learning_rate": 0.00019183800512305183, "loss": 0.092, "step": 6419 }, { "epoch": 0.415030303030303, "grad_norm": 0.07014118880033493, "learning_rate": 0.00019183529887075308, "loss": 0.102, "step": 6420 }, { "epoch": 0.4150949494949495, "grad_norm": 0.061668433248996735, "learning_rate": 0.00019183259218897094, "loss": 0.0908, "step": 6421 }, { "epoch": 0.41515959595959595, "grad_norm": 0.056153200566768646, "learning_rate": 0.00019182988507771817, "loss": 0.0779, "step": 6422 }, { "epoch": 0.41522424242424244, "grad_norm": 0.056416239589452744, "learning_rate": 0.00019182717753700736, "loss": 0.0772, "step": 6423 }, { "epoch": 0.4152888888888889, "grad_norm": 0.05553729087114334, "learning_rate": 0.00019182446956685122, "loss": 0.0756, "step": 6424 }, { "epoch": 0.41535353535353536, "grad_norm": 0.0693994089961052, "learning_rate": 0.00019182176116726236, "loss": 0.1138, "step": 6425 }, { "epoch": 0.4154181818181818, "grad_norm": 0.06209584325551987, "learning_rate": 0.00019181905233825352, "loss": 0.0771, "step": 6426 }, { "epoch": 0.4154828282828283, "grad_norm": 0.06680798530578613, "learning_rate": 0.00019181634307983728, "loss": 0.0878, "step": 6427 }, { "epoch": 0.4155474747474747, "grad_norm": 0.06849261373281479, "learning_rate": 0.0001918136333920264, "loss": 0.093, "step": 6428 }, { "epoch": 0.4156121212121212, "grad_norm": 0.048240307718515396, "learning_rate": 0.0001918109232748335, "loss": 0.0673, "step": 6429 }, { "epoch": 0.4156767676767677, "grad_norm": 0.0689520388841629, "learning_rate": 0.00019180821272827124, "loss": 0.1008, "step": 6430 }, { "epoch": 0.41574141414141413, "grad_norm": 0.06096424534916878, "learning_rate": 0.00019180550175235234, 
"loss": 0.084, "step": 6431 }, { "epoch": 0.4158060606060606, "grad_norm": 0.057649608701467514, "learning_rate": 0.00019180279034708946, "loss": 0.0789, "step": 6432 }, { "epoch": 0.4158060606060606, "eval_bleu": 14.596460367923287, "eval_loss": 0.09017134457826614, "eval_runtime": 2.6722, "eval_samples_per_second": 11.975, "eval_steps_per_second": 1.497, "step": 6432 }, { "epoch": 0.41587070707070706, "grad_norm": 0.05759517475962639, "learning_rate": 0.00019180007851249528, "loss": 0.0778, "step": 6433 }, { "epoch": 0.41593535353535355, "grad_norm": 0.060353316366672516, "learning_rate": 0.0001917973662485825, "loss": 0.0783, "step": 6434 }, { "epoch": 0.416, "grad_norm": 0.06598322838544846, "learning_rate": 0.00019179465355536373, "loss": 0.0988, "step": 6435 }, { "epoch": 0.41606464646464647, "grad_norm": 0.06110573559999466, "learning_rate": 0.00019179194043285176, "loss": 0.0941, "step": 6436 }, { "epoch": 0.4161292929292929, "grad_norm": 0.06993896514177322, "learning_rate": 0.0001917892268810592, "loss": 0.1025, "step": 6437 }, { "epoch": 0.4161939393939394, "grad_norm": 0.06245803087949753, "learning_rate": 0.0001917865128999988, "loss": 0.0939, "step": 6438 }, { "epoch": 0.4162585858585859, "grad_norm": 0.06344260275363922, "learning_rate": 0.0001917837984896832, "loss": 0.0874, "step": 6439 }, { "epoch": 0.4163232323232323, "grad_norm": 0.06057363748550415, "learning_rate": 0.00019178108365012513, "loss": 0.0831, "step": 6440 }, { "epoch": 0.4163878787878788, "grad_norm": 0.051101360470056534, "learning_rate": 0.00019177836838133727, "loss": 0.0756, "step": 6441 }, { "epoch": 0.41645252525252524, "grad_norm": 0.06011616438627243, "learning_rate": 0.0001917756526833323, "loss": 0.0821, "step": 6442 }, { "epoch": 0.41651717171717173, "grad_norm": 0.0650014653801918, "learning_rate": 0.00019177293655612298, "loss": 0.0974, "step": 6443 }, { "epoch": 0.41658181818181816, "grad_norm": 0.0607028603553772, "learning_rate": 0.00019177021999972196, "loss": 
0.0818, "step": 6444 }, { "epoch": 0.41664646464646465, "grad_norm": 0.07774928212165833, "learning_rate": 0.00019176750301414197, "loss": 0.0879, "step": 6445 }, { "epoch": 0.4167111111111111, "grad_norm": 0.0596434623003006, "learning_rate": 0.00019176478559939568, "loss": 0.079, "step": 6446 }, { "epoch": 0.4167757575757576, "grad_norm": 0.07813288271427155, "learning_rate": 0.00019176206775549585, "loss": 0.0739, "step": 6447 }, { "epoch": 0.41684040404040407, "grad_norm": 0.06573434919118881, "learning_rate": 0.0001917593494824552, "loss": 0.0872, "step": 6448 }, { "epoch": 0.41684040404040407, "eval_bleu": 14.278678830928714, "eval_loss": 0.09046085178852081, "eval_runtime": 2.5973, "eval_samples_per_second": 12.321, "eval_steps_per_second": 1.54, "step": 6448 }, { "epoch": 0.4169050505050505, "grad_norm": 0.0737411379814148, "learning_rate": 0.0001917566307802863, "loss": 0.1153, "step": 6449 }, { "epoch": 0.416969696969697, "grad_norm": 0.0667005106806755, "learning_rate": 0.00019175391164900205, "loss": 0.0973, "step": 6450 }, { "epoch": 0.4170343434343434, "grad_norm": 0.07391694188117981, "learning_rate": 0.00019175119208861508, "loss": 0.106, "step": 6451 }, { "epoch": 0.4170989898989899, "grad_norm": 0.05714480206370354, "learning_rate": 0.00019174847209913812, "loss": 0.0704, "step": 6452 }, { "epoch": 0.41716363636363635, "grad_norm": 0.05232822522521019, "learning_rate": 0.00019174575168058388, "loss": 0.0764, "step": 6453 }, { "epoch": 0.41722828282828284, "grad_norm": 0.054265346378088, "learning_rate": 0.00019174303083296508, "loss": 0.0674, "step": 6454 }, { "epoch": 0.41729292929292927, "grad_norm": 0.060933660715818405, "learning_rate": 0.00019174030955629445, "loss": 0.0812, "step": 6455 }, { "epoch": 0.41735757575757576, "grad_norm": 0.07062429934740067, "learning_rate": 0.00019173758785058476, "loss": 0.0993, "step": 6456 }, { "epoch": 0.4174222222222222, "grad_norm": 0.059317395091056824, "learning_rate": 0.00019173486571584866, "loss": 
0.0816, "step": 6457 }, { "epoch": 0.4174868686868687, "grad_norm": 0.058504436165094376, "learning_rate": 0.00019173214315209897, "loss": 0.0804, "step": 6458 }, { "epoch": 0.4175515151515152, "grad_norm": 0.060640495270490646, "learning_rate": 0.0001917294201593483, "loss": 0.0752, "step": 6459 }, { "epoch": 0.4176161616161616, "grad_norm": 0.053200241178274155, "learning_rate": 0.0001917266967376095, "loss": 0.0807, "step": 6460 }, { "epoch": 0.4176808080808081, "grad_norm": 0.06466002017259598, "learning_rate": 0.00019172397288689527, "loss": 0.0863, "step": 6461 }, { "epoch": 0.41774545454545453, "grad_norm": 0.06762490421533585, "learning_rate": 0.0001917212486072183, "loss": 0.0779, "step": 6462 }, { "epoch": 0.417810101010101, "grad_norm": 0.07042326033115387, "learning_rate": 0.00019171852389859143, "loss": 0.0841, "step": 6463 }, { "epoch": 0.41787474747474745, "grad_norm": 0.06329536437988281, "learning_rate": 0.0001917157987610273, "loss": 0.0839, "step": 6464 }, { "epoch": 0.41787474747474745, "eval_bleu": 17.288591884144534, "eval_loss": 0.09013227373361588, "eval_runtime": 2.7256, "eval_samples_per_second": 11.741, "eval_steps_per_second": 1.468, "step": 6464 }, { "epoch": 0.41793939393939394, "grad_norm": 0.06288028508424759, "learning_rate": 0.00019171307319453874, "loss": 0.0885, "step": 6465 }, { "epoch": 0.4180040404040404, "grad_norm": 0.057276494801044464, "learning_rate": 0.0001917103471991384, "loss": 0.0744, "step": 6466 }, { "epoch": 0.41806868686868687, "grad_norm": 0.06314460188150406, "learning_rate": 0.00019170762077483914, "loss": 0.0876, "step": 6467 }, { "epoch": 0.41813333333333336, "grad_norm": 0.0518195703625679, "learning_rate": 0.00019170489392165364, "loss": 0.0743, "step": 6468 }, { "epoch": 0.4181979797979798, "grad_norm": 0.07086772471666336, "learning_rate": 0.00019170216663959465, "loss": 0.0895, "step": 6469 }, { "epoch": 0.4182626262626263, "grad_norm": 0.061446547508239746, "learning_rate": 0.00019169943892867494, 
"loss": 0.0797, "step": 6470 }, { "epoch": 0.4183272727272727, "grad_norm": 0.059265777468681335, "learning_rate": 0.0001916967107889073, "loss": 0.0878, "step": 6471 }, { "epoch": 0.4183919191919192, "grad_norm": 0.051524288952350616, "learning_rate": 0.00019169398222030444, "loss": 0.0774, "step": 6472 }, { "epoch": 0.41845656565656564, "grad_norm": 0.056709788739681244, "learning_rate": 0.00019169125322287913, "loss": 0.074, "step": 6473 }, { "epoch": 0.4185212121212121, "grad_norm": 0.059597328305244446, "learning_rate": 0.00019168852379664416, "loss": 0.0901, "step": 6474 }, { "epoch": 0.41858585858585856, "grad_norm": 0.05830507352948189, "learning_rate": 0.00019168579394161225, "loss": 0.0924, "step": 6475 }, { "epoch": 0.41865050505050505, "grad_norm": 0.061570968478918076, "learning_rate": 0.0001916830636577962, "loss": 0.0937, "step": 6476 }, { "epoch": 0.41871515151515154, "grad_norm": 0.055901579558849335, "learning_rate": 0.00019168033294520882, "loss": 0.0723, "step": 6477 }, { "epoch": 0.418779797979798, "grad_norm": 0.051244985312223434, "learning_rate": 0.00019167760180386278, "loss": 0.0723, "step": 6478 }, { "epoch": 0.41884444444444446, "grad_norm": 0.06263797730207443, "learning_rate": 0.00019167487023377092, "loss": 0.085, "step": 6479 }, { "epoch": 0.4189090909090909, "grad_norm": 0.060900893062353134, "learning_rate": 0.000191672138234946, "loss": 0.0914, "step": 6480 }, { "epoch": 0.4189090909090909, "eval_bleu": 16.0403761971115, "eval_loss": 0.09027500450611115, "eval_runtime": 2.8841, "eval_samples_per_second": 11.095, "eval_steps_per_second": 1.387, "step": 6480 }, { "epoch": 0.4189737373737374, "grad_norm": 0.06262217462062836, "learning_rate": 0.00019166940580740078, "loss": 0.0842, "step": 6481 }, { "epoch": 0.4190383838383838, "grad_norm": 0.06203075125813484, "learning_rate": 0.0001916666729511481, "loss": 0.0883, "step": 6482 }, { "epoch": 0.4191030303030303, "grad_norm": 0.0671638622879982, "learning_rate": 
0.00019166393966620066, "loss": 0.0984, "step": 6483 }, { "epoch": 0.41916767676767674, "grad_norm": 0.0658009946346283, "learning_rate": 0.00019166120595257127, "loss": 0.1008, "step": 6484 }, { "epoch": 0.41923232323232323, "grad_norm": 0.06919082999229431, "learning_rate": 0.0001916584718102727, "loss": 0.1048, "step": 6485 }, { "epoch": 0.4192969696969697, "grad_norm": 0.05337420105934143, "learning_rate": 0.0001916557372393178, "loss": 0.0744, "step": 6486 }, { "epoch": 0.41936161616161616, "grad_norm": 0.06111198291182518, "learning_rate": 0.0001916530022397193, "loss": 0.0704, "step": 6487 }, { "epoch": 0.41942626262626265, "grad_norm": 0.05678342655301094, "learning_rate": 0.00019165026681149003, "loss": 0.0844, "step": 6488 }, { "epoch": 0.4194909090909091, "grad_norm": 0.07526948302984238, "learning_rate": 0.00019164753095464275, "loss": 0.1065, "step": 6489 }, { "epoch": 0.41955555555555557, "grad_norm": 0.06886976957321167, "learning_rate": 0.00019164479466919024, "loss": 0.0803, "step": 6490 }, { "epoch": 0.419620202020202, "grad_norm": 0.05722823366522789, "learning_rate": 0.00019164205795514535, "loss": 0.0748, "step": 6491 }, { "epoch": 0.4196848484848485, "grad_norm": 0.061769917607307434, "learning_rate": 0.00019163932081252085, "loss": 0.0825, "step": 6492 }, { "epoch": 0.41974949494949493, "grad_norm": 0.06847032159566879, "learning_rate": 0.00019163658324132955, "loss": 0.0881, "step": 6493 }, { "epoch": 0.4198141414141414, "grad_norm": 0.06878650933504105, "learning_rate": 0.00019163384524158424, "loss": 0.0991, "step": 6494 }, { "epoch": 0.41987878787878785, "grad_norm": 0.06420434266328812, "learning_rate": 0.00019163110681329773, "loss": 0.1013, "step": 6495 }, { "epoch": 0.41994343434343434, "grad_norm": 0.05956539884209633, "learning_rate": 0.0001916283679564828, "loss": 0.0708, "step": 6496 }, { "epoch": 0.41994343434343434, "eval_bleu": 16.548584099200752, "eval_loss": 0.08968710899353027, "eval_runtime": 2.5731, 
"eval_samples_per_second": 12.436, "eval_steps_per_second": 1.555, "step": 6496 }, { "epoch": 0.42000808080808083, "grad_norm": 0.05886111035943031, "learning_rate": 0.00019162562867115234, "loss": 0.0805, "step": 6497 }, { "epoch": 0.42007272727272726, "grad_norm": 0.05614326521754265, "learning_rate": 0.00019162288895731907, "loss": 0.0797, "step": 6498 }, { "epoch": 0.42013737373737375, "grad_norm": 0.05635355785489082, "learning_rate": 0.00019162014881499582, "loss": 0.0888, "step": 6499 }, { "epoch": 0.4202020202020202, "grad_norm": 0.05878439545631409, "learning_rate": 0.00019161740824419544, "loss": 0.092, "step": 6500 }, { "epoch": 0.4202666666666667, "grad_norm": 0.05909040942788124, "learning_rate": 0.00019161466724493075, "loss": 0.0872, "step": 6501 }, { "epoch": 0.4203313131313131, "grad_norm": 0.06191582977771759, "learning_rate": 0.0001916119258172145, "loss": 0.0846, "step": 6502 }, { "epoch": 0.4203959595959596, "grad_norm": 0.051789335906505585, "learning_rate": 0.0001916091839610596, "loss": 0.0653, "step": 6503 }, { "epoch": 0.42046060606060603, "grad_norm": 0.0644124448299408, "learning_rate": 0.00019160644167647882, "loss": 0.0943, "step": 6504 }, { "epoch": 0.4205252525252525, "grad_norm": 0.06215892732143402, "learning_rate": 0.00019160369896348498, "loss": 0.092, "step": 6505 }, { "epoch": 0.420589898989899, "grad_norm": 0.06440113484859467, "learning_rate": 0.00019160095582209093, "loss": 0.0942, "step": 6506 }, { "epoch": 0.42065454545454545, "grad_norm": 0.07637478411197662, "learning_rate": 0.00019159821225230955, "loss": 0.0778, "step": 6507 }, { "epoch": 0.42071919191919194, "grad_norm": 0.05878744646906853, "learning_rate": 0.00019159546825415355, "loss": 0.0917, "step": 6508 }, { "epoch": 0.42078383838383837, "grad_norm": 0.05882347375154495, "learning_rate": 0.00019159272382763583, "loss": 0.0859, "step": 6509 }, { "epoch": 0.42084848484848486, "grad_norm": 0.08135002851486206, "learning_rate": 0.00019158997897276924, "loss": 
0.1059, "step": 6510 }, { "epoch": 0.4209131313131313, "grad_norm": 0.059042420238256454, "learning_rate": 0.0001915872336895666, "loss": 0.0851, "step": 6511 }, { "epoch": 0.4209777777777778, "grad_norm": 0.07247724384069443, "learning_rate": 0.00019158448797804072, "loss": 0.0987, "step": 6512 }, { "epoch": 0.4209777777777778, "eval_bleu": 16.561649104559432, "eval_loss": 0.09022922068834305, "eval_runtime": 2.6769, "eval_samples_per_second": 11.954, "eval_steps_per_second": 1.494, "step": 6512 }, { "epoch": 0.4210424242424242, "grad_norm": 0.06103076413273811, "learning_rate": 0.00019158174183820447, "loss": 0.082, "step": 6513 }, { "epoch": 0.4211070707070707, "grad_norm": 0.05863596498966217, "learning_rate": 0.0001915789952700707, "loss": 0.0827, "step": 6514 }, { "epoch": 0.4211717171717172, "grad_norm": 0.06097216159105301, "learning_rate": 0.00019157624827365226, "loss": 0.085, "step": 6515 }, { "epoch": 0.42123636363636363, "grad_norm": 0.0619642548263073, "learning_rate": 0.00019157350084896197, "loss": 0.0929, "step": 6516 }, { "epoch": 0.4213010101010101, "grad_norm": 0.07569549977779388, "learning_rate": 0.0001915707529960127, "loss": 0.0989, "step": 6517 }, { "epoch": 0.42136565656565655, "grad_norm": 0.06263240426778793, "learning_rate": 0.00019156800471481727, "loss": 0.0697, "step": 6518 }, { "epoch": 0.42143030303030304, "grad_norm": 0.05989835411310196, "learning_rate": 0.00019156525600538856, "loss": 0.0879, "step": 6519 }, { "epoch": 0.4214949494949495, "grad_norm": 0.05985410884022713, "learning_rate": 0.00019156250686773942, "loss": 0.0901, "step": 6520 }, { "epoch": 0.42155959595959597, "grad_norm": 0.07319796085357666, "learning_rate": 0.00019155975730188272, "loss": 0.1117, "step": 6521 }, { "epoch": 0.4216242424242424, "grad_norm": 0.05341310799121857, "learning_rate": 0.00019155700730783128, "loss": 0.0771, "step": 6522 }, { "epoch": 0.4216888888888889, "grad_norm": 0.05341542884707451, "learning_rate": 0.000191554256885598, "loss": 
0.0771, "step": 6523 }, { "epoch": 0.4217535353535354, "grad_norm": 0.059527479112148285, "learning_rate": 0.00019155150603519574, "loss": 0.0813, "step": 6524 }, { "epoch": 0.4218181818181818, "grad_norm": 0.06480792164802551, "learning_rate": 0.00019154875475663733, "loss": 0.1038, "step": 6525 }, { "epoch": 0.4218828282828283, "grad_norm": 0.06270837783813477, "learning_rate": 0.00019154600304993567, "loss": 0.0996, "step": 6526 }, { "epoch": 0.42194747474747474, "grad_norm": 0.060864876955747604, "learning_rate": 0.0001915432509151036, "loss": 0.0811, "step": 6527 }, { "epoch": 0.4220121212121212, "grad_norm": 0.0539768785238266, "learning_rate": 0.00019154049835215403, "loss": 0.0746, "step": 6528 }, { "epoch": 0.4220121212121212, "eval_bleu": 16.0867768685008, "eval_loss": 0.09034961462020874, "eval_runtime": 2.7046, "eval_samples_per_second": 11.832, "eval_steps_per_second": 1.479, "step": 6528 }, { "epoch": 0.42207676767676766, "grad_norm": 0.0806153416633606, "learning_rate": 0.0001915377453610998, "loss": 0.1177, "step": 6529 }, { "epoch": 0.42214141414141415, "grad_norm": 0.06434538960456848, "learning_rate": 0.0001915349919419538, "loss": 0.091, "step": 6530 }, { "epoch": 0.4222060606060606, "grad_norm": 0.06783617287874222, "learning_rate": 0.00019153223809472892, "loss": 0.1017, "step": 6531 }, { "epoch": 0.4222707070707071, "grad_norm": 0.0638948380947113, "learning_rate": 0.000191529483819438, "loss": 0.0947, "step": 6532 }, { "epoch": 0.4223353535353535, "grad_norm": 0.06020256504416466, "learning_rate": 0.00019152672911609394, "loss": 0.0922, "step": 6533 }, { "epoch": 0.4224, "grad_norm": 0.060263946652412415, "learning_rate": 0.00019152397398470965, "loss": 0.0888, "step": 6534 }, { "epoch": 0.4224646464646465, "grad_norm": 0.0611228421330452, "learning_rate": 0.000191521218425298, "loss": 0.0854, "step": 6535 }, { "epoch": 0.4225292929292929, "grad_norm": 0.06031518056988716, "learning_rate": 0.00019151846243787183, "loss": 0.0856, "step": 6536 
}, { "epoch": 0.4225939393939394, "grad_norm": 0.0628715455532074, "learning_rate": 0.00019151570602244408, "loss": 0.0976, "step": 6537 }, { "epoch": 0.42265858585858584, "grad_norm": 0.058890167623758316, "learning_rate": 0.00019151294917902766, "loss": 0.0838, "step": 6538 }, { "epoch": 0.42272323232323233, "grad_norm": 0.05440796539187431, "learning_rate": 0.00019151019190763536, "loss": 0.0844, "step": 6539 }, { "epoch": 0.42278787878787877, "grad_norm": 0.06462236493825912, "learning_rate": 0.0001915074342082802, "loss": 0.0864, "step": 6540 }, { "epoch": 0.42285252525252526, "grad_norm": 0.0683492049574852, "learning_rate": 0.000191504676080975, "loss": 0.0997, "step": 6541 }, { "epoch": 0.4229171717171717, "grad_norm": 0.07341332733631134, "learning_rate": 0.0001915019175257327, "loss": 0.1009, "step": 6542 }, { "epoch": 0.4229818181818182, "grad_norm": 0.06576399505138397, "learning_rate": 0.00019149915854256615, "loss": 0.1005, "step": 6543 }, { "epoch": 0.42304646464646467, "grad_norm": 0.09562241286039352, "learning_rate": 0.00019149639913148834, "loss": 0.0965, "step": 6544 }, { "epoch": 0.42304646464646467, "eval_bleu": 14.654638737363467, "eval_loss": 0.09012212604284286, "eval_runtime": 2.631, "eval_samples_per_second": 12.163, "eval_steps_per_second": 1.52, "step": 6544 }, { "epoch": 0.4231111111111111, "grad_norm": 0.05989299714565277, "learning_rate": 0.00019149363929251204, "loss": 0.0891, "step": 6545 }, { "epoch": 0.4231757575757576, "grad_norm": 0.06380750238895416, "learning_rate": 0.0001914908790256503, "loss": 0.0717, "step": 6546 }, { "epoch": 0.42324040404040403, "grad_norm": 0.06264451891183853, "learning_rate": 0.00019148811833091594, "loss": 0.0901, "step": 6547 }, { "epoch": 0.4233050505050505, "grad_norm": 0.0581914447247982, "learning_rate": 0.0001914853572083219, "loss": 0.0783, "step": 6548 }, { "epoch": 0.42336969696969695, "grad_norm": 0.05626753717660904, "learning_rate": 0.0001914825956578811, "loss": 0.0715, "step": 6549 }, 
{ "epoch": 0.42343434343434344, "grad_norm": 0.05592091754078865, "learning_rate": 0.00019147983367960643, "loss": 0.0805, "step": 6550 }, { "epoch": 0.4234989898989899, "grad_norm": 0.06021624431014061, "learning_rate": 0.0001914770712735108, "loss": 0.0763, "step": 6551 }, { "epoch": 0.42356363636363636, "grad_norm": 0.05660613998770714, "learning_rate": 0.0001914743084396072, "loss": 0.0831, "step": 6552 }, { "epoch": 0.42362828282828285, "grad_norm": 0.08443588018417358, "learning_rate": 0.00019147154517790848, "loss": 0.0757, "step": 6553 }, { "epoch": 0.4236929292929293, "grad_norm": 0.06152249127626419, "learning_rate": 0.00019146878148842758, "loss": 0.0776, "step": 6554 }, { "epoch": 0.4237575757575758, "grad_norm": 0.061850447207689285, "learning_rate": 0.0001914660173711774, "loss": 0.0839, "step": 6555 }, { "epoch": 0.4238222222222222, "grad_norm": 0.06433109194040298, "learning_rate": 0.00019146325282617095, "loss": 0.093, "step": 6556 }, { "epoch": 0.4238868686868687, "grad_norm": 0.056415293365716934, "learning_rate": 0.00019146048785342106, "loss": 0.077, "step": 6557 }, { "epoch": 0.42395151515151513, "grad_norm": 0.06468323618173599, "learning_rate": 0.00019145772245294073, "loss": 0.0875, "step": 6558 }, { "epoch": 0.4240161616161616, "grad_norm": 0.05390962213277817, "learning_rate": 0.00019145495662474285, "loss": 0.0832, "step": 6559 }, { "epoch": 0.42408080808080806, "grad_norm": 0.06274785846471786, "learning_rate": 0.0001914521903688404, "loss": 0.089, "step": 6560 }, { "epoch": 0.42408080808080806, "eval_bleu": 12.442344465581659, "eval_loss": 0.09033806622028351, "eval_runtime": 2.6616, "eval_samples_per_second": 12.023, "eval_steps_per_second": 1.503, "step": 6560 }, { "epoch": 0.42414545454545455, "grad_norm": 0.07190267741680145, "learning_rate": 0.00019144942368524626, "loss": 0.0912, "step": 6561 }, { "epoch": 0.42421010101010104, "grad_norm": 0.057445231825113297, "learning_rate": 0.0001914466565739734, "loss": 0.0778, "step": 6562 
}, { "epoch": 0.42427474747474747, "grad_norm": 0.06656220555305481, "learning_rate": 0.00019144388903503473, "loss": 0.084, "step": 6563 }, { "epoch": 0.42433939393939396, "grad_norm": 0.062016408890485764, "learning_rate": 0.0001914411210684433, "loss": 0.095, "step": 6564 }, { "epoch": 0.4244040404040404, "grad_norm": 0.06366956979036331, "learning_rate": 0.0001914383526742119, "loss": 0.0932, "step": 6565 }, { "epoch": 0.4244686868686869, "grad_norm": 0.05601107329130173, "learning_rate": 0.00019143558385235362, "loss": 0.0831, "step": 6566 }, { "epoch": 0.4245333333333333, "grad_norm": 0.05694717541337013, "learning_rate": 0.0001914328146028813, "loss": 0.0764, "step": 6567 }, { "epoch": 0.4245979797979798, "grad_norm": 0.06612081080675125, "learning_rate": 0.00019143004492580792, "loss": 0.0836, "step": 6568 }, { "epoch": 0.42466262626262624, "grad_norm": 0.061216723173856735, "learning_rate": 0.00019142727482114648, "loss": 0.0887, "step": 6569 }, { "epoch": 0.42472727272727273, "grad_norm": 0.05907219275832176, "learning_rate": 0.00019142450428890988, "loss": 0.0858, "step": 6570 }, { "epoch": 0.42479191919191917, "grad_norm": 0.06586386263370514, "learning_rate": 0.0001914217333291111, "loss": 0.0913, "step": 6571 }, { "epoch": 0.42485656565656565, "grad_norm": 0.06863091140985489, "learning_rate": 0.0001914189619417631, "loss": 0.092, "step": 6572 }, { "epoch": 0.42492121212121214, "grad_norm": 0.07041668146848679, "learning_rate": 0.00019141619012687887, "loss": 0.1188, "step": 6573 }, { "epoch": 0.4249858585858586, "grad_norm": 0.07513442635536194, "learning_rate": 0.00019141341788447128, "loss": 0.1137, "step": 6574 }, { "epoch": 0.42505050505050507, "grad_norm": 0.0556240975856781, "learning_rate": 0.00019141064521455337, "loss": 0.0868, "step": 6575 }, { "epoch": 0.4251151515151515, "grad_norm": 0.05549519881606102, "learning_rate": 0.00019140787211713813, "loss": 0.0816, "step": 6576 }, { "epoch": 0.4251151515151515, "eval_bleu": 14.087464386263449, 
"eval_loss": 0.09202475845813751, "eval_runtime": 2.7371, "eval_samples_per_second": 11.691, "eval_steps_per_second": 1.461, "step": 6576 }, { "epoch": 0.425179797979798, "grad_norm": 0.05485127866268158, "learning_rate": 0.00019140509859223847, "loss": 0.0786, "step": 6577 }, { "epoch": 0.4252444444444444, "grad_norm": 0.06251101195812225, "learning_rate": 0.0001914023246398674, "loss": 0.1022, "step": 6578 }, { "epoch": 0.4253090909090909, "grad_norm": 0.06044426187872887, "learning_rate": 0.00019139955026003784, "loss": 0.0917, "step": 6579 }, { "epoch": 0.42537373737373735, "grad_norm": 0.07116618007421494, "learning_rate": 0.0001913967754527628, "loss": 0.0917, "step": 6580 }, { "epoch": 0.42543838383838384, "grad_norm": 0.06475582718849182, "learning_rate": 0.0001913940002180553, "loss": 0.0843, "step": 6581 }, { "epoch": 0.4255030303030303, "grad_norm": 0.05655675381422043, "learning_rate": 0.00019139122455592825, "loss": 0.0796, "step": 6582 }, { "epoch": 0.42556767676767676, "grad_norm": 0.05823151022195816, "learning_rate": 0.00019138844846639463, "loss": 0.084, "step": 6583 }, { "epoch": 0.42563232323232325, "grad_norm": 0.05494045093655586, "learning_rate": 0.00019138567194946747, "loss": 0.0854, "step": 6584 }, { "epoch": 0.4256969696969697, "grad_norm": 0.06613332778215408, "learning_rate": 0.00019138289500515975, "loss": 0.0967, "step": 6585 }, { "epoch": 0.4257616161616162, "grad_norm": 0.06125938892364502, "learning_rate": 0.00019138011763348442, "loss": 0.0852, "step": 6586 }, { "epoch": 0.4258262626262626, "grad_norm": 0.05659712105989456, "learning_rate": 0.0001913773398344545, "loss": 0.0831, "step": 6587 }, { "epoch": 0.4258909090909091, "grad_norm": 0.05642635002732277, "learning_rate": 0.00019137456160808297, "loss": 0.0797, "step": 6588 }, { "epoch": 0.42595555555555553, "grad_norm": 0.06033669784665108, "learning_rate": 0.0001913717829543828, "loss": 0.0901, "step": 6589 }, { "epoch": 0.426020202020202, "grad_norm": 0.06372498720884323, 
"learning_rate": 0.00019136900387336706, "loss": 0.095, "step": 6590 }, { "epoch": 0.4260848484848485, "grad_norm": 0.06893781572580338, "learning_rate": 0.00019136622436504865, "loss": 0.076, "step": 6591 }, { "epoch": 0.42614949494949494, "grad_norm": 0.060583338141441345, "learning_rate": 0.00019136344442944063, "loss": 0.0777, "step": 6592 }, { "epoch": 0.42614949494949494, "eval_bleu": 10.707045338418325, "eval_loss": 0.091401606798172, "eval_runtime": 2.8236, "eval_samples_per_second": 11.333, "eval_steps_per_second": 1.417, "step": 6592 }, { "epoch": 0.42621414141414143, "grad_norm": 0.0709112137556076, "learning_rate": 0.00019136066406655596, "loss": 0.0983, "step": 6593 }, { "epoch": 0.42627878787878787, "grad_norm": 0.05989782512187958, "learning_rate": 0.0001913578832764077, "loss": 0.0839, "step": 6594 }, { "epoch": 0.42634343434343436, "grad_norm": 0.05916980654001236, "learning_rate": 0.0001913551020590088, "loss": 0.0776, "step": 6595 }, { "epoch": 0.4264080808080808, "grad_norm": 0.06350085884332657, "learning_rate": 0.0001913523204143723, "loss": 0.0951, "step": 6596 }, { "epoch": 0.4264727272727273, "grad_norm": 0.07590396702289581, "learning_rate": 0.0001913495383425112, "loss": 0.1, "step": 6597 }, { "epoch": 0.4265373737373737, "grad_norm": 0.055536337196826935, "learning_rate": 0.00019134675584343852, "loss": 0.085, "step": 6598 }, { "epoch": 0.4266020202020202, "grad_norm": 0.06626736372709274, "learning_rate": 0.00019134397291716723, "loss": 0.0741, "step": 6599 }, { "epoch": 0.4266666666666667, "grad_norm": 0.06859724223613739, "learning_rate": 0.00019134118956371041, "loss": 0.1121, "step": 6600 }, { "epoch": 0.42673131313131313, "grad_norm": 0.06778359413146973, "learning_rate": 0.000191338405783081, "loss": 0.1023, "step": 6601 }, { "epoch": 0.4267959595959596, "grad_norm": 0.05678689479827881, "learning_rate": 0.0001913356215752921, "loss": 0.0785, "step": 6602 }, { "epoch": 0.42686060606060605, "grad_norm": 0.05825059115886688, 
"learning_rate": 0.00019133283694035668, "loss": 0.0848, "step": 6603 }, { "epoch": 0.42692525252525254, "grad_norm": 0.05619468912482262, "learning_rate": 0.0001913300518782878, "loss": 0.075, "step": 6604 }, { "epoch": 0.426989898989899, "grad_norm": 0.05654546990990639, "learning_rate": 0.00019132726638909842, "loss": 0.0776, "step": 6605 }, { "epoch": 0.42705454545454546, "grad_norm": 0.0592694953083992, "learning_rate": 0.00019132448047280161, "loss": 0.0869, "step": 6606 }, { "epoch": 0.4271191919191919, "grad_norm": 0.05591333284974098, "learning_rate": 0.00019132169412941044, "loss": 0.0901, "step": 6607 }, { "epoch": 0.4271838383838384, "grad_norm": 0.056615013629198074, "learning_rate": 0.00019131890735893788, "loss": 0.0804, "step": 6608 }, { "epoch": 0.4271838383838384, "eval_bleu": 14.613026795897857, "eval_loss": 0.0904146060347557, "eval_runtime": 2.7371, "eval_samples_per_second": 11.691, "eval_steps_per_second": 1.461, "step": 6608 }, { "epoch": 0.4272484848484848, "grad_norm": 0.07056908309459686, "learning_rate": 0.00019131612016139694, "loss": 0.1125, "step": 6609 }, { "epoch": 0.4273131313131313, "grad_norm": 0.05856446549296379, "learning_rate": 0.00019131333253680072, "loss": 0.0906, "step": 6610 }, { "epoch": 0.4273777777777778, "grad_norm": 0.06128755956888199, "learning_rate": 0.00019131054448516226, "loss": 0.0807, "step": 6611 }, { "epoch": 0.42744242424242423, "grad_norm": 0.0597204864025116, "learning_rate": 0.00019130775600649454, "loss": 0.0987, "step": 6612 }, { "epoch": 0.4275070707070707, "grad_norm": 0.06497611850500107, "learning_rate": 0.00019130496710081062, "loss": 0.0983, "step": 6613 }, { "epoch": 0.42757171717171716, "grad_norm": 0.05756524205207825, "learning_rate": 0.00019130217776812356, "loss": 0.0809, "step": 6614 }, { "epoch": 0.42763636363636365, "grad_norm": 0.05572421848773956, "learning_rate": 0.00019129938800844642, "loss": 0.084, "step": 6615 }, { "epoch": 0.4277010101010101, "grad_norm": 0.06086616590619087, 
"learning_rate": 0.0001912965978217922, "loss": 0.0781, "step": 6616 }, { "epoch": 0.42776565656565657, "grad_norm": 0.05883936956524849, "learning_rate": 0.00019129380720817401, "loss": 0.0876, "step": 6617 }, { "epoch": 0.427830303030303, "grad_norm": 0.061151277273893356, "learning_rate": 0.00019129101616760483, "loss": 0.0862, "step": 6618 }, { "epoch": 0.4278949494949495, "grad_norm": 0.05282574146986008, "learning_rate": 0.00019128822470009776, "loss": 0.0718, "step": 6619 }, { "epoch": 0.427959595959596, "grad_norm": 0.059277188032865524, "learning_rate": 0.00019128543280566584, "loss": 0.0896, "step": 6620 }, { "epoch": 0.4280242424242424, "grad_norm": 0.05840647965669632, "learning_rate": 0.00019128264048432213, "loss": 0.0938, "step": 6621 }, { "epoch": 0.4280888888888889, "grad_norm": 0.05223406106233597, "learning_rate": 0.00019127984773607967, "loss": 0.0701, "step": 6622 }, { "epoch": 0.42815353535353534, "grad_norm": 0.05898236855864525, "learning_rate": 0.00019127705456095157, "loss": 0.0782, "step": 6623 }, { "epoch": 0.42821818181818183, "grad_norm": 0.05674196779727936, "learning_rate": 0.00019127426095895086, "loss": 0.0826, "step": 6624 }, { "epoch": 0.42821818181818183, "eval_bleu": 10.772494188289562, "eval_loss": 0.09073875844478607, "eval_runtime": 2.5708, "eval_samples_per_second": 12.447, "eval_steps_per_second": 1.556, "step": 6624 }, { "epoch": 0.42828282828282827, "grad_norm": 0.05830037221312523, "learning_rate": 0.00019127146693009059, "loss": 0.0882, "step": 6625 }, { "epoch": 0.42834747474747475, "grad_norm": 0.049030616879463196, "learning_rate": 0.00019126867247438385, "loss": 0.0738, "step": 6626 }, { "epoch": 0.4284121212121212, "grad_norm": 0.0554330050945282, "learning_rate": 0.0001912658775918437, "loss": 0.078, "step": 6627 }, { "epoch": 0.4284767676767677, "grad_norm": 0.0692732110619545, "learning_rate": 0.0001912630822824832, "loss": 0.101, "step": 6628 }, { "epoch": 0.42854141414141417, "grad_norm": 0.06298629939556122, 
"learning_rate": 0.00019126028654631547, "loss": 0.093, "step": 6629 }, { "epoch": 0.4286060606060606, "grad_norm": 0.0621403306722641, "learning_rate": 0.0001912574903833535, "loss": 0.0899, "step": 6630 }, { "epoch": 0.4286707070707071, "grad_norm": 0.0664587989449501, "learning_rate": 0.00019125469379361045, "loss": 0.0899, "step": 6631 }, { "epoch": 0.4287353535353535, "grad_norm": 0.05386051535606384, "learning_rate": 0.00019125189677709936, "loss": 0.0706, "step": 6632 }, { "epoch": 0.4288, "grad_norm": 0.058830708265304565, "learning_rate": 0.0001912490993338333, "loss": 0.0868, "step": 6633 }, { "epoch": 0.42886464646464645, "grad_norm": 0.05178527161478996, "learning_rate": 0.0001912463014638254, "loss": 0.0772, "step": 6634 }, { "epoch": 0.42892929292929294, "grad_norm": 0.06519562751054764, "learning_rate": 0.0001912435031670887, "loss": 0.0963, "step": 6635 }, { "epoch": 0.42899393939393937, "grad_norm": 0.06102428585290909, "learning_rate": 0.0001912407044436363, "loss": 0.0968, "step": 6636 }, { "epoch": 0.42905858585858586, "grad_norm": 0.057426467537879944, "learning_rate": 0.0001912379052934813, "loss": 0.0884, "step": 6637 }, { "epoch": 0.42912323232323235, "grad_norm": 0.0662488266825676, "learning_rate": 0.00019123510571663675, "loss": 0.0986, "step": 6638 }, { "epoch": 0.4291878787878788, "grad_norm": 0.05731409415602684, "learning_rate": 0.0001912323057131158, "loss": 0.0838, "step": 6639 }, { "epoch": 0.4292525252525253, "grad_norm": 0.05220220237970352, "learning_rate": 0.0001912295052829315, "loss": 0.0662, "step": 6640 }, { "epoch": 0.4292525252525253, "eval_bleu": 12.862331961077293, "eval_loss": 0.09125283360481262, "eval_runtime": 2.6021, "eval_samples_per_second": 12.298, "eval_steps_per_second": 1.537, "step": 6640 }, { "epoch": 0.4293171717171717, "grad_norm": 0.06433258950710297, "learning_rate": 0.00019122670442609695, "loss": 0.0907, "step": 6641 }, { "epoch": 0.4293818181818182, "grad_norm": 0.06808263063430786, "learning_rate": 
0.00019122390314262527, "loss": 0.1103, "step": 6642 }, { "epoch": 0.42944646464646463, "grad_norm": 0.06154566630721092, "learning_rate": 0.00019122110143252958, "loss": 0.1024, "step": 6643 }, { "epoch": 0.4295111111111111, "grad_norm": 0.05780541151762009, "learning_rate": 0.00019121829929582296, "loss": 0.0935, "step": 6644 }, { "epoch": 0.42957575757575756, "grad_norm": 0.0565878190100193, "learning_rate": 0.00019121549673251846, "loss": 0.0676, "step": 6645 }, { "epoch": 0.42964040404040404, "grad_norm": 0.0596904531121254, "learning_rate": 0.0001912126937426293, "loss": 0.0862, "step": 6646 }, { "epoch": 0.4297050505050505, "grad_norm": 0.058165475726127625, "learning_rate": 0.00019120989032616847, "loss": 0.0913, "step": 6647 }, { "epoch": 0.42976969696969697, "grad_norm": 0.050650063902139664, "learning_rate": 0.0001912070864831492, "loss": 0.0788, "step": 6648 }, { "epoch": 0.42983434343434346, "grad_norm": 0.05824596434831619, "learning_rate": 0.00019120428221358447, "loss": 0.0955, "step": 6649 }, { "epoch": 0.4298989898989899, "grad_norm": 0.0657096654176712, "learning_rate": 0.00019120147751748747, "loss": 0.094, "step": 6650 }, { "epoch": 0.4299636363636364, "grad_norm": 0.07057999819517136, "learning_rate": 0.00019119867239487134, "loss": 0.0823, "step": 6651 }, { "epoch": 0.4300282828282828, "grad_norm": 0.05516476929187775, "learning_rate": 0.00019119586684574917, "loss": 0.079, "step": 6652 }, { "epoch": 0.4300929292929293, "grad_norm": 0.048518966883420944, "learning_rate": 0.00019119306087013406, "loss": 0.0667, "step": 6653 }, { "epoch": 0.43015757575757574, "grad_norm": 0.05460382625460625, "learning_rate": 0.0001911902544680392, "loss": 0.0791, "step": 6654 }, { "epoch": 0.43022222222222223, "grad_norm": 0.0556037463247776, "learning_rate": 0.0001911874476394776, "loss": 0.0895, "step": 6655 }, { "epoch": 0.43028686868686866, "grad_norm": 0.05801599845290184, "learning_rate": 0.00019118464038446248, "loss": 0.078, "step": 6656 }, { "epoch": 
0.43028686868686866, "eval_bleu": 15.306660204254397, "eval_loss": 0.08993926644325256, "eval_runtime": 2.7307, "eval_samples_per_second": 11.719, "eval_steps_per_second": 1.465, "step": 6656 }, { "epoch": 0.43035151515151515, "grad_norm": 0.068367600440979, "learning_rate": 0.00019118183270300696, "loss": 0.1081, "step": 6657 }, { "epoch": 0.43041616161616164, "grad_norm": 0.06189889460802078, "learning_rate": 0.00019117902459512412, "loss": 0.0929, "step": 6658 }, { "epoch": 0.4304808080808081, "grad_norm": 0.05229642987251282, "learning_rate": 0.00019117621606082718, "loss": 0.0768, "step": 6659 }, { "epoch": 0.43054545454545456, "grad_norm": 0.07743939012289047, "learning_rate": 0.00019117340710012915, "loss": 0.1081, "step": 6660 }, { "epoch": 0.430610101010101, "grad_norm": 0.06043891981244087, "learning_rate": 0.00019117059771304326, "loss": 0.0875, "step": 6661 }, { "epoch": 0.4306747474747475, "grad_norm": 0.05482163280248642, "learning_rate": 0.00019116778789958266, "loss": 0.0705, "step": 6662 }, { "epoch": 0.4307393939393939, "grad_norm": 0.06063186377286911, "learning_rate": 0.00019116497765976043, "loss": 0.0839, "step": 6663 }, { "epoch": 0.4308040404040404, "grad_norm": 0.06368141621351242, "learning_rate": 0.00019116216699358973, "loss": 0.0885, "step": 6664 }, { "epoch": 0.43086868686868685, "grad_norm": 0.0687251016497612, "learning_rate": 0.00019115935590108371, "loss": 0.09, "step": 6665 }, { "epoch": 0.43093333333333333, "grad_norm": 0.06637921184301376, "learning_rate": 0.00019115654438225553, "loss": 0.1015, "step": 6666 }, { "epoch": 0.4309979797979798, "grad_norm": 0.06342704594135284, "learning_rate": 0.00019115373243711834, "loss": 0.0837, "step": 6667 }, { "epoch": 0.43106262626262626, "grad_norm": 0.06134212017059326, "learning_rate": 0.00019115092006568524, "loss": 0.0835, "step": 6668 }, { "epoch": 0.43112727272727275, "grad_norm": 0.07064781337976456, "learning_rate": 0.00019114810726796944, "loss": 0.0941, "step": 6669 }, { 
"epoch": 0.4311919191919192, "grad_norm": 0.06272266060113907, "learning_rate": 0.00019114529404398408, "loss": 0.0818, "step": 6670 }, { "epoch": 0.43125656565656567, "grad_norm": 0.06405533850193024, "learning_rate": 0.00019114248039374233, "loss": 0.0964, "step": 6671 }, { "epoch": 0.4313212121212121, "grad_norm": 0.0646195039153099, "learning_rate": 0.00019113966631725734, "loss": 0.0926, "step": 6672 }, { "epoch": 0.4313212121212121, "eval_bleu": 16.41276748339313, "eval_loss": 0.09039974212646484, "eval_runtime": 2.7123, "eval_samples_per_second": 11.798, "eval_steps_per_second": 1.475, "step": 6672 }, { "epoch": 0.4313858585858586, "grad_norm": 0.06108766794204712, "learning_rate": 0.00019113685181454222, "loss": 0.093, "step": 6673 }, { "epoch": 0.43145050505050503, "grad_norm": 0.05477692559361458, "learning_rate": 0.00019113403688561018, "loss": 0.0772, "step": 6674 }, { "epoch": 0.4315151515151515, "grad_norm": 0.0831528827548027, "learning_rate": 0.00019113122153047437, "loss": 0.0968, "step": 6675 }, { "epoch": 0.43157979797979795, "grad_norm": 0.06713037937879562, "learning_rate": 0.00019112840574914797, "loss": 0.0972, "step": 6676 }, { "epoch": 0.43164444444444444, "grad_norm": 0.07544389367103577, "learning_rate": 0.00019112558954164418, "loss": 0.092, "step": 6677 }, { "epoch": 0.43170909090909093, "grad_norm": 0.05758429691195488, "learning_rate": 0.0001911227729079761, "loss": 0.0884, "step": 6678 }, { "epoch": 0.43177373737373737, "grad_norm": 0.05520794913172722, "learning_rate": 0.00019111995584815693, "loss": 0.0829, "step": 6679 }, { "epoch": 0.43183838383838385, "grad_norm": 0.057399529963731766, "learning_rate": 0.00019111713836219988, "loss": 0.084, "step": 6680 }, { "epoch": 0.4319030303030303, "grad_norm": 0.06861952692270279, "learning_rate": 0.00019111432045011806, "loss": 0.1014, "step": 6681 }, { "epoch": 0.4319676767676768, "grad_norm": 0.07065750658512115, "learning_rate": 0.0001911115021119247, "loss": 0.0985, "step": 6682 }, { 
"epoch": 0.4320323232323232, "grad_norm": 0.056043751537799835, "learning_rate": 0.00019110868334763298, "loss": 0.0714, "step": 6683 }, { "epoch": 0.4320969696969697, "grad_norm": 0.05294429138302803, "learning_rate": 0.00019110586415725607, "loss": 0.0799, "step": 6684 }, { "epoch": 0.43216161616161614, "grad_norm": 0.06799832731485367, "learning_rate": 0.0001911030445408071, "loss": 0.0962, "step": 6685 }, { "epoch": 0.4322262626262626, "grad_norm": 0.059292253106832504, "learning_rate": 0.00019110022449829936, "loss": 0.081, "step": 6686 }, { "epoch": 0.4322909090909091, "grad_norm": 0.06291504949331284, "learning_rate": 0.00019109740402974596, "loss": 0.0907, "step": 6687 }, { "epoch": 0.43235555555555555, "grad_norm": 0.06880916655063629, "learning_rate": 0.00019109458313516016, "loss": 0.086, "step": 6688 }, { "epoch": 0.43235555555555555, "eval_bleu": 16.47894474773476, "eval_loss": 0.08969217538833618, "eval_runtime": 2.748, "eval_samples_per_second": 11.645, "eval_steps_per_second": 1.456, "step": 6688 }, { "epoch": 0.43242020202020204, "grad_norm": 0.06024152785539627, "learning_rate": 0.00019109176181455507, "loss": 0.0838, "step": 6689 }, { "epoch": 0.43248484848484847, "grad_norm": 0.05266639217734337, "learning_rate": 0.00019108894006794396, "loss": 0.0813, "step": 6690 }, { "epoch": 0.43254949494949496, "grad_norm": 0.061658769845962524, "learning_rate": 0.00019108611789533995, "loss": 0.0808, "step": 6691 }, { "epoch": 0.4326141414141414, "grad_norm": 0.051952045410871506, "learning_rate": 0.00019108329529675627, "loss": 0.0828, "step": 6692 }, { "epoch": 0.4326787878787879, "grad_norm": 0.05340585485100746, "learning_rate": 0.00019108047227220617, "loss": 0.0691, "step": 6693 }, { "epoch": 0.4327434343434343, "grad_norm": 0.06265600770711899, "learning_rate": 0.0001910776488217028, "loss": 0.0937, "step": 6694 }, { "epoch": 0.4328080808080808, "grad_norm": 0.04557260125875473, "learning_rate": 0.00019107482494525935, "loss": 0.0718, "step": 6695 
}, { "epoch": 0.4328727272727273, "grad_norm": 0.048701077699661255, "learning_rate": 0.0001910720006428891, "loss": 0.0679, "step": 6696 }, { "epoch": 0.43293737373737373, "grad_norm": 0.08133979886770248, "learning_rate": 0.00019106917591460519, "loss": 0.1108, "step": 6697 }, { "epoch": 0.4330020202020202, "grad_norm": 0.05488954484462738, "learning_rate": 0.00019106635076042086, "loss": 0.083, "step": 6698 }, { "epoch": 0.43306666666666666, "grad_norm": 0.06451868265867233, "learning_rate": 0.0001910635251803493, "loss": 0.0821, "step": 6699 }, { "epoch": 0.43313131313131314, "grad_norm": 0.07605867087841034, "learning_rate": 0.00019106069917440373, "loss": 0.0963, "step": 6700 }, { "epoch": 0.4331959595959596, "grad_norm": 0.059033121913671494, "learning_rate": 0.00019105787274259739, "loss": 0.0744, "step": 6701 }, { "epoch": 0.43326060606060607, "grad_norm": 0.05753353238105774, "learning_rate": 0.00019105504588494347, "loss": 0.0793, "step": 6702 }, { "epoch": 0.4333252525252525, "grad_norm": 0.06622052937746048, "learning_rate": 0.0001910522186014552, "loss": 0.0946, "step": 6703 }, { "epoch": 0.433389898989899, "grad_norm": 0.050431326031684875, "learning_rate": 0.00019104939089214582, "loss": 0.0693, "step": 6704 }, { "epoch": 0.433389898989899, "eval_bleu": 19.32855283098627, "eval_loss": 0.09100078046321869, "eval_runtime": 2.8575, "eval_samples_per_second": 11.199, "eval_steps_per_second": 1.4, "step": 6704 }, { "epoch": 0.4334545454545455, "grad_norm": 0.05740467831492424, "learning_rate": 0.00019104656275702856, "loss": 0.0902, "step": 6705 }, { "epoch": 0.4335191919191919, "grad_norm": 0.06027499958872795, "learning_rate": 0.00019104373419611656, "loss": 0.0804, "step": 6706 }, { "epoch": 0.4335838383838384, "grad_norm": 0.06557957828044891, "learning_rate": 0.00019104090520942317, "loss": 0.1005, "step": 6707 }, { "epoch": 0.43364848484848484, "grad_norm": 0.046889279037714005, "learning_rate": 0.00019103807579696154, "loss": 0.0666, "step": 6708 
}, { "epoch": 0.43371313131313133, "grad_norm": 0.06470265239477158, "learning_rate": 0.00019103524595874492, "loss": 0.074, "step": 6709 }, { "epoch": 0.43377777777777776, "grad_norm": 0.06314821541309357, "learning_rate": 0.00019103241569478656, "loss": 0.0965, "step": 6710 }, { "epoch": 0.43384242424242425, "grad_norm": 0.058993518352508545, "learning_rate": 0.00019102958500509968, "loss": 0.0861, "step": 6711 }, { "epoch": 0.4339070707070707, "grad_norm": 0.0708533301949501, "learning_rate": 0.00019102675388969756, "loss": 0.1091, "step": 6712 }, { "epoch": 0.4339717171717172, "grad_norm": 0.06450173258781433, "learning_rate": 0.00019102392234859335, "loss": 0.0957, "step": 6713 }, { "epoch": 0.4340363636363636, "grad_norm": 0.060513876378536224, "learning_rate": 0.0001910210903818004, "loss": 0.0911, "step": 6714 }, { "epoch": 0.4341010101010101, "grad_norm": 0.06199612468481064, "learning_rate": 0.00019101825798933185, "loss": 0.0825, "step": 6715 }, { "epoch": 0.4341656565656566, "grad_norm": 0.0612860806286335, "learning_rate": 0.00019101542517120103, "loss": 0.0881, "step": 6716 }, { "epoch": 0.434230303030303, "grad_norm": 0.04900862276554108, "learning_rate": 0.00019101259192742115, "loss": 0.0636, "step": 6717 }, { "epoch": 0.4342949494949495, "grad_norm": 0.06391648948192596, "learning_rate": 0.0001910097582580055, "loss": 0.0884, "step": 6718 }, { "epoch": 0.43435959595959595, "grad_norm": 0.06530877947807312, "learning_rate": 0.00019100692416296723, "loss": 0.1031, "step": 6719 }, { "epoch": 0.43442424242424243, "grad_norm": 0.0682988315820694, "learning_rate": 0.0001910040896423197, "loss": 0.0944, "step": 6720 }, { "epoch": 0.43442424242424243, "eval_bleu": 19.597381302755885, "eval_loss": 0.08979760110378265, "eval_runtime": 2.689, "eval_samples_per_second": 11.901, "eval_steps_per_second": 1.488, "step": 6720 }, { "epoch": 0.43448888888888887, "grad_norm": 0.05005868896842003, "learning_rate": 0.00019100125469607613, "loss": 0.0772, "step": 6721 
}, { "epoch": 0.43455353535353536, "grad_norm": 0.061649978160858154, "learning_rate": 0.00019099841932424978, "loss": 0.0765, "step": 6722 }, { "epoch": 0.4346181818181818, "grad_norm": 0.061026379466056824, "learning_rate": 0.00019099558352685388, "loss": 0.0871, "step": 6723 }, { "epoch": 0.4346828282828283, "grad_norm": 0.05858129635453224, "learning_rate": 0.00019099274730390172, "loss": 0.0777, "step": 6724 }, { "epoch": 0.43474747474747477, "grad_norm": 0.058974042534828186, "learning_rate": 0.0001909899106554066, "loss": 0.0909, "step": 6725 }, { "epoch": 0.4348121212121212, "grad_norm": 0.05763079226016998, "learning_rate": 0.00019098707358138173, "loss": 0.0782, "step": 6726 }, { "epoch": 0.4348767676767677, "grad_norm": 0.062005434185266495, "learning_rate": 0.00019098423608184038, "loss": 0.0886, "step": 6727 }, { "epoch": 0.43494141414141413, "grad_norm": 0.060816679149866104, "learning_rate": 0.00019098139815679584, "loss": 0.0756, "step": 6728 }, { "epoch": 0.4350060606060606, "grad_norm": 0.05746083706617355, "learning_rate": 0.0001909785598062614, "loss": 0.0843, "step": 6729 }, { "epoch": 0.43507070707070705, "grad_norm": 0.06679429858922958, "learning_rate": 0.0001909757210302503, "loss": 0.0867, "step": 6730 }, { "epoch": 0.43513535353535354, "grad_norm": 0.06472452729940414, "learning_rate": 0.00019097288182877583, "loss": 0.0907, "step": 6731 }, { "epoch": 0.4352, "grad_norm": 0.060266487300395966, "learning_rate": 0.00019097004220185125, "loss": 0.0807, "step": 6732 }, { "epoch": 0.43526464646464647, "grad_norm": 0.07091352343559265, "learning_rate": 0.00019096720214948986, "loss": 0.1067, "step": 6733 }, { "epoch": 0.43532929292929295, "grad_norm": 0.06913848966360092, "learning_rate": 0.00019096436167170494, "loss": 0.0949, "step": 6734 }, { "epoch": 0.4353939393939394, "grad_norm": 0.05639937147498131, "learning_rate": 0.00019096152076850982, "loss": 0.0896, "step": 6735 }, { "epoch": 0.4354585858585859, "grad_norm": 0.06356509029865265, 
"learning_rate": 0.00019095867943991768, "loss": 0.0947, "step": 6736 }, { "epoch": 0.4354585858585859, "eval_bleu": 18.09796533838362, "eval_loss": 0.0898888036608696, "eval_runtime": 2.57, "eval_samples_per_second": 12.451, "eval_steps_per_second": 1.556, "step": 6736 }, { "epoch": 0.4355232323232323, "grad_norm": 0.055265769362449646, "learning_rate": 0.00019095583768594188, "loss": 0.0878, "step": 6737 }, { "epoch": 0.4355878787878788, "grad_norm": 0.06255878508090973, "learning_rate": 0.0001909529955065957, "loss": 0.1067, "step": 6738 }, { "epoch": 0.43565252525252524, "grad_norm": 0.05967830866575241, "learning_rate": 0.00019095015290189243, "loss": 0.0942, "step": 6739 }, { "epoch": 0.4357171717171717, "grad_norm": 0.06846214830875397, "learning_rate": 0.0001909473098718454, "loss": 0.084, "step": 6740 }, { "epoch": 0.43578181818181816, "grad_norm": 0.06153913587331772, "learning_rate": 0.0001909444664164678, "loss": 0.0914, "step": 6741 }, { "epoch": 0.43584646464646465, "grad_norm": 0.05843881517648697, "learning_rate": 0.00019094162253577307, "loss": 0.0791, "step": 6742 }, { "epoch": 0.43591111111111114, "grad_norm": 0.06679181009531021, "learning_rate": 0.00019093877822977439, "loss": 0.0863, "step": 6743 }, { "epoch": 0.43597575757575757, "grad_norm": 0.05715608224272728, "learning_rate": 0.0001909359334984851, "loss": 0.0809, "step": 6744 }, { "epoch": 0.43604040404040406, "grad_norm": 0.05215649679303169, "learning_rate": 0.0001909330883419185, "loss": 0.0807, "step": 6745 }, { "epoch": 0.4361050505050505, "grad_norm": 0.05429426580667496, "learning_rate": 0.00019093024276008795, "loss": 0.0748, "step": 6746 }, { "epoch": 0.436169696969697, "grad_norm": 0.050558798015117645, "learning_rate": 0.0001909273967530067, "loss": 0.0652, "step": 6747 }, { "epoch": 0.4362343434343434, "grad_norm": 0.0589262917637825, "learning_rate": 0.00019092455032068808, "loss": 0.0906, "step": 6748 }, { "epoch": 0.4362989898989899, "grad_norm": 0.06198734790086746, 
"learning_rate": 0.00019092170346314538, "loss": 0.0966, "step": 6749 }, { "epoch": 0.43636363636363634, "grad_norm": 0.08289777487516403, "learning_rate": 0.00019091885618039196, "loss": 0.0824, "step": 6750 }, { "epoch": 0.43642828282828283, "grad_norm": 0.06290540099143982, "learning_rate": 0.00019091600847244108, "loss": 0.0894, "step": 6751 }, { "epoch": 0.43649292929292927, "grad_norm": 0.05783205106854439, "learning_rate": 0.0001909131603393061, "loss": 0.0866, "step": 6752 }, { "epoch": 0.43649292929292927, "eval_bleu": 16.52805380949454, "eval_loss": 0.0909336507320404, "eval_runtime": 2.6697, "eval_samples_per_second": 11.986, "eval_steps_per_second": 1.498, "step": 6752 }, { "epoch": 0.43655757575757576, "grad_norm": 0.06865835934877396, "learning_rate": 0.00019091031178100032, "loss": 0.0869, "step": 6753 }, { "epoch": 0.43662222222222224, "grad_norm": 0.060650721192359924, "learning_rate": 0.00019090746279753706, "loss": 0.088, "step": 6754 }, { "epoch": 0.4366868686868687, "grad_norm": 0.06919240206480026, "learning_rate": 0.00019090461338892967, "loss": 0.0936, "step": 6755 }, { "epoch": 0.43675151515151517, "grad_norm": 0.05776713043451309, "learning_rate": 0.00019090176355519143, "loss": 0.0869, "step": 6756 }, { "epoch": 0.4368161616161616, "grad_norm": 0.05391646549105644, "learning_rate": 0.0001908989132963357, "loss": 0.0733, "step": 6757 }, { "epoch": 0.4368808080808081, "grad_norm": 0.06746477633714676, "learning_rate": 0.00019089606261237584, "loss": 0.0999, "step": 6758 }, { "epoch": 0.4369454545454545, "grad_norm": 0.06493270397186279, "learning_rate": 0.00019089321150332508, "loss": 0.0906, "step": 6759 }, { "epoch": 0.437010101010101, "grad_norm": 0.05379673093557358, "learning_rate": 0.00019089035996919688, "loss": 0.0718, "step": 6760 }, { "epoch": 0.43707474747474745, "grad_norm": 0.05376768857240677, "learning_rate": 0.00019088750801000447, "loss": 0.0851, "step": 6761 }, { "epoch": 0.43713939393939394, "grad_norm": 
0.06285647302865982, "learning_rate": 0.00019088465562576123, "loss": 0.0827, "step": 6762 }, { "epoch": 0.43720404040404043, "grad_norm": 0.055171310901641846, "learning_rate": 0.00019088180281648053, "loss": 0.0797, "step": 6763 }, { "epoch": 0.43726868686868686, "grad_norm": 0.055926255881786346, "learning_rate": 0.00019087894958217568, "loss": 0.0804, "step": 6764 }, { "epoch": 0.43733333333333335, "grad_norm": 0.0702073872089386, "learning_rate": 0.00019087609592286, "loss": 0.0932, "step": 6765 }, { "epoch": 0.4373979797979798, "grad_norm": 0.05558589845895767, "learning_rate": 0.00019087324183854687, "loss": 0.0833, "step": 6766 }, { "epoch": 0.4374626262626263, "grad_norm": 0.06253517419099808, "learning_rate": 0.00019087038732924967, "loss": 0.0878, "step": 6767 }, { "epoch": 0.4375272727272727, "grad_norm": 0.0822148472070694, "learning_rate": 0.00019086753239498164, "loss": 0.0931, "step": 6768 }, { "epoch": 0.4375272727272727, "eval_bleu": 19.400239134833487, "eval_loss": 0.09037335216999054, "eval_runtime": 2.6531, "eval_samples_per_second": 12.061, "eval_steps_per_second": 1.508, "step": 6768 }, { "epoch": 0.4375919191919192, "grad_norm": 0.06698177009820938, "learning_rate": 0.00019086467703575627, "loss": 0.0992, "step": 6769 }, { "epoch": 0.43765656565656563, "grad_norm": 0.05294684320688248, "learning_rate": 0.0001908618212515868, "loss": 0.0745, "step": 6770 }, { "epoch": 0.4377212121212121, "grad_norm": 0.06932110339403152, "learning_rate": 0.00019085896504248667, "loss": 0.0952, "step": 6771 }, { "epoch": 0.4377858585858586, "grad_norm": 0.062051549553871155, "learning_rate": 0.00019085610840846915, "loss": 0.1007, "step": 6772 }, { "epoch": 0.43785050505050505, "grad_norm": 0.053681932389736176, "learning_rate": 0.00019085325134954767, "loss": 0.0799, "step": 6773 }, { "epoch": 0.43791515151515154, "grad_norm": 0.06426096707582474, "learning_rate": 0.0001908503938657356, "loss": 0.0947, "step": 6774 }, { "epoch": 0.43797979797979797, 
"grad_norm": 0.0663200095295906, "learning_rate": 0.00019084753595704622, "loss": 0.0948, "step": 6775 }, { "epoch": 0.43804444444444446, "grad_norm": 0.06578589230775833, "learning_rate": 0.00019084467762349298, "loss": 0.0897, "step": 6776 }, { "epoch": 0.4381090909090909, "grad_norm": 0.06200915202498436, "learning_rate": 0.0001908418188650892, "loss": 0.0968, "step": 6777 }, { "epoch": 0.4381737373737374, "grad_norm": 0.06902661174535751, "learning_rate": 0.0001908389596818483, "loss": 0.0907, "step": 6778 }, { "epoch": 0.4382383838383838, "grad_norm": 0.05732037127017975, "learning_rate": 0.00019083610007378357, "loss": 0.0876, "step": 6779 }, { "epoch": 0.4383030303030303, "grad_norm": 0.05982803925871849, "learning_rate": 0.00019083324004090844, "loss": 0.0873, "step": 6780 }, { "epoch": 0.4383676767676768, "grad_norm": 0.06258191913366318, "learning_rate": 0.00019083037958323628, "loss": 0.0973, "step": 6781 }, { "epoch": 0.43843232323232323, "grad_norm": 0.059785228222608566, "learning_rate": 0.00019082751870078046, "loss": 0.091, "step": 6782 }, { "epoch": 0.4384969696969697, "grad_norm": 0.06572655588388443, "learning_rate": 0.00019082465739355436, "loss": 0.0879, "step": 6783 }, { "epoch": 0.43856161616161615, "grad_norm": 0.05563567206263542, "learning_rate": 0.00019082179566157139, "loss": 0.0764, "step": 6784 }, { "epoch": 0.43856161616161615, "eval_bleu": 16.106795121769895, "eval_loss": 0.08832554519176483, "eval_runtime": 2.6244, "eval_samples_per_second": 12.193, "eval_steps_per_second": 1.524, "step": 6784 }, { "epoch": 0.43862626262626264, "grad_norm": 0.06797052174806595, "learning_rate": 0.00019081893350484486, "loss": 0.0942, "step": 6785 }, { "epoch": 0.4386909090909091, "grad_norm": 0.04467397183179855, "learning_rate": 0.00019081607092338824, "loss": 0.054, "step": 6786 }, { "epoch": 0.43875555555555557, "grad_norm": 0.057415660470724106, "learning_rate": 0.00019081320791721488, "loss": 0.0897, "step": 6787 }, { "epoch": 
0.438820202020202, "grad_norm": 0.06148495897650719, "learning_rate": 0.00019081034448633813, "loss": 0.081, "step": 6788 }, { "epoch": 0.4388848484848485, "grad_norm": 0.06306958943605423, "learning_rate": 0.00019080748063077146, "loss": 0.0945, "step": 6789 }, { "epoch": 0.4389494949494949, "grad_norm": 0.059776801615953445, "learning_rate": 0.00019080461635052822, "loss": 0.0863, "step": 6790 }, { "epoch": 0.4390141414141414, "grad_norm": 0.06156567111611366, "learning_rate": 0.0001908017516456218, "loss": 0.0868, "step": 6791 }, { "epoch": 0.4390787878787879, "grad_norm": 0.064261794090271, "learning_rate": 0.0001907988865160656, "loss": 0.0944, "step": 6792 }, { "epoch": 0.43914343434343434, "grad_norm": 0.05549774691462517, "learning_rate": 0.00019079602096187305, "loss": 0.0797, "step": 6793 }, { "epoch": 0.4392080808080808, "grad_norm": 0.06017737090587616, "learning_rate": 0.0001907931549830575, "loss": 0.0911, "step": 6794 }, { "epoch": 0.43927272727272726, "grad_norm": 0.0667809396982193, "learning_rate": 0.0001907902885796324, "loss": 0.0912, "step": 6795 }, { "epoch": 0.43933737373737375, "grad_norm": 0.06037105992436409, "learning_rate": 0.0001907874217516111, "loss": 0.1006, "step": 6796 }, { "epoch": 0.4394020202020202, "grad_norm": 0.05256382375955582, "learning_rate": 0.0001907845544990071, "loss": 0.0693, "step": 6797 }, { "epoch": 0.43946666666666667, "grad_norm": 0.06434550881385803, "learning_rate": 0.00019078168682183367, "loss": 0.1012, "step": 6798 }, { "epoch": 0.4395313131313131, "grad_norm": 0.060745999217033386, "learning_rate": 0.00019077881872010436, "loss": 0.0968, "step": 6799 }, { "epoch": 0.4395959595959596, "grad_norm": 0.07048343867063522, "learning_rate": 0.0001907759501938325, "loss": 0.1027, "step": 6800 }, { "epoch": 0.4395959595959596, "eval_bleu": 13.165130364933864, "eval_loss": 0.09007284790277481, "eval_runtime": 2.5091, "eval_samples_per_second": 12.754, "eval_steps_per_second": 1.594, "step": 6800 }, { "epoch": 
0.4396606060606061, "grad_norm": 0.10203253477811813, "learning_rate": 0.00019077308124303157, "loss": 0.1007, "step": 6801 }, { "epoch": 0.4397252525252525, "grad_norm": 0.06673820316791534, "learning_rate": 0.00019077021186771493, "loss": 0.0867, "step": 6802 }, { "epoch": 0.439789898989899, "grad_norm": 0.06017213314771652, "learning_rate": 0.000190767342067896, "loss": 0.0904, "step": 6803 }, { "epoch": 0.43985454545454544, "grad_norm": 0.05252980813384056, "learning_rate": 0.00019076447184358824, "loss": 0.0746, "step": 6804 }, { "epoch": 0.43991919191919193, "grad_norm": 0.05212440341711044, "learning_rate": 0.00019076160119480505, "loss": 0.0814, "step": 6805 }, { "epoch": 0.43998383838383837, "grad_norm": 0.05741170793771744, "learning_rate": 0.00019075873012155983, "loss": 0.0916, "step": 6806 }, { "epoch": 0.44004848484848486, "grad_norm": 0.0692896693944931, "learning_rate": 0.00019075585862386607, "loss": 0.1184, "step": 6807 }, { "epoch": 0.4401131313131313, "grad_norm": 0.05625918135046959, "learning_rate": 0.00019075298670173714, "loss": 0.0649, "step": 6808 }, { "epoch": 0.4401777777777778, "grad_norm": 0.055878665298223495, "learning_rate": 0.00019075011435518648, "loss": 0.0863, "step": 6809 }, { "epoch": 0.44024242424242427, "grad_norm": 0.05237409472465515, "learning_rate": 0.00019074724158422757, "loss": 0.0792, "step": 6810 }, { "epoch": 0.4403070707070707, "grad_norm": 0.05773475393652916, "learning_rate": 0.00019074436838887378, "loss": 0.0819, "step": 6811 }, { "epoch": 0.4403717171717172, "grad_norm": 0.06371887028217316, "learning_rate": 0.0001907414947691386, "loss": 0.0848, "step": 6812 }, { "epoch": 0.4404363636363636, "grad_norm": 0.048104751855134964, "learning_rate": 0.00019073862072503542, "loss": 0.0655, "step": 6813 }, { "epoch": 0.4405010101010101, "grad_norm": 0.05337997525930405, "learning_rate": 0.00019073574625657776, "loss": 0.0755, "step": 6814 }, { "epoch": 0.44056565656565655, "grad_norm": 0.05910511314868927, 
"learning_rate": 0.00019073287136377895, "loss": 0.0946, "step": 6815 }, { "epoch": 0.44063030303030304, "grad_norm": 0.056381747126579285, "learning_rate": 0.00019072999604665254, "loss": 0.0793, "step": 6816 }, { "epoch": 0.44063030303030304, "eval_bleu": 16.874461782108767, "eval_loss": 0.0918041318655014, "eval_runtime": 2.542, "eval_samples_per_second": 12.588, "eval_steps_per_second": 1.574, "step": 6816 }, { "epoch": 0.4406949494949495, "grad_norm": 0.06909943372011185, "learning_rate": 0.00019072712030521194, "loss": 0.0879, "step": 6817 }, { "epoch": 0.44075959595959596, "grad_norm": 0.0627220869064331, "learning_rate": 0.00019072424413947055, "loss": 0.0752, "step": 6818 }, { "epoch": 0.44082424242424245, "grad_norm": 0.05220532789826393, "learning_rate": 0.00019072136754944187, "loss": 0.0705, "step": 6819 }, { "epoch": 0.4408888888888889, "grad_norm": 0.05247614160180092, "learning_rate": 0.0001907184905351394, "loss": 0.0725, "step": 6820 }, { "epoch": 0.4409535353535354, "grad_norm": 0.056871362030506134, "learning_rate": 0.00019071561309657648, "loss": 0.0824, "step": 6821 }, { "epoch": 0.4410181818181818, "grad_norm": 0.06117440015077591, "learning_rate": 0.00019071273523376664, "loss": 0.0719, "step": 6822 }, { "epoch": 0.4410828282828283, "grad_norm": 0.07127328962087631, "learning_rate": 0.00019070985694672333, "loss": 0.0979, "step": 6823 }, { "epoch": 0.44114747474747473, "grad_norm": 0.059418823570013046, "learning_rate": 0.00019070697823546002, "loss": 0.0802, "step": 6824 }, { "epoch": 0.4412121212121212, "grad_norm": 0.05781334638595581, "learning_rate": 0.00019070409909999016, "loss": 0.0789, "step": 6825 }, { "epoch": 0.44127676767676766, "grad_norm": 0.06055542454123497, "learning_rate": 0.00019070121954032723, "loss": 0.0833, "step": 6826 }, { "epoch": 0.44134141414141415, "grad_norm": 0.06089811772108078, "learning_rate": 0.00019069833955648465, "loss": 0.084, "step": 6827 }, { "epoch": 0.4414060606060606, "grad_norm": 
0.06183990091085434, "learning_rate": 0.00019069545914847594, "loss": 0.0904, "step": 6828 }, { "epoch": 0.44147070707070707, "grad_norm": 0.059669092297554016, "learning_rate": 0.00019069257831631452, "loss": 0.0997, "step": 6829 }, { "epoch": 0.44153535353535356, "grad_norm": 0.04831521213054657, "learning_rate": 0.00019068969706001392, "loss": 0.0626, "step": 6830 }, { "epoch": 0.4416, "grad_norm": 0.08243787288665771, "learning_rate": 0.00019068681537958758, "loss": 0.0946, "step": 6831 }, { "epoch": 0.4416646464646465, "grad_norm": 0.0529077984392643, "learning_rate": 0.00019068393327504896, "loss": 0.0744, "step": 6832 }, { "epoch": 0.4416646464646465, "eval_bleu": 14.841361886476385, "eval_loss": 0.09283682703971863, "eval_runtime": 2.621, "eval_samples_per_second": 12.209, "eval_steps_per_second": 1.526, "step": 6832 }, { "epoch": 0.4417292929292929, "grad_norm": 0.062294360250234604, "learning_rate": 0.0001906810507464116, "loss": 0.0894, "step": 6833 }, { "epoch": 0.4417939393939394, "grad_norm": 0.05574224516749382, "learning_rate": 0.00019067816779368894, "loss": 0.0744, "step": 6834 }, { "epoch": 0.44185858585858584, "grad_norm": 0.0614391528069973, "learning_rate": 0.00019067528441689445, "loss": 0.0895, "step": 6835 }, { "epoch": 0.44192323232323233, "grad_norm": 0.0630955621600151, "learning_rate": 0.00019067240061604166, "loss": 0.0871, "step": 6836 }, { "epoch": 0.44198787878787876, "grad_norm": 0.07296746969223022, "learning_rate": 0.000190669516391144, "loss": 0.1043, "step": 6837 }, { "epoch": 0.44205252525252525, "grad_norm": 0.05839208886027336, "learning_rate": 0.000190666631742215, "loss": 0.0796, "step": 6838 }, { "epoch": 0.44211717171717174, "grad_norm": 0.07029929757118225, "learning_rate": 0.00019066374666926815, "loss": 0.1054, "step": 6839 }, { "epoch": 0.4421818181818182, "grad_norm": 0.05576518550515175, "learning_rate": 0.0001906608611723169, "loss": 0.0824, "step": 6840 }, { "epoch": 0.44224646464646467, "grad_norm": 
0.05051611363887787, "learning_rate": 0.0001906579752513748, "loss": 0.0709, "step": 6841 }, { "epoch": 0.4423111111111111, "grad_norm": 0.05694295093417168, "learning_rate": 0.0001906550889064553, "loss": 0.091, "step": 6842 }, { "epoch": 0.4423757575757576, "grad_norm": 0.06398777663707733, "learning_rate": 0.00019065220213757193, "loss": 0.091, "step": 6843 }, { "epoch": 0.442440404040404, "grad_norm": 0.05899769440293312, "learning_rate": 0.0001906493149447382, "loss": 0.0818, "step": 6844 }, { "epoch": 0.4425050505050505, "grad_norm": 0.07618407905101776, "learning_rate": 0.00019064642732796757, "loss": 0.0876, "step": 6845 }, { "epoch": 0.44256969696969695, "grad_norm": 0.05732094496488571, "learning_rate": 0.00019064353928727355, "loss": 0.0707, "step": 6846 }, { "epoch": 0.44263434343434344, "grad_norm": 0.06402862817049026, "learning_rate": 0.0001906406508226697, "loss": 0.0871, "step": 6847 }, { "epoch": 0.4426989898989899, "grad_norm": 0.07104036957025528, "learning_rate": 0.00019063776193416944, "loss": 0.0938, "step": 6848 }, { "epoch": 0.4426989898989899, "eval_bleu": 12.348084315869894, "eval_loss": 0.09159594774246216, "eval_runtime": 2.6014, "eval_samples_per_second": 12.301, "eval_steps_per_second": 1.538, "step": 6848 }, { "epoch": 0.44276363636363636, "grad_norm": 0.0651189386844635, "learning_rate": 0.00019063487262178637, "loss": 0.1004, "step": 6849 }, { "epoch": 0.44282828282828285, "grad_norm": 0.0690358579158783, "learning_rate": 0.00019063198288553395, "loss": 0.1026, "step": 6850 }, { "epoch": 0.4428929292929293, "grad_norm": 0.06083299592137337, "learning_rate": 0.00019062909272542572, "loss": 0.0907, "step": 6851 }, { "epoch": 0.44295757575757577, "grad_norm": 0.06530936062335968, "learning_rate": 0.00019062620214147516, "loss": 0.0934, "step": 6852 }, { "epoch": 0.4430222222222222, "grad_norm": 0.06842318922281265, "learning_rate": 0.00019062331113369585, "loss": 0.0958, "step": 6853 }, { "epoch": 0.4430868686868687, "grad_norm": 
0.055230479687452316, "learning_rate": 0.00019062041970210127, "loss": 0.0794, "step": 6854 }, { "epoch": 0.44315151515151513, "grad_norm": 0.061038825660943985, "learning_rate": 0.00019061752784670492, "loss": 0.0865, "step": 6855 }, { "epoch": 0.4432161616161616, "grad_norm": 0.059175316244363785, "learning_rate": 0.00019061463556752038, "loss": 0.092, "step": 6856 }, { "epoch": 0.44328080808080805, "grad_norm": 0.06119150295853615, "learning_rate": 0.0001906117428645611, "loss": 0.0917, "step": 6857 }, { "epoch": 0.44334545454545454, "grad_norm": 0.058736417442560196, "learning_rate": 0.0001906088497378407, "loss": 0.0876, "step": 6858 }, { "epoch": 0.44341010101010103, "grad_norm": 0.06157537177205086, "learning_rate": 0.00019060595618737263, "loss": 0.0975, "step": 6859 }, { "epoch": 0.44347474747474747, "grad_norm": 0.043411433696746826, "learning_rate": 0.00019060306221317048, "loss": 0.061, "step": 6860 }, { "epoch": 0.44353939393939396, "grad_norm": 0.06295739859342575, "learning_rate": 0.00019060016781524775, "loss": 0.0947, "step": 6861 }, { "epoch": 0.4436040404040404, "grad_norm": 0.06735324114561081, "learning_rate": 0.00019059727299361798, "loss": 0.107, "step": 6862 }, { "epoch": 0.4436686868686869, "grad_norm": 0.0541529506444931, "learning_rate": 0.00019059437774829473, "loss": 0.076, "step": 6863 }, { "epoch": 0.4437333333333333, "grad_norm": 0.0565139539539814, "learning_rate": 0.00019059148207929152, "loss": 0.0895, "step": 6864 }, { "epoch": 0.4437333333333333, "eval_bleu": 13.727878345492915, "eval_loss": 0.09107012301683426, "eval_runtime": 2.5904, "eval_samples_per_second": 12.353, "eval_steps_per_second": 1.544, "step": 6864 }, { "epoch": 0.4437979797979798, "grad_norm": 0.058204129338264465, "learning_rate": 0.00019058858598662194, "loss": 0.0784, "step": 6865 }, { "epoch": 0.44386262626262624, "grad_norm": 0.059144530445337296, "learning_rate": 0.00019058568947029944, "loss": 0.0706, "step": 6866 }, { "epoch": 0.4439272727272727, 
"grad_norm": 0.06533249467611313, "learning_rate": 0.00019058279253033762, "loss": 0.0974, "step": 6867 }, { "epoch": 0.4439919191919192, "grad_norm": 0.05503584444522858, "learning_rate": 0.00019057989516675004, "loss": 0.0882, "step": 6868 }, { "epoch": 0.44405656565656565, "grad_norm": 0.056108783930540085, "learning_rate": 0.0001905769973795502, "loss": 0.0889, "step": 6869 }, { "epoch": 0.44412121212121214, "grad_norm": 0.058371949940919876, "learning_rate": 0.00019057409916875174, "loss": 0.0639, "step": 6870 }, { "epoch": 0.4441858585858586, "grad_norm": 0.055866118520498276, "learning_rate": 0.0001905712005343681, "loss": 0.0776, "step": 6871 }, { "epoch": 0.44425050505050506, "grad_norm": 0.05752483382821083, "learning_rate": 0.00019056830147641293, "loss": 0.0921, "step": 6872 }, { "epoch": 0.4443151515151515, "grad_norm": 0.061909694224596024, "learning_rate": 0.00019056540199489979, "loss": 0.0939, "step": 6873 }, { "epoch": 0.444379797979798, "grad_norm": 0.07148955017328262, "learning_rate": 0.00019056250208984216, "loss": 0.0829, "step": 6874 }, { "epoch": 0.4444444444444444, "grad_norm": 0.05289623141288757, "learning_rate": 0.00019055960176125368, "loss": 0.0811, "step": 6875 }, { "epoch": 0.4445090909090909, "grad_norm": 0.05738150328397751, "learning_rate": 0.00019055670100914785, "loss": 0.0951, "step": 6876 }, { "epoch": 0.4445737373737374, "grad_norm": 0.05527816712856293, "learning_rate": 0.00019055379983353826, "loss": 0.0838, "step": 6877 }, { "epoch": 0.44463838383838383, "grad_norm": 0.05478030443191528, "learning_rate": 0.0001905508982344385, "loss": 0.082, "step": 6878 }, { "epoch": 0.4447030303030303, "grad_norm": 0.06548585742712021, "learning_rate": 0.00019054799621186214, "loss": 0.0957, "step": 6879 }, { "epoch": 0.44476767676767676, "grad_norm": 0.06199490278959274, "learning_rate": 0.00019054509376582272, "loss": 0.0965, "step": 6880 }, { "epoch": 0.44476767676767676, "eval_bleu": 10.739428037158445, "eval_loss": 
0.09167717397212982, "eval_runtime": 2.6647, "eval_samples_per_second": 12.009, "eval_steps_per_second": 1.501, "step": 6880 }, { "epoch": 0.44483232323232325, "grad_norm": 0.05316866934299469, "learning_rate": 0.00019054219089633385, "loss": 0.0777, "step": 6881 }, { "epoch": 0.4448969696969697, "grad_norm": 0.07083311676979065, "learning_rate": 0.00019053928760340906, "loss": 0.0979, "step": 6882 }, { "epoch": 0.44496161616161617, "grad_norm": 0.06002625450491905, "learning_rate": 0.00019053638388706197, "loss": 0.0834, "step": 6883 }, { "epoch": 0.4450262626262626, "grad_norm": 0.061205070465803146, "learning_rate": 0.00019053347974730614, "loss": 0.0904, "step": 6884 }, { "epoch": 0.4450909090909091, "grad_norm": 0.05568086728453636, "learning_rate": 0.00019053057518415517, "loss": 0.0839, "step": 6885 }, { "epoch": 0.4451555555555556, "grad_norm": 0.07098755985498428, "learning_rate": 0.0001905276701976226, "loss": 0.1138, "step": 6886 }, { "epoch": 0.445220202020202, "grad_norm": 0.05639488995075226, "learning_rate": 0.00019052476478772205, "loss": 0.0861, "step": 6887 }, { "epoch": 0.4452848484848485, "grad_norm": 0.05245119333267212, "learning_rate": 0.0001905218589544671, "loss": 0.0831, "step": 6888 }, { "epoch": 0.44534949494949494, "grad_norm": 0.0634574219584465, "learning_rate": 0.00019051895269787137, "loss": 0.1053, "step": 6889 }, { "epoch": 0.44541414141414143, "grad_norm": 0.06082945317029953, "learning_rate": 0.00019051604601794838, "loss": 0.0894, "step": 6890 }, { "epoch": 0.44547878787878786, "grad_norm": 0.06841325014829636, "learning_rate": 0.0001905131389147118, "loss": 0.0928, "step": 6891 }, { "epoch": 0.44554343434343435, "grad_norm": 0.06371424347162247, "learning_rate": 0.00019051023138817519, "loss": 0.0913, "step": 6892 }, { "epoch": 0.4456080808080808, "grad_norm": 0.050261154770851135, "learning_rate": 0.00019050732343835213, "loss": 0.0739, "step": 6893 }, { "epoch": 0.4456727272727273, "grad_norm": 0.05761515721678734, 
"learning_rate": 0.00019050441506525626, "loss": 0.0908, "step": 6894 }, { "epoch": 0.4457373737373737, "grad_norm": 0.052624963223934174, "learning_rate": 0.00019050150626890116, "loss": 0.0715, "step": 6895 }, { "epoch": 0.4458020202020202, "grad_norm": 0.06262457370758057, "learning_rate": 0.00019049859704930043, "loss": 0.0875, "step": 6896 }, { "epoch": 0.4458020202020202, "eval_bleu": 14.946595774442368, "eval_loss": 0.08949264883995056, "eval_runtime": 2.5368, "eval_samples_per_second": 12.615, "eval_steps_per_second": 1.577, "step": 6896 }, { "epoch": 0.4458666666666667, "grad_norm": 0.05931051820516586, "learning_rate": 0.00019049568740646766, "loss": 0.0976, "step": 6897 }, { "epoch": 0.4459313131313131, "grad_norm": 0.057962797582149506, "learning_rate": 0.00019049277734041651, "loss": 0.0779, "step": 6898 }, { "epoch": 0.4459959595959596, "grad_norm": 0.05798592045903206, "learning_rate": 0.00019048986685116053, "loss": 0.0801, "step": 6899 }, { "epoch": 0.44606060606060605, "grad_norm": 0.05407224968075752, "learning_rate": 0.00019048695593871338, "loss": 0.0693, "step": 6900 }, { "epoch": 0.44612525252525254, "grad_norm": 0.06827379763126373, "learning_rate": 0.00019048404460308863, "loss": 0.0804, "step": 6901 }, { "epoch": 0.44618989898989897, "grad_norm": 0.05188268423080444, "learning_rate": 0.00019048113284429993, "loss": 0.0757, "step": 6902 }, { "epoch": 0.44625454545454546, "grad_norm": 0.05204017832875252, "learning_rate": 0.00019047822066236086, "loss": 0.0729, "step": 6903 }, { "epoch": 0.4463191919191919, "grad_norm": 0.06660844385623932, "learning_rate": 0.0001904753080572851, "loss": 0.0903, "step": 6904 }, { "epoch": 0.4463838383838384, "grad_norm": 0.05577358976006508, "learning_rate": 0.00019047239502908623, "loss": 0.0811, "step": 6905 }, { "epoch": 0.4464484848484849, "grad_norm": 0.06544695049524307, "learning_rate": 0.00019046948157777784, "loss": 0.0866, "step": 6906 }, { "epoch": 0.4465131313131313, "grad_norm": 
0.060708314180374146, "learning_rate": 0.00019046656770337363, "loss": 0.0858, "step": 6907 }, { "epoch": 0.4465777777777778, "grad_norm": 0.09825305640697479, "learning_rate": 0.00019046365340588719, "loss": 0.0844, "step": 6908 }, { "epoch": 0.44664242424242423, "grad_norm": 0.0490134172141552, "learning_rate": 0.00019046073868533213, "loss": 0.0713, "step": 6909 }, { "epoch": 0.4467070707070707, "grad_norm": 0.06182310730218887, "learning_rate": 0.00019045782354172212, "loss": 0.0877, "step": 6910 }, { "epoch": 0.44677171717171715, "grad_norm": 0.058152493089437485, "learning_rate": 0.00019045490797507078, "loss": 0.0826, "step": 6911 }, { "epoch": 0.44683636363636364, "grad_norm": 0.05518599599599838, "learning_rate": 0.0001904519919853917, "loss": 0.083, "step": 6912 }, { "epoch": 0.44683636363636364, "eval_bleu": 15.42838824496179, "eval_loss": 0.08855625241994858, "eval_runtime": 2.6714, "eval_samples_per_second": 11.979, "eval_steps_per_second": 1.497, "step": 6912 }, { "epoch": 0.4469010101010101, "grad_norm": 0.07188152521848679, "learning_rate": 0.0001904490755726986, "loss": 0.1089, "step": 6913 }, { "epoch": 0.44696565656565657, "grad_norm": 0.06542427092790604, "learning_rate": 0.00019044615873700505, "loss": 0.0881, "step": 6914 }, { "epoch": 0.44703030303030306, "grad_norm": 0.05406093969941139, "learning_rate": 0.00019044324147832474, "loss": 0.075, "step": 6915 }, { "epoch": 0.4470949494949495, "grad_norm": 0.05813553184270859, "learning_rate": 0.00019044032379667125, "loss": 0.0844, "step": 6916 }, { "epoch": 0.447159595959596, "grad_norm": 0.05680302157998085, "learning_rate": 0.00019043740569205827, "loss": 0.0793, "step": 6917 }, { "epoch": 0.4472242424242424, "grad_norm": 0.058941833674907684, "learning_rate": 0.00019043448716449948, "loss": 0.0962, "step": 6918 }, { "epoch": 0.4472888888888889, "grad_norm": 0.05661686509847641, "learning_rate": 0.00019043156821400844, "loss": 0.0843, "step": 6919 }, { "epoch": 0.44735353535353534, 
"grad_norm": 0.07589738816022873, "learning_rate": 0.0001904286488405989, "loss": 0.1031, "step": 6920 }, { "epoch": 0.4474181818181818, "grad_norm": 0.05732523277401924, "learning_rate": 0.00019042572904428445, "loss": 0.0745, "step": 6921 }, { "epoch": 0.44748282828282826, "grad_norm": 0.05749354511499405, "learning_rate": 0.00019042280882507876, "loss": 0.0759, "step": 6922 }, { "epoch": 0.44754747474747475, "grad_norm": 0.058268822729587555, "learning_rate": 0.0001904198881829955, "loss": 0.0751, "step": 6923 }, { "epoch": 0.44761212121212124, "grad_norm": 0.05875641107559204, "learning_rate": 0.00019041696711804827, "loss": 0.094, "step": 6924 }, { "epoch": 0.4476767676767677, "grad_norm": 0.06455613672733307, "learning_rate": 0.0001904140456302508, "loss": 0.0868, "step": 6925 }, { "epoch": 0.44774141414141416, "grad_norm": 0.060186147689819336, "learning_rate": 0.00019041112371961672, "loss": 0.0833, "step": 6926 }, { "epoch": 0.4478060606060606, "grad_norm": 0.06460586190223694, "learning_rate": 0.00019040820138615974, "loss": 0.0872, "step": 6927 }, { "epoch": 0.4478707070707071, "grad_norm": 0.06744648516178131, "learning_rate": 0.00019040527862989345, "loss": 0.0779, "step": 6928 }, { "epoch": 0.4478707070707071, "eval_bleu": 12.166007114577942, "eval_loss": 0.08967415243387222, "eval_runtime": 2.6121, "eval_samples_per_second": 12.251, "eval_steps_per_second": 1.531, "step": 6928 }, { "epoch": 0.4479353535353535, "grad_norm": 0.056869521737098694, "learning_rate": 0.00019040235545083158, "loss": 0.0804, "step": 6929 }, { "epoch": 0.448, "grad_norm": 0.054253365844488144, "learning_rate": 0.00019039943184898775, "loss": 0.0768, "step": 6930 }, { "epoch": 0.44806464646464644, "grad_norm": 0.05472457408905029, "learning_rate": 0.00019039650782437572, "loss": 0.0818, "step": 6931 }, { "epoch": 0.44812929292929293, "grad_norm": 0.06302040815353394, "learning_rate": 0.00019039358337700904, "loss": 0.0854, "step": 6932 }, { "epoch": 0.44819393939393937, 
"grad_norm": 0.05808340385556221, "learning_rate": 0.0001903906585069015, "loss": 0.0785, "step": 6933 }, { "epoch": 0.44825858585858586, "grad_norm": 0.05898988991975784, "learning_rate": 0.00019038773321406672, "loss": 0.0755, "step": 6934 }, { "epoch": 0.44832323232323235, "grad_norm": 0.05379583314061165, "learning_rate": 0.00019038480749851842, "loss": 0.0674, "step": 6935 }, { "epoch": 0.4483878787878788, "grad_norm": 0.07427021861076355, "learning_rate": 0.0001903818813602702, "loss": 0.1055, "step": 6936 }, { "epoch": 0.44845252525252527, "grad_norm": 0.07185864448547363, "learning_rate": 0.00019037895479933584, "loss": 0.1146, "step": 6937 }, { "epoch": 0.4485171717171717, "grad_norm": 0.059994541108608246, "learning_rate": 0.000190376027815729, "loss": 0.0935, "step": 6938 }, { "epoch": 0.4485818181818182, "grad_norm": 0.06498627364635468, "learning_rate": 0.00019037310040946332, "loss": 0.0879, "step": 6939 }, { "epoch": 0.4486464646464646, "grad_norm": 0.06212816759943962, "learning_rate": 0.00019037017258055253, "loss": 0.0925, "step": 6940 }, { "epoch": 0.4487111111111111, "grad_norm": 0.05700937658548355, "learning_rate": 0.00019036724432901035, "loss": 0.0853, "step": 6941 }, { "epoch": 0.44877575757575755, "grad_norm": 0.06111691892147064, "learning_rate": 0.00019036431565485043, "loss": 0.0816, "step": 6942 }, { "epoch": 0.44884040404040404, "grad_norm": 0.06395572423934937, "learning_rate": 0.00019036138655808646, "loss": 0.075, "step": 6943 }, { "epoch": 0.44890505050505053, "grad_norm": 0.05781414732336998, "learning_rate": 0.00019035845703873217, "loss": 0.088, "step": 6944 }, { "epoch": 0.44890505050505053, "eval_bleu": 15.747129552765117, "eval_loss": 0.0898839682340622, "eval_runtime": 2.7036, "eval_samples_per_second": 11.836, "eval_steps_per_second": 1.479, "step": 6944 }, { "epoch": 0.44896969696969696, "grad_norm": 0.05927276611328125, "learning_rate": 0.00019035552709680125, "loss": 0.0829, "step": 6945 }, { "epoch": 
0.44903434343434345, "grad_norm": 0.06316172331571579, "learning_rate": 0.0001903525967323074, "loss": 0.0892, "step": 6946 }, { "epoch": 0.4490989898989899, "grad_norm": 0.06607817858457565, "learning_rate": 0.00019034966594526432, "loss": 0.0882, "step": 6947 }, { "epoch": 0.4491636363636364, "grad_norm": 0.06973529607057571, "learning_rate": 0.00019034673473568571, "loss": 0.0935, "step": 6948 }, { "epoch": 0.4492282828282828, "grad_norm": 0.0567387156188488, "learning_rate": 0.00019034380310358533, "loss": 0.0829, "step": 6949 }, { "epoch": 0.4492929292929293, "grad_norm": 0.049556441605091095, "learning_rate": 0.0001903408710489768, "loss": 0.0696, "step": 6950 }, { "epoch": 0.44935757575757573, "grad_norm": 0.0637783631682396, "learning_rate": 0.00019033793857187392, "loss": 0.0842, "step": 6951 }, { "epoch": 0.4494222222222222, "grad_norm": 0.060015588998794556, "learning_rate": 0.00019033500567229036, "loss": 0.0893, "step": 6952 }, { "epoch": 0.4494868686868687, "grad_norm": 0.05716494470834732, "learning_rate": 0.00019033207235023985, "loss": 0.0814, "step": 6953 }, { "epoch": 0.44955151515151515, "grad_norm": 0.05346056446433067, "learning_rate": 0.00019032913860573606, "loss": 0.0853, "step": 6954 }, { "epoch": 0.44961616161616164, "grad_norm": 0.05587244778871536, "learning_rate": 0.0001903262044387928, "loss": 0.0762, "step": 6955 }, { "epoch": 0.44968080808080807, "grad_norm": 0.05591361224651337, "learning_rate": 0.0001903232698494237, "loss": 0.0696, "step": 6956 }, { "epoch": 0.44974545454545456, "grad_norm": 0.0613478384912014, "learning_rate": 0.00019032033483764256, "loss": 0.0893, "step": 6957 }, { "epoch": 0.449810101010101, "grad_norm": 0.05871087685227394, "learning_rate": 0.00019031739940346304, "loss": 0.0898, "step": 6958 }, { "epoch": 0.4498747474747475, "grad_norm": 0.06468261778354645, "learning_rate": 0.00019031446354689894, "loss": 0.1032, "step": 6959 }, { "epoch": 0.4499393939393939, "grad_norm": 0.05729063227772713, 
"learning_rate": 0.00019031152726796396, "loss": 0.0855, "step": 6960 }, { "epoch": 0.4499393939393939, "eval_bleu": 13.71280684847394, "eval_loss": 0.09172676503658295, "eval_runtime": 2.6742, "eval_samples_per_second": 11.966, "eval_steps_per_second": 1.496, "step": 6960 }, { "epoch": 0.4500040404040404, "grad_norm": 0.05106128752231598, "learning_rate": 0.00019030859056667177, "loss": 0.0825, "step": 6961 }, { "epoch": 0.4500686868686869, "grad_norm": 0.06182041019201279, "learning_rate": 0.0001903056534430362, "loss": 0.0946, "step": 6962 }, { "epoch": 0.45013333333333333, "grad_norm": 0.05469273030757904, "learning_rate": 0.00019030271589707092, "loss": 0.0833, "step": 6963 }, { "epoch": 0.4501979797979798, "grad_norm": 0.05048250034451485, "learning_rate": 0.0001902997779287897, "loss": 0.0717, "step": 6964 }, { "epoch": 0.45026262626262625, "grad_norm": 0.062171194702386856, "learning_rate": 0.00019029683953820627, "loss": 0.0926, "step": 6965 }, { "epoch": 0.45032727272727274, "grad_norm": 0.055695515125989914, "learning_rate": 0.00019029390072533437, "loss": 0.0769, "step": 6966 }, { "epoch": 0.4503919191919192, "grad_norm": 0.056805700063705444, "learning_rate": 0.00019029096149018776, "loss": 0.0796, "step": 6967 }, { "epoch": 0.45045656565656567, "grad_norm": 0.05857550725340843, "learning_rate": 0.00019028802183278015, "loss": 0.083, "step": 6968 }, { "epoch": 0.4505212121212121, "grad_norm": 0.05865366756916046, "learning_rate": 0.00019028508175312535, "loss": 0.0827, "step": 6969 }, { "epoch": 0.4505858585858586, "grad_norm": 0.057752661406993866, "learning_rate": 0.00019028214125123703, "loss": 0.0831, "step": 6970 }, { "epoch": 0.450650505050505, "grad_norm": 0.06165473908185959, "learning_rate": 0.00019027920032712898, "loss": 0.0939, "step": 6971 }, { "epoch": 0.4507151515151515, "grad_norm": 0.053947195410728455, "learning_rate": 0.00019027625898081498, "loss": 0.0739, "step": 6972 }, { "epoch": 0.450779797979798, "grad_norm": 
0.06585847586393356, "learning_rate": 0.00019027331721230875, "loss": 0.106, "step": 6973 }, { "epoch": 0.45084444444444444, "grad_norm": 0.05844097211956978, "learning_rate": 0.00019027037502162406, "loss": 0.086, "step": 6974 }, { "epoch": 0.4509090909090909, "grad_norm": 0.06291331350803375, "learning_rate": 0.00019026743240877468, "loss": 0.0858, "step": 6975 }, { "epoch": 0.45097373737373736, "grad_norm": 0.06178602948784828, "learning_rate": 0.00019026448937377435, "loss": 0.0981, "step": 6976 }, { "epoch": 0.45097373737373736, "eval_bleu": 11.674895882714745, "eval_loss": 0.09387855976819992, "eval_runtime": 2.7039, "eval_samples_per_second": 11.835, "eval_steps_per_second": 1.479, "step": 6976 }, { "epoch": 0.45103838383838385, "grad_norm": 0.07132205367088318, "learning_rate": 0.00019026154591663685, "loss": 0.0969, "step": 6977 }, { "epoch": 0.4511030303030303, "grad_norm": 0.06943094730377197, "learning_rate": 0.00019025860203737593, "loss": 0.1054, "step": 6978 }, { "epoch": 0.4511676767676768, "grad_norm": 0.06691502034664154, "learning_rate": 0.00019025565773600538, "loss": 0.0854, "step": 6979 }, { "epoch": 0.4512323232323232, "grad_norm": 0.054549023509025574, "learning_rate": 0.00019025271301253892, "loss": 0.0739, "step": 6980 }, { "epoch": 0.4512969696969697, "grad_norm": 0.05432969331741333, "learning_rate": 0.00019024976786699042, "loss": 0.0803, "step": 6981 }, { "epoch": 0.4513616161616162, "grad_norm": 0.06340986490249634, "learning_rate": 0.00019024682229937354, "loss": 0.0928, "step": 6982 }, { "epoch": 0.4514262626262626, "grad_norm": 0.06434701383113861, "learning_rate": 0.00019024387630970212, "loss": 0.0974, "step": 6983 }, { "epoch": 0.4514909090909091, "grad_norm": 0.06673652678728104, "learning_rate": 0.00019024092989798996, "loss": 0.1008, "step": 6984 }, { "epoch": 0.45155555555555554, "grad_norm": 0.0667102187871933, "learning_rate": 0.00019023798306425075, "loss": 0.0857, "step": 6985 }, { "epoch": 0.45162020202020203, 
"grad_norm": 0.06318831443786621, "learning_rate": 0.00019023503580849836, "loss": 0.0911, "step": 6986 }, { "epoch": 0.45168484848484847, "grad_norm": 0.06468236446380615, "learning_rate": 0.00019023208813074652, "loss": 0.1104, "step": 6987 }, { "epoch": 0.45174949494949496, "grad_norm": 0.05706823989748955, "learning_rate": 0.00019022914003100904, "loss": 0.0834, "step": 6988 }, { "epoch": 0.4518141414141414, "grad_norm": 0.0615336075425148, "learning_rate": 0.00019022619150929973, "loss": 0.0928, "step": 6989 }, { "epoch": 0.4518787878787879, "grad_norm": 0.06496910750865936, "learning_rate": 0.00019022324256563232, "loss": 0.104, "step": 6990 }, { "epoch": 0.45194343434343437, "grad_norm": 0.06736738234758377, "learning_rate": 0.00019022029320002064, "loss": 0.0829, "step": 6991 }, { "epoch": 0.4520080808080808, "grad_norm": 0.049840349704027176, "learning_rate": 0.00019021734341247845, "loss": 0.0676, "step": 6992 }, { "epoch": 0.4520080808080808, "eval_bleu": 15.552001861216146, "eval_loss": 0.09206150472164154, "eval_runtime": 2.6336, "eval_samples_per_second": 12.151, "eval_steps_per_second": 1.519, "step": 6992 }, { "epoch": 0.4520727272727273, "grad_norm": 0.05133712664246559, "learning_rate": 0.0001902143932030196, "loss": 0.0759, "step": 6993 }, { "epoch": 0.4521373737373737, "grad_norm": 0.06123444810509682, "learning_rate": 0.00019021144257165787, "loss": 0.0894, "step": 6994 }, { "epoch": 0.4522020202020202, "grad_norm": 0.057804688811302185, "learning_rate": 0.000190208491518407, "loss": 0.0783, "step": 6995 }, { "epoch": 0.45226666666666665, "grad_norm": 0.05769381299614906, "learning_rate": 0.00019020554004328086, "loss": 0.0861, "step": 6996 }, { "epoch": 0.45233131313131314, "grad_norm": 0.06167766824364662, "learning_rate": 0.00019020258814629324, "loss": 0.082, "step": 6997 }, { "epoch": 0.4523959595959596, "grad_norm": 0.05515992268919945, "learning_rate": 0.00019019963582745796, "loss": 0.0761, "step": 6998 }, { "epoch": 
0.45246060606060606, "grad_norm": 0.04994555935263634, "learning_rate": 0.00019019668308678872, "loss": 0.0705, "step": 6999 }, { "epoch": 0.45252525252525255, "grad_norm": 0.07197088748216629, "learning_rate": 0.00019019372992429946, "loss": 0.1021, "step": 7000 }, { "epoch": 0.452589898989899, "grad_norm": 0.05343087762594223, "learning_rate": 0.00019019077634000397, "loss": 0.0788, "step": 7001 }, { "epoch": 0.4526545454545455, "grad_norm": 0.05702870711684227, "learning_rate": 0.00019018782233391602, "loss": 0.0692, "step": 7002 }, { "epoch": 0.4527191919191919, "grad_norm": 0.05536028742790222, "learning_rate": 0.00019018486790604944, "loss": 0.0861, "step": 7003 }, { "epoch": 0.4527838383838384, "grad_norm": 0.05584926903247833, "learning_rate": 0.000190181913056418, "loss": 0.0791, "step": 7004 }, { "epoch": 0.45284848484848483, "grad_norm": 0.048610229045152664, "learning_rate": 0.00019017895778503563, "loss": 0.0731, "step": 7005 }, { "epoch": 0.4529131313131313, "grad_norm": 0.06430976092815399, "learning_rate": 0.00019017600209191607, "loss": 0.0861, "step": 7006 }, { "epoch": 0.45297777777777776, "grad_norm": 0.05096680670976639, "learning_rate": 0.00019017304597707314, "loss": 0.0769, "step": 7007 }, { "epoch": 0.45304242424242425, "grad_norm": 0.05749129876494408, "learning_rate": 0.0001901700894405207, "loss": 0.0817, "step": 7008 }, { "epoch": 0.45304242424242425, "eval_bleu": 14.884326739079775, "eval_loss": 0.09235093742609024, "eval_runtime": 2.7315, "eval_samples_per_second": 11.715, "eval_steps_per_second": 1.464, "step": 7008 }, { "epoch": 0.4531070707070707, "grad_norm": 0.0613827221095562, "learning_rate": 0.00019016713248227256, "loss": 0.0857, "step": 7009 }, { "epoch": 0.45317171717171717, "grad_norm": 0.05504656955599785, "learning_rate": 0.00019016417510234256, "loss": 0.0836, "step": 7010 }, { "epoch": 0.45323636363636366, "grad_norm": 0.05709686875343323, "learning_rate": 0.0001901612173007445, "loss": 0.0817, "step": 7011 }, { 
"epoch": 0.4533010101010101, "grad_norm": 0.05854745954275131, "learning_rate": 0.00019015825907749224, "loss": 0.0839, "step": 7012 }, { "epoch": 0.4533656565656566, "grad_norm": 0.06183277443051338, "learning_rate": 0.0001901553004325996, "loss": 0.0913, "step": 7013 }, { "epoch": 0.453430303030303, "grad_norm": 0.05211608484387398, "learning_rate": 0.00019015234136608044, "loss": 0.0799, "step": 7014 }, { "epoch": 0.4534949494949495, "grad_norm": 0.059425268322229385, "learning_rate": 0.0001901493818779486, "loss": 0.0799, "step": 7015 }, { "epoch": 0.45355959595959594, "grad_norm": 0.049835000187158585, "learning_rate": 0.00019014642196821786, "loss": 0.0692, "step": 7016 }, { "epoch": 0.45362424242424243, "grad_norm": 0.06984440237283707, "learning_rate": 0.00019014346163690214, "loss": 0.1174, "step": 7017 }, { "epoch": 0.45368888888888886, "grad_norm": 0.05113052949309349, "learning_rate": 0.00019014050088401522, "loss": 0.0665, "step": 7018 }, { "epoch": 0.45375353535353535, "grad_norm": 0.07299787551164627, "learning_rate": 0.000190137539709571, "loss": 0.1136, "step": 7019 }, { "epoch": 0.45381818181818184, "grad_norm": 0.06523466110229492, "learning_rate": 0.0001901345781135833, "loss": 0.0939, "step": 7020 }, { "epoch": 0.4538828282828283, "grad_norm": 0.05918470770120621, "learning_rate": 0.00019013161609606597, "loss": 0.0869, "step": 7021 }, { "epoch": 0.45394747474747477, "grad_norm": 0.06468678265810013, "learning_rate": 0.0001901286536570329, "loss": 0.0968, "step": 7022 }, { "epoch": 0.4540121212121212, "grad_norm": 0.0643845796585083, "learning_rate": 0.00019012569079649788, "loss": 0.0876, "step": 7023 }, { "epoch": 0.4540767676767677, "grad_norm": 0.07212929427623749, "learning_rate": 0.0001901227275144748, "loss": 0.0731, "step": 7024 }, { "epoch": 0.4540767676767677, "eval_bleu": 13.679854947377502, "eval_loss": 0.09103026986122131, "eval_runtime": 2.7166, "eval_samples_per_second": 11.78, "eval_steps_per_second": 1.472, "step": 7024 }, { 
"epoch": 0.4541414141414141, "grad_norm": 0.04967840388417244, "learning_rate": 0.00019011976381097753, "loss": 0.065, "step": 7025 }, { "epoch": 0.4542060606060606, "grad_norm": 0.05544466897845268, "learning_rate": 0.00019011679968601993, "loss": 0.0878, "step": 7026 }, { "epoch": 0.45427070707070705, "grad_norm": 0.05269740894436836, "learning_rate": 0.00019011383513961583, "loss": 0.0741, "step": 7027 }, { "epoch": 0.45433535353535354, "grad_norm": 0.06790318340063095, "learning_rate": 0.00019011087017177912, "loss": 0.1039, "step": 7028 }, { "epoch": 0.4544, "grad_norm": 0.06366728991270065, "learning_rate": 0.00019010790478252366, "loss": 0.0844, "step": 7029 }, { "epoch": 0.45446464646464646, "grad_norm": 0.055279649794101715, "learning_rate": 0.0001901049389718633, "loss": 0.0725, "step": 7030 }, { "epoch": 0.45452929292929295, "grad_norm": 0.06322003901004791, "learning_rate": 0.00019010197273981193, "loss": 0.0799, "step": 7031 }, { "epoch": 0.4545939393939394, "grad_norm": 0.056046321988105774, "learning_rate": 0.00019009900608638346, "loss": 0.0772, "step": 7032 }, { "epoch": 0.4546585858585859, "grad_norm": 0.04925842955708504, "learning_rate": 0.0001900960390115917, "loss": 0.0718, "step": 7033 }, { "epoch": 0.4547232323232323, "grad_norm": 0.06450553983449936, "learning_rate": 0.00019009307151545053, "loss": 0.0977, "step": 7034 }, { "epoch": 0.4547878787878788, "grad_norm": 0.05923449620604515, "learning_rate": 0.00019009010359797389, "loss": 0.0803, "step": 7035 }, { "epoch": 0.45485252525252523, "grad_norm": 0.061388786882162094, "learning_rate": 0.00019008713525917558, "loss": 0.0909, "step": 7036 }, { "epoch": 0.4549171717171717, "grad_norm": 0.06918361037969589, "learning_rate": 0.00019008416649906955, "loss": 0.0952, "step": 7037 }, { "epoch": 0.4549818181818182, "grad_norm": 0.06034991145133972, "learning_rate": 0.00019008119731766962, "loss": 0.0989, "step": 7038 }, { "epoch": 0.45504646464646464, "grad_norm": 0.06343725323677063, 
"learning_rate": 0.00019007822771498976, "loss": 0.0789, "step": 7039 }, { "epoch": 0.45511111111111113, "grad_norm": 0.06637958437204361, "learning_rate": 0.00019007525769104377, "loss": 0.0895, "step": 7040 }, { "epoch": 0.45511111111111113, "eval_bleu": 16.23170720534324, "eval_loss": 0.09164945781230927, "eval_runtime": 2.6549, "eval_samples_per_second": 12.053, "eval_steps_per_second": 1.507, "step": 7040 }, { "epoch": 0.45517575757575757, "grad_norm": 0.06908579170703888, "learning_rate": 0.00019007228724584558, "loss": 0.0999, "step": 7041 }, { "epoch": 0.45524040404040406, "grad_norm": 0.05528010055422783, "learning_rate": 0.0001900693163794091, "loss": 0.0861, "step": 7042 }, { "epoch": 0.4553050505050505, "grad_norm": 0.051388904452323914, "learning_rate": 0.0001900663450917482, "loss": 0.0733, "step": 7043 }, { "epoch": 0.455369696969697, "grad_norm": 0.0601758174598217, "learning_rate": 0.00019006337338287675, "loss": 0.0851, "step": 7044 }, { "epoch": 0.4554343434343434, "grad_norm": 0.054402489215135574, "learning_rate": 0.0001900604012528087, "loss": 0.0776, "step": 7045 }, { "epoch": 0.4554989898989899, "grad_norm": 0.06897033005952835, "learning_rate": 0.00019005742870155796, "loss": 0.1007, "step": 7046 }, { "epoch": 0.45556363636363634, "grad_norm": 0.06823015213012695, "learning_rate": 0.00019005445572913837, "loss": 0.0991, "step": 7047 }, { "epoch": 0.4556282828282828, "grad_norm": 0.05425736680626869, "learning_rate": 0.00019005148233556382, "loss": 0.0757, "step": 7048 }, { "epoch": 0.4556929292929293, "grad_norm": 0.08169886469841003, "learning_rate": 0.00019004850852084832, "loss": 0.0975, "step": 7049 }, { "epoch": 0.45575757575757575, "grad_norm": 0.054632291197776794, "learning_rate": 0.00019004553428500568, "loss": 0.0766, "step": 7050 }, { "epoch": 0.45582222222222224, "grad_norm": 0.06032199412584305, "learning_rate": 0.00019004255962804987, "loss": 0.0782, "step": 7051 }, { "epoch": 0.4558868686868687, "grad_norm": 
0.0535498708486557, "learning_rate": 0.00019003958454999475, "loss": 0.0796, "step": 7052 }, { "epoch": 0.45595151515151516, "grad_norm": 0.0654628649353981, "learning_rate": 0.00019003660905085427, "loss": 0.1092, "step": 7053 }, { "epoch": 0.4560161616161616, "grad_norm": 0.046066805720329285, "learning_rate": 0.00019003363313064233, "loss": 0.067, "step": 7054 }, { "epoch": 0.4560808080808081, "grad_norm": 0.060095179826021194, "learning_rate": 0.00019003065678937286, "loss": 0.0889, "step": 7055 }, { "epoch": 0.4561454545454545, "grad_norm": 0.056389667093753815, "learning_rate": 0.00019002768002705978, "loss": 0.0822, "step": 7056 }, { "epoch": 0.4561454545454545, "eval_bleu": 17.027442776628252, "eval_loss": 0.09086190164089203, "eval_runtime": 2.6091, "eval_samples_per_second": 12.265, "eval_steps_per_second": 1.533, "step": 7056 }, { "epoch": 0.456210101010101, "grad_norm": 0.0600268691778183, "learning_rate": 0.000190024702843717, "loss": 0.0868, "step": 7057 }, { "epoch": 0.4562747474747475, "grad_norm": 0.06409511715173721, "learning_rate": 0.0001900217252393584, "loss": 0.0964, "step": 7058 }, { "epoch": 0.45633939393939393, "grad_norm": 0.06728044152259827, "learning_rate": 0.000190018747213998, "loss": 0.0977, "step": 7059 }, { "epoch": 0.4564040404040404, "grad_norm": 0.061861783266067505, "learning_rate": 0.0001900157687676497, "loss": 0.0846, "step": 7060 }, { "epoch": 0.45646868686868686, "grad_norm": 0.05850927531719208, "learning_rate": 0.00019001278990032737, "loss": 0.0902, "step": 7061 }, { "epoch": 0.45653333333333335, "grad_norm": 0.053094733506441116, "learning_rate": 0.00019000981061204498, "loss": 0.0761, "step": 7062 }, { "epoch": 0.4565979797979798, "grad_norm": 0.06212780252099037, "learning_rate": 0.00019000683090281646, "loss": 0.0741, "step": 7063 }, { "epoch": 0.45666262626262627, "grad_norm": 0.06638514250516891, "learning_rate": 0.00019000385077265578, "loss": 0.1026, "step": 7064 }, { "epoch": 0.4567272727272727, "grad_norm": 
0.05653370916843414, "learning_rate": 0.0001900008702215768, "loss": 0.0858, "step": 7065 }, { "epoch": 0.4567919191919192, "grad_norm": 0.06549810618162155, "learning_rate": 0.00018999788924959352, "loss": 0.0816, "step": 7066 }, { "epoch": 0.4568565656565657, "grad_norm": 0.05748653784394264, "learning_rate": 0.00018999490785671987, "loss": 0.08, "step": 7067 }, { "epoch": 0.4569212121212121, "grad_norm": 0.0654853954911232, "learning_rate": 0.0001899919260429698, "loss": 0.1061, "step": 7068 }, { "epoch": 0.4569858585858586, "grad_norm": 0.05960091948509216, "learning_rate": 0.00018998894380835722, "loss": 0.0806, "step": 7069 }, { "epoch": 0.45705050505050504, "grad_norm": 0.06203774735331535, "learning_rate": 0.00018998596115289611, "loss": 0.0746, "step": 7070 }, { "epoch": 0.45711515151515153, "grad_norm": 0.05697517469525337, "learning_rate": 0.0001899829780766004, "loss": 0.074, "step": 7071 }, { "epoch": 0.45717979797979796, "grad_norm": 0.07434123754501343, "learning_rate": 0.00018997999457948404, "loss": 0.1209, "step": 7072 }, { "epoch": 0.45717979797979796, "eval_bleu": 18.978925521756793, "eval_loss": 0.0910872295498848, "eval_runtime": 2.8294, "eval_samples_per_second": 11.31, "eval_steps_per_second": 1.414, "step": 7072 }, { "epoch": 0.45724444444444445, "grad_norm": 0.07041558623313904, "learning_rate": 0.00018997701066156104, "loss": 0.0965, "step": 7073 }, { "epoch": 0.4573090909090909, "grad_norm": 0.05974428355693817, "learning_rate": 0.00018997402632284528, "loss": 0.0858, "step": 7074 }, { "epoch": 0.4573737373737374, "grad_norm": 0.049557287245988846, "learning_rate": 0.00018997104156335073, "loss": 0.067, "step": 7075 }, { "epoch": 0.4574383838383838, "grad_norm": 0.06373658031225204, "learning_rate": 0.00018996805638309136, "loss": 0.0842, "step": 7076 }, { "epoch": 0.4575030303030303, "grad_norm": 0.06128661334514618, "learning_rate": 0.00018996507078208118, "loss": 0.0914, "step": 7077 }, { "epoch": 0.4575676767676768, "grad_norm": 
0.059732504189014435, "learning_rate": 0.00018996208476033407, "loss": 0.0882, "step": 7078 }, { "epoch": 0.4576323232323232, "grad_norm": 0.05144989490509033, "learning_rate": 0.000189959098317864, "loss": 0.0799, "step": 7079 }, { "epoch": 0.4576969696969697, "grad_norm": 0.04887744411826134, "learning_rate": 0.00018995611145468502, "loss": 0.0683, "step": 7080 }, { "epoch": 0.45776161616161615, "grad_norm": 0.05941283702850342, "learning_rate": 0.000189953124170811, "loss": 0.0969, "step": 7081 }, { "epoch": 0.45782626262626264, "grad_norm": 0.06427998840808868, "learning_rate": 0.000189950136466256, "loss": 0.0845, "step": 7082 }, { "epoch": 0.45789090909090907, "grad_norm": 0.06219245493412018, "learning_rate": 0.00018994714834103392, "loss": 0.1013, "step": 7083 }, { "epoch": 0.45795555555555556, "grad_norm": 0.07142374664545059, "learning_rate": 0.0001899441597951588, "loss": 0.1042, "step": 7084 }, { "epoch": 0.458020202020202, "grad_norm": 0.0584946908056736, "learning_rate": 0.00018994117082864453, "loss": 0.0949, "step": 7085 }, { "epoch": 0.4580848484848485, "grad_norm": 0.06162520870566368, "learning_rate": 0.00018993818144150514, "loss": 0.0864, "step": 7086 }, { "epoch": 0.458149494949495, "grad_norm": 0.05273338034749031, "learning_rate": 0.00018993519163375464, "loss": 0.0689, "step": 7087 }, { "epoch": 0.4582141414141414, "grad_norm": 0.06692434847354889, "learning_rate": 0.00018993220140540697, "loss": 0.0776, "step": 7088 }, { "epoch": 0.4582141414141414, "eval_bleu": 18.73353043235895, "eval_loss": 0.0919497087597847, "eval_runtime": 2.6537, "eval_samples_per_second": 12.059, "eval_steps_per_second": 1.507, "step": 7088 }, { "epoch": 0.4582787878787879, "grad_norm": 0.06215566769242287, "learning_rate": 0.0001899292107564761, "loss": 0.0922, "step": 7089 }, { "epoch": 0.45834343434343433, "grad_norm": 0.061910104006528854, "learning_rate": 0.0001899262196869761, "loss": 0.0928, "step": 7090 }, { "epoch": 0.4584080808080808, "grad_norm": 
0.060293253511190414, "learning_rate": 0.00018992322819692085, "loss": 0.0999, "step": 7091 }, { "epoch": 0.45847272727272725, "grad_norm": 0.06498220562934875, "learning_rate": 0.00018992023628632443, "loss": 0.0884, "step": 7092 }, { "epoch": 0.45853737373737374, "grad_norm": 0.05547791346907616, "learning_rate": 0.00018991724395520074, "loss": 0.0849, "step": 7093 }, { "epoch": 0.4586020202020202, "grad_norm": 0.06146413832902908, "learning_rate": 0.00018991425120356388, "loss": 0.0922, "step": 7094 }, { "epoch": 0.45866666666666667, "grad_norm": 0.06552723050117493, "learning_rate": 0.00018991125803142776, "loss": 0.0948, "step": 7095 }, { "epoch": 0.45873131313131316, "grad_norm": 0.08245430886745453, "learning_rate": 0.0001899082644388064, "loss": 0.0978, "step": 7096 }, { "epoch": 0.4587959595959596, "grad_norm": 0.05251757800579071, "learning_rate": 0.0001899052704257138, "loss": 0.0769, "step": 7097 }, { "epoch": 0.4588606060606061, "grad_norm": 0.0560099296271801, "learning_rate": 0.00018990227599216403, "loss": 0.0815, "step": 7098 }, { "epoch": 0.4589252525252525, "grad_norm": 0.06328769028186798, "learning_rate": 0.000189899281138171, "loss": 0.0792, "step": 7099 }, { "epoch": 0.458989898989899, "grad_norm": 0.055962298065423965, "learning_rate": 0.00018989628586374874, "loss": 0.0763, "step": 7100 }, { "epoch": 0.45905454545454544, "grad_norm": 0.06920085847377777, "learning_rate": 0.0001898932901689113, "loss": 0.1057, "step": 7101 }, { "epoch": 0.4591191919191919, "grad_norm": 0.055156875401735306, "learning_rate": 0.00018989029405367265, "loss": 0.083, "step": 7102 }, { "epoch": 0.45918383838383836, "grad_norm": 0.056849196553230286, "learning_rate": 0.0001898872975180468, "loss": 0.0808, "step": 7103 }, { "epoch": 0.45924848484848485, "grad_norm": 0.04974493384361267, "learning_rate": 0.00018988430056204777, "loss": 0.0733, "step": 7104 }, { "epoch": 0.45924848484848485, "eval_bleu": 17.737047164003197, "eval_loss": 0.09086254984140396, 
"eval_runtime": 2.7229, "eval_samples_per_second": 11.752, "eval_steps_per_second": 1.469, "step": 7104 }, { "epoch": 0.45931313131313134, "grad_norm": 0.060705527663230896, "learning_rate": 0.00018988130318568961, "loss": 0.0921, "step": 7105 }, { "epoch": 0.4593777777777778, "grad_norm": 0.06887055188417435, "learning_rate": 0.0001898783053889863, "loss": 0.0858, "step": 7106 }, { "epoch": 0.45944242424242426, "grad_norm": 0.05495855212211609, "learning_rate": 0.00018987530717195185, "loss": 0.0826, "step": 7107 }, { "epoch": 0.4595070707070707, "grad_norm": 0.06102900579571724, "learning_rate": 0.00018987230853460032, "loss": 0.0932, "step": 7108 }, { "epoch": 0.4595717171717172, "grad_norm": 0.05802123248577118, "learning_rate": 0.00018986930947694573, "loss": 0.088, "step": 7109 }, { "epoch": 0.4596363636363636, "grad_norm": 0.06613217294216156, "learning_rate": 0.00018986630999900207, "loss": 0.1063, "step": 7110 }, { "epoch": 0.4597010101010101, "grad_norm": 0.0697341337800026, "learning_rate": 0.00018986331010078337, "loss": 0.1058, "step": 7111 }, { "epoch": 0.45976565656565654, "grad_norm": 0.07848163694143295, "learning_rate": 0.00018986030978230368, "loss": 0.0757, "step": 7112 }, { "epoch": 0.45983030303030303, "grad_norm": 0.07183175534009933, "learning_rate": 0.00018985730904357706, "loss": 0.083, "step": 7113 }, { "epoch": 0.45989494949494947, "grad_norm": 0.0584479495882988, "learning_rate": 0.0001898543078846175, "loss": 0.0899, "step": 7114 }, { "epoch": 0.45995959595959596, "grad_norm": 0.07697788625955582, "learning_rate": 0.00018985130630543904, "loss": 0.0943, "step": 7115 }, { "epoch": 0.46002424242424245, "grad_norm": 0.05978691205382347, "learning_rate": 0.00018984830430605573, "loss": 0.0884, "step": 7116 }, { "epoch": 0.4600888888888889, "grad_norm": 0.05339720472693443, "learning_rate": 0.0001898453018864816, "loss": 0.0796, "step": 7117 }, { "epoch": 0.46015353535353537, "grad_norm": 0.06407467275857925, "learning_rate": 
0.00018984229904673067, "loss": 0.092, "step": 7118 }, { "epoch": 0.4602181818181818, "grad_norm": 0.06968402117490768, "learning_rate": 0.00018983929578681705, "loss": 0.1093, "step": 7119 }, { "epoch": 0.4602828282828283, "grad_norm": 0.06314373761415482, "learning_rate": 0.00018983629210675472, "loss": 0.0923, "step": 7120 }, { "epoch": 0.4602828282828283, "eval_bleu": 18.348304070501545, "eval_loss": 0.09071508049964905, "eval_runtime": 2.8199, "eval_samples_per_second": 11.348, "eval_steps_per_second": 1.418, "step": 7120 }, { "epoch": 0.4603474747474747, "grad_norm": 0.04845608398318291, "learning_rate": 0.00018983328800655774, "loss": 0.0693, "step": 7121 }, { "epoch": 0.4604121212121212, "grad_norm": 0.058756329119205475, "learning_rate": 0.00018983028348624022, "loss": 0.0806, "step": 7122 }, { "epoch": 0.46047676767676765, "grad_norm": 0.05418854579329491, "learning_rate": 0.00018982727854581613, "loss": 0.0721, "step": 7123 }, { "epoch": 0.46054141414141414, "grad_norm": 0.058667294681072235, "learning_rate": 0.00018982427318529953, "loss": 0.0834, "step": 7124 }, { "epoch": 0.46060606060606063, "grad_norm": 0.06293337047100067, "learning_rate": 0.00018982126740470454, "loss": 0.0805, "step": 7125 }, { "epoch": 0.46067070707070706, "grad_norm": 0.05028688162565231, "learning_rate": 0.00018981826120404515, "loss": 0.0742, "step": 7126 }, { "epoch": 0.46073535353535355, "grad_norm": 0.05654554069042206, "learning_rate": 0.00018981525458333548, "loss": 0.0764, "step": 7127 }, { "epoch": 0.4608, "grad_norm": 0.0616355836391449, "learning_rate": 0.0001898122475425895, "loss": 0.0939, "step": 7128 }, { "epoch": 0.4608646464646465, "grad_norm": 0.06585165113210678, "learning_rate": 0.00018980924008182138, "loss": 0.1185, "step": 7129 }, { "epoch": 0.4609292929292929, "grad_norm": 0.057439349591732025, "learning_rate": 0.0001898062322010451, "loss": 0.0673, "step": 7130 }, { "epoch": 0.4609939393939394, "grad_norm": 0.057104259729385376, "learning_rate": 
0.00018980322390027478, "loss": 0.0821, "step": 7131 }, { "epoch": 0.46105858585858583, "grad_norm": 0.06734573096036911, "learning_rate": 0.00018980021517952449, "loss": 0.1039, "step": 7132 }, { "epoch": 0.4611232323232323, "grad_norm": 0.05706849694252014, "learning_rate": 0.00018979720603880823, "loss": 0.0817, "step": 7133 }, { "epoch": 0.4611878787878788, "grad_norm": 0.054645176976919174, "learning_rate": 0.00018979419647814017, "loss": 0.085, "step": 7134 }, { "epoch": 0.46125252525252525, "grad_norm": 0.060767777264118195, "learning_rate": 0.00018979118649753432, "loss": 0.0883, "step": 7135 }, { "epoch": 0.46131717171717174, "grad_norm": 0.0656442642211914, "learning_rate": 0.0001897881760970048, "loss": 0.0953, "step": 7136 }, { "epoch": 0.46131717171717174, "eval_bleu": 19.62827446662249, "eval_loss": 0.09121295809745789, "eval_runtime": 2.6708, "eval_samples_per_second": 11.981, "eval_steps_per_second": 1.498, "step": 7136 }, { "epoch": 0.46138181818181817, "grad_norm": 0.05123092606663704, "learning_rate": 0.0001897851652765656, "loss": 0.0659, "step": 7137 }, { "epoch": 0.46144646464646466, "grad_norm": 0.05000960826873779, "learning_rate": 0.00018978215403623092, "loss": 0.0781, "step": 7138 }, { "epoch": 0.4615111111111111, "grad_norm": 0.05246249958872795, "learning_rate": 0.0001897791423760148, "loss": 0.0681, "step": 7139 }, { "epoch": 0.4615757575757576, "grad_norm": 0.0598507821559906, "learning_rate": 0.00018977613029593125, "loss": 0.0905, "step": 7140 }, { "epoch": 0.461640404040404, "grad_norm": 0.06788454204797745, "learning_rate": 0.00018977311779599444, "loss": 0.1073, "step": 7141 }, { "epoch": 0.4617050505050505, "grad_norm": 0.06614597141742706, "learning_rate": 0.00018977010487621846, "loss": 0.1075, "step": 7142 }, { "epoch": 0.461769696969697, "grad_norm": 0.05953095480799675, "learning_rate": 0.00018976709153661737, "loss": 0.0716, "step": 7143 }, { "epoch": 0.46183434343434343, "grad_norm": 0.061394065618515015, "learning_rate": 
0.00018976407777720526, "loss": 0.0928, "step": 7144 }, { "epoch": 0.4618989898989899, "grad_norm": 0.05740918964147568, "learning_rate": 0.00018976106359799624, "loss": 0.0786, "step": 7145 }, { "epoch": 0.46196363636363635, "grad_norm": 0.06475084275007248, "learning_rate": 0.0001897580489990044, "loss": 0.0956, "step": 7146 }, { "epoch": 0.46202828282828284, "grad_norm": 0.053041521459817886, "learning_rate": 0.00018975503398024383, "loss": 0.0772, "step": 7147 }, { "epoch": 0.4620929292929293, "grad_norm": 0.05908234044909477, "learning_rate": 0.00018975201854172865, "loss": 0.0857, "step": 7148 }, { "epoch": 0.46215757575757577, "grad_norm": 0.059436433017253876, "learning_rate": 0.00018974900268347295, "loss": 0.0858, "step": 7149 }, { "epoch": 0.4622222222222222, "grad_norm": 0.0596509650349617, "learning_rate": 0.00018974598640549082, "loss": 0.0852, "step": 7150 }, { "epoch": 0.4622868686868687, "grad_norm": 0.06068241968750954, "learning_rate": 0.0001897429697077964, "loss": 0.0914, "step": 7151 }, { "epoch": 0.4623515151515151, "grad_norm": 0.05818972736597061, "learning_rate": 0.00018973995259040377, "loss": 0.0862, "step": 7152 }, { "epoch": 0.4623515151515151, "eval_bleu": 17.64428627846574, "eval_loss": 0.09237204492092133, "eval_runtime": 2.6539, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 7152 }, { "epoch": 0.4624161616161616, "grad_norm": 0.05276405066251755, "learning_rate": 0.00018973693505332703, "loss": 0.081, "step": 7153 }, { "epoch": 0.4624808080808081, "grad_norm": 0.05746103823184967, "learning_rate": 0.00018973391709658035, "loss": 0.0946, "step": 7154 }, { "epoch": 0.46254545454545454, "grad_norm": 0.056544654071331024, "learning_rate": 0.0001897308987201778, "loss": 0.0942, "step": 7155 }, { "epoch": 0.462610101010101, "grad_norm": 0.05136793479323387, "learning_rate": 0.00018972787992413344, "loss": 0.077, "step": 7156 }, { "epoch": 0.46267474747474746, "grad_norm": 0.05730273947119713, "learning_rate": 
0.00018972486070846154, "loss": 0.0938, "step": 7157 }, { "epoch": 0.46273939393939395, "grad_norm": 0.07124999165534973, "learning_rate": 0.00018972184107317607, "loss": 0.1029, "step": 7158 }, { "epoch": 0.4628040404040404, "grad_norm": 0.06343958526849747, "learning_rate": 0.00018971882101829124, "loss": 0.0999, "step": 7159 }, { "epoch": 0.4628686868686869, "grad_norm": 0.05918755382299423, "learning_rate": 0.0001897158005438211, "loss": 0.085, "step": 7160 }, { "epoch": 0.4629333333333333, "grad_norm": 0.07142215222120285, "learning_rate": 0.00018971277964977988, "loss": 0.0839, "step": 7161 }, { "epoch": 0.4629979797979798, "grad_norm": 0.0652376338839531, "learning_rate": 0.00018970975833618162, "loss": 0.0943, "step": 7162 }, { "epoch": 0.4630626262626263, "grad_norm": 0.059475578367710114, "learning_rate": 0.00018970673660304046, "loss": 0.0901, "step": 7163 }, { "epoch": 0.4631272727272727, "grad_norm": 0.04820244759321213, "learning_rate": 0.00018970371445037055, "loss": 0.0636, "step": 7164 }, { "epoch": 0.4631919191919192, "grad_norm": 0.059644605964422226, "learning_rate": 0.00018970069187818607, "loss": 0.088, "step": 7165 }, { "epoch": 0.46325656565656564, "grad_norm": 0.06114288046956062, "learning_rate": 0.00018969766888650105, "loss": 0.0875, "step": 7166 }, { "epoch": 0.46332121212121213, "grad_norm": 0.052497223019599915, "learning_rate": 0.00018969464547532972, "loss": 0.0797, "step": 7167 }, { "epoch": 0.46338585858585857, "grad_norm": 0.05949230119585991, "learning_rate": 0.0001896916216446862, "loss": 0.0797, "step": 7168 }, { "epoch": 0.46338585858585857, "eval_bleu": 17.541563009530666, "eval_loss": 0.09283225983381271, "eval_runtime": 2.7192, "eval_samples_per_second": 11.768, "eval_steps_per_second": 1.471, "step": 7168 }, { "epoch": 0.46345050505050506, "grad_norm": 0.11092780530452728, "learning_rate": 0.00018968859739458457, "loss": 0.1133, "step": 7169 }, { "epoch": 0.4635151515151515, "grad_norm": 0.07470229268074036, 
"learning_rate": 0.00018968557272503906, "loss": 0.0752, "step": 7170 }, { "epoch": 0.463579797979798, "grad_norm": 0.05738930031657219, "learning_rate": 0.00018968254763606374, "loss": 0.079, "step": 7171 }, { "epoch": 0.46364444444444447, "grad_norm": 0.05421553552150726, "learning_rate": 0.00018967952212767283, "loss": 0.0758, "step": 7172 }, { "epoch": 0.4637090909090909, "grad_norm": 0.06704524904489517, "learning_rate": 0.00018967649619988045, "loss": 0.0988, "step": 7173 }, { "epoch": 0.4637737373737374, "grad_norm": 0.04699809476733208, "learning_rate": 0.0001896734698527007, "loss": 0.0598, "step": 7174 }, { "epoch": 0.4638383838383838, "grad_norm": 0.06557723879814148, "learning_rate": 0.0001896704430861478, "loss": 0.0918, "step": 7175 }, { "epoch": 0.4639030303030303, "grad_norm": 0.05910750478506088, "learning_rate": 0.00018966741590023588, "loss": 0.0885, "step": 7176 }, { "epoch": 0.46396767676767675, "grad_norm": 0.05677521228790283, "learning_rate": 0.0001896643882949791, "loss": 0.0786, "step": 7177 }, { "epoch": 0.46403232323232324, "grad_norm": 0.06263885647058487, "learning_rate": 0.0001896613602703916, "loss": 0.0916, "step": 7178 }, { "epoch": 0.4640969696969697, "grad_norm": 0.05663053318858147, "learning_rate": 0.0001896583318264876, "loss": 0.0918, "step": 7179 }, { "epoch": 0.46416161616161616, "grad_norm": 0.058634400367736816, "learning_rate": 0.0001896553029632812, "loss": 0.078, "step": 7180 }, { "epoch": 0.46422626262626265, "grad_norm": 0.05469169095158577, "learning_rate": 0.0001896522736807866, "loss": 0.0682, "step": 7181 }, { "epoch": 0.4642909090909091, "grad_norm": 0.05631436035037041, "learning_rate": 0.00018964924397901794, "loss": 0.0711, "step": 7182 }, { "epoch": 0.4643555555555556, "grad_norm": 0.05524951219558716, "learning_rate": 0.0001896462138579894, "loss": 0.0745, "step": 7183 }, { "epoch": 0.464420202020202, "grad_norm": 0.06386777013540268, "learning_rate": 0.00018964318331771515, "loss": 0.0935, "step": 7184 }, 
{ "epoch": 0.464420202020202, "eval_bleu": 15.722805456484364, "eval_loss": 0.09271927177906036, "eval_runtime": 2.6585, "eval_samples_per_second": 12.037, "eval_steps_per_second": 1.505, "step": 7184 }, { "epoch": 0.4644848484848485, "grad_norm": 0.06449155509471893, "learning_rate": 0.0001896401523582094, "loss": 0.0735, "step": 7185 }, { "epoch": 0.46454949494949493, "grad_norm": 0.07660414278507233, "learning_rate": 0.00018963712097948624, "loss": 0.1147, "step": 7186 }, { "epoch": 0.4646141414141414, "grad_norm": 0.057258304208517075, "learning_rate": 0.00018963408918155994, "loss": 0.0762, "step": 7187 }, { "epoch": 0.46467878787878786, "grad_norm": 0.05825633928179741, "learning_rate": 0.0001896310569644446, "loss": 0.0818, "step": 7188 }, { "epoch": 0.46474343434343435, "grad_norm": 0.057260941714048386, "learning_rate": 0.00018962802432815446, "loss": 0.0805, "step": 7189 }, { "epoch": 0.4648080808080808, "grad_norm": 0.05526965111494064, "learning_rate": 0.0001896249912727037, "loss": 0.0746, "step": 7190 }, { "epoch": 0.46487272727272727, "grad_norm": 0.05526936799287796, "learning_rate": 0.00018962195779810649, "loss": 0.08, "step": 7191 }, { "epoch": 0.46493737373737376, "grad_norm": 0.07603998482227325, "learning_rate": 0.00018961892390437698, "loss": 0.0845, "step": 7192 }, { "epoch": 0.4650020202020202, "grad_norm": 0.07031303644180298, "learning_rate": 0.0001896158895915294, "loss": 0.0861, "step": 7193 }, { "epoch": 0.4650666666666667, "grad_norm": 0.06617601960897446, "learning_rate": 0.00018961285485957793, "loss": 0.1023, "step": 7194 }, { "epoch": 0.4651313131313131, "grad_norm": 0.06603274494409561, "learning_rate": 0.00018960981970853676, "loss": 0.101, "step": 7195 }, { "epoch": 0.4651959595959596, "grad_norm": 0.07425568997859955, "learning_rate": 0.0001896067841384201, "loss": 0.12, "step": 7196 }, { "epoch": 0.46526060606060604, "grad_norm": 0.051168788224458694, "learning_rate": 0.00018960374814924212, "loss": 0.0731, "step": 7197 }, { 
"epoch": 0.46532525252525253, "grad_norm": 0.06740325689315796, "learning_rate": 0.00018960071174101706, "loss": 0.0938, "step": 7198 }, { "epoch": 0.46538989898989896, "grad_norm": 0.047168973833322525, "learning_rate": 0.00018959767491375906, "loss": 0.0662, "step": 7199 }, { "epoch": 0.46545454545454545, "grad_norm": 0.0553610734641552, "learning_rate": 0.0001895946376674824, "loss": 0.0798, "step": 7200 }, { "epoch": 0.46545454545454545, "eval_bleu": 18.721719641718813, "eval_loss": 0.09209852665662766, "eval_runtime": 2.7068, "eval_samples_per_second": 11.822, "eval_steps_per_second": 1.478, "step": 7200 }, { "epoch": 0.46551919191919194, "grad_norm": 0.05687474459409714, "learning_rate": 0.00018959160000220116, "loss": 0.082, "step": 7201 }, { "epoch": 0.4655838383838384, "grad_norm": 0.057504694908857346, "learning_rate": 0.0001895885619179297, "loss": 0.085, "step": 7202 }, { "epoch": 0.46564848484848487, "grad_norm": 0.07238850742578506, "learning_rate": 0.00018958552341468213, "loss": 0.0874, "step": 7203 }, { "epoch": 0.4657131313131313, "grad_norm": 0.05455256253480911, "learning_rate": 0.00018958248449247263, "loss": 0.0762, "step": 7204 }, { "epoch": 0.4657777777777778, "grad_norm": 0.057944025844335556, "learning_rate": 0.00018957944515131554, "loss": 0.0883, "step": 7205 }, { "epoch": 0.4658424242424242, "grad_norm": 0.05675322189927101, "learning_rate": 0.00018957640539122496, "loss": 0.0829, "step": 7206 }, { "epoch": 0.4659070707070707, "grad_norm": 0.05802517011761665, "learning_rate": 0.00018957336521221516, "loss": 0.0906, "step": 7207 }, { "epoch": 0.46597171717171715, "grad_norm": 0.05890211835503578, "learning_rate": 0.00018957032461430035, "loss": 0.0828, "step": 7208 }, { "epoch": 0.46603636363636364, "grad_norm": 0.062217362225055695, "learning_rate": 0.00018956728359749474, "loss": 0.0824, "step": 7209 }, { "epoch": 0.4661010101010101, "grad_norm": 0.050164297223091125, "learning_rate": 0.00018956424216181255, "loss": 0.0809, "step": 
7210 }, { "epoch": 0.46616565656565656, "grad_norm": 0.07047893106937408, "learning_rate": 0.000189561200307268, "loss": 0.0805, "step": 7211 }, { "epoch": 0.46623030303030305, "grad_norm": 0.06355404108762741, "learning_rate": 0.0001895581580338753, "loss": 0.1087, "step": 7212 }, { "epoch": 0.4662949494949495, "grad_norm": 0.055491141974925995, "learning_rate": 0.00018955511534164877, "loss": 0.0944, "step": 7213 }, { "epoch": 0.466359595959596, "grad_norm": 0.05675783008337021, "learning_rate": 0.00018955207223060253, "loss": 0.0836, "step": 7214 }, { "epoch": 0.4664242424242424, "grad_norm": 0.057222358882427216, "learning_rate": 0.00018954902870075085, "loss": 0.0858, "step": 7215 }, { "epoch": 0.4664888888888889, "grad_norm": 0.06837775558233261, "learning_rate": 0.00018954598475210798, "loss": 0.0992, "step": 7216 }, { "epoch": 0.4664888888888889, "eval_bleu": 18.28352450602476, "eval_loss": 0.09250275045633316, "eval_runtime": 2.9514, "eval_samples_per_second": 10.842, "eval_steps_per_second": 1.355, "step": 7216 }, { "epoch": 0.46655353535353533, "grad_norm": 0.05426108092069626, "learning_rate": 0.00018954294038468815, "loss": 0.0944, "step": 7217 }, { "epoch": 0.4666181818181818, "grad_norm": 0.0553646981716156, "learning_rate": 0.00018953989559850558, "loss": 0.0655, "step": 7218 }, { "epoch": 0.4666828282828283, "grad_norm": 0.053447991609573364, "learning_rate": 0.0001895368503935745, "loss": 0.0799, "step": 7219 }, { "epoch": 0.46674747474747474, "grad_norm": 0.05610032379627228, "learning_rate": 0.00018953380476990922, "loss": 0.0912, "step": 7220 }, { "epoch": 0.46681212121212123, "grad_norm": 0.054938219487667084, "learning_rate": 0.0001895307587275239, "loss": 0.0928, "step": 7221 }, { "epoch": 0.46687676767676767, "grad_norm": 0.08759074658155441, "learning_rate": 0.0001895277122664328, "loss": 0.0911, "step": 7222 }, { "epoch": 0.46694141414141416, "grad_norm": 0.0623321607708931, "learning_rate": 0.0001895246653866502, "loss": 0.0829, "step": 
7223 }, { "epoch": 0.4670060606060606, "grad_norm": 0.0682804137468338, "learning_rate": 0.0001895216180881904, "loss": 0.1005, "step": 7224 }, { "epoch": 0.4670707070707071, "grad_norm": 0.05756324157118797, "learning_rate": 0.0001895185703710675, "loss": 0.0845, "step": 7225 }, { "epoch": 0.4671353535353535, "grad_norm": 0.05127490311861038, "learning_rate": 0.00018951552223529588, "loss": 0.0738, "step": 7226 }, { "epoch": 0.4672, "grad_norm": 0.05110714212059975, "learning_rate": 0.00018951247368088976, "loss": 0.0792, "step": 7227 }, { "epoch": 0.46726464646464644, "grad_norm": 0.05211690440773964, "learning_rate": 0.0001895094247078634, "loss": 0.0802, "step": 7228 }, { "epoch": 0.4673292929292929, "grad_norm": 0.05694428086280823, "learning_rate": 0.000189506375316231, "loss": 0.0812, "step": 7229 }, { "epoch": 0.4673939393939394, "grad_norm": 0.052858803421258926, "learning_rate": 0.00018950332550600693, "loss": 0.0722, "step": 7230 }, { "epoch": 0.46745858585858585, "grad_norm": 0.05718578025698662, "learning_rate": 0.00018950027527720538, "loss": 0.0787, "step": 7231 }, { "epoch": 0.46752323232323234, "grad_norm": 0.05656655877828598, "learning_rate": 0.00018949722462984064, "loss": 0.0758, "step": 7232 }, { "epoch": 0.46752323232323234, "eval_bleu": 17.49209768488284, "eval_loss": 0.09236271679401398, "eval_runtime": 2.7476, "eval_samples_per_second": 11.646, "eval_steps_per_second": 1.456, "step": 7232 }, { "epoch": 0.4675878787878788, "grad_norm": 0.0597824901342392, "learning_rate": 0.00018949417356392694, "loss": 0.0977, "step": 7233 }, { "epoch": 0.46765252525252526, "grad_norm": 0.06078418344259262, "learning_rate": 0.00018949112207947857, "loss": 0.0921, "step": 7234 }, { "epoch": 0.4677171717171717, "grad_norm": 0.05921223759651184, "learning_rate": 0.0001894880701765098, "loss": 0.0908, "step": 7235 }, { "epoch": 0.4677818181818182, "grad_norm": 0.06957735121250153, "learning_rate": 0.00018948501785503498, "loss": 0.0974, "step": 7236 }, { 
"epoch": 0.4678464646464646, "grad_norm": 0.06434625387191772, "learning_rate": 0.00018948196511506826, "loss": 0.1021, "step": 7237 }, { "epoch": 0.4679111111111111, "grad_norm": 0.05560140311717987, "learning_rate": 0.000189478911956624, "loss": 0.0891, "step": 7238 }, { "epoch": 0.4679757575757576, "grad_norm": 0.05088731646537781, "learning_rate": 0.0001894758583797164, "loss": 0.0796, "step": 7239 }, { "epoch": 0.46804040404040403, "grad_norm": 0.06991235911846161, "learning_rate": 0.00018947280438435984, "loss": 0.1164, "step": 7240 }, { "epoch": 0.4681050505050505, "grad_norm": 0.0577993206679821, "learning_rate": 0.00018946974997056856, "loss": 0.0854, "step": 7241 }, { "epoch": 0.46816969696969696, "grad_norm": 0.05750711262226105, "learning_rate": 0.0001894666951383568, "loss": 0.0876, "step": 7242 }, { "epoch": 0.46823434343434345, "grad_norm": 0.05845741927623749, "learning_rate": 0.0001894636398877389, "loss": 0.082, "step": 7243 }, { "epoch": 0.4682989898989899, "grad_norm": 0.05847091227769852, "learning_rate": 0.00018946058421872913, "loss": 0.0882, "step": 7244 }, { "epoch": 0.46836363636363637, "grad_norm": 0.060255683958530426, "learning_rate": 0.00018945752813134177, "loss": 0.1047, "step": 7245 }, { "epoch": 0.4684282828282828, "grad_norm": 0.05977492406964302, "learning_rate": 0.00018945447162559113, "loss": 0.0793, "step": 7246 }, { "epoch": 0.4684929292929293, "grad_norm": 0.06529433280229568, "learning_rate": 0.00018945141470149153, "loss": 0.1025, "step": 7247 }, { "epoch": 0.4685575757575758, "grad_norm": 0.060815706849098206, "learning_rate": 0.00018944835735905721, "loss": 0.094, "step": 7248 }, { "epoch": 0.4685575757575758, "eval_bleu": 14.805584460640508, "eval_loss": 0.09266255795955658, "eval_runtime": 2.7975, "eval_samples_per_second": 11.439, "eval_steps_per_second": 1.43, "step": 7248 }, { "epoch": 0.4686222222222222, "grad_norm": 0.05649518221616745, "learning_rate": 0.0001894452995983025, "loss": 0.0755, "step": 7249 }, { 
"epoch": 0.4686868686868687, "grad_norm": 0.05884825438261032, "learning_rate": 0.0001894422414192417, "loss": 0.0866, "step": 7250 }, { "epoch": 0.46875151515151514, "grad_norm": 0.06604856997728348, "learning_rate": 0.00018943918282188907, "loss": 0.0912, "step": 7251 }, { "epoch": 0.46881616161616163, "grad_norm": 0.06351850181818008, "learning_rate": 0.00018943612380625897, "loss": 0.0881, "step": 7252 }, { "epoch": 0.46888080808080806, "grad_norm": 0.0449657179415226, "learning_rate": 0.00018943306437236573, "loss": 0.0641, "step": 7253 }, { "epoch": 0.46894545454545455, "grad_norm": 0.05862146615982056, "learning_rate": 0.00018943000452022356, "loss": 0.0867, "step": 7254 }, { "epoch": 0.469010101010101, "grad_norm": 0.0637274980545044, "learning_rate": 0.00018942694424984682, "loss": 0.0824, "step": 7255 }, { "epoch": 0.4690747474747475, "grad_norm": 0.07097877562046051, "learning_rate": 0.00018942388356124986, "loss": 0.0979, "step": 7256 }, { "epoch": 0.4691393939393939, "grad_norm": 0.07764679193496704, "learning_rate": 0.00018942082245444695, "loss": 0.1001, "step": 7257 }, { "epoch": 0.4692040404040404, "grad_norm": 0.061027780175209045, "learning_rate": 0.00018941776092945242, "loss": 0.0849, "step": 7258 }, { "epoch": 0.4692686868686869, "grad_norm": 0.04984840378165245, "learning_rate": 0.00018941469898628056, "loss": 0.0735, "step": 7259 }, { "epoch": 0.4693333333333333, "grad_norm": 0.04994824528694153, "learning_rate": 0.00018941163662494572, "loss": 0.0673, "step": 7260 }, { "epoch": 0.4693979797979798, "grad_norm": 0.08965365588665009, "learning_rate": 0.0001894085738454622, "loss": 0.0902, "step": 7261 }, { "epoch": 0.46946262626262625, "grad_norm": 0.06361673772335052, "learning_rate": 0.00018940551064784437, "loss": 0.1062, "step": 7262 }, { "epoch": 0.46952727272727274, "grad_norm": 0.050021056085824966, "learning_rate": 0.0001894024470321065, "loss": 0.0719, "step": 7263 }, { "epoch": 0.46959191919191917, "grad_norm": 0.05567657947540283, 
"learning_rate": 0.000189399382998263, "loss": 0.0836, "step": 7264 }, { "epoch": 0.46959191919191917, "eval_bleu": 17.86180896430321, "eval_loss": 0.09336129575967789, "eval_runtime": 2.7106, "eval_samples_per_second": 11.806, "eval_steps_per_second": 1.476, "step": 7264 }, { "epoch": 0.46965656565656566, "grad_norm": 0.09004451334476471, "learning_rate": 0.00018939631854632806, "loss": 0.0849, "step": 7265 }, { "epoch": 0.4697212121212121, "grad_norm": 0.06315835565328598, "learning_rate": 0.00018939325367631614, "loss": 0.0809, "step": 7266 }, { "epoch": 0.4697858585858586, "grad_norm": 0.06530977040529251, "learning_rate": 0.0001893901883882415, "loss": 0.0939, "step": 7267 }, { "epoch": 0.4698505050505051, "grad_norm": 0.05387529730796814, "learning_rate": 0.00018938712268211854, "loss": 0.0757, "step": 7268 }, { "epoch": 0.4699151515151515, "grad_norm": 0.05530110374093056, "learning_rate": 0.0001893840565579615, "loss": 0.0908, "step": 7269 }, { "epoch": 0.469979797979798, "grad_norm": 0.059576235711574554, "learning_rate": 0.00018938099001578483, "loss": 0.0952, "step": 7270 }, { "epoch": 0.47004444444444443, "grad_norm": 0.06066891923546791, "learning_rate": 0.00018937792305560278, "loss": 0.0917, "step": 7271 }, { "epoch": 0.4701090909090909, "grad_norm": 0.05483390763401985, "learning_rate": 0.00018937485567742973, "loss": 0.0722, "step": 7272 }, { "epoch": 0.47017373737373735, "grad_norm": 0.06227520480751991, "learning_rate": 0.00018937178788128007, "loss": 0.0862, "step": 7273 }, { "epoch": 0.47023838383838384, "grad_norm": 0.0542667955160141, "learning_rate": 0.00018936871966716807, "loss": 0.0802, "step": 7274 }, { "epoch": 0.4703030303030303, "grad_norm": 0.05789713189005852, "learning_rate": 0.00018936565103510813, "loss": 0.0886, "step": 7275 }, { "epoch": 0.47036767676767677, "grad_norm": 0.06234133616089821, "learning_rate": 0.00018936258198511458, "loss": 0.0928, "step": 7276 }, { "epoch": 0.47043232323232326, "grad_norm": 0.06895460933446884, 
"learning_rate": 0.00018935951251720177, "loss": 0.0806, "step": 7277 }, { "epoch": 0.4704969696969697, "grad_norm": 0.0635010376572609, "learning_rate": 0.00018935644263138405, "loss": 0.0864, "step": 7278 }, { "epoch": 0.4705616161616162, "grad_norm": 0.04954046010971069, "learning_rate": 0.0001893533723276758, "loss": 0.0733, "step": 7279 }, { "epoch": 0.4706262626262626, "grad_norm": 0.06974852085113525, "learning_rate": 0.00018935030160609136, "loss": 0.0989, "step": 7280 }, { "epoch": 0.4706262626262626, "eval_bleu": 19.86228471851607, "eval_loss": 0.0928819552063942, "eval_runtime": 2.8506, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 7280 }, { "epoch": 0.4706909090909091, "grad_norm": 0.058547187596559525, "learning_rate": 0.00018934723046664512, "loss": 0.1033, "step": 7281 }, { "epoch": 0.47075555555555554, "grad_norm": 0.054631661623716354, "learning_rate": 0.00018934415890935143, "loss": 0.0664, "step": 7282 }, { "epoch": 0.470820202020202, "grad_norm": 0.0571826733648777, "learning_rate": 0.00018934108693422462, "loss": 0.0916, "step": 7283 }, { "epoch": 0.47088484848484846, "grad_norm": 0.0575735978782177, "learning_rate": 0.0001893380145412791, "loss": 0.0826, "step": 7284 }, { "epoch": 0.47094949494949495, "grad_norm": 0.05697232112288475, "learning_rate": 0.00018933494173052918, "loss": 0.0849, "step": 7285 }, { "epoch": 0.47101414141414144, "grad_norm": 0.055659398436546326, "learning_rate": 0.0001893318685019893, "loss": 0.0833, "step": 7286 }, { "epoch": 0.4710787878787879, "grad_norm": 0.0511322058737278, "learning_rate": 0.0001893287948556738, "loss": 0.0751, "step": 7287 }, { "epoch": 0.47114343434343436, "grad_norm": 0.0728987455368042, "learning_rate": 0.00018932572079159704, "loss": 0.0991, "step": 7288 }, { "epoch": 0.4712080808080808, "grad_norm": 0.0732710063457489, "learning_rate": 0.00018932264630977343, "loss": 0.0866, "step": 7289 }, { "epoch": 0.4712727272727273, "grad_norm": 0.06716963648796082, 
"learning_rate": 0.00018931957141021734, "loss": 0.0938, "step": 7290 }, { "epoch": 0.4713373737373737, "grad_norm": 0.06250447779893875, "learning_rate": 0.00018931649609294312, "loss": 0.1015, "step": 7291 }, { "epoch": 0.4714020202020202, "grad_norm": 0.052389465272426605, "learning_rate": 0.00018931342035796521, "loss": 0.0821, "step": 7292 }, { "epoch": 0.47146666666666665, "grad_norm": 0.056494005024433136, "learning_rate": 0.00018931034420529796, "loss": 0.0765, "step": 7293 }, { "epoch": 0.47153131313131313, "grad_norm": 0.05071423947811127, "learning_rate": 0.00018930726763495575, "loss": 0.0707, "step": 7294 }, { "epoch": 0.47159595959595957, "grad_norm": 0.05134226009249687, "learning_rate": 0.00018930419064695295, "loss": 0.0719, "step": 7295 }, { "epoch": 0.47166060606060606, "grad_norm": 0.05860813334584236, "learning_rate": 0.00018930111324130398, "loss": 0.0892, "step": 7296 }, { "epoch": 0.47166060606060606, "eval_bleu": 19.63253369746337, "eval_loss": 0.09260168671607971, "eval_runtime": 2.6469, "eval_samples_per_second": 12.09, "eval_steps_per_second": 1.511, "step": 7296 }, { "epoch": 0.47172525252525255, "grad_norm": 0.06289013475179672, "learning_rate": 0.00018929803541802324, "loss": 0.0921, "step": 7297 }, { "epoch": 0.471789898989899, "grad_norm": 0.06766319274902344, "learning_rate": 0.00018929495717712508, "loss": 0.0818, "step": 7298 }, { "epoch": 0.47185454545454547, "grad_norm": 0.061813659965991974, "learning_rate": 0.00018929187851862396, "loss": 0.0864, "step": 7299 }, { "epoch": 0.4719191919191919, "grad_norm": 0.061841387301683426, "learning_rate": 0.00018928879944253422, "loss": 0.0956, "step": 7300 }, { "epoch": 0.4719838383838384, "grad_norm": 0.06480183452367783, "learning_rate": 0.0001892857199488703, "loss": 0.0996, "step": 7301 }, { "epoch": 0.47204848484848483, "grad_norm": 0.0546305775642395, "learning_rate": 0.0001892826400376466, "loss": 0.0747, "step": 7302 }, { "epoch": 0.4721131313131313, "grad_norm": 
0.05801054835319519, "learning_rate": 0.0001892795597088775, "loss": 0.0816, "step": 7303 }, { "epoch": 0.47217777777777775, "grad_norm": 0.0521390438079834, "learning_rate": 0.00018927647896257739, "loss": 0.0655, "step": 7304 }, { "epoch": 0.47224242424242424, "grad_norm": 0.058850936591625214, "learning_rate": 0.0001892733977987607, "loss": 0.0978, "step": 7305 }, { "epoch": 0.47230707070707073, "grad_norm": 0.06128442659974098, "learning_rate": 0.00018927031621744183, "loss": 0.0838, "step": 7306 }, { "epoch": 0.47237171717171716, "grad_norm": 0.06194846332073212, "learning_rate": 0.00018926723421863525, "loss": 0.0945, "step": 7307 }, { "epoch": 0.47243636363636365, "grad_norm": 0.05447714403271675, "learning_rate": 0.0001892641518023553, "loss": 0.079, "step": 7308 }, { "epoch": 0.4725010101010101, "grad_norm": 0.053598467260599136, "learning_rate": 0.0001892610689686164, "loss": 0.0702, "step": 7309 }, { "epoch": 0.4725656565656566, "grad_norm": 0.050804659724235535, "learning_rate": 0.00018925798571743305, "loss": 0.0696, "step": 7310 }, { "epoch": 0.472630303030303, "grad_norm": 0.06487833708524704, "learning_rate": 0.0001892549020488196, "loss": 0.102, "step": 7311 }, { "epoch": 0.4726949494949495, "grad_norm": 0.05978713929653168, "learning_rate": 0.00018925181796279044, "loss": 0.086, "step": 7312 }, { "epoch": 0.4726949494949495, "eval_bleu": 15.99179760428157, "eval_loss": 0.09220074117183685, "eval_runtime": 2.6635, "eval_samples_per_second": 12.014, "eval_steps_per_second": 1.502, "step": 7312 }, { "epoch": 0.47275959595959594, "grad_norm": 0.05655250698328018, "learning_rate": 0.00018924873345936005, "loss": 0.0775, "step": 7313 }, { "epoch": 0.4728242424242424, "grad_norm": 0.06821805983781815, "learning_rate": 0.00018924564853854282, "loss": 0.0917, "step": 7314 }, { "epoch": 0.4728888888888889, "grad_norm": 0.06744733452796936, "learning_rate": 0.00018924256320035322, "loss": 0.0839, "step": 7315 }, { "epoch": 0.47295353535353535, "grad_norm": 
0.06085432693362236, "learning_rate": 0.00018923947744480565, "loss": 0.0907, "step": 7316 }, { "epoch": 0.47301818181818184, "grad_norm": 0.06379047781229019, "learning_rate": 0.00018923639127191457, "loss": 0.0816, "step": 7317 }, { "epoch": 0.47308282828282827, "grad_norm": 0.057483572512865067, "learning_rate": 0.00018923330468169437, "loss": 0.0764, "step": 7318 }, { "epoch": 0.47314747474747476, "grad_norm": 0.07122159004211426, "learning_rate": 0.00018923021767415946, "loss": 0.1061, "step": 7319 }, { "epoch": 0.4732121212121212, "grad_norm": 0.06174711510539055, "learning_rate": 0.00018922713024932438, "loss": 0.0827, "step": 7320 }, { "epoch": 0.4732767676767677, "grad_norm": 0.066444993019104, "learning_rate": 0.00018922404240720347, "loss": 0.0846, "step": 7321 }, { "epoch": 0.4733414141414141, "grad_norm": 0.08104237914085388, "learning_rate": 0.00018922095414781125, "loss": 0.1081, "step": 7322 }, { "epoch": 0.4734060606060606, "grad_norm": 0.06309256702661514, "learning_rate": 0.0001892178654711621, "loss": 0.0866, "step": 7323 }, { "epoch": 0.4734707070707071, "grad_norm": 0.07226023077964783, "learning_rate": 0.0001892147763772705, "loss": 0.0981, "step": 7324 }, { "epoch": 0.47353535353535353, "grad_norm": 0.0630379468202591, "learning_rate": 0.00018921168686615087, "loss": 0.0955, "step": 7325 }, { "epoch": 0.4736, "grad_norm": 0.053598884493112564, "learning_rate": 0.00018920859693781768, "loss": 0.0734, "step": 7326 }, { "epoch": 0.47366464646464645, "grad_norm": 0.05860722064971924, "learning_rate": 0.00018920550659228536, "loss": 0.0887, "step": 7327 }, { "epoch": 0.47372929292929294, "grad_norm": 0.053978219628334045, "learning_rate": 0.0001892024158295684, "loss": 0.0677, "step": 7328 }, { "epoch": 0.47372929292929294, "eval_bleu": 18.23458670708463, "eval_loss": 0.09343469142913818, "eval_runtime": 2.6422, "eval_samples_per_second": 12.111, "eval_steps_per_second": 1.514, "step": 7328 }, { "epoch": 0.4737939393939394, "grad_norm": 
0.056524090468883514, "learning_rate": 0.00018919932464968122, "loss": 0.0885, "step": 7329 }, { "epoch": 0.47385858585858587, "grad_norm": 0.04937510937452316, "learning_rate": 0.00018919623305263826, "loss": 0.0721, "step": 7330 }, { "epoch": 0.4739232323232323, "grad_norm": 0.06430433690547943, "learning_rate": 0.00018919314103845407, "loss": 0.0969, "step": 7331 }, { "epoch": 0.4739878787878788, "grad_norm": 0.0631866529583931, "learning_rate": 0.000189190048607143, "loss": 0.103, "step": 7332 }, { "epoch": 0.4740525252525252, "grad_norm": 0.0482548363506794, "learning_rate": 0.00018918695575871955, "loss": 0.0637, "step": 7333 }, { "epoch": 0.4741171717171717, "grad_norm": 0.06568702310323715, "learning_rate": 0.00018918386249319822, "loss": 0.0891, "step": 7334 }, { "epoch": 0.4741818181818182, "grad_norm": 0.06256521493196487, "learning_rate": 0.00018918076881059345, "loss": 0.0824, "step": 7335 }, { "epoch": 0.47424646464646464, "grad_norm": 0.051587142050266266, "learning_rate": 0.00018917767471091968, "loss": 0.0763, "step": 7336 }, { "epoch": 0.47431111111111113, "grad_norm": 0.06193213909864426, "learning_rate": 0.00018917458019419143, "loss": 0.0858, "step": 7337 }, { "epoch": 0.47437575757575756, "grad_norm": 0.055177636444568634, "learning_rate": 0.00018917148526042315, "loss": 0.0751, "step": 7338 }, { "epoch": 0.47444040404040405, "grad_norm": 0.05465149134397507, "learning_rate": 0.0001891683899096293, "loss": 0.0746, "step": 7339 }, { "epoch": 0.4745050505050505, "grad_norm": 0.07283540070056915, "learning_rate": 0.00018916529414182438, "loss": 0.1009, "step": 7340 }, { "epoch": 0.474569696969697, "grad_norm": 0.056146178394556046, "learning_rate": 0.00018916219795702283, "loss": 0.0846, "step": 7341 }, { "epoch": 0.4746343434343434, "grad_norm": 0.05808147042989731, "learning_rate": 0.0001891591013552392, "loss": 0.0839, "step": 7342 }, { "epoch": 0.4746989898989899, "grad_norm": 0.09299144148826599, "learning_rate": 0.0001891560043364879, 
"loss": 0.0853, "step": 7343 }, { "epoch": 0.4747636363636364, "grad_norm": 0.060095496475696564, "learning_rate": 0.00018915290690078344, "loss": 0.09, "step": 7344 }, { "epoch": 0.4747636363636364, "eval_bleu": 17.23930729929578, "eval_loss": 0.09171617776155472, "eval_runtime": 2.7298, "eval_samples_per_second": 11.723, "eval_steps_per_second": 1.465, "step": 7344 }, { "epoch": 0.4748282828282828, "grad_norm": 0.06482581049203873, "learning_rate": 0.00018914980904814033, "loss": 0.0851, "step": 7345 }, { "epoch": 0.4748929292929293, "grad_norm": 0.058468952775001526, "learning_rate": 0.00018914671077857302, "loss": 0.08, "step": 7346 }, { "epoch": 0.47495757575757575, "grad_norm": 0.06121734157204628, "learning_rate": 0.00018914361209209602, "loss": 0.0964, "step": 7347 }, { "epoch": 0.47502222222222223, "grad_norm": 0.05901621654629707, "learning_rate": 0.00018914051298872381, "loss": 0.0854, "step": 7348 }, { "epoch": 0.47508686868686867, "grad_norm": 0.055521853268146515, "learning_rate": 0.00018913741346847086, "loss": 0.0828, "step": 7349 }, { "epoch": 0.47515151515151516, "grad_norm": 0.060256291180849075, "learning_rate": 0.00018913431353135174, "loss": 0.0822, "step": 7350 }, { "epoch": 0.4752161616161616, "grad_norm": 0.05465661734342575, "learning_rate": 0.0001891312131773809, "loss": 0.0811, "step": 7351 }, { "epoch": 0.4752808080808081, "grad_norm": 0.06250262260437012, "learning_rate": 0.00018912811240657282, "loss": 0.0952, "step": 7352 }, { "epoch": 0.47534545454545457, "grad_norm": 0.05815069004893303, "learning_rate": 0.00018912501121894204, "loss": 0.1046, "step": 7353 }, { "epoch": 0.475410101010101, "grad_norm": 0.055010534822940826, "learning_rate": 0.00018912190961450307, "loss": 0.0923, "step": 7354 }, { "epoch": 0.4754747474747475, "grad_norm": 0.056974828243255615, "learning_rate": 0.00018911880759327033, "loss": 0.0931, "step": 7355 }, { "epoch": 0.47553939393939393, "grad_norm": 0.055506303906440735, "learning_rate": 
0.00018911570515525844, "loss": 0.0762, "step": 7356 }, { "epoch": 0.4756040404040404, "grad_norm": 0.05283723026514053, "learning_rate": 0.00018911260230048186, "loss": 0.0742, "step": 7357 }, { "epoch": 0.47566868686868685, "grad_norm": 0.06277589499950409, "learning_rate": 0.00018910949902895506, "loss": 0.0902, "step": 7358 }, { "epoch": 0.47573333333333334, "grad_norm": 0.06028132140636444, "learning_rate": 0.00018910639534069264, "loss": 0.0798, "step": 7359 }, { "epoch": 0.4757979797979798, "grad_norm": 0.06542912870645523, "learning_rate": 0.00018910329123570903, "loss": 0.1046, "step": 7360 }, { "epoch": 0.4757979797979798, "eval_bleu": 17.84002337742173, "eval_loss": 0.09067389369010925, "eval_runtime": 2.5919, "eval_samples_per_second": 12.346, "eval_steps_per_second": 1.543, "step": 7360 }, { "epoch": 0.47586262626262626, "grad_norm": 0.06225048750638962, "learning_rate": 0.0001891001867140188, "loss": 0.0864, "step": 7361 }, { "epoch": 0.47592727272727275, "grad_norm": 0.05807938054203987, "learning_rate": 0.00018909708177563644, "loss": 0.0873, "step": 7362 }, { "epoch": 0.4759919191919192, "grad_norm": 0.06082826852798462, "learning_rate": 0.0001890939764205765, "loss": 0.0824, "step": 7363 }, { "epoch": 0.4760565656565657, "grad_norm": 0.05685294419527054, "learning_rate": 0.00018909087064885344, "loss": 0.0849, "step": 7364 }, { "epoch": 0.4761212121212121, "grad_norm": 0.06039021536707878, "learning_rate": 0.0001890877644604819, "loss": 0.0858, "step": 7365 }, { "epoch": 0.4761858585858586, "grad_norm": 0.05535202845931053, "learning_rate": 0.0001890846578554763, "loss": 0.0847, "step": 7366 }, { "epoch": 0.47625050505050504, "grad_norm": 0.05184906721115112, "learning_rate": 0.0001890815508338512, "loss": 0.0782, "step": 7367 }, { "epoch": 0.4763151515151515, "grad_norm": 0.052889902144670486, "learning_rate": 0.00018907844339562116, "loss": 0.0742, "step": 7368 }, { "epoch": 0.47637979797979796, "grad_norm": 0.059444595128297806, 
"learning_rate": 0.00018907533554080069, "loss": 0.1016, "step": 7369 }, { "epoch": 0.47644444444444445, "grad_norm": 0.0530921146273613, "learning_rate": 0.0001890722272694043, "loss": 0.0838, "step": 7370 }, { "epoch": 0.4765090909090909, "grad_norm": 0.06201579421758652, "learning_rate": 0.00018906911858144654, "loss": 0.0967, "step": 7371 }, { "epoch": 0.47657373737373737, "grad_norm": 0.0539022758603096, "learning_rate": 0.00018906600947694199, "loss": 0.0859, "step": 7372 }, { "epoch": 0.47663838383838386, "grad_norm": 0.0476655513048172, "learning_rate": 0.0001890628999559051, "loss": 0.0844, "step": 7373 }, { "epoch": 0.4767030303030303, "grad_norm": 0.06938281655311584, "learning_rate": 0.00018905979001835054, "loss": 0.0859, "step": 7374 }, { "epoch": 0.4767676767676768, "grad_norm": 0.0514361746609211, "learning_rate": 0.00018905667966429276, "loss": 0.0836, "step": 7375 }, { "epoch": 0.4768323232323232, "grad_norm": 0.05874991416931152, "learning_rate": 0.00018905356889374635, "loss": 0.0866, "step": 7376 }, { "epoch": 0.4768323232323232, "eval_bleu": 16.509957564443646, "eval_loss": 0.09085878729820251, "eval_runtime": 2.7189, "eval_samples_per_second": 11.769, "eval_steps_per_second": 1.471, "step": 7376 }, { "epoch": 0.4768969696969697, "grad_norm": 0.05865233764052391, "learning_rate": 0.0001890504577067258, "loss": 0.0921, "step": 7377 }, { "epoch": 0.47696161616161614, "grad_norm": 0.05612686648964882, "learning_rate": 0.00018904734610324572, "loss": 0.0684, "step": 7378 }, { "epoch": 0.47702626262626263, "grad_norm": 0.059539541602134705, "learning_rate": 0.00018904423408332063, "loss": 0.0899, "step": 7379 }, { "epoch": 0.47709090909090907, "grad_norm": 0.05396325886249542, "learning_rate": 0.00018904112164696513, "loss": 0.0827, "step": 7380 }, { "epoch": 0.47715555555555556, "grad_norm": 0.059559572488069534, "learning_rate": 0.00018903800879419371, "loss": 0.095, "step": 7381 }, { "epoch": 0.47722020202020204, "grad_norm": 
0.056799426674842834, "learning_rate": 0.00018903489552502095, "loss": 0.0724, "step": 7382 }, { "epoch": 0.4772848484848485, "grad_norm": 0.05517604947090149, "learning_rate": 0.00018903178183946143, "loss": 0.0785, "step": 7383 }, { "epoch": 0.47734949494949497, "grad_norm": 0.0704139769077301, "learning_rate": 0.00018902866773752973, "loss": 0.1002, "step": 7384 }, { "epoch": 0.4774141414141414, "grad_norm": 0.08497188985347748, "learning_rate": 0.00018902555321924033, "loss": 0.0881, "step": 7385 }, { "epoch": 0.4774787878787879, "grad_norm": 0.05520370602607727, "learning_rate": 0.0001890224382846079, "loss": 0.0711, "step": 7386 }, { "epoch": 0.4775434343434343, "grad_norm": 0.06420005857944489, "learning_rate": 0.0001890193229336469, "loss": 0.0896, "step": 7387 }, { "epoch": 0.4776080808080808, "grad_norm": 0.06292455643415451, "learning_rate": 0.000189016207166372, "loss": 0.0866, "step": 7388 }, { "epoch": 0.47767272727272725, "grad_norm": 0.055018216371536255, "learning_rate": 0.0001890130909827977, "loss": 0.0762, "step": 7389 }, { "epoch": 0.47773737373737374, "grad_norm": 0.06735753268003464, "learning_rate": 0.00018900997438293864, "loss": 0.0889, "step": 7390 }, { "epoch": 0.47780202020202023, "grad_norm": 0.05564945191144943, "learning_rate": 0.00018900685736680933, "loss": 0.0827, "step": 7391 }, { "epoch": 0.47786666666666666, "grad_norm": 0.06764265149831772, "learning_rate": 0.00018900373993442434, "loss": 0.0936, "step": 7392 }, { "epoch": 0.47786666666666666, "eval_bleu": 18.28668125763204, "eval_loss": 0.09061364829540253, "eval_runtime": 2.8595, "eval_samples_per_second": 11.191, "eval_steps_per_second": 1.399, "step": 7392 }, { "epoch": 0.47793131313131315, "grad_norm": 0.05921243503689766, "learning_rate": 0.0001890006220857983, "loss": 0.0883, "step": 7393 }, { "epoch": 0.4779959595959596, "grad_norm": 0.05159585550427437, "learning_rate": 0.0001889975038209458, "loss": 0.0738, "step": 7394 }, { "epoch": 0.4780606060606061, "grad_norm": 
0.06793906539678574, "learning_rate": 0.00018899438513988139, "loss": 0.0932, "step": 7395 }, { "epoch": 0.4781252525252525, "grad_norm": 0.04965663328766823, "learning_rate": 0.00018899126604261966, "loss": 0.0729, "step": 7396 }, { "epoch": 0.478189898989899, "grad_norm": 0.0639742910861969, "learning_rate": 0.00018898814652917523, "loss": 0.0934, "step": 7397 }, { "epoch": 0.47825454545454543, "grad_norm": 0.057809364050626755, "learning_rate": 0.0001889850265995626, "loss": 0.0825, "step": 7398 }, { "epoch": 0.4783191919191919, "grad_norm": 0.05602162703871727, "learning_rate": 0.00018898190625379644, "loss": 0.0758, "step": 7399 }, { "epoch": 0.4783838383838384, "grad_norm": 0.052965011447668076, "learning_rate": 0.00018897878549189133, "loss": 0.0718, "step": 7400 }, { "epoch": 0.47844848484848485, "grad_norm": 0.0603640079498291, "learning_rate": 0.00018897566431386184, "loss": 0.0881, "step": 7401 }, { "epoch": 0.47851313131313133, "grad_norm": 0.07532789558172226, "learning_rate": 0.0001889725427197226, "loss": 0.0802, "step": 7402 }, { "epoch": 0.47857777777777777, "grad_norm": 0.06412248313426971, "learning_rate": 0.00018896942070948817, "loss": 0.1064, "step": 7403 }, { "epoch": 0.47864242424242426, "grad_norm": 0.05123303085565567, "learning_rate": 0.0001889662982831732, "loss": 0.0701, "step": 7404 }, { "epoch": 0.4787070707070707, "grad_norm": 0.06790277361869812, "learning_rate": 0.00018896317544079224, "loss": 0.0988, "step": 7405 }, { "epoch": 0.4787717171717172, "grad_norm": 0.059856709092855453, "learning_rate": 0.00018896005218235992, "loss": 0.0807, "step": 7406 }, { "epoch": 0.4788363636363636, "grad_norm": 0.06576141715049744, "learning_rate": 0.00018895692850789088, "loss": 0.1074, "step": 7407 }, { "epoch": 0.4789010101010101, "grad_norm": 0.06487879157066345, "learning_rate": 0.00018895380441739964, "loss": 0.0732, "step": 7408 }, { "epoch": 0.4789010101010101, "eval_bleu": 17.590843818882878, "eval_loss": 0.09128732979297638, 
"eval_runtime": 2.7402, "eval_samples_per_second": 11.678, "eval_steps_per_second": 1.46, "step": 7408 }, { "epoch": 0.47896565656565654, "grad_norm": 0.047426413744688034, "learning_rate": 0.00018895067991090092, "loss": 0.0747, "step": 7409 }, { "epoch": 0.47903030303030303, "grad_norm": 0.053511153906583786, "learning_rate": 0.00018894755498840923, "loss": 0.0731, "step": 7410 }, { "epoch": 0.4790949494949495, "grad_norm": 0.0688922107219696, "learning_rate": 0.00018894442964993927, "loss": 0.0874, "step": 7411 }, { "epoch": 0.47915959595959595, "grad_norm": 0.06405059248209, "learning_rate": 0.00018894130389550558, "loss": 0.0992, "step": 7412 }, { "epoch": 0.47922424242424244, "grad_norm": 0.06326265633106232, "learning_rate": 0.00018893817772512283, "loss": 0.0941, "step": 7413 }, { "epoch": 0.4792888888888889, "grad_norm": 0.06417236477136612, "learning_rate": 0.00018893505113880564, "loss": 0.0843, "step": 7414 }, { "epoch": 0.47935353535353536, "grad_norm": 0.059450212866067886, "learning_rate": 0.0001889319241365686, "loss": 0.088, "step": 7415 }, { "epoch": 0.4794181818181818, "grad_norm": 0.062471307814121246, "learning_rate": 0.00018892879671842633, "loss": 0.101, "step": 7416 }, { "epoch": 0.4794828282828283, "grad_norm": 0.045170776546001434, "learning_rate": 0.00018892566888439354, "loss": 0.0594, "step": 7417 }, { "epoch": 0.4795474747474747, "grad_norm": 0.05748889222741127, "learning_rate": 0.00018892254063448472, "loss": 0.0942, "step": 7418 }, { "epoch": 0.4796121212121212, "grad_norm": 0.11109445244073868, "learning_rate": 0.00018891941196871463, "loss": 0.0854, "step": 7419 }, { "epoch": 0.4796767676767677, "grad_norm": 0.06887159496545792, "learning_rate": 0.00018891628288709785, "loss": 0.0885, "step": 7420 }, { "epoch": 0.47974141414141414, "grad_norm": 0.0610644556581974, "learning_rate": 0.00018891315338964896, "loss": 0.097, "step": 7421 }, { "epoch": 0.4798060606060606, "grad_norm": 0.050202663987874985, "learning_rate": 
0.00018891002347638268, "loss": 0.0599, "step": 7422 }, { "epoch": 0.47987070707070706, "grad_norm": 0.053863439708948135, "learning_rate": 0.0001889068931473136, "loss": 0.0759, "step": 7423 }, { "epoch": 0.47993535353535355, "grad_norm": 0.056931789964437485, "learning_rate": 0.00018890376240245638, "loss": 0.0759, "step": 7424 }, { "epoch": 0.47993535353535355, "eval_bleu": 19.283695447837637, "eval_loss": 0.09193984419107437, "eval_runtime": 2.6778, "eval_samples_per_second": 11.95, "eval_steps_per_second": 1.494, "step": 7424 }, { "epoch": 0.48, "grad_norm": 0.05911869928240776, "learning_rate": 0.0001889006312418257, "loss": 0.0762, "step": 7425 }, { "epoch": 0.48006464646464647, "grad_norm": 0.07061933726072311, "learning_rate": 0.00018889749966543608, "loss": 0.098, "step": 7426 }, { "epoch": 0.4801292929292929, "grad_norm": 0.06653372198343277, "learning_rate": 0.00018889436767330228, "loss": 0.0975, "step": 7427 }, { "epoch": 0.4801939393939394, "grad_norm": 0.06122582033276558, "learning_rate": 0.00018889123526543892, "loss": 0.0849, "step": 7428 }, { "epoch": 0.4802585858585859, "grad_norm": 0.06055361032485962, "learning_rate": 0.00018888810244186062, "loss": 0.0785, "step": 7429 }, { "epoch": 0.4803232323232323, "grad_norm": 0.06611603498458862, "learning_rate": 0.00018888496920258202, "loss": 0.0882, "step": 7430 }, { "epoch": 0.4803878787878788, "grad_norm": 0.05764475464820862, "learning_rate": 0.00018888183554761787, "loss": 0.0785, "step": 7431 }, { "epoch": 0.48045252525252524, "grad_norm": 0.07658588886260986, "learning_rate": 0.00018887870147698272, "loss": 0.0883, "step": 7432 }, { "epoch": 0.48051717171717173, "grad_norm": 0.06241154670715332, "learning_rate": 0.0001888755669906913, "loss": 0.0972, "step": 7433 }, { "epoch": 0.48058181818181817, "grad_norm": 0.0550629161298275, "learning_rate": 0.00018887243208875817, "loss": 0.0865, "step": 7434 }, { "epoch": 0.48064646464646466, "grad_norm": 0.058775000274181366, "learning_rate": 
0.00018886929677119812, "loss": 0.0874, "step": 7435 }, { "epoch": 0.4807111111111111, "grad_norm": 0.05692753568291664, "learning_rate": 0.0001888661610380257, "loss": 0.0676, "step": 7436 }, { "epoch": 0.4807757575757576, "grad_norm": 0.05629044398665428, "learning_rate": 0.00018886302488925564, "loss": 0.0788, "step": 7437 }, { "epoch": 0.48084040404040407, "grad_norm": 0.06468663364648819, "learning_rate": 0.0001888598883249026, "loss": 0.1083, "step": 7438 }, { "epoch": 0.4809050505050505, "grad_norm": 0.06150979921221733, "learning_rate": 0.00018885675134498126, "loss": 0.0941, "step": 7439 }, { "epoch": 0.480969696969697, "grad_norm": 0.05555856600403786, "learning_rate": 0.00018885361394950625, "loss": 0.0862, "step": 7440 }, { "epoch": 0.480969696969697, "eval_bleu": 17.41407755527788, "eval_loss": 0.09168033301830292, "eval_runtime": 2.6603, "eval_samples_per_second": 12.029, "eval_steps_per_second": 1.504, "step": 7440 }, { "epoch": 0.4810343434343434, "grad_norm": 0.05627838522195816, "learning_rate": 0.00018885047613849225, "loss": 0.0864, "step": 7441 }, { "epoch": 0.4810989898989899, "grad_norm": 0.053141918033361435, "learning_rate": 0.00018884733791195397, "loss": 0.0778, "step": 7442 }, { "epoch": 0.48116363636363635, "grad_norm": 0.04749554768204689, "learning_rate": 0.00018884419926990602, "loss": 0.0647, "step": 7443 }, { "epoch": 0.48122828282828284, "grad_norm": 0.062148723751306534, "learning_rate": 0.00018884106021236318, "loss": 0.0933, "step": 7444 }, { "epoch": 0.4812929292929293, "grad_norm": 0.05633784830570221, "learning_rate": 0.00018883792073934004, "loss": 0.0899, "step": 7445 }, { "epoch": 0.48135757575757576, "grad_norm": 0.07805418968200684, "learning_rate": 0.0001888347808508513, "loss": 0.1257, "step": 7446 }, { "epoch": 0.4814222222222222, "grad_norm": 0.07340484112501144, "learning_rate": 0.00018883164054691166, "loss": 0.0889, "step": 7447 }, { "epoch": 0.4814868686868687, "grad_norm": 0.06854554265737534, "learning_rate": 
0.00018882849982753582, "loss": 0.0842, "step": 7448 }, { "epoch": 0.4815515151515152, "grad_norm": 0.05371340364217758, "learning_rate": 0.00018882535869273846, "loss": 0.0788, "step": 7449 }, { "epoch": 0.4816161616161616, "grad_norm": 0.0513310469686985, "learning_rate": 0.00018882221714253423, "loss": 0.074, "step": 7450 }, { "epoch": 0.4816808080808081, "grad_norm": 0.06965672969818115, "learning_rate": 0.0001888190751769379, "loss": 0.0991, "step": 7451 }, { "epoch": 0.48174545454545453, "grad_norm": 0.0510311983525753, "learning_rate": 0.0001888159327959641, "loss": 0.0767, "step": 7452 }, { "epoch": 0.481810101010101, "grad_norm": 0.060104116797447205, "learning_rate": 0.00018881278999962752, "loss": 0.0935, "step": 7453 }, { "epoch": 0.48187474747474746, "grad_norm": 0.05799567326903343, "learning_rate": 0.0001888096467879429, "loss": 0.0874, "step": 7454 }, { "epoch": 0.48193939393939395, "grad_norm": 0.05720657482743263, "learning_rate": 0.00018880650316092492, "loss": 0.0862, "step": 7455 }, { "epoch": 0.4820040404040404, "grad_norm": 0.06967896223068237, "learning_rate": 0.00018880335911858828, "loss": 0.1054, "step": 7456 }, { "epoch": 0.4820040404040404, "eval_bleu": 16.44884504420862, "eval_loss": 0.09189954400062561, "eval_runtime": 2.7227, "eval_samples_per_second": 11.753, "eval_steps_per_second": 1.469, "step": 7456 }, { "epoch": 0.48206868686868687, "grad_norm": 0.05039912834763527, "learning_rate": 0.00018880021466094772, "loss": 0.0679, "step": 7457 }, { "epoch": 0.48213333333333336, "grad_norm": 0.056948982179164886, "learning_rate": 0.00018879706978801788, "loss": 0.0935, "step": 7458 }, { "epoch": 0.4821979797979798, "grad_norm": 0.055664170533418655, "learning_rate": 0.0001887939244998135, "loss": 0.0908, "step": 7459 }, { "epoch": 0.4822626262626263, "grad_norm": 0.05578738451004028, "learning_rate": 0.0001887907787963493, "loss": 0.0755, "step": 7460 }, { "epoch": 0.4823272727272727, "grad_norm": 0.054203420877456665, "learning_rate": 
0.00018878763267763998, "loss": 0.0825, "step": 7461 }, { "epoch": 0.4823919191919192, "grad_norm": 0.0542246513068676, "learning_rate": 0.00018878448614370025, "loss": 0.0829, "step": 7462 }, { "epoch": 0.48245656565656564, "grad_norm": 0.060039062052965164, "learning_rate": 0.00018878133919454483, "loss": 0.0884, "step": 7463 }, { "epoch": 0.48252121212121213, "grad_norm": 0.05522818863391876, "learning_rate": 0.00018877819183018844, "loss": 0.0869, "step": 7464 }, { "epoch": 0.48258585858585856, "grad_norm": 0.057807646691799164, "learning_rate": 0.00018877504405064578, "loss": 0.0819, "step": 7465 }, { "epoch": 0.48265050505050505, "grad_norm": 0.0589180625975132, "learning_rate": 0.0001887718958559316, "loss": 0.0889, "step": 7466 }, { "epoch": 0.48271515151515154, "grad_norm": 0.059853896498680115, "learning_rate": 0.00018876874724606065, "loss": 0.0905, "step": 7467 }, { "epoch": 0.482779797979798, "grad_norm": 0.05839342996478081, "learning_rate": 0.00018876559822104757, "loss": 0.0855, "step": 7468 }, { "epoch": 0.48284444444444446, "grad_norm": 0.054131921380758286, "learning_rate": 0.0001887624487809071, "loss": 0.0823, "step": 7469 }, { "epoch": 0.4829090909090909, "grad_norm": 0.05482763797044754, "learning_rate": 0.00018875929892565407, "loss": 0.0785, "step": 7470 }, { "epoch": 0.4829737373737374, "grad_norm": 0.057354919612407684, "learning_rate": 0.00018875614865530312, "loss": 0.085, "step": 7471 }, { "epoch": 0.4830383838383838, "grad_norm": 0.06068718060851097, "learning_rate": 0.000188752997969869, "loss": 0.0924, "step": 7472 }, { "epoch": 0.4830383838383838, "eval_bleu": 16.281445245800096, "eval_loss": 0.09071114659309387, "eval_runtime": 2.7759, "eval_samples_per_second": 11.528, "eval_steps_per_second": 1.441, "step": 7472 }, { "epoch": 0.4831030303030303, "grad_norm": 0.06047188118100166, "learning_rate": 0.00018874984686936643, "loss": 0.092, "step": 7473 }, { "epoch": 0.48316767676767675, "grad_norm": 0.06361375004053116, 
"learning_rate": 0.0001887466953538102, "loss": 0.0907, "step": 7474 }, { "epoch": 0.48323232323232324, "grad_norm": 0.052549462765455246, "learning_rate": 0.000188743543423215, "loss": 0.0756, "step": 7475 }, { "epoch": 0.48329696969696967, "grad_norm": 0.08005545288324356, "learning_rate": 0.00018874039107759559, "loss": 0.0813, "step": 7476 }, { "epoch": 0.48336161616161616, "grad_norm": 0.06664285063743591, "learning_rate": 0.00018873723831696668, "loss": 0.1052, "step": 7477 }, { "epoch": 0.48342626262626265, "grad_norm": 0.05544440820813179, "learning_rate": 0.00018873408514134309, "loss": 0.0783, "step": 7478 }, { "epoch": 0.4834909090909091, "grad_norm": 0.06015217304229736, "learning_rate": 0.00018873093155073948, "loss": 0.0991, "step": 7479 }, { "epoch": 0.48355555555555557, "grad_norm": 0.06439178436994553, "learning_rate": 0.00018872777754517066, "loss": 0.0954, "step": 7480 }, { "epoch": 0.483620202020202, "grad_norm": 0.05751205235719681, "learning_rate": 0.00018872462312465133, "loss": 0.0847, "step": 7481 }, { "epoch": 0.4836848484848485, "grad_norm": 0.06328058242797852, "learning_rate": 0.00018872146828919633, "loss": 0.1025, "step": 7482 }, { "epoch": 0.48374949494949493, "grad_norm": 0.05646781250834465, "learning_rate": 0.00018871831303882028, "loss": 0.0759, "step": 7483 }, { "epoch": 0.4838141414141414, "grad_norm": 0.053624387830495834, "learning_rate": 0.00018871515737353805, "loss": 0.0799, "step": 7484 }, { "epoch": 0.48387878787878785, "grad_norm": 0.05667683109641075, "learning_rate": 0.00018871200129336435, "loss": 0.0754, "step": 7485 }, { "epoch": 0.48394343434343434, "grad_norm": 0.06401709467172623, "learning_rate": 0.00018870884479831396, "loss": 0.0998, "step": 7486 }, { "epoch": 0.48400808080808083, "grad_norm": 0.05780074745416641, "learning_rate": 0.00018870568788840163, "loss": 0.074, "step": 7487 }, { "epoch": 0.48407272727272727, "grad_norm": 0.0964619442820549, "learning_rate": 0.00018870253056364208, "loss": 0.0993, 
"step": 7488 }, { "epoch": 0.48407272727272727, "eval_bleu": 14.643369765529943, "eval_loss": 0.09093542397022247, "eval_runtime": 2.6748, "eval_samples_per_second": 11.963, "eval_steps_per_second": 1.495, "step": 7488 }, { "epoch": 0.48413737373737376, "grad_norm": 0.056897349655628204, "learning_rate": 0.00018869937282405016, "loss": 0.0907, "step": 7489 }, { "epoch": 0.4842020202020202, "grad_norm": 0.05163009092211723, "learning_rate": 0.00018869621466964057, "loss": 0.0768, "step": 7490 }, { "epoch": 0.4842666666666667, "grad_norm": 0.06215915083885193, "learning_rate": 0.00018869305610042813, "loss": 0.0928, "step": 7491 }, { "epoch": 0.4843313131313131, "grad_norm": 0.05661742761731148, "learning_rate": 0.00018868989711642758, "loss": 0.0793, "step": 7492 }, { "epoch": 0.4843959595959596, "grad_norm": 0.08293861895799637, "learning_rate": 0.0001886867377176537, "loss": 0.0868, "step": 7493 }, { "epoch": 0.48446060606060604, "grad_norm": 0.05247582495212555, "learning_rate": 0.00018868357790412127, "loss": 0.0782, "step": 7494 }, { "epoch": 0.4845252525252525, "grad_norm": 0.057859014719724655, "learning_rate": 0.00018868041767584507, "loss": 0.0762, "step": 7495 }, { "epoch": 0.484589898989899, "grad_norm": 0.05614588409662247, "learning_rate": 0.00018867725703283986, "loss": 0.075, "step": 7496 }, { "epoch": 0.48465454545454545, "grad_norm": 0.0573287159204483, "learning_rate": 0.00018867409597512046, "loss": 0.0743, "step": 7497 }, { "epoch": 0.48471919191919194, "grad_norm": 0.05666404590010643, "learning_rate": 0.0001886709345027016, "loss": 0.0728, "step": 7498 }, { "epoch": 0.4847838383838384, "grad_norm": 0.06275829672813416, "learning_rate": 0.0001886677726155981, "loss": 0.0861, "step": 7499 }, { "epoch": 0.48484848484848486, "grad_norm": 0.06108987703919411, "learning_rate": 0.00018866461031382474, "loss": 0.0876, "step": 7500 }, { "epoch": 0.4849131313131313, "grad_norm": 0.05813375487923622, "learning_rate": 0.00018866144759739634, "loss": 
0.0817, "step": 7501 }, { "epoch": 0.4849777777777778, "grad_norm": 0.06503874063491821, "learning_rate": 0.0001886582844663276, "loss": 0.099, "step": 7502 }, { "epoch": 0.4850424242424242, "grad_norm": 0.06651957333087921, "learning_rate": 0.0001886551209206334, "loss": 0.0962, "step": 7503 }, { "epoch": 0.4851070707070707, "grad_norm": 0.05186796188354492, "learning_rate": 0.00018865195696032855, "loss": 0.0697, "step": 7504 }, { "epoch": 0.4851070707070707, "eval_bleu": 13.715529214360517, "eval_loss": 0.09115578234195709, "eval_runtime": 2.7193, "eval_samples_per_second": 11.768, "eval_steps_per_second": 1.471, "step": 7504 }, { "epoch": 0.4851717171717172, "grad_norm": 0.06104163080453873, "learning_rate": 0.00018864879258542776, "loss": 0.082, "step": 7505 }, { "epoch": 0.48523636363636363, "grad_norm": 0.053498558700084686, "learning_rate": 0.00018864562779594588, "loss": 0.0765, "step": 7506 }, { "epoch": 0.4853010101010101, "grad_norm": 0.05497893691062927, "learning_rate": 0.00018864246259189772, "loss": 0.0845, "step": 7507 }, { "epoch": 0.48536565656565656, "grad_norm": 0.06394123286008835, "learning_rate": 0.00018863929697329805, "loss": 0.102, "step": 7508 }, { "epoch": 0.48543030303030305, "grad_norm": 0.05249600112438202, "learning_rate": 0.00018863613094016168, "loss": 0.0801, "step": 7509 }, { "epoch": 0.4854949494949495, "grad_norm": 0.057221561670303345, "learning_rate": 0.00018863296449250346, "loss": 0.071, "step": 7510 }, { "epoch": 0.48555959595959597, "grad_norm": 0.07538211345672607, "learning_rate": 0.00018862979763033813, "loss": 0.1139, "step": 7511 }, { "epoch": 0.4856242424242424, "grad_norm": 0.059072982519865036, "learning_rate": 0.00018862663035368055, "loss": 0.0894, "step": 7512 }, { "epoch": 0.4856888888888889, "grad_norm": 0.05306990444660187, "learning_rate": 0.00018862346266254555, "loss": 0.0846, "step": 7513 }, { "epoch": 0.4857535353535353, "grad_norm": 0.061334699392318726, "learning_rate": 0.00018862029455694786, 
"loss": 0.0869, "step": 7514 }, { "epoch": 0.4858181818181818, "grad_norm": 0.06370957940816879, "learning_rate": 0.00018861712603690243, "loss": 0.1031, "step": 7515 }, { "epoch": 0.4858828282828283, "grad_norm": 0.05715881288051605, "learning_rate": 0.00018861395710242395, "loss": 0.0842, "step": 7516 }, { "epoch": 0.48594747474747474, "grad_norm": 0.04821036383509636, "learning_rate": 0.00018861078775352726, "loss": 0.0608, "step": 7517 }, { "epoch": 0.48601212121212123, "grad_norm": 0.05528814345598221, "learning_rate": 0.00018860761799022725, "loss": 0.0749, "step": 7518 }, { "epoch": 0.48607676767676766, "grad_norm": 0.058606553822755814, "learning_rate": 0.0001886044478125387, "loss": 0.0907, "step": 7519 }, { "epoch": 0.48614141414141415, "grad_norm": 0.062225889414548874, "learning_rate": 0.0001886012772204764, "loss": 0.0912, "step": 7520 }, { "epoch": 0.48614141414141415, "eval_bleu": 16.22139576109295, "eval_loss": 0.09069855511188507, "eval_runtime": 2.6341, "eval_samples_per_second": 12.148, "eval_steps_per_second": 1.519, "step": 7520 }, { "epoch": 0.4862060606060606, "grad_norm": 0.07032915949821472, "learning_rate": 0.00018859810621405524, "loss": 0.1118, "step": 7521 }, { "epoch": 0.4862707070707071, "grad_norm": 0.062040820717811584, "learning_rate": 0.00018859493479329005, "loss": 0.0839, "step": 7522 }, { "epoch": 0.4863353535353535, "grad_norm": 0.05863143876194954, "learning_rate": 0.00018859176295819562, "loss": 0.0941, "step": 7523 }, { "epoch": 0.4864, "grad_norm": 0.0599319264292717, "learning_rate": 0.0001885885907087868, "loss": 0.0807, "step": 7524 }, { "epoch": 0.4864646464646465, "grad_norm": 0.061720073223114014, "learning_rate": 0.00018858541804507846, "loss": 0.0757, "step": 7525 }, { "epoch": 0.4865292929292929, "grad_norm": 0.07385706901550293, "learning_rate": 0.00018858224496708537, "loss": 0.1042, "step": 7526 }, { "epoch": 0.4865939393939394, "grad_norm": 0.057300299406051636, "learning_rate": 0.00018857907147482242, "loss": 
0.0764, "step": 7527 }, { "epoch": 0.48665858585858585, "grad_norm": 0.060272958129644394, "learning_rate": 0.00018857589756830442, "loss": 0.0847, "step": 7528 }, { "epoch": 0.48672323232323234, "grad_norm": 0.05285350978374481, "learning_rate": 0.00018857272324754624, "loss": 0.0791, "step": 7529 }, { "epoch": 0.48678787878787877, "grad_norm": 0.05893218517303467, "learning_rate": 0.0001885695485125627, "loss": 0.0846, "step": 7530 }, { "epoch": 0.48685252525252526, "grad_norm": 0.08663883805274963, "learning_rate": 0.0001885663733633687, "loss": 0.1245, "step": 7531 }, { "epoch": 0.4869171717171717, "grad_norm": 0.06049446761608124, "learning_rate": 0.00018856319779997902, "loss": 0.0955, "step": 7532 }, { "epoch": 0.4869818181818182, "grad_norm": 0.05603338032960892, "learning_rate": 0.00018856002182240857, "loss": 0.0818, "step": 7533 }, { "epoch": 0.48704646464646467, "grad_norm": 0.06434468179941177, "learning_rate": 0.00018855684543067213, "loss": 0.0934, "step": 7534 }, { "epoch": 0.4871111111111111, "grad_norm": 0.04910537973046303, "learning_rate": 0.00018855366862478463, "loss": 0.0672, "step": 7535 }, { "epoch": 0.4871757575757576, "grad_norm": 0.050021927803754807, "learning_rate": 0.0001885504914047609, "loss": 0.0759, "step": 7536 }, { "epoch": 0.4871757575757576, "eval_bleu": 18.925336443380704, "eval_loss": 0.09077443927526474, "eval_runtime": 2.6256, "eval_samples_per_second": 12.188, "eval_steps_per_second": 1.523, "step": 7536 }, { "epoch": 0.48724040404040403, "grad_norm": 0.06381462514400482, "learning_rate": 0.00018854731377061578, "loss": 0.0856, "step": 7537 }, { "epoch": 0.4873050505050505, "grad_norm": 0.06001684069633484, "learning_rate": 0.00018854413572236416, "loss": 0.0859, "step": 7538 }, { "epoch": 0.48736969696969695, "grad_norm": 0.06264454871416092, "learning_rate": 0.0001885409572600209, "loss": 0.0967, "step": 7539 }, { "epoch": 0.48743434343434344, "grad_norm": 0.05649297311902046, "learning_rate": 0.00018853777838360083, 
"loss": 0.0803, "step": 7540 }, { "epoch": 0.4874989898989899, "grad_norm": 0.060657911002635956, "learning_rate": 0.00018853459909311886, "loss": 0.0958, "step": 7541 }, { "epoch": 0.48756363636363637, "grad_norm": 0.05793769657611847, "learning_rate": 0.00018853141938858983, "loss": 0.0907, "step": 7542 }, { "epoch": 0.48762828282828286, "grad_norm": 0.06151202321052551, "learning_rate": 0.00018852823927002863, "loss": 0.0831, "step": 7543 }, { "epoch": 0.4876929292929293, "grad_norm": 0.058094728738069534, "learning_rate": 0.0001885250587374501, "loss": 0.0887, "step": 7544 }, { "epoch": 0.4877575757575758, "grad_norm": 0.0728970393538475, "learning_rate": 0.00018852187779086913, "loss": 0.0963, "step": 7545 }, { "epoch": 0.4878222222222222, "grad_norm": 0.059648651629686356, "learning_rate": 0.00018851869643030065, "loss": 0.0942, "step": 7546 }, { "epoch": 0.4878868686868687, "grad_norm": 0.05597603693604469, "learning_rate": 0.00018851551465575946, "loss": 0.0739, "step": 7547 }, { "epoch": 0.48795151515151514, "grad_norm": 0.05513569712638855, "learning_rate": 0.00018851233246726046, "loss": 0.0906, "step": 7548 }, { "epoch": 0.4880161616161616, "grad_norm": 0.060769833624362946, "learning_rate": 0.00018850914986481856, "loss": 0.09, "step": 7549 }, { "epoch": 0.48808080808080806, "grad_norm": 0.05272754281759262, "learning_rate": 0.0001885059668484486, "loss": 0.0782, "step": 7550 }, { "epoch": 0.48814545454545455, "grad_norm": 0.05979376286268234, "learning_rate": 0.00018850278341816552, "loss": 0.0859, "step": 7551 }, { "epoch": 0.488210101010101, "grad_norm": 0.052101437002420425, "learning_rate": 0.00018849959957398417, "loss": 0.0759, "step": 7552 }, { "epoch": 0.488210101010101, "eval_bleu": 18.7494458848749, "eval_loss": 0.09098012745380402, "eval_runtime": 2.6156, "eval_samples_per_second": 12.234, "eval_steps_per_second": 1.529, "step": 7552 }, { "epoch": 0.4882747474747475, "grad_norm": 0.06320875138044357, "learning_rate": 0.00018849641531591945, 
"loss": 0.0963, "step": 7553 }, { "epoch": 0.48833939393939396, "grad_norm": 0.046928223222494125, "learning_rate": 0.00018849323064398625, "loss": 0.0635, "step": 7554 }, { "epoch": 0.4884040404040404, "grad_norm": 0.04965583235025406, "learning_rate": 0.0001884900455581995, "loss": 0.072, "step": 7555 }, { "epoch": 0.4884686868686869, "grad_norm": 0.06004434451460838, "learning_rate": 0.000188486860058574, "loss": 0.0878, "step": 7556 }, { "epoch": 0.4885333333333333, "grad_norm": 0.06365232169628143, "learning_rate": 0.00018848367414512477, "loss": 0.0919, "step": 7557 }, { "epoch": 0.4885979797979798, "grad_norm": 0.05433553084731102, "learning_rate": 0.0001884804878178666, "loss": 0.0859, "step": 7558 }, { "epoch": 0.48866262626262624, "grad_norm": 0.05674424022436142, "learning_rate": 0.00018847730107681449, "loss": 0.0826, "step": 7559 }, { "epoch": 0.48872727272727273, "grad_norm": 0.0618436299264431, "learning_rate": 0.00018847411392198323, "loss": 0.1003, "step": 7560 }, { "epoch": 0.48879191919191917, "grad_norm": 0.057236362248659134, "learning_rate": 0.00018847092635338784, "loss": 0.0869, "step": 7561 }, { "epoch": 0.48885656565656566, "grad_norm": 0.05457776039838791, "learning_rate": 0.00018846773837104315, "loss": 0.0871, "step": 7562 }, { "epoch": 0.48892121212121215, "grad_norm": 0.049727875739336014, "learning_rate": 0.00018846454997496407, "loss": 0.0742, "step": 7563 }, { "epoch": 0.4889858585858586, "grad_norm": 0.04133527725934982, "learning_rate": 0.00018846136116516557, "loss": 0.0607, "step": 7564 }, { "epoch": 0.48905050505050507, "grad_norm": 0.07601038366556168, "learning_rate": 0.00018845817194166253, "loss": 0.1065, "step": 7565 }, { "epoch": 0.4891151515151515, "grad_norm": 0.04899442940950394, "learning_rate": 0.00018845498230446984, "loss": 0.0753, "step": 7566 }, { "epoch": 0.489179797979798, "grad_norm": 0.05743100121617317, "learning_rate": 0.00018845179225360247, "loss": 0.0909, "step": 7567 }, { "epoch": 0.4892444444444444, 
"grad_norm": 0.050940513610839844, "learning_rate": 0.00018844860178907527, "loss": 0.0745, "step": 7568 }, { "epoch": 0.4892444444444444, "eval_bleu": 15.990045611340395, "eval_loss": 0.09231633692979813, "eval_runtime": 2.6202, "eval_samples_per_second": 12.213, "eval_steps_per_second": 1.527, "step": 7568 }, { "epoch": 0.4893090909090909, "grad_norm": 0.058533474802970886, "learning_rate": 0.00018844541091090323, "loss": 0.0935, "step": 7569 }, { "epoch": 0.48937373737373735, "grad_norm": 0.052270032465457916, "learning_rate": 0.00018844221961910126, "loss": 0.0759, "step": 7570 }, { "epoch": 0.48943838383838384, "grad_norm": 0.052158571779727936, "learning_rate": 0.00018843902791368424, "loss": 0.0746, "step": 7571 }, { "epoch": 0.48950303030303033, "grad_norm": 0.05481791868805885, "learning_rate": 0.00018843583579466713, "loss": 0.0706, "step": 7572 }, { "epoch": 0.48956767676767676, "grad_norm": 0.057583630084991455, "learning_rate": 0.00018843264326206486, "loss": 0.0876, "step": 7573 }, { "epoch": 0.48963232323232325, "grad_norm": 0.06106598302721977, "learning_rate": 0.00018842945031589234, "loss": 0.0944, "step": 7574 }, { "epoch": 0.4896969696969697, "grad_norm": 0.06017839536070824, "learning_rate": 0.00018842625695616452, "loss": 0.0979, "step": 7575 }, { "epoch": 0.4897616161616162, "grad_norm": 0.05476900562644005, "learning_rate": 0.00018842306318289634, "loss": 0.0825, "step": 7576 }, { "epoch": 0.4898262626262626, "grad_norm": 0.05828084051609039, "learning_rate": 0.00018841986899610272, "loss": 0.0885, "step": 7577 }, { "epoch": 0.4898909090909091, "grad_norm": 0.06722602993249893, "learning_rate": 0.0001884166743957986, "loss": 0.0908, "step": 7578 }, { "epoch": 0.48995555555555553, "grad_norm": 0.05690741539001465, "learning_rate": 0.00018841347938199893, "loss": 0.0792, "step": 7579 }, { "epoch": 0.490020202020202, "grad_norm": 0.06266800314188004, "learning_rate": 0.0001884102839547186, "loss": 0.0953, "step": 7580 }, { "epoch": 
0.4900848484848485, "grad_norm": 0.06460393220186234, "learning_rate": 0.00018840708811397266, "loss": 0.0839, "step": 7581 }, { "epoch": 0.49014949494949495, "grad_norm": 0.060007575899362564, "learning_rate": 0.00018840389185977596, "loss": 0.0759, "step": 7582 }, { "epoch": 0.49021414141414144, "grad_norm": 0.05185353383421898, "learning_rate": 0.0001884006951921435, "loss": 0.0785, "step": 7583 }, { "epoch": 0.49027878787878787, "grad_norm": 0.05502147972583771, "learning_rate": 0.00018839749811109022, "loss": 0.0895, "step": 7584 }, { "epoch": 0.49027878787878787, "eval_bleu": 15.616823521972465, "eval_loss": 0.09264984726905823, "eval_runtime": 2.5976, "eval_samples_per_second": 12.319, "eval_steps_per_second": 1.54, "step": 7584 }, { "epoch": 0.49034343434343436, "grad_norm": 0.05906379967927933, "learning_rate": 0.00018839430061663105, "loss": 0.0866, "step": 7585 }, { "epoch": 0.4904080808080808, "grad_norm": 0.060299333184957504, "learning_rate": 0.00018839110270878098, "loss": 0.0878, "step": 7586 }, { "epoch": 0.4904727272727273, "grad_norm": 0.06496629863977432, "learning_rate": 0.00018838790438755493, "loss": 0.0956, "step": 7587 }, { "epoch": 0.4905373737373737, "grad_norm": 0.05506758391857147, "learning_rate": 0.00018838470565296785, "loss": 0.0785, "step": 7588 }, { "epoch": 0.4906020202020202, "grad_norm": 0.062295593321323395, "learning_rate": 0.00018838150650503474, "loss": 0.0857, "step": 7589 }, { "epoch": 0.49066666666666664, "grad_norm": 0.055233750492334366, "learning_rate": 0.00018837830694377056, "loss": 0.0853, "step": 7590 }, { "epoch": 0.49073131313131313, "grad_norm": 0.05841900408267975, "learning_rate": 0.00018837510696919022, "loss": 0.0813, "step": 7591 }, { "epoch": 0.4907959595959596, "grad_norm": 0.06589291989803314, "learning_rate": 0.00018837190658130878, "loss": 0.0963, "step": 7592 }, { "epoch": 0.49086060606060605, "grad_norm": 0.06322571635246277, "learning_rate": 0.00018836870578014107, "loss": 0.0809, "step": 7593 }, { 
"epoch": 0.49092525252525254, "grad_norm": 0.10294733941555023, "learning_rate": 0.0001883655045657022, "loss": 0.0789, "step": 7594 }, { "epoch": 0.490989898989899, "grad_norm": 0.05360643193125725, "learning_rate": 0.00018836230293800706, "loss": 0.0812, "step": 7595 }, { "epoch": 0.49105454545454547, "grad_norm": 0.05013853311538696, "learning_rate": 0.0001883591008970706, "loss": 0.0747, "step": 7596 }, { "epoch": 0.4911191919191919, "grad_norm": 0.05648389086127281, "learning_rate": 0.00018835589844290787, "loss": 0.084, "step": 7597 }, { "epoch": 0.4911838383838384, "grad_norm": 0.05842467397451401, "learning_rate": 0.00018835269557553383, "loss": 0.0923, "step": 7598 }, { "epoch": 0.4912484848484848, "grad_norm": 0.05728521570563316, "learning_rate": 0.00018834949229496343, "loss": 0.0855, "step": 7599 }, { "epoch": 0.4913131313131313, "grad_norm": 0.05778517946600914, "learning_rate": 0.00018834628860121163, "loss": 0.0759, "step": 7600 }, { "epoch": 0.4913131313131313, "eval_bleu": 14.294307712341261, "eval_loss": 0.09230908751487732, "eval_runtime": 2.7279, "eval_samples_per_second": 11.731, "eval_steps_per_second": 1.466, "step": 7600 }, { "epoch": 0.4913777777777778, "grad_norm": 0.04904454946517944, "learning_rate": 0.0001883430844942935, "loss": 0.0725, "step": 7601 }, { "epoch": 0.49144242424242424, "grad_norm": 0.059577006846666336, "learning_rate": 0.00018833987997422393, "loss": 0.0825, "step": 7602 }, { "epoch": 0.4915070707070707, "grad_norm": 0.06565333157777786, "learning_rate": 0.00018833667504101795, "loss": 0.1056, "step": 7603 }, { "epoch": 0.49157171717171716, "grad_norm": 0.07160551846027374, "learning_rate": 0.00018833346969469056, "loss": 0.1044, "step": 7604 }, { "epoch": 0.49163636363636365, "grad_norm": 0.06122187525033951, "learning_rate": 0.00018833026393525671, "loss": 0.0776, "step": 7605 }, { "epoch": 0.4917010101010101, "grad_norm": 0.05413959175348282, "learning_rate": 0.00018832705776273146, "loss": 0.076, "step": 7606 }, { 
"epoch": 0.4917656565656566, "grad_norm": 0.0634588971734047, "learning_rate": 0.00018832385117712973, "loss": 0.0859, "step": 7607 }, { "epoch": 0.491830303030303, "grad_norm": 0.0650792345404625, "learning_rate": 0.00018832064417846655, "loss": 0.0913, "step": 7608 }, { "epoch": 0.4918949494949495, "grad_norm": 0.05421821400523186, "learning_rate": 0.00018831743676675692, "loss": 0.0789, "step": 7609 }, { "epoch": 0.491959595959596, "grad_norm": 0.07067961245775223, "learning_rate": 0.00018831422894201584, "loss": 0.1025, "step": 7610 }, { "epoch": 0.4920242424242424, "grad_norm": 0.06062060594558716, "learning_rate": 0.0001883110207042583, "loss": 0.0894, "step": 7611 }, { "epoch": 0.4920888888888889, "grad_norm": 0.06420599669218063, "learning_rate": 0.0001883078120534993, "loss": 0.0952, "step": 7612 }, { "epoch": 0.49215353535353534, "grad_norm": 0.05648969113826752, "learning_rate": 0.00018830460298975386, "loss": 0.0772, "step": 7613 }, { "epoch": 0.49221818181818183, "grad_norm": 0.05778888985514641, "learning_rate": 0.00018830139351303703, "loss": 0.0839, "step": 7614 }, { "epoch": 0.49228282828282827, "grad_norm": 0.064747154712677, "learning_rate": 0.0001882981836233637, "loss": 0.0836, "step": 7615 }, { "epoch": 0.49234747474747476, "grad_norm": 0.06549056619405746, "learning_rate": 0.00018829497332074901, "loss": 0.0892, "step": 7616 }, { "epoch": 0.49234747474747476, "eval_bleu": 14.359031981025039, "eval_loss": 0.09173575043678284, "eval_runtime": 2.6314, "eval_samples_per_second": 12.161, "eval_steps_per_second": 1.52, "step": 7616 }, { "epoch": 0.4924121212121212, "grad_norm": 0.05367930233478546, "learning_rate": 0.00018829176260520792, "loss": 0.0746, "step": 7617 }, { "epoch": 0.4924767676767677, "grad_norm": 0.05850067362189293, "learning_rate": 0.00018828855147675543, "loss": 0.0849, "step": 7618 }, { "epoch": 0.49254141414141417, "grad_norm": 0.061516549438238144, "learning_rate": 0.00018828533993540657, "loss": 0.0872, "step": 7619 }, { 
"epoch": 0.4926060606060606, "grad_norm": 0.04922865331172943, "learning_rate": 0.00018828212798117634, "loss": 0.0707, "step": 7620 }, { "epoch": 0.4926707070707071, "grad_norm": 0.05580343306064606, "learning_rate": 0.0001882789156140798, "loss": 0.0716, "step": 7621 }, { "epoch": 0.4927353535353535, "grad_norm": 0.05793207883834839, "learning_rate": 0.000188275702834132, "loss": 0.0845, "step": 7622 }, { "epoch": 0.4928, "grad_norm": 0.05385265499353409, "learning_rate": 0.00018827248964134788, "loss": 0.0787, "step": 7623 }, { "epoch": 0.49286464646464645, "grad_norm": 0.061909113079309464, "learning_rate": 0.0001882692760357425, "loss": 0.0892, "step": 7624 }, { "epoch": 0.49292929292929294, "grad_norm": 0.05222504958510399, "learning_rate": 0.0001882660620173309, "loss": 0.076, "step": 7625 }, { "epoch": 0.4929939393939394, "grad_norm": 0.057761840522289276, "learning_rate": 0.00018826284758612812, "loss": 0.0873, "step": 7626 }, { "epoch": 0.49305858585858586, "grad_norm": 0.05607140064239502, "learning_rate": 0.00018825963274214918, "loss": 0.0856, "step": 7627 }, { "epoch": 0.4931232323232323, "grad_norm": 0.04745960980653763, "learning_rate": 0.00018825641748540913, "loss": 0.0564, "step": 7628 }, { "epoch": 0.4931878787878788, "grad_norm": 0.05524513125419617, "learning_rate": 0.00018825320181592296, "loss": 0.0851, "step": 7629 }, { "epoch": 0.4932525252525253, "grad_norm": 0.06350874155759811, "learning_rate": 0.00018824998573370577, "loss": 0.0957, "step": 7630 }, { "epoch": 0.4933171717171717, "grad_norm": 0.05841444060206413, "learning_rate": 0.00018824676923877255, "loss": 0.0874, "step": 7631 }, { "epoch": 0.4933818181818182, "grad_norm": 0.05571497231721878, "learning_rate": 0.00018824355233113837, "loss": 0.0823, "step": 7632 }, { "epoch": 0.4933818181818182, "eval_bleu": 16.83254157056784, "eval_loss": 0.09152859449386597, "eval_runtime": 2.5701, "eval_samples_per_second": 12.451, "eval_steps_per_second": 1.556, "step": 7632 }, { "epoch": 
0.49344646464646463, "grad_norm": 0.05417928099632263, "learning_rate": 0.00018824033501081827, "loss": 0.0747, "step": 7633 }, { "epoch": 0.4935111111111111, "grad_norm": 0.05084426328539848, "learning_rate": 0.0001882371172778273, "loss": 0.0706, "step": 7634 }, { "epoch": 0.49357575757575756, "grad_norm": 0.05547347664833069, "learning_rate": 0.0001882338991321805, "loss": 0.0762, "step": 7635 }, { "epoch": 0.49364040404040405, "grad_norm": 0.05764460191130638, "learning_rate": 0.0001882306805738929, "loss": 0.0848, "step": 7636 }, { "epoch": 0.4937050505050505, "grad_norm": 0.05144386738538742, "learning_rate": 0.0001882274616029796, "loss": 0.0729, "step": 7637 }, { "epoch": 0.49376969696969697, "grad_norm": 0.06622016429901123, "learning_rate": 0.00018822424221945564, "loss": 0.1023, "step": 7638 }, { "epoch": 0.49383434343434346, "grad_norm": 0.06727254390716553, "learning_rate": 0.00018822102242333605, "loss": 0.0984, "step": 7639 }, { "epoch": 0.4938989898989899, "grad_norm": 0.057015858590602875, "learning_rate": 0.0001882178022146359, "loss": 0.0779, "step": 7640 }, { "epoch": 0.4939636363636364, "grad_norm": 0.06399842351675034, "learning_rate": 0.00018821458159337025, "loss": 0.1051, "step": 7641 }, { "epoch": 0.4940282828282828, "grad_norm": 0.04956638067960739, "learning_rate": 0.00018821136055955417, "loss": 0.0687, "step": 7642 }, { "epoch": 0.4940929292929293, "grad_norm": 0.04984515905380249, "learning_rate": 0.0001882081391132027, "loss": 0.08, "step": 7643 }, { "epoch": 0.49415757575757574, "grad_norm": 0.061256829649209976, "learning_rate": 0.00018820491725433098, "loss": 0.0962, "step": 7644 }, { "epoch": 0.49422222222222223, "grad_norm": 0.0537383034825325, "learning_rate": 0.00018820169498295398, "loss": 0.0814, "step": 7645 }, { "epoch": 0.49428686868686866, "grad_norm": 0.055893201380968094, "learning_rate": 0.00018819847229908682, "loss": 0.0918, "step": 7646 }, { "epoch": 0.49435151515151515, "grad_norm": 0.051569223403930664, 
"learning_rate": 0.0001881952492027446, "loss": 0.0775, "step": 7647 }, { "epoch": 0.49441616161616164, "grad_norm": 0.059239521622657776, "learning_rate": 0.00018819202569394228, "loss": 0.0987, "step": 7648 }, { "epoch": 0.49441616161616164, "eval_bleu": 16.545630020079766, "eval_loss": 0.09224047511816025, "eval_runtime": 2.7213, "eval_samples_per_second": 11.759, "eval_steps_per_second": 1.47, "step": 7648 }, { "epoch": 0.4944808080808081, "grad_norm": 0.08846689760684967, "learning_rate": 0.00018818880177269508, "loss": 0.0896, "step": 7649 }, { "epoch": 0.49454545454545457, "grad_norm": 0.08112302422523499, "learning_rate": 0.00018818557743901796, "loss": 0.0808, "step": 7650 }, { "epoch": 0.494610101010101, "grad_norm": 0.059747908264398575, "learning_rate": 0.0001881823526929261, "loss": 0.0912, "step": 7651 }, { "epoch": 0.4946747474747475, "grad_norm": 0.06174100190401077, "learning_rate": 0.00018817912753443447, "loss": 0.0798, "step": 7652 }, { "epoch": 0.4947393939393939, "grad_norm": 0.055235542356967926, "learning_rate": 0.00018817590196355824, "loss": 0.082, "step": 7653 }, { "epoch": 0.4948040404040404, "grad_norm": 0.0679246336221695, "learning_rate": 0.00018817267598031248, "loss": 0.0881, "step": 7654 }, { "epoch": 0.49486868686868685, "grad_norm": 0.04852352663874626, "learning_rate": 0.00018816944958471225, "loss": 0.0671, "step": 7655 }, { "epoch": 0.49493333333333334, "grad_norm": 0.05211624130606651, "learning_rate": 0.00018816622277677264, "loss": 0.0793, "step": 7656 }, { "epoch": 0.4949979797979798, "grad_norm": 0.054555222392082214, "learning_rate": 0.00018816299555650878, "loss": 0.0749, "step": 7657 }, { "epoch": 0.49506262626262626, "grad_norm": 0.056283045560121536, "learning_rate": 0.0001881597679239357, "loss": 0.083, "step": 7658 }, { "epoch": 0.49512727272727275, "grad_norm": 0.07266935706138611, "learning_rate": 0.00018815653987906857, "loss": 0.0892, "step": 7659 }, { "epoch": 0.4951919191919192, "grad_norm": 
0.06044600158929825, "learning_rate": 0.00018815331142192244, "loss": 0.0824, "step": 7660 }, { "epoch": 0.4952565656565657, "grad_norm": 0.05007245019078255, "learning_rate": 0.0001881500825525124, "loss": 0.0722, "step": 7661 }, { "epoch": 0.4953212121212121, "grad_norm": 0.05531005933880806, "learning_rate": 0.00018814685327085356, "loss": 0.0727, "step": 7662 }, { "epoch": 0.4953858585858586, "grad_norm": 0.06071164458990097, "learning_rate": 0.00018814362357696105, "loss": 0.0898, "step": 7663 }, { "epoch": 0.49545050505050503, "grad_norm": 0.05750046297907829, "learning_rate": 0.00018814039347084992, "loss": 0.0866, "step": 7664 }, { "epoch": 0.49545050505050503, "eval_bleu": 16.19619504909173, "eval_loss": 0.09231750667095184, "eval_runtime": 2.6055, "eval_samples_per_second": 12.282, "eval_steps_per_second": 1.535, "step": 7664 }, { "epoch": 0.4955151515151515, "grad_norm": 0.05643800273537636, "learning_rate": 0.00018813716295253534, "loss": 0.0805, "step": 7665 }, { "epoch": 0.49557979797979795, "grad_norm": 0.0700121596455574, "learning_rate": 0.00018813393202203238, "loss": 0.0885, "step": 7666 }, { "epoch": 0.49564444444444444, "grad_norm": 0.05989697575569153, "learning_rate": 0.00018813070067935615, "loss": 0.0875, "step": 7667 }, { "epoch": 0.49570909090909093, "grad_norm": 0.04936042055487633, "learning_rate": 0.00018812746892452178, "loss": 0.0753, "step": 7668 }, { "epoch": 0.49577373737373737, "grad_norm": 0.05556981638073921, "learning_rate": 0.00018812423675754433, "loss": 0.0827, "step": 7669 }, { "epoch": 0.49583838383838386, "grad_norm": 0.054676543921232224, "learning_rate": 0.000188121004178439, "loss": 0.0839, "step": 7670 }, { "epoch": 0.4959030303030303, "grad_norm": 0.05256362631917, "learning_rate": 0.00018811777118722084, "loss": 0.0886, "step": 7671 }, { "epoch": 0.4959676767676768, "grad_norm": 0.04933900013566017, "learning_rate": 0.00018811453778390505, "loss": 0.0722, "step": 7672 }, { "epoch": 0.4960323232323232, "grad_norm": 
0.05733026564121246, "learning_rate": 0.00018811130396850664, "loss": 0.0852, "step": 7673 }, { "epoch": 0.4960969696969697, "grad_norm": 0.05735313519835472, "learning_rate": 0.0001881080697410408, "loss": 0.0849, "step": 7674 }, { "epoch": 0.49616161616161614, "grad_norm": 0.048303596675395966, "learning_rate": 0.00018810483510152266, "loss": 0.0661, "step": 7675 }, { "epoch": 0.4962262626262626, "grad_norm": 0.05413429066538811, "learning_rate": 0.00018810160004996735, "loss": 0.0806, "step": 7676 }, { "epoch": 0.4962909090909091, "grad_norm": 0.05520182475447655, "learning_rate": 0.00018809836458638993, "loss": 0.0825, "step": 7677 }, { "epoch": 0.49635555555555555, "grad_norm": 0.05886749550700188, "learning_rate": 0.00018809512871080563, "loss": 0.0896, "step": 7678 }, { "epoch": 0.49642020202020204, "grad_norm": 0.06321075558662415, "learning_rate": 0.0001880918924232295, "loss": 0.0998, "step": 7679 }, { "epoch": 0.4964848484848485, "grad_norm": 0.05824482813477516, "learning_rate": 0.00018808865572367676, "loss": 0.0838, "step": 7680 }, { "epoch": 0.4964848484848485, "eval_bleu": 17.19313505242103, "eval_loss": 0.09271492809057236, "eval_runtime": 2.699, "eval_samples_per_second": 11.856, "eval_steps_per_second": 1.482, "step": 7680 }, { "epoch": 0.49654949494949496, "grad_norm": 0.05971664562821388, "learning_rate": 0.00018808541861216246, "loss": 0.0871, "step": 7681 }, { "epoch": 0.4966141414141414, "grad_norm": 0.05990976840257645, "learning_rate": 0.0001880821810887018, "loss": 0.0734, "step": 7682 }, { "epoch": 0.4966787878787879, "grad_norm": 0.04958893731236458, "learning_rate": 0.00018807894315330987, "loss": 0.0715, "step": 7683 }, { "epoch": 0.4967434343434343, "grad_norm": 0.05134095624089241, "learning_rate": 0.00018807570480600187, "loss": 0.072, "step": 7684 }, { "epoch": 0.4968080808080808, "grad_norm": 0.0607442706823349, "learning_rate": 0.00018807246604679288, "loss": 0.0861, "step": 7685 }, { "epoch": 0.4968727272727273, "grad_norm": 
0.06391191482543945, "learning_rate": 0.00018806922687569812, "loss": 0.097, "step": 7686 }, { "epoch": 0.49693737373737373, "grad_norm": 0.05182332172989845, "learning_rate": 0.0001880659872927327, "loss": 0.0738, "step": 7687 }, { "epoch": 0.4970020202020202, "grad_norm": 0.05422450602054596, "learning_rate": 0.00018806274729791175, "loss": 0.0778, "step": 7688 }, { "epoch": 0.49706666666666666, "grad_norm": 0.05565275996923447, "learning_rate": 0.00018805950689125046, "loss": 0.0812, "step": 7689 }, { "epoch": 0.49713131313131315, "grad_norm": 0.0784100666642189, "learning_rate": 0.00018805626607276398, "loss": 0.0749, "step": 7690 }, { "epoch": 0.4971959595959596, "grad_norm": 0.05967739224433899, "learning_rate": 0.0001880530248424674, "loss": 0.0826, "step": 7691 }, { "epoch": 0.49726060606060607, "grad_norm": 0.06026720255613327, "learning_rate": 0.00018804978320037597, "loss": 0.0898, "step": 7692 }, { "epoch": 0.4973252525252525, "grad_norm": 0.05645086616277695, "learning_rate": 0.0001880465411465048, "loss": 0.0855, "step": 7693 }, { "epoch": 0.497389898989899, "grad_norm": 0.052412744611501694, "learning_rate": 0.0001880432986808691, "loss": 0.0688, "step": 7694 }, { "epoch": 0.4974545454545454, "grad_norm": 0.05509515106678009, "learning_rate": 0.00018804005580348395, "loss": 0.0786, "step": 7695 }, { "epoch": 0.4975191919191919, "grad_norm": 0.053978271782398224, "learning_rate": 0.00018803681251436461, "loss": 0.0727, "step": 7696 }, { "epoch": 0.4975191919191919, "eval_bleu": 17.397325150045752, "eval_loss": 0.09104049205780029, "eval_runtime": 2.7264, "eval_samples_per_second": 11.737, "eval_steps_per_second": 1.467, "step": 7696 }, { "epoch": 0.4975838383838384, "grad_norm": 0.05858197435736656, "learning_rate": 0.0001880335688135262, "loss": 0.0808, "step": 7697 }, { "epoch": 0.49764848484848484, "grad_norm": 0.059663400053977966, "learning_rate": 0.00018803032470098385, "loss": 0.0925, "step": 7698 }, { "epoch": 0.49771313131313133, "grad_norm": 
0.06857690215110779, "learning_rate": 0.00018802708017675278, "loss": 0.0936, "step": 7699 }, { "epoch": 0.49777777777777776, "grad_norm": 0.059494245797395706, "learning_rate": 0.0001880238352408482, "loss": 0.0821, "step": 7700 }, { "epoch": 0.49784242424242425, "grad_norm": 0.22679895162582397, "learning_rate": 0.00018802058989328518, "loss": 0.1022, "step": 7701 }, { "epoch": 0.4979070707070707, "grad_norm": 0.06411238759756088, "learning_rate": 0.00018801734413407902, "loss": 0.0937, "step": 7702 }, { "epoch": 0.4979717171717172, "grad_norm": 0.06528118997812271, "learning_rate": 0.0001880140979632448, "loss": 0.1026, "step": 7703 }, { "epoch": 0.4980363636363636, "grad_norm": 0.055078305304050446, "learning_rate": 0.00018801085138079777, "loss": 0.076, "step": 7704 }, { "epoch": 0.4981010101010101, "grad_norm": 0.05819091573357582, "learning_rate": 0.0001880076043867531, "loss": 0.0734, "step": 7705 }, { "epoch": 0.4981656565656566, "grad_norm": 0.047112759202718735, "learning_rate": 0.00018800435698112592, "loss": 0.068, "step": 7706 }, { "epoch": 0.498230303030303, "grad_norm": 0.060769032686948776, "learning_rate": 0.00018800110916393145, "loss": 0.0773, "step": 7707 }, { "epoch": 0.4982949494949495, "grad_norm": 0.048682019114494324, "learning_rate": 0.0001879978609351849, "loss": 0.0728, "step": 7708 }, { "epoch": 0.49835959595959595, "grad_norm": 0.058563727885484695, "learning_rate": 0.00018799461229490144, "loss": 0.0894, "step": 7709 }, { "epoch": 0.49842424242424244, "grad_norm": 0.05101766809821129, "learning_rate": 0.00018799136324309628, "loss": 0.0796, "step": 7710 }, { "epoch": 0.49848888888888887, "grad_norm": 0.0659051239490509, "learning_rate": 0.0001879881137797846, "loss": 0.1066, "step": 7711 }, { "epoch": 0.49855353535353536, "grad_norm": 0.053849659860134125, "learning_rate": 0.0001879848639049816, "loss": 0.0771, "step": 7712 }, { "epoch": 0.49855353535353536, "eval_bleu": 18.191837638054285, "eval_loss": 0.09094206243753433, 
"eval_runtime": 2.7063, "eval_samples_per_second": 11.824, "eval_steps_per_second": 1.478, "step": 7712 }, { "epoch": 0.4986181818181818, "grad_norm": 0.05895485356450081, "learning_rate": 0.00018798161361870247, "loss": 0.0878, "step": 7713 }, { "epoch": 0.4986828282828283, "grad_norm": 0.056831955909729004, "learning_rate": 0.00018797836292096243, "loss": 0.0847, "step": 7714 }, { "epoch": 0.4987474747474748, "grad_norm": 0.05916425958275795, "learning_rate": 0.00018797511181177666, "loss": 0.0949, "step": 7715 }, { "epoch": 0.4988121212121212, "grad_norm": 0.04867985472083092, "learning_rate": 0.00018797186029116038, "loss": 0.0603, "step": 7716 }, { "epoch": 0.4988767676767677, "grad_norm": 0.07323768734931946, "learning_rate": 0.0001879686083591288, "loss": 0.1045, "step": 7717 }, { "epoch": 0.49894141414141413, "grad_norm": 0.054140061140060425, "learning_rate": 0.00018796535601569713, "loss": 0.0805, "step": 7718 }, { "epoch": 0.4990060606060606, "grad_norm": 0.05138729512691498, "learning_rate": 0.00018796210326088057, "loss": 0.0758, "step": 7719 }, { "epoch": 0.49907070707070705, "grad_norm": 0.052113112062215805, "learning_rate": 0.00018795885009469432, "loss": 0.072, "step": 7720 }, { "epoch": 0.49913535353535354, "grad_norm": 0.05622563883662224, "learning_rate": 0.0001879555965171536, "loss": 0.086, "step": 7721 }, { "epoch": 0.4992, "grad_norm": 0.07612086832523346, "learning_rate": 0.00018795234252827364, "loss": 0.0752, "step": 7722 }, { "epoch": 0.49926464646464647, "grad_norm": 0.059670694172382355, "learning_rate": 0.00018794908812806967, "loss": 0.0877, "step": 7723 }, { "epoch": 0.49932929292929296, "grad_norm": 0.049978647381067276, "learning_rate": 0.00018794583331655686, "loss": 0.074, "step": 7724 }, { "epoch": 0.4993939393939394, "grad_norm": 0.05497646704316139, "learning_rate": 0.00018794257809375044, "loss": 0.0684, "step": 7725 }, { "epoch": 0.4994585858585859, "grad_norm": 0.06179879978299141, "learning_rate": 0.00018793932245966572, 
"loss": 0.0814, "step": 7726 }, { "epoch": 0.4995232323232323, "grad_norm": 0.06447987258434296, "learning_rate": 0.00018793606641431782, "loss": 0.1013, "step": 7727 }, { "epoch": 0.4995878787878788, "grad_norm": 0.05798546224832535, "learning_rate": 0.00018793280995772202, "loss": 0.0776, "step": 7728 }, { "epoch": 0.4995878787878788, "eval_bleu": 11.59933282286961, "eval_loss": 0.09172666072845459, "eval_runtime": 2.6529, "eval_samples_per_second": 12.062, "eval_steps_per_second": 1.508, "step": 7728 }, { "epoch": 0.49965252525252524, "grad_norm": 0.061895426362752914, "learning_rate": 0.00018792955308989353, "loss": 0.0784, "step": 7729 }, { "epoch": 0.4997171717171717, "grad_norm": 0.08963526785373688, "learning_rate": 0.0001879262958108476, "loss": 0.1026, "step": 7730 }, { "epoch": 0.49978181818181816, "grad_norm": 0.053780898451805115, "learning_rate": 0.00018792303812059945, "loss": 0.0766, "step": 7731 }, { "epoch": 0.49984646464646465, "grad_norm": 0.05156232789158821, "learning_rate": 0.0001879197800191643, "loss": 0.0773, "step": 7732 }, { "epoch": 0.4999111111111111, "grad_norm": 0.061143022030591965, "learning_rate": 0.00018791652150655744, "loss": 0.0913, "step": 7733 }, { "epoch": 0.4999757575757576, "grad_norm": 0.06868083029985428, "learning_rate": 0.00018791326258279405, "loss": 0.1144, "step": 7734 }, { "epoch": 0.5000404040404041, "grad_norm": 0.055382758378982544, "learning_rate": 0.00018791000324788938, "loss": 0.0828, "step": 7735 }, { "epoch": 0.5001050505050505, "grad_norm": 0.051951635628938675, "learning_rate": 0.0001879067435018587, "loss": 0.0778, "step": 7736 }, { "epoch": 0.5001696969696969, "grad_norm": 0.057541072368621826, "learning_rate": 0.00018790348334471726, "loss": 0.0801, "step": 7737 }, { "epoch": 0.5002343434343435, "grad_norm": 0.049863725900650024, "learning_rate": 0.00018790022277648027, "loss": 0.0829, "step": 7738 }, { "epoch": 0.5002989898989899, "grad_norm": 0.05668799579143524, "learning_rate": 
0.000187896961797163, "loss": 0.0921, "step": 7739 }, { "epoch": 0.5003636363636363, "grad_norm": 0.05527791008353233, "learning_rate": 0.0001878937004067807, "loss": 0.0842, "step": 7740 }, { "epoch": 0.5004282828282828, "grad_norm": 0.05547543242573738, "learning_rate": 0.00018789043860534862, "loss": 0.0822, "step": 7741 }, { "epoch": 0.5004929292929293, "grad_norm": 0.05904552713036537, "learning_rate": 0.00018788717639288202, "loss": 0.0817, "step": 7742 }, { "epoch": 0.5005575757575758, "grad_norm": 0.05548317730426788, "learning_rate": 0.00018788391376939616, "loss": 0.0812, "step": 7743 }, { "epoch": 0.5006222222222222, "grad_norm": 0.061217401176691055, "learning_rate": 0.00018788065073490628, "loss": 0.1053, "step": 7744 }, { "epoch": 0.5006222222222222, "eval_bleu": 15.167941694236072, "eval_loss": 0.09110569953918457, "eval_runtime": 2.6695, "eval_samples_per_second": 11.987, "eval_steps_per_second": 1.498, "step": 7744 }, { "epoch": 0.5006868686868687, "grad_norm": 0.06615059822797775, "learning_rate": 0.00018787738728942764, "loss": 0.1019, "step": 7745 }, { "epoch": 0.5007515151515152, "grad_norm": 0.06267862021923065, "learning_rate": 0.0001878741234329755, "loss": 0.0974, "step": 7746 }, { "epoch": 0.5008161616161616, "grad_norm": 0.055523574352264404, "learning_rate": 0.00018787085916556516, "loss": 0.0774, "step": 7747 }, { "epoch": 0.500880808080808, "grad_norm": 0.05526783689856529, "learning_rate": 0.00018786759448721185, "loss": 0.0742, "step": 7748 }, { "epoch": 0.5009454545454546, "grad_norm": 0.06524644047021866, "learning_rate": 0.00018786432939793085, "loss": 0.1064, "step": 7749 }, { "epoch": 0.501010101010101, "grad_norm": 0.05740687623620033, "learning_rate": 0.00018786106389773744, "loss": 0.0839, "step": 7750 }, { "epoch": 0.5010747474747475, "grad_norm": 0.05784508213400841, "learning_rate": 0.00018785779798664685, "loss": 0.0847, "step": 7751 }, { "epoch": 0.5011393939393939, "grad_norm": 0.055017516016960144, "learning_rate": 
0.0001878545316646744, "loss": 0.0946, "step": 7752 }, { "epoch": 0.5012040404040404, "grad_norm": 0.05718708410859108, "learning_rate": 0.00018785126493183537, "loss": 0.0832, "step": 7753 }, { "epoch": 0.5012686868686869, "grad_norm": 0.0675986036658287, "learning_rate": 0.00018784799778814501, "loss": 0.078, "step": 7754 }, { "epoch": 0.5013333333333333, "grad_norm": 0.05447746813297272, "learning_rate": 0.00018784473023361858, "loss": 0.0806, "step": 7755 }, { "epoch": 0.5013979797979798, "grad_norm": 0.07197645306587219, "learning_rate": 0.0001878414622682714, "loss": 0.1051, "step": 7756 }, { "epoch": 0.5014626262626263, "grad_norm": 0.05958004668354988, "learning_rate": 0.00018783819389211876, "loss": 0.0809, "step": 7757 }, { "epoch": 0.5015272727272727, "grad_norm": 0.06197085976600647, "learning_rate": 0.0001878349251051759, "loss": 0.1046, "step": 7758 }, { "epoch": 0.5015919191919191, "grad_norm": 0.05958814173936844, "learning_rate": 0.00018783165590745816, "loss": 0.0896, "step": 7759 }, { "epoch": 0.5016565656565657, "grad_norm": 0.05140664428472519, "learning_rate": 0.00018782838629898078, "loss": 0.0834, "step": 7760 }, { "epoch": 0.5016565656565657, "eval_bleu": 16.183681912529366, "eval_loss": 0.09124398231506348, "eval_runtime": 2.6724, "eval_samples_per_second": 11.974, "eval_steps_per_second": 1.497, "step": 7760 }, { "epoch": 0.5017212121212121, "grad_norm": 0.05677957832813263, "learning_rate": 0.0001878251162797591, "loss": 0.0816, "step": 7761 }, { "epoch": 0.5017858585858586, "grad_norm": 0.055675141513347626, "learning_rate": 0.00018782184584980834, "loss": 0.0878, "step": 7762 }, { "epoch": 0.5018505050505051, "grad_norm": 0.053137246519327164, "learning_rate": 0.0001878185750091439, "loss": 0.0743, "step": 7763 }, { "epoch": 0.5019151515151515, "grad_norm": 0.050618965178728104, "learning_rate": 0.00018781530375778095, "loss": 0.0745, "step": 7764 }, { "epoch": 0.501979797979798, "grad_norm": 0.050186675041913986, "learning_rate": 
0.0001878120320957349, "loss": 0.0677, "step": 7765 }, { "epoch": 0.5020444444444444, "grad_norm": 0.05422979220747948, "learning_rate": 0.000187808760023021, "loss": 0.0854, "step": 7766 }, { "epoch": 0.502109090909091, "grad_norm": 0.0594891682267189, "learning_rate": 0.00018780548753965454, "loss": 0.0817, "step": 7767 }, { "epoch": 0.5021737373737374, "grad_norm": 0.06357744336128235, "learning_rate": 0.00018780221464565085, "loss": 0.0972, "step": 7768 }, { "epoch": 0.5022383838383838, "grad_norm": 0.05131134018301964, "learning_rate": 0.00018779894134102524, "loss": 0.0729, "step": 7769 }, { "epoch": 0.5023030303030303, "grad_norm": 0.06014890596270561, "learning_rate": 0.000187795667625793, "loss": 0.1044, "step": 7770 }, { "epoch": 0.5023676767676768, "grad_norm": 0.07713127881288528, "learning_rate": 0.00018779239349996947, "loss": 0.0911, "step": 7771 }, { "epoch": 0.5024323232323232, "grad_norm": 0.07322830706834793, "learning_rate": 0.0001877891189635699, "loss": 0.0832, "step": 7772 }, { "epoch": 0.5024969696969697, "grad_norm": 0.059696100652217865, "learning_rate": 0.0001877858440166097, "loss": 0.0898, "step": 7773 }, { "epoch": 0.5025616161616162, "grad_norm": 0.05774519219994545, "learning_rate": 0.00018778256865910407, "loss": 0.0798, "step": 7774 }, { "epoch": 0.5026262626262626, "grad_norm": 0.06098848581314087, "learning_rate": 0.0001877792928910684, "loss": 0.1042, "step": 7775 }, { "epoch": 0.5026909090909091, "grad_norm": 0.06094692647457123, "learning_rate": 0.000187776016712518, "loss": 0.0883, "step": 7776 }, { "epoch": 0.5026909090909091, "eval_bleu": 17.277104566977734, "eval_loss": 0.09053380787372589, "eval_runtime": 2.7604, "eval_samples_per_second": 11.592, "eval_steps_per_second": 1.449, "step": 7776 }, { "epoch": 0.5027555555555555, "grad_norm": 0.06001632288098335, "learning_rate": 0.00018777274012346816, "loss": 0.0932, "step": 7777 }, { "epoch": 0.5028202020202021, "grad_norm": 0.05589991807937622, "learning_rate": 
0.0001877694631239343, "loss": 0.0833, "step": 7778 }, { "epoch": 0.5028848484848485, "grad_norm": 0.05885219946503639, "learning_rate": 0.00018776618571393164, "loss": 0.093, "step": 7779 }, { "epoch": 0.5029494949494949, "grad_norm": 0.054170817136764526, "learning_rate": 0.00018776290789347555, "loss": 0.0746, "step": 7780 }, { "epoch": 0.5030141414141415, "grad_norm": 0.06243616342544556, "learning_rate": 0.00018775962966258134, "loss": 0.0886, "step": 7781 }, { "epoch": 0.5030787878787879, "grad_norm": 0.05272437259554863, "learning_rate": 0.00018775635102126435, "loss": 0.0783, "step": 7782 }, { "epoch": 0.5031434343434343, "grad_norm": 0.05494653806090355, "learning_rate": 0.00018775307196953992, "loss": 0.0807, "step": 7783 }, { "epoch": 0.5032080808080808, "grad_norm": 0.05855415388941765, "learning_rate": 0.00018774979250742342, "loss": 0.1054, "step": 7784 }, { "epoch": 0.5032727272727273, "grad_norm": 0.06356093287467957, "learning_rate": 0.0001877465126349301, "loss": 0.0983, "step": 7785 }, { "epoch": 0.5033373737373737, "grad_norm": 0.055706970393657684, "learning_rate": 0.0001877432323520754, "loss": 0.0821, "step": 7786 }, { "epoch": 0.5034020202020202, "grad_norm": 0.050611577928066254, "learning_rate": 0.00018773995165887456, "loss": 0.0769, "step": 7787 }, { "epoch": 0.5034666666666666, "grad_norm": 0.05812849476933479, "learning_rate": 0.00018773667055534297, "loss": 0.0888, "step": 7788 }, { "epoch": 0.5035313131313132, "grad_norm": 0.05966668576002121, "learning_rate": 0.00018773338904149603, "loss": 0.0859, "step": 7789 }, { "epoch": 0.5035959595959596, "grad_norm": 0.06009143218398094, "learning_rate": 0.000187730107117349, "loss": 0.0872, "step": 7790 }, { "epoch": 0.503660606060606, "grad_norm": 0.056643061339855194, "learning_rate": 0.00018772682478291727, "loss": 0.0898, "step": 7791 }, { "epoch": 0.5037252525252526, "grad_norm": 0.06323621422052383, "learning_rate": 0.00018772354203821617, "loss": 0.0891, "step": 7792 }, { "epoch": 
0.5037252525252526, "eval_bleu": 17.63318271932576, "eval_loss": 0.09084570407867432, "eval_runtime": 2.591, "eval_samples_per_second": 12.35, "eval_steps_per_second": 1.544, "step": 7792 }, { "epoch": 0.503789898989899, "grad_norm": 0.053825803101062775, "learning_rate": 0.00018772025888326107, "loss": 0.0834, "step": 7793 }, { "epoch": 0.5038545454545454, "grad_norm": 0.1455695629119873, "learning_rate": 0.00018771697531806735, "loss": 0.1032, "step": 7794 }, { "epoch": 0.5039191919191919, "grad_norm": 0.06513268500566483, "learning_rate": 0.00018771369134265028, "loss": 0.1033, "step": 7795 }, { "epoch": 0.5039838383838384, "grad_norm": 0.0620516873896122, "learning_rate": 0.00018771040695702532, "loss": 0.1041, "step": 7796 }, { "epoch": 0.5040484848484849, "grad_norm": 0.05304070934653282, "learning_rate": 0.00018770712216120776, "loss": 0.0807, "step": 7797 }, { "epoch": 0.5041131313131313, "grad_norm": 0.05335331708192825, "learning_rate": 0.00018770383695521298, "loss": 0.0849, "step": 7798 }, { "epoch": 0.5041777777777777, "grad_norm": 0.05942315608263016, "learning_rate": 0.00018770055133905634, "loss": 0.09, "step": 7799 }, { "epoch": 0.5042424242424243, "grad_norm": 0.06202027201652527, "learning_rate": 0.00018769726531275326, "loss": 0.0948, "step": 7800 }, { "epoch": 0.5043070707070707, "grad_norm": 0.05961659550666809, "learning_rate": 0.00018769397887631904, "loss": 0.089, "step": 7801 }, { "epoch": 0.5043717171717171, "grad_norm": 0.05594084784388542, "learning_rate": 0.00018769069202976906, "loss": 0.0823, "step": 7802 }, { "epoch": 0.5044363636363637, "grad_norm": 0.052008260041475296, "learning_rate": 0.0001876874047731187, "loss": 0.0769, "step": 7803 }, { "epoch": 0.5045010101010101, "grad_norm": 0.06048525869846344, "learning_rate": 0.00018768411710638337, "loss": 0.0886, "step": 7804 }, { "epoch": 0.5045656565656566, "grad_norm": 0.05647002160549164, "learning_rate": 0.00018768082902957838, "loss": 0.0893, "step": 7805 }, { "epoch": 
0.504630303030303, "grad_norm": 0.04958126321434975, "learning_rate": 0.00018767754054271915, "loss": 0.0685, "step": 7806 }, { "epoch": 0.5046949494949495, "grad_norm": 0.060223497450351715, "learning_rate": 0.00018767425164582102, "loss": 0.0871, "step": 7807 }, { "epoch": 0.504759595959596, "grad_norm": 0.060406338423490524, "learning_rate": 0.00018767096233889943, "loss": 0.1008, "step": 7808 }, { "epoch": 0.504759595959596, "eval_bleu": 17.827906893763124, "eval_loss": 0.09135714918375015, "eval_runtime": 2.7331, "eval_samples_per_second": 11.708, "eval_steps_per_second": 1.464, "step": 7808 }, { "epoch": 0.5048242424242424, "grad_norm": 0.05350075289607048, "learning_rate": 0.00018766767262196974, "loss": 0.0806, "step": 7809 }, { "epoch": 0.5048888888888889, "grad_norm": 0.07041635364294052, "learning_rate": 0.0001876643824950473, "loss": 0.1159, "step": 7810 }, { "epoch": 0.5049535353535354, "grad_norm": 0.055358003824949265, "learning_rate": 0.00018766109195814755, "loss": 0.0836, "step": 7811 }, { "epoch": 0.5050181818181818, "grad_norm": 0.09427614510059357, "learning_rate": 0.00018765780101128585, "loss": 0.086, "step": 7812 }, { "epoch": 0.5050828282828282, "grad_norm": 0.052834443747997284, "learning_rate": 0.0001876545096544776, "loss": 0.0743, "step": 7813 }, { "epoch": 0.5051474747474748, "grad_norm": 0.0582008957862854, "learning_rate": 0.00018765121788773815, "loss": 0.0877, "step": 7814 }, { "epoch": 0.5052121212121212, "grad_norm": 0.05214683338999748, "learning_rate": 0.00018764792571108297, "loss": 0.0771, "step": 7815 }, { "epoch": 0.5052767676767677, "grad_norm": 0.13003692030906677, "learning_rate": 0.0001876446331245274, "loss": 0.0765, "step": 7816 }, { "epoch": 0.5053414141414141, "grad_norm": 0.05472637340426445, "learning_rate": 0.00018764134012808686, "loss": 0.0751, "step": 7817 }, { "epoch": 0.5054060606060606, "grad_norm": 0.05559482425451279, "learning_rate": 0.00018763804672177674, "loss": 0.0716, "step": 7818 }, { "epoch": 
0.5054707070707071, "grad_norm": 0.05140122398734093, "learning_rate": 0.00018763475290561245, "loss": 0.0743, "step": 7819 }, { "epoch": 0.5055353535353535, "grad_norm": 0.17917051911354065, "learning_rate": 0.0001876314586796094, "loss": 0.1018, "step": 7820 }, { "epoch": 0.5056, "grad_norm": 0.06044577434659004, "learning_rate": 0.000187628164043783, "loss": 0.088, "step": 7821 }, { "epoch": 0.5056646464646465, "grad_norm": 0.06358007341623306, "learning_rate": 0.00018762486899814862, "loss": 0.094, "step": 7822 }, { "epoch": 0.5057292929292929, "grad_norm": 0.059017930179834366, "learning_rate": 0.00018762157354272168, "loss": 0.0771, "step": 7823 }, { "epoch": 0.5057939393939394, "grad_norm": 0.06450504064559937, "learning_rate": 0.00018761827767751766, "loss": 0.0947, "step": 7824 }, { "epoch": 0.5057939393939394, "eval_bleu": 15.641089399206727, "eval_loss": 0.09162072092294693, "eval_runtime": 2.6714, "eval_samples_per_second": 11.979, "eval_steps_per_second": 1.497, "step": 7824 }, { "epoch": 0.5058585858585859, "grad_norm": 0.0601838193833828, "learning_rate": 0.0001876149814025519, "loss": 0.0826, "step": 7825 }, { "epoch": 0.5059232323232323, "grad_norm": 0.05653796344995499, "learning_rate": 0.00018761168471783982, "loss": 0.0785, "step": 7826 }, { "epoch": 0.5059878787878788, "grad_norm": 0.06735582649707794, "learning_rate": 0.00018760838762339686, "loss": 0.1024, "step": 7827 }, { "epoch": 0.5060525252525252, "grad_norm": 0.06379131227731705, "learning_rate": 0.00018760509011923844, "loss": 0.0962, "step": 7828 }, { "epoch": 0.5061171717171717, "grad_norm": 0.04940928891301155, "learning_rate": 0.00018760179220537996, "loss": 0.069, "step": 7829 }, { "epoch": 0.5061818181818182, "grad_norm": 0.04646586999297142, "learning_rate": 0.00018759849388183688, "loss": 0.0663, "step": 7830 }, { "epoch": 0.5062464646464646, "grad_norm": 0.06357093155384064, "learning_rate": 0.0001875951951486246, "loss": 0.0804, "step": 7831 }, { "epoch": 0.5063111111111112, 
"grad_norm": 0.06451089680194855, "learning_rate": 0.00018759189600575855, "loss": 0.0945, "step": 7832 }, { "epoch": 0.5063757575757576, "grad_norm": 0.060535307973623276, "learning_rate": 0.00018758859645325412, "loss": 0.0896, "step": 7833 }, { "epoch": 0.506440404040404, "grad_norm": 0.09721873700618744, "learning_rate": 0.00018758529649112683, "loss": 0.0939, "step": 7834 }, { "epoch": 0.5065050505050505, "grad_norm": 0.052662622183561325, "learning_rate": 0.00018758199611939206, "loss": 0.0806, "step": 7835 }, { "epoch": 0.506569696969697, "grad_norm": 0.05203745514154434, "learning_rate": 0.00018757869533806524, "loss": 0.0718, "step": 7836 }, { "epoch": 0.5066343434343434, "grad_norm": 0.05875157564878464, "learning_rate": 0.00018757539414716181, "loss": 0.0876, "step": 7837 }, { "epoch": 0.5066989898989899, "grad_norm": 0.060065414756536484, "learning_rate": 0.00018757209254669722, "loss": 0.0958, "step": 7838 }, { "epoch": 0.5067636363636364, "grad_norm": 0.05511051043868065, "learning_rate": 0.0001875687905366869, "loss": 0.0894, "step": 7839 }, { "epoch": 0.5068282828282829, "grad_norm": 0.05936272442340851, "learning_rate": 0.00018756548811714628, "loss": 0.0838, "step": 7840 }, { "epoch": 0.5068282828282829, "eval_bleu": 16.620969077949145, "eval_loss": 0.09217087924480438, "eval_runtime": 2.6278, "eval_samples_per_second": 12.177, "eval_steps_per_second": 1.522, "step": 7840 }, { "epoch": 0.5068929292929293, "grad_norm": 0.060392674058675766, "learning_rate": 0.00018756218528809084, "loss": 0.0928, "step": 7841 }, { "epoch": 0.5069575757575757, "grad_norm": 0.06391189247369766, "learning_rate": 0.00018755888204953603, "loss": 0.0891, "step": 7842 }, { "epoch": 0.5070222222222223, "grad_norm": 0.058797743171453476, "learning_rate": 0.00018755557840149725, "loss": 0.0907, "step": 7843 }, { "epoch": 0.5070868686868687, "grad_norm": 0.05628548562526703, "learning_rate": 0.00018755227434398997, "loss": 0.0793, "step": 7844 }, { "epoch": 
0.5071515151515151, "grad_norm": 0.04995729401707649, "learning_rate": 0.00018754896987702963, "loss": 0.0671, "step": 7845 }, { "epoch": 0.5072161616161616, "grad_norm": 0.05938170105218887, "learning_rate": 0.00018754566500063174, "loss": 0.0871, "step": 7846 }, { "epoch": 0.5072808080808081, "grad_norm": 0.05673157423734665, "learning_rate": 0.0001875423597148117, "loss": 0.0802, "step": 7847 }, { "epoch": 0.5073454545454545, "grad_norm": 0.0628538727760315, "learning_rate": 0.00018753905401958497, "loss": 0.0906, "step": 7848 }, { "epoch": 0.507410101010101, "grad_norm": 0.05792514234781265, "learning_rate": 0.00018753574791496703, "loss": 0.0843, "step": 7849 }, { "epoch": 0.5074747474747475, "grad_norm": 0.06103606894612312, "learning_rate": 0.00018753244140097338, "loss": 0.0953, "step": 7850 }, { "epoch": 0.507539393939394, "grad_norm": 0.05922936275601387, "learning_rate": 0.00018752913447761936, "loss": 0.0904, "step": 7851 }, { "epoch": 0.5076040404040404, "grad_norm": 0.06412958353757858, "learning_rate": 0.0001875258271449206, "loss": 0.0945, "step": 7852 }, { "epoch": 0.5076686868686868, "grad_norm": 0.05391160026192665, "learning_rate": 0.0001875225194028924, "loss": 0.0848, "step": 7853 }, { "epoch": 0.5077333333333334, "grad_norm": 0.06314773112535477, "learning_rate": 0.00018751921125155038, "loss": 0.0898, "step": 7854 }, { "epoch": 0.5077979797979798, "grad_norm": 0.062177874147892, "learning_rate": 0.00018751590269090992, "loss": 0.0892, "step": 7855 }, { "epoch": 0.5078626262626262, "grad_norm": 0.06816834956407547, "learning_rate": 0.0001875125937209865, "loss": 0.0985, "step": 7856 }, { "epoch": 0.5078626262626262, "eval_bleu": 16.310581190616382, "eval_loss": 0.09088582545518875, "eval_runtime": 2.6823, "eval_samples_per_second": 11.93, "eval_steps_per_second": 1.491, "step": 7856 }, { "epoch": 0.5079272727272728, "grad_norm": 0.06768687814474106, "learning_rate": 0.0001875092843417956, "loss": 0.1001, "step": 7857 }, { "epoch": 
0.5079919191919192, "grad_norm": 0.06489092856645584, "learning_rate": 0.00018750597455335273, "loss": 0.0944, "step": 7858 }, { "epoch": 0.5080565656565657, "grad_norm": 0.06238860264420509, "learning_rate": 0.00018750266435567335, "loss": 0.0858, "step": 7859 }, { "epoch": 0.5081212121212121, "grad_norm": 0.05994223803281784, "learning_rate": 0.00018749935374877294, "loss": 0.0935, "step": 7860 }, { "epoch": 0.5081858585858586, "grad_norm": 0.05738324671983719, "learning_rate": 0.00018749604273266696, "loss": 0.0944, "step": 7861 }, { "epoch": 0.5082505050505051, "grad_norm": 0.06664850562810898, "learning_rate": 0.00018749273130737093, "loss": 0.0865, "step": 7862 }, { "epoch": 0.5083151515151515, "grad_norm": 0.06320400536060333, "learning_rate": 0.0001874894194729003, "loss": 0.0976, "step": 7863 }, { "epoch": 0.5083797979797979, "grad_norm": 0.058019209653139114, "learning_rate": 0.00018748610722927057, "loss": 0.0924, "step": 7864 }, { "epoch": 0.5084444444444445, "grad_norm": 0.06698229908943176, "learning_rate": 0.00018748279457649728, "loss": 0.0875, "step": 7865 }, { "epoch": 0.5085090909090909, "grad_norm": 0.05354395508766174, "learning_rate": 0.00018747948151459585, "loss": 0.0792, "step": 7866 }, { "epoch": 0.5085737373737373, "grad_norm": 0.05062909051775932, "learning_rate": 0.00018747616804358182, "loss": 0.0707, "step": 7867 }, { "epoch": 0.5086383838383839, "grad_norm": 0.06286794692277908, "learning_rate": 0.00018747285416347066, "loss": 0.0951, "step": 7868 }, { "epoch": 0.5087030303030303, "grad_norm": 0.055112723261117935, "learning_rate": 0.00018746953987427788, "loss": 0.0825, "step": 7869 }, { "epoch": 0.5087676767676768, "grad_norm": 0.06302549690008163, "learning_rate": 0.00018746622517601898, "loss": 0.0984, "step": 7870 }, { "epoch": 0.5088323232323232, "grad_norm": 0.06679811328649521, "learning_rate": 0.0001874629100687095, "loss": 0.1006, "step": 7871 }, { "epoch": 0.5088969696969697, "grad_norm": 0.05105537921190262, 
"learning_rate": 0.00018745959455236487, "loss": 0.0822, "step": 7872 }, { "epoch": 0.5088969696969697, "eval_bleu": 11.873598967030487, "eval_loss": 0.09091059863567352, "eval_runtime": 2.5614, "eval_samples_per_second": 12.493, "eval_steps_per_second": 1.562, "step": 7872 }, { "epoch": 0.5089616161616162, "grad_norm": 0.05519352853298187, "learning_rate": 0.0001874562786270006, "loss": 0.0851, "step": 7873 }, { "epoch": 0.5090262626262626, "grad_norm": 0.05711270123720169, "learning_rate": 0.00018745296229263225, "loss": 0.0836, "step": 7874 }, { "epoch": 0.509090909090909, "grad_norm": 0.057911504060029984, "learning_rate": 0.00018744964554927535, "loss": 0.101, "step": 7875 }, { "epoch": 0.5091555555555556, "grad_norm": 0.057794827967882156, "learning_rate": 0.00018744632839694532, "loss": 0.0861, "step": 7876 }, { "epoch": 0.509220202020202, "grad_norm": 0.10412281006574631, "learning_rate": 0.00018744301083565773, "loss": 0.0866, "step": 7877 }, { "epoch": 0.5092848484848485, "grad_norm": 0.06619876623153687, "learning_rate": 0.00018743969286542808, "loss": 0.0939, "step": 7878 }, { "epoch": 0.509349494949495, "grad_norm": 0.05556159093976021, "learning_rate": 0.00018743637448627188, "loss": 0.0817, "step": 7879 }, { "epoch": 0.5094141414141414, "grad_norm": 0.0516040101647377, "learning_rate": 0.0001874330556982047, "loss": 0.0729, "step": 7880 }, { "epoch": 0.5094787878787879, "grad_norm": 0.048545707017183304, "learning_rate": 0.00018742973650124202, "loss": 0.0683, "step": 7881 }, { "epoch": 0.5095434343434343, "grad_norm": 0.05391348525881767, "learning_rate": 0.00018742641689539934, "loss": 0.0706, "step": 7882 }, { "epoch": 0.5096080808080808, "grad_norm": 0.058631811290979385, "learning_rate": 0.00018742309688069223, "loss": 0.0896, "step": 7883 }, { "epoch": 0.5096727272727273, "grad_norm": 0.07073426246643066, "learning_rate": 0.0001874197764571362, "loss": 0.0981, "step": 7884 }, { "epoch": 0.5097373737373737, "grad_norm": 0.05919932946562767, 
"learning_rate": 0.00018741645562474676, "loss": 0.0883, "step": 7885 }, { "epoch": 0.5098020202020203, "grad_norm": 0.05816143378615379, "learning_rate": 0.00018741313438353947, "loss": 0.0836, "step": 7886 }, { "epoch": 0.5098666666666667, "grad_norm": 0.05710756406188011, "learning_rate": 0.00018740981273352984, "loss": 0.0877, "step": 7887 }, { "epoch": 0.5099313131313131, "grad_norm": 0.058808717876672745, "learning_rate": 0.00018740649067473342, "loss": 0.0843, "step": 7888 }, { "epoch": 0.5099313131313131, "eval_bleu": 14.215636766813562, "eval_loss": 0.08966147899627686, "eval_runtime": 2.5693, "eval_samples_per_second": 12.455, "eval_steps_per_second": 1.557, "step": 7888 }, { "epoch": 0.5099959595959596, "grad_norm": 0.05250198021531105, "learning_rate": 0.00018740316820716575, "loss": 0.073, "step": 7889 }, { "epoch": 0.5100606060606061, "grad_norm": 0.060085318982601166, "learning_rate": 0.0001873998453308423, "loss": 0.0862, "step": 7890 }, { "epoch": 0.5101252525252525, "grad_norm": 0.06083827465772629, "learning_rate": 0.00018739652204577872, "loss": 0.0765, "step": 7891 }, { "epoch": 0.510189898989899, "grad_norm": 0.06268062442541122, "learning_rate": 0.00018739319835199049, "loss": 0.1078, "step": 7892 }, { "epoch": 0.5102545454545454, "grad_norm": 0.05585675686597824, "learning_rate": 0.00018738987424949316, "loss": 0.0794, "step": 7893 }, { "epoch": 0.510319191919192, "grad_norm": 0.059034865349531174, "learning_rate": 0.0001873865497383023, "loss": 0.0893, "step": 7894 }, { "epoch": 0.5103838383838384, "grad_norm": 0.06352706253528595, "learning_rate": 0.00018738322481843343, "loss": 0.0957, "step": 7895 }, { "epoch": 0.5104484848484848, "grad_norm": 0.0639958307147026, "learning_rate": 0.0001873798994899021, "loss": 0.1053, "step": 7896 }, { "epoch": 0.5105131313131314, "grad_norm": 0.05201926454901695, "learning_rate": 0.00018737657375272383, "loss": 0.0765, "step": 7897 }, { "epoch": 0.5105777777777778, "grad_norm": 0.05187271907925606, 
"learning_rate": 0.00018737324760691423, "loss": 0.0811, "step": 7898 }, { "epoch": 0.5106424242424242, "grad_norm": 0.05465557798743248, "learning_rate": 0.00018736992105248886, "loss": 0.082, "step": 7899 }, { "epoch": 0.5107070707070707, "grad_norm": 0.06319289654493332, "learning_rate": 0.00018736659408946323, "loss": 0.0905, "step": 7900 }, { "epoch": 0.5107717171717172, "grad_norm": 0.05660290643572807, "learning_rate": 0.00018736326671785295, "loss": 0.0904, "step": 7901 }, { "epoch": 0.5108363636363636, "grad_norm": 0.048804305493831635, "learning_rate": 0.00018735993893767355, "loss": 0.0737, "step": 7902 }, { "epoch": 0.5109010101010101, "grad_norm": 0.06313981115818024, "learning_rate": 0.00018735661074894058, "loss": 0.084, "step": 7903 }, { "epoch": 0.5109656565656565, "grad_norm": 0.06525015830993652, "learning_rate": 0.00018735328215166962, "loss": 0.0918, "step": 7904 }, { "epoch": 0.5109656565656565, "eval_bleu": 16.780183392847157, "eval_loss": 0.08956025540828705, "eval_runtime": 2.7168, "eval_samples_per_second": 11.778, "eval_steps_per_second": 1.472, "step": 7904 }, { "epoch": 0.5110303030303031, "grad_norm": 0.05908253416419029, "learning_rate": 0.00018734995314587623, "loss": 0.0784, "step": 7905 }, { "epoch": 0.5110949494949495, "grad_norm": 0.046972185373306274, "learning_rate": 0.00018734662373157598, "loss": 0.0674, "step": 7906 }, { "epoch": 0.5111595959595959, "grad_norm": 0.062111202627420425, "learning_rate": 0.00018734329390878446, "loss": 0.0904, "step": 7907 }, { "epoch": 0.5112242424242425, "grad_norm": 0.04978378489613533, "learning_rate": 0.00018733996367751725, "loss": 0.0739, "step": 7908 }, { "epoch": 0.5112888888888889, "grad_norm": 0.056425709277391434, "learning_rate": 0.00018733663303778988, "loss": 0.08, "step": 7909 }, { "epoch": 0.5113535353535353, "grad_norm": 0.06212376430630684, "learning_rate": 0.00018733330198961797, "loss": 0.0996, "step": 7910 }, { "epoch": 0.5114181818181818, "grad_norm": 0.05954008176922798, 
"learning_rate": 0.00018732997053301707, "loss": 0.0892, "step": 7911 }, { "epoch": 0.5114828282828283, "grad_norm": 0.07022903859615326, "learning_rate": 0.00018732663866800276, "loss": 0.1021, "step": 7912 }, { "epoch": 0.5115474747474748, "grad_norm": 0.05342933535575867, "learning_rate": 0.00018732330639459064, "loss": 0.062, "step": 7913 }, { "epoch": 0.5116121212121212, "grad_norm": 0.0598909817636013, "learning_rate": 0.0001873199737127963, "loss": 0.0858, "step": 7914 }, { "epoch": 0.5116767676767677, "grad_norm": 0.06252982467412949, "learning_rate": 0.00018731664062263524, "loss": 0.0885, "step": 7915 }, { "epoch": 0.5117414141414142, "grad_norm": 0.060067083686590195, "learning_rate": 0.00018731330712412319, "loss": 0.0773, "step": 7916 }, { "epoch": 0.5118060606060606, "grad_norm": 0.06401772052049637, "learning_rate": 0.00018730997321727563, "loss": 0.0931, "step": 7917 }, { "epoch": 0.511870707070707, "grad_norm": 0.060516923666000366, "learning_rate": 0.00018730663890210823, "loss": 0.0932, "step": 7918 }, { "epoch": 0.5119353535353536, "grad_norm": 0.06042430177330971, "learning_rate": 0.0001873033041786365, "loss": 0.0816, "step": 7919 }, { "epoch": 0.512, "grad_norm": 0.05498149245977402, "learning_rate": 0.0001872999690468761, "loss": 0.0913, "step": 7920 }, { "epoch": 0.512, "eval_bleu": 17.565508338060447, "eval_loss": 0.0909627377986908, "eval_runtime": 2.7352, "eval_samples_per_second": 11.699, "eval_steps_per_second": 1.462, "step": 7920 }, { "epoch": 0.5120646464646464, "grad_norm": 0.057869210839271545, "learning_rate": 0.0001872966335068426, "loss": 0.0841, "step": 7921 }, { "epoch": 0.5121292929292929, "grad_norm": 0.055628061294555664, "learning_rate": 0.0001872932975585516, "loss": 0.0997, "step": 7922 }, { "epoch": 0.5121939393939394, "grad_norm": 0.052671413868665695, "learning_rate": 0.00018728996120201874, "loss": 0.0803, "step": 7923 }, { "epoch": 0.5122585858585859, "grad_norm": 0.07037317007780075, "learning_rate": 
0.00018728662443725952, "loss": 0.1059, "step": 7924 }, { "epoch": 0.5123232323232323, "grad_norm": 0.06162027269601822, "learning_rate": 0.0001872832872642897, "loss": 0.0867, "step": 7925 }, { "epoch": 0.5123878787878788, "grad_norm": 0.04926425591111183, "learning_rate": 0.00018727994968312475, "loss": 0.076, "step": 7926 }, { "epoch": 0.5124525252525253, "grad_norm": 0.052356671541929245, "learning_rate": 0.00018727661169378032, "loss": 0.074, "step": 7927 }, { "epoch": 0.5125171717171717, "grad_norm": 0.07029546052217484, "learning_rate": 0.00018727327329627204, "loss": 0.0943, "step": 7928 }, { "epoch": 0.5125818181818181, "grad_norm": 0.05377443507313728, "learning_rate": 0.00018726993449061553, "loss": 0.0799, "step": 7929 }, { "epoch": 0.5126464646464647, "grad_norm": 0.05513812229037285, "learning_rate": 0.00018726659527682637, "loss": 0.0843, "step": 7930 }, { "epoch": 0.5127111111111111, "grad_norm": 0.05371826887130737, "learning_rate": 0.0001872632556549202, "loss": 0.0867, "step": 7931 }, { "epoch": 0.5127757575757576, "grad_norm": 0.0587933249771595, "learning_rate": 0.00018725991562491262, "loss": 0.0942, "step": 7932 }, { "epoch": 0.512840404040404, "grad_norm": 0.05721677467226982, "learning_rate": 0.0001872565751868193, "loss": 0.0947, "step": 7933 }, { "epoch": 0.5129050505050505, "grad_norm": 0.06043868511915207, "learning_rate": 0.00018725323434065579, "loss": 0.0842, "step": 7934 }, { "epoch": 0.512969696969697, "grad_norm": 0.05544653907418251, "learning_rate": 0.00018724989308643774, "loss": 0.0792, "step": 7935 }, { "epoch": 0.5130343434343434, "grad_norm": 0.058307114988565445, "learning_rate": 0.00018724655142418083, "loss": 0.088, "step": 7936 }, { "epoch": 0.5130343434343434, "eval_bleu": 14.997227264222097, "eval_loss": 0.09173398464918137, "eval_runtime": 2.5866, "eval_samples_per_second": 12.372, "eval_steps_per_second": 1.546, "step": 7936 }, { "epoch": 0.5130989898989899, "grad_norm": 0.06230641156435013, "learning_rate": 
0.00018724320935390059, "loss": 0.0887, "step": 7937 }, { "epoch": 0.5131636363636364, "grad_norm": 0.05930856615304947, "learning_rate": 0.00018723986687561273, "loss": 0.0724, "step": 7938 }, { "epoch": 0.5132282828282828, "grad_norm": 0.04725337028503418, "learning_rate": 0.00018723652398933285, "loss": 0.0693, "step": 7939 }, { "epoch": 0.5132929292929292, "grad_norm": 0.07112560421228409, "learning_rate": 0.00018723318069507658, "loss": 0.0846, "step": 7940 }, { "epoch": 0.5133575757575758, "grad_norm": 0.05559176206588745, "learning_rate": 0.00018722983699285956, "loss": 0.0868, "step": 7941 }, { "epoch": 0.5134222222222222, "grad_norm": 0.0646115317940712, "learning_rate": 0.00018722649288269744, "loss": 0.0961, "step": 7942 }, { "epoch": 0.5134868686868687, "grad_norm": 0.06235827878117561, "learning_rate": 0.00018722314836460587, "loss": 0.0961, "step": 7943 }, { "epoch": 0.5135515151515152, "grad_norm": 0.05584695190191269, "learning_rate": 0.00018721980343860045, "loss": 0.0821, "step": 7944 }, { "epoch": 0.5136161616161616, "grad_norm": 0.06168879196047783, "learning_rate": 0.00018721645810469681, "loss": 0.0751, "step": 7945 }, { "epoch": 0.5136808080808081, "grad_norm": 0.048408228904008865, "learning_rate": 0.00018721311236291065, "loss": 0.0717, "step": 7946 }, { "epoch": 0.5137454545454545, "grad_norm": 0.07711298763751984, "learning_rate": 0.00018720976621325764, "loss": 0.0887, "step": 7947 }, { "epoch": 0.513810101010101, "grad_norm": 0.05392232537269592, "learning_rate": 0.00018720641965575333, "loss": 0.079, "step": 7948 }, { "epoch": 0.5138747474747475, "grad_norm": 0.05293821915984154, "learning_rate": 0.00018720307269041347, "loss": 0.0777, "step": 7949 }, { "epoch": 0.5139393939393939, "grad_norm": 0.05559979006648064, "learning_rate": 0.00018719972531725364, "loss": 0.0887, "step": 7950 }, { "epoch": 0.5140040404040404, "grad_norm": 0.056672632694244385, "learning_rate": 0.00018719637753628952, "loss": 0.0859, "step": 7951 }, { "epoch": 
0.5140686868686869, "grad_norm": 0.061127085238695145, "learning_rate": 0.00018719302934753677, "loss": 0.0891, "step": 7952 }, { "epoch": 0.5140686868686869, "eval_bleu": 15.035821016310763, "eval_loss": 0.09044254571199417, "eval_runtime": 2.5859, "eval_samples_per_second": 12.375, "eval_steps_per_second": 1.547, "step": 7952 }, { "epoch": 0.5141333333333333, "grad_norm": 0.06218895688652992, "learning_rate": 0.00018718968075101105, "loss": 0.0826, "step": 7953 }, { "epoch": 0.5141979797979798, "grad_norm": 0.062405064702034, "learning_rate": 0.000187186331746728, "loss": 0.0995, "step": 7954 }, { "epoch": 0.5142626262626263, "grad_norm": 0.07626596093177795, "learning_rate": 0.00018718298233470334, "loss": 0.1036, "step": 7955 }, { "epoch": 0.5143272727272727, "grad_norm": 0.05381162837147713, "learning_rate": 0.00018717963251495268, "loss": 0.0834, "step": 7956 }, { "epoch": 0.5143919191919192, "grad_norm": 0.06037190929055214, "learning_rate": 0.0001871762822874917, "loss": 0.0925, "step": 7957 }, { "epoch": 0.5144565656565656, "grad_norm": 0.06163175031542778, "learning_rate": 0.00018717293165233602, "loss": 0.0952, "step": 7958 }, { "epoch": 0.5145212121212122, "grad_norm": 0.06236550211906433, "learning_rate": 0.00018716958060950142, "loss": 0.0942, "step": 7959 }, { "epoch": 0.5145858585858586, "grad_norm": 0.059510648250579834, "learning_rate": 0.0001871662291590035, "loss": 0.0874, "step": 7960 }, { "epoch": 0.514650505050505, "grad_norm": 0.05638205632567406, "learning_rate": 0.00018716287730085793, "loss": 0.0859, "step": 7961 }, { "epoch": 0.5147151515151516, "grad_norm": 0.061473410576581955, "learning_rate": 0.00018715952503508038, "loss": 0.0876, "step": 7962 }, { "epoch": 0.514779797979798, "grad_norm": 0.05693434551358223, "learning_rate": 0.00018715617236168656, "loss": 0.0838, "step": 7963 }, { "epoch": 0.5148444444444444, "grad_norm": 0.06742648035287857, "learning_rate": 0.00018715281928069214, "loss": 0.0912, "step": 7964 }, { "epoch": 
0.5149090909090909, "grad_norm": 0.05672824755311012, "learning_rate": 0.00018714946579211281, "loss": 0.086, "step": 7965 }, { "epoch": 0.5149737373737374, "grad_norm": 0.05780954658985138, "learning_rate": 0.00018714611189596421, "loss": 0.0898, "step": 7966 }, { "epoch": 0.5150383838383839, "grad_norm": 0.05764427036046982, "learning_rate": 0.00018714275759226207, "loss": 0.076, "step": 7967 }, { "epoch": 0.5151030303030303, "grad_norm": 0.06250914186239243, "learning_rate": 0.00018713940288102204, "loss": 0.0888, "step": 7968 }, { "epoch": 0.5151030303030303, "eval_bleu": 14.637273887046193, "eval_loss": 0.09100360423326492, "eval_runtime": 2.7165, "eval_samples_per_second": 11.78, "eval_steps_per_second": 1.472, "step": 7968 }, { "epoch": 0.5151676767676767, "grad_norm": 0.058573391288518906, "learning_rate": 0.00018713604776225988, "loss": 0.0834, "step": 7969 }, { "epoch": 0.5152323232323233, "grad_norm": 0.06677860021591187, "learning_rate": 0.0001871326922359912, "loss": 0.0914, "step": 7970 }, { "epoch": 0.5152969696969697, "grad_norm": 0.07790549099445343, "learning_rate": 0.00018712933630223174, "loss": 0.0807, "step": 7971 }, { "epoch": 0.5153616161616161, "grad_norm": 0.06089644134044647, "learning_rate": 0.00018712597996099715, "loss": 0.0934, "step": 7972 }, { "epoch": 0.5154262626262627, "grad_norm": 0.06560095399618149, "learning_rate": 0.00018712262321230315, "loss": 0.1036, "step": 7973 }, { "epoch": 0.5154909090909091, "grad_norm": 0.06027298420667648, "learning_rate": 0.00018711926605616548, "loss": 0.0914, "step": 7974 }, { "epoch": 0.5155555555555555, "grad_norm": 0.05753987282514572, "learning_rate": 0.0001871159084925998, "loss": 0.0656, "step": 7975 }, { "epoch": 0.515620202020202, "grad_norm": 0.06755924969911575, "learning_rate": 0.0001871125505216218, "loss": 0.1106, "step": 7976 }, { "epoch": 0.5156848484848485, "grad_norm": 0.06507290154695511, "learning_rate": 0.0001871091921432472, "loss": 0.0824, "step": 7977 }, { "epoch": 
0.515749494949495, "grad_norm": 0.09246175736188889, "learning_rate": 0.0001871058333574917, "loss": 0.0995, "step": 7978 }, { "epoch": 0.5158141414141414, "grad_norm": 0.05552591010928154, "learning_rate": 0.00018710247416437104, "loss": 0.0828, "step": 7979 }, { "epoch": 0.5158787878787878, "grad_norm": 0.0532689243555069, "learning_rate": 0.00018709911456390085, "loss": 0.0692, "step": 7980 }, { "epoch": 0.5159434343434344, "grad_norm": 0.058784786611795425, "learning_rate": 0.00018709575455609696, "loss": 0.0818, "step": 7981 }, { "epoch": 0.5160080808080808, "grad_norm": 0.05511625483632088, "learning_rate": 0.00018709239414097496, "loss": 0.0774, "step": 7982 }, { "epoch": 0.5160727272727272, "grad_norm": 0.06552447378635406, "learning_rate": 0.0001870890333185507, "loss": 0.0907, "step": 7983 }, { "epoch": 0.5161373737373738, "grad_norm": 0.05549672618508339, "learning_rate": 0.00018708567208883977, "loss": 0.0764, "step": 7984 }, { "epoch": 0.5161373737373738, "eval_bleu": 17.026277979234315, "eval_loss": 0.09102334082126617, "eval_runtime": 2.5374, "eval_samples_per_second": 12.611, "eval_steps_per_second": 1.576, "step": 7984 }, { "epoch": 0.5162020202020202, "grad_norm": 0.061487793922424316, "learning_rate": 0.00018708231045185794, "loss": 0.0916, "step": 7985 }, { "epoch": 0.5162666666666667, "grad_norm": 0.055573128163814545, "learning_rate": 0.00018707894840762094, "loss": 0.0786, "step": 7986 }, { "epoch": 0.5163313131313131, "grad_norm": 0.054142121225595474, "learning_rate": 0.0001870755859561445, "loss": 0.089, "step": 7987 }, { "epoch": 0.5163959595959596, "grad_norm": 0.06409410387277603, "learning_rate": 0.00018707222309744431, "loss": 0.0987, "step": 7988 }, { "epoch": 0.5164606060606061, "grad_norm": 0.05946100503206253, "learning_rate": 0.00018706885983153614, "loss": 0.0937, "step": 7989 }, { "epoch": 0.5165252525252525, "grad_norm": 0.059724267572164536, "learning_rate": 0.00018706549615843567, "loss": 0.0855, "step": 7990 }, { "epoch": 
0.516589898989899, "grad_norm": 0.0454130694270134, "learning_rate": 0.00018706213207815868, "loss": 0.0676, "step": 7991 }, { "epoch": 0.5166545454545455, "grad_norm": 0.06027151644229889, "learning_rate": 0.00018705876759072087, "loss": 0.086, "step": 7992 }, { "epoch": 0.5167191919191919, "grad_norm": 0.05617490038275719, "learning_rate": 0.00018705540269613798, "loss": 0.0777, "step": 7993 }, { "epoch": 0.5167838383838383, "grad_norm": 0.06504158675670624, "learning_rate": 0.0001870520373944258, "loss": 0.0842, "step": 7994 }, { "epoch": 0.5168484848484849, "grad_norm": 0.06090238317847252, "learning_rate": 0.00018704867168559995, "loss": 0.0993, "step": 7995 }, { "epoch": 0.5169131313131313, "grad_norm": 0.05292081460356712, "learning_rate": 0.00018704530556967627, "loss": 0.0834, "step": 7996 }, { "epoch": 0.5169777777777778, "grad_norm": 0.05523741617798805, "learning_rate": 0.0001870419390466705, "loss": 0.0827, "step": 7997 }, { "epoch": 0.5170424242424242, "grad_norm": 0.060230325907468796, "learning_rate": 0.00018703857211659829, "loss": 0.079, "step": 7998 }, { "epoch": 0.5171070707070707, "grad_norm": 0.06576273590326309, "learning_rate": 0.0001870352047794755, "loss": 0.0852, "step": 7999 }, { "epoch": 0.5171717171717172, "grad_norm": 0.06397124379873276, "learning_rate": 0.0001870318370353178, "loss": 0.0986, "step": 8000 }, { "epoch": 0.5171717171717172, "eval_bleu": 16.19356058797213, "eval_loss": 0.09221199154853821, "eval_runtime": 2.5333, "eval_samples_per_second": 12.632, "eval_steps_per_second": 1.579, "step": 8000 }, { "epoch": 0.5172363636363636, "grad_norm": 0.05284617096185684, "learning_rate": 0.000187028468884141, "loss": 0.0796, "step": 8001 }, { "epoch": 0.5173010101010102, "grad_norm": 0.05690190941095352, "learning_rate": 0.00018702510032596084, "loss": 0.0834, "step": 8002 }, { "epoch": 0.5173656565656566, "grad_norm": 0.05775248631834984, "learning_rate": 0.00018702173136079302, "loss": 0.0887, "step": 8003 }, { "epoch": 
0.517430303030303, "grad_norm": 0.05529121682047844, "learning_rate": 0.00018701836198865336, "loss": 0.0812, "step": 8004 }, { "epoch": 0.5174949494949495, "grad_norm": 0.060552265495061874, "learning_rate": 0.00018701499220955757, "loss": 0.086, "step": 8005 }, { "epoch": 0.517559595959596, "grad_norm": 0.04693681001663208, "learning_rate": 0.00018701162202352142, "loss": 0.0733, "step": 8006 }, { "epoch": 0.5176242424242424, "grad_norm": 0.06721578538417816, "learning_rate": 0.00018700825143056068, "loss": 0.0946, "step": 8007 }, { "epoch": 0.5176888888888889, "grad_norm": 0.059476979076862335, "learning_rate": 0.00018700488043069114, "loss": 0.1011, "step": 8008 }, { "epoch": 0.5177535353535353, "grad_norm": 0.05823297053575516, "learning_rate": 0.0001870015090239285, "loss": 0.088, "step": 8009 }, { "epoch": 0.5178181818181818, "grad_norm": 0.05206834152340889, "learning_rate": 0.0001869981372102886, "loss": 0.0692, "step": 8010 }, { "epoch": 0.5178828282828283, "grad_norm": 0.06298154592514038, "learning_rate": 0.00018699476498978715, "loss": 0.0868, "step": 8011 }, { "epoch": 0.5179474747474747, "grad_norm": 0.06580867618322372, "learning_rate": 0.00018699139236243994, "loss": 0.0929, "step": 8012 }, { "epoch": 0.5180121212121213, "grad_norm": 0.06023244932293892, "learning_rate": 0.00018698801932826275, "loss": 0.0838, "step": 8013 }, { "epoch": 0.5180767676767677, "grad_norm": 0.05557830259203911, "learning_rate": 0.00018698464588727136, "loss": 0.0782, "step": 8014 }, { "epoch": 0.5181414141414141, "grad_norm": 0.06302457302808762, "learning_rate": 0.00018698127203948154, "loss": 0.0887, "step": 8015 }, { "epoch": 0.5182060606060606, "grad_norm": 0.06634131073951721, "learning_rate": 0.00018697789778490906, "loss": 0.0908, "step": 8016 }, { "epoch": 0.5182060606060606, "eval_bleu": 15.831523471723443, "eval_loss": 0.09165791422128677, "eval_runtime": 2.8373, "eval_samples_per_second": 11.278, "eval_steps_per_second": 1.41, "step": 8016 }, { "epoch": 
0.5182707070707071, "grad_norm": 0.07039667665958405, "learning_rate": 0.0001869745231235697, "loss": 0.0938, "step": 8017 }, { "epoch": 0.5183353535353535, "grad_norm": 0.0510932132601738, "learning_rate": 0.00018697114805547925, "loss": 0.0744, "step": 8018 }, { "epoch": 0.5184, "grad_norm": 0.06125972792506218, "learning_rate": 0.00018696777258065352, "loss": 0.0803, "step": 8019 }, { "epoch": 0.5184646464646465, "grad_norm": 0.05473436415195465, "learning_rate": 0.00018696439669910824, "loss": 0.0731, "step": 8020 }, { "epoch": 0.518529292929293, "grad_norm": 0.05645788088440895, "learning_rate": 0.00018696102041085923, "loss": 0.0778, "step": 8021 }, { "epoch": 0.5185939393939394, "grad_norm": 0.0667681097984314, "learning_rate": 0.0001869576437159223, "loss": 0.0985, "step": 8022 }, { "epoch": 0.5186585858585858, "grad_norm": 0.05831639841198921, "learning_rate": 0.0001869542666143132, "loss": 0.0881, "step": 8023 }, { "epoch": 0.5187232323232324, "grad_norm": 0.07342809438705444, "learning_rate": 0.00018695088910604773, "loss": 0.1073, "step": 8024 }, { "epoch": 0.5187878787878788, "grad_norm": 0.06629791855812073, "learning_rate": 0.00018694751119114171, "loss": 0.0963, "step": 8025 }, { "epoch": 0.5188525252525252, "grad_norm": 0.05144278332591057, "learning_rate": 0.00018694413286961093, "loss": 0.0817, "step": 8026 }, { "epoch": 0.5189171717171717, "grad_norm": 0.054942917078733444, "learning_rate": 0.00018694075414147116, "loss": 0.0853, "step": 8027 }, { "epoch": 0.5189818181818182, "grad_norm": 0.05287417396903038, "learning_rate": 0.00018693737500673826, "loss": 0.0775, "step": 8028 }, { "epoch": 0.5190464646464646, "grad_norm": 0.07009001821279526, "learning_rate": 0.00018693399546542796, "loss": 0.104, "step": 8029 }, { "epoch": 0.5191111111111111, "grad_norm": 0.061332181096076965, "learning_rate": 0.00018693061551755614, "loss": 0.1013, "step": 8030 }, { "epoch": 0.5191757575757576, "grad_norm": 0.05353868380188942, "learning_rate": 
0.00018692723516313854, "loss": 0.0802, "step": 8031 }, { "epoch": 0.5192404040404041, "grad_norm": 0.05989716202020645, "learning_rate": 0.000186923854402191, "loss": 0.0819, "step": 8032 }, { "epoch": 0.5192404040404041, "eval_bleu": 16.974789731870608, "eval_loss": 0.0911623015999794, "eval_runtime": 2.6064, "eval_samples_per_second": 12.278, "eval_steps_per_second": 1.535, "step": 8032 }, { "epoch": 0.5193050505050505, "grad_norm": 0.06321995705366135, "learning_rate": 0.00018692047323472935, "loss": 0.0867, "step": 8033 }, { "epoch": 0.5193696969696969, "grad_norm": 0.05779041722416878, "learning_rate": 0.00018691709166076937, "loss": 0.0835, "step": 8034 }, { "epoch": 0.5194343434343435, "grad_norm": 0.051018282771110535, "learning_rate": 0.00018691370968032688, "loss": 0.0725, "step": 8035 }, { "epoch": 0.5194989898989899, "grad_norm": 0.05592404305934906, "learning_rate": 0.00018691032729341772, "loss": 0.0897, "step": 8036 }, { "epoch": 0.5195636363636363, "grad_norm": 0.05164778232574463, "learning_rate": 0.00018690694450005765, "loss": 0.0859, "step": 8037 }, { "epoch": 0.5196282828282829, "grad_norm": 0.06739141047000885, "learning_rate": 0.0001869035613002626, "loss": 0.1042, "step": 8038 }, { "epoch": 0.5196929292929293, "grad_norm": 0.05892856791615486, "learning_rate": 0.00018690017769404827, "loss": 0.0834, "step": 8039 }, { "epoch": 0.5197575757575758, "grad_norm": 0.05598752200603485, "learning_rate": 0.00018689679368143056, "loss": 0.0815, "step": 8040 }, { "epoch": 0.5198222222222222, "grad_norm": 0.055124007165431976, "learning_rate": 0.00018689340926242526, "loss": 0.0776, "step": 8041 }, { "epoch": 0.5198868686868687, "grad_norm": 0.05407913029193878, "learning_rate": 0.00018689002443704822, "loss": 0.0792, "step": 8042 }, { "epoch": 0.5199515151515152, "grad_norm": 0.05240151286125183, "learning_rate": 0.00018688663920531526, "loss": 0.0768, "step": 8043 }, { "epoch": 0.5200161616161616, "grad_norm": 0.056700143963098526, "learning_rate": 
0.00018688325356724223, "loss": 0.089, "step": 8044 }, { "epoch": 0.520080808080808, "grad_norm": 0.06450142711400986, "learning_rate": 0.0001868798675228449, "loss": 0.0898, "step": 8045 }, { "epoch": 0.5201454545454546, "grad_norm": 0.05190321430563927, "learning_rate": 0.0001868764810721392, "loss": 0.0753, "step": 8046 }, { "epoch": 0.520210101010101, "grad_norm": 0.05658072605729103, "learning_rate": 0.0001868730942151409, "loss": 0.0749, "step": 8047 }, { "epoch": 0.5202747474747474, "grad_norm": 0.0676039457321167, "learning_rate": 0.00018686970695186585, "loss": 0.0984, "step": 8048 }, { "epoch": 0.5202747474747474, "eval_bleu": 14.504629277667862, "eval_loss": 0.09205296635627747, "eval_runtime": 2.6562, "eval_samples_per_second": 12.047, "eval_steps_per_second": 1.506, "step": 8048 }, { "epoch": 0.520339393939394, "grad_norm": 0.05314463749527931, "learning_rate": 0.00018686631928232993, "loss": 0.0854, "step": 8049 }, { "epoch": 0.5204040404040404, "grad_norm": 0.09898544102907181, "learning_rate": 0.0001868629312065489, "loss": 0.0771, "step": 8050 }, { "epoch": 0.5204686868686869, "grad_norm": 0.05841261148452759, "learning_rate": 0.0001868595427245387, "loss": 0.0925, "step": 8051 }, { "epoch": 0.5205333333333333, "grad_norm": 0.04872497171163559, "learning_rate": 0.0001868561538363151, "loss": 0.0652, "step": 8052 }, { "epoch": 0.5205979797979798, "grad_norm": 0.06210535392165184, "learning_rate": 0.00018685276454189403, "loss": 0.0902, "step": 8053 }, { "epoch": 0.5206626262626263, "grad_norm": 0.056773412972688675, "learning_rate": 0.00018684937484129125, "loss": 0.0807, "step": 8054 }, { "epoch": 0.5207272727272727, "grad_norm": 0.05487416684627533, "learning_rate": 0.00018684598473452265, "loss": 0.0805, "step": 8055 }, { "epoch": 0.5207919191919191, "grad_norm": 0.0662560909986496, "learning_rate": 0.00018684259422160416, "loss": 0.104, "step": 8056 }, { "epoch": 0.5208565656565657, "grad_norm": 0.04683137312531471, "learning_rate": 
0.0001868392033025515, "loss": 0.0688, "step": 8057 }, { "epoch": 0.5209212121212121, "grad_norm": 0.05143234506249428, "learning_rate": 0.0001868358119773806, "loss": 0.076, "step": 8058 }, { "epoch": 0.5209858585858586, "grad_norm": 0.05574750155210495, "learning_rate": 0.0001868324202461073, "loss": 0.0786, "step": 8059 }, { "epoch": 0.5210505050505051, "grad_norm": 0.058614615350961685, "learning_rate": 0.00018682902810874754, "loss": 0.091, "step": 8060 }, { "epoch": 0.5211151515151515, "grad_norm": 0.058229222893714905, "learning_rate": 0.00018682563556531706, "loss": 0.0801, "step": 8061 }, { "epoch": 0.521179797979798, "grad_norm": 0.05697263404726982, "learning_rate": 0.00018682224261583184, "loss": 0.0831, "step": 8062 }, { "epoch": 0.5212444444444444, "grad_norm": 0.06947024911642075, "learning_rate": 0.00018681884926030767, "loss": 0.0918, "step": 8063 }, { "epoch": 0.5213090909090909, "grad_norm": 0.05299440771341324, "learning_rate": 0.00018681545549876043, "loss": 0.0771, "step": 8064 }, { "epoch": 0.5213090909090909, "eval_bleu": 12.999116836377597, "eval_loss": 0.09261239320039749, "eval_runtime": 2.5516, "eval_samples_per_second": 12.541, "eval_steps_per_second": 1.568, "step": 8064 }, { "epoch": 0.5213737373737374, "grad_norm": 0.05844137445092201, "learning_rate": 0.00018681206133120602, "loss": 0.0888, "step": 8065 }, { "epoch": 0.5214383838383838, "grad_norm": 0.06301189959049225, "learning_rate": 0.0001868086667576603, "loss": 0.0988, "step": 8066 }, { "epoch": 0.5215030303030304, "grad_norm": 0.056530363857746124, "learning_rate": 0.00018680527177813912, "loss": 0.0756, "step": 8067 }, { "epoch": 0.5215676767676768, "grad_norm": 0.05084362253546715, "learning_rate": 0.0001868018763926584, "loss": 0.0718, "step": 8068 }, { "epoch": 0.5216323232323232, "grad_norm": 0.0586542934179306, "learning_rate": 0.000186798480601234, "loss": 0.0829, "step": 8069 }, { "epoch": 0.5216969696969697, "grad_norm": 0.05610043182969093, "learning_rate": 
0.0001867950844038818, "loss": 0.0853, "step": 8070 }, { "epoch": 0.5217616161616162, "grad_norm": 0.0537438727915287, "learning_rate": 0.00018679168780061768, "loss": 0.0822, "step": 8071 }, { "epoch": 0.5218262626262626, "grad_norm": 0.05773867666721344, "learning_rate": 0.00018678829079145755, "loss": 0.0967, "step": 8072 }, { "epoch": 0.5218909090909091, "grad_norm": 0.0583631731569767, "learning_rate": 0.00018678489337641723, "loss": 0.0882, "step": 8073 }, { "epoch": 0.5219555555555555, "grad_norm": 0.05365129932761192, "learning_rate": 0.00018678149555551268, "loss": 0.0783, "step": 8074 }, { "epoch": 0.522020202020202, "grad_norm": 0.06064806878566742, "learning_rate": 0.0001867780973287598, "loss": 0.093, "step": 8075 }, { "epoch": 0.5220848484848485, "grad_norm": 0.05870945006608963, "learning_rate": 0.0001867746986961744, "loss": 0.0903, "step": 8076 }, { "epoch": 0.5221494949494949, "grad_norm": 0.0752149224281311, "learning_rate": 0.00018677129965777244, "loss": 0.0997, "step": 8077 }, { "epoch": 0.5222141414141415, "grad_norm": 0.06252361834049225, "learning_rate": 0.00018676790021356977, "loss": 0.0862, "step": 8078 }, { "epoch": 0.5222787878787879, "grad_norm": 0.0573294572532177, "learning_rate": 0.00018676450036358232, "loss": 0.0856, "step": 8079 }, { "epoch": 0.5223434343434343, "grad_norm": 0.06126003712415695, "learning_rate": 0.00018676110010782602, "loss": 0.0875, "step": 8080 }, { "epoch": 0.5223434343434343, "eval_bleu": 13.14398660740446, "eval_loss": 0.09243758022785187, "eval_runtime": 2.5551, "eval_samples_per_second": 12.524, "eval_steps_per_second": 1.565, "step": 8080 }, { "epoch": 0.5224080808080808, "grad_norm": 0.05620555579662323, "learning_rate": 0.00018675769944631672, "loss": 0.0883, "step": 8081 }, { "epoch": 0.5224727272727273, "grad_norm": 0.05991693213582039, "learning_rate": 0.00018675429837907033, "loss": 0.0931, "step": 8082 }, { "epoch": 0.5225373737373737, "grad_norm": 0.05624626949429512, "learning_rate": 
0.00018675089690610279, "loss": 0.083, "step": 8083 }, { "epoch": 0.5226020202020202, "grad_norm": 0.052377574145793915, "learning_rate": 0.00018674749502742998, "loss": 0.073, "step": 8084 }, { "epoch": 0.5226666666666666, "grad_norm": 0.056381676346063614, "learning_rate": 0.00018674409274306784, "loss": 0.0914, "step": 8085 }, { "epoch": 0.5227313131313132, "grad_norm": 0.059518154710531235, "learning_rate": 0.00018674069005303218, "loss": 0.0783, "step": 8086 }, { "epoch": 0.5227959595959596, "grad_norm": 0.054079268127679825, "learning_rate": 0.00018673728695733905, "loss": 0.0864, "step": 8087 }, { "epoch": 0.522860606060606, "grad_norm": 0.05449111759662628, "learning_rate": 0.00018673388345600428, "loss": 0.0789, "step": 8088 }, { "epoch": 0.5229252525252526, "grad_norm": 0.053463540971279144, "learning_rate": 0.00018673047954904384, "loss": 0.085, "step": 8089 }, { "epoch": 0.522989898989899, "grad_norm": 0.05365019291639328, "learning_rate": 0.0001867270752364736, "loss": 0.0766, "step": 8090 }, { "epoch": 0.5230545454545454, "grad_norm": 0.05615364387631416, "learning_rate": 0.0001867236705183095, "loss": 0.0762, "step": 8091 }, { "epoch": 0.5231191919191919, "grad_norm": 0.05114414542913437, "learning_rate": 0.00018672026539456743, "loss": 0.0796, "step": 8092 }, { "epoch": 0.5231838383838384, "grad_norm": 0.06401913613080978, "learning_rate": 0.00018671685986526338, "loss": 0.0931, "step": 8093 }, { "epoch": 0.5232484848484849, "grad_norm": 0.05319920927286148, "learning_rate": 0.00018671345393041325, "loss": 0.083, "step": 8094 }, { "epoch": 0.5233131313131313, "grad_norm": 0.05945587158203125, "learning_rate": 0.00018671004759003297, "loss": 0.085, "step": 8095 }, { "epoch": 0.5233777777777778, "grad_norm": 0.06035274639725685, "learning_rate": 0.00018670664084413841, "loss": 0.072, "step": 8096 }, { "epoch": 0.5233777777777778, "eval_bleu": 12.753771580455947, "eval_loss": 0.09237653762102127, "eval_runtime": 2.5958, "eval_samples_per_second": 
12.327, "eval_steps_per_second": 1.541, "step": 8096 }, { "epoch": 0.5234424242424243, "grad_norm": 0.05063999071717262, "learning_rate": 0.00018670323369274563, "loss": 0.0831, "step": 8097 }, { "epoch": 0.5235070707070707, "grad_norm": 0.06166744604706764, "learning_rate": 0.00018669982613587043, "loss": 0.0926, "step": 8098 }, { "epoch": 0.5235717171717171, "grad_norm": 0.05881870537996292, "learning_rate": 0.00018669641817352882, "loss": 0.0955, "step": 8099 }, { "epoch": 0.5236363636363637, "grad_norm": 0.051383793354034424, "learning_rate": 0.00018669300980573674, "loss": 0.0756, "step": 8100 }, { "epoch": 0.5237010101010101, "grad_norm": 0.08074943721294403, "learning_rate": 0.00018668960103251009, "loss": 0.1022, "step": 8101 }, { "epoch": 0.5237656565656565, "grad_norm": 0.05814139172434807, "learning_rate": 0.00018668619185386486, "loss": 0.0745, "step": 8102 }, { "epoch": 0.523830303030303, "grad_norm": 0.05597849190235138, "learning_rate": 0.00018668278226981696, "loss": 0.0904, "step": 8103 }, { "epoch": 0.5238949494949495, "grad_norm": 0.05532548576593399, "learning_rate": 0.00018667937228038234, "loss": 0.088, "step": 8104 }, { "epoch": 0.523959595959596, "grad_norm": 0.049798764288425446, "learning_rate": 0.00018667596188557696, "loss": 0.0847, "step": 8105 }, { "epoch": 0.5240242424242424, "grad_norm": 0.04529700428247452, "learning_rate": 0.00018667255108541678, "loss": 0.0676, "step": 8106 }, { "epoch": 0.5240888888888889, "grad_norm": 0.05694340169429779, "learning_rate": 0.0001866691398799177, "loss": 0.0879, "step": 8107 }, { "epoch": 0.5241535353535354, "grad_norm": 0.07598643749952316, "learning_rate": 0.0001866657282690957, "loss": 0.0908, "step": 8108 }, { "epoch": 0.5242181818181818, "grad_norm": 0.054334819316864014, "learning_rate": 0.00018666231625296678, "loss": 0.0839, "step": 8109 }, { "epoch": 0.5242828282828282, "grad_norm": 0.05691142380237579, "learning_rate": 0.0001866589038315468, "loss": 0.0883, "step": 8110 }, { "epoch": 
0.5243474747474748, "grad_norm": 0.05710835009813309, "learning_rate": 0.00018665549100485183, "loss": 0.0732, "step": 8111 }, { "epoch": 0.5244121212121212, "grad_norm": 0.04936891421675682, "learning_rate": 0.00018665207777289773, "loss": 0.0788, "step": 8112 }, { "epoch": 0.5244121212121212, "eval_bleu": 13.763155961545422, "eval_loss": 0.09214763343334198, "eval_runtime": 2.6123, "eval_samples_per_second": 12.25, "eval_steps_per_second": 1.531, "step": 8112 }, { "epoch": 0.5244767676767677, "grad_norm": 0.04991089180111885, "learning_rate": 0.00018664866413570054, "loss": 0.0767, "step": 8113 }, { "epoch": 0.5245414141414141, "grad_norm": 0.05383076146245003, "learning_rate": 0.00018664525009327618, "loss": 0.0831, "step": 8114 }, { "epoch": 0.5246060606060606, "grad_norm": 0.05507123842835426, "learning_rate": 0.0001866418356456406, "loss": 0.0823, "step": 8115 }, { "epoch": 0.5246707070707071, "grad_norm": 0.05143503472208977, "learning_rate": 0.00018663842079280987, "loss": 0.0765, "step": 8116 }, { "epoch": 0.5247353535353535, "grad_norm": 0.06402487307786942, "learning_rate": 0.00018663500553479985, "loss": 0.0976, "step": 8117 }, { "epoch": 0.5248, "grad_norm": 0.055830955505371094, "learning_rate": 0.00018663158987162654, "loss": 0.09, "step": 8118 }, { "epoch": 0.5248646464646465, "grad_norm": 0.056812841445207596, "learning_rate": 0.00018662817380330594, "loss": 0.0893, "step": 8119 }, { "epoch": 0.5249292929292929, "grad_norm": 0.07345831394195557, "learning_rate": 0.00018662475732985397, "loss": 0.097, "step": 8120 }, { "epoch": 0.5249939393939393, "grad_norm": 0.06089197099208832, "learning_rate": 0.0001866213404512867, "loss": 0.0952, "step": 8121 }, { "epoch": 0.5250585858585859, "grad_norm": 0.05826721340417862, "learning_rate": 0.00018661792316762005, "loss": 0.0876, "step": 8122 }, { "epoch": 0.5251232323232323, "grad_norm": 0.0505029559135437, "learning_rate": 0.00018661450547886998, "loss": 0.0728, "step": 8123 }, { "epoch": 
0.5251878787878788, "grad_norm": 0.05985327810049057, "learning_rate": 0.00018661108738505253, "loss": 0.0986, "step": 8124 }, { "epoch": 0.5252525252525253, "grad_norm": 0.048617757856845856, "learning_rate": 0.00018660766888618368, "loss": 0.075, "step": 8125 }, { "epoch": 0.5253171717171717, "grad_norm": 0.06735645979642868, "learning_rate": 0.00018660424998227933, "loss": 0.1006, "step": 8126 }, { "epoch": 0.5253818181818182, "grad_norm": 0.0503406748175621, "learning_rate": 0.00018660083067335558, "loss": 0.0721, "step": 8127 }, { "epoch": 0.5254464646464646, "grad_norm": 0.11167537420988083, "learning_rate": 0.00018659741095942834, "loss": 0.1029, "step": 8128 }, { "epoch": 0.5254464646464646, "eval_bleu": 13.299117857066399, "eval_loss": 0.0917312502861023, "eval_runtime": 2.611, "eval_samples_per_second": 12.256, "eval_steps_per_second": 1.532, "step": 8128 }, { "epoch": 0.5255111111111112, "grad_norm": 0.0579511933028698, "learning_rate": 0.00018659399084051365, "loss": 0.0851, "step": 8129 }, { "epoch": 0.5255757575757576, "grad_norm": 0.07113382965326309, "learning_rate": 0.0001865905703166275, "loss": 0.0991, "step": 8130 }, { "epoch": 0.525640404040404, "grad_norm": 0.06183483079075813, "learning_rate": 0.0001865871493877859, "loss": 0.0896, "step": 8131 }, { "epoch": 0.5257050505050505, "grad_norm": 0.05923070013523102, "learning_rate": 0.0001865837280540048, "loss": 0.0962, "step": 8132 }, { "epoch": 0.525769696969697, "grad_norm": 0.04951385036110878, "learning_rate": 0.0001865803063153002, "loss": 0.0755, "step": 8133 }, { "epoch": 0.5258343434343434, "grad_norm": 0.05455077439546585, "learning_rate": 0.00018657688417168816, "loss": 0.084, "step": 8134 }, { "epoch": 0.5258989898989899, "grad_norm": 0.06021414324641228, "learning_rate": 0.00018657346162318468, "loss": 0.0977, "step": 8135 }, { "epoch": 0.5259636363636364, "grad_norm": 0.06283588707447052, "learning_rate": 0.00018657003866980572, "loss": 0.098, "step": 8136 }, { "epoch": 
0.5260282828282828, "grad_norm": 0.06610030680894852, "learning_rate": 0.0001865666153115673, "loss": 0.0927, "step": 8137 }, { "epoch": 0.5260929292929293, "grad_norm": 0.05935713276267052, "learning_rate": 0.00018656319154848546, "loss": 0.0881, "step": 8138 }, { "epoch": 0.5261575757575757, "grad_norm": 0.053316615521907806, "learning_rate": 0.00018655976738057617, "loss": 0.0826, "step": 8139 }, { "epoch": 0.5262222222222223, "grad_norm": 0.048153653740882874, "learning_rate": 0.00018655634280785547, "loss": 0.0644, "step": 8140 }, { "epoch": 0.5262868686868687, "grad_norm": 0.05810118466615677, "learning_rate": 0.00018655291783033936, "loss": 0.0908, "step": 8141 }, { "epoch": 0.5263515151515151, "grad_norm": 0.05374021455645561, "learning_rate": 0.0001865494924480439, "loss": 0.07, "step": 8142 }, { "epoch": 0.5264161616161617, "grad_norm": 0.06478973478078842, "learning_rate": 0.00018654606666098503, "loss": 0.0986, "step": 8143 }, { "epoch": 0.5264808080808081, "grad_norm": 0.05463015288114548, "learning_rate": 0.00018654264046917885, "loss": 0.0767, "step": 8144 }, { "epoch": 0.5264808080808081, "eval_bleu": 15.950983594044432, "eval_loss": 0.09250018000602722, "eval_runtime": 2.5833, "eval_samples_per_second": 12.387, "eval_steps_per_second": 1.548, "step": 8144 }, { "epoch": 0.5265454545454545, "grad_norm": 0.05252375826239586, "learning_rate": 0.00018653921387264136, "loss": 0.081, "step": 8145 }, { "epoch": 0.526610101010101, "grad_norm": 0.06316515803337097, "learning_rate": 0.00018653578687138854, "loss": 0.0907, "step": 8146 }, { "epoch": 0.5266747474747475, "grad_norm": 0.05742136016488075, "learning_rate": 0.00018653235946543644, "loss": 0.09, "step": 8147 }, { "epoch": 0.526739393939394, "grad_norm": 0.07419617474079132, "learning_rate": 0.00018652893165480117, "loss": 0.1043, "step": 8148 }, { "epoch": 0.5268040404040404, "grad_norm": 0.057711247354745865, "learning_rate": 0.00018652550343949863, "loss": 0.0791, "step": 8149 }, { "epoch": 
0.5268686868686868, "grad_norm": 0.06053418293595314, "learning_rate": 0.00018652207481954496, "loss": 0.0921, "step": 8150 }, { "epoch": 0.5269333333333334, "grad_norm": 0.15552648901939392, "learning_rate": 0.00018651864579495613, "loss": 0.0959, "step": 8151 }, { "epoch": 0.5269979797979798, "grad_norm": 0.06005292385816574, "learning_rate": 0.0001865152163657482, "loss": 0.0904, "step": 8152 }, { "epoch": 0.5270626262626262, "grad_norm": 0.06239711120724678, "learning_rate": 0.0001865117865319372, "loss": 0.0906, "step": 8153 }, { "epoch": 0.5271272727272728, "grad_norm": 0.059815194457769394, "learning_rate": 0.00018650835629353916, "loss": 0.0888, "step": 8154 }, { "epoch": 0.5271919191919192, "grad_norm": 0.05872350186109543, "learning_rate": 0.00018650492565057015, "loss": 0.0934, "step": 8155 }, { "epoch": 0.5272565656565656, "grad_norm": 0.061077095568180084, "learning_rate": 0.0001865014946030462, "loss": 0.0816, "step": 8156 }, { "epoch": 0.5273212121212121, "grad_norm": 0.05857612192630768, "learning_rate": 0.00018649806315098336, "loss": 0.0904, "step": 8157 }, { "epoch": 0.5273858585858586, "grad_norm": 0.04974500834941864, "learning_rate": 0.0001864946312943977, "loss": 0.0703, "step": 8158 }, { "epoch": 0.5274505050505051, "grad_norm": 0.055091917514801025, "learning_rate": 0.0001864911990333052, "loss": 0.0833, "step": 8159 }, { "epoch": 0.5275151515151515, "grad_norm": 0.05399928241968155, "learning_rate": 0.000186487766367722, "loss": 0.0786, "step": 8160 }, { "epoch": 0.5275151515151515, "eval_bleu": 15.423135026208046, "eval_loss": 0.09146535396575928, "eval_runtime": 2.6385, "eval_samples_per_second": 12.128, "eval_steps_per_second": 1.516, "step": 8160 }, { "epoch": 0.5275797979797979, "grad_norm": 0.06263037025928497, "learning_rate": 0.00018648433329766407, "loss": 0.0804, "step": 8161 }, { "epoch": 0.5276444444444445, "grad_norm": 0.06118642911314964, "learning_rate": 0.0001864808998231475, "loss": 0.0725, "step": 8162 }, { "epoch": 
0.5277090909090909, "grad_norm": 0.06673570722341537, "learning_rate": 0.00018647746594418837, "loss": 0.1149, "step": 8163 }, { "epoch": 0.5277737373737373, "grad_norm": 0.058076225221157074, "learning_rate": 0.00018647403166080273, "loss": 0.0823, "step": 8164 }, { "epoch": 0.5278383838383839, "grad_norm": 0.06645149737596512, "learning_rate": 0.00018647059697300659, "loss": 0.0825, "step": 8165 }, { "epoch": 0.5279030303030303, "grad_norm": 0.057600297033786774, "learning_rate": 0.0001864671618808161, "loss": 0.0976, "step": 8166 }, { "epoch": 0.5279676767676768, "grad_norm": 0.056478723883628845, "learning_rate": 0.00018646372638424726, "loss": 0.0782, "step": 8167 }, { "epoch": 0.5280323232323232, "grad_norm": 0.05365762487053871, "learning_rate": 0.00018646029048331617, "loss": 0.079, "step": 8168 }, { "epoch": 0.5280969696969697, "grad_norm": 0.05698290094733238, "learning_rate": 0.0001864568541780389, "loss": 0.0974, "step": 8169 }, { "epoch": 0.5281616161616162, "grad_norm": 0.06262478977441788, "learning_rate": 0.00018645341746843146, "loss": 0.1023, "step": 8170 }, { "epoch": 0.5282262626262626, "grad_norm": 0.06686689704656601, "learning_rate": 0.00018644998035451, "loss": 0.0984, "step": 8171 }, { "epoch": 0.5282909090909091, "grad_norm": 0.06988895684480667, "learning_rate": 0.00018644654283629054, "loss": 0.0905, "step": 8172 }, { "epoch": 0.5283555555555556, "grad_norm": 0.056057363748550415, "learning_rate": 0.00018644310491378917, "loss": 0.0903, "step": 8173 }, { "epoch": 0.528420202020202, "grad_norm": 0.059943217784166336, "learning_rate": 0.000186439666587022, "loss": 0.1008, "step": 8174 }, { "epoch": 0.5284848484848484, "grad_norm": 0.05228625610470772, "learning_rate": 0.00018643622785600506, "loss": 0.0846, "step": 8175 }, { "epoch": 0.528549494949495, "grad_norm": 0.059641752392053604, "learning_rate": 0.0001864327887207545, "loss": 0.0918, "step": 8176 }, { "epoch": 0.528549494949495, "eval_bleu": 12.331679471693457, "eval_loss": 
0.09192488342523575, "eval_runtime": 2.5643, "eval_samples_per_second": 12.479, "eval_steps_per_second": 1.56, "step": 8176 }, { "epoch": 0.5286141414141414, "grad_norm": 0.05332181975245476, "learning_rate": 0.00018642934918128634, "loss": 0.0827, "step": 8177 }, { "epoch": 0.5286787878787879, "grad_norm": 0.06467719376087189, "learning_rate": 0.0001864259092376167, "loss": 0.0994, "step": 8178 }, { "epoch": 0.5287434343434343, "grad_norm": 0.05302804708480835, "learning_rate": 0.00018642246888976162, "loss": 0.0781, "step": 8179 }, { "epoch": 0.5288080808080808, "grad_norm": 0.05929264426231384, "learning_rate": 0.00018641902813773724, "loss": 0.081, "step": 8180 }, { "epoch": 0.5288727272727273, "grad_norm": 0.05968436226248741, "learning_rate": 0.00018641558698155963, "loss": 0.0786, "step": 8181 }, { "epoch": 0.5289373737373737, "grad_norm": 0.05421347916126251, "learning_rate": 0.0001864121454212449, "loss": 0.0792, "step": 8182 }, { "epoch": 0.5290020202020203, "grad_norm": 0.06089012697339058, "learning_rate": 0.00018640870345680913, "loss": 0.0994, "step": 8183 }, { "epoch": 0.5290666666666667, "grad_norm": 0.07299090176820755, "learning_rate": 0.0001864052610882684, "loss": 0.1015, "step": 8184 }, { "epoch": 0.5291313131313131, "grad_norm": 0.05812188982963562, "learning_rate": 0.00018640181831563885, "loss": 0.0869, "step": 8185 }, { "epoch": 0.5291959595959596, "grad_norm": 0.056718338280916214, "learning_rate": 0.00018639837513893657, "loss": 0.0863, "step": 8186 }, { "epoch": 0.5292606060606061, "grad_norm": 0.052731145173311234, "learning_rate": 0.00018639493155817765, "loss": 0.0753, "step": 8187 }, { "epoch": 0.5293252525252525, "grad_norm": 0.051588647067546844, "learning_rate": 0.00018639148757337818, "loss": 0.0758, "step": 8188 }, { "epoch": 0.529389898989899, "grad_norm": 0.05534090846776962, "learning_rate": 0.0001863880431845543, "loss": 0.086, "step": 8189 }, { "epoch": 0.5294545454545454, "grad_norm": 0.06360449641942978, "learning_rate": 
0.00018638459839172207, "loss": 0.0794, "step": 8190 }, { "epoch": 0.529519191919192, "grad_norm": 0.0637177899479866, "learning_rate": 0.00018638115319489766, "loss": 0.0993, "step": 8191 }, { "epoch": 0.5295838383838384, "grad_norm": 0.05807386338710785, "learning_rate": 0.00018637770759409716, "loss": 0.0865, "step": 8192 }, { "epoch": 0.5295838383838384, "eval_bleu": 15.967130906039495, "eval_loss": 0.09111778438091278, "eval_runtime": 2.6335, "eval_samples_per_second": 12.151, "eval_steps_per_second": 1.519, "step": 8192 }, { "epoch": 0.5296484848484848, "grad_norm": 0.061091382056474686, "learning_rate": 0.00018637426158933665, "loss": 0.0875, "step": 8193 }, { "epoch": 0.5297131313131314, "grad_norm": 0.0589834488928318, "learning_rate": 0.0001863708151806323, "loss": 0.0938, "step": 8194 }, { "epoch": 0.5297777777777778, "grad_norm": 0.05274652689695358, "learning_rate": 0.00018636736836800017, "loss": 0.0753, "step": 8195 }, { "epoch": 0.5298424242424242, "grad_norm": 0.05291749909520149, "learning_rate": 0.00018636392115145643, "loss": 0.0785, "step": 8196 }, { "epoch": 0.5299070707070707, "grad_norm": 0.05401220917701721, "learning_rate": 0.0001863604735310172, "loss": 0.083, "step": 8197 }, { "epoch": 0.5299717171717172, "grad_norm": 0.055646684020757675, "learning_rate": 0.00018635702550669856, "loss": 0.0818, "step": 8198 }, { "epoch": 0.5300363636363636, "grad_norm": 0.05552418902516365, "learning_rate": 0.00018635357707851667, "loss": 0.0811, "step": 8199 }, { "epoch": 0.5301010101010101, "grad_norm": 0.0542762354016304, "learning_rate": 0.00018635012824648762, "loss": 0.0899, "step": 8200 }, { "epoch": 0.5301656565656566, "grad_norm": 0.06189398467540741, "learning_rate": 0.0001863466790106276, "loss": 0.078, "step": 8201 }, { "epoch": 0.530230303030303, "grad_norm": 0.05979127809405327, "learning_rate": 0.0001863432293709527, "loss": 0.088, "step": 8202 }, { "epoch": 0.5302949494949495, "grad_norm": 0.07108569145202637, "learning_rate": 
0.00018633977932747906, "loss": 0.0953, "step": 8203 }, { "epoch": 0.5303595959595959, "grad_norm": 0.05051000416278839, "learning_rate": 0.0001863363288802228, "loss": 0.0715, "step": 8204 }, { "epoch": 0.5304242424242425, "grad_norm": 0.06007734686136246, "learning_rate": 0.0001863328780292001, "loss": 0.102, "step": 8205 }, { "epoch": 0.5304888888888889, "grad_norm": 0.06084006279706955, "learning_rate": 0.00018632942677442708, "loss": 0.1049, "step": 8206 }, { "epoch": 0.5305535353535353, "grad_norm": 0.07701319456100464, "learning_rate": 0.00018632597511591983, "loss": 0.1053, "step": 8207 }, { "epoch": 0.5306181818181818, "grad_norm": 0.05825622379779816, "learning_rate": 0.00018632252305369456, "loss": 0.0993, "step": 8208 }, { "epoch": 0.5306181818181818, "eval_bleu": 15.034497339513024, "eval_loss": 0.09159655123949051, "eval_runtime": 2.716, "eval_samples_per_second": 11.782, "eval_steps_per_second": 1.473, "step": 8208 }, { "epoch": 0.5306828282828283, "grad_norm": 0.06671900302171707, "learning_rate": 0.00018631907058776735, "loss": 0.0776, "step": 8209 }, { "epoch": 0.5307474747474747, "grad_norm": 0.06987617164850235, "learning_rate": 0.00018631561771815445, "loss": 0.0721, "step": 8210 }, { "epoch": 0.5308121212121212, "grad_norm": 0.06296186149120331, "learning_rate": 0.0001863121644448719, "loss": 0.0922, "step": 8211 }, { "epoch": 0.5308767676767677, "grad_norm": 0.0487523078918457, "learning_rate": 0.0001863087107679359, "loss": 0.0714, "step": 8212 }, { "epoch": 0.5309414141414142, "grad_norm": 0.05206483602523804, "learning_rate": 0.00018630525668736255, "loss": 0.0664, "step": 8213 }, { "epoch": 0.5310060606060606, "grad_norm": 0.06087940186262131, "learning_rate": 0.00018630180220316811, "loss": 0.0984, "step": 8214 }, { "epoch": 0.531070707070707, "grad_norm": 0.061995118856430054, "learning_rate": 0.00018629834731536863, "loss": 0.0933, "step": 8215 }, { "epoch": 0.5311353535353536, "grad_norm": 0.06555937975645065, "learning_rate": 
0.00018629489202398033, "loss": 0.0955, "step": 8216 }, { "epoch": 0.5312, "grad_norm": 0.054444488137960434, "learning_rate": 0.00018629143632901935, "loss": 0.079, "step": 8217 }, { "epoch": 0.5312646464646464, "grad_norm": 0.04922828823328018, "learning_rate": 0.00018628798023050184, "loss": 0.0745, "step": 8218 }, { "epoch": 0.531329292929293, "grad_norm": 0.057211704552173615, "learning_rate": 0.000186284523728444, "loss": 0.0865, "step": 8219 }, { "epoch": 0.5313939393939394, "grad_norm": 0.06343744695186615, "learning_rate": 0.00018628106682286191, "loss": 0.0914, "step": 8220 }, { "epoch": 0.5314585858585859, "grad_norm": 0.06144876405596733, "learning_rate": 0.00018627760951377186, "loss": 0.0909, "step": 8221 }, { "epoch": 0.5315232323232323, "grad_norm": 0.0568680502474308, "learning_rate": 0.00018627415180118992, "loss": 0.0859, "step": 8222 }, { "epoch": 0.5315878787878788, "grad_norm": 0.049704354256391525, "learning_rate": 0.0001862706936851323, "loss": 0.0662, "step": 8223 }, { "epoch": 0.5316525252525253, "grad_norm": 0.05485045164823532, "learning_rate": 0.00018626723516561514, "loss": 0.076, "step": 8224 }, { "epoch": 0.5316525252525253, "eval_bleu": 12.218024695365063, "eval_loss": 0.09018754959106445, "eval_runtime": 2.7145, "eval_samples_per_second": 11.789, "eval_steps_per_second": 1.474, "step": 8224 }, { "epoch": 0.5317171717171717, "grad_norm": 0.05384925380349159, "learning_rate": 0.00018626377624265467, "loss": 0.0809, "step": 8225 }, { "epoch": 0.5317818181818181, "grad_norm": 0.058380234986543655, "learning_rate": 0.00018626031691626704, "loss": 0.0989, "step": 8226 }, { "epoch": 0.5318464646464647, "grad_norm": 0.05814234912395477, "learning_rate": 0.0001862568571864684, "loss": 0.087, "step": 8227 }, { "epoch": 0.5319111111111111, "grad_norm": 0.055581558495759964, "learning_rate": 0.000186253397053275, "loss": 0.0946, "step": 8228 }, { "epoch": 0.5319757575757575, "grad_norm": 0.0517161563038826, "learning_rate": 
0.00018624993651670293, "loss": 0.0664, "step": 8229 }, { "epoch": 0.5320404040404041, "grad_norm": 0.05167453736066818, "learning_rate": 0.00018624647557676841, "loss": 0.0779, "step": 8230 }, { "epoch": 0.5321050505050505, "grad_norm": 0.05502316355705261, "learning_rate": 0.00018624301423348768, "loss": 0.0697, "step": 8231 }, { "epoch": 0.532169696969697, "grad_norm": 0.055368680506944656, "learning_rate": 0.00018623955248687688, "loss": 0.0854, "step": 8232 }, { "epoch": 0.5322343434343434, "grad_norm": 0.05876335874199867, "learning_rate": 0.00018623609033695216, "loss": 0.0856, "step": 8233 }, { "epoch": 0.5322989898989899, "grad_norm": 0.06381727755069733, "learning_rate": 0.0001862326277837298, "loss": 0.1002, "step": 8234 }, { "epoch": 0.5323636363636364, "grad_norm": 0.057452376931905746, "learning_rate": 0.00018622916482722595, "loss": 0.0916, "step": 8235 }, { "epoch": 0.5324282828282828, "grad_norm": 0.05934375151991844, "learning_rate": 0.00018622570146745678, "loss": 0.0894, "step": 8236 }, { "epoch": 0.5324929292929292, "grad_norm": 0.05105289816856384, "learning_rate": 0.0001862222377044385, "loss": 0.0798, "step": 8237 }, { "epoch": 0.5325575757575758, "grad_norm": 0.057537905871868134, "learning_rate": 0.00018621877353818732, "loss": 0.0884, "step": 8238 }, { "epoch": 0.5326222222222222, "grad_norm": 0.04983418807387352, "learning_rate": 0.00018621530896871947, "loss": 0.0738, "step": 8239 }, { "epoch": 0.5326868686868687, "grad_norm": 0.0654606968164444, "learning_rate": 0.00018621184399605112, "loss": 0.0835, "step": 8240 }, { "epoch": 0.5326868686868687, "eval_bleu": 17.106619389405196, "eval_loss": 0.09067206084728241, "eval_runtime": 2.6745, "eval_samples_per_second": 11.965, "eval_steps_per_second": 1.496, "step": 8240 }, { "epoch": 0.5327515151515152, "grad_norm": 0.0471956767141819, "learning_rate": 0.00018620837862019847, "loss": 0.0728, "step": 8241 }, { "epoch": 0.5328161616161616, "grad_norm": 0.049919672310352325, "learning_rate": 
0.0001862049128411777, "loss": 0.0734, "step": 8242 }, { "epoch": 0.5328808080808081, "grad_norm": 0.06254030019044876, "learning_rate": 0.0001862014466590051, "loss": 0.0986, "step": 8243 }, { "epoch": 0.5329454545454545, "grad_norm": 0.05742871016263962, "learning_rate": 0.0001861979800736968, "loss": 0.0949, "step": 8244 }, { "epoch": 0.533010101010101, "grad_norm": 0.0682227686047554, "learning_rate": 0.00018619451308526905, "loss": 0.0868, "step": 8245 }, { "epoch": 0.5330747474747475, "grad_norm": 0.057319898158311844, "learning_rate": 0.00018619104569373804, "loss": 0.0731, "step": 8246 }, { "epoch": 0.5331393939393939, "grad_norm": 0.07615464925765991, "learning_rate": 0.00018618757789912004, "loss": 0.0928, "step": 8247 }, { "epoch": 0.5332040404040405, "grad_norm": 0.05360186845064163, "learning_rate": 0.00018618410970143123, "loss": 0.0792, "step": 8248 }, { "epoch": 0.5332686868686869, "grad_norm": 0.047628216445446014, "learning_rate": 0.0001861806411006878, "loss": 0.0723, "step": 8249 }, { "epoch": 0.5333333333333333, "grad_norm": 0.051107682287693024, "learning_rate": 0.00018617717209690602, "loss": 0.07, "step": 8250 }, { "epoch": 0.5333979797979798, "grad_norm": 0.06712552905082703, "learning_rate": 0.00018617370269010208, "loss": 0.084, "step": 8251 }, { "epoch": 0.5334626262626263, "grad_norm": 0.052443407475948334, "learning_rate": 0.00018617023288029224, "loss": 0.0619, "step": 8252 }, { "epoch": 0.5335272727272727, "grad_norm": 0.06523042917251587, "learning_rate": 0.00018616676266749272, "loss": 0.103, "step": 8253 }, { "epoch": 0.5335919191919192, "grad_norm": 0.05097396299242973, "learning_rate": 0.0001861632920517197, "loss": 0.0743, "step": 8254 }, { "epoch": 0.5336565656565656, "grad_norm": 0.060317009687423706, "learning_rate": 0.00018615982103298948, "loss": 0.0799, "step": 8255 }, { "epoch": 0.5337212121212122, "grad_norm": 0.05907973274588585, "learning_rate": 0.00018615634961131826, "loss": 0.0856, "step": 8256 }, { "epoch": 
0.5337212121212122, "eval_bleu": 16.145015215905637, "eval_loss": 0.09140747785568237, "eval_runtime": 2.6819, "eval_samples_per_second": 11.932, "eval_steps_per_second": 1.491, "step": 8256 }, { "epoch": 0.5337858585858586, "grad_norm": 0.05299656465649605, "learning_rate": 0.00018615287778672224, "loss": 0.0742, "step": 8257 }, { "epoch": 0.533850505050505, "grad_norm": 0.05650373548269272, "learning_rate": 0.00018614940555921773, "loss": 0.0867, "step": 8258 }, { "epoch": 0.5339151515151516, "grad_norm": 0.059793610125780106, "learning_rate": 0.00018614593292882092, "loss": 0.0932, "step": 8259 }, { "epoch": 0.533979797979798, "grad_norm": 0.0863662138581276, "learning_rate": 0.00018614245989554803, "loss": 0.0882, "step": 8260 }, { "epoch": 0.5340444444444444, "grad_norm": 0.05435851216316223, "learning_rate": 0.0001861389864594154, "loss": 0.0706, "step": 8261 }, { "epoch": 0.5341090909090909, "grad_norm": 0.05007358640432358, "learning_rate": 0.00018613551262043916, "loss": 0.0755, "step": 8262 }, { "epoch": 0.5341737373737374, "grad_norm": 0.05672455206513405, "learning_rate": 0.00018613203837863562, "loss": 0.0853, "step": 8263 }, { "epoch": 0.5342383838383838, "grad_norm": 0.08078844100236893, "learning_rate": 0.00018612856373402098, "loss": 0.075, "step": 8264 }, { "epoch": 0.5343030303030303, "grad_norm": 0.05466439574956894, "learning_rate": 0.00018612508868661153, "loss": 0.0681, "step": 8265 }, { "epoch": 0.5343676767676767, "grad_norm": 0.058427222073078156, "learning_rate": 0.00018612161323642353, "loss": 0.0936, "step": 8266 }, { "epoch": 0.5344323232323233, "grad_norm": 0.05492251366376877, "learning_rate": 0.00018611813738347323, "loss": 0.0829, "step": 8267 }, { "epoch": 0.5344969696969697, "grad_norm": 0.07162462919950485, "learning_rate": 0.00018611466112777687, "loss": 0.0795, "step": 8268 }, { "epoch": 0.5345616161616161, "grad_norm": 0.054898444563150406, "learning_rate": 0.00018611118446935066, "loss": 0.0821, "step": 8269 }, { "epoch": 
0.5346262626262627, "grad_norm": 0.053176648914813995, "learning_rate": 0.00018610770740821095, "loss": 0.0763, "step": 8270 }, { "epoch": 0.5346909090909091, "grad_norm": 0.06907225400209427, "learning_rate": 0.00018610422994437396, "loss": 0.0949, "step": 8271 }, { "epoch": 0.5347555555555555, "grad_norm": 0.06239648908376694, "learning_rate": 0.00018610075207785594, "loss": 0.0927, "step": 8272 }, { "epoch": 0.5347555555555555, "eval_bleu": 19.223231436842184, "eval_loss": 0.08939863741397858, "eval_runtime": 2.7272, "eval_samples_per_second": 11.734, "eval_steps_per_second": 1.467, "step": 8272 }, { "epoch": 0.534820202020202, "grad_norm": 0.05446686968207359, "learning_rate": 0.00018609727380867317, "loss": 0.0825, "step": 8273 }, { "epoch": 0.5348848484848485, "grad_norm": 0.050335466861724854, "learning_rate": 0.00018609379513684192, "loss": 0.0826, "step": 8274 }, { "epoch": 0.534949494949495, "grad_norm": 0.0544157437980175, "learning_rate": 0.00018609031606237845, "loss": 0.0822, "step": 8275 }, { "epoch": 0.5350141414141414, "grad_norm": 0.05645119026303291, "learning_rate": 0.00018608683658529902, "loss": 0.0846, "step": 8276 }, { "epoch": 0.5350787878787879, "grad_norm": 0.04752171039581299, "learning_rate": 0.0001860833567056199, "loss": 0.075, "step": 8277 }, { "epoch": 0.5351434343434344, "grad_norm": 0.051613397896289825, "learning_rate": 0.00018607987642335744, "loss": 0.0818, "step": 8278 }, { "epoch": 0.5352080808080808, "grad_norm": 0.057909850031137466, "learning_rate": 0.0001860763957385278, "loss": 0.0915, "step": 8279 }, { "epoch": 0.5352727272727272, "grad_norm": 0.05699526146054268, "learning_rate": 0.00018607291465114735, "loss": 0.0853, "step": 8280 }, { "epoch": 0.5353373737373738, "grad_norm": 0.057267285883426666, "learning_rate": 0.0001860694331612323, "loss": 0.0763, "step": 8281 }, { "epoch": 0.5354020202020202, "grad_norm": 0.049214329570531845, "learning_rate": 0.000186065951268799, "loss": 0.0762, "step": 8282 }, { "epoch": 
0.5354666666666666, "grad_norm": 0.0625491514801979, "learning_rate": 0.00018606246897386364, "loss": 0.0958, "step": 8283 }, { "epoch": 0.5355313131313131, "grad_norm": 0.06192827969789505, "learning_rate": 0.00018605898627644264, "loss": 0.1096, "step": 8284 }, { "epoch": 0.5355959595959596, "grad_norm": 0.059381015598773956, "learning_rate": 0.00018605550317655215, "loss": 0.0885, "step": 8285 }, { "epoch": 0.5356606060606061, "grad_norm": 0.05330536514520645, "learning_rate": 0.00018605201967420855, "loss": 0.0847, "step": 8286 }, { "epoch": 0.5357252525252525, "grad_norm": 0.04886066913604736, "learning_rate": 0.00018604853576942808, "loss": 0.0734, "step": 8287 }, { "epoch": 0.535789898989899, "grad_norm": 0.057301830500364304, "learning_rate": 0.00018604505146222708, "loss": 0.0871, "step": 8288 }, { "epoch": 0.535789898989899, "eval_bleu": 18.954430670542465, "eval_loss": 0.0894395038485527, "eval_runtime": 2.6518, "eval_samples_per_second": 12.067, "eval_steps_per_second": 1.508, "step": 8288 }, { "epoch": 0.5358545454545455, "grad_norm": 0.044807419180870056, "learning_rate": 0.00018604156675262182, "loss": 0.0648, "step": 8289 }, { "epoch": 0.5359191919191919, "grad_norm": 0.05310199782252312, "learning_rate": 0.00018603808164062858, "loss": 0.0699, "step": 8290 }, { "epoch": 0.5359838383838383, "grad_norm": 0.054336246103048325, "learning_rate": 0.00018603459612626368, "loss": 0.0871, "step": 8291 }, { "epoch": 0.5360484848484849, "grad_norm": 0.06122654303908348, "learning_rate": 0.00018603111020954342, "loss": 0.0936, "step": 8292 }, { "epoch": 0.5361131313131313, "grad_norm": 0.06756310909986496, "learning_rate": 0.00018602762389048413, "loss": 0.0985, "step": 8293 }, { "epoch": 0.5361777777777778, "grad_norm": 0.07783056795597076, "learning_rate": 0.00018602413716910205, "loss": 0.0924, "step": 8294 }, { "epoch": 0.5362424242424242, "grad_norm": 0.04656273499131203, "learning_rate": 0.0001860206500454135, "loss": 0.0705, "step": 8295 }, { "epoch": 
0.5363070707070707, "grad_norm": 0.06412513554096222, "learning_rate": 0.00018601716251943483, "loss": 0.1028, "step": 8296 }, { "epoch": 0.5363717171717172, "grad_norm": 0.0647125318646431, "learning_rate": 0.00018601367459118232, "loss": 0.0968, "step": 8297 }, { "epoch": 0.5364363636363636, "grad_norm": 0.05226157605648041, "learning_rate": 0.00018601018626067232, "loss": 0.0905, "step": 8298 }, { "epoch": 0.5365010101010101, "grad_norm": 0.05691823363304138, "learning_rate": 0.0001860066975279211, "loss": 0.0801, "step": 8299 }, { "epoch": 0.5365656565656566, "grad_norm": 0.05182036757469177, "learning_rate": 0.00018600320839294498, "loss": 0.0869, "step": 8300 }, { "epoch": 0.536630303030303, "grad_norm": 0.06154217943549156, "learning_rate": 0.0001859997188557603, "loss": 0.0896, "step": 8301 }, { "epoch": 0.5366949494949494, "grad_norm": 0.06061071157455444, "learning_rate": 0.00018599622891638334, "loss": 0.0899, "step": 8302 }, { "epoch": 0.536759595959596, "grad_norm": 0.06307435035705566, "learning_rate": 0.00018599273857483045, "loss": 0.08, "step": 8303 }, { "epoch": 0.5368242424242424, "grad_norm": 0.056303080171346664, "learning_rate": 0.00018598924783111797, "loss": 0.0875, "step": 8304 }, { "epoch": 0.5368242424242424, "eval_bleu": 19.58538118797418, "eval_loss": 0.08850456774234772, "eval_runtime": 2.5748, "eval_samples_per_second": 12.428, "eval_steps_per_second": 1.554, "step": 8304 }, { "epoch": 0.5368888888888889, "grad_norm": 0.05522151291370392, "learning_rate": 0.0001859857566852622, "loss": 0.07, "step": 8305 }, { "epoch": 0.5369535353535354, "grad_norm": 0.05094585940241814, "learning_rate": 0.00018598226513727946, "loss": 0.0769, "step": 8306 }, { "epoch": 0.5370181818181818, "grad_norm": 0.052457161247730255, "learning_rate": 0.0001859787731871861, "loss": 0.0668, "step": 8307 }, { "epoch": 0.5370828282828283, "grad_norm": 0.0579199343919754, "learning_rate": 0.00018597528083499844, "loss": 0.0872, "step": 8308 }, { "epoch": 
0.5371474747474747, "grad_norm": 0.06060720980167389, "learning_rate": 0.00018597178808073283, "loss": 0.0796, "step": 8309 }, { "epoch": 0.5372121212121213, "grad_norm": 0.06388574093580246, "learning_rate": 0.00018596829492440557, "loss": 0.1016, "step": 8310 }, { "epoch": 0.5372767676767677, "grad_norm": 0.06407853215932846, "learning_rate": 0.00018596480136603302, "loss": 0.0987, "step": 8311 }, { "epoch": 0.5373414141414141, "grad_norm": 0.04617239534854889, "learning_rate": 0.0001859613074056315, "loss": 0.0701, "step": 8312 }, { "epoch": 0.5374060606060606, "grad_norm": 0.06006339192390442, "learning_rate": 0.0001859578130432174, "loss": 0.0934, "step": 8313 }, { "epoch": 0.5374707070707071, "grad_norm": 0.05537143722176552, "learning_rate": 0.00018595431827880699, "loss": 0.0861, "step": 8314 }, { "epoch": 0.5375353535353535, "grad_norm": 0.06844300031661987, "learning_rate": 0.00018595082311241665, "loss": 0.0871, "step": 8315 }, { "epoch": 0.5376, "grad_norm": 0.052262332290410995, "learning_rate": 0.00018594732754406276, "loss": 0.0796, "step": 8316 }, { "epoch": 0.5376646464646465, "grad_norm": 0.062317464500665665, "learning_rate": 0.0001859438315737616, "loss": 0.0835, "step": 8317 }, { "epoch": 0.537729292929293, "grad_norm": 0.05691210553050041, "learning_rate": 0.00018594033520152956, "loss": 0.0867, "step": 8318 }, { "epoch": 0.5377939393939394, "grad_norm": 0.055950846523046494, "learning_rate": 0.00018593683842738296, "loss": 0.0805, "step": 8319 }, { "epoch": 0.5378585858585858, "grad_norm": 0.047129467129707336, "learning_rate": 0.00018593334125133822, "loss": 0.069, "step": 8320 }, { "epoch": 0.5378585858585858, "eval_bleu": 21.024658686103276, "eval_loss": 0.08874163776636124, "eval_runtime": 2.6543, "eval_samples_per_second": 12.056, "eval_steps_per_second": 1.507, "step": 8320 }, { "epoch": 0.5379232323232324, "grad_norm": 0.059489376842975616, "learning_rate": 0.00018592984367341164, "loss": 0.0947, "step": 8321 }, { "epoch": 
0.5379878787878788, "grad_norm": 0.06001853197813034, "learning_rate": 0.00018592634569361959, "loss": 0.1006, "step": 8322 }, { "epoch": 0.5380525252525252, "grad_norm": 0.055997006595134735, "learning_rate": 0.00018592284731197843, "loss": 0.09, "step": 8323 }, { "epoch": 0.5381171717171718, "grad_norm": 0.05804590880870819, "learning_rate": 0.0001859193485285045, "loss": 0.0848, "step": 8324 }, { "epoch": 0.5381818181818182, "grad_norm": 0.06728817522525787, "learning_rate": 0.00018591584934321418, "loss": 0.0936, "step": 8325 }, { "epoch": 0.5382464646464646, "grad_norm": 0.05033940449357033, "learning_rate": 0.00018591234975612385, "loss": 0.0709, "step": 8326 }, { "epoch": 0.5383111111111111, "grad_norm": 0.054995425045490265, "learning_rate": 0.00018590884976724988, "loss": 0.0876, "step": 8327 }, { "epoch": 0.5383757575757576, "grad_norm": 0.06335888057947159, "learning_rate": 0.00018590534937660858, "loss": 0.0867, "step": 8328 }, { "epoch": 0.538440404040404, "grad_norm": 0.05561644956469536, "learning_rate": 0.00018590184858421639, "loss": 0.0702, "step": 8329 }, { "epoch": 0.5385050505050505, "grad_norm": 0.05800582468509674, "learning_rate": 0.0001858983473900896, "loss": 0.0908, "step": 8330 }, { "epoch": 0.5385696969696969, "grad_norm": 0.05595811828970909, "learning_rate": 0.0001858948457942447, "loss": 0.0772, "step": 8331 }, { "epoch": 0.5386343434343435, "grad_norm": 0.05636722967028618, "learning_rate": 0.000185891343796698, "loss": 0.0908, "step": 8332 }, { "epoch": 0.5386989898989899, "grad_norm": 0.05745876953005791, "learning_rate": 0.00018588784139746584, "loss": 0.0909, "step": 8333 }, { "epoch": 0.5387636363636363, "grad_norm": 0.059869375079870224, "learning_rate": 0.00018588433859656467, "loss": 0.1004, "step": 8334 }, { "epoch": 0.5388282828282829, "grad_norm": 0.062045373022556305, "learning_rate": 0.00018588083539401083, "loss": 0.1018, "step": 8335 }, { "epoch": 0.5388929292929293, "grad_norm": 0.05753149837255478, "learning_rate": 
0.00018587733178982072, "loss": 0.0818, "step": 8336 }, { "epoch": 0.5388929292929293, "eval_bleu": 17.834654120806253, "eval_loss": 0.08853840827941895, "eval_runtime": 2.6237, "eval_samples_per_second": 12.196, "eval_steps_per_second": 1.525, "step": 8336 }, { "epoch": 0.5389575757575757, "grad_norm": 0.05332630127668381, "learning_rate": 0.00018587382778401072, "loss": 0.0891, "step": 8337 }, { "epoch": 0.5390222222222222, "grad_norm": 0.09478280693292618, "learning_rate": 0.0001858703233765972, "loss": 0.1018, "step": 8338 }, { "epoch": 0.5390868686868687, "grad_norm": 0.05538664385676384, "learning_rate": 0.0001858668185675966, "loss": 0.0758, "step": 8339 }, { "epoch": 0.5391515151515152, "grad_norm": 0.0640513151884079, "learning_rate": 0.00018586331335702526, "loss": 0.0853, "step": 8340 }, { "epoch": 0.5392161616161616, "grad_norm": 0.053850021213293076, "learning_rate": 0.00018585980774489958, "loss": 0.0845, "step": 8341 }, { "epoch": 0.539280808080808, "grad_norm": 0.05923475697636604, "learning_rate": 0.00018585630173123597, "loss": 0.0877, "step": 8342 }, { "epoch": 0.5393454545454546, "grad_norm": 0.09389933198690414, "learning_rate": 0.00018585279531605083, "loss": 0.1006, "step": 8343 }, { "epoch": 0.539410101010101, "grad_norm": 0.05449429899454117, "learning_rate": 0.00018584928849936053, "loss": 0.0802, "step": 8344 }, { "epoch": 0.5394747474747474, "grad_norm": 0.05550706386566162, "learning_rate": 0.00018584578128118148, "loss": 0.082, "step": 8345 }, { "epoch": 0.539539393939394, "grad_norm": 0.05854269489645958, "learning_rate": 0.00018584227366153012, "loss": 0.0889, "step": 8346 }, { "epoch": 0.5396040404040404, "grad_norm": 0.05781397223472595, "learning_rate": 0.00018583876564042285, "loss": 0.0896, "step": 8347 }, { "epoch": 0.5396686868686869, "grad_norm": 0.07529180496931076, "learning_rate": 0.000185835257217876, "loss": 0.1129, "step": 8348 }, { "epoch": 0.5397333333333333, "grad_norm": 0.05766432359814644, "learning_rate": 
0.00018583174839390603, "loss": 0.0905, "step": 8349 }, { "epoch": 0.5397979797979798, "grad_norm": 0.06224052980542183, "learning_rate": 0.00018582823916852935, "loss": 0.0862, "step": 8350 }, { "epoch": 0.5398626262626263, "grad_norm": 0.06260453909635544, "learning_rate": 0.00018582472954176238, "loss": 0.0878, "step": 8351 }, { "epoch": 0.5399272727272727, "grad_norm": 0.0520743802189827, "learning_rate": 0.00018582121951362152, "loss": 0.0733, "step": 8352 }, { "epoch": 0.5399272727272727, "eval_bleu": 16.107070007067154, "eval_loss": 0.08820085227489471, "eval_runtime": 2.6822, "eval_samples_per_second": 11.931, "eval_steps_per_second": 1.491, "step": 8352 }, { "epoch": 0.5399919191919192, "grad_norm": 0.05755551531910896, "learning_rate": 0.0001858177090841232, "loss": 0.092, "step": 8353 }, { "epoch": 0.5400565656565657, "grad_norm": 0.05572543293237686, "learning_rate": 0.00018581419825328382, "loss": 0.0896, "step": 8354 }, { "epoch": 0.5401212121212121, "grad_norm": 0.05118353292346001, "learning_rate": 0.0001858106870211198, "loss": 0.081, "step": 8355 }, { "epoch": 0.5401858585858585, "grad_norm": 0.05387420207262039, "learning_rate": 0.0001858071753876476, "loss": 0.0916, "step": 8356 }, { "epoch": 0.5402505050505051, "grad_norm": 0.06279915571212769, "learning_rate": 0.00018580366335288354, "loss": 0.0978, "step": 8357 }, { "epoch": 0.5403151515151515, "grad_norm": 0.05241883173584938, "learning_rate": 0.00018580015091684414, "loss": 0.0808, "step": 8358 }, { "epoch": 0.540379797979798, "grad_norm": 0.06007806956768036, "learning_rate": 0.0001857966380795458, "loss": 0.0855, "step": 8359 }, { "epoch": 0.5404444444444444, "grad_norm": 0.07481272518634796, "learning_rate": 0.00018579312484100498, "loss": 0.1046, "step": 8360 }, { "epoch": 0.5405090909090909, "grad_norm": 0.04217597097158432, "learning_rate": 0.00018578961120123806, "loss": 0.0597, "step": 8361 }, { "epoch": 0.5405737373737374, "grad_norm": 0.05918454751372337, "learning_rate": 
0.00018578609716026148, "loss": 0.0968, "step": 8362 }, { "epoch": 0.5406383838383838, "grad_norm": 0.04481532424688339, "learning_rate": 0.0001857825827180917, "loss": 0.0669, "step": 8363 }, { "epoch": 0.5407030303030304, "grad_norm": 0.05391525477170944, "learning_rate": 0.00018577906787474512, "loss": 0.087, "step": 8364 }, { "epoch": 0.5407676767676768, "grad_norm": 0.05525801330804825, "learning_rate": 0.00018577555263023818, "loss": 0.0813, "step": 8365 }, { "epoch": 0.5408323232323232, "grad_norm": 0.05640725418925285, "learning_rate": 0.00018577203698458736, "loss": 0.0847, "step": 8366 }, { "epoch": 0.5408969696969697, "grad_norm": 0.052552398294210434, "learning_rate": 0.00018576852093780909, "loss": 0.0811, "step": 8367 }, { "epoch": 0.5409616161616162, "grad_norm": 0.0614917054772377, "learning_rate": 0.00018576500448991978, "loss": 0.0954, "step": 8368 }, { "epoch": 0.5409616161616162, "eval_bleu": 19.02898797411804, "eval_loss": 0.08984184265136719, "eval_runtime": 2.7799, "eval_samples_per_second": 11.511, "eval_steps_per_second": 1.439, "step": 8368 }, { "epoch": 0.5410262626262626, "grad_norm": 0.05920136347413063, "learning_rate": 0.00018576148764093593, "loss": 0.0982, "step": 8369 }, { "epoch": 0.5410909090909091, "grad_norm": 0.06127585098147392, "learning_rate": 0.00018575797039087393, "loss": 0.0981, "step": 8370 }, { "epoch": 0.5411555555555555, "grad_norm": 0.05832083895802498, "learning_rate": 0.00018575445273975025, "loss": 0.1018, "step": 8371 }, { "epoch": 0.541220202020202, "grad_norm": 0.054121069610118866, "learning_rate": 0.00018575093468758136, "loss": 0.079, "step": 8372 }, { "epoch": 0.5412848484848485, "grad_norm": 0.05774083361029625, "learning_rate": 0.00018574741623438368, "loss": 0.0892, "step": 8373 }, { "epoch": 0.5413494949494949, "grad_norm": 0.05657060071825981, "learning_rate": 0.00018574389738017368, "loss": 0.0892, "step": 8374 }, { "epoch": 0.5414141414141415, "grad_norm": 0.059504810720682144, "learning_rate": 
0.00018574037812496785, "loss": 0.0955, "step": 8375 }, { "epoch": 0.5414787878787879, "grad_norm": 0.06212432309985161, "learning_rate": 0.0001857368584687826, "loss": 0.1045, "step": 8376 }, { "epoch": 0.5415434343434343, "grad_norm": 0.051974181085824966, "learning_rate": 0.00018573333841163437, "loss": 0.0737, "step": 8377 }, { "epoch": 0.5416080808080808, "grad_norm": 0.1502837836742401, "learning_rate": 0.0001857298179535397, "loss": 0.1228, "step": 8378 }, { "epoch": 0.5416727272727273, "grad_norm": 0.05611203610897064, "learning_rate": 0.00018572629709451502, "loss": 0.0857, "step": 8379 }, { "epoch": 0.5417373737373737, "grad_norm": 0.06193416565656662, "learning_rate": 0.00018572277583457677, "loss": 0.1036, "step": 8380 }, { "epoch": 0.5418020202020202, "grad_norm": 0.05550897866487503, "learning_rate": 0.00018571925417374142, "loss": 0.0906, "step": 8381 }, { "epoch": 0.5418666666666667, "grad_norm": 0.15446196496486664, "learning_rate": 0.0001857157321120255, "loss": 0.1296, "step": 8382 }, { "epoch": 0.5419313131313132, "grad_norm": 0.04985259845852852, "learning_rate": 0.00018571220964944541, "loss": 0.0701, "step": 8383 }, { "epoch": 0.5419959595959596, "grad_norm": 0.052967000752687454, "learning_rate": 0.00018570868678601766, "loss": 0.0846, "step": 8384 }, { "epoch": 0.5419959595959596, "eval_bleu": 16.633366712327398, "eval_loss": 0.0888901948928833, "eval_runtime": 2.788, "eval_samples_per_second": 11.478, "eval_steps_per_second": 1.435, "step": 8384 }, { "epoch": 0.542060606060606, "grad_norm": 0.048540957272052765, "learning_rate": 0.0001857051635217587, "loss": 0.0772, "step": 8385 }, { "epoch": 0.5421252525252526, "grad_norm": 0.06004468724131584, "learning_rate": 0.00018570163985668505, "loss": 0.0935, "step": 8386 }, { "epoch": 0.542189898989899, "grad_norm": 0.06196308508515358, "learning_rate": 0.00018569811579081312, "loss": 0.1168, "step": 8387 }, { "epoch": 0.5422545454545454, "grad_norm": 0.05243933945894241, "learning_rate": 
0.0001856945913241595, "loss": 0.0833, "step": 8388 }, { "epoch": 0.5423191919191919, "grad_norm": 0.060408998280763626, "learning_rate": 0.00018569106645674056, "loss": 0.1008, "step": 8389 }, { "epoch": 0.5423838383838384, "grad_norm": 0.05245039612054825, "learning_rate": 0.00018568754118857282, "loss": 0.069, "step": 8390 }, { "epoch": 0.5424484848484848, "grad_norm": 0.06266285479068756, "learning_rate": 0.0001856840155196728, "loss": 0.0864, "step": 8391 }, { "epoch": 0.5425131313131313, "grad_norm": 0.053365856409072876, "learning_rate": 0.00018568048945005698, "loss": 0.0838, "step": 8392 }, { "epoch": 0.5425777777777778, "grad_norm": 0.051518719643354416, "learning_rate": 0.0001856769629797418, "loss": 0.0732, "step": 8393 }, { "epoch": 0.5426424242424243, "grad_norm": 0.05865391343832016, "learning_rate": 0.0001856734361087438, "loss": 0.0913, "step": 8394 }, { "epoch": 0.5427070707070707, "grad_norm": 0.049443308264017105, "learning_rate": 0.0001856699088370795, "loss": 0.0716, "step": 8395 }, { "epoch": 0.5427717171717171, "grad_norm": 0.06939326226711273, "learning_rate": 0.00018566638116476533, "loss": 0.1006, "step": 8396 }, { "epoch": 0.5428363636363637, "grad_norm": 0.059850648045539856, "learning_rate": 0.00018566285309181782, "loss": 0.0836, "step": 8397 }, { "epoch": 0.5429010101010101, "grad_norm": 0.055774323642253876, "learning_rate": 0.00018565932461825347, "loss": 0.0978, "step": 8398 }, { "epoch": 0.5429656565656565, "grad_norm": 0.05196011811494827, "learning_rate": 0.00018565579574408877, "loss": 0.0796, "step": 8399 }, { "epoch": 0.5430303030303031, "grad_norm": 0.05100923776626587, "learning_rate": 0.0001856522664693402, "loss": 0.0752, "step": 8400 }, { "epoch": 0.5430303030303031, "eval_bleu": 15.418245475050794, "eval_loss": 0.08952485024929047, "eval_runtime": 2.7692, "eval_samples_per_second": 11.556, "eval_steps_per_second": 1.444, "step": 8400 }, { "epoch": 0.5430949494949495, "grad_norm": 0.06348365545272827, "learning_rate": 
0.00018564873679402435, "loss": 0.101, "step": 8401 }, { "epoch": 0.543159595959596, "grad_norm": 0.056157637387514114, "learning_rate": 0.00018564520671815764, "loss": 0.0864, "step": 8402 }, { "epoch": 0.5432242424242424, "grad_norm": 0.0586116760969162, "learning_rate": 0.00018564167624175664, "loss": 0.0748, "step": 8403 }, { "epoch": 0.5432888888888889, "grad_norm": 0.07707057148218155, "learning_rate": 0.0001856381453648378, "loss": 0.0998, "step": 8404 }, { "epoch": 0.5433535353535354, "grad_norm": 0.05943547934293747, "learning_rate": 0.00018563461408741767, "loss": 0.0935, "step": 8405 }, { "epoch": 0.5434181818181818, "grad_norm": 0.061430055648088455, "learning_rate": 0.00018563108240951277, "loss": 0.0927, "step": 8406 }, { "epoch": 0.5434828282828282, "grad_norm": 0.05652159824967384, "learning_rate": 0.0001856275503311396, "loss": 0.0824, "step": 8407 }, { "epoch": 0.5435474747474748, "grad_norm": 0.056507568806409836, "learning_rate": 0.0001856240178523147, "loss": 0.0893, "step": 8408 }, { "epoch": 0.5436121212121212, "grad_norm": 0.05945686623454094, "learning_rate": 0.00018562048497305456, "loss": 0.0799, "step": 8409 }, { "epoch": 0.5436767676767676, "grad_norm": 0.05805601552128792, "learning_rate": 0.0001856169516933757, "loss": 0.0925, "step": 8410 }, { "epoch": 0.5437414141414142, "grad_norm": 0.05750316381454468, "learning_rate": 0.00018561341801329471, "loss": 0.0898, "step": 8411 }, { "epoch": 0.5438060606060606, "grad_norm": 0.05648347735404968, "learning_rate": 0.00018560988393282802, "loss": 0.0789, "step": 8412 }, { "epoch": 0.5438707070707071, "grad_norm": 0.05910155549645424, "learning_rate": 0.00018560634945199224, "loss": 0.0912, "step": 8413 }, { "epoch": 0.5439353535353535, "grad_norm": 0.061711475253105164, "learning_rate": 0.00018560281457080384, "loss": 0.0826, "step": 8414 }, { "epoch": 0.544, "grad_norm": 0.057071685791015625, "learning_rate": 0.0001855992792892794, "loss": 0.0855, "step": 8415 }, { "epoch": 
0.5440646464646465, "grad_norm": 0.059835780411958694, "learning_rate": 0.0001855957436074354, "loss": 0.0965, "step": 8416 }, { "epoch": 0.5440646464646465, "eval_bleu": 18.803531654934172, "eval_loss": 0.09108718484640121, "eval_runtime": 2.8583, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 8416 }, { "epoch": 0.5441292929292929, "grad_norm": 0.04928569868206978, "learning_rate": 0.00018559220752528842, "loss": 0.0686, "step": 8417 }, { "epoch": 0.5441939393939393, "grad_norm": 0.05229545384645462, "learning_rate": 0.000185588671042855, "loss": 0.0769, "step": 8418 }, { "epoch": 0.5442585858585859, "grad_norm": 0.06272634118795395, "learning_rate": 0.00018558513416015165, "loss": 0.0911, "step": 8419 }, { "epoch": 0.5443232323232323, "grad_norm": 0.05749104171991348, "learning_rate": 0.00018558159687719492, "loss": 0.0809, "step": 8420 }, { "epoch": 0.5443878787878788, "grad_norm": 0.050723958760499954, "learning_rate": 0.00018557805919400134, "loss": 0.0819, "step": 8421 }, { "epoch": 0.5444525252525253, "grad_norm": 0.058704011142253876, "learning_rate": 0.0001855745211105875, "loss": 0.091, "step": 8422 }, { "epoch": 0.5445171717171717, "grad_norm": 0.05834496393799782, "learning_rate": 0.00018557098262696986, "loss": 0.0949, "step": 8423 }, { "epoch": 0.5445818181818182, "grad_norm": 0.05419443920254707, "learning_rate": 0.00018556744374316505, "loss": 0.0696, "step": 8424 }, { "epoch": 0.5446464646464646, "grad_norm": 0.04613729193806648, "learning_rate": 0.00018556390445918962, "loss": 0.0681, "step": 8425 }, { "epoch": 0.5447111111111111, "grad_norm": 0.059159018099308014, "learning_rate": 0.00018556036477506007, "loss": 0.0881, "step": 8426 }, { "epoch": 0.5447757575757576, "grad_norm": 0.04964340850710869, "learning_rate": 0.00018555682469079297, "loss": 0.0698, "step": 8427 }, { "epoch": 0.544840404040404, "grad_norm": 0.051479123532772064, "learning_rate": 0.0001855532842064049, "loss": 0.0809, "step": 8428 }, { "epoch": 
0.5449050505050506, "grad_norm": 0.05433405563235283, "learning_rate": 0.00018554974332191243, "loss": 0.0808, "step": 8429 }, { "epoch": 0.544969696969697, "grad_norm": 0.06508401781320572, "learning_rate": 0.00018554620203733207, "loss": 0.0937, "step": 8430 }, { "epoch": 0.5450343434343434, "grad_norm": 0.061269670724868774, "learning_rate": 0.00018554266035268038, "loss": 0.0909, "step": 8431 }, { "epoch": 0.5450989898989899, "grad_norm": 0.06291522085666656, "learning_rate": 0.00018553911826797397, "loss": 0.1027, "step": 8432 }, { "epoch": 0.5450989898989899, "eval_bleu": 17.201422927010913, "eval_loss": 0.09039796888828278, "eval_runtime": 2.6734, "eval_samples_per_second": 11.97, "eval_steps_per_second": 1.496, "step": 8432 }, { "epoch": 0.5451636363636364, "grad_norm": 0.06002259999513626, "learning_rate": 0.00018553557578322938, "loss": 0.0919, "step": 8433 }, { "epoch": 0.5452282828282828, "grad_norm": 0.06078853830695152, "learning_rate": 0.00018553203289846317, "loss": 0.0866, "step": 8434 }, { "epoch": 0.5452929292929293, "grad_norm": 0.06078164279460907, "learning_rate": 0.00018552848961369192, "loss": 0.0969, "step": 8435 }, { "epoch": 0.5453575757575757, "grad_norm": 0.05452439934015274, "learning_rate": 0.00018552494592893222, "loss": 0.0837, "step": 8436 }, { "epoch": 0.5454222222222223, "grad_norm": 0.05712348222732544, "learning_rate": 0.00018552140184420062, "loss": 0.078, "step": 8437 }, { "epoch": 0.5454868686868687, "grad_norm": 0.05907926708459854, "learning_rate": 0.00018551785735951369, "loss": 0.0892, "step": 8438 }, { "epoch": 0.5455515151515151, "grad_norm": 0.05306452140212059, "learning_rate": 0.000185514312474888, "loss": 0.0793, "step": 8439 }, { "epoch": 0.5456161616161617, "grad_norm": 0.058474667370319366, "learning_rate": 0.00018551076719034012, "loss": 0.0925, "step": 8440 }, { "epoch": 0.5456808080808081, "grad_norm": 0.05322973057627678, "learning_rate": 0.0001855072215058867, "loss": 0.0848, "step": 8441 }, { "epoch": 
0.5457454545454545, "grad_norm": 0.058103013783693314, "learning_rate": 0.00018550367542154424, "loss": 0.0774, "step": 8442 }, { "epoch": 0.545810101010101, "grad_norm": 0.052553195506334305, "learning_rate": 0.0001855001289373294, "loss": 0.0795, "step": 8443 }, { "epoch": 0.5458747474747475, "grad_norm": 0.06051313132047653, "learning_rate": 0.00018549658205325867, "loss": 0.0862, "step": 8444 }, { "epoch": 0.545939393939394, "grad_norm": 0.05118947848677635, "learning_rate": 0.00018549303476934873, "loss": 0.0781, "step": 8445 }, { "epoch": 0.5460040404040404, "grad_norm": 0.05602129548788071, "learning_rate": 0.00018548948708561612, "loss": 0.0897, "step": 8446 }, { "epoch": 0.5460686868686868, "grad_norm": 0.06301146745681763, "learning_rate": 0.00018548593900207742, "loss": 0.1008, "step": 8447 }, { "epoch": 0.5461333333333334, "grad_norm": 0.053052809089422226, "learning_rate": 0.00018548239051874928, "loss": 0.0745, "step": 8448 }, { "epoch": 0.5461333333333334, "eval_bleu": 19.4792652922248, "eval_loss": 0.09056854248046875, "eval_runtime": 2.6044, "eval_samples_per_second": 12.287, "eval_steps_per_second": 1.536, "step": 8448 }, { "epoch": 0.5461979797979798, "grad_norm": 0.04736509174108505, "learning_rate": 0.00018547884163564823, "loss": 0.075, "step": 8449 }, { "epoch": 0.5462626262626262, "grad_norm": 0.05272328853607178, "learning_rate": 0.0001854752923527909, "loss": 0.0848, "step": 8450 }, { "epoch": 0.5463272727272728, "grad_norm": 0.055410150438547134, "learning_rate": 0.00018547174267019388, "loss": 0.0895, "step": 8451 }, { "epoch": 0.5463919191919192, "grad_norm": 0.0540524385869503, "learning_rate": 0.0001854681925878738, "loss": 0.0846, "step": 8452 }, { "epoch": 0.5464565656565656, "grad_norm": 0.06850180774927139, "learning_rate": 0.0001854646421058472, "loss": 0.1013, "step": 8453 }, { "epoch": 0.5465212121212121, "grad_norm": 0.06064102053642273, "learning_rate": 0.00018546109122413075, "loss": 0.0875, "step": 8454 }, { "epoch": 
0.5465858585858586, "grad_norm": 0.054534122347831726, "learning_rate": 0.00018545753994274105, "loss": 0.0674, "step": 8455 }, { "epoch": 0.546650505050505, "grad_norm": 0.05368673801422119, "learning_rate": 0.00018545398826169468, "loss": 0.0757, "step": 8456 }, { "epoch": 0.5467151515151515, "grad_norm": 0.05879918485879898, "learning_rate": 0.0001854504361810082, "loss": 0.0935, "step": 8457 }, { "epoch": 0.546779797979798, "grad_norm": 0.05784719064831734, "learning_rate": 0.00018544688370069834, "loss": 0.0829, "step": 8458 }, { "epoch": 0.5468444444444445, "grad_norm": 0.05725192651152611, "learning_rate": 0.00018544333082078164, "loss": 0.0917, "step": 8459 }, { "epoch": 0.5469090909090909, "grad_norm": 0.05721799284219742, "learning_rate": 0.00018543977754127472, "loss": 0.0904, "step": 8460 }, { "epoch": 0.5469737373737373, "grad_norm": 0.05654417723417282, "learning_rate": 0.0001854362238621942, "loss": 0.0807, "step": 8461 }, { "epoch": 0.5470383838383839, "grad_norm": 0.06806594133377075, "learning_rate": 0.0001854326697835567, "loss": 0.1095, "step": 8462 }, { "epoch": 0.5471030303030303, "grad_norm": 0.05376395583152771, "learning_rate": 0.00018542911530537888, "loss": 0.0807, "step": 8463 }, { "epoch": 0.5471676767676767, "grad_norm": 0.06918716430664062, "learning_rate": 0.00018542556042767732, "loss": 0.1076, "step": 8464 }, { "epoch": 0.5471676767676767, "eval_bleu": 19.862917269912906, "eval_loss": 0.09041158854961395, "eval_runtime": 2.6767, "eval_samples_per_second": 11.955, "eval_steps_per_second": 1.494, "step": 8464 }, { "epoch": 0.5472323232323232, "grad_norm": 0.0555545911192894, "learning_rate": 0.00018542200515046865, "loss": 0.0891, "step": 8465 }, { "epoch": 0.5472969696969697, "grad_norm": 0.057647813111543655, "learning_rate": 0.0001854184494737695, "loss": 0.0952, "step": 8466 }, { "epoch": 0.5473616161616162, "grad_norm": 0.058100875467061996, "learning_rate": 0.0001854148933975965, "loss": 0.0906, "step": 8467 }, { "epoch": 
0.5474262626262626, "grad_norm": 0.05828297883272171, "learning_rate": 0.00018541133692196628, "loss": 0.0978, "step": 8468 }, { "epoch": 0.5474909090909091, "grad_norm": 0.05317743495106697, "learning_rate": 0.00018540778004689544, "loss": 0.0851, "step": 8469 }, { "epoch": 0.5475555555555556, "grad_norm": 0.0499565452337265, "learning_rate": 0.0001854042227724007, "loss": 0.0686, "step": 8470 }, { "epoch": 0.547620202020202, "grad_norm": 0.06545493751764297, "learning_rate": 0.00018540066509849864, "loss": 0.0915, "step": 8471 }, { "epoch": 0.5476848484848484, "grad_norm": 0.0552094541490078, "learning_rate": 0.0001853971070252059, "loss": 0.0894, "step": 8472 }, { "epoch": 0.547749494949495, "grad_norm": 0.057058531790971756, "learning_rate": 0.0001853935485525391, "loss": 0.0961, "step": 8473 }, { "epoch": 0.5478141414141414, "grad_norm": 0.06517986208200455, "learning_rate": 0.00018538998968051492, "loss": 0.1021, "step": 8474 }, { "epoch": 0.5478787878787879, "grad_norm": 0.0476628914475441, "learning_rate": 0.00018538643040914997, "loss": 0.0799, "step": 8475 }, { "epoch": 0.5479434343434343, "grad_norm": 0.06356378644704819, "learning_rate": 0.00018538287073846093, "loss": 0.0869, "step": 8476 }, { "epoch": 0.5480080808080808, "grad_norm": 0.054594866931438446, "learning_rate": 0.00018537931066846443, "loss": 0.0876, "step": 8477 }, { "epoch": 0.5480727272727273, "grad_norm": 0.05724204331636429, "learning_rate": 0.0001853757501991771, "loss": 0.086, "step": 8478 }, { "epoch": 0.5481373737373737, "grad_norm": 0.05871690437197685, "learning_rate": 0.00018537218933061564, "loss": 0.1003, "step": 8479 }, { "epoch": 0.5482020202020202, "grad_norm": 0.051363203674554825, "learning_rate": 0.00018536862806279664, "loss": 0.082, "step": 8480 }, { "epoch": 0.5482020202020202, "eval_bleu": 17.71075630493864, "eval_loss": 0.0916891098022461, "eval_runtime": 2.642, "eval_samples_per_second": 12.112, "eval_steps_per_second": 1.514, "step": 8480 }, { "epoch": 
0.5482666666666667, "grad_norm": 0.05304015427827835, "learning_rate": 0.00018536506639573683, "loss": 0.0808, "step": 8481 }, { "epoch": 0.5483313131313131, "grad_norm": 0.0582304522395134, "learning_rate": 0.0001853615043294528, "loss": 0.0888, "step": 8482 }, { "epoch": 0.5483959595959595, "grad_norm": 0.05039060860872269, "learning_rate": 0.00018535794186396123, "loss": 0.0722, "step": 8483 }, { "epoch": 0.5484606060606061, "grad_norm": 0.05572601780295372, "learning_rate": 0.0001853543789992788, "loss": 0.0899, "step": 8484 }, { "epoch": 0.5485252525252525, "grad_norm": 0.059249766170978546, "learning_rate": 0.00018535081573542214, "loss": 0.0887, "step": 8485 }, { "epoch": 0.548589898989899, "grad_norm": 0.05697702243924141, "learning_rate": 0.00018534725207240796, "loss": 0.0962, "step": 8486 }, { "epoch": 0.5486545454545455, "grad_norm": 0.0526839904487133, "learning_rate": 0.00018534368801025289, "loss": 0.073, "step": 8487 }, { "epoch": 0.5487191919191919, "grad_norm": 0.055772487074136734, "learning_rate": 0.00018534012354897358, "loss": 0.0772, "step": 8488 }, { "epoch": 0.5487838383838384, "grad_norm": 0.05501142889261246, "learning_rate": 0.00018533655868858672, "loss": 0.0814, "step": 8489 }, { "epoch": 0.5488484848484848, "grad_norm": 0.07299815118312836, "learning_rate": 0.00018533299342910904, "loss": 0.1088, "step": 8490 }, { "epoch": 0.5489131313131314, "grad_norm": 0.05947336181998253, "learning_rate": 0.0001853294277705571, "loss": 0.0805, "step": 8491 }, { "epoch": 0.5489777777777778, "grad_norm": 0.045659035444259644, "learning_rate": 0.00018532586171294768, "loss": 0.0633, "step": 8492 }, { "epoch": 0.5490424242424242, "grad_norm": 0.05863765999674797, "learning_rate": 0.0001853222952562974, "loss": 0.0942, "step": 8493 }, { "epoch": 0.5491070707070707, "grad_norm": 0.061164382845163345, "learning_rate": 0.00018531872840062293, "loss": 0.076, "step": 8494 }, { "epoch": 0.5491717171717172, "grad_norm": 0.06618953496217728, "learning_rate": 
0.00018531516114594102, "loss": 0.0659, "step": 8495 }, { "epoch": 0.5492363636363636, "grad_norm": 0.060752082616090775, "learning_rate": 0.00018531159349226826, "loss": 0.0933, "step": 8496 }, { "epoch": 0.5492363636363636, "eval_bleu": 18.72556693079402, "eval_loss": 0.09209508448839188, "eval_runtime": 2.5912, "eval_samples_per_second": 12.349, "eval_steps_per_second": 1.544, "step": 8496 }, { "epoch": 0.5493010101010101, "grad_norm": 0.05093330144882202, "learning_rate": 0.0001853080254396214, "loss": 0.0748, "step": 8497 }, { "epoch": 0.5493656565656566, "grad_norm": 0.05763445049524307, "learning_rate": 0.00018530445698801708, "loss": 0.0803, "step": 8498 }, { "epoch": 0.549430303030303, "grad_norm": 0.05766602233052254, "learning_rate": 0.00018530088813747207, "loss": 0.0871, "step": 8499 }, { "epoch": 0.5494949494949495, "grad_norm": 0.06110754236578941, "learning_rate": 0.00018529731888800296, "loss": 0.0748, "step": 8500 }, { "epoch": 0.5495595959595959, "grad_norm": 0.06153168901801109, "learning_rate": 0.0001852937492396265, "loss": 0.0935, "step": 8501 }, { "epoch": 0.5496242424242425, "grad_norm": 0.05538514256477356, "learning_rate": 0.0001852901791923594, "loss": 0.0793, "step": 8502 }, { "epoch": 0.5496888888888889, "grad_norm": 0.06243006885051727, "learning_rate": 0.0001852866087462183, "loss": 0.09, "step": 8503 }, { "epoch": 0.5497535353535353, "grad_norm": 0.07592008262872696, "learning_rate": 0.00018528303790121996, "loss": 0.0998, "step": 8504 }, { "epoch": 0.5498181818181819, "grad_norm": 0.06026335805654526, "learning_rate": 0.00018527946665738098, "loss": 0.0937, "step": 8505 }, { "epoch": 0.5498828282828283, "grad_norm": 0.0585004985332489, "learning_rate": 0.00018527589501471817, "loss": 0.0958, "step": 8506 }, { "epoch": 0.5499474747474747, "grad_norm": 0.05122988298535347, "learning_rate": 0.0001852723229732482, "loss": 0.0704, "step": 8507 }, { "epoch": 0.5500121212121212, "grad_norm": 0.06212889030575752, "learning_rate": 
0.00018526875053298774, "loss": 0.0876, "step": 8508 }, { "epoch": 0.5500767676767677, "grad_norm": 0.05525459721684456, "learning_rate": 0.00018526517769395354, "loss": 0.0881, "step": 8509 }, { "epoch": 0.5501414141414142, "grad_norm": 0.06140845641493797, "learning_rate": 0.00018526160445616227, "loss": 0.0887, "step": 8510 }, { "epoch": 0.5502060606060606, "grad_norm": 0.05825958028435707, "learning_rate": 0.0001852580308196307, "loss": 0.0853, "step": 8511 }, { "epoch": 0.550270707070707, "grad_norm": 0.06356102228164673, "learning_rate": 0.00018525445678437547, "loss": 0.0923, "step": 8512 }, { "epoch": 0.550270707070707, "eval_bleu": 18.819003345094302, "eval_loss": 0.0906292125582695, "eval_runtime": 2.5861, "eval_samples_per_second": 12.374, "eval_steps_per_second": 1.547, "step": 8512 }, { "epoch": 0.5503353535353536, "grad_norm": 0.06484409421682358, "learning_rate": 0.00018525088235041336, "loss": 0.0936, "step": 8513 }, { "epoch": 0.5504, "grad_norm": 0.055615589022636414, "learning_rate": 0.00018524730751776102, "loss": 0.0893, "step": 8514 }, { "epoch": 0.5504646464646464, "grad_norm": 0.0693335309624672, "learning_rate": 0.00018524373228643524, "loss": 0.0911, "step": 8515 }, { "epoch": 0.550529292929293, "grad_norm": 0.05166839063167572, "learning_rate": 0.00018524015665645268, "loss": 0.0755, "step": 8516 }, { "epoch": 0.5505939393939394, "grad_norm": 0.060407672077417374, "learning_rate": 0.0001852365806278301, "loss": 0.0949, "step": 8517 }, { "epoch": 0.5506585858585858, "grad_norm": 0.047847721725702286, "learning_rate": 0.00018523300420058422, "loss": 0.0642, "step": 8518 }, { "epoch": 0.5507232323232323, "grad_norm": 0.07262896001338959, "learning_rate": 0.00018522942737473174, "loss": 0.0746, "step": 8519 }, { "epoch": 0.5507878787878788, "grad_norm": 0.05812994763255119, "learning_rate": 0.00018522585015028943, "loss": 0.0839, "step": 8520 }, { "epoch": 0.5508525252525253, "grad_norm": 0.05401806905865669, "learning_rate": 
0.00018522227252727396, "loss": 0.0928, "step": 8521 }, { "epoch": 0.5509171717171717, "grad_norm": 0.06370342522859573, "learning_rate": 0.00018521869450570212, "loss": 0.1003, "step": 8522 }, { "epoch": 0.5509818181818181, "grad_norm": 0.05556744337081909, "learning_rate": 0.00018521511608559062, "loss": 0.0903, "step": 8523 }, { "epoch": 0.5510464646464647, "grad_norm": 0.06560049206018448, "learning_rate": 0.00018521153726695617, "loss": 0.1034, "step": 8524 }, { "epoch": 0.5511111111111111, "grad_norm": 0.0594295859336853, "learning_rate": 0.00018520795804981554, "loss": 0.0887, "step": 8525 }, { "epoch": 0.5511757575757575, "grad_norm": 0.07043296843767166, "learning_rate": 0.00018520437843418548, "loss": 0.1144, "step": 8526 }, { "epoch": 0.5512404040404041, "grad_norm": 0.05553800240159035, "learning_rate": 0.00018520079842008265, "loss": 0.1009, "step": 8527 }, { "epoch": 0.5513050505050505, "grad_norm": 0.052831750363111496, "learning_rate": 0.0001851972180075239, "loss": 0.0804, "step": 8528 }, { "epoch": 0.5513050505050505, "eval_bleu": 16.29490593986113, "eval_loss": 0.08992880582809448, "eval_runtime": 2.6121, "eval_samples_per_second": 12.251, "eval_steps_per_second": 1.531, "step": 8528 }, { "epoch": 0.551369696969697, "grad_norm": 0.05288231000304222, "learning_rate": 0.00018519363719652593, "loss": 0.0887, "step": 8529 }, { "epoch": 0.5514343434343434, "grad_norm": 0.053602032363414764, "learning_rate": 0.0001851900559871055, "loss": 0.0743, "step": 8530 }, { "epoch": 0.5514989898989899, "grad_norm": 0.06755752861499786, "learning_rate": 0.00018518647437927932, "loss": 0.0877, "step": 8531 }, { "epoch": 0.5515636363636364, "grad_norm": 0.05730917677283287, "learning_rate": 0.00018518289237306416, "loss": 0.0828, "step": 8532 }, { "epoch": 0.5516282828282828, "grad_norm": 0.05736367031931877, "learning_rate": 0.00018517930996847677, "loss": 0.0831, "step": 8533 }, { "epoch": 0.5516929292929293, "grad_norm": 0.06024248152971268, "learning_rate": 
0.0001851757271655339, "loss": 0.0876, "step": 8534 }, { "epoch": 0.5517575757575758, "grad_norm": 0.0519428625702858, "learning_rate": 0.00018517214396425233, "loss": 0.0729, "step": 8535 }, { "epoch": 0.5518222222222222, "grad_norm": 0.050916656851768494, "learning_rate": 0.0001851685603646488, "loss": 0.0794, "step": 8536 }, { "epoch": 0.5518868686868686, "grad_norm": 0.05802636966109276, "learning_rate": 0.00018516497636674007, "loss": 0.069, "step": 8537 }, { "epoch": 0.5519515151515152, "grad_norm": 0.056541651487350464, "learning_rate": 0.00018516139197054287, "loss": 0.0845, "step": 8538 }, { "epoch": 0.5520161616161616, "grad_norm": 0.059428539127111435, "learning_rate": 0.000185157807176074, "loss": 0.0797, "step": 8539 }, { "epoch": 0.5520808080808081, "grad_norm": 0.06684110313653946, "learning_rate": 0.00018515422198335025, "loss": 0.1014, "step": 8540 }, { "epoch": 0.5521454545454545, "grad_norm": 0.050504740327596664, "learning_rate": 0.00018515063639238833, "loss": 0.0816, "step": 8541 }, { "epoch": 0.552210101010101, "grad_norm": 0.06396680325269699, "learning_rate": 0.00018514705040320503, "loss": 0.099, "step": 8542 }, { "epoch": 0.5522747474747475, "grad_norm": 0.06169065460562706, "learning_rate": 0.00018514346401581717, "loss": 0.106, "step": 8543 }, { "epoch": 0.5523393939393939, "grad_norm": 0.05583159625530243, "learning_rate": 0.00018513987723024143, "loss": 0.0733, "step": 8544 }, { "epoch": 0.5523393939393939, "eval_bleu": 21.655265571731622, "eval_loss": 0.08964072912931442, "eval_runtime": 2.584, "eval_samples_per_second": 12.384, "eval_steps_per_second": 1.548, "step": 8544 }, { "epoch": 0.5524040404040405, "grad_norm": 0.05512620136141777, "learning_rate": 0.00018513629004649463, "loss": 0.0841, "step": 8545 }, { "epoch": 0.5524686868686869, "grad_norm": 0.07798700779676437, "learning_rate": 0.0001851327024645936, "loss": 0.0941, "step": 8546 }, { "epoch": 0.5525333333333333, "grad_norm": 0.049690645188093185, "learning_rate": 
0.00018512911448455502, "loss": 0.0728, "step": 8547 }, { "epoch": 0.5525979797979798, "grad_norm": 0.05470888689160347, "learning_rate": 0.0001851255261063957, "loss": 0.086, "step": 8548 }, { "epoch": 0.5526626262626263, "grad_norm": 0.047312524169683456, "learning_rate": 0.00018512193733013246, "loss": 0.0839, "step": 8549 }, { "epoch": 0.5527272727272727, "grad_norm": 0.059361692517995834, "learning_rate": 0.00018511834815578208, "loss": 0.0987, "step": 8550 }, { "epoch": 0.5527919191919192, "grad_norm": 0.05968122556805611, "learning_rate": 0.0001851147585833613, "loss": 0.0793, "step": 8551 }, { "epoch": 0.5528565656565656, "grad_norm": 0.06668534874916077, "learning_rate": 0.00018511116861288693, "loss": 0.0835, "step": 8552 }, { "epoch": 0.5529212121212121, "grad_norm": 0.05345892161130905, "learning_rate": 0.00018510757824437578, "loss": 0.0911, "step": 8553 }, { "epoch": 0.5529858585858586, "grad_norm": 0.05707128718495369, "learning_rate": 0.00018510398747784458, "loss": 0.0992, "step": 8554 }, { "epoch": 0.553050505050505, "grad_norm": 0.06292226165533066, "learning_rate": 0.00018510039631331019, "loss": 0.085, "step": 8555 }, { "epoch": 0.5531151515151516, "grad_norm": 0.04947541281580925, "learning_rate": 0.0001850968047507894, "loss": 0.0805, "step": 8556 }, { "epoch": 0.553179797979798, "grad_norm": 0.04750622808933258, "learning_rate": 0.00018509321279029897, "loss": 0.0784, "step": 8557 }, { "epoch": 0.5532444444444444, "grad_norm": 0.056873951107263565, "learning_rate": 0.0001850896204318557, "loss": 0.0844, "step": 8558 }, { "epoch": 0.5533090909090909, "grad_norm": 0.04843200370669365, "learning_rate": 0.00018508602767547644, "loss": 0.0778, "step": 8559 }, { "epoch": 0.5533737373737374, "grad_norm": 0.05119411274790764, "learning_rate": 0.00018508243452117795, "loss": 0.0778, "step": 8560 }, { "epoch": 0.5533737373737374, "eval_bleu": 20.160218714771094, "eval_loss": 0.0889068990945816, "eval_runtime": 2.6477, "eval_samples_per_second": 
12.086, "eval_steps_per_second": 1.511, "step": 8560 }, { "epoch": 0.5534383838383838, "grad_norm": 0.05382227152585983, "learning_rate": 0.000185078840968977, "loss": 0.0873, "step": 8561 }, { "epoch": 0.5535030303030303, "grad_norm": 0.06481458991765976, "learning_rate": 0.00018507524701889046, "loss": 0.1056, "step": 8562 }, { "epoch": 0.5535676767676768, "grad_norm": 0.05905880779027939, "learning_rate": 0.00018507165267093515, "loss": 0.0871, "step": 8563 }, { "epoch": 0.5536323232323233, "grad_norm": 0.05508779361844063, "learning_rate": 0.0001850680579251278, "loss": 0.0795, "step": 8564 }, { "epoch": 0.5536969696969697, "grad_norm": 0.06330199539661407, "learning_rate": 0.0001850644627814853, "loss": 0.0924, "step": 8565 }, { "epoch": 0.5537616161616161, "grad_norm": 0.05159846693277359, "learning_rate": 0.00018506086724002442, "loss": 0.0796, "step": 8566 }, { "epoch": 0.5538262626262627, "grad_norm": 0.06244294345378876, "learning_rate": 0.000185057271300762, "loss": 0.0988, "step": 8567 }, { "epoch": 0.5538909090909091, "grad_norm": 0.05870566889643669, "learning_rate": 0.0001850536749637148, "loss": 0.0991, "step": 8568 }, { "epoch": 0.5539555555555555, "grad_norm": 0.06311102211475372, "learning_rate": 0.0001850500782288997, "loss": 0.1068, "step": 8569 }, { "epoch": 0.554020202020202, "grad_norm": 0.062173549085855484, "learning_rate": 0.0001850464810963335, "loss": 0.0836, "step": 8570 }, { "epoch": 0.5540848484848485, "grad_norm": 0.05363662913441658, "learning_rate": 0.000185042883566033, "loss": 0.0857, "step": 8571 }, { "epoch": 0.554149494949495, "grad_norm": 0.059720899909734726, "learning_rate": 0.00018503928563801507, "loss": 0.0893, "step": 8572 }, { "epoch": 0.5542141414141414, "grad_norm": 0.04359506443142891, "learning_rate": 0.00018503568731229652, "loss": 0.0619, "step": 8573 }, { "epoch": 0.5542787878787879, "grad_norm": 0.06345093995332718, "learning_rate": 0.00018503208858889418, "loss": 0.1106, "step": 8574 }, { "epoch": 
0.5543434343434344, "grad_norm": 0.06235799938440323, "learning_rate": 0.00018502848946782486, "loss": 0.0861, "step": 8575 }, { "epoch": 0.5544080808080808, "grad_norm": 0.048265837132930756, "learning_rate": 0.0001850248899491054, "loss": 0.0658, "step": 8576 }, { "epoch": 0.5544080808080808, "eval_bleu": 19.815393556771422, "eval_loss": 0.09043727070093155, "eval_runtime": 2.6, "eval_samples_per_second": 12.308, "eval_steps_per_second": 1.538, "step": 8576 }, { "epoch": 0.5544727272727272, "grad_norm": 0.05250389873981476, "learning_rate": 0.00018502129003275265, "loss": 0.0799, "step": 8577 }, { "epoch": 0.5545373737373738, "grad_norm": 0.045226044952869415, "learning_rate": 0.00018501768971878344, "loss": 0.0681, "step": 8578 }, { "epoch": 0.5546020202020202, "grad_norm": 0.08579523861408234, "learning_rate": 0.0001850140890072146, "loss": 0.0815, "step": 8579 }, { "epoch": 0.5546666666666666, "grad_norm": 0.07494629174470901, "learning_rate": 0.00018501048789806295, "loss": 0.0823, "step": 8580 }, { "epoch": 0.5547313131313132, "grad_norm": 0.05314254388213158, "learning_rate": 0.00018500688639134537, "loss": 0.0818, "step": 8581 }, { "epoch": 0.5547959595959596, "grad_norm": 0.05775831639766693, "learning_rate": 0.00018500328448707866, "loss": 0.0849, "step": 8582 }, { "epoch": 0.554860606060606, "grad_norm": 0.04820224642753601, "learning_rate": 0.0001849996821852797, "loss": 0.0761, "step": 8583 }, { "epoch": 0.5549252525252525, "grad_norm": 0.05934206023812294, "learning_rate": 0.00018499607948596535, "loss": 0.0863, "step": 8584 }, { "epoch": 0.554989898989899, "grad_norm": 0.05496034026145935, "learning_rate": 0.00018499247638915242, "loss": 0.0795, "step": 8585 }, { "epoch": 0.5550545454545455, "grad_norm": 0.048044268041849136, "learning_rate": 0.00018498887289485777, "loss": 0.0784, "step": 8586 }, { "epoch": 0.5551191919191919, "grad_norm": 0.06375626474618912, "learning_rate": 0.00018498526900309828, "loss": 0.0979, "step": 8587 }, { "epoch": 
0.5551838383838383, "grad_norm": 0.07100881636142731, "learning_rate": 0.00018498166471389075, "loss": 0.0958, "step": 8588 }, { "epoch": 0.5552484848484849, "grad_norm": 0.05465933680534363, "learning_rate": 0.0001849780600272521, "loss": 0.088, "step": 8589 }, { "epoch": 0.5553131313131313, "grad_norm": 0.057275112718343735, "learning_rate": 0.00018497445494319913, "loss": 0.1018, "step": 8590 }, { "epoch": 0.5553777777777777, "grad_norm": 0.05681046098470688, "learning_rate": 0.00018497084946174876, "loss": 0.0823, "step": 8591 }, { "epoch": 0.5554424242424243, "grad_norm": 0.050680115818977356, "learning_rate": 0.0001849672435829178, "loss": 0.0781, "step": 8592 }, { "epoch": 0.5554424242424243, "eval_bleu": 21.23693031039074, "eval_loss": 0.08895865827798843, "eval_runtime": 2.6407, "eval_samples_per_second": 12.118, "eval_steps_per_second": 1.515, "step": 8592 }, { "epoch": 0.5555070707070707, "grad_norm": 0.05246419832110405, "learning_rate": 0.0001849636373067231, "loss": 0.0816, "step": 8593 }, { "epoch": 0.5555717171717172, "grad_norm": 0.05425532907247543, "learning_rate": 0.0001849600306331816, "loss": 0.0848, "step": 8594 }, { "epoch": 0.5556363636363636, "grad_norm": 0.055495746433734894, "learning_rate": 0.0001849564235623101, "loss": 0.0756, "step": 8595 }, { "epoch": 0.5557010101010101, "grad_norm": 0.058443181216716766, "learning_rate": 0.00018495281609412552, "loss": 0.094, "step": 8596 }, { "epoch": 0.5557656565656566, "grad_norm": 0.06340799480676651, "learning_rate": 0.00018494920822864468, "loss": 0.0962, "step": 8597 }, { "epoch": 0.555830303030303, "grad_norm": 0.049928274005651474, "learning_rate": 0.0001849455999658845, "loss": 0.0812, "step": 8598 }, { "epoch": 0.5558949494949494, "grad_norm": 0.046841755509376526, "learning_rate": 0.0001849419913058618, "loss": 0.0756, "step": 8599 }, { "epoch": 0.555959595959596, "grad_norm": 0.04867812991142273, "learning_rate": 0.0001849383822485935, "loss": 0.0746, "step": 8600 }, { "epoch": 
0.5560242424242424, "grad_norm": 0.050249043852090836, "learning_rate": 0.00018493477279409644, "loss": 0.0764, "step": 8601 }, { "epoch": 0.5560888888888889, "grad_norm": 0.05424565076828003, "learning_rate": 0.0001849311629423876, "loss": 0.0865, "step": 8602 }, { "epoch": 0.5561535353535354, "grad_norm": 0.061980508267879486, "learning_rate": 0.0001849275526934837, "loss": 0.0896, "step": 8603 }, { "epoch": 0.5562181818181818, "grad_norm": 0.05581079050898552, "learning_rate": 0.0001849239420474018, "loss": 0.0894, "step": 8604 }, { "epoch": 0.5562828282828283, "grad_norm": 0.056814707815647125, "learning_rate": 0.00018492033100415864, "loss": 0.0821, "step": 8605 }, { "epoch": 0.5563474747474747, "grad_norm": 0.05500706657767296, "learning_rate": 0.00018491671956377117, "loss": 0.0751, "step": 8606 }, { "epoch": 0.5564121212121212, "grad_norm": 0.04816592112183571, "learning_rate": 0.00018491310772625628, "loss": 0.0668, "step": 8607 }, { "epoch": 0.5564767676767677, "grad_norm": 0.061996906995773315, "learning_rate": 0.00018490949549163088, "loss": 0.0929, "step": 8608 }, { "epoch": 0.5564767676767677, "eval_bleu": 20.364204070583742, "eval_loss": 0.08867713063955307, "eval_runtime": 2.6068, "eval_samples_per_second": 12.276, "eval_steps_per_second": 1.534, "step": 8608 }, { "epoch": 0.5565414141414141, "grad_norm": 0.06262017786502838, "learning_rate": 0.00018490588285991182, "loss": 0.09, "step": 8609 }, { "epoch": 0.5566060606060607, "grad_norm": 0.06076890975236893, "learning_rate": 0.00018490226983111603, "loss": 0.0907, "step": 8610 }, { "epoch": 0.5566707070707071, "grad_norm": 0.05991611257195473, "learning_rate": 0.00018489865640526035, "loss": 0.0949, "step": 8611 }, { "epoch": 0.5567353535353535, "grad_norm": 0.06228615343570709, "learning_rate": 0.00018489504258236176, "loss": 0.0896, "step": 8612 }, { "epoch": 0.5568, "grad_norm": 0.0495808869600296, "learning_rate": 0.00018489142836243712, "loss": 0.073, "step": 8613 }, { "epoch": 
0.5568646464646465, "grad_norm": 0.0699712336063385, "learning_rate": 0.00018488781374550333, "loss": 0.1054, "step": 8614 }, { "epoch": 0.5569292929292929, "grad_norm": 0.05624164268374443, "learning_rate": 0.0001848841987315773, "loss": 0.0825, "step": 8615 }, { "epoch": 0.5569939393939394, "grad_norm": 0.054450344294309616, "learning_rate": 0.00018488058332067592, "loss": 0.0759, "step": 8616 }, { "epoch": 0.5570585858585858, "grad_norm": 0.05778719484806061, "learning_rate": 0.00018487696751281614, "loss": 0.0828, "step": 8617 }, { "epoch": 0.5571232323232324, "grad_norm": 0.051073141396045685, "learning_rate": 0.0001848733513080148, "loss": 0.0733, "step": 8618 }, { "epoch": 0.5571878787878788, "grad_norm": 0.05708806961774826, "learning_rate": 0.00018486973470628888, "loss": 0.088, "step": 8619 }, { "epoch": 0.5572525252525252, "grad_norm": 0.07828760147094727, "learning_rate": 0.00018486611770765528, "loss": 0.1213, "step": 8620 }, { "epoch": 0.5573171717171718, "grad_norm": 0.05615083500742912, "learning_rate": 0.00018486250031213088, "loss": 0.0823, "step": 8621 }, { "epoch": 0.5573818181818182, "grad_norm": 0.05507369339466095, "learning_rate": 0.00018485888251973265, "loss": 0.0784, "step": 8622 }, { "epoch": 0.5574464646464646, "grad_norm": 0.05637180432677269, "learning_rate": 0.00018485526433047747, "loss": 0.0777, "step": 8623 }, { "epoch": 0.5575111111111111, "grad_norm": 0.09263873100280762, "learning_rate": 0.00018485164574438226, "loss": 0.0865, "step": 8624 }, { "epoch": 0.5575111111111111, "eval_bleu": 20.930269124712716, "eval_loss": 0.08981513977050781, "eval_runtime": 2.7905, "eval_samples_per_second": 11.468, "eval_steps_per_second": 1.433, "step": 8624 }, { "epoch": 0.5575757575757576, "grad_norm": 0.05894949659705162, "learning_rate": 0.00018484802676146394, "loss": 0.0959, "step": 8625 }, { "epoch": 0.557640404040404, "grad_norm": 0.04132280871272087, "learning_rate": 0.00018484440738173946, "loss": 0.0567, "step": 8626 }, { "epoch": 
0.5577050505050505, "grad_norm": 0.053076520562171936, "learning_rate": 0.00018484078760522577, "loss": 0.0886, "step": 8627 }, { "epoch": 0.5577696969696969, "grad_norm": 0.05915549769997597, "learning_rate": 0.0001848371674319397, "loss": 0.1028, "step": 8628 }, { "epoch": 0.5578343434343435, "grad_norm": 0.0636865645647049, "learning_rate": 0.00018483354686189832, "loss": 0.1021, "step": 8629 }, { "epoch": 0.5578989898989899, "grad_norm": 0.05503813177347183, "learning_rate": 0.00018482992589511846, "loss": 0.0829, "step": 8630 }, { "epoch": 0.5579636363636363, "grad_norm": 0.05101288482546806, "learning_rate": 0.00018482630453161704, "loss": 0.0675, "step": 8631 }, { "epoch": 0.5580282828282829, "grad_norm": 0.05242958664894104, "learning_rate": 0.00018482268277141106, "loss": 0.0859, "step": 8632 }, { "epoch": 0.5580929292929293, "grad_norm": 0.05309920385479927, "learning_rate": 0.00018481906061451747, "loss": 0.0766, "step": 8633 }, { "epoch": 0.5581575757575757, "grad_norm": 0.05535418540239334, "learning_rate": 0.0001848154380609531, "loss": 0.0853, "step": 8634 }, { "epoch": 0.5582222222222222, "grad_norm": 0.061480652540922165, "learning_rate": 0.00018481181511073504, "loss": 0.0905, "step": 8635 }, { "epoch": 0.5582868686868687, "grad_norm": 0.05495958775281906, "learning_rate": 0.00018480819176388012, "loss": 0.0834, "step": 8636 }, { "epoch": 0.5583515151515152, "grad_norm": 0.056370992213487625, "learning_rate": 0.00018480456802040534, "loss": 0.0867, "step": 8637 }, { "epoch": 0.5584161616161616, "grad_norm": 0.05902678519487381, "learning_rate": 0.00018480094388032762, "loss": 0.0862, "step": 8638 }, { "epoch": 0.5584808080808081, "grad_norm": 0.050766721367836, "learning_rate": 0.00018479731934366393, "loss": 0.08, "step": 8639 }, { "epoch": 0.5585454545454546, "grad_norm": 0.06851779669523239, "learning_rate": 0.0001847936944104312, "loss": 0.08, "step": 8640 }, { "epoch": 0.5585454545454546, "eval_bleu": 18.491793231418658, "eval_loss": 
0.09064921736717224, "eval_runtime": 2.6778, "eval_samples_per_second": 11.95, "eval_steps_per_second": 1.494, "step": 8640 }, { "epoch": 0.558610101010101, "grad_norm": 0.05347486212849617, "learning_rate": 0.00018479006908064642, "loss": 0.0826, "step": 8641 }, { "epoch": 0.5586747474747474, "grad_norm": 0.05523145571351051, "learning_rate": 0.0001847864433543265, "loss": 0.0812, "step": 8642 }, { "epoch": 0.558739393939394, "grad_norm": 0.06093825399875641, "learning_rate": 0.0001847828172314884, "loss": 0.0905, "step": 8643 }, { "epoch": 0.5588040404040404, "grad_norm": 0.056226614862680435, "learning_rate": 0.00018477919071214908, "loss": 0.0794, "step": 8644 }, { "epoch": 0.5588686868686868, "grad_norm": 0.05645235627889633, "learning_rate": 0.00018477556379632555, "loss": 0.0912, "step": 8645 }, { "epoch": 0.5589333333333333, "grad_norm": 0.058711033314466476, "learning_rate": 0.0001847719364840347, "loss": 0.084, "step": 8646 }, { "epoch": 0.5589979797979798, "grad_norm": 0.060634057968854904, "learning_rate": 0.00018476830877529356, "loss": 0.0998, "step": 8647 }, { "epoch": 0.5590626262626263, "grad_norm": 0.05598815903067589, "learning_rate": 0.00018476468067011906, "loss": 0.0839, "step": 8648 }, { "epoch": 0.5591272727272727, "grad_norm": 0.06504741311073303, "learning_rate": 0.00018476105216852818, "loss": 0.1068, "step": 8649 }, { "epoch": 0.5591919191919192, "grad_norm": 0.057945266366004944, "learning_rate": 0.00018475742327053786, "loss": 0.0926, "step": 8650 }, { "epoch": 0.5592565656565657, "grad_norm": 0.05511823669075966, "learning_rate": 0.0001847537939761651, "loss": 0.0791, "step": 8651 }, { "epoch": 0.5593212121212121, "grad_norm": 0.046245869249105453, "learning_rate": 0.00018475016428542686, "loss": 0.0711, "step": 8652 }, { "epoch": 0.5593858585858585, "grad_norm": 0.04578534886240959, "learning_rate": 0.00018474653419834011, "loss": 0.0663, "step": 8653 }, { "epoch": 0.5594505050505051, "grad_norm": 0.04928568750619888, 
"learning_rate": 0.00018474290371492185, "loss": 0.0778, "step": 8654 }, { "epoch": 0.5595151515151515, "grad_norm": 0.06341521441936493, "learning_rate": 0.00018473927283518905, "loss": 0.0849, "step": 8655 }, { "epoch": 0.559579797979798, "grad_norm": 0.05002934858202934, "learning_rate": 0.0001847356415591587, "loss": 0.08, "step": 8656 }, { "epoch": 0.559579797979798, "eval_bleu": 19.209867542876662, "eval_loss": 0.09008830785751343, "eval_runtime": 2.6363, "eval_samples_per_second": 12.138, "eval_steps_per_second": 1.517, "step": 8656 }, { "epoch": 0.5596444444444445, "grad_norm": 0.05382854864001274, "learning_rate": 0.00018473200988684772, "loss": 0.0749, "step": 8657 }, { "epoch": 0.5597090909090909, "grad_norm": 0.05017104744911194, "learning_rate": 0.00018472837781827317, "loss": 0.0763, "step": 8658 }, { "epoch": 0.5597737373737374, "grad_norm": 0.05167162045836449, "learning_rate": 0.00018472474535345202, "loss": 0.0816, "step": 8659 }, { "epoch": 0.5598383838383838, "grad_norm": 0.055129118263721466, "learning_rate": 0.0001847211124924012, "loss": 0.0772, "step": 8660 }, { "epoch": 0.5599030303030303, "grad_norm": 0.05940185487270355, "learning_rate": 0.00018471747923513778, "loss": 0.091, "step": 8661 }, { "epoch": 0.5599676767676768, "grad_norm": 0.05801228806376457, "learning_rate": 0.0001847138455816787, "loss": 0.0757, "step": 8662 }, { "epoch": 0.5600323232323232, "grad_norm": 0.04993444308638573, "learning_rate": 0.00018471021153204097, "loss": 0.0792, "step": 8663 }, { "epoch": 0.5600969696969696, "grad_norm": 0.05846647173166275, "learning_rate": 0.00018470657708624162, "loss": 0.088, "step": 8664 }, { "epoch": 0.5601616161616162, "grad_norm": 0.057954683899879456, "learning_rate": 0.00018470294224429758, "loss": 0.0926, "step": 8665 }, { "epoch": 0.5602262626262626, "grad_norm": 0.05970615893602371, "learning_rate": 0.0001846993070062259, "loss": 0.0941, "step": 8666 }, { "epoch": 0.5602909090909091, "grad_norm": 0.05692056193947792, 
"learning_rate": 0.00018469567137204354, "loss": 0.0895, "step": 8667 }, { "epoch": 0.5603555555555556, "grad_norm": 0.06134670600295067, "learning_rate": 0.00018469203534176754, "loss": 0.0945, "step": 8668 }, { "epoch": 0.560420202020202, "grad_norm": 0.051682695746421814, "learning_rate": 0.0001846883989154149, "loss": 0.0726, "step": 8669 }, { "epoch": 0.5604848484848485, "grad_norm": 0.06091154366731644, "learning_rate": 0.00018468476209300262, "loss": 0.0974, "step": 8670 }, { "epoch": 0.5605494949494949, "grad_norm": 0.057035479694604874, "learning_rate": 0.00018468112487454765, "loss": 0.0766, "step": 8671 }, { "epoch": 0.5606141414141415, "grad_norm": 0.05756233632564545, "learning_rate": 0.00018467748726006707, "loss": 0.0934, "step": 8672 }, { "epoch": 0.5606141414141415, "eval_bleu": 19.577338378866678, "eval_loss": 0.09088188409805298, "eval_runtime": 2.67, "eval_samples_per_second": 11.985, "eval_steps_per_second": 1.498, "step": 8672 }, { "epoch": 0.5606787878787879, "grad_norm": 0.05211443454027176, "learning_rate": 0.0001846738492495779, "loss": 0.0799, "step": 8673 }, { "epoch": 0.5607434343434343, "grad_norm": 0.04776475206017494, "learning_rate": 0.00018467021084309714, "loss": 0.0695, "step": 8674 }, { "epoch": 0.5608080808080808, "grad_norm": 0.05181039497256279, "learning_rate": 0.0001846665720406418, "loss": 0.0854, "step": 8675 }, { "epoch": 0.5608727272727273, "grad_norm": 0.05552777647972107, "learning_rate": 0.00018466293284222885, "loss": 0.0813, "step": 8676 }, { "epoch": 0.5609373737373737, "grad_norm": 0.05747552588582039, "learning_rate": 0.00018465929324787539, "loss": 0.0858, "step": 8677 }, { "epoch": 0.5610020202020202, "grad_norm": 0.048651546239852905, "learning_rate": 0.0001846556532575984, "loss": 0.0686, "step": 8678 }, { "epoch": 0.5610666666666667, "grad_norm": 0.05060027167201042, "learning_rate": 0.00018465201287141486, "loss": 0.0768, "step": 8679 }, { "epoch": 0.5611313131313131, "grad_norm": 0.050368521362543106, 
"learning_rate": 0.00018464837208934188, "loss": 0.0744, "step": 8680 }, { "epoch": 0.5611959595959596, "grad_norm": 0.052513234317302704, "learning_rate": 0.00018464473091139646, "loss": 0.0793, "step": 8681 }, { "epoch": 0.561260606060606, "grad_norm": 0.052003130316734314, "learning_rate": 0.0001846410893375956, "loss": 0.0886, "step": 8682 }, { "epoch": 0.5613252525252526, "grad_norm": 0.05825111269950867, "learning_rate": 0.00018463744736795637, "loss": 0.0921, "step": 8683 }, { "epoch": 0.561389898989899, "grad_norm": 0.06389695405960083, "learning_rate": 0.00018463380500249575, "loss": 0.0773, "step": 8684 }, { "epoch": 0.5614545454545454, "grad_norm": 0.058512866497039795, "learning_rate": 0.00018463016224123078, "loss": 0.0867, "step": 8685 }, { "epoch": 0.561519191919192, "grad_norm": 0.05513356998562813, "learning_rate": 0.00018462651908417857, "loss": 0.0839, "step": 8686 }, { "epoch": 0.5615838383838384, "grad_norm": 0.05152648687362671, "learning_rate": 0.00018462287553135606, "loss": 0.076, "step": 8687 }, { "epoch": 0.5616484848484848, "grad_norm": 0.05915629863739014, "learning_rate": 0.0001846192315827804, "loss": 0.0745, "step": 8688 }, { "epoch": 0.5616484848484848, "eval_bleu": 19.031091554712777, "eval_loss": 0.08984971791505814, "eval_runtime": 2.6498, "eval_samples_per_second": 12.076, "eval_steps_per_second": 1.51, "step": 8688 }, { "epoch": 0.5617131313131313, "grad_norm": 0.0590272881090641, "learning_rate": 0.0001846155872384685, "loss": 0.1073, "step": 8689 }, { "epoch": 0.5617777777777778, "grad_norm": 0.060834113508462906, "learning_rate": 0.00018461194249843753, "loss": 0.1072, "step": 8690 }, { "epoch": 0.5618424242424243, "grad_norm": 0.05375421419739723, "learning_rate": 0.00018460829736270445, "loss": 0.0789, "step": 8691 }, { "epoch": 0.5619070707070707, "grad_norm": 0.053263746201992035, "learning_rate": 0.00018460465183128632, "loss": 0.0829, "step": 8692 }, { "epoch": 0.5619717171717171, "grad_norm": 0.06226739659905434, 
"learning_rate": 0.00018460100590420023, "loss": 0.0846, "step": 8693 }, { "epoch": 0.5620363636363637, "grad_norm": 0.04798587039113045, "learning_rate": 0.00018459735958146318, "loss": 0.0664, "step": 8694 }, { "epoch": 0.5621010101010101, "grad_norm": 0.05608699470758438, "learning_rate": 0.00018459371286309227, "loss": 0.0812, "step": 8695 }, { "epoch": 0.5621656565656565, "grad_norm": 0.053713783621788025, "learning_rate": 0.0001845900657491045, "loss": 0.0815, "step": 8696 }, { "epoch": 0.5622303030303031, "grad_norm": 0.049066029489040375, "learning_rate": 0.00018458641823951698, "loss": 0.0724, "step": 8697 }, { "epoch": 0.5622949494949495, "grad_norm": 0.0753750205039978, "learning_rate": 0.00018458277033434677, "loss": 0.0971, "step": 8698 }, { "epoch": 0.562359595959596, "grad_norm": 0.0569225512444973, "learning_rate": 0.00018457912203361087, "loss": 0.0742, "step": 8699 }, { "epoch": 0.5624242424242424, "grad_norm": 0.05971173197031021, "learning_rate": 0.0001845754733373264, "loss": 0.0903, "step": 8700 }, { "epoch": 0.5624888888888889, "grad_norm": 0.05029066652059555, "learning_rate": 0.00018457182424551038, "loss": 0.0799, "step": 8701 }, { "epoch": 0.5625535353535354, "grad_norm": 0.061541538685560226, "learning_rate": 0.0001845681747581799, "loss": 0.0872, "step": 8702 }, { "epoch": 0.5626181818181818, "grad_norm": 0.05283352732658386, "learning_rate": 0.00018456452487535202, "loss": 0.0855, "step": 8703 }, { "epoch": 0.5626828282828282, "grad_norm": 0.05384129658341408, "learning_rate": 0.00018456087459704383, "loss": 0.0831, "step": 8704 }, { "epoch": 0.5626828282828282, "eval_bleu": 18.301437885840944, "eval_loss": 0.08981127291917801, "eval_runtime": 2.678, "eval_samples_per_second": 11.949, "eval_steps_per_second": 1.494, "step": 8704 }, { "epoch": 0.5627474747474748, "grad_norm": 0.05215170979499817, "learning_rate": 0.00018455722392327238, "loss": 0.0805, "step": 8705 }, { "epoch": 0.5628121212121212, "grad_norm": 0.06686409562826157, 
"learning_rate": 0.00018455357285405475, "loss": 0.0854, "step": 8706 }, { "epoch": 0.5628767676767676, "grad_norm": 0.05513312667608261, "learning_rate": 0.00018454992138940802, "loss": 0.0872, "step": 8707 }, { "epoch": 0.5629414141414142, "grad_norm": 0.05623937398195267, "learning_rate": 0.0001845462695293492, "loss": 0.0877, "step": 8708 }, { "epoch": 0.5630060606060606, "grad_norm": 0.05583951994776726, "learning_rate": 0.00018454261727389551, "loss": 0.0899, "step": 8709 }, { "epoch": 0.563070707070707, "grad_norm": 0.06051473692059517, "learning_rate": 0.0001845389646230639, "loss": 0.0909, "step": 8710 }, { "epoch": 0.5631353535353535, "grad_norm": 0.059424370527267456, "learning_rate": 0.00018453531157687153, "loss": 0.0898, "step": 8711 }, { "epoch": 0.5632, "grad_norm": 0.05124353617429733, "learning_rate": 0.00018453165813533544, "loss": 0.0783, "step": 8712 }, { "epoch": 0.5632646464646465, "grad_norm": 0.047427017241716385, "learning_rate": 0.00018452800429847273, "loss": 0.0751, "step": 8713 }, { "epoch": 0.5633292929292929, "grad_norm": 0.07107948511838913, "learning_rate": 0.00018452435006630048, "loss": 0.1003, "step": 8714 }, { "epoch": 0.5633939393939394, "grad_norm": 0.05216708406805992, "learning_rate": 0.00018452069543883578, "loss": 0.0707, "step": 8715 }, { "epoch": 0.5634585858585859, "grad_norm": 0.07156307995319366, "learning_rate": 0.00018451704041609577, "loss": 0.0825, "step": 8716 }, { "epoch": 0.5635232323232323, "grad_norm": 0.055742498487234116, "learning_rate": 0.00018451338499809748, "loss": 0.0863, "step": 8717 }, { "epoch": 0.5635878787878787, "grad_norm": 0.04500174522399902, "learning_rate": 0.000184509729184858, "loss": 0.0633, "step": 8718 }, { "epoch": 0.5636525252525253, "grad_norm": 0.05029028654098511, "learning_rate": 0.00018450607297639448, "loss": 0.0783, "step": 8719 }, { "epoch": 0.5637171717171717, "grad_norm": 0.06268620491027832, "learning_rate": 0.000184502416372724, "loss": 0.0877, "step": 8720 }, { "epoch": 
0.5637171717171717, "eval_bleu": 19.49659338113741, "eval_loss": 0.08985411375761032, "eval_runtime": 2.7122, "eval_samples_per_second": 11.799, "eval_steps_per_second": 1.475, "step": 8720 }, { "epoch": 0.5637818181818182, "grad_norm": 0.05000540614128113, "learning_rate": 0.00018449875937386366, "loss": 0.0778, "step": 8721 }, { "epoch": 0.5638464646464646, "grad_norm": 0.062020741403102875, "learning_rate": 0.00018449510197983052, "loss": 0.0986, "step": 8722 }, { "epoch": 0.5639111111111111, "grad_norm": 0.05697214975953102, "learning_rate": 0.00018449144419064174, "loss": 0.0973, "step": 8723 }, { "epoch": 0.5639757575757576, "grad_norm": 0.05635448917746544, "learning_rate": 0.00018448778600631442, "loss": 0.0818, "step": 8724 }, { "epoch": 0.564040404040404, "grad_norm": 0.06384506076574326, "learning_rate": 0.00018448412742686564, "loss": 0.0891, "step": 8725 }, { "epoch": 0.5641050505050506, "grad_norm": 0.050592489540576935, "learning_rate": 0.00018448046845231258, "loss": 0.0835, "step": 8726 }, { "epoch": 0.564169696969697, "grad_norm": 0.04895809665322304, "learning_rate": 0.00018447680908267224, "loss": 0.0776, "step": 8727 }, { "epoch": 0.5642343434343434, "grad_norm": 0.052662793546915054, "learning_rate": 0.0001844731493179618, "loss": 0.0812, "step": 8728 }, { "epoch": 0.5642989898989899, "grad_norm": 0.04361062869429588, "learning_rate": 0.00018446948915819836, "loss": 0.0605, "step": 8729 }, { "epoch": 0.5643636363636364, "grad_norm": 0.048750124871730804, "learning_rate": 0.00018446582860339906, "loss": 0.0795, "step": 8730 }, { "epoch": 0.5644282828282828, "grad_norm": 0.06389498710632324, "learning_rate": 0.00018446216765358102, "loss": 0.0827, "step": 8731 }, { "epoch": 0.5644929292929293, "grad_norm": 0.052995022386312485, "learning_rate": 0.00018445850630876133, "loss": 0.0864, "step": 8732 }, { "epoch": 0.5645575757575757, "grad_norm": 0.05226517096161842, "learning_rate": 0.0001844548445689571, "loss": 0.0832, "step": 8733 }, { "epoch": 
0.5646222222222222, "grad_norm": 0.05520207807421684, "learning_rate": 0.00018445118243418552, "loss": 0.0882, "step": 8734 }, { "epoch": 0.5646868686868687, "grad_norm": 0.058045074343681335, "learning_rate": 0.00018444751990446368, "loss": 0.0945, "step": 8735 }, { "epoch": 0.5647515151515151, "grad_norm": 0.058213505893945694, "learning_rate": 0.00018444385697980869, "loss": 0.086, "step": 8736 }, { "epoch": 0.5647515151515151, "eval_bleu": 16.226517276878063, "eval_loss": 0.0894380509853363, "eval_runtime": 2.593, "eval_samples_per_second": 12.341, "eval_steps_per_second": 1.543, "step": 8736 }, { "epoch": 0.5648161616161617, "grad_norm": 0.04529042914509773, "learning_rate": 0.0001844401936602377, "loss": 0.0742, "step": 8737 }, { "epoch": 0.5648808080808081, "grad_norm": 0.06049531325697899, "learning_rate": 0.0001844365299457678, "loss": 0.0922, "step": 8738 }, { "epoch": 0.5649454545454545, "grad_norm": 0.0630745217204094, "learning_rate": 0.00018443286583641623, "loss": 0.0935, "step": 8739 }, { "epoch": 0.565010101010101, "grad_norm": 0.061176806688308716, "learning_rate": 0.00018442920133220002, "loss": 0.1031, "step": 8740 }, { "epoch": 0.5650747474747475, "grad_norm": 0.055167678743600845, "learning_rate": 0.00018442553643313638, "loss": 0.0801, "step": 8741 }, { "epoch": 0.5651393939393939, "grad_norm": 0.06291116774082184, "learning_rate": 0.00018442187113924238, "loss": 0.0938, "step": 8742 }, { "epoch": 0.5652040404040404, "grad_norm": 0.05463993921875954, "learning_rate": 0.00018441820545053524, "loss": 0.0744, "step": 8743 }, { "epoch": 0.5652686868686869, "grad_norm": 0.05547681078314781, "learning_rate": 0.000184414539367032, "loss": 0.0922, "step": 8744 }, { "epoch": 0.5653333333333334, "grad_norm": 0.05564368516206741, "learning_rate": 0.00018441087288874992, "loss": 0.0869, "step": 8745 }, { "epoch": 0.5653979797979798, "grad_norm": 0.05639887973666191, "learning_rate": 0.00018440720601570605, "loss": 0.0805, "step": 8746 }, { "epoch": 
0.5654626262626262, "grad_norm": 0.05873168632388115, "learning_rate": 0.00018440353874791762, "loss": 0.0884, "step": 8747 }, { "epoch": 0.5655272727272728, "grad_norm": 0.060898557305336, "learning_rate": 0.00018439987108540172, "loss": 0.0864, "step": 8748 }, { "epoch": 0.5655919191919192, "grad_norm": 0.06412575393915176, "learning_rate": 0.00018439620302817552, "loss": 0.1025, "step": 8749 }, { "epoch": 0.5656565656565656, "grad_norm": 0.07912924885749817, "learning_rate": 0.00018439253457625622, "loss": 0.1012, "step": 8750 }, { "epoch": 0.5657212121212121, "grad_norm": 0.051931772381067276, "learning_rate": 0.0001843888657296609, "loss": 0.0854, "step": 8751 }, { "epoch": 0.5657858585858586, "grad_norm": 0.05735223740339279, "learning_rate": 0.00018438519648840678, "loss": 0.0808, "step": 8752 }, { "epoch": 0.5657858585858586, "eval_bleu": 17.99522868975359, "eval_loss": 0.08936098963022232, "eval_runtime": 2.7015, "eval_samples_per_second": 11.845, "eval_steps_per_second": 1.481, "step": 8752 }, { "epoch": 0.565850505050505, "grad_norm": 0.05008351057767868, "learning_rate": 0.00018438152685251095, "loss": 0.0703, "step": 8753 }, { "epoch": 0.5659151515151515, "grad_norm": 0.05639440193772316, "learning_rate": 0.00018437785682199064, "loss": 0.0976, "step": 8754 }, { "epoch": 0.565979797979798, "grad_norm": 0.05571547523140907, "learning_rate": 0.00018437418639686299, "loss": 0.0876, "step": 8755 }, { "epoch": 0.5660444444444445, "grad_norm": 0.0661916434764862, "learning_rate": 0.00018437051557714518, "loss": 0.1046, "step": 8756 }, { "epoch": 0.5661090909090909, "grad_norm": 0.056157004088163376, "learning_rate": 0.00018436684436285434, "loss": 0.0884, "step": 8757 }, { "epoch": 0.5661737373737373, "grad_norm": 0.05300220474600792, "learning_rate": 0.00018436317275400766, "loss": 0.0799, "step": 8758 }, { "epoch": 0.5662383838383839, "grad_norm": 0.050668004900217056, "learning_rate": 0.0001843595007506223, "loss": 0.0825, "step": 8759 }, { "epoch": 
0.5663030303030303, "grad_norm": 0.05276070535182953, "learning_rate": 0.00018435582835271545, "loss": 0.0912, "step": 8760 }, { "epoch": 0.5663676767676767, "grad_norm": 0.04591405391693115, "learning_rate": 0.0001843521555603043, "loss": 0.0803, "step": 8761 }, { "epoch": 0.5664323232323233, "grad_norm": 0.05628626421093941, "learning_rate": 0.00018434848237340598, "loss": 0.0922, "step": 8762 }, { "epoch": 0.5664969696969697, "grad_norm": 0.0771668404340744, "learning_rate": 0.00018434480879203773, "loss": 0.0798, "step": 8763 }, { "epoch": 0.5665616161616162, "grad_norm": 0.05368519201874733, "learning_rate": 0.00018434113481621665, "loss": 0.0847, "step": 8764 }, { "epoch": 0.5666262626262626, "grad_norm": 0.0622108168900013, "learning_rate": 0.00018433746044595997, "loss": 0.1038, "step": 8765 }, { "epoch": 0.5666909090909091, "grad_norm": 0.054184556007385254, "learning_rate": 0.00018433378568128488, "loss": 0.0872, "step": 8766 }, { "epoch": 0.5667555555555556, "grad_norm": 0.0543886199593544, "learning_rate": 0.00018433011052220856, "loss": 0.0862, "step": 8767 }, { "epoch": 0.566820202020202, "grad_norm": 0.06367596238851547, "learning_rate": 0.00018432643496874818, "loss": 0.0976, "step": 8768 }, { "epoch": 0.566820202020202, "eval_bleu": 18.229525147951474, "eval_loss": 0.0908517986536026, "eval_runtime": 2.6305, "eval_samples_per_second": 12.165, "eval_steps_per_second": 1.521, "step": 8768 }, { "epoch": 0.5668848484848484, "grad_norm": 0.06422419100999832, "learning_rate": 0.00018432275902092096, "loss": 0.0915, "step": 8769 }, { "epoch": 0.566949494949495, "grad_norm": 0.05179256945848465, "learning_rate": 0.00018431908267874407, "loss": 0.0693, "step": 8770 }, { "epoch": 0.5670141414141414, "grad_norm": 0.06738229095935822, "learning_rate": 0.0001843154059422347, "loss": 0.1066, "step": 8771 }, { "epoch": 0.5670787878787878, "grad_norm": 0.046142224222421646, "learning_rate": 0.00018431172881141003, "loss": 0.0725, "step": 8772 }, { "epoch": 
0.5671434343434344, "grad_norm": 0.05671165883541107, "learning_rate": 0.0001843080512862873, "loss": 0.083, "step": 8773 }, { "epoch": 0.5672080808080808, "grad_norm": 0.055166326463222504, "learning_rate": 0.00018430437336688368, "loss": 0.0804, "step": 8774 }, { "epoch": 0.5672727272727273, "grad_norm": 0.0472162663936615, "learning_rate": 0.00018430069505321638, "loss": 0.0714, "step": 8775 }, { "epoch": 0.5673373737373737, "grad_norm": 0.05350706726312637, "learning_rate": 0.0001842970163453026, "loss": 0.0907, "step": 8776 }, { "epoch": 0.5674020202020202, "grad_norm": 0.06162343919277191, "learning_rate": 0.00018429333724315954, "loss": 0.1009, "step": 8777 }, { "epoch": 0.5674666666666667, "grad_norm": 0.056073497980833054, "learning_rate": 0.00018428965774680437, "loss": 0.0799, "step": 8778 }, { "epoch": 0.5675313131313131, "grad_norm": 0.05903627350926399, "learning_rate": 0.0001842859778562544, "loss": 0.0901, "step": 8779 }, { "epoch": 0.5675959595959595, "grad_norm": 0.05817148834466934, "learning_rate": 0.00018428229757152674, "loss": 0.0953, "step": 8780 }, { "epoch": 0.5676606060606061, "grad_norm": 0.05477965623140335, "learning_rate": 0.00018427861689263866, "loss": 0.0895, "step": 8781 }, { "epoch": 0.5677252525252525, "grad_norm": 0.05286632105708122, "learning_rate": 0.00018427493581960733, "loss": 0.0849, "step": 8782 }, { "epoch": 0.567789898989899, "grad_norm": 0.05148129165172577, "learning_rate": 0.00018427125435244998, "loss": 0.0796, "step": 8783 }, { "epoch": 0.5678545454545455, "grad_norm": 0.05298740416765213, "learning_rate": 0.00018426757249118386, "loss": 0.0788, "step": 8784 }, { "epoch": 0.5678545454545455, "eval_bleu": 17.97366585019468, "eval_loss": 0.08973270654678345, "eval_runtime": 2.8867, "eval_samples_per_second": 11.085, "eval_steps_per_second": 1.386, "step": 8784 }, { "epoch": 0.5679191919191919, "grad_norm": 0.052805159240961075, "learning_rate": 0.00018426389023582616, "loss": 0.0874, "step": 8785 }, { "epoch": 
0.5679838383838384, "grad_norm": 0.06367046386003494, "learning_rate": 0.0001842602075863941, "loss": 0.1011, "step": 8786 }, { "epoch": 0.5680484848484848, "grad_norm": 0.05893612653017044, "learning_rate": 0.00018425652454290487, "loss": 0.0807, "step": 8787 }, { "epoch": 0.5681131313131313, "grad_norm": 0.0568956583738327, "learning_rate": 0.00018425284110537576, "loss": 0.0755, "step": 8788 }, { "epoch": 0.5681777777777778, "grad_norm": 0.07088121026754379, "learning_rate": 0.00018424915727382395, "loss": 0.1062, "step": 8789 }, { "epoch": 0.5682424242424242, "grad_norm": 0.051713746041059494, "learning_rate": 0.0001842454730482667, "loss": 0.0696, "step": 8790 }, { "epoch": 0.5683070707070708, "grad_norm": 0.06334646791219711, "learning_rate": 0.0001842417884287212, "loss": 0.1151, "step": 8791 }, { "epoch": 0.5683717171717172, "grad_norm": 0.055170342326164246, "learning_rate": 0.00018423810341520474, "loss": 0.0887, "step": 8792 }, { "epoch": 0.5684363636363636, "grad_norm": 0.0588499940931797, "learning_rate": 0.0001842344180077345, "loss": 0.1013, "step": 8793 }, { "epoch": 0.5685010101010101, "grad_norm": 0.04769960418343544, "learning_rate": 0.00018423073220632773, "loss": 0.0647, "step": 8794 }, { "epoch": 0.5685656565656566, "grad_norm": 0.060603365302085876, "learning_rate": 0.0001842270460110017, "loss": 0.0943, "step": 8795 }, { "epoch": 0.568630303030303, "grad_norm": 0.04931863397359848, "learning_rate": 0.00018422335942177358, "loss": 0.0753, "step": 8796 }, { "epoch": 0.5686949494949495, "grad_norm": 0.05149358883500099, "learning_rate": 0.00018421967243866067, "loss": 0.0751, "step": 8797 }, { "epoch": 0.5687595959595959, "grad_norm": 0.06434503942728043, "learning_rate": 0.0001842159850616802, "loss": 0.0906, "step": 8798 }, { "epoch": 0.5688242424242425, "grad_norm": 0.04883582890033722, "learning_rate": 0.0001842122972908494, "loss": 0.0807, "step": 8799 }, { "epoch": 0.5688888888888889, "grad_norm": 0.05030699074268341, "learning_rate": 
0.00018420860912618552, "loss": 0.0794, "step": 8800 }, { "epoch": 0.5688888888888889, "eval_bleu": 17.030839954709123, "eval_loss": 0.08949118852615356, "eval_runtime": 2.6635, "eval_samples_per_second": 12.014, "eval_steps_per_second": 1.502, "step": 8800 }, { "epoch": 0.5689535353535353, "grad_norm": 0.06241225451231003, "learning_rate": 0.00018420492056770585, "loss": 0.0887, "step": 8801 }, { "epoch": 0.5690181818181819, "grad_norm": 0.04597688093781471, "learning_rate": 0.00018420123161542757, "loss": 0.0703, "step": 8802 }, { "epoch": 0.5690828282828283, "grad_norm": 0.0672423243522644, "learning_rate": 0.00018419754226936796, "loss": 0.1039, "step": 8803 }, { "epoch": 0.5691474747474747, "grad_norm": 0.061652153730392456, "learning_rate": 0.0001841938525295443, "loss": 0.101, "step": 8804 }, { "epoch": 0.5692121212121212, "grad_norm": 0.048494044691324234, "learning_rate": 0.00018419016239597382, "loss": 0.0683, "step": 8805 }, { "epoch": 0.5692767676767677, "grad_norm": 0.05402207002043724, "learning_rate": 0.00018418647186867377, "loss": 0.0892, "step": 8806 }, { "epoch": 0.5693414141414141, "grad_norm": 0.06199387088418007, "learning_rate": 0.00018418278094766143, "loss": 0.0982, "step": 8807 }, { "epoch": 0.5694060606060606, "grad_norm": 0.04882899671792984, "learning_rate": 0.00018417908963295405, "loss": 0.0709, "step": 8808 }, { "epoch": 0.569470707070707, "grad_norm": 0.058178722858428955, "learning_rate": 0.00018417539792456894, "loss": 0.0922, "step": 8809 }, { "epoch": 0.5695353535353536, "grad_norm": 0.05039582774043083, "learning_rate": 0.00018417170582252327, "loss": 0.0821, "step": 8810 }, { "epoch": 0.5696, "grad_norm": 0.05096636712551117, "learning_rate": 0.0001841680133268344, "loss": 0.0696, "step": 8811 }, { "epoch": 0.5696646464646464, "grad_norm": 0.05462642386555672, "learning_rate": 0.0001841643204375195, "loss": 0.0787, "step": 8812 }, { "epoch": 0.569729292929293, "grad_norm": 0.059331879019737244, "learning_rate": 
0.00018416062715459591, "loss": 0.1036, "step": 8813 }, { "epoch": 0.5697939393939394, "grad_norm": 0.048579443246126175, "learning_rate": 0.00018415693347808093, "loss": 0.073, "step": 8814 }, { "epoch": 0.5698585858585858, "grad_norm": 0.059223853051662445, "learning_rate": 0.00018415323940799177, "loss": 0.0837, "step": 8815 }, { "epoch": 0.5699232323232323, "grad_norm": 0.054337091743946075, "learning_rate": 0.00018414954494434574, "loss": 0.0787, "step": 8816 }, { "epoch": 0.5699232323232323, "eval_bleu": 19.183699341305818, "eval_loss": 0.08930224180221558, "eval_runtime": 2.6031, "eval_samples_per_second": 12.293, "eval_steps_per_second": 1.537, "step": 8816 }, { "epoch": 0.5699878787878788, "grad_norm": 0.05047253891825676, "learning_rate": 0.0001841458500871601, "loss": 0.0677, "step": 8817 }, { "epoch": 0.5700525252525253, "grad_norm": 0.06028267741203308, "learning_rate": 0.00018414215483645213, "loss": 0.0942, "step": 8818 }, { "epoch": 0.5701171717171717, "grad_norm": 0.05938512831926346, "learning_rate": 0.00018413845919223914, "loss": 0.1037, "step": 8819 }, { "epoch": 0.5701818181818182, "grad_norm": 0.0597180500626564, "learning_rate": 0.0001841347631545384, "loss": 0.0853, "step": 8820 }, { "epoch": 0.5702464646464647, "grad_norm": 0.058166466653347015, "learning_rate": 0.00018413106672336715, "loss": 0.0905, "step": 8821 }, { "epoch": 0.5703111111111111, "grad_norm": 0.05811283737421036, "learning_rate": 0.00018412736989874273, "loss": 0.0794, "step": 8822 }, { "epoch": 0.5703757575757575, "grad_norm": 0.04459002614021301, "learning_rate": 0.0001841236726806824, "loss": 0.0732, "step": 8823 }, { "epoch": 0.5704404040404041, "grad_norm": 0.049907322973012924, "learning_rate": 0.0001841199750692035, "loss": 0.0739, "step": 8824 }, { "epoch": 0.5705050505050505, "grad_norm": 0.054704487323760986, "learning_rate": 0.00018411627706432324, "loss": 0.0887, "step": 8825 }, { "epoch": 0.570569696969697, "grad_norm": 0.06512756645679474, "learning_rate": 
0.000184112578666059, "loss": 0.0731, "step": 8826 }, { "epoch": 0.5706343434343434, "grad_norm": 0.05398295074701309, "learning_rate": 0.00018410887987442803, "loss": 0.0871, "step": 8827 }, { "epoch": 0.5706989898989899, "grad_norm": 0.05736410617828369, "learning_rate": 0.0001841051806894476, "loss": 0.0948, "step": 8828 }, { "epoch": 0.5707636363636364, "grad_norm": 0.04851173982024193, "learning_rate": 0.00018410148111113508, "loss": 0.072, "step": 8829 }, { "epoch": 0.5708282828282828, "grad_norm": 0.0621102936565876, "learning_rate": 0.00018409778113950771, "loss": 0.1054, "step": 8830 }, { "epoch": 0.5708929292929293, "grad_norm": 0.05425441637635231, "learning_rate": 0.00018409408077458284, "loss": 0.0863, "step": 8831 }, { "epoch": 0.5709575757575758, "grad_norm": 0.060067664831876755, "learning_rate": 0.00018409038001637775, "loss": 0.1103, "step": 8832 }, { "epoch": 0.5709575757575758, "eval_bleu": 17.86836020937673, "eval_loss": 0.0885801762342453, "eval_runtime": 2.7341, "eval_samples_per_second": 11.704, "eval_steps_per_second": 1.463, "step": 8832 }, { "epoch": 0.5710222222222222, "grad_norm": 0.051619138568639755, "learning_rate": 0.00018408667886490976, "loss": 0.0745, "step": 8833 }, { "epoch": 0.5710868686868686, "grad_norm": 0.05455167591571808, "learning_rate": 0.00018408297732019614, "loss": 0.0872, "step": 8834 }, { "epoch": 0.5711515151515152, "grad_norm": 0.051753394305706024, "learning_rate": 0.00018407927538225427, "loss": 0.0901, "step": 8835 }, { "epoch": 0.5712161616161616, "grad_norm": 0.05538632348179817, "learning_rate": 0.0001840755730511014, "loss": 0.0821, "step": 8836 }, { "epoch": 0.5712808080808081, "grad_norm": 0.0507223904132843, "learning_rate": 0.0001840718703267549, "loss": 0.0789, "step": 8837 }, { "epoch": 0.5713454545454546, "grad_norm": 0.049345772713422775, "learning_rate": 0.00018406816720923204, "loss": 0.0701, "step": 8838 }, { "epoch": 0.571410101010101, "grad_norm": 0.04999859631061554, "learning_rate": 
0.00018406446369855016, "loss": 0.0776, "step": 8839 }, { "epoch": 0.5714747474747475, "grad_norm": 0.051941484212875366, "learning_rate": 0.00018406075979472653, "loss": 0.0819, "step": 8840 }, { "epoch": 0.5715393939393939, "grad_norm": 0.04904894158244133, "learning_rate": 0.00018405705549777858, "loss": 0.079, "step": 8841 }, { "epoch": 0.5716040404040404, "grad_norm": 0.05745995044708252, "learning_rate": 0.00018405335080772355, "loss": 0.0925, "step": 8842 }, { "epoch": 0.5716686868686869, "grad_norm": 0.055378567427396774, "learning_rate": 0.00018404964572457877, "loss": 0.0816, "step": 8843 }, { "epoch": 0.5717333333333333, "grad_norm": 0.058112796396017075, "learning_rate": 0.0001840459402483616, "loss": 0.0926, "step": 8844 }, { "epoch": 0.5717979797979797, "grad_norm": 0.066911481320858, "learning_rate": 0.00018404223437908934, "loss": 0.0799, "step": 8845 }, { "epoch": 0.5718626262626263, "grad_norm": 0.057780634611845016, "learning_rate": 0.00018403852811677934, "loss": 0.0973, "step": 8846 }, { "epoch": 0.5719272727272727, "grad_norm": 0.05957140773534775, "learning_rate": 0.00018403482146144893, "loss": 0.0847, "step": 8847 }, { "epoch": 0.5719919191919192, "grad_norm": 0.05005825683474541, "learning_rate": 0.00018403111441311543, "loss": 0.0726, "step": 8848 }, { "epoch": 0.5719919191919192, "eval_bleu": 17.345722789924917, "eval_loss": 0.08882703632116318, "eval_runtime": 2.5874, "eval_samples_per_second": 12.367, "eval_steps_per_second": 1.546, "step": 8848 }, { "epoch": 0.5720565656565657, "grad_norm": 0.05401446670293808, "learning_rate": 0.00018402740697179618, "loss": 0.0832, "step": 8849 }, { "epoch": 0.5721212121212121, "grad_norm": 0.05747033283114433, "learning_rate": 0.00018402369913750853, "loss": 0.0749, "step": 8850 }, { "epoch": 0.5721858585858586, "grad_norm": 0.06496869027614594, "learning_rate": 0.00018401999091026985, "loss": 0.0928, "step": 8851 }, { "epoch": 0.572250505050505, "grad_norm": 0.062131118029356, "learning_rate": 
0.00018401628229009742, "loss": 0.0823, "step": 8852 }, { "epoch": 0.5723151515151516, "grad_norm": 0.05798916146159172, "learning_rate": 0.0001840125732770086, "loss": 0.0874, "step": 8853 }, { "epoch": 0.572379797979798, "grad_norm": 0.052805326879024506, "learning_rate": 0.0001840088638710208, "loss": 0.0831, "step": 8854 }, { "epoch": 0.5724444444444444, "grad_norm": 0.056598760187625885, "learning_rate": 0.00018400515407215127, "loss": 0.083, "step": 8855 }, { "epoch": 0.5725090909090909, "grad_norm": 0.059193000197410583, "learning_rate": 0.0001840014438804174, "loss": 0.0808, "step": 8856 }, { "epoch": 0.5725737373737374, "grad_norm": 0.05053366348147392, "learning_rate": 0.0001839977332958366, "loss": 0.0759, "step": 8857 }, { "epoch": 0.5726383838383838, "grad_norm": 0.05775180086493492, "learning_rate": 0.00018399402231842612, "loss": 0.0935, "step": 8858 }, { "epoch": 0.5727030303030303, "grad_norm": 0.059919122606515884, "learning_rate": 0.0001839903109482034, "loss": 0.0891, "step": 8859 }, { "epoch": 0.5727676767676768, "grad_norm": 0.051013585180044174, "learning_rate": 0.00018398659918518573, "loss": 0.0763, "step": 8860 }, { "epoch": 0.5728323232323232, "grad_norm": 0.07275684177875519, "learning_rate": 0.00018398288702939052, "loss": 0.0985, "step": 8861 }, { "epoch": 0.5728969696969697, "grad_norm": 0.05190551280975342, "learning_rate": 0.0001839791744808351, "loss": 0.0822, "step": 8862 }, { "epoch": 0.5729616161616161, "grad_norm": 0.05404422804713249, "learning_rate": 0.00018397546153953685, "loss": 0.0818, "step": 8863 }, { "epoch": 0.5730262626262627, "grad_norm": 0.05672702565789223, "learning_rate": 0.00018397174820551315, "loss": 0.0934, "step": 8864 }, { "epoch": 0.5730262626262627, "eval_bleu": 17.25692574594251, "eval_loss": 0.08860113471746445, "eval_runtime": 2.6938, "eval_samples_per_second": 11.879, "eval_steps_per_second": 1.485, "step": 8864 }, { "epoch": 0.5730909090909091, "grad_norm": 0.06180180609226227, "learning_rate": 
0.00018396803447878133, "loss": 0.0978, "step": 8865 }, { "epoch": 0.5731555555555555, "grad_norm": 0.0492742620408535, "learning_rate": 0.00018396432035935875, "loss": 0.0749, "step": 8866 }, { "epoch": 0.5732202020202021, "grad_norm": 0.051020022481679916, "learning_rate": 0.00018396060584726284, "loss": 0.0747, "step": 8867 }, { "epoch": 0.5732848484848485, "grad_norm": 0.060598116368055344, "learning_rate": 0.0001839568909425109, "loss": 0.0948, "step": 8868 }, { "epoch": 0.5733494949494949, "grad_norm": 0.050270844250917435, "learning_rate": 0.00018395317564512035, "loss": 0.078, "step": 8869 }, { "epoch": 0.5734141414141414, "grad_norm": 0.05845020338892937, "learning_rate": 0.00018394945995510854, "loss": 0.0945, "step": 8870 }, { "epoch": 0.5734787878787879, "grad_norm": 0.05463476851582527, "learning_rate": 0.00018394574387249287, "loss": 0.0813, "step": 8871 }, { "epoch": 0.5735434343434344, "grad_norm": 0.05384064465761185, "learning_rate": 0.00018394202739729072, "loss": 0.0903, "step": 8872 }, { "epoch": 0.5736080808080808, "grad_norm": 0.052499737590551376, "learning_rate": 0.00018393831052951944, "loss": 0.0921, "step": 8873 }, { "epoch": 0.5736727272727272, "grad_norm": 0.05455370247364044, "learning_rate": 0.00018393459326919646, "loss": 0.0838, "step": 8874 }, { "epoch": 0.5737373737373738, "grad_norm": 0.05087612569332123, "learning_rate": 0.0001839308756163391, "loss": 0.0727, "step": 8875 }, { "epoch": 0.5738020202020202, "grad_norm": 0.050424035638570786, "learning_rate": 0.00018392715757096478, "loss": 0.0816, "step": 8876 }, { "epoch": 0.5738666666666666, "grad_norm": 0.06816119700670242, "learning_rate": 0.0001839234391330909, "loss": 0.0841, "step": 8877 }, { "epoch": 0.5739313131313132, "grad_norm": 0.05493653938174248, "learning_rate": 0.00018391972030273486, "loss": 0.0903, "step": 8878 }, { "epoch": 0.5739959595959596, "grad_norm": 0.05211782082915306, "learning_rate": 0.000183916001079914, "loss": 0.0863, "step": 8879 }, { "epoch": 
0.574060606060606, "grad_norm": 0.05780307948589325, "learning_rate": 0.00018391228146464578, "loss": 0.0909, "step": 8880 }, { "epoch": 0.574060606060606, "eval_bleu": 17.738325132669907, "eval_loss": 0.08965550363063812, "eval_runtime": 2.7413, "eval_samples_per_second": 11.673, "eval_steps_per_second": 1.459, "step": 8880 }, { "epoch": 0.5741252525252525, "grad_norm": 0.05427788197994232, "learning_rate": 0.00018390856145694752, "loss": 0.0829, "step": 8881 }, { "epoch": 0.574189898989899, "grad_norm": 0.05342056229710579, "learning_rate": 0.0001839048410568367, "loss": 0.0869, "step": 8882 }, { "epoch": 0.5742545454545455, "grad_norm": 0.0513044111430645, "learning_rate": 0.00018390112026433065, "loss": 0.076, "step": 8883 }, { "epoch": 0.5743191919191919, "grad_norm": 0.05626235529780388, "learning_rate": 0.00018389739907944682, "loss": 0.0898, "step": 8884 }, { "epoch": 0.5743838383838383, "grad_norm": 0.0567803718149662, "learning_rate": 0.00018389367750220256, "loss": 0.0818, "step": 8885 }, { "epoch": 0.5744484848484849, "grad_norm": 0.061904486268758774, "learning_rate": 0.0001838899555326153, "loss": 0.0986, "step": 8886 }, { "epoch": 0.5745131313131313, "grad_norm": 0.05526834353804588, "learning_rate": 0.0001838862331707025, "loss": 0.0864, "step": 8887 }, { "epoch": 0.5745777777777777, "grad_norm": 0.06748247146606445, "learning_rate": 0.00018388251041648145, "loss": 0.1054, "step": 8888 }, { "epoch": 0.5746424242424243, "grad_norm": 0.06284887343645096, "learning_rate": 0.0001838787872699697, "loss": 0.1108, "step": 8889 }, { "epoch": 0.5747070707070707, "grad_norm": 0.06124211102724075, "learning_rate": 0.00018387506373118454, "loss": 0.0817, "step": 8890 }, { "epoch": 0.5747717171717172, "grad_norm": 0.047787535935640335, "learning_rate": 0.00018387133980014346, "loss": 0.0746, "step": 8891 }, { "epoch": 0.5748363636363636, "grad_norm": 0.04825766757130623, "learning_rate": 0.00018386761547686383, "loss": 0.0666, "step": 8892 }, { "epoch": 
0.5749010101010101, "grad_norm": 0.05553201958537102, "learning_rate": 0.00018386389076136312, "loss": 0.0816, "step": 8893 }, { "epoch": 0.5749656565656566, "grad_norm": 0.054832104593515396, "learning_rate": 0.00018386016565365872, "loss": 0.0862, "step": 8894 }, { "epoch": 0.575030303030303, "grad_norm": 0.06322586536407471, "learning_rate": 0.00018385644015376802, "loss": 0.095, "step": 8895 }, { "epoch": 0.5750949494949495, "grad_norm": 0.057262666523456573, "learning_rate": 0.00018385271426170848, "loss": 0.0843, "step": 8896 }, { "epoch": 0.5750949494949495, "eval_bleu": 19.145167970549977, "eval_loss": 0.08911332488059998, "eval_runtime": 2.7564, "eval_samples_per_second": 11.609, "eval_steps_per_second": 1.451, "step": 8896 }, { "epoch": 0.575159595959596, "grad_norm": 0.04644199088215828, "learning_rate": 0.00018384898797749752, "loss": 0.0728, "step": 8897 }, { "epoch": 0.5752242424242424, "grad_norm": 0.06984365731477737, "learning_rate": 0.0001838452613011526, "loss": 0.1166, "step": 8898 }, { "epoch": 0.5752888888888888, "grad_norm": 0.056202955543994904, "learning_rate": 0.00018384153423269103, "loss": 0.0879, "step": 8899 }, { "epoch": 0.5753535353535354, "grad_norm": 0.06235950440168381, "learning_rate": 0.0001838378067721304, "loss": 0.1038, "step": 8900 }, { "epoch": 0.5754181818181818, "grad_norm": 0.04869581758975983, "learning_rate": 0.00018383407891948804, "loss": 0.0689, "step": 8901 }, { "epoch": 0.5754828282828283, "grad_norm": 0.06001954525709152, "learning_rate": 0.00018383035067478137, "loss": 0.0989, "step": 8902 }, { "epoch": 0.5755474747474747, "grad_norm": 0.05136748403310776, "learning_rate": 0.00018382662203802788, "loss": 0.0858, "step": 8903 }, { "epoch": 0.5756121212121212, "grad_norm": 0.054447803646326065, "learning_rate": 0.00018382289300924502, "loss": 0.0896, "step": 8904 }, { "epoch": 0.5756767676767677, "grad_norm": 0.05060271546244621, "learning_rate": 0.00018381916358845018, "loss": 0.0828, "step": 8905 }, { "epoch": 
0.5757414141414141, "grad_norm": 0.053740210831165314, "learning_rate": 0.00018381543377566083, "loss": 0.0861, "step": 8906 }, { "epoch": 0.5758060606060607, "grad_norm": 0.056781720370054245, "learning_rate": 0.00018381170357089442, "loss": 0.0911, "step": 8907 }, { "epoch": 0.5758707070707071, "grad_norm": 0.04227498546242714, "learning_rate": 0.00018380797297416835, "loss": 0.0592, "step": 8908 }, { "epoch": 0.5759353535353535, "grad_norm": 0.052434686571359634, "learning_rate": 0.00018380424198550014, "loss": 0.0959, "step": 8909 }, { "epoch": 0.576, "grad_norm": 0.058030471205711365, "learning_rate": 0.00018380051060490712, "loss": 0.0961, "step": 8910 }, { "epoch": 0.5760646464646465, "grad_norm": 0.05185621604323387, "learning_rate": 0.00018379677883240687, "loss": 0.0854, "step": 8911 }, { "epoch": 0.5761292929292929, "grad_norm": 0.05114945396780968, "learning_rate": 0.0001837930466680168, "loss": 0.0896, "step": 8912 }, { "epoch": 0.5761292929292929, "eval_bleu": 17.149196693250314, "eval_loss": 0.08907881379127502, "eval_runtime": 2.611, "eval_samples_per_second": 12.256, "eval_steps_per_second": 1.532, "step": 8912 }, { "epoch": 0.5761939393939394, "grad_norm": 0.05824131891131401, "learning_rate": 0.0001837893141117543, "loss": 0.0875, "step": 8913 }, { "epoch": 0.5762585858585858, "grad_norm": 0.05204668641090393, "learning_rate": 0.0001837855811636369, "loss": 0.085, "step": 8914 }, { "epoch": 0.5763232323232323, "grad_norm": 0.07321479171514511, "learning_rate": 0.00018378184782368205, "loss": 0.0849, "step": 8915 }, { "epoch": 0.5763878787878788, "grad_norm": 0.05688861384987831, "learning_rate": 0.00018377811409190723, "loss": 0.0931, "step": 8916 }, { "epoch": 0.5764525252525252, "grad_norm": 0.05993263050913811, "learning_rate": 0.0001837743799683298, "loss": 0.0895, "step": 8917 }, { "epoch": 0.5765171717171718, "grad_norm": 0.041410185396671295, "learning_rate": 0.00018377064545296734, "loss": 0.0668, "step": 8918 }, { "epoch": 
0.5765818181818182, "grad_norm": 0.05078979954123497, "learning_rate": 0.00018376691054583724, "loss": 0.0646, "step": 8919 }, { "epoch": 0.5766464646464646, "grad_norm": 0.07452135533094406, "learning_rate": 0.00018376317524695698, "loss": 0.0708, "step": 8920 }, { "epoch": 0.5767111111111111, "grad_norm": 0.05223538726568222, "learning_rate": 0.00018375943955634404, "loss": 0.0859, "step": 8921 }, { "epoch": 0.5767757575757576, "grad_norm": 0.06285907328128815, "learning_rate": 0.00018375570347401593, "loss": 0.0995, "step": 8922 }, { "epoch": 0.576840404040404, "grad_norm": 0.052025746554136276, "learning_rate": 0.00018375196699999005, "loss": 0.0896, "step": 8923 }, { "epoch": 0.5769050505050505, "grad_norm": 0.046466223895549774, "learning_rate": 0.00018374823013428393, "loss": 0.0712, "step": 8924 }, { "epoch": 0.576969696969697, "grad_norm": 0.0547785647213459, "learning_rate": 0.00018374449287691502, "loss": 0.0937, "step": 8925 }, { "epoch": 0.5770343434343435, "grad_norm": 0.06920119374990463, "learning_rate": 0.00018374075522790078, "loss": 0.0716, "step": 8926 }, { "epoch": 0.5770989898989899, "grad_norm": 0.05027547851204872, "learning_rate": 0.00018373701718725873, "loss": 0.0777, "step": 8927 }, { "epoch": 0.5771636363636363, "grad_norm": 0.05870254337787628, "learning_rate": 0.00018373327875500633, "loss": 0.0814, "step": 8928 }, { "epoch": 0.5771636363636363, "eval_bleu": 20.257299540711752, "eval_loss": 0.08883155882358551, "eval_runtime": 2.666, "eval_samples_per_second": 12.003, "eval_steps_per_second": 1.5, "step": 8928 }, { "epoch": 0.5772282828282829, "grad_norm": 0.04710323363542557, "learning_rate": 0.00018372953993116108, "loss": 0.0674, "step": 8929 }, { "epoch": 0.5772929292929293, "grad_norm": 0.06775926053524017, "learning_rate": 0.00018372580071574044, "loss": 0.0963, "step": 8930 }, { "epoch": 0.5773575757575757, "grad_norm": 0.05401886627078056, "learning_rate": 0.00018372206110876192, "loss": 0.0944, "step": 8931 }, { "epoch": 
0.5774222222222222, "grad_norm": 0.05510074272751808, "learning_rate": 0.00018371832111024297, "loss": 0.0881, "step": 8932 }, { "epoch": 0.5774868686868687, "grad_norm": 0.05112973973155022, "learning_rate": 0.00018371458072020112, "loss": 0.0834, "step": 8933 }, { "epoch": 0.5775515151515151, "grad_norm": 0.05286102741956711, "learning_rate": 0.00018371083993865385, "loss": 0.0732, "step": 8934 }, { "epoch": 0.5776161616161616, "grad_norm": 0.06996668875217438, "learning_rate": 0.00018370709876561867, "loss": 0.0973, "step": 8935 }, { "epoch": 0.5776808080808081, "grad_norm": 0.054446108639240265, "learning_rate": 0.00018370335720111305, "loss": 0.0739, "step": 8936 }, { "epoch": 0.5777454545454546, "grad_norm": 0.05721259489655495, "learning_rate": 0.00018369961524515452, "loss": 0.0903, "step": 8937 }, { "epoch": 0.577810101010101, "grad_norm": 0.05236532911658287, "learning_rate": 0.00018369587289776052, "loss": 0.0813, "step": 8938 }, { "epoch": 0.5778747474747474, "grad_norm": 0.05421515926718712, "learning_rate": 0.00018369213015894864, "loss": 0.0824, "step": 8939 }, { "epoch": 0.577939393939394, "grad_norm": 0.04705904424190521, "learning_rate": 0.0001836883870287363, "loss": 0.0646, "step": 8940 }, { "epoch": 0.5780040404040404, "grad_norm": 0.05539143830537796, "learning_rate": 0.00018368464350714106, "loss": 0.0915, "step": 8941 }, { "epoch": 0.5780686868686868, "grad_norm": 0.06151365488767624, "learning_rate": 0.0001836808995941804, "loss": 0.0854, "step": 8942 }, { "epoch": 0.5781333333333334, "grad_norm": 0.05910845845937729, "learning_rate": 0.00018367715528987187, "loss": 0.0877, "step": 8943 }, { "epoch": 0.5781979797979798, "grad_norm": 0.060380663722753525, "learning_rate": 0.00018367341059423288, "loss": 0.0998, "step": 8944 }, { "epoch": 0.5781979797979798, "eval_bleu": 19.01628237555066, "eval_loss": 0.08872319757938385, "eval_runtime": 2.6829, "eval_samples_per_second": 11.927, "eval_steps_per_second": 1.491, "step": 8944 }, { "epoch": 
0.5782626262626263, "grad_norm": 0.08339352160692215, "learning_rate": 0.00018366966550728106, "loss": 0.0856, "step": 8945 }, { "epoch": 0.5783272727272727, "grad_norm": 0.0528285838663578, "learning_rate": 0.00018366592002903387, "loss": 0.0788, "step": 8946 }, { "epoch": 0.5783919191919192, "grad_norm": 0.057881321758031845, "learning_rate": 0.00018366217415950883, "loss": 0.0861, "step": 8947 }, { "epoch": 0.5784565656565657, "grad_norm": 0.04805448278784752, "learning_rate": 0.00018365842789872346, "loss": 0.0673, "step": 8948 }, { "epoch": 0.5785212121212121, "grad_norm": 0.05197633057832718, "learning_rate": 0.00018365468124669526, "loss": 0.0789, "step": 8949 }, { "epoch": 0.5785858585858585, "grad_norm": 0.04904663935303688, "learning_rate": 0.00018365093420344178, "loss": 0.0673, "step": 8950 }, { "epoch": 0.5786505050505051, "grad_norm": 0.06221050024032593, "learning_rate": 0.00018364718676898054, "loss": 0.1065, "step": 8951 }, { "epoch": 0.5787151515151515, "grad_norm": 0.06326711177825928, "learning_rate": 0.00018364343894332908, "loss": 0.0891, "step": 8952 }, { "epoch": 0.578779797979798, "grad_norm": 0.057041946798563004, "learning_rate": 0.00018363969072650486, "loss": 0.0974, "step": 8953 }, { "epoch": 0.5788444444444445, "grad_norm": 0.06380994617938995, "learning_rate": 0.00018363594211852547, "loss": 0.108, "step": 8954 }, { "epoch": 0.5789090909090909, "grad_norm": 0.05192818492650986, "learning_rate": 0.00018363219311940846, "loss": 0.078, "step": 8955 }, { "epoch": 0.5789737373737374, "grad_norm": 0.05872844159603119, "learning_rate": 0.00018362844372917132, "loss": 0.0912, "step": 8956 }, { "epoch": 0.5790383838383838, "grad_norm": 0.05876574665307999, "learning_rate": 0.0001836246939478316, "loss": 0.0904, "step": 8957 }, { "epoch": 0.5791030303030303, "grad_norm": 0.04860706254839897, "learning_rate": 0.0001836209437754068, "loss": 0.0706, "step": 8958 }, { "epoch": 0.5791676767676768, "grad_norm": 0.0469873882830143, "learning_rate": 
0.00018361719321191453, "loss": 0.0719, "step": 8959 }, { "epoch": 0.5792323232323232, "grad_norm": 0.0537065714597702, "learning_rate": 0.00018361344225737228, "loss": 0.0852, "step": 8960 }, { "epoch": 0.5792323232323232, "eval_bleu": 14.211538754161905, "eval_loss": 0.08863554894924164, "eval_runtime": 2.7753, "eval_samples_per_second": 11.53, "eval_steps_per_second": 1.441, "step": 8960 }, { "epoch": 0.5792969696969696, "grad_norm": 0.050213079899549484, "learning_rate": 0.00018360969091179758, "loss": 0.0829, "step": 8961 }, { "epoch": 0.5793616161616162, "grad_norm": 0.054808687418699265, "learning_rate": 0.00018360593917520802, "loss": 0.0861, "step": 8962 }, { "epoch": 0.5794262626262626, "grad_norm": 0.052315082401037216, "learning_rate": 0.00018360218704762112, "loss": 0.0783, "step": 8963 }, { "epoch": 0.5794909090909091, "grad_norm": 0.047944653779268265, "learning_rate": 0.0001835984345290544, "loss": 0.0731, "step": 8964 }, { "epoch": 0.5795555555555556, "grad_norm": 0.05350741744041443, "learning_rate": 0.00018359468161952547, "loss": 0.0794, "step": 8965 }, { "epoch": 0.579620202020202, "grad_norm": 0.06666690111160278, "learning_rate": 0.00018359092831905186, "loss": 0.0845, "step": 8966 }, { "epoch": 0.5796848484848485, "grad_norm": 0.04913049191236496, "learning_rate": 0.0001835871746276511, "loss": 0.0748, "step": 8967 }, { "epoch": 0.5797494949494949, "grad_norm": 0.06244496628642082, "learning_rate": 0.00018358342054534073, "loss": 0.0998, "step": 8968 }, { "epoch": 0.5798141414141414, "grad_norm": 0.05994952842593193, "learning_rate": 0.00018357966607213837, "loss": 0.0929, "step": 8969 }, { "epoch": 0.5798787878787879, "grad_norm": 0.0575484000146389, "learning_rate": 0.00018357591120806154, "loss": 0.0978, "step": 8970 }, { "epoch": 0.5799434343434343, "grad_norm": 0.05401790514588356, "learning_rate": 0.00018357215595312778, "loss": 0.0833, "step": 8971 }, { "epoch": 0.5800080808080809, "grad_norm": 0.0542006678879261, "learning_rate": 
0.0001835684003073547, "loss": 0.0863, "step": 8972 }, { "epoch": 0.5800727272727273, "grad_norm": 0.07021008431911469, "learning_rate": 0.00018356464427075984, "loss": 0.0753, "step": 8973 }, { "epoch": 0.5801373737373737, "grad_norm": 0.0692056342959404, "learning_rate": 0.00018356088784336075, "loss": 0.085, "step": 8974 }, { "epoch": 0.5802020202020202, "grad_norm": 0.06440766900777817, "learning_rate": 0.000183557131025175, "loss": 0.0664, "step": 8975 }, { "epoch": 0.5802666666666667, "grad_norm": 0.05534060671925545, "learning_rate": 0.0001835533738162202, "loss": 0.0843, "step": 8976 }, { "epoch": 0.5802666666666667, "eval_bleu": 17.106904957149265, "eval_loss": 0.08877456933259964, "eval_runtime": 2.6806, "eval_samples_per_second": 11.937, "eval_steps_per_second": 1.492, "step": 8976 }, { "epoch": 0.5803313131313131, "grad_norm": 0.05138641968369484, "learning_rate": 0.00018354961621651388, "loss": 0.0837, "step": 8977 }, { "epoch": 0.5803959595959596, "grad_norm": 0.05153067409992218, "learning_rate": 0.00018354585822607361, "loss": 0.0837, "step": 8978 }, { "epoch": 0.580460606060606, "grad_norm": 0.053655996918678284, "learning_rate": 0.000183542099844917, "loss": 0.0834, "step": 8979 }, { "epoch": 0.5805252525252526, "grad_norm": 0.0618569441139698, "learning_rate": 0.0001835383410730616, "loss": 0.0888, "step": 8980 }, { "epoch": 0.580589898989899, "grad_norm": 0.05204379931092262, "learning_rate": 0.00018353458191052498, "loss": 0.0761, "step": 8981 }, { "epoch": 0.5806545454545454, "grad_norm": 0.06541129946708679, "learning_rate": 0.00018353082235732476, "loss": 0.1046, "step": 8982 }, { "epoch": 0.580719191919192, "grad_norm": 0.055499717593193054, "learning_rate": 0.00018352706241347853, "loss": 0.0919, "step": 8983 }, { "epoch": 0.5807838383838384, "grad_norm": 0.05879778414964676, "learning_rate": 0.0001835233020790038, "loss": 0.093, "step": 8984 }, { "epoch": 0.5808484848484848, "grad_norm": 0.0593944676220417, "learning_rate": 
0.00018351954135391817, "loss": 0.102, "step": 8985 }, { "epoch": 0.5809131313131313, "grad_norm": 0.05012635886669159, "learning_rate": 0.0001835157802382393, "loss": 0.0695, "step": 8986 }, { "epoch": 0.5809777777777778, "grad_norm": 0.043502818793058395, "learning_rate": 0.00018351201873198473, "loss": 0.0661, "step": 8987 }, { "epoch": 0.5810424242424242, "grad_norm": 0.07047661393880844, "learning_rate": 0.00018350825683517209, "loss": 0.119, "step": 8988 }, { "epoch": 0.5811070707070707, "grad_norm": 0.05263730511069298, "learning_rate": 0.00018350449454781887, "loss": 0.0913, "step": 8989 }, { "epoch": 0.5811717171717171, "grad_norm": 0.05119817331433296, "learning_rate": 0.00018350073186994278, "loss": 0.0795, "step": 8990 }, { "epoch": 0.5812363636363637, "grad_norm": 0.05124109238386154, "learning_rate": 0.00018349696880156136, "loss": 0.0878, "step": 8991 }, { "epoch": 0.5813010101010101, "grad_norm": 0.04700752720236778, "learning_rate": 0.00018349320534269224, "loss": 0.079, "step": 8992 }, { "epoch": 0.5813010101010101, "eval_bleu": 20.72853789307772, "eval_loss": 0.09018982946872711, "eval_runtime": 2.7429, "eval_samples_per_second": 11.667, "eval_steps_per_second": 1.458, "step": 8992 }, { "epoch": 0.5813656565656565, "grad_norm": 0.05666239187121391, "learning_rate": 0.00018348944149335298, "loss": 0.0842, "step": 8993 }, { "epoch": 0.5814303030303031, "grad_norm": 0.0633101686835289, "learning_rate": 0.00018348567725356117, "loss": 0.0923, "step": 8994 }, { "epoch": 0.5814949494949495, "grad_norm": 0.050814785063266754, "learning_rate": 0.0001834819126233345, "loss": 0.0786, "step": 8995 }, { "epoch": 0.5815595959595959, "grad_norm": 0.04521835967898369, "learning_rate": 0.0001834781476026905, "loss": 0.0621, "step": 8996 }, { "epoch": 0.5816242424242424, "grad_norm": 0.05040154606103897, "learning_rate": 0.0001834743821916468, "loss": 0.0733, "step": 8997 }, { "epoch": 0.5816888888888889, "grad_norm": 0.07840932905673981, "learning_rate": 
0.00018347061639022104, "loss": 0.0929, "step": 8998 }, { "epoch": 0.5817535353535354, "grad_norm": 0.06093781813979149, "learning_rate": 0.00018346685019843078, "loss": 0.1025, "step": 8999 }, { "epoch": 0.5818181818181818, "grad_norm": 0.052086006850004196, "learning_rate": 0.00018346308361629366, "loss": 0.0716, "step": 9000 }, { "epoch": 0.5818828282828283, "grad_norm": 0.05810294672846794, "learning_rate": 0.00018345931664382728, "loss": 0.0722, "step": 9001 }, { "epoch": 0.5819474747474748, "grad_norm": 0.058901891112327576, "learning_rate": 0.00018345554928104926, "loss": 0.095, "step": 9002 }, { "epoch": 0.5820121212121212, "grad_norm": 0.05859379097819328, "learning_rate": 0.00018345178152797724, "loss": 0.0876, "step": 9003 }, { "epoch": 0.5820767676767676, "grad_norm": 0.06831064075231552, "learning_rate": 0.00018344801338462884, "loss": 0.0954, "step": 9004 }, { "epoch": 0.5821414141414142, "grad_norm": 0.08787602931261063, "learning_rate": 0.00018344424485102162, "loss": 0.096, "step": 9005 }, { "epoch": 0.5822060606060606, "grad_norm": 0.06024113669991493, "learning_rate": 0.00018344047592717331, "loss": 0.102, "step": 9006 }, { "epoch": 0.582270707070707, "grad_norm": 0.06056505814194679, "learning_rate": 0.00018343670661310143, "loss": 0.0821, "step": 9007 }, { "epoch": 0.5823353535353535, "grad_norm": 0.052631836384534836, "learning_rate": 0.00018343293690882372, "loss": 0.0877, "step": 9008 }, { "epoch": 0.5823353535353535, "eval_bleu": 19.939369959378237, "eval_loss": 0.08966313302516937, "eval_runtime": 2.6802, "eval_samples_per_second": 11.94, "eval_steps_per_second": 1.492, "step": 9008 }, { "epoch": 0.5824, "grad_norm": 0.061435870826244354, "learning_rate": 0.0001834291668143577, "loss": 0.0861, "step": 9009 }, { "epoch": 0.5824646464646465, "grad_norm": 0.05063420161604881, "learning_rate": 0.00018342539632972105, "loss": 0.0733, "step": 9010 }, { "epoch": 0.5825292929292929, "grad_norm": 0.04881446436047554, "learning_rate": 
0.0001834216254549314, "loss": 0.0746, "step": 9011 }, { "epoch": 0.5825939393939394, "grad_norm": 0.054577428847551346, "learning_rate": 0.00018341785419000642, "loss": 0.0945, "step": 9012 }, { "epoch": 0.5826585858585859, "grad_norm": 0.058025527745485306, "learning_rate": 0.0001834140825349637, "loss": 0.0993, "step": 9013 }, { "epoch": 0.5827232323232323, "grad_norm": 0.055110957473516464, "learning_rate": 0.00018341031048982085, "loss": 0.1007, "step": 9014 }, { "epoch": 0.5827878787878787, "grad_norm": 0.04645516723394394, "learning_rate": 0.0001834065380545956, "loss": 0.0661, "step": 9015 }, { "epoch": 0.5828525252525253, "grad_norm": 0.059841178357601166, "learning_rate": 0.0001834027652293055, "loss": 0.0952, "step": 9016 }, { "epoch": 0.5829171717171717, "grad_norm": 0.06034037470817566, "learning_rate": 0.00018339899201396827, "loss": 0.0988, "step": 9017 }, { "epoch": 0.5829818181818182, "grad_norm": 0.054601769894361496, "learning_rate": 0.00018339521840860152, "loss": 0.0806, "step": 9018 }, { "epoch": 0.5830464646464647, "grad_norm": 0.05624205246567726, "learning_rate": 0.00018339144441322292, "loss": 0.0767, "step": 9019 }, { "epoch": 0.5831111111111111, "grad_norm": 0.057488199323415756, "learning_rate": 0.00018338767002785007, "loss": 0.105, "step": 9020 }, { "epoch": 0.5831757575757576, "grad_norm": 0.058403223752975464, "learning_rate": 0.00018338389525250067, "loss": 0.0795, "step": 9021 }, { "epoch": 0.583240404040404, "grad_norm": 0.05518298223614693, "learning_rate": 0.00018338012008719238, "loss": 0.0881, "step": 9022 }, { "epoch": 0.5833050505050505, "grad_norm": 0.05550944060087204, "learning_rate": 0.0001833763445319428, "loss": 0.0727, "step": 9023 }, { "epoch": 0.583369696969697, "grad_norm": 0.06442783772945404, "learning_rate": 0.00018337256858676963, "loss": 0.0898, "step": 9024 }, { "epoch": 0.583369696969697, "eval_bleu": 19.025609905731283, "eval_loss": 0.08997128158807755, "eval_runtime": 2.7677, "eval_samples_per_second": 
11.562, "eval_steps_per_second": 1.445, "step": 9024 }, { "epoch": 0.5834343434343434, "grad_norm": 0.05557333305478096, "learning_rate": 0.00018336879225169054, "loss": 0.0785, "step": 9025 }, { "epoch": 0.5834989898989899, "grad_norm": 0.047359272837638855, "learning_rate": 0.00018336501552672317, "loss": 0.0645, "step": 9026 }, { "epoch": 0.5835636363636364, "grad_norm": 0.052981987595558167, "learning_rate": 0.00018336123841188515, "loss": 0.0816, "step": 9027 }, { "epoch": 0.5836282828282828, "grad_norm": 0.056850019842386246, "learning_rate": 0.00018335746090719418, "loss": 0.0754, "step": 9028 }, { "epoch": 0.5836929292929293, "grad_norm": 0.059317875653505325, "learning_rate": 0.00018335368301266797, "loss": 0.0949, "step": 9029 }, { "epoch": 0.5837575757575758, "grad_norm": 0.05483614280819893, "learning_rate": 0.0001833499047283241, "loss": 0.0825, "step": 9030 }, { "epoch": 0.5838222222222222, "grad_norm": 0.06098262593150139, "learning_rate": 0.00018334612605418028, "loss": 0.0832, "step": 9031 }, { "epoch": 0.5838868686868687, "grad_norm": 0.08006541430950165, "learning_rate": 0.00018334234699025418, "loss": 0.0753, "step": 9032 }, { "epoch": 0.5839515151515151, "grad_norm": 0.05083772540092468, "learning_rate": 0.0001833385675365635, "loss": 0.0712, "step": 9033 }, { "epoch": 0.5840161616161617, "grad_norm": 0.05805937200784683, "learning_rate": 0.00018333478769312587, "loss": 0.0702, "step": 9034 }, { "epoch": 0.5840808080808081, "grad_norm": 0.060439225286245346, "learning_rate": 0.00018333100745995898, "loss": 0.0898, "step": 9035 }, { "epoch": 0.5841454545454545, "grad_norm": 0.059651970863342285, "learning_rate": 0.00018332722683708053, "loss": 0.0979, "step": 9036 }, { "epoch": 0.584210101010101, "grad_norm": 0.05369448661804199, "learning_rate": 0.0001833234458245082, "loss": 0.0845, "step": 9037 }, { "epoch": 0.5842747474747475, "grad_norm": 0.07275805622339249, "learning_rate": 0.00018331966442225965, "loss": 0.106, "step": 9038 }, { "epoch": 
0.5843393939393939, "grad_norm": 0.05892793461680412, "learning_rate": 0.00018331588263035257, "loss": 0.0886, "step": 9039 }, { "epoch": 0.5844040404040404, "grad_norm": 0.052489738911390305, "learning_rate": 0.00018331210044880467, "loss": 0.0914, "step": 9040 }, { "epoch": 0.5844040404040404, "eval_bleu": 19.455499922956722, "eval_loss": 0.08997312188148499, "eval_runtime": 2.8399, "eval_samples_per_second": 11.268, "eval_steps_per_second": 1.408, "step": 9040 }, { "epoch": 0.5844686868686869, "grad_norm": 0.06368999928236008, "learning_rate": 0.00018330831787763359, "loss": 0.1072, "step": 9041 }, { "epoch": 0.5845333333333333, "grad_norm": 0.048123542219400406, "learning_rate": 0.00018330453491685706, "loss": 0.0688, "step": 9042 }, { "epoch": 0.5845979797979798, "grad_norm": 0.058485087007284164, "learning_rate": 0.00018330075156649275, "loss": 0.0873, "step": 9043 }, { "epoch": 0.5846626262626262, "grad_norm": 0.046576812863349915, "learning_rate": 0.0001832969678265584, "loss": 0.0687, "step": 9044 }, { "epoch": 0.5847272727272728, "grad_norm": 0.056535787880420685, "learning_rate": 0.00018329318369707161, "loss": 0.0789, "step": 9045 }, { "epoch": 0.5847919191919192, "grad_norm": 0.04668867215514183, "learning_rate": 0.00018328939917805018, "loss": 0.0663, "step": 9046 }, { "epoch": 0.5848565656565656, "grad_norm": 0.051056817173957825, "learning_rate": 0.00018328561426951177, "loss": 0.0754, "step": 9047 }, { "epoch": 0.5849212121212122, "grad_norm": 0.0595892034471035, "learning_rate": 0.00018328182897147405, "loss": 0.093, "step": 9048 }, { "epoch": 0.5849858585858586, "grad_norm": 0.051213834434747696, "learning_rate": 0.00018327804328395475, "loss": 0.0712, "step": 9049 }, { "epoch": 0.585050505050505, "grad_norm": 0.0491357184946537, "learning_rate": 0.00018327425720697158, "loss": 0.0704, "step": 9050 }, { "epoch": 0.5851151515151515, "grad_norm": 0.05003630742430687, "learning_rate": 0.00018327047074054225, "loss": 0.0739, "step": 9051 }, { 
"epoch": 0.585179797979798, "grad_norm": 0.06251582503318787, "learning_rate": 0.00018326668388468444, "loss": 0.0931, "step": 9052 }, { "epoch": 0.5852444444444445, "grad_norm": 0.06152809411287308, "learning_rate": 0.00018326289663941588, "loss": 0.0958, "step": 9053 }, { "epoch": 0.5853090909090909, "grad_norm": 0.05457994341850281, "learning_rate": 0.0001832591090047543, "loss": 0.0785, "step": 9054 }, { "epoch": 0.5853737373737373, "grad_norm": 0.05122746527194977, "learning_rate": 0.00018325532098071738, "loss": 0.0787, "step": 9055 }, { "epoch": 0.5854383838383839, "grad_norm": 0.05261232331395149, "learning_rate": 0.00018325153256732284, "loss": 0.0807, "step": 9056 }, { "epoch": 0.5854383838383839, "eval_bleu": 20.137258253624285, "eval_loss": 0.0892399251461029, "eval_runtime": 2.8845, "eval_samples_per_second": 11.094, "eval_steps_per_second": 1.387, "step": 9056 }, { "epoch": 0.5855030303030303, "grad_norm": 0.0595356822013855, "learning_rate": 0.0001832477437645884, "loss": 0.1049, "step": 9057 }, { "epoch": 0.5855676767676767, "grad_norm": 0.05547074228525162, "learning_rate": 0.00018324395457253179, "loss": 0.0914, "step": 9058 }, { "epoch": 0.5856323232323233, "grad_norm": 0.07887762039899826, "learning_rate": 0.00018324016499117072, "loss": 0.0772, "step": 9059 }, { "epoch": 0.5856969696969697, "grad_norm": 0.05774722620844841, "learning_rate": 0.00018323637502052291, "loss": 0.0888, "step": 9060 }, { "epoch": 0.5857616161616161, "grad_norm": 0.0565013512969017, "learning_rate": 0.0001832325846606061, "loss": 0.0911, "step": 9061 }, { "epoch": 0.5858262626262626, "grad_norm": 0.057539187371730804, "learning_rate": 0.00018322879391143802, "loss": 0.0879, "step": 9062 }, { "epoch": 0.5858909090909091, "grad_norm": 0.05531030148267746, "learning_rate": 0.00018322500277303638, "loss": 0.0841, "step": 9063 }, { "epoch": 0.5859555555555556, "grad_norm": 0.05072743818163872, "learning_rate": 0.0001832212112454189, "loss": 0.0812, "step": 9064 }, { 
"epoch": 0.586020202020202, "grad_norm": 0.05519331619143486, "learning_rate": 0.00018321741932860337, "loss": 0.0785, "step": 9065 }, { "epoch": 0.5860848484848484, "grad_norm": 0.06017308682203293, "learning_rate": 0.00018321362702260742, "loss": 0.1014, "step": 9066 }, { "epoch": 0.586149494949495, "grad_norm": 0.05984785407781601, "learning_rate": 0.00018320983432744889, "loss": 0.0911, "step": 9067 }, { "epoch": 0.5862141414141414, "grad_norm": 0.053321730345487595, "learning_rate": 0.00018320604124314548, "loss": 0.0719, "step": 9068 }, { "epoch": 0.5862787878787878, "grad_norm": 0.07865922898054123, "learning_rate": 0.00018320224776971489, "loss": 0.0943, "step": 9069 }, { "epoch": 0.5863434343434344, "grad_norm": 0.05222305282950401, "learning_rate": 0.0001831984539071749, "loss": 0.0836, "step": 9070 }, { "epoch": 0.5864080808080808, "grad_norm": 0.05956115573644638, "learning_rate": 0.00018319465965554326, "loss": 0.0877, "step": 9071 }, { "epoch": 0.5864727272727273, "grad_norm": 0.053884778171777725, "learning_rate": 0.00018319086501483769, "loss": 0.0855, "step": 9072 }, { "epoch": 0.5864727272727273, "eval_bleu": 18.341474252057864, "eval_loss": 0.08925361186265945, "eval_runtime": 2.6973, "eval_samples_per_second": 11.864, "eval_steps_per_second": 1.483, "step": 9072 }, { "epoch": 0.5865373737373737, "grad_norm": 0.057896438986063004, "learning_rate": 0.00018318706998507594, "loss": 0.0923, "step": 9073 }, { "epoch": 0.5866020202020202, "grad_norm": 0.058821361511945724, "learning_rate": 0.0001831832745662758, "loss": 0.0926, "step": 9074 }, { "epoch": 0.5866666666666667, "grad_norm": 0.04767459258437157, "learning_rate": 0.00018317947875845495, "loss": 0.0626, "step": 9075 }, { "epoch": 0.5867313131313131, "grad_norm": 0.0576208271086216, "learning_rate": 0.00018317568256163118, "loss": 0.081, "step": 9076 }, { "epoch": 0.5867959595959596, "grad_norm": 0.056605804711580276, "learning_rate": 0.00018317188597582223, "loss": 0.0864, "step": 9077 }, { 
"epoch": 0.5868606060606061, "grad_norm": 0.050927501171827316, "learning_rate": 0.0001831680890010459, "loss": 0.0737, "step": 9078 }, { "epoch": 0.5869252525252525, "grad_norm": 0.0724203810095787, "learning_rate": 0.00018316429163731988, "loss": 0.1268, "step": 9079 }, { "epoch": 0.586989898989899, "grad_norm": 0.061362482607364655, "learning_rate": 0.00018316049388466196, "loss": 0.0939, "step": 9080 }, { "epoch": 0.5870545454545455, "grad_norm": 0.04795632138848305, "learning_rate": 0.00018315669574308992, "loss": 0.068, "step": 9081 }, { "epoch": 0.5871191919191919, "grad_norm": 0.05766903981566429, "learning_rate": 0.0001831528972126215, "loss": 0.0831, "step": 9082 }, { "epoch": 0.5871838383838384, "grad_norm": 0.05098048597574234, "learning_rate": 0.0001831490982932745, "loss": 0.0808, "step": 9083 }, { "epoch": 0.5872484848484848, "grad_norm": 0.048666246235370636, "learning_rate": 0.0001831452989850666, "loss": 0.0719, "step": 9084 }, { "epoch": 0.5873131313131313, "grad_norm": 0.061897989362478256, "learning_rate": 0.00018314149928801566, "loss": 0.0912, "step": 9085 }, { "epoch": 0.5873777777777778, "grad_norm": 0.054546110332012177, "learning_rate": 0.0001831376992021394, "loss": 0.0811, "step": 9086 }, { "epoch": 0.5874424242424242, "grad_norm": 0.06040853634476662, "learning_rate": 0.0001831338987274556, "loss": 0.0885, "step": 9087 }, { "epoch": 0.5875070707070708, "grad_norm": 0.06230511516332626, "learning_rate": 0.00018313009786398207, "loss": 0.1045, "step": 9088 }, { "epoch": 0.5875070707070708, "eval_bleu": 16.537561713523708, "eval_loss": 0.08950278162956238, "eval_runtime": 2.879, "eval_samples_per_second": 11.115, "eval_steps_per_second": 1.389, "step": 9088 }, { "epoch": 0.5875717171717172, "grad_norm": 0.052356161177158356, "learning_rate": 0.00018312629661173654, "loss": 0.0861, "step": 9089 }, { "epoch": 0.5876363636363636, "grad_norm": 0.047860585153102875, "learning_rate": 0.0001831224949707368, "loss": 0.0757, "step": 9090 }, { 
"epoch": 0.5877010101010101, "grad_norm": 0.05536630004644394, "learning_rate": 0.0001831186929410006, "loss": 0.0784, "step": 9091 }, { "epoch": 0.5877656565656566, "grad_norm": 0.05321324244141579, "learning_rate": 0.0001831148905225458, "loss": 0.0764, "step": 9092 }, { "epoch": 0.587830303030303, "grad_norm": 0.051683299243450165, "learning_rate": 0.00018311108771539012, "loss": 0.0765, "step": 9093 }, { "epoch": 0.5878949494949495, "grad_norm": 0.04887934401631355, "learning_rate": 0.00018310728451955135, "loss": 0.069, "step": 9094 }, { "epoch": 0.5879595959595959, "grad_norm": 0.06719184666872025, "learning_rate": 0.00018310348093504726, "loss": 0.1037, "step": 9095 }, { "epoch": 0.5880242424242424, "grad_norm": 0.04895949363708496, "learning_rate": 0.0001830996769618957, "loss": 0.0674, "step": 9096 }, { "epoch": 0.5880888888888889, "grad_norm": 0.05349082499742508, "learning_rate": 0.00018309587260011443, "loss": 0.0865, "step": 9097 }, { "epoch": 0.5881535353535353, "grad_norm": 0.05967748910188675, "learning_rate": 0.00018309206784972122, "loss": 0.1008, "step": 9098 }, { "epoch": 0.5882181818181819, "grad_norm": 0.05608963221311569, "learning_rate": 0.00018308826271073387, "loss": 0.0833, "step": 9099 }, { "epoch": 0.5882828282828283, "grad_norm": 0.05630242079496384, "learning_rate": 0.0001830844571831702, "loss": 0.0994, "step": 9100 }, { "epoch": 0.5883474747474747, "grad_norm": 0.05887025222182274, "learning_rate": 0.00018308065126704796, "loss": 0.0896, "step": 9101 }, { "epoch": 0.5884121212121212, "grad_norm": 0.05496933311223984, "learning_rate": 0.000183076844962385, "loss": 0.0777, "step": 9102 }, { "epoch": 0.5884767676767677, "grad_norm": 0.061057910323143005, "learning_rate": 0.0001830730382691991, "loss": 0.0938, "step": 9103 }, { "epoch": 0.5885414141414141, "grad_norm": 0.05915696918964386, "learning_rate": 0.0001830692311875081, "loss": 0.0909, "step": 9104 }, { "epoch": 0.5885414141414141, "eval_bleu": 15.549553715645697, "eval_loss": 
0.0892290249466896, "eval_runtime": 2.7909, "eval_samples_per_second": 11.466, "eval_steps_per_second": 1.433, "step": 9104 }, { "epoch": 0.5886060606060606, "grad_norm": 0.05124753713607788, "learning_rate": 0.00018306542371732972, "loss": 0.0758, "step": 9105 }, { "epoch": 0.5886707070707071, "grad_norm": 0.05264751985669136, "learning_rate": 0.00018306161585868183, "loss": 0.0801, "step": 9106 }, { "epoch": 0.5887353535353536, "grad_norm": 0.0647093802690506, "learning_rate": 0.00018305780761158222, "loss": 0.1024, "step": 9107 }, { "epoch": 0.5888, "grad_norm": 0.05752125754952431, "learning_rate": 0.0001830539989760487, "loss": 0.0956, "step": 9108 }, { "epoch": 0.5888646464646464, "grad_norm": 0.057029854506254196, "learning_rate": 0.00018305018995209908, "loss": 0.0879, "step": 9109 }, { "epoch": 0.588929292929293, "grad_norm": 0.054997947067022324, "learning_rate": 0.00018304638053975118, "loss": 0.0836, "step": 9110 }, { "epoch": 0.5889939393939394, "grad_norm": 0.06565428525209427, "learning_rate": 0.00018304257073902282, "loss": 0.0894, "step": 9111 }, { "epoch": 0.5890585858585858, "grad_norm": 0.057895317673683167, "learning_rate": 0.0001830387605499318, "loss": 0.0859, "step": 9112 }, { "epoch": 0.5891232323232323, "grad_norm": 0.04485045745968819, "learning_rate": 0.00018303494997249598, "loss": 0.0656, "step": 9113 }, { "epoch": 0.5891878787878788, "grad_norm": 0.05734753608703613, "learning_rate": 0.0001830311390067331, "loss": 0.0794, "step": 9114 }, { "epoch": 0.5892525252525252, "grad_norm": 0.06022331118583679, "learning_rate": 0.00018302732765266107, "loss": 0.0958, "step": 9115 }, { "epoch": 0.5893171717171717, "grad_norm": 0.05231441557407379, "learning_rate": 0.00018302351591029768, "loss": 0.076, "step": 9116 }, { "epoch": 0.5893818181818182, "grad_norm": 0.059548091143369675, "learning_rate": 0.00018301970377966072, "loss": 0.092, "step": 9117 }, { "epoch": 0.5894464646464647, "grad_norm": 0.06287868320941925, "learning_rate": 
0.00018301589126076806, "loss": 0.1058, "step": 9118 }, { "epoch": 0.5895111111111111, "grad_norm": 0.06589928269386292, "learning_rate": 0.00018301207835363755, "loss": 0.1138, "step": 9119 }, { "epoch": 0.5895757575757575, "grad_norm": 0.05299884080886841, "learning_rate": 0.00018300826505828694, "loss": 0.074, "step": 9120 }, { "epoch": 0.5895757575757575, "eval_bleu": 20.644439489447322, "eval_loss": 0.08887513726949692, "eval_runtime": 2.7055, "eval_samples_per_second": 11.828, "eval_steps_per_second": 1.478, "step": 9120 }, { "epoch": 0.5896404040404041, "grad_norm": 0.05180409550666809, "learning_rate": 0.00018300445137473418, "loss": 0.069, "step": 9121 }, { "epoch": 0.5897050505050505, "grad_norm": 0.05940230190753937, "learning_rate": 0.000183000637302997, "loss": 0.0783, "step": 9122 }, { "epoch": 0.5897696969696969, "grad_norm": 0.049868032336235046, "learning_rate": 0.00018299682284309327, "loss": 0.0739, "step": 9123 }, { "epoch": 0.5898343434343435, "grad_norm": 0.0506473109126091, "learning_rate": 0.00018299300799504086, "loss": 0.0728, "step": 9124 }, { "epoch": 0.5898989898989899, "grad_norm": 0.046401411294937134, "learning_rate": 0.0001829891927588576, "loss": 0.0736, "step": 9125 }, { "epoch": 0.5899636363636364, "grad_norm": 0.0579676479101181, "learning_rate": 0.0001829853771345613, "loss": 0.0955, "step": 9126 }, { "epoch": 0.5900282828282828, "grad_norm": 0.06306740641593933, "learning_rate": 0.0001829815611221698, "loss": 0.1118, "step": 9127 }, { "epoch": 0.5900929292929293, "grad_norm": 0.069278284907341, "learning_rate": 0.00018297774472170102, "loss": 0.0916, "step": 9128 }, { "epoch": 0.5901575757575758, "grad_norm": 0.05062253028154373, "learning_rate": 0.00018297392793317275, "loss": 0.0833, "step": 9129 }, { "epoch": 0.5902222222222222, "grad_norm": 0.05516158044338226, "learning_rate": 0.00018297011075660286, "loss": 0.088, "step": 9130 }, { "epoch": 0.5902868686868686, "grad_norm": 0.06656276434659958, "learning_rate": 
0.00018296629319200917, "loss": 0.1135, "step": 9131 }, { "epoch": 0.5903515151515152, "grad_norm": 0.056892819702625275, "learning_rate": 0.00018296247523940954, "loss": 0.087, "step": 9132 }, { "epoch": 0.5904161616161616, "grad_norm": 0.05316544324159622, "learning_rate": 0.00018295865689882187, "loss": 0.0791, "step": 9133 }, { "epoch": 0.590480808080808, "grad_norm": 0.055557265877723694, "learning_rate": 0.00018295483817026397, "loss": 0.0811, "step": 9134 }, { "epoch": 0.5905454545454546, "grad_norm": 0.06192019581794739, "learning_rate": 0.00018295101905375374, "loss": 0.0889, "step": 9135 }, { "epoch": 0.590610101010101, "grad_norm": 0.046728238463401794, "learning_rate": 0.00018294719954930902, "loss": 0.0749, "step": 9136 }, { "epoch": 0.590610101010101, "eval_bleu": 18.055484140860703, "eval_loss": 0.08662500232458115, "eval_runtime": 2.7112, "eval_samples_per_second": 11.803, "eval_steps_per_second": 1.475, "step": 9136 }, { "epoch": 0.5906747474747475, "grad_norm": 0.058726560324430466, "learning_rate": 0.00018294337965694762, "loss": 0.0937, "step": 9137 }, { "epoch": 0.5907393939393939, "grad_norm": 0.060054194182157516, "learning_rate": 0.00018293955937668749, "loss": 0.1024, "step": 9138 }, { "epoch": 0.5908040404040404, "grad_norm": 0.06933224946260452, "learning_rate": 0.00018293573870854646, "loss": 0.0803, "step": 9139 }, { "epoch": 0.5908686868686869, "grad_norm": 0.05684564635157585, "learning_rate": 0.0001829319176525424, "loss": 0.0877, "step": 9140 }, { "epoch": 0.5909333333333333, "grad_norm": 0.060081273317337036, "learning_rate": 0.00018292809620869315, "loss": 0.0903, "step": 9141 }, { "epoch": 0.5909979797979797, "grad_norm": 0.05961374193429947, "learning_rate": 0.00018292427437701663, "loss": 0.0994, "step": 9142 }, { "epoch": 0.5910626262626263, "grad_norm": 0.05367889255285263, "learning_rate": 0.0001829204521575307, "loss": 0.0852, "step": 9143 }, { "epoch": 0.5911272727272727, "grad_norm": 0.05915446579456329, "learning_rate": 
0.00018291662955025322, "loss": 0.0948, "step": 9144 }, { "epoch": 0.5911919191919192, "grad_norm": 0.11964838206768036, "learning_rate": 0.00018291280655520207, "loss": 0.0912, "step": 9145 }, { "epoch": 0.5912565656565657, "grad_norm": 0.07057321816682816, "learning_rate": 0.00018290898317239513, "loss": 0.1025, "step": 9146 }, { "epoch": 0.5913212121212121, "grad_norm": 0.056596554815769196, "learning_rate": 0.0001829051594018503, "loss": 0.0847, "step": 9147 }, { "epoch": 0.5913858585858586, "grad_norm": 0.05452312156558037, "learning_rate": 0.00018290133524358546, "loss": 0.08, "step": 9148 }, { "epoch": 0.591450505050505, "grad_norm": 0.048939790576696396, "learning_rate": 0.00018289751069761845, "loss": 0.0709, "step": 9149 }, { "epoch": 0.5915151515151515, "grad_norm": 0.052288249135017395, "learning_rate": 0.0001828936857639672, "loss": 0.084, "step": 9150 }, { "epoch": 0.591579797979798, "grad_norm": 0.058101505041122437, "learning_rate": 0.00018288986044264957, "loss": 0.0845, "step": 9151 }, { "epoch": 0.5916444444444444, "grad_norm": 0.05799223482608795, "learning_rate": 0.0001828860347336835, "loss": 0.0987, "step": 9152 }, { "epoch": 0.5916444444444444, "eval_bleu": 16.97222879895455, "eval_loss": 0.0874587744474411, "eval_runtime": 2.8663, "eval_samples_per_second": 11.164, "eval_steps_per_second": 1.396, "step": 9152 }, { "epoch": 0.591709090909091, "grad_norm": 0.06227225065231323, "learning_rate": 0.00018288220863708682, "loss": 0.0976, "step": 9153 }, { "epoch": 0.5917737373737374, "grad_norm": 0.055021677166223526, "learning_rate": 0.0001828783821528775, "loss": 0.0874, "step": 9154 }, { "epoch": 0.5918383838383838, "grad_norm": 0.0508236289024353, "learning_rate": 0.00018287455528107334, "loss": 0.0815, "step": 9155 }, { "epoch": 0.5919030303030303, "grad_norm": 0.059219490736722946, "learning_rate": 0.00018287072802169226, "loss": 0.1048, "step": 9156 }, { "epoch": 0.5919676767676768, "grad_norm": 0.05515627935528755, "learning_rate": 
0.0001828669003747522, "loss": 0.0895, "step": 9157 }, { "epoch": 0.5920323232323232, "grad_norm": 0.056971173733472824, "learning_rate": 0.00018286307234027108, "loss": 0.0892, "step": 9158 }, { "epoch": 0.5920969696969697, "grad_norm": 0.04291302710771561, "learning_rate": 0.00018285924391826672, "loss": 0.076, "step": 9159 }, { "epoch": 0.5921616161616161, "grad_norm": 0.0629926547408104, "learning_rate": 0.0001828554151087571, "loss": 0.1051, "step": 9160 }, { "epoch": 0.5922262626262627, "grad_norm": 0.05361894890666008, "learning_rate": 0.00018285158591176007, "loss": 0.0907, "step": 9161 }, { "epoch": 0.5922909090909091, "grad_norm": 0.0692467987537384, "learning_rate": 0.00018284775632729358, "loss": 0.1083, "step": 9162 }, { "epoch": 0.5923555555555555, "grad_norm": 0.055932480841875076, "learning_rate": 0.00018284392635537552, "loss": 0.0846, "step": 9163 }, { "epoch": 0.5924202020202021, "grad_norm": 0.0649254322052002, "learning_rate": 0.0001828400959960238, "loss": 0.1019, "step": 9164 }, { "epoch": 0.5924848484848485, "grad_norm": 0.04836229234933853, "learning_rate": 0.00018283626524925634, "loss": 0.0741, "step": 9165 }, { "epoch": 0.5925494949494949, "grad_norm": 0.05911018326878548, "learning_rate": 0.00018283243411509106, "loss": 0.0768, "step": 9166 }, { "epoch": 0.5926141414141414, "grad_norm": 0.05254668742418289, "learning_rate": 0.00018282860259354587, "loss": 0.0795, "step": 9167 }, { "epoch": 0.5926787878787879, "grad_norm": 0.05523259565234184, "learning_rate": 0.00018282477068463866, "loss": 0.0859, "step": 9168 }, { "epoch": 0.5926787878787879, "eval_bleu": 16.37941015637504, "eval_loss": 0.08848515897989273, "eval_runtime": 2.9206, "eval_samples_per_second": 10.957, "eval_steps_per_second": 1.37, "step": 9168 }, { "epoch": 0.5927434343434343, "grad_norm": 0.05766342580318451, "learning_rate": 0.00018282093838838738, "loss": 0.0913, "step": 9169 }, { "epoch": 0.5928080808080808, "grad_norm": 0.0504431426525116, "learning_rate": 
0.00018281710570481, "loss": 0.0742, "step": 9170 }, { "epoch": 0.5928727272727272, "grad_norm": 0.06160716339945793, "learning_rate": 0.00018281327263392435, "loss": 0.0817, "step": 9171 }, { "epoch": 0.5929373737373738, "grad_norm": 0.05753431096673012, "learning_rate": 0.00018280943917574843, "loss": 0.0831, "step": 9172 }, { "epoch": 0.5930020202020202, "grad_norm": 0.05552075058221817, "learning_rate": 0.00018280560533030012, "loss": 0.0892, "step": 9173 }, { "epoch": 0.5930666666666666, "grad_norm": 0.055398326367139816, "learning_rate": 0.00018280177109759738, "loss": 0.0724, "step": 9174 }, { "epoch": 0.5931313131313132, "grad_norm": 0.051229942589998245, "learning_rate": 0.0001827979364776581, "loss": 0.0622, "step": 9175 }, { "epoch": 0.5931959595959596, "grad_norm": 0.05887843668460846, "learning_rate": 0.00018279410147050025, "loss": 0.102, "step": 9176 }, { "epoch": 0.593260606060606, "grad_norm": 0.05895410105586052, "learning_rate": 0.0001827902660761418, "loss": 0.1015, "step": 9177 }, { "epoch": 0.5933252525252525, "grad_norm": 0.0489514023065567, "learning_rate": 0.00018278643029460062, "loss": 0.0718, "step": 9178 }, { "epoch": 0.593389898989899, "grad_norm": 0.05140523612499237, "learning_rate": 0.00018278259412589468, "loss": 0.0788, "step": 9179 }, { "epoch": 0.5934545454545455, "grad_norm": 0.052932802587747574, "learning_rate": 0.0001827787575700419, "loss": 0.0756, "step": 9180 }, { "epoch": 0.5935191919191919, "grad_norm": 0.057676516473293304, "learning_rate": 0.00018277492062706029, "loss": 0.0927, "step": 9181 }, { "epoch": 0.5935838383838384, "grad_norm": 0.06462277472019196, "learning_rate": 0.00018277108329696766, "loss": 0.0962, "step": 9182 }, { "epoch": 0.5936484848484849, "grad_norm": 0.05582078546285629, "learning_rate": 0.0001827672455797821, "loss": 0.0845, "step": 9183 }, { "epoch": 0.5937131313131313, "grad_norm": 0.06156732141971588, "learning_rate": 0.00018276340747552146, "loss": 0.1035, "step": 9184 }, { "epoch": 
0.5937131313131313, "eval_bleu": 17.713132678192952, "eval_loss": 0.08888290822505951, "eval_runtime": 2.7424, "eval_samples_per_second": 11.669, "eval_steps_per_second": 1.459, "step": 9184 }, { "epoch": 0.5937777777777777, "grad_norm": 0.05700775235891342, "learning_rate": 0.00018275956898420375, "loss": 0.0929, "step": 9185 }, { "epoch": 0.5938424242424243, "grad_norm": 0.05120936408638954, "learning_rate": 0.00018275573010584687, "loss": 0.087, "step": 9186 }, { "epoch": 0.5939070707070707, "grad_norm": 0.05586719140410423, "learning_rate": 0.0001827518908404688, "loss": 0.089, "step": 9187 }, { "epoch": 0.5939717171717172, "grad_norm": 0.047926779836416245, "learning_rate": 0.0001827480511880875, "loss": 0.0735, "step": 9188 }, { "epoch": 0.5940363636363636, "grad_norm": 0.06225164234638214, "learning_rate": 0.00018274421114872094, "loss": 0.0946, "step": 9189 }, { "epoch": 0.5941010101010101, "grad_norm": 0.04869600012898445, "learning_rate": 0.00018274037072238704, "loss": 0.0741, "step": 9190 }, { "epoch": 0.5941656565656566, "grad_norm": 0.05059714987874031, "learning_rate": 0.00018273652990910376, "loss": 0.0744, "step": 9191 }, { "epoch": 0.594230303030303, "grad_norm": 0.058421261608600616, "learning_rate": 0.0001827326887088891, "loss": 0.0881, "step": 9192 }, { "epoch": 0.5942949494949495, "grad_norm": 0.05466179922223091, "learning_rate": 0.000182728847121761, "loss": 0.0969, "step": 9193 }, { "epoch": 0.594359595959596, "grad_norm": 0.05249101668596268, "learning_rate": 0.00018272500514773747, "loss": 0.0814, "step": 9194 }, { "epoch": 0.5944242424242424, "grad_norm": 0.05772356316447258, "learning_rate": 0.00018272116278683642, "loss": 0.0692, "step": 9195 }, { "epoch": 0.5944888888888888, "grad_norm": 0.05059539154171944, "learning_rate": 0.00018271732003907583, "loss": 0.0809, "step": 9196 }, { "epoch": 0.5945535353535354, "grad_norm": 0.056555405259132385, "learning_rate": 0.00018271347690447366, "loss": 0.0898, "step": 9197 }, { "epoch": 
0.5946181818181818, "grad_norm": 0.05917682126164436, "learning_rate": 0.00018270963338304794, "loss": 0.0997, "step": 9198 }, { "epoch": 0.5946828282828283, "grad_norm": 0.047227684408426285, "learning_rate": 0.00018270578947481657, "loss": 0.0684, "step": 9199 }, { "epoch": 0.5947474747474748, "grad_norm": 0.04991176351904869, "learning_rate": 0.00018270194517979759, "loss": 0.0768, "step": 9200 }, { "epoch": 0.5947474747474748, "eval_bleu": 18.521954453077385, "eval_loss": 0.08885680139064789, "eval_runtime": 2.8349, "eval_samples_per_second": 11.288, "eval_steps_per_second": 1.411, "step": 9200 }, { "epoch": 0.5948121212121212, "grad_norm": 0.05131318047642708, "learning_rate": 0.00018269810049800897, "loss": 0.0818, "step": 9201 }, { "epoch": 0.5948767676767677, "grad_norm": 0.055266525596380234, "learning_rate": 0.00018269425542946863, "loss": 0.0802, "step": 9202 }, { "epoch": 0.5949414141414141, "grad_norm": 0.05111261084675789, "learning_rate": 0.00018269040997419462, "loss": 0.0773, "step": 9203 }, { "epoch": 0.5950060606060606, "grad_norm": 0.05634641647338867, "learning_rate": 0.00018268656413220492, "loss": 0.0882, "step": 9204 }, { "epoch": 0.5950707070707071, "grad_norm": 0.05886472016572952, "learning_rate": 0.00018268271790351746, "loss": 0.1028, "step": 9205 }, { "epoch": 0.5951353535353535, "grad_norm": 0.05364375561475754, "learning_rate": 0.00018267887128815027, "loss": 0.084, "step": 9206 }, { "epoch": 0.5952, "grad_norm": 0.0759287178516388, "learning_rate": 0.00018267502428612133, "loss": 0.0814, "step": 9207 }, { "epoch": 0.5952646464646465, "grad_norm": 0.05766788497567177, "learning_rate": 0.00018267117689744865, "loss": 0.0843, "step": 9208 }, { "epoch": 0.5953292929292929, "grad_norm": 0.05501188710331917, "learning_rate": 0.00018266732912215018, "loss": 0.0875, "step": 9209 }, { "epoch": 0.5953939393939394, "grad_norm": 0.05616890639066696, "learning_rate": 0.00018266348096024397, "loss": 0.0894, "step": 9210 }, { "epoch": 
0.5954585858585859, "grad_norm": 0.05870002135634422, "learning_rate": 0.00018265963241174797, "loss": 0.0884, "step": 9211 }, { "epoch": 0.5955232323232323, "grad_norm": 0.061203282326459885, "learning_rate": 0.0001826557834766802, "loss": 0.0879, "step": 9212 }, { "epoch": 0.5955878787878788, "grad_norm": 0.05529177933931351, "learning_rate": 0.00018265193415505866, "loss": 0.0873, "step": 9213 }, { "epoch": 0.5956525252525252, "grad_norm": 0.06088469177484512, "learning_rate": 0.00018264808444690137, "loss": 0.0755, "step": 9214 }, { "epoch": 0.5957171717171718, "grad_norm": 0.062172774225473404, "learning_rate": 0.00018264423435222627, "loss": 0.0951, "step": 9215 }, { "epoch": 0.5957818181818182, "grad_norm": 0.05622595176100731, "learning_rate": 0.0001826403838710514, "loss": 0.0823, "step": 9216 }, { "epoch": 0.5957818181818182, "eval_bleu": 18.818238109103604, "eval_loss": 0.08792939782142639, "eval_runtime": 2.7799, "eval_samples_per_second": 11.511, "eval_steps_per_second": 1.439, "step": 9216 }, { "epoch": 0.5958464646464646, "grad_norm": 0.057026248425245285, "learning_rate": 0.00018263653300339483, "loss": 0.0836, "step": 9217 }, { "epoch": 0.5959111111111111, "grad_norm": 0.052822601050138474, "learning_rate": 0.00018263268174927445, "loss": 0.0841, "step": 9218 }, { "epoch": 0.5959757575757576, "grad_norm": 0.05876884236931801, "learning_rate": 0.0001826288301087084, "loss": 0.0737, "step": 9219 }, { "epoch": 0.596040404040404, "grad_norm": 0.05007770285010338, "learning_rate": 0.0001826249780817146, "loss": 0.0715, "step": 9220 }, { "epoch": 0.5961050505050505, "grad_norm": 0.06130215898156166, "learning_rate": 0.00018262112566831107, "loss": 0.1012, "step": 9221 }, { "epoch": 0.596169696969697, "grad_norm": 0.05636681616306305, "learning_rate": 0.00018261727286851586, "loss": 0.091, "step": 9222 }, { "epoch": 0.5962343434343435, "grad_norm": 0.04954428970813751, "learning_rate": 0.000182613419682347, "loss": 0.0754, "step": 9223 }, { "epoch": 
0.5962989898989899, "grad_norm": 0.055411115288734436, "learning_rate": 0.00018260956610982248, "loss": 0.0782, "step": 9224 }, { "epoch": 0.5963636363636363, "grad_norm": 0.05129629373550415, "learning_rate": 0.00018260571215096032, "loss": 0.0808, "step": 9225 }, { "epoch": 0.5964282828282829, "grad_norm": 0.05359562858939171, "learning_rate": 0.00018260185780577857, "loss": 0.0747, "step": 9226 }, { "epoch": 0.5964929292929293, "grad_norm": 0.05646972358226776, "learning_rate": 0.00018259800307429523, "loss": 0.0906, "step": 9227 }, { "epoch": 0.5965575757575757, "grad_norm": 0.05575394257903099, "learning_rate": 0.00018259414795652833, "loss": 0.0865, "step": 9228 }, { "epoch": 0.5966222222222223, "grad_norm": 0.05891105532646179, "learning_rate": 0.0001825902924524959, "loss": 0.0863, "step": 9229 }, { "epoch": 0.5966868686868687, "grad_norm": 0.057797789573669434, "learning_rate": 0.00018258643656221597, "loss": 0.0914, "step": 9230 }, { "epoch": 0.5967515151515151, "grad_norm": 0.05347130447626114, "learning_rate": 0.0001825825802857066, "loss": 0.0909, "step": 9231 }, { "epoch": 0.5968161616161616, "grad_norm": 0.05399787798523903, "learning_rate": 0.0001825787236229858, "loss": 0.0764, "step": 9232 }, { "epoch": 0.5968161616161616, "eval_bleu": 17.636425600101944, "eval_loss": 0.08881048858165741, "eval_runtime": 2.7402, "eval_samples_per_second": 11.678, "eval_steps_per_second": 1.46, "step": 9232 }, { "epoch": 0.5968808080808081, "grad_norm": 0.05056849867105484, "learning_rate": 0.00018257486657407158, "loss": 0.0846, "step": 9233 }, { "epoch": 0.5969454545454546, "grad_norm": 0.058239854872226715, "learning_rate": 0.00018257100913898205, "loss": 0.0954, "step": 9234 }, { "epoch": 0.597010101010101, "grad_norm": 0.054200366139411926, "learning_rate": 0.00018256715131773519, "loss": 0.0727, "step": 9235 }, { "epoch": 0.5970747474747474, "grad_norm": 0.05450156703591347, "learning_rate": 0.00018256329311034904, "loss": 0.09, "step": 9236 }, { "epoch": 
0.597139393939394, "grad_norm": 0.06140401214361191, "learning_rate": 0.0001825594345168417, "loss": 0.0939, "step": 9237 }, { "epoch": 0.5972040404040404, "grad_norm": 0.06339334696531296, "learning_rate": 0.00018255557553723114, "loss": 0.0963, "step": 9238 }, { "epoch": 0.5972686868686868, "grad_norm": 0.05680403858423233, "learning_rate": 0.0001825517161715355, "loss": 0.0993, "step": 9239 }, { "epoch": 0.5973333333333334, "grad_norm": 0.05393490940332413, "learning_rate": 0.00018254785641977272, "loss": 0.0778, "step": 9240 }, { "epoch": 0.5973979797979798, "grad_norm": 0.05475551262497902, "learning_rate": 0.00018254399628196092, "loss": 0.0795, "step": 9241 }, { "epoch": 0.5974626262626263, "grad_norm": 0.04973144829273224, "learning_rate": 0.00018254013575811815, "loss": 0.0829, "step": 9242 }, { "epoch": 0.5975272727272727, "grad_norm": 0.06083136796951294, "learning_rate": 0.00018253627484826246, "loss": 0.0978, "step": 9243 }, { "epoch": 0.5975919191919192, "grad_norm": 0.04901028424501419, "learning_rate": 0.00018253241355241189, "loss": 0.0756, "step": 9244 }, { "epoch": 0.5976565656565657, "grad_norm": 0.06351029127836227, "learning_rate": 0.0001825285518705845, "loss": 0.0922, "step": 9245 }, { "epoch": 0.5977212121212121, "grad_norm": 0.055549055337905884, "learning_rate": 0.00018252468980279837, "loss": 0.0834, "step": 9246 }, { "epoch": 0.5977858585858585, "grad_norm": 0.04851272702217102, "learning_rate": 0.0001825208273490715, "loss": 0.0726, "step": 9247 }, { "epoch": 0.5978505050505051, "grad_norm": 0.05590742826461792, "learning_rate": 0.00018251696450942206, "loss": 0.0829, "step": 9248 }, { "epoch": 0.5978505050505051, "eval_bleu": 17.739425705178597, "eval_loss": 0.08769983053207397, "eval_runtime": 2.7166, "eval_samples_per_second": 11.779, "eval_steps_per_second": 1.472, "step": 9248 }, { "epoch": 0.5979151515151515, "grad_norm": 0.06067168712615967, "learning_rate": 0.00018251310128386802, "loss": 0.1007, "step": 9249 }, { "epoch": 
0.597979797979798, "grad_norm": 0.05523289740085602, "learning_rate": 0.00018250923767242753, "loss": 0.0809, "step": 9250 }, { "epoch": 0.5980444444444445, "grad_norm": 0.052563391625881195, "learning_rate": 0.00018250537367511856, "loss": 0.0812, "step": 9251 }, { "epoch": 0.5981090909090909, "grad_norm": 0.05136674642562866, "learning_rate": 0.00018250150929195922, "loss": 0.0766, "step": 9252 }, { "epoch": 0.5981737373737374, "grad_norm": 0.05551496893167496, "learning_rate": 0.00018249764452296765, "loss": 0.0845, "step": 9253 }, { "epoch": 0.5982383838383838, "grad_norm": 0.05758613720536232, "learning_rate": 0.00018249377936816185, "loss": 0.0821, "step": 9254 }, { "epoch": 0.5983030303030303, "grad_norm": 0.06347978860139847, "learning_rate": 0.0001824899138275599, "loss": 0.0839, "step": 9255 }, { "epoch": 0.5983676767676768, "grad_norm": 0.04841926321387291, "learning_rate": 0.00018248604790117992, "loss": 0.0719, "step": 9256 }, { "epoch": 0.5984323232323232, "grad_norm": 0.05642183497548103, "learning_rate": 0.00018248218158903993, "loss": 0.0975, "step": 9257 }, { "epoch": 0.5984969696969697, "grad_norm": 0.051033586263656616, "learning_rate": 0.0001824783148911581, "loss": 0.0852, "step": 9258 }, { "epoch": 0.5985616161616162, "grad_norm": 0.04538574814796448, "learning_rate": 0.0001824744478075524, "loss": 0.0739, "step": 9259 }, { "epoch": 0.5986262626262626, "grad_norm": 0.05266017094254494, "learning_rate": 0.000182470580338241, "loss": 0.0897, "step": 9260 }, { "epoch": 0.598690909090909, "grad_norm": 0.0466657318174839, "learning_rate": 0.00018246671248324197, "loss": 0.0811, "step": 9261 }, { "epoch": 0.5987555555555556, "grad_norm": 0.0608215369284153, "learning_rate": 0.00018246284424257335, "loss": 0.1064, "step": 9262 }, { "epoch": 0.598820202020202, "grad_norm": 0.05590904504060745, "learning_rate": 0.00018245897561625332, "loss": 0.0853, "step": 9263 }, { "epoch": 0.5988848484848485, "grad_norm": 0.0669223964214325, "learning_rate": 
0.00018245510660429986, "loss": 0.0918, "step": 9264 }, { "epoch": 0.5988848484848485, "eval_bleu": 18.94005312509076, "eval_loss": 0.0880223885178566, "eval_runtime": 2.701, "eval_samples_per_second": 11.848, "eval_steps_per_second": 1.481, "step": 9264 }, { "epoch": 0.5989494949494949, "grad_norm": 0.050235409289598465, "learning_rate": 0.00018245123720673118, "loss": 0.0821, "step": 9265 }, { "epoch": 0.5990141414141414, "grad_norm": 0.053051866590976715, "learning_rate": 0.00018244736742356528, "loss": 0.0758, "step": 9266 }, { "epoch": 0.5990787878787879, "grad_norm": 0.05043734982609749, "learning_rate": 0.00018244349725482034, "loss": 0.085, "step": 9267 }, { "epoch": 0.5991434343434343, "grad_norm": 0.0520324669778347, "learning_rate": 0.0001824396267005144, "loss": 0.071, "step": 9268 }, { "epoch": 0.5992080808080809, "grad_norm": 0.04643169417977333, "learning_rate": 0.0001824357557606656, "loss": 0.0731, "step": 9269 }, { "epoch": 0.5992727272727273, "grad_norm": 0.061684783548116684, "learning_rate": 0.000182431884435292, "loss": 0.087, "step": 9270 }, { "epoch": 0.5993373737373737, "grad_norm": 0.05080613121390343, "learning_rate": 0.00018242801272441174, "loss": 0.0798, "step": 9271 }, { "epoch": 0.5994020202020202, "grad_norm": 0.06759650260210037, "learning_rate": 0.0001824241406280429, "loss": 0.0884, "step": 9272 }, { "epoch": 0.5994666666666667, "grad_norm": 0.05792969837784767, "learning_rate": 0.0001824202681462036, "loss": 0.093, "step": 9273 }, { "epoch": 0.5995313131313131, "grad_norm": 0.05206482857465744, "learning_rate": 0.00018241639527891197, "loss": 0.0802, "step": 9274 }, { "epoch": 0.5995959595959596, "grad_norm": 0.07848216593265533, "learning_rate": 0.00018241252202618607, "loss": 0.0818, "step": 9275 }, { "epoch": 0.599660606060606, "grad_norm": 0.05379154533147812, "learning_rate": 0.0001824086483880441, "loss": 0.0852, "step": 9276 }, { "epoch": 0.5997252525252526, "grad_norm": 0.05447760224342346, "learning_rate": 
0.0001824047743645041, "loss": 0.0809, "step": 9277 }, { "epoch": 0.599789898989899, "grad_norm": 0.052436552941799164, "learning_rate": 0.00018240089995558422, "loss": 0.086, "step": 9278 }, { "epoch": 0.5998545454545454, "grad_norm": 0.05265559256076813, "learning_rate": 0.00018239702516130256, "loss": 0.0808, "step": 9279 }, { "epoch": 0.599919191919192, "grad_norm": 0.052759911864995956, "learning_rate": 0.00018239314998167726, "loss": 0.0816, "step": 9280 }, { "epoch": 0.599919191919192, "eval_bleu": 16.225892484487847, "eval_loss": 0.08973203599452972, "eval_runtime": 2.7248, "eval_samples_per_second": 11.744, "eval_steps_per_second": 1.468, "step": 9280 }, { "epoch": 0.5999838383838384, "grad_norm": 0.06109708920121193, "learning_rate": 0.00018238927441672646, "loss": 0.109, "step": 9281 }, { "epoch": 0.6000484848484848, "grad_norm": 0.0599808543920517, "learning_rate": 0.00018238539846646823, "loss": 0.1023, "step": 9282 }, { "epoch": 0.6001131313131313, "grad_norm": 0.05814085155725479, "learning_rate": 0.00018238152213092073, "loss": 0.0983, "step": 9283 }, { "epoch": 0.6001777777777778, "grad_norm": 0.053232502192258835, "learning_rate": 0.0001823776454101021, "loss": 0.086, "step": 9284 }, { "epoch": 0.6002424242424242, "grad_norm": 0.05021049827337265, "learning_rate": 0.00018237376830403045, "loss": 0.0798, "step": 9285 }, { "epoch": 0.6003070707070707, "grad_norm": 0.053982459008693695, "learning_rate": 0.00018236989081272392, "loss": 0.0863, "step": 9286 }, { "epoch": 0.6003717171717172, "grad_norm": 0.05667713284492493, "learning_rate": 0.00018236601293620064, "loss": 0.093, "step": 9287 }, { "epoch": 0.6004363636363637, "grad_norm": 0.057849954813718796, "learning_rate": 0.00018236213467447872, "loss": 0.0855, "step": 9288 }, { "epoch": 0.6005010101010101, "grad_norm": 0.053041454404592514, "learning_rate": 0.00018235825602757637, "loss": 0.073, "step": 9289 }, { "epoch": 0.6005656565656565, "grad_norm": 0.04976919665932655, "learning_rate": 
0.00018235437699551165, "loss": 0.0837, "step": 9290 }, { "epoch": 0.6006303030303031, "grad_norm": 0.046836528927087784, "learning_rate": 0.00018235049757830277, "loss": 0.0794, "step": 9291 }, { "epoch": 0.6006949494949495, "grad_norm": 0.05389755591750145, "learning_rate": 0.0001823466177759678, "loss": 0.0791, "step": 9292 }, { "epoch": 0.6007595959595959, "grad_norm": 0.061286166310310364, "learning_rate": 0.00018234273758852495, "loss": 0.0762, "step": 9293 }, { "epoch": 0.6008242424242424, "grad_norm": 0.05562683939933777, "learning_rate": 0.00018233885701599229, "loss": 0.0882, "step": 9294 }, { "epoch": 0.6008888888888889, "grad_norm": 0.046313006430864334, "learning_rate": 0.00018233497605838806, "loss": 0.0769, "step": 9295 }, { "epoch": 0.6009535353535354, "grad_norm": 0.047027621418237686, "learning_rate": 0.00018233109471573036, "loss": 0.0678, "step": 9296 }, { "epoch": 0.6009535353535354, "eval_bleu": 17.3187830481455, "eval_loss": 0.08799010515213013, "eval_runtime": 2.7473, "eval_samples_per_second": 11.648, "eval_steps_per_second": 1.456, "step": 9296 }, { "epoch": 0.6010181818181818, "grad_norm": 0.059704769402742386, "learning_rate": 0.00018232721298803734, "loss": 0.0891, "step": 9297 }, { "epoch": 0.6010828282828283, "grad_norm": 0.05856990069150925, "learning_rate": 0.00018232333087532716, "loss": 0.0995, "step": 9298 }, { "epoch": 0.6011474747474748, "grad_norm": 0.056184861809015274, "learning_rate": 0.000182319448377618, "loss": 0.0855, "step": 9299 }, { "epoch": 0.6012121212121212, "grad_norm": 0.05260922387242317, "learning_rate": 0.000182315565494928, "loss": 0.0785, "step": 9300 }, { "epoch": 0.6012767676767676, "grad_norm": 0.0494307316839695, "learning_rate": 0.0001823116822272753, "loss": 0.0723, "step": 9301 }, { "epoch": 0.6013414141414142, "grad_norm": 0.06754857301712036, "learning_rate": 0.00018230779857467803, "loss": 0.0836, "step": 9302 }, { "epoch": 0.6014060606060606, "grad_norm": 0.05381741374731064, "learning_rate": 
0.00018230391453715443, "loss": 0.079, "step": 9303 }, { "epoch": 0.601470707070707, "grad_norm": 0.058093100786209106, "learning_rate": 0.00018230003011472263, "loss": 0.0917, "step": 9304 }, { "epoch": 0.6015353535353536, "grad_norm": 0.05695437267422676, "learning_rate": 0.0001822961453074008, "loss": 0.0894, "step": 9305 }, { "epoch": 0.6016, "grad_norm": 0.0551203191280365, "learning_rate": 0.00018229226011520712, "loss": 0.082, "step": 9306 }, { "epoch": 0.6016646464646465, "grad_norm": 0.059350986033678055, "learning_rate": 0.00018228837453815972, "loss": 0.0997, "step": 9307 }, { "epoch": 0.6017292929292929, "grad_norm": 0.05375274270772934, "learning_rate": 0.0001822844885762768, "loss": 0.0751, "step": 9308 }, { "epoch": 0.6017939393939394, "grad_norm": 0.057413555681705475, "learning_rate": 0.00018228060222957653, "loss": 0.0944, "step": 9309 }, { "epoch": 0.6018585858585859, "grad_norm": 0.060371384024620056, "learning_rate": 0.00018227671549807711, "loss": 0.1007, "step": 9310 }, { "epoch": 0.6019232323232323, "grad_norm": 0.05411671847105026, "learning_rate": 0.00018227282838179668, "loss": 0.0857, "step": 9311 }, { "epoch": 0.6019878787878787, "grad_norm": 0.05060362070798874, "learning_rate": 0.0001822689408807534, "loss": 0.0692, "step": 9312 }, { "epoch": 0.6019878787878787, "eval_bleu": 16.58864454720468, "eval_loss": 0.08865423500537872, "eval_runtime": 2.6126, "eval_samples_per_second": 12.248, "eval_steps_per_second": 1.531, "step": 9312 }, { "epoch": 0.6020525252525253, "grad_norm": 0.04841672256588936, "learning_rate": 0.00018226505299496552, "loss": 0.0827, "step": 9313 }, { "epoch": 0.6021171717171717, "grad_norm": 0.05453760176897049, "learning_rate": 0.00018226116472445117, "loss": 0.0874, "step": 9314 }, { "epoch": 0.6021818181818182, "grad_norm": 0.06099946051836014, "learning_rate": 0.00018225727606922853, "loss": 0.0845, "step": 9315 }, { "epoch": 0.6022464646464647, "grad_norm": 0.05958755314350128, "learning_rate": 
0.00018225338702931583, "loss": 0.0896, "step": 9316 }, { "epoch": 0.6023111111111111, "grad_norm": 0.04226629063487053, "learning_rate": 0.00018224949760473123, "loss": 0.0716, "step": 9317 }, { "epoch": 0.6023757575757576, "grad_norm": 0.05748404935002327, "learning_rate": 0.0001822456077954929, "loss": 0.0896, "step": 9318 }, { "epoch": 0.602440404040404, "grad_norm": 0.08999787271022797, "learning_rate": 0.00018224171760161908, "loss": 0.0927, "step": 9319 }, { "epoch": 0.6025050505050505, "grad_norm": 0.05240680277347565, "learning_rate": 0.0001822378270231279, "loss": 0.0743, "step": 9320 }, { "epoch": 0.602569696969697, "grad_norm": 0.05584258213639259, "learning_rate": 0.00018223393606003763, "loss": 0.0925, "step": 9321 }, { "epoch": 0.6026343434343434, "grad_norm": 0.05348473787307739, "learning_rate": 0.00018223004471236637, "loss": 0.0841, "step": 9322 }, { "epoch": 0.6026989898989898, "grad_norm": 0.05990305915474892, "learning_rate": 0.00018222615298013243, "loss": 0.1047, "step": 9323 }, { "epoch": 0.6027636363636364, "grad_norm": 0.04625198245048523, "learning_rate": 0.00018222226086335393, "loss": 0.0681, "step": 9324 }, { "epoch": 0.6028282828282828, "grad_norm": 0.054871100932359695, "learning_rate": 0.0001822183683620491, "loss": 0.0896, "step": 9325 }, { "epoch": 0.6028929292929293, "grad_norm": 0.05907516926527023, "learning_rate": 0.00018221447547623617, "loss": 0.0948, "step": 9326 }, { "epoch": 0.6029575757575758, "grad_norm": 0.05664428323507309, "learning_rate": 0.0001822105822059333, "loss": 0.0828, "step": 9327 }, { "epoch": 0.6030222222222222, "grad_norm": 0.04121410474181175, "learning_rate": 0.00018220668855115869, "loss": 0.0567, "step": 9328 }, { "epoch": 0.6030222222222222, "eval_bleu": 16.0204884365074, "eval_loss": 0.08812561631202698, "eval_runtime": 2.7487, "eval_samples_per_second": 11.642, "eval_steps_per_second": 1.455, "step": 9328 }, { "epoch": 0.6030868686868687, "grad_norm": 0.054203350096940994, "learning_rate": 
0.00018220279451193057, "loss": 0.0851, "step": 9329 }, { "epoch": 0.6031515151515151, "grad_norm": 0.06393826007843018, "learning_rate": 0.00018219890008826722, "loss": 0.1022, "step": 9330 }, { "epoch": 0.6032161616161617, "grad_norm": 0.047972671687603, "learning_rate": 0.00018219500528018673, "loss": 0.0704, "step": 9331 }, { "epoch": 0.6032808080808081, "grad_norm": 0.053569648414850235, "learning_rate": 0.0001821911100877074, "loss": 0.0787, "step": 9332 }, { "epoch": 0.6033454545454545, "grad_norm": 0.053759828209877014, "learning_rate": 0.00018218721451084742, "loss": 0.0781, "step": 9333 }, { "epoch": 0.6034101010101011, "grad_norm": 0.06731259077787399, "learning_rate": 0.00018218331854962498, "loss": 0.0796, "step": 9334 }, { "epoch": 0.6034747474747475, "grad_norm": 0.04890581592917442, "learning_rate": 0.00018217942220405834, "loss": 0.075, "step": 9335 }, { "epoch": 0.6035393939393939, "grad_norm": 0.05613734945654869, "learning_rate": 0.00018217552547416575, "loss": 0.0945, "step": 9336 }, { "epoch": 0.6036040404040404, "grad_norm": 0.052223462611436844, "learning_rate": 0.00018217162835996534, "loss": 0.0747, "step": 9337 }, { "epoch": 0.6036686868686869, "grad_norm": 0.04944951459765434, "learning_rate": 0.00018216773086147543, "loss": 0.0731, "step": 9338 }, { "epoch": 0.6037333333333333, "grad_norm": 0.055942948907613754, "learning_rate": 0.0001821638329787142, "loss": 0.0972, "step": 9339 }, { "epoch": 0.6037979797979798, "grad_norm": 0.056910108774900436, "learning_rate": 0.00018215993471169987, "loss": 0.0871, "step": 9340 }, { "epoch": 0.6038626262626262, "grad_norm": 0.05987938866019249, "learning_rate": 0.00018215603606045068, "loss": 0.0945, "step": 9341 }, { "epoch": 0.6039272727272728, "grad_norm": 0.05712134391069412, "learning_rate": 0.00018215213702498488, "loss": 0.0773, "step": 9342 }, { "epoch": 0.6039919191919192, "grad_norm": 0.05468282103538513, "learning_rate": 0.0001821482376053207, "loss": 0.0923, "step": 9343 }, { "epoch": 
0.6040565656565656, "grad_norm": 0.04780352860689163, "learning_rate": 0.0001821443378014764, "loss": 0.0721, "step": 9344 }, { "epoch": 0.6040565656565656, "eval_bleu": 16.530617019067666, "eval_loss": 0.08821222186088562, "eval_runtime": 2.854, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.402, "step": 9344 }, { "epoch": 0.6041212121212122, "grad_norm": 0.05933382734656334, "learning_rate": 0.00018214043761347013, "loss": 0.0897, "step": 9345 }, { "epoch": 0.6041858585858586, "grad_norm": 0.056929804384708405, "learning_rate": 0.00018213653704132022, "loss": 0.0943, "step": 9346 }, { "epoch": 0.604250505050505, "grad_norm": 0.05111568048596382, "learning_rate": 0.00018213263608504488, "loss": 0.0846, "step": 9347 }, { "epoch": 0.6043151515151515, "grad_norm": 0.05490010976791382, "learning_rate": 0.00018212873474466234, "loss": 0.0871, "step": 9348 }, { "epoch": 0.604379797979798, "grad_norm": 0.048945747315883636, "learning_rate": 0.00018212483302019087, "loss": 0.0769, "step": 9349 }, { "epoch": 0.6044444444444445, "grad_norm": 0.05047060176730156, "learning_rate": 0.00018212093091164868, "loss": 0.0741, "step": 9350 }, { "epoch": 0.6045090909090909, "grad_norm": 0.05959855020046234, "learning_rate": 0.0001821170284190541, "loss": 0.1039, "step": 9351 }, { "epoch": 0.6045737373737373, "grad_norm": 0.07407384365797043, "learning_rate": 0.00018211312554242527, "loss": 0.0797, "step": 9352 }, { "epoch": 0.6046383838383839, "grad_norm": 0.049605585634708405, "learning_rate": 0.00018210922228178053, "loss": 0.0653, "step": 9353 }, { "epoch": 0.6047030303030303, "grad_norm": 0.05791972950100899, "learning_rate": 0.00018210531863713808, "loss": 0.0983, "step": 9354 }, { "epoch": 0.6047676767676767, "grad_norm": 0.05266221985220909, "learning_rate": 0.00018210141460851622, "loss": 0.0699, "step": 9355 }, { "epoch": 0.6048323232323233, "grad_norm": 0.05782538652420044, "learning_rate": 0.0001820975101959332, "loss": 0.0948, "step": 9356 }, { "epoch": 
0.6048969696969697, "grad_norm": 0.059462688863277435, "learning_rate": 0.0001820936053994072, "loss": 0.0896, "step": 9357 }, { "epoch": 0.6049616161616161, "grad_norm": 0.04714878275990486, "learning_rate": 0.00018208970021895663, "loss": 0.0711, "step": 9358 }, { "epoch": 0.6050262626262626, "grad_norm": 0.05671320855617523, "learning_rate": 0.0001820857946545996, "loss": 0.0898, "step": 9359 }, { "epoch": 0.6050909090909091, "grad_norm": 0.04726758226752281, "learning_rate": 0.0001820818887063545, "loss": 0.0783, "step": 9360 }, { "epoch": 0.6050909090909091, "eval_bleu": 14.486297901289312, "eval_loss": 0.08833492547273636, "eval_runtime": 2.7876, "eval_samples_per_second": 11.479, "eval_steps_per_second": 1.435, "step": 9360 }, { "epoch": 0.6051555555555556, "grad_norm": 0.054479196667671204, "learning_rate": 0.00018207798237423952, "loss": 0.0885, "step": 9361 }, { "epoch": 0.605220202020202, "grad_norm": 0.06410248577594757, "learning_rate": 0.00018207407565827292, "loss": 0.1166, "step": 9362 }, { "epoch": 0.6052848484848485, "grad_norm": 0.053995948284864426, "learning_rate": 0.00018207016855847306, "loss": 0.0852, "step": 9363 }, { "epoch": 0.605349494949495, "grad_norm": 0.052321381866931915, "learning_rate": 0.00018206626107485812, "loss": 0.087, "step": 9364 }, { "epoch": 0.6054141414141414, "grad_norm": 0.04913843423128128, "learning_rate": 0.0001820623532074464, "loss": 0.0736, "step": 9365 }, { "epoch": 0.6054787878787878, "grad_norm": 0.051884908229112625, "learning_rate": 0.00018205844495625622, "loss": 0.0739, "step": 9366 }, { "epoch": 0.6055434343434344, "grad_norm": 0.05209816247224808, "learning_rate": 0.00018205453632130577, "loss": 0.0847, "step": 9367 }, { "epoch": 0.6056080808080808, "grad_norm": 0.05516160652041435, "learning_rate": 0.00018205062730261343, "loss": 0.0871, "step": 9368 }, { "epoch": 0.6056727272727273, "grad_norm": 0.0572843924164772, "learning_rate": 0.00018204671790019738, "loss": 0.0857, "step": 9369 }, { "epoch": 
0.6057373737373737, "grad_norm": 0.08052563667297363, "learning_rate": 0.00018204280811407597, "loss": 0.0877, "step": 9370 }, { "epoch": 0.6058020202020202, "grad_norm": 0.05005412548780441, "learning_rate": 0.0001820388979442675, "loss": 0.0678, "step": 9371 }, { "epoch": 0.6058666666666667, "grad_norm": 0.052731744945049286, "learning_rate": 0.00018203498739079022, "loss": 0.0796, "step": 9372 }, { "epoch": 0.6059313131313131, "grad_norm": 0.060624074190855026, "learning_rate": 0.0001820310764536624, "loss": 0.1143, "step": 9373 }, { "epoch": 0.6059959595959596, "grad_norm": 0.05081413686275482, "learning_rate": 0.00018202716513290236, "loss": 0.0781, "step": 9374 }, { "epoch": 0.6060606060606061, "grad_norm": 0.04832477122545242, "learning_rate": 0.0001820232534285284, "loss": 0.0681, "step": 9375 }, { "epoch": 0.6061252525252525, "grad_norm": 0.049497541040182114, "learning_rate": 0.00018201934134055877, "loss": 0.0825, "step": 9376 }, { "epoch": 0.6061252525252525, "eval_bleu": 19.171097859845677, "eval_loss": 0.08797016739845276, "eval_runtime": 2.6071, "eval_samples_per_second": 12.274, "eval_steps_per_second": 1.534, "step": 9376 }, { "epoch": 0.606189898989899, "grad_norm": 0.045212000608444214, "learning_rate": 0.00018201542886901183, "loss": 0.0646, "step": 9377 }, { "epoch": 0.6062545454545455, "grad_norm": 0.05284617841243744, "learning_rate": 0.00018201151601390586, "loss": 0.0825, "step": 9378 }, { "epoch": 0.6063191919191919, "grad_norm": 0.05812883749604225, "learning_rate": 0.00018200760277525908, "loss": 0.089, "step": 9379 }, { "epoch": 0.6063838383838384, "grad_norm": 0.055171504616737366, "learning_rate": 0.0001820036891530899, "loss": 0.091, "step": 9380 }, { "epoch": 0.6064484848484849, "grad_norm": 0.05667423456907272, "learning_rate": 0.00018199977514741654, "loss": 0.0898, "step": 9381 }, { "epoch": 0.6065131313131313, "grad_norm": 0.051362209022045135, "learning_rate": 0.00018199586075825738, "loss": 0.0819, "step": 9382 }, { "epoch": 
0.6065777777777778, "grad_norm": 0.05289510637521744, "learning_rate": 0.00018199194598563068, "loss": 0.081, "step": 9383 }, { "epoch": 0.6066424242424242, "grad_norm": 0.05934342369437218, "learning_rate": 0.00018198803082955474, "loss": 0.0919, "step": 9384 }, { "epoch": 0.6067070707070708, "grad_norm": 0.04951174557209015, "learning_rate": 0.00018198411529004788, "loss": 0.081, "step": 9385 }, { "epoch": 0.6067717171717172, "grad_norm": 0.050499238073825836, "learning_rate": 0.00018198019936712843, "loss": 0.0715, "step": 9386 }, { "epoch": 0.6068363636363636, "grad_norm": 0.053488992154598236, "learning_rate": 0.00018197628306081468, "loss": 0.0808, "step": 9387 }, { "epoch": 0.60690101010101, "grad_norm": 0.06575750559568405, "learning_rate": 0.00018197236637112495, "loss": 0.1095, "step": 9388 }, { "epoch": 0.6069656565656566, "grad_norm": 0.05724027752876282, "learning_rate": 0.0001819684492980776, "loss": 0.0932, "step": 9389 }, { "epoch": 0.607030303030303, "grad_norm": 0.05263465642929077, "learning_rate": 0.00018196453184169084, "loss": 0.0855, "step": 9390 }, { "epoch": 0.6070949494949495, "grad_norm": 0.04725230485200882, "learning_rate": 0.00018196061400198313, "loss": 0.0833, "step": 9391 }, { "epoch": 0.607159595959596, "grad_norm": 0.04917055740952492, "learning_rate": 0.00018195669577897268, "loss": 0.0828, "step": 9392 }, { "epoch": 0.607159595959596, "eval_bleu": 16.131499966278003, "eval_loss": 0.08730745315551758, "eval_runtime": 2.6867, "eval_samples_per_second": 11.911, "eval_steps_per_second": 1.489, "step": 9392 }, { "epoch": 0.6072242424242424, "grad_norm": 0.051806192845106125, "learning_rate": 0.0001819527771726779, "loss": 0.0767, "step": 9393 }, { "epoch": 0.6072888888888889, "grad_norm": 0.05411780998110771, "learning_rate": 0.00018194885818311702, "loss": 0.0912, "step": 9394 }, { "epoch": 0.6073535353535353, "grad_norm": 0.05156666785478592, "learning_rate": 0.00018194493881030846, "loss": 0.0733, "step": 9395 }, { "epoch": 
0.6074181818181819, "grad_norm": 0.045128364115953445, "learning_rate": 0.0001819410190542705, "loss": 0.0681, "step": 9396 }, { "epoch": 0.6074828282828283, "grad_norm": 0.05733111500740051, "learning_rate": 0.00018193709891502148, "loss": 0.0849, "step": 9397 }, { "epoch": 0.6075474747474747, "grad_norm": 0.050160422921180725, "learning_rate": 0.00018193317839257974, "loss": 0.0762, "step": 9398 }, { "epoch": 0.6076121212121212, "grad_norm": 0.059303440153598785, "learning_rate": 0.00018192925748696363, "loss": 0.1036, "step": 9399 }, { "epoch": 0.6076767676767677, "grad_norm": 0.06216861680150032, "learning_rate": 0.00018192533619819146, "loss": 0.1, "step": 9400 }, { "epoch": 0.6077414141414141, "grad_norm": 0.062201596796512604, "learning_rate": 0.00018192141452628155, "loss": 0.0745, "step": 9401 }, { "epoch": 0.6078060606060606, "grad_norm": 0.04797658696770668, "learning_rate": 0.00018191749247125228, "loss": 0.0714, "step": 9402 }, { "epoch": 0.6078707070707071, "grad_norm": 0.052640318870544434, "learning_rate": 0.000181913570033122, "loss": 0.0805, "step": 9403 }, { "epoch": 0.6079353535353536, "grad_norm": 0.05144422501325607, "learning_rate": 0.00018190964721190902, "loss": 0.0769, "step": 9404 }, { "epoch": 0.608, "grad_norm": 0.05117727816104889, "learning_rate": 0.00018190572400763168, "loss": 0.0759, "step": 9405 }, { "epoch": 0.6080646464646464, "grad_norm": 0.05392908677458763, "learning_rate": 0.00018190180042030837, "loss": 0.0854, "step": 9406 }, { "epoch": 0.608129292929293, "grad_norm": 0.054625846445560455, "learning_rate": 0.0001818978764499574, "loss": 0.0821, "step": 9407 }, { "epoch": 0.6081939393939394, "grad_norm": 0.06192926689982414, "learning_rate": 0.00018189395209659717, "loss": 0.0933, "step": 9408 }, { "epoch": 0.6081939393939394, "eval_bleu": 11.588121563271779, "eval_loss": 0.08746656030416489, "eval_runtime": 2.46, "eval_samples_per_second": 13.008, "eval_steps_per_second": 1.626, "step": 9408 }, { "epoch": 
0.6082585858585858, "grad_norm": 0.05338921770453453, "learning_rate": 0.00018189002736024597, "loss": 0.0698, "step": 9409 }, { "epoch": 0.6083232323232324, "grad_norm": 0.05451475828886032, "learning_rate": 0.00018188610224092216, "loss": 0.0896, "step": 9410 }, { "epoch": 0.6083878787878788, "grad_norm": 0.0657874047756195, "learning_rate": 0.00018188217673864416, "loss": 0.1014, "step": 9411 }, { "epoch": 0.6084525252525252, "grad_norm": 0.053775422275066376, "learning_rate": 0.0001818782508534303, "loss": 0.0861, "step": 9412 }, { "epoch": 0.6085171717171717, "grad_norm": 0.05308878794312477, "learning_rate": 0.00018187432458529888, "loss": 0.078, "step": 9413 }, { "epoch": 0.6085818181818182, "grad_norm": 0.059085726737976074, "learning_rate": 0.00018187039793426835, "loss": 0.0865, "step": 9414 }, { "epoch": 0.6086464646464647, "grad_norm": 0.05518946796655655, "learning_rate": 0.00018186647090035702, "loss": 0.0941, "step": 9415 }, { "epoch": 0.6087111111111111, "grad_norm": 0.05320896580815315, "learning_rate": 0.00018186254348358328, "loss": 0.0881, "step": 9416 }, { "epoch": 0.6087757575757575, "grad_norm": 0.04919058829545975, "learning_rate": 0.00018185861568396547, "loss": 0.0846, "step": 9417 }, { "epoch": 0.6088404040404041, "grad_norm": 0.0579373873770237, "learning_rate": 0.000181854687501522, "loss": 0.0951, "step": 9418 }, { "epoch": 0.6089050505050505, "grad_norm": 0.04512621834874153, "learning_rate": 0.0001818507589362712, "loss": 0.0726, "step": 9419 }, { "epoch": 0.6089696969696969, "grad_norm": 0.04968760907649994, "learning_rate": 0.00018184682998823146, "loss": 0.0808, "step": 9420 }, { "epoch": 0.6090343434343435, "grad_norm": 0.05350997671484947, "learning_rate": 0.00018184290065742116, "loss": 0.0816, "step": 9421 }, { "epoch": 0.6090989898989899, "grad_norm": 0.05510673671960831, "learning_rate": 0.00018183897094385866, "loss": 0.0862, "step": 9422 }, { "epoch": 0.6091636363636364, "grad_norm": 0.04910155385732651, "learning_rate": 
0.00018183504084756238, "loss": 0.0858, "step": 9423 }, { "epoch": 0.6092282828282828, "grad_norm": 0.051201753318309784, "learning_rate": 0.00018183111036855062, "loss": 0.0801, "step": 9424 }, { "epoch": 0.6092282828282828, "eval_bleu": 15.965664719949116, "eval_loss": 0.08688366413116455, "eval_runtime": 2.6021, "eval_samples_per_second": 12.298, "eval_steps_per_second": 1.537, "step": 9424 }, { "epoch": 0.6092929292929293, "grad_norm": 0.05567748472094536, "learning_rate": 0.00018182717950684184, "loss": 0.0913, "step": 9425 }, { "epoch": 0.6093575757575758, "grad_norm": 0.06209744140505791, "learning_rate": 0.00018182324826245441, "loss": 0.1044, "step": 9426 }, { "epoch": 0.6094222222222222, "grad_norm": 0.04883311316370964, "learning_rate": 0.00018181931663540664, "loss": 0.0841, "step": 9427 }, { "epoch": 0.6094868686868686, "grad_norm": 0.05289270728826523, "learning_rate": 0.00018181538462571704, "loss": 0.0833, "step": 9428 }, { "epoch": 0.6095515151515152, "grad_norm": 0.05097644776105881, "learning_rate": 0.0001818114522334039, "loss": 0.0797, "step": 9429 }, { "epoch": 0.6096161616161616, "grad_norm": 0.045895516872406006, "learning_rate": 0.00018180751945848567, "loss": 0.0611, "step": 9430 }, { "epoch": 0.609680808080808, "grad_norm": 0.06297482550144196, "learning_rate": 0.00018180358630098066, "loss": 0.0984, "step": 9431 }, { "epoch": 0.6097454545454546, "grad_norm": 0.059170711785554886, "learning_rate": 0.00018179965276090735, "loss": 0.0991, "step": 9432 }, { "epoch": 0.609810101010101, "grad_norm": 0.04943109303712845, "learning_rate": 0.00018179571883828412, "loss": 0.0685, "step": 9433 }, { "epoch": 0.6098747474747475, "grad_norm": 0.05137787014245987, "learning_rate": 0.00018179178453312935, "loss": 0.0744, "step": 9434 }, { "epoch": 0.6099393939393939, "grad_norm": 0.05358685925602913, "learning_rate": 0.00018178784984546143, "loss": 0.0788, "step": 9435 }, { "epoch": 0.6100040404040404, "grad_norm": 0.05035299435257912, "learning_rate": 
0.00018178391477529878, "loss": 0.0823, "step": 9436 }, { "epoch": 0.6100686868686869, "grad_norm": 0.056310705840587616, "learning_rate": 0.00018177997932265983, "loss": 0.0947, "step": 9437 }, { "epoch": 0.6101333333333333, "grad_norm": 0.055732693523168564, "learning_rate": 0.00018177604348756292, "loss": 0.0777, "step": 9438 }, { "epoch": 0.6101979797979799, "grad_norm": 0.05543842539191246, "learning_rate": 0.0001817721072700265, "loss": 0.0809, "step": 9439 }, { "epoch": 0.6102626262626263, "grad_norm": 0.060618724673986435, "learning_rate": 0.00018176817067006894, "loss": 0.0861, "step": 9440 }, { "epoch": 0.6102626262626263, "eval_bleu": 18.74261961225278, "eval_loss": 0.08658559620380402, "eval_runtime": 2.6783, "eval_samples_per_second": 11.948, "eval_steps_per_second": 1.493, "step": 9440 }, { "epoch": 0.6103272727272727, "grad_norm": 0.06487573683261871, "learning_rate": 0.00018176423368770873, "loss": 0.1079, "step": 9441 }, { "epoch": 0.6103919191919192, "grad_norm": 0.05037583410739899, "learning_rate": 0.00018176029632296418, "loss": 0.0734, "step": 9442 }, { "epoch": 0.6104565656565657, "grad_norm": 0.05082321539521217, "learning_rate": 0.00018175635857585378, "loss": 0.074, "step": 9443 }, { "epoch": 0.6105212121212121, "grad_norm": 0.06087791547179222, "learning_rate": 0.0001817524204463959, "loss": 0.0957, "step": 9444 }, { "epoch": 0.6105858585858586, "grad_norm": 0.055501408874988556, "learning_rate": 0.000181748481934609, "loss": 0.0819, "step": 9445 }, { "epoch": 0.610650505050505, "grad_norm": 0.05585609748959541, "learning_rate": 0.00018174454304051144, "loss": 0.0833, "step": 9446 }, { "epoch": 0.6107151515151515, "grad_norm": 0.1298169493675232, "learning_rate": 0.00018174060376412174, "loss": 0.0778, "step": 9447 }, { "epoch": 0.610779797979798, "grad_norm": 0.054736360907554626, "learning_rate": 0.00018173666410545823, "loss": 0.0902, "step": 9448 }, { "epoch": 0.6108444444444444, "grad_norm": 0.06032203137874603, "learning_rate": 
0.00018173272406453932, "loss": 0.0877, "step": 9449 }, { "epoch": 0.610909090909091, "grad_norm": 0.05347141996026039, "learning_rate": 0.00018172878364138353, "loss": 0.0889, "step": 9450 }, { "epoch": 0.6109737373737374, "grad_norm": 0.05124564841389656, "learning_rate": 0.0001817248428360092, "loss": 0.0889, "step": 9451 }, { "epoch": 0.6110383838383838, "grad_norm": 0.05423453822731972, "learning_rate": 0.00018172090164843483, "loss": 0.1048, "step": 9452 }, { "epoch": 0.6111030303030303, "grad_norm": 0.05394921451807022, "learning_rate": 0.0001817169600786788, "loss": 0.0931, "step": 9453 }, { "epoch": 0.6111676767676768, "grad_norm": 0.0486159585416317, "learning_rate": 0.00018171301812675957, "loss": 0.0766, "step": 9454 }, { "epoch": 0.6112323232323232, "grad_norm": 0.056287091225385666, "learning_rate": 0.00018170907579269558, "loss": 0.0757, "step": 9455 }, { "epoch": 0.6112969696969697, "grad_norm": 0.049736857414245605, "learning_rate": 0.00018170513307650524, "loss": 0.0813, "step": 9456 }, { "epoch": 0.6112969696969697, "eval_bleu": 18.866093290843423, "eval_loss": 0.08800183236598969, "eval_runtime": 2.6753, "eval_samples_per_second": 11.961, "eval_steps_per_second": 1.495, "step": 9456 }, { "epoch": 0.6113616161616162, "grad_norm": 0.042076122015714645, "learning_rate": 0.000181701189978207, "loss": 0.0655, "step": 9457 }, { "epoch": 0.6114262626262627, "grad_norm": 0.05063765496015549, "learning_rate": 0.0001816972464978193, "loss": 0.0793, "step": 9458 }, { "epoch": 0.6114909090909091, "grad_norm": 0.05449441447854042, "learning_rate": 0.00018169330263536062, "loss": 0.0887, "step": 9459 }, { "epoch": 0.6115555555555555, "grad_norm": 0.05560411512851715, "learning_rate": 0.00018168935839084935, "loss": 0.0868, "step": 9460 }, { "epoch": 0.6116202020202021, "grad_norm": 0.0586371049284935, "learning_rate": 0.00018168541376430397, "loss": 0.1001, "step": 9461 }, { "epoch": 0.6116848484848485, "grad_norm": 0.059704191982746124, "learning_rate": 
0.00018168146875574288, "loss": 0.0928, "step": 9462 }, { "epoch": 0.6117494949494949, "grad_norm": 0.04768018797039986, "learning_rate": 0.0001816775233651846, "loss": 0.0832, "step": 9463 }, { "epoch": 0.6118141414141414, "grad_norm": 0.05479509010910988, "learning_rate": 0.00018167357759264752, "loss": 0.0887, "step": 9464 }, { "epoch": 0.6118787878787879, "grad_norm": 0.06737764924764633, "learning_rate": 0.00018166963143815014, "loss": 0.108, "step": 9465 }, { "epoch": 0.6119434343434343, "grad_norm": 0.05136725306510925, "learning_rate": 0.00018166568490171088, "loss": 0.0698, "step": 9466 }, { "epoch": 0.6120080808080808, "grad_norm": 0.05782267078757286, "learning_rate": 0.0001816617379833482, "loss": 0.0981, "step": 9467 }, { "epoch": 0.6120727272727273, "grad_norm": 0.05158628895878792, "learning_rate": 0.00018165779068308059, "loss": 0.0872, "step": 9468 }, { "epoch": 0.6121373737373738, "grad_norm": 0.045529596507549286, "learning_rate": 0.00018165384300092646, "loss": 0.0684, "step": 9469 }, { "epoch": 0.6122020202020202, "grad_norm": 0.04455210641026497, "learning_rate": 0.00018164989493690435, "loss": 0.0637, "step": 9470 }, { "epoch": 0.6122666666666666, "grad_norm": 0.052365101873874664, "learning_rate": 0.00018164594649103263, "loss": 0.0851, "step": 9471 }, { "epoch": 0.6123313131313132, "grad_norm": 0.05402002111077309, "learning_rate": 0.00018164199766332984, "loss": 0.0839, "step": 9472 }, { "epoch": 0.6123313131313132, "eval_bleu": 18.341532924964824, "eval_loss": 0.08805583417415619, "eval_runtime": 2.687, "eval_samples_per_second": 11.909, "eval_steps_per_second": 1.489, "step": 9472 }, { "epoch": 0.6123959595959596, "grad_norm": 0.05022764578461647, "learning_rate": 0.0001816380484538144, "loss": 0.0839, "step": 9473 }, { "epoch": 0.612460606060606, "grad_norm": 0.06227951869368553, "learning_rate": 0.0001816340988625048, "loss": 0.097, "step": 9474 }, { "epoch": 0.6125252525252525, "grad_norm": 0.05508830025792122, "learning_rate": 
0.0001816301488894195, "loss": 0.0942, "step": 9475 }, { "epoch": 0.612589898989899, "grad_norm": 0.05467323213815689, "learning_rate": 0.000181626198534577, "loss": 0.0839, "step": 9476 }, { "epoch": 0.6126545454545455, "grad_norm": 0.05642442777752876, "learning_rate": 0.00018162224779799574, "loss": 0.0932, "step": 9477 }, { "epoch": 0.6127191919191919, "grad_norm": 0.050000227987766266, "learning_rate": 0.00018161829667969417, "loss": 0.0865, "step": 9478 }, { "epoch": 0.6127838383838384, "grad_norm": 0.0593462735414505, "learning_rate": 0.00018161434517969087, "loss": 0.1071, "step": 9479 }, { "epoch": 0.6128484848484849, "grad_norm": 0.049452316015958786, "learning_rate": 0.00018161039329800424, "loss": 0.0779, "step": 9480 }, { "epoch": 0.6129131313131313, "grad_norm": 0.053209684789180756, "learning_rate": 0.0001816064410346528, "loss": 0.084, "step": 9481 }, { "epoch": 0.6129777777777777, "grad_norm": 0.04530273377895355, "learning_rate": 0.00018160248838965494, "loss": 0.0662, "step": 9482 }, { "epoch": 0.6130424242424243, "grad_norm": 0.051499366760253906, "learning_rate": 0.00018159853536302932, "loss": 0.0816, "step": 9483 }, { "epoch": 0.6131070707070707, "grad_norm": 0.056973233819007874, "learning_rate": 0.00018159458195479426, "loss": 0.0857, "step": 9484 }, { "epoch": 0.6131717171717171, "grad_norm": 0.06934098154306412, "learning_rate": 0.0001815906281649683, "loss": 0.1031, "step": 9485 }, { "epoch": 0.6132363636363637, "grad_norm": 0.049756668508052826, "learning_rate": 0.00018158667399357, "loss": 0.0749, "step": 9486 }, { "epoch": 0.6133010101010101, "grad_norm": 0.0478893481194973, "learning_rate": 0.00018158271944061777, "loss": 0.0649, "step": 9487 }, { "epoch": 0.6133656565656566, "grad_norm": 0.05605468899011612, "learning_rate": 0.00018157876450613014, "loss": 0.0793, "step": 9488 }, { "epoch": 0.6133656565656566, "eval_bleu": 15.435853384110814, "eval_loss": 0.08783368766307831, "eval_runtime": 2.6295, "eval_samples_per_second": 12.17, 
"eval_steps_per_second": 1.521, "step": 9488 }, { "epoch": 0.613430303030303, "grad_norm": 0.07534047961235046, "learning_rate": 0.0001815748091901256, "loss": 0.0789, "step": 9489 }, { "epoch": 0.6134949494949495, "grad_norm": 0.0526944175362587, "learning_rate": 0.00018157085349262264, "loss": 0.0828, "step": 9490 }, { "epoch": 0.613559595959596, "grad_norm": 0.05247270315885544, "learning_rate": 0.00018156689741363974, "loss": 0.0821, "step": 9491 }, { "epoch": 0.6136242424242424, "grad_norm": 0.057864852249622345, "learning_rate": 0.00018156294095319545, "loss": 0.089, "step": 9492 }, { "epoch": 0.6136888888888888, "grad_norm": 0.0487433597445488, "learning_rate": 0.00018155898411130827, "loss": 0.0717, "step": 9493 }, { "epoch": 0.6137535353535354, "grad_norm": 0.05475940555334091, "learning_rate": 0.00018155502688799668, "loss": 0.0961, "step": 9494 }, { "epoch": 0.6138181818181818, "grad_norm": 0.04976212978363037, "learning_rate": 0.00018155106928327916, "loss": 0.0713, "step": 9495 }, { "epoch": 0.6138828282828283, "grad_norm": 0.057842500507831573, "learning_rate": 0.00018154711129717428, "loss": 0.0921, "step": 9496 }, { "epoch": 0.6139474747474748, "grad_norm": 0.07052791118621826, "learning_rate": 0.0001815431529297005, "loss": 0.0994, "step": 9497 }, { "epoch": 0.6140121212121212, "grad_norm": 0.06075529381632805, "learning_rate": 0.00018153919418087636, "loss": 0.0907, "step": 9498 }, { "epoch": 0.6140767676767677, "grad_norm": 0.049631692469120026, "learning_rate": 0.00018153523505072037, "loss": 0.0796, "step": 9499 }, { "epoch": 0.6141414141414141, "grad_norm": 0.04709490016102791, "learning_rate": 0.000181531275539251, "loss": 0.0664, "step": 9500 }, { "epoch": 0.6142060606060606, "grad_norm": 0.04736785590648651, "learning_rate": 0.00018152731564648685, "loss": 0.0741, "step": 9501 }, { "epoch": 0.6142707070707071, "grad_norm": 0.049560341984033585, "learning_rate": 0.00018152335537244637, "loss": 0.0824, "step": 9502 }, { "epoch": 
0.6143353535353535, "grad_norm": 0.051859188824892044, "learning_rate": 0.00018151939471714815, "loss": 0.0803, "step": 9503 }, { "epoch": 0.6144, "grad_norm": 0.05678745359182358, "learning_rate": 0.00018151543368061064, "loss": 0.0927, "step": 9504 }, { "epoch": 0.6144, "eval_bleu": 12.559595150390436, "eval_loss": 0.08783996850252151, "eval_runtime": 2.6663, "eval_samples_per_second": 12.002, "eval_steps_per_second": 1.5, "step": 9504 }, { "epoch": 0.6144646464646465, "grad_norm": 0.05441804602742195, "learning_rate": 0.00018151147226285242, "loss": 0.0831, "step": 9505 }, { "epoch": 0.6145292929292929, "grad_norm": 0.04948701709508896, "learning_rate": 0.00018150751046389196, "loss": 0.0761, "step": 9506 }, { "epoch": 0.6145939393939394, "grad_norm": 0.051456570625305176, "learning_rate": 0.00018150354828374782, "loss": 0.0796, "step": 9507 }, { "epoch": 0.6146585858585859, "grad_norm": 0.04944676533341408, "learning_rate": 0.00018149958572243855, "loss": 0.0814, "step": 9508 }, { "epoch": 0.6147232323232323, "grad_norm": 0.08710639178752899, "learning_rate": 0.00018149562277998264, "loss": 0.0717, "step": 9509 }, { "epoch": 0.6147878787878788, "grad_norm": 0.06079666689038277, "learning_rate": 0.00018149165945639868, "loss": 0.086, "step": 9510 }, { "epoch": 0.6148525252525252, "grad_norm": 0.0494140088558197, "learning_rate": 0.00018148769575170512, "loss": 0.0735, "step": 9511 }, { "epoch": 0.6149171717171718, "grad_norm": 0.05090288445353508, "learning_rate": 0.00018148373166592058, "loss": 0.0746, "step": 9512 }, { "epoch": 0.6149818181818182, "grad_norm": 0.05616435036063194, "learning_rate": 0.00018147976719906357, "loss": 0.0857, "step": 9513 }, { "epoch": 0.6150464646464646, "grad_norm": 0.05248246341943741, "learning_rate": 0.0001814758023511526, "loss": 0.0739, "step": 9514 }, { "epoch": 0.6151111111111112, "grad_norm": 0.06067917123436928, "learning_rate": 0.00018147183712220627, "loss": 0.1033, "step": 9515 }, { "epoch": 0.6151757575757576, 
"grad_norm": 0.06394044309854507, "learning_rate": 0.00018146787151224306, "loss": 0.1109, "step": 9516 }, { "epoch": 0.615240404040404, "grad_norm": 0.05714740604162216, "learning_rate": 0.00018146390552128153, "loss": 0.083, "step": 9517 }, { "epoch": 0.6153050505050505, "grad_norm": 0.06050596758723259, "learning_rate": 0.0001814599391493403, "loss": 0.0722, "step": 9518 }, { "epoch": 0.615369696969697, "grad_norm": 0.06006387248635292, "learning_rate": 0.00018145597239643783, "loss": 0.0998, "step": 9519 }, { "epoch": 0.6154343434343434, "grad_norm": 0.045123130083084106, "learning_rate": 0.0001814520052625927, "loss": 0.0686, "step": 9520 }, { "epoch": 0.6154343434343434, "eval_bleu": 14.260347694917671, "eval_loss": 0.08827628940343857, "eval_runtime": 2.7886, "eval_samples_per_second": 11.475, "eval_steps_per_second": 1.434, "step": 9520 }, { "epoch": 0.6154989898989899, "grad_norm": 0.059214141219854355, "learning_rate": 0.00018144803774782352, "loss": 0.0854, "step": 9521 }, { "epoch": 0.6155636363636363, "grad_norm": 0.06354603916406631, "learning_rate": 0.00018144406985214876, "loss": 0.1152, "step": 9522 }, { "epoch": 0.6156282828282829, "grad_norm": 0.053596995770931244, "learning_rate": 0.00018144010157558697, "loss": 0.0924, "step": 9523 }, { "epoch": 0.6156929292929293, "grad_norm": 0.05600646138191223, "learning_rate": 0.00018143613291815683, "loss": 0.0885, "step": 9524 }, { "epoch": 0.6157575757575757, "grad_norm": 0.061736591160297394, "learning_rate": 0.00018143216387987674, "loss": 0.1018, "step": 9525 }, { "epoch": 0.6158222222222223, "grad_norm": 0.05615519359707832, "learning_rate": 0.0001814281944607654, "loss": 0.0757, "step": 9526 }, { "epoch": 0.6158868686868687, "grad_norm": 0.054505590349435806, "learning_rate": 0.00018142422466084127, "loss": 0.0917, "step": 9527 }, { "epoch": 0.6159515151515151, "grad_norm": 0.050660084933042526, "learning_rate": 0.000181420254480123, "loss": 0.0768, "step": 9528 }, { "epoch": 0.6160161616161616, 
"grad_norm": 0.05841151997447014, "learning_rate": 0.00018141628391862908, "loss": 0.0794, "step": 9529 }, { "epoch": 0.6160808080808081, "grad_norm": 0.0499127060174942, "learning_rate": 0.00018141231297637812, "loss": 0.0876, "step": 9530 }, { "epoch": 0.6161454545454546, "grad_norm": 0.055056776851415634, "learning_rate": 0.00018140834165338872, "loss": 0.0761, "step": 9531 }, { "epoch": 0.616210101010101, "grad_norm": 0.04454522579908371, "learning_rate": 0.00018140436994967937, "loss": 0.0582, "step": 9532 }, { "epoch": 0.6162747474747474, "grad_norm": 0.060447704046964645, "learning_rate": 0.00018140039786526874, "loss": 0.0971, "step": 9533 }, { "epoch": 0.616339393939394, "grad_norm": 0.05185241252183914, "learning_rate": 0.00018139642540017533, "loss": 0.0791, "step": 9534 }, { "epoch": 0.6164040404040404, "grad_norm": 0.055945418775081635, "learning_rate": 0.00018139245255441777, "loss": 0.0804, "step": 9535 }, { "epoch": 0.6164686868686868, "grad_norm": 0.05916115269064903, "learning_rate": 0.00018138847932801458, "loss": 0.0935, "step": 9536 }, { "epoch": 0.6164686868686868, "eval_bleu": 18.260853023656868, "eval_loss": 0.08887571841478348, "eval_runtime": 2.6277, "eval_samples_per_second": 12.178, "eval_steps_per_second": 1.522, "step": 9536 }, { "epoch": 0.6165333333333334, "grad_norm": 0.05299973487854004, "learning_rate": 0.00018138450572098441, "loss": 0.0831, "step": 9537 }, { "epoch": 0.6165979797979798, "grad_norm": 0.052241250872612, "learning_rate": 0.0001813805317333458, "loss": 0.0811, "step": 9538 }, { "epoch": 0.6166626262626262, "grad_norm": 0.06929423660039902, "learning_rate": 0.00018137655736511738, "loss": 0.0709, "step": 9539 }, { "epoch": 0.6167272727272727, "grad_norm": 0.045238103717565536, "learning_rate": 0.0001813725826163177, "loss": 0.0633, "step": 9540 }, { "epoch": 0.6167919191919192, "grad_norm": 0.06472372263669968, "learning_rate": 0.0001813686074869653, "loss": 0.0932, "step": 9541 }, { "epoch": 0.6168565656565657, 
"grad_norm": 0.04982895404100418, "learning_rate": 0.00018136463197707884, "loss": 0.0821, "step": 9542 }, { "epoch": 0.6169212121212121, "grad_norm": 0.056263748556375504, "learning_rate": 0.00018136065608667691, "loss": 0.0901, "step": 9543 }, { "epoch": 0.6169858585858586, "grad_norm": 0.059961240738630295, "learning_rate": 0.0001813566798157781, "loss": 0.0897, "step": 9544 }, { "epoch": 0.6170505050505051, "grad_norm": 0.05260671675205231, "learning_rate": 0.000181352703164401, "loss": 0.0906, "step": 9545 }, { "epoch": 0.6171151515151515, "grad_norm": 0.062079254537820816, "learning_rate": 0.00018134872613256414, "loss": 0.0932, "step": 9546 }, { "epoch": 0.6171797979797979, "grad_norm": 0.057178106158971786, "learning_rate": 0.00018134474872028624, "loss": 0.0933, "step": 9547 }, { "epoch": 0.6172444444444445, "grad_norm": 0.0510297492146492, "learning_rate": 0.00018134077092758584, "loss": 0.0807, "step": 9548 }, { "epoch": 0.6173090909090909, "grad_norm": 0.056495893746614456, "learning_rate": 0.0001813367927544815, "loss": 0.097, "step": 9549 }, { "epoch": 0.6173737373737374, "grad_norm": 0.05001109838485718, "learning_rate": 0.00018133281420099192, "loss": 0.074, "step": 9550 }, { "epoch": 0.6174383838383838, "grad_norm": 0.052068766206502914, "learning_rate": 0.00018132883526713565, "loss": 0.0858, "step": 9551 }, { "epoch": 0.6175030303030303, "grad_norm": 0.13263340294361115, "learning_rate": 0.00018132485595293127, "loss": 0.1071, "step": 9552 }, { "epoch": 0.6175030303030303, "eval_bleu": 18.988705737474792, "eval_loss": 0.08864723145961761, "eval_runtime": 2.7674, "eval_samples_per_second": 11.563, "eval_steps_per_second": 1.445, "step": 9552 }, { "epoch": 0.6175676767676768, "grad_norm": 0.048922184854745865, "learning_rate": 0.00018132087625839747, "loss": 0.0772, "step": 9553 }, { "epoch": 0.6176323232323232, "grad_norm": 0.058987416326999664, "learning_rate": 0.0001813168961835528, "loss": 0.1015, "step": 9554 }, { "epoch": 0.6176969696969697, 
"grad_norm": 0.057210199534893036, "learning_rate": 0.00018131291572841585, "loss": 0.0782, "step": 9555 }, { "epoch": 0.6177616161616162, "grad_norm": 0.053854990750551224, "learning_rate": 0.0001813089348930053, "loss": 0.0759, "step": 9556 }, { "epoch": 0.6178262626262626, "grad_norm": 0.04834631457924843, "learning_rate": 0.00018130495367733976, "loss": 0.0792, "step": 9557 }, { "epoch": 0.617890909090909, "grad_norm": 0.066676564514637, "learning_rate": 0.00018130097208143782, "loss": 0.0869, "step": 9558 }, { "epoch": 0.6179555555555556, "grad_norm": 0.059479229152202606, "learning_rate": 0.00018129699010531808, "loss": 0.0997, "step": 9559 }, { "epoch": 0.618020202020202, "grad_norm": 0.0604429766535759, "learning_rate": 0.00018129300774899924, "loss": 0.1011, "step": 9560 }, { "epoch": 0.6180848484848485, "grad_norm": 0.057826653122901917, "learning_rate": 0.00018128902501249983, "loss": 0.1011, "step": 9561 }, { "epoch": 0.618149494949495, "grad_norm": 0.0553378127515316, "learning_rate": 0.0001812850418958386, "loss": 0.0931, "step": 9562 }, { "epoch": 0.6182141414141414, "grad_norm": 0.058645617216825485, "learning_rate": 0.00018128105839903405, "loss": 0.0825, "step": 9563 }, { "epoch": 0.6182787878787879, "grad_norm": 0.05420980229973793, "learning_rate": 0.00018127707452210486, "loss": 0.0848, "step": 9564 }, { "epoch": 0.6183434343434343, "grad_norm": 0.05890471488237381, "learning_rate": 0.00018127309026506967, "loss": 0.089, "step": 9565 }, { "epoch": 0.6184080808080809, "grad_norm": 0.04824194312095642, "learning_rate": 0.0001812691056279471, "loss": 0.0786, "step": 9566 }, { "epoch": 0.6184727272727273, "grad_norm": 0.05548529699444771, "learning_rate": 0.0001812651206107558, "loss": 0.0883, "step": 9567 }, { "epoch": 0.6185373737373737, "grad_norm": 0.05986257642507553, "learning_rate": 0.00018126113521351437, "loss": 0.0935, "step": 9568 }, { "epoch": 0.6185373737373737, "eval_bleu": 17.133306768161596, "eval_loss": 0.08898554742336273, 
"eval_runtime": 2.6465, "eval_samples_per_second": 12.092, "eval_steps_per_second": 1.511, "step": 9568 }, { "epoch": 0.6186020202020202, "grad_norm": 0.05185319483280182, "learning_rate": 0.0001812571494362415, "loss": 0.0736, "step": 9569 }, { "epoch": 0.6186666666666667, "grad_norm": 0.05246417596936226, "learning_rate": 0.0001812531632789558, "loss": 0.0886, "step": 9570 }, { "epoch": 0.6187313131313131, "grad_norm": 0.054864972829818726, "learning_rate": 0.00018124917674167593, "loss": 0.0871, "step": 9571 }, { "epoch": 0.6187959595959596, "grad_norm": 0.048025768250226974, "learning_rate": 0.0001812451898244205, "loss": 0.0679, "step": 9572 }, { "epoch": 0.6188606060606061, "grad_norm": 0.05048222839832306, "learning_rate": 0.00018124120252720817, "loss": 0.0738, "step": 9573 }, { "epoch": 0.6189252525252525, "grad_norm": 0.06478758156299591, "learning_rate": 0.0001812372148500576, "loss": 0.1028, "step": 9574 }, { "epoch": 0.618989898989899, "grad_norm": 0.05829968303442001, "learning_rate": 0.00018123322679298743, "loss": 0.0968, "step": 9575 }, { "epoch": 0.6190545454545454, "grad_norm": 0.05079164728522301, "learning_rate": 0.00018122923835601636, "loss": 0.0777, "step": 9576 }, { "epoch": 0.619119191919192, "grad_norm": 0.05877024307847023, "learning_rate": 0.00018122524953916296, "loss": 0.1, "step": 9577 }, { "epoch": 0.6191838383838384, "grad_norm": 0.04977516084909439, "learning_rate": 0.00018122126034244592, "loss": 0.0777, "step": 9578 }, { "epoch": 0.6192484848484848, "grad_norm": 0.044117335230112076, "learning_rate": 0.00018121727076588387, "loss": 0.0587, "step": 9579 }, { "epoch": 0.6193131313131313, "grad_norm": 0.05342421308159828, "learning_rate": 0.00018121328080949553, "loss": 0.0876, "step": 9580 }, { "epoch": 0.6193777777777778, "grad_norm": 0.04919970780611038, "learning_rate": 0.0001812092904732995, "loss": 0.0786, "step": 9581 }, { "epoch": 0.6194424242424242, "grad_norm": 0.058167457580566406, "learning_rate": 0.0001812052997573145, 
"loss": 0.0794, "step": 9582 }, { "epoch": 0.6195070707070707, "grad_norm": 0.05495934560894966, "learning_rate": 0.00018120130866155912, "loss": 0.087, "step": 9583 }, { "epoch": 0.6195717171717172, "grad_norm": 0.05605284124612808, "learning_rate": 0.0001811973171860521, "loss": 0.0914, "step": 9584 }, { "epoch": 0.6195717171717172, "eval_bleu": 19.8635638387284, "eval_loss": 0.08998416364192963, "eval_runtime": 2.6338, "eval_samples_per_second": 12.15, "eval_steps_per_second": 1.519, "step": 9584 }, { "epoch": 0.6196363636363637, "grad_norm": 0.05641790106892586, "learning_rate": 0.00018119332533081205, "loss": 0.0815, "step": 9585 }, { "epoch": 0.6197010101010101, "grad_norm": 0.05015675351023674, "learning_rate": 0.00018118933309585765, "loss": 0.0696, "step": 9586 }, { "epoch": 0.6197656565656565, "grad_norm": 0.05131001025438309, "learning_rate": 0.00018118534048120762, "loss": 0.0922, "step": 9587 }, { "epoch": 0.6198303030303031, "grad_norm": 0.06638558208942413, "learning_rate": 0.00018118134748688056, "loss": 0.1032, "step": 9588 }, { "epoch": 0.6198949494949495, "grad_norm": 0.04921744763851166, "learning_rate": 0.0001811773541128952, "loss": 0.0697, "step": 9589 }, { "epoch": 0.6199595959595959, "grad_norm": 0.04921913146972656, "learning_rate": 0.00018117336035927014, "loss": 0.0715, "step": 9590 }, { "epoch": 0.6200242424242425, "grad_norm": 0.06023449823260307, "learning_rate": 0.00018116936622602416, "loss": 0.0982, "step": 9591 }, { "epoch": 0.6200888888888889, "grad_norm": 0.06720209866762161, "learning_rate": 0.00018116537171317587, "loss": 0.0939, "step": 9592 }, { "epoch": 0.6201535353535353, "grad_norm": 0.05969652906060219, "learning_rate": 0.00018116137682074398, "loss": 0.0921, "step": 9593 }, { "epoch": 0.6202181818181818, "grad_norm": 0.05230917036533356, "learning_rate": 0.00018115738154874715, "loss": 0.078, "step": 9594 }, { "epoch": 0.6202828282828283, "grad_norm": 0.05873195827007294, "learning_rate": 0.00018115338589720406, "loss": 
0.0829, "step": 9595 }, { "epoch": 0.6203474747474748, "grad_norm": 0.04945479333400726, "learning_rate": 0.00018114938986613343, "loss": 0.0715, "step": 9596 }, { "epoch": 0.6204121212121212, "grad_norm": 0.06358293443918228, "learning_rate": 0.00018114539345555396, "loss": 0.0867, "step": 9597 }, { "epoch": 0.6204767676767676, "grad_norm": 0.05405670776963234, "learning_rate": 0.00018114139666548427, "loss": 0.0719, "step": 9598 }, { "epoch": 0.6205414141414142, "grad_norm": 0.043881841003894806, "learning_rate": 0.0001811373994959431, "loss": 0.0727, "step": 9599 }, { "epoch": 0.6206060606060606, "grad_norm": 0.08234690129756927, "learning_rate": 0.00018113340194694916, "loss": 0.0827, "step": 9600 }, { "epoch": 0.6206060606060606, "eval_bleu": 18.07385155938187, "eval_loss": 0.08989162743091583, "eval_runtime": 2.636, "eval_samples_per_second": 12.14, "eval_steps_per_second": 1.517, "step": 9600 }, { "epoch": 0.620670707070707, "grad_norm": 0.05685693398118019, "learning_rate": 0.0001811294040185211, "loss": 0.0797, "step": 9601 }, { "epoch": 0.6207353535353536, "grad_norm": 0.056118790060281754, "learning_rate": 0.00018112540571067762, "loss": 0.0938, "step": 9602 }, { "epoch": 0.6208, "grad_norm": 0.058308880776166916, "learning_rate": 0.00018112140702343749, "loss": 0.0955, "step": 9603 }, { "epoch": 0.6208646464646465, "grad_norm": 0.05962080880999565, "learning_rate": 0.0001811174079568193, "loss": 0.0965, "step": 9604 }, { "epoch": 0.6209292929292929, "grad_norm": 0.055381618440151215, "learning_rate": 0.00018111340851084185, "loss": 0.0953, "step": 9605 }, { "epoch": 0.6209939393939394, "grad_norm": 0.05533050000667572, "learning_rate": 0.0001811094086855238, "loss": 0.0798, "step": 9606 }, { "epoch": 0.6210585858585859, "grad_norm": 0.048587616533041, "learning_rate": 0.0001811054084808838, "loss": 0.0769, "step": 9607 }, { "epoch": 0.6211232323232323, "grad_norm": 0.05376973748207092, "learning_rate": 0.0001811014078969407, "loss": 0.0767, "step": 9608 
}, { "epoch": 0.6211878787878787, "grad_norm": 0.052740905433893204, "learning_rate": 0.00018109740693371312, "loss": 0.0802, "step": 9609 }, { "epoch": 0.6212525252525253, "grad_norm": 0.05461864173412323, "learning_rate": 0.00018109340559121972, "loss": 0.0715, "step": 9610 }, { "epoch": 0.6213171717171717, "grad_norm": 0.05895165354013443, "learning_rate": 0.0001810894038694793, "loss": 0.1057, "step": 9611 }, { "epoch": 0.6213818181818181, "grad_norm": 0.04856916517019272, "learning_rate": 0.00018108540176851054, "loss": 0.0762, "step": 9612 }, { "epoch": 0.6214464646464647, "grad_norm": 0.05111650750041008, "learning_rate": 0.0001810813992883322, "loss": 0.0822, "step": 9613 }, { "epoch": 0.6215111111111111, "grad_norm": 0.04969792068004608, "learning_rate": 0.00018107739642896293, "loss": 0.0692, "step": 9614 }, { "epoch": 0.6215757575757576, "grad_norm": 0.053943932056427, "learning_rate": 0.00018107339319042146, "loss": 0.0792, "step": 9615 }, { "epoch": 0.621640404040404, "grad_norm": 0.055069100111722946, "learning_rate": 0.0001810693895727266, "loss": 0.0822, "step": 9616 }, { "epoch": 0.621640404040404, "eval_bleu": 18.90234994335136, "eval_loss": 0.08833940327167511, "eval_runtime": 2.583, "eval_samples_per_second": 12.389, "eval_steps_per_second": 1.549, "step": 9616 }, { "epoch": 0.6217050505050505, "grad_norm": 0.05640248954296112, "learning_rate": 0.00018106538557589695, "loss": 0.0874, "step": 9617 }, { "epoch": 0.621769696969697, "grad_norm": 0.05116138234734535, "learning_rate": 0.0001810613811999513, "loss": 0.0889, "step": 9618 }, { "epoch": 0.6218343434343434, "grad_norm": 0.0630054697394371, "learning_rate": 0.00018105737644490835, "loss": 0.101, "step": 9619 }, { "epoch": 0.62189898989899, "grad_norm": 0.05727817118167877, "learning_rate": 0.0001810533713107869, "loss": 0.0943, "step": 9620 }, { "epoch": 0.6219636363636364, "grad_norm": 0.04871181398630142, "learning_rate": 0.00018104936579760555, "loss": 0.0673, "step": 9621 }, { "epoch": 
0.6220282828282828, "grad_norm": 0.052687522023916245, "learning_rate": 0.00018104535990538318, "loss": 0.0751, "step": 9622 }, { "epoch": 0.6220929292929293, "grad_norm": 0.06123575568199158, "learning_rate": 0.00018104135363413844, "loss": 0.0945, "step": 9623 }, { "epoch": 0.6221575757575758, "grad_norm": 0.06036514416337013, "learning_rate": 0.00018103734698389005, "loss": 0.0986, "step": 9624 }, { "epoch": 0.6222222222222222, "grad_norm": 0.0732359066605568, "learning_rate": 0.0001810333399546568, "loss": 0.1009, "step": 9625 }, { "epoch": 0.6222868686868687, "grad_norm": 0.05055307596921921, "learning_rate": 0.00018102933254645742, "loss": 0.0803, "step": 9626 }, { "epoch": 0.6223515151515151, "grad_norm": 0.05567167326807976, "learning_rate": 0.00018102532475931064, "loss": 0.0831, "step": 9627 }, { "epoch": 0.6224161616161616, "grad_norm": 0.050922878086566925, "learning_rate": 0.00018102131659323518, "loss": 0.0851, "step": 9628 }, { "epoch": 0.6224808080808081, "grad_norm": 0.05642584338784218, "learning_rate": 0.00018101730804824984, "loss": 0.0842, "step": 9629 }, { "epoch": 0.6225454545454545, "grad_norm": 0.0566360205411911, "learning_rate": 0.00018101329912437327, "loss": 0.0837, "step": 9630 }, { "epoch": 0.6226101010101011, "grad_norm": 0.06326840817928314, "learning_rate": 0.00018100928982162434, "loss": 0.1, "step": 9631 }, { "epoch": 0.6226747474747475, "grad_norm": 0.053713373839855194, "learning_rate": 0.00018100528014002173, "loss": 0.084, "step": 9632 }, { "epoch": 0.6226747474747475, "eval_bleu": 20.547283772939874, "eval_loss": 0.08786409348249435, "eval_runtime": 2.6122, "eval_samples_per_second": 12.25, "eval_steps_per_second": 1.531, "step": 9632 }, { "epoch": 0.6227393939393939, "grad_norm": 0.05402345955371857, "learning_rate": 0.0001810012700795842, "loss": 0.0898, "step": 9633 }, { "epoch": 0.6228040404040404, "grad_norm": 0.049608100205659866, "learning_rate": 0.00018099725964033052, "loss": 0.0711, "step": 9634 }, { "epoch": 
0.6228686868686869, "grad_norm": 0.06267761439085007, "learning_rate": 0.0001809932488222794, "loss": 0.0994, "step": 9635 }, { "epoch": 0.6229333333333333, "grad_norm": 0.0577191598713398, "learning_rate": 0.00018098923762544967, "loss": 0.0923, "step": 9636 }, { "epoch": 0.6229979797979798, "grad_norm": 0.05588376149535179, "learning_rate": 0.00018098522604986003, "loss": 0.0901, "step": 9637 }, { "epoch": 0.6230626262626263, "grad_norm": 0.04780367761850357, "learning_rate": 0.0001809812140955293, "loss": 0.0661, "step": 9638 }, { "epoch": 0.6231272727272728, "grad_norm": 0.05551775544881821, "learning_rate": 0.00018097720176247614, "loss": 0.0763, "step": 9639 }, { "epoch": 0.6231919191919192, "grad_norm": 0.0610777921974659, "learning_rate": 0.0001809731890507194, "loss": 0.0911, "step": 9640 }, { "epoch": 0.6232565656565656, "grad_norm": 0.04723057895898819, "learning_rate": 0.00018096917596027785, "loss": 0.0791, "step": 9641 }, { "epoch": 0.6233212121212122, "grad_norm": 0.056147750467061996, "learning_rate": 0.0001809651624911702, "loss": 0.0832, "step": 9642 }, { "epoch": 0.6233858585858586, "grad_norm": 0.05480760335922241, "learning_rate": 0.0001809611486434153, "loss": 0.0863, "step": 9643 }, { "epoch": 0.623450505050505, "grad_norm": 0.06404627859592438, "learning_rate": 0.00018095713441703182, "loss": 0.0824, "step": 9644 }, { "epoch": 0.6235151515151515, "grad_norm": 0.04449770227074623, "learning_rate": 0.0001809531198120386, "loss": 0.0756, "step": 9645 }, { "epoch": 0.623579797979798, "grad_norm": 0.04688223451375961, "learning_rate": 0.0001809491048284544, "loss": 0.0687, "step": 9646 }, { "epoch": 0.6236444444444444, "grad_norm": 0.051815249025821686, "learning_rate": 0.00018094508946629801, "loss": 0.0881, "step": 9647 }, { "epoch": 0.6237090909090909, "grad_norm": 0.05726795643568039, "learning_rate": 0.0001809410737255882, "loss": 0.1009, "step": 9648 }, { "epoch": 0.6237090909090909, "eval_bleu": 18.58705858840569, "eval_loss": 
0.08918391168117523, "eval_runtime": 2.6307, "eval_samples_per_second": 12.164, "eval_steps_per_second": 1.521, "step": 9648 }, { "epoch": 0.6237737373737374, "grad_norm": 0.0512734092772007, "learning_rate": 0.00018093705760634373, "loss": 0.0732, "step": 9649 }, { "epoch": 0.6238383838383839, "grad_norm": 0.05391843616962433, "learning_rate": 0.0001809330411085834, "loss": 0.0903, "step": 9650 }, { "epoch": 0.6239030303030303, "grad_norm": 0.046430014073848724, "learning_rate": 0.000180929024232326, "loss": 0.0713, "step": 9651 }, { "epoch": 0.6239676767676767, "grad_norm": 0.06048184633255005, "learning_rate": 0.00018092500697759032, "loss": 0.0918, "step": 9652 }, { "epoch": 0.6240323232323233, "grad_norm": 0.05273683741688728, "learning_rate": 0.0001809209893443951, "loss": 0.079, "step": 9653 }, { "epoch": 0.6240969696969697, "grad_norm": 0.07136894017457962, "learning_rate": 0.0001809169713327592, "loss": 0.0844, "step": 9654 }, { "epoch": 0.6241616161616161, "grad_norm": 0.05135270208120346, "learning_rate": 0.00018091295294270137, "loss": 0.0774, "step": 9655 }, { "epoch": 0.6242262626262626, "grad_norm": 0.05173762887716293, "learning_rate": 0.0001809089341742404, "loss": 0.0798, "step": 9656 }, { "epoch": 0.6242909090909091, "grad_norm": 0.05857198312878609, "learning_rate": 0.0001809049150273951, "loss": 0.1001, "step": 9657 }, { "epoch": 0.6243555555555556, "grad_norm": 0.05289507284760475, "learning_rate": 0.00018090089550218427, "loss": 0.0799, "step": 9658 }, { "epoch": 0.624420202020202, "grad_norm": 0.053254589438438416, "learning_rate": 0.00018089687559862664, "loss": 0.081, "step": 9659 }, { "epoch": 0.6244848484848485, "grad_norm": 0.05247701331973076, "learning_rate": 0.00018089285531674111, "loss": 0.0804, "step": 9660 }, { "epoch": 0.624549494949495, "grad_norm": 0.05342647060751915, "learning_rate": 0.00018088883465654644, "loss": 0.0904, "step": 9661 }, { "epoch": 0.6246141414141414, "grad_norm": 0.04722720757126808, "learning_rate": 
0.00018088481361806143, "loss": 0.0554, "step": 9662 }, { "epoch": 0.6246787878787878, "grad_norm": 0.05650734156370163, "learning_rate": 0.0001808807922013049, "loss": 0.0959, "step": 9663 }, { "epoch": 0.6247434343434344, "grad_norm": 0.07277008146047592, "learning_rate": 0.0001808767704062956, "loss": 0.1022, "step": 9664 }, { "epoch": 0.6247434343434344, "eval_bleu": 13.959932243178006, "eval_loss": 0.08815840631723404, "eval_runtime": 2.5301, "eval_samples_per_second": 12.648, "eval_steps_per_second": 1.581, "step": 9664 }, { "epoch": 0.6248080808080808, "grad_norm": 0.06401682645082474, "learning_rate": 0.00018087274823305242, "loss": 0.0797, "step": 9665 }, { "epoch": 0.6248727272727272, "grad_norm": 0.05775313824415207, "learning_rate": 0.00018086872568159412, "loss": 0.0928, "step": 9666 }, { "epoch": 0.6249373737373738, "grad_norm": 0.057298265397548676, "learning_rate": 0.00018086470275193955, "loss": 0.0915, "step": 9667 }, { "epoch": 0.6250020202020202, "grad_norm": 0.044392794370651245, "learning_rate": 0.00018086067944410745, "loss": 0.0617, "step": 9668 }, { "epoch": 0.6250666666666667, "grad_norm": 0.0746421068906784, "learning_rate": 0.00018085665575811672, "loss": 0.0955, "step": 9669 }, { "epoch": 0.6251313131313131, "grad_norm": 0.0484134666621685, "learning_rate": 0.0001808526316939861, "loss": 0.0736, "step": 9670 }, { "epoch": 0.6251959595959596, "grad_norm": 0.05268852040171623, "learning_rate": 0.00018084860725173452, "loss": 0.0841, "step": 9671 }, { "epoch": 0.6252606060606061, "grad_norm": 0.0559694841504097, "learning_rate": 0.0001808445824313807, "loss": 0.0883, "step": 9672 }, { "epoch": 0.6253252525252525, "grad_norm": 0.05880420282483101, "learning_rate": 0.00018084055723294347, "loss": 0.0936, "step": 9673 }, { "epoch": 0.6253898989898989, "grad_norm": 0.05137414112687111, "learning_rate": 0.00018083653165644172, "loss": 0.076, "step": 9674 }, { "epoch": 0.6254545454545455, "grad_norm": 0.044723354279994965, "learning_rate": 
0.0001808325057018942, "loss": 0.0738, "step": 9675 }, { "epoch": 0.6255191919191919, "grad_norm": 0.05369570478796959, "learning_rate": 0.0001808284793693198, "loss": 0.0746, "step": 9676 }, { "epoch": 0.6255838383838384, "grad_norm": 0.05106741189956665, "learning_rate": 0.00018082445265873732, "loss": 0.0778, "step": 9677 }, { "epoch": 0.6256484848484849, "grad_norm": 0.05919661745429039, "learning_rate": 0.00018082042557016555, "loss": 0.0706, "step": 9678 }, { "epoch": 0.6257131313131313, "grad_norm": 0.06451162695884705, "learning_rate": 0.0001808163981036234, "loss": 0.0908, "step": 9679 }, { "epoch": 0.6257777777777778, "grad_norm": 0.05329928919672966, "learning_rate": 0.0001808123702591297, "loss": 0.0901, "step": 9680 }, { "epoch": 0.6257777777777778, "eval_bleu": 15.608825455208617, "eval_loss": 0.08939829468727112, "eval_runtime": 2.7872, "eval_samples_per_second": 11.481, "eval_steps_per_second": 1.435, "step": 9680 }, { "epoch": 0.6258424242424242, "grad_norm": 0.057871416211128235, "learning_rate": 0.00018080834203670324, "loss": 0.0773, "step": 9681 }, { "epoch": 0.6259070707070707, "grad_norm": 0.04860113188624382, "learning_rate": 0.00018080431343636287, "loss": 0.0724, "step": 9682 }, { "epoch": 0.6259717171717172, "grad_norm": 0.05607250705361366, "learning_rate": 0.00018080028445812744, "loss": 0.0547, "step": 9683 }, { "epoch": 0.6260363636363636, "grad_norm": 0.0589488260447979, "learning_rate": 0.0001807962551020158, "loss": 0.0834, "step": 9684 }, { "epoch": 0.62610101010101, "grad_norm": 0.046649497002363205, "learning_rate": 0.00018079222536804677, "loss": 0.0668, "step": 9685 }, { "epoch": 0.6261656565656566, "grad_norm": 0.05263568460941315, "learning_rate": 0.00018078819525623921, "loss": 0.0939, "step": 9686 }, { "epoch": 0.626230303030303, "grad_norm": 0.04980294406414032, "learning_rate": 0.00018078416476661203, "loss": 0.0708, "step": 9687 }, { "epoch": 0.6262949494949495, "grad_norm": 0.05128546804189682, "learning_rate": 
0.00018078013389918396, "loss": 0.0865, "step": 9688 }, { "epoch": 0.626359595959596, "grad_norm": 0.0510791577398777, "learning_rate": 0.00018077610265397392, "loss": 0.0834, "step": 9689 }, { "epoch": 0.6264242424242424, "grad_norm": 0.04812939092516899, "learning_rate": 0.00018077207103100077, "loss": 0.0791, "step": 9690 }, { "epoch": 0.6264888888888889, "grad_norm": 0.057489506900310516, "learning_rate": 0.00018076803903028333, "loss": 0.0796, "step": 9691 }, { "epoch": 0.6265535353535353, "grad_norm": 0.056044381111860275, "learning_rate": 0.00018076400665184043, "loss": 0.0862, "step": 9692 }, { "epoch": 0.6266181818181819, "grad_norm": 0.06002064794301987, "learning_rate": 0.00018075997389569102, "loss": 0.0941, "step": 9693 }, { "epoch": 0.6266828282828283, "grad_norm": 0.0553692951798439, "learning_rate": 0.00018075594076185392, "loss": 0.0783, "step": 9694 }, { "epoch": 0.6267474747474747, "grad_norm": 0.05098913237452507, "learning_rate": 0.00018075190725034797, "loss": 0.0801, "step": 9695 }, { "epoch": 0.6268121212121213, "grad_norm": 0.05078668147325516, "learning_rate": 0.00018074787336119202, "loss": 0.0803, "step": 9696 }, { "epoch": 0.6268121212121213, "eval_bleu": 16.258268752540918, "eval_loss": 0.0892329066991806, "eval_runtime": 2.6852, "eval_samples_per_second": 11.917, "eval_steps_per_second": 1.49, "step": 9696 }, { "epoch": 0.6268767676767677, "grad_norm": 0.05090342089533806, "learning_rate": 0.000180743839094405, "loss": 0.0833, "step": 9697 }, { "epoch": 0.6269414141414141, "grad_norm": 0.056016381829977036, "learning_rate": 0.0001807398044500057, "loss": 0.085, "step": 9698 }, { "epoch": 0.6270060606060606, "grad_norm": 0.06063252314925194, "learning_rate": 0.00018073576942801307, "loss": 0.0986, "step": 9699 }, { "epoch": 0.6270707070707071, "grad_norm": 0.07047326117753983, "learning_rate": 0.0001807317340284459, "loss": 0.1188, "step": 9700 }, { "epoch": 0.6271353535353535, "grad_norm": 0.053901173174381256, "learning_rate": 
0.00018072769825132313, "loss": 0.0794, "step": 9701 }, { "epoch": 0.6272, "grad_norm": 0.05113985762000084, "learning_rate": 0.00018072366209666356, "loss": 0.0801, "step": 9702 }, { "epoch": 0.6272646464646464, "grad_norm": 0.06078094616532326, "learning_rate": 0.00018071962556448617, "loss": 0.1025, "step": 9703 }, { "epoch": 0.627329292929293, "grad_norm": 0.05477865785360336, "learning_rate": 0.0001807155886548097, "loss": 0.094, "step": 9704 }, { "epoch": 0.6273939393939394, "grad_norm": 0.048696067184209824, "learning_rate": 0.00018071155136765315, "loss": 0.0782, "step": 9705 }, { "epoch": 0.6274585858585858, "grad_norm": 0.04804442822933197, "learning_rate": 0.0001807075137030354, "loss": 0.0803, "step": 9706 }, { "epoch": 0.6275232323232324, "grad_norm": 0.046524785459041595, "learning_rate": 0.00018070347566097523, "loss": 0.0773, "step": 9707 }, { "epoch": 0.6275878787878788, "grad_norm": 0.053735457360744476, "learning_rate": 0.0001806994372414916, "loss": 0.0807, "step": 9708 }, { "epoch": 0.6276525252525252, "grad_norm": 0.05095867067575455, "learning_rate": 0.00018069539844460338, "loss": 0.077, "step": 9709 }, { "epoch": 0.6277171717171717, "grad_norm": 0.05940290167927742, "learning_rate": 0.00018069135927032946, "loss": 0.0981, "step": 9710 }, { "epoch": 0.6277818181818182, "grad_norm": 0.04740976169705391, "learning_rate": 0.00018068731971868874, "loss": 0.0778, "step": 9711 }, { "epoch": 0.6278464646464647, "grad_norm": 0.05119834095239639, "learning_rate": 0.00018068327978970007, "loss": 0.085, "step": 9712 }, { "epoch": 0.6278464646464647, "eval_bleu": 16.07086041903164, "eval_loss": 0.08800500631332397, "eval_runtime": 2.7415, "eval_samples_per_second": 11.672, "eval_steps_per_second": 1.459, "step": 9712 }, { "epoch": 0.6279111111111111, "grad_norm": 0.05337872356176376, "learning_rate": 0.00018067923948338242, "loss": 0.0823, "step": 9713 }, { "epoch": 0.6279757575757575, "grad_norm": 0.05403805896639824, "learning_rate": 
0.00018067519879975459, "loss": 0.077, "step": 9714 }, { "epoch": 0.6280404040404041, "grad_norm": 0.057973653078079224, "learning_rate": 0.00018067115773883554, "loss": 0.0983, "step": 9715 }, { "epoch": 0.6281050505050505, "grad_norm": 0.05788058042526245, "learning_rate": 0.0001806671163006442, "loss": 0.0911, "step": 9716 }, { "epoch": 0.6281696969696969, "grad_norm": 0.09571719914674759, "learning_rate": 0.00018066307448519938, "loss": 0.0838, "step": 9717 }, { "epoch": 0.6282343434343435, "grad_norm": 0.0431741364300251, "learning_rate": 0.00018065903229252, "loss": 0.0651, "step": 9718 }, { "epoch": 0.6282989898989899, "grad_norm": 0.0469515435397625, "learning_rate": 0.00018065498972262504, "loss": 0.0705, "step": 9719 }, { "epoch": 0.6283636363636363, "grad_norm": 0.05814005061984062, "learning_rate": 0.00018065094677553333, "loss": 0.0802, "step": 9720 }, { "epoch": 0.6284282828282828, "grad_norm": 0.06358738988637924, "learning_rate": 0.00018064690345126385, "loss": 0.0971, "step": 9721 }, { "epoch": 0.6284929292929293, "grad_norm": 0.04351510480046272, "learning_rate": 0.0001806428597498354, "loss": 0.0703, "step": 9722 }, { "epoch": 0.6285575757575758, "grad_norm": 0.06675971299409866, "learning_rate": 0.00018063881567126702, "loss": 0.0957, "step": 9723 }, { "epoch": 0.6286222222222222, "grad_norm": 0.06361675262451172, "learning_rate": 0.0001806347712155775, "loss": 0.0894, "step": 9724 }, { "epoch": 0.6286868686868687, "grad_norm": 0.061473969370126724, "learning_rate": 0.00018063072638278585, "loss": 0.0866, "step": 9725 }, { "epoch": 0.6287515151515152, "grad_norm": 0.05391973629593849, "learning_rate": 0.00018062668117291094, "loss": 0.0833, "step": 9726 }, { "epoch": 0.6288161616161616, "grad_norm": 0.051638081669807434, "learning_rate": 0.00018062263558597168, "loss": 0.0818, "step": 9727 }, { "epoch": 0.628880808080808, "grad_norm": 0.05314822494983673, "learning_rate": 0.00018061858962198702, "loss": 0.0805, "step": 9728 }, { "epoch": 
0.628880808080808, "eval_bleu": 19.623357269762646, "eval_loss": 0.0879005491733551, "eval_runtime": 2.6124, "eval_samples_per_second": 12.249, "eval_steps_per_second": 1.531, "step": 9728 }, { "epoch": 0.6289454545454546, "grad_norm": 0.04994453117251396, "learning_rate": 0.00018061454328097587, "loss": 0.0764, "step": 9729 }, { "epoch": 0.629010101010101, "grad_norm": 0.056346211582422256, "learning_rate": 0.00018061049656295714, "loss": 0.0951, "step": 9730 }, { "epoch": 0.6290747474747475, "grad_norm": 0.042397934943437576, "learning_rate": 0.00018060644946794977, "loss": 0.0637, "step": 9731 }, { "epoch": 0.6291393939393939, "grad_norm": 0.04568217322230339, "learning_rate": 0.00018060240199597273, "loss": 0.074, "step": 9732 }, { "epoch": 0.6292040404040404, "grad_norm": 0.04924406856298447, "learning_rate": 0.00018059835414704485, "loss": 0.0864, "step": 9733 }, { "epoch": 0.6292686868686869, "grad_norm": 0.055419228971004486, "learning_rate": 0.00018059430592118513, "loss": 0.0947, "step": 9734 }, { "epoch": 0.6293333333333333, "grad_norm": 0.04350543022155762, "learning_rate": 0.00018059025731841246, "loss": 0.0707, "step": 9735 }, { "epoch": 0.6293979797979798, "grad_norm": 0.052202008664608, "learning_rate": 0.00018058620833874583, "loss": 0.0887, "step": 9736 }, { "epoch": 0.6294626262626263, "grad_norm": 0.05039006099104881, "learning_rate": 0.00018058215898220415, "loss": 0.0789, "step": 9737 }, { "epoch": 0.6295272727272727, "grad_norm": 0.0522797591984272, "learning_rate": 0.00018057810924880632, "loss": 0.0737, "step": 9738 }, { "epoch": 0.6295919191919191, "grad_norm": 0.059988539665937424, "learning_rate": 0.00018057405913857131, "loss": 0.0905, "step": 9739 }, { "epoch": 0.6296565656565657, "grad_norm": 0.04847341403365135, "learning_rate": 0.00018057000865151807, "loss": 0.079, "step": 9740 }, { "epoch": 0.6297212121212121, "grad_norm": 0.059559524059295654, "learning_rate": 0.00018056595778766557, "loss": 0.0878, "step": 9741 }, { "epoch": 
0.6297858585858586, "grad_norm": 0.05579565465450287, "learning_rate": 0.00018056190654703268, "loss": 0.0829, "step": 9742 }, { "epoch": 0.6298505050505051, "grad_norm": 0.06627902388572693, "learning_rate": 0.00018055785492963842, "loss": 0.0967, "step": 9743 }, { "epoch": 0.6299151515151515, "grad_norm": 0.05197802558541298, "learning_rate": 0.0001805538029355017, "loss": 0.0829, "step": 9744 }, { "epoch": 0.6299151515151515, "eval_bleu": 19.524499039598876, "eval_loss": 0.08743035793304443, "eval_runtime": 2.7255, "eval_samples_per_second": 11.741, "eval_steps_per_second": 1.468, "step": 9744 }, { "epoch": 0.629979797979798, "grad_norm": 0.057060834020376205, "learning_rate": 0.00018054975056464145, "loss": 0.1009, "step": 9745 }, { "epoch": 0.6300444444444444, "grad_norm": 0.05745081230998039, "learning_rate": 0.00018054569781707667, "loss": 0.0902, "step": 9746 }, { "epoch": 0.630109090909091, "grad_norm": 0.05976644903421402, "learning_rate": 0.00018054164469282627, "loss": 0.0875, "step": 9747 }, { "epoch": 0.6301737373737374, "grad_norm": 0.059339698404073715, "learning_rate": 0.0001805375911919092, "loss": 0.0828, "step": 9748 }, { "epoch": 0.6302383838383838, "grad_norm": 0.05720999091863632, "learning_rate": 0.0001805335373143445, "loss": 0.097, "step": 9749 }, { "epoch": 0.6303030303030303, "grad_norm": 0.05088430643081665, "learning_rate": 0.00018052948306015104, "loss": 0.0799, "step": 9750 }, { "epoch": 0.6303676767676768, "grad_norm": 0.048825349658727646, "learning_rate": 0.00018052542842934778, "loss": 0.0807, "step": 9751 }, { "epoch": 0.6304323232323232, "grad_norm": 0.06344833970069885, "learning_rate": 0.00018052137342195376, "loss": 0.1101, "step": 9752 }, { "epoch": 0.6304969696969697, "grad_norm": 0.05898173525929451, "learning_rate": 0.0001805173180379879, "loss": 0.0941, "step": 9753 }, { "epoch": 0.6305616161616162, "grad_norm": 0.06979236006736755, "learning_rate": 0.00018051326227746915, "loss": 0.0834, "step": 9754 }, { "epoch": 
0.6306262626262626, "grad_norm": 0.05379551276564598, "learning_rate": 0.00018050920614041645, "loss": 0.0956, "step": 9755 }, { "epoch": 0.6306909090909091, "grad_norm": 0.17936736345291138, "learning_rate": 0.00018050514962684886, "loss": 0.1143, "step": 9756 }, { "epoch": 0.6307555555555555, "grad_norm": 0.05422957241535187, "learning_rate": 0.00018050109273678528, "loss": 0.0954, "step": 9757 }, { "epoch": 0.6308202020202021, "grad_norm": 0.056353483349084854, "learning_rate": 0.0001804970354702447, "loss": 0.0979, "step": 9758 }, { "epoch": 0.6308848484848485, "grad_norm": 0.06036766618490219, "learning_rate": 0.0001804929778272461, "loss": 0.1122, "step": 9759 }, { "epoch": 0.6309494949494949, "grad_norm": 0.04746231809258461, "learning_rate": 0.00018048891980780845, "loss": 0.0762, "step": 9760 }, { "epoch": 0.6309494949494949, "eval_bleu": 18.35354417357128, "eval_loss": 0.08635468035936356, "eval_runtime": 2.5956, "eval_samples_per_second": 12.329, "eval_steps_per_second": 1.541, "step": 9760 }, { "epoch": 0.6310141414141414, "grad_norm": 0.05536958947777748, "learning_rate": 0.00018048486141195074, "loss": 0.1041, "step": 9761 }, { "epoch": 0.6310787878787879, "grad_norm": 0.047432880848646164, "learning_rate": 0.00018048080263969194, "loss": 0.0754, "step": 9762 }, { "epoch": 0.6311434343434343, "grad_norm": 0.04759209230542183, "learning_rate": 0.00018047674349105103, "loss": 0.0766, "step": 9763 }, { "epoch": 0.6312080808080808, "grad_norm": 0.04758182913064957, "learning_rate": 0.00018047268396604698, "loss": 0.0726, "step": 9764 }, { "epoch": 0.6312727272727273, "grad_norm": 0.048166584223508835, "learning_rate": 0.0001804686240646988, "loss": 0.0772, "step": 9765 }, { "epoch": 0.6313373737373738, "grad_norm": 0.0479947030544281, "learning_rate": 0.00018046456378702547, "loss": 0.0698, "step": 9766 }, { "epoch": 0.6314020202020202, "grad_norm": 0.07287788391113281, "learning_rate": 0.00018046050313304599, "loss": 0.0986, "step": 9767 }, { "epoch": 
0.6314666666666666, "grad_norm": 0.05195832997560501, "learning_rate": 0.00018045644210277933, "loss": 0.0753, "step": 9768 }, { "epoch": 0.6315313131313132, "grad_norm": 0.05337585136294365, "learning_rate": 0.00018045238069624447, "loss": 0.0732, "step": 9769 }, { "epoch": 0.6315959595959596, "grad_norm": 0.05241524800658226, "learning_rate": 0.00018044831891346044, "loss": 0.0853, "step": 9770 }, { "epoch": 0.631660606060606, "grad_norm": 0.05723768100142479, "learning_rate": 0.00018044425675444622, "loss": 0.0946, "step": 9771 }, { "epoch": 0.6317252525252526, "grad_norm": 0.049008164554834366, "learning_rate": 0.00018044019421922078, "loss": 0.0799, "step": 9772 }, { "epoch": 0.631789898989899, "grad_norm": 0.05394263193011284, "learning_rate": 0.00018043613130780316, "loss": 0.0858, "step": 9773 }, { "epoch": 0.6318545454545454, "grad_norm": 0.06023272126913071, "learning_rate": 0.00018043206802021238, "loss": 0.0965, "step": 9774 }, { "epoch": 0.6319191919191919, "grad_norm": 0.05954528972506523, "learning_rate": 0.00018042800435646736, "loss": 0.0971, "step": 9775 }, { "epoch": 0.6319838383838384, "grad_norm": 0.05215217173099518, "learning_rate": 0.00018042394031658719, "loss": 0.0867, "step": 9776 }, { "epoch": 0.6319838383838384, "eval_bleu": 20.226305364177897, "eval_loss": 0.0876459926366806, "eval_runtime": 2.6887, "eval_samples_per_second": 11.901, "eval_steps_per_second": 1.488, "step": 9776 }, { "epoch": 0.6320484848484849, "grad_norm": 0.05447407066822052, "learning_rate": 0.0001804198759005908, "loss": 0.0854, "step": 9777 }, { "epoch": 0.6321131313131313, "grad_norm": 0.06231718882918358, "learning_rate": 0.00018041581110849725, "loss": 0.0945, "step": 9778 }, { "epoch": 0.6321777777777777, "grad_norm": 0.05187726020812988, "learning_rate": 0.00018041174594032557, "loss": 0.0844, "step": 9779 }, { "epoch": 0.6322424242424243, "grad_norm": 0.07462898641824722, "learning_rate": 0.0001804076803960947, "loss": 0.1245, "step": 9780 }, { "epoch": 
0.6323070707070707, "grad_norm": 0.05878031253814697, "learning_rate": 0.0001804036144758237, "loss": 0.0953, "step": 9781 }, { "epoch": 0.6323717171717171, "grad_norm": 0.0525965616106987, "learning_rate": 0.00018039954817953159, "loss": 0.0806, "step": 9782 }, { "epoch": 0.6324363636363637, "grad_norm": 0.055511731654405594, "learning_rate": 0.00018039548150723734, "loss": 0.0945, "step": 9783 }, { "epoch": 0.6325010101010101, "grad_norm": 0.05531933903694153, "learning_rate": 0.00018039141445896003, "loss": 0.0965, "step": 9784 }, { "epoch": 0.6325656565656566, "grad_norm": 0.06053992733359337, "learning_rate": 0.00018038734703471865, "loss": 0.0912, "step": 9785 }, { "epoch": 0.632630303030303, "grad_norm": 0.053498588502407074, "learning_rate": 0.00018038327923453222, "loss": 0.0749, "step": 9786 }, { "epoch": 0.6326949494949495, "grad_norm": 0.06118452921509743, "learning_rate": 0.00018037921105841975, "loss": 0.0964, "step": 9787 }, { "epoch": 0.632759595959596, "grad_norm": 0.0553448349237442, "learning_rate": 0.0001803751425064003, "loss": 0.0934, "step": 9788 }, { "epoch": 0.6328242424242424, "grad_norm": 0.061345502734184265, "learning_rate": 0.00018037107357849288, "loss": 0.0935, "step": 9789 }, { "epoch": 0.6328888888888888, "grad_norm": 0.06773505359888077, "learning_rate": 0.0001803670042747165, "loss": 0.1101, "step": 9790 }, { "epoch": 0.6329535353535354, "grad_norm": 0.05733510106801987, "learning_rate": 0.00018036293459509025, "loss": 0.094, "step": 9791 }, { "epoch": 0.6330181818181818, "grad_norm": 0.06216839700937271, "learning_rate": 0.00018035886453963306, "loss": 0.0847, "step": 9792 }, { "epoch": 0.6330181818181818, "eval_bleu": 19.980680773890604, "eval_loss": 0.08723853528499603, "eval_runtime": 2.6986, "eval_samples_per_second": 11.858, "eval_steps_per_second": 1.482, "step": 9792 }, { "epoch": 0.6330828282828282, "grad_norm": 0.05175827071070671, "learning_rate": 0.00018035479410836408, "loss": 0.0907, "step": 9793 }, { "epoch": 
0.6331474747474748, "grad_norm": 0.051289141178131104, "learning_rate": 0.00018035072330130228, "loss": 0.0897, "step": 9794 }, { "epoch": 0.6332121212121212, "grad_norm": 0.049464184790849686, "learning_rate": 0.0001803466521184667, "loss": 0.0738, "step": 9795 }, { "epoch": 0.6332767676767677, "grad_norm": 0.05055466666817665, "learning_rate": 0.0001803425805598764, "loss": 0.0875, "step": 9796 }, { "epoch": 0.6333414141414141, "grad_norm": 0.05349530279636383, "learning_rate": 0.00018033850862555042, "loss": 0.0896, "step": 9797 }, { "epoch": 0.6334060606060606, "grad_norm": 0.052457600831985474, "learning_rate": 0.00018033443631550777, "loss": 0.0885, "step": 9798 }, { "epoch": 0.6334707070707071, "grad_norm": 0.06267128139734268, "learning_rate": 0.00018033036362976758, "loss": 0.0968, "step": 9799 }, { "epoch": 0.6335353535353535, "grad_norm": 0.052911799401044846, "learning_rate": 0.00018032629056834875, "loss": 0.0824, "step": 9800 }, { "epoch": 0.6336, "grad_norm": 0.052450407296419144, "learning_rate": 0.00018032221713127044, "loss": 0.086, "step": 9801 }, { "epoch": 0.6336646464646465, "grad_norm": 0.05269014462828636, "learning_rate": 0.00018031814331855172, "loss": 0.0873, "step": 9802 }, { "epoch": 0.6337292929292929, "grad_norm": 0.07408583164215088, "learning_rate": 0.00018031406913021156, "loss": 0.109, "step": 9803 }, { "epoch": 0.6337939393939394, "grad_norm": 0.05461026728153229, "learning_rate": 0.00018030999456626905, "loss": 0.0878, "step": 9804 }, { "epoch": 0.6338585858585859, "grad_norm": 0.04926561191678047, "learning_rate": 0.00018030591962674324, "loss": 0.08, "step": 9805 }, { "epoch": 0.6339232323232323, "grad_norm": 0.05962306261062622, "learning_rate": 0.0001803018443116532, "loss": 0.108, "step": 9806 }, { "epoch": 0.6339878787878788, "grad_norm": 0.05224376916885376, "learning_rate": 0.000180297768621018, "loss": 0.0905, "step": 9807 }, { "epoch": 0.6340525252525252, "grad_norm": 0.05882658064365387, "learning_rate": 
0.00018029369255485666, "loss": 0.0913, "step": 9808 }, { "epoch": 0.6340525252525252, "eval_bleu": 19.566010737910347, "eval_loss": 0.08698968589305878, "eval_runtime": 2.744, "eval_samples_per_second": 11.662, "eval_steps_per_second": 1.458, "step": 9808 }, { "epoch": 0.6341171717171717, "grad_norm": 0.05288080871105194, "learning_rate": 0.00018028961611318825, "loss": 0.0864, "step": 9809 }, { "epoch": 0.6341818181818182, "grad_norm": 0.05010340362787247, "learning_rate": 0.00018028553929603188, "loss": 0.0867, "step": 9810 }, { "epoch": 0.6342464646464646, "grad_norm": 0.049782514572143555, "learning_rate": 0.00018028146210340658, "loss": 0.0828, "step": 9811 }, { "epoch": 0.6343111111111112, "grad_norm": 0.05011408403515816, "learning_rate": 0.0001802773845353314, "loss": 0.0826, "step": 9812 }, { "epoch": 0.6343757575757576, "grad_norm": 0.05150309577584267, "learning_rate": 0.00018027330659182547, "loss": 0.0787, "step": 9813 }, { "epoch": 0.634440404040404, "grad_norm": 0.055025242269039154, "learning_rate": 0.0001802692282729078, "loss": 0.0887, "step": 9814 }, { "epoch": 0.6345050505050505, "grad_norm": 0.056268360465765, "learning_rate": 0.00018026514957859748, "loss": 0.0934, "step": 9815 }, { "epoch": 0.634569696969697, "grad_norm": 0.055301543325185776, "learning_rate": 0.0001802610705089136, "loss": 0.0859, "step": 9816 }, { "epoch": 0.6346343434343434, "grad_norm": 0.05043425410985947, "learning_rate": 0.0001802569910638752, "loss": 0.0741, "step": 9817 }, { "epoch": 0.6346989898989899, "grad_norm": 0.053933389484882355, "learning_rate": 0.00018025291124350144, "loss": 0.087, "step": 9818 }, { "epoch": 0.6347636363636364, "grad_norm": 0.0700526088476181, "learning_rate": 0.00018024883104781128, "loss": 0.1004, "step": 9819 }, { "epoch": 0.6348282828282829, "grad_norm": 0.052496448159217834, "learning_rate": 0.00018024475047682392, "loss": 0.0828, "step": 9820 }, { "epoch": 0.6348929292929293, "grad_norm": 0.05390848219394684, "learning_rate": 
0.00018024066953055838, "loss": 0.0899, "step": 9821 }, { "epoch": 0.6349575757575757, "grad_norm": 0.0487716905772686, "learning_rate": 0.00018023658820903374, "loss": 0.0721, "step": 9822 }, { "epoch": 0.6350222222222223, "grad_norm": 0.0632319375872612, "learning_rate": 0.00018023250651226912, "loss": 0.0987, "step": 9823 }, { "epoch": 0.6350868686868687, "grad_norm": 0.05512642487883568, "learning_rate": 0.00018022842444028356, "loss": 0.0721, "step": 9824 }, { "epoch": 0.6350868686868687, "eval_bleu": 20.69119994098271, "eval_loss": 0.08806480467319489, "eval_runtime": 2.6622, "eval_samples_per_second": 12.02, "eval_steps_per_second": 1.503, "step": 9824 }, { "epoch": 0.6351515151515151, "grad_norm": 0.05682190880179405, "learning_rate": 0.0001802243419930962, "loss": 0.0796, "step": 9825 }, { "epoch": 0.6352161616161616, "grad_norm": 0.050109464675188065, "learning_rate": 0.00018022025917072615, "loss": 0.0749, "step": 9826 }, { "epoch": 0.6352808080808081, "grad_norm": 0.06627358496189117, "learning_rate": 0.0001802161759731924, "loss": 0.0873, "step": 9827 }, { "epoch": 0.6353454545454545, "grad_norm": 0.0605652816593647, "learning_rate": 0.00018021209240051418, "loss": 0.0864, "step": 9828 }, { "epoch": 0.635410101010101, "grad_norm": 0.06585296988487244, "learning_rate": 0.00018020800845271046, "loss": 0.1007, "step": 9829 }, { "epoch": 0.6354747474747475, "grad_norm": 0.07049667090177536, "learning_rate": 0.00018020392412980044, "loss": 0.1016, "step": 9830 }, { "epoch": 0.635539393939394, "grad_norm": 0.057591259479522705, "learning_rate": 0.00018019983943180318, "loss": 0.0908, "step": 9831 }, { "epoch": 0.6356040404040404, "grad_norm": 0.05252205953001976, "learning_rate": 0.00018019575435873777, "loss": 0.091, "step": 9832 }, { "epoch": 0.6356686868686868, "grad_norm": 0.05534578487277031, "learning_rate": 0.00018019166891062333, "loss": 0.0904, "step": 9833 }, { "epoch": 0.6357333333333334, "grad_norm": 0.05364471673965454, "learning_rate": 
0.000180187583087479, "loss": 0.0757, "step": 9834 }, { "epoch": 0.6357979797979798, "grad_norm": 0.05760897323489189, "learning_rate": 0.00018018349688932381, "loss": 0.0893, "step": 9835 }, { "epoch": 0.6358626262626262, "grad_norm": 0.046139560639858246, "learning_rate": 0.00018017941031617693, "loss": 0.0733, "step": 9836 }, { "epoch": 0.6359272727272727, "grad_norm": 0.05529811233282089, "learning_rate": 0.00018017532336805746, "loss": 0.084, "step": 9837 }, { "epoch": 0.6359919191919192, "grad_norm": 0.05575833469629288, "learning_rate": 0.0001801712360449845, "loss": 0.0852, "step": 9838 }, { "epoch": 0.6360565656565657, "grad_norm": 0.05441390722990036, "learning_rate": 0.00018016714834697716, "loss": 0.0842, "step": 9839 }, { "epoch": 0.6361212121212121, "grad_norm": 0.05237498879432678, "learning_rate": 0.00018016306027405458, "loss": 0.0782, "step": 9840 }, { "epoch": 0.6361212121212121, "eval_bleu": 19.889324419246737, "eval_loss": 0.08815804123878479, "eval_runtime": 2.7305, "eval_samples_per_second": 11.72, "eval_steps_per_second": 1.465, "step": 9840 }, { "epoch": 0.6361858585858586, "grad_norm": 0.059118714183568954, "learning_rate": 0.0001801589718262359, "loss": 0.0958, "step": 9841 }, { "epoch": 0.6362505050505051, "grad_norm": 0.04717165604233742, "learning_rate": 0.00018015488300354018, "loss": 0.0664, "step": 9842 }, { "epoch": 0.6363151515151515, "grad_norm": 0.054101672023534775, "learning_rate": 0.00018015079380598656, "loss": 0.0755, "step": 9843 }, { "epoch": 0.6363797979797979, "grad_norm": 0.05500855669379234, "learning_rate": 0.00018014670423359422, "loss": 0.0696, "step": 9844 }, { "epoch": 0.6364444444444445, "grad_norm": 0.05059969052672386, "learning_rate": 0.0001801426142863822, "loss": 0.0704, "step": 9845 }, { "epoch": 0.6365090909090909, "grad_norm": 0.054890334606170654, "learning_rate": 0.00018013852396436966, "loss": 0.0925, "step": 9846 }, { "epoch": 0.6365737373737373, "grad_norm": 0.059658993035554886, "learning_rate": 
0.00018013443326757574, "loss": 0.0814, "step": 9847 }, { "epoch": 0.6366383838383839, "grad_norm": 0.0546690933406353, "learning_rate": 0.0001801303421960196, "loss": 0.0784, "step": 9848 }, { "epoch": 0.6367030303030303, "grad_norm": 0.055828142911195755, "learning_rate": 0.0001801262507497203, "loss": 0.0848, "step": 9849 }, { "epoch": 0.6367676767676768, "grad_norm": 0.0475967675447464, "learning_rate": 0.00018012215892869703, "loss": 0.0736, "step": 9850 }, { "epoch": 0.6368323232323232, "grad_norm": 0.04358759522438049, "learning_rate": 0.00018011806673296895, "loss": 0.0594, "step": 9851 }, { "epoch": 0.6368969696969697, "grad_norm": 0.06610281020402908, "learning_rate": 0.0001801139741625551, "loss": 0.0968, "step": 9852 }, { "epoch": 0.6369616161616162, "grad_norm": 0.06220583990216255, "learning_rate": 0.0001801098812174747, "loss": 0.0702, "step": 9853 }, { "epoch": 0.6370262626262626, "grad_norm": 0.061874035745859146, "learning_rate": 0.00018010578789774687, "loss": 0.0913, "step": 9854 }, { "epoch": 0.637090909090909, "grad_norm": 0.04611054062843323, "learning_rate": 0.00018010169420339077, "loss": 0.0723, "step": 9855 }, { "epoch": 0.6371555555555556, "grad_norm": 0.062470607459545135, "learning_rate": 0.0001800976001344255, "loss": 0.0959, "step": 9856 }, { "epoch": 0.6371555555555556, "eval_bleu": 18.82793227790076, "eval_loss": 0.08694825321435928, "eval_runtime": 2.6558, "eval_samples_per_second": 12.049, "eval_steps_per_second": 1.506, "step": 9856 }, { "epoch": 0.637220202020202, "grad_norm": 0.057087551802396774, "learning_rate": 0.0001800935056908702, "loss": 0.1046, "step": 9857 }, { "epoch": 0.6372848484848485, "grad_norm": 0.049213264137506485, "learning_rate": 0.00018008941087274414, "loss": 0.0666, "step": 9858 }, { "epoch": 0.637349494949495, "grad_norm": 0.05532834306359291, "learning_rate": 0.00018008531568006634, "loss": 0.0896, "step": 9859 }, { "epoch": 0.6374141414141414, "grad_norm": 0.04942358657717705, "learning_rate": 
0.000180081220112856, "loss": 0.0769, "step": 9860 }, { "epoch": 0.6374787878787879, "grad_norm": 0.050958096981048584, "learning_rate": 0.00018007712417113226, "loss": 0.0864, "step": 9861 }, { "epoch": 0.6375434343434343, "grad_norm": 0.05710505694150925, "learning_rate": 0.00018007302785491428, "loss": 0.0862, "step": 9862 }, { "epoch": 0.6376080808080808, "grad_norm": 0.05706352740526199, "learning_rate": 0.00018006893116422122, "loss": 0.0864, "step": 9863 }, { "epoch": 0.6376727272727273, "grad_norm": 0.057465873658657074, "learning_rate": 0.00018006483409907224, "loss": 0.0896, "step": 9864 }, { "epoch": 0.6377373737373737, "grad_norm": 0.06079970300197601, "learning_rate": 0.0001800607366594865, "loss": 0.0889, "step": 9865 }, { "epoch": 0.6378020202020201, "grad_norm": 0.04698382690548897, "learning_rate": 0.00018005663884548319, "loss": 0.0705, "step": 9866 }, { "epoch": 0.6378666666666667, "grad_norm": 0.05282482132315636, "learning_rate": 0.0001800525406570814, "loss": 0.0857, "step": 9867 }, { "epoch": 0.6379313131313131, "grad_norm": 0.04869740083813667, "learning_rate": 0.00018004844209430036, "loss": 0.0733, "step": 9868 }, { "epoch": 0.6379959595959596, "grad_norm": 0.050280142575502396, "learning_rate": 0.00018004434315715924, "loss": 0.0693, "step": 9869 }, { "epoch": 0.6380606060606061, "grad_norm": 0.050588589161634445, "learning_rate": 0.00018004024384567715, "loss": 0.0782, "step": 9870 }, { "epoch": 0.6381252525252525, "grad_norm": 0.05138189718127251, "learning_rate": 0.00018003614415987334, "loss": 0.0765, "step": 9871 }, { "epoch": 0.638189898989899, "grad_norm": 0.0560012087225914, "learning_rate": 0.00018003204409976692, "loss": 0.0768, "step": 9872 }, { "epoch": 0.638189898989899, "eval_bleu": 20.13603558809569, "eval_loss": 0.08888503164052963, "eval_runtime": 2.6335, "eval_samples_per_second": 12.151, "eval_steps_per_second": 1.519, "step": 9872 }, { "epoch": 0.6382545454545454, "grad_norm": 0.052972212433815, "learning_rate": 
0.0001800279436653771, "loss": 0.0823, "step": 9873 }, { "epoch": 0.638319191919192, "grad_norm": 0.04869149252772331, "learning_rate": 0.00018002384285672305, "loss": 0.0727, "step": 9874 }, { "epoch": 0.6383838383838384, "grad_norm": 0.048130109906196594, "learning_rate": 0.0001800197416738239, "loss": 0.0784, "step": 9875 }, { "epoch": 0.6384484848484848, "grad_norm": 0.07661563158035278, "learning_rate": 0.00018001564011669894, "loss": 0.089, "step": 9876 }, { "epoch": 0.6385131313131314, "grad_norm": 0.05851416289806366, "learning_rate": 0.00018001153818536722, "loss": 0.0768, "step": 9877 }, { "epoch": 0.6385777777777778, "grad_norm": 0.05257833003997803, "learning_rate": 0.00018000743587984804, "loss": 0.081, "step": 9878 }, { "epoch": 0.6386424242424242, "grad_norm": 0.054314546287059784, "learning_rate": 0.0001800033332001605, "loss": 0.0804, "step": 9879 }, { "epoch": 0.6387070707070707, "grad_norm": 0.07756319642066956, "learning_rate": 0.00017999923014632381, "loss": 0.1031, "step": 9880 }, { "epoch": 0.6387717171717172, "grad_norm": 0.05209039896726608, "learning_rate": 0.00017999512671835718, "loss": 0.0804, "step": 9881 }, { "epoch": 0.6388363636363636, "grad_norm": 0.05249698832631111, "learning_rate": 0.0001799910229162798, "loss": 0.0798, "step": 9882 }, { "epoch": 0.6389010101010101, "grad_norm": 0.047519270330667496, "learning_rate": 0.00017998691874011088, "loss": 0.0746, "step": 9883 }, { "epoch": 0.6389656565656565, "grad_norm": 0.05735498666763306, "learning_rate": 0.00017998281418986952, "loss": 0.0859, "step": 9884 }, { "epoch": 0.6390303030303031, "grad_norm": 0.05371516942977905, "learning_rate": 0.000179978709265575, "loss": 0.0885, "step": 9885 }, { "epoch": 0.6390949494949495, "grad_norm": 0.04889999330043793, "learning_rate": 0.0001799746039672465, "loss": 0.074, "step": 9886 }, { "epoch": 0.6391595959595959, "grad_norm": 0.051954928785562515, "learning_rate": 0.00017997049829490322, "loss": 0.0839, "step": 9887 }, { "epoch": 
0.6392242424242425, "grad_norm": 0.061482593417167664, "learning_rate": 0.00017996639224856438, "loss": 0.0861, "step": 9888 }, { "epoch": 0.6392242424242425, "eval_bleu": 14.591271764323118, "eval_loss": 0.08775955438613892, "eval_runtime": 2.6357, "eval_samples_per_second": 12.141, "eval_steps_per_second": 1.518, "step": 9888 }, { "epoch": 0.6392888888888889, "grad_norm": 0.056569360196590424, "learning_rate": 0.00017996228582824913, "loss": 0.0964, "step": 9889 }, { "epoch": 0.6393535353535353, "grad_norm": 0.04986441507935524, "learning_rate": 0.0001799581790339767, "loss": 0.0729, "step": 9890 }, { "epoch": 0.6394181818181818, "grad_norm": 0.05286859720945358, "learning_rate": 0.00017995407186576631, "loss": 0.0771, "step": 9891 }, { "epoch": 0.6394828282828283, "grad_norm": 0.05587729811668396, "learning_rate": 0.00017994996432363719, "loss": 0.0863, "step": 9892 }, { "epoch": 0.6395474747474748, "grad_norm": 0.058828797191381454, "learning_rate": 0.00017994585640760847, "loss": 0.0996, "step": 9893 }, { "epoch": 0.6396121212121212, "grad_norm": 0.046169813722372055, "learning_rate": 0.0001799417481176994, "loss": 0.065, "step": 9894 }, { "epoch": 0.6396767676767676, "grad_norm": 0.060176439583301544, "learning_rate": 0.00017993763945392926, "loss": 0.1009, "step": 9895 }, { "epoch": 0.6397414141414142, "grad_norm": 0.04977443069219589, "learning_rate": 0.00017993353041631716, "loss": 0.0809, "step": 9896 }, { "epoch": 0.6398060606060606, "grad_norm": 0.05167710408568382, "learning_rate": 0.00017992942100488238, "loss": 0.0887, "step": 9897 }, { "epoch": 0.639870707070707, "grad_norm": 0.04925675690174103, "learning_rate": 0.00017992531121964413, "loss": 0.0775, "step": 9898 }, { "epoch": 0.6399353535353536, "grad_norm": 0.0558888278901577, "learning_rate": 0.0001799212010606216, "loss": 0.0914, "step": 9899 }, { "epoch": 0.64, "grad_norm": 0.058629341423511505, "learning_rate": 0.00017991709052783404, "loss": 0.1013, "step": 9900 }, { "epoch": 
0.6400646464646464, "grad_norm": 0.053618066012859344, "learning_rate": 0.0001799129796213007, "loss": 0.0841, "step": 9901 }, { "epoch": 0.6401292929292929, "grad_norm": 0.0532694011926651, "learning_rate": 0.00017990886834104073, "loss": 0.0825, "step": 9902 }, { "epoch": 0.6401939393939394, "grad_norm": 0.1554884910583496, "learning_rate": 0.00017990475668707342, "loss": 0.1062, "step": 9903 }, { "epoch": 0.6402585858585859, "grad_norm": 0.0578804686665535, "learning_rate": 0.00017990064465941798, "loss": 0.0857, "step": 9904 }, { "epoch": 0.6402585858585859, "eval_bleu": 19.245071090889326, "eval_loss": 0.08926622569561005, "eval_runtime": 2.7109, "eval_samples_per_second": 11.804, "eval_steps_per_second": 1.476, "step": 9904 }, { "epoch": 0.6403232323232323, "grad_norm": 0.05511504039168358, "learning_rate": 0.00017989653225809367, "loss": 0.0823, "step": 9905 }, { "epoch": 0.6403878787878788, "grad_norm": 0.046316757798194885, "learning_rate": 0.00017989241948311965, "loss": 0.0688, "step": 9906 }, { "epoch": 0.6404525252525253, "grad_norm": 0.04912593588232994, "learning_rate": 0.0001798883063345152, "loss": 0.0812, "step": 9907 }, { "epoch": 0.6405171717171717, "grad_norm": 0.061372239142656326, "learning_rate": 0.00017988419281229955, "loss": 0.1008, "step": 9908 }, { "epoch": 0.6405818181818181, "grad_norm": 0.05954630672931671, "learning_rate": 0.00017988007891649196, "loss": 0.0766, "step": 9909 }, { "epoch": 0.6406464646464647, "grad_norm": 0.048066239804029465, "learning_rate": 0.00017987596464711161, "loss": 0.0902, "step": 9910 }, { "epoch": 0.6407111111111111, "grad_norm": 0.053839586675167084, "learning_rate": 0.00017987185000417782, "loss": 0.0952, "step": 9911 }, { "epoch": 0.6407757575757576, "grad_norm": 0.04701082035899162, "learning_rate": 0.00017986773498770978, "loss": 0.069, "step": 9912 }, { "epoch": 0.640840404040404, "grad_norm": 0.05168093368411064, "learning_rate": 0.00017986361959772672, "loss": 0.091, "step": 9913 }, { "epoch": 
0.6409050505050505, "grad_norm": 0.06413009017705917, "learning_rate": 0.00017985950383424796, "loss": 0.098, "step": 9914 }, { "epoch": 0.640969696969697, "grad_norm": 0.055790968239307404, "learning_rate": 0.00017985538769729268, "loss": 0.0841, "step": 9915 }, { "epoch": 0.6410343434343434, "grad_norm": 0.047049276530742645, "learning_rate": 0.00017985127118688013, "loss": 0.08, "step": 9916 }, { "epoch": 0.64109898989899, "grad_norm": 0.05915847793221474, "learning_rate": 0.0001798471543030296, "loss": 0.0898, "step": 9917 }, { "epoch": 0.6411636363636364, "grad_norm": 0.05072220787405968, "learning_rate": 0.00017984303704576035, "loss": 0.0822, "step": 9918 }, { "epoch": 0.6412282828282828, "grad_norm": 0.05242101848125458, "learning_rate": 0.00017983891941509162, "loss": 0.0712, "step": 9919 }, { "epoch": 0.6412929292929292, "grad_norm": 0.04955407232046127, "learning_rate": 0.00017983480141104262, "loss": 0.0757, "step": 9920 }, { "epoch": 0.6412929292929292, "eval_bleu": 19.2946811677055, "eval_loss": 0.08854640275239944, "eval_runtime": 2.5774, "eval_samples_per_second": 12.416, "eval_steps_per_second": 1.552, "step": 9920 }, { "epoch": 0.6413575757575758, "grad_norm": 0.055893316864967346, "learning_rate": 0.00017983068303363266, "loss": 0.0892, "step": 9921 }, { "epoch": 0.6414222222222222, "grad_norm": 0.05910059064626694, "learning_rate": 0.00017982656428288096, "loss": 0.0816, "step": 9922 }, { "epoch": 0.6414868686868687, "grad_norm": 0.05298285186290741, "learning_rate": 0.00017982244515880684, "loss": 0.0773, "step": 9923 }, { "epoch": 0.6415515151515152, "grad_norm": 0.05259937793016434, "learning_rate": 0.00017981832566142954, "loss": 0.0852, "step": 9924 }, { "epoch": 0.6416161616161616, "grad_norm": 0.061017461121082306, "learning_rate": 0.0001798142057907683, "loss": 0.0926, "step": 9925 }, { "epoch": 0.6416808080808081, "grad_norm": 0.05836660414934158, "learning_rate": 0.0001798100855468424, "loss": 0.0966, "step": 9926 }, { "epoch": 
0.6417454545454545, "grad_norm": 0.062268100678920746, "learning_rate": 0.00017980596492967114, "loss": 0.0894, "step": 9927 }, { "epoch": 0.641810101010101, "grad_norm": 0.055221058428287506, "learning_rate": 0.00017980184393927374, "loss": 0.0817, "step": 9928 }, { "epoch": 0.6418747474747475, "grad_norm": 0.04979895427823067, "learning_rate": 0.0001797977225756695, "loss": 0.0817, "step": 9929 }, { "epoch": 0.6419393939393939, "grad_norm": 0.06094241887331009, "learning_rate": 0.0001797936008388777, "loss": 0.0985, "step": 9930 }, { "epoch": 0.6420040404040404, "grad_norm": 0.0495346374809742, "learning_rate": 0.00017978947872891761, "loss": 0.0785, "step": 9931 }, { "epoch": 0.6420686868686869, "grad_norm": 0.06057848036289215, "learning_rate": 0.0001797853562458085, "loss": 0.0838, "step": 9932 }, { "epoch": 0.6421333333333333, "grad_norm": 0.050278227776288986, "learning_rate": 0.00017978123338956964, "loss": 0.0773, "step": 9933 }, { "epoch": 0.6421979797979798, "grad_norm": 0.059643615037202835, "learning_rate": 0.00017977711016022035, "loss": 0.1056, "step": 9934 }, { "epoch": 0.6422626262626263, "grad_norm": 0.05492153391242027, "learning_rate": 0.00017977298655777989, "loss": 0.0865, "step": 9935 }, { "epoch": 0.6423272727272727, "grad_norm": 0.047749314457178116, "learning_rate": 0.00017976886258226754, "loss": 0.0713, "step": 9936 }, { "epoch": 0.6423272727272727, "eval_bleu": 19.277076071839133, "eval_loss": 0.08785676211118698, "eval_runtime": 2.6908, "eval_samples_per_second": 11.892, "eval_steps_per_second": 1.487, "step": 9936 }, { "epoch": 0.6423919191919192, "grad_norm": 0.05267943814396858, "learning_rate": 0.0001797647382337026, "loss": 0.0726, "step": 9937 }, { "epoch": 0.6424565656565656, "grad_norm": 0.05529758334159851, "learning_rate": 0.00017976061351210435, "loss": 0.0793, "step": 9938 }, { "epoch": 0.6425212121212122, "grad_norm": 0.04934766888618469, "learning_rate": 0.00017975648841749207, "loss": 0.0759, "step": 9939 }, { "epoch": 
0.6425858585858586, "grad_norm": 0.062252145260572433, "learning_rate": 0.00017975236294988505, "loss": 0.1069, "step": 9940 }, { "epoch": 0.642650505050505, "grad_norm": 0.05456852167844772, "learning_rate": 0.00017974823710930262, "loss": 0.0808, "step": 9941 }, { "epoch": 0.6427151515151515, "grad_norm": 0.05051539093255997, "learning_rate": 0.00017974411089576403, "loss": 0.0728, "step": 9942 }, { "epoch": 0.642779797979798, "grad_norm": 0.06274040043354034, "learning_rate": 0.00017973998430928858, "loss": 0.1072, "step": 9943 }, { "epoch": 0.6428444444444444, "grad_norm": 0.06251243501901627, "learning_rate": 0.0001797358573498956, "loss": 0.1027, "step": 9944 }, { "epoch": 0.6429090909090909, "grad_norm": 0.0611676461994648, "learning_rate": 0.0001797317300176044, "loss": 0.0991, "step": 9945 }, { "epoch": 0.6429737373737374, "grad_norm": 0.05646947771310806, "learning_rate": 0.00017972760231243422, "loss": 0.0981, "step": 9946 }, { "epoch": 0.6430383838383839, "grad_norm": 0.053014837205410004, "learning_rate": 0.00017972347423440442, "loss": 0.0802, "step": 9947 }, { "epoch": 0.6431030303030303, "grad_norm": 0.051882363855838776, "learning_rate": 0.0001797193457835343, "loss": 0.0749, "step": 9948 }, { "epoch": 0.6431676767676767, "grad_norm": 0.053188346326351166, "learning_rate": 0.00017971521695984313, "loss": 0.0836, "step": 9949 }, { "epoch": 0.6432323232323233, "grad_norm": 0.04357510805130005, "learning_rate": 0.00017971108776335023, "loss": 0.0658, "step": 9950 }, { "epoch": 0.6432969696969697, "grad_norm": 0.06288231909275055, "learning_rate": 0.00017970695819407496, "loss": 0.1131, "step": 9951 }, { "epoch": 0.6433616161616161, "grad_norm": 0.06336627900600433, "learning_rate": 0.0001797028282520366, "loss": 0.1081, "step": 9952 }, { "epoch": 0.6433616161616161, "eval_bleu": 18.391595480390023, "eval_loss": 0.08936593681573868, "eval_runtime": 2.678, "eval_samples_per_second": 11.949, "eval_steps_per_second": 1.494, "step": 9952 }, { "epoch": 
0.6434262626262627, "grad_norm": 0.048185233026742935, "learning_rate": 0.00017969869793725446, "loss": 0.0828, "step": 9953 }, { "epoch": 0.6434909090909091, "grad_norm": 0.05574232339859009, "learning_rate": 0.00017969456724974782, "loss": 0.0998, "step": 9954 }, { "epoch": 0.6435555555555555, "grad_norm": 0.05727709084749222, "learning_rate": 0.00017969043618953605, "loss": 0.0911, "step": 9955 }, { "epoch": 0.643620202020202, "grad_norm": 0.05138951167464256, "learning_rate": 0.0001796863047566385, "loss": 0.0916, "step": 9956 }, { "epoch": 0.6436848484848485, "grad_norm": 0.06858886033296585, "learning_rate": 0.00017968217295107438, "loss": 0.1007, "step": 9957 }, { "epoch": 0.643749494949495, "grad_norm": 0.056544508785009384, "learning_rate": 0.0001796780407728631, "loss": 0.0942, "step": 9958 }, { "epoch": 0.6438141414141414, "grad_norm": 0.051282405853271484, "learning_rate": 0.000179673908222024, "loss": 0.0901, "step": 9959 }, { "epoch": 0.6438787878787878, "grad_norm": 0.055274978280067444, "learning_rate": 0.00017966977529857632, "loss": 0.0721, "step": 9960 }, { "epoch": 0.6439434343434344, "grad_norm": 0.057601794600486755, "learning_rate": 0.00017966564200253948, "loss": 0.0933, "step": 9961 }, { "epoch": 0.6440080808080808, "grad_norm": 0.047827474772930145, "learning_rate": 0.00017966150833393277, "loss": 0.0717, "step": 9962 }, { "epoch": 0.6440727272727272, "grad_norm": 0.05916670709848404, "learning_rate": 0.0001796573742927755, "loss": 0.0929, "step": 9963 }, { "epoch": 0.6441373737373738, "grad_norm": 0.05632597208023071, "learning_rate": 0.00017965323987908702, "loss": 0.0876, "step": 9964 }, { "epoch": 0.6442020202020202, "grad_norm": 0.049394816160202026, "learning_rate": 0.0001796491050928867, "loss": 0.0739, "step": 9965 }, { "epoch": 0.6442666666666667, "grad_norm": 0.06614021956920624, "learning_rate": 0.0001796449699341938, "loss": 0.0915, "step": 9966 }, { "epoch": 0.6443313131313131, "grad_norm": 0.0683840662240982, "learning_rate": 
0.00017964083440302772, "loss": 0.0946, "step": 9967 }, { "epoch": 0.6443959595959596, "grad_norm": 0.04890046641230583, "learning_rate": 0.00017963669849940784, "loss": 0.0796, "step": 9968 }, { "epoch": 0.6443959595959596, "eval_bleu": 18.000311037951338, "eval_loss": 0.08929183334112167, "eval_runtime": 2.7072, "eval_samples_per_second": 11.82, "eval_steps_per_second": 1.478, "step": 9968 }, { "epoch": 0.6444606060606061, "grad_norm": 0.05904802680015564, "learning_rate": 0.0001796325622233534, "loss": 0.081, "step": 9969 }, { "epoch": 0.6445252525252525, "grad_norm": 0.061447642743587494, "learning_rate": 0.00017962842557488377, "loss": 0.1009, "step": 9970 }, { "epoch": 0.6445898989898989, "grad_norm": 0.05578022450208664, "learning_rate": 0.00017962428855401835, "loss": 0.0984, "step": 9971 }, { "epoch": 0.6446545454545455, "grad_norm": 0.05482397601008415, "learning_rate": 0.00017962015116077646, "loss": 0.0842, "step": 9972 }, { "epoch": 0.6447191919191919, "grad_norm": 0.05087227746844292, "learning_rate": 0.00017961601339517745, "loss": 0.082, "step": 9973 }, { "epoch": 0.6447838383838383, "grad_norm": 0.05668569728732109, "learning_rate": 0.00017961187525724067, "loss": 0.0966, "step": 9974 }, { "epoch": 0.6448484848484849, "grad_norm": 0.053404487669467926, "learning_rate": 0.00017960773674698544, "loss": 0.0904, "step": 9975 }, { "epoch": 0.6449131313131313, "grad_norm": 0.047241292893886566, "learning_rate": 0.00017960359786443113, "loss": 0.0734, "step": 9976 }, { "epoch": 0.6449777777777778, "grad_norm": 0.05319118872284889, "learning_rate": 0.00017959945860959714, "loss": 0.0858, "step": 9977 }, { "epoch": 0.6450424242424242, "grad_norm": 0.05324626713991165, "learning_rate": 0.00017959531898250282, "loss": 0.0857, "step": 9978 }, { "epoch": 0.6451070707070707, "grad_norm": 0.0555843785405159, "learning_rate": 0.00017959117898316748, "loss": 0.0772, "step": 9979 }, { "epoch": 0.6451717171717172, "grad_norm": 0.04371309280395508, "learning_rate": 
0.00017958703861161048, "loss": 0.0657, "step": 9980 }, { "epoch": 0.6452363636363636, "grad_norm": 0.06048743054270744, "learning_rate": 0.00017958289786785127, "loss": 0.0846, "step": 9981 }, { "epoch": 0.6453010101010102, "grad_norm": 0.04778307303786278, "learning_rate": 0.00017957875675190908, "loss": 0.0697, "step": 9982 }, { "epoch": 0.6453656565656566, "grad_norm": 0.04762129858136177, "learning_rate": 0.00017957461526380342, "loss": 0.0699, "step": 9983 }, { "epoch": 0.645430303030303, "grad_norm": 0.06548965722322464, "learning_rate": 0.00017957047340355357, "loss": 0.1122, "step": 9984 }, { "epoch": 0.645430303030303, "eval_bleu": 18.027472850716446, "eval_loss": 0.08856790512800217, "eval_runtime": 2.7791, "eval_samples_per_second": 11.515, "eval_steps_per_second": 1.439, "step": 9984 }, { "epoch": 0.6454949494949495, "grad_norm": 0.06275077164173126, "learning_rate": 0.00017956633117117892, "loss": 0.0761, "step": 9985 }, { "epoch": 0.645559595959596, "grad_norm": 0.05388579145073891, "learning_rate": 0.00017956218856669885, "loss": 0.0824, "step": 9986 }, { "epoch": 0.6456242424242424, "grad_norm": 0.04477975144982338, "learning_rate": 0.0001795580455901327, "loss": 0.0681, "step": 9987 }, { "epoch": 0.6456888888888889, "grad_norm": 0.049963172525167465, "learning_rate": 0.0001795539022414999, "loss": 0.0778, "step": 9988 }, { "epoch": 0.6457535353535353, "grad_norm": 0.054506465792655945, "learning_rate": 0.00017954975852081982, "loss": 0.0817, "step": 9989 }, { "epoch": 0.6458181818181818, "grad_norm": 0.056430634111166, "learning_rate": 0.00017954561442811177, "loss": 0.0843, "step": 9990 }, { "epoch": 0.6458828282828283, "grad_norm": 0.06383189558982849, "learning_rate": 0.00017954146996339518, "loss": 0.0954, "step": 9991 }, { "epoch": 0.6459474747474747, "grad_norm": 0.0633719339966774, "learning_rate": 0.00017953732512668947, "loss": 0.105, "step": 9992 }, { "epoch": 0.6460121212121213, "grad_norm": 0.0569661408662796, "learning_rate": 
0.00017953317991801395, "loss": 0.0939, "step": 9993 }, { "epoch": 0.6460767676767677, "grad_norm": 0.05830950289964676, "learning_rate": 0.00017952903433738807, "loss": 0.0875, "step": 9994 }, { "epoch": 0.6461414141414141, "grad_norm": 0.05574384704232216, "learning_rate": 0.00017952488838483116, "loss": 0.0858, "step": 9995 }, { "epoch": 0.6462060606060606, "grad_norm": 0.04756586626172066, "learning_rate": 0.00017952074206036266, "loss": 0.0742, "step": 9996 }, { "epoch": 0.6462707070707071, "grad_norm": 0.04709923267364502, "learning_rate": 0.00017951659536400195, "loss": 0.0664, "step": 9997 }, { "epoch": 0.6463353535353535, "grad_norm": 0.0497976690530777, "learning_rate": 0.00017951244829576837, "loss": 0.0862, "step": 9998 }, { "epoch": 0.6464, "grad_norm": 0.04431433603167534, "learning_rate": 0.00017950830085568137, "loss": 0.073, "step": 9999 }, { "epoch": 0.6464646464646465, "grad_norm": 0.04780964553356171, "learning_rate": 0.00017950415304376034, "loss": 0.0782, "step": 10000 }, { "epoch": 0.6464646464646465, "eval_bleu": 16.51975730993241, "eval_loss": 0.08833007514476776, "eval_runtime": 2.6338, "eval_samples_per_second": 12.15, "eval_steps_per_second": 1.519, "step": 10000 }, { "epoch": 0.646529292929293, "grad_norm": 0.060992490500211716, "learning_rate": 0.00017950000486002468, "loss": 0.0836, "step": 10001 }, { "epoch": 0.6465939393939394, "grad_norm": 0.05135582014918327, "learning_rate": 0.00017949585630449376, "loss": 0.0786, "step": 10002 }, { "epoch": 0.6466585858585858, "grad_norm": 0.04874500259757042, "learning_rate": 0.00017949170737718698, "loss": 0.0735, "step": 10003 }, { "epoch": 0.6467232323232324, "grad_norm": 0.06144370883703232, "learning_rate": 0.00017948755807812382, "loss": 0.1038, "step": 10004 }, { "epoch": 0.6467878787878788, "grad_norm": 0.06033185124397278, "learning_rate": 0.0001794834084073236, "loss": 0.0959, "step": 10005 }, { "epoch": 0.6468525252525252, "grad_norm": 0.04905034601688385, "learning_rate": 
0.00017947925836480578, "loss": 0.073, "step": 10006 }, { "epoch": 0.6469171717171717, "grad_norm": 0.050180237740278244, "learning_rate": 0.0001794751079505897, "loss": 0.0792, "step": 10007 }, { "epoch": 0.6469818181818182, "grad_norm": 0.061098385602235794, "learning_rate": 0.00017947095716469482, "loss": 0.1067, "step": 10008 }, { "epoch": 0.6470464646464646, "grad_norm": 0.05362142249941826, "learning_rate": 0.00017946680600714057, "loss": 0.096, "step": 10009 }, { "epoch": 0.6471111111111111, "grad_norm": 0.05683455988764763, "learning_rate": 0.00017946265447794634, "loss": 0.0897, "step": 10010 }, { "epoch": 0.6471757575757576, "grad_norm": 0.05196186900138855, "learning_rate": 0.00017945850257713154, "loss": 0.0873, "step": 10011 }, { "epoch": 0.6472404040404041, "grad_norm": 0.04640404134988785, "learning_rate": 0.00017945435030471555, "loss": 0.0605, "step": 10012 }, { "epoch": 0.6473050505050505, "grad_norm": 0.05350055173039436, "learning_rate": 0.0001794501976607179, "loss": 0.0932, "step": 10013 }, { "epoch": 0.6473696969696969, "grad_norm": 0.05380675569176674, "learning_rate": 0.0001794460446451579, "loss": 0.0806, "step": 10014 }, { "epoch": 0.6474343434343435, "grad_norm": 0.048019230365753174, "learning_rate": 0.000179441891258055, "loss": 0.0685, "step": 10015 }, { "epoch": 0.6474989898989899, "grad_norm": 0.05061103776097298, "learning_rate": 0.00017943773749942867, "loss": 0.0791, "step": 10016 }, { "epoch": 0.6474989898989899, "eval_bleu": 17.902919134019736, "eval_loss": 0.088566355407238, "eval_runtime": 2.7276, "eval_samples_per_second": 11.732, "eval_steps_per_second": 1.466, "step": 10016 }, { "epoch": 0.6475636363636363, "grad_norm": 0.04669830575585365, "learning_rate": 0.0001794335833692983, "loss": 0.0733, "step": 10017 }, { "epoch": 0.6476282828282828, "grad_norm": 0.05818536877632141, "learning_rate": 0.0001794294288676833, "loss": 0.0958, "step": 10018 }, { "epoch": 0.6476929292929293, "grad_norm": 0.05439404398202896, 
"learning_rate": 0.00017942527399460314, "loss": 0.0932, "step": 10019 }, { "epoch": 0.6477575757575758, "grad_norm": 0.04952364042401314, "learning_rate": 0.00017942111875007724, "loss": 0.0751, "step": 10020 }, { "epoch": 0.6478222222222222, "grad_norm": 0.052276402711868286, "learning_rate": 0.000179416963134125, "loss": 0.0864, "step": 10021 }, { "epoch": 0.6478868686868687, "grad_norm": 0.04831822216510773, "learning_rate": 0.0001794128071467659, "loss": 0.0707, "step": 10022 }, { "epoch": 0.6479515151515152, "grad_norm": 0.057348817586898804, "learning_rate": 0.00017940865078801933, "loss": 0.0765, "step": 10023 }, { "epoch": 0.6480161616161616, "grad_norm": 0.05772754177451134, "learning_rate": 0.00017940449405790475, "loss": 0.0897, "step": 10024 }, { "epoch": 0.648080808080808, "grad_norm": 0.04397277534008026, "learning_rate": 0.00017940033695644164, "loss": 0.0641, "step": 10025 }, { "epoch": 0.6481454545454546, "grad_norm": 0.06002219021320343, "learning_rate": 0.00017939617948364937, "loss": 0.0942, "step": 10026 }, { "epoch": 0.648210101010101, "grad_norm": 0.05074901133775711, "learning_rate": 0.00017939202163954742, "loss": 0.0865, "step": 10027 }, { "epoch": 0.6482747474747474, "grad_norm": 0.05554606765508652, "learning_rate": 0.00017938786342415526, "loss": 0.0802, "step": 10028 }, { "epoch": 0.648339393939394, "grad_norm": 0.057782020419836044, "learning_rate": 0.00017938370483749227, "loss": 0.0882, "step": 10029 }, { "epoch": 0.6484040404040404, "grad_norm": 0.05575177073478699, "learning_rate": 0.00017937954587957794, "loss": 0.0912, "step": 10030 }, { "epoch": 0.6484686868686869, "grad_norm": 0.05761939287185669, "learning_rate": 0.00017937538655043173, "loss": 0.0877, "step": 10031 }, { "epoch": 0.6485333333333333, "grad_norm": 0.050348713994026184, "learning_rate": 0.00017937122685007307, "loss": 0.0805, "step": 10032 }, { "epoch": 0.6485333333333333, "eval_bleu": 17.968122166755062, "eval_loss": 0.0889134332537651, "eval_runtime": 2.7114, 
"eval_samples_per_second": 11.802, "eval_steps_per_second": 1.475, "step": 10032 }, { "epoch": 0.6485979797979798, "grad_norm": 0.055425964295864105, "learning_rate": 0.00017936706677852145, "loss": 0.0796, "step": 10033 }, { "epoch": 0.6486626262626263, "grad_norm": 0.05505460873246193, "learning_rate": 0.00017936290633579626, "loss": 0.0813, "step": 10034 }, { "epoch": 0.6487272727272727, "grad_norm": 0.04897046089172363, "learning_rate": 0.000179358745521917, "loss": 0.0774, "step": 10035 }, { "epoch": 0.6487919191919191, "grad_norm": 0.05466997250914574, "learning_rate": 0.00017935458433690314, "loss": 0.0782, "step": 10036 }, { "epoch": 0.6488565656565657, "grad_norm": 0.048395831137895584, "learning_rate": 0.00017935042278077412, "loss": 0.0763, "step": 10037 }, { "epoch": 0.6489212121212121, "grad_norm": 0.06428859382867813, "learning_rate": 0.0001793462608535494, "loss": 0.0887, "step": 10038 }, { "epoch": 0.6489858585858586, "grad_norm": 0.051575180143117905, "learning_rate": 0.00017934209855524844, "loss": 0.0926, "step": 10039 }, { "epoch": 0.6490505050505051, "grad_norm": 0.053126413375139236, "learning_rate": 0.0001793379358858907, "loss": 0.0806, "step": 10040 }, { "epoch": 0.6491151515151515, "grad_norm": 0.04675816372036934, "learning_rate": 0.00017933377284549572, "loss": 0.0767, "step": 10041 }, { "epoch": 0.649179797979798, "grad_norm": 0.04623724892735481, "learning_rate": 0.00017932960943408286, "loss": 0.0634, "step": 10042 }, { "epoch": 0.6492444444444444, "grad_norm": 0.05650758370757103, "learning_rate": 0.00017932544565167166, "loss": 0.0969, "step": 10043 }, { "epoch": 0.649309090909091, "grad_norm": 0.05240333452820778, "learning_rate": 0.00017932128149828155, "loss": 0.0888, "step": 10044 }, { "epoch": 0.6493737373737374, "grad_norm": 0.06764701753854752, "learning_rate": 0.00017931711697393208, "loss": 0.0805, "step": 10045 }, { "epoch": 0.6494383838383838, "grad_norm": 0.05075753852725029, "learning_rate": 0.00017931295207864264, 
"loss": 0.0719, "step": 10046 }, { "epoch": 0.6495030303030302, "grad_norm": 0.05627547577023506, "learning_rate": 0.00017930878681243274, "loss": 0.0943, "step": 10047 }, { "epoch": 0.6495676767676768, "grad_norm": 0.05663019046187401, "learning_rate": 0.0001793046211753219, "loss": 0.0846, "step": 10048 }, { "epoch": 0.6495676767676768, "eval_bleu": 19.23202759900361, "eval_loss": 0.0909474790096283, "eval_runtime": 2.6823, "eval_samples_per_second": 11.93, "eval_steps_per_second": 1.491, "step": 10048 }, { "epoch": 0.6496323232323232, "grad_norm": 0.06630007922649384, "learning_rate": 0.00017930045516732955, "loss": 0.1035, "step": 10049 }, { "epoch": 0.6496969696969697, "grad_norm": 0.04933970794081688, "learning_rate": 0.0001792962887884752, "loss": 0.0727, "step": 10050 }, { "epoch": 0.6497616161616162, "grad_norm": 0.04488104581832886, "learning_rate": 0.00017929212203877826, "loss": 0.0683, "step": 10051 }, { "epoch": 0.6498262626262626, "grad_norm": 0.055514026433229446, "learning_rate": 0.00017928795491825832, "loss": 0.0817, "step": 10052 }, { "epoch": 0.6498909090909091, "grad_norm": 0.05832381919026375, "learning_rate": 0.00017928378742693482, "loss": 0.0946, "step": 10053 }, { "epoch": 0.6499555555555555, "grad_norm": 0.05143982172012329, "learning_rate": 0.00017927961956482727, "loss": 0.0959, "step": 10054 }, { "epoch": 0.650020202020202, "grad_norm": 0.05729496851563454, "learning_rate": 0.00017927545133195513, "loss": 0.0907, "step": 10055 }, { "epoch": 0.6500848484848485, "grad_norm": 0.0465690977871418, "learning_rate": 0.00017927128272833792, "loss": 0.0722, "step": 10056 }, { "epoch": 0.6501494949494949, "grad_norm": 0.05465994402766228, "learning_rate": 0.00017926711375399515, "loss": 0.0824, "step": 10057 }, { "epoch": 0.6502141414141415, "grad_norm": 0.05433119460940361, "learning_rate": 0.00017926294440894628, "loss": 0.0812, "step": 10058 }, { "epoch": 0.6502787878787879, "grad_norm": 0.05332766845822334, "learning_rate": 
0.0001792587746932108, "loss": 0.0797, "step": 10059 }, { "epoch": 0.6503434343434343, "grad_norm": 0.057419490069150925, "learning_rate": 0.00017925460460680825, "loss": 0.0938, "step": 10060 }, { "epoch": 0.6504080808080808, "grad_norm": 0.04813119024038315, "learning_rate": 0.00017925043414975813, "loss": 0.0658, "step": 10061 }, { "epoch": 0.6504727272727273, "grad_norm": 0.0630747377872467, "learning_rate": 0.0001792462633220799, "loss": 0.0843, "step": 10062 }, { "epoch": 0.6505373737373737, "grad_norm": 0.05489436537027359, "learning_rate": 0.00017924209212379313, "loss": 0.085, "step": 10063 }, { "epoch": 0.6506020202020202, "grad_norm": 0.04816500097513199, "learning_rate": 0.00017923792055491727, "loss": 0.0838, "step": 10064 }, { "epoch": 0.6506020202020202, "eval_bleu": 18.10096575216441, "eval_loss": 0.09006283432245255, "eval_runtime": 2.6246, "eval_samples_per_second": 12.192, "eval_steps_per_second": 1.524, "step": 10064 }, { "epoch": 0.6506666666666666, "grad_norm": 0.051668234169483185, "learning_rate": 0.00017923374861547183, "loss": 0.0835, "step": 10065 }, { "epoch": 0.6507313131313132, "grad_norm": 0.05258619785308838, "learning_rate": 0.0001792295763054764, "loss": 0.0809, "step": 10066 }, { "epoch": 0.6507959595959596, "grad_norm": 0.050463683903217316, "learning_rate": 0.00017922540362495036, "loss": 0.0774, "step": 10067 }, { "epoch": 0.650860606060606, "grad_norm": 0.05224032327532768, "learning_rate": 0.00017922123057391337, "loss": 0.081, "step": 10068 }, { "epoch": 0.6509252525252526, "grad_norm": 0.05625748634338379, "learning_rate": 0.00017921705715238484, "loss": 0.0846, "step": 10069 }, { "epoch": 0.650989898989899, "grad_norm": 0.04780224710702896, "learning_rate": 0.00017921288336038434, "loss": 0.0734, "step": 10070 }, { "epoch": 0.6510545454545454, "grad_norm": 0.053123362362384796, "learning_rate": 0.00017920870919793135, "loss": 0.0833, "step": 10071 }, { "epoch": 0.6511191919191919, "grad_norm": 0.0599016472697258, 
"learning_rate": 0.00017920453466504542, "loss": 0.1083, "step": 10072 }, { "epoch": 0.6511838383838384, "grad_norm": 0.05426918715238571, "learning_rate": 0.00017920035976174605, "loss": 0.0782, "step": 10073 }, { "epoch": 0.6512484848484849, "grad_norm": 0.04774899035692215, "learning_rate": 0.0001791961844880528, "loss": 0.0848, "step": 10074 }, { "epoch": 0.6513131313131313, "grad_norm": 0.06654895097017288, "learning_rate": 0.00017919200884398517, "loss": 0.1029, "step": 10075 }, { "epoch": 0.6513777777777778, "grad_norm": 0.050834231078624725, "learning_rate": 0.0001791878328295627, "loss": 0.08, "step": 10076 }, { "epoch": 0.6514424242424243, "grad_norm": 0.06976943463087082, "learning_rate": 0.00017918365644480492, "loss": 0.0935, "step": 10077 }, { "epoch": 0.6515070707070707, "grad_norm": 0.06334570050239563, "learning_rate": 0.00017917947968973134, "loss": 0.1003, "step": 10078 }, { "epoch": 0.6515717171717171, "grad_norm": 0.0530727319419384, "learning_rate": 0.00017917530256436154, "loss": 0.0804, "step": 10079 }, { "epoch": 0.6516363636363637, "grad_norm": 0.04721825197339058, "learning_rate": 0.00017917112506871498, "loss": 0.0733, "step": 10080 }, { "epoch": 0.6516363636363637, "eval_bleu": 18.580748438129955, "eval_loss": 0.09010928869247437, "eval_runtime": 2.6634, "eval_samples_per_second": 12.015, "eval_steps_per_second": 1.502, "step": 10080 }, { "epoch": 0.6517010101010101, "grad_norm": 0.0568452924489975, "learning_rate": 0.00017916694720281127, "loss": 0.0816, "step": 10081 }, { "epoch": 0.6517656565656565, "grad_norm": 0.05398707836866379, "learning_rate": 0.0001791627689666699, "loss": 0.0842, "step": 10082 }, { "epoch": 0.651830303030303, "grad_norm": 0.043016836047172546, "learning_rate": 0.00017915859036031045, "loss": 0.0611, "step": 10083 }, { "epoch": 0.6518949494949495, "grad_norm": 0.06322471797466278, "learning_rate": 0.00017915441138375245, "loss": 0.086, "step": 10084 }, { "epoch": 0.651959595959596, "grad_norm": 
0.06059642136096954, "learning_rate": 0.0001791502320370154, "loss": 0.0912, "step": 10085 }, { "epoch": 0.6520242424242424, "grad_norm": 0.0579603910446167, "learning_rate": 0.00017914605232011895, "loss": 0.0944, "step": 10086 }, { "epoch": 0.6520888888888889, "grad_norm": 0.053806014358997345, "learning_rate": 0.0001791418722330825, "loss": 0.0816, "step": 10087 }, { "epoch": 0.6521535353535354, "grad_norm": 0.06776726245880127, "learning_rate": 0.00017913769177592572, "loss": 0.0924, "step": 10088 }, { "epoch": 0.6522181818181818, "grad_norm": 0.07590265572071075, "learning_rate": 0.00017913351094866814, "loss": 0.085, "step": 10089 }, { "epoch": 0.6522828282828282, "grad_norm": 0.0519663468003273, "learning_rate": 0.00017912932975132923, "loss": 0.0744, "step": 10090 }, { "epoch": 0.6523474747474748, "grad_norm": 0.05797363445162773, "learning_rate": 0.00017912514818392862, "loss": 0.0744, "step": 10091 }, { "epoch": 0.6524121212121212, "grad_norm": 0.07021604478359222, "learning_rate": 0.00017912096624648588, "loss": 0.1079, "step": 10092 }, { "epoch": 0.6524767676767677, "grad_norm": 0.05279383435845375, "learning_rate": 0.00017911678393902052, "loss": 0.0871, "step": 10093 }, { "epoch": 0.6525414141414141, "grad_norm": 0.053974222391843796, "learning_rate": 0.00017911260126155215, "loss": 0.0824, "step": 10094 }, { "epoch": 0.6526060606060606, "grad_norm": 0.05208023265004158, "learning_rate": 0.00017910841821410024, "loss": 0.0704, "step": 10095 }, { "epoch": 0.6526707070707071, "grad_norm": 0.0530535914003849, "learning_rate": 0.00017910423479668443, "loss": 0.0825, "step": 10096 }, { "epoch": 0.6526707070707071, "eval_bleu": 19.561011438848787, "eval_loss": 0.09018561244010925, "eval_runtime": 2.6661, "eval_samples_per_second": 12.003, "eval_steps_per_second": 1.5, "step": 10096 }, { "epoch": 0.6527353535353535, "grad_norm": 0.04831776022911072, "learning_rate": 0.00017910005100932428, "loss": 0.0732, "step": 10097 }, { "epoch": 0.6528, "grad_norm": 
0.05325857177376747, "learning_rate": 0.0001790958668520393, "loss": 0.0802, "step": 10098 }, { "epoch": 0.6528646464646465, "grad_norm": 0.0609809048473835, "learning_rate": 0.00017909168232484917, "loss": 0.0997, "step": 10099 }, { "epoch": 0.6529292929292929, "grad_norm": 0.04706213250756264, "learning_rate": 0.00017908749742777334, "loss": 0.0779, "step": 10100 }, { "epoch": 0.6529939393939393, "grad_norm": 0.04882325232028961, "learning_rate": 0.00017908331216083144, "loss": 0.0673, "step": 10101 }, { "epoch": 0.6530585858585859, "grad_norm": 0.04605832323431969, "learning_rate": 0.00017907912652404307, "loss": 0.0804, "step": 10102 }, { "epoch": 0.6531232323232323, "grad_norm": 0.06498763710260391, "learning_rate": 0.0001790749405174277, "loss": 0.0926, "step": 10103 }, { "epoch": 0.6531878787878788, "grad_norm": 0.04398577660322189, "learning_rate": 0.00017907075414100502, "loss": 0.057, "step": 10104 }, { "epoch": 0.6532525252525253, "grad_norm": 0.061487484723329544, "learning_rate": 0.00017906656739479458, "loss": 0.1026, "step": 10105 }, { "epoch": 0.6533171717171717, "grad_norm": 0.05337060987949371, "learning_rate": 0.00017906238027881588, "loss": 0.0854, "step": 10106 }, { "epoch": 0.6533818181818182, "grad_norm": 0.052252646535634995, "learning_rate": 0.0001790581927930886, "loss": 0.0756, "step": 10107 }, { "epoch": 0.6534464646464646, "grad_norm": 0.05328037217259407, "learning_rate": 0.0001790540049376323, "loss": 0.0842, "step": 10108 }, { "epoch": 0.6535111111111112, "grad_norm": 0.06372325867414474, "learning_rate": 0.00017904981671246654, "loss": 0.0805, "step": 10109 }, { "epoch": 0.6535757575757576, "grad_norm": 0.08388439565896988, "learning_rate": 0.00017904562811761092, "loss": 0.0991, "step": 10110 }, { "epoch": 0.653640404040404, "grad_norm": 0.056230880320072174, "learning_rate": 0.00017904143915308505, "loss": 0.0936, "step": 10111 }, { "epoch": 0.6537050505050505, "grad_norm": 0.06056665629148483, "learning_rate": 
0.00017903724981890848, "loss": 0.1065, "step": 10112 }, { "epoch": 0.6537050505050505, "eval_bleu": 18.827420187567437, "eval_loss": 0.08915296196937561, "eval_runtime": 2.7903, "eval_samples_per_second": 11.468, "eval_steps_per_second": 1.434, "step": 10112 }, { "epoch": 0.653769696969697, "grad_norm": 0.07179708033800125, "learning_rate": 0.00017903306011510082, "loss": 0.1069, "step": 10113 }, { "epoch": 0.6538343434343434, "grad_norm": 0.057501472532749176, "learning_rate": 0.00017902887004168165, "loss": 0.0981, "step": 10114 }, { "epoch": 0.6538989898989899, "grad_norm": 0.06711297482252121, "learning_rate": 0.00017902467959867062, "loss": 0.1154, "step": 10115 }, { "epoch": 0.6539636363636364, "grad_norm": 0.05166565254330635, "learning_rate": 0.00017902048878608727, "loss": 0.0879, "step": 10116 }, { "epoch": 0.6540282828282828, "grad_norm": 0.05308091267943382, "learning_rate": 0.0001790162976039512, "loss": 0.0804, "step": 10117 }, { "epoch": 0.6540929292929293, "grad_norm": 0.05578884109854698, "learning_rate": 0.00017901210605228207, "loss": 0.091, "step": 10118 }, { "epoch": 0.6541575757575757, "grad_norm": 0.04314521700143814, "learning_rate": 0.00017900791413109942, "loss": 0.0727, "step": 10119 }, { "epoch": 0.6542222222222223, "grad_norm": 0.06520584225654602, "learning_rate": 0.00017900372184042284, "loss": 0.102, "step": 10120 }, { "epoch": 0.6542868686868687, "grad_norm": 0.0466947928071022, "learning_rate": 0.000178999529180272, "loss": 0.069, "step": 10121 }, { "epoch": 0.6543515151515151, "grad_norm": 0.05664955824613571, "learning_rate": 0.00017899533615066648, "loss": 0.0977, "step": 10122 }, { "epoch": 0.6544161616161616, "grad_norm": 0.04561570659279823, "learning_rate": 0.0001789911427516259, "loss": 0.0674, "step": 10123 }, { "epoch": 0.6544808080808081, "grad_norm": 0.053774476051330566, "learning_rate": 0.00017898694898316986, "loss": 0.0854, "step": 10124 }, { "epoch": 0.6545454545454545, "grad_norm": 0.050335876643657684, 
"learning_rate": 0.00017898275484531798, "loss": 0.08, "step": 10125 }, { "epoch": 0.654610101010101, "grad_norm": 0.08193143457174301, "learning_rate": 0.00017897856033808983, "loss": 0.0909, "step": 10126 }, { "epoch": 0.6546747474747475, "grad_norm": 0.06001056730747223, "learning_rate": 0.0001789743654615051, "loss": 0.0962, "step": 10127 }, { "epoch": 0.654739393939394, "grad_norm": 0.04965902864933014, "learning_rate": 0.00017897017021558333, "loss": 0.0717, "step": 10128 }, { "epoch": 0.654739393939394, "eval_bleu": 15.473597120451682, "eval_loss": 0.09002748876810074, "eval_runtime": 2.6612, "eval_samples_per_second": 12.024, "eval_steps_per_second": 1.503, "step": 10128 }, { "epoch": 0.6548040404040404, "grad_norm": 0.05415545403957367, "learning_rate": 0.0001789659746003442, "loss": 0.0846, "step": 10129 }, { "epoch": 0.6548686868686868, "grad_norm": 0.05744028836488724, "learning_rate": 0.00017896177861580732, "loss": 0.0961, "step": 10130 }, { "epoch": 0.6549333333333334, "grad_norm": 0.055262256413698196, "learning_rate": 0.0001789575822619923, "loss": 0.0835, "step": 10131 }, { "epoch": 0.6549979797979798, "grad_norm": 0.05877065658569336, "learning_rate": 0.00017895338553891875, "loss": 0.0806, "step": 10132 }, { "epoch": 0.6550626262626262, "grad_norm": 0.05554532632231712, "learning_rate": 0.00017894918844660635, "loss": 0.0908, "step": 10133 }, { "epoch": 0.6551272727272728, "grad_norm": 0.05720461905002594, "learning_rate": 0.0001789449909850747, "loss": 0.095, "step": 10134 }, { "epoch": 0.6551919191919192, "grad_norm": 0.05910974368453026, "learning_rate": 0.0001789407931543434, "loss": 0.0964, "step": 10135 }, { "epoch": 0.6552565656565656, "grad_norm": 0.05900576710700989, "learning_rate": 0.0001789365949544321, "loss": 0.0856, "step": 10136 }, { "epoch": 0.6553212121212121, "grad_norm": 0.049430686980485916, "learning_rate": 0.0001789323963853605, "loss": 0.0835, "step": 10137 }, { "epoch": 0.6553858585858586, "grad_norm": 
0.048249583691358566, "learning_rate": 0.00017892819744714811, "loss": 0.0695, "step": 10138 }, { "epoch": 0.6554505050505051, "grad_norm": 0.05674559250473976, "learning_rate": 0.00017892399813981466, "loss": 0.0888, "step": 10139 }, { "epoch": 0.6555151515151515, "grad_norm": 0.05025885999202728, "learning_rate": 0.0001789197984633798, "loss": 0.0848, "step": 10140 }, { "epoch": 0.6555797979797979, "grad_norm": 0.05610506236553192, "learning_rate": 0.00017891559841786306, "loss": 0.0943, "step": 10141 }, { "epoch": 0.6556444444444445, "grad_norm": 0.05780574679374695, "learning_rate": 0.00017891139800328422, "loss": 0.0694, "step": 10142 }, { "epoch": 0.6557090909090909, "grad_norm": 0.050133153796195984, "learning_rate": 0.00017890719721966281, "loss": 0.0771, "step": 10143 }, { "epoch": 0.6557737373737373, "grad_norm": 0.058120597153902054, "learning_rate": 0.00017890299606701854, "loss": 0.0887, "step": 10144 }, { "epoch": 0.6557737373737373, "eval_bleu": 14.119617531794367, "eval_loss": 0.08950690180063248, "eval_runtime": 2.6799, "eval_samples_per_second": 11.941, "eval_steps_per_second": 1.493, "step": 10144 }, { "epoch": 0.6558383838383839, "grad_norm": 0.05844513326883316, "learning_rate": 0.00017889879454537107, "loss": 0.0718, "step": 10145 }, { "epoch": 0.6559030303030303, "grad_norm": 0.0535377562046051, "learning_rate": 0.00017889459265473998, "loss": 0.0924, "step": 10146 }, { "epoch": 0.6559676767676768, "grad_norm": 0.053127776831388474, "learning_rate": 0.00017889039039514497, "loss": 0.081, "step": 10147 }, { "epoch": 0.6560323232323232, "grad_norm": 0.05815545469522476, "learning_rate": 0.0001788861877666057, "loss": 0.1029, "step": 10148 }, { "epoch": 0.6560969696969697, "grad_norm": 0.055379971861839294, "learning_rate": 0.0001788819847691418, "loss": 0.0946, "step": 10149 }, { "epoch": 0.6561616161616162, "grad_norm": 0.050712209194898605, "learning_rate": 0.00017887778140277295, "loss": 0.0805, "step": 10150 }, { "epoch": 
0.6562262626262626, "grad_norm": 0.05346331745386124, "learning_rate": 0.00017887357766751877, "loss": 0.0756, "step": 10151 }, { "epoch": 0.656290909090909, "grad_norm": 0.06442822515964508, "learning_rate": 0.00017886937356339898, "loss": 0.1027, "step": 10152 }, { "epoch": 0.6563555555555556, "grad_norm": 0.04459696263074875, "learning_rate": 0.00017886516909043317, "loss": 0.07, "step": 10153 }, { "epoch": 0.656420202020202, "grad_norm": 0.05270623415708542, "learning_rate": 0.00017886096424864106, "loss": 0.0887, "step": 10154 }, { "epoch": 0.6564848484848484, "grad_norm": 0.054713934659957886, "learning_rate": 0.00017885675903804224, "loss": 0.0808, "step": 10155 }, { "epoch": 0.656549494949495, "grad_norm": 0.05736137554049492, "learning_rate": 0.00017885255345865647, "loss": 0.0793, "step": 10156 }, { "epoch": 0.6566141414141414, "grad_norm": 0.06490117311477661, "learning_rate": 0.0001788483475105034, "loss": 0.0979, "step": 10157 }, { "epoch": 0.6566787878787879, "grad_norm": 0.05063994228839874, "learning_rate": 0.00017884414119360262, "loss": 0.0764, "step": 10158 }, { "epoch": 0.6567434343434343, "grad_norm": 0.055047664791345596, "learning_rate": 0.00017883993450797388, "loss": 0.0897, "step": 10159 }, { "epoch": 0.6568080808080808, "grad_norm": 0.04428455978631973, "learning_rate": 0.00017883572745363684, "loss": 0.0744, "step": 10160 }, { "epoch": 0.6568080808080808, "eval_bleu": 17.91798068235138, "eval_loss": 0.08936188369989395, "eval_runtime": 2.7133, "eval_samples_per_second": 11.794, "eval_steps_per_second": 1.474, "step": 10160 }, { "epoch": 0.6568727272727273, "grad_norm": 0.048937395215034485, "learning_rate": 0.00017883152003061114, "loss": 0.0743, "step": 10161 }, { "epoch": 0.6569373737373737, "grad_norm": 0.08113251626491547, "learning_rate": 0.0001788273122389165, "loss": 0.1287, "step": 10162 }, { "epoch": 0.6570020202020203, "grad_norm": 0.05187812075018883, "learning_rate": 0.0001788231040785726, "loss": 0.0874, "step": 10163 }, { 
"epoch": 0.6570666666666667, "grad_norm": 0.04070551320910454, "learning_rate": 0.00017881889554959907, "loss": 0.0586, "step": 10164 }, { "epoch": 0.6571313131313131, "grad_norm": 0.05705524608492851, "learning_rate": 0.00017881468665201564, "loss": 0.0938, "step": 10165 }, { "epoch": 0.6571959595959596, "grad_norm": 0.0503280945122242, "learning_rate": 0.00017881047738584196, "loss": 0.0707, "step": 10166 }, { "epoch": 0.6572606060606061, "grad_norm": 0.05320427566766739, "learning_rate": 0.0001788062677510977, "loss": 0.0889, "step": 10167 }, { "epoch": 0.6573252525252525, "grad_norm": 0.05453011393547058, "learning_rate": 0.00017880205774780265, "loss": 0.0793, "step": 10168 }, { "epoch": 0.657389898989899, "grad_norm": 0.06043929606676102, "learning_rate": 0.0001787978473759764, "loss": 0.0914, "step": 10169 }, { "epoch": 0.6574545454545454, "grad_norm": 0.047348372638225555, "learning_rate": 0.0001787936366356387, "loss": 0.0748, "step": 10170 }, { "epoch": 0.657519191919192, "grad_norm": 0.05457662045955658, "learning_rate": 0.00017878942552680913, "loss": 0.0923, "step": 10171 }, { "epoch": 0.6575838383838384, "grad_norm": 0.05332671105861664, "learning_rate": 0.00017878521404950752, "loss": 0.09, "step": 10172 }, { "epoch": 0.6576484848484848, "grad_norm": 0.04717263579368591, "learning_rate": 0.0001787810022037535, "loss": 0.072, "step": 10173 }, { "epoch": 0.6577131313131314, "grad_norm": 0.04712047055363655, "learning_rate": 0.00017877678998956678, "loss": 0.0729, "step": 10174 }, { "epoch": 0.6577777777777778, "grad_norm": 0.055206023156642914, "learning_rate": 0.00017877257740696705, "loss": 0.0851, "step": 10175 }, { "epoch": 0.6578424242424242, "grad_norm": 0.044657666236162186, "learning_rate": 0.00017876836445597402, "loss": 0.0752, "step": 10176 }, { "epoch": 0.6578424242424242, "eval_bleu": 17.53061066774975, "eval_loss": 0.0893302857875824, "eval_runtime": 2.7776, "eval_samples_per_second": 11.521, "eval_steps_per_second": 1.44, "step": 10176 
}, { "epoch": 0.6579070707070707, "grad_norm": 0.05772417038679123, "learning_rate": 0.0001787641511366074, "loss": 0.0837, "step": 10177 }, { "epoch": 0.6579717171717172, "grad_norm": 0.05361907556653023, "learning_rate": 0.00017875993744888687, "loss": 0.0927, "step": 10178 }, { "epoch": 0.6580363636363636, "grad_norm": 0.05155982822179794, "learning_rate": 0.00017875572339283213, "loss": 0.0834, "step": 10179 }, { "epoch": 0.6581010101010101, "grad_norm": 0.04942192882299423, "learning_rate": 0.00017875150896846294, "loss": 0.0813, "step": 10180 }, { "epoch": 0.6581656565656566, "grad_norm": 0.05193647742271423, "learning_rate": 0.00017874729417579897, "loss": 0.0797, "step": 10181 }, { "epoch": 0.658230303030303, "grad_norm": 0.05970875918865204, "learning_rate": 0.00017874307901485993, "loss": 0.0947, "step": 10182 }, { "epoch": 0.6582949494949495, "grad_norm": 0.0498674176633358, "learning_rate": 0.00017873886348566552, "loss": 0.0837, "step": 10183 }, { "epoch": 0.6583595959595959, "grad_norm": 0.05526595935225487, "learning_rate": 0.00017873464758823553, "loss": 0.0911, "step": 10184 }, { "epoch": 0.6584242424242425, "grad_norm": 0.069437175989151, "learning_rate": 0.0001787304313225896, "loss": 0.1083, "step": 10185 }, { "epoch": 0.6584888888888889, "grad_norm": 0.053360626101493835, "learning_rate": 0.00017872621468874748, "loss": 0.0882, "step": 10186 }, { "epoch": 0.6585535353535353, "grad_norm": 0.054374128580093384, "learning_rate": 0.00017872199768672884, "loss": 0.0932, "step": 10187 }, { "epoch": 0.6586181818181818, "grad_norm": 0.04712884500622749, "learning_rate": 0.00017871778031655348, "loss": 0.0743, "step": 10188 }, { "epoch": 0.6586828282828283, "grad_norm": 0.06002098694443703, "learning_rate": 0.00017871356257824107, "loss": 0.1039, "step": 10189 }, { "epoch": 0.6587474747474747, "grad_norm": 0.05316292494535446, "learning_rate": 0.00017870934447181136, "loss": 0.0929, "step": 10190 }, { "epoch": 0.6588121212121212, "grad_norm": 
0.04594529792666435, "learning_rate": 0.00017870512599728406, "loss": 0.0768, "step": 10191 }, { "epoch": 0.6588767676767677, "grad_norm": 0.049901530146598816, "learning_rate": 0.00017870090715467888, "loss": 0.0874, "step": 10192 }, { "epoch": 0.6588767676767677, "eval_bleu": 17.106215099917776, "eval_loss": 0.08910731226205826, "eval_runtime": 2.7687, "eval_samples_per_second": 11.558, "eval_steps_per_second": 1.445, "step": 10192 }, { "epoch": 0.6589414141414142, "grad_norm": 0.056745514273643494, "learning_rate": 0.0001786966879440156, "loss": 0.0851, "step": 10193 }, { "epoch": 0.6590060606060606, "grad_norm": 0.04797672480344772, "learning_rate": 0.00017869246836531394, "loss": 0.0766, "step": 10194 }, { "epoch": 0.659070707070707, "grad_norm": 0.04788314178586006, "learning_rate": 0.00017868824841859362, "loss": 0.069, "step": 10195 }, { "epoch": 0.6591353535353536, "grad_norm": 0.05870746448636055, "learning_rate": 0.00017868402810387437, "loss": 0.1031, "step": 10196 }, { "epoch": 0.6592, "grad_norm": 0.053959473967552185, "learning_rate": 0.00017867980742117593, "loss": 0.0826, "step": 10197 }, { "epoch": 0.6592646464646464, "grad_norm": 0.05360955744981766, "learning_rate": 0.00017867558637051802, "loss": 0.0912, "step": 10198 }, { "epoch": 0.6593292929292929, "grad_norm": 0.048823677003383636, "learning_rate": 0.00017867136495192045, "loss": 0.0713, "step": 10199 }, { "epoch": 0.6593939393939394, "grad_norm": 0.05271755903959274, "learning_rate": 0.00017866714316540287, "loss": 0.0882, "step": 10200 }, { "epoch": 0.6594585858585859, "grad_norm": 0.05371154844760895, "learning_rate": 0.0001786629210109851, "loss": 0.083, "step": 10201 }, { "epoch": 0.6595232323232323, "grad_norm": 0.053528793156147, "learning_rate": 0.0001786586984886868, "loss": 0.0765, "step": 10202 }, { "epoch": 0.6595878787878788, "grad_norm": 0.049326200038194656, "learning_rate": 0.0001786544755985278, "loss": 0.0825, "step": 10203 }, { "epoch": 0.6596525252525253, "grad_norm": 
0.07017800956964493, "learning_rate": 0.00017865025234052785, "loss": 0.0919, "step": 10204 }, { "epoch": 0.6597171717171717, "grad_norm": 0.05874167010188103, "learning_rate": 0.00017864602871470666, "loss": 0.0914, "step": 10205 }, { "epoch": 0.6597818181818181, "grad_norm": 0.05697527900338173, "learning_rate": 0.00017864180472108399, "loss": 0.0826, "step": 10206 }, { "epoch": 0.6598464646464647, "grad_norm": 0.04281046614050865, "learning_rate": 0.0001786375803596796, "loss": 0.0691, "step": 10207 }, { "epoch": 0.6599111111111111, "grad_norm": 0.059006255120038986, "learning_rate": 0.00017863335563051322, "loss": 0.0786, "step": 10208 }, { "epoch": 0.6599111111111111, "eval_bleu": 17.27352835917634, "eval_loss": 0.08864007890224457, "eval_runtime": 2.7077, "eval_samples_per_second": 11.818, "eval_steps_per_second": 1.477, "step": 10208 }, { "epoch": 0.6599757575757575, "grad_norm": 0.05239826440811157, "learning_rate": 0.00017862913053360465, "loss": 0.0821, "step": 10209 }, { "epoch": 0.6600404040404041, "grad_norm": 0.05153816565871239, "learning_rate": 0.00017862490506897363, "loss": 0.0748, "step": 10210 }, { "epoch": 0.6601050505050505, "grad_norm": 0.060320738703012466, "learning_rate": 0.0001786206792366399, "loss": 0.0902, "step": 10211 }, { "epoch": 0.660169696969697, "grad_norm": 0.04577665403485298, "learning_rate": 0.00017861645303662326, "loss": 0.071, "step": 10212 }, { "epoch": 0.6602343434343434, "grad_norm": 0.05457471311092377, "learning_rate": 0.00017861222646894346, "loss": 0.0903, "step": 10213 }, { "epoch": 0.6602989898989899, "grad_norm": 0.04857414588332176, "learning_rate": 0.0001786079995336203, "loss": 0.0671, "step": 10214 }, { "epoch": 0.6603636363636364, "grad_norm": 0.042382244020700455, "learning_rate": 0.00017860377223067347, "loss": 0.0639, "step": 10215 }, { "epoch": 0.6604282828282828, "grad_norm": 0.05418889597058296, "learning_rate": 0.00017859954456012281, "loss": 0.0814, "step": 10216 }, { "epoch": 0.6604929292929292, 
"grad_norm": 0.052340954542160034, "learning_rate": 0.00017859531652198804, "loss": 0.066, "step": 10217 }, { "epoch": 0.6605575757575758, "grad_norm": 0.060837775468826294, "learning_rate": 0.00017859108811628898, "loss": 0.0914, "step": 10218 }, { "epoch": 0.6606222222222222, "grad_norm": 0.05589916929602623, "learning_rate": 0.00017858685934304535, "loss": 0.0844, "step": 10219 }, { "epoch": 0.6606868686868687, "grad_norm": 0.06022046133875847, "learning_rate": 0.000178582630202277, "loss": 0.0966, "step": 10220 }, { "epoch": 0.6607515151515152, "grad_norm": 0.04626772552728653, "learning_rate": 0.00017857840069400362, "loss": 0.074, "step": 10221 }, { "epoch": 0.6608161616161616, "grad_norm": 0.050533194094896317, "learning_rate": 0.00017857417081824508, "loss": 0.0789, "step": 10222 }, { "epoch": 0.6608808080808081, "grad_norm": 0.05423295497894287, "learning_rate": 0.0001785699405750211, "loss": 0.0817, "step": 10223 }, { "epoch": 0.6609454545454545, "grad_norm": 0.06742218881845474, "learning_rate": 0.0001785657099643515, "loss": 0.1009, "step": 10224 }, { "epoch": 0.6609454545454545, "eval_bleu": 18.07479694944291, "eval_loss": 0.09022768586874008, "eval_runtime": 2.6423, "eval_samples_per_second": 12.111, "eval_steps_per_second": 1.514, "step": 10224 }, { "epoch": 0.661010101010101, "grad_norm": 0.050451721996068954, "learning_rate": 0.00017856147898625603, "loss": 0.0768, "step": 10225 }, { "epoch": 0.6610747474747475, "grad_norm": 0.05541658401489258, "learning_rate": 0.00017855724764075448, "loss": 0.0918, "step": 10226 }, { "epoch": 0.6611393939393939, "grad_norm": 0.046438608318567276, "learning_rate": 0.0001785530159278667, "loss": 0.0764, "step": 10227 }, { "epoch": 0.6612040404040403, "grad_norm": 0.05167097598314285, "learning_rate": 0.00017854878384761239, "loss": 0.0847, "step": 10228 }, { "epoch": 0.6612686868686869, "grad_norm": 0.05451875552535057, "learning_rate": 0.0001785445514000114, "loss": 0.0823, "step": 10229 }, { "epoch": 
0.6613333333333333, "grad_norm": 0.05058397352695465, "learning_rate": 0.0001785403185850835, "loss": 0.0774, "step": 10230 }, { "epoch": 0.6613979797979798, "grad_norm": 0.04819513484835625, "learning_rate": 0.00017853608540284853, "loss": 0.0726, "step": 10231 }, { "epoch": 0.6614626262626263, "grad_norm": 0.05068168789148331, "learning_rate": 0.00017853185185332622, "loss": 0.0821, "step": 10232 }, { "epoch": 0.6615272727272727, "grad_norm": 0.05111650377511978, "learning_rate": 0.0001785276179365364, "loss": 0.079, "step": 10233 }, { "epoch": 0.6615919191919192, "grad_norm": 0.04842058941721916, "learning_rate": 0.0001785233836524989, "loss": 0.069, "step": 10234 }, { "epoch": 0.6616565656565656, "grad_norm": 0.053757231682538986, "learning_rate": 0.00017851914900123345, "loss": 0.089, "step": 10235 }, { "epoch": 0.6617212121212122, "grad_norm": 0.0551467165350914, "learning_rate": 0.00017851491398275995, "loss": 0.0958, "step": 10236 }, { "epoch": 0.6617858585858586, "grad_norm": 0.06525532156229019, "learning_rate": 0.0001785106785970981, "loss": 0.1029, "step": 10237 }, { "epoch": 0.661850505050505, "grad_norm": 0.05554277449846268, "learning_rate": 0.0001785064428442678, "loss": 0.09, "step": 10238 }, { "epoch": 0.6619151515151516, "grad_norm": 0.052880171686410904, "learning_rate": 0.00017850220672428882, "loss": 0.0865, "step": 10239 }, { "epoch": 0.661979797979798, "grad_norm": 0.04797482118010521, "learning_rate": 0.00017849797023718094, "loss": 0.0713, "step": 10240 }, { "epoch": 0.661979797979798, "eval_bleu": 17.001603875569575, "eval_loss": 0.08813238143920898, "eval_runtime": 2.7289, "eval_samples_per_second": 11.726, "eval_steps_per_second": 1.466, "step": 10240 }, { "epoch": 0.6620444444444444, "grad_norm": 0.049845460802316666, "learning_rate": 0.00017849373338296403, "loss": 0.0913, "step": 10241 }, { "epoch": 0.6621090909090909, "grad_norm": 0.0473443903028965, "learning_rate": 0.00017848949616165787, "loss": 0.0742, "step": 10242 }, { 
"epoch": 0.6621737373737374, "grad_norm": 0.05419163033366203, "learning_rate": 0.0001784852585732823, "loss": 0.0863, "step": 10243 }, { "epoch": 0.6622383838383838, "grad_norm": 0.06253422796726227, "learning_rate": 0.00017848102061785709, "loss": 0.1065, "step": 10244 }, { "epoch": 0.6623030303030303, "grad_norm": 0.057101961225271225, "learning_rate": 0.0001784767822954021, "loss": 0.1016, "step": 10245 }, { "epoch": 0.6623676767676767, "grad_norm": 0.06272678822278976, "learning_rate": 0.00017847254360593717, "loss": 0.0794, "step": 10246 }, { "epoch": 0.6624323232323233, "grad_norm": 0.051759131252765656, "learning_rate": 0.00017846830454948208, "loss": 0.0738, "step": 10247 }, { "epoch": 0.6624969696969697, "grad_norm": 0.05439075082540512, "learning_rate": 0.00017846406512605668, "loss": 0.0944, "step": 10248 }, { "epoch": 0.6625616161616161, "grad_norm": 0.05330037698149681, "learning_rate": 0.00017845982533568075, "loss": 0.088, "step": 10249 }, { "epoch": 0.6626262626262627, "grad_norm": 0.04516725614666939, "learning_rate": 0.0001784555851783742, "loss": 0.0759, "step": 10250 }, { "epoch": 0.6626909090909091, "grad_norm": 0.054199665784835815, "learning_rate": 0.0001784513446541568, "loss": 0.0869, "step": 10251 }, { "epoch": 0.6627555555555555, "grad_norm": 0.05711120367050171, "learning_rate": 0.0001784471037630484, "loss": 0.0828, "step": 10252 }, { "epoch": 0.662820202020202, "grad_norm": 0.049388039857149124, "learning_rate": 0.00017844286250506884, "loss": 0.0858, "step": 10253 }, { "epoch": 0.6628848484848485, "grad_norm": 0.04961549863219261, "learning_rate": 0.0001784386208802379, "loss": 0.0852, "step": 10254 }, { "epoch": 0.662949494949495, "grad_norm": 0.05711861327290535, "learning_rate": 0.0001784343788885755, "loss": 0.0849, "step": 10255 }, { "epoch": 0.6630141414141414, "grad_norm": 0.06972243636846542, "learning_rate": 0.00017843013653010144, "loss": 0.1039, "step": 10256 }, { "epoch": 0.6630141414141414, "eval_bleu": 
20.086468326664342, "eval_loss": 0.08900181949138641, "eval_runtime": 2.8036, "eval_samples_per_second": 11.414, "eval_steps_per_second": 1.427, "step": 10256 }, { "epoch": 0.6630787878787879, "grad_norm": 0.041810378432273865, "learning_rate": 0.00017842589380483555, "loss": 0.0573, "step": 10257 }, { "epoch": 0.6631434343434344, "grad_norm": 0.04772426187992096, "learning_rate": 0.0001784216507127977, "loss": 0.0701, "step": 10258 }, { "epoch": 0.6632080808080808, "grad_norm": 0.057533323764801025, "learning_rate": 0.0001784174072540077, "loss": 0.0991, "step": 10259 }, { "epoch": 0.6632727272727272, "grad_norm": 0.08679311722517014, "learning_rate": 0.00017841316342848539, "loss": 0.0696, "step": 10260 }, { "epoch": 0.6633373737373738, "grad_norm": 0.05634527653455734, "learning_rate": 0.00017840891923625064, "loss": 0.0831, "step": 10261 }, { "epoch": 0.6634020202020202, "grad_norm": 0.055026594549417496, "learning_rate": 0.00017840467467732332, "loss": 0.0806, "step": 10262 }, { "epoch": 0.6634666666666666, "grad_norm": 0.04825066402554512, "learning_rate": 0.00017840042975172328, "loss": 0.073, "step": 10263 }, { "epoch": 0.6635313131313131, "grad_norm": 0.05237942561507225, "learning_rate": 0.00017839618445947029, "loss": 0.081, "step": 10264 }, { "epoch": 0.6635959595959596, "grad_norm": 0.0542687326669693, "learning_rate": 0.0001783919388005843, "loss": 0.078, "step": 10265 }, { "epoch": 0.6636606060606061, "grad_norm": 0.055040229111909866, "learning_rate": 0.0001783876927750851, "loss": 0.0889, "step": 10266 }, { "epoch": 0.6637252525252525, "grad_norm": 0.05319792032241821, "learning_rate": 0.0001783834463829926, "loss": 0.0881, "step": 10267 }, { "epoch": 0.663789898989899, "grad_norm": 0.040569208562374115, "learning_rate": 0.00017837919962432664, "loss": 0.0637, "step": 10268 }, { "epoch": 0.6638545454545455, "grad_norm": 0.0480937659740448, "learning_rate": 0.00017837495249910706, "loss": 0.0747, "step": 10269 }, { "epoch": 0.6639191919191919, 
"grad_norm": 0.05075068771839142, "learning_rate": 0.00017837070500735373, "loss": 0.0812, "step": 10270 }, { "epoch": 0.6639838383838383, "grad_norm": 0.053749457001686096, "learning_rate": 0.00017836645714908652, "loss": 0.0915, "step": 10271 }, { "epoch": 0.6640484848484849, "grad_norm": 0.055688828229904175, "learning_rate": 0.00017836220892432532, "loss": 0.0856, "step": 10272 }, { "epoch": 0.6640484848484849, "eval_bleu": 17.997460933013013, "eval_loss": 0.08846329897642136, "eval_runtime": 2.6835, "eval_samples_per_second": 11.925, "eval_steps_per_second": 1.491, "step": 10272 } ], "logging_steps": 1, "max_steps": 46404, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 16, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0016689445863424e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }