{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5352, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005605381165919282, "grad_norm": 0.978611306363841, "learning_rate": 3.7313432835820895e-07, "loss": 1.3562, "step": 1 }, { "epoch": 0.0011210762331838565, "grad_norm": 0.9785669708543357, "learning_rate": 7.462686567164179e-07, "loss": 1.3832, "step": 2 }, { "epoch": 0.0016816143497757848, "grad_norm": 0.965928509838546, "learning_rate": 1.119402985074627e-06, "loss": 1.3498, "step": 3 }, { "epoch": 0.002242152466367713, "grad_norm": 1.0011476156785488, "learning_rate": 1.4925373134328358e-06, "loss": 1.3665, "step": 4 }, { "epoch": 0.002802690582959641, "grad_norm": 0.9743028706157322, "learning_rate": 1.8656716417910446e-06, "loss": 1.3718, "step": 5 }, { "epoch": 0.0033632286995515697, "grad_norm": 0.9549639453848878, "learning_rate": 2.238805970149254e-06, "loss": 1.355, "step": 6 }, { "epoch": 0.003923766816143498, "grad_norm": 0.9444168662872774, "learning_rate": 2.6119402985074627e-06, "loss": 1.3296, "step": 7 }, { "epoch": 0.004484304932735426, "grad_norm": 0.9653130750500606, "learning_rate": 2.9850746268656716e-06, "loss": 1.3505, "step": 8 }, { "epoch": 0.005044843049327354, "grad_norm": 0.9728971645466663, "learning_rate": 3.358208955223881e-06, "loss": 1.3525, "step": 9 }, { "epoch": 0.005605381165919282, "grad_norm": 0.9263086036248187, "learning_rate": 3.7313432835820893e-06, "loss": 1.3388, "step": 10 }, { "epoch": 0.00616591928251121, "grad_norm": 0.9753471433575638, "learning_rate": 4.1044776119402985e-06, "loss": 1.3496, "step": 11 }, { "epoch": 0.006726457399103139, "grad_norm": 0.9040200360486869, "learning_rate": 4.477611940298508e-06, "loss": 1.3239, "step": 12 }, { "epoch": 0.0072869955156950675, "grad_norm": 0.9437511706878353, "learning_rate": 4.850746268656717e-06, "loss": 1.3111, "step": 13 }, { "epoch": 0.007847533632286996, "grad_norm": 0.9494404428795452, "learning_rate": 5.2238805970149255e-06, "loss": 1.3522, "step": 14 }, { "epoch": 0.008408071748878924, "grad_norm": 0.9317807129575663, "learning_rate": 5.597014925373135e-06, "loss": 1.3261, "step": 15 }, { "epoch": 0.008968609865470852, "grad_norm": 0.9503925528940235, "learning_rate": 5.970149253731343e-06, "loss": 1.3358, "step": 16 }, { "epoch": 0.00952914798206278, "grad_norm": 0.8692982982016761, "learning_rate": 6.343283582089552e-06, "loss": 1.2674, "step": 17 }, { "epoch": 0.010089686098654708, "grad_norm": 0.8512916351445661, "learning_rate": 6.716417910447762e-06, "loss": 1.261, "step": 18 }, { "epoch": 0.010650224215246636, "grad_norm": 0.8710189287442345, "learning_rate": 7.08955223880597e-06, "loss": 1.2842, "step": 19 }, { "epoch": 0.011210762331838564, "grad_norm": 0.781668523585148, "learning_rate": 7.4626865671641785e-06, "loss": 1.2373, "step": 20 }, { "epoch": 0.011771300448430493, "grad_norm": 0.7381052580031785, "learning_rate": 7.835820895522389e-06, "loss": 1.1989, "step": 21 }, { "epoch": 0.01233183856502242, "grad_norm": 0.6770393690721677, "learning_rate": 8.208955223880597e-06, "loss": 1.1672, "step": 22 }, { "epoch": 0.01289237668161435, "grad_norm": 0.6191395018917473, "learning_rate": 8.582089552238805e-06, "loss": 1.1619, "step": 23 }, { "epoch": 0.013452914798206279, "grad_norm": 0.6289526892086984, "learning_rate": 8.955223880597016e-06, "loss": 1.1558, "step": 24 }, { "epoch": 0.014013452914798207, "grad_norm": 0.5682916094085656, "learning_rate": 9.328358208955226e-06, "loss": 1.1096, "step": 25 }, { "epoch": 0.014573991031390135, "grad_norm": 0.5649795252033589, "learning_rate": 9.701492537313434e-06, "loss": 1.1183, "step": 26 }, { "epoch": 0.015134529147982063, "grad_norm": 0.5592638391416755, "learning_rate": 1.0074626865671643e-05, "loss": 1.1039, "step": 27 }, { "epoch": 0.01569506726457399, "grad_norm": 0.5453326006757038, "learning_rate": 1.0447761194029851e-05, "loss": 1.0674, "step": 28 }, { "epoch": 0.016255605381165918, "grad_norm": 0.5439999602347558, "learning_rate": 1.082089552238806e-05, "loss": 1.0574, "step": 29 }, { "epoch": 0.016816143497757848, "grad_norm": 0.5511396127286549, "learning_rate": 1.119402985074627e-05, "loss": 1.013, "step": 30 }, { "epoch": 0.017376681614349777, "grad_norm": 0.5702221080642447, "learning_rate": 1.1567164179104478e-05, "loss": 0.9702, "step": 31 }, { "epoch": 0.017937219730941704, "grad_norm": 0.5720559627118424, "learning_rate": 1.1940298507462686e-05, "loss": 0.9662, "step": 32 }, { "epoch": 0.018497757847533634, "grad_norm": 0.5611113226975845, "learning_rate": 1.2313432835820896e-05, "loss": 0.9274, "step": 33 }, { "epoch": 0.01905829596412556, "grad_norm": 0.6090665517488381, "learning_rate": 1.2686567164179105e-05, "loss": 0.9, "step": 34 }, { "epoch": 0.01961883408071749, "grad_norm": 0.5839791809624392, "learning_rate": 1.3059701492537313e-05, "loss": 0.8903, "step": 35 }, { "epoch": 0.020179372197309416, "grad_norm": 0.5736113015335764, "learning_rate": 1.3432835820895523e-05, "loss": 0.8887, "step": 36 }, { "epoch": 0.020739910313901346, "grad_norm": 0.5190623064152249, "learning_rate": 1.3805970149253733e-05, "loss": 0.8273, "step": 37 }, { "epoch": 0.021300448430493273, "grad_norm": 0.553768945628212, "learning_rate": 1.417910447761194e-05, "loss": 0.8283, "step": 38 }, { "epoch": 0.021860986547085202, "grad_norm": 0.534909748905568, "learning_rate": 1.455223880597015e-05, "loss": 0.782, "step": 39 }, { "epoch": 0.02242152466367713, "grad_norm": 0.5401435696197063, "learning_rate": 1.4925373134328357e-05, "loss": 0.7611, "step": 40 }, { "epoch": 0.02298206278026906, "grad_norm": 0.528685630974113, "learning_rate": 1.529850746268657e-05, "loss": 0.7325, "step": 41 }, { "epoch": 0.023542600896860985, "grad_norm": 0.49093077695142545, "learning_rate": 1.5671641791044777e-05, "loss": 0.6981, "step": 42 }, { "epoch": 0.024103139013452915, "grad_norm": 0.4438139439428313, "learning_rate": 1.6044776119402986e-05, "loss": 0.6774, "step": 43 }, { "epoch": 0.02466367713004484, "grad_norm": 0.4482263523387966, "learning_rate": 1.6417910447761194e-05, "loss": 0.6645, "step": 44 }, { "epoch": 0.02522421524663677, "grad_norm": 0.4540231120607396, "learning_rate": 1.6791044776119406e-05, "loss": 0.6123, "step": 45 }, { "epoch": 0.0257847533632287, "grad_norm": 0.4326299184847521, "learning_rate": 1.716417910447761e-05, "loss": 0.5931, "step": 46 }, { "epoch": 0.026345291479820628, "grad_norm": 0.3024749933538969, "learning_rate": 1.7537313432835823e-05, "loss": 0.6052, "step": 47 }, { "epoch": 0.026905829596412557, "grad_norm": 0.2818759060120326, "learning_rate": 1.791044776119403e-05, "loss": 0.5998, "step": 48 }, { "epoch": 0.027466367713004484, "grad_norm": 0.27406340113751826, "learning_rate": 1.828358208955224e-05, "loss": 0.5961, "step": 49 }, { "epoch": 0.028026905829596414, "grad_norm": 0.2479023351788548, "learning_rate": 1.865671641791045e-05, "loss": 0.5627, "step": 50 }, { "epoch": 0.02858744394618834, "grad_norm": 0.23908909726117478, "learning_rate": 1.9029850746268656e-05, "loss": 0.5535, "step": 51 }, { "epoch": 0.02914798206278027, "grad_norm": 0.21890178164904825, "learning_rate": 1.9402985074626868e-05, "loss": 0.5498, "step": 52 }, { "epoch": 0.029708520179372196, "grad_norm": 0.20444438580516272, "learning_rate": 1.9776119402985073e-05, "loss": 0.5661, "step": 53 }, { "epoch": 0.030269058295964126, "grad_norm": 0.20254090583306722, "learning_rate": 2.0149253731343285e-05, "loss": 0.5653, "step": 54 }, { "epoch": 0.030829596412556053, "grad_norm": 0.21987649104111895, "learning_rate": 2.0522388059701493e-05, "loss": 0.5406, "step": 55 }, { "epoch": 0.03139013452914798, "grad_norm": 0.18690597707242654, "learning_rate": 2.0895522388059702e-05, "loss": 0.5544, "step": 56 }, { "epoch": 0.03195067264573991, "grad_norm": 0.18441579639270703, "learning_rate": 2.126865671641791e-05, "loss": 0.5324, "step": 57 }, { "epoch": 0.032511210762331835, "grad_norm": 0.17795328095595792, "learning_rate": 2.164179104477612e-05, "loss": 0.5284, "step": 58 }, { "epoch": 0.033071748878923765, "grad_norm": 0.17770157145787804, "learning_rate": 2.201492537313433e-05, "loss": 0.5199, "step": 59 }, { "epoch": 0.033632286995515695, "grad_norm": 0.19713002587880057, "learning_rate": 2.238805970149254e-05, "loss": 0.4949, "step": 60 }, { "epoch": 0.034192825112107625, "grad_norm": 0.1936909800596092, "learning_rate": 2.2761194029850747e-05, "loss": 0.5079, "step": 61 }, { "epoch": 0.034753363228699555, "grad_norm": 0.1766770198216385, "learning_rate": 2.3134328358208956e-05, "loss": 0.5172, "step": 62 }, { "epoch": 0.03531390134529148, "grad_norm": 0.16215448560164902, "learning_rate": 2.3507462686567168e-05, "loss": 0.515, "step": 63 }, { "epoch": 0.03587443946188341, "grad_norm": 0.16097779703313178, "learning_rate": 2.3880597014925373e-05, "loss": 0.4931, "step": 64 }, { "epoch": 0.03643497757847534, "grad_norm": 0.1679553680550244, "learning_rate": 2.4253731343283584e-05, "loss": 0.5, "step": 65 }, { "epoch": 0.03699551569506727, "grad_norm": 0.16770477974378512, "learning_rate": 2.4626865671641793e-05, "loss": 0.5052, "step": 66 }, { "epoch": 0.03755605381165919, "grad_norm": 0.15086103753068986, "learning_rate": 2.5e-05, "loss": 0.4932, "step": 67 }, { "epoch": 0.03811659192825112, "grad_norm": 0.1448393261526423, "learning_rate": 2.537313432835821e-05, "loss": 0.4965, "step": 68 }, { "epoch": 0.03867713004484305, "grad_norm": 0.13536710404495228, "learning_rate": 2.574626865671642e-05, "loss": 0.4949, "step": 69 }, { "epoch": 0.03923766816143498, "grad_norm": 0.1318960286316755, "learning_rate": 2.6119402985074626e-05, "loss": 0.4717, "step": 70 }, { "epoch": 0.0397982062780269, "grad_norm": 0.13908346439467814, "learning_rate": 2.6492537313432835e-05, "loss": 0.4886, "step": 71 }, { "epoch": 0.04035874439461883, "grad_norm": 0.13274477559301887, "learning_rate": 2.6865671641791047e-05, "loss": 0.478, "step": 72 }, { "epoch": 0.04091928251121076, "grad_norm": 0.12187263958584976, "learning_rate": 2.7238805970149255e-05, "loss": 0.481, "step": 73 }, { "epoch": 0.04147982062780269, "grad_norm": 0.12112276934943426, "learning_rate": 2.7611940298507467e-05, "loss": 0.4629, "step": 74 }, { "epoch": 0.04204035874439462, "grad_norm": 0.12784869143888544, "learning_rate": 2.7985074626865672e-05, "loss": 0.4667, "step": 75 }, { "epoch": 0.042600896860986545, "grad_norm": 0.12366913602187946, "learning_rate": 2.835820895522388e-05, "loss": 0.4663, "step": 76 }, { "epoch": 0.043161434977578475, "grad_norm": 0.13038172476851312, "learning_rate": 2.8731343283582092e-05, "loss": 0.4644, "step": 77 }, { "epoch": 0.043721973094170405, "grad_norm": 0.11454023239597903, "learning_rate": 2.91044776119403e-05, "loss": 0.4528, "step": 78 }, { "epoch": 0.044282511210762335, "grad_norm": 0.11601962977381795, "learning_rate": 2.9477611940298512e-05, "loss": 0.462, "step": 79 }, { "epoch": 0.04484304932735426, "grad_norm": 0.1219882451392154, "learning_rate": 2.9850746268656714e-05, "loss": 0.4791, "step": 80 }, { "epoch": 0.04540358744394619, "grad_norm": 0.1293231926178124, "learning_rate": 3.0223880597014926e-05, "loss": 0.4684, "step": 81 }, { "epoch": 0.04596412556053812, "grad_norm": 0.11996052800463039, "learning_rate": 3.059701492537314e-05, "loss": 0.4566, "step": 82 }, { "epoch": 0.04652466367713005, "grad_norm": 0.12246511147890005, "learning_rate": 3.0970149253731346e-05, "loss": 0.4681, "step": 83 }, { "epoch": 0.04708520179372197, "grad_norm": 0.11218525045205559, "learning_rate": 3.1343283582089554e-05, "loss": 0.4633, "step": 84 }, { "epoch": 0.0476457399103139, "grad_norm": 0.11506805026698533, "learning_rate": 3.171641791044776e-05, "loss": 0.4764, "step": 85 }, { "epoch": 0.04820627802690583, "grad_norm": 0.1278539909669664, "learning_rate": 3.208955223880597e-05, "loss": 0.4578, "step": 86 }, { "epoch": 0.04876681614349776, "grad_norm": 0.11325282709726044, "learning_rate": 3.246268656716418e-05, "loss": 0.436, "step": 87 }, { "epoch": 0.04932735426008968, "grad_norm": 0.11889933717086941, "learning_rate": 3.283582089552239e-05, "loss": 0.45, "step": 88 }, { "epoch": 0.04988789237668161, "grad_norm": 0.12864292199145877, "learning_rate": 3.32089552238806e-05, "loss": 0.4465, "step": 89 }, { "epoch": 0.05044843049327354, "grad_norm": 0.12367148564811485, "learning_rate": 3.358208955223881e-05, "loss": 0.4442, "step": 90 }, { "epoch": 0.05100896860986547, "grad_norm": 0.11840931265139464, "learning_rate": 3.395522388059701e-05, "loss": 0.4342, "step": 91 }, { "epoch": 0.0515695067264574, "grad_norm": 0.12460573190500202, "learning_rate": 3.432835820895522e-05, "loss": 0.4704, "step": 92 }, { "epoch": 0.052130044843049325, "grad_norm": 0.11275332417661361, "learning_rate": 3.470149253731344e-05, "loss": 0.4566, "step": 93 }, { "epoch": 0.052690582959641255, "grad_norm": 0.12027472702499808, "learning_rate": 3.5074626865671645e-05, "loss": 0.4683, "step": 94 }, { "epoch": 0.053251121076233185, "grad_norm": 0.12166680224485274, "learning_rate": 3.5447761194029854e-05, "loss": 0.4502, "step": 95 }, { "epoch": 0.053811659192825115, "grad_norm": 0.1143149054981622, "learning_rate": 3.582089552238806e-05, "loss": 0.4384, "step": 96 }, { "epoch": 0.05437219730941704, "grad_norm": 0.11701314141339264, "learning_rate": 3.619402985074627e-05, "loss": 0.4361, "step": 97 }, { "epoch": 0.05493273542600897, "grad_norm": 0.12407417894757372, "learning_rate": 3.656716417910448e-05, "loss": 0.4498, "step": 98 }, { "epoch": 0.0554932735426009, "grad_norm": 0.1300165206876751, "learning_rate": 3.694029850746269e-05, "loss": 0.4563, "step": 99 }, { "epoch": 0.05605381165919283, "grad_norm": 0.11518521501317551, "learning_rate": 3.73134328358209e-05, "loss": 0.4466, "step": 100 }, { "epoch": 0.05661434977578475, "grad_norm": 0.11754858539944062, "learning_rate": 3.7686567164179104e-05, "loss": 0.4267, "step": 101 }, { "epoch": 0.05717488789237668, "grad_norm": 0.12077083786609252, "learning_rate": 3.805970149253731e-05, "loss": 0.4313, "step": 102 }, { "epoch": 0.05773542600896861, "grad_norm": 0.12080980975033845, "learning_rate": 3.843283582089552e-05, "loss": 0.4335, "step": 103 }, { "epoch": 0.05829596412556054, "grad_norm": 0.1242265803727591, "learning_rate": 3.8805970149253736e-05, "loss": 0.4461, "step": 104 }, { "epoch": 0.05885650224215247, "grad_norm": 0.13003496252871916, "learning_rate": 3.9179104477611945e-05, "loss": 0.4447, "step": 105 }, { "epoch": 0.05941704035874439, "grad_norm": 0.12908146749282087, "learning_rate": 3.9552238805970146e-05, "loss": 0.4219, "step": 106 }, { "epoch": 0.05997757847533632, "grad_norm": 0.11582509313092396, "learning_rate": 3.992537313432836e-05, "loss": 0.4108, "step": 107 }, { "epoch": 0.06053811659192825, "grad_norm": 0.12643265441930768, "learning_rate": 4.029850746268657e-05, "loss": 0.4229, "step": 108 }, { "epoch": 0.06109865470852018, "grad_norm": 0.13820675934864066, "learning_rate": 4.067164179104478e-05, "loss": 0.4275, "step": 109 }, { "epoch": 0.061659192825112105, "grad_norm": 0.1360538838053136, "learning_rate": 4.104477611940299e-05, "loss": 0.4312, "step": 110 }, { "epoch": 0.062219730941704035, "grad_norm": 0.12554029300674785, "learning_rate": 4.1417910447761195e-05, "loss": 0.4326, "step": 111 }, { "epoch": 0.06278026905829596, "grad_norm": 0.13890555381256833, "learning_rate": 4.1791044776119404e-05, "loss": 0.4339, "step": 112 }, { "epoch": 0.0633408071748879, "grad_norm": 0.13227702018718224, "learning_rate": 4.216417910447761e-05, "loss": 0.414, "step": 113 }, { "epoch": 0.06390134529147982, "grad_norm": 0.1376833510856438, "learning_rate": 4.253731343283582e-05, "loss": 0.4179, "step": 114 }, { "epoch": 0.06446188340807175, "grad_norm": 0.1416082156028862, "learning_rate": 4.2910447761194036e-05, "loss": 0.4207, "step": 115 }, { "epoch": 0.06502242152466367, "grad_norm": 0.1365191037664384, "learning_rate": 4.328358208955224e-05, "loss": 0.4243, "step": 116 }, { "epoch": 0.0655829596412556, "grad_norm": 0.14062318849572733, "learning_rate": 4.3656716417910446e-05, "loss": 0.413, "step": 117 }, { "epoch": 0.06614349775784753, "grad_norm": 0.14187004014123297, "learning_rate": 4.402985074626866e-05, "loss": 0.4135, "step": 118 }, { "epoch": 0.06670403587443946, "grad_norm": 0.14531335764922046, "learning_rate": 4.440298507462687e-05, "loss": 0.4147, "step": 119 }, { "epoch": 0.06726457399103139, "grad_norm": 0.13755295535271247, "learning_rate": 4.477611940298508e-05, "loss": 0.433, "step": 120 }, { "epoch": 0.06782511210762332, "grad_norm": 0.1432819698901696, "learning_rate": 4.5149253731343286e-05, "loss": 0.4096, "step": 121 }, { "epoch": 0.06838565022421525, "grad_norm": 0.14193457086049052, "learning_rate": 4.5522388059701495e-05, "loss": 0.4245, "step": 122 }, { "epoch": 0.06894618834080718, "grad_norm": 0.13928060042054785, "learning_rate": 4.58955223880597e-05, "loss": 0.4262, "step": 123 }, { "epoch": 0.06950672645739911, "grad_norm": 0.14003170515118332, "learning_rate": 4.626865671641791e-05, "loss": 0.4035, "step": 124 }, { "epoch": 0.07006726457399103, "grad_norm": 0.1331650387336229, "learning_rate": 4.664179104477612e-05, "loss": 0.4203, "step": 125 }, { "epoch": 0.07062780269058296, "grad_norm": 0.13498074727847828, "learning_rate": 4.7014925373134335e-05, "loss": 0.3935, "step": 126 }, { "epoch": 0.07118834080717489, "grad_norm": 0.13666502845282, "learning_rate": 4.738805970149254e-05, "loss": 0.4042, "step": 127 }, { "epoch": 0.07174887892376682, "grad_norm": 0.14758309572816788, "learning_rate": 4.7761194029850745e-05, "loss": 0.4058, "step": 128 }, { "epoch": 0.07230941704035874, "grad_norm": 0.14492402892274034, "learning_rate": 4.813432835820896e-05, "loss": 0.41, "step": 129 }, { "epoch": 0.07286995515695067, "grad_norm": 0.14695792097356924, "learning_rate": 4.850746268656717e-05, "loss": 0.3948, "step": 130 }, { "epoch": 0.0734304932735426, "grad_norm": 0.15151969250879915, "learning_rate": 4.888059701492538e-05, "loss": 0.3948, "step": 131 }, { "epoch": 0.07399103139013453, "grad_norm": 0.13754418875073354, "learning_rate": 4.9253731343283586e-05, "loss": 0.4022, "step": 132 }, { "epoch": 0.07455156950672645, "grad_norm": 0.15254984318473536, "learning_rate": 4.9626865671641794e-05, "loss": 0.4013, "step": 133 }, { "epoch": 0.07511210762331838, "grad_norm": 0.14547684214076642, "learning_rate": 5e-05, "loss": 0.4063, "step": 134 }, { "epoch": 0.07567264573991031, "grad_norm": 0.14922739042225802, "learning_rate": 5.0373134328358204e-05, "loss": 0.4101, "step": 135 }, { "epoch": 0.07623318385650224, "grad_norm": 0.15098699044966904, "learning_rate": 5.074626865671642e-05, "loss": 0.4047, "step": 136 }, { "epoch": 0.07679372197309417, "grad_norm": 0.1452335515819299, "learning_rate": 5.111940298507463e-05, "loss": 0.4048, "step": 137 }, { "epoch": 0.0773542600896861, "grad_norm": 0.15079187782364642, "learning_rate": 5.149253731343284e-05, "loss": 0.4024, "step": 138 }, { "epoch": 0.07791479820627803, "grad_norm": 0.14434756712244956, "learning_rate": 5.1865671641791044e-05, "loss": 0.405, "step": 139 }, { "epoch": 0.07847533632286996, "grad_norm": 0.13254131018383866, "learning_rate": 5.223880597014925e-05, "loss": 0.4001, "step": 140 }, { "epoch": 0.07903587443946189, "grad_norm": 0.1417441767722713, "learning_rate": 5.261194029850747e-05, "loss": 0.3927, "step": 141 }, { "epoch": 0.0795964125560538, "grad_norm": 0.15274755766224463, "learning_rate": 5.298507462686567e-05, "loss": 0.4009, "step": 142 }, { "epoch": 0.08015695067264574, "grad_norm": 0.14245430733786046, "learning_rate": 5.3358208955223885e-05, "loss": 0.3854, "step": 143 }, { "epoch": 0.08071748878923767, "grad_norm": 0.1536214882615985, "learning_rate": 5.373134328358209e-05, "loss": 0.3851, "step": 144 }, { "epoch": 0.0812780269058296, "grad_norm": 0.19139835581037787, "learning_rate": 5.4104477611940295e-05, "loss": 0.4039, "step": 145 }, { "epoch": 0.08183856502242152, "grad_norm": 0.16363703483744255, "learning_rate": 5.447761194029851e-05, "loss": 0.4022, "step": 146 }, { "epoch": 0.08239910313901345, "grad_norm": 0.15294575979455324, "learning_rate": 5.485074626865672e-05, "loss": 0.3867, "step": 147 }, { "epoch": 0.08295964125560538, "grad_norm": 0.16677125259280787, "learning_rate": 5.5223880597014934e-05, "loss": 0.3919, "step": 148 }, { "epoch": 0.08352017937219731, "grad_norm": 0.16265087004682827, "learning_rate": 5.5597014925373135e-05, "loss": 0.3864, "step": 149 }, { "epoch": 0.08408071748878924, "grad_norm": 0.14272101706953824, "learning_rate": 5.5970149253731344e-05, "loss": 0.3827, "step": 150 }, { "epoch": 0.08464125560538116, "grad_norm": 0.16824308627818732, "learning_rate": 5.634328358208956e-05, "loss": 0.4, "step": 151 }, { "epoch": 0.08520179372197309, "grad_norm": 0.16732040229196365, "learning_rate": 5.671641791044776e-05, "loss": 0.3701, "step": 152 }, { "epoch": 0.08576233183856502, "grad_norm": 0.14882351488471363, "learning_rate": 5.7089552238805976e-05, "loss": 0.3972, "step": 153 }, { "epoch": 0.08632286995515695, "grad_norm": 0.16796214736400641, "learning_rate": 5.7462686567164184e-05, "loss": 0.4063, "step": 154 }, { "epoch": 0.08688340807174888, "grad_norm": 0.16090778047008664, "learning_rate": 5.7835820895522386e-05, "loss": 0.4082, "step": 155 }, { "epoch": 0.08744394618834081, "grad_norm": 0.1515617591452116, "learning_rate": 5.82089552238806e-05, "loss": 0.3936, "step": 156 }, { "epoch": 0.08800448430493274, "grad_norm": 0.14765500679843194, "learning_rate": 5.85820895522388e-05, "loss": 0.3889, "step": 157 }, { "epoch": 0.08856502242152467, "grad_norm": 0.15544914165747062, "learning_rate": 5.8955223880597025e-05, "loss": 0.38, "step": 158 }, { "epoch": 0.08912556053811659, "grad_norm": 0.16721420657142755, "learning_rate": 5.9328358208955226e-05, "loss": 0.3811, "step": 159 }, { "epoch": 0.08968609865470852, "grad_norm": 0.16794811927217704, "learning_rate": 5.970149253731343e-05, "loss": 0.3909, "step": 160 }, { "epoch": 0.09024663677130045, "grad_norm": 0.15165371471110536, "learning_rate": 6.007462686567164e-05, "loss": 0.3836, "step": 161 }, { "epoch": 0.09080717488789238, "grad_norm": 0.15262228871854122, "learning_rate": 6.044776119402985e-05, "loss": 0.367, "step": 162 }, { "epoch": 0.0913677130044843, "grad_norm": 0.16250954049606353, "learning_rate": 6.082089552238807e-05, "loss": 0.3842, "step": 163 }, { "epoch": 0.09192825112107623, "grad_norm": 0.16164141273342517, "learning_rate": 6.119402985074628e-05, "loss": 0.4001, "step": 164 }, { "epoch": 0.09248878923766816, "grad_norm": 0.14821752027103804, "learning_rate": 6.156716417910448e-05, "loss": 0.391, "step": 165 }, { "epoch": 0.0930493273542601, "grad_norm": 0.1649310143100295, "learning_rate": 6.194029850746269e-05, "loss": 0.3831, "step": 166 }, { "epoch": 0.09360986547085202, "grad_norm": 0.16182061113902127, "learning_rate": 6.23134328358209e-05, "loss": 0.3779, "step": 167 }, { "epoch": 0.09417040358744394, "grad_norm": 0.16190445609076318, "learning_rate": 6.268656716417911e-05, "loss": 0.3897, "step": 168 }, { "epoch": 0.09473094170403587, "grad_norm": 0.15156515470873255, "learning_rate": 6.305970149253731e-05, "loss": 0.3764, "step": 169 }, { "epoch": 0.0952914798206278, "grad_norm": 0.17156111651448278, "learning_rate": 6.343283582089553e-05, "loss": 0.3938, "step": 170 }, { "epoch": 0.09585201793721973, "grad_norm": 0.15513896968402013, "learning_rate": 6.380597014925374e-05, "loss": 0.387, "step": 171 }, { "epoch": 0.09641255605381166, "grad_norm": 0.1788700878157143, "learning_rate": 6.417910447761194e-05, "loss": 0.3749, "step": 172 }, { "epoch": 0.09697309417040359, "grad_norm": 0.1843006493151444, "learning_rate": 6.455223880597016e-05, "loss": 0.3857, "step": 173 }, { "epoch": 0.09753363228699552, "grad_norm": 0.16235020723013396, "learning_rate": 6.492537313432836e-05, "loss": 0.3819, "step": 174 }, { "epoch": 0.09809417040358745, "grad_norm": 0.1767763613709395, "learning_rate": 6.529850746268657e-05, "loss": 0.3761, "step": 175 }, { "epoch": 0.09865470852017937, "grad_norm": 0.17256885471732497, "learning_rate": 6.567164179104478e-05, "loss": 0.3895, "step": 176 }, { "epoch": 0.0992152466367713, "grad_norm": 0.16950015988649256, "learning_rate": 6.604477611940298e-05, "loss": 0.3767, "step": 177 }, { "epoch": 0.09977578475336323, "grad_norm": 0.19591518959576854, "learning_rate": 6.64179104477612e-05, "loss": 0.4027, "step": 178 }, { "epoch": 0.10033632286995516, "grad_norm": 0.15667226214733038, "learning_rate": 6.679104477611941e-05, "loss": 0.3821, "step": 179 }, { "epoch": 0.10089686098654709, "grad_norm": 0.17160752708809768, "learning_rate": 6.716417910447762e-05, "loss": 0.3685, "step": 180 }, { "epoch": 0.10145739910313901, "grad_norm": 0.15121233958417601, "learning_rate": 6.753731343283583e-05, "loss": 0.3677, "step": 181 }, { "epoch": 0.10201793721973094, "grad_norm": 0.15733202495872775, "learning_rate": 6.791044776119403e-05, "loss": 0.3841, "step": 182 }, { "epoch": 0.10257847533632287, "grad_norm": 0.17626355608916983, "learning_rate": 6.828358208955224e-05, "loss": 0.3889, "step": 183 }, { "epoch": 0.1031390134529148, "grad_norm": 0.17412687579414454, "learning_rate": 6.865671641791044e-05, "loss": 0.3793, "step": 184 }, { "epoch": 0.10369955156950672, "grad_norm": 0.16411490181034571, "learning_rate": 6.902985074626866e-05, "loss": 0.3651, "step": 185 }, { "epoch": 0.10426008968609865, "grad_norm": 0.1809656119702917, "learning_rate": 6.940298507462687e-05, "loss": 0.3785, "step": 186 }, { "epoch": 0.10482062780269058, "grad_norm": 0.1703795524339579, "learning_rate": 6.977611940298508e-05, "loss": 0.3742, "step": 187 }, { "epoch": 0.10538116591928251, "grad_norm": 0.17116149539584355, "learning_rate": 7.014925373134329e-05, "loss": 0.3729, "step": 188 }, { "epoch": 0.10594170403587444, "grad_norm": 0.17956733029995822, "learning_rate": 7.052238805970149e-05, "loss": 0.3775, "step": 189 }, { "epoch": 0.10650224215246637, "grad_norm": 0.16346388961754899, "learning_rate": 7.089552238805971e-05, "loss": 0.3873, "step": 190 }, { "epoch": 0.1070627802690583, "grad_norm": 0.1568726807138821, "learning_rate": 7.126865671641791e-05, "loss": 0.373, "step": 191 }, { "epoch": 0.10762331838565023, "grad_norm": 0.1796852200923323, "learning_rate": 7.164179104477612e-05, "loss": 0.3804, "step": 192 }, { "epoch": 0.10818385650224215, "grad_norm": 0.16135711806174538, "learning_rate": 7.201492537313434e-05, "loss": 0.3736, "step": 193 }, { "epoch": 0.10874439461883408, "grad_norm": 0.16102305151660706, "learning_rate": 7.238805970149254e-05, "loss": 0.3658, "step": 194 }, { "epoch": 0.109304932735426, "grad_norm": 0.16781137288468057, "learning_rate": 7.276119402985076e-05, "loss": 0.3737, "step": 195 }, { "epoch": 0.10986547085201794, "grad_norm": 0.18545424393368118, "learning_rate": 7.313432835820896e-05, "loss": 0.3695, "step": 196 }, { "epoch": 0.11042600896860987, "grad_norm": 0.17958819480714575, "learning_rate": 7.350746268656716e-05, "loss": 0.3759, "step": 197 }, { "epoch": 0.1109865470852018, "grad_norm": 0.20219498453199727, "learning_rate": 7.388059701492537e-05, "loss": 0.3864, "step": 198 }, { "epoch": 0.11154708520179372, "grad_norm": 0.1726692529003045, "learning_rate": 7.425373134328359e-05, "loss": 0.3635, "step": 199 }, { "epoch": 0.11210762331838565, "grad_norm": 0.1746472625241535, "learning_rate": 7.46268656716418e-05, "loss": 0.3774, "step": 200 }, { "epoch": 0.11266816143497758, "grad_norm": 0.18311083159929262, "learning_rate": 7.500000000000001e-05, "loss": 0.3736, "step": 201 }, { "epoch": 0.1132286995515695, "grad_norm": 0.18726919900702563, "learning_rate": 7.537313432835821e-05, "loss": 0.3817, "step": 202 }, { "epoch": 0.11378923766816143, "grad_norm": 0.18987151892129903, "learning_rate": 7.574626865671642e-05, "loss": 0.3794, "step": 203 }, { "epoch": 0.11434977578475336, "grad_norm": 0.18795067963203516, "learning_rate": 7.611940298507463e-05, "loss": 0.3674, "step": 204 }, { "epoch": 0.11491031390134529, "grad_norm": 0.19242197215916712, "learning_rate": 7.649253731343284e-05, "loss": 0.364, "step": 205 }, { "epoch": 0.11547085201793722, "grad_norm": 0.17251419491478528, "learning_rate": 7.686567164179104e-05, "loss": 0.3632, "step": 206 }, { "epoch": 0.11603139013452915, "grad_norm": 0.1728889950589284, "learning_rate": 7.723880597014926e-05, "loss": 0.3591, "step": 207 }, { "epoch": 0.11659192825112108, "grad_norm": 0.1899590050084193, "learning_rate": 7.761194029850747e-05, "loss": 0.3759, "step": 208 }, { "epoch": 0.11715246636771301, "grad_norm": 0.1789564355228219, "learning_rate": 7.798507462686567e-05, "loss": 0.3669, "step": 209 }, { "epoch": 0.11771300448430494, "grad_norm": 0.18910618441974555, "learning_rate": 7.835820895522389e-05, "loss": 0.3679, "step": 210 }, { "epoch": 0.11827354260089686, "grad_norm": 0.1768485362984264, "learning_rate": 7.873134328358209e-05, "loss": 0.3668, "step": 211 }, { "epoch": 0.11883408071748879, "grad_norm": 0.19534942551965084, "learning_rate": 7.910447761194029e-05, "loss": 0.3635, "step": 212 }, { "epoch": 0.11939461883408072, "grad_norm": 0.17520388878220655, "learning_rate": 7.947761194029851e-05, "loss": 0.3543, "step": 213 }, { "epoch": 0.11995515695067265, "grad_norm": 0.18235087513990209, "learning_rate": 7.985074626865672e-05, "loss": 0.3658, "step": 214 }, { "epoch": 0.12051569506726457, "grad_norm": 0.1827333194205262, "learning_rate": 8.022388059701494e-05, "loss": 0.364, "step": 215 }, { "epoch": 0.1210762331838565, "grad_norm": 0.19175747163007564, "learning_rate": 8.059701492537314e-05, "loss": 0.3903, "step": 216 }, { "epoch": 0.12163677130044843, "grad_norm": 0.18369610261399918, "learning_rate": 8.097014925373134e-05, "loss": 0.3936, "step": 217 }, { "epoch": 0.12219730941704036, "grad_norm": 0.17145574679985418, "learning_rate": 8.134328358208956e-05, "loss": 0.3567, "step": 218 }, { "epoch": 0.12275784753363228, "grad_norm": 0.1921713146643893, "learning_rate": 8.171641791044776e-05, "loss": 0.3674, "step": 219 }, { "epoch": 0.12331838565022421, "grad_norm": 0.1570865798975667, "learning_rate": 8.208955223880597e-05, "loss": 0.3773, "step": 220 }, { "epoch": 0.12387892376681614, "grad_norm": 0.1763016822133998, "learning_rate": 8.246268656716419e-05, "loss": 0.3757, "step": 221 }, { "epoch": 0.12443946188340807, "grad_norm": 0.18174355080196333, "learning_rate": 8.283582089552239e-05, "loss": 0.3653, "step": 222 }, { "epoch": 0.125, "grad_norm": 0.16621511712083203, "learning_rate": 8.32089552238806e-05, "loss": 0.3678, "step": 223 }, { "epoch": 0.12556053811659193, "grad_norm": 0.1663459391630373, "learning_rate": 8.358208955223881e-05, "loss": 0.3705, "step": 224 }, { "epoch": 0.12612107623318386, "grad_norm": 0.1725075785062485, "learning_rate": 8.395522388059702e-05, "loss": 0.3609, "step": 225 }, { "epoch": 0.1266816143497758, "grad_norm": 0.15966589014814622, "learning_rate": 8.432835820895522e-05, "loss": 0.36, "step": 226 }, { "epoch": 0.12724215246636772, "grad_norm": 0.17842238761202062, "learning_rate": 8.470149253731343e-05, "loss": 0.3564, "step": 227 }, { "epoch": 0.12780269058295965, "grad_norm": 0.18978029743720834, "learning_rate": 8.507462686567164e-05, "loss": 0.3845, "step": 228 }, { "epoch": 0.12836322869955158, "grad_norm": 0.17577228283856472, "learning_rate": 8.544776119402986e-05, "loss": 0.3494, "step": 229 }, { "epoch": 0.1289237668161435, "grad_norm": 0.17628453743082703, "learning_rate": 8.582089552238807e-05, "loss": 0.36, "step": 230 }, { "epoch": 0.12948430493273544, "grad_norm": 0.16428265399110947, "learning_rate": 8.619402985074627e-05, "loss": 0.3647, "step": 231 }, { "epoch": 0.13004484304932734, "grad_norm": 0.1836827615926638, "learning_rate": 8.656716417910447e-05, "loss": 0.3648, "step": 232 }, { "epoch": 0.13060538116591927, "grad_norm": 0.1866251710481314, "learning_rate": 8.694029850746269e-05, "loss": 0.3596, "step": 233 }, { "epoch": 0.1311659192825112, "grad_norm": 0.18372353476625689, "learning_rate": 8.731343283582089e-05, "loss": 0.3581, "step": 234 }, { "epoch": 0.13172645739910313, "grad_norm": 0.17254888167050808, "learning_rate": 8.76865671641791e-05, "loss": 0.3736, "step": 235 }, { "epoch": 0.13228699551569506, "grad_norm": 0.1682612354618672, "learning_rate": 8.805970149253732e-05, "loss": 0.3627, "step": 236 }, { "epoch": 0.132847533632287, "grad_norm": 0.1684459539424527, "learning_rate": 8.843283582089554e-05, "loss": 0.3663, "step": 237 }, { "epoch": 0.13340807174887892, "grad_norm": 0.16975246091905197, "learning_rate": 8.880597014925374e-05, "loss": 0.3608, "step": 238 }, { "epoch": 0.13396860986547085, "grad_norm": 0.15473984678290356, "learning_rate": 8.917910447761194e-05, "loss": 0.354, "step": 239 }, { "epoch": 0.13452914798206278, "grad_norm": 0.18185884309316858, "learning_rate": 8.955223880597016e-05, "loss": 0.37, "step": 240 }, { "epoch": 0.1350896860986547, "grad_norm": 0.14927991709699787, "learning_rate": 8.992537313432836e-05, "loss": 0.3647, "step": 241 }, { "epoch": 0.13565022421524664, "grad_norm": 0.1636477741733644, "learning_rate": 9.029850746268657e-05, "loss": 0.353, "step": 242 }, { "epoch": 0.13621076233183857, "grad_norm": 0.15567916635474852, "learning_rate": 9.067164179104479e-05, "loss": 0.3495, "step": 243 }, { "epoch": 0.1367713004484305, "grad_norm": 0.1586917656306562, "learning_rate": 9.104477611940299e-05, "loss": 0.3586, "step": 244 }, { "epoch": 0.13733183856502243, "grad_norm": 0.1658072814748706, "learning_rate": 9.14179104477612e-05, "loss": 0.377, "step": 245 }, { "epoch": 0.13789237668161436, "grad_norm": 0.17172300258830786, "learning_rate": 9.17910447761194e-05, "loss": 0.3629, "step": 246 }, { "epoch": 0.1384529147982063, "grad_norm": 0.16886456245374368, "learning_rate": 9.216417910447762e-05, "loss": 0.3633, "step": 247 }, { "epoch": 0.13901345291479822, "grad_norm": 0.1684761343496583, "learning_rate": 9.253731343283582e-05, "loss": 0.3415, "step": 248 }, { "epoch": 0.13957399103139012, "grad_norm": 0.16925015900566961, "learning_rate": 9.291044776119402e-05, "loss": 0.3461, "step": 249 }, { "epoch": 0.14013452914798205, "grad_norm": 0.15843094471805708, "learning_rate": 9.328358208955224e-05, "loss": 0.3551, "step": 250 }, { "epoch": 0.14069506726457398, "grad_norm": 0.1696294696406677, "learning_rate": 9.365671641791045e-05, "loss": 0.3458, "step": 251 }, { "epoch": 0.1412556053811659, "grad_norm": 0.17601667106314525, "learning_rate": 9.402985074626867e-05, "loss": 0.3639, "step": 252 }, { "epoch": 0.14181614349775784, "grad_norm": 0.15595609006027045, "learning_rate": 9.440298507462687e-05, "loss": 0.3735, "step": 253 }, { "epoch": 0.14237668161434977, "grad_norm": 0.16626523046712888, "learning_rate": 9.477611940298507e-05, "loss": 0.3541, "step": 254 }, { "epoch": 0.1429372197309417, "grad_norm": 0.16235816753333587, "learning_rate": 9.514925373134329e-05, "loss": 0.3501, "step": 255 }, { "epoch": 0.14349775784753363, "grad_norm": 0.15176637857752753, "learning_rate": 9.552238805970149e-05, "loss": 0.3619, "step": 256 }, { "epoch": 0.14405829596412556, "grad_norm": 0.1688461940896348, "learning_rate": 9.58955223880597e-05, "loss": 0.3539, "step": 257 }, { "epoch": 0.1446188340807175, "grad_norm": 0.15435614225238878, "learning_rate": 9.626865671641792e-05, "loss": 0.3543, "step": 258 }, { "epoch": 0.14517937219730942, "grad_norm": 0.1795558992512674, "learning_rate": 9.664179104477612e-05, "loss": 0.3631, "step": 259 }, { "epoch": 0.14573991031390135, "grad_norm": 0.15941611102169298, "learning_rate": 9.701492537313434e-05, "loss": 0.3417, "step": 260 }, { "epoch": 0.14630044843049328, "grad_norm": 0.16017055644477737, "learning_rate": 9.738805970149254e-05, "loss": 0.3624, "step": 261 }, { "epoch": 0.1468609865470852, "grad_norm": 0.1702042759017935, "learning_rate": 9.776119402985075e-05, "loss": 0.3675, "step": 262 }, { "epoch": 0.14742152466367714, "grad_norm": 0.15574217721760533, "learning_rate": 9.813432835820896e-05, "loss": 0.3537, "step": 263 }, { "epoch": 0.14798206278026907, "grad_norm": 0.15595756003052536, "learning_rate": 9.850746268656717e-05, "loss": 0.3413, "step": 264 }, { "epoch": 0.148542600896861, "grad_norm": 0.16407156402134312, "learning_rate": 9.888059701492539e-05, "loss": 0.348, "step": 265 }, { "epoch": 0.1491031390134529, "grad_norm": 0.16740889130311884, "learning_rate": 9.925373134328359e-05, "loss": 0.3434, "step": 266 }, { "epoch": 0.14966367713004483, "grad_norm": 0.16159905903153549, "learning_rate": 9.96268656716418e-05, "loss": 0.3585, "step": 267 }, { "epoch": 0.15022421524663676, "grad_norm": 0.16452829367524685, "learning_rate": 0.0001, "loss": 0.36, "step": 268 }, { "epoch": 0.1507847533632287, "grad_norm": 0.16212933723251338, "learning_rate": 0.00010037313432835822, "loss": 0.3536, "step": 269 }, { "epoch": 0.15134529147982062, "grad_norm": 0.1677596853734516, "learning_rate": 0.00010074626865671641, "loss": 0.3461, "step": 270 }, { "epoch": 0.15190582959641255, "grad_norm": 0.14092955431703147, "learning_rate": 0.00010111940298507462, "loss": 0.3514, "step": 271 }, { "epoch": 0.15246636771300448, "grad_norm": 0.15504121203886792, "learning_rate": 0.00010149253731343284, "loss": 0.3601, "step": 272 }, { "epoch": 0.1530269058295964, "grad_norm": 0.16086274774375903, "learning_rate": 0.00010186567164179107, "loss": 0.3683, "step": 273 }, { "epoch": 0.15358744394618834, "grad_norm": 0.1604018025859141, "learning_rate": 0.00010223880597014926, "loss": 0.3633, "step": 274 }, { "epoch": 0.15414798206278027, "grad_norm": 0.15692121791747665, "learning_rate": 0.00010261194029850747, "loss": 0.332, "step": 275 }, { "epoch": 0.1547085201793722, "grad_norm": 0.16795441518784324, "learning_rate": 0.00010298507462686569, "loss": 0.3644, "step": 276 }, { "epoch": 0.15526905829596413, "grad_norm": 0.17131877689821567, "learning_rate": 0.00010335820895522387, "loss": 0.3434, "step": 277 }, { "epoch": 0.15582959641255606, "grad_norm": 0.1607628706106527, "learning_rate": 0.00010373134328358209, "loss": 0.3626, "step": 278 }, { "epoch": 0.156390134529148, "grad_norm": 0.16269328407163647, "learning_rate": 0.0001041044776119403, "loss": 0.3586, "step": 279 }, { "epoch": 0.15695067264573992, "grad_norm": 0.17535923762892863, "learning_rate": 0.0001044776119402985, "loss": 0.3644, "step": 280 }, { "epoch": 0.15751121076233185, "grad_norm": 0.15924213345397006, "learning_rate": 0.00010485074626865672, "loss": 0.3443, "step": 281 }, { "epoch": 0.15807174887892378, "grad_norm": 0.16200755025292304, "learning_rate": 0.00010522388059701494, "loss": 0.3451, "step": 282 }, { "epoch": 0.1586322869955157, "grad_norm": 0.16240644121268424, "learning_rate": 0.00010559701492537315, "loss": 0.3433, "step": 283 }, { "epoch": 0.1591928251121076, "grad_norm": 0.1599085397839329, "learning_rate": 0.00010597014925373134, "loss": 0.3445, "step": 284 }, { "epoch": 0.15975336322869954, "grad_norm": 0.16881740152266073, "learning_rate": 0.00010634328358208955, "loss": 0.3587, "step": 285 }, { "epoch": 0.16031390134529147, "grad_norm": 0.15197850146245018, "learning_rate": 0.00010671641791044777, "loss": 0.3581, "step": 286 }, { "epoch": 0.1608744394618834, "grad_norm": 0.15979984233256023, "learning_rate": 0.00010708955223880597, "loss": 0.3325, "step": 287 }, { "epoch": 0.16143497757847533, "grad_norm": 0.14785386490221808, "learning_rate": 0.00010746268656716419, "loss": 0.3523, "step": 288 }, { "epoch": 0.16199551569506726, "grad_norm": 0.16121360229564613, "learning_rate": 0.0001078358208955224, "loss": 0.3532, "step": 289 }, { "epoch": 0.1625560538116592, "grad_norm": 0.1666278996368264, "learning_rate": 0.00010820895522388059, "loss": 0.3344, "step": 290 }, { "epoch": 0.16311659192825112, "grad_norm": 0.1658946129973587, "learning_rate": 0.0001085820895522388, "loss": 0.3619, "step": 291 }, { "epoch": 0.16367713004484305, "grad_norm": 0.15391246174723924, "learning_rate": 0.00010895522388059702, "loss": 0.3437, "step": 292 }, { "epoch": 0.16423766816143498, "grad_norm": 0.17369812945799734, "learning_rate": 0.00010932835820895524, "loss": 0.3435, "step": 293 }, { "epoch": 0.1647982062780269, "grad_norm": 0.16097988268775634, "learning_rate": 0.00010970149253731344, "loss": 0.3458, "step": 294 }, { "epoch": 0.16535874439461884, "grad_norm": 0.14845524173879637, "learning_rate": 0.00011007462686567165, "loss": 0.3617, "step": 295 }, { "epoch": 0.16591928251121077, "grad_norm": 0.16204126238942138, "learning_rate": 0.00011044776119402987, "loss": 0.3639, "step": 296 }, { "epoch": 0.1664798206278027, "grad_norm": 0.15918335661791597, "learning_rate": 0.00011082089552238806, "loss": 0.3556, "step": 297 }, { "epoch": 0.16704035874439463, "grad_norm": 0.1508624404514423, "learning_rate": 0.00011119402985074627, "loss": 0.3331, "step": 298 }, { "epoch": 0.16760089686098656, "grad_norm": 0.15001100888742291, "learning_rate": 0.00011156716417910449, "loss": 0.3336, "step": 299 }, { "epoch": 0.1681614349775785, "grad_norm": 0.1565482297002879, "learning_rate": 0.00011194029850746269, "loss": 0.3475, "step": 300 }, { "epoch": 0.1687219730941704, "grad_norm": 0.16549120599034722, "learning_rate": 0.0001123134328358209, "loss": 0.3558, "step": 301 }, { "epoch": 0.16928251121076232, "grad_norm": 0.1505863588805418, "learning_rate": 0.00011268656716417912, "loss": 0.3664, "step": 302 }, { "epoch": 0.16984304932735425, "grad_norm": 0.15323937742218322, "learning_rate": 0.00011305970149253733, "loss": 0.3552, "step": 303 }, { "epoch": 0.17040358744394618, "grad_norm": 0.15854127887583344, "learning_rate": 0.00011343283582089552, "loss": 0.358, "step": 304 }, { "epoch": 0.1709641255605381, "grad_norm": 0.16425026656892816, "learning_rate": 0.00011380597014925374, "loss": 0.3385, "step": 305 }, { "epoch": 0.17152466367713004, "grad_norm": 0.14453136672166597, "learning_rate": 0.00011417910447761195, "loss": 0.3485, "step": 306 }, { "epoch": 0.17208520179372197, "grad_norm": 0.14521817335331513, "learning_rate": 0.00011455223880597015, "loss": 0.3379, "step": 307 }, { "epoch": 0.1726457399103139, "grad_norm": 0.15447478262387904, "learning_rate": 0.00011492537313432837, "loss": 0.3401, "step": 308 }, { "epoch": 0.17320627802690583, "grad_norm": 0.14543845819057685, "learning_rate": 0.00011529850746268658, "loss": 0.3531, "step": 309 }, { "epoch": 0.17376681614349776, "grad_norm": 0.14848515874034607, "learning_rate": 0.00011567164179104477, "loss": 0.3408, "step": 310 }, { "epoch": 0.1743273542600897, "grad_norm": 0.1468921528006467, "learning_rate": 0.00011604477611940299, "loss": 0.3539, "step": 311 }, { "epoch": 0.17488789237668162, "grad_norm": 0.14226517621252568, "learning_rate": 0.0001164179104477612, "loss": 0.3406, "step": 312 }, { "epoch": 0.17544843049327355, "grad_norm": 0.14388093245906539, "learning_rate": 0.00011679104477611942, "loss": 0.3419, "step": 313 }, { "epoch": 0.17600896860986548, "grad_norm": 0.1447233515128136, "learning_rate": 0.0001171641791044776, "loss": 0.336, "step": 314 }, { "epoch": 0.1765695067264574, "grad_norm": 0.1455324803503982, "learning_rate": 0.00011753731343283582, "loss": 0.3465, "step": 315 }, { "epoch": 0.17713004484304934, "grad_norm": 0.14770033589219328, "learning_rate": 0.00011791044776119405, "loss": 0.3405, "step": 316 }, { "epoch": 0.17769058295964127, "grad_norm": 0.13999832382983, "learning_rate": 0.00011828358208955224, "loss": 0.3469, "step": 317 }, { "epoch": 0.17825112107623317, "grad_norm": 0.13762285639869806, "learning_rate": 0.00011865671641791045, "loss": 0.3416, "step": 318 }, { "epoch": 0.1788116591928251, "grad_norm": 0.14693857139845815, "learning_rate": 0.00011902985074626867, "loss": 0.3411, "step": 319 }, { "epoch": 0.17937219730941703, "grad_norm": 0.1508866834864334, "learning_rate": 0.00011940298507462686, "loss": 0.3629, "step": 320 }, { "epoch": 0.17993273542600896, "grad_norm": 0.14654222377365972, "learning_rate": 0.00011977611940298507, "loss": 0.3385, "step": 321 }, { "epoch": 0.1804932735426009, "grad_norm": 0.14817029949391497, "learning_rate": 0.00012014925373134329, "loss": 0.3187, "step": 322 }, { "epoch": 0.18105381165919282, "grad_norm": 0.14496977911609008, "learning_rate": 0.0001205223880597015, "loss": 0.334, "step": 323 }, { "epoch": 0.18161434977578475, "grad_norm": 0.14616862773211267, "learning_rate": 0.0001208955223880597, "loss": 0.3353, "step": 324 }, { "epoch": 0.18217488789237668, "grad_norm": 0.139574242854469, "learning_rate": 0.00012126865671641792, "loss": 0.3444, "step": 325 }, { "epoch": 0.1827354260089686, "grad_norm": 0.14678711716707, "learning_rate": 0.00012164179104477613, "loss": 0.3406, "step": 326 }, { "epoch": 0.18329596412556054, "grad_norm": 0.1383242188369039, "learning_rate": 0.00012201492537313432, "loss": 0.3462, "step": 327 }, { "epoch": 0.18385650224215247, "grad_norm": 0.14074334987634868, "learning_rate": 0.00012238805970149255, "loss": 0.34, "step": 328 }, { "epoch": 0.1844170403587444, "grad_norm": 0.13221503607649598, "learning_rate": 0.00012276119402985077, "loss": 0.3382, "step": 329 }, { "epoch": 0.18497757847533633, "grad_norm": 0.14504252934701425, "learning_rate": 0.00012313432835820895, "loss": 0.3356, "step": 330 }, { "epoch": 0.18553811659192826, "grad_norm": 0.14189066468483738, "learning_rate": 0.00012350746268656717, "loss": 0.3543, "step": 331 }, { "epoch": 0.1860986547085202, "grad_norm": 0.14575030702579406, "learning_rate": 0.00012388059701492538, "loss": 0.3327, "step": 332 }, { "epoch": 0.18665919282511212, "grad_norm": 0.1518908122266155, "learning_rate": 0.0001242537313432836, "loss": 0.3427, "step": 333 }, { "epoch": 0.18721973094170405, "grad_norm": 0.13073540683964113, "learning_rate": 0.0001246268656716418, "loss": 0.3323, "step": 334 }, { "epoch": 0.18778026905829595, "grad_norm": 0.14654813791251525, "learning_rate": 0.000125, "loss": 0.3513, "step": 335 }, { "epoch": 0.18834080717488788, "grad_norm": 0.14395303052531452, "learning_rate": 0.00012537313432835822, "loss": 0.3563, "step": 336 }, { "epoch": 0.1889013452914798, "grad_norm": 0.1361516821315323, "learning_rate": 0.0001257462686567164, "loss": 0.3263, "step": 337 }, { "epoch": 0.18946188340807174, "grad_norm": 0.1419463217755675, "learning_rate": 0.00012611940298507462, "loss": 0.3297, "step": 338 }, { "epoch": 0.19002242152466367, "grad_norm": 0.1655300104293705, "learning_rate": 0.00012649253731343284, "loss": 0.3393, "step": 339 }, { "epoch": 0.1905829596412556, "grad_norm": 0.15281292173494365, "learning_rate": 0.00012686567164179105, "loss": 0.3433, "step": 340 }, { "epoch": 0.19114349775784753, "grad_norm": 0.16592140919555418, "learning_rate": 0.00012723880597014927, "loss": 0.3285, "step": 341 }, { "epoch": 0.19170403587443946, "grad_norm": 0.14634846678856345, "learning_rate": 0.00012761194029850748, "loss": 0.3353, "step": 342 }, { "epoch": 0.1922645739910314, "grad_norm": 0.14725326795722657, "learning_rate": 0.0001279850746268657, "loss": 0.347, "step": 343 }, { "epoch": 0.19282511210762332, "grad_norm": 0.1410427654180543, "learning_rate": 0.00012835820895522389, "loss": 0.348, "step": 344 }, { "epoch": 0.19338565022421525, "grad_norm": 0.14665245126527168, "learning_rate": 0.0001287313432835821, "loss": 0.3442, "step": 345 }, { "epoch": 0.19394618834080718, "grad_norm": 0.149124223155523, "learning_rate": 0.00012910447761194032, "loss": 0.3358, "step": 346 }, { "epoch": 0.1945067264573991, "grad_norm": 0.1395297425768304, "learning_rate": 0.0001294776119402985, "loss": 0.3265, "step": 347 }, { "epoch": 0.19506726457399104, "grad_norm": 0.15476619855626284, "learning_rate": 0.00012985074626865672, "loss": 0.342, "step": 348 }, { "epoch": 0.19562780269058297, "grad_norm": 0.14943977200050382, "learning_rate": 0.00013022388059701493, "loss": 0.347, "step": 349 }, { "epoch": 0.1961883408071749, "grad_norm": 0.13087846809773412, "learning_rate": 0.00013059701492537315, "loss": 0.3273, "step": 350 }, { "epoch": 0.19674887892376683, "grad_norm": 0.14118185286815782, "learning_rate": 0.00013097014925373134, "loss": 0.3314, "step": 351 }, { "epoch": 0.19730941704035873, "grad_norm": 0.1486868542745911, "learning_rate": 0.00013134328358208955, "loss": 0.3327, "step": 352 }, { "epoch": 0.19786995515695066, "grad_norm": 0.13026695041967967, "learning_rate": 0.00013171641791044777, "loss": 0.3339, "step": 353 }, { "epoch": 0.1984304932735426, "grad_norm": 0.14206153326263493, "learning_rate": 0.00013208955223880596, "loss": 0.3428, "step": 354 }, { "epoch": 0.19899103139013452, "grad_norm": 0.14564897855047862, "learning_rate": 0.0001324626865671642, "loss": 0.3502, "step": 355 }, { "epoch": 0.19955156950672645, "grad_norm": 0.13257702188973255, "learning_rate": 0.0001328358208955224, "loss": 0.3417, "step": 356 }, { "epoch": 0.20011210762331838, "grad_norm": 0.12707052181073036, "learning_rate": 0.0001332089552238806, "loss": 0.3379, "step": 357 }, { "epoch": 0.2006726457399103, "grad_norm": 0.14558961583149116, "learning_rate": 0.00013358208955223882, "loss": 0.3446, "step": 358 }, { "epoch": 0.20123318385650224, "grad_norm": 0.13403130997892224, "learning_rate": 0.00013395522388059703, "loss": 0.3386, "step": 359 }, { "epoch": 0.20179372197309417, "grad_norm": 0.1383693750622578, "learning_rate": 0.00013432835820895525, "loss": 0.3294, "step": 360 }, { "epoch": 0.2023542600896861, "grad_norm": 0.14914836437591436, "learning_rate": 0.00013470149253731343, "loss": 0.336, "step": 361 }, { "epoch": 0.20291479820627803, "grad_norm": 0.14807475904717177, "learning_rate": 0.00013507462686567165, "loss": 0.3458, "step": 362 }, { "epoch": 0.20347533632286996, "grad_norm": 0.14607368191149564, "learning_rate": 0.00013544776119402987, "loss": 0.3418, "step": 363 }, { "epoch": 0.2040358744394619, "grad_norm": 0.14366790334710722, "learning_rate": 0.00013582089552238805, "loss": 0.3274, "step": 364 }, { "epoch": 0.20459641255605382, "grad_norm": 0.13680550785389592, "learning_rate": 0.00013619402985074627, "loss": 0.3123, "step": 365 }, { "epoch": 0.20515695067264575, "grad_norm": 0.13993820313914998, "learning_rate": 0.00013656716417910448, "loss": 0.3382, "step": 366 }, { "epoch": 0.20571748878923768, "grad_norm": 0.13377861071671815, "learning_rate": 0.00013694029850746267, "loss": 0.33, "step": 367 }, { "epoch": 0.2062780269058296, "grad_norm": 0.13704463750464957, "learning_rate": 0.0001373134328358209, "loss": 0.3523, "step": 368 }, { "epoch": 0.2068385650224215, "grad_norm": 0.13513020711651452, "learning_rate": 0.0001376865671641791, "loss": 0.3416, "step": 369 }, { "epoch": 0.20739910313901344, "grad_norm": 0.14131252420969243, "learning_rate": 0.00013805970149253732, "loss": 0.3316, "step": 370 }, { "epoch": 0.20795964125560537, "grad_norm": 0.15006108842697338, "learning_rate": 0.00013843283582089553, "loss": 0.3319, "step": 371 }, { "epoch": 0.2085201793721973, "grad_norm": 0.13713602623692356, "learning_rate": 0.00013880597014925375, "loss": 0.3334, "step": 372 }, { "epoch": 0.20908071748878923, "grad_norm": 0.14038281003520495, "learning_rate": 0.00013917910447761196, "loss": 0.3459, "step": 373 }, { "epoch": 0.20964125560538116, "grad_norm": 0.12250736037089766, "learning_rate": 0.00013955223880597015, "loss": 0.3432, "step": 374 }, { "epoch": 0.2102017937219731, "grad_norm": 0.12958496353681398, "learning_rate": 0.00013992537313432837, "loss": 0.3406, "step": 375 }, { "epoch": 0.21076233183856502, "grad_norm": 0.12931345924298085, "learning_rate": 0.00014029850746268658, "loss": 0.344, "step": 376 }, { "epoch": 0.21132286995515695, "grad_norm": 0.14903529712216704, "learning_rate": 0.00014067164179104477, "loss": 0.3318, "step": 377 }, { "epoch": 0.21188340807174888, "grad_norm": 0.13170430329554297, "learning_rate": 0.00014104477611940298, "loss": 0.3317, "step": 378 }, { "epoch": 0.2124439461883408, "grad_norm": 0.12667419201802455, "learning_rate": 0.0001414179104477612, "loss": 0.3273, "step": 379 }, { "epoch": 0.21300448430493274, "grad_norm": 0.13178478862435777, "learning_rate": 0.00014179104477611942, "loss": 0.3381, "step": 380 }, { "epoch": 0.21356502242152467, "grad_norm": 0.1291010897335178, "learning_rate": 0.0001421641791044776, "loss": 0.3375, "step": 381 }, { "epoch": 0.2141255605381166, "grad_norm": 0.13394707835363362, "learning_rate": 0.00014253731343283582, "loss": 0.3326, "step": 382 }, { "epoch": 0.21468609865470853, "grad_norm": 0.12853384755084213, "learning_rate": 0.00014291044776119403, "loss": 0.345, "step": 383 }, { "epoch": 0.21524663677130046, "grad_norm": 0.12994048579956208, "learning_rate": 0.00014328358208955225, "loss": 0.3156, "step": 384 }, { "epoch": 0.2158071748878924, "grad_norm": 0.1368026539480669, "learning_rate": 0.00014365671641791046, "loss": 0.3342, "step": 385 }, { "epoch": 0.2163677130044843, "grad_norm": 0.1380109557323045, "learning_rate": 0.00014402985074626868, "loss": 0.3292, "step": 386 }, { "epoch": 0.21692825112107622, "grad_norm": 0.13514092013237952, "learning_rate": 0.00014440298507462687, "loss": 0.3285, "step": 387 }, { "epoch": 0.21748878923766815, "grad_norm": 0.12733082443982355, "learning_rate": 0.00014477611940298508, "loss": 0.3315, "step": 388 }, { "epoch": 0.21804932735426008, "grad_norm": 0.1294327190388571, "learning_rate": 0.0001451492537313433, "loss": 0.3373, "step": 389 }, { "epoch": 0.218609865470852, "grad_norm": 0.12943400016143503, "learning_rate": 0.0001455223880597015, "loss": 0.3286, "step": 390 }, { "epoch": 0.21917040358744394, "grad_norm": 0.12626830903738212, "learning_rate": 0.0001458955223880597, "loss": 0.3448, "step": 391 }, { "epoch": 0.21973094170403587, "grad_norm": 0.12459201714786171, "learning_rate": 0.00014626865671641792, "loss": 0.3314, "step": 392 }, { "epoch": 0.2202914798206278, "grad_norm": 0.12647395067356484, "learning_rate": 0.00014664179104477613, "loss": 0.3395, "step": 393 }, { "epoch": 0.22085201793721973, "grad_norm": 0.12867348213574867, "learning_rate": 0.00014701492537313432, "loss": 0.3423, "step": 394 }, { "epoch": 0.22141255605381166, "grad_norm": 0.1221176661464511, "learning_rate": 0.00014738805970149253, "loss": 0.3301, "step": 395 }, { "epoch": 0.2219730941704036, "grad_norm": 0.1372287955348562, "learning_rate": 0.00014776119402985075, "loss": 0.343, "step": 396 }, { "epoch": 0.22253363228699552, "grad_norm": 0.13066443642622122, "learning_rate": 0.00014813432835820894, "loss": 0.3337, "step": 397 }, { "epoch": 0.22309417040358745, "grad_norm": 0.12841879469681047, "learning_rate": 0.00014850746268656718, "loss": 0.323, "step": 398 }, { "epoch": 0.22365470852017938, "grad_norm": 0.12639845139011982, "learning_rate": 0.0001488805970149254, "loss": 0.3181, "step": 399 }, { "epoch": 0.2242152466367713, "grad_norm": 0.13152331106311524, "learning_rate": 0.0001492537313432836, "loss": 0.3311, "step": 400 }, { "epoch": 0.22477578475336324, "grad_norm": 0.1257243364510895, "learning_rate": 0.0001496268656716418, "loss": 0.3473, "step": 401 }, { "epoch": 0.22533632286995517, "grad_norm": 0.11961559661149512, "learning_rate": 0.00015000000000000001, "loss": 0.3302, "step": 402 }, { "epoch": 0.2258968609865471, "grad_norm": 0.1285158772976168, "learning_rate": 0.00015037313432835823, "loss": 0.3303, "step": 403 }, { "epoch": 0.226457399103139, "grad_norm": 0.12700948859909972, "learning_rate": 0.00015074626865671642, "loss": 0.346, "step": 404 }, { "epoch": 0.22701793721973093, "grad_norm": 0.12493785729680966, "learning_rate": 0.00015111940298507463, "loss": 0.3228, "step": 405 }, { "epoch": 0.22757847533632286, "grad_norm": 0.1387796666197342, "learning_rate": 0.00015149253731343285, "loss": 0.3419, "step": 406 }, { "epoch": 0.2281390134529148, "grad_norm": 0.12168730216608055, "learning_rate": 0.00015186567164179106, "loss": 0.3269, "step": 407 }, { "epoch": 0.22869955156950672, "grad_norm": 0.12054043851765807, "learning_rate": 0.00015223880597014925, "loss": 0.3417, "step": 408 }, { "epoch": 0.22926008968609865, "grad_norm": 0.1262020840793984, "learning_rate": 0.00015261194029850747, "loss": 0.3351, "step": 409 }, { "epoch": 0.22982062780269058, "grad_norm": 0.12646633604287688, "learning_rate": 0.00015298507462686568, "loss": 0.332, "step": 410 }, { "epoch": 0.2303811659192825, "grad_norm": 0.12678814574552924, "learning_rate": 0.00015335820895522387, "loss": 0.3274, "step": 411 }, { "epoch": 0.23094170403587444, "grad_norm": 0.1247749319102667, "learning_rate": 0.00015373134328358208, "loss": 0.3293, "step": 412 }, { "epoch": 0.23150224215246637, "grad_norm": 0.1319512977808111, "learning_rate": 0.0001541044776119403, "loss": 0.3475, "step": 413 }, { "epoch": 0.2320627802690583, "grad_norm": 0.1291927456921302, "learning_rate": 0.00015447761194029851, "loss": 0.3318, "step": 414 }, { "epoch": 0.23262331838565023, "grad_norm": 0.1326160886521916, "learning_rate": 0.00015485074626865673, "loss": 0.3398, "step": 415 }, { "epoch": 0.23318385650224216, "grad_norm": 0.13783299680690347, "learning_rate": 0.00015522388059701495, "loss": 0.3145, "step": 416 }, { "epoch": 0.2337443946188341, "grad_norm": 0.14278625424021602, "learning_rate": 0.00015559701492537316, "loss": 0.3237, "step": 417 }, { "epoch": 0.23430493273542602, "grad_norm": 0.1255560465560579, "learning_rate": 0.00015597014925373135, "loss": 0.318, "step": 418 }, { "epoch": 0.23486547085201795, "grad_norm": 0.1293448524834811, "learning_rate": 0.00015634328358208956, "loss": 0.3244, "step": 419 }, { "epoch": 0.23542600896860988, "grad_norm": 0.12125345580304638, "learning_rate": 0.00015671641791044778, "loss": 0.3269, "step": 420 }, { "epoch": 0.23598654708520178, "grad_norm": 0.1301695815606069, "learning_rate": 0.00015708955223880597, "loss": 0.3452, "step": 421 }, { "epoch": 0.2365470852017937, "grad_norm": 0.11543487301027802, "learning_rate": 0.00015746268656716418, "loss": 0.3108, "step": 422 }, { "epoch": 0.23710762331838564, "grad_norm": 0.11808058684873649, "learning_rate": 0.0001578358208955224, "loss": 0.304, "step": 423 }, { "epoch": 0.23766816143497757, "grad_norm": 0.11914598540750668, "learning_rate": 0.00015820895522388059, "loss": 0.3307, "step": 424 }, { "epoch": 0.2382286995515695, "grad_norm": 0.11718635825048222, "learning_rate": 0.0001585820895522388, "loss": 0.3282, "step": 425 }, { "epoch": 0.23878923766816143, "grad_norm": 0.12828602408853326, "learning_rate": 0.00015895522388059702, "loss": 0.344, "step": 426 }, { "epoch": 0.23934977578475336, "grad_norm": 0.12241044394498629, "learning_rate": 0.00015932835820895523, "loss": 0.3263, "step": 427 }, { "epoch": 0.2399103139013453, "grad_norm": 0.12628652174142493, "learning_rate": 0.00015970149253731345, "loss": 0.3193, "step": 428 }, { "epoch": 0.24047085201793722, "grad_norm": 0.12350703208546007, "learning_rate": 0.00016007462686567166, "loss": 0.33, "step": 429 }, { "epoch": 0.24103139013452915, "grad_norm": 0.1260052637488716, "learning_rate": 0.00016044776119402988, "loss": 0.3201, "step": 430 }, { "epoch": 0.24159192825112108, "grad_norm": 0.1281992427766547, "learning_rate": 0.00016082089552238806, "loss": 0.3278, "step": 431 }, { "epoch": 0.242152466367713, "grad_norm": 0.12262788847949457, "learning_rate": 0.00016119402985074628, "loss": 0.3321, "step": 432 }, { "epoch": 0.24271300448430494, "grad_norm": 0.12259307695789733, "learning_rate": 0.0001615671641791045, "loss": 0.3334, "step": 433 }, { "epoch": 0.24327354260089687, "grad_norm": 0.13187011912485422, "learning_rate": 0.00016194029850746268, "loss": 0.3377, "step": 434 }, { "epoch": 0.2438340807174888, "grad_norm": 0.12047127318376988, "learning_rate": 0.0001623134328358209, "loss": 0.3327, "step": 435 }, { "epoch": 0.24439461883408073, "grad_norm": 0.12254124825937357, "learning_rate": 0.00016268656716417911, "loss": 0.3251, "step": 436 }, { "epoch": 0.24495515695067266, "grad_norm": 0.11229576952987816, "learning_rate": 0.00016305970149253733, "loss": 0.3247, "step": 437 }, { "epoch": 0.24551569506726456, "grad_norm": 0.11316247053608615, "learning_rate": 0.00016343283582089552, "loss": 0.3362, "step": 438 }, { "epoch": 0.2460762331838565, "grad_norm": 0.12670848030646706, "learning_rate": 0.00016380597014925373, "loss": 0.3324, "step": 439 }, { "epoch": 0.24663677130044842, "grad_norm": 0.11407342067194146, "learning_rate": 0.00016417910447761195, "loss": 0.33, "step": 440 }, { "epoch": 0.24719730941704035, "grad_norm": 0.12686252667414058, "learning_rate": 0.00016455223880597016, "loss": 0.321, "step": 441 }, { "epoch": 0.24775784753363228, "grad_norm": 0.12257208306795794, "learning_rate": 0.00016492537313432838, "loss": 0.3359, "step": 442 }, { "epoch": 0.2483183856502242, "grad_norm": 0.12416215291655391, "learning_rate": 0.0001652985074626866, "loss": 0.3391, "step": 443 }, { "epoch": 0.24887892376681614, "grad_norm": 0.11842325228755514, "learning_rate": 0.00016567164179104478, "loss": 0.3228, "step": 444 }, { "epoch": 0.24943946188340807, "grad_norm": 0.11557531510118868, "learning_rate": 0.000166044776119403, "loss": 0.3307, "step": 445 }, { "epoch": 0.25, "grad_norm": 0.12170820760450851, "learning_rate": 0.0001664179104477612, "loss": 0.3482, "step": 446 }, { "epoch": 0.2505605381165919, "grad_norm": 0.1293312049559174, "learning_rate": 0.00016679104477611943, "loss": 0.3348, "step": 447 }, { "epoch": 0.25112107623318386, "grad_norm": 0.1129033052648288, "learning_rate": 0.00016716417910447761, "loss": 0.3312, "step": 448 }, { "epoch": 0.25168161434977576, "grad_norm": 0.1215834233409738, "learning_rate": 0.00016753731343283583, "loss": 0.3255, "step": 449 }, { "epoch": 0.2522421524663677, "grad_norm": 0.12180545876740287, "learning_rate": 0.00016791044776119405, "loss": 0.3339, "step": 450 }, { "epoch": 0.2528026905829596, "grad_norm": 0.11871718449220875, "learning_rate": 0.00016828358208955223, "loss": 0.3342, "step": 451 }, { "epoch": 0.2533632286995516, "grad_norm": 0.10955442215647969, "learning_rate": 0.00016865671641791045, "loss": 0.3183, "step": 452 }, { "epoch": 0.2539237668161435, "grad_norm": 0.12141904390740511, "learning_rate": 0.00016902985074626866, "loss": 0.3338, "step": 453 }, { "epoch": 0.25448430493273544, "grad_norm": 0.109363161276611, "learning_rate": 0.00016940298507462685, "loss": 0.3275, "step": 454 }, { "epoch": 0.25504484304932734, "grad_norm": 0.10861033736457545, "learning_rate": 0.00016977611940298507, "loss": 0.3202, "step": 455 }, { "epoch": 0.2556053811659193, "grad_norm": 0.11701255773856319, "learning_rate": 0.00017014925373134328, "loss": 0.3259, "step": 456 }, { "epoch": 0.2561659192825112, "grad_norm": 0.1237391208716396, "learning_rate": 0.0001705223880597015, "loss": 0.3367, "step": 457 }, { "epoch": 0.25672645739910316, "grad_norm": 0.12193150919073976, "learning_rate": 0.0001708955223880597, "loss": 0.3064, "step": 458 }, { "epoch": 0.25728699551569506, "grad_norm": 0.11247330351826199, "learning_rate": 0.00017126865671641793, "loss": 0.337, "step": 459 }, { "epoch": 0.257847533632287, "grad_norm": 0.12608345820517644, "learning_rate": 0.00017164179104477614, "loss": 0.3307, "step": 460 }, { "epoch": 0.2584080717488789, "grad_norm": 0.10981245125607718, "learning_rate": 0.00017201492537313433, "loss": 0.3249, "step": 461 }, { "epoch": 0.2589686098654709, "grad_norm": 0.12160721153034673, "learning_rate": 0.00017238805970149255, "loss": 0.3148, "step": 462 }, { "epoch": 0.2595291479820628, "grad_norm": 0.11499630830458173, "learning_rate": 0.00017276119402985076, "loss": 0.3247, "step": 463 }, { "epoch": 0.2600896860986547, "grad_norm": 0.11773276443927631, "learning_rate": 0.00017313432835820895, "loss": 0.3409, "step": 464 }, { "epoch": 0.26065022421524664, "grad_norm": 0.1163894352235071, "learning_rate": 0.00017350746268656716, "loss": 0.3312, "step": 465 }, { "epoch": 0.26121076233183854, "grad_norm": 0.11990564012827774, "learning_rate": 0.00017388059701492538, "loss": 0.3253, "step": 466 }, { "epoch": 0.2617713004484305, "grad_norm": 0.11171211187369197, "learning_rate": 0.0001742537313432836, "loss": 0.3254, "step": 467 }, { "epoch": 0.2623318385650224, "grad_norm": 0.11139288133558127, "learning_rate": 0.00017462686567164178, "loss": 0.3384, "step": 468 }, { "epoch": 0.26289237668161436, "grad_norm": 0.10673322543340892, "learning_rate": 0.000175, "loss": 0.3214, "step": 469 }, { "epoch": 0.26345291479820626, "grad_norm": 0.11082988585060753, "learning_rate": 0.0001753731343283582, "loss": 0.3326, "step": 470 }, { "epoch": 0.2640134529147982, "grad_norm": 0.11109033950550597, "learning_rate": 0.00017574626865671643, "loss": 0.3234, "step": 471 }, { "epoch": 0.2645739910313901, "grad_norm": 0.10788131857393239, "learning_rate": 0.00017611940298507464, "loss": 0.3144, "step": 472 }, { "epoch": 0.2651345291479821, "grad_norm": 0.11526598585641781, "learning_rate": 0.00017649253731343286, "loss": 0.3208, "step": 473 }, { "epoch": 0.265695067264574, "grad_norm": 0.12174354420730553, "learning_rate": 0.00017686567164179107, "loss": 0.3173, "step": 474 }, { "epoch": 0.26625560538116594, "grad_norm": 0.11021769520247941, "learning_rate": 0.00017723880597014926, "loss": 0.3194, "step": 475 }, { "epoch": 0.26681614349775784, "grad_norm": 0.11133392006178908, "learning_rate": 0.00017761194029850748, "loss": 0.318, "step": 476 }, { "epoch": 0.2673766816143498, "grad_norm": 0.11254586812321232, "learning_rate": 0.0001779850746268657, "loss": 0.3293, "step": 477 }, { "epoch": 0.2679372197309417, "grad_norm": 0.11183122193309668, "learning_rate": 0.00017835820895522388, "loss": 0.3368, "step": 478 }, { "epoch": 0.26849775784753366, "grad_norm": 0.10711717418436024, "learning_rate": 0.0001787313432835821, "loss": 0.3153, "step": 479 }, { "epoch": 0.26905829596412556, "grad_norm": 0.11448137615640368, "learning_rate": 0.0001791044776119403, "loss": 0.3471, "step": 480 }, { "epoch": 0.26961883408071746, "grad_norm": 0.11914801671735352, "learning_rate": 0.0001794776119402985, "loss": 0.3223, "step": 481 }, { "epoch": 0.2701793721973094, "grad_norm": 0.11137282750934051, "learning_rate": 0.00017985074626865671, "loss": 0.3211, "step": 482 }, { "epoch": 0.2707399103139013, "grad_norm": 0.10832969827373858, "learning_rate": 0.00018022388059701493, "loss": 0.3085, "step": 483 }, { "epoch": 0.2713004484304933, "grad_norm": 0.11085902902463791, "learning_rate": 0.00018059701492537314, "loss": 0.3175, "step": 484 }, { "epoch": 0.2718609865470852, "grad_norm": 0.11819460294134253, "learning_rate": 0.00018097014925373136, "loss": 0.3248, "step": 485 }, { "epoch": 0.27242152466367714, "grad_norm": 0.11313174097954944, "learning_rate": 0.00018134328358208958, "loss": 0.3199, "step": 486 }, { "epoch": 0.27298206278026904, "grad_norm": 0.12095925240596792, "learning_rate": 0.0001817164179104478, "loss": 0.3225, "step": 487 }, { "epoch": 0.273542600896861, "grad_norm": 0.11014289563863666, "learning_rate": 0.00018208955223880598, "loss": 0.3286, "step": 488 }, { "epoch": 0.2741031390134529, "grad_norm": 0.11204674994833956, "learning_rate": 0.0001824626865671642, "loss": 0.3108, "step": 489 }, { "epoch": 0.27466367713004486, "grad_norm": 0.1157331496584953, "learning_rate": 0.0001828358208955224, "loss": 0.3351, "step": 490 }, { "epoch": 0.27522421524663676, "grad_norm": 0.10566204971680675, "learning_rate": 0.0001832089552238806, "loss": 0.3116, "step": 491 }, { "epoch": 0.2757847533632287, "grad_norm": 0.11759218211668784, "learning_rate": 0.0001835820895522388, "loss": 0.3258, "step": 492 }, { "epoch": 0.2763452914798206, "grad_norm": 0.11586768756096413, "learning_rate": 0.00018395522388059703, "loss": 0.3313, "step": 493 }, { "epoch": 0.2769058295964126, "grad_norm": 0.1113792378824031, "learning_rate": 0.00018432835820895524, "loss": 0.3164, "step": 494 }, { "epoch": 0.2774663677130045, "grad_norm": 0.10801651632926149, "learning_rate": 0.00018470149253731343, "loss": 0.3189, "step": 495 }, { "epoch": 0.27802690582959644, "grad_norm": 0.11938118233555668, "learning_rate": 0.00018507462686567165, "loss": 0.3203, "step": 496 }, { "epoch": 0.27858744394618834, "grad_norm": 0.11022565270186098, "learning_rate": 0.00018544776119402986, "loss": 0.3284, "step": 497 }, { "epoch": 0.27914798206278024, "grad_norm": 0.11945531573485274, "learning_rate": 0.00018582089552238805, "loss": 0.3271, "step": 498 }, { "epoch": 0.2797085201793722, "grad_norm": 0.12370474328505932, "learning_rate": 0.00018619402985074626, "loss": 0.3387, "step": 499 }, { "epoch": 0.2802690582959641, "grad_norm": 0.11262416779259585, "learning_rate": 0.00018656716417910448, "loss": 0.3191, "step": 500 }, { "epoch": 0.28082959641255606, "grad_norm": 0.11273380446520367, "learning_rate": 0.0001869402985074627, "loss": 0.3211, "step": 501 }, { "epoch": 0.28139013452914796, "grad_norm": 0.12015390797410297, "learning_rate": 0.0001873134328358209, "loss": 0.3214, "step": 502 }, { "epoch": 0.2819506726457399, "grad_norm": 0.12206397188877245, "learning_rate": 0.00018768656716417913, "loss": 0.3162, "step": 503 }, { "epoch": 0.2825112107623318, "grad_norm": 0.10816697719872387, "learning_rate": 0.00018805970149253734, "loss": 0.3301, "step": 504 }, { "epoch": 0.2830717488789238, "grad_norm": 0.11848480710071109, "learning_rate": 0.00018843283582089553, "loss": 0.3247, "step": 505 }, { "epoch": 0.2836322869955157, "grad_norm": 0.11298401980526003, "learning_rate": 0.00018880597014925374, "loss": 0.3279, "step": 506 }, { "epoch": 0.28419282511210764, "grad_norm": 0.11953959263626984, "learning_rate": 0.00018917910447761196, "loss": 0.3299, "step": 507 }, { "epoch": 0.28475336322869954, "grad_norm": 0.10894689280623014, "learning_rate": 0.00018955223880597015, "loss": 0.3166, "step": 508 }, { "epoch": 0.2853139013452915, "grad_norm": 0.11704978977855075, "learning_rate": 0.00018992537313432836, "loss": 0.3288, "step": 509 }, { "epoch": 0.2858744394618834, "grad_norm": 0.11017058286438122, "learning_rate": 0.00019029850746268658, "loss": 0.324, "step": 510 }, { "epoch": 0.28643497757847536, "grad_norm": 0.10023837779249684, "learning_rate": 0.00019067164179104477, "loss": 0.3136, "step": 511 }, { "epoch": 0.28699551569506726, "grad_norm": 0.10568197659375772, "learning_rate": 0.00019104477611940298, "loss": 0.316, "step": 512 }, { "epoch": 0.2875560538116592, "grad_norm": 0.11314477582838516, "learning_rate": 0.0001914179104477612, "loss": 0.332, "step": 513 }, { "epoch": 0.2881165919282511, "grad_norm": 0.10223986780338, "learning_rate": 0.0001917910447761194, "loss": 0.3263, "step": 514 }, { "epoch": 0.288677130044843, "grad_norm": 0.11621140033209205, "learning_rate": 0.00019216417910447763, "loss": 0.3251, "step": 515 }, { "epoch": 0.289237668161435, "grad_norm": 0.10557208094551189, "learning_rate": 0.00019253731343283584, "loss": 0.3101, "step": 516 }, { "epoch": 0.2897982062780269, "grad_norm": 0.10613098222526403, "learning_rate": 0.00019291044776119406, "loss": 0.3081, "step": 517 }, { "epoch": 0.29035874439461884, "grad_norm": 0.10511912052331052, "learning_rate": 0.00019328358208955224, "loss": 0.3085, "step": 518 }, { "epoch": 0.29091928251121074, "grad_norm": 0.10622882777429886, "learning_rate": 0.00019365671641791046, "loss": 0.3211, "step": 519 }, { "epoch": 0.2914798206278027, "grad_norm": 0.11402567747806869, "learning_rate": 0.00019402985074626867, "loss": 0.3209, "step": 520 }, { "epoch": 0.2920403587443946, "grad_norm": 0.10923405553131683, "learning_rate": 0.00019440298507462686, "loss": 0.3273, "step": 521 }, { "epoch": 0.29260089686098656, "grad_norm": 0.11579444570145475, "learning_rate": 0.00019477611940298508, "loss": 0.3161, "step": 522 }, { "epoch": 0.29316143497757846, "grad_norm": 0.10559557707401233, "learning_rate": 0.0001951492537313433, "loss": 0.3248, "step": 523 }, { "epoch": 0.2937219730941704, "grad_norm": 0.1122893033865488, "learning_rate": 0.0001955223880597015, "loss": 0.3284, "step": 524 }, { "epoch": 0.2942825112107623, "grad_norm": 0.10802888290038817, "learning_rate": 0.0001958955223880597, "loss": 0.3337, "step": 525 }, { "epoch": 0.2948430493273543, "grad_norm": 0.1093293512128306, "learning_rate": 0.0001962686567164179, "loss": 0.3238, "step": 526 }, { "epoch": 0.2954035874439462, "grad_norm": 0.11270917361257277, "learning_rate": 0.00019664179104477613, "loss": 0.3306, "step": 527 }, { "epoch": 0.29596412556053814, "grad_norm": 0.10963247890500617, "learning_rate": 0.00019701492537313434, "loss": 0.3187, "step": 528 }, { "epoch": 0.29652466367713004, "grad_norm": 0.10687024928485456, "learning_rate": 0.00019738805970149256, "loss": 0.3185, "step": 529 }, { "epoch": 0.297085201793722, "grad_norm": 0.10717585930306575, "learning_rate": 0.00019776119402985077, "loss": 0.3133, "step": 530 }, { "epoch": 0.2976457399103139, "grad_norm": 0.11014802532876093, "learning_rate": 0.00019813432835820896, "loss": 0.3213, "step": 531 }, { "epoch": 0.2982062780269058, "grad_norm": 0.11159846382648866, "learning_rate": 0.00019850746268656718, "loss": 0.3272, "step": 532 }, { "epoch": 0.29876681614349776, "grad_norm": 0.11310481463261243, "learning_rate": 0.0001988805970149254, "loss": 0.3087, "step": 533 }, { "epoch": 0.29932735426008966, "grad_norm": 0.110102311935057, "learning_rate": 0.0001992537313432836, "loss": 0.3342, "step": 534 }, { "epoch": 0.2998878923766816, "grad_norm": 0.10480909108854215, "learning_rate": 0.0001996268656716418, "loss": 0.3119, "step": 535 }, { "epoch": 0.3004484304932735, "grad_norm": 0.11468645363518458, "learning_rate": 0.0002, "loss": 0.3187, "step": 536 }, { "epoch": 0.3010089686098655, "grad_norm": 0.10153097094841049, "learning_rate": 0.00019999997872366705, "loss": 0.318, "step": 537 }, { "epoch": 0.3015695067264574, "grad_norm": 0.1007684303802919, "learning_rate": 0.00019999991489467726, "loss": 0.3213, "step": 538 }, { "epoch": 0.30213004484304934, "grad_norm": 0.10014895411046884, "learning_rate": 0.00019999980851305782, "loss": 0.3076, "step": 539 }, { "epoch": 0.30269058295964124, "grad_norm": 0.10585004893494934, "learning_rate": 0.00019999965957885393, "loss": 0.3292, "step": 540 }, { "epoch": 0.3032511210762332, "grad_norm": 0.1152760618726801, "learning_rate": 0.00019999946809212904, "loss": 0.3221, "step": 541 }, { "epoch": 0.3038116591928251, "grad_norm": 0.09959725256443848, "learning_rate": 0.00019999923405296458, "loss": 0.3352, "step": 542 }, { "epoch": 0.30437219730941706, "grad_norm": 0.10277553131827694, "learning_rate": 0.0001999989574614602, "loss": 0.3336, "step": 543 }, { "epoch": 0.30493273542600896, "grad_norm": 0.10378740301586237, "learning_rate": 0.0001999986383177335, "loss": 0.3305, "step": 544 }, { "epoch": 0.3054932735426009, "grad_norm": 0.10570039748170808, "learning_rate": 0.00019999827662192033, "loss": 0.3141, "step": 545 }, { "epoch": 0.3060538116591928, "grad_norm": 0.10751566390236701, "learning_rate": 0.00019999787237417468, "loss": 0.3078, "step": 546 }, { "epoch": 0.3066143497757848, "grad_norm": 0.10953137196388753, "learning_rate": 0.00019999742557466846, "loss": 0.3109, "step": 547 }, { "epoch": 0.3071748878923767, "grad_norm": 0.10836313700897207, "learning_rate": 0.00019999693622359184, "loss": 0.3309, "step": 548 }, { "epoch": 0.3077354260089686, "grad_norm": 0.11964787223857871, "learning_rate": 0.00019999640432115303, "loss": 0.321, "step": 549 }, { "epoch": 0.30829596412556054, "grad_norm": 0.09679420757442243, "learning_rate": 0.00019999582986757842, "loss": 0.328, "step": 550 }, { "epoch": 0.30885650224215244, "grad_norm": 0.10483981868839679, "learning_rate": 0.00019999521286311238, "loss": 0.3277, "step": 551 }, { "epoch": 0.3094170403587444, "grad_norm": 0.10724254550549712, "learning_rate": 0.0001999945533080175, "loss": 0.3121, "step": 552 }, { "epoch": 0.3099775784753363, "grad_norm": 0.09635511885821291, "learning_rate": 0.00019999385120257447, "loss": 0.3146, "step": 553 }, { "epoch": 0.31053811659192826, "grad_norm": 0.10613771585662128, "learning_rate": 0.00019999310654708204, "loss": 0.3218, "step": 554 }, { "epoch": 0.31109865470852016, "grad_norm": 0.0957282225898911, "learning_rate": 0.00019999231934185704, "loss": 0.3157, "step": 555 }, { "epoch": 0.3116591928251121, "grad_norm": 0.0969081583603865, "learning_rate": 0.00019999148958723447, "loss": 0.3187, "step": 556 }, { "epoch": 0.312219730941704, "grad_norm": 0.09891256442125447, "learning_rate": 0.00019999061728356743, "loss": 0.3054, "step": 557 }, { "epoch": 0.312780269058296, "grad_norm": 0.1122183619977053, "learning_rate": 0.0001999897024312271, "loss": 0.3112, "step": 558 }, { "epoch": 0.3133408071748879, "grad_norm": 0.11277342986397464, "learning_rate": 0.00019998874503060273, "loss": 0.3296, "step": 559 }, { "epoch": 0.31390134529147984, "grad_norm": 0.0970263777528716, "learning_rate": 0.0001999877450821018, "loss": 0.3154, "step": 560 }, { "epoch": 0.31446188340807174, "grad_norm": 0.10002989014994842, "learning_rate": 0.00019998670258614975, "loss": 0.3147, "step": 561 }, { "epoch": 0.3150224215246637, "grad_norm": 0.10261105992382952, "learning_rate": 0.00019998561754319024, "loss": 0.3174, "step": 562 }, { "epoch": 0.3155829596412556, "grad_norm": 0.0972262981200895, "learning_rate": 0.0001999844899536849, "loss": 0.3211, "step": 563 }, { "epoch": 0.31614349775784756, "grad_norm": 0.09662314153307829, "learning_rate": 0.00019998331981811366, "loss": 0.3069, "step": 564 }, { "epoch": 0.31670403587443946, "grad_norm": 0.09693479137435353, "learning_rate": 0.00019998210713697437, "loss": 0.3248, "step": 565 }, { "epoch": 0.3172645739910314, "grad_norm": 0.10673502252972808, "learning_rate": 0.0001999808519107831, "loss": 0.3077, "step": 566 }, { "epoch": 0.3178251121076233, "grad_norm": 0.10270536213698937, "learning_rate": 0.00019997955414007392, "loss": 0.3177, "step": 567 }, { "epoch": 0.3183856502242152, "grad_norm": 0.10141916843663253, "learning_rate": 0.0001999782138253991, "loss": 0.304, "step": 568 }, { "epoch": 0.3189461883408072, "grad_norm": 0.10545679225726372, "learning_rate": 0.00019997683096732906, "loss": 0.3126, "step": 569 }, { "epoch": 0.3195067264573991, "grad_norm": 0.10698494377190094, "learning_rate": 0.00019997540556645208, "loss": 0.3197, "step": 570 }, { "epoch": 0.32006726457399104, "grad_norm": 0.09791580882147409, "learning_rate": 0.00019997393762337487, "loss": 0.3313, "step": 571 }, { "epoch": 0.32062780269058294, "grad_norm": 0.09848352076082446, "learning_rate": 0.00019997242713872196, "loss": 0.3106, "step": 572 }, { "epoch": 0.3211883408071749, "grad_norm": 0.10380107770857379, "learning_rate": 0.00019997087411313617, "loss": 0.3213, "step": 573 }, { "epoch": 0.3217488789237668, "grad_norm": 0.09518341108716867, "learning_rate": 0.0001999692785472783, "loss": 0.3141, "step": 574 }, { "epoch": 0.32230941704035876, "grad_norm": 0.09490611547488066, "learning_rate": 0.00019996764044182737, "loss": 0.3022, "step": 575 }, { "epoch": 0.32286995515695066, "grad_norm": 0.10238074080827257, "learning_rate": 0.00019996595979748037, "loss": 0.324, "step": 576 }, { "epoch": 0.3234304932735426, "grad_norm": 0.09636959644273263, "learning_rate": 0.00019996423661495252, "loss": 0.3029, "step": 577 }, { "epoch": 0.3239910313901345, "grad_norm": 0.0906416563210388, "learning_rate": 0.00019996247089497704, "loss": 0.3046, "step": 578 }, { "epoch": 0.3245515695067265, "grad_norm": 0.09867816493236885, "learning_rate": 0.00019996066263830531, "loss": 0.3034, "step": 579 }, { "epoch": 0.3251121076233184, "grad_norm": 0.10144669492274641, "learning_rate": 0.00019995881184570676, "loss": 0.3231, "step": 580 }, { "epoch": 0.32567264573991034, "grad_norm": 0.09336944858099906, "learning_rate": 0.000199956918517969, "loss": 0.322, "step": 581 }, { "epoch": 0.32623318385650224, "grad_norm": 0.09516852389764888, "learning_rate": 0.00019995498265589764, "loss": 0.3153, "step": 582 }, { "epoch": 0.3267937219730942, "grad_norm": 0.09579667771014898, "learning_rate": 0.00019995300426031652, "loss": 0.3192, "step": 583 }, { "epoch": 0.3273542600896861, "grad_norm": 0.09465110693869315, "learning_rate": 0.00019995098333206742, "loss": 0.3021, "step": 584 }, { "epoch": 0.327914798206278, "grad_norm": 0.09649031325116542, "learning_rate": 0.00019994891987201033, "loss": 0.3237, "step": 585 }, { "epoch": 0.32847533632286996, "grad_norm": 0.09981989468985598, "learning_rate": 0.00019994681388102329, "loss": 0.3122, "step": 586 }, { "epoch": 0.32903587443946186, "grad_norm": 0.09679065544912857, "learning_rate": 0.00019994466536000247, "loss": 0.3058, "step": 587 }, { "epoch": 0.3295964125560538, "grad_norm": 0.09342549112119153, "learning_rate": 0.00019994247430986213, "loss": 0.3076, "step": 588 }, { "epoch": 0.3301569506726457, "grad_norm": 0.09804709111681088, "learning_rate": 0.0001999402407315346, "loss": 0.3151, "step": 589 }, { "epoch": 0.3307174887892377, "grad_norm": 0.09797177899408367, "learning_rate": 0.00019993796462597038, "loss": 0.3159, "step": 590 }, { "epoch": 0.3312780269058296, "grad_norm": 0.0930412294160317, "learning_rate": 0.00019993564599413792, "loss": 0.3112, "step": 591 }, { "epoch": 0.33183856502242154, "grad_norm": 0.09188878510394342, "learning_rate": 0.00019993328483702393, "loss": 0.3053, "step": 592 }, { "epoch": 0.33239910313901344, "grad_norm": 0.09712951064347462, "learning_rate": 0.00019993088115563318, "loss": 0.3104, "step": 593 }, { "epoch": 0.3329596412556054, "grad_norm": 0.09704324731264691, "learning_rate": 0.00019992843495098838, "loss": 0.3116, "step": 594 }, { "epoch": 0.3335201793721973, "grad_norm": 0.09257904567873011, "learning_rate": 0.00019992594622413056, "loss": 0.3197, "step": 595 }, { "epoch": 0.33408071748878926, "grad_norm": 0.09909579185505196, "learning_rate": 0.0001999234149761187, "loss": 0.3296, "step": 596 }, { "epoch": 0.33464125560538116, "grad_norm": 0.09585706197264815, "learning_rate": 0.0001999208412080299, "loss": 0.3062, "step": 597 }, { "epoch": 0.3352017937219731, "grad_norm": 0.09786691067245115, "learning_rate": 0.00019991822492095943, "loss": 0.312, "step": 598 }, { "epoch": 0.335762331838565, "grad_norm": 0.09425861974210471, "learning_rate": 0.0001999155661160205, "loss": 0.321, "step": 599 }, { "epoch": 0.336322869955157, "grad_norm": 0.09807000412215684, "learning_rate": 0.00019991286479434454, "loss": 0.3177, "step": 600 }, { "epoch": 0.3368834080717489, "grad_norm": 0.09292352823747375, "learning_rate": 0.00019991012095708105, "loss": 0.3054, "step": 601 }, { "epoch": 0.3374439461883408, "grad_norm": 0.09778994065142038, "learning_rate": 0.00019990733460539762, "loss": 0.3286, "step": 602 }, { "epoch": 0.33800448430493274, "grad_norm": 0.10450814592202336, "learning_rate": 0.0001999045057404799, "loss": 0.3001, "step": 603 }, { "epoch": 0.33856502242152464, "grad_norm": 0.09520669536670996, "learning_rate": 0.0001999016343635316, "loss": 0.3078, "step": 604 }, { "epoch": 0.3391255605381166, "grad_norm": 0.09123463098371748, "learning_rate": 0.00019989872047577464, "loss": 0.3164, "step": 605 }, { "epoch": 0.3396860986547085, "grad_norm": 0.092099612562847, "learning_rate": 0.00019989576407844893, "loss": 0.3006, "step": 606 }, { "epoch": 0.34024663677130046, "grad_norm": 0.0994434074200813, "learning_rate": 0.00019989276517281247, "loss": 0.3149, "step": 607 }, { "epoch": 0.34080717488789236, "grad_norm": 0.09630693591932951, "learning_rate": 0.00019988972376014142, "loss": 0.3106, "step": 608 }, { "epoch": 0.3413677130044843, "grad_norm": 0.09685835074291246, "learning_rate": 0.00019988663984172992, "loss": 0.3247, "step": 609 }, { "epoch": 0.3419282511210762, "grad_norm": 0.09391183577383291, "learning_rate": 0.00019988351341889034, "loss": 0.3035, "step": 610 }, { "epoch": 0.3424887892376682, "grad_norm": 0.09525770323709616, "learning_rate": 0.00019988034449295298, "loss": 0.3104, "step": 611 }, { "epoch": 0.3430493273542601, "grad_norm": 0.09130352692181182, "learning_rate": 0.00019987713306526638, "loss": 0.3021, "step": 612 }, { "epoch": 0.34360986547085204, "grad_norm": 0.09620468186426591, "learning_rate": 0.00019987387913719698, "loss": 0.3019, "step": 613 }, { "epoch": 0.34417040358744394, "grad_norm": 0.09840324784808555, "learning_rate": 0.00019987058271012952, "loss": 0.3218, "step": 614 }, { "epoch": 0.3447309417040359, "grad_norm": 0.09636745952863293, "learning_rate": 0.0001998672437854667, "loss": 0.3052, "step": 615 }, { "epoch": 0.3452914798206278, "grad_norm": 0.09479726421382038, "learning_rate": 0.00019986386236462924, "loss": 0.3198, "step": 616 }, { "epoch": 0.34585201793721976, "grad_norm": 0.10083947783070678, "learning_rate": 0.00019986043844905612, "loss": 0.308, "step": 617 }, { "epoch": 0.34641255605381166, "grad_norm": 0.09657101971626757, "learning_rate": 0.00019985697204020423, "loss": 0.3103, "step": 618 }, { "epoch": 0.34697309417040356, "grad_norm": 0.1075570262110066, "learning_rate": 0.00019985346313954868, "loss": 0.3134, "step": 619 }, { "epoch": 0.3475336322869955, "grad_norm": 0.09711004477313603, "learning_rate": 0.00019984991174858257, "loss": 0.325, "step": 620 }, { "epoch": 0.3480941704035874, "grad_norm": 0.09418690943467588, "learning_rate": 0.00019984631786881715, "loss": 0.3147, "step": 621 }, { "epoch": 0.3486547085201794, "grad_norm": 0.09724763178040019, "learning_rate": 0.00019984268150178167, "loss": 0.3235, "step": 622 }, { "epoch": 0.3492152466367713, "grad_norm": 0.09182375734289457, "learning_rate": 0.00019983900264902352, "loss": 0.3138, "step": 623 }, { "epoch": 0.34977578475336324, "grad_norm": 0.10118783544252928, "learning_rate": 0.00019983528131210812, "loss": 0.3165, "step": 624 }, { "epoch": 0.35033632286995514, "grad_norm": 0.10138373780823115, "learning_rate": 0.00019983151749261905, "loss": 0.3214, "step": 625 }, { "epoch": 0.3508968609865471, "grad_norm": 0.09104715454857507, "learning_rate": 0.00019982771119215784, "loss": 0.3144, "step": 626 }, { "epoch": 0.351457399103139, "grad_norm": 0.09908394515423055, "learning_rate": 0.00019982386241234424, "loss": 0.3109, "step": 627 }, { "epoch": 0.35201793721973096, "grad_norm": 0.09498301385503512, "learning_rate": 0.00019981997115481602, "loss": 0.3007, "step": 628 }, { "epoch": 0.35257847533632286, "grad_norm": 0.09740150558388727, "learning_rate": 0.00019981603742122894, "loss": 0.3149, "step": 629 }, { "epoch": 0.3531390134529148, "grad_norm": 0.09447092201896107, "learning_rate": 0.00019981206121325696, "loss": 0.3032, "step": 630 }, { "epoch": 0.3536995515695067, "grad_norm": 0.09734285460983631, "learning_rate": 0.00019980804253259205, "loss": 0.3132, "step": 631 }, { "epoch": 0.3542600896860987, "grad_norm": 0.09301208400220677, "learning_rate": 0.00019980398138094428, "loss": 0.3192, "step": 632 }, { "epoch": 0.3548206278026906, "grad_norm": 0.08989075009688506, "learning_rate": 0.00019979987776004178, "loss": 0.308, "step": 633 }, { "epoch": 0.35538116591928254, "grad_norm": 0.09501691186329138, "learning_rate": 0.0001997957316716307, "loss": 0.3105, "step": 634 }, { "epoch": 0.35594170403587444, "grad_norm": 0.08750421147011359, "learning_rate": 0.00019979154311747536, "loss": 0.3176, "step": 635 }, { "epoch": 0.35650224215246634, "grad_norm": 0.08913056153223253, "learning_rate": 0.0001997873120993581, "loss": 0.3177, "step": 636 }, { "epoch": 0.3570627802690583, "grad_norm": 0.09047371146376425, "learning_rate": 0.00019978303861907932, "loss": 0.3037, "step": 637 }, { "epoch": 0.3576233183856502, "grad_norm": 0.0980302332632284, "learning_rate": 0.0001997787226784575, "loss": 0.3222, "step": 638 }, { "epoch": 0.35818385650224216, "grad_norm": 0.09807009199677308, "learning_rate": 0.0001997743642793292, "loss": 0.3159, "step": 639 }, { "epoch": 0.35874439461883406, "grad_norm": 0.0936984161703018, "learning_rate": 0.00019976996342354898, "loss": 0.3131, "step": 640 }, { "epoch": 0.359304932735426, "grad_norm": 0.09710519116360979, "learning_rate": 0.0001997655201129896, "loss": 0.3102, "step": 641 }, { "epoch": 0.3598654708520179, "grad_norm": 0.09834199646182591, "learning_rate": 0.00019976103434954175, "loss": 0.2985, "step": 642 }, { "epoch": 0.3604260089686099, "grad_norm": 0.09318331458756994, "learning_rate": 0.00019975650613511428, "loss": 0.3105, "step": 643 }, { "epoch": 0.3609865470852018, "grad_norm": 0.09060134422957127, "learning_rate": 0.00019975193547163404, "loss": 0.3025, "step": 644 }, { "epoch": 0.36154708520179374, "grad_norm": 0.1001621229650619, "learning_rate": 0.00019974732236104596, "loss": 0.318, "step": 645 }, { "epoch": 0.36210762331838564, "grad_norm": 0.09533176887067027, "learning_rate": 0.00019974266680531307, "loss": 0.3188, "step": 646 }, { "epoch": 0.3626681614349776, "grad_norm": 0.09447592612994977, "learning_rate": 0.00019973796880641645, "loss": 0.3231, "step": 647 }, { "epoch": 0.3632286995515695, "grad_norm": 0.10038905685358805, "learning_rate": 0.00019973322836635518, "loss": 0.3057, "step": 648 }, { "epoch": 0.36378923766816146, "grad_norm": 0.09837039723338849, "learning_rate": 0.00019972844548714648, "loss": 0.3097, "step": 649 }, { "epoch": 0.36434977578475336, "grad_norm": 0.09746096847482318, "learning_rate": 0.00019972362017082554, "loss": 0.2969, "step": 650 }, { "epoch": 0.3649103139013453, "grad_norm": 0.09046357217513716, "learning_rate": 0.0001997187524194457, "loss": 0.3075, "step": 651 }, { "epoch": 0.3654708520179372, "grad_norm": 0.08981985275088883, "learning_rate": 0.0001997138422350783, "loss": 0.3137, "step": 652 }, { "epoch": 0.3660313901345291, "grad_norm": 0.0973371615853409, "learning_rate": 0.0001997088896198128, "loss": 0.3079, "step": 653 }, { "epoch": 0.3665919282511211, "grad_norm": 0.09641053284915638, "learning_rate": 0.0001997038945757566, "loss": 0.3233, "step": 654 }, { "epoch": 0.367152466367713, "grad_norm": 0.09083531796513775, "learning_rate": 0.0001996988571050353, "loss": 0.3042, "step": 655 }, { "epoch": 0.36771300448430494, "grad_norm": 0.0923886180677156, "learning_rate": 0.00019969377720979237, "loss": 0.3095, "step": 656 }, { "epoch": 0.36827354260089684, "grad_norm": 0.09268763132645462, "learning_rate": 0.0001996886548921896, "loss": 0.3157, "step": 657 }, { "epoch": 0.3688340807174888, "grad_norm": 0.096762877643564, "learning_rate": 0.00019968349015440652, "loss": 0.2992, "step": 658 }, { "epoch": 0.3693946188340807, "grad_norm": 0.09588852152964636, "learning_rate": 0.00019967828299864094, "loss": 0.3226, "step": 659 }, { "epoch": 0.36995515695067266, "grad_norm": 0.09855560751484342, "learning_rate": 0.00019967303342710864, "loss": 0.3172, "step": 660 }, { "epoch": 0.37051569506726456, "grad_norm": 0.09124675338819187, "learning_rate": 0.00019966774144204342, "loss": 0.3114, "step": 661 }, { "epoch": 0.3710762331838565, "grad_norm": 0.09497679213022318, "learning_rate": 0.0001996624070456972, "loss": 0.3119, "step": 662 }, { "epoch": 0.3716367713004484, "grad_norm": 0.09383513529910878, "learning_rate": 0.00019965703024033988, "loss": 0.315, "step": 663 }, { "epoch": 0.3721973094170404, "grad_norm": 0.09187405365219889, "learning_rate": 0.00019965161102825945, "loss": 0.2989, "step": 664 }, { "epoch": 0.3727578475336323, "grad_norm": 0.09846630219244981, "learning_rate": 0.00019964614941176195, "loss": 0.3162, "step": 665 }, { "epoch": 0.37331838565022424, "grad_norm": 0.10258783996322045, "learning_rate": 0.00019964064539317137, "loss": 0.3116, "step": 666 }, { "epoch": 0.37387892376681614, "grad_norm": 0.09694456938304831, "learning_rate": 0.00019963509897482986, "loss": 0.3019, "step": 667 }, { "epoch": 0.3744394618834081, "grad_norm": 0.08723571528519997, "learning_rate": 0.0001996295101590976, "loss": 0.2958, "step": 668 }, { "epoch": 0.375, "grad_norm": 0.0902327190955776, "learning_rate": 0.00019962387894835275, "loss": 0.3125, "step": 669 }, { "epoch": 0.3755605381165919, "grad_norm": 0.10497903692593351, "learning_rate": 0.00019961820534499154, "loss": 0.3145, "step": 670 }, { "epoch": 0.37612107623318386, "grad_norm": 0.08794974221680721, "learning_rate": 0.00019961248935142825, "loss": 0.304, "step": 671 }, { "epoch": 0.37668161434977576, "grad_norm": 0.08592111650477056, "learning_rate": 0.00019960673097009518, "loss": 0.3295, "step": 672 }, { "epoch": 0.3772421524663677, "grad_norm": 0.09416973778233857, "learning_rate": 0.00019960093020344265, "loss": 0.3078, "step": 673 }, { "epoch": 0.3778026905829596, "grad_norm": 0.09882554225698, "learning_rate": 0.0001995950870539391, "loss": 0.3208, "step": 674 }, { "epoch": 0.3783632286995516, "grad_norm": 0.09056733013559291, "learning_rate": 0.00019958920152407088, "loss": 0.3057, "step": 675 }, { "epoch": 0.3789237668161435, "grad_norm": 0.093680820250419, "learning_rate": 0.00019958327361634248, "loss": 0.305, "step": 676 }, { "epoch": 0.37948430493273544, "grad_norm": 0.09117543360175803, "learning_rate": 0.00019957730333327637, "loss": 0.3079, "step": 677 }, { "epoch": 0.38004484304932734, "grad_norm": 0.09058289965652437, "learning_rate": 0.00019957129067741308, "loss": 0.3224, "step": 678 }, { "epoch": 0.3806053811659193, "grad_norm": 0.09434118281344171, "learning_rate": 0.00019956523565131115, "loss": 0.3071, "step": 679 }, { "epoch": 0.3811659192825112, "grad_norm": 0.09689254622157607, "learning_rate": 0.00019955913825754713, "loss": 0.3175, "step": 680 }, { "epoch": 0.38172645739910316, "grad_norm": 0.09033082100701963, "learning_rate": 0.00019955299849871568, "loss": 0.3153, "step": 681 }, { "epoch": 0.38228699551569506, "grad_norm": 0.09300510466497924, "learning_rate": 0.00019954681637742933, "loss": 0.3182, "step": 682 }, { "epoch": 0.382847533632287, "grad_norm": 0.08982220320560935, "learning_rate": 0.00019954059189631883, "loss": 0.2985, "step": 683 }, { "epoch": 0.3834080717488789, "grad_norm": 0.09212901813961062, "learning_rate": 0.00019953432505803286, "loss": 0.3079, "step": 684 }, { "epoch": 0.3839686098654709, "grad_norm": 0.09640137098440237, "learning_rate": 0.00019952801586523808, "loss": 0.3063, "step": 685 }, { "epoch": 0.3845291479820628, "grad_norm": 0.09408208141715627, "learning_rate": 0.00019952166432061924, "loss": 0.3168, "step": 686 }, { "epoch": 0.3850896860986547, "grad_norm": 0.09168586104891628, "learning_rate": 0.0001995152704268791, "loss": 0.3168, "step": 687 }, { "epoch": 0.38565022421524664, "grad_norm": 0.09216058156012896, "learning_rate": 0.0001995088341867384, "loss": 0.3058, "step": 688 }, { "epoch": 0.38621076233183854, "grad_norm": 0.09314968709238407, "learning_rate": 0.000199502355602936, "loss": 0.3074, "step": 689 }, { "epoch": 0.3867713004484305, "grad_norm": 0.08840063438652919, "learning_rate": 0.0001994958346782286, "loss": 0.3001, "step": 690 }, { "epoch": 0.3873318385650224, "grad_norm": 0.09621405562039968, "learning_rate": 0.00019948927141539113, "loss": 0.3212, "step": 691 }, { "epoch": 0.38789237668161436, "grad_norm": 0.08842288128281502, "learning_rate": 0.00019948266581721642, "loss": 0.2957, "step": 692 }, { "epoch": 0.38845291479820626, "grad_norm": 0.09329535326280906, "learning_rate": 0.00019947601788651527, "loss": 0.3059, "step": 693 }, { "epoch": 0.3890134529147982, "grad_norm": 0.09776146544452814, "learning_rate": 0.00019946932762611658, "loss": 0.3207, "step": 694 }, { "epoch": 0.3895739910313901, "grad_norm": 0.09346752724479822, "learning_rate": 0.0001994625950388673, "loss": 0.2949, "step": 695 }, { "epoch": 0.3901345291479821, "grad_norm": 0.09772096851212224, "learning_rate": 0.0001994558201276322, "loss": 0.3142, "step": 696 }, { "epoch": 0.390695067264574, "grad_norm": 0.0938987828017779, "learning_rate": 0.00019944900289529425, "loss": 0.3056, "step": 697 }, { "epoch": 0.39125560538116594, "grad_norm": 0.0965497026787379, "learning_rate": 0.00019944214334475442, "loss": 0.3088, "step": 698 }, { "epoch": 0.39181614349775784, "grad_norm": 0.09602660877350971, "learning_rate": 0.00019943524147893153, "loss": 0.316, "step": 699 }, { "epoch": 0.3923766816143498, "grad_norm": 0.09339139467599078, "learning_rate": 0.00019942829730076257, "loss": 0.3134, "step": 700 }, { "epoch": 0.3929372197309417, "grad_norm": 0.09273886736295357, "learning_rate": 0.00019942131081320246, "loss": 0.312, "step": 701 }, { "epoch": 0.39349775784753366, "grad_norm": 0.09490328698430993, "learning_rate": 0.00019941428201922413, "loss": 0.3095, "step": 702 }, { "epoch": 0.39405829596412556, "grad_norm": 0.09384106880964664, "learning_rate": 0.00019940721092181853, "loss": 0.3223, "step": 703 }, { "epoch": 0.39461883408071746, "grad_norm": 0.09133086939387193, "learning_rate": 0.0001994000975239946, "loss": 0.2988, "step": 704 }, { "epoch": 0.3951793721973094, "grad_norm": 0.09553642977701621, "learning_rate": 0.00019939294182877925, "loss": 0.3193, "step": 705 }, { "epoch": 0.3957399103139013, "grad_norm": 0.08857932146083475, "learning_rate": 0.0001993857438392175, "loss": 0.305, "step": 706 }, { "epoch": 0.3963004484304933, "grad_norm": 0.08771824644520687, "learning_rate": 0.00019937850355837217, "loss": 0.2984, "step": 707 }, { "epoch": 0.3968609865470852, "grad_norm": 0.0903763377102578, "learning_rate": 0.00019937122098932428, "loss": 0.298, "step": 708 }, { "epoch": 0.39742152466367714, "grad_norm": 0.09034740590815869, "learning_rate": 0.0001993638961351727, "loss": 0.3112, "step": 709 }, { "epoch": 0.39798206278026904, "grad_norm": 0.09708930191408309, "learning_rate": 0.00019935652899903442, "loss": 0.3106, "step": 710 }, { "epoch": 0.398542600896861, "grad_norm": 0.09763701767129569, "learning_rate": 0.00019934911958404428, "loss": 0.3092, "step": 711 }, { "epoch": 0.3991031390134529, "grad_norm": 0.09257194185153657, "learning_rate": 0.00019934166789335525, "loss": 0.3091, "step": 712 }, { "epoch": 0.39966367713004486, "grad_norm": 0.09116327725227731, "learning_rate": 0.00019933417393013815, "loss": 0.3076, "step": 713 }, { "epoch": 0.40022421524663676, "grad_norm": 0.08918440964551518, "learning_rate": 0.00019932663769758194, "loss": 0.321, "step": 714 }, { "epoch": 0.4007847533632287, "grad_norm": 0.09328211535654479, "learning_rate": 0.00019931905919889342, "loss": 0.3016, "step": 715 }, { "epoch": 0.4013452914798206, "grad_norm": 0.08758524473066116, "learning_rate": 0.00019931143843729748, "loss": 0.3019, "step": 716 }, { "epoch": 0.4019058295964126, "grad_norm": 0.08923669644041521, "learning_rate": 0.00019930377541603695, "loss": 0.3133, "step": 717 }, { "epoch": 0.4024663677130045, "grad_norm": 0.08958452005045, "learning_rate": 0.0001992960701383727, "loss": 0.304, "step": 718 }, { "epoch": 0.40302690582959644, "grad_norm": 0.08916290118406439, "learning_rate": 0.0001992883226075834, "loss": 0.299, "step": 719 }, { "epoch": 0.40358744394618834, "grad_norm": 0.09392346415005111, "learning_rate": 0.00019928053282696596, "loss": 0.3075, "step": 720 }, { "epoch": 0.40414798206278024, "grad_norm": 0.09133173321569467, "learning_rate": 0.00019927270079983506, "loss": 0.3074, "step": 721 }, { "epoch": 0.4047085201793722, "grad_norm": 0.08909159921128257, "learning_rate": 0.00019926482652952347, "loss": 0.3136, "step": 722 }, { "epoch": 0.4052690582959641, "grad_norm": 0.09527893792172565, "learning_rate": 0.0001992569100193819, "loss": 0.3156, "step": 723 }, { "epoch": 0.40582959641255606, "grad_norm": 0.08969969555334135, "learning_rate": 0.00019924895127277907, "loss": 0.3086, "step": 724 }, { "epoch": 0.40639013452914796, "grad_norm": 0.09241404516413201, "learning_rate": 0.00019924095029310158, "loss": 0.301, "step": 725 }, { "epoch": 0.4069506726457399, "grad_norm": 0.09069006933938728, "learning_rate": 0.00019923290708375412, "loss": 0.3148, "step": 726 }, { "epoch": 0.4075112107623318, "grad_norm": 0.09890533901369626, "learning_rate": 0.0001992248216481592, "loss": 0.3048, "step": 727 }, { "epoch": 0.4080717488789238, "grad_norm": 0.09313363812448927, "learning_rate": 0.00019921669398975745, "loss": 0.3013, "step": 728 }, { "epoch": 0.4086322869955157, "grad_norm": 0.08711918573325704, "learning_rate": 0.0001992085241120074, "loss": 0.2915, "step": 729 }, { "epoch": 0.40919282511210764, "grad_norm": 0.09498553317795624, "learning_rate": 0.00019920031201838557, "loss": 0.3079, "step": 730 }, { "epoch": 0.40975336322869954, "grad_norm": 0.08829302260299005, "learning_rate": 0.00019919205771238638, "loss": 0.308, "step": 731 }, { "epoch": 0.4103139013452915, "grad_norm": 0.09193666567406178, "learning_rate": 0.0001991837611975223, "loss": 0.3077, "step": 732 }, { "epoch": 0.4108744394618834, "grad_norm": 0.0877409102296216, "learning_rate": 0.0001991754224773237, "loss": 0.3021, "step": 733 }, { "epoch": 0.41143497757847536, "grad_norm": 0.09163527694355844, "learning_rate": 0.0001991670415553389, "loss": 0.3157, "step": 734 }, { "epoch": 0.41199551569506726, "grad_norm": 0.08851308902476697, "learning_rate": 0.00019915861843513425, "loss": 0.2987, "step": 735 }, { "epoch": 0.4125560538116592, "grad_norm": 0.08865061811742718, "learning_rate": 0.000199150153120294, "loss": 0.318, "step": 736 }, { "epoch": 0.4131165919282511, "grad_norm": 0.0916424076710549, "learning_rate": 0.00019914164561442036, "loss": 0.3042, "step": 737 }, { "epoch": 0.413677130044843, "grad_norm": 0.09075462969970004, "learning_rate": 0.00019913309592113347, "loss": 0.3093, "step": 738 }, { "epoch": 0.414237668161435, "grad_norm": 0.09227473483449715, "learning_rate": 0.0001991245040440715, "loss": 0.3072, "step": 739 }, { "epoch": 0.4147982062780269, "grad_norm": 0.09014553712868298, "learning_rate": 0.0001991158699868905, "loss": 0.3028, "step": 740 }, { "epoch": 0.41535874439461884, "grad_norm": 0.0923974795060702, "learning_rate": 0.00019910719375326453, "loss": 0.2984, "step": 741 }, { "epoch": 0.41591928251121074, "grad_norm": 0.08943840942515693, "learning_rate": 0.00019909847534688553, "loss": 0.3004, "step": 742 }, { "epoch": 0.4164798206278027, "grad_norm": 0.08567226895006697, "learning_rate": 0.00019908971477146338, "loss": 0.2944, "step": 743 }, { "epoch": 0.4170403587443946, "grad_norm": 0.08701439358407168, "learning_rate": 0.00019908091203072598, "loss": 0.2941, "step": 744 }, { "epoch": 0.41760089686098656, "grad_norm": 0.08850206841755387, "learning_rate": 0.00019907206712841915, "loss": 0.3077, "step": 745 }, { "epoch": 0.41816143497757846, "grad_norm": 0.08721106462150639, "learning_rate": 0.00019906318006830657, "loss": 0.3151, "step": 746 }, { "epoch": 0.4187219730941704, "grad_norm": 0.09393278041736332, "learning_rate": 0.00019905425085416995, "loss": 0.3068, "step": 747 }, { "epoch": 0.4192825112107623, "grad_norm": 0.09324005796413601, "learning_rate": 0.00019904527948980894, "loss": 0.3047, "step": 748 }, { "epoch": 0.4198430493273543, "grad_norm": 0.0888935722749391, "learning_rate": 0.00019903626597904105, "loss": 0.2967, "step": 749 }, { "epoch": 0.4204035874439462, "grad_norm": 0.08673797173238905, "learning_rate": 0.00019902721032570176, "loss": 0.2977, "step": 750 }, { "epoch": 0.42096412556053814, "grad_norm": 0.08806981508498309, "learning_rate": 0.00019901811253364456, "loss": 0.318, "step": 751 }, { "epoch": 0.42152466367713004, "grad_norm": 0.08881109920967563, "learning_rate": 0.00019900897260674073, "loss": 0.2999, "step": 752 }, { "epoch": 0.422085201793722, "grad_norm": 0.09538416584410722, "learning_rate": 0.00019899979054887964, "loss": 0.3203, "step": 753 }, { "epoch": 0.4226457399103139, "grad_norm": 0.08952151997022839, "learning_rate": 0.0001989905663639684, "loss": 0.3004, "step": 754 }, { "epoch": 0.4232062780269058, "grad_norm": 0.09001015247373621, "learning_rate": 0.00019898130005593218, "loss": 0.2847, "step": 755 }, { "epoch": 0.42376681614349776, "grad_norm": 0.09281590608787986, "learning_rate": 0.00019897199162871408, "loss": 0.307, "step": 756 }, { "epoch": 0.42432735426008966, "grad_norm": 0.08672644067463742, "learning_rate": 0.00019896264108627506, "loss": 0.3116, "step": 757 }, { "epoch": 0.4248878923766816, "grad_norm": 0.08503075443969747, "learning_rate": 0.000198953248432594, "loss": 0.303, "step": 758 }, { "epoch": 0.4254484304932735, "grad_norm": 0.09051207207933547, "learning_rate": 0.00019894381367166773, "loss": 0.3055, "step": 759 }, { "epoch": 0.4260089686098655, "grad_norm": 0.08723584743342241, "learning_rate": 0.00019893433680751103, "loss": 0.3102, "step": 760 }, { "epoch": 0.4265695067264574, "grad_norm": 0.08507901240916076, "learning_rate": 0.00019892481784415653, "loss": 0.3009, "step": 761 }, { "epoch": 0.42713004484304934, "grad_norm": 0.08923668828516959, "learning_rate": 0.0001989152567856548, "loss": 0.3039, "step": 762 }, { "epoch": 0.42769058295964124, "grad_norm": 0.08780375279840583, "learning_rate": 0.00019890565363607436, "loss": 0.3181, "step": 763 }, { "epoch": 0.4282511210762332, "grad_norm": 0.08725479512572638, "learning_rate": 0.00019889600839950155, "loss": 0.296, "step": 764 }, { "epoch": 0.4288116591928251, "grad_norm": 0.08288503384141521, "learning_rate": 0.00019888632108004074, "loss": 0.2958, "step": 765 }, { "epoch": 0.42937219730941706, "grad_norm": 0.08968254486397406, "learning_rate": 0.0001988765916818141, "loss": 0.3125, "step": 766 }, { "epoch": 0.42993273542600896, "grad_norm": 0.08205694133360567, "learning_rate": 0.00019886682020896172, "loss": 0.292, "step": 767 }, { "epoch": 0.4304932735426009, "grad_norm": 0.0897100401146557, "learning_rate": 0.0001988570066656417, "loss": 0.2924, "step": 768 }, { "epoch": 0.4310538116591928, "grad_norm": 0.08738986931277319, "learning_rate": 0.00019884715105602992, "loss": 0.2935, "step": 769 }, { "epoch": 0.4316143497757848, "grad_norm": 0.08997555138739971, "learning_rate": 0.00019883725338432017, "loss": 0.321, "step": 770 }, { "epoch": 0.4321748878923767, "grad_norm": 0.09148998862478722, "learning_rate": 0.00019882731365472424, "loss": 0.3118, "step": 771 }, { "epoch": 0.4327354260089686, "grad_norm": 0.08668291843813691, "learning_rate": 0.00019881733187147171, "loss": 0.321, "step": 772 }, { "epoch": 0.43329596412556054, "grad_norm": 0.08776191807915673, "learning_rate": 0.00019880730803881013, "loss": 0.2999, "step": 773 }, { "epoch": 0.43385650224215244, "grad_norm": 0.08987310100223357, "learning_rate": 0.00019879724216100486, "loss": 0.3162, "step": 774 }, { "epoch": 0.4344170403587444, "grad_norm": 0.08760893018411792, "learning_rate": 0.00019878713424233924, "loss": 0.313, "step": 775 }, { "epoch": 0.4349775784753363, "grad_norm": 0.09028271206037046, "learning_rate": 0.00019877698428711442, "loss": 0.3076, "step": 776 }, { "epoch": 0.43553811659192826, "grad_norm": 0.08422567534720686, "learning_rate": 0.00019876679229964949, "loss": 0.2988, "step": 777 }, { "epoch": 0.43609865470852016, "grad_norm": 0.08974811464181615, "learning_rate": 0.00019875655828428145, "loss": 0.311, "step": 778 }, { "epoch": 0.4366591928251121, "grad_norm": 0.08446337267991294, "learning_rate": 0.00019874628224536513, "loss": 0.2995, "step": 779 }, { "epoch": 0.437219730941704, "grad_norm": 0.09443337966417452, "learning_rate": 0.0001987359641872732, "loss": 0.3195, "step": 780 }, { "epoch": 0.437780269058296, "grad_norm": 0.08831834843301448, "learning_rate": 0.00019872560411439633, "loss": 0.3051, "step": 781 }, { "epoch": 0.4383408071748879, "grad_norm": 0.08717347016508245, "learning_rate": 0.000198715202031143, "loss": 0.3095, "step": 782 }, { "epoch": 0.43890134529147984, "grad_norm": 0.08143361331223166, "learning_rate": 0.00019870475794193956, "loss": 0.3034, "step": 783 }, { "epoch": 0.43946188340807174, "grad_norm": 0.0872812460626501, "learning_rate": 0.00019869427185123027, "loss": 0.3037, "step": 784 }, { "epoch": 0.4400224215246637, "grad_norm": 0.08663859380296493, "learning_rate": 0.0001986837437634772, "loss": 0.3115, "step": 785 }, { "epoch": 0.4405829596412556, "grad_norm": 0.09141249344984331, "learning_rate": 0.00019867317368316037, "loss": 0.3036, "step": 786 }, { "epoch": 0.44114349775784756, "grad_norm": 0.08653436715401916, "learning_rate": 0.0001986625616147776, "loss": 0.2992, "step": 787 }, { "epoch": 0.44170403587443946, "grad_norm": 0.0875322052920452, "learning_rate": 0.00019865190756284467, "loss": 0.3012, "step": 788 }, { "epoch": 0.4422645739910314, "grad_norm": 0.09067851337305519, "learning_rate": 0.0001986412115318951, "loss": 0.2952, "step": 789 }, { "epoch": 0.4428251121076233, "grad_norm": 0.0882649566344737, "learning_rate": 0.00019863047352648033, "loss": 0.3093, "step": 790 }, { "epoch": 0.4433856502242152, "grad_norm": 0.08513561123668154, "learning_rate": 0.0001986196935511697, "loss": 0.3136, "step": 791 }, { "epoch": 0.4439461883408072, "grad_norm": 0.09622214580200847, "learning_rate": 0.00019860887161055038, "loss": 0.3108, "step": 792 }, { "epoch": 0.4445067264573991, "grad_norm": 0.08687900491585457, "learning_rate": 0.0001985980077092274, "loss": 0.2986, "step": 793 }, { "epoch": 0.44506726457399104, "grad_norm": 0.08869706121836343, "learning_rate": 0.0001985871018518236, "loss": 0.31, "step": 794 }, { "epoch": 0.44562780269058294, "grad_norm": 0.08728306694736086, "learning_rate": 0.00019857615404297974, "loss": 0.3074, "step": 795 }, { "epoch": 0.4461883408071749, "grad_norm": 0.08713815025560269, "learning_rate": 0.0001985651642873544, "loss": 0.3044, "step": 796 }, { "epoch": 0.4467488789237668, "grad_norm": 0.08358102481973519, "learning_rate": 0.00019855413258962402, "loss": 0.3005, "step": 797 }, { "epoch": 0.44730941704035876, "grad_norm": 0.08558742897488306, "learning_rate": 0.00019854305895448287, "loss": 0.3125, "step": 798 }, { "epoch": 0.44786995515695066, "grad_norm": 0.09278562896889817, "learning_rate": 0.00019853194338664308, "loss": 0.3007, "step": 799 }, { "epoch": 0.4484304932735426, "grad_norm": 0.08837103157047665, "learning_rate": 0.00019852078589083466, "loss": 0.3196, "step": 800 }, { "epoch": 0.4489910313901345, "grad_norm": 0.0877899829779131, "learning_rate": 0.00019850958647180534, "loss": 0.2914, "step": 801 }, { "epoch": 0.4495515695067265, "grad_norm": 0.08740164316166062, "learning_rate": 0.00019849834513432083, "loss": 0.2977, "step": 802 }, { "epoch": 0.4501121076233184, "grad_norm": 0.08735279292449416, "learning_rate": 0.00019848706188316465, "loss": 0.3261, "step": 803 }, { "epoch": 0.45067264573991034, "grad_norm": 0.08828556023556346, "learning_rate": 0.00019847573672313802, "loss": 0.3054, "step": 804 }, { "epoch": 0.45123318385650224, "grad_norm": 0.08800560863232093, "learning_rate": 0.0001984643696590602, "loss": 0.308, "step": 805 }, { "epoch": 0.4517937219730942, "grad_norm": 0.08208728640708014, "learning_rate": 0.00019845296069576809, "loss": 0.3014, "step": 806 }, { "epoch": 0.4523542600896861, "grad_norm": 0.08271451540214034, "learning_rate": 0.00019844150983811657, "loss": 0.3001, "step": 807 }, { "epoch": 0.452914798206278, "grad_norm": 0.09035018600164363, "learning_rate": 0.0001984300170909783, "loss": 0.3024, "step": 808 }, { "epoch": 0.45347533632286996, "grad_norm": 0.08771905523544332, "learning_rate": 0.0001984184824592437, "loss": 0.3183, "step": 809 }, { "epoch": 0.45403587443946186, "grad_norm": 0.08752544096797525, "learning_rate": 0.00019840690594782109, "loss": 0.2999, "step": 810 }, { "epoch": 0.4545964125560538, "grad_norm": 0.08578415848259245, "learning_rate": 0.00019839528756163656, "loss": 0.3091, "step": 811 }, { "epoch": 0.4551569506726457, "grad_norm": 0.08779365509665701, "learning_rate": 0.00019838362730563406, "loss": 0.3019, "step": 812 }, { "epoch": 0.4557174887892377, "grad_norm": 0.08759195135247771, "learning_rate": 0.00019837192518477536, "loss": 0.309, "step": 813 }, { "epoch": 0.4562780269058296, "grad_norm": 0.0853426049035136, "learning_rate": 0.00019836018120404002, "loss": 0.3054, "step": 814 }, { "epoch": 0.45683856502242154, "grad_norm": 0.08576349200936671, "learning_rate": 0.00019834839536842536, "loss": 0.3098, "step": 815 }, { "epoch": 0.45739910313901344, "grad_norm": 0.08619979660754418, "learning_rate": 0.00019833656768294662, "loss": 0.2964, "step": 816 }, { "epoch": 0.4579596412556054, "grad_norm": 0.09528083875601881, "learning_rate": 0.0001983246981526368, "loss": 0.2959, "step": 817 }, { "epoch": 0.4585201793721973, "grad_norm": 0.08344925075952278, "learning_rate": 0.0001983127867825467, "loss": 0.3069, "step": 818 }, { "epoch": 0.45908071748878926, "grad_norm": 0.08460194518898766, "learning_rate": 0.00019830083357774486, "loss": 0.3042, "step": 819 }, { "epoch": 0.45964125560538116, "grad_norm": 0.08721789967399277, "learning_rate": 0.00019828883854331776, "loss": 0.3177, "step": 820 }, { "epoch": 0.4602017937219731, "grad_norm": 0.08722294812002279, "learning_rate": 0.0001982768016843696, "loss": 0.2854, "step": 821 }, { "epoch": 0.460762331838565, "grad_norm": 0.08689503300840559, "learning_rate": 0.00019826472300602237, "loss": 0.2978, "step": 822 }, { "epoch": 0.461322869955157, "grad_norm": 0.08779777368725004, "learning_rate": 0.00019825260251341587, "loss": 0.2997, "step": 823 }, { "epoch": 0.4618834080717489, "grad_norm": 0.08693707059653778, "learning_rate": 0.0001982404402117077, "loss": 0.2945, "step": 824 }, { "epoch": 0.4624439461883408, "grad_norm": 0.08650668902086212, "learning_rate": 0.0001982282361060732, "loss": 0.2937, "step": 825 }, { "epoch": 0.46300448430493274, "grad_norm": 0.09077683624522533, "learning_rate": 0.0001982159902017056, "loss": 0.3028, "step": 826 }, { "epoch": 0.46356502242152464, "grad_norm": 0.08647148809856209, "learning_rate": 0.00019820370250381585, "loss": 0.299, "step": 827 }, { "epoch": 0.4641255605381166, "grad_norm": 0.08243284869484088, "learning_rate": 0.00019819137301763267, "loss": 0.3057, "step": 828 }, { "epoch": 0.4646860986547085, "grad_norm": 0.08572317691098481, "learning_rate": 0.00019817900174840257, "loss": 0.3066, "step": 829 }, { "epoch": 0.46524663677130046, "grad_norm": 0.08511427435690537, "learning_rate": 0.0001981665887013899, "loss": 0.3064, "step": 830 }, { "epoch": 0.46580717488789236, "grad_norm": 0.08716461609535324, "learning_rate": 0.00019815413388187672, "loss": 0.3134, "step": 831 }, { "epoch": 0.4663677130044843, "grad_norm": 0.08720503249671421, "learning_rate": 0.00019814163729516292, "loss": 0.2941, "step": 832 }, { "epoch": 0.4669282511210762, "grad_norm": 0.08863179581270388, "learning_rate": 0.00019812909894656607, "loss": 0.3091, "step": 833 }, { "epoch": 0.4674887892376682, "grad_norm": 0.08659608487256064, "learning_rate": 0.00019811651884142162, "loss": 0.2929, "step": 834 }, { "epoch": 0.4680493273542601, "grad_norm": 0.09212980135470979, "learning_rate": 0.0001981038969850827, "loss": 0.3138, "step": 835 }, { "epoch": 0.46860986547085204, "grad_norm": 0.08663807470643362, "learning_rate": 0.0001980912333829203, "loss": 0.3109, "step": 836 }, { "epoch": 0.46917040358744394, "grad_norm": 0.08056886951111863, "learning_rate": 0.00019807852804032305, "loss": 0.3056, "step": 837 }, { "epoch": 0.4697309417040359, "grad_norm": 0.08447064309312803, "learning_rate": 0.0001980657809626975, "loss": 0.305, "step": 838 }, { "epoch": 0.4702914798206278, "grad_norm": 0.08525248902671248, "learning_rate": 0.00019805299215546778, "loss": 0.3085, "step": 839 }, { "epoch": 0.47085201793721976, "grad_norm": 0.08299100105530506, "learning_rate": 0.0001980401616240759, "loss": 0.3034, "step": 840 }, { "epoch": 0.47141255605381166, "grad_norm": 0.08789070494074572, "learning_rate": 0.00019802728937398165, "loss": 0.3021, "step": 841 }, { "epoch": 0.47197309417040356, "grad_norm": 0.08685735259602445, "learning_rate": 0.00019801437541066243, "loss": 0.3009, "step": 842 }, { "epoch": 0.4725336322869955, "grad_norm": 0.0879939615906324, "learning_rate": 0.00019800141973961357, "loss": 0.3014, "step": 843 }, { "epoch": 0.4730941704035874, "grad_norm": 0.08780966857867269, "learning_rate": 0.00019798842236634797, "loss": 0.2952, "step": 844 }, { "epoch": 0.4736547085201794, "grad_norm": 0.08871799147041039, "learning_rate": 0.0001979753832963964, "loss": 0.2958, "step": 845 }, { "epoch": 0.4742152466367713, "grad_norm": 0.08597502962700726, "learning_rate": 0.00019796230253530728, "loss": 0.3081, "step": 846 }, { "epoch": 0.47477578475336324, "grad_norm": 0.08323592889114863, "learning_rate": 0.00019794918008864687, "loss": 0.304, "step": 847 }, { "epoch": 0.47533632286995514, "grad_norm": 0.08512574100191798, "learning_rate": 0.00019793601596199912, "loss": 0.3064, "step": 848 }, { "epoch": 0.4758968609865471, "grad_norm": 0.08203530412255575, "learning_rate": 0.00019792281016096572, "loss": 0.3063, "step": 849 }, { "epoch": 0.476457399103139, "grad_norm": 0.08334682011724658, "learning_rate": 0.0001979095626911661, "loss": 0.2988, "step": 850 }, { "epoch": 0.47701793721973096, "grad_norm": 0.08518984832948939, "learning_rate": 0.00019789627355823735, "loss": 0.2968, "step": 851 }, { "epoch": 0.47757847533632286, "grad_norm": 0.08791084921603258, "learning_rate": 0.00019788294276783442, "loss": 0.2973, "step": 852 }, { "epoch": 0.4781390134529148, "grad_norm": 0.08269680827258992, "learning_rate": 0.00019786957032562986, "loss": 0.2892, "step": 853 }, { "epoch": 0.4786995515695067, "grad_norm": 0.08784029867945722, "learning_rate": 0.00019785615623731407, "loss": 0.311, "step": 854 }, { "epoch": 0.4792600896860987, "grad_norm": 0.08797940188677301, "learning_rate": 0.00019784270050859503, "loss": 0.2975, "step": 855 }, { "epoch": 0.4798206278026906, "grad_norm": 0.08287170236731753, "learning_rate": 0.00019782920314519856, "loss": 0.307, "step": 856 }, { "epoch": 0.48038116591928254, "grad_norm": 0.08748965260811624, "learning_rate": 0.00019781566415286812, "loss": 0.2999, "step": 857 }, { "epoch": 0.48094170403587444, "grad_norm": 0.08510434026271985, "learning_rate": 0.00019780208353736495, "loss": 0.299, "step": 858 }, { "epoch": 0.48150224215246634, "grad_norm": 0.08667037835816382, "learning_rate": 0.00019778846130446792, "loss": 0.2967, "step": 859 }, { "epoch": 0.4820627802690583, "grad_norm": 0.08547251398480658, "learning_rate": 0.00019777479745997366, "loss": 0.3007, "step": 860 }, { "epoch": 0.4826233183856502, "grad_norm": 0.08527076483126479, "learning_rate": 0.0001977610920096965, "loss": 0.3066, "step": 861 }, { "epoch": 0.48318385650224216, "grad_norm": 0.0840641801252334, "learning_rate": 0.0001977473449594685, "loss": 0.303, "step": 862 }, { "epoch": 0.48374439461883406, "grad_norm": 0.08931134590055506, "learning_rate": 0.00019773355631513942, "loss": 0.3126, "step": 863 }, { "epoch": 0.484304932735426, "grad_norm": 0.08820459512945028, "learning_rate": 0.00019771972608257659, "loss": 0.3026, "step": 864 }, { "epoch": 0.4848654708520179, "grad_norm": 0.08414892835297764, "learning_rate": 0.00019770585426766527, "loss": 0.303, "step": 865 }, { "epoch": 0.4854260089686099, "grad_norm": 0.08554246377401148, "learning_rate": 0.00019769194087630818, "loss": 0.2914, "step": 866 }, { "epoch": 0.4859865470852018, "grad_norm": 0.08733454850313473, "learning_rate": 0.0001976779859144259, "loss": 0.3, "step": 867 }, { "epoch": 0.48654708520179374, "grad_norm": 0.08234421007740982, "learning_rate": 0.00019766398938795662, "loss": 0.2991, "step": 868 }, { "epoch": 0.48710762331838564, "grad_norm": 0.08306767371159449, "learning_rate": 0.00019764995130285625, "loss": 0.3053, "step": 869 }, { "epoch": 0.4876681614349776, "grad_norm": 0.0870799104666167, "learning_rate": 0.00019763587166509835, "loss": 0.3016, "step": 870 }, { "epoch": 0.4882286995515695, "grad_norm": 0.08881348677675693, "learning_rate": 0.0001976217504806742, "loss": 0.2944, "step": 871 }, { "epoch": 0.48878923766816146, "grad_norm": 0.0881222931146758, "learning_rate": 0.00019760758775559274, "loss": 0.313, "step": 872 }, { "epoch": 0.48934977578475336, "grad_norm": 0.07774576318742801, "learning_rate": 0.00019759338349588054, "loss": 0.2827, "step": 873 }, { "epoch": 0.4899103139013453, "grad_norm": 0.08820544631924236, "learning_rate": 0.00019757913770758196, "loss": 0.2971, "step": 874 }, { "epoch": 0.4904708520179372, "grad_norm": 0.0832526339338868, "learning_rate": 0.0001975648503967589, "loss": 0.2941, "step": 875 }, { "epoch": 0.4910313901345291, "grad_norm": 0.08545950056971909, "learning_rate": 0.00019755052156949105, "loss": 0.311, "step": 876 }, { "epoch": 0.4915919282511211, "grad_norm": 0.08417839227555864, "learning_rate": 0.00019753615123187568, "loss": 0.3109, "step": 877 }, { "epoch": 0.492152466367713, "grad_norm": 0.08054406355358931, "learning_rate": 0.00019752173939002776, "loss": 0.3008, "step": 878 }, { "epoch": 0.49271300448430494, "grad_norm": 0.08678396311747016, "learning_rate": 0.0001975072860500799, "loss": 0.2913, "step": 879 }, { "epoch": 0.49327354260089684, "grad_norm": 0.08665877370645897, "learning_rate": 0.00019749279121818235, "loss": 0.2974, "step": 880 }, { "epoch": 0.4938340807174888, "grad_norm": 0.08501744931535897, "learning_rate": 0.00019747825490050314, "loss": 0.3122, "step": 881 }, { "epoch": 0.4943946188340807, "grad_norm": 0.0849881245549837, "learning_rate": 0.00019746367710322778, "loss": 0.3032, "step": 882 }, { "epoch": 0.49495515695067266, "grad_norm": 0.0843430176577945, "learning_rate": 0.00019744905783255953, "loss": 0.2987, "step": 883 }, { "epoch": 0.49551569506726456, "grad_norm": 0.0861446110677906, "learning_rate": 0.0001974343970947193, "loss": 0.2856, "step": 884 }, { "epoch": 0.4960762331838565, "grad_norm": 0.09074356324891023, "learning_rate": 0.0001974196948959456, "loss": 0.3141, "step": 885 }, { "epoch": 0.4966367713004484, "grad_norm": 0.08187514785841825, "learning_rate": 0.0001974049512424946, "loss": 0.3033, "step": 886 }, { "epoch": 0.4971973094170404, "grad_norm": 0.08379922826990938, "learning_rate": 0.00019739016614064018, "loss": 0.3116, "step": 887 }, { "epoch": 0.4977578475336323, "grad_norm": 0.08078480933878579, "learning_rate": 0.0001973753395966737, "loss": 0.3014, "step": 888 }, { "epoch": 0.49831838565022424, "grad_norm": 0.08269206583626729, "learning_rate": 0.00019736047161690435, "loss": 0.2912, "step": 889 }, { "epoch": 0.49887892376681614, "grad_norm": 0.08455761204795444, "learning_rate": 0.00019734556220765877, "loss": 0.301, "step": 890 }, { "epoch": 0.4994394618834081, "grad_norm": 0.0883603760082935, "learning_rate": 0.00019733061137528136, "loss": 0.3056, "step": 891 }, { "epoch": 0.5, "grad_norm": 0.08345844537621519, "learning_rate": 0.00019731561912613406, "loss": 0.2918, "step": 892 }, { "epoch": 0.500560538116592, "grad_norm": 0.08379990914186539, "learning_rate": 0.00019730058546659653, "loss": 0.3041, "step": 893 }, { "epoch": 0.5011210762331838, "grad_norm": 0.08126863320078369, "learning_rate": 0.00019728551040306593, "loss": 0.3049, "step": 894 }, { "epoch": 0.5016816143497758, "grad_norm": 0.08687314500462376, "learning_rate": 0.0001972703939419571, "loss": 0.3043, "step": 895 }, { "epoch": 0.5022421524663677, "grad_norm": 0.0890477112161628, "learning_rate": 0.00019725523608970255, "loss": 0.3097, "step": 896 }, { "epoch": 0.5028026905829597, "grad_norm": 0.08299004293296505, "learning_rate": 0.00019724003685275235, "loss": 0.3032, "step": 897 }, { "epoch": 0.5033632286995515, "grad_norm": 0.08938387229759379, "learning_rate": 0.00019722479623757413, "loss": 0.2923, "step": 898 }, { "epoch": 0.5039237668161435, "grad_norm": 0.0862274235241268, "learning_rate": 0.00019720951425065318, "loss": 0.2916, "step": 899 }, { "epoch": 0.5044843049327354, "grad_norm": 0.08413531724910275, "learning_rate": 0.00019719419089849247, "loss": 0.3093, "step": 900 }, { "epoch": 0.5050448430493274, "grad_norm": 0.08505393388446382, "learning_rate": 0.0001971788261876124, "loss": 0.2996, "step": 901 }, { "epoch": 0.5056053811659192, "grad_norm": 0.08595061886024928, "learning_rate": 0.00019716342012455112, "loss": 0.2948, "step": 902 }, { "epoch": 0.5061659192825112, "grad_norm": 0.086781454846237, "learning_rate": 0.00019714797271586432, "loss": 0.3035, "step": 903 }, { "epoch": 0.5067264573991032, "grad_norm": 0.08615481383556181, "learning_rate": 0.00019713248396812524, "loss": 0.3002, "step": 904 }, { "epoch": 0.5072869955156951, "grad_norm": 0.08185668542667167, "learning_rate": 0.0001971169538879248, "loss": 0.2985, "step": 905 }, { "epoch": 0.507847533632287, "grad_norm": 0.08487130316266801, "learning_rate": 0.00019710138248187143, "loss": 0.3145, "step": 906 }, { "epoch": 0.5084080717488789, "grad_norm": 0.08473969458264732, "learning_rate": 0.00019708576975659123, "loss": 0.2968, "step": 907 }, { "epoch": 0.5089686098654709, "grad_norm": 0.08487478002006031, "learning_rate": 0.00019707011571872777, "loss": 0.3149, "step": 908 }, { "epoch": 0.5095291479820628, "grad_norm": 0.07891403399740557, "learning_rate": 0.0001970544203749423, "loss": 0.3028, "step": 909 }, { "epoch": 0.5100896860986547, "grad_norm": 0.08186472706182046, "learning_rate": 0.00019703868373191358, "loss": 0.31, "step": 910 }, { "epoch": 0.5106502242152466, "grad_norm": 0.08271364945301952, "learning_rate": 0.00019702290579633799, "loss": 0.2998, "step": 911 }, { "epoch": 0.5112107623318386, "grad_norm": 0.08408491039003725, "learning_rate": 0.00019700708657492948, "loss": 0.2946, "step": 912 }, { "epoch": 0.5117713004484304, "grad_norm": 0.08217553132602926, "learning_rate": 0.0001969912260744195, "loss": 0.2966, "step": 913 }, { "epoch": 0.5123318385650224, "grad_norm": 0.08527351868710076, "learning_rate": 0.00019697532430155716, "loss": 0.2953, "step": 914 }, { "epoch": 0.5128923766816144, "grad_norm": 0.08430499254522876, "learning_rate": 0.00019695938126310908, "loss": 0.3144, "step": 915 }, { "epoch": 0.5134529147982063, "grad_norm": 0.08747472712751019, "learning_rate": 0.00019694339696585942, "loss": 0.2923, "step": 916 }, { "epoch": 0.5140134529147982, "grad_norm": 0.0838268115142445, "learning_rate": 0.00019692737141660996, "loss": 0.3056, "step": 917 }, { "epoch": 0.5145739910313901, "grad_norm": 0.08324548511507619, "learning_rate": 0.00019691130462217996, "loss": 0.2978, "step": 918 }, { "epoch": 0.5151345291479821, "grad_norm": 0.0856078729888167, "learning_rate": 0.0001968951965894063, "loss": 0.3045, "step": 919 }, { "epoch": 0.515695067264574, "grad_norm": 0.07967163778859834, "learning_rate": 0.0001968790473251434, "loss": 0.2954, "step": 920 }, { "epoch": 0.5162556053811659, "grad_norm": 0.08354138058819514, "learning_rate": 0.00019686285683626314, "loss": 0.2843, "step": 921 }, { "epoch": 0.5168161434977578, "grad_norm": 0.07968434796052404, "learning_rate": 0.00019684662512965505, "loss": 0.2966, "step": 922 }, { "epoch": 0.5173766816143498, "grad_norm": 0.08155084666307773, "learning_rate": 0.00019683035221222618, "loss": 0.303, "step": 923 }, { "epoch": 0.5179372197309418, "grad_norm": 0.08600324315161532, "learning_rate": 0.00019681403809090097, "loss": 0.329, "step": 924 }, { "epoch": 0.5184977578475336, "grad_norm": 0.08025233555071982, "learning_rate": 0.00019679768277262164, "loss": 0.2973, "step": 925 }, { "epoch": 0.5190582959641256, "grad_norm": 0.08371492254683732, "learning_rate": 0.00019678128626434777, "loss": 0.2909, "step": 926 }, { "epoch": 0.5196188340807175, "grad_norm": 0.08506849111333878, "learning_rate": 0.00019676484857305654, "loss": 0.3015, "step": 927 }, { "epoch": 0.5201793721973094, "grad_norm": 0.0831534850052501, "learning_rate": 0.00019674836970574254, "loss": 0.3045, "step": 928 }, { "epoch": 0.5207399103139013, "grad_norm": 0.08092206783572063, "learning_rate": 0.00019673184966941803, "loss": 0.2963, "step": 929 }, { "epoch": 0.5213004484304933, "grad_norm": 0.08408705293952237, "learning_rate": 0.00019671528847111275, "loss": 0.2879, "step": 930 }, { "epoch": 0.5218609865470852, "grad_norm": 0.08576490959085466, "learning_rate": 0.00019669868611787387, "loss": 0.2994, "step": 931 }, { "epoch": 0.5224215246636771, "grad_norm": 0.08854960466302186, "learning_rate": 0.00019668204261676618, "loss": 0.3071, "step": 932 }, { "epoch": 0.522982062780269, "grad_norm": 0.08110885464158614, "learning_rate": 0.00019666535797487194, "loss": 0.2964, "step": 933 }, { "epoch": 0.523542600896861, "grad_norm": 0.08031550556933642, "learning_rate": 0.00019664863219929086, "loss": 0.2966, "step": 934 }, { "epoch": 0.524103139013453, "grad_norm": 0.08238932652082077, "learning_rate": 0.0001966318652971402, "loss": 0.3089, "step": 935 }, { "epoch": 0.5246636771300448, "grad_norm": 0.08422789627755416, "learning_rate": 0.00019661505727555482, "loss": 0.303, "step": 936 }, { "epoch": 0.5252242152466368, "grad_norm": 0.08734140193215176, "learning_rate": 0.0001965982081416869, "loss": 0.3074, "step": 937 }, { "epoch": 0.5257847533632287, "grad_norm": 0.08127075155698334, "learning_rate": 0.0001965813179027062, "loss": 0.297, "step": 938 }, { "epoch": 0.5263452914798207, "grad_norm": 0.08500582631589884, "learning_rate": 0.00019656438656579997, "loss": 0.2927, "step": 939 }, { "epoch": 0.5269058295964125, "grad_norm": 0.08228510145684727, "learning_rate": 0.00019654741413817296, "loss": 0.2939, "step": 940 }, { "epoch": 0.5274663677130045, "grad_norm": 0.08647748094621052, "learning_rate": 0.00019653040062704737, "loss": 0.2997, "step": 941 }, { "epoch": 0.5280269058295964, "grad_norm": 0.07839774222368344, "learning_rate": 0.00019651334603966295, "loss": 0.3014, "step": 942 }, { "epoch": 0.5285874439461884, "grad_norm": 0.08693378206066758, "learning_rate": 0.00019649625038327683, "loss": 0.3021, "step": 943 }, { "epoch": 0.5291479820627802, "grad_norm": 0.08215386654261363, "learning_rate": 0.0001964791136651637, "loss": 0.3067, "step": 944 }, { "epoch": 0.5297085201793722, "grad_norm": 0.08369155709381948, "learning_rate": 0.00019646193589261565, "loss": 0.3126, "step": 945 }, { "epoch": 0.5302690582959642, "grad_norm": 0.08236756787936445, "learning_rate": 0.00019644471707294233, "loss": 0.3179, "step": 946 }, { "epoch": 0.530829596412556, "grad_norm": 0.08268734154037605, "learning_rate": 0.00019642745721347077, "loss": 0.2978, "step": 947 }, { "epoch": 0.531390134529148, "grad_norm": 0.08359759524474471, "learning_rate": 0.00019641015632154552, "loss": 0.3032, "step": 948 }, { "epoch": 0.5319506726457399, "grad_norm": 0.08415225130718881, "learning_rate": 0.00019639281440452856, "loss": 0.2989, "step": 949 }, { "epoch": 0.5325112107623319, "grad_norm": 0.08454145862518143, "learning_rate": 0.00019637543146979939, "loss": 0.2933, "step": 950 }, { "epoch": 0.5330717488789237, "grad_norm": 0.08178203425002459, "learning_rate": 0.0001963580075247548, "loss": 0.3045, "step": 951 }, { "epoch": 0.5336322869955157, "grad_norm": 0.08121384674961249, "learning_rate": 0.00019634054257680923, "loss": 0.292, "step": 952 }, { "epoch": 0.5341928251121076, "grad_norm": 0.08281600642346805, "learning_rate": 0.00019632303663339444, "loss": 0.3008, "step": 953 }, { "epoch": 0.5347533632286996, "grad_norm": 0.08062824486513642, "learning_rate": 0.00019630548970195975, "loss": 0.2911, "step": 954 }, { "epoch": 0.5353139013452914, "grad_norm": 0.082421799974641, "learning_rate": 0.00019628790178997173, "loss": 0.2964, "step": 955 }, { "epoch": 0.5358744394618834, "grad_norm": 0.08450222653145384, "learning_rate": 0.00019627027290491458, "loss": 0.3043, "step": 956 }, { "epoch": 0.5364349775784754, "grad_norm": 0.08761895213193802, "learning_rate": 0.00019625260305428989, "loss": 0.3059, "step": 957 }, { "epoch": 0.5369955156950673, "grad_norm": 0.08492338268252649, "learning_rate": 0.00019623489224561657, "loss": 0.2994, "step": 958 }, { "epoch": 0.5375560538116592, "grad_norm": 0.08135785679281451, "learning_rate": 0.0001962171404864311, "loss": 0.2966, "step": 959 }, { "epoch": 0.5381165919282511, "grad_norm": 0.08044172378823879, "learning_rate": 0.0001961993477842873, "loss": 0.2983, "step": 960 }, { "epoch": 0.5386771300448431, "grad_norm": 0.08185763804368876, "learning_rate": 0.00019618151414675644, "loss": 0.3054, "step": 961 }, { "epoch": 0.5392376681614349, "grad_norm": 0.08366828054161651, "learning_rate": 0.00019616363958142722, "loss": 0.2985, "step": 962 }, { "epoch": 0.5397982062780269, "grad_norm": 0.08518406054877001, "learning_rate": 0.00019614572409590574, "loss": 0.3043, "step": 963 }, { "epoch": 0.5403587443946188, "grad_norm": 0.07975472875203916, "learning_rate": 0.00019612776769781554, "loss": 0.3046, "step": 964 }, { "epoch": 0.5409192825112108, "grad_norm": 0.08010969525048041, "learning_rate": 0.00019610977039479746, "loss": 0.3025, "step": 965 }, { "epoch": 0.5414798206278026, "grad_norm": 0.08372359413046519, "learning_rate": 0.00019609173219450998, "loss": 0.2944, "step": 966 }, { "epoch": 0.5420403587443946, "grad_norm": 0.08073693338331876, "learning_rate": 0.00019607365310462868, "loss": 0.2926, "step": 967 }, { "epoch": 0.5426008968609866, "grad_norm": 0.08239761710797047, "learning_rate": 0.0001960555331328468, "loss": 0.2919, "step": 968 }, { "epoch": 0.5431614349775785, "grad_norm": 0.08080189571564042, "learning_rate": 0.0001960373722868748, "loss": 0.288, "step": 969 }, { "epoch": 0.5437219730941704, "grad_norm": 0.0836435259389605, "learning_rate": 0.00019601917057444072, "loss": 0.2961, "step": 970 }, { "epoch": 0.5442825112107623, "grad_norm": 0.08086009589923894, "learning_rate": 0.0001960009280032897, "loss": 0.2924, "step": 971 }, { "epoch": 0.5448430493273543, "grad_norm": 0.08296429379715402, "learning_rate": 0.00019598264458118458, "loss": 0.2983, "step": 972 }, { "epoch": 0.5454035874439462, "grad_norm": 0.08308733138527796, "learning_rate": 0.0001959643203159054, "loss": 0.2903, "step": 973 }, { "epoch": 0.5459641255605381, "grad_norm": 0.07872166292022645, "learning_rate": 0.0001959459552152496, "loss": 0.2929, "step": 974 }, { "epoch": 0.54652466367713, "grad_norm": 0.07986976819269802, "learning_rate": 0.00019592754928703205, "loss": 0.301, "step": 975 }, { "epoch": 0.547085201793722, "grad_norm": 0.08205647747003766, "learning_rate": 0.00019590910253908494, "loss": 0.2976, "step": 976 }, { "epoch": 0.547645739910314, "grad_norm": 0.0839728790871989, "learning_rate": 0.0001958906149792579, "loss": 0.3089, "step": 977 }, { "epoch": 0.5482062780269058, "grad_norm": 0.0803251806561804, "learning_rate": 0.00019587208661541784, "loss": 0.3064, "step": 978 }, { "epoch": 0.5487668161434978, "grad_norm": 0.07985275513420223, "learning_rate": 0.00019585351745544905, "loss": 0.2882, "step": 979 }, { "epoch": 0.5493273542600897, "grad_norm": 0.08140438201801921, "learning_rate": 0.00019583490750725325, "loss": 0.3104, "step": 980 }, { "epoch": 0.5498878923766816, "grad_norm": 0.08307980787136376, "learning_rate": 0.00019581625677874944, "loss": 0.298, "step": 981 }, { "epoch": 0.5504484304932735, "grad_norm": 0.0797856319440976, "learning_rate": 0.00019579756527787404, "loss": 0.3056, "step": 982 }, { "epoch": 0.5510089686098655, "grad_norm": 0.080113786389902, "learning_rate": 0.0001957788330125807, "loss": 0.2981, "step": 983 }, { "epoch": 0.5515695067264574, "grad_norm": 0.08553049329528893, "learning_rate": 0.0001957600599908406, "loss": 0.2968, "step": 984 }, { "epoch": 0.5521300448430493, "grad_norm": 0.08937914865124305, "learning_rate": 0.00019574124622064208, "loss": 0.3042, "step": 985 }, { "epoch": 0.5526905829596412, "grad_norm": 0.0808160952842041, "learning_rate": 0.00019572239170999098, "loss": 0.2942, "step": 986 }, { "epoch": 0.5532511210762332, "grad_norm": 0.08097659068475872, "learning_rate": 0.00019570349646691034, "loss": 0.2847, "step": 987 }, { "epoch": 0.5538116591928252, "grad_norm": 0.08091683580684034, "learning_rate": 0.0001956845604994406, "loss": 0.2972, "step": 988 }, { "epoch": 0.554372197309417, "grad_norm": 0.07621229573195737, "learning_rate": 0.0001956655838156395, "loss": 0.2955, "step": 989 }, { "epoch": 0.554932735426009, "grad_norm": 0.08338821667309904, "learning_rate": 0.00019564656642358217, "loss": 0.2993, "step": 990 }, { "epoch": 0.5554932735426009, "grad_norm": 0.0820814226050958, "learning_rate": 0.00019562750833136097, "loss": 0.2948, "step": 991 }, { "epoch": 0.5560538116591929, "grad_norm": 0.08424537008272272, "learning_rate": 0.00019560840954708565, "loss": 0.2931, "step": 992 }, { "epoch": 0.5566143497757847, "grad_norm": 0.08630715850803786, "learning_rate": 0.00019558927007888328, "loss": 0.2893, "step": 993 }, { "epoch": 0.5571748878923767, "grad_norm": 0.08787717405940232, "learning_rate": 0.00019557008993489815, "loss": 0.3017, "step": 994 }, { "epoch": 0.5577354260089686, "grad_norm": 0.08219811787822397, "learning_rate": 0.00019555086912329198, "loss": 0.2987, "step": 995 }, { "epoch": 0.5582959641255605, "grad_norm": 0.08257802571035627, "learning_rate": 0.00019553160765224372, "loss": 0.2968, "step": 996 }, { "epoch": 0.5588565022421524, "grad_norm": 0.08232718737486464, "learning_rate": 0.0001955123055299496, "loss": 0.2927, "step": 997 }, { "epoch": 0.5594170403587444, "grad_norm": 0.08472102275377649, "learning_rate": 0.00019549296276462325, "loss": 0.2967, "step": 998 }, { "epoch": 0.5599775784753364, "grad_norm": 0.08162396782133255, "learning_rate": 0.0001954735793644955, "loss": 0.3083, "step": 999 }, { "epoch": 0.5605381165919282, "grad_norm": 0.08124050128833456, "learning_rate": 0.0001954541553378145, "loss": 0.2942, "step": 1000 }, { "epoch": 0.5610986547085202, "grad_norm": 0.08292789662007756, "learning_rate": 0.00019543469069284572, "loss": 0.3047, "step": 1001 }, { "epoch": 0.5616591928251121, "grad_norm": 0.0802762167797529, "learning_rate": 0.00019541518543787184, "loss": 0.2997, "step": 1002 }, { "epoch": 0.5622197309417041, "grad_norm": 0.08099010862176718, "learning_rate": 0.00019539563958119292, "loss": 0.3083, "step": 1003 }, { "epoch": 0.5627802690582959, "grad_norm": 0.08287263273356754, "learning_rate": 0.0001953760531311262, "loss": 0.3062, "step": 1004 }, { "epoch": 0.5633408071748879, "grad_norm": 0.0822128744799154, "learning_rate": 0.00019535642609600623, "loss": 0.2965, "step": 1005 }, { "epoch": 0.5639013452914798, "grad_norm": 0.08091755046384955, "learning_rate": 0.00019533675848418488, "loss": 0.2865, "step": 1006 }, { "epoch": 0.5644618834080718, "grad_norm": 0.0824953280005843, "learning_rate": 0.00019531705030403123, "loss": 0.3058, "step": 1007 }, { "epoch": 0.5650224215246636, "grad_norm": 0.08139511280822345, "learning_rate": 0.0001952973015639316, "loss": 0.3009, "step": 1008 }, { "epoch": 0.5655829596412556, "grad_norm": 0.08380566540204436, "learning_rate": 0.00019527751227228963, "loss": 0.3094, "step": 1009 }, { "epoch": 0.5661434977578476, "grad_norm": 0.08396668460521664, "learning_rate": 0.0001952576824375262, "loss": 0.3011, "step": 1010 }, { "epoch": 0.5667040358744395, "grad_norm": 0.08324030871087482, "learning_rate": 0.00019523781206807944, "loss": 0.2996, "step": 1011 }, { "epoch": 0.5672645739910314, "grad_norm": 0.08074950884817227, "learning_rate": 0.0001952179011724047, "loss": 0.3091, "step": 1012 }, { "epoch": 0.5678251121076233, "grad_norm": 0.08156566020093663, "learning_rate": 0.0001951979497589746, "loss": 0.3074, "step": 1013 }, { "epoch": 0.5683856502242153, "grad_norm": 0.08130419215015418, "learning_rate": 0.000195177957836279, "loss": 0.3047, "step": 1014 }, { "epoch": 0.5689461883408071, "grad_norm": 0.07816132167890519, "learning_rate": 0.00019515792541282504, "loss": 0.2919, "step": 1015 }, { "epoch": 0.5695067264573991, "grad_norm": 0.08065245102568831, "learning_rate": 0.00019513785249713697, "loss": 0.2886, "step": 1016 }, { "epoch": 0.570067264573991, "grad_norm": 0.0811814245119974, "learning_rate": 0.00019511773909775638, "loss": 0.2964, "step": 1017 }, { "epoch": 0.570627802690583, "grad_norm": 0.08086825544168999, "learning_rate": 0.00019509758522324208, "loss": 0.2962, "step": 1018 }, { "epoch": 0.5711883408071748, "grad_norm": 0.07905304183732399, "learning_rate": 0.00019507739088217007, "loss": 0.2919, "step": 1019 }, { "epoch": 0.5717488789237668, "grad_norm": 0.08154893266510474, "learning_rate": 0.00019505715608313359, "loss": 0.294, "step": 1020 }, { "epoch": 0.5723094170403588, "grad_norm": 0.08621625302958354, "learning_rate": 0.00019503688083474306, "loss": 0.3072, "step": 1021 }, { "epoch": 0.5728699551569507, "grad_norm": 0.08102355358650781, "learning_rate": 0.00019501656514562616, "loss": 0.2977, "step": 1022 }, { "epoch": 0.5734304932735426, "grad_norm": 0.08352356346903747, "learning_rate": 0.00019499620902442777, "loss": 0.2973, "step": 1023 }, { "epoch": 0.5739910313901345, "grad_norm": 0.08645298103063442, "learning_rate": 0.00019497581247980992, "loss": 0.3009, "step": 1024 }, { "epoch": 0.5745515695067265, "grad_norm": 0.08215864116992247, "learning_rate": 0.0001949553755204519, "loss": 0.2986, "step": 1025 }, { "epoch": 0.5751121076233184, "grad_norm": 0.08300242923574697, "learning_rate": 0.00019493489815505018, "loss": 0.2979, "step": 1026 }, { "epoch": 0.5756726457399103, "grad_norm": 0.08274552344212312, "learning_rate": 0.00019491438039231847, "loss": 0.2897, "step": 1027 }, { "epoch": 0.5762331838565022, "grad_norm": 0.08198176538321253, "learning_rate": 0.0001948938222409876, "loss": 0.2911, "step": 1028 }, { "epoch": 0.5767937219730942, "grad_norm": 0.08268500303931227, "learning_rate": 0.00019487322370980557, "loss": 0.2982, "step": 1029 }, { "epoch": 0.577354260089686, "grad_norm": 0.08292390243228843, "learning_rate": 0.00019485258480753763, "loss": 0.2953, "step": 1030 }, { "epoch": 0.577914798206278, "grad_norm": 0.08017134599658629, "learning_rate": 0.0001948319055429662, "loss": 0.2978, "step": 1031 }, { "epoch": 0.57847533632287, "grad_norm": 0.07872226328959364, "learning_rate": 0.00019481118592489086, "loss": 0.2938, "step": 1032 }, { "epoch": 0.5790358744394619, "grad_norm": 0.08151729117815305, "learning_rate": 0.0001947904259621283, "loss": 0.3008, "step": 1033 }, { "epoch": 0.5795964125560538, "grad_norm": 0.0827677338829837, "learning_rate": 0.0001947696256635125, "loss": 0.3031, "step": 1034 }, { "epoch": 0.5801569506726457, "grad_norm": 0.08327695153084624, "learning_rate": 0.00019474878503789457, "loss": 0.2808, "step": 1035 }, { "epoch": 0.5807174887892377, "grad_norm": 0.08225192058168172, "learning_rate": 0.00019472790409414266, "loss": 0.2992, "step": 1036 }, { "epoch": 0.5812780269058296, "grad_norm": 0.08073922503190531, "learning_rate": 0.00019470698284114221, "loss": 0.2888, "step": 1037 }, { "epoch": 0.5818385650224215, "grad_norm": 0.08066445833016037, "learning_rate": 0.0001946860212877958, "loss": 0.2967, "step": 1038 }, { "epoch": 0.5823991031390134, "grad_norm": 0.082569667570103, "learning_rate": 0.0001946650194430231, "loss": 0.2997, "step": 1039 }, { "epoch": 0.5829596412556054, "grad_norm": 0.07835736029464292, "learning_rate": 0.00019464397731576094, "loss": 0.2891, "step": 1040 }, { "epoch": 0.5835201793721974, "grad_norm": 0.07887715261284907, "learning_rate": 0.00019462289491496335, "loss": 0.2984, "step": 1041 }, { "epoch": 0.5840807174887892, "grad_norm": 0.08162349914427701, "learning_rate": 0.0001946017722496014, "loss": 0.2891, "step": 1042 }, { "epoch": 0.5846412556053812, "grad_norm": 0.08446767920897999, "learning_rate": 0.00019458060932866342, "loss": 0.2769, "step": 1043 }, { "epoch": 0.5852017937219731, "grad_norm": 0.0809568234632934, "learning_rate": 0.00019455940616115472, "loss": 0.3003, "step": 1044 }, { "epoch": 0.5857623318385651, "grad_norm": 0.08308651523590681, "learning_rate": 0.00019453816275609786, "loss": 0.3016, "step": 1045 }, { "epoch": 0.5863228699551569, "grad_norm": 0.0818178078741461, "learning_rate": 0.00019451687912253247, "loss": 0.2997, "step": 1046 }, { "epoch": 0.5868834080717489, "grad_norm": 0.0807603243720443, "learning_rate": 0.00019449555526951528, "loss": 0.2866, "step": 1047 }, { "epoch": 0.5874439461883408, "grad_norm": 0.0813055583399256, "learning_rate": 0.00019447419120612017, "loss": 0.281, "step": 1048 }, { "epoch": 0.5880044843049327, "grad_norm": 0.07993839104960439, "learning_rate": 0.00019445278694143813, "loss": 0.2958, "step": 1049 }, { "epoch": 0.5885650224215246, "grad_norm": 0.08372831791778379, "learning_rate": 0.00019443134248457727, "loss": 0.3057, "step": 1050 }, { "epoch": 0.5891255605381166, "grad_norm": 0.07820095062069352, "learning_rate": 0.0001944098578446627, "loss": 0.3036, "step": 1051 }, { "epoch": 0.5896860986547086, "grad_norm": 0.08196388786253347, "learning_rate": 0.00019438833303083678, "loss": 0.3065, "step": 1052 }, { "epoch": 0.5902466367713004, "grad_norm": 0.08085677677082478, "learning_rate": 0.00019436676805225885, "loss": 0.3027, "step": 1053 }, { "epoch": 0.5908071748878924, "grad_norm": 0.08049968008603187, "learning_rate": 0.0001943451629181054, "loss": 0.2936, "step": 1054 }, { "epoch": 0.5913677130044843, "grad_norm": 0.08110490365762947, "learning_rate": 0.00019432351763756998, "loss": 0.2946, "step": 1055 }, { "epoch": 0.5919282511210763, "grad_norm": 0.08020434181505288, "learning_rate": 0.00019430183221986325, "loss": 0.2905, "step": 1056 }, { "epoch": 0.5924887892376681, "grad_norm": 0.08109196937394578, "learning_rate": 0.00019428010667421294, "loss": 0.296, "step": 1057 }, { "epoch": 0.5930493273542601, "grad_norm": 0.08521408961392643, "learning_rate": 0.0001942583410098638, "loss": 0.3011, "step": 1058 }, { "epoch": 0.593609865470852, "grad_norm": 0.08178596609634749, "learning_rate": 0.00019423653523607776, "loss": 0.2995, "step": 1059 }, { "epoch": 0.594170403587444, "grad_norm": 0.08275727913240233, "learning_rate": 0.0001942146893621337, "loss": 0.3057, "step": 1060 }, { "epoch": 0.5947309417040358, "grad_norm": 0.07869497119472335, "learning_rate": 0.0001941928033973277, "loss": 0.2803, "step": 1061 }, { "epoch": 0.5952914798206278, "grad_norm": 0.07938151254570681, "learning_rate": 0.00019417087735097276, "loss": 0.3008, "step": 1062 }, { "epoch": 0.5958520179372198, "grad_norm": 0.08096247717015172, "learning_rate": 0.00019414891123239902, "loss": 0.3009, "step": 1063 }, { "epoch": 0.5964125560538116, "grad_norm": 0.08188219012976368, "learning_rate": 0.00019412690505095365, "loss": 0.2834, "step": 1064 }, { "epoch": 0.5969730941704036, "grad_norm": 0.07950204205382994, "learning_rate": 0.00019410485881600083, "loss": 0.3028, "step": 1065 }, { "epoch": 0.5975336322869955, "grad_norm": 0.07961964154651452, "learning_rate": 0.00019408277253692187, "loss": 0.299, "step": 1066 }, { "epoch": 0.5980941704035875, "grad_norm": 0.08141812652243936, "learning_rate": 0.00019406064622311503, "loss": 0.3103, "step": 1067 }, { "epoch": 0.5986547085201793, "grad_norm": 0.08272941016317377, "learning_rate": 0.0001940384798839957, "loss": 0.3019, "step": 1068 }, { "epoch": 0.5992152466367713, "grad_norm": 0.08134911105415703, "learning_rate": 0.00019401627352899617, "loss": 0.2926, "step": 1069 }, { "epoch": 0.5997757847533632, "grad_norm": 0.0802752797624018, "learning_rate": 0.00019399402716756593, "loss": 0.2991, "step": 1070 }, { "epoch": 0.6003363228699552, "grad_norm": 0.08196460690449331, "learning_rate": 0.00019397174080917133, "loss": 0.2874, "step": 1071 }, { "epoch": 0.600896860986547, "grad_norm": 0.08062758419866864, "learning_rate": 0.00019394941446329583, "loss": 0.2924, "step": 1072 }, { "epoch": 0.601457399103139, "grad_norm": 0.08103845174553295, "learning_rate": 0.00019392704813943988, "loss": 0.303, "step": 1073 }, { "epoch": 0.602017937219731, "grad_norm": 0.07931275903974408, "learning_rate": 0.00019390464184712095, "loss": 0.2887, "step": 1074 }, { "epoch": 0.6025784753363229, "grad_norm": 0.07809852492116702, "learning_rate": 0.00019388219559587352, "loss": 0.2954, "step": 1075 }, { "epoch": 0.6031390134529148, "grad_norm": 0.07695128462165139, "learning_rate": 0.0001938597093952491, "loss": 0.2937, "step": 1076 }, { "epoch": 0.6036995515695067, "grad_norm": 0.07968305921044003, "learning_rate": 0.00019383718325481611, "loss": 0.2899, "step": 1077 }, { "epoch": 0.6042600896860987, "grad_norm": 0.0766371362280237, "learning_rate": 0.00019381461718416003, "loss": 0.2887, "step": 1078 }, { "epoch": 0.6048206278026906, "grad_norm": 0.08224654145621542, "learning_rate": 0.00019379201119288335, "loss": 0.298, "step": 1079 }, { "epoch": 0.6053811659192825, "grad_norm": 0.08935417113127495, "learning_rate": 0.00019376936529060554, "loss": 0.2903, "step": 1080 }, { "epoch": 0.6059417040358744, "grad_norm": 0.07924901485783611, "learning_rate": 0.000193746679486963, "loss": 0.3027, "step": 1081 }, { "epoch": 0.6065022421524664, "grad_norm": 0.08002093282893484, "learning_rate": 0.00019372395379160912, "loss": 0.2897, "step": 1082 }, { "epoch": 0.6070627802690582, "grad_norm": 0.07970239275692652, "learning_rate": 0.00019370118821421435, "loss": 0.3055, "step": 1083 }, { "epoch": 0.6076233183856502, "grad_norm": 0.07952328389630461, "learning_rate": 0.000193678382764466, "loss": 0.2951, "step": 1084 }, { "epoch": 0.6081838565022422, "grad_norm": 0.07836077662821399, "learning_rate": 0.00019365553745206846, "loss": 0.3058, "step": 1085 }, { "epoch": 0.6087443946188341, "grad_norm": 0.07837725396868078, "learning_rate": 0.00019363265228674296, "loss": 0.2992, "step": 1086 }, { "epoch": 0.609304932735426, "grad_norm": 0.07958884714718022, "learning_rate": 0.00019360972727822774, "loss": 0.2788, "step": 1087 }, { "epoch": 0.6098654708520179, "grad_norm": 0.07932542886598398, "learning_rate": 0.00019358676243627808, "loss": 0.2883, "step": 1088 }, { "epoch": 0.6104260089686099, "grad_norm": 0.08309045755745338, "learning_rate": 0.00019356375777066604, "loss": 0.3058, "step": 1089 }, { "epoch": 0.6109865470852018, "grad_norm": 0.08453098433315742, "learning_rate": 0.0001935407132911808, "loss": 0.2861, "step": 1090 }, { "epoch": 0.6115470852017937, "grad_norm": 0.07726523523099504, "learning_rate": 0.00019351762900762833, "loss": 0.2975, "step": 1091 }, { "epoch": 0.6121076233183856, "grad_norm": 0.08583559568404842, "learning_rate": 0.00019349450492983164, "loss": 0.2962, "step": 1092 }, { "epoch": 0.6126681614349776, "grad_norm": 0.08279385055747263, "learning_rate": 0.00019347134106763062, "loss": 0.3077, "step": 1093 }, { "epoch": 0.6132286995515696, "grad_norm": 0.07913164196006647, "learning_rate": 0.00019344813743088217, "loss": 0.306, "step": 1094 }, { "epoch": 0.6137892376681614, "grad_norm": 0.08142004526828342, "learning_rate": 0.00019342489402945998, "loss": 0.3104, "step": 1095 }, { "epoch": 0.6143497757847534, "grad_norm": 0.07725508609340993, "learning_rate": 0.0001934016108732548, "loss": 0.2832, "step": 1096 }, { "epoch": 0.6149103139013453, "grad_norm": 0.07701666006903832, "learning_rate": 0.0001933782879721742, "loss": 0.3037, "step": 1097 }, { "epoch": 0.6154708520179372, "grad_norm": 0.0796016432331057, "learning_rate": 0.00019335492533614272, "loss": 0.288, "step": 1098 }, { "epoch": 0.6160313901345291, "grad_norm": 0.08038386277262886, "learning_rate": 0.00019333152297510176, "loss": 0.2958, "step": 1099 }, { "epoch": 0.6165919282511211, "grad_norm": 0.08285440899869845, "learning_rate": 0.00019330808089900963, "loss": 0.3047, "step": 1100 }, { "epoch": 0.617152466367713, "grad_norm": 0.08012111876088956, "learning_rate": 0.00019328459911784163, "loss": 0.2938, "step": 1101 }, { "epoch": 0.6177130044843049, "grad_norm": 0.08236582150599499, "learning_rate": 0.00019326107764158982, "loss": 0.2902, "step": 1102 }, { "epoch": 0.6182735426008968, "grad_norm": 0.08255214806492887, "learning_rate": 0.0001932375164802632, "loss": 0.3007, "step": 1103 }, { "epoch": 0.6188340807174888, "grad_norm": 0.08146766932112492, "learning_rate": 0.00019321391564388775, "loss": 0.2992, "step": 1104 }, { "epoch": 0.6193946188340808, "grad_norm": 0.08066260480208594, "learning_rate": 0.00019319027514250618, "loss": 0.2795, "step": 1105 }, { "epoch": 0.6199551569506726, "grad_norm": 0.08387491653054607, "learning_rate": 0.0001931665949861782, "loss": 0.2908, "step": 1106 }, { "epoch": 0.6205156950672646, "grad_norm": 0.07852227844176, "learning_rate": 0.00019314287518498033, "loss": 0.2857, "step": 1107 }, { "epoch": 0.6210762331838565, "grad_norm": 0.07612580999607896, "learning_rate": 0.00019311911574900598, "loss": 0.294, "step": 1108 }, { "epoch": 0.6216367713004485, "grad_norm": 0.0765034734546166, "learning_rate": 0.00019309531668836545, "loss": 0.3059, "step": 1109 }, { "epoch": 0.6221973094170403, "grad_norm": 0.07948666602882432, "learning_rate": 0.00019307147801318585, "loss": 0.3032, "step": 1110 }, { "epoch": 0.6227578475336323, "grad_norm": 0.08120273308887432, "learning_rate": 0.00019304759973361112, "loss": 0.2921, "step": 1111 }, { "epoch": 0.6233183856502242, "grad_norm": 0.07598800597650929, "learning_rate": 0.00019302368185980217, "loss": 0.2807, "step": 1112 }, { "epoch": 0.6238789237668162, "grad_norm": 0.08107964441077349, "learning_rate": 0.00019299972440193672, "loss": 0.299, "step": 1113 }, { "epoch": 0.624439461883408, "grad_norm": 0.08025669159681502, "learning_rate": 0.00019297572737020922, "loss": 0.3001, "step": 1114 }, { "epoch": 0.625, "grad_norm": 0.0808282612672915, "learning_rate": 0.0001929516907748311, "loss": 0.2929, "step": 1115 }, { "epoch": 0.625560538116592, "grad_norm": 0.07907580039207018, "learning_rate": 0.00019292761462603056, "loss": 0.2986, "step": 1116 }, { "epoch": 0.6261210762331838, "grad_norm": 0.0849296890291424, "learning_rate": 0.00019290349893405268, "loss": 0.3079, "step": 1117 }, { "epoch": 0.6266816143497758, "grad_norm": 0.07739679164512617, "learning_rate": 0.00019287934370915925, "loss": 0.2929, "step": 1118 }, { "epoch": 0.6272421524663677, "grad_norm": 0.07656728336817563, "learning_rate": 0.00019285514896162905, "loss": 0.3, "step": 1119 }, { "epoch": 0.6278026905829597, "grad_norm": 0.07690746172050378, "learning_rate": 0.00019283091470175754, "loss": 0.2943, "step": 1120 }, { "epoch": 0.6283632286995515, "grad_norm": 0.07997516497280334, "learning_rate": 0.00019280664093985705, "loss": 0.3095, "step": 1121 }, { "epoch": 0.6289237668161435, "grad_norm": 0.07824202192119754, "learning_rate": 0.0001927823276862567, "loss": 0.306, "step": 1122 }, { "epoch": 0.6294843049327354, "grad_norm": 0.07528993585215925, "learning_rate": 0.00019275797495130247, "loss": 0.2947, "step": 1123 }, { "epoch": 0.6300448430493274, "grad_norm": 0.07902576493284015, "learning_rate": 0.00019273358274535704, "loss": 0.2933, "step": 1124 }, { "epoch": 0.6306053811659192, "grad_norm": 0.08134667819638486, "learning_rate": 0.0001927091510788, "loss": 0.3097, "step": 1125 }, { "epoch": 0.6311659192825112, "grad_norm": 0.07938084005101839, "learning_rate": 0.00019268467996202762, "loss": 0.304, "step": 1126 }, { "epoch": 0.6317264573991032, "grad_norm": 0.08270307342787032, "learning_rate": 0.00019266016940545306, "loss": 0.2941, "step": 1127 }, { "epoch": 0.6322869955156951, "grad_norm": 0.08035414600387734, "learning_rate": 0.00019263561941950622, "loss": 0.2929, "step": 1128 }, { "epoch": 0.632847533632287, "grad_norm": 0.08408923218561712, "learning_rate": 0.0001926110300146337, "loss": 0.3006, "step": 1129 }, { "epoch": 0.6334080717488789, "grad_norm": 0.0781064497769012, "learning_rate": 0.00019258640120129906, "loss": 0.2943, "step": 1130 }, { "epoch": 0.6339686098654709, "grad_norm": 0.08038236922867095, "learning_rate": 0.00019256173298998243, "loss": 0.2885, "step": 1131 }, { "epoch": 0.6345291479820628, "grad_norm": 0.08268602877577204, "learning_rate": 0.0001925370253911808, "loss": 0.288, "step": 1132 }, { "epoch": 0.6350896860986547, "grad_norm": 0.07860721805885536, "learning_rate": 0.00019251227841540796, "loss": 0.3001, "step": 1133 }, { "epoch": 0.6356502242152466, "grad_norm": 0.07708782842231296, "learning_rate": 0.00019248749207319437, "loss": 0.2845, "step": 1134 }, { "epoch": 0.6362107623318386, "grad_norm": 0.08111114715442702, "learning_rate": 0.00019246266637508726, "loss": 0.2925, "step": 1135 }, { "epoch": 0.6367713004484304, "grad_norm": 0.07924848980093907, "learning_rate": 0.00019243780133165067, "loss": 0.2973, "step": 1136 }, { "epoch": 0.6373318385650224, "grad_norm": 0.08002464215813118, "learning_rate": 0.00019241289695346532, "loss": 0.2891, "step": 1137 }, { "epoch": 0.6378923766816144, "grad_norm": 0.07998453079671493, "learning_rate": 0.0001923879532511287, "loss": 0.2924, "step": 1138 }, { "epoch": 0.6384529147982063, "grad_norm": 0.0796813445505197, "learning_rate": 0.00019236297023525497, "loss": 0.2953, "step": 1139 }, { "epoch": 0.6390134529147982, "grad_norm": 0.07958559913224303, "learning_rate": 0.00019233794791647516, "loss": 0.2946, "step": 1140 }, { "epoch": 0.6395739910313901, "grad_norm": 0.07820666152885129, "learning_rate": 0.00019231288630543685, "loss": 0.2925, "step": 1141 }, { "epoch": 0.6401345291479821, "grad_norm": 0.08113787442901467, "learning_rate": 0.00019228778541280445, "loss": 0.2936, "step": 1142 }, { "epoch": 0.640695067264574, "grad_norm": 0.07979761874717892, "learning_rate": 0.0001922626452492591, "loss": 0.2929, "step": 1143 }, { "epoch": 0.6412556053811659, "grad_norm": 0.07936109418562343, "learning_rate": 0.00019223746582549853, "loss": 0.2958, "step": 1144 }, { "epoch": 0.6418161434977578, "grad_norm": 0.07638652262545426, "learning_rate": 0.00019221224715223732, "loss": 0.2862, "step": 1145 }, { "epoch": 0.6423766816143498, "grad_norm": 0.08046334380154133, "learning_rate": 0.00019218698924020668, "loss": 0.2801, "step": 1146 }, { "epoch": 0.6429372197309418, "grad_norm": 0.08288744800397352, "learning_rate": 0.00019216169210015452, "loss": 0.3026, "step": 1147 }, { "epoch": 0.6434977578475336, "grad_norm": 0.08371225533628113, "learning_rate": 0.0001921363557428454, "loss": 0.3043, "step": 1148 }, { "epoch": 0.6440582959641256, "grad_norm": 0.07998489281693971, "learning_rate": 0.0001921109801790607, "loss": 0.3037, "step": 1149 }, { "epoch": 0.6446188340807175, "grad_norm": 0.0774369203642628, "learning_rate": 0.00019208556541959834, "loss": 0.2968, "step": 1150 }, { "epoch": 0.6451793721973094, "grad_norm": 0.07994420039040068, "learning_rate": 0.00019206011147527297, "loss": 0.2973, "step": 1151 }, { "epoch": 0.6457399103139013, "grad_norm": 0.07681632416455374, "learning_rate": 0.00019203461835691594, "loss": 0.2896, "step": 1152 }, { "epoch": 0.6463004484304933, "grad_norm": 0.0802178755635615, "learning_rate": 0.0001920090860753753, "loss": 0.3013, "step": 1153 }, { "epoch": 0.6468609865470852, "grad_norm": 0.08001251593046894, "learning_rate": 0.00019198351464151564, "loss": 0.3064, "step": 1154 }, { "epoch": 0.6474215246636771, "grad_norm": 0.07999620091460043, "learning_rate": 0.00019195790406621832, "loss": 0.2889, "step": 1155 }, { "epoch": 0.647982062780269, "grad_norm": 0.07437850211384636, "learning_rate": 0.00019193225436038133, "loss": 0.2904, "step": 1156 }, { "epoch": 0.648542600896861, "grad_norm": 0.07719958033834659, "learning_rate": 0.0001919065655349193, "loss": 0.295, "step": 1157 }, { "epoch": 0.649103139013453, "grad_norm": 0.07809909218661798, "learning_rate": 0.0001918808376007635, "loss": 0.2928, "step": 1158 }, { "epoch": 0.6496636771300448, "grad_norm": 0.07869917717937468, "learning_rate": 0.00019185507056886184, "loss": 0.2918, "step": 1159 }, { "epoch": 0.6502242152466368, "grad_norm": 0.0772141480892248, "learning_rate": 0.00019182926445017893, "loss": 0.2907, "step": 1160 }, { "epoch": 0.6507847533632287, "grad_norm": 0.07727526271027191, "learning_rate": 0.00019180341925569588, "loss": 0.2862, "step": 1161 }, { "epoch": 0.6513452914798207, "grad_norm": 0.07935252002939296, "learning_rate": 0.0001917775349964106, "loss": 0.2931, "step": 1162 }, { "epoch": 0.6519058295964125, "grad_norm": 0.08399651091444187, "learning_rate": 0.00019175161168333751, "loss": 0.2846, "step": 1163 }, { "epoch": 0.6524663677130045, "grad_norm": 0.08077552818708185, "learning_rate": 0.0001917256493275076, "loss": 0.297, "step": 1164 }, { "epoch": 0.6530269058295964, "grad_norm": 0.07900938204339188, "learning_rate": 0.0001916996479399686, "loss": 0.3003, "step": 1165 }, { "epoch": 0.6535874439461884, "grad_norm": 0.07531638452434798, "learning_rate": 0.00019167360753178481, "loss": 0.2813, "step": 1166 }, { "epoch": 0.6541479820627802, "grad_norm": 0.08224864578997634, "learning_rate": 0.00019164752811403707, "loss": 0.3053, "step": 1167 }, { "epoch": 0.6547085201793722, "grad_norm": 0.08063253736908517, "learning_rate": 0.00019162140969782292, "loss": 0.3042, "step": 1168 }, { "epoch": 0.6552690582959642, "grad_norm": 0.07750934853255069, "learning_rate": 0.00019159525229425642, "loss": 0.2838, "step": 1169 }, { "epoch": 0.655829596412556, "grad_norm": 0.07901906811626463, "learning_rate": 0.0001915690559144682, "loss": 0.3118, "step": 1170 }, { "epoch": 0.656390134529148, "grad_norm": 0.07609691917887312, "learning_rate": 0.00019154282056960557, "loss": 0.2807, "step": 1171 }, { "epoch": 0.6569506726457399, "grad_norm": 0.07638632839140339, "learning_rate": 0.00019151654627083236, "loss": 0.2903, "step": 1172 }, { "epoch": 0.6575112107623319, "grad_norm": 0.07639299406238732, "learning_rate": 0.000191490233029329, "loss": 0.2836, "step": 1173 }, { "epoch": 0.6580717488789237, "grad_norm": 0.08206956385097149, "learning_rate": 0.00019146388085629245, "loss": 0.297, "step": 1174 }, { "epoch": 0.6586322869955157, "grad_norm": 0.07832471243632444, "learning_rate": 0.00019143748976293624, "loss": 0.2876, "step": 1175 }, { "epoch": 0.6591928251121076, "grad_norm": 0.0796515723229912, "learning_rate": 0.00019141105976049053, "loss": 0.2932, "step": 1176 }, { "epoch": 0.6597533632286996, "grad_norm": 0.080297126566589, "learning_rate": 0.00019138459086020198, "loss": 0.2994, "step": 1177 }, { "epoch": 0.6603139013452914, "grad_norm": 0.08028040228837845, "learning_rate": 0.0001913580830733338, "loss": 0.2948, "step": 1178 }, { "epoch": 0.6608744394618834, "grad_norm": 0.07586611021097159, "learning_rate": 0.00019133153641116577, "loss": 0.2926, "step": 1179 }, { "epoch": 0.6614349775784754, "grad_norm": 0.07839649698548022, "learning_rate": 0.0001913049508849942, "loss": 0.2816, "step": 1180 }, { "epoch": 0.6619955156950673, "grad_norm": 0.0804896491931343, "learning_rate": 0.00019127832650613189, "loss": 0.3017, "step": 1181 }, { "epoch": 0.6625560538116592, "grad_norm": 0.07642599468111257, "learning_rate": 0.00019125166328590832, "loss": 0.2928, "step": 1182 }, { "epoch": 0.6631165919282511, "grad_norm": 0.07494985488009158, "learning_rate": 0.00019122496123566933, "loss": 0.2883, "step": 1183 }, { "epoch": 0.6636771300448431, "grad_norm": 0.07897164442319689, "learning_rate": 0.00019119822036677738, "loss": 0.2932, "step": 1184 }, { "epoch": 0.6642376681614349, "grad_norm": 0.07725523600064436, "learning_rate": 0.0001911714406906114, "loss": 0.2939, "step": 1185 }, { "epoch": 0.6647982062780269, "grad_norm": 0.07857775076834542, "learning_rate": 0.00019114462221856692, "loss": 0.2966, "step": 1186 }, { "epoch": 0.6653587443946188, "grad_norm": 0.07874979095357006, "learning_rate": 0.0001911177649620558, "loss": 0.2887, "step": 1187 }, { "epoch": 0.6659192825112108, "grad_norm": 0.07562173420640422, "learning_rate": 0.00019109086893250664, "loss": 0.2713, "step": 1188 }, { "epoch": 0.6664798206278026, "grad_norm": 0.0751433940022504, "learning_rate": 0.00019106393414136438, "loss": 0.2852, "step": 1189 }, { "epoch": 0.6670403587443946, "grad_norm": 0.08060658317525636, "learning_rate": 0.00019103696060009044, "loss": 0.2955, "step": 1190 }, { "epoch": 0.6676008968609866, "grad_norm": 0.08126966081941613, "learning_rate": 0.00019100994832016283, "loss": 0.2961, "step": 1191 }, { "epoch": 0.6681614349775785, "grad_norm": 0.08060783778066744, "learning_rate": 0.000190982897313076, "loss": 0.2978, "step": 1192 }, { "epoch": 0.6687219730941704, "grad_norm": 0.07930156320662468, "learning_rate": 0.00019095580759034082, "loss": 0.2962, "step": 1193 }, { "epoch": 0.6692825112107623, "grad_norm": 0.07672510005379797, "learning_rate": 0.00019092867916348477, "loss": 0.2879, "step": 1194 }, { "epoch": 0.6698430493273543, "grad_norm": 0.08058371505436204, "learning_rate": 0.00019090151204405166, "loss": 0.3008, "step": 1195 }, { "epoch": 0.6704035874439462, "grad_norm": 0.07833685452897783, "learning_rate": 0.0001908743062436018, "loss": 0.2969, "step": 1196 }, { "epoch": 0.6709641255605381, "grad_norm": 0.07925289543162384, "learning_rate": 0.00019084706177371208, "loss": 0.3002, "step": 1197 }, { "epoch": 0.67152466367713, "grad_norm": 0.07591225635300325, "learning_rate": 0.00019081977864597564, "loss": 0.2951, "step": 1198 }, { "epoch": 0.672085201793722, "grad_norm": 0.07889895529910429, "learning_rate": 0.00019079245687200227, "loss": 0.2989, "step": 1199 }, { "epoch": 0.672645739910314, "grad_norm": 0.07647347213481327, "learning_rate": 0.00019076509646341806, "loss": 0.2875, "step": 1200 }, { "epoch": 0.6732062780269058, "grad_norm": 0.07966217356892244, "learning_rate": 0.0001907376974318656, "loss": 0.2948, "step": 1201 }, { "epoch": 0.6737668161434978, "grad_norm": 0.07782921796907775, "learning_rate": 0.00019071025978900392, "loss": 0.288, "step": 1202 }, { "epoch": 0.6743273542600897, "grad_norm": 0.08166371192255209, "learning_rate": 0.00019068278354650845, "loss": 0.2845, "step": 1203 }, { "epoch": 0.6748878923766816, "grad_norm": 0.08168203664130542, "learning_rate": 0.00019065526871607112, "loss": 0.296, "step": 1204 }, { "epoch": 0.6754484304932735, "grad_norm": 0.07828404784030776, "learning_rate": 0.00019062771530940013, "loss": 0.2908, "step": 1205 }, { "epoch": 0.6760089686098655, "grad_norm": 0.07878905522865796, "learning_rate": 0.00019060012333822025, "loss": 0.286, "step": 1206 }, { "epoch": 0.6765695067264574, "grad_norm": 0.07729516979929453, "learning_rate": 0.0001905724928142726, "loss": 0.2958, "step": 1207 }, { "epoch": 0.6771300448430493, "grad_norm": 0.08155782664594782, "learning_rate": 0.00019054482374931467, "loss": 0.305, "step": 1208 }, { "epoch": 0.6776905829596412, "grad_norm": 0.07754570918200666, "learning_rate": 0.00019051711615512043, "loss": 0.2808, "step": 1209 }, { "epoch": 0.6782511210762332, "grad_norm": 0.07994832399248969, "learning_rate": 0.00019048937004348016, "loss": 0.2873, "step": 1210 }, { "epoch": 0.6788116591928252, "grad_norm": 0.08079659164992742, "learning_rate": 0.0001904615854262006, "loss": 0.3056, "step": 1211 }, { "epoch": 0.679372197309417, "grad_norm": 0.07699454605752862, "learning_rate": 0.00019043376231510484, "loss": 0.309, "step": 1212 }, { "epoch": 0.679932735426009, "grad_norm": 0.07958776554217052, "learning_rate": 0.00019040590072203232, "loss": 0.2927, "step": 1213 }, { "epoch": 0.6804932735426009, "grad_norm": 0.0761266135368868, "learning_rate": 0.00019037800065883895, "loss": 0.3034, "step": 1214 }, { "epoch": 0.6810538116591929, "grad_norm": 0.07525512019293601, "learning_rate": 0.0001903500621373969, "loss": 0.2969, "step": 1215 }, { "epoch": 0.6816143497757847, "grad_norm": 0.0850182475030861, "learning_rate": 0.0001903220851695948, "loss": 0.3009, "step": 1216 }, { "epoch": 0.6821748878923767, "grad_norm": 0.0793591392970288, "learning_rate": 0.00019029406976733756, "loss": 0.2913, "step": 1217 }, { "epoch": 0.6827354260089686, "grad_norm": 0.07563554698353232, "learning_rate": 0.0001902660159425465, "loss": 0.2886, "step": 1218 }, { "epoch": 0.6832959641255605, "grad_norm": 0.08294638253174616, "learning_rate": 0.00019023792370715924, "loss": 0.292, "step": 1219 }, { "epoch": 0.6838565022421524, "grad_norm": 0.0808696941836712, "learning_rate": 0.0001902097930731298, "loss": 0.3064, "step": 1220 }, { "epoch": 0.6844170403587444, "grad_norm": 0.0800785386075129, "learning_rate": 0.0001901816240524285, "loss": 0.3081, "step": 1221 }, { "epoch": 0.6849775784753364, "grad_norm": 0.08033149677757245, "learning_rate": 0.00019015341665704205, "loss": 0.3001, "step": 1222 }, { "epoch": 0.6855381165919282, "grad_norm": 0.08082502481771073, "learning_rate": 0.0001901251708989734, "loss": 0.2999, "step": 1223 }, { "epoch": 0.6860986547085202, "grad_norm": 0.0752550962403608, "learning_rate": 0.0001900968867902419, "loss": 0.2929, "step": 1224 }, { "epoch": 0.6866591928251121, "grad_norm": 0.07769334337032176, "learning_rate": 0.00019006856434288324, "loss": 0.2912, "step": 1225 }, { "epoch": 0.6872197309417041, "grad_norm": 0.07802959161878885, "learning_rate": 0.00019004020356894926, "loss": 0.2872, "step": 1226 }, { "epoch": 0.6877802690582959, "grad_norm": 0.07834574297659148, "learning_rate": 0.00019001180448050827, "loss": 0.287, "step": 1227 }, { "epoch": 0.6883408071748879, "grad_norm": 0.08160535681558054, "learning_rate": 0.00018998336708964488, "loss": 0.3056, "step": 1228 }, { "epoch": 0.6889013452914798, "grad_norm": 0.07690740685562249, "learning_rate": 0.00018995489140845995, "loss": 0.2938, "step": 1229 }, { "epoch": 0.6894618834080718, "grad_norm": 0.07817464809682581, "learning_rate": 0.00018992637744907063, "loss": 0.2829, "step": 1230 }, { "epoch": 0.6900224215246636, "grad_norm": 0.07889620909426741, "learning_rate": 0.00018989782522361033, "loss": 0.2973, "step": 1231 }, { "epoch": 0.6905829596412556, "grad_norm": 0.07909642834814787, "learning_rate": 0.00018986923474422884, "loss": 0.284, "step": 1232 }, { "epoch": 0.6911434977578476, "grad_norm": 0.07876951617721326, "learning_rate": 0.00018984060602309213, "loss": 0.2904, "step": 1233 }, { "epoch": 0.6917040358744395, "grad_norm": 0.07842618999673406, "learning_rate": 0.00018981193907238253, "loss": 0.299, "step": 1234 }, { "epoch": 0.6922645739910314, "grad_norm": 0.07561517151383235, "learning_rate": 0.00018978323390429855, "loss": 0.2884, "step": 1235 }, { "epoch": 0.6928251121076233, "grad_norm": 0.07578068063055152, "learning_rate": 0.00018975449053105505, "loss": 0.2844, "step": 1236 }, { "epoch": 0.6933856502242153, "grad_norm": 0.07790798516880275, "learning_rate": 0.00018972570896488305, "loss": 0.2982, "step": 1237 }, { "epoch": 0.6939461883408071, "grad_norm": 0.07914549155279324, "learning_rate": 0.00018969688921802988, "loss": 0.2878, "step": 1238 }, { "epoch": 0.6945067264573991, "grad_norm": 0.07404575322266842, "learning_rate": 0.00018966803130275915, "loss": 0.2868, "step": 1239 }, { "epoch": 0.695067264573991, "grad_norm": 0.08120303825599713, "learning_rate": 0.0001896391352313506, "loss": 0.2945, "step": 1240 }, { "epoch": 0.695627802690583, "grad_norm": 0.07437289579864437, "learning_rate": 0.00018961020101610038, "loss": 0.2905, "step": 1241 }, { "epoch": 0.6961883408071748, "grad_norm": 0.07927624379164572, "learning_rate": 0.00018958122866932067, "loss": 0.2873, "step": 1242 }, { "epoch": 0.6967488789237668, "grad_norm": 0.07683562036131639, "learning_rate": 0.00018955221820334008, "loss": 0.2995, "step": 1243 }, { "epoch": 0.6973094170403588, "grad_norm": 0.07562879219671298, "learning_rate": 0.00018952316963050328, "loss": 0.2967, "step": 1244 }, { "epoch": 0.6978699551569507, "grad_norm": 0.07988497736399475, "learning_rate": 0.00018949408296317115, "loss": 0.2907, "step": 1245 }, { "epoch": 0.6984304932735426, "grad_norm": 0.0787077251985745, "learning_rate": 0.00018946495821372094, "loss": 0.3093, "step": 1246 }, { "epoch": 0.6989910313901345, "grad_norm": 0.07735407110297393, "learning_rate": 0.000189435795394546, "loss": 0.2997, "step": 1247 }, { "epoch": 0.6995515695067265, "grad_norm": 0.07741020308703161, "learning_rate": 0.0001894065945180558, "loss": 0.2926, "step": 1248 }, { "epoch": 0.7001121076233184, "grad_norm": 0.07976914290152838, "learning_rate": 0.00018937735559667617, "loss": 0.293, "step": 1249 }, { "epoch": 0.7006726457399103, "grad_norm": 0.07510775283599667, "learning_rate": 0.00018934807864284903, "loss": 0.2943, "step": 1250 }, { "epoch": 0.7012331838565022, "grad_norm": 0.07593201972228869, "learning_rate": 0.00018931876366903253, "loss": 0.2917, "step": 1251 }, { "epoch": 0.7017937219730942, "grad_norm": 0.07451377135318003, "learning_rate": 0.00018928941068770093, "loss": 0.2939, "step": 1252 }, { "epoch": 0.702354260089686, "grad_norm": 0.07813147177851087, "learning_rate": 0.0001892600197113447, "loss": 0.2937, "step": 1253 }, { "epoch": 0.702914798206278, "grad_norm": 0.0770543925802459, "learning_rate": 0.00018923059075247054, "loss": 0.2852, "step": 1254 }, { "epoch": 0.70347533632287, "grad_norm": 0.0786103794148151, "learning_rate": 0.0001892011238236012, "loss": 0.2872, "step": 1255 }, { "epoch": 0.7040358744394619, "grad_norm": 0.07774864788902053, "learning_rate": 0.0001891716189372757, "loss": 0.2985, "step": 1256 }, { "epoch": 0.7045964125560538, "grad_norm": 0.0769095326661384, "learning_rate": 0.0001891420761060491, "loss": 0.2797, "step": 1257 }, { "epoch": 0.7051569506726457, "grad_norm": 0.07782531883080296, "learning_rate": 0.00018911249534249268, "loss": 0.293, "step": 1258 }, { "epoch": 0.7057174887892377, "grad_norm": 0.0800402316054872, "learning_rate": 0.00018908287665919384, "loss": 0.2991, "step": 1259 }, { "epoch": 0.7062780269058296, "grad_norm": 0.0775037455296807, "learning_rate": 0.00018905322006875617, "loss": 0.2993, "step": 1260 }, { "epoch": 0.7068385650224215, "grad_norm": 0.07450553709538528, "learning_rate": 0.00018902352558379924, "loss": 0.2798, "step": 1261 }, { "epoch": 0.7073991031390134, "grad_norm": 0.07518676284466798, "learning_rate": 0.00018899379321695895, "loss": 0.2907, "step": 1262 }, { "epoch": 0.7079596412556054, "grad_norm": 0.07842392987449064, "learning_rate": 0.00018896402298088715, "loss": 0.274, "step": 1263 }, { "epoch": 0.7085201793721974, "grad_norm": 0.08030036395322773, "learning_rate": 0.0001889342148882519, "loss": 0.2914, "step": 1264 }, { "epoch": 0.7090807174887892, "grad_norm": 0.07846212324557195, "learning_rate": 0.00018890436895173732, "loss": 0.2991, "step": 1265 }, { "epoch": 0.7096412556053812, "grad_norm": 0.08157597085045058, "learning_rate": 0.00018887448518404364, "loss": 0.2864, "step": 1266 }, { "epoch": 0.7102017937219731, "grad_norm": 0.079973832983049, "learning_rate": 0.00018884456359788724, "loss": 0.2945, "step": 1267 }, { "epoch": 0.7107623318385651, "grad_norm": 0.08100813683795925, "learning_rate": 0.0001888146042060005, "loss": 0.3062, "step": 1268 }, { "epoch": 0.7113228699551569, "grad_norm": 0.07850234211695056, "learning_rate": 0.000188784607021132, "loss": 0.2998, "step": 1269 }, { "epoch": 0.7118834080717489, "grad_norm": 0.07651407446888785, "learning_rate": 0.00018875457205604632, "loss": 0.2757, "step": 1270 }, { "epoch": 0.7124439461883408, "grad_norm": 0.07609078214940616, "learning_rate": 0.0001887244993235241, "loss": 0.2946, "step": 1271 }, { "epoch": 0.7130044843049327, "grad_norm": 0.07746617973765456, "learning_rate": 0.00018869438883636214, "loss": 0.2887, "step": 1272 }, { "epoch": 0.7135650224215246, "grad_norm": 0.0789499722820708, "learning_rate": 0.0001886642406073732, "loss": 0.3051, "step": 1273 }, { "epoch": 0.7141255605381166, "grad_norm": 0.07521197453858279, "learning_rate": 0.00018863405464938622, "loss": 0.2913, "step": 1274 }, { "epoch": 0.7146860986547086, "grad_norm": 0.07898055919018118, "learning_rate": 0.00018860383097524608, "loss": 0.3067, "step": 1275 }, { "epoch": 0.7152466367713004, "grad_norm": 0.0773309100267759, "learning_rate": 0.00018857356959781378, "loss": 0.2968, "step": 1276 }, { "epoch": 0.7158071748878924, "grad_norm": 0.07631060611281504, "learning_rate": 0.00018854327052996635, "loss": 0.3048, "step": 1277 }, { "epoch": 0.7163677130044843, "grad_norm": 0.07770995667275178, "learning_rate": 0.00018851293378459685, "loss": 0.3072, "step": 1278 }, { "epoch": 0.7169282511210763, "grad_norm": 0.07669477056566686, "learning_rate": 0.00018848255937461435, "loss": 0.2976, "step": 1279 }, { "epoch": 0.7174887892376681, "grad_norm": 0.07592043732806922, "learning_rate": 0.000188452147312944, "loss": 0.2923, "step": 1280 }, { "epoch": 0.7180493273542601, "grad_norm": 0.07864646912832403, "learning_rate": 0.0001884216976125269, "loss": 0.2875, "step": 1281 }, { "epoch": 0.718609865470852, "grad_norm": 0.08284975313147833, "learning_rate": 0.00018839121028632025, "loss": 0.2828, "step": 1282 }, { "epoch": 0.719170403587444, "grad_norm": 0.07969891678461695, "learning_rate": 0.00018836068534729722, "loss": 0.2845, "step": 1283 }, { "epoch": 0.7197309417040358, "grad_norm": 0.07680522024641628, "learning_rate": 0.00018833012280844699, "loss": 0.2883, "step": 1284 }, { "epoch": 0.7202914798206278, "grad_norm": 0.0765633573161656, "learning_rate": 0.0001882995226827747, "loss": 0.303, "step": 1285 }, { "epoch": 0.7208520179372198, "grad_norm": 0.07838781066391033, "learning_rate": 0.00018826888498330155, "loss": 0.2804, "step": 1286 }, { "epoch": 0.7214125560538116, "grad_norm": 0.07719297221136888, "learning_rate": 0.00018823820972306468, "loss": 0.3005, "step": 1287 }, { "epoch": 0.7219730941704036, "grad_norm": 0.07800903389182938, "learning_rate": 0.0001882074969151172, "loss": 0.2864, "step": 1288 }, { "epoch": 0.7225336322869955, "grad_norm": 0.07996779675024149, "learning_rate": 0.00018817674657252832, "loss": 0.2973, "step": 1289 }, { "epoch": 0.7230941704035875, "grad_norm": 0.07733184379911967, "learning_rate": 0.00018814595870838305, "loss": 0.2982, "step": 1290 }, { "epoch": 0.7236547085201793, "grad_norm": 0.07731490662415895, "learning_rate": 0.0001881151333357825, "loss": 0.29, "step": 1291 }, { "epoch": 0.7242152466367713, "grad_norm": 0.0782262630148421, "learning_rate": 0.00018808427046784366, "loss": 0.3026, "step": 1292 }, { "epoch": 0.7247757847533632, "grad_norm": 0.07651090647832731, "learning_rate": 0.00018805337011769947, "loss": 0.2906, "step": 1293 }, { "epoch": 0.7253363228699552, "grad_norm": 0.07872373873076764, "learning_rate": 0.00018802243229849893, "loss": 0.3079, "step": 1294 }, { "epoch": 0.725896860986547, "grad_norm": 0.07884950443304208, "learning_rate": 0.0001879914570234068, "loss": 0.2991, "step": 1295 }, { "epoch": 0.726457399103139, "grad_norm": 0.07497108857838872, "learning_rate": 0.000187960444305604, "loss": 0.2894, "step": 1296 }, { "epoch": 0.727017937219731, "grad_norm": 0.07587875237581815, "learning_rate": 0.0001879293941582872, "loss": 0.2899, "step": 1297 }, { "epoch": 0.7275784753363229, "grad_norm": 0.07369102143037222, "learning_rate": 0.00018789830659466912, "loss": 0.2827, "step": 1298 }, { "epoch": 0.7281390134529148, "grad_norm": 0.07789720105922017, "learning_rate": 0.00018786718162797826, "loss": 0.2756, "step": 1299 }, { "epoch": 0.7286995515695067, "grad_norm": 0.0760606785136823, "learning_rate": 0.0001878360192714592, "loss": 0.2964, "step": 1300 }, { "epoch": 0.7292600896860987, "grad_norm": 0.07437994591108035, "learning_rate": 0.00018780481953837233, "loss": 0.2876, "step": 1301 }, { "epoch": 0.7298206278026906, "grad_norm": 0.07608950743250191, "learning_rate": 0.00018777358244199393, "loss": 0.2796, "step": 1302 }, { "epoch": 0.7303811659192825, "grad_norm": 0.07805452353954782, "learning_rate": 0.0001877423079956163, "loss": 0.2701, "step": 1303 }, { "epoch": 0.7309417040358744, "grad_norm": 0.07573651354847788, "learning_rate": 0.00018771099621254746, "loss": 0.2833, "step": 1304 }, { "epoch": 0.7315022421524664, "grad_norm": 0.07806005779452138, "learning_rate": 0.00018767964710611148, "loss": 0.2855, "step": 1305 }, { "epoch": 0.7320627802690582, "grad_norm": 0.07467295002820801, "learning_rate": 0.0001876482606896482, "loss": 0.2926, "step": 1306 }, { "epoch": 0.7326233183856502, "grad_norm": 0.07553584065478107, "learning_rate": 0.0001876168369765134, "loss": 0.2943, "step": 1307 }, { "epoch": 0.7331838565022422, "grad_norm": 0.07591966668596306, "learning_rate": 0.00018758537598007868, "loss": 0.2967, "step": 1308 }, { "epoch": 0.7337443946188341, "grad_norm": 0.07711605811857049, "learning_rate": 0.00018755387771373155, "loss": 0.2785, "step": 1309 }, { "epoch": 0.734304932735426, "grad_norm": 0.07804229988629692, "learning_rate": 0.00018752234219087538, "loss": 0.3016, "step": 1310 }, { "epoch": 0.7348654708520179, "grad_norm": 0.07724641664074823, "learning_rate": 0.00018749076942492935, "loss": 0.2996, "step": 1311 }, { "epoch": 0.7354260089686099, "grad_norm": 0.07674865115715372, "learning_rate": 0.0001874591594293285, "loss": 0.3016, "step": 1312 }, { "epoch": 0.7359865470852018, "grad_norm": 0.07493025188227236, "learning_rate": 0.00018742751221752376, "loss": 0.3033, "step": 1313 }, { "epoch": 0.7365470852017937, "grad_norm": 0.07935121801870736, "learning_rate": 0.00018739582780298187, "loss": 0.2942, "step": 1314 }, { "epoch": 0.7371076233183856, "grad_norm": 0.07792308909891997, "learning_rate": 0.00018736410619918535, "loss": 0.2847, "step": 1315 }, { "epoch": 0.7376681614349776, "grad_norm": 0.07543353574385893, "learning_rate": 0.0001873323474196326, "loss": 0.285, "step": 1316 }, { "epoch": 0.7382286995515696, "grad_norm": 0.07826412511500047, "learning_rate": 0.00018730055147783787, "loss": 0.2843, "step": 1317 }, { "epoch": 0.7387892376681614, "grad_norm": 0.07543515537108975, "learning_rate": 0.00018726871838733113, "loss": 0.2952, "step": 1318 }, { "epoch": 0.7393497757847534, "grad_norm": 0.07872077356370108, "learning_rate": 0.0001872368481616582, "loss": 0.292, "step": 1319 }, { "epoch": 0.7399103139013453, "grad_norm": 0.07649789977559601, "learning_rate": 0.00018720494081438078, "loss": 0.3022, "step": 1320 }, { "epoch": 0.7404708520179372, "grad_norm": 0.07589209125514779, "learning_rate": 0.0001871729963590762, "loss": 0.2882, "step": 1321 }, { "epoch": 0.7410313901345291, "grad_norm": 0.07559430157312046, "learning_rate": 0.0001871410148093378, "loss": 0.2962, "step": 1322 }, { "epoch": 0.7415919282511211, "grad_norm": 0.07351979593308759, "learning_rate": 0.00018710899617877446, "loss": 0.2772, "step": 1323 }, { "epoch": 0.742152466367713, "grad_norm": 0.07629143680804043, "learning_rate": 0.00018707694048101104, "loss": 0.2916, "step": 1324 }, { "epoch": 0.7427130044843049, "grad_norm": 0.0737835599349803, "learning_rate": 0.00018704484772968808, "loss": 0.2859, "step": 1325 }, { "epoch": 0.7432735426008968, "grad_norm": 0.07927090998639666, "learning_rate": 0.00018701271793846185, "loss": 0.2983, "step": 1326 }, { "epoch": 0.7438340807174888, "grad_norm": 0.07550225413500154, "learning_rate": 0.0001869805511210045, "loss": 0.2852, "step": 1327 }, { "epoch": 0.7443946188340808, "grad_norm": 0.07460152949029368, "learning_rate": 0.00018694834729100386, "loss": 0.2906, "step": 1328 }, { "epoch": 0.7449551569506726, "grad_norm": 0.07676090452029702, "learning_rate": 0.00018691610646216344, "loss": 0.2834, "step": 1329 }, { "epoch": 0.7455156950672646, "grad_norm": 0.07803325358661241, "learning_rate": 0.00018688382864820267, "loss": 0.2902, "step": 1330 }, { "epoch": 0.7460762331838565, "grad_norm": 0.0769711200908372, "learning_rate": 0.0001868515138628566, "loss": 0.3062, "step": 1331 }, { "epoch": 0.7466367713004485, "grad_norm": 0.07433534825617263, "learning_rate": 0.00018681916211987597, "loss": 0.2976, "step": 1332 }, { "epoch": 0.7471973094170403, "grad_norm": 0.0776685315458737, "learning_rate": 0.00018678677343302738, "loss": 0.2978, "step": 1333 }, { "epoch": 0.7477578475336323, "grad_norm": 0.07545609129455182, "learning_rate": 0.00018675434781609303, "loss": 0.2898, "step": 1334 }, { "epoch": 0.7483183856502242, "grad_norm": 0.07572405945464439, "learning_rate": 0.00018672188528287093, "loss": 0.2802, "step": 1335 }, { "epoch": 0.7488789237668162, "grad_norm": 0.07719842275732951, "learning_rate": 0.00018668938584717471, "loss": 0.2949, "step": 1336 }, { "epoch": 0.749439461883408, "grad_norm": 0.07690169045731857, "learning_rate": 0.0001866568495228338, "loss": 0.2982, "step": 1337 }, { "epoch": 0.75, "grad_norm": 0.07467124607579745, "learning_rate": 0.0001866242763236932, "loss": 0.2906, "step": 1338 }, { "epoch": 0.750560538116592, "grad_norm": 0.0811234171484504, "learning_rate": 0.00018659166626361375, "loss": 0.2949, "step": 1339 }, { "epoch": 0.7511210762331838, "grad_norm": 0.07717253760610217, "learning_rate": 0.00018655901935647187, "loss": 0.3011, "step": 1340 }, { "epoch": 0.7516816143497758, "grad_norm": 0.07491111542612311, "learning_rate": 0.00018652633561615963, "loss": 0.2933, "step": 1341 }, { "epoch": 0.7522421524663677, "grad_norm": 0.07812280967678403, "learning_rate": 0.00018649361505658493, "loss": 0.2802, "step": 1342 }, { "epoch": 0.7528026905829597, "grad_norm": 0.0739088336544016, "learning_rate": 0.00018646085769167119, "loss": 0.2775, "step": 1343 }, { "epoch": 0.7533632286995515, "grad_norm": 0.07744656519534383, "learning_rate": 0.00018642806353535754, "loss": 0.2902, "step": 1344 }, { "epoch": 0.7539237668161435, "grad_norm": 0.07741913855187414, "learning_rate": 0.0001863952326015988, "loss": 0.2921, "step": 1345 }, { "epoch": 0.7544843049327354, "grad_norm": 0.07621173786924237, "learning_rate": 0.00018636236490436535, "loss": 0.2938, "step": 1346 }, { "epoch": 0.7550448430493274, "grad_norm": 0.07952963227808484, "learning_rate": 0.00018632946045764334, "loss": 0.2954, "step": 1347 }, { "epoch": 0.7556053811659192, "grad_norm": 0.07906435746949275, "learning_rate": 0.00018629651927543447, "loss": 0.2832, "step": 1348 }, { "epoch": 0.7561659192825112, "grad_norm": 0.07540400579101485, "learning_rate": 0.00018626354137175603, "loss": 0.2875, "step": 1349 }, { "epoch": 0.7567264573991032, "grad_norm": 0.07616969867605854, "learning_rate": 0.00018623052676064106, "loss": 0.2839, "step": 1350 }, { "epoch": 0.7572869955156951, "grad_norm": 0.08035148075312004, "learning_rate": 0.00018619747545613814, "loss": 0.2909, "step": 1351 }, { "epoch": 0.757847533632287, "grad_norm": 0.07969136405752344, "learning_rate": 0.00018616438747231148, "loss": 0.2902, "step": 1352 }, { "epoch": 0.7584080717488789, "grad_norm": 0.07753625952755534, "learning_rate": 0.00018613126282324092, "loss": 0.2853, "step": 1353 }, { "epoch": 0.7589686098654709, "grad_norm": 0.07604892980998836, "learning_rate": 0.00018609810152302183, "loss": 0.2993, "step": 1354 }, { "epoch": 0.7595291479820628, "grad_norm": 0.07896616442677123, "learning_rate": 0.0001860649035857653, "loss": 0.3027, "step": 1355 }, { "epoch": 0.7600896860986547, "grad_norm": 0.07940682063157535, "learning_rate": 0.00018603166902559783, "loss": 0.2941, "step": 1356 }, { "epoch": 0.7606502242152466, "grad_norm": 0.07279709841837333, "learning_rate": 0.00018599839785666172, "loss": 0.2784, "step": 1357 }, { "epoch": 0.7612107623318386, "grad_norm": 0.07287156515708634, "learning_rate": 0.00018596509009311473, "loss": 0.2955, "step": 1358 }, { "epoch": 0.7617713004484304, "grad_norm": 0.07910667735159985, "learning_rate": 0.00018593174574913014, "loss": 0.2918, "step": 1359 }, { "epoch": 0.7623318385650224, "grad_norm": 0.07535123141943245, "learning_rate": 0.00018589836483889687, "loss": 0.3043, "step": 1360 }, { "epoch": 0.7628923766816144, "grad_norm": 0.07079144742106978, "learning_rate": 0.00018586494737661942, "loss": 0.2906, "step": 1361 }, { "epoch": 0.7634529147982063, "grad_norm": 0.07646124561271408, "learning_rate": 0.0001858314933765178, "loss": 0.2753, "step": 1362 }, { "epoch": 0.7640134529147982, "grad_norm": 0.08000962596500587, "learning_rate": 0.00018579800285282758, "loss": 0.2888, "step": 1363 }, { "epoch": 0.7645739910313901, "grad_norm": 0.07737151093016979, "learning_rate": 0.00018576447581979984, "loss": 0.2866, "step": 1364 }, { "epoch": 0.7651345291479821, "grad_norm": 0.07922437567858846, "learning_rate": 0.00018573091229170125, "loss": 0.2859, "step": 1365 }, { "epoch": 0.765695067264574, "grad_norm": 0.07781993765796033, "learning_rate": 0.00018569731228281402, "loss": 0.293, "step": 1366 }, { "epoch": 0.7662556053811659, "grad_norm": 0.0786279322732407, "learning_rate": 0.00018566367580743578, "loss": 0.2894, "step": 1367 }, { "epoch": 0.7668161434977578, "grad_norm": 0.07748284406177973, "learning_rate": 0.0001856300028798798, "loss": 0.2869, "step": 1368 }, { "epoch": 0.7673766816143498, "grad_norm": 0.07679842315728289, "learning_rate": 0.00018559629351447477, "loss": 0.2971, "step": 1369 }, { "epoch": 0.7679372197309418, "grad_norm": 0.0736803367460062, "learning_rate": 0.00018556254772556497, "loss": 0.2841, "step": 1370 }, { "epoch": 0.7684977578475336, "grad_norm": 0.07769094809947598, "learning_rate": 0.0001855287655275101, "loss": 0.2985, "step": 1371 }, { "epoch": 0.7690582959641256, "grad_norm": 0.08724476283650384, "learning_rate": 0.0001854949469346854, "loss": 0.2887, "step": 1372 }, { "epoch": 0.7696188340807175, "grad_norm": 0.073526400118211, "learning_rate": 0.00018546109196148158, "loss": 0.2815, "step": 1373 }, { "epoch": 0.7701793721973094, "grad_norm": 0.07062244148529655, "learning_rate": 0.00018542720062230482, "loss": 0.2967, "step": 1374 }, { "epoch": 0.7707399103139013, "grad_norm": 0.07671357170768117, "learning_rate": 0.0001853932729315768, "loss": 0.2898, "step": 1375 }, { "epoch": 0.7713004484304933, "grad_norm": 0.07347117491199325, "learning_rate": 0.00018535930890373466, "loss": 0.2876, "step": 1376 }, { "epoch": 0.7718609865470852, "grad_norm": 0.07571055973685425, "learning_rate": 0.00018532530855323097, "loss": 0.2925, "step": 1377 }, { "epoch": 0.7724215246636771, "grad_norm": 0.07569006514744296, "learning_rate": 0.00018529127189453382, "loss": 0.2875, "step": 1378 }, { "epoch": 0.772982062780269, "grad_norm": 0.07964297912638683, "learning_rate": 0.00018525719894212675, "loss": 0.2993, "step": 1379 }, { "epoch": 0.773542600896861, "grad_norm": 0.0783041774718746, "learning_rate": 0.00018522308971050865, "loss": 0.2989, "step": 1380 }, { "epoch": 0.774103139013453, "grad_norm": 0.0758786416845156, "learning_rate": 0.0001851889442141939, "loss": 0.2894, "step": 1381 }, { "epoch": 0.7746636771300448, "grad_norm": 0.07541063650038844, "learning_rate": 0.00018515476246771232, "loss": 0.3034, "step": 1382 }, { "epoch": 0.7752242152466368, "grad_norm": 0.07650126337312455, "learning_rate": 0.0001851205444856092, "loss": 0.2902, "step": 1383 }, { "epoch": 0.7757847533632287, "grad_norm": 0.07588693687731386, "learning_rate": 0.00018508629028244519, "loss": 0.2958, "step": 1384 }, { "epoch": 0.7763452914798207, "grad_norm": 0.07726162355363445, "learning_rate": 0.00018505199987279634, "loss": 0.2828, "step": 1385 }, { "epoch": 0.7769058295964125, "grad_norm": 0.07688664333995268, "learning_rate": 0.00018501767327125417, "loss": 0.2927, "step": 1386 }, { "epoch": 0.7774663677130045, "grad_norm": 0.07463864822174102, "learning_rate": 0.00018498331049242553, "loss": 0.2824, "step": 1387 }, { "epoch": 0.7780269058295964, "grad_norm": 0.07270587460945237, "learning_rate": 0.00018494891155093274, "loss": 0.285, "step": 1388 }, { "epoch": 0.7785874439461884, "grad_norm": 0.07472118172733451, "learning_rate": 0.00018491447646141337, "loss": 0.2838, "step": 1389 }, { "epoch": 0.7791479820627802, "grad_norm": 0.07906313104393052, "learning_rate": 0.0001848800052385206, "loss": 0.2993, "step": 1390 }, { "epoch": 0.7797085201793722, "grad_norm": 0.08000296545660737, "learning_rate": 0.00018484549789692277, "loss": 0.3023, "step": 1391 }, { "epoch": 0.7802690582959642, "grad_norm": 0.08070960466410868, "learning_rate": 0.0001848109544513037, "loss": 0.2888, "step": 1392 }, { "epoch": 0.780829596412556, "grad_norm": 0.07772837035425759, "learning_rate": 0.00018477637491636254, "loss": 0.2924, "step": 1393 }, { "epoch": 0.781390134529148, "grad_norm": 0.07328464817430952, "learning_rate": 0.0001847417593068138, "loss": 0.2844, "step": 1394 }, { "epoch": 0.7819506726457399, "grad_norm": 0.07588594193230476, "learning_rate": 0.00018470710763738736, "loss": 0.3031, "step": 1395 }, { "epoch": 0.7825112107623319, "grad_norm": 0.07594699223222112, "learning_rate": 0.00018467241992282843, "loss": 0.2872, "step": 1396 }, { "epoch": 0.7830717488789237, "grad_norm": 0.0718626645875063, "learning_rate": 0.00018463769617789756, "loss": 0.2935, "step": 1397 }, { "epoch": 0.7836322869955157, "grad_norm": 0.0739120716985456, "learning_rate": 0.00018460293641737056, "loss": 0.2837, "step": 1398 }, { "epoch": 0.7841928251121076, "grad_norm": 0.07631399704508741, "learning_rate": 0.00018456814065603874, "loss": 0.2838, "step": 1399 }, { "epoch": 0.7847533632286996, "grad_norm": 0.07527887722291483, "learning_rate": 0.00018453330890870855, "loss": 0.2941, "step": 1400 }, { "epoch": 0.7853139013452914, "grad_norm": 0.07687638348874318, "learning_rate": 0.00018449844119020187, "loss": 0.2958, "step": 1401 }, { "epoch": 0.7858744394618834, "grad_norm": 0.0751731950384892, "learning_rate": 0.00018446353751535585, "loss": 0.2891, "step": 1402 }, { "epoch": 0.7864349775784754, "grad_norm": 0.0771625419362304, "learning_rate": 0.00018442859789902287, "loss": 0.2844, "step": 1403 }, { "epoch": 0.7869955156950673, "grad_norm": 0.07948109827598755, "learning_rate": 0.0001843936223560707, "loss": 0.2908, "step": 1404 }, { "epoch": 0.7875560538116592, "grad_norm": 0.07731481396768788, "learning_rate": 0.0001843586109013824, "loss": 0.2877, "step": 1405 }, { "epoch": 0.7881165919282511, "grad_norm": 0.07740649519667313, "learning_rate": 0.00018432356354985624, "loss": 0.2901, "step": 1406 }, { "epoch": 0.7886771300448431, "grad_norm": 0.07299887622297382, "learning_rate": 0.00018428848031640582, "loss": 0.2857, "step": 1407 }, { "epoch": 0.7892376681614349, "grad_norm": 0.07281704715766726, "learning_rate": 0.00018425336121596, "loss": 0.2789, "step": 1408 }, { "epoch": 0.7897982062780269, "grad_norm": 0.07442852265460735, "learning_rate": 0.00018421820626346287, "loss": 0.2862, "step": 1409 }, { "epoch": 0.7903587443946188, "grad_norm": 0.07253215159296206, "learning_rate": 0.0001841830154738738, "loss": 0.3008, "step": 1410 }, { "epoch": 0.7909192825112108, "grad_norm": 0.07303673507071831, "learning_rate": 0.00018414778886216744, "loss": 0.2745, "step": 1411 }, { "epoch": 0.7914798206278026, "grad_norm": 0.07633390907693668, "learning_rate": 0.00018411252644333362, "loss": 0.2841, "step": 1412 }, { "epoch": 0.7920403587443946, "grad_norm": 0.07473600112958628, "learning_rate": 0.0001840772282323774, "loss": 0.2941, "step": 1413 }, { "epoch": 0.7926008968609866, "grad_norm": 0.07371548412534, "learning_rate": 0.00018404189424431921, "loss": 0.2832, "step": 1414 }, { "epoch": 0.7931614349775785, "grad_norm": 0.07692758307826633, "learning_rate": 0.00018400652449419456, "loss": 0.2868, "step": 1415 }, { "epoch": 0.7937219730941704, "grad_norm": 0.07644275048553147, "learning_rate": 0.00018397111899705419, "loss": 0.2889, "step": 1416 }, { "epoch": 0.7942825112107623, "grad_norm": 0.07900690964527601, "learning_rate": 0.00018393567776796413, "loss": 0.3043, "step": 1417 }, { "epoch": 0.7948430493273543, "grad_norm": 0.07358862281444084, "learning_rate": 0.0001839002008220055, "loss": 0.2901, "step": 1418 }, { "epoch": 0.7954035874439462, "grad_norm": 0.07391016621207044, "learning_rate": 0.0001838646881742748, "loss": 0.2867, "step": 1419 }, { "epoch": 0.7959641255605381, "grad_norm": 0.07941176651737829, "learning_rate": 0.00018382913983988348, "loss": 0.2867, "step": 1420 }, { "epoch": 0.79652466367713, "grad_norm": 0.07572282829815266, "learning_rate": 0.00018379355583395842, "loss": 0.2934, "step": 1421 }, { "epoch": 0.797085201793722, "grad_norm": 0.074960350680707, "learning_rate": 0.00018375793617164145, "loss": 0.309, "step": 1422 }, { "epoch": 0.797645739910314, "grad_norm": 0.07575566300387161, "learning_rate": 0.00018372228086808979, "loss": 0.2862, "step": 1423 }, { "epoch": 0.7982062780269058, "grad_norm": 0.07308428000549903, "learning_rate": 0.00018368658993847566, "loss": 0.2909, "step": 1424 }, { "epoch": 0.7987668161434978, "grad_norm": 0.07549301760709565, "learning_rate": 0.0001836508633979865, "loss": 0.2864, "step": 1425 }, { "epoch": 0.7993273542600897, "grad_norm": 0.0731274376673481, "learning_rate": 0.00018361510126182492, "loss": 0.2789, "step": 1426 }, { "epoch": 0.7998878923766816, "grad_norm": 0.07738565135023746, "learning_rate": 0.0001835793035452087, "loss": 0.3048, "step": 1427 }, { "epoch": 0.8004484304932735, "grad_norm": 0.07177820742743628, "learning_rate": 0.00018354347026337066, "loss": 0.2907, "step": 1428 }, { "epoch": 0.8010089686098655, "grad_norm": 0.07640689818899002, "learning_rate": 0.00018350760143155884, "loss": 0.2863, "step": 1429 }, { "epoch": 0.8015695067264574, "grad_norm": 0.07672084445856638, "learning_rate": 0.0001834716970650364, "loss": 0.2965, "step": 1430 }, { "epoch": 0.8021300448430493, "grad_norm": 0.07627208815081751, "learning_rate": 0.00018343575717908158, "loss": 0.296, "step": 1431 }, { "epoch": 0.8026905829596412, "grad_norm": 0.07380746132396154, "learning_rate": 0.0001833997817889878, "loss": 0.2719, "step": 1432 }, { "epoch": 0.8032511210762332, "grad_norm": 0.07973900555459396, "learning_rate": 0.00018336377091006351, "loss": 0.2961, "step": 1433 }, { "epoch": 0.8038116591928252, "grad_norm": 0.07299240702694566, "learning_rate": 0.00018332772455763232, "loss": 0.2886, "step": 1434 }, { "epoch": 0.804372197309417, "grad_norm": 0.07489555184832908, "learning_rate": 0.00018329164274703287, "loss": 0.2934, "step": 1435 }, { "epoch": 0.804932735426009, "grad_norm": 0.0768283435332101, "learning_rate": 0.000183255525493619, "loss": 0.2975, "step": 1436 }, { "epoch": 0.8054932735426009, "grad_norm": 0.07530896474211601, "learning_rate": 0.00018321937281275951, "loss": 0.2772, "step": 1437 }, { "epoch": 0.8060538116591929, "grad_norm": 0.07343763669424976, "learning_rate": 0.00018318318471983837, "loss": 0.2905, "step": 1438 }, { "epoch": 0.8066143497757847, "grad_norm": 0.0759174954877864, "learning_rate": 0.00018314696123025454, "loss": 0.2756, "step": 1439 }, { "epoch": 0.8071748878923767, "grad_norm": 0.07954362881356519, "learning_rate": 0.0001831107023594221, "loss": 0.2906, "step": 1440 }, { "epoch": 0.8077354260089686, "grad_norm": 0.07722907000273242, "learning_rate": 0.00018307440812277017, "loss": 0.2927, "step": 1441 }, { "epoch": 0.8082959641255605, "grad_norm": 0.0761978990934437, "learning_rate": 0.0001830380785357429, "loss": 0.2854, "step": 1442 }, { "epoch": 0.8088565022421524, "grad_norm": 0.07192499399813279, "learning_rate": 0.00018300171361379953, "loss": 0.2751, "step": 1443 }, { "epoch": 0.8094170403587444, "grad_norm": 0.07672166540018124, "learning_rate": 0.00018296531337241425, "loss": 0.2867, "step": 1444 }, { "epoch": 0.8099775784753364, "grad_norm": 0.07432234888523945, "learning_rate": 0.0001829288778270764, "loss": 0.28, "step": 1445 }, { "epoch": 0.8105381165919282, "grad_norm": 0.07552370929739224, "learning_rate": 0.0001828924069932902, "loss": 0.3054, "step": 1446 }, { "epoch": 0.8110986547085202, "grad_norm": 0.07243138995072261, "learning_rate": 0.00018285590088657503, "loss": 0.2743, "step": 1447 }, { "epoch": 0.8116591928251121, "grad_norm": 0.0744121758908257, "learning_rate": 0.00018281935952246518, "loss": 0.2936, "step": 1448 }, { "epoch": 0.8122197309417041, "grad_norm": 0.07618031580109315, "learning_rate": 0.00018278278291650998, "loss": 0.2869, "step": 1449 }, { "epoch": 0.8127802690582959, "grad_norm": 0.07364536149850888, "learning_rate": 0.00018274617108427374, "loss": 0.2872, "step": 1450 }, { "epoch": 0.8133408071748879, "grad_norm": 0.0731242088928229, "learning_rate": 0.0001827095240413358, "loss": 0.2908, "step": 1451 }, { "epoch": 0.8139013452914798, "grad_norm": 0.07910289495768733, "learning_rate": 0.0001826728418032904, "loss": 0.2999, "step": 1452 }, { "epoch": 0.8144618834080718, "grad_norm": 0.07491422734436892, "learning_rate": 0.0001826361243857469, "loss": 0.2918, "step": 1453 }, { "epoch": 0.8150224215246636, "grad_norm": 0.07069684199857261, "learning_rate": 0.00018259937180432943, "loss": 0.2825, "step": 1454 }, { "epoch": 0.8155829596412556, "grad_norm": 0.07461081918013844, "learning_rate": 0.0001825625840746773, "loss": 0.2996, "step": 1455 }, { "epoch": 0.8161434977578476, "grad_norm": 0.07456817318255526, "learning_rate": 0.00018252576121244456, "loss": 0.291, "step": 1456 }, { "epoch": 0.8167040358744395, "grad_norm": 0.0752009880568552, "learning_rate": 0.00018248890323330037, "loss": 0.2879, "step": 1457 }, { "epoch": 0.8172645739910314, "grad_norm": 0.07976424882963153, "learning_rate": 0.00018245201015292884, "loss": 0.291, "step": 1458 }, { "epoch": 0.8178251121076233, "grad_norm": 0.07391074644209736, "learning_rate": 0.00018241508198702888, "loss": 0.2795, "step": 1459 }, { "epoch": 0.8183856502242153, "grad_norm": 0.0744088522851671, "learning_rate": 0.00018237811875131444, "loss": 0.2982, "step": 1460 }, { "epoch": 0.8189461883408071, "grad_norm": 0.07261544628193192, "learning_rate": 0.00018234112046151436, "loss": 0.28, "step": 1461 }, { "epoch": 0.8195067264573991, "grad_norm": 0.07734099041728819, "learning_rate": 0.00018230408713337242, "loss": 0.295, "step": 1462 }, { "epoch": 0.820067264573991, "grad_norm": 0.07363549616953434, "learning_rate": 0.00018226701878264724, "loss": 0.2817, "step": 1463 }, { "epoch": 0.820627802690583, "grad_norm": 0.07466610010605963, "learning_rate": 0.00018222991542511246, "loss": 0.2985, "step": 1464 }, { "epoch": 0.8211883408071748, "grad_norm": 0.07301472031476657, "learning_rate": 0.00018219277707655644, "loss": 0.281, "step": 1465 }, { "epoch": 0.8217488789237668, "grad_norm": 0.07369725422981575, "learning_rate": 0.00018215560375278264, "loss": 0.2927, "step": 1466 }, { "epoch": 0.8223094170403588, "grad_norm": 0.0720833906344199, "learning_rate": 0.00018211839546960928, "loss": 0.2913, "step": 1467 }, { "epoch": 0.8228699551569507, "grad_norm": 0.07282264057414481, "learning_rate": 0.00018208115224286947, "loss": 0.2887, "step": 1468 }, { "epoch": 0.8234304932735426, "grad_norm": 0.07209233132516016, "learning_rate": 0.0001820438740884111, "loss": 0.2936, "step": 1469 }, { "epoch": 0.8239910313901345, "grad_norm": 0.07431361686281528, "learning_rate": 0.00018200656102209718, "loss": 0.2798, "step": 1470 }, { "epoch": 0.8245515695067265, "grad_norm": 0.0729069874816868, "learning_rate": 0.00018196921305980532, "loss": 0.2888, "step": 1471 }, { "epoch": 0.8251121076233184, "grad_norm": 0.07514945154645199, "learning_rate": 0.0001819318302174281, "loss": 0.2898, "step": 1472 }, { "epoch": 0.8256726457399103, "grad_norm": 0.07526812354818167, "learning_rate": 0.00018189441251087292, "loss": 0.2896, "step": 1473 }, { "epoch": 0.8262331838565022, "grad_norm": 0.07367208114650646, "learning_rate": 0.00018185695995606195, "loss": 0.2906, "step": 1474 }, { "epoch": 0.8267937219730942, "grad_norm": 0.07347381591983008, "learning_rate": 0.00018181947256893234, "loss": 0.2877, "step": 1475 }, { "epoch": 0.827354260089686, "grad_norm": 0.07537713414133736, "learning_rate": 0.00018178195036543592, "loss": 0.2842, "step": 1476 }, { "epoch": 0.827914798206278, "grad_norm": 0.075994983600787, "learning_rate": 0.00018174439336153943, "loss": 0.2867, "step": 1477 }, { "epoch": 0.82847533632287, "grad_norm": 0.07483465441397896, "learning_rate": 0.00018170680157322434, "loss": 0.2858, "step": 1478 }, { "epoch": 0.8290358744394619, "grad_norm": 0.07805373208430955, "learning_rate": 0.00018166917501648695, "loss": 0.3151, "step": 1479 }, { "epoch": 0.8295964125560538, "grad_norm": 0.07482220546270187, "learning_rate": 0.00018163151370733838, "loss": 0.2926, "step": 1480 }, { "epoch": 0.8301569506726457, "grad_norm": 0.07368764143513033, "learning_rate": 0.00018159381766180452, "loss": 0.292, "step": 1481 }, { "epoch": 0.8307174887892377, "grad_norm": 0.08143484239387924, "learning_rate": 0.00018155608689592604, "loss": 0.2885, "step": 1482 }, { "epoch": 0.8312780269058296, "grad_norm": 0.07849511557171265, "learning_rate": 0.00018151832142575838, "loss": 0.2884, "step": 1483 }, { "epoch": 0.8318385650224215, "grad_norm": 0.0739668970466124, "learning_rate": 0.00018148052126737177, "loss": 0.29, "step": 1484 }, { "epoch": 0.8323991031390134, "grad_norm": 0.0741235222041159, "learning_rate": 0.00018144268643685118, "loss": 0.2897, "step": 1485 }, { "epoch": 0.8329596412556054, "grad_norm": 0.0751369067793912, "learning_rate": 0.00018140481695029634, "loss": 0.2708, "step": 1486 }, { "epoch": 0.8335201793721974, "grad_norm": 0.07468639268265492, "learning_rate": 0.0001813669128238217, "loss": 0.286, "step": 1487 }, { "epoch": 0.8340807174887892, "grad_norm": 0.07405511155465547, "learning_rate": 0.00018132897407355657, "loss": 0.275, "step": 1488 }, { "epoch": 0.8346412556053812, "grad_norm": 0.07183006998574253, "learning_rate": 0.00018129100071564476, "loss": 0.2774, "step": 1489 }, { "epoch": 0.8352017937219731, "grad_norm": 0.07455414281594977, "learning_rate": 0.00018125299276624504, "loss": 0.2878, "step": 1490 }, { "epoch": 0.8357623318385651, "grad_norm": 0.07342341772225446, "learning_rate": 0.0001812149502415308, "loss": 0.2942, "step": 1491 }, { "epoch": 0.8363228699551569, "grad_norm": 0.07247308389338286, "learning_rate": 0.00018117687315769007, "loss": 0.2736, "step": 1492 }, { "epoch": 0.8368834080717489, "grad_norm": 0.07331788336274414, "learning_rate": 0.00018113876153092576, "loss": 0.2924, "step": 1493 }, { "epoch": 0.8374439461883408, "grad_norm": 0.07636980653536268, "learning_rate": 0.00018110061537745536, "loss": 0.285, "step": 1494 }, { "epoch": 0.8380044843049327, "grad_norm": 0.07379623135697953, "learning_rate": 0.00018106243471351105, "loss": 0.2902, "step": 1495 }, { "epoch": 0.8385650224215246, "grad_norm": 0.07550751475407677, "learning_rate": 0.00018102421955533974, "loss": 0.298, "step": 1496 }, { "epoch": 0.8391255605381166, "grad_norm": 0.07360096778024856, "learning_rate": 0.00018098596991920297, "loss": 0.2967, "step": 1497 }, { "epoch": 0.8396860986547086, "grad_norm": 0.07242540875769851, "learning_rate": 0.000180947685821377, "loss": 0.2817, "step": 1498 }, { "epoch": 0.8402466367713004, "grad_norm": 0.07314544504553466, "learning_rate": 0.00018090936727815278, "loss": 0.2801, "step": 1499 }, { "epoch": 0.8408071748878924, "grad_norm": 0.07591653427559013, "learning_rate": 0.00018087101430583577, "loss": 0.296, "step": 1500 }, { "epoch": 0.8413677130044843, "grad_norm": 0.07416486332512015, "learning_rate": 0.00018083262692074627, "loss": 0.2748, "step": 1501 }, { "epoch": 0.8419282511210763, "grad_norm": 0.07317031892892478, "learning_rate": 0.00018079420513921913, "loss": 0.2779, "step": 1502 }, { "epoch": 0.8424887892376681, "grad_norm": 0.07485099910739548, "learning_rate": 0.00018075574897760376, "loss": 0.2821, "step": 1503 }, { "epoch": 0.8430493273542601, "grad_norm": 0.0733565558947233, "learning_rate": 0.00018071725845226436, "loss": 0.2899, "step": 1504 }, { "epoch": 0.843609865470852, "grad_norm": 0.07634656510574725, "learning_rate": 0.00018067873357957968, "loss": 0.2963, "step": 1505 }, { "epoch": 0.844170403587444, "grad_norm": 0.0716798945396636, "learning_rate": 0.00018064017437594303, "loss": 0.2887, "step": 1506 }, { "epoch": 0.8447309417040358, "grad_norm": 0.07494626920072107, "learning_rate": 0.0001806015808577624, "loss": 0.3005, "step": 1507 }, { "epoch": 0.8452914798206278, "grad_norm": 0.07371906529487111, "learning_rate": 0.0001805629530414604, "loss": 0.2945, "step": 1508 }, { "epoch": 0.8458520179372198, "grad_norm": 0.07331479262974581, "learning_rate": 0.00018052429094347411, "loss": 0.2838, "step": 1509 }, { "epoch": 0.8464125560538116, "grad_norm": 0.07540102259039154, "learning_rate": 0.00018048559458025537, "loss": 0.287, "step": 1510 }, { "epoch": 0.8469730941704036, "grad_norm": 0.07457926552589283, "learning_rate": 0.00018044686396827047, "loss": 0.2755, "step": 1511 }, { "epoch": 0.8475336322869955, "grad_norm": 0.07497185379846803, "learning_rate": 0.0001804080991240003, "loss": 0.2867, "step": 1512 }, { "epoch": 0.8480941704035875, "grad_norm": 0.07785367889006829, "learning_rate": 0.00018036930006394038, "loss": 0.2963, "step": 1513 }, { "epoch": 0.8486547085201793, "grad_norm": 0.07251561565241364, "learning_rate": 0.00018033046680460073, "loss": 0.2871, "step": 1514 }, { "epoch": 0.8492152466367713, "grad_norm": 0.07316362504655254, "learning_rate": 0.00018029159936250593, "loss": 0.2903, "step": 1515 }, { "epoch": 0.8497757847533632, "grad_norm": 0.07273848774249944, "learning_rate": 0.00018025269775419507, "loss": 0.2784, "step": 1516 }, { "epoch": 0.8503363228699552, "grad_norm": 0.07357435732694743, "learning_rate": 0.0001802137619962219, "loss": 0.293, "step": 1517 }, { "epoch": 0.850896860986547, "grad_norm": 0.07301027893663752, "learning_rate": 0.00018017479210515462, "loss": 0.2813, "step": 1518 }, { "epoch": 0.851457399103139, "grad_norm": 0.07417040740255613, "learning_rate": 0.0001801357880975759, "loss": 0.2815, "step": 1519 }, { "epoch": 0.852017937219731, "grad_norm": 0.07201234029862826, "learning_rate": 0.000180096749990083, "loss": 0.2837, "step": 1520 }, { "epoch": 0.8525784753363229, "grad_norm": 0.07654746052709169, "learning_rate": 0.00018005767779928768, "loss": 0.3035, "step": 1521 }, { "epoch": 0.8531390134529148, "grad_norm": 0.07559009198121788, "learning_rate": 0.00018001857154181626, "loss": 0.2953, "step": 1522 }, { "epoch": 0.8536995515695067, "grad_norm": 0.0742104436963455, "learning_rate": 0.00017997943123430936, "loss": 0.2733, "step": 1523 }, { "epoch": 0.8542600896860987, "grad_norm": 0.07313887641047588, "learning_rate": 0.00017994025689342235, "loss": 0.283, "step": 1524 }, { "epoch": 0.8548206278026906, "grad_norm": 0.07282347483864758, "learning_rate": 0.00017990104853582493, "loss": 0.2806, "step": 1525 }, { "epoch": 0.8553811659192825, "grad_norm": 0.07364438692256288, "learning_rate": 0.00017986180617820123, "loss": 0.2899, "step": 1526 }, { "epoch": 0.8559417040358744, "grad_norm": 0.07649758437982163, "learning_rate": 0.00017982252983725, "loss": 0.2906, "step": 1527 }, { "epoch": 0.8565022421524664, "grad_norm": 0.07423743401057138, "learning_rate": 0.00017978321952968434, "loss": 0.2868, "step": 1528 }, { "epoch": 0.8570627802690582, "grad_norm": 0.0758700713210416, "learning_rate": 0.00017974387527223184, "loss": 0.2981, "step": 1529 }, { "epoch": 0.8576233183856502, "grad_norm": 0.07786873063275755, "learning_rate": 0.00017970449708163452, "loss": 0.2919, "step": 1530 }, { "epoch": 0.8581838565022422, "grad_norm": 0.07565227507775568, "learning_rate": 0.0001796650849746489, "loss": 0.2949, "step": 1531 }, { "epoch": 0.8587443946188341, "grad_norm": 0.0762723363189425, "learning_rate": 0.00017962563896804578, "loss": 0.2958, "step": 1532 }, { "epoch": 0.859304932735426, "grad_norm": 0.07144478659817405, "learning_rate": 0.00017958615907861055, "loss": 0.2893, "step": 1533 }, { "epoch": 0.8598654708520179, "grad_norm": 0.07507825955322966, "learning_rate": 0.00017954664532314295, "loss": 0.2847, "step": 1534 }, { "epoch": 0.8604260089686099, "grad_norm": 0.07424681293346497, "learning_rate": 0.0001795070977184572, "loss": 0.2944, "step": 1535 }, { "epoch": 0.8609865470852018, "grad_norm": 0.07497557650932773, "learning_rate": 0.00017946751628138174, "loss": 0.2698, "step": 1536 }, { "epoch": 0.8615470852017937, "grad_norm": 0.07066603839213563, "learning_rate": 0.0001794279010287596, "loss": 0.2856, "step": 1537 }, { "epoch": 0.8621076233183856, "grad_norm": 0.07503927606305691, "learning_rate": 0.00017938825197744807, "loss": 0.2764, "step": 1538 }, { "epoch": 0.8626681614349776, "grad_norm": 0.07871173230887407, "learning_rate": 0.00017934856914431899, "loss": 0.2844, "step": 1539 }, { "epoch": 0.8632286995515696, "grad_norm": 0.07386649820562965, "learning_rate": 0.00017930885254625832, "loss": 0.2912, "step": 1540 }, { "epoch": 0.8637892376681614, "grad_norm": 0.07584706875354484, "learning_rate": 0.00017926910220016667, "loss": 0.3029, "step": 1541 }, { "epoch": 0.8643497757847534, "grad_norm": 0.07615015482798214, "learning_rate": 0.0001792293181229588, "loss": 0.2887, "step": 1542 }, { "epoch": 0.8649103139013453, "grad_norm": 0.07157336214839691, "learning_rate": 0.00017918950033156384, "loss": 0.283, "step": 1543 }, { "epoch": 0.8654708520179372, "grad_norm": 0.07478197968145954, "learning_rate": 0.00017914964884292544, "loss": 0.2822, "step": 1544 }, { "epoch": 0.8660313901345291, "grad_norm": 0.07670048948114096, "learning_rate": 0.0001791097636740014, "loss": 0.2895, "step": 1545 }, { "epoch": 0.8665919282511211, "grad_norm": 0.0756401758028447, "learning_rate": 0.0001790698448417639, "loss": 0.2807, "step": 1546 }, { "epoch": 0.867152466367713, "grad_norm": 0.07682176437281332, "learning_rate": 0.00017902989236319954, "loss": 0.29, "step": 1547 }, { "epoch": 0.8677130044843049, "grad_norm": 0.07527893824548262, "learning_rate": 0.0001789899062553091, "loss": 0.3, "step": 1548 }, { "epoch": 0.8682735426008968, "grad_norm": 0.0734658991643186, "learning_rate": 0.0001789498865351078, "loss": 0.2888, "step": 1549 }, { "epoch": 0.8688340807174888, "grad_norm": 0.07338358151768215, "learning_rate": 0.00017890983321962501, "loss": 0.2897, "step": 1550 }, { "epoch": 0.8693946188340808, "grad_norm": 0.07335106174051759, "learning_rate": 0.0001788697463259046, "loss": 0.2854, "step": 1551 }, { "epoch": 0.8699551569506726, "grad_norm": 0.07173539150981001, "learning_rate": 0.0001788296258710045, "loss": 0.2847, "step": 1552 }, { "epoch": 0.8705156950672646, "grad_norm": 0.07242438890257322, "learning_rate": 0.0001787894718719971, "loss": 0.2948, "step": 1553 }, { "epoch": 0.8710762331838565, "grad_norm": 0.0733302232690697, "learning_rate": 0.00017874928434596896, "loss": 0.2907, "step": 1554 }, { "epoch": 0.8716367713004485, "grad_norm": 0.07290729598111374, "learning_rate": 0.00017870906331002098, "loss": 0.2935, "step": 1555 }, { "epoch": 0.8721973094170403, "grad_norm": 0.07418692156886422, "learning_rate": 0.00017866880878126824, "loss": 0.2956, "step": 1556 }, { "epoch": 0.8727578475336323, "grad_norm": 0.07510192303785496, "learning_rate": 0.00017862852077684015, "loss": 0.2922, "step": 1557 }, { "epoch": 0.8733183856502242, "grad_norm": 0.07293741181847814, "learning_rate": 0.00017858819931388032, "loss": 0.2729, "step": 1558 }, { "epoch": 0.8738789237668162, "grad_norm": 0.0727239038880651, "learning_rate": 0.0001785478444095466, "loss": 0.2871, "step": 1559 }, { "epoch": 0.874439461883408, "grad_norm": 0.07136868555596285, "learning_rate": 0.0001785074560810111, "loss": 0.2824, "step": 1560 }, { "epoch": 0.875, "grad_norm": 0.07365218153638509, "learning_rate": 0.0001784670343454601, "loss": 0.2884, "step": 1561 }, { "epoch": 0.875560538116592, "grad_norm": 0.0744107435266324, "learning_rate": 0.00017842657922009415, "loss": 0.2941, "step": 1562 }, { "epoch": 0.8761210762331838, "grad_norm": 0.07692989521022398, "learning_rate": 0.00017838609072212794, "loss": 0.2856, "step": 1563 }, { "epoch": 0.8766816143497758, "grad_norm": 0.07615571543745912, "learning_rate": 0.00017834556886879045, "loss": 0.2987, "step": 1564 }, { "epoch": 0.8772421524663677, "grad_norm": 0.07303922127135919, "learning_rate": 0.00017830501367732484, "loss": 0.291, "step": 1565 }, { "epoch": 0.8778026905829597, "grad_norm": 0.07672307658101149, "learning_rate": 0.00017826442516498837, "loss": 0.2976, "step": 1566 }, { "epoch": 0.8783632286995515, "grad_norm": 0.07441023986700969, "learning_rate": 0.00017822380334905251, "loss": 0.2955, "step": 1567 }, { "epoch": 0.8789237668161435, "grad_norm": 0.07523563038497909, "learning_rate": 0.000178183148246803, "loss": 0.2719, "step": 1568 }, { "epoch": 0.8794843049327354, "grad_norm": 0.07415243357870113, "learning_rate": 0.00017814245987553962, "loss": 0.2809, "step": 1569 }, { "epoch": 0.8800448430493274, "grad_norm": 0.07525867364687405, "learning_rate": 0.00017810173825257635, "loss": 0.2736, "step": 1570 }, { "epoch": 0.8806053811659192, "grad_norm": 0.074790495663368, "learning_rate": 0.00017806098339524136, "loss": 0.2864, "step": 1571 }, { "epoch": 0.8811659192825112, "grad_norm": 0.07251174855671035, "learning_rate": 0.00017802019532087694, "loss": 0.2903, "step": 1572 }, { "epoch": 0.8817264573991032, "grad_norm": 0.07246612243866497, "learning_rate": 0.00017797937404683944, "loss": 0.2878, "step": 1573 }, { "epoch": 0.8822869955156951, "grad_norm": 0.0738823540378283, "learning_rate": 0.00017793851959049944, "loss": 0.3034, "step": 1574 }, { "epoch": 0.882847533632287, "grad_norm": 0.07284419168932472, "learning_rate": 0.00017789763196924163, "loss": 0.2848, "step": 1575 }, { "epoch": 0.8834080717488789, "grad_norm": 0.07194220334685288, "learning_rate": 0.00017785671120046473, "loss": 0.2915, "step": 1576 }, { "epoch": 0.8839686098654709, "grad_norm": 0.07103917207526902, "learning_rate": 0.00017781575730158164, "loss": 0.2676, "step": 1577 }, { "epoch": 0.8845291479820628, "grad_norm": 0.07243230180954226, "learning_rate": 0.00017777477029001933, "loss": 0.2903, "step": 1578 }, { "epoch": 0.8850896860986547, "grad_norm": 0.07703232437862763, "learning_rate": 0.00017773375018321886, "loss": 0.3024, "step": 1579 }, { "epoch": 0.8856502242152466, "grad_norm": 0.07412623943492334, "learning_rate": 0.00017769269699863542, "loss": 0.2859, "step": 1580 }, { "epoch": 0.8862107623318386, "grad_norm": 0.07321862402476478, "learning_rate": 0.00017765161075373816, "loss": 0.2807, "step": 1581 }, { "epoch": 0.8867713004484304, "grad_norm": 0.07472200715797371, "learning_rate": 0.00017761049146601047, "loss": 0.2852, "step": 1582 }, { "epoch": 0.8873318385650224, "grad_norm": 0.07208903613257223, "learning_rate": 0.00017756933915294963, "loss": 0.279, "step": 1583 }, { "epoch": 0.8878923766816144, "grad_norm": 0.07307170811980906, "learning_rate": 0.00017752815383206705, "loss": 0.2728, "step": 1584 }, { "epoch": 0.8884529147982063, "grad_norm": 0.07370855397672015, "learning_rate": 0.00017748693552088822, "loss": 0.2887, "step": 1585 }, { "epoch": 0.8890134529147982, "grad_norm": 0.07496967397687773, "learning_rate": 0.00017744568423695259, "loss": 0.295, "step": 1586 }, { "epoch": 0.8895739910313901, "grad_norm": 0.07189932754460883, "learning_rate": 0.0001774043999978137, "loss": 0.2905, "step": 1587 }, { "epoch": 0.8901345291479821, "grad_norm": 0.07126333198542388, "learning_rate": 0.00017736308282103908, "loss": 0.2844, "step": 1588 }, { "epoch": 0.890695067264574, "grad_norm": 0.0718213911978105, "learning_rate": 0.0001773217327242103, "loss": 0.2861, "step": 1589 }, { "epoch": 0.8912556053811659, "grad_norm": 0.07343198943939576, "learning_rate": 0.00017728034972492297, "loss": 0.3002, "step": 1590 }, { "epoch": 0.8918161434977578, "grad_norm": 0.07275212591986045, "learning_rate": 0.0001772389338407866, "loss": 0.2821, "step": 1591 }, { "epoch": 0.8923766816143498, "grad_norm": 0.07531081626251544, "learning_rate": 0.0001771974850894248, "loss": 0.2887, "step": 1592 }, { "epoch": 0.8929372197309418, "grad_norm": 0.07279085656429406, "learning_rate": 0.00017715600348847506, "loss": 0.2877, "step": 1593 }, { "epoch": 0.8934977578475336, "grad_norm": 0.07301727573273224, "learning_rate": 0.00017711448905558897, "loss": 0.2716, "step": 1594 }, { "epoch": 0.8940582959641256, "grad_norm": 0.0733169780867396, "learning_rate": 0.00017707294180843196, "loss": 0.287, "step": 1595 }, { "epoch": 0.8946188340807175, "grad_norm": 0.07721743970204584, "learning_rate": 0.00017703136176468355, "loss": 0.2905, "step": 1596 }, { "epoch": 0.8951793721973094, "grad_norm": 0.07462719746638331, "learning_rate": 0.0001769897489420371, "loss": 0.2942, "step": 1597 }, { "epoch": 0.8957399103139013, "grad_norm": 0.07088383450363162, "learning_rate": 0.00017694810335820008, "loss": 0.2832, "step": 1598 }, { "epoch": 0.8963004484304933, "grad_norm": 0.07341824885173188, "learning_rate": 0.0001769064250308937, "loss": 0.2951, "step": 1599 }, { "epoch": 0.8968609865470852, "grad_norm": 0.07462788854070883, "learning_rate": 0.0001768647139778532, "loss": 0.2829, "step": 1600 }, { "epoch": 0.8974215246636771, "grad_norm": 0.07150287687830631, "learning_rate": 0.0001768229702168278, "loss": 0.2885, "step": 1601 }, { "epoch": 0.897982062780269, "grad_norm": 0.07109980541289707, "learning_rate": 0.00017678119376558055, "loss": 0.2884, "step": 1602 }, { "epoch": 0.898542600896861, "grad_norm": 0.0724317268039019, "learning_rate": 0.00017673938464188847, "loss": 0.3012, "step": 1603 }, { "epoch": 0.899103139013453, "grad_norm": 0.0722355061570106, "learning_rate": 0.00017669754286354241, "loss": 0.293, "step": 1604 }, { "epoch": 0.8996636771300448, "grad_norm": 0.07023265944433922, "learning_rate": 0.00017665566844834717, "loss": 0.2792, "step": 1605 }, { "epoch": 0.9002242152466368, "grad_norm": 0.07453291308877033, "learning_rate": 0.0001766137614141215, "loss": 0.3005, "step": 1606 }, { "epoch": 0.9007847533632287, "grad_norm": 0.0781899935366532, "learning_rate": 0.00017657182177869787, "loss": 0.2982, "step": 1607 }, { "epoch": 0.9013452914798207, "grad_norm": 0.07384604421356011, "learning_rate": 0.00017652984955992277, "loss": 0.2874, "step": 1608 }, { "epoch": 0.9019058295964125, "grad_norm": 0.07543178994690473, "learning_rate": 0.00017648784477565648, "loss": 0.2823, "step": 1609 }, { "epoch": 0.9024663677130045, "grad_norm": 0.07647822461198672, "learning_rate": 0.0001764458074437731, "loss": 0.2946, "step": 1610 }, { "epoch": 0.9030269058295964, "grad_norm": 0.07427298006381126, "learning_rate": 0.00017640373758216077, "loss": 0.2801, "step": 1611 }, { "epoch": 0.9035874439461884, "grad_norm": 0.07085855311195798, "learning_rate": 0.00017636163520872122, "loss": 0.2822, "step": 1612 }, { "epoch": 0.9041479820627802, "grad_norm": 0.07275776548540917, "learning_rate": 0.00017631950034137015, "loss": 0.2782, "step": 1613 }, { "epoch": 0.9047085201793722, "grad_norm": 0.07289352731265318, "learning_rate": 0.00017627733299803712, "loss": 0.2964, "step": 1614 }, { "epoch": 0.9052690582959642, "grad_norm": 0.07577448675281252, "learning_rate": 0.00017623513319666543, "loss": 0.2892, "step": 1615 }, { "epoch": 0.905829596412556, "grad_norm": 0.07269049085417104, "learning_rate": 0.0001761929009552122, "loss": 0.2931, "step": 1616 }, { "epoch": 0.906390134529148, "grad_norm": 0.07216742524982867, "learning_rate": 0.00017615063629164838, "loss": 0.2916, "step": 1617 }, { "epoch": 0.9069506726457399, "grad_norm": 0.07521766155167502, "learning_rate": 0.00017610833922395878, "loss": 0.2777, "step": 1618 }, { "epoch": 0.9075112107623319, "grad_norm": 0.07783378306532199, "learning_rate": 0.00017606600977014184, "loss": 0.297, "step": 1619 }, { "epoch": 0.9080717488789237, "grad_norm": 0.07533264268788593, "learning_rate": 0.0001760236479482099, "loss": 0.2859, "step": 1620 }, { "epoch": 0.9086322869955157, "grad_norm": 0.07778834560707679, "learning_rate": 0.00017598125377618905, "loss": 0.2865, "step": 1621 }, { "epoch": 0.9091928251121076, "grad_norm": 0.07532574463796318, "learning_rate": 0.00017593882727211916, "loss": 0.2877, "step": 1622 }, { "epoch": 0.9097533632286996, "grad_norm": 0.07550661399566316, "learning_rate": 0.00017589636845405376, "loss": 0.2888, "step": 1623 }, { "epoch": 0.9103139013452914, "grad_norm": 0.07215447422793152, "learning_rate": 0.00017585387734006034, "loss": 0.2877, "step": 1624 }, { "epoch": 0.9108744394618834, "grad_norm": 0.07394756569605147, "learning_rate": 0.0001758113539482199, "loss": 0.2784, "step": 1625 }, { "epoch": 0.9114349775784754, "grad_norm": 0.07481048058195179, "learning_rate": 0.00017576879829662732, "loss": 0.2859, "step": 1626 }, { "epoch": 0.9119955156950673, "grad_norm": 0.07495227012462921, "learning_rate": 0.00017572621040339113, "loss": 0.2835, "step": 1627 }, { "epoch": 0.9125560538116592, "grad_norm": 0.07215411798909717, "learning_rate": 0.00017568359028663364, "loss": 0.2848, "step": 1628 }, { "epoch": 0.9131165919282511, "grad_norm": 0.07477079461228578, "learning_rate": 0.00017564093796449087, "loss": 0.2922, "step": 1629 }, { "epoch": 0.9136771300448431, "grad_norm": 0.07122569878337474, "learning_rate": 0.00017559825345511243, "loss": 0.2841, "step": 1630 }, { "epoch": 0.9142376681614349, "grad_norm": 0.07319260835369037, "learning_rate": 0.00017555553677666184, "loss": 0.2829, "step": 1631 }, { "epoch": 0.9147982062780269, "grad_norm": 0.0730504020630298, "learning_rate": 0.00017551278794731607, "loss": 0.2879, "step": 1632 }, { "epoch": 0.9153587443946188, "grad_norm": 0.07413813123792716, "learning_rate": 0.00017547000698526596, "loss": 0.2905, "step": 1633 }, { "epoch": 0.9159192825112108, "grad_norm": 0.07395622659326352, "learning_rate": 0.00017542719390871593, "loss": 0.2998, "step": 1634 }, { "epoch": 0.9164798206278026, "grad_norm": 0.07323445118900361, "learning_rate": 0.00017538434873588408, "loss": 0.2828, "step": 1635 }, { "epoch": 0.9170403587443946, "grad_norm": 0.06841709465287436, "learning_rate": 0.0001753414714850022, "loss": 0.2903, "step": 1636 }, { "epoch": 0.9176008968609866, "grad_norm": 0.07413478072911547, "learning_rate": 0.00017529856217431567, "loss": 0.2839, "step": 1637 }, { "epoch": 0.9181614349775785, "grad_norm": 0.07754698812595993, "learning_rate": 0.00017525562082208355, "loss": 0.2876, "step": 1638 }, { "epoch": 0.9187219730941704, "grad_norm": 0.07327624458482546, "learning_rate": 0.00017521264744657856, "loss": 0.2837, "step": 1639 }, { "epoch": 0.9192825112107623, "grad_norm": 0.07558291745709474, "learning_rate": 0.00017516964206608696, "loss": 0.2863, "step": 1640 }, { "epoch": 0.9198430493273543, "grad_norm": 0.07558419350710056, "learning_rate": 0.0001751266046989087, "loss": 0.3001, "step": 1641 }, { "epoch": 0.9204035874439462, "grad_norm": 0.0782416546651362, "learning_rate": 0.0001750835353633574, "loss": 0.3039, "step": 1642 }, { "epoch": 0.9209641255605381, "grad_norm": 0.07546183397911319, "learning_rate": 0.00017504043407776015, "loss": 0.2862, "step": 1643 }, { "epoch": 0.92152466367713, "grad_norm": 0.0756112604900152, "learning_rate": 0.00017499730086045767, "loss": 0.3063, "step": 1644 }, { "epoch": 0.922085201793722, "grad_norm": 0.07215444054397309, "learning_rate": 0.00017495413572980435, "loss": 0.2893, "step": 1645 }, { "epoch": 0.922645739910314, "grad_norm": 0.07046996461513384, "learning_rate": 0.00017491093870416807, "loss": 0.2848, "step": 1646 }, { "epoch": 0.9232062780269058, "grad_norm": 0.07099289832624417, "learning_rate": 0.00017486770980193033, "loss": 0.2837, "step": 1647 }, { "epoch": 0.9237668161434978, "grad_norm": 0.07204428919620674, "learning_rate": 0.00017482444904148617, "loss": 0.282, "step": 1648 }, { "epoch": 0.9243273542600897, "grad_norm": 0.07432187501781345, "learning_rate": 0.00017478115644124423, "loss": 0.2943, "step": 1649 }, { "epoch": 0.9248878923766816, "grad_norm": 0.06906577445089956, "learning_rate": 0.00017473783201962665, "loss": 0.2882, "step": 1650 }, { "epoch": 0.9254484304932735, "grad_norm": 0.07260085951472438, "learning_rate": 0.00017469447579506907, "loss": 0.2848, "step": 1651 }, { "epoch": 0.9260089686098655, "grad_norm": 0.07075547934294112, "learning_rate": 0.0001746510877860208, "loss": 0.2852, "step": 1652 }, { "epoch": 0.9265695067264574, "grad_norm": 0.07525858681227156, "learning_rate": 0.00017460766801094454, "loss": 0.2809, "step": 1653 }, { "epoch": 0.9271300448430493, "grad_norm": 0.07234387482659316, "learning_rate": 0.00017456421648831655, "loss": 0.2876, "step": 1654 }, { "epoch": 0.9276905829596412, "grad_norm": 0.07487929033557668, "learning_rate": 0.0001745207332366267, "loss": 0.2899, "step": 1655 }, { "epoch": 0.9282511210762332, "grad_norm": 0.07259129422906563, "learning_rate": 0.0001744772182743782, "loss": 0.295, "step": 1656 }, { "epoch": 0.9288116591928252, "grad_norm": 0.07078856411822994, "learning_rate": 0.00017443367162008785, "loss": 0.2786, "step": 1657 }, { "epoch": 0.929372197309417, "grad_norm": 0.07194721171963898, "learning_rate": 0.00017439009329228586, "loss": 0.2639, "step": 1658 }, { "epoch": 0.929932735426009, "grad_norm": 0.07178484282976831, "learning_rate": 0.00017434648330951605, "loss": 0.2871, "step": 1659 }, { "epoch": 0.9304932735426009, "grad_norm": 0.07340063223561963, "learning_rate": 0.0001743028416903356, "loss": 0.2806, "step": 1660 }, { "epoch": 0.9310538116591929, "grad_norm": 0.07566542410927178, "learning_rate": 0.00017425916845331517, "loss": 0.2896, "step": 1661 }, { "epoch": 0.9316143497757847, "grad_norm": 0.07486497673865919, "learning_rate": 0.0001742154636170389, "loss": 0.2911, "step": 1662 }, { "epoch": 0.9321748878923767, "grad_norm": 0.07511359840039579, "learning_rate": 0.00017417172720010434, "loss": 0.2989, "step": 1663 }, { "epoch": 0.9327354260089686, "grad_norm": 0.07557817479695081, "learning_rate": 0.00017412795922112253, "loss": 0.2752, "step": 1664 }, { "epoch": 0.9332959641255605, "grad_norm": 0.07461501444770283, "learning_rate": 0.0001740841596987179, "loss": 0.2875, "step": 1665 }, { "epoch": 0.9338565022421524, "grad_norm": 0.0693166526056427, "learning_rate": 0.00017404032865152834, "loss": 0.2842, "step": 1666 }, { "epoch": 0.9344170403587444, "grad_norm": 0.07322495815613736, "learning_rate": 0.00017399646609820505, "loss": 0.2825, "step": 1667 }, { "epoch": 0.9349775784753364, "grad_norm": 0.07665243071009871, "learning_rate": 0.0001739525720574128, "loss": 0.2915, "step": 1668 }, { "epoch": 0.9355381165919282, "grad_norm": 0.07737383882355353, "learning_rate": 0.00017390864654782964, "loss": 0.2998, "step": 1669 }, { "epoch": 0.9360986547085202, "grad_norm": 0.0737494468927531, "learning_rate": 0.00017386468958814706, "loss": 0.2878, "step": 1670 }, { "epoch": 0.9366591928251121, "grad_norm": 0.0714066787744356, "learning_rate": 0.00017382070119706988, "loss": 0.2813, "step": 1671 }, { "epoch": 0.9372197309417041, "grad_norm": 0.07043150060159434, "learning_rate": 0.0001737766813933164, "loss": 0.2784, "step": 1672 }, { "epoch": 0.9377802690582959, "grad_norm": 0.07227826616766787, "learning_rate": 0.00017373263019561814, "loss": 0.2836, "step": 1673 }, { "epoch": 0.9383408071748879, "grad_norm": 0.0733661404805429, "learning_rate": 0.00017368854762272014, "loss": 0.2799, "step": 1674 }, { "epoch": 0.9389013452914798, "grad_norm": 0.07071892951110591, "learning_rate": 0.00017364443369338064, "loss": 0.2678, "step": 1675 }, { "epoch": 0.9394618834080718, "grad_norm": 0.07470850463250955, "learning_rate": 0.0001736002884263713, "loss": 0.2949, "step": 1676 }, { "epoch": 0.9400224215246636, "grad_norm": 0.07278768556263866, "learning_rate": 0.00017355611184047718, "loss": 0.274, "step": 1677 }, { "epoch": 0.9405829596412556, "grad_norm": 0.07258556909282828, "learning_rate": 0.00017351190395449651, "loss": 0.282, "step": 1678 }, { "epoch": 0.9411434977578476, "grad_norm": 0.07444418686302096, "learning_rate": 0.000173467664787241, "loss": 0.2888, "step": 1679 }, { "epoch": 0.9417040358744395, "grad_norm": 0.07728184646618526, "learning_rate": 0.00017342339435753553, "loss": 0.3073, "step": 1680 }, { "epoch": 0.9422645739910314, "grad_norm": 0.07412477436974288, "learning_rate": 0.00017337909268421835, "loss": 0.2878, "step": 1681 }, { "epoch": 0.9428251121076233, "grad_norm": 0.0741828737276978, "learning_rate": 0.00017333475978614107, "loss": 0.301, "step": 1682 }, { "epoch": 0.9433856502242153, "grad_norm": 0.07254070658542294, "learning_rate": 0.00017329039568216844, "loss": 0.2745, "step": 1683 }, { "epoch": 0.9439461883408071, "grad_norm": 0.07217470374090927, "learning_rate": 0.00017324600039117863, "loss": 0.2835, "step": 1684 }, { "epoch": 0.9445067264573991, "grad_norm": 0.07196992715424201, "learning_rate": 0.00017320157393206298, "loss": 0.2831, "step": 1685 }, { "epoch": 0.945067264573991, "grad_norm": 0.07168226957448584, "learning_rate": 0.00017315711632372613, "loss": 0.2803, "step": 1686 }, { "epoch": 0.945627802690583, "grad_norm": 0.07358645350177724, "learning_rate": 0.000173112627585086, "loss": 0.2819, "step": 1687 }, { "epoch": 0.9461883408071748, "grad_norm": 0.0748629407326928, "learning_rate": 0.00017306810773507376, "loss": 0.2838, "step": 1688 }, { "epoch": 0.9467488789237668, "grad_norm": 0.07535040596126334, "learning_rate": 0.00017302355679263377, "loss": 0.2957, "step": 1689 }, { "epoch": 0.9473094170403588, "grad_norm": 0.07008243670534321, "learning_rate": 0.0001729789747767236, "loss": 0.2927, "step": 1690 }, { "epoch": 0.9478699551569507, "grad_norm": 0.0729678214876866, "learning_rate": 0.00017293436170631415, "loss": 0.2896, "step": 1691 }, { "epoch": 0.9484304932735426, "grad_norm": 0.07339696701392524, "learning_rate": 0.00017288971760038942, "loss": 0.289, "step": 1692 }, { "epoch": 0.9489910313901345, "grad_norm": 0.07197335218511028, "learning_rate": 0.00017284504247794667, "loss": 0.2933, "step": 1693 }, { "epoch": 0.9495515695067265, "grad_norm": 0.07378140210096644, "learning_rate": 0.0001728003363579964, "loss": 0.2797, "step": 1694 }, { "epoch": 0.9501121076233184, "grad_norm": 0.07116653651425976, "learning_rate": 0.00017275559925956227, "loss": 0.2728, "step": 1695 }, { "epoch": 0.9506726457399103, "grad_norm": 0.06946675108565646, "learning_rate": 0.00017271083120168102, "loss": 0.2784, "step": 1696 }, { "epoch": 0.9512331838565022, "grad_norm": 0.07416590486296964, "learning_rate": 0.0001726660322034027, "loss": 0.2873, "step": 1697 }, { "epoch": 0.9517937219730942, "grad_norm": 0.07593399139838054, "learning_rate": 0.0001726212022837905, "loss": 0.2857, "step": 1698 }, { "epoch": 0.952354260089686, "grad_norm": 0.0727408230288732, "learning_rate": 0.00017257634146192072, "loss": 0.2691, "step": 1699 }, { "epoch": 0.952914798206278, "grad_norm": 0.07460940164156943, "learning_rate": 0.00017253144975688285, "loss": 0.2873, "step": 1700 }, { "epoch": 0.95347533632287, "grad_norm": 0.07429803178082536, "learning_rate": 0.0001724865271877795, "loss": 0.2869, "step": 1701 }, { "epoch": 0.9540358744394619, "grad_norm": 0.07134851108937684, "learning_rate": 0.00017244157377372638, "loss": 0.275, "step": 1702 }, { "epoch": 0.9545964125560538, "grad_norm": 0.06986109810489552, "learning_rate": 0.00017239658953385246, "loss": 0.2837, "step": 1703 }, { "epoch": 0.9551569506726457, "grad_norm": 0.06856388411678808, "learning_rate": 0.00017235157448729967, "loss": 0.2777, "step": 1704 }, { "epoch": 0.9557174887892377, "grad_norm": 0.07019980323520368, "learning_rate": 0.00017230652865322309, "loss": 0.2855, "step": 1705 }, { "epoch": 0.9562780269058296, "grad_norm": 0.07028149928057296, "learning_rate": 0.00017226145205079095, "loss": 0.2845, "step": 1706 }, { "epoch": 0.9568385650224215, "grad_norm": 0.0722077813669913, "learning_rate": 0.00017221634469918458, "loss": 0.2819, "step": 1707 }, { "epoch": 0.9573991031390134, "grad_norm": 0.07222438978417997, "learning_rate": 0.00017217120661759832, "loss": 0.2865, "step": 1708 }, { "epoch": 0.9579596412556054, "grad_norm": 0.07357381335046854, "learning_rate": 0.00017212603782523964, "loss": 0.2786, "step": 1709 }, { "epoch": 0.9585201793721974, "grad_norm": 0.07635328706561935, "learning_rate": 0.00017208083834132905, "loss": 0.2837, "step": 1710 }, { "epoch": 0.9590807174887892, "grad_norm": 0.07295089505628365, "learning_rate": 0.00017203560818510017, "loss": 0.2917, "step": 1711 }, { "epoch": 0.9596412556053812, "grad_norm": 0.07045358913278182, "learning_rate": 0.0001719903473757996, "loss": 0.2833, "step": 1712 }, { "epoch": 0.9602017937219731, "grad_norm": 0.07228807028747083, "learning_rate": 0.00017194505593268704, "loss": 0.2873, "step": 1713 }, { "epoch": 0.9607623318385651, "grad_norm": 0.07603221313096017, "learning_rate": 0.00017189973387503522, "loss": 0.2889, "step": 1714 }, { "epoch": 0.9613228699551569, "grad_norm": 0.07307077879941619, "learning_rate": 0.00017185438122212983, "loss": 0.2933, "step": 1715 }, { "epoch": 0.9618834080717489, "grad_norm": 0.07322017184361226, "learning_rate": 0.0001718089979932697, "loss": 0.2955, "step": 1716 }, { "epoch": 0.9624439461883408, "grad_norm": 0.07295962133338377, "learning_rate": 0.00017176358420776654, "loss": 0.2873, "step": 1717 }, { "epoch": 0.9630044843049327, "grad_norm": 0.07133986025044317, "learning_rate": 0.00017171813988494522, "loss": 0.2842, "step": 1718 }, { "epoch": 0.9635650224215246, "grad_norm": 0.07403222948184751, "learning_rate": 0.00017167266504414342, "loss": 0.2913, "step": 1719 }, { "epoch": 0.9641255605381166, "grad_norm": 0.07074358995016189, "learning_rate": 0.0001716271597047119, "loss": 0.2868, "step": 1720 }, { "epoch": 0.9646860986547086, "grad_norm": 0.07377182669211226, "learning_rate": 0.00017158162388601443, "loss": 0.283, "step": 1721 }, { "epoch": 0.9652466367713004, "grad_norm": 0.0703874372314276, "learning_rate": 0.00017153605760742777, "loss": 0.2791, "step": 1722 }, { "epoch": 0.9658071748878924, "grad_norm": 0.07113289112728188, "learning_rate": 0.00017149046088834146, "loss": 0.2754, "step": 1723 }, { "epoch": 0.9663677130044843, "grad_norm": 0.07261030259423171, "learning_rate": 0.0001714448337481582, "loss": 0.2946, "step": 1724 }, { "epoch": 0.9669282511210763, "grad_norm": 0.07566204586329739, "learning_rate": 0.00017139917620629356, "loss": 0.3112, "step": 1725 }, { "epoch": 0.9674887892376681, "grad_norm": 0.07371387803374542, "learning_rate": 0.000171353488282176, "loss": 0.2928, "step": 1726 }, { "epoch": 0.9680493273542601, "grad_norm": 0.07108812790567268, "learning_rate": 0.00017130776999524697, "loss": 0.289, "step": 1727 }, { "epoch": 0.968609865470852, "grad_norm": 0.07129036585172886, "learning_rate": 0.0001712620213649608, "loss": 0.2923, "step": 1728 }, { "epoch": 0.969170403587444, "grad_norm": 0.07419267575581269, "learning_rate": 0.00017121624241078477, "loss": 0.2817, "step": 1729 }, { "epoch": 0.9697309417040358, "grad_norm": 0.06964552695593264, "learning_rate": 0.0001711704331521991, "loss": 0.2839, "step": 1730 }, { "epoch": 0.9702914798206278, "grad_norm": 0.0704547099569811, "learning_rate": 0.00017112459360869674, "loss": 0.2908, "step": 1731 }, { "epoch": 0.9708520179372198, "grad_norm": 0.07062667985179868, "learning_rate": 0.00017107872379978374, "loss": 0.2902, "step": 1732 }, { "epoch": 0.9714125560538116, "grad_norm": 0.0749374524137223, "learning_rate": 0.00017103282374497883, "loss": 0.2826, "step": 1733 }, { "epoch": 0.9719730941704036, "grad_norm": 0.07667026279177375, "learning_rate": 0.0001709868934638138, "loss": 0.2977, "step": 1734 }, { "epoch": 0.9725336322869955, "grad_norm": 0.07429141839562514, "learning_rate": 0.00017094093297583316, "loss": 0.293, "step": 1735 }, { "epoch": 0.9730941704035875, "grad_norm": 0.07384385025776573, "learning_rate": 0.00017089494230059432, "loss": 0.2769, "step": 1736 }, { "epoch": 0.9736547085201793, "grad_norm": 0.07170487252525123, "learning_rate": 0.00017084892145766755, "loss": 0.2964, "step": 1737 }, { "epoch": 0.9742152466367713, "grad_norm": 0.07015361855556726, "learning_rate": 0.00017080287046663596, "loss": 0.2792, "step": 1738 }, { "epoch": 0.9747757847533632, "grad_norm": 0.06983521640448365, "learning_rate": 0.00017075678934709543, "loss": 0.2738, "step": 1739 }, { "epoch": 0.9753363228699552, "grad_norm": 0.07268793667525557, "learning_rate": 0.00017071067811865476, "loss": 0.291, "step": 1740 }, { "epoch": 0.975896860986547, "grad_norm": 0.0695631561511414, "learning_rate": 0.00017066453680093547, "loss": 0.2709, "step": 1741 }, { "epoch": 0.976457399103139, "grad_norm": 0.06951619746345279, "learning_rate": 0.00017061836541357192, "loss": 0.2691, "step": 1742 }, { "epoch": 0.977017937219731, "grad_norm": 0.07315230646262866, "learning_rate": 0.0001705721639762113, "loss": 0.2909, "step": 1743 }, { "epoch": 0.9775784753363229, "grad_norm": 0.07655221744584981, "learning_rate": 0.0001705259325085135, "loss": 0.2979, "step": 1744 }, { "epoch": 0.9781390134529148, "grad_norm": 0.07423686823518584, "learning_rate": 0.00017047967103015133, "loss": 0.2936, "step": 1745 }, { "epoch": 0.9786995515695067, "grad_norm": 0.07187254002922631, "learning_rate": 0.0001704333795608102, "loss": 0.2802, "step": 1746 }, { "epoch": 0.9792600896860987, "grad_norm": 0.07187169839889325, "learning_rate": 0.00017038705812018833, "loss": 0.2761, "step": 1747 }, { "epoch": 0.9798206278026906, "grad_norm": 0.07175327447886823, "learning_rate": 0.00017034070672799684, "loss": 0.2718, "step": 1748 }, { "epoch": 0.9803811659192825, "grad_norm": 0.07325962417801318, "learning_rate": 0.00017029432540395943, "loss": 0.282, "step": 1749 }, { "epoch": 0.9809417040358744, "grad_norm": 0.07220968769746226, "learning_rate": 0.00017024791416781257, "loss": 0.2868, "step": 1750 }, { "epoch": 0.9815022421524664, "grad_norm": 0.07092038680130973, "learning_rate": 0.00017020147303930554, "loss": 0.2911, "step": 1751 }, { "epoch": 0.9820627802690582, "grad_norm": 0.06939572948741551, "learning_rate": 0.00017015500203820022, "loss": 0.2689, "step": 1752 }, { "epoch": 0.9826233183856502, "grad_norm": 0.07479910602139032, "learning_rate": 0.00017010850118427125, "loss": 0.2915, "step": 1753 }, { "epoch": 0.9831838565022422, "grad_norm": 0.0768870712403653, "learning_rate": 0.00017006197049730602, "loss": 0.3008, "step": 1754 }, { "epoch": 0.9837443946188341, "grad_norm": 0.0720958557658177, "learning_rate": 0.00017001540999710458, "loss": 0.2901, "step": 1755 }, { "epoch": 0.984304932735426, "grad_norm": 0.07102148035823705, "learning_rate": 0.00016996881970347962, "loss": 0.2782, "step": 1756 }, { "epoch": 0.9848654708520179, "grad_norm": 0.07326873236410598, "learning_rate": 0.00016992219963625659, "loss": 0.2864, "step": 1757 }, { "epoch": 0.9854260089686099, "grad_norm": 0.07239571035831743, "learning_rate": 0.00016987554981527357, "loss": 0.2878, "step": 1758 }, { "epoch": 0.9859865470852018, "grad_norm": 0.06905207533713582, "learning_rate": 0.00016982887026038132, "loss": 0.2718, "step": 1759 }, { "epoch": 0.9865470852017937, "grad_norm": 0.0737159363857612, "learning_rate": 0.0001697821609914432, "loss": 0.2871, "step": 1760 }, { "epoch": 0.9871076233183856, "grad_norm": 0.07430574604340062, "learning_rate": 0.00016973542202833528, "loss": 0.2799, "step": 1761 }, { "epoch": 0.9876681614349776, "grad_norm": 0.07367466835976355, "learning_rate": 0.00016968865339094617, "loss": 0.2833, "step": 1762 }, { "epoch": 0.9882286995515696, "grad_norm": 0.07167264334942111, "learning_rate": 0.00016964185509917725, "loss": 0.2848, "step": 1763 }, { "epoch": 0.9887892376681614, "grad_norm": 0.07155343589224945, "learning_rate": 0.00016959502717294242, "loss": 0.2809, "step": 1764 }, { "epoch": 0.9893497757847534, "grad_norm": 0.07381396163039367, "learning_rate": 0.00016954816963216817, "loss": 0.2929, "step": 1765 }, { "epoch": 0.9899103139013453, "grad_norm": 0.07316072572490417, "learning_rate": 0.00016950128249679366, "loss": 0.2827, "step": 1766 }, { "epoch": 0.9904708520179372, "grad_norm": 0.06881598592091107, "learning_rate": 0.00016945436578677065, "loss": 0.2724, "step": 1767 }, { "epoch": 0.9910313901345291, "grad_norm": 0.0737642707714976, "learning_rate": 0.0001694074195220634, "loss": 0.2831, "step": 1768 }, { "epoch": 0.9915919282511211, "grad_norm": 0.07126875761505212, "learning_rate": 0.0001693604437226488, "loss": 0.2787, "step": 1769 }, { "epoch": 0.992152466367713, "grad_norm": 0.07055264246428153, "learning_rate": 0.00016931343840851634, "loss": 0.2782, "step": 1770 }, { "epoch": 0.9927130044843049, "grad_norm": 0.07393618688943292, "learning_rate": 0.00016926640359966807, "loss": 0.2862, "step": 1771 }, { "epoch": 0.9932735426008968, "grad_norm": 0.07555718134908355, "learning_rate": 0.0001692193393161184, "loss": 0.2927, "step": 1772 }, { "epoch": 0.9938340807174888, "grad_norm": 0.07570857481309304, "learning_rate": 0.0001691722455778946, "loss": 0.2824, "step": 1773 }, { "epoch": 0.9943946188340808, "grad_norm": 0.07465306801537522, "learning_rate": 0.00016912512240503625, "loss": 0.2947, "step": 1774 }, { "epoch": 0.9949551569506726, "grad_norm": 0.07004367926501669, "learning_rate": 0.0001690779698175955, "loss": 0.2744, "step": 1775 }, { "epoch": 0.9955156950672646, "grad_norm": 0.07297217404931539, "learning_rate": 0.000169030787835637, "loss": 0.2685, "step": 1776 }, { "epoch": 0.9960762331838565, "grad_norm": 0.07402597749753811, "learning_rate": 0.0001689835764792381, "loss": 0.3014, "step": 1777 }, { "epoch": 0.9966367713004485, "grad_norm": 0.06893864854981804, "learning_rate": 0.00016893633576848827, "loss": 0.2673, "step": 1778 }, { "epoch": 0.9971973094170403, "grad_norm": 0.07454637620033523, "learning_rate": 0.00016888906572348988, "loss": 0.3062, "step": 1779 }, { "epoch": 0.9977578475336323, "grad_norm": 0.06892855385802012, "learning_rate": 0.00016884176636435748, "loss": 0.2849, "step": 1780 }, { "epoch": 0.9983183856502242, "grad_norm": 0.07142555303088303, "learning_rate": 0.00016879443771121826, "loss": 0.2927, "step": 1781 }, { "epoch": 0.9988789237668162, "grad_norm": 0.06913628944171628, "learning_rate": 0.0001687470797842118, "loss": 0.2833, "step": 1782 }, { "epoch": 0.999439461883408, "grad_norm": 0.07329335793223181, "learning_rate": 0.00016869969260349018, "loss": 0.2758, "step": 1783 }, { "epoch": 1.0, "grad_norm": 0.0731286536565907, "learning_rate": 0.00016865227618921788, "loss": 0.2882, "step": 1784 }, { "epoch": 1.0, "eval_loss": 0.28521716594696045, "eval_runtime": 350.0541, "eval_samples_per_second": 34.329, "eval_steps_per_second": 1.074, "step": 1784 }, { "epoch": 1.0005605381165918, "grad_norm": 0.07293250489340956, "learning_rate": 0.00016860483056157187, "loss": 0.2731, "step": 1785 }, { "epoch": 1.001121076233184, "grad_norm": 0.07287987344760258, "learning_rate": 0.00016855735574074153, "loss": 0.2777, "step": 1786 }, { "epoch": 1.0016816143497758, "grad_norm": 0.07024587084667601, "learning_rate": 0.00016850985174692867, "loss": 0.2686, "step": 1787 }, { "epoch": 1.0022421524663676, "grad_norm": 0.07061542132913487, "learning_rate": 0.00016846231860034747, "loss": 0.2683, "step": 1788 }, { "epoch": 1.0028026905829597, "grad_norm": 0.07234478832965124, "learning_rate": 0.0001684147563212246, "loss": 0.2668, "step": 1789 }, { "epoch": 1.0033632286995515, "grad_norm": 0.07539927212010297, "learning_rate": 0.00016836716492979903, "loss": 0.2974, "step": 1790 }, { "epoch": 1.0039237668161436, "grad_norm": 0.07191466876978402, "learning_rate": 0.0001683195444463222, "loss": 0.2721, "step": 1791 }, { "epoch": 1.0044843049327354, "grad_norm": 0.07516331104667469, "learning_rate": 0.00016827189489105788, "loss": 0.2865, "step": 1792 }, { "epoch": 1.0050448430493273, "grad_norm": 0.07705362114824074, "learning_rate": 0.00016822421628428223, "loss": 0.2779, "step": 1793 }, { "epoch": 1.0056053811659194, "grad_norm": 0.07478865240503156, "learning_rate": 0.00016817650864628375, "loss": 0.2761, "step": 1794 }, { "epoch": 1.0061659192825112, "grad_norm": 0.0720000034295649, "learning_rate": 0.00016812877199736333, "loss": 0.2579, "step": 1795 }, { "epoch": 1.006726457399103, "grad_norm": 0.07435548623394256, "learning_rate": 0.00016808100635783423, "loss": 0.264, "step": 1796 }, { "epoch": 1.0072869955156951, "grad_norm": 0.07183268780358426, "learning_rate": 0.00016803321174802194, "loss": 0.2706, "step": 1797 }, { "epoch": 1.007847533632287, "grad_norm": 0.07248897595226891, "learning_rate": 0.00016798538818826435, "loss": 0.2815, "step": 1798 }, { "epoch": 1.008408071748879, "grad_norm": 0.0761853411440106, "learning_rate": 0.00016793753569891164, "loss": 0.274, "step": 1799 }, { "epoch": 1.0089686098654709, "grad_norm": 0.07087616772664414, "learning_rate": 0.00016788965430032638, "loss": 0.2587, "step": 1800 }, { "epoch": 1.0095291479820627, "grad_norm": 0.07533376703956196, "learning_rate": 0.00016784174401288335, "loss": 0.2822, "step": 1801 }, { "epoch": 1.0100896860986548, "grad_norm": 0.07790901551081032, "learning_rate": 0.00016779380485696966, "loss": 0.2876, "step": 1802 }, { "epoch": 1.0106502242152466, "grad_norm": 0.075412403632775, "learning_rate": 0.00016774583685298468, "loss": 0.2778, "step": 1803 }, { "epoch": 1.0112107623318385, "grad_norm": 0.0794674500124423, "learning_rate": 0.00016769784002134008, "loss": 0.2735, "step": 1804 }, { "epoch": 1.0117713004484306, "grad_norm": 0.07574994258448081, "learning_rate": 0.00016764981438245982, "loss": 0.2652, "step": 1805 }, { "epoch": 1.0123318385650224, "grad_norm": 0.07626438458999782, "learning_rate": 0.00016760175995678007, "loss": 0.2665, "step": 1806 }, { "epoch": 1.0128923766816142, "grad_norm": 0.07928434348537178, "learning_rate": 0.00016755367676474925, "loss": 0.295, "step": 1807 }, { "epoch": 1.0134529147982063, "grad_norm": 0.07586826519871802, "learning_rate": 0.00016750556482682805, "loss": 0.2882, "step": 1808 }, { "epoch": 1.0140134529147982, "grad_norm": 0.07201222606650312, "learning_rate": 0.0001674574241634894, "loss": 0.2734, "step": 1809 }, { "epoch": 1.0145739910313902, "grad_norm": 0.07214175221268386, "learning_rate": 0.00016740925479521846, "loss": 0.2801, "step": 1810 }, { "epoch": 1.015134529147982, "grad_norm": 0.0705783431084273, "learning_rate": 0.00016736105674251253, "loss": 0.2704, "step": 1811 }, { "epoch": 1.015695067264574, "grad_norm": 0.07063542893083063, "learning_rate": 0.0001673128300258812, "loss": 0.2742, "step": 1812 }, { "epoch": 1.016255605381166, "grad_norm": 0.06794217535359018, "learning_rate": 0.00016726457466584616, "loss": 0.2531, "step": 1813 }, { "epoch": 1.0168161434977578, "grad_norm": 0.0692309014154081, "learning_rate": 0.00016721629068294143, "loss": 0.2715, "step": 1814 }, { "epoch": 1.0173766816143497, "grad_norm": 0.07322768519959756, "learning_rate": 0.00016716797809771309, "loss": 0.2504, "step": 1815 }, { "epoch": 1.0179372197309418, "grad_norm": 0.07434577440239028, "learning_rate": 0.00016711963693071943, "loss": 0.2832, "step": 1816 }, { "epoch": 1.0184977578475336, "grad_norm": 0.07442519185698841, "learning_rate": 0.00016707126720253096, "loss": 0.2627, "step": 1817 }, { "epoch": 1.0190582959641254, "grad_norm": 0.07349019336629117, "learning_rate": 0.00016702286893373021, "loss": 0.2696, "step": 1818 }, { "epoch": 1.0196188340807175, "grad_norm": 0.07431849333936506, "learning_rate": 0.000166974442144912, "loss": 0.2685, "step": 1819 }, { "epoch": 1.0201793721973094, "grad_norm": 0.07372154678436714, "learning_rate": 0.00016692598685668318, "loss": 0.2634, "step": 1820 }, { "epoch": 1.0207399103139014, "grad_norm": 0.07412288889160065, "learning_rate": 0.00016687750308966277, "loss": 0.2812, "step": 1821 }, { "epoch": 1.0213004484304933, "grad_norm": 0.07447017805279436, "learning_rate": 0.0001668289908644819, "loss": 0.2831, "step": 1822 }, { "epoch": 1.0218609865470851, "grad_norm": 0.07509167912997043, "learning_rate": 0.00016678045020178386, "loss": 0.2756, "step": 1823 }, { "epoch": 1.0224215246636772, "grad_norm": 0.07329819074630305, "learning_rate": 0.00016673188112222394, "loss": 0.2805, "step": 1824 }, { "epoch": 1.022982062780269, "grad_norm": 0.07110181482666514, "learning_rate": 0.00016668328364646964, "loss": 0.2615, "step": 1825 }, { "epoch": 1.0235426008968609, "grad_norm": 0.07097793223431766, "learning_rate": 0.0001666346577952004, "loss": 0.2625, "step": 1826 }, { "epoch": 1.024103139013453, "grad_norm": 0.07347791538232488, "learning_rate": 0.0001665860035891079, "loss": 0.2671, "step": 1827 }, { "epoch": 1.0246636771300448, "grad_norm": 0.07439211790067399, "learning_rate": 0.00016653732104889572, "loss": 0.285, "step": 1828 }, { "epoch": 1.0252242152466369, "grad_norm": 0.0715778928034537, "learning_rate": 0.00016648861019527965, "loss": 0.2617, "step": 1829 }, { "epoch": 1.0257847533632287, "grad_norm": 0.07373282835850924, "learning_rate": 0.0001664398710489874, "loss": 0.2626, "step": 1830 }, { "epoch": 1.0263452914798206, "grad_norm": 0.07377690998400391, "learning_rate": 0.00016639110363075884, "loss": 0.2733, "step": 1831 }, { "epoch": 1.0269058295964126, "grad_norm": 0.07729672713511841, "learning_rate": 0.00016634230796134576, "loss": 0.2838, "step": 1832 }, { "epoch": 1.0274663677130045, "grad_norm": 0.07282556514626554, "learning_rate": 0.000166293484061512, "loss": 0.2799, "step": 1833 }, { "epoch": 1.0280269058295963, "grad_norm": 0.07403167397836082, "learning_rate": 0.00016624463195203347, "loss": 0.2679, "step": 1834 }, { "epoch": 1.0285874439461884, "grad_norm": 0.07215275103758523, "learning_rate": 0.00016619575165369805, "loss": 0.2581, "step": 1835 }, { "epoch": 1.0291479820627802, "grad_norm": 0.07344180895601204, "learning_rate": 0.0001661468431873056, "loss": 0.2795, "step": 1836 }, { "epoch": 1.029708520179372, "grad_norm": 0.07432497378747537, "learning_rate": 0.00016609790657366798, "loss": 0.2837, "step": 1837 }, { "epoch": 1.0302690582959642, "grad_norm": 0.07441357225587134, "learning_rate": 0.000166048941833609, "loss": 0.2717, "step": 1838 }, { "epoch": 1.030829596412556, "grad_norm": 0.07590935476605204, "learning_rate": 0.00016599994898796444, "loss": 0.2851, "step": 1839 }, { "epoch": 1.031390134529148, "grad_norm": 0.07354179140119582, "learning_rate": 0.0001659509280575821, "loss": 0.2688, "step": 1840 }, { "epoch": 1.03195067264574, "grad_norm": 0.07439261859279374, "learning_rate": 0.00016590187906332176, "loss": 0.2727, "step": 1841 }, { "epoch": 1.0325112107623318, "grad_norm": 0.07342703110129624, "learning_rate": 0.00016585280202605497, "loss": 0.2737, "step": 1842 }, { "epoch": 1.0330717488789238, "grad_norm": 0.07525213264906828, "learning_rate": 0.00016580369696666533, "loss": 0.2732, "step": 1843 }, { "epoch": 1.0336322869955157, "grad_norm": 0.07665551497568929, "learning_rate": 0.0001657545639060484, "loss": 0.2717, "step": 1844 }, { "epoch": 1.0341928251121075, "grad_norm": 0.07479429520952934, "learning_rate": 0.0001657054028651116, "loss": 0.2826, "step": 1845 }, { "epoch": 1.0347533632286996, "grad_norm": 0.07161623450510525, "learning_rate": 0.00016565621386477423, "loss": 0.2676, "step": 1846 }, { "epoch": 1.0353139013452914, "grad_norm": 0.07600791631557516, "learning_rate": 0.0001656069969259675, "loss": 0.2631, "step": 1847 }, { "epoch": 1.0358744394618835, "grad_norm": 0.07376485296076876, "learning_rate": 0.0001655577520696346, "loss": 0.2741, "step": 1848 }, { "epoch": 1.0364349775784754, "grad_norm": 0.0728118805040438, "learning_rate": 0.0001655084793167305, "loss": 0.2683, "step": 1849 }, { "epoch": 1.0369955156950672, "grad_norm": 0.07729576875042328, "learning_rate": 0.00016545917868822203, "loss": 0.2757, "step": 1850 }, { "epoch": 1.0375560538116593, "grad_norm": 0.07378878336261326, "learning_rate": 0.000165409850205088, "loss": 0.2717, "step": 1851 }, { "epoch": 1.0381165919282511, "grad_norm": 0.07242644750935845, "learning_rate": 0.00016536049388831894, "loss": 0.2782, "step": 1852 }, { "epoch": 1.038677130044843, "grad_norm": 0.07858958551614088, "learning_rate": 0.00016531110975891728, "loss": 0.2698, "step": 1853 }, { "epoch": 1.039237668161435, "grad_norm": 0.07400067409429323, "learning_rate": 0.00016526169783789732, "loss": 0.2685, "step": 1854 }, { "epoch": 1.0397982062780269, "grad_norm": 0.07421871003510273, "learning_rate": 0.00016521225814628506, "loss": 0.2791, "step": 1855 }, { "epoch": 1.0403587443946187, "grad_norm": 0.06969685301603386, "learning_rate": 0.00016516279070511854, "loss": 0.2709, "step": 1856 }, { "epoch": 1.0409192825112108, "grad_norm": 0.0702092212314942, "learning_rate": 0.0001651132955354474, "loss": 0.2663, "step": 1857 }, { "epoch": 1.0414798206278026, "grad_norm": 0.07554527085023427, "learning_rate": 0.00016506377265833314, "loss": 0.2837, "step": 1858 }, { "epoch": 1.0420403587443947, "grad_norm": 0.07409275626702215, "learning_rate": 0.00016501422209484908, "loss": 0.2735, "step": 1859 }, { "epoch": 1.0426008968609866, "grad_norm": 0.07507628014574805, "learning_rate": 0.0001649646438660803, "loss": 0.2788, "step": 1860 }, { "epoch": 1.0431614349775784, "grad_norm": 0.07450700255195067, "learning_rate": 0.0001649150379931237, "loss": 0.2868, "step": 1861 }, { "epoch": 1.0437219730941705, "grad_norm": 0.07140069146454657, "learning_rate": 0.00016486540449708783, "loss": 0.2744, "step": 1862 }, { "epoch": 1.0442825112107623, "grad_norm": 0.07658793289262474, "learning_rate": 0.0001648157433990931, "loss": 0.2829, "step": 1863 }, { "epoch": 1.0448430493273542, "grad_norm": 0.0766190372006443, "learning_rate": 0.00016476605472027172, "loss": 0.2869, "step": 1864 }, { "epoch": 1.0454035874439462, "grad_norm": 0.07550648904207007, "learning_rate": 0.00016471633848176738, "loss": 0.2711, "step": 1865 }, { "epoch": 1.045964125560538, "grad_norm": 0.07764537487796822, "learning_rate": 0.00016466659470473579, "loss": 0.295, "step": 1866 }, { "epoch": 1.0465246636771302, "grad_norm": 0.0744227562253243, "learning_rate": 0.0001646168234103442, "loss": 0.2734, "step": 1867 }, { "epoch": 1.047085201793722, "grad_norm": 0.07383161532210666, "learning_rate": 0.0001645670246197716, "loss": 0.2767, "step": 1868 }, { "epoch": 1.0476457399103138, "grad_norm": 0.07862854408338234, "learning_rate": 0.00016451719835420877, "loss": 0.2921, "step": 1869 }, { "epoch": 1.048206278026906, "grad_norm": 0.07240748787709966, "learning_rate": 0.0001644673446348581, "loss": 0.2779, "step": 1870 }, { "epoch": 1.0487668161434978, "grad_norm": 0.0728487907630601, "learning_rate": 0.00016441746348293363, "loss": 0.2881, "step": 1871 }, { "epoch": 1.0493273542600896, "grad_norm": 0.0696023534040059, "learning_rate": 0.00016436755491966115, "loss": 0.2517, "step": 1872 }, { "epoch": 1.0498878923766817, "grad_norm": 0.07345203238219637, "learning_rate": 0.00016431761896627806, "loss": 0.2612, "step": 1873 }, { "epoch": 1.0504484304932735, "grad_norm": 0.07513713484721089, "learning_rate": 0.0001642676556440335, "loss": 0.2793, "step": 1874 }, { "epoch": 1.0510089686098654, "grad_norm": 0.07452318706610823, "learning_rate": 0.00016421766497418816, "loss": 0.2749, "step": 1875 }, { "epoch": 1.0515695067264574, "grad_norm": 0.074578312418863, "learning_rate": 0.00016416764697801438, "loss": 0.2841, "step": 1876 }, { "epoch": 1.0521300448430493, "grad_norm": 0.0732401610664788, "learning_rate": 0.00016411760167679617, "loss": 0.2664, "step": 1877 }, { "epoch": 1.0526905829596414, "grad_norm": 0.07456699601331833, "learning_rate": 0.00016406752909182916, "loss": 0.2637, "step": 1878 }, { "epoch": 1.0532511210762332, "grad_norm": 0.07671286998164872, "learning_rate": 0.00016401742924442055, "loss": 0.2855, "step": 1879 }, { "epoch": 1.053811659192825, "grad_norm": 0.07677523499569627, "learning_rate": 0.00016396730215588915, "loss": 0.2869, "step": 1880 }, { "epoch": 1.0543721973094171, "grad_norm": 0.07277576355711973, "learning_rate": 0.00016391714784756538, "loss": 0.2755, "step": 1881 }, { "epoch": 1.054932735426009, "grad_norm": 0.07247938594655151, "learning_rate": 0.00016386696634079125, "loss": 0.2726, "step": 1882 }, { "epoch": 1.0554932735426008, "grad_norm": 0.07569041734695106, "learning_rate": 0.00016381675765692028, "loss": 0.2686, "step": 1883 }, { "epoch": 1.0560538116591929, "grad_norm": 0.07799774429884256, "learning_rate": 0.00016376652181731769, "loss": 0.2773, "step": 1884 }, { "epoch": 1.0566143497757847, "grad_norm": 0.07321293018316793, "learning_rate": 0.0001637162588433601, "loss": 0.2702, "step": 1885 }, { "epoch": 1.0571748878923768, "grad_norm": 0.07436428152382965, "learning_rate": 0.00016366596875643576, "loss": 0.2764, "step": 1886 }, { "epoch": 1.0577354260089686, "grad_norm": 0.0761426898434651, "learning_rate": 0.00016361565157794447, "loss": 0.277, "step": 1887 }, { "epoch": 1.0582959641255605, "grad_norm": 0.07208787257540157, "learning_rate": 0.0001635653073292975, "loss": 0.2701, "step": 1888 }, { "epoch": 1.0588565022421526, "grad_norm": 0.07687084904278584, "learning_rate": 0.00016351493603191766, "loss": 0.2796, "step": 1889 }, { "epoch": 1.0594170403587444, "grad_norm": 0.07593381504905147, "learning_rate": 0.0001634645377072393, "loss": 0.2749, "step": 1890 }, { "epoch": 1.0599775784753362, "grad_norm": 0.0727610271335173, "learning_rate": 0.00016341411237670827, "loss": 0.2664, "step": 1891 }, { "epoch": 1.0605381165919283, "grad_norm": 0.0753329496258447, "learning_rate": 0.00016336366006178187, "loss": 0.2714, "step": 1892 }, { "epoch": 1.0610986547085202, "grad_norm": 0.07624308074621539, "learning_rate": 0.0001633131807839289, "loss": 0.2917, "step": 1893 }, { "epoch": 1.061659192825112, "grad_norm": 0.07363716680200075, "learning_rate": 0.00016326267456462964, "loss": 0.2736, "step": 1894 }, { "epoch": 1.062219730941704, "grad_norm": 0.07894494493318059, "learning_rate": 0.00016321214142537584, "loss": 0.2745, "step": 1895 }, { "epoch": 1.062780269058296, "grad_norm": 0.07539024803923519, "learning_rate": 0.0001631615813876707, "loss": 0.2883, "step": 1896 }, { "epoch": 1.063340807174888, "grad_norm": 0.07434098765575238, "learning_rate": 0.00016311099447302886, "loss": 0.279, "step": 1897 }, { "epoch": 1.0639013452914798, "grad_norm": 0.07412931566846713, "learning_rate": 0.00016306038070297641, "loss": 0.2646, "step": 1898 }, { "epoch": 1.0644618834080717, "grad_norm": 0.0740234768281963, "learning_rate": 0.00016300974009905085, "loss": 0.2676, "step": 1899 }, { "epoch": 1.0650224215246638, "grad_norm": 0.07557766774395729, "learning_rate": 0.00016295907268280109, "loss": 0.2683, "step": 1900 }, { "epoch": 1.0655829596412556, "grad_norm": 0.07281936621870358, "learning_rate": 0.0001629083784757875, "loss": 0.262, "step": 1901 }, { "epoch": 1.0661434977578474, "grad_norm": 0.07508960385501345, "learning_rate": 0.0001628576574995818, "loss": 0.2735, "step": 1902 }, { "epoch": 1.0667040358744395, "grad_norm": 0.07507273932238448, "learning_rate": 0.0001628069097757671, "loss": 0.2723, "step": 1903 }, { "epoch": 1.0672645739910314, "grad_norm": 0.07377330757731597, "learning_rate": 0.0001627561353259379, "loss": 0.2571, "step": 1904 }, { "epoch": 1.0678251121076232, "grad_norm": 0.07402325427347224, "learning_rate": 0.00016270533417170015, "loss": 0.277, "step": 1905 }, { "epoch": 1.0683856502242153, "grad_norm": 0.07323515158476367, "learning_rate": 0.00016265450633467105, "loss": 0.2719, "step": 1906 }, { "epoch": 1.0689461883408071, "grad_norm": 0.07296175866704731, "learning_rate": 0.0001626036518364792, "loss": 0.2708, "step": 1907 }, { "epoch": 1.0695067264573992, "grad_norm": 0.07374203094566148, "learning_rate": 0.00016255277069876454, "loss": 0.2691, "step": 1908 }, { "epoch": 1.070067264573991, "grad_norm": 0.07203784919823066, "learning_rate": 0.00016250186294317835, "loss": 0.27, "step": 1909 }, { "epoch": 1.0706278026905829, "grad_norm": 0.0720051439648434, "learning_rate": 0.00016245092859138328, "loss": 0.2664, "step": 1910 }, { "epoch": 1.071188340807175, "grad_norm": 0.07352081307789217, "learning_rate": 0.0001623999676650532, "loss": 0.289, "step": 1911 }, { "epoch": 1.0717488789237668, "grad_norm": 0.0734531363351306, "learning_rate": 0.00016234898018587337, "loss": 0.2719, "step": 1912 }, { "epoch": 1.0723094170403586, "grad_norm": 0.0716115495065179, "learning_rate": 0.00016229796617554028, "loss": 0.2723, "step": 1913 }, { "epoch": 1.0728699551569507, "grad_norm": 0.07629494921644377, "learning_rate": 0.00016224692565576184, "loss": 0.2792, "step": 1914 }, { "epoch": 1.0734304932735426, "grad_norm": 0.0767134313243209, "learning_rate": 0.00016219585864825706, "loss": 0.2797, "step": 1915 }, { "epoch": 1.0739910313901346, "grad_norm": 0.07529608505081263, "learning_rate": 0.00016214476517475637, "loss": 0.2766, "step": 1916 }, { "epoch": 1.0745515695067265, "grad_norm": 0.07366744989513864, "learning_rate": 0.00016209364525700138, "loss": 0.2695, "step": 1917 }, { "epoch": 1.0751121076233183, "grad_norm": 0.07597379970681774, "learning_rate": 0.00016204249891674496, "loss": 0.2603, "step": 1918 }, { "epoch": 1.0756726457399104, "grad_norm": 0.07499576204333906, "learning_rate": 0.0001619913261757513, "loss": 0.2767, "step": 1919 }, { "epoch": 1.0762331838565022, "grad_norm": 0.07629421939936472, "learning_rate": 0.00016194012705579572, "loss": 0.2857, "step": 1920 }, { "epoch": 1.076793721973094, "grad_norm": 0.07384185136588617, "learning_rate": 0.00016188890157866484, "loss": 0.2859, "step": 1921 }, { "epoch": 1.0773542600896862, "grad_norm": 0.07211707283690752, "learning_rate": 0.0001618376497661564, "loss": 0.2737, "step": 1922 }, { "epoch": 1.077914798206278, "grad_norm": 0.07424252476059912, "learning_rate": 0.00016178637164007947, "loss": 0.2699, "step": 1923 }, { "epoch": 1.07847533632287, "grad_norm": 0.07580353014079456, "learning_rate": 0.00016173506722225428, "loss": 0.2649, "step": 1924 }, { "epoch": 1.079035874439462, "grad_norm": 0.07337276159961045, "learning_rate": 0.00016168373653451218, "loss": 0.279, "step": 1925 }, { "epoch": 1.0795964125560538, "grad_norm": 0.07704345689952927, "learning_rate": 0.00016163237959869578, "loss": 0.2751, "step": 1926 }, { "epoch": 1.0801569506726458, "grad_norm": 0.07165282706648543, "learning_rate": 0.00016158099643665878, "loss": 0.2726, "step": 1927 }, { "epoch": 1.0807174887892377, "grad_norm": 0.07313133774074929, "learning_rate": 0.00016152958707026614, "loss": 0.2687, "step": 1928 }, { "epoch": 1.0812780269058295, "grad_norm": 0.07463579158475091, "learning_rate": 0.00016147815152139385, "loss": 0.2736, "step": 1929 }, { "epoch": 1.0818385650224216, "grad_norm": 0.07633363811644278, "learning_rate": 0.00016142668981192917, "loss": 0.2728, "step": 1930 }, { "epoch": 1.0823991031390134, "grad_norm": 0.07215708677994055, "learning_rate": 0.00016137520196377042, "loss": 0.2689, "step": 1931 }, { "epoch": 1.0829596412556053, "grad_norm": 0.0728669146380796, "learning_rate": 0.00016132368799882704, "loss": 0.2599, "step": 1932 }, { "epoch": 1.0835201793721974, "grad_norm": 0.07403038728621991, "learning_rate": 0.00016127214793901958, "loss": 0.2776, "step": 1933 }, { "epoch": 1.0840807174887892, "grad_norm": 0.078488015679292, "learning_rate": 0.0001612205818062797, "loss": 0.268, "step": 1934 }, { "epoch": 1.0846412556053813, "grad_norm": 0.07270573932284943, "learning_rate": 0.0001611689896225502, "loss": 0.2606, "step": 1935 }, { "epoch": 1.0852017937219731, "grad_norm": 0.07633324067651374, "learning_rate": 0.00016111737140978494, "loss": 0.2733, "step": 1936 }, { "epoch": 1.085762331838565, "grad_norm": 0.07314871818107127, "learning_rate": 0.0001610657271899488, "loss": 0.2725, "step": 1937 }, { "epoch": 1.086322869955157, "grad_norm": 0.07436002582849571, "learning_rate": 0.00016101405698501782, "loss": 0.2733, "step": 1938 }, { "epoch": 1.0868834080717489, "grad_norm": 0.07258638981000642, "learning_rate": 0.000160962360816979, "loss": 0.2599, "step": 1939 }, { "epoch": 1.0874439461883407, "grad_norm": 0.0779396234130977, "learning_rate": 0.00016091063870783047, "loss": 0.2828, "step": 1940 }, { "epoch": 1.0880044843049328, "grad_norm": 0.07387292587429643, "learning_rate": 0.00016085889067958136, "loss": 0.2579, "step": 1941 }, { "epoch": 1.0885650224215246, "grad_norm": 0.07477688016800593, "learning_rate": 0.0001608071167542518, "loss": 0.2773, "step": 1942 }, { "epoch": 1.0891255605381165, "grad_norm": 0.0716554336645514, "learning_rate": 0.00016075531695387303, "loss": 0.2573, "step": 1943 }, { "epoch": 1.0896860986547086, "grad_norm": 0.07642593341952433, "learning_rate": 0.00016070349130048724, "loss": 0.2862, "step": 1944 }, { "epoch": 1.0902466367713004, "grad_norm": 0.07517365458647388, "learning_rate": 0.00016065163981614764, "loss": 0.2798, "step": 1945 }, { "epoch": 1.0908071748878925, "grad_norm": 0.07694130433057056, "learning_rate": 0.00016059976252291835, "loss": 0.2821, "step": 1946 }, { "epoch": 1.0913677130044843, "grad_norm": 0.07370821250050272, "learning_rate": 0.0001605478594428746, "loss": 0.2774, "step": 1947 }, { "epoch": 1.0919282511210762, "grad_norm": 0.07303370270961763, "learning_rate": 0.00016049593059810248, "loss": 0.2793, "step": 1948 }, { "epoch": 1.0924887892376682, "grad_norm": 0.07339460012365612, "learning_rate": 0.00016044397601069918, "loss": 0.2707, "step": 1949 }, { "epoch": 1.09304932735426, "grad_norm": 0.08404334083821938, "learning_rate": 0.0001603919957027727, "loss": 0.2785, "step": 1950 }, { "epoch": 1.093609865470852, "grad_norm": 0.0769906958486627, "learning_rate": 0.00016033998969644205, "loss": 0.285, "step": 1951 }, { "epoch": 1.094170403587444, "grad_norm": 0.07644487362969105, "learning_rate": 0.00016028795801383718, "loss": 0.2911, "step": 1952 }, { "epoch": 1.0947309417040358, "grad_norm": 0.0724238461624434, "learning_rate": 0.00016023590067709898, "loss": 0.2783, "step": 1953 }, { "epoch": 1.0952914798206277, "grad_norm": 0.0740558866631432, "learning_rate": 0.00016018381770837922, "loss": 0.2789, "step": 1954 }, { "epoch": 1.0958520179372198, "grad_norm": 0.07226000369479146, "learning_rate": 0.00016013170912984058, "loss": 0.2719, "step": 1955 }, { "epoch": 1.0964125560538116, "grad_norm": 0.07753612392525226, "learning_rate": 0.00016007957496365667, "loss": 0.2772, "step": 1956 }, { "epoch": 1.0969730941704037, "grad_norm": 0.07491630542893295, "learning_rate": 0.00016002741523201195, "loss": 0.2764, "step": 1957 }, { "epoch": 1.0975336322869955, "grad_norm": 0.074447892607347, "learning_rate": 0.00015997522995710178, "loss": 0.2831, "step": 1958 }, { "epoch": 1.0980941704035874, "grad_norm": 0.07609596858643143, "learning_rate": 0.00015992301916113242, "loss": 0.2852, "step": 1959 }, { "epoch": 1.0986547085201794, "grad_norm": 0.07332301350280312, "learning_rate": 0.0001598707828663209, "loss": 0.2705, "step": 1960 }, { "epoch": 1.0992152466367713, "grad_norm": 0.07545889966915205, "learning_rate": 0.00015981852109489517, "loss": 0.2536, "step": 1961 }, { "epoch": 1.0997757847533631, "grad_norm": 0.07421315465410608, "learning_rate": 0.000159766233869094, "loss": 0.277, "step": 1962 }, { "epoch": 1.1003363228699552, "grad_norm": 0.0746229326858004, "learning_rate": 0.00015971392121116705, "loss": 0.2673, "step": 1963 }, { "epoch": 1.100896860986547, "grad_norm": 0.07803795188828119, "learning_rate": 0.00015966158314337472, "loss": 0.2787, "step": 1964 }, { "epoch": 1.1014573991031391, "grad_norm": 0.07507220910349977, "learning_rate": 0.00015960921968798824, "loss": 0.2706, "step": 1965 }, { "epoch": 1.102017937219731, "grad_norm": 0.07245626260377289, "learning_rate": 0.00015955683086728962, "loss": 0.2726, "step": 1966 }, { "epoch": 1.1025784753363228, "grad_norm": 0.07147169574943817, "learning_rate": 0.0001595044167035718, "loss": 0.265, "step": 1967 }, { "epoch": 1.1031390134529149, "grad_norm": 0.07721066159888125, "learning_rate": 0.00015945197721913833, "loss": 0.2751, "step": 1968 }, { "epoch": 1.1036995515695067, "grad_norm": 0.07437528064108397, "learning_rate": 0.00015939951243630363, "loss": 0.2745, "step": 1969 }, { "epoch": 1.1042600896860986, "grad_norm": 0.07158845788098418, "learning_rate": 0.00015934702237739288, "loss": 0.2622, "step": 1970 }, { "epoch": 1.1048206278026906, "grad_norm": 0.07339457836335966, "learning_rate": 0.00015929450706474198, "loss": 0.2773, "step": 1971 }, { "epoch": 1.1053811659192825, "grad_norm": 0.07430073797855495, "learning_rate": 0.00015924196652069758, "loss": 0.2654, "step": 1972 }, { "epoch": 1.1059417040358746, "grad_norm": 0.07093762722174926, "learning_rate": 0.0001591894007676171, "loss": 0.2625, "step": 1973 }, { "epoch": 1.1065022421524664, "grad_norm": 0.07460068410984273, "learning_rate": 0.00015913680982786868, "loss": 0.2739, "step": 1974 }, { "epoch": 1.1070627802690582, "grad_norm": 0.073403458352427, "learning_rate": 0.00015908419372383112, "loss": 0.2671, "step": 1975 }, { "epoch": 1.1076233183856503, "grad_norm": 0.07492098467625789, "learning_rate": 0.00015903155247789404, "loss": 0.2703, "step": 1976 }, { "epoch": 1.1081838565022422, "grad_norm": 0.07128177484776693, "learning_rate": 0.00015897888611245766, "loss": 0.2754, "step": 1977 }, { "epoch": 1.108744394618834, "grad_norm": 0.07383092451473902, "learning_rate": 0.00015892619464993293, "loss": 0.2706, "step": 1978 }, { "epoch": 1.109304932735426, "grad_norm": 0.07263557664368579, "learning_rate": 0.00015887347811274145, "loss": 0.2668, "step": 1979 }, { "epoch": 1.109865470852018, "grad_norm": 0.07115233393958298, "learning_rate": 0.00015882073652331556, "loss": 0.2734, "step": 1980 }, { "epoch": 1.1104260089686098, "grad_norm": 0.07555708936788974, "learning_rate": 0.00015876796990409815, "loss": 0.2795, "step": 1981 }, { "epoch": 1.1109865470852018, "grad_norm": 0.0753979028353171, "learning_rate": 0.00015871517827754285, "loss": 0.2809, "step": 1982 }, { "epoch": 1.1115470852017937, "grad_norm": 0.0770002797947777, "learning_rate": 0.00015866236166611395, "loss": 0.2765, "step": 1983 }, { "epoch": 1.1121076233183858, "grad_norm": 0.07575667063290138, "learning_rate": 0.00015860952009228625, "loss": 0.2779, "step": 1984 }, { "epoch": 1.1126681614349776, "grad_norm": 0.07787975694349888, "learning_rate": 0.0001585566535785453, "loss": 0.2719, "step": 1985 }, { "epoch": 1.1132286995515694, "grad_norm": 0.0740094391058316, "learning_rate": 0.0001585037621473872, "loss": 0.2615, "step": 1986 }, { "epoch": 1.1137892376681615, "grad_norm": 0.07279280444286443, "learning_rate": 0.00015845084582131867, "loss": 0.2739, "step": 1987 }, { "epoch": 1.1143497757847534, "grad_norm": 0.07483962761701597, "learning_rate": 0.00015839790462285696, "loss": 0.2704, "step": 1988 }, { "epoch": 1.1149103139013452, "grad_norm": 0.0730593124018396, "learning_rate": 0.00015834493857453007, "loss": 0.2726, "step": 1989 }, { "epoch": 1.1154708520179373, "grad_norm": 0.07462548117811857, "learning_rate": 0.00015829194769887634, "loss": 0.2759, "step": 1990 }, { "epoch": 1.1160313901345291, "grad_norm": 0.07509532867389848, "learning_rate": 0.0001582389320184449, "loss": 0.2692, "step": 1991 }, { "epoch": 1.116591928251121, "grad_norm": 0.07305637870162947, "learning_rate": 0.0001581858915557953, "loss": 0.2737, "step": 1992 }, { "epoch": 1.117152466367713, "grad_norm": 0.07287159088842872, "learning_rate": 0.00015813282633349765, "loss": 0.2709, "step": 1993 }, { "epoch": 1.1177130044843049, "grad_norm": 0.07435947185943617, "learning_rate": 0.00015807973637413264, "loss": 0.2737, "step": 1994 }, { "epoch": 1.118273542600897, "grad_norm": 0.07278272531265187, "learning_rate": 0.00015802662170029148, "loss": 0.2719, "step": 1995 }, { "epoch": 1.1188340807174888, "grad_norm": 0.07382141303330053, "learning_rate": 0.00015797348233457584, "loss": 0.2772, "step": 1996 }, { "epoch": 1.1193946188340806, "grad_norm": 0.07397075999132298, "learning_rate": 0.000157920318299598, "loss": 0.2607, "step": 1997 }, { "epoch": 1.1199551569506727, "grad_norm": 0.07183826883376702, "learning_rate": 0.0001578671296179806, "loss": 0.2813, "step": 1998 }, { "epoch": 1.1205156950672646, "grad_norm": 0.07436399136538775, "learning_rate": 0.00015781391631235686, "loss": 0.2734, "step": 1999 }, { "epoch": 1.1210762331838564, "grad_norm": 0.07192922286500203, "learning_rate": 0.0001577606784053705, "loss": 0.2568, "step": 2000 }, { "epoch": 1.1216367713004485, "grad_norm": 0.07617022498805927, "learning_rate": 0.00015770741591967559, "loss": 0.2762, "step": 2001 }, { "epoch": 1.1221973094170403, "grad_norm": 0.07651159820540177, "learning_rate": 0.00015765412887793682, "loss": 0.2798, "step": 2002 }, { "epoch": 1.1227578475336322, "grad_norm": 0.07712755698411447, "learning_rate": 0.00015760081730282924, "loss": 0.2749, "step": 2003 }, { "epoch": 1.1233183856502242, "grad_norm": 0.0777130799899898, "learning_rate": 0.0001575474812170383, "loss": 0.2744, "step": 2004 }, { "epoch": 1.123878923766816, "grad_norm": 0.07297566247431461, "learning_rate": 0.00015749412064325994, "loss": 0.2614, "step": 2005 }, { "epoch": 1.1244394618834082, "grad_norm": 0.07287115726268938, "learning_rate": 0.00015744073560420053, "loss": 0.273, "step": 2006 }, { "epoch": 1.125, "grad_norm": 0.07543796690093375, "learning_rate": 0.0001573873261225768, "loss": 0.2744, "step": 2007 }, { "epoch": 1.1255605381165918, "grad_norm": 0.07125092095866423, "learning_rate": 0.00015733389222111592, "loss": 0.2858, "step": 2008 }, { "epoch": 1.126121076233184, "grad_norm": 0.07115468366189806, "learning_rate": 0.00015728043392255545, "loss": 0.2716, "step": 2009 }, { "epoch": 1.1266816143497758, "grad_norm": 0.07453062732128388, "learning_rate": 0.0001572269512496433, "loss": 0.2727, "step": 2010 }, { "epoch": 1.1272421524663678, "grad_norm": 0.07513388744332164, "learning_rate": 0.0001571734442251378, "loss": 0.2909, "step": 2011 }, { "epoch": 1.1278026905829597, "grad_norm": 0.07494383499721882, "learning_rate": 0.0001571199128718076, "loss": 0.2781, "step": 2012 }, { "epoch": 1.1283632286995515, "grad_norm": 0.07504409999077581, "learning_rate": 0.00015706635721243173, "loss": 0.289, "step": 2013 }, { "epoch": 1.1289237668161436, "grad_norm": 0.07166225475336385, "learning_rate": 0.00015701277726979952, "loss": 0.2682, "step": 2014 }, { "epoch": 1.1294843049327354, "grad_norm": 0.0742582638600298, "learning_rate": 0.00015695917306671067, "loss": 0.2607, "step": 2015 }, { "epoch": 1.1300448430493273, "grad_norm": 0.0747725454096467, "learning_rate": 0.00015690554462597522, "loss": 0.2729, "step": 2016 }, { "epoch": 1.1306053811659194, "grad_norm": 0.07511852957362249, "learning_rate": 0.0001568518919704135, "loss": 0.2818, "step": 2017 }, { "epoch": 1.1311659192825112, "grad_norm": 0.07314543195251337, "learning_rate": 0.00015679821512285615, "loss": 0.2709, "step": 2018 }, { "epoch": 1.131726457399103, "grad_norm": 0.07433168459280702, "learning_rate": 0.00015674451410614405, "loss": 0.2771, "step": 2019 }, { "epoch": 1.1322869955156951, "grad_norm": 0.07200847633930003, "learning_rate": 0.00015669078894312848, "loss": 0.2532, "step": 2020 }, { "epoch": 1.132847533632287, "grad_norm": 0.07400974519073239, "learning_rate": 0.0001566370396566709, "loss": 0.277, "step": 2021 }, { "epoch": 1.133408071748879, "grad_norm": 0.07450630557819417, "learning_rate": 0.00015658326626964301, "loss": 0.278, "step": 2022 }, { "epoch": 1.1339686098654709, "grad_norm": 0.07698578648706768, "learning_rate": 0.00015652946880492693, "loss": 0.2714, "step": 2023 }, { "epoch": 1.1345291479820627, "grad_norm": 0.07482987816471644, "learning_rate": 0.00015647564728541485, "loss": 0.2766, "step": 2024 }, { "epoch": 1.1350896860986548, "grad_norm": 0.07267521546994753, "learning_rate": 0.0001564218017340093, "loss": 0.274, "step": 2025 }, { "epoch": 1.1356502242152466, "grad_norm": 0.07414428261073383, "learning_rate": 0.00015636793217362288, "loss": 0.2738, "step": 2026 }, { "epoch": 1.1362107623318385, "grad_norm": 0.07282687696485579, "learning_rate": 0.0001563140386271787, "loss": 0.2713, "step": 2027 }, { "epoch": 1.1367713004484306, "grad_norm": 0.07659535033965022, "learning_rate": 0.00015626012111760975, "loss": 0.2754, "step": 2028 }, { "epoch": 1.1373318385650224, "grad_norm": 0.07396044636197424, "learning_rate": 0.00015620617966785946, "loss": 0.262, "step": 2029 }, { "epoch": 1.1378923766816142, "grad_norm": 0.07397253747286966, "learning_rate": 0.00015615221430088133, "loss": 0.2769, "step": 2030 }, { "epoch": 1.1384529147982063, "grad_norm": 0.07202784009774799, "learning_rate": 0.00015609822503963907, "loss": 0.2698, "step": 2031 }, { "epoch": 1.1390134529147982, "grad_norm": 0.07085916746893994, "learning_rate": 0.0001560442119071065, "loss": 0.2731, "step": 2032 }, { "epoch": 1.1395739910313902, "grad_norm": 0.07146571140318538, "learning_rate": 0.00015599017492626773, "loss": 0.2719, "step": 2033 }, { "epoch": 1.140134529147982, "grad_norm": 0.07223059545245242, "learning_rate": 0.00015593611412011686, "loss": 0.2724, "step": 2034 }, { "epoch": 1.140695067264574, "grad_norm": 0.07503730230596423, "learning_rate": 0.00015588202951165824, "loss": 0.2846, "step": 2035 }, { "epoch": 1.141255605381166, "grad_norm": 0.07870846771326531, "learning_rate": 0.0001558279211239063, "loss": 0.2877, "step": 2036 }, { "epoch": 1.1418161434977578, "grad_norm": 0.07129188141657869, "learning_rate": 0.0001557737889798856, "loss": 0.2593, "step": 2037 }, { "epoch": 1.1423766816143497, "grad_norm": 0.07298394346129729, "learning_rate": 0.00015571963310263086, "loss": 0.2703, "step": 2038 }, { "epoch": 1.1429372197309418, "grad_norm": 0.07475042100280528, "learning_rate": 0.0001556654535151868, "loss": 0.2766, "step": 2039 }, { "epoch": 1.1434977578475336, "grad_norm": 0.07522057409664021, "learning_rate": 0.00015561125024060826, "loss": 0.2919, "step": 2040 }, { "epoch": 1.1440582959641254, "grad_norm": 0.07525585281821849, "learning_rate": 0.00015555702330196023, "loss": 0.2753, "step": 2041 }, { "epoch": 1.1446188340807175, "grad_norm": 0.07307159500608724, "learning_rate": 0.00015550277272231768, "loss": 0.2662, "step": 2042 }, { "epoch": 1.1451793721973094, "grad_norm": 0.07316600541297696, "learning_rate": 0.00015544849852476572, "loss": 0.2695, "step": 2043 }, { "epoch": 1.1457399103139014, "grad_norm": 0.07328754262887907, "learning_rate": 0.00015539420073239942, "loss": 0.2728, "step": 2044 }, { "epoch": 1.1463004484304933, "grad_norm": 0.07449548962149974, "learning_rate": 0.00015533987936832398, "loss": 0.2757, "step": 2045 }, { "epoch": 1.1468609865470851, "grad_norm": 0.0717644803480842, "learning_rate": 0.00015528553445565453, "loss": 0.2692, "step": 2046 }, { "epoch": 1.1474215246636772, "grad_norm": 0.0760828551305723, "learning_rate": 0.00015523116601751636, "loss": 0.2705, "step": 2047 }, { "epoch": 1.147982062780269, "grad_norm": 0.07217028849499157, "learning_rate": 0.0001551767740770446, "loss": 0.2624, "step": 2048 }, { "epoch": 1.148542600896861, "grad_norm": 0.07294470052669451, "learning_rate": 0.00015512235865738455, "loss": 0.2736, "step": 2049 }, { "epoch": 1.149103139013453, "grad_norm": 0.07264040780197499, "learning_rate": 0.00015506791978169137, "loss": 0.2675, "step": 2050 }, { "epoch": 1.1496636771300448, "grad_norm": 0.07380138969502384, "learning_rate": 0.00015501345747313027, "loss": 0.2672, "step": 2051 }, { "epoch": 1.1502242152466366, "grad_norm": 0.07393005364478991, "learning_rate": 0.00015495897175487645, "loss": 0.2728, "step": 2052 }, { "epoch": 1.1507847533632287, "grad_norm": 0.07228127557031969, "learning_rate": 0.00015490446265011495, "loss": 0.2614, "step": 2053 }, { "epoch": 1.1513452914798206, "grad_norm": 0.07395836335811319, "learning_rate": 0.00015484993018204094, "loss": 0.2619, "step": 2054 }, { "epoch": 1.1519058295964126, "grad_norm": 0.07793273892744321, "learning_rate": 0.00015479537437385938, "loss": 0.2872, "step": 2055 }, { "epoch": 1.1524663677130045, "grad_norm": 0.07701208241018277, "learning_rate": 0.00015474079524878525, "loss": 0.2789, "step": 2056 }, { "epoch": 1.1530269058295963, "grad_norm": 0.07303503277383908, "learning_rate": 0.0001546861928300434, "loss": 0.2643, "step": 2057 }, { "epoch": 1.1535874439461884, "grad_norm": 0.07499397816821558, "learning_rate": 0.00015463156714086863, "loss": 0.2839, "step": 2058 }, { "epoch": 1.1541479820627802, "grad_norm": 0.07245367094461294, "learning_rate": 0.00015457691820450564, "loss": 0.2741, "step": 2059 }, { "epoch": 1.1547085201793723, "grad_norm": 0.07284724044775001, "learning_rate": 0.00015452224604420897, "loss": 0.2774, "step": 2060 }, { "epoch": 1.1552690582959642, "grad_norm": 0.07538937638420962, "learning_rate": 0.0001544675506832431, "loss": 0.2701, "step": 2061 }, { "epoch": 1.155829596412556, "grad_norm": 0.071544697213512, "learning_rate": 0.0001544128321448824, "loss": 0.2726, "step": 2062 }, { "epoch": 1.156390134529148, "grad_norm": 0.07273860295876183, "learning_rate": 0.000154358090452411, "loss": 0.2686, "step": 2063 }, { "epoch": 1.15695067264574, "grad_norm": 0.07362864858475911, "learning_rate": 0.000154303325629123, "loss": 0.2832, "step": 2064 }, { "epoch": 1.1575112107623318, "grad_norm": 0.07280685817498073, "learning_rate": 0.00015424853769832226, "loss": 0.269, "step": 2065 }, { "epoch": 1.1580717488789238, "grad_norm": 0.07646770256689027, "learning_rate": 0.00015419372668332254, "loss": 0.2726, "step": 2066 }, { "epoch": 1.1586322869955157, "grad_norm": 0.07101844516132096, "learning_rate": 0.00015413889260744735, "loss": 0.2636, "step": 2067 }, { "epoch": 1.1591928251121075, "grad_norm": 0.07379766462631883, "learning_rate": 0.0001540840354940301, "loss": 0.2728, "step": 2068 }, { "epoch": 1.1597533632286996, "grad_norm": 0.07221682533650452, "learning_rate": 0.0001540291553664139, "loss": 0.2747, "step": 2069 }, { "epoch": 1.1603139013452914, "grad_norm": 0.07401261609018353, "learning_rate": 0.00015397425224795177, "loss": 0.2756, "step": 2070 }, { "epoch": 1.1608744394618835, "grad_norm": 0.07339268743499448, "learning_rate": 0.0001539193261620064, "loss": 0.2817, "step": 2071 }, { "epoch": 1.1614349775784754, "grad_norm": 0.07129692574775143, "learning_rate": 0.0001538643771319503, "loss": 0.2743, "step": 2072 }, { "epoch": 1.1619955156950672, "grad_norm": 0.07368552168044658, "learning_rate": 0.00015380940518116578, "loss": 0.2711, "step": 2073 }, { "epoch": 1.1625560538116593, "grad_norm": 0.0741087351189812, "learning_rate": 0.00015375441033304484, "loss": 0.2617, "step": 2074 }, { "epoch": 1.1631165919282511, "grad_norm": 0.07145873287096843, "learning_rate": 0.00015369939261098927, "loss": 0.2648, "step": 2075 }, { "epoch": 1.163677130044843, "grad_norm": 0.07312692487144262, "learning_rate": 0.00015364435203841058, "loss": 0.284, "step": 2076 }, { "epoch": 1.164237668161435, "grad_norm": 0.07408197867575649, "learning_rate": 0.00015358928863872998, "loss": 0.263, "step": 2077 }, { "epoch": 1.1647982062780269, "grad_norm": 0.07227449052330495, "learning_rate": 0.00015353420243537848, "loss": 0.2583, "step": 2078 }, { "epoch": 1.1653587443946187, "grad_norm": 0.07218673290807645, "learning_rate": 0.00015347909345179666, "loss": 0.2701, "step": 2079 }, { "epoch": 1.1659192825112108, "grad_norm": 0.07277001388474791, "learning_rate": 0.00015342396171143488, "loss": 0.2738, "step": 2080 }, { "epoch": 1.1664798206278026, "grad_norm": 0.07384023078084269, "learning_rate": 0.00015336880723775312, "loss": 0.2792, "step": 2081 }, { "epoch": 1.1670403587443947, "grad_norm": 0.07221786179926024, "learning_rate": 0.00015331363005422117, "loss": 0.2737, "step": 2082 }, { "epoch": 1.1676008968609866, "grad_norm": 0.07454906070908028, "learning_rate": 0.00015325843018431835, "loss": 0.2711, "step": 2083 }, { "epoch": 1.1681614349775784, "grad_norm": 0.07217768915976, "learning_rate": 0.00015320320765153367, "loss": 0.2842, "step": 2084 }, { "epoch": 1.1687219730941705, "grad_norm": 0.07307954618021212, "learning_rate": 0.00015314796247936578, "loss": 0.2691, "step": 2085 }, { "epoch": 1.1692825112107623, "grad_norm": 0.07459096911237502, "learning_rate": 0.000153092694691323, "loss": 0.2688, "step": 2086 }, { "epoch": 1.1698430493273542, "grad_norm": 0.07051881368437275, "learning_rate": 0.00015303740431092325, "loss": 0.2673, "step": 2087 }, { "epoch": 1.1704035874439462, "grad_norm": 0.07355848353297499, "learning_rate": 0.00015298209136169403, "loss": 0.2695, "step": 2088 }, { "epoch": 1.170964125560538, "grad_norm": 0.0739564640053159, "learning_rate": 0.00015292675586717246, "loss": 0.2745, "step": 2089 }, { "epoch": 1.17152466367713, "grad_norm": 0.07679015985636238, "learning_rate": 0.00015287139785090533, "loss": 0.2779, "step": 2090 }, { "epoch": 1.172085201793722, "grad_norm": 0.07606643203847724, "learning_rate": 0.00015281601733644894, "loss": 0.2675, "step": 2091 }, { "epoch": 1.1726457399103138, "grad_norm": 0.0756602189290465, "learning_rate": 0.00015276061434736914, "loss": 0.2693, "step": 2092 }, { "epoch": 1.173206278026906, "grad_norm": 0.07538211370389455, "learning_rate": 0.00015270518890724138, "loss": 0.2822, "step": 2093 }, { "epoch": 1.1737668161434978, "grad_norm": 0.07369372022215448, "learning_rate": 0.00015264974103965068, "loss": 0.2779, "step": 2094 }, { "epoch": 1.1743273542600896, "grad_norm": 0.07313457952230602, "learning_rate": 0.0001525942707681916, "loss": 0.2709, "step": 2095 }, { "epoch": 1.1748878923766817, "grad_norm": 0.0772087090084902, "learning_rate": 0.00015253877811646817, "loss": 0.2867, "step": 2096 }, { "epoch": 1.1754484304932735, "grad_norm": 0.07470780457162388, "learning_rate": 0.00015248326310809404, "loss": 0.2798, "step": 2097 }, { "epoch": 1.1760089686098656, "grad_norm": 0.07119790551418703, "learning_rate": 0.00015242772576669236, "loss": 0.2758, "step": 2098 }, { "epoch": 1.1765695067264574, "grad_norm": 0.07251655224837092, "learning_rate": 0.00015237216611589563, "loss": 0.2738, "step": 2099 }, { "epoch": 1.1771300448430493, "grad_norm": 0.07382764770345313, "learning_rate": 0.00015231658417934606, "loss": 0.2707, "step": 2100 }, { "epoch": 1.1776905829596414, "grad_norm": 0.07487957934819602, "learning_rate": 0.0001522609799806952, "loss": 0.2783, "step": 2101 }, { "epoch": 1.1782511210762332, "grad_norm": 0.07022136303111838, "learning_rate": 0.00015220535354360415, "loss": 0.2607, "step": 2102 }, { "epoch": 1.178811659192825, "grad_norm": 0.0740416630693838, "learning_rate": 0.00015214970489174341, "loss": 0.2809, "step": 2103 }, { "epoch": 1.1793721973094171, "grad_norm": 0.07274708528402801, "learning_rate": 0.00015209403404879303, "loss": 0.2728, "step": 2104 }, { "epoch": 1.179932735426009, "grad_norm": 0.07598974070073093, "learning_rate": 0.00015203834103844237, "loss": 0.2745, "step": 2105 }, { "epoch": 1.1804932735426008, "grad_norm": 0.07312980386250663, "learning_rate": 0.00015198262588439032, "loss": 0.2696, "step": 2106 }, { "epoch": 1.1810538116591929, "grad_norm": 0.0735721273097422, "learning_rate": 0.00015192688861034515, "loss": 0.2749, "step": 2107 }, { "epoch": 1.1816143497757847, "grad_norm": 0.07301663598658625, "learning_rate": 0.00015187112924002456, "loss": 0.2671, "step": 2108 }, { "epoch": 1.1821748878923768, "grad_norm": 0.07044857634256689, "learning_rate": 0.00015181534779715565, "loss": 0.2529, "step": 2109 }, { "epoch": 1.1827354260089686, "grad_norm": 0.0742907419683679, "learning_rate": 0.0001517595443054749, "loss": 0.2661, "step": 2110 }, { "epoch": 1.1832959641255605, "grad_norm": 0.07310441013458707, "learning_rate": 0.00015170371878872818, "loss": 0.2743, "step": 2111 }, { "epoch": 1.1838565022421526, "grad_norm": 0.07238894594001412, "learning_rate": 0.0001516478712706708, "loss": 0.2562, "step": 2112 }, { "epoch": 1.1844170403587444, "grad_norm": 0.07786287525190146, "learning_rate": 0.00015159200177506727, "loss": 0.2697, "step": 2113 }, { "epoch": 1.1849775784753362, "grad_norm": 0.07261831937645209, "learning_rate": 0.0001515361103256916, "loss": 0.2695, "step": 2114 }, { "epoch": 1.1855381165919283, "grad_norm": 0.07532954820401011, "learning_rate": 0.00015148019694632715, "loss": 0.2771, "step": 2115 }, { "epoch": 1.1860986547085202, "grad_norm": 0.07658085746908065, "learning_rate": 0.00015142426166076645, "loss": 0.2858, "step": 2116 }, { "epoch": 1.186659192825112, "grad_norm": 0.07542217177630721, "learning_rate": 0.00015136830449281148, "loss": 0.2784, "step": 2117 }, { "epoch": 1.187219730941704, "grad_norm": 0.07528234769052114, "learning_rate": 0.00015131232546627355, "loss": 0.2657, "step": 2118 }, { "epoch": 1.187780269058296, "grad_norm": 0.07515155676111435, "learning_rate": 0.0001512563246049732, "loss": 0.2703, "step": 2119 }, { "epoch": 1.188340807174888, "grad_norm": 0.07361613613334834, "learning_rate": 0.00015120030193274027, "loss": 0.2627, "step": 2120 }, { "epoch": 1.1889013452914798, "grad_norm": 0.07367647072905742, "learning_rate": 0.00015114425747341396, "loss": 0.2695, "step": 2121 }, { "epoch": 1.1894618834080717, "grad_norm": 0.07517684710602675, "learning_rate": 0.00015108819125084262, "loss": 0.2734, "step": 2122 }, { "epoch": 1.1900224215246638, "grad_norm": 0.07459082377476396, "learning_rate": 0.00015103210328888396, "loss": 0.2687, "step": 2123 }, { "epoch": 1.1905829596412556, "grad_norm": 0.07449003919367506, "learning_rate": 0.00015097599361140487, "loss": 0.2722, "step": 2124 }, { "epoch": 1.1911434977578474, "grad_norm": 0.07212325460713669, "learning_rate": 0.00015091986224228157, "loss": 0.2658, "step": 2125 }, { "epoch": 1.1917040358744395, "grad_norm": 0.07171874994936649, "learning_rate": 0.00015086370920539937, "loss": 0.2776, "step": 2126 }, { "epoch": 1.1922645739910314, "grad_norm": 0.0724722953900823, "learning_rate": 0.00015080753452465296, "loss": 0.2786, "step": 2127 }, { "epoch": 1.1928251121076232, "grad_norm": 0.07513205905519629, "learning_rate": 0.00015075133822394613, "loss": 0.2717, "step": 2128 }, { "epoch": 1.1933856502242153, "grad_norm": 0.07262251960721408, "learning_rate": 0.0001506951203271919, "loss": 0.2719, "step": 2129 }, { "epoch": 1.1939461883408071, "grad_norm": 0.07317864940142353, "learning_rate": 0.0001506388808583125, "loss": 0.2725, "step": 2130 }, { "epoch": 1.1945067264573992, "grad_norm": 0.07543270870371434, "learning_rate": 0.0001505826198412393, "loss": 0.274, "step": 2131 }, { "epoch": 1.195067264573991, "grad_norm": 0.07626307700070344, "learning_rate": 0.00015052633729991294, "loss": 0.285, "step": 2132 }, { "epoch": 1.1956278026905829, "grad_norm": 0.07192479462550591, "learning_rate": 0.00015047003325828305, "loss": 0.2703, "step": 2133 }, { "epoch": 1.196188340807175, "grad_norm": 0.0750745523992153, "learning_rate": 0.0001504137077403085, "loss": 0.2778, "step": 2134 }, { "epoch": 1.1967488789237668, "grad_norm": 0.07457514985822909, "learning_rate": 0.00015035736076995736, "loss": 0.2593, "step": 2135 }, { "epoch": 1.1973094170403586, "grad_norm": 0.07286343319243911, "learning_rate": 0.00015030099237120674, "loss": 0.2761, "step": 2136 }, { "epoch": 1.1978699551569507, "grad_norm": 0.07453411808354928, "learning_rate": 0.0001502446025680429, "loss": 0.267, "step": 2137 }, { "epoch": 1.1984304932735426, "grad_norm": 0.0745072188165511, "learning_rate": 0.0001501881913844612, "loss": 0.2767, "step": 2138 }, { "epoch": 1.1989910313901344, "grad_norm": 0.07249510969676805, "learning_rate": 0.0001501317588444661, "loss": 0.2637, "step": 2139 }, { "epoch": 1.1995515695067265, "grad_norm": 0.07717624953611066, "learning_rate": 0.00015007530497207117, "loss": 0.2794, "step": 2140 }, { "epoch": 1.2001121076233183, "grad_norm": 0.07387397226639023, "learning_rate": 0.00015001882979129899, "loss": 0.2743, "step": 2141 }, { "epoch": 1.2006726457399104, "grad_norm": 0.07496227177042705, "learning_rate": 0.00014996233332618128, "loss": 0.2755, "step": 2142 }, { "epoch": 1.2012331838565022, "grad_norm": 0.07496313798382635, "learning_rate": 0.00014990581560075881, "loss": 0.2721, "step": 2143 }, { "epoch": 1.201793721973094, "grad_norm": 0.07439376297487851, "learning_rate": 0.00014984927663908137, "loss": 0.2708, "step": 2144 }, { "epoch": 1.2023542600896862, "grad_norm": 0.07523485667912887, "learning_rate": 0.00014979271646520782, "loss": 0.2612, "step": 2145 }, { "epoch": 1.202914798206278, "grad_norm": 0.0716685011793692, "learning_rate": 0.00014973613510320594, "loss": 0.264, "step": 2146 }, { "epoch": 1.20347533632287, "grad_norm": 0.07371711472779705, "learning_rate": 0.00014967953257715268, "loss": 0.2754, "step": 2147 }, { "epoch": 1.204035874439462, "grad_norm": 0.0752351159209486, "learning_rate": 0.00014962290891113394, "loss": 0.2821, "step": 2148 }, { "epoch": 1.2045964125560538, "grad_norm": 0.0732745052311195, "learning_rate": 0.00014956626412924453, "loss": 0.2775, "step": 2149 }, { "epoch": 1.2051569506726458, "grad_norm": 0.07324326890999037, "learning_rate": 0.00014950959825558836, "loss": 0.2751, "step": 2150 }, { "epoch": 1.2057174887892377, "grad_norm": 0.07409978346285448, "learning_rate": 0.00014945291131427825, "loss": 0.2834, "step": 2151 }, { "epoch": 1.2062780269058295, "grad_norm": 0.07404126723380783, "learning_rate": 0.00014939620332943604, "loss": 0.2646, "step": 2152 }, { "epoch": 1.2068385650224216, "grad_norm": 0.07189288322301148, "learning_rate": 0.00014933947432519245, "loss": 0.276, "step": 2153 }, { "epoch": 1.2073991031390134, "grad_norm": 0.0711885201660691, "learning_rate": 0.0001492827243256872, "loss": 0.2629, "step": 2154 }, { "epoch": 1.2079596412556053, "grad_norm": 0.07357263793071628, "learning_rate": 0.00014922595335506892, "loss": 0.2739, "step": 2155 }, { "epoch": 1.2085201793721974, "grad_norm": 0.07372589669407172, "learning_rate": 0.00014916916143749518, "loss": 0.2762, "step": 2156 }, { "epoch": 1.2090807174887892, "grad_norm": 0.07925057873266328, "learning_rate": 0.00014911234859713243, "loss": 0.2636, "step": 2157 }, { "epoch": 1.2096412556053813, "grad_norm": 0.07224035604882845, "learning_rate": 0.0001490555148581561, "loss": 0.2728, "step": 2158 }, { "epoch": 1.2102017937219731, "grad_norm": 0.07328029375527327, "learning_rate": 0.00014899866024475043, "loss": 0.2714, "step": 2159 }, { "epoch": 1.210762331838565, "grad_norm": 0.07440276422602377, "learning_rate": 0.00014894178478110857, "loss": 0.2811, "step": 2160 }, { "epoch": 1.211322869955157, "grad_norm": 0.07292885635959033, "learning_rate": 0.00014888488849143253, "loss": 0.2671, "step": 2161 }, { "epoch": 1.2118834080717489, "grad_norm": 0.07235886439051883, "learning_rate": 0.00014882797139993326, "loss": 0.2651, "step": 2162 }, { "epoch": 1.2124439461883407, "grad_norm": 0.07311515589613908, "learning_rate": 0.00014877103353083042, "loss": 0.2785, "step": 2163 }, { "epoch": 1.2130044843049328, "grad_norm": 0.0742640259694533, "learning_rate": 0.00014871407490835262, "loss": 0.2679, "step": 2164 }, { "epoch": 1.2135650224215246, "grad_norm": 0.07121133370611302, "learning_rate": 0.00014865709555673734, "loss": 0.2678, "step": 2165 }, { "epoch": 1.2141255605381165, "grad_norm": 0.07120529979390215, "learning_rate": 0.00014860009550023072, "loss": 0.2689, "step": 2166 }, { "epoch": 1.2146860986547086, "grad_norm": 0.07297595302908803, "learning_rate": 0.00014854307476308781, "loss": 0.2754, "step": 2167 }, { "epoch": 1.2152466367713004, "grad_norm": 0.0765807805862152, "learning_rate": 0.00014848603336957251, "loss": 0.2811, "step": 2168 }, { "epoch": 1.2158071748878925, "grad_norm": 0.08047048222225542, "learning_rate": 0.00014842897134395743, "loss": 0.2671, "step": 2169 }, { "epoch": 1.2163677130044843, "grad_norm": 0.08551996410336832, "learning_rate": 0.000148371888710524, "loss": 0.266, "step": 2170 }, { "epoch": 1.2169282511210762, "grad_norm": 0.07120052575877524, "learning_rate": 0.00014831478549356234, "loss": 0.2497, "step": 2171 }, { "epoch": 1.2174887892376682, "grad_norm": 0.0722776407246042, "learning_rate": 0.00014825766171737146, "loss": 0.2654, "step": 2172 }, { "epoch": 1.21804932735426, "grad_norm": 0.07458138290458914, "learning_rate": 0.00014820051740625903, "loss": 0.271, "step": 2173 }, { "epoch": 1.218609865470852, "grad_norm": 0.0729714914697601, "learning_rate": 0.00014814335258454145, "loss": 0.2665, "step": 2174 }, { "epoch": 1.219170403587444, "grad_norm": 0.07460522134944941, "learning_rate": 0.0001480861672765439, "loss": 0.2817, "step": 2175 }, { "epoch": 1.2197309417040358, "grad_norm": 0.07836658995501097, "learning_rate": 0.00014802896150660022, "loss": 0.2754, "step": 2176 }, { "epoch": 1.2202914798206277, "grad_norm": 0.0736678787021427, "learning_rate": 0.00014797173529905306, "loss": 0.2798, "step": 2177 }, { "epoch": 1.2208520179372198, "grad_norm": 0.07071270337372172, "learning_rate": 0.00014791448867825365, "loss": 0.2584, "step": 2178 }, { "epoch": 1.2214125560538116, "grad_norm": 0.07231434933962502, "learning_rate": 0.00014785722166856194, "loss": 0.2769, "step": 2179 }, { "epoch": 1.2219730941704037, "grad_norm": 0.07094203957441038, "learning_rate": 0.00014779993429434659, "loss": 0.2635, "step": 2180 }, { "epoch": 1.2225336322869955, "grad_norm": 0.07033983329472304, "learning_rate": 0.00014774262657998491, "loss": 0.2675, "step": 2181 }, { "epoch": 1.2230941704035874, "grad_norm": 0.07425672814540628, "learning_rate": 0.00014768529854986286, "loss": 0.2773, "step": 2182 }, { "epoch": 1.2236547085201794, "grad_norm": 0.07118439642612227, "learning_rate": 0.00014762795022837504, "loss": 0.2678, "step": 2183 }, { "epoch": 1.2242152466367713, "grad_norm": 0.07350427202585197, "learning_rate": 0.00014757058163992464, "loss": 0.2746, "step": 2184 }, { "epoch": 1.2247757847533634, "grad_norm": 0.07384173873934805, "learning_rate": 0.00014751319280892366, "loss": 0.2707, "step": 2185 }, { "epoch": 1.2253363228699552, "grad_norm": 0.07511276355337908, "learning_rate": 0.00014745578375979245, "loss": 0.2671, "step": 2186 }, { "epoch": 1.225896860986547, "grad_norm": 0.07542390591320873, "learning_rate": 0.00014739835451696011, "loss": 0.2796, "step": 2187 }, { "epoch": 1.226457399103139, "grad_norm": 0.07318198970241921, "learning_rate": 0.00014734090510486433, "loss": 0.2765, "step": 2188 }, { "epoch": 1.227017937219731, "grad_norm": 0.07221627487433964, "learning_rate": 0.00014728343554795142, "loss": 0.274, "step": 2189 }, { "epoch": 1.2275784753363228, "grad_norm": 0.07302762715277836, "learning_rate": 0.0001472259458706761, "loss": 0.2735, "step": 2190 }, { "epoch": 1.2281390134529149, "grad_norm": 0.07071971217399264, "learning_rate": 0.00014716843609750187, "loss": 0.2597, "step": 2191 }, { "epoch": 1.2286995515695067, "grad_norm": 0.07064134460184776, "learning_rate": 0.00014711090625290057, "loss": 0.2742, "step": 2192 }, { "epoch": 1.2292600896860986, "grad_norm": 0.07175876715421317, "learning_rate": 0.0001470533563613528, "loss": 0.2774, "step": 2193 }, { "epoch": 1.2298206278026906, "grad_norm": 0.07201379572339128, "learning_rate": 0.00014699578644734746, "loss": 0.267, "step": 2194 }, { "epoch": 1.2303811659192825, "grad_norm": 0.07433355312968802, "learning_rate": 0.00014693819653538215, "loss": 0.2627, "step": 2195 }, { "epoch": 1.2309417040358746, "grad_norm": 0.07390570798339077, "learning_rate": 0.00014688058664996285, "loss": 0.2661, "step": 2196 }, { "epoch": 1.2315022421524664, "grad_norm": 0.073286120109269, "learning_rate": 0.0001468229568156042, "loss": 0.2797, "step": 2197 }, { "epoch": 1.2320627802690582, "grad_norm": 0.07485715475066163, "learning_rate": 0.00014676530705682914, "loss": 0.2539, "step": 2198 }, { "epoch": 1.2326233183856503, "grad_norm": 0.07433746289621238, "learning_rate": 0.00014670763739816923, "loss": 0.2656, "step": 2199 }, { "epoch": 1.2331838565022422, "grad_norm": 0.07812464782683104, "learning_rate": 0.0001466499478641644, "loss": 0.2801, "step": 2200 }, { "epoch": 1.233744394618834, "grad_norm": 0.07406661426771381, "learning_rate": 0.00014659223847936315, "loss": 0.2726, "step": 2201 }, { "epoch": 1.234304932735426, "grad_norm": 0.07283861596645486, "learning_rate": 0.00014653450926832234, "loss": 0.27, "step": 2202 }, { "epoch": 1.234865470852018, "grad_norm": 0.07475440658412534, "learning_rate": 0.00014647676025560726, "loss": 0.2769, "step": 2203 }, { "epoch": 1.2354260089686098, "grad_norm": 0.07276401211295648, "learning_rate": 0.00014641899146579168, "loss": 0.2684, "step": 2204 }, { "epoch": 1.2359865470852018, "grad_norm": 0.07576724537918178, "learning_rate": 0.00014636120292345773, "loss": 0.2743, "step": 2205 }, { "epoch": 1.2365470852017937, "grad_norm": 0.07233726605379628, "learning_rate": 0.00014630339465319603, "loss": 0.2692, "step": 2206 }, { "epoch": 1.2371076233183858, "grad_norm": 0.07410354461819409, "learning_rate": 0.00014624556667960548, "loss": 0.2688, "step": 2207 }, { "epoch": 1.2376681614349776, "grad_norm": 0.07233910524759327, "learning_rate": 0.00014618771902729342, "loss": 0.2805, "step": 2208 }, { "epoch": 1.2382286995515694, "grad_norm": 0.0703341510133088, "learning_rate": 0.00014612985172087565, "loss": 0.2602, "step": 2209 }, { "epoch": 1.2387892376681615, "grad_norm": 0.0728511550381156, "learning_rate": 0.0001460719647849762, "loss": 0.2635, "step": 2210 }, { "epoch": 1.2393497757847534, "grad_norm": 0.0724761427933535, "learning_rate": 0.0001460140582442275, "loss": 0.2596, "step": 2211 }, { "epoch": 1.2399103139013452, "grad_norm": 0.07432563748277206, "learning_rate": 0.00014595613212327032, "loss": 0.2765, "step": 2212 }, { "epoch": 1.2404708520179373, "grad_norm": 0.07635799672042924, "learning_rate": 0.00014589818644675378, "loss": 0.2831, "step": 2213 }, { "epoch": 1.2410313901345291, "grad_norm": 0.07608695610001634, "learning_rate": 0.0001458402212393353, "loss": 0.2757, "step": 2214 }, { "epoch": 1.241591928251121, "grad_norm": 0.0751592324960594, "learning_rate": 0.00014578223652568067, "loss": 0.269, "step": 2215 }, { "epoch": 1.242152466367713, "grad_norm": 0.07439220809306073, "learning_rate": 0.00014572423233046386, "loss": 0.2718, "step": 2216 }, { "epoch": 1.2427130044843049, "grad_norm": 0.07575224592435262, "learning_rate": 0.00014566620867836725, "loss": 0.2715, "step": 2217 }, { "epoch": 1.243273542600897, "grad_norm": 0.07518921890484784, "learning_rate": 0.00014560816559408142, "loss": 0.2735, "step": 2218 }, { "epoch": 1.2438340807174888, "grad_norm": 0.07499287456456347, "learning_rate": 0.0001455501031023053, "loss": 0.27, "step": 2219 }, { "epoch": 1.2443946188340806, "grad_norm": 0.07479889085555115, "learning_rate": 0.00014549202122774596, "loss": 0.2938, "step": 2220 }, { "epoch": 1.2449551569506727, "grad_norm": 0.07352899020051516, "learning_rate": 0.0001454339199951188, "loss": 0.2765, "step": 2221 }, { "epoch": 1.2455156950672646, "grad_norm": 0.07284971509791976, "learning_rate": 0.00014537579942914752, "loss": 0.2659, "step": 2222 }, { "epoch": 1.2460762331838564, "grad_norm": 0.07305313033571034, "learning_rate": 0.00014531765955456388, "loss": 0.2717, "step": 2223 }, { "epoch": 1.2466367713004485, "grad_norm": 0.07373772134837558, "learning_rate": 0.000145259500396108, "loss": 0.2711, "step": 2224 }, { "epoch": 1.2471973094170403, "grad_norm": 0.07269328693647831, "learning_rate": 0.00014520132197852812, "loss": 0.2643, "step": 2225 }, { "epoch": 1.2477578475336322, "grad_norm": 0.07349888227263872, "learning_rate": 0.00014514312432658072, "loss": 0.2868, "step": 2226 }, { "epoch": 1.2483183856502242, "grad_norm": 0.0759531850087204, "learning_rate": 0.00014508490746503044, "loss": 0.2709, "step": 2227 }, { "epoch": 1.248878923766816, "grad_norm": 0.07320229472749225, "learning_rate": 0.00014502667141865015, "loss": 0.2735, "step": 2228 }, { "epoch": 1.2494394618834082, "grad_norm": 0.07410626056346635, "learning_rate": 0.00014496841621222076, "loss": 0.2712, "step": 2229 }, { "epoch": 1.25, "grad_norm": 0.07402976456458424, "learning_rate": 0.00014491014187053148, "loss": 0.2761, "step": 2230 }, { "epoch": 1.2505605381165918, "grad_norm": 0.07312087350695935, "learning_rate": 0.0001448518484183796, "loss": 0.2681, "step": 2231 }, { "epoch": 1.251121076233184, "grad_norm": 0.07213741476143876, "learning_rate": 0.00014479353588057052, "loss": 0.2553, "step": 2232 }, { "epoch": 1.2516816143497758, "grad_norm": 0.07125397595984921, "learning_rate": 0.00014473520428191775, "loss": 0.2757, "step": 2233 }, { "epoch": 1.2522421524663678, "grad_norm": 0.07434629780509557, "learning_rate": 0.00014467685364724298, "loss": 0.2751, "step": 2234 }, { "epoch": 1.2528026905829597, "grad_norm": 0.0715831021024079, "learning_rate": 0.00014461848400137595, "loss": 0.2611, "step": 2235 }, { "epoch": 1.2533632286995515, "grad_norm": 0.07675312304907582, "learning_rate": 0.00014456009536915448, "loss": 0.2675, "step": 2236 }, { "epoch": 1.2539237668161434, "grad_norm": 0.07200440188423189, "learning_rate": 0.0001445016877754245, "loss": 0.273, "step": 2237 }, { "epoch": 1.2544843049327354, "grad_norm": 0.07061599291190361, "learning_rate": 0.00014444326124504002, "loss": 0.2595, "step": 2238 }, { "epoch": 1.2550448430493273, "grad_norm": 0.069665333881225, "learning_rate": 0.0001443848158028631, "loss": 0.2527, "step": 2239 }, { "epoch": 1.2556053811659194, "grad_norm": 0.07085060283078363, "learning_rate": 0.00014432635147376376, "loss": 0.26, "step": 2240 }, { "epoch": 1.2561659192825112, "grad_norm": 0.07287943724145639, "learning_rate": 0.00014426786828262018, "loss": 0.2734, "step": 2241 }, { "epoch": 1.256726457399103, "grad_norm": 0.07600559046718026, "learning_rate": 0.00014420936625431853, "loss": 0.2737, "step": 2242 }, { "epoch": 1.2572869955156951, "grad_norm": 0.07263207168045012, "learning_rate": 0.00014415084541375295, "loss": 0.2739, "step": 2243 }, { "epoch": 1.257847533632287, "grad_norm": 0.07481167414877608, "learning_rate": 0.00014409230578582566, "loss": 0.2825, "step": 2244 }, { "epoch": 1.258408071748879, "grad_norm": 0.07248565996941632, "learning_rate": 0.00014403374739544678, "loss": 0.2774, "step": 2245 }, { "epoch": 1.2589686098654709, "grad_norm": 0.07526214400489531, "learning_rate": 0.0001439751702675345, "loss": 0.2659, "step": 2246 }, { "epoch": 1.2595291479820627, "grad_norm": 0.07199083013001102, "learning_rate": 0.00014391657442701494, "loss": 0.2725, "step": 2247 }, { "epoch": 1.2600896860986546, "grad_norm": 0.07185921680402563, "learning_rate": 0.00014385795989882221, "loss": 0.2764, "step": 2248 }, { "epoch": 1.2606502242152466, "grad_norm": 0.0745909469659899, "learning_rate": 0.00014379932670789832, "loss": 0.2834, "step": 2249 }, { "epoch": 1.2612107623318385, "grad_norm": 0.07356235271331986, "learning_rate": 0.00014374067487919322, "loss": 0.2841, "step": 2250 }, { "epoch": 1.2617713004484306, "grad_norm": 0.07432903247307711, "learning_rate": 0.00014368200443766495, "loss": 0.2729, "step": 2251 }, { "epoch": 1.2623318385650224, "grad_norm": 0.07183795198830267, "learning_rate": 0.00014362331540827928, "loss": 0.2679, "step": 2252 }, { "epoch": 1.2628923766816142, "grad_norm": 0.0728036017529027, "learning_rate": 0.00014356460781600992, "loss": 0.2752, "step": 2253 }, { "epoch": 1.2634529147982063, "grad_norm": 0.07085391731397918, "learning_rate": 0.00014350588168583856, "loss": 0.2721, "step": 2254 }, { "epoch": 1.2640134529147982, "grad_norm": 0.0710387486000272, "learning_rate": 0.00014344713704275472, "loss": 0.2601, "step": 2255 }, { "epoch": 1.2645739910313902, "grad_norm": 0.07169809121267175, "learning_rate": 0.00014338837391175582, "loss": 0.2703, "step": 2256 }, { "epoch": 1.265134529147982, "grad_norm": 0.07053674888474745, "learning_rate": 0.00014332959231784712, "loss": 0.2643, "step": 2257 }, { "epoch": 1.265695067264574, "grad_norm": 0.07205890920821309, "learning_rate": 0.00014327079228604176, "loss": 0.2798, "step": 2258 }, { "epoch": 1.266255605381166, "grad_norm": 0.07386825344155837, "learning_rate": 0.0001432119738413608, "loss": 0.2747, "step": 2259 }, { "epoch": 1.2668161434977578, "grad_norm": 0.0721786940621619, "learning_rate": 0.00014315313700883294, "loss": 0.2558, "step": 2260 }, { "epoch": 1.26737668161435, "grad_norm": 0.07226327141121044, "learning_rate": 0.00014309428181349484, "loss": 0.279, "step": 2261 }, { "epoch": 1.2679372197309418, "grad_norm": 0.07417550526920404, "learning_rate": 0.00014303540828039098, "loss": 0.282, "step": 2262 }, { "epoch": 1.2684977578475336, "grad_norm": 0.07055665704719952, "learning_rate": 0.00014297651643457366, "loss": 0.2752, "step": 2263 }, { "epoch": 1.2690582959641254, "grad_norm": 0.07145927025909532, "learning_rate": 0.00014291760630110288, "loss": 0.2721, "step": 2264 }, { "epoch": 1.2696188340807175, "grad_norm": 0.07055847344841475, "learning_rate": 0.0001428586779050465, "loss": 0.2575, "step": 2265 }, { "epoch": 1.2701793721973094, "grad_norm": 0.0706480831312279, "learning_rate": 0.00014279973127148004, "loss": 0.2786, "step": 2266 }, { "epoch": 1.2707399103139014, "grad_norm": 0.07007179862259319, "learning_rate": 0.000142740766425487, "loss": 0.2619, "step": 2267 }, { "epoch": 1.2713004484304933, "grad_norm": 0.07131325441368183, "learning_rate": 0.00014268178339215838, "loss": 0.2698, "step": 2268 }, { "epoch": 1.2718609865470851, "grad_norm": 0.07415336161002052, "learning_rate": 0.0001426227821965931, "loss": 0.2729, "step": 2269 }, { "epoch": 1.2724215246636772, "grad_norm": 0.07306777949703797, "learning_rate": 0.00014256376286389769, "loss": 0.2692, "step": 2270 }, { "epoch": 1.272982062780269, "grad_norm": 0.07156311353403604, "learning_rate": 0.0001425047254191865, "loss": 0.2725, "step": 2271 }, { "epoch": 1.273542600896861, "grad_norm": 0.07526729163791998, "learning_rate": 0.00014244566988758152, "loss": 0.2835, "step": 2272 }, { "epoch": 1.274103139013453, "grad_norm": 0.07258970663293911, "learning_rate": 0.00014238659629421245, "loss": 0.2709, "step": 2273 }, { "epoch": 1.2746636771300448, "grad_norm": 0.0750406240649443, "learning_rate": 0.00014232750466421665, "loss": 0.2783, "step": 2274 }, { "epoch": 1.2752242152466366, "grad_norm": 0.07456835751881956, "learning_rate": 0.0001422683950227392, "loss": 0.2831, "step": 2275 }, { "epoch": 1.2757847533632287, "grad_norm": 0.0738641247695516, "learning_rate": 0.00014220926739493288, "loss": 0.2862, "step": 2276 }, { "epoch": 1.2763452914798206, "grad_norm": 0.07447914433347605, "learning_rate": 0.00014215012180595802, "loss": 0.2725, "step": 2277 }, { "epoch": 1.2769058295964126, "grad_norm": 0.072193602978639, "learning_rate": 0.00014209095828098263, "loss": 0.2648, "step": 2278 }, { "epoch": 1.2774663677130045, "grad_norm": 0.0706664606726907, "learning_rate": 0.00014203177684518243, "loss": 0.258, "step": 2279 }, { "epoch": 1.2780269058295963, "grad_norm": 0.07333496681068567, "learning_rate": 0.0001419725775237406, "loss": 0.2773, "step": 2280 }, { "epoch": 1.2785874439461884, "grad_norm": 0.07243360497997042, "learning_rate": 0.00014191336034184818, "loss": 0.2741, "step": 2281 }, { "epoch": 1.2791479820627802, "grad_norm": 0.06998674397688194, "learning_rate": 0.0001418541253247035, "loss": 0.2667, "step": 2282 }, { "epoch": 1.2797085201793723, "grad_norm": 0.06880900181815017, "learning_rate": 0.0001417948724975127, "loss": 0.2514, "step": 2283 }, { "epoch": 1.2802690582959642, "grad_norm": 0.07143344551366512, "learning_rate": 0.00014173560188548948, "loss": 0.27, "step": 2284 }, { "epoch": 1.280829596412556, "grad_norm": 0.07293076040915937, "learning_rate": 0.00014167631351385504, "loss": 0.2678, "step": 2285 }, { "epoch": 1.2813901345291479, "grad_norm": 0.07382402327007528, "learning_rate": 0.00014161700740783815, "loss": 0.2812, "step": 2286 }, { "epoch": 1.28195067264574, "grad_norm": 0.07046114694580449, "learning_rate": 0.00014155768359267511, "loss": 0.2603, "step": 2287 }, { "epoch": 1.2825112107623318, "grad_norm": 0.07294584210310635, "learning_rate": 0.00014149834209360986, "loss": 0.2695, "step": 2288 }, { "epoch": 1.2830717488789238, "grad_norm": 0.07311525705372239, "learning_rate": 0.00014143898293589373, "loss": 0.2697, "step": 2289 }, { "epoch": 1.2836322869955157, "grad_norm": 0.07151234033113946, "learning_rate": 0.00014137960614478564, "loss": 0.2639, "step": 2290 }, { "epoch": 1.2841928251121075, "grad_norm": 0.06883676087253877, "learning_rate": 0.00014132021174555198, "loss": 0.266, "step": 2291 }, { "epoch": 1.2847533632286996, "grad_norm": 0.07323764340793483, "learning_rate": 0.0001412607997634667, "loss": 0.281, "step": 2292 }, { "epoch": 1.2853139013452914, "grad_norm": 0.06994806350440688, "learning_rate": 0.00014120137022381117, "loss": 0.2607, "step": 2293 }, { "epoch": 1.2858744394618835, "grad_norm": 0.07302141344099933, "learning_rate": 0.0001411419231518742, "loss": 0.2666, "step": 2294 }, { "epoch": 1.2864349775784754, "grad_norm": 0.07577632080911738, "learning_rate": 0.0001410824585729521, "loss": 0.2797, "step": 2295 }, { "epoch": 1.2869955156950672, "grad_norm": 0.07252737827488943, "learning_rate": 0.0001410229765123487, "loss": 0.2752, "step": 2296 }, { "epoch": 1.2875560538116593, "grad_norm": 0.07270549855949322, "learning_rate": 0.00014096347699537516, "loss": 0.2787, "step": 2297 }, { "epoch": 1.2881165919282511, "grad_norm": 0.07168693628490974, "learning_rate": 0.0001409039600473501, "loss": 0.2583, "step": 2298 }, { "epoch": 1.288677130044843, "grad_norm": 0.07304145070152694, "learning_rate": 0.00014084442569359964, "loss": 0.2747, "step": 2299 }, { "epoch": 1.289237668161435, "grad_norm": 0.07304492367720695, "learning_rate": 0.00014078487395945713, "loss": 0.2793, "step": 2300 }, { "epoch": 1.2897982062780269, "grad_norm": 0.07261463886806618, "learning_rate": 0.00014072530487026347, "loss": 0.2679, "step": 2301 }, { "epoch": 1.2903587443946187, "grad_norm": 0.07501550696532452, "learning_rate": 0.00014066571845136692, "loss": 0.2829, "step": 2302 }, { "epoch": 1.2909192825112108, "grad_norm": 0.0725622550946889, "learning_rate": 0.000140606114728123, "loss": 0.2868, "step": 2303 }, { "epoch": 1.2914798206278026, "grad_norm": 0.0715450409641427, "learning_rate": 0.00014054649372589482, "loss": 0.271, "step": 2304 }, { "epoch": 1.2920403587443947, "grad_norm": 0.07332758764704696, "learning_rate": 0.0001404868554700526, "loss": 0.2819, "step": 2305 }, { "epoch": 1.2926008968609866, "grad_norm": 0.0691637338358271, "learning_rate": 0.00014042719998597409, "loss": 0.2628, "step": 2306 }, { "epoch": 1.2931614349775784, "grad_norm": 0.07196187566198209, "learning_rate": 0.00014036752729904418, "loss": 0.2773, "step": 2307 }, { "epoch": 1.2937219730941705, "grad_norm": 0.07556174369975631, "learning_rate": 0.00014030783743465528, "loss": 0.2833, "step": 2308 }, { "epoch": 1.2942825112107623, "grad_norm": 0.07430317200840218, "learning_rate": 0.00014024813041820699, "loss": 0.2831, "step": 2309 }, { "epoch": 1.2948430493273544, "grad_norm": 0.07428510115993528, "learning_rate": 0.00014018840627510622, "loss": 0.2674, "step": 2310 }, { "epoch": 1.2954035874439462, "grad_norm": 0.07122164159460866, "learning_rate": 0.00014012866503076721, "loss": 0.2672, "step": 2311 }, { "epoch": 1.295964125560538, "grad_norm": 0.0729114329375746, "learning_rate": 0.00014006890671061143, "loss": 0.277, "step": 2312 }, { "epoch": 1.29652466367713, "grad_norm": 0.07042398703873136, "learning_rate": 0.00014000913134006767, "loss": 0.2559, "step": 2313 }, { "epoch": 1.297085201793722, "grad_norm": 0.07393397653669982, "learning_rate": 0.00013994933894457192, "loss": 0.2548, "step": 2314 }, { "epoch": 1.2976457399103138, "grad_norm": 0.07287978252403222, "learning_rate": 0.00013988952954956745, "loss": 0.2757, "step": 2315 }, { "epoch": 1.298206278026906, "grad_norm": 0.07234603630097806, "learning_rate": 0.0001398297031805047, "loss": 0.2665, "step": 2316 }, { "epoch": 1.2987668161434978, "grad_norm": 0.07375355118824485, "learning_rate": 0.00013976985986284147, "loss": 0.2733, "step": 2317 }, { "epoch": 1.2993273542600896, "grad_norm": 0.07087325341820375, "learning_rate": 0.00013970999962204265, "loss": 0.259, "step": 2318 }, { "epoch": 1.2998878923766817, "grad_norm": 0.07220953544823676, "learning_rate": 0.00013965012248358036, "loss": 0.2755, "step": 2319 }, { "epoch": 1.3004484304932735, "grad_norm": 0.07119809272659058, "learning_rate": 0.00013959022847293391, "loss": 0.2636, "step": 2320 }, { "epoch": 1.3010089686098656, "grad_norm": 0.07253157824321244, "learning_rate": 0.00013953031761558982, "loss": 0.2729, "step": 2321 }, { "epoch": 1.3015695067264574, "grad_norm": 0.07186140529816074, "learning_rate": 0.00013947038993704177, "loss": 0.2741, "step": 2322 }, { "epoch": 1.3021300448430493, "grad_norm": 0.07143050947797319, "learning_rate": 0.00013941044546279054, "loss": 0.2593, "step": 2323 }, { "epoch": 1.3026905829596411, "grad_norm": 0.07322050298555188, "learning_rate": 0.0001393504842183441, "loss": 0.2849, "step": 2324 }, { "epoch": 1.3032511210762332, "grad_norm": 0.07245247220336955, "learning_rate": 0.00013929050622921762, "loss": 0.2681, "step": 2325 }, { "epoch": 1.303811659192825, "grad_norm": 0.0721492767322242, "learning_rate": 0.0001392305115209333, "loss": 0.2769, "step": 2326 }, { "epoch": 1.3043721973094171, "grad_norm": 0.07130441768921186, "learning_rate": 0.00013917050011902048, "loss": 0.2734, "step": 2327 }, { "epoch": 1.304932735426009, "grad_norm": 0.07381937785338555, "learning_rate": 0.0001391104720490156, "loss": 0.2767, "step": 2328 }, { "epoch": 1.3054932735426008, "grad_norm": 0.07235614217671463, "learning_rate": 0.00013905042733646224, "loss": 0.2711, "step": 2329 }, { "epoch": 1.3060538116591929, "grad_norm": 0.07019467785727469, "learning_rate": 0.000138990366006911, "loss": 0.2624, "step": 2330 }, { "epoch": 1.3066143497757847, "grad_norm": 0.07240908628062101, "learning_rate": 0.0001389302880859196, "loss": 0.2868, "step": 2331 }, { "epoch": 1.3071748878923768, "grad_norm": 0.07257995860684734, "learning_rate": 0.00013887019359905275, "loss": 0.2715, "step": 2332 }, { "epoch": 1.3077354260089686, "grad_norm": 0.07237788870869241, "learning_rate": 0.0001388100825718823, "loss": 0.2779, "step": 2333 }, { "epoch": 1.3082959641255605, "grad_norm": 0.07339006898519476, "learning_rate": 0.00013874995502998706, "loss": 0.2797, "step": 2334 }, { "epoch": 1.3088565022421523, "grad_norm": 0.07075462432399776, "learning_rate": 0.00013868981099895294, "loss": 0.2601, "step": 2335 }, { "epoch": 1.3094170403587444, "grad_norm": 0.07162694408756295, "learning_rate": 0.0001386296505043728, "loss": 0.263, "step": 2336 }, { "epoch": 1.3099775784753362, "grad_norm": 0.06924353205055772, "learning_rate": 0.00013856947357184657, "loss": 0.2719, "step": 2337 }, { "epoch": 1.3105381165919283, "grad_norm": 0.07313976800317197, "learning_rate": 0.00013850928022698112, "loss": 0.2779, "step": 2338 }, { "epoch": 1.3110986547085202, "grad_norm": 0.07219278412249301, "learning_rate": 0.0001384490704953903, "loss": 0.2662, "step": 2339 }, { "epoch": 1.311659192825112, "grad_norm": 0.07088047183199304, "learning_rate": 0.00013838884440269496, "loss": 0.2623, "step": 2340 }, { "epoch": 1.312219730941704, "grad_norm": 0.07202159403822177, "learning_rate": 0.00013832860197452294, "loss": 0.2628, "step": 2341 }, { "epoch": 1.312780269058296, "grad_norm": 0.07289564709942851, "learning_rate": 0.000138268343236509, "loss": 0.2837, "step": 2342 }, { "epoch": 1.313340807174888, "grad_norm": 0.06880975974949648, "learning_rate": 0.0001382080682142948, "loss": 0.26, "step": 2343 }, { "epoch": 1.3139013452914798, "grad_norm": 0.07268727279675526, "learning_rate": 0.000138147776933529, "loss": 0.2749, "step": 2344 }, { "epoch": 1.3144618834080717, "grad_norm": 0.07258524138923673, "learning_rate": 0.00013808746941986708, "loss": 0.2746, "step": 2345 }, { "epoch": 1.3150224215246638, "grad_norm": 0.07379419595042286, "learning_rate": 0.00013802714569897162, "loss": 0.2699, "step": 2346 }, { "epoch": 1.3155829596412556, "grad_norm": 0.07166246045813811, "learning_rate": 0.00013796680579651187, "loss": 0.2713, "step": 2347 }, { "epoch": 1.3161434977578477, "grad_norm": 0.07269806792322606, "learning_rate": 0.0001379064497381641, "loss": 0.2637, "step": 2348 }, { "epoch": 1.3167040358744395, "grad_norm": 0.0748217429176605, "learning_rate": 0.0001378460775496114, "loss": 0.2867, "step": 2349 }, { "epoch": 1.3172645739910314, "grad_norm": 0.07115741698647755, "learning_rate": 0.00013778568925654382, "loss": 0.2629, "step": 2350 }, { "epoch": 1.3178251121076232, "grad_norm": 0.07347852046495931, "learning_rate": 0.0001377252848846581, "loss": 0.2665, "step": 2351 }, { "epoch": 1.3183856502242153, "grad_norm": 0.07287568379605906, "learning_rate": 0.00013766486445965795, "loss": 0.273, "step": 2352 }, { "epoch": 1.3189461883408071, "grad_norm": 0.07487119510405997, "learning_rate": 0.00013760442800725387, "loss": 0.2804, "step": 2353 }, { "epoch": 1.3195067264573992, "grad_norm": 0.07129724434208816, "learning_rate": 0.00013754397555316322, "loss": 0.268, "step": 2354 }, { "epoch": 1.320067264573991, "grad_norm": 0.07413545960455435, "learning_rate": 0.00013748350712311004, "loss": 0.2809, "step": 2355 }, { "epoch": 1.3206278026905829, "grad_norm": 0.07167169675379284, "learning_rate": 0.00013742302274282533, "loss": 0.2781, "step": 2356 }, { "epoch": 1.321188340807175, "grad_norm": 0.07308218187278753, "learning_rate": 0.00013736252243804677, "loss": 0.2783, "step": 2357 }, { "epoch": 1.3217488789237668, "grad_norm": 0.07226496753721331, "learning_rate": 0.00013730200623451888, "loss": 0.2732, "step": 2358 }, { "epoch": 1.3223094170403589, "grad_norm": 0.07190735515069895, "learning_rate": 0.00013724147415799292, "loss": 0.2672, "step": 2359 }, { "epoch": 1.3228699551569507, "grad_norm": 0.07240624410270693, "learning_rate": 0.00013718092623422686, "loss": 0.2718, "step": 2360 }, { "epoch": 1.3234304932735426, "grad_norm": 0.07338334184086785, "learning_rate": 0.0001371203624889855, "loss": 0.2784, "step": 2361 }, { "epoch": 1.3239910313901344, "grad_norm": 0.07082306820891138, "learning_rate": 0.00013705978294804028, "loss": 0.2585, "step": 2362 }, { "epoch": 1.3245515695067265, "grad_norm": 0.07207401693906636, "learning_rate": 0.0001369991876371695, "loss": 0.2876, "step": 2363 }, { "epoch": 1.3251121076233183, "grad_norm": 0.0739441911284224, "learning_rate": 0.000136938576582158, "loss": 0.2788, "step": 2364 }, { "epoch": 1.3256726457399104, "grad_norm": 0.07423962072526538, "learning_rate": 0.0001368779498087974, "loss": 0.2803, "step": 2365 }, { "epoch": 1.3262331838565022, "grad_norm": 0.07505191493550833, "learning_rate": 0.00013681730734288605, "loss": 0.2706, "step": 2366 }, { "epoch": 1.326793721973094, "grad_norm": 0.07051531038088685, "learning_rate": 0.0001367566492102289, "loss": 0.2647, "step": 2367 }, { "epoch": 1.3273542600896862, "grad_norm": 0.07118623141697969, "learning_rate": 0.00013669597543663762, "loss": 0.278, "step": 2368 }, { "epoch": 1.327914798206278, "grad_norm": 0.07214664899734435, "learning_rate": 0.0001366352860479305, "loss": 0.2586, "step": 2369 }, { "epoch": 1.32847533632287, "grad_norm": 0.07179236909265237, "learning_rate": 0.00013657458106993258, "loss": 0.2739, "step": 2370 }, { "epoch": 1.329035874439462, "grad_norm": 0.07101289143586781, "learning_rate": 0.00013651386052847533, "loss": 0.2697, "step": 2371 }, { "epoch": 1.3295964125560538, "grad_norm": 0.0725617724663863, "learning_rate": 0.000136453124449397, "loss": 0.2604, "step": 2372 }, { "epoch": 1.3301569506726456, "grad_norm": 0.076506082639709, "learning_rate": 0.00013639237285854243, "loss": 0.2778, "step": 2373 }, { "epoch": 1.3307174887892377, "grad_norm": 0.07319661031867133, "learning_rate": 0.000136331605781763, "loss": 0.2664, "step": 2374 }, { "epoch": 1.3312780269058295, "grad_norm": 0.0719402160984244, "learning_rate": 0.00013627082324491678, "loss": 0.2619, "step": 2375 }, { "epoch": 1.3318385650224216, "grad_norm": 0.07019774909320625, "learning_rate": 0.00013621002527386834, "loss": 0.252, "step": 2376 }, { "epoch": 1.3323991031390134, "grad_norm": 0.07035640757768981, "learning_rate": 0.00013614921189448879, "loss": 0.2748, "step": 2377 }, { "epoch": 1.3329596412556053, "grad_norm": 0.07268993860380529, "learning_rate": 0.00013608838313265587, "loss": 0.2734, "step": 2378 }, { "epoch": 1.3335201793721974, "grad_norm": 0.07116808690159099, "learning_rate": 0.0001360275390142539, "loss": 0.2758, "step": 2379 }, { "epoch": 1.3340807174887892, "grad_norm": 0.0705630739577653, "learning_rate": 0.0001359666795651736, "loss": 0.2754, "step": 2380 }, { "epoch": 1.3346412556053813, "grad_norm": 0.07124532253903049, "learning_rate": 0.0001359058048113123, "loss": 0.2669, "step": 2381 }, { "epoch": 1.3352017937219731, "grad_norm": 0.07120827955893569, "learning_rate": 0.00013584491477857384, "loss": 0.262, "step": 2382 }, { "epoch": 1.335762331838565, "grad_norm": 0.07179104584486316, "learning_rate": 0.00013578400949286855, "loss": 0.2719, "step": 2383 }, { "epoch": 1.336322869955157, "grad_norm": 0.07098980417413621, "learning_rate": 0.0001357230889801133, "loss": 0.2617, "step": 2384 }, { "epoch": 1.3368834080717489, "grad_norm": 0.07208511500860665, "learning_rate": 0.0001356621532662313, "loss": 0.2639, "step": 2385 }, { "epoch": 1.3374439461883407, "grad_norm": 0.07294348606152207, "learning_rate": 0.00013560120237715242, "loss": 0.2694, "step": 2386 }, { "epoch": 1.3380044843049328, "grad_norm": 0.072253436486753, "learning_rate": 0.0001355402363388128, "loss": 0.2727, "step": 2387 }, { "epoch": 1.3385650224215246, "grad_norm": 0.07179010157489436, "learning_rate": 0.00013547925517715519, "loss": 0.2638, "step": 2388 }, { "epoch": 1.3391255605381165, "grad_norm": 0.07031658021354985, "learning_rate": 0.00013541825891812863, "loss": 0.2732, "step": 2389 }, { "epoch": 1.3396860986547086, "grad_norm": 0.07361561315970444, "learning_rate": 0.00013535724758768867, "loss": 0.2683, "step": 2390 }, { "epoch": 1.3402466367713004, "grad_norm": 0.07007627948386362, "learning_rate": 0.00013529622121179733, "loss": 0.2651, "step": 2391 }, { "epoch": 1.3408071748878925, "grad_norm": 0.07248825788078303, "learning_rate": 0.00013523517981642286, "loss": 0.2757, "step": 2392 }, { "epoch": 1.3413677130044843, "grad_norm": 0.07486248813509154, "learning_rate": 0.0001351741234275401, "loss": 0.2791, "step": 2393 }, { "epoch": 1.3419282511210762, "grad_norm": 0.07215523400320231, "learning_rate": 0.0001351130520711301, "loss": 0.2822, "step": 2394 }, { "epoch": 1.3424887892376682, "grad_norm": 0.07206609859268151, "learning_rate": 0.0001350519657731803, "loss": 0.2719, "step": 2395 }, { "epoch": 1.34304932735426, "grad_norm": 0.07094754593911295, "learning_rate": 0.00013499086455968467, "loss": 0.2672, "step": 2396 }, { "epoch": 1.3436098654708521, "grad_norm": 0.07345131253742171, "learning_rate": 0.00013492974845664336, "loss": 0.2646, "step": 2397 }, { "epoch": 1.344170403587444, "grad_norm": 0.07148448933462602, "learning_rate": 0.00013486861749006286, "loss": 0.2631, "step": 2398 }, { "epoch": 1.3447309417040358, "grad_norm": 0.0734079306257349, "learning_rate": 0.0001348074716859561, "loss": 0.2803, "step": 2399 }, { "epoch": 1.3452914798206277, "grad_norm": 0.07510957327668555, "learning_rate": 0.0001347463110703422, "loss": 0.2732, "step": 2400 }, { "epoch": 1.3458520179372198, "grad_norm": 0.07361881542352745, "learning_rate": 0.00013468513566924662, "loss": 0.2682, "step": 2401 }, { "epoch": 1.3464125560538116, "grad_norm": 0.07337817081027358, "learning_rate": 0.00013462394550870115, "loss": 0.2746, "step": 2402 }, { "epoch": 1.3469730941704037, "grad_norm": 0.07321876065712256, "learning_rate": 0.00013456274061474384, "loss": 0.2618, "step": 2403 }, { "epoch": 1.3475336322869955, "grad_norm": 0.07395144294198633, "learning_rate": 0.00013450152101341896, "loss": 0.2704, "step": 2404 }, { "epoch": 1.3480941704035874, "grad_norm": 0.07216160241493295, "learning_rate": 0.00013444028673077716, "loss": 0.2855, "step": 2405 }, { "epoch": 1.3486547085201794, "grad_norm": 0.06951922593507989, "learning_rate": 0.0001343790377928752, "loss": 0.2627, "step": 2406 }, { "epoch": 1.3492152466367713, "grad_norm": 0.07141496023438036, "learning_rate": 0.00013431777422577614, "loss": 0.2676, "step": 2407 }, { "epoch": 1.3497757847533634, "grad_norm": 0.07112079206079516, "learning_rate": 0.00013425649605554928, "loss": 0.2668, "step": 2408 }, { "epoch": 1.3503363228699552, "grad_norm": 0.07165333061044075, "learning_rate": 0.0001341952033082701, "loss": 0.2698, "step": 2409 }, { "epoch": 1.350896860986547, "grad_norm": 0.0731320853414597, "learning_rate": 0.00013413389601002034, "loss": 0.2647, "step": 2410 }, { "epoch": 1.351457399103139, "grad_norm": 0.07198129210042335, "learning_rate": 0.0001340725741868878, "loss": 0.2662, "step": 2411 }, { "epoch": 1.352017937219731, "grad_norm": 0.0684396964613639, "learning_rate": 0.00013401123786496664, "loss": 0.2539, "step": 2412 }, { "epoch": 1.3525784753363228, "grad_norm": 0.07309220050943435, "learning_rate": 0.00013394988707035707, "loss": 0.2577, "step": 2413 }, { "epoch": 1.3531390134529149, "grad_norm": 0.07483948334841695, "learning_rate": 0.00013388852182916544, "loss": 0.2702, "step": 2414 }, { "epoch": 1.3536995515695067, "grad_norm": 0.07026305718568777, "learning_rate": 0.00013382714216750438, "loss": 0.2591, "step": 2415 }, { "epoch": 1.3542600896860986, "grad_norm": 0.07500682600556109, "learning_rate": 0.00013376574811149253, "loss": 0.2829, "step": 2416 }, { "epoch": 1.3548206278026906, "grad_norm": 0.07317671520929299, "learning_rate": 0.00013370433968725468, "loss": 0.2786, "step": 2417 }, { "epoch": 1.3553811659192825, "grad_norm": 0.07092215065383145, "learning_rate": 0.00013364291692092182, "loss": 0.2724, "step": 2418 }, { "epoch": 1.3559417040358746, "grad_norm": 0.0722465926401936, "learning_rate": 0.00013358147983863087, "loss": 0.276, "step": 2419 }, { "epoch": 1.3565022421524664, "grad_norm": 0.0689147359853441, "learning_rate": 0.00013352002846652504, "loss": 0.262, "step": 2420 }, { "epoch": 1.3570627802690582, "grad_norm": 0.06956653911163503, "learning_rate": 0.00013345856283075347, "loss": 0.2684, "step": 2421 }, { "epoch": 1.35762331838565, "grad_norm": 0.07095296721528385, "learning_rate": 0.00013339708295747146, "loss": 0.2624, "step": 2422 }, { "epoch": 1.3581838565022422, "grad_norm": 0.07132575762686803, "learning_rate": 0.0001333355888728403, "loss": 0.2811, "step": 2423 }, { "epoch": 1.358744394618834, "grad_norm": 0.07301929011953896, "learning_rate": 0.00013327408060302738, "loss": 0.2703, "step": 2424 }, { "epoch": 1.359304932735426, "grad_norm": 0.07186179738526437, "learning_rate": 0.00013321255817420614, "loss": 0.2708, "step": 2425 }, { "epoch": 1.359865470852018, "grad_norm": 0.07158317572330604, "learning_rate": 0.000133151021612556, "loss": 0.2715, "step": 2426 }, { "epoch": 1.3604260089686098, "grad_norm": 0.07282108957574474, "learning_rate": 0.00013308947094426237, "loss": 0.278, "step": 2427 }, { "epoch": 1.3609865470852018, "grad_norm": 0.07211048808596557, "learning_rate": 0.00013302790619551674, "loss": 0.2582, "step": 2428 }, { "epoch": 1.3615470852017937, "grad_norm": 0.07199512540150688, "learning_rate": 0.00013296632739251649, "loss": 0.2616, "step": 2429 }, { "epoch": 1.3621076233183858, "grad_norm": 0.07376143293365058, "learning_rate": 0.00013290473456146513, "loss": 0.271, "step": 2430 }, { "epoch": 1.3626681614349776, "grad_norm": 0.07289892751397548, "learning_rate": 0.00013284312772857197, "loss": 0.2709, "step": 2431 }, { "epoch": 1.3632286995515694, "grad_norm": 0.07210804572294657, "learning_rate": 0.00013278150692005243, "loss": 0.2838, "step": 2432 }, { "epoch": 1.3637892376681615, "grad_norm": 0.07043118964102611, "learning_rate": 0.0001327198721621278, "loss": 0.2609, "step": 2433 }, { "epoch": 1.3643497757847534, "grad_norm": 0.07031977851384845, "learning_rate": 0.00013265822348102526, "loss": 0.2752, "step": 2434 }, { "epoch": 1.3649103139013454, "grad_norm": 0.0726635579400134, "learning_rate": 0.000132596560902978, "loss": 0.2747, "step": 2435 }, { "epoch": 1.3654708520179373, "grad_norm": 0.07282388910028605, "learning_rate": 0.00013253488445422507, "loss": 0.2769, "step": 2436 }, { "epoch": 1.3660313901345291, "grad_norm": 0.07105098521583782, "learning_rate": 0.00013247319416101146, "loss": 0.2562, "step": 2437 }, { "epoch": 1.366591928251121, "grad_norm": 0.0722914203984814, "learning_rate": 0.00013241149004958807, "loss": 0.277, "step": 2438 }, { "epoch": 1.367152466367713, "grad_norm": 0.07130305361271941, "learning_rate": 0.00013234977214621158, "loss": 0.2679, "step": 2439 }, { "epoch": 1.3677130044843049, "grad_norm": 0.07382272031436721, "learning_rate": 0.00013228804047714463, "loss": 0.2838, "step": 2440 }, { "epoch": 1.368273542600897, "grad_norm": 0.07168330955222213, "learning_rate": 0.00013222629506865572, "loss": 0.2589, "step": 2441 }, { "epoch": 1.3688340807174888, "grad_norm": 0.07176541308751692, "learning_rate": 0.00013216453594701912, "loss": 0.2676, "step": 2442 }, { "epoch": 1.3693946188340806, "grad_norm": 0.07160344434933923, "learning_rate": 0.000132102763138515, "loss": 0.2722, "step": 2443 }, { "epoch": 1.3699551569506727, "grad_norm": 0.07152436426939439, "learning_rate": 0.00013204097666942932, "loss": 0.2858, "step": 2444 }, { "epoch": 1.3705156950672646, "grad_norm": 0.07669489731715146, "learning_rate": 0.0001319791765660539, "loss": 0.2747, "step": 2445 }, { "epoch": 1.3710762331838566, "grad_norm": 0.07425976800563304, "learning_rate": 0.00013191736285468638, "loss": 0.2834, "step": 2446 }, { "epoch": 1.3716367713004485, "grad_norm": 0.07296885862444086, "learning_rate": 0.00013185553556163, "loss": 0.2713, "step": 2447 }, { "epoch": 1.3721973094170403, "grad_norm": 0.07428666702438311, "learning_rate": 0.00013179369471319404, "loss": 0.2842, "step": 2448 }, { "epoch": 1.3727578475336322, "grad_norm": 0.07296014058514794, "learning_rate": 0.00013173184033569342, "loss": 0.2745, "step": 2449 }, { "epoch": 1.3733183856502242, "grad_norm": 0.07174745038851872, "learning_rate": 0.00013166997245544877, "loss": 0.2783, "step": 2450 }, { "epoch": 1.373878923766816, "grad_norm": 0.07147497552727385, "learning_rate": 0.00013160809109878655, "loss": 0.2721, "step": 2451 }, { "epoch": 1.3744394618834082, "grad_norm": 0.07150141944074127, "learning_rate": 0.00013154619629203893, "loss": 0.2598, "step": 2452 }, { "epoch": 1.375, "grad_norm": 0.07181043821190135, "learning_rate": 0.00013148428806154382, "loss": 0.2667, "step": 2453 }, { "epoch": 1.3755605381165918, "grad_norm": 0.07051746386550256, "learning_rate": 0.0001314223664336448, "loss": 0.2742, "step": 2454 }, { "epoch": 1.376121076233184, "grad_norm": 0.07253062832481688, "learning_rate": 0.00013136043143469116, "loss": 0.2634, "step": 2455 }, { "epoch": 1.3766816143497758, "grad_norm": 0.07432425293979825, "learning_rate": 0.0001312984830910379, "loss": 0.2663, "step": 2456 }, { "epoch": 1.3772421524663678, "grad_norm": 0.0711176297415943, "learning_rate": 0.00013123652142904574, "loss": 0.2719, "step": 2457 }, { "epoch": 1.3778026905829597, "grad_norm": 0.07257757758574657, "learning_rate": 0.00013117454647508094, "loss": 0.2681, "step": 2458 }, { "epoch": 1.3783632286995515, "grad_norm": 0.0708100378992227, "learning_rate": 0.00013111255825551556, "loss": 0.2698, "step": 2459 }, { "epoch": 1.3789237668161434, "grad_norm": 0.06870018997269857, "learning_rate": 0.0001310505567967272, "loss": 0.2718, "step": 2460 }, { "epoch": 1.3794843049327354, "grad_norm": 0.07203519243056784, "learning_rate": 0.00013098854212509917, "loss": 0.2796, "step": 2461 }, { "epoch": 1.3800448430493273, "grad_norm": 0.07187680311401265, "learning_rate": 0.00013092651426702034, "loss": 0.2684, "step": 2462 }, { "epoch": 1.3806053811659194, "grad_norm": 0.06981221618911998, "learning_rate": 0.0001308644732488852, "loss": 0.2496, "step": 2463 }, { "epoch": 1.3811659192825112, "grad_norm": 0.0731266258637114, "learning_rate": 0.00013080241909709387, "loss": 0.2763, "step": 2464 }, { "epoch": 1.381726457399103, "grad_norm": 0.07584163817780647, "learning_rate": 0.00013074035183805209, "loss": 0.2723, "step": 2465 }, { "epoch": 1.3822869955156951, "grad_norm": 0.07430429344317484, "learning_rate": 0.00013067827149817112, "loss": 0.2689, "step": 2466 }, { "epoch": 1.382847533632287, "grad_norm": 0.07265967034126478, "learning_rate": 0.00013061617810386774, "loss": 0.2596, "step": 2467 }, { "epoch": 1.383408071748879, "grad_norm": 0.07203365339853766, "learning_rate": 0.00013055407168156437, "loss": 0.2613, "step": 2468 }, { "epoch": 1.3839686098654709, "grad_norm": 0.06996162619517453, "learning_rate": 0.00013049195225768898, "loss": 0.2726, "step": 2469 }, { "epoch": 1.3845291479820627, "grad_norm": 0.07104204707391903, "learning_rate": 0.00013042981985867503, "loss": 0.2646, "step": 2470 }, { "epoch": 1.3850896860986546, "grad_norm": 0.07096724598851205, "learning_rate": 0.00013036767451096148, "loss": 0.2522, "step": 2471 }, { "epoch": 1.3856502242152466, "grad_norm": 0.07160993411400399, "learning_rate": 0.00013030551624099287, "loss": 0.2751, "step": 2472 }, { "epoch": 1.3862107623318385, "grad_norm": 0.07107463944019773, "learning_rate": 0.0001302433450752192, "loss": 0.2805, "step": 2473 }, { "epoch": 1.3867713004484306, "grad_norm": 0.07394302746868758, "learning_rate": 0.0001301811610400959, "loss": 0.3, "step": 2474 }, { "epoch": 1.3873318385650224, "grad_norm": 0.07500896698184124, "learning_rate": 0.00013011896416208405, "loss": 0.261, "step": 2475 }, { "epoch": 1.3878923766816142, "grad_norm": 0.07284475433767182, "learning_rate": 0.00013005675446764998, "loss": 0.279, "step": 2476 }, { "epoch": 1.3884529147982063, "grad_norm": 0.07059100190411745, "learning_rate": 0.00012999453198326557, "loss": 0.2681, "step": 2477 }, { "epoch": 1.3890134529147982, "grad_norm": 0.07201489034288251, "learning_rate": 0.00012993229673540822, "loss": 0.2748, "step": 2478 }, { "epoch": 1.3895739910313902, "grad_norm": 0.06947626310788592, "learning_rate": 0.0001298700487505606, "loss": 0.2692, "step": 2479 }, { "epoch": 1.390134529147982, "grad_norm": 0.07303319834804392, "learning_rate": 0.000129807788055211, "loss": 0.2737, "step": 2480 }, { "epoch": 1.390695067264574, "grad_norm": 0.07431794317957002, "learning_rate": 0.0001297455146758529, "loss": 0.2615, "step": 2481 }, { "epoch": 1.391255605381166, "grad_norm": 0.07137612781800744, "learning_rate": 0.00012968322863898533, "loss": 0.262, "step": 2482 }, { "epoch": 1.3918161434977578, "grad_norm": 0.07266858820294593, "learning_rate": 0.00012962092997111265, "loss": 0.2845, "step": 2483 }, { "epoch": 1.39237668161435, "grad_norm": 0.07229964808963044, "learning_rate": 0.0001295586186987446, "loss": 0.2717, "step": 2484 }, { "epoch": 1.3929372197309418, "grad_norm": 0.07265995318964269, "learning_rate": 0.0001294962948483963, "loss": 0.264, "step": 2485 }, { "epoch": 1.3934977578475336, "grad_norm": 0.07191498135629393, "learning_rate": 0.00012943395844658821, "loss": 0.2534, "step": 2486 }, { "epoch": 1.3940582959641254, "grad_norm": 0.07209557545021433, "learning_rate": 0.0001293716095198461, "loss": 0.2685, "step": 2487 }, { "epoch": 1.3946188340807175, "grad_norm": 0.07239319810802483, "learning_rate": 0.00012930924809470115, "loss": 0.2682, "step": 2488 }, { "epoch": 1.3951793721973094, "grad_norm": 0.07657128034518046, "learning_rate": 0.00012924687419768976, "loss": 0.2841, "step": 2489 }, { "epoch": 1.3957399103139014, "grad_norm": 0.07293139721131948, "learning_rate": 0.0001291844878553537, "loss": 0.2743, "step": 2490 }, { "epoch": 1.3963004484304933, "grad_norm": 0.07139628602035565, "learning_rate": 0.00012912208909424006, "loss": 0.2649, "step": 2491 }, { "epoch": 1.3968609865470851, "grad_norm": 0.0713037784138301, "learning_rate": 0.00012905967794090114, "loss": 0.2672, "step": 2492 }, { "epoch": 1.3974215246636772, "grad_norm": 0.07197814435876061, "learning_rate": 0.00012899725442189457, "loss": 0.2792, "step": 2493 }, { "epoch": 1.397982062780269, "grad_norm": 0.07242621985982685, "learning_rate": 0.00012893481856378317, "loss": 0.2821, "step": 2494 }, { "epoch": 1.398542600896861, "grad_norm": 0.07114519859944791, "learning_rate": 0.00012887237039313514, "loss": 0.2684, "step": 2495 }, { "epoch": 1.399103139013453, "grad_norm": 0.07012615325623704, "learning_rate": 0.00012880990993652377, "loss": 0.2417, "step": 2496 }, { "epoch": 1.3996636771300448, "grad_norm": 0.07294149638436452, "learning_rate": 0.00012874743722052768, "loss": 0.2749, "step": 2497 }, { "epoch": 1.4002242152466366, "grad_norm": 0.07591298177509882, "learning_rate": 0.00012868495227173068, "loss": 0.2781, "step": 2498 }, { "epoch": 1.4007847533632287, "grad_norm": 0.07304669174863514, "learning_rate": 0.0001286224551167218, "loss": 0.288, "step": 2499 }, { "epoch": 1.4013452914798206, "grad_norm": 0.07383601777071674, "learning_rate": 0.00012855994578209526, "loss": 0.2607, "step": 2500 }, { "epoch": 1.4019058295964126, "grad_norm": 0.07508444870924733, "learning_rate": 0.00012849742429445034, "loss": 0.2819, "step": 2501 }, { "epoch": 1.4024663677130045, "grad_norm": 0.07250717828673553, "learning_rate": 0.0001284348906803917, "loss": 0.2658, "step": 2502 }, { "epoch": 1.4030269058295963, "grad_norm": 0.07236648227488171, "learning_rate": 0.000128372344966529, "loss": 0.2692, "step": 2503 }, { "epoch": 1.4035874439461884, "grad_norm": 0.06937327028257002, "learning_rate": 0.00012830978717947718, "loss": 0.2588, "step": 2504 }, { "epoch": 1.4041479820627802, "grad_norm": 0.0691096829483645, "learning_rate": 0.00012824721734585622, "loss": 0.277, "step": 2505 }, { "epoch": 1.4047085201793723, "grad_norm": 0.07041324357200651, "learning_rate": 0.00012818463549229121, "loss": 0.2714, "step": 2506 }, { "epoch": 1.4052690582959642, "grad_norm": 0.07115900727837653, "learning_rate": 0.00012812204164541245, "loss": 0.2712, "step": 2507 }, { "epoch": 1.405829596412556, "grad_norm": 0.07322315484925514, "learning_rate": 0.00012805943583185525, "loss": 0.2706, "step": 2508 }, { "epoch": 1.4063901345291479, "grad_norm": 0.0702243899051264, "learning_rate": 0.00012799681807826004, "loss": 0.2638, "step": 2509 }, { "epoch": 1.40695067264574, "grad_norm": 0.07119816545279681, "learning_rate": 0.0001279341884112724, "loss": 0.2767, "step": 2510 }, { "epoch": 1.4075112107623318, "grad_norm": 0.07027483611000442, "learning_rate": 0.0001278715468575429, "loss": 0.2634, "step": 2511 }, { "epoch": 1.4080717488789238, "grad_norm": 0.07416164006334076, "learning_rate": 0.00012780889344372718, "loss": 0.2767, "step": 2512 }, { "epoch": 1.4086322869955157, "grad_norm": 0.07371935302881523, "learning_rate": 0.00012774622819648597, "loss": 0.2747, "step": 2513 }, { "epoch": 1.4091928251121075, "grad_norm": 0.07491047306138772, "learning_rate": 0.00012768355114248494, "loss": 0.278, "step": 2514 }, { "epoch": 1.4097533632286996, "grad_norm": 0.07330923773402105, "learning_rate": 0.0001276208623083949, "loss": 0.2779, "step": 2515 }, { "epoch": 1.4103139013452914, "grad_norm": 0.07084274284445143, "learning_rate": 0.00012755816172089164, "loss": 0.2668, "step": 2516 }, { "epoch": 1.4108744394618835, "grad_norm": 0.07244971643006003, "learning_rate": 0.00012749544940665586, "loss": 0.2762, "step": 2517 }, { "epoch": 1.4114349775784754, "grad_norm": 0.07143764784972916, "learning_rate": 0.00012743272539237333, "loss": 0.2678, "step": 2518 }, { "epoch": 1.4119955156950672, "grad_norm": 0.07111406798867922, "learning_rate": 0.00012736998970473487, "loss": 0.2773, "step": 2519 }, { "epoch": 1.4125560538116593, "grad_norm": 0.07067982889597485, "learning_rate": 0.00012730724237043615, "loss": 0.2558, "step": 2520 }, { "epoch": 1.4131165919282511, "grad_norm": 0.07053306522161905, "learning_rate": 0.00012724448341617776, "loss": 0.2609, "step": 2521 }, { "epoch": 1.413677130044843, "grad_norm": 0.07151634488386535, "learning_rate": 0.00012718171286866538, "loss": 0.2818, "step": 2522 }, { "epoch": 1.414237668161435, "grad_norm": 0.06953363186812431, "learning_rate": 0.00012711893075460957, "loss": 0.2667, "step": 2523 }, { "epoch": 1.4147982062780269, "grad_norm": 0.07024275109725835, "learning_rate": 0.00012705613710072575, "loss": 0.2721, "step": 2524 }, { "epoch": 1.4153587443946187, "grad_norm": 0.06947733509275524, "learning_rate": 0.0001269933319337343, "loss": 0.2525, "step": 2525 }, { "epoch": 1.4159192825112108, "grad_norm": 0.07274290192513362, "learning_rate": 0.00012693051528036051, "loss": 0.2674, "step": 2526 }, { "epoch": 1.4164798206278026, "grad_norm": 0.07325339688881154, "learning_rate": 0.00012686768716733453, "loss": 0.2783, "step": 2527 }, { "epoch": 1.4170403587443947, "grad_norm": 0.06868100889040213, "learning_rate": 0.0001268048476213914, "loss": 0.2537, "step": 2528 }, { "epoch": 1.4176008968609866, "grad_norm": 0.06976187011388851, "learning_rate": 0.000126741996669271, "loss": 0.2703, "step": 2529 }, { "epoch": 1.4181614349775784, "grad_norm": 0.07257196857537393, "learning_rate": 0.0001266791343377181, "loss": 0.2663, "step": 2530 }, { "epoch": 1.4187219730941705, "grad_norm": 0.07363162939396198, "learning_rate": 0.0001266162606534823, "loss": 0.2704, "step": 2531 }, { "epoch": 1.4192825112107623, "grad_norm": 0.07297789157037428, "learning_rate": 0.00012655337564331805, "loss": 0.2682, "step": 2532 }, { "epoch": 1.4198430493273544, "grad_norm": 0.07347585669911441, "learning_rate": 0.0001264904793339846, "loss": 0.2738, "step": 2533 }, { "epoch": 1.4204035874439462, "grad_norm": 0.07234283051398997, "learning_rate": 0.00012642757175224595, "loss": 0.2645, "step": 2534 }, { "epoch": 1.420964125560538, "grad_norm": 0.07127456429361785, "learning_rate": 0.000126364652924871, "loss": 0.2506, "step": 2535 }, { "epoch": 1.42152466367713, "grad_norm": 0.07176298472511297, "learning_rate": 0.0001263017228786334, "loss": 0.2691, "step": 2536 }, { "epoch": 1.422085201793722, "grad_norm": 0.07257759103677881, "learning_rate": 0.0001262387816403115, "loss": 0.2707, "step": 2537 }, { "epoch": 1.4226457399103138, "grad_norm": 0.07112861165554513, "learning_rate": 0.00012617582923668853, "loss": 0.2711, "step": 2538 }, { "epoch": 1.423206278026906, "grad_norm": 0.07450628624044056, "learning_rate": 0.0001261128656945524, "loss": 0.2669, "step": 2539 }, { "epoch": 1.4237668161434978, "grad_norm": 0.07405807598729093, "learning_rate": 0.0001260498910406958, "loss": 0.2754, "step": 2540 }, { "epoch": 1.4243273542600896, "grad_norm": 0.07248947588583825, "learning_rate": 0.00012598690530191608, "loss": 0.2776, "step": 2541 }, { "epoch": 1.4248878923766817, "grad_norm": 0.07310931331855644, "learning_rate": 0.00012592390850501537, "loss": 0.2632, "step": 2542 }, { "epoch": 1.4254484304932735, "grad_norm": 0.07061925291194063, "learning_rate": 0.00012586090067680047, "loss": 0.2632, "step": 2543 }, { "epoch": 1.4260089686098656, "grad_norm": 0.07131670537493585, "learning_rate": 0.00012579788184408295, "loss": 0.2809, "step": 2544 }, { "epoch": 1.4265695067264574, "grad_norm": 0.07106765132212761, "learning_rate": 0.00012573485203367895, "loss": 0.2518, "step": 2545 }, { "epoch": 1.4271300448430493, "grad_norm": 0.07117179198796343, "learning_rate": 0.00012567181127240933, "loss": 0.2665, "step": 2546 }, { "epoch": 1.4276905829596411, "grad_norm": 0.07266380140678072, "learning_rate": 0.00012560875958709963, "loss": 0.2749, "step": 2547 }, { "epoch": 1.4282511210762332, "grad_norm": 0.0722153633269687, "learning_rate": 0.00012554569700458002, "loss": 0.2721, "step": 2548 }, { "epoch": 1.428811659192825, "grad_norm": 0.06952925783993638, "learning_rate": 0.00012548262355168533, "loss": 0.2715, "step": 2549 }, { "epoch": 1.4293721973094171, "grad_norm": 0.07215173118012304, "learning_rate": 0.000125419539255255, "loss": 0.2763, "step": 2550 }, { "epoch": 1.429932735426009, "grad_norm": 0.07150045454122866, "learning_rate": 0.000125356444142133, "loss": 0.2676, "step": 2551 }, { "epoch": 1.4304932735426008, "grad_norm": 0.07037204873905802, "learning_rate": 0.00012529333823916807, "loss": 0.2846, "step": 2552 }, { "epoch": 1.4310538116591929, "grad_norm": 0.06942295359738748, "learning_rate": 0.00012523022157321346, "loss": 0.2693, "step": 2553 }, { "epoch": 1.4316143497757847, "grad_norm": 0.07052826277122247, "learning_rate": 0.00012516709417112693, "loss": 0.2789, "step": 2554 }, { "epoch": 1.4321748878923768, "grad_norm": 0.06766539896480486, "learning_rate": 0.00012510395605977087, "loss": 0.2757, "step": 2555 }, { "epoch": 1.4327354260089686, "grad_norm": 0.07142433812281358, "learning_rate": 0.00012504080726601232, "loss": 0.2619, "step": 2556 }, { "epoch": 1.4332959641255605, "grad_norm": 0.07044191277742376, "learning_rate": 0.0001249776478167227, "loss": 0.2825, "step": 2557 }, { "epoch": 1.4338565022421523, "grad_norm": 0.0701961555356744, "learning_rate": 0.00012491447773877804, "loss": 0.2756, "step": 2558 }, { "epoch": 1.4344170403587444, "grad_norm": 0.07089835656914216, "learning_rate": 0.00012485129705905893, "loss": 0.2662, "step": 2559 }, { "epoch": 1.4349775784753362, "grad_norm": 0.07252791283428492, "learning_rate": 0.0001247881058044504, "loss": 0.271, "step": 2560 }, { "epoch": 1.4355381165919283, "grad_norm": 0.07217047354376249, "learning_rate": 0.00012472490400184205, "loss": 0.2622, "step": 2561 }, { "epoch": 1.4360986547085202, "grad_norm": 0.07126349475146648, "learning_rate": 0.0001246616916781279, "loss": 0.2703, "step": 2562 }, { "epoch": 1.436659192825112, "grad_norm": 0.07133291996222735, "learning_rate": 0.00012459846886020643, "loss": 0.2612, "step": 2563 }, { "epoch": 1.437219730941704, "grad_norm": 0.06766315324492028, "learning_rate": 0.00012453523557498075, "loss": 0.2591, "step": 2564 }, { "epoch": 1.437780269058296, "grad_norm": 0.07217502057141792, "learning_rate": 0.00012447199184935823, "loss": 0.2815, "step": 2565 }, { "epoch": 1.438340807174888, "grad_norm": 0.07416700940497611, "learning_rate": 0.0001244087377102508, "loss": 0.2598, "step": 2566 }, { "epoch": 1.4389013452914798, "grad_norm": 0.0701666560713026, "learning_rate": 0.00012434547318457474, "loss": 0.2718, "step": 2567 }, { "epoch": 1.4394618834080717, "grad_norm": 0.07068547220314525, "learning_rate": 0.00012428219829925083, "loss": 0.2653, "step": 2568 }, { "epoch": 1.4400224215246638, "grad_norm": 0.07092853131496765, "learning_rate": 0.0001242189130812042, "loss": 0.2641, "step": 2569 }, { "epoch": 1.4405829596412556, "grad_norm": 0.07325103216390964, "learning_rate": 0.0001241556175573644, "loss": 0.2786, "step": 2570 }, { "epoch": 1.4411434977578477, "grad_norm": 0.07007544062017068, "learning_rate": 0.00012409231175466537, "loss": 0.2706, "step": 2571 }, { "epoch": 1.4417040358744395, "grad_norm": 0.06931592432697589, "learning_rate": 0.00012402899570004543, "loss": 0.2616, "step": 2572 }, { "epoch": 1.4422645739910314, "grad_norm": 0.06984288584717985, "learning_rate": 0.00012396566942044724, "loss": 0.2694, "step": 2573 }, { "epoch": 1.4428251121076232, "grad_norm": 0.07034742549246696, "learning_rate": 0.0001239023329428178, "loss": 0.2583, "step": 2574 }, { "epoch": 1.4433856502242153, "grad_norm": 0.07156428602049808, "learning_rate": 0.00012383898629410843, "loss": 0.2718, "step": 2575 }, { "epoch": 1.4439461883408071, "grad_norm": 0.07435589986711433, "learning_rate": 0.00012377562950127493, "loss": 0.2879, "step": 2576 }, { "epoch": 1.4445067264573992, "grad_norm": 0.07248407339009183, "learning_rate": 0.00012371226259127725, "loss": 0.268, "step": 2577 }, { "epoch": 1.445067264573991, "grad_norm": 0.07149452390292611, "learning_rate": 0.00012364888559107966, "loss": 0.2625, "step": 2578 }, { "epoch": 1.4456278026905829, "grad_norm": 0.07385282012847322, "learning_rate": 0.00012358549852765083, "loss": 0.2687, "step": 2579 }, { "epoch": 1.446188340807175, "grad_norm": 0.07214885170023577, "learning_rate": 0.0001235221014279636, "loss": 0.2601, "step": 2580 }, { "epoch": 1.4467488789237668, "grad_norm": 0.07356871078510978, "learning_rate": 0.0001234586943189951, "loss": 0.263, "step": 2581 }, { "epoch": 1.4473094170403589, "grad_norm": 0.07215878633263344, "learning_rate": 0.00012339527722772683, "loss": 0.2722, "step": 2582 }, { "epoch": 1.4478699551569507, "grad_norm": 0.0714950757352865, "learning_rate": 0.00012333185018114439, "loss": 0.2782, "step": 2583 }, { "epoch": 1.4484304932735426, "grad_norm": 0.06893091139036768, "learning_rate": 0.00012326841320623767, "loss": 0.2582, "step": 2584 }, { "epoch": 1.4489910313901344, "grad_norm": 0.07247272132126951, "learning_rate": 0.00012320496633000088, "loss": 0.2705, "step": 2585 }, { "epoch": 1.4495515695067265, "grad_norm": 0.0742463707609409, "learning_rate": 0.00012314150957943226, "loss": 0.257, "step": 2586 }, { "epoch": 1.4501121076233183, "grad_norm": 0.07289183935471263, "learning_rate": 0.0001230780429815344, "loss": 0.2785, "step": 2587 }, { "epoch": 1.4506726457399104, "grad_norm": 0.07100913828665414, "learning_rate": 0.00012301456656331402, "loss": 0.2545, "step": 2588 }, { "epoch": 1.4512331838565022, "grad_norm": 0.07359079977697225, "learning_rate": 0.000122951080351782, "loss": 0.2752, "step": 2589 }, { "epoch": 1.451793721973094, "grad_norm": 0.07124953126346799, "learning_rate": 0.00012288758437395343, "loss": 0.262, "step": 2590 }, { "epoch": 1.4523542600896862, "grad_norm": 0.07301714224898355, "learning_rate": 0.00012282407865684758, "loss": 0.2786, "step": 2591 }, { "epoch": 1.452914798206278, "grad_norm": 0.0728659319865238, "learning_rate": 0.00012276056322748778, "loss": 0.254, "step": 2592 }, { "epoch": 1.45347533632287, "grad_norm": 0.07110814045711723, "learning_rate": 0.0001226970381129016, "loss": 0.2677, "step": 2593 }, { "epoch": 1.454035874439462, "grad_norm": 0.07231421794163116, "learning_rate": 0.0001226335033401206, "loss": 0.2743, "step": 2594 }, { "epoch": 1.4545964125560538, "grad_norm": 0.07283555116735829, "learning_rate": 0.00012256995893618054, "loss": 0.2642, "step": 2595 }, { "epoch": 1.4551569506726456, "grad_norm": 0.073614984693919, "learning_rate": 0.0001225064049281212, "loss": 0.2709, "step": 2596 }, { "epoch": 1.4557174887892377, "grad_norm": 0.07712966688124136, "learning_rate": 0.00012244284134298666, "loss": 0.2737, "step": 2597 }, { "epoch": 1.4562780269058295, "grad_norm": 0.07027837764258521, "learning_rate": 0.00012237926820782478, "loss": 0.2582, "step": 2598 }, { "epoch": 1.4568385650224216, "grad_norm": 0.07190987399585402, "learning_rate": 0.00012231568554968767, "loss": 0.2709, "step": 2599 }, { "epoch": 1.4573991031390134, "grad_norm": 0.07164571881496312, "learning_rate": 0.00012225209339563145, "loss": 0.2721, "step": 2600 }, { "epoch": 1.4579596412556053, "grad_norm": 0.07157687722194844, "learning_rate": 0.00012218849177271626, "loss": 0.2641, "step": 2601 }, { "epoch": 1.4585201793721974, "grad_norm": 0.07083119161099244, "learning_rate": 0.00012212488070800635, "loss": 0.2759, "step": 2602 }, { "epoch": 1.4590807174887892, "grad_norm": 0.07050244131689339, "learning_rate": 0.00012206126022856984, "loss": 0.2668, "step": 2603 }, { "epoch": 1.4596412556053813, "grad_norm": 0.07271707727122122, "learning_rate": 0.00012199763036147895, "loss": 0.2731, "step": 2604 }, { "epoch": 1.4602017937219731, "grad_norm": 0.06951004638988882, "learning_rate": 0.00012193399113380994, "loss": 0.2617, "step": 2605 }, { "epoch": 1.460762331838565, "grad_norm": 0.0723026309843755, "learning_rate": 0.00012187034257264297, "loss": 0.2689, "step": 2606 }, { "epoch": 1.461322869955157, "grad_norm": 0.07046208655478085, "learning_rate": 0.0001218066847050622, "loss": 0.257, "step": 2607 }, { "epoch": 1.4618834080717489, "grad_norm": 0.07079115538340826, "learning_rate": 0.00012174301755815571, "loss": 0.2782, "step": 2608 }, { "epoch": 1.4624439461883407, "grad_norm": 0.07405499837804955, "learning_rate": 0.00012167934115901563, "loss": 0.2658, "step": 2609 }, { "epoch": 1.4630044843049328, "grad_norm": 0.0724381236796581, "learning_rate": 0.00012161565553473792, "loss": 0.2753, "step": 2610 }, { "epoch": 1.4635650224215246, "grad_norm": 0.07152512654173403, "learning_rate": 0.00012155196071242254, "loss": 0.2737, "step": 2611 }, { "epoch": 1.4641255605381165, "grad_norm": 0.07565898894319906, "learning_rate": 0.00012148825671917334, "loss": 0.2762, "step": 2612 }, { "epoch": 1.4646860986547086, "grad_norm": 0.07006862453203005, "learning_rate": 0.00012142454358209803, "loss": 0.2585, "step": 2613 }, { "epoch": 1.4652466367713004, "grad_norm": 0.06943282943633416, "learning_rate": 0.00012136082132830828, "loss": 0.2619, "step": 2614 }, { "epoch": 1.4658071748878925, "grad_norm": 0.07151809193225557, "learning_rate": 0.0001212970899849196, "loss": 0.2721, "step": 2615 }, { "epoch": 1.4663677130044843, "grad_norm": 0.06957586106604624, "learning_rate": 0.0001212333495790514, "loss": 0.2617, "step": 2616 }, { "epoch": 1.4669282511210762, "grad_norm": 0.0682323905413523, "learning_rate": 0.00012116960013782684, "loss": 0.2675, "step": 2617 }, { "epoch": 1.4674887892376682, "grad_norm": 0.07157429220500997, "learning_rate": 0.00012110584168837309, "loss": 0.274, "step": 2618 }, { "epoch": 1.46804932735426, "grad_norm": 0.07082774922467017, "learning_rate": 0.00012104207425782104, "loss": 0.2691, "step": 2619 }, { "epoch": 1.4686098654708521, "grad_norm": 0.07172556028530615, "learning_rate": 0.00012097829787330544, "loss": 0.2782, "step": 2620 }, { "epoch": 1.469170403587444, "grad_norm": 0.07282110633980718, "learning_rate": 0.00012091451256196484, "loss": 0.2737, "step": 2621 }, { "epoch": 1.4697309417040358, "grad_norm": 0.07215127145326976, "learning_rate": 0.0001208507183509416, "loss": 0.2644, "step": 2622 }, { "epoch": 1.4702914798206277, "grad_norm": 0.07070637957573725, "learning_rate": 0.00012078691526738181, "loss": 0.2675, "step": 2623 }, { "epoch": 1.4708520179372198, "grad_norm": 0.07102377989515693, "learning_rate": 0.00012072310333843544, "loss": 0.2641, "step": 2624 }, { "epoch": 1.4714125560538116, "grad_norm": 0.07221309044790118, "learning_rate": 0.00012065928259125611, "loss": 0.2614, "step": 2625 }, { "epoch": 1.4719730941704037, "grad_norm": 0.07180070926361873, "learning_rate": 0.0001205954530530013, "loss": 0.2736, "step": 2626 }, { "epoch": 1.4725336322869955, "grad_norm": 0.06971237885977041, "learning_rate": 0.0001205316147508322, "loss": 0.2717, "step": 2627 }, { "epoch": 1.4730941704035874, "grad_norm": 0.07367921734383244, "learning_rate": 0.00012046776771191366, "loss": 0.2777, "step": 2628 }, { "epoch": 1.4736547085201794, "grad_norm": 0.07097022201909618, "learning_rate": 0.00012040391196341427, "loss": 0.2712, "step": 2629 }, { "epoch": 1.4742152466367713, "grad_norm": 0.06842246960972916, "learning_rate": 0.00012034004753250643, "loss": 0.2625, "step": 2630 }, { "epoch": 1.4747757847533634, "grad_norm": 0.07260291316386094, "learning_rate": 0.00012027617444636612, "loss": 0.2567, "step": 2631 }, { "epoch": 1.4753363228699552, "grad_norm": 0.06995572021067038, "learning_rate": 0.00012021229273217302, "loss": 0.2641, "step": 2632 }, { "epoch": 1.475896860986547, "grad_norm": 0.07559509626086572, "learning_rate": 0.00012014840241711054, "loss": 0.2786, "step": 2633 }, { "epoch": 1.476457399103139, "grad_norm": 0.07400623872966176, "learning_rate": 0.00012008450352836572, "loss": 0.2664, "step": 2634 }, { "epoch": 1.477017937219731, "grad_norm": 0.06892022781762777, "learning_rate": 0.00012002059609312917, "loss": 0.2598, "step": 2635 }, { "epoch": 1.4775784753363228, "grad_norm": 0.07318586459323616, "learning_rate": 0.00011995668013859529, "loss": 0.2739, "step": 2636 }, { "epoch": 1.4781390134529149, "grad_norm": 0.0724655659139408, "learning_rate": 0.00011989275569196194, "loss": 0.2732, "step": 2637 }, { "epoch": 1.4786995515695067, "grad_norm": 0.0708265950975277, "learning_rate": 0.00011982882278043077, "loss": 0.263, "step": 2638 }, { "epoch": 1.4792600896860986, "grad_norm": 0.0715074657441884, "learning_rate": 0.00011976488143120687, "loss": 0.2741, "step": 2639 }, { "epoch": 1.4798206278026906, "grad_norm": 0.06812472045590152, "learning_rate": 0.00011970093167149905, "loss": 0.2696, "step": 2640 }, { "epoch": 1.4803811659192825, "grad_norm": 0.06926495675281664, "learning_rate": 0.00011963697352851955, "loss": 0.2715, "step": 2641 }, { "epoch": 1.4809417040358746, "grad_norm": 0.06789046197779881, "learning_rate": 0.00011957300702948435, "loss": 0.2641, "step": 2642 }, { "epoch": 1.4815022421524664, "grad_norm": 0.07183132691344266, "learning_rate": 0.00011950903220161285, "loss": 0.275, "step": 2643 }, { "epoch": 1.4820627802690582, "grad_norm": 0.07124962976203886, "learning_rate": 0.00011944504907212804, "loss": 0.2667, "step": 2644 }, { "epoch": 1.48262331838565, "grad_norm": 0.07381585212265701, "learning_rate": 0.0001193810576682565, "loss": 0.2736, "step": 2645 }, { "epoch": 1.4831838565022422, "grad_norm": 0.07207926726683282, "learning_rate": 0.00011931705801722818, "loss": 0.2667, "step": 2646 }, { "epoch": 1.483744394618834, "grad_norm": 0.07106474229779323, "learning_rate": 0.00011925305014627678, "loss": 0.2588, "step": 2647 }, { "epoch": 1.484304932735426, "grad_norm": 0.07148674970861063, "learning_rate": 0.00011918903408263924, "loss": 0.2798, "step": 2648 }, { "epoch": 1.484865470852018, "grad_norm": 0.06948382996852732, "learning_rate": 0.00011912500985355614, "loss": 0.2712, "step": 2649 }, { "epoch": 1.4854260089686098, "grad_norm": 0.07199061909052544, "learning_rate": 0.00011906097748627149, "loss": 0.278, "step": 2650 }, { "epoch": 1.4859865470852018, "grad_norm": 0.07159046390315446, "learning_rate": 0.00011899693700803278, "loss": 0.2717, "step": 2651 }, { "epoch": 1.4865470852017937, "grad_norm": 0.07056011106519934, "learning_rate": 0.00011893288844609094, "loss": 0.274, "step": 2652 }, { "epoch": 1.4871076233183858, "grad_norm": 0.07348995833777743, "learning_rate": 0.00011886883182770035, "loss": 0.2579, "step": 2653 }, { "epoch": 1.4876681614349776, "grad_norm": 0.07263591105598495, "learning_rate": 0.00011880476718011877, "loss": 0.2615, "step": 2654 }, { "epoch": 1.4882286995515694, "grad_norm": 0.07070466411503919, "learning_rate": 0.00011874069453060746, "loss": 0.2527, "step": 2655 }, { "epoch": 1.4887892376681615, "grad_norm": 0.07060666180313581, "learning_rate": 0.000118676613906431, "loss": 0.2723, "step": 2656 }, { "epoch": 1.4893497757847534, "grad_norm": 0.07293653223106318, "learning_rate": 0.00011861252533485742, "loss": 0.2621, "step": 2657 }, { "epoch": 1.4899103139013454, "grad_norm": 0.07305538068005077, "learning_rate": 0.00011854842884315813, "loss": 0.2736, "step": 2658 }, { "epoch": 1.4904708520179373, "grad_norm": 0.07138618891331834, "learning_rate": 0.00011848432445860789, "loss": 0.2794, "step": 2659 }, { "epoch": 1.4910313901345291, "grad_norm": 0.07073266031117438, "learning_rate": 0.00011842021220848486, "loss": 0.2644, "step": 2660 }, { "epoch": 1.491591928251121, "grad_norm": 0.0765358318870987, "learning_rate": 0.00011835609212007042, "loss": 0.2958, "step": 2661 }, { "epoch": 1.492152466367713, "grad_norm": 0.07162018165171527, "learning_rate": 0.00011829196422064943, "loss": 0.2662, "step": 2662 }, { "epoch": 1.4927130044843049, "grad_norm": 0.07290240396375056, "learning_rate": 0.00011822782853751002, "loss": 0.2688, "step": 2663 }, { "epoch": 1.493273542600897, "grad_norm": 0.07229718840329626, "learning_rate": 0.00011816368509794364, "loss": 0.2691, "step": 2664 }, { "epoch": 1.4938340807174888, "grad_norm": 0.07353094536852173, "learning_rate": 0.00011809953392924504, "loss": 0.2605, "step": 2665 }, { "epoch": 1.4943946188340806, "grad_norm": 0.07089474069888169, "learning_rate": 0.00011803537505871225, "loss": 0.2573, "step": 2666 }, { "epoch": 1.4949551569506727, "grad_norm": 0.07282249799098499, "learning_rate": 0.00011797120851364653, "loss": 0.2755, "step": 2667 }, { "epoch": 1.4955156950672646, "grad_norm": 0.07238850475224404, "learning_rate": 0.00011790703432135253, "loss": 0.273, "step": 2668 }, { "epoch": 1.4960762331838566, "grad_norm": 0.07410776643227834, "learning_rate": 0.00011784285250913802, "loss": 0.274, "step": 2669 }, { "epoch": 1.4966367713004485, "grad_norm": 0.07013595725707807, "learning_rate": 0.00011777866310431409, "loss": 0.2643, "step": 2670 }, { "epoch": 1.4971973094170403, "grad_norm": 0.07312148011415108, "learning_rate": 0.00011771446613419508, "loss": 0.2622, "step": 2671 }, { "epoch": 1.4977578475336322, "grad_norm": 0.0715652009409089, "learning_rate": 0.00011765026162609847, "loss": 0.2753, "step": 2672 }, { "epoch": 1.4983183856502242, "grad_norm": 0.06865430293035583, "learning_rate": 0.00011758604960734499, "loss": 0.2624, "step": 2673 }, { "epoch": 1.498878923766816, "grad_norm": 0.071011427325384, "learning_rate": 0.0001175218301052586, "loss": 0.2753, "step": 2674 }, { "epoch": 1.4994394618834082, "grad_norm": 0.07501914248030692, "learning_rate": 0.00011745760314716636, "loss": 0.2846, "step": 2675 }, { "epoch": 1.5, "grad_norm": 0.07315247184854988, "learning_rate": 0.00011739336876039859, "loss": 0.2536, "step": 2676 }, { "epoch": 1.5005605381165918, "grad_norm": 0.07157590122387843, "learning_rate": 0.00011732912697228872, "loss": 0.2767, "step": 2677 }, { "epoch": 1.5011210762331837, "grad_norm": 0.07109590528600179, "learning_rate": 0.00011726487781017337, "loss": 0.2694, "step": 2678 }, { "epoch": 1.5016816143497758, "grad_norm": 0.07068316674858972, "learning_rate": 0.0001172006213013922, "loss": 0.2658, "step": 2679 }, { "epoch": 1.5022421524663678, "grad_norm": 0.07250084006014036, "learning_rate": 0.00011713635747328818, "loss": 0.2706, "step": 2680 }, { "epoch": 1.5028026905829597, "grad_norm": 0.07366391637259316, "learning_rate": 0.00011707208635320718, "loss": 0.2674, "step": 2681 }, { "epoch": 1.5033632286995515, "grad_norm": 0.0699000966109222, "learning_rate": 0.00011700780796849833, "loss": 0.2713, "step": 2682 }, { "epoch": 1.5039237668161434, "grad_norm": 0.07257458686854877, "learning_rate": 0.00011694352234651373, "loss": 0.2849, "step": 2683 }, { "epoch": 1.5044843049327354, "grad_norm": 0.07190670499483498, "learning_rate": 0.00011687922951460872, "loss": 0.2657, "step": 2684 }, { "epoch": 1.5050448430493275, "grad_norm": 0.07117959792590416, "learning_rate": 0.00011681492950014157, "loss": 0.266, "step": 2685 }, { "epoch": 1.5056053811659194, "grad_norm": 0.07294991022142684, "learning_rate": 0.00011675062233047364, "loss": 0.2763, "step": 2686 }, { "epoch": 1.5061659192825112, "grad_norm": 0.07142237945995769, "learning_rate": 0.00011668630803296939, "loss": 0.2802, "step": 2687 }, { "epoch": 1.506726457399103, "grad_norm": 0.0735994024551922, "learning_rate": 0.00011662198663499619, "loss": 0.2659, "step": 2688 }, { "epoch": 1.5072869955156951, "grad_norm": 0.07203572611511369, "learning_rate": 0.00011655765816392457, "loss": 0.2687, "step": 2689 }, { "epoch": 1.507847533632287, "grad_norm": 0.071135139809204, "learning_rate": 0.00011649332264712798, "loss": 0.2687, "step": 2690 }, { "epoch": 1.508408071748879, "grad_norm": 0.07379585959916812, "learning_rate": 0.00011642898011198288, "loss": 0.2756, "step": 2691 }, { "epoch": 1.5089686098654709, "grad_norm": 0.07231405879270913, "learning_rate": 0.00011636463058586881, "loss": 0.2556, "step": 2692 }, { "epoch": 1.5095291479820627, "grad_norm": 0.07157443774998543, "learning_rate": 0.00011630027409616817, "loss": 0.2653, "step": 2693 }, { "epoch": 1.5100896860986546, "grad_norm": 0.06944155962361898, "learning_rate": 0.00011623591067026636, "loss": 0.2728, "step": 2694 }, { "epoch": 1.5106502242152466, "grad_norm": 0.07350506179889904, "learning_rate": 0.00011617154033555169, "loss": 0.2753, "step": 2695 }, { "epoch": 1.5112107623318387, "grad_norm": 0.06980443587081692, "learning_rate": 0.0001161071631194155, "loss": 0.2537, "step": 2696 }, { "epoch": 1.5117713004484306, "grad_norm": 0.07382751069626999, "learning_rate": 0.000116042779049252, "loss": 0.289, "step": 2697 }, { "epoch": 1.5123318385650224, "grad_norm": 0.06844084556266467, "learning_rate": 0.00011597838815245836, "loss": 0.2514, "step": 2698 }, { "epoch": 1.5128923766816142, "grad_norm": 0.07038537623927177, "learning_rate": 0.00011591399045643455, "loss": 0.2789, "step": 2699 }, { "epoch": 1.5134529147982063, "grad_norm": 0.07115950011898217, "learning_rate": 0.00011584958598858359, "loss": 0.2532, "step": 2700 }, { "epoch": 1.5140134529147982, "grad_norm": 0.07091988456732695, "learning_rate": 0.00011578517477631125, "loss": 0.2649, "step": 2701 }, { "epoch": 1.5145739910313902, "grad_norm": 0.07208183094581253, "learning_rate": 0.00011572075684702624, "loss": 0.2725, "step": 2702 }, { "epoch": 1.515134529147982, "grad_norm": 0.07021910344644247, "learning_rate": 0.00011565633222814005, "loss": 0.2623, "step": 2703 }, { "epoch": 1.515695067264574, "grad_norm": 0.07114186929092427, "learning_rate": 0.00011559190094706714, "loss": 0.2557, "step": 2704 }, { "epoch": 1.5162556053811658, "grad_norm": 0.07215828197766797, "learning_rate": 0.0001155274630312247, "loss": 0.2781, "step": 2705 }, { "epoch": 1.5168161434977578, "grad_norm": 0.07282199701609747, "learning_rate": 0.00011546301850803282, "loss": 0.2737, "step": 2706 }, { "epoch": 1.51737668161435, "grad_norm": 0.07083573120703515, "learning_rate": 0.00011539856740491432, "loss": 0.2695, "step": 2707 }, { "epoch": 1.5179372197309418, "grad_norm": 0.07259718479360823, "learning_rate": 0.0001153341097492949, "loss": 0.2805, "step": 2708 }, { "epoch": 1.5184977578475336, "grad_norm": 0.07061212408384449, "learning_rate": 0.00011526964556860298, "loss": 0.2661, "step": 2709 }, { "epoch": 1.5190582959641254, "grad_norm": 0.06926870430173317, "learning_rate": 0.0001152051748902698, "loss": 0.2656, "step": 2710 }, { "epoch": 1.5196188340807175, "grad_norm": 0.07171120149804051, "learning_rate": 0.00011514069774172936, "loss": 0.2747, "step": 2711 }, { "epoch": 1.5201793721973094, "grad_norm": 0.07436517776992174, "learning_rate": 0.00011507621415041837, "loss": 0.2773, "step": 2712 }, { "epoch": 1.5207399103139014, "grad_norm": 0.0688617758581786, "learning_rate": 0.00011501172414377634, "loss": 0.2563, "step": 2713 }, { "epoch": 1.5213004484304933, "grad_norm": 0.07049212794757495, "learning_rate": 0.00011494722774924554, "loss": 0.2645, "step": 2714 }, { "epoch": 1.5218609865470851, "grad_norm": 0.07169040941860719, "learning_rate": 0.0001148827249942708, "loss": 0.2821, "step": 2715 }, { "epoch": 1.522421524663677, "grad_norm": 0.07251652451590532, "learning_rate": 0.00011481821590629985, "loss": 0.2593, "step": 2716 }, { "epoch": 1.522982062780269, "grad_norm": 0.07270256957010714, "learning_rate": 0.00011475370051278298, "loss": 0.2657, "step": 2717 }, { "epoch": 1.523542600896861, "grad_norm": 0.07386026335593725, "learning_rate": 0.00011468917884117323, "loss": 0.2707, "step": 2718 }, { "epoch": 1.524103139013453, "grad_norm": 0.07115321344068759, "learning_rate": 0.0001146246509189263, "loss": 0.2782, "step": 2719 }, { "epoch": 1.5246636771300448, "grad_norm": 0.07029243312234992, "learning_rate": 0.00011456011677350051, "loss": 0.2701, "step": 2720 }, { "epoch": 1.5252242152466366, "grad_norm": 0.07110567655941383, "learning_rate": 0.00011449557643235686, "loss": 0.2646, "step": 2721 }, { "epoch": 1.5257847533632287, "grad_norm": 0.07330966340004928, "learning_rate": 0.00011443102992295904, "loss": 0.2872, "step": 2722 }, { "epoch": 1.5263452914798208, "grad_norm": 0.07141975317438559, "learning_rate": 0.00011436647727277326, "loss": 0.2665, "step": 2723 }, { "epoch": 1.5269058295964126, "grad_norm": 0.07109655928452469, "learning_rate": 0.00011430191850926837, "loss": 0.2806, "step": 2724 }, { "epoch": 1.5274663677130045, "grad_norm": 0.07087037740185703, "learning_rate": 0.0001142373536599159, "loss": 0.2751, "step": 2725 }, { "epoch": 1.5280269058295963, "grad_norm": 0.07042245583234814, "learning_rate": 0.0001141727827521899, "loss": 0.2726, "step": 2726 }, { "epoch": 1.5285874439461884, "grad_norm": 0.06983269779012252, "learning_rate": 0.00011410820581356705, "loss": 0.2716, "step": 2727 }, { "epoch": 1.5291479820627802, "grad_norm": 0.07007730001343894, "learning_rate": 0.00011404362287152646, "loss": 0.2693, "step": 2728 }, { "epoch": 1.5297085201793723, "grad_norm": 0.06948591281865128, "learning_rate": 0.00011397903395354996, "loss": 0.2668, "step": 2729 }, { "epoch": 1.5302690582959642, "grad_norm": 0.06983825876951026, "learning_rate": 0.00011391443908712185, "loss": 0.2685, "step": 2730 }, { "epoch": 1.530829596412556, "grad_norm": 0.0703247996027728, "learning_rate": 0.00011384983829972898, "loss": 0.2661, "step": 2731 }, { "epoch": 1.5313901345291479, "grad_norm": 0.07069065438339443, "learning_rate": 0.00011378523161886066, "loss": 0.2603, "step": 2732 }, { "epoch": 1.53195067264574, "grad_norm": 0.07320307867717173, "learning_rate": 0.00011372061907200881, "loss": 0.2632, "step": 2733 }, { "epoch": 1.532511210762332, "grad_norm": 0.07502228465481751, "learning_rate": 0.0001136560006866678, "loss": 0.2759, "step": 2734 }, { "epoch": 1.5330717488789238, "grad_norm": 0.07260114925117225, "learning_rate": 0.0001135913764903344, "loss": 0.2656, "step": 2735 }, { "epoch": 1.5336322869955157, "grad_norm": 0.07233958278729473, "learning_rate": 0.00011352674651050796, "loss": 0.2648, "step": 2736 }, { "epoch": 1.5341928251121075, "grad_norm": 0.07047796696966747, "learning_rate": 0.00011346211077469029, "loss": 0.2653, "step": 2737 }, { "epoch": 1.5347533632286996, "grad_norm": 0.06960266451046976, "learning_rate": 0.00011339746931038562, "loss": 0.2604, "step": 2738 }, { "epoch": 1.5353139013452914, "grad_norm": 0.07251433886476083, "learning_rate": 0.00011333282214510057, "loss": 0.269, "step": 2739 }, { "epoch": 1.5358744394618835, "grad_norm": 0.07198280804520825, "learning_rate": 0.00011326816930634427, "loss": 0.2739, "step": 2740 }, { "epoch": 1.5364349775784754, "grad_norm": 0.06817081809098341, "learning_rate": 0.00011320351082162821, "loss": 0.2704, "step": 2741 }, { "epoch": 1.5369955156950672, "grad_norm": 0.07273992337057941, "learning_rate": 0.00011313884671846631, "loss": 0.2663, "step": 2742 }, { "epoch": 1.537556053811659, "grad_norm": 0.07221271416711049, "learning_rate": 0.00011307417702437486, "loss": 0.264, "step": 2743 }, { "epoch": 1.5381165919282511, "grad_norm": 0.06943371837414848, "learning_rate": 0.00011300950176687255, "loss": 0.2567, "step": 2744 }, { "epoch": 1.5386771300448432, "grad_norm": 0.07117596406391344, "learning_rate": 0.00011294482097348041, "loss": 0.2744, "step": 2745 }, { "epoch": 1.539237668161435, "grad_norm": 0.07227818845611876, "learning_rate": 0.00011288013467172184, "loss": 0.2778, "step": 2746 }, { "epoch": 1.5397982062780269, "grad_norm": 0.07042713884682089, "learning_rate": 0.00011281544288912264, "loss": 0.2618, "step": 2747 }, { "epoch": 1.5403587443946187, "grad_norm": 0.07304650101652455, "learning_rate": 0.0001127507456532108, "loss": 0.2732, "step": 2748 }, { "epoch": 1.5409192825112108, "grad_norm": 0.07328343982768012, "learning_rate": 0.00011268604299151677, "loss": 0.279, "step": 2749 }, { "epoch": 1.5414798206278026, "grad_norm": 0.07318699184086801, "learning_rate": 0.00011262133493157327, "loss": 0.2713, "step": 2750 }, { "epoch": 1.5420403587443947, "grad_norm": 0.07337036632547349, "learning_rate": 0.00011255662150091526, "loss": 0.2566, "step": 2751 }, { "epoch": 1.5426008968609866, "grad_norm": 0.0695308768661344, "learning_rate": 0.00011249190272708008, "loss": 0.2624, "step": 2752 }, { "epoch": 1.5431614349775784, "grad_norm": 0.0705214090456011, "learning_rate": 0.00011242717863760723, "loss": 0.2628, "step": 2753 }, { "epoch": 1.5437219730941703, "grad_norm": 0.07129424585278271, "learning_rate": 0.00011236244926003865, "loss": 0.2788, "step": 2754 }, { "epoch": 1.5442825112107623, "grad_norm": 0.07344043028329685, "learning_rate": 0.0001122977146219183, "loss": 0.2627, "step": 2755 }, { "epoch": 1.5448430493273544, "grad_norm": 0.07176293018560918, "learning_rate": 0.00011223297475079251, "loss": 0.2728, "step": 2756 }, { "epoch": 1.5454035874439462, "grad_norm": 0.07338480257588491, "learning_rate": 0.00011216822967420985, "loss": 0.2771, "step": 2757 }, { "epoch": 1.545964125560538, "grad_norm": 0.06922021966047334, "learning_rate": 0.00011210347941972108, "loss": 0.2745, "step": 2758 }, { "epoch": 1.54652466367713, "grad_norm": 0.0707187359354573, "learning_rate": 0.00011203872401487916, "loss": 0.2674, "step": 2759 }, { "epoch": 1.547085201793722, "grad_norm": 0.07189796571809781, "learning_rate": 0.00011197396348723923, "loss": 0.2662, "step": 2760 }, { "epoch": 1.547645739910314, "grad_norm": 0.07125606905672409, "learning_rate": 0.00011190919786435863, "loss": 0.2771, "step": 2761 }, { "epoch": 1.548206278026906, "grad_norm": 0.07189254363456374, "learning_rate": 0.00011184442717379686, "loss": 0.27, "step": 2762 }, { "epoch": 1.5487668161434978, "grad_norm": 0.06947296257811135, "learning_rate": 0.00011177965144311556, "loss": 0.2813, "step": 2763 }, { "epoch": 1.5493273542600896, "grad_norm": 0.07347781274131902, "learning_rate": 0.00011171487069987851, "loss": 0.2789, "step": 2764 }, { "epoch": 1.5498878923766815, "grad_norm": 0.06778496555499179, "learning_rate": 0.00011165008497165168, "loss": 0.2601, "step": 2765 }, { "epoch": 1.5504484304932735, "grad_norm": 0.07252596866984758, "learning_rate": 0.00011158529428600313, "loss": 0.2689, "step": 2766 }, { "epoch": 1.5510089686098656, "grad_norm": 0.0733734824623474, "learning_rate": 0.00011152049867050305, "loss": 0.266, "step": 2767 }, { "epoch": 1.5515695067264574, "grad_norm": 0.06979653415887406, "learning_rate": 0.0001114556981527236, "loss": 0.2585, "step": 2768 }, { "epoch": 1.5521300448430493, "grad_norm": 0.06970372354236257, "learning_rate": 0.00011139089276023919, "loss": 0.2684, "step": 2769 }, { "epoch": 1.5526905829596411, "grad_norm": 0.07104856291168664, "learning_rate": 0.00011132608252062629, "loss": 0.2744, "step": 2770 }, { "epoch": 1.5532511210762332, "grad_norm": 0.07079411878105876, "learning_rate": 0.0001112612674614633, "loss": 0.2643, "step": 2771 }, { "epoch": 1.5538116591928253, "grad_norm": 0.0681075421107277, "learning_rate": 0.00011119644761033078, "loss": 0.2678, "step": 2772 }, { "epoch": 1.5543721973094171, "grad_norm": 0.0721166405276055, "learning_rate": 0.00011113162299481134, "loss": 0.2744, "step": 2773 }, { "epoch": 1.554932735426009, "grad_norm": 0.07146514304558317, "learning_rate": 0.00011106679364248957, "loss": 0.2532, "step": 2774 }, { "epoch": 1.5554932735426008, "grad_norm": 0.06874137621174602, "learning_rate": 0.00011100195958095208, "loss": 0.2676, "step": 2775 }, { "epoch": 1.5560538116591929, "grad_norm": 0.0709707177707054, "learning_rate": 0.00011093712083778746, "loss": 0.2658, "step": 2776 }, { "epoch": 1.5566143497757847, "grad_norm": 0.07155378484789673, "learning_rate": 0.00011087227744058637, "loss": 0.2703, "step": 2777 }, { "epoch": 1.5571748878923768, "grad_norm": 0.07099205099644741, "learning_rate": 0.00011080742941694136, "loss": 0.2521, "step": 2778 }, { "epoch": 1.5577354260089686, "grad_norm": 0.07149758730548149, "learning_rate": 0.00011074257679444702, "loss": 0.2676, "step": 2779 }, { "epoch": 1.5582959641255605, "grad_norm": 0.07460605906429514, "learning_rate": 0.00011067771960069991, "loss": 0.2743, "step": 2780 }, { "epoch": 1.5588565022421523, "grad_norm": 0.07055552984018025, "learning_rate": 0.0001106128578632984, "loss": 0.2762, "step": 2781 }, { "epoch": 1.5594170403587444, "grad_norm": 0.06945617974553053, "learning_rate": 0.000110547991609843, "loss": 0.2601, "step": 2782 }, { "epoch": 1.5599775784753365, "grad_norm": 0.07204499448870326, "learning_rate": 0.00011048312086793593, "loss": 0.2736, "step": 2783 }, { "epoch": 1.5605381165919283, "grad_norm": 0.06979439642918006, "learning_rate": 0.00011041824566518146, "loss": 0.2624, "step": 2784 }, { "epoch": 1.5610986547085202, "grad_norm": 0.0715784064127982, "learning_rate": 0.00011035336602918575, "loss": 0.2789, "step": 2785 }, { "epoch": 1.561659192825112, "grad_norm": 0.06969271980528327, "learning_rate": 0.00011028848198755674, "loss": 0.2662, "step": 2786 }, { "epoch": 1.562219730941704, "grad_norm": 0.07031677060499117, "learning_rate": 0.00011022359356790444, "loss": 0.2613, "step": 2787 }, { "epoch": 1.562780269058296, "grad_norm": 0.0713995278588566, "learning_rate": 0.00011015870079784048, "loss": 0.2679, "step": 2788 }, { "epoch": 1.563340807174888, "grad_norm": 0.07023125811789262, "learning_rate": 0.00011009380370497851, "loss": 0.2739, "step": 2789 }, { "epoch": 1.5639013452914798, "grad_norm": 0.0711916126481959, "learning_rate": 0.00011002890231693395, "loss": 0.2637, "step": 2790 }, { "epoch": 1.5644618834080717, "grad_norm": 0.07193346331917248, "learning_rate": 0.00010996399666132411, "loss": 0.2602, "step": 2791 }, { "epoch": 1.5650224215246635, "grad_norm": 0.07058975312724415, "learning_rate": 0.00010989908676576807, "loss": 0.2622, "step": 2792 }, { "epoch": 1.5655829596412556, "grad_norm": 0.07184971261985763, "learning_rate": 0.00010983417265788673, "loss": 0.2715, "step": 2793 }, { "epoch": 1.5661434977578477, "grad_norm": 0.070371241492397, "learning_rate": 0.00010976925436530275, "loss": 0.2638, "step": 2794 }, { "epoch": 1.5667040358744395, "grad_norm": 0.06995184283050783, "learning_rate": 0.00010970433191564058, "loss": 0.2468, "step": 2795 }, { "epoch": 1.5672645739910314, "grad_norm": 0.07167981001709199, "learning_rate": 0.00010963940533652648, "loss": 0.2804, "step": 2796 }, { "epoch": 1.5678251121076232, "grad_norm": 0.0736796357197792, "learning_rate": 0.00010957447465558844, "loss": 0.2732, "step": 2797 }, { "epoch": 1.5683856502242153, "grad_norm": 0.07222731874484677, "learning_rate": 0.00010950953990045615, "loss": 0.2703, "step": 2798 }, { "epoch": 1.5689461883408071, "grad_norm": 0.07294943644454632, "learning_rate": 0.00010944460109876116, "loss": 0.2614, "step": 2799 }, { "epoch": 1.5695067264573992, "grad_norm": 0.0697380808648057, "learning_rate": 0.00010937965827813661, "loss": 0.2551, "step": 2800 }, { "epoch": 1.570067264573991, "grad_norm": 0.06959792050929359, "learning_rate": 0.00010931471146621743, "loss": 0.264, "step": 2801 }, { "epoch": 1.5706278026905829, "grad_norm": 0.07226178235763066, "learning_rate": 0.00010924976069064017, "loss": 0.2611, "step": 2802 }, { "epoch": 1.5711883408071747, "grad_norm": 0.06982425711033327, "learning_rate": 0.00010918480597904317, "loss": 0.2624, "step": 2803 }, { "epoch": 1.5717488789237668, "grad_norm": 0.07167713506916018, "learning_rate": 0.00010911984735906635, "loss": 0.2766, "step": 2804 }, { "epoch": 1.5723094170403589, "grad_norm": 0.07335757508509609, "learning_rate": 0.00010905488485835138, "loss": 0.2651, "step": 2805 }, { "epoch": 1.5728699551569507, "grad_norm": 0.0690048074688932, "learning_rate": 0.00010898991850454148, "loss": 0.2609, "step": 2806 }, { "epoch": 1.5734304932735426, "grad_norm": 0.06899119672362618, "learning_rate": 0.00010892494832528161, "loss": 0.2449, "step": 2807 }, { "epoch": 1.5739910313901344, "grad_norm": 0.06809427165175322, "learning_rate": 0.00010885997434821831, "loss": 0.2741, "step": 2808 }, { "epoch": 1.5745515695067265, "grad_norm": 0.07024340254718424, "learning_rate": 0.0001087949966009997, "loss": 0.2712, "step": 2809 }, { "epoch": 1.5751121076233185, "grad_norm": 0.0713010211256685, "learning_rate": 0.00010873001511127556, "loss": 0.2719, "step": 2810 }, { "epoch": 1.5756726457399104, "grad_norm": 0.07275783822804599, "learning_rate": 0.0001086650299066973, "loss": 0.2846, "step": 2811 }, { "epoch": 1.5762331838565022, "grad_norm": 0.0708113444975407, "learning_rate": 0.00010860004101491779, "loss": 0.2727, "step": 2812 }, { "epoch": 1.576793721973094, "grad_norm": 0.07054937568232769, "learning_rate": 0.00010853504846359157, "loss": 0.2584, "step": 2813 }, { "epoch": 1.577354260089686, "grad_norm": 0.06924677616308327, "learning_rate": 0.0001084700522803747, "loss": 0.2642, "step": 2814 }, { "epoch": 1.577914798206278, "grad_norm": 0.07141196823985863, "learning_rate": 0.00010840505249292476, "loss": 0.273, "step": 2815 }, { "epoch": 1.57847533632287, "grad_norm": 0.07110415841307303, "learning_rate": 0.00010834004912890092, "loss": 0.2704, "step": 2816 }, { "epoch": 1.579035874439462, "grad_norm": 0.07098262554380318, "learning_rate": 0.00010827504221596387, "loss": 0.284, "step": 2817 }, { "epoch": 1.5795964125560538, "grad_norm": 0.06938465067367643, "learning_rate": 0.0001082100317817757, "loss": 0.2654, "step": 2818 }, { "epoch": 1.5801569506726456, "grad_norm": 0.07177108422535652, "learning_rate": 0.00010814501785400017, "loss": 0.2643, "step": 2819 }, { "epoch": 1.5807174887892377, "grad_norm": 0.07017833742767383, "learning_rate": 0.0001080800004603024, "loss": 0.253, "step": 2820 }, { "epoch": 1.5812780269058297, "grad_norm": 0.07174402594855155, "learning_rate": 0.00010801497962834906, "loss": 0.264, "step": 2821 }, { "epoch": 1.5818385650224216, "grad_norm": 0.0738102730992614, "learning_rate": 0.00010794995538580819, "loss": 0.2597, "step": 2822 }, { "epoch": 1.5823991031390134, "grad_norm": 0.07128680620070806, "learning_rate": 0.00010788492776034935, "loss": 0.2648, "step": 2823 }, { "epoch": 1.5829596412556053, "grad_norm": 0.07157932013154396, "learning_rate": 0.00010781989677964355, "loss": 0.2589, "step": 2824 }, { "epoch": 1.5835201793721974, "grad_norm": 0.07198657642299293, "learning_rate": 0.00010775486247136322, "loss": 0.2638, "step": 2825 }, { "epoch": 1.5840807174887892, "grad_norm": 0.07131804998406553, "learning_rate": 0.00010768982486318216, "loss": 0.2574, "step": 2826 }, { "epoch": 1.5846412556053813, "grad_norm": 0.06973227350515016, "learning_rate": 0.00010762478398277563, "loss": 0.2503, "step": 2827 }, { "epoch": 1.5852017937219731, "grad_norm": 0.07223748123285785, "learning_rate": 0.00010755973985782022, "loss": 0.2601, "step": 2828 }, { "epoch": 1.585762331838565, "grad_norm": 0.07199270000671919, "learning_rate": 0.000107494692515994, "loss": 0.2676, "step": 2829 }, { "epoch": 1.5863228699551568, "grad_norm": 0.07102152544902327, "learning_rate": 0.00010742964198497629, "loss": 0.2771, "step": 2830 }, { "epoch": 1.5868834080717489, "grad_norm": 0.06943510787252358, "learning_rate": 0.00010736458829244785, "loss": 0.2591, "step": 2831 }, { "epoch": 1.587443946188341, "grad_norm": 0.07229759136086263, "learning_rate": 0.00010729953146609076, "loss": 0.2679, "step": 2832 }, { "epoch": 1.5880044843049328, "grad_norm": 0.07382303361218785, "learning_rate": 0.00010723447153358843, "loss": 0.2748, "step": 2833 }, { "epoch": 1.5885650224215246, "grad_norm": 0.0722381606461495, "learning_rate": 0.00010716940852262564, "loss": 0.2711, "step": 2834 }, { "epoch": 1.5891255605381165, "grad_norm": 0.07174213151662394, "learning_rate": 0.00010710434246088834, "loss": 0.2786, "step": 2835 }, { "epoch": 1.5896860986547086, "grad_norm": 0.07072008633108894, "learning_rate": 0.00010703927337606396, "loss": 0.266, "step": 2836 }, { "epoch": 1.5902466367713004, "grad_norm": 0.06802868005905105, "learning_rate": 0.00010697420129584108, "loss": 0.2702, "step": 2837 }, { "epoch": 1.5908071748878925, "grad_norm": 0.0706938270071413, "learning_rate": 0.00010690912624790966, "loss": 0.2606, "step": 2838 }, { "epoch": 1.5913677130044843, "grad_norm": 0.07081635104087369, "learning_rate": 0.00010684404825996079, "loss": 0.2636, "step": 2839 }, { "epoch": 1.5919282511210762, "grad_norm": 0.07197010979102217, "learning_rate": 0.00010677896735968693, "loss": 0.2752, "step": 2840 }, { "epoch": 1.592488789237668, "grad_norm": 0.07214003327677651, "learning_rate": 0.00010671388357478179, "loss": 0.2859, "step": 2841 }, { "epoch": 1.59304932735426, "grad_norm": 0.07586786214389095, "learning_rate": 0.00010664879693294017, "loss": 0.2514, "step": 2842 }, { "epoch": 1.5936098654708521, "grad_norm": 0.07060520451179678, "learning_rate": 0.00010658370746185817, "loss": 0.2679, "step": 2843 }, { "epoch": 1.594170403587444, "grad_norm": 0.07028483497476899, "learning_rate": 0.00010651861518923319, "loss": 0.2706, "step": 2844 }, { "epoch": 1.5947309417040358, "grad_norm": 0.06893105853432323, "learning_rate": 0.00010645352014276364, "loss": 0.2566, "step": 2845 }, { "epoch": 1.5952914798206277, "grad_norm": 0.07169038763566891, "learning_rate": 0.00010638842235014924, "loss": 0.2579, "step": 2846 }, { "epoch": 1.5958520179372198, "grad_norm": 0.07049140886089875, "learning_rate": 0.0001063233218390908, "loss": 0.2701, "step": 2847 }, { "epoch": 1.5964125560538116, "grad_norm": 0.07083561940110712, "learning_rate": 0.00010625821863729036, "loss": 0.2764, "step": 2848 }, { "epoch": 1.5969730941704037, "grad_norm": 0.07107293763979249, "learning_rate": 0.00010619311277245104, "loss": 0.2575, "step": 2849 }, { "epoch": 1.5975336322869955, "grad_norm": 0.0701489084443605, "learning_rate": 0.00010612800427227714, "loss": 0.2686, "step": 2850 }, { "epoch": 1.5980941704035874, "grad_norm": 0.07270388701694894, "learning_rate": 0.00010606289316447406, "loss": 0.2764, "step": 2851 }, { "epoch": 1.5986547085201792, "grad_norm": 0.0703825627000912, "learning_rate": 0.00010599777947674829, "loss": 0.26, "step": 2852 }, { "epoch": 1.5992152466367713, "grad_norm": 0.07263488626169894, "learning_rate": 0.00010593266323680749, "loss": 0.2674, "step": 2853 }, { "epoch": 1.5997757847533634, "grad_norm": 0.0717526815615539, "learning_rate": 0.00010586754447236031, "loss": 0.2695, "step": 2854 }, { "epoch": 1.6003363228699552, "grad_norm": 0.0693101009253264, "learning_rate": 0.00010580242321111653, "loss": 0.2574, "step": 2855 }, { "epoch": 1.600896860986547, "grad_norm": 0.06893498612964494, "learning_rate": 0.00010573729948078699, "loss": 0.2598, "step": 2856 }, { "epoch": 1.601457399103139, "grad_norm": 0.07264346152373237, "learning_rate": 0.00010567217330908357, "loss": 0.268, "step": 2857 }, { "epoch": 1.602017937219731, "grad_norm": 0.07215474208576995, "learning_rate": 0.00010560704472371919, "loss": 0.2705, "step": 2858 }, { "epoch": 1.602578475336323, "grad_norm": 0.07014321948741521, "learning_rate": 0.0001055419137524078, "loss": 0.2805, "step": 2859 }, { "epoch": 1.6031390134529149, "grad_norm": 0.07162201942423144, "learning_rate": 0.00010547678042286436, "loss": 0.2602, "step": 2860 }, { "epoch": 1.6036995515695067, "grad_norm": 0.0691817907815888, "learning_rate": 0.00010541164476280487, "loss": 0.2706, "step": 2861 }, { "epoch": 1.6042600896860986, "grad_norm": 0.06968361003295294, "learning_rate": 0.00010534650679994627, "loss": 0.2656, "step": 2862 }, { "epoch": 1.6048206278026906, "grad_norm": 0.07128643674238011, "learning_rate": 0.00010528136656200647, "loss": 0.2787, "step": 2863 }, { "epoch": 1.6053811659192825, "grad_norm": 0.06960925004641096, "learning_rate": 0.00010521622407670439, "loss": 0.2698, "step": 2864 }, { "epoch": 1.6059417040358746, "grad_norm": 0.07042370337542646, "learning_rate": 0.00010515107937175995, "loss": 0.2716, "step": 2865 }, { "epoch": 1.6065022421524664, "grad_norm": 0.07097628524248398, "learning_rate": 0.00010508593247489389, "loss": 0.2716, "step": 2866 }, { "epoch": 1.6070627802690582, "grad_norm": 0.06916798698525788, "learning_rate": 0.00010502078341382797, "loss": 0.2671, "step": 2867 }, { "epoch": 1.60762331838565, "grad_norm": 0.07064265752587373, "learning_rate": 0.00010495563221628486, "loss": 0.2617, "step": 2868 }, { "epoch": 1.6081838565022422, "grad_norm": 0.06920364415237995, "learning_rate": 0.00010489047890998816, "loss": 0.2647, "step": 2869 }, { "epoch": 1.6087443946188342, "grad_norm": 0.07259613304903145, "learning_rate": 0.00010482532352266227, "loss": 0.2796, "step": 2870 }, { "epoch": 1.609304932735426, "grad_norm": 0.07272303135725403, "learning_rate": 0.0001047601660820326, "loss": 0.2742, "step": 2871 }, { "epoch": 1.609865470852018, "grad_norm": 0.07268996656414749, "learning_rate": 0.00010469500661582536, "loss": 0.2598, "step": 2872 }, { "epoch": 1.6104260089686098, "grad_norm": 0.07097297630819888, "learning_rate": 0.00010462984515176764, "loss": 0.2794, "step": 2873 }, { "epoch": 1.6109865470852018, "grad_norm": 0.07015583243603275, "learning_rate": 0.0001045646817175874, "loss": 0.2764, "step": 2874 }, { "epoch": 1.6115470852017937, "grad_norm": 0.07061977707648315, "learning_rate": 0.00010449951634101338, "loss": 0.268, "step": 2875 }, { "epoch": 1.6121076233183858, "grad_norm": 0.07129773048093416, "learning_rate": 0.00010443434904977518, "loss": 0.2815, "step": 2876 }, { "epoch": 1.6126681614349776, "grad_norm": 0.07530695328656455, "learning_rate": 0.00010436917987160328, "loss": 0.273, "step": 2877 }, { "epoch": 1.6132286995515694, "grad_norm": 0.06912379645418971, "learning_rate": 0.00010430400883422886, "loss": 0.2727, "step": 2878 }, { "epoch": 1.6137892376681613, "grad_norm": 0.0703260999096315, "learning_rate": 0.00010423883596538395, "loss": 0.2696, "step": 2879 }, { "epoch": 1.6143497757847534, "grad_norm": 0.07116881586283529, "learning_rate": 0.00010417366129280133, "loss": 0.2648, "step": 2880 }, { "epoch": 1.6149103139013454, "grad_norm": 0.06900435851378543, "learning_rate": 0.00010410848484421454, "loss": 0.2646, "step": 2881 }, { "epoch": 1.6154708520179373, "grad_norm": 0.07252026805511815, "learning_rate": 0.00010404330664735796, "loss": 0.2634, "step": 2882 }, { "epoch": 1.6160313901345291, "grad_norm": 0.0704851611576344, "learning_rate": 0.00010397812672996658, "loss": 0.2651, "step": 2883 }, { "epoch": 1.616591928251121, "grad_norm": 0.06915927703579841, "learning_rate": 0.00010391294511977623, "loss": 0.2615, "step": 2884 }, { "epoch": 1.617152466367713, "grad_norm": 0.07130739293659118, "learning_rate": 0.0001038477618445234, "loss": 0.2555, "step": 2885 }, { "epoch": 1.6177130044843049, "grad_norm": 0.0712896928495532, "learning_rate": 0.00010378257693194535, "loss": 0.2785, "step": 2886 }, { "epoch": 1.618273542600897, "grad_norm": 0.07182611511899527, "learning_rate": 0.00010371739040978, "loss": 0.2774, "step": 2887 }, { "epoch": 1.6188340807174888, "grad_norm": 0.07218038843947314, "learning_rate": 0.0001036522023057659, "loss": 0.2696, "step": 2888 }, { "epoch": 1.6193946188340806, "grad_norm": 0.0708372643358692, "learning_rate": 0.00010358701264764234, "loss": 0.2632, "step": 2889 }, { "epoch": 1.6199551569506725, "grad_norm": 0.07305156201554344, "learning_rate": 0.00010352182146314931, "loss": 0.2802, "step": 2890 }, { "epoch": 1.6205156950672646, "grad_norm": 0.06901337263656344, "learning_rate": 0.00010345662878002733, "loss": 0.2647, "step": 2891 }, { "epoch": 1.6210762331838566, "grad_norm": 0.07305095994353092, "learning_rate": 0.00010339143462601768, "loss": 0.2698, "step": 2892 }, { "epoch": 1.6216367713004485, "grad_norm": 0.0718301702444084, "learning_rate": 0.00010332623902886214, "loss": 0.2693, "step": 2893 }, { "epoch": 1.6221973094170403, "grad_norm": 0.07207048037056661, "learning_rate": 0.00010326104201630326, "loss": 0.2813, "step": 2894 }, { "epoch": 1.6227578475336322, "grad_norm": 0.07257496358147895, "learning_rate": 0.00010319584361608407, "loss": 0.278, "step": 2895 }, { "epoch": 1.6233183856502242, "grad_norm": 0.07193843740836481, "learning_rate": 0.00010313064385594822, "loss": 0.266, "step": 2896 }, { "epoch": 1.6238789237668163, "grad_norm": 0.06886935555901082, "learning_rate": 0.00010306544276363992, "loss": 0.272, "step": 2897 }, { "epoch": 1.6244394618834082, "grad_norm": 0.07197003880910544, "learning_rate": 0.00010300024036690402, "loss": 0.2625, "step": 2898 }, { "epoch": 1.625, "grad_norm": 0.06942898499670558, "learning_rate": 0.00010293503669348586, "loss": 0.269, "step": 2899 }, { "epoch": 1.6255605381165918, "grad_norm": 0.0705762892144056, "learning_rate": 0.00010286983177113135, "loss": 0.2698, "step": 2900 }, { "epoch": 1.6261210762331837, "grad_norm": 0.06944684274046396, "learning_rate": 0.0001028046256275869, "loss": 0.2689, "step": 2901 }, { "epoch": 1.6266816143497758, "grad_norm": 0.07229536193834671, "learning_rate": 0.0001027394182905995, "loss": 0.2604, "step": 2902 }, { "epoch": 1.6272421524663678, "grad_norm": 0.07167360624516773, "learning_rate": 0.00010267420978791657, "loss": 0.2532, "step": 2903 }, { "epoch": 1.6278026905829597, "grad_norm": 0.06910521396979334, "learning_rate": 0.0001026090001472861, "loss": 0.2666, "step": 2904 }, { "epoch": 1.6283632286995515, "grad_norm": 0.0708341927339222, "learning_rate": 0.00010254378939645648, "loss": 0.2763, "step": 2905 }, { "epoch": 1.6289237668161434, "grad_norm": 0.07150730049544997, "learning_rate": 0.00010247857756317666, "loss": 0.2734, "step": 2906 }, { "epoch": 1.6294843049327354, "grad_norm": 0.06995895983103713, "learning_rate": 0.00010241336467519604, "loss": 0.2657, "step": 2907 }, { "epoch": 1.6300448430493275, "grad_norm": 0.07108159682180759, "learning_rate": 0.00010234815076026442, "loss": 0.2645, "step": 2908 }, { "epoch": 1.6306053811659194, "grad_norm": 0.07106786705261911, "learning_rate": 0.00010228293584613203, "loss": 0.2742, "step": 2909 }, { "epoch": 1.6311659192825112, "grad_norm": 0.07098809892260202, "learning_rate": 0.00010221771996054958, "loss": 0.276, "step": 2910 }, { "epoch": 1.631726457399103, "grad_norm": 0.0716960153271508, "learning_rate": 0.00010215250313126817, "loss": 0.2619, "step": 2911 }, { "epoch": 1.6322869955156951, "grad_norm": 0.07146872339606528, "learning_rate": 0.00010208728538603929, "loss": 0.2685, "step": 2912 }, { "epoch": 1.632847533632287, "grad_norm": 0.0693907623417422, "learning_rate": 0.00010202206675261484, "loss": 0.2715, "step": 2913 }, { "epoch": 1.633408071748879, "grad_norm": 0.07049327137235907, "learning_rate": 0.00010195684725874706, "loss": 0.2639, "step": 2914 }, { "epoch": 1.6339686098654709, "grad_norm": 0.07056680465844478, "learning_rate": 0.00010189162693218864, "loss": 0.2688, "step": 2915 }, { "epoch": 1.6345291479820627, "grad_norm": 0.07165852741318098, "learning_rate": 0.0001018264058006925, "loss": 0.2876, "step": 2916 }, { "epoch": 1.6350896860986546, "grad_norm": 0.07032431728646045, "learning_rate": 0.00010176118389201201, "loss": 0.2641, "step": 2917 }, { "epoch": 1.6356502242152466, "grad_norm": 0.07229702801072894, "learning_rate": 0.00010169596123390082, "loss": 0.2783, "step": 2918 }, { "epoch": 1.6362107623318387, "grad_norm": 0.06945481415257722, "learning_rate": 0.0001016307378541129, "loss": 0.2619, "step": 2919 }, { "epoch": 1.6367713004484306, "grad_norm": 0.06876431346048285, "learning_rate": 0.00010156551378040258, "loss": 0.2782, "step": 2920 }, { "epoch": 1.6373318385650224, "grad_norm": 0.07014776896764986, "learning_rate": 0.0001015002890405244, "loss": 0.2644, "step": 2921 }, { "epoch": 1.6378923766816142, "grad_norm": 0.06792938310515426, "learning_rate": 0.00010143506366223323, "loss": 0.2728, "step": 2922 }, { "epoch": 1.6384529147982063, "grad_norm": 0.06949802951722193, "learning_rate": 0.00010136983767328422, "loss": 0.2537, "step": 2923 }, { "epoch": 1.6390134529147982, "grad_norm": 0.06706218003585812, "learning_rate": 0.00010130461110143277, "loss": 0.2631, "step": 2924 }, { "epoch": 1.6395739910313902, "grad_norm": 0.06924991154675039, "learning_rate": 0.00010123938397443451, "loss": 0.2664, "step": 2925 }, { "epoch": 1.640134529147982, "grad_norm": 0.07235014008202201, "learning_rate": 0.0001011741563200453, "loss": 0.2656, "step": 2926 }, { "epoch": 1.640695067264574, "grad_norm": 0.06985174256664821, "learning_rate": 0.0001011089281660213, "loss": 0.2682, "step": 2927 }, { "epoch": 1.6412556053811658, "grad_norm": 0.0700158681377797, "learning_rate": 0.00010104369954011883, "loss": 0.2668, "step": 2928 }, { "epoch": 1.6418161434977578, "grad_norm": 0.07156922368606, "learning_rate": 0.00010097847047009437, "loss": 0.2554, "step": 2929 }, { "epoch": 1.64237668161435, "grad_norm": 0.0719107263817642, "learning_rate": 0.00010091324098370458, "loss": 0.2701, "step": 2930 }, { "epoch": 1.6429372197309418, "grad_norm": 0.07293399490486906, "learning_rate": 0.00010084801110870648, "loss": 0.2671, "step": 2931 }, { "epoch": 1.6434977578475336, "grad_norm": 0.07003004697968263, "learning_rate": 0.000100782780872857, "loss": 0.2653, "step": 2932 }, { "epoch": 1.6440582959641254, "grad_norm": 0.06935644692435886, "learning_rate": 0.0001007175503039134, "loss": 0.2501, "step": 2933 }, { "epoch": 1.6446188340807175, "grad_norm": 0.06864500191077569, "learning_rate": 0.00010065231942963305, "loss": 0.2634, "step": 2934 }, { "epoch": 1.6451793721973094, "grad_norm": 0.06940400777998981, "learning_rate": 0.00010058708827777335, "loss": 0.262, "step": 2935 }, { "epoch": 1.6457399103139014, "grad_norm": 0.07022841818358101, "learning_rate": 0.00010052185687609197, "loss": 0.2675, "step": 2936 }, { "epoch": 1.6463004484304933, "grad_norm": 0.06986936343805794, "learning_rate": 0.00010045662525234656, "loss": 0.2682, "step": 2937 }, { "epoch": 1.6468609865470851, "grad_norm": 0.06864227560784018, "learning_rate": 0.00010039139343429492, "loss": 0.257, "step": 2938 }, { "epoch": 1.647421524663677, "grad_norm": 0.07076917806402258, "learning_rate": 0.00010032616144969494, "loss": 0.2672, "step": 2939 }, { "epoch": 1.647982062780269, "grad_norm": 0.06974530633846089, "learning_rate": 0.00010026092932630457, "loss": 0.2626, "step": 2940 }, { "epoch": 1.648542600896861, "grad_norm": 0.06737481489290036, "learning_rate": 0.00010019569709188186, "loss": 0.2492, "step": 2941 }, { "epoch": 1.649103139013453, "grad_norm": 0.07101003567831017, "learning_rate": 0.00010013046477418475, "loss": 0.257, "step": 2942 }, { "epoch": 1.6496636771300448, "grad_norm": 0.07198482669475338, "learning_rate": 0.00010006523240097146, "loss": 0.2745, "step": 2943 }, { "epoch": 1.6502242152466366, "grad_norm": 0.07116817086657783, "learning_rate": 0.0001, "loss": 0.2639, "step": 2944 }, { "epoch": 1.6507847533632287, "grad_norm": 0.07064895989898635, "learning_rate": 9.99347675990286e-05, "loss": 0.264, "step": 2945 }, { "epoch": 1.6513452914798208, "grad_norm": 0.07006740044190507, "learning_rate": 9.986953522581526e-05, "loss": 0.2744, "step": 2946 }, { "epoch": 1.6519058295964126, "grad_norm": 0.06985943320704728, "learning_rate": 9.980430290811818e-05, "loss": 0.2547, "step": 2947 }, { "epoch": 1.6524663677130045, "grad_norm": 0.07137157676555536, "learning_rate": 9.973907067369543e-05, "loss": 0.2724, "step": 2948 }, { "epoch": 1.6530269058295963, "grad_norm": 0.07279180942706098, "learning_rate": 9.967383855030509e-05, "loss": 0.2759, "step": 2949 }, { "epoch": 1.6535874439461884, "grad_norm": 0.0704488424146485, "learning_rate": 9.960860656570509e-05, "loss": 0.2751, "step": 2950 }, { "epoch": 1.6541479820627802, "grad_norm": 0.07104883829550132, "learning_rate": 9.954337474765347e-05, "loss": 0.2755, "step": 2951 }, { "epoch": 1.6547085201793723, "grad_norm": 0.07269547521425603, "learning_rate": 9.947814312390808e-05, "loss": 0.2648, "step": 2952 }, { "epoch": 1.6552690582959642, "grad_norm": 0.07173620153129019, "learning_rate": 9.941291172222666e-05, "loss": 0.277, "step": 2953 }, { "epoch": 1.655829596412556, "grad_norm": 0.07091369461295653, "learning_rate": 9.934768057036699e-05, "loss": 0.2728, "step": 2954 }, { "epoch": 1.6563901345291479, "grad_norm": 0.06941523353081054, "learning_rate": 9.928244969608659e-05, "loss": 0.2687, "step": 2955 }, { "epoch": 1.65695067264574, "grad_norm": 0.07021619384640268, "learning_rate": 9.921721912714301e-05, "loss": 0.2576, "step": 2956 }, { "epoch": 1.657511210762332, "grad_norm": 0.07064173212862243, "learning_rate": 9.915198889129353e-05, "loss": 0.2635, "step": 2957 }, { "epoch": 1.6580717488789238, "grad_norm": 0.07017826202142279, "learning_rate": 9.908675901629543e-05, "loss": 0.2563, "step": 2958 }, { "epoch": 1.6586322869955157, "grad_norm": 0.07167348464219234, "learning_rate": 9.902152952990568e-05, "loss": 0.2757, "step": 2959 }, { "epoch": 1.6591928251121075, "grad_norm": 0.07084110287213552, "learning_rate": 9.89563004598812e-05, "loss": 0.2662, "step": 2960 }, { "epoch": 1.6597533632286996, "grad_norm": 0.07005592040282468, "learning_rate": 9.889107183397872e-05, "loss": 0.2584, "step": 2961 }, { "epoch": 1.6603139013452914, "grad_norm": 0.06939271278116278, "learning_rate": 9.88258436799547e-05, "loss": 0.2642, "step": 2962 }, { "epoch": 1.6608744394618835, "grad_norm": 0.07270946229402037, "learning_rate": 9.876061602556552e-05, "loss": 0.2709, "step": 2963 }, { "epoch": 1.6614349775784754, "grad_norm": 0.0725881565646497, "learning_rate": 9.869538889856723e-05, "loss": 0.2761, "step": 2964 }, { "epoch": 1.6619955156950672, "grad_norm": 0.07158674809084947, "learning_rate": 9.86301623267158e-05, "loss": 0.2689, "step": 2965 }, { "epoch": 1.662556053811659, "grad_norm": 0.07060696614329968, "learning_rate": 9.856493633776682e-05, "loss": 0.2577, "step": 2966 }, { "epoch": 1.6631165919282511, "grad_norm": 0.07180296984368345, "learning_rate": 9.849971095947562e-05, "loss": 0.2628, "step": 2967 }, { "epoch": 1.6636771300448432, "grad_norm": 0.06809368516988418, "learning_rate": 9.843448621959745e-05, "loss": 0.2569, "step": 2968 }, { "epoch": 1.664237668161435, "grad_norm": 0.06953623240111217, "learning_rate": 9.83692621458871e-05, "loss": 0.2587, "step": 2969 }, { "epoch": 1.6647982062780269, "grad_norm": 0.07132151939492777, "learning_rate": 9.830403876609922e-05, "loss": 0.2677, "step": 2970 }, { "epoch": 1.6653587443946187, "grad_norm": 0.06760232918482255, "learning_rate": 9.823881610798804e-05, "loss": 0.2539, "step": 2971 }, { "epoch": 1.6659192825112108, "grad_norm": 0.06873085708011183, "learning_rate": 9.817359419930751e-05, "loss": 0.2645, "step": 2972 }, { "epoch": 1.6664798206278026, "grad_norm": 0.06715588054507451, "learning_rate": 9.810837306781141e-05, "loss": 0.2527, "step": 2973 }, { "epoch": 1.6670403587443947, "grad_norm": 0.07141325757822094, "learning_rate": 9.804315274125295e-05, "loss": 0.2632, "step": 2974 }, { "epoch": 1.6676008968609866, "grad_norm": 0.07067485424500766, "learning_rate": 9.797793324738519e-05, "loss": 0.2781, "step": 2975 }, { "epoch": 1.6681614349775784, "grad_norm": 0.07027156452442415, "learning_rate": 9.79127146139607e-05, "loss": 0.2716, "step": 2976 }, { "epoch": 1.6687219730941703, "grad_norm": 0.06902926780891283, "learning_rate": 9.784749686873185e-05, "loss": 0.2685, "step": 2977 }, { "epoch": 1.6692825112107623, "grad_norm": 0.07009559217339502, "learning_rate": 9.778228003945047e-05, "loss": 0.2579, "step": 2978 }, { "epoch": 1.6698430493273544, "grad_norm": 0.07290110289519239, "learning_rate": 9.7717064153868e-05, "loss": 0.2685, "step": 2979 }, { "epoch": 1.6704035874439462, "grad_norm": 0.07219199438100524, "learning_rate": 9.765184923973561e-05, "loss": 0.2663, "step": 2980 }, { "epoch": 1.670964125560538, "grad_norm": 0.07173612240528146, "learning_rate": 9.758663532480395e-05, "loss": 0.262, "step": 2981 }, { "epoch": 1.67152466367713, "grad_norm": 0.06999620724925222, "learning_rate": 9.752142243682335e-05, "loss": 0.2646, "step": 2982 }, { "epoch": 1.672085201793722, "grad_norm": 0.07583412286829273, "learning_rate": 9.745621060354353e-05, "loss": 0.2684, "step": 2983 }, { "epoch": 1.672645739910314, "grad_norm": 0.0717555811476974, "learning_rate": 9.739099985271394e-05, "loss": 0.2688, "step": 2984 }, { "epoch": 1.673206278026906, "grad_norm": 0.071767650515776, "learning_rate": 9.732579021208348e-05, "loss": 0.2727, "step": 2985 }, { "epoch": 1.6737668161434978, "grad_norm": 0.0716331600302753, "learning_rate": 9.726058170940053e-05, "loss": 0.2628, "step": 2986 }, { "epoch": 1.6743273542600896, "grad_norm": 0.07198001567738212, "learning_rate": 9.719537437241312e-05, "loss": 0.2689, "step": 2987 }, { "epoch": 1.6748878923766815, "grad_norm": 0.07180028215768763, "learning_rate": 9.713016822886866e-05, "loss": 0.2868, "step": 2988 }, { "epoch": 1.6754484304932735, "grad_norm": 0.07328396935140088, "learning_rate": 9.706496330651415e-05, "loss": 0.2475, "step": 2989 }, { "epoch": 1.6760089686098656, "grad_norm": 0.06894723066689226, "learning_rate": 9.699975963309599e-05, "loss": 0.266, "step": 2990 }, { "epoch": 1.6765695067264574, "grad_norm": 0.06736843118572723, "learning_rate": 9.693455723636011e-05, "loss": 0.2521, "step": 2991 }, { "epoch": 1.6771300448430493, "grad_norm": 0.07280393560519785, "learning_rate": 9.686935614405183e-05, "loss": 0.279, "step": 2992 }, { "epoch": 1.6776905829596411, "grad_norm": 0.07175747202684021, "learning_rate": 9.680415638391594e-05, "loss": 0.2689, "step": 2993 }, { "epoch": 1.6782511210762332, "grad_norm": 0.07098623385981147, "learning_rate": 9.673895798369676e-05, "loss": 0.2702, "step": 2994 }, { "epoch": 1.6788116591928253, "grad_norm": 0.07088433029769355, "learning_rate": 9.667376097113786e-05, "loss": 0.2702, "step": 2995 }, { "epoch": 1.6793721973094171, "grad_norm": 0.0704308492547985, "learning_rate": 9.660856537398235e-05, "loss": 0.2655, "step": 2996 }, { "epoch": 1.679932735426009, "grad_norm": 0.06944793765819227, "learning_rate": 9.654337121997266e-05, "loss": 0.2773, "step": 2997 }, { "epoch": 1.6804932735426008, "grad_norm": 0.06882161335319166, "learning_rate": 9.647817853685072e-05, "loss": 0.2725, "step": 2998 }, { "epoch": 1.6810538116591929, "grad_norm": 0.06916524968769376, "learning_rate": 9.641298735235768e-05, "loss": 0.2716, "step": 2999 }, { "epoch": 1.6816143497757847, "grad_norm": 0.0690816299425424, "learning_rate": 9.63477976942341e-05, "loss": 0.2726, "step": 3000 }, { "epoch": 1.6821748878923768, "grad_norm": 0.07408137401644892, "learning_rate": 9.628260959022004e-05, "loss": 0.2793, "step": 3001 }, { "epoch": 1.6827354260089686, "grad_norm": 0.07362027591922002, "learning_rate": 9.621742306805465e-05, "loss": 0.2682, "step": 3002 }, { "epoch": 1.6832959641255605, "grad_norm": 0.07145697481318186, "learning_rate": 9.615223815547662e-05, "loss": 0.263, "step": 3003 }, { "epoch": 1.6838565022421523, "grad_norm": 0.07114809621469859, "learning_rate": 9.608705488022378e-05, "loss": 0.2571, "step": 3004 }, { "epoch": 1.6844170403587444, "grad_norm": 0.07275872617108507, "learning_rate": 9.602187327003344e-05, "loss": 0.2592, "step": 3005 }, { "epoch": 1.6849775784753365, "grad_norm": 0.07080794287021444, "learning_rate": 9.59566933526421e-05, "loss": 0.2728, "step": 3006 }, { "epoch": 1.6855381165919283, "grad_norm": 0.07142005610893062, "learning_rate": 9.589151515578547e-05, "loss": 0.2737, "step": 3007 }, { "epoch": 1.6860986547085202, "grad_norm": 0.07124654403647934, "learning_rate": 9.582633870719871e-05, "loss": 0.2762, "step": 3008 }, { "epoch": 1.686659192825112, "grad_norm": 0.07381357998712451, "learning_rate": 9.576116403461606e-05, "loss": 0.2647, "step": 3009 }, { "epoch": 1.687219730941704, "grad_norm": 0.06931885142630366, "learning_rate": 9.569599116577116e-05, "loss": 0.2453, "step": 3010 }, { "epoch": 1.687780269058296, "grad_norm": 0.07379506914416824, "learning_rate": 9.563082012839676e-05, "loss": 0.2739, "step": 3011 }, { "epoch": 1.688340807174888, "grad_norm": 0.0745453623059807, "learning_rate": 9.556565095022483e-05, "loss": 0.2698, "step": 3012 }, { "epoch": 1.6889013452914798, "grad_norm": 0.07108561066703549, "learning_rate": 9.550048365898666e-05, "loss": 0.2632, "step": 3013 }, { "epoch": 1.6894618834080717, "grad_norm": 0.07032612375066644, "learning_rate": 9.543531828241262e-05, "loss": 0.2625, "step": 3014 }, { "epoch": 1.6900224215246635, "grad_norm": 0.07015202217184319, "learning_rate": 9.53701548482324e-05, "loss": 0.266, "step": 3015 }, { "epoch": 1.6905829596412556, "grad_norm": 0.07018880177928077, "learning_rate": 9.530499338417465e-05, "loss": 0.2697, "step": 3016 }, { "epoch": 1.6911434977578477, "grad_norm": 0.06954117778871866, "learning_rate": 9.523983391796741e-05, "loss": 0.2709, "step": 3017 }, { "epoch": 1.6917040358744395, "grad_norm": 0.06944908965735618, "learning_rate": 9.517467647733776e-05, "loss": 0.2636, "step": 3018 }, { "epoch": 1.6922645739910314, "grad_norm": 0.07100586273481777, "learning_rate": 9.510952109001188e-05, "loss": 0.2577, "step": 3019 }, { "epoch": 1.6928251121076232, "grad_norm": 0.07116860959799011, "learning_rate": 9.504436778371515e-05, "loss": 0.258, "step": 3020 }, { "epoch": 1.6933856502242153, "grad_norm": 0.07067580662515252, "learning_rate": 9.497921658617202e-05, "loss": 0.2707, "step": 3021 }, { "epoch": 1.6939461883408071, "grad_norm": 0.06929238656794127, "learning_rate": 9.491406752510615e-05, "loss": 0.2569, "step": 3022 }, { "epoch": 1.6945067264573992, "grad_norm": 0.06884714893774431, "learning_rate": 9.484892062824006e-05, "loss": 0.2548, "step": 3023 }, { "epoch": 1.695067264573991, "grad_norm": 0.0724877318697893, "learning_rate": 9.478377592329563e-05, "loss": 0.2722, "step": 3024 }, { "epoch": 1.6956278026905829, "grad_norm": 0.07015228462062964, "learning_rate": 9.471863343799357e-05, "loss": 0.2699, "step": 3025 }, { "epoch": 1.6961883408071747, "grad_norm": 0.07015815284285637, "learning_rate": 9.465349320005376e-05, "loss": 0.2586, "step": 3026 }, { "epoch": 1.6967488789237668, "grad_norm": 0.06988909279333051, "learning_rate": 9.458835523719515e-05, "loss": 0.2649, "step": 3027 }, { "epoch": 1.6973094170403589, "grad_norm": 0.07348641110049924, "learning_rate": 9.452321957713564e-05, "loss": 0.2895, "step": 3028 }, { "epoch": 1.6978699551569507, "grad_norm": 0.07119626809216405, "learning_rate": 9.445808624759222e-05, "loss": 0.26, "step": 3029 }, { "epoch": 1.6984304932735426, "grad_norm": 0.06891101524214079, "learning_rate": 9.439295527628081e-05, "loss": 0.2561, "step": 3030 }, { "epoch": 1.6989910313901344, "grad_norm": 0.0723019273672455, "learning_rate": 9.432782669091645e-05, "loss": 0.2688, "step": 3031 }, { "epoch": 1.6995515695067265, "grad_norm": 0.06983747657931758, "learning_rate": 9.426270051921304e-05, "loss": 0.2571, "step": 3032 }, { "epoch": 1.7001121076233185, "grad_norm": 0.07132127600929675, "learning_rate": 9.419757678888348e-05, "loss": 0.2553, "step": 3033 }, { "epoch": 1.7006726457399104, "grad_norm": 0.06718582215606907, "learning_rate": 9.413245552763972e-05, "loss": 0.252, "step": 3034 }, { "epoch": 1.7012331838565022, "grad_norm": 0.07159437990405979, "learning_rate": 9.406733676319252e-05, "loss": 0.2591, "step": 3035 }, { "epoch": 1.701793721973094, "grad_norm": 0.07098325897458493, "learning_rate": 9.400222052325174e-05, "loss": 0.2703, "step": 3036 }, { "epoch": 1.702354260089686, "grad_norm": 0.06998738777762464, "learning_rate": 9.393710683552596e-05, "loss": 0.2649, "step": 3037 }, { "epoch": 1.702914798206278, "grad_norm": 0.07062642596208016, "learning_rate": 9.387199572772289e-05, "loss": 0.2617, "step": 3038 }, { "epoch": 1.70347533632287, "grad_norm": 0.06869741170969536, "learning_rate": 9.3806887227549e-05, "loss": 0.266, "step": 3039 }, { "epoch": 1.704035874439462, "grad_norm": 0.06995979410621993, "learning_rate": 9.374178136270966e-05, "loss": 0.2624, "step": 3040 }, { "epoch": 1.7045964125560538, "grad_norm": 0.07060379407487441, "learning_rate": 9.367667816090923e-05, "loss": 0.2584, "step": 3041 }, { "epoch": 1.7051569506726456, "grad_norm": 0.07162770183311762, "learning_rate": 9.361157764985077e-05, "loss": 0.2606, "step": 3042 }, { "epoch": 1.7057174887892377, "grad_norm": 0.07077137055125471, "learning_rate": 9.354647985723639e-05, "loss": 0.2609, "step": 3043 }, { "epoch": 1.7062780269058297, "grad_norm": 0.06853811613387817, "learning_rate": 9.348138481076682e-05, "loss": 0.2461, "step": 3044 }, { "epoch": 1.7068385650224216, "grad_norm": 0.07198854797865258, "learning_rate": 9.341629253814185e-05, "loss": 0.2545, "step": 3045 }, { "epoch": 1.7073991031390134, "grad_norm": 0.0702773269196386, "learning_rate": 9.335120306705988e-05, "loss": 0.2502, "step": 3046 }, { "epoch": 1.7079596412556053, "grad_norm": 0.07042076930798195, "learning_rate": 9.328611642521824e-05, "loss": 0.2615, "step": 3047 }, { "epoch": 1.7085201793721974, "grad_norm": 0.06992299100185177, "learning_rate": 9.32210326403131e-05, "loss": 0.2536, "step": 3048 }, { "epoch": 1.7090807174887892, "grad_norm": 0.07290105226536828, "learning_rate": 9.315595174003922e-05, "loss": 0.2902, "step": 3049 }, { "epoch": 1.7096412556053813, "grad_norm": 0.07359894082978928, "learning_rate": 9.309087375209038e-05, "loss": 0.2791, "step": 3050 }, { "epoch": 1.7102017937219731, "grad_norm": 0.07384148555694922, "learning_rate": 9.302579870415891e-05, "loss": 0.2857, "step": 3051 }, { "epoch": 1.710762331838565, "grad_norm": 0.07045667499614068, "learning_rate": 9.296072662393607e-05, "loss": 0.281, "step": 3052 }, { "epoch": 1.7113228699551568, "grad_norm": 0.07106247579603404, "learning_rate": 9.289565753911168e-05, "loss": 0.2687, "step": 3053 }, { "epoch": 1.7118834080717489, "grad_norm": 0.07188165141857443, "learning_rate": 9.283059147737438e-05, "loss": 0.2695, "step": 3054 }, { "epoch": 1.712443946188341, "grad_norm": 0.07101824771296773, "learning_rate": 9.276552846641159e-05, "loss": 0.2737, "step": 3055 }, { "epoch": 1.7130044843049328, "grad_norm": 0.07179545911967246, "learning_rate": 9.270046853390925e-05, "loss": 0.2541, "step": 3056 }, { "epoch": 1.7135650224215246, "grad_norm": 0.070350875098147, "learning_rate": 9.263541170755219e-05, "loss": 0.2696, "step": 3057 }, { "epoch": 1.7141255605381165, "grad_norm": 0.0718526150502596, "learning_rate": 9.257035801502374e-05, "loss": 0.2719, "step": 3058 }, { "epoch": 1.7146860986547086, "grad_norm": 0.07337203808787283, "learning_rate": 9.250530748400603e-05, "loss": 0.274, "step": 3059 }, { "epoch": 1.7152466367713004, "grad_norm": 0.07024342762836264, "learning_rate": 9.244026014217981e-05, "loss": 0.2758, "step": 3060 }, { "epoch": 1.7158071748878925, "grad_norm": 0.0674875710121854, "learning_rate": 9.237521601722441e-05, "loss": 0.2693, "step": 3061 }, { "epoch": 1.7163677130044843, "grad_norm": 0.07053448479785376, "learning_rate": 9.231017513681787e-05, "loss": 0.2702, "step": 3062 }, { "epoch": 1.7169282511210762, "grad_norm": 0.07003506100204805, "learning_rate": 9.224513752863678e-05, "loss": 0.2689, "step": 3063 }, { "epoch": 1.717488789237668, "grad_norm": 0.06930945698949534, "learning_rate": 9.218010322035647e-05, "loss": 0.2737, "step": 3064 }, { "epoch": 1.71804932735426, "grad_norm": 0.07060988877570926, "learning_rate": 9.211507223965068e-05, "loss": 0.2809, "step": 3065 }, { "epoch": 1.7186098654708521, "grad_norm": 0.07016350349691232, "learning_rate": 9.205004461419183e-05, "loss": 0.2654, "step": 3066 }, { "epoch": 1.719170403587444, "grad_norm": 0.07060481235737551, "learning_rate": 9.198502037165099e-05, "loss": 0.2627, "step": 3067 }, { "epoch": 1.7197309417040358, "grad_norm": 0.07180419301130528, "learning_rate": 9.19199995396976e-05, "loss": 0.2655, "step": 3068 }, { "epoch": 1.7202914798206277, "grad_norm": 0.07012083865224054, "learning_rate": 9.185498214599986e-05, "loss": 0.2609, "step": 3069 }, { "epoch": 1.7208520179372198, "grad_norm": 0.07075057742476638, "learning_rate": 9.17899682182243e-05, "loss": 0.2651, "step": 3070 }, { "epoch": 1.7214125560538116, "grad_norm": 0.07130713360383625, "learning_rate": 9.172495778403616e-05, "loss": 0.2594, "step": 3071 }, { "epoch": 1.7219730941704037, "grad_norm": 0.07164578949602156, "learning_rate": 9.165995087109911e-05, "loss": 0.2628, "step": 3072 }, { "epoch": 1.7225336322869955, "grad_norm": 0.07054279941227877, "learning_rate": 9.159494750707526e-05, "loss": 0.2672, "step": 3073 }, { "epoch": 1.7230941704035874, "grad_norm": 0.07438277723282749, "learning_rate": 9.152994771962534e-05, "loss": 0.2721, "step": 3074 }, { "epoch": 1.7236547085201792, "grad_norm": 0.0702279516844392, "learning_rate": 9.146495153640843e-05, "loss": 0.2763, "step": 3075 }, { "epoch": 1.7242152466367713, "grad_norm": 0.07202764128466099, "learning_rate": 9.139995898508223e-05, "loss": 0.2651, "step": 3076 }, { "epoch": 1.7247757847533634, "grad_norm": 0.0702625810768685, "learning_rate": 9.13349700933027e-05, "loss": 0.2529, "step": 3077 }, { "epoch": 1.7253363228699552, "grad_norm": 0.07210497296936554, "learning_rate": 9.126998488872445e-05, "loss": 0.2716, "step": 3078 }, { "epoch": 1.725896860986547, "grad_norm": 0.06899408142272777, "learning_rate": 9.120500339900034e-05, "loss": 0.2722, "step": 3079 }, { "epoch": 1.726457399103139, "grad_norm": 0.06848898381848725, "learning_rate": 9.114002565178172e-05, "loss": 0.2563, "step": 3080 }, { "epoch": 1.727017937219731, "grad_norm": 0.06860275806120515, "learning_rate": 9.107505167471842e-05, "loss": 0.2679, "step": 3081 }, { "epoch": 1.727578475336323, "grad_norm": 0.06798358697582003, "learning_rate": 9.101008149545853e-05, "loss": 0.2631, "step": 3082 }, { "epoch": 1.7281390134529149, "grad_norm": 0.06660575541074609, "learning_rate": 9.094511514164865e-05, "loss": 0.2656, "step": 3083 }, { "epoch": 1.7286995515695067, "grad_norm": 0.07050352657958711, "learning_rate": 9.088015264093365e-05, "loss": 0.2577, "step": 3084 }, { "epoch": 1.7292600896860986, "grad_norm": 0.06775868625679991, "learning_rate": 9.081519402095686e-05, "loss": 0.2547, "step": 3085 }, { "epoch": 1.7298206278026906, "grad_norm": 0.07065844362425264, "learning_rate": 9.075023930935986e-05, "loss": 0.2622, "step": 3086 }, { "epoch": 1.7303811659192825, "grad_norm": 0.06864780010863963, "learning_rate": 9.068528853378259e-05, "loss": 0.2649, "step": 3087 }, { "epoch": 1.7309417040358746, "grad_norm": 0.06857036119308015, "learning_rate": 9.062034172186341e-05, "loss": 0.254, "step": 3088 }, { "epoch": 1.7315022421524664, "grad_norm": 0.07050490876479314, "learning_rate": 9.055539890123884e-05, "loss": 0.2561, "step": 3089 }, { "epoch": 1.7320627802690582, "grad_norm": 0.07090601308162996, "learning_rate": 9.049046009954386e-05, "loss": 0.2698, "step": 3090 }, { "epoch": 1.73262331838565, "grad_norm": 0.06789021117057008, "learning_rate": 9.042552534441158e-05, "loss": 0.2612, "step": 3091 }, { "epoch": 1.7331838565022422, "grad_norm": 0.0699997888142276, "learning_rate": 9.036059466347354e-05, "loss": 0.2642, "step": 3092 }, { "epoch": 1.7337443946188342, "grad_norm": 0.06998588770115205, "learning_rate": 9.029566808435947e-05, "loss": 0.2645, "step": 3093 }, { "epoch": 1.734304932735426, "grad_norm": 0.07462279254263574, "learning_rate": 9.023074563469729e-05, "loss": 0.2744, "step": 3094 }, { "epoch": 1.734865470852018, "grad_norm": 0.0702336911996288, "learning_rate": 9.016582734211329e-05, "loss": 0.2625, "step": 3095 }, { "epoch": 1.7354260089686098, "grad_norm": 0.0688000072136554, "learning_rate": 9.010091323423192e-05, "loss": 0.2587, "step": 3096 }, { "epoch": 1.7359865470852018, "grad_norm": 0.07136428883923977, "learning_rate": 9.00360033386759e-05, "loss": 0.2684, "step": 3097 }, { "epoch": 1.7365470852017937, "grad_norm": 0.06878614529989564, "learning_rate": 8.997109768306607e-05, "loss": 0.2563, "step": 3098 }, { "epoch": 1.7371076233183858, "grad_norm": 0.07023414067608712, "learning_rate": 8.990619629502151e-05, "loss": 0.2684, "step": 3099 }, { "epoch": 1.7376681614349776, "grad_norm": 0.07091769076032442, "learning_rate": 8.984129920215959e-05, "loss": 0.2577, "step": 3100 }, { "epoch": 1.7382286995515694, "grad_norm": 0.07165391074392734, "learning_rate": 8.97764064320956e-05, "loss": 0.2652, "step": 3101 }, { "epoch": 1.7387892376681613, "grad_norm": 0.06756220303399435, "learning_rate": 8.971151801244328e-05, "loss": 0.2483, "step": 3102 }, { "epoch": 1.7393497757847534, "grad_norm": 0.07083035336483604, "learning_rate": 8.964663397081427e-05, "loss": 0.2633, "step": 3103 }, { "epoch": 1.7399103139013454, "grad_norm": 0.06898172477509441, "learning_rate": 8.958175433481855e-05, "loss": 0.2535, "step": 3104 }, { "epoch": 1.7404708520179373, "grad_norm": 0.07441487339332731, "learning_rate": 8.951687913206412e-05, "loss": 0.2773, "step": 3105 }, { "epoch": 1.7410313901345291, "grad_norm": 0.07092710174526971, "learning_rate": 8.945200839015704e-05, "loss": 0.2683, "step": 3106 }, { "epoch": 1.741591928251121, "grad_norm": 0.07098459363992686, "learning_rate": 8.938714213670161e-05, "loss": 0.2712, "step": 3107 }, { "epoch": 1.742152466367713, "grad_norm": 0.07073516711074394, "learning_rate": 8.93222803993001e-05, "loss": 0.2629, "step": 3108 }, { "epoch": 1.7427130044843049, "grad_norm": 0.07019086756585276, "learning_rate": 8.925742320555299e-05, "loss": 0.2646, "step": 3109 }, { "epoch": 1.743273542600897, "grad_norm": 0.07198470245921795, "learning_rate": 8.919257058305865e-05, "loss": 0.2711, "step": 3110 }, { "epoch": 1.7438340807174888, "grad_norm": 0.07072514162027327, "learning_rate": 8.912772255941366e-05, "loss": 0.271, "step": 3111 }, { "epoch": 1.7443946188340806, "grad_norm": 0.07267649599137432, "learning_rate": 8.906287916221259e-05, "loss": 0.2743, "step": 3112 }, { "epoch": 1.7449551569506725, "grad_norm": 0.067678361706435, "learning_rate": 8.899804041904795e-05, "loss": 0.2591, "step": 3113 }, { "epoch": 1.7455156950672646, "grad_norm": 0.0706536193841887, "learning_rate": 8.893320635751046e-05, "loss": 0.2698, "step": 3114 }, { "epoch": 1.7460762331838566, "grad_norm": 0.07012229400985279, "learning_rate": 8.886837700518867e-05, "loss": 0.2673, "step": 3115 }, { "epoch": 1.7466367713004485, "grad_norm": 0.07104279219256676, "learning_rate": 8.880355238966923e-05, "loss": 0.268, "step": 3116 }, { "epoch": 1.7471973094170403, "grad_norm": 0.07219760103078716, "learning_rate": 8.873873253853671e-05, "loss": 0.2645, "step": 3117 }, { "epoch": 1.7477578475336322, "grad_norm": 0.07006623090942019, "learning_rate": 8.867391747937375e-05, "loss": 0.2667, "step": 3118 }, { "epoch": 1.7483183856502242, "grad_norm": 0.07025344403982556, "learning_rate": 8.860910723976082e-05, "loss": 0.2689, "step": 3119 }, { "epoch": 1.7488789237668163, "grad_norm": 0.07179975973608532, "learning_rate": 8.85443018472764e-05, "loss": 0.2741, "step": 3120 }, { "epoch": 1.7494394618834082, "grad_norm": 0.06699987628425405, "learning_rate": 8.8479501329497e-05, "loss": 0.2583, "step": 3121 }, { "epoch": 1.75, "grad_norm": 0.070404933115905, "learning_rate": 8.841470571399685e-05, "loss": 0.2612, "step": 3122 }, { "epoch": 1.7505605381165918, "grad_norm": 0.0709969851086739, "learning_rate": 8.834991502834834e-05, "loss": 0.2675, "step": 3123 }, { "epoch": 1.7511210762331837, "grad_norm": 0.07039249756051098, "learning_rate": 8.82851293001215e-05, "loss": 0.255, "step": 3124 }, { "epoch": 1.7516816143497758, "grad_norm": 0.07152062412816956, "learning_rate": 8.822034855688447e-05, "loss": 0.2696, "step": 3125 }, { "epoch": 1.7522421524663678, "grad_norm": 0.06780549183692385, "learning_rate": 8.815557282620319e-05, "loss": 0.2663, "step": 3126 }, { "epoch": 1.7528026905829597, "grad_norm": 0.0711071595044796, "learning_rate": 8.809080213564138e-05, "loss": 0.2634, "step": 3127 }, { "epoch": 1.7533632286995515, "grad_norm": 0.07121032137984734, "learning_rate": 8.802603651276078e-05, "loss": 0.2636, "step": 3128 }, { "epoch": 1.7539237668161434, "grad_norm": 0.0721488260548215, "learning_rate": 8.796127598512083e-05, "loss": 0.2729, "step": 3129 }, { "epoch": 1.7544843049327354, "grad_norm": 0.07154160384180487, "learning_rate": 8.789652058027893e-05, "loss": 0.2637, "step": 3130 }, { "epoch": 1.7550448430493275, "grad_norm": 0.07031801849017648, "learning_rate": 8.783177032579016e-05, "loss": 0.2466, "step": 3131 }, { "epoch": 1.7556053811659194, "grad_norm": 0.07176124650491238, "learning_rate": 8.77670252492075e-05, "loss": 0.2682, "step": 3132 }, { "epoch": 1.7561659192825112, "grad_norm": 0.06964915955219239, "learning_rate": 8.770228537808176e-05, "loss": 0.256, "step": 3133 }, { "epoch": 1.756726457399103, "grad_norm": 0.0723981284459236, "learning_rate": 8.763755073996138e-05, "loss": 0.2527, "step": 3134 }, { "epoch": 1.7572869955156951, "grad_norm": 0.07128482933734852, "learning_rate": 8.757282136239278e-05, "loss": 0.2718, "step": 3135 }, { "epoch": 1.757847533632287, "grad_norm": 0.0693705470171095, "learning_rate": 8.750809727291995e-05, "loss": 0.2546, "step": 3136 }, { "epoch": 1.758408071748879, "grad_norm": 0.06992921067552417, "learning_rate": 8.744337849908475e-05, "loss": 0.2674, "step": 3137 }, { "epoch": 1.7589686098654709, "grad_norm": 0.06856428122737501, "learning_rate": 8.737866506842678e-05, "loss": 0.2648, "step": 3138 }, { "epoch": 1.7595291479820627, "grad_norm": 0.07136748675182866, "learning_rate": 8.731395700848325e-05, "loss": 0.2671, "step": 3139 }, { "epoch": 1.7600896860986546, "grad_norm": 0.06964346593155742, "learning_rate": 8.724925434678923e-05, "loss": 0.2747, "step": 3140 }, { "epoch": 1.7606502242152466, "grad_norm": 0.07362594572182111, "learning_rate": 8.718455711087738e-05, "loss": 0.2778, "step": 3141 }, { "epoch": 1.7612107623318387, "grad_norm": 0.07093386207693751, "learning_rate": 8.711986532827818e-05, "loss": 0.2651, "step": 3142 }, { "epoch": 1.7617713004484306, "grad_norm": 0.06804162479263667, "learning_rate": 8.70551790265196e-05, "loss": 0.2685, "step": 3143 }, { "epoch": 1.7623318385650224, "grad_norm": 0.06947569097608512, "learning_rate": 8.699049823312748e-05, "loss": 0.2634, "step": 3144 }, { "epoch": 1.7628923766816142, "grad_norm": 0.07080636522352704, "learning_rate": 8.692582297562517e-05, "loss": 0.2524, "step": 3145 }, { "epoch": 1.7634529147982063, "grad_norm": 0.07231716313744631, "learning_rate": 8.68611532815337e-05, "loss": 0.262, "step": 3146 }, { "epoch": 1.7640134529147982, "grad_norm": 0.07357365761579043, "learning_rate": 8.679648917837183e-05, "loss": 0.2728, "step": 3147 }, { "epoch": 1.7645739910313902, "grad_norm": 0.07469380384489484, "learning_rate": 8.673183069365574e-05, "loss": 0.2812, "step": 3148 }, { "epoch": 1.765134529147982, "grad_norm": 0.07139128759640609, "learning_rate": 8.666717785489946e-05, "loss": 0.2785, "step": 3149 }, { "epoch": 1.765695067264574, "grad_norm": 0.07104476454836627, "learning_rate": 8.660253068961439e-05, "loss": 0.2652, "step": 3150 }, { "epoch": 1.7662556053811658, "grad_norm": 0.06873181865782436, "learning_rate": 8.653788922530972e-05, "loss": 0.256, "step": 3151 }, { "epoch": 1.7668161434977578, "grad_norm": 0.07082589489465703, "learning_rate": 8.647325348949206e-05, "loss": 0.2729, "step": 3152 }, { "epoch": 1.76737668161435, "grad_norm": 0.06892885881911734, "learning_rate": 8.640862350966561e-05, "loss": 0.2586, "step": 3153 }, { "epoch": 1.7679372197309418, "grad_norm": 0.07005436255428589, "learning_rate": 8.634399931333226e-05, "loss": 0.2698, "step": 3154 }, { "epoch": 1.7684977578475336, "grad_norm": 0.06872973363314161, "learning_rate": 8.62793809279912e-05, "loss": 0.2658, "step": 3155 }, { "epoch": 1.7690582959641254, "grad_norm": 0.06728216748363262, "learning_rate": 8.621476838113937e-05, "loss": 0.2581, "step": 3156 }, { "epoch": 1.7696188340807175, "grad_norm": 0.07111028155058144, "learning_rate": 8.615016170027105e-05, "loss": 0.2602, "step": 3157 }, { "epoch": 1.7701793721973094, "grad_norm": 0.06848589057565128, "learning_rate": 8.608556091287816e-05, "loss": 0.2607, "step": 3158 }, { "epoch": 1.7707399103139014, "grad_norm": 0.07143064266989885, "learning_rate": 8.602096604645009e-05, "loss": 0.2593, "step": 3159 }, { "epoch": 1.7713004484304933, "grad_norm": 0.06741105563703916, "learning_rate": 8.595637712847358e-05, "loss": 0.2593, "step": 3160 }, { "epoch": 1.7718609865470851, "grad_norm": 0.06956810942187591, "learning_rate": 8.5891794186433e-05, "loss": 0.2605, "step": 3161 }, { "epoch": 1.772421524663677, "grad_norm": 0.07109752640084324, "learning_rate": 8.582721724781009e-05, "loss": 0.2699, "step": 3162 }, { "epoch": 1.772982062780269, "grad_norm": 0.07249070737198052, "learning_rate": 8.576264634008413e-05, "loss": 0.2589, "step": 3163 }, { "epoch": 1.773542600896861, "grad_norm": 0.0704968621730139, "learning_rate": 8.569808149073163e-05, "loss": 0.2623, "step": 3164 }, { "epoch": 1.774103139013453, "grad_norm": 0.06983540091279966, "learning_rate": 8.563352272722678e-05, "loss": 0.2748, "step": 3165 }, { "epoch": 1.7746636771300448, "grad_norm": 0.07084953142000845, "learning_rate": 8.556897007704101e-05, "loss": 0.2666, "step": 3166 }, { "epoch": 1.7752242152466366, "grad_norm": 0.07175664841858653, "learning_rate": 8.550442356764314e-05, "loss": 0.2766, "step": 3167 }, { "epoch": 1.7757847533632287, "grad_norm": 0.07035713859999491, "learning_rate": 8.543988322649954e-05, "loss": 0.2624, "step": 3168 }, { "epoch": 1.7763452914798208, "grad_norm": 0.06880609784502839, "learning_rate": 8.537534908107373e-05, "loss": 0.2585, "step": 3169 }, { "epoch": 1.7769058295964126, "grad_norm": 0.07240736256877163, "learning_rate": 8.531082115882679e-05, "loss": 0.2684, "step": 3170 }, { "epoch": 1.7774663677130045, "grad_norm": 0.07040923173470649, "learning_rate": 8.524629948721701e-05, "loss": 0.2647, "step": 3171 }, { "epoch": 1.7780269058295963, "grad_norm": 0.07112543261971123, "learning_rate": 8.518178409370017e-05, "loss": 0.2592, "step": 3172 }, { "epoch": 1.7785874439461884, "grad_norm": 0.07326574347624072, "learning_rate": 8.511727500572923e-05, "loss": 0.2722, "step": 3173 }, { "epoch": 1.7791479820627802, "grad_norm": 0.06914195983997955, "learning_rate": 8.505277225075449e-05, "loss": 0.2562, "step": 3174 }, { "epoch": 1.7797085201793723, "grad_norm": 0.07049569779725122, "learning_rate": 8.498827585622368e-05, "loss": 0.2817, "step": 3175 }, { "epoch": 1.7802690582959642, "grad_norm": 0.0722256599595862, "learning_rate": 8.492378584958164e-05, "loss": 0.2677, "step": 3176 }, { "epoch": 1.780829596412556, "grad_norm": 0.06888788626499369, "learning_rate": 8.485930225827067e-05, "loss": 0.2668, "step": 3177 }, { "epoch": 1.7813901345291479, "grad_norm": 0.06999229867299883, "learning_rate": 8.47948251097302e-05, "loss": 0.26, "step": 3178 }, { "epoch": 1.78195067264574, "grad_norm": 0.07105075343537055, "learning_rate": 8.473035443139704e-05, "loss": 0.2686, "step": 3179 }, { "epoch": 1.782511210762332, "grad_norm": 0.06985758955852764, "learning_rate": 8.466589025070513e-05, "loss": 0.2545, "step": 3180 }, { "epoch": 1.7830717488789238, "grad_norm": 0.06889826978128506, "learning_rate": 8.460143259508569e-05, "loss": 0.2624, "step": 3181 }, { "epoch": 1.7836322869955157, "grad_norm": 0.06665555794591822, "learning_rate": 8.45369814919672e-05, "loss": 0.2429, "step": 3182 }, { "epoch": 1.7841928251121075, "grad_norm": 0.06866345022967978, "learning_rate": 8.447253696877529e-05, "loss": 0.2576, "step": 3183 }, { "epoch": 1.7847533632286996, "grad_norm": 0.07451630210940821, "learning_rate": 8.440809905293288e-05, "loss": 0.2729, "step": 3184 }, { "epoch": 1.7853139013452914, "grad_norm": 0.07147842306947091, "learning_rate": 8.434366777185999e-05, "loss": 0.2519, "step": 3185 }, { "epoch": 1.7858744394618835, "grad_norm": 0.07091977993095579, "learning_rate": 8.42792431529738e-05, "loss": 0.2714, "step": 3186 }, { "epoch": 1.7864349775784754, "grad_norm": 0.06926473478972674, "learning_rate": 8.42148252236888e-05, "loss": 0.2642, "step": 3187 }, { "epoch": 1.7869955156950672, "grad_norm": 0.07117021031936455, "learning_rate": 8.415041401141642e-05, "loss": 0.255, "step": 3188 }, { "epoch": 1.787556053811659, "grad_norm": 0.07006286138561699, "learning_rate": 8.408600954356548e-05, "loss": 0.2663, "step": 3189 }, { "epoch": 1.7881165919282511, "grad_norm": 0.0794701824361796, "learning_rate": 8.402161184754167e-05, "loss": 0.2498, "step": 3190 }, { "epoch": 1.7886771300448432, "grad_norm": 0.0710591466676107, "learning_rate": 8.395722095074802e-05, "loss": 0.2641, "step": 3191 }, { "epoch": 1.789237668161435, "grad_norm": 0.06852756842054991, "learning_rate": 8.389283688058454e-05, "loss": 0.2615, "step": 3192 }, { "epoch": 1.7897982062780269, "grad_norm": 0.06920898255021925, "learning_rate": 8.382845966444835e-05, "loss": 0.2693, "step": 3193 }, { "epoch": 1.7903587443946187, "grad_norm": 0.06997216796862439, "learning_rate": 8.376408932973368e-05, "loss": 0.2739, "step": 3194 }, { "epoch": 1.7909192825112108, "grad_norm": 0.07271591113089486, "learning_rate": 8.369972590383184e-05, "loss": 0.2687, "step": 3195 }, { "epoch": 1.7914798206278026, "grad_norm": 0.06965632588414579, "learning_rate": 8.363536941413121e-05, "loss": 0.2685, "step": 3196 }, { "epoch": 1.7920403587443947, "grad_norm": 0.06924543112339039, "learning_rate": 8.35710198880171e-05, "loss": 0.2752, "step": 3197 }, { "epoch": 1.7926008968609866, "grad_norm": 0.07127109557181958, "learning_rate": 8.350667735287204e-05, "loss": 0.2739, "step": 3198 }, { "epoch": 1.7931614349775784, "grad_norm": 0.07082010008262021, "learning_rate": 8.344234183607548e-05, "loss": 0.2784, "step": 3199 }, { "epoch": 1.7937219730941703, "grad_norm": 0.06889189876720758, "learning_rate": 8.337801336500383e-05, "loss": 0.2679, "step": 3200 }, { "epoch": 1.7942825112107623, "grad_norm": 0.07111508795108079, "learning_rate": 8.331369196703067e-05, "loss": 0.2666, "step": 3201 }, { "epoch": 1.7948430493273544, "grad_norm": 0.07047617931067245, "learning_rate": 8.324937766952638e-05, "loss": 0.2714, "step": 3202 }, { "epoch": 1.7954035874439462, "grad_norm": 0.06829611267382257, "learning_rate": 8.318507049985844e-05, "loss": 0.256, "step": 3203 }, { "epoch": 1.795964125560538, "grad_norm": 0.0714382509113834, "learning_rate": 8.312077048539126e-05, "loss": 0.2641, "step": 3204 }, { "epoch": 1.79652466367713, "grad_norm": 0.07127447028914165, "learning_rate": 8.305647765348628e-05, "loss": 0.2702, "step": 3205 }, { "epoch": 1.797085201793722, "grad_norm": 0.07082535121720834, "learning_rate": 8.299219203150172e-05, "loss": 0.2533, "step": 3206 }, { "epoch": 1.797645739910314, "grad_norm": 0.06724754996951594, "learning_rate": 8.292791364679284e-05, "loss": 0.2639, "step": 3207 }, { "epoch": 1.798206278026906, "grad_norm": 0.07388332110536336, "learning_rate": 8.286364252671187e-05, "loss": 0.2783, "step": 3208 }, { "epoch": 1.7987668161434978, "grad_norm": 0.07227585955328372, "learning_rate": 8.27993786986078e-05, "loss": 0.2666, "step": 3209 }, { "epoch": 1.7993273542600896, "grad_norm": 0.07033468100949353, "learning_rate": 8.273512218982667e-05, "loss": 0.2706, "step": 3210 }, { "epoch": 1.7998878923766815, "grad_norm": 0.06979323578283012, "learning_rate": 8.267087302771127e-05, "loss": 0.2672, "step": 3211 }, { "epoch": 1.8004484304932735, "grad_norm": 0.06808999277656827, "learning_rate": 8.260663123960143e-05, "loss": 0.2583, "step": 3212 }, { "epoch": 1.8010089686098656, "grad_norm": 0.06981524626852194, "learning_rate": 8.254239685283369e-05, "loss": 0.2621, "step": 3213 }, { "epoch": 1.8015695067264574, "grad_norm": 0.06768463125622527, "learning_rate": 8.247816989474144e-05, "loss": 0.259, "step": 3214 }, { "epoch": 1.8021300448430493, "grad_norm": 0.07103339223893138, "learning_rate": 8.241395039265504e-05, "loss": 0.2707, "step": 3215 }, { "epoch": 1.8026905829596411, "grad_norm": 0.07168521210758859, "learning_rate": 8.234973837390154e-05, "loss": 0.2655, "step": 3216 }, { "epoch": 1.8032511210762332, "grad_norm": 0.07160453714399535, "learning_rate": 8.228553386580496e-05, "loss": 0.2698, "step": 3217 }, { "epoch": 1.8038116591928253, "grad_norm": 0.07050039742585491, "learning_rate": 8.22213368956859e-05, "loss": 0.2751, "step": 3218 }, { "epoch": 1.8043721973094171, "grad_norm": 0.07036432486773841, "learning_rate": 8.215714749086199e-05, "loss": 0.2742, "step": 3219 }, { "epoch": 1.804932735426009, "grad_norm": 0.06606636055771409, "learning_rate": 8.209296567864752e-05, "loss": 0.2424, "step": 3220 }, { "epoch": 1.8054932735426008, "grad_norm": 0.0698273036311671, "learning_rate": 8.202879148635348e-05, "loss": 0.2662, "step": 3221 }, { "epoch": 1.8060538116591929, "grad_norm": 0.06902402511065545, "learning_rate": 8.196462494128778e-05, "loss": 0.2527, "step": 3222 }, { "epoch": 1.8066143497757847, "grad_norm": 0.06966987801069195, "learning_rate": 8.190046607075496e-05, "loss": 0.2513, "step": 3223 }, { "epoch": 1.8071748878923768, "grad_norm": 0.0708178147378975, "learning_rate": 8.183631490205637e-05, "loss": 0.259, "step": 3224 }, { "epoch": 1.8077354260089686, "grad_norm": 0.07219415365250346, "learning_rate": 8.177217146249001e-05, "loss": 0.272, "step": 3225 }, { "epoch": 1.8082959641255605, "grad_norm": 0.07092431502983654, "learning_rate": 8.17080357793506e-05, "loss": 0.2686, "step": 3226 }, { "epoch": 1.8088565022421523, "grad_norm": 0.06940473245167077, "learning_rate": 8.164390787992963e-05, "loss": 0.2592, "step": 3227 }, { "epoch": 1.8094170403587444, "grad_norm": 0.07063170321896624, "learning_rate": 8.157978779151518e-05, "loss": 0.2706, "step": 3228 }, { "epoch": 1.8099775784753365, "grad_norm": 0.0691899810144605, "learning_rate": 8.151567554139213e-05, "loss": 0.257, "step": 3229 }, { "epoch": 1.8105381165919283, "grad_norm": 0.07043625516430235, "learning_rate": 8.145157115684188e-05, "loss": 0.2695, "step": 3230 }, { "epoch": 1.8110986547085202, "grad_norm": 0.07121692930243351, "learning_rate": 8.138747466514258e-05, "loss": 0.2638, "step": 3231 }, { "epoch": 1.811659192825112, "grad_norm": 0.07112343850379138, "learning_rate": 8.132338609356904e-05, "loss": 0.2748, "step": 3232 }, { "epoch": 1.812219730941704, "grad_norm": 0.0708272835416039, "learning_rate": 8.125930546939258e-05, "loss": 0.278, "step": 3233 }, { "epoch": 1.812780269058296, "grad_norm": 0.07002810189428951, "learning_rate": 8.119523281988128e-05, "loss": 0.261, "step": 3234 }, { "epoch": 1.813340807174888, "grad_norm": 0.06857860753845983, "learning_rate": 8.113116817229969e-05, "loss": 0.2699, "step": 3235 }, { "epoch": 1.8139013452914798, "grad_norm": 0.06873621338240818, "learning_rate": 8.106711155390908e-05, "loss": 0.2602, "step": 3236 }, { "epoch": 1.8144618834080717, "grad_norm": 0.06951754756328997, "learning_rate": 8.100306299196722e-05, "loss": 0.275, "step": 3237 }, { "epoch": 1.8150224215246635, "grad_norm": 0.07079477677031519, "learning_rate": 8.093902251372853e-05, "loss": 0.262, "step": 3238 }, { "epoch": 1.8155829596412556, "grad_norm": 0.06994273981841134, "learning_rate": 8.087499014644388e-05, "loss": 0.267, "step": 3239 }, { "epoch": 1.8161434977578477, "grad_norm": 0.07012393285154203, "learning_rate": 8.081096591736076e-05, "loss": 0.268, "step": 3240 }, { "epoch": 1.8167040358744395, "grad_norm": 0.07179251474163152, "learning_rate": 8.074694985372327e-05, "loss": 0.2495, "step": 3241 }, { "epoch": 1.8172645739910314, "grad_norm": 0.06895965921480107, "learning_rate": 8.068294198277181e-05, "loss": 0.2609, "step": 3242 }, { "epoch": 1.8178251121076232, "grad_norm": 0.07374500362428504, "learning_rate": 8.061894233174354e-05, "loss": 0.2707, "step": 3243 }, { "epoch": 1.8183856502242153, "grad_norm": 0.07016354106209098, "learning_rate": 8.055495092787196e-05, "loss": 0.267, "step": 3244 }, { "epoch": 1.8189461883408071, "grad_norm": 0.0702743851240875, "learning_rate": 8.049096779838719e-05, "loss": 0.2507, "step": 3245 }, { "epoch": 1.8195067264573992, "grad_norm": 0.06953644702458327, "learning_rate": 8.04269929705157e-05, "loss": 0.27, "step": 3246 }, { "epoch": 1.820067264573991, "grad_norm": 0.07111965090248204, "learning_rate": 8.036302647148046e-05, "loss": 0.2688, "step": 3247 }, { "epoch": 1.8206278026905829, "grad_norm": 0.07225789021090594, "learning_rate": 8.029906832850098e-05, "loss": 0.2749, "step": 3248 }, { "epoch": 1.8211883408071747, "grad_norm": 0.06865477466078054, "learning_rate": 8.023511856879312e-05, "loss": 0.2762, "step": 3249 }, { "epoch": 1.8217488789237668, "grad_norm": 0.06957466780206226, "learning_rate": 8.017117721956925e-05, "loss": 0.2647, "step": 3250 }, { "epoch": 1.8223094170403589, "grad_norm": 0.07143351345817087, "learning_rate": 8.010724430803806e-05, "loss": 0.2736, "step": 3251 }, { "epoch": 1.8228699551569507, "grad_norm": 0.07217209061752763, "learning_rate": 8.004331986140474e-05, "loss": 0.2569, "step": 3252 }, { "epoch": 1.8234304932735426, "grad_norm": 0.06995558984458415, "learning_rate": 7.997940390687087e-05, "loss": 0.2701, "step": 3253 }, { "epoch": 1.8239910313901344, "grad_norm": 0.06795951316620413, "learning_rate": 7.991549647163432e-05, "loss": 0.2661, "step": 3254 }, { "epoch": 1.8245515695067265, "grad_norm": 0.07162393917965158, "learning_rate": 7.985159758288947e-05, "loss": 0.2733, "step": 3255 }, { "epoch": 1.8251121076233185, "grad_norm": 0.06940977813543703, "learning_rate": 7.978770726782697e-05, "loss": 0.2561, "step": 3256 }, { "epoch": 1.8256726457399104, "grad_norm": 0.07077292043103751, "learning_rate": 7.972382555363391e-05, "loss": 0.2691, "step": 3257 }, { "epoch": 1.8262331838565022, "grad_norm": 0.06910878263615959, "learning_rate": 7.965995246749357e-05, "loss": 0.2598, "step": 3258 }, { "epoch": 1.826793721973094, "grad_norm": 0.06971989571511918, "learning_rate": 7.959608803658575e-05, "loss": 0.2673, "step": 3259 }, { "epoch": 1.827354260089686, "grad_norm": 0.07088971305325505, "learning_rate": 7.953223228808639e-05, "loss": 0.2689, "step": 3260 }, { "epoch": 1.827914798206278, "grad_norm": 0.06957420254574456, "learning_rate": 7.94683852491678e-05, "loss": 0.2587, "step": 3261 }, { "epoch": 1.82847533632287, "grad_norm": 0.06939132116929006, "learning_rate": 7.94045469469987e-05, "loss": 0.26, "step": 3262 }, { "epoch": 1.829035874439462, "grad_norm": 0.06917830409614653, "learning_rate": 7.93407174087439e-05, "loss": 0.2587, "step": 3263 }, { "epoch": 1.8295964125560538, "grad_norm": 0.07049559381730097, "learning_rate": 7.927689666156458e-05, "loss": 0.2564, "step": 3264 }, { "epoch": 1.8301569506726456, "grad_norm": 0.07032746320631154, "learning_rate": 7.92130847326182e-05, "loss": 0.2599, "step": 3265 }, { "epoch": 1.8307174887892377, "grad_norm": 0.06991754386154504, "learning_rate": 7.914928164905844e-05, "loss": 0.2727, "step": 3266 }, { "epoch": 1.8312780269058297, "grad_norm": 0.0690391811553293, "learning_rate": 7.90854874380352e-05, "loss": 0.2484, "step": 3267 }, { "epoch": 1.8318385650224216, "grad_norm": 0.06895022301849879, "learning_rate": 7.902170212669457e-05, "loss": 0.2558, "step": 3268 }, { "epoch": 1.8323991031390134, "grad_norm": 0.07026848639407661, "learning_rate": 7.895792574217897e-05, "loss": 0.263, "step": 3269 }, { "epoch": 1.8329596412556053, "grad_norm": 0.06950520145984855, "learning_rate": 7.88941583116269e-05, "loss": 0.2696, "step": 3270 }, { "epoch": 1.8335201793721974, "grad_norm": 0.07198396734595838, "learning_rate": 7.883039986217319e-05, "loss": 0.2649, "step": 3271 }, { "epoch": 1.8340807174887892, "grad_norm": 0.07260195426208052, "learning_rate": 7.876665042094867e-05, "loss": 0.2709, "step": 3272 }, { "epoch": 1.8346412556053813, "grad_norm": 0.06863939703963023, "learning_rate": 7.870291001508041e-05, "loss": 0.2625, "step": 3273 }, { "epoch": 1.8352017937219731, "grad_norm": 0.06777536080393645, "learning_rate": 7.863917867169174e-05, "loss": 0.2679, "step": 3274 }, { "epoch": 1.835762331838565, "grad_norm": 0.06994768384030978, "learning_rate": 7.857545641790198e-05, "loss": 0.2687, "step": 3275 }, { "epoch": 1.8363228699551568, "grad_norm": 0.06773691764880604, "learning_rate": 7.851174328082669e-05, "loss": 0.2617, "step": 3276 }, { "epoch": 1.8368834080717489, "grad_norm": 0.06936073208434729, "learning_rate": 7.844803928757746e-05, "loss": 0.2611, "step": 3277 }, { "epoch": 1.837443946188341, "grad_norm": 0.06944387446971986, "learning_rate": 7.83843444652621e-05, "loss": 0.2637, "step": 3278 }, { "epoch": 1.8380044843049328, "grad_norm": 0.06992787302602949, "learning_rate": 7.832065884098442e-05, "loss": 0.2655, "step": 3279 }, { "epoch": 1.8385650224215246, "grad_norm": 0.06937655616251745, "learning_rate": 7.825698244184431e-05, "loss": 0.2714, "step": 3280 }, { "epoch": 1.8391255605381165, "grad_norm": 0.07068469553748749, "learning_rate": 7.819331529493785e-05, "loss": 0.2761, "step": 3281 }, { "epoch": 1.8396860986547086, "grad_norm": 0.06934047764865918, "learning_rate": 7.812965742735704e-05, "loss": 0.2568, "step": 3282 }, { "epoch": 1.8402466367713004, "grad_norm": 0.06871391216147392, "learning_rate": 7.806600886619008e-05, "loss": 0.2664, "step": 3283 }, { "epoch": 1.8408071748878925, "grad_norm": 0.07283045631824599, "learning_rate": 7.800236963852106e-05, "loss": 0.2782, "step": 3284 }, { "epoch": 1.8413677130044843, "grad_norm": 0.06928698187482213, "learning_rate": 7.793873977143019e-05, "loss": 0.2601, "step": 3285 }, { "epoch": 1.8419282511210762, "grad_norm": 0.07072451148724478, "learning_rate": 7.78751192919937e-05, "loss": 0.2676, "step": 3286 }, { "epoch": 1.842488789237668, "grad_norm": 0.07076600383735644, "learning_rate": 7.781150822728373e-05, "loss": 0.2799, "step": 3287 }, { "epoch": 1.84304932735426, "grad_norm": 0.07131993457810994, "learning_rate": 7.774790660436858e-05, "loss": 0.2623, "step": 3288 }, { "epoch": 1.8436098654708521, "grad_norm": 0.06881675330410725, "learning_rate": 7.768431445031233e-05, "loss": 0.2702, "step": 3289 }, { "epoch": 1.844170403587444, "grad_norm": 0.07152766006078261, "learning_rate": 7.762073179217526e-05, "loss": 0.2803, "step": 3290 }, { "epoch": 1.8447309417040358, "grad_norm": 0.0700623448658665, "learning_rate": 7.755715865701337e-05, "loss": 0.264, "step": 3291 }, { "epoch": 1.8452914798206277, "grad_norm": 0.07361118318553973, "learning_rate": 7.749359507187882e-05, "loss": 0.2755, "step": 3292 }, { "epoch": 1.8458520179372198, "grad_norm": 0.07158716462650891, "learning_rate": 7.743004106381952e-05, "loss": 0.2709, "step": 3293 }, { "epoch": 1.8464125560538116, "grad_norm": 0.069650654147604, "learning_rate": 7.736649665987943e-05, "loss": 0.2622, "step": 3294 }, { "epoch": 1.8469730941704037, "grad_norm": 0.06838339865969636, "learning_rate": 7.730296188709844e-05, "loss": 0.264, "step": 3295 }, { "epoch": 1.8475336322869955, "grad_norm": 0.06915450823913075, "learning_rate": 7.723943677251222e-05, "loss": 0.2539, "step": 3296 }, { "epoch": 1.8480941704035874, "grad_norm": 0.07051579087788801, "learning_rate": 7.717592134315243e-05, "loss": 0.2598, "step": 3297 }, { "epoch": 1.8486547085201792, "grad_norm": 0.07275806676175088, "learning_rate": 7.711241562604655e-05, "loss": 0.2714, "step": 3298 }, { "epoch": 1.8492152466367713, "grad_norm": 0.07072076011263187, "learning_rate": 7.704891964821802e-05, "loss": 0.2626, "step": 3299 }, { "epoch": 1.8497757847533634, "grad_norm": 0.07077532083248372, "learning_rate": 7.698543343668602e-05, "loss": 0.2604, "step": 3300 }, { "epoch": 1.8503363228699552, "grad_norm": 0.06818733312627143, "learning_rate": 7.69219570184656e-05, "loss": 0.245, "step": 3301 }, { "epoch": 1.850896860986547, "grad_norm": 0.07023083270256576, "learning_rate": 7.685849042056776e-05, "loss": 0.2601, "step": 3302 }, { "epoch": 1.851457399103139, "grad_norm": 0.07171723482332348, "learning_rate": 7.679503366999913e-05, "loss": 0.2659, "step": 3303 }, { "epoch": 1.852017937219731, "grad_norm": 0.07000108111565913, "learning_rate": 7.673158679376234e-05, "loss": 0.2566, "step": 3304 }, { "epoch": 1.852578475336323, "grad_norm": 0.0703263400695774, "learning_rate": 7.666814981885562e-05, "loss": 0.2495, "step": 3305 }, { "epoch": 1.8531390134529149, "grad_norm": 0.0722579247585532, "learning_rate": 7.66047227722732e-05, "loss": 0.2694, "step": 3306 }, { "epoch": 1.8536995515695067, "grad_norm": 0.07136350358402029, "learning_rate": 7.654130568100492e-05, "loss": 0.2578, "step": 3307 }, { "epoch": 1.8542600896860986, "grad_norm": 0.06908740131278682, "learning_rate": 7.647789857203645e-05, "loss": 0.2481, "step": 3308 }, { "epoch": 1.8548206278026906, "grad_norm": 0.07265326145159552, "learning_rate": 7.64145014723492e-05, "loss": 0.2628, "step": 3309 }, { "epoch": 1.8553811659192825, "grad_norm": 0.07148769815898748, "learning_rate": 7.635111440892034e-05, "loss": 0.2749, "step": 3310 }, { "epoch": 1.8559417040358746, "grad_norm": 0.07087077594439263, "learning_rate": 7.62877374087228e-05, "loss": 0.2622, "step": 3311 }, { "epoch": 1.8565022421524664, "grad_norm": 0.07034259844404242, "learning_rate": 7.622437049872512e-05, "loss": 0.2683, "step": 3312 }, { "epoch": 1.8570627802690582, "grad_norm": 0.07194898546729013, "learning_rate": 7.616101370589158e-05, "loss": 0.2643, "step": 3313 }, { "epoch": 1.85762331838565, "grad_norm": 0.07097837155946217, "learning_rate": 7.609766705718225e-05, "loss": 0.26, "step": 3314 }, { "epoch": 1.8581838565022422, "grad_norm": 0.07060544817745804, "learning_rate": 7.60343305795528e-05, "loss": 0.2648, "step": 3315 }, { "epoch": 1.8587443946188342, "grad_norm": 0.06912082320374903, "learning_rate": 7.59710042999546e-05, "loss": 0.2684, "step": 3316 }, { "epoch": 1.859304932735426, "grad_norm": 0.07062735182228592, "learning_rate": 7.590768824533463e-05, "loss": 0.2708, "step": 3317 }, { "epoch": 1.859865470852018, "grad_norm": 0.07106775905431914, "learning_rate": 7.584438244263561e-05, "loss": 0.2665, "step": 3318 }, { "epoch": 1.8604260089686098, "grad_norm": 0.07179109023822038, "learning_rate": 7.578108691879584e-05, "loss": 0.2766, "step": 3319 }, { "epoch": 1.8609865470852018, "grad_norm": 0.07177401584888915, "learning_rate": 7.57178017007492e-05, "loss": 0.2614, "step": 3320 }, { "epoch": 1.8615470852017937, "grad_norm": 0.07334474144983744, "learning_rate": 7.565452681542529e-05, "loss": 0.2722, "step": 3321 }, { "epoch": 1.8621076233183858, "grad_norm": 0.07217197580352912, "learning_rate": 7.559126228974921e-05, "loss": 0.2593, "step": 3322 }, { "epoch": 1.8626681614349776, "grad_norm": 0.06946442838759735, "learning_rate": 7.55280081506418e-05, "loss": 0.2667, "step": 3323 }, { "epoch": 1.8632286995515694, "grad_norm": 0.07131625752713222, "learning_rate": 7.546476442501926e-05, "loss": 0.2655, "step": 3324 }, { "epoch": 1.8637892376681613, "grad_norm": 0.06996968895541275, "learning_rate": 7.54015311397936e-05, "loss": 0.2723, "step": 3325 }, { "epoch": 1.8643497757847534, "grad_norm": 0.07084335490981197, "learning_rate": 7.533830832187216e-05, "loss": 0.2677, "step": 3326 }, { "epoch": 1.8649103139013454, "grad_norm": 0.07049422582762938, "learning_rate": 7.527509599815799e-05, "loss": 0.2654, "step": 3327 }, { "epoch": 1.8654708520179373, "grad_norm": 0.07020752735237043, "learning_rate": 7.521189419554963e-05, "loss": 0.2606, "step": 3328 }, { "epoch": 1.8660313901345291, "grad_norm": 0.06884771378378535, "learning_rate": 7.51487029409411e-05, "loss": 0.2564, "step": 3329 }, { "epoch": 1.866591928251121, "grad_norm": 0.06850733209924952, "learning_rate": 7.508552226122197e-05, "loss": 0.267, "step": 3330 }, { "epoch": 1.867152466367713, "grad_norm": 0.06958197189734695, "learning_rate": 7.502235218327731e-05, "loss": 0.2654, "step": 3331 }, { "epoch": 1.8677130044843049, "grad_norm": 0.06861922633796758, "learning_rate": 7.49591927339877e-05, "loss": 0.2704, "step": 3332 }, { "epoch": 1.868273542600897, "grad_norm": 0.06588015335176929, "learning_rate": 7.489604394022914e-05, "loss": 0.2449, "step": 3333 }, { "epoch": 1.8688340807174888, "grad_norm": 0.07808355023131226, "learning_rate": 7.483290582887308e-05, "loss": 0.2591, "step": 3334 }, { "epoch": 1.8693946188340806, "grad_norm": 0.06726693978009231, "learning_rate": 7.476977842678659e-05, "loss": 0.2628, "step": 3335 }, { "epoch": 1.8699551569506725, "grad_norm": 0.07212035137203555, "learning_rate": 7.470666176083192e-05, "loss": 0.2805, "step": 3336 }, { "epoch": 1.8705156950672646, "grad_norm": 0.06904850809163075, "learning_rate": 7.464355585786702e-05, "loss": 0.2665, "step": 3337 }, { "epoch": 1.8710762331838566, "grad_norm": 0.07195609236414281, "learning_rate": 7.458046074474504e-05, "loss": 0.2543, "step": 3338 }, { "epoch": 1.8716367713004485, "grad_norm": 0.06899222034013164, "learning_rate": 7.451737644831469e-05, "loss": 0.2781, "step": 3339 }, { "epoch": 1.8721973094170403, "grad_norm": 0.06897223641407911, "learning_rate": 7.445430299542002e-05, "loss": 0.2535, "step": 3340 }, { "epoch": 1.8727578475336322, "grad_norm": 0.07179626227747822, "learning_rate": 7.43912404129004e-05, "loss": 0.268, "step": 3341 }, { "epoch": 1.8733183856502242, "grad_norm": 0.0668777419822922, "learning_rate": 7.432818872759071e-05, "loss": 0.2578, "step": 3342 }, { "epoch": 1.8738789237668163, "grad_norm": 0.06907163869149917, "learning_rate": 7.426514796632108e-05, "loss": 0.2726, "step": 3343 }, { "epoch": 1.8744394618834082, "grad_norm": 0.0706533991015148, "learning_rate": 7.420211815591709e-05, "loss": 0.2559, "step": 3344 }, { "epoch": 1.875, "grad_norm": 0.07173026287207163, "learning_rate": 7.413909932319952e-05, "loss": 0.2709, "step": 3345 }, { "epoch": 1.8755605381165918, "grad_norm": 0.07111268467729995, "learning_rate": 7.407609149498467e-05, "loss": 0.2538, "step": 3346 }, { "epoch": 1.8761210762331837, "grad_norm": 0.0684648010044524, "learning_rate": 7.401309469808395e-05, "loss": 0.2636, "step": 3347 }, { "epoch": 1.8766816143497758, "grad_norm": 0.06904085368570378, "learning_rate": 7.395010895930421e-05, "loss": 0.2732, "step": 3348 }, { "epoch": 1.8772421524663678, "grad_norm": 0.06744956753540765, "learning_rate": 7.388713430544763e-05, "loss": 0.2682, "step": 3349 }, { "epoch": 1.8778026905829597, "grad_norm": 0.0685403697581941, "learning_rate": 7.382417076331147e-05, "loss": 0.258, "step": 3350 }, { "epoch": 1.8783632286995515, "grad_norm": 0.07099772248656062, "learning_rate": 7.376121835968851e-05, "loss": 0.2672, "step": 3351 }, { "epoch": 1.8789237668161434, "grad_norm": 0.07080807687251338, "learning_rate": 7.369827712136661e-05, "loss": 0.2637, "step": 3352 }, { "epoch": 1.8794843049327354, "grad_norm": 0.06846463607950867, "learning_rate": 7.363534707512901e-05, "loss": 0.2525, "step": 3353 }, { "epoch": 1.8800448430493275, "grad_norm": 0.06777615635017255, "learning_rate": 7.357242824775406e-05, "loss": 0.2568, "step": 3354 }, { "epoch": 1.8806053811659194, "grad_norm": 0.07173455763218528, "learning_rate": 7.35095206660154e-05, "loss": 0.2659, "step": 3355 }, { "epoch": 1.8811659192825112, "grad_norm": 0.06946326908201468, "learning_rate": 7.344662435668196e-05, "loss": 0.2714, "step": 3356 }, { "epoch": 1.881726457399103, "grad_norm": 0.07026772989639453, "learning_rate": 7.338373934651768e-05, "loss": 0.266, "step": 3357 }, { "epoch": 1.8822869955156951, "grad_norm": 0.07253312539345642, "learning_rate": 7.332086566228194e-05, "loss": 0.2583, "step": 3358 }, { "epoch": 1.882847533632287, "grad_norm": 0.072342685380318, "learning_rate": 7.325800333072904e-05, "loss": 0.2689, "step": 3359 }, { "epoch": 1.883408071748879, "grad_norm": 0.07121501443116349, "learning_rate": 7.319515237860864e-05, "loss": 0.2664, "step": 3360 }, { "epoch": 1.8839686098654709, "grad_norm": 0.0706672213069749, "learning_rate": 7.313231283266551e-05, "loss": 0.262, "step": 3361 }, { "epoch": 1.8845291479820627, "grad_norm": 0.0708292342381674, "learning_rate": 7.306948471963951e-05, "loss": 0.2592, "step": 3362 }, { "epoch": 1.8850896860986546, "grad_norm": 0.07074562930059831, "learning_rate": 7.300666806626572e-05, "loss": 0.2658, "step": 3363 }, { "epoch": 1.8856502242152466, "grad_norm": 0.0690218491119218, "learning_rate": 7.294386289927425e-05, "loss": 0.2495, "step": 3364 }, { "epoch": 1.8862107623318387, "grad_norm": 0.07256808673582511, "learning_rate": 7.288106924539045e-05, "loss": 0.2754, "step": 3365 }, { "epoch": 1.8867713004484306, "grad_norm": 0.06936835280441091, "learning_rate": 7.281828713133463e-05, "loss": 0.2624, "step": 3366 }, { "epoch": 1.8873318385650224, "grad_norm": 0.07098030153719274, "learning_rate": 7.275551658382224e-05, "loss": 0.2786, "step": 3367 }, { "epoch": 1.8878923766816142, "grad_norm": 0.06914782827830317, "learning_rate": 7.269275762956392e-05, "loss": 0.2598, "step": 3368 }, { "epoch": 1.8884529147982063, "grad_norm": 0.07038723810265705, "learning_rate": 7.263001029526514e-05, "loss": 0.269, "step": 3369 }, { "epoch": 1.8890134529147982, "grad_norm": 0.07129477470082596, "learning_rate": 7.256727460762669e-05, "loss": 0.2781, "step": 3370 }, { "epoch": 1.8895739910313902, "grad_norm": 0.0679187090080146, "learning_rate": 7.250455059334417e-05, "loss": 0.254, "step": 3371 }, { "epoch": 1.890134529147982, "grad_norm": 0.0701264604194011, "learning_rate": 7.24418382791084e-05, "loss": 0.268, "step": 3372 }, { "epoch": 1.890695067264574, "grad_norm": 0.06989599911204056, "learning_rate": 7.237913769160514e-05, "loss": 0.2768, "step": 3373 }, { "epoch": 1.8912556053811658, "grad_norm": 0.07112791500392587, "learning_rate": 7.231644885751507e-05, "loss": 0.2685, "step": 3374 }, { "epoch": 1.8918161434977578, "grad_norm": 0.07253441231150679, "learning_rate": 7.225377180351406e-05, "loss": 0.2726, "step": 3375 }, { "epoch": 1.89237668161435, "grad_norm": 0.07037726472225406, "learning_rate": 7.219110655627281e-05, "loss": 0.258, "step": 3376 }, { "epoch": 1.8929372197309418, "grad_norm": 0.07196987542914432, "learning_rate": 7.212845314245712e-05, "loss": 0.2604, "step": 3377 }, { "epoch": 1.8934977578475336, "grad_norm": 0.07431132206256182, "learning_rate": 7.20658115887276e-05, "loss": 0.2638, "step": 3378 }, { "epoch": 1.8940582959641254, "grad_norm": 0.06932179226439326, "learning_rate": 7.200318192173998e-05, "loss": 0.2572, "step": 3379 }, { "epoch": 1.8946188340807175, "grad_norm": 0.06984265728570498, "learning_rate": 7.194056416814481e-05, "loss": 0.2607, "step": 3380 }, { "epoch": 1.8951793721973094, "grad_norm": 0.06794315477215011, "learning_rate": 7.187795835458759e-05, "loss": 0.2553, "step": 3381 }, { "epoch": 1.8957399103139014, "grad_norm": 0.07257280209516982, "learning_rate": 7.181536450770882e-05, "loss": 0.2658, "step": 3382 }, { "epoch": 1.8963004484304933, "grad_norm": 0.06998263085993002, "learning_rate": 7.17527826541438e-05, "loss": 0.2737, "step": 3383 }, { "epoch": 1.8968609865470851, "grad_norm": 0.06850490431660242, "learning_rate": 7.169021282052283e-05, "loss": 0.2538, "step": 3384 }, { "epoch": 1.897421524663677, "grad_norm": 0.07011061332069132, "learning_rate": 7.162765503347097e-05, "loss": 0.2574, "step": 3385 }, { "epoch": 1.897982062780269, "grad_norm": 0.06744822387404592, "learning_rate": 7.156510931960833e-05, "loss": 0.2502, "step": 3386 }, { "epoch": 1.898542600896861, "grad_norm": 0.0682590772522164, "learning_rate": 7.15025757055497e-05, "loss": 0.267, "step": 3387 }, { "epoch": 1.899103139013453, "grad_norm": 0.0718727765124095, "learning_rate": 7.144005421790479e-05, "loss": 0.2671, "step": 3388 }, { "epoch": 1.8996636771300448, "grad_norm": 0.06914143322440189, "learning_rate": 7.137754488327822e-05, "loss": 0.2546, "step": 3389 }, { "epoch": 1.9002242152466366, "grad_norm": 0.07013261679104858, "learning_rate": 7.131504772826931e-05, "loss": 0.2576, "step": 3390 }, { "epoch": 1.9007847533632287, "grad_norm": 0.07149179608649352, "learning_rate": 7.125256277947234e-05, "loss": 0.2658, "step": 3391 }, { "epoch": 1.9013452914798208, "grad_norm": 0.07162361854306432, "learning_rate": 7.119009006347625e-05, "loss": 0.2733, "step": 3392 }, { "epoch": 1.9019058295964126, "grad_norm": 0.0705232636082105, "learning_rate": 7.112762960686489e-05, "loss": 0.2611, "step": 3393 }, { "epoch": 1.9024663677130045, "grad_norm": 0.07068874339562098, "learning_rate": 7.106518143621687e-05, "loss": 0.2559, "step": 3394 }, { "epoch": 1.9030269058295963, "grad_norm": 0.07028228605770458, "learning_rate": 7.100274557810546e-05, "loss": 0.2596, "step": 3395 }, { "epoch": 1.9035874439461884, "grad_norm": 0.07031247943732717, "learning_rate": 7.094032205909888e-05, "loss": 0.261, "step": 3396 }, { "epoch": 1.9041479820627802, "grad_norm": 0.07276064957321741, "learning_rate": 7.087791090575995e-05, "loss": 0.2605, "step": 3397 }, { "epoch": 1.9047085201793723, "grad_norm": 0.06930737962177644, "learning_rate": 7.081551214464632e-05, "loss": 0.2637, "step": 3398 }, { "epoch": 1.9052690582959642, "grad_norm": 0.07065158486563466, "learning_rate": 7.075312580231027e-05, "loss": 0.2577, "step": 3399 }, { "epoch": 1.905829596412556, "grad_norm": 0.07105371941610006, "learning_rate": 7.069075190529888e-05, "loss": 0.2595, "step": 3400 }, { "epoch": 1.9063901345291479, "grad_norm": 0.07097615339552256, "learning_rate": 7.062839048015392e-05, "loss": 0.2739, "step": 3401 }, { "epoch": 1.90695067264574, "grad_norm": 0.07166368974905003, "learning_rate": 7.056604155341182e-05, "loss": 0.2549, "step": 3402 }, { "epoch": 1.907511210762332, "grad_norm": 0.07083603818333438, "learning_rate": 7.050370515160373e-05, "loss": 0.2734, "step": 3403 }, { "epoch": 1.9080717488789238, "grad_norm": 0.06761603470174596, "learning_rate": 7.044138130125542e-05, "loss": 0.2634, "step": 3404 }, { "epoch": 1.9086322869955157, "grad_norm": 0.06724505576057976, "learning_rate": 7.037907002888738e-05, "loss": 0.2612, "step": 3405 }, { "epoch": 1.9091928251121075, "grad_norm": 0.06824851044340624, "learning_rate": 7.031677136101472e-05, "loss": 0.2649, "step": 3406 }, { "epoch": 1.9097533632286996, "grad_norm": 0.06851903037823158, "learning_rate": 7.025448532414712e-05, "loss": 0.2628, "step": 3407 }, { "epoch": 1.9103139013452914, "grad_norm": 0.06840303525882, "learning_rate": 7.019221194478904e-05, "loss": 0.2495, "step": 3408 }, { "epoch": 1.9108744394618835, "grad_norm": 0.06964956062777737, "learning_rate": 7.012995124943937e-05, "loss": 0.2671, "step": 3409 }, { "epoch": 1.9114349775784754, "grad_norm": 0.06620059709148725, "learning_rate": 7.006770326459182e-05, "loss": 0.2596, "step": 3410 }, { "epoch": 1.9119955156950672, "grad_norm": 0.06864741139145739, "learning_rate": 7.000546801673444e-05, "loss": 0.2497, "step": 3411 }, { "epoch": 1.912556053811659, "grad_norm": 0.06845870815715167, "learning_rate": 6.994324553235006e-05, "loss": 0.2628, "step": 3412 }, { "epoch": 1.9131165919282511, "grad_norm": 0.06991522454333711, "learning_rate": 6.9881035837916e-05, "loss": 0.2671, "step": 3413 }, { "epoch": 1.9136771300448432, "grad_norm": 0.06790449947644525, "learning_rate": 6.981883895990409e-05, "loss": 0.2568, "step": 3414 }, { "epoch": 1.914237668161435, "grad_norm": 0.06895546513207257, "learning_rate": 6.975665492478084e-05, "loss": 0.2735, "step": 3415 }, { "epoch": 1.9147982062780269, "grad_norm": 0.06997529341199701, "learning_rate": 6.969448375900715e-05, "loss": 0.2641, "step": 3416 }, { "epoch": 1.9153587443946187, "grad_norm": 0.07031479138242287, "learning_rate": 6.963232548903853e-05, "loss": 0.2773, "step": 3417 }, { "epoch": 1.9159192825112108, "grad_norm": 0.06988837242062663, "learning_rate": 6.957018014132498e-05, "loss": 0.2659, "step": 3418 }, { "epoch": 1.9164798206278026, "grad_norm": 0.07048631773028599, "learning_rate": 6.950804774231104e-05, "loss": 0.2858, "step": 3419 }, { "epoch": 1.9170403587443947, "grad_norm": 0.07065037724997916, "learning_rate": 6.944592831843566e-05, "loss": 0.2673, "step": 3420 }, { "epoch": 1.9176008968609866, "grad_norm": 0.07091648817424527, "learning_rate": 6.938382189613228e-05, "loss": 0.2614, "step": 3421 }, { "epoch": 1.9181614349775784, "grad_norm": 0.0701086193186055, "learning_rate": 6.932172850182893e-05, "loss": 0.2559, "step": 3422 }, { "epoch": 1.9187219730941703, "grad_norm": 0.07150135750957044, "learning_rate": 6.925964816194791e-05, "loss": 0.2741, "step": 3423 }, { "epoch": 1.9192825112107623, "grad_norm": 0.07037303016788464, "learning_rate": 6.919758090290614e-05, "loss": 0.2661, "step": 3424 }, { "epoch": 1.9198430493273544, "grad_norm": 0.06989314969355041, "learning_rate": 6.913552675111481e-05, "loss": 0.2559, "step": 3425 }, { "epoch": 1.9204035874439462, "grad_norm": 0.07333797503236086, "learning_rate": 6.90734857329797e-05, "loss": 0.2666, "step": 3426 }, { "epoch": 1.920964125560538, "grad_norm": 0.07056701656580355, "learning_rate": 6.901145787490087e-05, "loss": 0.264, "step": 3427 }, { "epoch": 1.92152466367713, "grad_norm": 0.07091650241937678, "learning_rate": 6.894944320327281e-05, "loss": 0.2711, "step": 3428 }, { "epoch": 1.922085201793722, "grad_norm": 0.069847449242454, "learning_rate": 6.888744174448446e-05, "loss": 0.2573, "step": 3429 }, { "epoch": 1.922645739910314, "grad_norm": 0.07089561215141386, "learning_rate": 6.882545352491904e-05, "loss": 0.2525, "step": 3430 }, { "epoch": 1.923206278026906, "grad_norm": 0.07095533170948423, "learning_rate": 6.87634785709543e-05, "loss": 0.264, "step": 3431 }, { "epoch": 1.9237668161434978, "grad_norm": 0.0710438893524046, "learning_rate": 6.870151690896209e-05, "loss": 0.2558, "step": 3432 }, { "epoch": 1.9243273542600896, "grad_norm": 0.06895392739783383, "learning_rate": 6.863956856530885e-05, "loss": 0.2574, "step": 3433 }, { "epoch": 1.9248878923766815, "grad_norm": 0.06666994015063393, "learning_rate": 6.857763356635525e-05, "loss": 0.2494, "step": 3434 }, { "epoch": 1.9254484304932735, "grad_norm": 0.06995428661714155, "learning_rate": 6.851571193845619e-05, "loss": 0.2792, "step": 3435 }, { "epoch": 1.9260089686098656, "grad_norm": 0.07121034669068539, "learning_rate": 6.845380370796111e-05, "loss": 0.2717, "step": 3436 }, { "epoch": 1.9265695067264574, "grad_norm": 0.06806246914144769, "learning_rate": 6.839190890121348e-05, "loss": 0.2529, "step": 3437 }, { "epoch": 1.9271300448430493, "grad_norm": 0.06986789929975527, "learning_rate": 6.833002754455125e-05, "loss": 0.2584, "step": 3438 }, { "epoch": 1.9276905829596411, "grad_norm": 0.07238871603645033, "learning_rate": 6.826815966430664e-05, "loss": 0.2556, "step": 3439 }, { "epoch": 1.9282511210762332, "grad_norm": 0.07283599960176848, "learning_rate": 6.820630528680597e-05, "loss": 0.2631, "step": 3440 }, { "epoch": 1.9288116591928253, "grad_norm": 0.07207116428072398, "learning_rate": 6.814446443837001e-05, "loss": 0.2581, "step": 3441 }, { "epoch": 1.9293721973094171, "grad_norm": 0.0719993417784609, "learning_rate": 6.808263714531364e-05, "loss": 0.2664, "step": 3442 }, { "epoch": 1.929932735426009, "grad_norm": 0.06970555438156936, "learning_rate": 6.802082343394611e-05, "loss": 0.2598, "step": 3443 }, { "epoch": 1.9304932735426008, "grad_norm": 0.07226176670628195, "learning_rate": 6.795902333057067e-05, "loss": 0.2635, "step": 3444 }, { "epoch": 1.9310538116591929, "grad_norm": 0.06961802825931704, "learning_rate": 6.789723686148502e-05, "loss": 0.2614, "step": 3445 }, { "epoch": 1.9316143497757847, "grad_norm": 0.06982975699214118, "learning_rate": 6.783546405298094e-05, "loss": 0.2531, "step": 3446 }, { "epoch": 1.9321748878923768, "grad_norm": 0.07114606328300568, "learning_rate": 6.777370493134431e-05, "loss": 0.256, "step": 3447 }, { "epoch": 1.9327354260089686, "grad_norm": 0.07222637872927773, "learning_rate": 6.77119595228554e-05, "loss": 0.2545, "step": 3448 }, { "epoch": 1.9332959641255605, "grad_norm": 0.0691511910091209, "learning_rate": 6.765022785378845e-05, "loss": 0.2637, "step": 3449 }, { "epoch": 1.9338565022421523, "grad_norm": 0.07122319030048847, "learning_rate": 6.758850995041197e-05, "loss": 0.2635, "step": 3450 }, { "epoch": 1.9344170403587444, "grad_norm": 0.06893474155708976, "learning_rate": 6.752680583898853e-05, "loss": 0.2565, "step": 3451 }, { "epoch": 1.9349775784753365, "grad_norm": 0.07132881195848476, "learning_rate": 6.746511554577497e-05, "loss": 0.2702, "step": 3452 }, { "epoch": 1.9355381165919283, "grad_norm": 0.07196555459747075, "learning_rate": 6.740343909702205e-05, "loss": 0.278, "step": 3453 }, { "epoch": 1.9360986547085202, "grad_norm": 0.07010298135227981, "learning_rate": 6.734177651897475e-05, "loss": 0.2616, "step": 3454 }, { "epoch": 1.936659192825112, "grad_norm": 0.06726790355415212, "learning_rate": 6.728012783787224e-05, "loss": 0.2599, "step": 3455 }, { "epoch": 1.937219730941704, "grad_norm": 0.06985074552731459, "learning_rate": 6.721849307994756e-05, "loss": 0.2661, "step": 3456 }, { "epoch": 1.937780269058296, "grad_norm": 0.0683660746443686, "learning_rate": 6.715687227142804e-05, "loss": 0.2457, "step": 3457 }, { "epoch": 1.938340807174888, "grad_norm": 0.06873087698366843, "learning_rate": 6.709526543853489e-05, "loss": 0.2642, "step": 3458 }, { "epoch": 1.9389013452914798, "grad_norm": 0.06885891373281881, "learning_rate": 6.703367260748352e-05, "loss": 0.2658, "step": 3459 }, { "epoch": 1.9394618834080717, "grad_norm": 0.0711839966087522, "learning_rate": 6.697209380448333e-05, "loss": 0.2787, "step": 3460 }, { "epoch": 1.9400224215246635, "grad_norm": 0.07203560142958441, "learning_rate": 6.691052905573766e-05, "loss": 0.2779, "step": 3461 }, { "epoch": 1.9405829596412556, "grad_norm": 0.07064585839757186, "learning_rate": 6.684897838744403e-05, "loss": 0.2639, "step": 3462 }, { "epoch": 1.9411434977578477, "grad_norm": 0.06814353501211662, "learning_rate": 6.678744182579384e-05, "loss": 0.2565, "step": 3463 }, { "epoch": 1.9417040358744395, "grad_norm": 0.07081774051467991, "learning_rate": 6.672591939697261e-05, "loss": 0.273, "step": 3464 }, { "epoch": 1.9422645739910314, "grad_norm": 0.06898429138012982, "learning_rate": 6.66644111271597e-05, "loss": 0.2561, "step": 3465 }, { "epoch": 1.9428251121076232, "grad_norm": 0.06893255463540728, "learning_rate": 6.660291704252855e-05, "loss": 0.2623, "step": 3466 }, { "epoch": 1.9433856502242153, "grad_norm": 0.06918468020767439, "learning_rate": 6.654143716924656e-05, "loss": 0.2546, "step": 3467 }, { "epoch": 1.9439461883408071, "grad_norm": 0.0694607758966827, "learning_rate": 6.647997153347498e-05, "loss": 0.2608, "step": 3468 }, { "epoch": 1.9445067264573992, "grad_norm": 0.06975092141192037, "learning_rate": 6.641852016136916e-05, "loss": 0.2609, "step": 3469 }, { "epoch": 1.945067264573991, "grad_norm": 0.06913760605936561, "learning_rate": 6.635708307907822e-05, "loss": 0.2623, "step": 3470 }, { "epoch": 1.9456278026905829, "grad_norm": 0.0875483678068613, "learning_rate": 6.629566031274532e-05, "loss": 0.2584, "step": 3471 }, { "epoch": 1.9461883408071747, "grad_norm": 0.07058202455417682, "learning_rate": 6.623425188850746e-05, "loss": 0.2629, "step": 3472 }, { "epoch": 1.9467488789237668, "grad_norm": 0.07038343018874284, "learning_rate": 6.617285783249563e-05, "loss": 0.2625, "step": 3473 }, { "epoch": 1.9473094170403589, "grad_norm": 0.06941074905461624, "learning_rate": 6.611147817083456e-05, "loss": 0.2712, "step": 3474 }, { "epoch": 1.9478699551569507, "grad_norm": 0.0692743253769447, "learning_rate": 6.605011292964297e-05, "loss": 0.2483, "step": 3475 }, { "epoch": 1.9484304932735426, "grad_norm": 0.06808507809897883, "learning_rate": 6.598876213503339e-05, "loss": 0.2628, "step": 3476 }, { "epoch": 1.9489910313901344, "grad_norm": 0.0667729391352458, "learning_rate": 6.592742581311221e-05, "loss": 0.2493, "step": 3477 }, { "epoch": 1.9495515695067265, "grad_norm": 0.07055567641483064, "learning_rate": 6.58661039899797e-05, "loss": 0.2667, "step": 3478 }, { "epoch": 1.9501121076233185, "grad_norm": 0.0718589113081936, "learning_rate": 6.580479669172989e-05, "loss": 0.273, "step": 3479 }, { "epoch": 1.9506726457399104, "grad_norm": 0.06705308605935648, "learning_rate": 6.574350394445074e-05, "loss": 0.2557, "step": 3480 }, { "epoch": 1.9512331838565022, "grad_norm": 0.07119398618447678, "learning_rate": 6.568222577422389e-05, "loss": 0.2691, "step": 3481 }, { "epoch": 1.951793721973094, "grad_norm": 0.06839092796226122, "learning_rate": 6.562096220712482e-05, "loss": 0.2626, "step": 3482 }, { "epoch": 1.952354260089686, "grad_norm": 0.07330272242824666, "learning_rate": 6.555971326922286e-05, "loss": 0.2766, "step": 3483 }, { "epoch": 1.952914798206278, "grad_norm": 0.06811195070717267, "learning_rate": 6.549847898658102e-05, "loss": 0.2589, "step": 3484 }, { "epoch": 1.95347533632287, "grad_norm": 0.0700191072296125, "learning_rate": 6.54372593852562e-05, "loss": 0.2665, "step": 3485 }, { "epoch": 1.954035874439462, "grad_norm": 0.06806656260491024, "learning_rate": 6.537605449129888e-05, "loss": 0.2716, "step": 3486 }, { "epoch": 1.9545964125560538, "grad_norm": 0.06902008174469523, "learning_rate": 6.531486433075339e-05, "loss": 0.2712, "step": 3487 }, { "epoch": 1.9551569506726456, "grad_norm": 0.06943678410535838, "learning_rate": 6.525368892965784e-05, "loss": 0.2554, "step": 3488 }, { "epoch": 1.9557174887892377, "grad_norm": 0.068487552278527, "learning_rate": 6.519252831404392e-05, "loss": 0.2624, "step": 3489 }, { "epoch": 1.9562780269058297, "grad_norm": 0.07075715030821601, "learning_rate": 6.513138250993716e-05, "loss": 0.2714, "step": 3490 }, { "epoch": 1.9568385650224216, "grad_norm": 0.07190539931059477, "learning_rate": 6.507025154335666e-05, "loss": 0.2693, "step": 3491 }, { "epoch": 1.9573991031390134, "grad_norm": 0.06926257962240638, "learning_rate": 6.500913544031534e-05, "loss": 0.2444, "step": 3492 }, { "epoch": 1.9579596412556053, "grad_norm": 0.07127571826060886, "learning_rate": 6.494803422681972e-05, "loss": 0.262, "step": 3493 }, { "epoch": 1.9585201793721974, "grad_norm": 0.07075473239262661, "learning_rate": 6.488694792886996e-05, "loss": 0.2637, "step": 3494 }, { "epoch": 1.9590807174887892, "grad_norm": 0.0718492394912454, "learning_rate": 6.482587657245994e-05, "loss": 0.2638, "step": 3495 }, { "epoch": 1.9596412556053813, "grad_norm": 0.06909919217224249, "learning_rate": 6.476482018357713e-05, "loss": 0.2572, "step": 3496 }, { "epoch": 1.9602017937219731, "grad_norm": 0.06955860991608306, "learning_rate": 6.47037787882027e-05, "loss": 0.2627, "step": 3497 }, { "epoch": 1.960762331838565, "grad_norm": 0.06980360814637783, "learning_rate": 6.464275241231132e-05, "loss": 0.2633, "step": 3498 }, { "epoch": 1.9613228699551568, "grad_norm": 0.06713565112040079, "learning_rate": 6.458174108187139e-05, "loss": 0.2634, "step": 3499 }, { "epoch": 1.9618834080717489, "grad_norm": 0.06954400272833183, "learning_rate": 6.452074482284487e-05, "loss": 0.264, "step": 3500 }, { "epoch": 1.962443946188341, "grad_norm": 0.06805182739731903, "learning_rate": 6.445976366118722e-05, "loss": 0.2667, "step": 3501 }, { "epoch": 1.9630044843049328, "grad_norm": 0.0685719547389835, "learning_rate": 6.439879762284763e-05, "loss": 0.2487, "step": 3502 }, { "epoch": 1.9635650224215246, "grad_norm": 0.07172031683536766, "learning_rate": 6.43378467337687e-05, "loss": 0.2704, "step": 3503 }, { "epoch": 1.9641255605381165, "grad_norm": 0.06963178535603862, "learning_rate": 6.427691101988673e-05, "loss": 0.2498, "step": 3504 }, { "epoch": 1.9646860986547086, "grad_norm": 0.07098641070628586, "learning_rate": 6.421599050713144e-05, "loss": 0.2725, "step": 3505 }, { "epoch": 1.9652466367713004, "grad_norm": 0.06828147116270429, "learning_rate": 6.415508522142619e-05, "loss": 0.2574, "step": 3506 }, { "epoch": 1.9658071748878925, "grad_norm": 0.06957147042938308, "learning_rate": 6.409419518868775e-05, "loss": 0.2633, "step": 3507 }, { "epoch": 1.9663677130044843, "grad_norm": 0.07109989970710494, "learning_rate": 6.403332043482643e-05, "loss": 0.273, "step": 3508 }, { "epoch": 1.9669282511210762, "grad_norm": 0.0672142330811063, "learning_rate": 6.397246098574615e-05, "loss": 0.254, "step": 3509 }, { "epoch": 1.967488789237668, "grad_norm": 0.06840471354209353, "learning_rate": 6.391161686734413e-05, "loss": 0.2506, "step": 3510 }, { "epoch": 1.96804932735426, "grad_norm": 0.06920685502036582, "learning_rate": 6.385078810551124e-05, "loss": 0.2705, "step": 3511 }, { "epoch": 1.9686098654708521, "grad_norm": 0.07107145289071963, "learning_rate": 6.378997472613169e-05, "loss": 0.2765, "step": 3512 }, { "epoch": 1.969170403587444, "grad_norm": 0.06773951318983387, "learning_rate": 6.372917675508324e-05, "loss": 0.2552, "step": 3513 }, { "epoch": 1.9697309417040358, "grad_norm": 0.07167256187046933, "learning_rate": 6.366839421823702e-05, "loss": 0.2688, "step": 3514 }, { "epoch": 1.9702914798206277, "grad_norm": 0.06966715196630331, "learning_rate": 6.360762714145761e-05, "loss": 0.2431, "step": 3515 }, { "epoch": 1.9708520179372198, "grad_norm": 0.07022803064435848, "learning_rate": 6.354687555060302e-05, "loss": 0.2622, "step": 3516 }, { "epoch": 1.9714125560538116, "grad_norm": 0.070004797446652, "learning_rate": 6.348613947152468e-05, "loss": 0.2513, "step": 3517 }, { "epoch": 1.9719730941704037, "grad_norm": 0.06784047449113982, "learning_rate": 6.342541893006746e-05, "loss": 0.2563, "step": 3518 }, { "epoch": 1.9725336322869955, "grad_norm": 0.07028830379687871, "learning_rate": 6.336471395206946e-05, "loss": 0.2626, "step": 3519 }, { "epoch": 1.9730941704035874, "grad_norm": 0.07144555424165255, "learning_rate": 6.330402456336237e-05, "loss": 0.2586, "step": 3520 }, { "epoch": 1.9736547085201792, "grad_norm": 0.0712109303411046, "learning_rate": 6.324335078977112e-05, "loss": 0.2705, "step": 3521 }, { "epoch": 1.9742152466367713, "grad_norm": 0.07047632646386391, "learning_rate": 6.318269265711398e-05, "loss": 0.2659, "step": 3522 }, { "epoch": 1.9747757847533634, "grad_norm": 0.07160627079639259, "learning_rate": 6.312205019120262e-05, "loss": 0.279, "step": 3523 }, { "epoch": 1.9753363228699552, "grad_norm": 0.06996500426013655, "learning_rate": 6.306142341784202e-05, "loss": 0.251, "step": 3524 }, { "epoch": 1.975896860986547, "grad_norm": 0.07115808243675245, "learning_rate": 6.300081236283053e-05, "loss": 0.2588, "step": 3525 }, { "epoch": 1.976457399103139, "grad_norm": 0.07118630719531856, "learning_rate": 6.294021705195974e-05, "loss": 0.2557, "step": 3526 }, { "epoch": 1.977017937219731, "grad_norm": 0.07209774976734813, "learning_rate": 6.287963751101454e-05, "loss": 0.268, "step": 3527 }, { "epoch": 1.977578475336323, "grad_norm": 0.07024002145899598, "learning_rate": 6.281907376577316e-05, "loss": 0.2547, "step": 3528 }, { "epoch": 1.9781390134529149, "grad_norm": 0.0730987438721767, "learning_rate": 6.27585258420071e-05, "loss": 0.2533, "step": 3529 }, { "epoch": 1.9786995515695067, "grad_norm": 0.06986966309781407, "learning_rate": 6.269799376548116e-05, "loss": 0.2598, "step": 3530 }, { "epoch": 1.9792600896860986, "grad_norm": 0.06984335708696456, "learning_rate": 6.263747756195324e-05, "loss": 0.27, "step": 3531 }, { "epoch": 1.9798206278026906, "grad_norm": 0.07196028280505191, "learning_rate": 6.257697725717468e-05, "loss": 0.2614, "step": 3532 }, { "epoch": 1.9803811659192825, "grad_norm": 0.07156502426950263, "learning_rate": 6.251649287688999e-05, "loss": 0.2699, "step": 3533 }, { "epoch": 1.9809417040358746, "grad_norm": 0.07167329694846092, "learning_rate": 6.245602444683681e-05, "loss": 0.2541, "step": 3534 }, { "epoch": 1.9815022421524664, "grad_norm": 0.06847158656031785, "learning_rate": 6.239557199274615e-05, "loss": 0.2589, "step": 3535 }, { "epoch": 1.9820627802690582, "grad_norm": 0.07134674822776889, "learning_rate": 6.233513554034204e-05, "loss": 0.2695, "step": 3536 }, { "epoch": 1.98262331838565, "grad_norm": 0.07024867462243686, "learning_rate": 6.227471511534191e-05, "loss": 0.2613, "step": 3537 }, { "epoch": 1.9831838565022422, "grad_norm": 0.06914310788811501, "learning_rate": 6.221431074345618e-05, "loss": 0.2662, "step": 3538 }, { "epoch": 1.9837443946188342, "grad_norm": 0.07049521652395627, "learning_rate": 6.21539224503886e-05, "loss": 0.265, "step": 3539 }, { "epoch": 1.984304932735426, "grad_norm": 0.06930403337463462, "learning_rate": 6.209355026183594e-05, "loss": 0.2531, "step": 3540 }, { "epoch": 1.984865470852018, "grad_norm": 0.06784220137590388, "learning_rate": 6.203319420348814e-05, "loss": 0.2552, "step": 3541 }, { "epoch": 1.9854260089686098, "grad_norm": 0.06979882931331614, "learning_rate": 6.197285430102843e-05, "loss": 0.2578, "step": 3542 }, { "epoch": 1.9859865470852018, "grad_norm": 0.06946776088389077, "learning_rate": 6.191253058013292e-05, "loss": 0.2682, "step": 3543 }, { "epoch": 1.9865470852017937, "grad_norm": 0.06973110505777286, "learning_rate": 6.185222306647105e-05, "loss": 0.2684, "step": 3544 }, { "epoch": 1.9871076233183858, "grad_norm": 0.06677543280650287, "learning_rate": 6.179193178570521e-05, "loss": 0.2413, "step": 3545 }, { "epoch": 1.9876681614349776, "grad_norm": 0.07212011602930748, "learning_rate": 6.173165676349103e-05, "loss": 0.2622, "step": 3546 }, { "epoch": 1.9882286995515694, "grad_norm": 0.06899791297894088, "learning_rate": 6.167139802547709e-05, "loss": 0.2572, "step": 3547 }, { "epoch": 1.9887892376681613, "grad_norm": 0.06933956269035081, "learning_rate": 6.161115559730505e-05, "loss": 0.2497, "step": 3548 }, { "epoch": 1.9893497757847534, "grad_norm": 0.07089229995175773, "learning_rate": 6.155092950460972e-05, "loss": 0.2674, "step": 3549 }, { "epoch": 1.9899103139013454, "grad_norm": 0.07146566008039273, "learning_rate": 6.149071977301889e-05, "loss": 0.2594, "step": 3550 }, { "epoch": 1.9904708520179373, "grad_norm": 0.0709130469045352, "learning_rate": 6.143052642815344e-05, "loss": 0.2648, "step": 3551 }, { "epoch": 1.9910313901345291, "grad_norm": 0.06939505763940963, "learning_rate": 6.137034949562719e-05, "loss": 0.2637, "step": 3552 }, { "epoch": 1.991591928251121, "grad_norm": 0.06919568454464115, "learning_rate": 6.131018900104705e-05, "loss": 0.2643, "step": 3553 }, { "epoch": 1.992152466367713, "grad_norm": 0.06839363754516936, "learning_rate": 6.125004497001297e-05, "loss": 0.2638, "step": 3554 }, { "epoch": 1.9927130044843049, "grad_norm": 0.06941072012337744, "learning_rate": 6.118991742811773e-05, "loss": 0.2485, "step": 3555 }, { "epoch": 1.993273542600897, "grad_norm": 0.06919893078365716, "learning_rate": 6.112980640094728e-05, "loss": 0.267, "step": 3556 }, { "epoch": 1.9938340807174888, "grad_norm": 0.06988259270592627, "learning_rate": 6.106971191408042e-05, "loss": 0.2623, "step": 3557 }, { "epoch": 1.9943946188340806, "grad_norm": 0.07389775122760188, "learning_rate": 6.1009633993089023e-05, "loss": 0.28, "step": 3558 }, { "epoch": 1.9949551569506725, "grad_norm": 0.07096910171506648, "learning_rate": 6.094957266353776e-05, "loss": 0.2644, "step": 3559 }, { "epoch": 1.9955156950672646, "grad_norm": 0.0698603598767232, "learning_rate": 6.0889527950984416e-05, "loss": 0.2604, "step": 3560 }, { "epoch": 1.9960762331838566, "grad_norm": 0.07024849389536654, "learning_rate": 6.082949988097954e-05, "loss": 0.2652, "step": 3561 }, { "epoch": 1.9966367713004485, "grad_norm": 0.06949137843652796, "learning_rate": 6.0769488479066706e-05, "loss": 0.2584, "step": 3562 }, { "epoch": 1.9971973094170403, "grad_norm": 0.07090870799176502, "learning_rate": 6.07094937707824e-05, "loss": 0.2627, "step": 3563 }, { "epoch": 1.9977578475336322, "grad_norm": 0.06914428548303446, "learning_rate": 6.06495157816559e-05, "loss": 0.2642, "step": 3564 }, { "epoch": 1.9983183856502242, "grad_norm": 0.06946977098314562, "learning_rate": 6.058955453720949e-05, "loss": 0.261, "step": 3565 }, { "epoch": 1.9988789237668163, "grad_norm": 0.06896759326172326, "learning_rate": 6.052961006295824e-05, "loss": 0.2573, "step": 3566 }, { "epoch": 1.9994394618834082, "grad_norm": 0.06819216529004883, "learning_rate": 6.0469682384410195e-05, "loss": 0.2576, "step": 3567 }, { "epoch": 2.0, "grad_norm": 0.06871967606933381, "learning_rate": 6.040977152706613e-05, "loss": 0.257, "step": 3568 }, { "epoch": 2.0, "eval_loss": 0.2704615294933319, "eval_runtime": 342.5117, "eval_samples_per_second": 35.085, "eval_steps_per_second": 1.098, "step": 3568 }, { "epoch": 2.000560538116592, "grad_norm": 0.06881331512094488, "learning_rate": 6.034987751641967e-05, "loss": 0.2532, "step": 3569 }, { "epoch": 2.0011210762331837, "grad_norm": 0.06683374304992575, "learning_rate": 6.029000037795738e-05, "loss": 0.258, "step": 3570 }, { "epoch": 2.001681614349776, "grad_norm": 0.0697166774924195, "learning_rate": 6.023014013715853e-05, "loss": 0.253, "step": 3571 }, { "epoch": 2.002242152466368, "grad_norm": 0.07162643839747408, "learning_rate": 6.017029681949531e-05, "loss": 0.2525, "step": 3572 }, { "epoch": 2.0028026905829597, "grad_norm": 0.06976413816504087, "learning_rate": 6.01104704504326e-05, "loss": 0.2425, "step": 3573 }, { "epoch": 2.0033632286995515, "grad_norm": 0.06861865906982345, "learning_rate": 6.005066105542809e-05, "loss": 0.2374, "step": 3574 }, { "epoch": 2.0039237668161434, "grad_norm": 0.07142251589075896, "learning_rate": 5.999086865993236e-05, "loss": 0.2382, "step": 3575 }, { "epoch": 2.004484304932735, "grad_norm": 0.07156604660459492, "learning_rate": 5.9931093289388576e-05, "loss": 0.2458, "step": 3576 }, { "epoch": 2.0050448430493275, "grad_norm": 0.07280334146812427, "learning_rate": 5.987133496923281e-05, "loss": 0.2481, "step": 3577 }, { "epoch": 2.0056053811659194, "grad_norm": 0.07492042606750819, "learning_rate": 5.981159372489378e-05, "loss": 0.2453, "step": 3578 }, { "epoch": 2.006165919282511, "grad_norm": 0.0758618656234295, "learning_rate": 5.975186958179304e-05, "loss": 0.2584, "step": 3579 }, { "epoch": 2.006726457399103, "grad_norm": 0.07644821585164174, "learning_rate": 5.9692162565344755e-05, "loss": 0.2498, "step": 3580 }, { "epoch": 2.007286995515695, "grad_norm": 0.07455617340795175, "learning_rate": 5.9632472700955846e-05, "loss": 0.2466, "step": 3581 }, { "epoch": 2.007847533632287, "grad_norm": 0.07172595519992646, "learning_rate": 5.957280001402595e-05, "loss": 0.2308, "step": 3582 }, { "epoch": 2.008408071748879, "grad_norm": 0.07367491550452068, "learning_rate": 5.951314452994738e-05, "loss": 0.2506, "step": 3583 }, { "epoch": 2.008968609865471, "grad_norm": 0.072440305357167, "learning_rate": 5.94535062741052e-05, "loss": 0.2336, "step": 3584 }, { "epoch": 2.0095291479820627, "grad_norm": 0.0735028091121321, "learning_rate": 5.939388527187697e-05, "loss": 0.2429, "step": 3585 }, { "epoch": 2.0100896860986546, "grad_norm": 0.07316342029447456, "learning_rate": 5.9334281548633106e-05, "loss": 0.2564, "step": 3586 }, { "epoch": 2.0106502242152464, "grad_norm": 0.0734512260587328, "learning_rate": 5.927469512973656e-05, "loss": 0.2542, "step": 3587 }, { "epoch": 2.0112107623318387, "grad_norm": 0.07391007018946201, "learning_rate": 5.921512604054289e-05, "loss": 0.2423, "step": 3588 }, { "epoch": 2.0117713004484306, "grad_norm": 0.07519596716498071, "learning_rate": 5.9155574306400395e-05, "loss": 0.2535, "step": 3589 }, { "epoch": 2.0123318385650224, "grad_norm": 0.0729143913336409, "learning_rate": 5.9096039952649876e-05, "loss": 0.2478, "step": 3590 }, { "epoch": 2.0128923766816142, "grad_norm": 0.07194680849829693, "learning_rate": 5.903652300462485e-05, "loss": 0.2474, "step": 3591 }, { "epoch": 2.013452914798206, "grad_norm": 0.07168630931317939, "learning_rate": 5.897702348765129e-05, "loss": 0.2453, "step": 3592 }, { "epoch": 2.0140134529147984, "grad_norm": 0.06919176725888804, "learning_rate": 5.891754142704791e-05, "loss": 0.2354, "step": 3593 }, { "epoch": 2.0145739910313902, "grad_norm": 0.07349036326309909, "learning_rate": 5.885807684812584e-05, "loss": 0.2501, "step": 3594 }, { "epoch": 2.015134529147982, "grad_norm": 0.0724354831554613, "learning_rate": 5.879862977618886e-05, "loss": 0.242, "step": 3595 }, { "epoch": 2.015695067264574, "grad_norm": 0.07670077811205034, "learning_rate": 5.873920023653332e-05, "loss": 0.2384, "step": 3596 }, { "epoch": 2.0162556053811658, "grad_norm": 0.072267893958526, "learning_rate": 5.867978825444802e-05, "loss": 0.25, "step": 3597 }, { "epoch": 2.016816143497758, "grad_norm": 0.07544364867089007, "learning_rate": 5.8620393855214384e-05, "loss": 0.2553, "step": 3598 }, { "epoch": 2.01737668161435, "grad_norm": 0.07298918720019351, "learning_rate": 5.856101706410628e-05, "loss": 0.2356, "step": 3599 }, { "epoch": 2.0179372197309418, "grad_norm": 0.07275824547402367, "learning_rate": 5.8501657906390175e-05, "loss": 0.2354, "step": 3600 }, { "epoch": 2.0184977578475336, "grad_norm": 0.0746848707308173, "learning_rate": 5.8442316407324895e-05, "loss": 0.2375, "step": 3601 }, { "epoch": 2.0190582959641254, "grad_norm": 0.07612609715007827, "learning_rate": 5.838299259216187e-05, "loss": 0.245, "step": 3602 }, { "epoch": 2.0196188340807173, "grad_norm": 0.07619694838706523, "learning_rate": 5.832368648614499e-05, "loss": 0.247, "step": 3603 }, { "epoch": 2.0201793721973096, "grad_norm": 0.0740977327631378, "learning_rate": 5.826439811451052e-05, "loss": 0.2423, "step": 3604 }, { "epoch": 2.0207399103139014, "grad_norm": 0.0731684928055168, "learning_rate": 5.820512750248731e-05, "loss": 0.2431, "step": 3605 }, { "epoch": 2.0213004484304933, "grad_norm": 0.07577288505530066, "learning_rate": 5.814587467529652e-05, "loss": 0.2434, "step": 3606 }, { "epoch": 2.021860986547085, "grad_norm": 0.07507269911073491, "learning_rate": 5.808663965815188e-05, "loss": 0.2275, "step": 3607 }, { "epoch": 2.022421524663677, "grad_norm": 0.07513388383068403, "learning_rate": 5.8027422476259385e-05, "loss": 0.2419, "step": 3608 }, { "epoch": 2.0229820627802693, "grad_norm": 0.0741382324111763, "learning_rate": 5.796822315481758e-05, "loss": 0.2493, "step": 3609 }, { "epoch": 2.023542600896861, "grad_norm": 0.07569066865477501, "learning_rate": 5.7909041719017385e-05, "loss": 0.2367, "step": 3610 }, { "epoch": 2.024103139013453, "grad_norm": 0.07450803937671906, "learning_rate": 5.7849878194042e-05, "loss": 0.2493, "step": 3611 }, { "epoch": 2.024663677130045, "grad_norm": 0.07204647458005459, "learning_rate": 5.779073260506713e-05, "loss": 0.239, "step": 3612 }, { "epoch": 2.0252242152466366, "grad_norm": 0.07400742953868136, "learning_rate": 5.773160497726082e-05, "loss": 0.2439, "step": 3613 }, { "epoch": 2.0257847533632285, "grad_norm": 0.07234116837916794, "learning_rate": 5.767249533578338e-05, "loss": 0.2388, "step": 3614 }, { "epoch": 2.026345291479821, "grad_norm": 0.0729479198673305, "learning_rate": 5.76134037057876e-05, "loss": 0.2447, "step": 3615 }, { "epoch": 2.0269058295964126, "grad_norm": 0.07548523547570704, "learning_rate": 5.755433011241851e-05, "loss": 0.2467, "step": 3616 }, { "epoch": 2.0274663677130045, "grad_norm": 0.07356258567090886, "learning_rate": 5.7495274580813494e-05, "loss": 0.2451, "step": 3617 }, { "epoch": 2.0280269058295963, "grad_norm": 0.07503584154196515, "learning_rate": 5.743623713610229e-05, "loss": 0.2521, "step": 3618 }, { "epoch": 2.028587443946188, "grad_norm": 0.07503181429152264, "learning_rate": 5.7377217803406925e-05, "loss": 0.2602, "step": 3619 }, { "epoch": 2.0291479820627805, "grad_norm": 0.07497700035198634, "learning_rate": 5.7318216607841644e-05, "loss": 0.2538, "step": 3620 }, { "epoch": 2.0297085201793723, "grad_norm": 0.07440507985772586, "learning_rate": 5.7259233574513025e-05, "loss": 0.2401, "step": 3621 }, { "epoch": 2.030269058295964, "grad_norm": 0.0754323185863951, "learning_rate": 5.720026872851998e-05, "loss": 0.2368, "step": 3622 }, { "epoch": 2.030829596412556, "grad_norm": 0.07543000929160433, "learning_rate": 5.714132209495354e-05, "loss": 0.2475, "step": 3623 }, { "epoch": 2.031390134529148, "grad_norm": 0.07470759129408015, "learning_rate": 5.7082393698897166e-05, "loss": 0.2535, "step": 3624 }, { "epoch": 2.0319506726457397, "grad_norm": 0.0738447933102809, "learning_rate": 5.702348356542635e-05, "loss": 0.239, "step": 3625 }, { "epoch": 2.032511210762332, "grad_norm": 0.07437147445974224, "learning_rate": 5.696459171960899e-05, "loss": 0.249, "step": 3626 }, { "epoch": 2.033071748878924, "grad_norm": 0.07427215677895992, "learning_rate": 5.6905718186505185e-05, "loss": 0.2385, "step": 3627 }, { "epoch": 2.0336322869955157, "grad_norm": 0.072072620232772, "learning_rate": 5.684686299116709e-05, "loss": 0.2401, "step": 3628 }, { "epoch": 2.0341928251121075, "grad_norm": 0.07418752073168021, "learning_rate": 5.678802615863925e-05, "loss": 0.2432, "step": 3629 }, { "epoch": 2.0347533632286994, "grad_norm": 0.07376478735097676, "learning_rate": 5.672920771395822e-05, "loss": 0.2476, "step": 3630 }, { "epoch": 2.0353139013452917, "grad_norm": 0.07317600360599674, "learning_rate": 5.6670407682152906e-05, "loss": 0.247, "step": 3631 }, { "epoch": 2.0358744394618835, "grad_norm": 0.07548459192437802, "learning_rate": 5.6611626088244194e-05, "loss": 0.2518, "step": 3632 }, { "epoch": 2.0364349775784754, "grad_norm": 0.07673378194190912, "learning_rate": 5.655286295724528e-05, "loss": 0.2504, "step": 3633 }, { "epoch": 2.036995515695067, "grad_norm": 0.07594685544914863, "learning_rate": 5.649411831416147e-05, "loss": 0.2519, "step": 3634 }, { "epoch": 2.037556053811659, "grad_norm": 0.0789259362724179, "learning_rate": 5.643539218399009e-05, "loss": 0.2558, "step": 3635 }, { "epoch": 2.038116591928251, "grad_norm": 0.07513976547343512, "learning_rate": 5.6376684591720766e-05, "loss": 0.2471, "step": 3636 }, { "epoch": 2.038677130044843, "grad_norm": 0.07673956293110795, "learning_rate": 5.6317995562335055e-05, "loss": 0.2542, "step": 3637 }, { "epoch": 2.039237668161435, "grad_norm": 0.07422344910909859, "learning_rate": 5.625932512080678e-05, "loss": 0.2357, "step": 3638 }, { "epoch": 2.039798206278027, "grad_norm": 0.0770687977508266, "learning_rate": 5.620067329210172e-05, "loss": 0.2477, "step": 3639 }, { "epoch": 2.0403587443946187, "grad_norm": 0.07477312610911233, "learning_rate": 5.614204010117785e-05, "loss": 0.2458, "step": 3640 }, { "epoch": 2.0409192825112106, "grad_norm": 0.07703501507752882, "learning_rate": 5.608342557298508e-05, "loss": 0.2691, "step": 3641 }, { "epoch": 2.041479820627803, "grad_norm": 0.07425971126281866, "learning_rate": 5.60248297324655e-05, "loss": 0.2518, "step": 3642 }, { "epoch": 2.0420403587443947, "grad_norm": 0.07409210468771125, "learning_rate": 5.596625260455324e-05, "loss": 0.2459, "step": 3643 }, { "epoch": 2.0426008968609866, "grad_norm": 0.07743348048971965, "learning_rate": 5.5907694214174344e-05, "loss": 0.2474, "step": 3644 }, { "epoch": 2.0431614349775784, "grad_norm": 0.07555808441932708, "learning_rate": 5.584915458624706e-05, "loss": 0.2417, "step": 3645 }, { "epoch": 2.0437219730941703, "grad_norm": 0.07646567090721604, "learning_rate": 5.5790633745681475e-05, "loss": 0.2443, "step": 3646 }, { "epoch": 2.0442825112107625, "grad_norm": 0.07391358861055916, "learning_rate": 5.573213171737983e-05, "loss": 0.2551, "step": 3647 }, { "epoch": 2.0448430493273544, "grad_norm": 0.07467413970510883, "learning_rate": 5.567364852623629e-05, "loss": 0.2453, "step": 3648 }, { "epoch": 2.0454035874439462, "grad_norm": 0.0755515448790851, "learning_rate": 5.561518419713695e-05, "loss": 0.2385, "step": 3649 }, { "epoch": 2.045964125560538, "grad_norm": 0.07519782812949223, "learning_rate": 5.555673875495999e-05, "loss": 0.2473, "step": 3650 }, { "epoch": 2.04652466367713, "grad_norm": 0.07838271136523178, "learning_rate": 5.549831222457549e-05, "loss": 0.2515, "step": 3651 }, { "epoch": 2.0470852017937218, "grad_norm": 0.08057752662737491, "learning_rate": 5.543990463084554e-05, "loss": 0.2409, "step": 3652 }, { "epoch": 2.047645739910314, "grad_norm": 0.07357099627993306, "learning_rate": 5.538151599862407e-05, "loss": 0.2443, "step": 3653 }, { "epoch": 2.048206278026906, "grad_norm": 0.07528720685237601, "learning_rate": 5.532314635275705e-05, "loss": 0.2471, "step": 3654 }, { "epoch": 2.0487668161434978, "grad_norm": 0.07640859527958416, "learning_rate": 5.52647957180823e-05, "loss": 0.2572, "step": 3655 }, { "epoch": 2.0493273542600896, "grad_norm": 0.07577679780106567, "learning_rate": 5.520646411942951e-05, "loss": 0.2488, "step": 3656 }, { "epoch": 2.0498878923766815, "grad_norm": 0.07390965585458116, "learning_rate": 5.514815158162041e-05, "loss": 0.2461, "step": 3657 }, { "epoch": 2.0504484304932737, "grad_norm": 0.07605508395958041, "learning_rate": 5.50898581294685e-05, "loss": 0.2418, "step": 3658 }, { "epoch": 2.0510089686098656, "grad_norm": 0.07787568561042851, "learning_rate": 5.503158378777924e-05, "loss": 0.2527, "step": 3659 }, { "epoch": 2.0515695067264574, "grad_norm": 0.07379881671794046, "learning_rate": 5.497332858134992e-05, "loss": 0.2462, "step": 3660 }, { "epoch": 2.0521300448430493, "grad_norm": 0.07474063782729076, "learning_rate": 5.491509253496958e-05, "loss": 0.2439, "step": 3661 }, { "epoch": 2.052690582959641, "grad_norm": 0.07515187953468082, "learning_rate": 5.4856875673419326e-05, "loss": 0.251, "step": 3662 }, { "epoch": 2.053251121076233, "grad_norm": 0.07412649718740476, "learning_rate": 5.479867802147191e-05, "loss": 0.2416, "step": 3663 }, { "epoch": 2.0538116591928253, "grad_norm": 0.0763874254973806, "learning_rate": 5.474049960389205e-05, "loss": 0.2505, "step": 3664 }, { "epoch": 2.054372197309417, "grad_norm": 0.07464565136436936, "learning_rate": 5.468234044543614e-05, "loss": 0.2457, "step": 3665 }, { "epoch": 2.054932735426009, "grad_norm": 0.07412344875406843, "learning_rate": 5.462420057085249e-05, "loss": 0.2521, "step": 3666 }, { "epoch": 2.055493273542601, "grad_norm": 0.07306081151791718, "learning_rate": 5.456608000488119e-05, "loss": 0.2385, "step": 3667 }, { "epoch": 2.0560538116591927, "grad_norm": 0.07730984821947563, "learning_rate": 5.450797877225404e-05, "loss": 0.2531, "step": 3668 }, { "epoch": 2.056614349775785, "grad_norm": 0.07521617598903653, "learning_rate": 5.4449896897694744e-05, "loss": 0.2406, "step": 3669 }, { "epoch": 2.057174887892377, "grad_norm": 0.07760150487580726, "learning_rate": 5.4391834405918574e-05, "loss": 0.2517, "step": 3670 }, { "epoch": 2.0577354260089686, "grad_norm": 0.07765654951932434, "learning_rate": 5.433379132163279e-05, "loss": 0.2479, "step": 3671 }, { "epoch": 2.0582959641255605, "grad_norm": 0.07459978677925008, "learning_rate": 5.4275767669536146e-05, "loss": 0.2302, "step": 3672 }, { "epoch": 2.0588565022421523, "grad_norm": 0.0740377335860988, "learning_rate": 5.421776347431937e-05, "loss": 0.2457, "step": 3673 }, { "epoch": 2.059417040358744, "grad_norm": 0.07306074248882465, "learning_rate": 5.41597787606647e-05, "loss": 0.2416, "step": 3674 }, { "epoch": 2.0599775784753365, "grad_norm": 0.07448946926245875, "learning_rate": 5.410181355324622e-05, "loss": 0.2455, "step": 3675 }, { "epoch": 2.0605381165919283, "grad_norm": 0.07665634256411681, "learning_rate": 5.40438678767297e-05, "loss": 0.2576, "step": 3676 }, { "epoch": 2.06109865470852, "grad_norm": 0.07741339767568457, "learning_rate": 5.398594175577252e-05, "loss": 0.2497, "step": 3677 }, { "epoch": 2.061659192825112, "grad_norm": 0.0771709029185898, "learning_rate": 5.392803521502383e-05, "loss": 0.2486, "step": 3678 }, { "epoch": 2.062219730941704, "grad_norm": 0.07577514077984976, "learning_rate": 5.387014827912434e-05, "loss": 0.2458, "step": 3679 }, { "epoch": 2.062780269058296, "grad_norm": 0.07395310574854698, "learning_rate": 5.381228097270659e-05, "loss": 0.2309, "step": 3680 }, { "epoch": 2.063340807174888, "grad_norm": 0.07710178899027828, "learning_rate": 5.375443332039458e-05, "loss": 0.2584, "step": 3681 }, { "epoch": 2.06390134529148, "grad_norm": 0.074934507801412, "learning_rate": 5.369660534680402e-05, "loss": 0.248, "step": 3682 }, { "epoch": 2.0644618834080717, "grad_norm": 0.07345878202054623, "learning_rate": 5.363879707654228e-05, "loss": 0.2408, "step": 3683 }, { "epoch": 2.0650224215246635, "grad_norm": 0.07603644683417077, "learning_rate": 5.3581008534208334e-05, "loss": 0.2451, "step": 3684 }, { "epoch": 2.0655829596412554, "grad_norm": 0.07407798941233894, "learning_rate": 5.352323974439275e-05, "loss": 0.2324, "step": 3685 }, { "epoch": 2.0661434977578477, "grad_norm": 0.07589936133155524, "learning_rate": 5.3465490731677655e-05, "loss": 0.2523, "step": 3686 }, { "epoch": 2.0667040358744395, "grad_norm": 0.07476780548570601, "learning_rate": 5.3407761520636845e-05, "loss": 0.2416, "step": 3687 }, { "epoch": 2.0672645739910314, "grad_norm": 0.07620405663914806, "learning_rate": 5.3350052135835616e-05, "loss": 0.2473, "step": 3688 }, { "epoch": 2.067825112107623, "grad_norm": 0.0767194329863688, "learning_rate": 5.32923626018308e-05, "loss": 0.2497, "step": 3689 }, { "epoch": 2.068385650224215, "grad_norm": 0.07495906496924769, "learning_rate": 5.3234692943170874e-05, "loss": 0.243, "step": 3690 }, { "epoch": 2.0689461883408073, "grad_norm": 0.07649211839470273, "learning_rate": 5.31770431843958e-05, "loss": 0.2563, "step": 3691 }, { "epoch": 2.069506726457399, "grad_norm": 0.07558525414436873, "learning_rate": 5.311941335003715e-05, "loss": 0.2414, "step": 3692 }, { "epoch": 2.070067264573991, "grad_norm": 0.0757366907227106, "learning_rate": 5.306180346461786e-05, "loss": 0.2457, "step": 3693 }, { "epoch": 2.070627802690583, "grad_norm": 0.07728198121553063, "learning_rate": 5.300421355265257e-05, "loss": 0.2549, "step": 3694 }, { "epoch": 2.0711883408071747, "grad_norm": 0.07666444202365309, "learning_rate": 5.294664363864725e-05, "loss": 0.2533, "step": 3695 }, { "epoch": 2.071748878923767, "grad_norm": 0.073571632939005, "learning_rate": 5.2889093747099427e-05, "loss": 0.2451, "step": 3696 }, { "epoch": 2.072309417040359, "grad_norm": 0.07441857755812561, "learning_rate": 5.283156390249817e-05, "loss": 0.2478, "step": 3697 }, { "epoch": 2.0728699551569507, "grad_norm": 0.07209931004529369, "learning_rate": 5.27740541293239e-05, "loss": 0.2337, "step": 3698 }, { "epoch": 2.0734304932735426, "grad_norm": 0.07434303853065642, "learning_rate": 5.27165644520486e-05, "loss": 0.2389, "step": 3699 }, { "epoch": 2.0739910313901344, "grad_norm": 0.07538332019807867, "learning_rate": 5.265909489513567e-05, "loss": 0.2346, "step": 3700 }, { "epoch": 2.0745515695067263, "grad_norm": 0.07563528299165076, "learning_rate": 5.2601645483039896e-05, "loss": 0.2503, "step": 3701 }, { "epoch": 2.0751121076233185, "grad_norm": 0.07550535897521962, "learning_rate": 5.25442162402076e-05, "loss": 0.2457, "step": 3702 }, { "epoch": 2.0756726457399104, "grad_norm": 0.07708504890256196, "learning_rate": 5.248680719107636e-05, "loss": 0.2437, "step": 3703 }, { "epoch": 2.0762331838565022, "grad_norm": 0.07926456227678434, "learning_rate": 5.242941836007536e-05, "loss": 0.2365, "step": 3704 }, { "epoch": 2.076793721973094, "grad_norm": 0.07803839890484698, "learning_rate": 5.237204977162498e-05, "loss": 0.2367, "step": 3705 }, { "epoch": 2.077354260089686, "grad_norm": 0.0779619945978881, "learning_rate": 5.231470145013717e-05, "loss": 0.2462, "step": 3706 }, { "epoch": 2.0779147982062782, "grad_norm": 0.07725602039695345, "learning_rate": 5.22573734200151e-05, "loss": 0.2392, "step": 3707 }, { "epoch": 2.07847533632287, "grad_norm": 0.07703201254512783, "learning_rate": 5.220006570565341e-05, "loss": 0.2483, "step": 3708 }, { "epoch": 2.079035874439462, "grad_norm": 0.07834171005721667, "learning_rate": 5.214277833143808e-05, "loss": 0.2476, "step": 3709 }, { "epoch": 2.0795964125560538, "grad_norm": 0.07754509610713888, "learning_rate": 5.208551132174637e-05, "loss": 0.2491, "step": 3710 }, { "epoch": 2.0801569506726456, "grad_norm": 0.07803061197291622, "learning_rate": 5.202826470094697e-05, "loss": 0.2523, "step": 3711 }, { "epoch": 2.0807174887892375, "grad_norm": 0.07506148068870845, "learning_rate": 5.197103849339978e-05, "loss": 0.2506, "step": 3712 }, { "epoch": 2.0812780269058297, "grad_norm": 0.07685695921965507, "learning_rate": 5.1913832723456144e-05, "loss": 0.2492, "step": 3713 }, { "epoch": 2.0818385650224216, "grad_norm": 0.07827560089693103, "learning_rate": 5.185664741545861e-05, "loss": 0.2478, "step": 3714 }, { "epoch": 2.0823991031390134, "grad_norm": 0.0772674108978287, "learning_rate": 5.179948259374102e-05, "loss": 0.2453, "step": 3715 }, { "epoch": 2.0829596412556053, "grad_norm": 0.07533389028706951, "learning_rate": 5.174233828262855e-05, "loss": 0.2378, "step": 3716 }, { "epoch": 2.083520179372197, "grad_norm": 0.07285465726019562, "learning_rate": 5.1685214506437654e-05, "loss": 0.2393, "step": 3717 }, { "epoch": 2.0840807174887894, "grad_norm": 0.07656244214074619, "learning_rate": 5.162811128947602e-05, "loss": 0.2458, "step": 3718 }, { "epoch": 2.0846412556053813, "grad_norm": 0.07510983004184547, "learning_rate": 5.157102865604255e-05, "loss": 0.2477, "step": 3719 }, { "epoch": 2.085201793721973, "grad_norm": 0.07571182232761092, "learning_rate": 5.151396663042749e-05, "loss": 0.2394, "step": 3720 }, { "epoch": 2.085762331838565, "grad_norm": 0.07738104482933404, "learning_rate": 5.145692523691222e-05, "loss": 0.2423, "step": 3721 }, { "epoch": 2.086322869955157, "grad_norm": 0.07424042187491937, "learning_rate": 5.139990449976933e-05, "loss": 0.2552, "step": 3722 }, { "epoch": 2.086883408071749, "grad_norm": 0.07712723505263727, "learning_rate": 5.1342904443262686e-05, "loss": 0.2429, "step": 3723 }, { "epoch": 2.087443946188341, "grad_norm": 0.07744484870522164, "learning_rate": 5.128592509164736e-05, "loss": 0.2541, "step": 3724 }, { "epoch": 2.088004484304933, "grad_norm": 0.07568429844380314, "learning_rate": 5.122896646916959e-05, "loss": 0.24, "step": 3725 }, { "epoch": 2.0885650224215246, "grad_norm": 0.07667519386845806, "learning_rate": 5.1172028600066757e-05, "loss": 0.2478, "step": 3726 }, { "epoch": 2.0891255605381165, "grad_norm": 0.07309287635617359, "learning_rate": 5.1115111508567484e-05, "loss": 0.2295, "step": 3727 }, { "epoch": 2.0896860986547083, "grad_norm": 0.07595344765462039, "learning_rate": 5.105821521889147e-05, "loss": 0.2515, "step": 3728 }, { "epoch": 2.0902466367713006, "grad_norm": 0.07729452661945743, "learning_rate": 5.100133975524959e-05, "loss": 0.2412, "step": 3729 }, { "epoch": 2.0908071748878925, "grad_norm": 0.07521225983949126, "learning_rate": 5.094448514184393e-05, "loss": 0.2444, "step": 3730 }, { "epoch": 2.0913677130044843, "grad_norm": 0.07579948195096538, "learning_rate": 5.0887651402867576e-05, "loss": 0.232, "step": 3731 }, { "epoch": 2.091928251121076, "grad_norm": 0.07935544607466706, "learning_rate": 5.0830838562504835e-05, "loss": 0.2501, "step": 3732 }, { "epoch": 2.092488789237668, "grad_norm": 0.07751926544596244, "learning_rate": 5.0774046644931074e-05, "loss": 0.2401, "step": 3733 }, { "epoch": 2.0930493273542603, "grad_norm": 0.07389189242772543, "learning_rate": 5.0717275674312814e-05, "loss": 0.2245, "step": 3734 }, { "epoch": 2.093609865470852, "grad_norm": 0.0769607643153231, "learning_rate": 5.066052567480759e-05, "loss": 0.2351, "step": 3735 }, { "epoch": 2.094170403587444, "grad_norm": 0.0759419778431755, "learning_rate": 5.060379667056399e-05, "loss": 0.2326, "step": 3736 }, { "epoch": 2.094730941704036, "grad_norm": 0.0792415087895477, "learning_rate": 5.054708868572178e-05, "loss": 0.2551, "step": 3737 }, { "epoch": 2.0952914798206277, "grad_norm": 0.07855042508720948, "learning_rate": 5.049040174441166e-05, "loss": 0.248, "step": 3738 }, { "epoch": 2.0958520179372195, "grad_norm": 0.07613422022280052, "learning_rate": 5.043373587075551e-05, "loss": 0.2508, "step": 3739 }, { "epoch": 2.096412556053812, "grad_norm": 0.0789895223067926, "learning_rate": 5.0377091088866094e-05, "loss": 0.2467, "step": 3740 }, { "epoch": 2.0969730941704037, "grad_norm": 0.08027646715524697, "learning_rate": 5.032046742284731e-05, "loss": 0.2468, "step": 3741 }, { "epoch": 2.0975336322869955, "grad_norm": 0.0778152627142349, "learning_rate": 5.026386489679408e-05, "loss": 0.2538, "step": 3742 }, { "epoch": 2.0980941704035874, "grad_norm": 0.07677584154005475, "learning_rate": 5.0207283534792205e-05, "loss": 0.2481, "step": 3743 }, { "epoch": 2.098654708520179, "grad_norm": 0.07547205995345732, "learning_rate": 5.015072336091866e-05, "loss": 0.2482, "step": 3744 }, { "epoch": 2.0992152466367715, "grad_norm": 0.07596555024492226, "learning_rate": 5.0094184399241196e-05, "loss": 0.2442, "step": 3745 }, { "epoch": 2.0997757847533634, "grad_norm": 0.0760207349377979, "learning_rate": 5.003766667381875e-05, "loss": 0.2504, "step": 3746 }, { "epoch": 2.100336322869955, "grad_norm": 0.07549347569273997, "learning_rate": 4.998117020870108e-05, "loss": 0.2417, "step": 3747 }, { "epoch": 2.100896860986547, "grad_norm": 0.07484700862466377, "learning_rate": 4.992469502792889e-05, "loss": 0.2414, "step": 3748 }, { "epoch": 2.101457399103139, "grad_norm": 0.07366947561702218, "learning_rate": 4.986824115553392e-05, "loss": 0.2335, "step": 3749 }, { "epoch": 2.1020179372197307, "grad_norm": 0.07484692603803325, "learning_rate": 4.98118086155388e-05, "loss": 0.2469, "step": 3750 }, { "epoch": 2.102578475336323, "grad_norm": 0.07464427055286087, "learning_rate": 4.9755397431957116e-05, "loss": 0.2523, "step": 3751 }, { "epoch": 2.103139013452915, "grad_norm": 0.07636046363902961, "learning_rate": 4.969900762879325e-05, "loss": 0.2364, "step": 3752 }, { "epoch": 2.1036995515695067, "grad_norm": 0.07565136512051622, "learning_rate": 4.9642639230042654e-05, "loss": 0.2394, "step": 3753 }, { "epoch": 2.1042600896860986, "grad_norm": 0.07499809113500086, "learning_rate": 4.958629225969153e-05, "loss": 0.2441, "step": 3754 }, { "epoch": 2.1048206278026904, "grad_norm": 0.0728745125772784, "learning_rate": 4.952996674171698e-05, "loss": 0.2364, "step": 3755 }, { "epoch": 2.1053811659192827, "grad_norm": 0.0776436778018926, "learning_rate": 4.947366270008707e-05, "loss": 0.2356, "step": 3756 }, { "epoch": 2.1059417040358746, "grad_norm": 0.0748259736118957, "learning_rate": 4.9417380158760663e-05, "loss": 0.2414, "step": 3757 }, { "epoch": 2.1065022421524664, "grad_norm": 0.07740404251257217, "learning_rate": 4.936111914168749e-05, "loss": 0.2377, "step": 3758 }, { "epoch": 2.1070627802690582, "grad_norm": 0.07602820206573185, "learning_rate": 4.930487967280809e-05, "loss": 0.2447, "step": 3759 }, { "epoch": 2.10762331838565, "grad_norm": 0.07744549973889647, "learning_rate": 4.924866177605389e-05, "loss": 0.2449, "step": 3760 }, { "epoch": 2.108183856502242, "grad_norm": 0.07865211127766736, "learning_rate": 4.919246547534708e-05, "loss": 0.2435, "step": 3761 }, { "epoch": 2.1087443946188342, "grad_norm": 0.07601670072773913, "learning_rate": 4.913629079460065e-05, "loss": 0.2525, "step": 3762 }, { "epoch": 2.109304932735426, "grad_norm": 0.07449968206673598, "learning_rate": 4.908013775771849e-05, "loss": 0.2336, "step": 3763 }, { "epoch": 2.109865470852018, "grad_norm": 0.07913432716177185, "learning_rate": 4.9024006388595155e-05, "loss": 0.2491, "step": 3764 }, { "epoch": 2.1104260089686098, "grad_norm": 0.0783133059555131, "learning_rate": 4.896789671111606e-05, "loss": 0.2505, "step": 3765 }, { "epoch": 2.1109865470852016, "grad_norm": 0.0770458655210177, "learning_rate": 4.891180874915737e-05, "loss": 0.2391, "step": 3766 }, { "epoch": 2.111547085201794, "grad_norm": 0.07804450235053058, "learning_rate": 4.885574252658607e-05, "loss": 0.2512, "step": 3767 }, { "epoch": 2.1121076233183858, "grad_norm": 0.07719063199576529, "learning_rate": 4.8799698067259757e-05, "loss": 0.2496, "step": 3768 }, { "epoch": 2.1126681614349776, "grad_norm": 0.07509637258198873, "learning_rate": 4.8743675395026836e-05, "loss": 0.2318, "step": 3769 }, { "epoch": 2.1132286995515694, "grad_norm": 0.07880164737334229, "learning_rate": 4.868767453372649e-05, "loss": 0.251, "step": 3770 }, { "epoch": 2.1137892376681613, "grad_norm": 0.07472006746718912, "learning_rate": 4.863169550718855e-05, "loss": 0.2386, "step": 3771 }, { "epoch": 2.1143497757847536, "grad_norm": 0.07503838079189648, "learning_rate": 4.857573833923361e-05, "loss": 0.2451, "step": 3772 }, { "epoch": 2.1149103139013454, "grad_norm": 0.07783665493828258, "learning_rate": 4.85198030536729e-05, "loss": 0.2469, "step": 3773 }, { "epoch": 2.1154708520179373, "grad_norm": 0.07252369190069591, "learning_rate": 4.8463889674308386e-05, "loss": 0.237, "step": 3774 }, { "epoch": 2.116031390134529, "grad_norm": 0.0767332797547159, "learning_rate": 4.8407998224932746e-05, "loss": 0.2397, "step": 3775 }, { "epoch": 2.116591928251121, "grad_norm": 0.07314039073294969, "learning_rate": 4.8352128729329226e-05, "loss": 0.241, "step": 3776 }, { "epoch": 2.117152466367713, "grad_norm": 0.07798493184421496, "learning_rate": 4.8296281211271845e-05, "loss": 0.2569, "step": 3777 }, { "epoch": 2.117713004484305, "grad_norm": 0.07640627922643582, "learning_rate": 4.824045569452512e-05, "loss": 0.2545, "step": 3778 }, { "epoch": 2.118273542600897, "grad_norm": 0.07698966097835544, "learning_rate": 4.81846522028444e-05, "loss": 0.2494, "step": 3779 }, { "epoch": 2.118834080717489, "grad_norm": 0.0777952701462906, "learning_rate": 4.8128870759975474e-05, "loss": 0.2569, "step": 3780 }, { "epoch": 2.1193946188340806, "grad_norm": 0.07933407709088636, "learning_rate": 4.8073111389654904e-05, "loss": 0.2553, "step": 3781 }, { "epoch": 2.1199551569506725, "grad_norm": 0.0736801240831016, "learning_rate": 4.8017374115609705e-05, "loss": 0.2325, "step": 3782 }, { "epoch": 2.120515695067265, "grad_norm": 0.07656959783693124, "learning_rate": 4.796165896155762e-05, "loss": 0.2369, "step": 3783 }, { "epoch": 2.1210762331838566, "grad_norm": 0.07589478437817554, "learning_rate": 4.790596595120699e-05, "loss": 0.2475, "step": 3784 }, { "epoch": 2.1216367713004485, "grad_norm": 0.07832222185990736, "learning_rate": 4.785029510825656e-05, "loss": 0.2439, "step": 3785 }, { "epoch": 2.1221973094170403, "grad_norm": 0.07661442828983898, "learning_rate": 4.7794646456395864e-05, "loss": 0.2286, "step": 3786 }, { "epoch": 2.122757847533632, "grad_norm": 0.07948573208800389, "learning_rate": 4.7739020019304836e-05, "loss": 0.258, "step": 3787 }, { "epoch": 2.123318385650224, "grad_norm": 0.07676654468744926, "learning_rate": 4.7683415820653976e-05, "loss": 0.2359, "step": 3788 }, { "epoch": 2.1238789237668163, "grad_norm": 0.07949196052434777, "learning_rate": 4.7627833884104376e-05, "loss": 0.2539, "step": 3789 }, { "epoch": 2.124439461883408, "grad_norm": 0.0808029184848327, "learning_rate": 4.757227423330766e-05, "loss": 0.2465, "step": 3790 }, { "epoch": 2.125, "grad_norm": 0.07684233703423032, "learning_rate": 4.751673689190596e-05, "loss": 0.2465, "step": 3791 }, { "epoch": 2.125560538116592, "grad_norm": 0.07582523958146972, "learning_rate": 4.746122188353182e-05, "loss": 0.2389, "step": 3792 }, { "epoch": 2.1261210762331837, "grad_norm": 0.07770405301933939, "learning_rate": 4.740572923180843e-05, "loss": 0.2462, "step": 3793 }, { "epoch": 2.126681614349776, "grad_norm": 0.07630725806189007, "learning_rate": 4.7350258960349345e-05, "loss": 0.2418, "step": 3794 }, { "epoch": 2.127242152466368, "grad_norm": 0.08095571713148267, "learning_rate": 4.729481109275864e-05, "loss": 0.2349, "step": 3795 }, { "epoch": 2.1278026905829597, "grad_norm": 0.07515324286245524, "learning_rate": 4.723938565263091e-05, "loss": 0.2338, "step": 3796 }, { "epoch": 2.1283632286995515, "grad_norm": 0.07576355358240246, "learning_rate": 4.718398266355109e-05, "loss": 0.2365, "step": 3797 }, { "epoch": 2.1289237668161434, "grad_norm": 0.08129801619576742, "learning_rate": 4.712860214909466e-05, "loss": 0.2562, "step": 3798 }, { "epoch": 2.1294843049327357, "grad_norm": 0.07792485345307101, "learning_rate": 4.707324413282751e-05, "loss": 0.2452, "step": 3799 }, { "epoch": 2.1300448430493275, "grad_norm": 0.07677282962544202, "learning_rate": 4.7017908638305995e-05, "loss": 0.2474, "step": 3800 }, { "epoch": 2.1306053811659194, "grad_norm": 0.07859457790958559, "learning_rate": 4.6962595689076796e-05, "loss": 0.2527, "step": 3801 }, { "epoch": 2.131165919282511, "grad_norm": 0.07490852181293503, "learning_rate": 4.6907305308677005e-05, "loss": 0.2365, "step": 3802 }, { "epoch": 2.131726457399103, "grad_norm": 0.07599246717101346, "learning_rate": 4.685203752063425e-05, "loss": 0.2399, "step": 3803 }, { "epoch": 2.132286995515695, "grad_norm": 0.07659072204029127, "learning_rate": 4.6796792348466356e-05, "loss": 0.2357, "step": 3804 }, { "epoch": 2.132847533632287, "grad_norm": 0.07676255779618765, "learning_rate": 4.6741569815681685e-05, "loss": 0.2373, "step": 3805 }, { "epoch": 2.133408071748879, "grad_norm": 0.07471477190494208, "learning_rate": 4.668636994577884e-05, "loss": 0.2352, "step": 3806 }, { "epoch": 2.133968609865471, "grad_norm": 0.07791178793759876, "learning_rate": 4.663119276224688e-05, "loss": 0.2506, "step": 3807 }, { "epoch": 2.1345291479820627, "grad_norm": 0.07603256569795648, "learning_rate": 4.657603828856517e-05, "loss": 0.2459, "step": 3808 }, { "epoch": 2.1350896860986546, "grad_norm": 0.07359062577381702, "learning_rate": 4.652090654820337e-05, "loss": 0.2343, "step": 3809 }, { "epoch": 2.1356502242152464, "grad_norm": 0.07916302803967806, "learning_rate": 4.646579756462156e-05, "loss": 0.246, "step": 3810 }, { "epoch": 2.1362107623318387, "grad_norm": 0.07706270016322592, "learning_rate": 4.641071136127001e-05, "loss": 0.2462, "step": 3811 }, { "epoch": 2.1367713004484306, "grad_norm": 0.07634153045319463, "learning_rate": 4.635564796158945e-05, "loss": 0.233, "step": 3812 }, { "epoch": 2.1373318385650224, "grad_norm": 0.07927724125092082, "learning_rate": 4.6300607389010744e-05, "loss": 0.2519, "step": 3813 }, { "epoch": 2.1378923766816142, "grad_norm": 0.07872347424418544, "learning_rate": 4.62455896669552e-05, "loss": 0.2443, "step": 3814 }, { "epoch": 2.138452914798206, "grad_norm": 0.07626789547393499, "learning_rate": 4.619059481883425e-05, "loss": 0.2394, "step": 3815 }, { "epoch": 2.1390134529147984, "grad_norm": 0.07598290348951336, "learning_rate": 4.61356228680497e-05, "loss": 0.2404, "step": 3816 }, { "epoch": 2.1395739910313902, "grad_norm": 0.07637304699987817, "learning_rate": 4.608067383799363e-05, "loss": 0.2299, "step": 3817 }, { "epoch": 2.140134529147982, "grad_norm": 0.07640569892654692, "learning_rate": 4.602574775204823e-05, "loss": 0.2452, "step": 3818 }, { "epoch": 2.140695067264574, "grad_norm": 0.07826351124159894, "learning_rate": 4.59708446335861e-05, "loss": 0.2533, "step": 3819 }, { "epoch": 2.1412556053811658, "grad_norm": 0.0755196721551444, "learning_rate": 4.59159645059699e-05, "loss": 0.2392, "step": 3820 }, { "epoch": 2.141816143497758, "grad_norm": 0.07857330298730242, "learning_rate": 4.586110739255266e-05, "loss": 0.2466, "step": 3821 }, { "epoch": 2.14237668161435, "grad_norm": 0.07687845949230376, "learning_rate": 4.580627331667747e-05, "loss": 0.2484, "step": 3822 }, { "epoch": 2.1429372197309418, "grad_norm": 0.07632208103163783, "learning_rate": 4.575146230167773e-05, "loss": 0.2339, "step": 3823 }, { "epoch": 2.1434977578475336, "grad_norm": 0.07970092291202537, "learning_rate": 4.569667437087702e-05, "loss": 0.2515, "step": 3824 }, { "epoch": 2.1440582959641254, "grad_norm": 0.07747091344484212, "learning_rate": 4.5641909547589e-05, "loss": 0.2395, "step": 3825 }, { "epoch": 2.1446188340807173, "grad_norm": 0.07660521329433977, "learning_rate": 4.558716785511764e-05, "loss": 0.2499, "step": 3826 }, { "epoch": 2.1451793721973096, "grad_norm": 0.07656104461730631, "learning_rate": 4.553244931675694e-05, "loss": 0.2388, "step": 3827 }, { "epoch": 2.1457399103139014, "grad_norm": 0.07737457472872494, "learning_rate": 4.547775395579106e-05, "loss": 0.2294, "step": 3828 }, { "epoch": 2.1463004484304933, "grad_norm": 0.07474592555709123, "learning_rate": 4.542308179549442e-05, "loss": 0.2377, "step": 3829 }, { "epoch": 2.146860986547085, "grad_norm": 0.07629029026990886, "learning_rate": 4.5368432859131395e-05, "loss": 0.246, "step": 3830 }, { "epoch": 2.147421524663677, "grad_norm": 0.08308993641254947, "learning_rate": 4.5313807169956604e-05, "loss": 0.251, "step": 3831 }, { "epoch": 2.1479820627802693, "grad_norm": 0.07435536864433591, "learning_rate": 4.5259204751214743e-05, "loss": 0.2422, "step": 3832 }, { "epoch": 2.148542600896861, "grad_norm": 0.0750009085658999, "learning_rate": 4.520462562614063e-05, "loss": 0.2458, "step": 3833 }, { "epoch": 2.149103139013453, "grad_norm": 0.07542735638758706, "learning_rate": 4.515006981795909e-05, "loss": 0.2362, "step": 3834 }, { "epoch": 2.149663677130045, "grad_norm": 0.07705255610502441, "learning_rate": 4.5095537349885055e-05, "loss": 0.2429, "step": 3835 }, { "epoch": 2.1502242152466366, "grad_norm": 0.0759154050047277, "learning_rate": 4.50410282451236e-05, "loss": 0.2407, "step": 3836 }, { "epoch": 2.1507847533632285, "grad_norm": 0.07820174924523798, "learning_rate": 4.498654252686975e-05, "loss": 0.2489, "step": 3837 }, { "epoch": 2.151345291479821, "grad_norm": 0.07898805879350591, "learning_rate": 4.493208021830867e-05, "loss": 0.2479, "step": 3838 }, { "epoch": 2.1519058295964126, "grad_norm": 0.07834726373003945, "learning_rate": 4.487764134261549e-05, "loss": 0.2428, "step": 3839 }, { "epoch": 2.1524663677130045, "grad_norm": 0.07730436457149517, "learning_rate": 4.48232259229554e-05, "loss": 0.2513, "step": 3840 }, { "epoch": 2.1530269058295963, "grad_norm": 0.07633422950781828, "learning_rate": 4.4768833982483694e-05, "loss": 0.2443, "step": 3841 }, { "epoch": 2.153587443946188, "grad_norm": 0.07596439538807573, "learning_rate": 4.471446554434548e-05, "loss": 0.2506, "step": 3842 }, { "epoch": 2.1541479820627805, "grad_norm": 0.07582511713890523, "learning_rate": 4.466012063167607e-05, "loss": 0.2319, "step": 3843 }, { "epoch": 2.1547085201793723, "grad_norm": 0.07835604156296745, "learning_rate": 4.460579926760059e-05, "loss": 0.247, "step": 3844 }, { "epoch": 2.155269058295964, "grad_norm": 0.07985378092104663, "learning_rate": 4.455150147523431e-05, "loss": 0.244, "step": 3845 }, { "epoch": 2.155829596412556, "grad_norm": 0.07705146405540986, "learning_rate": 4.449722727768233e-05, "loss": 0.251, "step": 3846 }, { "epoch": 2.156390134529148, "grad_norm": 0.07930047758046223, "learning_rate": 4.444297669803981e-05, "loss": 0.248, "step": 3847 }, { "epoch": 2.15695067264574, "grad_norm": 0.078031348593723, "learning_rate": 4.4388749759391754e-05, "loss": 0.2436, "step": 3848 }, { "epoch": 2.157511210762332, "grad_norm": 0.07754219449849102, "learning_rate": 4.433454648481321e-05, "loss": 0.2418, "step": 3849 }, { "epoch": 2.158071748878924, "grad_norm": 0.07283117307017277, "learning_rate": 4.4280366897369165e-05, "loss": 0.2351, "step": 3850 }, { "epoch": 2.1586322869955157, "grad_norm": 0.07617950654479107, "learning_rate": 4.422621102011438e-05, "loss": 0.2477, "step": 3851 }, { "epoch": 2.1591928251121075, "grad_norm": 0.07638222595701658, "learning_rate": 4.417207887609372e-05, "loss": 0.233, "step": 3852 }, { "epoch": 2.1597533632286994, "grad_norm": 0.07651077585222588, "learning_rate": 4.411797048834179e-05, "loss": 0.2258, "step": 3853 }, { "epoch": 2.1603139013452917, "grad_norm": 0.07936726239590704, "learning_rate": 4.4063885879883184e-05, "loss": 0.2394, "step": 3854 }, { "epoch": 2.1608744394618835, "grad_norm": 0.07520368120379198, "learning_rate": 4.40098250737323e-05, "loss": 0.2387, "step": 3855 }, { "epoch": 2.1614349775784754, "grad_norm": 0.07928479974436303, "learning_rate": 4.395578809289349e-05, "loss": 0.2446, "step": 3856 }, { "epoch": 2.161995515695067, "grad_norm": 0.07559575442400272, "learning_rate": 4.3901774960360964e-05, "loss": 0.2403, "step": 3857 }, { "epoch": 2.162556053811659, "grad_norm": 0.07804502998940659, "learning_rate": 4.384778569911867e-05, "loss": 0.236, "step": 3858 }, { "epoch": 2.163116591928251, "grad_norm": 0.0784335877150939, "learning_rate": 4.379382033214055e-05, "loss": 0.2541, "step": 3859 }, { "epoch": 2.163677130044843, "grad_norm": 0.07989086111430377, "learning_rate": 4.373987888239024e-05, "loss": 0.2524, "step": 3860 }, { "epoch": 2.164237668161435, "grad_norm": 0.07613239404875734, "learning_rate": 4.3685961372821336e-05, "loss": 0.2289, "step": 3861 }, { "epoch": 2.164798206278027, "grad_norm": 0.08097441844842519, "learning_rate": 4.363206782637714e-05, "loss": 0.2547, "step": 3862 }, { "epoch": 2.1653587443946187, "grad_norm": 0.0792652512977873, "learning_rate": 4.3578198265990765e-05, "loss": 0.2502, "step": 3863 }, { "epoch": 2.1659192825112106, "grad_norm": 0.07492364153520117, "learning_rate": 4.352435271458516e-05, "loss": 0.242, "step": 3864 }, { "epoch": 2.166479820627803, "grad_norm": 0.07749362513129567, "learning_rate": 4.347053119507306e-05, "loss": 0.2349, "step": 3865 }, { "epoch": 2.1670403587443947, "grad_norm": 0.07511795817075329, "learning_rate": 4.341673373035698e-05, "loss": 0.2283, "step": 3866 }, { "epoch": 2.1676008968609866, "grad_norm": 0.076819330087108, "learning_rate": 4.336296034332912e-05, "loss": 0.2445, "step": 3867 }, { "epoch": 2.1681614349775784, "grad_norm": 0.07600638908748635, "learning_rate": 4.3309211056871546e-05, "loss": 0.2452, "step": 3868 }, { "epoch": 2.1687219730941703, "grad_norm": 0.07838996109633528, "learning_rate": 4.3255485893855985e-05, "loss": 0.2417, "step": 3869 }, { "epoch": 2.1692825112107625, "grad_norm": 0.07612712342833701, "learning_rate": 4.320178487714389e-05, "loss": 0.2517, "step": 3870 }, { "epoch": 2.1698430493273544, "grad_norm": 0.07750713654226672, "learning_rate": 4.3148108029586545e-05, "loss": 0.2283, "step": 3871 }, { "epoch": 2.1704035874439462, "grad_norm": 0.07811498338255723, "learning_rate": 4.30944553740248e-05, "loss": 0.2564, "step": 3872 }, { "epoch": 2.170964125560538, "grad_norm": 0.0761621345285335, "learning_rate": 4.3040826933289335e-05, "loss": 0.236, "step": 3873 }, { "epoch": 2.17152466367713, "grad_norm": 0.07563388758275286, "learning_rate": 4.2987222730200515e-05, "loss": 0.2462, "step": 3874 }, { "epoch": 2.1720852017937218, "grad_norm": 0.07708477410440934, "learning_rate": 4.2933642787568293e-05, "loss": 0.2379, "step": 3875 }, { "epoch": 2.172645739910314, "grad_norm": 0.07971412225618608, "learning_rate": 4.288008712819243e-05, "loss": 0.2413, "step": 3876 }, { "epoch": 2.173206278026906, "grad_norm": 0.07893113768036324, "learning_rate": 4.282655577486221e-05, "loss": 0.2461, "step": 3877 }, { "epoch": 2.1737668161434978, "grad_norm": 0.0758942562754993, "learning_rate": 4.2773048750356716e-05, "loss": 0.2466, "step": 3878 }, { "epoch": 2.1743273542600896, "grad_norm": 0.07729882209380876, "learning_rate": 4.2719566077444565e-05, "loss": 0.2308, "step": 3879 }, { "epoch": 2.1748878923766815, "grad_norm": 0.07908419518265027, "learning_rate": 4.2666107778884065e-05, "loss": 0.2475, "step": 3880 }, { "epoch": 2.1754484304932737, "grad_norm": 0.07714408912918791, "learning_rate": 4.261267387742323e-05, "loss": 0.2386, "step": 3881 }, { "epoch": 2.1760089686098656, "grad_norm": 0.07845478885021406, "learning_rate": 4.255926439579948e-05, "loss": 0.2409, "step": 3882 }, { "epoch": 2.1765695067264574, "grad_norm": 0.07785859360485685, "learning_rate": 4.250587935674009e-05, "loss": 0.2349, "step": 3883 }, { "epoch": 2.1771300448430493, "grad_norm": 0.07751788254387426, "learning_rate": 4.245251878296171e-05, "loss": 0.2372, "step": 3884 }, { "epoch": 2.177690582959641, "grad_norm": 0.07727111253762252, "learning_rate": 4.2399182697170806e-05, "loss": 0.2511, "step": 3885 }, { "epoch": 2.178251121076233, "grad_norm": 0.0813134242476208, "learning_rate": 4.234587112206317e-05, "loss": 0.2483, "step": 3886 }, { "epoch": 2.1788116591928253, "grad_norm": 0.08104585169424193, "learning_rate": 4.2292584080324424e-05, "loss": 0.2412, "step": 3887 }, { "epoch": 2.179372197309417, "grad_norm": 0.07784345841521798, "learning_rate": 4.223932159462954e-05, "loss": 0.237, "step": 3888 }, { "epoch": 2.179932735426009, "grad_norm": 0.07412396478730746, "learning_rate": 4.218608368764314e-05, "loss": 0.2407, "step": 3889 }, { "epoch": 2.180493273542601, "grad_norm": 0.07413010645194912, "learning_rate": 4.213287038201943e-05, "loss": 0.2491, "step": 3890 }, { "epoch": 2.1810538116591927, "grad_norm": 0.08052501546387542, "learning_rate": 4.207968170040202e-05, "loss": 0.235, "step": 3891 }, { "epoch": 2.181614349775785, "grad_norm": 0.07749474283314743, "learning_rate": 4.202651766542416e-05, "loss": 0.2441, "step": 3892 }, { "epoch": 2.182174887892377, "grad_norm": 0.07857814645642697, "learning_rate": 4.197337829970852e-05, "loss": 0.2527, "step": 3893 }, { "epoch": 2.1827354260089686, "grad_norm": 0.07543219070462921, "learning_rate": 4.1920263625867364e-05, "loss": 0.2422, "step": 3894 }, { "epoch": 2.1832959641255605, "grad_norm": 0.07617937414399167, "learning_rate": 4.1867173666502393e-05, "loss": 0.2334, "step": 3895 }, { "epoch": 2.1838565022421523, "grad_norm": 0.07993840930599429, "learning_rate": 4.181410844420474e-05, "loss": 0.2561, "step": 3896 }, { "epoch": 2.1844170403587446, "grad_norm": 0.07924758171821018, "learning_rate": 4.1761067981555114e-05, "loss": 0.247, "step": 3897 }, { "epoch": 2.1849775784753365, "grad_norm": 0.07495089799120876, "learning_rate": 4.170805230112366e-05, "loss": 0.2383, "step": 3898 }, { "epoch": 2.1855381165919283, "grad_norm": 0.07727828477574174, "learning_rate": 4.1655061425469976e-05, "loss": 0.2459, "step": 3899 }, { "epoch": 2.18609865470852, "grad_norm": 0.07555327292191727, "learning_rate": 4.160209537714304e-05, "loss": 0.2469, "step": 3900 }, { "epoch": 2.186659192825112, "grad_norm": 0.07592763275951135, "learning_rate": 4.154915417868137e-05, "loss": 0.2331, "step": 3901 }, { "epoch": 2.187219730941704, "grad_norm": 0.07644350710027373, "learning_rate": 4.149623785261284e-05, "loss": 0.2364, "step": 3902 }, { "epoch": 2.187780269058296, "grad_norm": 0.07419899351884267, "learning_rate": 4.1443346421454724e-05, "loss": 0.239, "step": 3903 }, { "epoch": 2.188340807174888, "grad_norm": 0.07910748606280418, "learning_rate": 4.139047990771378e-05, "loss": 0.254, "step": 3904 }, { "epoch": 2.18890134529148, "grad_norm": 0.07852864663680223, "learning_rate": 4.133763833388609e-05, "loss": 0.2353, "step": 3905 }, { "epoch": 2.1894618834080717, "grad_norm": 0.07932087877745334, "learning_rate": 4.128482172245715e-05, "loss": 0.2437, "step": 3906 }, { "epoch": 2.1900224215246635, "grad_norm": 0.0758056247353558, "learning_rate": 4.123203009590185e-05, "loss": 0.2443, "step": 3907 }, { "epoch": 2.1905829596412554, "grad_norm": 0.07805058809414696, "learning_rate": 4.1179263476684474e-05, "loss": 0.2469, "step": 3908 }, { "epoch": 2.1911434977578477, "grad_norm": 0.07674673233219874, "learning_rate": 4.112652188725859e-05, "loss": 0.2429, "step": 3909 }, { "epoch": 2.1917040358744395, "grad_norm": 0.07350573510538515, "learning_rate": 4.1073805350067096e-05, "loss": 0.2293, "step": 3910 }, { "epoch": 2.1922645739910314, "grad_norm": 0.07935859742013034, "learning_rate": 4.102111388754238e-05, "loss": 0.2517, "step": 3911 }, { "epoch": 2.192825112107623, "grad_norm": 0.07596313105262871, "learning_rate": 4.096844752210598e-05, "loss": 0.2479, "step": 3912 }, { "epoch": 2.193385650224215, "grad_norm": 0.07985349632362626, "learning_rate": 4.091580627616888e-05, "loss": 0.237, "step": 3913 }, { "epoch": 2.1939461883408073, "grad_norm": 0.07661541764495726, "learning_rate": 4.0863190172131364e-05, "loss": 0.2542, "step": 3914 }, { "epoch": 2.194506726457399, "grad_norm": 0.07762540071070838, "learning_rate": 4.0810599232382916e-05, "loss": 0.2454, "step": 3915 }, { "epoch": 2.195067264573991, "grad_norm": 0.07787073005395762, "learning_rate": 4.075803347930245e-05, "loss": 0.2408, "step": 3916 }, { "epoch": 2.195627802690583, "grad_norm": 0.07657366919537804, "learning_rate": 4.070549293525804e-05, "loss": 0.2447, "step": 3917 }, { "epoch": 2.1961883408071747, "grad_norm": 0.07724671822423297, "learning_rate": 4.0652977622607145e-05, "loss": 0.2481, "step": 3918 }, { "epoch": 2.196748878923767, "grad_norm": 0.07524027479414251, "learning_rate": 4.0600487563696364e-05, "loss": 0.2358, "step": 3919 }, { "epoch": 2.197309417040359, "grad_norm": 0.077028529564304, "learning_rate": 4.054802278086168e-05, "loss": 0.2359, "step": 3920 }, { "epoch": 2.1978699551569507, "grad_norm": 0.0785710538200137, "learning_rate": 4.0495583296428205e-05, "loss": 0.2427, "step": 3921 }, { "epoch": 2.1984304932735426, "grad_norm": 0.07816972861165873, "learning_rate": 4.044316913271036e-05, "loss": 0.2414, "step": 3922 }, { "epoch": 2.1989910313901344, "grad_norm": 0.07692316529475866, "learning_rate": 4.03907803120118e-05, "loss": 0.2464, "step": 3923 }, { "epoch": 2.1995515695067263, "grad_norm": 0.07748236072229979, "learning_rate": 4.0338416856625294e-05, "loss": 0.2331, "step": 3924 }, { "epoch": 2.2001121076233185, "grad_norm": 0.0769074810556553, "learning_rate": 4.028607878883297e-05, "loss": 0.2453, "step": 3925 }, { "epoch": 2.2006726457399104, "grad_norm": 0.07879946810562749, "learning_rate": 4.023376613090599e-05, "loss": 0.2473, "step": 3926 }, { "epoch": 2.2012331838565022, "grad_norm": 0.07608325617460276, "learning_rate": 4.018147890510486e-05, "loss": 0.2277, "step": 3927 }, { "epoch": 2.201793721973094, "grad_norm": 0.0784418091050367, "learning_rate": 4.012921713367916e-05, "loss": 0.2517, "step": 3928 }, { "epoch": 2.202354260089686, "grad_norm": 0.07802357173766879, "learning_rate": 4.0076980838867625e-05, "loss": 0.2444, "step": 3929 }, { "epoch": 2.2029147982062782, "grad_norm": 0.08036596867295931, "learning_rate": 4.0024770042898215e-05, "loss": 0.2368, "step": 3930 }, { "epoch": 2.20347533632287, "grad_norm": 0.07627521653760704, "learning_rate": 3.997258476798804e-05, "loss": 0.2478, "step": 3931 }, { "epoch": 2.204035874439462, "grad_norm": 0.07637266150754995, "learning_rate": 3.9920425036343344e-05, "loss": 0.2396, "step": 3932 }, { "epoch": 2.2045964125560538, "grad_norm": 0.0757979635557969, "learning_rate": 3.9868290870159405e-05, "loss": 0.2372, "step": 3933 }, { "epoch": 2.2051569506726456, "grad_norm": 0.0758271305367863, "learning_rate": 3.98161822916208e-05, "loss": 0.2315, "step": 3934 }, { "epoch": 2.2057174887892375, "grad_norm": 0.07840895001099792, "learning_rate": 3.9764099322901047e-05, "loss": 0.243, "step": 3935 }, { "epoch": 2.2062780269058297, "grad_norm": 0.07792711393841163, "learning_rate": 3.971204198616284e-05, "loss": 0.2458, "step": 3936 }, { "epoch": 2.2068385650224216, "grad_norm": 0.07779689081296363, "learning_rate": 3.9660010303558005e-05, "loss": 0.256, "step": 3937 }, { "epoch": 2.2073991031390134, "grad_norm": 0.07770239643017164, "learning_rate": 3.960800429722734e-05, "loss": 0.2391, "step": 3938 }, { "epoch": 2.2079596412556053, "grad_norm": 0.07759603142305241, "learning_rate": 3.955602398930084e-05, "loss": 0.2393, "step": 3939 }, { "epoch": 2.208520179372197, "grad_norm": 0.07864454186406615, "learning_rate": 3.9504069401897505e-05, "loss": 0.2356, "step": 3940 }, { "epoch": 2.2090807174887894, "grad_norm": 0.07917369685068788, "learning_rate": 3.9452140557125435e-05, "loss": 0.243, "step": 3941 }, { "epoch": 2.2096412556053813, "grad_norm": 0.0769382735363664, "learning_rate": 3.940023747708169e-05, "loss": 0.2377, "step": 3942 }, { "epoch": 2.210201793721973, "grad_norm": 0.07825002371469891, "learning_rate": 3.934836018385239e-05, "loss": 0.2526, "step": 3943 }, { "epoch": 2.210762331838565, "grad_norm": 0.07734985454503165, "learning_rate": 3.929650869951278e-05, "loss": 0.2437, "step": 3944 }, { "epoch": 2.211322869955157, "grad_norm": 0.07900761978227461, "learning_rate": 3.924468304612696e-05, "loss": 0.2369, "step": 3945 }, { "epoch": 2.211883408071749, "grad_norm": 0.07796886832267623, "learning_rate": 3.9192883245748194e-05, "loss": 0.2529, "step": 3946 }, { "epoch": 2.212443946188341, "grad_norm": 0.07774089387545376, "learning_rate": 3.914110932041865e-05, "loss": 0.2356, "step": 3947 }, { "epoch": 2.213004484304933, "grad_norm": 0.07669595893560695, "learning_rate": 3.908936129216955e-05, "loss": 0.2441, "step": 3948 }, { "epoch": 2.2135650224215246, "grad_norm": 0.07694441354189155, "learning_rate": 3.903763918302104e-05, "loss": 0.2418, "step": 3949 }, { "epoch": 2.2141255605381165, "grad_norm": 0.0784777046425361, "learning_rate": 3.898594301498221e-05, "loss": 0.248, "step": 3950 }, { "epoch": 2.2146860986547083, "grad_norm": 0.07805312335295722, "learning_rate": 3.893427281005122e-05, "loss": 0.2381, "step": 3951 }, { "epoch": 2.2152466367713006, "grad_norm": 0.08146147369591868, "learning_rate": 3.8882628590215074e-05, "loss": 0.2508, "step": 3952 }, { "epoch": 2.2158071748878925, "grad_norm": 0.07740833226057425, "learning_rate": 3.8831010377449816e-05, "loss": 0.2498, "step": 3953 }, { "epoch": 2.2163677130044843, "grad_norm": 0.07565218389561658, "learning_rate": 3.877941819372031e-05, "loss": 0.2353, "step": 3954 }, { "epoch": 2.216928251121076, "grad_norm": 0.0787770429773584, "learning_rate": 3.8727852060980444e-05, "loss": 0.2499, "step": 3955 }, { "epoch": 2.217488789237668, "grad_norm": 0.0799115347001249, "learning_rate": 3.8676312001173e-05, "loss": 0.2496, "step": 3956 }, { "epoch": 2.21804932735426, "grad_norm": 0.07721801439002256, "learning_rate": 3.862479803622958e-05, "loss": 0.2416, "step": 3957 }, { "epoch": 2.218609865470852, "grad_norm": 0.07818729106979565, "learning_rate": 3.8573310188070845e-05, "loss": 0.2546, "step": 3958 }, { "epoch": 2.219170403587444, "grad_norm": 0.07966364760612964, "learning_rate": 3.852184847860615e-05, "loss": 0.2352, "step": 3959 }, { "epoch": 2.219730941704036, "grad_norm": 0.0773622586431013, "learning_rate": 3.84704129297339e-05, "loss": 0.2435, "step": 3960 }, { "epoch": 2.2202914798206277, "grad_norm": 0.07706245453058086, "learning_rate": 3.841900356334127e-05, "loss": 0.2413, "step": 3961 }, { "epoch": 2.2208520179372195, "grad_norm": 0.07925653446803047, "learning_rate": 3.836762040130426e-05, "loss": 0.2443, "step": 3962 }, { "epoch": 2.221412556053812, "grad_norm": 0.07597662986644867, "learning_rate": 3.8316263465487834e-05, "loss": 0.248, "step": 3963 }, { "epoch": 2.2219730941704037, "grad_norm": 0.07965193025099362, "learning_rate": 3.826493277774572e-05, "loss": 0.2519, "step": 3964 }, { "epoch": 2.2225336322869955, "grad_norm": 0.07974783423100505, "learning_rate": 3.821362835992053e-05, "loss": 0.2491, "step": 3965 }, { "epoch": 2.2230941704035874, "grad_norm": 0.07617788752007221, "learning_rate": 3.81623502338436e-05, "loss": 0.2458, "step": 3966 }, { "epoch": 2.223654708520179, "grad_norm": 0.07716877427571417, "learning_rate": 3.81110984213352e-05, "loss": 0.2365, "step": 3967 }, { "epoch": 2.2242152466367715, "grad_norm": 0.07925341243024847, "learning_rate": 3.8059872944204324e-05, "loss": 0.2443, "step": 3968 }, { "epoch": 2.2247757847533634, "grad_norm": 0.07883561549575804, "learning_rate": 3.800867382424872e-05, "loss": 0.2469, "step": 3969 }, { "epoch": 2.225336322869955, "grad_norm": 0.07655529733887344, "learning_rate": 3.7957501083255065e-05, "loss": 0.2546, "step": 3970 }, { "epoch": 2.225896860986547, "grad_norm": 0.07559675606024821, "learning_rate": 3.7906354742998654e-05, "loss": 0.2391, "step": 3971 }, { "epoch": 2.226457399103139, "grad_norm": 0.0784403741646514, "learning_rate": 3.7855234825243644e-05, "loss": 0.2564, "step": 3972 }, { "epoch": 2.2270179372197307, "grad_norm": 0.07677942678759365, "learning_rate": 3.7804141351742925e-05, "loss": 0.2369, "step": 3973 }, { "epoch": 2.227578475336323, "grad_norm": 0.07786561126409022, "learning_rate": 3.775307434423818e-05, "loss": 0.2488, "step": 3974 }, { "epoch": 2.228139013452915, "grad_norm": 0.07733142719653782, "learning_rate": 3.770203382445974e-05, "loss": 0.2412, "step": 3975 }, { "epoch": 2.2286995515695067, "grad_norm": 0.07510479482687211, "learning_rate": 3.7651019814126654e-05, "loss": 0.2404, "step": 3976 }, { "epoch": 2.2292600896860986, "grad_norm": 0.0793733886983191, "learning_rate": 3.760003233494683e-05, "loss": 0.2479, "step": 3977 }, { "epoch": 2.2298206278026904, "grad_norm": 0.07943863183490149, "learning_rate": 3.754907140861674e-05, "loss": 0.2377, "step": 3978 }, { "epoch": 2.2303811659192827, "grad_norm": 0.07476932550523543, "learning_rate": 3.7498137056821634e-05, "loss": 0.2298, "step": 3979 }, { "epoch": 2.2309417040358746, "grad_norm": 0.07930513376330914, "learning_rate": 3.7447229301235445e-05, "loss": 0.2424, "step": 3980 }, { "epoch": 2.2315022421524664, "grad_norm": 0.07769688844282986, "learning_rate": 3.739634816352081e-05, "loss": 0.2449, "step": 3981 }, { "epoch": 2.2320627802690582, "grad_norm": 0.07778542276378456, "learning_rate": 3.734549366532898e-05, "loss": 0.2431, "step": 3982 }, { "epoch": 2.23262331838565, "grad_norm": 0.07605549064102234, "learning_rate": 3.7294665828299856e-05, "loss": 0.2453, "step": 3983 }, { "epoch": 2.233183856502242, "grad_norm": 0.07560550393605212, "learning_rate": 3.724386467406211e-05, "loss": 0.236, "step": 3984 }, { "epoch": 2.2337443946188342, "grad_norm": 0.0778699111104112, "learning_rate": 3.719309022423293e-05, "loss": 0.2453, "step": 3985 }, { "epoch": 2.234304932735426, "grad_norm": 0.07691655621653959, "learning_rate": 3.7142342500418256e-05, "loss": 0.245, "step": 3986 }, { "epoch": 2.234865470852018, "grad_norm": 0.07839221092373565, "learning_rate": 3.709162152421253e-05, "loss": 0.2417, "step": 3987 }, { "epoch": 2.2354260089686098, "grad_norm": 0.07837476833048565, "learning_rate": 3.704092731719892e-05, "loss": 0.2396, "step": 3988 }, { "epoch": 2.2359865470852016, "grad_norm": 0.07892332509063256, "learning_rate": 3.699025990094919e-05, "loss": 0.236, "step": 3989 }, { "epoch": 2.236547085201794, "grad_norm": 0.07867162636838011, "learning_rate": 3.6939619297023595e-05, "loss": 0.2428, "step": 3990 }, { "epoch": 2.2371076233183858, "grad_norm": 0.07895061423186442, "learning_rate": 3.688900552697115e-05, "loss": 0.2637, "step": 3991 }, { "epoch": 2.2376681614349776, "grad_norm": 0.07699857519011867, "learning_rate": 3.6838418612329305e-05, "loss": 0.2445, "step": 3992 }, { "epoch": 2.2382286995515694, "grad_norm": 0.07663795542669885, "learning_rate": 3.6787858574624176e-05, "loss": 0.2426, "step": 3993 }, { "epoch": 2.2387892376681613, "grad_norm": 0.07848411722853481, "learning_rate": 3.6737325435370374e-05, "loss": 0.2631, "step": 3994 }, { "epoch": 2.2393497757847536, "grad_norm": 0.07971443492816431, "learning_rate": 3.668681921607113e-05, "loss": 0.2494, "step": 3995 }, { "epoch": 2.2399103139013454, "grad_norm": 0.07852676713453574, "learning_rate": 3.663633993821816e-05, "loss": 0.2538, "step": 3996 }, { "epoch": 2.2404708520179373, "grad_norm": 0.07840943264658039, "learning_rate": 3.658588762329174e-05, "loss": 0.2365, "step": 3997 }, { "epoch": 2.241031390134529, "grad_norm": 0.07743865110849929, "learning_rate": 3.6535462292760715e-05, "loss": 0.2517, "step": 3998 }, { "epoch": 2.241591928251121, "grad_norm": 0.07988589540027931, "learning_rate": 3.6485063968082344e-05, "loss": 0.2498, "step": 3999 }, { "epoch": 2.242152466367713, "grad_norm": 0.07613715602980295, "learning_rate": 3.6434692670702545e-05, "loss": 0.2418, "step": 4000 }, { "epoch": 2.242713004484305, "grad_norm": 0.07606313247057905, "learning_rate": 3.638434842205558e-05, "loss": 0.2306, "step": 4001 }, { "epoch": 2.243273542600897, "grad_norm": 0.07720011259500538, "learning_rate": 3.633403124356426e-05, "loss": 0.2387, "step": 4002 }, { "epoch": 2.243834080717489, "grad_norm": 0.07846912534834977, "learning_rate": 3.628374115663995e-05, "loss": 0.2506, "step": 4003 }, { "epoch": 2.2443946188340806, "grad_norm": 0.07776662326654782, "learning_rate": 3.6233478182682345e-05, "loss": 0.2345, "step": 4004 }, { "epoch": 2.2449551569506725, "grad_norm": 0.07540971408911927, "learning_rate": 3.618324234307973e-05, "loss": 0.2373, "step": 4005 }, { "epoch": 2.2455156950672643, "grad_norm": 0.077880691647363, "learning_rate": 3.613303365920877e-05, "loss": 0.2412, "step": 4006 }, { "epoch": 2.2460762331838566, "grad_norm": 0.08161547694039394, "learning_rate": 3.6082852152434646e-05, "loss": 0.2541, "step": 4007 }, { "epoch": 2.2466367713004485, "grad_norm": 0.07710773873508162, "learning_rate": 3.60326978441109e-05, "loss": 0.2378, "step": 4008 }, { "epoch": 2.2471973094170403, "grad_norm": 0.07698233339569654, "learning_rate": 3.598257075557948e-05, "loss": 0.2324, "step": 4009 }, { "epoch": 2.247757847533632, "grad_norm": 0.07960604665158132, "learning_rate": 3.593247090817088e-05, "loss": 0.2459, "step": 4010 }, { "epoch": 2.248318385650224, "grad_norm": 0.07303690685007934, "learning_rate": 3.5882398323203834e-05, "loss": 0.2406, "step": 4011 }, { "epoch": 2.2488789237668163, "grad_norm": 0.07989948416525376, "learning_rate": 3.583235302198562e-05, "loss": 0.2444, "step": 4012 }, { "epoch": 2.249439461883408, "grad_norm": 0.0768515749305458, "learning_rate": 3.578233502581183e-05, "loss": 0.2423, "step": 4013 }, { "epoch": 2.25, "grad_norm": 0.07962415092933411, "learning_rate": 3.5732344355966494e-05, "loss": 0.2408, "step": 4014 }, { "epoch": 2.250560538116592, "grad_norm": 0.07901083134605137, "learning_rate": 3.5682381033721944e-05, "loss": 0.2372, "step": 4015 }, { "epoch": 2.2511210762331837, "grad_norm": 0.08134667818885108, "learning_rate": 3.563244508033887e-05, "loss": 0.2536, "step": 4016 }, { "epoch": 2.251681614349776, "grad_norm": 0.07714038989261836, "learning_rate": 3.558253651706641e-05, "loss": 0.2412, "step": 4017 }, { "epoch": 2.252242152466368, "grad_norm": 0.07646345861859105, "learning_rate": 3.5532655365141934e-05, "loss": 0.2328, "step": 4018 }, { "epoch": 2.2528026905829597, "grad_norm": 0.07513654052258976, "learning_rate": 3.548280164579126e-05, "loss": 0.2421, "step": 4019 }, { "epoch": 2.2533632286995515, "grad_norm": 0.0767600702398758, "learning_rate": 3.543297538022842e-05, "loss": 0.2507, "step": 4020 }, { "epoch": 2.2539237668161434, "grad_norm": 0.07794866454848276, "learning_rate": 3.538317658965583e-05, "loss": 0.2428, "step": 4021 }, { "epoch": 2.2544843049327357, "grad_norm": 0.07751718657953899, "learning_rate": 3.533340529526426e-05, "loss": 0.2464, "step": 4022 }, { "epoch": 2.2550448430493275, "grad_norm": 0.07890045797164695, "learning_rate": 3.5283661518232635e-05, "loss": 0.2464, "step": 4023 }, { "epoch": 2.2556053811659194, "grad_norm": 0.07771274932781744, "learning_rate": 3.523394527972833e-05, "loss": 0.2446, "step": 4024 }, { "epoch": 2.256165919282511, "grad_norm": 0.07799172469865923, "learning_rate": 3.5184256600906885e-05, "loss": 0.2492, "step": 4025 }, { "epoch": 2.256726457399103, "grad_norm": 0.07770715943505946, "learning_rate": 3.513459550291219e-05, "loss": 0.2519, "step": 4026 }, { "epoch": 2.257286995515695, "grad_norm": 0.08006532274437009, "learning_rate": 3.508496200687633e-05, "loss": 0.2526, "step": 4027 }, { "epoch": 2.257847533632287, "grad_norm": 0.07499003121362369, "learning_rate": 3.503535613391973e-05, "loss": 0.2264, "step": 4028 }, { "epoch": 2.258408071748879, "grad_norm": 0.07983255025461201, "learning_rate": 3.498577790515095e-05, "loss": 0.2325, "step": 4029 }, { "epoch": 2.258968609865471, "grad_norm": 0.08010545958973918, "learning_rate": 3.493622734166688e-05, "loss": 0.2532, "step": 4030 }, { "epoch": 2.2595291479820627, "grad_norm": 0.076604938351707, "learning_rate": 3.4886704464552635e-05, "loss": 0.2427, "step": 4031 }, { "epoch": 2.2600896860986546, "grad_norm": 0.07817015284831182, "learning_rate": 3.4837209294881467e-05, "loss": 0.2406, "step": 4032 }, { "epoch": 2.2606502242152464, "grad_norm": 0.07822138725494229, "learning_rate": 3.478774185371494e-05, "loss": 0.2405, "step": 4033 }, { "epoch": 2.2612107623318387, "grad_norm": 0.07386648586034597, "learning_rate": 3.473830216210271e-05, "loss": 0.2356, "step": 4034 }, { "epoch": 2.2617713004484306, "grad_norm": 0.07914046299505863, "learning_rate": 3.468889024108275e-05, "loss": 0.2425, "step": 4035 }, { "epoch": 2.2623318385650224, "grad_norm": 0.07792349004039284, "learning_rate": 3.463950611168111e-05, "loss": 0.2333, "step": 4036 }, { "epoch": 2.2628923766816142, "grad_norm": 0.08293485491792615, "learning_rate": 3.459014979491203e-05, "loss": 0.2426, "step": 4037 }, { "epoch": 2.263452914798206, "grad_norm": 0.07648159565150504, "learning_rate": 3.454082131177797e-05, "loss": 0.2391, "step": 4038 }, { "epoch": 2.2640134529147984, "grad_norm": 0.07604430237799964, "learning_rate": 3.449152068326951e-05, "loss": 0.2347, "step": 4039 }, { "epoch": 2.2645739910313902, "grad_norm": 0.08036645521485034, "learning_rate": 3.4442247930365426e-05, "loss": 0.2496, "step": 4040 }, { "epoch": 2.265134529147982, "grad_norm": 0.07581613443950523, "learning_rate": 3.439300307403254e-05, "loss": 0.2427, "step": 4041 }, { "epoch": 2.265695067264574, "grad_norm": 0.0795612256663696, "learning_rate": 3.434378613522582e-05, "loss": 0.2444, "step": 4042 }, { "epoch": 2.2662556053811658, "grad_norm": 0.07958407744060352, "learning_rate": 3.429459713488846e-05, "loss": 0.2491, "step": 4043 }, { "epoch": 2.266816143497758, "grad_norm": 0.08036965217293619, "learning_rate": 3.424543609395162e-05, "loss": 0.2401, "step": 4044 }, { "epoch": 2.26737668161435, "grad_norm": 0.07644904034051768, "learning_rate": 3.419630303333466e-05, "loss": 0.2329, "step": 4045 }, { "epoch": 2.2679372197309418, "grad_norm": 0.07495170930821564, "learning_rate": 3.4147197973945035e-05, "loss": 0.2373, "step": 4046 }, { "epoch": 2.2684977578475336, "grad_norm": 0.07948384758363594, "learning_rate": 3.409812093667826e-05, "loss": 0.2321, "step": 4047 }, { "epoch": 2.2690582959641254, "grad_norm": 0.08064903002594626, "learning_rate": 3.4049071942417896e-05, "loss": 0.2487, "step": 4048 }, { "epoch": 2.2696188340807173, "grad_norm": 0.07667457565003931, "learning_rate": 3.400005101203557e-05, "loss": 0.2445, "step": 4049 }, { "epoch": 2.2701793721973096, "grad_norm": 0.08096075945625165, "learning_rate": 3.395105816639106e-05, "loss": 0.2482, "step": 4050 }, { "epoch": 2.2707399103139014, "grad_norm": 0.0802842618493245, "learning_rate": 3.390209342633205e-05, "loss": 0.2364, "step": 4051 }, { "epoch": 2.2713004484304933, "grad_norm": 0.07799608919902139, "learning_rate": 3.385315681269443e-05, "loss": 0.2484, "step": 4052 }, { "epoch": 2.271860986547085, "grad_norm": 0.07986379349658052, "learning_rate": 3.380424834630196e-05, "loss": 0.234, "step": 4053 }, { "epoch": 2.272421524663677, "grad_norm": 0.07868640209787364, "learning_rate": 3.375536804796652e-05, "loss": 0.2475, "step": 4054 }, { "epoch": 2.272982062780269, "grad_norm": 0.07977117505183345, "learning_rate": 3.370651593848802e-05, "loss": 0.2426, "step": 4055 }, { "epoch": 2.273542600896861, "grad_norm": 0.07978111905869915, "learning_rate": 3.365769203865425e-05, "loss": 0.2493, "step": 4056 }, { "epoch": 2.274103139013453, "grad_norm": 0.07815652824547649, "learning_rate": 3.3608896369241196e-05, "loss": 0.2525, "step": 4057 }, { "epoch": 2.274663677130045, "grad_norm": 0.07897625992101927, "learning_rate": 3.356012895101259e-05, "loss": 0.2411, "step": 4058 }, { "epoch": 2.2752242152466366, "grad_norm": 0.07796984568897937, "learning_rate": 3.351138980472038e-05, "loss": 0.2469, "step": 4059 }, { "epoch": 2.2757847533632285, "grad_norm": 0.07554704922149767, "learning_rate": 3.346267895110429e-05, "loss": 0.2318, "step": 4060 }, { "epoch": 2.276345291479821, "grad_norm": 0.07803333492071318, "learning_rate": 3.3413996410892143e-05, "loss": 0.255, "step": 4061 }, { "epoch": 2.2769058295964126, "grad_norm": 0.07812744126921167, "learning_rate": 3.336534220479961e-05, "loss": 0.2368, "step": 4062 }, { "epoch": 2.2774663677130045, "grad_norm": 0.08184618250853364, "learning_rate": 3.331671635353037e-05, "loss": 0.2407, "step": 4063 }, { "epoch": 2.2780269058295963, "grad_norm": 0.07650302276882859, "learning_rate": 3.3268118877776066e-05, "loss": 0.24, "step": 4064 }, { "epoch": 2.278587443946188, "grad_norm": 0.07648967362677596, "learning_rate": 3.3219549798216145e-05, "loss": 0.2434, "step": 4065 }, { "epoch": 2.2791479820627805, "grad_norm": 0.07454490616444838, "learning_rate": 3.317100913551812e-05, "loss": 0.2352, "step": 4066 }, { "epoch": 2.2797085201793723, "grad_norm": 0.07706418209665264, "learning_rate": 3.3122496910337245e-05, "loss": 0.2398, "step": 4067 }, { "epoch": 2.280269058295964, "grad_norm": 0.07769346623390297, "learning_rate": 3.307401314331686e-05, "loss": 0.2463, "step": 4068 }, { "epoch": 2.280829596412556, "grad_norm": 0.07761138116810193, "learning_rate": 3.302555785508802e-05, "loss": 0.2474, "step": 4069 }, { "epoch": 2.281390134529148, "grad_norm": 0.07802716154311114, "learning_rate": 3.297713106626978e-05, "loss": 0.2421, "step": 4070 }, { "epoch": 2.28195067264574, "grad_norm": 0.07665101677824855, "learning_rate": 3.292873279746906e-05, "loss": 0.2353, "step": 4071 }, { "epoch": 2.282511210762332, "grad_norm": 0.08033235546011366, "learning_rate": 3.288036306928055e-05, "loss": 0.2482, "step": 4072 }, { "epoch": 2.283071748878924, "grad_norm": 0.07805004891580285, "learning_rate": 3.283202190228692e-05, "loss": 0.2437, "step": 4073 }, { "epoch": 2.2836322869955157, "grad_norm": 0.07677009588242084, "learning_rate": 3.2783709317058575e-05, "loss": 0.2425, "step": 4074 }, { "epoch": 2.2841928251121075, "grad_norm": 0.07843372585241097, "learning_rate": 3.273542533415386e-05, "loss": 0.2429, "step": 4075 }, { "epoch": 2.2847533632286994, "grad_norm": 0.0780651584816468, "learning_rate": 3.268716997411886e-05, "loss": 0.2466, "step": 4076 }, { "epoch": 2.2853139013452917, "grad_norm": 0.07784304696239416, "learning_rate": 3.26389432574875e-05, "loss": 0.2421, "step": 4077 }, { "epoch": 2.2858744394618835, "grad_norm": 0.07751781105207137, "learning_rate": 3.2590745204781534e-05, "loss": 0.244, "step": 4078 }, { "epoch": 2.2864349775784754, "grad_norm": 0.07810555174866082, "learning_rate": 3.2542575836510556e-05, "loss": 0.2481, "step": 4079 }, { "epoch": 2.286995515695067, "grad_norm": 0.07740809999962645, "learning_rate": 3.249443517317194e-05, "loss": 0.246, "step": 4080 }, { "epoch": 2.287556053811659, "grad_norm": 0.07497482770784392, "learning_rate": 3.244632323525074e-05, "loss": 0.2355, "step": 4081 }, { "epoch": 2.288116591928251, "grad_norm": 0.07732100446605851, "learning_rate": 3.239824004321995e-05, "loss": 0.2454, "step": 4082 }, { "epoch": 2.288677130044843, "grad_norm": 0.07840414159510177, "learning_rate": 3.235018561754022e-05, "loss": 0.2391, "step": 4083 }, { "epoch": 2.289237668161435, "grad_norm": 0.07943565519059993, "learning_rate": 3.230215997865993e-05, "loss": 0.2434, "step": 4084 }, { "epoch": 2.289798206278027, "grad_norm": 0.0799020551470384, "learning_rate": 3.225416314701537e-05, "loss": 0.2497, "step": 4085 }, { "epoch": 2.2903587443946187, "grad_norm": 0.07862871878240238, "learning_rate": 3.220619514303037e-05, "loss": 0.2257, "step": 4086 }, { "epoch": 2.2909192825112106, "grad_norm": 0.07736652159487113, "learning_rate": 3.2158255987116656e-05, "loss": 0.2295, "step": 4087 }, { "epoch": 2.291479820627803, "grad_norm": 0.08090013758807421, "learning_rate": 3.211034569967365e-05, "loss": 0.2512, "step": 4088 }, { "epoch": 2.2920403587443947, "grad_norm": 0.07723418842114685, "learning_rate": 3.2062464301088366e-05, "loss": 0.2491, "step": 4089 }, { "epoch": 2.2926008968609866, "grad_norm": 0.07906858623019813, "learning_rate": 3.2014611811735695e-05, "loss": 0.2602, "step": 4090 }, { "epoch": 2.2931614349775784, "grad_norm": 0.07912332090476885, "learning_rate": 3.19667882519781e-05, "loss": 0.2425, "step": 4091 }, { "epoch": 2.2937219730941703, "grad_norm": 0.07859642615535287, "learning_rate": 3.191899364216581e-05, "loss": 0.2433, "step": 4092 }, { "epoch": 2.2942825112107625, "grad_norm": 0.0796324284676464, "learning_rate": 3.187122800263667e-05, "loss": 0.2522, "step": 4093 }, { "epoch": 2.2948430493273544, "grad_norm": 0.0744524204632941, "learning_rate": 3.182349135371627e-05, "loss": 0.2306, "step": 4094 }, { "epoch": 2.2954035874439462, "grad_norm": 0.07777123457265987, "learning_rate": 3.17757837157178e-05, "loss": 0.2513, "step": 4095 }, { "epoch": 2.295964125560538, "grad_norm": 0.07524779879726812, "learning_rate": 3.172810510894213e-05, "loss": 0.243, "step": 4096 }, { "epoch": 2.29652466367713, "grad_norm": 0.08221702157434065, "learning_rate": 3.1680455553677824e-05, "loss": 0.2523, "step": 4097 }, { "epoch": 2.297085201793722, "grad_norm": 0.07822118201964864, "learning_rate": 3.1632835070200975e-05, "loss": 0.2396, "step": 4098 }, { "epoch": 2.297645739910314, "grad_norm": 0.07892494483631408, "learning_rate": 3.158524367877543e-05, "loss": 0.2366, "step": 4099 }, { "epoch": 2.298206278026906, "grad_norm": 0.07793237181027066, "learning_rate": 3.153768139965253e-05, "loss": 0.2343, "step": 4100 }, { "epoch": 2.2987668161434978, "grad_norm": 0.07775126927184879, "learning_rate": 3.1490148253071364e-05, "loss": 0.2347, "step": 4101 }, { "epoch": 2.2993273542600896, "grad_norm": 0.0810981916746797, "learning_rate": 3.144264425925847e-05, "loss": 0.2508, "step": 4102 }, { "epoch": 2.2998878923766815, "grad_norm": 0.0775882006991532, "learning_rate": 3.139516943842812e-05, "loss": 0.2441, "step": 4103 }, { "epoch": 2.3004484304932733, "grad_norm": 0.08040060914547632, "learning_rate": 3.1347723810782134e-05, "loss": 0.2564, "step": 4104 }, { "epoch": 2.3010089686098656, "grad_norm": 0.07888930461738493, "learning_rate": 3.130030739650983e-05, "loss": 0.2364, "step": 4105 }, { "epoch": 2.3015695067264574, "grad_norm": 0.07745309053089348, "learning_rate": 3.125292021578822e-05, "loss": 0.2493, "step": 4106 }, { "epoch": 2.3021300448430493, "grad_norm": 0.07826243616532201, "learning_rate": 3.120556228878174e-05, "loss": 0.2446, "step": 4107 }, { "epoch": 2.302690582959641, "grad_norm": 0.07712325781994052, "learning_rate": 3.115823363564254e-05, "loss": 0.2477, "step": 4108 }, { "epoch": 2.303251121076233, "grad_norm": 0.07904786797151007, "learning_rate": 3.111093427651016e-05, "loss": 0.2404, "step": 4109 }, { "epoch": 2.3038116591928253, "grad_norm": 0.07768571598879805, "learning_rate": 3.1063664231511737e-05, "loss": 0.2434, "step": 4110 }, { "epoch": 2.304372197309417, "grad_norm": 0.08103527833027029, "learning_rate": 3.101642352076194e-05, "loss": 0.2432, "step": 4111 }, { "epoch": 2.304932735426009, "grad_norm": 0.07757052847999166, "learning_rate": 3.0969212164362957e-05, "loss": 0.2328, "step": 4112 }, { "epoch": 2.305493273542601, "grad_norm": 0.07919100705158838, "learning_rate": 3.092203018240453e-05, "loss": 0.2495, "step": 4113 }, { "epoch": 2.3060538116591927, "grad_norm": 0.08109971919640106, "learning_rate": 3.087487759496377e-05, "loss": 0.2458, "step": 4114 }, { "epoch": 2.306614349775785, "grad_norm": 0.08030104502170897, "learning_rate": 3.0827754422105416e-05, "loss": 0.2462, "step": 4115 }, { "epoch": 2.307174887892377, "grad_norm": 0.0792602287700327, "learning_rate": 3.078066068388162e-05, "loss": 0.2449, "step": 4116 }, { "epoch": 2.3077354260089686, "grad_norm": 0.07674638180789069, "learning_rate": 3.0733596400331985e-05, "loss": 0.2321, "step": 4117 }, { "epoch": 2.3082959641255605, "grad_norm": 0.0767444128071632, "learning_rate": 3.0686561591483675e-05, "loss": 0.2326, "step": 4118 }, { "epoch": 2.3088565022421523, "grad_norm": 0.07942404273918463, "learning_rate": 3.063955627735121e-05, "loss": 0.2364, "step": 4119 }, { "epoch": 2.3094170403587446, "grad_norm": 0.07881660767792163, "learning_rate": 3.059258047793661e-05, "loss": 0.2507, "step": 4120 }, { "epoch": 2.3099775784753365, "grad_norm": 0.07788673391022796, "learning_rate": 3.0545634213229344e-05, "loss": 0.2413, "step": 4121 }, { "epoch": 2.3105381165919283, "grad_norm": 0.07771950950622547, "learning_rate": 3.0498717503206343e-05, "loss": 0.2361, "step": 4122 }, { "epoch": 2.31109865470852, "grad_norm": 0.07753886183078841, "learning_rate": 3.0451830367831858e-05, "loss": 0.2541, "step": 4123 }, { "epoch": 2.311659192825112, "grad_norm": 0.0776569441223699, "learning_rate": 3.040497282705761e-05, "loss": 0.2443, "step": 4124 }, { "epoch": 2.312219730941704, "grad_norm": 0.07951166059416201, "learning_rate": 3.0358144900822782e-05, "loss": 0.2426, "step": 4125 }, { "epoch": 2.312780269058296, "grad_norm": 0.07821582041906866, "learning_rate": 3.0311346609053838e-05, "loss": 0.2492, "step": 4126 }, { "epoch": 2.313340807174888, "grad_norm": 0.0769999045840295, "learning_rate": 3.0264577971664764e-05, "loss": 0.2416, "step": 4127 }, { "epoch": 2.31390134529148, "grad_norm": 0.08098583886406661, "learning_rate": 3.0217839008556816e-05, "loss": 0.2548, "step": 4128 }, { "epoch": 2.3144618834080717, "grad_norm": 0.07957081430188001, "learning_rate": 3.0171129739618676e-05, "loss": 0.2447, "step": 4129 }, { "epoch": 2.3150224215246635, "grad_norm": 0.08109236841779928, "learning_rate": 3.0124450184726426e-05, "loss": 0.2498, "step": 4130 }, { "epoch": 2.3155829596412554, "grad_norm": 0.07542664082322645, "learning_rate": 3.0077800363743404e-05, "loss": 0.2414, "step": 4131 }, { "epoch": 2.3161434977578477, "grad_norm": 0.07811373621644717, "learning_rate": 3.003118029652041e-05, "loss": 0.2423, "step": 4132 }, { "epoch": 2.3167040358744395, "grad_norm": 0.07683433823903178, "learning_rate": 2.998459000289545e-05, "loss": 0.2392, "step": 4133 }, { "epoch": 2.3172645739910314, "grad_norm": 0.07650638507528401, "learning_rate": 2.993802950269402e-05, "loss": 0.2324, "step": 4134 }, { "epoch": 2.317825112107623, "grad_norm": 0.0781286363655784, "learning_rate": 2.989149881572878e-05, "loss": 0.2376, "step": 4135 }, { "epoch": 2.318385650224215, "grad_norm": 0.08039557730752402, "learning_rate": 2.9844997961799814e-05, "loss": 0.2494, "step": 4136 }, { "epoch": 2.3189461883408073, "grad_norm": 0.07599771755317568, "learning_rate": 2.9798526960694496e-05, "loss": 0.2334, "step": 4137 }, { "epoch": 2.319506726457399, "grad_norm": 0.07976903775833245, "learning_rate": 2.9752085832187416e-05, "loss": 0.2456, "step": 4138 }, { "epoch": 2.320067264573991, "grad_norm": 0.07863155156129803, "learning_rate": 2.970567459604059e-05, "loss": 0.2383, "step": 4139 }, { "epoch": 2.320627802690583, "grad_norm": 0.077354580932718, "learning_rate": 2.9659293272003164e-05, "loss": 0.2392, "step": 4140 }, { "epoch": 2.3211883408071747, "grad_norm": 0.07702481247877152, "learning_rate": 2.9612941879811684e-05, "loss": 0.2529, "step": 4141 }, { "epoch": 2.321748878923767, "grad_norm": 0.07938052518841564, "learning_rate": 2.9566620439189874e-05, "loss": 0.2526, "step": 4142 }, { "epoch": 2.322309417040359, "grad_norm": 0.07983346286425103, "learning_rate": 2.952032896984871e-05, "loss": 0.2456, "step": 4143 }, { "epoch": 2.3228699551569507, "grad_norm": 0.07883227984065583, "learning_rate": 2.947406749148649e-05, "loss": 0.2391, "step": 4144 }, { "epoch": 2.3234304932735426, "grad_norm": 0.07631071687041674, "learning_rate": 2.942783602378869e-05, "loss": 0.2404, "step": 4145 }, { "epoch": 2.3239910313901344, "grad_norm": 0.07880079020974173, "learning_rate": 2.9381634586428085e-05, "loss": 0.2428, "step": 4146 }, { "epoch": 2.3245515695067267, "grad_norm": 0.07806630823958408, "learning_rate": 2.933546319906453e-05, "loss": 0.2347, "step": 4147 }, { "epoch": 2.3251121076233185, "grad_norm": 0.07890647078523001, "learning_rate": 2.9289321881345254e-05, "loss": 0.2418, "step": 4148 }, { "epoch": 2.3256726457399104, "grad_norm": 0.08074047596923722, "learning_rate": 2.9243210652904596e-05, "loss": 0.2469, "step": 4149 }, { "epoch": 2.3262331838565022, "grad_norm": 0.08036244888123181, "learning_rate": 2.9197129533364065e-05, "loss": 0.2597, "step": 4150 }, { "epoch": 2.326793721973094, "grad_norm": 0.07550127743518344, "learning_rate": 2.9151078542332478e-05, "loss": 0.2384, "step": 4151 }, { "epoch": 2.327354260089686, "grad_norm": 0.07672098868048535, "learning_rate": 2.9105057699405702e-05, "loss": 0.2481, "step": 4152 }, { "epoch": 2.327914798206278, "grad_norm": 0.0783208984911028, "learning_rate": 2.9059067024166854e-05, "loss": 0.2542, "step": 4153 }, { "epoch": 2.32847533632287, "grad_norm": 0.08095674337572743, "learning_rate": 2.9013106536186186e-05, "loss": 0.2465, "step": 4154 }, { "epoch": 2.329035874439462, "grad_norm": 0.07680151887015477, "learning_rate": 2.8967176255021167e-05, "loss": 0.2315, "step": 4155 }, { "epoch": 2.3295964125560538, "grad_norm": 0.0780413711790148, "learning_rate": 2.8921276200216296e-05, "loss": 0.2419, "step": 4156 }, { "epoch": 2.3301569506726456, "grad_norm": 0.08115479226965608, "learning_rate": 2.8875406391303263e-05, "loss": 0.2324, "step": 4157 }, { "epoch": 2.3307174887892375, "grad_norm": 0.08035250801214854, "learning_rate": 2.8829566847800948e-05, "loss": 0.24, "step": 4158 }, { "epoch": 2.3312780269058297, "grad_norm": 0.07953425457381018, "learning_rate": 2.878375758921522e-05, "loss": 0.2514, "step": 4159 }, { "epoch": 2.3318385650224216, "grad_norm": 0.07773993048067764, "learning_rate": 2.873797863503923e-05, "loss": 0.2341, "step": 4160 }, { "epoch": 2.3323991031390134, "grad_norm": 0.0784552985488261, "learning_rate": 2.8692230004753063e-05, "loss": 0.2489, "step": 4161 }, { "epoch": 2.3329596412556053, "grad_norm": 0.07722982653058977, "learning_rate": 2.8646511717824022e-05, "loss": 0.24, "step": 4162 }, { "epoch": 2.333520179372197, "grad_norm": 0.0796887994841046, "learning_rate": 2.8600823793706476e-05, "loss": 0.2467, "step": 4163 }, { "epoch": 2.3340807174887894, "grad_norm": 0.07927708767540144, "learning_rate": 2.8555166251841802e-05, "loss": 0.2376, "step": 4164 }, { "epoch": 2.3346412556053813, "grad_norm": 0.07721321287976032, "learning_rate": 2.850953911165857e-05, "loss": 0.2494, "step": 4165 }, { "epoch": 2.335201793721973, "grad_norm": 0.07789760540328405, "learning_rate": 2.846394239257226e-05, "loss": 0.2395, "step": 4166 }, { "epoch": 2.335762331838565, "grad_norm": 0.07920922048422345, "learning_rate": 2.841837611398558e-05, "loss": 0.2515, "step": 4167 }, { "epoch": 2.336322869955157, "grad_norm": 0.08017300403346657, "learning_rate": 2.8372840295288106e-05, "loss": 0.2473, "step": 4168 }, { "epoch": 2.336883408071749, "grad_norm": 0.0787310910925733, "learning_rate": 2.8327334955856598e-05, "loss": 0.2441, "step": 4169 }, { "epoch": 2.337443946188341, "grad_norm": 0.07895596255184466, "learning_rate": 2.8281860115054815e-05, "loss": 0.2377, "step": 4170 }, { "epoch": 2.338004484304933, "grad_norm": 0.0756965468129164, "learning_rate": 2.823641579223344e-05, "loss": 0.2416, "step": 4171 }, { "epoch": 2.3385650224215246, "grad_norm": 0.07708556236193148, "learning_rate": 2.8191002006730328e-05, "loss": 0.2574, "step": 4172 }, { "epoch": 2.3391255605381165, "grad_norm": 0.07907645792962631, "learning_rate": 2.8145618777870176e-05, "loss": 0.2405, "step": 4173 }, { "epoch": 2.3396860986547083, "grad_norm": 0.07762214440323861, "learning_rate": 2.8100266124964824e-05, "loss": 0.2352, "step": 4174 }, { "epoch": 2.3402466367713006, "grad_norm": 0.07667205585008668, "learning_rate": 2.8054944067313005e-05, "loss": 0.2387, "step": 4175 }, { "epoch": 2.3408071748878925, "grad_norm": 0.07914785379616622, "learning_rate": 2.800965262420043e-05, "loss": 0.2569, "step": 4176 }, { "epoch": 2.3413677130044843, "grad_norm": 0.07786459051819472, "learning_rate": 2.796439181489985e-05, "loss": 0.2348, "step": 4177 }, { "epoch": 2.341928251121076, "grad_norm": 0.07729522574347791, "learning_rate": 2.7919161658670945e-05, "loss": 0.2408, "step": 4178 }, { "epoch": 2.342488789237668, "grad_norm": 0.07639465568166552, "learning_rate": 2.787396217476038e-05, "loss": 0.2362, "step": 4179 }, { "epoch": 2.34304932735426, "grad_norm": 0.0763207392197241, "learning_rate": 2.7828793382401685e-05, "loss": 0.2334, "step": 4180 }, { "epoch": 2.343609865470852, "grad_norm": 0.08115765241592474, "learning_rate": 2.7783655300815447e-05, "loss": 0.2497, "step": 4181 }, { "epoch": 2.344170403587444, "grad_norm": 0.07869998341763018, "learning_rate": 2.7738547949209082e-05, "loss": 0.2406, "step": 4182 }, { "epoch": 2.344730941704036, "grad_norm": 0.07820693522758504, "learning_rate": 2.7693471346776944e-05, "loss": 0.2441, "step": 4183 }, { "epoch": 2.3452914798206277, "grad_norm": 0.07758533438790251, "learning_rate": 2.7648425512700393e-05, "loss": 0.2376, "step": 4184 }, { "epoch": 2.3458520179372195, "grad_norm": 0.08007894475897577, "learning_rate": 2.7603410466147572e-05, "loss": 0.2525, "step": 4185 }, { "epoch": 2.346412556053812, "grad_norm": 0.07936192949458998, "learning_rate": 2.7558426226273615e-05, "loss": 0.2499, "step": 4186 }, { "epoch": 2.3469730941704037, "grad_norm": 0.07874011422451643, "learning_rate": 2.751347281222051e-05, "loss": 0.2378, "step": 4187 }, { "epoch": 2.3475336322869955, "grad_norm": 0.07898987392037829, "learning_rate": 2.7468550243117165e-05, "loss": 0.2374, "step": 4188 }, { "epoch": 2.3480941704035874, "grad_norm": 0.07785486151463969, "learning_rate": 2.74236585380793e-05, "loss": 0.2372, "step": 4189 }, { "epoch": 2.348654708520179, "grad_norm": 0.07853561330172434, "learning_rate": 2.7378797716209503e-05, "loss": 0.2428, "step": 4190 }, { "epoch": 2.3492152466367715, "grad_norm": 0.07875410139224716, "learning_rate": 2.7333967796597315e-05, "loss": 0.2493, "step": 4191 }, { "epoch": 2.3497757847533634, "grad_norm": 0.0804659168601273, "learning_rate": 2.7289168798318997e-05, "loss": 0.2378, "step": 4192 }, { "epoch": 2.350336322869955, "grad_norm": 0.0793124955256602, "learning_rate": 2.724440074043778e-05, "loss": 0.2439, "step": 4193 }, { "epoch": 2.350896860986547, "grad_norm": 0.07862219014425169, "learning_rate": 2.7199663642003603e-05, "loss": 0.2363, "step": 4194 }, { "epoch": 2.351457399103139, "grad_norm": 0.07942827124797995, "learning_rate": 2.7154957522053316e-05, "loss": 0.2403, "step": 4195 }, { "epoch": 2.352017937219731, "grad_norm": 0.07925172483902786, "learning_rate": 2.711028239961061e-05, "loss": 0.2401, "step": 4196 }, { "epoch": 2.352578475336323, "grad_norm": 0.0794265937335694, "learning_rate": 2.706563829368587e-05, "loss": 0.2566, "step": 4197 }, { "epoch": 2.353139013452915, "grad_norm": 0.07786233837941513, "learning_rate": 2.702102522327642e-05, "loss": 0.2415, "step": 4198 }, { "epoch": 2.3536995515695067, "grad_norm": 0.07655076644210121, "learning_rate": 2.6976443207366255e-05, "loss": 0.2405, "step": 4199 }, { "epoch": 2.3542600896860986, "grad_norm": 0.0795775900145881, "learning_rate": 2.693189226492625e-05, "loss": 0.2418, "step": 4200 }, { "epoch": 2.3548206278026904, "grad_norm": 0.07792781835497384, "learning_rate": 2.688737241491398e-05, "loss": 0.2346, "step": 4201 }, { "epoch": 2.3553811659192827, "grad_norm": 0.07977929198265812, "learning_rate": 2.6842883676273857e-05, "loss": 0.2431, "step": 4202 }, { "epoch": 2.3559417040358746, "grad_norm": 0.08037315181607053, "learning_rate": 2.6798426067937045e-05, "loss": 0.2364, "step": 4203 }, { "epoch": 2.3565022421524664, "grad_norm": 0.07552057334313014, "learning_rate": 2.675399960882138e-05, "loss": 0.2435, "step": 4204 }, { "epoch": 2.3570627802690582, "grad_norm": 0.07822281390195752, "learning_rate": 2.6709604317831583e-05, "loss": 0.2395, "step": 4205 }, { "epoch": 2.35762331838565, "grad_norm": 0.07508628675605872, "learning_rate": 2.6665240213858946e-05, "loss": 0.2329, "step": 4206 }, { "epoch": 2.358183856502242, "grad_norm": 0.07824720213732886, "learning_rate": 2.6620907315781662e-05, "loss": 0.2546, "step": 4207 }, { "epoch": 2.3587443946188342, "grad_norm": 0.07708763304847759, "learning_rate": 2.657660564246449e-05, "loss": 0.2388, "step": 4208 }, { "epoch": 2.359304932735426, "grad_norm": 0.07979685337298173, "learning_rate": 2.653233521275904e-05, "loss": 0.2557, "step": 4209 }, { "epoch": 2.359865470852018, "grad_norm": 0.07847072190202505, "learning_rate": 2.6488096045503485e-05, "loss": 0.2403, "step": 4210 }, { "epoch": 2.3604260089686098, "grad_norm": 0.08144716623859771, "learning_rate": 2.6443888159522823e-05, "loss": 0.2565, "step": 4211 }, { "epoch": 2.3609865470852016, "grad_norm": 0.0761119790546523, "learning_rate": 2.6399711573628704e-05, "loss": 0.2463, "step": 4212 }, { "epoch": 2.361547085201794, "grad_norm": 0.07790333307748155, "learning_rate": 2.6355566306619373e-05, "loss": 0.2383, "step": 4213 }, { "epoch": 2.3621076233183858, "grad_norm": 0.07959313799088172, "learning_rate": 2.63114523772799e-05, "loss": 0.2404, "step": 4214 }, { "epoch": 2.3626681614349776, "grad_norm": 0.08043596452556043, "learning_rate": 2.626736980438189e-05, "loss": 0.2498, "step": 4215 }, { "epoch": 2.3632286995515694, "grad_norm": 0.07833916486793696, "learning_rate": 2.6223318606683645e-05, "loss": 0.2379, "step": 4216 }, { "epoch": 2.3637892376681613, "grad_norm": 0.07900490105132915, "learning_rate": 2.6179298802930154e-05, "loss": 0.2514, "step": 4217 }, { "epoch": 2.3643497757847536, "grad_norm": 0.07616742985296307, "learning_rate": 2.6135310411852977e-05, "loss": 0.2343, "step": 4218 }, { "epoch": 2.3649103139013454, "grad_norm": 0.07710329656177863, "learning_rate": 2.6091353452170375e-05, "loss": 0.2253, "step": 4219 }, { "epoch": 2.3654708520179373, "grad_norm": 0.07824264240298706, "learning_rate": 2.60474279425872e-05, "loss": 0.2391, "step": 4220 }, { "epoch": 2.366031390134529, "grad_norm": 0.07744570500088467, "learning_rate": 2.6003533901794962e-05, "loss": 0.2376, "step": 4221 }, { "epoch": 2.366591928251121, "grad_norm": 0.0792097248625774, "learning_rate": 2.5959671348471715e-05, "loss": 0.2443, "step": 4222 }, { "epoch": 2.367152466367713, "grad_norm": 0.08155545890005843, "learning_rate": 2.5915840301282114e-05, "loss": 0.2414, "step": 4223 }, { "epoch": 2.367713004484305, "grad_norm": 0.08197006402100236, "learning_rate": 2.5872040778877503e-05, "loss": 0.2547, "step": 4224 }, { "epoch": 2.368273542600897, "grad_norm": 0.0788384367686174, "learning_rate": 2.582827279989568e-05, "loss": 0.2546, "step": 4225 }, { "epoch": 2.368834080717489, "grad_norm": 0.08117189338626334, "learning_rate": 2.5784536382961145e-05, "loss": 0.2393, "step": 4226 }, { "epoch": 2.3693946188340806, "grad_norm": 0.07847918315769185, "learning_rate": 2.5740831546684853e-05, "loss": 0.2453, "step": 4227 }, { "epoch": 2.3699551569506725, "grad_norm": 0.07660854798589885, "learning_rate": 2.5697158309664404e-05, "loss": 0.2268, "step": 4228 }, { "epoch": 2.3705156950672643, "grad_norm": 0.07993643725998426, "learning_rate": 2.565351669048397e-05, "loss": 0.2334, "step": 4229 }, { "epoch": 2.3710762331838566, "grad_norm": 0.0788627487921801, "learning_rate": 2.5609906707714137e-05, "loss": 0.2464, "step": 4230 }, { "epoch": 2.3716367713004485, "grad_norm": 0.07878287045150213, "learning_rate": 2.5566328379912196e-05, "loss": 0.238, "step": 4231 }, { "epoch": 2.3721973094170403, "grad_norm": 0.08726667034774241, "learning_rate": 2.5522781725621813e-05, "loss": 0.2599, "step": 4232 }, { "epoch": 2.372757847533632, "grad_norm": 0.07917378228374336, "learning_rate": 2.547926676337333e-05, "loss": 0.2437, "step": 4233 }, { "epoch": 2.373318385650224, "grad_norm": 0.07998427079224918, "learning_rate": 2.5435783511683443e-05, "loss": 0.246, "step": 4234 }, { "epoch": 2.3738789237668163, "grad_norm": 0.08095502398783906, "learning_rate": 2.5392331989055486e-05, "loss": 0.2427, "step": 4235 }, { "epoch": 2.374439461883408, "grad_norm": 0.0818390218436475, "learning_rate": 2.5348912213979235e-05, "loss": 0.2426, "step": 4236 }, { "epoch": 2.375, "grad_norm": 0.0811426481293932, "learning_rate": 2.530552420493094e-05, "loss": 0.2431, "step": 4237 }, { "epoch": 2.375560538116592, "grad_norm": 0.0785544440402398, "learning_rate": 2.5262167980373395e-05, "loss": 0.2467, "step": 4238 }, { "epoch": 2.3761210762331837, "grad_norm": 0.07859659167847614, "learning_rate": 2.5218843558755778e-05, "loss": 0.244, "step": 4239 }, { "epoch": 2.376681614349776, "grad_norm": 0.08042249061124941, "learning_rate": 2.5175550958513837e-05, "loss": 0.2393, "step": 4240 }, { "epoch": 2.377242152466368, "grad_norm": 0.07820509183110318, "learning_rate": 2.5132290198069675e-05, "loss": 0.2405, "step": 4241 }, { "epoch": 2.3778026905829597, "grad_norm": 0.07719627068710612, "learning_rate": 2.508906129583195e-05, "loss": 0.2404, "step": 4242 }, { "epoch": 2.3783632286995515, "grad_norm": 0.07934775266755617, "learning_rate": 2.5045864270195675e-05, "loss": 0.2398, "step": 4243 }, { "epoch": 2.3789237668161434, "grad_norm": 0.07976585651151132, "learning_rate": 2.500269913954233e-05, "loss": 0.2515, "step": 4244 }, { "epoch": 2.3794843049327357, "grad_norm": 0.07674324206891203, "learning_rate": 2.495956592223988e-05, "loss": 0.2368, "step": 4245 }, { "epoch": 2.3800448430493275, "grad_norm": 0.0804676516461419, "learning_rate": 2.491646463664261e-05, "loss": 0.2535, "step": 4246 }, { "epoch": 2.3806053811659194, "grad_norm": 0.07980334937569152, "learning_rate": 2.4873395301091306e-05, "loss": 0.2437, "step": 4247 }, { "epoch": 2.381165919282511, "grad_norm": 0.078867433642548, "learning_rate": 2.4830357933913063e-05, "loss": 0.2546, "step": 4248 }, { "epoch": 2.381726457399103, "grad_norm": 0.07672055027397391, "learning_rate": 2.4787352553421493e-05, "loss": 0.2313, "step": 4249 }, { "epoch": 2.382286995515695, "grad_norm": 0.0787764644600143, "learning_rate": 2.4744379177916498e-05, "loss": 0.2392, "step": 4250 }, { "epoch": 2.382847533632287, "grad_norm": 0.07969681313403791, "learning_rate": 2.470143782568436e-05, "loss": 0.2389, "step": 4251 }, { "epoch": 2.383408071748879, "grad_norm": 0.0809229643912626, "learning_rate": 2.4658528514997815e-05, "loss": 0.245, "step": 4252 }, { "epoch": 2.383968609865471, "grad_norm": 0.07820701687792098, "learning_rate": 2.4615651264115903e-05, "loss": 0.2258, "step": 4253 }, { "epoch": 2.3845291479820627, "grad_norm": 0.07881482865000651, "learning_rate": 2.4572806091284073e-05, "loss": 0.247, "step": 4254 }, { "epoch": 2.3850896860986546, "grad_norm": 0.07810169967599107, "learning_rate": 2.452999301473403e-05, "loss": 0.2477, "step": 4255 }, { "epoch": 2.3856502242152464, "grad_norm": 0.07822537831160745, "learning_rate": 2.448721205268395e-05, "loss": 0.239, "step": 4256 }, { "epoch": 2.3862107623318387, "grad_norm": 0.08059235644032076, "learning_rate": 2.444446322333821e-05, "loss": 0.2576, "step": 4257 }, { "epoch": 2.3867713004484306, "grad_norm": 0.07991668386232427, "learning_rate": 2.4401746544887584e-05, "loss": 0.2508, "step": 4258 }, { "epoch": 2.3873318385650224, "grad_norm": 0.08189843528545956, "learning_rate": 2.435906203550916e-05, "loss": 0.2457, "step": 4259 }, { "epoch": 2.3878923766816142, "grad_norm": 0.07693101994417906, "learning_rate": 2.4316409713366352e-05, "loss": 0.248, "step": 4260 }, { "epoch": 2.388452914798206, "grad_norm": 0.07645113103596757, "learning_rate": 2.4273789596608887e-05, "loss": 0.2281, "step": 4261 }, { "epoch": 2.3890134529147984, "grad_norm": 0.0756294332759964, "learning_rate": 2.423120170337272e-05, "loss": 0.2398, "step": 4262 }, { "epoch": 2.3895739910313902, "grad_norm": 0.08038861463910725, "learning_rate": 2.4188646051780117e-05, "loss": 0.2453, "step": 4263 }, { "epoch": 2.390134529147982, "grad_norm": 0.0807597476574884, "learning_rate": 2.4146122659939686e-05, "loss": 0.2369, "step": 4264 }, { "epoch": 2.390695067264574, "grad_norm": 0.07738844331988792, "learning_rate": 2.4103631545946225e-05, "loss": 0.2322, "step": 4265 }, { "epoch": 2.3912556053811658, "grad_norm": 0.08144564840558986, "learning_rate": 2.4061172727880886e-05, "loss": 0.2498, "step": 4266 }, { "epoch": 2.391816143497758, "grad_norm": 0.08014202723678344, "learning_rate": 2.4018746223810974e-05, "loss": 0.2515, "step": 4267 }, { "epoch": 2.39237668161435, "grad_norm": 0.08100509140202429, "learning_rate": 2.3976352051790117e-05, "loss": 0.2495, "step": 4268 }, { "epoch": 2.3929372197309418, "grad_norm": 0.07794262279111899, "learning_rate": 2.3933990229858193e-05, "loss": 0.239, "step": 4269 }, { "epoch": 2.3934977578475336, "grad_norm": 0.08012132421360765, "learning_rate": 2.3891660776041247e-05, "loss": 0.249, "step": 4270 }, { "epoch": 2.3940582959641254, "grad_norm": 0.07733188014743615, "learning_rate": 2.3849363708351625e-05, "loss": 0.2406, "step": 4271 }, { "epoch": 2.3946188340807173, "grad_norm": 0.07792694206848627, "learning_rate": 2.3807099044787818e-05, "loss": 0.2393, "step": 4272 }, { "epoch": 2.3951793721973096, "grad_norm": 0.07738852943741609, "learning_rate": 2.3764866803334606e-05, "loss": 0.2354, "step": 4273 }, { "epoch": 2.3957399103139014, "grad_norm": 0.07792379658822556, "learning_rate": 2.3722667001962896e-05, "loss": 0.2349, "step": 4274 }, { "epoch": 2.3963004484304933, "grad_norm": 0.07742232260896334, "learning_rate": 2.3680499658629874e-05, "loss": 0.2382, "step": 4275 }, { "epoch": 2.396860986547085, "grad_norm": 0.07856630204479857, "learning_rate": 2.363836479127881e-05, "loss": 0.2444, "step": 4276 }, { "epoch": 2.397421524663677, "grad_norm": 0.07927930061326396, "learning_rate": 2.3596262417839255e-05, "loss": 0.2419, "step": 4277 }, { "epoch": 2.397982062780269, "grad_norm": 0.07836612250875499, "learning_rate": 2.3554192556226896e-05, "loss": 0.2313, "step": 4278 }, { "epoch": 2.398542600896861, "grad_norm": 0.07940189246279593, "learning_rate": 2.3512155224343546e-05, "loss": 0.2567, "step": 4279 }, { "epoch": 2.399103139013453, "grad_norm": 0.07828878517899063, "learning_rate": 2.3470150440077266e-05, "loss": 0.2458, "step": 4280 }, { "epoch": 2.399663677130045, "grad_norm": 0.07912544986227371, "learning_rate": 2.3428178221302144e-05, "loss": 0.2367, "step": 4281 }, { "epoch": 2.4002242152466366, "grad_norm": 0.07855078011819978, "learning_rate": 2.3386238585878538e-05, "loss": 0.2495, "step": 4282 }, { "epoch": 2.4007847533632285, "grad_norm": 0.0823602874783878, "learning_rate": 2.3344331551652854e-05, "loss": 0.2465, "step": 4283 }, { "epoch": 2.401345291479821, "grad_norm": 0.08065013835008998, "learning_rate": 2.3302457136457623e-05, "loss": 0.2555, "step": 4284 }, { "epoch": 2.4019058295964126, "grad_norm": 0.0782231546469997, "learning_rate": 2.326061535811156e-05, "loss": 0.238, "step": 4285 }, { "epoch": 2.4024663677130045, "grad_norm": 0.08010215013422635, "learning_rate": 2.3218806234419443e-05, "loss": 0.2476, "step": 4286 }, { "epoch": 2.4030269058295963, "grad_norm": 0.07845562409630219, "learning_rate": 2.317702978317221e-05, "loss": 0.2486, "step": 4287 }, { "epoch": 2.403587443946188, "grad_norm": 0.07969730646413398, "learning_rate": 2.3135286022146785e-05, "loss": 0.2485, "step": 4288 }, { "epoch": 2.4041479820627805, "grad_norm": 0.07663272981994358, "learning_rate": 2.3093574969106323e-05, "loss": 0.2406, "step": 4289 }, { "epoch": 2.4047085201793723, "grad_norm": 0.0776438435990643, "learning_rate": 2.3051896641799952e-05, "loss": 0.2424, "step": 4290 }, { "epoch": 2.405269058295964, "grad_norm": 0.08018596927300335, "learning_rate": 2.3010251057962883e-05, "loss": 0.25, "step": 4291 }, { "epoch": 2.405829596412556, "grad_norm": 0.07966056261094619, "learning_rate": 2.2968638235316466e-05, "loss": 0.2378, "step": 4292 }, { "epoch": 2.406390134529148, "grad_norm": 0.0775690470940192, "learning_rate": 2.292705819156803e-05, "loss": 0.2362, "step": 4293 }, { "epoch": 2.40695067264574, "grad_norm": 0.07927168083563292, "learning_rate": 2.288551094441106e-05, "loss": 0.2533, "step": 4294 }, { "epoch": 2.407511210762332, "grad_norm": 0.07857339041746221, "learning_rate": 2.2843996511524934e-05, "loss": 0.2495, "step": 4295 }, { "epoch": 2.408071748878924, "grad_norm": 0.07967302591749084, "learning_rate": 2.2802514910575223e-05, "loss": 0.2557, "step": 4296 }, { "epoch": 2.4086322869955157, "grad_norm": 0.07861300943021306, "learning_rate": 2.2761066159213417e-05, "loss": 0.246, "step": 4297 }, { "epoch": 2.4091928251121075, "grad_norm": 0.07786911660528349, "learning_rate": 2.271965027507704e-05, "loss": 0.2381, "step": 4298 }, { "epoch": 2.4097533632286994, "grad_norm": 0.07775500781116534, "learning_rate": 2.2678267275789712e-05, "loss": 0.2415, "step": 4299 }, { "epoch": 2.4103139013452917, "grad_norm": 0.0792791489016953, "learning_rate": 2.2636917178960938e-05, "loss": 0.2473, "step": 4300 }, { "epoch": 2.4108744394618835, "grad_norm": 0.0819511269755226, "learning_rate": 2.259560000218631e-05, "loss": 0.2487, "step": 4301 }, { "epoch": 2.4114349775784754, "grad_norm": 0.08299400506533303, "learning_rate": 2.255431576304744e-05, "loss": 0.2529, "step": 4302 }, { "epoch": 2.411995515695067, "grad_norm": 0.07929795528474204, "learning_rate": 2.25130644791118e-05, "loss": 0.2341, "step": 4303 }, { "epoch": 2.412556053811659, "grad_norm": 0.07743424489324233, "learning_rate": 2.2471846167932975e-05, "loss": 0.241, "step": 4304 }, { "epoch": 2.413116591928251, "grad_norm": 0.07995720980201747, "learning_rate": 2.243066084705039e-05, "loss": 0.2373, "step": 4305 }, { "epoch": 2.413677130044843, "grad_norm": 0.07970112758822016, "learning_rate": 2.2389508533989555e-05, "loss": 0.2378, "step": 4306 }, { "epoch": 2.414237668161435, "grad_norm": 0.0778690871138127, "learning_rate": 2.2348389246261837e-05, "loss": 0.2435, "step": 4307 }, { "epoch": 2.414798206278027, "grad_norm": 0.07685825327763682, "learning_rate": 2.230730300136461e-05, "loss": 0.2305, "step": 4308 }, { "epoch": 2.4153587443946187, "grad_norm": 0.07469704153860139, "learning_rate": 2.226624981678115e-05, "loss": 0.237, "step": 4309 }, { "epoch": 2.4159192825112106, "grad_norm": 0.07975969402431803, "learning_rate": 2.2225229709980676e-05, "loss": 0.2426, "step": 4310 }, { "epoch": 2.416479820627803, "grad_norm": 0.08007789008632174, "learning_rate": 2.218424269841838e-05, "loss": 0.2452, "step": 4311 }, { "epoch": 2.4170403587443947, "grad_norm": 0.07812765917880916, "learning_rate": 2.214328879953528e-05, "loss": 0.2202, "step": 4312 }, { "epoch": 2.4176008968609866, "grad_norm": 0.07833743456277244, "learning_rate": 2.210236803075839e-05, "loss": 0.2446, "step": 4313 }, { "epoch": 2.4181614349775784, "grad_norm": 0.07971713235725623, "learning_rate": 2.2061480409500556e-05, "loss": 0.2307, "step": 4314 }, { "epoch": 2.4187219730941703, "grad_norm": 0.07736258676653242, "learning_rate": 2.2020625953160577e-05, "loss": 0.2437, "step": 4315 }, { "epoch": 2.4192825112107625, "grad_norm": 0.07961912114769379, "learning_rate": 2.1979804679123106e-05, "loss": 0.2483, "step": 4316 }, { "epoch": 2.4198430493273544, "grad_norm": 0.07935805691038576, "learning_rate": 2.1939016604758656e-05, "loss": 0.2363, "step": 4317 }, { "epoch": 2.4204035874439462, "grad_norm": 0.08065309659353526, "learning_rate": 2.1898261747423655e-05, "loss": 0.2531, "step": 4318 }, { "epoch": 2.420964125560538, "grad_norm": 0.07754738254742989, "learning_rate": 2.1857540124460397e-05, "loss": 0.2325, "step": 4319 }, { "epoch": 2.42152466367713, "grad_norm": 0.07660030820169896, "learning_rate": 2.181685175319702e-05, "loss": 0.2401, "step": 4320 }, { "epoch": 2.422085201793722, "grad_norm": 0.0793339551644271, "learning_rate": 2.177619665094749e-05, "loss": 0.2354, "step": 4321 }, { "epoch": 2.422645739910314, "grad_norm": 0.07763747831226875, "learning_rate": 2.1735574835011664e-05, "loss": 0.248, "step": 4322 }, { "epoch": 2.423206278026906, "grad_norm": 0.08188821089100286, "learning_rate": 2.1694986322675202e-05, "loss": 0.2517, "step": 4323 }, { "epoch": 2.4237668161434978, "grad_norm": 0.07903829794138365, "learning_rate": 2.1654431131209553e-05, "loss": 0.2451, "step": 4324 }, { "epoch": 2.4243273542600896, "grad_norm": 0.0813434473736274, "learning_rate": 2.1613909277872056e-05, "loss": 0.2546, "step": 4325 }, { "epoch": 2.4248878923766815, "grad_norm": 0.08003868992303634, "learning_rate": 2.157342077990586e-05, "loss": 0.2489, "step": 4326 }, { "epoch": 2.4254484304932733, "grad_norm": 0.07693609909216155, "learning_rate": 2.1532965654539915e-05, "loss": 0.2474, "step": 4327 }, { "epoch": 2.4260089686098656, "grad_norm": 0.07950874352056599, "learning_rate": 2.1492543918988907e-05, "loss": 0.2525, "step": 4328 }, { "epoch": 2.4265695067264574, "grad_norm": 0.08214382775918407, "learning_rate": 2.1452155590453404e-05, "loss": 0.2429, "step": 4329 }, { "epoch": 2.4271300448430493, "grad_norm": 0.0758595129042081, "learning_rate": 2.141180068611971e-05, "loss": 0.2265, "step": 4330 }, { "epoch": 2.427690582959641, "grad_norm": 0.08153205909614925, "learning_rate": 2.1371479223159862e-05, "loss": 0.2433, "step": 4331 }, { "epoch": 2.428251121076233, "grad_norm": 0.0786338410238144, "learning_rate": 2.1331191218731783e-05, "loss": 0.2348, "step": 4332 }, { "epoch": 2.4288116591928253, "grad_norm": 0.07898342934560665, "learning_rate": 2.1290936689979047e-05, "loss": 0.2588, "step": 4333 }, { "epoch": 2.429372197309417, "grad_norm": 0.08115594101252847, "learning_rate": 2.125071565403104e-05, "loss": 0.2553, "step": 4334 }, { "epoch": 2.429932735426009, "grad_norm": 0.08148196147954538, "learning_rate": 2.1210528128002904e-05, "loss": 0.2451, "step": 4335 }, { "epoch": 2.430493273542601, "grad_norm": 0.07611515539965039, "learning_rate": 2.1170374128995507e-05, "loss": 0.2385, "step": 4336 }, { "epoch": 2.4310538116591927, "grad_norm": 0.07760886178051588, "learning_rate": 2.1130253674095435e-05, "loss": 0.2397, "step": 4337 }, { "epoch": 2.431614349775785, "grad_norm": 0.07889059788244317, "learning_rate": 2.1090166780374975e-05, "loss": 0.2472, "step": 4338 }, { "epoch": 2.432174887892377, "grad_norm": 0.07976461126208553, "learning_rate": 2.105011346489224e-05, "loss": 0.2487, "step": 4339 }, { "epoch": 2.4327354260089686, "grad_norm": 0.07951958202780855, "learning_rate": 2.1010093744690908e-05, "loss": 0.2333, "step": 4340 }, { "epoch": 2.4332959641255605, "grad_norm": 0.07962954473906425, "learning_rate": 2.0970107636800495e-05, "loss": 0.2429, "step": 4341 }, { "epoch": 2.4338565022421523, "grad_norm": 0.0794323731347504, "learning_rate": 2.093015515823612e-05, "loss": 0.2413, "step": 4342 }, { "epoch": 2.4344170403587446, "grad_norm": 0.07952549060784866, "learning_rate": 2.0890236325998635e-05, "loss": 0.2373, "step": 4343 }, { "epoch": 2.4349775784753365, "grad_norm": 0.07917474803448454, "learning_rate": 2.0850351157074598e-05, "loss": 0.2429, "step": 4344 }, { "epoch": 2.4355381165919283, "grad_norm": 0.07913531651724788, "learning_rate": 2.0810499668436166e-05, "loss": 0.235, "step": 4345 }, { "epoch": 2.43609865470852, "grad_norm": 0.08034038891466093, "learning_rate": 2.0770681877041253e-05, "loss": 0.2482, "step": 4346 }, { "epoch": 2.436659192825112, "grad_norm": 0.07891006005804807, "learning_rate": 2.0730897799833348e-05, "loss": 0.2391, "step": 4347 }, { "epoch": 2.437219730941704, "grad_norm": 0.08217772709245133, "learning_rate": 2.0691147453741687e-05, "loss": 0.2473, "step": 4348 }, { "epoch": 2.437780269058296, "grad_norm": 0.07853254914239965, "learning_rate": 2.0651430855681064e-05, "loss": 0.2496, "step": 4349 }, { "epoch": 2.438340807174888, "grad_norm": 0.08087459568714833, "learning_rate": 2.0611748022551936e-05, "loss": 0.2319, "step": 4350 }, { "epoch": 2.43890134529148, "grad_norm": 0.08140349728833127, "learning_rate": 2.057209897124043e-05, "loss": 0.2495, "step": 4351 }, { "epoch": 2.4394618834080717, "grad_norm": 0.08125526010914347, "learning_rate": 2.0532483718618267e-05, "loss": 0.2531, "step": 4352 }, { "epoch": 2.4400224215246635, "grad_norm": 0.07734467440134396, "learning_rate": 2.0492902281542836e-05, "loss": 0.2414, "step": 4353 }, { "epoch": 2.4405829596412554, "grad_norm": 0.07660232671184296, "learning_rate": 2.045335467685703e-05, "loss": 0.2381, "step": 4354 }, { "epoch": 2.4411434977578477, "grad_norm": 0.08019024531719846, "learning_rate": 2.041384092138946e-05, "loss": 0.2464, "step": 4355 }, { "epoch": 2.4417040358744395, "grad_norm": 0.07706511284846577, "learning_rate": 2.037436103195426e-05, "loss": 0.2318, "step": 4356 }, { "epoch": 2.4422645739910314, "grad_norm": 0.08246826832148706, "learning_rate": 2.0334915025351142e-05, "loss": 0.2538, "step": 4357 }, { "epoch": 2.442825112107623, "grad_norm": 0.08263485684952807, "learning_rate": 2.0295502918365472e-05, "loss": 0.2487, "step": 4358 }, { "epoch": 2.443385650224215, "grad_norm": 0.07687059585038453, "learning_rate": 2.0256124727768143e-05, "loss": 0.2405, "step": 4359 }, { "epoch": 2.4439461883408073, "grad_norm": 0.0765570939471235, "learning_rate": 2.0216780470315655e-05, "loss": 0.2242, "step": 4360 }, { "epoch": 2.444506726457399, "grad_norm": 0.07908731441849903, "learning_rate": 2.017747016274999e-05, "loss": 0.2421, "step": 4361 }, { "epoch": 2.445067264573991, "grad_norm": 0.08001846364376108, "learning_rate": 2.013819382179878e-05, "loss": 0.2452, "step": 4362 }, { "epoch": 2.445627802690583, "grad_norm": 0.07592919032742099, "learning_rate": 2.009895146417512e-05, "loss": 0.2372, "step": 4363 }, { "epoch": 2.4461883408071747, "grad_norm": 0.07982023578857524, "learning_rate": 2.0059743106577654e-05, "loss": 0.2551, "step": 4364 }, { "epoch": 2.446748878923767, "grad_norm": 0.08070692659023758, "learning_rate": 2.002056876569066e-05, "loss": 0.2476, "step": 4365 }, { "epoch": 2.447309417040359, "grad_norm": 0.08002500517767909, "learning_rate": 1.9981428458183792e-05, "loss": 0.2384, "step": 4366 }, { "epoch": 2.4478699551569507, "grad_norm": 0.07873885223419161, "learning_rate": 1.9942322200712315e-05, "loss": 0.2433, "step": 4367 }, { "epoch": 2.4484304932735426, "grad_norm": 0.08066244265538584, "learning_rate": 1.9903250009916997e-05, "loss": 0.2543, "step": 4368 }, { "epoch": 2.4489910313901344, "grad_norm": 0.08182267861637552, "learning_rate": 1.9864211902424123e-05, "loss": 0.2464, "step": 4369 }, { "epoch": 2.4495515695067267, "grad_norm": 0.07793138319209078, "learning_rate": 1.982520789484541e-05, "loss": 0.2365, "step": 4370 }, { "epoch": 2.4501121076233185, "grad_norm": 0.07729500337141762, "learning_rate": 1.978623800377809e-05, "loss": 0.238, "step": 4371 }, { "epoch": 2.4506726457399104, "grad_norm": 0.0785283191450988, "learning_rate": 1.9747302245804945e-05, "loss": 0.2551, "step": 4372 }, { "epoch": 2.4512331838565022, "grad_norm": 0.07754012827939691, "learning_rate": 1.9708400637494105e-05, "loss": 0.2407, "step": 4373 }, { "epoch": 2.451793721973094, "grad_norm": 0.07908752512751777, "learning_rate": 1.9669533195399316e-05, "loss": 0.2491, "step": 4374 }, { "epoch": 2.452354260089686, "grad_norm": 0.07769777019386097, "learning_rate": 1.963069993605964e-05, "loss": 0.2495, "step": 4375 }, { "epoch": 2.452914798206278, "grad_norm": 0.07953808071860707, "learning_rate": 1.9591900875999703e-05, "loss": 0.2493, "step": 4376 }, { "epoch": 2.45347533632287, "grad_norm": 0.0767251163133952, "learning_rate": 1.955313603172957e-05, "loss": 0.2366, "step": 4377 }, { "epoch": 2.454035874439462, "grad_norm": 0.0772876732653098, "learning_rate": 1.9514405419744654e-05, "loss": 0.2377, "step": 4378 }, { "epoch": 2.4545964125560538, "grad_norm": 0.07802199616568897, "learning_rate": 1.9475709056525905e-05, "loss": 0.2407, "step": 4379 }, { "epoch": 2.4551569506726456, "grad_norm": 0.08125062605646792, "learning_rate": 1.943704695853963e-05, "loss": 0.2457, "step": 4380 }, { "epoch": 2.4557174887892375, "grad_norm": 0.07950019725415046, "learning_rate": 1.939841914223761e-05, "loss": 0.2322, "step": 4381 }, { "epoch": 2.4562780269058297, "grad_norm": 0.07970714171355352, "learning_rate": 1.935982562405698e-05, "loss": 0.2428, "step": 4382 }, { "epoch": 2.4568385650224216, "grad_norm": 0.07860283694525858, "learning_rate": 1.932126642042035e-05, "loss": 0.2372, "step": 4383 }, { "epoch": 2.4573991031390134, "grad_norm": 0.0799753943160765, "learning_rate": 1.9282741547735637e-05, "loss": 0.2363, "step": 4384 }, { "epoch": 2.4579596412556053, "grad_norm": 0.07884053181074349, "learning_rate": 1.9244251022396233e-05, "loss": 0.2486, "step": 4385 }, { "epoch": 2.458520179372197, "grad_norm": 0.0818453974140935, "learning_rate": 1.9205794860780914e-05, "loss": 0.2538, "step": 4386 }, { "epoch": 2.4590807174887894, "grad_norm": 0.07779196140984937, "learning_rate": 1.9167373079253727e-05, "loss": 0.2409, "step": 4387 }, { "epoch": 2.4596412556053813, "grad_norm": 0.08201345727344617, "learning_rate": 1.9128985694164237e-05, "loss": 0.2426, "step": 4388 }, { "epoch": 2.460201793721973, "grad_norm": 0.08178188182088565, "learning_rate": 1.909063272184727e-05, "loss": 0.2403, "step": 4389 }, { "epoch": 2.460762331838565, "grad_norm": 0.07567328625492045, "learning_rate": 1.9052314178623008e-05, "loss": 0.236, "step": 4390 }, { "epoch": 2.461322869955157, "grad_norm": 0.07804704174016891, "learning_rate": 1.901403008079704e-05, "loss": 0.2406, "step": 4391 }, { "epoch": 2.461883408071749, "grad_norm": 0.08055788967776399, "learning_rate": 1.8975780444660273e-05, "loss": 0.2454, "step": 4392 }, { "epoch": 2.462443946188341, "grad_norm": 0.07939038698067213, "learning_rate": 1.8937565286488966e-05, "loss": 0.2451, "step": 4393 }, { "epoch": 2.463004484304933, "grad_norm": 0.07808079204750822, "learning_rate": 1.8899384622544646e-05, "loss": 0.2437, "step": 4394 }, { "epoch": 2.4635650224215246, "grad_norm": 0.07847211429675839, "learning_rate": 1.8861238469074248e-05, "loss": 0.249, "step": 4395 }, { "epoch": 2.4641255605381165, "grad_norm": 0.08213501625298139, "learning_rate": 1.8823126842309956e-05, "loss": 0.2408, "step": 4396 }, { "epoch": 2.4646860986547083, "grad_norm": 0.07848382733862806, "learning_rate": 1.8785049758469254e-05, "loss": 0.2408, "step": 4397 }, { "epoch": 2.4652466367713006, "grad_norm": 0.07943947355086714, "learning_rate": 1.8747007233755e-05, "loss": 0.2443, "step": 4398 }, { "epoch": 2.4658071748878925, "grad_norm": 0.08090414909413521, "learning_rate": 1.8708999284355266e-05, "loss": 0.246, "step": 4399 }, { "epoch": 2.4663677130044843, "grad_norm": 0.07786272828440925, "learning_rate": 1.8671025926443465e-05, "loss": 0.2367, "step": 4400 }, { "epoch": 2.466928251121076, "grad_norm": 0.08054905391377483, "learning_rate": 1.8633087176178276e-05, "loss": 0.2455, "step": 4401 }, { "epoch": 2.467488789237668, "grad_norm": 0.08390852101212544, "learning_rate": 1.8595183049703668e-05, "loss": 0.2475, "step": 4402 }, { "epoch": 2.46804932735426, "grad_norm": 0.08319907705555271, "learning_rate": 1.8557313563148847e-05, "loss": 0.251, "step": 4403 }, { "epoch": 2.468609865470852, "grad_norm": 0.07819796437479647, "learning_rate": 1.8519478732628247e-05, "loss": 0.2385, "step": 4404 }, { "epoch": 2.469170403587444, "grad_norm": 0.08300631738946596, "learning_rate": 1.8481678574241646e-05, "loss": 0.2566, "step": 4405 }, { "epoch": 2.469730941704036, "grad_norm": 0.07987810854223557, "learning_rate": 1.8443913104073983e-05, "loss": 0.2356, "step": 4406 }, { "epoch": 2.4702914798206277, "grad_norm": 0.07729763414437077, "learning_rate": 1.840618233819552e-05, "loss": 0.226, "step": 4407 }, { "epoch": 2.4708520179372195, "grad_norm": 0.0788044560119795, "learning_rate": 1.8368486292661657e-05, "loss": 0.2546, "step": 4408 }, { "epoch": 2.471412556053812, "grad_norm": 0.07870547075686708, "learning_rate": 1.8330824983513072e-05, "loss": 0.2271, "step": 4409 }, { "epoch": 2.4719730941704037, "grad_norm": 0.07736564087421319, "learning_rate": 1.829319842677569e-05, "loss": 0.2381, "step": 4410 }, { "epoch": 2.4725336322869955, "grad_norm": 0.07917595786659042, "learning_rate": 1.8255606638460576e-05, "loss": 0.2427, "step": 4411 }, { "epoch": 2.4730941704035874, "grad_norm": 0.07929542482453235, "learning_rate": 1.8218049634564082e-05, "loss": 0.2315, "step": 4412 }, { "epoch": 2.473654708520179, "grad_norm": 0.08353627003581249, "learning_rate": 1.818052743106766e-05, "loss": 0.2601, "step": 4413 }, { "epoch": 2.4742152466367715, "grad_norm": 0.07782654990584914, "learning_rate": 1.8143040043938054e-05, "loss": 0.2491, "step": 4414 }, { "epoch": 2.4747757847533634, "grad_norm": 0.07936809537464609, "learning_rate": 1.8105587489127106e-05, "loss": 0.2323, "step": 4415 }, { "epoch": 2.475336322869955, "grad_norm": 0.07816231296613192, "learning_rate": 1.806816978257192e-05, "loss": 0.241, "step": 4416 }, { "epoch": 2.475896860986547, "grad_norm": 0.07749425776392747, "learning_rate": 1.8030786940194688e-05, "loss": 0.243, "step": 4417 }, { "epoch": 2.476457399103139, "grad_norm": 0.08105678697332463, "learning_rate": 1.799343897790282e-05, "loss": 0.244, "step": 4418 }, { "epoch": 2.477017937219731, "grad_norm": 0.07967362420984159, "learning_rate": 1.7956125911588893e-05, "loss": 0.2442, "step": 4419 }, { "epoch": 2.477578475336323, "grad_norm": 0.07924519560102106, "learning_rate": 1.7918847757130575e-05, "loss": 0.2378, "step": 4420 }, { "epoch": 2.478139013452915, "grad_norm": 0.07848148392860445, "learning_rate": 1.788160453039075e-05, "loss": 0.2345, "step": 4421 }, { "epoch": 2.4786995515695067, "grad_norm": 0.07788126157653065, "learning_rate": 1.7844396247217354e-05, "loss": 0.2431, "step": 4422 }, { "epoch": 2.4792600896860986, "grad_norm": 0.07936994105987126, "learning_rate": 1.7807222923443567e-05, "loss": 0.2478, "step": 4423 }, { "epoch": 2.4798206278026904, "grad_norm": 0.07983013467764592, "learning_rate": 1.7770084574887567e-05, "loss": 0.2601, "step": 4424 }, { "epoch": 2.4803811659192827, "grad_norm": 0.08100937582379807, "learning_rate": 1.773298121735275e-05, "loss": 0.242, "step": 4425 }, { "epoch": 2.4809417040358746, "grad_norm": 0.07860663922366933, "learning_rate": 1.7695912866627595e-05, "loss": 0.2522, "step": 4426 }, { "epoch": 2.4815022421524664, "grad_norm": 0.07825753850876958, "learning_rate": 1.7658879538485628e-05, "loss": 0.239, "step": 4427 }, { "epoch": 2.4820627802690582, "grad_norm": 0.0841305919680388, "learning_rate": 1.762188124868557e-05, "loss": 0.2531, "step": 4428 }, { "epoch": 2.48262331838565, "grad_norm": 0.08114721060009097, "learning_rate": 1.758491801297114e-05, "loss": 0.241, "step": 4429 }, { "epoch": 2.483183856502242, "grad_norm": 0.07930211089835233, "learning_rate": 1.7547989847071178e-05, "loss": 0.2486, "step": 4430 }, { "epoch": 2.4837443946188342, "grad_norm": 0.07808225378957155, "learning_rate": 1.7511096766699643e-05, "loss": 0.235, "step": 4431 }, { "epoch": 2.484304932735426, "grad_norm": 0.08022252134741344, "learning_rate": 1.7474238787555476e-05, "loss": 0.2421, "step": 4432 }, { "epoch": 2.484865470852018, "grad_norm": 0.08119256917900608, "learning_rate": 1.7437415925322743e-05, "loss": 0.2342, "step": 4433 }, { "epoch": 2.4854260089686098, "grad_norm": 0.07989719398209497, "learning_rate": 1.7400628195670578e-05, "loss": 0.2607, "step": 4434 }, { "epoch": 2.4859865470852016, "grad_norm": 0.0833588246480297, "learning_rate": 1.7363875614253135e-05, "loss": 0.2464, "step": 4435 }, { "epoch": 2.486547085201794, "grad_norm": 0.07816092133763146, "learning_rate": 1.7327158196709613e-05, "loss": 0.2388, "step": 4436 }, { "epoch": 2.4871076233183858, "grad_norm": 0.08122459364730421, "learning_rate": 1.729047595866422e-05, "loss": 0.2383, "step": 4437 }, { "epoch": 2.4876681614349776, "grad_norm": 0.0815383675823079, "learning_rate": 1.725382891572629e-05, "loss": 0.2602, "step": 4438 }, { "epoch": 2.4882286995515694, "grad_norm": 0.0802074151304083, "learning_rate": 1.7217217083490044e-05, "loss": 0.2468, "step": 4439 }, { "epoch": 2.4887892376681613, "grad_norm": 0.07881996617835439, "learning_rate": 1.7180640477534847e-05, "loss": 0.2472, "step": 4440 }, { "epoch": 2.4893497757847536, "grad_norm": 0.07896209214879413, "learning_rate": 1.7144099113424984e-05, "loss": 0.2496, "step": 4441 }, { "epoch": 2.4899103139013454, "grad_norm": 0.07970756377176035, "learning_rate": 1.7107593006709798e-05, "loss": 0.2469, "step": 4442 }, { "epoch": 2.4904708520179373, "grad_norm": 0.07799921125634339, "learning_rate": 1.7071122172923636e-05, "loss": 0.2391, "step": 4443 }, { "epoch": 2.491031390134529, "grad_norm": 0.07928685093016447, "learning_rate": 1.7034686627585762e-05, "loss": 0.241, "step": 4444 }, { "epoch": 2.491591928251121, "grad_norm": 0.07984796699906008, "learning_rate": 1.6998286386200503e-05, "loss": 0.2365, "step": 4445 }, { "epoch": 2.492152466367713, "grad_norm": 0.08129675139741374, "learning_rate": 1.6961921464257114e-05, "loss": 0.24, "step": 4446 }, { "epoch": 2.492713004484305, "grad_norm": 0.0791719916919716, "learning_rate": 1.6925591877229863e-05, "loss": 0.2448, "step": 4447 }, { "epoch": 2.493273542600897, "grad_norm": 0.08149933503813549, "learning_rate": 1.6889297640577915e-05, "loss": 0.2466, "step": 4448 }, { "epoch": 2.493834080717489, "grad_norm": 0.08132523919018882, "learning_rate": 1.6853038769745467e-05, "loss": 0.2304, "step": 4449 }, { "epoch": 2.4943946188340806, "grad_norm": 0.07958570040183402, "learning_rate": 1.681681528016166e-05, "loss": 0.2495, "step": 4450 }, { "epoch": 2.4949551569506725, "grad_norm": 0.07854599401912843, "learning_rate": 1.6780627187240493e-05, "loss": 0.2408, "step": 4451 }, { "epoch": 2.4955156950672643, "grad_norm": 0.07854173988150537, "learning_rate": 1.6744474506381023e-05, "loss": 0.2357, "step": 4452 }, { "epoch": 2.4960762331838566, "grad_norm": 0.07606243551369524, "learning_rate": 1.670835725296713e-05, "loss": 0.2399, "step": 4453 }, { "epoch": 2.4966367713004485, "grad_norm": 0.08173527176087826, "learning_rate": 1.667227544236771e-05, "loss": 0.2392, "step": 4454 }, { "epoch": 2.4971973094170403, "grad_norm": 0.08425758700292559, "learning_rate": 1.6636229089936496e-05, "loss": 0.251, "step": 4455 }, { "epoch": 2.497757847533632, "grad_norm": 0.0786189663188184, "learning_rate": 1.660021821101222e-05, "loss": 0.2376, "step": 4456 }, { "epoch": 2.498318385650224, "grad_norm": 0.07850242465631987, "learning_rate": 1.6564242820918418e-05, "loss": 0.2444, "step": 4457 }, { "epoch": 2.4988789237668163, "grad_norm": 0.08449997838843436, "learning_rate": 1.65283029349636e-05, "loss": 0.2484, "step": 4458 }, { "epoch": 2.499439461883408, "grad_norm": 0.07851891897220185, "learning_rate": 1.649239856844117e-05, "loss": 0.2407, "step": 4459 }, { "epoch": 2.5, "grad_norm": 0.0765513225178551, "learning_rate": 1.6456529736629345e-05, "loss": 0.2254, "step": 4460 }, { "epoch": 2.500560538116592, "grad_norm": 0.08363321984849857, "learning_rate": 1.6420696454791328e-05, "loss": 0.2535, "step": 4461 }, { "epoch": 2.5011210762331837, "grad_norm": 0.07737282235847463, "learning_rate": 1.638489873817508e-05, "loss": 0.2353, "step": 4462 }, { "epoch": 2.501681614349776, "grad_norm": 0.08120031383422203, "learning_rate": 1.6349136602013527e-05, "loss": 0.2469, "step": 4463 }, { "epoch": 2.502242152466368, "grad_norm": 0.07968539336795967, "learning_rate": 1.6313410061524393e-05, "loss": 0.2435, "step": 4464 }, { "epoch": 2.5028026905829597, "grad_norm": 0.07712043740829289, "learning_rate": 1.627771913191024e-05, "loss": 0.2362, "step": 4465 }, { "epoch": 2.5033632286995515, "grad_norm": 0.07857567724201052, "learning_rate": 1.6242063828358544e-05, "loss": 0.2334, "step": 4466 }, { "epoch": 2.5039237668161434, "grad_norm": 0.08151544220668971, "learning_rate": 1.620644416604159e-05, "loss": 0.249, "step": 4467 }, { "epoch": 2.5044843049327357, "grad_norm": 0.07972918683534676, "learning_rate": 1.617086016011652e-05, "loss": 0.2444, "step": 4468 }, { "epoch": 2.5050448430493275, "grad_norm": 0.07812113755382295, "learning_rate": 1.6135311825725208e-05, "loss": 0.2418, "step": 4469 }, { "epoch": 2.5056053811659194, "grad_norm": 0.07748075153834869, "learning_rate": 1.609979917799449e-05, "loss": 0.2316, "step": 4470 }, { "epoch": 2.506165919282511, "grad_norm": 0.08065539286292779, "learning_rate": 1.60643222320359e-05, "loss": 0.2405, "step": 4471 }, { "epoch": 2.506726457399103, "grad_norm": 0.08030272895310879, "learning_rate": 1.6028881002945818e-05, "loss": 0.2536, "step": 4472 }, { "epoch": 2.5072869955156953, "grad_norm": 0.07938423577475674, "learning_rate": 1.5993475505805467e-05, "loss": 0.2413, "step": 4473 }, { "epoch": 2.5078475336322867, "grad_norm": 0.07728689118560435, "learning_rate": 1.5958105755680795e-05, "loss": 0.2467, "step": 4474 }, { "epoch": 2.508408071748879, "grad_norm": 0.08172159601220094, "learning_rate": 1.5922771767622592e-05, "loss": 0.2443, "step": 4475 }, { "epoch": 2.508968609865471, "grad_norm": 0.08162774951123969, "learning_rate": 1.588747355666642e-05, "loss": 0.2461, "step": 4476 }, { "epoch": 2.5095291479820627, "grad_norm": 0.08042985764171678, "learning_rate": 1.5852211137832583e-05, "loss": 0.2419, "step": 4477 }, { "epoch": 2.5100896860986546, "grad_norm": 0.08105304381057965, "learning_rate": 1.5816984526126222e-05, "loss": 0.2359, "step": 4478 }, { "epoch": 2.5106502242152464, "grad_norm": 0.07763512135806293, "learning_rate": 1.5781793736537143e-05, "loss": 0.2441, "step": 4479 }, { "epoch": 2.5112107623318387, "grad_norm": 0.08043991943257807, "learning_rate": 1.5746638784040025e-05, "loss": 0.2343, "step": 4480 }, { "epoch": 2.5117713004484306, "grad_norm": 0.07809864052161128, "learning_rate": 1.5711519683594188e-05, "loss": 0.2352, "step": 4481 }, { "epoch": 2.5123318385650224, "grad_norm": 0.07785867561014251, "learning_rate": 1.567643645014376e-05, "loss": 0.2389, "step": 4482 }, { "epoch": 2.5128923766816142, "grad_norm": 0.07777156654471881, "learning_rate": 1.564138909861762e-05, "loss": 0.23, "step": 4483 }, { "epoch": 2.513452914798206, "grad_norm": 0.07725422661411521, "learning_rate": 1.5606377643929304e-05, "loss": 0.2295, "step": 4484 }, { "epoch": 2.5140134529147984, "grad_norm": 0.07834660994788373, "learning_rate": 1.5571402100977163e-05, "loss": 0.2324, "step": 4485 }, { "epoch": 2.5145739910313902, "grad_norm": 0.0805634437751587, "learning_rate": 1.5536462484644187e-05, "loss": 0.2349, "step": 4486 }, { "epoch": 2.515134529147982, "grad_norm": 0.07766852791708763, "learning_rate": 1.5501558809798154e-05, "loss": 0.2266, "step": 4487 }, { "epoch": 2.515695067264574, "grad_norm": 0.08147062811926131, "learning_rate": 1.5466691091291454e-05, "loss": 0.2446, "step": 4488 }, { "epoch": 2.5162556053811658, "grad_norm": 0.0795981439577369, "learning_rate": 1.5431859343961284e-05, "loss": 0.2267, "step": 4489 }, { "epoch": 2.516816143497758, "grad_norm": 0.0814700308689247, "learning_rate": 1.5397063582629445e-05, "loss": 0.246, "step": 4490 }, { "epoch": 2.51737668161435, "grad_norm": 0.07933582847866993, "learning_rate": 1.5362303822102466e-05, "loss": 0.2444, "step": 4491 }, { "epoch": 2.5179372197309418, "grad_norm": 0.07714052608508398, "learning_rate": 1.5327580077171587e-05, "loss": 0.2359, "step": 4492 }, { "epoch": 2.5184977578475336, "grad_norm": 0.0790934573509968, "learning_rate": 1.5292892362612642e-05, "loss": 0.2356, "step": 4493 }, { "epoch": 2.5190582959641254, "grad_norm": 0.0803605935968346, "learning_rate": 1.525824069318621e-05, "loss": 0.2407, "step": 4494 }, { "epoch": 2.5196188340807177, "grad_norm": 0.08167257081183345, "learning_rate": 1.5223625083637471e-05, "loss": 0.2335, "step": 4495 }, { "epoch": 2.520179372197309, "grad_norm": 0.07919293163948786, "learning_rate": 1.5189045548696323e-05, "loss": 0.2418, "step": 4496 }, { "epoch": 2.5207399103139014, "grad_norm": 0.07895571155816208, "learning_rate": 1.5154502103077261e-05, "loss": 0.2444, "step": 4497 }, { "epoch": 2.5213004484304933, "grad_norm": 0.08321216825372495, "learning_rate": 1.5119994761479427e-05, "loss": 0.2598, "step": 4498 }, { "epoch": 2.521860986547085, "grad_norm": 0.0804118459280637, "learning_rate": 1.5085523538586632e-05, "loss": 0.2378, "step": 4499 }, { "epoch": 2.522421524663677, "grad_norm": 0.08262966477108134, "learning_rate": 1.5051088449067285e-05, "loss": 0.2532, "step": 4500 }, { "epoch": 2.522982062780269, "grad_norm": 0.080991477269032, "learning_rate": 1.5016689507574488e-05, "loss": 0.2493, "step": 4501 }, { "epoch": 2.523542600896861, "grad_norm": 0.08076073788813379, "learning_rate": 1.4982326728745843e-05, "loss": 0.2388, "step": 4502 }, { "epoch": 2.524103139013453, "grad_norm": 0.08012089860702121, "learning_rate": 1.4948000127203666e-05, "loss": 0.2462, "step": 4503 }, { "epoch": 2.524663677130045, "grad_norm": 0.07946309313108386, "learning_rate": 1.4913709717554836e-05, "loss": 0.2399, "step": 4504 }, { "epoch": 2.5252242152466366, "grad_norm": 0.07700977642894151, "learning_rate": 1.4879455514390816e-05, "loss": 0.236, "step": 4505 }, { "epoch": 2.5257847533632285, "grad_norm": 0.08192900961330732, "learning_rate": 1.4845237532287704e-05, "loss": 0.2569, "step": 4506 }, { "epoch": 2.526345291479821, "grad_norm": 0.07732857927297841, "learning_rate": 1.4811055785806138e-05, "loss": 0.2301, "step": 4507 }, { "epoch": 2.5269058295964126, "grad_norm": 0.07881332488983218, "learning_rate": 1.4776910289491385e-05, "loss": 0.2385, "step": 4508 }, { "epoch": 2.5274663677130045, "grad_norm": 0.0798930794260508, "learning_rate": 1.4742801057873257e-05, "loss": 0.2475, "step": 4509 }, { "epoch": 2.5280269058295963, "grad_norm": 0.08099140742978495, "learning_rate": 1.4708728105466163e-05, "loss": 0.2605, "step": 4510 }, { "epoch": 2.528587443946188, "grad_norm": 0.08039335763500817, "learning_rate": 1.467469144676904e-05, "loss": 0.2542, "step": 4511 }, { "epoch": 2.5291479820627805, "grad_norm": 0.08032503381937363, "learning_rate": 1.4640691096265358e-05, "loss": 0.235, "step": 4512 }, { "epoch": 2.5297085201793723, "grad_norm": 0.07911745865309457, "learning_rate": 1.460672706842323e-05, "loss": 0.2452, "step": 4513 }, { "epoch": 2.530269058295964, "grad_norm": 0.07906123109904009, "learning_rate": 1.45727993776952e-05, "loss": 0.2375, "step": 4514 }, { "epoch": 2.530829596412556, "grad_norm": 0.08147152535880466, "learning_rate": 1.4538908038518428e-05, "loss": 0.2424, "step": 4515 }, { "epoch": 2.531390134529148, "grad_norm": 0.07972598814904243, "learning_rate": 1.4505053065314611e-05, "loss": 0.2451, "step": 4516 }, { "epoch": 2.53195067264574, "grad_norm": 0.07841395503999708, "learning_rate": 1.4471234472489892e-05, "loss": 0.2425, "step": 4517 }, { "epoch": 2.532511210762332, "grad_norm": 0.08000426459048555, "learning_rate": 1.4437452274435037e-05, "loss": 0.2436, "step": 4518 }, { "epoch": 2.533071748878924, "grad_norm": 0.08122733185769104, "learning_rate": 1.4403706485525225e-05, "loss": 0.2433, "step": 4519 }, { "epoch": 2.5336322869955157, "grad_norm": 0.0786567013200513, "learning_rate": 1.4369997120120227e-05, "loss": 0.2234, "step": 4520 }, { "epoch": 2.5341928251121075, "grad_norm": 0.07672949608216117, "learning_rate": 1.4336324192564232e-05, "loss": 0.2482, "step": 4521 }, { "epoch": 2.5347533632287, "grad_norm": 0.07997039882139345, "learning_rate": 1.4302687717186014e-05, "loss": 0.2311, "step": 4522 }, { "epoch": 2.535313901345291, "grad_norm": 0.07938003456651439, "learning_rate": 1.4269087708298755e-05, "loss": 0.2455, "step": 4523 }, { "epoch": 2.5358744394618835, "grad_norm": 0.07928533776455994, "learning_rate": 1.4235524180200166e-05, "loss": 0.2432, "step": 4524 }, { "epoch": 2.5364349775784754, "grad_norm": 0.08194063261809265, "learning_rate": 1.4201997147172453e-05, "loss": 0.2521, "step": 4525 }, { "epoch": 2.536995515695067, "grad_norm": 0.07946429322564628, "learning_rate": 1.4168506623482202e-05, "loss": 0.2298, "step": 4526 }, { "epoch": 2.537556053811659, "grad_norm": 0.08106337835210105, "learning_rate": 1.4135052623380596e-05, "loss": 0.2518, "step": 4527 }, { "epoch": 2.538116591928251, "grad_norm": 0.07957732161781982, "learning_rate": 1.4101635161103132e-05, "loss": 0.2368, "step": 4528 }, { "epoch": 2.538677130044843, "grad_norm": 0.07883091820953315, "learning_rate": 1.4068254250869895e-05, "loss": 0.2348, "step": 4529 }, { "epoch": 2.539237668161435, "grad_norm": 0.08068664223577357, "learning_rate": 1.4034909906885308e-05, "loss": 0.2479, "step": 4530 }, { "epoch": 2.539798206278027, "grad_norm": 0.0786028594120897, "learning_rate": 1.4001602143338277e-05, "loss": 0.2478, "step": 4531 }, { "epoch": 2.5403587443946187, "grad_norm": 0.08154351406638181, "learning_rate": 1.3968330974402166e-05, "loss": 0.2276, "step": 4532 }, { "epoch": 2.5409192825112106, "grad_norm": 0.07803866700309083, "learning_rate": 1.3935096414234728e-05, "loss": 0.2349, "step": 4533 }, { "epoch": 2.541479820627803, "grad_norm": 0.08112742036593473, "learning_rate": 1.390189847697818e-05, "loss": 0.2348, "step": 4534 }, { "epoch": 2.5420403587443947, "grad_norm": 0.07816987304461699, "learning_rate": 1.3868737176759106e-05, "loss": 0.2319, "step": 4535 }, { "epoch": 2.5426008968609866, "grad_norm": 0.08162227289399063, "learning_rate": 1.3835612527688536e-05, "loss": 0.2415, "step": 4536 }, { "epoch": 2.5431614349775784, "grad_norm": 0.07834022741575637, "learning_rate": 1.3802524543861894e-05, "loss": 0.2402, "step": 4537 }, { "epoch": 2.5437219730941703, "grad_norm": 0.07705607029820112, "learning_rate": 1.3769473239358965e-05, "loss": 0.2384, "step": 4538 }, { "epoch": 2.5442825112107625, "grad_norm": 0.08090779714416466, "learning_rate": 1.3736458628244008e-05, "loss": 0.2371, "step": 4539 }, { "epoch": 2.5448430493273544, "grad_norm": 0.07997151891543049, "learning_rate": 1.3703480724565577e-05, "loss": 0.2497, "step": 4540 }, { "epoch": 2.5454035874439462, "grad_norm": 0.0775203468999416, "learning_rate": 1.3670539542356664e-05, "loss": 0.2373, "step": 4541 }, { "epoch": 2.545964125560538, "grad_norm": 0.07965788466092727, "learning_rate": 1.3637635095634626e-05, "loss": 0.2484, "step": 4542 }, { "epoch": 2.54652466367713, "grad_norm": 0.07914207365379075, "learning_rate": 1.3604767398401208e-05, "loss": 0.2377, "step": 4543 }, { "epoch": 2.547085201793722, "grad_norm": 0.08037478126061315, "learning_rate": 1.3571936464642466e-05, "loss": 0.2328, "step": 4544 }, { "epoch": 2.547645739910314, "grad_norm": 0.08045814110102133, "learning_rate": 1.353914230832881e-05, "loss": 0.2465, "step": 4545 }, { "epoch": 2.548206278026906, "grad_norm": 0.08201626906455045, "learning_rate": 1.3506384943415084e-05, "loss": 0.2508, "step": 4546 }, { "epoch": 2.5487668161434978, "grad_norm": 0.07948458979602914, "learning_rate": 1.3473664383840367e-05, "loss": 0.2457, "step": 4547 }, { "epoch": 2.5493273542600896, "grad_norm": 0.07907056490097471, "learning_rate": 1.3440980643528144e-05, "loss": 0.2385, "step": 4548 }, { "epoch": 2.5498878923766815, "grad_norm": 0.07622766870336829, "learning_rate": 1.3408333736386236e-05, "loss": 0.2344, "step": 4549 }, { "epoch": 2.5504484304932733, "grad_norm": 0.08030827437419959, "learning_rate": 1.3375723676306783e-05, "loss": 0.2343, "step": 4550 }, { "epoch": 2.5510089686098656, "grad_norm": 0.07961449378768444, "learning_rate": 1.3343150477166222e-05, "loss": 0.2433, "step": 4551 }, { "epoch": 2.5515695067264574, "grad_norm": 0.0806096282528269, "learning_rate": 1.3310614152825273e-05, "loss": 0.2318, "step": 4552 }, { "epoch": 2.5521300448430493, "grad_norm": 0.08180752786629299, "learning_rate": 1.3278114717129087e-05, "loss": 0.2422, "step": 4553 }, { "epoch": 2.552690582959641, "grad_norm": 0.0806910267206613, "learning_rate": 1.3245652183906965e-05, "loss": 0.2427, "step": 4554 }, { "epoch": 2.553251121076233, "grad_norm": 0.08057267283346009, "learning_rate": 1.3213226566972647e-05, "loss": 0.2307, "step": 4555 }, { "epoch": 2.5538116591928253, "grad_norm": 0.08034367523185901, "learning_rate": 1.3180837880124041e-05, "loss": 0.242, "step": 4556 }, { "epoch": 2.554372197309417, "grad_norm": 0.07940310528470543, "learning_rate": 1.3148486137143423e-05, "loss": 0.2405, "step": 4557 }, { "epoch": 2.554932735426009, "grad_norm": 0.0813902006779356, "learning_rate": 1.3116171351797336e-05, "loss": 0.2556, "step": 4558 }, { "epoch": 2.555493273542601, "grad_norm": 0.08056421516881529, "learning_rate": 1.3083893537836556e-05, "loss": 0.2451, "step": 4559 }, { "epoch": 2.5560538116591927, "grad_norm": 0.07870155170299742, "learning_rate": 1.3051652708996177e-05, "loss": 0.2381, "step": 4560 }, { "epoch": 2.556614349775785, "grad_norm": 0.08021855892759801, "learning_rate": 1.3019448878995499e-05, "loss": 0.2356, "step": 4561 }, { "epoch": 2.557174887892377, "grad_norm": 0.0792681406793524, "learning_rate": 1.2987282061538164e-05, "loss": 0.2424, "step": 4562 }, { "epoch": 2.5577354260089686, "grad_norm": 0.08254078751573686, "learning_rate": 1.2955152270311966e-05, "loss": 0.2461, "step": 4563 }, { "epoch": 2.5582959641255605, "grad_norm": 0.07926710899890296, "learning_rate": 1.2923059518988979e-05, "loss": 0.2397, "step": 4564 }, { "epoch": 2.5588565022421523, "grad_norm": 0.08008160747822592, "learning_rate": 1.2891003821225545e-05, "loss": 0.2526, "step": 4565 }, { "epoch": 2.5594170403587446, "grad_norm": 0.08247390082805088, "learning_rate": 1.285898519066221e-05, "loss": 0.2406, "step": 4566 }, { "epoch": 2.5599775784753365, "grad_norm": 0.08026679579463669, "learning_rate": 1.2827003640923784e-05, "loss": 0.2374, "step": 4567 }, { "epoch": 2.5605381165919283, "grad_norm": 0.08066291654705528, "learning_rate": 1.2795059185619229e-05, "loss": 0.2318, "step": 4568 }, { "epoch": 2.56109865470852, "grad_norm": 0.07971569258124504, "learning_rate": 1.2763151838341802e-05, "loss": 0.239, "step": 4569 }, { "epoch": 2.561659192825112, "grad_norm": 0.07783877390894758, "learning_rate": 1.273128161266891e-05, "loss": 0.238, "step": 4570 }, { "epoch": 2.5622197309417043, "grad_norm": 0.07880881028458633, "learning_rate": 1.2699448522162161e-05, "loss": 0.2369, "step": 4571 }, { "epoch": 2.5627802690582957, "grad_norm": 0.07789180805884843, "learning_rate": 1.2667652580367417e-05, "loss": 0.225, "step": 4572 }, { "epoch": 2.563340807174888, "grad_norm": 0.07905274035098453, "learning_rate": 1.2635893800814669e-05, "loss": 0.2441, "step": 4573 }, { "epoch": 2.56390134529148, "grad_norm": 0.08161497148242826, "learning_rate": 1.260417219701815e-05, "loss": 0.2481, "step": 4574 }, { "epoch": 2.5644618834080717, "grad_norm": 0.08324032759367675, "learning_rate": 1.2572487782476228e-05, "loss": 0.251, "step": 4575 }, { "epoch": 2.5650224215246635, "grad_norm": 0.0782085524442898, "learning_rate": 1.2540840570671497e-05, "loss": 0.2401, "step": 4576 }, { "epoch": 2.5655829596412554, "grad_norm": 0.07768410537529105, "learning_rate": 1.2509230575070686e-05, "loss": 0.2539, "step": 4577 }, { "epoch": 2.5661434977578477, "grad_norm": 0.08105766213368072, "learning_rate": 1.2477657809124631e-05, "loss": 0.242, "step": 4578 }, { "epoch": 2.5667040358744395, "grad_norm": 0.07995084175421051, "learning_rate": 1.2446122286268469e-05, "loss": 0.2433, "step": 4579 }, { "epoch": 2.5672645739910314, "grad_norm": 0.07746120906424067, "learning_rate": 1.241462401992134e-05, "loss": 0.2303, "step": 4580 }, { "epoch": 2.567825112107623, "grad_norm": 0.07821609713443424, "learning_rate": 1.238316302348661e-05, "loss": 0.2362, "step": 4581 }, { "epoch": 2.568385650224215, "grad_norm": 0.07921550521842298, "learning_rate": 1.2351739310351795e-05, "loss": 0.2365, "step": 4582 }, { "epoch": 2.5689461883408073, "grad_norm": 0.07781707559113729, "learning_rate": 1.2320352893888532e-05, "loss": 0.2434, "step": 4583 }, { "epoch": 2.569506726457399, "grad_norm": 0.07812556330316417, "learning_rate": 1.2289003787452557e-05, "loss": 0.2217, "step": 4584 }, { "epoch": 2.570067264573991, "grad_norm": 0.07958740262817013, "learning_rate": 1.2257692004383725e-05, "loss": 0.2416, "step": 4585 }, { "epoch": 2.570627802690583, "grad_norm": 0.0769627442836369, "learning_rate": 1.2226417558006087e-05, "loss": 0.2319, "step": 4586 }, { "epoch": 2.5711883408071747, "grad_norm": 0.07679104601067815, "learning_rate": 1.2195180461627698e-05, "loss": 0.2337, "step": 4587 }, { "epoch": 2.571748878923767, "grad_norm": 0.07473221010039457, "learning_rate": 1.2163980728540835e-05, "loss": 0.2287, "step": 4588 }, { "epoch": 2.572309417040359, "grad_norm": 0.0782509976939727, "learning_rate": 1.2132818372021759e-05, "loss": 0.2338, "step": 4589 }, { "epoch": 2.5728699551569507, "grad_norm": 0.08023409978733587, "learning_rate": 1.2101693405330906e-05, "loss": 0.2474, "step": 4590 }, { "epoch": 2.5734304932735426, "grad_norm": 0.0815156134427342, "learning_rate": 1.2070605841712813e-05, "loss": 0.2418, "step": 4591 }, { "epoch": 2.5739910313901344, "grad_norm": 0.08174500722411769, "learning_rate": 1.203955569439601e-05, "loss": 0.2518, "step": 4592 }, { "epoch": 2.5745515695067267, "grad_norm": 0.08030947691507194, "learning_rate": 1.2008542976593206e-05, "loss": 0.2388, "step": 4593 }, { "epoch": 2.5751121076233185, "grad_norm": 0.08130695240203926, "learning_rate": 1.1977567701501103e-05, "loss": 0.2355, "step": 4594 }, { "epoch": 2.5756726457399104, "grad_norm": 0.0790884021363608, "learning_rate": 1.1946629882300553e-05, "loss": 0.2389, "step": 4595 }, { "epoch": 2.5762331838565022, "grad_norm": 0.07893046150354383, "learning_rate": 1.1915729532156372e-05, "loss": 0.2414, "step": 4596 }, { "epoch": 2.576793721973094, "grad_norm": 0.07761726989109233, "learning_rate": 1.188486666421753e-05, "loss": 0.2386, "step": 4597 }, { "epoch": 2.577354260089686, "grad_norm": 0.0797890917727982, "learning_rate": 1.1854041291616946e-05, "loss": 0.2404, "step": 4598 }, { "epoch": 2.577914798206278, "grad_norm": 0.0827881777224284, "learning_rate": 1.1823253427471681e-05, "loss": 0.2495, "step": 4599 }, { "epoch": 2.57847533632287, "grad_norm": 0.08082628941808752, "learning_rate": 1.1792503084882789e-05, "loss": 0.2428, "step": 4600 }, { "epoch": 2.579035874439462, "grad_norm": 0.08058979589937264, "learning_rate": 1.1761790276935336e-05, "loss": 0.2373, "step": 4601 }, { "epoch": 2.5795964125560538, "grad_norm": 0.07695649037026123, "learning_rate": 1.173111501669848e-05, "loss": 0.2341, "step": 4602 }, { "epoch": 2.5801569506726456, "grad_norm": 0.08087859212820767, "learning_rate": 1.1700477317225334e-05, "loss": 0.2385, "step": 4603 }, { "epoch": 2.5807174887892375, "grad_norm": 0.08287720887802386, "learning_rate": 1.1669877191553035e-05, "loss": 0.2263, "step": 4604 }, { "epoch": 2.5812780269058297, "grad_norm": 0.07816250950995993, "learning_rate": 1.1639314652702793e-05, "loss": 0.2398, "step": 4605 }, { "epoch": 2.5818385650224216, "grad_norm": 0.08091750663182318, "learning_rate": 1.1608789713679757e-05, "loss": 0.2629, "step": 4606 }, { "epoch": 2.5823991031390134, "grad_norm": 0.07660575513511397, "learning_rate": 1.1578302387473105e-05, "loss": 0.233, "step": 4607 }, { "epoch": 2.5829596412556053, "grad_norm": 0.07788562205501486, "learning_rate": 1.1547852687056015e-05, "loss": 0.2337, "step": 4608 }, { "epoch": 2.583520179372197, "grad_norm": 0.08016084118623913, "learning_rate": 1.1517440625385667e-05, "loss": 0.2384, "step": 4609 }, { "epoch": 2.5840807174887894, "grad_norm": 0.0783022785313455, "learning_rate": 1.1487066215403186e-05, "loss": 0.2407, "step": 4610 }, { "epoch": 2.5846412556053813, "grad_norm": 0.08113824685483437, "learning_rate": 1.1456729470033667e-05, "loss": 0.255, "step": 4611 }, { "epoch": 2.585201793721973, "grad_norm": 0.08211068433770122, "learning_rate": 1.142643040218624e-05, "loss": 0.241, "step": 4612 }, { "epoch": 2.585762331838565, "grad_norm": 0.07804110098165112, "learning_rate": 1.1396169024753933e-05, "loss": 0.2357, "step": 4613 }, { "epoch": 2.586322869955157, "grad_norm": 0.07843205512424804, "learning_rate": 1.1365945350613793e-05, "loss": 0.2296, "step": 4614 }, { "epoch": 2.586883408071749, "grad_norm": 0.07740813803202314, "learning_rate": 1.1335759392626798e-05, "loss": 0.2467, "step": 4615 }, { "epoch": 2.587443946188341, "grad_norm": 0.07890370072179591, "learning_rate": 1.1305611163637886e-05, "loss": 0.2442, "step": 4616 }, { "epoch": 2.588004484304933, "grad_norm": 0.08049726094139927, "learning_rate": 1.1275500676475925e-05, "loss": 0.2461, "step": 4617 }, { "epoch": 2.5885650224215246, "grad_norm": 0.07677261217838514, "learning_rate": 1.12454279439537e-05, "loss": 0.2388, "step": 4618 }, { "epoch": 2.5891255605381165, "grad_norm": 0.07969531796060425, "learning_rate": 1.121539297886801e-05, "loss": 0.2294, "step": 4619 }, { "epoch": 2.589686098654709, "grad_norm": 0.07793063555972102, "learning_rate": 1.1185395793999497e-05, "loss": 0.2382, "step": 4620 }, { "epoch": 2.5902466367713, "grad_norm": 0.08041496562426886, "learning_rate": 1.1155436402112785e-05, "loss": 0.245, "step": 4621 }, { "epoch": 2.5908071748878925, "grad_norm": 0.07736623150867082, "learning_rate": 1.1125514815956361e-05, "loss": 0.2309, "step": 4622 }, { "epoch": 2.5913677130044843, "grad_norm": 0.08058079119287281, "learning_rate": 1.109563104826269e-05, "loss": 0.2416, "step": 4623 }, { "epoch": 2.591928251121076, "grad_norm": 0.07671705061752368, "learning_rate": 1.1065785111748117e-05, "loss": 0.2445, "step": 4624 }, { "epoch": 2.592488789237668, "grad_norm": 0.08091256841818938, "learning_rate": 1.1035977019112852e-05, "loss": 0.2375, "step": 4625 }, { "epoch": 2.59304932735426, "grad_norm": 0.07834204684578358, "learning_rate": 1.1006206783041063e-05, "loss": 0.2313, "step": 4626 }, { "epoch": 2.593609865470852, "grad_norm": 0.0812937113539261, "learning_rate": 1.0976474416200755e-05, "loss": 0.2392, "step": 4627 }, { "epoch": 2.594170403587444, "grad_norm": 0.0782731877233754, "learning_rate": 1.0946779931243866e-05, "loss": 0.2321, "step": 4628 }, { "epoch": 2.594730941704036, "grad_norm": 0.07883648296355882, "learning_rate": 1.0917123340806168e-05, "loss": 0.2396, "step": 4629 }, { "epoch": 2.5952914798206277, "grad_norm": 0.078362956128506, "learning_rate": 1.0887504657507353e-05, "loss": 0.2233, "step": 4630 }, { "epoch": 2.5958520179372195, "grad_norm": 0.08003834880521533, "learning_rate": 1.0857923893950928e-05, "loss": 0.247, "step": 4631 }, { "epoch": 2.596412556053812, "grad_norm": 0.08025058062989013, "learning_rate": 1.0828381062724324e-05, "loss": 0.2494, "step": 4632 }, { "epoch": 2.5969730941704037, "grad_norm": 0.07880394678428947, "learning_rate": 1.079887617639881e-05, "loss": 0.2317, "step": 4633 }, { "epoch": 2.5975336322869955, "grad_norm": 0.08150930138545963, "learning_rate": 1.0769409247529472e-05, "loss": 0.2536, "step": 4634 }, { "epoch": 2.5980941704035874, "grad_norm": 0.07985651384036674, "learning_rate": 1.0739980288655316e-05, "loss": 0.2349, "step": 4635 }, { "epoch": 2.598654708520179, "grad_norm": 0.08150527738856496, "learning_rate": 1.0710589312299091e-05, "loss": 0.2333, "step": 4636 }, { "epoch": 2.5992152466367715, "grad_norm": 0.08416068011414829, "learning_rate": 1.0681236330967503e-05, "loss": 0.2492, "step": 4637 }, { "epoch": 2.5997757847533634, "grad_norm": 0.07921301034048962, "learning_rate": 1.0651921357150996e-05, "loss": 0.2423, "step": 4638 }, { "epoch": 2.600336322869955, "grad_norm": 0.08012881168134223, "learning_rate": 1.0622644403323844e-05, "loss": 0.2299, "step": 4639 }, { "epoch": 2.600896860986547, "grad_norm": 0.07948077463770052, "learning_rate": 1.0593405481944208e-05, "loss": 0.2475, "step": 4640 }, { "epoch": 2.601457399103139, "grad_norm": 0.08134179424726441, "learning_rate": 1.0564204605454032e-05, "loss": 0.234, "step": 4641 }, { "epoch": 2.602017937219731, "grad_norm": 0.08318924334977452, "learning_rate": 1.0535041786279066e-05, "loss": 0.2461, "step": 4642 }, { "epoch": 2.602578475336323, "grad_norm": 0.08136989551036139, "learning_rate": 1.050591703682886e-05, "loss": 0.2488, "step": 4643 }, { "epoch": 2.603139013452915, "grad_norm": 0.08134620876824172, "learning_rate": 1.0476830369496759e-05, "loss": 0.2448, "step": 4644 }, { "epoch": 2.6036995515695067, "grad_norm": 0.07831996360486439, "learning_rate": 1.0447781796659938e-05, "loss": 0.2336, "step": 4645 }, { "epoch": 2.6042600896860986, "grad_norm": 0.07755786350418256, "learning_rate": 1.0418771330679311e-05, "loss": 0.2342, "step": 4646 }, { "epoch": 2.604820627802691, "grad_norm": 0.07775354144623337, "learning_rate": 1.0389798983899624e-05, "loss": 0.2188, "step": 4647 }, { "epoch": 2.6053811659192823, "grad_norm": 0.07888127674381684, "learning_rate": 1.0360864768649381e-05, "loss": 0.2418, "step": 4648 }, { "epoch": 2.6059417040358746, "grad_norm": 0.08047717524674732, "learning_rate": 1.0331968697240879e-05, "loss": 0.254, "step": 4649 }, { "epoch": 2.6065022421524664, "grad_norm": 0.07762362263219512, "learning_rate": 1.030311078197015e-05, "loss": 0.2384, "step": 4650 }, { "epoch": 2.6070627802690582, "grad_norm": 0.07799042816569264, "learning_rate": 1.027429103511698e-05, "loss": 0.2302, "step": 4651 }, { "epoch": 2.60762331838565, "grad_norm": 0.08085930304246164, "learning_rate": 1.0245509468944992e-05, "loss": 0.2446, "step": 4652 }, { "epoch": 2.608183856502242, "grad_norm": 0.07988787874092403, "learning_rate": 1.0216766095701457e-05, "loss": 0.2502, "step": 4653 }, { "epoch": 2.6087443946188342, "grad_norm": 0.0809710576534792, "learning_rate": 1.0188060927617494e-05, "loss": 0.2501, "step": 4654 }, { "epoch": 2.609304932735426, "grad_norm": 0.0782794833603739, "learning_rate": 1.0159393976907871e-05, "loss": 0.238, "step": 4655 }, { "epoch": 2.609865470852018, "grad_norm": 0.07852642768667292, "learning_rate": 1.0130765255771169e-05, "loss": 0.2489, "step": 4656 }, { "epoch": 2.6104260089686098, "grad_norm": 0.07980893402914446, "learning_rate": 1.0102174776389683e-05, "loss": 0.2392, "step": 4657 }, { "epoch": 2.6109865470852016, "grad_norm": 0.07734153573236843, "learning_rate": 1.0073622550929395e-05, "loss": 0.2383, "step": 4658 }, { "epoch": 2.611547085201794, "grad_norm": 0.07860451703068655, "learning_rate": 1.0045108591540075e-05, "loss": 0.2289, "step": 4659 }, { "epoch": 2.6121076233183858, "grad_norm": 0.0795175394237589, "learning_rate": 1.0016632910355117e-05, "loss": 0.2327, "step": 4660 }, { "epoch": 2.6126681614349776, "grad_norm": 0.08074684826967744, "learning_rate": 9.988195519491739e-06, "loss": 0.2309, "step": 4661 }, { "epoch": 2.6132286995515694, "grad_norm": 0.07752388356246212, "learning_rate": 9.959796431050772e-06, "loss": 0.2322, "step": 4662 }, { "epoch": 2.6137892376681613, "grad_norm": 0.07847828544892076, "learning_rate": 9.931435657116817e-06, "loss": 0.2352, "step": 4663 }, { "epoch": 2.6143497757847536, "grad_norm": 0.07897018834734854, "learning_rate": 9.903113209758096e-06, "loss": 0.2309, "step": 4664 }, { "epoch": 2.6149103139013454, "grad_norm": 0.08033580219940079, "learning_rate": 9.874829101026584e-06, "loss": 0.238, "step": 4665 }, { "epoch": 2.6154708520179373, "grad_norm": 0.0756377449056898, "learning_rate": 9.84658334295796e-06, "loss": 0.2369, "step": 4666 }, { "epoch": 2.616031390134529, "grad_norm": 0.08068419590807634, "learning_rate": 9.81837594757149e-06, "loss": 0.2356, "step": 4667 }, { "epoch": 2.616591928251121, "grad_norm": 0.07996958970321559, "learning_rate": 9.790206926870215e-06, "loss": 0.2352, "step": 4668 }, { "epoch": 2.6171524663677133, "grad_norm": 0.07998580082674038, "learning_rate": 9.762076292840783e-06, "loss": 0.2426, "step": 4669 }, { "epoch": 2.6177130044843047, "grad_norm": 0.08341483581380287, "learning_rate": 9.733984057453538e-06, "loss": 0.2456, "step": 4670 }, { "epoch": 2.618273542600897, "grad_norm": 0.08034491533586992, "learning_rate": 9.705930232662453e-06, "loss": 0.244, "step": 4671 }, { "epoch": 2.618834080717489, "grad_norm": 0.07772304435861314, "learning_rate": 9.67791483040521e-06, "loss": 0.2432, "step": 4672 }, { "epoch": 2.6193946188340806, "grad_norm": 0.08057563850747478, "learning_rate": 9.649937862603099e-06, "loss": 0.2347, "step": 4673 }, { "epoch": 2.6199551569506725, "grad_norm": 0.08122556936148323, "learning_rate": 9.621999341161047e-06, "loss": 0.2462, "step": 4674 }, { "epoch": 2.6205156950672643, "grad_norm": 0.07976444756404565, "learning_rate": 9.594099277967683e-06, "loss": 0.235, "step": 4675 }, { "epoch": 2.6210762331838566, "grad_norm": 0.08050677289630043, "learning_rate": 9.566237684895174e-06, "loss": 0.2367, "step": 4676 }, { "epoch": 2.6216367713004485, "grad_norm": 0.08007169774187989, "learning_rate": 9.538414573799414e-06, "loss": 0.2463, "step": 4677 }, { "epoch": 2.6221973094170403, "grad_norm": 0.07968327117755601, "learning_rate": 9.510629956519868e-06, "loss": 0.2369, "step": 4678 }, { "epoch": 2.622757847533632, "grad_norm": 0.08057650051681482, "learning_rate": 9.482883844879597e-06, "loss": 0.2416, "step": 4679 }, { "epoch": 2.623318385650224, "grad_norm": 0.07976406048750015, "learning_rate": 9.455176250685338e-06, "loss": 0.247, "step": 4680 }, { "epoch": 2.6238789237668163, "grad_norm": 0.08036405300601392, "learning_rate": 9.427507185727413e-06, "loss": 0.2436, "step": 4681 }, { "epoch": 2.624439461883408, "grad_norm": 0.07820639019388559, "learning_rate": 9.399876661779771e-06, "loss": 0.2465, "step": 4682 }, { "epoch": 2.625, "grad_norm": 0.08169916753855645, "learning_rate": 9.372284690599887e-06, "loss": 0.2526, "step": 4683 }, { "epoch": 2.625560538116592, "grad_norm": 0.0805764573820981, "learning_rate": 9.34473128392892e-06, "loss": 0.2362, "step": 4684 }, { "epoch": 2.6261210762331837, "grad_norm": 0.08014393480570253, "learning_rate": 9.317216453491562e-06, "loss": 0.2467, "step": 4685 }, { "epoch": 2.626681614349776, "grad_norm": 0.0820739945849279, "learning_rate": 9.28974021099609e-06, "loss": 0.2556, "step": 4686 }, { "epoch": 2.627242152466368, "grad_norm": 0.08136757509280972, "learning_rate": 9.262302568134418e-06, "loss": 0.2538, "step": 4687 }, { "epoch": 2.6278026905829597, "grad_norm": 0.07774133860201993, "learning_rate": 9.234903536581952e-06, "loss": 0.2273, "step": 4688 }, { "epoch": 2.6283632286995515, "grad_norm": 0.08084569565280489, "learning_rate": 9.207543127997731e-06, "loss": 0.2422, "step": 4689 }, { "epoch": 2.6289237668161434, "grad_norm": 0.07986444650344848, "learning_rate": 9.180221354024354e-06, "loss": 0.2358, "step": 4690 }, { "epoch": 2.6294843049327357, "grad_norm": 0.07962366862394384, "learning_rate": 9.152938226287932e-06, "loss": 0.2319, "step": 4691 }, { "epoch": 2.6300448430493275, "grad_norm": 0.08022289749822759, "learning_rate": 9.125693756398202e-06, "loss": 0.2356, "step": 4692 }, { "epoch": 2.6306053811659194, "grad_norm": 0.07875499080453484, "learning_rate": 9.098487955948364e-06, "loss": 0.2377, "step": 4693 }, { "epoch": 2.631165919282511, "grad_norm": 0.08077688843109554, "learning_rate": 9.071320836515262e-06, "loss": 0.2456, "step": 4694 }, { "epoch": 2.631726457399103, "grad_norm": 0.08026063523711201, "learning_rate": 9.04419240965918e-06, "loss": 0.2376, "step": 4695 }, { "epoch": 2.6322869955156953, "grad_norm": 0.07839831164392934, "learning_rate": 9.017102686924028e-06, "loss": 0.2375, "step": 4696 }, { "epoch": 2.6328475336322867, "grad_norm": 0.07939925362615312, "learning_rate": 8.990051679837175e-06, "loss": 0.2474, "step": 4697 }, { "epoch": 2.633408071748879, "grad_norm": 0.07863814740863895, "learning_rate": 8.963039399909556e-06, "loss": 0.2394, "step": 4698 }, { "epoch": 2.633968609865471, "grad_norm": 0.08038397945388753, "learning_rate": 8.936065858635633e-06, "loss": 0.2442, "step": 4699 }, { "epoch": 2.6345291479820627, "grad_norm": 0.08153688943104939, "learning_rate": 8.909131067493348e-06, "loss": 0.2429, "step": 4700 }, { "epoch": 2.6350896860986546, "grad_norm": 0.07933631782552886, "learning_rate": 8.882235037944186e-06, "loss": 0.2487, "step": 4701 }, { "epoch": 2.6356502242152464, "grad_norm": 0.08350820800736432, "learning_rate": 8.855377781433094e-06, "loss": 0.2455, "step": 4702 }, { "epoch": 2.6362107623318387, "grad_norm": 0.08225559708493778, "learning_rate": 8.828559309388596e-06, "loss": 0.2457, "step": 4703 }, { "epoch": 2.6367713004484306, "grad_norm": 0.07950349828915644, "learning_rate": 8.80177963322263e-06, "loss": 0.2397, "step": 4704 }, { "epoch": 2.6373318385650224, "grad_norm": 0.0833246707543202, "learning_rate": 8.775038764330679e-06, "loss": 0.2345, "step": 4705 }, { "epoch": 2.6378923766816142, "grad_norm": 0.08210156398956797, "learning_rate": 8.748336714091698e-06, "loss": 0.2456, "step": 4706 }, { "epoch": 2.638452914798206, "grad_norm": 0.07923756575214472, "learning_rate": 8.72167349386811e-06, "loss": 0.2396, "step": 4707 }, { "epoch": 2.6390134529147984, "grad_norm": 0.08042949943095787, "learning_rate": 8.695049115005837e-06, "loss": 0.2322, "step": 4708 }, { "epoch": 2.6395739910313902, "grad_norm": 0.08324991921517294, "learning_rate": 8.668463588834253e-06, "loss": 0.2446, "step": 4709 }, { "epoch": 2.640134529147982, "grad_norm": 0.07962396678104898, "learning_rate": 8.641916926666216e-06, "loss": 0.2445, "step": 4710 }, { "epoch": 2.640695067264574, "grad_norm": 0.08128990462058089, "learning_rate": 8.615409139798048e-06, "loss": 0.2504, "step": 4711 }, { "epoch": 2.6412556053811658, "grad_norm": 0.08049269636217003, "learning_rate": 8.58894023950948e-06, "loss": 0.2468, "step": 4712 }, { "epoch": 2.641816143497758, "grad_norm": 0.07924177779021946, "learning_rate": 8.562510237063758e-06, "loss": 0.242, "step": 4713 }, { "epoch": 2.64237668161435, "grad_norm": 0.08155667402234759, "learning_rate": 8.536119143707555e-06, "loss": 0.2481, "step": 4714 }, { "epoch": 2.6429372197309418, "grad_norm": 0.07820082385607566, "learning_rate": 8.509766970671007e-06, "loss": 0.2346, "step": 4715 }, { "epoch": 2.6434977578475336, "grad_norm": 0.0825180322805504, "learning_rate": 8.483453729167622e-06, "loss": 0.2517, "step": 4716 }, { "epoch": 2.6440582959641254, "grad_norm": 0.08108549758779822, "learning_rate": 8.457179430394424e-06, "loss": 0.2372, "step": 4717 }, { "epoch": 2.6446188340807177, "grad_norm": 0.07790912705506942, "learning_rate": 8.430944085531811e-06, "loss": 0.2436, "step": 4718 }, { "epoch": 2.645179372197309, "grad_norm": 0.07977766511688403, "learning_rate": 8.40474770574361e-06, "loss": 0.2528, "step": 4719 }, { "epoch": 2.6457399103139014, "grad_norm": 0.08123725272003492, "learning_rate": 8.378590302177102e-06, "loss": 0.2384, "step": 4720 }, { "epoch": 2.6463004484304933, "grad_norm": 0.08140475056257848, "learning_rate": 8.352471885962931e-06, "loss": 0.2347, "step": 4721 }, { "epoch": 2.646860986547085, "grad_norm": 0.07923734139983633, "learning_rate": 8.326392468215205e-06, "loss": 0.231, "step": 4722 }, { "epoch": 2.647421524663677, "grad_norm": 0.0833880681940613, "learning_rate": 8.300352060031391e-06, "loss": 0.2429, "step": 4723 }, { "epoch": 2.647982062780269, "grad_norm": 0.08357552737977247, "learning_rate": 8.274350672492415e-06, "loss": 0.2437, "step": 4724 }, { "epoch": 2.648542600896861, "grad_norm": 0.0805449550726762, "learning_rate": 8.248388316662525e-06, "loss": 0.2585, "step": 4725 }, { "epoch": 2.649103139013453, "grad_norm": 0.07891704758599948, "learning_rate": 8.222465003589398e-06, "loss": 0.2374, "step": 4726 }, { "epoch": 2.649663677130045, "grad_norm": 0.08106681371740672, "learning_rate": 8.196580744304116e-06, "loss": 0.2322, "step": 4727 }, { "epoch": 2.6502242152466366, "grad_norm": 0.08009115068030585, "learning_rate": 8.170735549821085e-06, "loss": 0.232, "step": 4728 }, { "epoch": 2.6507847533632285, "grad_norm": 0.07874987050584663, "learning_rate": 8.14492943113817e-06, "loss": 0.2399, "step": 4729 }, { "epoch": 2.651345291479821, "grad_norm": 0.07662010629486314, "learning_rate": 8.119162399236513e-06, "loss": 0.2422, "step": 4730 }, { "epoch": 2.6519058295964126, "grad_norm": 0.08352877307067955, "learning_rate": 8.093434465080706e-06, "loss": 0.2427, "step": 4731 }, { "epoch": 2.6524663677130045, "grad_norm": 0.08123913507433109, "learning_rate": 8.067745639618684e-06, "loss": 0.2453, "step": 4732 }, { "epoch": 2.6530269058295963, "grad_norm": 0.08063630580939841, "learning_rate": 8.04209593378168e-06, "loss": 0.2476, "step": 4733 }, { "epoch": 2.653587443946188, "grad_norm": 0.07806425792440652, "learning_rate": 8.016485358484383e-06, "loss": 0.2288, "step": 4734 }, { "epoch": 2.6541479820627805, "grad_norm": 0.07707749516236907, "learning_rate": 7.990913924624722e-06, "loss": 0.2381, "step": 4735 }, { "epoch": 2.6547085201793723, "grad_norm": 0.07900612663841797, "learning_rate": 7.96538164308407e-06, "loss": 0.2319, "step": 4736 }, { "epoch": 2.655269058295964, "grad_norm": 0.07954983331838897, "learning_rate": 7.939888524727047e-06, "loss": 0.2349, "step": 4737 }, { "epoch": 2.655829596412556, "grad_norm": 0.08021331901320931, "learning_rate": 7.914434580401686e-06, "loss": 0.2399, "step": 4738 }, { "epoch": 2.656390134529148, "grad_norm": 0.07905180362069561, "learning_rate": 7.889019820939325e-06, "loss": 0.2332, "step": 4739 }, { "epoch": 2.65695067264574, "grad_norm": 0.08130104002495893, "learning_rate": 7.8636442571546e-06, "loss": 0.2483, "step": 4740 }, { "epoch": 2.657511210762332, "grad_norm": 0.07953731009728611, "learning_rate": 7.838307899845509e-06, "loss": 0.238, "step": 4741 }, { "epoch": 2.658071748878924, "grad_norm": 0.08075923076714953, "learning_rate": 7.813010759793326e-06, "loss": 0.2399, "step": 4742 }, { "epoch": 2.6586322869955157, "grad_norm": 0.07912799503613634, "learning_rate": 7.787752847762685e-06, "loss": 0.244, "step": 4743 }, { "epoch": 2.6591928251121075, "grad_norm": 0.08262334060021577, "learning_rate": 7.76253417450149e-06, "loss": 0.2528, "step": 4744 }, { "epoch": 2.6597533632287, "grad_norm": 0.08379137046638221, "learning_rate": 7.737354750740933e-06, "loss": 0.2445, "step": 4745 }, { "epoch": 2.660313901345291, "grad_norm": 0.07959878967597425, "learning_rate": 7.712214587195554e-06, "loss": 0.249, "step": 4746 }, { "epoch": 2.6608744394618835, "grad_norm": 0.07851889305169113, "learning_rate": 7.687113694563153e-06, "loss": 0.2325, "step": 4747 }, { "epoch": 2.6614349775784754, "grad_norm": 0.07951174972995802, "learning_rate": 7.662052083524863e-06, "loss": 0.2445, "step": 4748 }, { "epoch": 2.661995515695067, "grad_norm": 0.07800438368537502, "learning_rate": 7.637029764745019e-06, "loss": 0.2463, "step": 4749 }, { "epoch": 2.662556053811659, "grad_norm": 0.07793478332510123, "learning_rate": 7.612046748871327e-06, "loss": 0.2143, "step": 4750 }, { "epoch": 2.663116591928251, "grad_norm": 0.07937211104180285, "learning_rate": 7.587103046534705e-06, "loss": 0.2376, "step": 4751 }, { "epoch": 2.663677130044843, "grad_norm": 0.08041006055644226, "learning_rate": 7.562198668349352e-06, "loss": 0.2336, "step": 4752 }, { "epoch": 2.664237668161435, "grad_norm": 0.07947393992836635, "learning_rate": 7.537333624912768e-06, "loss": 0.2347, "step": 4753 }, { "epoch": 2.664798206278027, "grad_norm": 0.0809116546041903, "learning_rate": 7.512507926805668e-06, "loss": 0.2624, "step": 4754 }, { "epoch": 2.6653587443946187, "grad_norm": 0.0796819941226107, "learning_rate": 7.4877215845920555e-06, "loss": 0.2412, "step": 4755 }, { "epoch": 2.6659192825112106, "grad_norm": 0.07864802644574229, "learning_rate": 7.462974608819196e-06, "loss": 0.2371, "step": 4756 }, { "epoch": 2.666479820627803, "grad_norm": 0.07734365155763259, "learning_rate": 7.438267010017585e-06, "loss": 0.2375, "step": 4757 }, { "epoch": 2.6670403587443947, "grad_norm": 0.08022431608682906, "learning_rate": 7.41359879870096e-06, "loss": 0.2364, "step": 4758 }, { "epoch": 2.6676008968609866, "grad_norm": 0.07997424733047849, "learning_rate": 7.38896998536629e-06, "loss": 0.2376, "step": 4759 }, { "epoch": 2.6681614349775784, "grad_norm": 0.08197462441510985, "learning_rate": 7.364380580493813e-06, "loss": 0.2461, "step": 4760 }, { "epoch": 2.6687219730941703, "grad_norm": 0.07633156868865107, "learning_rate": 7.339830594546937e-06, "loss": 0.2323, "step": 4761 }, { "epoch": 2.6692825112107625, "grad_norm": 0.07682717870384738, "learning_rate": 7.315320037972395e-06, "loss": 0.2386, "step": 4762 }, { "epoch": 2.6698430493273544, "grad_norm": 0.07828778526955535, "learning_rate": 7.290848921200022e-06, "loss": 0.2313, "step": 4763 }, { "epoch": 2.6704035874439462, "grad_norm": 0.08058231110695564, "learning_rate": 7.2664172546429655e-06, "loss": 0.2478, "step": 4764 }, { "epoch": 2.670964125560538, "grad_norm": 0.07953252462034587, "learning_rate": 7.242025048697565e-06, "loss": 0.2338, "step": 4765 }, { "epoch": 2.67152466367713, "grad_norm": 0.0782901538887304, "learning_rate": 7.217672313743306e-06, "loss": 0.2451, "step": 4766 }, { "epoch": 2.672085201793722, "grad_norm": 0.08036684426565673, "learning_rate": 7.193359060142979e-06, "loss": 0.2339, "step": 4767 }, { "epoch": 2.672645739910314, "grad_norm": 0.07749707686250977, "learning_rate": 7.169085298242473e-06, "loss": 0.2552, "step": 4768 }, { "epoch": 2.673206278026906, "grad_norm": 0.08274416518853744, "learning_rate": 7.1448510383709696e-06, "loss": 0.2565, "step": 4769 }, { "epoch": 2.6737668161434978, "grad_norm": 0.07804543456423374, "learning_rate": 7.120656290840744e-06, "loss": 0.2453, "step": 4770 }, { "epoch": 2.6743273542600896, "grad_norm": 0.08115831948075995, "learning_rate": 7.0965010659473256e-06, "loss": 0.2312, "step": 4771 }, { "epoch": 2.6748878923766815, "grad_norm": 0.08128287586424483, "learning_rate": 7.0723853739694364e-06, "loss": 0.2452, "step": 4772 }, { "epoch": 2.6754484304932733, "grad_norm": 0.08119408695590151, "learning_rate": 7.048309225168903e-06, "loss": 0.2358, "step": 4773 }, { "epoch": 2.6760089686098656, "grad_norm": 0.07941834280290176, "learning_rate": 7.024272629790795e-06, "loss": 0.2387, "step": 4774 }, { "epoch": 2.6765695067264574, "grad_norm": 0.08015615594526529, "learning_rate": 7.000275598063299e-06, "loss": 0.2363, "step": 4775 }, { "epoch": 2.6771300448430493, "grad_norm": 0.0778155107576842, "learning_rate": 6.976318140197835e-06, "loss": 0.2285, "step": 4776 }, { "epoch": 2.677690582959641, "grad_norm": 0.08062179382102118, "learning_rate": 6.952400266388903e-06, "loss": 0.2529, "step": 4777 }, { "epoch": 2.678251121076233, "grad_norm": 0.08224849028132702, "learning_rate": 6.928521986814196e-06, "loss": 0.2471, "step": 4778 }, { "epoch": 2.6788116591928253, "grad_norm": 0.08003155990063, "learning_rate": 6.9046833116345635e-06, "loss": 0.2469, "step": 4779 }, { "epoch": 2.679372197309417, "grad_norm": 0.08078888522682524, "learning_rate": 6.8808842509940015e-06, "loss": 0.2409, "step": 4780 }, { "epoch": 2.679932735426009, "grad_norm": 0.08159683352788072, "learning_rate": 6.857124815019666e-06, "loss": 0.252, "step": 4781 }, { "epoch": 2.680493273542601, "grad_norm": 0.07765536446895811, "learning_rate": 6.833405013821792e-06, "loss": 0.2462, "step": 4782 }, { "epoch": 2.6810538116591927, "grad_norm": 0.08021593389332181, "learning_rate": 6.809724857493826e-06, "loss": 0.2399, "step": 4783 }, { "epoch": 2.681614349775785, "grad_norm": 0.08141240401544701, "learning_rate": 6.7860843561122765e-06, "loss": 0.2467, "step": 4784 }, { "epoch": 2.682174887892377, "grad_norm": 0.07766797150732227, "learning_rate": 6.762483519736806e-06, "loss": 0.2423, "step": 4785 }, { "epoch": 2.6827354260089686, "grad_norm": 0.0829476374142876, "learning_rate": 6.7389223584102265e-06, "loss": 0.249, "step": 4786 }, { "epoch": 2.6832959641255605, "grad_norm": 0.0790995203648318, "learning_rate": 6.715400882158396e-06, "loss": 0.2493, "step": 4787 }, { "epoch": 2.6838565022421523, "grad_norm": 0.08060900565937387, "learning_rate": 6.6919191009903734e-06, "loss": 0.2499, "step": 4788 }, { "epoch": 2.6844170403587446, "grad_norm": 0.07701822265961265, "learning_rate": 6.668477024898257e-06, "loss": 0.2381, "step": 4789 }, { "epoch": 2.6849775784753365, "grad_norm": 0.07947128695041439, "learning_rate": 6.645074663857298e-06, "loss": 0.2512, "step": 4790 }, { "epoch": 2.6855381165919283, "grad_norm": 0.07912370472664916, "learning_rate": 6.621712027825811e-06, "loss": 0.243, "step": 4791 }, { "epoch": 2.68609865470852, "grad_norm": 0.08015187760994366, "learning_rate": 6.598389126745208e-06, "loss": 0.2242, "step": 4792 }, { "epoch": 2.686659192825112, "grad_norm": 0.08021705459077672, "learning_rate": 6.5751059705400295e-06, "loss": 0.2475, "step": 4793 }, { "epoch": 2.6872197309417043, "grad_norm": 0.0793126903156517, "learning_rate": 6.55186256911785e-06, "loss": 0.246, "step": 4794 }, { "epoch": 2.6877802690582957, "grad_norm": 0.08237946052523472, "learning_rate": 6.5286589323693914e-06, "loss": 0.2473, "step": 4795 }, { "epoch": 2.688340807174888, "grad_norm": 0.08298713615830196, "learning_rate": 6.505495070168388e-06, "loss": 0.2422, "step": 4796 }, { "epoch": 2.68890134529148, "grad_norm": 0.08190739726043625, "learning_rate": 6.482370992371689e-06, "loss": 0.2451, "step": 4797 }, { "epoch": 2.6894618834080717, "grad_norm": 0.08095815081778734, "learning_rate": 6.459286708819234e-06, "loss": 0.2406, "step": 4798 }, { "epoch": 2.6900224215246635, "grad_norm": 0.08024020442517155, "learning_rate": 6.4362422293339665e-06, "loss": 0.2324, "step": 4799 }, { "epoch": 2.6905829596412554, "grad_norm": 0.08199655127025579, "learning_rate": 6.413237563721941e-06, "loss": 0.24, "step": 4800 }, { "epoch": 2.6911434977578477, "grad_norm": 0.07907318573785835, "learning_rate": 6.39027272177225e-06, "loss": 0.243, "step": 4801 }, { "epoch": 2.6917040358744395, "grad_norm": 0.0815893672235332, "learning_rate": 6.367347713257066e-06, "loss": 0.2477, "step": 4802 }, { "epoch": 2.6922645739910314, "grad_norm": 0.0807876069026921, "learning_rate": 6.344462547931551e-06, "loss": 0.241, "step": 4803 }, { "epoch": 2.692825112107623, "grad_norm": 0.08201034304267234, "learning_rate": 6.321617235533983e-06, "loss": 0.2547, "step": 4804 }, { "epoch": 2.693385650224215, "grad_norm": 0.0807326355114875, "learning_rate": 6.298811785785663e-06, "loss": 0.2432, "step": 4805 }, { "epoch": 2.6939461883408073, "grad_norm": 0.08143303028404608, "learning_rate": 6.276046208390873e-06, "loss": 0.2505, "step": 4806 }, { "epoch": 2.694506726457399, "grad_norm": 0.07886102191315485, "learning_rate": 6.253320513037031e-06, "loss": 0.226, "step": 4807 }, { "epoch": 2.695067264573991, "grad_norm": 0.07878016928054814, "learning_rate": 6.230634709394478e-06, "loss": 0.2366, "step": 4808 }, { "epoch": 2.695627802690583, "grad_norm": 0.07975266815564028, "learning_rate": 6.207988807116649e-06, "loss": 0.2365, "step": 4809 }, { "epoch": 2.6961883408071747, "grad_norm": 0.07899986348448437, "learning_rate": 6.185382815839969e-06, "loss": 0.2476, "step": 4810 }, { "epoch": 2.696748878923767, "grad_norm": 0.08157078034556416, "learning_rate": 6.162816745183919e-06, "loss": 0.2398, "step": 4811 }, { "epoch": 2.697309417040359, "grad_norm": 0.07931318136489432, "learning_rate": 6.14029060475092e-06, "loss": 0.243, "step": 4812 }, { "epoch": 2.6978699551569507, "grad_norm": 0.07772738146435056, "learning_rate": 6.117804404126459e-06, "loss": 0.2353, "step": 4813 }, { "epoch": 2.6984304932735426, "grad_norm": 0.07937697193267051, "learning_rate": 6.095358152879049e-06, "loss": 0.2406, "step": 4814 }, { "epoch": 2.6989910313901344, "grad_norm": 0.08013247564199258, "learning_rate": 6.072951860560128e-06, "loss": 0.2418, "step": 4815 }, { "epoch": 2.6995515695067267, "grad_norm": 0.08280738561451664, "learning_rate": 6.0505855367041895e-06, "loss": 0.2458, "step": 4816 }, { "epoch": 2.7001121076233185, "grad_norm": 0.07795701064327104, "learning_rate": 6.0282591908287e-06, "loss": 0.2318, "step": 4817 }, { "epoch": 2.7006726457399104, "grad_norm": 0.07787441036672033, "learning_rate": 6.005972832434093e-06, "loss": 0.2318, "step": 4818 }, { "epoch": 2.7012331838565022, "grad_norm": 0.08228921367300045, "learning_rate": 5.983726471003836e-06, "loss": 0.2431, "step": 4819 }, { "epoch": 2.701793721973094, "grad_norm": 0.0815026313218986, "learning_rate": 5.961520116004327e-06, "loss": 0.2432, "step": 4820 }, { "epoch": 2.702354260089686, "grad_norm": 0.08253122671677174, "learning_rate": 5.93935377688497e-06, "loss": 0.2347, "step": 4821 }, { "epoch": 2.702914798206278, "grad_norm": 0.08356273204249452, "learning_rate": 5.917227463078146e-06, "loss": 0.2467, "step": 4822 }, { "epoch": 2.70347533632287, "grad_norm": 0.0797360105512306, "learning_rate": 5.895141183999187e-06, "loss": 0.239, "step": 4823 }, { "epoch": 2.704035874439462, "grad_norm": 0.07962457970507808, "learning_rate": 5.873094949046387e-06, "loss": 0.2305, "step": 4824 }, { "epoch": 2.7045964125560538, "grad_norm": 0.07778230843941036, "learning_rate": 5.851088767600998e-06, "loss": 0.2299, "step": 4825 }, { "epoch": 2.7051569506726456, "grad_norm": 0.0817028845104827, "learning_rate": 5.8291226490272526e-06, "loss": 0.2458, "step": 4826 }, { "epoch": 2.7057174887892375, "grad_norm": 0.08313270905328339, "learning_rate": 5.807196602672305e-06, "loss": 0.2521, "step": 4827 }, { "epoch": 2.7062780269058297, "grad_norm": 0.07929295703556682, "learning_rate": 5.785310637866304e-06, "loss": 0.2248, "step": 4828 }, { "epoch": 2.7068385650224216, "grad_norm": 0.08038151631744177, "learning_rate": 5.763464763922255e-06, "loss": 0.2377, "step": 4829 }, { "epoch": 2.7073991031390134, "grad_norm": 0.08086742912267261, "learning_rate": 5.7416589901362115e-06, "loss": 0.2326, "step": 4830 }, { "epoch": 2.7079596412556053, "grad_norm": 0.0812195687445791, "learning_rate": 5.7198933257870955e-06, "loss": 0.2537, "step": 4831 }, { "epoch": 2.708520179372197, "grad_norm": 0.08277110352553523, "learning_rate": 5.698167780136765e-06, "loss": 0.239, "step": 4832 }, { "epoch": 2.7090807174887894, "grad_norm": 0.08032941302057249, "learning_rate": 5.676482362430047e-06, "loss": 0.2381, "step": 4833 }, { "epoch": 2.7096412556053813, "grad_norm": 0.08059047081602719, "learning_rate": 5.654837081894626e-06, "loss": 0.2473, "step": 4834 }, { "epoch": 2.710201793721973, "grad_norm": 0.08055343672565488, "learning_rate": 5.63323194774118e-06, "loss": 0.2437, "step": 4835 }, { "epoch": 2.710762331838565, "grad_norm": 0.08329435670479612, "learning_rate": 5.611666969163243e-06, "loss": 0.2446, "step": 4836 }, { "epoch": 2.711322869955157, "grad_norm": 0.0778452362302262, "learning_rate": 5.590142155337308e-06, "loss": 0.2412, "step": 4837 }, { "epoch": 2.711883408071749, "grad_norm": 0.08349358126462086, "learning_rate": 5.568657515422759e-06, "loss": 0.2406, "step": 4838 }, { "epoch": 2.712443946188341, "grad_norm": 0.08061624483910564, "learning_rate": 5.547213058561862e-06, "loss": 0.2474, "step": 4839 }, { "epoch": 2.713004484304933, "grad_norm": 0.08583768508047582, "learning_rate": 5.525808793879838e-06, "loss": 0.2499, "step": 4840 }, { "epoch": 2.7135650224215246, "grad_norm": 0.08190295770275775, "learning_rate": 5.504444730484726e-06, "loss": 0.2297, "step": 4841 }, { "epoch": 2.7141255605381165, "grad_norm": 0.08046549479817044, "learning_rate": 5.4831208774675515e-06, "loss": 0.2517, "step": 4842 }, { "epoch": 2.714686098654709, "grad_norm": 0.08275112958692439, "learning_rate": 5.461837243902146e-06, "loss": 0.2411, "step": 4843 }, { "epoch": 2.7152466367713, "grad_norm": 0.08320547371796622, "learning_rate": 5.440593838845287e-06, "loss": 0.2412, "step": 4844 }, { "epoch": 2.7158071748878925, "grad_norm": 0.08583192312348593, "learning_rate": 5.4193906713366e-06, "loss": 0.2509, "step": 4845 }, { "epoch": 2.7163677130044843, "grad_norm": 0.08095407175439231, "learning_rate": 5.398227750398588e-06, "loss": 0.2436, "step": 4846 }, { "epoch": 2.716928251121076, "grad_norm": 0.0817826587660967, "learning_rate": 5.377105085036671e-06, "loss": 0.244, "step": 4847 }, { "epoch": 2.717488789237668, "grad_norm": 0.07997392241960607, "learning_rate": 5.3560226842390596e-06, "loss": 0.2412, "step": 4848 }, { "epoch": 2.71804932735426, "grad_norm": 0.0806758305455095, "learning_rate": 5.33498055697692e-06, "loss": 0.241, "step": 4849 }, { "epoch": 2.718609865470852, "grad_norm": 0.08151772787644262, "learning_rate": 5.313978712204215e-06, "loss": 0.2521, "step": 4850 }, { "epoch": 2.719170403587444, "grad_norm": 0.08042888903419694, "learning_rate": 5.293017158857804e-06, "loss": 0.2378, "step": 4851 }, { "epoch": 2.719730941704036, "grad_norm": 0.0817197201429122, "learning_rate": 5.2720959058573775e-06, "loss": 0.2431, "step": 4852 }, { "epoch": 2.7202914798206277, "grad_norm": 0.07911262496270136, "learning_rate": 5.251214962105466e-06, "loss": 0.2446, "step": 4853 }, { "epoch": 2.7208520179372195, "grad_norm": 0.08148581568289714, "learning_rate": 5.230374336487498e-06, "loss": 0.2453, "step": 4854 }, { "epoch": 2.721412556053812, "grad_norm": 0.07972619160712666, "learning_rate": 5.209574037871701e-06, "loss": 0.2345, "step": 4855 }, { "epoch": 2.7219730941704037, "grad_norm": 0.08018928423055902, "learning_rate": 5.188814075109172e-06, "loss": 0.2544, "step": 4856 }, { "epoch": 2.7225336322869955, "grad_norm": 0.07909166543769797, "learning_rate": 5.168094457033801e-06, "loss": 0.2293, "step": 4857 }, { "epoch": 2.7230941704035874, "grad_norm": 0.08227752919006537, "learning_rate": 5.147415192462379e-06, "loss": 0.2425, "step": 4858 }, { "epoch": 2.723654708520179, "grad_norm": 0.08192267781634209, "learning_rate": 5.1267762901944575e-06, "loss": 0.2437, "step": 4859 }, { "epoch": 2.7242152466367715, "grad_norm": 0.0790172993354068, "learning_rate": 5.106177759012421e-06, "loss": 0.2341, "step": 4860 }, { "epoch": 2.7247757847533634, "grad_norm": 0.07960463122203566, "learning_rate": 5.085619607681524e-06, "loss": 0.2444, "step": 4861 }, { "epoch": 2.725336322869955, "grad_norm": 0.07758129627669537, "learning_rate": 5.065101844949794e-06, "loss": 0.2439, "step": 4862 }, { "epoch": 2.725896860986547, "grad_norm": 0.08202261406444565, "learning_rate": 5.044624479548099e-06, "loss": 0.2402, "step": 4863 }, { "epoch": 2.726457399103139, "grad_norm": 0.07979434006296819, "learning_rate": 5.024187520190104e-06, "loss": 0.2336, "step": 4864 }, { "epoch": 2.727017937219731, "grad_norm": 0.0816635584131755, "learning_rate": 5.003790975572253e-06, "loss": 0.239, "step": 4865 }, { "epoch": 2.727578475336323, "grad_norm": 0.08024566579238525, "learning_rate": 4.983434854373858e-06, "loss": 0.2442, "step": 4866 }, { "epoch": 2.728139013452915, "grad_norm": 0.08106903881757009, "learning_rate": 4.9631191652569465e-06, "loss": 0.2543, "step": 4867 }, { "epoch": 2.7286995515695067, "grad_norm": 0.08347005456982191, "learning_rate": 4.942843916866435e-06, "loss": 0.2449, "step": 4868 }, { "epoch": 2.7292600896860986, "grad_norm": 0.08170344258225214, "learning_rate": 4.922609117829946e-06, "loss": 0.2391, "step": 4869 }, { "epoch": 2.729820627802691, "grad_norm": 0.0797622584057896, "learning_rate": 4.902414776757924e-06, "loss": 0.228, "step": 4870 }, { "epoch": 2.7303811659192823, "grad_norm": 0.08062118472366343, "learning_rate": 4.88226090224364e-06, "loss": 0.2357, "step": 4871 }, { "epoch": 2.7309417040358746, "grad_norm": 0.08068485482047623, "learning_rate": 4.862147502863057e-06, "loss": 0.247, "step": 4872 }, { "epoch": 2.7315022421524664, "grad_norm": 0.07981616688176582, "learning_rate": 4.842074587175005e-06, "loss": 0.2365, "step": 4873 }, { "epoch": 2.7320627802690582, "grad_norm": 0.07694542358170804, "learning_rate": 4.8220421637209965e-06, "loss": 0.2302, "step": 4874 }, { "epoch": 2.73262331838565, "grad_norm": 0.08008372601815365, "learning_rate": 4.802050241025413e-06, "loss": 0.2368, "step": 4875 }, { "epoch": 2.733183856502242, "grad_norm": 0.07925762672758445, "learning_rate": 4.7820988275953045e-06, "loss": 0.2423, "step": 4876 }, { "epoch": 2.7337443946188342, "grad_norm": 0.08165250402638714, "learning_rate": 4.762187931920581e-06, "loss": 0.2471, "step": 4877 }, { "epoch": 2.734304932735426, "grad_norm": 0.0783592756245194, "learning_rate": 4.742317562473797e-06, "loss": 0.2415, "step": 4878 }, { "epoch": 2.734865470852018, "grad_norm": 0.07911870695642753, "learning_rate": 4.722487727710368e-06, "loss": 0.2318, "step": 4879 }, { "epoch": 2.7354260089686098, "grad_norm": 0.08067448083031267, "learning_rate": 4.7026984360684205e-06, "loss": 0.2455, "step": 4880 }, { "epoch": 2.7359865470852016, "grad_norm": 0.07891274357847908, "learning_rate": 4.6829496959687855e-06, "loss": 0.2363, "step": 4881 }, { "epoch": 2.736547085201794, "grad_norm": 0.07844329612266418, "learning_rate": 4.663241515815131e-06, "loss": 0.2333, "step": 4882 }, { "epoch": 2.7371076233183858, "grad_norm": 0.07957756570103335, "learning_rate": 4.64357390399377e-06, "loss": 0.2385, "step": 4883 }, { "epoch": 2.7376681614349776, "grad_norm": 0.08250791774164136, "learning_rate": 4.623946868873819e-06, "loss": 0.2438, "step": 4884 }, { "epoch": 2.7382286995515694, "grad_norm": 0.078887470344168, "learning_rate": 4.604360418807108e-06, "loss": 0.2367, "step": 4885 }, { "epoch": 2.7387892376681613, "grad_norm": 0.0799538042776628, "learning_rate": 4.584814562128159e-06, "loss": 0.2355, "step": 4886 }, { "epoch": 2.7393497757847536, "grad_norm": 0.07890725023986828, "learning_rate": 4.565309307154286e-06, "loss": 0.2317, "step": 4887 }, { "epoch": 2.7399103139013454, "grad_norm": 0.08039816000780453, "learning_rate": 4.5458446621854945e-06, "loss": 0.2449, "step": 4888 }, { "epoch": 2.7404708520179373, "grad_norm": 0.08266772222645542, "learning_rate": 4.526420635504502e-06, "loss": 0.2388, "step": 4889 }, { "epoch": 2.741031390134529, "grad_norm": 0.07847399945650287, "learning_rate": 4.507037235376754e-06, "loss": 0.2403, "step": 4890 }, { "epoch": 2.741591928251121, "grad_norm": 0.0779826784899804, "learning_rate": 4.487694470050408e-06, "loss": 0.2375, "step": 4891 }, { "epoch": 2.7421524663677133, "grad_norm": 0.07834341959171157, "learning_rate": 4.468392347756312e-06, "loss": 0.2252, "step": 4892 }, { "epoch": 2.7427130044843047, "grad_norm": 0.081313239708545, "learning_rate": 4.44913087670803e-06, "loss": 0.2454, "step": 4893 }, { "epoch": 2.743273542600897, "grad_norm": 0.0813023161291648, "learning_rate": 4.42991006510185e-06, "loss": 0.2528, "step": 4894 }, { "epoch": 2.743834080717489, "grad_norm": 0.08061347706009954, "learning_rate": 4.41072992111673e-06, "loss": 0.2375, "step": 4895 }, { "epoch": 2.7443946188340806, "grad_norm": 0.08317858229039553, "learning_rate": 4.391590452914352e-06, "loss": 0.2491, "step": 4896 }, { "epoch": 2.7449551569506725, "grad_norm": 0.0829078123381164, "learning_rate": 4.372491668639034e-06, "loss": 0.237, "step": 4897 }, { "epoch": 2.7455156950672643, "grad_norm": 0.08108923857329119, "learning_rate": 4.3534335764178536e-06, "loss": 0.2399, "step": 4898 }, { "epoch": 2.7460762331838566, "grad_norm": 0.07967048316211159, "learning_rate": 4.334416184360512e-06, "loss": 0.2489, "step": 4899 }, { "epoch": 2.7466367713004485, "grad_norm": 0.07937350424281729, "learning_rate": 4.315439500559426e-06, "loss": 0.2461, "step": 4900 }, { "epoch": 2.7471973094170403, "grad_norm": 0.07761947036514329, "learning_rate": 4.29650353308968e-06, "loss": 0.2271, "step": 4901 }, { "epoch": 2.747757847533632, "grad_norm": 0.07857183843296289, "learning_rate": 4.277608290009027e-06, "loss": 0.2383, "step": 4902 }, { "epoch": 2.748318385650224, "grad_norm": 0.08140988844295936, "learning_rate": 4.258753779357904e-06, "loss": 0.2497, "step": 4903 }, { "epoch": 2.7488789237668163, "grad_norm": 0.07894403613544215, "learning_rate": 4.2399400091594154e-06, "loss": 0.2321, "step": 4904 }, { "epoch": 2.749439461883408, "grad_norm": 0.08116152376529585, "learning_rate": 4.221166987419289e-06, "loss": 0.2508, "step": 4905 }, { "epoch": 2.75, "grad_norm": 0.08105910638593729, "learning_rate": 4.202434722125992e-06, "loss": 0.2388, "step": 4906 }, { "epoch": 2.750560538116592, "grad_norm": 0.08004397586876048, "learning_rate": 4.183743221250569e-06, "loss": 0.2395, "step": 4907 }, { "epoch": 2.7511210762331837, "grad_norm": 0.08312500342586408, "learning_rate": 4.16509249274678e-06, "loss": 0.2434, "step": 4908 }, { "epoch": 2.751681614349776, "grad_norm": 0.08453968282838248, "learning_rate": 4.146482544550967e-06, "loss": 0.2543, "step": 4909 }, { "epoch": 2.752242152466368, "grad_norm": 0.08298475355558854, "learning_rate": 4.127913384582205e-06, "loss": 0.257, "step": 4910 }, { "epoch": 2.7528026905829597, "grad_norm": 0.0841434311601157, "learning_rate": 4.109385020742118e-06, "loss": 0.2459, "step": 4911 }, { "epoch": 2.7533632286995515, "grad_norm": 0.07996229214808133, "learning_rate": 4.090897460915055e-06, "loss": 0.2409, "step": 4912 }, { "epoch": 2.7539237668161434, "grad_norm": 0.08045599167074863, "learning_rate": 4.0724507129679676e-06, "loss": 0.2466, "step": 4913 }, { "epoch": 2.7544843049327357, "grad_norm": 0.07993007824493323, "learning_rate": 4.0540447847504105e-06, "loss": 0.2461, "step": 4914 }, { "epoch": 2.7550448430493275, "grad_norm": 0.08018779590258114, "learning_rate": 4.0356796840946286e-06, "loss": 0.245, "step": 4915 }, { "epoch": 2.7556053811659194, "grad_norm": 0.08176447835814545, "learning_rate": 4.017355418815427e-06, "loss": 0.2493, "step": 4916 }, { "epoch": 2.756165919282511, "grad_norm": 0.07955554783291073, "learning_rate": 3.999071996710313e-06, "loss": 0.2432, "step": 4917 }, { "epoch": 2.756726457399103, "grad_norm": 0.08050780902204767, "learning_rate": 3.980829425559329e-06, "loss": 0.2359, "step": 4918 }, { "epoch": 2.7572869955156953, "grad_norm": 0.0808025743924573, "learning_rate": 3.962627713125189e-06, "loss": 0.2469, "step": 4919 }, { "epoch": 2.7578475336322867, "grad_norm": 0.0800776363391883, "learning_rate": 3.944466867153218e-06, "loss": 0.246, "step": 4920 }, { "epoch": 2.758408071748879, "grad_norm": 0.08208451543057813, "learning_rate": 3.926346895371313e-06, "loss": 0.2507, "step": 4921 }, { "epoch": 2.758968609865471, "grad_norm": 0.08027660809899927, "learning_rate": 3.908267805490051e-06, "loss": 0.2283, "step": 4922 }, { "epoch": 2.7595291479820627, "grad_norm": 0.0808482546151297, "learning_rate": 3.890229605202522e-06, "loss": 0.2481, "step": 4923 }, { "epoch": 2.7600896860986546, "grad_norm": 0.07994689251204488, "learning_rate": 3.872232302184487e-06, "loss": 0.2373, "step": 4924 }, { "epoch": 2.7606502242152464, "grad_norm": 0.07699473357642464, "learning_rate": 3.8542759040942734e-06, "loss": 0.2351, "step": 4925 }, { "epoch": 2.7612107623318387, "grad_norm": 0.08243198715586861, "learning_rate": 3.836360418572793e-06, "loss": 0.2461, "step": 4926 }, { "epoch": 2.7617713004484306, "grad_norm": 0.07884934050552518, "learning_rate": 3.81848585324357e-06, "loss": 0.2292, "step": 4927 }, { "epoch": 2.7623318385650224, "grad_norm": 0.08077545761857474, "learning_rate": 3.8006522157127078e-06, "loss": 0.2308, "step": 4928 }, { "epoch": 2.7628923766816142, "grad_norm": 0.08021670897868287, "learning_rate": 3.782859513568915e-06, "loss": 0.2404, "step": 4929 }, { "epoch": 2.763452914798206, "grad_norm": 0.08416006061751108, "learning_rate": 3.7651077543834346e-06, "loss": 0.2592, "step": 4930 }, { "epoch": 2.7640134529147984, "grad_norm": 0.083454558547178, "learning_rate": 3.7473969457101356e-06, "loss": 0.2442, "step": 4931 }, { "epoch": 2.7645739910313902, "grad_norm": 0.08028477042441148, "learning_rate": 3.729727095085422e-06, "loss": 0.242, "step": 4932 }, { "epoch": 2.765134529147982, "grad_norm": 0.08061222581489345, "learning_rate": 3.712098210028281e-06, "loss": 0.2427, "step": 4933 }, { "epoch": 2.765695067264574, "grad_norm": 0.0784356871893266, "learning_rate": 3.694510298040288e-06, "loss": 0.2312, "step": 4934 }, { "epoch": 2.7662556053811658, "grad_norm": 0.07930054523012602, "learning_rate": 3.676963366605557e-06, "loss": 0.2401, "step": 4935 }, { "epoch": 2.766816143497758, "grad_norm": 0.08276631730613505, "learning_rate": 3.659457423190782e-06, "loss": 0.2507, "step": 4936 }, { "epoch": 2.76737668161435, "grad_norm": 0.07760277060054778, "learning_rate": 3.641992475245204e-06, "loss": 0.2373, "step": 4937 }, { "epoch": 2.7679372197309418, "grad_norm": 0.07999184808020934, "learning_rate": 3.6245685302006447e-06, "loss": 0.2373, "step": 4938 }, { "epoch": 2.7684977578475336, "grad_norm": 0.07937605545005519, "learning_rate": 3.6071855954714406e-06, "loss": 0.2305, "step": 4939 }, { "epoch": 2.7690582959641254, "grad_norm": 0.0776053555961307, "learning_rate": 3.5898436784544854e-06, "loss": 0.2403, "step": 4940 }, { "epoch": 2.7696188340807177, "grad_norm": 0.07961889367097014, "learning_rate": 3.572542786529243e-06, "loss": 0.2321, "step": 4941 }, { "epoch": 2.770179372197309, "grad_norm": 0.0793727827627126, "learning_rate": 3.5552829270576792e-06, "loss": 0.2332, "step": 4942 }, { "epoch": 2.7707399103139014, "grad_norm": 0.08138300071359784, "learning_rate": 3.5380641073843645e-06, "loss": 0.2343, "step": 4943 }, { "epoch": 2.7713004484304933, "grad_norm": 0.08105653688643481, "learning_rate": 3.5208863348363263e-06, "loss": 0.2344, "step": 4944 }, { "epoch": 2.771860986547085, "grad_norm": 0.08219759076109563, "learning_rate": 3.503749616723173e-06, "loss": 0.2441, "step": 4945 }, { "epoch": 2.772421524663677, "grad_norm": 0.07948917468362943, "learning_rate": 3.4866539603370605e-06, "loss": 0.2536, "step": 4946 }, { "epoch": 2.772982062780269, "grad_norm": 0.07940583597981922, "learning_rate": 3.4695993729526254e-06, "loss": 0.2375, "step": 4947 }, { "epoch": 2.773542600896861, "grad_norm": 0.07809825453838833, "learning_rate": 3.4525858618270625e-06, "loss": 0.2334, "step": 4948 }, { "epoch": 2.774103139013453, "grad_norm": 0.0790630400089733, "learning_rate": 3.4356134342000467e-06, "loss": 0.2334, "step": 4949 }, { "epoch": 2.774663677130045, "grad_norm": 0.07968962642906242, "learning_rate": 3.418682097293835e-06, "loss": 0.2412, "step": 4950 }, { "epoch": 2.7752242152466366, "grad_norm": 0.081772312236787, "learning_rate": 3.4017918583131414e-06, "loss": 0.2537, "step": 4951 }, { "epoch": 2.7757847533632285, "grad_norm": 0.07791353466848155, "learning_rate": 3.384942724445195e-06, "loss": 0.2387, "step": 4952 }, { "epoch": 2.776345291479821, "grad_norm": 0.07982942110717695, "learning_rate": 3.368134702859782e-06, "loss": 0.2351, "step": 4953 }, { "epoch": 2.7769058295964126, "grad_norm": 0.0794594563417086, "learning_rate": 3.3513678007091596e-06, "loss": 0.2436, "step": 4954 }, { "epoch": 2.7774663677130045, "grad_norm": 0.0809108170869888, "learning_rate": 3.3346420251280876e-06, "loss": 0.2414, "step": 4955 }, { "epoch": 2.7780269058295963, "grad_norm": 0.0799346895705784, "learning_rate": 3.317957383233816e-06, "loss": 0.2338, "step": 4956 }, { "epoch": 2.778587443946188, "grad_norm": 0.08209863494745351, "learning_rate": 3.3013138821261336e-06, "loss": 0.2361, "step": 4957 }, { "epoch": 2.7791479820627805, "grad_norm": 0.08173173423039823, "learning_rate": 3.284711528887274e-06, "loss": 0.2452, "step": 4958 }, { "epoch": 2.7797085201793723, "grad_norm": 0.07815383566511941, "learning_rate": 3.268150330581976e-06, "loss": 0.225, "step": 4959 }, { "epoch": 2.780269058295964, "grad_norm": 0.0768024051910523, "learning_rate": 3.2516302942574793e-06, "loss": 0.2328, "step": 4960 }, { "epoch": 2.780829596412556, "grad_norm": 0.08145432874610853, "learning_rate": 3.2351514269434945e-06, "loss": 0.2395, "step": 4961 }, { "epoch": 2.781390134529148, "grad_norm": 0.07791961234583099, "learning_rate": 3.2187137356522346e-06, "loss": 0.2363, "step": 4962 }, { "epoch": 2.78195067264574, "grad_norm": 0.08034275923951512, "learning_rate": 3.2023172273783486e-06, "loss": 0.2483, "step": 4963 }, { "epoch": 2.782511210762332, "grad_norm": 0.08040564210423004, "learning_rate": 3.1859619090990222e-06, "loss": 0.2408, "step": 4964 }, { "epoch": 2.783071748878924, "grad_norm": 0.08026860440316434, "learning_rate": 3.169647787773866e-06, "loss": 0.2346, "step": 4965 }, { "epoch": 2.7836322869955157, "grad_norm": 0.07995657738869455, "learning_rate": 3.1533748703449494e-06, "loss": 0.2388, "step": 4966 }, { "epoch": 2.7841928251121075, "grad_norm": 0.07816803033206252, "learning_rate": 3.1371431637368665e-06, "loss": 0.2401, "step": 4967 }, { "epoch": 2.7847533632287, "grad_norm": 0.08108139592489091, "learning_rate": 3.120952674856614e-06, "loss": 0.2377, "step": 4968 }, { "epoch": 2.785313901345291, "grad_norm": 0.07894030303325457, "learning_rate": 3.104803410593693e-06, "loss": 0.2385, "step": 4969 }, { "epoch": 2.7858744394618835, "grad_norm": 0.07766078483546503, "learning_rate": 3.0886953778200277e-06, "loss": 0.2283, "step": 4970 }, { "epoch": 2.7864349775784754, "grad_norm": 0.08313766543340664, "learning_rate": 3.0726285833900583e-06, "loss": 0.2443, "step": 4971 }, { "epoch": 2.786995515695067, "grad_norm": 0.0783837382802284, "learning_rate": 3.0566030341405925e-06, "loss": 0.2358, "step": 4972 }, { "epoch": 2.787556053811659, "grad_norm": 0.07980212385524099, "learning_rate": 3.0406187368909435e-06, "loss": 0.2521, "step": 4973 }, { "epoch": 2.788116591928251, "grad_norm": 0.08193651596707059, "learning_rate": 3.0246756984428582e-06, "loss": 0.2464, "step": 4974 }, { "epoch": 2.788677130044843, "grad_norm": 0.07896535840835087, "learning_rate": 3.0087739255804993e-06, "loss": 0.2417, "step": 4975 }, { "epoch": 2.789237668161435, "grad_norm": 0.08002503693440731, "learning_rate": 2.9929134250705427e-06, "loss": 0.2398, "step": 4976 }, { "epoch": 2.789798206278027, "grad_norm": 0.08082508620144269, "learning_rate": 2.977094203662012e-06, "loss": 0.2407, "step": 4977 }, { "epoch": 2.7903587443946187, "grad_norm": 0.08292065152762618, "learning_rate": 2.9613162680864224e-06, "loss": 0.2462, "step": 4978 }, { "epoch": 2.7909192825112106, "grad_norm": 0.08050626144783643, "learning_rate": 2.945579625057715e-06, "loss": 0.2475, "step": 4979 }, { "epoch": 2.791479820627803, "grad_norm": 0.08182481488605035, "learning_rate": 2.9298842812722327e-06, "loss": 0.2523, "step": 4980 }, { "epoch": 2.7920403587443947, "grad_norm": 0.08249741359834577, "learning_rate": 2.914230243408789e-06, "loss": 0.2354, "step": 4981 }, { "epoch": 2.7926008968609866, "grad_norm": 0.08086698775520357, "learning_rate": 2.898617518128566e-06, "loss": 0.2386, "step": 4982 }, { "epoch": 2.7931614349775784, "grad_norm": 0.08314237452585792, "learning_rate": 2.8830461120752163e-06, "loss": 0.2424, "step": 4983 }, { "epoch": 2.7937219730941703, "grad_norm": 0.0791477644478879, "learning_rate": 2.8675160318747727e-06, "loss": 0.238, "step": 4984 }, { "epoch": 2.7942825112107625, "grad_norm": 0.08031181024209258, "learning_rate": 2.8520272841357055e-06, "loss": 0.2421, "step": 4985 }, { "epoch": 2.7948430493273544, "grad_norm": 0.0810401880716221, "learning_rate": 2.836579875448886e-06, "loss": 0.2407, "step": 4986 }, { "epoch": 2.7954035874439462, "grad_norm": 0.08004090489087526, "learning_rate": 2.8211738123876006e-06, "loss": 0.2478, "step": 4987 }, { "epoch": 2.795964125560538, "grad_norm": 0.08115702823252946, "learning_rate": 2.8058091015075394e-06, "loss": 0.2436, "step": 4988 }, { "epoch": 2.79652466367713, "grad_norm": 0.08034753708061859, "learning_rate": 2.790485749346805e-06, "loss": 0.2381, "step": 4989 }, { "epoch": 2.797085201793722, "grad_norm": 0.08439626147729173, "learning_rate": 2.775203762425882e-06, "loss": 0.2462, "step": 4990 }, { "epoch": 2.797645739910314, "grad_norm": 0.07875450477637529, "learning_rate": 2.7599631472476683e-06, "loss": 0.2319, "step": 4991 }, { "epoch": 2.798206278026906, "grad_norm": 0.08115777020139331, "learning_rate": 2.7447639102974434e-06, "loss": 0.231, "step": 4992 }, { "epoch": 2.7987668161434978, "grad_norm": 0.08316558015368568, "learning_rate": 2.7296060580428885e-06, "loss": 0.2424, "step": 4993 }, { "epoch": 2.7993273542600896, "grad_norm": 0.08079652698760205, "learning_rate": 2.714489596934089e-06, "loss": 0.2371, "step": 4994 }, { "epoch": 2.7998878923766815, "grad_norm": 0.0812179868460853, "learning_rate": 2.6994145334034994e-06, "loss": 0.2427, "step": 4995 }, { "epoch": 2.8004484304932733, "grad_norm": 0.0812805787555618, "learning_rate": 2.6843808738659324e-06, "loss": 0.2479, "step": 4996 }, { "epoch": 2.8010089686098656, "grad_norm": 0.07843320292716413, "learning_rate": 2.6693886247186605e-06, "loss": 0.236, "step": 4997 }, { "epoch": 2.8015695067264574, "grad_norm": 0.08061411163696575, "learning_rate": 2.6544377923412465e-06, "loss": 0.2504, "step": 4998 }, { "epoch": 2.8021300448430493, "grad_norm": 0.07930131566618949, "learning_rate": 2.6395283830956686e-06, "loss": 0.2377, "step": 4999 }, { "epoch": 2.802690582959641, "grad_norm": 0.08071531139627736, "learning_rate": 2.6246604033262954e-06, "loss": 0.2421, "step": 5000 }, { "epoch": 2.803251121076233, "grad_norm": 0.07931495086159082, "learning_rate": 2.6098338593598447e-06, "loss": 0.2258, "step": 5001 }, { "epoch": 2.8038116591928253, "grad_norm": 0.08094859462018693, "learning_rate": 2.595048757505392e-06, "loss": 0.2392, "step": 5002 }, { "epoch": 2.804372197309417, "grad_norm": 0.07992420918468417, "learning_rate": 2.5803051040544146e-06, "loss": 0.2275, "step": 5003 }, { "epoch": 2.804932735426009, "grad_norm": 0.0799245025277946, "learning_rate": 2.565602905280717e-06, "loss": 0.2401, "step": 5004 }, { "epoch": 2.805493273542601, "grad_norm": 0.07856487841272483, "learning_rate": 2.5509421674404844e-06, "loss": 0.2313, "step": 5005 }, { "epoch": 2.8060538116591927, "grad_norm": 0.07885956821847813, "learning_rate": 2.5363228967722364e-06, "loss": 0.2449, "step": 5006 }, { "epoch": 2.806614349775785, "grad_norm": 0.07722484258529054, "learning_rate": 2.521745099496886e-06, "loss": 0.2427, "step": 5007 }, { "epoch": 2.807174887892377, "grad_norm": 0.07920463080690886, "learning_rate": 2.5072087818176382e-06, "loss": 0.2326, "step": 5008 }, { "epoch": 2.8077354260089686, "grad_norm": 0.07947998344169381, "learning_rate": 2.4927139499201225e-06, "loss": 0.23, "step": 5009 }, { "epoch": 2.8082959641255605, "grad_norm": 0.08173714135267525, "learning_rate": 2.4782606099722606e-06, "loss": 0.247, "step": 5010 }, { "epoch": 2.8088565022421523, "grad_norm": 0.07981986410717254, "learning_rate": 2.4638487681243215e-06, "loss": 0.2236, "step": 5011 }, { "epoch": 2.8094170403587446, "grad_norm": 0.07642627727368918, "learning_rate": 2.4494784305089557e-06, "loss": 0.238, "step": 5012 }, { "epoch": 2.8099775784753365, "grad_norm": 0.08232066180554665, "learning_rate": 2.4351496032410938e-06, "loss": 0.2414, "step": 5013 }, { "epoch": 2.8105381165919283, "grad_norm": 0.08064050801326456, "learning_rate": 2.4208622924180578e-06, "loss": 0.2406, "step": 5014 }, { "epoch": 2.81109865470852, "grad_norm": 0.08195562292195246, "learning_rate": 2.406616504119463e-06, "loss": 0.2476, "step": 5015 }, { "epoch": 2.811659192825112, "grad_norm": 0.08059547939756304, "learning_rate": 2.392412244407294e-06, "loss": 0.2513, "step": 5016 }, { "epoch": 2.8122197309417043, "grad_norm": 0.08216416650635741, "learning_rate": 2.3782495193258147e-06, "loss": 0.2558, "step": 5017 }, { "epoch": 2.8127802690582957, "grad_norm": 0.08068733347935715, "learning_rate": 2.3641283349016607e-06, "loss": 0.258, "step": 5018 }, { "epoch": 2.813340807174888, "grad_norm": 0.08050818728921794, "learning_rate": 2.3500486971437587e-06, "loss": 0.2456, "step": 5019 }, { "epoch": 2.81390134529148, "grad_norm": 0.08292945331512773, "learning_rate": 2.336010612043382e-06, "loss": 0.2437, "step": 5020 }, { "epoch": 2.8144618834080717, "grad_norm": 0.07795781503050385, "learning_rate": 2.322014085574109e-06, "loss": 0.2386, "step": 5021 }, { "epoch": 2.8150224215246635, "grad_norm": 0.08022008972424731, "learning_rate": 2.3080591236918303e-06, "loss": 0.2336, "step": 5022 }, { "epoch": 2.8155829596412554, "grad_norm": 0.08247247207217011, "learning_rate": 2.2941457323347627e-06, "loss": 0.2506, "step": 5023 }, { "epoch": 2.8161434977578477, "grad_norm": 0.08018762360263491, "learning_rate": 2.2802739174234146e-06, "loss": 0.2406, "step": 5024 }, { "epoch": 2.8167040358744395, "grad_norm": 0.08380126760379156, "learning_rate": 2.2664436848606194e-06, "loss": 0.265, "step": 5025 }, { "epoch": 2.8172645739910314, "grad_norm": 0.0800422086942214, "learning_rate": 2.252655040531493e-06, "loss": 0.2322, "step": 5026 }, { "epoch": 2.817825112107623, "grad_norm": 0.07979590612087926, "learning_rate": 2.238907990303496e-06, "loss": 0.2359, "step": 5027 }, { "epoch": 2.818385650224215, "grad_norm": 0.08201699136530352, "learning_rate": 2.225202540026361e-06, "loss": 0.2594, "step": 5028 }, { "epoch": 2.8189461883408073, "grad_norm": 0.0830963486106769, "learning_rate": 2.2115386955321004e-06, "loss": 0.2448, "step": 5029 }, { "epoch": 2.819506726457399, "grad_norm": 0.08119421109654049, "learning_rate": 2.1979164626350748e-06, "loss": 0.2481, "step": 5030 }, { "epoch": 2.820067264573991, "grad_norm": 0.0809885995298249, "learning_rate": 2.1843358471318908e-06, "loss": 0.2436, "step": 5031 }, { "epoch": 2.820627802690583, "grad_norm": 0.07844458178452507, "learning_rate": 2.170796854801449e-06, "loss": 0.242, "step": 5032 }, { "epoch": 2.8211883408071747, "grad_norm": 0.07898023469165534, "learning_rate": 2.1572994914049847e-06, "loss": 0.234, "step": 5033 }, { "epoch": 2.821748878923767, "grad_norm": 0.0823097354542868, "learning_rate": 2.1438437626859487e-06, "loss": 0.2403, "step": 5034 }, { "epoch": 2.822309417040359, "grad_norm": 0.07985392689741619, "learning_rate": 2.130429674370138e-06, "loss": 0.227, "step": 5035 }, { "epoch": 2.8228699551569507, "grad_norm": 0.08264550665738286, "learning_rate": 2.1170572321655868e-06, "loss": 0.2488, "step": 5036 }, { "epoch": 2.8234304932735426, "grad_norm": 0.08096897766330863, "learning_rate": 2.1037264417626544e-06, "loss": 0.2502, "step": 5037 }, { "epoch": 2.8239910313901344, "grad_norm": 0.08129709995477422, "learning_rate": 2.0904373088339367e-06, "loss": 0.2506, "step": 5038 }, { "epoch": 2.8245515695067267, "grad_norm": 0.08062035437913692, "learning_rate": 2.077189839034288e-06, "loss": 0.2321, "step": 5039 }, { "epoch": 2.8251121076233185, "grad_norm": 0.07923689907747232, "learning_rate": 2.063984038000888e-06, "loss": 0.2402, "step": 5040 }, { "epoch": 2.8256726457399104, "grad_norm": 0.08120919706097315, "learning_rate": 2.0508199113531414e-06, "loss": 0.2444, "step": 5041 }, { "epoch": 2.8262331838565022, "grad_norm": 0.08176687509396283, "learning_rate": 2.037697464692756e-06, "loss": 0.2376, "step": 5042 }, { "epoch": 2.826793721973094, "grad_norm": 0.08126695881107068, "learning_rate": 2.0246167036036543e-06, "loss": 0.2372, "step": 5043 }, { "epoch": 2.827354260089686, "grad_norm": 0.08124315561587099, "learning_rate": 2.011577633652062e-06, "loss": 0.2484, "step": 5044 }, { "epoch": 2.827914798206278, "grad_norm": 0.0792594597102603, "learning_rate": 1.9985802603864624e-06, "loss": 0.2401, "step": 5045 }, { "epoch": 2.82847533632287, "grad_norm": 0.08248713684791316, "learning_rate": 1.9856245893375645e-06, "loss": 0.241, "step": 5046 }, { "epoch": 2.829035874439462, "grad_norm": 0.08223085773508047, "learning_rate": 1.9727106260183704e-06, "loss": 0.2318, "step": 5047 }, { "epoch": 2.8295964125560538, "grad_norm": 0.08181175375406184, "learning_rate": 1.9598383759240946e-06, "loss": 0.2573, "step": 5048 }, { "epoch": 2.8301569506726456, "grad_norm": 0.07889165704259438, "learning_rate": 1.947007844532245e-06, "loss": 0.2198, "step": 5049 }, { "epoch": 2.8307174887892375, "grad_norm": 0.07950468343923922, "learning_rate": 1.9342190373025313e-06, "loss": 0.2343, "step": 5050 }, { "epoch": 2.8312780269058297, "grad_norm": 0.07981220514558676, "learning_rate": 1.921471959676957e-06, "loss": 0.2442, "step": 5051 }, { "epoch": 2.8318385650224216, "grad_norm": 0.079618243775057, "learning_rate": 1.9087666170797267e-06, "loss": 0.24, "step": 5052 }, { "epoch": 2.8323991031390134, "grad_norm": 0.0785857151532339, "learning_rate": 1.8961030149173054e-06, "loss": 0.2476, "step": 5053 }, { "epoch": 2.8329596412556053, "grad_norm": 0.08025682912235395, "learning_rate": 1.883481158578404e-06, "loss": 0.2477, "step": 5054 }, { "epoch": 2.833520179372197, "grad_norm": 0.0788992016416964, "learning_rate": 1.8709010534339378e-06, "loss": 0.2346, "step": 5055 }, { "epoch": 2.8340807174887894, "grad_norm": 0.07886702908057779, "learning_rate": 1.8583627048371022e-06, "loss": 0.2378, "step": 5056 }, { "epoch": 2.8346412556053813, "grad_norm": 0.07768139483717154, "learning_rate": 1.8458661181232739e-06, "loss": 0.2207, "step": 5057 }, { "epoch": 2.835201793721973, "grad_norm": 0.07668230535027755, "learning_rate": 1.8334112986100992e-06, "loss": 0.2359, "step": 5058 }, { "epoch": 2.835762331838565, "grad_norm": 0.07892917109696834, "learning_rate": 1.8209982515974277e-06, "loss": 0.2346, "step": 5059 }, { "epoch": 2.836322869955157, "grad_norm": 0.07841402349758807, "learning_rate": 1.8086269823673563e-06, "loss": 0.2339, "step": 5060 }, { "epoch": 2.836883408071749, "grad_norm": 0.07847361926914799, "learning_rate": 1.7962974961841738e-06, "loss": 0.2315, "step": 5061 }, { "epoch": 2.837443946188341, "grad_norm": 0.08110310021840367, "learning_rate": 1.784009798294406e-06, "loss": 0.2511, "step": 5062 }, { "epoch": 2.838004484304933, "grad_norm": 0.08141770914014684, "learning_rate": 1.7717638939268145e-06, "loss": 0.237, "step": 5063 }, { "epoch": 2.8385650224215246, "grad_norm": 0.08032709195173478, "learning_rate": 1.7595597882923309e-06, "loss": 0.2359, "step": 5064 }, { "epoch": 2.8391255605381165, "grad_norm": 0.08112292362244095, "learning_rate": 1.7473974865841569e-06, "loss": 0.2299, "step": 5065 }, { "epoch": 2.839686098654709, "grad_norm": 0.07803159972214607, "learning_rate": 1.7352769939776526e-06, "loss": 0.2328, "step": 5066 }, { "epoch": 2.8402466367713, "grad_norm": 0.0785507265144912, "learning_rate": 1.7231983156304144e-06, "loss": 0.2367, "step": 5067 }, { "epoch": 2.8408071748878925, "grad_norm": 0.0817662928210577, "learning_rate": 1.711161456682242e-06, "loss": 0.2434, "step": 5068 }, { "epoch": 2.8413677130044843, "grad_norm": 0.07645522785547558, "learning_rate": 1.6991664222551495e-06, "loss": 0.2245, "step": 5069 }, { "epoch": 2.841928251121076, "grad_norm": 0.08026739599895902, "learning_rate": 1.6872132174533427e-06, "loss": 0.2441, "step": 5070 }, { "epoch": 2.842488789237668, "grad_norm": 0.08211132386376416, "learning_rate": 1.6753018473632087e-06, "loss": 0.2506, "step": 5071 }, { "epoch": 2.84304932735426, "grad_norm": 0.08170934677712485, "learning_rate": 1.6634323170533928e-06, "loss": 0.2446, "step": 5072 }, { "epoch": 2.843609865470852, "grad_norm": 0.07739677551438127, "learning_rate": 1.6516046315746659e-06, "loss": 0.2428, "step": 5073 }, { "epoch": 2.844170403587444, "grad_norm": 0.08151001977790998, "learning_rate": 1.639818795960013e-06, "loss": 0.2469, "step": 5074 }, { "epoch": 2.844730941704036, "grad_norm": 0.08105458687991796, "learning_rate": 1.6280748152246562e-06, "loss": 0.255, "step": 5075 }, { "epoch": 2.8452914798206277, "grad_norm": 0.08235718212563223, "learning_rate": 1.6163726943659419e-06, "loss": 0.2458, "step": 5076 }, { "epoch": 2.8458520179372195, "grad_norm": 0.07878870549768287, "learning_rate": 1.6047124383634537e-06, "loss": 0.2382, "step": 5077 }, { "epoch": 2.846412556053812, "grad_norm": 0.08324852404587757, "learning_rate": 1.593094052178945e-06, "loss": 0.2397, "step": 5078 }, { "epoch": 2.8469730941704037, "grad_norm": 0.08146821956937725, "learning_rate": 1.5815175407563165e-06, "loss": 0.233, "step": 5079 }, { "epoch": 2.8475336322869955, "grad_norm": 0.08099050094226903, "learning_rate": 1.5699829090217278e-06, "loss": 0.2408, "step": 5080 }, { "epoch": 2.8480941704035874, "grad_norm": 0.07791236653043693, "learning_rate": 1.5584901618834301e-06, "loss": 0.2382, "step": 5081 }, { "epoch": 2.848654708520179, "grad_norm": 0.08058011888194505, "learning_rate": 1.5470393042319232e-06, "loss": 0.238, "step": 5082 }, { "epoch": 2.8492152466367715, "grad_norm": 0.0821254894649332, "learning_rate": 1.535630340939842e-06, "loss": 0.2481, "step": 5083 }, { "epoch": 2.8497757847533634, "grad_norm": 0.08206897357919915, "learning_rate": 1.5242632768619925e-06, "loss": 0.2369, "step": 5084 }, { "epoch": 2.850336322869955, "grad_norm": 0.0786350321647868, "learning_rate": 1.512938116835394e-06, "loss": 0.24, "step": 5085 }, { "epoch": 2.850896860986547, "grad_norm": 0.08234829259429416, "learning_rate": 1.5016548656791697e-06, "loss": 0.2566, "step": 5086 }, { "epoch": 2.851457399103139, "grad_norm": 0.08187607263505849, "learning_rate": 1.4904135281946673e-06, "loss": 0.2459, "step": 5087 }, { "epoch": 2.852017937219731, "grad_norm": 0.07914659198871633, "learning_rate": 1.4792141091653612e-06, "loss": 0.2366, "step": 5088 }, { "epoch": 2.852578475336323, "grad_norm": 0.08124554504977664, "learning_rate": 1.4680566133569162e-06, "loss": 0.2418, "step": 5089 }, { "epoch": 2.853139013452915, "grad_norm": 0.07868278459909037, "learning_rate": 1.4569410455171351e-06, "loss": 0.246, "step": 5090 }, { "epoch": 2.8536995515695067, "grad_norm": 0.0795003399865061, "learning_rate": 1.4458674103759894e-06, "loss": 0.2565, "step": 5091 }, { "epoch": 2.8542600896860986, "grad_norm": 0.08041450301498791, "learning_rate": 1.4348357126456102e-06, "loss": 0.2321, "step": 5092 }, { "epoch": 2.854820627802691, "grad_norm": 0.08077531018250238, "learning_rate": 1.4238459570202644e-06, "loss": 0.2468, "step": 5093 }, { "epoch": 2.8553811659192823, "grad_norm": 0.08090510985480488, "learning_rate": 1.4128981481764115e-06, "loss": 0.2482, "step": 5094 }, { "epoch": 2.8559417040358746, "grad_norm": 0.0822852376229242, "learning_rate": 1.4019922907726136e-06, "loss": 0.2482, "step": 5095 }, { "epoch": 2.8565022421524664, "grad_norm": 0.07866308000957997, "learning_rate": 1.3911283894496253e-06, "loss": 0.2267, "step": 5096 }, { "epoch": 2.8570627802690582, "grad_norm": 0.08146261608076158, "learning_rate": 1.380306448830293e-06, "loss": 0.2414, "step": 5097 }, { "epoch": 2.85762331838565, "grad_norm": 0.08291387306663744, "learning_rate": 1.3695264735196778e-06, "loss": 0.2613, "step": 5098 }, { "epoch": 2.858183856502242, "grad_norm": 0.07746619006915174, "learning_rate": 1.3587884681049322e-06, "loss": 0.229, "step": 5099 }, { "epoch": 2.8587443946188342, "grad_norm": 0.0811871605233219, "learning_rate": 1.348092437155346e-06, "loss": 0.2532, "step": 5100 }, { "epoch": 2.859304932735426, "grad_norm": 0.08048551935917159, "learning_rate": 1.3374383852223892e-06, "loss": 0.2342, "step": 5101 }, { "epoch": 2.859865470852018, "grad_norm": 0.08047930535275231, "learning_rate": 1.3268263168396245e-06, "loss": 0.2468, "step": 5102 }, { "epoch": 2.8604260089686098, "grad_norm": 0.07982049190502127, "learning_rate": 1.316256236522806e-06, "loss": 0.2399, "step": 5103 }, { "epoch": 2.8609865470852016, "grad_norm": 0.0808219184841395, "learning_rate": 1.305728148769736e-06, "loss": 0.2442, "step": 5104 }, { "epoch": 2.861547085201794, "grad_norm": 0.08077620103395643, "learning_rate": 1.295242058060442e-06, "loss": 0.2438, "step": 5105 }, { "epoch": 2.8621076233183858, "grad_norm": 0.07761821979349973, "learning_rate": 1.28479796885701e-06, "loss": 0.2398, "step": 5106 }, { "epoch": 2.8626681614349776, "grad_norm": 0.0788037658984251, "learning_rate": 1.2743958856036743e-06, "loss": 0.2391, "step": 5107 }, { "epoch": 2.8632286995515694, "grad_norm": 0.07908527218812027, "learning_rate": 1.2640358127268049e-06, "loss": 0.2407, "step": 5108 }, { "epoch": 2.8637892376681613, "grad_norm": 0.07836661894689284, "learning_rate": 1.2537177546348978e-06, "loss": 0.2346, "step": 5109 }, { "epoch": 2.8643497757847536, "grad_norm": 0.08349840583336998, "learning_rate": 1.2434417157185519e-06, "loss": 0.2485, "step": 5110 }, { "epoch": 2.8649103139013454, "grad_norm": 0.07926798817133278, "learning_rate": 1.2332077003505027e-06, "loss": 0.2349, "step": 5111 }, { "epoch": 2.8654708520179373, "grad_norm": 0.08018566682994367, "learning_rate": 1.223015712885589e-06, "loss": 0.2415, "step": 5112 }, { "epoch": 2.866031390134529, "grad_norm": 0.0815649615936414, "learning_rate": 1.2128657576607861e-06, "loss": 0.2394, "step": 5113 }, { "epoch": 2.866591928251121, "grad_norm": 0.07900698383651082, "learning_rate": 1.2027578389951499e-06, "loss": 0.2319, "step": 5114 }, { "epoch": 2.8671524663677133, "grad_norm": 0.0781644852483143, "learning_rate": 1.1926919611898847e-06, "loss": 0.2418, "step": 5115 }, { "epoch": 2.8677130044843047, "grad_norm": 0.07902751814612542, "learning_rate": 1.182668128528286e-06, "loss": 0.2289, "step": 5116 }, { "epoch": 2.868273542600897, "grad_norm": 0.07763216154627357, "learning_rate": 1.1726863452757642e-06, "loss": 0.2432, "step": 5117 }, { "epoch": 2.868834080717489, "grad_norm": 0.07987496597951721, "learning_rate": 1.1627466156798328e-06, "loss": 0.2366, "step": 5118 }, { "epoch": 2.8693946188340806, "grad_norm": 0.08033334021918984, "learning_rate": 1.1528489439701085e-06, "loss": 0.2439, "step": 5119 }, { "epoch": 2.8699551569506725, "grad_norm": 0.08015640437497674, "learning_rate": 1.142993334358311e-06, "loss": 0.2416, "step": 5120 }, { "epoch": 2.8705156950672643, "grad_norm": 0.08066441239988657, "learning_rate": 1.1331797910382747e-06, "loss": 0.2408, "step": 5121 }, { "epoch": 2.8710762331838566, "grad_norm": 0.08384191065283618, "learning_rate": 1.1234083181859256e-06, "loss": 0.2384, "step": 5122 }, { "epoch": 2.8716367713004485, "grad_norm": 0.08123361697642735, "learning_rate": 1.1136789199592713e-06, "loss": 0.2462, "step": 5123 }, { "epoch": 2.8721973094170403, "grad_norm": 0.0794233829323199, "learning_rate": 1.1039916004984441e-06, "loss": 0.2418, "step": 5124 }, { "epoch": 2.872757847533632, "grad_norm": 0.07897174463108514, "learning_rate": 1.094346363925647e-06, "loss": 0.2377, "step": 5125 }, { "epoch": 2.873318385650224, "grad_norm": 0.08348418792163165, "learning_rate": 1.0847432143451962e-06, "loss": 0.2421, "step": 5126 }, { "epoch": 2.8738789237668163, "grad_norm": 0.08161778924749036, "learning_rate": 1.0751821558434793e-06, "loss": 0.2459, "step": 5127 }, { "epoch": 2.874439461883408, "grad_norm": 0.08116353598027902, "learning_rate": 1.0656631924889749e-06, "loss": 0.2478, "step": 5128 }, { "epoch": 2.875, "grad_norm": 0.08347565726708857, "learning_rate": 1.0561863283322759e-06, "loss": 0.2393, "step": 5129 }, { "epoch": 2.875560538116592, "grad_norm": 0.08033654412257757, "learning_rate": 1.0467515674060236e-06, "loss": 0.2417, "step": 5130 }, { "epoch": 2.8761210762331837, "grad_norm": 0.07684616191101404, "learning_rate": 1.037358913724973e-06, "loss": 0.2362, "step": 5131 }, { "epoch": 2.876681614349776, "grad_norm": 0.08189726163249501, "learning_rate": 1.028008371285938e-06, "loss": 0.2407, "step": 5132 }, { "epoch": 2.877242152466368, "grad_norm": 0.08280875568688238, "learning_rate": 1.0186999440678246e-06, "loss": 0.2496, "step": 5133 }, { "epoch": 2.8778026905829597, "grad_norm": 0.07971865121690089, "learning_rate": 1.0094336360316202e-06, "loss": 0.2363, "step": 5134 }, { "epoch": 2.8783632286995515, "grad_norm": 0.08195841973986688, "learning_rate": 1.0002094511203819e-06, "loss": 0.2407, "step": 5135 }, { "epoch": 2.8789237668161434, "grad_norm": 0.08239469761028677, "learning_rate": 9.910273932592584e-07, "loss": 0.2561, "step": 5136 }, { "epoch": 2.8794843049327357, "grad_norm": 0.08177280278997268, "learning_rate": 9.818874663554357e-07, "loss": 0.2459, "step": 5137 }, { "epoch": 2.8800448430493275, "grad_norm": 0.07983579704437194, "learning_rate": 9.727896742982245e-07, "loss": 0.2486, "step": 5138 }, { "epoch": 2.8806053811659194, "grad_norm": 0.08053571258340099, "learning_rate": 9.63734020958973e-07, "loss": 0.2365, "step": 5139 }, { "epoch": 2.881165919282511, "grad_norm": 0.07728186842334782, "learning_rate": 9.54720510191076e-07, "loss": 0.2355, "step": 5140 }, { "epoch": 2.881726457399103, "grad_norm": 0.08057560656759212, "learning_rate": 9.457491458300549e-07, "loss": 0.2421, "step": 5141 }, { "epoch": 2.8822869955156953, "grad_norm": 0.08003610843585328, "learning_rate": 9.368199316934445e-07, "loss": 0.2478, "step": 5142 }, { "epoch": 2.8828475336322867, "grad_norm": 0.08280976479529986, "learning_rate": 9.279328715808722e-07, "loss": 0.2435, "step": 5143 }, { "epoch": 2.883408071748879, "grad_norm": 0.07809939277009989, "learning_rate": 9.190879692740128e-07, "loss": 0.2337, "step": 5144 }, { "epoch": 2.883968609865471, "grad_norm": 0.07922551065074915, "learning_rate": 9.102852285366226e-07, "loss": 0.2446, "step": 5145 }, { "epoch": 2.8845291479820627, "grad_norm": 0.07825885666601355, "learning_rate": 9.015246531144939e-07, "loss": 0.2306, "step": 5146 }, { "epoch": 2.8850896860986546, "grad_norm": 0.07878833678038266, "learning_rate": 8.92806246735467e-07, "loss": 0.2341, "step": 5147 }, { "epoch": 2.8856502242152464, "grad_norm": 0.08181587406985132, "learning_rate": 8.841300131094854e-07, "loss": 0.2457, "step": 5148 }, { "epoch": 2.8862107623318387, "grad_norm": 0.0805969143015379, "learning_rate": 8.75495955928507e-07, "loss": 0.2351, "step": 5149 }, { "epoch": 2.8867713004484306, "grad_norm": 0.07843158968423362, "learning_rate": 8.669040788665372e-07, "loss": 0.2324, "step": 5150 }, { "epoch": 2.8873318385650224, "grad_norm": 0.07739285893068815, "learning_rate": 8.583543855796738e-07, "loss": 0.2319, "step": 5151 }, { "epoch": 2.8878923766816142, "grad_norm": 0.08471175286687038, "learning_rate": 8.498468797060289e-07, "loss": 0.2366, "step": 5152 }, { "epoch": 2.888452914798206, "grad_norm": 0.08302254160230785, "learning_rate": 8.413815648657731e-07, "loss": 0.2525, "step": 5153 }, { "epoch": 2.8890134529147984, "grad_norm": 0.07967514164685403, "learning_rate": 8.329584446611138e-07, "loss": 0.2447, "step": 5154 }, { "epoch": 2.8895739910313902, "grad_norm": 0.08256158297069675, "learning_rate": 8.245775226763397e-07, "loss": 0.2476, "step": 5155 }, { "epoch": 2.890134529147982, "grad_norm": 0.08179878115799412, "learning_rate": 8.162388024777201e-07, "loss": 0.2411, "step": 5156 }, { "epoch": 2.890695067264574, "grad_norm": 0.08052932090181747, "learning_rate": 8.079422876136388e-07, "loss": 0.237, "step": 5157 }, { "epoch": 2.8912556053811658, "grad_norm": 0.08060278046000689, "learning_rate": 7.996879816144498e-07, "loss": 0.2423, "step": 5158 }, { "epoch": 2.891816143497758, "grad_norm": 0.08077338111923703, "learning_rate": 7.914758879925988e-07, "loss": 0.2445, "step": 5159 }, { "epoch": 2.89237668161435, "grad_norm": 0.07978404180548836, "learning_rate": 7.833060102425682e-07, "loss": 0.2481, "step": 5160 }, { "epoch": 2.8929372197309418, "grad_norm": 0.08072306353866018, "learning_rate": 7.751783518408218e-07, "loss": 0.2392, "step": 5161 }, { "epoch": 2.8934977578475336, "grad_norm": 0.07596315491150957, "learning_rate": 7.670929162459261e-07, "loss": 0.2229, "step": 5162 }, { "epoch": 2.8940582959641254, "grad_norm": 0.08204698511684655, "learning_rate": 7.590497068984293e-07, "loss": 0.2464, "step": 5163 }, { "epoch": 2.8946188340807177, "grad_norm": 0.08109058564013136, "learning_rate": 7.510487272209377e-07, "loss": 0.2437, "step": 5164 }, { "epoch": 2.895179372197309, "grad_norm": 0.08325131574882993, "learning_rate": 7.430899806180835e-07, "loss": 0.245, "step": 5165 }, { "epoch": 2.8957399103139014, "grad_norm": 0.08101001328605403, "learning_rate": 7.351734704765245e-07, "loss": 0.2377, "step": 5166 }, { "epoch": 2.8963004484304933, "grad_norm": 0.08026757155181756, "learning_rate": 7.272992001649436e-07, "loss": 0.2284, "step": 5167 }, { "epoch": 2.896860986547085, "grad_norm": 0.07914441858849464, "learning_rate": 7.194671730340608e-07, "loss": 0.2449, "step": 5168 }, { "epoch": 2.897421524663677, "grad_norm": 0.08445317725781687, "learning_rate": 7.116773924166098e-07, "loss": 0.2426, "step": 5169 }, { "epoch": 2.897982062780269, "grad_norm": 0.07822137255581857, "learning_rate": 7.039298616273393e-07, "loss": 0.2333, "step": 5170 }, { "epoch": 2.898542600896861, "grad_norm": 0.07983348122960583, "learning_rate": 6.962245839630455e-07, "loss": 0.2417, "step": 5171 }, { "epoch": 2.899103139013453, "grad_norm": 0.08133611004648454, "learning_rate": 6.885615627025166e-07, "loss": 0.2406, "step": 5172 }, { "epoch": 2.899663677130045, "grad_norm": 0.07786241853242937, "learning_rate": 6.809408011065887e-07, "loss": 0.2348, "step": 5173 }, { "epoch": 2.9002242152466366, "grad_norm": 0.08319128466847514, "learning_rate": 6.733623024180791e-07, "loss": 0.251, "step": 5174 }, { "epoch": 2.9007847533632285, "grad_norm": 0.0806610722398245, "learning_rate": 6.658260698618524e-07, "loss": 0.2428, "step": 5175 }, { "epoch": 2.901345291479821, "grad_norm": 0.07884296766934859, "learning_rate": 6.583321066447656e-07, "loss": 0.2354, "step": 5176 }, { "epoch": 2.9019058295964126, "grad_norm": 0.08325168236695274, "learning_rate": 6.508804159557236e-07, "loss": 0.2576, "step": 5177 }, { "epoch": 2.9024663677130045, "grad_norm": 0.08127027999981284, "learning_rate": 6.434710009656008e-07, "loss": 0.2307, "step": 5178 }, { "epoch": 2.9030269058295963, "grad_norm": 0.07950439689511878, "learning_rate": 6.361038648273088e-07, "loss": 0.238, "step": 5179 }, { "epoch": 2.903587443946188, "grad_norm": 0.07941759994163064, "learning_rate": 6.287790106757396e-07, "loss": 0.2459, "step": 5180 }, { "epoch": 2.9041479820627805, "grad_norm": 0.08089931654603481, "learning_rate": 6.214964416278445e-07, "loss": 0.2446, "step": 5181 }, { "epoch": 2.9047085201793723, "grad_norm": 0.08088896415189534, "learning_rate": 6.142561607825337e-07, "loss": 0.2353, "step": 5182 }, { "epoch": 2.905269058295964, "grad_norm": 0.08117874836660299, "learning_rate": 6.070581712207424e-07, "loss": 0.2469, "step": 5183 }, { "epoch": 2.905829596412556, "grad_norm": 0.08065707726695893, "learning_rate": 5.999024760054095e-07, "loss": 0.2387, "step": 5184 }, { "epoch": 2.906390134529148, "grad_norm": 0.08049225142767744, "learning_rate": 5.927890781814661e-07, "loss": 0.2415, "step": 5185 }, { "epoch": 2.90695067264574, "grad_norm": 0.08113091376622063, "learning_rate": 5.857179807758684e-07, "loss": 0.2491, "step": 5186 }, { "epoch": 2.907511210762332, "grad_norm": 0.07695739944725659, "learning_rate": 5.78689186797543e-07, "loss": 0.2306, "step": 5187 }, { "epoch": 2.908071748878924, "grad_norm": 0.08066498499230515, "learning_rate": 5.717026992374308e-07, "loss": 0.2472, "step": 5188 }, { "epoch": 2.9086322869955157, "grad_norm": 0.08083582416430926, "learning_rate": 5.647585210684758e-07, "loss": 0.2547, "step": 5189 }, { "epoch": 2.9091928251121075, "grad_norm": 0.0813592996568508, "learning_rate": 5.578566552456032e-07, "loss": 0.2411, "step": 5190 }, { "epoch": 2.9097533632287, "grad_norm": 0.07931356603718039, "learning_rate": 5.509971047057416e-07, "loss": 0.2411, "step": 5191 }, { "epoch": 2.910313901345291, "grad_norm": 0.08105103356021948, "learning_rate": 5.441798723678115e-07, "loss": 0.2415, "step": 5192 }, { "epoch": 2.9108744394618835, "grad_norm": 0.07855446840127679, "learning_rate": 5.37404961132737e-07, "loss": 0.2408, "step": 5193 }, { "epoch": 2.9114349775784754, "grad_norm": 0.07995087383105443, "learning_rate": 5.306723738834119e-07, "loss": 0.2389, "step": 5194 }, { "epoch": 2.911995515695067, "grad_norm": 0.08029067563327771, "learning_rate": 5.239821134847445e-07, "loss": 0.2341, "step": 5195 }, { "epoch": 2.912556053811659, "grad_norm": 0.07820034799966565, "learning_rate": 5.173341827836021e-07, "loss": 0.2436, "step": 5196 }, { "epoch": 2.913116591928251, "grad_norm": 0.07774134929015715, "learning_rate": 5.107285846088772e-07, "loss": 0.2465, "step": 5197 }, { "epoch": 2.913677130044843, "grad_norm": 0.081846016749724, "learning_rate": 5.041653217713993e-07, "loss": 0.2551, "step": 5198 }, { "epoch": 2.914237668161435, "grad_norm": 0.08222288904358514, "learning_rate": 4.976443970640343e-07, "loss": 0.2476, "step": 5199 }, { "epoch": 2.914798206278027, "grad_norm": 0.07929364672043442, "learning_rate": 4.91165813261607e-07, "loss": 0.2474, "step": 5200 }, { "epoch": 2.9153587443946187, "grad_norm": 0.0796620167903564, "learning_rate": 4.847295731209234e-07, "loss": 0.2489, "step": 5201 }, { "epoch": 2.9159192825112106, "grad_norm": 0.08027609274529898, "learning_rate": 4.783356793807814e-07, "loss": 0.2394, "step": 5202 }, { "epoch": 2.916479820627803, "grad_norm": 0.07993725490400207, "learning_rate": 4.7198413476193804e-07, "loss": 0.2492, "step": 5203 }, { "epoch": 2.9170403587443947, "grad_norm": 0.0803466866394501, "learning_rate": 4.6567494196715354e-07, "loss": 0.2511, "step": 5204 }, { "epoch": 2.9176008968609866, "grad_norm": 0.0800223749066712, "learning_rate": 4.5940810368116924e-07, "loss": 0.2418, "step": 5205 }, { "epoch": 2.9181614349775784, "grad_norm": 0.08007370509835651, "learning_rate": 4.5318362257067426e-07, "loss": 0.2277, "step": 5206 }, { "epoch": 2.9187219730941703, "grad_norm": 0.08043926279562774, "learning_rate": 4.4700150128436094e-07, "loss": 0.2492, "step": 5207 }, { "epoch": 2.9192825112107625, "grad_norm": 0.08165038215580994, "learning_rate": 4.4086174245288047e-07, "loss": 0.2419, "step": 5208 }, { "epoch": 2.9198430493273544, "grad_norm": 0.08080196846154497, "learning_rate": 4.347643486888653e-07, "loss": 0.2433, "step": 5209 }, { "epoch": 2.9204035874439462, "grad_norm": 0.07920220502048722, "learning_rate": 4.287093225869288e-07, "loss": 0.2443, "step": 5210 }, { "epoch": 2.920964125560538, "grad_norm": 0.08143488760783185, "learning_rate": 4.226966667236321e-07, "loss": 0.2463, "step": 5211 }, { "epoch": 2.92152466367713, "grad_norm": 0.08247489520940915, "learning_rate": 4.167263836575286e-07, "loss": 0.247, "step": 5212 }, { "epoch": 2.922085201793722, "grad_norm": 0.0807093966938727, "learning_rate": 4.107984759291306e-07, "loss": 0.2392, "step": 5213 }, { "epoch": 2.922645739910314, "grad_norm": 0.0795617388431577, "learning_rate": 4.0491294606093135e-07, "loss": 0.2384, "step": 5214 }, { "epoch": 2.923206278026906, "grad_norm": 0.07962850511281837, "learning_rate": 3.990697965573609e-07, "loss": 0.2374, "step": 5215 }, { "epoch": 2.9237668161434978, "grad_norm": 0.07993802378102899, "learning_rate": 3.9326902990484136e-07, "loss": 0.2432, "step": 5216 }, { "epoch": 2.9243273542600896, "grad_norm": 0.08015230120043666, "learning_rate": 3.87510648571765e-07, "loss": 0.2484, "step": 5217 }, { "epoch": 2.9248878923766815, "grad_norm": 0.07999735915052116, "learning_rate": 3.8179465500846057e-07, "loss": 0.2297, "step": 5218 }, { "epoch": 2.9254484304932733, "grad_norm": 0.07941900333132416, "learning_rate": 3.761210516472602e-07, "loss": 0.2383, "step": 5219 }, { "epoch": 2.9260089686098656, "grad_norm": 0.08122180837584679, "learning_rate": 3.7048984090239934e-07, "loss": 0.2384, "step": 5220 }, { "epoch": 2.9265695067264574, "grad_norm": 0.07721862593318037, "learning_rate": 3.64901025170139e-07, "loss": 0.2316, "step": 5221 }, { "epoch": 2.9271300448430493, "grad_norm": 0.07826050021759158, "learning_rate": 3.593546068286435e-07, "loss": 0.2359, "step": 5222 }, { "epoch": 2.927690582959641, "grad_norm": 0.08215849382135554, "learning_rate": 3.5385058823809156e-07, "loss": 0.241, "step": 5223 }, { "epoch": 2.928251121076233, "grad_norm": 0.07902599499630579, "learning_rate": 3.4838897174055417e-07, "loss": 0.2371, "step": 5224 }, { "epoch": 2.9288116591928253, "grad_norm": 0.08411121884071082, "learning_rate": 3.429697596601278e-07, "loss": 0.2457, "step": 5225 }, { "epoch": 2.929372197309417, "grad_norm": 0.07626874289616034, "learning_rate": 3.3759295430281223e-07, "loss": 0.2375, "step": 5226 }, { "epoch": 2.929932735426009, "grad_norm": 0.07894069331815652, "learning_rate": 3.3225855795658845e-07, "loss": 0.2371, "step": 5227 }, { "epoch": 2.930493273542601, "grad_norm": 0.0828622145940112, "learning_rate": 3.26966572891374e-07, "loss": 0.2532, "step": 5228 }, { "epoch": 2.9310538116591927, "grad_norm": 0.08139724099673272, "learning_rate": 3.2171700135906756e-07, "loss": 0.2362, "step": 5229 }, { "epoch": 2.931614349775785, "grad_norm": 0.07847946908193802, "learning_rate": 3.1650984559349337e-07, "loss": 0.2404, "step": 5230 }, { "epoch": 2.932174887892377, "grad_norm": 0.08144523311452388, "learning_rate": 3.1134510781042347e-07, "loss": 0.2577, "step": 5231 }, { "epoch": 2.9327354260089686, "grad_norm": 0.08331031210937782, "learning_rate": 3.062227902076109e-07, "loss": 0.2371, "step": 5232 }, { "epoch": 2.9332959641255605, "grad_norm": 0.08001492880179598, "learning_rate": 3.011428949647233e-07, "loss": 0.2433, "step": 5233 }, { "epoch": 2.9338565022421523, "grad_norm": 0.0807533308463706, "learning_rate": 2.9610542424339803e-07, "loss": 0.2407, "step": 5234 }, { "epoch": 2.9344170403587446, "grad_norm": 0.08240554474517298, "learning_rate": 2.911103801872206e-07, "loss": 0.2454, "step": 5235 }, { "epoch": 2.9349775784753365, "grad_norm": 0.07734753742670465, "learning_rate": 2.8615776492170176e-07, "loss": 0.2379, "step": 5236 }, { "epoch": 2.9355381165919283, "grad_norm": 0.078549890262253, "learning_rate": 2.812475805543224e-07, "loss": 0.2395, "step": 5237 }, { "epoch": 2.93609865470852, "grad_norm": 0.07995050894732823, "learning_rate": 2.763798291744779e-07, "loss": 0.2401, "step": 5238 }, { "epoch": 2.936659192825112, "grad_norm": 0.07807092079681063, "learning_rate": 2.715545128535557e-07, "loss": 0.2356, "step": 5239 }, { "epoch": 2.9372197309417043, "grad_norm": 0.07855991185553275, "learning_rate": 2.667716336448356e-07, "loss": 0.245, "step": 5240 }, { "epoch": 2.9377802690582957, "grad_norm": 0.07944252318522475, "learning_rate": 2.6203119358356733e-07, "loss": 0.2428, "step": 5241 }, { "epoch": 2.938340807174888, "grad_norm": 0.08289075442761087, "learning_rate": 2.573331946869262e-07, "loss": 0.2293, "step": 5242 }, { "epoch": 2.93890134529148, "grad_norm": 0.0815455144238909, "learning_rate": 2.526776389540353e-07, "loss": 0.2254, "step": 5243 }, { "epoch": 2.9394618834080717, "grad_norm": 0.0803571095603025, "learning_rate": 2.480645283659766e-07, "loss": 0.2322, "step": 5244 }, { "epoch": 2.9400224215246635, "grad_norm": 0.08474456628701002, "learning_rate": 2.4349386488574654e-07, "loss": 0.2532, "step": 5245 }, { "epoch": 2.9405829596412554, "grad_norm": 0.0805078766355413, "learning_rate": 2.3896565045826714e-07, "loss": 0.2369, "step": 5246 }, { "epoch": 2.9411434977578477, "grad_norm": 0.08180737048391117, "learning_rate": 2.3447988701043034e-07, "loss": 0.2467, "step": 5247 }, { "epoch": 2.9417040358744395, "grad_norm": 0.07958616382853968, "learning_rate": 2.300365764510315e-07, "loss": 0.236, "step": 5248 }, { "epoch": 2.9422645739910314, "grad_norm": 0.08049069112082367, "learning_rate": 2.2563572067083595e-07, "loss": 0.2414, "step": 5249 }, { "epoch": 2.942825112107623, "grad_norm": 0.0781498909935061, "learning_rate": 2.2127732154251235e-07, "loss": 0.24, "step": 5250 }, { "epoch": 2.943385650224215, "grad_norm": 0.07528173107924589, "learning_rate": 2.169613809206883e-07, "loss": 0.2294, "step": 5251 }, { "epoch": 2.9439461883408073, "grad_norm": 0.07999346420773303, "learning_rate": 2.126879006419058e-07, "loss": 0.2254, "step": 5252 }, { "epoch": 2.944506726457399, "grad_norm": 0.08168423635911633, "learning_rate": 2.0845688252464357e-07, "loss": 0.25, "step": 5253 }, { "epoch": 2.945067264573991, "grad_norm": 0.08056444463902272, "learning_rate": 2.0426832836930587e-07, "loss": 0.2392, "step": 5254 }, { "epoch": 2.945627802690583, "grad_norm": 0.07982966048495936, "learning_rate": 2.0012223995824474e-07, "loss": 0.2382, "step": 5255 }, { "epoch": 2.9461883408071747, "grad_norm": 0.08063967191486175, "learning_rate": 1.9601861905572672e-07, "loss": 0.2341, "step": 5256 }, { "epoch": 2.946748878923767, "grad_norm": 0.07939613336880443, "learning_rate": 1.9195746740795495e-07, "loss": 0.2485, "step": 5257 }, { "epoch": 2.947309417040359, "grad_norm": 0.08065516975169129, "learning_rate": 1.879387867430471e-07, "loss": 0.2509, "step": 5258 }, { "epoch": 2.9478699551569507, "grad_norm": 0.08133260163247694, "learning_rate": 1.839625787710686e-07, "loss": 0.2409, "step": 5259 }, { "epoch": 2.9484304932735426, "grad_norm": 0.07895193321754743, "learning_rate": 1.8002884518401041e-07, "loss": 0.2223, "step": 5260 }, { "epoch": 2.9489910313901344, "grad_norm": 0.07988661288054187, "learning_rate": 1.7613758765576692e-07, "loss": 0.2478, "step": 5261 }, { "epoch": 2.9495515695067267, "grad_norm": 0.07916704842128662, "learning_rate": 1.7228880784216915e-07, "loss": 0.2297, "step": 5262 }, { "epoch": 2.9501121076233185, "grad_norm": 0.08033690224000986, "learning_rate": 1.684825073809848e-07, "loss": 0.2371, "step": 5263 }, { "epoch": 2.9506726457399104, "grad_norm": 0.08024692582305298, "learning_rate": 1.6471868789189603e-07, "loss": 0.2296, "step": 5264 }, { "epoch": 2.9512331838565022, "grad_norm": 0.07631516485832594, "learning_rate": 1.6099735097651058e-07, "loss": 0.2249, "step": 5265 }, { "epoch": 2.951793721973094, "grad_norm": 0.08230465193965804, "learning_rate": 1.5731849821833954e-07, "loss": 0.2406, "step": 5266 }, { "epoch": 2.952354260089686, "grad_norm": 0.08151279907385826, "learning_rate": 1.536821311828529e-07, "loss": 0.2415, "step": 5267 }, { "epoch": 2.952914798206278, "grad_norm": 0.08051643231472558, "learning_rate": 1.50088251417424e-07, "loss": 0.2321, "step": 5268 }, { "epoch": 2.95347533632287, "grad_norm": 0.0813535094180172, "learning_rate": 1.4653686045131843e-07, "loss": 0.2327, "step": 5269 }, { "epoch": 2.954035874439462, "grad_norm": 0.08177174334741082, "learning_rate": 1.4302795979577177e-07, "loss": 0.2573, "step": 5270 }, { "epoch": 2.9545964125560538, "grad_norm": 0.08309563398890388, "learning_rate": 1.395615509439119e-07, "loss": 0.2405, "step": 5271 }, { "epoch": 2.9551569506726456, "grad_norm": 0.07977820548089735, "learning_rate": 1.3613763537078105e-07, "loss": 0.2393, "step": 5272 }, { "epoch": 2.9557174887892375, "grad_norm": 0.07727833251814736, "learning_rate": 1.3275621453333608e-07, "loss": 0.2396, "step": 5273 }, { "epoch": 2.9562780269058297, "grad_norm": 0.07994854813799306, "learning_rate": 1.294172898704815e-07, "loss": 0.2255, "step": 5274 }, { "epoch": 2.9568385650224216, "grad_norm": 0.08086972983216732, "learning_rate": 1.2612086280302527e-07, "loss": 0.2412, "step": 5275 }, { "epoch": 2.9573991031390134, "grad_norm": 0.07839291898304604, "learning_rate": 1.228669347336564e-07, "loss": 0.2306, "step": 5276 }, { "epoch": 2.9579596412556053, "grad_norm": 0.08084565279047115, "learning_rate": 1.1965550704702288e-07, "loss": 0.2517, "step": 5277 }, { "epoch": 2.958520179372197, "grad_norm": 0.08194291771179481, "learning_rate": 1.1648658110967603e-07, "loss": 0.2494, "step": 5278 }, { "epoch": 2.9590807174887894, "grad_norm": 0.08006505008080902, "learning_rate": 1.1336015827008161e-07, "loss": 0.243, "step": 5279 }, { "epoch": 2.9596412556053813, "grad_norm": 0.07754436239138039, "learning_rate": 1.102762398586088e-07, "loss": 0.2288, "step": 5280 }, { "epoch": 2.960201793721973, "grad_norm": 0.07944347964397883, "learning_rate": 1.0723482718754118e-07, "loss": 0.2366, "step": 5281 }, { "epoch": 2.960762331838565, "grad_norm": 0.07924613103277234, "learning_rate": 1.0423592155108797e-07, "loss": 0.2485, "step": 5282 }, { "epoch": 2.961322869955157, "grad_norm": 0.0775394939832644, "learning_rate": 1.0127952422536169e-07, "loss": 0.2306, "step": 5283 }, { "epoch": 2.961883408071749, "grad_norm": 0.08111721430949032, "learning_rate": 9.836563646840047e-08, "loss": 0.2451, "step": 5284 }, { "epoch": 2.962443946188341, "grad_norm": 0.07987107450762725, "learning_rate": 9.549425952012358e-08, "loss": 0.2284, "step": 5285 }, { "epoch": 2.963004484304933, "grad_norm": 0.08141807380302148, "learning_rate": 9.266539460238699e-08, "loss": 0.2419, "step": 5286 }, { "epoch": 2.9635650224215246, "grad_norm": 0.07786866507150199, "learning_rate": 8.98790429189389e-08, "loss": 0.2376, "step": 5287 }, { "epoch": 2.9641255605381165, "grad_norm": 0.0832605088128385, "learning_rate": 8.713520565546419e-08, "loss": 0.237, "step": 5288 }, { "epoch": 2.964686098654709, "grad_norm": 0.08094358918281826, "learning_rate": 8.44338839795289e-08, "loss": 0.241, "step": 5289 }, { "epoch": 2.9652466367713, "grad_norm": 0.08049074033771993, "learning_rate": 8.177507904060244e-08, "loss": 0.2351, "step": 5290 }, { "epoch": 2.9658071748878925, "grad_norm": 0.08298281914638202, "learning_rate": 7.915879197010201e-08, "loss": 0.2554, "step": 5291 }, { "epoch": 2.9663677130044843, "grad_norm": 0.0773610087962181, "learning_rate": 7.658502388131483e-08, "loss": 0.2312, "step": 5292 }, { "epoch": 2.966928251121076, "grad_norm": 0.07854609379353328, "learning_rate": 7.405377586945372e-08, "loss": 0.2344, "step": 5293 }, { "epoch": 2.967488789237668, "grad_norm": 0.07953023109582347, "learning_rate": 7.156504901162375e-08, "loss": 0.2404, "step": 5294 }, { "epoch": 2.96804932735426, "grad_norm": 0.08025893705482982, "learning_rate": 6.911884436685556e-08, "loss": 0.2316, "step": 5295 }, { "epoch": 2.968609865470852, "grad_norm": 0.07994746443161596, "learning_rate": 6.671516297606095e-08, "loss": 0.2371, "step": 5296 }, { "epoch": 2.969170403587444, "grad_norm": 0.08187138973611886, "learning_rate": 6.43540058620884e-08, "loss": 0.2413, "step": 5297 }, { "epoch": 2.969730941704036, "grad_norm": 0.08147503619062962, "learning_rate": 6.203537402965643e-08, "loss": 0.2468, "step": 5298 }, { "epoch": 2.9702914798206277, "grad_norm": 0.08068868467183059, "learning_rate": 5.975926846540914e-08, "loss": 0.2349, "step": 5299 }, { "epoch": 2.9708520179372195, "grad_norm": 0.07894573332874899, "learning_rate": 5.752569013788289e-08, "loss": 0.2315, "step": 5300 }, { "epoch": 2.971412556053812, "grad_norm": 0.08129324141411996, "learning_rate": 5.533463999755073e-08, "loss": 0.2423, "step": 5301 }, { "epoch": 2.9719730941704037, "grad_norm": 0.08115487487147469, "learning_rate": 5.318611897673353e-08, "loss": 0.2485, "step": 5302 }, { "epoch": 2.9725336322869955, "grad_norm": 0.0806793046784492, "learning_rate": 5.1080127989699966e-08, "loss": 0.2495, "step": 5303 }, { "epoch": 2.9730941704035874, "grad_norm": 0.07952902349286228, "learning_rate": 4.901666793261095e-08, "loss": 0.2433, "step": 5304 }, { "epoch": 2.973654708520179, "grad_norm": 0.07782880919830099, "learning_rate": 4.6995739683508564e-08, "loss": 0.2478, "step": 5305 }, { "epoch": 2.9742152466367715, "grad_norm": 0.07836102983454873, "learning_rate": 4.501734410234937e-08, "loss": 0.2317, "step": 5306 }, { "epoch": 2.9747757847533634, "grad_norm": 0.08228369591277344, "learning_rate": 4.3081482031015476e-08, "loss": 0.241, "step": 5307 }, { "epoch": 2.975336322869955, "grad_norm": 0.07932916963866564, "learning_rate": 4.118815429324796e-08, "loss": 0.2463, "step": 5308 }, { "epoch": 2.975896860986547, "grad_norm": 0.08058193160876528, "learning_rate": 3.933736169471347e-08, "loss": 0.2398, "step": 5309 }, { "epoch": 2.976457399103139, "grad_norm": 0.07912455799954479, "learning_rate": 3.752910502297091e-08, "loss": 0.2348, "step": 5310 }, { "epoch": 2.977017937219731, "grad_norm": 0.07955512966216624, "learning_rate": 3.576338504749366e-08, "loss": 0.2499, "step": 5311 }, { "epoch": 2.977578475336323, "grad_norm": 0.08242015311617172, "learning_rate": 3.404020251963624e-08, "loss": 0.2299, "step": 5312 }, { "epoch": 2.978139013452915, "grad_norm": 0.07910613197284071, "learning_rate": 3.235955817264546e-08, "loss": 0.2408, "step": 5313 }, { "epoch": 2.9786995515695067, "grad_norm": 0.07865882269700655, "learning_rate": 3.0721452721704794e-08, "loss": 0.2257, "step": 5314 }, { "epoch": 2.9792600896860986, "grad_norm": 0.0815141709448338, "learning_rate": 2.912588686384554e-08, "loss": 0.2486, "step": 5315 }, { "epoch": 2.979820627802691, "grad_norm": 0.075898357644101, "learning_rate": 2.7572861278046814e-08, "loss": 0.2239, "step": 5316 }, { "epoch": 2.9803811659192823, "grad_norm": 0.08081346619433816, "learning_rate": 2.6062376625146658e-08, "loss": 0.2434, "step": 5317 }, { "epoch": 2.9809417040358746, "grad_norm": 0.08136868711161611, "learning_rate": 2.4594433547908692e-08, "loss": 0.2495, "step": 5318 }, { "epoch": 2.9815022421524664, "grad_norm": 0.08028070958974504, "learning_rate": 2.3169032670966595e-08, "loss": 0.2526, "step": 5319 }, { "epoch": 2.9820627802690582, "grad_norm": 0.07911738094906717, "learning_rate": 2.1786174600879617e-08, "loss": 0.2445, "step": 5320 }, { "epoch": 2.98262331838565, "grad_norm": 0.07899780552489176, "learning_rate": 2.044585992608816e-08, "loss": 0.2375, "step": 5321 }, { "epoch": 2.983183856502242, "grad_norm": 0.07786507902858977, "learning_rate": 1.9148089216936006e-08, "loss": 0.239, "step": 5322 }, { "epoch": 2.9837443946188342, "grad_norm": 0.08380194012054783, "learning_rate": 1.7892863025648078e-08, "loss": 0.2341, "step": 5323 }, { "epoch": 2.984304932735426, "grad_norm": 0.08168210844370141, "learning_rate": 1.6680181886352676e-08, "loss": 0.2274, "step": 5324 }, { "epoch": 2.984865470852018, "grad_norm": 0.08387252489451523, "learning_rate": 1.5510046315092563e-08, "loss": 0.2578, "step": 5325 }, { "epoch": 2.9854260089686098, "grad_norm": 0.08217803833684102, "learning_rate": 1.4382456809791667e-08, "loss": 0.2474, "step": 5326 }, { "epoch": 2.9859865470852016, "grad_norm": 0.07870568632951037, "learning_rate": 1.329741385025507e-08, "loss": 0.2403, "step": 5327 }, { "epoch": 2.986547085201794, "grad_norm": 0.08120854289722704, "learning_rate": 1.2254917898213425e-08, "loss": 0.2336, "step": 5328 }, { "epoch": 2.9871076233183858, "grad_norm": 0.08275810981080454, "learning_rate": 1.1254969397267446e-08, "loss": 0.246, "step": 5329 }, { "epoch": 2.9876681614349776, "grad_norm": 0.07922153622750067, "learning_rate": 1.0297568772921208e-08, "loss": 0.2353, "step": 5330 }, { "epoch": 2.9882286995515694, "grad_norm": 0.0801170106962022, "learning_rate": 9.38271643258215e-09, "loss": 0.2437, "step": 5331 }, { "epoch": 2.9887892376681613, "grad_norm": 0.08220155343207602, "learning_rate": 8.510412765538877e-09, "loss": 0.2507, "step": 5332 }, { "epoch": 2.9893497757847536, "grad_norm": 0.08188243844050613, "learning_rate": 7.680658142972252e-09, "loss": 0.2398, "step": 5333 }, { "epoch": 2.9899103139013454, "grad_norm": 0.08040070756209229, "learning_rate": 6.893452917977606e-09, "loss": 0.2559, "step": 5334 }, { "epoch": 2.9904708520179373, "grad_norm": 0.08049706669376877, "learning_rate": 6.148797425520325e-09, "loss": 0.2479, "step": 5335 }, { "epoch": 2.991031390134529, "grad_norm": 0.07909212031292856, "learning_rate": 5.44669198249137e-09, "loss": 0.2419, "step": 5336 }, { "epoch": 2.991591928251121, "grad_norm": 0.0798282239268473, "learning_rate": 4.787136887629551e-09, "loss": 0.2371, "step": 5337 }, { "epoch": 2.9921524663677133, "grad_norm": 0.08070436214896615, "learning_rate": 4.170132421610351e-09, "loss": 0.2366, "step": 5338 }, { "epoch": 2.9927130044843047, "grad_norm": 0.0779271714653247, "learning_rate": 3.595678846979311e-09, "loss": 0.2428, "step": 5339 }, { "epoch": 2.993273542600897, "grad_norm": 0.08259654736944913, "learning_rate": 3.0637764081853372e-09, "loss": 0.2532, "step": 5340 }, { "epoch": 2.993834080717489, "grad_norm": 0.07982008564088164, "learning_rate": 2.574425331558494e-09, "loss": 0.2365, "step": 5341 }, { "epoch": 2.9943946188340806, "grad_norm": 0.0788106786967822, "learning_rate": 2.127625825343316e-09, "loss": 0.2401, "step": 5342 }, { "epoch": 2.9949551569506725, "grad_norm": 0.08042716587308342, "learning_rate": 1.723378079654392e-09, "loss": 0.2358, "step": 5343 }, { "epoch": 2.9955156950672643, "grad_norm": 0.08040778822418321, "learning_rate": 1.3616822665096785e-09, "loss": 0.2349, "step": 5344 }, { "epoch": 2.9960762331838566, "grad_norm": 0.0804256952244607, "learning_rate": 1.0425385398304955e-09, "loss": 0.2333, "step": 5345 }, { "epoch": 2.9966367713004485, "grad_norm": 0.08424987235173159, "learning_rate": 7.659470354193232e-10, "loss": 0.2492, "step": 5346 }, { "epoch": 2.9971973094170403, "grad_norm": 0.08132774570248197, "learning_rate": 5.319078709709047e-10, "loss": 0.2483, "step": 5347 }, { "epoch": 2.997757847533632, "grad_norm": 0.08075649495863021, "learning_rate": 3.4042114606114284e-10, "loss": 0.2315, "step": 5348 }, { "epoch": 2.998318385650224, "grad_norm": 0.07950158267209038, "learning_rate": 1.9148694219150997e-10, "loss": 0.2441, "step": 5349 }, { "epoch": 2.9988789237668163, "grad_norm": 0.08212722437926628, "learning_rate": 8.510532273353633e-11, "loss": 0.2496, "step": 5350 }, { "epoch": 2.999439461883408, "grad_norm": 0.07977824407540027, "learning_rate": 2.127633295101461e-11, "loss": 0.2393, "step": 5351 }, { "epoch": 3.0, "grad_norm": 0.0776265795211927, "learning_rate": 0.0, "loss": 0.2329, "step": 5352 }, { "epoch": 3.0, "eval_loss": 0.26840582489967346, "eval_runtime": 342.4131, "eval_samples_per_second": 35.095, "eval_steps_per_second": 1.098, "step": 5352 }, { "epoch": 3.0, "step": 5352, "total_flos": 1.7156250092814991e+18, "train_loss": 0.28245328673299386, "train_runtime": 54133.2624, "train_samples_per_second": 12.652, "train_steps_per_second": 0.099 } ], "logging_steps": 1, "max_steps": 5352, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7156250092814991e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }