diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,39831 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9792235231974646, + "eval_steps": 710, + "global_step": 5678, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.14370673894882202, + "learning_rate": 2e-05, + "loss": 1.4342, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 1.4139364957809448, + "eval_runtime": 685.7235, + "eval_samples_per_second": 10.027, + "eval_steps_per_second": 5.014, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.14010629057884216, + "learning_rate": 4e-05, + "loss": 1.4461, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 0.14629808068275452, + "learning_rate": 6e-05, + "loss": 1.5264, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 0.14778390526771545, + "learning_rate": 8e-05, + "loss": 1.4242, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 0.13870327174663544, + "learning_rate": 0.0001, + "loss": 1.4421, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.13916651904582977, + "learning_rate": 0.00012, + "loss": 1.4097, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 0.14333531260490417, + "learning_rate": 0.00014, + "loss": 1.4631, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 0.12884660065174103, + "learning_rate": 0.00016, + "loss": 1.3788, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 0.10925424844026566, + "learning_rate": 0.00018, + "loss": 1.3871, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 0.1368756741285324, + "learning_rate": 0.0002, + "loss": 1.4336, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.14384619891643524, + "learning_rate": 0.00019999999616659956, + "loss": 1.3772, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 0.14267297089099884, + "learning_rate": 0.00019999998466639854, + "loss": 1.3686, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 0.1526922881603241, + "learning_rate": 0.00019999996549939782, + "loss": 1.3395, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 0.12112465500831604, + "learning_rate": 0.00019999993866559883, + "loss": 1.2573, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 0.1302354335784912, + "learning_rate": 0.00019999990416500366, + "loss": 1.3261, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.18590843677520752, + "learning_rate": 0.000199999861997615, + "loss": 1.3111, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 0.12813763320446014, + "learning_rate": 0.000199999812163436, + "loss": 1.4044, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 0.123746357858181, + "learning_rate": 0.00019999975466247053, + "loss": 1.3508, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 0.13045178353786469, + "learning_rate": 0.00019999968949472297, + "loss": 1.3344, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 0.1230260580778122, + "learning_rate": 0.00019999961666019838, + "loss": 1.348, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.10891468077898026, + "learning_rate": 0.00019999953615890226, + "loss": 1.3366, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 0.10767161101102829, + "learning_rate": 0.00019999944799084082, + "loss": 1.2208, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 0.11098026484251022, + "learning_rate": 0.00019999935215602084, + "loss": 1.3507, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 0.10552258044481277, + "learning_rate": 0.00019999924865444962, + "loss": 1.285, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 0.10019772499799728, + "learning_rate": 0.00019999913748613515, + "loss": 1.3506, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.19789332151412964, + "learning_rate": 0.00019999901865108588, + "loss": 1.2869, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 0.10755759477615356, + "learning_rate": 0.000199998892149311, + "loss": 1.3804, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 0.1173456609249115, + "learning_rate": 0.00019999875798082018, + "loss": 1.3407, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 0.10115078836679459, + "learning_rate": 0.00019999861614562368, + "loss": 1.3737, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 0.10270245373249054, + "learning_rate": 0.0001999984666437324, + "loss": 1.3361, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.11115173250436783, + "learning_rate": 0.0001999983094751578, + "loss": 1.2959, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 0.10555535554885864, + "learning_rate": 0.00019999814463991195, + "loss": 1.3213, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 0.10262320190668106, + "learning_rate": 0.00019999797213800743, + "loss": 1.2701, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 0.10323359072208405, + "learning_rate": 0.0001999977919694575, + "loss": 1.2588, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 0.09919729828834534, + "learning_rate": 0.00019999760413427602, + "loss": 1.2837, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.1005103588104248, + "learning_rate": 0.00019999740863247732, + "loss": 1.2368, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 0.10752934962511063, + "learning_rate": 0.00019999720546407638, + "loss": 1.3274, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 0.10152855515480042, + "learning_rate": 0.00019999699462908886, + "loss": 1.3079, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 0.10640770941972733, + "learning_rate": 0.00019999677612753082, + "loss": 1.2958, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 0.10559240728616714, + "learning_rate": 0.0001999965499594191, + "loss": 1.3084, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.105494424700737, + "learning_rate": 0.000199996316124771, + "loss": 1.298, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 0.10321727395057678, + "learning_rate": 0.00019999607462360444, + "loss": 1.1781, + "step": 42 + }, + { + "epoch": 0.02, + "grad_norm": 0.11195287108421326, + "learning_rate": 0.000199995825455938, + "loss": 1.2647, + "step": 43 + }, + { + "epoch": 0.02, + "grad_norm": 0.10888040065765381, + "learning_rate": 0.00019999556862179064, + "loss": 1.2142, + "step": 44 + }, + { + "epoch": 0.02, + "grad_norm": 0.11213697493076324, + "learning_rate": 0.00019999530412118222, + "loss": 1.2636, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.11512047797441483, + "learning_rate": 0.00019999503195413293, + "loss": 1.3122, + "step": 46 + }, + { + "epoch": 0.02, + "grad_norm": 0.11261572688817978, + "learning_rate": 0.0001999947521206636, + "loss": 1.3468, + "step": 47 + }, + { + "epoch": 0.02, + "grad_norm": 0.11023643612861633, + "learning_rate": 0.00019999446462079577, + "loss": 1.2716, + "step": 48 + }, + { + "epoch": 0.02, + "grad_norm": 0.11236844211816788, + "learning_rate": 0.00019999416945455146, + "loss": 1.1672, + "step": 49 + }, + { + "epoch": 0.02, + "grad_norm": 0.10870081931352615, + "learning_rate": 0.00019999386662195324, + "loss": 1.2377, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.11656371504068375, + "learning_rate": 0.0001999935561230244, + "loss": 1.309, + "step": 51 + }, + { + "epoch": 0.02, + "grad_norm": 0.11757932603359222, + "learning_rate": 0.00019999323795778874, + "loss": 1.3768, + "step": 52 + }, + { + "epoch": 0.02, + "grad_norm": 0.11963064968585968, + "learning_rate": 0.00019999291212627058, + "loss": 1.2987, + "step": 53 + }, + { + "epoch": 0.02, + "grad_norm": 0.12015822529792786, + "learning_rate": 0.000199992578628495, + "loss": 1.2777, + "step": 54 + }, + { + "epoch": 0.02, + "grad_norm": 0.10828964412212372, + "learning_rate": 0.00019999223746448746, + "loss": 1.2406, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 0.12686830759048462, + "learning_rate": 0.00019999188863427421, + "loss": 1.2505, + "step": 56 + }, + { + "epoch": 0.02, + "grad_norm": 0.11787737905979156, + "learning_rate": 0.00019999153213788197, + "loss": 1.2192, + "step": 57 + }, + { + "epoch": 0.02, + "grad_norm": 0.11279816180467606, + "learning_rate": 0.00019999116797533804, + "loss": 1.2675, + "step": 58 + }, + { + "epoch": 0.02, + "grad_norm": 0.11661232262849808, + "learning_rate": 0.0001999907961466704, + "loss": 1.2613, + "step": 59 + }, + { + "epoch": 0.02, + "grad_norm": 0.12005340307950974, + "learning_rate": 0.00019999041665190746, + "loss": 1.301, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.12159264087677002, + "learning_rate": 0.0001999900294910784, + "loss": 1.261, + "step": 61 + }, + { + "epoch": 0.02, + "grad_norm": 0.12154638767242432, + "learning_rate": 0.0001999896346642129, + "loss": 1.1817, + "step": 62 + }, + { + "epoch": 0.02, + "grad_norm": 0.1204255223274231, + "learning_rate": 0.0001999892321713412, + "loss": 1.2293, + "step": 63 + }, + { + "epoch": 0.02, + "grad_norm": 0.11920502036809921, + "learning_rate": 0.00019998882201249413, + "loss": 1.2225, + "step": 64 + }, + { + "epoch": 0.02, + "grad_norm": 0.12662023305892944, + "learning_rate": 0.00019998840418770323, + "loss": 1.2439, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 0.12107300013303757, + "learning_rate": 0.00019998797869700044, + "loss": 1.263, + "step": 66 + }, + { + "epoch": 0.02, + "grad_norm": 0.11776495724916458, + "learning_rate": 0.00019998754554041842, + "loss": 1.2241, + "step": 67 + }, + { + "epoch": 0.02, + "grad_norm": 0.13094747066497803, + "learning_rate": 0.00019998710471799035, + "loss": 1.3078, + "step": 68 + }, + { + "epoch": 0.02, + "grad_norm": 0.1258789449930191, + "learning_rate": 0.0001999866562297501, + "loss": 1.2448, + "step": 69 + }, + { + "epoch": 0.02, + "grad_norm": 0.13428783416748047, + "learning_rate": 0.00019998620007573198, + "loss": 1.3025, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 0.12916766107082367, + "learning_rate": 0.000199985736255971, + "loss": 1.2096, + "step": 71 + }, + { + "epoch": 0.03, + "grad_norm": 0.12558427453041077, + "learning_rate": 0.0001999852647705027, + "loss": 1.1837, + "step": 72 + }, + { + "epoch": 0.03, + "grad_norm": 0.13561181724071503, + "learning_rate": 0.00019998478561936324, + "loss": 1.2618, + "step": 73 + }, + { + "epoch": 0.03, + "grad_norm": 0.12791557610034943, + "learning_rate": 0.0001999842988025894, + "loss": 1.1922, + "step": 74 + }, + { + "epoch": 0.03, + "grad_norm": 0.12207165360450745, + "learning_rate": 0.00019998380432021838, + "loss": 1.2192, + "step": 75 + }, + { + "epoch": 0.03, + "grad_norm": 0.13057637214660645, + "learning_rate": 0.0001999833021722882, + "loss": 1.2109, + "step": 76 + }, + { + "epoch": 0.03, + "grad_norm": 0.1369403749704361, + "learning_rate": 0.00019998279235883734, + "loss": 1.2083, + "step": 77 + }, + { + "epoch": 0.03, + "grad_norm": 0.1295352429151535, + "learning_rate": 0.00019998227487990486, + "loss": 1.2487, + "step": 78 + }, + { + "epoch": 0.03, + "grad_norm": 0.12950530648231506, + "learning_rate": 0.00019998174973553048, + "loss": 1.2241, + "step": 79 + }, + { + "epoch": 0.03, + "grad_norm": 0.1299756020307541, + "learning_rate": 0.0001999812169257544, + "loss": 1.198, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 0.1389460414648056, + "learning_rate": 0.0001999806764506175, + "loss": 1.2143, + "step": 81 + }, + { + "epoch": 0.03, + "grad_norm": 0.13119052350521088, + "learning_rate": 0.0001999801283101612, + "loss": 1.2537, + "step": 82 + }, + { + "epoch": 0.03, + "grad_norm": 0.13153623044490814, + "learning_rate": 0.00019997957250442755, + "loss": 1.203, + "step": 83 + }, + { + "epoch": 0.03, + "grad_norm": 0.13375413417816162, + "learning_rate": 0.00019997900903345918, + "loss": 1.1866, + "step": 84 + }, + { + "epoch": 0.03, + "grad_norm": 0.13613435626029968, + "learning_rate": 0.00019997843789729925, + "loss": 1.2383, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.13316839933395386, + "learning_rate": 0.00019997785909599153, + "loss": 1.1934, + "step": 86 + }, + { + "epoch": 0.03, + "grad_norm": 0.13064943253993988, + "learning_rate": 0.00019997727262958045, + "loss": 1.1934, + "step": 87 + }, + { + "epoch": 0.03, + "grad_norm": 0.1421564519405365, + "learning_rate": 0.00019997667849811098, + "loss": 1.2651, + "step": 88 + }, + { + "epoch": 0.03, + "grad_norm": 0.1368361860513687, + "learning_rate": 0.00019997607670162862, + "loss": 1.2236, + "step": 89 + }, + { + "epoch": 0.03, + "grad_norm": 0.1279619336128235, + "learning_rate": 0.0001999754672401795, + "loss": 1.224, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.13549309968948364, + "learning_rate": 0.0001999748501138104, + "loss": 1.2221, + "step": 91 + }, + { + "epoch": 0.03, + "grad_norm": 0.131364107131958, + "learning_rate": 0.0001999742253225686, + "loss": 1.2194, + "step": 92 + }, + { + "epoch": 0.03, + "grad_norm": 0.13993385434150696, + "learning_rate": 0.00019997359286650203, + "loss": 1.2473, + "step": 93 + }, + { + "epoch": 0.03, + "grad_norm": 0.13425293564796448, + "learning_rate": 0.00019997295274565917, + "loss": 1.1737, + "step": 94 + }, + { + "epoch": 0.03, + "grad_norm": 0.1355249583721161, + "learning_rate": 0.00019997230496008906, + "loss": 1.2063, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.14003869891166687, + "learning_rate": 0.00019997164950984138, + "loss": 1.203, + "step": 96 + }, + { + "epoch": 0.03, + "grad_norm": 0.13123658299446106, + "learning_rate": 0.0001999709863949664, + "loss": 1.1163, + "step": 97 + }, + { + "epoch": 0.03, + "grad_norm": 0.1321428418159485, + "learning_rate": 0.00019997031561551498, + "loss": 1.17, + "step": 98 + }, + { + "epoch": 0.03, + "grad_norm": 0.13351233303546906, + "learning_rate": 0.0001999696371715385, + "loss": 1.1809, + "step": 99 + }, + { + "epoch": 0.04, + "grad_norm": 0.12996333837509155, + "learning_rate": 0.000199968951063089, + "loss": 1.1476, + "step": 100 + }, + { + "epoch": 0.04, + "grad_norm": 0.13364213705062866, + "learning_rate": 0.0001999682572902191, + "loss": 1.1802, + "step": 101 + }, + { + "epoch": 0.04, + "grad_norm": 0.13364386558532715, + "learning_rate": 0.00019996755585298192, + "loss": 1.2397, + "step": 102 + }, + { + "epoch": 0.04, + "grad_norm": 0.137208953499794, + "learning_rate": 0.0001999668467514313, + "loss": 1.2739, + "step": 103 + }, + { + "epoch": 0.04, + "grad_norm": 0.1342509239912033, + "learning_rate": 0.00019996612998562162, + "loss": 1.2535, + "step": 104 + }, + { + "epoch": 0.04, + "grad_norm": 0.15147949755191803, + "learning_rate": 0.0001999654055556078, + "loss": 1.1905, + "step": 105 + }, + { + "epoch": 0.04, + "grad_norm": 0.12738168239593506, + "learning_rate": 0.00019996467346144537, + "loss": 1.1332, + "step": 106 + }, + { + "epoch": 0.04, + "grad_norm": 0.1382182538509369, + "learning_rate": 0.00019996393370319047, + "loss": 1.2299, + "step": 107 + }, + { + "epoch": 0.04, + "grad_norm": 0.14213505387306213, + "learning_rate": 0.0001999631862808998, + "loss": 1.2613, + "step": 108 + }, + { + "epoch": 0.04, + "grad_norm": 0.13719429075717926, + "learning_rate": 0.00019996243119463074, + "loss": 1.2, + "step": 109 + }, + { + "epoch": 0.04, + "grad_norm": 0.13508781790733337, + "learning_rate": 0.00019996166844444107, + "loss": 1.1761, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.1376897543668747, + "learning_rate": 0.00019996089803038937, + "loss": 1.2237, + "step": 111 + }, + { + "epoch": 0.04, + "grad_norm": 0.14335481822490692, + "learning_rate": 0.00019996011995253462, + "loss": 1.1745, + "step": 112 + }, + { + "epoch": 0.04, + "grad_norm": 0.1393982619047165, + "learning_rate": 0.0001999593342109365, + "loss": 1.2872, + "step": 113 + }, + { + "epoch": 0.04, + "grad_norm": 0.14151644706726074, + "learning_rate": 0.00019995854080565528, + "loss": 1.2365, + "step": 114 + }, + { + "epoch": 0.04, + "grad_norm": 0.13480781018733978, + "learning_rate": 0.0001999577397367518, + "loss": 1.2138, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.1416129618883133, + "learning_rate": 0.0001999569310042874, + "loss": 1.2345, + "step": 116 + }, + { + "epoch": 0.04, + "grad_norm": 0.1319161355495453, + "learning_rate": 0.00019995611460832415, + "loss": 1.1598, + "step": 117 + }, + { + "epoch": 0.04, + "grad_norm": 0.13647343218326569, + "learning_rate": 0.00019995529054892463, + "loss": 1.1749, + "step": 118 + }, + { + "epoch": 0.04, + "grad_norm": 0.1369137465953827, + "learning_rate": 0.00019995445882615203, + "loss": 1.2051, + "step": 119 + }, + { + "epoch": 0.04, + "grad_norm": 0.1341824233531952, + "learning_rate": 0.00019995361944007007, + "loss": 1.262, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.1396571397781372, + "learning_rate": 0.00019995277239074315, + "loss": 1.1545, + "step": 121 + }, + { + "epoch": 0.04, + "grad_norm": 0.13041186332702637, + "learning_rate": 0.0001999519176782362, + "loss": 1.1699, + "step": 122 + }, + { + "epoch": 0.04, + "grad_norm": 0.12751354277133942, + "learning_rate": 0.0001999510553026147, + "loss": 1.1315, + "step": 123 + }, + { + "epoch": 0.04, + "grad_norm": 0.13470140099525452, + "learning_rate": 0.00019995018526394485, + "loss": 1.2279, + "step": 124 + }, + { + "epoch": 0.04, + "grad_norm": 0.1435287892818451, + "learning_rate": 0.0001999493075622933, + "loss": 1.224, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.14406907558441162, + "learning_rate": 0.00019994842219772739, + "loss": 1.1869, + "step": 126 + }, + { + "epoch": 0.04, + "grad_norm": 0.13931530714035034, + "learning_rate": 0.00019994752917031492, + "loss": 1.2554, + "step": 127 + }, + { + "epoch": 0.05, + "grad_norm": 0.13751220703125, + "learning_rate": 0.0001999466284801244, + "loss": 1.2116, + "step": 128 + }, + { + "epoch": 0.05, + "grad_norm": 0.1349681317806244, + "learning_rate": 0.00019994572012722492, + "loss": 1.1736, + "step": 129 + }, + { + "epoch": 0.05, + "grad_norm": 0.1377020627260208, + "learning_rate": 0.00019994480411168609, + "loss": 1.1921, + "step": 130 + }, + { + "epoch": 0.05, + "grad_norm": 0.13730505108833313, + "learning_rate": 0.00019994388043357807, + "loss": 1.2046, + "step": 131 + }, + { + "epoch": 0.05, + "grad_norm": 0.14024712145328522, + "learning_rate": 0.0001999429490929718, + "loss": 1.1561, + "step": 132 + }, + { + "epoch": 0.05, + "grad_norm": 0.1297326534986496, + "learning_rate": 0.00019994201008993864, + "loss": 1.1856, + "step": 133 + }, + { + "epoch": 0.05, + "grad_norm": 0.129353865981102, + "learning_rate": 0.00019994106342455053, + "loss": 1.1283, + "step": 134 + }, + { + "epoch": 0.05, + "grad_norm": 0.14311642944812775, + "learning_rate": 0.00019994010909688012, + "loss": 1.2684, + "step": 135 + }, + { + "epoch": 0.05, + "grad_norm": 0.13943718373775482, + "learning_rate": 0.00019993914710700052, + "loss": 1.2236, + "step": 136 + }, + { + "epoch": 0.05, + "grad_norm": 0.14227281510829926, + "learning_rate": 0.0001999381774549855, + "loss": 1.287, + "step": 137 + }, + { + "epoch": 0.05, + "grad_norm": 0.14150571823120117, + "learning_rate": 0.00019993720014090945, + "loss": 1.1848, + "step": 138 + }, + { + "epoch": 0.05, + "grad_norm": 0.1421256810426712, + "learning_rate": 0.0001999362151648472, + "loss": 1.1925, + "step": 139 + }, + { + "epoch": 0.05, + "grad_norm": 0.1376809924840927, + "learning_rate": 0.00019993522252687438, + "loss": 1.2419, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 0.15068894624710083, + "learning_rate": 0.00019993422222706704, + "loss": 1.206, + "step": 141 + }, + { + "epoch": 0.05, + "grad_norm": 0.15035270154476166, + "learning_rate": 0.0001999332142655018, + "loss": 1.2368, + "step": 142 + }, + { + "epoch": 0.05, + "grad_norm": 0.136764794588089, + "learning_rate": 0.00019993219864225607, + "loss": 1.162, + "step": 143 + }, + { + "epoch": 0.05, + "grad_norm": 0.1427554190158844, + "learning_rate": 0.00019993117535740764, + "loss": 1.1967, + "step": 144 + }, + { + "epoch": 0.05, + "grad_norm": 0.1433485597372055, + "learning_rate": 0.00019993014441103496, + "loss": 1.1817, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.13792641460895538, + "learning_rate": 0.0001999291058032171, + "loss": 1.1668, + "step": 146 + }, + { + "epoch": 0.05, + "grad_norm": 0.1383659839630127, + "learning_rate": 0.0001999280595340337, + "loss": 1.2255, + "step": 147 + }, + { + "epoch": 0.05, + "grad_norm": 0.13606032729148865, + "learning_rate": 0.0001999270056035649, + "loss": 1.1995, + "step": 148 + }, + { + "epoch": 0.05, + "grad_norm": 0.14569996297359467, + "learning_rate": 0.00019992594401189157, + "loss": 1.2617, + "step": 149 + }, + { + "epoch": 0.05, + "grad_norm": 0.13744774460792542, + "learning_rate": 0.00019992487475909508, + "loss": 1.2096, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.13937488198280334, + "learning_rate": 0.0001999237978452574, + "loss": 1.2524, + "step": 151 + }, + { + "epoch": 0.05, + "grad_norm": 0.13918696343898773, + "learning_rate": 0.00019992271327046114, + "loss": 1.1093, + "step": 152 + }, + { + "epoch": 0.05, + "grad_norm": 0.13565635681152344, + "learning_rate": 0.0001999216210347894, + "loss": 1.1467, + "step": 153 + }, + { + "epoch": 0.05, + "grad_norm": 0.14065219461917877, + "learning_rate": 0.00019992052113832593, + "loss": 1.1114, + "step": 154 + }, + { + "epoch": 0.05, + "grad_norm": 0.14353357255458832, + "learning_rate": 0.0001999194135811551, + "loss": 1.1444, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 0.14801861345767975, + "learning_rate": 0.00019991829836336176, + "loss": 1.2434, + "step": 156 + }, + { + "epoch": 0.06, + "grad_norm": 0.15463212132453918, + "learning_rate": 0.00019991717548503143, + "loss": 1.1974, + "step": 157 + }, + { + "epoch": 0.06, + "grad_norm": 0.15134255588054657, + "learning_rate": 0.00019991604494625021, + "loss": 1.18, + "step": 158 + }, + { + "epoch": 0.06, + "grad_norm": 0.15339814126491547, + "learning_rate": 0.0001999149067471048, + "loss": 1.1829, + "step": 159 + }, + { + "epoch": 0.06, + "grad_norm": 0.14708739519119263, + "learning_rate": 0.0001999137608876824, + "loss": 1.1795, + "step": 160 + }, + { + "epoch": 0.06, + "grad_norm": 0.15173062682151794, + "learning_rate": 0.0001999126073680709, + "loss": 1.2195, + "step": 161 + }, + { + "epoch": 0.06, + "grad_norm": 0.16870474815368652, + "learning_rate": 0.00019991144618835877, + "loss": 1.2889, + "step": 162 + }, + { + "epoch": 0.06, + "grad_norm": 0.14730185270309448, + "learning_rate": 0.000199910277348635, + "loss": 1.1631, + "step": 163 + }, + { + "epoch": 0.06, + "grad_norm": 0.14397159218788147, + "learning_rate": 0.0001999091008489892, + "loss": 1.1906, + "step": 164 + }, + { + "epoch": 0.06, + "grad_norm": 0.1512182652950287, + "learning_rate": 0.00019990791668951155, + "loss": 1.1496, + "step": 165 + }, + { + "epoch": 0.06, + "grad_norm": 0.15269646048545837, + "learning_rate": 0.00019990672487029288, + "loss": 1.1805, + "step": 166 + }, + { + "epoch": 0.06, + "grad_norm": 0.1465330868959427, + "learning_rate": 0.00019990552539142454, + "loss": 1.2078, + "step": 167 + }, + { + "epoch": 0.06, + "grad_norm": 0.14436009526252747, + "learning_rate": 0.0001999043182529985, + "loss": 1.2336, + "step": 168 + }, + { + "epoch": 0.06, + "grad_norm": 0.1508115530014038, + "learning_rate": 0.00019990310345510733, + "loss": 1.304, + "step": 169 + }, + { + "epoch": 0.06, + "grad_norm": 0.14511720836162567, + "learning_rate": 0.00019990188099784412, + "loss": 1.2004, + "step": 170 + }, + { + "epoch": 0.06, + "grad_norm": 0.14771127700805664, + "learning_rate": 0.0001999006508813026, + "loss": 1.2467, + "step": 171 + }, + { + "epoch": 0.06, + "grad_norm": 0.14662198722362518, + "learning_rate": 0.0001998994131055771, + "loss": 1.1317, + "step": 172 + }, + { + "epoch": 0.06, + "grad_norm": 0.1433134377002716, + "learning_rate": 0.00019989816767076253, + "loss": 1.1357, + "step": 173 + }, + { + "epoch": 0.06, + "grad_norm": 0.15154090523719788, + "learning_rate": 0.0001998969145769544, + "loss": 1.2298, + "step": 174 + }, + { + "epoch": 0.06, + "grad_norm": 0.13884200155735016, + "learning_rate": 0.00019989565382424869, + "loss": 1.2133, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 0.14153452217578888, + "learning_rate": 0.0001998943854127421, + "loss": 1.1647, + "step": 176 + }, + { + "epoch": 0.06, + "grad_norm": 0.14802685379981995, + "learning_rate": 0.00019989310934253191, + "loss": 1.1019, + "step": 177 + }, + { + "epoch": 0.06, + "grad_norm": 0.1560146063566208, + "learning_rate": 0.00019989182561371593, + "loss": 1.168, + "step": 178 + }, + { + "epoch": 0.06, + "grad_norm": 0.14783112704753876, + "learning_rate": 0.0001998905342263926, + "loss": 1.2003, + "step": 179 + }, + { + "epoch": 0.06, + "grad_norm": 0.14809173345565796, + "learning_rate": 0.00019988923518066089, + "loss": 1.2171, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 0.1465620994567871, + "learning_rate": 0.00019988792847662042, + "loss": 1.1109, + "step": 181 + }, + { + "epoch": 0.06, + "grad_norm": 0.14426849782466888, + "learning_rate": 0.00019988661411437137, + "loss": 1.1649, + "step": 182 + }, + { + "epoch": 0.06, + "grad_norm": 0.15189453959465027, + "learning_rate": 0.0001998852920940145, + "loss": 1.1772, + "step": 183 + }, + { + "epoch": 0.06, + "grad_norm": 0.16095630824565887, + "learning_rate": 0.00019988396241565117, + "loss": 1.0906, + "step": 184 + }, + { + "epoch": 0.07, + "grad_norm": 0.13917481899261475, + "learning_rate": 0.00019988262507938335, + "loss": 1.1591, + "step": 185 + }, + { + "epoch": 0.07, + "grad_norm": 0.15862134099006653, + "learning_rate": 0.00019988128008531354, + "loss": 1.1531, + "step": 186 + }, + { + "epoch": 0.07, + "grad_norm": 0.14392463862895966, + "learning_rate": 0.00019987992743354487, + "loss": 1.2193, + "step": 187 + }, + { + "epoch": 0.07, + "grad_norm": 0.14952170848846436, + "learning_rate": 0.00019987856712418104, + "loss": 1.2214, + "step": 188 + }, + { + "epoch": 0.07, + "grad_norm": 0.1452377885580063, + "learning_rate": 0.00019987719915732634, + "loss": 1.1678, + "step": 189 + }, + { + "epoch": 0.07, + "grad_norm": 0.1571136713027954, + "learning_rate": 0.00019987582353308566, + "loss": 1.1832, + "step": 190 + }, + { + "epoch": 0.07, + "grad_norm": 0.15406474471092224, + "learning_rate": 0.00019987444025156446, + "loss": 1.1609, + "step": 191 + }, + { + "epoch": 0.07, + "grad_norm": 0.16054005920886993, + "learning_rate": 0.0001998730493128688, + "loss": 1.1787, + "step": 192 + }, + { + "epoch": 0.07, + "grad_norm": 0.14553441107273102, + "learning_rate": 0.00019987165071710527, + "loss": 1.1636, + "step": 193 + }, + { + "epoch": 0.07, + "grad_norm": 0.14451570808887482, + "learning_rate": 0.0001998702444643812, + "loss": 1.1658, + "step": 194 + }, + { + "epoch": 0.07, + "grad_norm": 0.149046391248703, + "learning_rate": 0.0001998688305548043, + "loss": 1.1452, + "step": 195 + }, + { + "epoch": 0.07, + "grad_norm": 0.1507096290588379, + "learning_rate": 0.00019986740898848306, + "loss": 1.1181, + "step": 196 + }, + { + "epoch": 0.07, + "grad_norm": 0.150601327419281, + "learning_rate": 0.0001998659797655264, + "loss": 1.1789, + "step": 197 + }, + { + "epoch": 0.07, + "grad_norm": 0.14738313853740692, + "learning_rate": 0.00019986454288604394, + "loss": 1.1728, + "step": 198 + }, + { + "epoch": 0.07, + "grad_norm": 0.15052588284015656, + "learning_rate": 0.0001998630983501458, + "loss": 1.2037, + "step": 199 + }, + { + "epoch": 0.07, + "grad_norm": 0.15669110417366028, + "learning_rate": 0.0001998616461579428, + "loss": 1.1426, + "step": 200 + }, + { + "epoch": 0.07, + "grad_norm": 0.155398428440094, + "learning_rate": 0.00019986018630954616, + "loss": 1.1367, + "step": 201 + }, + { + "epoch": 0.07, + "grad_norm": 0.14136084914207458, + "learning_rate": 0.00019985871880506793, + "loss": 1.1654, + "step": 202 + }, + { + "epoch": 0.07, + "grad_norm": 0.14925213158130646, + "learning_rate": 0.00019985724364462057, + "loss": 1.1997, + "step": 203 + }, + { + "epoch": 0.07, + "grad_norm": 0.15234960615634918, + "learning_rate": 0.00019985576082831715, + "loss": 1.142, + "step": 204 + }, + { + "epoch": 0.07, + "grad_norm": 0.1498517394065857, + "learning_rate": 0.00019985427035627137, + "loss": 1.1695, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.14819611608982086, + "learning_rate": 0.00019985277222859754, + "loss": 1.1948, + "step": 206 + }, + { + "epoch": 0.07, + "grad_norm": 0.14833801984786987, + "learning_rate": 0.00019985126644541046, + "loss": 1.2263, + "step": 207 + }, + { + "epoch": 0.07, + "grad_norm": 0.138643279671669, + "learning_rate": 0.0001998497530068256, + "loss": 1.1655, + "step": 208 + }, + { + "epoch": 0.07, + "grad_norm": 0.15962731838226318, + "learning_rate": 0.000199848231912959, + "loss": 1.2159, + "step": 209 + }, + { + "epoch": 0.07, + "grad_norm": 0.14703673124313354, + "learning_rate": 0.00019984670316392728, + "loss": 1.1169, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 0.14648592472076416, + "learning_rate": 0.0001998451667598477, + "loss": 1.266, + "step": 211 + }, + { + "epoch": 0.07, + "grad_norm": 0.15541663765907288, + "learning_rate": 0.0001998436227008379, + "loss": 1.1395, + "step": 212 + }, + { + "epoch": 0.08, + "grad_norm": 0.14851492643356323, + "learning_rate": 0.00019984207098701638, + "loss": 1.1509, + "step": 213 + }, + { + "epoch": 0.08, + "grad_norm": 0.15682092308998108, + "learning_rate": 0.0001998405116185021, + "loss": 1.2423, + "step": 214 + }, + { + "epoch": 0.08, + "grad_norm": 0.16800767183303833, + "learning_rate": 0.0001998389445954146, + "loss": 1.1532, + "step": 215 + }, + { + "epoch": 0.08, + "grad_norm": 0.1449185460805893, + "learning_rate": 0.000199837369917874, + "loss": 1.1184, + "step": 216 + }, + { + "epoch": 0.08, + "grad_norm": 0.15644285082817078, + "learning_rate": 0.00019983578758600104, + "loss": 1.2339, + "step": 217 + }, + { + "epoch": 0.08, + "grad_norm": 0.14295175671577454, + "learning_rate": 0.00019983419759991703, + "loss": 1.1165, + "step": 218 + }, + { + "epoch": 0.08, + "grad_norm": 0.15597833693027496, + "learning_rate": 0.0001998325999597439, + "loss": 1.2016, + "step": 219 + }, + { + "epoch": 0.08, + "grad_norm": 0.14484375715255737, + "learning_rate": 0.0001998309946656041, + "loss": 1.1666, + "step": 220 + }, + { + "epoch": 0.08, + "grad_norm": 0.16649185121059418, + "learning_rate": 0.0001998293817176207, + "loss": 1.1367, + "step": 221 + }, + { + "epoch": 0.08, + "grad_norm": 0.1692935675382614, + "learning_rate": 0.00019982776111591744, + "loss": 1.1454, + "step": 222 + }, + { + "epoch": 0.08, + "grad_norm": 0.1514350026845932, + "learning_rate": 0.00019982613286061846, + "loss": 1.1746, + "step": 223 + }, + { + "epoch": 0.08, + "grad_norm": 0.15292806923389435, + "learning_rate": 0.00019982449695184865, + "loss": 1.1006, + "step": 224 + }, + { + "epoch": 0.08, + "grad_norm": 0.16936799883842468, + "learning_rate": 0.00019982285338973344, + "loss": 1.1535, + "step": 225 + }, + { + "epoch": 0.08, + "grad_norm": 0.1575537770986557, + "learning_rate": 0.00019982120217439884, + "loss": 1.2035, + "step": 226 + }, + { + "epoch": 0.08, + "grad_norm": 0.14317381381988525, + "learning_rate": 0.00019981954330597143, + "loss": 1.1423, + "step": 227 + }, + { + "epoch": 0.08, + "grad_norm": 0.16129539906978607, + "learning_rate": 0.00019981787678457837, + "loss": 1.0953, + "step": 228 + }, + { + "epoch": 0.08, + "grad_norm": 0.1630077362060547, + "learning_rate": 0.00019981620261034743, + "loss": 1.0997, + "step": 229 + }, + { + "epoch": 0.08, + "grad_norm": 0.14313393831253052, + "learning_rate": 0.00019981452078340702, + "loss": 1.1185, + "step": 230 + }, + { + "epoch": 0.08, + "grad_norm": 0.15655392408370972, + "learning_rate": 0.00019981283130388606, + "loss": 1.0938, + "step": 231 + }, + { + "epoch": 0.08, + "grad_norm": 0.14617329835891724, + "learning_rate": 0.00019981113417191407, + "loss": 1.0718, + "step": 232 + }, + { + "epoch": 0.08, + "grad_norm": 0.15672458708286285, + "learning_rate": 0.00019980942938762115, + "loss": 1.1351, + "step": 233 + }, + { + "epoch": 0.08, + "grad_norm": 0.15482361614704132, + "learning_rate": 0.000199807716951138, + "loss": 1.1938, + "step": 234 + }, + { + "epoch": 0.08, + "grad_norm": 0.15706798434257507, + "learning_rate": 0.00019980599686259594, + "loss": 1.2067, + "step": 235 + }, + { + "epoch": 0.08, + "grad_norm": 0.1500590741634369, + "learning_rate": 0.00019980426912212686, + "loss": 1.1681, + "step": 236 + }, + { + "epoch": 0.08, + "grad_norm": 0.16141961514949799, + "learning_rate": 0.00019980253372986316, + "loss": 1.1418, + "step": 237 + }, + { + "epoch": 0.08, + "grad_norm": 0.1429641842842102, + "learning_rate": 0.00019980079068593793, + "loss": 1.1352, + "step": 238 + }, + { + "epoch": 0.08, + "grad_norm": 0.15207724273204803, + "learning_rate": 0.00019979903999048482, + "loss": 1.2145, + "step": 239 + }, + { + "epoch": 0.08, + "grad_norm": 0.16128486394882202, + "learning_rate": 0.000199797281643638, + "loss": 1.14, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 0.15816397964954376, + "learning_rate": 0.00019979551564553233, + "loss": 1.1786, + "step": 241 + }, + { + "epoch": 0.09, + "grad_norm": 0.15580213069915771, + "learning_rate": 0.0001997937419963032, + "loss": 1.1646, + "step": 242 + }, + { + "epoch": 0.09, + "grad_norm": 0.14587156474590302, + "learning_rate": 0.0001997919606960865, + "loss": 1.1389, + "step": 243 + }, + { + "epoch": 0.09, + "grad_norm": 0.14832736551761627, + "learning_rate": 0.00019979017174501896, + "loss": 1.0983, + "step": 244 + }, + { + "epoch": 0.09, + "grad_norm": 0.1540011465549469, + "learning_rate": 0.00019978837514323763, + "loss": 1.122, + "step": 245 + }, + { + "epoch": 0.09, + "grad_norm": 0.1595630794763565, + "learning_rate": 0.00019978657089088028, + "loss": 1.1561, + "step": 246 + }, + { + "epoch": 0.09, + "grad_norm": 0.14259830117225647, + "learning_rate": 0.00019978475898808524, + "loss": 1.0869, + "step": 247 + }, + { + "epoch": 0.09, + "grad_norm": 0.14668312668800354, + "learning_rate": 0.0001997829394349914, + "loss": 1.1542, + "step": 248 + }, + { + "epoch": 0.09, + "grad_norm": 0.14027823507785797, + "learning_rate": 0.00019978111223173828, + "loss": 1.1879, + "step": 249 + }, + { + "epoch": 0.09, + "grad_norm": 0.15351636707782745, + "learning_rate": 0.00019977927737846595, + "loss": 1.1686, + "step": 250 + }, + { + "epoch": 0.09, + "grad_norm": 0.15014703571796417, + "learning_rate": 0.00019977743487531514, + "loss": 1.1307, + "step": 251 + }, + { + "epoch": 0.09, + "grad_norm": 0.1641966849565506, + "learning_rate": 0.00019977558472242704, + "loss": 1.1601, + "step": 252 + }, + { + "epoch": 0.09, + "grad_norm": 0.15307414531707764, + "learning_rate": 0.00019977372691994355, + "loss": 1.1866, + "step": 253 + }, + { + "epoch": 0.09, + "grad_norm": 0.15638311207294464, + "learning_rate": 0.00019977186146800707, + "loss": 1.2126, + "step": 254 + }, + { + "epoch": 0.09, + "grad_norm": 0.14870685338974, + "learning_rate": 0.00019976998836676067, + "loss": 1.1191, + "step": 255 + }, + { + "epoch": 0.09, + "grad_norm": 0.1501302719116211, + "learning_rate": 0.00019976810761634787, + "loss": 1.172, + "step": 256 + }, + { + "epoch": 0.09, + "grad_norm": 0.15548266470432281, + "learning_rate": 0.00019976621921691297, + "loss": 1.139, + "step": 257 + }, + { + "epoch": 0.09, + "grad_norm": 0.15052320063114166, + "learning_rate": 0.00019976432316860067, + "loss": 1.1248, + "step": 258 + }, + { + "epoch": 0.09, + "grad_norm": 0.14842264354228973, + "learning_rate": 0.00019976241947155634, + "loss": 1.2016, + "step": 259 + }, + { + "epoch": 0.09, + "grad_norm": 0.15101812779903412, + "learning_rate": 0.00019976050812592594, + "loss": 1.1772, + "step": 260 + }, + { + "epoch": 0.09, + "grad_norm": 0.14940795302391052, + "learning_rate": 0.00019975858913185609, + "loss": 1.1316, + "step": 261 + }, + { + "epoch": 0.09, + "grad_norm": 0.14583760499954224, + "learning_rate": 0.00019975666248949378, + "loss": 1.1271, + "step": 262 + }, + { + "epoch": 0.09, + "grad_norm": 0.15480725467205048, + "learning_rate": 0.00019975472819898683, + "loss": 1.1886, + "step": 263 + }, + { + "epoch": 0.09, + "grad_norm": 0.14589324593544006, + "learning_rate": 0.0001997527862604835, + "loss": 1.2016, + "step": 264 + }, + { + "epoch": 0.09, + "grad_norm": 0.14960989356040955, + "learning_rate": 0.00019975083667413265, + "loss": 1.1269, + "step": 265 + }, + { + "epoch": 0.09, + "grad_norm": 0.1421205699443817, + "learning_rate": 0.00019974887944008377, + "loss": 1.052, + "step": 266 + }, + { + "epoch": 0.09, + "grad_norm": 0.15329426527023315, + "learning_rate": 0.00019974691455848696, + "loss": 1.1607, + "step": 267 + }, + { + "epoch": 0.09, + "grad_norm": 0.14127160608768463, + "learning_rate": 0.0001997449420294928, + "loss": 1.1725, + "step": 268 + }, + { + "epoch": 0.09, + "grad_norm": 0.16101525723934174, + "learning_rate": 0.00019974296185325255, + "loss": 1.1535, + "step": 269 + }, + { + "epoch": 0.1, + "grad_norm": 0.15944872796535492, + "learning_rate": 0.000199740974029918, + "loss": 1.1134, + "step": 270 + }, + { + "epoch": 0.1, + "grad_norm": 0.1493426114320755, + "learning_rate": 0.0001997389785596416, + "loss": 1.1452, + "step": 271 + }, + { + "epoch": 0.1, + "grad_norm": 0.14942991733551025, + "learning_rate": 0.0001997369754425763, + "loss": 1.2205, + "step": 272 + }, + { + "epoch": 0.1, + "grad_norm": 0.15282519161701202, + "learning_rate": 0.00019973496467887568, + "loss": 1.1538, + "step": 273 + }, + { + "epoch": 0.1, + "grad_norm": 0.1616443246603012, + "learning_rate": 0.00019973294626869393, + "loss": 1.2279, + "step": 274 + }, + { + "epoch": 0.1, + "grad_norm": 0.15750998258590698, + "learning_rate": 0.00019973092021218576, + "loss": 1.1482, + "step": 275 + }, + { + "epoch": 0.1, + "grad_norm": 0.14668568968772888, + "learning_rate": 0.00019972888650950652, + "loss": 1.1397, + "step": 276 + }, + { + "epoch": 0.1, + "grad_norm": 0.14951111376285553, + "learning_rate": 0.00019972684516081214, + "loss": 1.1073, + "step": 277 + }, + { + "epoch": 0.1, + "grad_norm": 0.15213429927825928, + "learning_rate": 0.0001997247961662591, + "loss": 1.1308, + "step": 278 + }, + { + "epoch": 0.1, + "grad_norm": 0.18259520828723907, + "learning_rate": 0.00019972273952600448, + "loss": 1.1946, + "step": 279 + }, + { + "epoch": 0.1, + "grad_norm": 0.15054333209991455, + "learning_rate": 0.00019972067524020605, + "loss": 1.1469, + "step": 280 + }, + { + "epoch": 0.1, + "grad_norm": 0.15350620448589325, + "learning_rate": 0.00019971860330902198, + "loss": 1.164, + "step": 281 + }, + { + "epoch": 0.1, + "grad_norm": 0.16368921101093292, + "learning_rate": 0.00019971652373261116, + "loss": 1.2219, + "step": 282 + }, + { + "epoch": 0.1, + "grad_norm": 0.1547555923461914, + "learning_rate": 0.000199714436511133, + "loss": 1.091, + "step": 283 + }, + { + "epoch": 0.1, + "grad_norm": 0.1510656177997589, + "learning_rate": 0.00019971234164474754, + "loss": 1.1535, + "step": 284 + }, + { + "epoch": 0.1, + "grad_norm": 0.15672382712364197, + "learning_rate": 0.0001997102391336154, + "loss": 1.0743, + "step": 285 + }, + { + "epoch": 0.1, + "grad_norm": 0.16902947425842285, + "learning_rate": 0.0001997081289778978, + "loss": 1.1292, + "step": 286 + }, + { + "epoch": 0.1, + "grad_norm": 0.14788220822811127, + "learning_rate": 0.00019970601117775644, + "loss": 1.1239, + "step": 287 + }, + { + "epoch": 0.1, + "grad_norm": 0.1836184412240982, + "learning_rate": 0.00019970388573335374, + "loss": 1.1346, + "step": 288 + }, + { + "epoch": 0.1, + "grad_norm": 0.14677155017852783, + "learning_rate": 0.00019970175264485266, + "loss": 1.1423, + "step": 289 + }, + { + "epoch": 0.1, + "grad_norm": 0.15332669019699097, + "learning_rate": 0.00019969961191241675, + "loss": 1.1339, + "step": 290 + }, + { + "epoch": 0.1, + "grad_norm": 0.1484992951154709, + "learning_rate": 0.0001996974635362101, + "loss": 1.169, + "step": 291 + }, + { + "epoch": 0.1, + "grad_norm": 0.1394282877445221, + "learning_rate": 0.00019969530751639742, + "loss": 1.0936, + "step": 292 + }, + { + "epoch": 0.1, + "grad_norm": 0.15574292838573456, + "learning_rate": 0.00019969314385314405, + "loss": 1.1508, + "step": 293 + }, + { + "epoch": 0.1, + "grad_norm": 0.15783685445785522, + "learning_rate": 0.0001996909725466158, + "loss": 1.1752, + "step": 294 + }, + { + "epoch": 0.1, + "grad_norm": 0.1484525501728058, + "learning_rate": 0.00019968879359697926, + "loss": 1.1611, + "step": 295 + }, + { + "epoch": 0.1, + "grad_norm": 0.1568101942539215, + "learning_rate": 0.00019968660700440136, + "loss": 1.0999, + "step": 296 + }, + { + "epoch": 0.1, + "grad_norm": 0.1591518223285675, + "learning_rate": 0.0001996844127690498, + "loss": 1.0899, + "step": 297 + }, + { + "epoch": 0.1, + "grad_norm": 0.14987587928771973, + "learning_rate": 0.00019968221089109283, + "loss": 1.1148, + "step": 298 + }, + { + "epoch": 0.11, + "grad_norm": 0.1563010960817337, + "learning_rate": 0.0001996800013706992, + "loss": 1.0537, + "step": 299 + }, + { + "epoch": 0.11, + "grad_norm": 0.16013358533382416, + "learning_rate": 0.00019967778420803834, + "loss": 1.0721, + "step": 300 + }, + { + "epoch": 0.11, + "grad_norm": 0.16744442284107208, + "learning_rate": 0.00019967555940328028, + "loss": 1.1195, + "step": 301 + }, + { + "epoch": 0.11, + "grad_norm": 0.14828696846961975, + "learning_rate": 0.00019967332695659554, + "loss": 1.0544, + "step": 302 + }, + { + "epoch": 0.11, + "grad_norm": 0.16506452858448029, + "learning_rate": 0.00019967108686815528, + "loss": 1.1455, + "step": 303 + }, + { + "epoch": 0.11, + "grad_norm": 0.178831547498703, + "learning_rate": 0.00019966883913813125, + "loss": 1.0979, + "step": 304 + }, + { + "epoch": 0.11, + "grad_norm": 0.15071475505828857, + "learning_rate": 0.00019966658376669577, + "loss": 1.1009, + "step": 305 + }, + { + "epoch": 0.11, + "grad_norm": 0.17837685346603394, + "learning_rate": 0.00019966432075402182, + "loss": 1.1708, + "step": 306 + }, + { + "epoch": 0.11, + "grad_norm": 0.18097737431526184, + "learning_rate": 0.0001996620501002828, + "loss": 1.1184, + "step": 307 + }, + { + "epoch": 0.11, + "grad_norm": 0.14264380931854248, + "learning_rate": 0.00019965977180565284, + "loss": 1.1471, + "step": 308 + }, + { + "epoch": 0.11, + "grad_norm": 0.1658606231212616, + "learning_rate": 0.00019965748587030664, + "loss": 1.0892, + "step": 309 + }, + { + "epoch": 0.11, + "grad_norm": 0.16822941601276398, + "learning_rate": 0.00019965519229441943, + "loss": 1.135, + "step": 310 + }, + { + "epoch": 0.11, + "grad_norm": 0.1563027799129486, + "learning_rate": 0.00019965289107816707, + "loss": 1.2013, + "step": 311 + }, + { + "epoch": 0.11, + "grad_norm": 0.15105265378952026, + "learning_rate": 0.00019965058222172594, + "loss": 1.1629, + "step": 312 + }, + { + "epoch": 0.11, + "grad_norm": 0.15457677841186523, + "learning_rate": 0.00019964826572527314, + "loss": 1.1253, + "step": 313 + }, + { + "epoch": 0.11, + "grad_norm": 0.18937544524669647, + "learning_rate": 0.00019964594158898617, + "loss": 1.155, + "step": 314 + }, + { + "epoch": 0.11, + "grad_norm": 0.1488984227180481, + "learning_rate": 0.0001996436098130433, + "loss": 1.1303, + "step": 315 + }, + { + "epoch": 0.11, + "grad_norm": 0.1458614617586136, + "learning_rate": 0.0001996412703976233, + "loss": 1.1398, + "step": 316 + }, + { + "epoch": 0.11, + "grad_norm": 0.1635604351758957, + "learning_rate": 0.00019963892334290547, + "loss": 1.0879, + "step": 317 + }, + { + "epoch": 0.11, + "grad_norm": 0.1525888293981552, + "learning_rate": 0.00019963656864906978, + "loss": 1.1006, + "step": 318 + }, + { + "epoch": 0.11, + "grad_norm": 0.15084509551525116, + "learning_rate": 0.00019963420631629678, + "loss": 1.1898, + "step": 319 + }, + { + "epoch": 0.11, + "grad_norm": 0.16898000240325928, + "learning_rate": 0.00019963183634476756, + "loss": 1.1698, + "step": 320 + }, + { + "epoch": 0.11, + "grad_norm": 0.15749044716358185, + "learning_rate": 0.00019962945873466383, + "loss": 1.212, + "step": 321 + }, + { + "epoch": 0.11, + "grad_norm": 0.15450423955917358, + "learning_rate": 0.0001996270734861679, + "loss": 1.162, + "step": 322 + }, + { + "epoch": 0.11, + "grad_norm": 0.15154510736465454, + "learning_rate": 0.0001996246805994626, + "loss": 1.1085, + "step": 323 + }, + { + "epoch": 0.11, + "grad_norm": 0.15856508910655975, + "learning_rate": 0.00019962228007473144, + "loss": 1.0942, + "step": 324 + }, + { + "epoch": 0.11, + "grad_norm": 0.17117822170257568, + "learning_rate": 0.00019961987191215841, + "loss": 1.1871, + "step": 325 + }, + { + "epoch": 0.11, + "grad_norm": 0.15272203087806702, + "learning_rate": 0.00019961745611192817, + "loss": 1.0386, + "step": 326 + }, + { + "epoch": 0.12, + "grad_norm": 0.15557795763015747, + "learning_rate": 0.0001996150326742259, + "loss": 1.121, + "step": 327 + }, + { + "epoch": 0.12, + "grad_norm": 0.16436509788036346, + "learning_rate": 0.00019961260159923744, + "loss": 1.1567, + "step": 328 + }, + { + "epoch": 0.12, + "grad_norm": 0.16334863007068634, + "learning_rate": 0.00019961016288714918, + "loss": 1.0672, + "step": 329 + }, + { + "epoch": 0.12, + "grad_norm": 0.16035060584545135, + "learning_rate": 0.00019960771653814807, + "loss": 1.1199, + "step": 330 + }, + { + "epoch": 0.12, + "grad_norm": 0.15284912288188934, + "learning_rate": 0.00019960526255242164, + "loss": 1.0992, + "step": 331 + }, + { + "epoch": 0.12, + "grad_norm": 0.15060201287269592, + "learning_rate": 0.00019960280093015808, + "loss": 1.1156, + "step": 332 + }, + { + "epoch": 0.12, + "grad_norm": 0.16274458169937134, + "learning_rate": 0.0001996003316715461, + "loss": 1.1305, + "step": 333 + }, + { + "epoch": 0.12, + "grad_norm": 0.1730499416589737, + "learning_rate": 0.00019959785477677503, + "loss": 1.1174, + "step": 334 + }, + { + "epoch": 0.12, + "grad_norm": 0.15439949929714203, + "learning_rate": 0.0001995953702460347, + "loss": 1.0511, + "step": 335 + }, + { + "epoch": 0.12, + "grad_norm": 0.15664365887641907, + "learning_rate": 0.0001995928780795157, + "loss": 1.141, + "step": 336 + }, + { + "epoch": 0.12, + "grad_norm": 0.16615834832191467, + "learning_rate": 0.00019959037827740902, + "loss": 1.156, + "step": 337 + }, + { + "epoch": 0.12, + "grad_norm": 0.15994668006896973, + "learning_rate": 0.00019958787083990633, + "loss": 1.1022, + "step": 338 + }, + { + "epoch": 0.12, + "grad_norm": 0.15766587853431702, + "learning_rate": 0.00019958535576719991, + "loss": 1.1335, + "step": 339 + }, + { + "epoch": 0.12, + "grad_norm": 0.15908043086528778, + "learning_rate": 0.00019958283305948252, + "loss": 1.172, + "step": 340 + }, + { + "epoch": 0.12, + "grad_norm": 0.16546039283275604, + "learning_rate": 0.00019958030271694766, + "loss": 1.2044, + "step": 341 + }, + { + "epoch": 0.12, + "grad_norm": 0.16330744326114655, + "learning_rate": 0.00019957776473978923, + "loss": 1.1237, + "step": 342 + }, + { + "epoch": 0.12, + "grad_norm": 0.1535853147506714, + "learning_rate": 0.00019957521912820187, + "loss": 1.114, + "step": 343 + }, + { + "epoch": 0.12, + "grad_norm": 0.1673428863286972, + "learning_rate": 0.0001995726658823807, + "loss": 1.0633, + "step": 344 + }, + { + "epoch": 0.12, + "grad_norm": 0.15542234480381012, + "learning_rate": 0.00019957010500252153, + "loss": 1.1305, + "step": 345 + }, + { + "epoch": 0.12, + "grad_norm": 0.15837039053440094, + "learning_rate": 0.00019956753648882068, + "loss": 1.1461, + "step": 346 + }, + { + "epoch": 0.12, + "grad_norm": 0.16113828122615814, + "learning_rate": 0.00019956496034147505, + "loss": 1.1012, + "step": 347 + }, + { + "epoch": 0.12, + "grad_norm": 0.150481179356575, + "learning_rate": 0.00019956237656068218, + "loss": 1.0548, + "step": 348 + }, + { + "epoch": 0.12, + "grad_norm": 0.15013793110847473, + "learning_rate": 0.00019955978514664012, + "loss": 1.0639, + "step": 349 + }, + { + "epoch": 0.12, + "grad_norm": 0.16781456768512726, + "learning_rate": 0.00019955718609954758, + "loss": 1.0529, + "step": 350 + }, + { + "epoch": 0.12, + "grad_norm": 0.15649698674678802, + "learning_rate": 0.00019955457941960383, + "loss": 1.0471, + "step": 351 + }, + { + "epoch": 0.12, + "grad_norm": 0.15318119525909424, + "learning_rate": 0.0001995519651070087, + "loss": 1.1111, + "step": 352 + }, + { + "epoch": 0.12, + "grad_norm": 0.1564999669790268, + "learning_rate": 0.00019954934316196261, + "loss": 1.1563, + "step": 353 + }, + { + "epoch": 0.12, + "grad_norm": 0.15984880924224854, + "learning_rate": 0.00019954671358466663, + "loss": 1.1305, + "step": 354 + }, + { + "epoch": 0.13, + "grad_norm": 0.17183969914913177, + "learning_rate": 0.00019954407637532234, + "loss": 1.1312, + "step": 355 + }, + { + "epoch": 0.13, + "grad_norm": 0.15881489217281342, + "learning_rate": 0.0001995414315341319, + "loss": 1.1161, + "step": 356 + }, + { + "epoch": 0.13, + "grad_norm": 0.15662528574466705, + "learning_rate": 0.0001995387790612981, + "loss": 1.1191, + "step": 357 + }, + { + "epoch": 0.13, + "grad_norm": 0.1573573797941208, + "learning_rate": 0.00019953611895702435, + "loss": 1.0853, + "step": 358 + }, + { + "epoch": 0.13, + "grad_norm": 0.15552809834480286, + "learning_rate": 0.00019953345122151453, + "loss": 1.0935, + "step": 359 + }, + { + "epoch": 0.13, + "grad_norm": 0.15939250588417053, + "learning_rate": 0.00019953077585497319, + "loss": 1.1027, + "step": 360 + }, + { + "epoch": 0.13, + "grad_norm": 0.1561977118253708, + "learning_rate": 0.00019952809285760545, + "loss": 1.1131, + "step": 361 + }, + { + "epoch": 0.13, + "grad_norm": 0.15809029340744019, + "learning_rate": 0.000199525402229617, + "loss": 1.1493, + "step": 362 + }, + { + "epoch": 0.13, + "grad_norm": 0.1678066998720169, + "learning_rate": 0.00019952270397121416, + "loss": 1.1568, + "step": 363 + }, + { + "epoch": 0.13, + "grad_norm": 0.16522270441055298, + "learning_rate": 0.00019951999808260376, + "loss": 1.1968, + "step": 364 + }, + { + "epoch": 0.13, + "grad_norm": 0.15423065423965454, + "learning_rate": 0.00019951728456399327, + "loss": 1.1335, + "step": 365 + }, + { + "epoch": 0.13, + "grad_norm": 0.16434919834136963, + "learning_rate": 0.0001995145634155907, + "loss": 1.1583, + "step": 366 + }, + { + "epoch": 0.13, + "grad_norm": 0.15384919941425323, + "learning_rate": 0.00019951183463760475, + "loss": 1.1789, + "step": 367 + }, + { + "epoch": 0.13, + "grad_norm": 0.15325228869915009, + "learning_rate": 0.00019950909823024455, + "loss": 1.1198, + "step": 368 + }, + { + "epoch": 0.13, + "grad_norm": 0.15763159096240997, + "learning_rate": 0.00019950635419371996, + "loss": 1.184, + "step": 369 + }, + { + "epoch": 0.13, + "grad_norm": 0.17088153958320618, + "learning_rate": 0.0001995036025282413, + "loss": 1.1653, + "step": 370 + }, + { + "epoch": 0.13, + "grad_norm": 0.15483850240707397, + "learning_rate": 0.00019950084323401958, + "loss": 1.0983, + "step": 371 + }, + { + "epoch": 0.13, + "grad_norm": 0.164934441447258, + "learning_rate": 0.00019949807631126633, + "loss": 1.0897, + "step": 372 + }, + { + "epoch": 0.13, + "grad_norm": 0.16767363250255585, + "learning_rate": 0.00019949530176019368, + "loss": 1.1031, + "step": 373 + }, + { + "epoch": 0.13, + "grad_norm": 0.16566498577594757, + "learning_rate": 0.00019949251958101437, + "loss": 1.176, + "step": 374 + }, + { + "epoch": 0.13, + "grad_norm": 0.15819025039672852, + "learning_rate": 0.0001994897297739417, + "loss": 1.2291, + "step": 375 + }, + { + "epoch": 0.13, + "grad_norm": 0.15242032706737518, + "learning_rate": 0.00019948693233918952, + "loss": 1.1527, + "step": 376 + }, + { + "epoch": 0.13, + "grad_norm": 0.15688876807689667, + "learning_rate": 0.00019948412727697234, + "loss": 1.1566, + "step": 377 + }, + { + "epoch": 0.13, + "grad_norm": 0.15263889729976654, + "learning_rate": 0.00019948131458750524, + "loss": 1.1477, + "step": 378 + }, + { + "epoch": 0.13, + "grad_norm": 0.1564534604549408, + "learning_rate": 0.00019947849427100382, + "loss": 1.1185, + "step": 379 + }, + { + "epoch": 0.13, + "grad_norm": 0.1585642248392105, + "learning_rate": 0.0001994756663276843, + "loss": 1.1919, + "step": 380 + }, + { + "epoch": 0.13, + "grad_norm": 0.15505757927894592, + "learning_rate": 0.00019947283075776352, + "loss": 1.1588, + "step": 381 + }, + { + "epoch": 0.13, + "grad_norm": 0.14966590702533722, + "learning_rate": 0.0001994699875614589, + "loss": 1.0691, + "step": 382 + }, + { + "epoch": 0.13, + "grad_norm": 0.16230319440364838, + "learning_rate": 0.0001994671367389884, + "loss": 1.0874, + "step": 383 + }, + { + "epoch": 0.14, + "grad_norm": 0.1507527083158493, + "learning_rate": 0.00019946427829057057, + "loss": 1.1406, + "step": 384 + }, + { + "epoch": 0.14, + "grad_norm": 0.15179996192455292, + "learning_rate": 0.00019946141221642458, + "loss": 1.0817, + "step": 385 + }, + { + "epoch": 0.14, + "grad_norm": 0.15593743324279785, + "learning_rate": 0.00019945853851677014, + "loss": 1.0832, + "step": 386 + }, + { + "epoch": 0.14, + "grad_norm": 0.15427644550800323, + "learning_rate": 0.00019945565719182762, + "loss": 1.0984, + "step": 387 + }, + { + "epoch": 0.14, + "grad_norm": 0.14921724796295166, + "learning_rate": 0.00019945276824181787, + "loss": 1.1237, + "step": 388 + }, + { + "epoch": 0.14, + "grad_norm": 0.15762227773666382, + "learning_rate": 0.00019944987166696244, + "loss": 1.1492, + "step": 389 + }, + { + "epoch": 0.14, + "grad_norm": 0.17605511844158173, + "learning_rate": 0.00019944696746748334, + "loss": 1.077, + "step": 390 + }, + { + "epoch": 0.14, + "grad_norm": 0.15459680557250977, + "learning_rate": 0.00019944405564360326, + "loss": 1.0774, + "step": 391 + }, + { + "epoch": 0.14, + "grad_norm": 0.15118005871772766, + "learning_rate": 0.00019944113619554545, + "loss": 1.0614, + "step": 392 + }, + { + "epoch": 0.14, + "grad_norm": 0.15591846406459808, + "learning_rate": 0.00019943820912353375, + "loss": 1.2164, + "step": 393 + }, + { + "epoch": 0.14, + "grad_norm": 0.1590687781572342, + "learning_rate": 0.00019943527442779256, + "loss": 1.1047, + "step": 394 + }, + { + "epoch": 0.14, + "grad_norm": 0.15030692517757416, + "learning_rate": 0.00019943233210854686, + "loss": 1.0685, + "step": 395 + }, + { + "epoch": 0.14, + "grad_norm": 0.16304552555084229, + "learning_rate": 0.00019942938216602223, + "loss": 1.1315, + "step": 396 + }, + { + "epoch": 0.14, + "grad_norm": 0.16410882771015167, + "learning_rate": 0.00019942642460044486, + "loss": 1.0894, + "step": 397 + }, + { + "epoch": 0.14, + "grad_norm": 0.15879030525684357, + "learning_rate": 0.00019942345941204148, + "loss": 1.1264, + "step": 398 + }, + { + "epoch": 0.14, + "grad_norm": 0.17829300463199615, + "learning_rate": 0.00019942048660103948, + "loss": 1.1409, + "step": 399 + }, + { + "epoch": 0.14, + "grad_norm": 0.1717354953289032, + "learning_rate": 0.0001994175061676667, + "loss": 1.1616, + "step": 400 + }, + { + "epoch": 0.14, + "grad_norm": 0.16701237857341766, + "learning_rate": 0.00019941451811215168, + "loss": 1.1964, + "step": 401 + }, + { + "epoch": 0.14, + "grad_norm": 0.16376936435699463, + "learning_rate": 0.00019941152243472353, + "loss": 1.1349, + "step": 402 + }, + { + "epoch": 0.14, + "grad_norm": 0.1571936160326004, + "learning_rate": 0.00019940851913561187, + "loss": 1.1143, + "step": 403 + }, + { + "epoch": 0.14, + "grad_norm": 0.1656806915998459, + "learning_rate": 0.000199405508215047, + "loss": 1.1323, + "step": 404 + }, + { + "epoch": 0.14, + "grad_norm": 0.16776010394096375, + "learning_rate": 0.00019940248967325976, + "loss": 1.1367, + "step": 405 + }, + { + "epoch": 0.14, + "grad_norm": 0.14835897088050842, + "learning_rate": 0.00019939946351048158, + "loss": 1.1074, + "step": 406 + }, + { + "epoch": 0.14, + "grad_norm": 0.1668928563594818, + "learning_rate": 0.00019939642972694442, + "loss": 1.076, + "step": 407 + }, + { + "epoch": 0.14, + "grad_norm": 0.16557957231998444, + "learning_rate": 0.0001993933883228809, + "loss": 1.1482, + "step": 408 + }, + { + "epoch": 0.14, + "grad_norm": 0.15333621203899384, + "learning_rate": 0.00019939033929852425, + "loss": 1.1367, + "step": 409 + }, + { + "epoch": 0.14, + "grad_norm": 0.15507064759731293, + "learning_rate": 0.00019938728265410816, + "loss": 1.1554, + "step": 410 + }, + { + "epoch": 0.14, + "grad_norm": 0.15727224946022034, + "learning_rate": 0.000199384218389867, + "loss": 1.1084, + "step": 411 + }, + { + "epoch": 0.15, + "grad_norm": 0.1558888554573059, + "learning_rate": 0.00019938114650603573, + "loss": 1.1928, + "step": 412 + }, + { + "epoch": 0.15, + "grad_norm": 0.1571364551782608, + "learning_rate": 0.00019937806700284986, + "loss": 1.0771, + "step": 413 + }, + { + "epoch": 0.15, + "grad_norm": 0.16041377186775208, + "learning_rate": 0.00019937497988054546, + "loss": 1.14, + "step": 414 + }, + { + "epoch": 0.15, + "grad_norm": 0.15569083392620087, + "learning_rate": 0.00019937188513935921, + "loss": 1.1003, + "step": 415 + }, + { + "epoch": 0.15, + "grad_norm": 0.15835022926330566, + "learning_rate": 0.0001993687827795284, + "loss": 1.0935, + "step": 416 + }, + { + "epoch": 0.15, + "grad_norm": 0.16400137543678284, + "learning_rate": 0.00019936567280129085, + "loss": 1.1469, + "step": 417 + }, + { + "epoch": 0.15, + "grad_norm": 0.17146562039852142, + "learning_rate": 0.00019936255520488504, + "loss": 1.1114, + "step": 418 + }, + { + "epoch": 0.15, + "grad_norm": 0.16136683523654938, + "learning_rate": 0.00019935942999055, + "loss": 1.1084, + "step": 419 + }, + { + "epoch": 0.15, + "grad_norm": 0.15654154121875763, + "learning_rate": 0.00019935629715852525, + "loss": 1.0972, + "step": 420 + }, + { + "epoch": 0.15, + "grad_norm": 0.1646607220172882, + "learning_rate": 0.00019935315670905105, + "loss": 1.1281, + "step": 421 + }, + { + "epoch": 0.15, + "grad_norm": 0.1610719859600067, + "learning_rate": 0.0001993500086423682, + "loss": 1.1131, + "step": 422 + }, + { + "epoch": 0.15, + "grad_norm": 0.16374924778938293, + "learning_rate": 0.00019934685295871796, + "loss": 1.1501, + "step": 423 + }, + { + "epoch": 0.15, + "grad_norm": 0.14906828105449677, + "learning_rate": 0.00019934368965834236, + "loss": 1.0911, + "step": 424 + }, + { + "epoch": 0.15, + "grad_norm": 0.15239720046520233, + "learning_rate": 0.00019934051874148385, + "loss": 1.1018, + "step": 425 + }, + { + "epoch": 0.15, + "grad_norm": 0.16222716867923737, + "learning_rate": 0.00019933734020838558, + "loss": 1.1582, + "step": 426 + }, + { + "epoch": 0.15, + "grad_norm": 0.16466976702213287, + "learning_rate": 0.00019933415405929124, + "loss": 1.1741, + "step": 427 + }, + { + "epoch": 0.15, + "grad_norm": 0.16280680894851685, + "learning_rate": 0.00019933096029444512, + "loss": 1.1858, + "step": 428 + }, + { + "epoch": 0.15, + "grad_norm": 0.1722201555967331, + "learning_rate": 0.00019932775891409204, + "loss": 1.1564, + "step": 429 + }, + { + "epoch": 0.15, + "grad_norm": 0.1672065407037735, + "learning_rate": 0.00019932454991847745, + "loss": 1.0803, + "step": 430 + }, + { + "epoch": 0.15, + "grad_norm": 0.15295496582984924, + "learning_rate": 0.0001993213333078474, + "loss": 1.0711, + "step": 431 + }, + { + "epoch": 0.15, + "grad_norm": 0.1692061722278595, + "learning_rate": 0.0001993181090824485, + "loss": 1.1564, + "step": 432 + }, + { + "epoch": 0.15, + "grad_norm": 0.17974984645843506, + "learning_rate": 0.00019931487724252796, + "loss": 1.1416, + "step": 433 + }, + { + "epoch": 0.15, + "grad_norm": 0.2724705934524536, + "learning_rate": 0.00019931163778833352, + "loss": 1.1635, + "step": 434 + }, + { + "epoch": 0.15, + "grad_norm": 0.16062946617603302, + "learning_rate": 0.00019930839072011353, + "loss": 1.2007, + "step": 435 + }, + { + "epoch": 0.15, + "grad_norm": 0.1821368783712387, + "learning_rate": 0.00019930513603811699, + "loss": 1.0672, + "step": 436 + }, + { + "epoch": 0.15, + "grad_norm": 0.17222100496292114, + "learning_rate": 0.00019930187374259337, + "loss": 1.1162, + "step": 437 + }, + { + "epoch": 0.15, + "grad_norm": 0.16529572010040283, + "learning_rate": 0.00019929860383379285, + "loss": 1.1139, + "step": 438 + }, + { + "epoch": 0.15, + "grad_norm": 0.18187189102172852, + "learning_rate": 0.00019929532631196608, + "loss": 1.1688, + "step": 439 + }, + { + "epoch": 0.15, + "grad_norm": 0.18088270723819733, + "learning_rate": 0.00019929204117736436, + "loss": 1.1112, + "step": 440 + }, + { + "epoch": 0.16, + "grad_norm": 0.1561852991580963, + "learning_rate": 0.00019928874843023957, + "loss": 1.1113, + "step": 441 + }, + { + "epoch": 0.16, + "grad_norm": 0.1627861112356186, + "learning_rate": 0.0001992854480708441, + "loss": 1.0826, + "step": 442 + }, + { + "epoch": 0.16, + "grad_norm": 0.16556496918201447, + "learning_rate": 0.00019928214009943106, + "loss": 1.1006, + "step": 443 + }, + { + "epoch": 0.16, + "grad_norm": 0.18377332389354706, + "learning_rate": 0.00019927882451625402, + "loss": 1.1345, + "step": 444 + }, + { + "epoch": 0.16, + "grad_norm": 0.1585996001958847, + "learning_rate": 0.00019927550132156716, + "loss": 1.0774, + "step": 445 + }, + { + "epoch": 0.16, + "grad_norm": 0.1622677743434906, + "learning_rate": 0.00019927217051562531, + "loss": 1.0757, + "step": 446 + }, + { + "epoch": 0.16, + "grad_norm": 0.1750255525112152, + "learning_rate": 0.00019926883209868382, + "loss": 1.1532, + "step": 447 + }, + { + "epoch": 0.16, + "grad_norm": 0.1599978357553482, + "learning_rate": 0.00019926548607099865, + "loss": 1.0104, + "step": 448 + }, + { + "epoch": 0.16, + "grad_norm": 0.15971815586090088, + "learning_rate": 0.00019926213243282628, + "loss": 1.0962, + "step": 449 + }, + { + "epoch": 0.16, + "grad_norm": 0.16724000871181488, + "learning_rate": 0.00019925877118442386, + "loss": 1.1142, + "step": 450 + }, + { + "epoch": 0.16, + "grad_norm": 0.17150260508060455, + "learning_rate": 0.00019925540232604915, + "loss": 1.1623, + "step": 451 + }, + { + "epoch": 0.16, + "grad_norm": 0.1774536669254303, + "learning_rate": 0.00019925202585796033, + "loss": 1.1171, + "step": 452 + }, + { + "epoch": 0.16, + "grad_norm": 0.1575271040201187, + "learning_rate": 0.00019924864178041634, + "loss": 1.095, + "step": 453 + }, + { + "epoch": 0.16, + "grad_norm": 0.3351128101348877, + "learning_rate": 0.00019924525009367658, + "loss": 1.2996, + "step": 454 + }, + { + "epoch": 0.16, + "grad_norm": 0.1803962141275406, + "learning_rate": 0.00019924185079800113, + "loss": 1.084, + "step": 455 + }, + { + "epoch": 0.16, + "grad_norm": 0.17613469064235687, + "learning_rate": 0.0001992384438936506, + "loss": 1.1243, + "step": 456 + }, + { + "epoch": 0.16, + "grad_norm": 0.17339295148849487, + "learning_rate": 0.00019923502938088615, + "loss": 1.0991, + "step": 457 + }, + { + "epoch": 0.16, + "grad_norm": 0.16048653423786163, + "learning_rate": 0.0001992316072599696, + "loss": 1.0882, + "step": 458 + }, + { + "epoch": 0.16, + "grad_norm": 0.15557259321212769, + "learning_rate": 0.0001992281775311633, + "loss": 1.0639, + "step": 459 + }, + { + "epoch": 0.16, + "grad_norm": 0.17755229771137238, + "learning_rate": 0.00019922474019473023, + "loss": 1.1323, + "step": 460 + }, + { + "epoch": 0.16, + "grad_norm": 0.16753968596458435, + "learning_rate": 0.00019922129525093389, + "loss": 1.0643, + "step": 461 + }, + { + "epoch": 0.16, + "grad_norm": 0.15593862533569336, + "learning_rate": 0.00019921784270003843, + "loss": 1.1237, + "step": 462 + }, + { + "epoch": 0.16, + "grad_norm": 0.1679830104112625, + "learning_rate": 0.0001992143825423085, + "loss": 1.1163, + "step": 463 + }, + { + "epoch": 0.16, + "grad_norm": 0.17055770754814148, + "learning_rate": 0.00019921091477800946, + "loss": 1.1181, + "step": 464 + }, + { + "epoch": 0.16, + "grad_norm": 0.1715300977230072, + "learning_rate": 0.0001992074394074071, + "loss": 1.1767, + "step": 465 + }, + { + "epoch": 0.16, + "grad_norm": 0.15965238213539124, + "learning_rate": 0.0001992039564307679, + "loss": 1.1201, + "step": 466 + }, + { + "epoch": 0.16, + "grad_norm": 0.1599990427494049, + "learning_rate": 0.00019920046584835887, + "loss": 1.1146, + "step": 467 + }, + { + "epoch": 0.16, + "grad_norm": 0.16567906737327576, + "learning_rate": 0.00019919696766044768, + "loss": 1.1322, + "step": 468 + }, + { + "epoch": 0.17, + "grad_norm": 0.15802539885044098, + "learning_rate": 0.0001991934618673025, + "loss": 1.0508, + "step": 469 + }, + { + "epoch": 0.17, + "grad_norm": 0.161991149187088, + "learning_rate": 0.0001991899484691921, + "loss": 1.1892, + "step": 470 + }, + { + "epoch": 0.17, + "grad_norm": 0.16740918159484863, + "learning_rate": 0.00019918642746638584, + "loss": 1.1625, + "step": 471 + }, + { + "epoch": 0.17, + "grad_norm": 0.15869614481925964, + "learning_rate": 0.0001991828988591537, + "loss": 1.126, + "step": 472 + }, + { + "epoch": 0.17, + "grad_norm": 0.17097507417201996, + "learning_rate": 0.0001991793626477662, + "loss": 1.0856, + "step": 473 + }, + { + "epoch": 0.17, + "grad_norm": 0.15990661084651947, + "learning_rate": 0.00019917581883249446, + "loss": 1.0745, + "step": 474 + }, + { + "epoch": 0.17, + "grad_norm": 0.1636432260274887, + "learning_rate": 0.00019917226741361015, + "loss": 1.192, + "step": 475 + }, + { + "epoch": 0.17, + "grad_norm": 0.16329345107078552, + "learning_rate": 0.00019916870839138556, + "loss": 1.0846, + "step": 476 + }, + { + "epoch": 0.17, + "grad_norm": 0.16847611963748932, + "learning_rate": 0.00019916514176609358, + "loss": 1.1549, + "step": 477 + }, + { + "epoch": 0.17, + "grad_norm": 0.16813425719738007, + "learning_rate": 0.00019916156753800764, + "loss": 1.0764, + "step": 478 + }, + { + "epoch": 0.17, + "grad_norm": 0.1625032275915146, + "learning_rate": 0.00019915798570740173, + "loss": 1.1547, + "step": 479 + }, + { + "epoch": 0.17, + "grad_norm": 0.170704185962677, + "learning_rate": 0.00019915439627455052, + "loss": 1.0889, + "step": 480 + }, + { + "epoch": 0.17, + "grad_norm": 0.17293334007263184, + "learning_rate": 0.00019915079923972918, + "loss": 1.202, + "step": 481 + }, + { + "epoch": 0.17, + "grad_norm": 0.15445169806480408, + "learning_rate": 0.0001991471946032135, + "loss": 1.0738, + "step": 482 + }, + { + "epoch": 0.17, + "grad_norm": 0.16902180016040802, + "learning_rate": 0.00019914358236527982, + "loss": 1.0554, + "step": 483 + }, + { + "epoch": 0.17, + "grad_norm": 0.15919239819049835, + "learning_rate": 0.0001991399625262051, + "loss": 1.1367, + "step": 484 + }, + { + "epoch": 0.17, + "grad_norm": 0.16527967154979706, + "learning_rate": 0.00019913633508626685, + "loss": 1.1041, + "step": 485 + }, + { + "epoch": 0.17, + "grad_norm": 0.16559232771396637, + "learning_rate": 0.0001991327000457432, + "loss": 1.0582, + "step": 486 + }, + { + "epoch": 0.17, + "grad_norm": 0.16566972434520721, + "learning_rate": 0.0001991290574049128, + "loss": 1.0881, + "step": 487 + }, + { + "epoch": 0.17, + "grad_norm": 0.15967406332492828, + "learning_rate": 0.00019912540716405497, + "loss": 1.0668, + "step": 488 + }, + { + "epoch": 0.17, + "grad_norm": 0.16620907187461853, + "learning_rate": 0.00019912174932344953, + "loss": 1.0945, + "step": 489 + }, + { + "epoch": 0.17, + "grad_norm": 0.16342945396900177, + "learning_rate": 0.00019911808388337697, + "loss": 1.0558, + "step": 490 + }, + { + "epoch": 0.17, + "grad_norm": 0.14973656833171844, + "learning_rate": 0.00019911441084411827, + "loss": 1.042, + "step": 491 + }, + { + "epoch": 0.17, + "grad_norm": 0.1684689223766327, + "learning_rate": 0.00019911073020595504, + "loss": 1.1281, + "step": 492 + }, + { + "epoch": 0.17, + "grad_norm": 0.15556031465530396, + "learning_rate": 0.00019910704196916948, + "loss": 1.0338, + "step": 493 + }, + { + "epoch": 0.17, + "grad_norm": 0.16132386028766632, + "learning_rate": 0.00019910334613404434, + "loss": 1.0586, + "step": 494 + }, + { + "epoch": 0.17, + "grad_norm": 0.16225095093250275, + "learning_rate": 0.000199099642700863, + "loss": 1.1608, + "step": 495 + }, + { + "epoch": 0.17, + "grad_norm": 0.17955203354358673, + "learning_rate": 0.00019909593166990934, + "loss": 1.0858, + "step": 496 + }, + { + "epoch": 0.18, + "grad_norm": 0.153148353099823, + "learning_rate": 0.00019909221304146795, + "loss": 1.1217, + "step": 497 + }, + { + "epoch": 0.18, + "grad_norm": 0.16156357526779175, + "learning_rate": 0.00019908848681582391, + "loss": 1.08, + "step": 498 + }, + { + "epoch": 0.18, + "grad_norm": 0.167068749666214, + "learning_rate": 0.00019908475299326286, + "loss": 1.1021, + "step": 499 + }, + { + "epoch": 0.18, + "grad_norm": 0.17191778123378754, + "learning_rate": 0.00019908101157407112, + "loss": 1.085, + "step": 500 + }, + { + "epoch": 0.18, + "grad_norm": 0.17865735292434692, + "learning_rate": 0.00019907726255853547, + "loss": 1.118, + "step": 501 + }, + { + "epoch": 0.18, + "grad_norm": 0.1564328372478485, + "learning_rate": 0.0001990735059469434, + "loss": 1.0634, + "step": 502 + }, + { + "epoch": 0.18, + "grad_norm": 0.1716470867395401, + "learning_rate": 0.0001990697417395829, + "loss": 1.0566, + "step": 503 + }, + { + "epoch": 0.18, + "grad_norm": 0.1743009388446808, + "learning_rate": 0.00019906596993674258, + "loss": 1.0836, + "step": 504 + }, + { + "epoch": 0.18, + "grad_norm": 0.1646343320608139, + "learning_rate": 0.0001990621905387116, + "loss": 1.0982, + "step": 505 + }, + { + "epoch": 0.18, + "grad_norm": 0.18098647892475128, + "learning_rate": 0.00019905840354577972, + "loss": 1.1517, + "step": 506 + }, + { + "epoch": 0.18, + "grad_norm": 0.16818532347679138, + "learning_rate": 0.00019905460895823727, + "loss": 1.13, + "step": 507 + }, + { + "epoch": 0.18, + "grad_norm": 0.16616329550743103, + "learning_rate": 0.00019905080677637518, + "loss": 1.1478, + "step": 508 + }, + { + "epoch": 0.18, + "grad_norm": 0.16015742719173431, + "learning_rate": 0.000199046997000485, + "loss": 1.0683, + "step": 509 + }, + { + "epoch": 0.18, + "grad_norm": 0.16552738845348358, + "learning_rate": 0.00019904317963085876, + "loss": 1.0965, + "step": 510 + }, + { + "epoch": 0.18, + "grad_norm": 0.15932437777519226, + "learning_rate": 0.00019903935466778915, + "loss": 1.0809, + "step": 511 + }, + { + "epoch": 0.18, + "grad_norm": 0.157403826713562, + "learning_rate": 0.00019903552211156939, + "loss": 1.073, + "step": 512 + }, + { + "epoch": 0.18, + "grad_norm": 0.1659756451845169, + "learning_rate": 0.0001990316819624934, + "loss": 1.111, + "step": 513 + }, + { + "epoch": 0.18, + "grad_norm": 0.1708558350801468, + "learning_rate": 0.0001990278342208555, + "loss": 1.0917, + "step": 514 + }, + { + "epoch": 0.18, + "grad_norm": 0.1662391573190689, + "learning_rate": 0.00019902397888695077, + "loss": 1.1407, + "step": 515 + }, + { + "epoch": 0.18, + "grad_norm": 0.16887429356575012, + "learning_rate": 0.00019902011596107473, + "loss": 1.025, + "step": 516 + }, + { + "epoch": 0.18, + "grad_norm": 0.17284426093101501, + "learning_rate": 0.00019901624544352357, + "loss": 1.0788, + "step": 517 + }, + { + "epoch": 0.18, + "grad_norm": 0.1698935627937317, + "learning_rate": 0.00019901236733459404, + "loss": 1.1118, + "step": 518 + }, + { + "epoch": 0.18, + "grad_norm": 0.15765266120433807, + "learning_rate": 0.00019900848163458346, + "loss": 1.0623, + "step": 519 + }, + { + "epoch": 0.18, + "grad_norm": 0.1714356690645218, + "learning_rate": 0.00019900458834378972, + "loss": 1.1496, + "step": 520 + }, + { + "epoch": 0.18, + "grad_norm": 0.16561633348464966, + "learning_rate": 0.00019900068746251135, + "loss": 1.0381, + "step": 521 + }, + { + "epoch": 0.18, + "grad_norm": 0.16743022203445435, + "learning_rate": 0.00019899677899104737, + "loss": 1.0783, + "step": 522 + }, + { + "epoch": 0.18, + "grad_norm": 0.16489222645759583, + "learning_rate": 0.00019899286292969747, + "loss": 1.1166, + "step": 523 + }, + { + "epoch": 0.18, + "grad_norm": 0.1696144938468933, + "learning_rate": 0.0001989889392787619, + "loss": 1.1433, + "step": 524 + }, + { + "epoch": 0.18, + "grad_norm": 0.16742746531963348, + "learning_rate": 0.00019898500803854145, + "loss": 1.0809, + "step": 525 + }, + { + "epoch": 0.19, + "grad_norm": 0.1641719490289688, + "learning_rate": 0.00019898106920933755, + "loss": 1.1071, + "step": 526 + }, + { + "epoch": 0.19, + "grad_norm": 0.17350101470947266, + "learning_rate": 0.00019897712279145214, + "loss": 1.0923, + "step": 527 + }, + { + "epoch": 0.19, + "grad_norm": 0.17832672595977783, + "learning_rate": 0.0001989731687851878, + "loss": 1.1332, + "step": 528 + }, + { + "epoch": 0.19, + "grad_norm": 0.15399731695652008, + "learning_rate": 0.0001989692071908477, + "loss": 1.0644, + "step": 529 + }, + { + "epoch": 0.19, + "grad_norm": 0.16712157428264618, + "learning_rate": 0.00019896523800873555, + "loss": 1.1236, + "step": 530 + }, + { + "epoch": 0.19, + "grad_norm": 0.1874651312828064, + "learning_rate": 0.00019896126123915563, + "loss": 1.1011, + "step": 531 + }, + { + "epoch": 0.19, + "grad_norm": 0.17110246419906616, + "learning_rate": 0.00019895727688241287, + "loss": 1.0806, + "step": 532 + }, + { + "epoch": 0.19, + "grad_norm": 0.17345298826694489, + "learning_rate": 0.00019895328493881276, + "loss": 1.1803, + "step": 533 + }, + { + "epoch": 0.19, + "grad_norm": 0.1709088385105133, + "learning_rate": 0.0001989492854086613, + "loss": 1.1504, + "step": 534 + }, + { + "epoch": 0.19, + "grad_norm": 0.16704735159873962, + "learning_rate": 0.00019894527829226517, + "loss": 1.1459, + "step": 535 + }, + { + "epoch": 0.19, + "grad_norm": 0.1761869490146637, + "learning_rate": 0.00019894126358993157, + "loss": 1.0465, + "step": 536 + }, + { + "epoch": 0.19, + "grad_norm": 0.16007749736309052, + "learning_rate": 0.00019893724130196828, + "loss": 1.0763, + "step": 537 + }, + { + "epoch": 0.19, + "grad_norm": 0.1607150137424469, + "learning_rate": 0.00019893321142868377, + "loss": 1.0575, + "step": 538 + }, + { + "epoch": 0.19, + "grad_norm": 0.17952270805835724, + "learning_rate": 0.00019892917397038685, + "loss": 1.1514, + "step": 539 + }, + { + "epoch": 0.19, + "grad_norm": 0.165918231010437, + "learning_rate": 0.00019892512892738718, + "loss": 1.1026, + "step": 540 + }, + { + "epoch": 0.19, + "grad_norm": 0.17264124751091003, + "learning_rate": 0.00019892107629999485, + "loss": 1.0882, + "step": 541 + }, + { + "epoch": 0.19, + "grad_norm": 0.4207189977169037, + "learning_rate": 0.00019891701608852058, + "loss": 1.2084, + "step": 542 + }, + { + "epoch": 0.19, + "grad_norm": 0.22649544477462769, + "learning_rate": 0.00019891294829327562, + "loss": 1.0122, + "step": 543 + }, + { + "epoch": 0.19, + "grad_norm": 0.18632979691028595, + "learning_rate": 0.0001989088729145719, + "loss": 1.0788, + "step": 544 + }, + { + "epoch": 0.19, + "grad_norm": 0.16893143951892853, + "learning_rate": 0.00019890478995272183, + "loss": 1.1284, + "step": 545 + }, + { + "epoch": 0.19, + "grad_norm": 0.33213508129119873, + "learning_rate": 0.00019890069940803847, + "loss": 1.0338, + "step": 546 + }, + { + "epoch": 0.19, + "grad_norm": 0.18078738451004028, + "learning_rate": 0.00019889660128083537, + "loss": 1.1017, + "step": 547 + }, + { + "epoch": 0.19, + "grad_norm": 0.1705341935157776, + "learning_rate": 0.00019889249557142677, + "loss": 1.1644, + "step": 548 + }, + { + "epoch": 0.19, + "grad_norm": 0.1637420952320099, + "learning_rate": 0.00019888838228012746, + "loss": 1.1165, + "step": 549 + }, + { + "epoch": 0.19, + "grad_norm": 0.18197381496429443, + "learning_rate": 0.00019888426140725283, + "loss": 1.1492, + "step": 550 + }, + { + "epoch": 0.19, + "grad_norm": 0.1665923148393631, + "learning_rate": 0.00019888013295311875, + "loss": 1.0797, + "step": 551 + }, + { + "epoch": 0.19, + "grad_norm": 0.17134587466716766, + "learning_rate": 0.00019887599691804174, + "loss": 1.1146, + "step": 552 + }, + { + "epoch": 0.19, + "grad_norm": 0.16827426850795746, + "learning_rate": 0.00019887185330233893, + "loss": 1.0843, + "step": 553 + }, + { + "epoch": 0.2, + "grad_norm": 0.1690630316734314, + "learning_rate": 0.00019886770210632802, + "loss": 1.1285, + "step": 554 + }, + { + "epoch": 0.2, + "grad_norm": 0.1599999964237213, + "learning_rate": 0.00019886354333032723, + "loss": 1.0514, + "step": 555 + }, + { + "epoch": 0.2, + "grad_norm": 0.16263416409492493, + "learning_rate": 0.00019885937697465545, + "loss": 1.0951, + "step": 556 + }, + { + "epoch": 0.2, + "grad_norm": 0.1642608791589737, + "learning_rate": 0.00019885520303963204, + "loss": 1.0927, + "step": 557 + }, + { + "epoch": 0.2, + "grad_norm": 0.16561810672283173, + "learning_rate": 0.00019885102152557708, + "loss": 1.1361, + "step": 558 + }, + { + "epoch": 0.2, + "grad_norm": 0.16234217584133148, + "learning_rate": 0.00019884683243281116, + "loss": 1.0727, + "step": 559 + }, + { + "epoch": 0.2, + "grad_norm": 0.15540958940982819, + "learning_rate": 0.0001988426357616554, + "loss": 0.9542, + "step": 560 + }, + { + "epoch": 0.2, + "grad_norm": 0.16096945106983185, + "learning_rate": 0.00019883843151243156, + "loss": 1.0627, + "step": 561 + }, + { + "epoch": 0.2, + "grad_norm": 0.1687619537115097, + "learning_rate": 0.00019883421968546196, + "loss": 1.1358, + "step": 562 + }, + { + "epoch": 0.2, + "grad_norm": 0.16691409051418304, + "learning_rate": 0.00019883000028106954, + "loss": 1.0642, + "step": 563 + }, + { + "epoch": 0.2, + "grad_norm": 0.16787758469581604, + "learning_rate": 0.00019882577329957782, + "loss": 1.1038, + "step": 564 + }, + { + "epoch": 0.2, + "grad_norm": 0.16505680978298187, + "learning_rate": 0.00019882153874131083, + "loss": 1.0917, + "step": 565 + }, + { + "epoch": 0.2, + "grad_norm": 0.1684773862361908, + "learning_rate": 0.00019881729660659324, + "loss": 1.1094, + "step": 566 + }, + { + "epoch": 0.2, + "grad_norm": 0.16181319952011108, + "learning_rate": 0.00019881304689575025, + "loss": 1.1195, + "step": 567 + }, + { + "epoch": 0.2, + "grad_norm": 0.16680899262428284, + "learning_rate": 0.00019880878960910772, + "loss": 1.0675, + "step": 568 + }, + { + "epoch": 0.2, + "grad_norm": 0.16721315681934357, + "learning_rate": 0.00019880452474699205, + "loss": 1.0641, + "step": 569 + }, + { + "epoch": 0.2, + "grad_norm": 0.16467982530593872, + "learning_rate": 0.0001988002523097302, + "loss": 1.068, + "step": 570 + }, + { + "epoch": 0.2, + "grad_norm": 0.16453714668750763, + "learning_rate": 0.00019879597229764974, + "loss": 1.1099, + "step": 571 + }, + { + "epoch": 0.2, + "grad_norm": 0.17011506855487823, + "learning_rate": 0.0001987916847110788, + "loss": 1.0738, + "step": 572 + }, + { + "epoch": 0.2, + "grad_norm": 0.16118289530277252, + "learning_rate": 0.00019878738955034607, + "loss": 1.1012, + "step": 573 + }, + { + "epoch": 0.2, + "grad_norm": 0.16739653050899506, + "learning_rate": 0.00019878308681578095, + "loss": 1.0256, + "step": 574 + }, + { + "epoch": 0.2, + "grad_norm": 0.16780000925064087, + "learning_rate": 0.00019877877650771323, + "loss": 1.1269, + "step": 575 + }, + { + "epoch": 0.2, + "grad_norm": 0.16901536285877228, + "learning_rate": 0.0001987744586264734, + "loss": 1.0934, + "step": 576 + }, + { + "epoch": 0.2, + "grad_norm": 0.1620328426361084, + "learning_rate": 0.0001987701331723925, + "loss": 1.1137, + "step": 577 + }, + { + "epoch": 0.2, + "grad_norm": 0.1608792394399643, + "learning_rate": 0.00019876580014580215, + "loss": 1.0165, + "step": 578 + }, + { + "epoch": 0.2, + "grad_norm": 0.17747630178928375, + "learning_rate": 0.00019876145954703458, + "loss": 1.0936, + "step": 579 + }, + { + "epoch": 0.2, + "grad_norm": 0.16932570934295654, + "learning_rate": 0.00019875711137642258, + "loss": 1.1101, + "step": 580 + }, + { + "epoch": 0.2, + "grad_norm": 0.15958163142204285, + "learning_rate": 0.00019875275563429945, + "loss": 1.0829, + "step": 581 + }, + { + "epoch": 0.2, + "grad_norm": 0.16792955994606018, + "learning_rate": 0.00019874839232099918, + "loss": 1.1083, + "step": 582 + }, + { + "epoch": 0.21, + "grad_norm": 0.1686125099658966, + "learning_rate": 0.00019874402143685633, + "loss": 1.0504, + "step": 583 + }, + { + "epoch": 0.21, + "grad_norm": 0.158395916223526, + "learning_rate": 0.00019873964298220597, + "loss": 1.0783, + "step": 584 + }, + { + "epoch": 0.21, + "grad_norm": 0.15973375737667084, + "learning_rate": 0.00019873525695738374, + "loss": 1.093, + "step": 585 + }, + { + "epoch": 0.21, + "grad_norm": 0.16699771583080292, + "learning_rate": 0.00019873086336272603, + "loss": 1.0864, + "step": 586 + }, + { + "epoch": 0.21, + "grad_norm": 0.16269387304782867, + "learning_rate": 0.00019872646219856957, + "loss": 1.0638, + "step": 587 + }, + { + "epoch": 0.21, + "grad_norm": 0.16882598400115967, + "learning_rate": 0.00019872205346525186, + "loss": 1.1471, + "step": 588 + }, + { + "epoch": 0.21, + "grad_norm": 0.16338156163692474, + "learning_rate": 0.00019871763716311085, + "loss": 1.0887, + "step": 589 + }, + { + "epoch": 0.21, + "grad_norm": 0.1657753735780716, + "learning_rate": 0.0001987132132924852, + "loss": 1.1273, + "step": 590 + }, + { + "epoch": 0.21, + "grad_norm": 0.1670033186674118, + "learning_rate": 0.000198708781853714, + "loss": 1.1171, + "step": 591 + }, + { + "epoch": 0.21, + "grad_norm": 0.16076108813285828, + "learning_rate": 0.0001987043428471371, + "loss": 1.0601, + "step": 592 + }, + { + "epoch": 0.21, + "grad_norm": 0.1607861965894699, + "learning_rate": 0.00019869989627309474, + "loss": 1.0151, + "step": 593 + }, + { + "epoch": 0.21, + "grad_norm": 0.15327143669128418, + "learning_rate": 0.00019869544213192787, + "loss": 1.0779, + "step": 594 + }, + { + "epoch": 0.21, + "grad_norm": 0.16481177508831024, + "learning_rate": 0.00019869098042397796, + "loss": 1.0822, + "step": 595 + }, + { + "epoch": 0.21, + "grad_norm": 0.17603227496147156, + "learning_rate": 0.00019868651114958707, + "loss": 1.0934, + "step": 596 + }, + { + "epoch": 0.21, + "grad_norm": 0.15565873682498932, + "learning_rate": 0.0001986820343090979, + "loss": 0.9236, + "step": 597 + }, + { + "epoch": 0.21, + "grad_norm": 0.15814374387264252, + "learning_rate": 0.00019867754990285366, + "loss": 1.0454, + "step": 598 + }, + { + "epoch": 0.21, + "grad_norm": 0.17093122005462646, + "learning_rate": 0.00019867305793119816, + "loss": 1.0845, + "step": 599 + }, + { + "epoch": 0.21, + "grad_norm": 0.1707032173871994, + "learning_rate": 0.00019866855839447575, + "loss": 1.0129, + "step": 600 + }, + { + "epoch": 0.21, + "grad_norm": 0.1676967591047287, + "learning_rate": 0.00019866405129303146, + "loss": 1.0709, + "step": 601 + }, + { + "epoch": 0.21, + "grad_norm": 0.16182450950145721, + "learning_rate": 0.00019865953662721083, + "loss": 1.1096, + "step": 602 + }, + { + "epoch": 0.21, + "grad_norm": 0.16513295471668243, + "learning_rate": 0.00019865501439735993, + "loss": 1.1376, + "step": 603 + }, + { + "epoch": 0.21, + "grad_norm": 0.1725083738565445, + "learning_rate": 0.00019865048460382555, + "loss": 1.1235, + "step": 604 + }, + { + "epoch": 0.21, + "grad_norm": 0.174422025680542, + "learning_rate": 0.00019864594724695493, + "loss": 1.1195, + "step": 605 + }, + { + "epoch": 0.21, + "grad_norm": 0.16189318895339966, + "learning_rate": 0.00019864140232709598, + "loss": 1.0649, + "step": 606 + }, + { + "epoch": 0.21, + "grad_norm": 0.176430344581604, + "learning_rate": 0.0001986368498445971, + "loss": 1.1182, + "step": 607 + }, + { + "epoch": 0.21, + "grad_norm": 0.16489064693450928, + "learning_rate": 0.00019863228979980735, + "loss": 1.0936, + "step": 608 + }, + { + "epoch": 0.21, + "grad_norm": 0.16108497977256775, + "learning_rate": 0.00019862772219307636, + "loss": 1.1095, + "step": 609 + }, + { + "epoch": 0.21, + "grad_norm": 0.163023442029953, + "learning_rate": 0.00019862314702475425, + "loss": 1.0862, + "step": 610 + }, + { + "epoch": 0.22, + "grad_norm": 0.16625739634037018, + "learning_rate": 0.0001986185642951919, + "loss": 1.0695, + "step": 611 + }, + { + "epoch": 0.22, + "grad_norm": 0.18615350127220154, + "learning_rate": 0.0001986139740047405, + "loss": 1.1538, + "step": 612 + }, + { + "epoch": 0.22, + "grad_norm": 0.17098894715309143, + "learning_rate": 0.00019860937615375214, + "loss": 1.0959, + "step": 613 + }, + { + "epoch": 0.22, + "grad_norm": 0.1671309918165207, + "learning_rate": 0.00019860477074257923, + "loss": 1.0731, + "step": 614 + }, + { + "epoch": 0.22, + "grad_norm": 0.18721477687358856, + "learning_rate": 0.00019860015777157491, + "loss": 1.1827, + "step": 615 + }, + { + "epoch": 0.22, + "grad_norm": 0.16247648000717163, + "learning_rate": 0.00019859553724109282, + "loss": 1.0652, + "step": 616 + }, + { + "epoch": 0.22, + "grad_norm": 0.16883952915668488, + "learning_rate": 0.0001985909091514872, + "loss": 1.0913, + "step": 617 + }, + { + "epoch": 0.22, + "grad_norm": 0.16720356047153473, + "learning_rate": 0.0001985862735031129, + "loss": 1.0836, + "step": 618 + }, + { + "epoch": 0.22, + "grad_norm": 0.16027386486530304, + "learning_rate": 0.00019858163029632528, + "loss": 1.0332, + "step": 619 + }, + { + "epoch": 0.22, + "grad_norm": 0.15760108828544617, + "learning_rate": 0.00019857697953148037, + "loss": 1.0635, + "step": 620 + }, + { + "epoch": 0.22, + "grad_norm": 0.16750258207321167, + "learning_rate": 0.00019857232120893477, + "loss": 1.1103, + "step": 621 + }, + { + "epoch": 0.22, + "grad_norm": 0.16163256764411926, + "learning_rate": 0.0001985676553290455, + "loss": 1.0626, + "step": 622 + }, + { + "epoch": 0.22, + "grad_norm": 0.16259679198265076, + "learning_rate": 0.0001985629818921704, + "loss": 1.0135, + "step": 623 + }, + { + "epoch": 0.22, + "grad_norm": 0.16069941222667694, + "learning_rate": 0.0001985583008986678, + "loss": 1.0288, + "step": 624 + }, + { + "epoch": 0.22, + "grad_norm": 0.16990730166435242, + "learning_rate": 0.0001985536123488964, + "loss": 1.14, + "step": 625 + }, + { + "epoch": 0.22, + "grad_norm": 0.1745760291814804, + "learning_rate": 0.00019854891624321587, + "loss": 1.0704, + "step": 626 + }, + { + "epoch": 0.22, + "grad_norm": 0.16768278181552887, + "learning_rate": 0.00019854421258198613, + "loss": 1.0832, + "step": 627 + }, + { + "epoch": 0.22, + "grad_norm": 0.16163139045238495, + "learning_rate": 0.00019853950136556781, + "loss": 1.0877, + "step": 628 + }, + { + "epoch": 0.22, + "grad_norm": 0.16793616116046906, + "learning_rate": 0.00019853478259432213, + "loss": 1.1423, + "step": 629 + }, + { + "epoch": 0.22, + "grad_norm": 0.1579504758119583, + "learning_rate": 0.0001985300562686109, + "loss": 1.0608, + "step": 630 + }, + { + "epoch": 0.22, + "grad_norm": 0.17058435082435608, + "learning_rate": 0.00019852532238879645, + "loss": 1.0879, + "step": 631 + }, + { + "epoch": 0.22, + "grad_norm": 0.16249874234199524, + "learning_rate": 0.0001985205809552417, + "loss": 1.0558, + "step": 632 + }, + { + "epoch": 0.22, + "grad_norm": 0.160230353474617, + "learning_rate": 0.00019851583196831015, + "loss": 0.9504, + "step": 633 + }, + { + "epoch": 0.22, + "grad_norm": 0.163030743598938, + "learning_rate": 0.00019851107542836598, + "loss": 1.0638, + "step": 634 + }, + { + "epoch": 0.22, + "grad_norm": 0.16014380753040314, + "learning_rate": 0.00019850631133577377, + "loss": 1.0828, + "step": 635 + }, + { + "epoch": 0.22, + "grad_norm": 0.1663341075181961, + "learning_rate": 0.0001985015396908988, + "loss": 1.0808, + "step": 636 + }, + { + "epoch": 0.22, + "grad_norm": 0.16736695170402527, + "learning_rate": 0.00019849676049410694, + "loss": 1.0942, + "step": 637 + }, + { + "epoch": 0.22, + "grad_norm": 0.16381196677684784, + "learning_rate": 0.00019849197374576457, + "loss": 1.1072, + "step": 638 + }, + { + "epoch": 0.23, + "grad_norm": 0.16722634434700012, + "learning_rate": 0.00019848717944623872, + "loss": 1.0613, + "step": 639 + }, + { + "epoch": 0.23, + "grad_norm": 0.15986140072345734, + "learning_rate": 0.00019848237759589692, + "loss": 1.0266, + "step": 640 + }, + { + "epoch": 0.23, + "grad_norm": 0.17291495203971863, + "learning_rate": 0.0001984775681951073, + "loss": 1.0988, + "step": 641 + }, + { + "epoch": 0.23, + "grad_norm": 0.15927840769290924, + "learning_rate": 0.00019847275124423863, + "loss": 1.0066, + "step": 642 + }, + { + "epoch": 0.23, + "grad_norm": 0.16089984774589539, + "learning_rate": 0.00019846792674366016, + "loss": 1.0368, + "step": 643 + }, + { + "epoch": 0.23, + "grad_norm": 0.16961584985256195, + "learning_rate": 0.00019846309469374185, + "loss": 1.1053, + "step": 644 + }, + { + "epoch": 0.23, + "grad_norm": 0.16907954216003418, + "learning_rate": 0.0001984582550948541, + "loss": 1.1217, + "step": 645 + }, + { + "epoch": 0.23, + "grad_norm": 0.1658971607685089, + "learning_rate": 0.00019845340794736802, + "loss": 1.0813, + "step": 646 + }, + { + "epoch": 0.23, + "grad_norm": 0.18253681063652039, + "learning_rate": 0.00019844855325165518, + "loss": 1.0798, + "step": 647 + }, + { + "epoch": 0.23, + "grad_norm": 0.16371911764144897, + "learning_rate": 0.0001984436910080878, + "loss": 1.0753, + "step": 648 + }, + { + "epoch": 0.23, + "grad_norm": 0.1763499230146408, + "learning_rate": 0.00019843882121703863, + "loss": 1.1702, + "step": 649 + }, + { + "epoch": 0.23, + "grad_norm": 0.17335723340511322, + "learning_rate": 0.00019843394387888104, + "loss": 1.1431, + "step": 650 + }, + { + "epoch": 0.23, + "grad_norm": 0.16688063740730286, + "learning_rate": 0.00019842905899398897, + "loss": 1.0716, + "step": 651 + }, + { + "epoch": 0.23, + "grad_norm": 0.16677774488925934, + "learning_rate": 0.00019842416656273696, + "loss": 1.0497, + "step": 652 + }, + { + "epoch": 0.23, + "grad_norm": 0.16049520671367645, + "learning_rate": 0.00019841926658550007, + "loss": 1.0556, + "step": 653 + }, + { + "epoch": 0.23, + "grad_norm": 0.16821910440921783, + "learning_rate": 0.000198414359062654, + "loss": 1.0611, + "step": 654 + }, + { + "epoch": 0.23, + "grad_norm": 0.16483479738235474, + "learning_rate": 0.00019840944399457492, + "loss": 1.0531, + "step": 655 + }, + { + "epoch": 0.23, + "grad_norm": 0.17143693566322327, + "learning_rate": 0.00019840452138163977, + "loss": 1.0541, + "step": 656 + }, + { + "epoch": 0.23, + "grad_norm": 0.17825862765312195, + "learning_rate": 0.0001983995912242259, + "loss": 1.0862, + "step": 657 + }, + { + "epoch": 0.23, + "grad_norm": 0.16992264986038208, + "learning_rate": 0.0001983946535227113, + "loss": 1.1236, + "step": 658 + }, + { + "epoch": 0.23, + "grad_norm": 0.17565155029296875, + "learning_rate": 0.00019838970827747453, + "loss": 1.023, + "step": 659 + }, + { + "epoch": 0.23, + "grad_norm": 0.16197721660137177, + "learning_rate": 0.00019838475548889471, + "loss": 1.0602, + "step": 660 + }, + { + "epoch": 0.23, + "grad_norm": 0.157365620136261, + "learning_rate": 0.00019837979515735166, + "loss": 1.0197, + "step": 661 + }, + { + "epoch": 0.23, + "grad_norm": 0.16862985491752625, + "learning_rate": 0.00019837482728322553, + "loss": 1.1263, + "step": 662 + }, + { + "epoch": 0.23, + "grad_norm": 0.16315187513828278, + "learning_rate": 0.0001983698518668973, + "loss": 1.078, + "step": 663 + }, + { + "epoch": 0.23, + "grad_norm": 0.16445574164390564, + "learning_rate": 0.00019836486890874845, + "loss": 1.1159, + "step": 664 + }, + { + "epoch": 0.23, + "grad_norm": 0.1527933031320572, + "learning_rate": 0.00019835987840916092, + "loss": 1.0328, + "step": 665 + }, + { + "epoch": 0.23, + "grad_norm": 0.16762986779212952, + "learning_rate": 0.00019835488036851735, + "loss": 1.0463, + "step": 666 + }, + { + "epoch": 0.23, + "grad_norm": 0.17638851702213287, + "learning_rate": 0.00019834987478720096, + "loss": 1.0314, + "step": 667 + }, + { + "epoch": 0.24, + "grad_norm": 0.1704005002975464, + "learning_rate": 0.0001983448616655955, + "loss": 1.1023, + "step": 668 + }, + { + "epoch": 0.24, + "grad_norm": 0.1858178824186325, + "learning_rate": 0.00019833984100408531, + "loss": 1.0918, + "step": 669 + }, + { + "epoch": 0.24, + "grad_norm": 0.22606873512268066, + "learning_rate": 0.00019833481280305535, + "loss": 0.9917, + "step": 670 + }, + { + "epoch": 0.24, + "grad_norm": 0.16366678476333618, + "learning_rate": 0.00019832977706289108, + "loss": 1.0597, + "step": 671 + }, + { + "epoch": 0.24, + "grad_norm": 0.1654898226261139, + "learning_rate": 0.0001983247337839786, + "loss": 1.062, + "step": 672 + }, + { + "epoch": 0.24, + "grad_norm": 0.18031100928783417, + "learning_rate": 0.00019831968296670454, + "loss": 1.111, + "step": 673 + }, + { + "epoch": 0.24, + "grad_norm": 0.17027471959590912, + "learning_rate": 0.00019831462461145617, + "loss": 1.1478, + "step": 674 + }, + { + "epoch": 0.24, + "grad_norm": 0.16689349710941315, + "learning_rate": 0.0001983095587186213, + "loss": 1.0514, + "step": 675 + }, + { + "epoch": 0.24, + "grad_norm": 0.17473068833351135, + "learning_rate": 0.00019830448528858832, + "loss": 1.1135, + "step": 676 + }, + { + "epoch": 0.24, + "grad_norm": 0.15812206268310547, + "learning_rate": 0.00019829940432174617, + "loss": 1.06, + "step": 677 + }, + { + "epoch": 0.24, + "grad_norm": 0.16691286861896515, + "learning_rate": 0.00019829431581848446, + "loss": 1.0982, + "step": 678 + }, + { + "epoch": 0.24, + "grad_norm": 0.1686815768480301, + "learning_rate": 0.00019828921977919323, + "loss": 1.0948, + "step": 679 + }, + { + "epoch": 0.24, + "grad_norm": 0.1836208552122116, + "learning_rate": 0.0001982841162042633, + "loss": 1.1331, + "step": 680 + }, + { + "epoch": 0.24, + "grad_norm": 0.1622176170349121, + "learning_rate": 0.00019827900509408581, + "loss": 1.0369, + "step": 681 + }, + { + "epoch": 0.24, + "grad_norm": 0.1718926578760147, + "learning_rate": 0.0001982738864490527, + "loss": 1.0552, + "step": 682 + }, + { + "epoch": 0.24, + "grad_norm": 0.1771085262298584, + "learning_rate": 0.00019826876026955644, + "loss": 1.0502, + "step": 683 + }, + { + "epoch": 0.24, + "grad_norm": 0.16786454617977142, + "learning_rate": 0.00019826362655599, + "loss": 1.0486, + "step": 684 + }, + { + "epoch": 0.24, + "grad_norm": 0.16586992144584656, + "learning_rate": 0.00019825848530874692, + "loss": 1.0809, + "step": 685 + }, + { + "epoch": 0.24, + "grad_norm": 0.1600944846868515, + "learning_rate": 0.00019825333652822146, + "loss": 1.0243, + "step": 686 + }, + { + "epoch": 0.24, + "grad_norm": 0.1946248859167099, + "learning_rate": 0.00019824818021480832, + "loss": 1.0905, + "step": 687 + }, + { + "epoch": 0.24, + "grad_norm": 0.16281233727931976, + "learning_rate": 0.0001982430163689028, + "loss": 1.0344, + "step": 688 + }, + { + "epoch": 0.24, + "grad_norm": 0.16939066350460052, + "learning_rate": 0.00019823784499090088, + "loss": 1.0619, + "step": 689 + }, + { + "epoch": 0.24, + "grad_norm": 0.18849721550941467, + "learning_rate": 0.00019823266608119897, + "loss": 1.0294, + "step": 690 + }, + { + "epoch": 0.24, + "grad_norm": 0.17455913126468658, + "learning_rate": 0.00019822747964019416, + "loss": 1.0463, + "step": 691 + }, + { + "epoch": 0.24, + "grad_norm": 0.16168515384197235, + "learning_rate": 0.0001982222856682841, + "loss": 1.0988, + "step": 692 + }, + { + "epoch": 0.24, + "grad_norm": 0.17095373570919037, + "learning_rate": 0.00019821708416586692, + "loss": 1.0377, + "step": 693 + }, + { + "epoch": 0.24, + "grad_norm": 0.17848733067512512, + "learning_rate": 0.0001982118751333415, + "loss": 1.0138, + "step": 694 + }, + { + "epoch": 0.24, + "grad_norm": 0.1678147166967392, + "learning_rate": 0.00019820665857110716, + "loss": 1.0738, + "step": 695 + }, + { + "epoch": 0.25, + "grad_norm": 0.16244111955165863, + "learning_rate": 0.00019820143447956388, + "loss": 1.0178, + "step": 696 + }, + { + "epoch": 0.25, + "grad_norm": 0.17898660898208618, + "learning_rate": 0.00019819620285911212, + "loss": 1.0424, + "step": 697 + }, + { + "epoch": 0.25, + "grad_norm": 0.18440993130207062, + "learning_rate": 0.00019819096371015302, + "loss": 1.087, + "step": 698 + }, + { + "epoch": 0.25, + "grad_norm": 0.1611272543668747, + "learning_rate": 0.00019818571703308826, + "loss": 1.0659, + "step": 699 + }, + { + "epoch": 0.25, + "grad_norm": 0.17365390062332153, + "learning_rate": 0.00019818046282832005, + "loss": 1.0261, + "step": 700 + }, + { + "epoch": 0.25, + "grad_norm": 0.1745585799217224, + "learning_rate": 0.00019817520109625128, + "loss": 1.0753, + "step": 701 + }, + { + "epoch": 0.25, + "grad_norm": 0.16264259815216064, + "learning_rate": 0.00019816993183728533, + "loss": 1.1131, + "step": 702 + }, + { + "epoch": 0.25, + "grad_norm": 0.16973325610160828, + "learning_rate": 0.00019816465505182617, + "loss": 1.1309, + "step": 703 + }, + { + "epoch": 0.25, + "grad_norm": 0.16402168571949005, + "learning_rate": 0.00019815937074027837, + "loss": 1.0483, + "step": 704 + }, + { + "epoch": 0.25, + "grad_norm": 0.174351766705513, + "learning_rate": 0.00019815407890304707, + "loss": 1.1422, + "step": 705 + }, + { + "epoch": 0.25, + "grad_norm": 0.16474570333957672, + "learning_rate": 0.00019814877954053798, + "loss": 1.0539, + "step": 706 + }, + { + "epoch": 0.25, + "grad_norm": 0.16664201021194458, + "learning_rate": 0.00019814347265315738, + "loss": 1.0573, + "step": 707 + }, + { + "epoch": 0.25, + "grad_norm": 0.17523398995399475, + "learning_rate": 0.00019813815824131216, + "loss": 1.1203, + "step": 708 + }, + { + "epoch": 0.25, + "grad_norm": 0.16613712906837463, + "learning_rate": 0.0001981328363054098, + "loss": 1.0876, + "step": 709 + }, + { + "epoch": 0.25, + "grad_norm": 0.1733550727367401, + "learning_rate": 0.00019812750684585823, + "loss": 1.0538, + "step": 710 + }, + { + "epoch": 0.25, + "eval_loss": 1.0733253955841064, + "eval_runtime": 679.1234, + "eval_samples_per_second": 10.125, + "eval_steps_per_second": 5.062, + "step": 710 + }, + { + "epoch": 0.25, + "grad_norm": 0.16474869847297668, + "learning_rate": 0.00019812216986306612, + "loss": 1.1042, + "step": 711 + }, + { + "epoch": 0.25, + "grad_norm": 0.1642632931470871, + "learning_rate": 0.00019811682535744263, + "loss": 1.0718, + "step": 712 + }, + { + "epoch": 0.25, + "grad_norm": 0.17970983684062958, + "learning_rate": 0.0001981114733293975, + "loss": 1.0629, + "step": 713 + }, + { + "epoch": 0.25, + "grad_norm": 0.15759165585041046, + "learning_rate": 0.00019810611377934104, + "loss": 1.0588, + "step": 714 + }, + { + "epoch": 0.25, + "grad_norm": 0.17333929240703583, + "learning_rate": 0.00019810074670768423, + "loss": 1.0582, + "step": 715 + }, + { + "epoch": 0.25, + "grad_norm": 0.1946728527545929, + "learning_rate": 0.00019809537211483847, + "loss": 1.1664, + "step": 716 + }, + { + "epoch": 0.25, + "grad_norm": 0.1639917492866516, + "learning_rate": 0.00019808999000121586, + "loss": 1.1074, + "step": 717 + }, + { + "epoch": 0.25, + "grad_norm": 0.19192171096801758, + "learning_rate": 0.00019808460036722906, + "loss": 1.0592, + "step": 718 + }, + { + "epoch": 0.25, + "grad_norm": 0.1635810136795044, + "learning_rate": 0.00019807920321329123, + "loss": 1.038, + "step": 719 + }, + { + "epoch": 0.25, + "grad_norm": 0.16381768882274628, + "learning_rate": 0.0001980737985398162, + "loss": 1.0321, + "step": 720 + }, + { + "epoch": 0.25, + "grad_norm": 0.1705102175474167, + "learning_rate": 0.00019806838634721832, + "loss": 1.1072, + "step": 721 + }, + { + "epoch": 0.25, + "grad_norm": 0.1895795315504074, + "learning_rate": 0.0001980629666359125, + "loss": 1.0437, + "step": 722 + }, + { + "epoch": 0.25, + "grad_norm": 0.1661144644021988, + "learning_rate": 0.0001980575394063143, + "loss": 1.0303, + "step": 723 + }, + { + "epoch": 0.25, + "grad_norm": 0.18807817995548248, + "learning_rate": 0.0001980521046588398, + "loss": 1.1596, + "step": 724 + }, + { + "epoch": 0.26, + "grad_norm": 0.16420172154903412, + "learning_rate": 0.00019804666239390568, + "loss": 0.9703, + "step": 725 + }, + { + "epoch": 0.26, + "grad_norm": 0.1850784718990326, + "learning_rate": 0.0001980412126119292, + "loss": 1.1502, + "step": 726 + }, + { + "epoch": 0.26, + "grad_norm": 0.16995462775230408, + "learning_rate": 0.00019803575531332816, + "loss": 1.0608, + "step": 727 + }, + { + "epoch": 0.26, + "grad_norm": 0.16244123876094818, + "learning_rate": 0.00019803029049852096, + "loss": 1.0032, + "step": 728 + }, + { + "epoch": 0.26, + "grad_norm": 0.16180339455604553, + "learning_rate": 0.00019802481816792657, + "loss": 1.0867, + "step": 729 + }, + { + "epoch": 0.26, + "grad_norm": 0.16730904579162598, + "learning_rate": 0.00019801933832196456, + "loss": 1.0569, + "step": 730 + }, + { + "epoch": 0.26, + "grad_norm": 0.17155852913856506, + "learning_rate": 0.0001980138509610551, + "loss": 1.0967, + "step": 731 + }, + { + "epoch": 0.26, + "grad_norm": 0.17094744741916656, + "learning_rate": 0.0001980083560856188, + "loss": 1.0308, + "step": 732 + }, + { + "epoch": 0.26, + "grad_norm": 0.17406611144542694, + "learning_rate": 0.000198002853696077, + "loss": 1.0859, + "step": 733 + }, + { + "epoch": 0.26, + "grad_norm": 0.1751219928264618, + "learning_rate": 0.00019799734379285155, + "loss": 1.1167, + "step": 734 + }, + { + "epoch": 0.26, + "grad_norm": 0.16861020028591156, + "learning_rate": 0.0001979918263763649, + "loss": 1.0223, + "step": 735 + }, + { + "epoch": 0.26, + "grad_norm": 0.17369449138641357, + "learning_rate": 0.00019798630144704, + "loss": 1.0547, + "step": 736 + }, + { + "epoch": 0.26, + "grad_norm": 0.17756544053554535, + "learning_rate": 0.00019798076900530052, + "loss": 1.105, + "step": 737 + }, + { + "epoch": 0.26, + "grad_norm": 0.17094124853610992, + "learning_rate": 0.00019797522905157054, + "loss": 1.0508, + "step": 738 + }, + { + "epoch": 0.26, + "grad_norm": 0.17031005024909973, + "learning_rate": 0.00019796968158627488, + "loss": 1.0735, + "step": 739 + }, + { + "epoch": 0.26, + "grad_norm": 0.17707915604114532, + "learning_rate": 0.00019796412660983876, + "loss": 1.0652, + "step": 740 + }, + { + "epoch": 0.26, + "grad_norm": 0.1686302125453949, + "learning_rate": 0.00019795856412268813, + "loss": 1.0452, + "step": 741 + }, + { + "epoch": 0.26, + "grad_norm": 0.1696590632200241, + "learning_rate": 0.00019795299412524945, + "loss": 1.056, + "step": 742 + }, + { + "epoch": 0.26, + "grad_norm": 0.1714959293603897, + "learning_rate": 0.00019794741661794977, + "loss": 1.0193, + "step": 743 + }, + { + "epoch": 0.26, + "grad_norm": 0.16320788860321045, + "learning_rate": 0.00019794183160121666, + "loss": 1.088, + "step": 744 + }, + { + "epoch": 0.26, + "grad_norm": 0.15728294849395752, + "learning_rate": 0.00019793623907547834, + "loss": 1.0298, + "step": 745 + }, + { + "epoch": 0.26, + "grad_norm": 0.17356523871421814, + "learning_rate": 0.0001979306390411636, + "loss": 1.1152, + "step": 746 + }, + { + "epoch": 0.26, + "grad_norm": 0.17470648884773254, + "learning_rate": 0.00019792503149870173, + "loss": 1.0811, + "step": 747 + }, + { + "epoch": 0.26, + "grad_norm": 0.17432372272014618, + "learning_rate": 0.00019791941644852273, + "loss": 1.1509, + "step": 748 + }, + { + "epoch": 0.26, + "grad_norm": 0.16345620155334473, + "learning_rate": 0.000197913793891057, + "loss": 1.0062, + "step": 749 + }, + { + "epoch": 0.26, + "grad_norm": 0.17330077290534973, + "learning_rate": 0.00019790816382673568, + "loss": 0.9442, + "step": 750 + }, + { + "epoch": 0.26, + "grad_norm": 0.1774456650018692, + "learning_rate": 0.0001979025262559904, + "loss": 1.1019, + "step": 751 + }, + { + "epoch": 0.26, + "grad_norm": 0.1685924530029297, + "learning_rate": 0.00019789688117925335, + "loss": 1.116, + "step": 752 + }, + { + "epoch": 0.27, + "grad_norm": 0.17273302376270294, + "learning_rate": 0.00019789122859695737, + "loss": 1.0744, + "step": 753 + }, + { + "epoch": 0.27, + "grad_norm": 0.1677582710981369, + "learning_rate": 0.0001978855685095358, + "loss": 1.0655, + "step": 754 + }, + { + "epoch": 0.27, + "grad_norm": 0.16952186822891235, + "learning_rate": 0.0001978799009174226, + "loss": 1.0505, + "step": 755 + }, + { + "epoch": 0.27, + "grad_norm": 0.1648993194103241, + "learning_rate": 0.00019787422582105227, + "loss": 1.1172, + "step": 756 + }, + { + "epoch": 0.27, + "grad_norm": 0.1641387939453125, + "learning_rate": 0.00019786854322085997, + "loss": 1.0762, + "step": 757 + }, + { + "epoch": 0.27, + "grad_norm": 0.17605146765708923, + "learning_rate": 0.0001978628531172813, + "loss": 1.0586, + "step": 758 + }, + { + "epoch": 0.27, + "grad_norm": 0.17868219316005707, + "learning_rate": 0.0001978571555107526, + "loss": 1.0759, + "step": 759 + }, + { + "epoch": 0.27, + "grad_norm": 0.1822500377893448, + "learning_rate": 0.0001978514504017106, + "loss": 1.116, + "step": 760 + }, + { + "epoch": 0.27, + "grad_norm": 0.17165422439575195, + "learning_rate": 0.0001978457377905927, + "loss": 1.0491, + "step": 761 + }, + { + "epoch": 0.27, + "grad_norm": 0.1742304265499115, + "learning_rate": 0.000197840017677837, + "loss": 1.0565, + "step": 762 + }, + { + "epoch": 0.27, + "grad_norm": 0.16227282583713531, + "learning_rate": 0.0001978342900638819, + "loss": 1.067, + "step": 763 + }, + { + "epoch": 0.27, + "grad_norm": 0.17510762810707092, + "learning_rate": 0.0001978285549491666, + "loss": 1.0838, + "step": 764 + }, + { + "epoch": 0.27, + "grad_norm": 0.1696232557296753, + "learning_rate": 0.0001978228123341308, + "loss": 1.0111, + "step": 765 + }, + { + "epoch": 0.27, + "grad_norm": 0.1753813475370407, + "learning_rate": 0.00019781706221921473, + "loss": 1.0459, + "step": 766 + }, + { + "epoch": 0.27, + "grad_norm": 0.16556529700756073, + "learning_rate": 0.0001978113046048593, + "loss": 1.0401, + "step": 767 + }, + { + "epoch": 0.27, + "grad_norm": 0.16358666121959686, + "learning_rate": 0.0001978055394915059, + "loss": 1.0166, + "step": 768 + }, + { + "epoch": 0.27, + "grad_norm": 0.1697453260421753, + "learning_rate": 0.00019779976687959652, + "loss": 1.0732, + "step": 769 + }, + { + "epoch": 0.27, + "grad_norm": 0.17842546105384827, + "learning_rate": 0.00019779398676957375, + "loss": 1.0354, + "step": 770 + }, + { + "epoch": 0.27, + "grad_norm": 0.17023664712905884, + "learning_rate": 0.00019778819916188076, + "loss": 1.1196, + "step": 771 + }, + { + "epoch": 0.27, + "grad_norm": 0.18845778703689575, + "learning_rate": 0.00019778240405696123, + "loss": 1.084, + "step": 772 + }, + { + "epoch": 0.27, + "grad_norm": 0.17198894917964935, + "learning_rate": 0.0001977766014552595, + "loss": 1.0415, + "step": 773 + }, + { + "epoch": 0.27, + "grad_norm": 0.17236502468585968, + "learning_rate": 0.00019777079135722042, + "loss": 1.0656, + "step": 774 + }, + { + "epoch": 0.27, + "grad_norm": 0.1829390674829483, + "learning_rate": 0.00019776497376328944, + "loss": 1.0939, + "step": 775 + }, + { + "epoch": 0.27, + "grad_norm": 0.16156379878520966, + "learning_rate": 0.00019775914867391258, + "loss": 1.0654, + "step": 776 + }, + { + "epoch": 0.27, + "grad_norm": 0.17911884188652039, + "learning_rate": 0.0001977533160895365, + "loss": 1.0854, + "step": 777 + }, + { + "epoch": 0.27, + "grad_norm": 0.17780935764312744, + "learning_rate": 0.00019774747601060825, + "loss": 1.1216, + "step": 778 + }, + { + "epoch": 0.27, + "grad_norm": 0.16712267696857452, + "learning_rate": 0.0001977416284375757, + "loss": 1.0472, + "step": 779 + }, + { + "epoch": 0.27, + "grad_norm": 0.17137813568115234, + "learning_rate": 0.0001977357733708871, + "loss": 1.0768, + "step": 780 + }, + { + "epoch": 0.28, + "grad_norm": 0.16970902681350708, + "learning_rate": 0.00019772991081099136, + "loss": 1.1034, + "step": 781 + }, + { + "epoch": 0.28, + "grad_norm": 0.17829537391662598, + "learning_rate": 0.00019772404075833795, + "loss": 1.1081, + "step": 782 + }, + { + "epoch": 0.28, + "grad_norm": 0.1679857075214386, + "learning_rate": 0.00019771816321337692, + "loss": 1.0901, + "step": 783 + }, + { + "epoch": 0.28, + "grad_norm": 0.17669101059436798, + "learning_rate": 0.00019771227817655892, + "loss": 1.008, + "step": 784 + }, + { + "epoch": 0.28, + "grad_norm": 0.18435387313365936, + "learning_rate": 0.0001977063856483351, + "loss": 1.0802, + "step": 785 + }, + { + "epoch": 0.28, + "grad_norm": 0.16921114921569824, + "learning_rate": 0.00019770048562915723, + "loss": 1.0708, + "step": 786 + }, + { + "epoch": 0.28, + "grad_norm": 0.17006106674671173, + "learning_rate": 0.00019769457811947766, + "loss": 1.0267, + "step": 787 + }, + { + "epoch": 0.28, + "grad_norm": 0.19145065546035767, + "learning_rate": 0.00019768866311974933, + "loss": 1.1084, + "step": 788 + }, + { + "epoch": 0.28, + "grad_norm": 0.17966581881046295, + "learning_rate": 0.00019768274063042572, + "loss": 1.0757, + "step": 789 + }, + { + "epoch": 0.28, + "grad_norm": 0.16242194175720215, + "learning_rate": 0.00019767681065196085, + "loss": 1.06, + "step": 790 + }, + { + "epoch": 0.28, + "grad_norm": 0.17664171755313873, + "learning_rate": 0.00019767087318480945, + "loss": 1.0989, + "step": 791 + }, + { + "epoch": 0.28, + "grad_norm": 0.18222616612911224, + "learning_rate": 0.00019766492822942668, + "loss": 1.1169, + "step": 792 + }, + { + "epoch": 0.28, + "grad_norm": 0.17824456095695496, + "learning_rate": 0.0001976589757862683, + "loss": 1.0474, + "step": 793 + }, + { + "epoch": 0.28, + "grad_norm": 0.17561721801757812, + "learning_rate": 0.00019765301585579072, + "loss": 1.0891, + "step": 794 + }, + { + "epoch": 0.28, + "grad_norm": 0.1854829043149948, + "learning_rate": 0.00019764704843845086, + "loss": 1.0897, + "step": 795 + }, + { + "epoch": 0.28, + "grad_norm": 0.17442169785499573, + "learning_rate": 0.0001976410735347062, + "loss": 1.008, + "step": 796 + }, + { + "epoch": 0.28, + "grad_norm": 0.16896742582321167, + "learning_rate": 0.0001976350911450149, + "loss": 1.1197, + "step": 797 + }, + { + "epoch": 0.28, + "grad_norm": 0.1652253419160843, + "learning_rate": 0.00019762910126983556, + "loss": 1.0353, + "step": 798 + }, + { + "epoch": 0.28, + "grad_norm": 0.17013123631477356, + "learning_rate": 0.0001976231039096274, + "loss": 1.0953, + "step": 799 + }, + { + "epoch": 0.28, + "grad_norm": 0.16675332188606262, + "learning_rate": 0.00019761709906485026, + "loss": 1.0707, + "step": 800 + }, + { + "epoch": 0.28, + "grad_norm": 0.164289191365242, + "learning_rate": 0.00019761108673596448, + "loss": 1.035, + "step": 801 + }, + { + "epoch": 0.28, + "grad_norm": 0.1685546636581421, + "learning_rate": 0.00019760506692343107, + "loss": 1.0126, + "step": 802 + }, + { + "epoch": 0.28, + "grad_norm": 0.159869983792305, + "learning_rate": 0.00019759903962771156, + "loss": 1.0443, + "step": 803 + }, + { + "epoch": 0.28, + "grad_norm": 0.17488044500350952, + "learning_rate": 0.00019759300484926796, + "loss": 1.0821, + "step": 804 + }, + { + "epoch": 0.28, + "grad_norm": 0.18100354075431824, + "learning_rate": 0.00019758696258856303, + "loss": 1.0498, + "step": 805 + }, + { + "epoch": 0.28, + "grad_norm": 0.16927726566791534, + "learning_rate": 0.00019758091284606, + "loss": 1.0592, + "step": 806 + }, + { + "epoch": 0.28, + "grad_norm": 0.17606563866138458, + "learning_rate": 0.00019757485562222265, + "loss": 1.0358, + "step": 807 + }, + { + "epoch": 0.28, + "grad_norm": 0.1672973483800888, + "learning_rate": 0.00019756879091751544, + "loss": 0.9804, + "step": 808 + }, + { + "epoch": 0.28, + "grad_norm": 0.1743594855070114, + "learning_rate": 0.0001975627187324033, + "loss": 1.0395, + "step": 809 + }, + { + "epoch": 0.29, + "grad_norm": 0.17184039950370789, + "learning_rate": 0.0001975566390673518, + "loss": 1.0169, + "step": 810 + }, + { + "epoch": 0.29, + "grad_norm": 0.17724105715751648, + "learning_rate": 0.000197550551922827, + "loss": 0.9956, + "step": 811 + }, + { + "epoch": 0.29, + "grad_norm": 0.16138318181037903, + "learning_rate": 0.00019754445729929562, + "loss": 1.0382, + "step": 812 + }, + { + "epoch": 0.29, + "grad_norm": 0.18038204312324524, + "learning_rate": 0.00019753835519722494, + "loss": 1.0985, + "step": 813 + }, + { + "epoch": 0.29, + "grad_norm": 0.17275594174861908, + "learning_rate": 0.0001975322456170828, + "loss": 1.0409, + "step": 814 + }, + { + "epoch": 0.29, + "grad_norm": 0.17210884392261505, + "learning_rate": 0.00019752612855933755, + "loss": 1.0578, + "step": 815 + }, + { + "epoch": 0.29, + "grad_norm": 0.15895509719848633, + "learning_rate": 0.00019752000402445825, + "loss": 0.9716, + "step": 816 + }, + { + "epoch": 0.29, + "grad_norm": 0.1858808845281601, + "learning_rate": 0.00019751387201291442, + "loss": 1.0542, + "step": 817 + }, + { + "epoch": 0.29, + "grad_norm": 0.16655674576759338, + "learning_rate": 0.00019750773252517618, + "loss": 1.0328, + "step": 818 + }, + { + "epoch": 0.29, + "grad_norm": 0.17627952992916107, + "learning_rate": 0.00019750158556171426, + "loss": 1.0429, + "step": 819 + }, + { + "epoch": 0.29, + "grad_norm": 0.16836844384670258, + "learning_rate": 0.0001974954311229999, + "loss": 1.0179, + "step": 820 + }, + { + "epoch": 0.29, + "grad_norm": 0.17369742691516876, + "learning_rate": 0.000197489269209505, + "loss": 1.0074, + "step": 821 + }, + { + "epoch": 0.29, + "grad_norm": 0.17023475468158722, + "learning_rate": 0.0001974830998217019, + "loss": 1.0429, + "step": 822 + }, + { + "epoch": 0.29, + "grad_norm": 0.18552331626415253, + "learning_rate": 0.00019747692296006366, + "loss": 1.1223, + "step": 823 + }, + { + "epoch": 0.29, + "grad_norm": 0.18040619790554047, + "learning_rate": 0.00019747073862506383, + "loss": 1.0913, + "step": 824 + }, + { + "epoch": 0.29, + "grad_norm": 0.1917116492986679, + "learning_rate": 0.00019746454681717656, + "loss": 1.0605, + "step": 825 + }, + { + "epoch": 0.29, + "grad_norm": 0.18105146288871765, + "learning_rate": 0.00019745834753687652, + "loss": 1.0573, + "step": 826 + }, + { + "epoch": 0.29, + "grad_norm": 0.16846537590026855, + "learning_rate": 0.00019745214078463908, + "loss": 1.0284, + "step": 827 + }, + { + "epoch": 0.29, + "grad_norm": 0.18332545459270477, + "learning_rate": 0.00019744592656094004, + "loss": 1.0732, + "step": 828 + }, + { + "epoch": 0.29, + "grad_norm": 0.18125556409358978, + "learning_rate": 0.00019743970486625582, + "loss": 0.9678, + "step": 829 + }, + { + "epoch": 0.29, + "grad_norm": 0.16851511597633362, + "learning_rate": 0.00019743347570106348, + "loss": 1.0347, + "step": 830 + }, + { + "epoch": 0.29, + "grad_norm": 0.18086205422878265, + "learning_rate": 0.00019742723906584052, + "loss": 1.1649, + "step": 831 + }, + { + "epoch": 0.29, + "grad_norm": 0.176468163728714, + "learning_rate": 0.0001974209949610652, + "loss": 1.0727, + "step": 832 + }, + { + "epoch": 0.29, + "grad_norm": 0.17329242825508118, + "learning_rate": 0.00019741474338721613, + "loss": 1.0446, + "step": 833 + }, + { + "epoch": 0.29, + "grad_norm": 0.186629056930542, + "learning_rate": 0.00019740848434477267, + "loss": 1.028, + "step": 834 + }, + { + "epoch": 0.29, + "grad_norm": 0.17307981848716736, + "learning_rate": 0.00019740221783421467, + "loss": 1.0449, + "step": 835 + }, + { + "epoch": 0.29, + "grad_norm": 0.17103774845600128, + "learning_rate": 0.00019739594385602256, + "loss": 1.1006, + "step": 836 + }, + { + "epoch": 0.29, + "grad_norm": 0.18144570291042328, + "learning_rate": 0.00019738966241067738, + "loss": 1.1385, + "step": 837 + }, + { + "epoch": 0.3, + "grad_norm": 0.17040754854679108, + "learning_rate": 0.00019738337349866071, + "loss": 1.0435, + "step": 838 + }, + { + "epoch": 0.3, + "grad_norm": 0.16767257452011108, + "learning_rate": 0.0001973770771204547, + "loss": 1.0754, + "step": 839 + }, + { + "epoch": 0.3, + "grad_norm": 0.1669158935546875, + "learning_rate": 0.00019737077327654204, + "loss": 1.0454, + "step": 840 + }, + { + "epoch": 0.3, + "grad_norm": 0.18039458990097046, + "learning_rate": 0.00019736446196740612, + "loss": 1.083, + "step": 841 + }, + { + "epoch": 0.3, + "grad_norm": 0.18282979726791382, + "learning_rate": 0.00019735814319353078, + "loss": 1.065, + "step": 842 + }, + { + "epoch": 0.3, + "grad_norm": 0.18812443315982819, + "learning_rate": 0.00019735181695540043, + "loss": 1.0857, + "step": 843 + }, + { + "epoch": 0.3, + "grad_norm": 0.17648079991340637, + "learning_rate": 0.00019734548325350013, + "loss": 1.0887, + "step": 844 + }, + { + "epoch": 0.3, + "grad_norm": 0.18308205902576447, + "learning_rate": 0.00019733914208831547, + "loss": 1.0436, + "step": 845 + }, + { + "epoch": 0.3, + "grad_norm": 0.1741240918636322, + "learning_rate": 0.0001973327934603326, + "loss": 0.9797, + "step": 846 + }, + { + "epoch": 0.3, + "grad_norm": 0.17795363068580627, + "learning_rate": 0.00019732643737003827, + "loss": 0.9717, + "step": 847 + }, + { + "epoch": 0.3, + "grad_norm": 0.18939168751239777, + "learning_rate": 0.00019732007381791978, + "loss": 1.1139, + "step": 848 + }, + { + "epoch": 0.3, + "grad_norm": 0.17296737432479858, + "learning_rate": 0.00019731370280446497, + "loss": 1.0673, + "step": 849 + }, + { + "epoch": 0.3, + "grad_norm": 0.16962456703186035, + "learning_rate": 0.00019730732433016236, + "loss": 1.0461, + "step": 850 + }, + { + "epoch": 0.3, + "grad_norm": 0.170401930809021, + "learning_rate": 0.00019730093839550098, + "loss": 1.0249, + "step": 851 + }, + { + "epoch": 0.3, + "grad_norm": 0.1670999675989151, + "learning_rate": 0.00019729454500097036, + "loss": 1.0277, + "step": 852 + }, + { + "epoch": 0.3, + "grad_norm": 0.17348629236221313, + "learning_rate": 0.00019728814414706074, + "loss": 1.1394, + "step": 853 + }, + { + "epoch": 0.3, + "grad_norm": 0.1759617030620575, + "learning_rate": 0.0001972817358342628, + "loss": 1.0365, + "step": 854 + }, + { + "epoch": 0.3, + "grad_norm": 0.16651113331317902, + "learning_rate": 0.00019727532006306788, + "loss": 1.0067, + "step": 855 + }, + { + "epoch": 0.3, + "grad_norm": 0.1713470071554184, + "learning_rate": 0.00019726889683396786, + "loss": 1.0365, + "step": 856 + }, + { + "epoch": 0.3, + "grad_norm": 0.1733425259590149, + "learning_rate": 0.0001972624661474552, + "loss": 1.1274, + "step": 857 + }, + { + "epoch": 0.3, + "grad_norm": 0.16293053328990936, + "learning_rate": 0.00019725602800402294, + "loss": 1.0848, + "step": 858 + }, + { + "epoch": 0.3, + "grad_norm": 0.1699129045009613, + "learning_rate": 0.00019724958240416467, + "loss": 1.0418, + "step": 859 + }, + { + "epoch": 0.3, + "grad_norm": 0.1770995557308197, + "learning_rate": 0.00019724312934837454, + "loss": 1.1251, + "step": 860 + }, + { + "epoch": 0.3, + "grad_norm": 0.17217662930488586, + "learning_rate": 0.00019723666883714733, + "loss": 1.075, + "step": 861 + }, + { + "epoch": 0.3, + "grad_norm": 0.18040764331817627, + "learning_rate": 0.00019723020087097833, + "loss": 1.0622, + "step": 862 + }, + { + "epoch": 0.3, + "grad_norm": 0.16920557618141174, + "learning_rate": 0.00019722372545036342, + "loss": 1.0461, + "step": 863 + }, + { + "epoch": 0.3, + "grad_norm": 0.17057543992996216, + "learning_rate": 0.00019721724257579907, + "loss": 1.0893, + "step": 864 + }, + { + "epoch": 0.3, + "grad_norm": 0.18235866725444794, + "learning_rate": 0.0001972107522477823, + "loss": 1.026, + "step": 865 + }, + { + "epoch": 0.3, + "grad_norm": 0.18047012388706207, + "learning_rate": 0.00019720425446681077, + "loss": 1.0588, + "step": 866 + }, + { + "epoch": 0.31, + "grad_norm": 0.1593666523694992, + "learning_rate": 0.00019719774923338253, + "loss": 0.9946, + "step": 867 + }, + { + "epoch": 0.31, + "grad_norm": 0.17716600000858307, + "learning_rate": 0.00019719123654799646, + "loss": 1.0427, + "step": 868 + }, + { + "epoch": 0.31, + "grad_norm": 0.168783500790596, + "learning_rate": 0.00019718471641115177, + "loss": 1.0223, + "step": 869 + }, + { + "epoch": 0.31, + "grad_norm": 0.18226021528244019, + "learning_rate": 0.00019717818882334837, + "loss": 1.0657, + "step": 870 + }, + { + "epoch": 0.31, + "grad_norm": 0.17665214836597443, + "learning_rate": 0.00019717165378508678, + "loss": 1.0003, + "step": 871 + }, + { + "epoch": 0.31, + "grad_norm": 0.17209143936634064, + "learning_rate": 0.00019716511129686795, + "loss": 1.0387, + "step": 872 + }, + { + "epoch": 0.31, + "grad_norm": 0.17868731915950775, + "learning_rate": 0.00019715856135919352, + "loss": 1.0744, + "step": 873 + }, + { + "epoch": 0.31, + "grad_norm": 0.16554374992847443, + "learning_rate": 0.00019715200397256563, + "loss": 1.049, + "step": 874 + }, + { + "epoch": 0.31, + "grad_norm": 0.15895216166973114, + "learning_rate": 0.00019714543913748706, + "loss": 0.9866, + "step": 875 + }, + { + "epoch": 0.31, + "grad_norm": 0.1729590743780136, + "learning_rate": 0.0001971388668544611, + "loss": 1.0204, + "step": 876 + }, + { + "epoch": 0.31, + "grad_norm": 0.17698253691196442, + "learning_rate": 0.00019713228712399167, + "loss": 1.0451, + "step": 877 + }, + { + "epoch": 0.31, + "grad_norm": 0.18253202736377716, + "learning_rate": 0.00019712569994658315, + "loss": 1.1374, + "step": 878 + }, + { + "epoch": 0.31, + "grad_norm": 0.17986972630023956, + "learning_rate": 0.00019711910532274062, + "loss": 1.0667, + "step": 879 + }, + { + "epoch": 0.31, + "grad_norm": 0.18846715986728668, + "learning_rate": 0.00019711250325296967, + "loss": 1.1056, + "step": 880 + }, + { + "epoch": 0.31, + "grad_norm": 0.17018909752368927, + "learning_rate": 0.00019710589373777646, + "loss": 0.9997, + "step": 881 + }, + { + "epoch": 0.31, + "grad_norm": 0.1827511042356491, + "learning_rate": 0.00019709927677766774, + "loss": 1.0255, + "step": 882 + }, + { + "epoch": 0.31, + "grad_norm": 0.18419188261032104, + "learning_rate": 0.0001970926523731508, + "loss": 1.0579, + "step": 883 + }, + { + "epoch": 0.31, + "grad_norm": 0.17267294228076935, + "learning_rate": 0.00019708602052473357, + "loss": 1.0677, + "step": 884 + }, + { + "epoch": 0.31, + "grad_norm": 0.1671350598335266, + "learning_rate": 0.00019707938123292442, + "loss": 1.0914, + "step": 885 + }, + { + "epoch": 0.31, + "grad_norm": 0.16826584935188293, + "learning_rate": 0.00019707273449823243, + "loss": 1.0836, + "step": 886 + }, + { + "epoch": 0.31, + "grad_norm": 0.16998471319675446, + "learning_rate": 0.00019706608032116714, + "loss": 1.0238, + "step": 887 + }, + { + "epoch": 0.31, + "grad_norm": 0.1634981334209442, + "learning_rate": 0.0001970594187022388, + "loss": 1.0445, + "step": 888 + }, + { + "epoch": 0.31, + "grad_norm": 0.16182340681552887, + "learning_rate": 0.00019705274964195807, + "loss": 0.9969, + "step": 889 + }, + { + "epoch": 0.31, + "grad_norm": 0.17049826681613922, + "learning_rate": 0.0001970460731408363, + "loss": 1.0772, + "step": 890 + }, + { + "epoch": 0.31, + "grad_norm": 0.1803371161222458, + "learning_rate": 0.0001970393891993853, + "loss": 1.07, + "step": 891 + }, + { + "epoch": 0.31, + "grad_norm": 0.17657095193862915, + "learning_rate": 0.00019703269781811756, + "loss": 1.0898, + "step": 892 + }, + { + "epoch": 0.31, + "grad_norm": 0.1745598018169403, + "learning_rate": 0.00019702599899754607, + "loss": 1.0386, + "step": 893 + }, + { + "epoch": 0.31, + "grad_norm": 0.1789242923259735, + "learning_rate": 0.00019701929273818446, + "loss": 1.0609, + "step": 894 + }, + { + "epoch": 0.32, + "grad_norm": 0.17516635358333588, + "learning_rate": 0.00019701257904054686, + "loss": 1.0211, + "step": 895 + }, + { + "epoch": 0.32, + "grad_norm": 0.17327910661697388, + "learning_rate": 0.000197005857905148, + "loss": 1.0538, + "step": 896 + }, + { + "epoch": 0.32, + "grad_norm": 0.1742434799671173, + "learning_rate": 0.00019699912933250315, + "loss": 1.085, + "step": 897 + }, + { + "epoch": 0.32, + "grad_norm": 0.17944319546222687, + "learning_rate": 0.0001969923933231282, + "loss": 0.9968, + "step": 898 + }, + { + "epoch": 0.32, + "grad_norm": 0.17509445548057556, + "learning_rate": 0.00019698564987753958, + "loss": 1.0612, + "step": 899 + }, + { + "epoch": 0.32, + "grad_norm": 0.18131905794143677, + "learning_rate": 0.0001969788989962543, + "loss": 1.0615, + "step": 900 + }, + { + "epoch": 0.32, + "grad_norm": 0.17456522583961487, + "learning_rate": 0.00019697214067978999, + "loss": 1.0396, + "step": 901 + }, + { + "epoch": 0.32, + "grad_norm": 0.18266192078590393, + "learning_rate": 0.00019696537492866468, + "loss": 1.0479, + "step": 902 + }, + { + "epoch": 0.32, + "grad_norm": 0.1651952564716339, + "learning_rate": 0.00019695860174339714, + "loss": 1.0023, + "step": 903 + }, + { + "epoch": 0.32, + "grad_norm": 0.1714158058166504, + "learning_rate": 0.0001969518211245067, + "loss": 1.0011, + "step": 904 + }, + { + "epoch": 0.32, + "grad_norm": 0.18282106518745422, + "learning_rate": 0.00019694503307251317, + "loss": 1.1196, + "step": 905 + }, + { + "epoch": 0.32, + "grad_norm": 0.16715645790100098, + "learning_rate": 0.000196938237587937, + "loss": 1.0366, + "step": 906 + }, + { + "epoch": 0.32, + "grad_norm": 0.18058747053146362, + "learning_rate": 0.00019693143467129916, + "loss": 1.0491, + "step": 907 + }, + { + "epoch": 0.32, + "grad_norm": 0.18357785046100616, + "learning_rate": 0.00019692462432312124, + "loss": 1.0959, + "step": 908 + }, + { + "epoch": 0.32, + "grad_norm": 0.16413332521915436, + "learning_rate": 0.00019691780654392535, + "loss": 0.9923, + "step": 909 + }, + { + "epoch": 0.32, + "grad_norm": 0.17373456060886383, + "learning_rate": 0.00019691098133423423, + "loss": 1.0187, + "step": 910 + }, + { + "epoch": 0.32, + "grad_norm": 0.18135735392570496, + "learning_rate": 0.00019690414869457117, + "loss": 1.0138, + "step": 911 + }, + { + "epoch": 0.32, + "grad_norm": 0.16327910125255585, + "learning_rate": 0.00019689730862545995, + "loss": 1.01, + "step": 912 + }, + { + "epoch": 0.32, + "grad_norm": 0.1778937131166458, + "learning_rate": 0.00019689046112742503, + "loss": 1.0789, + "step": 913 + }, + { + "epoch": 0.32, + "grad_norm": 0.16726310551166534, + "learning_rate": 0.00019688360620099133, + "loss": 0.986, + "step": 914 + }, + { + "epoch": 0.32, + "grad_norm": 0.1793222278356552, + "learning_rate": 0.00019687674384668451, + "loss": 1.1003, + "step": 915 + }, + { + "epoch": 0.32, + "grad_norm": 0.16796770691871643, + "learning_rate": 0.00019686987406503063, + "loss": 1.0466, + "step": 916 + }, + { + "epoch": 0.32, + "grad_norm": 0.16727067530155182, + "learning_rate": 0.00019686299685655638, + "loss": 0.987, + "step": 917 + }, + { + "epoch": 0.32, + "grad_norm": 0.17687837779521942, + "learning_rate": 0.00019685611222178905, + "loss": 1.0444, + "step": 918 + }, + { + "epoch": 0.32, + "grad_norm": 0.1811404675245285, + "learning_rate": 0.0001968492201612564, + "loss": 1.0468, + "step": 919 + }, + { + "epoch": 0.32, + "grad_norm": 0.1711091250181198, + "learning_rate": 0.00019684232067548695, + "loss": 1.0123, + "step": 920 + }, + { + "epoch": 0.32, + "grad_norm": 0.1783980429172516, + "learning_rate": 0.00019683541376500955, + "loss": 1.0674, + "step": 921 + }, + { + "epoch": 0.32, + "grad_norm": 0.18069174885749817, + "learning_rate": 0.00019682849943035384, + "loss": 1.067, + "step": 922 + }, + { + "epoch": 0.33, + "grad_norm": 0.1692749261856079, + "learning_rate": 0.00019682157767204986, + "loss": 1.012, + "step": 923 + }, + { + "epoch": 0.33, + "grad_norm": 0.17725811898708344, + "learning_rate": 0.00019681464849062832, + "loss": 0.9682, + "step": 924 + }, + { + "epoch": 0.33, + "grad_norm": 0.17122116684913635, + "learning_rate": 0.00019680771188662044, + "loss": 1.0045, + "step": 925 + }, + { + "epoch": 0.33, + "grad_norm": 0.17642873525619507, + "learning_rate": 0.00019680076786055806, + "loss": 1.0636, + "step": 926 + }, + { + "epoch": 0.33, + "grad_norm": 0.19407136738300323, + "learning_rate": 0.00019679381641297353, + "loss": 1.1208, + "step": 927 + }, + { + "epoch": 0.33, + "grad_norm": 0.17200174927711487, + "learning_rate": 0.00019678685754439987, + "loss": 1.0143, + "step": 928 + }, + { + "epoch": 0.33, + "grad_norm": 0.17304272949695587, + "learning_rate": 0.00019677989125537054, + "loss": 0.9638, + "step": 929 + }, + { + "epoch": 0.33, + "grad_norm": 0.18978925049304962, + "learning_rate": 0.00019677291754641965, + "loss": 1.007, + "step": 930 + }, + { + "epoch": 0.33, + "grad_norm": 0.17057448625564575, + "learning_rate": 0.0001967659364180819, + "loss": 1.0672, + "step": 931 + }, + { + "epoch": 0.33, + "grad_norm": 0.17902058362960815, + "learning_rate": 0.00019675894787089243, + "loss": 1.0503, + "step": 932 + }, + { + "epoch": 0.33, + "grad_norm": 0.19893060624599457, + "learning_rate": 0.00019675195190538714, + "loss": 1.07, + "step": 933 + }, + { + "epoch": 0.33, + "grad_norm": 0.16946718096733093, + "learning_rate": 0.0001967449485221023, + "loss": 1.0448, + "step": 934 + }, + { + "epoch": 0.33, + "grad_norm": 0.18813353776931763, + "learning_rate": 0.00019673793772157492, + "loss": 1.1135, + "step": 935 + }, + { + "epoch": 0.33, + "grad_norm": 0.1719808280467987, + "learning_rate": 0.00019673091950434248, + "loss": 1.0385, + "step": 936 + }, + { + "epoch": 0.33, + "grad_norm": 0.179408997297287, + "learning_rate": 0.00019672389387094302, + "loss": 1.0326, + "step": 937 + }, + { + "epoch": 0.33, + "grad_norm": 0.1784079670906067, + "learning_rate": 0.00019671686082191526, + "loss": 1.0154, + "step": 938 + }, + { + "epoch": 0.33, + "grad_norm": 0.19028711318969727, + "learning_rate": 0.00019670982035779834, + "loss": 1.0589, + "step": 939 + }, + { + "epoch": 0.33, + "grad_norm": 0.1741345077753067, + "learning_rate": 0.00019670277247913205, + "loss": 1.0797, + "step": 940 + }, + { + "epoch": 0.33, + "grad_norm": 0.18272146582603455, + "learning_rate": 0.00019669571718645675, + "loss": 1.0764, + "step": 941 + }, + { + "epoch": 0.33, + "grad_norm": 0.17313426733016968, + "learning_rate": 0.00019668865448031335, + "loss": 1.0728, + "step": 942 + }, + { + "epoch": 0.33, + "grad_norm": 0.17928192019462585, + "learning_rate": 0.00019668158436124332, + "loss": 1.0319, + "step": 943 + }, + { + "epoch": 0.33, + "grad_norm": 0.16490381956100464, + "learning_rate": 0.00019667450682978876, + "loss": 1.0106, + "step": 944 + }, + { + "epoch": 0.33, + "grad_norm": 0.1769440919160843, + "learning_rate": 0.00019666742188649225, + "loss": 1.0644, + "step": 945 + }, + { + "epoch": 0.33, + "grad_norm": 0.1783764362335205, + "learning_rate": 0.000196660329531897, + "loss": 1.0492, + "step": 946 + }, + { + "epoch": 0.33, + "grad_norm": 0.18292124569416046, + "learning_rate": 0.0001966532297665467, + "loss": 1.0893, + "step": 947 + }, + { + "epoch": 0.33, + "grad_norm": 0.16674058139324188, + "learning_rate": 0.00019664612259098578, + "loss": 1.0172, + "step": 948 + }, + { + "epoch": 0.33, + "grad_norm": 0.17283639311790466, + "learning_rate": 0.00019663900800575906, + "loss": 1.0527, + "step": 949 + }, + { + "epoch": 0.33, + "grad_norm": 0.1824640929698944, + "learning_rate": 0.00019663188601141203, + "loss": 1.1194, + "step": 950 + }, + { + "epoch": 0.33, + "grad_norm": 0.17792586982250214, + "learning_rate": 0.00019662475660849066, + "loss": 1.0048, + "step": 951 + }, + { + "epoch": 0.34, + "grad_norm": 0.1702018827199936, + "learning_rate": 0.00019661761979754166, + "loss": 1.0611, + "step": 952 + }, + { + "epoch": 0.34, + "grad_norm": 0.16764649748802185, + "learning_rate": 0.00019661047557911212, + "loss": 1.0499, + "step": 953 + }, + { + "epoch": 0.34, + "grad_norm": 0.1887722909450531, + "learning_rate": 0.00019660332395374979, + "loss": 1.0431, + "step": 954 + }, + { + "epoch": 0.34, + "grad_norm": 0.1827782541513443, + "learning_rate": 0.00019659616492200295, + "loss": 1.074, + "step": 955 + }, + { + "epoch": 0.34, + "grad_norm": 0.17926305532455444, + "learning_rate": 0.00019658899848442048, + "loss": 1.0444, + "step": 956 + }, + { + "epoch": 0.34, + "grad_norm": 0.16336272656917572, + "learning_rate": 0.00019658182464155183, + "loss": 0.9583, + "step": 957 + }, + { + "epoch": 0.34, + "grad_norm": 0.1775018870830536, + "learning_rate": 0.000196574643393947, + "loss": 0.9603, + "step": 958 + }, + { + "epoch": 0.34, + "grad_norm": 0.18180976808071136, + "learning_rate": 0.00019656745474215656, + "loss": 1.0505, + "step": 959 + }, + { + "epoch": 0.34, + "grad_norm": 0.1839762032032013, + "learning_rate": 0.00019656025868673165, + "loss": 1.0521, + "step": 960 + }, + { + "epoch": 0.34, + "grad_norm": 0.18608036637306213, + "learning_rate": 0.00019655305522822399, + "loss": 1.0153, + "step": 961 + }, + { + "epoch": 0.34, + "grad_norm": 0.17250166833400726, + "learning_rate": 0.0001965458443671858, + "loss": 1.0238, + "step": 962 + }, + { + "epoch": 0.34, + "grad_norm": 0.17518852651119232, + "learning_rate": 0.00019653862610417, + "loss": 0.9844, + "step": 963 + }, + { + "epoch": 0.34, + "grad_norm": 0.18602406978607178, + "learning_rate": 0.00019653140043972994, + "loss": 1.0637, + "step": 964 + }, + { + "epoch": 0.34, + "grad_norm": 0.16935952007770538, + "learning_rate": 0.0001965241673744196, + "loss": 1.0002, + "step": 965 + }, + { + "epoch": 0.34, + "grad_norm": 0.2699510157108307, + "learning_rate": 0.0001965169269087936, + "loss": 1.0215, + "step": 966 + }, + { + "epoch": 0.34, + "grad_norm": 0.17568126320838928, + "learning_rate": 0.00019650967904340694, + "loss": 1.0628, + "step": 967 + }, + { + "epoch": 0.34, + "grad_norm": 0.16942435503005981, + "learning_rate": 0.00019650242377881538, + "loss": 1.0706, + "step": 968 + }, + { + "epoch": 0.34, + "grad_norm": 0.17300187051296234, + "learning_rate": 0.00019649516111557515, + "loss": 1.0659, + "step": 969 + }, + { + "epoch": 0.34, + "grad_norm": 0.1726924628019333, + "learning_rate": 0.00019648789105424305, + "loss": 1.102, + "step": 970 + }, + { + "epoch": 0.34, + "grad_norm": 0.1732690930366516, + "learning_rate": 0.00019648061359537646, + "loss": 1.1284, + "step": 971 + }, + { + "epoch": 0.34, + "grad_norm": 0.17127656936645508, + "learning_rate": 0.00019647332873953336, + "loss": 1.0179, + "step": 972 + }, + { + "epoch": 0.34, + "grad_norm": 0.17334236204624176, + "learning_rate": 0.00019646603648727223, + "loss": 1.0578, + "step": 973 + }, + { + "epoch": 0.34, + "grad_norm": 0.16784228384494781, + "learning_rate": 0.00019645873683915213, + "loss": 1.0121, + "step": 974 + }, + { + "epoch": 0.34, + "grad_norm": 0.167621910572052, + "learning_rate": 0.00019645142979573277, + "loss": 1.0196, + "step": 975 + }, + { + "epoch": 0.34, + "grad_norm": 0.1630851924419403, + "learning_rate": 0.00019644411535757437, + "loss": 1.0524, + "step": 976 + }, + { + "epoch": 0.34, + "grad_norm": 0.16767187416553497, + "learning_rate": 0.00019643679352523763, + "loss": 1.04, + "step": 977 + }, + { + "epoch": 0.34, + "grad_norm": 0.18620026111602783, + "learning_rate": 0.00019642946429928397, + "loss": 1.0708, + "step": 978 + }, + { + "epoch": 0.34, + "grad_norm": 0.17906849086284637, + "learning_rate": 0.0001964221276802753, + "loss": 1.0709, + "step": 979 + }, + { + "epoch": 0.35, + "grad_norm": 0.1792149543762207, + "learning_rate": 0.0001964147836687741, + "loss": 1.0365, + "step": 980 + }, + { + "epoch": 0.35, + "grad_norm": 0.17698103189468384, + "learning_rate": 0.0001964074322653434, + "loss": 1.0732, + "step": 981 + }, + { + "epoch": 0.35, + "grad_norm": 0.17077916860580444, + "learning_rate": 0.00019640007347054684, + "loss": 1.0463, + "step": 982 + }, + { + "epoch": 0.35, + "grad_norm": 0.17265799641609192, + "learning_rate": 0.0001963927072849486, + "loss": 1.0835, + "step": 983 + }, + { + "epoch": 0.35, + "grad_norm": 0.1759408712387085, + "learning_rate": 0.00019638533370911341, + "loss": 1.0469, + "step": 984 + }, + { + "epoch": 0.35, + "grad_norm": 0.17300966382026672, + "learning_rate": 0.00019637795274360663, + "loss": 1.0623, + "step": 985 + }, + { + "epoch": 0.35, + "grad_norm": 0.1715632528066635, + "learning_rate": 0.0001963705643889941, + "loss": 1.0799, + "step": 986 + }, + { + "epoch": 0.35, + "grad_norm": 0.17230769991874695, + "learning_rate": 0.0001963631686458423, + "loss": 1.0518, + "step": 987 + }, + { + "epoch": 0.35, + "grad_norm": 0.17746640741825104, + "learning_rate": 0.0001963557655147182, + "loss": 1.0129, + "step": 988 + }, + { + "epoch": 0.35, + "grad_norm": 0.1871126890182495, + "learning_rate": 0.00019634835499618948, + "loss": 1.0845, + "step": 989 + }, + { + "epoch": 0.35, + "grad_norm": 0.17565259337425232, + "learning_rate": 0.0001963409370908242, + "loss": 1.0699, + "step": 990 + }, + { + "epoch": 0.35, + "grad_norm": 0.1731378436088562, + "learning_rate": 0.00019633351179919109, + "loss": 0.9835, + "step": 991 + }, + { + "epoch": 0.35, + "grad_norm": 0.18255259096622467, + "learning_rate": 0.00019632607912185949, + "loss": 0.964, + "step": 992 + }, + { + "epoch": 0.35, + "grad_norm": 0.1729213297367096, + "learning_rate": 0.00019631863905939916, + "loss": 1.0397, + "step": 993 + }, + { + "epoch": 0.35, + "grad_norm": 0.17974966764450073, + "learning_rate": 0.0001963111916123806, + "loss": 1.0543, + "step": 994 + }, + { + "epoch": 0.35, + "grad_norm": 0.17525100708007812, + "learning_rate": 0.0001963037367813747, + "loss": 1.0449, + "step": 995 + }, + { + "epoch": 0.35, + "grad_norm": 0.18129664659500122, + "learning_rate": 0.0001962962745669531, + "loss": 1.0856, + "step": 996 + }, + { + "epoch": 0.35, + "grad_norm": 0.1881393939256668, + "learning_rate": 0.00019628880496968786, + "loss": 1.0776, + "step": 997 + }, + { + "epoch": 0.35, + "grad_norm": 0.17546981573104858, + "learning_rate": 0.00019628132799015167, + "loss": 1.0396, + "step": 998 + }, + { + "epoch": 0.35, + "grad_norm": 0.1705467253923416, + "learning_rate": 0.00019627384362891776, + "loss": 1.0109, + "step": 999 + }, + { + "epoch": 0.35, + "grad_norm": 0.1715412139892578, + "learning_rate": 0.00019626635188656, + "loss": 1.0649, + "step": 1000 + }, + { + "epoch": 0.35, + "grad_norm": 0.17314362525939941, + "learning_rate": 0.00019625885276365268, + "loss": 0.984, + "step": 1001 + }, + { + "epoch": 0.35, + "grad_norm": 0.18583662807941437, + "learning_rate": 0.00019625134626077083, + "loss": 1.0614, + "step": 1002 + }, + { + "epoch": 0.35, + "grad_norm": 0.18308642506599426, + "learning_rate": 0.00019624383237848986, + "loss": 1.0291, + "step": 1003 + }, + { + "epoch": 0.35, + "grad_norm": 0.17275825142860413, + "learning_rate": 0.00019623631111738595, + "loss": 1.0661, + "step": 1004 + }, + { + "epoch": 0.35, + "grad_norm": 0.17974300682544708, + "learning_rate": 0.00019622878247803569, + "loss": 1.0458, + "step": 1005 + }, + { + "epoch": 0.35, + "grad_norm": 0.17539826035499573, + "learning_rate": 0.00019622124646101626, + "loss": 1.0174, + "step": 1006 + }, + { + "epoch": 0.35, + "grad_norm": 0.1848883181810379, + "learning_rate": 0.00019621370306690546, + "loss": 1.0746, + "step": 1007 + }, + { + "epoch": 0.35, + "grad_norm": 0.16872359812259674, + "learning_rate": 0.00019620615229628164, + "loss": 1.0053, + "step": 1008 + }, + { + "epoch": 0.36, + "grad_norm": 0.16738854348659515, + "learning_rate": 0.00019619859414972366, + "loss": 1.0096, + "step": 1009 + }, + { + "epoch": 0.36, + "grad_norm": 0.1777888685464859, + "learning_rate": 0.00019619102862781105, + "loss": 0.946, + "step": 1010 + }, + { + "epoch": 0.36, + "grad_norm": 0.18343977630138397, + "learning_rate": 0.00019618345573112377, + "loss": 0.9957, + "step": 1011 + }, + { + "epoch": 0.36, + "grad_norm": 0.20510901510715485, + "learning_rate": 0.0001961758754602425, + "loss": 1.0588, + "step": 1012 + }, + { + "epoch": 0.36, + "grad_norm": 0.18343272805213928, + "learning_rate": 0.00019616828781574835, + "loss": 1.0221, + "step": 1013 + }, + { + "epoch": 0.36, + "grad_norm": 0.17978180944919586, + "learning_rate": 0.00019616069279822305, + "loss": 1.0499, + "step": 1014 + }, + { + "epoch": 0.36, + "grad_norm": 0.1911647468805313, + "learning_rate": 0.00019615309040824888, + "loss": 1.075, + "step": 1015 + }, + { + "epoch": 0.36, + "grad_norm": 0.17616963386535645, + "learning_rate": 0.00019614548064640877, + "loss": 1.0689, + "step": 1016 + }, + { + "epoch": 0.36, + "grad_norm": 0.17211149632930756, + "learning_rate": 0.00019613786351328609, + "loss": 0.986, + "step": 1017 + }, + { + "epoch": 0.36, + "grad_norm": 0.1717488318681717, + "learning_rate": 0.00019613023900946484, + "loss": 1.0073, + "step": 1018 + }, + { + "epoch": 0.36, + "grad_norm": 0.18268033862113953, + "learning_rate": 0.00019612260713552956, + "loss": 1.0969, + "step": 1019 + }, + { + "epoch": 0.36, + "grad_norm": 0.1773374378681183, + "learning_rate": 0.0001961149678920654, + "loss": 0.9878, + "step": 1020 + }, + { + "epoch": 0.36, + "grad_norm": 0.17691907286643982, + "learning_rate": 0.00019610732127965804, + "loss": 1.1073, + "step": 1021 + }, + { + "epoch": 0.36, + "grad_norm": 0.18169642984867096, + "learning_rate": 0.00019609966729889372, + "loss": 1.0644, + "step": 1022 + }, + { + "epoch": 0.36, + "grad_norm": 0.17204183340072632, + "learning_rate": 0.00019609200595035927, + "loss": 0.9929, + "step": 1023 + }, + { + "epoch": 0.36, + "grad_norm": 0.1788758486509323, + "learning_rate": 0.00019608433723464206, + "loss": 1.0369, + "step": 1024 + }, + { + "epoch": 0.36, + "grad_norm": 0.17738159000873566, + "learning_rate": 0.00019607666115233, + "loss": 1.0628, + "step": 1025 + }, + { + "epoch": 0.36, + "grad_norm": 0.1694379299879074, + "learning_rate": 0.00019606897770401167, + "loss": 1.0282, + "step": 1026 + }, + { + "epoch": 0.36, + "grad_norm": 0.16664545238018036, + "learning_rate": 0.0001960612868902761, + "loss": 0.9741, + "step": 1027 + }, + { + "epoch": 0.36, + "grad_norm": 0.16760802268981934, + "learning_rate": 0.00019605358871171294, + "loss": 0.9351, + "step": 1028 + }, + { + "epoch": 0.36, + "grad_norm": 0.17086480557918549, + "learning_rate": 0.0001960458831689124, + "loss": 1.0735, + "step": 1029 + }, + { + "epoch": 0.36, + "grad_norm": 0.17389248311519623, + "learning_rate": 0.00019603817026246523, + "loss": 0.9759, + "step": 1030 + }, + { + "epoch": 0.36, + "grad_norm": 0.1694263219833374, + "learning_rate": 0.0001960304499929628, + "loss": 1.0005, + "step": 1031 + }, + { + "epoch": 0.36, + "grad_norm": 0.17080332338809967, + "learning_rate": 0.00019602272236099693, + "loss": 0.9775, + "step": 1032 + }, + { + "epoch": 0.36, + "grad_norm": 0.1651020646095276, + "learning_rate": 0.00019601498736716017, + "loss": 1.0308, + "step": 1033 + }, + { + "epoch": 0.36, + "grad_norm": 0.1663987785577774, + "learning_rate": 0.00019600724501204553, + "loss": 0.9909, + "step": 1034 + }, + { + "epoch": 0.36, + "grad_norm": 0.17150457203388214, + "learning_rate": 0.00019599949529624654, + "loss": 1.0159, + "step": 1035 + }, + { + "epoch": 0.36, + "grad_norm": 0.17601971328258514, + "learning_rate": 0.00019599173822035744, + "loss": 0.997, + "step": 1036 + }, + { + "epoch": 0.37, + "grad_norm": 0.18561767041683197, + "learning_rate": 0.00019598397378497287, + "loss": 1.0094, + "step": 1037 + }, + { + "epoch": 0.37, + "grad_norm": 0.17770105600357056, + "learning_rate": 0.00019597620199068817, + "loss": 1.0259, + "step": 1038 + }, + { + "epoch": 0.37, + "grad_norm": 0.17730176448822021, + "learning_rate": 0.00019596842283809917, + "loss": 1.0212, + "step": 1039 + }, + { + "epoch": 0.37, + "grad_norm": 0.1914602667093277, + "learning_rate": 0.0001959606363278023, + "loss": 1.086, + "step": 1040 + }, + { + "epoch": 0.37, + "grad_norm": 0.18201208114624023, + "learning_rate": 0.00019595284246039447, + "loss": 1.0525, + "step": 1041 + }, + { + "epoch": 0.37, + "grad_norm": 0.16805338859558105, + "learning_rate": 0.0001959450412364733, + "loss": 1.0172, + "step": 1042 + }, + { + "epoch": 0.37, + "grad_norm": 0.18455561995506287, + "learning_rate": 0.00019593723265663685, + "loss": 1.0622, + "step": 1043 + }, + { + "epoch": 0.37, + "grad_norm": 0.18691962957382202, + "learning_rate": 0.0001959294167214838, + "loss": 1.0064, + "step": 1044 + }, + { + "epoch": 0.37, + "grad_norm": 0.17232896387577057, + "learning_rate": 0.00019592159343161342, + "loss": 0.9567, + "step": 1045 + }, + { + "epoch": 0.37, + "grad_norm": 0.17483456432819366, + "learning_rate": 0.00019591376278762541, + "loss": 1.0152, + "step": 1046 + }, + { + "epoch": 0.37, + "grad_norm": 0.18186868727207184, + "learning_rate": 0.00019590592479012023, + "loss": 0.988, + "step": 1047 + }, + { + "epoch": 0.37, + "grad_norm": 0.19091114401817322, + "learning_rate": 0.00019589807943969873, + "loss": 1.0016, + "step": 1048 + }, + { + "epoch": 0.37, + "grad_norm": 0.17381857335567474, + "learning_rate": 0.00019589022673696245, + "loss": 1.0549, + "step": 1049 + }, + { + "epoch": 0.37, + "grad_norm": 0.1698593646287918, + "learning_rate": 0.0001958823666825134, + "loss": 1.0751, + "step": 1050 + }, + { + "epoch": 0.37, + "grad_norm": 0.1881944239139557, + "learning_rate": 0.00019587449927695424, + "loss": 1.0665, + "step": 1051 + }, + { + "epoch": 0.37, + "grad_norm": 0.17412430047988892, + "learning_rate": 0.00019586662452088813, + "loss": 1.0164, + "step": 1052 + }, + { + "epoch": 0.37, + "grad_norm": 0.17845436930656433, + "learning_rate": 0.0001958587424149188, + "loss": 1.0372, + "step": 1053 + }, + { + "epoch": 0.37, + "grad_norm": 0.17655451595783234, + "learning_rate": 0.00019585085295965055, + "loss": 1.0524, + "step": 1054 + }, + { + "epoch": 0.37, + "grad_norm": 0.17986845970153809, + "learning_rate": 0.00019584295615568826, + "loss": 1.0226, + "step": 1055 + }, + { + "epoch": 0.37, + "grad_norm": 0.17774830758571625, + "learning_rate": 0.00019583505200363734, + "loss": 1.0196, + "step": 1056 + }, + { + "epoch": 0.37, + "grad_norm": 0.18801482021808624, + "learning_rate": 0.00019582714050410386, + "loss": 1.0183, + "step": 1057 + }, + { + "epoch": 0.37, + "grad_norm": 0.173845112323761, + "learning_rate": 0.00019581922165769428, + "loss": 1.0156, + "step": 1058 + }, + { + "epoch": 0.37, + "grad_norm": 0.17635942995548248, + "learning_rate": 0.0001958112954650158, + "loss": 1.0471, + "step": 1059 + }, + { + "epoch": 0.37, + "grad_norm": 0.18246427178382874, + "learning_rate": 0.00019580336192667606, + "loss": 1.0075, + "step": 1060 + }, + { + "epoch": 0.37, + "grad_norm": 0.18411587178707123, + "learning_rate": 0.0001957954210432833, + "loss": 1.0057, + "step": 1061 + }, + { + "epoch": 0.37, + "grad_norm": 0.16751395165920258, + "learning_rate": 0.00019578747281544638, + "loss": 0.9679, + "step": 1062 + }, + { + "epoch": 0.37, + "grad_norm": 0.1719505786895752, + "learning_rate": 0.00019577951724377462, + "loss": 1.0377, + "step": 1063 + }, + { + "epoch": 0.37, + "grad_norm": 0.16536471247673035, + "learning_rate": 0.00019577155432887804, + "loss": 1.0532, + "step": 1064 + }, + { + "epoch": 0.38, + "grad_norm": 0.19073115289211273, + "learning_rate": 0.00019576358407136707, + "loss": 1.0635, + "step": 1065 + }, + { + "epoch": 0.38, + "grad_norm": 0.18181023001670837, + "learning_rate": 0.00019575560647185278, + "loss": 1.0941, + "step": 1066 + }, + { + "epoch": 0.38, + "grad_norm": 0.16925771534442902, + "learning_rate": 0.0001957476215309468, + "loss": 1.0137, + "step": 1067 + }, + { + "epoch": 0.38, + "grad_norm": 0.17827220261096954, + "learning_rate": 0.00019573962924926134, + "loss": 1.047, + "step": 1068 + }, + { + "epoch": 0.38, + "grad_norm": 0.1751449555158615, + "learning_rate": 0.00019573162962740914, + "loss": 1.0427, + "step": 1069 + }, + { + "epoch": 0.38, + "grad_norm": 0.18006440997123718, + "learning_rate": 0.00019572362266600353, + "loss": 1.0312, + "step": 1070 + }, + { + "epoch": 0.38, + "grad_norm": 0.17753449082374573, + "learning_rate": 0.00019571560836565834, + "loss": 1.0177, + "step": 1071 + }, + { + "epoch": 0.38, + "grad_norm": 0.17010781168937683, + "learning_rate": 0.00019570758672698805, + "loss": 1.087, + "step": 1072 + }, + { + "epoch": 0.38, + "grad_norm": 0.17299331724643707, + "learning_rate": 0.00019569955775060766, + "loss": 1.0537, + "step": 1073 + }, + { + "epoch": 0.38, + "grad_norm": 0.17425057291984558, + "learning_rate": 0.00019569152143713274, + "loss": 0.9863, + "step": 1074 + }, + { + "epoch": 0.38, + "grad_norm": 0.17348161339759827, + "learning_rate": 0.0001956834777871794, + "loss": 1.0099, + "step": 1075 + }, + { + "epoch": 0.38, + "grad_norm": 0.17897789180278778, + "learning_rate": 0.00019567542680136438, + "loss": 1.1186, + "step": 1076 + }, + { + "epoch": 0.38, + "grad_norm": 0.16750477254390717, + "learning_rate": 0.00019566736848030484, + "loss": 0.9746, + "step": 1077 + }, + { + "epoch": 0.38, + "grad_norm": 0.18266913294792175, + "learning_rate": 0.00019565930282461867, + "loss": 1.0324, + "step": 1078 + }, + { + "epoch": 0.38, + "grad_norm": 0.17874056100845337, + "learning_rate": 0.00019565122983492423, + "loss": 1.0307, + "step": 1079 + }, + { + "epoch": 0.38, + "grad_norm": 0.1887243390083313, + "learning_rate": 0.00019564314951184047, + "loss": 1.048, + "step": 1080 + }, + { + "epoch": 0.38, + "grad_norm": 0.17233537137508392, + "learning_rate": 0.00019563506185598685, + "loss": 1.0384, + "step": 1081 + }, + { + "epoch": 0.38, + "grad_norm": 0.18821567296981812, + "learning_rate": 0.0001956269668679835, + "loss": 1.0694, + "step": 1082 + }, + { + "epoch": 0.38, + "grad_norm": 0.19365891814231873, + "learning_rate": 0.00019561886454845097, + "loss": 1.0784, + "step": 1083 + }, + { + "epoch": 0.38, + "grad_norm": 0.18988920748233795, + "learning_rate": 0.00019561075489801054, + "loss": 1.0649, + "step": 1084 + }, + { + "epoch": 0.38, + "grad_norm": 0.18308693170547485, + "learning_rate": 0.00019560263791728387, + "loss": 1.0837, + "step": 1085 + }, + { + "epoch": 0.38, + "grad_norm": 0.1714373528957367, + "learning_rate": 0.0001955945136068933, + "loss": 1.071, + "step": 1086 + }, + { + "epoch": 0.38, + "grad_norm": 0.17758604884147644, + "learning_rate": 0.00019558638196746176, + "loss": 1.0841, + "step": 1087 + }, + { + "epoch": 0.38, + "grad_norm": 0.17851248383522034, + "learning_rate": 0.0001955782429996126, + "loss": 1.0355, + "step": 1088 + }, + { + "epoch": 0.38, + "grad_norm": 0.16533468663692474, + "learning_rate": 0.0001955700967039699, + "loss": 1.0278, + "step": 1089 + }, + { + "epoch": 0.38, + "grad_norm": 0.17190004885196686, + "learning_rate": 0.00019556194308115816, + "loss": 1.0253, + "step": 1090 + }, + { + "epoch": 0.38, + "grad_norm": 0.17942769825458527, + "learning_rate": 0.00019555378213180252, + "loss": 0.9406, + "step": 1091 + }, + { + "epoch": 0.38, + "grad_norm": 0.17903082072734833, + "learning_rate": 0.0001955456138565287, + "loss": 1.037, + "step": 1092 + }, + { + "epoch": 0.38, + "grad_norm": 0.1841680407524109, + "learning_rate": 0.0001955374382559629, + "loss": 1.0799, + "step": 1093 + }, + { + "epoch": 0.39, + "grad_norm": 0.18083688616752625, + "learning_rate": 0.00019552925533073193, + "loss": 1.0175, + "step": 1094 + }, + { + "epoch": 0.39, + "grad_norm": 0.1914413422346115, + "learning_rate": 0.00019552106508146318, + "loss": 1.1054, + "step": 1095 + }, + { + "epoch": 0.39, + "grad_norm": 0.18662455677986145, + "learning_rate": 0.00019551286750878454, + "loss": 1.0377, + "step": 1096 + }, + { + "epoch": 0.39, + "grad_norm": 0.17855742573738098, + "learning_rate": 0.00019550466261332455, + "loss": 1.0656, + "step": 1097 + }, + { + "epoch": 0.39, + "grad_norm": 0.17760635912418365, + "learning_rate": 0.00019549645039571226, + "loss": 1.0806, + "step": 1098 + }, + { + "epoch": 0.39, + "grad_norm": 0.17919620871543884, + "learning_rate": 0.00019548823085657727, + "loss": 0.9743, + "step": 1099 + }, + { + "epoch": 0.39, + "grad_norm": 0.17144104838371277, + "learning_rate": 0.00019548000399654975, + "loss": 0.9721, + "step": 1100 + }, + { + "epoch": 0.39, + "grad_norm": 0.1793820559978485, + "learning_rate": 0.00019547176981626042, + "loss": 1.038, + "step": 1101 + }, + { + "epoch": 0.39, + "grad_norm": 0.17242422699928284, + "learning_rate": 0.00019546352831634062, + "loss": 0.9687, + "step": 1102 + }, + { + "epoch": 0.39, + "grad_norm": 0.17942006886005402, + "learning_rate": 0.0001954552794974222, + "loss": 1.0924, + "step": 1103 + }, + { + "epoch": 0.39, + "grad_norm": 0.19152596592903137, + "learning_rate": 0.00019544702336013754, + "loss": 1.0564, + "step": 1104 + }, + { + "epoch": 0.39, + "grad_norm": 0.17344453930854797, + "learning_rate": 0.00019543875990511969, + "loss": 1.0529, + "step": 1105 + }, + { + "epoch": 0.39, + "grad_norm": 0.17758041620254517, + "learning_rate": 0.0001954304891330021, + "loss": 1.0792, + "step": 1106 + }, + { + "epoch": 0.39, + "grad_norm": 0.17804868519306183, + "learning_rate": 0.00019542221104441895, + "loss": 1.0285, + "step": 1107 + }, + { + "epoch": 0.39, + "grad_norm": 0.191632479429245, + "learning_rate": 0.00019541392564000488, + "loss": 0.9587, + "step": 1108 + }, + { + "epoch": 0.39, + "grad_norm": 0.17585574090480804, + "learning_rate": 0.00019540563292039513, + "loss": 0.9943, + "step": 1109 + }, + { + "epoch": 0.39, + "grad_norm": 0.16620998084545135, + "learning_rate": 0.0001953973328862255, + "loss": 1.012, + "step": 1110 + }, + { + "epoch": 0.39, + "grad_norm": 0.1953848898410797, + "learning_rate": 0.00019538902553813226, + "loss": 1.0499, + "step": 1111 + }, + { + "epoch": 0.39, + "grad_norm": 0.18621788918972015, + "learning_rate": 0.0001953807108767524, + "loss": 1.0103, + "step": 1112 + }, + { + "epoch": 0.39, + "grad_norm": 0.1733444482088089, + "learning_rate": 0.00019537238890272334, + "loss": 1.1045, + "step": 1113 + }, + { + "epoch": 0.39, + "grad_norm": 0.1846238672733307, + "learning_rate": 0.00019536405961668313, + "loss": 1.0661, + "step": 1114 + }, + { + "epoch": 0.39, + "grad_norm": 0.18024377524852753, + "learning_rate": 0.00019535572301927032, + "loss": 1.0236, + "step": 1115 + }, + { + "epoch": 0.39, + "grad_norm": 0.1756822168827057, + "learning_rate": 0.00019534737911112412, + "loss": 1.0008, + "step": 1116 + }, + { + "epoch": 0.39, + "grad_norm": 0.17604592442512512, + "learning_rate": 0.00019533902789288426, + "loss": 1.0103, + "step": 1117 + }, + { + "epoch": 0.39, + "grad_norm": 0.18618983030319214, + "learning_rate": 0.00019533066936519092, + "loss": 1.0273, + "step": 1118 + }, + { + "epoch": 0.39, + "grad_norm": 0.17751984298229218, + "learning_rate": 0.00019532230352868499, + "loss": 1.0341, + "step": 1119 + }, + { + "epoch": 0.39, + "grad_norm": 0.2014542818069458, + "learning_rate": 0.00019531393038400786, + "loss": 1.0502, + "step": 1120 + }, + { + "epoch": 0.39, + "grad_norm": 0.17488035559654236, + "learning_rate": 0.00019530554993180151, + "loss": 1.0167, + "step": 1121 + }, + { + "epoch": 0.4, + "grad_norm": 0.18563339114189148, + "learning_rate": 0.00019529716217270835, + "loss": 1.0396, + "step": 1122 + }, + { + "epoch": 0.4, + "grad_norm": 0.17215563356876373, + "learning_rate": 0.00019528876710737155, + "loss": 1.0216, + "step": 1123 + }, + { + "epoch": 0.4, + "grad_norm": 0.18212297558784485, + "learning_rate": 0.00019528036473643473, + "loss": 0.9686, + "step": 1124 + }, + { + "epoch": 0.4, + "grad_norm": 0.1783788949251175, + "learning_rate": 0.00019527195506054203, + "loss": 1.0206, + "step": 1125 + }, + { + "epoch": 0.4, + "grad_norm": 0.1766308844089508, + "learning_rate": 0.00019526353808033825, + "loss": 1.0558, + "step": 1126 + }, + { + "epoch": 0.4, + "grad_norm": 0.18010054528713226, + "learning_rate": 0.0001952551137964687, + "loss": 1.0818, + "step": 1127 + }, + { + "epoch": 0.4, + "grad_norm": 0.19283832609653473, + "learning_rate": 0.00019524668220957927, + "loss": 1.1005, + "step": 1128 + }, + { + "epoch": 0.4, + "grad_norm": 0.1859970986843109, + "learning_rate": 0.00019523824332031632, + "loss": 1.0519, + "step": 1129 + }, + { + "epoch": 0.4, + "grad_norm": 0.17316271364688873, + "learning_rate": 0.00019522979712932693, + "loss": 1.039, + "step": 1130 + }, + { + "epoch": 0.4, + "grad_norm": 0.1714569330215454, + "learning_rate": 0.0001952213436372586, + "loss": 1.0384, + "step": 1131 + }, + { + "epoch": 0.4, + "grad_norm": 0.18875844776630402, + "learning_rate": 0.00019521288284475943, + "loss": 1.0167, + "step": 1132 + }, + { + "epoch": 0.4, + "grad_norm": 0.1846914291381836, + "learning_rate": 0.00019520441475247814, + "loss": 0.9885, + "step": 1133 + }, + { + "epoch": 0.4, + "grad_norm": 0.18210475146770477, + "learning_rate": 0.00019519593936106395, + "loss": 0.9981, + "step": 1134 + }, + { + "epoch": 0.4, + "grad_norm": 0.1718529462814331, + "learning_rate": 0.00019518745667116663, + "loss": 0.9566, + "step": 1135 + }, + { + "epoch": 0.4, + "grad_norm": 0.17763008177280426, + "learning_rate": 0.00019517896668343654, + "loss": 1.0626, + "step": 1136 + }, + { + "epoch": 0.4, + "grad_norm": 0.17546570301055908, + "learning_rate": 0.00019517046939852462, + "loss": 1.026, + "step": 1137 + }, + { + "epoch": 0.4, + "grad_norm": 0.174907848238945, + "learning_rate": 0.00019516196481708228, + "loss": 1.0155, + "step": 1138 + }, + { + "epoch": 0.4, + "grad_norm": 0.18386012315750122, + "learning_rate": 0.00019515345293976156, + "loss": 1.0359, + "step": 1139 + }, + { + "epoch": 0.4, + "grad_norm": 0.16808752715587616, + "learning_rate": 0.00019514493376721512, + "loss": 1.0262, + "step": 1140 + }, + { + "epoch": 0.4, + "grad_norm": 0.17714083194732666, + "learning_rate": 0.00019513640730009604, + "loss": 1.0467, + "step": 1141 + }, + { + "epoch": 0.4, + "grad_norm": 0.19117549061775208, + "learning_rate": 0.00019512787353905803, + "loss": 0.9753, + "step": 1142 + }, + { + "epoch": 0.4, + "grad_norm": 0.17344416677951813, + "learning_rate": 0.0001951193324847554, + "loss": 1.078, + "step": 1143 + }, + { + "epoch": 0.4, + "grad_norm": 0.1775476038455963, + "learning_rate": 0.00019511078413784295, + "loss": 1.031, + "step": 1144 + }, + { + "epoch": 0.4, + "grad_norm": 0.17347300052642822, + "learning_rate": 0.00019510222849897604, + "loss": 1.0538, + "step": 1145 + }, + { + "epoch": 0.4, + "grad_norm": 0.17518822848796844, + "learning_rate": 0.00019509366556881065, + "loss": 1.0162, + "step": 1146 + }, + { + "epoch": 0.4, + "grad_norm": 0.18036702275276184, + "learning_rate": 0.00019508509534800326, + "loss": 1.0921, + "step": 1147 + }, + { + "epoch": 0.4, + "grad_norm": 0.1831878125667572, + "learning_rate": 0.00019507651783721094, + "loss": 1.079, + "step": 1148 + }, + { + "epoch": 0.4, + "grad_norm": 0.1758822798728943, + "learning_rate": 0.00019506793303709132, + "loss": 1.0403, + "step": 1149 + }, + { + "epoch": 0.4, + "grad_norm": 0.18290312588214874, + "learning_rate": 0.00019505934094830258, + "loss": 1.0331, + "step": 1150 + }, + { + "epoch": 0.41, + "grad_norm": 0.190335214138031, + "learning_rate": 0.00019505074157150342, + "loss": 0.9999, + "step": 1151 + }, + { + "epoch": 0.41, + "grad_norm": 0.16814689338207245, + "learning_rate": 0.0001950421349073532, + "loss": 1.0396, + "step": 1152 + }, + { + "epoch": 0.41, + "grad_norm": 0.189680278301239, + "learning_rate": 0.00019503352095651174, + "loss": 1.022, + "step": 1153 + }, + { + "epoch": 0.41, + "grad_norm": 0.18800580501556396, + "learning_rate": 0.00019502489971963942, + "loss": 1.0712, + "step": 1154 + }, + { + "epoch": 0.41, + "grad_norm": 0.18286903202533722, + "learning_rate": 0.00019501627119739728, + "loss": 1.0245, + "step": 1155 + }, + { + "epoch": 0.41, + "grad_norm": 0.1748037189245224, + "learning_rate": 0.00019500763539044683, + "loss": 1.0192, + "step": 1156 + }, + { + "epoch": 0.41, + "grad_norm": 0.19290375709533691, + "learning_rate": 0.00019499899229945012, + "loss": 1.0468, + "step": 1157 + }, + { + "epoch": 0.41, + "grad_norm": 0.1722959280014038, + "learning_rate": 0.00019499034192506983, + "loss": 1.0388, + "step": 1158 + }, + { + "epoch": 0.41, + "grad_norm": 0.17889316380023956, + "learning_rate": 0.00019498168426796919, + "loss": 1.0789, + "step": 1159 + }, + { + "epoch": 0.41, + "grad_norm": 0.17683634161949158, + "learning_rate": 0.00019497301932881194, + "loss": 1.0811, + "step": 1160 + }, + { + "epoch": 0.41, + "grad_norm": 0.18203513324260712, + "learning_rate": 0.0001949643471082624, + "loss": 1.0446, + "step": 1161 + }, + { + "epoch": 0.41, + "grad_norm": 0.18062527477741241, + "learning_rate": 0.00019495566760698547, + "loss": 1.0625, + "step": 1162 + }, + { + "epoch": 0.41, + "grad_norm": 0.18026340007781982, + "learning_rate": 0.00019494698082564655, + "loss": 1.0631, + "step": 1163 + }, + { + "epoch": 0.41, + "grad_norm": 0.17580631375312805, + "learning_rate": 0.0001949382867649117, + "loss": 0.9806, + "step": 1164 + }, + { + "epoch": 0.41, + "grad_norm": 0.1745951920747757, + "learning_rate": 0.00019492958542544742, + "loss": 1.0034, + "step": 1165 + }, + { + "epoch": 0.41, + "grad_norm": 0.18265682458877563, + "learning_rate": 0.00019492087680792085, + "loss": 1.044, + "step": 1166 + }, + { + "epoch": 0.41, + "grad_norm": 0.17897681891918182, + "learning_rate": 0.00019491216091299966, + "loss": 1.025, + "step": 1167 + }, + { + "epoch": 0.41, + "grad_norm": 0.18096914887428284, + "learning_rate": 0.00019490343774135208, + "loss": 0.9971, + "step": 1168 + }, + { + "epoch": 0.41, + "grad_norm": 0.1791757494211197, + "learning_rate": 0.00019489470729364692, + "loss": 1.0091, + "step": 1169 + }, + { + "epoch": 0.41, + "grad_norm": 0.18002015352249146, + "learning_rate": 0.00019488596957055348, + "loss": 1.0668, + "step": 1170 + }, + { + "epoch": 0.41, + "grad_norm": 0.18384124338626862, + "learning_rate": 0.00019487722457274168, + "loss": 1.0324, + "step": 1171 + }, + { + "epoch": 0.41, + "grad_norm": 0.17454668879508972, + "learning_rate": 0.000194868472300882, + "loss": 1.0107, + "step": 1172 + }, + { + "epoch": 0.41, + "grad_norm": 0.17286750674247742, + "learning_rate": 0.00019485971275564546, + "loss": 1.0268, + "step": 1173 + }, + { + "epoch": 0.41, + "grad_norm": 0.17558777332305908, + "learning_rate": 0.00019485094593770362, + "loss": 1.0354, + "step": 1174 + }, + { + "epoch": 0.41, + "grad_norm": 0.1863299012184143, + "learning_rate": 0.00019484217184772865, + "loss": 1.0204, + "step": 1175 + }, + { + "epoch": 0.41, + "grad_norm": 0.18392202258110046, + "learning_rate": 0.00019483339048639316, + "loss": 1.0339, + "step": 1176 + }, + { + "epoch": 0.41, + "grad_norm": 0.17207181453704834, + "learning_rate": 0.00019482460185437045, + "loss": 1.0036, + "step": 1177 + }, + { + "epoch": 0.41, + "grad_norm": 0.1741429716348648, + "learning_rate": 0.00019481580595233434, + "loss": 1.0321, + "step": 1178 + }, + { + "epoch": 0.42, + "grad_norm": 0.18753582239151, + "learning_rate": 0.0001948070027809592, + "loss": 1.0876, + "step": 1179 + }, + { + "epoch": 0.42, + "grad_norm": 0.16908766329288483, + "learning_rate": 0.00019479819234091993, + "loss": 0.9712, + "step": 1180 + }, + { + "epoch": 0.42, + "grad_norm": 0.16926541924476624, + "learning_rate": 0.00019478937463289202, + "loss": 1.0192, + "step": 1181 + }, + { + "epoch": 0.42, + "grad_norm": 0.18049052357673645, + "learning_rate": 0.0001947805496575515, + "loss": 0.9744, + "step": 1182 + }, + { + "epoch": 0.42, + "grad_norm": 0.17877183854579926, + "learning_rate": 0.00019477171741557494, + "loss": 1.0187, + "step": 1183 + }, + { + "epoch": 0.42, + "grad_norm": 0.17817731201648712, + "learning_rate": 0.00019476287790763953, + "loss": 1.0025, + "step": 1184 + }, + { + "epoch": 0.42, + "grad_norm": 0.17894314229488373, + "learning_rate": 0.00019475403113442297, + "loss": 0.9888, + "step": 1185 + }, + { + "epoch": 0.42, + "grad_norm": 0.1737499237060547, + "learning_rate": 0.00019474517709660346, + "loss": 1.0435, + "step": 1186 + }, + { + "epoch": 0.42, + "grad_norm": 0.1924152672290802, + "learning_rate": 0.00019473631579485995, + "loss": 1.021, + "step": 1187 + }, + { + "epoch": 0.42, + "grad_norm": 0.17013049125671387, + "learning_rate": 0.0001947274472298717, + "loss": 1.0198, + "step": 1188 + }, + { + "epoch": 0.42, + "grad_norm": 0.17775459587574005, + "learning_rate": 0.00019471857140231872, + "loss": 1.0803, + "step": 1189 + }, + { + "epoch": 0.42, + "grad_norm": 0.17278152704238892, + "learning_rate": 0.00019470968831288143, + "loss": 0.991, + "step": 1190 + }, + { + "epoch": 0.42, + "grad_norm": 0.17254570126533508, + "learning_rate": 0.00019470079796224095, + "loss": 0.9979, + "step": 1191 + }, + { + "epoch": 0.42, + "grad_norm": 0.16804873943328857, + "learning_rate": 0.00019469190035107886, + "loss": 1.0026, + "step": 1192 + }, + { + "epoch": 0.42, + "grad_norm": 0.1779593974351883, + "learning_rate": 0.00019468299548007728, + "loss": 1.0992, + "step": 1193 + }, + { + "epoch": 0.42, + "grad_norm": 0.17888137698173523, + "learning_rate": 0.00019467408334991902, + "loss": 1.0435, + "step": 1194 + }, + { + "epoch": 0.42, + "grad_norm": 0.17595483362674713, + "learning_rate": 0.00019466516396128726, + "loss": 0.9999, + "step": 1195 + }, + { + "epoch": 0.42, + "grad_norm": 0.17406252026557922, + "learning_rate": 0.00019465623731486592, + "loss": 1.0375, + "step": 1196 + }, + { + "epoch": 0.42, + "grad_norm": 0.18004122376441956, + "learning_rate": 0.0001946473034113393, + "loss": 1.0238, + "step": 1197 + }, + { + "epoch": 0.42, + "grad_norm": 0.18256844580173492, + "learning_rate": 0.0001946383622513924, + "loss": 0.9845, + "step": 1198 + }, + { + "epoch": 0.42, + "grad_norm": 0.1867818981409073, + "learning_rate": 0.00019462941383571072, + "loss": 1.0887, + "step": 1199 + }, + { + "epoch": 0.42, + "grad_norm": 0.17373467981815338, + "learning_rate": 0.0001946204581649803, + "loss": 0.9685, + "step": 1200 + }, + { + "epoch": 0.42, + "grad_norm": 0.17235921323299408, + "learning_rate": 0.00019461149523988772, + "loss": 1.006, + "step": 1201 + }, + { + "epoch": 0.42, + "grad_norm": 0.17519314587116241, + "learning_rate": 0.00019460252506112025, + "loss": 1.0133, + "step": 1202 + }, + { + "epoch": 0.42, + "grad_norm": 0.17482595145702362, + "learning_rate": 0.00019459354762936553, + "loss": 1.0493, + "step": 1203 + }, + { + "epoch": 0.42, + "grad_norm": 0.17520184814929962, + "learning_rate": 0.00019458456294531185, + "loss": 1.0206, + "step": 1204 + }, + { + "epoch": 0.42, + "grad_norm": 0.17929650843143463, + "learning_rate": 0.0001945755710096481, + "loss": 1.0205, + "step": 1205 + }, + { + "epoch": 0.42, + "grad_norm": 0.3840513527393341, + "learning_rate": 0.00019456657182306358, + "loss": 0.9666, + "step": 1206 + }, + { + "epoch": 0.43, + "grad_norm": 0.18337073922157288, + "learning_rate": 0.00019455756538624833, + "loss": 1.0845, + "step": 1207 + }, + { + "epoch": 0.43, + "grad_norm": 0.17474819719791412, + "learning_rate": 0.00019454855169989284, + "loss": 1.048, + "step": 1208 + }, + { + "epoch": 0.43, + "grad_norm": 0.17507345974445343, + "learning_rate": 0.00019453953076468814, + "loss": 1.0749, + "step": 1209 + }, + { + "epoch": 0.43, + "grad_norm": 0.17816558480262756, + "learning_rate": 0.00019453050258132586, + "loss": 1.0486, + "step": 1210 + }, + { + "epoch": 0.43, + "grad_norm": 0.1680316925048828, + "learning_rate": 0.00019452146715049818, + "loss": 0.9808, + "step": 1211 + }, + { + "epoch": 0.43, + "grad_norm": 0.1678476631641388, + "learning_rate": 0.0001945124244728978, + "loss": 0.9723, + "step": 1212 + }, + { + "epoch": 0.43, + "grad_norm": 0.17107614874839783, + "learning_rate": 0.00019450337454921806, + "loss": 0.9654, + "step": 1213 + }, + { + "epoch": 0.43, + "grad_norm": 0.17758767306804657, + "learning_rate": 0.00019449431738015273, + "loss": 1.0573, + "step": 1214 + }, + { + "epoch": 0.43, + "grad_norm": 0.176117405295372, + "learning_rate": 0.00019448525296639625, + "loss": 1.0669, + "step": 1215 + }, + { + "epoch": 0.43, + "grad_norm": 0.17450761795043945, + "learning_rate": 0.00019447618130864357, + "loss": 0.9906, + "step": 1216 + }, + { + "epoch": 0.43, + "grad_norm": 0.1826401948928833, + "learning_rate": 0.0001944671024075902, + "loss": 1.0665, + "step": 1217 + }, + { + "epoch": 0.43, + "grad_norm": 0.17291821539402008, + "learning_rate": 0.00019445801626393218, + "loss": 0.9476, + "step": 1218 + }, + { + "epoch": 0.43, + "grad_norm": 0.16751059889793396, + "learning_rate": 0.00019444892287836613, + "loss": 1.016, + "step": 1219 + }, + { + "epoch": 0.43, + "grad_norm": 0.17721164226531982, + "learning_rate": 0.00019443982225158924, + "loss": 1.0528, + "step": 1220 + }, + { + "epoch": 0.43, + "grad_norm": 0.17266419529914856, + "learning_rate": 0.00019443071438429921, + "loss": 1.0224, + "step": 1221 + }, + { + "epoch": 0.43, + "grad_norm": 0.17830513417720795, + "learning_rate": 0.00019442159927719434, + "loss": 1.0509, + "step": 1222 + }, + { + "epoch": 0.43, + "grad_norm": 0.17815667390823364, + "learning_rate": 0.00019441247693097346, + "loss": 0.9926, + "step": 1223 + }, + { + "epoch": 0.43, + "grad_norm": 0.17313189804553986, + "learning_rate": 0.000194403347346336, + "loss": 0.9687, + "step": 1224 + }, + { + "epoch": 0.43, + "grad_norm": 0.18634116649627686, + "learning_rate": 0.00019439421052398182, + "loss": 1.0251, + "step": 1225 + }, + { + "epoch": 0.43, + "grad_norm": 0.1939675211906433, + "learning_rate": 0.0001943850664646115, + "loss": 1.047, + "step": 1226 + }, + { + "epoch": 0.43, + "grad_norm": 0.17838935554027557, + "learning_rate": 0.00019437591516892607, + "loss": 1.0672, + "step": 1227 + }, + { + "epoch": 0.43, + "grad_norm": 0.17744366824626923, + "learning_rate": 0.00019436675663762716, + "loss": 0.9857, + "step": 1228 + }, + { + "epoch": 0.43, + "grad_norm": 0.16612915694713593, + "learning_rate": 0.0001943575908714169, + "loss": 0.9924, + "step": 1229 + }, + { + "epoch": 0.43, + "grad_norm": 0.1929885447025299, + "learning_rate": 0.00019434841787099803, + "loss": 1.0051, + "step": 1230 + }, + { + "epoch": 0.43, + "grad_norm": 0.189718559384346, + "learning_rate": 0.00019433923763707383, + "loss": 1.1377, + "step": 1231 + }, + { + "epoch": 0.43, + "grad_norm": 0.19058705866336823, + "learning_rate": 0.00019433005017034815, + "loss": 1.0702, + "step": 1232 + }, + { + "epoch": 0.43, + "grad_norm": 0.17250509560108185, + "learning_rate": 0.00019432085547152532, + "loss": 1.0217, + "step": 1233 + }, + { + "epoch": 0.43, + "grad_norm": 0.17606855928897858, + "learning_rate": 0.0001943116535413103, + "loss": 0.9705, + "step": 1234 + }, + { + "epoch": 0.43, + "grad_norm": 0.17175276577472687, + "learning_rate": 0.00019430244438040862, + "loss": 1.0118, + "step": 1235 + }, + { + "epoch": 0.44, + "grad_norm": 0.17503391206264496, + "learning_rate": 0.0001942932279895263, + "loss": 0.9583, + "step": 1236 + }, + { + "epoch": 0.44, + "grad_norm": 0.17668573558330536, + "learning_rate": 0.00019428400436936994, + "loss": 1.0018, + "step": 1237 + }, + { + "epoch": 0.44, + "grad_norm": 0.17263321578502655, + "learning_rate": 0.0001942747735206467, + "loss": 1.0072, + "step": 1238 + }, + { + "epoch": 0.44, + "grad_norm": 0.1817825585603714, + "learning_rate": 0.0001942655354440643, + "loss": 1.077, + "step": 1239 + }, + { + "epoch": 0.44, + "grad_norm": 0.18519999086856842, + "learning_rate": 0.00019425629014033097, + "loss": 1.041, + "step": 1240 + }, + { + "epoch": 0.44, + "grad_norm": 0.1884918361902237, + "learning_rate": 0.0001942470376101556, + "loss": 0.971, + "step": 1241 + }, + { + "epoch": 0.44, + "grad_norm": 0.17670756578445435, + "learning_rate": 0.00019423777785424748, + "loss": 1.0509, + "step": 1242 + }, + { + "epoch": 0.44, + "grad_norm": 0.18296687304973602, + "learning_rate": 0.0001942285108733166, + "loss": 1.0273, + "step": 1243 + }, + { + "epoch": 0.44, + "grad_norm": 0.17323486506938934, + "learning_rate": 0.0001942192366680734, + "loss": 1.0364, + "step": 1244 + }, + { + "epoch": 0.44, + "grad_norm": 0.18001830577850342, + "learning_rate": 0.00019420995523922895, + "loss": 1.0587, + "step": 1245 + }, + { + "epoch": 0.44, + "grad_norm": 0.18932732939720154, + "learning_rate": 0.00019420066658749481, + "loss": 1.0051, + "step": 1246 + }, + { + "epoch": 0.44, + "grad_norm": 0.18397623300552368, + "learning_rate": 0.00019419137071358312, + "loss": 0.9917, + "step": 1247 + }, + { + "epoch": 0.44, + "grad_norm": 0.1875317096710205, + "learning_rate": 0.0001941820676182066, + "loss": 1.052, + "step": 1248 + }, + { + "epoch": 0.44, + "grad_norm": 0.18426766991615295, + "learning_rate": 0.0001941727573020785, + "loss": 1.1202, + "step": 1249 + }, + { + "epoch": 0.44, + "grad_norm": 0.181876078248024, + "learning_rate": 0.00019416343976591261, + "loss": 1.0529, + "step": 1250 + }, + { + "epoch": 0.44, + "grad_norm": 0.17538008093833923, + "learning_rate": 0.00019415411501042326, + "loss": 0.9733, + "step": 1251 + }, + { + "epoch": 0.44, + "grad_norm": 0.16881094872951508, + "learning_rate": 0.00019414478303632544, + "loss": 1.0014, + "step": 1252 + }, + { + "epoch": 0.44, + "grad_norm": 0.17428913712501526, + "learning_rate": 0.00019413544384433453, + "loss": 1.0101, + "step": 1253 + }, + { + "epoch": 0.44, + "grad_norm": 0.16802147030830383, + "learning_rate": 0.00019412609743516657, + "loss": 1.0394, + "step": 1254 + }, + { + "epoch": 0.44, + "grad_norm": 0.18043246865272522, + "learning_rate": 0.00019411674380953817, + "loss": 0.997, + "step": 1255 + }, + { + "epoch": 0.44, + "grad_norm": 0.1805686354637146, + "learning_rate": 0.0001941073829681664, + "loss": 1.0549, + "step": 1256 + }, + { + "epoch": 0.44, + "grad_norm": 0.1995280236005783, + "learning_rate": 0.000194098014911769, + "loss": 1.0735, + "step": 1257 + }, + { + "epoch": 0.44, + "grad_norm": 0.174880713224411, + "learning_rate": 0.00019408863964106413, + "loss": 0.9948, + "step": 1258 + }, + { + "epoch": 0.44, + "grad_norm": 0.16793687641620636, + "learning_rate": 0.00019407925715677062, + "loss": 0.992, + "step": 1259 + }, + { + "epoch": 0.44, + "grad_norm": 0.19079770147800446, + "learning_rate": 0.0001940698674596078, + "loss": 1.0717, + "step": 1260 + }, + { + "epoch": 0.44, + "grad_norm": 0.18188640475273132, + "learning_rate": 0.00019406047055029553, + "loss": 1.0195, + "step": 1261 + }, + { + "epoch": 0.44, + "grad_norm": 0.18434570729732513, + "learning_rate": 0.00019405106642955428, + "loss": 1.05, + "step": 1262 + }, + { + "epoch": 0.44, + "grad_norm": 0.18857026100158691, + "learning_rate": 0.00019404165509810504, + "loss": 1.0704, + "step": 1263 + }, + { + "epoch": 0.45, + "grad_norm": 0.17161917686462402, + "learning_rate": 0.00019403223655666937, + "loss": 1.0103, + "step": 1264 + }, + { + "epoch": 0.45, + "grad_norm": 0.17019355297088623, + "learning_rate": 0.00019402281080596934, + "loss": 0.9766, + "step": 1265 + }, + { + "epoch": 0.45, + "grad_norm": 0.1757907271385193, + "learning_rate": 0.0001940133778467276, + "loss": 1.0475, + "step": 1266 + }, + { + "epoch": 0.45, + "grad_norm": 0.183481827378273, + "learning_rate": 0.00019400393767966744, + "loss": 1.0011, + "step": 1267 + }, + { + "epoch": 0.45, + "grad_norm": 0.1849091649055481, + "learning_rate": 0.0001939944903055125, + "loss": 1.029, + "step": 1268 + }, + { + "epoch": 0.45, + "grad_norm": 0.17956006526947021, + "learning_rate": 0.00019398503572498718, + "loss": 1.0034, + "step": 1269 + }, + { + "epoch": 0.45, + "grad_norm": 0.172621950507164, + "learning_rate": 0.00019397557393881633, + "loss": 1.0086, + "step": 1270 + }, + { + "epoch": 0.45, + "grad_norm": 0.18043522536754608, + "learning_rate": 0.00019396610494772532, + "loss": 1.025, + "step": 1271 + }, + { + "epoch": 0.45, + "grad_norm": 0.17680267989635468, + "learning_rate": 0.00019395662875244015, + "loss": 1.042, + "step": 1272 + }, + { + "epoch": 0.45, + "grad_norm": 0.17402620613574982, + "learning_rate": 0.00019394714535368732, + "loss": 0.9135, + "step": 1273 + }, + { + "epoch": 0.45, + "grad_norm": 0.1817319095134735, + "learning_rate": 0.00019393765475219393, + "loss": 1.0469, + "step": 1274 + }, + { + "epoch": 0.45, + "grad_norm": 0.18868154287338257, + "learning_rate": 0.0001939281569486876, + "loss": 1.0571, + "step": 1275 + }, + { + "epoch": 0.45, + "grad_norm": 0.16820600628852844, + "learning_rate": 0.0001939186519438965, + "loss": 1.0429, + "step": 1276 + }, + { + "epoch": 0.45, + "grad_norm": 0.1712152361869812, + "learning_rate": 0.00019390913973854938, + "loss": 0.9995, + "step": 1277 + }, + { + "epoch": 0.45, + "grad_norm": 0.1772497594356537, + "learning_rate": 0.00019389962033337552, + "loss": 0.9795, + "step": 1278 + }, + { + "epoch": 0.45, + "grad_norm": 0.17727500200271606, + "learning_rate": 0.0001938900937291047, + "loss": 0.9998, + "step": 1279 + }, + { + "epoch": 0.45, + "grad_norm": 0.17319205403327942, + "learning_rate": 0.00019388055992646737, + "loss": 0.9909, + "step": 1280 + }, + { + "epoch": 0.45, + "grad_norm": 0.1945336014032364, + "learning_rate": 0.00019387101892619443, + "loss": 1.0173, + "step": 1281 + }, + { + "epoch": 0.45, + "grad_norm": 0.18670299649238586, + "learning_rate": 0.0001938614707290174, + "loss": 1.023, + "step": 1282 + }, + { + "epoch": 0.45, + "grad_norm": 0.17243389785289764, + "learning_rate": 0.00019385191533566828, + "loss": 1.0632, + "step": 1283 + }, + { + "epoch": 0.45, + "grad_norm": 0.19126373529434204, + "learning_rate": 0.0001938423527468797, + "loss": 0.9837, + "step": 1284 + }, + { + "epoch": 0.45, + "grad_norm": 0.19175255298614502, + "learning_rate": 0.0001938327829633848, + "loss": 1.0314, + "step": 1285 + }, + { + "epoch": 0.45, + "grad_norm": 0.1695779263973236, + "learning_rate": 0.00019382320598591727, + "loss": 1.0078, + "step": 1286 + }, + { + "epoch": 0.45, + "grad_norm": 0.17427422106266022, + "learning_rate": 0.00019381362181521134, + "loss": 0.9935, + "step": 1287 + }, + { + "epoch": 0.45, + "grad_norm": 0.18275946378707886, + "learning_rate": 0.00019380403045200182, + "loss": 1.0181, + "step": 1288 + }, + { + "epoch": 0.45, + "grad_norm": 0.18247005343437195, + "learning_rate": 0.0001937944318970241, + "loss": 1.0173, + "step": 1289 + }, + { + "epoch": 0.45, + "grad_norm": 0.1783648580312729, + "learning_rate": 0.00019378482615101401, + "loss": 1.027, + "step": 1290 + }, + { + "epoch": 0.45, + "grad_norm": 0.18437276780605316, + "learning_rate": 0.00019377521321470805, + "loss": 1.0709, + "step": 1291 + }, + { + "epoch": 0.45, + "grad_norm": 0.1733713001012802, + "learning_rate": 0.00019376559308884324, + "loss": 1.0126, + "step": 1292 + }, + { + "epoch": 0.46, + "grad_norm": 0.17454306781291962, + "learning_rate": 0.0001937559657741571, + "loss": 1.0644, + "step": 1293 + }, + { + "epoch": 0.46, + "grad_norm": 0.18214353919029236, + "learning_rate": 0.00019374633127138772, + "loss": 0.9976, + "step": 1294 + }, + { + "epoch": 0.46, + "grad_norm": 0.1960800439119339, + "learning_rate": 0.00019373668958127383, + "loss": 1.0927, + "step": 1295 + }, + { + "epoch": 0.46, + "grad_norm": 0.18307611346244812, + "learning_rate": 0.00019372704070455456, + "loss": 1.0643, + "step": 1296 + }, + { + "epoch": 0.46, + "grad_norm": 0.17762793600559235, + "learning_rate": 0.0001937173846419697, + "loss": 0.9654, + "step": 1297 + }, + { + "epoch": 0.46, + "grad_norm": 0.1836734414100647, + "learning_rate": 0.00019370772139425962, + "loss": 0.9957, + "step": 1298 + }, + { + "epoch": 0.46, + "grad_norm": 0.17774513363838196, + "learning_rate": 0.00019369805096216509, + "loss": 1.0072, + "step": 1299 + }, + { + "epoch": 0.46, + "grad_norm": 0.1814611703157425, + "learning_rate": 0.00019368837334642758, + "loss": 1.0244, + "step": 1300 + }, + { + "epoch": 0.46, + "grad_norm": 0.4324641227722168, + "learning_rate": 0.00019367868854778904, + "loss": 1.035, + "step": 1301 + }, + { + "epoch": 0.46, + "grad_norm": 0.1968534290790558, + "learning_rate": 0.00019366899656699196, + "loss": 0.9609, + "step": 1302 + }, + { + "epoch": 0.46, + "grad_norm": 0.18885602056980133, + "learning_rate": 0.0001936592974047794, + "loss": 1.0719, + "step": 1303 + }, + { + "epoch": 0.46, + "grad_norm": 0.1806947886943817, + "learning_rate": 0.00019364959106189504, + "loss": 1.0308, + "step": 1304 + }, + { + "epoch": 0.46, + "grad_norm": 0.1710716187953949, + "learning_rate": 0.000193639877539083, + "loss": 0.999, + "step": 1305 + }, + { + "epoch": 0.46, + "grad_norm": 0.17838624119758606, + "learning_rate": 0.000193630156837088, + "loss": 0.9507, + "step": 1306 + }, + { + "epoch": 0.46, + "grad_norm": 0.17988012731075287, + "learning_rate": 0.0001936204289566553, + "loss": 1.0071, + "step": 1307 + }, + { + "epoch": 0.46, + "grad_norm": 0.1811203956604004, + "learning_rate": 0.00019361069389853074, + "loss": 1.0272, + "step": 1308 + }, + { + "epoch": 0.46, + "grad_norm": 0.1934630423784256, + "learning_rate": 0.00019360095166346067, + "loss": 1.0003, + "step": 1309 + }, + { + "epoch": 0.46, + "grad_norm": 0.1737378090620041, + "learning_rate": 0.00019359120225219203, + "loss": 1.0585, + "step": 1310 + }, + { + "epoch": 0.46, + "grad_norm": 0.2015053778886795, + "learning_rate": 0.00019358144566547226, + "loss": 1.0141, + "step": 1311 + }, + { + "epoch": 0.46, + "grad_norm": 0.1798975020647049, + "learning_rate": 0.00019357168190404936, + "loss": 1.0133, + "step": 1312 + }, + { + "epoch": 0.46, + "grad_norm": 0.18216365575790405, + "learning_rate": 0.00019356191096867197, + "loss": 1.0467, + "step": 1313 + }, + { + "epoch": 0.46, + "grad_norm": 0.1877415031194687, + "learning_rate": 0.00019355213286008916, + "loss": 1.081, + "step": 1314 + }, + { + "epoch": 0.46, + "grad_norm": 0.18899911642074585, + "learning_rate": 0.00019354234757905055, + "loss": 1.0421, + "step": 1315 + }, + { + "epoch": 0.46, + "grad_norm": 0.18335410952568054, + "learning_rate": 0.00019353255512630647, + "loss": 0.9673, + "step": 1316 + }, + { + "epoch": 0.46, + "grad_norm": 0.1879153847694397, + "learning_rate": 0.00019352275550260759, + "loss": 1.028, + "step": 1317 + }, + { + "epoch": 0.46, + "grad_norm": 0.18537075817584991, + "learning_rate": 0.00019351294870870528, + "loss": 0.984, + "step": 1318 + }, + { + "epoch": 0.46, + "grad_norm": 0.18217268586158752, + "learning_rate": 0.0001935031347453514, + "loss": 1.0041, + "step": 1319 + }, + { + "epoch": 0.46, + "grad_norm": 0.1934812366962433, + "learning_rate": 0.00019349331361329835, + "loss": 1.0278, + "step": 1320 + }, + { + "epoch": 0.47, + "grad_norm": 0.19314013421535492, + "learning_rate": 0.0001934834853132991, + "loss": 1.0594, + "step": 1321 + }, + { + "epoch": 0.47, + "grad_norm": 0.18223635852336884, + "learning_rate": 0.00019347364984610718, + "loss": 1.0376, + "step": 1322 + }, + { + "epoch": 0.47, + "grad_norm": 0.1895359456539154, + "learning_rate": 0.00019346380721247669, + "loss": 1.0477, + "step": 1323 + }, + { + "epoch": 0.47, + "grad_norm": 0.19515518844127655, + "learning_rate": 0.00019345395741316216, + "loss": 1.0081, + "step": 1324 + }, + { + "epoch": 0.47, + "grad_norm": 0.17444954812526703, + "learning_rate": 0.0001934441004489188, + "loss": 0.9642, + "step": 1325 + }, + { + "epoch": 0.47, + "grad_norm": 0.1794186234474182, + "learning_rate": 0.00019343423632050232, + "loss": 0.952, + "step": 1326 + }, + { + "epoch": 0.47, + "grad_norm": 0.1817828118801117, + "learning_rate": 0.00019342436502866898, + "loss": 1.0201, + "step": 1327 + }, + { + "epoch": 0.47, + "grad_norm": 0.18109366297721863, + "learning_rate": 0.0001934144865741756, + "loss": 1.0059, + "step": 1328 + }, + { + "epoch": 0.47, + "grad_norm": 0.18579837679862976, + "learning_rate": 0.00019340460095777957, + "loss": 1.0449, + "step": 1329 + }, + { + "epoch": 0.47, + "grad_norm": 0.17582041025161743, + "learning_rate": 0.00019339470818023876, + "loss": 0.9718, + "step": 1330 + }, + { + "epoch": 0.47, + "grad_norm": 0.1840338110923767, + "learning_rate": 0.00019338480824231162, + "loss": 1.0338, + "step": 1331 + }, + { + "epoch": 0.47, + "grad_norm": 0.18756501376628876, + "learning_rate": 0.00019337490114475719, + "loss": 1.0245, + "step": 1332 + }, + { + "epoch": 0.47, + "grad_norm": 0.1740320324897766, + "learning_rate": 0.000193364986888335, + "loss": 0.9619, + "step": 1333 + }, + { + "epoch": 0.47, + "grad_norm": 0.18331877887248993, + "learning_rate": 0.00019335506547380518, + "loss": 0.9684, + "step": 1334 + }, + { + "epoch": 0.47, + "grad_norm": 0.18273407220840454, + "learning_rate": 0.0001933451369019284, + "loss": 1.0624, + "step": 1335 + }, + { + "epoch": 0.47, + "grad_norm": 0.17652833461761475, + "learning_rate": 0.0001933352011734658, + "loss": 0.9889, + "step": 1336 + }, + { + "epoch": 0.47, + "grad_norm": 0.18905626237392426, + "learning_rate": 0.0001933252582891792, + "loss": 1.0827, + "step": 1337 + }, + { + "epoch": 0.47, + "grad_norm": 0.17634400725364685, + "learning_rate": 0.00019331530824983085, + "loss": 0.9897, + "step": 1338 + }, + { + "epoch": 0.47, + "grad_norm": 0.18231314420700073, + "learning_rate": 0.00019330535105618367, + "loss": 1.0558, + "step": 1339 + }, + { + "epoch": 0.47, + "grad_norm": 0.19120241701602936, + "learning_rate": 0.00019329538670900096, + "loss": 1.027, + "step": 1340 + }, + { + "epoch": 0.47, + "grad_norm": 0.1788601577281952, + "learning_rate": 0.00019328541520904677, + "loss": 1.0448, + "step": 1341 + }, + { + "epoch": 0.47, + "grad_norm": 0.1818162500858307, + "learning_rate": 0.0001932754365570855, + "loss": 1.0101, + "step": 1342 + }, + { + "epoch": 0.47, + "grad_norm": 0.193072110414505, + "learning_rate": 0.00019326545075388225, + "loss": 1.1396, + "step": 1343 + }, + { + "epoch": 0.47, + "grad_norm": 0.1790308952331543, + "learning_rate": 0.0001932554578002026, + "loss": 0.9894, + "step": 1344 + }, + { + "epoch": 0.47, + "grad_norm": 0.19239982962608337, + "learning_rate": 0.0001932454576968127, + "loss": 1.0331, + "step": 1345 + }, + { + "epoch": 0.47, + "grad_norm": 0.1802983582019806, + "learning_rate": 0.0001932354504444792, + "loss": 1.0169, + "step": 1346 + }, + { + "epoch": 0.47, + "grad_norm": 0.18328198790550232, + "learning_rate": 0.0001932254360439694, + "loss": 1.0175, + "step": 1347 + }, + { + "epoch": 0.47, + "grad_norm": 0.17947252094745636, + "learning_rate": 0.000193215414496051, + "loss": 1.0286, + "step": 1348 + }, + { + "epoch": 0.48, + "grad_norm": 0.18461839854717255, + "learning_rate": 0.00019320538580149243, + "loss": 1.0737, + "step": 1349 + }, + { + "epoch": 0.48, + "grad_norm": 0.18745096027851105, + "learning_rate": 0.0001931953499610625, + "loss": 1.011, + "step": 1350 + }, + { + "epoch": 0.48, + "grad_norm": 0.1850559115409851, + "learning_rate": 0.00019318530697553062, + "loss": 1.0351, + "step": 1351 + }, + { + "epoch": 0.48, + "grad_norm": 0.17920838296413422, + "learning_rate": 0.00019317525684566685, + "loss": 0.9879, + "step": 1352 + }, + { + "epoch": 0.48, + "grad_norm": 0.18742653727531433, + "learning_rate": 0.00019316519957224165, + "loss": 0.9737, + "step": 1353 + }, + { + "epoch": 0.48, + "grad_norm": 0.18532788753509521, + "learning_rate": 0.00019315513515602613, + "loss": 0.9625, + "step": 1354 + }, + { + "epoch": 0.48, + "grad_norm": 0.17812992632389069, + "learning_rate": 0.00019314506359779186, + "loss": 0.9914, + "step": 1355 + }, + { + "epoch": 0.48, + "grad_norm": 0.17627066373825073, + "learning_rate": 0.00019313498489831102, + "loss": 1.0321, + "step": 1356 + }, + { + "epoch": 0.48, + "grad_norm": 0.18436536192893982, + "learning_rate": 0.00019312489905835636, + "loss": 1.0064, + "step": 1357 + }, + { + "epoch": 0.48, + "grad_norm": 0.18245936930179596, + "learning_rate": 0.00019311480607870108, + "loss": 1.0026, + "step": 1358 + }, + { + "epoch": 0.48, + "grad_norm": 0.1713896244764328, + "learning_rate": 0.00019310470596011907, + "loss": 0.942, + "step": 1359 + }, + { + "epoch": 0.48, + "grad_norm": 0.17367751896381378, + "learning_rate": 0.00019309459870338463, + "loss": 0.9872, + "step": 1360 + }, + { + "epoch": 0.48, + "grad_norm": 0.1733555942773819, + "learning_rate": 0.0001930844843092727, + "loss": 1.0109, + "step": 1361 + }, + { + "epoch": 0.48, + "grad_norm": 0.17760616540908813, + "learning_rate": 0.00019307436277855866, + "loss": 1.0196, + "step": 1362 + }, + { + "epoch": 0.48, + "grad_norm": 0.1863793581724167, + "learning_rate": 0.00019306423411201857, + "loss": 1.05, + "step": 1363 + }, + { + "epoch": 0.48, + "grad_norm": 0.1735723912715912, + "learning_rate": 0.00019305409831042898, + "loss": 1.0202, + "step": 1364 + }, + { + "epoch": 0.48, + "grad_norm": 0.18256759643554688, + "learning_rate": 0.00019304395537456692, + "loss": 1.0575, + "step": 1365 + }, + { + "epoch": 0.48, + "grad_norm": 0.1789254993200302, + "learning_rate": 0.00019303380530521008, + "loss": 1.0937, + "step": 1366 + }, + { + "epoch": 0.48, + "grad_norm": 0.17587143182754517, + "learning_rate": 0.00019302364810313668, + "loss": 1.0526, + "step": 1367 + }, + { + "epoch": 0.48, + "grad_norm": 0.17091909050941467, + "learning_rate": 0.00019301348376912536, + "loss": 0.9902, + "step": 1368 + }, + { + "epoch": 0.48, + "grad_norm": 0.18148952722549438, + "learning_rate": 0.00019300331230395545, + "loss": 1.029, + "step": 1369 + }, + { + "epoch": 0.48, + "grad_norm": 0.1739834100008011, + "learning_rate": 0.00019299313370840676, + "loss": 1.0009, + "step": 1370 + }, + { + "epoch": 0.48, + "grad_norm": 0.18367813527584076, + "learning_rate": 0.0001929829479832597, + "loss": 1.0411, + "step": 1371 + }, + { + "epoch": 0.48, + "grad_norm": 0.17575769126415253, + "learning_rate": 0.00019297275512929516, + "loss": 1.0155, + "step": 1372 + }, + { + "epoch": 0.48, + "grad_norm": 0.1805105209350586, + "learning_rate": 0.0001929625551472946, + "loss": 1.0153, + "step": 1373 + }, + { + "epoch": 0.48, + "grad_norm": 0.17204175889492035, + "learning_rate": 0.00019295234803804004, + "loss": 0.9689, + "step": 1374 + }, + { + "epoch": 0.48, + "grad_norm": 0.1902618110179901, + "learning_rate": 0.00019294213380231402, + "loss": 0.9915, + "step": 1375 + }, + { + "epoch": 0.48, + "grad_norm": 0.17897525429725647, + "learning_rate": 0.00019293191244089972, + "loss": 0.9946, + "step": 1376 + }, + { + "epoch": 0.48, + "grad_norm": 0.17735904455184937, + "learning_rate": 0.0001929216839545807, + "loss": 0.963, + "step": 1377 + }, + { + "epoch": 0.49, + "grad_norm": 0.1746383160352707, + "learning_rate": 0.0001929114483441412, + "loss": 0.9751, + "step": 1378 + }, + { + "epoch": 0.49, + "grad_norm": 0.17372174561023712, + "learning_rate": 0.000192901205610366, + "loss": 1.0304, + "step": 1379 + }, + { + "epoch": 0.49, + "grad_norm": 0.17814914882183075, + "learning_rate": 0.0001928909557540403, + "loss": 0.9754, + "step": 1380 + }, + { + "epoch": 0.49, + "grad_norm": 0.17424102127552032, + "learning_rate": 0.00019288069877594998, + "loss": 1.0329, + "step": 1381 + }, + { + "epoch": 0.49, + "grad_norm": 0.17923283576965332, + "learning_rate": 0.00019287043467688145, + "loss": 1.0678, + "step": 1382 + }, + { + "epoch": 0.49, + "grad_norm": 0.17741328477859497, + "learning_rate": 0.00019286016345762162, + "loss": 0.9972, + "step": 1383 + }, + { + "epoch": 0.49, + "grad_norm": 0.1734980046749115, + "learning_rate": 0.00019284988511895793, + "loss": 1.0207, + "step": 1384 + }, + { + "epoch": 0.49, + "grad_norm": 0.17344945669174194, + "learning_rate": 0.00019283959966167843, + "loss": 0.98, + "step": 1385 + }, + { + "epoch": 0.49, + "grad_norm": 0.18832659721374512, + "learning_rate": 0.00019282930708657169, + "loss": 1.0533, + "step": 1386 + }, + { + "epoch": 0.49, + "grad_norm": 0.18710127472877502, + "learning_rate": 0.00019281900739442684, + "loss": 1.0172, + "step": 1387 + }, + { + "epoch": 0.49, + "grad_norm": 0.18463793396949768, + "learning_rate": 0.0001928087005860335, + "loss": 1.0912, + "step": 1388 + }, + { + "epoch": 0.49, + "grad_norm": 0.1741846352815628, + "learning_rate": 0.00019279838666218187, + "loss": 0.9459, + "step": 1389 + }, + { + "epoch": 0.49, + "grad_norm": 0.18016698956489563, + "learning_rate": 0.00019278806562366275, + "loss": 1.0024, + "step": 1390 + }, + { + "epoch": 0.49, + "grad_norm": 0.18188920617103577, + "learning_rate": 0.00019277773747126736, + "loss": 1.0591, + "step": 1391 + }, + { + "epoch": 0.49, + "grad_norm": 0.1762164831161499, + "learning_rate": 0.00019276740220578758, + "loss": 1.0583, + "step": 1392 + }, + { + "epoch": 0.49, + "grad_norm": 0.17220334708690643, + "learning_rate": 0.00019275705982801578, + "loss": 1.0282, + "step": 1393 + }, + { + "epoch": 0.49, + "grad_norm": 0.1795509308576584, + "learning_rate": 0.00019274671033874492, + "loss": 1.0277, + "step": 1394 + }, + { + "epoch": 0.49, + "grad_norm": 0.17990824580192566, + "learning_rate": 0.00019273635373876844, + "loss": 1.0421, + "step": 1395 + }, + { + "epoch": 0.49, + "grad_norm": 0.18492265045642853, + "learning_rate": 0.0001927259900288804, + "loss": 0.9711, + "step": 1396 + }, + { + "epoch": 0.49, + "grad_norm": 0.18191231787204742, + "learning_rate": 0.0001927156192098753, + "loss": 1.0157, + "step": 1397 + }, + { + "epoch": 0.49, + "grad_norm": 0.1850530356168747, + "learning_rate": 0.0001927052412825483, + "loss": 1.0406, + "step": 1398 + }, + { + "epoch": 0.49, + "grad_norm": 0.19039122760295868, + "learning_rate": 0.00019269485624769507, + "loss": 0.9688, + "step": 1399 + }, + { + "epoch": 0.49, + "grad_norm": 0.17096638679504395, + "learning_rate": 0.00019268446410611176, + "loss": 1.0016, + "step": 1400 + }, + { + "epoch": 0.49, + "grad_norm": 0.19254373013973236, + "learning_rate": 0.00019267406485859513, + "loss": 1.0246, + "step": 1401 + }, + { + "epoch": 0.49, + "grad_norm": 0.1767900288105011, + "learning_rate": 0.0001926636585059425, + "loss": 1.0083, + "step": 1402 + }, + { + "epoch": 0.49, + "grad_norm": 0.18407025933265686, + "learning_rate": 0.00019265324504895168, + "loss": 1.012, + "step": 1403 + }, + { + "epoch": 0.49, + "grad_norm": 0.18007557094097137, + "learning_rate": 0.00019264282448842103, + "loss": 1.0839, + "step": 1404 + }, + { + "epoch": 0.49, + "grad_norm": 0.18103165924549103, + "learning_rate": 0.00019263239682514952, + "loss": 1.0383, + "step": 1405 + }, + { + "epoch": 0.5, + "grad_norm": 0.18077121675014496, + "learning_rate": 0.00019262196205993657, + "loss": 1.0107, + "step": 1406 + }, + { + "epoch": 0.5, + "grad_norm": 0.17406535148620605, + "learning_rate": 0.00019261152019358222, + "loss": 1.0433, + "step": 1407 + }, + { + "epoch": 0.5, + "grad_norm": 0.1833508461713791, + "learning_rate": 0.00019260107122688705, + "loss": 1.0356, + "step": 1408 + }, + { + "epoch": 0.5, + "grad_norm": 0.17652858793735504, + "learning_rate": 0.0001925906151606521, + "loss": 0.9974, + "step": 1409 + }, + { + "epoch": 0.5, + "grad_norm": 0.17765457928180695, + "learning_rate": 0.00019258015199567904, + "loss": 0.9817, + "step": 1410 + }, + { + "epoch": 0.5, + "grad_norm": 0.1752379983663559, + "learning_rate": 0.0001925696817327701, + "loss": 0.9987, + "step": 1411 + }, + { + "epoch": 0.5, + "grad_norm": 0.17794077098369598, + "learning_rate": 0.000192559204372728, + "loss": 0.9804, + "step": 1412 + }, + { + "epoch": 0.5, + "grad_norm": 0.1826150268316269, + "learning_rate": 0.00019254871991635598, + "loss": 1.0078, + "step": 1413 + }, + { + "epoch": 0.5, + "grad_norm": 0.18269848823547363, + "learning_rate": 0.00019253822836445787, + "loss": 0.99, + "step": 1414 + }, + { + "epoch": 0.5, + "grad_norm": 0.19174599647521973, + "learning_rate": 0.00019252772971783806, + "loss": 1.0228, + "step": 1415 + }, + { + "epoch": 0.5, + "grad_norm": 0.1771102249622345, + "learning_rate": 0.00019251722397730148, + "loss": 0.9811, + "step": 1416 + }, + { + "epoch": 0.5, + "grad_norm": 0.17833948135375977, + "learning_rate": 0.00019250671114365352, + "loss": 1.0279, + "step": 1417 + }, + { + "epoch": 0.5, + "grad_norm": 0.1791437864303589, + "learning_rate": 0.0001924961912177002, + "loss": 1.0045, + "step": 1418 + }, + { + "epoch": 0.5, + "grad_norm": 0.1782650500535965, + "learning_rate": 0.00019248566420024813, + "loss": 0.962, + "step": 1419 + }, + { + "epoch": 0.5, + "grad_norm": 0.17893794178962708, + "learning_rate": 0.00019247513009210433, + "loss": 1.0527, + "step": 1420 + }, + { + "epoch": 0.5, + "eval_loss": 1.0163156986236572, + "eval_runtime": 680.0523, + "eval_samples_per_second": 10.111, + "eval_steps_per_second": 5.055, + "step": 1420 + }, + { + "epoch": 0.5, + "grad_norm": 0.1815989464521408, + "learning_rate": 0.00019246458889407643, + "loss": 1.0524, + "step": 1421 + }, + { + "epoch": 0.5, + "grad_norm": 0.17999018728733063, + "learning_rate": 0.0001924540406069726, + "loss": 0.9832, + "step": 1422 + }, + { + "epoch": 0.5, + "grad_norm": 0.18894697725772858, + "learning_rate": 0.00019244348523160155, + "loss": 1.0769, + "step": 1423 + }, + { + "epoch": 0.5, + "grad_norm": 0.1834978312253952, + "learning_rate": 0.00019243292276877258, + "loss": 1.0129, + "step": 1424 + }, + { + "epoch": 0.5, + "grad_norm": 0.1804187297821045, + "learning_rate": 0.00019242235321929548, + "loss": 1.0363, + "step": 1425 + }, + { + "epoch": 0.5, + "grad_norm": 0.17544397711753845, + "learning_rate": 0.0001924117765839806, + "loss": 0.9883, + "step": 1426 + }, + { + "epoch": 0.5, + "grad_norm": 0.18312135338783264, + "learning_rate": 0.0001924011928636388, + "loss": 0.9859, + "step": 1427 + }, + { + "epoch": 0.5, + "grad_norm": 0.17146514356136322, + "learning_rate": 0.00019239060205908154, + "loss": 0.9585, + "step": 1428 + }, + { + "epoch": 0.5, + "grad_norm": 0.18334969878196716, + "learning_rate": 0.00019238000417112078, + "loss": 1.0252, + "step": 1429 + }, + { + "epoch": 0.5, + "grad_norm": 0.180964857339859, + "learning_rate": 0.00019236939920056902, + "loss": 1.0354, + "step": 1430 + }, + { + "epoch": 0.5, + "grad_norm": 0.19952955842018127, + "learning_rate": 0.0001923587871482394, + "loss": 1.0261, + "step": 1431 + }, + { + "epoch": 0.5, + "grad_norm": 0.18565645813941956, + "learning_rate": 0.00019234816801494542, + "loss": 0.9784, + "step": 1432 + }, + { + "epoch": 0.5, + "grad_norm": 0.18717040121555328, + "learning_rate": 0.00019233754180150129, + "loss": 0.9832, + "step": 1433 + }, + { + "epoch": 0.5, + "grad_norm": 0.18185356259346008, + "learning_rate": 0.00019232690850872172, + "loss": 1.021, + "step": 1434 + }, + { + "epoch": 0.51, + "grad_norm": 0.17485785484313965, + "learning_rate": 0.0001923162681374219, + "loss": 0.9975, + "step": 1435 + }, + { + "epoch": 0.51, + "grad_norm": 0.18476179242134094, + "learning_rate": 0.0001923056206884176, + "loss": 0.9592, + "step": 1436 + }, + { + "epoch": 0.51, + "grad_norm": 0.18408514559268951, + "learning_rate": 0.0001922949661625252, + "loss": 1.0284, + "step": 1437 + }, + { + "epoch": 0.51, + "grad_norm": 0.1944282501935959, + "learning_rate": 0.00019228430456056148, + "loss": 1.0313, + "step": 1438 + }, + { + "epoch": 0.51, + "grad_norm": 0.1734023094177246, + "learning_rate": 0.0001922736358833439, + "loss": 1.0098, + "step": 1439 + }, + { + "epoch": 0.51, + "grad_norm": 0.18978969752788544, + "learning_rate": 0.00019226296013169042, + "loss": 0.9933, + "step": 1440 + }, + { + "epoch": 0.51, + "grad_norm": 0.18699029088020325, + "learning_rate": 0.00019225227730641943, + "loss": 1.0292, + "step": 1441 + }, + { + "epoch": 0.51, + "grad_norm": 0.17445528507232666, + "learning_rate": 0.00019224158740835007, + "loss": 1.0196, + "step": 1442 + }, + { + "epoch": 0.51, + "grad_norm": 0.17488212883472443, + "learning_rate": 0.0001922308904383019, + "loss": 1.0401, + "step": 1443 + }, + { + "epoch": 0.51, + "grad_norm": 0.1726742386817932, + "learning_rate": 0.00019222018639709497, + "loss": 0.9805, + "step": 1444 + }, + { + "epoch": 0.51, + "grad_norm": 0.17496153712272644, + "learning_rate": 0.00019220947528554997, + "loss": 0.9819, + "step": 1445 + }, + { + "epoch": 0.51, + "grad_norm": 0.18209873139858246, + "learning_rate": 0.00019219875710448813, + "loss": 1.0321, + "step": 1446 + }, + { + "epoch": 0.51, + "grad_norm": 0.17869645357131958, + "learning_rate": 0.00019218803185473116, + "loss": 0.941, + "step": 1447 + }, + { + "epoch": 0.51, + "grad_norm": 0.19009461998939514, + "learning_rate": 0.00019217729953710134, + "loss": 1.0568, + "step": 1448 + }, + { + "epoch": 0.51, + "grad_norm": 0.1776958853006363, + "learning_rate": 0.0001921665601524215, + "loss": 1.0039, + "step": 1449 + }, + { + "epoch": 0.51, + "grad_norm": 0.18249039351940155, + "learning_rate": 0.00019215581370151505, + "loss": 0.9773, + "step": 1450 + }, + { + "epoch": 0.51, + "grad_norm": 0.19207897782325745, + "learning_rate": 0.00019214506018520582, + "loss": 1.0134, + "step": 1451 + }, + { + "epoch": 0.51, + "grad_norm": 0.18842819333076477, + "learning_rate": 0.00019213429960431832, + "loss": 1.0259, + "step": 1452 + }, + { + "epoch": 0.51, + "grad_norm": 0.18662334978580475, + "learning_rate": 0.00019212353195967752, + "loss": 0.9814, + "step": 1453 + }, + { + "epoch": 0.51, + "grad_norm": 0.18366405367851257, + "learning_rate": 0.00019211275725210898, + "loss": 1.0715, + "step": 1454 + }, + { + "epoch": 0.51, + "grad_norm": 0.18426047265529633, + "learning_rate": 0.00019210197548243874, + "loss": 1.0182, + "step": 1455 + }, + { + "epoch": 0.51, + "grad_norm": 0.17692065238952637, + "learning_rate": 0.00019209118665149344, + "loss": 0.9778, + "step": 1456 + }, + { + "epoch": 0.51, + "grad_norm": 0.1945948749780655, + "learning_rate": 0.0001920803907601002, + "loss": 1.0043, + "step": 1457 + }, + { + "epoch": 0.51, + "grad_norm": 0.17809897661209106, + "learning_rate": 0.00019206958780908679, + "loss": 1.0542, + "step": 1458 + }, + { + "epoch": 0.51, + "grad_norm": 0.18257014453411102, + "learning_rate": 0.00019205877779928136, + "loss": 1.0037, + "step": 1459 + }, + { + "epoch": 0.51, + "grad_norm": 0.1871815174818039, + "learning_rate": 0.00019204796073151278, + "loss": 1.0409, + "step": 1460 + }, + { + "epoch": 0.51, + "grad_norm": 0.17258718609809875, + "learning_rate": 0.00019203713660661032, + "loss": 0.9901, + "step": 1461 + }, + { + "epoch": 0.51, + "grad_norm": 0.1797480434179306, + "learning_rate": 0.00019202630542540386, + "loss": 1.0053, + "step": 1462 + }, + { + "epoch": 0.52, + "grad_norm": 0.18446534872055054, + "learning_rate": 0.00019201546718872382, + "loss": 1.0494, + "step": 1463 + }, + { + "epoch": 0.52, + "grad_norm": 0.1935523897409439, + "learning_rate": 0.0001920046218974011, + "loss": 1.0355, + "step": 1464 + }, + { + "epoch": 0.52, + "grad_norm": 0.1764344573020935, + "learning_rate": 0.00019199376955226724, + "loss": 0.9837, + "step": 1465 + }, + { + "epoch": 0.52, + "grad_norm": 0.17583085596561432, + "learning_rate": 0.00019198291015415424, + "loss": 1.0092, + "step": 1466 + }, + { + "epoch": 0.52, + "grad_norm": 0.18314938247203827, + "learning_rate": 0.00019197204370389467, + "loss": 1.0004, + "step": 1467 + }, + { + "epoch": 0.52, + "grad_norm": 0.1774096041917801, + "learning_rate": 0.00019196117020232165, + "loss": 0.9551, + "step": 1468 + }, + { + "epoch": 0.52, + "grad_norm": 0.18138380348682404, + "learning_rate": 0.00019195028965026883, + "loss": 0.996, + "step": 1469 + }, + { + "epoch": 0.52, + "grad_norm": 0.1724422425031662, + "learning_rate": 0.00019193940204857035, + "loss": 0.9521, + "step": 1470 + }, + { + "epoch": 0.52, + "grad_norm": 0.18512949347496033, + "learning_rate": 0.000191928507398061, + "loss": 1.0161, + "step": 1471 + }, + { + "epoch": 0.52, + "grad_norm": 0.17651984095573425, + "learning_rate": 0.00019191760569957605, + "loss": 1.0599, + "step": 1472 + }, + { + "epoch": 0.52, + "grad_norm": 0.1797732263803482, + "learning_rate": 0.00019190669695395127, + "loss": 1.0212, + "step": 1473 + }, + { + "epoch": 0.52, + "grad_norm": 0.18342822790145874, + "learning_rate": 0.00019189578116202307, + "loss": 1.0642, + "step": 1474 + }, + { + "epoch": 0.52, + "grad_norm": 0.17123663425445557, + "learning_rate": 0.0001918848583246283, + "loss": 1.0349, + "step": 1475 + }, + { + "epoch": 0.52, + "grad_norm": 0.17873786389827728, + "learning_rate": 0.00019187392844260438, + "loss": 0.994, + "step": 1476 + }, + { + "epoch": 0.52, + "grad_norm": 0.18375077843666077, + "learning_rate": 0.00019186299151678933, + "loss": 0.9608, + "step": 1477 + }, + { + "epoch": 0.52, + "grad_norm": 0.1748666614294052, + "learning_rate": 0.00019185204754802162, + "loss": 0.9826, + "step": 1478 + }, + { + "epoch": 0.52, + "grad_norm": 0.17497992515563965, + "learning_rate": 0.0001918410965371403, + "loss": 0.9611, + "step": 1479 + }, + { + "epoch": 0.52, + "grad_norm": 0.17814147472381592, + "learning_rate": 0.00019183013848498505, + "loss": 0.9753, + "step": 1480 + }, + { + "epoch": 0.52, + "grad_norm": 0.18198871612548828, + "learning_rate": 0.00019181917339239587, + "loss": 1.0253, + "step": 1481 + }, + { + "epoch": 0.52, + "grad_norm": 0.1947755515575409, + "learning_rate": 0.0001918082012602135, + "loss": 1.0288, + "step": 1482 + }, + { + "epoch": 0.52, + "grad_norm": 0.1716606318950653, + "learning_rate": 0.00019179722208927916, + "loss": 1.0063, + "step": 1483 + }, + { + "epoch": 0.52, + "grad_norm": 0.17634867131710052, + "learning_rate": 0.0001917862358804346, + "loss": 0.9734, + "step": 1484 + }, + { + "epoch": 0.52, + "grad_norm": 0.16377978026866913, + "learning_rate": 0.00019177524263452208, + "loss": 0.8907, + "step": 1485 + }, + { + "epoch": 0.52, + "grad_norm": 0.1761971414089203, + "learning_rate": 0.00019176424235238445, + "loss": 0.9301, + "step": 1486 + }, + { + "epoch": 0.52, + "grad_norm": 0.1791556477546692, + "learning_rate": 0.00019175323503486507, + "loss": 1.0068, + "step": 1487 + }, + { + "epoch": 0.52, + "grad_norm": 0.18452045321464539, + "learning_rate": 0.00019174222068280784, + "loss": 1.0406, + "step": 1488 + }, + { + "epoch": 0.52, + "grad_norm": 0.17810030281543732, + "learning_rate": 0.00019173119929705726, + "loss": 1.0032, + "step": 1489 + }, + { + "epoch": 0.52, + "grad_norm": 0.18171797692775726, + "learning_rate": 0.00019172017087845826, + "loss": 1.0213, + "step": 1490 + }, + { + "epoch": 0.53, + "grad_norm": 0.19251057505607605, + "learning_rate": 0.0001917091354278564, + "loss": 1.0263, + "step": 1491 + }, + { + "epoch": 0.53, + "grad_norm": 0.1829746812582016, + "learning_rate": 0.00019169809294609772, + "loss": 1.0505, + "step": 1492 + }, + { + "epoch": 0.53, + "grad_norm": 0.1797158122062683, + "learning_rate": 0.00019168704343402887, + "loss": 1.0091, + "step": 1493 + }, + { + "epoch": 0.53, + "grad_norm": 0.1812933087348938, + "learning_rate": 0.00019167598689249696, + "loss": 1.0804, + "step": 1494 + }, + { + "epoch": 0.53, + "grad_norm": 0.17717424035072327, + "learning_rate": 0.00019166492332234968, + "loss": 1.017, + "step": 1495 + }, + { + "epoch": 0.53, + "grad_norm": 0.1779852956533432, + "learning_rate": 0.00019165385272443524, + "loss": 0.9392, + "step": 1496 + }, + { + "epoch": 0.53, + "grad_norm": 0.18181325495243073, + "learning_rate": 0.0001916427750996024, + "loss": 0.9957, + "step": 1497 + }, + { + "epoch": 0.53, + "grad_norm": 0.18540653586387634, + "learning_rate": 0.0001916316904487005, + "loss": 0.9936, + "step": 1498 + }, + { + "epoch": 0.53, + "grad_norm": 0.1858024299144745, + "learning_rate": 0.00019162059877257933, + "loss": 0.9896, + "step": 1499 + }, + { + "epoch": 0.53, + "grad_norm": 0.187087744474411, + "learning_rate": 0.00019160950007208926, + "loss": 1.0209, + "step": 1500 + }, + { + "epoch": 0.53, + "grad_norm": 0.18074601888656616, + "learning_rate": 0.00019159839434808125, + "loss": 0.9911, + "step": 1501 + }, + { + "epoch": 0.53, + "grad_norm": 0.18445174396038055, + "learning_rate": 0.00019158728160140675, + "loss": 1.0246, + "step": 1502 + }, + { + "epoch": 0.53, + "grad_norm": 0.18151141703128815, + "learning_rate": 0.00019157616183291772, + "loss": 1.0255, + "step": 1503 + }, + { + "epoch": 0.53, + "grad_norm": 0.1752597987651825, + "learning_rate": 0.00019156503504346673, + "loss": 0.9441, + "step": 1504 + }, + { + "epoch": 0.53, + "grad_norm": 0.1818474382162094, + "learning_rate": 0.0001915539012339068, + "loss": 0.9716, + "step": 1505 + }, + { + "epoch": 0.53, + "grad_norm": 0.18579250574111938, + "learning_rate": 0.00019154276040509157, + "loss": 1.0682, + "step": 1506 + }, + { + "epoch": 0.53, + "grad_norm": 0.18655996024608612, + "learning_rate": 0.00019153161255787518, + "loss": 0.9676, + "step": 1507 + }, + { + "epoch": 0.53, + "grad_norm": 0.1754051148891449, + "learning_rate": 0.00019152045769311232, + "loss": 0.9886, + "step": 1508 + }, + { + "epoch": 0.53, + "grad_norm": 0.1764054149389267, + "learning_rate": 0.00019150929581165818, + "loss": 0.983, + "step": 1509 + }, + { + "epoch": 0.53, + "grad_norm": 0.17365694046020508, + "learning_rate": 0.00019149812691436854, + "loss": 0.9749, + "step": 1510 + }, + { + "epoch": 0.53, + "grad_norm": 0.19486962258815765, + "learning_rate": 0.0001914869510020997, + "loss": 1.0016, + "step": 1511 + }, + { + "epoch": 0.53, + "grad_norm": 0.17791058123111725, + "learning_rate": 0.00019147576807570854, + "loss": 1.0725, + "step": 1512 + }, + { + "epoch": 0.53, + "grad_norm": 0.18877653777599335, + "learning_rate": 0.00019146457813605232, + "loss": 1.0221, + "step": 1513 + }, + { + "epoch": 0.53, + "grad_norm": 0.18828843533992767, + "learning_rate": 0.00019145338118398906, + "loss": 1.0174, + "step": 1514 + }, + { + "epoch": 0.53, + "grad_norm": 0.18070997297763824, + "learning_rate": 0.00019144217722037713, + "loss": 1.026, + "step": 1515 + }, + { + "epoch": 0.53, + "grad_norm": 0.18031719326972961, + "learning_rate": 0.00019143096624607558, + "loss": 1.0322, + "step": 1516 + }, + { + "epoch": 0.53, + "grad_norm": 0.18009379506111145, + "learning_rate": 0.00019141974826194388, + "loss": 1.0786, + "step": 1517 + }, + { + "epoch": 0.53, + "grad_norm": 0.1836472451686859, + "learning_rate": 0.00019140852326884216, + "loss": 1.0363, + "step": 1518 + }, + { + "epoch": 0.53, + "grad_norm": 0.1833399087190628, + "learning_rate": 0.00019139729126763092, + "loss": 0.9989, + "step": 1519 + }, + { + "epoch": 0.54, + "grad_norm": 0.17795605957508087, + "learning_rate": 0.00019138605225917137, + "loss": 1.0266, + "step": 1520 + }, + { + "epoch": 0.54, + "grad_norm": 0.18254658579826355, + "learning_rate": 0.00019137480624432517, + "loss": 1.0618, + "step": 1521 + }, + { + "epoch": 0.54, + "grad_norm": 0.18196122348308563, + "learning_rate": 0.0001913635532239545, + "loss": 0.9893, + "step": 1522 + }, + { + "epoch": 0.54, + "grad_norm": 0.18685205280780792, + "learning_rate": 0.0001913522931989221, + "loss": 1.0147, + "step": 1523 + }, + { + "epoch": 0.54, + "grad_norm": 0.18355733156204224, + "learning_rate": 0.00019134102617009133, + "loss": 0.9925, + "step": 1524 + }, + { + "epoch": 0.54, + "grad_norm": 0.17968907952308655, + "learning_rate": 0.00019132975213832594, + "loss": 1.0555, + "step": 1525 + }, + { + "epoch": 0.54, + "grad_norm": 0.17847761511802673, + "learning_rate": 0.0001913184711044903, + "loss": 1.0557, + "step": 1526 + }, + { + "epoch": 0.54, + "grad_norm": 0.17835336923599243, + "learning_rate": 0.00019130718306944932, + "loss": 0.9833, + "step": 1527 + }, + { + "epoch": 0.54, + "grad_norm": 0.1996411383152008, + "learning_rate": 0.00019129588803406842, + "loss": 1.0438, + "step": 1528 + }, + { + "epoch": 0.54, + "grad_norm": 0.17643500864505768, + "learning_rate": 0.00019128458599921357, + "loss": 1.0064, + "step": 1529 + }, + { + "epoch": 0.54, + "grad_norm": 0.18694715201854706, + "learning_rate": 0.00019127327696575125, + "loss": 1.001, + "step": 1530 + }, + { + "epoch": 0.54, + "grad_norm": 0.18230758607387543, + "learning_rate": 0.0001912619609345486, + "loss": 1.062, + "step": 1531 + }, + { + "epoch": 0.54, + "grad_norm": 0.17499038577079773, + "learning_rate": 0.00019125063790647305, + "loss": 0.9381, + "step": 1532 + }, + { + "epoch": 0.54, + "grad_norm": 0.17816944420337677, + "learning_rate": 0.00019123930788239281, + "loss": 1.0399, + "step": 1533 + }, + { + "epoch": 0.54, + "grad_norm": 0.19370074570178986, + "learning_rate": 0.00019122797086317653, + "loss": 0.9949, + "step": 1534 + }, + { + "epoch": 0.54, + "grad_norm": 0.1971571147441864, + "learning_rate": 0.00019121662684969335, + "loss": 0.9631, + "step": 1535 + }, + { + "epoch": 0.54, + "grad_norm": 0.18476071953773499, + "learning_rate": 0.00019120527584281304, + "loss": 1.0312, + "step": 1536 + }, + { + "epoch": 0.54, + "grad_norm": 0.18056322634220123, + "learning_rate": 0.0001911939178434058, + "loss": 0.9848, + "step": 1537 + }, + { + "epoch": 0.54, + "grad_norm": 0.19656671583652496, + "learning_rate": 0.0001911825528523425, + "loss": 1.0728, + "step": 1538 + }, + { + "epoch": 0.54, + "grad_norm": 0.19638994336128235, + "learning_rate": 0.00019117118087049438, + "loss": 0.9967, + "step": 1539 + }, + { + "epoch": 0.54, + "grad_norm": 0.19183266162872314, + "learning_rate": 0.00019115980189873341, + "loss": 0.9916, + "step": 1540 + }, + { + "epoch": 0.54, + "grad_norm": 0.1767280101776123, + "learning_rate": 0.00019114841593793194, + "loss": 1.0588, + "step": 1541 + }, + { + "epoch": 0.54, + "grad_norm": 0.1942903995513916, + "learning_rate": 0.0001911370229889629, + "loss": 0.943, + "step": 1542 + }, + { + "epoch": 0.54, + "grad_norm": 0.1888478547334671, + "learning_rate": 0.0001911256230526998, + "loss": 1.0352, + "step": 1543 + }, + { + "epoch": 0.54, + "grad_norm": 0.18362468481063843, + "learning_rate": 0.00019111421613001662, + "loss": 0.9774, + "step": 1544 + }, + { + "epoch": 0.54, + "grad_norm": 0.17923595011234283, + "learning_rate": 0.0001911028022217879, + "loss": 0.9808, + "step": 1545 + }, + { + "epoch": 0.54, + "grad_norm": 0.19007225334644318, + "learning_rate": 0.00019109138132888872, + "loss": 1.002, + "step": 1546 + }, + { + "epoch": 0.54, + "grad_norm": 0.17761003971099854, + "learning_rate": 0.00019107995345219475, + "loss": 0.9513, + "step": 1547 + }, + { + "epoch": 0.55, + "grad_norm": 0.18594999611377716, + "learning_rate": 0.0001910685185925821, + "loss": 1.0227, + "step": 1548 + }, + { + "epoch": 0.55, + "grad_norm": 0.1720389872789383, + "learning_rate": 0.00019105707675092743, + "loss": 0.9558, + "step": 1549 + }, + { + "epoch": 0.55, + "grad_norm": 0.18154871463775635, + "learning_rate": 0.000191045627928108, + "loss": 0.995, + "step": 1550 + }, + { + "epoch": 0.55, + "grad_norm": 0.19383765757083893, + "learning_rate": 0.00019103417212500162, + "loss": 1.0508, + "step": 1551 + }, + { + "epoch": 0.55, + "grad_norm": 0.18130627274513245, + "learning_rate": 0.00019102270934248647, + "loss": 1.0096, + "step": 1552 + }, + { + "epoch": 0.55, + "grad_norm": 0.18290060758590698, + "learning_rate": 0.00019101123958144146, + "loss": 1.0364, + "step": 1553 + }, + { + "epoch": 0.55, + "grad_norm": 0.19369938969612122, + "learning_rate": 0.0001909997628427459, + "loss": 0.9589, + "step": 1554 + }, + { + "epoch": 0.55, + "grad_norm": 0.19195522367954254, + "learning_rate": 0.00019098827912727976, + "loss": 1.0424, + "step": 1555 + }, + { + "epoch": 0.55, + "grad_norm": 0.18206413090229034, + "learning_rate": 0.00019097678843592344, + "loss": 0.9539, + "step": 1556 + }, + { + "epoch": 0.55, + "grad_norm": 0.1879245638847351, + "learning_rate": 0.00019096529076955787, + "loss": 1.0335, + "step": 1557 + }, + { + "epoch": 0.55, + "grad_norm": 0.1857089251279831, + "learning_rate": 0.0001909537861290646, + "loss": 1.0136, + "step": 1558 + }, + { + "epoch": 0.55, + "grad_norm": 0.20278511941432953, + "learning_rate": 0.00019094227451532563, + "loss": 1.002, + "step": 1559 + }, + { + "epoch": 0.55, + "grad_norm": 0.197901651263237, + "learning_rate": 0.00019093075592922358, + "loss": 1.0214, + "step": 1560 + }, + { + "epoch": 0.55, + "grad_norm": 0.18838657438755035, + "learning_rate": 0.00019091923037164156, + "loss": 1.0061, + "step": 1561 + }, + { + "epoch": 0.55, + "grad_norm": 0.18443070352077484, + "learning_rate": 0.00019090769784346314, + "loss": 1.0103, + "step": 1562 + }, + { + "epoch": 0.55, + "grad_norm": 0.18846985697746277, + "learning_rate": 0.00019089615834557252, + "loss": 0.9854, + "step": 1563 + }, + { + "epoch": 0.55, + "grad_norm": 0.1843843013048172, + "learning_rate": 0.0001908846118788545, + "loss": 1.0166, + "step": 1564 + }, + { + "epoch": 0.55, + "grad_norm": 0.18994472920894623, + "learning_rate": 0.0001908730584441942, + "loss": 1.0502, + "step": 1565 + }, + { + "epoch": 0.55, + "grad_norm": 0.1749737709760666, + "learning_rate": 0.00019086149804247747, + "loss": 0.9723, + "step": 1566 + }, + { + "epoch": 0.55, + "grad_norm": 0.1871078461408615, + "learning_rate": 0.0001908499306745906, + "loss": 0.9993, + "step": 1567 + }, + { + "epoch": 0.55, + "grad_norm": 0.18963484466075897, + "learning_rate": 0.00019083835634142043, + "loss": 1.0489, + "step": 1568 + }, + { + "epoch": 0.55, + "grad_norm": 0.1813851296901703, + "learning_rate": 0.00019082677504385438, + "loss": 0.9793, + "step": 1569 + }, + { + "epoch": 0.55, + "grad_norm": 0.181089848279953, + "learning_rate": 0.00019081518678278035, + "loss": 0.9995, + "step": 1570 + }, + { + "epoch": 0.55, + "grad_norm": 0.18877080082893372, + "learning_rate": 0.00019080359155908676, + "loss": 1.0485, + "step": 1571 + }, + { + "epoch": 0.55, + "grad_norm": 0.17573611438274384, + "learning_rate": 0.0001907919893736626, + "loss": 0.9614, + "step": 1572 + }, + { + "epoch": 0.55, + "grad_norm": 0.18343812227249146, + "learning_rate": 0.0001907803802273974, + "loss": 1.0384, + "step": 1573 + }, + { + "epoch": 0.55, + "grad_norm": 0.16975995898246765, + "learning_rate": 0.00019076876412118122, + "loss": 0.9212, + "step": 1574 + }, + { + "epoch": 0.55, + "grad_norm": 0.18836268782615662, + "learning_rate": 0.00019075714105590464, + "loss": 0.9728, + "step": 1575 + }, + { + "epoch": 0.55, + "grad_norm": 0.18021149933338165, + "learning_rate": 0.00019074551103245876, + "loss": 1.0136, + "step": 1576 + }, + { + "epoch": 0.56, + "grad_norm": 0.18562135100364685, + "learning_rate": 0.0001907338740517352, + "loss": 1.0479, + "step": 1577 + }, + { + "epoch": 0.56, + "grad_norm": 0.1791529655456543, + "learning_rate": 0.00019072223011462627, + "loss": 1.0383, + "step": 1578 + }, + { + "epoch": 0.56, + "grad_norm": 0.1899656504392624, + "learning_rate": 0.00019071057922202456, + "loss": 1.0454, + "step": 1579 + }, + { + "epoch": 0.56, + "grad_norm": 0.17192921042442322, + "learning_rate": 0.00019069892137482333, + "loss": 0.9879, + "step": 1580 + }, + { + "epoch": 0.56, + "grad_norm": 0.17805634438991547, + "learning_rate": 0.00019068725657391645, + "loss": 1.0285, + "step": 1581 + }, + { + "epoch": 0.56, + "grad_norm": 0.17237581312656403, + "learning_rate": 0.00019067558482019811, + "loss": 0.9978, + "step": 1582 + }, + { + "epoch": 0.56, + "grad_norm": 0.17143943905830383, + "learning_rate": 0.00019066390611456328, + "loss": 0.9697, + "step": 1583 + }, + { + "epoch": 0.56, + "grad_norm": 0.16979925334453583, + "learning_rate": 0.0001906522204579073, + "loss": 0.9267, + "step": 1584 + }, + { + "epoch": 0.56, + "grad_norm": 0.18201367557048798, + "learning_rate": 0.00019064052785112607, + "loss": 0.9633, + "step": 1585 + }, + { + "epoch": 0.56, + "grad_norm": 0.17835623025894165, + "learning_rate": 0.00019062882829511607, + "loss": 0.9812, + "step": 1586 + }, + { + "epoch": 0.56, + "grad_norm": 0.18148957192897797, + "learning_rate": 0.00019061712179077424, + "loss": 1.0285, + "step": 1587 + }, + { + "epoch": 0.56, + "grad_norm": 0.18641547858715057, + "learning_rate": 0.00019060540833899814, + "loss": 1.0153, + "step": 1588 + }, + { + "epoch": 0.56, + "grad_norm": 0.1883457750082016, + "learning_rate": 0.00019059368794068578, + "loss": 0.9653, + "step": 1589 + }, + { + "epoch": 0.56, + "grad_norm": 0.18426276743412018, + "learning_rate": 0.00019058196059673575, + "loss": 1.0388, + "step": 1590 + }, + { + "epoch": 0.56, + "grad_norm": 0.19204431772232056, + "learning_rate": 0.00019057022630804716, + "loss": 1.0635, + "step": 1591 + }, + { + "epoch": 0.56, + "grad_norm": 0.1714024692773819, + "learning_rate": 0.00019055848507551967, + "loss": 0.9776, + "step": 1592 + }, + { + "epoch": 0.56, + "grad_norm": 0.1830907016992569, + "learning_rate": 0.00019054673690005344, + "loss": 0.9496, + "step": 1593 + }, + { + "epoch": 0.56, + "grad_norm": 0.17566922307014465, + "learning_rate": 0.0001905349817825492, + "loss": 0.9616, + "step": 1594 + }, + { + "epoch": 0.56, + "grad_norm": 0.1792527139186859, + "learning_rate": 0.00019052321972390815, + "loss": 0.9606, + "step": 1595 + }, + { + "epoch": 0.56, + "grad_norm": 0.18835650384426117, + "learning_rate": 0.00019051145072503215, + "loss": 1.0187, + "step": 1596 + }, + { + "epoch": 0.56, + "grad_norm": 0.17348940670490265, + "learning_rate": 0.00019049967478682338, + "loss": 0.9624, + "step": 1597 + }, + { + "epoch": 0.56, + "grad_norm": 0.1860518902540207, + "learning_rate": 0.00019048789191018477, + "loss": 0.963, + "step": 1598 + }, + { + "epoch": 0.56, + "grad_norm": 0.1794481873512268, + "learning_rate": 0.0001904761020960197, + "loss": 0.9907, + "step": 1599 + }, + { + "epoch": 0.56, + "grad_norm": 0.18411587178707123, + "learning_rate": 0.00019046430534523198, + "loss": 1.0253, + "step": 1600 + }, + { + "epoch": 0.56, + "grad_norm": 0.1995529979467392, + "learning_rate": 0.00019045250165872614, + "loss": 1.0077, + "step": 1601 + }, + { + "epoch": 0.56, + "grad_norm": 0.17933034896850586, + "learning_rate": 0.00019044069103740707, + "loss": 0.9448, + "step": 1602 + }, + { + "epoch": 0.56, + "grad_norm": 0.17415685951709747, + "learning_rate": 0.00019042887348218033, + "loss": 0.9509, + "step": 1603 + }, + { + "epoch": 0.56, + "grad_norm": 0.17275942862033844, + "learning_rate": 0.00019041704899395192, + "loss": 0.975, + "step": 1604 + }, + { + "epoch": 0.57, + "grad_norm": 0.17786523699760437, + "learning_rate": 0.0001904052175736284, + "loss": 1.0138, + "step": 1605 + }, + { + "epoch": 0.57, + "grad_norm": 0.19810418784618378, + "learning_rate": 0.00019039337922211683, + "loss": 1.0133, + "step": 1606 + }, + { + "epoch": 0.57, + "grad_norm": 0.17100881040096283, + "learning_rate": 0.0001903815339403249, + "loss": 0.9999, + "step": 1607 + }, + { + "epoch": 0.57, + "grad_norm": 0.1892804652452469, + "learning_rate": 0.0001903696817291607, + "loss": 0.9823, + "step": 1608 + }, + { + "epoch": 0.57, + "grad_norm": 0.18206055462360382, + "learning_rate": 0.00019035782258953297, + "loss": 0.9637, + "step": 1609 + }, + { + "epoch": 0.57, + "grad_norm": 0.18716344237327576, + "learning_rate": 0.0001903459565223509, + "loss": 1.0761, + "step": 1610 + }, + { + "epoch": 0.57, + "grad_norm": 0.17864565551280975, + "learning_rate": 0.00019033408352852422, + "loss": 1.0491, + "step": 1611 + }, + { + "epoch": 0.57, + "grad_norm": 0.18046505749225616, + "learning_rate": 0.00019032220360896325, + "loss": 1.0324, + "step": 1612 + }, + { + "epoch": 0.57, + "grad_norm": 0.18205080926418304, + "learning_rate": 0.00019031031676457875, + "loss": 0.9451, + "step": 1613 + }, + { + "epoch": 0.57, + "grad_norm": 0.1753171980381012, + "learning_rate": 0.00019029842299628208, + "loss": 1.0274, + "step": 1614 + }, + { + "epoch": 0.57, + "grad_norm": 0.1871657818555832, + "learning_rate": 0.00019028652230498516, + "loss": 1.0019, + "step": 1615 + }, + { + "epoch": 0.57, + "grad_norm": 0.17937354743480682, + "learning_rate": 0.00019027461469160032, + "loss": 0.9694, + "step": 1616 + }, + { + "epoch": 0.57, + "grad_norm": 0.17640085518360138, + "learning_rate": 0.00019026270015704054, + "loss": 1.0143, + "step": 1617 + }, + { + "epoch": 0.57, + "grad_norm": 0.18968771398067474, + "learning_rate": 0.00019025077870221929, + "loss": 0.9798, + "step": 1618 + }, + { + "epoch": 0.57, + "grad_norm": 0.18246042728424072, + "learning_rate": 0.0001902388503280505, + "loss": 1.0376, + "step": 1619 + }, + { + "epoch": 0.57, + "grad_norm": 0.18239466845989227, + "learning_rate": 0.00019022691503544877, + "loss": 1.0622, + "step": 1620 + }, + { + "epoch": 0.57, + "grad_norm": 0.17998111248016357, + "learning_rate": 0.00019021497282532907, + "loss": 1.0192, + "step": 1621 + }, + { + "epoch": 0.57, + "grad_norm": 0.18229350447654724, + "learning_rate": 0.00019020302369860708, + "loss": 1.0453, + "step": 1622 + }, + { + "epoch": 0.57, + "grad_norm": 0.17646130919456482, + "learning_rate": 0.00019019106765619888, + "loss": 0.9841, + "step": 1623 + }, + { + "epoch": 0.57, + "grad_norm": 0.18252936005592346, + "learning_rate": 0.00019017910469902108, + "loss": 1.0199, + "step": 1624 + }, + { + "epoch": 0.57, + "grad_norm": 0.17558379471302032, + "learning_rate": 0.00019016713482799088, + "loss": 0.9245, + "step": 1625 + }, + { + "epoch": 0.57, + "grad_norm": 0.1946648508310318, + "learning_rate": 0.00019015515804402604, + "loss": 1.0411, + "step": 1626 + }, + { + "epoch": 0.57, + "grad_norm": 0.1894584596157074, + "learning_rate": 0.00019014317434804472, + "loss": 0.977, + "step": 1627 + }, + { + "epoch": 0.57, + "grad_norm": 0.1869843602180481, + "learning_rate": 0.0001901311837409657, + "loss": 1.0375, + "step": 1628 + }, + { + "epoch": 0.57, + "grad_norm": 0.1728430539369583, + "learning_rate": 0.0001901191862237083, + "loss": 0.9841, + "step": 1629 + }, + { + "epoch": 0.57, + "grad_norm": 0.18686573207378387, + "learning_rate": 0.00019010718179719236, + "loss": 0.9953, + "step": 1630 + }, + { + "epoch": 0.57, + "grad_norm": 0.19124893844127655, + "learning_rate": 0.00019009517046233816, + "loss": 1.0541, + "step": 1631 + }, + { + "epoch": 0.57, + "grad_norm": 0.1842876523733139, + "learning_rate": 0.00019008315222006667, + "loss": 0.9448, + "step": 1632 + }, + { + "epoch": 0.58, + "grad_norm": 0.17653100192546844, + "learning_rate": 0.00019007112707129927, + "loss": 1.0036, + "step": 1633 + }, + { + "epoch": 0.58, + "grad_norm": 0.17082680761814117, + "learning_rate": 0.0001900590950169579, + "loss": 0.9503, + "step": 1634 + }, + { + "epoch": 0.58, + "grad_norm": 0.17388126254081726, + "learning_rate": 0.00019004705605796504, + "loss": 0.9682, + "step": 1635 + }, + { + "epoch": 0.58, + "grad_norm": 0.19252508878707886, + "learning_rate": 0.00019003501019524368, + "loss": 0.9718, + "step": 1636 + }, + { + "epoch": 0.58, + "grad_norm": 0.18236425518989563, + "learning_rate": 0.00019002295742971738, + "loss": 1.0233, + "step": 1637 + }, + { + "epoch": 0.58, + "grad_norm": 0.18430474400520325, + "learning_rate": 0.0001900108977623102, + "loss": 1.0188, + "step": 1638 + }, + { + "epoch": 0.58, + "grad_norm": 0.1759885847568512, + "learning_rate": 0.00018999883119394668, + "loss": 0.9615, + "step": 1639 + }, + { + "epoch": 0.58, + "grad_norm": 0.1740919053554535, + "learning_rate": 0.000189986757725552, + "loss": 1.0351, + "step": 1640 + }, + { + "epoch": 0.58, + "grad_norm": 0.188923642039299, + "learning_rate": 0.0001899746773580518, + "loss": 1.004, + "step": 1641 + }, + { + "epoch": 0.58, + "grad_norm": 0.17621168494224548, + "learning_rate": 0.0001899625900923722, + "loss": 0.986, + "step": 1642 + }, + { + "epoch": 0.58, + "grad_norm": 0.1900838315486908, + "learning_rate": 0.00018995049592943997, + "loss": 1.0189, + "step": 1643 + }, + { + "epoch": 0.58, + "grad_norm": 0.18396282196044922, + "learning_rate": 0.00018993839487018232, + "loss": 0.8813, + "step": 1644 + }, + { + "epoch": 0.58, + "grad_norm": 0.18651731312274933, + "learning_rate": 0.00018992628691552702, + "loss": 1.0907, + "step": 1645 + }, + { + "epoch": 0.58, + "grad_norm": 0.18417122960090637, + "learning_rate": 0.00018991417206640237, + "loss": 1.0003, + "step": 1646 + }, + { + "epoch": 0.58, + "grad_norm": 0.19596096873283386, + "learning_rate": 0.00018990205032373716, + "loss": 1.0152, + "step": 1647 + }, + { + "epoch": 0.58, + "grad_norm": 0.17158856987953186, + "learning_rate": 0.0001898899216884608, + "loss": 0.9455, + "step": 1648 + }, + { + "epoch": 0.58, + "grad_norm": 0.17305323481559753, + "learning_rate": 0.00018987778616150308, + "loss": 0.9388, + "step": 1649 + }, + { + "epoch": 0.58, + "grad_norm": 0.1787583976984024, + "learning_rate": 0.0001898656437437945, + "loss": 1.0368, + "step": 1650 + }, + { + "epoch": 0.58, + "grad_norm": 0.18098847568035126, + "learning_rate": 0.00018985349443626594, + "loss": 1.0413, + "step": 1651 + }, + { + "epoch": 0.58, + "grad_norm": 0.18061774969100952, + "learning_rate": 0.00018984133823984886, + "loss": 1.0476, + "step": 1652 + }, + { + "epoch": 0.58, + "grad_norm": 0.18374860286712646, + "learning_rate": 0.0001898291751554753, + "loss": 1.0636, + "step": 1653 + }, + { + "epoch": 0.58, + "grad_norm": 0.17729640007019043, + "learning_rate": 0.00018981700518407775, + "loss": 0.9942, + "step": 1654 + }, + { + "epoch": 0.58, + "grad_norm": 0.18511778116226196, + "learning_rate": 0.00018980482832658922, + "loss": 1.0035, + "step": 1655 + }, + { + "epoch": 0.58, + "grad_norm": 0.18169361352920532, + "learning_rate": 0.00018979264458394334, + "loss": 0.998, + "step": 1656 + }, + { + "epoch": 0.58, + "grad_norm": 0.18548671901226044, + "learning_rate": 0.00018978045395707418, + "loss": 1.0163, + "step": 1657 + }, + { + "epoch": 0.58, + "grad_norm": 0.1922914832830429, + "learning_rate": 0.00018976825644691637, + "loss": 1.065, + "step": 1658 + }, + { + "epoch": 0.58, + "grad_norm": 0.17772276699543, + "learning_rate": 0.00018975605205440513, + "loss": 0.9657, + "step": 1659 + }, + { + "epoch": 0.58, + "grad_norm": 0.17630743980407715, + "learning_rate": 0.00018974384078047606, + "loss": 0.9632, + "step": 1660 + }, + { + "epoch": 0.58, + "grad_norm": 0.17994816601276398, + "learning_rate": 0.00018973162262606543, + "loss": 0.946, + "step": 1661 + }, + { + "epoch": 0.59, + "grad_norm": 0.18123731017112732, + "learning_rate": 0.00018971939759210998, + "loss": 1.0613, + "step": 1662 + }, + { + "epoch": 0.59, + "grad_norm": 0.18457649648189545, + "learning_rate": 0.0001897071656795469, + "loss": 0.984, + "step": 1663 + }, + { + "epoch": 0.59, + "grad_norm": 0.18137028813362122, + "learning_rate": 0.00018969492688931412, + "loss": 0.999, + "step": 1664 + }, + { + "epoch": 0.59, + "grad_norm": 0.18939583003520966, + "learning_rate": 0.00018968268122234987, + "loss": 0.9991, + "step": 1665 + }, + { + "epoch": 0.59, + "grad_norm": 0.18159011006355286, + "learning_rate": 0.000189670428679593, + "loss": 0.9348, + "step": 1666 + }, + { + "epoch": 0.59, + "grad_norm": 0.1750485897064209, + "learning_rate": 0.00018965816926198295, + "loss": 1.0172, + "step": 1667 + }, + { + "epoch": 0.59, + "grad_norm": 0.18323971331119537, + "learning_rate": 0.00018964590297045958, + "loss": 0.9855, + "step": 1668 + }, + { + "epoch": 0.59, + "grad_norm": 0.1904216706752777, + "learning_rate": 0.00018963362980596333, + "loss": 1.0082, + "step": 1669 + }, + { + "epoch": 0.59, + "grad_norm": 0.17032112181186676, + "learning_rate": 0.00018962134976943516, + "loss": 0.9158, + "step": 1670 + }, + { + "epoch": 0.59, + "grad_norm": 0.18121746182441711, + "learning_rate": 0.00018960906286181653, + "loss": 0.9998, + "step": 1671 + }, + { + "epoch": 0.59, + "grad_norm": 0.17808766663074493, + "learning_rate": 0.00018959676908404948, + "loss": 0.9863, + "step": 1672 + }, + { + "epoch": 0.59, + "grad_norm": 0.19097678363323212, + "learning_rate": 0.00018958446843707658, + "loss": 0.9903, + "step": 1673 + }, + { + "epoch": 0.59, + "grad_norm": 0.2249254286289215, + "learning_rate": 0.00018957216092184084, + "loss": 0.9, + "step": 1674 + }, + { + "epoch": 0.59, + "grad_norm": 0.17637252807617188, + "learning_rate": 0.0001895598465392859, + "loss": 1.0117, + "step": 1675 + }, + { + "epoch": 0.59, + "grad_norm": 0.17421402037143707, + "learning_rate": 0.00018954752529035584, + "loss": 0.9635, + "step": 1676 + }, + { + "epoch": 0.59, + "grad_norm": 0.17446309328079224, + "learning_rate": 0.0001895351971759953, + "loss": 0.985, + "step": 1677 + }, + { + "epoch": 0.59, + "grad_norm": 0.18266022205352783, + "learning_rate": 0.00018952286219714952, + "loss": 1.0179, + "step": 1678 + }, + { + "epoch": 0.59, + "grad_norm": 0.1776091754436493, + "learning_rate": 0.0001895105203547641, + "loss": 0.9669, + "step": 1679 + }, + { + "epoch": 0.59, + "grad_norm": 0.18488481640815735, + "learning_rate": 0.00018949817164978536, + "loss": 1.0574, + "step": 1680 + }, + { + "epoch": 0.59, + "grad_norm": 0.17778055369853973, + "learning_rate": 0.00018948581608315999, + "loss": 1.0059, + "step": 1681 + }, + { + "epoch": 0.59, + "grad_norm": 0.1818065345287323, + "learning_rate": 0.00018947345365583526, + "loss": 1.0158, + "step": 1682 + }, + { + "epoch": 0.59, + "grad_norm": 0.18250834941864014, + "learning_rate": 0.00018946108436875903, + "loss": 0.9978, + "step": 1683 + }, + { + "epoch": 0.59, + "grad_norm": 0.1843356043100357, + "learning_rate": 0.00018944870822287956, + "loss": 1.066, + "step": 1684 + }, + { + "epoch": 0.59, + "grad_norm": 0.18172258138656616, + "learning_rate": 0.00018943632521914577, + "loss": 0.9483, + "step": 1685 + }, + { + "epoch": 0.59, + "grad_norm": 0.17615202069282532, + "learning_rate": 0.000189423935358507, + "loss": 0.9955, + "step": 1686 + }, + { + "epoch": 0.59, + "grad_norm": 0.17779870331287384, + "learning_rate": 0.00018941153864191317, + "loss": 1.0287, + "step": 1687 + }, + { + "epoch": 0.59, + "grad_norm": 0.18770545721054077, + "learning_rate": 0.0001893991350703147, + "loss": 1.0095, + "step": 1688 + }, + { + "epoch": 0.59, + "grad_norm": 0.18199758231639862, + "learning_rate": 0.00018938672464466255, + "loss": 0.9966, + "step": 1689 + }, + { + "epoch": 0.6, + "grad_norm": 0.1823631227016449, + "learning_rate": 0.0001893743073659082, + "loss": 1.0017, + "step": 1690 + }, + { + "epoch": 0.6, + "grad_norm": 0.18197377026081085, + "learning_rate": 0.00018936188323500367, + "loss": 0.9994, + "step": 1691 + }, + { + "epoch": 0.6, + "grad_norm": 0.17887642979621887, + "learning_rate": 0.00018934945225290148, + "loss": 0.9761, + "step": 1692 + }, + { + "epoch": 0.6, + "grad_norm": 0.18398606777191162, + "learning_rate": 0.0001893370144205547, + "loss": 0.9836, + "step": 1693 + }, + { + "epoch": 0.6, + "grad_norm": 0.18904682993888855, + "learning_rate": 0.00018932456973891695, + "loss": 1.0479, + "step": 1694 + }, + { + "epoch": 0.6, + "grad_norm": 0.18656067550182343, + "learning_rate": 0.00018931211820894225, + "loss": 0.987, + "step": 1695 + }, + { + "epoch": 0.6, + "grad_norm": 0.17840254306793213, + "learning_rate": 0.00018929965983158532, + "loss": 0.9536, + "step": 1696 + }, + { + "epoch": 0.6, + "grad_norm": 0.187407448887825, + "learning_rate": 0.00018928719460780123, + "loss": 1.0308, + "step": 1697 + }, + { + "epoch": 0.6, + "grad_norm": 0.18530529737472534, + "learning_rate": 0.00018927472253854575, + "loss": 1.0369, + "step": 1698 + }, + { + "epoch": 0.6, + "grad_norm": 0.17825321853160858, + "learning_rate": 0.00018926224362477508, + "loss": 1.013, + "step": 1699 + }, + { + "epoch": 0.6, + "grad_norm": 0.17379194498062134, + "learning_rate": 0.0001892497578674459, + "loss": 1.0158, + "step": 1700 + }, + { + "epoch": 0.6, + "grad_norm": 0.18412184715270996, + "learning_rate": 0.00018923726526751548, + "loss": 1.0062, + "step": 1701 + }, + { + "epoch": 0.6, + "grad_norm": 0.18638308346271515, + "learning_rate": 0.00018922476582594163, + "loss": 1.0076, + "step": 1702 + }, + { + "epoch": 0.6, + "grad_norm": 0.17837877571582794, + "learning_rate": 0.00018921225954368268, + "loss": 0.8609, + "step": 1703 + }, + { + "epoch": 0.6, + "grad_norm": 0.184190034866333, + "learning_rate": 0.00018919974642169738, + "loss": 0.9835, + "step": 1704 + }, + { + "epoch": 0.6, + "grad_norm": 0.19461223483085632, + "learning_rate": 0.00018918722646094516, + "loss": 1.0066, + "step": 1705 + }, + { + "epoch": 0.6, + "grad_norm": 0.1890239417552948, + "learning_rate": 0.00018917469966238585, + "loss": 0.9629, + "step": 1706 + }, + { + "epoch": 0.6, + "grad_norm": 0.17515210807323456, + "learning_rate": 0.00018916216602697991, + "loss": 0.9093, + "step": 1707 + }, + { + "epoch": 0.6, + "grad_norm": 0.1932542473077774, + "learning_rate": 0.00018914962555568825, + "loss": 0.9947, + "step": 1708 + }, + { + "epoch": 0.6, + "grad_norm": 0.17919421195983887, + "learning_rate": 0.00018913707824947228, + "loss": 0.971, + "step": 1709 + }, + { + "epoch": 0.6, + "grad_norm": 0.18187792599201202, + "learning_rate": 0.000189124524109294, + "loss": 0.994, + "step": 1710 + }, + { + "epoch": 0.6, + "grad_norm": 0.1855144202709198, + "learning_rate": 0.00018911196313611597, + "loss": 0.9237, + "step": 1711 + }, + { + "epoch": 0.6, + "grad_norm": 0.19063417613506317, + "learning_rate": 0.00018909939533090113, + "loss": 1.0746, + "step": 1712 + }, + { + "epoch": 0.6, + "grad_norm": 0.1843891590833664, + "learning_rate": 0.00018908682069461303, + "loss": 1.0307, + "step": 1713 + }, + { + "epoch": 0.6, + "grad_norm": 0.19610640406608582, + "learning_rate": 0.00018907423922821584, + "loss": 1.0087, + "step": 1714 + }, + { + "epoch": 0.6, + "grad_norm": 0.19017747044563293, + "learning_rate": 0.00018906165093267405, + "loss": 0.9879, + "step": 1715 + }, + { + "epoch": 0.6, + "grad_norm": 0.1834624856710434, + "learning_rate": 0.00018904905580895284, + "loss": 0.996, + "step": 1716 + }, + { + "epoch": 0.6, + "grad_norm": 0.18889987468719482, + "learning_rate": 0.00018903645385801783, + "loss": 0.9959, + "step": 1717 + }, + { + "epoch": 0.6, + "grad_norm": 0.18375161290168762, + "learning_rate": 0.00018902384508083517, + "loss": 1.0468, + "step": 1718 + }, + { + "epoch": 0.61, + "grad_norm": 0.17865221202373505, + "learning_rate": 0.0001890112294783716, + "loss": 0.9869, + "step": 1719 + }, + { + "epoch": 0.61, + "grad_norm": 0.18713891506195068, + "learning_rate": 0.00018899860705159428, + "loss": 0.984, + "step": 1720 + }, + { + "epoch": 0.61, + "grad_norm": 0.18462030589580536, + "learning_rate": 0.00018898597780147097, + "loss": 0.9846, + "step": 1721 + }, + { + "epoch": 0.61, + "grad_norm": 0.19466228783130646, + "learning_rate": 0.00018897334172896993, + "loss": 0.9924, + "step": 1722 + }, + { + "epoch": 0.61, + "grad_norm": 0.17786429822444916, + "learning_rate": 0.00018896069883505992, + "loss": 1.0272, + "step": 1723 + }, + { + "epoch": 0.61, + "grad_norm": 0.1776842325925827, + "learning_rate": 0.00018894804912071033, + "loss": 0.9687, + "step": 1724 + }, + { + "epoch": 0.61, + "grad_norm": 0.17448489367961884, + "learning_rate": 0.00018893539258689089, + "loss": 0.9592, + "step": 1725 + }, + { + "epoch": 0.61, + "grad_norm": 0.1864984929561615, + "learning_rate": 0.00018892272923457198, + "loss": 1.0414, + "step": 1726 + }, + { + "epoch": 0.61, + "grad_norm": 0.17481493949890137, + "learning_rate": 0.0001889100590647245, + "loss": 1.0149, + "step": 1727 + }, + { + "epoch": 0.61, + "grad_norm": 0.17752544581890106, + "learning_rate": 0.0001888973820783198, + "loss": 0.9654, + "step": 1728 + }, + { + "epoch": 0.61, + "grad_norm": 0.17314109206199646, + "learning_rate": 0.00018888469827632985, + "loss": 1.0088, + "step": 1729 + }, + { + "epoch": 0.61, + "grad_norm": 0.19234046339988708, + "learning_rate": 0.00018887200765972705, + "loss": 0.9895, + "step": 1730 + }, + { + "epoch": 0.61, + "grad_norm": 0.1833478957414627, + "learning_rate": 0.0001888593102294844, + "loss": 1.0427, + "step": 1731 + }, + { + "epoch": 0.61, + "grad_norm": 0.18196554481983185, + "learning_rate": 0.00018884660598657535, + "loss": 1.0088, + "step": 1732 + }, + { + "epoch": 0.61, + "grad_norm": 0.18826749920845032, + "learning_rate": 0.00018883389493197394, + "loss": 1.0019, + "step": 1733 + }, + { + "epoch": 0.61, + "grad_norm": 0.18463777005672455, + "learning_rate": 0.00018882117706665467, + "loss": 0.9894, + "step": 1734 + }, + { + "epoch": 0.61, + "grad_norm": 0.17686514556407928, + "learning_rate": 0.00018880845239159264, + "loss": 0.9487, + "step": 1735 + }, + { + "epoch": 0.61, + "grad_norm": 0.18111956119537354, + "learning_rate": 0.0001887957209077634, + "loss": 1.0057, + "step": 1736 + }, + { + "epoch": 0.61, + "grad_norm": 0.17422161996364594, + "learning_rate": 0.00018878298261614304, + "loss": 1.0053, + "step": 1737 + }, + { + "epoch": 0.61, + "grad_norm": 0.1680193990468979, + "learning_rate": 0.00018877023751770816, + "loss": 0.9142, + "step": 1738 + }, + { + "epoch": 0.61, + "grad_norm": 0.1755419671535492, + "learning_rate": 0.00018875748561343595, + "loss": 1.0097, + "step": 1739 + }, + { + "epoch": 0.61, + "grad_norm": 0.1781400442123413, + "learning_rate": 0.00018874472690430408, + "loss": 0.9885, + "step": 1740 + }, + { + "epoch": 0.61, + "grad_norm": 0.18728172779083252, + "learning_rate": 0.00018873196139129067, + "loss": 1.0323, + "step": 1741 + }, + { + "epoch": 0.61, + "grad_norm": 0.185468390583992, + "learning_rate": 0.00018871918907537445, + "loss": 0.9909, + "step": 1742 + }, + { + "epoch": 0.61, + "grad_norm": 0.18841905891895294, + "learning_rate": 0.00018870640995753468, + "loss": 0.9763, + "step": 1743 + }, + { + "epoch": 0.61, + "grad_norm": 0.18700815737247467, + "learning_rate": 0.00018869362403875107, + "loss": 1.0644, + "step": 1744 + }, + { + "epoch": 0.61, + "grad_norm": 0.18525230884552002, + "learning_rate": 0.00018868083132000393, + "loss": 1.0113, + "step": 1745 + }, + { + "epoch": 0.61, + "grad_norm": 0.1759456992149353, + "learning_rate": 0.00018866803180227402, + "loss": 1.0158, + "step": 1746 + }, + { + "epoch": 0.62, + "grad_norm": 0.18815183639526367, + "learning_rate": 0.00018865522548654266, + "loss": 1.0415, + "step": 1747 + }, + { + "epoch": 0.62, + "grad_norm": 0.17770276963710785, + "learning_rate": 0.0001886424123737917, + "loss": 0.9943, + "step": 1748 + }, + { + "epoch": 0.62, + "grad_norm": 0.17623820900917053, + "learning_rate": 0.0001886295924650035, + "loss": 1.0151, + "step": 1749 + }, + { + "epoch": 0.62, + "grad_norm": 0.1780458390712738, + "learning_rate": 0.0001886167657611609, + "loss": 1.0295, + "step": 1750 + }, + { + "epoch": 0.62, + "grad_norm": 0.18125370144844055, + "learning_rate": 0.00018860393226324734, + "loss": 0.9904, + "step": 1751 + }, + { + "epoch": 0.62, + "grad_norm": 0.1796397864818573, + "learning_rate": 0.00018859109197224668, + "loss": 0.9625, + "step": 1752 + }, + { + "epoch": 0.62, + "grad_norm": 0.18634067475795746, + "learning_rate": 0.00018857824488914345, + "loss": 0.9659, + "step": 1753 + }, + { + "epoch": 0.62, + "grad_norm": 0.1882651001214981, + "learning_rate": 0.00018856539101492256, + "loss": 1.0561, + "step": 1754 + }, + { + "epoch": 0.62, + "grad_norm": 0.18537171185016632, + "learning_rate": 0.00018855253035056943, + "loss": 1.0249, + "step": 1755 + }, + { + "epoch": 0.62, + "grad_norm": 0.1841856837272644, + "learning_rate": 0.00018853966289707018, + "loss": 1.0308, + "step": 1756 + }, + { + "epoch": 0.62, + "grad_norm": 0.18117250502109528, + "learning_rate": 0.00018852678865541122, + "loss": 1.0331, + "step": 1757 + }, + { + "epoch": 0.62, + "grad_norm": 0.18570590019226074, + "learning_rate": 0.0001885139076265797, + "loss": 0.966, + "step": 1758 + }, + { + "epoch": 0.62, + "grad_norm": 0.1879170686006546, + "learning_rate": 0.0001885010198115631, + "loss": 0.9882, + "step": 1759 + }, + { + "epoch": 0.62, + "grad_norm": 0.18629556894302368, + "learning_rate": 0.00018848812521134956, + "loss": 0.9312, + "step": 1760 + }, + { + "epoch": 0.62, + "grad_norm": 0.17607717216014862, + "learning_rate": 0.00018847522382692763, + "loss": 0.9519, + "step": 1761 + }, + { + "epoch": 0.62, + "grad_norm": 0.1829354614019394, + "learning_rate": 0.00018846231565928644, + "loss": 0.9932, + "step": 1762 + }, + { + "epoch": 0.62, + "grad_norm": 0.18066377937793732, + "learning_rate": 0.0001884494007094157, + "loss": 0.9585, + "step": 1763 + }, + { + "epoch": 0.62, + "grad_norm": 0.17497728765010834, + "learning_rate": 0.0001884364789783055, + "loss": 0.9822, + "step": 1764 + }, + { + "epoch": 0.62, + "grad_norm": 0.18487730622291565, + "learning_rate": 0.00018842355046694656, + "loss": 1.0206, + "step": 1765 + }, + { + "epoch": 0.62, + "grad_norm": 0.1813206523656845, + "learning_rate": 0.00018841061517633006, + "loss": 0.9873, + "step": 1766 + }, + { + "epoch": 0.62, + "grad_norm": 0.1820863038301468, + "learning_rate": 0.00018839767310744777, + "loss": 1.002, + "step": 1767 + }, + { + "epoch": 0.62, + "grad_norm": 0.18336263298988342, + "learning_rate": 0.00018838472426129185, + "loss": 0.9764, + "step": 1768 + }, + { + "epoch": 0.62, + "grad_norm": 0.18435858190059662, + "learning_rate": 0.00018837176863885512, + "loss": 1.0391, + "step": 1769 + }, + { + "epoch": 0.62, + "grad_norm": 0.17950308322906494, + "learning_rate": 0.00018835880624113087, + "loss": 0.9695, + "step": 1770 + }, + { + "epoch": 0.62, + "grad_norm": 0.18525420129299164, + "learning_rate": 0.00018834583706911286, + "loss": 1.0169, + "step": 1771 + }, + { + "epoch": 0.62, + "grad_norm": 0.18522191047668457, + "learning_rate": 0.00018833286112379547, + "loss": 1.0383, + "step": 1772 + }, + { + "epoch": 0.62, + "grad_norm": 0.17976364493370056, + "learning_rate": 0.00018831987840617348, + "loss": 0.9452, + "step": 1773 + }, + { + "epoch": 0.62, + "grad_norm": 0.19244349002838135, + "learning_rate": 0.00018830688891724226, + "loss": 1.0472, + "step": 1774 + }, + { + "epoch": 0.63, + "grad_norm": 0.1925305873155594, + "learning_rate": 0.00018829389265799775, + "loss": 1.0674, + "step": 1775 + }, + { + "epoch": 0.63, + "grad_norm": 0.1825743019580841, + "learning_rate": 0.00018828088962943626, + "loss": 0.9906, + "step": 1776 + }, + { + "epoch": 0.63, + "grad_norm": 0.1989976316690445, + "learning_rate": 0.00018826787983255473, + "loss": 0.9963, + "step": 1777 + }, + { + "epoch": 0.63, + "grad_norm": 0.1910424530506134, + "learning_rate": 0.00018825486326835063, + "loss": 0.9898, + "step": 1778 + }, + { + "epoch": 0.63, + "grad_norm": 0.17530961334705353, + "learning_rate": 0.00018824183993782192, + "loss": 1.0211, + "step": 1779 + }, + { + "epoch": 0.63, + "grad_norm": 0.1942378282546997, + "learning_rate": 0.00018822880984196703, + "loss": 1.0093, + "step": 1780 + }, + { + "epoch": 0.63, + "grad_norm": 0.1794646829366684, + "learning_rate": 0.00018821577298178496, + "loss": 0.9389, + "step": 1781 + }, + { + "epoch": 0.63, + "grad_norm": 0.19524367153644562, + "learning_rate": 0.00018820272935827524, + "loss": 0.99, + "step": 1782 + }, + { + "epoch": 0.63, + "grad_norm": 0.1962813436985016, + "learning_rate": 0.00018818967897243785, + "loss": 0.9494, + "step": 1783 + }, + { + "epoch": 0.63, + "grad_norm": 0.17543120682239532, + "learning_rate": 0.00018817662182527342, + "loss": 0.96, + "step": 1784 + }, + { + "epoch": 0.63, + "grad_norm": 0.18655969202518463, + "learning_rate": 0.00018816355791778294, + "loss": 1.0292, + "step": 1785 + }, + { + "epoch": 0.63, + "grad_norm": 0.1929544359445572, + "learning_rate": 0.00018815048725096805, + "loss": 1.0016, + "step": 1786 + }, + { + "epoch": 0.63, + "grad_norm": 0.1946924775838852, + "learning_rate": 0.00018813740982583083, + "loss": 1.0131, + "step": 1787 + }, + { + "epoch": 0.63, + "grad_norm": 0.1820334494113922, + "learning_rate": 0.00018812432564337388, + "loss": 0.9705, + "step": 1788 + }, + { + "epoch": 0.63, + "grad_norm": 0.18293553590774536, + "learning_rate": 0.00018811123470460035, + "loss": 0.9449, + "step": 1789 + }, + { + "epoch": 0.63, + "grad_norm": 0.18539494276046753, + "learning_rate": 0.0001880981370105139, + "loss": 0.9884, + "step": 1790 + }, + { + "epoch": 0.63, + "grad_norm": 0.18742531538009644, + "learning_rate": 0.00018808503256211872, + "loss": 1.0232, + "step": 1791 + }, + { + "epoch": 0.63, + "grad_norm": 0.1922466903924942, + "learning_rate": 0.00018807192136041943, + "loss": 0.9864, + "step": 1792 + }, + { + "epoch": 0.63, + "grad_norm": 0.19350063800811768, + "learning_rate": 0.00018805880340642137, + "loss": 0.9306, + "step": 1793 + }, + { + "epoch": 0.63, + "grad_norm": 0.1808355748653412, + "learning_rate": 0.00018804567870113017, + "loss": 0.948, + "step": 1794 + }, + { + "epoch": 0.63, + "grad_norm": 0.1811530888080597, + "learning_rate": 0.0001880325472455521, + "loss": 1.0872, + "step": 1795 + }, + { + "epoch": 0.63, + "grad_norm": 0.18100488185882568, + "learning_rate": 0.0001880194090406939, + "loss": 1.0122, + "step": 1796 + }, + { + "epoch": 0.63, + "grad_norm": 0.20256204903125763, + "learning_rate": 0.00018800626408756288, + "loss": 1.0903, + "step": 1797 + }, + { + "epoch": 0.63, + "grad_norm": 0.19154663383960724, + "learning_rate": 0.00018799311238716683, + "loss": 1.0409, + "step": 1798 + }, + { + "epoch": 0.63, + "grad_norm": 0.1828100085258484, + "learning_rate": 0.00018797995394051408, + "loss": 0.9465, + "step": 1799 + }, + { + "epoch": 0.63, + "grad_norm": 0.1766197383403778, + "learning_rate": 0.00018796678874861343, + "loss": 0.9912, + "step": 1800 + }, + { + "epoch": 0.63, + "grad_norm": 0.19492526352405548, + "learning_rate": 0.00018795361681247426, + "loss": 0.9834, + "step": 1801 + }, + { + "epoch": 0.63, + "grad_norm": 0.2047867476940155, + "learning_rate": 0.00018794043813310637, + "loss": 1.0388, + "step": 1802 + }, + { + "epoch": 0.63, + "grad_norm": 0.17892782390117645, + "learning_rate": 0.00018792725271152025, + "loss": 0.9646, + "step": 1803 + }, + { + "epoch": 0.64, + "grad_norm": 0.17773431539535522, + "learning_rate": 0.00018791406054872672, + "loss": 0.9839, + "step": 1804 + }, + { + "epoch": 0.64, + "grad_norm": 0.1797155886888504, + "learning_rate": 0.00018790086164573722, + "loss": 1.0079, + "step": 1805 + }, + { + "epoch": 0.64, + "grad_norm": 0.18631033599376678, + "learning_rate": 0.0001878876560035637, + "loss": 1.0001, + "step": 1806 + }, + { + "epoch": 0.64, + "grad_norm": 0.18853716552257538, + "learning_rate": 0.00018787444362321857, + "loss": 0.9939, + "step": 1807 + }, + { + "epoch": 0.64, + "grad_norm": 0.1785057783126831, + "learning_rate": 0.00018786122450571485, + "loss": 0.9477, + "step": 1808 + }, + { + "epoch": 0.64, + "grad_norm": 0.18913787603378296, + "learning_rate": 0.00018784799865206596, + "loss": 0.9952, + "step": 1809 + }, + { + "epoch": 0.64, + "grad_norm": 0.18841691315174103, + "learning_rate": 0.00018783476606328594, + "loss": 1.0136, + "step": 1810 + }, + { + "epoch": 0.64, + "grad_norm": 0.17880366742610931, + "learning_rate": 0.00018782152674038933, + "loss": 1.0012, + "step": 1811 + }, + { + "epoch": 0.64, + "grad_norm": 0.18537718057632446, + "learning_rate": 0.00018780828068439112, + "loss": 1.0081, + "step": 1812 + }, + { + "epoch": 0.64, + "grad_norm": 0.18226812779903412, + "learning_rate": 0.00018779502789630686, + "loss": 0.9489, + "step": 1813 + }, + { + "epoch": 0.64, + "grad_norm": 0.18130582571029663, + "learning_rate": 0.00018778176837715263, + "loss": 0.9939, + "step": 1814 + }, + { + "epoch": 0.64, + "grad_norm": 0.1837967485189438, + "learning_rate": 0.000187768502127945, + "loss": 1.0102, + "step": 1815 + }, + { + "epoch": 0.64, + "grad_norm": 0.18316712975502014, + "learning_rate": 0.0001877552291497011, + "loss": 0.9439, + "step": 1816 + }, + { + "epoch": 0.64, + "grad_norm": 0.1856684684753418, + "learning_rate": 0.0001877419494434385, + "loss": 0.9549, + "step": 1817 + }, + { + "epoch": 0.64, + "grad_norm": 0.18236926198005676, + "learning_rate": 0.00018772866301017534, + "loss": 1.0313, + "step": 1818 + }, + { + "epoch": 0.64, + "grad_norm": 0.18475475907325745, + "learning_rate": 0.00018771536985093026, + "loss": 1.026, + "step": 1819 + }, + { + "epoch": 0.64, + "grad_norm": 0.18892961740493774, + "learning_rate": 0.00018770206996672245, + "loss": 0.968, + "step": 1820 + }, + { + "epoch": 0.64, + "grad_norm": 0.18441183865070343, + "learning_rate": 0.0001876887633585716, + "loss": 1.0116, + "step": 1821 + }, + { + "epoch": 0.64, + "grad_norm": 0.18181204795837402, + "learning_rate": 0.00018767545002749782, + "loss": 1.0226, + "step": 1822 + }, + { + "epoch": 0.64, + "grad_norm": 0.1826629638671875, + "learning_rate": 0.00018766212997452185, + "loss": 0.9366, + "step": 1823 + }, + { + "epoch": 0.64, + "grad_norm": 0.1744999885559082, + "learning_rate": 0.00018764880320066497, + "loss": 0.9736, + "step": 1824 + }, + { + "epoch": 0.64, + "grad_norm": 0.18155504763126373, + "learning_rate": 0.00018763546970694886, + "loss": 0.9828, + "step": 1825 + }, + { + "epoch": 0.64, + "grad_norm": 0.18461792171001434, + "learning_rate": 0.00018762212949439578, + "loss": 1.0278, + "step": 1826 + }, + { + "epoch": 0.64, + "grad_norm": 0.18739303946495056, + "learning_rate": 0.00018760878256402852, + "loss": 0.9921, + "step": 1827 + }, + { + "epoch": 0.64, + "grad_norm": 0.1814883053302765, + "learning_rate": 0.0001875954289168703, + "loss": 1.0008, + "step": 1828 + }, + { + "epoch": 0.64, + "grad_norm": 0.17120951414108276, + "learning_rate": 0.000187582068553945, + "loss": 0.9497, + "step": 1829 + }, + { + "epoch": 0.64, + "grad_norm": 0.183866485953331, + "learning_rate": 0.0001875687014762769, + "loss": 1.0129, + "step": 1830 + }, + { + "epoch": 0.64, + "grad_norm": 0.186480313539505, + "learning_rate": 0.00018755532768489078, + "loss": 0.9857, + "step": 1831 + }, + { + "epoch": 0.65, + "grad_norm": 0.19442792236804962, + "learning_rate": 0.00018754194718081205, + "loss": 0.9795, + "step": 1832 + }, + { + "epoch": 0.65, + "grad_norm": 0.18308177590370178, + "learning_rate": 0.00018752855996506654, + "loss": 1.0739, + "step": 1833 + }, + { + "epoch": 0.65, + "grad_norm": 0.1863255500793457, + "learning_rate": 0.00018751516603868066, + "loss": 0.9829, + "step": 1834 + }, + { + "epoch": 0.65, + "grad_norm": 0.17856715619564056, + "learning_rate": 0.0001875017654026812, + "loss": 1.0195, + "step": 1835 + }, + { + "epoch": 0.65, + "grad_norm": 0.18626144528388977, + "learning_rate": 0.00018748835805809568, + "loss": 0.9666, + "step": 1836 + }, + { + "epoch": 0.65, + "grad_norm": 0.19257773458957672, + "learning_rate": 0.00018747494400595191, + "loss": 1.0068, + "step": 1837 + }, + { + "epoch": 0.65, + "grad_norm": 0.18437279760837555, + "learning_rate": 0.00018746152324727836, + "loss": 1.0515, + "step": 1838 + }, + { + "epoch": 0.65, + "grad_norm": 0.18242545425891876, + "learning_rate": 0.00018744809578310397, + "loss": 0.9792, + "step": 1839 + }, + { + "epoch": 0.65, + "grad_norm": 0.1814119666814804, + "learning_rate": 0.00018743466161445823, + "loss": 0.9656, + "step": 1840 + }, + { + "epoch": 0.65, + "grad_norm": 0.18533895909786224, + "learning_rate": 0.00018742122074237106, + "loss": 0.9934, + "step": 1841 + }, + { + "epoch": 0.65, + "grad_norm": 0.18429550528526306, + "learning_rate": 0.00018740777316787296, + "loss": 1.0336, + "step": 1842 + }, + { + "epoch": 0.65, + "grad_norm": 0.19191229343414307, + "learning_rate": 0.00018739431889199497, + "loss": 1.0694, + "step": 1843 + }, + { + "epoch": 0.65, + "grad_norm": 0.18915875256061554, + "learning_rate": 0.00018738085791576855, + "loss": 1.0051, + "step": 1844 + }, + { + "epoch": 0.65, + "grad_norm": 0.1901199370622635, + "learning_rate": 0.0001873673902402257, + "loss": 1.0248, + "step": 1845 + }, + { + "epoch": 0.65, + "grad_norm": 0.18844999372959137, + "learning_rate": 0.00018735391586639904, + "loss": 1.06, + "step": 1846 + }, + { + "epoch": 0.65, + "grad_norm": 0.1894400715827942, + "learning_rate": 0.0001873404347953216, + "loss": 0.9807, + "step": 1847 + }, + { + "epoch": 0.65, + "grad_norm": 0.18685835599899292, + "learning_rate": 0.00018732694702802694, + "loss": 0.9532, + "step": 1848 + }, + { + "epoch": 0.65, + "grad_norm": 0.176396906375885, + "learning_rate": 0.0001873134525655491, + "loss": 1.0345, + "step": 1849 + }, + { + "epoch": 0.65, + "grad_norm": 0.18638281524181366, + "learning_rate": 0.00018729995140892274, + "loss": 1.0103, + "step": 1850 + }, + { + "epoch": 0.65, + "grad_norm": 0.17325080931186676, + "learning_rate": 0.00018728644355918292, + "loss": 1.0141, + "step": 1851 + }, + { + "epoch": 0.65, + "grad_norm": 0.1763128787279129, + "learning_rate": 0.00018727292901736528, + "loss": 0.9846, + "step": 1852 + }, + { + "epoch": 0.65, + "grad_norm": 0.18143166601657867, + "learning_rate": 0.00018725940778450598, + "loss": 0.9628, + "step": 1853 + }, + { + "epoch": 0.65, + "grad_norm": 0.18402259051799774, + "learning_rate": 0.00018724587986164158, + "loss": 0.9959, + "step": 1854 + }, + { + "epoch": 0.65, + "grad_norm": 0.19441376626491547, + "learning_rate": 0.0001872323452498093, + "loss": 0.9977, + "step": 1855 + }, + { + "epoch": 0.65, + "grad_norm": 0.18418952822685242, + "learning_rate": 0.00018721880395004682, + "loss": 0.849, + "step": 1856 + }, + { + "epoch": 0.65, + "grad_norm": 0.1789008378982544, + "learning_rate": 0.00018720525596339232, + "loss": 0.9796, + "step": 1857 + }, + { + "epoch": 0.65, + "grad_norm": 0.18109720945358276, + "learning_rate": 0.00018719170129088449, + "loss": 0.996, + "step": 1858 + }, + { + "epoch": 0.65, + "grad_norm": 0.18615257740020752, + "learning_rate": 0.0001871781399335625, + "loss": 1.0433, + "step": 1859 + }, + { + "epoch": 0.65, + "grad_norm": 0.19213053584098816, + "learning_rate": 0.00018716457189246614, + "loss": 1.0045, + "step": 1860 + }, + { + "epoch": 0.66, + "grad_norm": 0.18630923330783844, + "learning_rate": 0.0001871509971686356, + "loss": 0.9688, + "step": 1861 + }, + { + "epoch": 0.66, + "grad_norm": 0.18649254739284515, + "learning_rate": 0.00018713741576311166, + "loss": 1.0137, + "step": 1862 + }, + { + "epoch": 0.66, + "grad_norm": 0.1861242651939392, + "learning_rate": 0.00018712382767693554, + "loss": 0.9949, + "step": 1863 + }, + { + "epoch": 0.66, + "grad_norm": 0.18726636469364166, + "learning_rate": 0.00018711023291114902, + "loss": 0.9067, + "step": 1864 + }, + { + "epoch": 0.66, + "grad_norm": 0.1886870563030243, + "learning_rate": 0.00018709663146679442, + "loss": 0.9772, + "step": 1865 + }, + { + "epoch": 0.66, + "grad_norm": 0.18354971706867218, + "learning_rate": 0.0001870830233449145, + "loss": 1.0024, + "step": 1866 + }, + { + "epoch": 0.66, + "grad_norm": 0.17389263212680817, + "learning_rate": 0.00018706940854655257, + "loss": 0.9816, + "step": 1867 + }, + { + "epoch": 0.66, + "grad_norm": 0.17908309400081635, + "learning_rate": 0.00018705578707275248, + "loss": 0.9976, + "step": 1868 + }, + { + "epoch": 0.66, + "grad_norm": 0.21369712054729462, + "learning_rate": 0.0001870421589245585, + "loss": 1.0394, + "step": 1869 + }, + { + "epoch": 0.66, + "grad_norm": 0.19233554601669312, + "learning_rate": 0.00018702852410301554, + "loss": 0.9666, + "step": 1870 + }, + { + "epoch": 0.66, + "grad_norm": 0.1837395280599594, + "learning_rate": 0.00018701488260916893, + "loss": 1.0495, + "step": 1871 + }, + { + "epoch": 0.66, + "grad_norm": 0.18571002781391144, + "learning_rate": 0.0001870012344440645, + "loss": 1.0363, + "step": 1872 + }, + { + "epoch": 0.66, + "grad_norm": 0.19713854789733887, + "learning_rate": 0.0001869875796087487, + "loss": 0.9658, + "step": 1873 + }, + { + "epoch": 0.66, + "grad_norm": 0.19223444163799286, + "learning_rate": 0.00018697391810426836, + "loss": 1.0167, + "step": 1874 + }, + { + "epoch": 0.66, + "grad_norm": 0.18585839867591858, + "learning_rate": 0.00018696024993167088, + "loss": 0.9921, + "step": 1875 + }, + { + "epoch": 0.66, + "grad_norm": 0.1860339492559433, + "learning_rate": 0.00018694657509200422, + "loss": 0.9184, + "step": 1876 + }, + { + "epoch": 0.66, + "grad_norm": 0.18812976777553558, + "learning_rate": 0.0001869328935863168, + "loss": 1.0175, + "step": 1877 + }, + { + "epoch": 0.66, + "grad_norm": 0.19325508177280426, + "learning_rate": 0.00018691920541565746, + "loss": 1.103, + "step": 1878 + }, + { + "epoch": 0.66, + "grad_norm": 0.18449616432189941, + "learning_rate": 0.00018690551058107575, + "loss": 1.0278, + "step": 1879 + }, + { + "epoch": 0.66, + "grad_norm": 0.1745263785123825, + "learning_rate": 0.0001868918090836216, + "loss": 0.9653, + "step": 1880 + }, + { + "epoch": 0.66, + "grad_norm": 0.18073584139347076, + "learning_rate": 0.00018687810092434543, + "loss": 1.0015, + "step": 1881 + }, + { + "epoch": 0.66, + "grad_norm": 0.19327598810195923, + "learning_rate": 0.0001868643861042983, + "loss": 1.016, + "step": 1882 + }, + { + "epoch": 0.66, + "grad_norm": 0.1936059147119522, + "learning_rate": 0.0001868506646245316, + "loss": 0.9642, + "step": 1883 + }, + { + "epoch": 0.66, + "grad_norm": 0.19047310948371887, + "learning_rate": 0.0001868369364860974, + "loss": 1.1297, + "step": 1884 + }, + { + "epoch": 0.66, + "grad_norm": 0.17914171516895294, + "learning_rate": 0.00018682320169004818, + "loss": 0.9462, + "step": 1885 + }, + { + "epoch": 0.66, + "grad_norm": 0.17842666804790497, + "learning_rate": 0.00018680946023743697, + "loss": 0.9738, + "step": 1886 + }, + { + "epoch": 0.66, + "grad_norm": 0.18439552187919617, + "learning_rate": 0.0001867957121293173, + "loss": 0.9926, + "step": 1887 + }, + { + "epoch": 0.66, + "grad_norm": 0.1877158284187317, + "learning_rate": 0.00018678195736674319, + "loss": 1.0793, + "step": 1888 + }, + { + "epoch": 0.67, + "grad_norm": 0.1864771544933319, + "learning_rate": 0.00018676819595076925, + "loss": 0.9975, + "step": 1889 + }, + { + "epoch": 0.67, + "grad_norm": 0.18416166305541992, + "learning_rate": 0.00018675442788245047, + "loss": 0.9312, + "step": 1890 + }, + { + "epoch": 0.67, + "grad_norm": 0.17662164568901062, + "learning_rate": 0.00018674065316284243, + "loss": 0.935, + "step": 1891 + }, + { + "epoch": 0.67, + "grad_norm": 0.17577767372131348, + "learning_rate": 0.00018672687179300125, + "loss": 0.9825, + "step": 1892 + }, + { + "epoch": 0.67, + "grad_norm": 0.17717039585113525, + "learning_rate": 0.0001867130837739835, + "loss": 0.9599, + "step": 1893 + }, + { + "epoch": 0.67, + "grad_norm": 0.17826318740844727, + "learning_rate": 0.00018669928910684626, + "loss": 1.055, + "step": 1894 + }, + { + "epoch": 0.67, + "grad_norm": 0.18381334841251373, + "learning_rate": 0.00018668548779264717, + "loss": 1.0138, + "step": 1895 + }, + { + "epoch": 0.67, + "grad_norm": 0.17650206387043, + "learning_rate": 0.00018667167983244434, + "loss": 1.0064, + "step": 1896 + }, + { + "epoch": 0.67, + "grad_norm": 0.17828193306922913, + "learning_rate": 0.00018665786522729637, + "loss": 0.9707, + "step": 1897 + }, + { + "epoch": 0.67, + "grad_norm": 0.18671469390392303, + "learning_rate": 0.00018664404397826246, + "loss": 0.9894, + "step": 1898 + }, + { + "epoch": 0.67, + "grad_norm": 0.17767448723316193, + "learning_rate": 0.00018663021608640224, + "loss": 0.967, + "step": 1899 + }, + { + "epoch": 0.67, + "grad_norm": 0.19075635075569153, + "learning_rate": 0.0001866163815527758, + "loss": 0.993, + "step": 1900 + }, + { + "epoch": 0.67, + "grad_norm": 0.2010372281074524, + "learning_rate": 0.00018660254037844388, + "loss": 0.959, + "step": 1901 + }, + { + "epoch": 0.67, + "grad_norm": 0.17780248820781708, + "learning_rate": 0.00018658869256446762, + "loss": 0.9479, + "step": 1902 + }, + { + "epoch": 0.67, + "grad_norm": 0.18399259448051453, + "learning_rate": 0.0001865748381119087, + "loss": 1.0196, + "step": 1903 + }, + { + "epoch": 0.67, + "grad_norm": 0.1945018470287323, + "learning_rate": 0.00018656097702182938, + "loss": 0.9681, + "step": 1904 + }, + { + "epoch": 0.67, + "grad_norm": 0.1856342852115631, + "learning_rate": 0.00018654710929529226, + "loss": 1.0241, + "step": 1905 + }, + { + "epoch": 0.67, + "grad_norm": 0.1952160745859146, + "learning_rate": 0.00018653323493336062, + "loss": 1.033, + "step": 1906 + }, + { + "epoch": 0.67, + "grad_norm": 0.17957225441932678, + "learning_rate": 0.00018651935393709814, + "loss": 0.8973, + "step": 1907 + }, + { + "epoch": 0.67, + "grad_norm": 0.19565623998641968, + "learning_rate": 0.00018650546630756911, + "loss": 1.0472, + "step": 1908 + }, + { + "epoch": 0.67, + "grad_norm": 0.17848311364650726, + "learning_rate": 0.0001864915720458382, + "loss": 0.9535, + "step": 1909 + }, + { + "epoch": 0.67, + "grad_norm": 0.1798015534877777, + "learning_rate": 0.00018647767115297068, + "loss": 0.9587, + "step": 1910 + }, + { + "epoch": 0.67, + "grad_norm": 0.18396656215190887, + "learning_rate": 0.0001864637636300323, + "loss": 1.0296, + "step": 1911 + }, + { + "epoch": 0.67, + "grad_norm": 0.1760600209236145, + "learning_rate": 0.00018644984947808934, + "loss": 0.8995, + "step": 1912 + }, + { + "epoch": 0.67, + "grad_norm": 0.195132777094841, + "learning_rate": 0.00018643592869820857, + "loss": 1.0686, + "step": 1913 + }, + { + "epoch": 0.67, + "grad_norm": 0.19066643714904785, + "learning_rate": 0.00018642200129145723, + "loss": 0.967, + "step": 1914 + }, + { + "epoch": 0.67, + "grad_norm": 0.18508267402648926, + "learning_rate": 0.0001864080672589031, + "loss": 1.0049, + "step": 1915 + }, + { + "epoch": 0.67, + "grad_norm": 0.18527060747146606, + "learning_rate": 0.00018639412660161457, + "loss": 0.977, + "step": 1916 + }, + { + "epoch": 0.68, + "grad_norm": 0.18264782428741455, + "learning_rate": 0.00018638017932066036, + "loss": 0.9405, + "step": 1917 + }, + { + "epoch": 0.68, + "grad_norm": 0.17923100292682648, + "learning_rate": 0.00018636622541710982, + "loss": 1.01, + "step": 1918 + }, + { + "epoch": 0.68, + "grad_norm": 0.1840544193983078, + "learning_rate": 0.00018635226489203271, + "loss": 1.0268, + "step": 1919 + }, + { + "epoch": 0.68, + "grad_norm": 0.1948777288198471, + "learning_rate": 0.0001863382977464994, + "loss": 0.9392, + "step": 1920 + }, + { + "epoch": 0.68, + "grad_norm": 0.18678011000156403, + "learning_rate": 0.00018632432398158074, + "loss": 0.9766, + "step": 1921 + }, + { + "epoch": 0.68, + "grad_norm": 0.17854583263397217, + "learning_rate": 0.000186310343598348, + "loss": 0.9723, + "step": 1922 + }, + { + "epoch": 0.68, + "grad_norm": 0.18715204298496246, + "learning_rate": 0.00018629635659787316, + "loss": 0.9783, + "step": 1923 + }, + { + "epoch": 0.68, + "grad_norm": 0.18582193553447723, + "learning_rate": 0.00018628236298122842, + "loss": 0.9439, + "step": 1924 + }, + { + "epoch": 0.68, + "grad_norm": 0.18433134257793427, + "learning_rate": 0.00018626836274948674, + "loss": 0.9874, + "step": 1925 + }, + { + "epoch": 0.68, + "grad_norm": 0.18748904764652252, + "learning_rate": 0.00018625435590372143, + "loss": 1.0405, + "step": 1926 + }, + { + "epoch": 0.68, + "grad_norm": 0.17972299456596375, + "learning_rate": 0.00018624034244500647, + "loss": 0.9845, + "step": 1927 + }, + { + "epoch": 0.68, + "grad_norm": 0.18048334121704102, + "learning_rate": 0.00018622632237441612, + "loss": 0.9354, + "step": 1928 + }, + { + "epoch": 0.68, + "grad_norm": 0.1844087690114975, + "learning_rate": 0.00018621229569302532, + "loss": 0.9674, + "step": 1929 + }, + { + "epoch": 0.68, + "grad_norm": 0.18459495902061462, + "learning_rate": 0.0001861982624019095, + "loss": 1.0473, + "step": 1930 + }, + { + "epoch": 0.68, + "grad_norm": 0.19419121742248535, + "learning_rate": 0.00018618422250214452, + "loss": 1.0508, + "step": 1931 + }, + { + "epoch": 0.68, + "grad_norm": 0.17196646332740784, + "learning_rate": 0.00018617017599480682, + "loss": 0.9615, + "step": 1932 + }, + { + "epoch": 0.68, + "grad_norm": 0.18627747893333435, + "learning_rate": 0.0001861561228809733, + "loss": 1.0027, + "step": 1933 + }, + { + "epoch": 0.68, + "grad_norm": 0.19207796454429626, + "learning_rate": 0.0001861420631617214, + "loss": 1.0159, + "step": 1934 + }, + { + "epoch": 0.68, + "grad_norm": 0.18138843774795532, + "learning_rate": 0.00018612799683812904, + "loss": 0.9165, + "step": 1935 + }, + { + "epoch": 0.68, + "grad_norm": 0.18867118656635284, + "learning_rate": 0.00018611392391127467, + "loss": 0.9952, + "step": 1936 + }, + { + "epoch": 0.68, + "grad_norm": 0.18637476861476898, + "learning_rate": 0.0001860998443822372, + "loss": 0.9637, + "step": 1937 + }, + { + "epoch": 0.68, + "grad_norm": 0.19077537953853607, + "learning_rate": 0.0001860857582520961, + "loss": 1.0254, + "step": 1938 + }, + { + "epoch": 0.68, + "grad_norm": 0.1727682203054428, + "learning_rate": 0.00018607166552193135, + "loss": 0.9634, + "step": 1939 + }, + { + "epoch": 0.68, + "grad_norm": 0.186919167637825, + "learning_rate": 0.00018605756619282336, + "loss": 1.0402, + "step": 1940 + }, + { + "epoch": 0.68, + "grad_norm": 0.18637052178382874, + "learning_rate": 0.00018604346026585312, + "loss": 1.0357, + "step": 1941 + }, + { + "epoch": 0.68, + "grad_norm": 0.18484027683734894, + "learning_rate": 0.00018602934774210214, + "loss": 0.9575, + "step": 1942 + }, + { + "epoch": 0.68, + "grad_norm": 0.1901281327009201, + "learning_rate": 0.00018601522862265237, + "loss": 0.9959, + "step": 1943 + }, + { + "epoch": 0.68, + "grad_norm": 0.1861106902360916, + "learning_rate": 0.00018600110290858629, + "loss": 1.0286, + "step": 1944 + }, + { + "epoch": 0.68, + "grad_norm": 0.19661380350589752, + "learning_rate": 0.00018598697060098686, + "loss": 1.02, + "step": 1945 + }, + { + "epoch": 0.69, + "grad_norm": 0.1923096925020218, + "learning_rate": 0.00018597283170093763, + "loss": 1.0177, + "step": 1946 + }, + { + "epoch": 0.69, + "grad_norm": 0.19194743037223816, + "learning_rate": 0.0001859586862095226, + "loss": 1.0526, + "step": 1947 + }, + { + "epoch": 0.69, + "grad_norm": 0.18508084118366241, + "learning_rate": 0.00018594453412782623, + "loss": 1.0435, + "step": 1948 + }, + { + "epoch": 0.69, + "grad_norm": 0.18176361918449402, + "learning_rate": 0.00018593037545693356, + "loss": 1.0053, + "step": 1949 + }, + { + "epoch": 0.69, + "grad_norm": 0.1897421032190323, + "learning_rate": 0.00018591621019793013, + "loss": 1.0259, + "step": 1950 + }, + { + "epoch": 0.69, + "grad_norm": 0.18557140231132507, + "learning_rate": 0.00018590203835190192, + "loss": 0.9631, + "step": 1951 + }, + { + "epoch": 0.69, + "grad_norm": 0.19048331677913666, + "learning_rate": 0.0001858878599199355, + "loss": 1.0471, + "step": 1952 + }, + { + "epoch": 0.69, + "grad_norm": 0.17859672009944916, + "learning_rate": 0.00018587367490311785, + "loss": 1.0225, + "step": 1953 + }, + { + "epoch": 0.69, + "grad_norm": 0.18589475750923157, + "learning_rate": 0.00018585948330253652, + "loss": 0.9908, + "step": 1954 + }, + { + "epoch": 0.69, + "grad_norm": 0.1961270272731781, + "learning_rate": 0.0001858452851192796, + "loss": 1.0266, + "step": 1955 + }, + { + "epoch": 0.69, + "grad_norm": 0.1803094893693924, + "learning_rate": 0.0001858310803544356, + "loss": 0.9862, + "step": 1956 + }, + { + "epoch": 0.69, + "grad_norm": 0.18154191970825195, + "learning_rate": 0.00018581686900909355, + "loss": 0.9889, + "step": 1957 + }, + { + "epoch": 0.69, + "grad_norm": 0.1789197474718094, + "learning_rate": 0.00018580265108434306, + "loss": 0.9266, + "step": 1958 + }, + { + "epoch": 0.69, + "grad_norm": 0.18821920454502106, + "learning_rate": 0.00018578842658127415, + "loss": 0.9923, + "step": 1959 + }, + { + "epoch": 0.69, + "grad_norm": 0.1863432079553604, + "learning_rate": 0.0001857741955009774, + "loss": 0.9733, + "step": 1960 + }, + { + "epoch": 0.69, + "grad_norm": 0.18911732733249664, + "learning_rate": 0.00018575995784454386, + "loss": 0.9425, + "step": 1961 + }, + { + "epoch": 0.69, + "grad_norm": 0.18338856101036072, + "learning_rate": 0.0001857457136130651, + "loss": 0.9391, + "step": 1962 + }, + { + "epoch": 0.69, + "grad_norm": 0.18629801273345947, + "learning_rate": 0.00018573146280763324, + "loss": 1.0108, + "step": 1963 + }, + { + "epoch": 0.69, + "grad_norm": 0.18729791045188904, + "learning_rate": 0.00018571720542934084, + "loss": 0.9739, + "step": 1964 + }, + { + "epoch": 0.69, + "grad_norm": 0.17371436953544617, + "learning_rate": 0.00018570294147928092, + "loss": 0.9305, + "step": 1965 + }, + { + "epoch": 0.69, + "grad_norm": 0.1936265081167221, + "learning_rate": 0.0001856886709585472, + "loss": 1.0033, + "step": 1966 + }, + { + "epoch": 0.69, + "grad_norm": 0.187389075756073, + "learning_rate": 0.00018567439386823367, + "loss": 0.9921, + "step": 1967 + }, + { + "epoch": 0.69, + "grad_norm": 0.19305671751499176, + "learning_rate": 0.00018566011020943496, + "loss": 0.9698, + "step": 1968 + }, + { + "epoch": 0.69, + "grad_norm": 0.18552327156066895, + "learning_rate": 0.00018564581998324614, + "loss": 1.0511, + "step": 1969 + }, + { + "epoch": 0.69, + "grad_norm": 0.19322232902050018, + "learning_rate": 0.00018563152319076286, + "loss": 0.9734, + "step": 1970 + }, + { + "epoch": 0.69, + "grad_norm": 0.18172651529312134, + "learning_rate": 0.0001856172198330812, + "loss": 0.9552, + "step": 1971 + }, + { + "epoch": 0.69, + "grad_norm": 0.1780477911233902, + "learning_rate": 0.00018560290991129777, + "loss": 0.9882, + "step": 1972 + }, + { + "epoch": 0.69, + "grad_norm": 0.18396319448947906, + "learning_rate": 0.00018558859342650969, + "loss": 0.9748, + "step": 1973 + }, + { + "epoch": 0.7, + "grad_norm": 0.18518681824207306, + "learning_rate": 0.00018557427037981458, + "loss": 0.9754, + "step": 1974 + }, + { + "epoch": 0.7, + "grad_norm": 0.17571696639060974, + "learning_rate": 0.00018555994077231055, + "loss": 0.9538, + "step": 1975 + }, + { + "epoch": 0.7, + "grad_norm": 0.1853148639202118, + "learning_rate": 0.00018554560460509623, + "loss": 0.9983, + "step": 1976 + }, + { + "epoch": 0.7, + "grad_norm": 0.18485432863235474, + "learning_rate": 0.00018553126187927072, + "loss": 0.9954, + "step": 1977 + }, + { + "epoch": 0.7, + "grad_norm": 0.19659732282161713, + "learning_rate": 0.00018551691259593368, + "loss": 1.0263, + "step": 1978 + }, + { + "epoch": 0.7, + "grad_norm": 0.17942313849925995, + "learning_rate": 0.00018550255675618528, + "loss": 0.9816, + "step": 1979 + }, + { + "epoch": 0.7, + "grad_norm": 0.19176405668258667, + "learning_rate": 0.00018548819436112603, + "loss": 0.9783, + "step": 1980 + }, + { + "epoch": 0.7, + "grad_norm": 0.20149610936641693, + "learning_rate": 0.00018547382541185718, + "loss": 0.9438, + "step": 1981 + }, + { + "epoch": 0.7, + "grad_norm": 0.18515346944332123, + "learning_rate": 0.0001854594499094803, + "loss": 1.0068, + "step": 1982 + }, + { + "epoch": 0.7, + "grad_norm": 0.19630250334739685, + "learning_rate": 0.00018544506785509758, + "loss": 0.992, + "step": 1983 + }, + { + "epoch": 0.7, + "grad_norm": 0.18869252502918243, + "learning_rate": 0.00018543067924981166, + "loss": 0.9607, + "step": 1984 + }, + { + "epoch": 0.7, + "grad_norm": 0.2046448290348053, + "learning_rate": 0.00018541628409472565, + "loss": 1.0241, + "step": 1985 + }, + { + "epoch": 0.7, + "grad_norm": 0.17943765223026276, + "learning_rate": 0.00018540188239094321, + "loss": 1.0121, + "step": 1986 + }, + { + "epoch": 0.7, + "grad_norm": 0.18648383021354675, + "learning_rate": 0.00018538747413956853, + "loss": 0.9549, + "step": 1987 + }, + { + "epoch": 0.7, + "grad_norm": 0.18941380083560944, + "learning_rate": 0.0001853730593417062, + "loss": 1.0078, + "step": 1988 + }, + { + "epoch": 0.7, + "grad_norm": 0.18444235622882843, + "learning_rate": 0.0001853586379984614, + "loss": 1.0314, + "step": 1989 + }, + { + "epoch": 0.7, + "grad_norm": 0.18902845680713654, + "learning_rate": 0.00018534421011093982, + "loss": 1.014, + "step": 1990 + }, + { + "epoch": 0.7, + "grad_norm": 0.17776677012443542, + "learning_rate": 0.00018532977568024757, + "loss": 0.9594, + "step": 1991 + }, + { + "epoch": 0.7, + "grad_norm": 0.1941346377134323, + "learning_rate": 0.00018531533470749132, + "loss": 1.0107, + "step": 1992 + }, + { + "epoch": 0.7, + "grad_norm": 0.19353926181793213, + "learning_rate": 0.00018530088719377825, + "loss": 1.0376, + "step": 1993 + }, + { + "epoch": 0.7, + "grad_norm": 0.18152490258216858, + "learning_rate": 0.000185286433140216, + "loss": 0.9123, + "step": 1994 + }, + { + "epoch": 0.7, + "grad_norm": 0.18493008613586426, + "learning_rate": 0.0001852719725479127, + "loss": 0.999, + "step": 1995 + }, + { + "epoch": 0.7, + "grad_norm": 0.18543054163455963, + "learning_rate": 0.00018525750541797712, + "loss": 0.9819, + "step": 1996 + }, + { + "epoch": 0.7, + "grad_norm": 0.18111182749271393, + "learning_rate": 0.00018524303175151833, + "loss": 0.9611, + "step": 1997 + }, + { + "epoch": 0.7, + "grad_norm": 0.18060384690761566, + "learning_rate": 0.00018522855154964605, + "loss": 0.9838, + "step": 1998 + }, + { + "epoch": 0.7, + "grad_norm": 0.18530574440956116, + "learning_rate": 0.0001852140648134704, + "loss": 1.0593, + "step": 1999 + }, + { + "epoch": 0.7, + "grad_norm": 0.18004360795021057, + "learning_rate": 0.00018519957154410207, + "loss": 0.9767, + "step": 2000 + }, + { + "epoch": 0.7, + "grad_norm": 0.18427857756614685, + "learning_rate": 0.0001851850717426523, + "loss": 1.0015, + "step": 2001 + }, + { + "epoch": 0.7, + "grad_norm": 0.18990133702754974, + "learning_rate": 0.00018517056541023262, + "loss": 1.0502, + "step": 2002 + }, + { + "epoch": 0.71, + "grad_norm": 0.19328436255455017, + "learning_rate": 0.00018515605254795534, + "loss": 1.0043, + "step": 2003 + }, + { + "epoch": 0.71, + "grad_norm": 0.1968836933374405, + "learning_rate": 0.00018514153315693307, + "loss": 1.0026, + "step": 2004 + }, + { + "epoch": 0.71, + "grad_norm": 0.18505604565143585, + "learning_rate": 0.00018512700723827892, + "loss": 1.0009, + "step": 2005 + }, + { + "epoch": 0.71, + "grad_norm": 0.17385464906692505, + "learning_rate": 0.00018511247479310669, + "loss": 0.9709, + "step": 2006 + }, + { + "epoch": 0.71, + "grad_norm": 0.185564324259758, + "learning_rate": 0.00018509793582253048, + "loss": 0.9357, + "step": 2007 + }, + { + "epoch": 0.71, + "grad_norm": 0.186592698097229, + "learning_rate": 0.00018508339032766494, + "loss": 0.9939, + "step": 2008 + }, + { + "epoch": 0.71, + "grad_norm": 0.1900765746831894, + "learning_rate": 0.00018506883830962534, + "loss": 0.9358, + "step": 2009 + }, + { + "epoch": 0.71, + "grad_norm": 0.175924152135849, + "learning_rate": 0.00018505427976952724, + "loss": 0.9777, + "step": 2010 + }, + { + "epoch": 0.71, + "grad_norm": 0.17754964530467987, + "learning_rate": 0.00018503971470848688, + "loss": 0.9231, + "step": 2011 + }, + { + "epoch": 0.71, + "grad_norm": 0.18660208582878113, + "learning_rate": 0.00018502514312762096, + "loss": 0.9708, + "step": 2012 + }, + { + "epoch": 0.71, + "grad_norm": 0.18159836530685425, + "learning_rate": 0.00018501056502804656, + "loss": 0.9688, + "step": 2013 + }, + { + "epoch": 0.71, + "grad_norm": 0.1811097264289856, + "learning_rate": 0.00018499598041088144, + "loss": 0.977, + "step": 2014 + }, + { + "epoch": 0.71, + "grad_norm": 0.19245150685310364, + "learning_rate": 0.00018498138927724376, + "loss": 1.0467, + "step": 2015 + }, + { + "epoch": 0.71, + "grad_norm": 0.1905694156885147, + "learning_rate": 0.00018496679162825216, + "loss": 1.0223, + "step": 2016 + }, + { + "epoch": 0.71, + "grad_norm": 0.1842517852783203, + "learning_rate": 0.00018495218746502582, + "loss": 0.9497, + "step": 2017 + }, + { + "epoch": 0.71, + "grad_norm": 0.18859322369098663, + "learning_rate": 0.00018493757678868445, + "loss": 1.0115, + "step": 2018 + }, + { + "epoch": 0.71, + "grad_norm": 0.19249236583709717, + "learning_rate": 0.00018492295960034815, + "loss": 1.002, + "step": 2019 + }, + { + "epoch": 0.71, + "grad_norm": 0.18580509722232819, + "learning_rate": 0.00018490833590113767, + "loss": 0.9569, + "step": 2020 + }, + { + "epoch": 0.71, + "grad_norm": 0.19517533481121063, + "learning_rate": 0.00018489370569217415, + "loss": 0.9678, + "step": 2021 + }, + { + "epoch": 0.71, + "grad_norm": 0.17847387492656708, + "learning_rate": 0.00018487906897457924, + "loss": 1.008, + "step": 2022 + }, + { + "epoch": 0.71, + "grad_norm": 0.18228110671043396, + "learning_rate": 0.00018486442574947511, + "loss": 0.9509, + "step": 2023 + }, + { + "epoch": 0.71, + "grad_norm": 0.18810895085334778, + "learning_rate": 0.00018484977601798444, + "loss": 0.965, + "step": 2024 + }, + { + "epoch": 0.71, + "grad_norm": 0.18813060224056244, + "learning_rate": 0.0001848351197812304, + "loss": 1.026, + "step": 2025 + }, + { + "epoch": 0.71, + "grad_norm": 0.1792432814836502, + "learning_rate": 0.00018482045704033663, + "loss": 0.9821, + "step": 2026 + }, + { + "epoch": 0.71, + "grad_norm": 0.19384756684303284, + "learning_rate": 0.00018480578779642734, + "loss": 0.9905, + "step": 2027 + }, + { + "epoch": 0.71, + "grad_norm": 0.1915934830904007, + "learning_rate": 0.00018479111205062715, + "loss": 1.0132, + "step": 2028 + }, + { + "epoch": 0.71, + "grad_norm": 0.21746857464313507, + "learning_rate": 0.00018477642980406126, + "loss": 0.888, + "step": 2029 + }, + { + "epoch": 0.71, + "grad_norm": 0.1871354579925537, + "learning_rate": 0.00018476174105785527, + "loss": 1.1023, + "step": 2030 + }, + { + "epoch": 0.72, + "grad_norm": 0.19254761934280396, + "learning_rate": 0.00018474704581313537, + "loss": 0.9672, + "step": 2031 + }, + { + "epoch": 0.72, + "grad_norm": 0.18293356895446777, + "learning_rate": 0.0001847323440710282, + "loss": 1.0073, + "step": 2032 + }, + { + "epoch": 0.72, + "grad_norm": 0.1897682547569275, + "learning_rate": 0.00018471763583266095, + "loss": 0.9788, + "step": 2033 + }, + { + "epoch": 0.72, + "grad_norm": 0.178378164768219, + "learning_rate": 0.00018470292109916127, + "loss": 0.9124, + "step": 2034 + }, + { + "epoch": 0.72, + "grad_norm": 0.1845291703939438, + "learning_rate": 0.00018468819987165725, + "loss": 1.0039, + "step": 2035 + }, + { + "epoch": 0.72, + "grad_norm": 0.18279947340488434, + "learning_rate": 0.0001846734721512776, + "loss": 0.9861, + "step": 2036 + }, + { + "epoch": 0.72, + "grad_norm": 0.2001655399799347, + "learning_rate": 0.00018465873793915141, + "loss": 0.9734, + "step": 2037 + }, + { + "epoch": 0.72, + "grad_norm": 0.1830873042345047, + "learning_rate": 0.00018464399723640837, + "loss": 0.9534, + "step": 2038 + }, + { + "epoch": 0.72, + "grad_norm": 0.18374669551849365, + "learning_rate": 0.00018462925004417862, + "loss": 1.0282, + "step": 2039 + }, + { + "epoch": 0.72, + "grad_norm": 0.18520888686180115, + "learning_rate": 0.00018461449636359277, + "loss": 0.9944, + "step": 2040 + }, + { + "epoch": 0.72, + "grad_norm": 0.18403057754039764, + "learning_rate": 0.00018459973619578193, + "loss": 0.946, + "step": 2041 + }, + { + "epoch": 0.72, + "grad_norm": 0.1820298135280609, + "learning_rate": 0.00018458496954187783, + "loss": 0.9202, + "step": 2042 + }, + { + "epoch": 0.72, + "grad_norm": 0.1788777858018875, + "learning_rate": 0.0001845701964030125, + "loss": 1.0047, + "step": 2043 + }, + { + "epoch": 0.72, + "grad_norm": 0.18801230192184448, + "learning_rate": 0.0001845554167803186, + "loss": 0.9193, + "step": 2044 + }, + { + "epoch": 0.72, + "grad_norm": 0.17807801067829132, + "learning_rate": 0.0001845406306749293, + "loss": 0.9653, + "step": 2045 + }, + { + "epoch": 0.72, + "grad_norm": 0.19261790812015533, + "learning_rate": 0.00018452583808797814, + "loss": 0.9817, + "step": 2046 + }, + { + "epoch": 0.72, + "grad_norm": 0.18605123460292816, + "learning_rate": 0.0001845110390205993, + "loss": 0.987, + "step": 2047 + }, + { + "epoch": 0.72, + "grad_norm": 0.17385923862457275, + "learning_rate": 0.00018449623347392737, + "loss": 0.9361, + "step": 2048 + }, + { + "epoch": 0.72, + "grad_norm": 0.188616544008255, + "learning_rate": 0.00018448142144909747, + "loss": 1.0278, + "step": 2049 + }, + { + "epoch": 0.72, + "grad_norm": 0.1917959302663803, + "learning_rate": 0.0001844666029472452, + "loss": 0.9451, + "step": 2050 + }, + { + "epoch": 0.72, + "grad_norm": 0.18233361840248108, + "learning_rate": 0.00018445177796950669, + "loss": 0.9313, + "step": 2051 + }, + { + "epoch": 0.72, + "grad_norm": 0.19012048840522766, + "learning_rate": 0.0001844369465170185, + "loss": 1.0405, + "step": 2052 + }, + { + "epoch": 0.72, + "grad_norm": 0.18029513955116272, + "learning_rate": 0.00018442210859091776, + "loss": 0.9404, + "step": 2053 + }, + { + "epoch": 0.72, + "grad_norm": 0.200601264834404, + "learning_rate": 0.00018440726419234203, + "loss": 1.0396, + "step": 2054 + }, + { + "epoch": 0.72, + "grad_norm": 0.18682414293289185, + "learning_rate": 0.00018439241332242944, + "loss": 0.9931, + "step": 2055 + }, + { + "epoch": 0.72, + "grad_norm": 0.17674268782138824, + "learning_rate": 0.00018437755598231856, + "loss": 0.9469, + "step": 2056 + }, + { + "epoch": 0.72, + "grad_norm": 0.1858448088169098, + "learning_rate": 0.0001843626921731485, + "loss": 0.9994, + "step": 2057 + }, + { + "epoch": 0.72, + "grad_norm": 0.18077892065048218, + "learning_rate": 0.00018434782189605877, + "loss": 0.9673, + "step": 2058 + }, + { + "epoch": 0.73, + "grad_norm": 0.1898825615644455, + "learning_rate": 0.0001843329451521895, + "loss": 0.9681, + "step": 2059 + }, + { + "epoch": 0.73, + "grad_norm": 0.17584487795829773, + "learning_rate": 0.00018431806194268126, + "loss": 0.9353, + "step": 2060 + }, + { + "epoch": 0.73, + "grad_norm": 0.2292255014181137, + "learning_rate": 0.0001843031722686751, + "loss": 0.896, + "step": 2061 + }, + { + "epoch": 0.73, + "grad_norm": 0.1748904138803482, + "learning_rate": 0.0001842882761313126, + "loss": 0.8746, + "step": 2062 + }, + { + "epoch": 0.73, + "grad_norm": 0.17828615009784698, + "learning_rate": 0.00018427337353173578, + "loss": 0.9323, + "step": 2063 + }, + { + "epoch": 0.73, + "grad_norm": 0.18547767400741577, + "learning_rate": 0.00018425846447108718, + "loss": 0.9273, + "step": 2064 + }, + { + "epoch": 0.73, + "grad_norm": 0.18525430560112, + "learning_rate": 0.00018424354895050994, + "loss": 1.0056, + "step": 2065 + }, + { + "epoch": 0.73, + "grad_norm": 0.17384150624275208, + "learning_rate": 0.00018422862697114754, + "loss": 0.9105, + "step": 2066 + }, + { + "epoch": 0.73, + "grad_norm": 0.18717877566814423, + "learning_rate": 0.00018421369853414401, + "loss": 0.9566, + "step": 2067 + }, + { + "epoch": 0.73, + "grad_norm": 0.19069983065128326, + "learning_rate": 0.00018419876364064388, + "loss": 1.0089, + "step": 2068 + }, + { + "epoch": 0.73, + "grad_norm": 0.17962156236171722, + "learning_rate": 0.0001841838222917922, + "loss": 0.9166, + "step": 2069 + }, + { + "epoch": 0.73, + "grad_norm": 0.18591444194316864, + "learning_rate": 0.00018416887448873453, + "loss": 0.9007, + "step": 2070 + }, + { + "epoch": 0.73, + "grad_norm": 0.1889745593070984, + "learning_rate": 0.00018415392023261683, + "loss": 1.0553, + "step": 2071 + }, + { + "epoch": 0.73, + "grad_norm": 0.18329787254333496, + "learning_rate": 0.00018413895952458563, + "loss": 0.9331, + "step": 2072 + }, + { + "epoch": 0.73, + "grad_norm": 0.18456338346004486, + "learning_rate": 0.0001841239923657879, + "loss": 0.9649, + "step": 2073 + }, + { + "epoch": 0.73, + "grad_norm": 0.2080213725566864, + "learning_rate": 0.0001841090187573712, + "loss": 0.973, + "step": 2074 + }, + { + "epoch": 0.73, + "grad_norm": 0.17959704995155334, + "learning_rate": 0.00018409403870048356, + "loss": 0.9257, + "step": 2075 + }, + { + "epoch": 0.73, + "grad_norm": 0.18018606305122375, + "learning_rate": 0.00018407905219627337, + "loss": 0.9597, + "step": 2076 + }, + { + "epoch": 0.73, + "grad_norm": 0.1891738921403885, + "learning_rate": 0.00018406405924588967, + "loss": 0.9433, + "step": 2077 + }, + { + "epoch": 0.73, + "grad_norm": 0.18608370423316956, + "learning_rate": 0.00018404905985048192, + "loss": 0.9511, + "step": 2078 + }, + { + "epoch": 0.73, + "grad_norm": 0.18085582554340363, + "learning_rate": 0.00018403405401120013, + "loss": 1.0328, + "step": 2079 + }, + { + "epoch": 0.73, + "grad_norm": 0.19775338470935822, + "learning_rate": 0.00018401904172919475, + "loss": 0.8966, + "step": 2080 + }, + { + "epoch": 0.73, + "grad_norm": 0.1780966818332672, + "learning_rate": 0.00018400402300561673, + "loss": 0.9511, + "step": 2081 + }, + { + "epoch": 0.73, + "grad_norm": 0.19541499018669128, + "learning_rate": 0.00018398899784161752, + "loss": 1.0246, + "step": 2082 + }, + { + "epoch": 0.73, + "grad_norm": 0.18545083701610565, + "learning_rate": 0.0001839739662383491, + "loss": 1.0099, + "step": 2083 + }, + { + "epoch": 0.73, + "grad_norm": 0.17320199310779572, + "learning_rate": 0.00018395892819696389, + "loss": 0.9432, + "step": 2084 + }, + { + "epoch": 0.73, + "grad_norm": 0.18442998826503754, + "learning_rate": 0.00018394388371861483, + "loss": 1.0097, + "step": 2085 + }, + { + "epoch": 0.73, + "grad_norm": 0.17621727287769318, + "learning_rate": 0.00018392883280445537, + "loss": 1.0116, + "step": 2086 + }, + { + "epoch": 0.73, + "grad_norm": 0.18226811289787292, + "learning_rate": 0.00018391377545563938, + "loss": 1.0203, + "step": 2087 + }, + { + "epoch": 0.74, + "grad_norm": 0.18577659130096436, + "learning_rate": 0.00018389871167332134, + "loss": 0.9453, + "step": 2088 + }, + { + "epoch": 0.74, + "grad_norm": 0.18943215906620026, + "learning_rate": 0.00018388364145865613, + "loss": 0.9726, + "step": 2089 + }, + { + "epoch": 0.74, + "grad_norm": 0.18668906390666962, + "learning_rate": 0.00018386856481279916, + "loss": 0.9741, + "step": 2090 + }, + { + "epoch": 0.74, + "grad_norm": 0.17848534882068634, + "learning_rate": 0.00018385348173690632, + "loss": 0.9991, + "step": 2091 + }, + { + "epoch": 0.74, + "grad_norm": 0.18247073888778687, + "learning_rate": 0.000183838392232134, + "loss": 0.9163, + "step": 2092 + }, + { + "epoch": 0.74, + "grad_norm": 0.1850193589925766, + "learning_rate": 0.00018382329629963907, + "loss": 0.9621, + "step": 2093 + }, + { + "epoch": 0.74, + "grad_norm": 0.18309763073921204, + "learning_rate": 0.00018380819394057894, + "loss": 0.9825, + "step": 2094 + }, + { + "epoch": 0.74, + "grad_norm": 0.19286571443080902, + "learning_rate": 0.00018379308515611146, + "loss": 0.994, + "step": 2095 + }, + { + "epoch": 0.74, + "grad_norm": 0.1862298846244812, + "learning_rate": 0.00018377796994739498, + "loss": 0.961, + "step": 2096 + }, + { + "epoch": 0.74, + "grad_norm": 0.18532899022102356, + "learning_rate": 0.00018376284831558834, + "loss": 1.0029, + "step": 2097 + }, + { + "epoch": 0.74, + "grad_norm": 0.18360939621925354, + "learning_rate": 0.00018374772026185095, + "loss": 0.9917, + "step": 2098 + }, + { + "epoch": 0.74, + "grad_norm": 0.18725305795669556, + "learning_rate": 0.00018373258578734255, + "loss": 0.9478, + "step": 2099 + }, + { + "epoch": 0.74, + "grad_norm": 0.18553733825683594, + "learning_rate": 0.00018371744489322357, + "loss": 0.9833, + "step": 2100 + }, + { + "epoch": 0.74, + "grad_norm": 0.17913220822811127, + "learning_rate": 0.00018370229758065478, + "loss": 0.9987, + "step": 2101 + }, + { + "epoch": 0.74, + "grad_norm": 0.1810830533504486, + "learning_rate": 0.0001836871438507975, + "loss": 0.9334, + "step": 2102 + }, + { + "epoch": 0.74, + "grad_norm": 0.19181925058364868, + "learning_rate": 0.00018367198370481354, + "loss": 0.9945, + "step": 2103 + }, + { + "epoch": 0.74, + "grad_norm": 0.1797618567943573, + "learning_rate": 0.00018365681714386516, + "loss": 0.9155, + "step": 2104 + }, + { + "epoch": 0.74, + "grad_norm": 0.18634293973445892, + "learning_rate": 0.0001836416441691152, + "loss": 0.9568, + "step": 2105 + }, + { + "epoch": 0.74, + "grad_norm": 0.1783328354358673, + "learning_rate": 0.00018362646478172696, + "loss": 0.9257, + "step": 2106 + }, + { + "epoch": 0.74, + "grad_norm": 0.19061227142810822, + "learning_rate": 0.00018361127898286416, + "loss": 1.0015, + "step": 2107 + }, + { + "epoch": 0.74, + "grad_norm": 0.18546177446842194, + "learning_rate": 0.00018359608677369108, + "loss": 0.9977, + "step": 2108 + }, + { + "epoch": 0.74, + "grad_norm": 0.18221919238567352, + "learning_rate": 0.00018358088815537246, + "loss": 0.9498, + "step": 2109 + }, + { + "epoch": 0.74, + "grad_norm": 0.18326273560523987, + "learning_rate": 0.0001835656831290736, + "loss": 0.9702, + "step": 2110 + }, + { + "epoch": 0.74, + "grad_norm": 0.1829109638929367, + "learning_rate": 0.0001835504716959602, + "loss": 0.9142, + "step": 2111 + }, + { + "epoch": 0.74, + "grad_norm": 0.18340815603733063, + "learning_rate": 0.0001835352538571985, + "loss": 0.9222, + "step": 2112 + }, + { + "epoch": 0.74, + "grad_norm": 0.19066669046878815, + "learning_rate": 0.0001835200296139552, + "loss": 0.9965, + "step": 2113 + }, + { + "epoch": 0.74, + "grad_norm": 0.18047630786895752, + "learning_rate": 0.00018350479896739756, + "loss": 0.9369, + "step": 2114 + }, + { + "epoch": 0.74, + "grad_norm": 0.1809929460287094, + "learning_rate": 0.00018348956191869328, + "loss": 0.9247, + "step": 2115 + }, + { + "epoch": 0.75, + "grad_norm": 0.18403151631355286, + "learning_rate": 0.00018347431846901048, + "loss": 0.9744, + "step": 2116 + }, + { + "epoch": 0.75, + "grad_norm": 0.19564421474933624, + "learning_rate": 0.0001834590686195179, + "loss": 0.9778, + "step": 2117 + }, + { + "epoch": 0.75, + "grad_norm": 0.1840590536594391, + "learning_rate": 0.00018344381237138472, + "loss": 0.9687, + "step": 2118 + }, + { + "epoch": 0.75, + "grad_norm": 0.18494993448257446, + "learning_rate": 0.00018342854972578062, + "loss": 0.9746, + "step": 2119 + }, + { + "epoch": 0.75, + "grad_norm": 0.18578451871871948, + "learning_rate": 0.00018341328068387572, + "loss": 1.0011, + "step": 2120 + }, + { + "epoch": 0.75, + "grad_norm": 0.18412530422210693, + "learning_rate": 0.00018339800524684068, + "loss": 0.907, + "step": 2121 + }, + { + "epoch": 0.75, + "grad_norm": 0.18739020824432373, + "learning_rate": 0.00018338272341584662, + "loss": 0.961, + "step": 2122 + }, + { + "epoch": 0.75, + "grad_norm": 0.1956738978624344, + "learning_rate": 0.0001833674351920652, + "loss": 0.9948, + "step": 2123 + }, + { + "epoch": 0.75, + "grad_norm": 0.18302105367183685, + "learning_rate": 0.00018335214057666854, + "loss": 0.9506, + "step": 2124 + }, + { + "epoch": 0.75, + "grad_norm": 0.18477575480937958, + "learning_rate": 0.0001833368395708292, + "loss": 0.9732, + "step": 2125 + }, + { + "epoch": 0.75, + "grad_norm": 0.18320126831531525, + "learning_rate": 0.00018332153217572034, + "loss": 0.9378, + "step": 2126 + }, + { + "epoch": 0.75, + "grad_norm": 0.18950019776821136, + "learning_rate": 0.00018330621839251547, + "loss": 0.9409, + "step": 2127 + }, + { + "epoch": 0.75, + "grad_norm": 0.1776246428489685, + "learning_rate": 0.00018329089822238874, + "loss": 0.9854, + "step": 2128 + }, + { + "epoch": 0.75, + "grad_norm": 0.19237257540225983, + "learning_rate": 0.0001832755716665147, + "loss": 0.958, + "step": 2129 + }, + { + "epoch": 0.75, + "grad_norm": 0.19101490080356598, + "learning_rate": 0.0001832602387260684, + "loss": 0.9659, + "step": 2130 + }, + { + "epoch": 0.75, + "eval_loss": 0.9828736782073975, + "eval_runtime": 680.0071, + "eval_samples_per_second": 10.112, + "eval_steps_per_second": 5.056, + "step": 2130 + }, + { + "epoch": 0.75, + "grad_norm": 0.19474919140338898, + "learning_rate": 0.00018324489940222538, + "loss": 1.0003, + "step": 2131 + }, + { + "epoch": 0.75, + "grad_norm": 0.19254593551158905, + "learning_rate": 0.00018322955369616165, + "loss": 0.9636, + "step": 2132 + }, + { + "epoch": 0.75, + "grad_norm": 0.18535253405570984, + "learning_rate": 0.00018321420160905382, + "loss": 0.9502, + "step": 2133 + }, + { + "epoch": 0.75, + "grad_norm": 0.18019430339336395, + "learning_rate": 0.00018319884314207878, + "loss": 1.0317, + "step": 2134 + }, + { + "epoch": 0.75, + "grad_norm": 0.18415750563144684, + "learning_rate": 0.00018318347829641412, + "loss": 0.9957, + "step": 2135 + }, + { + "epoch": 0.75, + "grad_norm": 0.18982534110546112, + "learning_rate": 0.00018316810707323783, + "loss": 0.9908, + "step": 2136 + }, + { + "epoch": 0.75, + "grad_norm": 0.19444747269153595, + "learning_rate": 0.00018315272947372837, + "loss": 1.0274, + "step": 2137 + }, + { + "epoch": 0.75, + "grad_norm": 0.19093841314315796, + "learning_rate": 0.0001831373454990647, + "loss": 1.0041, + "step": 2138 + }, + { + "epoch": 0.75, + "grad_norm": 0.18283675611019135, + "learning_rate": 0.00018312195515042632, + "loss": 0.9485, + "step": 2139 + }, + { + "epoch": 0.75, + "grad_norm": 0.17775124311447144, + "learning_rate": 0.00018310655842899312, + "loss": 0.9003, + "step": 2140 + }, + { + "epoch": 0.75, + "grad_norm": 0.20296508073806763, + "learning_rate": 0.00018309115533594558, + "loss": 1.0346, + "step": 2141 + }, + { + "epoch": 0.75, + "grad_norm": 0.191764697432518, + "learning_rate": 0.00018307574587246459, + "loss": 0.9972, + "step": 2142 + }, + { + "epoch": 0.75, + "grad_norm": 0.19153252243995667, + "learning_rate": 0.0001830603300397316, + "loss": 0.9488, + "step": 2143 + }, + { + "epoch": 0.75, + "grad_norm": 0.1953708678483963, + "learning_rate": 0.00018304490783892848, + "loss": 1.1073, + "step": 2144 + }, + { + "epoch": 0.76, + "grad_norm": 0.20149077475070953, + "learning_rate": 0.00018302947927123766, + "loss": 0.9729, + "step": 2145 + }, + { + "epoch": 0.76, + "grad_norm": 0.19051818549633026, + "learning_rate": 0.00018301404433784194, + "loss": 1.0179, + "step": 2146 + }, + { + "epoch": 0.76, + "grad_norm": 0.18015353381633759, + "learning_rate": 0.0001829986030399248, + "loss": 0.862, + "step": 2147 + }, + { + "epoch": 0.76, + "grad_norm": 0.19477394223213196, + "learning_rate": 0.00018298315537866998, + "loss": 1.0168, + "step": 2148 + }, + { + "epoch": 0.76, + "grad_norm": 0.1973867565393448, + "learning_rate": 0.0001829677013552619, + "loss": 1.0129, + "step": 2149 + }, + { + "epoch": 0.76, + "grad_norm": 0.18476396799087524, + "learning_rate": 0.00018295224097088534, + "loss": 0.9461, + "step": 2150 + }, + { + "epoch": 0.76, + "grad_norm": 0.18751783668994904, + "learning_rate": 0.00018293677422672567, + "loss": 1.0069, + "step": 2151 + }, + { + "epoch": 0.76, + "grad_norm": 0.19476616382598877, + "learning_rate": 0.00018292130112396865, + "loss": 0.9646, + "step": 2152 + }, + { + "epoch": 0.76, + "grad_norm": 0.1982491910457611, + "learning_rate": 0.00018290582166380058, + "loss": 1.0004, + "step": 2153 + }, + { + "epoch": 0.76, + "grad_norm": 0.18301215767860413, + "learning_rate": 0.0001828903358474082, + "loss": 0.9201, + "step": 2154 + }, + { + "epoch": 0.76, + "grad_norm": 0.19771361351013184, + "learning_rate": 0.00018287484367597888, + "loss": 0.9588, + "step": 2155 + }, + { + "epoch": 0.76, + "grad_norm": 0.18548940122127533, + "learning_rate": 0.00018285934515070026, + "loss": 0.9673, + "step": 2156 + }, + { + "epoch": 0.76, + "grad_norm": 0.1889079064130783, + "learning_rate": 0.00018284384027276068, + "loss": 1.0334, + "step": 2157 + }, + { + "epoch": 0.76, + "grad_norm": 0.20975716412067413, + "learning_rate": 0.00018282832904334883, + "loss": 1.0518, + "step": 2158 + }, + { + "epoch": 0.76, + "grad_norm": 0.19074541330337524, + "learning_rate": 0.0001828128114636539, + "loss": 0.9866, + "step": 2159 + }, + { + "epoch": 0.76, + "grad_norm": 0.20263442397117615, + "learning_rate": 0.00018279728753486558, + "loss": 1.0949, + "step": 2160 + }, + { + "epoch": 0.76, + "grad_norm": 0.19469326734542847, + "learning_rate": 0.00018278175725817413, + "loss": 0.9735, + "step": 2161 + }, + { + "epoch": 0.76, + "grad_norm": 0.19660715758800507, + "learning_rate": 0.00018276622063477017, + "loss": 0.9821, + "step": 2162 + }, + { + "epoch": 0.76, + "grad_norm": 0.18767257034778595, + "learning_rate": 0.00018275067766584488, + "loss": 0.9505, + "step": 2163 + }, + { + "epoch": 0.76, + "grad_norm": 0.17904675006866455, + "learning_rate": 0.0001827351283525899, + "loss": 0.965, + "step": 2164 + }, + { + "epoch": 0.76, + "grad_norm": 0.18472503125667572, + "learning_rate": 0.00018271957269619737, + "loss": 0.9939, + "step": 2165 + }, + { + "epoch": 0.76, + "grad_norm": 0.18497197329998016, + "learning_rate": 0.00018270401069785992, + "loss": 1.0184, + "step": 2166 + }, + { + "epoch": 0.76, + "grad_norm": 0.1877606064081192, + "learning_rate": 0.00018268844235877062, + "loss": 0.923, + "step": 2167 + }, + { + "epoch": 0.76, + "grad_norm": 0.18791928887367249, + "learning_rate": 0.00018267286768012313, + "loss": 1.0198, + "step": 2168 + }, + { + "epoch": 0.76, + "grad_norm": 0.1900901347398758, + "learning_rate": 0.00018265728666311146, + "loss": 1.0139, + "step": 2169 + }, + { + "epoch": 0.76, + "grad_norm": 0.18016625940799713, + "learning_rate": 0.00018264169930893022, + "loss": 0.9624, + "step": 2170 + }, + { + "epoch": 0.76, + "grad_norm": 0.1885920614004135, + "learning_rate": 0.00018262610561877444, + "loss": 0.9212, + "step": 2171 + }, + { + "epoch": 0.76, + "grad_norm": 0.19056080281734467, + "learning_rate": 0.00018261050559383968, + "loss": 0.9597, + "step": 2172 + }, + { + "epoch": 0.77, + "grad_norm": 0.19492998719215393, + "learning_rate": 0.00018259489923532195, + "loss": 1.0138, + "step": 2173 + }, + { + "epoch": 0.77, + "grad_norm": 0.17653758823871613, + "learning_rate": 0.00018257928654441777, + "loss": 0.9123, + "step": 2174 + }, + { + "epoch": 0.77, + "grad_norm": 0.17693361639976501, + "learning_rate": 0.00018256366752232407, + "loss": 0.9263, + "step": 2175 + }, + { + "epoch": 0.77, + "grad_norm": 0.18627983331680298, + "learning_rate": 0.00018254804217023843, + "loss": 0.9921, + "step": 2176 + }, + { + "epoch": 0.77, + "grad_norm": 0.18731026351451874, + "learning_rate": 0.00018253241048935874, + "loss": 0.9612, + "step": 2177 + }, + { + "epoch": 0.77, + "grad_norm": 0.19288547337055206, + "learning_rate": 0.00018251677248088348, + "loss": 1.0085, + "step": 2178 + }, + { + "epoch": 0.77, + "grad_norm": 0.18385879695415497, + "learning_rate": 0.00018250112814601158, + "loss": 0.9856, + "step": 2179 + }, + { + "epoch": 0.77, + "grad_norm": 0.1872677505016327, + "learning_rate": 0.00018248547748594244, + "loss": 1.0597, + "step": 2180 + }, + { + "epoch": 0.77, + "grad_norm": 0.18108715116977692, + "learning_rate": 0.000182469820501876, + "loss": 0.9922, + "step": 2181 + }, + { + "epoch": 0.77, + "grad_norm": 0.18408574163913727, + "learning_rate": 0.00018245415719501265, + "loss": 0.9816, + "step": 2182 + }, + { + "epoch": 0.77, + "grad_norm": 0.19224940240383148, + "learning_rate": 0.00018243848756655323, + "loss": 1.0338, + "step": 2183 + }, + { + "epoch": 0.77, + "grad_norm": 0.18560826778411865, + "learning_rate": 0.0001824228116176991, + "loss": 1.0483, + "step": 2184 + }, + { + "epoch": 0.77, + "grad_norm": 0.18587149679660797, + "learning_rate": 0.00018240712934965218, + "loss": 0.9734, + "step": 2185 + }, + { + "epoch": 0.77, + "grad_norm": 0.19045782089233398, + "learning_rate": 0.0001823914407636147, + "loss": 1.0319, + "step": 2186 + }, + { + "epoch": 0.77, + "grad_norm": 0.17874370515346527, + "learning_rate": 0.0001823757458607895, + "loss": 0.9584, + "step": 2187 + }, + { + "epoch": 0.77, + "grad_norm": 0.19332368671894073, + "learning_rate": 0.00018236004464237992, + "loss": 1.0145, + "step": 2188 + }, + { + "epoch": 0.77, + "grad_norm": 0.18693716824054718, + "learning_rate": 0.0001823443371095897, + "loss": 1.0324, + "step": 2189 + }, + { + "epoch": 0.77, + "grad_norm": 0.1814510077238083, + "learning_rate": 0.0001823286232636231, + "loss": 1.0154, + "step": 2190 + }, + { + "epoch": 0.77, + "grad_norm": 0.1795709729194641, + "learning_rate": 0.0001823129031056849, + "loss": 0.9817, + "step": 2191 + }, + { + "epoch": 0.77, + "grad_norm": 0.18347346782684326, + "learning_rate": 0.00018229717663698032, + "loss": 0.955, + "step": 2192 + }, + { + "epoch": 0.77, + "grad_norm": 0.190670445561409, + "learning_rate": 0.0001822814438587151, + "loss": 0.9807, + "step": 2193 + }, + { + "epoch": 0.77, + "grad_norm": 0.18033914268016815, + "learning_rate": 0.00018226570477209534, + "loss": 1.0029, + "step": 2194 + }, + { + "epoch": 0.77, + "grad_norm": 0.18933604657649994, + "learning_rate": 0.00018224995937832786, + "loss": 0.9991, + "step": 2195 + }, + { + "epoch": 0.77, + "grad_norm": 0.1753634661436081, + "learning_rate": 0.00018223420767861978, + "loss": 0.9316, + "step": 2196 + }, + { + "epoch": 0.77, + "grad_norm": 0.18977148830890656, + "learning_rate": 0.0001822184496741787, + "loss": 0.9702, + "step": 2197 + }, + { + "epoch": 0.77, + "grad_norm": 0.18761374056339264, + "learning_rate": 0.00018220268536621283, + "loss": 1.0516, + "step": 2198 + }, + { + "epoch": 0.77, + "grad_norm": 0.18400248885154724, + "learning_rate": 0.00018218691475593073, + "loss": 0.962, + "step": 2199 + }, + { + "epoch": 0.77, + "grad_norm": 0.1761545091867447, + "learning_rate": 0.00018217113784454151, + "loss": 0.9558, + "step": 2200 + }, + { + "epoch": 0.78, + "grad_norm": 0.18742233514785767, + "learning_rate": 0.0001821553546332548, + "loss": 0.967, + "step": 2201 + }, + { + "epoch": 0.78, + "grad_norm": 0.18793874979019165, + "learning_rate": 0.00018213956512328064, + "loss": 1.0253, + "step": 2202 + }, + { + "epoch": 0.78, + "grad_norm": 0.18570399284362793, + "learning_rate": 0.00018212376931582956, + "loss": 0.9804, + "step": 2203 + }, + { + "epoch": 0.78, + "grad_norm": 0.18871964514255524, + "learning_rate": 0.0001821079672121126, + "loss": 1.021, + "step": 2204 + }, + { + "epoch": 0.78, + "grad_norm": 0.19510523974895477, + "learning_rate": 0.0001820921588133413, + "loss": 1.0137, + "step": 2205 + }, + { + "epoch": 0.78, + "grad_norm": 0.18368487060070038, + "learning_rate": 0.00018207634412072764, + "loss": 0.9795, + "step": 2206 + }, + { + "epoch": 0.78, + "grad_norm": 0.1859757900238037, + "learning_rate": 0.00018206052313548413, + "loss": 0.9345, + "step": 2207 + }, + { + "epoch": 0.78, + "grad_norm": 0.2072616070508957, + "learning_rate": 0.00018204469585882365, + "loss": 1.0181, + "step": 2208 + }, + { + "epoch": 0.78, + "grad_norm": 0.1866096556186676, + "learning_rate": 0.00018202886229195975, + "loss": 0.9891, + "step": 2209 + }, + { + "epoch": 0.78, + "grad_norm": 0.18876902759075165, + "learning_rate": 0.0001820130224361063, + "loss": 1.0037, + "step": 2210 + }, + { + "epoch": 0.78, + "grad_norm": 0.18180185556411743, + "learning_rate": 0.00018199717629247773, + "loss": 1.0389, + "step": 2211 + }, + { + "epoch": 0.78, + "grad_norm": 0.19424739480018616, + "learning_rate": 0.0001819813238622889, + "loss": 0.9684, + "step": 2212 + }, + { + "epoch": 0.78, + "grad_norm": 0.188624769449234, + "learning_rate": 0.00018196546514675523, + "loss": 0.956, + "step": 2213 + }, + { + "epoch": 0.78, + "grad_norm": 0.1843746155500412, + "learning_rate": 0.00018194960014709257, + "loss": 0.9866, + "step": 2214 + }, + { + "epoch": 0.78, + "grad_norm": 0.18615764379501343, + "learning_rate": 0.0001819337288645172, + "loss": 0.9164, + "step": 2215 + }, + { + "epoch": 0.78, + "grad_norm": 0.17863336205482483, + "learning_rate": 0.000181917851300246, + "loss": 1.0111, + "step": 2216 + }, + { + "epoch": 0.78, + "grad_norm": 0.20061299204826355, + "learning_rate": 0.00018190196745549627, + "loss": 0.9398, + "step": 2217 + }, + { + "epoch": 0.78, + "grad_norm": 0.18024347722530365, + "learning_rate": 0.00018188607733148578, + "loss": 0.9883, + "step": 2218 + }, + { + "epoch": 0.78, + "grad_norm": 0.17821189761161804, + "learning_rate": 0.00018187018092943278, + "loss": 0.9278, + "step": 2219 + }, + { + "epoch": 0.78, + "grad_norm": 0.1844683438539505, + "learning_rate": 0.00018185427825055602, + "loss": 1.044, + "step": 2220 + }, + { + "epoch": 0.78, + "grad_norm": 0.19516977667808533, + "learning_rate": 0.00018183836929607473, + "loss": 1.0143, + "step": 2221 + }, + { + "epoch": 0.78, + "grad_norm": 0.192205011844635, + "learning_rate": 0.0001818224540672086, + "loss": 1.0169, + "step": 2222 + }, + { + "epoch": 0.78, + "grad_norm": 0.20235204696655273, + "learning_rate": 0.00018180653256517787, + "loss": 1.0375, + "step": 2223 + }, + { + "epoch": 0.78, + "grad_norm": 0.18561050295829773, + "learning_rate": 0.0001817906047912032, + "loss": 0.923, + "step": 2224 + }, + { + "epoch": 0.78, + "grad_norm": 0.19041159749031067, + "learning_rate": 0.0001817746707465057, + "loss": 1.0645, + "step": 2225 + }, + { + "epoch": 0.78, + "grad_norm": 0.18981195986270905, + "learning_rate": 0.000181758730432307, + "loss": 0.935, + "step": 2226 + }, + { + "epoch": 0.78, + "grad_norm": 0.187282532453537, + "learning_rate": 0.00018174278384982926, + "loss": 1.0661, + "step": 2227 + }, + { + "epoch": 0.78, + "grad_norm": 0.18467959761619568, + "learning_rate": 0.00018172683100029503, + "loss": 0.9665, + "step": 2228 + }, + { + "epoch": 0.78, + "grad_norm": 0.19500401616096497, + "learning_rate": 0.00018171087188492742, + "loss": 1.0129, + "step": 2229 + }, + { + "epoch": 0.79, + "grad_norm": 0.19473527371883392, + "learning_rate": 0.00018169490650494995, + "loss": 1.0049, + "step": 2230 + }, + { + "epoch": 0.79, + "grad_norm": 0.19505290687084198, + "learning_rate": 0.0001816789348615867, + "loss": 1.0335, + "step": 2231 + }, + { + "epoch": 0.79, + "grad_norm": 0.1958623081445694, + "learning_rate": 0.0001816629569560621, + "loss": 1.0676, + "step": 2232 + }, + { + "epoch": 0.79, + "grad_norm": 0.18890038132667542, + "learning_rate": 0.00018164697278960122, + "loss": 0.9362, + "step": 2233 + }, + { + "epoch": 0.79, + "grad_norm": 0.18806125223636627, + "learning_rate": 0.00018163098236342952, + "loss": 0.9476, + "step": 2234 + }, + { + "epoch": 0.79, + "grad_norm": 0.19116836786270142, + "learning_rate": 0.0001816149856787729, + "loss": 0.9534, + "step": 2235 + }, + { + "epoch": 0.79, + "grad_norm": 0.1891220211982727, + "learning_rate": 0.00018159898273685787, + "loss": 0.9905, + "step": 2236 + }, + { + "epoch": 0.79, + "grad_norm": 0.18333634734153748, + "learning_rate": 0.0001815829735389113, + "loss": 1.0209, + "step": 2237 + }, + { + "epoch": 0.79, + "grad_norm": 0.18993903696537018, + "learning_rate": 0.00018156695808616062, + "loss": 0.9481, + "step": 2238 + }, + { + "epoch": 0.79, + "grad_norm": 0.1848689764738083, + "learning_rate": 0.00018155093637983367, + "loss": 0.9124, + "step": 2239 + }, + { + "epoch": 0.79, + "grad_norm": 0.19026519358158112, + "learning_rate": 0.00018153490842115879, + "loss": 1.0066, + "step": 2240 + }, + { + "epoch": 0.79, + "grad_norm": 0.18695740401744843, + "learning_rate": 0.00018151887421136484, + "loss": 0.991, + "step": 2241 + }, + { + "epoch": 0.79, + "grad_norm": 0.18972289562225342, + "learning_rate": 0.00018150283375168114, + "loss": 0.9192, + "step": 2242 + }, + { + "epoch": 0.79, + "grad_norm": 0.19145484268665314, + "learning_rate": 0.00018148678704333745, + "loss": 1.0402, + "step": 2243 + }, + { + "epoch": 0.79, + "grad_norm": 0.19382283091545105, + "learning_rate": 0.00018147073408756405, + "loss": 0.9467, + "step": 2244 + }, + { + "epoch": 0.79, + "grad_norm": 0.19206614792346954, + "learning_rate": 0.0001814546748855917, + "loss": 0.9458, + "step": 2245 + }, + { + "epoch": 0.79, + "grad_norm": 0.18128708004951477, + "learning_rate": 0.0001814386094386516, + "loss": 0.93, + "step": 2246 + }, + { + "epoch": 0.79, + "grad_norm": 0.17813989520072937, + "learning_rate": 0.0001814225377479755, + "loss": 0.9856, + "step": 2247 + }, + { + "epoch": 0.79, + "grad_norm": 0.18716156482696533, + "learning_rate": 0.00018140645981479554, + "loss": 0.9642, + "step": 2248 + }, + { + "epoch": 0.79, + "grad_norm": 0.20171204209327698, + "learning_rate": 0.0001813903756403444, + "loss": 0.9714, + "step": 2249 + }, + { + "epoch": 0.79, + "grad_norm": 0.18735407292842865, + "learning_rate": 0.0001813742852258552, + "loss": 0.9901, + "step": 2250 + }, + { + "epoch": 0.79, + "grad_norm": 0.18513000011444092, + "learning_rate": 0.00018135818857256163, + "loss": 0.926, + "step": 2251 + }, + { + "epoch": 0.79, + "grad_norm": 0.19058287143707275, + "learning_rate": 0.00018134208568169774, + "loss": 0.9119, + "step": 2252 + }, + { + "epoch": 0.79, + "grad_norm": 0.1841873973608017, + "learning_rate": 0.00018132597655449805, + "loss": 0.972, + "step": 2253 + }, + { + "epoch": 0.79, + "grad_norm": 0.1925683617591858, + "learning_rate": 0.00018130986119219774, + "loss": 1.0006, + "step": 2254 + }, + { + "epoch": 0.79, + "grad_norm": 0.18620803952217102, + "learning_rate": 0.00018129373959603222, + "loss": 1.022, + "step": 2255 + }, + { + "epoch": 0.79, + "grad_norm": 0.1849374920129776, + "learning_rate": 0.00018127761176723757, + "loss": 0.9537, + "step": 2256 + }, + { + "epoch": 0.79, + "grad_norm": 0.19724594056606293, + "learning_rate": 0.00018126147770705027, + "loss": 0.9363, + "step": 2257 + }, + { + "epoch": 0.8, + "grad_norm": 0.1982530653476715, + "learning_rate": 0.00018124533741670725, + "loss": 1.0424, + "step": 2258 + }, + { + "epoch": 0.8, + "grad_norm": 0.18769797682762146, + "learning_rate": 0.00018122919089744603, + "loss": 1.006, + "step": 2259 + }, + { + "epoch": 0.8, + "grad_norm": 0.18896335363388062, + "learning_rate": 0.00018121303815050449, + "loss": 1.0598, + "step": 2260 + }, + { + "epoch": 0.8, + "grad_norm": 0.1882297247648239, + "learning_rate": 0.00018119687917712097, + "loss": 0.9607, + "step": 2261 + }, + { + "epoch": 0.8, + "grad_norm": 0.18507346510887146, + "learning_rate": 0.00018118071397853447, + "loss": 1.0024, + "step": 2262 + }, + { + "epoch": 0.8, + "grad_norm": 0.18987086415290833, + "learning_rate": 0.00018116454255598423, + "loss": 0.9818, + "step": 2263 + }, + { + "epoch": 0.8, + "grad_norm": 0.18397410213947296, + "learning_rate": 0.00018114836491071014, + "loss": 1.0325, + "step": 2264 + }, + { + "epoch": 0.8, + "grad_norm": 0.18793408572673798, + "learning_rate": 0.00018113218104395248, + "loss": 0.9989, + "step": 2265 + }, + { + "epoch": 0.8, + "grad_norm": 0.1876463145017624, + "learning_rate": 0.00018111599095695205, + "loss": 0.9804, + "step": 2266 + }, + { + "epoch": 0.8, + "grad_norm": 0.17934471368789673, + "learning_rate": 0.00018109979465095013, + "loss": 0.9507, + "step": 2267 + }, + { + "epoch": 0.8, + "grad_norm": 0.17715683579444885, + "learning_rate": 0.00018108359212718841, + "loss": 0.9766, + "step": 2268 + }, + { + "epoch": 0.8, + "grad_norm": 0.17394061386585236, + "learning_rate": 0.00018106738338690917, + "loss": 0.9053, + "step": 2269 + }, + { + "epoch": 0.8, + "grad_norm": 0.17950716614723206, + "learning_rate": 0.00018105116843135504, + "loss": 0.9292, + "step": 2270 + }, + { + "epoch": 0.8, + "grad_norm": 0.18406367301940918, + "learning_rate": 0.00018103494726176922, + "loss": 0.9567, + "step": 2271 + }, + { + "epoch": 0.8, + "grad_norm": 0.18885083496570587, + "learning_rate": 0.00018101871987939535, + "loss": 0.9399, + "step": 2272 + }, + { + "epoch": 0.8, + "grad_norm": 0.19340074062347412, + "learning_rate": 0.0001810024862854775, + "loss": 1.0206, + "step": 2273 + }, + { + "epoch": 0.8, + "grad_norm": 0.182267963886261, + "learning_rate": 0.0001809862464812604, + "loss": 0.9908, + "step": 2274 + }, + { + "epoch": 0.8, + "grad_norm": 0.1867702603340149, + "learning_rate": 0.00018097000046798898, + "loss": 0.9909, + "step": 2275 + }, + { + "epoch": 0.8, + "grad_norm": 0.18310391902923584, + "learning_rate": 0.0001809537482469089, + "loss": 1.0097, + "step": 2276 + }, + { + "epoch": 0.8, + "grad_norm": 0.1698244959115982, + "learning_rate": 0.00018093748981926609, + "loss": 0.906, + "step": 2277 + }, + { + "epoch": 0.8, + "grad_norm": 0.1908068060874939, + "learning_rate": 0.00018092122518630709, + "loss": 0.9443, + "step": 2278 + }, + { + "epoch": 0.8, + "grad_norm": 0.18088297545909882, + "learning_rate": 0.00018090495434927893, + "loss": 0.9008, + "step": 2279 + }, + { + "epoch": 0.8, + "grad_norm": 0.18029263615608215, + "learning_rate": 0.00018088867730942898, + "loss": 1.0218, + "step": 2280 + }, + { + "epoch": 0.8, + "grad_norm": 0.18683558702468872, + "learning_rate": 0.00018087239406800522, + "loss": 1.0452, + "step": 2281 + }, + { + "epoch": 0.8, + "grad_norm": 0.18489494919776917, + "learning_rate": 0.00018085610462625603, + "loss": 0.9451, + "step": 2282 + }, + { + "epoch": 0.8, + "grad_norm": 0.1905013769865036, + "learning_rate": 0.00018083980898543032, + "loss": 1.015, + "step": 2283 + }, + { + "epoch": 0.8, + "grad_norm": 0.18374451994895935, + "learning_rate": 0.00018082350714677739, + "loss": 0.9872, + "step": 2284 + }, + { + "epoch": 0.8, + "grad_norm": 0.19135090708732605, + "learning_rate": 0.0001808071991115471, + "loss": 0.9376, + "step": 2285 + }, + { + "epoch": 0.81, + "grad_norm": 0.19377972185611725, + "learning_rate": 0.00018079088488098978, + "loss": 1.1116, + "step": 2286 + }, + { + "epoch": 0.81, + "grad_norm": 0.18835166096687317, + "learning_rate": 0.0001807745644563562, + "loss": 0.9739, + "step": 2287 + }, + { + "epoch": 0.81, + "grad_norm": 0.1849549114704132, + "learning_rate": 0.00018075823783889755, + "loss": 1.0043, + "step": 2288 + }, + { + "epoch": 0.81, + "grad_norm": 0.1912752389907837, + "learning_rate": 0.00018074190502986563, + "loss": 0.9567, + "step": 2289 + }, + { + "epoch": 0.81, + "grad_norm": 0.1859811693429947, + "learning_rate": 0.00018072556603051265, + "loss": 0.9896, + "step": 2290 + }, + { + "epoch": 0.81, + "grad_norm": 0.17959462106227875, + "learning_rate": 0.00018070922084209123, + "loss": 0.9897, + "step": 2291 + }, + { + "epoch": 0.81, + "grad_norm": 0.18935257196426392, + "learning_rate": 0.00018069286946585455, + "loss": 0.9496, + "step": 2292 + }, + { + "epoch": 0.81, + "grad_norm": 0.19748300313949585, + "learning_rate": 0.00018067651190305627, + "loss": 0.9379, + "step": 2293 + }, + { + "epoch": 0.81, + "grad_norm": 0.1971241533756256, + "learning_rate": 0.00018066014815495046, + "loss": 1.0008, + "step": 2294 + }, + { + "epoch": 0.81, + "grad_norm": 0.18202240765094757, + "learning_rate": 0.00018064377822279168, + "loss": 0.9954, + "step": 2295 + }, + { + "epoch": 0.81, + "grad_norm": 0.19593803584575653, + "learning_rate": 0.000180627402107835, + "loss": 1.0572, + "step": 2296 + }, + { + "epoch": 0.81, + "grad_norm": 0.18760298192501068, + "learning_rate": 0.00018061101981133596, + "loss": 0.9804, + "step": 2297 + }, + { + "epoch": 0.81, + "grad_norm": 0.1931912750005722, + "learning_rate": 0.00018059463133455051, + "loss": 0.9814, + "step": 2298 + }, + { + "epoch": 0.81, + "grad_norm": 0.17908045649528503, + "learning_rate": 0.00018057823667873518, + "loss": 0.8676, + "step": 2299 + }, + { + "epoch": 0.81, + "grad_norm": 0.18545086681842804, + "learning_rate": 0.0001805618358451469, + "loss": 1.0557, + "step": 2300 + }, + { + "epoch": 0.81, + "grad_norm": 0.19220340251922607, + "learning_rate": 0.00018054542883504305, + "loss": 0.9436, + "step": 2301 + }, + { + "epoch": 0.81, + "grad_norm": 0.18264839053153992, + "learning_rate": 0.00018052901564968158, + "loss": 1.0022, + "step": 2302 + }, + { + "epoch": 0.81, + "grad_norm": 0.18435068428516388, + "learning_rate": 0.00018051259629032083, + "loss": 0.9704, + "step": 2303 + }, + { + "epoch": 0.81, + "grad_norm": 0.1799696683883667, + "learning_rate": 0.00018049617075821962, + "loss": 0.8971, + "step": 2304 + }, + { + "epoch": 0.81, + "grad_norm": 0.18429692089557648, + "learning_rate": 0.00018047973905463728, + "loss": 0.9938, + "step": 2305 + }, + { + "epoch": 0.81, + "grad_norm": 0.18325746059417725, + "learning_rate": 0.0001804633011808336, + "loss": 0.9852, + "step": 2306 + }, + { + "epoch": 0.81, + "grad_norm": 0.18153943121433258, + "learning_rate": 0.00018044685713806881, + "loss": 0.9657, + "step": 2307 + }, + { + "epoch": 0.81, + "grad_norm": 0.18153220415115356, + "learning_rate": 0.00018043040692760368, + "loss": 0.9414, + "step": 2308 + }, + { + "epoch": 0.81, + "grad_norm": 0.1895730197429657, + "learning_rate": 0.00018041395055069944, + "loss": 0.9586, + "step": 2309 + }, + { + "epoch": 0.81, + "grad_norm": 0.19165296852588654, + "learning_rate": 0.00018039748800861768, + "loss": 1.0116, + "step": 2310 + }, + { + "epoch": 0.81, + "grad_norm": 0.1748407781124115, + "learning_rate": 0.00018038101930262064, + "loss": 0.9276, + "step": 2311 + }, + { + "epoch": 0.81, + "grad_norm": 0.18681953847408295, + "learning_rate": 0.00018036454443397086, + "loss": 0.976, + "step": 2312 + }, + { + "epoch": 0.81, + "grad_norm": 0.17880845069885254, + "learning_rate": 0.00018034806340393153, + "loss": 0.9367, + "step": 2313 + }, + { + "epoch": 0.81, + "grad_norm": 0.19864773750305176, + "learning_rate": 0.00018033157621376612, + "loss": 0.9697, + "step": 2314 + }, + { + "epoch": 0.82, + "grad_norm": 0.19031883776187897, + "learning_rate": 0.00018031508286473874, + "loss": 1.0077, + "step": 2315 + }, + { + "epoch": 0.82, + "grad_norm": 0.18201903998851776, + "learning_rate": 0.0001802985833581139, + "loss": 0.9863, + "step": 2316 + }, + { + "epoch": 0.82, + "grad_norm": 0.19363977015018463, + "learning_rate": 0.0001802820776951565, + "loss": 1.0851, + "step": 2317 + }, + { + "epoch": 0.82, + "grad_norm": 0.19829228520393372, + "learning_rate": 0.0001802655658771321, + "loss": 0.9617, + "step": 2318 + }, + { + "epoch": 0.82, + "grad_norm": 0.17512492835521698, + "learning_rate": 0.0001802490479053066, + "loss": 0.9007, + "step": 2319 + }, + { + "epoch": 0.82, + "grad_norm": 0.18068177998065948, + "learning_rate": 0.00018023252378094635, + "loss": 0.9897, + "step": 2320 + }, + { + "epoch": 0.82, + "grad_norm": 0.18359632790088654, + "learning_rate": 0.0001802159935053183, + "loss": 1.0025, + "step": 2321 + }, + { + "epoch": 0.82, + "grad_norm": 0.18495793640613556, + "learning_rate": 0.00018019945707968972, + "loss": 0.9347, + "step": 2322 + }, + { + "epoch": 0.82, + "grad_norm": 0.18118029832839966, + "learning_rate": 0.00018018291450532848, + "loss": 0.9876, + "step": 2323 + }, + { + "epoch": 0.82, + "grad_norm": 0.193434476852417, + "learning_rate": 0.00018016636578350282, + "loss": 0.9605, + "step": 2324 + }, + { + "epoch": 0.82, + "grad_norm": 0.19045786559581757, + "learning_rate": 0.00018014981091548152, + "loss": 0.9737, + "step": 2325 + }, + { + "epoch": 0.82, + "grad_norm": 0.18402209877967834, + "learning_rate": 0.00018013324990253385, + "loss": 0.967, + "step": 2326 + }, + { + "epoch": 0.82, + "grad_norm": 0.19088177382946014, + "learning_rate": 0.00018011668274592942, + "loss": 0.9235, + "step": 2327 + }, + { + "epoch": 0.82, + "grad_norm": 0.19194436073303223, + "learning_rate": 0.00018010010944693848, + "loss": 0.9763, + "step": 2328 + }, + { + "epoch": 0.82, + "grad_norm": 0.1945270448923111, + "learning_rate": 0.00018008353000683163, + "loss": 0.932, + "step": 2329 + }, + { + "epoch": 0.82, + "grad_norm": 0.38292306661605835, + "learning_rate": 0.00018006694442687998, + "loss": 0.9527, + "step": 2330 + }, + { + "epoch": 0.82, + "grad_norm": 0.19509528577327728, + "learning_rate": 0.00018005035270835516, + "loss": 0.9853, + "step": 2331 + }, + { + "epoch": 0.82, + "grad_norm": 0.19551971554756165, + "learning_rate": 0.00018003375485252919, + "loss": 0.9776, + "step": 2332 + }, + { + "epoch": 0.82, + "grad_norm": 0.17869053781032562, + "learning_rate": 0.00018001715086067457, + "loss": 0.9142, + "step": 2333 + }, + { + "epoch": 0.82, + "grad_norm": 0.1891433149576187, + "learning_rate": 0.00018000054073406436, + "loss": 0.9876, + "step": 2334 + }, + { + "epoch": 0.82, + "grad_norm": 0.1875545084476471, + "learning_rate": 0.00017998392447397197, + "loss": 1.0268, + "step": 2335 + }, + { + "epoch": 0.82, + "grad_norm": 0.19381293654441833, + "learning_rate": 0.00017996730208167132, + "loss": 0.9694, + "step": 2336 + }, + { + "epoch": 0.82, + "grad_norm": 0.20120295882225037, + "learning_rate": 0.00017995067355843687, + "loss": 1.0433, + "step": 2337 + }, + { + "epoch": 0.82, + "grad_norm": 0.1825202852487564, + "learning_rate": 0.0001799340389055435, + "loss": 0.9054, + "step": 2338 + }, + { + "epoch": 0.82, + "grad_norm": 0.18435628712177277, + "learning_rate": 0.00017991739812426652, + "loss": 0.9426, + "step": 2339 + }, + { + "epoch": 0.82, + "grad_norm": 0.18861481547355652, + "learning_rate": 0.00017990075121588172, + "loss": 0.989, + "step": 2340 + }, + { + "epoch": 0.82, + "grad_norm": 0.1839429885149002, + "learning_rate": 0.00017988409818166546, + "loss": 0.9547, + "step": 2341 + }, + { + "epoch": 0.82, + "grad_norm": 0.18472762405872345, + "learning_rate": 0.00017986743902289446, + "loss": 0.9353, + "step": 2342 + }, + { + "epoch": 0.83, + "grad_norm": 0.18218994140625, + "learning_rate": 0.00017985077374084592, + "loss": 0.9985, + "step": 2343 + }, + { + "epoch": 0.83, + "grad_norm": 0.18237660825252533, + "learning_rate": 0.00017983410233679754, + "loss": 0.9403, + "step": 2344 + }, + { + "epoch": 0.83, + "grad_norm": 0.32768821716308594, + "learning_rate": 0.0001798174248120275, + "loss": 0.9062, + "step": 2345 + }, + { + "epoch": 0.83, + "grad_norm": 0.19033341109752655, + "learning_rate": 0.00017980074116781447, + "loss": 0.9599, + "step": 2346 + }, + { + "epoch": 0.83, + "grad_norm": 0.1890161782503128, + "learning_rate": 0.00017978405140543747, + "loss": 1.0005, + "step": 2347 + }, + { + "epoch": 0.83, + "grad_norm": 0.18877194821834564, + "learning_rate": 0.00017976735552617613, + "loss": 0.9992, + "step": 2348 + }, + { + "epoch": 0.83, + "grad_norm": 0.18344587087631226, + "learning_rate": 0.00017975065353131048, + "loss": 0.9636, + "step": 2349 + }, + { + "epoch": 0.83, + "grad_norm": 0.19172963500022888, + "learning_rate": 0.000179733945422121, + "loss": 0.9603, + "step": 2350 + }, + { + "epoch": 0.83, + "grad_norm": 0.1853189468383789, + "learning_rate": 0.00017971723119988872, + "loss": 0.9867, + "step": 2351 + }, + { + "epoch": 0.83, + "grad_norm": 0.18967324495315552, + "learning_rate": 0.00017970051086589503, + "loss": 0.9997, + "step": 2352 + }, + { + "epoch": 0.83, + "grad_norm": 0.1847553700208664, + "learning_rate": 0.0001796837844214219, + "loss": 0.9851, + "step": 2353 + }, + { + "epoch": 0.83, + "grad_norm": 0.19326816499233246, + "learning_rate": 0.00017966705186775166, + "loss": 0.9813, + "step": 2354 + }, + { + "epoch": 0.83, + "grad_norm": 0.18615196645259857, + "learning_rate": 0.0001796503132061672, + "loss": 0.9783, + "step": 2355 + }, + { + "epoch": 0.83, + "grad_norm": 0.17984063923358917, + "learning_rate": 0.00017963356843795177, + "loss": 0.9427, + "step": 2356 + }, + { + "epoch": 0.83, + "grad_norm": 0.17674154043197632, + "learning_rate": 0.00017961681756438927, + "loss": 0.9691, + "step": 2357 + }, + { + "epoch": 0.83, + "grad_norm": 0.18042780458927155, + "learning_rate": 0.00017960006058676388, + "loss": 0.9217, + "step": 2358 + }, + { + "epoch": 0.83, + "grad_norm": 0.1813279390335083, + "learning_rate": 0.00017958329750636037, + "loss": 1.0397, + "step": 2359 + }, + { + "epoch": 0.83, + "grad_norm": 0.18235574662685394, + "learning_rate": 0.00017956652832446386, + "loss": 0.9653, + "step": 2360 + }, + { + "epoch": 0.83, + "grad_norm": 0.19341222941875458, + "learning_rate": 0.00017954975304236006, + "loss": 0.9598, + "step": 2361 + }, + { + "epoch": 0.83, + "grad_norm": 0.18789170682430267, + "learning_rate": 0.00017953297166133512, + "loss": 0.9359, + "step": 2362 + }, + { + "epoch": 0.83, + "grad_norm": 0.18214201927185059, + "learning_rate": 0.00017951618418267557, + "loss": 0.9558, + "step": 2363 + }, + { + "epoch": 0.83, + "grad_norm": 0.1955145299434662, + "learning_rate": 0.00017949939060766856, + "loss": 1.019, + "step": 2364 + }, + { + "epoch": 0.83, + "grad_norm": 0.18358461558818817, + "learning_rate": 0.00017948259093760155, + "loss": 0.9635, + "step": 2365 + }, + { + "epoch": 0.83, + "grad_norm": 0.18531377613544464, + "learning_rate": 0.0001794657851737625, + "loss": 0.9763, + "step": 2366 + }, + { + "epoch": 0.83, + "grad_norm": 0.18523001670837402, + "learning_rate": 0.00017944897331743999, + "loss": 0.9611, + "step": 2367 + }, + { + "epoch": 0.83, + "grad_norm": 0.19314917922019958, + "learning_rate": 0.00017943215536992289, + "loss": 0.9916, + "step": 2368 + }, + { + "epoch": 0.83, + "grad_norm": 0.19348804652690887, + "learning_rate": 0.0001794153313325006, + "loss": 0.9994, + "step": 2369 + }, + { + "epoch": 0.83, + "grad_norm": 0.23391282558441162, + "learning_rate": 0.000179398501206463, + "loss": 0.915, + "step": 2370 + }, + { + "epoch": 0.83, + "grad_norm": 0.18983489274978638, + "learning_rate": 0.00017938166499310035, + "loss": 0.9543, + "step": 2371 + }, + { + "epoch": 0.84, + "grad_norm": 0.1920963078737259, + "learning_rate": 0.00017936482269370355, + "loss": 1.0246, + "step": 2372 + }, + { + "epoch": 0.84, + "grad_norm": 0.19405518472194672, + "learning_rate": 0.00017934797430956383, + "loss": 0.9415, + "step": 2373 + }, + { + "epoch": 0.84, + "grad_norm": 0.24989286065101624, + "learning_rate": 0.0001793311198419729, + "loss": 0.8994, + "step": 2374 + }, + { + "epoch": 0.84, + "grad_norm": 0.18366257846355438, + "learning_rate": 0.000179314259292223, + "loss": 0.9825, + "step": 2375 + }, + { + "epoch": 0.84, + "grad_norm": 0.19646799564361572, + "learning_rate": 0.00017929739266160674, + "loss": 0.9353, + "step": 2376 + }, + { + "epoch": 0.84, + "grad_norm": 0.19476506114006042, + "learning_rate": 0.0001792805199514173, + "loss": 1.0047, + "step": 2377 + }, + { + "epoch": 0.84, + "grad_norm": 0.1821003556251526, + "learning_rate": 0.00017926364116294824, + "loss": 1.0428, + "step": 2378 + }, + { + "epoch": 0.84, + "grad_norm": 0.19362856447696686, + "learning_rate": 0.00017924675629749364, + "loss": 0.9599, + "step": 2379 + }, + { + "epoch": 0.84, + "grad_norm": 0.1914760172367096, + "learning_rate": 0.00017922986535634804, + "loss": 0.9749, + "step": 2380 + }, + { + "epoch": 0.84, + "grad_norm": 0.1982416957616806, + "learning_rate": 0.0001792129683408064, + "loss": 0.9649, + "step": 2381 + }, + { + "epoch": 0.84, + "grad_norm": 0.18061783909797668, + "learning_rate": 0.00017919606525216422, + "loss": 0.9222, + "step": 2382 + }, + { + "epoch": 0.84, + "grad_norm": 0.17703884840011597, + "learning_rate": 0.00017917915609171742, + "loss": 0.9463, + "step": 2383 + }, + { + "epoch": 0.84, + "grad_norm": 0.19080610573291779, + "learning_rate": 0.00017916224086076237, + "loss": 1.0145, + "step": 2384 + }, + { + "epoch": 0.84, + "grad_norm": 0.17901524901390076, + "learning_rate": 0.00017914531956059594, + "loss": 0.9843, + "step": 2385 + }, + { + "epoch": 0.84, + "grad_norm": 0.19128692150115967, + "learning_rate": 0.00017912839219251547, + "loss": 1.02, + "step": 2386 + }, + { + "epoch": 0.84, + "grad_norm": 0.1813100278377533, + "learning_rate": 0.0001791114587578187, + "loss": 0.8977, + "step": 2387 + }, + { + "epoch": 0.84, + "grad_norm": 0.1859530359506607, + "learning_rate": 0.00017909451925780394, + "loss": 0.9613, + "step": 2388 + }, + { + "epoch": 0.84, + "grad_norm": 0.1826142519712448, + "learning_rate": 0.00017907757369376985, + "loss": 0.9584, + "step": 2389 + }, + { + "epoch": 0.84, + "grad_norm": 0.19591540098190308, + "learning_rate": 0.0001790606220670157, + "loss": 1.0151, + "step": 2390 + }, + { + "epoch": 0.84, + "grad_norm": 0.1937379240989685, + "learning_rate": 0.00017904366437884102, + "loss": 0.9983, + "step": 2391 + }, + { + "epoch": 0.84, + "grad_norm": 0.18422500789165497, + "learning_rate": 0.00017902670063054597, + "loss": 0.903, + "step": 2392 + }, + { + "epoch": 0.84, + "grad_norm": 0.181779146194458, + "learning_rate": 0.0001790097308234312, + "loss": 0.951, + "step": 2393 + }, + { + "epoch": 0.84, + "grad_norm": 0.18863825500011444, + "learning_rate": 0.00017899275495879765, + "loss": 1.015, + "step": 2394 + }, + { + "epoch": 0.84, + "grad_norm": 0.1767343282699585, + "learning_rate": 0.0001789757730379469, + "loss": 0.8843, + "step": 2395 + }, + { + "epoch": 0.84, + "grad_norm": 0.1767064779996872, + "learning_rate": 0.00017895878506218085, + "loss": 0.9389, + "step": 2396 + }, + { + "epoch": 0.84, + "grad_norm": 0.1905364841222763, + "learning_rate": 0.00017894179103280198, + "loss": 0.9241, + "step": 2397 + }, + { + "epoch": 0.84, + "grad_norm": 0.184580460190773, + "learning_rate": 0.00017892479095111318, + "loss": 0.9488, + "step": 2398 + }, + { + "epoch": 0.84, + "grad_norm": 0.18272309005260468, + "learning_rate": 0.00017890778481841781, + "loss": 0.9805, + "step": 2399 + }, + { + "epoch": 0.85, + "grad_norm": 0.1822749674320221, + "learning_rate": 0.0001788907726360197, + "loss": 0.9814, + "step": 2400 + }, + { + "epoch": 0.85, + "grad_norm": 0.18006400763988495, + "learning_rate": 0.00017887375440522313, + "loss": 0.974, + "step": 2401 + }, + { + "epoch": 0.85, + "grad_norm": 0.18517065048217773, + "learning_rate": 0.00017885673012733288, + "loss": 0.9311, + "step": 2402 + }, + { + "epoch": 0.85, + "grad_norm": 0.18556898832321167, + "learning_rate": 0.00017883969980365413, + "loss": 0.9755, + "step": 2403 + }, + { + "epoch": 0.85, + "grad_norm": 0.17660577595233917, + "learning_rate": 0.00017882266343549258, + "loss": 0.9523, + "step": 2404 + }, + { + "epoch": 0.85, + "grad_norm": 0.18745553493499756, + "learning_rate": 0.0001788056210241544, + "loss": 1.0022, + "step": 2405 + }, + { + "epoch": 0.85, + "grad_norm": 0.1848209798336029, + "learning_rate": 0.00017878857257094615, + "loss": 0.9009, + "step": 2406 + }, + { + "epoch": 0.85, + "grad_norm": 0.1945207267999649, + "learning_rate": 0.00017877151807717492, + "loss": 0.9552, + "step": 2407 + }, + { + "epoch": 0.85, + "grad_norm": 0.17822642624378204, + "learning_rate": 0.00017875445754414826, + "loss": 0.8894, + "step": 2408 + }, + { + "epoch": 0.85, + "grad_norm": 0.17673255503177643, + "learning_rate": 0.00017873739097317416, + "loss": 0.9404, + "step": 2409 + }, + { + "epoch": 0.85, + "grad_norm": 0.18701551854610443, + "learning_rate": 0.00017872031836556104, + "loss": 0.9207, + "step": 2410 + }, + { + "epoch": 0.85, + "grad_norm": 0.19993789494037628, + "learning_rate": 0.0001787032397226179, + "loss": 0.9728, + "step": 2411 + }, + { + "epoch": 0.85, + "grad_norm": 0.18984495103359222, + "learning_rate": 0.00017868615504565405, + "loss": 0.9743, + "step": 2412 + }, + { + "epoch": 0.85, + "grad_norm": 0.19204598665237427, + "learning_rate": 0.0001786690643359794, + "loss": 0.9662, + "step": 2413 + }, + { + "epoch": 0.85, + "grad_norm": 0.19300125539302826, + "learning_rate": 0.00017865196759490418, + "loss": 0.9866, + "step": 2414 + }, + { + "epoch": 0.85, + "grad_norm": 0.19442309439182281, + "learning_rate": 0.00017863486482373926, + "loss": 1.0097, + "step": 2415 + }, + { + "epoch": 0.85, + "grad_norm": 0.198700413107872, + "learning_rate": 0.00017861775602379581, + "loss": 1.0121, + "step": 2416 + }, + { + "epoch": 0.85, + "grad_norm": 0.1955936849117279, + "learning_rate": 0.00017860064119638557, + "loss": 0.9874, + "step": 2417 + }, + { + "epoch": 0.85, + "grad_norm": 0.1855369508266449, + "learning_rate": 0.00017858352034282068, + "loss": 0.9305, + "step": 2418 + }, + { + "epoch": 0.85, + "grad_norm": 0.20869863033294678, + "learning_rate": 0.00017856639346441373, + "loss": 0.9681, + "step": 2419 + }, + { + "epoch": 0.85, + "grad_norm": 0.18309414386749268, + "learning_rate": 0.00017854926056247786, + "loss": 0.9009, + "step": 2420 + }, + { + "epoch": 0.85, + "grad_norm": 0.18978330492973328, + "learning_rate": 0.00017853212163832655, + "loss": 0.9706, + "step": 2421 + }, + { + "epoch": 0.85, + "grad_norm": 0.18249556422233582, + "learning_rate": 0.00017851497669327387, + "loss": 0.9336, + "step": 2422 + }, + { + "epoch": 0.85, + "grad_norm": 0.1928057074546814, + "learning_rate": 0.00017849782572863427, + "loss": 0.9682, + "step": 2423 + }, + { + "epoch": 0.85, + "grad_norm": 0.2058742493391037, + "learning_rate": 0.00017848066874572266, + "loss": 1.0131, + "step": 2424 + }, + { + "epoch": 0.85, + "grad_norm": 0.19305527210235596, + "learning_rate": 0.00017846350574585445, + "loss": 1.0091, + "step": 2425 + }, + { + "epoch": 0.85, + "grad_norm": 0.1902138888835907, + "learning_rate": 0.0001784463367303455, + "loss": 1.0441, + "step": 2426 + }, + { + "epoch": 0.85, + "grad_norm": 0.19089341163635254, + "learning_rate": 0.0001784291617005121, + "loss": 0.9453, + "step": 2427 + }, + { + "epoch": 0.86, + "grad_norm": 0.19015923142433167, + "learning_rate": 0.00017841198065767107, + "loss": 0.9733, + "step": 2428 + }, + { + "epoch": 0.86, + "grad_norm": 0.20033025741577148, + "learning_rate": 0.00017839479360313957, + "loss": 0.972, + "step": 2429 + }, + { + "epoch": 0.86, + "grad_norm": 0.20651842653751373, + "learning_rate": 0.00017837760053823537, + "loss": 0.9752, + "step": 2430 + }, + { + "epoch": 0.86, + "grad_norm": 0.19228796660900116, + "learning_rate": 0.00017836040146427658, + "loss": 1.0119, + "step": 2431 + }, + { + "epoch": 0.86, + "grad_norm": 0.19866019487380981, + "learning_rate": 0.00017834319638258185, + "loss": 0.9802, + "step": 2432 + }, + { + "epoch": 0.86, + "grad_norm": 0.19379134476184845, + "learning_rate": 0.00017832598529447026, + "loss": 0.9727, + "step": 2433 + }, + { + "epoch": 0.86, + "grad_norm": 0.20367862284183502, + "learning_rate": 0.00017830876820126133, + "loss": 0.9894, + "step": 2434 + }, + { + "epoch": 0.86, + "grad_norm": 0.20174342393875122, + "learning_rate": 0.00017829154510427505, + "loss": 0.9858, + "step": 2435 + }, + { + "epoch": 0.86, + "grad_norm": 0.1984124779701233, + "learning_rate": 0.00017827431600483194, + "loss": 1.0181, + "step": 2436 + }, + { + "epoch": 0.86, + "grad_norm": 0.19504335522651672, + "learning_rate": 0.00017825708090425283, + "loss": 1.0525, + "step": 2437 + }, + { + "epoch": 0.86, + "grad_norm": 0.19187763333320618, + "learning_rate": 0.0001782398398038592, + "loss": 0.9748, + "step": 2438 + }, + { + "epoch": 0.86, + "grad_norm": 0.19658133387565613, + "learning_rate": 0.00017822259270497282, + "loss": 0.9971, + "step": 2439 + }, + { + "epoch": 0.86, + "grad_norm": 0.2015499472618103, + "learning_rate": 0.00017820533960891601, + "loss": 0.9745, + "step": 2440 + }, + { + "epoch": 0.86, + "grad_norm": 0.18588942289352417, + "learning_rate": 0.0001781880805170115, + "loss": 0.9684, + "step": 2441 + }, + { + "epoch": 0.86, + "grad_norm": 0.19589237868785858, + "learning_rate": 0.0001781708154305826, + "loss": 1.0866, + "step": 2442 + }, + { + "epoch": 0.86, + "grad_norm": 0.19100069999694824, + "learning_rate": 0.00017815354435095292, + "loss": 0.9363, + "step": 2443 + }, + { + "epoch": 0.86, + "grad_norm": 0.20036999881267548, + "learning_rate": 0.0001781362672794466, + "loss": 0.9627, + "step": 2444 + }, + { + "epoch": 0.86, + "grad_norm": 0.2073875069618225, + "learning_rate": 0.00017811898421738826, + "loss": 0.9667, + "step": 2445 + }, + { + "epoch": 0.86, + "grad_norm": 0.19809319078922272, + "learning_rate": 0.00017810169516610294, + "loss": 1.0038, + "step": 2446 + }, + { + "epoch": 0.86, + "grad_norm": 0.18860866129398346, + "learning_rate": 0.00017808440012691618, + "loss": 1.0263, + "step": 2447 + }, + { + "epoch": 0.86, + "grad_norm": 0.1854882389307022, + "learning_rate": 0.00017806709910115392, + "loss": 0.9482, + "step": 2448 + }, + { + "epoch": 0.86, + "grad_norm": 0.20285579562187195, + "learning_rate": 0.00017804979209014264, + "loss": 0.9279, + "step": 2449 + }, + { + "epoch": 0.86, + "grad_norm": 0.21248310804367065, + "learning_rate": 0.0001780324790952092, + "loss": 0.9916, + "step": 2450 + }, + { + "epoch": 0.86, + "grad_norm": 0.1837083101272583, + "learning_rate": 0.00017801516011768095, + "loss": 0.9036, + "step": 2451 + }, + { + "epoch": 0.86, + "grad_norm": 0.18199291825294495, + "learning_rate": 0.00017799783515888576, + "loss": 0.9792, + "step": 2452 + }, + { + "epoch": 0.86, + "grad_norm": 0.19304387271404266, + "learning_rate": 0.0001779805042201518, + "loss": 0.9757, + "step": 2453 + }, + { + "epoch": 0.86, + "grad_norm": 0.18951784074306488, + "learning_rate": 0.0001779631673028079, + "loss": 0.9124, + "step": 2454 + }, + { + "epoch": 0.86, + "grad_norm": 0.1998693346977234, + "learning_rate": 0.00017794582440818318, + "loss": 0.9957, + "step": 2455 + }, + { + "epoch": 0.86, + "grad_norm": 0.24689649045467377, + "learning_rate": 0.0001779284755376073, + "loss": 0.8665, + "step": 2456 + }, + { + "epoch": 0.87, + "grad_norm": 0.18024122714996338, + "learning_rate": 0.00017791112069241036, + "loss": 0.9108, + "step": 2457 + }, + { + "epoch": 0.87, + "grad_norm": 0.19234396517276764, + "learning_rate": 0.00017789375987392294, + "loss": 0.9928, + "step": 2458 + }, + { + "epoch": 0.87, + "grad_norm": 0.18758738040924072, + "learning_rate": 0.00017787639308347608, + "loss": 0.9688, + "step": 2459 + }, + { + "epoch": 0.87, + "grad_norm": 0.18658681213855743, + "learning_rate": 0.0001778590203224012, + "loss": 1.0176, + "step": 2460 + }, + { + "epoch": 0.87, + "grad_norm": 0.19142453372478485, + "learning_rate": 0.00017784164159203027, + "loss": 0.9591, + "step": 2461 + }, + { + "epoch": 0.87, + "grad_norm": 0.1846286803483963, + "learning_rate": 0.00017782425689369567, + "loss": 0.9502, + "step": 2462 + }, + { + "epoch": 0.87, + "grad_norm": 0.1844143271446228, + "learning_rate": 0.00017780686622873026, + "loss": 0.9868, + "step": 2463 + }, + { + "epoch": 0.87, + "grad_norm": 0.18845881521701813, + "learning_rate": 0.00017778946959846737, + "loss": 0.9394, + "step": 2464 + }, + { + "epoch": 0.87, + "grad_norm": 0.19077198207378387, + "learning_rate": 0.0001777720670042407, + "loss": 1.027, + "step": 2465 + }, + { + "epoch": 0.87, + "grad_norm": 0.18606038391590118, + "learning_rate": 0.00017775465844738453, + "loss": 0.9552, + "step": 2466 + }, + { + "epoch": 0.87, + "grad_norm": 0.18549682199954987, + "learning_rate": 0.00017773724392923352, + "loss": 0.9809, + "step": 2467 + }, + { + "epoch": 0.87, + "grad_norm": 0.191581591963768, + "learning_rate": 0.00017771982345112278, + "loss": 0.9697, + "step": 2468 + }, + { + "epoch": 0.87, + "grad_norm": 0.18413247168064117, + "learning_rate": 0.00017770239701438799, + "loss": 0.9741, + "step": 2469 + }, + { + "epoch": 0.87, + "grad_norm": 0.19427776336669922, + "learning_rate": 0.00017768496462036509, + "loss": 1.0082, + "step": 2470 + }, + { + "epoch": 0.87, + "grad_norm": 0.18671664595603943, + "learning_rate": 0.00017766752627039063, + "loss": 1.0208, + "step": 2471 + }, + { + "epoch": 0.87, + "grad_norm": 0.18228982388973236, + "learning_rate": 0.0001776500819658016, + "loss": 0.9956, + "step": 2472 + }, + { + "epoch": 0.87, + "grad_norm": 0.17968390882015228, + "learning_rate": 0.00017763263170793541, + "loss": 0.9631, + "step": 2473 + }, + { + "epoch": 0.87, + "grad_norm": 0.18487539887428284, + "learning_rate": 0.00017761517549812992, + "loss": 0.9693, + "step": 2474 + }, + { + "epoch": 0.87, + "grad_norm": 0.18275010585784912, + "learning_rate": 0.00017759771333772346, + "loss": 0.918, + "step": 2475 + }, + { + "epoch": 0.87, + "grad_norm": 0.1807790845632553, + "learning_rate": 0.00017758024522805484, + "loss": 0.9267, + "step": 2476 + }, + { + "epoch": 0.87, + "grad_norm": 0.18188652396202087, + "learning_rate": 0.00017756277117046332, + "loss": 0.9219, + "step": 2477 + }, + { + "epoch": 0.87, + "grad_norm": 0.1865534782409668, + "learning_rate": 0.00017754529116628852, + "loss": 0.9727, + "step": 2478 + }, + { + "epoch": 0.87, + "grad_norm": 0.17771273851394653, + "learning_rate": 0.0001775278052168707, + "loss": 0.9443, + "step": 2479 + }, + { + "epoch": 0.87, + "grad_norm": 0.18896722793579102, + "learning_rate": 0.0001775103133235504, + "loss": 0.944, + "step": 2480 + }, + { + "epoch": 0.87, + "grad_norm": 0.19240954518318176, + "learning_rate": 0.00017749281548766873, + "loss": 0.9398, + "step": 2481 + }, + { + "epoch": 0.87, + "grad_norm": 0.1850031018257141, + "learning_rate": 0.00017747531171056718, + "loss": 0.9118, + "step": 2482 + }, + { + "epoch": 0.87, + "grad_norm": 0.19715161621570587, + "learning_rate": 0.00017745780199358776, + "loss": 1.0052, + "step": 2483 + }, + { + "epoch": 0.87, + "grad_norm": 0.1838032752275467, + "learning_rate": 0.00017744028633807286, + "loss": 0.9127, + "step": 2484 + }, + { + "epoch": 0.88, + "grad_norm": 0.18692578375339508, + "learning_rate": 0.00017742276474536545, + "loss": 0.9777, + "step": 2485 + }, + { + "epoch": 0.88, + "grad_norm": 0.18337734043598175, + "learning_rate": 0.0001774052372168088, + "loss": 0.9411, + "step": 2486 + }, + { + "epoch": 0.88, + "grad_norm": 0.18327218294143677, + "learning_rate": 0.00017738770375374675, + "loss": 0.9248, + "step": 2487 + }, + { + "epoch": 0.88, + "grad_norm": 0.1908503621816635, + "learning_rate": 0.00017737016435752357, + "loss": 0.9677, + "step": 2488 + }, + { + "epoch": 0.88, + "grad_norm": 0.1790422648191452, + "learning_rate": 0.0001773526190294839, + "loss": 0.9293, + "step": 2489 + }, + { + "epoch": 0.88, + "grad_norm": 0.17901550233364105, + "learning_rate": 0.000177335067770973, + "loss": 0.9304, + "step": 2490 + }, + { + "epoch": 0.88, + "grad_norm": 0.18893276154994965, + "learning_rate": 0.0001773175105833364, + "loss": 0.9439, + "step": 2491 + }, + { + "epoch": 0.88, + "grad_norm": 0.18400788307189941, + "learning_rate": 0.00017729994746792027, + "loss": 0.9323, + "step": 2492 + }, + { + "epoch": 0.88, + "grad_norm": 0.18849042057991028, + "learning_rate": 0.00017728237842607104, + "loss": 1.0064, + "step": 2493 + }, + { + "epoch": 0.88, + "grad_norm": 0.18234604597091675, + "learning_rate": 0.00017726480345913573, + "loss": 0.946, + "step": 2494 + }, + { + "epoch": 0.88, + "grad_norm": 0.18465781211853027, + "learning_rate": 0.00017724722256846182, + "loss": 0.9963, + "step": 2495 + }, + { + "epoch": 0.88, + "grad_norm": 0.17847271263599396, + "learning_rate": 0.00017722963575539716, + "loss": 0.9777, + "step": 2496 + }, + { + "epoch": 0.88, + "grad_norm": 0.19351433217525482, + "learning_rate": 0.0001772120430212901, + "loss": 1.0891, + "step": 2497 + }, + { + "epoch": 0.88, + "grad_norm": 0.19211028516292572, + "learning_rate": 0.00017719444436748943, + "loss": 0.9809, + "step": 2498 + }, + { + "epoch": 0.88, + "grad_norm": 0.18651790916919708, + "learning_rate": 0.00017717683979534443, + "loss": 0.95, + "step": 2499 + }, + { + "epoch": 0.88, + "grad_norm": 0.18656237423419952, + "learning_rate": 0.0001771592293062048, + "loss": 0.8969, + "step": 2500 + }, + { + "epoch": 0.88, + "grad_norm": 0.1903420239686966, + "learning_rate": 0.00017714161290142074, + "loss": 0.969, + "step": 2501 + }, + { + "epoch": 0.88, + "grad_norm": 0.19045324623584747, + "learning_rate": 0.00017712399058234275, + "loss": 0.9857, + "step": 2502 + }, + { + "epoch": 0.88, + "grad_norm": 0.20729127526283264, + "learning_rate": 0.000177106362350322, + "loss": 0.9877, + "step": 2503 + }, + { + "epoch": 0.88, + "grad_norm": 0.18529750406742096, + "learning_rate": 0.00017708872820670996, + "loss": 0.9178, + "step": 2504 + }, + { + "epoch": 0.88, + "grad_norm": 0.18597035109996796, + "learning_rate": 0.00017707108815285866, + "loss": 0.9055, + "step": 2505 + }, + { + "epoch": 0.88, + "grad_norm": 0.19581322371959686, + "learning_rate": 0.00017705344219012047, + "loss": 1.0413, + "step": 2506 + }, + { + "epoch": 0.88, + "grad_norm": 0.177476167678833, + "learning_rate": 0.0001770357903198483, + "loss": 0.9686, + "step": 2507 + }, + { + "epoch": 0.88, + "grad_norm": 0.17651228606700897, + "learning_rate": 0.0001770181325433955, + "loss": 0.9821, + "step": 2508 + }, + { + "epoch": 0.88, + "grad_norm": 0.1783791333436966, + "learning_rate": 0.0001770004688621158, + "loss": 0.9437, + "step": 2509 + }, + { + "epoch": 0.88, + "grad_norm": 0.1889670491218567, + "learning_rate": 0.00017698279927736349, + "loss": 0.996, + "step": 2510 + }, + { + "epoch": 0.88, + "grad_norm": 0.18272949755191803, + "learning_rate": 0.00017696512379049325, + "loss": 0.971, + "step": 2511 + }, + { + "epoch": 0.88, + "grad_norm": 0.18914766609668732, + "learning_rate": 0.0001769474424028602, + "loss": 0.937, + "step": 2512 + }, + { + "epoch": 0.88, + "grad_norm": 0.1900867372751236, + "learning_rate": 0.00017692975511581998, + "loss": 1.0283, + "step": 2513 + }, + { + "epoch": 0.89, + "grad_norm": 0.1854158490896225, + "learning_rate": 0.0001769120619307286, + "loss": 0.9811, + "step": 2514 + }, + { + "epoch": 0.89, + "grad_norm": 0.18695878982543945, + "learning_rate": 0.00017689436284894257, + "loss": 0.9677, + "step": 2515 + }, + { + "epoch": 0.89, + "grad_norm": 0.1806347817182541, + "learning_rate": 0.0001768766578718189, + "loss": 0.9546, + "step": 2516 + }, + { + "epoch": 0.89, + "grad_norm": 0.18050779402256012, + "learning_rate": 0.00017685894700071489, + "loss": 0.9554, + "step": 2517 + }, + { + "epoch": 0.89, + "grad_norm": 0.20120397210121155, + "learning_rate": 0.00017684123023698847, + "loss": 0.9677, + "step": 2518 + }, + { + "epoch": 0.89, + "grad_norm": 0.19255037605762482, + "learning_rate": 0.00017682350758199794, + "loss": 0.9656, + "step": 2519 + }, + { + "epoch": 0.89, + "grad_norm": 0.19251564145088196, + "learning_rate": 0.00017680577903710204, + "loss": 0.9783, + "step": 2520 + }, + { + "epoch": 0.89, + "grad_norm": 0.19172608852386475, + "learning_rate": 0.00017678804460366, + "loss": 0.9441, + "step": 2521 + }, + { + "epoch": 0.89, + "grad_norm": 0.17901365458965302, + "learning_rate": 0.0001767703042830315, + "loss": 0.8979, + "step": 2522 + }, + { + "epoch": 0.89, + "grad_norm": 0.1947840005159378, + "learning_rate": 0.00017675255807657661, + "loss": 0.982, + "step": 2523 + }, + { + "epoch": 0.89, + "grad_norm": 0.192917600274086, + "learning_rate": 0.00017673480598565596, + "loss": 1.0014, + "step": 2524 + }, + { + "epoch": 0.89, + "grad_norm": 0.18823352456092834, + "learning_rate": 0.00017671704801163046, + "loss": 0.9995, + "step": 2525 + }, + { + "epoch": 0.89, + "grad_norm": 0.19002202153205872, + "learning_rate": 0.00017669928415586168, + "loss": 1.0014, + "step": 2526 + }, + { + "epoch": 0.89, + "grad_norm": 0.18676120042800903, + "learning_rate": 0.0001766815144197115, + "loss": 0.9492, + "step": 2527 + }, + { + "epoch": 0.89, + "grad_norm": 0.18423797190189362, + "learning_rate": 0.0001766637388045423, + "loss": 0.9578, + "step": 2528 + }, + { + "epoch": 0.89, + "grad_norm": 0.18916915357112885, + "learning_rate": 0.00017664595731171692, + "loss": 0.9713, + "step": 2529 + }, + { + "epoch": 0.89, + "grad_norm": 0.21028481423854828, + "learning_rate": 0.00017662816994259858, + "loss": 0.9628, + "step": 2530 + }, + { + "epoch": 0.89, + "grad_norm": 0.17987662553787231, + "learning_rate": 0.00017661037669855105, + "loss": 0.8998, + "step": 2531 + }, + { + "epoch": 0.89, + "grad_norm": 0.17913588881492615, + "learning_rate": 0.00017659257758093845, + "loss": 0.9486, + "step": 2532 + }, + { + "epoch": 0.89, + "grad_norm": 0.20375816524028778, + "learning_rate": 0.0001765747725911255, + "loss": 1.0011, + "step": 2533 + }, + { + "epoch": 0.89, + "grad_norm": 0.18085017800331116, + "learning_rate": 0.00017655696173047715, + "loss": 0.9419, + "step": 2534 + }, + { + "epoch": 0.89, + "grad_norm": 0.20048849284648895, + "learning_rate": 0.000176539145000359, + "loss": 1.0196, + "step": 2535 + }, + { + "epoch": 0.89, + "grad_norm": 0.1985366940498352, + "learning_rate": 0.000176521322402137, + "loss": 1.0123, + "step": 2536 + }, + { + "epoch": 0.89, + "grad_norm": 0.18767307698726654, + "learning_rate": 0.0001765034939371776, + "loss": 0.957, + "step": 2537 + }, + { + "epoch": 0.89, + "grad_norm": 0.19573388993740082, + "learning_rate": 0.00017648565960684762, + "loss": 0.9504, + "step": 2538 + }, + { + "epoch": 0.89, + "grad_norm": 0.19709470868110657, + "learning_rate": 0.00017646781941251443, + "loss": 1.0158, + "step": 2539 + }, + { + "epoch": 0.89, + "grad_norm": 0.19420363008975983, + "learning_rate": 0.0001764499733555458, + "loss": 0.9909, + "step": 2540 + }, + { + "epoch": 0.89, + "grad_norm": 0.19303253293037415, + "learning_rate": 0.0001764321214373099, + "loss": 1.0083, + "step": 2541 + }, + { + "epoch": 0.9, + "grad_norm": 0.18523170053958893, + "learning_rate": 0.0001764142636591755, + "loss": 0.9079, + "step": 2542 + }, + { + "epoch": 0.9, + "grad_norm": 0.18193301558494568, + "learning_rate": 0.0001763964000225116, + "loss": 0.9455, + "step": 2543 + }, + { + "epoch": 0.9, + "grad_norm": 0.20065605640411377, + "learning_rate": 0.0001763785305286879, + "loss": 1.003, + "step": 2544 + }, + { + "epoch": 0.9, + "grad_norm": 0.19543297588825226, + "learning_rate": 0.00017636065517907427, + "loss": 0.9628, + "step": 2545 + }, + { + "epoch": 0.9, + "grad_norm": 0.19203916192054749, + "learning_rate": 0.0001763427739750413, + "loss": 0.9727, + "step": 2546 + }, + { + "epoch": 0.9, + "grad_norm": 0.1904589831829071, + "learning_rate": 0.00017632488691795987, + "loss": 0.9252, + "step": 2547 + }, + { + "epoch": 0.9, + "grad_norm": 0.19423457980155945, + "learning_rate": 0.0001763069940092013, + "loss": 0.9845, + "step": 2548 + }, + { + "epoch": 0.9, + "grad_norm": 0.18518464267253876, + "learning_rate": 0.00017628909525013744, + "loss": 0.9494, + "step": 2549 + }, + { + "epoch": 0.9, + "grad_norm": 0.18881121277809143, + "learning_rate": 0.00017627119064214056, + "loss": 0.9423, + "step": 2550 + }, + { + "epoch": 0.9, + "grad_norm": 0.1885634958744049, + "learning_rate": 0.0001762532801865834, + "loss": 0.9789, + "step": 2551 + }, + { + "epoch": 0.9, + "grad_norm": 0.18694055080413818, + "learning_rate": 0.00017623536388483905, + "loss": 0.9094, + "step": 2552 + }, + { + "epoch": 0.9, + "grad_norm": 0.18675947189331055, + "learning_rate": 0.00017621744173828115, + "loss": 0.9737, + "step": 2553 + }, + { + "epoch": 0.9, + "grad_norm": 0.18994884192943573, + "learning_rate": 0.00017619951374828373, + "loss": 0.9488, + "step": 2554 + }, + { + "epoch": 0.9, + "grad_norm": 0.1960517317056656, + "learning_rate": 0.00017618157991622138, + "loss": 0.9846, + "step": 2555 + }, + { + "epoch": 0.9, + "grad_norm": 0.1805785894393921, + "learning_rate": 0.00017616364024346893, + "loss": 0.9257, + "step": 2556 + }, + { + "epoch": 0.9, + "grad_norm": 0.19077786803245544, + "learning_rate": 0.00017614569473140188, + "loss": 0.9756, + "step": 2557 + }, + { + "epoch": 0.9, + "grad_norm": 0.19670559465885162, + "learning_rate": 0.000176127743381396, + "loss": 0.9607, + "step": 2558 + }, + { + "epoch": 0.9, + "grad_norm": 0.19572590291500092, + "learning_rate": 0.00017610978619482764, + "loss": 0.9815, + "step": 2559 + }, + { + "epoch": 0.9, + "grad_norm": 0.19013354182243347, + "learning_rate": 0.0001760918231730735, + "loss": 0.9206, + "step": 2560 + }, + { + "epoch": 0.9, + "grad_norm": 0.17755639553070068, + "learning_rate": 0.0001760738543175108, + "loss": 0.9608, + "step": 2561 + }, + { + "epoch": 0.9, + "grad_norm": 0.18971143662929535, + "learning_rate": 0.00017605587962951717, + "loss": 0.9529, + "step": 2562 + }, + { + "epoch": 0.9, + "grad_norm": 0.20042835175991058, + "learning_rate": 0.0001760378991104707, + "loss": 1.0179, + "step": 2563 + }, + { + "epoch": 0.9, + "grad_norm": 0.18564312160015106, + "learning_rate": 0.0001760199127617499, + "loss": 0.9651, + "step": 2564 + }, + { + "epoch": 0.9, + "grad_norm": 0.19597946107387543, + "learning_rate": 0.00017600192058473378, + "loss": 0.9924, + "step": 2565 + }, + { + "epoch": 0.9, + "grad_norm": 0.19503167271614075, + "learning_rate": 0.00017598392258080168, + "loss": 0.9508, + "step": 2566 + }, + { + "epoch": 0.9, + "grad_norm": 0.18775935471057892, + "learning_rate": 0.0001759659187513336, + "loss": 0.9388, + "step": 2567 + }, + { + "epoch": 0.9, + "grad_norm": 0.18213537335395813, + "learning_rate": 0.00017594790909770976, + "loss": 0.9227, + "step": 2568 + }, + { + "epoch": 0.9, + "grad_norm": 0.20675437152385712, + "learning_rate": 0.00017592989362131097, + "loss": 1.002, + "step": 2569 + }, + { + "epoch": 0.91, + "grad_norm": 0.17877671122550964, + "learning_rate": 0.0001759118723235184, + "loss": 0.9257, + "step": 2570 + }, + { + "epoch": 0.91, + "grad_norm": 0.18973997235298157, + "learning_rate": 0.00017589384520571376, + "loss": 0.9784, + "step": 2571 + }, + { + "epoch": 0.91, + "grad_norm": 0.19125111401081085, + "learning_rate": 0.0001758758122692791, + "loss": 0.9594, + "step": 2572 + }, + { + "epoch": 0.91, + "grad_norm": 0.18701352179050446, + "learning_rate": 0.00017585777351559703, + "loss": 0.963, + "step": 2573 + }, + { + "epoch": 0.91, + "grad_norm": 0.18558120727539062, + "learning_rate": 0.00017583972894605047, + "loss": 0.9009, + "step": 2574 + }, + { + "epoch": 0.91, + "grad_norm": 0.18775434792041779, + "learning_rate": 0.00017582167856202295, + "loss": 1.0063, + "step": 2575 + }, + { + "epoch": 0.91, + "grad_norm": 0.1810017228126526, + "learning_rate": 0.00017580362236489823, + "loss": 0.8829, + "step": 2576 + }, + { + "epoch": 0.91, + "grad_norm": 0.17568695545196533, + "learning_rate": 0.00017578556035606079, + "loss": 0.9182, + "step": 2577 + }, + { + "epoch": 0.91, + "grad_norm": 0.18526588380336761, + "learning_rate": 0.00017576749253689532, + "loss": 0.9913, + "step": 2578 + }, + { + "epoch": 0.91, + "grad_norm": 0.18252696096897125, + "learning_rate": 0.00017574941890878705, + "loss": 0.9919, + "step": 2579 + }, + { + "epoch": 0.91, + "grad_norm": 0.1952015608549118, + "learning_rate": 0.0001757313394731217, + "loss": 1.019, + "step": 2580 + }, + { + "epoch": 0.91, + "grad_norm": 0.18415303528308868, + "learning_rate": 0.00017571325423128533, + "loss": 0.9776, + "step": 2581 + }, + { + "epoch": 0.91, + "grad_norm": 0.18594758212566376, + "learning_rate": 0.00017569516318466453, + "loss": 0.9915, + "step": 2582 + }, + { + "epoch": 0.91, + "grad_norm": 0.19213664531707764, + "learning_rate": 0.00017567706633464628, + "loss": 1.0335, + "step": 2583 + }, + { + "epoch": 0.91, + "grad_norm": 0.18794137239456177, + "learning_rate": 0.00017565896368261803, + "loss": 0.9585, + "step": 2584 + }, + { + "epoch": 0.91, + "grad_norm": 0.19076941907405853, + "learning_rate": 0.00017564085522996767, + "loss": 0.9427, + "step": 2585 + }, + { + "epoch": 0.91, + "grad_norm": 0.1841420829296112, + "learning_rate": 0.00017562274097808359, + "loss": 0.9604, + "step": 2586 + }, + { + "epoch": 0.91, + "grad_norm": 0.17589803040027618, + "learning_rate": 0.00017560462092835452, + "loss": 0.9205, + "step": 2587 + }, + { + "epoch": 0.91, + "grad_norm": 0.19224637746810913, + "learning_rate": 0.0001755864950821697, + "loss": 0.9698, + "step": 2588 + }, + { + "epoch": 0.91, + "grad_norm": 0.18274915218353271, + "learning_rate": 0.00017556836344091882, + "loss": 0.9323, + "step": 2589 + }, + { + "epoch": 0.91, + "grad_norm": 0.18005827069282532, + "learning_rate": 0.00017555022600599198, + "loss": 0.9531, + "step": 2590 + }, + { + "epoch": 0.91, + "grad_norm": 0.18462547659873962, + "learning_rate": 0.00017553208277877973, + "loss": 1.0143, + "step": 2591 + }, + { + "epoch": 0.91, + "grad_norm": 0.18477481603622437, + "learning_rate": 0.00017551393376067314, + "loss": 0.9429, + "step": 2592 + }, + { + "epoch": 0.91, + "grad_norm": 0.18777184188365936, + "learning_rate": 0.00017549577895306358, + "loss": 0.9305, + "step": 2593 + }, + { + "epoch": 0.91, + "grad_norm": 0.18475376069545746, + "learning_rate": 0.00017547761835734295, + "loss": 0.9892, + "step": 2594 + }, + { + "epoch": 0.91, + "grad_norm": 0.176994189620018, + "learning_rate": 0.00017545945197490365, + "loss": 0.9627, + "step": 2595 + }, + { + "epoch": 0.91, + "grad_norm": 0.180376335978508, + "learning_rate": 0.00017544127980713837, + "loss": 0.9661, + "step": 2596 + }, + { + "epoch": 0.91, + "grad_norm": 0.18328218162059784, + "learning_rate": 0.00017542310185544041, + "loss": 0.9033, + "step": 2597 + }, + { + "epoch": 0.91, + "grad_norm": 0.19031573832035065, + "learning_rate": 0.00017540491812120343, + "loss": 0.9594, + "step": 2598 + }, + { + "epoch": 0.92, + "grad_norm": 0.1834496259689331, + "learning_rate": 0.00017538672860582151, + "loss": 0.9981, + "step": 2599 + }, + { + "epoch": 0.92, + "grad_norm": 0.19750292599201202, + "learning_rate": 0.00017536853331068922, + "loss": 1.1101, + "step": 2600 + }, + { + "epoch": 0.92, + "grad_norm": 0.19078829884529114, + "learning_rate": 0.00017535033223720154, + "loss": 0.9598, + "step": 2601 + }, + { + "epoch": 0.92, + "grad_norm": 0.18731549382209778, + "learning_rate": 0.00017533212538675395, + "loss": 0.9635, + "step": 2602 + }, + { + "epoch": 0.92, + "grad_norm": 0.18684478104114532, + "learning_rate": 0.00017531391276074228, + "loss": 0.9185, + "step": 2603 + }, + { + "epoch": 0.92, + "grad_norm": 0.18306666612625122, + "learning_rate": 0.00017529569436056287, + "loss": 0.9284, + "step": 2604 + }, + { + "epoch": 0.92, + "grad_norm": 0.17791491746902466, + "learning_rate": 0.00017527747018761253, + "loss": 0.9262, + "step": 2605 + }, + { + "epoch": 0.92, + "grad_norm": 0.18981628119945526, + "learning_rate": 0.00017525924024328844, + "loss": 1.0016, + "step": 2606 + }, + { + "epoch": 0.92, + "grad_norm": 0.1939721405506134, + "learning_rate": 0.00017524100452898824, + "loss": 0.9912, + "step": 2607 + }, + { + "epoch": 0.92, + "grad_norm": 0.19684866070747375, + "learning_rate": 0.00017522276304611005, + "loss": 1.0439, + "step": 2608 + }, + { + "epoch": 0.92, + "grad_norm": 0.1944311410188675, + "learning_rate": 0.0001752045157960524, + "loss": 0.921, + "step": 2609 + }, + { + "epoch": 0.92, + "grad_norm": 0.18503262102603912, + "learning_rate": 0.00017518626278021425, + "loss": 0.9166, + "step": 2610 + }, + { + "epoch": 0.92, + "grad_norm": 0.17905835807323456, + "learning_rate": 0.00017516800399999507, + "loss": 0.9416, + "step": 2611 + }, + { + "epoch": 0.92, + "grad_norm": 0.17581358551979065, + "learning_rate": 0.0001751497394567947, + "loss": 0.8916, + "step": 2612 + }, + { + "epoch": 0.92, + "grad_norm": 0.18805000185966492, + "learning_rate": 0.0001751314691520134, + "loss": 0.9259, + "step": 2613 + }, + { + "epoch": 0.92, + "grad_norm": 0.18567489087581635, + "learning_rate": 0.00017511319308705198, + "loss": 0.9211, + "step": 2614 + }, + { + "epoch": 0.92, + "grad_norm": 0.19012784957885742, + "learning_rate": 0.00017509491126331162, + "loss": 0.9819, + "step": 2615 + }, + { + "epoch": 0.92, + "grad_norm": 0.18892110884189606, + "learning_rate": 0.00017507662368219395, + "loss": 0.896, + "step": 2616 + }, + { + "epoch": 0.92, + "grad_norm": 0.18657296895980835, + "learning_rate": 0.00017505833034510104, + "loss": 0.9289, + "step": 2617 + }, + { + "epoch": 0.92, + "grad_norm": 0.19216986000537872, + "learning_rate": 0.00017504003125343536, + "loss": 0.9743, + "step": 2618 + }, + { + "epoch": 0.92, + "grad_norm": 0.18053677678108215, + "learning_rate": 0.00017502172640859991, + "loss": 0.9065, + "step": 2619 + }, + { + "epoch": 0.92, + "grad_norm": 0.19183310866355896, + "learning_rate": 0.0001750034158119981, + "loss": 1.0026, + "step": 2620 + }, + { + "epoch": 0.92, + "grad_norm": 0.17989866435527802, + "learning_rate": 0.0001749850994650337, + "loss": 0.8568, + "step": 2621 + }, + { + "epoch": 0.92, + "grad_norm": 0.19608454406261444, + "learning_rate": 0.0001749667773691111, + "loss": 0.9396, + "step": 2622 + }, + { + "epoch": 0.92, + "grad_norm": 0.1893497258424759, + "learning_rate": 0.00017494844952563492, + "loss": 1.0244, + "step": 2623 + }, + { + "epoch": 0.92, + "grad_norm": 0.1881159394979477, + "learning_rate": 0.00017493011593601034, + "loss": 0.9258, + "step": 2624 + }, + { + "epoch": 0.92, + "grad_norm": 0.19302113354206085, + "learning_rate": 0.00017491177660164294, + "loss": 0.9426, + "step": 2625 + }, + { + "epoch": 0.92, + "grad_norm": 0.1864844262599945, + "learning_rate": 0.00017489343152393886, + "loss": 0.9482, + "step": 2626 + }, + { + "epoch": 0.93, + "grad_norm": 0.19288282096385956, + "learning_rate": 0.00017487508070430447, + "loss": 1.0078, + "step": 2627 + }, + { + "epoch": 0.93, + "grad_norm": 0.19508597254753113, + "learning_rate": 0.00017485672414414675, + "loss": 1.0081, + "step": 2628 + }, + { + "epoch": 0.93, + "grad_norm": 0.19731415808200836, + "learning_rate": 0.00017483836184487304, + "loss": 1.0362, + "step": 2629 + }, + { + "epoch": 0.93, + "grad_norm": 0.18698172271251678, + "learning_rate": 0.00017481999380789112, + "loss": 0.9922, + "step": 2630 + }, + { + "epoch": 0.93, + "grad_norm": 0.1864076554775238, + "learning_rate": 0.0001748016200346093, + "loss": 0.9954, + "step": 2631 + }, + { + "epoch": 0.93, + "grad_norm": 0.19683367013931274, + "learning_rate": 0.00017478324052643617, + "loss": 0.9626, + "step": 2632 + }, + { + "epoch": 0.93, + "grad_norm": 0.1958743780851364, + "learning_rate": 0.00017476485528478093, + "loss": 0.9605, + "step": 2633 + }, + { + "epoch": 0.93, + "grad_norm": 0.19047711789608002, + "learning_rate": 0.00017474646431105305, + "loss": 0.9122, + "step": 2634 + }, + { + "epoch": 0.93, + "grad_norm": 0.18464651703834534, + "learning_rate": 0.00017472806760666262, + "loss": 0.9645, + "step": 2635 + }, + { + "epoch": 0.93, + "grad_norm": 0.19960200786590576, + "learning_rate": 0.00017470966517302004, + "loss": 0.9316, + "step": 2636 + }, + { + "epoch": 0.93, + "grad_norm": 0.18094268441200256, + "learning_rate": 0.00017469125701153622, + "loss": 0.9249, + "step": 2637 + }, + { + "epoch": 0.93, + "grad_norm": 0.18596448004245758, + "learning_rate": 0.00017467284312362242, + "loss": 0.9562, + "step": 2638 + }, + { + "epoch": 0.93, + "grad_norm": 0.19108310341835022, + "learning_rate": 0.00017465442351069044, + "loss": 0.9802, + "step": 2639 + }, + { + "epoch": 0.93, + "grad_norm": 0.19825781881809235, + "learning_rate": 0.00017463599817415243, + "loss": 1.0161, + "step": 2640 + }, + { + "epoch": 0.93, + "grad_norm": 0.18590597808361053, + "learning_rate": 0.0001746175671154211, + "loss": 0.8963, + "step": 2641 + }, + { + "epoch": 0.93, + "grad_norm": 0.19133514165878296, + "learning_rate": 0.00017459913033590944, + "loss": 0.9536, + "step": 2642 + }, + { + "epoch": 0.93, + "grad_norm": 0.18116304278373718, + "learning_rate": 0.000174580687837031, + "loss": 1.0119, + "step": 2643 + }, + { + "epoch": 0.93, + "grad_norm": 0.19031855463981628, + "learning_rate": 0.00017456223962019975, + "loss": 0.9332, + "step": 2644 + }, + { + "epoch": 0.93, + "grad_norm": 0.18714213371276855, + "learning_rate": 0.00017454378568683003, + "loss": 0.9149, + "step": 2645 + }, + { + "epoch": 0.93, + "grad_norm": 0.20420034229755402, + "learning_rate": 0.00017452532603833668, + "loss": 0.9958, + "step": 2646 + }, + { + "epoch": 0.93, + "grad_norm": 0.1930292397737503, + "learning_rate": 0.00017450686067613502, + "loss": 0.9853, + "step": 2647 + }, + { + "epoch": 0.93, + "grad_norm": 0.20497813820838928, + "learning_rate": 0.00017448838960164068, + "loss": 0.9942, + "step": 2648 + }, + { + "epoch": 0.93, + "grad_norm": 0.19013455510139465, + "learning_rate": 0.00017446991281626982, + "loss": 0.999, + "step": 2649 + }, + { + "epoch": 0.93, + "grad_norm": 0.19203388690948486, + "learning_rate": 0.00017445143032143905, + "loss": 0.9922, + "step": 2650 + }, + { + "epoch": 0.93, + "grad_norm": 0.1985304057598114, + "learning_rate": 0.00017443294211856534, + "loss": 0.9287, + "step": 2651 + }, + { + "epoch": 0.93, + "grad_norm": 0.19734938442707062, + "learning_rate": 0.0001744144482090662, + "loss": 1.036, + "step": 2652 + }, + { + "epoch": 0.93, + "grad_norm": 0.19779257476329803, + "learning_rate": 0.00017439594859435946, + "loss": 0.9858, + "step": 2653 + }, + { + "epoch": 0.93, + "grad_norm": 0.1939266473054886, + "learning_rate": 0.0001743774432758635, + "loss": 0.9208, + "step": 2654 + }, + { + "epoch": 0.93, + "grad_norm": 0.19337120652198792, + "learning_rate": 0.00017435893225499705, + "loss": 0.9435, + "step": 2655 + }, + { + "epoch": 0.94, + "grad_norm": 0.17661985754966736, + "learning_rate": 0.00017434041553317933, + "loss": 0.9302, + "step": 2656 + }, + { + "epoch": 0.94, + "grad_norm": 0.19542555510997772, + "learning_rate": 0.00017432189311182995, + "loss": 0.956, + "step": 2657 + }, + { + "epoch": 0.94, + "grad_norm": 0.19809268414974213, + "learning_rate": 0.00017430336499236904, + "loss": 1.0116, + "step": 2658 + }, + { + "epoch": 0.94, + "grad_norm": 0.18813373148441315, + "learning_rate": 0.00017428483117621707, + "loss": 0.9388, + "step": 2659 + }, + { + "epoch": 0.94, + "grad_norm": 0.18381525576114655, + "learning_rate": 0.00017426629166479503, + "loss": 1.002, + "step": 2660 + }, + { + "epoch": 0.94, + "grad_norm": 0.18643952906131744, + "learning_rate": 0.00017424774645952425, + "loss": 0.9856, + "step": 2661 + }, + { + "epoch": 0.94, + "grad_norm": 0.18275515735149384, + "learning_rate": 0.0001742291955618266, + "loss": 0.9922, + "step": 2662 + }, + { + "epoch": 0.94, + "grad_norm": 0.18715785443782806, + "learning_rate": 0.00017421063897312439, + "loss": 0.9633, + "step": 2663 + }, + { + "epoch": 0.94, + "grad_norm": 0.19189901649951935, + "learning_rate": 0.00017419207669484022, + "loss": 0.9482, + "step": 2664 + }, + { + "epoch": 0.94, + "grad_norm": 0.2036040723323822, + "learning_rate": 0.0001741735087283972, + "loss": 1.0095, + "step": 2665 + }, + { + "epoch": 0.94, + "grad_norm": 0.19645290076732635, + "learning_rate": 0.00017415493507521904, + "loss": 0.9933, + "step": 2666 + }, + { + "epoch": 0.94, + "grad_norm": 0.1870705783367157, + "learning_rate": 0.00017413635573672963, + "loss": 0.9274, + "step": 2667 + }, + { + "epoch": 0.94, + "grad_norm": 0.18717564642429352, + "learning_rate": 0.00017411777071435345, + "loss": 1.0066, + "step": 2668 + }, + { + "epoch": 0.94, + "grad_norm": 0.18918143212795258, + "learning_rate": 0.00017409918000951534, + "loss": 0.9256, + "step": 2669 + }, + { + "epoch": 0.94, + "grad_norm": 0.18597491085529327, + "learning_rate": 0.00017408058362364067, + "loss": 0.9461, + "step": 2670 + }, + { + "epoch": 0.94, + "grad_norm": 0.18700912594795227, + "learning_rate": 0.0001740619815581552, + "loss": 0.9501, + "step": 2671 + }, + { + "epoch": 0.94, + "grad_norm": 0.18128564953804016, + "learning_rate": 0.000174043373814485, + "loss": 0.9801, + "step": 2672 + }, + { + "epoch": 0.94, + "grad_norm": 0.19424092769622803, + "learning_rate": 0.0001740247603940568, + "loss": 0.977, + "step": 2673 + }, + { + "epoch": 0.94, + "grad_norm": 0.19749069213867188, + "learning_rate": 0.00017400614129829762, + "loss": 1.0002, + "step": 2674 + }, + { + "epoch": 0.94, + "grad_norm": 0.19106240570545197, + "learning_rate": 0.00017398751652863495, + "loss": 0.9679, + "step": 2675 + }, + { + "epoch": 0.94, + "grad_norm": 0.18876773118972778, + "learning_rate": 0.0001739688860864967, + "loss": 0.9475, + "step": 2676 + }, + { + "epoch": 0.94, + "grad_norm": 0.1802065521478653, + "learning_rate": 0.00017395024997331123, + "loss": 0.9605, + "step": 2677 + }, + { + "epoch": 0.94, + "grad_norm": 0.18019360303878784, + "learning_rate": 0.00017393160819050737, + "loss": 0.9025, + "step": 2678 + }, + { + "epoch": 0.94, + "grad_norm": 0.20628519356250763, + "learning_rate": 0.00017391296073951428, + "loss": 0.9899, + "step": 2679 + }, + { + "epoch": 0.94, + "grad_norm": 0.1838846504688263, + "learning_rate": 0.00017389430762176168, + "loss": 0.9286, + "step": 2680 + }, + { + "epoch": 0.94, + "grad_norm": 0.196673184633255, + "learning_rate": 0.00017387564883867968, + "loss": 0.9841, + "step": 2681 + }, + { + "epoch": 0.94, + "grad_norm": 0.19348804652690887, + "learning_rate": 0.00017385698439169875, + "loss": 0.981, + "step": 2682 + }, + { + "epoch": 0.94, + "grad_norm": 0.19190922379493713, + "learning_rate": 0.0001738383142822499, + "loss": 0.9236, + "step": 2683 + }, + { + "epoch": 0.95, + "grad_norm": 0.1826351135969162, + "learning_rate": 0.00017381963851176454, + "loss": 0.9592, + "step": 2684 + }, + { + "epoch": 0.95, + "grad_norm": 0.18728382885456085, + "learning_rate": 0.00017380095708167446, + "loss": 0.8939, + "step": 2685 + }, + { + "epoch": 0.95, + "grad_norm": 0.19076916575431824, + "learning_rate": 0.00017378226999341197, + "loss": 0.9717, + "step": 2686 + }, + { + "epoch": 0.95, + "grad_norm": 0.17857998609542847, + "learning_rate": 0.00017376357724840973, + "loss": 0.9, + "step": 2687 + }, + { + "epoch": 0.95, + "grad_norm": 0.1867329478263855, + "learning_rate": 0.00017374487884810088, + "loss": 0.957, + "step": 2688 + }, + { + "epoch": 0.95, + "grad_norm": 0.18857437372207642, + "learning_rate": 0.00017372617479391903, + "loss": 0.9405, + "step": 2689 + }, + { + "epoch": 0.95, + "grad_norm": 0.19370107352733612, + "learning_rate": 0.00017370746508729818, + "loss": 0.8735, + "step": 2690 + }, + { + "epoch": 0.95, + "grad_norm": 0.18515117466449738, + "learning_rate": 0.00017368874972967273, + "loss": 0.8689, + "step": 2691 + }, + { + "epoch": 0.95, + "grad_norm": 0.20236286520957947, + "learning_rate": 0.00017367002872247758, + "loss": 1.0547, + "step": 2692 + }, + { + "epoch": 0.95, + "grad_norm": 0.19606740772724152, + "learning_rate": 0.000173651302067148, + "loss": 1.0677, + "step": 2693 + }, + { + "epoch": 0.95, + "grad_norm": 0.1996169537305832, + "learning_rate": 0.00017363256976511972, + "loss": 0.8994, + "step": 2694 + }, + { + "epoch": 0.95, + "grad_norm": 0.1887851357460022, + "learning_rate": 0.00017361383181782898, + "loss": 0.9454, + "step": 2695 + }, + { + "epoch": 0.95, + "grad_norm": 0.1962120532989502, + "learning_rate": 0.0001735950882267123, + "loss": 0.9883, + "step": 2696 + }, + { + "epoch": 0.95, + "grad_norm": 0.184779554605484, + "learning_rate": 0.0001735763389932068, + "loss": 0.9725, + "step": 2697 + }, + { + "epoch": 0.95, + "grad_norm": 0.19174376130104065, + "learning_rate": 0.00017355758411874983, + "loss": 0.9666, + "step": 2698 + }, + { + "epoch": 0.95, + "grad_norm": 0.19610293209552765, + "learning_rate": 0.00017353882360477937, + "loss": 0.9812, + "step": 2699 + }, + { + "epoch": 0.95, + "grad_norm": 0.19059418141841888, + "learning_rate": 0.0001735200574527337, + "loss": 0.9819, + "step": 2700 + }, + { + "epoch": 0.95, + "grad_norm": 0.18484486639499664, + "learning_rate": 0.00017350128566405172, + "loss": 0.9654, + "step": 2701 + }, + { + "epoch": 0.95, + "grad_norm": 0.19833725690841675, + "learning_rate": 0.00017348250824017242, + "loss": 0.9318, + "step": 2702 + }, + { + "epoch": 0.95, + "grad_norm": 0.1945193111896515, + "learning_rate": 0.00017346372518253558, + "loss": 0.978, + "step": 2703 + }, + { + "epoch": 0.95, + "grad_norm": 0.18793296813964844, + "learning_rate": 0.00017344493649258118, + "loss": 0.9256, + "step": 2704 + }, + { + "epoch": 0.95, + "grad_norm": 0.1921757608652115, + "learning_rate": 0.00017342614217174976, + "loss": 0.981, + "step": 2705 + }, + { + "epoch": 0.95, + "grad_norm": 0.19174674153327942, + "learning_rate": 0.00017340734222148223, + "loss": 0.981, + "step": 2706 + }, + { + "epoch": 0.95, + "grad_norm": 0.19075125455856323, + "learning_rate": 0.00017338853664321992, + "loss": 0.985, + "step": 2707 + }, + { + "epoch": 0.95, + "grad_norm": 0.18513436615467072, + "learning_rate": 0.00017336972543840467, + "loss": 1.0468, + "step": 2708 + }, + { + "epoch": 0.95, + "grad_norm": 0.1740717887878418, + "learning_rate": 0.00017335090860847862, + "loss": 0.911, + "step": 2709 + }, + { + "epoch": 0.95, + "grad_norm": 0.20851874351501465, + "learning_rate": 0.00017333208615488447, + "loss": 1.1055, + "step": 2710 + }, + { + "epoch": 0.95, + "grad_norm": 0.1837608367204666, + "learning_rate": 0.0001733132580790653, + "loss": 0.9793, + "step": 2711 + }, + { + "epoch": 0.96, + "grad_norm": 0.17361067235469818, + "learning_rate": 0.0001732944243824646, + "loss": 0.8594, + "step": 2712 + }, + { + "epoch": 0.96, + "grad_norm": 0.1763828694820404, + "learning_rate": 0.00017327558506652636, + "loss": 0.9235, + "step": 2713 + }, + { + "epoch": 0.96, + "grad_norm": 0.1873314380645752, + "learning_rate": 0.00017325674013269487, + "loss": 0.9479, + "step": 2714 + }, + { + "epoch": 0.96, + "grad_norm": 0.19114772975444794, + "learning_rate": 0.00017323788958241502, + "loss": 0.9618, + "step": 2715 + }, + { + "epoch": 0.96, + "grad_norm": 0.19344426691532135, + "learning_rate": 0.00017321903341713203, + "loss": 0.9868, + "step": 2716 + }, + { + "epoch": 0.96, + "grad_norm": 0.19095127284526825, + "learning_rate": 0.00017320017163829148, + "loss": 0.9675, + "step": 2717 + }, + { + "epoch": 0.96, + "grad_norm": 0.19331395626068115, + "learning_rate": 0.00017318130424733954, + "loss": 0.9681, + "step": 2718 + }, + { + "epoch": 0.96, + "grad_norm": 0.19180914759635925, + "learning_rate": 0.00017316243124572273, + "loss": 0.9113, + "step": 2719 + }, + { + "epoch": 0.96, + "grad_norm": 0.1976597011089325, + "learning_rate": 0.000173143552634888, + "loss": 1.0001, + "step": 2720 + }, + { + "epoch": 0.96, + "grad_norm": 0.19116324186325073, + "learning_rate": 0.00017312466841628272, + "loss": 0.9865, + "step": 2721 + }, + { + "epoch": 0.96, + "grad_norm": 0.1902334988117218, + "learning_rate": 0.0001731057785913547, + "loss": 0.9652, + "step": 2722 + }, + { + "epoch": 0.96, + "grad_norm": 0.1837366372346878, + "learning_rate": 0.00017308688316155225, + "loss": 0.8781, + "step": 2723 + }, + { + "epoch": 0.96, + "grad_norm": 0.18876072764396667, + "learning_rate": 0.00017306798212832395, + "loss": 1.0168, + "step": 2724 + }, + { + "epoch": 0.96, + "grad_norm": 0.18862731754779816, + "learning_rate": 0.000173049075493119, + "loss": 0.9721, + "step": 2725 + }, + { + "epoch": 0.96, + "grad_norm": 0.1875831037759781, + "learning_rate": 0.00017303016325738685, + "loss": 0.9598, + "step": 2726 + }, + { + "epoch": 0.96, + "grad_norm": 0.18485279381275177, + "learning_rate": 0.00017301124542257748, + "loss": 0.9447, + "step": 2727 + }, + { + "epoch": 0.96, + "grad_norm": 0.1946990042924881, + "learning_rate": 0.00017299232199014135, + "loss": 0.9895, + "step": 2728 + }, + { + "epoch": 0.96, + "grad_norm": 0.18176619708538055, + "learning_rate": 0.0001729733929615292, + "loss": 0.9482, + "step": 2729 + }, + { + "epoch": 0.96, + "grad_norm": 0.19105994701385498, + "learning_rate": 0.00017295445833819236, + "loss": 0.9972, + "step": 2730 + }, + { + "epoch": 0.96, + "grad_norm": 0.18423745036125183, + "learning_rate": 0.0001729355181215824, + "loss": 0.9693, + "step": 2731 + }, + { + "epoch": 0.96, + "grad_norm": 0.18534033000469208, + "learning_rate": 0.00017291657231315152, + "loss": 0.8961, + "step": 2732 + }, + { + "epoch": 0.96, + "grad_norm": 0.1894759088754654, + "learning_rate": 0.00017289762091435225, + "loss": 0.96, + "step": 2733 + }, + { + "epoch": 0.96, + "grad_norm": 0.18298478424549103, + "learning_rate": 0.00017287866392663752, + "loss": 0.8785, + "step": 2734 + }, + { + "epoch": 0.96, + "grad_norm": 0.18672245740890503, + "learning_rate": 0.00017285970135146075, + "loss": 0.9642, + "step": 2735 + }, + { + "epoch": 0.96, + "grad_norm": 0.18695160746574402, + "learning_rate": 0.00017284073319027575, + "loss": 0.9333, + "step": 2736 + }, + { + "epoch": 0.96, + "grad_norm": 0.1886570155620575, + "learning_rate": 0.00017282175944453675, + "loss": 0.9322, + "step": 2737 + }, + { + "epoch": 0.96, + "grad_norm": 0.1965176910161972, + "learning_rate": 0.00017280278011569847, + "loss": 0.9819, + "step": 2738 + }, + { + "epoch": 0.96, + "grad_norm": 0.19003883004188538, + "learning_rate": 0.00017278379520521605, + "loss": 1.0588, + "step": 2739 + }, + { + "epoch": 0.96, + "grad_norm": 0.19280295073986053, + "learning_rate": 0.00017276480471454492, + "loss": 0.9592, + "step": 2740 + }, + { + "epoch": 0.97, + "grad_norm": 0.1867469996213913, + "learning_rate": 0.00017274580864514108, + "loss": 0.9086, + "step": 2741 + }, + { + "epoch": 0.97, + "grad_norm": 0.1786963790655136, + "learning_rate": 0.000172726806998461, + "loss": 0.8676, + "step": 2742 + }, + { + "epoch": 0.97, + "grad_norm": 0.18356633186340332, + "learning_rate": 0.0001727077997759614, + "loss": 0.9183, + "step": 2743 + }, + { + "epoch": 0.97, + "grad_norm": 0.19396550953388214, + "learning_rate": 0.00017268878697909961, + "loss": 0.9955, + "step": 2744 + }, + { + "epoch": 0.97, + "grad_norm": 0.17852063477039337, + "learning_rate": 0.0001726697686093332, + "loss": 0.9336, + "step": 2745 + }, + { + "epoch": 0.97, + "grad_norm": 0.18407288193702698, + "learning_rate": 0.0001726507446681204, + "loss": 0.9827, + "step": 2746 + }, + { + "epoch": 0.97, + "grad_norm": 0.1954531967639923, + "learning_rate": 0.0001726317151569196, + "loss": 0.9903, + "step": 2747 + }, + { + "epoch": 0.97, + "grad_norm": 0.18365801870822906, + "learning_rate": 0.00017261268007718984, + "loss": 0.9263, + "step": 2748 + }, + { + "epoch": 0.97, + "grad_norm": 0.19449764490127563, + "learning_rate": 0.00017259363943039052, + "loss": 0.9948, + "step": 2749 + }, + { + "epoch": 0.97, + "grad_norm": 0.18458783626556396, + "learning_rate": 0.00017257459321798137, + "loss": 0.9354, + "step": 2750 + }, + { + "epoch": 0.97, + "grad_norm": 0.18690603971481323, + "learning_rate": 0.00017255554144142267, + "loss": 0.894, + "step": 2751 + }, + { + "epoch": 0.97, + "grad_norm": 0.18686780333518982, + "learning_rate": 0.00017253648410217508, + "loss": 1.0131, + "step": 2752 + }, + { + "epoch": 0.97, + "grad_norm": 0.182310089468956, + "learning_rate": 0.0001725174212016997, + "loss": 1.0015, + "step": 2753 + }, + { + "epoch": 0.97, + "grad_norm": 0.18795253336429596, + "learning_rate": 0.00017249835274145805, + "loss": 0.9597, + "step": 2754 + }, + { + "epoch": 0.97, + "grad_norm": 0.19204358756542206, + "learning_rate": 0.000172479278722912, + "loss": 0.9657, + "step": 2755 + }, + { + "epoch": 0.97, + "grad_norm": 0.19570910930633545, + "learning_rate": 0.000172460199147524, + "loss": 0.9586, + "step": 2756 + }, + { + "epoch": 0.97, + "grad_norm": 0.19368427991867065, + "learning_rate": 0.00017244111401675682, + "loss": 0.9487, + "step": 2757 + }, + { + "epoch": 0.97, + "grad_norm": 0.19332663714885712, + "learning_rate": 0.00017242202333207364, + "loss": 0.9792, + "step": 2758 + }, + { + "epoch": 0.97, + "grad_norm": 0.18540112674236298, + "learning_rate": 0.0001724029270949382, + "loss": 0.9517, + "step": 2759 + }, + { + "epoch": 0.97, + "grad_norm": 0.1930074840784073, + "learning_rate": 0.00017238382530681445, + "loss": 1.0426, + "step": 2760 + }, + { + "epoch": 0.97, + "grad_norm": 0.18122000992298126, + "learning_rate": 0.00017236471796916694, + "loss": 0.9315, + "step": 2761 + }, + { + "epoch": 0.97, + "grad_norm": 0.18953172862529755, + "learning_rate": 0.0001723456050834606, + "loss": 0.9644, + "step": 2762 + }, + { + "epoch": 0.97, + "grad_norm": 0.18717069923877716, + "learning_rate": 0.0001723264866511608, + "loss": 1.0098, + "step": 2763 + }, + { + "epoch": 0.97, + "grad_norm": 0.17663700878620148, + "learning_rate": 0.00017230736267373326, + "loss": 0.9677, + "step": 2764 + }, + { + "epoch": 0.97, + "grad_norm": 0.18183469772338867, + "learning_rate": 0.0001722882331526442, + "loss": 0.9359, + "step": 2765 + }, + { + "epoch": 0.97, + "grad_norm": 0.18459314107894897, + "learning_rate": 0.00017226909808936027, + "loss": 0.9705, + "step": 2766 + }, + { + "epoch": 0.97, + "grad_norm": 0.1858403980731964, + "learning_rate": 0.00017224995748534844, + "loss": 0.9647, + "step": 2767 + }, + { + "epoch": 0.97, + "grad_norm": 0.1881045550107956, + "learning_rate": 0.00017223081134207628, + "loss": 1.0115, + "step": 2768 + }, + { + "epoch": 0.98, + "grad_norm": 0.18951913714408875, + "learning_rate": 0.00017221165966101163, + "loss": 0.9398, + "step": 2769 + }, + { + "epoch": 0.98, + "grad_norm": 0.19097717106342316, + "learning_rate": 0.0001721925024436228, + "loss": 0.9834, + "step": 2770 + }, + { + "epoch": 0.98, + "grad_norm": 0.18614451587200165, + "learning_rate": 0.0001721733396913786, + "loss": 0.9537, + "step": 2771 + }, + { + "epoch": 0.98, + "grad_norm": 0.1865241825580597, + "learning_rate": 0.0001721541714057481, + "loss": 0.9338, + "step": 2772 + }, + { + "epoch": 0.98, + "grad_norm": 0.1903114914894104, + "learning_rate": 0.00017213499758820096, + "loss": 0.9616, + "step": 2773 + }, + { + "epoch": 0.98, + "grad_norm": 0.1903352588415146, + "learning_rate": 0.00017211581824020723, + "loss": 0.9934, + "step": 2774 + }, + { + "epoch": 0.98, + "grad_norm": 0.18006877601146698, + "learning_rate": 0.00017209663336323726, + "loss": 0.9624, + "step": 2775 + }, + { + "epoch": 0.98, + "grad_norm": 0.19256149232387543, + "learning_rate": 0.00017207744295876198, + "loss": 0.9431, + "step": 2776 + }, + { + "epoch": 0.98, + "grad_norm": 0.19010770320892334, + "learning_rate": 0.0001720582470282527, + "loss": 0.9376, + "step": 2777 + }, + { + "epoch": 0.98, + "grad_norm": 0.1907973736524582, + "learning_rate": 0.00017203904557318108, + "loss": 0.9791, + "step": 2778 + }, + { + "epoch": 0.98, + "grad_norm": 0.19442807137966156, + "learning_rate": 0.00017201983859501928, + "loss": 1.0337, + "step": 2779 + }, + { + "epoch": 0.98, + "grad_norm": 0.18805912137031555, + "learning_rate": 0.00017200062609523984, + "loss": 1.0056, + "step": 2780 + }, + { + "epoch": 0.98, + "grad_norm": 0.19650529325008392, + "learning_rate": 0.00017198140807531576, + "loss": 1.0526, + "step": 2781 + }, + { + "epoch": 0.98, + "grad_norm": 0.19417373836040497, + "learning_rate": 0.0001719621845367205, + "loss": 0.9836, + "step": 2782 + }, + { + "epoch": 0.98, + "grad_norm": 0.17788995802402496, + "learning_rate": 0.0001719429554809278, + "loss": 0.9189, + "step": 2783 + }, + { + "epoch": 0.98, + "grad_norm": 0.19120745360851288, + "learning_rate": 0.00017192372090941196, + "loss": 0.965, + "step": 2784 + }, + { + "epoch": 0.98, + "grad_norm": 0.18333250284194946, + "learning_rate": 0.00017190448082364765, + "loss": 0.9157, + "step": 2785 + }, + { + "epoch": 0.98, + "grad_norm": 0.19432075321674347, + "learning_rate": 0.00017188523522510996, + "loss": 0.9863, + "step": 2786 + }, + { + "epoch": 0.98, + "grad_norm": 0.1891552060842514, + "learning_rate": 0.00017186598411527446, + "loss": 0.9274, + "step": 2787 + }, + { + "epoch": 0.98, + "grad_norm": 0.19520063698291779, + "learning_rate": 0.00017184672749561705, + "loss": 0.9257, + "step": 2788 + }, + { + "epoch": 0.98, + "grad_norm": 0.18742509186267853, + "learning_rate": 0.00017182746536761404, + "loss": 0.9583, + "step": 2789 + }, + { + "epoch": 0.98, + "grad_norm": 0.19111135601997375, + "learning_rate": 0.00017180819773274234, + "loss": 0.9878, + "step": 2790 + }, + { + "epoch": 0.98, + "grad_norm": 0.18058103322982788, + "learning_rate": 0.00017178892459247908, + "loss": 0.962, + "step": 2791 + }, + { + "epoch": 0.98, + "grad_norm": 0.18281134963035583, + "learning_rate": 0.00017176964594830193, + "loss": 0.9301, + "step": 2792 + }, + { + "epoch": 0.98, + "grad_norm": 0.17827610671520233, + "learning_rate": 0.00017175036180168892, + "loss": 0.8986, + "step": 2793 + }, + { + "epoch": 0.98, + "grad_norm": 0.19877582788467407, + "learning_rate": 0.00017173107215411854, + "loss": 0.9709, + "step": 2794 + }, + { + "epoch": 0.98, + "grad_norm": 0.19208525121212006, + "learning_rate": 0.0001717117770070697, + "loss": 0.9325, + "step": 2795 + }, + { + "epoch": 0.98, + "grad_norm": 0.19865301251411438, + "learning_rate": 0.0001716924763620217, + "loss": 0.9956, + "step": 2796 + }, + { + "epoch": 0.98, + "grad_norm": 0.19380724430084229, + "learning_rate": 0.00017167317022045428, + "loss": 0.9418, + "step": 2797 + }, + { + "epoch": 0.99, + "grad_norm": 0.19513899087905884, + "learning_rate": 0.00017165385858384765, + "loss": 1.0074, + "step": 2798 + }, + { + "epoch": 0.99, + "grad_norm": 0.1949981302022934, + "learning_rate": 0.0001716345414536823, + "loss": 0.999, + "step": 2799 + }, + { + "epoch": 0.99, + "grad_norm": 0.18708331882953644, + "learning_rate": 0.00017161521883143934, + "loss": 1.0341, + "step": 2800 + }, + { + "epoch": 0.99, + "grad_norm": 0.19197827577590942, + "learning_rate": 0.00017159589071860014, + "loss": 0.9257, + "step": 2801 + }, + { + "epoch": 0.99, + "grad_norm": 0.1827811300754547, + "learning_rate": 0.00017157655711664654, + "loss": 0.9209, + "step": 2802 + }, + { + "epoch": 0.99, + "grad_norm": 0.1887734979391098, + "learning_rate": 0.00017155721802706087, + "loss": 0.9731, + "step": 2803 + }, + { + "epoch": 0.99, + "grad_norm": 0.19555433094501495, + "learning_rate": 0.00017153787345132573, + "loss": 0.9979, + "step": 2804 + }, + { + "epoch": 0.99, + "grad_norm": 0.20035997033119202, + "learning_rate": 0.0001715185233909243, + "loss": 1.017, + "step": 2805 + }, + { + "epoch": 0.99, + "grad_norm": 0.19120369851589203, + "learning_rate": 0.0001714991678473401, + "loss": 0.969, + "step": 2806 + }, + { + "epoch": 0.99, + "grad_norm": 0.1866525262594223, + "learning_rate": 0.00017147980682205702, + "loss": 0.9909, + "step": 2807 + }, + { + "epoch": 0.99, + "grad_norm": 0.1893429458141327, + "learning_rate": 0.00017146044031655951, + "loss": 0.9889, + "step": 2808 + }, + { + "epoch": 0.99, + "grad_norm": 0.19175229966640472, + "learning_rate": 0.00017144106833233232, + "loss": 0.9641, + "step": 2809 + }, + { + "epoch": 0.99, + "grad_norm": 0.18215136229991913, + "learning_rate": 0.0001714216908708607, + "loss": 0.9432, + "step": 2810 + }, + { + "epoch": 0.99, + "grad_norm": 0.18707439303398132, + "learning_rate": 0.00017140230793363025, + "loss": 0.9354, + "step": 2811 + }, + { + "epoch": 0.99, + "grad_norm": 0.1920577585697174, + "learning_rate": 0.000171382919522127, + "loss": 0.9781, + "step": 2812 + }, + { + "epoch": 0.99, + "grad_norm": 0.17737874388694763, + "learning_rate": 0.0001713635256378375, + "loss": 1.0113, + "step": 2813 + }, + { + "epoch": 0.99, + "grad_norm": 0.1806550770998001, + "learning_rate": 0.00017134412628224852, + "loss": 0.9249, + "step": 2814 + }, + { + "epoch": 0.99, + "grad_norm": 0.18417756259441376, + "learning_rate": 0.00017132472145684746, + "loss": 0.9205, + "step": 2815 + }, + { + "epoch": 0.99, + "grad_norm": 0.18996597826480865, + "learning_rate": 0.00017130531116312203, + "loss": 0.9541, + "step": 2816 + }, + { + "epoch": 0.99, + "grad_norm": 0.19089314341545105, + "learning_rate": 0.0001712858954025604, + "loss": 0.9308, + "step": 2817 + }, + { + "epoch": 0.99, + "grad_norm": 0.17796491086483002, + "learning_rate": 0.00017126647417665107, + "loss": 1.0109, + "step": 2818 + }, + { + "epoch": 0.99, + "grad_norm": 0.19152474403381348, + "learning_rate": 0.0001712470474868831, + "loss": 0.9048, + "step": 2819 + }, + { + "epoch": 0.99, + "grad_norm": 0.22874225676059723, + "learning_rate": 0.00017122761533474586, + "loss": 0.9911, + "step": 2820 + }, + { + "epoch": 0.99, + "grad_norm": 0.1857227236032486, + "learning_rate": 0.00017120817772172916, + "loss": 0.9237, + "step": 2821 + }, + { + "epoch": 0.99, + "grad_norm": 0.18604129552841187, + "learning_rate": 0.00017118873464932327, + "loss": 0.9447, + "step": 2822 + }, + { + "epoch": 0.99, + "grad_norm": 0.20199713110923767, + "learning_rate": 0.00017116928611901885, + "loss": 0.9754, + "step": 2823 + }, + { + "epoch": 0.99, + "grad_norm": 0.20207203924655914, + "learning_rate": 0.000171149832132307, + "loss": 1.015, + "step": 2824 + }, + { + "epoch": 0.99, + "grad_norm": 0.18461887538433075, + "learning_rate": 0.00017113037269067912, + "loss": 0.9923, + "step": 2825 + }, + { + "epoch": 1.0, + "grad_norm": 0.187242791056633, + "learning_rate": 0.00017111090779562725, + "loss": 0.927, + "step": 2826 + }, + { + "epoch": 1.0, + "grad_norm": 0.18785372376441956, + "learning_rate": 0.0001710914374486437, + "loss": 0.9408, + "step": 2827 + }, + { + "epoch": 1.0, + "grad_norm": 0.18827159702777863, + "learning_rate": 0.00017107196165122113, + "loss": 0.9365, + "step": 2828 + }, + { + "epoch": 1.0, + "grad_norm": 0.1842368096113205, + "learning_rate": 0.00017105248040485282, + "loss": 0.943, + "step": 2829 + }, + { + "epoch": 1.0, + "grad_norm": 0.18567433953285217, + "learning_rate": 0.0001710329937110323, + "loss": 0.9705, + "step": 2830 + }, + { + "epoch": 1.0, + "grad_norm": 0.18705464899539948, + "learning_rate": 0.0001710135015712536, + "loss": 1.0149, + "step": 2831 + }, + { + "epoch": 1.0, + "grad_norm": 0.19440047442913055, + "learning_rate": 0.00017099400398701113, + "loss": 0.9546, + "step": 2832 + }, + { + "epoch": 1.0, + "grad_norm": 0.19255554676055908, + "learning_rate": 0.00017097450095979975, + "loss": 0.9735, + "step": 2833 + }, + { + "epoch": 1.0, + "grad_norm": 0.1825937181711197, + "learning_rate": 0.00017095499249111467, + "loss": 0.9555, + "step": 2834 + }, + { + "epoch": 1.0, + "grad_norm": 0.19005203247070312, + "learning_rate": 0.00017093547858245162, + "loss": 0.9334, + "step": 2835 + }, + { + "epoch": 1.0, + "grad_norm": 0.18393410742282867, + "learning_rate": 0.0001709159592353067, + "loss": 0.9209, + "step": 2836 + }, + { + "epoch": 1.0, + "grad_norm": 0.18357278406620026, + "learning_rate": 0.00017089643445117637, + "loss": 0.9059, + "step": 2837 + }, + { + "epoch": 1.0, + "grad_norm": 0.1842675656080246, + "learning_rate": 0.00017087690423155756, + "loss": 0.9593, + "step": 2838 + }, + { + "epoch": 1.0, + "grad_norm": 0.19786691665649414, + "learning_rate": 0.00017085736857794763, + "loss": 1.015, + "step": 2839 + }, + { + "epoch": 1.0, + "grad_norm": 0.18587841093540192, + "learning_rate": 0.00017083782749184435, + "loss": 0.9738, + "step": 2840 + }, + { + "epoch": 1.0, + "eval_loss": 0.9591795802116394, + "eval_runtime": 679.171, + "eval_samples_per_second": 10.124, + "eval_steps_per_second": 5.062, + "step": 2840 + }, + { + "epoch": 1.0, + "grad_norm": 0.1888989806175232, + "learning_rate": 0.0001708182809747459, + "loss": 0.954, + "step": 2841 + }, + { + "epoch": 1.0, + "grad_norm": 0.1919054090976715, + "learning_rate": 0.00017079872902815084, + "loss": 0.9808, + "step": 2842 + }, + { + "epoch": 1.0, + "grad_norm": 0.20058830082416534, + "learning_rate": 0.00017077917165355823, + "loss": 1.0177, + "step": 2843 + }, + { + "epoch": 1.0, + "grad_norm": 0.17403985559940338, + "learning_rate": 0.00017075960885246745, + "loss": 0.8599, + "step": 2844 + }, + { + "epoch": 1.0, + "grad_norm": 0.19590140879154205, + "learning_rate": 0.00017074004062637833, + "loss": 0.9614, + "step": 2845 + }, + { + "epoch": 1.0, + "grad_norm": 0.20473170280456543, + "learning_rate": 0.00017072046697679118, + "loss": 0.9607, + "step": 2846 + }, + { + "epoch": 1.0, + "grad_norm": 0.1959337592124939, + "learning_rate": 0.00017070088790520664, + "loss": 0.9394, + "step": 2847 + }, + { + "epoch": 1.0, + "grad_norm": 0.17946115136146545, + "learning_rate": 0.0001706813034131258, + "loss": 0.9491, + "step": 2848 + }, + { + "epoch": 1.0, + "grad_norm": 0.1879536658525467, + "learning_rate": 0.00017066171350205012, + "loss": 0.931, + "step": 2849 + }, + { + "epoch": 1.0, + "grad_norm": 0.18565255403518677, + "learning_rate": 0.00017064211817348164, + "loss": 0.8425, + "step": 2850 + }, + { + "epoch": 1.0, + "grad_norm": 0.18562796711921692, + "learning_rate": 0.00017062251742892256, + "loss": 0.9054, + "step": 2851 + }, + { + "epoch": 1.0, + "grad_norm": 0.1931876242160797, + "learning_rate": 0.00017060291126987572, + "loss": 0.9934, + "step": 2852 + }, + { + "epoch": 1.0, + "grad_norm": 0.19624903798103333, + "learning_rate": 0.00017058329969784426, + "loss": 0.9625, + "step": 2853 + }, + { + "epoch": 1.01, + "grad_norm": 0.19228866696357727, + "learning_rate": 0.00017056368271433175, + "loss": 0.9353, + "step": 2854 + }, + { + "epoch": 1.01, + "grad_norm": 0.19154101610183716, + "learning_rate": 0.0001705440603208422, + "loss": 0.9374, + "step": 2855 + }, + { + "epoch": 1.01, + "grad_norm": 0.18350714445114136, + "learning_rate": 0.00017052443251888, + "loss": 0.9005, + "step": 2856 + }, + { + "epoch": 1.01, + "grad_norm": 0.18323072791099548, + "learning_rate": 0.00017050479930995, + "loss": 0.9087, + "step": 2857 + }, + { + "epoch": 1.01, + "grad_norm": 0.18845704197883606, + "learning_rate": 0.0001704851606955574, + "loss": 0.9322, + "step": 2858 + }, + { + "epoch": 1.01, + "grad_norm": 0.19670087099075317, + "learning_rate": 0.00017046551667720787, + "loss": 0.9987, + "step": 2859 + }, + { + "epoch": 1.01, + "grad_norm": 0.18626806139945984, + "learning_rate": 0.00017044586725640756, + "loss": 0.9809, + "step": 2860 + }, + { + "epoch": 1.01, + "grad_norm": 0.19881881773471832, + "learning_rate": 0.00017042621243466277, + "loss": 0.9545, + "step": 2861 + }, + { + "epoch": 1.01, + "grad_norm": 0.19658763706684113, + "learning_rate": 0.00017040655221348057, + "loss": 0.9976, + "step": 2862 + }, + { + "epoch": 1.01, + "grad_norm": 0.1838339865207672, + "learning_rate": 0.0001703868865943682, + "loss": 0.9416, + "step": 2863 + }, + { + "epoch": 1.01, + "grad_norm": 0.18808238208293915, + "learning_rate": 0.00017036721557883334, + "loss": 0.9292, + "step": 2864 + }, + { + "epoch": 1.01, + "grad_norm": 0.19379892945289612, + "learning_rate": 0.00017034753916838423, + "loss": 1.0161, + "step": 2865 + }, + { + "epoch": 1.01, + "grad_norm": 0.1916080266237259, + "learning_rate": 0.00017032785736452937, + "loss": 0.9748, + "step": 2866 + }, + { + "epoch": 1.01, + "grad_norm": 0.19743172824382782, + "learning_rate": 0.00017030817016877768, + "loss": 1.0049, + "step": 2867 + }, + { + "epoch": 1.01, + "grad_norm": 0.18762719631195068, + "learning_rate": 0.0001702884775826386, + "loss": 0.915, + "step": 2868 + }, + { + "epoch": 1.01, + "grad_norm": 0.17494773864746094, + "learning_rate": 0.0001702687796076219, + "loss": 0.9419, + "step": 2869 + }, + { + "epoch": 1.01, + "grad_norm": 0.18630997836589813, + "learning_rate": 0.0001702490762452378, + "loss": 0.9856, + "step": 2870 + }, + { + "epoch": 1.01, + "grad_norm": 0.1823118031024933, + "learning_rate": 0.0001702293674969969, + "loss": 0.9582, + "step": 2871 + }, + { + "epoch": 1.01, + "grad_norm": 0.201270192861557, + "learning_rate": 0.0001702096533644102, + "loss": 1.0076, + "step": 2872 + }, + { + "epoch": 1.01, + "grad_norm": 0.17998550832271576, + "learning_rate": 0.00017018993384898922, + "loss": 0.8525, + "step": 2873 + }, + { + "epoch": 1.01, + "grad_norm": 0.19493627548217773, + "learning_rate": 0.00017017020895224573, + "loss": 0.9494, + "step": 2874 + }, + { + "epoch": 1.01, + "grad_norm": 0.18817171454429626, + "learning_rate": 0.0001701504786756921, + "loss": 0.9834, + "step": 2875 + }, + { + "epoch": 1.01, + "grad_norm": 0.1890181452035904, + "learning_rate": 0.0001701307430208409, + "loss": 0.939, + "step": 2876 + }, + { + "epoch": 1.01, + "grad_norm": 0.186468243598938, + "learning_rate": 0.0001701110019892053, + "loss": 0.8638, + "step": 2877 + }, + { + "epoch": 1.01, + "grad_norm": 0.192847341299057, + "learning_rate": 0.00017009125558229874, + "loss": 0.9671, + "step": 2878 + }, + { + "epoch": 1.01, + "grad_norm": 0.18834351003170013, + "learning_rate": 0.00017007150380163522, + "loss": 0.9366, + "step": 2879 + }, + { + "epoch": 1.01, + "grad_norm": 0.1983274221420288, + "learning_rate": 0.00017005174664872904, + "loss": 0.9598, + "step": 2880 + }, + { + "epoch": 1.01, + "grad_norm": 0.18392324447631836, + "learning_rate": 0.00017003198412509488, + "loss": 0.9406, + "step": 2881 + }, + { + "epoch": 1.01, + "grad_norm": 0.19125406444072723, + "learning_rate": 0.00017001221623224797, + "loss": 0.9212, + "step": 2882 + }, + { + "epoch": 1.02, + "grad_norm": 0.19399990141391754, + "learning_rate": 0.00016999244297170383, + "loss": 0.9889, + "step": 2883 + }, + { + "epoch": 1.02, + "grad_norm": 0.18320515751838684, + "learning_rate": 0.00016997266434497848, + "loss": 0.9134, + "step": 2884 + }, + { + "epoch": 1.02, + "grad_norm": 0.18350949883460999, + "learning_rate": 0.00016995288035358828, + "loss": 0.9717, + "step": 2885 + }, + { + "epoch": 1.02, + "grad_norm": 0.18024645745754242, + "learning_rate": 0.00016993309099905003, + "loss": 0.9331, + "step": 2886 + }, + { + "epoch": 1.02, + "grad_norm": 0.18857190012931824, + "learning_rate": 0.00016991329628288093, + "loss": 0.9852, + "step": 2887 + }, + { + "epoch": 1.02, + "grad_norm": 0.18607757985591888, + "learning_rate": 0.00016989349620659866, + "loss": 0.9123, + "step": 2888 + }, + { + "epoch": 1.02, + "grad_norm": 0.18504849076271057, + "learning_rate": 0.00016987369077172115, + "loss": 0.9228, + "step": 2889 + }, + { + "epoch": 1.02, + "grad_norm": 0.18795835971832275, + "learning_rate": 0.00016985387997976693, + "loss": 0.889, + "step": 2890 + }, + { + "epoch": 1.02, + "grad_norm": 0.19008661806583405, + "learning_rate": 0.0001698340638322548, + "loss": 0.9829, + "step": 2891 + }, + { + "epoch": 1.02, + "grad_norm": 0.19802813231945038, + "learning_rate": 0.00016981424233070404, + "loss": 0.9692, + "step": 2892 + }, + { + "epoch": 1.02, + "grad_norm": 0.20054937899112701, + "learning_rate": 0.00016979441547663435, + "loss": 0.939, + "step": 2893 + }, + { + "epoch": 1.02, + "grad_norm": 0.18934161961078644, + "learning_rate": 0.00016977458327156582, + "loss": 0.9009, + "step": 2894 + }, + { + "epoch": 1.02, + "grad_norm": 0.1872309148311615, + "learning_rate": 0.0001697547457170189, + "loss": 0.9415, + "step": 2895 + }, + { + "epoch": 1.02, + "grad_norm": 0.1852417290210724, + "learning_rate": 0.00016973490281451455, + "loss": 0.9377, + "step": 2896 + }, + { + "epoch": 1.02, + "grad_norm": 0.1943930685520172, + "learning_rate": 0.000169715054565574, + "loss": 0.9183, + "step": 2897 + }, + { + "epoch": 1.0, + "grad_norm": 0.23046432435512543, + "learning_rate": 0.00016969520097171906, + "loss": 0.9881, + "step": 2898 + }, + { + "epoch": 1.0, + "grad_norm": 0.1862451732158661, + "learning_rate": 0.00016967534203447184, + "loss": 0.8953, + "step": 2899 + }, + { + "epoch": 1.0, + "grad_norm": 0.18676815927028656, + "learning_rate": 0.00016965547775535492, + "loss": 0.9184, + "step": 2900 + }, + { + "epoch": 1.0, + "grad_norm": 0.18597565591335297, + "learning_rate": 0.00016963560813589116, + "loss": 0.9172, + "step": 2901 + }, + { + "epoch": 1.0, + "grad_norm": 0.1836593896150589, + "learning_rate": 0.000169615733177604, + "loss": 0.9615, + "step": 2902 + }, + { + "epoch": 1.0, + "grad_norm": 0.2000277191400528, + "learning_rate": 0.00016959585288201719, + "loss": 0.9409, + "step": 2903 + }, + { + "epoch": 1.0, + "grad_norm": 0.1949191689491272, + "learning_rate": 0.0001695759672506549, + "loss": 0.8877, + "step": 2904 + }, + { + "epoch": 1.0, + "grad_norm": 0.19184264540672302, + "learning_rate": 0.0001695560762850418, + "loss": 0.9198, + "step": 2905 + }, + { + "epoch": 1.0, + "grad_norm": 0.18635979294776917, + "learning_rate": 0.00016953617998670279, + "loss": 0.9238, + "step": 2906 + }, + { + "epoch": 1.0, + "grad_norm": 0.18243515491485596, + "learning_rate": 0.0001695162783571633, + "loss": 0.9363, + "step": 2907 + }, + { + "epoch": 1.0, + "grad_norm": 0.1891496181488037, + "learning_rate": 0.0001694963713979492, + "loss": 0.9342, + "step": 2908 + }, + { + "epoch": 1.0, + "grad_norm": 0.19546544551849365, + "learning_rate": 0.00016947645911058668, + "loss": 0.944, + "step": 2909 + }, + { + "epoch": 1.0, + "grad_norm": 0.18900978565216064, + "learning_rate": 0.00016945654149660237, + "loss": 0.8945, + "step": 2910 + }, + { + "epoch": 1.0, + "grad_norm": 0.1874096542596817, + "learning_rate": 0.0001694366185575233, + "loss": 0.9204, + "step": 2911 + }, + { + "epoch": 1.01, + "grad_norm": 0.18490877747535706, + "learning_rate": 0.00016941669029487698, + "loss": 0.9533, + "step": 2912 + }, + { + "epoch": 1.01, + "grad_norm": 0.18125925958156586, + "learning_rate": 0.00016939675671019124, + "loss": 0.8697, + "step": 2913 + }, + { + "epoch": 1.01, + "grad_norm": 0.19065441191196442, + "learning_rate": 0.00016937681780499432, + "loss": 0.887, + "step": 2914 + }, + { + "epoch": 1.01, + "grad_norm": 0.18740889430046082, + "learning_rate": 0.00016935687358081493, + "loss": 0.9241, + "step": 2915 + }, + { + "epoch": 1.01, + "grad_norm": 0.1898462325334549, + "learning_rate": 0.00016933692403918219, + "loss": 0.9273, + "step": 2916 + }, + { + "epoch": 1.01, + "grad_norm": 0.19207645952701569, + "learning_rate": 0.00016931696918162547, + "loss": 0.9765, + "step": 2917 + }, + { + "epoch": 1.01, + "grad_norm": 0.19262659549713135, + "learning_rate": 0.0001692970090096748, + "loss": 0.9454, + "step": 2918 + }, + { + "epoch": 1.01, + "grad_norm": 0.19295157492160797, + "learning_rate": 0.00016927704352486038, + "loss": 0.9456, + "step": 2919 + }, + { + "epoch": 1.01, + "grad_norm": 0.20015589892864227, + "learning_rate": 0.000169257072728713, + "loss": 1.0075, + "step": 2920 + }, + { + "epoch": 1.01, + "grad_norm": 0.177590474486351, + "learning_rate": 0.00016923709662276378, + "loss": 0.8629, + "step": 2921 + }, + { + "epoch": 1.01, + "grad_norm": 0.19473347067832947, + "learning_rate": 0.00016921711520854422, + "loss": 0.9014, + "step": 2922 + }, + { + "epoch": 1.01, + "grad_norm": 0.19092902541160583, + "learning_rate": 0.0001691971284875862, + "loss": 0.9194, + "step": 2923 + }, + { + "epoch": 1.01, + "grad_norm": 0.18545444309711456, + "learning_rate": 0.00016917713646142222, + "loss": 0.9279, + "step": 2924 + }, + { + "epoch": 1.01, + "grad_norm": 0.19252456724643707, + "learning_rate": 0.00016915713913158485, + "loss": 0.9169, + "step": 2925 + }, + { + "epoch": 1.01, + "grad_norm": 0.18508480489253998, + "learning_rate": 0.00016913713649960738, + "loss": 0.8767, + "step": 2926 + }, + { + "epoch": 1.01, + "grad_norm": 0.190322607755661, + "learning_rate": 0.0001691171285670233, + "loss": 0.8883, + "step": 2927 + }, + { + "epoch": 1.01, + "grad_norm": 0.19125689566135406, + "learning_rate": 0.0001690971153353666, + "loss": 0.986, + "step": 2928 + }, + { + "epoch": 1.01, + "grad_norm": 0.19923120737075806, + "learning_rate": 0.00016907709680617162, + "loss": 0.9492, + "step": 2929 + }, + { + "epoch": 1.01, + "grad_norm": 0.18744708597660065, + "learning_rate": 0.00016905707298097323, + "loss": 0.8807, + "step": 2930 + }, + { + "epoch": 1.01, + "grad_norm": 0.19923172891139984, + "learning_rate": 0.00016903704386130652, + "loss": 0.9619, + "step": 2931 + }, + { + "epoch": 1.01, + "grad_norm": 0.17846401035785675, + "learning_rate": 0.00016901700944870713, + "loss": 0.8608, + "step": 2932 + }, + { + "epoch": 1.01, + "grad_norm": 0.18846310675144196, + "learning_rate": 0.00016899696974471105, + "loss": 0.9406, + "step": 2933 + }, + { + "epoch": 1.01, + "grad_norm": 0.1996917575597763, + "learning_rate": 0.00016897692475085468, + "loss": 0.9684, + "step": 2934 + }, + { + "epoch": 1.01, + "grad_norm": 0.18243834376335144, + "learning_rate": 0.00016895687446867485, + "loss": 0.8787, + "step": 2935 + }, + { + "epoch": 1.01, + "grad_norm": 0.18757185339927673, + "learning_rate": 0.00016893681889970874, + "loss": 0.9692, + "step": 2936 + }, + { + "epoch": 1.01, + "grad_norm": 0.1936056762933731, + "learning_rate": 0.00016891675804549402, + "loss": 1.0149, + "step": 2937 + }, + { + "epoch": 1.01, + "grad_norm": 0.18731190264225006, + "learning_rate": 0.00016889669190756868, + "loss": 0.9039, + "step": 2938 + }, + { + "epoch": 1.01, + "grad_norm": 0.19040818512439728, + "learning_rate": 0.00016887662048747112, + "loss": 0.9222, + "step": 2939 + }, + { + "epoch": 1.02, + "grad_norm": 0.19378627836704254, + "learning_rate": 0.00016885654378674024, + "loss": 0.9511, + "step": 2940 + }, + { + "epoch": 1.02, + "grad_norm": 0.1890939325094223, + "learning_rate": 0.00016883646180691525, + "loss": 0.9387, + "step": 2941 + }, + { + "epoch": 1.02, + "grad_norm": 0.19378086924552917, + "learning_rate": 0.00016881637454953583, + "loss": 0.8926, + "step": 2942 + }, + { + "epoch": 1.02, + "grad_norm": 0.18797039985656738, + "learning_rate": 0.00016879628201614194, + "loss": 0.9298, + "step": 2943 + }, + { + "epoch": 1.02, + "grad_norm": 0.21416975557804108, + "learning_rate": 0.00016877618420827413, + "loss": 0.9618, + "step": 2944 + }, + { + "epoch": 1.02, + "grad_norm": 0.18781521916389465, + "learning_rate": 0.00016875608112747322, + "loss": 0.898, + "step": 2945 + }, + { + "epoch": 1.02, + "grad_norm": 0.18484070897102356, + "learning_rate": 0.00016873597277528046, + "loss": 0.9284, + "step": 2946 + }, + { + "epoch": 1.02, + "grad_norm": 0.20203427970409393, + "learning_rate": 0.00016871585915323757, + "loss": 0.9545, + "step": 2947 + }, + { + "epoch": 1.02, + "grad_norm": 0.19452163577079773, + "learning_rate": 0.00016869574026288654, + "loss": 0.9575, + "step": 2948 + }, + { + "epoch": 1.02, + "grad_norm": 0.1887548416852951, + "learning_rate": 0.0001686756161057699, + "loss": 0.9401, + "step": 2949 + }, + { + "epoch": 1.02, + "grad_norm": 0.19443704187870026, + "learning_rate": 0.00016865548668343054, + "loss": 0.9254, + "step": 2950 + }, + { + "epoch": 1.02, + "grad_norm": 0.19337326288223267, + "learning_rate": 0.0001686353519974117, + "loss": 0.9227, + "step": 2951 + }, + { + "epoch": 1.02, + "grad_norm": 0.2151598483324051, + "learning_rate": 0.0001686152120492571, + "loss": 0.8004, + "step": 2952 + }, + { + "epoch": 1.02, + "grad_norm": 0.19231447577476501, + "learning_rate": 0.0001685950668405108, + "loss": 0.9479, + "step": 2953 + }, + { + "epoch": 1.02, + "grad_norm": 0.1957997977733612, + "learning_rate": 0.00016857491637271732, + "loss": 0.9058, + "step": 2954 + }, + { + "epoch": 1.02, + "grad_norm": 0.19956451654434204, + "learning_rate": 0.00016855476064742155, + "loss": 0.9873, + "step": 2955 + }, + { + "epoch": 1.02, + "grad_norm": 0.18961627781391144, + "learning_rate": 0.00016853459966616877, + "loss": 0.8979, + "step": 2956 + }, + { + "epoch": 1.02, + "grad_norm": 0.19322636723518372, + "learning_rate": 0.00016851443343050474, + "loss": 0.9682, + "step": 2957 + }, + { + "epoch": 1.02, + "grad_norm": 0.20402783155441284, + "learning_rate": 0.0001684942619419755, + "loss": 0.8716, + "step": 2958 + }, + { + "epoch": 1.02, + "grad_norm": 0.19145797193050385, + "learning_rate": 0.00016847408520212758, + "loss": 0.9521, + "step": 2959 + }, + { + "epoch": 1.02, + "grad_norm": 0.18774887919425964, + "learning_rate": 0.00016845390321250787, + "loss": 0.8892, + "step": 2960 + }, + { + "epoch": 1.02, + "grad_norm": 0.19743181765079498, + "learning_rate": 0.00016843371597466373, + "loss": 0.96, + "step": 2961 + }, + { + "epoch": 1.02, + "grad_norm": 0.2033877819776535, + "learning_rate": 0.00016841352349014285, + "loss": 0.9533, + "step": 2962 + }, + { + "epoch": 1.02, + "grad_norm": 0.19369393587112427, + "learning_rate": 0.00016839332576049334, + "loss": 0.9296, + "step": 2963 + }, + { + "epoch": 1.02, + "grad_norm": 0.1883126199245453, + "learning_rate": 0.0001683731227872637, + "loss": 0.9543, + "step": 2964 + }, + { + "epoch": 1.02, + "grad_norm": 0.1869036704301834, + "learning_rate": 0.00016835291457200293, + "loss": 0.9109, + "step": 2965 + }, + { + "epoch": 1.02, + "grad_norm": 0.18386952579021454, + "learning_rate": 0.00016833270111626028, + "loss": 0.9149, + "step": 2966 + }, + { + "epoch": 1.02, + "grad_norm": 0.18549568951129913, + "learning_rate": 0.0001683124824215855, + "loss": 0.9119, + "step": 2967 + }, + { + "epoch": 1.02, + "grad_norm": 0.18437707424163818, + "learning_rate": 0.0001682922584895287, + "loss": 0.9272, + "step": 2968 + }, + { + "epoch": 1.03, + "grad_norm": 0.1883966028690338, + "learning_rate": 0.00016827202932164046, + "loss": 0.9637, + "step": 2969 + }, + { + "epoch": 1.03, + "grad_norm": 0.1802394688129425, + "learning_rate": 0.00016825179491947165, + "loss": 0.9544, + "step": 2970 + }, + { + "epoch": 1.03, + "grad_norm": 0.1978726089000702, + "learning_rate": 0.0001682315552845736, + "loss": 0.933, + "step": 2971 + }, + { + "epoch": 1.03, + "grad_norm": 0.19730928540229797, + "learning_rate": 0.00016821131041849812, + "loss": 0.8994, + "step": 2972 + }, + { + "epoch": 1.03, + "grad_norm": 0.18910084664821625, + "learning_rate": 0.00016819106032279723, + "loss": 0.9299, + "step": 2973 + }, + { + "epoch": 1.03, + "grad_norm": 0.19059893488883972, + "learning_rate": 0.0001681708049990236, + "loss": 0.9231, + "step": 2974 + }, + { + "epoch": 1.03, + "grad_norm": 0.19314156472682953, + "learning_rate": 0.00016815054444873002, + "loss": 0.9978, + "step": 2975 + }, + { + "epoch": 1.03, + "grad_norm": 0.1947154849767685, + "learning_rate": 0.00016813027867346994, + "loss": 0.9429, + "step": 2976 + }, + { + "epoch": 1.03, + "grad_norm": 0.19466917216777802, + "learning_rate": 0.000168110007674797, + "loss": 0.9488, + "step": 2977 + }, + { + "epoch": 1.03, + "grad_norm": 0.19159191846847534, + "learning_rate": 0.00016808973145426547, + "loss": 0.9295, + "step": 2978 + }, + { + "epoch": 1.03, + "grad_norm": 0.19307000935077667, + "learning_rate": 0.00016806945001342975, + "loss": 0.9588, + "step": 2979 + }, + { + "epoch": 1.03, + "grad_norm": 0.19548317790031433, + "learning_rate": 0.00016804916335384486, + "loss": 0.9053, + "step": 2980 + }, + { + "epoch": 1.03, + "grad_norm": 0.19848956167697906, + "learning_rate": 0.0001680288714770661, + "loss": 0.9296, + "step": 2981 + }, + { + "epoch": 1.03, + "grad_norm": 0.1890234500169754, + "learning_rate": 0.00016800857438464923, + "loss": 0.9, + "step": 2982 + }, + { + "epoch": 1.03, + "grad_norm": 0.1944311261177063, + "learning_rate": 0.0001679882720781504, + "loss": 0.8962, + "step": 2983 + }, + { + "epoch": 1.03, + "grad_norm": 0.19622524082660675, + "learning_rate": 0.0001679679645591261, + "loss": 0.9182, + "step": 2984 + }, + { + "epoch": 1.03, + "grad_norm": 0.18798577785491943, + "learning_rate": 0.00016794765182913328, + "loss": 0.9858, + "step": 2985 + }, + { + "epoch": 1.03, + "grad_norm": 0.19635619223117828, + "learning_rate": 0.00016792733388972932, + "loss": 0.904, + "step": 2986 + }, + { + "epoch": 1.03, + "grad_norm": 0.19752800464630127, + "learning_rate": 0.0001679070107424719, + "loss": 0.9645, + "step": 2987 + }, + { + "epoch": 1.03, + "grad_norm": 0.19207364320755005, + "learning_rate": 0.00016788668238891923, + "loss": 0.8897, + "step": 2988 + }, + { + "epoch": 1.03, + "grad_norm": 0.1890457272529602, + "learning_rate": 0.00016786634883062974, + "loss": 0.9724, + "step": 2989 + }, + { + "epoch": 1.03, + "grad_norm": 0.1933799535036087, + "learning_rate": 0.00016784601006916244, + "loss": 0.9378, + "step": 2990 + }, + { + "epoch": 1.03, + "grad_norm": 0.18908622860908508, + "learning_rate": 0.00016782566610607663, + "loss": 0.9372, + "step": 2991 + }, + { + "epoch": 1.03, + "grad_norm": 0.2023576945066452, + "learning_rate": 0.00016780531694293206, + "loss": 0.994, + "step": 2992 + }, + { + "epoch": 1.03, + "grad_norm": 0.18966807425022125, + "learning_rate": 0.00016778496258128888, + "loss": 0.8883, + "step": 2993 + }, + { + "epoch": 1.03, + "grad_norm": 0.20282705128192902, + "learning_rate": 0.00016776460302270758, + "loss": 0.9586, + "step": 2994 + }, + { + "epoch": 1.03, + "grad_norm": 0.19105058908462524, + "learning_rate": 0.00016774423826874908, + "loss": 0.972, + "step": 2995 + }, + { + "epoch": 1.03, + "grad_norm": 0.19293305277824402, + "learning_rate": 0.0001677238683209747, + "loss": 0.9297, + "step": 2996 + }, + { + "epoch": 1.04, + "grad_norm": 0.19731226563453674, + "learning_rate": 0.00016770349318094623, + "loss": 0.8922, + "step": 2997 + }, + { + "epoch": 1.04, + "grad_norm": 0.1933281421661377, + "learning_rate": 0.00016768311285022573, + "loss": 0.9455, + "step": 2998 + }, + { + "epoch": 1.04, + "grad_norm": 0.19647730886936188, + "learning_rate": 0.00016766272733037576, + "loss": 0.9428, + "step": 2999 + }, + { + "epoch": 1.04, + "grad_norm": 0.1985509991645813, + "learning_rate": 0.00016764233662295918, + "loss": 0.9212, + "step": 3000 + }, + { + "epoch": 1.04, + "grad_norm": 0.18913675844669342, + "learning_rate": 0.00016762194072953937, + "loss": 0.8769, + "step": 3001 + }, + { + "epoch": 1.04, + "grad_norm": 0.19756121933460236, + "learning_rate": 0.00016760153965167995, + "loss": 0.8908, + "step": 3002 + }, + { + "epoch": 1.04, + "grad_norm": 0.19995391368865967, + "learning_rate": 0.00016758113339094516, + "loss": 0.9668, + "step": 3003 + }, + { + "epoch": 1.04, + "grad_norm": 0.19399406015872955, + "learning_rate": 0.00016756072194889943, + "loss": 0.9075, + "step": 3004 + }, + { + "epoch": 1.04, + "grad_norm": 0.19179430603981018, + "learning_rate": 0.00016754030532710763, + "loss": 0.8711, + "step": 3005 + }, + { + "epoch": 1.04, + "grad_norm": 0.19073912501335144, + "learning_rate": 0.00016751988352713511, + "loss": 0.9328, + "step": 3006 + }, + { + "epoch": 1.04, + "grad_norm": 0.18982064723968506, + "learning_rate": 0.00016749945655054763, + "loss": 0.9255, + "step": 3007 + }, + { + "epoch": 1.04, + "grad_norm": 0.1822153478860855, + "learning_rate": 0.00016747902439891117, + "loss": 0.9393, + "step": 3008 + }, + { + "epoch": 1.04, + "grad_norm": 0.18544809520244598, + "learning_rate": 0.00016745858707379227, + "loss": 0.8614, + "step": 3009 + }, + { + "epoch": 1.04, + "grad_norm": 0.19058829545974731, + "learning_rate": 0.00016743814457675785, + "loss": 0.9478, + "step": 3010 + }, + { + "epoch": 1.04, + "grad_norm": 0.19758155941963196, + "learning_rate": 0.00016741769690937512, + "loss": 0.9714, + "step": 3011 + }, + { + "epoch": 1.04, + "grad_norm": 0.1888519525527954, + "learning_rate": 0.00016739724407321185, + "loss": 0.8456, + "step": 3012 + }, + { + "epoch": 1.04, + "grad_norm": 0.19396807253360748, + "learning_rate": 0.00016737678606983607, + "loss": 0.8806, + "step": 3013 + }, + { + "epoch": 1.04, + "grad_norm": 0.20305144786834717, + "learning_rate": 0.00016735632290081624, + "loss": 0.9567, + "step": 3014 + }, + { + "epoch": 1.04, + "grad_norm": 0.1895797699689865, + "learning_rate": 0.00016733585456772128, + "loss": 0.9095, + "step": 3015 + }, + { + "epoch": 1.04, + "grad_norm": 0.1943138986825943, + "learning_rate": 0.00016731538107212039, + "loss": 0.9681, + "step": 3016 + }, + { + "epoch": 1.04, + "grad_norm": 0.18373315036296844, + "learning_rate": 0.0001672949024155833, + "loss": 0.9072, + "step": 3017 + }, + { + "epoch": 1.04, + "grad_norm": 0.19009727239608765, + "learning_rate": 0.00016727441859968004, + "loss": 0.9642, + "step": 3018 + }, + { + "epoch": 1.04, + "grad_norm": 0.19613289833068848, + "learning_rate": 0.00016725392962598106, + "loss": 0.9783, + "step": 3019 + }, + { + "epoch": 1.04, + "grad_norm": 0.18772883713245392, + "learning_rate": 0.0001672334354960572, + "loss": 0.8989, + "step": 3020 + }, + { + "epoch": 1.04, + "grad_norm": 0.19232602417469025, + "learning_rate": 0.00016721293621147968, + "loss": 0.9428, + "step": 3021 + }, + { + "epoch": 1.04, + "grad_norm": 0.19024088978767395, + "learning_rate": 0.00016719243177382024, + "loss": 0.8916, + "step": 3022 + }, + { + "epoch": 1.04, + "grad_norm": 0.19054242968559265, + "learning_rate": 0.00016717192218465077, + "loss": 0.9239, + "step": 3023 + }, + { + "epoch": 1.04, + "grad_norm": 0.2045336663722992, + "learning_rate": 0.00016715140744554384, + "loss": 0.9251, + "step": 3024 + }, + { + "epoch": 1.04, + "grad_norm": 0.19345614314079285, + "learning_rate": 0.0001671308875580722, + "loss": 0.9296, + "step": 3025 + }, + { + "epoch": 1.05, + "grad_norm": 0.20464305579662323, + "learning_rate": 0.00016711036252380909, + "loss": 0.9632, + "step": 3026 + }, + { + "epoch": 1.05, + "grad_norm": 0.1874096393585205, + "learning_rate": 0.00016708983234432808, + "loss": 0.8621, + "step": 3027 + }, + { + "epoch": 1.05, + "grad_norm": 0.19258873164653778, + "learning_rate": 0.00016706929702120323, + "loss": 0.9658, + "step": 3028 + }, + { + "epoch": 1.05, + "grad_norm": 0.19458803534507751, + "learning_rate": 0.00016704875655600897, + "loss": 0.9901, + "step": 3029 + }, + { + "epoch": 1.05, + "grad_norm": 0.19785954058170319, + "learning_rate": 0.00016702821095031998, + "loss": 0.9192, + "step": 3030 + }, + { + "epoch": 1.05, + "grad_norm": 0.20203198492527008, + "learning_rate": 0.00016700766020571157, + "loss": 0.9488, + "step": 3031 + }, + { + "epoch": 1.05, + "grad_norm": 0.18694931268692017, + "learning_rate": 0.00016698710432375925, + "loss": 0.9295, + "step": 3032 + }, + { + "epoch": 1.05, + "grad_norm": 0.2125498652458191, + "learning_rate": 0.000166966543306039, + "loss": 0.9405, + "step": 3033 + }, + { + "epoch": 1.05, + "grad_norm": 0.1959962248802185, + "learning_rate": 0.00016694597715412727, + "loss": 0.9279, + "step": 3034 + }, + { + "epoch": 1.05, + "grad_norm": 0.19424889981746674, + "learning_rate": 0.00016692540586960074, + "loss": 0.9329, + "step": 3035 + }, + { + "epoch": 1.05, + "grad_norm": 0.1940089464187622, + "learning_rate": 0.0001669048294540366, + "loss": 0.9566, + "step": 3036 + }, + { + "epoch": 1.05, + "grad_norm": 0.1906086653470993, + "learning_rate": 0.00016688424790901242, + "loss": 0.9052, + "step": 3037 + }, + { + "epoch": 1.05, + "grad_norm": 0.19040682911872864, + "learning_rate": 0.00016686366123610613, + "loss": 0.921, + "step": 3038 + }, + { + "epoch": 1.05, + "grad_norm": 0.20232956111431122, + "learning_rate": 0.00016684306943689608, + "loss": 0.9927, + "step": 3039 + }, + { + "epoch": 1.05, + "grad_norm": 0.20033952593803406, + "learning_rate": 0.000166822472512961, + "loss": 0.925, + "step": 3040 + }, + { + "epoch": 1.05, + "grad_norm": 0.1834961622953415, + "learning_rate": 0.00016680187046588, + "loss": 0.8689, + "step": 3041 + }, + { + "epoch": 1.05, + "grad_norm": 0.19811643660068512, + "learning_rate": 0.00016678126329723258, + "loss": 0.9067, + "step": 3042 + }, + { + "epoch": 1.05, + "grad_norm": 0.1838274598121643, + "learning_rate": 0.00016676065100859868, + "loss": 0.9123, + "step": 3043 + }, + { + "epoch": 1.05, + "grad_norm": 0.19470977783203125, + "learning_rate": 0.00016674003360155862, + "loss": 0.9938, + "step": 3044 + }, + { + "epoch": 1.05, + "grad_norm": 0.1993788778781891, + "learning_rate": 0.00016671941107769306, + "loss": 0.945, + "step": 3045 + }, + { + "epoch": 1.05, + "grad_norm": 0.19459454715251923, + "learning_rate": 0.00016669878343858309, + "loss": 0.8592, + "step": 3046 + }, + { + "epoch": 1.05, + "grad_norm": 0.1934826523065567, + "learning_rate": 0.00016667815068581026, + "loss": 0.8991, + "step": 3047 + }, + { + "epoch": 1.05, + "grad_norm": 0.19478553533554077, + "learning_rate": 0.00016665751282095634, + "loss": 0.933, + "step": 3048 + }, + { + "epoch": 1.05, + "grad_norm": 0.18333174288272858, + "learning_rate": 0.00016663686984560365, + "loss": 0.8764, + "step": 3049 + }, + { + "epoch": 1.05, + "grad_norm": 0.18906213343143463, + "learning_rate": 0.00016661622176133482, + "loss": 0.8985, + "step": 3050 + }, + { + "epoch": 1.05, + "grad_norm": 0.19240978360176086, + "learning_rate": 0.0001665955685697329, + "loss": 0.9382, + "step": 3051 + }, + { + "epoch": 1.05, + "grad_norm": 0.18707260489463806, + "learning_rate": 0.00016657491027238138, + "loss": 0.9029, + "step": 3052 + }, + { + "epoch": 1.05, + "grad_norm": 0.19875676929950714, + "learning_rate": 0.00016655424687086402, + "loss": 0.9134, + "step": 3053 + }, + { + "epoch": 1.06, + "grad_norm": 0.20501351356506348, + "learning_rate": 0.0001665335783667651, + "loss": 0.9141, + "step": 3054 + }, + { + "epoch": 1.06, + "grad_norm": 0.2028193324804306, + "learning_rate": 0.0001665129047616692, + "loss": 0.9328, + "step": 3055 + }, + { + "epoch": 1.06, + "grad_norm": 0.1942424327135086, + "learning_rate": 0.00016649222605716132, + "loss": 0.9344, + "step": 3056 + }, + { + "epoch": 1.06, + "grad_norm": 0.19648103415966034, + "learning_rate": 0.00016647154225482688, + "loss": 0.9037, + "step": 3057 + }, + { + "epoch": 1.06, + "grad_norm": 0.19552026689052582, + "learning_rate": 0.00016645085335625166, + "loss": 0.9409, + "step": 3058 + }, + { + "epoch": 1.06, + "grad_norm": 0.20716822147369385, + "learning_rate": 0.0001664301593630218, + "loss": 0.9503, + "step": 3059 + }, + { + "epoch": 1.06, + "grad_norm": 0.19085535407066345, + "learning_rate": 0.00016640946027672392, + "loss": 0.9805, + "step": 3060 + }, + { + "epoch": 1.06, + "grad_norm": 0.19183199107646942, + "learning_rate": 0.00016638875609894494, + "loss": 0.9525, + "step": 3061 + }, + { + "epoch": 1.06, + "grad_norm": 0.20048820972442627, + "learning_rate": 0.00016636804683127222, + "loss": 0.9452, + "step": 3062 + }, + { + "epoch": 1.06, + "grad_norm": 0.19393523037433624, + "learning_rate": 0.0001663473324752935, + "loss": 0.9248, + "step": 3063 + }, + { + "epoch": 1.06, + "grad_norm": 0.19443608820438385, + "learning_rate": 0.00016632661303259691, + "loss": 0.9816, + "step": 3064 + }, + { + "epoch": 1.06, + "grad_norm": 0.20129789412021637, + "learning_rate": 0.00016630588850477098, + "loss": 0.9673, + "step": 3065 + }, + { + "epoch": 1.06, + "grad_norm": 0.19859552383422852, + "learning_rate": 0.0001662851588934046, + "loss": 0.9159, + "step": 3066 + }, + { + "epoch": 1.06, + "grad_norm": 0.20427550375461578, + "learning_rate": 0.00016626442420008706, + "loss": 0.9684, + "step": 3067 + }, + { + "epoch": 1.06, + "grad_norm": 0.2060311883687973, + "learning_rate": 0.00016624368442640807, + "loss": 0.9626, + "step": 3068 + }, + { + "epoch": 1.06, + "grad_norm": 0.20656219124794006, + "learning_rate": 0.0001662229395739577, + "loss": 0.9244, + "step": 3069 + }, + { + "epoch": 1.06, + "grad_norm": 0.19876663386821747, + "learning_rate": 0.00016620218964432643, + "loss": 0.9035, + "step": 3070 + }, + { + "epoch": 1.06, + "grad_norm": 0.1983165442943573, + "learning_rate": 0.00016618143463910506, + "loss": 0.9005, + "step": 3071 + }, + { + "epoch": 1.06, + "grad_norm": 0.19417667388916016, + "learning_rate": 0.00016616067455988491, + "loss": 0.9707, + "step": 3072 + }, + { + "epoch": 1.06, + "grad_norm": 0.19066518545150757, + "learning_rate": 0.00016613990940825754, + "loss": 0.941, + "step": 3073 + }, + { + "epoch": 1.06, + "grad_norm": 0.19258198142051697, + "learning_rate": 0.00016611913918581505, + "loss": 0.9481, + "step": 3074 + }, + { + "epoch": 1.06, + "grad_norm": 0.2004285454750061, + "learning_rate": 0.0001660983638941498, + "loss": 0.9491, + "step": 3075 + }, + { + "epoch": 1.06, + "grad_norm": 0.19285230338573456, + "learning_rate": 0.00016607758353485462, + "loss": 0.9509, + "step": 3076 + }, + { + "epoch": 1.06, + "grad_norm": 0.18702608346939087, + "learning_rate": 0.00016605679810952268, + "loss": 0.8851, + "step": 3077 + }, + { + "epoch": 1.06, + "grad_norm": 0.1984139382839203, + "learning_rate": 0.00016603600761974755, + "loss": 0.9796, + "step": 3078 + }, + { + "epoch": 1.06, + "grad_norm": 0.20252801477909088, + "learning_rate": 0.00016601521206712318, + "loss": 0.9725, + "step": 3079 + }, + { + "epoch": 1.06, + "grad_norm": 0.20916445553302765, + "learning_rate": 0.000165994411453244, + "loss": 0.9426, + "step": 3080 + }, + { + "epoch": 1.06, + "grad_norm": 0.19025819003582, + "learning_rate": 0.00016597360577970464, + "loss": 0.9129, + "step": 3081 + }, + { + "epoch": 1.07, + "grad_norm": 0.19675807654857635, + "learning_rate": 0.00016595279504810035, + "loss": 0.9671, + "step": 3082 + }, + { + "epoch": 1.07, + "grad_norm": 0.19846715033054352, + "learning_rate": 0.00016593197926002655, + "loss": 0.9286, + "step": 3083 + }, + { + "epoch": 1.07, + "grad_norm": 0.19845189154148102, + "learning_rate": 0.00016591115841707918, + "loss": 0.9887, + "step": 3084 + }, + { + "epoch": 1.07, + "grad_norm": 0.20557260513305664, + "learning_rate": 0.00016589033252085455, + "loss": 0.9767, + "step": 3085 + }, + { + "epoch": 1.07, + "grad_norm": 0.19639475643634796, + "learning_rate": 0.00016586950157294932, + "loss": 0.9543, + "step": 3086 + }, + { + "epoch": 1.07, + "grad_norm": 0.19399593770503998, + "learning_rate": 0.00016584866557496056, + "loss": 0.8986, + "step": 3087 + }, + { + "epoch": 1.07, + "grad_norm": 0.1935609132051468, + "learning_rate": 0.00016582782452848575, + "loss": 0.9144, + "step": 3088 + }, + { + "epoch": 1.07, + "grad_norm": 0.2042001187801361, + "learning_rate": 0.00016580697843512267, + "loss": 0.984, + "step": 3089 + }, + { + "epoch": 1.07, + "grad_norm": 0.19425135850906372, + "learning_rate": 0.00016578612729646963, + "loss": 0.9508, + "step": 3090 + }, + { + "epoch": 1.07, + "grad_norm": 0.19415859878063202, + "learning_rate": 0.00016576527111412516, + "loss": 0.899, + "step": 3091 + }, + { + "epoch": 1.07, + "grad_norm": 0.18906068801879883, + "learning_rate": 0.00016574440988968833, + "loss": 0.9146, + "step": 3092 + }, + { + "epoch": 1.07, + "grad_norm": 0.19608232378959656, + "learning_rate": 0.0001657235436247585, + "loss": 0.9263, + "step": 3093 + }, + { + "epoch": 1.07, + "grad_norm": 0.19680467247962952, + "learning_rate": 0.00016570267232093544, + "loss": 0.8997, + "step": 3094 + }, + { + "epoch": 1.07, + "grad_norm": 0.20155908167362213, + "learning_rate": 0.00016568179597981932, + "loss": 0.9227, + "step": 3095 + }, + { + "epoch": 1.07, + "grad_norm": 0.19624723494052887, + "learning_rate": 0.0001656609146030107, + "loss": 0.8891, + "step": 3096 + }, + { + "epoch": 1.07, + "grad_norm": 0.19039687514305115, + "learning_rate": 0.00016564002819211048, + "loss": 0.8529, + "step": 3097 + }, + { + "epoch": 1.07, + "grad_norm": 0.19904066622257233, + "learning_rate": 0.00016561913674872, + "loss": 0.9535, + "step": 3098 + }, + { + "epoch": 1.07, + "grad_norm": 0.1853899508714676, + "learning_rate": 0.00016559824027444098, + "loss": 0.8376, + "step": 3099 + }, + { + "epoch": 1.07, + "grad_norm": 0.19816075265407562, + "learning_rate": 0.00016557733877087548, + "loss": 0.9628, + "step": 3100 + }, + { + "epoch": 1.07, + "grad_norm": 0.20754671096801758, + "learning_rate": 0.000165556432239626, + "loss": 0.9895, + "step": 3101 + }, + { + "epoch": 1.07, + "grad_norm": 0.1962323784828186, + "learning_rate": 0.0001655355206822954, + "loss": 0.948, + "step": 3102 + }, + { + "epoch": 1.07, + "grad_norm": 0.1916942596435547, + "learning_rate": 0.0001655146041004869, + "loss": 0.924, + "step": 3103 + }, + { + "epoch": 1.07, + "grad_norm": 0.1882028877735138, + "learning_rate": 0.00016549368249580416, + "loss": 0.905, + "step": 3104 + }, + { + "epoch": 1.07, + "grad_norm": 0.1991576850414276, + "learning_rate": 0.00016547275586985122, + "loss": 0.9417, + "step": 3105 + }, + { + "epoch": 1.07, + "grad_norm": 0.20659486949443817, + "learning_rate": 0.00016545182422423246, + "loss": 0.999, + "step": 3106 + }, + { + "epoch": 1.07, + "grad_norm": 0.19963429868221283, + "learning_rate": 0.00016543088756055263, + "loss": 0.9741, + "step": 3107 + }, + { + "epoch": 1.07, + "grad_norm": 0.2001790702342987, + "learning_rate": 0.00016540994588041695, + "loss": 0.8866, + "step": 3108 + }, + { + "epoch": 1.07, + "grad_norm": 0.18648461997509003, + "learning_rate": 0.00016538899918543094, + "loss": 0.9174, + "step": 3109 + }, + { + "epoch": 1.07, + "grad_norm": 0.1969536989927292, + "learning_rate": 0.0001653680474772006, + "loss": 0.9187, + "step": 3110 + }, + { + "epoch": 1.08, + "grad_norm": 0.19499890506267548, + "learning_rate": 0.0001653470907573322, + "loss": 0.8722, + "step": 3111 + }, + { + "epoch": 1.08, + "grad_norm": 0.20478104054927826, + "learning_rate": 0.00016532612902743246, + "loss": 0.8899, + "step": 3112 + }, + { + "epoch": 1.08, + "grad_norm": 0.19237670302391052, + "learning_rate": 0.0001653051622891085, + "loss": 0.9049, + "step": 3113 + }, + { + "epoch": 1.08, + "grad_norm": 0.20637308061122894, + "learning_rate": 0.0001652841905439678, + "loss": 0.9914, + "step": 3114 + }, + { + "epoch": 1.08, + "grad_norm": 0.2024182230234146, + "learning_rate": 0.00016526321379361816, + "loss": 0.8817, + "step": 3115 + }, + { + "epoch": 1.08, + "grad_norm": 0.19620735943317413, + "learning_rate": 0.0001652422320396679, + "loss": 0.9344, + "step": 3116 + }, + { + "epoch": 1.08, + "grad_norm": 0.1945084184408188, + "learning_rate": 0.00016522124528372563, + "loss": 1.0151, + "step": 3117 + }, + { + "epoch": 1.08, + "grad_norm": 0.20316572487354279, + "learning_rate": 0.0001652002535274003, + "loss": 1.0103, + "step": 3118 + }, + { + "epoch": 1.08, + "grad_norm": 0.197440043091774, + "learning_rate": 0.0001651792567723014, + "loss": 0.9926, + "step": 3119 + }, + { + "epoch": 1.08, + "grad_norm": 0.1884462982416153, + "learning_rate": 0.00016515825502003866, + "loss": 0.9498, + "step": 3120 + }, + { + "epoch": 1.08, + "grad_norm": 0.19690294563770294, + "learning_rate": 0.00016513724827222227, + "loss": 0.9016, + "step": 3121 + }, + { + "epoch": 1.08, + "grad_norm": 0.1945369392633438, + "learning_rate": 0.00016511623653046268, + "loss": 0.9326, + "step": 3122 + }, + { + "epoch": 1.08, + "grad_norm": 0.20239035785198212, + "learning_rate": 0.00016509521979637094, + "loss": 0.9521, + "step": 3123 + }, + { + "epoch": 1.08, + "grad_norm": 0.19082894921302795, + "learning_rate": 0.0001650741980715583, + "loss": 0.9664, + "step": 3124 + }, + { + "epoch": 1.08, + "grad_norm": 0.19143463671207428, + "learning_rate": 0.00016505317135763652, + "loss": 0.8994, + "step": 3125 + }, + { + "epoch": 1.08, + "grad_norm": 0.1881125271320343, + "learning_rate": 0.0001650321396562176, + "loss": 0.9712, + "step": 3126 + }, + { + "epoch": 1.08, + "grad_norm": 0.19766877591609955, + "learning_rate": 0.000165011102968914, + "loss": 0.9933, + "step": 3127 + }, + { + "epoch": 1.08, + "grad_norm": 0.19713425636291504, + "learning_rate": 0.00016499006129733857, + "loss": 0.9075, + "step": 3128 + }, + { + "epoch": 1.08, + "grad_norm": 0.1889505535364151, + "learning_rate": 0.00016496901464310457, + "loss": 0.9361, + "step": 3129 + }, + { + "epoch": 1.08, + "grad_norm": 0.19792018830776215, + "learning_rate": 0.0001649479630078256, + "loss": 0.9492, + "step": 3130 + }, + { + "epoch": 1.08, + "grad_norm": 0.19634611904621124, + "learning_rate": 0.00016492690639311562, + "loss": 0.9335, + "step": 3131 + }, + { + "epoch": 1.08, + "grad_norm": 0.19777911901474, + "learning_rate": 0.000164905844800589, + "loss": 0.9047, + "step": 3132 + }, + { + "epoch": 1.08, + "grad_norm": 0.1970728635787964, + "learning_rate": 0.00016488477823186048, + "loss": 0.9206, + "step": 3133 + }, + { + "epoch": 1.08, + "grad_norm": 0.20032721757888794, + "learning_rate": 0.00016486370668854523, + "loss": 0.9584, + "step": 3134 + }, + { + "epoch": 1.08, + "grad_norm": 0.19533273577690125, + "learning_rate": 0.00016484263017225875, + "loss": 0.9534, + "step": 3135 + }, + { + "epoch": 1.08, + "grad_norm": 0.1874852180480957, + "learning_rate": 0.00016482154868461692, + "loss": 0.8876, + "step": 3136 + }, + { + "epoch": 1.08, + "grad_norm": 0.207426518201828, + "learning_rate": 0.00016480046222723603, + "loss": 0.9597, + "step": 3137 + }, + { + "epoch": 1.08, + "grad_norm": 0.1896134465932846, + "learning_rate": 0.0001647793708017327, + "loss": 0.9288, + "step": 3138 + }, + { + "epoch": 1.09, + "grad_norm": 0.19841088354587555, + "learning_rate": 0.000164758274409724, + "loss": 0.9661, + "step": 3139 + }, + { + "epoch": 1.09, + "grad_norm": 0.1989322006702423, + "learning_rate": 0.00016473717305282742, + "loss": 0.9517, + "step": 3140 + }, + { + "epoch": 1.09, + "grad_norm": 0.19085904955863953, + "learning_rate": 0.00016471606673266066, + "loss": 0.9135, + "step": 3141 + }, + { + "epoch": 1.09, + "grad_norm": 0.1852741837501526, + "learning_rate": 0.00016469495545084185, + "loss": 0.8794, + "step": 3142 + }, + { + "epoch": 1.09, + "grad_norm": 0.19000276923179626, + "learning_rate": 0.00016467383920898965, + "loss": 0.8926, + "step": 3143 + }, + { + "epoch": 1.09, + "grad_norm": 0.18369747698307037, + "learning_rate": 0.00016465271800872303, + "loss": 0.8703, + "step": 3144 + }, + { + "epoch": 1.09, + "grad_norm": 0.19372214376926422, + "learning_rate": 0.0001646315918516612, + "loss": 0.9348, + "step": 3145 + }, + { + "epoch": 1.09, + "grad_norm": 0.1902085244655609, + "learning_rate": 0.00016461046073942397, + "loss": 0.8901, + "step": 3146 + }, + { + "epoch": 1.09, + "grad_norm": 0.19396044313907623, + "learning_rate": 0.00016458932467363133, + "loss": 0.9754, + "step": 3147 + }, + { + "epoch": 1.09, + "grad_norm": 0.19111724197864532, + "learning_rate": 0.00016456818365590377, + "loss": 0.9584, + "step": 3148 + }, + { + "epoch": 1.09, + "grad_norm": 0.18766222894191742, + "learning_rate": 0.00016454703768786216, + "loss": 0.8653, + "step": 3149 + }, + { + "epoch": 1.09, + "grad_norm": 0.1874326914548874, + "learning_rate": 0.00016452588677112767, + "loss": 0.8799, + "step": 3150 + }, + { + "epoch": 1.09, + "grad_norm": 0.19378015398979187, + "learning_rate": 0.00016450473090732194, + "loss": 0.945, + "step": 3151 + }, + { + "epoch": 1.09, + "grad_norm": 0.18832823634147644, + "learning_rate": 0.00016448357009806692, + "loss": 0.8718, + "step": 3152 + }, + { + "epoch": 1.09, + "grad_norm": 0.1912168264389038, + "learning_rate": 0.000164462404344985, + "loss": 0.8851, + "step": 3153 + }, + { + "epoch": 1.09, + "grad_norm": 0.1954006403684616, + "learning_rate": 0.00016444123364969885, + "loss": 0.9769, + "step": 3154 + }, + { + "epoch": 1.09, + "grad_norm": 0.19893434643745422, + "learning_rate": 0.00016442005801383167, + "loss": 0.9268, + "step": 3155 + }, + { + "epoch": 1.09, + "grad_norm": 0.19670341908931732, + "learning_rate": 0.00016439887743900693, + "loss": 0.9658, + "step": 3156 + }, + { + "epoch": 1.09, + "grad_norm": 0.19159068167209625, + "learning_rate": 0.00016437769192684845, + "loss": 0.8905, + "step": 3157 + }, + { + "epoch": 1.09, + "grad_norm": 0.18277385830879211, + "learning_rate": 0.00016435650147898052, + "loss": 0.832, + "step": 3158 + }, + { + "epoch": 1.09, + "grad_norm": 0.19444088637828827, + "learning_rate": 0.00016433530609702776, + "loss": 0.9398, + "step": 3159 + }, + { + "epoch": 1.09, + "grad_norm": 0.1958342045545578, + "learning_rate": 0.00016431410578261524, + "loss": 0.8834, + "step": 3160 + }, + { + "epoch": 1.09, + "grad_norm": 0.19928790628910065, + "learning_rate": 0.00016429290053736822, + "loss": 0.9126, + "step": 3161 + }, + { + "epoch": 1.09, + "grad_norm": 0.1952957957983017, + "learning_rate": 0.0001642716903629126, + "loss": 0.9472, + "step": 3162 + }, + { + "epoch": 1.09, + "grad_norm": 0.18636207282543182, + "learning_rate": 0.0001642504752608744, + "loss": 0.9279, + "step": 3163 + }, + { + "epoch": 1.09, + "grad_norm": 0.18926826119422913, + "learning_rate": 0.00016422925523288022, + "loss": 0.9201, + "step": 3164 + }, + { + "epoch": 1.09, + "grad_norm": 0.1961440145969391, + "learning_rate": 0.00016420803028055692, + "loss": 0.998, + "step": 3165 + }, + { + "epoch": 1.09, + "grad_norm": 0.19415853917598724, + "learning_rate": 0.00016418680040553181, + "loss": 0.9176, + "step": 3166 + }, + { + "epoch": 1.09, + "grad_norm": 0.1978716403245926, + "learning_rate": 0.0001641655656094325, + "loss": 0.9559, + "step": 3167 + }, + { + "epoch": 1.1, + "grad_norm": 0.19432327151298523, + "learning_rate": 0.00016414432589388706, + "loss": 0.8507, + "step": 3168 + }, + { + "epoch": 1.1, + "grad_norm": 0.19534040987491608, + "learning_rate": 0.00016412308126052385, + "loss": 0.927, + "step": 3169 + }, + { + "epoch": 1.1, + "grad_norm": 0.1897934377193451, + "learning_rate": 0.0001641018317109717, + "loss": 0.8587, + "step": 3170 + }, + { + "epoch": 1.1, + "grad_norm": 0.20288267731666565, + "learning_rate": 0.00016408057724685976, + "loss": 0.9847, + "step": 3171 + }, + { + "epoch": 1.1, + "grad_norm": 0.18903978168964386, + "learning_rate": 0.00016405931786981755, + "loss": 0.9283, + "step": 3172 + }, + { + "epoch": 1.1, + "grad_norm": 0.1943960338830948, + "learning_rate": 0.00016403805358147495, + "loss": 0.936, + "step": 3173 + }, + { + "epoch": 1.1, + "grad_norm": 0.18546950817108154, + "learning_rate": 0.00016401678438346235, + "loss": 0.967, + "step": 3174 + }, + { + "epoch": 1.1, + "grad_norm": 0.19150441884994507, + "learning_rate": 0.0001639955102774103, + "loss": 0.9381, + "step": 3175 + }, + { + "epoch": 1.1, + "grad_norm": 0.18767720460891724, + "learning_rate": 0.00016397423126494998, + "loss": 0.9682, + "step": 3176 + }, + { + "epoch": 1.1, + "grad_norm": 0.19705381989479065, + "learning_rate": 0.0001639529473477127, + "loss": 0.8661, + "step": 3177 + }, + { + "epoch": 1.1, + "grad_norm": 0.20014797151088715, + "learning_rate": 0.00016393165852733027, + "loss": 0.9955, + "step": 3178 + }, + { + "epoch": 1.1, + "grad_norm": 0.1958370953798294, + "learning_rate": 0.00016391036480543488, + "loss": 0.9264, + "step": 3179 + }, + { + "epoch": 1.1, + "grad_norm": 0.18295347690582275, + "learning_rate": 0.0001638890661836591, + "loss": 0.91, + "step": 3180 + }, + { + "epoch": 1.1, + "grad_norm": 0.20895381271839142, + "learning_rate": 0.00016386776266363583, + "loss": 0.9342, + "step": 3181 + }, + { + "epoch": 1.1, + "grad_norm": 0.1930595338344574, + "learning_rate": 0.00016384645424699835, + "loss": 0.9263, + "step": 3182 + }, + { + "epoch": 1.1, + "grad_norm": 0.1938629299402237, + "learning_rate": 0.00016382514093538037, + "loss": 0.9294, + "step": 3183 + }, + { + "epoch": 1.1, + "grad_norm": 0.1964857280254364, + "learning_rate": 0.00016380382273041593, + "loss": 0.9872, + "step": 3184 + }, + { + "epoch": 1.1, + "grad_norm": 0.1964603066444397, + "learning_rate": 0.00016378249963373942, + "loss": 0.9155, + "step": 3185 + }, + { + "epoch": 1.1, + "grad_norm": 0.20166955888271332, + "learning_rate": 0.00016376117164698567, + "loss": 0.9221, + "step": 3186 + }, + { + "epoch": 1.1, + "grad_norm": 0.19599172472953796, + "learning_rate": 0.00016373983877178986, + "loss": 0.9071, + "step": 3187 + }, + { + "epoch": 1.1, + "grad_norm": 0.196363627910614, + "learning_rate": 0.0001637185010097875, + "loss": 1.0266, + "step": 3188 + }, + { + "epoch": 1.1, + "grad_norm": 0.1886577606201172, + "learning_rate": 0.00016369715836261458, + "loss": 0.9177, + "step": 3189 + }, + { + "epoch": 1.1, + "grad_norm": 0.19875650107860565, + "learning_rate": 0.00016367581083190735, + "loss": 0.9816, + "step": 3190 + }, + { + "epoch": 1.1, + "grad_norm": 0.19966274499893188, + "learning_rate": 0.0001636544584193025, + "loss": 0.8889, + "step": 3191 + }, + { + "epoch": 1.1, + "grad_norm": 0.18415604531764984, + "learning_rate": 0.00016363310112643703, + "loss": 0.9384, + "step": 3192 + }, + { + "epoch": 1.1, + "grad_norm": 0.17933693528175354, + "learning_rate": 0.00016361173895494845, + "loss": 0.875, + "step": 3193 + }, + { + "epoch": 1.1, + "grad_norm": 0.19441203773021698, + "learning_rate": 0.0001635903719064745, + "loss": 0.9285, + "step": 3194 + }, + { + "epoch": 1.1, + "grad_norm": 0.19981196522712708, + "learning_rate": 0.00016356899998265333, + "loss": 0.9445, + "step": 3195 + }, + { + "epoch": 1.11, + "grad_norm": 0.18804876506328583, + "learning_rate": 0.00016354762318512354, + "loss": 0.9127, + "step": 3196 + }, + { + "epoch": 1.11, + "grad_norm": 0.1986115276813507, + "learning_rate": 0.000163526241515524, + "loss": 0.9932, + "step": 3197 + }, + { + "epoch": 1.11, + "grad_norm": 0.19924885034561157, + "learning_rate": 0.00016350485497549402, + "loss": 0.9489, + "step": 3198 + }, + { + "epoch": 1.11, + "grad_norm": 0.1975104957818985, + "learning_rate": 0.00016348346356667324, + "loss": 1.0024, + "step": 3199 + }, + { + "epoch": 1.11, + "grad_norm": 0.1849527508020401, + "learning_rate": 0.00016346206729070173, + "loss": 0.8958, + "step": 3200 + }, + { + "epoch": 1.11, + "grad_norm": 0.1946130096912384, + "learning_rate": 0.00016344066614921987, + "loss": 0.8933, + "step": 3201 + }, + { + "epoch": 1.11, + "grad_norm": 0.19796276092529297, + "learning_rate": 0.00016341926014386846, + "loss": 0.8946, + "step": 3202 + }, + { + "epoch": 1.11, + "grad_norm": 0.19626247882843018, + "learning_rate": 0.00016339784927628867, + "loss": 0.9928, + "step": 3203 + }, + { + "epoch": 1.11, + "grad_norm": 0.1890694499015808, + "learning_rate": 0.00016337643354812203, + "loss": 0.9041, + "step": 3204 + }, + { + "epoch": 1.11, + "grad_norm": 0.19057592749595642, + "learning_rate": 0.00016335501296101037, + "loss": 0.9625, + "step": 3205 + }, + { + "epoch": 1.11, + "grad_norm": 0.18700197339057922, + "learning_rate": 0.00016333358751659605, + "loss": 0.8718, + "step": 3206 + }, + { + "epoch": 1.11, + "grad_norm": 0.19421708583831787, + "learning_rate": 0.00016331215721652166, + "loss": 0.9084, + "step": 3207 + }, + { + "epoch": 1.11, + "grad_norm": 0.19596804678440094, + "learning_rate": 0.0001632907220624303, + "loss": 0.8881, + "step": 3208 + }, + { + "epoch": 1.11, + "grad_norm": 0.20006729662418365, + "learning_rate": 0.00016326928205596527, + "loss": 0.9956, + "step": 3209 + }, + { + "epoch": 1.11, + "grad_norm": 0.1877383142709732, + "learning_rate": 0.00016324783719877032, + "loss": 0.8683, + "step": 3210 + }, + { + "epoch": 1.11, + "grad_norm": 0.20360927283763885, + "learning_rate": 0.00016322638749248969, + "loss": 0.9104, + "step": 3211 + }, + { + "epoch": 1.11, + "grad_norm": 0.18599390983581543, + "learning_rate": 0.0001632049329387678, + "loss": 0.9254, + "step": 3212 + }, + { + "epoch": 1.11, + "grad_norm": 0.20069356262683868, + "learning_rate": 0.00016318347353924957, + "loss": 0.9163, + "step": 3213 + }, + { + "epoch": 1.11, + "grad_norm": 0.19795280694961548, + "learning_rate": 0.0001631620092955802, + "loss": 0.872, + "step": 3214 + }, + { + "epoch": 1.11, + "grad_norm": 0.21090590953826904, + "learning_rate": 0.00016314054020940538, + "loss": 0.9532, + "step": 3215 + }, + { + "epoch": 1.11, + "grad_norm": 0.19131018221378326, + "learning_rate": 0.00016311906628237104, + "loss": 0.8848, + "step": 3216 + }, + { + "epoch": 1.11, + "grad_norm": 0.20162901282310486, + "learning_rate": 0.00016309758751612357, + "loss": 0.9615, + "step": 3217 + }, + { + "epoch": 1.11, + "grad_norm": 0.20793616771697998, + "learning_rate": 0.00016307610391230973, + "loss": 0.9463, + "step": 3218 + }, + { + "epoch": 1.11, + "grad_norm": 0.19533415138721466, + "learning_rate": 0.0001630546154725766, + "loss": 0.9327, + "step": 3219 + }, + { + "epoch": 1.11, + "grad_norm": 0.200450137257576, + "learning_rate": 0.00016303312219857166, + "loss": 0.8954, + "step": 3220 + }, + { + "epoch": 1.11, + "grad_norm": 0.1979542076587677, + "learning_rate": 0.00016301162409194272, + "loss": 0.9546, + "step": 3221 + }, + { + "epoch": 1.11, + "grad_norm": 0.1875590831041336, + "learning_rate": 0.000162990121154338, + "loss": 0.9371, + "step": 3222 + }, + { + "epoch": 1.11, + "grad_norm": 0.19330134987831116, + "learning_rate": 0.00016296861338740618, + "loss": 0.8646, + "step": 3223 + }, + { + "epoch": 1.12, + "grad_norm": 0.18598540127277374, + "learning_rate": 0.00016294710079279613, + "loss": 0.9595, + "step": 3224 + }, + { + "epoch": 1.12, + "grad_norm": 0.18598809838294983, + "learning_rate": 0.0001629255833721572, + "loss": 0.879, + "step": 3225 + }, + { + "epoch": 1.12, + "grad_norm": 0.1924973428249359, + "learning_rate": 0.00016290406112713906, + "loss": 0.8916, + "step": 3226 + }, + { + "epoch": 1.12, + "grad_norm": 0.1871059238910675, + "learning_rate": 0.00016288253405939183, + "loss": 0.9009, + "step": 3227 + }, + { + "epoch": 1.12, + "grad_norm": 0.1845734864473343, + "learning_rate": 0.00016286100217056596, + "loss": 0.9263, + "step": 3228 + }, + { + "epoch": 1.12, + "grad_norm": 0.19569912552833557, + "learning_rate": 0.00016283946546231218, + "loss": 0.9312, + "step": 3229 + }, + { + "epoch": 1.12, + "grad_norm": 0.19267801940441132, + "learning_rate": 0.0001628179239362817, + "loss": 0.873, + "step": 3230 + }, + { + "epoch": 1.12, + "grad_norm": 0.2015204131603241, + "learning_rate": 0.0001627963775941261, + "loss": 0.9102, + "step": 3231 + }, + { + "epoch": 1.12, + "grad_norm": 0.19582143425941467, + "learning_rate": 0.00016277482643749726, + "loss": 0.9614, + "step": 3232 + }, + { + "epoch": 1.12, + "grad_norm": 0.20045465230941772, + "learning_rate": 0.00016275327046804747, + "loss": 0.9631, + "step": 3233 + }, + { + "epoch": 1.12, + "grad_norm": 0.18617044389247894, + "learning_rate": 0.0001627317096874294, + "loss": 0.8873, + "step": 3234 + }, + { + "epoch": 1.12, + "grad_norm": 0.20299577713012695, + "learning_rate": 0.00016271014409729605, + "loss": 0.922, + "step": 3235 + }, + { + "epoch": 1.12, + "grad_norm": 0.20413140952587128, + "learning_rate": 0.0001626885736993008, + "loss": 0.9555, + "step": 3236 + }, + { + "epoch": 1.12, + "grad_norm": 0.18887461721897125, + "learning_rate": 0.00016266699849509743, + "loss": 0.9311, + "step": 3237 + }, + { + "epoch": 1.12, + "grad_norm": 0.19803249835968018, + "learning_rate": 0.0001626454184863401, + "loss": 0.9104, + "step": 3238 + }, + { + "epoch": 1.12, + "grad_norm": 0.19364400207996368, + "learning_rate": 0.00016262383367468325, + "loss": 0.9282, + "step": 3239 + }, + { + "epoch": 1.12, + "grad_norm": 0.20137445628643036, + "learning_rate": 0.00016260224406178177, + "loss": 0.9215, + "step": 3240 + }, + { + "epoch": 1.12, + "grad_norm": 0.19514375925064087, + "learning_rate": 0.00016258064964929088, + "loss": 0.9409, + "step": 3241 + }, + { + "epoch": 1.12, + "grad_norm": 0.19378265738487244, + "learning_rate": 0.0001625590504388662, + "loss": 0.8933, + "step": 3242 + }, + { + "epoch": 1.12, + "grad_norm": 0.18738335371017456, + "learning_rate": 0.00016253744643216368, + "loss": 0.9545, + "step": 3243 + }, + { + "epoch": 1.12, + "grad_norm": 0.1987677812576294, + "learning_rate": 0.00016251583763083967, + "loss": 0.9697, + "step": 3244 + }, + { + "epoch": 1.12, + "grad_norm": 0.20496296882629395, + "learning_rate": 0.00016249422403655087, + "loss": 0.96, + "step": 3245 + }, + { + "epoch": 1.12, + "grad_norm": 0.20727890729904175, + "learning_rate": 0.00016247260565095437, + "loss": 1.0182, + "step": 3246 + }, + { + "epoch": 1.12, + "grad_norm": 0.19182537496089935, + "learning_rate": 0.00016245098247570758, + "loss": 0.9521, + "step": 3247 + }, + { + "epoch": 1.12, + "grad_norm": 0.2026839703321457, + "learning_rate": 0.0001624293545124683, + "loss": 0.9507, + "step": 3248 + }, + { + "epoch": 1.12, + "grad_norm": 0.20389671623706818, + "learning_rate": 0.00016240772176289475, + "loss": 0.9635, + "step": 3249 + }, + { + "epoch": 1.12, + "grad_norm": 0.1950456202030182, + "learning_rate": 0.00016238608422864542, + "loss": 0.8601, + "step": 3250 + }, + { + "epoch": 1.12, + "grad_norm": 0.1940208077430725, + "learning_rate": 0.00016236444191137917, + "loss": 0.9209, + "step": 3251 + }, + { + "epoch": 1.12, + "grad_norm": 0.19530853629112244, + "learning_rate": 0.00016234279481275543, + "loss": 0.9396, + "step": 3252 + }, + { + "epoch": 1.13, + "grad_norm": 0.19468019902706146, + "learning_rate": 0.0001623211429344337, + "loss": 0.8943, + "step": 3253 + }, + { + "epoch": 1.13, + "grad_norm": 0.1874203383922577, + "learning_rate": 0.00016229948627807404, + "loss": 0.9383, + "step": 3254 + }, + { + "epoch": 1.13, + "grad_norm": 0.198331817984581, + "learning_rate": 0.00016227782484533681, + "loss": 0.9178, + "step": 3255 + }, + { + "epoch": 1.13, + "grad_norm": 0.19345207512378693, + "learning_rate": 0.00016225615863788278, + "loss": 0.9393, + "step": 3256 + }, + { + "epoch": 1.13, + "grad_norm": 0.1910363733768463, + "learning_rate": 0.000162234487657373, + "loss": 0.9635, + "step": 3257 + }, + { + "epoch": 1.13, + "grad_norm": 0.18190106749534607, + "learning_rate": 0.000162212811905469, + "loss": 0.8974, + "step": 3258 + }, + { + "epoch": 1.13, + "grad_norm": 0.19228288531303406, + "learning_rate": 0.00016219113138383258, + "loss": 0.9087, + "step": 3259 + }, + { + "epoch": 1.13, + "grad_norm": 0.1946840137243271, + "learning_rate": 0.00016216944609412595, + "loss": 0.9161, + "step": 3260 + }, + { + "epoch": 1.13, + "grad_norm": 0.1919838786125183, + "learning_rate": 0.00016214775603801167, + "loss": 0.9231, + "step": 3261 + }, + { + "epoch": 1.13, + "grad_norm": 0.203338623046875, + "learning_rate": 0.00016212606121715267, + "loss": 0.9122, + "step": 3262 + }, + { + "epoch": 1.13, + "grad_norm": 0.18521350622177124, + "learning_rate": 0.0001621043616332123, + "loss": 0.8616, + "step": 3263 + }, + { + "epoch": 1.13, + "grad_norm": 0.19194412231445312, + "learning_rate": 0.00016208265728785416, + "loss": 0.9158, + "step": 3264 + }, + { + "epoch": 1.13, + "grad_norm": 0.19357708096504211, + "learning_rate": 0.00016206094818274229, + "loss": 0.9746, + "step": 3265 + }, + { + "epoch": 1.13, + "grad_norm": 0.1967921108007431, + "learning_rate": 0.00016203923431954112, + "loss": 0.9261, + "step": 3266 + }, + { + "epoch": 1.13, + "grad_norm": 0.20249836146831512, + "learning_rate": 0.00016201751569991537, + "loss": 0.9597, + "step": 3267 + }, + { + "epoch": 1.13, + "grad_norm": 0.19432705640792847, + "learning_rate": 0.0001619957923255302, + "loss": 0.9656, + "step": 3268 + }, + { + "epoch": 1.13, + "grad_norm": 0.18914005160331726, + "learning_rate": 0.00016197406419805107, + "loss": 0.8933, + "step": 3269 + }, + { + "epoch": 1.13, + "grad_norm": 0.20102287828922272, + "learning_rate": 0.00016195233131914382, + "loss": 0.9124, + "step": 3270 + }, + { + "epoch": 1.13, + "grad_norm": 0.18623220920562744, + "learning_rate": 0.0001619305936904747, + "loss": 0.8506, + "step": 3271 + }, + { + "epoch": 1.13, + "grad_norm": 0.19582891464233398, + "learning_rate": 0.00016190885131371028, + "loss": 0.9541, + "step": 3272 + }, + { + "epoch": 1.13, + "grad_norm": 0.19561882317066193, + "learning_rate": 0.00016188710419051748, + "loss": 0.9532, + "step": 3273 + }, + { + "epoch": 1.13, + "grad_norm": 0.19685816764831543, + "learning_rate": 0.00016186535232256364, + "loss": 0.959, + "step": 3274 + }, + { + "epoch": 1.13, + "grad_norm": 0.20488882064819336, + "learning_rate": 0.00016184359571151644, + "loss": 0.953, + "step": 3275 + }, + { + "epoch": 1.13, + "grad_norm": 0.1881592869758606, + "learning_rate": 0.00016182183435904389, + "loss": 0.9008, + "step": 3276 + }, + { + "epoch": 1.13, + "grad_norm": 0.19552184641361237, + "learning_rate": 0.00016180006826681438, + "loss": 0.9552, + "step": 3277 + }, + { + "epoch": 1.13, + "grad_norm": 0.1943063884973526, + "learning_rate": 0.00016177829743649672, + "loss": 0.8943, + "step": 3278 + }, + { + "epoch": 1.13, + "grad_norm": 0.18469476699829102, + "learning_rate": 0.00016175652186975996, + "loss": 0.8334, + "step": 3279 + }, + { + "epoch": 1.13, + "grad_norm": 0.1960335671901703, + "learning_rate": 0.0001617347415682737, + "loss": 0.9298, + "step": 3280 + }, + { + "epoch": 1.14, + "grad_norm": 0.18966439366340637, + "learning_rate": 0.00016171295653370768, + "loss": 0.8931, + "step": 3281 + }, + { + "epoch": 1.14, + "grad_norm": 0.18533951044082642, + "learning_rate": 0.00016169116676773219, + "loss": 0.873, + "step": 3282 + }, + { + "epoch": 1.14, + "grad_norm": 0.19226530194282532, + "learning_rate": 0.00016166937227201776, + "loss": 0.9701, + "step": 3283 + }, + { + "epoch": 1.14, + "grad_norm": 0.20334738492965698, + "learning_rate": 0.00016164757304823536, + "loss": 0.9888, + "step": 3284 + }, + { + "epoch": 1.14, + "grad_norm": 0.19117876887321472, + "learning_rate": 0.0001616257690980563, + "loss": 0.8898, + "step": 3285 + }, + { + "epoch": 1.14, + "grad_norm": 0.19045065343379974, + "learning_rate": 0.00016160396042315224, + "loss": 0.9351, + "step": 3286 + }, + { + "epoch": 1.14, + "grad_norm": 0.19016839563846588, + "learning_rate": 0.00016158214702519517, + "loss": 0.9358, + "step": 3287 + }, + { + "epoch": 1.14, + "grad_norm": 0.1997360736131668, + "learning_rate": 0.0001615603289058575, + "loss": 0.9525, + "step": 3288 + }, + { + "epoch": 1.14, + "grad_norm": 0.1873968541622162, + "learning_rate": 0.00016153850606681202, + "loss": 0.8936, + "step": 3289 + }, + { + "epoch": 1.14, + "grad_norm": 0.201723113656044, + "learning_rate": 0.00016151667850973183, + "loss": 0.9871, + "step": 3290 + }, + { + "epoch": 1.14, + "grad_norm": 0.19081851840019226, + "learning_rate": 0.00016149484623629036, + "loss": 0.8942, + "step": 3291 + }, + { + "epoch": 1.14, + "grad_norm": 0.18933135271072388, + "learning_rate": 0.00016147300924816146, + "loss": 0.9196, + "step": 3292 + }, + { + "epoch": 1.14, + "grad_norm": 0.20124155282974243, + "learning_rate": 0.0001614511675470194, + "loss": 0.9442, + "step": 3293 + }, + { + "epoch": 1.14, + "grad_norm": 0.18622492253780365, + "learning_rate": 0.0001614293211345386, + "loss": 0.9153, + "step": 3294 + }, + { + "epoch": 1.14, + "grad_norm": 0.1968419998884201, + "learning_rate": 0.00016140747001239415, + "loss": 0.9315, + "step": 3295 + }, + { + "epoch": 1.14, + "grad_norm": 0.19139516353607178, + "learning_rate": 0.0001613856141822612, + "loss": 0.8875, + "step": 3296 + }, + { + "epoch": 1.14, + "grad_norm": 0.20368632674217224, + "learning_rate": 0.00016136375364581547, + "loss": 1.0369, + "step": 3297 + }, + { + "epoch": 1.14, + "grad_norm": 0.1934148222208023, + "learning_rate": 0.0001613418884047329, + "loss": 0.8596, + "step": 3298 + }, + { + "epoch": 1.14, + "grad_norm": 0.1904744654893875, + "learning_rate": 0.00016132001846068988, + "loss": 0.9054, + "step": 3299 + }, + { + "epoch": 1.14, + "grad_norm": 0.18952694535255432, + "learning_rate": 0.00016129814381536322, + "loss": 0.9598, + "step": 3300 + }, + { + "epoch": 1.14, + "grad_norm": 0.18965736031532288, + "learning_rate": 0.00016127626447042984, + "loss": 0.9516, + "step": 3301 + }, + { + "epoch": 1.14, + "grad_norm": 0.19727297127246857, + "learning_rate": 0.00016125438042756732, + "loss": 0.8983, + "step": 3302 + }, + { + "epoch": 1.14, + "grad_norm": 0.2037915289402008, + "learning_rate": 0.0001612324916884534, + "loss": 0.9696, + "step": 3303 + }, + { + "epoch": 1.14, + "grad_norm": 0.18593308329582214, + "learning_rate": 0.0001612105982547663, + "loss": 0.883, + "step": 3304 + }, + { + "epoch": 1.14, + "grad_norm": 0.1988077610731125, + "learning_rate": 0.00016118870012818447, + "loss": 0.9199, + "step": 3305 + }, + { + "epoch": 1.14, + "grad_norm": 0.20564799010753632, + "learning_rate": 0.0001611667973103869, + "loss": 0.9839, + "step": 3306 + }, + { + "epoch": 1.14, + "grad_norm": 0.19398616254329681, + "learning_rate": 0.00016114488980305272, + "loss": 0.9529, + "step": 3307 + }, + { + "epoch": 1.14, + "grad_norm": 0.19221334159374237, + "learning_rate": 0.00016112297760786159, + "loss": 0.9208, + "step": 3308 + }, + { + "epoch": 1.14, + "grad_norm": 0.19437314569950104, + "learning_rate": 0.0001611010607264935, + "loss": 0.9303, + "step": 3309 + }, + { + "epoch": 1.15, + "grad_norm": 0.1968432515859604, + "learning_rate": 0.00016107913916062875, + "loss": 0.9664, + "step": 3310 + }, + { + "epoch": 1.15, + "grad_norm": 0.19320005178451538, + "learning_rate": 0.000161057212911948, + "loss": 0.9252, + "step": 3311 + }, + { + "epoch": 1.15, + "grad_norm": 0.20163682103157043, + "learning_rate": 0.00016103528198213232, + "loss": 0.9298, + "step": 3312 + }, + { + "epoch": 1.15, + "grad_norm": 0.20061266422271729, + "learning_rate": 0.0001610133463728631, + "loss": 0.9236, + "step": 3313 + }, + { + "epoch": 1.15, + "grad_norm": 0.20181746780872345, + "learning_rate": 0.0001609914060858221, + "loss": 0.9528, + "step": 3314 + }, + { + "epoch": 1.15, + "grad_norm": 0.19283704459667206, + "learning_rate": 0.00016096946112269145, + "loss": 0.9264, + "step": 3315 + }, + { + "epoch": 1.15, + "grad_norm": 0.1948464959859848, + "learning_rate": 0.0001609475114851536, + "loss": 0.9575, + "step": 3316 + }, + { + "epoch": 1.15, + "grad_norm": 0.2059197872877121, + "learning_rate": 0.00016092555717489142, + "loss": 0.9476, + "step": 3317 + }, + { + "epoch": 1.15, + "grad_norm": 0.19160623848438263, + "learning_rate": 0.00016090359819358806, + "loss": 0.9308, + "step": 3318 + }, + { + "epoch": 1.15, + "grad_norm": 0.19544410705566406, + "learning_rate": 0.0001608816345429271, + "loss": 0.9077, + "step": 3319 + }, + { + "epoch": 1.15, + "grad_norm": 0.19720809161663055, + "learning_rate": 0.0001608596662245925, + "loss": 0.9383, + "step": 3320 + }, + { + "epoch": 1.15, + "grad_norm": 0.2031698375940323, + "learning_rate": 0.0001608376932402684, + "loss": 0.8995, + "step": 3321 + }, + { + "epoch": 1.15, + "grad_norm": 0.21251250803470612, + "learning_rate": 0.00016081571559163952, + "loss": 0.9421, + "step": 3322 + }, + { + "epoch": 1.15, + "grad_norm": 0.1931510716676712, + "learning_rate": 0.0001607937332803908, + "loss": 0.9147, + "step": 3323 + }, + { + "epoch": 1.15, + "grad_norm": 0.19057153165340424, + "learning_rate": 0.00016077174630820765, + "loss": 0.8873, + "step": 3324 + }, + { + "epoch": 1.15, + "grad_norm": 0.19663560390472412, + "learning_rate": 0.0001607497546767757, + "loss": 0.8729, + "step": 3325 + }, + { + "epoch": 1.15, + "grad_norm": 0.197501540184021, + "learning_rate": 0.00016072775838778104, + "loss": 0.9721, + "step": 3326 + }, + { + "epoch": 1.15, + "grad_norm": 0.201323002576828, + "learning_rate": 0.00016070575744291004, + "loss": 0.9891, + "step": 3327 + }, + { + "epoch": 1.15, + "grad_norm": 0.19409815967082977, + "learning_rate": 0.0001606837518438495, + "loss": 0.9291, + "step": 3328 + }, + { + "epoch": 1.15, + "grad_norm": 0.18671323359012604, + "learning_rate": 0.00016066174159228653, + "loss": 0.8786, + "step": 3329 + }, + { + "epoch": 1.15, + "grad_norm": 0.20137618482112885, + "learning_rate": 0.00016063972668990862, + "loss": 0.8937, + "step": 3330 + }, + { + "epoch": 1.15, + "grad_norm": 0.2120594084262848, + "learning_rate": 0.00016061770713840363, + "loss": 0.9768, + "step": 3331 + }, + { + "epoch": 1.15, + "grad_norm": 0.20866088569164276, + "learning_rate": 0.0001605956829394597, + "loss": 0.911, + "step": 3332 + }, + { + "epoch": 1.15, + "grad_norm": 0.1928453892469406, + "learning_rate": 0.00016057365409476545, + "loss": 0.9378, + "step": 3333 + }, + { + "epoch": 1.15, + "grad_norm": 0.21047063171863556, + "learning_rate": 0.0001605516206060097, + "loss": 0.937, + "step": 3334 + }, + { + "epoch": 1.15, + "grad_norm": 0.20211422443389893, + "learning_rate": 0.00016052958247488182, + "loss": 0.9226, + "step": 3335 + }, + { + "epoch": 1.15, + "grad_norm": 0.21109655499458313, + "learning_rate": 0.0001605075397030714, + "loss": 0.9584, + "step": 3336 + }, + { + "epoch": 1.15, + "grad_norm": 0.19364911317825317, + "learning_rate": 0.00016048549229226832, + "loss": 0.9552, + "step": 3337 + }, + { + "epoch": 1.16, + "grad_norm": 0.2017887681722641, + "learning_rate": 0.00016046344024416302, + "loss": 0.9571, + "step": 3338 + }, + { + "epoch": 1.16, + "grad_norm": 0.19468630850315094, + "learning_rate": 0.00016044138356044614, + "loss": 0.9033, + "step": 3339 + }, + { + "epoch": 1.16, + "grad_norm": 0.1936691552400589, + "learning_rate": 0.00016041932224280877, + "loss": 0.9071, + "step": 3340 + }, + { + "epoch": 1.16, + "grad_norm": 0.20116499066352844, + "learning_rate": 0.00016039725629294225, + "loss": 0.9469, + "step": 3341 + }, + { + "epoch": 1.16, + "grad_norm": 0.19864040613174438, + "learning_rate": 0.00016037518571253832, + "loss": 0.9157, + "step": 3342 + }, + { + "epoch": 1.16, + "grad_norm": 0.19681987166404724, + "learning_rate": 0.00016035311050328915, + "loss": 0.8943, + "step": 3343 + }, + { + "epoch": 1.16, + "grad_norm": 0.19773036241531372, + "learning_rate": 0.00016033103066688715, + "loss": 0.9386, + "step": 3344 + }, + { + "epoch": 1.16, + "grad_norm": 0.20723670721054077, + "learning_rate": 0.00016030894620502518, + "loss": 0.8724, + "step": 3345 + }, + { + "epoch": 1.16, + "grad_norm": 0.1986563503742218, + "learning_rate": 0.00016028685711939636, + "loss": 0.918, + "step": 3346 + }, + { + "epoch": 1.16, + "grad_norm": 0.20282043516635895, + "learning_rate": 0.00016026476341169426, + "loss": 0.9582, + "step": 3347 + }, + { + "epoch": 1.16, + "grad_norm": 0.19087593257427216, + "learning_rate": 0.00016024266508361275, + "loss": 0.9584, + "step": 3348 + }, + { + "epoch": 1.16, + "grad_norm": 0.18812496960163116, + "learning_rate": 0.00016022056213684609, + "loss": 0.9218, + "step": 3349 + }, + { + "epoch": 1.16, + "grad_norm": 0.19818438589572906, + "learning_rate": 0.0001601984545730888, + "loss": 0.9222, + "step": 3350 + }, + { + "epoch": 1.16, + "grad_norm": 0.19090117514133453, + "learning_rate": 0.00016017634239403589, + "loss": 0.9023, + "step": 3351 + }, + { + "epoch": 1.16, + "grad_norm": 0.19678524136543274, + "learning_rate": 0.00016015422560138263, + "loss": 0.9563, + "step": 3352 + }, + { + "epoch": 1.16, + "grad_norm": 0.18356016278266907, + "learning_rate": 0.00016013210419682465, + "loss": 0.8765, + "step": 3353 + }, + { + "epoch": 1.16, + "grad_norm": 0.21214888989925385, + "learning_rate": 0.000160109978182058, + "loss": 0.9886, + "step": 3354 + }, + { + "epoch": 1.16, + "grad_norm": 0.19988428056240082, + "learning_rate": 0.000160087847558779, + "loss": 0.9294, + "step": 3355 + }, + { + "epoch": 1.16, + "grad_norm": 0.19415299594402313, + "learning_rate": 0.00016006571232868438, + "loss": 0.9029, + "step": 3356 + }, + { + "epoch": 1.16, + "grad_norm": 0.1970900595188141, + "learning_rate": 0.00016004357249347118, + "loss": 0.9029, + "step": 3357 + }, + { + "epoch": 1.16, + "grad_norm": 0.20032072067260742, + "learning_rate": 0.00016002142805483685, + "loss": 0.9369, + "step": 3358 + }, + { + "epoch": 1.16, + "grad_norm": 0.19752494990825653, + "learning_rate": 0.00015999927901447913, + "loss": 0.934, + "step": 3359 + }, + { + "epoch": 1.16, + "grad_norm": 0.1939263790845871, + "learning_rate": 0.00015997712537409615, + "loss": 0.9206, + "step": 3360 + }, + { + "epoch": 1.16, + "grad_norm": 0.19927872717380524, + "learning_rate": 0.0001599549671353864, + "loss": 0.9109, + "step": 3361 + }, + { + "epoch": 1.16, + "grad_norm": 0.19712553918361664, + "learning_rate": 0.0001599328043000487, + "loss": 0.955, + "step": 3362 + }, + { + "epoch": 1.16, + "grad_norm": 0.18878357112407684, + "learning_rate": 0.00015991063686978226, + "loss": 0.9895, + "step": 3363 + }, + { + "epoch": 1.16, + "grad_norm": 0.19527225196361542, + "learning_rate": 0.00015988846484628656, + "loss": 0.9226, + "step": 3364 + }, + { + "epoch": 1.16, + "grad_norm": 0.1946769803762436, + "learning_rate": 0.0001598662882312615, + "loss": 0.9261, + "step": 3365 + }, + { + "epoch": 1.17, + "grad_norm": 0.20022673904895782, + "learning_rate": 0.00015984410702640735, + "loss": 0.9437, + "step": 3366 + }, + { + "epoch": 1.17, + "grad_norm": 0.19915050268173218, + "learning_rate": 0.00015982192123342468, + "loss": 0.9444, + "step": 3367 + }, + { + "epoch": 1.17, + "grad_norm": 0.19298063218593597, + "learning_rate": 0.0001597997308540144, + "loss": 0.927, + "step": 3368 + }, + { + "epoch": 1.17, + "grad_norm": 0.19722694158554077, + "learning_rate": 0.00015977753588987783, + "loss": 0.9354, + "step": 3369 + }, + { + "epoch": 1.17, + "grad_norm": 0.1972188502550125, + "learning_rate": 0.00015975533634271662, + "loss": 0.972, + "step": 3370 + }, + { + "epoch": 1.17, + "grad_norm": 0.19931070506572723, + "learning_rate": 0.00015973313221423274, + "loss": 0.9313, + "step": 3371 + }, + { + "epoch": 1.17, + "grad_norm": 0.20220732688903809, + "learning_rate": 0.00015971092350612859, + "loss": 0.9237, + "step": 3372 + }, + { + "epoch": 1.17, + "grad_norm": 0.1999865621328354, + "learning_rate": 0.00015968871022010677, + "loss": 0.988, + "step": 3373 + }, + { + "epoch": 1.17, + "grad_norm": 0.19390293955802917, + "learning_rate": 0.00015966649235787044, + "loss": 0.9628, + "step": 3374 + }, + { + "epoch": 1.17, + "grad_norm": 0.19825689494609833, + "learning_rate": 0.0001596442699211229, + "loss": 0.9282, + "step": 3375 + }, + { + "epoch": 1.17, + "grad_norm": 0.19494299590587616, + "learning_rate": 0.000159622042911568, + "loss": 0.886, + "step": 3376 + }, + { + "epoch": 1.17, + "grad_norm": 0.1895485520362854, + "learning_rate": 0.00015959981133090978, + "loss": 0.8627, + "step": 3377 + }, + { + "epoch": 1.17, + "grad_norm": 0.1998288780450821, + "learning_rate": 0.00015957757518085267, + "loss": 0.9129, + "step": 3378 + }, + { + "epoch": 1.17, + "grad_norm": 0.1855890303850174, + "learning_rate": 0.0001595553344631015, + "loss": 0.893, + "step": 3379 + }, + { + "epoch": 1.17, + "grad_norm": 0.1998797506093979, + "learning_rate": 0.00015953308917936144, + "loss": 0.9685, + "step": 3380 + }, + { + "epoch": 1.17, + "grad_norm": 0.20662716031074524, + "learning_rate": 0.000159510839331338, + "loss": 0.9312, + "step": 3381 + }, + { + "epoch": 1.17, + "grad_norm": 0.20258738100528717, + "learning_rate": 0.00015948858492073696, + "loss": 0.9709, + "step": 3382 + }, + { + "epoch": 1.17, + "grad_norm": 0.18903274834156036, + "learning_rate": 0.00015946632594926458, + "loss": 0.8984, + "step": 3383 + }, + { + "epoch": 1.17, + "grad_norm": 0.2018447369337082, + "learning_rate": 0.00015944406241862738, + "loss": 0.9672, + "step": 3384 + }, + { + "epoch": 1.17, + "grad_norm": 0.19322530925273895, + "learning_rate": 0.00015942179433053232, + "loss": 0.9422, + "step": 3385 + }, + { + "epoch": 1.17, + "grad_norm": 0.19214372336864471, + "learning_rate": 0.00015939952168668656, + "loss": 0.9142, + "step": 3386 + }, + { + "epoch": 1.17, + "grad_norm": 0.19462235271930695, + "learning_rate": 0.0001593772444887978, + "loss": 0.9098, + "step": 3387 + }, + { + "epoch": 1.17, + "grad_norm": 0.19193169474601746, + "learning_rate": 0.0001593549627385739, + "loss": 0.9119, + "step": 3388 + }, + { + "epoch": 1.17, + "grad_norm": 0.1959066390991211, + "learning_rate": 0.0001593326764377232, + "loss": 0.9462, + "step": 3389 + }, + { + "epoch": 1.17, + "grad_norm": 0.19710206985473633, + "learning_rate": 0.00015931038558795435, + "loss": 0.9513, + "step": 3390 + }, + { + "epoch": 1.17, + "grad_norm": 0.19761282205581665, + "learning_rate": 0.0001592880901909763, + "loss": 0.9108, + "step": 3391 + }, + { + "epoch": 1.17, + "grad_norm": 0.19094908237457275, + "learning_rate": 0.0001592657902484985, + "loss": 0.9442, + "step": 3392 + }, + { + "epoch": 1.17, + "grad_norm": 0.19495327770709991, + "learning_rate": 0.00015924348576223048, + "loss": 0.9221, + "step": 3393 + }, + { + "epoch": 1.17, + "grad_norm": 0.19475141167640686, + "learning_rate": 0.00015922117673388244, + "loss": 0.9524, + "step": 3394 + }, + { + "epoch": 1.18, + "grad_norm": 0.2048145830631256, + "learning_rate": 0.00015919886316516469, + "loss": 1.0231, + "step": 3395 + }, + { + "epoch": 1.18, + "grad_norm": 0.19695360958576202, + "learning_rate": 0.0001591765450577879, + "loss": 0.9444, + "step": 3396 + }, + { + "epoch": 1.18, + "grad_norm": 0.18348188698291779, + "learning_rate": 0.00015915422241346333, + "loss": 0.8791, + "step": 3397 + }, + { + "epoch": 1.18, + "grad_norm": 0.18603946268558502, + "learning_rate": 0.00015913189523390227, + "loss": 0.892, + "step": 3398 + }, + { + "epoch": 1.18, + "grad_norm": 0.19879482686519623, + "learning_rate": 0.00015910956352081657, + "loss": 0.9514, + "step": 3399 + }, + { + "epoch": 1.18, + "grad_norm": 0.19644108414649963, + "learning_rate": 0.0001590872272759183, + "loss": 0.9085, + "step": 3400 + }, + { + "epoch": 1.18, + "grad_norm": 0.1905001401901245, + "learning_rate": 0.00015906488650091997, + "loss": 0.9118, + "step": 3401 + }, + { + "epoch": 1.18, + "grad_norm": 0.193280428647995, + "learning_rate": 0.0001590425411975344, + "loss": 0.9526, + "step": 3402 + }, + { + "epoch": 1.18, + "grad_norm": 0.19824816286563873, + "learning_rate": 0.0001590201913674748, + "loss": 0.9328, + "step": 3403 + }, + { + "epoch": 1.18, + "grad_norm": 0.267400860786438, + "learning_rate": 0.00015899783701245462, + "loss": 0.8283, + "step": 3404 + }, + { + "epoch": 1.18, + "grad_norm": 0.19649048149585724, + "learning_rate": 0.00015897547813418776, + "loss": 0.9114, + "step": 3405 + }, + { + "epoch": 1.18, + "grad_norm": 0.19132551550865173, + "learning_rate": 0.0001589531147343884, + "loss": 0.8749, + "step": 3406 + }, + { + "epoch": 1.18, + "grad_norm": 0.19222450256347656, + "learning_rate": 0.0001589307468147711, + "loss": 0.9477, + "step": 3407 + }, + { + "epoch": 1.18, + "grad_norm": 0.19050081074237823, + "learning_rate": 0.00015890837437705085, + "loss": 0.8783, + "step": 3408 + }, + { + "epoch": 1.18, + "grad_norm": 0.18469731509685516, + "learning_rate": 0.00015888599742294282, + "loss": 0.9796, + "step": 3409 + }, + { + "epoch": 1.18, + "grad_norm": 0.19211378693580627, + "learning_rate": 0.0001588636159541626, + "loss": 0.924, + "step": 3410 + }, + { + "epoch": 1.18, + "grad_norm": 0.19958160817623138, + "learning_rate": 0.00015884122997242615, + "loss": 0.9866, + "step": 3411 + }, + { + "epoch": 1.18, + "grad_norm": 0.19366028904914856, + "learning_rate": 0.00015881883947944974, + "loss": 0.8821, + "step": 3412 + }, + { + "epoch": 1.18, + "grad_norm": 0.19694273173809052, + "learning_rate": 0.00015879644447695007, + "loss": 0.8971, + "step": 3413 + }, + { + "epoch": 1.18, + "grad_norm": 0.18563781678676605, + "learning_rate": 0.00015877404496664402, + "loss": 0.9001, + "step": 3414 + }, + { + "epoch": 1.18, + "grad_norm": 0.20295719802379608, + "learning_rate": 0.000158751640950249, + "loss": 0.8894, + "step": 3415 + }, + { + "epoch": 1.18, + "grad_norm": 0.20849061012268066, + "learning_rate": 0.00015872923242948267, + "loss": 0.9321, + "step": 3416 + }, + { + "epoch": 1.18, + "grad_norm": 0.19227440655231476, + "learning_rate": 0.00015870681940606302, + "loss": 0.8957, + "step": 3417 + }, + { + "epoch": 1.18, + "grad_norm": 0.1898086965084076, + "learning_rate": 0.00015868440188170847, + "loss": 0.9095, + "step": 3418 + }, + { + "epoch": 1.18, + "grad_norm": 0.19240839779376984, + "learning_rate": 0.0001586619798581376, + "loss": 0.9037, + "step": 3419 + }, + { + "epoch": 1.18, + "grad_norm": 0.19419671595096588, + "learning_rate": 0.00015863955333706957, + "loss": 0.8636, + "step": 3420 + }, + { + "epoch": 1.18, + "grad_norm": 0.20581892132759094, + "learning_rate": 0.00015861712232022375, + "loss": 0.9749, + "step": 3421 + }, + { + "epoch": 1.18, + "grad_norm": 0.19966059923171997, + "learning_rate": 0.00015859468680931989, + "loss": 0.9905, + "step": 3422 + }, + { + "epoch": 1.19, + "grad_norm": 0.18994350731372833, + "learning_rate": 0.00015857224680607804, + "loss": 0.9269, + "step": 3423 + }, + { + "epoch": 1.19, + "grad_norm": 0.19134818017482758, + "learning_rate": 0.00015854980231221869, + "loss": 0.8993, + "step": 3424 + }, + { + "epoch": 1.19, + "grad_norm": 0.1913415640592575, + "learning_rate": 0.00015852735332946253, + "loss": 0.9024, + "step": 3425 + }, + { + "epoch": 1.19, + "grad_norm": 0.18681485950946808, + "learning_rate": 0.00015850489985953076, + "loss": 0.9176, + "step": 3426 + }, + { + "epoch": 1.19, + "grad_norm": 0.2123211771249771, + "learning_rate": 0.0001584824419041448, + "loss": 0.9553, + "step": 3427 + }, + { + "epoch": 1.19, + "grad_norm": 0.19513416290283203, + "learning_rate": 0.0001584599794650265, + "loss": 0.9052, + "step": 3428 + }, + { + "epoch": 1.19, + "grad_norm": 0.20175153017044067, + "learning_rate": 0.00015843751254389796, + "loss": 0.9644, + "step": 3429 + }, + { + "epoch": 1.19, + "grad_norm": 0.1934724897146225, + "learning_rate": 0.00015841504114248167, + "loss": 0.9069, + "step": 3430 + }, + { + "epoch": 1.19, + "grad_norm": 0.19547207653522491, + "learning_rate": 0.0001583925652625005, + "loss": 0.8992, + "step": 3431 + }, + { + "epoch": 1.19, + "grad_norm": 0.19625899195671082, + "learning_rate": 0.00015837008490567764, + "loss": 0.8821, + "step": 3432 + }, + { + "epoch": 1.19, + "grad_norm": 0.19435423612594604, + "learning_rate": 0.00015834760007373658, + "loss": 0.9296, + "step": 3433 + }, + { + "epoch": 1.19, + "grad_norm": 0.1921653300523758, + "learning_rate": 0.00015832511076840122, + "loss": 0.9497, + "step": 3434 + }, + { + "epoch": 1.19, + "grad_norm": 0.20929893851280212, + "learning_rate": 0.00015830261699139575, + "loss": 0.9234, + "step": 3435 + }, + { + "epoch": 1.19, + "grad_norm": 0.2004985213279724, + "learning_rate": 0.0001582801187444447, + "loss": 0.9163, + "step": 3436 + }, + { + "epoch": 1.19, + "grad_norm": 0.20779582858085632, + "learning_rate": 0.000158257616029273, + "loss": 0.8724, + "step": 3437 + }, + { + "epoch": 1.19, + "grad_norm": 0.2043759822845459, + "learning_rate": 0.0001582351088476059, + "loss": 0.8911, + "step": 3438 + }, + { + "epoch": 1.19, + "grad_norm": 0.19317206740379333, + "learning_rate": 0.00015821259720116897, + "loss": 0.8972, + "step": 3439 + }, + { + "epoch": 1.19, + "grad_norm": 0.20157548785209656, + "learning_rate": 0.00015819008109168808, + "loss": 0.9373, + "step": 3440 + }, + { + "epoch": 1.19, + "grad_norm": 0.19609443843364716, + "learning_rate": 0.00015816756052088957, + "loss": 0.9662, + "step": 3441 + }, + { + "epoch": 1.19, + "grad_norm": 0.19324661791324615, + "learning_rate": 0.00015814503549050002, + "loss": 0.8593, + "step": 3442 + }, + { + "epoch": 1.19, + "grad_norm": 0.19976924359798431, + "learning_rate": 0.00015812250600224636, + "loss": 0.9051, + "step": 3443 + }, + { + "epoch": 1.19, + "grad_norm": 0.19995374977588654, + "learning_rate": 0.0001580999720578559, + "loss": 0.9245, + "step": 3444 + }, + { + "epoch": 1.19, + "grad_norm": 0.1970292627811432, + "learning_rate": 0.00015807743365905627, + "loss": 0.8853, + "step": 3445 + }, + { + "epoch": 1.19, + "grad_norm": 0.19631227850914001, + "learning_rate": 0.00015805489080757544, + "loss": 0.8863, + "step": 3446 + }, + { + "epoch": 1.19, + "grad_norm": 0.20259711146354675, + "learning_rate": 0.00015803234350514176, + "loss": 0.9394, + "step": 3447 + }, + { + "epoch": 1.19, + "grad_norm": 0.21781596541404724, + "learning_rate": 0.00015800979175348382, + "loss": 0.9558, + "step": 3448 + }, + { + "epoch": 1.19, + "grad_norm": 0.20329150557518005, + "learning_rate": 0.00015798723555433068, + "loss": 0.9256, + "step": 3449 + }, + { + "epoch": 1.19, + "grad_norm": 0.18962758779525757, + "learning_rate": 0.00015796467490941164, + "loss": 0.856, + "step": 3450 + }, + { + "epoch": 1.19, + "grad_norm": 0.21290762722492218, + "learning_rate": 0.00015794210982045636, + "loss": 0.9407, + "step": 3451 + }, + { + "epoch": 1.2, + "grad_norm": 0.19620513916015625, + "learning_rate": 0.00015791954028919497, + "loss": 0.9601, + "step": 3452 + }, + { + "epoch": 1.2, + "grad_norm": 0.20747888088226318, + "learning_rate": 0.00015789696631735768, + "loss": 0.9534, + "step": 3453 + }, + { + "epoch": 1.2, + "grad_norm": 0.19664233922958374, + "learning_rate": 0.0001578743879066753, + "loss": 0.9279, + "step": 3454 + }, + { + "epoch": 1.2, + "grad_norm": 0.19570444524288177, + "learning_rate": 0.00015785180505887883, + "loss": 0.9025, + "step": 3455 + }, + { + "epoch": 1.2, + "grad_norm": 0.18608535826206207, + "learning_rate": 0.00015782921777569966, + "loss": 0.8819, + "step": 3456 + }, + { + "epoch": 1.2, + "grad_norm": 0.20291505753993988, + "learning_rate": 0.00015780662605886951, + "loss": 0.9383, + "step": 3457 + }, + { + "epoch": 1.2, + "grad_norm": 0.2000468522310257, + "learning_rate": 0.00015778402991012045, + "loss": 0.9386, + "step": 3458 + }, + { + "epoch": 1.2, + "grad_norm": 0.19044055044651031, + "learning_rate": 0.00015776142933118488, + "loss": 0.9092, + "step": 3459 + }, + { + "epoch": 1.2, + "grad_norm": 0.2033713459968567, + "learning_rate": 0.00015773882432379552, + "loss": 0.9389, + "step": 3460 + }, + { + "epoch": 1.2, + "grad_norm": 0.19457867741584778, + "learning_rate": 0.0001577162148896855, + "loss": 0.8789, + "step": 3461 + }, + { + "epoch": 1.2, + "grad_norm": 0.20580138266086578, + "learning_rate": 0.00015769360103058816, + "loss": 0.904, + "step": 3462 + }, + { + "epoch": 1.2, + "grad_norm": 0.19230179488658905, + "learning_rate": 0.00015767098274823733, + "loss": 0.9108, + "step": 3463 + }, + { + "epoch": 1.2, + "grad_norm": 0.19959823787212372, + "learning_rate": 0.0001576483600443671, + "loss": 0.9708, + "step": 3464 + }, + { + "epoch": 1.2, + "grad_norm": 0.19323502480983734, + "learning_rate": 0.00015762573292071186, + "loss": 0.9289, + "step": 3465 + }, + { + "epoch": 1.2, + "grad_norm": 0.2083415389060974, + "learning_rate": 0.00015760310137900643, + "loss": 1.0455, + "step": 3466 + }, + { + "epoch": 1.2, + "grad_norm": 0.198960542678833, + "learning_rate": 0.00015758046542098595, + "loss": 0.888, + "step": 3467 + }, + { + "epoch": 1.2, + "grad_norm": 0.1912854015827179, + "learning_rate": 0.0001575578250483858, + "loss": 0.9166, + "step": 3468 + }, + { + "epoch": 1.2, + "grad_norm": 0.2046162486076355, + "learning_rate": 0.00015753518026294185, + "loss": 0.9227, + "step": 3469 + }, + { + "epoch": 1.2, + "grad_norm": 0.19570709764957428, + "learning_rate": 0.00015751253106639014, + "loss": 0.8762, + "step": 3470 + }, + { + "epoch": 1.2, + "grad_norm": 0.18846337497234344, + "learning_rate": 0.0001574898774604672, + "loss": 0.9041, + "step": 3471 + }, + { + "epoch": 1.2, + "grad_norm": 0.18992602825164795, + "learning_rate": 0.00015746721944690986, + "loss": 0.9162, + "step": 3472 + }, + { + "epoch": 1.2, + "grad_norm": 0.19614478945732117, + "learning_rate": 0.00015744455702745521, + "loss": 0.8904, + "step": 3473 + }, + { + "epoch": 1.2, + "grad_norm": 0.19449754059314728, + "learning_rate": 0.00015742189020384077, + "loss": 0.9354, + "step": 3474 + }, + { + "epoch": 1.2, + "grad_norm": 0.20135246217250824, + "learning_rate": 0.00015739921897780436, + "loss": 0.9433, + "step": 3475 + }, + { + "epoch": 1.2, + "grad_norm": 0.19418463110923767, + "learning_rate": 0.0001573765433510841, + "loss": 0.8763, + "step": 3476 + }, + { + "epoch": 1.2, + "grad_norm": 0.19569334387779236, + "learning_rate": 0.00015735386332541847, + "loss": 0.8775, + "step": 3477 + }, + { + "epoch": 1.2, + "grad_norm": 0.19247597455978394, + "learning_rate": 0.0001573311789025464, + "loss": 0.8953, + "step": 3478 + }, + { + "epoch": 1.2, + "grad_norm": 0.21302643418312073, + "learning_rate": 0.00015730849008420701, + "loss": 0.9473, + "step": 3479 + }, + { + "epoch": 1.21, + "grad_norm": 0.19832172989845276, + "learning_rate": 0.00015728579687213974, + "loss": 0.9332, + "step": 3480 + }, + { + "epoch": 1.21, + "grad_norm": 0.19264574348926544, + "learning_rate": 0.00015726309926808452, + "loss": 0.9148, + "step": 3481 + }, + { + "epoch": 1.21, + "grad_norm": 0.1975184977054596, + "learning_rate": 0.00015724039727378148, + "loss": 0.8925, + "step": 3482 + }, + { + "epoch": 1.21, + "grad_norm": 0.19813388586044312, + "learning_rate": 0.0001572176908909712, + "loss": 0.9127, + "step": 3483 + }, + { + "epoch": 1.21, + "grad_norm": 0.20585894584655762, + "learning_rate": 0.00015719498012139446, + "loss": 0.9244, + "step": 3484 + }, + { + "epoch": 1.21, + "grad_norm": 0.18802528083324432, + "learning_rate": 0.00015717226496679248, + "loss": 0.8992, + "step": 3485 + }, + { + "epoch": 1.21, + "grad_norm": 0.18946048617362976, + "learning_rate": 0.00015714954542890677, + "loss": 0.9641, + "step": 3486 + }, + { + "epoch": 1.21, + "grad_norm": 0.19644182920455933, + "learning_rate": 0.00015712682150947923, + "loss": 0.9015, + "step": 3487 + }, + { + "epoch": 1.21, + "grad_norm": 0.19160401821136475, + "learning_rate": 0.00015710409321025202, + "loss": 0.9143, + "step": 3488 + }, + { + "epoch": 1.21, + "grad_norm": 0.18982695043087006, + "learning_rate": 0.00015708136053296768, + "loss": 0.8535, + "step": 3489 + }, + { + "epoch": 1.21, + "grad_norm": 0.19489288330078125, + "learning_rate": 0.00015705862347936914, + "loss": 0.9514, + "step": 3490 + }, + { + "epoch": 1.21, + "grad_norm": 0.19520345330238342, + "learning_rate": 0.0001570358820511995, + "loss": 0.9161, + "step": 3491 + }, + { + "epoch": 1.21, + "grad_norm": 0.19223810732364655, + "learning_rate": 0.0001570131362502024, + "loss": 0.8904, + "step": 3492 + }, + { + "epoch": 1.21, + "grad_norm": 0.1916969120502472, + "learning_rate": 0.00015699038607812162, + "loss": 0.8793, + "step": 3493 + }, + { + "epoch": 1.21, + "grad_norm": 0.19017493724822998, + "learning_rate": 0.00015696763153670142, + "loss": 0.8336, + "step": 3494 + }, + { + "epoch": 1.21, + "grad_norm": 0.18902774155139923, + "learning_rate": 0.00015694487262768634, + "loss": 0.8857, + "step": 3495 + }, + { + "epoch": 1.21, + "grad_norm": 0.19431182742118835, + "learning_rate": 0.0001569221093528213, + "loss": 0.87, + "step": 3496 + }, + { + "epoch": 1.21, + "grad_norm": 0.18795667588710785, + "learning_rate": 0.00015689934171385147, + "loss": 0.882, + "step": 3497 + }, + { + "epoch": 1.21, + "grad_norm": 0.1952623426914215, + "learning_rate": 0.00015687656971252238, + "loss": 0.928, + "step": 3498 + }, + { + "epoch": 1.21, + "grad_norm": 0.18606235086917877, + "learning_rate": 0.00015685379335057996, + "loss": 0.8403, + "step": 3499 + }, + { + "epoch": 1.21, + "grad_norm": 0.18356823921203613, + "learning_rate": 0.0001568310126297704, + "loss": 0.8661, + "step": 3500 + }, + { + "epoch": 1.21, + "grad_norm": 0.2000182569026947, + "learning_rate": 0.00015680822755184028, + "loss": 0.9734, + "step": 3501 + }, + { + "epoch": 1.21, + "grad_norm": 0.19715382158756256, + "learning_rate": 0.00015678543811853643, + "loss": 0.8895, + "step": 3502 + }, + { + "epoch": 1.21, + "grad_norm": 0.1973632425069809, + "learning_rate": 0.00015676264433160615, + "loss": 0.9437, + "step": 3503 + }, + { + "epoch": 1.21, + "grad_norm": 0.20098167657852173, + "learning_rate": 0.0001567398461927969, + "loss": 0.8708, + "step": 3504 + }, + { + "epoch": 1.21, + "grad_norm": 0.2057405710220337, + "learning_rate": 0.0001567170437038567, + "loss": 0.9534, + "step": 3505 + }, + { + "epoch": 1.21, + "grad_norm": 0.189925417304039, + "learning_rate": 0.00015669423686653366, + "loss": 0.8764, + "step": 3506 + }, + { + "epoch": 1.21, + "grad_norm": 0.18974903225898743, + "learning_rate": 0.00015667142568257636, + "loss": 0.8952, + "step": 3507 + }, + { + "epoch": 1.22, + "grad_norm": 0.19761645793914795, + "learning_rate": 0.0001566486101537337, + "loss": 0.9287, + "step": 3508 + }, + { + "epoch": 1.22, + "grad_norm": 0.19054748117923737, + "learning_rate": 0.00015662579028175486, + "loss": 0.9112, + "step": 3509 + }, + { + "epoch": 1.22, + "grad_norm": 0.19649845361709595, + "learning_rate": 0.0001566029660683895, + "loss": 0.9309, + "step": 3510 + }, + { + "epoch": 1.22, + "grad_norm": 0.18728889524936676, + "learning_rate": 0.0001565801375153874, + "loss": 0.8955, + "step": 3511 + }, + { + "epoch": 1.22, + "grad_norm": 0.18988218903541565, + "learning_rate": 0.00015655730462449882, + "loss": 0.9138, + "step": 3512 + }, + { + "epoch": 1.22, + "grad_norm": 0.20179392397403717, + "learning_rate": 0.00015653446739747427, + "loss": 0.9328, + "step": 3513 + }, + { + "epoch": 1.22, + "grad_norm": 0.19956812262535095, + "learning_rate": 0.00015651162583606474, + "loss": 0.9143, + "step": 3514 + }, + { + "epoch": 1.22, + "grad_norm": 0.18776121735572815, + "learning_rate": 0.00015648877994202138, + "loss": 0.9249, + "step": 3515 + }, + { + "epoch": 1.22, + "grad_norm": 0.20067352056503296, + "learning_rate": 0.0001564659297170957, + "loss": 0.943, + "step": 3516 + }, + { + "epoch": 1.22, + "grad_norm": 0.1972014605998993, + "learning_rate": 0.00015644307516303964, + "loss": 0.9838, + "step": 3517 + }, + { + "epoch": 1.22, + "grad_norm": 0.1812957376241684, + "learning_rate": 0.0001564202162816054, + "loss": 0.9204, + "step": 3518 + }, + { + "epoch": 1.22, + "grad_norm": 0.20647543668746948, + "learning_rate": 0.0001563973530745455, + "loss": 0.9156, + "step": 3519 + }, + { + "epoch": 1.22, + "grad_norm": 0.20621806383132935, + "learning_rate": 0.00015637448554361286, + "loss": 0.9443, + "step": 3520 + }, + { + "epoch": 1.22, + "grad_norm": 0.19071152806282043, + "learning_rate": 0.00015635161369056066, + "loss": 0.9304, + "step": 3521 + }, + { + "epoch": 1.22, + "grad_norm": 0.18997131288051605, + "learning_rate": 0.00015632873751714245, + "loss": 0.9167, + "step": 3522 + }, + { + "epoch": 1.22, + "grad_norm": 0.1913936585187912, + "learning_rate": 0.0001563058570251121, + "loss": 0.8775, + "step": 3523 + }, + { + "epoch": 1.22, + "grad_norm": 0.20011787116527557, + "learning_rate": 0.00015628297221622378, + "loss": 0.9236, + "step": 3524 + }, + { + "epoch": 1.22, + "grad_norm": 0.20807957649230957, + "learning_rate": 0.00015626008309223207, + "loss": 0.9442, + "step": 3525 + }, + { + "epoch": 1.22, + "grad_norm": 0.19250045716762543, + "learning_rate": 0.0001562371896548918, + "loss": 0.9152, + "step": 3526 + }, + { + "epoch": 1.22, + "grad_norm": 0.19840964674949646, + "learning_rate": 0.0001562142919059582, + "loss": 0.924, + "step": 3527 + }, + { + "epoch": 1.22, + "grad_norm": 0.20426321029663086, + "learning_rate": 0.00015619138984718673, + "loss": 0.9256, + "step": 3528 + }, + { + "epoch": 1.22, + "grad_norm": 0.2006935477256775, + "learning_rate": 0.00015616848348033334, + "loss": 0.9052, + "step": 3529 + }, + { + "epoch": 1.22, + "grad_norm": 0.20013563334941864, + "learning_rate": 0.00015614557280715412, + "loss": 0.9853, + "step": 3530 + }, + { + "epoch": 1.22, + "grad_norm": 0.19713100790977478, + "learning_rate": 0.00015612265782940566, + "loss": 0.8925, + "step": 3531 + }, + { + "epoch": 1.22, + "grad_norm": 0.19918465614318848, + "learning_rate": 0.00015609973854884473, + "loss": 0.9249, + "step": 3532 + }, + { + "epoch": 1.22, + "grad_norm": 0.1967756599187851, + "learning_rate": 0.0001560768149672286, + "loss": 0.9007, + "step": 3533 + }, + { + "epoch": 1.22, + "grad_norm": 0.18599602580070496, + "learning_rate": 0.0001560538870863147, + "loss": 0.8715, + "step": 3534 + }, + { + "epoch": 1.22, + "grad_norm": 0.19670726358890533, + "learning_rate": 0.0001560309549078609, + "loss": 0.9482, + "step": 3535 + }, + { + "epoch": 1.22, + "grad_norm": 0.1904277354478836, + "learning_rate": 0.00015600801843362535, + "loss": 0.9127, + "step": 3536 + }, + { + "epoch": 1.23, + "grad_norm": 0.1943923681974411, + "learning_rate": 0.00015598507766536655, + "loss": 0.9423, + "step": 3537 + }, + { + "epoch": 1.23, + "grad_norm": 0.20839069783687592, + "learning_rate": 0.0001559621326048433, + "loss": 0.9427, + "step": 3538 + }, + { + "epoch": 1.23, + "grad_norm": 0.19504216313362122, + "learning_rate": 0.0001559391832538148, + "loss": 0.9384, + "step": 3539 + }, + { + "epoch": 1.23, + "grad_norm": 0.19287019968032837, + "learning_rate": 0.0001559162296140405, + "loss": 0.9277, + "step": 3540 + }, + { + "epoch": 1.23, + "grad_norm": 0.19911225140094757, + "learning_rate": 0.00015589327168728022, + "loss": 0.8505, + "step": 3541 + }, + { + "epoch": 1.23, + "grad_norm": 0.19432951509952545, + "learning_rate": 0.00015587030947529412, + "loss": 0.9193, + "step": 3542 + }, + { + "epoch": 1.23, + "grad_norm": 0.20329613983631134, + "learning_rate": 0.00015584734297984258, + "loss": 0.9654, + "step": 3543 + }, + { + "epoch": 1.23, + "grad_norm": 0.19105875492095947, + "learning_rate": 0.00015582437220268647, + "loss": 0.9219, + "step": 3544 + }, + { + "epoch": 1.23, + "grad_norm": 0.1942165344953537, + "learning_rate": 0.00015580139714558693, + "loss": 0.9053, + "step": 3545 + }, + { + "epoch": 1.23, + "grad_norm": 0.1945033073425293, + "learning_rate": 0.00015577841781030537, + "loss": 0.9851, + "step": 3546 + }, + { + "epoch": 1.23, + "grad_norm": 0.19224001467227936, + "learning_rate": 0.00015575543419860357, + "loss": 0.9162, + "step": 3547 + }, + { + "epoch": 1.23, + "grad_norm": 0.1970493197441101, + "learning_rate": 0.00015573244631224365, + "loss": 0.9324, + "step": 3548 + }, + { + "epoch": 1.23, + "grad_norm": 0.2003912329673767, + "learning_rate": 0.00015570945415298803, + "loss": 0.8751, + "step": 3549 + }, + { + "epoch": 1.23, + "grad_norm": 0.21274451911449432, + "learning_rate": 0.0001556864577225995, + "loss": 1.0118, + "step": 3550 + }, + { + "epoch": 1.23, + "eval_loss": 0.9446007609367371, + "eval_runtime": 679.1052, + "eval_samples_per_second": 10.125, + "eval_steps_per_second": 5.063, + "step": 3550 + }, + { + "epoch": 1.23, + "grad_norm": 0.18957918882369995, + "learning_rate": 0.0001556634570228412, + "loss": 0.9222, + "step": 3551 + }, + { + "epoch": 1.23, + "grad_norm": 0.1931392103433609, + "learning_rate": 0.0001556404520554764, + "loss": 0.9261, + "step": 3552 + }, + { + "epoch": 1.23, + "grad_norm": 0.19815470278263092, + "learning_rate": 0.00015561744282226896, + "loss": 0.9156, + "step": 3553 + }, + { + "epoch": 1.23, + "grad_norm": 0.21058355271816254, + "learning_rate": 0.00015559442932498293, + "loss": 0.9234, + "step": 3554 + }, + { + "epoch": 1.23, + "grad_norm": 0.20303329825401306, + "learning_rate": 0.00015557141156538268, + "loss": 0.9156, + "step": 3555 + }, + { + "epoch": 1.23, + "grad_norm": 0.1917152851819992, + "learning_rate": 0.000155548389545233, + "loss": 0.88, + "step": 3556 + }, + { + "epoch": 1.23, + "grad_norm": 0.19306671619415283, + "learning_rate": 0.00015552536326629888, + "loss": 0.8316, + "step": 3557 + }, + { + "epoch": 1.23, + "grad_norm": 0.19891008734703064, + "learning_rate": 0.00015550233273034568, + "loss": 0.9051, + "step": 3558 + }, + { + "epoch": 1.23, + "grad_norm": 0.21156953275203705, + "learning_rate": 0.0001554792979391392, + "loss": 0.9795, + "step": 3559 + }, + { + "epoch": 1.23, + "grad_norm": 0.19550539553165436, + "learning_rate": 0.0001554562588944454, + "loss": 0.9283, + "step": 3560 + }, + { + "epoch": 1.23, + "grad_norm": 0.19516271352767944, + "learning_rate": 0.00015543321559803065, + "loss": 0.9426, + "step": 3561 + }, + { + "epoch": 1.23, + "grad_norm": 0.20502054691314697, + "learning_rate": 0.00015541016805166162, + "loss": 0.9175, + "step": 3562 + }, + { + "epoch": 1.23, + "grad_norm": 0.18738922476768494, + "learning_rate": 0.00015538711625710536, + "loss": 0.8853, + "step": 3563 + }, + { + "epoch": 1.23, + "grad_norm": 0.19391462206840515, + "learning_rate": 0.00015536406021612916, + "loss": 0.9396, + "step": 3564 + }, + { + "epoch": 1.24, + "grad_norm": 0.19468341767787933, + "learning_rate": 0.00015534099993050072, + "loss": 0.9577, + "step": 3565 + }, + { + "epoch": 1.24, + "grad_norm": 0.19213582575321198, + "learning_rate": 0.00015531793540198802, + "loss": 0.8845, + "step": 3566 + }, + { + "epoch": 1.24, + "grad_norm": 0.19879239797592163, + "learning_rate": 0.00015529486663235935, + "loss": 0.9431, + "step": 3567 + }, + { + "epoch": 1.24, + "grad_norm": 0.19197018444538116, + "learning_rate": 0.00015527179362338333, + "loss": 0.9384, + "step": 3568 + }, + { + "epoch": 1.24, + "grad_norm": 0.19179058074951172, + "learning_rate": 0.00015524871637682898, + "loss": 0.8971, + "step": 3569 + }, + { + "epoch": 1.24, + "grad_norm": 0.1983899474143982, + "learning_rate": 0.00015522563489446552, + "loss": 0.8992, + "step": 3570 + }, + { + "epoch": 1.24, + "grad_norm": 0.18999382853507996, + "learning_rate": 0.00015520254917806265, + "loss": 0.9508, + "step": 3571 + }, + { + "epoch": 1.24, + "grad_norm": 0.20845505595207214, + "learning_rate": 0.00015517945922939025, + "loss": 0.9412, + "step": 3572 + }, + { + "epoch": 1.24, + "grad_norm": 0.1960909217596054, + "learning_rate": 0.00015515636505021853, + "loss": 0.9537, + "step": 3573 + }, + { + "epoch": 1.24, + "grad_norm": 0.2100881189107895, + "learning_rate": 0.00015513326664231815, + "loss": 0.962, + "step": 3574 + }, + { + "epoch": 1.24, + "grad_norm": 0.19650724530220032, + "learning_rate": 0.00015511016400746, + "loss": 0.9683, + "step": 3575 + }, + { + "epoch": 1.24, + "grad_norm": 0.19711832702159882, + "learning_rate": 0.00015508705714741531, + "loss": 0.8819, + "step": 3576 + }, + { + "epoch": 1.24, + "grad_norm": 0.1942337453365326, + "learning_rate": 0.00015506394606395568, + "loss": 0.9437, + "step": 3577 + }, + { + "epoch": 1.24, + "grad_norm": 0.21626056730747223, + "learning_rate": 0.00015504083075885288, + "loss": 0.9474, + "step": 3578 + }, + { + "epoch": 1.24, + "grad_norm": 0.20113731920719147, + "learning_rate": 0.00015501771123387922, + "loss": 0.9172, + "step": 3579 + }, + { + "epoch": 1.24, + "grad_norm": 0.1971191167831421, + "learning_rate": 0.0001549945874908072, + "loss": 0.8949, + "step": 3580 + }, + { + "epoch": 1.24, + "grad_norm": 0.20610153675079346, + "learning_rate": 0.00015497145953140962, + "loss": 0.9802, + "step": 3581 + }, + { + "epoch": 1.24, + "grad_norm": 0.19442158937454224, + "learning_rate": 0.0001549483273574597, + "loss": 0.9386, + "step": 3582 + }, + { + "epoch": 1.24, + "grad_norm": 0.19450068473815918, + "learning_rate": 0.00015492519097073097, + "loss": 1.0089, + "step": 3583 + }, + { + "epoch": 1.24, + "grad_norm": 0.19892315566539764, + "learning_rate": 0.0001549020503729972, + "loss": 0.9811, + "step": 3584 + }, + { + "epoch": 1.24, + "grad_norm": 0.19469180703163147, + "learning_rate": 0.00015487890556603254, + "loss": 0.9294, + "step": 3585 + }, + { + "epoch": 1.24, + "grad_norm": 0.2058008313179016, + "learning_rate": 0.00015485575655161147, + "loss": 0.9309, + "step": 3586 + }, + { + "epoch": 1.24, + "grad_norm": 0.19313830137252808, + "learning_rate": 0.0001548326033315088, + "loss": 0.9438, + "step": 3587 + }, + { + "epoch": 1.24, + "grad_norm": 0.19705910980701447, + "learning_rate": 0.00015480944590749954, + "loss": 0.9313, + "step": 3588 + }, + { + "epoch": 1.24, + "grad_norm": 0.19770359992980957, + "learning_rate": 0.00015478628428135924, + "loss": 0.927, + "step": 3589 + }, + { + "epoch": 1.24, + "grad_norm": 0.20620097219944, + "learning_rate": 0.00015476311845486363, + "loss": 0.962, + "step": 3590 + }, + { + "epoch": 1.24, + "grad_norm": 0.2014746069908142, + "learning_rate": 0.00015473994842978874, + "loss": 0.8598, + "step": 3591 + }, + { + "epoch": 1.24, + "grad_norm": 0.18861185014247894, + "learning_rate": 0.00015471677420791102, + "loss": 0.9241, + "step": 3592 + }, + { + "epoch": 1.25, + "grad_norm": 0.19452114403247833, + "learning_rate": 0.00015469359579100718, + "loss": 0.9356, + "step": 3593 + }, + { + "epoch": 1.25, + "grad_norm": 0.19815759360790253, + "learning_rate": 0.00015467041318085423, + "loss": 0.9815, + "step": 3594 + }, + { + "epoch": 1.25, + "grad_norm": 0.19805648922920227, + "learning_rate": 0.00015464722637922957, + "loss": 0.9215, + "step": 3595 + }, + { + "epoch": 1.25, + "grad_norm": 0.1998823881149292, + "learning_rate": 0.00015462403538791088, + "loss": 0.9155, + "step": 3596 + }, + { + "epoch": 1.25, + "grad_norm": 0.197417750954628, + "learning_rate": 0.00015460084020867617, + "loss": 0.8845, + "step": 3597 + }, + { + "epoch": 1.25, + "grad_norm": 0.20399829745292664, + "learning_rate": 0.00015457764084330375, + "loss": 0.9541, + "step": 3598 + }, + { + "epoch": 1.25, + "grad_norm": 0.20391467213630676, + "learning_rate": 0.00015455443729357228, + "loss": 0.8704, + "step": 3599 + }, + { + "epoch": 1.25, + "grad_norm": 0.20385457575321198, + "learning_rate": 0.00015453122956126075, + "loss": 0.9182, + "step": 3600 + }, + { + "epoch": 1.25, + "grad_norm": 0.19763916730880737, + "learning_rate": 0.00015450801764814838, + "loss": 0.9316, + "step": 3601 + }, + { + "epoch": 1.25, + "grad_norm": 0.19183963537216187, + "learning_rate": 0.00015448480155601487, + "loss": 0.9035, + "step": 3602 + }, + { + "epoch": 1.25, + "grad_norm": 0.20232851803302765, + "learning_rate": 0.00015446158128664013, + "loss": 0.9119, + "step": 3603 + }, + { + "epoch": 1.25, + "grad_norm": 0.19677840173244476, + "learning_rate": 0.00015443835684180436, + "loss": 0.9383, + "step": 3604 + }, + { + "epoch": 1.25, + "grad_norm": 0.1975477635860443, + "learning_rate": 0.0001544151282232882, + "loss": 0.8994, + "step": 3605 + }, + { + "epoch": 1.25, + "grad_norm": 0.19218041002750397, + "learning_rate": 0.00015439189543287247, + "loss": 0.904, + "step": 3606 + }, + { + "epoch": 1.25, + "grad_norm": 0.18120858073234558, + "learning_rate": 0.00015436865847233845, + "loss": 0.8842, + "step": 3607 + }, + { + "epoch": 1.25, + "grad_norm": 0.19843339920043945, + "learning_rate": 0.00015434541734346766, + "loss": 0.9324, + "step": 3608 + }, + { + "epoch": 1.25, + "grad_norm": 0.1892959624528885, + "learning_rate": 0.0001543221720480419, + "loss": 0.9368, + "step": 3609 + }, + { + "epoch": 1.25, + "grad_norm": 0.1868380457162857, + "learning_rate": 0.00015429892258784337, + "loss": 0.9239, + "step": 3610 + }, + { + "epoch": 1.25, + "grad_norm": 0.19214345514774323, + "learning_rate": 0.00015427566896465458, + "loss": 0.8936, + "step": 3611 + }, + { + "epoch": 1.25, + "grad_norm": 0.20086589455604553, + "learning_rate": 0.00015425241118025834, + "loss": 0.9443, + "step": 3612 + }, + { + "epoch": 1.25, + "grad_norm": 0.19924329221248627, + "learning_rate": 0.00015422914923643772, + "loss": 0.8946, + "step": 3613 + }, + { + "epoch": 1.25, + "grad_norm": 0.18544204533100128, + "learning_rate": 0.00015420588313497623, + "loss": 0.8469, + "step": 3614 + }, + { + "epoch": 1.25, + "grad_norm": 0.20435726642608643, + "learning_rate": 0.0001541826128776576, + "loss": 1.0018, + "step": 3615 + }, + { + "epoch": 1.25, + "grad_norm": 0.1996217519044876, + "learning_rate": 0.00015415933846626593, + "loss": 0.9117, + "step": 3616 + }, + { + "epoch": 1.25, + "grad_norm": 0.1999591886997223, + "learning_rate": 0.00015413605990258566, + "loss": 0.9222, + "step": 3617 + }, + { + "epoch": 1.25, + "grad_norm": 0.19460801780223846, + "learning_rate": 0.00015411277718840145, + "loss": 0.9324, + "step": 3618 + }, + { + "epoch": 1.25, + "grad_norm": 0.20450779795646667, + "learning_rate": 0.00015408949032549835, + "loss": 0.9441, + "step": 3619 + }, + { + "epoch": 1.25, + "grad_norm": 0.20513708889484406, + "learning_rate": 0.00015406619931566172, + "loss": 0.9168, + "step": 3620 + }, + { + "epoch": 1.25, + "grad_norm": 0.19295939803123474, + "learning_rate": 0.00015404290416067725, + "loss": 0.9253, + "step": 3621 + }, + { + "epoch": 1.26, + "grad_norm": 0.1939520537853241, + "learning_rate": 0.00015401960486233093, + "loss": 0.886, + "step": 3622 + }, + { + "epoch": 1.26, + "grad_norm": 0.20039387047290802, + "learning_rate": 0.0001539963014224091, + "loss": 0.9371, + "step": 3623 + }, + { + "epoch": 1.26, + "grad_norm": 0.19236010313034058, + "learning_rate": 0.00015397299384269832, + "loss": 0.9098, + "step": 3624 + }, + { + "epoch": 1.26, + "grad_norm": 0.20198063552379608, + "learning_rate": 0.00015394968212498555, + "loss": 0.9768, + "step": 3625 + }, + { + "epoch": 1.26, + "grad_norm": 0.1999257355928421, + "learning_rate": 0.0001539263662710581, + "loss": 0.8571, + "step": 3626 + }, + { + "epoch": 1.26, + "grad_norm": 0.1928747296333313, + "learning_rate": 0.00015390304628270355, + "loss": 0.9619, + "step": 3627 + }, + { + "epoch": 1.26, + "grad_norm": 0.2093496322631836, + "learning_rate": 0.00015387972216170977, + "loss": 0.9477, + "step": 3628 + }, + { + "epoch": 1.26, + "grad_norm": 0.2011002153158188, + "learning_rate": 0.00015385639390986494, + "loss": 0.907, + "step": 3629 + }, + { + "epoch": 1.26, + "grad_norm": 0.19310890138149261, + "learning_rate": 0.00015383306152895766, + "loss": 0.9346, + "step": 3630 + }, + { + "epoch": 1.26, + "grad_norm": 0.20581993460655212, + "learning_rate": 0.0001538097250207767, + "loss": 0.9707, + "step": 3631 + }, + { + "epoch": 1.26, + "grad_norm": 0.1893724799156189, + "learning_rate": 0.00015378638438711133, + "loss": 0.9351, + "step": 3632 + }, + { + "epoch": 1.26, + "grad_norm": 0.18906715512275696, + "learning_rate": 0.0001537630396297509, + "loss": 0.9621, + "step": 3633 + }, + { + "epoch": 1.26, + "grad_norm": 0.1960383504629135, + "learning_rate": 0.00015373969075048533, + "loss": 0.934, + "step": 3634 + }, + { + "epoch": 1.26, + "grad_norm": 0.197191521525383, + "learning_rate": 0.00015371633775110462, + "loss": 0.945, + "step": 3635 + }, + { + "epoch": 1.26, + "grad_norm": 0.18677780032157898, + "learning_rate": 0.00015369298063339933, + "loss": 0.8289, + "step": 3636 + }, + { + "epoch": 1.26, + "grad_norm": 0.20285886526107788, + "learning_rate": 0.00015366961939916008, + "loss": 0.9173, + "step": 3637 + }, + { + "epoch": 1.26, + "grad_norm": 0.1963968425989151, + "learning_rate": 0.00015364625405017796, + "loss": 0.9221, + "step": 3638 + }, + { + "epoch": 1.26, + "grad_norm": 0.19961443543434143, + "learning_rate": 0.00015362288458824438, + "loss": 0.9478, + "step": 3639 + }, + { + "epoch": 1.26, + "grad_norm": 0.2008897364139557, + "learning_rate": 0.000153599511015151, + "loss": 0.882, + "step": 3640 + }, + { + "epoch": 1.26, + "grad_norm": 0.20071753859519958, + "learning_rate": 0.00015357613333268985, + "loss": 0.9574, + "step": 3641 + }, + { + "epoch": 1.26, + "grad_norm": 0.19547533988952637, + "learning_rate": 0.00015355275154265322, + "loss": 0.9415, + "step": 3642 + }, + { + "epoch": 1.26, + "grad_norm": 0.20526045560836792, + "learning_rate": 0.0001535293656468338, + "loss": 0.9546, + "step": 3643 + }, + { + "epoch": 1.26, + "grad_norm": 0.19212369620800018, + "learning_rate": 0.00015350597564702448, + "loss": 0.9417, + "step": 3644 + }, + { + "epoch": 1.26, + "grad_norm": 0.20225495100021362, + "learning_rate": 0.00015348258154501853, + "loss": 0.9573, + "step": 3645 + }, + { + "epoch": 1.26, + "grad_norm": 0.19001320004463196, + "learning_rate": 0.00015345918334260956, + "loss": 0.917, + "step": 3646 + }, + { + "epoch": 1.26, + "grad_norm": 0.20381398499011993, + "learning_rate": 0.00015343578104159145, + "loss": 0.9687, + "step": 3647 + }, + { + "epoch": 1.26, + "grad_norm": 0.19656528532505035, + "learning_rate": 0.00015341237464375844, + "loss": 0.9432, + "step": 3648 + }, + { + "epoch": 1.26, + "grad_norm": 0.18959979712963104, + "learning_rate": 0.000153388964150905, + "loss": 0.9259, + "step": 3649 + }, + { + "epoch": 1.27, + "grad_norm": 0.20691464841365814, + "learning_rate": 0.00015336554956482594, + "loss": 0.9746, + "step": 3650 + }, + { + "epoch": 1.27, + "grad_norm": 0.1913762390613556, + "learning_rate": 0.0001533421308873165, + "loss": 0.9936, + "step": 3651 + }, + { + "epoch": 1.27, + "grad_norm": 0.20307478308677673, + "learning_rate": 0.0001533187081201721, + "loss": 0.9708, + "step": 3652 + }, + { + "epoch": 1.27, + "grad_norm": 0.19943755865097046, + "learning_rate": 0.00015329528126518848, + "loss": 0.8927, + "step": 3653 + }, + { + "epoch": 1.27, + "grad_norm": 0.19460420310497284, + "learning_rate": 0.0001532718503241618, + "loss": 0.8652, + "step": 3654 + }, + { + "epoch": 1.27, + "grad_norm": 0.20158496499061584, + "learning_rate": 0.0001532484152988884, + "loss": 0.9135, + "step": 3655 + }, + { + "epoch": 1.27, + "grad_norm": 0.20068541169166565, + "learning_rate": 0.00015322497619116505, + "loss": 0.8876, + "step": 3656 + }, + { + "epoch": 1.27, + "grad_norm": 0.3100888729095459, + "learning_rate": 0.00015320153300278877, + "loss": 0.8805, + "step": 3657 + }, + { + "epoch": 1.27, + "grad_norm": 0.19288921356201172, + "learning_rate": 0.00015317808573555683, + "loss": 1.0011, + "step": 3658 + }, + { + "epoch": 1.27, + "grad_norm": 0.19910819828510284, + "learning_rate": 0.000153154634391267, + "loss": 0.9088, + "step": 3659 + }, + { + "epoch": 1.27, + "grad_norm": 0.1915523111820221, + "learning_rate": 0.00015313117897171717, + "loss": 0.9092, + "step": 3660 + }, + { + "epoch": 1.27, + "grad_norm": 0.19844427704811096, + "learning_rate": 0.00015310771947870565, + "loss": 0.9244, + "step": 3661 + }, + { + "epoch": 1.27, + "grad_norm": 0.19637028872966766, + "learning_rate": 0.00015308425591403104, + "loss": 0.9569, + "step": 3662 + }, + { + "epoch": 1.27, + "grad_norm": 0.1928602159023285, + "learning_rate": 0.0001530607882794922, + "loss": 0.9344, + "step": 3663 + }, + { + "epoch": 1.27, + "grad_norm": 0.18807289004325867, + "learning_rate": 0.0001530373165768884, + "loss": 0.922, + "step": 3664 + }, + { + "epoch": 1.27, + "grad_norm": 0.21626749634742737, + "learning_rate": 0.0001530138408080191, + "loss": 1.0031, + "step": 3665 + }, + { + "epoch": 1.27, + "grad_norm": 0.20268388092517853, + "learning_rate": 0.0001529903609746842, + "loss": 0.9478, + "step": 3666 + }, + { + "epoch": 1.27, + "grad_norm": 0.20564964413642883, + "learning_rate": 0.0001529668770786839, + "loss": 0.8934, + "step": 3667 + }, + { + "epoch": 1.27, + "grad_norm": 0.20551858842372894, + "learning_rate": 0.0001529433891218185, + "loss": 0.9278, + "step": 3668 + }, + { + "epoch": 1.27, + "grad_norm": 0.1947879195213318, + "learning_rate": 0.00015291989710588898, + "loss": 0.7883, + "step": 3669 + }, + { + "epoch": 1.27, + "grad_norm": 0.20598158240318298, + "learning_rate": 0.00015289640103269625, + "loss": 0.9217, + "step": 3670 + }, + { + "epoch": 1.27, + "grad_norm": 0.1998508870601654, + "learning_rate": 0.0001528729009040418, + "loss": 0.9446, + "step": 3671 + }, + { + "epoch": 1.27, + "grad_norm": 0.1991170197725296, + "learning_rate": 0.00015284939672172731, + "loss": 0.8942, + "step": 3672 + }, + { + "epoch": 1.27, + "grad_norm": 0.19294770061969757, + "learning_rate": 0.0001528258884875548, + "loss": 0.9282, + "step": 3673 + }, + { + "epoch": 1.27, + "grad_norm": 0.1996181309223175, + "learning_rate": 0.00015280237620332663, + "loss": 0.8971, + "step": 3674 + }, + { + "epoch": 1.27, + "grad_norm": 0.19610020518302917, + "learning_rate": 0.0001527788598708454, + "loss": 0.9225, + "step": 3675 + }, + { + "epoch": 1.27, + "grad_norm": 0.19014745950698853, + "learning_rate": 0.0001527553394919141, + "loss": 0.8461, + "step": 3676 + }, + { + "epoch": 1.27, + "grad_norm": 0.19598208367824554, + "learning_rate": 0.00015273181506833593, + "loss": 0.9787, + "step": 3677 + }, + { + "epoch": 1.27, + "grad_norm": 0.2032850831747055, + "learning_rate": 0.00015270828660191453, + "loss": 0.9879, + "step": 3678 + }, + { + "epoch": 1.28, + "grad_norm": 0.1870907098054886, + "learning_rate": 0.00015268475409445373, + "loss": 0.8367, + "step": 3679 + }, + { + "epoch": 1.28, + "grad_norm": 0.19532081484794617, + "learning_rate": 0.00015266121754775773, + "loss": 0.8911, + "step": 3680 + }, + { + "epoch": 1.28, + "grad_norm": 0.194292813539505, + "learning_rate": 0.00015263767696363105, + "loss": 0.9161, + "step": 3681 + }, + { + "epoch": 1.28, + "grad_norm": 0.19262300431728363, + "learning_rate": 0.0001526141323438785, + "loss": 0.9105, + "step": 3682 + }, + { + "epoch": 1.28, + "grad_norm": 0.2018946409225464, + "learning_rate": 0.00015259058369030516, + "loss": 0.938, + "step": 3683 + }, + { + "epoch": 1.28, + "grad_norm": 0.1982242912054062, + "learning_rate": 0.00015256703100471653, + "loss": 0.9816, + "step": 3684 + }, + { + "epoch": 1.28, + "grad_norm": 0.19996307790279388, + "learning_rate": 0.00015254347428891825, + "loss": 0.9292, + "step": 3685 + }, + { + "epoch": 1.28, + "grad_norm": 0.18849660456180573, + "learning_rate": 0.00015251991354471643, + "loss": 0.8891, + "step": 3686 + }, + { + "epoch": 1.28, + "grad_norm": 0.1935540735721588, + "learning_rate": 0.00015249634877391742, + "loss": 0.9042, + "step": 3687 + }, + { + "epoch": 1.28, + "grad_norm": 0.20464152097702026, + "learning_rate": 0.00015247277997832787, + "loss": 0.9383, + "step": 3688 + }, + { + "epoch": 1.28, + "grad_norm": 0.19373281300067902, + "learning_rate": 0.00015244920715975482, + "loss": 0.8985, + "step": 3689 + }, + { + "epoch": 1.28, + "grad_norm": 0.2008526623249054, + "learning_rate": 0.00015242563032000545, + "loss": 0.8887, + "step": 3690 + }, + { + "epoch": 1.28, + "grad_norm": 0.21022352576255798, + "learning_rate": 0.00015240204946088737, + "loss": 0.9097, + "step": 3691 + }, + { + "epoch": 1.28, + "grad_norm": 0.19161288440227509, + "learning_rate": 0.00015237846458420853, + "loss": 0.9256, + "step": 3692 + }, + { + "epoch": 1.28, + "grad_norm": 0.2000376433134079, + "learning_rate": 0.00015235487569177707, + "loss": 0.8774, + "step": 3693 + }, + { + "epoch": 1.28, + "grad_norm": 0.19469022750854492, + "learning_rate": 0.00015233128278540158, + "loss": 0.8713, + "step": 3694 + }, + { + "epoch": 1.28, + "grad_norm": 0.19838149845600128, + "learning_rate": 0.00015230768586689086, + "loss": 0.9343, + "step": 3695 + }, + { + "epoch": 1.28, + "grad_norm": 0.193618044257164, + "learning_rate": 0.00015228408493805397, + "loss": 0.8804, + "step": 3696 + }, + { + "epoch": 1.28, + "grad_norm": 0.20271454751491547, + "learning_rate": 0.0001522604800007004, + "loss": 0.9614, + "step": 3697 + }, + { + "epoch": 1.28, + "grad_norm": 0.1923941969871521, + "learning_rate": 0.0001522368710566399, + "loss": 0.9059, + "step": 3698 + }, + { + "epoch": 1.28, + "grad_norm": 0.20286774635314941, + "learning_rate": 0.0001522132581076825, + "loss": 0.8956, + "step": 3699 + }, + { + "epoch": 1.28, + "grad_norm": 0.19995754957199097, + "learning_rate": 0.0001521896411556386, + "loss": 0.9006, + "step": 3700 + }, + { + "epoch": 1.28, + "grad_norm": 0.19672347605228424, + "learning_rate": 0.0001521660202023188, + "loss": 0.9199, + "step": 3701 + }, + { + "epoch": 1.28, + "grad_norm": 0.19233226776123047, + "learning_rate": 0.00015214239524953408, + "loss": 0.8852, + "step": 3702 + }, + { + "epoch": 1.28, + "grad_norm": 0.2116343080997467, + "learning_rate": 0.00015211876629909576, + "loss": 0.9474, + "step": 3703 + }, + { + "epoch": 1.28, + "grad_norm": 0.18998068571090698, + "learning_rate": 0.00015209513335281543, + "loss": 0.8806, + "step": 3704 + }, + { + "epoch": 1.28, + "grad_norm": 0.20243072509765625, + "learning_rate": 0.00015207149641250495, + "loss": 0.9505, + "step": 3705 + }, + { + "epoch": 1.28, + "grad_norm": 0.19964581727981567, + "learning_rate": 0.00015204785547997648, + "loss": 0.9438, + "step": 3706 + }, + { + "epoch": 1.29, + "grad_norm": 0.19266873598098755, + "learning_rate": 0.00015202421055704262, + "loss": 0.8944, + "step": 3707 + }, + { + "epoch": 1.29, + "grad_norm": 0.20089896023273468, + "learning_rate": 0.00015200056164551607, + "loss": 0.9743, + "step": 3708 + }, + { + "epoch": 1.29, + "grad_norm": 0.19461996853351593, + "learning_rate": 0.00015197690874721003, + "loss": 0.9625, + "step": 3709 + }, + { + "epoch": 1.29, + "grad_norm": 0.22794750332832336, + "learning_rate": 0.00015195325186393794, + "loss": 1.0161, + "step": 3710 + }, + { + "epoch": 1.29, + "grad_norm": 0.20262105762958527, + "learning_rate": 0.00015192959099751343, + "loss": 0.9498, + "step": 3711 + }, + { + "epoch": 1.29, + "grad_norm": 0.19459131360054016, + "learning_rate": 0.0001519059261497506, + "loss": 0.8988, + "step": 3712 + }, + { + "epoch": 1.29, + "grad_norm": 0.2000139057636261, + "learning_rate": 0.00015188225732246373, + "loss": 0.9255, + "step": 3713 + }, + { + "epoch": 1.29, + "grad_norm": 0.19961777329444885, + "learning_rate": 0.00015185858451746752, + "loss": 0.9512, + "step": 3714 + }, + { + "epoch": 1.29, + "grad_norm": 0.2007269263267517, + "learning_rate": 0.0001518349077365769, + "loss": 1.0035, + "step": 3715 + }, + { + "epoch": 1.29, + "grad_norm": 0.18371418118476868, + "learning_rate": 0.00015181122698160714, + "loss": 0.9236, + "step": 3716 + }, + { + "epoch": 1.29, + "grad_norm": 0.199965700507164, + "learning_rate": 0.00015178754225437372, + "loss": 0.911, + "step": 3717 + }, + { + "epoch": 1.29, + "grad_norm": 0.19855192303657532, + "learning_rate": 0.00015176385355669255, + "loss": 0.9889, + "step": 3718 + }, + { + "epoch": 1.29, + "grad_norm": 0.1886325627565384, + "learning_rate": 0.00015174016089037983, + "loss": 0.9105, + "step": 3719 + }, + { + "epoch": 1.29, + "grad_norm": 0.1962376832962036, + "learning_rate": 0.00015171646425725204, + "loss": 0.9684, + "step": 3720 + }, + { + "epoch": 1.29, + "grad_norm": 0.20075201988220215, + "learning_rate": 0.00015169276365912583, + "loss": 0.8843, + "step": 3721 + }, + { + "epoch": 1.29, + "grad_norm": 0.2069513201713562, + "learning_rate": 0.0001516690590978184, + "loss": 0.9751, + "step": 3722 + }, + { + "epoch": 1.29, + "grad_norm": 0.20042923092842102, + "learning_rate": 0.0001516453505751471, + "loss": 0.9644, + "step": 3723 + }, + { + "epoch": 1.29, + "grad_norm": 0.19810305535793304, + "learning_rate": 0.00015162163809292957, + "loss": 0.9473, + "step": 3724 + }, + { + "epoch": 1.29, + "grad_norm": 0.20136240124702454, + "learning_rate": 0.00015159792165298386, + "loss": 0.9029, + "step": 3725 + }, + { + "epoch": 1.29, + "grad_norm": 0.19834771752357483, + "learning_rate": 0.00015157420125712825, + "loss": 0.9343, + "step": 3726 + }, + { + "epoch": 1.29, + "grad_norm": 0.19727087020874023, + "learning_rate": 0.0001515504769071813, + "loss": 0.901, + "step": 3727 + }, + { + "epoch": 1.29, + "grad_norm": 0.19946539402008057, + "learning_rate": 0.00015152674860496195, + "loss": 0.9179, + "step": 3728 + }, + { + "epoch": 1.29, + "grad_norm": 0.19287483394145966, + "learning_rate": 0.00015150301635228936, + "loss": 0.9113, + "step": 3729 + }, + { + "epoch": 1.29, + "grad_norm": 0.19016975164413452, + "learning_rate": 0.0001514792801509831, + "loss": 0.9222, + "step": 3730 + }, + { + "epoch": 1.29, + "grad_norm": 0.19780956208705902, + "learning_rate": 0.0001514555400028629, + "loss": 0.9618, + "step": 3731 + }, + { + "epoch": 1.29, + "grad_norm": 0.19044287502765656, + "learning_rate": 0.0001514317959097489, + "loss": 0.858, + "step": 3732 + }, + { + "epoch": 1.29, + "grad_norm": 0.1943211555480957, + "learning_rate": 0.00015140804787346153, + "loss": 0.8965, + "step": 3733 + }, + { + "epoch": 1.29, + "grad_norm": 0.2054436206817627, + "learning_rate": 0.00015138429589582148, + "loss": 0.9122, + "step": 3734 + }, + { + "epoch": 1.3, + "grad_norm": 0.2009715586900711, + "learning_rate": 0.00015136053997864983, + "loss": 0.8797, + "step": 3735 + }, + { + "epoch": 1.3, + "grad_norm": 0.18632814288139343, + "learning_rate": 0.00015133678012376777, + "loss": 0.8947, + "step": 3736 + }, + { + "epoch": 1.3, + "grad_norm": 0.2046545445919037, + "learning_rate": 0.00015131301633299705, + "loss": 0.9694, + "step": 3737 + }, + { + "epoch": 1.3, + "grad_norm": 0.18827247619628906, + "learning_rate": 0.00015128924860815954, + "loss": 0.9236, + "step": 3738 + }, + { + "epoch": 1.3, + "grad_norm": 0.19354717433452606, + "learning_rate": 0.0001512654769510774, + "loss": 0.9056, + "step": 3739 + }, + { + "epoch": 1.3, + "grad_norm": 0.20237867534160614, + "learning_rate": 0.00015124170136357332, + "loss": 0.8638, + "step": 3740 + }, + { + "epoch": 1.3, + "grad_norm": 0.19454175233840942, + "learning_rate": 0.00015121792184746998, + "loss": 0.9331, + "step": 3741 + }, + { + "epoch": 1.3, + "grad_norm": 0.19441847503185272, + "learning_rate": 0.00015119413840459055, + "loss": 0.9276, + "step": 3742 + }, + { + "epoch": 1.3, + "grad_norm": 0.20244662463665009, + "learning_rate": 0.0001511703510367585, + "loss": 0.938, + "step": 3743 + }, + { + "epoch": 1.3, + "grad_norm": 0.2046027034521103, + "learning_rate": 0.00015114655974579746, + "loss": 0.9821, + "step": 3744 + }, + { + "epoch": 1.3, + "grad_norm": 0.19821032881736755, + "learning_rate": 0.00015112276453353157, + "loss": 0.9499, + "step": 3745 + }, + { + "epoch": 1.3, + "grad_norm": 0.1993158459663391, + "learning_rate": 0.0001510989654017851, + "loss": 0.9264, + "step": 3746 + }, + { + "epoch": 1.3, + "grad_norm": 0.19219717383384705, + "learning_rate": 0.00015107516235238273, + "loss": 0.9408, + "step": 3747 + }, + { + "epoch": 1.3, + "grad_norm": 0.20064875483512878, + "learning_rate": 0.00015105135538714937, + "loss": 0.9396, + "step": 3748 + }, + { + "epoch": 1.3, + "grad_norm": 0.19371774792671204, + "learning_rate": 0.0001510275445079102, + "loss": 0.8951, + "step": 3749 + }, + { + "epoch": 1.3, + "grad_norm": 0.19230003654956818, + "learning_rate": 0.00015100372971649082, + "loss": 0.9125, + "step": 3750 + }, + { + "epoch": 1.3, + "grad_norm": 0.1980389803647995, + "learning_rate": 0.000150979911014717, + "loss": 0.8656, + "step": 3751 + }, + { + "epoch": 1.3, + "grad_norm": 0.19611652195453644, + "learning_rate": 0.000150956088404415, + "loss": 0.9375, + "step": 3752 + }, + { + "epoch": 1.3, + "grad_norm": 0.19706304371356964, + "learning_rate": 0.0001509322618874111, + "loss": 0.8721, + "step": 3753 + }, + { + "epoch": 1.3, + "grad_norm": 0.19378282129764557, + "learning_rate": 0.0001509084314655321, + "loss": 0.8858, + "step": 3754 + }, + { + "epoch": 1.3, + "grad_norm": 0.20083080232143402, + "learning_rate": 0.00015088459714060503, + "loss": 0.9249, + "step": 3755 + }, + { + "epoch": 1.3, + "grad_norm": 0.1986744999885559, + "learning_rate": 0.00015086075891445724, + "loss": 0.9235, + "step": 3756 + }, + { + "epoch": 1.3, + "grad_norm": 0.19933581352233887, + "learning_rate": 0.00015083691678891633, + "loss": 0.9516, + "step": 3757 + }, + { + "epoch": 1.3, + "grad_norm": 0.20001207292079926, + "learning_rate": 0.00015081307076581023, + "loss": 0.9141, + "step": 3758 + }, + { + "epoch": 1.3, + "grad_norm": 0.1966947466135025, + "learning_rate": 0.00015078922084696715, + "loss": 0.8709, + "step": 3759 + }, + { + "epoch": 1.3, + "grad_norm": 0.1952902227640152, + "learning_rate": 0.00015076536703421565, + "loss": 0.9097, + "step": 3760 + }, + { + "epoch": 1.3, + "grad_norm": 0.19831472635269165, + "learning_rate": 0.00015074150932938455, + "loss": 0.8747, + "step": 3761 + }, + { + "epoch": 1.3, + "grad_norm": 0.20838016271591187, + "learning_rate": 0.000150717647734303, + "loss": 0.8938, + "step": 3762 + }, + { + "epoch": 1.3, + "grad_norm": 0.193468376994133, + "learning_rate": 0.00015069378225080032, + "loss": 0.9141, + "step": 3763 + }, + { + "epoch": 1.31, + "grad_norm": 0.19065572321414948, + "learning_rate": 0.0001506699128807063, + "loss": 0.917, + "step": 3764 + }, + { + "epoch": 1.31, + "grad_norm": 0.19850143790245056, + "learning_rate": 0.000150646039625851, + "loss": 0.9113, + "step": 3765 + }, + { + "epoch": 1.31, + "grad_norm": 0.19818361103534698, + "learning_rate": 0.0001506221624880647, + "loss": 0.9928, + "step": 3766 + }, + { + "epoch": 1.31, + "grad_norm": 0.20216867327690125, + "learning_rate": 0.00015059828146917793, + "loss": 0.9086, + "step": 3767 + }, + { + "epoch": 1.31, + "grad_norm": 0.19267527759075165, + "learning_rate": 0.0001505743965710217, + "loss": 0.8939, + "step": 3768 + }, + { + "epoch": 1.31, + "grad_norm": 0.19998551905155182, + "learning_rate": 0.00015055050779542718, + "loss": 0.8814, + "step": 3769 + }, + { + "epoch": 1.31, + "grad_norm": 0.19110707938671112, + "learning_rate": 0.0001505266151442259, + "loss": 0.8582, + "step": 3770 + }, + { + "epoch": 1.31, + "grad_norm": 0.19865083694458008, + "learning_rate": 0.00015050271861924967, + "loss": 0.9322, + "step": 3771 + }, + { + "epoch": 1.31, + "grad_norm": 0.19565609097480774, + "learning_rate": 0.00015047881822233053, + "loss": 0.8699, + "step": 3772 + }, + { + "epoch": 1.31, + "grad_norm": 0.1937209516763687, + "learning_rate": 0.0001504549139553009, + "loss": 0.8931, + "step": 3773 + }, + { + "epoch": 1.31, + "grad_norm": 0.1972823441028595, + "learning_rate": 0.0001504310058199935, + "loss": 0.9337, + "step": 3774 + }, + { + "epoch": 1.31, + "grad_norm": 0.20414002239704132, + "learning_rate": 0.0001504070938182413, + "loss": 0.9194, + "step": 3775 + }, + { + "epoch": 1.31, + "grad_norm": 0.1957191377878189, + "learning_rate": 0.00015038317795187757, + "loss": 0.8957, + "step": 3776 + }, + { + "epoch": 1.31, + "grad_norm": 0.2019643932580948, + "learning_rate": 0.00015035925822273592, + "loss": 0.9323, + "step": 3777 + }, + { + "epoch": 1.31, + "grad_norm": 0.19081658124923706, + "learning_rate": 0.0001503353346326502, + "loss": 0.979, + "step": 3778 + }, + { + "epoch": 1.31, + "grad_norm": 0.20455485582351685, + "learning_rate": 0.0001503114071834546, + "loss": 0.9567, + "step": 3779 + }, + { + "epoch": 1.31, + "grad_norm": 0.19730593264102936, + "learning_rate": 0.00015028747587698363, + "loss": 0.8996, + "step": 3780 + }, + { + "epoch": 1.31, + "grad_norm": 0.2002158761024475, + "learning_rate": 0.000150263540715072, + "loss": 0.9626, + "step": 3781 + }, + { + "epoch": 1.31, + "grad_norm": 0.20014959573745728, + "learning_rate": 0.00015023960169955477, + "loss": 0.8964, + "step": 3782 + }, + { + "epoch": 1.31, + "grad_norm": 0.19767984747886658, + "learning_rate": 0.00015021565883226732, + "loss": 0.9118, + "step": 3783 + }, + { + "epoch": 1.31, + "grad_norm": 0.20115704834461212, + "learning_rate": 0.0001501917121150453, + "loss": 0.9544, + "step": 3784 + }, + { + "epoch": 1.31, + "grad_norm": 0.20704735815525055, + "learning_rate": 0.00015016776154972466, + "loss": 0.9204, + "step": 3785 + }, + { + "epoch": 1.31, + "grad_norm": 0.19559411704540253, + "learning_rate": 0.00015014380713814158, + "loss": 0.9321, + "step": 3786 + }, + { + "epoch": 1.31, + "grad_norm": 0.198293074965477, + "learning_rate": 0.00015011984888213274, + "loss": 0.8662, + "step": 3787 + }, + { + "epoch": 1.31, + "grad_norm": 0.20051126182079315, + "learning_rate": 0.00015009588678353483, + "loss": 0.8989, + "step": 3788 + }, + { + "epoch": 1.31, + "grad_norm": 0.20624862611293793, + "learning_rate": 0.00015007192084418502, + "loss": 0.9028, + "step": 3789 + }, + { + "epoch": 1.31, + "grad_norm": 0.20225314795970917, + "learning_rate": 0.00015004795106592078, + "loss": 0.9514, + "step": 3790 + }, + { + "epoch": 1.31, + "grad_norm": 0.2037535309791565, + "learning_rate": 0.00015002397745057975, + "loss": 0.9128, + "step": 3791 + }, + { + "epoch": 1.32, + "grad_norm": 0.20016394555568695, + "learning_rate": 0.00015000000000000001, + "loss": 0.9348, + "step": 3792 + }, + { + "epoch": 1.32, + "grad_norm": 0.19721950590610504, + "learning_rate": 0.00014997601871601982, + "loss": 0.9399, + "step": 3793 + }, + { + "epoch": 1.32, + "grad_norm": 0.20106512308120728, + "learning_rate": 0.00014995203360047775, + "loss": 0.9711, + "step": 3794 + }, + { + "epoch": 1.32, + "grad_norm": 0.1967417299747467, + "learning_rate": 0.00014992804465521276, + "loss": 0.95, + "step": 3795 + }, + { + "epoch": 1.32, + "grad_norm": 0.19685690104961395, + "learning_rate": 0.00014990405188206401, + "loss": 0.9278, + "step": 3796 + }, + { + "epoch": 1.32, + "grad_norm": 0.20026066899299622, + "learning_rate": 0.00014988005528287097, + "loss": 0.8825, + "step": 3797 + }, + { + "epoch": 1.32, + "grad_norm": 0.1951385736465454, + "learning_rate": 0.0001498560548594734, + "loss": 0.8832, + "step": 3798 + }, + { + "epoch": 1.32, + "grad_norm": 0.19592271745204926, + "learning_rate": 0.00014983205061371138, + "loss": 0.8754, + "step": 3799 + }, + { + "epoch": 1.32, + "grad_norm": 0.18903060257434845, + "learning_rate": 0.00014980804254742525, + "loss": 0.9081, + "step": 3800 + }, + { + "epoch": 1.32, + "grad_norm": 0.19677716493606567, + "learning_rate": 0.00014978403066245568, + "loss": 0.9202, + "step": 3801 + }, + { + "epoch": 1.32, + "grad_norm": 0.19893287122249603, + "learning_rate": 0.0001497600149606436, + "loss": 0.9271, + "step": 3802 + }, + { + "epoch": 1.32, + "grad_norm": 0.20432735979557037, + "learning_rate": 0.00014973599544383027, + "loss": 0.9466, + "step": 3803 + }, + { + "epoch": 1.32, + "grad_norm": 0.19333679974079132, + "learning_rate": 0.0001497119721138572, + "loss": 0.9862, + "step": 3804 + }, + { + "epoch": 1.32, + "grad_norm": 0.20335853099822998, + "learning_rate": 0.00014968794497256623, + "loss": 0.9579, + "step": 3805 + }, + { + "epoch": 1.32, + "grad_norm": 0.19586090743541718, + "learning_rate": 0.00014966391402179943, + "loss": 0.8735, + "step": 3806 + }, + { + "epoch": 1.32, + "grad_norm": 0.19172383844852448, + "learning_rate": 0.00014963987926339925, + "loss": 0.9268, + "step": 3807 + }, + { + "epoch": 1.32, + "grad_norm": 0.20108972489833832, + "learning_rate": 0.00014961584069920835, + "loss": 0.894, + "step": 3808 + }, + { + "epoch": 1.32, + "grad_norm": 0.19839361310005188, + "learning_rate": 0.00014959179833106973, + "loss": 0.902, + "step": 3809 + }, + { + "epoch": 1.32, + "grad_norm": 0.18988490104675293, + "learning_rate": 0.0001495677521608267, + "loss": 0.919, + "step": 3810 + }, + { + "epoch": 1.32, + "grad_norm": 0.201633483171463, + "learning_rate": 0.00014954370219032282, + "loss": 0.9612, + "step": 3811 + }, + { + "epoch": 1.32, + "grad_norm": 0.19168926775455475, + "learning_rate": 0.00014951964842140192, + "loss": 0.8556, + "step": 3812 + }, + { + "epoch": 1.32, + "grad_norm": 0.19750337302684784, + "learning_rate": 0.00014949559085590818, + "loss": 0.9275, + "step": 3813 + }, + { + "epoch": 1.32, + "grad_norm": 0.20000135898590088, + "learning_rate": 0.00014947152949568605, + "loss": 0.942, + "step": 3814 + }, + { + "epoch": 1.32, + "grad_norm": 0.19141016900539398, + "learning_rate": 0.00014944746434258026, + "loss": 0.9408, + "step": 3815 + }, + { + "epoch": 1.32, + "grad_norm": 0.19368094205856323, + "learning_rate": 0.00014942339539843584, + "loss": 0.9202, + "step": 3816 + }, + { + "epoch": 1.32, + "grad_norm": 0.19549012184143066, + "learning_rate": 0.00014939932266509807, + "loss": 0.9736, + "step": 3817 + }, + { + "epoch": 1.32, + "grad_norm": 0.19119693338871002, + "learning_rate": 0.00014937524614441264, + "loss": 0.8828, + "step": 3818 + }, + { + "epoch": 1.32, + "grad_norm": 0.1875646561384201, + "learning_rate": 0.00014935116583822538, + "loss": 0.8702, + "step": 3819 + }, + { + "epoch": 1.32, + "grad_norm": 0.1974024921655655, + "learning_rate": 0.00014932708174838252, + "loss": 0.9605, + "step": 3820 + }, + { + "epoch": 1.33, + "grad_norm": 0.2048822045326233, + "learning_rate": 0.00014930299387673054, + "loss": 0.9164, + "step": 3821 + }, + { + "epoch": 1.33, + "grad_norm": 0.18495222926139832, + "learning_rate": 0.00014927890222511615, + "loss": 0.8123, + "step": 3822 + }, + { + "epoch": 1.33, + "grad_norm": 0.19920560717582703, + "learning_rate": 0.00014925480679538647, + "loss": 0.9169, + "step": 3823 + }, + { + "epoch": 1.33, + "grad_norm": 0.1953980028629303, + "learning_rate": 0.0001492307075893888, + "loss": 0.8839, + "step": 3824 + }, + { + "epoch": 1.33, + "grad_norm": 0.20536701381206512, + "learning_rate": 0.00014920660460897083, + "loss": 0.9569, + "step": 3825 + }, + { + "epoch": 1.33, + "grad_norm": 0.19224631786346436, + "learning_rate": 0.00014918249785598046, + "loss": 0.8981, + "step": 3826 + }, + { + "epoch": 1.33, + "grad_norm": 0.2037082314491272, + "learning_rate": 0.00014915838733226594, + "loss": 1.0014, + "step": 3827 + }, + { + "epoch": 1.33, + "grad_norm": 0.19932124018669128, + "learning_rate": 0.00014913427303967568, + "loss": 0.9632, + "step": 3828 + }, + { + "epoch": 1.33, + "grad_norm": 0.20192798972129822, + "learning_rate": 0.00014911015498005859, + "loss": 0.9635, + "step": 3829 + }, + { + "epoch": 1.33, + "grad_norm": 0.19329585134983063, + "learning_rate": 0.00014908603315526368, + "loss": 0.8993, + "step": 3830 + }, + { + "epoch": 1.33, + "grad_norm": 0.19806140661239624, + "learning_rate": 0.0001490619075671404, + "loss": 0.9436, + "step": 3831 + }, + { + "epoch": 1.33, + "grad_norm": 0.20333150029182434, + "learning_rate": 0.0001490377782175383, + "loss": 0.9349, + "step": 3832 + }, + { + "epoch": 1.33, + "grad_norm": 0.194121852517128, + "learning_rate": 0.0001490136451083074, + "loss": 0.9253, + "step": 3833 + }, + { + "epoch": 1.33, + "grad_norm": 0.19719244539737701, + "learning_rate": 0.0001489895082412979, + "loss": 0.992, + "step": 3834 + }, + { + "epoch": 1.33, + "grad_norm": 0.1979801207780838, + "learning_rate": 0.0001489653676183604, + "loss": 0.9117, + "step": 3835 + }, + { + "epoch": 1.33, + "grad_norm": 0.1975490152835846, + "learning_rate": 0.00014894122324134564, + "loss": 0.8944, + "step": 3836 + }, + { + "epoch": 1.33, + "grad_norm": 0.2017369270324707, + "learning_rate": 0.00014891707511210473, + "loss": 0.9631, + "step": 3837 + }, + { + "epoch": 1.33, + "grad_norm": 0.19351305067539215, + "learning_rate": 0.0001488929232324891, + "loss": 0.9337, + "step": 3838 + }, + { + "epoch": 1.33, + "grad_norm": 0.18836168944835663, + "learning_rate": 0.0001488687676043504, + "loss": 0.862, + "step": 3839 + }, + { + "epoch": 1.33, + "grad_norm": 0.19105665385723114, + "learning_rate": 0.00014884460822954057, + "loss": 0.9026, + "step": 3840 + }, + { + "epoch": 1.33, + "grad_norm": 0.19071929156780243, + "learning_rate": 0.0001488204451099119, + "loss": 0.9249, + "step": 3841 + }, + { + "epoch": 1.33, + "grad_norm": 0.2096261978149414, + "learning_rate": 0.00014879627824731693, + "loss": 0.9825, + "step": 3842 + }, + { + "epoch": 1.33, + "grad_norm": 0.19523192942142487, + "learning_rate": 0.00014877210764360847, + "loss": 0.8733, + "step": 3843 + }, + { + "epoch": 1.33, + "grad_norm": 0.19203850626945496, + "learning_rate": 0.0001487479333006396, + "loss": 0.8879, + "step": 3844 + }, + { + "epoch": 1.33, + "grad_norm": 0.19315475225448608, + "learning_rate": 0.00014872375522026376, + "loss": 0.9211, + "step": 3845 + }, + { + "epoch": 1.33, + "grad_norm": 0.1942511945962906, + "learning_rate": 0.00014869957340433461, + "loss": 0.9567, + "step": 3846 + }, + { + "epoch": 1.33, + "grad_norm": 0.199487566947937, + "learning_rate": 0.00014867538785470617, + "loss": 0.8816, + "step": 3847 + }, + { + "epoch": 1.33, + "grad_norm": 0.20415233075618744, + "learning_rate": 0.00014865119857323267, + "loss": 0.8951, + "step": 3848 + }, + { + "epoch": 1.34, + "grad_norm": 0.19348014891147614, + "learning_rate": 0.0001486270055617686, + "loss": 0.9019, + "step": 3849 + }, + { + "epoch": 1.34, + "grad_norm": 0.19001100957393646, + "learning_rate": 0.0001486028088221689, + "loss": 0.9418, + "step": 3850 + }, + { + "epoch": 1.34, + "grad_norm": 0.18715202808380127, + "learning_rate": 0.00014857860835628856, + "loss": 0.8782, + "step": 3851 + }, + { + "epoch": 1.34, + "grad_norm": 0.19490717351436615, + "learning_rate": 0.00014855440416598309, + "loss": 0.9294, + "step": 3852 + }, + { + "epoch": 1.34, + "grad_norm": 0.21514825522899628, + "learning_rate": 0.00014853019625310813, + "loss": 0.9061, + "step": 3853 + }, + { + "epoch": 1.34, + "grad_norm": 0.20610563457012177, + "learning_rate": 0.00014850598461951963, + "loss": 0.8979, + "step": 3854 + }, + { + "epoch": 1.34, + "grad_norm": 0.1885242909193039, + "learning_rate": 0.0001484817692670739, + "loss": 0.915, + "step": 3855 + }, + { + "epoch": 1.34, + "grad_norm": 0.20315773785114288, + "learning_rate": 0.00014845755019762743, + "loss": 0.9954, + "step": 3856 + }, + { + "epoch": 1.34, + "grad_norm": 0.2000637799501419, + "learning_rate": 0.0001484333274130371, + "loss": 0.8913, + "step": 3857 + }, + { + "epoch": 1.34, + "grad_norm": 0.20190002024173737, + "learning_rate": 0.00014840910091515997, + "loss": 0.8499, + "step": 3858 + }, + { + "epoch": 1.34, + "grad_norm": 0.20083366334438324, + "learning_rate": 0.0001483848707058535, + "loss": 0.8497, + "step": 3859 + }, + { + "epoch": 1.34, + "grad_norm": 0.20369665324687958, + "learning_rate": 0.0001483606367869753, + "loss": 0.9244, + "step": 3860 + }, + { + "epoch": 1.34, + "grad_norm": 0.19197633862495422, + "learning_rate": 0.0001483363991603834, + "loss": 0.8958, + "step": 3861 + }, + { + "epoch": 1.34, + "grad_norm": 0.19108688831329346, + "learning_rate": 0.000148312157827936, + "loss": 0.8965, + "step": 3862 + }, + { + "epoch": 1.34, + "grad_norm": 0.19200675189495087, + "learning_rate": 0.00014828791279149168, + "loss": 0.9681, + "step": 3863 + }, + { + "epoch": 1.34, + "grad_norm": 0.18933221697807312, + "learning_rate": 0.00014826366405290922, + "loss": 0.916, + "step": 3864 + }, + { + "epoch": 1.34, + "grad_norm": 0.19670377671718597, + "learning_rate": 0.00014823941161404774, + "loss": 0.943, + "step": 3865 + }, + { + "epoch": 1.34, + "grad_norm": 0.18979068100452423, + "learning_rate": 0.00014821515547676662, + "loss": 0.8877, + "step": 3866 + }, + { + "epoch": 1.34, + "grad_norm": 0.18644832074642181, + "learning_rate": 0.00014819089564292552, + "loss": 0.8796, + "step": 3867 + }, + { + "epoch": 1.34, + "grad_norm": 0.19427362084388733, + "learning_rate": 0.00014816663211438447, + "loss": 0.9048, + "step": 3868 + }, + { + "epoch": 1.34, + "grad_norm": 0.20413526892662048, + "learning_rate": 0.0001481423648930036, + "loss": 0.9031, + "step": 3869 + }, + { + "epoch": 1.34, + "grad_norm": 0.20067287981510162, + "learning_rate": 0.00014811809398064348, + "loss": 0.9111, + "step": 3870 + }, + { + "epoch": 1.34, + "grad_norm": 0.1960747241973877, + "learning_rate": 0.00014809381937916488, + "loss": 0.8996, + "step": 3871 + }, + { + "epoch": 1.34, + "grad_norm": 0.1922447681427002, + "learning_rate": 0.0001480695410904289, + "loss": 0.9269, + "step": 3872 + }, + { + "epoch": 1.34, + "grad_norm": 0.19527967274188995, + "learning_rate": 0.000148045259116297, + "loss": 0.9108, + "step": 3873 + }, + { + "epoch": 1.34, + "grad_norm": 0.18520699441432953, + "learning_rate": 0.0001480209734586307, + "loss": 0.8605, + "step": 3874 + }, + { + "epoch": 1.34, + "grad_norm": 0.21494515240192413, + "learning_rate": 0.000147996684119292, + "loss": 0.9773, + "step": 3875 + }, + { + "epoch": 1.34, + "grad_norm": 0.1982784867286682, + "learning_rate": 0.00014797239110014306, + "loss": 0.9689, + "step": 3876 + }, + { + "epoch": 1.35, + "grad_norm": 0.1936819702386856, + "learning_rate": 0.00014794809440304646, + "loss": 0.9486, + "step": 3877 + }, + { + "epoch": 1.35, + "grad_norm": 0.20160476863384247, + "learning_rate": 0.00014792379402986492, + "loss": 0.9043, + "step": 3878 + }, + { + "epoch": 1.35, + "grad_norm": 0.20444336533546448, + "learning_rate": 0.0001478994899824615, + "loss": 0.9223, + "step": 3879 + }, + { + "epoch": 1.35, + "grad_norm": 0.19447480142116547, + "learning_rate": 0.0001478751822626996, + "loss": 0.9149, + "step": 3880 + }, + { + "epoch": 1.35, + "grad_norm": 0.19876286387443542, + "learning_rate": 0.00014785087087244275, + "loss": 0.9407, + "step": 3881 + }, + { + "epoch": 1.35, + "grad_norm": 0.19465559720993042, + "learning_rate": 0.00014782655581355495, + "loss": 0.9301, + "step": 3882 + }, + { + "epoch": 1.35, + "grad_norm": 0.19302017986774445, + "learning_rate": 0.0001478022370879003, + "loss": 0.8842, + "step": 3883 + }, + { + "epoch": 1.35, + "grad_norm": 0.1959199607372284, + "learning_rate": 0.00014777791469734337, + "loss": 0.8692, + "step": 3884 + }, + { + "epoch": 1.35, + "grad_norm": 0.18946966528892517, + "learning_rate": 0.00014775358864374885, + "loss": 0.8763, + "step": 3885 + }, + { + "epoch": 1.35, + "grad_norm": 0.1975627839565277, + "learning_rate": 0.00014772925892898175, + "loss": 0.9775, + "step": 3886 + }, + { + "epoch": 1.35, + "grad_norm": 0.2010650932788849, + "learning_rate": 0.0001477049255549074, + "loss": 0.9001, + "step": 3887 + }, + { + "epoch": 1.35, + "grad_norm": 0.1925669014453888, + "learning_rate": 0.00014768058852339142, + "loss": 0.8867, + "step": 3888 + }, + { + "epoch": 1.35, + "grad_norm": 0.19858621060848236, + "learning_rate": 0.00014765624783629963, + "loss": 0.9029, + "step": 3889 + }, + { + "epoch": 1.35, + "grad_norm": 0.2079782485961914, + "learning_rate": 0.00014763190349549823, + "loss": 0.9251, + "step": 3890 + }, + { + "epoch": 1.35, + "grad_norm": 0.1923612803220749, + "learning_rate": 0.00014760755550285366, + "loss": 0.8288, + "step": 3891 + }, + { + "epoch": 1.35, + "grad_norm": 0.19295886158943176, + "learning_rate": 0.00014758320386023255, + "loss": 0.8466, + "step": 3892 + }, + { + "epoch": 1.35, + "grad_norm": 0.19733382761478424, + "learning_rate": 0.00014755884856950196, + "loss": 0.8735, + "step": 3893 + }, + { + "epoch": 1.35, + "grad_norm": 0.19566164910793304, + "learning_rate": 0.00014753448963252918, + "loss": 0.8891, + "step": 3894 + }, + { + "epoch": 1.35, + "grad_norm": 0.1920028030872345, + "learning_rate": 0.00014751012705118172, + "loss": 0.9059, + "step": 3895 + }, + { + "epoch": 1.35, + "grad_norm": 0.2053966224193573, + "learning_rate": 0.00014748576082732742, + "loss": 0.8818, + "step": 3896 + }, + { + "epoch": 1.35, + "grad_norm": 0.20320646464824677, + "learning_rate": 0.00014746139096283436, + "loss": 0.8939, + "step": 3897 + }, + { + "epoch": 1.35, + "grad_norm": 0.20649878680706024, + "learning_rate": 0.000147437017459571, + "loss": 0.9206, + "step": 3898 + }, + { + "epoch": 1.35, + "grad_norm": 0.19128867983818054, + "learning_rate": 0.00014741264031940593, + "loss": 0.8968, + "step": 3899 + }, + { + "epoch": 1.35, + "grad_norm": 0.18966726958751678, + "learning_rate": 0.00014738825954420815, + "loss": 0.8833, + "step": 3900 + }, + { + "epoch": 1.35, + "grad_norm": 0.1958254873752594, + "learning_rate": 0.00014736387513584686, + "loss": 0.9248, + "step": 3901 + }, + { + "epoch": 1.35, + "grad_norm": 0.19757814705371857, + "learning_rate": 0.00014733948709619158, + "loss": 0.9552, + "step": 3902 + }, + { + "epoch": 1.35, + "grad_norm": 0.20112890005111694, + "learning_rate": 0.0001473150954271121, + "loss": 0.9392, + "step": 3903 + }, + { + "epoch": 1.35, + "grad_norm": 0.2000960111618042, + "learning_rate": 0.00014729070013047845, + "loss": 0.9083, + "step": 3904 + }, + { + "epoch": 1.35, + "grad_norm": 0.1944068968296051, + "learning_rate": 0.000147266301208161, + "loss": 0.9276, + "step": 3905 + }, + { + "epoch": 1.36, + "grad_norm": 0.2013338953256607, + "learning_rate": 0.00014724189866203037, + "loss": 0.9418, + "step": 3906 + }, + { + "epoch": 1.36, + "grad_norm": 0.2008303552865982, + "learning_rate": 0.00014721749249395737, + "loss": 0.9855, + "step": 3907 + }, + { + "epoch": 1.36, + "grad_norm": 0.20033757388591766, + "learning_rate": 0.00014719308270581328, + "loss": 0.9395, + "step": 3908 + }, + { + "epoch": 1.36, + "grad_norm": 0.21251152455806732, + "learning_rate": 0.0001471686692994695, + "loss": 0.9209, + "step": 3909 + }, + { + "epoch": 1.36, + "grad_norm": 0.19628505408763885, + "learning_rate": 0.00014714425227679777, + "loss": 0.9154, + "step": 3910 + }, + { + "epoch": 1.36, + "grad_norm": 0.1894054412841797, + "learning_rate": 0.0001471198316396701, + "loss": 0.9032, + "step": 3911 + }, + { + "epoch": 1.36, + "grad_norm": 0.19757375121116638, + "learning_rate": 0.00014709540738995876, + "loss": 0.8706, + "step": 3912 + }, + { + "epoch": 1.36, + "grad_norm": 0.20138540863990784, + "learning_rate": 0.0001470709795295363, + "loss": 0.9553, + "step": 3913 + }, + { + "epoch": 1.36, + "grad_norm": 0.19919735193252563, + "learning_rate": 0.0001470465480602756, + "loss": 0.9502, + "step": 3914 + }, + { + "epoch": 1.36, + "grad_norm": 0.19295333325862885, + "learning_rate": 0.00014702211298404968, + "loss": 0.9364, + "step": 3915 + }, + { + "epoch": 1.36, + "grad_norm": 0.1922289878129959, + "learning_rate": 0.000146997674302732, + "loss": 0.9687, + "step": 3916 + }, + { + "epoch": 1.36, + "grad_norm": 0.20261609554290771, + "learning_rate": 0.00014697323201819624, + "loss": 0.9599, + "step": 3917 + }, + { + "epoch": 1.36, + "grad_norm": 0.1992306411266327, + "learning_rate": 0.0001469487861323163, + "loss": 0.8967, + "step": 3918 + }, + { + "epoch": 1.36, + "grad_norm": 0.2102137804031372, + "learning_rate": 0.0001469243366469664, + "loss": 1.0319, + "step": 3919 + }, + { + "epoch": 1.36, + "grad_norm": 0.1930590718984604, + "learning_rate": 0.00014689988356402108, + "loss": 0.9564, + "step": 3920 + }, + { + "epoch": 1.36, + "grad_norm": 0.19371606409549713, + "learning_rate": 0.00014687542688535506, + "loss": 0.9345, + "step": 3921 + }, + { + "epoch": 1.36, + "grad_norm": 0.20073796808719635, + "learning_rate": 0.00014685096661284336, + "loss": 0.8684, + "step": 3922 + }, + { + "epoch": 1.36, + "grad_norm": 0.21918940544128418, + "learning_rate": 0.00014682650274836138, + "loss": 0.9269, + "step": 3923 + }, + { + "epoch": 1.36, + "grad_norm": 0.21027129888534546, + "learning_rate": 0.00014680203529378468, + "loss": 0.9555, + "step": 3924 + }, + { + "epoch": 1.36, + "grad_norm": 0.19333447515964508, + "learning_rate": 0.00014677756425098907, + "loss": 0.9541, + "step": 3925 + }, + { + "epoch": 1.36, + "grad_norm": 0.20419958233833313, + "learning_rate": 0.0001467530896218508, + "loss": 0.9048, + "step": 3926 + }, + { + "epoch": 1.36, + "grad_norm": 0.19317831099033356, + "learning_rate": 0.00014672861140824624, + "loss": 0.9029, + "step": 3927 + }, + { + "epoch": 1.36, + "grad_norm": 0.20992499589920044, + "learning_rate": 0.00014670412961205207, + "loss": 0.9877, + "step": 3928 + }, + { + "epoch": 1.36, + "grad_norm": 0.20966270565986633, + "learning_rate": 0.00014667964423514528, + "loss": 0.9048, + "step": 3929 + }, + { + "epoch": 1.36, + "grad_norm": 0.20526836812496185, + "learning_rate": 0.00014665515527940312, + "loss": 0.8693, + "step": 3930 + }, + { + "epoch": 1.36, + "grad_norm": 0.20329995453357697, + "learning_rate": 0.0001466306627467031, + "loss": 1.0054, + "step": 3931 + }, + { + "epoch": 1.36, + "grad_norm": 0.20244084298610687, + "learning_rate": 0.000146606166638923, + "loss": 0.9703, + "step": 3932 + }, + { + "epoch": 1.36, + "grad_norm": 0.192376971244812, + "learning_rate": 0.00014658166695794095, + "loss": 0.8323, + "step": 3933 + }, + { + "epoch": 1.37, + "grad_norm": 0.18748922646045685, + "learning_rate": 0.00014655716370563522, + "loss": 0.8728, + "step": 3934 + }, + { + "epoch": 1.37, + "grad_norm": 0.2070818841457367, + "learning_rate": 0.00014653265688388447, + "loss": 0.9134, + "step": 3935 + }, + { + "epoch": 1.37, + "grad_norm": 0.20446988940238953, + "learning_rate": 0.00014650814649456754, + "loss": 0.9876, + "step": 3936 + }, + { + "epoch": 1.37, + "grad_norm": 0.19711360335350037, + "learning_rate": 0.00014648363253956363, + "loss": 0.916, + "step": 3937 + }, + { + "epoch": 1.37, + "grad_norm": 0.19562388956546783, + "learning_rate": 0.0001464591150207522, + "loss": 0.8866, + "step": 3938 + }, + { + "epoch": 1.37, + "grad_norm": 0.1958800107240677, + "learning_rate": 0.00014643459394001292, + "loss": 0.8752, + "step": 3939 + }, + { + "epoch": 1.37, + "grad_norm": 0.2000405341386795, + "learning_rate": 0.00014641006929922576, + "loss": 0.9439, + "step": 3940 + }, + { + "epoch": 1.37, + "grad_norm": 0.2032128870487213, + "learning_rate": 0.000146385541100271, + "loss": 0.9456, + "step": 3941 + }, + { + "epoch": 1.37, + "grad_norm": 0.20214006304740906, + "learning_rate": 0.00014636100934502916, + "loss": 0.8781, + "step": 3942 + }, + { + "epoch": 1.37, + "grad_norm": 0.21211445331573486, + "learning_rate": 0.00014633647403538107, + "loss": 0.9876, + "step": 3943 + }, + { + "epoch": 1.37, + "grad_norm": 0.20307664573192596, + "learning_rate": 0.00014631193517320777, + "loss": 0.9049, + "step": 3944 + }, + { + "epoch": 1.37, + "grad_norm": 0.1985590010881424, + "learning_rate": 0.00014628739276039063, + "loss": 0.8968, + "step": 3945 + }, + { + "epoch": 1.37, + "grad_norm": 0.20269496738910675, + "learning_rate": 0.00014626284679881125, + "loss": 0.9093, + "step": 3946 + }, + { + "epoch": 1.37, + "grad_norm": 0.19618482887744904, + "learning_rate": 0.0001462382972903515, + "loss": 0.9047, + "step": 3947 + }, + { + "epoch": 1.37, + "grad_norm": 0.18889334797859192, + "learning_rate": 0.00014621374423689357, + "loss": 0.8841, + "step": 3948 + }, + { + "epoch": 1.37, + "grad_norm": 0.19615551829338074, + "learning_rate": 0.00014618918764031991, + "loss": 0.8833, + "step": 3949 + }, + { + "epoch": 1.37, + "grad_norm": 0.20412616431713104, + "learning_rate": 0.0001461646275025132, + "loss": 0.9088, + "step": 3950 + }, + { + "epoch": 1.37, + "grad_norm": 0.19538843631744385, + "learning_rate": 0.0001461400638253564, + "loss": 0.9257, + "step": 3951 + }, + { + "epoch": 1.37, + "grad_norm": 0.1954936534166336, + "learning_rate": 0.0001461154966107328, + "loss": 0.874, + "step": 3952 + }, + { + "epoch": 1.37, + "grad_norm": 0.20418086647987366, + "learning_rate": 0.0001460909258605259, + "loss": 0.9677, + "step": 3953 + }, + { + "epoch": 1.37, + "grad_norm": 0.1930086761713028, + "learning_rate": 0.00014606635157661947, + "loss": 0.9172, + "step": 3954 + }, + { + "epoch": 1.37, + "grad_norm": 0.18511292338371277, + "learning_rate": 0.00014604177376089763, + "loss": 0.8675, + "step": 3955 + }, + { + "epoch": 1.37, + "grad_norm": 0.19437043368816376, + "learning_rate": 0.00014601719241524462, + "loss": 0.881, + "step": 3956 + }, + { + "epoch": 1.37, + "grad_norm": 0.19371411204338074, + "learning_rate": 0.0001459926075415451, + "loss": 0.915, + "step": 3957 + }, + { + "epoch": 1.37, + "grad_norm": 0.19958390295505524, + "learning_rate": 0.000145968019141684, + "loss": 0.9678, + "step": 3958 + }, + { + "epoch": 1.37, + "grad_norm": 0.20946894586086273, + "learning_rate": 0.00014594342721754635, + "loss": 0.9311, + "step": 3959 + }, + { + "epoch": 1.37, + "grad_norm": 0.19142213463783264, + "learning_rate": 0.00014591883177101763, + "loss": 0.8736, + "step": 3960 + }, + { + "epoch": 1.37, + "grad_norm": 0.1921462118625641, + "learning_rate": 0.00014589423280398354, + "loss": 0.8867, + "step": 3961 + }, + { + "epoch": 1.37, + "grad_norm": 0.19228462874889374, + "learning_rate": 0.00014586963031832994, + "loss": 0.9924, + "step": 3962 + }, + { + "epoch": 1.38, + "grad_norm": 0.19412390887737274, + "learning_rate": 0.00014584502431594316, + "loss": 0.8861, + "step": 3963 + }, + { + "epoch": 1.38, + "grad_norm": 0.19502438604831696, + "learning_rate": 0.00014582041479870966, + "loss": 0.8712, + "step": 3964 + }, + { + "epoch": 1.38, + "grad_norm": 0.20914730429649353, + "learning_rate": 0.00014579580176851616, + "loss": 1.0042, + "step": 3965 + }, + { + "epoch": 1.38, + "grad_norm": 0.2027062028646469, + "learning_rate": 0.00014577118522724977, + "loss": 0.9674, + "step": 3966 + }, + { + "epoch": 1.38, + "grad_norm": 0.19774848222732544, + "learning_rate": 0.0001457465651767977, + "loss": 0.9081, + "step": 3967 + }, + { + "epoch": 1.38, + "grad_norm": 0.19559279084205627, + "learning_rate": 0.0001457219416190476, + "loss": 0.8935, + "step": 3968 + }, + { + "epoch": 1.38, + "grad_norm": 0.21025317907333374, + "learning_rate": 0.00014569731455588728, + "loss": 0.9731, + "step": 3969 + }, + { + "epoch": 1.38, + "grad_norm": 0.1929987221956253, + "learning_rate": 0.0001456726839892048, + "loss": 0.8873, + "step": 3970 + }, + { + "epoch": 1.38, + "grad_norm": 0.18292102217674255, + "learning_rate": 0.00014564804992088864, + "loss": 0.8333, + "step": 3971 + }, + { + "epoch": 1.38, + "grad_norm": 0.20394857227802277, + "learning_rate": 0.00014562341235282734, + "loss": 1.0219, + "step": 3972 + }, + { + "epoch": 1.38, + "grad_norm": 0.20659831166267395, + "learning_rate": 0.0001455987712869099, + "loss": 1.0135, + "step": 3973 + }, + { + "epoch": 1.38, + "grad_norm": 0.1952691227197647, + "learning_rate": 0.00014557412672502542, + "loss": 0.8934, + "step": 3974 + }, + { + "epoch": 1.38, + "grad_norm": 0.18991760909557343, + "learning_rate": 0.0001455494786690634, + "loss": 0.8838, + "step": 3975 + }, + { + "epoch": 1.38, + "grad_norm": 0.19869369268417358, + "learning_rate": 0.00014552482712091357, + "loss": 0.9144, + "step": 3976 + }, + { + "epoch": 1.38, + "grad_norm": 0.20484110713005066, + "learning_rate": 0.00014550017208246586, + "loss": 0.9326, + "step": 3977 + }, + { + "epoch": 1.38, + "grad_norm": 0.19844062626361847, + "learning_rate": 0.0001454755135556106, + "loss": 0.9201, + "step": 3978 + }, + { + "epoch": 1.38, + "grad_norm": 0.1981542855501175, + "learning_rate": 0.00014545085154223824, + "loss": 0.9259, + "step": 3979 + }, + { + "epoch": 1.38, + "grad_norm": 0.19899936020374298, + "learning_rate": 0.00014542618604423957, + "loss": 0.8957, + "step": 3980 + }, + { + "epoch": 1.38, + "grad_norm": 0.197379469871521, + "learning_rate": 0.0001454015170635057, + "loss": 0.9057, + "step": 3981 + }, + { + "epoch": 1.38, + "grad_norm": 0.20835913717746735, + "learning_rate": 0.0001453768446019279, + "loss": 0.933, + "step": 3982 + }, + { + "epoch": 1.38, + "grad_norm": 0.21079415082931519, + "learning_rate": 0.0001453521686613978, + "loss": 0.9159, + "step": 3983 + }, + { + "epoch": 1.38, + "grad_norm": 0.1958971619606018, + "learning_rate": 0.00014532748924380722, + "loss": 0.9019, + "step": 3984 + }, + { + "epoch": 1.38, + "grad_norm": 0.19664226472377777, + "learning_rate": 0.00014530280635104828, + "loss": 0.9261, + "step": 3985 + }, + { + "epoch": 1.38, + "grad_norm": 0.1906144917011261, + "learning_rate": 0.00014527811998501342, + "loss": 0.8938, + "step": 3986 + }, + { + "epoch": 1.38, + "grad_norm": 0.20075096189975739, + "learning_rate": 0.00014525343014759522, + "loss": 0.9138, + "step": 3987 + }, + { + "epoch": 1.38, + "grad_norm": 0.20587298274040222, + "learning_rate": 0.00014522873684068665, + "loss": 0.9163, + "step": 3988 + }, + { + "epoch": 1.38, + "grad_norm": 0.19886673986911774, + "learning_rate": 0.00014520404006618088, + "loss": 0.9025, + "step": 3989 + }, + { + "epoch": 1.38, + "grad_norm": 0.19743849337100983, + "learning_rate": 0.00014517933982597137, + "loss": 0.9412, + "step": 3990 + }, + { + "epoch": 1.39, + "grad_norm": 0.1906091868877411, + "learning_rate": 0.0001451546361219519, + "loss": 0.8986, + "step": 3991 + }, + { + "epoch": 1.39, + "grad_norm": 0.1904173046350479, + "learning_rate": 0.0001451299289560163, + "loss": 0.8772, + "step": 3992 + }, + { + "epoch": 1.39, + "grad_norm": 0.19968535006046295, + "learning_rate": 0.00014510521833005892, + "loss": 0.9069, + "step": 3993 + }, + { + "epoch": 1.39, + "grad_norm": 0.20211651921272278, + "learning_rate": 0.0001450805042459743, + "loss": 0.9065, + "step": 3994 + }, + { + "epoch": 1.39, + "grad_norm": 0.19885098934173584, + "learning_rate": 0.00014505578670565713, + "loss": 0.9555, + "step": 3995 + }, + { + "epoch": 1.39, + "grad_norm": 0.19500750303268433, + "learning_rate": 0.00014503106571100259, + "loss": 0.8783, + "step": 3996 + }, + { + "epoch": 1.39, + "grad_norm": 0.2058175504207611, + "learning_rate": 0.00014500634126390585, + "loss": 0.9622, + "step": 3997 + }, + { + "epoch": 1.39, + "grad_norm": 0.19853660464286804, + "learning_rate": 0.00014498161336626254, + "loss": 0.9553, + "step": 3998 + }, + { + "epoch": 1.39, + "grad_norm": 0.19556358456611633, + "learning_rate": 0.00014495688201996852, + "loss": 0.93, + "step": 3999 + }, + { + "epoch": 1.39, + "grad_norm": 0.21239247918128967, + "learning_rate": 0.00014493214722691982, + "loss": 0.9389, + "step": 4000 + }, + { + "epoch": 1.39, + "grad_norm": 0.20066651701927185, + "learning_rate": 0.0001449074089890129, + "loss": 0.9596, + "step": 4001 + }, + { + "epoch": 1.39, + "grad_norm": 0.1968492865562439, + "learning_rate": 0.00014488266730814438, + "loss": 0.941, + "step": 4002 + }, + { + "epoch": 1.39, + "grad_norm": 0.18987908959388733, + "learning_rate": 0.0001448579221862111, + "loss": 0.9014, + "step": 4003 + }, + { + "epoch": 1.39, + "grad_norm": 0.18961165845394135, + "learning_rate": 0.00014483317362511027, + "loss": 0.8637, + "step": 4004 + }, + { + "epoch": 1.39, + "grad_norm": 0.1986960470676422, + "learning_rate": 0.00014480842162673923, + "loss": 0.9423, + "step": 4005 + }, + { + "epoch": 1.39, + "grad_norm": 0.2111463099718094, + "learning_rate": 0.00014478366619299576, + "loss": 0.8806, + "step": 4006 + }, + { + "epoch": 1.39, + "grad_norm": 0.18590988218784332, + "learning_rate": 0.00014475890732577778, + "loss": 0.8824, + "step": 4007 + }, + { + "epoch": 1.39, + "grad_norm": 0.19554437696933746, + "learning_rate": 0.0001447341450269835, + "loss": 0.9777, + "step": 4008 + }, + { + "epoch": 1.39, + "grad_norm": 0.19814185798168182, + "learning_rate": 0.0001447093792985114, + "loss": 0.8583, + "step": 4009 + }, + { + "epoch": 1.39, + "grad_norm": 0.2064286172389984, + "learning_rate": 0.0001446846101422602, + "loss": 0.9261, + "step": 4010 + }, + { + "epoch": 1.39, + "grad_norm": 0.208113431930542, + "learning_rate": 0.00014465983756012892, + "loss": 0.9472, + "step": 4011 + }, + { + "epoch": 1.39, + "grad_norm": 0.20458854734897614, + "learning_rate": 0.0001446350615540168, + "loss": 0.936, + "step": 4012 + }, + { + "epoch": 1.39, + "grad_norm": 0.19451703131198883, + "learning_rate": 0.00014461028212582342, + "loss": 0.9528, + "step": 4013 + }, + { + "epoch": 1.39, + "grad_norm": 0.1916043907403946, + "learning_rate": 0.00014458549927744854, + "loss": 0.8705, + "step": 4014 + }, + { + "epoch": 1.39, + "grad_norm": 0.19787919521331787, + "learning_rate": 0.00014456071301079216, + "loss": 0.9302, + "step": 4015 + }, + { + "epoch": 1.39, + "grad_norm": 0.20245125889778137, + "learning_rate": 0.00014453592332775466, + "loss": 0.9772, + "step": 4016 + }, + { + "epoch": 1.39, + "grad_norm": 0.19883689284324646, + "learning_rate": 0.00014451113023023664, + "loss": 0.8607, + "step": 4017 + }, + { + "epoch": 1.39, + "grad_norm": 0.20537172257900238, + "learning_rate": 0.00014448633372013886, + "loss": 0.8589, + "step": 4018 + }, + { + "epoch": 1.4, + "grad_norm": 0.1990588903427124, + "learning_rate": 0.00014446153379936246, + "loss": 0.935, + "step": 4019 + }, + { + "epoch": 1.4, + "grad_norm": 0.20056746900081635, + "learning_rate": 0.00014443673046980876, + "loss": 0.8643, + "step": 4020 + }, + { + "epoch": 1.4, + "grad_norm": 0.19613921642303467, + "learning_rate": 0.00014441192373337947, + "loss": 0.9284, + "step": 4021 + }, + { + "epoch": 1.4, + "grad_norm": 0.19420263171195984, + "learning_rate": 0.0001443871135919764, + "loss": 0.8983, + "step": 4022 + }, + { + "epoch": 1.4, + "grad_norm": 0.19666177034378052, + "learning_rate": 0.00014436230004750172, + "loss": 0.8861, + "step": 4023 + }, + { + "epoch": 1.4, + "grad_norm": 0.20106558501720428, + "learning_rate": 0.0001443374831018578, + "loss": 0.9547, + "step": 4024 + }, + { + "epoch": 1.4, + "grad_norm": 0.1971885859966278, + "learning_rate": 0.00014431266275694734, + "loss": 0.9432, + "step": 4025 + }, + { + "epoch": 1.4, + "grad_norm": 0.19694119691848755, + "learning_rate": 0.0001442878390146733, + "loss": 0.9428, + "step": 4026 + }, + { + "epoch": 1.4, + "grad_norm": 0.2107475996017456, + "learning_rate": 0.0001442630118769388, + "loss": 1.0014, + "step": 4027 + }, + { + "epoch": 1.4, + "grad_norm": 0.19050538539886475, + "learning_rate": 0.0001442381813456473, + "loss": 0.9064, + "step": 4028 + }, + { + "epoch": 1.4, + "grad_norm": 0.2018805891275406, + "learning_rate": 0.00014421334742270256, + "loss": 0.9149, + "step": 4029 + }, + { + "epoch": 1.4, + "grad_norm": 0.19574452936649323, + "learning_rate": 0.00014418851011000847, + "loss": 0.9126, + "step": 4030 + }, + { + "epoch": 1.4, + "grad_norm": 0.19954833388328552, + "learning_rate": 0.00014416366940946934, + "loss": 0.8749, + "step": 4031 + }, + { + "epoch": 1.4, + "grad_norm": 0.19605934619903564, + "learning_rate": 0.00014413882532298956, + "loss": 0.8863, + "step": 4032 + }, + { + "epoch": 1.4, + "grad_norm": 0.2000497281551361, + "learning_rate": 0.00014411397785247398, + "loss": 0.9146, + "step": 4033 + }, + { + "epoch": 1.4, + "grad_norm": 0.19151616096496582, + "learning_rate": 0.00014408912699982756, + "loss": 0.9435, + "step": 4034 + }, + { + "epoch": 1.4, + "grad_norm": 0.1961325705051422, + "learning_rate": 0.00014406427276695552, + "loss": 0.8814, + "step": 4035 + }, + { + "epoch": 1.4, + "grad_norm": 0.1881711333990097, + "learning_rate": 0.00014403941515576344, + "loss": 0.8064, + "step": 4036 + }, + { + "epoch": 1.4, + "grad_norm": 0.19434835016727448, + "learning_rate": 0.0001440145541681571, + "loss": 0.888, + "step": 4037 + }, + { + "epoch": 1.4, + "grad_norm": 0.19846580922603607, + "learning_rate": 0.00014398968980604252, + "loss": 0.942, + "step": 4038 + }, + { + "epoch": 1.4, + "grad_norm": 0.21146689355373383, + "learning_rate": 0.00014396482207132606, + "loss": 0.9229, + "step": 4039 + }, + { + "epoch": 1.4, + "grad_norm": 0.19547779858112335, + "learning_rate": 0.00014393995096591416, + "loss": 0.8803, + "step": 4040 + }, + { + "epoch": 1.4, + "grad_norm": 0.19985531270503998, + "learning_rate": 0.00014391507649171375, + "loss": 0.9313, + "step": 4041 + }, + { + "epoch": 1.4, + "grad_norm": 0.2019869089126587, + "learning_rate": 0.00014389019865063187, + "loss": 0.9602, + "step": 4042 + }, + { + "epoch": 1.4, + "grad_norm": 0.18351396918296814, + "learning_rate": 0.00014386531744457585, + "loss": 0.8261, + "step": 4043 + }, + { + "epoch": 1.4, + "grad_norm": 0.19523383677005768, + "learning_rate": 0.0001438404328754533, + "loss": 0.9308, + "step": 4044 + }, + { + "epoch": 1.4, + "grad_norm": 0.19494594633579254, + "learning_rate": 0.000143815544945172, + "loss": 0.9157, + "step": 4045 + }, + { + "epoch": 1.4, + "grad_norm": 0.19079479575157166, + "learning_rate": 0.00014379065365564017, + "loss": 0.8429, + "step": 4046 + }, + { + "epoch": 1.4, + "grad_norm": 0.19934016466140747, + "learning_rate": 0.00014376575900876612, + "loss": 0.9156, + "step": 4047 + }, + { + "epoch": 1.41, + "grad_norm": 0.20024937391281128, + "learning_rate": 0.00014374086100645846, + "loss": 0.9277, + "step": 4048 + }, + { + "epoch": 1.41, + "grad_norm": 0.19894444942474365, + "learning_rate": 0.00014371595965062607, + "loss": 0.9388, + "step": 4049 + }, + { + "epoch": 1.41, + "grad_norm": 0.2016160637140274, + "learning_rate": 0.0001436910549431781, + "loss": 0.8564, + "step": 4050 + }, + { + "epoch": 1.41, + "grad_norm": 0.2054978460073471, + "learning_rate": 0.00014366614688602396, + "loss": 1.0209, + "step": 4051 + }, + { + "epoch": 1.41, + "grad_norm": 0.18863257765769958, + "learning_rate": 0.00014364123548107327, + "loss": 0.8814, + "step": 4052 + }, + { + "epoch": 1.41, + "grad_norm": 0.19060060381889343, + "learning_rate": 0.00014361632073023596, + "loss": 0.9216, + "step": 4053 + }, + { + "epoch": 1.41, + "grad_norm": 0.2042149156332016, + "learning_rate": 0.0001435914026354222, + "loss": 0.9728, + "step": 4054 + }, + { + "epoch": 1.41, + "grad_norm": 0.20097431540489197, + "learning_rate": 0.00014356648119854236, + "loss": 0.9218, + "step": 4055 + }, + { + "epoch": 1.41, + "grad_norm": 0.20287711918354034, + "learning_rate": 0.0001435415564215072, + "loss": 0.9008, + "step": 4056 + }, + { + "epoch": 1.41, + "grad_norm": 0.1915043294429779, + "learning_rate": 0.00014351662830622757, + "loss": 0.8405, + "step": 4057 + }, + { + "epoch": 1.41, + "grad_norm": 0.20655031502246857, + "learning_rate": 0.00014349169685461471, + "loss": 1.0139, + "step": 4058 + }, + { + "epoch": 1.41, + "grad_norm": 0.19209517538547516, + "learning_rate": 0.00014346676206858009, + "loss": 0.9055, + "step": 4059 + }, + { + "epoch": 1.41, + "grad_norm": 0.20339743793010712, + "learning_rate": 0.00014344182395003533, + "loss": 0.9622, + "step": 4060 + }, + { + "epoch": 1.41, + "grad_norm": 0.19336020946502686, + "learning_rate": 0.00014341688250089243, + "loss": 0.8918, + "step": 4061 + }, + { + "epoch": 1.41, + "grad_norm": 0.19621950387954712, + "learning_rate": 0.0001433919377230636, + "loss": 0.8601, + "step": 4062 + }, + { + "epoch": 1.41, + "grad_norm": 0.21147488057613373, + "learning_rate": 0.00014336698961846134, + "loss": 0.9483, + "step": 4063 + }, + { + "epoch": 1.41, + "grad_norm": 0.2010038048028946, + "learning_rate": 0.0001433420381889983, + "loss": 0.9204, + "step": 4064 + }, + { + "epoch": 1.41, + "grad_norm": 0.1948138177394867, + "learning_rate": 0.00014331708343658748, + "loss": 0.9032, + "step": 4065 + }, + { + "epoch": 1.41, + "grad_norm": 0.20164598524570465, + "learning_rate": 0.00014329212536314217, + "loss": 0.8759, + "step": 4066 + }, + { + "epoch": 1.41, + "grad_norm": 0.20524714887142181, + "learning_rate": 0.0001432671639705758, + "loss": 0.9107, + "step": 4067 + }, + { + "epoch": 1.41, + "grad_norm": 0.2029474675655365, + "learning_rate": 0.00014324219926080208, + "loss": 0.8739, + "step": 4068 + }, + { + "epoch": 1.41, + "grad_norm": 0.20881551504135132, + "learning_rate": 0.0001432172312357351, + "loss": 0.9433, + "step": 4069 + }, + { + "epoch": 1.41, + "grad_norm": 0.198815256357193, + "learning_rate": 0.00014319225989728901, + "loss": 0.9242, + "step": 4070 + }, + { + "epoch": 1.41, + "grad_norm": 0.19560791552066803, + "learning_rate": 0.0001431672852473784, + "loss": 0.9165, + "step": 4071 + }, + { + "epoch": 1.41, + "grad_norm": 0.2154237926006317, + "learning_rate": 0.00014314230728791795, + "loss": 0.8763, + "step": 4072 + }, + { + "epoch": 1.41, + "grad_norm": 0.20212805271148682, + "learning_rate": 0.00014311732602082273, + "loss": 0.9695, + "step": 4073 + }, + { + "epoch": 1.41, + "grad_norm": 0.20487231016159058, + "learning_rate": 0.00014309234144800796, + "loss": 0.9338, + "step": 4074 + }, + { + "epoch": 1.41, + "grad_norm": 0.19709345698356628, + "learning_rate": 0.00014306735357138916, + "loss": 0.9005, + "step": 4075 + }, + { + "epoch": 1.42, + "grad_norm": 0.20154863595962524, + "learning_rate": 0.00014304236239288213, + "loss": 0.8645, + "step": 4076 + }, + { + "epoch": 1.42, + "grad_norm": 0.20887543261051178, + "learning_rate": 0.0001430173679144029, + "loss": 0.9507, + "step": 4077 + }, + { + "epoch": 1.42, + "grad_norm": 0.2201569527387619, + "learning_rate": 0.00014299237013786772, + "loss": 0.8901, + "step": 4078 + }, + { + "epoch": 1.42, + "grad_norm": 0.20078633725643158, + "learning_rate": 0.00014296736906519312, + "loss": 0.8637, + "step": 4079 + }, + { + "epoch": 1.42, + "grad_norm": 0.20457907021045685, + "learning_rate": 0.00014294236469829588, + "loss": 0.947, + "step": 4080 + }, + { + "epoch": 1.42, + "grad_norm": 0.20842711627483368, + "learning_rate": 0.00014291735703909306, + "loss": 0.942, + "step": 4081 + }, + { + "epoch": 1.42, + "grad_norm": 0.21422892808914185, + "learning_rate": 0.00014289234608950193, + "loss": 0.9287, + "step": 4082 + }, + { + "epoch": 1.42, + "grad_norm": 0.20222635567188263, + "learning_rate": 0.00014286733185144004, + "loss": 0.9141, + "step": 4083 + }, + { + "epoch": 1.42, + "grad_norm": 0.20117203891277313, + "learning_rate": 0.00014284231432682516, + "loss": 0.8867, + "step": 4084 + }, + { + "epoch": 1.42, + "grad_norm": 0.20013990998268127, + "learning_rate": 0.00014281729351757534, + "loss": 0.9631, + "step": 4085 + }, + { + "epoch": 1.42, + "grad_norm": 0.1925857663154602, + "learning_rate": 0.00014279226942560887, + "loss": 0.8808, + "step": 4086 + }, + { + "epoch": 1.42, + "grad_norm": 0.19970326125621796, + "learning_rate": 0.00014276724205284434, + "loss": 0.9158, + "step": 4087 + }, + { + "epoch": 1.42, + "grad_norm": 0.2049684375524521, + "learning_rate": 0.0001427422114012005, + "loss": 0.8965, + "step": 4088 + }, + { + "epoch": 1.42, + "grad_norm": 0.21222761273384094, + "learning_rate": 0.00014271717747259646, + "loss": 0.9446, + "step": 4089 + }, + { + "epoch": 1.42, + "grad_norm": 0.19456645846366882, + "learning_rate": 0.00014269214026895144, + "loss": 0.855, + "step": 4090 + }, + { + "epoch": 1.42, + "grad_norm": 0.20363333821296692, + "learning_rate": 0.00014266709979218504, + "loss": 0.9686, + "step": 4091 + }, + { + "epoch": 1.42, + "grad_norm": 0.20178015530109406, + "learning_rate": 0.00014264205604421704, + "loss": 0.9871, + "step": 4092 + }, + { + "epoch": 1.42, + "grad_norm": 0.19955269992351532, + "learning_rate": 0.00014261700902696753, + "loss": 0.9779, + "step": 4093 + }, + { + "epoch": 1.42, + "grad_norm": 0.2012970894575119, + "learning_rate": 0.00014259195874235682, + "loss": 0.891, + "step": 4094 + }, + { + "epoch": 1.42, + "grad_norm": 0.19689302146434784, + "learning_rate": 0.0001425669051923054, + "loss": 0.8973, + "step": 4095 + }, + { + "epoch": 1.42, + "grad_norm": 0.18733972311019897, + "learning_rate": 0.00014254184837873414, + "loss": 0.8695, + "step": 4096 + }, + { + "epoch": 1.42, + "grad_norm": 0.19153453409671783, + "learning_rate": 0.00014251678830356408, + "loss": 0.9009, + "step": 4097 + }, + { + "epoch": 1.42, + "grad_norm": 0.1985730081796646, + "learning_rate": 0.0001424917249687165, + "loss": 0.9046, + "step": 4098 + }, + { + "epoch": 1.42, + "grad_norm": 0.19961155951023102, + "learning_rate": 0.000142466658376113, + "loss": 0.9044, + "step": 4099 + }, + { + "epoch": 1.42, + "grad_norm": 0.1989118456840515, + "learning_rate": 0.00014244158852767534, + "loss": 0.9417, + "step": 4100 + }, + { + "epoch": 1.42, + "grad_norm": 0.19771447777748108, + "learning_rate": 0.0001424165154253256, + "loss": 0.9204, + "step": 4101 + }, + { + "epoch": 1.42, + "grad_norm": 0.18623396754264832, + "learning_rate": 0.0001423914390709861, + "loss": 0.8632, + "step": 4102 + }, + { + "epoch": 1.42, + "grad_norm": 0.19544756412506104, + "learning_rate": 0.00014236635946657933, + "loss": 0.9045, + "step": 4103 + }, + { + "epoch": 1.42, + "grad_norm": 0.20649179816246033, + "learning_rate": 0.0001423412766140282, + "loss": 0.9267, + "step": 4104 + }, + { + "epoch": 1.43, + "grad_norm": 0.20009225606918335, + "learning_rate": 0.00014231619051525564, + "loss": 0.8781, + "step": 4105 + }, + { + "epoch": 1.43, + "grad_norm": 0.20141996443271637, + "learning_rate": 0.00014229110117218503, + "loss": 0.8861, + "step": 4106 + }, + { + "epoch": 1.43, + "grad_norm": 0.1917063295841217, + "learning_rate": 0.0001422660085867399, + "loss": 0.9038, + "step": 4107 + }, + { + "epoch": 1.43, + "grad_norm": 0.19084444642066956, + "learning_rate": 0.00014224091276084406, + "loss": 0.8411, + "step": 4108 + }, + { + "epoch": 1.43, + "grad_norm": 0.19777989387512207, + "learning_rate": 0.00014221581369642158, + "loss": 0.889, + "step": 4109 + }, + { + "epoch": 1.43, + "grad_norm": 0.19790798425674438, + "learning_rate": 0.00014219071139539668, + "loss": 0.9385, + "step": 4110 + }, + { + "epoch": 1.43, + "grad_norm": 0.19884786009788513, + "learning_rate": 0.00014216560585969396, + "loss": 0.9052, + "step": 4111 + }, + { + "epoch": 1.43, + "grad_norm": 0.1907127946615219, + "learning_rate": 0.0001421404970912382, + "loss": 0.8382, + "step": 4112 + }, + { + "epoch": 1.43, + "grad_norm": 0.19692125916481018, + "learning_rate": 0.00014211538509195442, + "loss": 0.8997, + "step": 4113 + }, + { + "epoch": 1.43, + "grad_norm": 0.2003505676984787, + "learning_rate": 0.00014209026986376796, + "loss": 0.8873, + "step": 4114 + }, + { + "epoch": 1.43, + "grad_norm": 0.20393992960453033, + "learning_rate": 0.00014206515140860427, + "loss": 0.9339, + "step": 4115 + }, + { + "epoch": 1.43, + "grad_norm": 0.1977144181728363, + "learning_rate": 0.0001420400297283892, + "loss": 0.8991, + "step": 4116 + }, + { + "epoch": 1.43, + "grad_norm": 0.1982010304927826, + "learning_rate": 0.0001420149048250488, + "loss": 0.9067, + "step": 4117 + }, + { + "epoch": 1.43, + "grad_norm": 0.20188704133033752, + "learning_rate": 0.00014198977670050925, + "loss": 0.9074, + "step": 4118 + }, + { + "epoch": 1.43, + "grad_norm": 0.19379226863384247, + "learning_rate": 0.00014196464535669718, + "loss": 0.9018, + "step": 4119 + }, + { + "epoch": 1.43, + "grad_norm": 0.19640572369098663, + "learning_rate": 0.0001419395107955393, + "loss": 0.9235, + "step": 4120 + }, + { + "epoch": 1.43, + "grad_norm": 0.19844560325145721, + "learning_rate": 0.0001419143730189626, + "loss": 0.9753, + "step": 4121 + }, + { + "epoch": 1.43, + "grad_norm": 0.20532171428203583, + "learning_rate": 0.00014188923202889439, + "loss": 0.8703, + "step": 4122 + }, + { + "epoch": 1.43, + "grad_norm": 0.2062155157327652, + "learning_rate": 0.00014186408782726218, + "loss": 0.8938, + "step": 4123 + }, + { + "epoch": 1.43, + "grad_norm": 0.1967145800590515, + "learning_rate": 0.00014183894041599373, + "loss": 0.9188, + "step": 4124 + }, + { + "epoch": 1.43, + "grad_norm": 0.19157996773719788, + "learning_rate": 0.000141813789797017, + "loss": 0.9115, + "step": 4125 + }, + { + "epoch": 1.43, + "grad_norm": 0.1903439611196518, + "learning_rate": 0.00014178863597226027, + "loss": 0.8493, + "step": 4126 + }, + { + "epoch": 1.43, + "grad_norm": 0.19151780009269714, + "learning_rate": 0.00014176347894365204, + "loss": 0.9128, + "step": 4127 + }, + { + "epoch": 1.43, + "grad_norm": 0.20262552797794342, + "learning_rate": 0.00014173831871312103, + "loss": 0.923, + "step": 4128 + }, + { + "epoch": 1.43, + "grad_norm": 0.20540249347686768, + "learning_rate": 0.00014171315528259623, + "loss": 0.9111, + "step": 4129 + }, + { + "epoch": 1.43, + "grad_norm": 0.2055845409631729, + "learning_rate": 0.0001416879886540069, + "loss": 0.9231, + "step": 4130 + }, + { + "epoch": 1.43, + "grad_norm": 0.19773642718791962, + "learning_rate": 0.00014166281882928245, + "loss": 0.9057, + "step": 4131 + }, + { + "epoch": 1.43, + "grad_norm": 0.19996729493141174, + "learning_rate": 0.00014163764581035266, + "loss": 0.9112, + "step": 4132 + }, + { + "epoch": 1.44, + "grad_norm": 0.21551403403282166, + "learning_rate": 0.00014161246959914744, + "loss": 0.9638, + "step": 4133 + }, + { + "epoch": 1.44, + "grad_norm": 0.1965765506029129, + "learning_rate": 0.00014158729019759706, + "loss": 0.8685, + "step": 4134 + }, + { + "epoch": 1.44, + "grad_norm": 0.20234429836273193, + "learning_rate": 0.00014156210760763198, + "loss": 0.94, + "step": 4135 + }, + { + "epoch": 1.44, + "grad_norm": 0.19687169790267944, + "learning_rate": 0.00014153692183118283, + "loss": 0.9069, + "step": 4136 + }, + { + "epoch": 1.44, + "grad_norm": 0.20247773826122284, + "learning_rate": 0.0001415117328701806, + "loss": 0.9672, + "step": 4137 + }, + { + "epoch": 1.44, + "grad_norm": 0.19329603016376495, + "learning_rate": 0.00014148654072655646, + "loss": 0.8837, + "step": 4138 + }, + { + "epoch": 1.44, + "grad_norm": 0.19726818799972534, + "learning_rate": 0.00014146134540224186, + "loss": 0.8691, + "step": 4139 + }, + { + "epoch": 1.44, + "grad_norm": 0.19611580669879913, + "learning_rate": 0.00014143614689916847, + "loss": 0.9759, + "step": 4140 + }, + { + "epoch": 1.44, + "grad_norm": 0.19384996592998505, + "learning_rate": 0.00014141094521926816, + "loss": 0.8821, + "step": 4141 + }, + { + "epoch": 1.44, + "grad_norm": 0.20498308539390564, + "learning_rate": 0.00014138574036447318, + "loss": 0.933, + "step": 4142 + }, + { + "epoch": 1.44, + "grad_norm": 0.20109273493289948, + "learning_rate": 0.00014136053233671587, + "loss": 0.9647, + "step": 4143 + }, + { + "epoch": 1.44, + "grad_norm": 0.20127202570438385, + "learning_rate": 0.0001413353211379289, + "loss": 0.9003, + "step": 4144 + }, + { + "epoch": 1.44, + "grad_norm": 0.1983189433813095, + "learning_rate": 0.0001413101067700452, + "loss": 0.8785, + "step": 4145 + }, + { + "epoch": 1.44, + "grad_norm": 0.2038581520318985, + "learning_rate": 0.00014128488923499785, + "loss": 0.9107, + "step": 4146 + }, + { + "epoch": 1.44, + "grad_norm": 0.1996109038591385, + "learning_rate": 0.00014125966853472026, + "loss": 0.8666, + "step": 4147 + }, + { + "epoch": 1.44, + "grad_norm": 0.19804580509662628, + "learning_rate": 0.00014123444467114603, + "loss": 0.9559, + "step": 4148 + }, + { + "epoch": 1.44, + "grad_norm": 0.1972654163837433, + "learning_rate": 0.000141209217646209, + "loss": 0.9336, + "step": 4149 + }, + { + "epoch": 1.44, + "grad_norm": 0.1962662637233734, + "learning_rate": 0.00014118398746184338, + "loss": 0.8972, + "step": 4150 + }, + { + "epoch": 1.44, + "grad_norm": 0.1934090405702591, + "learning_rate": 0.00014115875411998337, + "loss": 0.9185, + "step": 4151 + }, + { + "epoch": 1.44, + "grad_norm": 0.19299843907356262, + "learning_rate": 0.00014113351762256367, + "loss": 0.8529, + "step": 4152 + }, + { + "epoch": 1.44, + "grad_norm": 0.20182262361049652, + "learning_rate": 0.00014110827797151907, + "loss": 0.918, + "step": 4153 + }, + { + "epoch": 1.44, + "grad_norm": 0.19320084154605865, + "learning_rate": 0.00014108303516878465, + "loss": 0.8898, + "step": 4154 + }, + { + "epoch": 1.44, + "grad_norm": 0.20861463248729706, + "learning_rate": 0.00014105778921629576, + "loss": 0.9821, + "step": 4155 + }, + { + "epoch": 1.44, + "grad_norm": 0.1959957629442215, + "learning_rate": 0.0001410325401159879, + "loss": 0.9332, + "step": 4156 + }, + { + "epoch": 1.44, + "grad_norm": 0.20823851227760315, + "learning_rate": 0.0001410072878697969, + "loss": 0.9911, + "step": 4157 + }, + { + "epoch": 1.44, + "grad_norm": 0.19281212985515594, + "learning_rate": 0.00014098203247965875, + "loss": 0.8473, + "step": 4158 + }, + { + "epoch": 1.44, + "grad_norm": 0.20280300080776215, + "learning_rate": 0.0001409567739475098, + "loss": 0.9104, + "step": 4159 + }, + { + "epoch": 1.44, + "grad_norm": 0.20401504635810852, + "learning_rate": 0.00014093151227528657, + "loss": 0.9004, + "step": 4160 + }, + { + "epoch": 1.45, + "grad_norm": 0.19846060872077942, + "learning_rate": 0.00014090624746492577, + "loss": 0.9688, + "step": 4161 + }, + { + "epoch": 1.45, + "grad_norm": 0.20426008105278015, + "learning_rate": 0.00014088097951836444, + "loss": 0.9435, + "step": 4162 + }, + { + "epoch": 1.45, + "grad_norm": 0.201716348528862, + "learning_rate": 0.00014085570843753978, + "loss": 0.9067, + "step": 4163 + }, + { + "epoch": 1.45, + "grad_norm": 0.19857504963874817, + "learning_rate": 0.00014083043422438935, + "loss": 0.8696, + "step": 4164 + }, + { + "epoch": 1.45, + "grad_norm": 0.19757845997810364, + "learning_rate": 0.00014080515688085082, + "loss": 0.9303, + "step": 4165 + }, + { + "epoch": 1.45, + "grad_norm": 0.19037018716335297, + "learning_rate": 0.00014077987640886215, + "loss": 0.8765, + "step": 4166 + }, + { + "epoch": 1.45, + "grad_norm": 0.20469224452972412, + "learning_rate": 0.00014075459281036155, + "loss": 0.945, + "step": 4167 + }, + { + "epoch": 1.45, + "grad_norm": 0.20371098816394806, + "learning_rate": 0.00014072930608728746, + "loss": 0.8623, + "step": 4168 + }, + { + "epoch": 1.45, + "grad_norm": 0.19951358437538147, + "learning_rate": 0.0001407040162415786, + "loss": 0.9219, + "step": 4169 + }, + { + "epoch": 1.45, + "grad_norm": 0.19785675406455994, + "learning_rate": 0.00014067872327517384, + "loss": 0.9279, + "step": 4170 + }, + { + "epoch": 1.45, + "grad_norm": 0.19662170112133026, + "learning_rate": 0.00014065342719001236, + "loss": 0.8534, + "step": 4171 + }, + { + "epoch": 1.45, + "grad_norm": 0.1945483237504959, + "learning_rate": 0.0001406281279880336, + "loss": 0.9185, + "step": 4172 + }, + { + "epoch": 1.45, + "grad_norm": 0.2053336203098297, + "learning_rate": 0.00014060282567117716, + "loss": 0.9257, + "step": 4173 + }, + { + "epoch": 1.45, + "grad_norm": 0.19595634937286377, + "learning_rate": 0.0001405775202413829, + "loss": 0.9356, + "step": 4174 + }, + { + "epoch": 1.45, + "grad_norm": 0.19171977043151855, + "learning_rate": 0.00014055221170059096, + "loss": 0.8608, + "step": 4175 + }, + { + "epoch": 1.45, + "grad_norm": 0.20737864077091217, + "learning_rate": 0.0001405269000507417, + "loss": 0.9602, + "step": 4176 + }, + { + "epoch": 1.45, + "grad_norm": 0.19404630362987518, + "learning_rate": 0.00014050158529377573, + "loss": 0.812, + "step": 4177 + }, + { + "epoch": 1.45, + "grad_norm": 0.20775002241134644, + "learning_rate": 0.00014047626743163383, + "loss": 0.9876, + "step": 4178 + }, + { + "epoch": 1.45, + "grad_norm": 0.1968124359846115, + "learning_rate": 0.00014045094646625712, + "loss": 0.9084, + "step": 4179 + }, + { + "epoch": 1.45, + "grad_norm": 0.20270562171936035, + "learning_rate": 0.00014042562239958688, + "loss": 0.9759, + "step": 4180 + }, + { + "epoch": 1.45, + "grad_norm": 0.2031795233488083, + "learning_rate": 0.00014040029523356467, + "loss": 0.9155, + "step": 4181 + }, + { + "epoch": 1.45, + "grad_norm": 0.20602475106716156, + "learning_rate": 0.0001403749649701323, + "loss": 0.9024, + "step": 4182 + }, + { + "epoch": 1.45, + "grad_norm": 0.19497054815292358, + "learning_rate": 0.00014034963161123175, + "loss": 0.882, + "step": 4183 + }, + { + "epoch": 1.45, + "grad_norm": 0.18592894077301025, + "learning_rate": 0.00014032429515880524, + "loss": 0.853, + "step": 4184 + }, + { + "epoch": 1.45, + "grad_norm": 0.20189309120178223, + "learning_rate": 0.00014029895561479535, + "loss": 0.9096, + "step": 4185 + }, + { + "epoch": 1.45, + "grad_norm": 0.2048352062702179, + "learning_rate": 0.00014027361298114474, + "loss": 0.8815, + "step": 4186 + }, + { + "epoch": 1.45, + "grad_norm": 0.1962141990661621, + "learning_rate": 0.00014024826725979645, + "loss": 0.9115, + "step": 4187 + }, + { + "epoch": 1.45, + "grad_norm": 0.1993790864944458, + "learning_rate": 0.00014022291845269363, + "loss": 0.9071, + "step": 4188 + }, + { + "epoch": 1.45, + "grad_norm": 0.2057470977306366, + "learning_rate": 0.00014019756656177975, + "loss": 0.8741, + "step": 4189 + }, + { + "epoch": 1.46, + "grad_norm": 0.19505277276039124, + "learning_rate": 0.0001401722115889985, + "loss": 0.8974, + "step": 4190 + }, + { + "epoch": 1.46, + "grad_norm": 0.1985812783241272, + "learning_rate": 0.00014014685353629376, + "loss": 1.0093, + "step": 4191 + }, + { + "epoch": 1.46, + "grad_norm": 0.19670027494430542, + "learning_rate": 0.0001401214924056097, + "loss": 0.9443, + "step": 4192 + }, + { + "epoch": 1.46, + "grad_norm": 0.2052966207265854, + "learning_rate": 0.00014009612819889072, + "loss": 0.9262, + "step": 4193 + }, + { + "epoch": 1.46, + "grad_norm": 0.2016931027173996, + "learning_rate": 0.0001400707609180814, + "loss": 0.9776, + "step": 4194 + }, + { + "epoch": 1.46, + "grad_norm": 0.20231129229068756, + "learning_rate": 0.00014004539056512667, + "loss": 0.9499, + "step": 4195 + }, + { + "epoch": 1.46, + "grad_norm": 0.1933603733778, + "learning_rate": 0.00014002001714197155, + "loss": 0.9471, + "step": 4196 + }, + { + "epoch": 1.46, + "grad_norm": 0.1898379921913147, + "learning_rate": 0.0001399946406505614, + "loss": 0.8167, + "step": 4197 + }, + { + "epoch": 1.46, + "grad_norm": 0.19617202877998352, + "learning_rate": 0.00013996926109284183, + "loss": 0.9083, + "step": 4198 + }, + { + "epoch": 1.46, + "grad_norm": 0.1954466700553894, + "learning_rate": 0.00013994387847075855, + "loss": 0.8999, + "step": 4199 + }, + { + "epoch": 1.46, + "grad_norm": 0.19861939549446106, + "learning_rate": 0.00013991849278625765, + "loss": 0.9274, + "step": 4200 + }, + { + "epoch": 1.46, + "grad_norm": 0.1931011974811554, + "learning_rate": 0.00013989310404128542, + "loss": 0.9264, + "step": 4201 + }, + { + "epoch": 1.46, + "grad_norm": 0.19421404600143433, + "learning_rate": 0.0001398677122377883, + "loss": 0.9099, + "step": 4202 + }, + { + "epoch": 1.46, + "grad_norm": 0.2066340446472168, + "learning_rate": 0.0001398423173777131, + "loss": 0.9021, + "step": 4203 + }, + { + "epoch": 1.46, + "grad_norm": 0.20380087196826935, + "learning_rate": 0.00013981691946300675, + "loss": 0.9444, + "step": 4204 + }, + { + "epoch": 1.46, + "grad_norm": 0.2037912905216217, + "learning_rate": 0.00013979151849561648, + "loss": 0.9184, + "step": 4205 + }, + { + "epoch": 1.46, + "grad_norm": 0.2029280811548233, + "learning_rate": 0.00013976611447748968, + "loss": 0.8991, + "step": 4206 + }, + { + "epoch": 1.46, + "grad_norm": 0.18916313350200653, + "learning_rate": 0.0001397407074105741, + "loss": 0.9124, + "step": 4207 + }, + { + "epoch": 1.46, + "grad_norm": 0.20596222579479218, + "learning_rate": 0.00013971529729681762, + "loss": 1.0275, + "step": 4208 + }, + { + "epoch": 1.46, + "grad_norm": 0.22113633155822754, + "learning_rate": 0.00013968988413816833, + "loss": 1.0119, + "step": 4209 + }, + { + "epoch": 1.46, + "grad_norm": 0.19565562903881073, + "learning_rate": 0.00013966446793657467, + "loss": 0.8888, + "step": 4210 + }, + { + "epoch": 1.46, + "grad_norm": 0.20347142219543457, + "learning_rate": 0.00013963904869398528, + "loss": 0.9253, + "step": 4211 + }, + { + "epoch": 1.46, + "grad_norm": 0.19310647249221802, + "learning_rate": 0.00013961362641234893, + "loss": 0.8565, + "step": 4212 + }, + { + "epoch": 1.46, + "grad_norm": 0.20248988270759583, + "learning_rate": 0.0001395882010936147, + "loss": 0.9677, + "step": 4213 + }, + { + "epoch": 1.46, + "grad_norm": 0.1965925544500351, + "learning_rate": 0.0001395627727397319, + "loss": 0.9319, + "step": 4214 + }, + { + "epoch": 1.46, + "grad_norm": 0.21116456389427185, + "learning_rate": 0.00013953734135265013, + "loss": 0.9578, + "step": 4215 + }, + { + "epoch": 1.46, + "grad_norm": 0.20068742334842682, + "learning_rate": 0.0001395119069343191, + "loss": 0.899, + "step": 4216 + }, + { + "epoch": 1.46, + "grad_norm": 0.19809217751026154, + "learning_rate": 0.00013948646948668886, + "loss": 0.9072, + "step": 4217 + }, + { + "epoch": 1.47, + "grad_norm": 0.20866192877292633, + "learning_rate": 0.00013946102901170963, + "loss": 0.8836, + "step": 4218 + }, + { + "epoch": 1.47, + "grad_norm": 0.18966230750083923, + "learning_rate": 0.00013943558551133186, + "loss": 0.8439, + "step": 4219 + }, + { + "epoch": 1.47, + "grad_norm": 0.20649796724319458, + "learning_rate": 0.00013941013898750631, + "loss": 0.9673, + "step": 4220 + }, + { + "epoch": 1.47, + "grad_norm": 0.20510034263134003, + "learning_rate": 0.0001393846894421838, + "loss": 0.9216, + "step": 4221 + }, + { + "epoch": 1.47, + "grad_norm": 0.2012736052274704, + "learning_rate": 0.00013935923687731567, + "loss": 0.8898, + "step": 4222 + }, + { + "epoch": 1.47, + "grad_norm": 0.18844866752624512, + "learning_rate": 0.00013933378129485317, + "loss": 0.8726, + "step": 4223 + }, + { + "epoch": 1.47, + "grad_norm": 0.19127140939235687, + "learning_rate": 0.00013930832269674797, + "loss": 0.8688, + "step": 4224 + }, + { + "epoch": 1.47, + "grad_norm": 0.20248930156230927, + "learning_rate": 0.00013928286108495197, + "loss": 0.938, + "step": 4225 + }, + { + "epoch": 1.47, + "grad_norm": 0.20166027545928955, + "learning_rate": 0.0001392573964614172, + "loss": 0.958, + "step": 4226 + }, + { + "epoch": 1.47, + "grad_norm": 0.19940727949142456, + "learning_rate": 0.000139231928828096, + "loss": 0.8704, + "step": 4227 + }, + { + "epoch": 1.47, + "grad_norm": 0.2015244960784912, + "learning_rate": 0.00013920645818694098, + "loss": 0.9304, + "step": 4228 + }, + { + "epoch": 1.47, + "grad_norm": 0.1990206241607666, + "learning_rate": 0.00013918098453990485, + "loss": 0.9443, + "step": 4229 + }, + { + "epoch": 1.47, + "grad_norm": 0.20932473242282867, + "learning_rate": 0.00013915550788894064, + "loss": 0.9485, + "step": 4230 + }, + { + "epoch": 1.47, + "grad_norm": 0.19755159318447113, + "learning_rate": 0.00013913002823600162, + "loss": 0.9026, + "step": 4231 + }, + { + "epoch": 1.47, + "grad_norm": 0.20039217174053192, + "learning_rate": 0.00013910454558304126, + "loss": 0.9063, + "step": 4232 + }, + { + "epoch": 1.47, + "grad_norm": 0.2012191265821457, + "learning_rate": 0.00013907905993201328, + "loss": 0.9489, + "step": 4233 + }, + { + "epoch": 1.47, + "grad_norm": 0.19405993819236755, + "learning_rate": 0.0001390535712848715, + "loss": 0.94, + "step": 4234 + }, + { + "epoch": 1.47, + "grad_norm": 0.2001253366470337, + "learning_rate": 0.00013902807964357025, + "loss": 0.955, + "step": 4235 + }, + { + "epoch": 1.47, + "grad_norm": 0.19616849720478058, + "learning_rate": 0.00013900258501006382, + "loss": 0.9086, + "step": 4236 + }, + { + "epoch": 1.47, + "grad_norm": 0.18834726512432098, + "learning_rate": 0.00013897708738630686, + "loss": 0.8754, + "step": 4237 + }, + { + "epoch": 1.47, + "grad_norm": 0.18973401188850403, + "learning_rate": 0.00013895158677425424, + "loss": 0.9142, + "step": 4238 + }, + { + "epoch": 1.47, + "grad_norm": 0.19476298987865448, + "learning_rate": 0.00013892608317586097, + "loss": 0.829, + "step": 4239 + }, + { + "epoch": 1.47, + "grad_norm": 0.1959441602230072, + "learning_rate": 0.00013890057659308246, + "loss": 0.9672, + "step": 4240 + }, + { + "epoch": 1.47, + "grad_norm": 0.20653578639030457, + "learning_rate": 0.0001388750670278742, + "loss": 0.9431, + "step": 4241 + }, + { + "epoch": 1.47, + "grad_norm": 0.20152495801448822, + "learning_rate": 0.00013884955448219196, + "loss": 0.9173, + "step": 4242 + }, + { + "epoch": 1.47, + "grad_norm": 0.20955972373485565, + "learning_rate": 0.0001388240389579917, + "loss": 0.9552, + "step": 4243 + }, + { + "epoch": 1.47, + "grad_norm": 0.1913870871067047, + "learning_rate": 0.00013879852045722968, + "loss": 0.9056, + "step": 4244 + }, + { + "epoch": 1.47, + "grad_norm": 0.2016601711511612, + "learning_rate": 0.00013877299898186235, + "loss": 0.9518, + "step": 4245 + }, + { + "epoch": 1.47, + "grad_norm": 0.20458324253559113, + "learning_rate": 0.00013874747453384643, + "loss": 0.933, + "step": 4246 + }, + { + "epoch": 1.48, + "grad_norm": 0.20345601439476013, + "learning_rate": 0.00013872194711513877, + "loss": 0.937, + "step": 4247 + }, + { + "epoch": 1.48, + "grad_norm": 0.19450397789478302, + "learning_rate": 0.00013869641672769654, + "loss": 0.9003, + "step": 4248 + }, + { + "epoch": 1.48, + "grad_norm": 0.19730433821678162, + "learning_rate": 0.00013867088337347704, + "loss": 0.8861, + "step": 4249 + }, + { + "epoch": 1.48, + "grad_norm": 0.21060039103031158, + "learning_rate": 0.00013864534705443796, + "loss": 1.0102, + "step": 4250 + }, + { + "epoch": 1.48, + "grad_norm": 0.20124833285808563, + "learning_rate": 0.00013861980777253707, + "loss": 0.9119, + "step": 4251 + }, + { + "epoch": 1.48, + "grad_norm": 0.1974477618932724, + "learning_rate": 0.0001385942655297324, + "loss": 0.8834, + "step": 4252 + }, + { + "epoch": 1.48, + "grad_norm": 0.201642706990242, + "learning_rate": 0.00013856872032798226, + "loss": 0.9881, + "step": 4253 + }, + { + "epoch": 1.48, + "grad_norm": 0.19658470153808594, + "learning_rate": 0.0001385431721692451, + "loss": 0.8564, + "step": 4254 + }, + { + "epoch": 1.48, + "grad_norm": 0.2029627114534378, + "learning_rate": 0.0001385176210554797, + "loss": 0.9421, + "step": 4255 + }, + { + "epoch": 1.48, + "grad_norm": 0.19619666039943695, + "learning_rate": 0.00013849206698864498, + "loss": 0.9308, + "step": 4256 + }, + { + "epoch": 1.48, + "grad_norm": 0.21590463817119598, + "learning_rate": 0.00013846650997070012, + "loss": 0.9667, + "step": 4257 + }, + { + "epoch": 1.48, + "grad_norm": 0.20509636402130127, + "learning_rate": 0.00013844095000360457, + "loss": 0.9068, + "step": 4258 + }, + { + "epoch": 1.48, + "grad_norm": 0.2073296159505844, + "learning_rate": 0.0001384153870893179, + "loss": 0.9285, + "step": 4259 + }, + { + "epoch": 1.48, + "grad_norm": 0.20427079498767853, + "learning_rate": 0.00013838982122979998, + "loss": 0.955, + "step": 4260 + }, + { + "epoch": 1.48, + "eval_loss": 0.9315346479415894, + "eval_runtime": 792.6138, + "eval_samples_per_second": 8.675, + "eval_steps_per_second": 4.338, + "step": 4260 + }, + { + "epoch": 1.48, + "grad_norm": 0.2080182284116745, + "learning_rate": 0.0001383642524270109, + "loss": 0.9158, + "step": 4261 + }, + { + "epoch": 1.48, + "grad_norm": 0.20025072991847992, + "learning_rate": 0.000138338680682911, + "loss": 0.9673, + "step": 4262 + }, + { + "epoch": 1.48, + "grad_norm": 0.19327238202095032, + "learning_rate": 0.00013831310599946082, + "loss": 0.8705, + "step": 4263 + }, + { + "epoch": 1.48, + "grad_norm": 0.20488376915454865, + "learning_rate": 0.00013828752837862103, + "loss": 0.9752, + "step": 4264 + }, + { + "epoch": 1.48, + "grad_norm": 0.1918758749961853, + "learning_rate": 0.00013826194782235273, + "loss": 0.8837, + "step": 4265 + }, + { + "epoch": 1.48, + "grad_norm": 0.19506444036960602, + "learning_rate": 0.00013823636433261705, + "loss": 0.9593, + "step": 4266 + }, + { + "epoch": 1.48, + "grad_norm": 0.19961698353290558, + "learning_rate": 0.00013821077791137545, + "loss": 0.9217, + "step": 4267 + }, + { + "epoch": 1.48, + "grad_norm": 0.1855212301015854, + "learning_rate": 0.00013818518856058962, + "loss": 0.9365, + "step": 4268 + }, + { + "epoch": 1.48, + "grad_norm": 0.1939588487148285, + "learning_rate": 0.00013815959628222138, + "loss": 0.8809, + "step": 4269 + }, + { + "epoch": 1.48, + "grad_norm": 0.19562089443206787, + "learning_rate": 0.0001381340010782329, + "loss": 0.9127, + "step": 4270 + }, + { + "epoch": 1.48, + "grad_norm": 0.1901262402534485, + "learning_rate": 0.00013810840295058646, + "loss": 0.9086, + "step": 4271 + }, + { + "epoch": 1.48, + "grad_norm": 0.1936071664094925, + "learning_rate": 0.00013808280190124466, + "loss": 0.9219, + "step": 4272 + }, + { + "epoch": 1.48, + "grad_norm": 0.20040670037269592, + "learning_rate": 0.00013805719793217032, + "loss": 0.939, + "step": 4273 + }, + { + "epoch": 1.48, + "grad_norm": 0.2018422931432724, + "learning_rate": 0.0001380315910453263, + "loss": 0.9056, + "step": 4274 + }, + { + "epoch": 1.49, + "grad_norm": 0.18731442093849182, + "learning_rate": 0.000138005981242676, + "loss": 0.9318, + "step": 4275 + }, + { + "epoch": 1.49, + "grad_norm": 0.19478319585323334, + "learning_rate": 0.00013798036852618275, + "loss": 0.8266, + "step": 4276 + }, + { + "epoch": 1.49, + "grad_norm": 0.2014399617910385, + "learning_rate": 0.0001379547528978103, + "loss": 0.9502, + "step": 4277 + }, + { + "epoch": 1.49, + "grad_norm": 0.19708126783370972, + "learning_rate": 0.00013792913435952252, + "loss": 0.8693, + "step": 4278 + }, + { + "epoch": 1.49, + "grad_norm": 0.20559723675251007, + "learning_rate": 0.00013790351291328347, + "loss": 0.9499, + "step": 4279 + }, + { + "epoch": 1.49, + "grad_norm": 0.19979485869407654, + "learning_rate": 0.0001378778885610576, + "loss": 0.8887, + "step": 4280 + }, + { + "epoch": 1.49, + "grad_norm": 0.20127691328525543, + "learning_rate": 0.00013785226130480946, + "loss": 0.9415, + "step": 4281 + }, + { + "epoch": 1.49, + "grad_norm": 0.20772793889045715, + "learning_rate": 0.00013782663114650379, + "loss": 0.9173, + "step": 4282 + }, + { + "epoch": 1.49, + "grad_norm": 0.19286410510540009, + "learning_rate": 0.00013780099808810564, + "loss": 0.8666, + "step": 4283 + }, + { + "epoch": 1.49, + "grad_norm": 0.2035040259361267, + "learning_rate": 0.00013777536213158023, + "loss": 0.9432, + "step": 4284 + }, + { + "epoch": 1.49, + "grad_norm": 0.19898177683353424, + "learning_rate": 0.00013774972327889302, + "loss": 0.8985, + "step": 4285 + }, + { + "epoch": 1.49, + "grad_norm": 0.1931130290031433, + "learning_rate": 0.0001377240815320097, + "loss": 0.9683, + "step": 4286 + }, + { + "epoch": 1.49, + "grad_norm": 0.20073358714580536, + "learning_rate": 0.00013769843689289614, + "loss": 0.8938, + "step": 4287 + }, + { + "epoch": 1.49, + "grad_norm": 0.20493336021900177, + "learning_rate": 0.00013767278936351854, + "loss": 0.9391, + "step": 4288 + }, + { + "epoch": 1.49, + "grad_norm": 0.20389360189437866, + "learning_rate": 0.00013764713894584313, + "loss": 0.8701, + "step": 4289 + }, + { + "epoch": 1.49, + "grad_norm": 0.20712493360042572, + "learning_rate": 0.0001376214856418366, + "loss": 0.9492, + "step": 4290 + }, + { + "epoch": 1.49, + "grad_norm": 0.2005758136510849, + "learning_rate": 0.00013759582945346564, + "loss": 0.877, + "step": 4291 + }, + { + "epoch": 1.49, + "grad_norm": 0.19810102880001068, + "learning_rate": 0.0001375701703826973, + "loss": 0.8873, + "step": 4292 + }, + { + "epoch": 1.49, + "grad_norm": 0.19251015782356262, + "learning_rate": 0.00013754450843149883, + "loss": 0.8936, + "step": 4293 + }, + { + "epoch": 1.49, + "grad_norm": 0.19246292114257812, + "learning_rate": 0.0001375188436018376, + "loss": 0.8776, + "step": 4294 + }, + { + "epoch": 1.49, + "grad_norm": 0.20692507922649384, + "learning_rate": 0.00013749317589568138, + "loss": 0.9213, + "step": 4295 + }, + { + "epoch": 1.49, + "grad_norm": 0.19895991683006287, + "learning_rate": 0.00013746750531499802, + "loss": 0.9143, + "step": 4296 + }, + { + "epoch": 1.49, + "grad_norm": 0.19519945979118347, + "learning_rate": 0.0001374418318617556, + "loss": 0.9254, + "step": 4297 + }, + { + "epoch": 1.49, + "grad_norm": 0.19164012372493744, + "learning_rate": 0.0001374161555379225, + "loss": 0.8961, + "step": 4298 + }, + { + "epoch": 1.49, + "grad_norm": 0.1917676478624344, + "learning_rate": 0.00013739047634546729, + "loss": 0.8847, + "step": 4299 + }, + { + "epoch": 1.49, + "grad_norm": 0.18594776093959808, + "learning_rate": 0.0001373647942863587, + "loss": 0.8765, + "step": 4300 + }, + { + "epoch": 1.49, + "grad_norm": 0.1943291574716568, + "learning_rate": 0.00013733910936256568, + "loss": 0.8541, + "step": 4301 + }, + { + "epoch": 1.49, + "grad_norm": 0.2048158198595047, + "learning_rate": 0.00013731342157605753, + "loss": 0.9455, + "step": 4302 + }, + { + "epoch": 1.5, + "grad_norm": 0.19819948077201843, + "learning_rate": 0.00013728773092880364, + "loss": 0.8636, + "step": 4303 + }, + { + "epoch": 1.5, + "grad_norm": 0.2028474062681198, + "learning_rate": 0.00013726203742277368, + "loss": 0.9195, + "step": 4304 + }, + { + "epoch": 1.5, + "grad_norm": 0.20039261877536774, + "learning_rate": 0.0001372363410599375, + "loss": 0.8995, + "step": 4305 + }, + { + "epoch": 1.5, + "grad_norm": 0.2012881338596344, + "learning_rate": 0.00013721064184226523, + "loss": 0.9096, + "step": 4306 + }, + { + "epoch": 1.5, + "grad_norm": 0.2081228643655777, + "learning_rate": 0.0001371849397717271, + "loss": 0.958, + "step": 4307 + }, + { + "epoch": 1.5, + "grad_norm": 0.19576063752174377, + "learning_rate": 0.00013715923485029365, + "loss": 0.9018, + "step": 4308 + }, + { + "epoch": 1.5, + "grad_norm": 0.2039271742105484, + "learning_rate": 0.0001371335270799357, + "loss": 0.9569, + "step": 4309 + }, + { + "epoch": 1.5, + "grad_norm": 0.1974891871213913, + "learning_rate": 0.00013710781646262418, + "loss": 0.929, + "step": 4310 + }, + { + "epoch": 1.5, + "grad_norm": 0.19602631032466888, + "learning_rate": 0.00013708210300033023, + "loss": 0.9001, + "step": 4311 + }, + { + "epoch": 1.5, + "grad_norm": 0.1993727684020996, + "learning_rate": 0.00013705638669502527, + "loss": 0.8957, + "step": 4312 + }, + { + "epoch": 1.5, + "grad_norm": 0.20080606639385223, + "learning_rate": 0.00013703066754868095, + "loss": 0.896, + "step": 4313 + }, + { + "epoch": 1.5, + "grad_norm": 0.20421402156352997, + "learning_rate": 0.0001370049455632691, + "loss": 0.9239, + "step": 4314 + }, + { + "epoch": 1.5, + "grad_norm": 0.19175568222999573, + "learning_rate": 0.00013697922074076173, + "loss": 0.9108, + "step": 4315 + }, + { + "epoch": 1.5, + "grad_norm": 0.2039186805486679, + "learning_rate": 0.00013695349308313114, + "loss": 0.8913, + "step": 4316 + }, + { + "epoch": 1.5, + "grad_norm": 0.20334771275520325, + "learning_rate": 0.00013692776259234982, + "loss": 0.8841, + "step": 4317 + }, + { + "epoch": 1.5, + "grad_norm": 0.20497700572013855, + "learning_rate": 0.00013690202927039047, + "loss": 0.8602, + "step": 4318 + }, + { + "epoch": 1.5, + "grad_norm": 0.2132885605096817, + "learning_rate": 0.00013687629311922602, + "loss": 0.9771, + "step": 4319 + }, + { + "epoch": 1.5, + "grad_norm": 0.20935708284378052, + "learning_rate": 0.00013685055414082962, + "loss": 0.9509, + "step": 4320 + }, + { + "epoch": 1.5, + "grad_norm": 0.21456383168697357, + "learning_rate": 0.0001368248123371746, + "loss": 0.896, + "step": 4321 + }, + { + "epoch": 1.5, + "grad_norm": 0.18961356580257416, + "learning_rate": 0.00013679906771023453, + "loss": 0.8618, + "step": 4322 + }, + { + "epoch": 1.5, + "grad_norm": 0.20388853549957275, + "learning_rate": 0.0001367733202619832, + "loss": 0.9323, + "step": 4323 + }, + { + "epoch": 1.5, + "grad_norm": 0.20104189217090607, + "learning_rate": 0.00013674756999439464, + "loss": 0.8609, + "step": 4324 + }, + { + "epoch": 1.5, + "grad_norm": 0.20272791385650635, + "learning_rate": 0.00013672181690944308, + "loss": 0.9397, + "step": 4325 + }, + { + "epoch": 1.5, + "grad_norm": 0.19907397031784058, + "learning_rate": 0.00013669606100910291, + "loss": 0.8918, + "step": 4326 + }, + { + "epoch": 1.5, + "grad_norm": 0.2002435177564621, + "learning_rate": 0.00013667030229534883, + "loss": 0.9188, + "step": 4327 + }, + { + "epoch": 1.5, + "grad_norm": 0.20999270677566528, + "learning_rate": 0.00013664454077015568, + "loss": 0.9373, + "step": 4328 + }, + { + "epoch": 1.5, + "grad_norm": 0.19210919737815857, + "learning_rate": 0.00013661877643549858, + "loss": 0.9088, + "step": 4329 + }, + { + "epoch": 1.5, + "grad_norm": 0.2012004852294922, + "learning_rate": 0.00013659300929335277, + "loss": 0.9251, + "step": 4330 + }, + { + "epoch": 1.5, + "grad_norm": 0.1998959332704544, + "learning_rate": 0.00013656723934569383, + "loss": 0.8971, + "step": 4331 + }, + { + "epoch": 1.51, + "grad_norm": 0.19553914666175842, + "learning_rate": 0.0001365414665944974, + "loss": 0.9171, + "step": 4332 + }, + { + "epoch": 1.51, + "grad_norm": 0.19870030879974365, + "learning_rate": 0.00013651569104173954, + "loss": 0.9363, + "step": 4333 + }, + { + "epoch": 1.51, + "grad_norm": 0.1972079575061798, + "learning_rate": 0.00013648991268939634, + "loss": 0.9027, + "step": 4334 + }, + { + "epoch": 1.51, + "grad_norm": 0.20828427374362946, + "learning_rate": 0.00013646413153944423, + "loss": 0.9268, + "step": 4335 + }, + { + "epoch": 1.51, + "grad_norm": 0.1969297230243683, + "learning_rate": 0.0001364383475938597, + "loss": 0.9146, + "step": 4336 + }, + { + "epoch": 1.51, + "grad_norm": 0.20469069480895996, + "learning_rate": 0.00013641256085461967, + "loss": 0.9479, + "step": 4337 + }, + { + "epoch": 1.51, + "grad_norm": 0.2146853655576706, + "learning_rate": 0.00013638677132370107, + "loss": 0.853, + "step": 4338 + }, + { + "epoch": 1.51, + "grad_norm": 0.19474592804908752, + "learning_rate": 0.0001363609790030812, + "loss": 0.9035, + "step": 4339 + }, + { + "epoch": 1.51, + "grad_norm": 0.19707852602005005, + "learning_rate": 0.00013633518389473742, + "loss": 0.9282, + "step": 4340 + }, + { + "epoch": 1.51, + "grad_norm": 0.19231432676315308, + "learning_rate": 0.00013630938600064747, + "loss": 0.8703, + "step": 4341 + }, + { + "epoch": 1.51, + "grad_norm": 0.21518941223621368, + "learning_rate": 0.0001362835853227892, + "loss": 0.9615, + "step": 4342 + }, + { + "epoch": 1.51, + "grad_norm": 0.19725742936134338, + "learning_rate": 0.00013625778186314067, + "loss": 0.8598, + "step": 4343 + }, + { + "epoch": 1.51, + "grad_norm": 0.2083866149187088, + "learning_rate": 0.0001362319756236802, + "loss": 0.9858, + "step": 4344 + }, + { + "epoch": 1.51, + "grad_norm": 0.2003520131111145, + "learning_rate": 0.0001362061666063863, + "loss": 0.8966, + "step": 4345 + }, + { + "epoch": 1.51, + "grad_norm": 0.20607808232307434, + "learning_rate": 0.00013618035481323775, + "loss": 0.9609, + "step": 4346 + }, + { + "epoch": 1.51, + "grad_norm": 0.19025003910064697, + "learning_rate": 0.0001361545402462134, + "loss": 0.8294, + "step": 4347 + }, + { + "epoch": 1.51, + "grad_norm": 0.1967662274837494, + "learning_rate": 0.0001361287229072924, + "loss": 0.938, + "step": 4348 + }, + { + "epoch": 1.51, + "grad_norm": 0.20086213946342468, + "learning_rate": 0.00013610290279845422, + "loss": 0.9094, + "step": 4349 + }, + { + "epoch": 1.51, + "grad_norm": 0.2035747617483139, + "learning_rate": 0.00013607707992167834, + "loss": 0.98, + "step": 4350 + }, + { + "epoch": 1.51, + "grad_norm": 0.1947256624698639, + "learning_rate": 0.0001360512542789446, + "loss": 0.9259, + "step": 4351 + }, + { + "epoch": 1.51, + "grad_norm": 0.19627153873443604, + "learning_rate": 0.00013602542587223295, + "loss": 0.8606, + "step": 4352 + }, + { + "epoch": 1.51, + "grad_norm": 0.20166419446468353, + "learning_rate": 0.00013599959470352363, + "loss": 0.9869, + "step": 4353 + }, + { + "epoch": 1.51, + "grad_norm": 0.2084067314863205, + "learning_rate": 0.0001359737607747971, + "loss": 0.89, + "step": 4354 + }, + { + "epoch": 1.51, + "grad_norm": 0.20896011590957642, + "learning_rate": 0.0001359479240880339, + "loss": 0.8938, + "step": 4355 + }, + { + "epoch": 1.51, + "grad_norm": 0.20715130865573883, + "learning_rate": 0.000135922084645215, + "loss": 0.9206, + "step": 4356 + }, + { + "epoch": 1.51, + "grad_norm": 0.19523970782756805, + "learning_rate": 0.00013589624244832136, + "loss": 0.8097, + "step": 4357 + }, + { + "epoch": 1.51, + "grad_norm": 0.19200685620307922, + "learning_rate": 0.00013587039749933432, + "loss": 0.8675, + "step": 4358 + }, + { + "epoch": 1.51, + "grad_norm": 0.19039379060268402, + "learning_rate": 0.00013584454980023532, + "loss": 0.8412, + "step": 4359 + }, + { + "epoch": 1.52, + "grad_norm": 0.19700483977794647, + "learning_rate": 0.00013581869935300604, + "loss": 0.8602, + "step": 4360 + }, + { + "epoch": 1.52, + "grad_norm": 0.20600780844688416, + "learning_rate": 0.00013579284615962843, + "loss": 0.8716, + "step": 4361 + }, + { + "epoch": 1.52, + "grad_norm": 0.19441702961921692, + "learning_rate": 0.00013576699022208453, + "loss": 0.859, + "step": 4362 + }, + { + "epoch": 1.52, + "grad_norm": 0.20182082056999207, + "learning_rate": 0.00013574113154235675, + "loss": 0.92, + "step": 4363 + }, + { + "epoch": 1.52, + "grad_norm": 0.20214153826236725, + "learning_rate": 0.00013571527012242758, + "loss": 0.8722, + "step": 4364 + }, + { + "epoch": 1.52, + "grad_norm": 0.2132052481174469, + "learning_rate": 0.00013568940596427975, + "loss": 0.956, + "step": 4365 + }, + { + "epoch": 1.52, + "grad_norm": 0.20397233963012695, + "learning_rate": 0.00013566353906989623, + "loss": 0.9429, + "step": 4366 + }, + { + "epoch": 1.52, + "grad_norm": 0.19374212622642517, + "learning_rate": 0.0001356376694412602, + "loss": 0.9663, + "step": 4367 + }, + { + "epoch": 1.52, + "grad_norm": 0.19922977685928345, + "learning_rate": 0.00013561179708035495, + "loss": 0.9123, + "step": 4368 + }, + { + "epoch": 1.52, + "grad_norm": 0.20432357490062714, + "learning_rate": 0.0001355859219891642, + "loss": 0.9537, + "step": 4369 + }, + { + "epoch": 1.52, + "grad_norm": 0.19571687281131744, + "learning_rate": 0.00013556004416967165, + "loss": 0.9415, + "step": 4370 + }, + { + "epoch": 1.52, + "grad_norm": 0.20116746425628662, + "learning_rate": 0.00013553416362386132, + "loss": 0.9905, + "step": 4371 + }, + { + "epoch": 1.52, + "grad_norm": 0.19328705966472626, + "learning_rate": 0.00013550828035371738, + "loss": 0.8699, + "step": 4372 + }, + { + "epoch": 1.52, + "grad_norm": 0.20481045544147491, + "learning_rate": 0.00013548239436122434, + "loss": 0.9877, + "step": 4373 + }, + { + "epoch": 1.52, + "grad_norm": 0.19554881751537323, + "learning_rate": 0.00013545650564836676, + "loss": 0.9079, + "step": 4374 + }, + { + "epoch": 1.52, + "grad_norm": 0.19037027657032013, + "learning_rate": 0.0001354306142171295, + "loss": 0.8858, + "step": 4375 + }, + { + "epoch": 1.52, + "grad_norm": 0.20761582255363464, + "learning_rate": 0.0001354047200694976, + "loss": 0.9034, + "step": 4376 + }, + { + "epoch": 1.52, + "grad_norm": 0.2009163498878479, + "learning_rate": 0.00013537882320745627, + "loss": 0.9119, + "step": 4377 + }, + { + "epoch": 1.52, + "grad_norm": 0.2040199190378189, + "learning_rate": 0.00013535292363299104, + "loss": 0.9102, + "step": 4378 + }, + { + "epoch": 1.52, + "grad_norm": 0.1997002214193344, + "learning_rate": 0.00013532702134808755, + "loss": 0.9247, + "step": 4379 + }, + { + "epoch": 1.52, + "grad_norm": 0.1887330412864685, + "learning_rate": 0.00013530111635473167, + "loss": 0.8357, + "step": 4380 + }, + { + "epoch": 1.52, + "grad_norm": 0.20603513717651367, + "learning_rate": 0.0001352752086549095, + "loss": 0.8948, + "step": 4381 + }, + { + "epoch": 1.52, + "grad_norm": 0.19191201031208038, + "learning_rate": 0.0001352492982506073, + "loss": 0.8744, + "step": 4382 + }, + { + "epoch": 1.52, + "grad_norm": 0.20775219798088074, + "learning_rate": 0.0001352233851438116, + "loss": 0.962, + "step": 4383 + }, + { + "epoch": 1.52, + "grad_norm": 0.187792107462883, + "learning_rate": 0.0001351974693365091, + "loss": 0.819, + "step": 4384 + }, + { + "epoch": 1.52, + "grad_norm": 0.1982596069574356, + "learning_rate": 0.0001351715508306867, + "loss": 0.8621, + "step": 4385 + }, + { + "epoch": 1.52, + "grad_norm": 0.20273452997207642, + "learning_rate": 0.00013514562962833158, + "loss": 0.8676, + "step": 4386 + }, + { + "epoch": 1.52, + "grad_norm": 0.21657277643680573, + "learning_rate": 0.00013511970573143095, + "loss": 0.9464, + "step": 4387 + }, + { + "epoch": 1.52, + "grad_norm": 0.2003757208585739, + "learning_rate": 0.00013509377914197246, + "loss": 0.9091, + "step": 4388 + }, + { + "epoch": 1.53, + "grad_norm": 0.2024131715297699, + "learning_rate": 0.00013506784986194376, + "loss": 0.8762, + "step": 4389 + }, + { + "epoch": 1.53, + "grad_norm": 0.19812408089637756, + "learning_rate": 0.00013504191789333284, + "loss": 0.9416, + "step": 4390 + }, + { + "epoch": 1.53, + "grad_norm": 0.204255148768425, + "learning_rate": 0.00013501598323812793, + "loss": 0.9498, + "step": 4391 + }, + { + "epoch": 1.53, + "grad_norm": 0.20179559290409088, + "learning_rate": 0.00013499004589831722, + "loss": 0.9185, + "step": 4392 + }, + { + "epoch": 1.53, + "grad_norm": 0.1984333097934723, + "learning_rate": 0.00013496410587588939, + "loss": 0.9283, + "step": 4393 + }, + { + "epoch": 1.53, + "grad_norm": 0.20066393911838531, + "learning_rate": 0.00013493816317283316, + "loss": 0.9169, + "step": 4394 + }, + { + "epoch": 1.53, + "grad_norm": 0.20280592143535614, + "learning_rate": 0.00013491221779113755, + "loss": 0.9415, + "step": 4395 + }, + { + "epoch": 1.53, + "grad_norm": 0.1986709088087082, + "learning_rate": 0.00013488626973279173, + "loss": 0.8959, + "step": 4396 + }, + { + "epoch": 1.53, + "grad_norm": 0.19992804527282715, + "learning_rate": 0.00013486031899978505, + "loss": 0.9565, + "step": 4397 + }, + { + "epoch": 1.53, + "grad_norm": 0.18855875730514526, + "learning_rate": 0.00013483436559410717, + "loss": 0.8672, + "step": 4398 + }, + { + "epoch": 1.53, + "grad_norm": 0.19654472172260284, + "learning_rate": 0.0001348084095177478, + "loss": 0.9141, + "step": 4399 + }, + { + "epoch": 1.53, + "grad_norm": 0.20703695714473724, + "learning_rate": 0.00013478245077269696, + "loss": 0.952, + "step": 4400 + }, + { + "epoch": 1.53, + "grad_norm": 0.19787438213825226, + "learning_rate": 0.00013475648936094495, + "loss": 0.8901, + "step": 4401 + }, + { + "epoch": 1.53, + "grad_norm": 0.19545020163059235, + "learning_rate": 0.00013473052528448201, + "loss": 0.8773, + "step": 4402 + }, + { + "epoch": 1.53, + "grad_norm": 0.20343650877475739, + "learning_rate": 0.00013470455854529894, + "loss": 0.9165, + "step": 4403 + }, + { + "epoch": 1.53, + "grad_norm": 0.20238184928894043, + "learning_rate": 0.00013467858914538642, + "loss": 0.9452, + "step": 4404 + }, + { + "epoch": 1.53, + "grad_norm": 0.19688518345355988, + "learning_rate": 0.00013465261708673552, + "loss": 0.884, + "step": 4405 + }, + { + "epoch": 1.53, + "grad_norm": 0.2027846872806549, + "learning_rate": 0.00013462664237133754, + "loss": 0.9246, + "step": 4406 + }, + { + "epoch": 1.53, + "grad_norm": 0.19596917927265167, + "learning_rate": 0.00013460066500118372, + "loss": 0.9475, + "step": 4407 + }, + { + "epoch": 1.53, + "grad_norm": 0.19403593242168427, + "learning_rate": 0.0001345746849782659, + "loss": 0.8755, + "step": 4408 + }, + { + "epoch": 1.53, + "grad_norm": 0.20170363783836365, + "learning_rate": 0.0001345487023045758, + "loss": 0.8977, + "step": 4409 + }, + { + "epoch": 1.53, + "grad_norm": 0.19403304159641266, + "learning_rate": 0.00013452271698210548, + "loss": 0.9049, + "step": 4410 + }, + { + "epoch": 1.53, + "grad_norm": 0.2069365233182907, + "learning_rate": 0.00013449672901284722, + "loss": 0.9529, + "step": 4411 + }, + { + "epoch": 1.53, + "grad_norm": 0.20516468584537506, + "learning_rate": 0.0001344707383987934, + "loss": 0.9236, + "step": 4412 + }, + { + "epoch": 1.53, + "grad_norm": 0.19744600355625153, + "learning_rate": 0.0001344447451419367, + "loss": 0.9136, + "step": 4413 + }, + { + "epoch": 1.53, + "grad_norm": 0.20329774916172028, + "learning_rate": 0.00013441874924427, + "loss": 0.8956, + "step": 4414 + }, + { + "epoch": 1.53, + "grad_norm": 0.19856001436710358, + "learning_rate": 0.00013439275070778632, + "loss": 0.8937, + "step": 4415 + }, + { + "epoch": 1.53, + "grad_norm": 0.1949041783809662, + "learning_rate": 0.00013436674953447894, + "loss": 0.9168, + "step": 4416 + }, + { + "epoch": 1.54, + "grad_norm": 0.20832853019237518, + "learning_rate": 0.00013434074572634126, + "loss": 0.9482, + "step": 4417 + }, + { + "epoch": 1.54, + "grad_norm": 0.21745702624320984, + "learning_rate": 0.00013431473928536702, + "loss": 0.9225, + "step": 4418 + }, + { + "epoch": 1.54, + "grad_norm": 0.20856374502182007, + "learning_rate": 0.00013428873021355004, + "loss": 0.9394, + "step": 4419 + }, + { + "epoch": 1.54, + "grad_norm": 0.19907131791114807, + "learning_rate": 0.0001342627185128844, + "loss": 0.9404, + "step": 4420 + }, + { + "epoch": 1.54, + "grad_norm": 0.20088614523410797, + "learning_rate": 0.00013423670418536435, + "loss": 0.8747, + "step": 4421 + }, + { + "epoch": 1.54, + "grad_norm": 0.19531351327896118, + "learning_rate": 0.00013421068723298434, + "loss": 0.9087, + "step": 4422 + }, + { + "epoch": 1.54, + "grad_norm": 0.19716942310333252, + "learning_rate": 0.00013418466765773907, + "loss": 0.9053, + "step": 4423 + }, + { + "epoch": 1.54, + "grad_norm": 0.2023046761751175, + "learning_rate": 0.0001341586454616234, + "loss": 0.917, + "step": 4424 + }, + { + "epoch": 1.54, + "grad_norm": 0.2044750452041626, + "learning_rate": 0.0001341326206466324, + "loss": 0.9218, + "step": 4425 + }, + { + "epoch": 1.54, + "grad_norm": 0.2067694514989853, + "learning_rate": 0.00013410659321476132, + "loss": 0.8862, + "step": 4426 + }, + { + "epoch": 1.54, + "grad_norm": 0.1974387913942337, + "learning_rate": 0.00013408056316800568, + "loss": 0.8885, + "step": 4427 + }, + { + "epoch": 1.54, + "grad_norm": 0.2019626647233963, + "learning_rate": 0.00013405453050836113, + "loss": 0.9814, + "step": 4428 + }, + { + "epoch": 1.54, + "grad_norm": 0.20270082354545593, + "learning_rate": 0.00013402849523782353, + "loss": 0.8564, + "step": 4429 + }, + { + "epoch": 1.54, + "grad_norm": 0.19102539122104645, + "learning_rate": 0.0001340024573583889, + "loss": 0.861, + "step": 4430 + }, + { + "epoch": 1.54, + "grad_norm": 0.20177873969078064, + "learning_rate": 0.00013397641687205365, + "loss": 0.8803, + "step": 4431 + }, + { + "epoch": 1.54, + "grad_norm": 0.2063576877117157, + "learning_rate": 0.0001339503737808141, + "loss": 0.9562, + "step": 4432 + }, + { + "epoch": 1.54, + "grad_norm": 0.20372329652309418, + "learning_rate": 0.00013392432808666707, + "loss": 0.9515, + "step": 4433 + }, + { + "epoch": 1.54, + "grad_norm": 0.2125784009695053, + "learning_rate": 0.00013389827979160929, + "loss": 0.9238, + "step": 4434 + }, + { + "epoch": 1.54, + "grad_norm": 0.21021676063537598, + "learning_rate": 0.0001338722288976379, + "loss": 0.9715, + "step": 4435 + }, + { + "epoch": 1.54, + "grad_norm": 0.2150159329175949, + "learning_rate": 0.0001338461754067502, + "loss": 0.9791, + "step": 4436 + }, + { + "epoch": 1.54, + "grad_norm": 0.19582140445709229, + "learning_rate": 0.00013382011932094363, + "loss": 0.92, + "step": 4437 + }, + { + "epoch": 1.54, + "grad_norm": 0.20285850763320923, + "learning_rate": 0.00013379406064221582, + "loss": 0.8859, + "step": 4438 + }, + { + "epoch": 1.54, + "grad_norm": 0.1983582079410553, + "learning_rate": 0.00013376799937256468, + "loss": 0.9032, + "step": 4439 + }, + { + "epoch": 1.54, + "grad_norm": 0.20245066285133362, + "learning_rate": 0.00013374193551398826, + "loss": 0.9366, + "step": 4440 + }, + { + "epoch": 1.54, + "grad_norm": 0.20687803626060486, + "learning_rate": 0.00013371586906848486, + "loss": 0.9917, + "step": 4441 + }, + { + "epoch": 1.54, + "grad_norm": 0.20650020241737366, + "learning_rate": 0.00013368980003805288, + "loss": 0.9583, + "step": 4442 + }, + { + "epoch": 1.54, + "grad_norm": 0.20843304693698883, + "learning_rate": 0.00013366372842469105, + "loss": 0.9294, + "step": 4443 + }, + { + "epoch": 1.54, + "grad_norm": 0.19253431260585785, + "learning_rate": 0.00013363765423039816, + "loss": 0.901, + "step": 4444 + }, + { + "epoch": 1.55, + "grad_norm": 0.1946306675672531, + "learning_rate": 0.0001336115774571733, + "loss": 0.8535, + "step": 4445 + }, + { + "epoch": 1.55, + "grad_norm": 0.2142277956008911, + "learning_rate": 0.00013358549810701573, + "loss": 0.8874, + "step": 4446 + }, + { + "epoch": 1.55, + "grad_norm": 0.20530451834201813, + "learning_rate": 0.0001335594161819249, + "loss": 0.8352, + "step": 4447 + }, + { + "epoch": 1.55, + "grad_norm": 0.1976638287305832, + "learning_rate": 0.00013353333168390045, + "loss": 0.9163, + "step": 4448 + }, + { + "epoch": 1.55, + "grad_norm": 0.204092338681221, + "learning_rate": 0.00013350724461494223, + "loss": 0.9034, + "step": 4449 + }, + { + "epoch": 1.55, + "grad_norm": 0.1945483237504959, + "learning_rate": 0.00013348115497705028, + "loss": 0.8528, + "step": 4450 + }, + { + "epoch": 1.55, + "grad_norm": 0.19972464442253113, + "learning_rate": 0.00013345506277222485, + "loss": 0.852, + "step": 4451 + }, + { + "epoch": 1.55, + "grad_norm": 0.20141078531742096, + "learning_rate": 0.00013342896800246638, + "loss": 0.9889, + "step": 4452 + }, + { + "epoch": 1.55, + "grad_norm": 0.20517943799495697, + "learning_rate": 0.00013340287066977547, + "loss": 0.9725, + "step": 4453 + }, + { + "epoch": 1.55, + "grad_norm": 0.2040693759918213, + "learning_rate": 0.00013337677077615302, + "loss": 0.9314, + "step": 4454 + }, + { + "epoch": 1.55, + "grad_norm": 0.20373493432998657, + "learning_rate": 0.00013335066832359998, + "loss": 0.9459, + "step": 4455 + }, + { + "epoch": 1.55, + "grad_norm": 0.19994668662548065, + "learning_rate": 0.00013332456331411761, + "loss": 0.8899, + "step": 4456 + }, + { + "epoch": 1.55, + "grad_norm": 0.2057134211063385, + "learning_rate": 0.00013329845574970732, + "loss": 0.8571, + "step": 4457 + }, + { + "epoch": 1.55, + "grad_norm": 0.20253203809261322, + "learning_rate": 0.00013327234563237074, + "loss": 0.8994, + "step": 4458 + }, + { + "epoch": 1.55, + "grad_norm": 0.20478413999080658, + "learning_rate": 0.0001332462329641097, + "loss": 0.9676, + "step": 4459 + }, + { + "epoch": 1.55, + "grad_norm": 0.20461991429328918, + "learning_rate": 0.00013322011774692614, + "loss": 0.9049, + "step": 4460 + }, + { + "epoch": 1.55, + "grad_norm": 0.20310884714126587, + "learning_rate": 0.0001331939999828223, + "loss": 0.8697, + "step": 4461 + }, + { + "epoch": 1.55, + "grad_norm": 0.20332156121730804, + "learning_rate": 0.0001331678796738006, + "loss": 0.9044, + "step": 4462 + }, + { + "epoch": 1.55, + "grad_norm": 0.19390572607517242, + "learning_rate": 0.0001331417568218636, + "loss": 0.9324, + "step": 4463 + }, + { + "epoch": 1.55, + "grad_norm": 0.19995063543319702, + "learning_rate": 0.00013311563142901408, + "loss": 0.8561, + "step": 4464 + }, + { + "epoch": 1.55, + "grad_norm": 0.1999330073595047, + "learning_rate": 0.00013308950349725504, + "loss": 0.9186, + "step": 4465 + }, + { + "epoch": 1.55, + "grad_norm": 0.19406409561634064, + "learning_rate": 0.00013306337302858968, + "loss": 0.8674, + "step": 4466 + }, + { + "epoch": 1.55, + "grad_norm": 0.2086877077817917, + "learning_rate": 0.0001330372400250213, + "loss": 0.9126, + "step": 4467 + }, + { + "epoch": 1.55, + "grad_norm": 0.2143045961856842, + "learning_rate": 0.00013301110448855357, + "loss": 0.9841, + "step": 4468 + }, + { + "epoch": 1.55, + "grad_norm": 0.19664418697357178, + "learning_rate": 0.00013298496642119014, + "loss": 0.9627, + "step": 4469 + }, + { + "epoch": 1.55, + "grad_norm": 0.19305755198001862, + "learning_rate": 0.00013295882582493502, + "loss": 0.9068, + "step": 4470 + }, + { + "epoch": 1.55, + "grad_norm": 0.20256315171718597, + "learning_rate": 0.0001329326827017924, + "loss": 0.8876, + "step": 4471 + }, + { + "epoch": 1.55, + "grad_norm": 0.2027614861726761, + "learning_rate": 0.0001329065370537665, + "loss": 0.8925, + "step": 4472 + }, + { + "epoch": 1.55, + "grad_norm": 0.1951225847005844, + "learning_rate": 0.000132880388882862, + "loss": 0.9562, + "step": 4473 + }, + { + "epoch": 1.56, + "grad_norm": 0.19473542273044586, + "learning_rate": 0.0001328542381910835, + "loss": 0.9228, + "step": 4474 + }, + { + "epoch": 1.56, + "grad_norm": 0.1945401281118393, + "learning_rate": 0.00013282808498043597, + "loss": 0.9131, + "step": 4475 + }, + { + "epoch": 1.56, + "grad_norm": 0.18757790327072144, + "learning_rate": 0.00013280192925292457, + "loss": 0.8966, + "step": 4476 + }, + { + "epoch": 1.56, + "grad_norm": 0.19668155908584595, + "learning_rate": 0.00013277577101055454, + "loss": 0.8772, + "step": 4477 + }, + { + "epoch": 1.56, + "grad_norm": 0.1970047950744629, + "learning_rate": 0.00013274961025533144, + "loss": 0.8969, + "step": 4478 + }, + { + "epoch": 1.56, + "grad_norm": 0.19303329288959503, + "learning_rate": 0.0001327234469892609, + "loss": 0.863, + "step": 4479 + }, + { + "epoch": 1.56, + "grad_norm": 0.1973162442445755, + "learning_rate": 0.00013269728121434882, + "loss": 0.961, + "step": 4480 + }, + { + "epoch": 1.56, + "grad_norm": 0.20247642695903778, + "learning_rate": 0.00013267111293260134, + "loss": 0.8567, + "step": 4481 + }, + { + "epoch": 1.56, + "grad_norm": 0.1964680701494217, + "learning_rate": 0.00013264494214602467, + "loss": 0.8572, + "step": 4482 + }, + { + "epoch": 1.56, + "grad_norm": 0.20020325481891632, + "learning_rate": 0.0001326187688566253, + "loss": 0.9614, + "step": 4483 + }, + { + "epoch": 1.56, + "grad_norm": 0.20274128019809723, + "learning_rate": 0.00013259259306640987, + "loss": 0.8822, + "step": 4484 + }, + { + "epoch": 1.56, + "grad_norm": 0.1965228021144867, + "learning_rate": 0.0001325664147773852, + "loss": 0.9174, + "step": 4485 + }, + { + "epoch": 1.56, + "grad_norm": 0.20517116785049438, + "learning_rate": 0.00013254023399155836, + "loss": 0.9295, + "step": 4486 + }, + { + "epoch": 1.56, + "grad_norm": 0.20203058421611786, + "learning_rate": 0.00013251405071093662, + "loss": 0.8963, + "step": 4487 + }, + { + "epoch": 1.56, + "grad_norm": 0.202811598777771, + "learning_rate": 0.00013248786493752732, + "loss": 0.9262, + "step": 4488 + }, + { + "epoch": 1.56, + "grad_norm": 0.19520260393619537, + "learning_rate": 0.00013246167667333808, + "loss": 0.9205, + "step": 4489 + }, + { + "epoch": 1.56, + "grad_norm": 0.2029237449169159, + "learning_rate": 0.00013243548592037675, + "loss": 0.9363, + "step": 4490 + }, + { + "epoch": 1.56, + "grad_norm": 0.2026432752609253, + "learning_rate": 0.0001324092926806513, + "loss": 0.8972, + "step": 4491 + }, + { + "epoch": 1.56, + "grad_norm": 0.2118445485830307, + "learning_rate": 0.0001323830969561699, + "loss": 0.9456, + "step": 4492 + }, + { + "epoch": 1.56, + "grad_norm": 0.1965012401342392, + "learning_rate": 0.00013235689874894097, + "loss": 0.8599, + "step": 4493 + }, + { + "epoch": 1.56, + "grad_norm": 0.20173734426498413, + "learning_rate": 0.00013233069806097301, + "loss": 0.8679, + "step": 4494 + }, + { + "epoch": 1.56, + "grad_norm": 0.1969406008720398, + "learning_rate": 0.0001323044948942748, + "loss": 0.8983, + "step": 4495 + }, + { + "epoch": 1.56, + "grad_norm": 0.2201402634382248, + "learning_rate": 0.00013227828925085533, + "loss": 0.8905, + "step": 4496 + }, + { + "epoch": 1.56, + "grad_norm": 0.19682130217552185, + "learning_rate": 0.00013225208113272366, + "loss": 0.8263, + "step": 4497 + }, + { + "epoch": 1.56, + "grad_norm": 0.20680737495422363, + "learning_rate": 0.00013222587054188917, + "loss": 0.9453, + "step": 4498 + }, + { + "epoch": 1.56, + "grad_norm": 0.2154068499803543, + "learning_rate": 0.00013219965748036133, + "loss": 0.915, + "step": 4499 + }, + { + "epoch": 1.56, + "grad_norm": 0.20928502082824707, + "learning_rate": 0.00013217344195014987, + "loss": 0.9181, + "step": 4500 + }, + { + "epoch": 1.56, + "grad_norm": 0.20148850977420807, + "learning_rate": 0.0001321472239532647, + "loss": 0.9762, + "step": 4501 + }, + { + "epoch": 1.57, + "grad_norm": 0.21044816076755524, + "learning_rate": 0.00013212100349171589, + "loss": 0.9026, + "step": 4502 + }, + { + "epoch": 1.57, + "grad_norm": 0.19358424842357635, + "learning_rate": 0.00013209478056751367, + "loss": 0.8614, + "step": 4503 + }, + { + "epoch": 1.57, + "grad_norm": 0.204145610332489, + "learning_rate": 0.00013206855518266855, + "loss": 0.8295, + "step": 4504 + }, + { + "epoch": 1.57, + "grad_norm": 0.21280977129936218, + "learning_rate": 0.00013204232733919112, + "loss": 0.9205, + "step": 4505 + }, + { + "epoch": 1.57, + "grad_norm": 0.2101668119430542, + "learning_rate": 0.00013201609703909227, + "loss": 0.9607, + "step": 4506 + }, + { + "epoch": 1.57, + "grad_norm": 0.2028149664402008, + "learning_rate": 0.00013198986428438305, + "loss": 0.8992, + "step": 4507 + }, + { + "epoch": 1.57, + "grad_norm": 0.20130696892738342, + "learning_rate": 0.0001319636290770746, + "loss": 0.8599, + "step": 4508 + }, + { + "epoch": 1.57, + "grad_norm": 0.20510746538639069, + "learning_rate": 0.00013193739141917837, + "loss": 0.949, + "step": 4509 + }, + { + "epoch": 1.57, + "grad_norm": 0.19777636229991913, + "learning_rate": 0.0001319111513127059, + "loss": 0.9049, + "step": 4510 + }, + { + "epoch": 1.57, + "grad_norm": 0.2046719193458557, + "learning_rate": 0.00013188490875966902, + "loss": 0.9144, + "step": 4511 + }, + { + "epoch": 1.57, + "grad_norm": 0.2182987481355667, + "learning_rate": 0.0001318586637620797, + "loss": 0.8728, + "step": 4512 + }, + { + "epoch": 1.57, + "grad_norm": 0.21205998957157135, + "learning_rate": 0.00013183241632195, + "loss": 0.9209, + "step": 4513 + }, + { + "epoch": 1.57, + "grad_norm": 0.1928020566701889, + "learning_rate": 0.0001318061664412924, + "loss": 0.8787, + "step": 4514 + }, + { + "epoch": 1.57, + "grad_norm": 0.20081821084022522, + "learning_rate": 0.0001317799141221193, + "loss": 0.9282, + "step": 4515 + }, + { + "epoch": 1.57, + "grad_norm": 0.1871049404144287, + "learning_rate": 0.0001317536593664435, + "loss": 0.8818, + "step": 4516 + }, + { + "epoch": 1.57, + "grad_norm": 0.2123117744922638, + "learning_rate": 0.00013172740217627784, + "loss": 0.9548, + "step": 4517 + }, + { + "epoch": 1.57, + "grad_norm": 0.19874443113803864, + "learning_rate": 0.00013170114255363542, + "loss": 0.8925, + "step": 4518 + }, + { + "epoch": 1.57, + "grad_norm": 0.20230533182621002, + "learning_rate": 0.00013167488050052953, + "loss": 0.8894, + "step": 4519 + }, + { + "epoch": 1.57, + "grad_norm": 0.2105279266834259, + "learning_rate": 0.00013164861601897362, + "loss": 0.9251, + "step": 4520 + }, + { + "epoch": 1.57, + "grad_norm": 0.20260430872440338, + "learning_rate": 0.00013162234911098136, + "loss": 1.0058, + "step": 4521 + }, + { + "epoch": 1.57, + "grad_norm": 0.1892397105693817, + "learning_rate": 0.00013159607977856656, + "loss": 0.8249, + "step": 4522 + }, + { + "epoch": 1.57, + "grad_norm": 0.19726528227329254, + "learning_rate": 0.0001315698080237432, + "loss": 0.8644, + "step": 4523 + }, + { + "epoch": 1.57, + "grad_norm": 0.21568602323532104, + "learning_rate": 0.00013154353384852558, + "loss": 0.9634, + "step": 4524 + }, + { + "epoch": 1.57, + "grad_norm": 0.19832541048526764, + "learning_rate": 0.00013151725725492797, + "loss": 0.9232, + "step": 4525 + }, + { + "epoch": 1.57, + "grad_norm": 0.194269597530365, + "learning_rate": 0.00013149097824496502, + "loss": 0.8723, + "step": 4526 + }, + { + "epoch": 1.57, + "grad_norm": 0.19430376589298248, + "learning_rate": 0.00013146469682065147, + "loss": 0.9318, + "step": 4527 + }, + { + "epoch": 1.57, + "grad_norm": 0.2051064819097519, + "learning_rate": 0.00013143841298400227, + "loss": 0.897, + "step": 4528 + }, + { + "epoch": 1.57, + "grad_norm": 0.19880448281764984, + "learning_rate": 0.00013141212673703257, + "loss": 0.9256, + "step": 4529 + }, + { + "epoch": 1.57, + "grad_norm": 0.22948618233203888, + "learning_rate": 0.00013138583808175761, + "loss": 0.8805, + "step": 4530 + }, + { + "epoch": 1.58, + "grad_norm": 0.2091957926750183, + "learning_rate": 0.00013135954702019298, + "loss": 0.9641, + "step": 4531 + }, + { + "epoch": 1.58, + "grad_norm": 0.1984574943780899, + "learning_rate": 0.00013133325355435432, + "loss": 0.8683, + "step": 4532 + }, + { + "epoch": 1.58, + "grad_norm": 0.19852031767368317, + "learning_rate": 0.00013130695768625748, + "loss": 0.835, + "step": 4533 + }, + { + "epoch": 1.58, + "grad_norm": 0.19653603434562683, + "learning_rate": 0.00013128065941791857, + "loss": 0.9282, + "step": 4534 + }, + { + "epoch": 1.58, + "grad_norm": 0.19558171927928925, + "learning_rate": 0.0001312543587513537, + "loss": 0.8546, + "step": 4535 + }, + { + "epoch": 1.58, + "grad_norm": 0.1992642730474472, + "learning_rate": 0.00013122805568857948, + "loss": 0.9177, + "step": 4536 + }, + { + "epoch": 1.58, + "grad_norm": 0.2031051516532898, + "learning_rate": 0.00013120175023161235, + "loss": 0.9602, + "step": 4537 + }, + { + "epoch": 1.58, + "grad_norm": 0.21117272973060608, + "learning_rate": 0.00013117544238246917, + "loss": 0.9495, + "step": 4538 + }, + { + "epoch": 1.58, + "grad_norm": 0.189724400639534, + "learning_rate": 0.00013114913214316693, + "loss": 0.847, + "step": 4539 + }, + { + "epoch": 1.58, + "grad_norm": 0.1954822987318039, + "learning_rate": 0.0001311228195157227, + "loss": 0.8963, + "step": 4540 + }, + { + "epoch": 1.58, + "grad_norm": 0.18917202949523926, + "learning_rate": 0.00013109650450215393, + "loss": 0.8647, + "step": 4541 + }, + { + "epoch": 1.58, + "grad_norm": 0.20269404351711273, + "learning_rate": 0.00013107018710447802, + "loss": 0.9066, + "step": 4542 + }, + { + "epoch": 1.58, + "grad_norm": 0.20098453760147095, + "learning_rate": 0.00013104386732471274, + "loss": 0.9232, + "step": 4543 + }, + { + "epoch": 1.58, + "grad_norm": 0.18985962867736816, + "learning_rate": 0.00013101754516487603, + "loss": 0.8402, + "step": 4544 + }, + { + "epoch": 1.58, + "grad_norm": 0.20924219489097595, + "learning_rate": 0.00013099122062698582, + "loss": 0.9598, + "step": 4545 + }, + { + "epoch": 1.58, + "grad_norm": 0.19651223719120026, + "learning_rate": 0.00013096489371306047, + "loss": 0.832, + "step": 4546 + }, + { + "epoch": 1.58, + "grad_norm": 0.20140282809734344, + "learning_rate": 0.00013093856442511836, + "loss": 0.9027, + "step": 4547 + }, + { + "epoch": 1.58, + "grad_norm": 0.1940830647945404, + "learning_rate": 0.0001309122327651781, + "loss": 0.8725, + "step": 4548 + }, + { + "epoch": 1.58, + "grad_norm": 0.19329622387886047, + "learning_rate": 0.00013088589873525857, + "loss": 0.9091, + "step": 4549 + }, + { + "epoch": 1.58, + "grad_norm": 0.19192615151405334, + "learning_rate": 0.00013085956233737864, + "loss": 0.882, + "step": 4550 + }, + { + "epoch": 1.58, + "grad_norm": 0.20170480012893677, + "learning_rate": 0.0001308332235735575, + "loss": 0.9323, + "step": 4551 + }, + { + "epoch": 1.58, + "grad_norm": 0.201574444770813, + "learning_rate": 0.00013080688244581452, + "loss": 0.95, + "step": 4552 + }, + { + "epoch": 1.58, + "grad_norm": 0.21443568170070648, + "learning_rate": 0.0001307805389561692, + "loss": 0.8848, + "step": 4553 + }, + { + "epoch": 1.58, + "grad_norm": 0.19022703170776367, + "learning_rate": 0.00013075419310664123, + "loss": 0.8998, + "step": 4554 + }, + { + "epoch": 1.58, + "grad_norm": 0.1944078505039215, + "learning_rate": 0.00013072784489925055, + "loss": 0.9451, + "step": 4555 + }, + { + "epoch": 1.58, + "grad_norm": 0.1990111768245697, + "learning_rate": 0.00013070149433601718, + "loss": 0.9322, + "step": 4556 + }, + { + "epoch": 1.58, + "grad_norm": 0.20842678844928741, + "learning_rate": 0.00013067514141896136, + "loss": 0.9692, + "step": 4557 + }, + { + "epoch": 1.58, + "grad_norm": 0.20555783808231354, + "learning_rate": 0.00013064878615010348, + "loss": 0.9217, + "step": 4558 + }, + { + "epoch": 1.59, + "grad_norm": 0.19351287186145782, + "learning_rate": 0.00013062242853146427, + "loss": 0.9378, + "step": 4559 + }, + { + "epoch": 1.59, + "grad_norm": 0.20051833987236023, + "learning_rate": 0.00013059606856506443, + "loss": 0.9638, + "step": 4560 + }, + { + "epoch": 1.59, + "grad_norm": 0.19287043809890747, + "learning_rate": 0.00013056970625292488, + "loss": 0.8593, + "step": 4561 + }, + { + "epoch": 1.59, + "grad_norm": 0.1942070871591568, + "learning_rate": 0.00013054334159706688, + "loss": 0.8507, + "step": 4562 + }, + { + "epoch": 1.59, + "grad_norm": 0.1981026977300644, + "learning_rate": 0.00013051697459951167, + "loss": 0.8782, + "step": 4563 + }, + { + "epoch": 1.59, + "grad_norm": 0.19574685394763947, + "learning_rate": 0.00013049060526228078, + "loss": 0.9031, + "step": 4564 + }, + { + "epoch": 1.59, + "grad_norm": 0.20009981095790863, + "learning_rate": 0.00013046423358739593, + "loss": 0.9434, + "step": 4565 + }, + { + "epoch": 1.59, + "grad_norm": 0.19632475078105927, + "learning_rate": 0.00013043785957687887, + "loss": 0.8625, + "step": 4566 + }, + { + "epoch": 1.59, + "grad_norm": 0.20003314316272736, + "learning_rate": 0.0001304114832327518, + "loss": 0.9462, + "step": 4567 + }, + { + "epoch": 1.59, + "grad_norm": 0.19301815330982208, + "learning_rate": 0.00013038510455703684, + "loss": 0.8618, + "step": 4568 + }, + { + "epoch": 1.59, + "grad_norm": 0.22005648910999298, + "learning_rate": 0.0001303587235517564, + "loss": 1.0248, + "step": 4569 + }, + { + "epoch": 1.59, + "grad_norm": 0.20169512927532196, + "learning_rate": 0.00013033234021893312, + "loss": 0.8955, + "step": 4570 + }, + { + "epoch": 1.59, + "grad_norm": 0.21043767035007477, + "learning_rate": 0.00013030595456058966, + "loss": 0.9648, + "step": 4571 + }, + { + "epoch": 1.59, + "grad_norm": 0.21289391815662384, + "learning_rate": 0.00013027956657874905, + "loss": 0.918, + "step": 4572 + }, + { + "epoch": 1.59, + "grad_norm": 0.21469812095165253, + "learning_rate": 0.00013025317627543431, + "loss": 0.9514, + "step": 4573 + }, + { + "epoch": 1.59, + "grad_norm": 0.19922086596488953, + "learning_rate": 0.00013022678365266885, + "loss": 0.997, + "step": 4574 + }, + { + "epoch": 1.59, + "grad_norm": 0.1971648931503296, + "learning_rate": 0.00013020038871247607, + "loss": 0.9485, + "step": 4575 + }, + { + "epoch": 1.59, + "grad_norm": 0.20105613768100739, + "learning_rate": 0.00013017399145687957, + "loss": 0.9337, + "step": 4576 + }, + { + "epoch": 1.59, + "grad_norm": 0.1998489499092102, + "learning_rate": 0.00013014759188790325, + "loss": 0.9294, + "step": 4577 + }, + { + "epoch": 1.59, + "grad_norm": 0.20058594644069672, + "learning_rate": 0.0001301211900075711, + "loss": 0.8733, + "step": 4578 + }, + { + "epoch": 1.59, + "grad_norm": 0.19787292182445526, + "learning_rate": 0.00013009478581790725, + "loss": 0.8558, + "step": 4579 + }, + { + "epoch": 1.59, + "grad_norm": 0.1924135535955429, + "learning_rate": 0.00013006837932093615, + "loss": 0.9001, + "step": 4580 + }, + { + "epoch": 1.59, + "grad_norm": 0.19671164453029633, + "learning_rate": 0.00013004197051868225, + "loss": 0.9114, + "step": 4581 + }, + { + "epoch": 1.59, + "grad_norm": 0.2027590572834015, + "learning_rate": 0.0001300155594131703, + "loss": 0.9188, + "step": 4582 + }, + { + "epoch": 1.59, + "grad_norm": 0.20843097567558289, + "learning_rate": 0.00012998914600642516, + "loss": 0.9254, + "step": 4583 + }, + { + "epoch": 1.59, + "grad_norm": 0.2043987512588501, + "learning_rate": 0.0001299627303004719, + "loss": 0.8734, + "step": 4584 + }, + { + "epoch": 1.59, + "grad_norm": 0.21097159385681152, + "learning_rate": 0.00012993631229733582, + "loss": 0.9372, + "step": 4585 + }, + { + "epoch": 1.59, + "grad_norm": 0.20122608542442322, + "learning_rate": 0.00012990989199904225, + "loss": 0.8941, + "step": 4586 + }, + { + "epoch": 1.6, + "grad_norm": 0.19901883602142334, + "learning_rate": 0.00012988346940761682, + "loss": 0.925, + "step": 4587 + }, + { + "epoch": 1.6, + "grad_norm": 0.2017354816198349, + "learning_rate": 0.00012985704452508527, + "loss": 0.8339, + "step": 4588 + }, + { + "epoch": 1.6, + "grad_norm": 0.1998375803232193, + "learning_rate": 0.0001298306173534736, + "loss": 0.9294, + "step": 4589 + }, + { + "epoch": 1.6, + "grad_norm": 0.21041107177734375, + "learning_rate": 0.00012980418789480791, + "loss": 0.9718, + "step": 4590 + }, + { + "epoch": 1.6, + "grad_norm": 0.19458279013633728, + "learning_rate": 0.00012977775615111443, + "loss": 0.8662, + "step": 4591 + }, + { + "epoch": 1.6, + "grad_norm": 0.20359832048416138, + "learning_rate": 0.00012975132212441972, + "loss": 0.8277, + "step": 4592 + }, + { + "epoch": 1.6, + "grad_norm": 0.20850984752178192, + "learning_rate": 0.00012972488581675033, + "loss": 0.9371, + "step": 4593 + }, + { + "epoch": 1.6, + "grad_norm": 0.20215162634849548, + "learning_rate": 0.00012969844723013317, + "loss": 0.874, + "step": 4594 + }, + { + "epoch": 1.6, + "grad_norm": 0.20475077629089355, + "learning_rate": 0.0001296720063665952, + "loss": 0.921, + "step": 4595 + }, + { + "epoch": 1.6, + "grad_norm": 0.20587964355945587, + "learning_rate": 0.00012964556322816358, + "loss": 0.9558, + "step": 4596 + }, + { + "epoch": 1.6, + "grad_norm": 0.21276992559432983, + "learning_rate": 0.00012961911781686567, + "loss": 0.9088, + "step": 4597 + }, + { + "epoch": 1.6, + "grad_norm": 0.19974417984485626, + "learning_rate": 0.00012959267013472892, + "loss": 0.869, + "step": 4598 + }, + { + "epoch": 1.6, + "grad_norm": 0.20184177160263062, + "learning_rate": 0.00012956622018378116, + "loss": 0.8675, + "step": 4599 + }, + { + "epoch": 1.6, + "grad_norm": 0.21369704604148865, + "learning_rate": 0.00012953976796605012, + "loss": 0.9403, + "step": 4600 + }, + { + "epoch": 1.6, + "grad_norm": 0.1977105587720871, + "learning_rate": 0.0001295133134835639, + "loss": 0.9233, + "step": 4601 + }, + { + "epoch": 1.6, + "grad_norm": 0.20390766859054565, + "learning_rate": 0.00012948685673835067, + "loss": 0.9024, + "step": 4602 + }, + { + "epoch": 1.6, + "grad_norm": 0.19591738283634186, + "learning_rate": 0.00012946039773243889, + "loss": 0.8169, + "step": 4603 + }, + { + "epoch": 1.6, + "grad_norm": 0.19412824511528015, + "learning_rate": 0.00012943393646785704, + "loss": 0.8583, + "step": 4604 + }, + { + "epoch": 1.6, + "grad_norm": 0.20760369300842285, + "learning_rate": 0.0001294074729466339, + "loss": 0.895, + "step": 4605 + }, + { + "epoch": 1.6, + "grad_norm": 0.20479688048362732, + "learning_rate": 0.00012938100717079838, + "loss": 0.8874, + "step": 4606 + }, + { + "epoch": 1.6, + "grad_norm": 0.2013799101114273, + "learning_rate": 0.00012935453914237954, + "loss": 0.9256, + "step": 4607 + }, + { + "epoch": 1.6, + "grad_norm": 0.21338863670825958, + "learning_rate": 0.00012932806886340666, + "loss": 0.9692, + "step": 4608 + }, + { + "epoch": 1.6, + "grad_norm": 0.19070522487163544, + "learning_rate": 0.00012930159633590908, + "loss": 0.8283, + "step": 4609 + }, + { + "epoch": 1.6, + "grad_norm": 0.20960764586925507, + "learning_rate": 0.00012927512156191647, + "loss": 0.9784, + "step": 4610 + }, + { + "epoch": 1.6, + "grad_norm": 0.20272405445575714, + "learning_rate": 0.00012924864454345858, + "loss": 0.8618, + "step": 4611 + }, + { + "epoch": 1.6, + "grad_norm": 0.2035459280014038, + "learning_rate": 0.00012922216528256538, + "loss": 0.916, + "step": 4612 + }, + { + "epoch": 1.6, + "grad_norm": 0.20177391171455383, + "learning_rate": 0.00012919568378126693, + "loss": 0.9787, + "step": 4613 + }, + { + "epoch": 1.6, + "grad_norm": 0.19269515573978424, + "learning_rate": 0.00012916920004159356, + "loss": 0.8297, + "step": 4614 + }, + { + "epoch": 1.6, + "grad_norm": 0.19745002686977386, + "learning_rate": 0.0001291427140655757, + "loss": 0.9053, + "step": 4615 + }, + { + "epoch": 1.61, + "grad_norm": 0.19630128145217896, + "learning_rate": 0.00012911622585524396, + "loss": 0.8849, + "step": 4616 + }, + { + "epoch": 1.61, + "grad_norm": 0.19954022765159607, + "learning_rate": 0.0001290897354126292, + "loss": 0.8558, + "step": 4617 + }, + { + "epoch": 1.61, + "grad_norm": 0.20994316041469574, + "learning_rate": 0.00012906324273976233, + "loss": 0.9551, + "step": 4618 + }, + { + "epoch": 1.61, + "grad_norm": 0.19849319756031036, + "learning_rate": 0.00012903674783867447, + "loss": 0.8924, + "step": 4619 + }, + { + "epoch": 1.61, + "grad_norm": 0.21105317771434784, + "learning_rate": 0.000129010250711397, + "loss": 0.9563, + "step": 4620 + }, + { + "epoch": 1.61, + "grad_norm": 0.20199191570281982, + "learning_rate": 0.00012898375135996136, + "loss": 0.8566, + "step": 4621 + }, + { + "epoch": 1.61, + "grad_norm": 0.20917585492134094, + "learning_rate": 0.00012895724978639924, + "loss": 0.8869, + "step": 4622 + }, + { + "epoch": 1.61, + "grad_norm": 0.22021600604057312, + "learning_rate": 0.0001289307459927424, + "loss": 0.9356, + "step": 4623 + }, + { + "epoch": 1.61, + "grad_norm": 0.20073702931404114, + "learning_rate": 0.0001289042399810229, + "loss": 0.9461, + "step": 4624 + }, + { + "epoch": 1.61, + "grad_norm": 0.20655374228954315, + "learning_rate": 0.00012887773175327286, + "loss": 0.8657, + "step": 4625 + }, + { + "epoch": 1.61, + "grad_norm": 0.21302717924118042, + "learning_rate": 0.00012885122131152467, + "loss": 0.9399, + "step": 4626 + }, + { + "epoch": 1.61, + "grad_norm": 0.20100539922714233, + "learning_rate": 0.00012882470865781075, + "loss": 0.8738, + "step": 4627 + }, + { + "epoch": 1.61, + "grad_norm": 0.19878286123275757, + "learning_rate": 0.00012879819379416383, + "loss": 0.8997, + "step": 4628 + }, + { + "epoch": 1.61, + "grad_norm": 0.21100857853889465, + "learning_rate": 0.0001287716767226167, + "loss": 0.9444, + "step": 4629 + }, + { + "epoch": 1.61, + "grad_norm": 0.18998339772224426, + "learning_rate": 0.00012874515744520243, + "loss": 0.8804, + "step": 4630 + }, + { + "epoch": 1.61, + "grad_norm": 0.2040288895368576, + "learning_rate": 0.00012871863596395418, + "loss": 0.9086, + "step": 4631 + }, + { + "epoch": 1.61, + "grad_norm": 0.20322178304195404, + "learning_rate": 0.0001286921122809053, + "loss": 0.9374, + "step": 4632 + }, + { + "epoch": 1.61, + "grad_norm": 0.19871999323368073, + "learning_rate": 0.0001286655863980893, + "loss": 0.9024, + "step": 4633 + }, + { + "epoch": 1.61, + "grad_norm": 0.2054329365491867, + "learning_rate": 0.00012863905831753984, + "loss": 0.9025, + "step": 4634 + }, + { + "epoch": 1.61, + "grad_norm": 0.19915160536766052, + "learning_rate": 0.00012861252804129082, + "loss": 0.8937, + "step": 4635 + }, + { + "epoch": 1.61, + "grad_norm": 0.19782203435897827, + "learning_rate": 0.00012858599557137628, + "loss": 0.8921, + "step": 4636 + }, + { + "epoch": 1.61, + "grad_norm": 0.1984853893518448, + "learning_rate": 0.00012855946090983034, + "loss": 0.8927, + "step": 4637 + }, + { + "epoch": 1.61, + "grad_norm": 0.2001384049654007, + "learning_rate": 0.00012853292405868742, + "loss": 0.9125, + "step": 4638 + }, + { + "epoch": 1.61, + "grad_norm": 0.19436684250831604, + "learning_rate": 0.000128506385019982, + "loss": 0.8602, + "step": 4639 + }, + { + "epoch": 1.61, + "grad_norm": 0.2030458152294159, + "learning_rate": 0.00012847984379574882, + "loss": 0.9206, + "step": 4640 + }, + { + "epoch": 1.61, + "grad_norm": 0.1936401128768921, + "learning_rate": 0.0001284533003880227, + "loss": 0.8341, + "step": 4641 + }, + { + "epoch": 1.61, + "grad_norm": 0.18999110162258148, + "learning_rate": 0.00012842675479883875, + "loss": 0.8246, + "step": 4642 + }, + { + "epoch": 1.61, + "grad_norm": 0.20278599858283997, + "learning_rate": 0.00012840020703023207, + "loss": 0.8715, + "step": 4643 + }, + { + "epoch": 1.62, + "grad_norm": 0.19653545320034027, + "learning_rate": 0.00012837365708423808, + "loss": 0.8786, + "step": 4644 + }, + { + "epoch": 1.62, + "grad_norm": 0.2093815803527832, + "learning_rate": 0.00012834710496289228, + "loss": 0.9366, + "step": 4645 + }, + { + "epoch": 1.62, + "grad_norm": 0.20715850591659546, + "learning_rate": 0.00012832055066823038, + "loss": 0.9033, + "step": 4646 + }, + { + "epoch": 1.62, + "grad_norm": 0.1966526210308075, + "learning_rate": 0.0001282939942022883, + "loss": 0.9521, + "step": 4647 + }, + { + "epoch": 1.62, + "grad_norm": 0.19843120872974396, + "learning_rate": 0.00012826743556710202, + "loss": 0.9032, + "step": 4648 + }, + { + "epoch": 1.62, + "grad_norm": 0.20358197391033173, + "learning_rate": 0.0001282408747647077, + "loss": 0.9334, + "step": 4649 + }, + { + "epoch": 1.62, + "grad_norm": 0.21139803528785706, + "learning_rate": 0.00012821431179714176, + "loss": 0.9502, + "step": 4650 + }, + { + "epoch": 1.62, + "grad_norm": 0.1972806453704834, + "learning_rate": 0.0001281877466664407, + "loss": 0.9055, + "step": 4651 + }, + { + "epoch": 1.62, + "grad_norm": 0.19850580394268036, + "learning_rate": 0.00012816117937464126, + "loss": 0.907, + "step": 4652 + }, + { + "epoch": 1.62, + "grad_norm": 0.20600315928459167, + "learning_rate": 0.00012813460992378028, + "loss": 0.8872, + "step": 4653 + }, + { + "epoch": 1.62, + "grad_norm": 0.20602796971797943, + "learning_rate": 0.00012810803831589474, + "loss": 0.872, + "step": 4654 + }, + { + "epoch": 1.62, + "grad_norm": 0.20422983169555664, + "learning_rate": 0.0001280814645530219, + "loss": 0.9397, + "step": 4655 + }, + { + "epoch": 1.62, + "grad_norm": 0.2100948840379715, + "learning_rate": 0.00012805488863719907, + "loss": 0.9563, + "step": 4656 + }, + { + "epoch": 1.62, + "grad_norm": 0.19479598104953766, + "learning_rate": 0.00012802831057046377, + "loss": 0.855, + "step": 4657 + }, + { + "epoch": 1.62, + "grad_norm": 0.20453958213329315, + "learning_rate": 0.00012800173035485375, + "loss": 0.9296, + "step": 4658 + }, + { + "epoch": 1.62, + "grad_norm": 0.21561020612716675, + "learning_rate": 0.0001279751479924068, + "loss": 0.9152, + "step": 4659 + }, + { + "epoch": 1.62, + "grad_norm": 0.21022316813468933, + "learning_rate": 0.00012794856348516095, + "loss": 0.8859, + "step": 4660 + }, + { + "epoch": 1.62, + "grad_norm": 0.1952332854270935, + "learning_rate": 0.0001279219768351544, + "loss": 0.9103, + "step": 4661 + }, + { + "epoch": 1.62, + "grad_norm": 0.19976872205734253, + "learning_rate": 0.0001278953880444255, + "loss": 0.7599, + "step": 4662 + }, + { + "epoch": 1.62, + "grad_norm": 0.20693926513195038, + "learning_rate": 0.00012786879711501273, + "loss": 0.9499, + "step": 4663 + }, + { + "epoch": 1.62, + "grad_norm": 0.21582749485969543, + "learning_rate": 0.00012784220404895476, + "loss": 0.9489, + "step": 4664 + }, + { + "epoch": 1.62, + "grad_norm": 0.18999259173870087, + "learning_rate": 0.00012781560884829045, + "loss": 0.9087, + "step": 4665 + }, + { + "epoch": 1.62, + "grad_norm": 0.2033642679452896, + "learning_rate": 0.0001277890115150588, + "loss": 0.8906, + "step": 4666 + }, + { + "epoch": 1.62, + "grad_norm": 0.2041936218738556, + "learning_rate": 0.00012776241205129897, + "loss": 0.8471, + "step": 4667 + }, + { + "epoch": 1.62, + "grad_norm": 0.2005212903022766, + "learning_rate": 0.0001277358104590503, + "loss": 0.912, + "step": 4668 + }, + { + "epoch": 1.62, + "grad_norm": 0.2013813853263855, + "learning_rate": 0.00012770920674035226, + "loss": 0.9598, + "step": 4669 + }, + { + "epoch": 1.62, + "grad_norm": 0.19489338994026184, + "learning_rate": 0.00012768260089724448, + "loss": 0.8349, + "step": 4670 + }, + { + "epoch": 1.62, + "grad_norm": 0.20248544216156006, + "learning_rate": 0.00012765599293176687, + "loss": 0.9462, + "step": 4671 + }, + { + "epoch": 1.62, + "grad_norm": 0.20362122356891632, + "learning_rate": 0.00012762938284595931, + "loss": 0.9654, + "step": 4672 + }, + { + "epoch": 1.63, + "grad_norm": 0.19850721955299377, + "learning_rate": 0.000127602770641862, + "loss": 0.9181, + "step": 4673 + }, + { + "epoch": 1.63, + "grad_norm": 0.21032880246639252, + "learning_rate": 0.0001275761563215152, + "loss": 0.9282, + "step": 4674 + }, + { + "epoch": 1.63, + "grad_norm": 0.20525328814983368, + "learning_rate": 0.00012754953988695943, + "loss": 0.9266, + "step": 4675 + }, + { + "epoch": 1.63, + "grad_norm": 0.21305762231349945, + "learning_rate": 0.00012752292134023528, + "loss": 0.9671, + "step": 4676 + }, + { + "epoch": 1.63, + "grad_norm": 0.19181224703788757, + "learning_rate": 0.00012749630068338357, + "loss": 0.8691, + "step": 4677 + }, + { + "epoch": 1.63, + "grad_norm": 0.19330871105194092, + "learning_rate": 0.00012746967791844524, + "loss": 0.8276, + "step": 4678 + }, + { + "epoch": 1.63, + "grad_norm": 0.20834501087665558, + "learning_rate": 0.00012744305304746137, + "loss": 0.9028, + "step": 4679 + }, + { + "epoch": 1.63, + "grad_norm": 0.19693732261657715, + "learning_rate": 0.00012741642607247327, + "loss": 0.8798, + "step": 4680 + }, + { + "epoch": 1.63, + "grad_norm": 0.1976938545703888, + "learning_rate": 0.0001273897969955224, + "loss": 0.9087, + "step": 4681 + }, + { + "epoch": 1.63, + "grad_norm": 0.19912458956241608, + "learning_rate": 0.0001273631658186503, + "loss": 0.9223, + "step": 4682 + }, + { + "epoch": 1.63, + "grad_norm": 0.20554779469966888, + "learning_rate": 0.00012733653254389885, + "loss": 0.9097, + "step": 4683 + }, + { + "epoch": 1.63, + "grad_norm": 0.20539045333862305, + "learning_rate": 0.00012730989717330978, + "loss": 0.9172, + "step": 4684 + }, + { + "epoch": 1.63, + "grad_norm": 0.20110385119915009, + "learning_rate": 0.0001272832597089253, + "loss": 0.9048, + "step": 4685 + }, + { + "epoch": 1.63, + "grad_norm": 0.20027123391628265, + "learning_rate": 0.00012725662015278765, + "loss": 0.9018, + "step": 4686 + }, + { + "epoch": 1.63, + "grad_norm": 0.19668515026569366, + "learning_rate": 0.00012722997850693916, + "loss": 0.912, + "step": 4687 + }, + { + "epoch": 1.63, + "grad_norm": 0.19159843027591705, + "learning_rate": 0.0001272033347734225, + "loss": 0.8726, + "step": 4688 + }, + { + "epoch": 1.63, + "grad_norm": 0.19470220804214478, + "learning_rate": 0.00012717668895428027, + "loss": 0.8884, + "step": 4689 + }, + { + "epoch": 1.63, + "grad_norm": 0.20279020071029663, + "learning_rate": 0.00012715004105155544, + "loss": 0.857, + "step": 4690 + }, + { + "epoch": 1.63, + "grad_norm": 0.20406362414360046, + "learning_rate": 0.000127123391067291, + "loss": 0.8551, + "step": 4691 + }, + { + "epoch": 1.63, + "grad_norm": 0.20731474459171295, + "learning_rate": 0.0001270967390035302, + "loss": 0.8923, + "step": 4692 + }, + { + "epoch": 1.63, + "grad_norm": 0.2141023874282837, + "learning_rate": 0.0001270700848623164, + "loss": 0.9492, + "step": 4693 + }, + { + "epoch": 1.63, + "grad_norm": 0.19725339114665985, + "learning_rate": 0.00012704342864569302, + "loss": 0.8739, + "step": 4694 + }, + { + "epoch": 1.63, + "grad_norm": 0.2122267633676529, + "learning_rate": 0.00012701677035570387, + "loss": 0.976, + "step": 4695 + }, + { + "epoch": 1.63, + "grad_norm": 0.20237216353416443, + "learning_rate": 0.00012699010999439275, + "loss": 0.9657, + "step": 4696 + }, + { + "epoch": 1.63, + "grad_norm": 0.19359779357910156, + "learning_rate": 0.00012696344756380357, + "loss": 0.8157, + "step": 4697 + }, + { + "epoch": 1.63, + "grad_norm": 0.2037573903799057, + "learning_rate": 0.00012693678306598062, + "loss": 0.9134, + "step": 4698 + }, + { + "epoch": 1.63, + "grad_norm": 0.20230227708816528, + "learning_rate": 0.00012691011650296812, + "loss": 0.9193, + "step": 4699 + }, + { + "epoch": 1.63, + "grad_norm": 0.1983824223279953, + "learning_rate": 0.00012688344787681056, + "loss": 0.8738, + "step": 4700 + }, + { + "epoch": 1.64, + "grad_norm": 0.2047380656003952, + "learning_rate": 0.0001268567771895526, + "loss": 0.9393, + "step": 4701 + }, + { + "epoch": 1.64, + "grad_norm": 0.20342564582824707, + "learning_rate": 0.00012683010444323897, + "loss": 0.9246, + "step": 4702 + }, + { + "epoch": 1.64, + "grad_norm": 0.19755631685256958, + "learning_rate": 0.00012680342963991472, + "loss": 0.9372, + "step": 4703 + }, + { + "epoch": 1.64, + "grad_norm": 0.19999569654464722, + "learning_rate": 0.00012677675278162484, + "loss": 0.9368, + "step": 4704 + }, + { + "epoch": 1.64, + "grad_norm": 0.2041405439376831, + "learning_rate": 0.00012675007387041466, + "loss": 0.934, + "step": 4705 + }, + { + "epoch": 1.64, + "grad_norm": 0.1956358402967453, + "learning_rate": 0.00012672339290832957, + "loss": 0.8872, + "step": 4706 + }, + { + "epoch": 1.64, + "grad_norm": 0.2013956606388092, + "learning_rate": 0.00012669670989741517, + "loss": 0.9227, + "step": 4707 + }, + { + "epoch": 1.64, + "grad_norm": 0.2008161097764969, + "learning_rate": 0.0001266700248397172, + "loss": 0.9199, + "step": 4708 + }, + { + "epoch": 1.64, + "grad_norm": 0.2024879902601242, + "learning_rate": 0.0001266433377372815, + "loss": 0.894, + "step": 4709 + }, + { + "epoch": 1.64, + "grad_norm": 0.19811367988586426, + "learning_rate": 0.00012661664859215413, + "loss": 0.9339, + "step": 4710 + }, + { + "epoch": 1.64, + "grad_norm": 0.2045515477657318, + "learning_rate": 0.00012658995740638136, + "loss": 0.9124, + "step": 4711 + }, + { + "epoch": 1.64, + "grad_norm": 0.2044452279806137, + "learning_rate": 0.00012656326418200947, + "loss": 0.9922, + "step": 4712 + }, + { + "epoch": 1.64, + "grad_norm": 0.2022848129272461, + "learning_rate": 0.00012653656892108503, + "loss": 0.947, + "step": 4713 + }, + { + "epoch": 1.64, + "grad_norm": 0.1936623454093933, + "learning_rate": 0.00012650987162565465, + "loss": 0.8892, + "step": 4714 + }, + { + "epoch": 1.64, + "grad_norm": 0.2036278247833252, + "learning_rate": 0.00012648317229776524, + "loss": 0.922, + "step": 4715 + }, + { + "epoch": 1.64, + "grad_norm": 0.20058897137641907, + "learning_rate": 0.00012645647093946372, + "loss": 0.8689, + "step": 4716 + }, + { + "epoch": 1.64, + "grad_norm": 0.20195390284061432, + "learning_rate": 0.00012642976755279723, + "loss": 0.9431, + "step": 4717 + }, + { + "epoch": 1.64, + "grad_norm": 0.20222632586956024, + "learning_rate": 0.00012640306213981315, + "loss": 0.9048, + "step": 4718 + }, + { + "epoch": 1.64, + "grad_norm": 0.204377681016922, + "learning_rate": 0.00012637635470255879, + "loss": 0.9154, + "step": 4719 + }, + { + "epoch": 1.64, + "grad_norm": 0.19898901879787445, + "learning_rate": 0.0001263496452430819, + "loss": 0.8935, + "step": 4720 + }, + { + "epoch": 1.64, + "grad_norm": 0.19688498973846436, + "learning_rate": 0.00012632293376343016, + "loss": 0.8986, + "step": 4721 + }, + { + "epoch": 1.64, + "grad_norm": 0.19992433488368988, + "learning_rate": 0.00012629622026565147, + "loss": 0.8598, + "step": 4722 + }, + { + "epoch": 1.64, + "grad_norm": 0.2034882754087448, + "learning_rate": 0.00012626950475179397, + "loss": 0.8845, + "step": 4723 + }, + { + "epoch": 1.64, + "grad_norm": 0.2034643441438675, + "learning_rate": 0.00012624278722390584, + "loss": 0.9494, + "step": 4724 + }, + { + "epoch": 1.64, + "grad_norm": 0.20310690999031067, + "learning_rate": 0.00012621606768403547, + "loss": 0.9539, + "step": 4725 + }, + { + "epoch": 1.64, + "grad_norm": 0.20767931640148163, + "learning_rate": 0.00012618934613423138, + "loss": 0.9233, + "step": 4726 + }, + { + "epoch": 1.64, + "grad_norm": 0.19426491856575012, + "learning_rate": 0.00012616262257654224, + "loss": 0.9371, + "step": 4727 + }, + { + "epoch": 1.64, + "grad_norm": 0.19253315031528473, + "learning_rate": 0.000126135897013017, + "loss": 0.8632, + "step": 4728 + }, + { + "epoch": 1.65, + "grad_norm": 0.195959210395813, + "learning_rate": 0.00012610916944570452, + "loss": 0.8928, + "step": 4729 + }, + { + "epoch": 1.65, + "grad_norm": 0.2083369642496109, + "learning_rate": 0.00012608243987665403, + "loss": 0.9695, + "step": 4730 + }, + { + "epoch": 1.65, + "grad_norm": 0.1992987096309662, + "learning_rate": 0.0001260557083079148, + "loss": 0.9216, + "step": 4731 + }, + { + "epoch": 1.65, + "grad_norm": 0.2010459452867508, + "learning_rate": 0.00012602897474153627, + "loss": 0.8297, + "step": 4732 + }, + { + "epoch": 1.65, + "grad_norm": 0.20180568099021912, + "learning_rate": 0.0001260022391795681, + "loss": 0.9472, + "step": 4733 + }, + { + "epoch": 1.65, + "grad_norm": 0.1999882608652115, + "learning_rate": 0.00012597550162406004, + "loss": 0.898, + "step": 4734 + }, + { + "epoch": 1.65, + "grad_norm": 0.2005031853914261, + "learning_rate": 0.00012594876207706199, + "loss": 0.9134, + "step": 4735 + }, + { + "epoch": 1.65, + "grad_norm": 0.194036066532135, + "learning_rate": 0.00012592202054062402, + "loss": 0.9065, + "step": 4736 + }, + { + "epoch": 1.65, + "grad_norm": 0.21351303160190582, + "learning_rate": 0.0001258952770167963, + "loss": 0.8464, + "step": 4737 + }, + { + "epoch": 1.65, + "grad_norm": 0.19651855528354645, + "learning_rate": 0.0001258685315076293, + "loss": 0.907, + "step": 4738 + }, + { + "epoch": 1.65, + "grad_norm": 0.20761534571647644, + "learning_rate": 0.0001258417840151735, + "loss": 0.9521, + "step": 4739 + }, + { + "epoch": 1.65, + "grad_norm": 0.21457886695861816, + "learning_rate": 0.00012581503454147958, + "loss": 0.9144, + "step": 4740 + }, + { + "epoch": 1.65, + "grad_norm": 0.21407045423984528, + "learning_rate": 0.00012578828308859835, + "loss": 0.9467, + "step": 4741 + }, + { + "epoch": 1.65, + "grad_norm": 0.1977054923772812, + "learning_rate": 0.00012576152965858078, + "loss": 0.9275, + "step": 4742 + }, + { + "epoch": 1.65, + "grad_norm": 0.2006564736366272, + "learning_rate": 0.00012573477425347808, + "loss": 0.9641, + "step": 4743 + }, + { + "epoch": 1.65, + "grad_norm": 0.19844360649585724, + "learning_rate": 0.00012570801687534147, + "loss": 0.9349, + "step": 4744 + }, + { + "epoch": 1.65, + "grad_norm": 0.214335635304451, + "learning_rate": 0.00012568125752622237, + "loss": 0.9298, + "step": 4745 + }, + { + "epoch": 1.65, + "grad_norm": 0.19964855909347534, + "learning_rate": 0.0001256544962081724, + "loss": 0.8417, + "step": 4746 + }, + { + "epoch": 1.65, + "grad_norm": 0.1935940831899643, + "learning_rate": 0.0001256277329232433, + "loss": 0.8531, + "step": 4747 + }, + { + "epoch": 1.65, + "grad_norm": 0.2027629017829895, + "learning_rate": 0.0001256009676734869, + "loss": 0.9267, + "step": 4748 + }, + { + "epoch": 1.65, + "grad_norm": 0.1888791173696518, + "learning_rate": 0.00012557420046095533, + "loss": 0.8679, + "step": 4749 + }, + { + "epoch": 1.65, + "grad_norm": 0.20005565881729126, + "learning_rate": 0.00012554743128770073, + "loss": 0.9021, + "step": 4750 + }, + { + "epoch": 1.65, + "grad_norm": 0.19851699471473694, + "learning_rate": 0.00012552066015577543, + "loss": 0.8643, + "step": 4751 + }, + { + "epoch": 1.65, + "grad_norm": 0.19369323551654816, + "learning_rate": 0.00012549388706723194, + "loss": 0.855, + "step": 4752 + }, + { + "epoch": 1.65, + "grad_norm": 0.19861680269241333, + "learning_rate": 0.00012546711202412287, + "loss": 0.8797, + "step": 4753 + }, + { + "epoch": 1.65, + "grad_norm": 0.19953753054141998, + "learning_rate": 0.00012544033502850108, + "loss": 0.9289, + "step": 4754 + }, + { + "epoch": 1.65, + "grad_norm": 0.20581848919391632, + "learning_rate": 0.00012541355608241942, + "loss": 0.945, + "step": 4755 + }, + { + "epoch": 1.65, + "grad_norm": 0.19809095561504364, + "learning_rate": 0.00012538677518793103, + "loss": 0.8942, + "step": 4756 + }, + { + "epoch": 1.65, + "grad_norm": 0.2005758285522461, + "learning_rate": 0.00012535999234708912, + "loss": 0.9201, + "step": 4757 + }, + { + "epoch": 1.66, + "grad_norm": 0.2029479295015335, + "learning_rate": 0.0001253332075619471, + "loss": 0.8957, + "step": 4758 + }, + { + "epoch": 1.66, + "grad_norm": 0.199596107006073, + "learning_rate": 0.0001253064208345585, + "loss": 0.9275, + "step": 4759 + }, + { + "epoch": 1.66, + "grad_norm": 0.19497540593147278, + "learning_rate": 0.00012527963216697703, + "loss": 0.8849, + "step": 4760 + }, + { + "epoch": 1.66, + "grad_norm": 0.1980920284986496, + "learning_rate": 0.00012525284156125649, + "loss": 0.9224, + "step": 4761 + }, + { + "epoch": 1.66, + "grad_norm": 0.20496365427970886, + "learning_rate": 0.00012522604901945084, + "loss": 0.9739, + "step": 4762 + }, + { + "epoch": 1.66, + "grad_norm": 0.20327137410640717, + "learning_rate": 0.00012519925454361426, + "loss": 0.8848, + "step": 4763 + }, + { + "epoch": 1.66, + "grad_norm": 0.20324772596359253, + "learning_rate": 0.000125172458135801, + "loss": 0.8823, + "step": 4764 + }, + { + "epoch": 1.66, + "grad_norm": 0.2028888463973999, + "learning_rate": 0.00012514565979806553, + "loss": 0.9404, + "step": 4765 + }, + { + "epoch": 1.66, + "grad_norm": 0.20583532750606537, + "learning_rate": 0.00012511885953246237, + "loss": 0.9499, + "step": 4766 + }, + { + "epoch": 1.66, + "grad_norm": 0.18950171768665314, + "learning_rate": 0.00012509205734104623, + "loss": 0.8812, + "step": 4767 + }, + { + "epoch": 1.66, + "grad_norm": 0.20389115810394287, + "learning_rate": 0.00012506525322587207, + "loss": 0.9047, + "step": 4768 + }, + { + "epoch": 1.66, + "grad_norm": 0.20218288898468018, + "learning_rate": 0.00012503844718899485, + "loss": 0.8811, + "step": 4769 + }, + { + "epoch": 1.66, + "grad_norm": 0.19585788249969482, + "learning_rate": 0.0001250116392324697, + "loss": 0.8725, + "step": 4770 + }, + { + "epoch": 1.66, + "grad_norm": 0.20046821236610413, + "learning_rate": 0.000124984829358352, + "loss": 0.9135, + "step": 4771 + }, + { + "epoch": 1.66, + "grad_norm": 0.20927603542804718, + "learning_rate": 0.0001249580175686972, + "loss": 0.9032, + "step": 4772 + }, + { + "epoch": 1.66, + "grad_norm": 0.19529473781585693, + "learning_rate": 0.00012493120386556085, + "loss": 0.8774, + "step": 4773 + }, + { + "epoch": 1.66, + "grad_norm": 0.20796741545200348, + "learning_rate": 0.00012490438825099877, + "loss": 0.8518, + "step": 4774 + }, + { + "epoch": 1.66, + "grad_norm": 0.20350944995880127, + "learning_rate": 0.00012487757072706683, + "loss": 0.9255, + "step": 4775 + }, + { + "epoch": 1.66, + "grad_norm": 0.21421313285827637, + "learning_rate": 0.00012485075129582107, + "loss": 1.0059, + "step": 4776 + }, + { + "epoch": 1.66, + "grad_norm": 0.2068627029657364, + "learning_rate": 0.00012482392995931768, + "loss": 0.889, + "step": 4777 + }, + { + "epoch": 1.66, + "grad_norm": 0.20098544657230377, + "learning_rate": 0.00012479710671961302, + "loss": 0.9105, + "step": 4778 + }, + { + "epoch": 1.66, + "grad_norm": 0.2001083940267563, + "learning_rate": 0.00012477028157876355, + "loss": 0.9113, + "step": 4779 + }, + { + "epoch": 1.66, + "grad_norm": 0.20307984948158264, + "learning_rate": 0.00012474345453882595, + "loss": 0.8946, + "step": 4780 + }, + { + "epoch": 1.66, + "grad_norm": 0.20198144018650055, + "learning_rate": 0.00012471662560185694, + "loss": 0.9234, + "step": 4781 + }, + { + "epoch": 1.66, + "grad_norm": 0.20412342250347137, + "learning_rate": 0.00012468979476991344, + "loss": 0.9353, + "step": 4782 + }, + { + "epoch": 1.66, + "grad_norm": 0.21251653134822845, + "learning_rate": 0.00012466296204505256, + "loss": 0.8688, + "step": 4783 + }, + { + "epoch": 1.66, + "grad_norm": 0.20212705433368683, + "learning_rate": 0.00012463612742933148, + "loss": 0.9185, + "step": 4784 + }, + { + "epoch": 1.66, + "grad_norm": 0.19823376834392548, + "learning_rate": 0.00012460929092480757, + "loss": 0.8651, + "step": 4785 + }, + { + "epoch": 1.67, + "grad_norm": 0.1942797750234604, + "learning_rate": 0.00012458245253353832, + "loss": 0.8739, + "step": 4786 + }, + { + "epoch": 1.67, + "grad_norm": 0.2100387066602707, + "learning_rate": 0.0001245556122575814, + "loss": 0.8959, + "step": 4787 + }, + { + "epoch": 1.67, + "grad_norm": 0.21781910955905914, + "learning_rate": 0.00012452877009899455, + "loss": 0.9808, + "step": 4788 + }, + { + "epoch": 1.67, + "grad_norm": 0.2030804306268692, + "learning_rate": 0.00012450192605983578, + "loss": 0.909, + "step": 4789 + }, + { + "epoch": 1.67, + "grad_norm": 0.20184268057346344, + "learning_rate": 0.0001244750801421631, + "loss": 0.9259, + "step": 4790 + }, + { + "epoch": 1.67, + "grad_norm": 0.21130618453025818, + "learning_rate": 0.00012444823234803476, + "loss": 0.8928, + "step": 4791 + }, + { + "epoch": 1.67, + "grad_norm": 0.19274461269378662, + "learning_rate": 0.00012442138267950911, + "loss": 0.887, + "step": 4792 + }, + { + "epoch": 1.67, + "grad_norm": 0.21150325238704681, + "learning_rate": 0.0001243945311386447, + "loss": 0.966, + "step": 4793 + }, + { + "epoch": 1.67, + "grad_norm": 0.19882428646087646, + "learning_rate": 0.00012436767772750016, + "loss": 0.8655, + "step": 4794 + }, + { + "epoch": 1.67, + "grad_norm": 0.2035803347826004, + "learning_rate": 0.0001243408224481343, + "loss": 0.8424, + "step": 4795 + }, + { + "epoch": 1.67, + "grad_norm": 0.20144344866275787, + "learning_rate": 0.00012431396530260605, + "loss": 0.8622, + "step": 4796 + }, + { + "epoch": 1.67, + "grad_norm": 0.21372032165527344, + "learning_rate": 0.00012428710629297445, + "loss": 0.8885, + "step": 4797 + }, + { + "epoch": 1.67, + "grad_norm": 0.21561861038208008, + "learning_rate": 0.0001242602454212988, + "loss": 0.9356, + "step": 4798 + }, + { + "epoch": 1.67, + "grad_norm": 0.20541751384735107, + "learning_rate": 0.00012423338268963846, + "loss": 0.9053, + "step": 4799 + }, + { + "epoch": 1.67, + "grad_norm": 0.1994425356388092, + "learning_rate": 0.0001242065181000529, + "loss": 0.9568, + "step": 4800 + }, + { + "epoch": 1.67, + "grad_norm": 0.20360726118087769, + "learning_rate": 0.00012417965165460179, + "loss": 0.8055, + "step": 4801 + }, + { + "epoch": 1.67, + "grad_norm": 0.21201220154762268, + "learning_rate": 0.00012415278335534496, + "loss": 0.9122, + "step": 4802 + }, + { + "epoch": 1.67, + "grad_norm": 0.19801384210586548, + "learning_rate": 0.00012412591320434227, + "loss": 0.8405, + "step": 4803 + }, + { + "epoch": 1.67, + "grad_norm": 0.21161265671253204, + "learning_rate": 0.00012409904120365392, + "loss": 0.9093, + "step": 4804 + }, + { + "epoch": 1.67, + "grad_norm": 0.1941586285829544, + "learning_rate": 0.00012407216735534004, + "loss": 0.8723, + "step": 4805 + }, + { + "epoch": 1.67, + "grad_norm": 0.1975100338459015, + "learning_rate": 0.00012404529166146102, + "loss": 0.9366, + "step": 4806 + }, + { + "epoch": 1.67, + "grad_norm": 0.20188577473163605, + "learning_rate": 0.00012401841412407733, + "loss": 0.8629, + "step": 4807 + }, + { + "epoch": 1.67, + "grad_norm": 0.1966887265443802, + "learning_rate": 0.00012399153474524968, + "loss": 0.8388, + "step": 4808 + }, + { + "epoch": 1.67, + "grad_norm": 0.20200707018375397, + "learning_rate": 0.00012396465352703886, + "loss": 0.8711, + "step": 4809 + }, + { + "epoch": 1.67, + "grad_norm": 0.20064467191696167, + "learning_rate": 0.00012393777047150573, + "loss": 0.9107, + "step": 4810 + }, + { + "epoch": 1.67, + "grad_norm": 0.21187013387680054, + "learning_rate": 0.00012391088558071146, + "loss": 0.922, + "step": 4811 + }, + { + "epoch": 1.67, + "grad_norm": 0.20777884125709534, + "learning_rate": 0.00012388399885671715, + "loss": 0.9048, + "step": 4812 + }, + { + "epoch": 1.67, + "grad_norm": 0.20996583998203278, + "learning_rate": 0.00012385711030158422, + "loss": 0.9906, + "step": 4813 + }, + { + "epoch": 1.67, + "grad_norm": 0.20055098831653595, + "learning_rate": 0.00012383021991737415, + "loss": 0.8662, + "step": 4814 + }, + { + "epoch": 1.68, + "grad_norm": 0.21267199516296387, + "learning_rate": 0.00012380332770614856, + "loss": 0.894, + "step": 4815 + }, + { + "epoch": 1.68, + "grad_norm": 0.20670823752880096, + "learning_rate": 0.00012377643366996927, + "loss": 0.9398, + "step": 4816 + }, + { + "epoch": 1.68, + "grad_norm": 0.20194852352142334, + "learning_rate": 0.00012374953781089812, + "loss": 0.9119, + "step": 4817 + }, + { + "epoch": 1.68, + "grad_norm": 0.19411972165107727, + "learning_rate": 0.0001237226401309972, + "loss": 0.8739, + "step": 4818 + }, + { + "epoch": 1.68, + "grad_norm": 0.2021835595369339, + "learning_rate": 0.0001236957406323287, + "loss": 0.8944, + "step": 4819 + }, + { + "epoch": 1.68, + "grad_norm": 0.20480941236019135, + "learning_rate": 0.00012366883931695492, + "loss": 0.9131, + "step": 4820 + }, + { + "epoch": 1.68, + "grad_norm": 0.19868041574954987, + "learning_rate": 0.0001236419361869384, + "loss": 0.9164, + "step": 4821 + }, + { + "epoch": 1.68, + "grad_norm": 0.19789153337478638, + "learning_rate": 0.00012361503124434168, + "loss": 0.8702, + "step": 4822 + }, + { + "epoch": 1.68, + "grad_norm": 0.18909890949726105, + "learning_rate": 0.00012358812449122754, + "loss": 0.8115, + "step": 4823 + }, + { + "epoch": 1.68, + "grad_norm": 0.19557319581508636, + "learning_rate": 0.00012356121592965887, + "loss": 0.9356, + "step": 4824 + }, + { + "epoch": 1.68, + "grad_norm": 0.20103345811367035, + "learning_rate": 0.0001235343055616987, + "loss": 0.9251, + "step": 4825 + }, + { + "epoch": 1.68, + "grad_norm": 0.1910592019557953, + "learning_rate": 0.00012350739338941018, + "loss": 0.8771, + "step": 4826 + }, + { + "epoch": 1.68, + "grad_norm": 0.1996619999408722, + "learning_rate": 0.00012348047941485659, + "loss": 0.8778, + "step": 4827 + }, + { + "epoch": 1.68, + "grad_norm": 0.20036165416240692, + "learning_rate": 0.00012345356364010143, + "loss": 0.9669, + "step": 4828 + }, + { + "epoch": 1.68, + "grad_norm": 0.19795265793800354, + "learning_rate": 0.00012342664606720822, + "loss": 0.8803, + "step": 4829 + }, + { + "epoch": 1.68, + "grad_norm": 0.2031397819519043, + "learning_rate": 0.0001233997266982407, + "loss": 0.919, + "step": 4830 + }, + { + "epoch": 1.68, + "grad_norm": 0.1937347948551178, + "learning_rate": 0.00012337280553526277, + "loss": 0.8887, + "step": 4831 + }, + { + "epoch": 1.68, + "grad_norm": 0.20833584666252136, + "learning_rate": 0.00012334588258033834, + "loss": 0.9119, + "step": 4832 + }, + { + "epoch": 1.68, + "grad_norm": 0.1977413147687912, + "learning_rate": 0.0001233189578355316, + "loss": 0.7891, + "step": 4833 + }, + { + "epoch": 1.68, + "grad_norm": 0.19594435393810272, + "learning_rate": 0.0001232920313029068, + "loss": 0.9201, + "step": 4834 + }, + { + "epoch": 1.68, + "grad_norm": 0.20837093889713287, + "learning_rate": 0.0001232651029845283, + "loss": 0.9205, + "step": 4835 + }, + { + "epoch": 1.68, + "grad_norm": 0.2012907862663269, + "learning_rate": 0.00012323817288246072, + "loss": 0.8571, + "step": 4836 + }, + { + "epoch": 1.68, + "grad_norm": 0.20674099028110504, + "learning_rate": 0.00012321124099876865, + "loss": 0.9526, + "step": 4837 + }, + { + "epoch": 1.68, + "grad_norm": 0.19660276174545288, + "learning_rate": 0.00012318430733551699, + "loss": 0.8947, + "step": 4838 + }, + { + "epoch": 1.68, + "grad_norm": 0.20618146657943726, + "learning_rate": 0.00012315737189477063, + "loss": 0.9915, + "step": 4839 + }, + { + "epoch": 1.68, + "grad_norm": 0.20119090378284454, + "learning_rate": 0.00012313043467859468, + "loss": 0.9783, + "step": 4840 + }, + { + "epoch": 1.68, + "grad_norm": 0.19183722138404846, + "learning_rate": 0.0001231034956890544, + "loss": 0.8596, + "step": 4841 + }, + { + "epoch": 1.68, + "grad_norm": 0.19631201028823853, + "learning_rate": 0.00012307655492821507, + "loss": 0.8865, + "step": 4842 + }, + { + "epoch": 1.69, + "grad_norm": 0.2002319097518921, + "learning_rate": 0.0001230496123981422, + "loss": 0.8704, + "step": 4843 + }, + { + "epoch": 1.69, + "grad_norm": 0.2014852911233902, + "learning_rate": 0.00012302266810090148, + "loss": 0.9073, + "step": 4844 + }, + { + "epoch": 1.69, + "grad_norm": 0.20717839896678925, + "learning_rate": 0.00012299572203855862, + "loss": 0.885, + "step": 4845 + }, + { + "epoch": 1.69, + "grad_norm": 0.1985516995191574, + "learning_rate": 0.0001229687742131796, + "loss": 0.9247, + "step": 4846 + }, + { + "epoch": 1.69, + "grad_norm": 0.20868374407291412, + "learning_rate": 0.0001229418246268303, + "loss": 0.9499, + "step": 4847 + }, + { + "epoch": 1.69, + "grad_norm": 0.20093078911304474, + "learning_rate": 0.00012291487328157705, + "loss": 0.8834, + "step": 4848 + }, + { + "epoch": 1.69, + "grad_norm": 0.2085324078798294, + "learning_rate": 0.00012288792017948607, + "loss": 1.0308, + "step": 4849 + }, + { + "epoch": 1.69, + "grad_norm": 0.20289358496665955, + "learning_rate": 0.0001228609653226238, + "loss": 0.855, + "step": 4850 + }, + { + "epoch": 1.69, + "grad_norm": 0.205068439245224, + "learning_rate": 0.00012283400871305694, + "loss": 1.0064, + "step": 4851 + }, + { + "epoch": 1.69, + "grad_norm": 0.20595556497573853, + "learning_rate": 0.000122807050352852, + "loss": 0.9158, + "step": 4852 + }, + { + "epoch": 1.69, + "grad_norm": 0.20259448885917664, + "learning_rate": 0.00012278009024407592, + "loss": 0.8504, + "step": 4853 + }, + { + "epoch": 1.69, + "grad_norm": 0.20223982632160187, + "learning_rate": 0.00012275312838879574, + "loss": 0.942, + "step": 4854 + }, + { + "epoch": 1.69, + "grad_norm": 0.20645855367183685, + "learning_rate": 0.00012272616478907846, + "loss": 0.917, + "step": 4855 + }, + { + "epoch": 1.69, + "grad_norm": 0.20709437131881714, + "learning_rate": 0.0001226991994469914, + "loss": 0.8653, + "step": 4856 + }, + { + "epoch": 1.69, + "grad_norm": 0.19745272397994995, + "learning_rate": 0.00012267223236460192, + "loss": 0.8475, + "step": 4857 + }, + { + "epoch": 1.69, + "grad_norm": 0.21651792526245117, + "learning_rate": 0.00012264526354397755, + "loss": 0.9116, + "step": 4858 + }, + { + "epoch": 1.69, + "grad_norm": 0.20984449982643127, + "learning_rate": 0.00012261829298718587, + "loss": 0.9508, + "step": 4859 + }, + { + "epoch": 1.69, + "grad_norm": 0.22017496824264526, + "learning_rate": 0.0001225913206962947, + "loss": 1.0125, + "step": 4860 + }, + { + "epoch": 1.69, + "grad_norm": 0.19838780164718628, + "learning_rate": 0.000122564346673372, + "loss": 0.8886, + "step": 4861 + }, + { + "epoch": 1.69, + "grad_norm": 0.2012847363948822, + "learning_rate": 0.00012253737092048577, + "loss": 0.9365, + "step": 4862 + }, + { + "epoch": 1.69, + "grad_norm": 0.20742902159690857, + "learning_rate": 0.00012251039343970417, + "loss": 0.9472, + "step": 4863 + }, + { + "epoch": 1.69, + "grad_norm": 0.204187273979187, + "learning_rate": 0.0001224834142330955, + "loss": 0.8915, + "step": 4864 + }, + { + "epoch": 1.69, + "grad_norm": 0.20868054032325745, + "learning_rate": 0.00012245643330272826, + "loss": 0.8737, + "step": 4865 + }, + { + "epoch": 1.69, + "grad_norm": 0.20117759704589844, + "learning_rate": 0.00012242945065067098, + "loss": 0.8567, + "step": 4866 + }, + { + "epoch": 1.69, + "grad_norm": 0.19919933378696442, + "learning_rate": 0.00012240246627899238, + "loss": 0.9152, + "step": 4867 + }, + { + "epoch": 1.69, + "grad_norm": 0.2044895738363266, + "learning_rate": 0.00012237548018976133, + "loss": 0.9284, + "step": 4868 + }, + { + "epoch": 1.69, + "grad_norm": 0.21247272193431854, + "learning_rate": 0.00012234849238504673, + "loss": 0.9213, + "step": 4869 + }, + { + "epoch": 1.69, + "grad_norm": 0.2041437327861786, + "learning_rate": 0.00012232150286691773, + "loss": 0.9402, + "step": 4870 + }, + { + "epoch": 1.7, + "grad_norm": 0.206780806183815, + "learning_rate": 0.00012229451163744354, + "loss": 0.8597, + "step": 4871 + }, + { + "epoch": 1.7, + "grad_norm": 0.20794974267482758, + "learning_rate": 0.00012226751869869355, + "loss": 0.9023, + "step": 4872 + }, + { + "epoch": 1.7, + "grad_norm": 0.20286689698696136, + "learning_rate": 0.00012224052405273724, + "loss": 0.8945, + "step": 4873 + }, + { + "epoch": 1.7, + "grad_norm": 0.19486016035079956, + "learning_rate": 0.00012221352770164424, + "loss": 0.8933, + "step": 4874 + }, + { + "epoch": 1.7, + "grad_norm": 0.19991089403629303, + "learning_rate": 0.00012218652964748428, + "loss": 0.9064, + "step": 4875 + }, + { + "epoch": 1.7, + "grad_norm": 0.20874367654323578, + "learning_rate": 0.00012215952989232728, + "loss": 0.9028, + "step": 4876 + }, + { + "epoch": 1.7, + "grad_norm": 0.21600410342216492, + "learning_rate": 0.00012213252843824325, + "loss": 0.9277, + "step": 4877 + }, + { + "epoch": 1.7, + "grad_norm": 0.20367151498794556, + "learning_rate": 0.00012210552528730235, + "loss": 0.8838, + "step": 4878 + }, + { + "epoch": 1.7, + "grad_norm": 0.18601082265377045, + "learning_rate": 0.00012207852044157484, + "loss": 0.8004, + "step": 4879 + }, + { + "epoch": 1.7, + "grad_norm": 0.2015962302684784, + "learning_rate": 0.0001220515139031311, + "loss": 0.9, + "step": 4880 + }, + { + "epoch": 1.7, + "grad_norm": 0.20441995561122894, + "learning_rate": 0.00012202450567404171, + "loss": 0.8845, + "step": 4881 + }, + { + "epoch": 1.7, + "grad_norm": 0.20740507543087006, + "learning_rate": 0.00012199749575637733, + "loss": 0.8861, + "step": 4882 + }, + { + "epoch": 1.7, + "grad_norm": 0.210323765873909, + "learning_rate": 0.00012197048415220875, + "loss": 0.9043, + "step": 4883 + }, + { + "epoch": 1.7, + "grad_norm": 0.20224831998348236, + "learning_rate": 0.00012194347086360692, + "loss": 0.8717, + "step": 4884 + }, + { + "epoch": 1.7, + "grad_norm": 0.20253652334213257, + "learning_rate": 0.00012191645589264283, + "loss": 0.9113, + "step": 4885 + }, + { + "epoch": 1.7, + "grad_norm": 0.19361186027526855, + "learning_rate": 0.00012188943924138771, + "loss": 0.8605, + "step": 4886 + }, + { + "epoch": 1.7, + "grad_norm": 0.1945461481809616, + "learning_rate": 0.00012186242091191292, + "loss": 0.8891, + "step": 4887 + }, + { + "epoch": 1.7, + "grad_norm": 0.2082328498363495, + "learning_rate": 0.0001218354009062898, + "loss": 0.9408, + "step": 4888 + }, + { + "epoch": 1.7, + "grad_norm": 0.20235101878643036, + "learning_rate": 0.00012180837922659001, + "loss": 0.8895, + "step": 4889 + }, + { + "epoch": 1.7, + "grad_norm": 0.20408551394939423, + "learning_rate": 0.00012178135587488515, + "loss": 0.8531, + "step": 4890 + }, + { + "epoch": 1.7, + "grad_norm": 0.20237864553928375, + "learning_rate": 0.00012175433085324714, + "loss": 0.9257, + "step": 4891 + }, + { + "epoch": 1.7, + "grad_norm": 0.2030421644449234, + "learning_rate": 0.00012172730416374791, + "loss": 0.9628, + "step": 4892 + }, + { + "epoch": 1.7, + "grad_norm": 0.19940567016601562, + "learning_rate": 0.00012170027580845953, + "loss": 0.8989, + "step": 4893 + }, + { + "epoch": 1.7, + "grad_norm": 0.19683948159217834, + "learning_rate": 0.00012167324578945423, + "loss": 0.9434, + "step": 4894 + }, + { + "epoch": 1.7, + "grad_norm": 0.21224354207515717, + "learning_rate": 0.00012164621410880427, + "loss": 0.9947, + "step": 4895 + }, + { + "epoch": 1.7, + "grad_norm": 0.20414133369922638, + "learning_rate": 0.00012161918076858223, + "loss": 0.9006, + "step": 4896 + }, + { + "epoch": 1.7, + "grad_norm": 0.20747117698192596, + "learning_rate": 0.00012159214577086062, + "loss": 0.908, + "step": 4897 + }, + { + "epoch": 1.7, + "grad_norm": 0.19901777803897858, + "learning_rate": 0.00012156510911771221, + "loss": 0.9242, + "step": 4898 + }, + { + "epoch": 1.7, + "grad_norm": 0.21684972941875458, + "learning_rate": 0.00012153807081120983, + "loss": 1.0294, + "step": 4899 + }, + { + "epoch": 1.71, + "grad_norm": 0.20291352272033691, + "learning_rate": 0.0001215110308534264, + "loss": 0.8793, + "step": 4900 + }, + { + "epoch": 1.71, + "grad_norm": 0.1894405335187912, + "learning_rate": 0.0001214839892464351, + "loss": 0.8511, + "step": 4901 + }, + { + "epoch": 1.71, + "grad_norm": 0.2027515321969986, + "learning_rate": 0.00012145694599230913, + "loss": 0.9226, + "step": 4902 + }, + { + "epoch": 1.71, + "grad_norm": 0.20232322812080383, + "learning_rate": 0.00012142990109312182, + "loss": 0.8925, + "step": 4903 + }, + { + "epoch": 1.71, + "grad_norm": 0.19063344597816467, + "learning_rate": 0.00012140285455094669, + "loss": 0.8289, + "step": 4904 + }, + { + "epoch": 1.71, + "grad_norm": 0.20592787861824036, + "learning_rate": 0.00012137580636785726, + "loss": 0.9255, + "step": 4905 + }, + { + "epoch": 1.71, + "grad_norm": 0.1977776288986206, + "learning_rate": 0.00012134875654592736, + "loss": 0.9156, + "step": 4906 + }, + { + "epoch": 1.71, + "grad_norm": 0.20104831457138062, + "learning_rate": 0.00012132170508723082, + "loss": 0.9236, + "step": 4907 + }, + { + "epoch": 1.71, + "grad_norm": 0.2730604410171509, + "learning_rate": 0.00012129465199384157, + "loss": 0.8175, + "step": 4908 + }, + { + "epoch": 1.71, + "grad_norm": 0.20264138281345367, + "learning_rate": 0.00012126759726783379, + "loss": 0.9311, + "step": 4909 + }, + { + "epoch": 1.71, + "grad_norm": 0.21460282802581787, + "learning_rate": 0.00012124054091128165, + "loss": 0.9825, + "step": 4910 + }, + { + "epoch": 1.71, + "grad_norm": 0.20077942311763763, + "learning_rate": 0.00012121348292625954, + "loss": 0.8766, + "step": 4911 + }, + { + "epoch": 1.71, + "grad_norm": 0.19806306064128876, + "learning_rate": 0.00012118642331484194, + "loss": 0.8929, + "step": 4912 + }, + { + "epoch": 1.71, + "grad_norm": 0.19765718281269073, + "learning_rate": 0.00012115936207910343, + "loss": 0.9899, + "step": 4913 + }, + { + "epoch": 1.71, + "grad_norm": 0.20788128674030304, + "learning_rate": 0.00012113229922111877, + "loss": 0.9416, + "step": 4914 + }, + { + "epoch": 1.71, + "grad_norm": 0.20780085027217865, + "learning_rate": 0.00012110523474296281, + "loss": 0.9248, + "step": 4915 + }, + { + "epoch": 1.71, + "grad_norm": 0.21029643714427948, + "learning_rate": 0.00012107816864671054, + "loss": 0.9705, + "step": 4916 + }, + { + "epoch": 1.71, + "grad_norm": 0.2099684178829193, + "learning_rate": 0.00012105110093443703, + "loss": 0.9051, + "step": 4917 + }, + { + "epoch": 1.71, + "grad_norm": 0.20328673720359802, + "learning_rate": 0.00012102403160821753, + "loss": 0.9653, + "step": 4918 + }, + { + "epoch": 1.71, + "grad_norm": 0.20479902625083923, + "learning_rate": 0.0001209969606701274, + "loss": 0.9216, + "step": 4919 + }, + { + "epoch": 1.71, + "grad_norm": 0.1987340748310089, + "learning_rate": 0.00012096988812224208, + "loss": 0.8951, + "step": 4920 + }, + { + "epoch": 1.71, + "grad_norm": 0.1998843103647232, + "learning_rate": 0.00012094281396663722, + "loss": 0.8768, + "step": 4921 + }, + { + "epoch": 1.71, + "grad_norm": 0.20857340097427368, + "learning_rate": 0.0001209157382053885, + "loss": 0.9482, + "step": 4922 + }, + { + "epoch": 1.71, + "grad_norm": 0.21716679632663727, + "learning_rate": 0.0001208886608405718, + "loss": 0.8987, + "step": 4923 + }, + { + "epoch": 1.71, + "grad_norm": 0.20676922798156738, + "learning_rate": 0.00012086158187426304, + "loss": 0.8846, + "step": 4924 + }, + { + "epoch": 1.71, + "grad_norm": 0.20093367993831635, + "learning_rate": 0.00012083450130853833, + "loss": 0.8664, + "step": 4925 + }, + { + "epoch": 1.71, + "grad_norm": 0.20721964538097382, + "learning_rate": 0.00012080741914547391, + "loss": 0.8971, + "step": 4926 + }, + { + "epoch": 1.71, + "grad_norm": 0.2019132673740387, + "learning_rate": 0.00012078033538714611, + "loss": 0.8943, + "step": 4927 + }, + { + "epoch": 1.72, + "grad_norm": 0.18864411115646362, + "learning_rate": 0.00012075325003563133, + "loss": 0.8439, + "step": 4928 + }, + { + "epoch": 1.72, + "grad_norm": 0.2038918435573578, + "learning_rate": 0.00012072616309300625, + "loss": 0.9259, + "step": 4929 + }, + { + "epoch": 1.72, + "grad_norm": 0.19137632846832275, + "learning_rate": 0.00012069907456134746, + "loss": 0.8448, + "step": 4930 + }, + { + "epoch": 1.72, + "grad_norm": 0.20036281645298004, + "learning_rate": 0.00012067198444273187, + "loss": 0.923, + "step": 4931 + }, + { + "epoch": 1.72, + "grad_norm": 0.2106323540210724, + "learning_rate": 0.00012064489273923638, + "loss": 0.9007, + "step": 4932 + }, + { + "epoch": 1.72, + "grad_norm": 0.20353010296821594, + "learning_rate": 0.0001206177994529381, + "loss": 0.9127, + "step": 4933 + }, + { + "epoch": 1.72, + "grad_norm": 0.1995949000120163, + "learning_rate": 0.00012059070458591414, + "loss": 0.878, + "step": 4934 + }, + { + "epoch": 1.72, + "grad_norm": 0.19901487231254578, + "learning_rate": 0.00012056360814024188, + "loss": 0.9539, + "step": 4935 + }, + { + "epoch": 1.72, + "grad_norm": 0.2054724097251892, + "learning_rate": 0.00012053651011799876, + "loss": 0.9506, + "step": 4936 + }, + { + "epoch": 1.72, + "grad_norm": 0.19489338994026184, + "learning_rate": 0.00012050941052126226, + "loss": 0.8724, + "step": 4937 + }, + { + "epoch": 1.72, + "grad_norm": 0.20409442484378815, + "learning_rate": 0.0001204823093521101, + "loss": 0.873, + "step": 4938 + }, + { + "epoch": 1.72, + "grad_norm": 0.21491308510303497, + "learning_rate": 0.0001204552066126201, + "loss": 0.9649, + "step": 4939 + }, + { + "epoch": 1.72, + "grad_norm": 0.20102648437023163, + "learning_rate": 0.0001204281023048701, + "loss": 0.8908, + "step": 4940 + }, + { + "epoch": 1.72, + "grad_norm": 0.1951928734779358, + "learning_rate": 0.00012040099643093818, + "loss": 0.8707, + "step": 4941 + }, + { + "epoch": 1.72, + "grad_norm": 0.19312453269958496, + "learning_rate": 0.00012037388899290252, + "loss": 0.9023, + "step": 4942 + }, + { + "epoch": 1.72, + "grad_norm": 0.19747990369796753, + "learning_rate": 0.0001203467799928413, + "loss": 0.8916, + "step": 4943 + }, + { + "epoch": 1.72, + "grad_norm": 0.19958235323429108, + "learning_rate": 0.00012031966943283303, + "loss": 0.9143, + "step": 4944 + }, + { + "epoch": 1.72, + "grad_norm": 0.2075386494398117, + "learning_rate": 0.00012029255731495613, + "loss": 0.8906, + "step": 4945 + }, + { + "epoch": 1.72, + "grad_norm": 0.1987982839345932, + "learning_rate": 0.00012026544364128926, + "loss": 0.8988, + "step": 4946 + }, + { + "epoch": 1.72, + "grad_norm": 0.21121381223201752, + "learning_rate": 0.0001202383284139112, + "loss": 0.8728, + "step": 4947 + }, + { + "epoch": 1.72, + "grad_norm": 0.20049889385700226, + "learning_rate": 0.00012021121163490078, + "loss": 0.8491, + "step": 4948 + }, + { + "epoch": 1.72, + "grad_norm": 0.20627477765083313, + "learning_rate": 0.00012018409330633704, + "loss": 0.931, + "step": 4949 + }, + { + "epoch": 1.72, + "grad_norm": 0.2119373083114624, + "learning_rate": 0.00012015697343029903, + "loss": 0.9066, + "step": 4950 + }, + { + "epoch": 1.72, + "grad_norm": 0.20603740215301514, + "learning_rate": 0.00012012985200886602, + "loss": 0.911, + "step": 4951 + }, + { + "epoch": 1.72, + "grad_norm": 0.19633600115776062, + "learning_rate": 0.00012010272904411732, + "loss": 0.9493, + "step": 4952 + }, + { + "epoch": 1.72, + "grad_norm": 0.20193667709827423, + "learning_rate": 0.00012007560453813241, + "loss": 0.9339, + "step": 4953 + }, + { + "epoch": 1.72, + "grad_norm": 0.1957622915506363, + "learning_rate": 0.0001200484784929909, + "loss": 0.8399, + "step": 4954 + }, + { + "epoch": 1.72, + "grad_norm": 0.1923246532678604, + "learning_rate": 0.00012002135091077244, + "loss": 0.8511, + "step": 4955 + }, + { + "epoch": 1.72, + "grad_norm": 0.2052861452102661, + "learning_rate": 0.00011999422179355689, + "loss": 0.9062, + "step": 4956 + }, + { + "epoch": 1.73, + "grad_norm": 0.20123454928398132, + "learning_rate": 0.00011996709114342417, + "loss": 0.8783, + "step": 4957 + }, + { + "epoch": 1.73, + "grad_norm": 0.19618423283100128, + "learning_rate": 0.00011993995896245429, + "loss": 0.9468, + "step": 4958 + }, + { + "epoch": 1.73, + "grad_norm": 0.20387451350688934, + "learning_rate": 0.00011991282525272751, + "loss": 0.8704, + "step": 4959 + }, + { + "epoch": 1.73, + "grad_norm": 0.1999095231294632, + "learning_rate": 0.00011988569001632403, + "loss": 0.9186, + "step": 4960 + }, + { + "epoch": 1.73, + "grad_norm": 0.2033764272928238, + "learning_rate": 0.00011985855325532432, + "loss": 0.9135, + "step": 4961 + }, + { + "epoch": 1.73, + "grad_norm": 0.19539391994476318, + "learning_rate": 0.00011983141497180885, + "loss": 0.8872, + "step": 4962 + }, + { + "epoch": 1.73, + "grad_norm": 0.19846411049365997, + "learning_rate": 0.00011980427516785829, + "loss": 0.8634, + "step": 4963 + }, + { + "epoch": 1.73, + "grad_norm": 0.19619372487068176, + "learning_rate": 0.00011977713384555343, + "loss": 0.8708, + "step": 4964 + }, + { + "epoch": 1.73, + "grad_norm": 0.21282833814620972, + "learning_rate": 0.00011974999100697503, + "loss": 0.9144, + "step": 4965 + }, + { + "epoch": 1.73, + "grad_norm": 0.21747606992721558, + "learning_rate": 0.00011972284665420417, + "loss": 0.9693, + "step": 4966 + }, + { + "epoch": 1.73, + "grad_norm": 0.206832617521286, + "learning_rate": 0.00011969570078932193, + "loss": 0.8923, + "step": 4967 + }, + { + "epoch": 1.73, + "grad_norm": 0.19994622468948364, + "learning_rate": 0.00011966855341440952, + "loss": 0.9185, + "step": 4968 + }, + { + "epoch": 1.73, + "grad_norm": 0.19521817564964294, + "learning_rate": 0.00011964140453154833, + "loss": 0.9909, + "step": 4969 + }, + { + "epoch": 1.73, + "grad_norm": 0.20193180441856384, + "learning_rate": 0.0001196142541428197, + "loss": 0.9187, + "step": 4970 + }, + { + "epoch": 1.73, + "eval_loss": 0.9203041195869446, + "eval_runtime": 757.3258, + "eval_samples_per_second": 9.079, + "eval_steps_per_second": 4.54, + "step": 4970 + }, + { + "epoch": 1.73, + "grad_norm": 0.20618854463100433, + "learning_rate": 0.00011958710225030529, + "loss": 0.8969, + "step": 4971 + }, + { + "epoch": 1.73, + "grad_norm": 0.20384158194065094, + "learning_rate": 0.00011955994885608677, + "loss": 0.9259, + "step": 4972 + }, + { + "epoch": 1.73, + "grad_norm": 0.19744277000427246, + "learning_rate": 0.00011953279396224588, + "loss": 0.8548, + "step": 4973 + }, + { + "epoch": 1.73, + "grad_norm": 0.19920489192008972, + "learning_rate": 0.00011950563757086461, + "loss": 0.941, + "step": 4974 + }, + { + "epoch": 1.73, + "grad_norm": 0.18816323578357697, + "learning_rate": 0.0001194784796840249, + "loss": 0.842, + "step": 4975 + }, + { + "epoch": 1.73, + "grad_norm": 0.20093213021755219, + "learning_rate": 0.00011945132030380897, + "loss": 0.9264, + "step": 4976 + }, + { + "epoch": 1.73, + "grad_norm": 0.21550847589969635, + "learning_rate": 0.00011942415943229903, + "loss": 0.9098, + "step": 4977 + }, + { + "epoch": 1.73, + "grad_norm": 0.19980955123901367, + "learning_rate": 0.00011939699707157748, + "loss": 0.8741, + "step": 4978 + }, + { + "epoch": 1.73, + "grad_norm": 0.19830453395843506, + "learning_rate": 0.00011936983322372679, + "loss": 0.8678, + "step": 4979 + }, + { + "epoch": 1.73, + "grad_norm": 0.21155405044555664, + "learning_rate": 0.00011934266789082953, + "loss": 0.9155, + "step": 4980 + }, + { + "epoch": 1.73, + "grad_norm": 0.20855197310447693, + "learning_rate": 0.00011931550107496846, + "loss": 0.866, + "step": 4981 + }, + { + "epoch": 1.73, + "grad_norm": 0.200531005859375, + "learning_rate": 0.00011928833277822638, + "loss": 0.8902, + "step": 4982 + }, + { + "epoch": 1.73, + "grad_norm": 0.2080719918012619, + "learning_rate": 0.0001192611630026862, + "loss": 0.9563, + "step": 4983 + }, + { + "epoch": 1.73, + "grad_norm": 0.20154033601284027, + "learning_rate": 0.00011923399175043103, + "loss": 0.8818, + "step": 4984 + }, + { + "epoch": 1.74, + "grad_norm": 0.19243967533111572, + "learning_rate": 0.00011920681902354405, + "loss": 0.8583, + "step": 4985 + }, + { + "epoch": 1.74, + "grad_norm": 0.20547130703926086, + "learning_rate": 0.00011917964482410843, + "loss": 0.8994, + "step": 4986 + }, + { + "epoch": 1.74, + "grad_norm": 0.2043677717447281, + "learning_rate": 0.00011915246915420768, + "loss": 0.8972, + "step": 4987 + }, + { + "epoch": 1.74, + "grad_norm": 0.19929523766040802, + "learning_rate": 0.00011912529201592522, + "loss": 0.9099, + "step": 4988 + }, + { + "epoch": 1.74, + "grad_norm": 0.19746260344982147, + "learning_rate": 0.00011909811341134472, + "loss": 0.871, + "step": 4989 + }, + { + "epoch": 1.74, + "grad_norm": 0.20300501585006714, + "learning_rate": 0.00011907093334254993, + "loss": 0.8044, + "step": 4990 + }, + { + "epoch": 1.74, + "grad_norm": 0.19856730103492737, + "learning_rate": 0.0001190437518116246, + "loss": 0.8879, + "step": 4991 + }, + { + "epoch": 1.74, + "grad_norm": 0.20286604762077332, + "learning_rate": 0.00011901656882065276, + "loss": 0.9892, + "step": 4992 + }, + { + "epoch": 1.74, + "grad_norm": 0.20987409353256226, + "learning_rate": 0.00011898938437171842, + "loss": 0.8848, + "step": 4993 + }, + { + "epoch": 1.74, + "grad_norm": 0.19633051753044128, + "learning_rate": 0.00011896219846690582, + "loss": 0.8424, + "step": 4994 + }, + { + "epoch": 1.74, + "grad_norm": 0.19561024010181427, + "learning_rate": 0.00011893501110829926, + "loss": 0.8976, + "step": 4995 + }, + { + "epoch": 1.74, + "grad_norm": 0.19931741058826447, + "learning_rate": 0.00011890782229798305, + "loss": 0.9, + "step": 4996 + }, + { + "epoch": 1.74, + "grad_norm": 0.2051405906677246, + "learning_rate": 0.00011888063203804176, + "loss": 0.8666, + "step": 4997 + }, + { + "epoch": 1.74, + "grad_norm": 0.20421981811523438, + "learning_rate": 0.00011885344033056, + "loss": 0.9034, + "step": 4998 + }, + { + "epoch": 1.74, + "grad_norm": 0.2062828689813614, + "learning_rate": 0.00011882624717762252, + "loss": 0.867, + "step": 4999 + }, + { + "epoch": 1.74, + "grad_norm": 0.2086305171251297, + "learning_rate": 0.00011879905258131418, + "loss": 0.9182, + "step": 5000 + }, + { + "epoch": 1.74, + "grad_norm": 0.19741983711719513, + "learning_rate": 0.00011877185654371987, + "loss": 0.8988, + "step": 5001 + }, + { + "epoch": 1.74, + "grad_norm": 0.20895710587501526, + "learning_rate": 0.00011874465906692473, + "loss": 0.9702, + "step": 5002 + }, + { + "epoch": 1.74, + "grad_norm": 0.20342940092086792, + "learning_rate": 0.00011871746015301389, + "loss": 0.9123, + "step": 5003 + }, + { + "epoch": 1.74, + "grad_norm": 0.202351376414299, + "learning_rate": 0.00011869025980407264, + "loss": 0.9364, + "step": 5004 + }, + { + "epoch": 1.74, + "grad_norm": 0.20114704966545105, + "learning_rate": 0.00011866305802218642, + "loss": 0.8252, + "step": 5005 + }, + { + "epoch": 1.74, + "grad_norm": 0.21236039698123932, + "learning_rate": 0.00011863585480944066, + "loss": 0.9436, + "step": 5006 + }, + { + "epoch": 1.74, + "grad_norm": 0.21095523238182068, + "learning_rate": 0.00011860865016792105, + "loss": 0.9577, + "step": 5007 + }, + { + "epoch": 1.74, + "grad_norm": 0.20168866217136383, + "learning_rate": 0.00011858144409971326, + "loss": 0.9228, + "step": 5008 + }, + { + "epoch": 1.74, + "grad_norm": 0.19422025978565216, + "learning_rate": 0.00011855423660690317, + "loss": 0.8558, + "step": 5009 + }, + { + "epoch": 1.74, + "grad_norm": 0.20206353068351746, + "learning_rate": 0.0001185270276915767, + "loss": 0.8905, + "step": 5010 + }, + { + "epoch": 1.74, + "grad_norm": 0.20214833319187164, + "learning_rate": 0.0001184998173558199, + "loss": 0.9154, + "step": 5011 + }, + { + "epoch": 1.74, + "grad_norm": 0.20583628118038177, + "learning_rate": 0.00011847260560171896, + "loss": 0.9023, + "step": 5012 + }, + { + "epoch": 1.75, + "grad_norm": 0.20077918469905853, + "learning_rate": 0.00011844539243136008, + "loss": 0.9255, + "step": 5013 + }, + { + "epoch": 1.75, + "grad_norm": 0.20009995996952057, + "learning_rate": 0.00011841817784682974, + "loss": 0.9335, + "step": 5014 + }, + { + "epoch": 1.75, + "grad_norm": 0.20166489481925964, + "learning_rate": 0.00011839096185021438, + "loss": 0.9916, + "step": 5015 + }, + { + "epoch": 1.75, + "grad_norm": 0.1968214064836502, + "learning_rate": 0.00011836374444360056, + "loss": 0.8258, + "step": 5016 + }, + { + "epoch": 1.75, + "grad_norm": 0.2118564248085022, + "learning_rate": 0.00011833652562907504, + "loss": 0.916, + "step": 5017 + }, + { + "epoch": 1.75, + "grad_norm": 0.20984144508838654, + "learning_rate": 0.0001183093054087246, + "loss": 0.8969, + "step": 5018 + }, + { + "epoch": 1.75, + "grad_norm": 0.20295755565166473, + "learning_rate": 0.00011828208378463619, + "loss": 0.8394, + "step": 5019 + }, + { + "epoch": 1.75, + "grad_norm": 0.2015915960073471, + "learning_rate": 0.00011825486075889683, + "loss": 0.9105, + "step": 5020 + }, + { + "epoch": 1.75, + "grad_norm": 0.20518988370895386, + "learning_rate": 0.0001182276363335936, + "loss": 0.8709, + "step": 5021 + }, + { + "epoch": 1.75, + "grad_norm": 0.1976424902677536, + "learning_rate": 0.0001182004105108138, + "loss": 0.8797, + "step": 5022 + }, + { + "epoch": 1.75, + "grad_norm": 0.2075275331735611, + "learning_rate": 0.00011817318329264477, + "loss": 0.9243, + "step": 5023 + }, + { + "epoch": 1.75, + "grad_norm": 0.20736710727214813, + "learning_rate": 0.00011814595468117397, + "loss": 0.9199, + "step": 5024 + }, + { + "epoch": 1.75, + "grad_norm": 0.2057676911354065, + "learning_rate": 0.00011811872467848897, + "loss": 0.9225, + "step": 5025 + }, + { + "epoch": 1.75, + "grad_norm": 0.206811785697937, + "learning_rate": 0.0001180914932866774, + "loss": 0.9511, + "step": 5026 + }, + { + "epoch": 1.75, + "grad_norm": 0.1979030966758728, + "learning_rate": 0.00011806426050782709, + "loss": 0.8755, + "step": 5027 + }, + { + "epoch": 1.75, + "grad_norm": 0.21560432016849518, + "learning_rate": 0.00011803702634402587, + "loss": 0.9617, + "step": 5028 + }, + { + "epoch": 1.75, + "grad_norm": 0.22279982268810272, + "learning_rate": 0.00011800979079736176, + "loss": 0.9276, + "step": 5029 + }, + { + "epoch": 1.75, + "grad_norm": 0.1999060958623886, + "learning_rate": 0.00011798255386992288, + "loss": 0.8727, + "step": 5030 + }, + { + "epoch": 1.75, + "grad_norm": 0.2031540870666504, + "learning_rate": 0.00011795531556379737, + "loss": 0.9602, + "step": 5031 + }, + { + "epoch": 1.75, + "grad_norm": 0.20556539297103882, + "learning_rate": 0.00011792807588107357, + "loss": 0.9124, + "step": 5032 + }, + { + "epoch": 1.75, + "grad_norm": 0.20039956271648407, + "learning_rate": 0.00011790083482383989, + "loss": 0.8745, + "step": 5033 + }, + { + "epoch": 1.75, + "grad_norm": 0.20217512547969818, + "learning_rate": 0.00011787359239418485, + "loss": 0.9396, + "step": 5034 + }, + { + "epoch": 1.75, + "grad_norm": 0.20315247774124146, + "learning_rate": 0.0001178463485941971, + "loss": 0.9068, + "step": 5035 + }, + { + "epoch": 1.75, + "grad_norm": 0.20155490934848785, + "learning_rate": 0.00011781910342596529, + "loss": 0.9777, + "step": 5036 + }, + { + "epoch": 1.75, + "grad_norm": 0.20154711604118347, + "learning_rate": 0.00011779185689157835, + "loss": 0.9135, + "step": 5037 + }, + { + "epoch": 1.75, + "grad_norm": 0.20197591185569763, + "learning_rate": 0.00011776460899312514, + "loss": 0.8422, + "step": 5038 + }, + { + "epoch": 1.75, + "grad_norm": 0.19243617355823517, + "learning_rate": 0.00011773735973269472, + "loss": 0.886, + "step": 5039 + }, + { + "epoch": 1.75, + "grad_norm": 0.21045725047588348, + "learning_rate": 0.00011771010911237628, + "loss": 0.9763, + "step": 5040 + }, + { + "epoch": 1.75, + "grad_norm": 0.2029496282339096, + "learning_rate": 0.00011768285713425901, + "loss": 0.9022, + "step": 5041 + }, + { + "epoch": 1.76, + "grad_norm": 0.20475132763385773, + "learning_rate": 0.0001176556038004323, + "loss": 0.8977, + "step": 5042 + }, + { + "epoch": 1.76, + "grad_norm": 0.1879488229751587, + "learning_rate": 0.00011762834911298561, + "loss": 0.8861, + "step": 5043 + }, + { + "epoch": 1.76, + "grad_norm": 0.1994534581899643, + "learning_rate": 0.00011760109307400849, + "loss": 0.8782, + "step": 5044 + }, + { + "epoch": 1.76, + "grad_norm": 0.20935015380382538, + "learning_rate": 0.00011757383568559061, + "loss": 0.9391, + "step": 5045 + }, + { + "epoch": 1.76, + "grad_norm": 0.20415537059307098, + "learning_rate": 0.00011754657694982174, + "loss": 0.8946, + "step": 5046 + }, + { + "epoch": 1.76, + "grad_norm": 0.19910888373851776, + "learning_rate": 0.00011751931686879177, + "loss": 0.857, + "step": 5047 + }, + { + "epoch": 1.76, + "grad_norm": 0.18923191726207733, + "learning_rate": 0.00011749205544459063, + "loss": 0.8723, + "step": 5048 + }, + { + "epoch": 1.76, + "grad_norm": 0.2021467089653015, + "learning_rate": 0.00011746479267930844, + "loss": 0.8768, + "step": 5049 + }, + { + "epoch": 1.76, + "grad_norm": 0.19616064429283142, + "learning_rate": 0.00011743752857503538, + "loss": 0.8571, + "step": 5050 + }, + { + "epoch": 1.76, + "grad_norm": 0.19986507296562195, + "learning_rate": 0.00011741026313386172, + "loss": 0.885, + "step": 5051 + }, + { + "epoch": 1.76, + "grad_norm": 0.20162515342235565, + "learning_rate": 0.00011738299635787786, + "loss": 0.9214, + "step": 5052 + }, + { + "epoch": 1.76, + "grad_norm": 0.2050357162952423, + "learning_rate": 0.00011735572824917427, + "loss": 0.8715, + "step": 5053 + }, + { + "epoch": 1.76, + "grad_norm": 0.1958906352519989, + "learning_rate": 0.00011732845880984153, + "loss": 0.8739, + "step": 5054 + }, + { + "epoch": 1.76, + "grad_norm": 0.2050018459558487, + "learning_rate": 0.00011730118804197042, + "loss": 0.851, + "step": 5055 + }, + { + "epoch": 1.76, + "grad_norm": 0.19149170815944672, + "learning_rate": 0.00011727391594765163, + "loss": 0.9196, + "step": 5056 + }, + { + "epoch": 1.76, + "grad_norm": 0.20418493449687958, + "learning_rate": 0.0001172466425289761, + "loss": 0.8943, + "step": 5057 + }, + { + "epoch": 1.76, + "grad_norm": 0.19700072705745697, + "learning_rate": 0.00011721936778803484, + "loss": 0.8402, + "step": 5058 + }, + { + "epoch": 1.76, + "grad_norm": 0.19113709032535553, + "learning_rate": 0.00011719209172691892, + "loss": 0.8822, + "step": 5059 + }, + { + "epoch": 1.76, + "grad_norm": 0.19682195782661438, + "learning_rate": 0.00011716481434771958, + "loss": 0.9305, + "step": 5060 + }, + { + "epoch": 1.76, + "grad_norm": 0.19861525297164917, + "learning_rate": 0.0001171375356525281, + "loss": 0.8543, + "step": 5061 + }, + { + "epoch": 1.76, + "grad_norm": 0.20508785545825958, + "learning_rate": 0.0001171102556434359, + "loss": 0.8968, + "step": 5062 + }, + { + "epoch": 1.76, + "grad_norm": 0.19325780868530273, + "learning_rate": 0.00011708297432253444, + "loss": 0.8555, + "step": 5063 + }, + { + "epoch": 1.76, + "grad_norm": 0.202621191740036, + "learning_rate": 0.00011705569169191534, + "loss": 0.8708, + "step": 5064 + }, + { + "epoch": 1.76, + "grad_norm": 0.20007860660552979, + "learning_rate": 0.00011702840775367034, + "loss": 0.9425, + "step": 5065 + }, + { + "epoch": 1.76, + "grad_norm": 0.20317086577415466, + "learning_rate": 0.00011700112250989122, + "loss": 0.9159, + "step": 5066 + }, + { + "epoch": 1.76, + "grad_norm": 0.21285471320152283, + "learning_rate": 0.00011697383596266993, + "loss": 0.9658, + "step": 5067 + }, + { + "epoch": 1.76, + "grad_norm": 0.20765405893325806, + "learning_rate": 0.0001169465481140984, + "loss": 0.8651, + "step": 5068 + }, + { + "epoch": 1.76, + "grad_norm": 0.20606768131256104, + "learning_rate": 0.00011691925896626877, + "loss": 0.8446, + "step": 5069 + }, + { + "epoch": 1.77, + "grad_norm": 0.193104088306427, + "learning_rate": 0.00011689196852127325, + "loss": 0.8825, + "step": 5070 + }, + { + "epoch": 1.77, + "grad_norm": 0.19602930545806885, + "learning_rate": 0.0001168646767812041, + "loss": 0.8814, + "step": 5071 + }, + { + "epoch": 1.77, + "grad_norm": 0.19966290891170502, + "learning_rate": 0.00011683738374815382, + "loss": 0.8934, + "step": 5072 + }, + { + "epoch": 1.77, + "grad_norm": 0.21192531287670135, + "learning_rate": 0.00011681008942421483, + "loss": 0.9528, + "step": 5073 + }, + { + "epoch": 1.77, + "grad_norm": 0.2027505785226822, + "learning_rate": 0.00011678279381147975, + "loss": 0.8551, + "step": 5074 + }, + { + "epoch": 1.77, + "grad_norm": 0.2016507089138031, + "learning_rate": 0.00011675549691204129, + "loss": 0.925, + "step": 5075 + }, + { + "epoch": 1.77, + "grad_norm": 0.21192465722560883, + "learning_rate": 0.00011672819872799223, + "loss": 0.8924, + "step": 5076 + }, + { + "epoch": 1.77, + "grad_norm": 0.2008252888917923, + "learning_rate": 0.00011670089926142553, + "loss": 0.8713, + "step": 5077 + }, + { + "epoch": 1.77, + "grad_norm": 0.2006734311580658, + "learning_rate": 0.00011667359851443411, + "loss": 0.9093, + "step": 5078 + }, + { + "epoch": 1.77, + "grad_norm": 0.20377005636692047, + "learning_rate": 0.00011664629648911108, + "loss": 0.8406, + "step": 5079 + }, + { + "epoch": 1.77, + "grad_norm": 0.20884154736995697, + "learning_rate": 0.00011661899318754965, + "loss": 0.9309, + "step": 5080 + }, + { + "epoch": 1.77, + "grad_norm": 0.20641496777534485, + "learning_rate": 0.00011659168861184308, + "loss": 0.8986, + "step": 5081 + }, + { + "epoch": 1.77, + "grad_norm": 0.20506221055984497, + "learning_rate": 0.00011656438276408484, + "loss": 0.9009, + "step": 5082 + }, + { + "epoch": 1.77, + "grad_norm": 0.19785983860492706, + "learning_rate": 0.00011653707564636833, + "loss": 0.9092, + "step": 5083 + }, + { + "epoch": 1.77, + "grad_norm": 0.19785176217556, + "learning_rate": 0.00011650976726078712, + "loss": 0.8682, + "step": 5084 + }, + { + "epoch": 1.77, + "grad_norm": 0.2017369270324707, + "learning_rate": 0.00011648245760943499, + "loss": 0.9217, + "step": 5085 + }, + { + "epoch": 1.77, + "grad_norm": 0.2052314579486847, + "learning_rate": 0.0001164551466944056, + "loss": 0.8482, + "step": 5086 + }, + { + "epoch": 1.77, + "grad_norm": 0.19984367489814758, + "learning_rate": 0.00011642783451779294, + "loss": 0.8843, + "step": 5087 + }, + { + "epoch": 1.77, + "grad_norm": 0.21052081882953644, + "learning_rate": 0.00011640052108169088, + "loss": 0.9245, + "step": 5088 + }, + { + "epoch": 1.77, + "grad_norm": 0.19494381546974182, + "learning_rate": 0.00011637320638819349, + "loss": 0.8456, + "step": 5089 + }, + { + "epoch": 1.77, + "grad_norm": 0.2073839157819748, + "learning_rate": 0.00011634589043939501, + "loss": 0.8419, + "step": 5090 + }, + { + "epoch": 1.77, + "grad_norm": 0.19387920200824738, + "learning_rate": 0.00011631857323738966, + "loss": 0.8577, + "step": 5091 + }, + { + "epoch": 1.77, + "grad_norm": 0.20236201584339142, + "learning_rate": 0.00011629125478427178, + "loss": 0.9158, + "step": 5092 + }, + { + "epoch": 1.77, + "grad_norm": 0.20327600836753845, + "learning_rate": 0.00011626393508213586, + "loss": 0.9471, + "step": 5093 + }, + { + "epoch": 1.77, + "grad_norm": 0.19547243416309357, + "learning_rate": 0.00011623661413307639, + "loss": 0.853, + "step": 5094 + }, + { + "epoch": 1.77, + "grad_norm": 0.20263858139514923, + "learning_rate": 0.00011620929193918806, + "loss": 0.9924, + "step": 5095 + }, + { + "epoch": 1.77, + "grad_norm": 0.2074957937002182, + "learning_rate": 0.00011618196850256557, + "loss": 0.9559, + "step": 5096 + }, + { + "epoch": 1.77, + "grad_norm": 0.19843412935733795, + "learning_rate": 0.0001161546438253038, + "loss": 0.8685, + "step": 5097 + }, + { + "epoch": 1.77, + "grad_norm": 0.20189149677753448, + "learning_rate": 0.00011612731790949767, + "loss": 0.8867, + "step": 5098 + }, + { + "epoch": 1.78, + "grad_norm": 0.19635272026062012, + "learning_rate": 0.00011609999075724214, + "loss": 0.8595, + "step": 5099 + }, + { + "epoch": 1.78, + "grad_norm": 0.20424078404903412, + "learning_rate": 0.00011607266237063242, + "loss": 0.9016, + "step": 5100 + }, + { + "epoch": 1.78, + "grad_norm": 0.19970911741256714, + "learning_rate": 0.00011604533275176366, + "loss": 0.9319, + "step": 5101 + }, + { + "epoch": 1.78, + "grad_norm": 0.2071448713541031, + "learning_rate": 0.0001160180019027312, + "loss": 0.8947, + "step": 5102 + }, + { + "epoch": 1.78, + "grad_norm": 0.20625151693820953, + "learning_rate": 0.00011599066982563043, + "loss": 0.8917, + "step": 5103 + }, + { + "epoch": 1.78, + "grad_norm": 0.20384685695171356, + "learning_rate": 0.0001159633365225568, + "loss": 0.8873, + "step": 5104 + }, + { + "epoch": 1.78, + "grad_norm": 0.19870717823505402, + "learning_rate": 0.00011593600199560599, + "loss": 0.8186, + "step": 5105 + }, + { + "epoch": 1.78, + "grad_norm": 0.19644193351268768, + "learning_rate": 0.00011590866624687362, + "loss": 0.8968, + "step": 5106 + }, + { + "epoch": 1.78, + "grad_norm": 0.2026110291481018, + "learning_rate": 0.00011588132927845552, + "loss": 0.9062, + "step": 5107 + }, + { + "epoch": 1.78, + "grad_norm": 0.20113103091716766, + "learning_rate": 0.0001158539910924475, + "loss": 0.8294, + "step": 5108 + }, + { + "epoch": 1.78, + "grad_norm": 0.20522204041481018, + "learning_rate": 0.00011582665169094554, + "loss": 0.9486, + "step": 5109 + }, + { + "epoch": 1.78, + "grad_norm": 0.1949976086616516, + "learning_rate": 0.00011579931107604571, + "loss": 0.8544, + "step": 5110 + }, + { + "epoch": 1.78, + "grad_norm": 0.20692569017410278, + "learning_rate": 0.00011577196924984418, + "loss": 0.8792, + "step": 5111 + }, + { + "epoch": 1.78, + "grad_norm": 0.19932514429092407, + "learning_rate": 0.00011574462621443714, + "loss": 0.9022, + "step": 5112 + }, + { + "epoch": 1.78, + "grad_norm": 0.20348139107227325, + "learning_rate": 0.000115717281971921, + "loss": 0.8854, + "step": 5113 + }, + { + "epoch": 1.78, + "grad_norm": 0.1993340402841568, + "learning_rate": 0.0001156899365243921, + "loss": 0.9246, + "step": 5114 + }, + { + "epoch": 1.78, + "grad_norm": 0.2046261727809906, + "learning_rate": 0.00011566258987394704, + "loss": 0.8866, + "step": 5115 + }, + { + "epoch": 1.78, + "grad_norm": 0.21360766887664795, + "learning_rate": 0.00011563524202268241, + "loss": 0.9124, + "step": 5116 + }, + { + "epoch": 1.78, + "grad_norm": 0.2009619027376175, + "learning_rate": 0.00011560789297269487, + "loss": 0.9078, + "step": 5117 + }, + { + "epoch": 1.78, + "grad_norm": 0.20256301760673523, + "learning_rate": 0.0001155805427260813, + "loss": 0.9127, + "step": 5118 + }, + { + "epoch": 1.78, + "grad_norm": 0.19327925145626068, + "learning_rate": 0.00011555319128493851, + "loss": 0.8556, + "step": 5119 + }, + { + "epoch": 1.78, + "grad_norm": 0.19143442809581757, + "learning_rate": 0.00011552583865136353, + "loss": 0.8526, + "step": 5120 + }, + { + "epoch": 1.78, + "grad_norm": 0.20160861313343048, + "learning_rate": 0.0001154984848274534, + "loss": 0.8435, + "step": 5121 + }, + { + "epoch": 1.78, + "grad_norm": 0.19750486314296722, + "learning_rate": 0.00011547112981530531, + "loss": 0.8792, + "step": 5122 + }, + { + "epoch": 1.78, + "grad_norm": 0.2013956904411316, + "learning_rate": 0.0001154437736170165, + "loss": 0.8681, + "step": 5123 + }, + { + "epoch": 1.78, + "grad_norm": 0.20969411730766296, + "learning_rate": 0.00011541641623468431, + "loss": 0.9353, + "step": 5124 + }, + { + "epoch": 1.78, + "grad_norm": 0.2132592648267746, + "learning_rate": 0.0001153890576704062, + "loss": 0.9047, + "step": 5125 + }, + { + "epoch": 1.78, + "grad_norm": 0.2049732804298401, + "learning_rate": 0.00011536169792627967, + "loss": 0.901, + "step": 5126 + }, + { + "epoch": 1.79, + "grad_norm": 0.20980370044708252, + "learning_rate": 0.00011533433700440232, + "loss": 0.9547, + "step": 5127 + }, + { + "epoch": 1.79, + "grad_norm": 0.19606991112232208, + "learning_rate": 0.00011530697490687194, + "loss": 0.9157, + "step": 5128 + }, + { + "epoch": 1.79, + "grad_norm": 0.19489556550979614, + "learning_rate": 0.00011527961163578624, + "loss": 0.8315, + "step": 5129 + }, + { + "epoch": 1.79, + "grad_norm": 0.20620688796043396, + "learning_rate": 0.00011525224719324314, + "loss": 0.9093, + "step": 5130 + }, + { + "epoch": 1.79, + "grad_norm": 0.21620605885982513, + "learning_rate": 0.00011522488158134064, + "loss": 0.948, + "step": 5131 + }, + { + "epoch": 1.79, + "grad_norm": 0.20374895632266998, + "learning_rate": 0.00011519751480217676, + "loss": 0.8583, + "step": 5132 + }, + { + "epoch": 1.79, + "grad_norm": 0.19723199307918549, + "learning_rate": 0.0001151701468578497, + "loss": 0.9123, + "step": 5133 + }, + { + "epoch": 1.79, + "grad_norm": 0.20205992460250854, + "learning_rate": 0.00011514277775045768, + "loss": 0.9027, + "step": 5134 + }, + { + "epoch": 1.79, + "grad_norm": 0.19809818267822266, + "learning_rate": 0.00011511540748209903, + "loss": 0.9026, + "step": 5135 + }, + { + "epoch": 1.79, + "grad_norm": 0.20418870449066162, + "learning_rate": 0.00011508803605487222, + "loss": 0.848, + "step": 5136 + }, + { + "epoch": 1.79, + "grad_norm": 0.19335588812828064, + "learning_rate": 0.00011506066347087568, + "loss": 0.8672, + "step": 5137 + }, + { + "epoch": 1.79, + "grad_norm": 0.20655111968517303, + "learning_rate": 0.00011503328973220811, + "loss": 0.8827, + "step": 5138 + }, + { + "epoch": 1.79, + "grad_norm": 0.19734971225261688, + "learning_rate": 0.0001150059148409681, + "loss": 0.8798, + "step": 5139 + }, + { + "epoch": 1.79, + "grad_norm": 0.2011449933052063, + "learning_rate": 0.0001149785387992545, + "loss": 0.9196, + "step": 5140 + }, + { + "epoch": 1.79, + "grad_norm": 0.19869692623615265, + "learning_rate": 0.00011495116160916617, + "loss": 0.8314, + "step": 5141 + }, + { + "epoch": 1.79, + "grad_norm": 0.20650891959667206, + "learning_rate": 0.00011492378327280202, + "loss": 0.8761, + "step": 5142 + }, + { + "epoch": 1.79, + "grad_norm": 0.2007884383201599, + "learning_rate": 0.00011489640379226116, + "loss": 0.8532, + "step": 5143 + }, + { + "epoch": 1.79, + "grad_norm": 0.19970685243606567, + "learning_rate": 0.00011486902316964265, + "loss": 0.9252, + "step": 5144 + }, + { + "epoch": 1.79, + "grad_norm": 0.20636236667633057, + "learning_rate": 0.00011484164140704577, + "loss": 0.8707, + "step": 5145 + }, + { + "epoch": 1.79, + "grad_norm": 0.2008487433195114, + "learning_rate": 0.00011481425850656978, + "loss": 0.899, + "step": 5146 + }, + { + "epoch": 1.79, + "grad_norm": 0.2033643275499344, + "learning_rate": 0.00011478687447031407, + "loss": 0.9197, + "step": 5147 + }, + { + "epoch": 1.79, + "grad_norm": 0.20582693815231323, + "learning_rate": 0.00011475948930037819, + "loss": 1.0164, + "step": 5148 + }, + { + "epoch": 1.79, + "grad_norm": 0.2104448676109314, + "learning_rate": 0.0001147321029988616, + "loss": 0.9111, + "step": 5149 + }, + { + "epoch": 1.79, + "grad_norm": 0.19654923677444458, + "learning_rate": 0.00011470471556786401, + "loss": 0.9337, + "step": 5150 + }, + { + "epoch": 1.79, + "grad_norm": 0.19730478525161743, + "learning_rate": 0.00011467732700948519, + "loss": 0.955, + "step": 5151 + }, + { + "epoch": 1.79, + "grad_norm": 0.19658218324184418, + "learning_rate": 0.0001146499373258249, + "loss": 0.893, + "step": 5152 + }, + { + "epoch": 1.79, + "grad_norm": 0.2017667293548584, + "learning_rate": 0.00011462254651898312, + "loss": 0.933, + "step": 5153 + }, + { + "epoch": 1.79, + "grad_norm": 0.19858188927173615, + "learning_rate": 0.00011459515459105978, + "loss": 0.8692, + "step": 5154 + }, + { + "epoch": 1.8, + "grad_norm": 0.19772326946258545, + "learning_rate": 0.00011456776154415502, + "loss": 0.8988, + "step": 5155 + }, + { + "epoch": 1.8, + "grad_norm": 0.19625569880008698, + "learning_rate": 0.00011454036738036899, + "loss": 0.8658, + "step": 5156 + }, + { + "epoch": 1.8, + "grad_norm": 0.20135082304477692, + "learning_rate": 0.00011451297210180195, + "loss": 0.8202, + "step": 5157 + }, + { + "epoch": 1.8, + "grad_norm": 0.20419570803642273, + "learning_rate": 0.00011448557571055423, + "loss": 0.9184, + "step": 5158 + }, + { + "epoch": 1.8, + "grad_norm": 0.2074098289012909, + "learning_rate": 0.00011445817820872627, + "loss": 0.9901, + "step": 5159 + }, + { + "epoch": 1.8, + "grad_norm": 0.20851200819015503, + "learning_rate": 0.00011443077959841856, + "loss": 0.9857, + "step": 5160 + }, + { + "epoch": 1.8, + "grad_norm": 0.20411436259746552, + "learning_rate": 0.00011440337988173173, + "loss": 0.9306, + "step": 5161 + }, + { + "epoch": 1.8, + "grad_norm": 0.2083197683095932, + "learning_rate": 0.00011437597906076641, + "loss": 0.8815, + "step": 5162 + }, + { + "epoch": 1.8, + "grad_norm": 0.20529308915138245, + "learning_rate": 0.00011434857713762344, + "loss": 0.9152, + "step": 5163 + }, + { + "epoch": 1.8, + "grad_norm": 0.20252062380313873, + "learning_rate": 0.00011432117411440359, + "loss": 0.9166, + "step": 5164 + }, + { + "epoch": 1.8, + "grad_norm": 0.1970435082912445, + "learning_rate": 0.00011429376999320789, + "loss": 0.8705, + "step": 5165 + }, + { + "epoch": 1.8, + "grad_norm": 0.2025022804737091, + "learning_rate": 0.00011426636477613728, + "loss": 0.8794, + "step": 5166 + }, + { + "epoch": 1.8, + "grad_norm": 0.20577043294906616, + "learning_rate": 0.00011423895846529286, + "loss": 0.9026, + "step": 5167 + }, + { + "epoch": 1.8, + "grad_norm": 0.2037774920463562, + "learning_rate": 0.00011421155106277588, + "loss": 0.9078, + "step": 5168 + }, + { + "epoch": 1.8, + "grad_norm": 0.20879001915454865, + "learning_rate": 0.00011418414257068759, + "loss": 0.9129, + "step": 5169 + }, + { + "epoch": 1.8, + "grad_norm": 0.1920623779296875, + "learning_rate": 0.00011415673299112932, + "loss": 0.8451, + "step": 5170 + }, + { + "epoch": 1.8, + "grad_norm": 0.19346140325069427, + "learning_rate": 0.0001141293223262025, + "loss": 0.8318, + "step": 5171 + }, + { + "epoch": 1.8, + "grad_norm": 0.19851449131965637, + "learning_rate": 0.00011410191057800868, + "loss": 0.9736, + "step": 5172 + }, + { + "epoch": 1.8, + "grad_norm": 0.19635413587093353, + "learning_rate": 0.00011407449774864947, + "loss": 0.8718, + "step": 5173 + }, + { + "epoch": 1.8, + "grad_norm": 0.2130098044872284, + "learning_rate": 0.00011404708384022654, + "loss": 0.8962, + "step": 5174 + }, + { + "epoch": 1.8, + "grad_norm": 0.18981891870498657, + "learning_rate": 0.00011401966885484165, + "loss": 0.8019, + "step": 5175 + }, + { + "epoch": 1.8, + "grad_norm": 0.2004854828119278, + "learning_rate": 0.00011399225279459669, + "loss": 0.8952, + "step": 5176 + }, + { + "epoch": 1.8, + "grad_norm": 0.20224079489707947, + "learning_rate": 0.00011396483566159352, + "loss": 0.9395, + "step": 5177 + }, + { + "epoch": 1.8, + "grad_norm": 0.2006518691778183, + "learning_rate": 0.00011393741745793425, + "loss": 0.9353, + "step": 5178 + }, + { + "epoch": 1.8, + "grad_norm": 0.197438046336174, + "learning_rate": 0.00011390999818572093, + "loss": 0.8748, + "step": 5179 + }, + { + "epoch": 1.8, + "grad_norm": 0.2060369998216629, + "learning_rate": 0.00011388257784705573, + "loss": 0.8877, + "step": 5180 + }, + { + "epoch": 1.8, + "grad_norm": 0.20259609818458557, + "learning_rate": 0.00011385515644404095, + "loss": 0.8416, + "step": 5181 + }, + { + "epoch": 1.8, + "grad_norm": 0.2101147472858429, + "learning_rate": 0.00011382773397877888, + "loss": 0.9739, + "step": 5182 + }, + { + "epoch": 1.8, + "grad_norm": 0.19835442304611206, + "learning_rate": 0.00011380031045337201, + "loss": 0.8296, + "step": 5183 + }, + { + "epoch": 1.81, + "grad_norm": 0.20819783210754395, + "learning_rate": 0.00011377288586992281, + "loss": 0.9094, + "step": 5184 + }, + { + "epoch": 1.81, + "grad_norm": 0.20873680710792542, + "learning_rate": 0.00011374546023053388, + "loss": 0.9217, + "step": 5185 + }, + { + "epoch": 1.81, + "grad_norm": 0.20260164141654968, + "learning_rate": 0.00011371803353730786, + "loss": 0.933, + "step": 5186 + }, + { + "epoch": 1.81, + "grad_norm": 0.20431824028491974, + "learning_rate": 0.00011369060579234754, + "loss": 0.9032, + "step": 5187 + }, + { + "epoch": 1.81, + "grad_norm": 0.21404831111431122, + "learning_rate": 0.00011366317699775573, + "loss": 0.9451, + "step": 5188 + }, + { + "epoch": 1.81, + "grad_norm": 0.1977844089269638, + "learning_rate": 0.00011363574715563534, + "loss": 0.8453, + "step": 5189 + }, + { + "epoch": 1.81, + "grad_norm": 0.21856170892715454, + "learning_rate": 0.00011360831626808938, + "loss": 0.9768, + "step": 5190 + }, + { + "epoch": 1.81, + "grad_norm": 0.20709745585918427, + "learning_rate": 0.00011358088433722092, + "loss": 0.8808, + "step": 5191 + }, + { + "epoch": 1.81, + "grad_norm": 0.20399156212806702, + "learning_rate": 0.00011355345136513307, + "loss": 0.9244, + "step": 5192 + }, + { + "epoch": 1.81, + "grad_norm": 0.20622244477272034, + "learning_rate": 0.0001135260173539291, + "loss": 0.9885, + "step": 5193 + }, + { + "epoch": 1.81, + "grad_norm": 0.19313405454158783, + "learning_rate": 0.00011349858230571233, + "loss": 0.8616, + "step": 5194 + }, + { + "epoch": 1.81, + "grad_norm": 0.19614113867282867, + "learning_rate": 0.00011347114622258612, + "loss": 0.9269, + "step": 5195 + }, + { + "epoch": 1.81, + "grad_norm": 0.1973341405391693, + "learning_rate": 0.00011344370910665393, + "loss": 0.925, + "step": 5196 + }, + { + "epoch": 1.81, + "grad_norm": 0.2052834928035736, + "learning_rate": 0.00011341627096001937, + "loss": 0.9339, + "step": 5197 + }, + { + "epoch": 1.81, + "grad_norm": 0.20807234942913055, + "learning_rate": 0.000113388831784786, + "loss": 0.9083, + "step": 5198 + }, + { + "epoch": 1.81, + "grad_norm": 0.2042306512594223, + "learning_rate": 0.00011336139158305758, + "loss": 0.9299, + "step": 5199 + }, + { + "epoch": 1.81, + "grad_norm": 0.20556193590164185, + "learning_rate": 0.00011333395035693786, + "loss": 0.9511, + "step": 5200 + }, + { + "epoch": 1.81, + "grad_norm": 0.19857318699359894, + "learning_rate": 0.00011330650810853072, + "loss": 0.8736, + "step": 5201 + }, + { + "epoch": 1.81, + "grad_norm": 0.1991368979215622, + "learning_rate": 0.00011327906483994008, + "loss": 0.8783, + "step": 5202 + }, + { + "epoch": 1.81, + "grad_norm": 0.20878826081752777, + "learning_rate": 0.00011325162055327001, + "loss": 0.8912, + "step": 5203 + }, + { + "epoch": 1.81, + "grad_norm": 0.2100679725408554, + "learning_rate": 0.00011322417525062456, + "loss": 0.9602, + "step": 5204 + }, + { + "epoch": 1.81, + "grad_norm": 0.19470907747745514, + "learning_rate": 0.00011319672893410796, + "loss": 0.9297, + "step": 5205 + }, + { + "epoch": 1.81, + "grad_norm": 0.20928390324115753, + "learning_rate": 0.00011316928160582438, + "loss": 0.9481, + "step": 5206 + }, + { + "epoch": 1.81, + "grad_norm": 0.194465771317482, + "learning_rate": 0.00011314183326787822, + "loss": 0.9818, + "step": 5207 + }, + { + "epoch": 1.81, + "grad_norm": 0.23972180485725403, + "learning_rate": 0.00011311438392237387, + "loss": 1.1, + "step": 5208 + }, + { + "epoch": 1.81, + "grad_norm": 0.1954929530620575, + "learning_rate": 0.00011308693357141582, + "loss": 0.8943, + "step": 5209 + }, + { + "epoch": 1.81, + "grad_norm": 0.203359916806221, + "learning_rate": 0.00011305948221710865, + "loss": 0.8708, + "step": 5210 + }, + { + "epoch": 1.81, + "grad_norm": 0.20809759199619293, + "learning_rate": 0.00011303202986155696, + "loss": 0.8858, + "step": 5211 + }, + { + "epoch": 1.82, + "grad_norm": 0.2132718712091446, + "learning_rate": 0.00011300457650686547, + "loss": 0.9174, + "step": 5212 + }, + { + "epoch": 1.82, + "grad_norm": 0.20165328681468964, + "learning_rate": 0.00011297712215513903, + "loss": 0.8968, + "step": 5213 + }, + { + "epoch": 1.82, + "grad_norm": 0.19978751242160797, + "learning_rate": 0.00011294966680848247, + "loss": 0.9146, + "step": 5214 + }, + { + "epoch": 1.82, + "grad_norm": 0.19985590875148773, + "learning_rate": 0.00011292221046900074, + "loss": 0.9535, + "step": 5215 + }, + { + "epoch": 1.82, + "grad_norm": 0.201857790350914, + "learning_rate": 0.00011289475313879885, + "loss": 0.8486, + "step": 5216 + }, + { + "epoch": 1.82, + "grad_norm": 0.20473961532115936, + "learning_rate": 0.0001128672948199819, + "loss": 0.8757, + "step": 5217 + }, + { + "epoch": 1.82, + "grad_norm": 0.2123999297618866, + "learning_rate": 0.00011283983551465511, + "loss": 0.8902, + "step": 5218 + }, + { + "epoch": 1.82, + "grad_norm": 0.19770240783691406, + "learning_rate": 0.00011281237522492367, + "loss": 0.9091, + "step": 5219 + }, + { + "epoch": 1.82, + "grad_norm": 0.20275746285915375, + "learning_rate": 0.00011278491395289296, + "loss": 0.9041, + "step": 5220 + }, + { + "epoch": 1.82, + "grad_norm": 0.19588282704353333, + "learning_rate": 0.00011275745170066834, + "loss": 0.8687, + "step": 5221 + }, + { + "epoch": 1.82, + "grad_norm": 0.20717737078666687, + "learning_rate": 0.0001127299884703553, + "loss": 0.9517, + "step": 5222 + }, + { + "epoch": 1.82, + "grad_norm": 0.2018229365348816, + "learning_rate": 0.00011270252426405939, + "loss": 0.8683, + "step": 5223 + }, + { + "epoch": 1.82, + "grad_norm": 0.20680849254131317, + "learning_rate": 0.00011267505908388625, + "loss": 0.8913, + "step": 5224 + }, + { + "epoch": 1.82, + "grad_norm": 0.2059405893087387, + "learning_rate": 0.00011264759293194156, + "loss": 0.8993, + "step": 5225 + }, + { + "epoch": 1.82, + "grad_norm": 0.20858971774578094, + "learning_rate": 0.0001126201258103311, + "loss": 0.9484, + "step": 5226 + }, + { + "epoch": 1.82, + "grad_norm": 0.20857854187488556, + "learning_rate": 0.00011259265772116072, + "loss": 0.8994, + "step": 5227 + }, + { + "epoch": 1.82, + "grad_norm": 0.1971004158258438, + "learning_rate": 0.00011256518866653636, + "loss": 0.867, + "step": 5228 + }, + { + "epoch": 1.82, + "grad_norm": 0.20058520138263702, + "learning_rate": 0.000112537718648564, + "loss": 0.9123, + "step": 5229 + }, + { + "epoch": 1.82, + "grad_norm": 0.19929702579975128, + "learning_rate": 0.00011251024766934973, + "loss": 0.9114, + "step": 5230 + }, + { + "epoch": 1.82, + "grad_norm": 0.20408372581005096, + "learning_rate": 0.00011248277573099966, + "loss": 0.9088, + "step": 5231 + }, + { + "epoch": 1.82, + "grad_norm": 0.21157945692539215, + "learning_rate": 0.00011245530283562, + "loss": 0.8944, + "step": 5232 + }, + { + "epoch": 1.82, + "grad_norm": 0.1991204172372818, + "learning_rate": 0.00011242782898531712, + "loss": 0.9014, + "step": 5233 + }, + { + "epoch": 1.82, + "grad_norm": 0.19715304672718048, + "learning_rate": 0.00011240035418219732, + "loss": 0.8788, + "step": 5234 + }, + { + "epoch": 1.82, + "grad_norm": 0.19844309985637665, + "learning_rate": 0.00011237287842836707, + "loss": 0.8362, + "step": 5235 + }, + { + "epoch": 1.82, + "grad_norm": 0.20528234541416168, + "learning_rate": 0.00011234540172593287, + "loss": 0.8645, + "step": 5236 + }, + { + "epoch": 1.82, + "grad_norm": 0.21071383357048035, + "learning_rate": 0.00011231792407700125, + "loss": 0.8801, + "step": 5237 + }, + { + "epoch": 1.82, + "grad_norm": 0.20560497045516968, + "learning_rate": 0.00011229044548367897, + "loss": 0.889, + "step": 5238 + }, + { + "epoch": 1.82, + "grad_norm": 0.19932425022125244, + "learning_rate": 0.00011226296594807271, + "loss": 0.9381, + "step": 5239 + }, + { + "epoch": 1.82, + "grad_norm": 0.21976758539676666, + "learning_rate": 0.00011223548547228924, + "loss": 0.8839, + "step": 5240 + }, + { + "epoch": 1.83, + "grad_norm": 0.20521189272403717, + "learning_rate": 0.0001122080040584355, + "loss": 0.9186, + "step": 5241 + }, + { + "epoch": 1.83, + "grad_norm": 0.2094995379447937, + "learning_rate": 0.00011218052170861835, + "loss": 0.956, + "step": 5242 + }, + { + "epoch": 1.83, + "grad_norm": 0.2032909095287323, + "learning_rate": 0.00011215303842494489, + "loss": 0.9085, + "step": 5243 + }, + { + "epoch": 1.83, + "grad_norm": 0.1991930603981018, + "learning_rate": 0.00011212555420952215, + "loss": 0.9086, + "step": 5244 + }, + { + "epoch": 1.83, + "grad_norm": 0.20247501134872437, + "learning_rate": 0.0001120980690644573, + "loss": 0.8857, + "step": 5245 + }, + { + "epoch": 1.83, + "grad_norm": 0.19612811505794525, + "learning_rate": 0.00011207058299185765, + "loss": 0.8563, + "step": 5246 + }, + { + "epoch": 1.83, + "grad_norm": 0.2090568244457245, + "learning_rate": 0.00011204309599383035, + "loss": 0.9022, + "step": 5247 + }, + { + "epoch": 1.83, + "grad_norm": 0.1988435536623001, + "learning_rate": 0.00011201560807248293, + "loss": 0.8883, + "step": 5248 + }, + { + "epoch": 1.83, + "grad_norm": 0.2111375778913498, + "learning_rate": 0.00011198811922992274, + "loss": 0.9505, + "step": 5249 + }, + { + "epoch": 1.83, + "grad_norm": 0.20622937381267548, + "learning_rate": 0.0001119606294682573, + "loss": 0.9294, + "step": 5250 + }, + { + "epoch": 1.83, + "grad_norm": 0.21491630375385284, + "learning_rate": 0.00011193313878959427, + "loss": 0.9827, + "step": 5251 + }, + { + "epoch": 1.83, + "grad_norm": 0.20232580602169037, + "learning_rate": 0.00011190564719604118, + "loss": 0.9016, + "step": 5252 + }, + { + "epoch": 1.83, + "grad_norm": 0.2048744410276413, + "learning_rate": 0.00011187815468970586, + "loss": 0.8494, + "step": 5253 + }, + { + "epoch": 1.83, + "grad_norm": 0.2030027210712433, + "learning_rate": 0.00011185066127269607, + "loss": 0.9294, + "step": 5254 + }, + { + "epoch": 1.83, + "grad_norm": 0.21077583730220795, + "learning_rate": 0.00011182316694711965, + "loss": 0.8665, + "step": 5255 + }, + { + "epoch": 1.83, + "grad_norm": 0.201682448387146, + "learning_rate": 0.00011179567171508463, + "loss": 0.8872, + "step": 5256 + }, + { + "epoch": 1.83, + "grad_norm": 0.20252567529678345, + "learning_rate": 0.00011176817557869887, + "loss": 0.872, + "step": 5257 + }, + { + "epoch": 1.83, + "grad_norm": 0.1869186908006668, + "learning_rate": 0.00011174067854007058, + "loss": 0.8627, + "step": 5258 + }, + { + "epoch": 1.83, + "grad_norm": 0.20626534521579742, + "learning_rate": 0.00011171318060130783, + "loss": 0.959, + "step": 5259 + }, + { + "epoch": 1.83, + "grad_norm": 0.21301251649856567, + "learning_rate": 0.00011168568176451881, + "loss": 0.8972, + "step": 5260 + }, + { + "epoch": 1.83, + "grad_norm": 0.21265006065368652, + "learning_rate": 0.00011165818203181189, + "loss": 0.9791, + "step": 5261 + }, + { + "epoch": 1.83, + "grad_norm": 0.21141482889652252, + "learning_rate": 0.00011163068140529531, + "loss": 0.9556, + "step": 5262 + }, + { + "epoch": 1.83, + "grad_norm": 0.1883353292942047, + "learning_rate": 0.00011160317988707759, + "loss": 0.8423, + "step": 5263 + }, + { + "epoch": 1.83, + "grad_norm": 0.19268368184566498, + "learning_rate": 0.00011157567747926716, + "loss": 0.89, + "step": 5264 + }, + { + "epoch": 1.83, + "grad_norm": 0.1984165757894516, + "learning_rate": 0.00011154817418397257, + "loss": 0.9212, + "step": 5265 + }, + { + "epoch": 1.83, + "grad_norm": 0.2028784602880478, + "learning_rate": 0.0001115206700033025, + "loss": 0.8673, + "step": 5266 + }, + { + "epoch": 1.83, + "grad_norm": 0.19984976947307587, + "learning_rate": 0.00011149316493936555, + "loss": 0.9039, + "step": 5267 + }, + { + "epoch": 1.83, + "grad_norm": 0.20026452839374542, + "learning_rate": 0.00011146565899427057, + "loss": 0.9054, + "step": 5268 + }, + { + "epoch": 1.84, + "grad_norm": 0.20769165456295013, + "learning_rate": 0.00011143815217012633, + "loss": 0.9541, + "step": 5269 + }, + { + "epoch": 1.84, + "grad_norm": 0.2046111524105072, + "learning_rate": 0.00011141064446904172, + "loss": 0.9185, + "step": 5270 + }, + { + "epoch": 1.84, + "grad_norm": 0.2113402932882309, + "learning_rate": 0.00011138313589312577, + "loss": 0.9294, + "step": 5271 + }, + { + "epoch": 1.84, + "grad_norm": 0.20995759963989258, + "learning_rate": 0.00011135562644448741, + "loss": 0.8825, + "step": 5272 + }, + { + "epoch": 1.84, + "grad_norm": 0.20764729380607605, + "learning_rate": 0.0001113281161252358, + "loss": 0.8985, + "step": 5273 + }, + { + "epoch": 1.84, + "grad_norm": 0.21376466751098633, + "learning_rate": 0.00011130060493748007, + "loss": 0.9318, + "step": 5274 + }, + { + "epoch": 1.84, + "grad_norm": 0.19655349850654602, + "learning_rate": 0.00011127309288332945, + "loss": 0.8771, + "step": 5275 + }, + { + "epoch": 1.84, + "grad_norm": 0.19957830011844635, + "learning_rate": 0.00011124557996489329, + "loss": 0.89, + "step": 5276 + }, + { + "epoch": 1.84, + "grad_norm": 0.19681084156036377, + "learning_rate": 0.00011121806618428085, + "loss": 0.8815, + "step": 5277 + }, + { + "epoch": 1.84, + "grad_norm": 0.21147172152996063, + "learning_rate": 0.00011119055154360165, + "loss": 0.8854, + "step": 5278 + }, + { + "epoch": 1.84, + "grad_norm": 0.19879549741744995, + "learning_rate": 0.00011116303604496517, + "loss": 0.8512, + "step": 5279 + }, + { + "epoch": 1.84, + "grad_norm": 0.1971660852432251, + "learning_rate": 0.00011113551969048089, + "loss": 0.8922, + "step": 5280 + }, + { + "epoch": 1.84, + "grad_norm": 0.20926682651042938, + "learning_rate": 0.0001111080024822585, + "loss": 0.9082, + "step": 5281 + }, + { + "epoch": 1.84, + "grad_norm": 0.20934419333934784, + "learning_rate": 0.0001110804844224077, + "loss": 0.948, + "step": 5282 + }, + { + "epoch": 1.84, + "grad_norm": 0.2003125250339508, + "learning_rate": 0.0001110529655130382, + "loss": 0.8796, + "step": 5283 + }, + { + "epoch": 1.84, + "grad_norm": 0.208330899477005, + "learning_rate": 0.00011102544575625988, + "loss": 0.92, + "step": 5284 + }, + { + "epoch": 1.84, + "grad_norm": 0.20303812623023987, + "learning_rate": 0.00011099792515418252, + "loss": 0.9019, + "step": 5285 + }, + { + "epoch": 1.84, + "grad_norm": 0.19247807562351227, + "learning_rate": 0.00011097040370891621, + "loss": 0.8892, + "step": 5286 + }, + { + "epoch": 1.84, + "grad_norm": 0.20970343053340912, + "learning_rate": 0.00011094288142257086, + "loss": 0.9262, + "step": 5287 + }, + { + "epoch": 1.84, + "grad_norm": 0.20390614867210388, + "learning_rate": 0.00011091535829725659, + "loss": 0.9264, + "step": 5288 + }, + { + "epoch": 1.84, + "grad_norm": 0.19981978833675385, + "learning_rate": 0.00011088783433508352, + "loss": 0.8799, + "step": 5289 + }, + { + "epoch": 1.84, + "grad_norm": 0.20050524175167084, + "learning_rate": 0.00011086030953816187, + "loss": 0.8639, + "step": 5290 + }, + { + "epoch": 1.84, + "grad_norm": 0.2076718658208847, + "learning_rate": 0.0001108327839086019, + "loss": 0.8632, + "step": 5291 + }, + { + "epoch": 1.84, + "grad_norm": 0.21506258845329285, + "learning_rate": 0.000110805257448514, + "loss": 0.912, + "step": 5292 + }, + { + "epoch": 1.84, + "grad_norm": 0.21158722043037415, + "learning_rate": 0.00011077773016000849, + "loss": 0.936, + "step": 5293 + }, + { + "epoch": 1.84, + "grad_norm": 0.21115998923778534, + "learning_rate": 0.00011075020204519588, + "loss": 0.9155, + "step": 5294 + }, + { + "epoch": 1.84, + "grad_norm": 0.20540079474449158, + "learning_rate": 0.00011072267310618669, + "loss": 0.9158, + "step": 5295 + }, + { + "epoch": 1.84, + "grad_norm": 0.19588345289230347, + "learning_rate": 0.0001106951433450915, + "loss": 0.846, + "step": 5296 + }, + { + "epoch": 1.85, + "grad_norm": 0.20122110843658447, + "learning_rate": 0.00011066761276402095, + "loss": 0.846, + "step": 5297 + }, + { + "epoch": 1.85, + "grad_norm": 0.19394049048423767, + "learning_rate": 0.00011064008136508581, + "loss": 0.8491, + "step": 5298 + }, + { + "epoch": 1.85, + "grad_norm": 0.20480981469154358, + "learning_rate": 0.0001106125491503968, + "loss": 0.8792, + "step": 5299 + }, + { + "epoch": 1.85, + "grad_norm": 0.205979123711586, + "learning_rate": 0.00011058501612206476, + "loss": 0.907, + "step": 5300 + }, + { + "epoch": 1.85, + "grad_norm": 0.20306456089019775, + "learning_rate": 0.00011055748228220062, + "loss": 0.8966, + "step": 5301 + }, + { + "epoch": 1.85, + "grad_norm": 0.20474489033222198, + "learning_rate": 0.00011052994763291533, + "loss": 0.8474, + "step": 5302 + }, + { + "epoch": 1.85, + "grad_norm": 0.19772367179393768, + "learning_rate": 0.00011050241217631994, + "loss": 0.914, + "step": 5303 + }, + { + "epoch": 1.85, + "grad_norm": 0.2080567330121994, + "learning_rate": 0.00011047487591452552, + "loss": 0.8831, + "step": 5304 + }, + { + "epoch": 1.85, + "grad_norm": 0.20726226270198822, + "learning_rate": 0.0001104473388496432, + "loss": 0.9431, + "step": 5305 + }, + { + "epoch": 1.85, + "grad_norm": 0.20143887400627136, + "learning_rate": 0.00011041980098378421, + "loss": 0.9187, + "step": 5306 + }, + { + "epoch": 1.85, + "grad_norm": 0.20895779132843018, + "learning_rate": 0.00011039226231905986, + "loss": 0.8455, + "step": 5307 + }, + { + "epoch": 1.85, + "grad_norm": 0.20928290486335754, + "learning_rate": 0.00011036472285758144, + "loss": 0.9203, + "step": 5308 + }, + { + "epoch": 1.85, + "grad_norm": 0.20436181128025055, + "learning_rate": 0.00011033718260146036, + "loss": 0.9224, + "step": 5309 + }, + { + "epoch": 1.85, + "grad_norm": 0.19997340440750122, + "learning_rate": 0.00011030964155280803, + "loss": 0.9114, + "step": 5310 + }, + { + "epoch": 1.85, + "grad_norm": 0.19829915463924408, + "learning_rate": 0.00011028209971373605, + "loss": 0.8384, + "step": 5311 + }, + { + "epoch": 1.85, + "grad_norm": 0.19507120549678802, + "learning_rate": 0.00011025455708635595, + "loss": 0.8925, + "step": 5312 + }, + { + "epoch": 1.85, + "grad_norm": 0.1935853362083435, + "learning_rate": 0.0001102270136727794, + "loss": 0.8382, + "step": 5313 + }, + { + "epoch": 1.85, + "grad_norm": 0.2118246704339981, + "learning_rate": 0.00011019946947511804, + "loss": 0.9267, + "step": 5314 + }, + { + "epoch": 1.85, + "grad_norm": 0.20547886192798615, + "learning_rate": 0.00011017192449548365, + "loss": 0.9465, + "step": 5315 + }, + { + "epoch": 1.85, + "grad_norm": 0.20173491537570953, + "learning_rate": 0.00011014437873598811, + "loss": 0.9013, + "step": 5316 + }, + { + "epoch": 1.85, + "grad_norm": 0.19979619979858398, + "learning_rate": 0.00011011683219874323, + "loss": 0.9181, + "step": 5317 + }, + { + "epoch": 1.85, + "grad_norm": 0.2074444591999054, + "learning_rate": 0.00011008928488586096, + "loss": 0.9071, + "step": 5318 + }, + { + "epoch": 1.85, + "grad_norm": 0.2015903890132904, + "learning_rate": 0.00011006173679945333, + "loss": 0.8847, + "step": 5319 + }, + { + "epoch": 1.85, + "grad_norm": 0.20304925739765167, + "learning_rate": 0.00011003418794163236, + "loss": 0.8981, + "step": 5320 + }, + { + "epoch": 1.85, + "grad_norm": 0.209546759724617, + "learning_rate": 0.00011000663831451017, + "loss": 0.9768, + "step": 5321 + }, + { + "epoch": 1.85, + "grad_norm": 0.20055563747882843, + "learning_rate": 0.00010997908792019898, + "loss": 0.9023, + "step": 5322 + }, + { + "epoch": 1.85, + "grad_norm": 0.20044977962970734, + "learning_rate": 0.00010995153676081096, + "loss": 0.9173, + "step": 5323 + }, + { + "epoch": 1.85, + "grad_norm": 0.20793165266513824, + "learning_rate": 0.00010992398483845845, + "loss": 0.9098, + "step": 5324 + }, + { + "epoch": 1.85, + "grad_norm": 0.2018725723028183, + "learning_rate": 0.00010989643215525375, + "loss": 0.8702, + "step": 5325 + }, + { + "epoch": 1.86, + "grad_norm": 0.21068057417869568, + "learning_rate": 0.00010986887871330934, + "loss": 0.9116, + "step": 5326 + }, + { + "epoch": 1.86, + "grad_norm": 0.20038604736328125, + "learning_rate": 0.00010984132451473763, + "loss": 0.9127, + "step": 5327 + }, + { + "epoch": 1.86, + "grad_norm": 0.21165314316749573, + "learning_rate": 0.00010981376956165117, + "loss": 0.939, + "step": 5328 + }, + { + "epoch": 1.86, + "grad_norm": 0.2063116729259491, + "learning_rate": 0.00010978621385616255, + "loss": 0.8957, + "step": 5329 + }, + { + "epoch": 1.86, + "grad_norm": 0.20496952533721924, + "learning_rate": 0.00010975865740038438, + "loss": 0.9017, + "step": 5330 + }, + { + "epoch": 1.86, + "grad_norm": 0.21283504366874695, + "learning_rate": 0.00010973110019642938, + "loss": 0.9653, + "step": 5331 + }, + { + "epoch": 1.86, + "grad_norm": 0.2020772397518158, + "learning_rate": 0.0001097035422464103, + "loss": 0.9211, + "step": 5332 + }, + { + "epoch": 1.86, + "grad_norm": 0.20327343046665192, + "learning_rate": 0.00010967598355244, + "loss": 0.8538, + "step": 5333 + }, + { + "epoch": 1.86, + "grad_norm": 0.19840683043003082, + "learning_rate": 0.00010964842411663128, + "loss": 0.8665, + "step": 5334 + }, + { + "epoch": 1.86, + "grad_norm": 0.20020124316215515, + "learning_rate": 0.00010962086394109705, + "loss": 0.9099, + "step": 5335 + }, + { + "epoch": 1.86, + "grad_norm": 0.2016928642988205, + "learning_rate": 0.0001095933030279504, + "loss": 0.862, + "step": 5336 + }, + { + "epoch": 1.86, + "grad_norm": 0.21030563116073608, + "learning_rate": 0.00010956574137930429, + "loss": 0.9831, + "step": 5337 + }, + { + "epoch": 1.86, + "grad_norm": 0.20427197217941284, + "learning_rate": 0.00010953817899727183, + "loss": 0.9434, + "step": 5338 + }, + { + "epoch": 1.86, + "grad_norm": 0.1986006200313568, + "learning_rate": 0.00010951061588396618, + "loss": 0.8907, + "step": 5339 + }, + { + "epoch": 1.86, + "grad_norm": 0.20330186188220978, + "learning_rate": 0.00010948305204150054, + "loss": 0.9612, + "step": 5340 + }, + { + "epoch": 1.86, + "grad_norm": 0.20180857181549072, + "learning_rate": 0.00010945548747198819, + "loss": 0.9131, + "step": 5341 + }, + { + "epoch": 1.86, + "grad_norm": 0.19510334730148315, + "learning_rate": 0.00010942792217754245, + "loss": 0.8674, + "step": 5342 + }, + { + "epoch": 1.86, + "grad_norm": 0.20021146535873413, + "learning_rate": 0.00010940035616027669, + "loss": 0.9286, + "step": 5343 + }, + { + "epoch": 1.86, + "grad_norm": 0.202460378408432, + "learning_rate": 0.00010937278942230435, + "loss": 0.9394, + "step": 5344 + }, + { + "epoch": 1.86, + "grad_norm": 0.2063404768705368, + "learning_rate": 0.00010934522196573887, + "loss": 0.9157, + "step": 5345 + }, + { + "epoch": 1.86, + "grad_norm": 0.2022324502468109, + "learning_rate": 0.00010931765379269385, + "loss": 0.9371, + "step": 5346 + }, + { + "epoch": 1.86, + "grad_norm": 0.2042325735092163, + "learning_rate": 0.00010929008490528287, + "loss": 0.8429, + "step": 5347 + }, + { + "epoch": 1.86, + "grad_norm": 0.20193937420845032, + "learning_rate": 0.0001092625153056196, + "loss": 0.8439, + "step": 5348 + }, + { + "epoch": 1.86, + "grad_norm": 0.19789718091487885, + "learning_rate": 0.0001092349449958177, + "loss": 0.8861, + "step": 5349 + }, + { + "epoch": 1.86, + "grad_norm": 0.20109151303768158, + "learning_rate": 0.00010920737397799095, + "loss": 0.8545, + "step": 5350 + }, + { + "epoch": 1.86, + "grad_norm": 0.21629342436790466, + "learning_rate": 0.00010917980225425317, + "loss": 0.9521, + "step": 5351 + }, + { + "epoch": 1.86, + "grad_norm": 0.21499894559383392, + "learning_rate": 0.00010915222982671825, + "loss": 0.9516, + "step": 5352 + }, + { + "epoch": 1.86, + "grad_norm": 0.2034975290298462, + "learning_rate": 0.0001091246566975001, + "loss": 0.9194, + "step": 5353 + }, + { + "epoch": 1.87, + "grad_norm": 0.19859300553798676, + "learning_rate": 0.00010909708286871265, + "loss": 0.8968, + "step": 5354 + }, + { + "epoch": 1.87, + "grad_norm": 0.20240305364131927, + "learning_rate": 0.00010906950834246999, + "loss": 0.8883, + "step": 5355 + }, + { + "epoch": 1.87, + "grad_norm": 0.21117694675922394, + "learning_rate": 0.0001090419331208862, + "loss": 0.9253, + "step": 5356 + }, + { + "epoch": 1.87, + "grad_norm": 0.20424620807170868, + "learning_rate": 0.00010901435720607538, + "loss": 0.9334, + "step": 5357 + }, + { + "epoch": 1.87, + "grad_norm": 0.19281210005283356, + "learning_rate": 0.00010898678060015175, + "loss": 0.8969, + "step": 5358 + }, + { + "epoch": 1.87, + "grad_norm": 0.20645880699157715, + "learning_rate": 0.00010895920330522956, + "loss": 0.8592, + "step": 5359 + }, + { + "epoch": 1.87, + "grad_norm": 0.18876774609088898, + "learning_rate": 0.00010893162532342305, + "loss": 0.8815, + "step": 5360 + }, + { + "epoch": 1.87, + "grad_norm": 0.20588569343090057, + "learning_rate": 0.00010890404665684665, + "loss": 0.8889, + "step": 5361 + }, + { + "epoch": 1.87, + "grad_norm": 0.20744764804840088, + "learning_rate": 0.00010887646730761472, + "loss": 0.8864, + "step": 5362 + }, + { + "epoch": 1.87, + "grad_norm": 0.2112501859664917, + "learning_rate": 0.0001088488872778417, + "loss": 0.9765, + "step": 5363 + }, + { + "epoch": 1.87, + "grad_norm": 0.1971355676651001, + "learning_rate": 0.00010882130656964213, + "loss": 0.9024, + "step": 5364 + }, + { + "epoch": 1.87, + "grad_norm": 0.19987548887729645, + "learning_rate": 0.00010879372518513053, + "loss": 0.8502, + "step": 5365 + }, + { + "epoch": 1.87, + "grad_norm": 0.21339647471904755, + "learning_rate": 0.00010876614312642153, + "loss": 0.9464, + "step": 5366 + }, + { + "epoch": 1.87, + "grad_norm": 0.20979535579681396, + "learning_rate": 0.00010873856039562982, + "loss": 0.9526, + "step": 5367 + }, + { + "epoch": 1.87, + "grad_norm": 0.20245172083377838, + "learning_rate": 0.00010871097699487005, + "loss": 0.9115, + "step": 5368 + }, + { + "epoch": 1.87, + "grad_norm": 0.2003164291381836, + "learning_rate": 0.00010868339292625705, + "loss": 0.9226, + "step": 5369 + }, + { + "epoch": 1.87, + "grad_norm": 0.2036878913640976, + "learning_rate": 0.00010865580819190557, + "loss": 0.8324, + "step": 5370 + }, + { + "epoch": 1.87, + "grad_norm": 0.20090122520923615, + "learning_rate": 0.00010862822279393054, + "loss": 0.9138, + "step": 5371 + }, + { + "epoch": 1.87, + "grad_norm": 0.20926058292388916, + "learning_rate": 0.00010860063673444685, + "loss": 0.9332, + "step": 5372 + }, + { + "epoch": 1.87, + "grad_norm": 0.20082472264766693, + "learning_rate": 0.00010857305001556944, + "loss": 0.8328, + "step": 5373 + }, + { + "epoch": 1.87, + "grad_norm": 0.21082501113414764, + "learning_rate": 0.00010854546263941339, + "loss": 0.9512, + "step": 5374 + }, + { + "epoch": 1.87, + "grad_norm": 0.19284385442733765, + "learning_rate": 0.0001085178746080937, + "loss": 0.9419, + "step": 5375 + }, + { + "epoch": 1.87, + "grad_norm": 0.19388435781002045, + "learning_rate": 0.00010849028592372555, + "loss": 0.9202, + "step": 5376 + }, + { + "epoch": 1.87, + "grad_norm": 0.2086414396762848, + "learning_rate": 0.00010846269658842407, + "loss": 0.8658, + "step": 5377 + }, + { + "epoch": 1.87, + "grad_norm": 0.20063738524913788, + "learning_rate": 0.00010843510660430447, + "loss": 0.9396, + "step": 5378 + }, + { + "epoch": 1.87, + "grad_norm": 0.19891178607940674, + "learning_rate": 0.00010840751597348211, + "loss": 0.8995, + "step": 5379 + }, + { + "epoch": 1.87, + "grad_norm": 0.20803341269493103, + "learning_rate": 0.00010837992469807218, + "loss": 0.9674, + "step": 5380 + }, + { + "epoch": 1.87, + "grad_norm": 0.20508162677288055, + "learning_rate": 0.00010835233278019015, + "loss": 0.8785, + "step": 5381 + }, + { + "epoch": 1.87, + "grad_norm": 0.19641000032424927, + "learning_rate": 0.00010832474022195138, + "loss": 0.8814, + "step": 5382 + }, + { + "epoch": 1.88, + "grad_norm": 0.1972811073064804, + "learning_rate": 0.00010829714702547132, + "loss": 0.8922, + "step": 5383 + }, + { + "epoch": 1.88, + "grad_norm": 0.2041017860174179, + "learning_rate": 0.00010826955319286558, + "loss": 0.9163, + "step": 5384 + }, + { + "epoch": 1.88, + "grad_norm": 0.21064582467079163, + "learning_rate": 0.00010824195872624963, + "loss": 0.8693, + "step": 5385 + }, + { + "epoch": 1.88, + "grad_norm": 0.20119690895080566, + "learning_rate": 0.00010821436362773912, + "loss": 1.0056, + "step": 5386 + }, + { + "epoch": 1.88, + "grad_norm": 0.20159223675727844, + "learning_rate": 0.0001081867678994497, + "loss": 0.922, + "step": 5387 + }, + { + "epoch": 1.88, + "grad_norm": 0.2144201546907425, + "learning_rate": 0.00010815917154349706, + "loss": 0.9242, + "step": 5388 + }, + { + "epoch": 1.88, + "grad_norm": 0.20381437242031097, + "learning_rate": 0.00010813157456199705, + "loss": 0.9295, + "step": 5389 + }, + { + "epoch": 1.88, + "grad_norm": 0.19609691202640533, + "learning_rate": 0.00010810397695706535, + "loss": 0.8845, + "step": 5390 + }, + { + "epoch": 1.88, + "grad_norm": 0.20411543548107147, + "learning_rate": 0.0001080763787308179, + "loss": 0.9235, + "step": 5391 + }, + { + "epoch": 1.88, + "grad_norm": 0.20571023225784302, + "learning_rate": 0.00010804877988537059, + "loss": 0.852, + "step": 5392 + }, + { + "epoch": 1.88, + "grad_norm": 0.2107377052307129, + "learning_rate": 0.00010802118042283931, + "loss": 0.9347, + "step": 5393 + }, + { + "epoch": 1.88, + "grad_norm": 0.19296841323375702, + "learning_rate": 0.00010799358034534015, + "loss": 0.7903, + "step": 5394 + }, + { + "epoch": 1.88, + "grad_norm": 0.19987763464450836, + "learning_rate": 0.00010796597965498908, + "loss": 0.8674, + "step": 5395 + }, + { + "epoch": 1.88, + "grad_norm": 0.19931264221668243, + "learning_rate": 0.00010793837835390219, + "loss": 0.914, + "step": 5396 + }, + { + "epoch": 1.88, + "grad_norm": 0.20454499125480652, + "learning_rate": 0.00010791077644419567, + "loss": 0.8788, + "step": 5397 + }, + { + "epoch": 1.88, + "grad_norm": 0.20866768062114716, + "learning_rate": 0.00010788317392798563, + "loss": 0.9846, + "step": 5398 + }, + { + "epoch": 1.88, + "grad_norm": 0.20036864280700684, + "learning_rate": 0.00010785557080738839, + "loss": 0.8952, + "step": 5399 + }, + { + "epoch": 1.88, + "grad_norm": 0.19880080223083496, + "learning_rate": 0.00010782796708452014, + "loss": 0.919, + "step": 5400 + }, + { + "epoch": 1.88, + "grad_norm": 0.19589628279209137, + "learning_rate": 0.00010780036276149728, + "loss": 0.8344, + "step": 5401 + }, + { + "epoch": 1.88, + "grad_norm": 0.20191796123981476, + "learning_rate": 0.00010777275784043613, + "loss": 0.8675, + "step": 5402 + }, + { + "epoch": 1.88, + "grad_norm": 0.20033222436904907, + "learning_rate": 0.00010774515232345305, + "loss": 0.9171, + "step": 5403 + }, + { + "epoch": 1.88, + "grad_norm": 0.20329776406288147, + "learning_rate": 0.00010771754621266466, + "loss": 0.8448, + "step": 5404 + }, + { + "epoch": 1.88, + "grad_norm": 0.20034590363502502, + "learning_rate": 0.0001076899395101873, + "loss": 0.9414, + "step": 5405 + }, + { + "epoch": 1.88, + "grad_norm": 0.20257985591888428, + "learning_rate": 0.00010766233221813762, + "loss": 0.8854, + "step": 5406 + }, + { + "epoch": 1.88, + "grad_norm": 0.20048066973686218, + "learning_rate": 0.00010763472433863217, + "loss": 0.8535, + "step": 5407 + }, + { + "epoch": 1.88, + "grad_norm": 0.20843522250652313, + "learning_rate": 0.0001076071158737876, + "loss": 0.9159, + "step": 5408 + }, + { + "epoch": 1.88, + "grad_norm": 0.19864386320114136, + "learning_rate": 0.00010757950682572063, + "loss": 0.857, + "step": 5409 + }, + { + "epoch": 1.88, + "grad_norm": 0.193053737282753, + "learning_rate": 0.00010755189719654795, + "loss": 0.8336, + "step": 5410 + }, + { + "epoch": 1.89, + "grad_norm": 0.19992056488990784, + "learning_rate": 0.00010752428698838637, + "loss": 0.8091, + "step": 5411 + }, + { + "epoch": 1.89, + "grad_norm": 0.2062869817018509, + "learning_rate": 0.00010749667620335266, + "loss": 0.8749, + "step": 5412 + }, + { + "epoch": 1.89, + "grad_norm": 0.203649640083313, + "learning_rate": 0.00010746906484356372, + "loss": 0.8286, + "step": 5413 + }, + { + "epoch": 1.89, + "grad_norm": 0.2094353884458542, + "learning_rate": 0.00010744145291113645, + "loss": 0.9006, + "step": 5414 + }, + { + "epoch": 1.89, + "grad_norm": 0.2003667950630188, + "learning_rate": 0.00010741384040818782, + "loss": 0.8428, + "step": 5415 + }, + { + "epoch": 1.89, + "grad_norm": 0.20398718118667603, + "learning_rate": 0.00010738622733683479, + "loss": 0.9237, + "step": 5416 + }, + { + "epoch": 1.89, + "grad_norm": 0.19468474388122559, + "learning_rate": 0.00010735861369919442, + "loss": 0.8502, + "step": 5417 + }, + { + "epoch": 1.89, + "grad_norm": 0.2077466994524002, + "learning_rate": 0.00010733099949738376, + "loss": 0.9258, + "step": 5418 + }, + { + "epoch": 1.89, + "grad_norm": 0.19644780457019806, + "learning_rate": 0.00010730338473352001, + "loss": 0.8911, + "step": 5419 + }, + { + "epoch": 1.89, + "grad_norm": 0.19907157123088837, + "learning_rate": 0.00010727576940972031, + "loss": 0.9216, + "step": 5420 + }, + { + "epoch": 1.89, + "grad_norm": 0.21069294214248657, + "learning_rate": 0.00010724815352810183, + "loss": 0.9574, + "step": 5421 + }, + { + "epoch": 1.89, + "grad_norm": 0.20110972225666046, + "learning_rate": 0.00010722053709078184, + "loss": 0.8654, + "step": 5422 + }, + { + "epoch": 1.89, + "grad_norm": 0.2043222188949585, + "learning_rate": 0.00010719292009987765, + "loss": 0.8942, + "step": 5423 + }, + { + "epoch": 1.89, + "grad_norm": 0.19561101496219635, + "learning_rate": 0.0001071653025575066, + "loss": 0.8844, + "step": 5424 + }, + { + "epoch": 1.89, + "grad_norm": 0.21482384204864502, + "learning_rate": 0.00010713768446578612, + "loss": 0.868, + "step": 5425 + }, + { + "epoch": 1.89, + "grad_norm": 0.1975470781326294, + "learning_rate": 0.00010711006582683352, + "loss": 0.8686, + "step": 5426 + }, + { + "epoch": 1.89, + "grad_norm": 0.18916769325733185, + "learning_rate": 0.00010708244664276634, + "loss": 0.8201, + "step": 5427 + }, + { + "epoch": 1.89, + "grad_norm": 0.20201200246810913, + "learning_rate": 0.00010705482691570207, + "loss": 0.9239, + "step": 5428 + }, + { + "epoch": 1.89, + "grad_norm": 0.19807839393615723, + "learning_rate": 0.00010702720664775829, + "loss": 0.8829, + "step": 5429 + }, + { + "epoch": 1.89, + "grad_norm": 0.1970471292734146, + "learning_rate": 0.00010699958584105261, + "loss": 0.9509, + "step": 5430 + }, + { + "epoch": 1.89, + "grad_norm": 0.20313721895217896, + "learning_rate": 0.00010697196449770255, + "loss": 0.8714, + "step": 5431 + }, + { + "epoch": 1.89, + "grad_norm": 0.19797174632549286, + "learning_rate": 0.00010694434261982589, + "loss": 0.9216, + "step": 5432 + }, + { + "epoch": 1.89, + "grad_norm": 0.20096881687641144, + "learning_rate": 0.00010691672020954028, + "loss": 0.906, + "step": 5433 + }, + { + "epoch": 1.89, + "grad_norm": 0.2070157676935196, + "learning_rate": 0.00010688909726896353, + "loss": 0.9213, + "step": 5434 + }, + { + "epoch": 1.89, + "grad_norm": 0.20100905001163483, + "learning_rate": 0.00010686147380021342, + "loss": 0.8678, + "step": 5435 + }, + { + "epoch": 1.89, + "grad_norm": 0.2077058106660843, + "learning_rate": 0.00010683384980540776, + "loss": 0.9615, + "step": 5436 + }, + { + "epoch": 1.89, + "grad_norm": 0.21475407481193542, + "learning_rate": 0.00010680622528666444, + "loss": 0.967, + "step": 5437 + }, + { + "epoch": 1.89, + "grad_norm": 0.2054007202386856, + "learning_rate": 0.00010677860024610139, + "loss": 0.8804, + "step": 5438 + }, + { + "epoch": 1.9, + "grad_norm": 0.20920714735984802, + "learning_rate": 0.00010675097468583652, + "loss": 0.9073, + "step": 5439 + }, + { + "epoch": 1.9, + "grad_norm": 0.19480057060718536, + "learning_rate": 0.00010672334860798794, + "loss": 0.8315, + "step": 5440 + }, + { + "epoch": 1.9, + "grad_norm": 0.20092402398586273, + "learning_rate": 0.00010669572201467355, + "loss": 0.8936, + "step": 5441 + }, + { + "epoch": 1.9, + "grad_norm": 0.20771557092666626, + "learning_rate": 0.00010666809490801148, + "loss": 0.8878, + "step": 5442 + }, + { + "epoch": 1.9, + "grad_norm": 0.21026811003684998, + "learning_rate": 0.00010664046729011987, + "loss": 0.9235, + "step": 5443 + }, + { + "epoch": 1.9, + "grad_norm": 0.21163447201251984, + "learning_rate": 0.00010661283916311684, + "loss": 0.9107, + "step": 5444 + }, + { + "epoch": 1.9, + "grad_norm": 0.20571337640285492, + "learning_rate": 0.00010658521052912065, + "loss": 0.9288, + "step": 5445 + }, + { + "epoch": 1.9, + "grad_norm": 0.20502333343029022, + "learning_rate": 0.00010655758139024942, + "loss": 0.8934, + "step": 5446 + }, + { + "epoch": 1.9, + "grad_norm": 0.19831927120685577, + "learning_rate": 0.00010652995174862152, + "loss": 0.8669, + "step": 5447 + }, + { + "epoch": 1.9, + "grad_norm": 0.21284537017345428, + "learning_rate": 0.00010650232160635519, + "loss": 0.8566, + "step": 5448 + }, + { + "epoch": 1.9, + "grad_norm": 0.2107332944869995, + "learning_rate": 0.00010647469096556883, + "loss": 0.9208, + "step": 5449 + }, + { + "epoch": 1.9, + "grad_norm": 0.19804082810878754, + "learning_rate": 0.00010644705982838081, + "loss": 0.9367, + "step": 5450 + }, + { + "epoch": 1.9, + "grad_norm": 0.20850442349910736, + "learning_rate": 0.00010641942819690953, + "loss": 0.9054, + "step": 5451 + }, + { + "epoch": 1.9, + "grad_norm": 0.20339353382587433, + "learning_rate": 0.0001063917960732735, + "loss": 0.9204, + "step": 5452 + }, + { + "epoch": 1.9, + "grad_norm": 0.19770273566246033, + "learning_rate": 0.00010636416345959117, + "loss": 0.8832, + "step": 5453 + }, + { + "epoch": 1.9, + "grad_norm": 0.21037791669368744, + "learning_rate": 0.0001063365303579811, + "loss": 0.9366, + "step": 5454 + }, + { + "epoch": 1.9, + "grad_norm": 0.20975439250469208, + "learning_rate": 0.00010630889677056189, + "loss": 0.8848, + "step": 5455 + }, + { + "epoch": 1.9, + "grad_norm": 0.21157856285572052, + "learning_rate": 0.00010628126269945211, + "loss": 0.9573, + "step": 5456 + }, + { + "epoch": 1.9, + "grad_norm": 0.18961650133132935, + "learning_rate": 0.00010625362814677043, + "loss": 0.7911, + "step": 5457 + }, + { + "epoch": 1.9, + "grad_norm": 0.2068662941455841, + "learning_rate": 0.0001062259931146355, + "loss": 0.9004, + "step": 5458 + }, + { + "epoch": 1.9, + "grad_norm": 0.2038230299949646, + "learning_rate": 0.00010619835760516612, + "loss": 0.9452, + "step": 5459 + }, + { + "epoch": 1.9, + "grad_norm": 0.20304733514785767, + "learning_rate": 0.00010617072162048099, + "loss": 0.8527, + "step": 5460 + }, + { + "epoch": 1.9, + "grad_norm": 0.20015372335910797, + "learning_rate": 0.0001061430851626989, + "loss": 0.8984, + "step": 5461 + }, + { + "epoch": 1.9, + "grad_norm": 0.2066597193479538, + "learning_rate": 0.00010611544823393873, + "loss": 0.9431, + "step": 5462 + }, + { + "epoch": 1.9, + "grad_norm": 0.20975813269615173, + "learning_rate": 0.00010608781083631931, + "loss": 0.9342, + "step": 5463 + }, + { + "epoch": 1.9, + "grad_norm": 0.1995689421892166, + "learning_rate": 0.00010606017297195956, + "loss": 0.9373, + "step": 5464 + }, + { + "epoch": 1.9, + "grad_norm": 0.19717445969581604, + "learning_rate": 0.0001060325346429784, + "loss": 0.9466, + "step": 5465 + }, + { + "epoch": 1.9, + "grad_norm": 0.2298421710729599, + "learning_rate": 0.00010600489585149484, + "loss": 0.9011, + "step": 5466 + }, + { + "epoch": 1.9, + "grad_norm": 0.22285929322242737, + "learning_rate": 0.00010597725659962788, + "loss": 0.9999, + "step": 5467 + }, + { + "epoch": 1.91, + "grad_norm": 0.2079205960035324, + "learning_rate": 0.00010594961688949654, + "loss": 0.9186, + "step": 5468 + }, + { + "epoch": 1.91, + "grad_norm": 0.2025509625673294, + "learning_rate": 0.00010592197672321991, + "loss": 0.8173, + "step": 5469 + }, + { + "epoch": 1.91, + "grad_norm": 0.1945144236087799, + "learning_rate": 0.00010589433610291713, + "loss": 0.8464, + "step": 5470 + }, + { + "epoch": 1.91, + "grad_norm": 0.20176583528518677, + "learning_rate": 0.00010586669503070734, + "loss": 0.8429, + "step": 5471 + }, + { + "epoch": 1.91, + "grad_norm": 0.1998482644557953, + "learning_rate": 0.00010583905350870971, + "loss": 0.8806, + "step": 5472 + }, + { + "epoch": 1.91, + "grad_norm": 0.18746793270111084, + "learning_rate": 0.00010581141153904348, + "loss": 0.8106, + "step": 5473 + }, + { + "epoch": 1.91, + "grad_norm": 0.2047126293182373, + "learning_rate": 0.0001057837691238279, + "loss": 0.9248, + "step": 5474 + }, + { + "epoch": 1.91, + "grad_norm": 0.2051793783903122, + "learning_rate": 0.00010575612626518225, + "loss": 0.891, + "step": 5475 + }, + { + "epoch": 1.91, + "grad_norm": 0.2004009336233139, + "learning_rate": 0.00010572848296522588, + "loss": 0.8177, + "step": 5476 + }, + { + "epoch": 1.91, + "grad_norm": 0.20141753554344177, + "learning_rate": 0.00010570083922607812, + "loss": 0.8132, + "step": 5477 + }, + { + "epoch": 1.91, + "grad_norm": 0.20456604659557343, + "learning_rate": 0.00010567319504985837, + "loss": 0.9394, + "step": 5478 + }, + { + "epoch": 1.91, + "grad_norm": 0.2142971158027649, + "learning_rate": 0.00010564555043868604, + "loss": 0.9591, + "step": 5479 + }, + { + "epoch": 1.91, + "grad_norm": 0.20223455131053925, + "learning_rate": 0.00010561790539468062, + "loss": 0.9081, + "step": 5480 + }, + { + "epoch": 1.91, + "grad_norm": 0.20663003623485565, + "learning_rate": 0.00010559025991996157, + "loss": 0.8495, + "step": 5481 + }, + { + "epoch": 1.91, + "grad_norm": 0.2013295292854309, + "learning_rate": 0.00010556261401664842, + "loss": 0.9142, + "step": 5482 + }, + { + "epoch": 1.91, + "grad_norm": 0.20907741785049438, + "learning_rate": 0.00010553496768686073, + "loss": 0.958, + "step": 5483 + }, + { + "epoch": 1.91, + "grad_norm": 0.21043746173381805, + "learning_rate": 0.00010550732093271807, + "loss": 0.9505, + "step": 5484 + }, + { + "epoch": 1.91, + "grad_norm": 0.19993579387664795, + "learning_rate": 0.00010547967375634009, + "loss": 0.8621, + "step": 5485 + }, + { + "epoch": 1.91, + "grad_norm": 0.21031905710697174, + "learning_rate": 0.00010545202615984646, + "loss": 0.9439, + "step": 5486 + }, + { + "epoch": 1.91, + "grad_norm": 0.20142893493175507, + "learning_rate": 0.00010542437814535682, + "loss": 0.9008, + "step": 5487 + }, + { + "epoch": 1.91, + "grad_norm": 0.20271191000938416, + "learning_rate": 0.00010539672971499093, + "loss": 0.9142, + "step": 5488 + }, + { + "epoch": 1.91, + "grad_norm": 0.20329239964485168, + "learning_rate": 0.00010536908087086847, + "loss": 0.9426, + "step": 5489 + }, + { + "epoch": 1.91, + "grad_norm": 0.20079709589481354, + "learning_rate": 0.0001053414316151093, + "loss": 0.9146, + "step": 5490 + }, + { + "epoch": 1.91, + "grad_norm": 0.20103353261947632, + "learning_rate": 0.00010531378194983321, + "loss": 0.9292, + "step": 5491 + }, + { + "epoch": 1.91, + "grad_norm": 0.20483773946762085, + "learning_rate": 0.00010528613187716004, + "loss": 0.9024, + "step": 5492 + }, + { + "epoch": 1.91, + "grad_norm": 0.20483364164829254, + "learning_rate": 0.00010525848139920967, + "loss": 0.8938, + "step": 5493 + }, + { + "epoch": 1.91, + "grad_norm": 0.19378747045993805, + "learning_rate": 0.00010523083051810197, + "loss": 0.884, + "step": 5494 + }, + { + "epoch": 1.91, + "grad_norm": 0.20826518535614014, + "learning_rate": 0.00010520317923595693, + "loss": 0.9423, + "step": 5495 + }, + { + "epoch": 1.92, + "grad_norm": 0.2042119950056076, + "learning_rate": 0.00010517552755489449, + "loss": 0.9201, + "step": 5496 + }, + { + "epoch": 1.92, + "grad_norm": 0.2060500979423523, + "learning_rate": 0.00010514787547703466, + "loss": 0.8883, + "step": 5497 + }, + { + "epoch": 1.92, + "grad_norm": 0.20850086212158203, + "learning_rate": 0.00010512022300449749, + "loss": 0.9031, + "step": 5498 + }, + { + "epoch": 1.92, + "grad_norm": 0.21010854840278625, + "learning_rate": 0.00010509257013940299, + "loss": 0.9278, + "step": 5499 + }, + { + "epoch": 1.92, + "grad_norm": 0.20167683064937592, + "learning_rate": 0.00010506491688387127, + "loss": 0.8522, + "step": 5500 + }, + { + "epoch": 1.92, + "grad_norm": 0.20341619849205017, + "learning_rate": 0.00010503726324002248, + "loss": 0.8767, + "step": 5501 + }, + { + "epoch": 1.92, + "grad_norm": 0.2075185924768448, + "learning_rate": 0.00010500960920997676, + "loss": 0.8936, + "step": 5502 + }, + { + "epoch": 1.92, + "grad_norm": 0.20581576228141785, + "learning_rate": 0.00010498195479585427, + "loss": 0.8568, + "step": 5503 + }, + { + "epoch": 1.92, + "grad_norm": 0.21758943796157837, + "learning_rate": 0.0001049542999997752, + "loss": 0.9707, + "step": 5504 + }, + { + "epoch": 1.92, + "grad_norm": 0.19635866582393646, + "learning_rate": 0.00010492664482385981, + "loss": 0.8547, + "step": 5505 + }, + { + "epoch": 1.92, + "grad_norm": 0.21904851496219635, + "learning_rate": 0.00010489898927022838, + "loss": 0.7793, + "step": 5506 + }, + { + "epoch": 1.92, + "grad_norm": 0.2064557820558548, + "learning_rate": 0.00010487133334100124, + "loss": 0.9223, + "step": 5507 + }, + { + "epoch": 1.92, + "grad_norm": 0.20519934594631195, + "learning_rate": 0.00010484367703829865, + "loss": 0.8152, + "step": 5508 + }, + { + "epoch": 1.92, + "grad_norm": 0.20812737941741943, + "learning_rate": 0.00010481602036424094, + "loss": 0.8843, + "step": 5509 + }, + { + "epoch": 1.92, + "grad_norm": 0.20696045458316803, + "learning_rate": 0.00010478836332094859, + "loss": 0.8851, + "step": 5510 + }, + { + "epoch": 1.92, + "grad_norm": 0.20457100868225098, + "learning_rate": 0.00010476070591054193, + "loss": 0.913, + "step": 5511 + }, + { + "epoch": 1.92, + "grad_norm": 0.2012094110250473, + "learning_rate": 0.00010473304813514146, + "loss": 0.8772, + "step": 5512 + }, + { + "epoch": 1.92, + "grad_norm": 0.20374369621276855, + "learning_rate": 0.00010470538999686759, + "loss": 0.8856, + "step": 5513 + }, + { + "epoch": 1.92, + "grad_norm": 0.20570087432861328, + "learning_rate": 0.00010467773149784083, + "loss": 0.9468, + "step": 5514 + }, + { + "epoch": 1.92, + "grad_norm": 0.21451853215694427, + "learning_rate": 0.00010465007264018172, + "loss": 0.9318, + "step": 5515 + }, + { + "epoch": 1.92, + "grad_norm": 0.20936594903469086, + "learning_rate": 0.00010462241342601081, + "loss": 0.8838, + "step": 5516 + }, + { + "epoch": 1.92, + "grad_norm": 0.20236121118068695, + "learning_rate": 0.00010459475385744866, + "loss": 0.8472, + "step": 5517 + }, + { + "epoch": 1.92, + "grad_norm": 0.20448645949363708, + "learning_rate": 0.0001045670939366159, + "loss": 0.8967, + "step": 5518 + }, + { + "epoch": 1.92, + "grad_norm": 0.20693820714950562, + "learning_rate": 0.0001045394336656331, + "loss": 0.9088, + "step": 5519 + }, + { + "epoch": 1.92, + "grad_norm": 0.20853427052497864, + "learning_rate": 0.00010451177304662098, + "loss": 0.9635, + "step": 5520 + }, + { + "epoch": 1.92, + "grad_norm": 0.2139703333377838, + "learning_rate": 0.00010448411208170021, + "loss": 0.8683, + "step": 5521 + }, + { + "epoch": 1.92, + "grad_norm": 0.196266308426857, + "learning_rate": 0.00010445645077299149, + "loss": 0.8463, + "step": 5522 + }, + { + "epoch": 1.92, + "grad_norm": 0.20476581156253815, + "learning_rate": 0.00010442878912261558, + "loss": 0.936, + "step": 5523 + }, + { + "epoch": 1.92, + "grad_norm": 0.20537997782230377, + "learning_rate": 0.00010440112713269319, + "loss": 0.9305, + "step": 5524 + }, + { + "epoch": 1.93, + "grad_norm": 0.21176749467849731, + "learning_rate": 0.00010437346480534516, + "loss": 0.9282, + "step": 5525 + }, + { + "epoch": 1.93, + "grad_norm": 0.19759652018547058, + "learning_rate": 0.0001043458021426923, + "loss": 0.9027, + "step": 5526 + }, + { + "epoch": 1.93, + "grad_norm": 0.19534112513065338, + "learning_rate": 0.00010431813914685544, + "loss": 0.8246, + "step": 5527 + }, + { + "epoch": 1.93, + "grad_norm": 0.2069014310836792, + "learning_rate": 0.00010429047581995546, + "loss": 0.9569, + "step": 5528 + }, + { + "epoch": 1.93, + "grad_norm": 0.2005814164876938, + "learning_rate": 0.00010426281216411322, + "loss": 0.8368, + "step": 5529 + }, + { + "epoch": 1.93, + "grad_norm": 0.20376819372177124, + "learning_rate": 0.00010423514818144968, + "loss": 0.9077, + "step": 5530 + }, + { + "epoch": 1.93, + "grad_norm": 0.20055246353149414, + "learning_rate": 0.00010420748387408574, + "loss": 0.8813, + "step": 5531 + }, + { + "epoch": 1.93, + "grad_norm": 0.20620344579219818, + "learning_rate": 0.00010417981924414242, + "loss": 0.8969, + "step": 5532 + }, + { + "epoch": 1.93, + "grad_norm": 0.20845210552215576, + "learning_rate": 0.00010415215429374068, + "loss": 0.8923, + "step": 5533 + }, + { + "epoch": 1.93, + "grad_norm": 0.20220451056957245, + "learning_rate": 0.00010412448902500149, + "loss": 0.8537, + "step": 5534 + }, + { + "epoch": 1.93, + "grad_norm": 0.21402427554130554, + "learning_rate": 0.00010409682344004598, + "loss": 0.8981, + "step": 5535 + }, + { + "epoch": 1.93, + "grad_norm": 0.20166133344173431, + "learning_rate": 0.00010406915754099518, + "loss": 0.8941, + "step": 5536 + }, + { + "epoch": 1.93, + "grad_norm": 0.21470798552036285, + "learning_rate": 0.00010404149132997012, + "loss": 0.9071, + "step": 5537 + }, + { + "epoch": 1.93, + "grad_norm": 0.2156185358762741, + "learning_rate": 0.00010401382480909205, + "loss": 0.9286, + "step": 5538 + }, + { + "epoch": 1.93, + "grad_norm": 0.2048928141593933, + "learning_rate": 0.00010398615798048194, + "loss": 0.9145, + "step": 5539 + }, + { + "epoch": 1.93, + "grad_norm": 0.21606525778770447, + "learning_rate": 0.00010395849084626111, + "loss": 0.9124, + "step": 5540 + }, + { + "epoch": 1.93, + "grad_norm": 0.20806032419204712, + "learning_rate": 0.00010393082340855063, + "loss": 0.8577, + "step": 5541 + }, + { + "epoch": 1.93, + "grad_norm": 0.2055501490831375, + "learning_rate": 0.00010390315566947174, + "loss": 0.905, + "step": 5542 + }, + { + "epoch": 1.93, + "grad_norm": 0.22344693541526794, + "learning_rate": 0.0001038754876311457, + "loss": 0.8798, + "step": 5543 + }, + { + "epoch": 1.93, + "grad_norm": 0.2084258794784546, + "learning_rate": 0.00010384781929569372, + "loss": 0.8989, + "step": 5544 + }, + { + "epoch": 1.93, + "grad_norm": 0.20271384716033936, + "learning_rate": 0.00010382015066523709, + "loss": 0.8499, + "step": 5545 + }, + { + "epoch": 1.93, + "grad_norm": 0.1985735446214676, + "learning_rate": 0.00010379248174189715, + "loss": 0.8993, + "step": 5546 + }, + { + "epoch": 1.93, + "grad_norm": 0.2049076408147812, + "learning_rate": 0.00010376481252779513, + "loss": 0.8773, + "step": 5547 + }, + { + "epoch": 1.93, + "grad_norm": 0.2088305801153183, + "learning_rate": 0.00010373714302505251, + "loss": 0.9891, + "step": 5548 + }, + { + "epoch": 1.93, + "grad_norm": 0.2174626737833023, + "learning_rate": 0.0001037094732357905, + "loss": 0.9326, + "step": 5549 + }, + { + "epoch": 1.93, + "grad_norm": 0.21114954352378845, + "learning_rate": 0.0001036818031621306, + "loss": 0.8854, + "step": 5550 + }, + { + "epoch": 1.93, + "grad_norm": 0.2056587040424347, + "learning_rate": 0.00010365413280619418, + "loss": 0.9057, + "step": 5551 + }, + { + "epoch": 1.93, + "grad_norm": 0.21228408813476562, + "learning_rate": 0.00010362646217010266, + "loss": 0.9768, + "step": 5552 + }, + { + "epoch": 1.94, + "grad_norm": 0.2071719616651535, + "learning_rate": 0.00010359879125597753, + "loss": 0.9171, + "step": 5553 + }, + { + "epoch": 1.94, + "grad_norm": 0.2048521190881729, + "learning_rate": 0.00010357112006594021, + "loss": 0.95, + "step": 5554 + }, + { + "epoch": 1.94, + "grad_norm": 0.20829981565475464, + "learning_rate": 0.00010354344860211225, + "loss": 0.9078, + "step": 5555 + }, + { + "epoch": 1.94, + "grad_norm": 0.21465882658958435, + "learning_rate": 0.00010351577686661513, + "loss": 0.8951, + "step": 5556 + }, + { + "epoch": 1.94, + "grad_norm": 0.21603715419769287, + "learning_rate": 0.00010348810486157039, + "loss": 0.9098, + "step": 5557 + }, + { + "epoch": 1.94, + "grad_norm": 0.20947815477848053, + "learning_rate": 0.00010346043258909963, + "loss": 0.897, + "step": 5558 + }, + { + "epoch": 1.94, + "grad_norm": 0.19364146888256073, + "learning_rate": 0.00010343276005132436, + "loss": 0.8358, + "step": 5559 + }, + { + "epoch": 1.94, + "grad_norm": 0.20746031403541565, + "learning_rate": 0.00010340508725036622, + "loss": 0.8454, + "step": 5560 + }, + { + "epoch": 1.94, + "grad_norm": 0.22006148099899292, + "learning_rate": 0.00010337741418834684, + "loss": 0.8814, + "step": 5561 + }, + { + "epoch": 1.94, + "grad_norm": 0.20638620853424072, + "learning_rate": 0.0001033497408673878, + "loss": 0.8643, + "step": 5562 + }, + { + "epoch": 1.94, + "grad_norm": 0.20269517600536346, + "learning_rate": 0.00010332206728961085, + "loss": 0.8548, + "step": 5563 + }, + { + "epoch": 1.94, + "grad_norm": 0.20134706795215607, + "learning_rate": 0.00010329439345713756, + "loss": 0.9191, + "step": 5564 + }, + { + "epoch": 1.94, + "grad_norm": 0.19523276388645172, + "learning_rate": 0.00010326671937208971, + "loss": 0.8223, + "step": 5565 + }, + { + "epoch": 1.94, + "grad_norm": 0.20336094498634338, + "learning_rate": 0.000103239045036589, + "loss": 0.9309, + "step": 5566 + }, + { + "epoch": 1.94, + "grad_norm": 0.19728174805641174, + "learning_rate": 0.00010321137045275713, + "loss": 0.8479, + "step": 5567 + }, + { + "epoch": 1.94, + "grad_norm": 0.20552708208560944, + "learning_rate": 0.00010318369562271593, + "loss": 0.8819, + "step": 5568 + }, + { + "epoch": 1.94, + "grad_norm": 0.20273354649543762, + "learning_rate": 0.00010315602054858706, + "loss": 0.8672, + "step": 5569 + }, + { + "epoch": 1.94, + "grad_norm": 0.2043936848640442, + "learning_rate": 0.00010312834523249243, + "loss": 0.8995, + "step": 5570 + }, + { + "epoch": 1.94, + "grad_norm": 0.20490287244319916, + "learning_rate": 0.00010310066967655378, + "loss": 0.8514, + "step": 5571 + }, + { + "epoch": 1.94, + "grad_norm": 0.19761638343334198, + "learning_rate": 0.00010307299388289296, + "loss": 0.8673, + "step": 5572 + }, + { + "epoch": 1.94, + "grad_norm": 0.1997511088848114, + "learning_rate": 0.00010304531785363183, + "loss": 0.8748, + "step": 5573 + }, + { + "epoch": 1.94, + "grad_norm": 0.2093546837568283, + "learning_rate": 0.00010301764159089222, + "loss": 0.9083, + "step": 5574 + }, + { + "epoch": 1.94, + "grad_norm": 0.20290273427963257, + "learning_rate": 0.00010298996509679606, + "loss": 0.8587, + "step": 5575 + }, + { + "epoch": 1.94, + "grad_norm": 0.20612122118473053, + "learning_rate": 0.00010296228837346522, + "loss": 0.9561, + "step": 5576 + }, + { + "epoch": 1.94, + "grad_norm": 0.2027217149734497, + "learning_rate": 0.00010293461142302163, + "loss": 0.893, + "step": 5577 + }, + { + "epoch": 1.94, + "grad_norm": 0.20365840196609497, + "learning_rate": 0.00010290693424758722, + "loss": 0.8572, + "step": 5578 + }, + { + "epoch": 1.94, + "grad_norm": 0.20379266142845154, + "learning_rate": 0.00010287925684928396, + "loss": 0.9155, + "step": 5579 + }, + { + "epoch": 1.94, + "grad_norm": 0.20149919390678406, + "learning_rate": 0.00010285157923023381, + "loss": 0.9137, + "step": 5580 + }, + { + "epoch": 1.95, + "grad_norm": 0.20707061886787415, + "learning_rate": 0.00010282390139255878, + "loss": 0.9025, + "step": 5581 + }, + { + "epoch": 1.95, + "grad_norm": 0.20673343539237976, + "learning_rate": 0.0001027962233383808, + "loss": 0.9554, + "step": 5582 + }, + { + "epoch": 1.95, + "grad_norm": 0.21456193923950195, + "learning_rate": 0.00010276854506982196, + "loss": 0.9115, + "step": 5583 + }, + { + "epoch": 1.95, + "grad_norm": 0.2044607400894165, + "learning_rate": 0.0001027408665890043, + "loss": 0.8923, + "step": 5584 + }, + { + "epoch": 1.95, + "grad_norm": 0.19878527522087097, + "learning_rate": 0.00010271318789804987, + "loss": 0.9447, + "step": 5585 + }, + { + "epoch": 1.95, + "grad_norm": 0.19507496058940887, + "learning_rate": 0.00010268550899908071, + "loss": 0.8844, + "step": 5586 + }, + { + "epoch": 1.95, + "grad_norm": 0.1981414258480072, + "learning_rate": 0.0001026578298942189, + "loss": 0.8851, + "step": 5587 + }, + { + "epoch": 1.95, + "grad_norm": 0.20765522122383118, + "learning_rate": 0.00010263015058558659, + "loss": 0.9509, + "step": 5588 + }, + { + "epoch": 1.95, + "grad_norm": 0.20648297667503357, + "learning_rate": 0.00010260247107530587, + "loss": 0.8989, + "step": 5589 + }, + { + "epoch": 1.95, + "grad_norm": 0.19959251582622528, + "learning_rate": 0.00010257479136549889, + "loss": 0.864, + "step": 5590 + }, + { + "epoch": 1.95, + "grad_norm": 0.19457371532917023, + "learning_rate": 0.00010254711145828774, + "loss": 0.8499, + "step": 5591 + }, + { + "epoch": 1.95, + "grad_norm": 0.21515204012393951, + "learning_rate": 0.00010251943135579464, + "loss": 0.9099, + "step": 5592 + }, + { + "epoch": 1.95, + "grad_norm": 0.20317937433719635, + "learning_rate": 0.00010249175106014178, + "loss": 0.9047, + "step": 5593 + }, + { + "epoch": 1.95, + "grad_norm": 0.19348680973052979, + "learning_rate": 0.00010246407057345131, + "loss": 0.8087, + "step": 5594 + }, + { + "epoch": 1.95, + "grad_norm": 0.19511255621910095, + "learning_rate": 0.00010243638989784545, + "loss": 0.8805, + "step": 5595 + }, + { + "epoch": 1.95, + "grad_norm": 0.20101837813854218, + "learning_rate": 0.00010240870903544644, + "loss": 0.8821, + "step": 5596 + }, + { + "epoch": 1.95, + "grad_norm": 0.19923505187034607, + "learning_rate": 0.00010238102798837648, + "loss": 0.8281, + "step": 5597 + }, + { + "epoch": 1.95, + "grad_norm": 0.20761752128601074, + "learning_rate": 0.00010235334675875787, + "loss": 0.9093, + "step": 5598 + }, + { + "epoch": 1.95, + "grad_norm": 0.2042294442653656, + "learning_rate": 0.00010232566534871286, + "loss": 0.8987, + "step": 5599 + }, + { + "epoch": 1.95, + "grad_norm": 0.20492446422576904, + "learning_rate": 0.0001022979837603637, + "loss": 0.8391, + "step": 5600 + }, + { + "epoch": 1.95, + "grad_norm": 0.2098143994808197, + "learning_rate": 0.00010227030199583271, + "loss": 0.9079, + "step": 5601 + }, + { + "epoch": 1.95, + "grad_norm": 0.20019160211086273, + "learning_rate": 0.00010224262005724217, + "loss": 0.8631, + "step": 5602 + }, + { + "epoch": 1.95, + "grad_norm": 0.21009786427021027, + "learning_rate": 0.00010221493794671442, + "loss": 0.9236, + "step": 5603 + }, + { + "epoch": 1.95, + "grad_norm": 0.20378471910953522, + "learning_rate": 0.0001021872556663718, + "loss": 0.9148, + "step": 5604 + }, + { + "epoch": 1.95, + "grad_norm": 0.20996125042438507, + "learning_rate": 0.00010215957321833664, + "loss": 0.9775, + "step": 5605 + }, + { + "epoch": 1.95, + "grad_norm": 0.19774721562862396, + "learning_rate": 0.0001021318906047313, + "loss": 0.9339, + "step": 5606 + }, + { + "epoch": 1.95, + "grad_norm": 0.2106465995311737, + "learning_rate": 0.00010210420782767813, + "loss": 0.8776, + "step": 5607 + }, + { + "epoch": 1.95, + "grad_norm": 0.20622365176677704, + "learning_rate": 0.00010207652488929958, + "loss": 0.9095, + "step": 5608 + }, + { + "epoch": 1.95, + "grad_norm": 0.20106709003448486, + "learning_rate": 0.00010204884179171797, + "loss": 0.9572, + "step": 5609 + }, + { + "epoch": 1.96, + "grad_norm": 0.20053446292877197, + "learning_rate": 0.00010202115853705576, + "loss": 0.8891, + "step": 5610 + }, + { + "epoch": 1.96, + "grad_norm": 0.21232885122299194, + "learning_rate": 0.00010199347512743533, + "loss": 0.8949, + "step": 5611 + }, + { + "epoch": 1.96, + "grad_norm": 0.2083313763141632, + "learning_rate": 0.00010196579156497912, + "loss": 0.8339, + "step": 5612 + }, + { + "epoch": 1.96, + "grad_norm": 0.20660704374313354, + "learning_rate": 0.00010193810785180962, + "loss": 0.8816, + "step": 5613 + }, + { + "epoch": 1.96, + "grad_norm": 0.19979746639728546, + "learning_rate": 0.00010191042399004922, + "loss": 0.894, + "step": 5614 + }, + { + "epoch": 1.96, + "grad_norm": 0.20635730028152466, + "learning_rate": 0.00010188273998182045, + "loss": 0.922, + "step": 5615 + }, + { + "epoch": 1.96, + "grad_norm": 0.21255262196063995, + "learning_rate": 0.00010185505582924573, + "loss": 0.9356, + "step": 5616 + }, + { + "epoch": 1.96, + "grad_norm": 0.19595664739608765, + "learning_rate": 0.00010182737153444758, + "loss": 0.8222, + "step": 5617 + }, + { + "epoch": 1.96, + "grad_norm": 0.2002105861902237, + "learning_rate": 0.00010179968709954847, + "loss": 0.9032, + "step": 5618 + }, + { + "epoch": 1.96, + "grad_norm": 0.2023312747478485, + "learning_rate": 0.00010177200252667096, + "loss": 0.8727, + "step": 5619 + }, + { + "epoch": 1.96, + "grad_norm": 0.2006431221961975, + "learning_rate": 0.00010174431781793754, + "loss": 0.8597, + "step": 5620 + }, + { + "epoch": 1.96, + "grad_norm": 0.20587624609470367, + "learning_rate": 0.00010171663297547076, + "loss": 0.8933, + "step": 5621 + }, + { + "epoch": 1.96, + "grad_norm": 0.2047545611858368, + "learning_rate": 0.0001016889480013931, + "loss": 0.9077, + "step": 5622 + }, + { + "epoch": 1.96, + "grad_norm": 0.2119145542383194, + "learning_rate": 0.00010166126289782721, + "loss": 0.8601, + "step": 5623 + }, + { + "epoch": 1.96, + "grad_norm": 0.20192745327949524, + "learning_rate": 0.00010163357766689558, + "loss": 0.8753, + "step": 5624 + }, + { + "epoch": 1.96, + "grad_norm": 0.20670832693576813, + "learning_rate": 0.00010160589231072082, + "loss": 0.9099, + "step": 5625 + }, + { + "epoch": 1.96, + "grad_norm": 0.21294564008712769, + "learning_rate": 0.00010157820683142549, + "loss": 0.8567, + "step": 5626 + }, + { + "epoch": 1.96, + "grad_norm": 0.194480761885643, + "learning_rate": 0.00010155052123113215, + "loss": 0.8606, + "step": 5627 + }, + { + "epoch": 1.96, + "grad_norm": 0.20041505992412567, + "learning_rate": 0.00010152283551196347, + "loss": 0.925, + "step": 5628 + }, + { + "epoch": 1.96, + "grad_norm": 0.20152615010738373, + "learning_rate": 0.00010149514967604202, + "loss": 0.8863, + "step": 5629 + }, + { + "epoch": 1.96, + "grad_norm": 0.20309147238731384, + "learning_rate": 0.00010146746372549043, + "loss": 0.8359, + "step": 5630 + }, + { + "epoch": 1.96, + "grad_norm": 0.21429483592510223, + "learning_rate": 0.00010143977766243129, + "loss": 1.0148, + "step": 5631 + }, + { + "epoch": 1.96, + "grad_norm": 0.19496549665927887, + "learning_rate": 0.00010141209148898727, + "loss": 0.9032, + "step": 5632 + }, + { + "epoch": 1.96, + "grad_norm": 0.2035844326019287, + "learning_rate": 0.00010138440520728101, + "loss": 0.8656, + "step": 5633 + }, + { + "epoch": 1.96, + "grad_norm": 0.19976894557476044, + "learning_rate": 0.00010135671881943518, + "loss": 0.8728, + "step": 5634 + }, + { + "epoch": 1.96, + "grad_norm": 0.20249591767787933, + "learning_rate": 0.0001013290323275724, + "loss": 0.8846, + "step": 5635 + }, + { + "epoch": 1.96, + "grad_norm": 0.1962704211473465, + "learning_rate": 0.00010130134573381538, + "loss": 0.8569, + "step": 5636 + }, + { + "epoch": 1.96, + "grad_norm": 0.2182995229959488, + "learning_rate": 0.00010127365904028672, + "loss": 0.9651, + "step": 5637 + }, + { + "epoch": 1.97, + "grad_norm": 0.18703502416610718, + "learning_rate": 0.00010124597224910921, + "loss": 0.7896, + "step": 5638 + }, + { + "epoch": 1.97, + "grad_norm": 0.2021150439977646, + "learning_rate": 0.00010121828536240548, + "loss": 0.8708, + "step": 5639 + }, + { + "epoch": 1.97, + "grad_norm": 0.21872109174728394, + "learning_rate": 0.00010119059838229823, + "loss": 0.886, + "step": 5640 + }, + { + "epoch": 1.97, + "grad_norm": 0.20219261944293976, + "learning_rate": 0.00010116291131091016, + "loss": 0.9473, + "step": 5641 + }, + { + "epoch": 1.97, + "grad_norm": 0.20678988099098206, + "learning_rate": 0.000101135224150364, + "loss": 0.8906, + "step": 5642 + }, + { + "epoch": 1.97, + "grad_norm": 0.1961716264486313, + "learning_rate": 0.00010110753690278248, + "loss": 0.8878, + "step": 5643 + }, + { + "epoch": 1.97, + "grad_norm": 0.2135319858789444, + "learning_rate": 0.00010107984957028829, + "loss": 0.956, + "step": 5644 + }, + { + "epoch": 1.97, + "grad_norm": 0.20089684426784515, + "learning_rate": 0.00010105216215500422, + "loss": 0.8844, + "step": 5645 + }, + { + "epoch": 1.97, + "grad_norm": 0.19634021818637848, + "learning_rate": 0.00010102447465905293, + "loss": 0.872, + "step": 5646 + }, + { + "epoch": 1.97, + "grad_norm": 0.20131458342075348, + "learning_rate": 0.00010099678708455722, + "loss": 0.8583, + "step": 5647 + }, + { + "epoch": 1.97, + "grad_norm": 0.20227965712547302, + "learning_rate": 0.00010096909943363983, + "loss": 0.9438, + "step": 5648 + }, + { + "epoch": 1.97, + "grad_norm": 0.2081882357597351, + "learning_rate": 0.00010094141170842353, + "loss": 0.9539, + "step": 5649 + }, + { + "epoch": 1.97, + "grad_norm": 0.20764583349227905, + "learning_rate": 0.00010091372391103107, + "loss": 0.8397, + "step": 5650 + }, + { + "epoch": 1.97, + "grad_norm": 0.19473712146282196, + "learning_rate": 0.00010088603604358521, + "loss": 0.8871, + "step": 5651 + }, + { + "epoch": 1.97, + "grad_norm": 0.2055901437997818, + "learning_rate": 0.00010085834810820871, + "loss": 0.9054, + "step": 5652 + }, + { + "epoch": 1.97, + "grad_norm": 0.20546817779541016, + "learning_rate": 0.0001008306601070244, + "loss": 0.9194, + "step": 5653 + }, + { + "epoch": 1.97, + "grad_norm": 0.2075883150100708, + "learning_rate": 0.00010080297204215501, + "loss": 0.9154, + "step": 5654 + }, + { + "epoch": 1.97, + "grad_norm": 0.1996559500694275, + "learning_rate": 0.00010077528391572338, + "loss": 0.8629, + "step": 5655 + }, + { + "epoch": 1.97, + "grad_norm": 0.20948490500450134, + "learning_rate": 0.00010074759572985227, + "loss": 0.9011, + "step": 5656 + }, + { + "epoch": 1.97, + "grad_norm": 0.19752012193202972, + "learning_rate": 0.00010071990748666445, + "loss": 0.9041, + "step": 5657 + }, + { + "epoch": 1.97, + "grad_norm": 0.2031921148300171, + "learning_rate": 0.00010069221918828278, + "loss": 0.8492, + "step": 5658 + }, + { + "epoch": 1.97, + "grad_norm": 0.20917463302612305, + "learning_rate": 0.00010066453083683006, + "loss": 0.9072, + "step": 5659 + }, + { + "epoch": 1.97, + "grad_norm": 0.21156203746795654, + "learning_rate": 0.00010063684243442906, + "loss": 0.8781, + "step": 5660 + }, + { + "epoch": 1.97, + "grad_norm": 0.2049502581357956, + "learning_rate": 0.00010060915398320262, + "loss": 0.9036, + "step": 5661 + }, + { + "epoch": 1.97, + "grad_norm": 0.2090534120798111, + "learning_rate": 0.00010058146548527357, + "loss": 0.8892, + "step": 5662 + }, + { + "epoch": 1.97, + "grad_norm": 0.20506758987903595, + "learning_rate": 0.0001005537769427647, + "loss": 0.8151, + "step": 5663 + }, + { + "epoch": 1.97, + "grad_norm": 0.20771467685699463, + "learning_rate": 0.00010052608835779887, + "loss": 0.9167, + "step": 5664 + }, + { + "epoch": 1.97, + "grad_norm": 0.21362152695655823, + "learning_rate": 0.00010049839973249888, + "loss": 0.9169, + "step": 5665 + }, + { + "epoch": 1.97, + "grad_norm": 0.19874230027198792, + "learning_rate": 0.0001004707110689876, + "loss": 0.8439, + "step": 5666 + }, + { + "epoch": 1.98, + "grad_norm": 0.19645732641220093, + "learning_rate": 0.00010044302236938781, + "loss": 0.8103, + "step": 5667 + }, + { + "epoch": 1.98, + "grad_norm": 0.2064276784658432, + "learning_rate": 0.0001004153336358224, + "loss": 0.8839, + "step": 5668 + }, + { + "epoch": 1.98, + "grad_norm": 0.1985475718975067, + "learning_rate": 0.00010038764487041419, + "loss": 0.8588, + "step": 5669 + }, + { + "epoch": 1.98, + "grad_norm": 0.19935201108455658, + "learning_rate": 0.00010035995607528599, + "loss": 0.8933, + "step": 5670 + }, + { + "epoch": 1.98, + "grad_norm": 0.20382753014564514, + "learning_rate": 0.00010033226725256071, + "loss": 0.9026, + "step": 5671 + }, + { + "epoch": 1.98, + "grad_norm": 0.2041386365890503, + "learning_rate": 0.00010030457840436112, + "loss": 0.9249, + "step": 5672 + }, + { + "epoch": 1.98, + "grad_norm": 0.20588405430316925, + "learning_rate": 0.00010027688953281014, + "loss": 0.9808, + "step": 5673 + }, + { + "epoch": 1.98, + "grad_norm": 0.20263512432575226, + "learning_rate": 0.00010024920064003059, + "loss": 0.8611, + "step": 5674 + }, + { + "epoch": 1.98, + "grad_norm": 0.20413020253181458, + "learning_rate": 0.0001002215117281453, + "loss": 0.8908, + "step": 5675 + }, + { + "epoch": 1.98, + "grad_norm": 0.20494332909584045, + "learning_rate": 0.00010019382279927718, + "loss": 0.9052, + "step": 5676 + }, + { + "epoch": 1.98, + "grad_norm": 0.2025218904018402, + "learning_rate": 0.00010016613385554903, + "loss": 0.891, + "step": 5677 + }, + { + "epoch": 1.98, + "grad_norm": 0.20399563014507294, + "learning_rate": 0.00010013844489908376, + "loss": 0.9221, + "step": 5678 + } + ], + "logging_steps": 1, + "max_steps": 11356, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 2839, + "total_flos": 1.183008524297306e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}