{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1516, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013192612137203166, "grad_norm": 12.233341930238273, "learning_rate": 6.578947368421053e-08, "loss": 0.3407, "step": 1 }, { "epoch": 0.002638522427440633, "grad_norm": 12.816762328416463, "learning_rate": 1.3157894736842107e-07, "loss": 0.3118, "step": 2 }, { "epoch": 0.00395778364116095, "grad_norm": 12.015688553148182, "learning_rate": 1.9736842105263157e-07, "loss": 0.3131, "step": 3 }, { "epoch": 0.005277044854881266, "grad_norm": 13.05850661637904, "learning_rate": 2.6315789473684213e-07, "loss": 0.3361, "step": 4 }, { "epoch": 0.006596306068601583, "grad_norm": 11.88542762458142, "learning_rate": 3.2894736842105264e-07, "loss": 0.332, "step": 5 }, { "epoch": 0.0079155672823219, "grad_norm": 13.124966565305824, "learning_rate": 3.9473684210526315e-07, "loss": 0.328, "step": 6 }, { "epoch": 0.009234828496042216, "grad_norm": 12.075469159428398, "learning_rate": 4.605263157894737e-07, "loss": 0.3071, "step": 7 }, { "epoch": 0.010554089709762533, "grad_norm": 12.356648628552895, "learning_rate": 5.263157894736843e-07, "loss": 0.3072, "step": 8 }, { "epoch": 0.011873350923482849, "grad_norm": 10.352882639373675, "learning_rate": 5.921052631578947e-07, "loss": 0.261, "step": 9 }, { "epoch": 0.013192612137203167, "grad_norm": 10.058615814168975, "learning_rate": 6.578947368421053e-07, "loss": 0.253, "step": 10 }, { "epoch": 0.014511873350923483, "grad_norm": 8.63214503438111, "learning_rate": 7.236842105263158e-07, "loss": 0.2593, "step": 11 }, { "epoch": 0.0158311345646438, "grad_norm": 7.870088141405597, "learning_rate": 7.894736842105263e-07, "loss": 0.2304, "step": 12 }, { "epoch": 0.017150395778364115, "grad_norm": 5.393787971545889, "learning_rate": 8.55263157894737e-07, "loss": 0.1915, "step": 13 }, { "epoch": 0.018469656992084433, "grad_norm": 5.535158104456134, "learning_rate": 9.210526315789474e-07, "loss": 0.206, "step": 14 }, { "epoch": 0.01978891820580475, "grad_norm": 5.476677916959981, "learning_rate": 9.86842105263158e-07, "loss": 0.236, "step": 15 }, { "epoch": 0.021108179419525065, "grad_norm": 4.151444478521054, "learning_rate": 1.0526315789473685e-06, "loss": 0.1496, "step": 16 }, { "epoch": 0.022427440633245383, "grad_norm": 3.0628867506650748, "learning_rate": 1.118421052631579e-06, "loss": 0.2017, "step": 17 }, { "epoch": 0.023746701846965697, "grad_norm": 3.036924011607267, "learning_rate": 1.1842105263157894e-06, "loss": 0.1449, "step": 18 }, { "epoch": 0.025065963060686015, "grad_norm": 2.4996954384635646, "learning_rate": 1.25e-06, "loss": 0.134, "step": 19 }, { "epoch": 0.026385224274406333, "grad_norm": 2.11406547625844, "learning_rate": 1.3157894736842106e-06, "loss": 0.1119, "step": 20 }, { "epoch": 0.027704485488126648, "grad_norm": 1.9596253186522326, "learning_rate": 1.3815789473684212e-06, "loss": 0.1629, "step": 21 }, { "epoch": 0.029023746701846966, "grad_norm": 1.7753792784753228, "learning_rate": 1.4473684210526317e-06, "loss": 0.139, "step": 22 }, { "epoch": 0.030343007915567283, "grad_norm": 2.43386121958396, "learning_rate": 1.5131578947368421e-06, "loss": 0.1343, "step": 23 }, { "epoch": 0.0316622691292876, "grad_norm": 1.8993087578136192, "learning_rate": 1.5789473684210526e-06, "loss": 0.1151, "step": 24 }, { "epoch": 0.032981530343007916, "grad_norm": 3.0583374294765853, "learning_rate": 1.6447368421052635e-06, "loss": 0.154, "step": 25 }, { "epoch": 0.03430079155672823, "grad_norm": 2.253460947598232, "learning_rate": 1.710526315789474e-06, "loss": 0.142, "step": 26 }, { "epoch": 0.03562005277044855, "grad_norm": 2.016611774580363, "learning_rate": 1.7763157894736844e-06, "loss": 0.125, "step": 27 }, { "epoch": 0.036939313984168866, "grad_norm": 2.017120787608575, "learning_rate": 1.8421052631578948e-06, "loss": 0.1567, "step": 28 }, { "epoch": 0.03825857519788918, "grad_norm": 2.1936744617942034, "learning_rate": 1.9078947368421057e-06, "loss": 0.1257, "step": 29 }, { "epoch": 0.0395778364116095, "grad_norm": 1.9830690908180522, "learning_rate": 1.973684210526316e-06, "loss": 0.1331, "step": 30 }, { "epoch": 0.040897097625329816, "grad_norm": 1.5631779172937716, "learning_rate": 2.0394736842105266e-06, "loss": 0.1178, "step": 31 }, { "epoch": 0.04221635883905013, "grad_norm": 1.4861255029401097, "learning_rate": 2.105263157894737e-06, "loss": 0.0911, "step": 32 }, { "epoch": 0.04353562005277045, "grad_norm": 2.4000033930532956, "learning_rate": 2.1710526315789475e-06, "loss": 0.1316, "step": 33 }, { "epoch": 0.044854881266490766, "grad_norm": 1.6157534765776684, "learning_rate": 2.236842105263158e-06, "loss": 0.0996, "step": 34 }, { "epoch": 0.04617414248021108, "grad_norm": 1.6467470674972677, "learning_rate": 2.3026315789473684e-06, "loss": 0.0848, "step": 35 }, { "epoch": 0.047493403693931395, "grad_norm": 1.4693385869948326, "learning_rate": 2.368421052631579e-06, "loss": 0.111, "step": 36 }, { "epoch": 0.048812664907651716, "grad_norm": 2.300813961627334, "learning_rate": 2.4342105263157898e-06, "loss": 0.1299, "step": 37 }, { "epoch": 0.05013192612137203, "grad_norm": 1.6680540076451675, "learning_rate": 2.5e-06, "loss": 0.0996, "step": 38 }, { "epoch": 0.051451187335092345, "grad_norm": 2.314118463144595, "learning_rate": 2.565789473684211e-06, "loss": 0.1161, "step": 39 }, { "epoch": 0.052770448548812667, "grad_norm": 2.6523025026316795, "learning_rate": 2.631578947368421e-06, "loss": 0.1378, "step": 40 }, { "epoch": 0.05408970976253298, "grad_norm": 2.3663275040840426, "learning_rate": 2.697368421052632e-06, "loss": 0.1456, "step": 41 }, { "epoch": 0.055408970976253295, "grad_norm": 1.4303633951040533, "learning_rate": 2.7631578947368424e-06, "loss": 0.0858, "step": 42 }, { "epoch": 0.05672823218997362, "grad_norm": 1.000990164023815, "learning_rate": 2.828947368421053e-06, "loss": 0.0823, "step": 43 }, { "epoch": 0.05804749340369393, "grad_norm": 1.5872577094636717, "learning_rate": 2.8947368421052634e-06, "loss": 0.0958, "step": 44 }, { "epoch": 0.059366754617414245, "grad_norm": 1.7841530289241443, "learning_rate": 2.960526315789474e-06, "loss": 0.1239, "step": 45 }, { "epoch": 0.06068601583113457, "grad_norm": 1.1766620869883984, "learning_rate": 3.0263157894736843e-06, "loss": 0.0697, "step": 46 }, { "epoch": 0.06200527704485488, "grad_norm": 1.2853170690028022, "learning_rate": 3.092105263157895e-06, "loss": 0.0963, "step": 47 }, { "epoch": 0.0633245382585752, "grad_norm": 1.4709634028779943, "learning_rate": 3.157894736842105e-06, "loss": 0.0915, "step": 48 }, { "epoch": 0.06464379947229551, "grad_norm": 1.3745315046203157, "learning_rate": 3.223684210526316e-06, "loss": 0.0799, "step": 49 }, { "epoch": 0.06596306068601583, "grad_norm": 1.4761871536724172, "learning_rate": 3.289473684210527e-06, "loss": 0.1046, "step": 50 }, { "epoch": 0.06728232189973615, "grad_norm": 1.5105153887101839, "learning_rate": 3.355263157894737e-06, "loss": 0.0853, "step": 51 }, { "epoch": 0.06860158311345646, "grad_norm": 1.3535267919848646, "learning_rate": 3.421052631578948e-06, "loss": 0.1319, "step": 52 }, { "epoch": 0.06992084432717678, "grad_norm": 1.6687947303259212, "learning_rate": 3.486842105263158e-06, "loss": 0.1055, "step": 53 }, { "epoch": 0.0712401055408971, "grad_norm": 3.6501802247402027, "learning_rate": 3.5526315789473687e-06, "loss": 0.1094, "step": 54 }, { "epoch": 0.07255936675461741, "grad_norm": 1.3210848086773013, "learning_rate": 3.618421052631579e-06, "loss": 0.1088, "step": 55 }, { "epoch": 0.07387862796833773, "grad_norm": 1.0101400792927315, "learning_rate": 3.6842105263157896e-06, "loss": 0.0763, "step": 56 }, { "epoch": 0.07519788918205805, "grad_norm": 1.3009728579134852, "learning_rate": 3.7500000000000005e-06, "loss": 0.0922, "step": 57 }, { "epoch": 0.07651715039577836, "grad_norm": 1.5671514737535006, "learning_rate": 3.815789473684211e-06, "loss": 0.0967, "step": 58 }, { "epoch": 0.07783641160949868, "grad_norm": 1.0883160796639246, "learning_rate": 3.8815789473684214e-06, "loss": 0.1078, "step": 59 }, { "epoch": 0.079155672823219, "grad_norm": 2.311711212772536, "learning_rate": 3.947368421052632e-06, "loss": 0.0964, "step": 60 }, { "epoch": 0.08047493403693931, "grad_norm": 1.6181560133721102, "learning_rate": 4.013157894736842e-06, "loss": 0.093, "step": 61 }, { "epoch": 0.08179419525065963, "grad_norm": 1.4364617838539906, "learning_rate": 4.078947368421053e-06, "loss": 0.0803, "step": 62 }, { "epoch": 0.08311345646437995, "grad_norm": 1.49841077642279, "learning_rate": 4.144736842105263e-06, "loss": 0.105, "step": 63 }, { "epoch": 0.08443271767810026, "grad_norm": 1.449891300704642, "learning_rate": 4.210526315789474e-06, "loss": 0.0702, "step": 64 }, { "epoch": 0.08575197889182058, "grad_norm": 1.8407494276748049, "learning_rate": 4.276315789473684e-06, "loss": 0.1237, "step": 65 }, { "epoch": 0.0870712401055409, "grad_norm": 1.4445786143372672, "learning_rate": 4.342105263157895e-06, "loss": 0.0861, "step": 66 }, { "epoch": 0.08839050131926121, "grad_norm": 1.5820793398923942, "learning_rate": 4.407894736842105e-06, "loss": 0.1063, "step": 67 }, { "epoch": 0.08970976253298153, "grad_norm": 1.6266178158804763, "learning_rate": 4.473684210526316e-06, "loss": 0.1281, "step": 68 }, { "epoch": 0.09102902374670185, "grad_norm": 1.921098471296673, "learning_rate": 4.539473684210527e-06, "loss": 0.1121, "step": 69 }, { "epoch": 0.09234828496042216, "grad_norm": 1.3652318289138008, "learning_rate": 4.605263157894737e-06, "loss": 0.1167, "step": 70 }, { "epoch": 0.09366754617414248, "grad_norm": 1.9511646011862807, "learning_rate": 4.671052631578948e-06, "loss": 0.0977, "step": 71 }, { "epoch": 0.09498680738786279, "grad_norm": 1.3555845548236822, "learning_rate": 4.736842105263158e-06, "loss": 0.0911, "step": 72 }, { "epoch": 0.09630606860158311, "grad_norm": 2.2259057577872445, "learning_rate": 4.802631578947369e-06, "loss": 0.1279, "step": 73 }, { "epoch": 0.09762532981530343, "grad_norm": 1.3888531130344683, "learning_rate": 4.8684210526315795e-06, "loss": 0.0784, "step": 74 }, { "epoch": 0.09894459102902374, "grad_norm": 1.3960319673373207, "learning_rate": 4.9342105263157895e-06, "loss": 0.0978, "step": 75 }, { "epoch": 0.10026385224274406, "grad_norm": 1.7054291543895856, "learning_rate": 5e-06, "loss": 0.0984, "step": 76 }, { "epoch": 0.10158311345646438, "grad_norm": 1.6458987247042498, "learning_rate": 5.0657894736842104e-06, "loss": 0.0999, "step": 77 }, { "epoch": 0.10290237467018469, "grad_norm": 1.6822123202405193, "learning_rate": 5.131578947368422e-06, "loss": 0.0922, "step": 78 }, { "epoch": 0.10422163588390501, "grad_norm": 1.5932180454716234, "learning_rate": 5.197368421052632e-06, "loss": 0.1223, "step": 79 }, { "epoch": 0.10554089709762533, "grad_norm": 1.8249077406901284, "learning_rate": 5.263157894736842e-06, "loss": 0.0975, "step": 80 }, { "epoch": 0.10686015831134564, "grad_norm": 1.6365604083770662, "learning_rate": 5.328947368421054e-06, "loss": 0.129, "step": 81 }, { "epoch": 0.10817941952506596, "grad_norm": 1.9363957711801008, "learning_rate": 5.394736842105264e-06, "loss": 0.1022, "step": 82 }, { "epoch": 0.10949868073878628, "grad_norm": 1.1568619049418536, "learning_rate": 5.460526315789474e-06, "loss": 0.0687, "step": 83 }, { "epoch": 0.11081794195250659, "grad_norm": 1.5128549314706825, "learning_rate": 5.526315789473685e-06, "loss": 0.1096, "step": 84 }, { "epoch": 0.11213720316622691, "grad_norm": 1.416923068463139, "learning_rate": 5.592105263157896e-06, "loss": 0.1052, "step": 85 }, { "epoch": 0.11345646437994723, "grad_norm": 1.423800222860217, "learning_rate": 5.657894736842106e-06, "loss": 0.0986, "step": 86 }, { "epoch": 0.11477572559366754, "grad_norm": 1.4869568465395704, "learning_rate": 5.723684210526316e-06, "loss": 0.0906, "step": 87 }, { "epoch": 0.11609498680738786, "grad_norm": 1.302701356412605, "learning_rate": 5.789473684210527e-06, "loss": 0.09, "step": 88 }, { "epoch": 0.11741424802110818, "grad_norm": 1.2568159914563075, "learning_rate": 5.855263157894738e-06, "loss": 0.0988, "step": 89 }, { "epoch": 0.11873350923482849, "grad_norm": 1.1819489308351352, "learning_rate": 5.921052631578948e-06, "loss": 0.0816, "step": 90 }, { "epoch": 0.12005277044854881, "grad_norm": 0.9550864379939853, "learning_rate": 5.9868421052631585e-06, "loss": 0.073, "step": 91 }, { "epoch": 0.12137203166226913, "grad_norm": 1.4611228215909369, "learning_rate": 6.0526315789473685e-06, "loss": 0.0889, "step": 92 }, { "epoch": 0.12269129287598944, "grad_norm": 1.0971390981785767, "learning_rate": 6.118421052631579e-06, "loss": 0.1041, "step": 93 }, { "epoch": 0.12401055408970976, "grad_norm": 1.364124444234249, "learning_rate": 6.18421052631579e-06, "loss": 0.0837, "step": 94 }, { "epoch": 0.12532981530343007, "grad_norm": 1.5344985712619694, "learning_rate": 6.25e-06, "loss": 0.0793, "step": 95 }, { "epoch": 0.1266490765171504, "grad_norm": 1.1050557737138007, "learning_rate": 6.31578947368421e-06, "loss": 0.1012, "step": 96 }, { "epoch": 0.1279683377308707, "grad_norm": 1.6313639186233246, "learning_rate": 6.381578947368422e-06, "loss": 0.0725, "step": 97 }, { "epoch": 0.12928759894459102, "grad_norm": 1.0300085400623038, "learning_rate": 6.447368421052632e-06, "loss": 0.0874, "step": 98 }, { "epoch": 0.13060686015831136, "grad_norm": 1.4468801863848761, "learning_rate": 6.513157894736842e-06, "loss": 0.077, "step": 99 }, { "epoch": 0.13192612137203166, "grad_norm": 1.245638712405433, "learning_rate": 6.578947368421054e-06, "loss": 0.0933, "step": 100 }, { "epoch": 0.13324538258575197, "grad_norm": 1.5693953291777372, "learning_rate": 6.644736842105264e-06, "loss": 0.1102, "step": 101 }, { "epoch": 0.1345646437994723, "grad_norm": 1.283076950435831, "learning_rate": 6.710526315789474e-06, "loss": 0.0988, "step": 102 }, { "epoch": 0.1358839050131926, "grad_norm": 1.350859235672517, "learning_rate": 6.776315789473686e-06, "loss": 0.085, "step": 103 }, { "epoch": 0.13720316622691292, "grad_norm": 1.0796020601602057, "learning_rate": 6.842105263157896e-06, "loss": 0.099, "step": 104 }, { "epoch": 0.13852242744063326, "grad_norm": 1.1275227482126249, "learning_rate": 6.907894736842106e-06, "loss": 0.0797, "step": 105 }, { "epoch": 0.13984168865435356, "grad_norm": 1.0832151473779532, "learning_rate": 6.973684210526316e-06, "loss": 0.0918, "step": 106 }, { "epoch": 0.14116094986807387, "grad_norm": 0.7930455753517637, "learning_rate": 7.0394736842105274e-06, "loss": 0.0644, "step": 107 }, { "epoch": 0.1424802110817942, "grad_norm": 1.592218242485485, "learning_rate": 7.1052631578947375e-06, "loss": 0.1265, "step": 108 }, { "epoch": 0.1437994722955145, "grad_norm": 1.3389622863855597, "learning_rate": 7.1710526315789475e-06, "loss": 0.0718, "step": 109 }, { "epoch": 0.14511873350923482, "grad_norm": 1.544338181296543, "learning_rate": 7.236842105263158e-06, "loss": 0.1003, "step": 110 }, { "epoch": 0.14643799472295516, "grad_norm": 1.349874808615941, "learning_rate": 7.302631578947369e-06, "loss": 0.1069, "step": 111 }, { "epoch": 0.14775725593667546, "grad_norm": 1.49277474685825, "learning_rate": 7.368421052631579e-06, "loss": 0.1045, "step": 112 }, { "epoch": 0.14907651715039577, "grad_norm": 1.8999681992305248, "learning_rate": 7.43421052631579e-06, "loss": 0.1392, "step": 113 }, { "epoch": 0.1503957783641161, "grad_norm": 1.3082439341312861, "learning_rate": 7.500000000000001e-06, "loss": 0.0947, "step": 114 }, { "epoch": 0.1517150395778364, "grad_norm": 0.8488265412383513, "learning_rate": 7.565789473684211e-06, "loss": 0.0716, "step": 115 }, { "epoch": 0.15303430079155672, "grad_norm": 0.9426471120997849, "learning_rate": 7.631578947368423e-06, "loss": 0.0793, "step": 116 }, { "epoch": 0.15435356200527706, "grad_norm": 0.9809029094053667, "learning_rate": 7.697368421052632e-06, "loss": 0.0829, "step": 117 }, { "epoch": 0.15567282321899736, "grad_norm": 1.2026525607016512, "learning_rate": 7.763157894736843e-06, "loss": 0.0948, "step": 118 }, { "epoch": 0.15699208443271767, "grad_norm": 1.3972003502648243, "learning_rate": 7.828947368421054e-06, "loss": 0.0802, "step": 119 }, { "epoch": 0.158311345646438, "grad_norm": 1.464974484595394, "learning_rate": 7.894736842105265e-06, "loss": 0.1252, "step": 120 }, { "epoch": 0.15963060686015831, "grad_norm": 1.3659579209534767, "learning_rate": 7.960526315789474e-06, "loss": 0.0882, "step": 121 }, { "epoch": 0.16094986807387862, "grad_norm": 0.8868074475530271, "learning_rate": 8.026315789473685e-06, "loss": 0.066, "step": 122 }, { "epoch": 0.16226912928759896, "grad_norm": 1.2621289694628735, "learning_rate": 8.092105263157896e-06, "loss": 0.0687, "step": 123 }, { "epoch": 0.16358839050131926, "grad_norm": 1.6829617306054463, "learning_rate": 8.157894736842106e-06, "loss": 0.1144, "step": 124 }, { "epoch": 0.16490765171503957, "grad_norm": 1.251837581274231, "learning_rate": 8.223684210526316e-06, "loss": 0.0881, "step": 125 }, { "epoch": 0.1662269129287599, "grad_norm": 0.8602941113686816, "learning_rate": 8.289473684210526e-06, "loss": 0.0743, "step": 126 }, { "epoch": 0.16754617414248021, "grad_norm": 1.0273660667252307, "learning_rate": 8.355263157894737e-06, "loss": 0.08, "step": 127 }, { "epoch": 0.16886543535620052, "grad_norm": 2.6198303125098352, "learning_rate": 8.421052631578948e-06, "loss": 0.0983, "step": 128 }, { "epoch": 0.17018469656992086, "grad_norm": 1.0880312381207564, "learning_rate": 8.486842105263159e-06, "loss": 0.0698, "step": 129 }, { "epoch": 0.17150395778364116, "grad_norm": 1.9176206928976167, "learning_rate": 8.552631578947368e-06, "loss": 0.095, "step": 130 }, { "epoch": 0.17282321899736147, "grad_norm": 4.856631037789346, "learning_rate": 8.61842105263158e-06, "loss": 0.0972, "step": 131 }, { "epoch": 0.1741424802110818, "grad_norm": 1.3189375702822577, "learning_rate": 8.68421052631579e-06, "loss": 0.1083, "step": 132 }, { "epoch": 0.17546174142480211, "grad_norm": 1.348920100799652, "learning_rate": 8.750000000000001e-06, "loss": 0.0801, "step": 133 }, { "epoch": 0.17678100263852242, "grad_norm": 1.0662674243220835, "learning_rate": 8.81578947368421e-06, "loss": 0.085, "step": 134 }, { "epoch": 0.17810026385224276, "grad_norm": 1.8683103052234746, "learning_rate": 8.881578947368423e-06, "loss": 0.0812, "step": 135 }, { "epoch": 0.17941952506596306, "grad_norm": 0.9975688194192199, "learning_rate": 8.947368421052632e-06, "loss": 0.0693, "step": 136 }, { "epoch": 0.18073878627968337, "grad_norm": 0.9103667124063366, "learning_rate": 9.013157894736843e-06, "loss": 0.0751, "step": 137 }, { "epoch": 0.1820580474934037, "grad_norm": 1.8372221347381417, "learning_rate": 9.078947368421054e-06, "loss": 0.1082, "step": 138 }, { "epoch": 0.18337730870712401, "grad_norm": 1.0083599090393338, "learning_rate": 9.144736842105264e-06, "loss": 0.106, "step": 139 }, { "epoch": 0.18469656992084432, "grad_norm": 0.9763025849661937, "learning_rate": 9.210526315789474e-06, "loss": 0.0893, "step": 140 }, { "epoch": 0.18601583113456466, "grad_norm": 2.6732971208195955, "learning_rate": 9.276315789473686e-06, "loss": 0.1015, "step": 141 }, { "epoch": 0.18733509234828497, "grad_norm": 1.3827420544593714, "learning_rate": 9.342105263157895e-06, "loss": 0.1079, "step": 142 }, { "epoch": 0.18865435356200527, "grad_norm": 7.163906695650369, "learning_rate": 9.407894736842106e-06, "loss": 0.1498, "step": 143 }, { "epoch": 0.18997361477572558, "grad_norm": 1.363322395020731, "learning_rate": 9.473684210526315e-06, "loss": 0.0799, "step": 144 }, { "epoch": 0.19129287598944592, "grad_norm": 0.9848860707778788, "learning_rate": 9.539473684210528e-06, "loss": 0.0714, "step": 145 }, { "epoch": 0.19261213720316622, "grad_norm": 0.7355035178684772, "learning_rate": 9.605263157894737e-06, "loss": 0.0639, "step": 146 }, { "epoch": 0.19393139841688653, "grad_norm": 0.9708590815947932, "learning_rate": 9.671052631578948e-06, "loss": 0.0829, "step": 147 }, { "epoch": 0.19525065963060687, "grad_norm": 1.3176260473068497, "learning_rate": 9.736842105263159e-06, "loss": 0.0869, "step": 148 }, { "epoch": 0.19656992084432717, "grad_norm": 0.8290463498340741, "learning_rate": 9.80263157894737e-06, "loss": 0.075, "step": 149 }, { "epoch": 0.19788918205804748, "grad_norm": 1.2224418207646102, "learning_rate": 9.868421052631579e-06, "loss": 0.1049, "step": 150 }, { "epoch": 0.19920844327176782, "grad_norm": 0.9383534609793346, "learning_rate": 9.93421052631579e-06, "loss": 0.0693, "step": 151 }, { "epoch": 0.20052770448548812, "grad_norm": 1.157120293265899, "learning_rate": 1e-05, "loss": 0.1242, "step": 152 }, { "epoch": 0.20184696569920843, "grad_norm": 1.3848254542503755, "learning_rate": 9.9999867379451e-06, "loss": 0.0949, "step": 153 }, { "epoch": 0.20316622691292877, "grad_norm": 1.1050363308018347, "learning_rate": 9.99994695185075e-06, "loss": 0.0681, "step": 154 }, { "epoch": 0.20448548812664907, "grad_norm": 1.241266481405317, "learning_rate": 9.999880641928008e-06, "loss": 0.1013, "step": 155 }, { "epoch": 0.20580474934036938, "grad_norm": 1.2502664498399243, "learning_rate": 9.999787808528639e-06, "loss": 0.0958, "step": 156 }, { "epoch": 0.20712401055408972, "grad_norm": 0.8902305137007472, "learning_rate": 9.999668452145104e-06, "loss": 0.0707, "step": 157 }, { "epoch": 0.20844327176781002, "grad_norm": 1.2565435303569972, "learning_rate": 9.99952257341057e-06, "loss": 0.0855, "step": 158 }, { "epoch": 0.20976253298153033, "grad_norm": 1.0108162794824818, "learning_rate": 9.999350173098899e-06, "loss": 0.0828, "step": 159 }, { "epoch": 0.21108179419525067, "grad_norm": 1.1432130463466625, "learning_rate": 9.999151252124639e-06, "loss": 0.0788, "step": 160 }, { "epoch": 0.21240105540897097, "grad_norm": 1.2307082448027162, "learning_rate": 9.998925811543034e-06, "loss": 0.073, "step": 161 }, { "epoch": 0.21372031662269128, "grad_norm": 1.1691071584736277, "learning_rate": 9.998673852550007e-06, "loss": 0.103, "step": 162 }, { "epoch": 0.21503957783641162, "grad_norm": 0.9809626759661192, "learning_rate": 9.998395376482152e-06, "loss": 0.0682, "step": 163 }, { "epoch": 0.21635883905013192, "grad_norm": 0.903420296034603, "learning_rate": 9.99809038481674e-06, "loss": 0.0756, "step": 164 }, { "epoch": 0.21767810026385223, "grad_norm": 1.2846822797227462, "learning_rate": 9.997758879171693e-06, "loss": 0.0923, "step": 165 }, { "epoch": 0.21899736147757257, "grad_norm": 1.0264204749663686, "learning_rate": 9.99740086130559e-06, "loss": 0.0953, "step": 166 }, { "epoch": 0.22031662269129287, "grad_norm": 1.0371481360943675, "learning_rate": 9.997016333117655e-06, "loss": 0.0652, "step": 167 }, { "epoch": 0.22163588390501318, "grad_norm": 1.2651198967501744, "learning_rate": 9.996605296647737e-06, "loss": 0.1112, "step": 168 }, { "epoch": 0.22295514511873352, "grad_norm": 0.9017358797708437, "learning_rate": 9.996167754076315e-06, "loss": 0.0621, "step": 169 }, { "epoch": 0.22427440633245382, "grad_norm": 1.3003546894370093, "learning_rate": 9.995703707724474e-06, "loss": 0.1046, "step": 170 }, { "epoch": 0.22559366754617413, "grad_norm": 1.2408727983936052, "learning_rate": 9.995213160053897e-06, "loss": 0.0983, "step": 171 }, { "epoch": 0.22691292875989447, "grad_norm": 0.8514473822455528, "learning_rate": 9.99469611366685e-06, "loss": 0.0803, "step": 172 }, { "epoch": 0.22823218997361477, "grad_norm": 0.8677470665047625, "learning_rate": 9.994152571306174e-06, "loss": 0.0669, "step": 173 }, { "epoch": 0.22955145118733508, "grad_norm": 1.0781370854799601, "learning_rate": 9.993582535855265e-06, "loss": 0.0831, "step": 174 }, { "epoch": 0.23087071240105542, "grad_norm": 1.3878995049892582, "learning_rate": 9.992986010338058e-06, "loss": 0.1166, "step": 175 }, { "epoch": 0.23218997361477572, "grad_norm": 1.2107428243400042, "learning_rate": 9.992362997919016e-06, "loss": 0.1049, "step": 176 }, { "epoch": 0.23350923482849603, "grad_norm": 0.7146543623664033, "learning_rate": 9.991713501903107e-06, "loss": 0.0581, "step": 177 }, { "epoch": 0.23482849604221637, "grad_norm": 1.05069811104466, "learning_rate": 9.991037525735794e-06, "loss": 0.0736, "step": 178 }, { "epoch": 0.23614775725593667, "grad_norm": 0.8007759874947845, "learning_rate": 9.99033507300301e-06, "loss": 0.06, "step": 179 }, { "epoch": 0.23746701846965698, "grad_norm": 1.79713660450814, "learning_rate": 9.98960614743114e-06, "loss": 0.1229, "step": 180 }, { "epoch": 0.23878627968337732, "grad_norm": 0.912606187507227, "learning_rate": 9.988850752887006e-06, "loss": 0.0773, "step": 181 }, { "epoch": 0.24010554089709762, "grad_norm": 0.9167662097707882, "learning_rate": 9.988068893377841e-06, "loss": 0.0643, "step": 182 }, { "epoch": 0.24142480211081793, "grad_norm": 1.3792149016555213, "learning_rate": 9.987260573051268e-06, "loss": 0.0754, "step": 183 }, { "epoch": 0.24274406332453827, "grad_norm": 1.1294403281169605, "learning_rate": 9.986425796195287e-06, "loss": 0.1082, "step": 184 }, { "epoch": 0.24406332453825857, "grad_norm": 1.375642418993936, "learning_rate": 9.985564567238237e-06, "loss": 0.0977, "step": 185 }, { "epoch": 0.24538258575197888, "grad_norm": 1.1064905057054284, "learning_rate": 9.984676890748787e-06, "loss": 0.088, "step": 186 }, { "epoch": 0.24670184696569922, "grad_norm": 0.9950645396905647, "learning_rate": 9.983762771435902e-06, "loss": 0.0897, "step": 187 }, { "epoch": 0.24802110817941952, "grad_norm": 0.7705270623146346, "learning_rate": 9.98282221414882e-06, "loss": 0.0749, "step": 188 }, { "epoch": 0.24934036939313983, "grad_norm": 2.201638940290167, "learning_rate": 9.981855223877032e-06, "loss": 0.1012, "step": 189 }, { "epoch": 0.25065963060686014, "grad_norm": 1.2712247018236318, "learning_rate": 9.98086180575025e-06, "loss": 0.0923, "step": 190 }, { "epoch": 0.2519788918205805, "grad_norm": 0.6249886222978469, "learning_rate": 9.97984196503838e-06, "loss": 0.0609, "step": 191 }, { "epoch": 0.2532981530343008, "grad_norm": 1.5723383799027848, "learning_rate": 9.978795707151492e-06, "loss": 0.0854, "step": 192 }, { "epoch": 0.2546174142480211, "grad_norm": 0.957499710618973, "learning_rate": 9.9777230376398e-06, "loss": 0.082, "step": 193 }, { "epoch": 0.2559366754617414, "grad_norm": 1.4768620196656903, "learning_rate": 9.976623962193627e-06, "loss": 0.0923, "step": 194 }, { "epoch": 0.25725593667546176, "grad_norm": 1.5999528442765696, "learning_rate": 9.975498486643368e-06, "loss": 0.0696, "step": 195 }, { "epoch": 0.25857519788918204, "grad_norm": 1.235675356279894, "learning_rate": 9.974346616959476e-06, "loss": 0.0816, "step": 196 }, { "epoch": 0.2598944591029024, "grad_norm": 1.1903571766318424, "learning_rate": 9.973168359252411e-06, "loss": 0.0753, "step": 197 }, { "epoch": 0.2612137203166227, "grad_norm": 1.0319046665463454, "learning_rate": 9.971963719772621e-06, "loss": 0.0802, "step": 198 }, { "epoch": 0.262532981530343, "grad_norm": 0.9113306304093047, "learning_rate": 9.970732704910504e-06, "loss": 0.0768, "step": 199 }, { "epoch": 0.2638522427440633, "grad_norm": 1.552482915540632, "learning_rate": 9.969475321196374e-06, "loss": 0.1028, "step": 200 }, { "epoch": 0.26517150395778366, "grad_norm": 0.8279365731599866, "learning_rate": 9.968191575300428e-06, "loss": 0.0699, "step": 201 }, { "epoch": 0.26649076517150394, "grad_norm": 1.023073346353744, "learning_rate": 9.966881474032711e-06, "loss": 0.0644, "step": 202 }, { "epoch": 0.2678100263852243, "grad_norm": 0.8921330416372578, "learning_rate": 9.965545024343076e-06, "loss": 0.0734, "step": 203 }, { "epoch": 0.2691292875989446, "grad_norm": 0.8293631684614154, "learning_rate": 9.96418223332115e-06, "loss": 0.0866, "step": 204 }, { "epoch": 0.2704485488126649, "grad_norm": 1.2587903865990873, "learning_rate": 9.962793108196299e-06, "loss": 0.0901, "step": 205 }, { "epoch": 0.2717678100263852, "grad_norm": 1.2148007735564763, "learning_rate": 9.961377656337579e-06, "loss": 0.0771, "step": 206 }, { "epoch": 0.27308707124010556, "grad_norm": 1.264408003966606, "learning_rate": 9.959935885253715e-06, "loss": 0.0703, "step": 207 }, { "epoch": 0.27440633245382584, "grad_norm": 1.446028223338931, "learning_rate": 9.958467802593046e-06, "loss": 0.088, "step": 208 }, { "epoch": 0.2757255936675462, "grad_norm": 0.7220366729686982, "learning_rate": 9.956973416143488e-06, "loss": 0.0815, "step": 209 }, { "epoch": 0.2770448548812665, "grad_norm": 1.5607728210271763, "learning_rate": 9.955452733832493e-06, "loss": 0.1029, "step": 210 }, { "epoch": 0.2783641160949868, "grad_norm": 1.4020322542800785, "learning_rate": 9.953905763727015e-06, "loss": 0.0666, "step": 211 }, { "epoch": 0.2796833773087071, "grad_norm": 0.7076793659310983, "learning_rate": 9.952332514033449e-06, "loss": 0.0688, "step": 212 }, { "epoch": 0.28100263852242746, "grad_norm": 1.2502152245372904, "learning_rate": 9.950732993097608e-06, "loss": 0.0955, "step": 213 }, { "epoch": 0.28232189973614774, "grad_norm": 0.6863402755600805, "learning_rate": 9.949107209404664e-06, "loss": 0.0664, "step": 214 }, { "epoch": 0.2836411609498681, "grad_norm": 0.9324176689524081, "learning_rate": 9.947455171579112e-06, "loss": 0.0792, "step": 215 }, { "epoch": 0.2849604221635884, "grad_norm": 0.7850777734100792, "learning_rate": 9.94577688838472e-06, "loss": 0.0811, "step": 216 }, { "epoch": 0.2862796833773087, "grad_norm": 0.8058284364958739, "learning_rate": 9.944072368724476e-06, "loss": 0.0722, "step": 217 }, { "epoch": 0.287598944591029, "grad_norm": 0.969649949605035, "learning_rate": 9.942341621640558e-06, "loss": 0.0755, "step": 218 }, { "epoch": 0.28891820580474936, "grad_norm": 0.9927927695753133, "learning_rate": 9.940584656314269e-06, "loss": 0.0826, "step": 219 }, { "epoch": 0.29023746701846964, "grad_norm": 0.7580345939150698, "learning_rate": 9.938801482065998e-06, "loss": 0.0704, "step": 220 }, { "epoch": 0.29155672823219, "grad_norm": 0.5935527535305033, "learning_rate": 9.936992108355168e-06, "loss": 0.062, "step": 221 }, { "epoch": 0.2928759894459103, "grad_norm": 0.7799550966222378, "learning_rate": 9.935156544780183e-06, "loss": 0.0715, "step": 222 }, { "epoch": 0.2941952506596306, "grad_norm": 0.6585324473995096, "learning_rate": 9.93329480107838e-06, "loss": 0.0644, "step": 223 }, { "epoch": 0.2955145118733509, "grad_norm": 0.7062926953294116, "learning_rate": 9.93140688712598e-06, "loss": 0.0682, "step": 224 }, { "epoch": 0.29683377308707126, "grad_norm": 0.6994497926974289, "learning_rate": 9.929492812938028e-06, "loss": 0.0646, "step": 225 }, { "epoch": 0.29815303430079154, "grad_norm": 0.9870519318595599, "learning_rate": 9.92755258866835e-06, "loss": 0.088, "step": 226 }, { "epoch": 0.2994722955145119, "grad_norm": 0.7685429820565252, "learning_rate": 9.925586224609489e-06, "loss": 0.0533, "step": 227 }, { "epoch": 0.3007915567282322, "grad_norm": 0.7643317383642257, "learning_rate": 9.923593731192655e-06, "loss": 0.0626, "step": 228 }, { "epoch": 0.3021108179419525, "grad_norm": 0.6293974246135788, "learning_rate": 9.921575118987672e-06, "loss": 0.0599, "step": 229 }, { "epoch": 0.3034300791556728, "grad_norm": 1.1719158706259716, "learning_rate": 9.919530398702917e-06, "loss": 0.0766, "step": 230 }, { "epoch": 0.30474934036939316, "grad_norm": 1.1836452278232157, "learning_rate": 9.917459581185269e-06, "loss": 0.1078, "step": 231 }, { "epoch": 0.30606860158311344, "grad_norm": 0.7456849787237535, "learning_rate": 9.915362677420045e-06, "loss": 0.0594, "step": 232 }, { "epoch": 0.3073878627968338, "grad_norm": 1.0171580488977425, "learning_rate": 9.913239698530947e-06, "loss": 0.0764, "step": 233 }, { "epoch": 0.3087071240105541, "grad_norm": 0.7216218185691871, "learning_rate": 9.911090655779997e-06, "loss": 0.056, "step": 234 }, { "epoch": 0.3100263852242744, "grad_norm": 1.0205408613448763, "learning_rate": 9.90891556056749e-06, "loss": 0.0697, "step": 235 }, { "epoch": 0.3113456464379947, "grad_norm": 0.6787788419289598, "learning_rate": 9.906714424431914e-06, "loss": 0.0515, "step": 236 }, { "epoch": 0.31266490765171506, "grad_norm": 0.6511400074009881, "learning_rate": 9.904487259049907e-06, "loss": 0.052, "step": 237 }, { "epoch": 0.31398416886543534, "grad_norm": 0.9175979604887738, "learning_rate": 9.902234076236182e-06, "loss": 0.0712, "step": 238 }, { "epoch": 0.3153034300791557, "grad_norm": 1.0786583925042368, "learning_rate": 9.899954887943474e-06, "loss": 0.0937, "step": 239 }, { "epoch": 0.316622691292876, "grad_norm": 1.1109202209669133, "learning_rate": 9.897649706262474e-06, "loss": 0.0746, "step": 240 }, { "epoch": 0.3179419525065963, "grad_norm": 1.344033170501885, "learning_rate": 9.895318543421755e-06, "loss": 0.1053, "step": 241 }, { "epoch": 0.31926121372031663, "grad_norm": 0.6626492354910761, "learning_rate": 9.892961411787725e-06, "loss": 0.0623, "step": 242 }, { "epoch": 0.32058047493403696, "grad_norm": 1.3153644580390524, "learning_rate": 9.890578323864546e-06, "loss": 0.0823, "step": 243 }, { "epoch": 0.32189973614775724, "grad_norm": 0.9897293095755729, "learning_rate": 9.888169292294077e-06, "loss": 0.0986, "step": 244 }, { "epoch": 0.3232189973614776, "grad_norm": 0.9305565118911147, "learning_rate": 9.885734329855798e-06, "loss": 0.089, "step": 245 }, { "epoch": 0.3245382585751979, "grad_norm": 0.9362582559092152, "learning_rate": 9.883273449466755e-06, "loss": 0.0924, "step": 246 }, { "epoch": 0.3258575197889182, "grad_norm": 0.6789391312855251, "learning_rate": 9.880786664181477e-06, "loss": 0.0625, "step": 247 }, { "epoch": 0.32717678100263853, "grad_norm": 0.7753222910069528, "learning_rate": 9.87827398719192e-06, "loss": 0.072, "step": 248 }, { "epoch": 0.32849604221635886, "grad_norm": 0.9497710959608149, "learning_rate": 9.875735431827386e-06, "loss": 0.1043, "step": 249 }, { "epoch": 0.32981530343007914, "grad_norm": 0.9888364262598249, "learning_rate": 9.87317101155446e-06, "loss": 0.0684, "step": 250 }, { "epoch": 0.3311345646437995, "grad_norm": 0.7512692600971639, "learning_rate": 9.870580739976936e-06, "loss": 0.0771, "step": 251 }, { "epoch": 0.3324538258575198, "grad_norm": 1.3729675639562857, "learning_rate": 9.867964630835742e-06, "loss": 0.1031, "step": 252 }, { "epoch": 0.3337730870712401, "grad_norm": 0.8564354149718731, "learning_rate": 9.865322698008873e-06, "loss": 0.0948, "step": 253 }, { "epoch": 0.33509234828496043, "grad_norm": 0.6605521047813258, "learning_rate": 9.862654955511309e-06, "loss": 0.0592, "step": 254 }, { "epoch": 0.33641160949868076, "grad_norm": 0.8601197600270915, "learning_rate": 9.859961417494952e-06, "loss": 0.0741, "step": 255 }, { "epoch": 0.33773087071240104, "grad_norm": 0.9894110282228973, "learning_rate": 9.857242098248543e-06, "loss": 0.0858, "step": 256 }, { "epoch": 0.3390501319261214, "grad_norm": 1.4242673666601409, "learning_rate": 9.854497012197581e-06, "loss": 0.0827, "step": 257 }, { "epoch": 0.3403693931398417, "grad_norm": 1.0206885764543667, "learning_rate": 9.851726173904264e-06, "loss": 0.0864, "step": 258 }, { "epoch": 0.341688654353562, "grad_norm": 0.9123864354174962, "learning_rate": 9.848929598067393e-06, "loss": 0.0595, "step": 259 }, { "epoch": 0.34300791556728233, "grad_norm": 1.0914805753448291, "learning_rate": 9.846107299522305e-06, "loss": 0.0741, "step": 260 }, { "epoch": 0.34432717678100266, "grad_norm": 0.7658161529334118, "learning_rate": 9.843259293240793e-06, "loss": 0.0553, "step": 261 }, { "epoch": 0.34564643799472294, "grad_norm": 1.7776327682313273, "learning_rate": 9.840385594331022e-06, "loss": 0.0866, "step": 262 }, { "epoch": 0.3469656992084433, "grad_norm": 0.8366360091521257, "learning_rate": 9.837486218037453e-06, "loss": 0.0688, "step": 263 }, { "epoch": 0.3482849604221636, "grad_norm": 1.0370760066220694, "learning_rate": 9.834561179740763e-06, "loss": 0.066, "step": 264 }, { "epoch": 0.3496042216358839, "grad_norm": 0.735064894973884, "learning_rate": 9.831610494957756e-06, "loss": 0.0701, "step": 265 }, { "epoch": 0.35092348284960423, "grad_norm": 0.5842610061663901, "learning_rate": 9.828634179341292e-06, "loss": 0.0609, "step": 266 }, { "epoch": 0.35224274406332456, "grad_norm": 0.9027630240588085, "learning_rate": 9.825632248680195e-06, "loss": 0.0632, "step": 267 }, { "epoch": 0.35356200527704484, "grad_norm": 0.9391224059433742, "learning_rate": 9.82260471889917e-06, "loss": 0.0632, "step": 268 }, { "epoch": 0.3548812664907652, "grad_norm": 0.5654562995248813, "learning_rate": 9.819551606058729e-06, "loss": 0.0731, "step": 269 }, { "epoch": 0.3562005277044855, "grad_norm": 0.7487458340440738, "learning_rate": 9.816472926355087e-06, "loss": 0.0739, "step": 270 }, { "epoch": 0.3575197889182058, "grad_norm": 0.951138507364624, "learning_rate": 9.813368696120095e-06, "loss": 0.0623, "step": 271 }, { "epoch": 0.35883905013192613, "grad_norm": 0.5810859250822077, "learning_rate": 9.810238931821139e-06, "loss": 0.0604, "step": 272 }, { "epoch": 0.36015831134564646, "grad_norm": 0.7591306012531617, "learning_rate": 9.807083650061063e-06, "loss": 0.0717, "step": 273 }, { "epoch": 0.36147757255936674, "grad_norm": 0.7553326022985923, "learning_rate": 9.803902867578075e-06, "loss": 0.0748, "step": 274 }, { "epoch": 0.3627968337730871, "grad_norm": 0.8149927472285278, "learning_rate": 9.800696601245658e-06, "loss": 0.0869, "step": 275 }, { "epoch": 0.3641160949868074, "grad_norm": 0.7554080086493412, "learning_rate": 9.797464868072489e-06, "loss": 0.0882, "step": 276 }, { "epoch": 0.3654353562005277, "grad_norm": 0.9805029259857, "learning_rate": 9.79420768520233e-06, "loss": 0.086, "step": 277 }, { "epoch": 0.36675461741424803, "grad_norm": 0.5320046970882769, "learning_rate": 9.790925069913962e-06, "loss": 0.0556, "step": 278 }, { "epoch": 0.36807387862796836, "grad_norm": 1.0328295566370667, "learning_rate": 9.787617039621071e-06, "loss": 0.0562, "step": 279 }, { "epoch": 0.36939313984168864, "grad_norm": 0.7291324833094329, "learning_rate": 9.78428361187217e-06, "loss": 0.0753, "step": 280 }, { "epoch": 0.370712401055409, "grad_norm": 0.8601042782278124, "learning_rate": 9.7809248043505e-06, "loss": 0.0891, "step": 281 }, { "epoch": 0.3720316622691293, "grad_norm": 0.9846921049495289, "learning_rate": 9.777540634873939e-06, "loss": 0.0742, "step": 282 }, { "epoch": 0.3733509234828496, "grad_norm": 0.5770895688506988, "learning_rate": 9.774131121394898e-06, "loss": 0.0506, "step": 283 }, { "epoch": 0.37467018469656993, "grad_norm": 0.7478778631896724, "learning_rate": 9.770696282000245e-06, "loss": 0.0618, "step": 284 }, { "epoch": 0.3759894459102902, "grad_norm": 0.9950610396799793, "learning_rate": 9.767236134911188e-06, "loss": 0.0978, "step": 285 }, { "epoch": 0.37730870712401055, "grad_norm": 0.5646364519415039, "learning_rate": 9.763750698483192e-06, "loss": 0.0597, "step": 286 }, { "epoch": 0.3786279683377309, "grad_norm": 0.9558965606066078, "learning_rate": 9.760239991205878e-06, "loss": 0.0748, "step": 287 }, { "epoch": 0.37994722955145116, "grad_norm": 0.6806130282946865, "learning_rate": 9.756704031702919e-06, "loss": 0.0665, "step": 288 }, { "epoch": 0.3812664907651715, "grad_norm": 1.3837371430943852, "learning_rate": 9.753142838731956e-06, "loss": 0.0704, "step": 289 }, { "epoch": 0.38258575197889183, "grad_norm": 1.3991857061373745, "learning_rate": 9.74955643118448e-06, "loss": 0.1142, "step": 290 }, { "epoch": 0.3839050131926121, "grad_norm": 0.598821404550848, "learning_rate": 9.745944828085747e-06, "loss": 0.06, "step": 291 }, { "epoch": 0.38522427440633245, "grad_norm": 0.8680968549739181, "learning_rate": 9.742308048594665e-06, "loss": 0.0668, "step": 292 }, { "epoch": 0.3865435356200528, "grad_norm": 0.8256824842529236, "learning_rate": 9.738646112003705e-06, "loss": 0.0786, "step": 293 }, { "epoch": 0.38786279683377306, "grad_norm": 1.1696409634800806, "learning_rate": 9.734959037738788e-06, "loss": 0.0728, "step": 294 }, { "epoch": 0.3891820580474934, "grad_norm": 1.027580435542134, "learning_rate": 9.731246845359187e-06, "loss": 0.0728, "step": 295 }, { "epoch": 0.39050131926121373, "grad_norm": 0.6566278180751907, "learning_rate": 9.727509554557416e-06, "loss": 0.0788, "step": 296 }, { "epoch": 0.391820580474934, "grad_norm": 1.085693108648712, "learning_rate": 9.723747185159146e-06, "loss": 0.0689, "step": 297 }, { "epoch": 0.39313984168865435, "grad_norm": 0.8493527436632392, "learning_rate": 9.719959757123073e-06, "loss": 0.0643, "step": 298 }, { "epoch": 0.3944591029023747, "grad_norm": 0.6028556111386613, "learning_rate": 9.716147290540826e-06, "loss": 0.056, "step": 299 }, { "epoch": 0.39577836411609496, "grad_norm": 1.0034076205039018, "learning_rate": 9.712309805636863e-06, "loss": 0.0764, "step": 300 }, { "epoch": 0.3970976253298153, "grad_norm": 0.7798822407009329, "learning_rate": 9.708447322768361e-06, "loss": 0.0549, "step": 301 }, { "epoch": 0.39841688654353563, "grad_norm": 0.6552064477046319, "learning_rate": 9.704559862425101e-06, "loss": 0.0652, "step": 302 }, { "epoch": 0.3997361477572559, "grad_norm": 0.745247993577414, "learning_rate": 9.70064744522937e-06, "loss": 0.0602, "step": 303 }, { "epoch": 0.40105540897097625, "grad_norm": 0.9204575136344225, "learning_rate": 9.696710091935842e-06, "loss": 0.0702, "step": 304 }, { "epoch": 0.4023746701846966, "grad_norm": 1.1638098645253576, "learning_rate": 9.692747823431477e-06, "loss": 0.0696, "step": 305 }, { "epoch": 0.40369393139841686, "grad_norm": 0.796062028286607, "learning_rate": 9.688760660735403e-06, "loss": 0.0548, "step": 306 }, { "epoch": 0.4050131926121372, "grad_norm": 0.7029833454447884, "learning_rate": 9.68474862499881e-06, "loss": 0.0612, "step": 307 }, { "epoch": 0.40633245382585753, "grad_norm": 0.8614661145454562, "learning_rate": 9.680711737504832e-06, "loss": 0.072, "step": 308 }, { "epoch": 0.4076517150395778, "grad_norm": 0.9475143767728101, "learning_rate": 9.676650019668437e-06, "loss": 0.06, "step": 309 }, { "epoch": 0.40897097625329815, "grad_norm": 0.6977230732332008, "learning_rate": 9.672563493036318e-06, "loss": 0.0668, "step": 310 }, { "epoch": 0.4102902374670185, "grad_norm": 0.7219201569812285, "learning_rate": 9.668452179286769e-06, "loss": 0.069, "step": 311 }, { "epoch": 0.41160949868073876, "grad_norm": 0.8876890045434614, "learning_rate": 9.664316100229578e-06, "loss": 0.0717, "step": 312 }, { "epoch": 0.4129287598944591, "grad_norm": 0.740844881980912, "learning_rate": 9.66015527780591e-06, "loss": 0.0766, "step": 313 }, { "epoch": 0.41424802110817943, "grad_norm": 0.4778241518339464, "learning_rate": 9.655969734088184e-06, "loss": 0.05, "step": 314 }, { "epoch": 0.4155672823218997, "grad_norm": 0.7347326135730948, "learning_rate": 9.651759491279966e-06, "loss": 0.0669, "step": 315 }, { "epoch": 0.41688654353562005, "grad_norm": 1.1284858033400227, "learning_rate": 9.647524571715843e-06, "loss": 0.0758, "step": 316 }, { "epoch": 0.4182058047493404, "grad_norm": 0.6981388862851322, "learning_rate": 9.643264997861312e-06, "loss": 0.0613, "step": 317 }, { "epoch": 0.41952506596306066, "grad_norm": 0.6905887533373853, "learning_rate": 9.638980792312651e-06, "loss": 0.0569, "step": 318 }, { "epoch": 0.420844327176781, "grad_norm": 0.6460144104037705, "learning_rate": 9.63467197779681e-06, "loss": 0.0579, "step": 319 }, { "epoch": 0.42216358839050133, "grad_norm": 0.9556398511536974, "learning_rate": 9.630338577171282e-06, "loss": 0.0707, "step": 320 }, { "epoch": 0.4234828496042216, "grad_norm": 0.5522455835999988, "learning_rate": 9.625980613423985e-06, "loss": 0.055, "step": 321 }, { "epoch": 0.42480211081794195, "grad_norm": 1.7072118244107584, "learning_rate": 9.621598109673142e-06, "loss": 0.062, "step": 322 }, { "epoch": 0.4261213720316623, "grad_norm": 1.1522322988168188, "learning_rate": 9.617191089167155e-06, "loss": 0.0759, "step": 323 }, { "epoch": 0.42744063324538256, "grad_norm": 0.8347923880408826, "learning_rate": 9.612759575284483e-06, "loss": 0.075, "step": 324 }, { "epoch": 0.4287598944591029, "grad_norm": 1.313688139966534, "learning_rate": 9.608303591533516e-06, "loss": 0.0947, "step": 325 }, { "epoch": 0.43007915567282323, "grad_norm": 0.6733121421395389, "learning_rate": 9.603823161552459e-06, "loss": 0.0716, "step": 326 }, { "epoch": 0.4313984168865435, "grad_norm": 1.0558426500828677, "learning_rate": 9.599318309109191e-06, "loss": 0.0523, "step": 327 }, { "epoch": 0.43271767810026385, "grad_norm": 0.9638966469393847, "learning_rate": 9.594789058101154e-06, "loss": 0.0991, "step": 328 }, { "epoch": 0.4340369393139842, "grad_norm": 0.7507598600974192, "learning_rate": 9.590235432555219e-06, "loss": 0.1072, "step": 329 }, { "epoch": 0.43535620052770446, "grad_norm": 0.8808166305074641, "learning_rate": 9.585657456627557e-06, "loss": 0.1099, "step": 330 }, { "epoch": 0.4366754617414248, "grad_norm": 0.7479798613785579, "learning_rate": 9.581055154603517e-06, "loss": 0.082, "step": 331 }, { "epoch": 0.43799472295514513, "grad_norm": 0.6138270782142422, "learning_rate": 9.57642855089749e-06, "loss": 0.0602, "step": 332 }, { "epoch": 0.4393139841688654, "grad_norm": 0.5998863088241339, "learning_rate": 9.571777670052786e-06, "loss": 0.0669, "step": 333 }, { "epoch": 0.44063324538258575, "grad_norm": 1.0086143123332068, "learning_rate": 9.567102536741501e-06, "loss": 0.0615, "step": 334 }, { "epoch": 0.4419525065963061, "grad_norm": 0.8530962897236515, "learning_rate": 9.562403175764383e-06, "loss": 0.069, "step": 335 }, { "epoch": 0.44327176781002636, "grad_norm": 1.1309365259307091, "learning_rate": 9.557679612050708e-06, "loss": 0.0685, "step": 336 }, { "epoch": 0.4445910290237467, "grad_norm": 1.0140895037768696, "learning_rate": 9.552931870658136e-06, "loss": 0.0947, "step": 337 }, { "epoch": 0.44591029023746703, "grad_norm": 0.7953631852283257, "learning_rate": 9.548159976772593e-06, "loss": 0.0845, "step": 338 }, { "epoch": 0.4472295514511873, "grad_norm": 0.7841669983334505, "learning_rate": 9.543363955708124e-06, "loss": 0.0448, "step": 339 }, { "epoch": 0.44854881266490765, "grad_norm": 0.695290594399394, "learning_rate": 9.538543832906773e-06, "loss": 0.0571, "step": 340 }, { "epoch": 0.449868073878628, "grad_norm": 0.8762689209221858, "learning_rate": 9.533699633938425e-06, "loss": 0.0558, "step": 341 }, { "epoch": 0.45118733509234826, "grad_norm": 0.6259802212079215, "learning_rate": 9.528831384500699e-06, "loss": 0.0561, "step": 342 }, { "epoch": 0.4525065963060686, "grad_norm": 0.7451459063031873, "learning_rate": 9.523939110418789e-06, "loss": 0.0827, "step": 343 }, { "epoch": 0.45382585751978893, "grad_norm": 0.6913467923736692, "learning_rate": 9.519022837645337e-06, "loss": 0.0687, "step": 344 }, { "epoch": 0.4551451187335092, "grad_norm": 0.7169029001507153, "learning_rate": 9.514082592260299e-06, "loss": 0.0621, "step": 345 }, { "epoch": 0.45646437994722955, "grad_norm": 0.582058305192275, "learning_rate": 9.509118400470792e-06, "loss": 0.0575, "step": 346 }, { "epoch": 0.4577836411609499, "grad_norm": 0.6825579966883129, "learning_rate": 9.504130288610972e-06, "loss": 0.0662, "step": 347 }, { "epoch": 0.45910290237467016, "grad_norm": 0.5876062939595589, "learning_rate": 9.499118283141887e-06, "loss": 0.0442, "step": 348 }, { "epoch": 0.4604221635883905, "grad_norm": 0.7863259636475942, "learning_rate": 9.494082410651331e-06, "loss": 0.0774, "step": 349 }, { "epoch": 0.46174142480211083, "grad_norm": 0.6211367792942487, "learning_rate": 9.48902269785371e-06, "loss": 0.0653, "step": 350 }, { "epoch": 0.4630606860158311, "grad_norm": 0.6740261695203442, "learning_rate": 9.4839391715899e-06, "loss": 0.0825, "step": 351 }, { "epoch": 0.46437994722955145, "grad_norm": 1.043113360246788, "learning_rate": 9.478831858827105e-06, "loss": 0.0795, "step": 352 }, { "epoch": 0.4656992084432718, "grad_norm": 0.7359044485292054, "learning_rate": 9.473700786658706e-06, "loss": 0.0726, "step": 353 }, { "epoch": 0.46701846965699206, "grad_norm": 0.6909110207632071, "learning_rate": 9.468545982304132e-06, "loss": 0.0731, "step": 354 }, { "epoch": 0.4683377308707124, "grad_norm": 0.5814197718154829, "learning_rate": 9.463367473108697e-06, "loss": 0.0613, "step": 355 }, { "epoch": 0.46965699208443273, "grad_norm": 0.9262695641177886, "learning_rate": 9.458165286543477e-06, "loss": 0.0952, "step": 356 }, { "epoch": 0.470976253298153, "grad_norm": 0.6127534132821949, "learning_rate": 9.452939450205139e-06, "loss": 0.0727, "step": 357 }, { "epoch": 0.47229551451187335, "grad_norm": 0.5642022396225125, "learning_rate": 9.447689991815819e-06, "loss": 0.061, "step": 358 }, { "epoch": 0.4736147757255937, "grad_norm": 0.5652422361321967, "learning_rate": 9.442416939222956e-06, "loss": 0.0659, "step": 359 }, { "epoch": 0.47493403693931396, "grad_norm": 0.6652258751925895, "learning_rate": 9.437120320399158e-06, "loss": 0.0965, "step": 360 }, { "epoch": 0.4762532981530343, "grad_norm": 0.6901963164548323, "learning_rate": 9.431800163442043e-06, "loss": 0.0692, "step": 361 }, { "epoch": 0.47757255936675463, "grad_norm": 0.571898610656191, "learning_rate": 9.426456496574095e-06, "loss": 0.0509, "step": 362 }, { "epoch": 0.4788918205804749, "grad_norm": 0.5449257166951276, "learning_rate": 9.421089348142519e-06, "loss": 0.0411, "step": 363 }, { "epoch": 0.48021108179419525, "grad_norm": 0.9021664212076399, "learning_rate": 9.41569874661908e-06, "loss": 0.0785, "step": 364 }, { "epoch": 0.4815303430079156, "grad_norm": 0.8750083598482614, "learning_rate": 9.410284720599958e-06, "loss": 0.0867, "step": 365 }, { "epoch": 0.48284960422163586, "grad_norm": 0.6011018833711832, "learning_rate": 9.4048472988056e-06, "loss": 0.068, "step": 366 }, { "epoch": 0.4841688654353562, "grad_norm": 0.7520079528217786, "learning_rate": 9.39938651008056e-06, "loss": 0.0822, "step": 367 }, { "epoch": 0.48548812664907653, "grad_norm": 0.6250727259259536, "learning_rate": 9.393902383393347e-06, "loss": 0.0741, "step": 368 }, { "epoch": 0.4868073878627968, "grad_norm": 0.6953888116558605, "learning_rate": 9.388394947836278e-06, "loss": 0.058, "step": 369 }, { "epoch": 0.48812664907651715, "grad_norm": 0.7831220569741709, "learning_rate": 9.382864232625321e-06, "loss": 0.0747, "step": 370 }, { "epoch": 0.4894459102902375, "grad_norm": 0.8773155442663596, "learning_rate": 9.377310267099932e-06, "loss": 0.0943, "step": 371 }, { "epoch": 0.49076517150395776, "grad_norm": 0.7336952231337915, "learning_rate": 9.371733080722911e-06, "loss": 0.0553, "step": 372 }, { "epoch": 0.4920844327176781, "grad_norm": 0.8627759942056888, "learning_rate": 9.366132703080239e-06, "loss": 0.0574, "step": 373 }, { "epoch": 0.49340369393139843, "grad_norm": 0.8103532619274963, "learning_rate": 9.36050916388092e-06, "loss": 0.0554, "step": 374 }, { "epoch": 0.4947229551451187, "grad_norm": 0.741887593849594, "learning_rate": 9.354862492956831e-06, "loss": 0.069, "step": 375 }, { "epoch": 0.49604221635883905, "grad_norm": 0.7855112695865729, "learning_rate": 9.349192720262556e-06, "loss": 0.0661, "step": 376 }, { "epoch": 0.4973614775725594, "grad_norm": 0.6866863581677422, "learning_rate": 9.343499875875226e-06, "loss": 0.0594, "step": 377 }, { "epoch": 0.49868073878627966, "grad_norm": 0.8782759603345595, "learning_rate": 9.337783989994371e-06, "loss": 0.0772, "step": 378 }, { "epoch": 0.5, "grad_norm": 0.9549895199126721, "learning_rate": 9.332045092941747e-06, "loss": 0.068, "step": 379 }, { "epoch": 0.5013192612137203, "grad_norm": 0.5448660341904454, "learning_rate": 9.326283215161177e-06, "loss": 0.0667, "step": 380 }, { "epoch": 0.5026385224274407, "grad_norm": 0.5985824770826613, "learning_rate": 9.320498387218404e-06, "loss": 0.0664, "step": 381 }, { "epoch": 0.503957783641161, "grad_norm": 0.752830871271958, "learning_rate": 9.314690639800906e-06, "loss": 0.0645, "step": 382 }, { "epoch": 0.5052770448548812, "grad_norm": 0.5477776553953697, "learning_rate": 9.308860003717748e-06, "loss": 0.0603, "step": 383 }, { "epoch": 0.5065963060686016, "grad_norm": 0.6139407974821701, "learning_rate": 9.30300650989942e-06, "loss": 0.0877, "step": 384 }, { "epoch": 0.5079155672823219, "grad_norm": 0.6039560183881748, "learning_rate": 9.297130189397661e-06, "loss": 0.0519, "step": 385 }, { "epoch": 0.5092348284960422, "grad_norm": 0.6664238974836402, "learning_rate": 9.291231073385306e-06, "loss": 0.0809, "step": 386 }, { "epoch": 0.5105540897097626, "grad_norm": 0.6488779162272155, "learning_rate": 9.285309193156118e-06, "loss": 0.0745, "step": 387 }, { "epoch": 0.5118733509234829, "grad_norm": 0.8710463478016437, "learning_rate": 9.279364580124615e-06, "loss": 0.0653, "step": 388 }, { "epoch": 0.5131926121372031, "grad_norm": 0.7282940066695746, "learning_rate": 9.273397265825909e-06, "loss": 0.0706, "step": 389 }, { "epoch": 0.5145118733509235, "grad_norm": 0.6377222079463343, "learning_rate": 9.267407281915541e-06, "loss": 0.0566, "step": 390 }, { "epoch": 0.5158311345646438, "grad_norm": 0.5717894561053849, "learning_rate": 9.261394660169311e-06, "loss": 0.0481, "step": 391 }, { "epoch": 0.5171503957783641, "grad_norm": 0.6051053509035249, "learning_rate": 9.255359432483106e-06, "loss": 0.042, "step": 392 }, { "epoch": 0.5184696569920845, "grad_norm": 1.0265587669123954, "learning_rate": 9.249301630872735e-06, "loss": 0.0728, "step": 393 }, { "epoch": 0.5197889182058048, "grad_norm": 0.9466700337352391, "learning_rate": 9.243221287473755e-06, "loss": 0.0736, "step": 394 }, { "epoch": 0.521108179419525, "grad_norm": 1.1721998085842662, "learning_rate": 9.237118434541308e-06, "loss": 0.0639, "step": 395 }, { "epoch": 0.5224274406332454, "grad_norm": 0.964741992189812, "learning_rate": 9.23099310444994e-06, "loss": 0.0621, "step": 396 }, { "epoch": 0.5237467018469657, "grad_norm": 0.6970255221602126, "learning_rate": 9.224845329693434e-06, "loss": 0.079, "step": 397 }, { "epoch": 0.525065963060686, "grad_norm": 1.3823551560613605, "learning_rate": 9.218675142884648e-06, "loss": 0.0795, "step": 398 }, { "epoch": 0.5263852242744064, "grad_norm": 1.4558190810564504, "learning_rate": 9.212482576755318e-06, "loss": 0.067, "step": 399 }, { "epoch": 0.5277044854881267, "grad_norm": 0.6341626113530675, "learning_rate": 9.206267664155906e-06, "loss": 0.0647, "step": 400 }, { "epoch": 0.5290237467018469, "grad_norm": 0.7316499535743366, "learning_rate": 9.20003043805542e-06, "loss": 0.0535, "step": 401 }, { "epoch": 0.5303430079155673, "grad_norm": 0.7680943343177051, "learning_rate": 9.19377093154123e-06, "loss": 0.0679, "step": 402 }, { "epoch": 0.5316622691292876, "grad_norm": 1.2180641513033668, "learning_rate": 9.187489177818907e-06, "loss": 0.0716, "step": 403 }, { "epoch": 0.5329815303430079, "grad_norm": 0.7701251269926012, "learning_rate": 9.181185210212034e-06, "loss": 0.0826, "step": 404 }, { "epoch": 0.5343007915567283, "grad_norm": 1.0054417805604758, "learning_rate": 9.174859062162037e-06, "loss": 0.0723, "step": 405 }, { "epoch": 0.5356200527704486, "grad_norm": 0.9830147127682102, "learning_rate": 9.168510767228008e-06, "loss": 0.071, "step": 406 }, { "epoch": 0.5369393139841688, "grad_norm": 0.6662121630599811, "learning_rate": 9.162140359086515e-06, "loss": 0.0562, "step": 407 }, { "epoch": 0.5382585751978892, "grad_norm": 0.8657347202295601, "learning_rate": 9.155747871531444e-06, "loss": 0.0699, "step": 408 }, { "epoch": 0.5395778364116095, "grad_norm": 0.8713898850191355, "learning_rate": 9.149333338473803e-06, "loss": 0.0526, "step": 409 }, { "epoch": 0.5408970976253298, "grad_norm": 0.8321596239400251, "learning_rate": 9.142896793941546e-06, "loss": 0.0743, "step": 410 }, { "epoch": 0.5422163588390502, "grad_norm": 0.6268650641625824, "learning_rate": 9.136438272079397e-06, "loss": 0.0449, "step": 411 }, { "epoch": 0.5435356200527705, "grad_norm": 1.0046732601631165, "learning_rate": 9.129957807148666e-06, "loss": 0.069, "step": 412 }, { "epoch": 0.5448548812664907, "grad_norm": 0.7417313448011015, "learning_rate": 9.123455433527063e-06, "loss": 0.0605, "step": 413 }, { "epoch": 0.5461741424802111, "grad_norm": 0.5862198883042264, "learning_rate": 9.116931185708523e-06, "loss": 0.0565, "step": 414 }, { "epoch": 0.5474934036939314, "grad_norm": 0.5663121991718124, "learning_rate": 9.110385098303021e-06, "loss": 0.0626, "step": 415 }, { "epoch": 0.5488126649076517, "grad_norm": 0.513372498718201, "learning_rate": 9.103817206036383e-06, "loss": 0.0501, "step": 416 }, { "epoch": 0.5501319261213721, "grad_norm": 0.7663231048740375, "learning_rate": 9.097227543750109e-06, "loss": 0.0952, "step": 417 }, { "epoch": 0.5514511873350924, "grad_norm": 0.6760552525944711, "learning_rate": 9.090616146401183e-06, "loss": 0.0577, "step": 418 }, { "epoch": 0.5527704485488126, "grad_norm": 0.5317840409418032, "learning_rate": 9.083983049061893e-06, "loss": 0.0469, "step": 419 }, { "epoch": 0.554089709762533, "grad_norm": 0.8851719428502758, "learning_rate": 9.077328286919638e-06, "loss": 0.0617, "step": 420 }, { "epoch": 0.5554089709762533, "grad_norm": 0.7497996032541763, "learning_rate": 9.070651895276748e-06, "loss": 0.0714, "step": 421 }, { "epoch": 0.5567282321899736, "grad_norm": 0.6498492053332823, "learning_rate": 9.063953909550289e-06, "loss": 0.0538, "step": 422 }, { "epoch": 0.558047493403694, "grad_norm": 0.44622429830036786, "learning_rate": 9.057234365271886e-06, "loss": 0.0521, "step": 423 }, { "epoch": 0.5593667546174143, "grad_norm": 0.5512561169864484, "learning_rate": 9.050493298087523e-06, "loss": 0.054, "step": 424 }, { "epoch": 0.5606860158311345, "grad_norm": 0.631404862974141, "learning_rate": 9.043730743757362e-06, "loss": 0.0764, "step": 425 }, { "epoch": 0.5620052770448549, "grad_norm": 0.5870072097930791, "learning_rate": 9.036946738155548e-06, "loss": 0.0595, "step": 426 }, { "epoch": 0.5633245382585752, "grad_norm": 0.5040142704734094, "learning_rate": 9.030141317270026e-06, "loss": 0.0556, "step": 427 }, { "epoch": 0.5646437994722955, "grad_norm": 0.7403758206591379, "learning_rate": 9.023314517202341e-06, "loss": 0.0655, "step": 428 }, { "epoch": 0.5659630606860159, "grad_norm": 0.6611426759437872, "learning_rate": 9.016466374167451e-06, "loss": 0.0667, "step": 429 }, { "epoch": 0.5672823218997362, "grad_norm": 0.9850776098945953, "learning_rate": 9.009596924493536e-06, "loss": 0.0742, "step": 430 }, { "epoch": 0.5686015831134564, "grad_norm": 0.6785243477097369, "learning_rate": 9.002706204621802e-06, "loss": 0.0619, "step": 431 }, { "epoch": 0.5699208443271768, "grad_norm": 0.7190003125330587, "learning_rate": 8.995794251106295e-06, "loss": 0.0858, "step": 432 }, { "epoch": 0.5712401055408971, "grad_norm": 0.779155867242513, "learning_rate": 8.988861100613695e-06, "loss": 0.0894, "step": 433 }, { "epoch": 0.5725593667546174, "grad_norm": 0.821896375590076, "learning_rate": 8.98190678992313e-06, "loss": 0.0629, "step": 434 }, { "epoch": 0.5738786279683378, "grad_norm": 0.5076050481256884, "learning_rate": 8.974931355925983e-06, "loss": 0.0481, "step": 435 }, { "epoch": 0.575197889182058, "grad_norm": 0.5488701803801146, "learning_rate": 8.96793483562569e-06, "loss": 0.0495, "step": 436 }, { "epoch": 0.5765171503957783, "grad_norm": 0.4926498135199434, "learning_rate": 8.96091726613754e-06, "loss": 0.051, "step": 437 }, { "epoch": 0.5778364116094987, "grad_norm": 0.7305868797856492, "learning_rate": 8.953878684688492e-06, "loss": 0.0781, "step": 438 }, { "epoch": 0.579155672823219, "grad_norm": 0.7928116961537923, "learning_rate": 8.946819128616973e-06, "loss": 0.0798, "step": 439 }, { "epoch": 0.5804749340369393, "grad_norm": 0.8287924226908049, "learning_rate": 8.939738635372664e-06, "loss": 0.05, "step": 440 }, { "epoch": 0.5817941952506597, "grad_norm": 0.5086476050371508, "learning_rate": 8.932637242516325e-06, "loss": 0.0484, "step": 441 }, { "epoch": 0.58311345646438, "grad_norm": 0.44896912424993907, "learning_rate": 8.92551498771958e-06, "loss": 0.0535, "step": 442 }, { "epoch": 0.5844327176781002, "grad_norm": 0.7427267049471321, "learning_rate": 8.91837190876472e-06, "loss": 0.0448, "step": 443 }, { "epoch": 0.5857519788918206, "grad_norm": 0.4077238431271433, "learning_rate": 8.911208043544513e-06, "loss": 0.0404, "step": 444 }, { "epoch": 0.5870712401055409, "grad_norm": 0.8279103277542957, "learning_rate": 8.904023430061981e-06, "loss": 0.0839, "step": 445 }, { "epoch": 0.5883905013192612, "grad_norm": 0.5893492543117536, "learning_rate": 8.896818106430225e-06, "loss": 0.0607, "step": 446 }, { "epoch": 0.5897097625329816, "grad_norm": 0.6819067552196123, "learning_rate": 8.889592110872203e-06, "loss": 0.0674, "step": 447 }, { "epoch": 0.5910290237467019, "grad_norm": 0.4468129220866856, "learning_rate": 8.882345481720533e-06, "loss": 0.0404, "step": 448 }, { "epoch": 0.5923482849604221, "grad_norm": 0.595551681922815, "learning_rate": 8.875078257417294e-06, "loss": 0.0801, "step": 449 }, { "epoch": 0.5936675461741425, "grad_norm": 0.9018494496257856, "learning_rate": 8.867790476513818e-06, "loss": 0.0665, "step": 450 }, { "epoch": 0.5949868073878628, "grad_norm": 0.5943979685711214, "learning_rate": 8.860482177670482e-06, "loss": 0.0461, "step": 451 }, { "epoch": 0.5963060686015831, "grad_norm": 0.4301614688278578, "learning_rate": 8.853153399656513e-06, "loss": 0.0391, "step": 452 }, { "epoch": 0.5976253298153035, "grad_norm": 0.6967315070272085, "learning_rate": 8.845804181349773e-06, "loss": 0.077, "step": 453 }, { "epoch": 0.5989445910290238, "grad_norm": 0.4953833355499116, "learning_rate": 8.838434561736556e-06, "loss": 0.0566, "step": 454 }, { "epoch": 0.600263852242744, "grad_norm": 0.5907548455305766, "learning_rate": 8.831044579911383e-06, "loss": 0.0655, "step": 455 }, { "epoch": 0.6015831134564644, "grad_norm": 0.6236652449819667, "learning_rate": 8.823634275076792e-06, "loss": 0.0554, "step": 456 }, { "epoch": 0.6029023746701847, "grad_norm": 0.4168923412952401, "learning_rate": 8.816203686543128e-06, "loss": 0.0337, "step": 457 }, { "epoch": 0.604221635883905, "grad_norm": 0.48071330924907624, "learning_rate": 8.808752853728341e-06, "loss": 0.0492, "step": 458 }, { "epoch": 0.6055408970976254, "grad_norm": 0.4842702889425807, "learning_rate": 8.801281816157776e-06, "loss": 0.0515, "step": 459 }, { "epoch": 0.6068601583113457, "grad_norm": 0.7877007315164113, "learning_rate": 8.793790613463956e-06, "loss": 0.072, "step": 460 }, { "epoch": 0.6081794195250659, "grad_norm": 0.4597573113919865, "learning_rate": 8.786279285386374e-06, "loss": 0.0507, "step": 461 }, { "epoch": 0.6094986807387863, "grad_norm": 0.7295743451125444, "learning_rate": 8.778747871771293e-06, "loss": 0.092, "step": 462 }, { "epoch": 0.6108179419525066, "grad_norm": 0.6380120848265783, "learning_rate": 8.771196412571516e-06, "loss": 0.0754, "step": 463 }, { "epoch": 0.6121372031662269, "grad_norm": 0.6151625612709314, "learning_rate": 8.763624947846195e-06, "loss": 0.0492, "step": 464 }, { "epoch": 0.6134564643799473, "grad_norm": 0.6104995830766858, "learning_rate": 8.756033517760601e-06, "loss": 0.0594, "step": 465 }, { "epoch": 0.6147757255936676, "grad_norm": 0.4662314682875708, "learning_rate": 8.748422162585915e-06, "loss": 0.0405, "step": 466 }, { "epoch": 0.6160949868073878, "grad_norm": 0.7037103770745874, "learning_rate": 8.740790922699024e-06, "loss": 0.0681, "step": 467 }, { "epoch": 0.6174142480211082, "grad_norm": 0.46215628548577664, "learning_rate": 8.733139838582299e-06, "loss": 0.0396, "step": 468 }, { "epoch": 0.6187335092348285, "grad_norm": 0.5102031881535187, "learning_rate": 8.725468950823378e-06, "loss": 0.0515, "step": 469 }, { "epoch": 0.6200527704485488, "grad_norm": 0.4651826535622034, "learning_rate": 8.717778300114952e-06, "loss": 0.0419, "step": 470 }, { "epoch": 0.6213720316622692, "grad_norm": 1.1325412075126133, "learning_rate": 8.710067927254555e-06, "loss": 0.0792, "step": 471 }, { "epoch": 0.6226912928759895, "grad_norm": 0.45867525224339245, "learning_rate": 8.702337873144343e-06, "loss": 0.0492, "step": 472 }, { "epoch": 0.6240105540897097, "grad_norm": 0.5504256616903217, "learning_rate": 8.694588178790876e-06, "loss": 0.0392, "step": 473 }, { "epoch": 0.6253298153034301, "grad_norm": 0.9430488090364006, "learning_rate": 8.686818885304907e-06, "loss": 0.0693, "step": 474 }, { "epoch": 0.6266490765171504, "grad_norm": 0.46564772689942074, "learning_rate": 8.679030033901148e-06, "loss": 0.0528, "step": 475 }, { "epoch": 0.6279683377308707, "grad_norm": 0.6909276166774045, "learning_rate": 8.671221665898074e-06, "loss": 0.0705, "step": 476 }, { "epoch": 0.6292875989445911, "grad_norm": 0.8418254861723139, "learning_rate": 8.663393822717686e-06, "loss": 0.0593, "step": 477 }, { "epoch": 0.6306068601583114, "grad_norm": 0.6295040305026375, "learning_rate": 8.655546545885294e-06, "loss": 0.0634, "step": 478 }, { "epoch": 0.6319261213720316, "grad_norm": 0.6645364650357778, "learning_rate": 8.64767987702931e-06, "loss": 0.0476, "step": 479 }, { "epoch": 0.633245382585752, "grad_norm": 0.6135679991723683, "learning_rate": 8.63979385788101e-06, "loss": 0.0454, "step": 480 }, { "epoch": 0.6345646437994723, "grad_norm": 0.812727393806555, "learning_rate": 8.631888530274321e-06, "loss": 0.0791, "step": 481 }, { "epoch": 0.6358839050131926, "grad_norm": 0.5166039238114825, "learning_rate": 8.6239639361456e-06, "loss": 0.0489, "step": 482 }, { "epoch": 0.637203166226913, "grad_norm": 0.6158141240963521, "learning_rate": 8.616020117533406e-06, "loss": 0.0428, "step": 483 }, { "epoch": 0.6385224274406333, "grad_norm": 0.8250113594001276, "learning_rate": 8.608057116578283e-06, "loss": 0.0712, "step": 484 }, { "epoch": 0.6398416886543535, "grad_norm": 0.7451289676544177, "learning_rate": 8.600074975522534e-06, "loss": 0.06, "step": 485 }, { "epoch": 0.6411609498680739, "grad_norm": 0.5326398128654674, "learning_rate": 8.592073736709996e-06, "loss": 0.0603, "step": 486 }, { "epoch": 0.6424802110817942, "grad_norm": 0.5021499758399166, "learning_rate": 8.584053442585816e-06, "loss": 0.0505, "step": 487 }, { "epoch": 0.6437994722955145, "grad_norm": 0.6708410508573724, "learning_rate": 8.576014135696227e-06, "loss": 0.0507, "step": 488 }, { "epoch": 0.6451187335092349, "grad_norm": 1.58512995612873, "learning_rate": 8.567955858688319e-06, "loss": 0.0901, "step": 489 }, { "epoch": 0.6464379947229552, "grad_norm": 0.9243327973050967, "learning_rate": 8.559878654309818e-06, "loss": 0.0812, "step": 490 }, { "epoch": 0.6477572559366754, "grad_norm": 0.575309292027881, "learning_rate": 8.551782565408857e-06, "loss": 0.0504, "step": 491 }, { "epoch": 0.6490765171503958, "grad_norm": 0.6994300970953274, "learning_rate": 8.543667634933743e-06, "loss": 0.0764, "step": 492 }, { "epoch": 0.6503957783641161, "grad_norm": 0.7106135291337119, "learning_rate": 8.535533905932739e-06, "loss": 0.0744, "step": 493 }, { "epoch": 0.6517150395778364, "grad_norm": 1.1670675909226174, "learning_rate": 8.52738142155383e-06, "loss": 0.0592, "step": 494 }, { "epoch": 0.6530343007915568, "grad_norm": 0.6123285898097686, "learning_rate": 8.519210225044491e-06, "loss": 0.0491, "step": 495 }, { "epoch": 0.6543535620052771, "grad_norm": 0.5445510412762957, "learning_rate": 8.511020359751467e-06, "loss": 0.0606, "step": 496 }, { "epoch": 0.6556728232189973, "grad_norm": 0.5133147307482108, "learning_rate": 8.502811869120537e-06, "loss": 0.0555, "step": 497 }, { "epoch": 0.6569920844327177, "grad_norm": 0.5722478197244066, "learning_rate": 8.49458479669628e-06, "loss": 0.057, "step": 498 }, { "epoch": 0.658311345646438, "grad_norm": 0.49913744387906306, "learning_rate": 8.486339186121852e-06, "loss": 0.0557, "step": 499 }, { "epoch": 0.6596306068601583, "grad_norm": 0.448299138636958, "learning_rate": 8.478075081138746e-06, "loss": 0.0501, "step": 500 }, { "epoch": 0.6609498680738787, "grad_norm": 0.45827849751503286, "learning_rate": 8.46979252558657e-06, "loss": 0.0444, "step": 501 }, { "epoch": 0.662269129287599, "grad_norm": 0.40298508876722344, "learning_rate": 8.461491563402807e-06, "loss": 0.0517, "step": 502 }, { "epoch": 0.6635883905013192, "grad_norm": 0.9725533609650583, "learning_rate": 8.453172238622582e-06, "loss": 0.0578, "step": 503 }, { "epoch": 0.6649076517150396, "grad_norm": 2.3295313624861302, "learning_rate": 8.444834595378434e-06, "loss": 0.0843, "step": 504 }, { "epoch": 0.6662269129287599, "grad_norm": 0.6248802177290632, "learning_rate": 8.436478677900073e-06, "loss": 0.0764, "step": 505 }, { "epoch": 0.6675461741424802, "grad_norm": 0.7879242217892278, "learning_rate": 8.428104530514156e-06, "loss": 0.0578, "step": 506 }, { "epoch": 0.6688654353562006, "grad_norm": 0.723177780469888, "learning_rate": 8.419712197644042e-06, "loss": 0.0565, "step": 507 }, { "epoch": 0.6701846965699209, "grad_norm": 0.7752616672345674, "learning_rate": 8.411301723809563e-06, "loss": 0.0611, "step": 508 }, { "epoch": 0.6715039577836411, "grad_norm": 0.5940661722658356, "learning_rate": 8.402873153626787e-06, "loss": 0.0468, "step": 509 }, { "epoch": 0.6728232189973615, "grad_norm": 0.5920695638139996, "learning_rate": 8.394426531807777e-06, "loss": 0.0534, "step": 510 }, { "epoch": 0.6741424802110818, "grad_norm": 0.7901406180432282, "learning_rate": 8.38596190316036e-06, "loss": 0.0744, "step": 511 }, { "epoch": 0.6754617414248021, "grad_norm": 0.8264412056071058, "learning_rate": 8.37747931258788e-06, "loss": 0.0664, "step": 512 }, { "epoch": 0.6767810026385225, "grad_norm": 0.5451316404401848, "learning_rate": 8.368978805088972e-06, "loss": 0.0428, "step": 513 }, { "epoch": 0.6781002638522428, "grad_norm": 0.6935044264705226, "learning_rate": 8.360460425757316e-06, "loss": 0.0567, "step": 514 }, { "epoch": 0.679419525065963, "grad_norm": 1.3586667420447256, "learning_rate": 8.351924219781393e-06, "loss": 0.0584, "step": 515 }, { "epoch": 0.6807387862796834, "grad_norm": 0.4889504771837496, "learning_rate": 8.34337023244426e-06, "loss": 0.0558, "step": 516 }, { "epoch": 0.6820580474934037, "grad_norm": 0.4663280409333404, "learning_rate": 8.3347985091233e-06, "loss": 0.0524, "step": 517 }, { "epoch": 0.683377308707124, "grad_norm": 0.6536313802749268, "learning_rate": 8.326209095289973e-06, "loss": 0.0708, "step": 518 }, { "epoch": 0.6846965699208444, "grad_norm": 0.5460048593573863, "learning_rate": 8.31760203650959e-06, "loss": 0.0574, "step": 519 }, { "epoch": 0.6860158311345647, "grad_norm": 0.5057746693426484, "learning_rate": 8.308977378441072e-06, "loss": 0.0535, "step": 520 }, { "epoch": 0.6873350923482849, "grad_norm": 0.5246741406807683, "learning_rate": 8.300335166836688e-06, "loss": 0.061, "step": 521 }, { "epoch": 0.6886543535620053, "grad_norm": 0.7714492708651828, "learning_rate": 8.291675447541834e-06, "loss": 0.0757, "step": 522 }, { "epoch": 0.6899736147757256, "grad_norm": 0.47313909010317895, "learning_rate": 8.282998266494781e-06, "loss": 0.0652, "step": 523 }, { "epoch": 0.6912928759894459, "grad_norm": 1.3924325907131665, "learning_rate": 8.274303669726427e-06, "loss": 0.0732, "step": 524 }, { "epoch": 0.6926121372031663, "grad_norm": 1.447236862207532, "learning_rate": 8.26559170336006e-06, "loss": 0.0932, "step": 525 }, { "epoch": 0.6939313984168866, "grad_norm": 0.7203129772030047, "learning_rate": 8.256862413611113e-06, "loss": 0.0671, "step": 526 }, { "epoch": 0.6952506596306068, "grad_norm": 0.5928412775645093, "learning_rate": 8.24811584678691e-06, "loss": 0.0667, "step": 527 }, { "epoch": 0.6965699208443272, "grad_norm": 0.43230530042462856, "learning_rate": 8.239352049286435e-06, "loss": 0.0398, "step": 528 }, { "epoch": 0.6978891820580475, "grad_norm": 0.5428941662684312, "learning_rate": 8.230571067600071e-06, "loss": 0.0499, "step": 529 }, { "epoch": 0.6992084432717678, "grad_norm": 0.6009844543596541, "learning_rate": 8.221772948309363e-06, "loss": 0.0486, "step": 530 }, { "epoch": 0.7005277044854882, "grad_norm": 0.6214551055901713, "learning_rate": 8.212957738086766e-06, "loss": 0.0586, "step": 531 }, { "epoch": 0.7018469656992085, "grad_norm": 0.45273005517181125, "learning_rate": 8.204125483695403e-06, "loss": 0.0482, "step": 532 }, { "epoch": 0.7031662269129287, "grad_norm": 0.5258680926988596, "learning_rate": 8.195276231988811e-06, "loss": 0.06, "step": 533 }, { "epoch": 0.7044854881266491, "grad_norm": 0.5853091007879238, "learning_rate": 8.186410029910694e-06, "loss": 0.0597, "step": 534 }, { "epoch": 0.7058047493403694, "grad_norm": 0.8812261465136906, "learning_rate": 8.177526924494675e-06, "loss": 0.064, "step": 535 }, { "epoch": 0.7071240105540897, "grad_norm": 0.4749522463294217, "learning_rate": 8.168626962864045e-06, "loss": 0.0499, "step": 536 }, { "epoch": 0.7084432717678101, "grad_norm": 0.4848789823718647, "learning_rate": 8.15971019223152e-06, "loss": 0.0474, "step": 537 }, { "epoch": 0.7097625329815304, "grad_norm": 0.623357361345339, "learning_rate": 8.15077665989898e-06, "loss": 0.065, "step": 538 }, { "epoch": 0.7110817941952506, "grad_norm": 0.6147503017541885, "learning_rate": 8.14182641325722e-06, "loss": 0.0645, "step": 539 }, { "epoch": 0.712401055408971, "grad_norm": 0.7159205067722418, "learning_rate": 8.132859499785708e-06, "loss": 0.0622, "step": 540 }, { "epoch": 0.7137203166226913, "grad_norm": 0.601001813393786, "learning_rate": 8.123875967052324e-06, "loss": 0.0667, "step": 541 }, { "epoch": 0.7150395778364116, "grad_norm": 0.661382907823683, "learning_rate": 8.114875862713107e-06, "loss": 0.0477, "step": 542 }, { "epoch": 0.716358839050132, "grad_norm": 0.6808774589369142, "learning_rate": 8.105859234512011e-06, "loss": 0.0728, "step": 543 }, { "epoch": 0.7176781002638523, "grad_norm": 0.5645330972316417, "learning_rate": 8.09682613028064e-06, "loss": 0.0442, "step": 544 }, { "epoch": 0.7189973614775725, "grad_norm": 0.7000610285805919, "learning_rate": 8.087776597938005e-06, "loss": 0.0607, "step": 545 }, { "epoch": 0.7203166226912929, "grad_norm": 0.9717949396233259, "learning_rate": 8.078710685490266e-06, "loss": 0.0925, "step": 546 }, { "epoch": 0.7216358839050132, "grad_norm": 0.5768127265884399, "learning_rate": 8.069628441030472e-06, "loss": 0.0528, "step": 547 }, { "epoch": 0.7229551451187335, "grad_norm": 0.7540897188613321, "learning_rate": 8.060529912738316e-06, "loss": 0.0549, "step": 548 }, { "epoch": 0.7242744063324539, "grad_norm": 0.7070564475373282, "learning_rate": 8.051415148879866e-06, "loss": 0.0607, "step": 549 }, { "epoch": 0.7255936675461742, "grad_norm": 0.4581610747944336, "learning_rate": 8.042284197807323e-06, "loss": 0.0424, "step": 550 }, { "epoch": 0.7269129287598944, "grad_norm": 0.8281322422201968, "learning_rate": 8.03313710795876e-06, "loss": 0.0554, "step": 551 }, { "epoch": 0.7282321899736148, "grad_norm": 0.5221549426233524, "learning_rate": 8.023973927857857e-06, "loss": 0.0574, "step": 552 }, { "epoch": 0.7295514511873351, "grad_norm": 1.2249746651625872, "learning_rate": 8.014794706113655e-06, "loss": 0.101, "step": 553 }, { "epoch": 0.7308707124010554, "grad_norm": 0.6878460489135141, "learning_rate": 8.005599491420288e-06, "loss": 0.056, "step": 554 }, { "epoch": 0.7321899736147758, "grad_norm": 0.5396147843239631, "learning_rate": 7.996388332556735e-06, "loss": 0.0479, "step": 555 }, { "epoch": 0.7335092348284961, "grad_norm": 0.6003872582317632, "learning_rate": 7.987161278386555e-06, "loss": 0.0523, "step": 556 }, { "epoch": 0.7348284960422163, "grad_norm": 0.5390200103337964, "learning_rate": 7.977918377857625e-06, "loss": 0.0439, "step": 557 }, { "epoch": 0.7361477572559367, "grad_norm": 0.8055499994103575, "learning_rate": 7.968659680001887e-06, "loss": 0.0544, "step": 558 }, { "epoch": 0.737467018469657, "grad_norm": 0.7605406050858123, "learning_rate": 7.959385233935087e-06, "loss": 0.0617, "step": 559 }, { "epoch": 0.7387862796833773, "grad_norm": 0.5172162310985919, "learning_rate": 7.950095088856509e-06, "loss": 0.0434, "step": 560 }, { "epoch": 0.7401055408970977, "grad_norm": 0.6613151481614415, "learning_rate": 7.940789294048716e-06, "loss": 0.0836, "step": 561 }, { "epoch": 0.741424802110818, "grad_norm": 0.6934766814145727, "learning_rate": 7.931467898877298e-06, "loss": 0.0511, "step": 562 }, { "epoch": 0.7427440633245382, "grad_norm": 0.6553016483708853, "learning_rate": 7.922130952790591e-06, "loss": 0.0666, "step": 563 }, { "epoch": 0.7440633245382586, "grad_norm": 0.6453614377276752, "learning_rate": 7.912778505319436e-06, "loss": 0.0639, "step": 564 }, { "epoch": 0.7453825857519789, "grad_norm": 0.6281402686074141, "learning_rate": 7.9034106060769e-06, "loss": 0.0565, "step": 565 }, { "epoch": 0.7467018469656992, "grad_norm": 0.48369335771379274, "learning_rate": 7.894027304758023e-06, "loss": 0.0489, "step": 566 }, { "epoch": 0.7480211081794196, "grad_norm": 0.4919829789879673, "learning_rate": 7.884628651139543e-06, "loss": 0.0538, "step": 567 }, { "epoch": 0.7493403693931399, "grad_norm": 0.6107428053033679, "learning_rate": 7.875214695079647e-06, "loss": 0.0543, "step": 568 }, { "epoch": 0.7506596306068601, "grad_norm": 0.784889189073831, "learning_rate": 7.865785486517696e-06, "loss": 0.0696, "step": 569 }, { "epoch": 0.7519788918205804, "grad_norm": 0.7238248168827699, "learning_rate": 7.856341075473963e-06, "loss": 0.0878, "step": 570 }, { "epoch": 0.7532981530343008, "grad_norm": 0.498276868425689, "learning_rate": 7.846881512049364e-06, "loss": 0.0405, "step": 571 }, { "epoch": 0.7546174142480211, "grad_norm": 0.6434293792784026, "learning_rate": 7.837406846425205e-06, "loss": 0.0565, "step": 572 }, { "epoch": 0.7559366754617414, "grad_norm": 1.013063418755939, "learning_rate": 7.827917128862895e-06, "loss": 0.0598, "step": 573 }, { "epoch": 0.7572559366754618, "grad_norm": 0.46003480135417324, "learning_rate": 7.818412409703695e-06, "loss": 0.0427, "step": 574 }, { "epoch": 0.758575197889182, "grad_norm": 0.40641102052743017, "learning_rate": 7.808892739368453e-06, "loss": 0.0501, "step": 575 }, { "epoch": 0.7598944591029023, "grad_norm": 0.6471268390587719, "learning_rate": 7.799358168357323e-06, "loss": 0.0673, "step": 576 }, { "epoch": 0.7612137203166227, "grad_norm": 1.1321389794336494, "learning_rate": 7.789808747249505e-06, "loss": 0.0536, "step": 577 }, { "epoch": 0.762532981530343, "grad_norm": 0.5012512664683457, "learning_rate": 7.78024452670298e-06, "loss": 0.0502, "step": 578 }, { "epoch": 0.7638522427440633, "grad_norm": 0.7629572078707867, "learning_rate": 7.770665557454235e-06, "loss": 0.0743, "step": 579 }, { "epoch": 0.7651715039577837, "grad_norm": 0.4546048265700308, "learning_rate": 7.761071890317994e-06, "loss": 0.0449, "step": 580 }, { "epoch": 0.7664907651715039, "grad_norm": 0.584573861216974, "learning_rate": 7.751463576186957e-06, "loss": 0.0649, "step": 581 }, { "epoch": 0.7678100263852242, "grad_norm": 0.5063892476419637, "learning_rate": 7.741840666031517e-06, "loss": 0.0449, "step": 582 }, { "epoch": 0.7691292875989446, "grad_norm": 0.6406284654329897, "learning_rate": 7.7322032108995e-06, "loss": 0.0585, "step": 583 }, { "epoch": 0.7704485488126649, "grad_norm": 0.8624454092052274, "learning_rate": 7.72255126191589e-06, "loss": 0.0727, "step": 584 }, { "epoch": 0.7717678100263852, "grad_norm": 0.4723653050525128, "learning_rate": 7.712884870282558e-06, "loss": 0.0571, "step": 585 }, { "epoch": 0.7730870712401056, "grad_norm": 0.492657143902818, "learning_rate": 7.703204087277989e-06, "loss": 0.0381, "step": 586 }, { "epoch": 0.7744063324538258, "grad_norm": 0.7072714311281186, "learning_rate": 7.693508964257015e-06, "loss": 0.0712, "step": 587 }, { "epoch": 0.7757255936675461, "grad_norm": 0.4894057590546519, "learning_rate": 7.683799552650534e-06, "loss": 0.0421, "step": 588 }, { "epoch": 0.7770448548812665, "grad_norm": 0.7198612599000973, "learning_rate": 7.674075903965254e-06, "loss": 0.0552, "step": 589 }, { "epoch": 0.7783641160949868, "grad_norm": 0.41818543083228865, "learning_rate": 7.66433806978339e-06, "loss": 0.0465, "step": 590 }, { "epoch": 0.7796833773087071, "grad_norm": 0.40455196583885183, "learning_rate": 7.654586101762426e-06, "loss": 0.0496, "step": 591 }, { "epoch": 0.7810026385224275, "grad_norm": 0.40990203769244127, "learning_rate": 7.644820051634813e-06, "loss": 0.0502, "step": 592 }, { "epoch": 0.7823218997361477, "grad_norm": 0.5855401429694791, "learning_rate": 7.63503997120771e-06, "loss": 0.0588, "step": 593 }, { "epoch": 0.783641160949868, "grad_norm": 0.7633148802383083, "learning_rate": 7.625245912362699e-06, "loss": 0.0786, "step": 594 }, { "epoch": 0.7849604221635884, "grad_norm": 0.4709504379995211, "learning_rate": 7.615437927055521e-06, "loss": 0.0446, "step": 595 }, { "epoch": 0.7862796833773087, "grad_norm": 0.6482167373589524, "learning_rate": 7.605616067315793e-06, "loss": 0.0703, "step": 596 }, { "epoch": 0.787598944591029, "grad_norm": 0.5930419099399733, "learning_rate": 7.595780385246729e-06, "loss": 0.051, "step": 597 }, { "epoch": 0.7889182058047494, "grad_norm": 0.5511986661419541, "learning_rate": 7.585930933024874e-06, "loss": 0.0525, "step": 598 }, { "epoch": 0.7902374670184696, "grad_norm": 0.6286727069894341, "learning_rate": 7.576067762899817e-06, "loss": 0.0785, "step": 599 }, { "epoch": 0.7915567282321899, "grad_norm": 0.5164407352417415, "learning_rate": 7.56619092719392e-06, "loss": 0.0567, "step": 600 }, { "epoch": 0.7928759894459103, "grad_norm": 0.4818775374789297, "learning_rate": 7.556300478302038e-06, "loss": 0.05, "step": 601 }, { "epoch": 0.7941952506596306, "grad_norm": 0.7496483074018772, "learning_rate": 7.546396468691241e-06, "loss": 0.0801, "step": 602 }, { "epoch": 0.7955145118733509, "grad_norm": 0.523511921974713, "learning_rate": 7.536478950900537e-06, "loss": 0.0638, "step": 603 }, { "epoch": 0.7968337730870713, "grad_norm": 0.5506566059940079, "learning_rate": 7.526547977540592e-06, "loss": 0.0673, "step": 604 }, { "epoch": 0.7981530343007915, "grad_norm": 0.6637744672920765, "learning_rate": 7.516603601293453e-06, "loss": 0.0684, "step": 605 }, { "epoch": 0.7994722955145118, "grad_norm": 0.45775146693356783, "learning_rate": 7.506645874912264e-06, "loss": 0.0703, "step": 606 }, { "epoch": 0.8007915567282322, "grad_norm": 0.5296473777169002, "learning_rate": 7.4966748512209884e-06, "loss": 0.0576, "step": 607 }, { "epoch": 0.8021108179419525, "grad_norm": 0.5365328810956881, "learning_rate": 7.486690583114137e-06, "loss": 0.0445, "step": 608 }, { "epoch": 0.8034300791556728, "grad_norm": 0.4620464689864355, "learning_rate": 7.47669312355647e-06, "loss": 0.0379, "step": 609 }, { "epoch": 0.8047493403693932, "grad_norm": 0.5379473346536254, "learning_rate": 7.466682525582732e-06, "loss": 0.0378, "step": 610 }, { "epoch": 0.8060686015831134, "grad_norm": 0.5683775687269517, "learning_rate": 7.456658842297364e-06, "loss": 0.0574, "step": 611 }, { "epoch": 0.8073878627968337, "grad_norm": 0.7608705203135304, "learning_rate": 7.446622126874219e-06, "loss": 0.05, "step": 612 }, { "epoch": 0.8087071240105541, "grad_norm": 0.6638681521654088, "learning_rate": 7.436572432556286e-06, "loss": 0.0462, "step": 613 }, { "epoch": 0.8100263852242744, "grad_norm": 0.4691995535122923, "learning_rate": 7.4265098126554065e-06, "loss": 0.0457, "step": 614 }, { "epoch": 0.8113456464379947, "grad_norm": 0.7250619676691925, "learning_rate": 7.416434320551984e-06, "loss": 0.0684, "step": 615 }, { "epoch": 0.8126649076517151, "grad_norm": 0.5602711634901041, "learning_rate": 7.406346009694713e-06, "loss": 0.0428, "step": 616 }, { "epoch": 0.8139841688654353, "grad_norm": 0.605453105353561, "learning_rate": 7.396244933600285e-06, "loss": 0.0573, "step": 617 }, { "epoch": 0.8153034300791556, "grad_norm": 0.6004698644481296, "learning_rate": 7.386131145853111e-06, "loss": 0.0473, "step": 618 }, { "epoch": 0.816622691292876, "grad_norm": 0.7365332296846882, "learning_rate": 7.376004700105034e-06, "loss": 0.0578, "step": 619 }, { "epoch": 0.8179419525065963, "grad_norm": 0.7070376183840003, "learning_rate": 7.365865650075046e-06, "loss": 0.0647, "step": 620 }, { "epoch": 0.8192612137203166, "grad_norm": 0.4015642712910436, "learning_rate": 7.355714049549001e-06, "loss": 0.0341, "step": 621 }, { "epoch": 0.820580474934037, "grad_norm": 0.6779327868527507, "learning_rate": 7.345549952379334e-06, "loss": 0.0499, "step": 622 }, { "epoch": 0.8218997361477572, "grad_norm": 0.8680340537992209, "learning_rate": 7.335373412484772e-06, "loss": 0.0687, "step": 623 }, { "epoch": 0.8232189973614775, "grad_norm": 0.637295783632708, "learning_rate": 7.325184483850043e-06, "loss": 0.0604, "step": 624 }, { "epoch": 0.8245382585751979, "grad_norm": 0.8229308507890835, "learning_rate": 7.314983220525604e-06, "loss": 0.0642, "step": 625 }, { "epoch": 0.8258575197889182, "grad_norm": 0.9148264295561875, "learning_rate": 7.304769676627339e-06, "loss": 0.0657, "step": 626 }, { "epoch": 0.8271767810026385, "grad_norm": 0.5892300500844424, "learning_rate": 7.294543906336279e-06, "loss": 0.0508, "step": 627 }, { "epoch": 0.8284960422163589, "grad_norm": 0.7696278821983158, "learning_rate": 7.284305963898315e-06, "loss": 0.0593, "step": 628 }, { "epoch": 0.8298153034300791, "grad_norm": 0.7817662132002492, "learning_rate": 7.274055903623911e-06, "loss": 0.0661, "step": 629 }, { "epoch": 0.8311345646437994, "grad_norm": 0.599073351844615, "learning_rate": 7.2637937798878085e-06, "loss": 0.0493, "step": 630 }, { "epoch": 0.8324538258575198, "grad_norm": 0.45284300597451216, "learning_rate": 7.25351964712875e-06, "loss": 0.0484, "step": 631 }, { "epoch": 0.8337730870712401, "grad_norm": 0.6016647313251364, "learning_rate": 7.243233559849179e-06, "loss": 0.0465, "step": 632 }, { "epoch": 0.8350923482849604, "grad_norm": 0.49389914293306764, "learning_rate": 7.232935572614957e-06, "loss": 0.0447, "step": 633 }, { "epoch": 0.8364116094986808, "grad_norm": 0.7349002176146826, "learning_rate": 7.222625740055072e-06, "loss": 0.0563, "step": 634 }, { "epoch": 0.837730870712401, "grad_norm": 0.7897682173746414, "learning_rate": 7.212304116861354e-06, "loss": 0.0766, "step": 635 }, { "epoch": 0.8390501319261213, "grad_norm": 0.5196108727337423, "learning_rate": 7.201970757788172e-06, "loss": 0.0649, "step": 636 }, { "epoch": 0.8403693931398417, "grad_norm": 0.4247043219150824, "learning_rate": 7.191625717652158e-06, "loss": 0.0362, "step": 637 }, { "epoch": 0.841688654353562, "grad_norm": 0.4002182375426419, "learning_rate": 7.18126905133191e-06, "loss": 0.0475, "step": 638 }, { "epoch": 0.8430079155672823, "grad_norm": 0.5029922343647233, "learning_rate": 7.170900813767694e-06, "loss": 0.0478, "step": 639 }, { "epoch": 0.8443271767810027, "grad_norm": 0.7565385955901749, "learning_rate": 7.160521059961169e-06, "loss": 0.0782, "step": 640 }, { "epoch": 0.8456464379947229, "grad_norm": 0.9185509777670232, "learning_rate": 7.150129844975079e-06, "loss": 0.0659, "step": 641 }, { "epoch": 0.8469656992084432, "grad_norm": 0.4891563297818167, "learning_rate": 7.1397272239329684e-06, "loss": 0.0532, "step": 642 }, { "epoch": 0.8482849604221636, "grad_norm": 0.4994994913838797, "learning_rate": 7.129313252018892e-06, "loss": 0.0594, "step": 643 }, { "epoch": 0.8496042216358839, "grad_norm": 0.5191192379556658, "learning_rate": 7.118887984477116e-06, "loss": 0.0529, "step": 644 }, { "epoch": 0.8509234828496042, "grad_norm": 1.133290514169336, "learning_rate": 7.108451476611828e-06, "loss": 0.0444, "step": 645 }, { "epoch": 0.8522427440633246, "grad_norm": 1.9081274613732992, "learning_rate": 7.098003783786844e-06, "loss": 0.0777, "step": 646 }, { "epoch": 0.8535620052770448, "grad_norm": 0.5197615187332945, "learning_rate": 7.087544961425317e-06, "loss": 0.0502, "step": 647 }, { "epoch": 0.8548812664907651, "grad_norm": 0.5557110777870159, "learning_rate": 7.0770750650094335e-06, "loss": 0.0514, "step": 648 }, { "epoch": 0.8562005277044855, "grad_norm": 0.576391920153884, "learning_rate": 7.0665941500801306e-06, "loss": 0.0779, "step": 649 }, { "epoch": 0.8575197889182058, "grad_norm": 2.55924940843789, "learning_rate": 7.056102272236799e-06, "loss": 0.0565, "step": 650 }, { "epoch": 0.8588390501319261, "grad_norm": 0.5559838540095863, "learning_rate": 7.045599487136981e-06, "loss": 0.0392, "step": 651 }, { "epoch": 0.8601583113456465, "grad_norm": 1.1829207239081179, "learning_rate": 7.035085850496079e-06, "loss": 0.0614, "step": 652 }, { "epoch": 0.8614775725593667, "grad_norm": 1.0447984538206716, "learning_rate": 7.024561418087068e-06, "loss": 0.0817, "step": 653 }, { "epoch": 0.862796833773087, "grad_norm": 0.7783307249135734, "learning_rate": 7.014026245740185e-06, "loss": 0.0591, "step": 654 }, { "epoch": 0.8641160949868074, "grad_norm": 0.6461374423225957, "learning_rate": 7.003480389342645e-06, "loss": 0.0494, "step": 655 }, { "epoch": 0.8654353562005277, "grad_norm": 0.9384339853723209, "learning_rate": 6.992923904838341e-06, "loss": 0.0721, "step": 656 }, { "epoch": 0.866754617414248, "grad_norm": 0.6660800226382337, "learning_rate": 6.982356848227537e-06, "loss": 0.052, "step": 657 }, { "epoch": 0.8680738786279684, "grad_norm": 0.619900101241196, "learning_rate": 6.971779275566593e-06, "loss": 0.0578, "step": 658 }, { "epoch": 0.8693931398416886, "grad_norm": 0.5715915626402758, "learning_rate": 6.9611912429676485e-06, "loss": 0.0543, "step": 659 }, { "epoch": 0.8707124010554089, "grad_norm": 0.7270686004959181, "learning_rate": 6.9505928065983275e-06, "loss": 0.057, "step": 660 }, { "epoch": 0.8720316622691293, "grad_norm": 0.5198432335246478, "learning_rate": 6.939984022681451e-06, "loss": 0.0351, "step": 661 }, { "epoch": 0.8733509234828496, "grad_norm": 0.8985719129453157, "learning_rate": 6.929364947494729e-06, "loss": 0.0547, "step": 662 }, { "epoch": 0.8746701846965699, "grad_norm": 0.5769158166108265, "learning_rate": 6.918735637370462e-06, "loss": 0.0427, "step": 663 }, { "epoch": 0.8759894459102903, "grad_norm": 0.6653103479472001, "learning_rate": 6.908096148695251e-06, "loss": 0.0599, "step": 664 }, { "epoch": 0.8773087071240105, "grad_norm": 0.8404062260080382, "learning_rate": 6.897446537909686e-06, "loss": 0.0725, "step": 665 }, { "epoch": 0.8786279683377308, "grad_norm": 0.4752059893802516, "learning_rate": 6.886786861508061e-06, "loss": 0.0545, "step": 666 }, { "epoch": 0.8799472295514512, "grad_norm": 0.4778488711685745, "learning_rate": 6.876117176038058e-06, "loss": 0.0567, "step": 667 }, { "epoch": 0.8812664907651715, "grad_norm": 0.46031192038266594, "learning_rate": 6.865437538100456e-06, "loss": 0.0407, "step": 668 }, { "epoch": 0.8825857519788918, "grad_norm": 0.7180279054018667, "learning_rate": 6.85474800434884e-06, "loss": 0.0712, "step": 669 }, { "epoch": 0.8839050131926122, "grad_norm": 0.45300566773960316, "learning_rate": 6.8440486314892775e-06, "loss": 0.0488, "step": 670 }, { "epoch": 0.8852242744063324, "grad_norm": 0.5156880972814101, "learning_rate": 6.833339476280038e-06, "loss": 0.0581, "step": 671 }, { "epoch": 0.8865435356200527, "grad_norm": 0.36978448089350285, "learning_rate": 6.822620595531286e-06, "loss": 0.0439, "step": 672 }, { "epoch": 0.8878627968337731, "grad_norm": 0.6409149604296679, "learning_rate": 6.811892046104772e-06, "loss": 0.0724, "step": 673 }, { "epoch": 0.8891820580474934, "grad_norm": 0.5164562855547227, "learning_rate": 6.801153884913541e-06, "loss": 0.0617, "step": 674 }, { "epoch": 0.8905013192612137, "grad_norm": 0.5137537287977944, "learning_rate": 6.790406168921631e-06, "loss": 0.0537, "step": 675 }, { "epoch": 0.8918205804749341, "grad_norm": 0.6248217252051893, "learning_rate": 6.779648955143754e-06, "loss": 0.0595, "step": 676 }, { "epoch": 0.8931398416886543, "grad_norm": 0.9117584538075659, "learning_rate": 6.768882300645019e-06, "loss": 0.0778, "step": 677 }, { "epoch": 0.8944591029023746, "grad_norm": 1.0045913675480198, "learning_rate": 6.758106262540611e-06, "loss": 0.0365, "step": 678 }, { "epoch": 0.895778364116095, "grad_norm": 0.8182875631557209, "learning_rate": 6.747320897995493e-06, "loss": 0.0597, "step": 679 }, { "epoch": 0.8970976253298153, "grad_norm": 0.40301293105178015, "learning_rate": 6.736526264224101e-06, "loss": 0.0377, "step": 680 }, { "epoch": 0.8984168865435356, "grad_norm": 0.4830059646576911, "learning_rate": 6.7257224184900505e-06, "loss": 0.0581, "step": 681 }, { "epoch": 0.899736147757256, "grad_norm": 0.7384347801633252, "learning_rate": 6.714909418105816e-06, "loss": 0.0502, "step": 682 }, { "epoch": 0.9010554089709762, "grad_norm": 0.44478418897913596, "learning_rate": 6.70408732043244e-06, "loss": 0.0425, "step": 683 }, { "epoch": 0.9023746701846965, "grad_norm": 0.47600625569629695, "learning_rate": 6.693256182879224e-06, "loss": 0.047, "step": 684 }, { "epoch": 0.9036939313984169, "grad_norm": 1.5977235088888984, "learning_rate": 6.682416062903425e-06, "loss": 0.0674, "step": 685 }, { "epoch": 0.9050131926121372, "grad_norm": 0.5372132111670395, "learning_rate": 6.671567018009948e-06, "loss": 0.0584, "step": 686 }, { "epoch": 0.9063324538258575, "grad_norm": 0.6442744048697985, "learning_rate": 6.660709105751046e-06, "loss": 0.0538, "step": 687 }, { "epoch": 0.9076517150395779, "grad_norm": 0.506555769885308, "learning_rate": 6.649842383726011e-06, "loss": 0.0525, "step": 688 }, { "epoch": 0.9089709762532981, "grad_norm": 0.7054885308014072, "learning_rate": 6.638966909580866e-06, "loss": 0.0517, "step": 689 }, { "epoch": 0.9102902374670184, "grad_norm": 0.7442326445547589, "learning_rate": 6.628082741008068e-06, "loss": 0.0611, "step": 690 }, { "epoch": 0.9116094986807388, "grad_norm": 0.5183656940281165, "learning_rate": 6.617189935746191e-06, "loss": 0.064, "step": 691 }, { "epoch": 0.9129287598944591, "grad_norm": 0.4464207504937014, "learning_rate": 6.606288551579629e-06, "loss": 0.0446, "step": 692 }, { "epoch": 0.9142480211081794, "grad_norm": 0.37526282747721584, "learning_rate": 6.595378646338285e-06, "loss": 0.0288, "step": 693 }, { "epoch": 0.9155672823218998, "grad_norm": 0.5777950907682003, "learning_rate": 6.584460277897262e-06, "loss": 0.0677, "step": 694 }, { "epoch": 0.91688654353562, "grad_norm": 0.6652617133702854, "learning_rate": 6.573533504176562e-06, "loss": 0.0715, "step": 695 }, { "epoch": 0.9182058047493403, "grad_norm": 0.4667904960438139, "learning_rate": 6.562598383140773e-06, "loss": 0.0834, "step": 696 }, { "epoch": 0.9195250659630607, "grad_norm": 0.8345298859743209, "learning_rate": 6.551654972798765e-06, "loss": 0.058, "step": 697 }, { "epoch": 0.920844327176781, "grad_norm": 0.44292680991761424, "learning_rate": 6.540703331203382e-06, "loss": 0.0509, "step": 698 }, { "epoch": 0.9221635883905013, "grad_norm": 0.6291584576374266, "learning_rate": 6.529743516451135e-06, "loss": 0.048, "step": 699 }, { "epoch": 0.9234828496042217, "grad_norm": 0.38726410745858225, "learning_rate": 6.518775586681887e-06, "loss": 0.0428, "step": 700 }, { "epoch": 0.924802110817942, "grad_norm": 0.6681003494953462, "learning_rate": 6.507799600078554e-06, "loss": 0.0642, "step": 701 }, { "epoch": 0.9261213720316622, "grad_norm": 0.3838767558388101, "learning_rate": 6.496815614866792e-06, "loss": 0.0406, "step": 702 }, { "epoch": 0.9274406332453826, "grad_norm": 0.9951221686609095, "learning_rate": 6.485823689314683e-06, "loss": 0.0705, "step": 703 }, { "epoch": 0.9287598944591029, "grad_norm": 0.5300121081133167, "learning_rate": 6.4748238817324395e-06, "loss": 0.0679, "step": 704 }, { "epoch": 0.9300791556728232, "grad_norm": 0.6579693283135651, "learning_rate": 6.46381625047208e-06, "loss": 0.0443, "step": 705 }, { "epoch": 0.9313984168865436, "grad_norm": 0.7154530047746038, "learning_rate": 6.452800853927128e-06, "loss": 0.0639, "step": 706 }, { "epoch": 0.9327176781002638, "grad_norm": 0.43183046299555017, "learning_rate": 6.441777750532302e-06, "loss": 0.0432, "step": 707 }, { "epoch": 0.9340369393139841, "grad_norm": 0.6482390999792111, "learning_rate": 6.430746998763204e-06, "loss": 0.0404, "step": 708 }, { "epoch": 0.9353562005277045, "grad_norm": 0.5954280306571919, "learning_rate": 6.419708657136008e-06, "loss": 0.0476, "step": 709 }, { "epoch": 0.9366754617414248, "grad_norm": 0.5670884695354337, "learning_rate": 6.408662784207149e-06, "loss": 0.0465, "step": 710 }, { "epoch": 0.9379947229551451, "grad_norm": 0.39195350309825466, "learning_rate": 6.397609438573018e-06, "loss": 0.0367, "step": 711 }, { "epoch": 0.9393139841688655, "grad_norm": 0.5135065451574998, "learning_rate": 6.386548678869644e-06, "loss": 0.0545, "step": 712 }, { "epoch": 0.9406332453825857, "grad_norm": 0.5440916522763131, "learning_rate": 6.375480563772391e-06, "loss": 0.0516, "step": 713 }, { "epoch": 0.941952506596306, "grad_norm": 0.6146072852339167, "learning_rate": 6.3644051519956366e-06, "loss": 0.0657, "step": 714 }, { "epoch": 0.9432717678100264, "grad_norm": 0.46822032894373955, "learning_rate": 6.353322502292468e-06, "loss": 0.0487, "step": 715 }, { "epoch": 0.9445910290237467, "grad_norm": 0.7703382520399475, "learning_rate": 6.342232673454371e-06, "loss": 0.0824, "step": 716 }, { "epoch": 0.945910290237467, "grad_norm": 0.5658022209850417, "learning_rate": 6.331135724310912e-06, "loss": 0.0423, "step": 717 }, { "epoch": 0.9472295514511874, "grad_norm": 0.6941744607255129, "learning_rate": 6.320031713729429e-06, "loss": 0.0369, "step": 718 }, { "epoch": 0.9485488126649076, "grad_norm": 0.6379949588819936, "learning_rate": 6.308920700614722e-06, "loss": 0.055, "step": 719 }, { "epoch": 0.9498680738786279, "grad_norm": 0.5566867478458106, "learning_rate": 6.2978027439087405e-06, "loss": 0.0635, "step": 720 }, { "epoch": 0.9511873350923483, "grad_norm": 0.49482508553292304, "learning_rate": 6.286677902590262e-06, "loss": 0.0485, "step": 721 }, { "epoch": 0.9525065963060686, "grad_norm": 0.42405552441199723, "learning_rate": 6.2755462356745885e-06, "loss": 0.0398, "step": 722 }, { "epoch": 0.9538258575197889, "grad_norm": 0.3921926452470498, "learning_rate": 6.2644078022132344e-06, "loss": 0.0449, "step": 723 }, { "epoch": 0.9551451187335093, "grad_norm": 0.622187385729326, "learning_rate": 6.2532626612936035e-06, "loss": 0.0492, "step": 724 }, { "epoch": 0.9564643799472295, "grad_norm": 0.8846149336240449, "learning_rate": 6.2421108720386835e-06, "loss": 0.0718, "step": 725 }, { "epoch": 0.9577836411609498, "grad_norm": 0.835978497608325, "learning_rate": 6.2309524936067344e-06, "loss": 0.0651, "step": 726 }, { "epoch": 0.9591029023746702, "grad_norm": 0.49758894330199727, "learning_rate": 6.219787585190964e-06, "loss": 0.0422, "step": 727 }, { "epoch": 0.9604221635883905, "grad_norm": 0.5331036943980498, "learning_rate": 6.208616206019225e-06, "loss": 0.0579, "step": 728 }, { "epoch": 0.9617414248021108, "grad_norm": 0.6148288660973584, "learning_rate": 6.197438415353694e-06, "loss": 0.063, "step": 729 }, { "epoch": 0.9630606860158312, "grad_norm": 0.49695243951671314, "learning_rate": 6.1862542724905605e-06, "loss": 0.0487, "step": 730 }, { "epoch": 0.9643799472295514, "grad_norm": 0.4105127820148204, "learning_rate": 6.175063836759712e-06, "loss": 0.0488, "step": 731 }, { "epoch": 0.9656992084432717, "grad_norm": 0.47446528356917433, "learning_rate": 6.163867167524419e-06, "loss": 0.047, "step": 732 }, { "epoch": 0.9670184696569921, "grad_norm": 0.4703664127153639, "learning_rate": 6.1526643241810145e-06, "loss": 0.0419, "step": 733 }, { "epoch": 0.9683377308707124, "grad_norm": 0.39746241068010874, "learning_rate": 6.1414553661585905e-06, "loss": 0.0422, "step": 734 }, { "epoch": 0.9696569920844327, "grad_norm": 0.5170990993750981, "learning_rate": 6.130240352918675e-06, "loss": 0.0661, "step": 735 }, { "epoch": 0.9709762532981531, "grad_norm": 0.5152004279143637, "learning_rate": 6.119019343954914e-06, "loss": 0.0374, "step": 736 }, { "epoch": 0.9722955145118733, "grad_norm": 0.522388520081193, "learning_rate": 6.107792398792763e-06, "loss": 0.053, "step": 737 }, { "epoch": 0.9736147757255936, "grad_norm": 0.7152588462194387, "learning_rate": 6.096559576989166e-06, "loss": 0.0692, "step": 738 }, { "epoch": 0.974934036939314, "grad_norm": 0.4545424319736431, "learning_rate": 6.085320938132245e-06, "loss": 0.0469, "step": 739 }, { "epoch": 0.9762532981530343, "grad_norm": 0.7390386089115659, "learning_rate": 6.074076541840978e-06, "loss": 0.0574, "step": 740 }, { "epoch": 0.9775725593667546, "grad_norm": 0.5797819489692182, "learning_rate": 6.062826447764883e-06, "loss": 0.0551, "step": 741 }, { "epoch": 0.978891820580475, "grad_norm": 0.9043388866815173, "learning_rate": 6.05157071558371e-06, "loss": 0.0487, "step": 742 }, { "epoch": 0.9802110817941952, "grad_norm": 0.5300095964363111, "learning_rate": 6.0403094050071106e-06, "loss": 0.0461, "step": 743 }, { "epoch": 0.9815303430079155, "grad_norm": 1.2135080457223286, "learning_rate": 6.029042575774334e-06, "loss": 0.0744, "step": 744 }, { "epoch": 0.9828496042216359, "grad_norm": 0.3508635868880483, "learning_rate": 6.017770287653905e-06, "loss": 0.0414, "step": 745 }, { "epoch": 0.9841688654353562, "grad_norm": 0.46813312237331894, "learning_rate": 6.006492600443301e-06, "loss": 0.0479, "step": 746 }, { "epoch": 0.9854881266490765, "grad_norm": 0.4606890493355314, "learning_rate": 5.995209573968648e-06, "loss": 0.0369, "step": 747 }, { "epoch": 0.9868073878627969, "grad_norm": 0.7963291719507631, "learning_rate": 5.9839212680843925e-06, "loss": 0.0853, "step": 748 }, { "epoch": 0.9881266490765171, "grad_norm": 0.5155724288962596, "learning_rate": 5.9726277426729866e-06, "loss": 0.0695, "step": 749 }, { "epoch": 0.9894459102902374, "grad_norm": 0.4630288406158184, "learning_rate": 5.961329057644571e-06, "loss": 0.0384, "step": 750 }, { "epoch": 0.9907651715039578, "grad_norm": 1.051448267922444, "learning_rate": 5.95002527293666e-06, "loss": 0.0651, "step": 751 }, { "epoch": 0.9920844327176781, "grad_norm": 0.6193715825035618, "learning_rate": 5.938716448513819e-06, "loss": 0.0567, "step": 752 }, { "epoch": 0.9934036939313984, "grad_norm": 0.8381031857754667, "learning_rate": 5.927402644367345e-06, "loss": 0.0618, "step": 753 }, { "epoch": 0.9947229551451188, "grad_norm": 0.3803899696750758, "learning_rate": 5.916083920514959e-06, "loss": 0.0348, "step": 754 }, { "epoch": 0.996042216358839, "grad_norm": 0.6495643260632261, "learning_rate": 5.904760337000473e-06, "loss": 0.0718, "step": 755 }, { "epoch": 0.9973614775725593, "grad_norm": 1.1087437172957955, "learning_rate": 5.893431953893483e-06, "loss": 0.0466, "step": 756 }, { "epoch": 0.9986807387862797, "grad_norm": 1.202764067963593, "learning_rate": 5.882098831289044e-06, "loss": 0.039, "step": 757 }, { "epoch": 1.0, "grad_norm": 0.46604548846150806, "learning_rate": 5.8707610293073524e-06, "loss": 0.0458, "step": 758 }, { "epoch": 1.0, "eval_loss": 0.05592323839664459, "eval_runtime": 128.0063, "eval_samples_per_second": 39.881, "eval_steps_per_second": 1.25, "step": 758 }, { "epoch": 1.0013192612137203, "grad_norm": 0.6989593085889051, "learning_rate": 5.85941860809343e-06, "loss": 0.0458, "step": 759 }, { "epoch": 1.0026385224274406, "grad_norm": 0.5927038885689029, "learning_rate": 5.848071627816804e-06, "loss": 0.0538, "step": 760 }, { "epoch": 1.003957783641161, "grad_norm": 0.8857298937072677, "learning_rate": 5.836720148671182e-06, "loss": 0.0467, "step": 761 }, { "epoch": 1.0052770448548813, "grad_norm": 0.4109405171330407, "learning_rate": 5.82536423087414e-06, "loss": 0.038, "step": 762 }, { "epoch": 1.0065963060686016, "grad_norm": 1.5021621311748752, "learning_rate": 5.814003934666802e-06, "loss": 0.0574, "step": 763 }, { "epoch": 1.007915567282322, "grad_norm": 0.4718122478262424, "learning_rate": 5.8026393203135145e-06, "loss": 0.0473, "step": 764 }, { "epoch": 1.0092348284960422, "grad_norm": 0.6482837984831183, "learning_rate": 5.791270448101533e-06, "loss": 0.055, "step": 765 }, { "epoch": 1.0105540897097625, "grad_norm": 0.47324139957617567, "learning_rate": 5.779897378340705e-06, "loss": 0.0366, "step": 766 }, { "epoch": 1.0118733509234827, "grad_norm": 0.5751432075833953, "learning_rate": 5.768520171363135e-06, "loss": 0.0374, "step": 767 }, { "epoch": 1.0131926121372032, "grad_norm": 0.7353563730795333, "learning_rate": 5.757138887522884e-06, "loss": 0.045, "step": 768 }, { "epoch": 1.0145118733509235, "grad_norm": 0.6061560135738652, "learning_rate": 5.745753587195636e-06, "loss": 0.0595, "step": 769 }, { "epoch": 1.0158311345646438, "grad_norm": 0.49181644883396125, "learning_rate": 5.734364330778381e-06, "loss": 0.0415, "step": 770 }, { "epoch": 1.017150395778364, "grad_norm": 0.6430420819909749, "learning_rate": 5.722971178689096e-06, "loss": 0.0722, "step": 771 }, { "epoch": 1.0184696569920844, "grad_norm": 0.4839549081518064, "learning_rate": 5.711574191366427e-06, "loss": 0.0372, "step": 772 }, { "epoch": 1.0197889182058049, "grad_norm": 0.5188479898199087, "learning_rate": 5.70017342926936e-06, "loss": 0.0479, "step": 773 }, { "epoch": 1.0211081794195251, "grad_norm": 0.7917447001213317, "learning_rate": 5.68876895287691e-06, "loss": 0.0463, "step": 774 }, { "epoch": 1.0224274406332454, "grad_norm": 0.46624077553959226, "learning_rate": 5.677360822687794e-06, "loss": 0.046, "step": 775 }, { "epoch": 1.0237467018469657, "grad_norm": 0.4953051288461476, "learning_rate": 5.66594909922011e-06, "loss": 0.0367, "step": 776 }, { "epoch": 1.025065963060686, "grad_norm": 0.6059827706644242, "learning_rate": 5.654533843011019e-06, "loss": 0.0404, "step": 777 }, { "epoch": 1.0263852242744063, "grad_norm": 0.6064456753320037, "learning_rate": 5.6431151146164255e-06, "loss": 0.0479, "step": 778 }, { "epoch": 1.0277044854881265, "grad_norm": 0.775592672100191, "learning_rate": 5.631692974610647e-06, "loss": 0.0619, "step": 779 }, { "epoch": 1.029023746701847, "grad_norm": 0.4732107305616271, "learning_rate": 5.6202674835861045e-06, "loss": 0.0394, "step": 780 }, { "epoch": 1.0303430079155673, "grad_norm": 0.5693888680251252, "learning_rate": 5.6088387021529966e-06, "loss": 0.0526, "step": 781 }, { "epoch": 1.0316622691292876, "grad_norm": 0.6931708016123965, "learning_rate": 5.597406690938969e-06, "loss": 0.0513, "step": 782 }, { "epoch": 1.0329815303430079, "grad_norm": 0.8142189449285727, "learning_rate": 5.585971510588808e-06, "loss": 0.0424, "step": 783 }, { "epoch": 1.0343007915567282, "grad_norm": 0.6652724688329306, "learning_rate": 5.574533221764109e-06, "loss": 0.0367, "step": 784 }, { "epoch": 1.0356200527704487, "grad_norm": 0.3300674518769234, "learning_rate": 5.563091885142959e-06, "loss": 0.0272, "step": 785 }, { "epoch": 1.036939313984169, "grad_norm": 0.5380054416165114, "learning_rate": 5.551647561419611e-06, "loss": 0.0343, "step": 786 }, { "epoch": 1.0382585751978892, "grad_norm": 0.7056758338546055, "learning_rate": 5.540200311304164e-06, "loss": 0.0276, "step": 787 }, { "epoch": 1.0395778364116095, "grad_norm": 0.8092651006949911, "learning_rate": 5.528750195522244e-06, "loss": 0.0377, "step": 788 }, { "epoch": 1.0408970976253298, "grad_norm": 1.0814167405738828, "learning_rate": 5.517297274814674e-06, "loss": 0.0722, "step": 789 }, { "epoch": 1.04221635883905, "grad_norm": 0.6447685602617403, "learning_rate": 5.505841609937162e-06, "loss": 0.0522, "step": 790 }, { "epoch": 1.0435356200527703, "grad_norm": 0.45972221138369695, "learning_rate": 5.494383261659968e-06, "loss": 0.0306, "step": 791 }, { "epoch": 1.0448548812664908, "grad_norm": 0.5502963065155776, "learning_rate": 5.4829222907675895e-06, "loss": 0.0341, "step": 792 }, { "epoch": 1.0461741424802111, "grad_norm": 0.5923787629429828, "learning_rate": 5.47145875805844e-06, "loss": 0.0483, "step": 793 }, { "epoch": 1.0474934036939314, "grad_norm": 0.40453312954209114, "learning_rate": 5.459992724344516e-06, "loss": 0.0358, "step": 794 }, { "epoch": 1.0488126649076517, "grad_norm": 0.4634998773083594, "learning_rate": 5.448524250451086e-06, "loss": 0.042, "step": 795 }, { "epoch": 1.050131926121372, "grad_norm": 0.4911088924835721, "learning_rate": 5.437053397216364e-06, "loss": 0.0536, "step": 796 }, { "epoch": 1.0514511873350922, "grad_norm": 1.1796657206533812, "learning_rate": 5.425580225491182e-06, "loss": 0.0415, "step": 797 }, { "epoch": 1.0527704485488127, "grad_norm": 0.510337434330926, "learning_rate": 5.4141047961386724e-06, "loss": 0.0512, "step": 798 }, { "epoch": 1.054089709762533, "grad_norm": 0.40102710131189545, "learning_rate": 5.40262717003395e-06, "loss": 0.043, "step": 799 }, { "epoch": 1.0554089709762533, "grad_norm": 0.4964134440989535, "learning_rate": 5.3911474080637705e-06, "loss": 0.0408, "step": 800 }, { "epoch": 1.0567282321899736, "grad_norm": 0.7046860769411418, "learning_rate": 5.379665571126232e-06, "loss": 0.0322, "step": 801 }, { "epoch": 1.0580474934036939, "grad_norm": 0.732062985823511, "learning_rate": 5.368181720130434e-06, "loss": 0.0567, "step": 802 }, { "epoch": 1.0593667546174141, "grad_norm": 0.5027796864782664, "learning_rate": 5.356695915996162e-06, "loss": 0.0351, "step": 803 }, { "epoch": 1.0606860158311346, "grad_norm": 0.4996625186955253, "learning_rate": 5.345208219653562e-06, "loss": 0.0439, "step": 804 }, { "epoch": 1.062005277044855, "grad_norm": 0.9152620815670903, "learning_rate": 5.333718692042817e-06, "loss": 0.0689, "step": 805 }, { "epoch": 1.0633245382585752, "grad_norm": 0.411853927570493, "learning_rate": 5.322227394113826e-06, "loss": 0.0344, "step": 806 }, { "epoch": 1.0646437994722955, "grad_norm": 0.47740672031416576, "learning_rate": 5.310734386825877e-06, "loss": 0.0352, "step": 807 }, { "epoch": 1.0659630606860158, "grad_norm": 0.5250816082342158, "learning_rate": 5.299239731147332e-06, "loss": 0.0558, "step": 808 }, { "epoch": 1.0672823218997363, "grad_norm": 0.42169584826349693, "learning_rate": 5.287743488055288e-06, "loss": 0.042, "step": 809 }, { "epoch": 1.0686015831134565, "grad_norm": 0.4662553845238838, "learning_rate": 5.2762457185352685e-06, "loss": 0.0518, "step": 810 }, { "epoch": 1.0699208443271768, "grad_norm": 0.4205032546352608, "learning_rate": 5.264746483580897e-06, "loss": 0.0307, "step": 811 }, { "epoch": 1.071240105540897, "grad_norm": 0.3760397290861168, "learning_rate": 5.253245844193564e-06, "loss": 0.0332, "step": 812 }, { "epoch": 1.0725593667546174, "grad_norm": 0.42731752861802264, "learning_rate": 5.241743861382116e-06, "loss": 0.0366, "step": 813 }, { "epoch": 1.0738786279683377, "grad_norm": 0.6398543464753047, "learning_rate": 5.2302405961625225e-06, "loss": 0.0611, "step": 814 }, { "epoch": 1.075197889182058, "grad_norm": 0.5784969161066025, "learning_rate": 5.2187361095575596e-06, "loss": 0.0588, "step": 815 }, { "epoch": 1.0765171503957784, "grad_norm": 0.39490983342015973, "learning_rate": 5.2072304625964785e-06, "loss": 0.0296, "step": 816 }, { "epoch": 1.0778364116094987, "grad_norm": 0.57451932340248, "learning_rate": 5.195723716314688e-06, "loss": 0.0531, "step": 817 }, { "epoch": 1.079155672823219, "grad_norm": 0.6864743480441236, "learning_rate": 5.1842159317534304e-06, "loss": 0.0342, "step": 818 }, { "epoch": 1.0804749340369393, "grad_norm": 0.4862515416335429, "learning_rate": 5.172707169959451e-06, "loss": 0.0463, "step": 819 }, { "epoch": 1.0817941952506596, "grad_norm": 0.6315245551709583, "learning_rate": 5.161197491984684e-06, "loss": 0.0537, "step": 820 }, { "epoch": 1.08311345646438, "grad_norm": 0.6106456993177825, "learning_rate": 5.149686958885923e-06, "loss": 0.0614, "step": 821 }, { "epoch": 1.0844327176781003, "grad_norm": 0.49893031079039063, "learning_rate": 5.138175631724495e-06, "loss": 0.0491, "step": 822 }, { "epoch": 1.0857519788918206, "grad_norm": 0.31147981251138374, "learning_rate": 5.12666357156594e-06, "loss": 0.0278, "step": 823 }, { "epoch": 1.087071240105541, "grad_norm": 0.5875872281434777, "learning_rate": 5.11515083947969e-06, "loss": 0.0437, "step": 824 }, { "epoch": 1.0883905013192612, "grad_norm": 0.5730905962293109, "learning_rate": 5.103637496538738e-06, "loss": 0.0476, "step": 825 }, { "epoch": 1.0897097625329815, "grad_norm": 0.8895822545480768, "learning_rate": 5.092123603819318e-06, "loss": 0.0434, "step": 826 }, { "epoch": 1.0910290237467017, "grad_norm": 0.5226509819931012, "learning_rate": 5.080609222400582e-06, "loss": 0.0453, "step": 827 }, { "epoch": 1.0923482849604222, "grad_norm": 0.49301189424014497, "learning_rate": 5.069094413364272e-06, "loss": 0.0395, "step": 828 }, { "epoch": 1.0936675461741425, "grad_norm": 0.4358556106625212, "learning_rate": 5.0575792377944e-06, "loss": 0.0345, "step": 829 }, { "epoch": 1.0949868073878628, "grad_norm": 0.5273548010655754, "learning_rate": 5.046063756776926e-06, "loss": 0.0385, "step": 830 }, { "epoch": 1.096306068601583, "grad_norm": 1.3462457041300329, "learning_rate": 5.03454803139942e-06, "loss": 0.0402, "step": 831 }, { "epoch": 1.0976253298153034, "grad_norm": 0.4370812057351214, "learning_rate": 5.0230321227507595e-06, "loss": 0.0325, "step": 832 }, { "epoch": 1.0989445910290236, "grad_norm": 0.6267600963038585, "learning_rate": 5.0115160919207894e-06, "loss": 0.0459, "step": 833 }, { "epoch": 1.1002638522427441, "grad_norm": 0.6313390475018821, "learning_rate": 5e-06, "loss": 0.0571, "step": 834 }, { "epoch": 1.1015831134564644, "grad_norm": 0.5417310869246437, "learning_rate": 4.988483908079212e-06, "loss": 0.0515, "step": 835 }, { "epoch": 1.1029023746701847, "grad_norm": 0.3807618659150837, "learning_rate": 4.976967877249242e-06, "loss": 0.0345, "step": 836 }, { "epoch": 1.104221635883905, "grad_norm": 0.421468296572213, "learning_rate": 4.9654519686005815e-06, "loss": 0.0367, "step": 837 }, { "epoch": 1.1055408970976253, "grad_norm": 0.4542996221952275, "learning_rate": 4.953936243223077e-06, "loss": 0.0365, "step": 838 }, { "epoch": 1.1068601583113455, "grad_norm": 0.46168172671671587, "learning_rate": 4.942420762205601e-06, "loss": 0.0488, "step": 839 }, { "epoch": 1.108179419525066, "grad_norm": 0.5673446414814094, "learning_rate": 4.93090558663573e-06, "loss": 0.0425, "step": 840 }, { "epoch": 1.1094986807387863, "grad_norm": 0.4602088265244389, "learning_rate": 4.9193907775994196e-06, "loss": 0.0378, "step": 841 }, { "epoch": 1.1108179419525066, "grad_norm": 0.3983517164777114, "learning_rate": 4.907876396180684e-06, "loss": 0.0348, "step": 842 }, { "epoch": 1.1121372031662269, "grad_norm": 0.38537918505531044, "learning_rate": 4.896362503461264e-06, "loss": 0.0379, "step": 843 }, { "epoch": 1.1134564643799472, "grad_norm": 0.4910602117897347, "learning_rate": 4.884849160520311e-06, "loss": 0.039, "step": 844 }, { "epoch": 1.1147757255936674, "grad_norm": 0.8410599371803616, "learning_rate": 4.873336428434062e-06, "loss": 0.0795, "step": 845 }, { "epoch": 1.116094986807388, "grad_norm": 0.4238465190982125, "learning_rate": 4.861824368275508e-06, "loss": 0.039, "step": 846 }, { "epoch": 1.1174142480211082, "grad_norm": 0.8651752378440294, "learning_rate": 4.850313041114078e-06, "loss": 0.0403, "step": 847 }, { "epoch": 1.1187335092348285, "grad_norm": 0.417802081435934, "learning_rate": 4.838802508015316e-06, "loss": 0.0406, "step": 848 }, { "epoch": 1.1200527704485488, "grad_norm": 0.3506635201550878, "learning_rate": 4.82729283004055e-06, "loss": 0.0319, "step": 849 }, { "epoch": 1.121372031662269, "grad_norm": 0.7348279579830447, "learning_rate": 4.815784068246571e-06, "loss": 0.0482, "step": 850 }, { "epoch": 1.1226912928759893, "grad_norm": 0.4950364378777992, "learning_rate": 4.8042762836853135e-06, "loss": 0.0366, "step": 851 }, { "epoch": 1.1240105540897098, "grad_norm": 0.382207830811648, "learning_rate": 4.792769537403523e-06, "loss": 0.0306, "step": 852 }, { "epoch": 1.1253298153034301, "grad_norm": 0.40855083469378334, "learning_rate": 4.781263890442442e-06, "loss": 0.0368, "step": 853 }, { "epoch": 1.1266490765171504, "grad_norm": 0.5999735367431606, "learning_rate": 4.769759403837479e-06, "loss": 0.0455, "step": 854 }, { "epoch": 1.1279683377308707, "grad_norm": 0.714553966367233, "learning_rate": 4.758256138617886e-06, "loss": 0.0433, "step": 855 }, { "epoch": 1.129287598944591, "grad_norm": 0.5194206554238696, "learning_rate": 4.746754155806437e-06, "loss": 0.0351, "step": 856 }, { "epoch": 1.1306068601583115, "grad_norm": 0.7276753108318581, "learning_rate": 4.735253516419104e-06, "loss": 0.0624, "step": 857 }, { "epoch": 1.1319261213720317, "grad_norm": 0.580276316267312, "learning_rate": 4.723754281464732e-06, "loss": 0.0465, "step": 858 }, { "epoch": 1.133245382585752, "grad_norm": 0.5024855831871866, "learning_rate": 4.712256511944714e-06, "loss": 0.0349, "step": 859 }, { "epoch": 1.1345646437994723, "grad_norm": 0.5832512512257071, "learning_rate": 4.700760268852669e-06, "loss": 0.0344, "step": 860 }, { "epoch": 1.1358839050131926, "grad_norm": 0.37995497107724613, "learning_rate": 4.6892656131741235e-06, "loss": 0.0322, "step": 861 }, { "epoch": 1.1372031662269129, "grad_norm": 0.5083535498753513, "learning_rate": 4.677772605886175e-06, "loss": 0.0341, "step": 862 }, { "epoch": 1.1385224274406331, "grad_norm": 0.4017695107966174, "learning_rate": 4.666281307957184e-06, "loss": 0.0386, "step": 863 }, { "epoch": 1.1398416886543536, "grad_norm": 0.46305677767852116, "learning_rate": 4.65479178034644e-06, "loss": 0.0378, "step": 864 }, { "epoch": 1.141160949868074, "grad_norm": 0.5531056834185942, "learning_rate": 4.643304084003839e-06, "loss": 0.0398, "step": 865 }, { "epoch": 1.1424802110817942, "grad_norm": 0.6317168378781234, "learning_rate": 4.631818279869567e-06, "loss": 0.0448, "step": 866 }, { "epoch": 1.1437994722955145, "grad_norm": 0.44611502163005134, "learning_rate": 4.62033442887377e-06, "loss": 0.0474, "step": 867 }, { "epoch": 1.1451187335092348, "grad_norm": 0.4298271394707465, "learning_rate": 4.608852591936231e-06, "loss": 0.0366, "step": 868 }, { "epoch": 1.1464379947229553, "grad_norm": 0.5739408870671835, "learning_rate": 4.597372829966053e-06, "loss": 0.0536, "step": 869 }, { "epoch": 1.1477572559366755, "grad_norm": 0.6389398509440999, "learning_rate": 4.585895203861328e-06, "loss": 0.04, "step": 870 }, { "epoch": 1.1490765171503958, "grad_norm": 0.5450066757975351, "learning_rate": 4.57441977450882e-06, "loss": 0.0453, "step": 871 }, { "epoch": 1.150395778364116, "grad_norm": 0.5364173970448743, "learning_rate": 4.562946602783637e-06, "loss": 0.0639, "step": 872 }, { "epoch": 1.1517150395778364, "grad_norm": 0.8995373745104792, "learning_rate": 4.551475749548915e-06, "loss": 0.073, "step": 873 }, { "epoch": 1.1530343007915567, "grad_norm": 0.5515888196554394, "learning_rate": 4.540007275655485e-06, "loss": 0.0311, "step": 874 }, { "epoch": 1.154353562005277, "grad_norm": 0.4806942231289383, "learning_rate": 4.528541241941562e-06, "loss": 0.0383, "step": 875 }, { "epoch": 1.1556728232189974, "grad_norm": 0.45703528312267117, "learning_rate": 4.517077709232411e-06, "loss": 0.031, "step": 876 }, { "epoch": 1.1569920844327177, "grad_norm": 0.5649548132455126, "learning_rate": 4.505616738340034e-06, "loss": 0.0588, "step": 877 }, { "epoch": 1.158311345646438, "grad_norm": 0.42689901016539084, "learning_rate": 4.49415839006284e-06, "loss": 0.0391, "step": 878 }, { "epoch": 1.1596306068601583, "grad_norm": 0.47023892707216963, "learning_rate": 4.482702725185328e-06, "loss": 0.0351, "step": 879 }, { "epoch": 1.1609498680738786, "grad_norm": 0.5880176309157975, "learning_rate": 4.471249804477758e-06, "loss": 0.0495, "step": 880 }, { "epoch": 1.162269129287599, "grad_norm": 0.5468215371838773, "learning_rate": 4.4597996886958365e-06, "loss": 0.0416, "step": 881 }, { "epoch": 1.1635883905013193, "grad_norm": 0.36052344580049805, "learning_rate": 4.448352438580391e-06, "loss": 0.031, "step": 882 }, { "epoch": 1.1649076517150396, "grad_norm": 0.48404972523614165, "learning_rate": 4.436908114857043e-06, "loss": 0.0464, "step": 883 }, { "epoch": 1.16622691292876, "grad_norm": 0.595188768341347, "learning_rate": 4.4254667782358925e-06, "loss": 0.0511, "step": 884 }, { "epoch": 1.1675461741424802, "grad_norm": 0.4826949321918809, "learning_rate": 4.4140284894111954e-06, "loss": 0.0645, "step": 885 }, { "epoch": 1.1688654353562005, "grad_norm": 0.4942481166728681, "learning_rate": 4.402593309061034e-06, "loss": 0.0505, "step": 886 }, { "epoch": 1.1701846965699207, "grad_norm": 0.94776252788579, "learning_rate": 4.391161297847007e-06, "loss": 0.0467, "step": 887 }, { "epoch": 1.1715039577836412, "grad_norm": 0.5913035358203962, "learning_rate": 4.379732516413897e-06, "loss": 0.0319, "step": 888 }, { "epoch": 1.1728232189973615, "grad_norm": 0.5561776348517845, "learning_rate": 4.368307025389355e-06, "loss": 0.0473, "step": 889 }, { "epoch": 1.1741424802110818, "grad_norm": 0.4089933220910145, "learning_rate": 4.356884885383578e-06, "loss": 0.0396, "step": 890 }, { "epoch": 1.175461741424802, "grad_norm": 0.3693219469627808, "learning_rate": 4.345466156988984e-06, "loss": 0.0415, "step": 891 }, { "epoch": 1.1767810026385224, "grad_norm": 0.34435079223721005, "learning_rate": 4.334050900779893e-06, "loss": 0.0301, "step": 892 }, { "epoch": 1.1781002638522429, "grad_norm": 0.4369942166434897, "learning_rate": 4.322639177312209e-06, "loss": 0.034, "step": 893 }, { "epoch": 1.1794195250659631, "grad_norm": 0.44910021752481283, "learning_rate": 4.3112310471230925e-06, "loss": 0.0525, "step": 894 }, { "epoch": 1.1807387862796834, "grad_norm": 0.4047144759474002, "learning_rate": 4.299826570730642e-06, "loss": 0.0293, "step": 895 }, { "epoch": 1.1820580474934037, "grad_norm": 0.5291844637455119, "learning_rate": 4.2884258086335755e-06, "loss": 0.0454, "step": 896 }, { "epoch": 1.183377308707124, "grad_norm": 0.5033898941289409, "learning_rate": 4.277028821310907e-06, "loss": 0.0435, "step": 897 }, { "epoch": 1.1846965699208443, "grad_norm": 0.5416991303244074, "learning_rate": 4.265635669221622e-06, "loss": 0.0455, "step": 898 }, { "epoch": 1.1860158311345645, "grad_norm": 0.48088831064375376, "learning_rate": 4.254246412804366e-06, "loss": 0.0462, "step": 899 }, { "epoch": 1.187335092348285, "grad_norm": 0.5345434830020392, "learning_rate": 4.2428611124771184e-06, "loss": 0.051, "step": 900 }, { "epoch": 1.1886543535620053, "grad_norm": 0.4392194033994731, "learning_rate": 4.231479828636867e-06, "loss": 0.0377, "step": 901 }, { "epoch": 1.1899736147757256, "grad_norm": 0.4508926786816226, "learning_rate": 4.220102621659298e-06, "loss": 0.0356, "step": 902 }, { "epoch": 1.1912928759894459, "grad_norm": 0.4711436334816274, "learning_rate": 4.2087295518984675e-06, "loss": 0.0294, "step": 903 }, { "epoch": 1.1926121372031662, "grad_norm": 0.46115423803718414, "learning_rate": 4.197360679686489e-06, "loss": 0.0325, "step": 904 }, { "epoch": 1.1939313984168864, "grad_norm": 0.4330528618484114, "learning_rate": 4.185996065333201e-06, "loss": 0.0371, "step": 905 }, { "epoch": 1.195250659630607, "grad_norm": 0.5135651330330013, "learning_rate": 4.174635769125862e-06, "loss": 0.0435, "step": 906 }, { "epoch": 1.1965699208443272, "grad_norm": 0.43414421175381523, "learning_rate": 4.163279851328821e-06, "loss": 0.035, "step": 907 }, { "epoch": 1.1978891820580475, "grad_norm": 0.5253170032158321, "learning_rate": 4.151928372183198e-06, "loss": 0.0618, "step": 908 }, { "epoch": 1.1992084432717678, "grad_norm": 0.5792566627120969, "learning_rate": 4.1405813919065715e-06, "loss": 0.0477, "step": 909 }, { "epoch": 1.200527704485488, "grad_norm": 0.4846686952615566, "learning_rate": 4.129238970692651e-06, "loss": 0.0352, "step": 910 }, { "epoch": 1.2018469656992083, "grad_norm": 0.4193436838142133, "learning_rate": 4.11790116871096e-06, "loss": 0.0409, "step": 911 }, { "epoch": 1.2031662269129288, "grad_norm": 0.5357265529510714, "learning_rate": 4.10656804610652e-06, "loss": 0.0476, "step": 912 }, { "epoch": 1.2044854881266491, "grad_norm": 0.47412399206332867, "learning_rate": 4.09523966299953e-06, "loss": 0.0437, "step": 913 }, { "epoch": 1.2058047493403694, "grad_norm": 0.40662175721213795, "learning_rate": 4.083916079485044e-06, "loss": 0.034, "step": 914 }, { "epoch": 1.2071240105540897, "grad_norm": 0.3550538451157904, "learning_rate": 4.072597355632656e-06, "loss": 0.0376, "step": 915 }, { "epoch": 1.20844327176781, "grad_norm": 0.36691408572488504, "learning_rate": 4.061283551486185e-06, "loss": 0.0309, "step": 916 }, { "epoch": 1.2097625329815302, "grad_norm": 0.7508982125167554, "learning_rate": 4.0499747270633425e-06, "loss": 0.0527, "step": 917 }, { "epoch": 1.2110817941952507, "grad_norm": 0.4981336921560989, "learning_rate": 4.038670942355431e-06, "loss": 0.0569, "step": 918 }, { "epoch": 1.212401055408971, "grad_norm": 0.31515678753079435, "learning_rate": 4.027372257327017e-06, "loss": 0.0331, "step": 919 }, { "epoch": 1.2137203166226913, "grad_norm": 0.3508178905013858, "learning_rate": 4.016078731915608e-06, "loss": 0.0246, "step": 920 }, { "epoch": 1.2150395778364116, "grad_norm": 0.45178518763707604, "learning_rate": 4.004790426031353e-06, "loss": 0.0547, "step": 921 }, { "epoch": 1.2163588390501319, "grad_norm": 0.8034034003023315, "learning_rate": 3.993507399556699e-06, "loss": 0.0275, "step": 922 }, { "epoch": 1.2176781002638521, "grad_norm": 0.4725326112795675, "learning_rate": 3.982229712346096e-06, "loss": 0.0348, "step": 923 }, { "epoch": 1.2189973614775726, "grad_norm": 0.35517168294538254, "learning_rate": 3.970957424225666e-06, "loss": 0.0264, "step": 924 }, { "epoch": 1.220316622691293, "grad_norm": 0.3695274760387495, "learning_rate": 3.959690594992889e-06, "loss": 0.0289, "step": 925 }, { "epoch": 1.2216358839050132, "grad_norm": 0.42133612321865277, "learning_rate": 3.9484292844162905e-06, "loss": 0.0369, "step": 926 }, { "epoch": 1.2229551451187335, "grad_norm": 0.47170932535065363, "learning_rate": 3.937173552235117e-06, "loss": 0.0345, "step": 927 }, { "epoch": 1.2242744063324538, "grad_norm": 0.46520666681896905, "learning_rate": 3.925923458159023e-06, "loss": 0.0445, "step": 928 }, { "epoch": 1.225593667546174, "grad_norm": 0.4219364739293325, "learning_rate": 3.914679061867754e-06, "loss": 0.0341, "step": 929 }, { "epoch": 1.2269129287598945, "grad_norm": 0.49134686593171684, "learning_rate": 3.903440423010835e-06, "loss": 0.0426, "step": 930 }, { "epoch": 1.2282321899736148, "grad_norm": 0.35418526461050054, "learning_rate": 3.892207601207238e-06, "loss": 0.0328, "step": 931 }, { "epoch": 1.229551451187335, "grad_norm": 0.4155466911167312, "learning_rate": 3.880980656045087e-06, "loss": 0.0337, "step": 932 }, { "epoch": 1.2308707124010554, "grad_norm": 0.5147604758526607, "learning_rate": 3.869759647081326e-06, "loss": 0.0454, "step": 933 }, { "epoch": 1.2321899736147757, "grad_norm": 0.5061623638024214, "learning_rate": 3.858544633841409e-06, "loss": 0.0523, "step": 934 }, { "epoch": 1.233509234828496, "grad_norm": 0.43230450710501256, "learning_rate": 3.847335675818985e-06, "loss": 0.0427, "step": 935 }, { "epoch": 1.2348284960422165, "grad_norm": 0.3674699699030769, "learning_rate": 3.836132832475583e-06, "loss": 0.0332, "step": 936 }, { "epoch": 1.2361477572559367, "grad_norm": 0.41966842743992777, "learning_rate": 3.8249361632402884e-06, "loss": 0.0348, "step": 937 }, { "epoch": 1.237467018469657, "grad_norm": 0.4675730882835975, "learning_rate": 3.813745727509439e-06, "loss": 0.0263, "step": 938 }, { "epoch": 1.2387862796833773, "grad_norm": 0.5078899880579145, "learning_rate": 3.802561584646307e-06, "loss": 0.0455, "step": 939 }, { "epoch": 1.2401055408970976, "grad_norm": 0.42307371705498914, "learning_rate": 3.7913837939807763e-06, "loss": 0.0311, "step": 940 }, { "epoch": 1.2414248021108178, "grad_norm": 0.4776324071645813, "learning_rate": 3.7802124148090365e-06, "loss": 0.0387, "step": 941 }, { "epoch": 1.2427440633245384, "grad_norm": 0.506615988672597, "learning_rate": 3.769047506393267e-06, "loss": 0.0432, "step": 942 }, { "epoch": 1.2440633245382586, "grad_norm": 0.42547177727650803, "learning_rate": 3.757889127961316e-06, "loss": 0.0353, "step": 943 }, { "epoch": 1.245382585751979, "grad_norm": 0.5796557163775428, "learning_rate": 3.7467373387063973e-06, "loss": 0.0631, "step": 944 }, { "epoch": 1.2467018469656992, "grad_norm": 0.505430231894429, "learning_rate": 3.7355921977867672e-06, "loss": 0.0444, "step": 945 }, { "epoch": 1.2480211081794195, "grad_norm": 0.44127983017914596, "learning_rate": 3.7244537643254115e-06, "loss": 0.0348, "step": 946 }, { "epoch": 1.2493403693931397, "grad_norm": 0.705598008981505, "learning_rate": 3.7133220974097383e-06, "loss": 0.0596, "step": 947 }, { "epoch": 1.2506596306068603, "grad_norm": 0.5361833719044061, "learning_rate": 3.70219725609126e-06, "loss": 0.0541, "step": 948 }, { "epoch": 1.2519788918205805, "grad_norm": 0.5815052949402663, "learning_rate": 3.6910792993852773e-06, "loss": 0.0459, "step": 949 }, { "epoch": 1.2532981530343008, "grad_norm": 0.35605491319913013, "learning_rate": 3.679968286270571e-06, "loss": 0.0322, "step": 950 }, { "epoch": 1.254617414248021, "grad_norm": 0.3331663956238606, "learning_rate": 3.6688642756890895e-06, "loss": 0.0289, "step": 951 }, { "epoch": 1.2559366754617414, "grad_norm": 0.44158160096532845, "learning_rate": 3.6577673265456296e-06, "loss": 0.044, "step": 952 }, { "epoch": 1.2572559366754619, "grad_norm": 0.4746536128360755, "learning_rate": 3.646677497707532e-06, "loss": 0.0379, "step": 953 }, { "epoch": 1.258575197889182, "grad_norm": 0.4350590512773799, "learning_rate": 3.6355948480043647e-06, "loss": 0.0438, "step": 954 }, { "epoch": 1.2598944591029024, "grad_norm": 0.6315170149439323, "learning_rate": 3.62451943622761e-06, "loss": 0.0752, "step": 955 }, { "epoch": 1.2612137203166227, "grad_norm": 0.6284106690387802, "learning_rate": 3.6134513211303555e-06, "loss": 0.0422, "step": 956 }, { "epoch": 1.262532981530343, "grad_norm": 0.3910609991343049, "learning_rate": 3.6023905614269834e-06, "loss": 0.0361, "step": 957 }, { "epoch": 1.2638522427440633, "grad_norm": 0.410139517265721, "learning_rate": 3.5913372157928515e-06, "loss": 0.0434, "step": 958 }, { "epoch": 1.2651715039577835, "grad_norm": 0.42724744417074056, "learning_rate": 3.580291342863993e-06, "loss": 0.0345, "step": 959 }, { "epoch": 1.266490765171504, "grad_norm": 0.42009080909638846, "learning_rate": 3.569253001236795e-06, "loss": 0.0475, "step": 960 }, { "epoch": 1.2678100263852243, "grad_norm": 0.49743514051851584, "learning_rate": 3.5582222494676984e-06, "loss": 0.0315, "step": 961 }, { "epoch": 1.2691292875989446, "grad_norm": 0.5477142452802849, "learning_rate": 3.5471991460728725e-06, "loss": 0.0386, "step": 962 }, { "epoch": 1.270448548812665, "grad_norm": 0.41591914851810763, "learning_rate": 3.5361837495279217e-06, "loss": 0.0357, "step": 963 }, { "epoch": 1.2717678100263852, "grad_norm": 0.4996536327861138, "learning_rate": 3.5251761182675626e-06, "loss": 0.0358, "step": 964 }, { "epoch": 1.2730870712401057, "grad_norm": 0.4519451879403226, "learning_rate": 3.514176310685318e-06, "loss": 0.0427, "step": 965 }, { "epoch": 1.2744063324538257, "grad_norm": 0.5130131641395598, "learning_rate": 3.5031843851332105e-06, "loss": 0.0453, "step": 966 }, { "epoch": 1.2757255936675462, "grad_norm": 0.40396338697034684, "learning_rate": 3.492200399921447e-06, "loss": 0.0279, "step": 967 }, { "epoch": 1.2770448548812665, "grad_norm": 0.4384440979919693, "learning_rate": 3.481224413318114e-06, "loss": 0.0379, "step": 968 }, { "epoch": 1.2783641160949868, "grad_norm": 0.5529379502906039, "learning_rate": 3.470256483548866e-06, "loss": 0.0512, "step": 969 }, { "epoch": 1.279683377308707, "grad_norm": 0.48363343424476124, "learning_rate": 3.459296668796619e-06, "loss": 0.0371, "step": 970 }, { "epoch": 1.2810026385224274, "grad_norm": 0.6110356799840317, "learning_rate": 3.4483450272012364e-06, "loss": 0.0514, "step": 971 }, { "epoch": 1.2823218997361479, "grad_norm": 0.3797144655454073, "learning_rate": 3.4374016168592296e-06, "loss": 0.0323, "step": 972 }, { "epoch": 1.2836411609498681, "grad_norm": 0.386622854183218, "learning_rate": 3.426466495823441e-06, "loss": 0.0325, "step": 973 }, { "epoch": 1.2849604221635884, "grad_norm": 0.549381263129161, "learning_rate": 3.4155397221027396e-06, "loss": 0.0343, "step": 974 }, { "epoch": 1.2862796833773087, "grad_norm": 0.3809569973876317, "learning_rate": 3.4046213536617164e-06, "loss": 0.0246, "step": 975 }, { "epoch": 1.287598944591029, "grad_norm": 0.3785065948314082, "learning_rate": 3.393711448420372e-06, "loss": 0.0333, "step": 976 }, { "epoch": 1.2889182058047495, "grad_norm": 0.47345085429095884, "learning_rate": 3.3828100642538097e-06, "loss": 0.0329, "step": 977 }, { "epoch": 1.2902374670184695, "grad_norm": 0.46504857911868974, "learning_rate": 3.371917258991933e-06, "loss": 0.0468, "step": 978 }, { "epoch": 1.29155672823219, "grad_norm": 0.43387007700506586, "learning_rate": 3.3610330904191353e-06, "loss": 0.0389, "step": 979 }, { "epoch": 1.2928759894459103, "grad_norm": 0.43289132694716775, "learning_rate": 3.3501576162739903e-06, "loss": 0.0357, "step": 980 }, { "epoch": 1.2941952506596306, "grad_norm": 0.5338596807204832, "learning_rate": 3.339290894248954e-06, "loss": 0.0448, "step": 981 }, { "epoch": 1.2955145118733509, "grad_norm": 0.44703974769176946, "learning_rate": 3.328432981990053e-06, "loss": 0.045, "step": 982 }, { "epoch": 1.2968337730870712, "grad_norm": 0.4312648230012682, "learning_rate": 3.317583937096577e-06, "loss": 0.0375, "step": 983 }, { "epoch": 1.2981530343007917, "grad_norm": 0.39804375835899897, "learning_rate": 3.306743817120777e-06, "loss": 0.0305, "step": 984 }, { "epoch": 1.299472295514512, "grad_norm": 0.40547275062936783, "learning_rate": 3.2959126795675616e-06, "loss": 0.0365, "step": 985 }, { "epoch": 1.3007915567282322, "grad_norm": 0.44179488739988027, "learning_rate": 3.2850905818941853e-06, "loss": 0.0491, "step": 986 }, { "epoch": 1.3021108179419525, "grad_norm": 0.4599921621944571, "learning_rate": 3.2742775815099503e-06, "loss": 0.039, "step": 987 }, { "epoch": 1.3034300791556728, "grad_norm": 0.47203341421145645, "learning_rate": 3.2634737357758994e-06, "loss": 0.0338, "step": 988 }, { "epoch": 1.3047493403693933, "grad_norm": 0.35696717651954174, "learning_rate": 3.252679102004509e-06, "loss": 0.0405, "step": 989 }, { "epoch": 1.3060686015831133, "grad_norm": 0.41069975024427896, "learning_rate": 3.2418937374593895e-06, "loss": 0.0354, "step": 990 }, { "epoch": 1.3073878627968338, "grad_norm": 0.48133408200291883, "learning_rate": 3.231117699354982e-06, "loss": 0.0427, "step": 991 }, { "epoch": 1.3087071240105541, "grad_norm": 0.4323486452790916, "learning_rate": 3.220351044856247e-06, "loss": 0.0238, "step": 992 }, { "epoch": 1.3100263852242744, "grad_norm": 0.4824686861935317, "learning_rate": 3.209593831078371e-06, "loss": 0.0346, "step": 993 }, { "epoch": 1.3113456464379947, "grad_norm": 0.5357251409622038, "learning_rate": 3.198846115086459e-06, "loss": 0.045, "step": 994 }, { "epoch": 1.312664907651715, "grad_norm": 0.4895627677343242, "learning_rate": 3.188107953895229e-06, "loss": 0.0442, "step": 995 }, { "epoch": 1.3139841688654355, "grad_norm": 0.565268430457988, "learning_rate": 3.177379404468715e-06, "loss": 0.0454, "step": 996 }, { "epoch": 1.3153034300791557, "grad_norm": 0.4785911360758799, "learning_rate": 3.1666605237199625e-06, "loss": 0.0352, "step": 997 }, { "epoch": 1.316622691292876, "grad_norm": 0.32890904146738625, "learning_rate": 3.1559513685107233e-06, "loss": 0.0197, "step": 998 }, { "epoch": 1.3179419525065963, "grad_norm": 0.43538266074480436, "learning_rate": 3.145251995651162e-06, "loss": 0.0375, "step": 999 }, { "epoch": 1.3192612137203166, "grad_norm": 0.3726106605998871, "learning_rate": 3.1345624618995444e-06, "loss": 0.0333, "step": 1000 }, { "epoch": 1.320580474934037, "grad_norm": 1.0522326452050843, "learning_rate": 3.1238828239619447e-06, "loss": 0.0418, "step": 1001 }, { "epoch": 1.3218997361477571, "grad_norm": 0.46264860868018753, "learning_rate": 3.11321313849194e-06, "loss": 0.0376, "step": 1002 }, { "epoch": 1.3232189973614776, "grad_norm": 0.5999884424789149, "learning_rate": 3.102553462090314e-06, "loss": 0.0469, "step": 1003 }, { "epoch": 1.324538258575198, "grad_norm": 0.44953860397274076, "learning_rate": 3.0919038513047507e-06, "loss": 0.024, "step": 1004 }, { "epoch": 1.3258575197889182, "grad_norm": 0.4671921335921803, "learning_rate": 3.081264362629539e-06, "loss": 0.0343, "step": 1005 }, { "epoch": 1.3271767810026385, "grad_norm": 0.36928026939799047, "learning_rate": 3.070635052505273e-06, "loss": 0.0367, "step": 1006 }, { "epoch": 1.3284960422163588, "grad_norm": 0.46448096365916186, "learning_rate": 3.0600159773185506e-06, "loss": 0.0314, "step": 1007 }, { "epoch": 1.3298153034300793, "grad_norm": 0.5066608290740877, "learning_rate": 3.0494071934016737e-06, "loss": 0.0444, "step": 1008 }, { "epoch": 1.3311345646437995, "grad_norm": 0.40340427606125134, "learning_rate": 3.038808757032353e-06, "loss": 0.0387, "step": 1009 }, { "epoch": 1.3324538258575198, "grad_norm": 0.39352281811558804, "learning_rate": 3.0282207244334084e-06, "loss": 0.0332, "step": 1010 }, { "epoch": 1.33377308707124, "grad_norm": 0.38733414560498847, "learning_rate": 3.0176431517724636e-06, "loss": 0.0275, "step": 1011 }, { "epoch": 1.3350923482849604, "grad_norm": 0.4256685091983431, "learning_rate": 3.007076095161662e-06, "loss": 0.0473, "step": 1012 }, { "epoch": 1.3364116094986809, "grad_norm": 0.4165491156723393, "learning_rate": 2.996519610657356e-06, "loss": 0.0412, "step": 1013 }, { "epoch": 1.337730870712401, "grad_norm": 0.41360475602040825, "learning_rate": 2.9859737542598157e-06, "loss": 0.0435, "step": 1014 }, { "epoch": 1.3390501319261214, "grad_norm": 0.37936895912809304, "learning_rate": 2.975438581912933e-06, "loss": 0.0352, "step": 1015 }, { "epoch": 1.3403693931398417, "grad_norm": 0.32943810250125843, "learning_rate": 2.9649141495039225e-06, "loss": 0.03, "step": 1016 }, { "epoch": 1.341688654353562, "grad_norm": 0.49629621970844595, "learning_rate": 2.9544005128630217e-06, "loss": 0.0382, "step": 1017 }, { "epoch": 1.3430079155672823, "grad_norm": 0.6887166327460013, "learning_rate": 2.943897727763202e-06, "loss": 0.0708, "step": 1018 }, { "epoch": 1.3443271767810026, "grad_norm": 0.4093448467540715, "learning_rate": 2.9334058499198702e-06, "loss": 0.0335, "step": 1019 }, { "epoch": 1.345646437994723, "grad_norm": 0.4297548004407467, "learning_rate": 2.9229249349905686e-06, "loss": 0.0392, "step": 1020 }, { "epoch": 1.3469656992084433, "grad_norm": 0.39780686585256225, "learning_rate": 2.912455038574686e-06, "loss": 0.036, "step": 1021 }, { "epoch": 1.3482849604221636, "grad_norm": 0.3844234871086875, "learning_rate": 2.9019962162131564e-06, "loss": 0.0261, "step": 1022 }, { "epoch": 1.349604221635884, "grad_norm": 0.6872195624516678, "learning_rate": 2.891548523388173e-06, "loss": 0.0573, "step": 1023 }, { "epoch": 1.3509234828496042, "grad_norm": 0.462684090932404, "learning_rate": 2.8811120155228843e-06, "loss": 0.0514, "step": 1024 }, { "epoch": 1.3522427440633247, "grad_norm": 0.32721023602849464, "learning_rate": 2.870686747981108e-06, "loss": 0.0283, "step": 1025 }, { "epoch": 1.3535620052770447, "grad_norm": 0.506546676516316, "learning_rate": 2.8602727760670336e-06, "loss": 0.0412, "step": 1026 }, { "epoch": 1.3548812664907652, "grad_norm": 0.501844856487769, "learning_rate": 2.8498701550249234e-06, "loss": 0.0396, "step": 1027 }, { "epoch": 1.3562005277044855, "grad_norm": 0.4854469757169651, "learning_rate": 2.839478940038833e-06, "loss": 0.0359, "step": 1028 }, { "epoch": 1.3575197889182058, "grad_norm": 0.5197094676195364, "learning_rate": 2.8290991862323068e-06, "loss": 0.0436, "step": 1029 }, { "epoch": 1.358839050131926, "grad_norm": 0.45575980675496397, "learning_rate": 2.8187309486680924e-06, "loss": 0.0418, "step": 1030 }, { "epoch": 1.3601583113456464, "grad_norm": 0.5335924104959506, "learning_rate": 2.8083742823478423e-06, "loss": 0.0423, "step": 1031 }, { "epoch": 1.3614775725593669, "grad_norm": 0.6099518497682432, "learning_rate": 2.7980292422118282e-06, "loss": 0.0553, "step": 1032 }, { "epoch": 1.3627968337730871, "grad_norm": 0.45857130513461763, "learning_rate": 2.7876958831386487e-06, "loss": 0.0412, "step": 1033 }, { "epoch": 1.3641160949868074, "grad_norm": 0.4102160077304873, "learning_rate": 2.777374259944929e-06, "loss": 0.0308, "step": 1034 }, { "epoch": 1.3654353562005277, "grad_norm": 0.45878667026434056, "learning_rate": 2.7670644273850446e-06, "loss": 0.047, "step": 1035 }, { "epoch": 1.366754617414248, "grad_norm": 0.44773233254363415, "learning_rate": 2.7567664401508225e-06, "loss": 0.0472, "step": 1036 }, { "epoch": 1.3680738786279685, "grad_norm": 0.42822024536667164, "learning_rate": 2.7464803528712506e-06, "loss": 0.037, "step": 1037 }, { "epoch": 1.3693931398416885, "grad_norm": 0.6915956645143235, "learning_rate": 2.736206220112192e-06, "loss": 0.0295, "step": 1038 }, { "epoch": 1.370712401055409, "grad_norm": 0.8305607893389385, "learning_rate": 2.725944096376092e-06, "loss": 0.0391, "step": 1039 }, { "epoch": 1.3720316622691293, "grad_norm": 0.37663284086172133, "learning_rate": 2.7156940361016864e-06, "loss": 0.0298, "step": 1040 }, { "epoch": 1.3733509234828496, "grad_norm": 0.538910224000725, "learning_rate": 2.705456093663723e-06, "loss": 0.0444, "step": 1041 }, { "epoch": 1.3746701846965699, "grad_norm": 0.5319313138990732, "learning_rate": 2.6952303233726628e-06, "loss": 0.0436, "step": 1042 }, { "epoch": 1.3759894459102902, "grad_norm": 0.32666054786344273, "learning_rate": 2.6850167794743966e-06, "loss": 0.0307, "step": 1043 }, { "epoch": 1.3773087071240107, "grad_norm": 0.5617149051548302, "learning_rate": 2.6748155161499568e-06, "loss": 0.0506, "step": 1044 }, { "epoch": 1.378627968337731, "grad_norm": 0.471177550303539, "learning_rate": 2.664626587515231e-06, "loss": 0.0348, "step": 1045 }, { "epoch": 1.3799472295514512, "grad_norm": 0.4611663413099852, "learning_rate": 2.6544500476206675e-06, "loss": 0.0379, "step": 1046 }, { "epoch": 1.3812664907651715, "grad_norm": 0.7059929228923759, "learning_rate": 2.6442859504510005e-06, "loss": 0.0401, "step": 1047 }, { "epoch": 1.3825857519788918, "grad_norm": 0.4531225005491031, "learning_rate": 2.634134349924956e-06, "loss": 0.0396, "step": 1048 }, { "epoch": 1.383905013192612, "grad_norm": 0.41911814107181067, "learning_rate": 2.6239952998949676e-06, "loss": 0.0409, "step": 1049 }, { "epoch": 1.3852242744063323, "grad_norm": 0.36612524416461345, "learning_rate": 2.6138688541468903e-06, "loss": 0.0296, "step": 1050 }, { "epoch": 1.3865435356200528, "grad_norm": 0.3780056660999297, "learning_rate": 2.603755066399718e-06, "loss": 0.037, "step": 1051 }, { "epoch": 1.3878627968337731, "grad_norm": 0.46963275669637067, "learning_rate": 2.5936539903052893e-06, "loss": 0.05, "step": 1052 }, { "epoch": 1.3891820580474934, "grad_norm": 0.409153783085971, "learning_rate": 2.583565679448018e-06, "loss": 0.0247, "step": 1053 }, { "epoch": 1.3905013192612137, "grad_norm": 0.42054547753287036, "learning_rate": 2.573490187344596e-06, "loss": 0.0361, "step": 1054 }, { "epoch": 1.391820580474934, "grad_norm": 0.5255045835106068, "learning_rate": 2.563427567443715e-06, "loss": 0.0469, "step": 1055 }, { "epoch": 1.3931398416886545, "grad_norm": 0.9825521752923335, "learning_rate": 2.5533778731257824e-06, "loss": 0.0554, "step": 1056 }, { "epoch": 1.3944591029023747, "grad_norm": 0.38448801305592534, "learning_rate": 2.5433411577026396e-06, "loss": 0.0386, "step": 1057 }, { "epoch": 1.395778364116095, "grad_norm": 0.4886538622120843, "learning_rate": 2.5333174744172705e-06, "loss": 0.0451, "step": 1058 }, { "epoch": 1.3970976253298153, "grad_norm": 0.5256881739187035, "learning_rate": 2.523306876443532e-06, "loss": 0.053, "step": 1059 }, { "epoch": 1.3984168865435356, "grad_norm": 0.5884455219037508, "learning_rate": 2.513309416885865e-06, "loss": 0.0518, "step": 1060 }, { "epoch": 1.3997361477572559, "grad_norm": 0.45490704561246353, "learning_rate": 2.5033251487790124e-06, "loss": 0.0336, "step": 1061 }, { "epoch": 1.4010554089709761, "grad_norm": 0.44697786677075335, "learning_rate": 2.493354125087738e-06, "loss": 0.0424, "step": 1062 }, { "epoch": 1.4023746701846966, "grad_norm": 0.5197934850801327, "learning_rate": 2.48339639870655e-06, "loss": 0.026, "step": 1063 }, { "epoch": 1.403693931398417, "grad_norm": 0.36564648892730267, "learning_rate": 2.4734520224594094e-06, "loss": 0.0318, "step": 1064 }, { "epoch": 1.4050131926121372, "grad_norm": 0.5920632674980368, "learning_rate": 2.4635210490994648e-06, "loss": 0.0643, "step": 1065 }, { "epoch": 1.4063324538258575, "grad_norm": 0.5951808956519485, "learning_rate": 2.4536035313087603e-06, "loss": 0.0642, "step": 1066 }, { "epoch": 1.4076517150395778, "grad_norm": 0.5155676663484681, "learning_rate": 2.4436995216979635e-06, "loss": 0.036, "step": 1067 }, { "epoch": 1.4089709762532983, "grad_norm": 0.39084449761148354, "learning_rate": 2.4338090728060808e-06, "loss": 0.0399, "step": 1068 }, { "epoch": 1.4102902374670185, "grad_norm": 0.41720046914502273, "learning_rate": 2.4239322371001855e-06, "loss": 0.0434, "step": 1069 }, { "epoch": 1.4116094986807388, "grad_norm": 0.49043289317821803, "learning_rate": 2.414069066975128e-06, "loss": 0.0495, "step": 1070 }, { "epoch": 1.412928759894459, "grad_norm": 0.43929751116025734, "learning_rate": 2.404219614753273e-06, "loss": 0.0355, "step": 1071 }, { "epoch": 1.4142480211081794, "grad_norm": 0.8439643057831976, "learning_rate": 2.3943839326842096e-06, "loss": 0.0544, "step": 1072 }, { "epoch": 1.4155672823218997, "grad_norm": 0.46091464446991814, "learning_rate": 2.3845620729444803e-06, "loss": 0.0422, "step": 1073 }, { "epoch": 1.41688654353562, "grad_norm": 0.5467159719235117, "learning_rate": 2.3747540876373026e-06, "loss": 0.0515, "step": 1074 }, { "epoch": 1.4182058047493404, "grad_norm": 0.5121155616555213, "learning_rate": 2.364960028792292e-06, "loss": 0.0382, "step": 1075 }, { "epoch": 1.4195250659630607, "grad_norm": 0.5440053881184854, "learning_rate": 2.3551799483651894e-06, "loss": 0.0422, "step": 1076 }, { "epoch": 1.420844327176781, "grad_norm": 0.4357587566255143, "learning_rate": 2.345413898237576e-06, "loss": 0.043, "step": 1077 }, { "epoch": 1.4221635883905013, "grad_norm": 0.49773296255549826, "learning_rate": 2.335661930216611e-06, "loss": 0.0433, "step": 1078 }, { "epoch": 1.4234828496042216, "grad_norm": 0.5789417671457069, "learning_rate": 2.325924096034749e-06, "loss": 0.0495, "step": 1079 }, { "epoch": 1.424802110817942, "grad_norm": 0.5346328901027718, "learning_rate": 2.316200447349466e-06, "loss": 0.0422, "step": 1080 }, { "epoch": 1.4261213720316623, "grad_norm": 0.41947416672577986, "learning_rate": 2.306491035742987e-06, "loss": 0.0376, "step": 1081 }, { "epoch": 1.4274406332453826, "grad_norm": 0.5242085536717893, "learning_rate": 2.296795912722014e-06, "loss": 0.0459, "step": 1082 }, { "epoch": 1.428759894459103, "grad_norm": 0.5715780963086183, "learning_rate": 2.2871151297174448e-06, "loss": 0.0625, "step": 1083 }, { "epoch": 1.4300791556728232, "grad_norm": 0.6352727794275724, "learning_rate": 2.2774487380841116e-06, "loss": 0.0523, "step": 1084 }, { "epoch": 1.4313984168865435, "grad_norm": 0.3460910768833545, "learning_rate": 2.267796789100501e-06, "loss": 0.0333, "step": 1085 }, { "epoch": 1.4327176781002637, "grad_norm": 0.3787713556887917, "learning_rate": 2.258159333968484e-06, "loss": 0.0337, "step": 1086 }, { "epoch": 1.4340369393139842, "grad_norm": 0.4349330927269478, "learning_rate": 2.2485364238130435e-06, "loss": 0.0374, "step": 1087 }, { "epoch": 1.4353562005277045, "grad_norm": 0.5745490778847844, "learning_rate": 2.2389281096820077e-06, "loss": 0.051, "step": 1088 }, { "epoch": 1.4366754617414248, "grad_norm": 0.4307299555917101, "learning_rate": 2.2293344425457677e-06, "loss": 0.0304, "step": 1089 }, { "epoch": 1.437994722955145, "grad_norm": 0.38163317920987777, "learning_rate": 2.21975547329702e-06, "loss": 0.0398, "step": 1090 }, { "epoch": 1.4393139841688654, "grad_norm": 0.5784514783227251, "learning_rate": 2.210191252750495e-06, "loss": 0.0441, "step": 1091 }, { "epoch": 1.4406332453825859, "grad_norm": 0.3504151982802342, "learning_rate": 2.2006418316426773e-06, "loss": 0.0279, "step": 1092 }, { "epoch": 1.4419525065963061, "grad_norm": 0.37141871869746296, "learning_rate": 2.191107260631548e-06, "loss": 0.0301, "step": 1093 }, { "epoch": 1.4432717678100264, "grad_norm": 0.462488599649108, "learning_rate": 2.1815875902963058e-06, "loss": 0.0518, "step": 1094 }, { "epoch": 1.4445910290237467, "grad_norm": 0.4680050954217483, "learning_rate": 2.1720828711371073e-06, "loss": 0.0376, "step": 1095 }, { "epoch": 1.445910290237467, "grad_norm": 0.3459584036515211, "learning_rate": 2.1625931535747964e-06, "loss": 0.0446, "step": 1096 }, { "epoch": 1.4472295514511873, "grad_norm": 0.4247643610002131, "learning_rate": 2.1531184879506353e-06, "loss": 0.0343, "step": 1097 }, { "epoch": 1.4485488126649075, "grad_norm": 0.44795236208765893, "learning_rate": 2.1436589245260375e-06, "loss": 0.0383, "step": 1098 }, { "epoch": 1.449868073878628, "grad_norm": 0.3569544473987064, "learning_rate": 2.1342145134823055e-06, "loss": 0.0279, "step": 1099 }, { "epoch": 1.4511873350923483, "grad_norm": 0.48039130871149777, "learning_rate": 2.1247853049203543e-06, "loss": 0.0431, "step": 1100 }, { "epoch": 1.4525065963060686, "grad_norm": 0.4454599786746931, "learning_rate": 2.1153713488604584e-06, "loss": 0.0388, "step": 1101 }, { "epoch": 1.4538258575197889, "grad_norm": 0.4149647708392341, "learning_rate": 2.1059726952419782e-06, "loss": 0.0285, "step": 1102 }, { "epoch": 1.4551451187335092, "grad_norm": 1.07940821497593, "learning_rate": 2.096589393923099e-06, "loss": 0.0405, "step": 1103 }, { "epoch": 1.4564643799472297, "grad_norm": 0.41333463006294796, "learning_rate": 2.087221494680563e-06, "loss": 0.0335, "step": 1104 }, { "epoch": 1.45778364116095, "grad_norm": 0.47934857213428844, "learning_rate": 2.077869047209409e-06, "loss": 0.0395, "step": 1105 }, { "epoch": 1.4591029023746702, "grad_norm": 0.4301358428454764, "learning_rate": 2.068532101122704e-06, "loss": 0.0367, "step": 1106 }, { "epoch": 1.4604221635883905, "grad_norm": 0.7383799743036201, "learning_rate": 2.059210705951284e-06, "loss": 0.0358, "step": 1107 }, { "epoch": 1.4617414248021108, "grad_norm": 0.5076306495749731, "learning_rate": 2.0499049111434922e-06, "loss": 0.0395, "step": 1108 }, { "epoch": 1.463060686015831, "grad_norm": 0.6142156593697868, "learning_rate": 2.040614766064913e-06, "loss": 0.0603, "step": 1109 }, { "epoch": 1.4643799472295513, "grad_norm": 0.540003245309238, "learning_rate": 2.0313403199981125e-06, "loss": 0.0418, "step": 1110 }, { "epoch": 1.4656992084432718, "grad_norm": 0.48032030319386504, "learning_rate": 2.0220816221423766e-06, "loss": 0.0491, "step": 1111 }, { "epoch": 1.4670184696569921, "grad_norm": 0.7196975923390573, "learning_rate": 2.012838721613447e-06, "loss": 0.0409, "step": 1112 }, { "epoch": 1.4683377308707124, "grad_norm": 0.49600655875059707, "learning_rate": 2.0036116674432653e-06, "loss": 0.0445, "step": 1113 }, { "epoch": 1.4696569920844327, "grad_norm": 0.4826715832187464, "learning_rate": 1.9944005085797124e-06, "loss": 0.0447, "step": 1114 }, { "epoch": 1.470976253298153, "grad_norm": 0.4555273017972598, "learning_rate": 1.985205293886346e-06, "loss": 0.0319, "step": 1115 }, { "epoch": 1.4722955145118735, "grad_norm": 0.3522225723283794, "learning_rate": 1.9760260721421426e-06, "loss": 0.0304, "step": 1116 }, { "epoch": 1.4736147757255937, "grad_norm": 1.1574544927654264, "learning_rate": 1.9668628920412414e-06, "loss": 0.04, "step": 1117 }, { "epoch": 1.474934036939314, "grad_norm": 0.630790094221889, "learning_rate": 1.9577158021926774e-06, "loss": 0.0511, "step": 1118 }, { "epoch": 1.4762532981530343, "grad_norm": 0.6010138707763073, "learning_rate": 1.9485848511201356e-06, "loss": 0.0496, "step": 1119 }, { "epoch": 1.4775725593667546, "grad_norm": 0.46159248622529453, "learning_rate": 1.9394700872616856e-06, "loss": 0.0438, "step": 1120 }, { "epoch": 1.4788918205804749, "grad_norm": 0.33882700008461164, "learning_rate": 1.9303715589695276e-06, "loss": 0.0257, "step": 1121 }, { "epoch": 1.4802110817941951, "grad_norm": 0.6716654235718109, "learning_rate": 1.921289314509734e-06, "loss": 0.0597, "step": 1122 }, { "epoch": 1.4815303430079156, "grad_norm": 0.4586407143060212, "learning_rate": 1.912223402061996e-06, "loss": 0.0482, "step": 1123 }, { "epoch": 1.482849604221636, "grad_norm": 0.4701092667659717, "learning_rate": 1.9031738697193618e-06, "loss": 0.0381, "step": 1124 }, { "epoch": 1.4841688654353562, "grad_norm": 0.7401903083147542, "learning_rate": 1.894140765487991e-06, "loss": 0.0394, "step": 1125 }, { "epoch": 1.4854881266490765, "grad_norm": 0.8448139423441899, "learning_rate": 1.8851241372868938e-06, "loss": 0.0497, "step": 1126 }, { "epoch": 1.4868073878627968, "grad_norm": 0.4632288927244968, "learning_rate": 1.876124032947677e-06, "loss": 0.0386, "step": 1127 }, { "epoch": 1.4881266490765173, "grad_norm": 0.45762129332012513, "learning_rate": 1.8671405002142918e-06, "loss": 0.0319, "step": 1128 }, { "epoch": 1.4894459102902375, "grad_norm": 0.557904708285742, "learning_rate": 1.8581735867427814e-06, "loss": 0.0422, "step": 1129 }, { "epoch": 1.4907651715039578, "grad_norm": 0.46525204351630345, "learning_rate": 1.8492233401010218e-06, "loss": 0.0362, "step": 1130 }, { "epoch": 1.492084432717678, "grad_norm": 0.4519024069507292, "learning_rate": 1.8402898077684806e-06, "loss": 0.0352, "step": 1131 }, { "epoch": 1.4934036939313984, "grad_norm": 0.4104534322908132, "learning_rate": 1.831373037135955e-06, "loss": 0.0366, "step": 1132 }, { "epoch": 1.4947229551451187, "grad_norm": 0.5588912712416868, "learning_rate": 1.8224730755053271e-06, "loss": 0.046, "step": 1133 }, { "epoch": 1.496042216358839, "grad_norm": 0.40865740314473187, "learning_rate": 1.813589970089308e-06, "loss": 0.0403, "step": 1134 }, { "epoch": 1.4973614775725594, "grad_norm": 0.48716418995795524, "learning_rate": 1.8047237680111896e-06, "loss": 0.0333, "step": 1135 }, { "epoch": 1.4986807387862797, "grad_norm": 0.4569400934197829, "learning_rate": 1.7958745163045987e-06, "loss": 0.0418, "step": 1136 }, { "epoch": 1.5, "grad_norm": 0.37936714163481816, "learning_rate": 1.7870422619132354e-06, "loss": 0.0379, "step": 1137 }, { "epoch": 1.5013192612137203, "grad_norm": 0.4334062619986392, "learning_rate": 1.778227051690639e-06, "loss": 0.0385, "step": 1138 }, { "epoch": 1.5026385224274406, "grad_norm": 0.38575696956056177, "learning_rate": 1.769428932399931e-06, "loss": 0.0352, "step": 1139 }, { "epoch": 1.503957783641161, "grad_norm": 0.8199548463395913, "learning_rate": 1.760647950713566e-06, "loss": 0.0486, "step": 1140 }, { "epoch": 1.5052770448548811, "grad_norm": 0.708932848202034, "learning_rate": 1.7518841532130903e-06, "loss": 0.035, "step": 1141 }, { "epoch": 1.5065963060686016, "grad_norm": 0.7190192350306006, "learning_rate": 1.74313758638889e-06, "loss": 0.0547, "step": 1142 }, { "epoch": 1.507915567282322, "grad_norm": 0.5802755650204077, "learning_rate": 1.7344082966399417e-06, "loss": 0.0408, "step": 1143 }, { "epoch": 1.5092348284960422, "grad_norm": 0.5244149473179786, "learning_rate": 1.7256963302735752e-06, "loss": 0.0374, "step": 1144 }, { "epoch": 1.5105540897097627, "grad_norm": 0.5014348404012459, "learning_rate": 1.7170017335052207e-06, "loss": 0.0405, "step": 1145 }, { "epoch": 1.5118733509234827, "grad_norm": 0.37500709644274266, "learning_rate": 1.7083245524581666e-06, "loss": 0.032, "step": 1146 }, { "epoch": 1.5131926121372032, "grad_norm": 0.49025529294811243, "learning_rate": 1.6996648331633126e-06, "loss": 0.0524, "step": 1147 }, { "epoch": 1.5145118733509235, "grad_norm": 0.6564367602612013, "learning_rate": 1.6910226215589303e-06, "loss": 0.059, "step": 1148 }, { "epoch": 1.5158311345646438, "grad_norm": 0.4820037232796035, "learning_rate": 1.6823979634904104e-06, "loss": 0.0292, "step": 1149 }, { "epoch": 1.517150395778364, "grad_norm": 0.46343906367583637, "learning_rate": 1.6737909047100292e-06, "loss": 0.0489, "step": 1150 }, { "epoch": 1.5184696569920844, "grad_norm": 2.0598255401544385, "learning_rate": 1.6652014908767016e-06, "loss": 0.051, "step": 1151 }, { "epoch": 1.5197889182058049, "grad_norm": 0.8097910851993284, "learning_rate": 1.6566297675557392e-06, "loss": 0.0583, "step": 1152 }, { "epoch": 1.521108179419525, "grad_norm": 0.48356131039133166, "learning_rate": 1.648075780218607e-06, "loss": 0.0466, "step": 1153 }, { "epoch": 1.5224274406332454, "grad_norm": 0.4873772171600058, "learning_rate": 1.6395395742426873e-06, "loss": 0.0461, "step": 1154 }, { "epoch": 1.5237467018469657, "grad_norm": 0.3879805072219005, "learning_rate": 1.63102119491103e-06, "loss": 0.0425, "step": 1155 }, { "epoch": 1.525065963060686, "grad_norm": 0.862053634040402, "learning_rate": 1.6225206874121219e-06, "loss": 0.0419, "step": 1156 }, { "epoch": 1.5263852242744065, "grad_norm": 1.0214659212719512, "learning_rate": 1.6140380968396418e-06, "loss": 0.0617, "step": 1157 }, { "epoch": 1.5277044854881265, "grad_norm": 0.45324201742786346, "learning_rate": 1.6055734681922225e-06, "loss": 0.0335, "step": 1158 }, { "epoch": 1.529023746701847, "grad_norm": 0.5472418519896244, "learning_rate": 1.5971268463732126e-06, "loss": 0.0438, "step": 1159 }, { "epoch": 1.5303430079155673, "grad_norm": 0.48629761142198785, "learning_rate": 1.588698276190438e-06, "loss": 0.0403, "step": 1160 }, { "epoch": 1.5316622691292876, "grad_norm": 0.39348202971703883, "learning_rate": 1.5802878023559598e-06, "loss": 0.0271, "step": 1161 }, { "epoch": 1.5329815303430079, "grad_norm": 0.40738852272834497, "learning_rate": 1.5718954694858457e-06, "loss": 0.0314, "step": 1162 }, { "epoch": 1.5343007915567282, "grad_norm": 0.7127989672912001, "learning_rate": 1.5635213220999279e-06, "loss": 0.0376, "step": 1163 }, { "epoch": 1.5356200527704487, "grad_norm": 0.5634050350048752, "learning_rate": 1.555165404621567e-06, "loss": 0.0385, "step": 1164 }, { "epoch": 1.5369393139841687, "grad_norm": 0.39618676604739544, "learning_rate": 1.546827761377418e-06, "loss": 0.0349, "step": 1165 }, { "epoch": 1.5382585751978892, "grad_norm": 0.5127069648860455, "learning_rate": 1.5385084365971947e-06, "loss": 0.0328, "step": 1166 }, { "epoch": 1.5395778364116095, "grad_norm": 0.48381020413982256, "learning_rate": 1.530207474413431e-06, "loss": 0.0216, "step": 1167 }, { "epoch": 1.5408970976253298, "grad_norm": 0.5403500805062056, "learning_rate": 1.5219249188612556e-06, "loss": 0.0423, "step": 1168 }, { "epoch": 1.5422163588390503, "grad_norm": 0.7163076289045839, "learning_rate": 1.51366081387815e-06, "loss": 0.0728, "step": 1169 }, { "epoch": 1.5435356200527703, "grad_norm": 0.4615364027150893, "learning_rate": 1.5054152033037206e-06, "loss": 0.0391, "step": 1170 }, { "epoch": 1.5448548812664908, "grad_norm": 0.4966279274667812, "learning_rate": 1.4971881308794633e-06, "loss": 0.0286, "step": 1171 }, { "epoch": 1.5461741424802111, "grad_norm": 0.6445822769969329, "learning_rate": 1.488979640248534e-06, "loss": 0.0383, "step": 1172 }, { "epoch": 1.5474934036939314, "grad_norm": 0.5511797875272039, "learning_rate": 1.4807897749555112e-06, "loss": 0.0319, "step": 1173 }, { "epoch": 1.5488126649076517, "grad_norm": 0.4028896054759746, "learning_rate": 1.4726185784461726e-06, "loss": 0.0376, "step": 1174 }, { "epoch": 1.550131926121372, "grad_norm": 0.4373971805377109, "learning_rate": 1.4644660940672628e-06, "loss": 0.0346, "step": 1175 }, { "epoch": 1.5514511873350925, "grad_norm": 0.3813066629424425, "learning_rate": 1.4563323650662586e-06, "loss": 0.0313, "step": 1176 }, { "epoch": 1.5527704485488125, "grad_norm": 0.4811993173573029, "learning_rate": 1.4482174345911448e-06, "loss": 0.0485, "step": 1177 }, { "epoch": 1.554089709762533, "grad_norm": 0.46517591321514745, "learning_rate": 1.440121345690182e-06, "loss": 0.0347, "step": 1178 }, { "epoch": 1.5554089709762533, "grad_norm": 0.4512297860445867, "learning_rate": 1.432044141311683e-06, "loss": 0.0477, "step": 1179 }, { "epoch": 1.5567282321899736, "grad_norm": 0.5362718876076132, "learning_rate": 1.4239858643037753e-06, "loss": 0.0342, "step": 1180 }, { "epoch": 1.558047493403694, "grad_norm": 0.6080795983560243, "learning_rate": 1.415946557414185e-06, "loss": 0.046, "step": 1181 }, { "epoch": 1.5593667546174141, "grad_norm": 0.5075180443267415, "learning_rate": 1.4079262632900048e-06, "loss": 0.0405, "step": 1182 }, { "epoch": 1.5606860158311346, "grad_norm": 0.6975250836456051, "learning_rate": 1.399925024477466e-06, "loss": 0.0371, "step": 1183 }, { "epoch": 1.562005277044855, "grad_norm": 0.48944834666994125, "learning_rate": 1.3919428834217163e-06, "loss": 0.0435, "step": 1184 }, { "epoch": 1.5633245382585752, "grad_norm": 0.35255734243255743, "learning_rate": 1.3839798824665952e-06, "loss": 0.0415, "step": 1185 }, { "epoch": 1.5646437994722955, "grad_norm": 0.404850276527209, "learning_rate": 1.3760360638544012e-06, "loss": 0.0361, "step": 1186 }, { "epoch": 1.5659630606860158, "grad_norm": 0.6597734162030678, "learning_rate": 1.3681114697256792e-06, "loss": 0.0454, "step": 1187 }, { "epoch": 1.5672823218997363, "grad_norm": 0.5766222487962566, "learning_rate": 1.3602061421189899e-06, "loss": 0.0416, "step": 1188 }, { "epoch": 1.5686015831134563, "grad_norm": 0.4595353875796206, "learning_rate": 1.3523201229706901e-06, "loss": 0.0327, "step": 1189 }, { "epoch": 1.5699208443271768, "grad_norm": 0.33343538700583003, "learning_rate": 1.3444534541147058e-06, "loss": 0.0217, "step": 1190 }, { "epoch": 1.571240105540897, "grad_norm": 0.9071606502873505, "learning_rate": 1.3366061772823175e-06, "loss": 0.0551, "step": 1191 }, { "epoch": 1.5725593667546174, "grad_norm": 0.3435762760150652, "learning_rate": 1.3287783341019278e-06, "loss": 0.0236, "step": 1192 }, { "epoch": 1.5738786279683379, "grad_norm": 0.6241238414396604, "learning_rate": 1.3209699660988528e-06, "loss": 0.0438, "step": 1193 }, { "epoch": 1.575197889182058, "grad_norm": 0.40166273377163964, "learning_rate": 1.3131811146950946e-06, "loss": 0.041, "step": 1194 }, { "epoch": 1.5765171503957784, "grad_norm": 0.46253300012043386, "learning_rate": 1.3054118212091231e-06, "loss": 0.0302, "step": 1195 }, { "epoch": 1.5778364116094987, "grad_norm": 0.40714809268629054, "learning_rate": 1.2976621268556571e-06, "loss": 0.0407, "step": 1196 }, { "epoch": 1.579155672823219, "grad_norm": 0.32452512025205105, "learning_rate": 1.2899320727454472e-06, "loss": 0.0312, "step": 1197 }, { "epoch": 1.5804749340369393, "grad_norm": 0.4057644473007606, "learning_rate": 1.2822216998850506e-06, "loss": 0.0351, "step": 1198 }, { "epoch": 1.5817941952506596, "grad_norm": 0.5294484976583397, "learning_rate": 1.274531049176625e-06, "loss": 0.0397, "step": 1199 }, { "epoch": 1.58311345646438, "grad_norm": 0.5249033845880472, "learning_rate": 1.2668601614177017e-06, "loss": 0.0511, "step": 1200 }, { "epoch": 1.5844327176781001, "grad_norm": 0.5286195227893682, "learning_rate": 1.2592090773009757e-06, "loss": 0.0362, "step": 1201 }, { "epoch": 1.5857519788918206, "grad_norm": 0.43587737534445187, "learning_rate": 1.2515778374140858e-06, "loss": 0.0345, "step": 1202 }, { "epoch": 1.587071240105541, "grad_norm": 0.6504691814010135, "learning_rate": 1.2439664822394027e-06, "loss": 0.0468, "step": 1203 }, { "epoch": 1.5883905013192612, "grad_norm": 1.6349057673428626, "learning_rate": 1.2363750521538064e-06, "loss": 0.027, "step": 1204 }, { "epoch": 1.5897097625329817, "grad_norm": 0.5144142135467555, "learning_rate": 1.2288035874284847e-06, "loss": 0.0522, "step": 1205 }, { "epoch": 1.5910290237467017, "grad_norm": 0.7165703997721268, "learning_rate": 1.2212521282287093e-06, "loss": 0.0432, "step": 1206 }, { "epoch": 1.5923482849604222, "grad_norm": 0.5764139280217195, "learning_rate": 1.2137207146136265e-06, "loss": 0.0441, "step": 1207 }, { "epoch": 1.5936675461741425, "grad_norm": 0.38788539694879753, "learning_rate": 1.2062093865360458e-06, "loss": 0.0317, "step": 1208 }, { "epoch": 1.5949868073878628, "grad_norm": 0.4313898311629535, "learning_rate": 1.1987181838422252e-06, "loss": 0.0395, "step": 1209 }, { "epoch": 1.596306068601583, "grad_norm": 0.4785561711686053, "learning_rate": 1.1912471462716596e-06, "loss": 0.0349, "step": 1210 }, { "epoch": 1.5976253298153034, "grad_norm": 0.33015077804663906, "learning_rate": 1.1837963134568748e-06, "loss": 0.0239, "step": 1211 }, { "epoch": 1.5989445910290239, "grad_norm": 0.39922911811811035, "learning_rate": 1.1763657249232107e-06, "loss": 0.0298, "step": 1212 }, { "epoch": 1.600263852242744, "grad_norm": 0.45738013269687233, "learning_rate": 1.1689554200886183e-06, "loss": 0.0447, "step": 1213 }, { "epoch": 1.6015831134564644, "grad_norm": 0.5046432642457959, "learning_rate": 1.1615654382634444e-06, "loss": 0.0394, "step": 1214 }, { "epoch": 1.6029023746701847, "grad_norm": 0.49722274611796874, "learning_rate": 1.1541958186502288e-06, "loss": 0.0397, "step": 1215 }, { "epoch": 1.604221635883905, "grad_norm": 0.4384313895762189, "learning_rate": 1.146846600343488e-06, "loss": 0.0354, "step": 1216 }, { "epoch": 1.6055408970976255, "grad_norm": 0.3312059624947744, "learning_rate": 1.1395178223295188e-06, "loss": 0.0278, "step": 1217 }, { "epoch": 1.6068601583113455, "grad_norm": 0.4382612520669255, "learning_rate": 1.132209523486184e-06, "loss": 0.0368, "step": 1218 }, { "epoch": 1.608179419525066, "grad_norm": 0.5343958276159149, "learning_rate": 1.1249217425827063e-06, "loss": 0.0474, "step": 1219 }, { "epoch": 1.6094986807387863, "grad_norm": 0.49691100959061263, "learning_rate": 1.1176545182794674e-06, "loss": 0.0483, "step": 1220 }, { "epoch": 1.6108179419525066, "grad_norm": 0.4163737449098859, "learning_rate": 1.1104078891277981e-06, "loss": 0.0463, "step": 1221 }, { "epoch": 1.6121372031662269, "grad_norm": 0.48518725073291535, "learning_rate": 1.1031818935697763e-06, "loss": 0.0322, "step": 1222 }, { "epoch": 1.6134564643799472, "grad_norm": 0.7204476449881705, "learning_rate": 1.0959765699380204e-06, "loss": 0.0421, "step": 1223 }, { "epoch": 1.6147757255936677, "grad_norm": 0.5063474602550613, "learning_rate": 1.0887919564554893e-06, "loss": 0.0272, "step": 1224 }, { "epoch": 1.6160949868073877, "grad_norm": 0.4076166371752053, "learning_rate": 1.08162809123528e-06, "loss": 0.0309, "step": 1225 }, { "epoch": 1.6174142480211082, "grad_norm": 0.3268585941272018, "learning_rate": 1.0744850122804218e-06, "loss": 0.0319, "step": 1226 }, { "epoch": 1.6187335092348285, "grad_norm": 0.5850297589948477, "learning_rate": 1.0673627574836753e-06, "loss": 0.0382, "step": 1227 }, { "epoch": 1.6200527704485488, "grad_norm": 0.3998474874889947, "learning_rate": 1.0602613646273374e-06, "loss": 0.0319, "step": 1228 }, { "epoch": 1.6213720316622693, "grad_norm": 0.5798879351459741, "learning_rate": 1.0531808713830288e-06, "loss": 0.0667, "step": 1229 }, { "epoch": 1.6226912928759893, "grad_norm": 0.3914344031230687, "learning_rate": 1.046121315311508e-06, "loss": 0.0362, "step": 1230 }, { "epoch": 1.6240105540897098, "grad_norm": 0.49431640263561566, "learning_rate": 1.0390827338624622e-06, "loss": 0.0483, "step": 1231 }, { "epoch": 1.6253298153034301, "grad_norm": 0.4524775925868054, "learning_rate": 1.0320651643743128e-06, "loss": 0.0398, "step": 1232 }, { "epoch": 1.6266490765171504, "grad_norm": 0.5335799480206387, "learning_rate": 1.0250686440740177e-06, "loss": 0.0353, "step": 1233 }, { "epoch": 1.6279683377308707, "grad_norm": 0.46174733513990734, "learning_rate": 1.0180932100768714e-06, "loss": 0.0446, "step": 1234 }, { "epoch": 1.629287598944591, "grad_norm": 0.6857356411894094, "learning_rate": 1.0111388993863069e-06, "loss": 0.0362, "step": 1235 }, { "epoch": 1.6306068601583115, "grad_norm": 0.4245217775278478, "learning_rate": 1.0042057488937067e-06, "loss": 0.0355, "step": 1236 }, { "epoch": 1.6319261213720315, "grad_norm": 0.4322128765034416, "learning_rate": 9.972937953781985e-07, "loss": 0.0269, "step": 1237 }, { "epoch": 1.633245382585752, "grad_norm": 0.3973851330120696, "learning_rate": 9.904030755064659e-07, "loss": 0.0351, "step": 1238 }, { "epoch": 1.6345646437994723, "grad_norm": 0.46357405653390477, "learning_rate": 9.835336258325507e-07, "loss": 0.0329, "step": 1239 }, { "epoch": 1.6358839050131926, "grad_norm": 0.3994338842065798, "learning_rate": 9.76685482797662e-07, "loss": 0.0399, "step": 1240 }, { "epoch": 1.637203166226913, "grad_norm": 0.6076537453224168, "learning_rate": 9.69858682729976e-07, "loss": 0.0425, "step": 1241 }, { "epoch": 1.6385224274406331, "grad_norm": 0.37706954173942336, "learning_rate": 9.630532618444532e-07, "loss": 0.0284, "step": 1242 }, { "epoch": 1.6398416886543536, "grad_norm": 0.43604541408035125, "learning_rate": 9.562692562426408e-07, "loss": 0.0362, "step": 1243 }, { "epoch": 1.641160949868074, "grad_norm": 1.8554215408377712, "learning_rate": 9.495067019124793e-07, "loss": 0.027, "step": 1244 }, { "epoch": 1.6424802110817942, "grad_norm": 0.493805099763965, "learning_rate": 9.427656347281155e-07, "loss": 0.0356, "step": 1245 }, { "epoch": 1.6437994722955145, "grad_norm": 0.52867853062248, "learning_rate": 9.360460904497132e-07, "loss": 0.0452, "step": 1246 }, { "epoch": 1.6451187335092348, "grad_norm": 0.45808427819318753, "learning_rate": 9.29348104723255e-07, "loss": 0.0499, "step": 1247 }, { "epoch": 1.6464379947229553, "grad_norm": 0.4593564332792896, "learning_rate": 9.226717130803636e-07, "loss": 0.0427, "step": 1248 }, { "epoch": 1.6477572559366753, "grad_norm": 0.39012445742493634, "learning_rate": 9.160169509381083e-07, "loss": 0.041, "step": 1249 }, { "epoch": 1.6490765171503958, "grad_norm": 0.3868641320308475, "learning_rate": 9.093838535988181e-07, "loss": 0.035, "step": 1250 }, { "epoch": 1.650395778364116, "grad_norm": 0.33099630937478597, "learning_rate": 9.027724562498929e-07, "loss": 0.0285, "step": 1251 }, { "epoch": 1.6517150395778364, "grad_norm": 0.7114657305985929, "learning_rate": 8.961827939636198e-07, "loss": 0.0455, "step": 1252 }, { "epoch": 1.6530343007915569, "grad_norm": 0.34348560286066804, "learning_rate": 8.896149016969812e-07, "loss": 0.0323, "step": 1253 }, { "epoch": 1.654353562005277, "grad_norm": 0.4286068142548502, "learning_rate": 8.830688142914783e-07, "loss": 0.0278, "step": 1254 }, { "epoch": 1.6556728232189974, "grad_norm": 0.30546300488945166, "learning_rate": 8.765445664729383e-07, "loss": 0.0272, "step": 1255 }, { "epoch": 1.6569920844327177, "grad_norm": 0.31479895341967534, "learning_rate": 8.700421928513353e-07, "loss": 0.0219, "step": 1256 }, { "epoch": 1.658311345646438, "grad_norm": 0.4793459625577021, "learning_rate": 8.635617279206027e-07, "loss": 0.0566, "step": 1257 }, { "epoch": 1.6596306068601583, "grad_norm": 0.44954609692767467, "learning_rate": 8.571032060584555e-07, "loss": 0.0362, "step": 1258 }, { "epoch": 1.6609498680738786, "grad_norm": 0.4625040531332171, "learning_rate": 8.506666615261988e-07, "loss": 0.0327, "step": 1259 }, { "epoch": 1.662269129287599, "grad_norm": 0.44320479928975837, "learning_rate": 8.442521284685573e-07, "loss": 0.0383, "step": 1260 }, { "epoch": 1.6635883905013191, "grad_norm": 0.44263485851252193, "learning_rate": 8.378596409134854e-07, "loss": 0.0449, "step": 1261 }, { "epoch": 1.6649076517150396, "grad_norm": 0.588159347845755, "learning_rate": 8.314892327719937e-07, "loss": 0.044, "step": 1262 }, { "epoch": 1.66622691292876, "grad_norm": 0.35939392465672665, "learning_rate": 8.251409378379638e-07, "loss": 0.0264, "step": 1263 }, { "epoch": 1.6675461741424802, "grad_norm": 0.5671191712624316, "learning_rate": 8.188147897879667e-07, "loss": 0.0517, "step": 1264 }, { "epoch": 1.6688654353562007, "grad_norm": 0.358886847852907, "learning_rate": 8.125108221810935e-07, "loss": 0.0278, "step": 1265 }, { "epoch": 1.6701846965699207, "grad_norm": 0.42822626836193234, "learning_rate": 8.062290684587698e-07, "loss": 0.0416, "step": 1266 }, { "epoch": 1.6715039577836412, "grad_norm": 0.471691138664079, "learning_rate": 7.999695619445807e-07, "loss": 0.0384, "step": 1267 }, { "epoch": 1.6728232189973615, "grad_norm": 0.4307799868335856, "learning_rate": 7.937323358440935e-07, "loss": 0.0449, "step": 1268 }, { "epoch": 1.6741424802110818, "grad_norm": 0.4152109039926412, "learning_rate": 7.875174232446842e-07, "loss": 0.0265, "step": 1269 }, { "epoch": 1.675461741424802, "grad_norm": 0.43720557942166705, "learning_rate": 7.813248571153542e-07, "loss": 0.0321, "step": 1270 }, { "epoch": 1.6767810026385224, "grad_norm": 0.41610472560617734, "learning_rate": 7.75154670306566e-07, "loss": 0.0306, "step": 1271 }, { "epoch": 1.6781002638522429, "grad_norm": 0.4877701148340018, "learning_rate": 7.690068955500623e-07, "loss": 0.0343, "step": 1272 }, { "epoch": 1.679419525065963, "grad_norm": 0.604477161363688, "learning_rate": 7.628815654586935e-07, "loss": 0.0384, "step": 1273 }, { "epoch": 1.6807387862796834, "grad_norm": 0.9327966953078201, "learning_rate": 7.567787125262449e-07, "loss": 0.044, "step": 1274 }, { "epoch": 1.6820580474934037, "grad_norm": 0.49776099980226146, "learning_rate": 7.506983691272663e-07, "loss": 0.0368, "step": 1275 }, { "epoch": 1.683377308707124, "grad_norm": 0.488388274257174, "learning_rate": 7.446405675168938e-07, "loss": 0.0596, "step": 1276 }, { "epoch": 1.6846965699208445, "grad_norm": 0.35375559159393377, "learning_rate": 7.386053398306886e-07, "loss": 0.0307, "step": 1277 }, { "epoch": 1.6860158311345645, "grad_norm": 0.35309069441544605, "learning_rate": 7.325927180844589e-07, "loss": 0.0293, "step": 1278 }, { "epoch": 1.687335092348285, "grad_norm": 0.4344067842283927, "learning_rate": 7.266027341740917e-07, "loss": 0.0392, "step": 1279 }, { "epoch": 1.6886543535620053, "grad_norm": 0.506956664574459, "learning_rate": 7.206354198753862e-07, "loss": 0.0368, "step": 1280 }, { "epoch": 1.6899736147757256, "grad_norm": 1.4901594079060698, "learning_rate": 7.146908068438818e-07, "loss": 0.0559, "step": 1281 }, { "epoch": 1.6912928759894459, "grad_norm": 0.5192684059622095, "learning_rate": 7.087689266146935e-07, "loss": 0.0339, "step": 1282 }, { "epoch": 1.6926121372031662, "grad_norm": 0.631957801720068, "learning_rate": 7.028698106023396e-07, "loss": 0.0395, "step": 1283 }, { "epoch": 1.6939313984168867, "grad_norm": 0.5294491638321744, "learning_rate": 6.969934901005809e-07, "loss": 0.0288, "step": 1284 }, { "epoch": 1.6952506596306067, "grad_norm": 0.38669922322311945, "learning_rate": 6.911399962822518e-07, "loss": 0.0358, "step": 1285 }, { "epoch": 1.6965699208443272, "grad_norm": 0.434423689466899, "learning_rate": 6.853093601990946e-07, "loss": 0.0365, "step": 1286 }, { "epoch": 1.6978891820580475, "grad_norm": 0.3442245847955821, "learning_rate": 6.795016127815957e-07, "loss": 0.0315, "step": 1287 }, { "epoch": 1.6992084432717678, "grad_norm": 0.42198219872418546, "learning_rate": 6.737167848388227e-07, "loss": 0.0388, "step": 1288 }, { "epoch": 1.7005277044854883, "grad_norm": 0.5703992735819093, "learning_rate": 6.679549070582547e-07, "loss": 0.0506, "step": 1289 }, { "epoch": 1.7018469656992083, "grad_norm": 0.7031877797192093, "learning_rate": 6.622160100056296e-07, "loss": 0.0319, "step": 1290 }, { "epoch": 1.7031662269129288, "grad_norm": 0.48385734835300775, "learning_rate": 6.565001241247743e-07, "loss": 0.0418, "step": 1291 }, { "epoch": 1.7044854881266491, "grad_norm": 0.44649883806780155, "learning_rate": 6.508072797374454e-07, "loss": 0.0441, "step": 1292 }, { "epoch": 1.7058047493403694, "grad_norm": 0.3371838135534027, "learning_rate": 6.451375070431687e-07, "loss": 0.0293, "step": 1293 }, { "epoch": 1.7071240105540897, "grad_norm": 0.524737836501876, "learning_rate": 6.394908361190804e-07, "loss": 0.0301, "step": 1294 }, { "epoch": 1.70844327176781, "grad_norm": 0.42754210803421466, "learning_rate": 6.338672969197623e-07, "loss": 0.0296, "step": 1295 }, { "epoch": 1.7097625329815305, "grad_norm": 0.402266180002818, "learning_rate": 6.282669192770896e-07, "loss": 0.0372, "step": 1296 }, { "epoch": 1.7110817941952505, "grad_norm": 0.49238128867338316, "learning_rate": 6.226897329000687e-07, "loss": 0.0422, "step": 1297 }, { "epoch": 1.712401055408971, "grad_norm": 0.4021292733622415, "learning_rate": 6.171357673746798e-07, "loss": 0.0312, "step": 1298 }, { "epoch": 1.7137203166226913, "grad_norm": 0.4248158425984774, "learning_rate": 6.116050521637218e-07, "loss": 0.04, "step": 1299 }, { "epoch": 1.7150395778364116, "grad_norm": 0.7473101327400671, "learning_rate": 6.060976166066546e-07, "loss": 0.0468, "step": 1300 }, { "epoch": 1.716358839050132, "grad_norm": 0.35824922702307355, "learning_rate": 6.006134899194421e-07, "loss": 0.0312, "step": 1301 }, { "epoch": 1.7176781002638521, "grad_norm": 0.5925344230542687, "learning_rate": 5.951527011944008e-07, "loss": 0.0427, "step": 1302 }, { "epoch": 1.7189973614775726, "grad_norm": 0.405481695676839, "learning_rate": 5.897152794000421e-07, "loss": 0.0339, "step": 1303 }, { "epoch": 1.720316622691293, "grad_norm": 0.4482716029928217, "learning_rate": 5.843012533809211e-07, "loss": 0.038, "step": 1304 }, { "epoch": 1.7216358839050132, "grad_norm": 0.3595441867962531, "learning_rate": 5.789106518574816e-07, "loss": 0.0265, "step": 1305 }, { "epoch": 1.7229551451187335, "grad_norm": 0.599406764927074, "learning_rate": 5.735435034259057e-07, "loss": 0.0493, "step": 1306 }, { "epoch": 1.7242744063324538, "grad_norm": 0.40663759792765514, "learning_rate": 5.681998365579594e-07, "loss": 0.0241, "step": 1307 }, { "epoch": 1.7255936675461743, "grad_norm": 0.35374428779106726, "learning_rate": 5.628796796008435e-07, "loss": 0.0343, "step": 1308 }, { "epoch": 1.7269129287598943, "grad_norm": 0.36859477304072386, "learning_rate": 5.575830607770443e-07, "loss": 0.0354, "step": 1309 }, { "epoch": 1.7282321899736148, "grad_norm": 0.4613643600461862, "learning_rate": 5.52310008184182e-07, "loss": 0.0304, "step": 1310 }, { "epoch": 1.729551451187335, "grad_norm": 0.592134295913327, "learning_rate": 5.470605497948611e-07, "loss": 0.0336, "step": 1311 }, { "epoch": 1.7308707124010554, "grad_norm": 0.4694572788775886, "learning_rate": 5.418347134565249e-07, "loss": 0.0379, "step": 1312 }, { "epoch": 1.732189973614776, "grad_norm": 0.418013815042632, "learning_rate": 5.36632526891303e-07, "loss": 0.032, "step": 1313 }, { "epoch": 1.733509234828496, "grad_norm": 0.33921206405287646, "learning_rate": 5.314540176958699e-07, "loss": 0.0354, "step": 1314 }, { "epoch": 1.7348284960422165, "grad_norm": 0.45017885763771565, "learning_rate": 5.262992133412947e-07, "loss": 0.0443, "step": 1315 }, { "epoch": 1.7361477572559367, "grad_norm": 0.326162781727029, "learning_rate": 5.211681411728969e-07, "loss": 0.0296, "step": 1316 }, { "epoch": 1.737467018469657, "grad_norm": 0.4250567661186686, "learning_rate": 5.16060828410101e-07, "loss": 0.0445, "step": 1317 }, { "epoch": 1.7387862796833773, "grad_norm": 0.4018000498866085, "learning_rate": 5.109773021462921e-07, "loss": 0.0458, "step": 1318 }, { "epoch": 1.7401055408970976, "grad_norm": 0.7027370048578987, "learning_rate": 5.059175893486712e-07, "loss": 0.0411, "step": 1319 }, { "epoch": 1.741424802110818, "grad_norm": 0.4162139981216462, "learning_rate": 5.008817168581137e-07, "loss": 0.0402, "step": 1320 }, { "epoch": 1.7427440633245381, "grad_norm": 0.32860074949075946, "learning_rate": 4.958697113890271e-07, "loss": 0.0358, "step": 1321 }, { "epoch": 1.7440633245382586, "grad_norm": 0.42036600275964464, "learning_rate": 4.908815995292082e-07, "loss": 0.0344, "step": 1322 }, { "epoch": 1.745382585751979, "grad_norm": 0.52059293516186, "learning_rate": 4.859174077397022e-07, "loss": 0.0562, "step": 1323 }, { "epoch": 1.7467018469656992, "grad_norm": 0.5745641702799402, "learning_rate": 4.809771623546627e-07, "loss": 0.0492, "step": 1324 }, { "epoch": 1.7480211081794197, "grad_norm": 0.35422514466473387, "learning_rate": 4.76060889581213e-07, "loss": 0.0339, "step": 1325 }, { "epoch": 1.7493403693931397, "grad_norm": 0.3547143014757154, "learning_rate": 4.711686154993028e-07, "loss": 0.0407, "step": 1326 }, { "epoch": 1.7506596306068603, "grad_norm": 0.4717976706982805, "learning_rate": 4.6630036606157616e-07, "loss": 0.0282, "step": 1327 }, { "epoch": 1.7519788918205803, "grad_norm": 0.40394038783642244, "learning_rate": 4.614561670932288e-07, "loss": 0.0333, "step": 1328 }, { "epoch": 1.7532981530343008, "grad_norm": 0.5160848481103655, "learning_rate": 4.5663604429187547e-07, "loss": 0.0496, "step": 1329 }, { "epoch": 1.754617414248021, "grad_norm": 0.4232435269539701, "learning_rate": 4.5184002322740784e-07, "loss": 0.0339, "step": 1330 }, { "epoch": 1.7559366754617414, "grad_norm": 0.40699788742931914, "learning_rate": 4.470681293418655e-07, "loss": 0.0382, "step": 1331 }, { "epoch": 1.7572559366754619, "grad_norm": 0.4906969195565496, "learning_rate": 4.423203879492943e-07, "loss": 0.0389, "step": 1332 }, { "epoch": 1.758575197889182, "grad_norm": 0.4131458996348403, "learning_rate": 4.375968242356171e-07, "loss": 0.0396, "step": 1333 }, { "epoch": 1.7598944591029024, "grad_norm": 0.43933442998989664, "learning_rate": 4.3289746325849924e-07, "loss": 0.0419, "step": 1334 }, { "epoch": 1.7612137203166227, "grad_norm": 0.8251931213477443, "learning_rate": 4.282223299472138e-07, "loss": 0.0504, "step": 1335 }, { "epoch": 1.762532981530343, "grad_norm": 1.2581963511177185, "learning_rate": 4.2357144910251003e-07, "loss": 0.0426, "step": 1336 }, { "epoch": 1.7638522427440633, "grad_norm": 0.4329016484451927, "learning_rate": 4.189448453964845e-07, "loss": 0.0346, "step": 1337 }, { "epoch": 1.7651715039577835, "grad_norm": 0.3701763750178543, "learning_rate": 4.1434254337244404e-07, "loss": 0.0354, "step": 1338 }, { "epoch": 1.766490765171504, "grad_norm": 0.41529219024756975, "learning_rate": 4.0976456744478254e-07, "loss": 0.0299, "step": 1339 }, { "epoch": 1.767810026385224, "grad_norm": 0.4742675186457856, "learning_rate": 4.05210941898847e-07, "loss": 0.0513, "step": 1340 }, { "epoch": 1.7691292875989446, "grad_norm": 0.4247152497761631, "learning_rate": 4.006816908908101e-07, "loss": 0.0331, "step": 1341 }, { "epoch": 1.770448548812665, "grad_norm": 0.42305349223370864, "learning_rate": 3.9617683844754284e-07, "loss": 0.0361, "step": 1342 }, { "epoch": 1.7717678100263852, "grad_norm": 0.6014630922271789, "learning_rate": 3.916964084664848e-07, "loss": 0.0336, "step": 1343 }, { "epoch": 1.7730870712401057, "grad_norm": 0.3405494826572094, "learning_rate": 3.8724042471551925e-07, "loss": 0.0311, "step": 1344 }, { "epoch": 1.7744063324538257, "grad_norm": 0.37473108350403517, "learning_rate": 3.8280891083284646e-07, "loss": 0.0367, "step": 1345 }, { "epoch": 1.7757255936675462, "grad_norm": 0.611239757456385, "learning_rate": 3.784018903268588e-07, "loss": 0.0411, "step": 1346 }, { "epoch": 1.7770448548812665, "grad_norm": 0.5062233019350862, "learning_rate": 3.7401938657601555e-07, "loss": 0.0551, "step": 1347 }, { "epoch": 1.7783641160949868, "grad_norm": 0.3335548806824567, "learning_rate": 3.6966142282871873e-07, "loss": 0.0286, "step": 1348 }, { "epoch": 1.779683377308707, "grad_norm": 0.47839614180461826, "learning_rate": 3.653280222031913e-07, "loss": 0.0486, "step": 1349 }, { "epoch": 1.7810026385224274, "grad_norm": 0.4047331132243693, "learning_rate": 3.610192076873498e-07, "loss": 0.034, "step": 1350 }, { "epoch": 1.7823218997361479, "grad_norm": 0.6147779690764298, "learning_rate": 3.567350021386895e-07, "loss": 0.0557, "step": 1351 }, { "epoch": 1.783641160949868, "grad_norm": 0.4414521830326967, "learning_rate": 3.524754282841575e-07, "loss": 0.0301, "step": 1352 }, { "epoch": 1.7849604221635884, "grad_norm": 0.5631775902206335, "learning_rate": 3.482405087200352e-07, "loss": 0.0417, "step": 1353 }, { "epoch": 1.7862796833773087, "grad_norm": 0.35558876215797697, "learning_rate": 3.440302659118172e-07, "loss": 0.0235, "step": 1354 }, { "epoch": 1.787598944591029, "grad_norm": 0.4582685902858596, "learning_rate": 3.39844722194092e-07, "loss": 0.0416, "step": 1355 }, { "epoch": 1.7889182058047495, "grad_norm": 0.4048407621656869, "learning_rate": 3.356838997704226e-07, "loss": 0.0308, "step": 1356 }, { "epoch": 1.7902374670184695, "grad_norm": 0.4955145015945528, "learning_rate": 3.315478207132322e-07, "loss": 0.0519, "step": 1357 }, { "epoch": 1.79155672823219, "grad_norm": 0.3983405329563796, "learning_rate": 3.274365069636831e-07, "loss": 0.0348, "step": 1358 }, { "epoch": 1.7928759894459103, "grad_norm": 0.4454190763224718, "learning_rate": 3.233499803315637e-07, "loss": 0.0463, "step": 1359 }, { "epoch": 1.7941952506596306, "grad_norm": 0.46036440046593147, "learning_rate": 3.1928826249516984e-07, "loss": 0.0402, "step": 1360 }, { "epoch": 1.7955145118733509, "grad_norm": 0.4343802230808252, "learning_rate": 3.1525137500119207e-07, "loss": 0.0308, "step": 1361 }, { "epoch": 1.7968337730870712, "grad_norm": 0.39526583994521997, "learning_rate": 3.112393392645985e-07, "loss": 0.0258, "step": 1362 }, { "epoch": 1.7981530343007917, "grad_norm": 0.3522368913909313, "learning_rate": 3.072521765685249e-07, "loss": 0.0226, "step": 1363 }, { "epoch": 1.7994722955145117, "grad_norm": 0.5018803279485713, "learning_rate": 3.0328990806415935e-07, "loss": 0.0412, "step": 1364 }, { "epoch": 1.8007915567282322, "grad_norm": 0.6026623993978871, "learning_rate": 2.993525547706316e-07, "loss": 0.0416, "step": 1365 }, { "epoch": 1.8021108179419525, "grad_norm": 0.40115190329359646, "learning_rate": 2.9544013757489944e-07, "loss": 0.0308, "step": 1366 }, { "epoch": 1.8034300791556728, "grad_norm": 0.4042690712138384, "learning_rate": 2.915526772316402e-07, "loss": 0.0322, "step": 1367 }, { "epoch": 1.8047493403693933, "grad_norm": 0.5332516621967311, "learning_rate": 2.876901943631372e-07, "loss": 0.043, "step": 1368 }, { "epoch": 1.8060686015831133, "grad_norm": 0.5332780197622528, "learning_rate": 2.8385270945917584e-07, "loss": 0.039, "step": 1369 }, { "epoch": 1.8073878627968338, "grad_norm": 0.8182212019252886, "learning_rate": 2.8004024287692944e-07, "loss": 0.0386, "step": 1370 }, { "epoch": 1.8087071240105541, "grad_norm": 0.34092005245045315, "learning_rate": 2.76252814840855e-07, "loss": 0.0236, "step": 1371 }, { "epoch": 1.8100263852242744, "grad_norm": 0.5312862891963817, "learning_rate": 2.724904454425836e-07, "loss": 0.0394, "step": 1372 }, { "epoch": 1.8113456464379947, "grad_norm": 0.4783108589864548, "learning_rate": 2.6875315464081566e-07, "loss": 0.0425, "step": 1373 }, { "epoch": 1.812664907651715, "grad_norm": 0.49846941359788005, "learning_rate": 2.650409622612138e-07, "loss": 0.0358, "step": 1374 }, { "epoch": 1.8139841688654355, "grad_norm": 0.578478088885749, "learning_rate": 2.613538879962957e-07, "loss": 0.0616, "step": 1375 }, { "epoch": 1.8153034300791555, "grad_norm": 0.5716709856066361, "learning_rate": 2.5769195140533556e-07, "loss": 0.0412, "step": 1376 }, { "epoch": 1.816622691292876, "grad_norm": 0.365312638025696, "learning_rate": 2.540551719142548e-07, "loss": 0.0263, "step": 1377 }, { "epoch": 1.8179419525065963, "grad_norm": 0.4215565913003183, "learning_rate": 2.5044356881552045e-07, "loss": 0.0394, "step": 1378 }, { "epoch": 1.8192612137203166, "grad_norm": 0.4281630403371726, "learning_rate": 2.4685716126804485e-07, "loss": 0.0344, "step": 1379 }, { "epoch": 1.820580474934037, "grad_norm": 0.45713069089302194, "learning_rate": 2.4329596829708145e-07, "loss": 0.0387, "step": 1380 }, { "epoch": 1.8218997361477571, "grad_norm": 0.571386570983936, "learning_rate": 2.397600087941243e-07, "loss": 0.0486, "step": 1381 }, { "epoch": 1.8232189973614776, "grad_norm": 0.475957344717684, "learning_rate": 2.362493015168088e-07, "loss": 0.0537, "step": 1382 }, { "epoch": 1.824538258575198, "grad_norm": 0.41922931922262313, "learning_rate": 2.327638650888131e-07, "loss": 0.0289, "step": 1383 }, { "epoch": 1.8258575197889182, "grad_norm": 0.41085453215615697, "learning_rate": 2.2930371799975593e-07, "loss": 0.031, "step": 1384 }, { "epoch": 1.8271767810026385, "grad_norm": 0.5793799621353137, "learning_rate": 2.258688786051022e-07, "loss": 0.03, "step": 1385 }, { "epoch": 1.8284960422163588, "grad_norm": 0.4228166482758251, "learning_rate": 2.2245936512606314e-07, "loss": 0.0299, "step": 1386 }, { "epoch": 1.8298153034300793, "grad_norm": 0.47912620143754475, "learning_rate": 2.1907519564950075e-07, "loss": 0.0448, "step": 1387 }, { "epoch": 1.8311345646437993, "grad_norm": 0.46622053446154005, "learning_rate": 2.1571638812783125e-07, "loss": 0.0451, "step": 1388 }, { "epoch": 1.8324538258575198, "grad_norm": 0.478821469184056, "learning_rate": 2.123829603789307e-07, "loss": 0.0359, "step": 1389 }, { "epoch": 1.83377308707124, "grad_norm": 0.5485233771158393, "learning_rate": 2.0907493008604007e-07, "loss": 0.0458, "step": 1390 }, { "epoch": 1.8350923482849604, "grad_norm": 0.641372675811971, "learning_rate": 2.0579231479767093e-07, "loss": 0.0366, "step": 1391 }, { "epoch": 1.8364116094986809, "grad_norm": 0.6700473910897512, "learning_rate": 2.0253513192751374e-07, "loss": 0.0583, "step": 1392 }, { "epoch": 1.837730870712401, "grad_norm": 0.8939302910448729, "learning_rate": 1.993033987543419e-07, "loss": 0.0455, "step": 1393 }, { "epoch": 1.8390501319261214, "grad_norm": 0.3237337238677659, "learning_rate": 1.960971324219263e-07, "loss": 0.0249, "step": 1394 }, { "epoch": 1.8403693931398417, "grad_norm": 0.5188592511707015, "learning_rate": 1.9291634993893803e-07, "loss": 0.0405, "step": 1395 }, { "epoch": 1.841688654353562, "grad_norm": 0.711598934804882, "learning_rate": 1.8976106817886197e-07, "loss": 0.0393, "step": 1396 }, { "epoch": 1.8430079155672823, "grad_norm": 0.5158747451274224, "learning_rate": 1.8663130387990612e-07, "loss": 0.0495, "step": 1397 }, { "epoch": 1.8443271767810026, "grad_norm": 0.5051814837715082, "learning_rate": 1.8352707364491352e-07, "loss": 0.0408, "step": 1398 }, { "epoch": 1.845646437994723, "grad_norm": 0.364833204684728, "learning_rate": 1.804483939412721e-07, "loss": 0.0305, "step": 1399 }, { "epoch": 1.8469656992084431, "grad_norm": 0.4926948614968836, "learning_rate": 1.7739528110083003e-07, "loss": 0.0462, "step": 1400 }, { "epoch": 1.8482849604221636, "grad_norm": 0.4060254995491697, "learning_rate": 1.7436775131980665e-07, "loss": 0.0311, "step": 1401 }, { "epoch": 1.849604221635884, "grad_norm": 0.463198005789052, "learning_rate": 1.7136582065870876e-07, "loss": 0.0426, "step": 1402 }, { "epoch": 1.8509234828496042, "grad_norm": 0.36889044907936963, "learning_rate": 1.683895050422446e-07, "loss": 0.0263, "step": 1403 }, { "epoch": 1.8522427440633247, "grad_norm": 0.659379596842805, "learning_rate": 1.6543882025923884e-07, "loss": 0.0362, "step": 1404 }, { "epoch": 1.8535620052770447, "grad_norm": 0.48285367378926597, "learning_rate": 1.6251378196254775e-07, "loss": 0.0315, "step": 1405 }, { "epoch": 1.8548812664907652, "grad_norm": 0.31072920298807033, "learning_rate": 1.5961440566897913e-07, "loss": 0.0275, "step": 1406 }, { "epoch": 1.8562005277044855, "grad_norm": 0.38114528622335464, "learning_rate": 1.5674070675920805e-07, "loss": 0.0368, "step": 1407 }, { "epoch": 1.8575197889182058, "grad_norm": 0.3847157043647642, "learning_rate": 1.5389270047769578e-07, "loss": 0.0345, "step": 1408 }, { "epoch": 1.858839050131926, "grad_norm": 0.34769161488585415, "learning_rate": 1.5107040193260814e-07, "loss": 0.0275, "step": 1409 }, { "epoch": 1.8601583113456464, "grad_norm": 0.5361485360942428, "learning_rate": 1.482738260957378e-07, "loss": 0.0509, "step": 1410 }, { "epoch": 1.8614775725593669, "grad_norm": 0.3634369135567901, "learning_rate": 1.4550298780242e-07, "loss": 0.0355, "step": 1411 }, { "epoch": 1.862796833773087, "grad_norm": 0.46123718043387246, "learning_rate": 1.427579017514591e-07, "loss": 0.0441, "step": 1412 }, { "epoch": 1.8641160949868074, "grad_norm": 0.749043814853032, "learning_rate": 1.400385825050482e-07, "loss": 0.0374, "step": 1413 }, { "epoch": 1.8654353562005277, "grad_norm": 0.4128932447195577, "learning_rate": 1.3734504448869147e-07, "loss": 0.0316, "step": 1414 }, { "epoch": 1.866754617414248, "grad_norm": 0.3877671226172349, "learning_rate": 1.346773019911285e-07, "loss": 0.0302, "step": 1415 }, { "epoch": 1.8680738786279685, "grad_norm": 0.44772392186436794, "learning_rate": 1.3203536916425842e-07, "loss": 0.0364, "step": 1416 }, { "epoch": 1.8693931398416885, "grad_norm": 0.3061199809339557, "learning_rate": 1.2941926002306536e-07, "loss": 0.0252, "step": 1417 }, { "epoch": 1.870712401055409, "grad_norm": 0.42164260692116234, "learning_rate": 1.2682898844554093e-07, "loss": 0.0232, "step": 1418 }, { "epoch": 1.8720316622691293, "grad_norm": 0.34059144856622964, "learning_rate": 1.2426456817261513e-07, "loss": 0.0234, "step": 1419 }, { "epoch": 1.8733509234828496, "grad_norm": 0.3976682954800572, "learning_rate": 1.217260128080816e-07, "loss": 0.042, "step": 1420 }, { "epoch": 1.8746701846965699, "grad_norm": 0.46610358415516434, "learning_rate": 1.192133358185238e-07, "loss": 0.0435, "step": 1421 }, { "epoch": 1.8759894459102902, "grad_norm": 0.6845908917213743, "learning_rate": 1.1672655053324655e-07, "loss": 0.0398, "step": 1422 }, { "epoch": 1.8773087071240107, "grad_norm": 0.4319665923504084, "learning_rate": 1.1426567014420297e-07, "loss": 0.0218, "step": 1423 }, { "epoch": 1.8786279683377307, "grad_norm": 0.33201568533751724, "learning_rate": 1.1183070770592442e-07, "loss": 0.0287, "step": 1424 }, { "epoch": 1.8799472295514512, "grad_norm": 0.41576390147862746, "learning_rate": 1.094216761354544e-07, "loss": 0.0508, "step": 1425 }, { "epoch": 1.8812664907651715, "grad_norm": 0.4440946282610569, "learning_rate": 1.0703858821227541e-07, "loss": 0.0413, "step": 1426 }, { "epoch": 1.8825857519788918, "grad_norm": 0.4666316377271166, "learning_rate": 1.0468145657824558e-07, "loss": 0.0291, "step": 1427 }, { "epoch": 1.8839050131926123, "grad_norm": 0.5848051758884389, "learning_rate": 1.0235029373752758e-07, "loss": 0.0364, "step": 1428 }, { "epoch": 1.8852242744063323, "grad_norm": 0.38398899757542004, "learning_rate": 1.0004511205652656e-07, "loss": 0.0334, "step": 1429 }, { "epoch": 1.8865435356200528, "grad_norm": 0.36496273056510814, "learning_rate": 9.776592376381955e-08, "loss": 0.0336, "step": 1430 }, { "epoch": 1.8878627968337731, "grad_norm": 0.3788551185262741, "learning_rate": 9.551274095009444e-08, "loss": 0.0308, "step": 1431 }, { "epoch": 1.8891820580474934, "grad_norm": 0.4022971579863129, "learning_rate": 9.32855755680867e-08, "loss": 0.036, "step": 1432 }, { "epoch": 1.8905013192612137, "grad_norm": 0.393215166622088, "learning_rate": 9.108443943251055e-08, "loss": 0.041, "step": 1433 }, { "epoch": 1.891820580474934, "grad_norm": 0.5447231643725466, "learning_rate": 8.89093442200023e-08, "loss": 0.0428, "step": 1434 }, { "epoch": 1.8931398416886545, "grad_norm": 0.3790315267419849, "learning_rate": 8.676030146905434e-08, "loss": 0.0288, "step": 1435 }, { "epoch": 1.8944591029023745, "grad_norm": 0.37530226335630346, "learning_rate": 8.463732257995571e-08, "loss": 0.0277, "step": 1436 }, { "epoch": 1.895778364116095, "grad_norm": 0.37514039760418133, "learning_rate": 8.254041881473163e-08, "loss": 0.031, "step": 1437 }, { "epoch": 1.8970976253298153, "grad_norm": 0.35567700062264085, "learning_rate": 8.046960129708348e-08, "loss": 0.0258, "step": 1438 }, { "epoch": 1.8984168865435356, "grad_norm": 0.31803214794946344, "learning_rate": 7.842488101232893e-08, "loss": 0.0221, "step": 1439 }, { "epoch": 1.899736147757256, "grad_norm": 0.42855694638621145, "learning_rate": 7.640626880734581e-08, "loss": 0.0287, "step": 1440 }, { "epoch": 1.9010554089709761, "grad_norm": 0.44050404423122047, "learning_rate": 7.441377539051165e-08, "loss": 0.0231, "step": 1441 }, { "epoch": 1.9023746701846966, "grad_norm": 0.4006135382220008, "learning_rate": 7.244741133164979e-08, "loss": 0.0416, "step": 1442 }, { "epoch": 1.903693931398417, "grad_norm": 0.5035650996719838, "learning_rate": 7.050718706197168e-08, "loss": 0.0441, "step": 1443 }, { "epoch": 1.9050131926121372, "grad_norm": 0.38877200706439374, "learning_rate": 6.859311287402081e-08, "loss": 0.0383, "step": 1444 }, { "epoch": 1.9063324538258575, "grad_norm": 0.4710132583661197, "learning_rate": 6.670519892162053e-08, "loss": 0.0333, "step": 1445 }, { "epoch": 1.9076517150395778, "grad_norm": 0.39111866383182126, "learning_rate": 6.48434552198185e-08, "loss": 0.0344, "step": 1446 }, { "epoch": 1.9089709762532983, "grad_norm": 0.3369746744134288, "learning_rate": 6.30078916448329e-08, "loss": 0.0262, "step": 1447 }, { "epoch": 1.9102902374670183, "grad_norm": 0.3672285966035807, "learning_rate": 6.119851793400188e-08, "loss": 0.0357, "step": 1448 }, { "epoch": 1.9116094986807388, "grad_norm": 0.501282704651796, "learning_rate": 5.941534368573143e-08, "loss": 0.0513, "step": 1449 }, { "epoch": 1.912928759894459, "grad_norm": 0.515286690814764, "learning_rate": 5.7658378359443104e-08, "loss": 0.0261, "step": 1450 }, { "epoch": 1.9142480211081794, "grad_norm": 0.4150045067689141, "learning_rate": 5.59276312755247e-08, "loss": 0.0395, "step": 1451 }, { "epoch": 1.9155672823218999, "grad_norm": 0.3637171809597343, "learning_rate": 5.4223111615281935e-08, "loss": 0.0349, "step": 1452 }, { "epoch": 1.91688654353562, "grad_norm": 0.405991825012232, "learning_rate": 5.254482842088793e-08, "loss": 0.0302, "step": 1453 }, { "epoch": 1.9182058047493404, "grad_norm": 0.6027202091909221, "learning_rate": 5.089279059533658e-08, "loss": 0.0537, "step": 1454 }, { "epoch": 1.9195250659630607, "grad_norm": 0.3826886359954292, "learning_rate": 4.926700690239372e-08, "loss": 0.0325, "step": 1455 }, { "epoch": 1.920844327176781, "grad_norm": 0.5813404321771318, "learning_rate": 4.766748596655268e-08, "loss": 0.0368, "step": 1456 }, { "epoch": 1.9221635883905013, "grad_norm": 0.455817012611782, "learning_rate": 4.609423627298715e-08, "loss": 0.0237, "step": 1457 }, { "epoch": 1.9234828496042216, "grad_norm": 0.5487232651165294, "learning_rate": 4.4547266167507264e-08, "loss": 0.034, "step": 1458 }, { "epoch": 1.924802110817942, "grad_norm": 0.3923882257847083, "learning_rate": 4.302658385651359e-08, "loss": 0.0305, "step": 1459 }, { "epoch": 1.9261213720316621, "grad_norm": 0.361930421531541, "learning_rate": 4.1532197406954357e-08, "loss": 0.027, "step": 1460 }, { "epoch": 1.9274406332453826, "grad_norm": 0.3410974255034189, "learning_rate": 4.006411474628491e-08, "loss": 0.0257, "step": 1461 }, { "epoch": 1.928759894459103, "grad_norm": 0.3391437722549354, "learning_rate": 3.862234366242168e-08, "loss": 0.0257, "step": 1462 }, { "epoch": 1.9300791556728232, "grad_norm": 0.7996017631556153, "learning_rate": 3.720689180370329e-08, "loss": 0.0353, "step": 1463 }, { "epoch": 1.9313984168865437, "grad_norm": 0.3903649276794555, "learning_rate": 3.581776667885062e-08, "loss": 0.0317, "step": 1464 }, { "epoch": 1.9327176781002637, "grad_norm": 0.46536412104370617, "learning_rate": 3.445497565692457e-08, "loss": 0.0351, "step": 1465 }, { "epoch": 1.9340369393139842, "grad_norm": 0.4529803508470128, "learning_rate": 3.311852596728948e-08, "loss": 0.0339, "step": 1466 }, { "epoch": 1.9353562005277045, "grad_norm": 0.680215979452562, "learning_rate": 3.1808424699572014e-08, "loss": 0.0311, "step": 1467 }, { "epoch": 1.9366754617414248, "grad_norm": 0.409978370192348, "learning_rate": 3.052467880362675e-08, "loss": 0.0435, "step": 1468 }, { "epoch": 1.937994722955145, "grad_norm": 0.3182373919865013, "learning_rate": 2.9267295089497327e-08, "loss": 0.0344, "step": 1469 }, { "epoch": 1.9393139841688654, "grad_norm": 1.0684008641939509, "learning_rate": 2.8036280227379808e-08, "loss": 0.0382, "step": 1470 }, { "epoch": 1.9406332453825859, "grad_norm": 0.8240642381272889, "learning_rate": 2.6831640747589925e-08, "loss": 0.0253, "step": 1471 }, { "epoch": 1.941952506596306, "grad_norm": 0.39242837732937735, "learning_rate": 2.5653383040524228e-08, "loss": 0.0368, "step": 1472 }, { "epoch": 1.9432717678100264, "grad_norm": 0.433259089337428, "learning_rate": 2.4501513356631202e-08, "loss": 0.0296, "step": 1473 }, { "epoch": 1.9445910290237467, "grad_norm": 0.728610866735138, "learning_rate": 2.3376037806374097e-08, "loss": 0.0412, "step": 1474 }, { "epoch": 1.945910290237467, "grad_norm": 0.3805477215998381, "learning_rate": 2.2276962360200383e-08, "loss": 0.0297, "step": 1475 }, { "epoch": 1.9472295514511875, "grad_norm": 0.3452238003115353, "learning_rate": 2.1204292848509557e-08, "loss": 0.0303, "step": 1476 }, { "epoch": 1.9485488126649075, "grad_norm": 0.4659613637601965, "learning_rate": 2.0158034961622607e-08, "loss": 0.041, "step": 1477 }, { "epoch": 1.949868073878628, "grad_norm": 0.33093130150957134, "learning_rate": 1.9138194249750386e-08, "loss": 0.0346, "step": 1478 }, { "epoch": 1.9511873350923483, "grad_norm": 1.0830612352894715, "learning_rate": 1.81447761229675e-08, "loss": 0.0551, "step": 1479 }, { "epoch": 1.9525065963060686, "grad_norm": 0.4573678054826342, "learning_rate": 1.7177785851180127e-08, "loss": 0.0468, "step": 1480 }, { "epoch": 1.9538258575197889, "grad_norm": 0.36034300277622483, "learning_rate": 1.6237228564098818e-08, "loss": 0.0276, "step": 1481 }, { "epoch": 1.9551451187335092, "grad_norm": 0.4269174174424741, "learning_rate": 1.532310925121294e-08, "loss": 0.0334, "step": 1482 }, { "epoch": 1.9564643799472297, "grad_norm": 0.5534031509176094, "learning_rate": 1.4435432761762958e-08, "loss": 0.0454, "step": 1483 }, { "epoch": 1.9577836411609497, "grad_norm": 0.48672300692494025, "learning_rate": 1.3574203804713748e-08, "loss": 0.047, "step": 1484 }, { "epoch": 1.9591029023746702, "grad_norm": 0.3089514019798877, "learning_rate": 1.2739426948732426e-08, "loss": 0.0222, "step": 1485 }, { "epoch": 1.9604221635883905, "grad_norm": 0.45341517194027414, "learning_rate": 1.1931106622161127e-08, "loss": 0.0297, "step": 1486 }, { "epoch": 1.9617414248021108, "grad_norm": 0.5583483850941126, "learning_rate": 1.1149247112995365e-08, "loss": 0.0463, "step": 1487 }, { "epoch": 1.9630606860158313, "grad_norm": 0.3564316234881558, "learning_rate": 1.0393852568860718e-08, "loss": 0.0203, "step": 1488 }, { "epoch": 1.9643799472295513, "grad_norm": 0.44367974596971, "learning_rate": 9.664926996991176e-09, "loss": 0.0275, "step": 1489 }, { "epoch": 1.9656992084432718, "grad_norm": 0.39152274879775784, "learning_rate": 8.962474264206378e-09, "loss": 0.0315, "step": 1490 }, { "epoch": 1.9670184696569921, "grad_norm": 0.3557589209853032, "learning_rate": 8.286498096893304e-09, "loss": 0.0252, "step": 1491 }, { "epoch": 1.9683377308707124, "grad_norm": 0.41200361634618793, "learning_rate": 7.637002080985167e-09, "loss": 0.0302, "step": 1492 }, { "epoch": 1.9696569920844327, "grad_norm": 0.7439654263189737, "learning_rate": 7.013989661942555e-09, "loss": 0.042, "step": 1493 }, { "epoch": 1.970976253298153, "grad_norm": 0.410572155045626, "learning_rate": 6.417464144736208e-09, "loss": 0.0433, "step": 1494 }, { "epoch": 1.9722955145118735, "grad_norm": 0.43767939437112174, "learning_rate": 5.847428693826484e-09, "loss": 0.0322, "step": 1495 }, { "epoch": 1.9736147757255935, "grad_norm": 0.3220957811650404, "learning_rate": 5.303886333151154e-09, "loss": 0.0245, "step": 1496 }, { "epoch": 1.974934036939314, "grad_norm": 0.39372689260064875, "learning_rate": 4.786839946104849e-09, "loss": 0.0255, "step": 1497 }, { "epoch": 1.9762532981530343, "grad_norm": 0.45658707250148867, "learning_rate": 4.296292275526859e-09, "loss": 0.0387, "step": 1498 }, { "epoch": 1.9775725593667546, "grad_norm": 0.4758183336377512, "learning_rate": 3.8322459236850296e-09, "loss": 0.037, "step": 1499 }, { "epoch": 1.978891820580475, "grad_norm": 0.334075873932478, "learning_rate": 3.394703352263551e-09, "loss": 0.0277, "step": 1500 }, { "epoch": 1.9802110817941951, "grad_norm": 0.7256974162727998, "learning_rate": 2.9836668823468583e-09, "loss": 0.0993, "step": 1501 }, { "epoch": 1.9815303430079156, "grad_norm": 0.5370951547540892, "learning_rate": 2.5991386944107524e-09, "loss": 0.0401, "step": 1502 }, { "epoch": 1.982849604221636, "grad_norm": 0.5128469752391834, "learning_rate": 2.241120828308518e-09, "loss": 0.0429, "step": 1503 }, { "epoch": 1.9841688654353562, "grad_norm": 0.5835488080744212, "learning_rate": 1.9096151832609378e-09, "loss": 0.0305, "step": 1504 }, { "epoch": 1.9854881266490765, "grad_norm": 0.7672436878231969, "learning_rate": 1.6046235178474034e-09, "loss": 0.0621, "step": 1505 }, { "epoch": 1.9868073878627968, "grad_norm": 0.42712213180129055, "learning_rate": 1.326147449993709e-09, "loss": 0.0277, "step": 1506 }, { "epoch": 1.9881266490765173, "grad_norm": 0.443092608668473, "learning_rate": 1.0741884569659412e-09, "loss": 0.0455, "step": 1507 }, { "epoch": 1.9894459102902373, "grad_norm": 0.5669527089886233, "learning_rate": 8.487478753615997e-10, "loss": 0.0511, "step": 1508 }, { "epoch": 1.9907651715039578, "grad_norm": 0.49600894039730775, "learning_rate": 6.498269011029346e-10, "loss": 0.0536, "step": 1509 }, { "epoch": 1.992084432717678, "grad_norm": 0.4435904800966189, "learning_rate": 4.774265894302854e-10, "loss": 0.0298, "step": 1510 }, { "epoch": 1.9934036939313984, "grad_norm": 0.3237649659830547, "learning_rate": 3.3154785489653006e-10, "loss": 0.0244, "step": 1511 }, { "epoch": 1.9947229551451189, "grad_norm": 0.7407042869403138, "learning_rate": 2.1219147136264383e-10, "loss": 0.0434, "step": 1512 }, { "epoch": 1.996042216358839, "grad_norm": 0.4675300736701374, "learning_rate": 1.1935807199270343e-10, "loss": 0.0324, "step": 1513 }, { "epoch": 1.9973614775725594, "grad_norm": 0.29583225995771156, "learning_rate": 5.3048149251111456e-11, "loss": 0.0207, "step": 1514 }, { "epoch": 1.9986807387862797, "grad_norm": 0.3329196213750932, "learning_rate": 1.3262054900931021e-11, "loss": 0.0302, "step": 1515 }, { "epoch": 2.0, "grad_norm": 0.3715122507168322, "learning_rate": 0.0, "loss": 0.0308, "step": 1516 }, { "epoch": 2.0, "eval_loss": 0.04380889981985092, "eval_runtime": 125.8608, "eval_samples_per_second": 40.561, "eval_steps_per_second": 1.271, "step": 1516 }, { "epoch": 2.0, "step": 1516, "total_flos": 4.548718205631201e+17, "train_loss": 0.0573945701893881, "train_runtime": 16154.4675, "train_samples_per_second": 12.007, "train_steps_per_second": 0.094 } ], "logging_steps": 1, "max_steps": 1516, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.548718205631201e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }