{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2404886729835025, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.238859304449232, "learning_rate": 3.205128205128205e-08, "loss": 1.9292, "step": 1 }, { "epoch": 0.0, "grad_norm": 5.154761972828117, "learning_rate": 6.41025641025641e-08, "loss": 1.9106, "step": 2 }, { "epoch": 0.0, "grad_norm": 5.154761972828117, "learning_rate": 6.41025641025641e-08, "loss": 2.0762, "step": 3 }, { "epoch": 0.0, "grad_norm": 5.154761972828117, "learning_rate": 6.41025641025641e-08, "loss": 1.9214, "step": 4 }, { "epoch": 0.0, "grad_norm": 4.5501767239225925, "learning_rate": 9.615384615384617e-08, "loss": 1.9214, "step": 5 }, { "epoch": 0.0, "grad_norm": 8.121504141000438, "learning_rate": 1.282051282051282e-07, "loss": 1.9585, "step": 6 }, { "epoch": 0.0, "grad_norm": 5.023363133110595, "learning_rate": 1.6025641025641025e-07, "loss": 1.9023, "step": 7 }, { "epoch": 0.0, "grad_norm": 6.753326612596782, "learning_rate": 1.9230769230769234e-07, "loss": 2.0918, "step": 8 }, { "epoch": 0.0, "grad_norm": 5.96045460748767, "learning_rate": 2.2435897435897438e-07, "loss": 1.9512, "step": 9 }, { "epoch": 0.0, "grad_norm": 4.114270738139542, "learning_rate": 2.564102564102564e-07, "loss": 1.7202, "step": 10 }, { "epoch": 0.0, "grad_norm": 4.684535049833846, "learning_rate": 2.884615384615385e-07, "loss": 1.9663, "step": 11 }, { "epoch": 0.0, "grad_norm": 6.072578005006582, "learning_rate": 3.205128205128205e-07, "loss": 1.749, "step": 12 }, { "epoch": 0.0, "grad_norm": 7.46480114791522, "learning_rate": 3.525641025641026e-07, "loss": 1.874, "step": 13 }, { "epoch": 0.0, "grad_norm": 5.262086839396571, "learning_rate": 3.846153846153847e-07, "loss": 2.1279, "step": 14 }, { "epoch": 0.0, "grad_norm": 5.967246536097301, "learning_rate": 4.1666666666666667e-07, "loss": 1.978, "step": 15 }, { "epoch": 0.0, "grad_norm": 5.292134801724384, "learning_rate": 4.4871794871794876e-07, "loss": 1.8726, "step": 16 }, { "epoch": 0.0, "grad_norm": 5.840441578239468, "learning_rate": 4.807692307692308e-07, "loss": 1.7012, "step": 17 }, { "epoch": 0.0, "grad_norm": 6.883530970083456, "learning_rate": 5.128205128205128e-07, "loss": 1.9629, "step": 18 }, { "epoch": 0.0, "grad_norm": 5.941981671382089, "learning_rate": 5.448717948717949e-07, "loss": 1.8174, "step": 19 }, { "epoch": 0.0, "grad_norm": 5.148089740413392, "learning_rate": 5.76923076923077e-07, "loss": 2.0889, "step": 20 }, { "epoch": 0.0, "grad_norm": 3.2526377076981627, "learning_rate": 6.08974358974359e-07, "loss": 1.7114, "step": 21 }, { "epoch": 0.0, "grad_norm": 5.0623854154859576, "learning_rate": 6.41025641025641e-07, "loss": 1.749, "step": 22 }, { "epoch": 0.0, "grad_norm": 4.486633223129077, "learning_rate": 6.730769230769231e-07, "loss": 2.0391, "step": 23 }, { "epoch": 0.0, "grad_norm": 4.372206281873816, "learning_rate": 7.051282051282052e-07, "loss": 2.0615, "step": 24 }, { "epoch": 0.0, "grad_norm": 5.245200878071032, "learning_rate": 7.371794871794873e-07, "loss": 1.8257, "step": 25 }, { "epoch": 0.0, "grad_norm": 5.003738392680743, "learning_rate": 7.692307692307694e-07, "loss": 1.9463, "step": 26 }, { "epoch": 0.0, "grad_norm": 4.701079464001506, "learning_rate": 8.012820512820515e-07, "loss": 1.9487, "step": 27 }, { "epoch": 0.0, "grad_norm": 2.9499281691275945, "learning_rate": 8.333333333333333e-07, "loss": 1.792, "step": 28 }, { "epoch": 0.0, "grad_norm": 4.069030482849168, "learning_rate": 8.653846153846154e-07, "loss": 1.9136, "step": 29 }, { "epoch": 0.0, "grad_norm": 3.9368014731570184, "learning_rate": 8.974358974358975e-07, "loss": 1.9771, "step": 30 }, { "epoch": 0.0, "grad_norm": 3.2485080466515015, "learning_rate": 9.294871794871796e-07, "loss": 1.9028, "step": 31 }, { "epoch": 0.0, "grad_norm": 3.1622578885650086, "learning_rate": 9.615384615384617e-07, "loss": 1.7256, "step": 32 }, { "epoch": 0.0, "grad_norm": 3.3739906268828004, "learning_rate": 9.935897435897436e-07, "loss": 1.8687, "step": 33 }, { "epoch": 0.0, "grad_norm": 2.896288258241709, "learning_rate": 1.0256410256410257e-06, "loss": 1.6704, "step": 34 }, { "epoch": 0.0, "grad_norm": 3.3707774957236416, "learning_rate": 1.0576923076923078e-06, "loss": 1.8804, "step": 35 }, { "epoch": 0.0, "grad_norm": 3.3426838743476717, "learning_rate": 1.0897435897435899e-06, "loss": 1.6968, "step": 36 }, { "epoch": 0.0, "grad_norm": 3.6068389746435177, "learning_rate": 1.121794871794872e-06, "loss": 1.894, "step": 37 }, { "epoch": 0.0, "grad_norm": 3.521477257520405, "learning_rate": 1.153846153846154e-06, "loss": 1.8633, "step": 38 }, { "epoch": 0.0, "grad_norm": 2.776485593912886, "learning_rate": 1.185897435897436e-06, "loss": 1.9204, "step": 39 }, { "epoch": 0.0, "grad_norm": 2.7681508560658163, "learning_rate": 1.217948717948718e-06, "loss": 1.7188, "step": 40 }, { "epoch": 0.0, "grad_norm": 2.4439946119342766, "learning_rate": 1.25e-06, "loss": 1.6919, "step": 41 }, { "epoch": 0.0, "grad_norm": 3.4557738645575014, "learning_rate": 1.282051282051282e-06, "loss": 1.8833, "step": 42 }, { "epoch": 0.0, "grad_norm": 3.258303676978315, "learning_rate": 1.3141025641025643e-06, "loss": 1.8071, "step": 43 }, { "epoch": 0.0, "grad_norm": 2.458784669216295, "learning_rate": 1.3461538461538462e-06, "loss": 1.6909, "step": 44 }, { "epoch": 0.0, "grad_norm": 2.504431892544747, "learning_rate": 1.3782051282051285e-06, "loss": 1.8086, "step": 45 }, { "epoch": 0.0, "grad_norm": 2.4555502780430674, "learning_rate": 1.4102564102564104e-06, "loss": 1.751, "step": 46 }, { "epoch": 0.0, "grad_norm": 2.751022634659861, "learning_rate": 1.4423076923076922e-06, "loss": 1.8057, "step": 47 }, { "epoch": 0.0, "grad_norm": 2.5738415737254643, "learning_rate": 1.4743589743589745e-06, "loss": 1.9253, "step": 48 }, { "epoch": 0.0, "grad_norm": 2.8124133820243595, "learning_rate": 1.5064102564102564e-06, "loss": 1.793, "step": 49 }, { "epoch": 0.0, "grad_norm": 2.463335400346117, "learning_rate": 1.5384615384615387e-06, "loss": 1.7407, "step": 50 }, { "epoch": 0.0, "grad_norm": 2.8940656658334207, "learning_rate": 1.5705128205128206e-06, "loss": 1.8716, "step": 51 }, { "epoch": 0.0, "grad_norm": 2.584846431211917, "learning_rate": 1.602564102564103e-06, "loss": 1.9146, "step": 52 }, { "epoch": 0.0, "grad_norm": 2.1515635295651547, "learning_rate": 1.6346153846153848e-06, "loss": 1.5991, "step": 53 }, { "epoch": 0.0, "grad_norm": 2.3483985097141886, "learning_rate": 1.6666666666666667e-06, "loss": 1.7856, "step": 54 }, { "epoch": 0.0, "grad_norm": 2.1146516926157024, "learning_rate": 1.698717948717949e-06, "loss": 1.7661, "step": 55 }, { "epoch": 0.0, "grad_norm": 2.3653234986460254, "learning_rate": 1.7307692307692308e-06, "loss": 1.731, "step": 56 }, { "epoch": 0.0, "grad_norm": 2.21651123257784, "learning_rate": 1.7628205128205131e-06, "loss": 1.5532, "step": 57 }, { "epoch": 0.0, "grad_norm": 2.3531455091700266, "learning_rate": 1.794871794871795e-06, "loss": 1.7275, "step": 58 }, { "epoch": 0.0, "grad_norm": 2.2454207897169183, "learning_rate": 1.826923076923077e-06, "loss": 1.6841, "step": 59 }, { "epoch": 0.0, "grad_norm": 2.911505925139179, "learning_rate": 1.8589743589743592e-06, "loss": 1.5664, "step": 60 }, { "epoch": 0.0, "grad_norm": 2.3580392311833527, "learning_rate": 1.891025641025641e-06, "loss": 1.7896, "step": 61 }, { "epoch": 0.0, "grad_norm": 2.2925212989802692, "learning_rate": 1.9230769230769234e-06, "loss": 1.8101, "step": 62 }, { "epoch": 0.0, "grad_norm": 2.1153093773918763, "learning_rate": 1.9551282051282055e-06, "loss": 1.5747, "step": 63 }, { "epoch": 0.0, "grad_norm": 2.82147569579739, "learning_rate": 1.987179487179487e-06, "loss": 1.9238, "step": 64 }, { "epoch": 0.0, "grad_norm": 2.2045873509725165, "learning_rate": 2.0192307692307692e-06, "loss": 1.6338, "step": 65 }, { "epoch": 0.0, "grad_norm": 1.8958181368269862, "learning_rate": 2.0512820512820513e-06, "loss": 1.6543, "step": 66 }, { "epoch": 0.0, "grad_norm": 2.4962681332761973, "learning_rate": 2.0833333333333334e-06, "loss": 1.7227, "step": 67 }, { "epoch": 0.0, "grad_norm": 2.2904246633721383, "learning_rate": 2.1153846153846155e-06, "loss": 1.7822, "step": 68 }, { "epoch": 0.0, "grad_norm": 2.0946295372079704, "learning_rate": 2.1474358974358976e-06, "loss": 1.7529, "step": 69 }, { "epoch": 0.0, "grad_norm": 1.9129728821010556, "learning_rate": 2.1794871794871797e-06, "loss": 1.7168, "step": 70 }, { "epoch": 0.0, "grad_norm": 2.243146448623675, "learning_rate": 2.211538461538462e-06, "loss": 1.5776, "step": 71 }, { "epoch": 0.0, "grad_norm": 2.3234162989867078, "learning_rate": 2.243589743589744e-06, "loss": 1.7202, "step": 72 }, { "epoch": 0.0, "grad_norm": 2.2101029603221445, "learning_rate": 2.275641025641026e-06, "loss": 1.6567, "step": 73 }, { "epoch": 0.0, "grad_norm": 1.7394617855897616, "learning_rate": 2.307692307692308e-06, "loss": 1.6265, "step": 74 }, { "epoch": 0.0, "grad_norm": 2.1688691088118355, "learning_rate": 2.3397435897435897e-06, "loss": 1.7217, "step": 75 }, { "epoch": 0.0, "grad_norm": 2.125923801614282, "learning_rate": 2.371794871794872e-06, "loss": 1.6421, "step": 76 }, { "epoch": 0.0, "grad_norm": 1.922290559965317, "learning_rate": 2.403846153846154e-06, "loss": 1.7417, "step": 77 }, { "epoch": 0.0, "grad_norm": 1.8260537731707283, "learning_rate": 2.435897435897436e-06, "loss": 1.7046, "step": 78 }, { "epoch": 0.0, "grad_norm": 1.901577592787987, "learning_rate": 2.467948717948718e-06, "loss": 1.6465, "step": 79 }, { "epoch": 0.0, "grad_norm": 2.2404366777034657, "learning_rate": 2.5e-06, "loss": 1.9019, "step": 80 }, { "epoch": 0.0, "grad_norm": 2.3241381789925084, "learning_rate": 2.5320512820512823e-06, "loss": 1.9023, "step": 81 }, { "epoch": 0.0, "grad_norm": 1.6016055313372246, "learning_rate": 2.564102564102564e-06, "loss": 1.5063, "step": 82 }, { "epoch": 0.0, "grad_norm": 1.9570960890714142, "learning_rate": 2.5961538461538465e-06, "loss": 1.6626, "step": 83 }, { "epoch": 0.0, "grad_norm": 2.173041714414597, "learning_rate": 2.6282051282051286e-06, "loss": 1.8271, "step": 84 }, { "epoch": 0.0, "grad_norm": 1.7100248085650758, "learning_rate": 2.6602564102564107e-06, "loss": 1.6792, "step": 85 }, { "epoch": 0.0, "grad_norm": 2.0012255354709096, "learning_rate": 2.6923076923076923e-06, "loss": 1.6572, "step": 86 }, { "epoch": 0.0, "grad_norm": 1.9137825517030291, "learning_rate": 2.7243589743589744e-06, "loss": 1.5825, "step": 87 }, { "epoch": 0.0, "grad_norm": 1.8189687481089372, "learning_rate": 2.756410256410257e-06, "loss": 1.6406, "step": 88 }, { "epoch": 0.0, "grad_norm": 2.005488580790363, "learning_rate": 2.7884615384615386e-06, "loss": 1.4351, "step": 89 }, { "epoch": 0.0, "grad_norm": 1.5332368401576102, "learning_rate": 2.8205128205128207e-06, "loss": 1.6577, "step": 90 }, { "epoch": 0.0, "grad_norm": 1.8507605450940214, "learning_rate": 2.852564102564103e-06, "loss": 1.6392, "step": 91 }, { "epoch": 0.0, "grad_norm": 1.5910068501827332, "learning_rate": 2.8846153846153845e-06, "loss": 1.4976, "step": 92 }, { "epoch": 0.0, "grad_norm": 2.0608841750304494, "learning_rate": 2.916666666666667e-06, "loss": 1.5547, "step": 93 }, { "epoch": 0.0, "grad_norm": 2.1355602748315436, "learning_rate": 2.948717948717949e-06, "loss": 1.7173, "step": 94 }, { "epoch": 0.0, "grad_norm": 1.9032261555786585, "learning_rate": 2.980769230769231e-06, "loss": 1.5654, "step": 95 }, { "epoch": 0.0, "grad_norm": 1.846073015872819, "learning_rate": 3.012820512820513e-06, "loss": 1.6216, "step": 96 }, { "epoch": 0.0, "grad_norm": 1.6487987622267262, "learning_rate": 3.044871794871795e-06, "loss": 1.4878, "step": 97 }, { "epoch": 0.0, "grad_norm": 2.322763298152455, "learning_rate": 3.0769230769230774e-06, "loss": 1.792, "step": 98 }, { "epoch": 0.0, "grad_norm": 1.8121063877418546, "learning_rate": 3.108974358974359e-06, "loss": 1.7222, "step": 99 }, { "epoch": 0.0, "grad_norm": 1.6744336508338604, "learning_rate": 3.141025641025641e-06, "loss": 1.6226, "step": 100 }, { "epoch": 0.0, "grad_norm": 1.7247634842466844, "learning_rate": 3.1730769230769233e-06, "loss": 1.5952, "step": 101 }, { "epoch": 0.0, "grad_norm": 1.740366338268637, "learning_rate": 3.205128205128206e-06, "loss": 1.5981, "step": 102 }, { "epoch": 0.0, "grad_norm": 1.728970560061163, "learning_rate": 3.2371794871794875e-06, "loss": 1.5791, "step": 103 }, { "epoch": 0.01, "grad_norm": 1.858549928240061, "learning_rate": 3.2692307692307696e-06, "loss": 1.6162, "step": 104 }, { "epoch": 0.01, "grad_norm": 2.005425495184849, "learning_rate": 3.3012820512820517e-06, "loss": 1.7373, "step": 105 }, { "epoch": 0.01, "grad_norm": 1.9693982346018715, "learning_rate": 3.3333333333333333e-06, "loss": 1.5737, "step": 106 }, { "epoch": 0.01, "grad_norm": 1.5704486648348535, "learning_rate": 3.365384615384616e-06, "loss": 1.5737, "step": 107 }, { "epoch": 0.01, "grad_norm": 1.7219111956395152, "learning_rate": 3.397435897435898e-06, "loss": 1.4395, "step": 108 }, { "epoch": 0.01, "grad_norm": 1.8107668921480111, "learning_rate": 3.4294871794871796e-06, "loss": 1.6245, "step": 109 }, { "epoch": 0.01, "grad_norm": 2.060649729714151, "learning_rate": 3.4615384615384617e-06, "loss": 1.6484, "step": 110 }, { "epoch": 0.01, "grad_norm": 1.8211651888472016, "learning_rate": 3.4935897435897438e-06, "loss": 1.6504, "step": 111 }, { "epoch": 0.01, "grad_norm": 1.736650925195897, "learning_rate": 3.5256410256410263e-06, "loss": 1.5811, "step": 112 }, { "epoch": 0.01, "grad_norm": 2.0270331778194097, "learning_rate": 3.557692307692308e-06, "loss": 1.5796, "step": 113 }, { "epoch": 0.01, "grad_norm": 1.7649606794029367, "learning_rate": 3.58974358974359e-06, "loss": 1.6763, "step": 114 }, { "epoch": 0.01, "grad_norm": 1.6674123636417442, "learning_rate": 3.621794871794872e-06, "loss": 1.4824, "step": 115 }, { "epoch": 0.01, "grad_norm": 2.0434484011727907, "learning_rate": 3.653846153846154e-06, "loss": 1.6177, "step": 116 }, { "epoch": 0.01, "grad_norm": 1.5250602686441213, "learning_rate": 3.6858974358974363e-06, "loss": 1.4253, "step": 117 }, { "epoch": 0.01, "grad_norm": 1.6326566506759765, "learning_rate": 3.7179487179487184e-06, "loss": 1.6128, "step": 118 }, { "epoch": 0.01, "grad_norm": 1.7267931799000755, "learning_rate": 3.7500000000000005e-06, "loss": 1.7021, "step": 119 }, { "epoch": 0.01, "grad_norm": 1.9640148089571143, "learning_rate": 3.782051282051282e-06, "loss": 1.4609, "step": 120 }, { "epoch": 0.01, "grad_norm": 1.667202401374494, "learning_rate": 3.8141025641025643e-06, "loss": 1.4727, "step": 121 }, { "epoch": 0.01, "grad_norm": 1.8491336858211664, "learning_rate": 3.846153846153847e-06, "loss": 1.4805, "step": 122 }, { "epoch": 0.01, "grad_norm": 1.970502990992734, "learning_rate": 3.878205128205129e-06, "loss": 1.5898, "step": 123 }, { "epoch": 0.01, "grad_norm": 1.8106782959667795, "learning_rate": 3.910256410256411e-06, "loss": 1.3916, "step": 124 }, { "epoch": 0.01, "grad_norm": 1.5795454248785767, "learning_rate": 3.942307692307692e-06, "loss": 1.3828, "step": 125 }, { "epoch": 0.01, "grad_norm": 1.9710407871965574, "learning_rate": 3.974358974358974e-06, "loss": 1.5918, "step": 126 }, { "epoch": 0.01, "grad_norm": 1.720077657883205, "learning_rate": 4.006410256410257e-06, "loss": 1.5762, "step": 127 }, { "epoch": 0.01, "grad_norm": 1.5653310197103396, "learning_rate": 4.0384615384615385e-06, "loss": 1.4663, "step": 128 }, { "epoch": 0.01, "grad_norm": 1.960254201027659, "learning_rate": 4.070512820512821e-06, "loss": 1.585, "step": 129 }, { "epoch": 0.01, "grad_norm": 1.4653973577728605, "learning_rate": 4.102564102564103e-06, "loss": 1.291, "step": 130 }, { "epoch": 0.01, "grad_norm": 1.8782298332980036, "learning_rate": 4.134615384615385e-06, "loss": 1.6655, "step": 131 }, { "epoch": 0.01, "grad_norm": 1.926146971282728, "learning_rate": 4.166666666666667e-06, "loss": 1.6191, "step": 132 }, { "epoch": 0.01, "grad_norm": 1.9142058047642427, "learning_rate": 4.198717948717949e-06, "loss": 1.4238, "step": 133 }, { "epoch": 0.01, "grad_norm": 1.547722030633865, "learning_rate": 4.230769230769231e-06, "loss": 1.5288, "step": 134 }, { "epoch": 0.01, "grad_norm": 1.579733252201955, "learning_rate": 4.262820512820513e-06, "loss": 1.3857, "step": 135 }, { "epoch": 0.01, "grad_norm": 1.667805532959561, "learning_rate": 4.294871794871795e-06, "loss": 1.582, "step": 136 }, { "epoch": 0.01, "grad_norm": 1.5200512406347286, "learning_rate": 4.326923076923077e-06, "loss": 1.6738, "step": 137 }, { "epoch": 0.01, "grad_norm": 1.614965374449281, "learning_rate": 4.358974358974359e-06, "loss": 1.6611, "step": 138 }, { "epoch": 0.01, "grad_norm": 1.9691385979784952, "learning_rate": 4.3910256410256415e-06, "loss": 1.6792, "step": 139 }, { "epoch": 0.01, "grad_norm": 1.544450796833931, "learning_rate": 4.423076923076924e-06, "loss": 1.5615, "step": 140 }, { "epoch": 0.01, "grad_norm": 1.7221654662587071, "learning_rate": 4.455128205128206e-06, "loss": 1.4062, "step": 141 }, { "epoch": 0.01, "grad_norm": 1.5471251537385284, "learning_rate": 4.487179487179488e-06, "loss": 1.4097, "step": 142 }, { "epoch": 0.01, "grad_norm": 1.8635808812969707, "learning_rate": 4.51923076923077e-06, "loss": 1.5171, "step": 143 }, { "epoch": 0.01, "grad_norm": 1.535707715049538, "learning_rate": 4.551282051282052e-06, "loss": 1.6353, "step": 144 }, { "epoch": 0.01, "grad_norm": 1.7429306574282821, "learning_rate": 4.583333333333333e-06, "loss": 1.5894, "step": 145 }, { "epoch": 0.01, "grad_norm": 1.4567582558845829, "learning_rate": 4.615384615384616e-06, "loss": 1.3931, "step": 146 }, { "epoch": 0.01, "grad_norm": 1.573101841721847, "learning_rate": 4.647435897435898e-06, "loss": 1.4629, "step": 147 }, { "epoch": 0.01, "grad_norm": 1.5197795284379594, "learning_rate": 4.6794871794871795e-06, "loss": 1.6362, "step": 148 }, { "epoch": 0.01, "grad_norm": 1.584866687090061, "learning_rate": 4.711538461538462e-06, "loss": 1.7417, "step": 149 }, { "epoch": 0.01, "grad_norm": 1.8414381776962798, "learning_rate": 4.743589743589744e-06, "loss": 1.4585, "step": 150 }, { "epoch": 0.01, "grad_norm": 1.3521917098457124, "learning_rate": 4.775641025641027e-06, "loss": 1.5596, "step": 151 }, { "epoch": 0.01, "grad_norm": 2.2788563573731735, "learning_rate": 4.807692307692308e-06, "loss": 1.5112, "step": 152 }, { "epoch": 0.01, "grad_norm": 1.2635584746274662, "learning_rate": 4.83974358974359e-06, "loss": 1.3804, "step": 153 }, { "epoch": 0.01, "grad_norm": 1.5532220592334811, "learning_rate": 4.871794871794872e-06, "loss": 1.6523, "step": 154 }, { "epoch": 0.01, "grad_norm": 1.5664650376466702, "learning_rate": 4.903846153846154e-06, "loss": 1.4224, "step": 155 }, { "epoch": 0.01, "grad_norm": 1.656054619357628, "learning_rate": 4.935897435897436e-06, "loss": 1.6069, "step": 156 }, { "epoch": 0.01, "grad_norm": 1.5232097696566647, "learning_rate": 4.967948717948718e-06, "loss": 1.3662, "step": 157 }, { "epoch": 0.01, "grad_norm": 1.8399211171546899, "learning_rate": 5e-06, "loss": 1.5405, "step": 158 }, { "epoch": 0.01, "grad_norm": 1.5505603061874536, "learning_rate": 5.0320512820512825e-06, "loss": 1.3809, "step": 159 }, { "epoch": 0.01, "grad_norm": 1.8997162921701884, "learning_rate": 5.064102564102565e-06, "loss": 1.7207, "step": 160 }, { "epoch": 0.01, "grad_norm": 1.7575716528566327, "learning_rate": 5.096153846153846e-06, "loss": 1.6133, "step": 161 }, { "epoch": 0.01, "grad_norm": 1.716110507127467, "learning_rate": 5.128205128205128e-06, "loss": 1.5044, "step": 162 }, { "epoch": 0.01, "grad_norm": 1.7126143944879049, "learning_rate": 5.160256410256411e-06, "loss": 1.5459, "step": 163 }, { "epoch": 0.01, "grad_norm": 1.7034117954416954, "learning_rate": 5.192307692307693e-06, "loss": 1.4263, "step": 164 }, { "epoch": 0.01, "grad_norm": 1.8808184053448647, "learning_rate": 5.224358974358975e-06, "loss": 1.646, "step": 165 }, { "epoch": 0.01, "grad_norm": 1.8256974251245903, "learning_rate": 5.256410256410257e-06, "loss": 1.4595, "step": 166 }, { "epoch": 0.01, "grad_norm": 1.5332772616069503, "learning_rate": 5.288461538461539e-06, "loss": 1.605, "step": 167 }, { "epoch": 0.01, "grad_norm": 1.5439634014517032, "learning_rate": 5.320512820512821e-06, "loss": 1.6392, "step": 168 }, { "epoch": 0.01, "grad_norm": 1.727159551689649, "learning_rate": 5.3525641025641026e-06, "loss": 1.625, "step": 169 }, { "epoch": 0.01, "grad_norm": 1.7538202720011002, "learning_rate": 5.384615384615385e-06, "loss": 1.4429, "step": 170 }, { "epoch": 0.01, "grad_norm": 1.4133140212159618, "learning_rate": 5.416666666666667e-06, "loss": 1.4824, "step": 171 }, { "epoch": 0.01, "grad_norm": 1.3842900268580889, "learning_rate": 5.448717948717949e-06, "loss": 1.4517, "step": 172 }, { "epoch": 0.01, "grad_norm": 1.5202581050560613, "learning_rate": 5.480769230769232e-06, "loss": 1.564, "step": 173 }, { "epoch": 0.01, "grad_norm": 1.3648753550732138, "learning_rate": 5.512820512820514e-06, "loss": 1.4131, "step": 174 }, { "epoch": 0.01, "grad_norm": 1.3379157005922668, "learning_rate": 5.544871794871796e-06, "loss": 1.3105, "step": 175 }, { "epoch": 0.01, "grad_norm": 1.919194636838991, "learning_rate": 5.576923076923077e-06, "loss": 1.4619, "step": 176 }, { "epoch": 0.01, "grad_norm": 1.879199960482607, "learning_rate": 5.608974358974359e-06, "loss": 1.3032, "step": 177 }, { "epoch": 0.01, "grad_norm": 1.4657606490120825, "learning_rate": 5.641025641025641e-06, "loss": 1.2827, "step": 178 }, { "epoch": 0.01, "grad_norm": 1.666882178228744, "learning_rate": 5.6730769230769235e-06, "loss": 1.4805, "step": 179 }, { "epoch": 0.01, "grad_norm": 1.6258710061186548, "learning_rate": 5.705128205128206e-06, "loss": 1.3057, "step": 180 }, { "epoch": 0.01, "grad_norm": 1.7548011353711834, "learning_rate": 5.737179487179487e-06, "loss": 1.585, "step": 181 }, { "epoch": 0.01, "grad_norm": 1.5869944614054485, "learning_rate": 5.769230769230769e-06, "loss": 1.7427, "step": 182 }, { "epoch": 0.01, "grad_norm": 1.5335816636189274, "learning_rate": 5.801282051282052e-06, "loss": 1.5596, "step": 183 }, { "epoch": 0.01, "grad_norm": 1.4391860307780704, "learning_rate": 5.833333333333334e-06, "loss": 1.4199, "step": 184 }, { "epoch": 0.01, "grad_norm": 1.5469212526234435, "learning_rate": 5.865384615384616e-06, "loss": 1.5176, "step": 185 }, { "epoch": 0.01, "grad_norm": 1.5003501660650924, "learning_rate": 5.897435897435898e-06, "loss": 1.4272, "step": 186 }, { "epoch": 0.01, "grad_norm": 1.508234576270832, "learning_rate": 5.92948717948718e-06, "loss": 1.1782, "step": 187 }, { "epoch": 0.01, "grad_norm": 1.4748925618138098, "learning_rate": 5.961538461538462e-06, "loss": 1.5923, "step": 188 }, { "epoch": 0.01, "grad_norm": 1.4737944848710263, "learning_rate": 5.9935897435897436e-06, "loss": 1.4497, "step": 189 }, { "epoch": 0.01, "grad_norm": 1.459889107664886, "learning_rate": 6.025641025641026e-06, "loss": 1.4287, "step": 190 }, { "epoch": 0.01, "grad_norm": 1.3869579848195994, "learning_rate": 6.057692307692308e-06, "loss": 1.3354, "step": 191 }, { "epoch": 0.01, "grad_norm": 1.665655627688541, "learning_rate": 6.08974358974359e-06, "loss": 1.4106, "step": 192 }, { "epoch": 0.01, "grad_norm": 1.3134424580248774, "learning_rate": 6.121794871794873e-06, "loss": 1.4424, "step": 193 }, { "epoch": 0.01, "grad_norm": 1.415895492354165, "learning_rate": 6.153846153846155e-06, "loss": 1.4951, "step": 194 }, { "epoch": 0.01, "grad_norm": 1.371996146521701, "learning_rate": 6.185897435897437e-06, "loss": 1.3091, "step": 195 }, { "epoch": 0.01, "grad_norm": 1.6086430794753375, "learning_rate": 6.217948717948718e-06, "loss": 1.5337, "step": 196 }, { "epoch": 0.01, "grad_norm": 1.8127065466411951, "learning_rate": 6.25e-06, "loss": 1.4907, "step": 197 }, { "epoch": 0.01, "grad_norm": 1.384664309264018, "learning_rate": 6.282051282051282e-06, "loss": 1.3706, "step": 198 }, { "epoch": 0.01, "grad_norm": 1.5235422620822068, "learning_rate": 6.3141025641025645e-06, "loss": 1.4858, "step": 199 }, { "epoch": 0.01, "grad_norm": 1.2885597216980793, "learning_rate": 6.3461538461538466e-06, "loss": 1.3149, "step": 200 }, { "epoch": 0.01, "grad_norm": 1.7298762167347572, "learning_rate": 6.378205128205129e-06, "loss": 1.4976, "step": 201 }, { "epoch": 0.01, "grad_norm": 1.750337640265624, "learning_rate": 6.410256410256412e-06, "loss": 1.6279, "step": 202 }, { "epoch": 0.01, "grad_norm": 1.41841869268125, "learning_rate": 6.442307692307693e-06, "loss": 1.498, "step": 203 }, { "epoch": 0.01, "grad_norm": 1.6272909041184738, "learning_rate": 6.474358974358975e-06, "loss": 1.3926, "step": 204 }, { "epoch": 0.01, "grad_norm": 1.5477750847250453, "learning_rate": 6.506410256410257e-06, "loss": 1.5244, "step": 205 }, { "epoch": 0.01, "grad_norm": 1.5931696147327243, "learning_rate": 6.538461538461539e-06, "loss": 1.377, "step": 206 }, { "epoch": 0.01, "grad_norm": 1.4181826250987062, "learning_rate": 6.570512820512821e-06, "loss": 1.5122, "step": 207 }, { "epoch": 0.01, "grad_norm": 1.7443415454000577, "learning_rate": 6.602564102564103e-06, "loss": 1.519, "step": 208 }, { "epoch": 0.01, "grad_norm": 1.4180895910964568, "learning_rate": 6.6346153846153846e-06, "loss": 1.5557, "step": 209 }, { "epoch": 0.01, "grad_norm": 1.518469966973389, "learning_rate": 6.666666666666667e-06, "loss": 1.4531, "step": 210 }, { "epoch": 0.01, "grad_norm": 1.5601379784815306, "learning_rate": 6.698717948717949e-06, "loss": 1.5273, "step": 211 }, { "epoch": 0.01, "grad_norm": 1.5207608552390224, "learning_rate": 6.730769230769232e-06, "loss": 1.4692, "step": 212 }, { "epoch": 0.01, "grad_norm": 1.4720863691945865, "learning_rate": 6.762820512820514e-06, "loss": 1.5347, "step": 213 }, { "epoch": 0.01, "grad_norm": 1.371194906239484, "learning_rate": 6.794871794871796e-06, "loss": 1.5654, "step": 214 }, { "epoch": 0.01, "grad_norm": 1.7043644796428705, "learning_rate": 6.826923076923078e-06, "loss": 1.6494, "step": 215 }, { "epoch": 0.01, "grad_norm": 1.4816296880348712, "learning_rate": 6.858974358974359e-06, "loss": 1.4424, "step": 216 }, { "epoch": 0.01, "grad_norm": 1.817439747866579, "learning_rate": 6.891025641025641e-06, "loss": 1.6362, "step": 217 }, { "epoch": 0.01, "grad_norm": 1.5786979457571388, "learning_rate": 6.923076923076923e-06, "loss": 1.6304, "step": 218 }, { "epoch": 0.01, "grad_norm": 1.3133870079960415, "learning_rate": 6.9551282051282055e-06, "loss": 1.2573, "step": 219 }, { "epoch": 0.01, "grad_norm": 1.5132232235577348, "learning_rate": 6.9871794871794876e-06, "loss": 1.3345, "step": 220 }, { "epoch": 0.01, "grad_norm": 1.4274882147184411, "learning_rate": 7.01923076923077e-06, "loss": 1.4512, "step": 221 }, { "epoch": 0.01, "grad_norm": 1.5195232372898413, "learning_rate": 7.051282051282053e-06, "loss": 1.5044, "step": 222 }, { "epoch": 0.01, "grad_norm": 1.3974543525216674, "learning_rate": 7.083333333333335e-06, "loss": 1.4976, "step": 223 }, { "epoch": 0.01, "grad_norm": 1.4188522801923789, "learning_rate": 7.115384615384616e-06, "loss": 1.3765, "step": 224 }, { "epoch": 0.01, "grad_norm": 1.4453431214143329, "learning_rate": 7.147435897435898e-06, "loss": 1.5845, "step": 225 }, { "epoch": 0.01, "grad_norm": 1.3360802149976794, "learning_rate": 7.17948717948718e-06, "loss": 1.729, "step": 226 }, { "epoch": 0.01, "grad_norm": 1.3508905049958093, "learning_rate": 7.211538461538462e-06, "loss": 1.4604, "step": 227 }, { "epoch": 0.01, "grad_norm": 1.7696584130328374, "learning_rate": 7.243589743589744e-06, "loss": 1.5024, "step": 228 }, { "epoch": 0.01, "grad_norm": 1.4635829385660106, "learning_rate": 7.2756410256410255e-06, "loss": 1.6235, "step": 229 }, { "epoch": 0.01, "grad_norm": 1.669299411343517, "learning_rate": 7.307692307692308e-06, "loss": 1.6665, "step": 230 }, { "epoch": 0.01, "grad_norm": 1.6483065886016006, "learning_rate": 7.33974358974359e-06, "loss": 1.4634, "step": 231 }, { "epoch": 0.01, "grad_norm": 1.3387448202439354, "learning_rate": 7.371794871794873e-06, "loss": 1.6221, "step": 232 }, { "epoch": 0.01, "grad_norm": 1.437370038669659, "learning_rate": 7.403846153846155e-06, "loss": 1.4688, "step": 233 }, { "epoch": 0.01, "grad_norm": 1.7931577095568545, "learning_rate": 7.435897435897437e-06, "loss": 1.6265, "step": 234 }, { "epoch": 0.01, "grad_norm": 1.6064451616276418, "learning_rate": 7.467948717948719e-06, "loss": 1.3843, "step": 235 }, { "epoch": 0.01, "grad_norm": 1.3907874603976877, "learning_rate": 7.500000000000001e-06, "loss": 1.3608, "step": 236 }, { "epoch": 0.01, "grad_norm": 1.3369647395143316, "learning_rate": 7.532051282051282e-06, "loss": 1.4453, "step": 237 }, { "epoch": 0.01, "grad_norm": 1.698987018791686, "learning_rate": 7.564102564102564e-06, "loss": 1.415, "step": 238 }, { "epoch": 0.01, "grad_norm": 1.3649721263541053, "learning_rate": 7.5961538461538465e-06, "loss": 1.3789, "step": 239 }, { "epoch": 0.01, "grad_norm": 1.4088385190522619, "learning_rate": 7.6282051282051286e-06, "loss": 1.2163, "step": 240 }, { "epoch": 0.01, "grad_norm": 1.5219404939043188, "learning_rate": 7.660256410256411e-06, "loss": 1.4097, "step": 241 }, { "epoch": 0.01, "grad_norm": 1.5724354127233526, "learning_rate": 7.692307692307694e-06, "loss": 1.6787, "step": 242 }, { "epoch": 0.01, "grad_norm": 1.3970694181480925, "learning_rate": 7.724358974358976e-06, "loss": 1.4629, "step": 243 }, { "epoch": 0.01, "grad_norm": 1.3734568487301586, "learning_rate": 7.756410256410258e-06, "loss": 1.4873, "step": 244 }, { "epoch": 0.01, "grad_norm": 1.3863321702985334, "learning_rate": 7.78846153846154e-06, "loss": 1.5269, "step": 245 }, { "epoch": 0.01, "grad_norm": 1.8121022125386985, "learning_rate": 7.820512820512822e-06, "loss": 1.4814, "step": 246 }, { "epoch": 0.01, "grad_norm": 1.538933058041442, "learning_rate": 7.852564102564102e-06, "loss": 1.583, "step": 247 }, { "epoch": 0.01, "grad_norm": 1.395100966161363, "learning_rate": 7.884615384615384e-06, "loss": 1.4912, "step": 248 }, { "epoch": 0.01, "grad_norm": 1.4133233925764965, "learning_rate": 7.916666666666667e-06, "loss": 1.481, "step": 249 }, { "epoch": 0.01, "grad_norm": 1.3687218405311699, "learning_rate": 7.948717948717949e-06, "loss": 1.4595, "step": 250 }, { "epoch": 0.01, "grad_norm": 1.464734174966439, "learning_rate": 7.980769230769232e-06, "loss": 1.4932, "step": 251 }, { "epoch": 0.01, "grad_norm": 1.1884946092673951, "learning_rate": 8.012820512820515e-06, "loss": 1.4556, "step": 252 }, { "epoch": 0.01, "grad_norm": 1.54451676830231, "learning_rate": 8.044871794871797e-06, "loss": 1.6221, "step": 253 }, { "epoch": 0.01, "grad_norm": 1.754688343231471, "learning_rate": 8.076923076923077e-06, "loss": 1.4512, "step": 254 }, { "epoch": 0.01, "grad_norm": 1.3322302493409486, "learning_rate": 8.108974358974359e-06, "loss": 1.5825, "step": 255 }, { "epoch": 0.01, "grad_norm": 1.4674730077487839, "learning_rate": 8.141025641025641e-06, "loss": 1.5005, "step": 256 }, { "epoch": 0.01, "grad_norm": 1.2167921588898725, "learning_rate": 8.173076923076923e-06, "loss": 1.2827, "step": 257 }, { "epoch": 0.01, "grad_norm": 1.168540596036068, "learning_rate": 8.205128205128205e-06, "loss": 1.3428, "step": 258 }, { "epoch": 0.01, "grad_norm": 1.6789165955553473, "learning_rate": 8.237179487179487e-06, "loss": 1.5195, "step": 259 }, { "epoch": 0.01, "grad_norm": 1.264218952301381, "learning_rate": 8.26923076923077e-06, "loss": 1.4189, "step": 260 }, { "epoch": 0.01, "grad_norm": 1.7624012666345472, "learning_rate": 8.301282051282052e-06, "loss": 1.5059, "step": 261 }, { "epoch": 0.01, "grad_norm": 1.4194250760848537, "learning_rate": 8.333333333333334e-06, "loss": 1.5103, "step": 262 }, { "epoch": 0.01, "grad_norm": 1.5397114149208282, "learning_rate": 8.365384615384616e-06, "loss": 1.3716, "step": 263 }, { "epoch": 0.01, "grad_norm": 1.8412153258409343, "learning_rate": 8.397435897435898e-06, "loss": 1.4932, "step": 264 }, { "epoch": 0.01, "grad_norm": 1.6802563340508783, "learning_rate": 8.42948717948718e-06, "loss": 1.3311, "step": 265 }, { "epoch": 0.01, "grad_norm": 1.670878605132347, "learning_rate": 8.461538461538462e-06, "loss": 1.521, "step": 266 }, { "epoch": 0.01, "grad_norm": 1.2395749877315518, "learning_rate": 8.493589743589744e-06, "loss": 1.3379, "step": 267 }, { "epoch": 0.01, "grad_norm": 1.4818439873500375, "learning_rate": 8.525641025641026e-06, "loss": 1.3052, "step": 268 }, { "epoch": 0.01, "grad_norm": 1.9952198811090225, "learning_rate": 8.557692307692308e-06, "loss": 1.6255, "step": 269 }, { "epoch": 0.01, "grad_norm": 1.3128031554812432, "learning_rate": 8.58974358974359e-06, "loss": 1.4761, "step": 270 }, { "epoch": 0.01, "grad_norm": 1.3833992687798677, "learning_rate": 8.621794871794873e-06, "loss": 1.3945, "step": 271 }, { "epoch": 0.01, "grad_norm": 1.5293115012028113, "learning_rate": 8.653846153846155e-06, "loss": 1.3755, "step": 272 }, { "epoch": 0.01, "grad_norm": 1.482360230970466, "learning_rate": 8.685897435897437e-06, "loss": 1.4541, "step": 273 }, { "epoch": 0.01, "grad_norm": 1.4286930588039524, "learning_rate": 8.717948717948719e-06, "loss": 1.3594, "step": 274 }, { "epoch": 0.01, "grad_norm": 1.588811858248963, "learning_rate": 8.750000000000001e-06, "loss": 1.354, "step": 275 }, { "epoch": 0.01, "grad_norm": 1.374045211156847, "learning_rate": 8.782051282051283e-06, "loss": 1.3535, "step": 276 }, { "epoch": 0.01, "grad_norm": 1.4308512665840407, "learning_rate": 8.814102564102565e-06, "loss": 1.4482, "step": 277 }, { "epoch": 0.01, "grad_norm": 1.403680196525793, "learning_rate": 8.846153846153847e-06, "loss": 1.4131, "step": 278 }, { "epoch": 0.01, "grad_norm": 1.5477001210895838, "learning_rate": 8.87820512820513e-06, "loss": 1.5146, "step": 279 }, { "epoch": 0.01, "grad_norm": 1.5212020849365218, "learning_rate": 8.910256410256411e-06, "loss": 1.5601, "step": 280 }, { "epoch": 0.01, "grad_norm": 1.225342389365309, "learning_rate": 8.942307692307693e-06, "loss": 1.2358, "step": 281 }, { "epoch": 0.01, "grad_norm": 1.327043026339369, "learning_rate": 8.974358974358976e-06, "loss": 1.478, "step": 282 }, { "epoch": 0.01, "grad_norm": 1.4647986744960786, "learning_rate": 9.006410256410258e-06, "loss": 1.3413, "step": 283 }, { "epoch": 0.01, "grad_norm": 1.2104548586621704, "learning_rate": 9.03846153846154e-06, "loss": 1.4502, "step": 284 }, { "epoch": 0.01, "grad_norm": 1.8286186993485871, "learning_rate": 9.070512820512822e-06, "loss": 1.5854, "step": 285 }, { "epoch": 0.01, "grad_norm": 1.5998701932811834, "learning_rate": 9.102564102564104e-06, "loss": 1.4468, "step": 286 }, { "epoch": 0.01, "grad_norm": 1.6678199201254207, "learning_rate": 9.134615384615384e-06, "loss": 1.3965, "step": 287 }, { "epoch": 0.01, "grad_norm": 1.3070061722415838, "learning_rate": 9.166666666666666e-06, "loss": 1.6094, "step": 288 }, { "epoch": 0.01, "grad_norm": 1.225972205579119, "learning_rate": 9.198717948717949e-06, "loss": 1.3271, "step": 289 }, { "epoch": 0.01, "grad_norm": 1.3148158638144016, "learning_rate": 9.230769230769232e-06, "loss": 1.3511, "step": 290 }, { "epoch": 0.01, "grad_norm": 1.4129495542533224, "learning_rate": 9.262820512820514e-06, "loss": 1.5039, "step": 291 }, { "epoch": 0.01, "grad_norm": 1.4576480389490902, "learning_rate": 9.294871794871796e-06, "loss": 1.3574, "step": 292 }, { "epoch": 0.01, "grad_norm": 1.3393706825067717, "learning_rate": 9.326923076923079e-06, "loss": 1.6074, "step": 293 }, { "epoch": 0.01, "grad_norm": 1.5610340026880416, "learning_rate": 9.358974358974359e-06, "loss": 1.2837, "step": 294 }, { "epoch": 0.01, "grad_norm": 1.3423965904877306, "learning_rate": 9.391025641025641e-06, "loss": 1.3682, "step": 295 }, { "epoch": 0.01, "grad_norm": 1.4485470387685437, "learning_rate": 9.423076923076923e-06, "loss": 1.4712, "step": 296 }, { "epoch": 0.01, "grad_norm": 1.5008676191378, "learning_rate": 9.455128205128205e-06, "loss": 1.4629, "step": 297 }, { "epoch": 0.01, "grad_norm": 1.7676816459022766, "learning_rate": 9.487179487179487e-06, "loss": 1.5156, "step": 298 }, { "epoch": 0.01, "grad_norm": 2.1699144889775472, "learning_rate": 9.51923076923077e-06, "loss": 1.6279, "step": 299 }, { "epoch": 0.01, "grad_norm": 1.3484712772438978, "learning_rate": 9.551282051282053e-06, "loss": 1.4512, "step": 300 }, { "epoch": 0.01, "grad_norm": 1.3503855576206905, "learning_rate": 9.583333333333335e-06, "loss": 1.4546, "step": 301 }, { "epoch": 0.01, "grad_norm": 1.2700300079713553, "learning_rate": 9.615384615384616e-06, "loss": 1.3384, "step": 302 }, { "epoch": 0.01, "grad_norm": 1.465131921505914, "learning_rate": 9.647435897435898e-06, "loss": 1.3301, "step": 303 }, { "epoch": 0.01, "grad_norm": 1.3338387423956481, "learning_rate": 9.67948717948718e-06, "loss": 1.5278, "step": 304 }, { "epoch": 0.01, "grad_norm": 1.8912144068383967, "learning_rate": 9.711538461538462e-06, "loss": 1.481, "step": 305 }, { "epoch": 0.01, "grad_norm": 1.5121533642594989, "learning_rate": 9.743589743589744e-06, "loss": 1.3169, "step": 306 }, { "epoch": 0.01, "grad_norm": 1.433347119046012, "learning_rate": 9.775641025641026e-06, "loss": 1.2729, "step": 307 }, { "epoch": 0.01, "grad_norm": 1.186288832857276, "learning_rate": 9.807692307692308e-06, "loss": 1.4258, "step": 308 }, { "epoch": 0.01, "grad_norm": 1.5794770512780545, "learning_rate": 9.83974358974359e-06, "loss": 1.4204, "step": 309 }, { "epoch": 0.01, "grad_norm": 1.3527863244606433, "learning_rate": 9.871794871794872e-06, "loss": 1.4858, "step": 310 }, { "epoch": 0.01, "grad_norm": 1.326924198253285, "learning_rate": 9.903846153846155e-06, "loss": 1.5527, "step": 311 }, { "epoch": 0.02, "grad_norm": 1.6396670891042595, "learning_rate": 9.935897435897437e-06, "loss": 1.4624, "step": 312 }, { "epoch": 0.02, "grad_norm": 1.4244566829672403, "learning_rate": 9.967948717948719e-06, "loss": 1.4019, "step": 313 }, { "epoch": 0.02, "grad_norm": 1.236435094721605, "learning_rate": 1e-05, "loss": 1.3257, "step": 314 }, { "epoch": 0.02, "grad_norm": 1.1019481324141696, "learning_rate": 1.0032051282051283e-05, "loss": 1.4155, "step": 315 }, { "epoch": 0.02, "grad_norm": 1.4697759039594815, "learning_rate": 1.0064102564102565e-05, "loss": 1.4448, "step": 316 }, { "epoch": 0.02, "grad_norm": 1.6143069471150746, "learning_rate": 1.0096153846153847e-05, "loss": 1.3887, "step": 317 }, { "epoch": 0.02, "grad_norm": 1.464365054230673, "learning_rate": 1.012820512820513e-05, "loss": 1.498, "step": 318 }, { "epoch": 0.02, "grad_norm": 1.2608191515062133, "learning_rate": 1.0160256410256411e-05, "loss": 1.3281, "step": 319 }, { "epoch": 0.02, "grad_norm": 1.302480276204567, "learning_rate": 1.0192307692307692e-05, "loss": 1.3628, "step": 320 }, { "epoch": 0.02, "grad_norm": 1.942889152987852, "learning_rate": 1.0224358974358974e-05, "loss": 1.605, "step": 321 }, { "epoch": 0.02, "grad_norm": 1.5261387927655037, "learning_rate": 1.0256410256410256e-05, "loss": 1.4165, "step": 322 }, { "epoch": 0.02, "grad_norm": 1.206259089764386, "learning_rate": 1.0288461538461538e-05, "loss": 1.457, "step": 323 }, { "epoch": 0.02, "grad_norm": 1.2720669728637408, "learning_rate": 1.0320512820512822e-05, "loss": 1.3384, "step": 324 }, { "epoch": 0.02, "grad_norm": 1.3504509929969915, "learning_rate": 1.0352564102564104e-05, "loss": 1.3066, "step": 325 }, { "epoch": 0.02, "grad_norm": 1.265338651108809, "learning_rate": 1.0384615384615386e-05, "loss": 1.5186, "step": 326 }, { "epoch": 0.02, "grad_norm": 1.409351609089138, "learning_rate": 1.0416666666666668e-05, "loss": 1.4336, "step": 327 }, { "epoch": 0.02, "grad_norm": 1.1841520862995354, "learning_rate": 1.044871794871795e-05, "loss": 1.4272, "step": 328 }, { "epoch": 0.02, "grad_norm": 1.5794139564525111, "learning_rate": 1.0480769230769232e-05, "loss": 1.416, "step": 329 }, { "epoch": 0.02, "grad_norm": 1.6907594388650082, "learning_rate": 1.0512820512820514e-05, "loss": 1.3145, "step": 330 }, { "epoch": 0.02, "grad_norm": 1.0994459480533283, "learning_rate": 1.0544871794871796e-05, "loss": 1.3394, "step": 331 }, { "epoch": 0.02, "grad_norm": 1.377055170677683, "learning_rate": 1.0576923076923078e-05, "loss": 1.4604, "step": 332 }, { "epoch": 0.02, "grad_norm": 1.3129708771716166, "learning_rate": 1.060897435897436e-05, "loss": 1.3726, "step": 333 }, { "epoch": 0.02, "grad_norm": 1.5016114605414075, "learning_rate": 1.0641025641025643e-05, "loss": 1.4448, "step": 334 }, { "epoch": 0.02, "grad_norm": 1.4403819506870998, "learning_rate": 1.0673076923076923e-05, "loss": 1.478, "step": 335 }, { "epoch": 0.02, "grad_norm": 1.2893505889314858, "learning_rate": 1.0705128205128205e-05, "loss": 1.4937, "step": 336 }, { "epoch": 0.02, "grad_norm": 1.2944883906385094, "learning_rate": 1.0737179487179487e-05, "loss": 1.3599, "step": 337 }, { "epoch": 0.02, "grad_norm": 1.0415205266579957, "learning_rate": 1.076923076923077e-05, "loss": 1.3447, "step": 338 }, { "epoch": 0.02, "grad_norm": 1.6037439814419903, "learning_rate": 1.0801282051282051e-05, "loss": 1.563, "step": 339 }, { "epoch": 0.02, "grad_norm": 1.38426018047868, "learning_rate": 1.0833333333333334e-05, "loss": 1.3701, "step": 340 }, { "epoch": 0.02, "grad_norm": 1.1618201751547155, "learning_rate": 1.0865384615384616e-05, "loss": 1.3154, "step": 341 }, { "epoch": 0.02, "grad_norm": 1.5823963489019184, "learning_rate": 1.0897435897435898e-05, "loss": 1.5928, "step": 342 }, { "epoch": 0.02, "grad_norm": 1.576241294933178, "learning_rate": 1.092948717948718e-05, "loss": 1.5718, "step": 343 }, { "epoch": 0.02, "grad_norm": 1.388073717584795, "learning_rate": 1.0961538461538464e-05, "loss": 1.3481, "step": 344 }, { "epoch": 0.02, "grad_norm": 1.2535951872034437, "learning_rate": 1.0993589743589746e-05, "loss": 1.3926, "step": 345 }, { "epoch": 0.02, "grad_norm": 1.2944732193698896, "learning_rate": 1.1025641025641028e-05, "loss": 1.436, "step": 346 }, { "epoch": 0.02, "grad_norm": 1.2445746907666893, "learning_rate": 1.105769230769231e-05, "loss": 1.4834, "step": 347 }, { "epoch": 0.02, "grad_norm": 1.196209356878662, "learning_rate": 1.1089743589743592e-05, "loss": 1.5166, "step": 348 }, { "epoch": 0.02, "grad_norm": 1.6188803860018033, "learning_rate": 1.1121794871794872e-05, "loss": 1.46, "step": 349 }, { "epoch": 0.02, "grad_norm": 1.5883844024491693, "learning_rate": 1.1153846153846154e-05, "loss": 1.2119, "step": 350 }, { "epoch": 0.02, "grad_norm": 1.3706784869709538, "learning_rate": 1.1185897435897437e-05, "loss": 1.5693, "step": 351 }, { "epoch": 0.02, "grad_norm": 1.5282244569689725, "learning_rate": 1.1217948717948719e-05, "loss": 1.4248, "step": 352 }, { "epoch": 0.02, "grad_norm": 1.1965002388309016, "learning_rate": 1.125e-05, "loss": 1.4424, "step": 353 }, { "epoch": 0.02, "grad_norm": 1.4859550934148806, "learning_rate": 1.1282051282051283e-05, "loss": 1.377, "step": 354 }, { "epoch": 0.02, "grad_norm": 1.3428247798766801, "learning_rate": 1.1314102564102565e-05, "loss": 1.3701, "step": 355 }, { "epoch": 0.02, "grad_norm": 1.5244567142412975, "learning_rate": 1.1346153846153847e-05, "loss": 1.4443, "step": 356 }, { "epoch": 0.02, "grad_norm": 1.39242450188167, "learning_rate": 1.1378205128205129e-05, "loss": 1.4053, "step": 357 }, { "epoch": 0.02, "grad_norm": 1.365903690611676, "learning_rate": 1.1410256410256411e-05, "loss": 1.3633, "step": 358 }, { "epoch": 0.02, "grad_norm": 1.3300293389531603, "learning_rate": 1.1442307692307693e-05, "loss": 1.3662, "step": 359 }, { "epoch": 0.02, "grad_norm": 1.4397452366690222, "learning_rate": 1.1474358974358974e-05, "loss": 1.4614, "step": 360 }, { "epoch": 0.02, "grad_norm": 1.543567048299789, "learning_rate": 1.1506410256410256e-05, "loss": 1.5532, "step": 361 }, { "epoch": 0.02, "grad_norm": 1.41321670314368, "learning_rate": 1.1538461538461538e-05, "loss": 1.3584, "step": 362 }, { "epoch": 0.02, "grad_norm": 1.1748040935502257, "learning_rate": 1.1570512820512823e-05, "loss": 1.313, "step": 363 }, { "epoch": 0.02, "grad_norm": 1.3532571008432337, "learning_rate": 1.1602564102564104e-05, "loss": 1.2495, "step": 364 }, { "epoch": 0.02, "grad_norm": 1.3518482258480287, "learning_rate": 1.1634615384615386e-05, "loss": 1.3379, "step": 365 }, { "epoch": 0.02, "grad_norm": 1.332411463979091, "learning_rate": 1.1666666666666668e-05, "loss": 1.3198, "step": 366 }, { "epoch": 0.02, "grad_norm": 1.0707442437044281, "learning_rate": 1.169871794871795e-05, "loss": 1.0234, "step": 367 }, { "epoch": 0.02, "grad_norm": 1.6457862664444303, "learning_rate": 1.1730769230769232e-05, "loss": 1.5063, "step": 368 }, { "epoch": 0.02, "grad_norm": 1.476115170967113, "learning_rate": 1.1762820512820514e-05, "loss": 1.5054, "step": 369 }, { "epoch": 0.02, "grad_norm": 1.3399981445100086, "learning_rate": 1.1794871794871796e-05, "loss": 1.4072, "step": 370 }, { "epoch": 0.02, "grad_norm": 1.6543183279593394, "learning_rate": 1.1826923076923078e-05, "loss": 1.519, "step": 371 }, { "epoch": 0.02, "grad_norm": 1.4376527933589291, "learning_rate": 1.185897435897436e-05, "loss": 1.3296, "step": 372 }, { "epoch": 0.02, "grad_norm": 1.8176659158068955, "learning_rate": 1.1891025641025643e-05, "loss": 1.4937, "step": 373 }, { "epoch": 0.02, "grad_norm": 1.5771949777504417, "learning_rate": 1.1923076923076925e-05, "loss": 1.3008, "step": 374 }, { "epoch": 0.02, "grad_norm": 1.4240240847785275, "learning_rate": 1.1955128205128205e-05, "loss": 1.415, "step": 375 }, { "epoch": 0.02, "grad_norm": 1.4991960350937326, "learning_rate": 1.1987179487179487e-05, "loss": 1.4409, "step": 376 }, { "epoch": 0.02, "grad_norm": 1.273472829846749, "learning_rate": 1.201923076923077e-05, "loss": 1.481, "step": 377 }, { "epoch": 0.02, "grad_norm": 1.5817076265575158, "learning_rate": 1.2051282051282051e-05, "loss": 1.3716, "step": 378 }, { "epoch": 0.02, "grad_norm": 1.2303190554901131, "learning_rate": 1.2083333333333333e-05, "loss": 1.4932, "step": 379 }, { "epoch": 0.02, "grad_norm": 1.3208287584759297, "learning_rate": 1.2115384615384615e-05, "loss": 1.3291, "step": 380 }, { "epoch": 0.02, "grad_norm": 1.2963875519764945, "learning_rate": 1.2147435897435898e-05, "loss": 1.3232, "step": 381 }, { "epoch": 0.02, "grad_norm": 1.3919491990970962, "learning_rate": 1.217948717948718e-05, "loss": 1.2808, "step": 382 }, { "epoch": 0.02, "grad_norm": 1.2849178005288528, "learning_rate": 1.2211538461538463e-05, "loss": 1.2964, "step": 383 }, { "epoch": 0.02, "grad_norm": 1.3675399517474263, "learning_rate": 1.2243589743589746e-05, "loss": 1.5195, "step": 384 }, { "epoch": 0.02, "grad_norm": 1.3977606893412815, "learning_rate": 1.2275641025641028e-05, "loss": 1.3066, "step": 385 }, { "epoch": 0.02, "grad_norm": 1.2562214350443868, "learning_rate": 1.230769230769231e-05, "loss": 1.3281, "step": 386 }, { "epoch": 0.02, "grad_norm": 1.5722753928615885, "learning_rate": 1.2339743589743592e-05, "loss": 1.3892, "step": 387 }, { "epoch": 0.02, "grad_norm": 1.2486043672664813, "learning_rate": 1.2371794871794874e-05, "loss": 1.397, "step": 388 }, { "epoch": 0.02, "grad_norm": 1.2850751024473135, "learning_rate": 1.2403846153846156e-05, "loss": 1.4736, "step": 389 }, { "epoch": 0.02, "grad_norm": 1.4071165180325196, "learning_rate": 1.2435897435897436e-05, "loss": 1.353, "step": 390 }, { "epoch": 0.02, "grad_norm": 2.0851094197433793, "learning_rate": 1.2467948717948719e-05, "loss": 1.5176, "step": 391 }, { "epoch": 0.02, "grad_norm": 1.3229683183230674, "learning_rate": 1.25e-05, "loss": 1.4102, "step": 392 }, { "epoch": 0.02, "grad_norm": 1.4853370638473593, "learning_rate": 1.2532051282051283e-05, "loss": 1.2668, "step": 393 }, { "epoch": 0.02, "grad_norm": 1.1870355905247647, "learning_rate": 1.2564102564102565e-05, "loss": 1.3403, "step": 394 }, { "epoch": 0.02, "grad_norm": 1.504089343823328, "learning_rate": 1.2596153846153847e-05, "loss": 1.332, "step": 395 }, { "epoch": 0.02, "grad_norm": 1.5321805668699346, "learning_rate": 1.2628205128205129e-05, "loss": 1.3467, "step": 396 }, { "epoch": 0.02, "grad_norm": 1.5823430646814343, "learning_rate": 1.2660256410256411e-05, "loss": 1.5112, "step": 397 }, { "epoch": 0.02, "grad_norm": 1.2761497624577036, "learning_rate": 1.2692307692307693e-05, "loss": 1.3232, "step": 398 }, { "epoch": 0.02, "grad_norm": 1.1549200773500554, "learning_rate": 1.2724358974358975e-05, "loss": 1.396, "step": 399 }, { "epoch": 0.02, "grad_norm": 1.267847493836026, "learning_rate": 1.2756410256410257e-05, "loss": 1.2495, "step": 400 }, { "epoch": 0.02, "grad_norm": 1.3184570422607709, "learning_rate": 1.2788461538461538e-05, "loss": 1.3213, "step": 401 }, { "epoch": 0.02, "grad_norm": 1.2403072183011512, "learning_rate": 1.2820512820512823e-05, "loss": 1.3247, "step": 402 }, { "epoch": 0.02, "grad_norm": 1.3597493157154914, "learning_rate": 1.2852564102564105e-05, "loss": 1.6499, "step": 403 }, { "epoch": 0.02, "grad_norm": 1.214172883747685, "learning_rate": 1.2884615384615386e-05, "loss": 1.4644, "step": 404 }, { "epoch": 0.02, "grad_norm": 1.3371797055041754, "learning_rate": 1.2916666666666668e-05, "loss": 1.3667, "step": 405 }, { "epoch": 0.02, "grad_norm": 1.0974180779714644, "learning_rate": 1.294871794871795e-05, "loss": 1.3931, "step": 406 }, { "epoch": 0.02, "grad_norm": 1.218185854267764, "learning_rate": 1.2980769230769232e-05, "loss": 1.394, "step": 407 }, { "epoch": 0.02, "grad_norm": 1.300208779188169, "learning_rate": 1.3012820512820514e-05, "loss": 1.332, "step": 408 }, { "epoch": 0.02, "grad_norm": 1.276319102582932, "learning_rate": 1.3044871794871796e-05, "loss": 1.0845, "step": 409 }, { "epoch": 0.02, "grad_norm": 1.23973437135605, "learning_rate": 1.3076923076923078e-05, "loss": 1.3481, "step": 410 }, { "epoch": 0.02, "grad_norm": 1.3170565442346465, "learning_rate": 1.310897435897436e-05, "loss": 1.4185, "step": 411 }, { "epoch": 0.02, "grad_norm": 1.4969675783159206, "learning_rate": 1.3141025641025642e-05, "loss": 1.354, "step": 412 }, { "epoch": 0.02, "grad_norm": 1.7639450193235466, "learning_rate": 1.3173076923076925e-05, "loss": 1.3486, "step": 413 }, { "epoch": 0.02, "grad_norm": 1.9263564012989225, "learning_rate": 1.3205128205128207e-05, "loss": 1.6157, "step": 414 }, { "epoch": 0.02, "grad_norm": 1.2579722293146822, "learning_rate": 1.3237179487179487e-05, "loss": 1.2314, "step": 415 }, { "epoch": 0.02, "grad_norm": 1.5875289321322101, "learning_rate": 1.3269230769230769e-05, "loss": 1.3984, "step": 416 }, { "epoch": 0.02, "grad_norm": 1.4036819396205769, "learning_rate": 1.3301282051282051e-05, "loss": 1.2651, "step": 417 }, { "epoch": 0.02, "grad_norm": 1.3607301170192805, "learning_rate": 1.3333333333333333e-05, "loss": 1.4873, "step": 418 }, { "epoch": 0.02, "grad_norm": 1.2216593704601035, "learning_rate": 1.3365384615384615e-05, "loss": 1.2241, "step": 419 }, { "epoch": 0.02, "grad_norm": 1.2389080776413752, "learning_rate": 1.3397435897435897e-05, "loss": 1.2642, "step": 420 }, { "epoch": 0.02, "grad_norm": 1.117370907316905, "learning_rate": 1.342948717948718e-05, "loss": 1.2358, "step": 421 }, { "epoch": 0.02, "grad_norm": 1.118087185561451, "learning_rate": 1.3461538461538463e-05, "loss": 1.2661, "step": 422 }, { "epoch": 0.02, "grad_norm": 1.2654215672483675, "learning_rate": 1.3493589743589745e-05, "loss": 1.3218, "step": 423 }, { "epoch": 0.02, "grad_norm": 1.447322052774496, "learning_rate": 1.3525641025641028e-05, "loss": 1.5986, "step": 424 }, { "epoch": 0.02, "grad_norm": 1.2858801812393017, "learning_rate": 1.355769230769231e-05, "loss": 1.3184, "step": 425 }, { "epoch": 0.02, "grad_norm": 1.3029973554102459, "learning_rate": 1.3589743589743592e-05, "loss": 1.3193, "step": 426 }, { "epoch": 0.02, "grad_norm": 1.3263639292111544, "learning_rate": 1.3621794871794874e-05, "loss": 1.3906, "step": 427 }, { "epoch": 0.02, "grad_norm": 1.059254631454006, "learning_rate": 1.3653846153846156e-05, "loss": 1.2583, "step": 428 }, { "epoch": 0.02, "grad_norm": 1.2836032506218642, "learning_rate": 1.3685897435897438e-05, "loss": 1.3071, "step": 429 }, { "epoch": 0.02, "grad_norm": 1.2337693323714396, "learning_rate": 1.3717948717948718e-05, "loss": 1.4712, "step": 430 }, { "epoch": 0.02, "grad_norm": 1.492108564766739, "learning_rate": 1.375e-05, "loss": 1.4907, "step": 431 }, { "epoch": 0.02, "grad_norm": 1.3307189173873828, "learning_rate": 1.3782051282051283e-05, "loss": 1.2534, "step": 432 }, { "epoch": 0.02, "grad_norm": 1.2501749388020176, "learning_rate": 1.3814102564102565e-05, "loss": 1.2495, "step": 433 }, { "epoch": 0.02, "grad_norm": 1.4581552746325526, "learning_rate": 1.3846153846153847e-05, "loss": 1.3721, "step": 434 }, { "epoch": 0.02, "grad_norm": 1.5518329244245812, "learning_rate": 1.3878205128205129e-05, "loss": 1.3267, "step": 435 }, { "epoch": 0.02, "grad_norm": 1.8797997336582097, "learning_rate": 1.3910256410256411e-05, "loss": 1.5161, "step": 436 }, { "epoch": 0.02, "grad_norm": 1.1566499953624272, "learning_rate": 1.3942307692307693e-05, "loss": 1.4575, "step": 437 }, { "epoch": 0.02, "grad_norm": 1.4273618093245022, "learning_rate": 1.3974358974358975e-05, "loss": 1.4155, "step": 438 }, { "epoch": 0.02, "grad_norm": 1.418291971683312, "learning_rate": 1.4006410256410257e-05, "loss": 1.4375, "step": 439 }, { "epoch": 0.02, "grad_norm": 1.2541249133240433, "learning_rate": 1.403846153846154e-05, "loss": 1.4771, "step": 440 }, { "epoch": 0.02, "grad_norm": 1.1462392922714209, "learning_rate": 1.4070512820512823e-05, "loss": 1.3403, "step": 441 }, { "epoch": 0.02, "grad_norm": 1.2094459866707437, "learning_rate": 1.4102564102564105e-05, "loss": 1.4746, "step": 442 }, { "epoch": 0.02, "grad_norm": 1.391059730506335, "learning_rate": 1.4134615384615387e-05, "loss": 1.4907, "step": 443 }, { "epoch": 0.02, "grad_norm": 1.4291849581594014, "learning_rate": 1.416666666666667e-05, "loss": 1.4214, "step": 444 }, { "epoch": 0.02, "grad_norm": 1.3849603958342436, "learning_rate": 1.419871794871795e-05, "loss": 1.4814, "step": 445 }, { "epoch": 0.02, "grad_norm": 1.4123325940874079, "learning_rate": 1.4230769230769232e-05, "loss": 1.3867, "step": 446 }, { "epoch": 0.02, "grad_norm": 1.2491519301399518, "learning_rate": 1.4262820512820514e-05, "loss": 1.4355, "step": 447 }, { "epoch": 0.02, "grad_norm": 1.385088745064307, "learning_rate": 1.4294871794871796e-05, "loss": 1.4468, "step": 448 }, { "epoch": 0.02, "grad_norm": 1.4034533875763116, "learning_rate": 1.4326923076923078e-05, "loss": 1.4443, "step": 449 }, { "epoch": 0.02, "grad_norm": 1.6690925808258177, "learning_rate": 1.435897435897436e-05, "loss": 1.394, "step": 450 }, { "epoch": 0.02, "grad_norm": 1.382429368409419, "learning_rate": 1.4391025641025642e-05, "loss": 1.4604, "step": 451 }, { "epoch": 0.02, "grad_norm": 1.3383284079637572, "learning_rate": 1.4423076923076924e-05, "loss": 1.1294, "step": 452 }, { "epoch": 0.02, "grad_norm": 1.2705889899306166, "learning_rate": 1.4455128205128207e-05, "loss": 1.376, "step": 453 }, { "epoch": 0.02, "grad_norm": 1.1686124888717497, "learning_rate": 1.4487179487179489e-05, "loss": 1.2856, "step": 454 }, { "epoch": 0.02, "grad_norm": 1.2749741389421876, "learning_rate": 1.451923076923077e-05, "loss": 1.2329, "step": 455 }, { "epoch": 0.02, "grad_norm": 1.2580765489359371, "learning_rate": 1.4551282051282051e-05, "loss": 1.2793, "step": 456 }, { "epoch": 0.02, "grad_norm": 1.3592702221041502, "learning_rate": 1.4583333333333333e-05, "loss": 1.3877, "step": 457 }, { "epoch": 0.02, "grad_norm": 1.3895434960715267, "learning_rate": 1.4615384615384615e-05, "loss": 1.4829, "step": 458 }, { "epoch": 0.02, "grad_norm": 1.7269809081369298, "learning_rate": 1.4647435897435897e-05, "loss": 1.4624, "step": 459 }, { "epoch": 0.02, "grad_norm": 1.5239890261842506, "learning_rate": 1.467948717948718e-05, "loss": 1.4985, "step": 460 }, { "epoch": 0.02, "grad_norm": 1.4313301615232084, "learning_rate": 1.4711538461538463e-05, "loss": 1.3398, "step": 461 }, { "epoch": 0.02, "grad_norm": 1.2783271879799631, "learning_rate": 1.4743589743589745e-05, "loss": 1.3608, "step": 462 }, { "epoch": 0.02, "grad_norm": 1.3457406427324083, "learning_rate": 1.4775641025641027e-05, "loss": 1.418, "step": 463 }, { "epoch": 0.02, "grad_norm": 1.3931762446456175, "learning_rate": 1.480769230769231e-05, "loss": 1.3604, "step": 464 }, { "epoch": 0.02, "grad_norm": 1.4288505584678168, "learning_rate": 1.4839743589743592e-05, "loss": 1.4463, "step": 465 }, { "epoch": 0.02, "grad_norm": 1.342985301820977, "learning_rate": 1.4871794871794874e-05, "loss": 1.1196, "step": 466 }, { "epoch": 0.02, "grad_norm": 1.4593839183430386, "learning_rate": 1.4903846153846156e-05, "loss": 1.3877, "step": 467 }, { "epoch": 0.02, "grad_norm": 1.4032202932228555, "learning_rate": 1.4935897435897438e-05, "loss": 1.2837, "step": 468 }, { "epoch": 0.02, "grad_norm": 1.5592303289765068, "learning_rate": 1.496794871794872e-05, "loss": 1.584, "step": 469 }, { "epoch": 0.02, "grad_norm": 1.0671623940891528, "learning_rate": 1.5000000000000002e-05, "loss": 1.3833, "step": 470 }, { "epoch": 0.02, "grad_norm": 1.688656676510716, "learning_rate": 1.5032051282051282e-05, "loss": 1.3125, "step": 471 }, { "epoch": 0.02, "grad_norm": 1.3259426919079789, "learning_rate": 1.5064102564102565e-05, "loss": 1.3774, "step": 472 }, { "epoch": 0.02, "grad_norm": 1.5122403593391645, "learning_rate": 1.5096153846153847e-05, "loss": 1.3193, "step": 473 }, { "epoch": 0.02, "grad_norm": 1.298995478132946, "learning_rate": 1.5128205128205129e-05, "loss": 1.3608, "step": 474 }, { "epoch": 0.02, "grad_norm": 1.453250029728073, "learning_rate": 1.516025641025641e-05, "loss": 1.3594, "step": 475 }, { "epoch": 0.02, "grad_norm": 1.50172762292981, "learning_rate": 1.5192307692307693e-05, "loss": 1.4419, "step": 476 }, { "epoch": 0.02, "grad_norm": 1.2655914489475526, "learning_rate": 1.5224358974358975e-05, "loss": 1.3872, "step": 477 }, { "epoch": 0.02, "grad_norm": 1.5007388470501197, "learning_rate": 1.5256410256410257e-05, "loss": 1.4858, "step": 478 }, { "epoch": 0.02, "grad_norm": 1.3411706083877026, "learning_rate": 1.528846153846154e-05, "loss": 1.4971, "step": 479 }, { "epoch": 0.02, "grad_norm": 1.6111704141004897, "learning_rate": 1.5320512820512823e-05, "loss": 1.5166, "step": 480 }, { "epoch": 0.02, "grad_norm": 1.2560735029638563, "learning_rate": 1.5352564102564103e-05, "loss": 1.2227, "step": 481 }, { "epoch": 0.02, "grad_norm": 1.710631494146579, "learning_rate": 1.5384615384615387e-05, "loss": 1.4624, "step": 482 }, { "epoch": 0.02, "grad_norm": 1.4471676155677404, "learning_rate": 1.5416666666666668e-05, "loss": 1.4858, "step": 483 }, { "epoch": 0.02, "grad_norm": 1.2683673322986313, "learning_rate": 1.544871794871795e-05, "loss": 1.3262, "step": 484 }, { "epoch": 0.02, "grad_norm": 1.3431884638640537, "learning_rate": 1.5480769230769232e-05, "loss": 1.3931, "step": 485 }, { "epoch": 0.02, "grad_norm": 1.5859419898497757, "learning_rate": 1.5512820512820516e-05, "loss": 1.5361, "step": 486 }, { "epoch": 0.02, "grad_norm": 1.1940940906097317, "learning_rate": 1.5544871794871796e-05, "loss": 1.3555, "step": 487 }, { "epoch": 0.02, "grad_norm": 1.208184860037386, "learning_rate": 1.557692307692308e-05, "loss": 1.3296, "step": 488 }, { "epoch": 0.02, "grad_norm": 1.6571858267866035, "learning_rate": 1.560897435897436e-05, "loss": 1.5972, "step": 489 }, { "epoch": 0.02, "grad_norm": 1.2572573420259763, "learning_rate": 1.5641025641025644e-05, "loss": 1.3457, "step": 490 }, { "epoch": 0.02, "grad_norm": 1.129851551616406, "learning_rate": 1.5673076923076924e-05, "loss": 1.2217, "step": 491 }, { "epoch": 0.02, "grad_norm": 1.503313161081039, "learning_rate": 1.5705128205128205e-05, "loss": 1.3223, "step": 492 }, { "epoch": 0.02, "grad_norm": 1.2548475152856144, "learning_rate": 1.573717948717949e-05, "loss": 1.4355, "step": 493 }, { "epoch": 0.02, "grad_norm": 1.3378438528405046, "learning_rate": 1.576923076923077e-05, "loss": 1.3794, "step": 494 }, { "epoch": 0.02, "grad_norm": 1.3908827505746522, "learning_rate": 1.5801282051282053e-05, "loss": 1.313, "step": 495 }, { "epoch": 0.02, "grad_norm": 1.5951990819483384, "learning_rate": 1.5833333333333333e-05, "loss": 1.2749, "step": 496 }, { "epoch": 0.02, "grad_norm": 1.4305845608831436, "learning_rate": 1.5865384615384617e-05, "loss": 1.3608, "step": 497 }, { "epoch": 0.02, "grad_norm": 1.4012784517118198, "learning_rate": 1.5897435897435897e-05, "loss": 1.356, "step": 498 }, { "epoch": 0.02, "grad_norm": 1.5294506791195728, "learning_rate": 1.592948717948718e-05, "loss": 1.4619, "step": 499 }, { "epoch": 0.02, "grad_norm": 1.3985439146611949, "learning_rate": 1.5961538461538465e-05, "loss": 1.46, "step": 500 }, { "epoch": 0.02, "grad_norm": 1.6209106621364877, "learning_rate": 1.5993589743589745e-05, "loss": 1.4336, "step": 501 }, { "epoch": 0.02, "grad_norm": 1.19239001982262, "learning_rate": 1.602564102564103e-05, "loss": 1.3608, "step": 502 }, { "epoch": 0.02, "grad_norm": 1.4225331684582514, "learning_rate": 1.605769230769231e-05, "loss": 1.4028, "step": 503 }, { "epoch": 0.02, "grad_norm": 1.3313304414095088, "learning_rate": 1.6089743589743593e-05, "loss": 1.1594, "step": 504 }, { "epoch": 0.02, "grad_norm": 1.2610625192883973, "learning_rate": 1.6121794871794874e-05, "loss": 1.3315, "step": 505 }, { "epoch": 0.02, "grad_norm": 1.1458905809275148, "learning_rate": 1.6153846153846154e-05, "loss": 1.437, "step": 506 }, { "epoch": 0.02, "grad_norm": 1.640721088627141, "learning_rate": 1.6185897435897438e-05, "loss": 1.5537, "step": 507 }, { "epoch": 0.02, "grad_norm": 1.281672844333867, "learning_rate": 1.6217948717948718e-05, "loss": 1.1401, "step": 508 }, { "epoch": 0.02, "grad_norm": 1.305292744782369, "learning_rate": 1.6250000000000002e-05, "loss": 1.1514, "step": 509 }, { "epoch": 0.02, "grad_norm": 1.0516093435593628, "learning_rate": 1.6282051282051282e-05, "loss": 1.2852, "step": 510 }, { "epoch": 0.02, "grad_norm": 1.2679003136024116, "learning_rate": 1.6314102564102566e-05, "loss": 1.4624, "step": 511 }, { "epoch": 0.02, "grad_norm": 1.3047158184357295, "learning_rate": 1.6346153846153847e-05, "loss": 1.4932, "step": 512 }, { "epoch": 0.02, "grad_norm": 1.3715912148181022, "learning_rate": 1.637820512820513e-05, "loss": 1.3623, "step": 513 }, { "epoch": 0.02, "grad_norm": 1.4661172391319894, "learning_rate": 1.641025641025641e-05, "loss": 1.4819, "step": 514 }, { "epoch": 0.02, "grad_norm": 1.428185708269545, "learning_rate": 1.6442307692307695e-05, "loss": 1.2607, "step": 515 }, { "epoch": 0.02, "grad_norm": 1.421599428979329, "learning_rate": 1.6474358974358975e-05, "loss": 1.1675, "step": 516 }, { "epoch": 0.02, "grad_norm": 1.4434464214298612, "learning_rate": 1.6506410256410255e-05, "loss": 1.2471, "step": 517 }, { "epoch": 0.02, "grad_norm": 1.2912728677963612, "learning_rate": 1.653846153846154e-05, "loss": 1.4097, "step": 518 }, { "epoch": 0.02, "grad_norm": 1.37928644520339, "learning_rate": 1.6570512820512823e-05, "loss": 1.3325, "step": 519 }, { "epoch": 0.03, "grad_norm": 1.5243439790781024, "learning_rate": 1.6602564102564103e-05, "loss": 1.4595, "step": 520 }, { "epoch": 0.03, "grad_norm": 1.3872808014398006, "learning_rate": 1.6634615384615387e-05, "loss": 1.416, "step": 521 }, { "epoch": 0.03, "grad_norm": 1.252778685941513, "learning_rate": 1.6666666666666667e-05, "loss": 1.3066, "step": 522 }, { "epoch": 0.03, "grad_norm": 1.4097986795543291, "learning_rate": 1.669871794871795e-05, "loss": 1.3564, "step": 523 }, { "epoch": 0.03, "grad_norm": 1.5960003013699287, "learning_rate": 1.673076923076923e-05, "loss": 1.3701, "step": 524 }, { "epoch": 0.03, "grad_norm": 1.215067235980445, "learning_rate": 1.6762820512820515e-05, "loss": 1.3301, "step": 525 }, { "epoch": 0.03, "grad_norm": 1.6221162089496706, "learning_rate": 1.6794871794871796e-05, "loss": 1.3198, "step": 526 }, { "epoch": 0.03, "grad_norm": 1.169658269624973, "learning_rate": 1.682692307692308e-05, "loss": 1.2725, "step": 527 }, { "epoch": 0.03, "grad_norm": 1.2641198357132362, "learning_rate": 1.685897435897436e-05, "loss": 1.3477, "step": 528 }, { "epoch": 0.03, "grad_norm": 1.3383105949534142, "learning_rate": 1.6891025641025644e-05, "loss": 1.3906, "step": 529 }, { "epoch": 0.03, "grad_norm": 1.2125034834919766, "learning_rate": 1.6923076923076924e-05, "loss": 1.2959, "step": 530 }, { "epoch": 0.03, "grad_norm": 1.2193697422118333, "learning_rate": 1.6955128205128205e-05, "loss": 1.1348, "step": 531 }, { "epoch": 0.03, "grad_norm": 1.0483726110681362, "learning_rate": 1.698717948717949e-05, "loss": 1.125, "step": 532 }, { "epoch": 0.03, "grad_norm": 1.301435717124425, "learning_rate": 1.701923076923077e-05, "loss": 1.397, "step": 533 }, { "epoch": 0.03, "grad_norm": 1.4560095870444878, "learning_rate": 1.7051282051282053e-05, "loss": 1.4502, "step": 534 }, { "epoch": 0.03, "grad_norm": 1.5803969973354353, "learning_rate": 1.7083333333333333e-05, "loss": 1.479, "step": 535 }, { "epoch": 0.03, "grad_norm": 1.1313987662572405, "learning_rate": 1.7115384615384617e-05, "loss": 1.3579, "step": 536 }, { "epoch": 0.03, "grad_norm": 1.4399778764378397, "learning_rate": 1.7147435897435897e-05, "loss": 1.4668, "step": 537 }, { "epoch": 0.03, "grad_norm": 1.0867392422924127, "learning_rate": 1.717948717948718e-05, "loss": 1.3281, "step": 538 }, { "epoch": 0.03, "grad_norm": 1.3560007508174348, "learning_rate": 1.7211538461538465e-05, "loss": 1.3706, "step": 539 }, { "epoch": 0.03, "grad_norm": 1.577184214804529, "learning_rate": 1.7243589743589745e-05, "loss": 1.4482, "step": 540 }, { "epoch": 0.03, "grad_norm": 1.27974949664638, "learning_rate": 1.727564102564103e-05, "loss": 1.272, "step": 541 }, { "epoch": 0.03, "grad_norm": 1.649803265845458, "learning_rate": 1.730769230769231e-05, "loss": 1.5264, "step": 542 }, { "epoch": 0.03, "grad_norm": 1.0330544965306192, "learning_rate": 1.7339743589743593e-05, "loss": 1.293, "step": 543 }, { "epoch": 0.03, "grad_norm": 1.2495061811930788, "learning_rate": 1.7371794871794873e-05, "loss": 1.312, "step": 544 }, { "epoch": 0.03, "grad_norm": 1.4141018467100808, "learning_rate": 1.7403846153846157e-05, "loss": 1.4893, "step": 545 }, { "epoch": 0.03, "grad_norm": 1.4150550588689295, "learning_rate": 1.7435897435897438e-05, "loss": 1.3452, "step": 546 }, { "epoch": 0.03, "grad_norm": 1.3233757069406549, "learning_rate": 1.7467948717948718e-05, "loss": 1.2437, "step": 547 }, { "epoch": 0.03, "grad_norm": 1.3788723528022941, "learning_rate": 1.7500000000000002e-05, "loss": 1.4556, "step": 548 }, { "epoch": 0.03, "grad_norm": 1.525281909384566, "learning_rate": 1.7532051282051282e-05, "loss": 1.3765, "step": 549 }, { "epoch": 0.03, "grad_norm": 1.6300224651437343, "learning_rate": 1.7564102564102566e-05, "loss": 1.3359, "step": 550 }, { "epoch": 0.03, "grad_norm": 0.9827583501770526, "learning_rate": 1.7596153846153846e-05, "loss": 1.4575, "step": 551 }, { "epoch": 0.03, "grad_norm": 1.438150843256687, "learning_rate": 1.762820512820513e-05, "loss": 1.4668, "step": 552 }, { "epoch": 0.03, "grad_norm": 1.3881294157609494, "learning_rate": 1.766025641025641e-05, "loss": 1.4653, "step": 553 }, { "epoch": 0.03, "grad_norm": 1.3573317383984584, "learning_rate": 1.7692307692307694e-05, "loss": 1.4028, "step": 554 }, { "epoch": 0.03, "grad_norm": 1.2324225516442557, "learning_rate": 1.7724358974358975e-05, "loss": 1.2417, "step": 555 }, { "epoch": 0.03, "grad_norm": 1.6460870836234336, "learning_rate": 1.775641025641026e-05, "loss": 1.2334, "step": 556 }, { "epoch": 0.03, "grad_norm": 1.7766022952503593, "learning_rate": 1.778846153846154e-05, "loss": 1.457, "step": 557 }, { "epoch": 0.03, "grad_norm": 1.5932577093243538, "learning_rate": 1.7820512820512823e-05, "loss": 1.4517, "step": 558 }, { "epoch": 0.03, "grad_norm": 1.7265563714545993, "learning_rate": 1.7852564102564107e-05, "loss": 1.5249, "step": 559 }, { "epoch": 0.03, "grad_norm": 1.1021410412480328, "learning_rate": 1.7884615384615387e-05, "loss": 1.4058, "step": 560 }, { "epoch": 0.03, "grad_norm": 1.409330012836106, "learning_rate": 1.7916666666666667e-05, "loss": 1.4609, "step": 561 }, { "epoch": 0.03, "grad_norm": 1.2447276693231564, "learning_rate": 1.794871794871795e-05, "loss": 1.1631, "step": 562 }, { "epoch": 0.03, "grad_norm": 1.326329883052271, "learning_rate": 1.798076923076923e-05, "loss": 1.2812, "step": 563 }, { "epoch": 0.03, "grad_norm": 1.442058547769068, "learning_rate": 1.8012820512820515e-05, "loss": 1.4717, "step": 564 }, { "epoch": 0.03, "grad_norm": 1.3942885748480063, "learning_rate": 1.8044871794871796e-05, "loss": 1.248, "step": 565 }, { "epoch": 0.03, "grad_norm": 1.2460295677194884, "learning_rate": 1.807692307692308e-05, "loss": 1.3457, "step": 566 }, { "epoch": 0.03, "grad_norm": 1.2313538335007477, "learning_rate": 1.810897435897436e-05, "loss": 1.3013, "step": 567 }, { "epoch": 0.03, "grad_norm": 1.405532423083906, "learning_rate": 1.8141025641025644e-05, "loss": 1.4443, "step": 568 }, { "epoch": 0.03, "grad_norm": 1.3862902484466386, "learning_rate": 1.8173076923076924e-05, "loss": 1.4473, "step": 569 }, { "epoch": 0.03, "grad_norm": 1.5067306561733516, "learning_rate": 1.8205128205128208e-05, "loss": 1.3247, "step": 570 }, { "epoch": 0.03, "grad_norm": 1.168314966107389, "learning_rate": 1.8237179487179488e-05, "loss": 1.4487, "step": 571 }, { "epoch": 0.03, "grad_norm": 1.0320831440647809, "learning_rate": 1.826923076923077e-05, "loss": 1.3472, "step": 572 }, { "epoch": 0.03, "grad_norm": 1.4347909403959687, "learning_rate": 1.8301282051282052e-05, "loss": 1.3281, "step": 573 }, { "epoch": 0.03, "grad_norm": 1.6358956841756134, "learning_rate": 1.8333333333333333e-05, "loss": 1.1406, "step": 574 }, { "epoch": 0.03, "grad_norm": 1.5224740569058406, "learning_rate": 1.8365384615384617e-05, "loss": 1.2402, "step": 575 }, { "epoch": 0.03, "grad_norm": 1.472174960466282, "learning_rate": 1.8397435897435897e-05, "loss": 1.4648, "step": 576 }, { "epoch": 0.03, "grad_norm": 1.0629197960929988, "learning_rate": 1.842948717948718e-05, "loss": 1.2329, "step": 577 }, { "epoch": 0.03, "grad_norm": 1.3327914278952546, "learning_rate": 1.8461538461538465e-05, "loss": 1.3281, "step": 578 }, { "epoch": 0.03, "grad_norm": 1.2647857300728194, "learning_rate": 1.8493589743589745e-05, "loss": 1.5117, "step": 579 }, { "epoch": 0.03, "grad_norm": 1.675142820118515, "learning_rate": 1.852564102564103e-05, "loss": 1.4766, "step": 580 }, { "epoch": 0.03, "grad_norm": 1.3347226546451025, "learning_rate": 1.855769230769231e-05, "loss": 1.3975, "step": 581 }, { "epoch": 0.03, "grad_norm": 1.4814023981099653, "learning_rate": 1.8589743589743593e-05, "loss": 1.2139, "step": 582 }, { "epoch": 0.03, "grad_norm": 1.3236718773254184, "learning_rate": 1.8621794871794873e-05, "loss": 1.2974, "step": 583 }, { "epoch": 0.03, "grad_norm": 1.406576698027228, "learning_rate": 1.8653846153846157e-05, "loss": 1.4541, "step": 584 }, { "epoch": 0.03, "grad_norm": 1.1886971561706663, "learning_rate": 1.8685897435897438e-05, "loss": 1.3018, "step": 585 }, { "epoch": 0.03, "grad_norm": 1.607295599832651, "learning_rate": 1.8717948717948718e-05, "loss": 1.4907, "step": 586 }, { "epoch": 0.03, "grad_norm": 1.427253983843186, "learning_rate": 1.8750000000000002e-05, "loss": 1.4482, "step": 587 }, { "epoch": 0.03, "grad_norm": 1.4043818712862395, "learning_rate": 1.8782051282051282e-05, "loss": 1.4121, "step": 588 }, { "epoch": 0.03, "grad_norm": 1.1943086528526208, "learning_rate": 1.8814102564102566e-05, "loss": 1.1504, "step": 589 }, { "epoch": 0.03, "grad_norm": 1.38900044691441, "learning_rate": 1.8846153846153846e-05, "loss": 1.3633, "step": 590 }, { "epoch": 0.03, "grad_norm": 1.2721154130985086, "learning_rate": 1.887820512820513e-05, "loss": 1.3403, "step": 591 }, { "epoch": 0.03, "grad_norm": 1.2023222382446153, "learning_rate": 1.891025641025641e-05, "loss": 1.3726, "step": 592 }, { "epoch": 0.03, "grad_norm": 0.981422317443676, "learning_rate": 1.8942307692307694e-05, "loss": 1.2163, "step": 593 }, { "epoch": 0.03, "grad_norm": 1.4300202234460881, "learning_rate": 1.8974358974358975e-05, "loss": 1.4526, "step": 594 }, { "epoch": 0.03, "grad_norm": 1.3870582048133047, "learning_rate": 1.900641025641026e-05, "loss": 1.4004, "step": 595 }, { "epoch": 0.03, "grad_norm": 1.3147265536325958, "learning_rate": 1.903846153846154e-05, "loss": 1.438, "step": 596 }, { "epoch": 0.03, "grad_norm": 1.2256434276808312, "learning_rate": 1.9070512820512823e-05, "loss": 1.3521, "step": 597 }, { "epoch": 0.03, "grad_norm": 1.708185371555497, "learning_rate": 1.9102564102564106e-05, "loss": 1.2197, "step": 598 }, { "epoch": 0.03, "grad_norm": 1.3909140275716405, "learning_rate": 1.9134615384615387e-05, "loss": 1.23, "step": 599 }, { "epoch": 0.03, "grad_norm": 1.6289224779159834, "learning_rate": 1.916666666666667e-05, "loss": 1.3921, "step": 600 }, { "epoch": 0.03, "grad_norm": 1.4880436944566162, "learning_rate": 1.919871794871795e-05, "loss": 1.2505, "step": 601 }, { "epoch": 0.03, "grad_norm": 1.6331105528274013, "learning_rate": 1.923076923076923e-05, "loss": 1.3555, "step": 602 }, { "epoch": 0.03, "grad_norm": 1.254716190535359, "learning_rate": 1.9262820512820515e-05, "loss": 1.4829, "step": 603 }, { "epoch": 0.03, "grad_norm": 1.4288505474526056, "learning_rate": 1.9294871794871796e-05, "loss": 1.4751, "step": 604 }, { "epoch": 0.03, "grad_norm": 1.8105580735154534, "learning_rate": 1.932692307692308e-05, "loss": 1.4346, "step": 605 }, { "epoch": 0.03, "grad_norm": 1.2210290966783124, "learning_rate": 1.935897435897436e-05, "loss": 1.4565, "step": 606 }, { "epoch": 0.03, "grad_norm": 1.8079497280637575, "learning_rate": 1.9391025641025644e-05, "loss": 1.3965, "step": 607 }, { "epoch": 0.03, "grad_norm": 1.0564735052393015, "learning_rate": 1.9423076923076924e-05, "loss": 1.3125, "step": 608 }, { "epoch": 0.03, "grad_norm": 0.9849223233888563, "learning_rate": 1.9455128205128208e-05, "loss": 1.1646, "step": 609 }, { "epoch": 0.03, "grad_norm": 1.295635147333379, "learning_rate": 1.9487179487179488e-05, "loss": 1.2998, "step": 610 }, { "epoch": 0.03, "grad_norm": 1.3341206002337973, "learning_rate": 1.9519230769230772e-05, "loss": 1.3325, "step": 611 }, { "epoch": 0.03, "grad_norm": 1.6232397344974392, "learning_rate": 1.9551282051282052e-05, "loss": 1.4453, "step": 612 }, { "epoch": 0.03, "grad_norm": 1.3249373679083745, "learning_rate": 1.9583333333333333e-05, "loss": 1.2539, "step": 613 }, { "epoch": 0.03, "grad_norm": 1.3100531949710184, "learning_rate": 1.9615384615384617e-05, "loss": 1.4902, "step": 614 }, { "epoch": 0.03, "grad_norm": 1.3483583329678617, "learning_rate": 1.9647435897435897e-05, "loss": 1.3384, "step": 615 }, { "epoch": 0.03, "grad_norm": 1.0000197179180486, "learning_rate": 1.967948717948718e-05, "loss": 1.5186, "step": 616 }, { "epoch": 0.03, "grad_norm": 1.1997745547733292, "learning_rate": 1.9711538461538465e-05, "loss": 1.2812, "step": 617 }, { "epoch": 0.03, "grad_norm": 1.6149044939941823, "learning_rate": 1.9743589743589745e-05, "loss": 1.3481, "step": 618 }, { "epoch": 0.03, "grad_norm": 1.5312817527254543, "learning_rate": 1.977564102564103e-05, "loss": 1.4795, "step": 619 }, { "epoch": 0.03, "grad_norm": 1.163174928475769, "learning_rate": 1.980769230769231e-05, "loss": 1.354, "step": 620 }, { "epoch": 0.03, "grad_norm": 1.3993322421382928, "learning_rate": 1.9839743589743593e-05, "loss": 1.2803, "step": 621 }, { "epoch": 0.03, "grad_norm": 1.0781318823407806, "learning_rate": 1.9871794871794873e-05, "loss": 1.3472, "step": 622 }, { "epoch": 0.03, "grad_norm": 1.2750709065044106, "learning_rate": 1.9903846153846157e-05, "loss": 1.3125, "step": 623 }, { "epoch": 0.03, "grad_norm": 1.3771088269297513, "learning_rate": 1.9935897435897437e-05, "loss": 1.3945, "step": 624 }, { "epoch": 0.03, "grad_norm": 1.201338877783735, "learning_rate": 1.996794871794872e-05, "loss": 1.334, "step": 625 }, { "epoch": 0.03, "grad_norm": 1.27337359345505, "learning_rate": 2e-05, "loss": 1.2593, "step": 626 }, { "epoch": 0.03, "grad_norm": 1.2154574733109216, "learning_rate": 1.9999999878664707e-05, "loss": 1.2593, "step": 627 }, { "epoch": 0.03, "grad_norm": 1.4885944471570738, "learning_rate": 1.999999951465882e-05, "loss": 1.3374, "step": 628 }, { "epoch": 0.03, "grad_norm": 1.4201872667389484, "learning_rate": 1.999999890798236e-05, "loss": 1.2197, "step": 629 }, { "epoch": 0.03, "grad_norm": 1.381681285072961, "learning_rate": 1.9999998058635333e-05, "loss": 1.353, "step": 630 }, { "epoch": 0.03, "grad_norm": 1.780615297047386, "learning_rate": 1.999999696661776e-05, "loss": 1.2563, "step": 631 }, { "epoch": 0.03, "grad_norm": 1.4852794359538266, "learning_rate": 1.999999563192967e-05, "loss": 1.3301, "step": 632 }, { "epoch": 0.03, "grad_norm": 1.0576917547511393, "learning_rate": 1.9999994054571096e-05, "loss": 1.2632, "step": 633 }, { "epoch": 0.03, "grad_norm": 1.2324506864144815, "learning_rate": 1.9999992234542078e-05, "loss": 1.335, "step": 634 }, { "epoch": 0.03, "grad_norm": 1.7130166185057643, "learning_rate": 1.9999990171842654e-05, "loss": 1.3242, "step": 635 }, { "epoch": 0.03, "grad_norm": 1.638255887316567, "learning_rate": 1.9999987866472878e-05, "loss": 1.3271, "step": 636 }, { "epoch": 0.03, "grad_norm": 1.13777185853665, "learning_rate": 1.9999985318432804e-05, "loss": 1.3618, "step": 637 }, { "epoch": 0.03, "grad_norm": 1.3348508162826425, "learning_rate": 1.9999982527722498e-05, "loss": 1.3687, "step": 638 }, { "epoch": 0.03, "grad_norm": 1.292891647696483, "learning_rate": 1.9999979494342022e-05, "loss": 1.2964, "step": 639 }, { "epoch": 0.03, "grad_norm": 1.3064786446724772, "learning_rate": 1.9999976218291455e-05, "loss": 1.3174, "step": 640 }, { "epoch": 0.03, "grad_norm": 1.3228995536700574, "learning_rate": 1.9999972699570876e-05, "loss": 1.3315, "step": 641 }, { "epoch": 0.03, "grad_norm": 1.4839310999858324, "learning_rate": 1.9999968938180364e-05, "loss": 1.1321, "step": 642 }, { "epoch": 0.03, "grad_norm": 1.3113297027999637, "learning_rate": 1.9999964934120016e-05, "loss": 1.4497, "step": 643 }, { "epoch": 0.03, "grad_norm": 1.2262440934990706, "learning_rate": 1.999996068738993e-05, "loss": 1.3208, "step": 644 }, { "epoch": 0.03, "grad_norm": 1.6094160007058462, "learning_rate": 1.9999956197990205e-05, "loss": 1.3833, "step": 645 }, { "epoch": 0.03, "grad_norm": 1.40210192892748, "learning_rate": 1.9999951465920953e-05, "loss": 1.2588, "step": 646 }, { "epoch": 0.03, "grad_norm": 1.3510930685282643, "learning_rate": 1.9999946491182284e-05, "loss": 1.1685, "step": 647 }, { "epoch": 0.03, "grad_norm": 1.4178396760476157, "learning_rate": 1.9999941273774327e-05, "loss": 1.2607, "step": 648 }, { "epoch": 0.03, "grad_norm": 1.3372679427060812, "learning_rate": 1.9999935813697204e-05, "loss": 1.2666, "step": 649 }, { "epoch": 0.03, "grad_norm": 1.3955429564424948, "learning_rate": 1.9999930110951043e-05, "loss": 1.3135, "step": 650 }, { "epoch": 0.03, "grad_norm": 1.513719089547528, "learning_rate": 1.999992416553599e-05, "loss": 1.3252, "step": 651 }, { "epoch": 0.03, "grad_norm": 1.125039415775943, "learning_rate": 1.9999917977452187e-05, "loss": 1.125, "step": 652 }, { "epoch": 0.03, "grad_norm": 1.1144443787965195, "learning_rate": 1.9999911546699785e-05, "loss": 1.2812, "step": 653 }, { "epoch": 0.03, "grad_norm": 1.8090228940634654, "learning_rate": 1.9999904873278933e-05, "loss": 1.6245, "step": 654 }, { "epoch": 0.03, "grad_norm": 1.3638275169155951, "learning_rate": 1.9999897957189802e-05, "loss": 1.2183, "step": 655 }, { "epoch": 0.03, "grad_norm": 1.099661743913719, "learning_rate": 1.9999890798432556e-05, "loss": 1.4766, "step": 656 }, { "epoch": 0.03, "grad_norm": 1.2569381199188554, "learning_rate": 1.9999883397007366e-05, "loss": 1.4209, "step": 657 }, { "epoch": 0.03, "grad_norm": 1.3354351313729897, "learning_rate": 1.999987575291442e-05, "loss": 1.4453, "step": 658 }, { "epoch": 0.03, "grad_norm": 1.5904187692645484, "learning_rate": 1.9999867866153894e-05, "loss": 1.3887, "step": 659 }, { "epoch": 0.03, "grad_norm": 1.3520718152655091, "learning_rate": 1.9999859736725984e-05, "loss": 1.2485, "step": 660 }, { "epoch": 0.03, "grad_norm": 1.0832849240843234, "learning_rate": 1.9999851364630886e-05, "loss": 1.3101, "step": 661 }, { "epoch": 0.03, "grad_norm": 1.4484834850433754, "learning_rate": 1.9999842749868808e-05, "loss": 1.3623, "step": 662 }, { "epoch": 0.03, "grad_norm": 1.3835434957454869, "learning_rate": 1.9999833892439952e-05, "loss": 1.458, "step": 663 }, { "epoch": 0.03, "grad_norm": 1.4387506514228017, "learning_rate": 1.9999824792344536e-05, "loss": 1.3086, "step": 664 }, { "epoch": 0.03, "grad_norm": 1.633722983139014, "learning_rate": 1.999981544958278e-05, "loss": 1.4619, "step": 665 }, { "epoch": 0.03, "grad_norm": 1.4104499077626325, "learning_rate": 1.9999805864154913e-05, "loss": 1.4883, "step": 666 }, { "epoch": 0.03, "grad_norm": 1.4024389487281625, "learning_rate": 1.9999796036061164e-05, "loss": 1.3306, "step": 667 }, { "epoch": 0.03, "grad_norm": 1.4196268951832895, "learning_rate": 1.9999785965301776e-05, "loss": 1.334, "step": 668 }, { "epoch": 0.03, "grad_norm": 1.5241322504684633, "learning_rate": 1.999977565187699e-05, "loss": 1.3804, "step": 669 }, { "epoch": 0.03, "grad_norm": 1.322377393231278, "learning_rate": 1.9999765095787055e-05, "loss": 1.4146, "step": 670 }, { "epoch": 0.03, "grad_norm": 1.2112986037417892, "learning_rate": 1.999975429703223e-05, "loss": 1.4146, "step": 671 }, { "epoch": 0.03, "grad_norm": 1.7613183633276996, "learning_rate": 1.999974325561278e-05, "loss": 1.4287, "step": 672 }, { "epoch": 0.03, "grad_norm": 1.415522763797125, "learning_rate": 1.9999731971528965e-05, "loss": 1.1929, "step": 673 }, { "epoch": 0.03, "grad_norm": 1.2058028589701828, "learning_rate": 1.999972044478107e-05, "loss": 1.2112, "step": 674 }, { "epoch": 0.03, "grad_norm": 1.2016766692285767, "learning_rate": 1.999970867536936e-05, "loss": 1.2207, "step": 675 }, { "epoch": 0.03, "grad_norm": 1.3119343135913277, "learning_rate": 1.9999696663294133e-05, "loss": 1.3203, "step": 676 }, { "epoch": 0.03, "grad_norm": 1.4714608051845939, "learning_rate": 1.9999684408555673e-05, "loss": 1.2769, "step": 677 }, { "epoch": 0.03, "grad_norm": 1.2199275383656834, "learning_rate": 1.9999671911154285e-05, "loss": 1.2964, "step": 678 }, { "epoch": 0.03, "grad_norm": 1.3147058269536414, "learning_rate": 1.9999659171090263e-05, "loss": 1.1973, "step": 679 }, { "epoch": 0.03, "grad_norm": 1.7357123247497765, "learning_rate": 1.9999646188363925e-05, "loss": 1.4092, "step": 680 }, { "epoch": 0.03, "grad_norm": 1.4919718189660889, "learning_rate": 1.9999632962975578e-05, "loss": 1.3359, "step": 681 }, { "epoch": 0.03, "grad_norm": 1.2500333119084206, "learning_rate": 1.999961949492555e-05, "loss": 1.2661, "step": 682 }, { "epoch": 0.03, "grad_norm": 1.3391601982711716, "learning_rate": 1.999960578421416e-05, "loss": 1.4282, "step": 683 }, { "epoch": 0.03, "grad_norm": 1.1437023094598027, "learning_rate": 1.999959183084175e-05, "loss": 1.3047, "step": 684 }, { "epoch": 0.03, "grad_norm": 1.2939766728937352, "learning_rate": 1.999957763480865e-05, "loss": 1.2656, "step": 685 }, { "epoch": 0.03, "grad_norm": 1.3486653278963208, "learning_rate": 1.999956319611521e-05, "loss": 1.2534, "step": 686 }, { "epoch": 0.03, "grad_norm": 1.2827714385498983, "learning_rate": 1.9999548514761785e-05, "loss": 1.3911, "step": 687 }, { "epoch": 0.03, "grad_norm": 1.2473277660393025, "learning_rate": 1.999953359074872e-05, "loss": 1.292, "step": 688 }, { "epoch": 0.03, "grad_norm": 1.6688484502239775, "learning_rate": 1.999951842407638e-05, "loss": 1.2451, "step": 689 }, { "epoch": 0.03, "grad_norm": 1.4111806761504817, "learning_rate": 1.9999503014745138e-05, "loss": 1.4971, "step": 690 }, { "epoch": 0.03, "grad_norm": 1.1807885396091584, "learning_rate": 1.9999487362755366e-05, "loss": 1.3833, "step": 691 }, { "epoch": 0.03, "grad_norm": 1.229729630809447, "learning_rate": 1.9999471468107444e-05, "loss": 1.2485, "step": 692 }, { "epoch": 0.03, "grad_norm": 1.3521840672967698, "learning_rate": 1.9999455330801752e-05, "loss": 1.1895, "step": 693 }, { "epoch": 0.03, "grad_norm": 1.255910390114501, "learning_rate": 1.999943895083869e-05, "loss": 1.4165, "step": 694 }, { "epoch": 0.03, "grad_norm": 1.1220685824056627, "learning_rate": 1.999942232821865e-05, "loss": 1.2554, "step": 695 }, { "epoch": 0.03, "grad_norm": 1.3368643758760377, "learning_rate": 1.999940546294204e-05, "loss": 1.2964, "step": 696 }, { "epoch": 0.03, "grad_norm": 1.2558627009782874, "learning_rate": 1.9999388355009266e-05, "loss": 1.4546, "step": 697 }, { "epoch": 0.03, "grad_norm": 1.040379869762896, "learning_rate": 1.9999371004420744e-05, "loss": 1.2788, "step": 698 }, { "epoch": 0.03, "grad_norm": 1.5726006030407322, "learning_rate": 1.9999353411176893e-05, "loss": 1.355, "step": 699 }, { "epoch": 0.03, "grad_norm": 1.4255594994564806, "learning_rate": 1.9999335575278143e-05, "loss": 1.1953, "step": 700 }, { "epoch": 0.03, "grad_norm": 1.0974653573582915, "learning_rate": 1.9999317496724925e-05, "loss": 1.3413, "step": 701 }, { "epoch": 0.03, "grad_norm": 1.1409607887399758, "learning_rate": 1.999929917551768e-05, "loss": 1.4116, "step": 702 }, { "epoch": 0.03, "grad_norm": 1.5090099684549179, "learning_rate": 1.999928061165685e-05, "loss": 1.4741, "step": 703 }, { "epoch": 0.03, "grad_norm": 1.3732488839471322, "learning_rate": 1.9999261805142885e-05, "loss": 1.3623, "step": 704 }, { "epoch": 0.03, "grad_norm": 1.2991165014985107, "learning_rate": 1.9999242755976246e-05, "loss": 1.2764, "step": 705 }, { "epoch": 0.03, "grad_norm": 1.6110916976694905, "learning_rate": 1.999922346415739e-05, "loss": 1.3003, "step": 706 }, { "epoch": 0.03, "grad_norm": 1.618427640991831, "learning_rate": 1.9999203929686786e-05, "loss": 1.3882, "step": 707 }, { "epoch": 0.03, "grad_norm": 1.1866907206713557, "learning_rate": 1.9999184152564907e-05, "loss": 1.1997, "step": 708 }, { "epoch": 0.03, "grad_norm": 1.468406094681342, "learning_rate": 1.999916413279224e-05, "loss": 1.4199, "step": 709 }, { "epoch": 0.03, "grad_norm": 1.460738199686398, "learning_rate": 1.9999143870369265e-05, "loss": 1.1748, "step": 710 }, { "epoch": 0.03, "grad_norm": 1.1683732039943022, "learning_rate": 1.9999123365296473e-05, "loss": 1.4941, "step": 711 }, { "epoch": 0.03, "grad_norm": 1.6774087365998513, "learning_rate": 1.9999102617574366e-05, "loss": 1.4448, "step": 712 }, { "epoch": 0.03, "grad_norm": 1.2136062351323862, "learning_rate": 1.999908162720344e-05, "loss": 1.3315, "step": 713 }, { "epoch": 0.03, "grad_norm": 1.8453377615444224, "learning_rate": 1.9999060394184214e-05, "loss": 1.4028, "step": 714 }, { "epoch": 0.03, "grad_norm": 1.900708096988948, "learning_rate": 1.99990389185172e-05, "loss": 1.561, "step": 715 }, { "epoch": 0.03, "grad_norm": 1.3492499743946267, "learning_rate": 1.999901720020291e-05, "loss": 1.3774, "step": 716 }, { "epoch": 0.03, "grad_norm": 1.3184260429591028, "learning_rate": 1.9998995239241883e-05, "loss": 1.376, "step": 717 }, { "epoch": 0.03, "grad_norm": 1.2042285722651176, "learning_rate": 1.9998973035634648e-05, "loss": 1.3232, "step": 718 }, { "epoch": 0.03, "grad_norm": 1.3413778800098985, "learning_rate": 1.9998950589381743e-05, "loss": 1.4038, "step": 719 }, { "epoch": 0.03, "grad_norm": 1.2997924514913326, "learning_rate": 1.9998927900483714e-05, "loss": 1.2173, "step": 720 }, { "epoch": 0.03, "grad_norm": 1.300975241832525, "learning_rate": 1.9998904968941107e-05, "loss": 1.3589, "step": 721 }, { "epoch": 0.03, "grad_norm": 1.2371027962212908, "learning_rate": 1.9998881794754484e-05, "loss": 1.2871, "step": 722 }, { "epoch": 0.03, "grad_norm": 1.2715649525142014, "learning_rate": 1.9998858377924408e-05, "loss": 1.4395, "step": 723 }, { "epoch": 0.03, "grad_norm": 1.5480367133856647, "learning_rate": 1.9998834718451444e-05, "loss": 1.3467, "step": 724 }, { "epoch": 0.03, "grad_norm": 1.3262576734297975, "learning_rate": 1.999881081633616e-05, "loss": 1.3428, "step": 725 }, { "epoch": 0.03, "grad_norm": 1.0445114841380974, "learning_rate": 1.999878667157915e-05, "loss": 1.3618, "step": 726 }, { "epoch": 0.03, "grad_norm": 1.4900141606877817, "learning_rate": 1.9998762284180996e-05, "loss": 1.4702, "step": 727 }, { "epoch": 0.04, "grad_norm": 1.4158124879116423, "learning_rate": 1.999873765414228e-05, "loss": 1.3306, "step": 728 }, { "epoch": 0.04, "grad_norm": 1.3306758153136669, "learning_rate": 1.999871278146361e-05, "loss": 1.4717, "step": 729 }, { "epoch": 0.04, "grad_norm": 1.2493462164222073, "learning_rate": 1.9998687666145585e-05, "loss": 1.2705, "step": 730 }, { "epoch": 0.04, "grad_norm": 1.2819037431166764, "learning_rate": 1.9998662308188813e-05, "loss": 1.3662, "step": 731 }, { "epoch": 0.04, "grad_norm": 1.5485694140172468, "learning_rate": 1.9998636707593913e-05, "loss": 1.3242, "step": 732 }, { "epoch": 0.04, "grad_norm": 1.6726187063486977, "learning_rate": 1.9998610864361506e-05, "loss": 1.2388, "step": 733 }, { "epoch": 0.04, "grad_norm": 1.3027365315179606, "learning_rate": 1.999858477849222e-05, "loss": 1.394, "step": 734 }, { "epoch": 0.04, "grad_norm": 1.63117591246242, "learning_rate": 1.9998558449986684e-05, "loss": 1.2656, "step": 735 }, { "epoch": 0.04, "grad_norm": 1.4905958138989042, "learning_rate": 1.9998531878845536e-05, "loss": 1.437, "step": 736 }, { "epoch": 0.04, "grad_norm": 1.4233560041293754, "learning_rate": 1.999850506506943e-05, "loss": 1.1279, "step": 737 }, { "epoch": 0.04, "grad_norm": 1.2237265125736674, "learning_rate": 1.999847800865901e-05, "loss": 1.4512, "step": 738 }, { "epoch": 0.04, "grad_norm": 1.2094435678456228, "learning_rate": 1.9998450709614928e-05, "loss": 1.3623, "step": 739 }, { "epoch": 0.04, "grad_norm": 1.098222569374182, "learning_rate": 1.9998423167937852e-05, "loss": 1.4009, "step": 740 }, { "epoch": 0.04, "grad_norm": 1.3836299143956212, "learning_rate": 1.9998395383628457e-05, "loss": 1.4668, "step": 741 }, { "epoch": 0.04, "grad_norm": 1.3724904057857936, "learning_rate": 1.9998367356687405e-05, "loss": 1.3413, "step": 742 }, { "epoch": 0.04, "grad_norm": 1.4922691058508695, "learning_rate": 1.9998339087115378e-05, "loss": 1.3569, "step": 743 }, { "epoch": 0.04, "grad_norm": 1.4903142168930235, "learning_rate": 1.9998310574913074e-05, "loss": 1.4736, "step": 744 }, { "epoch": 0.04, "grad_norm": 1.2572580244976974, "learning_rate": 1.999828182008117e-05, "loss": 1.1865, "step": 745 }, { "epoch": 0.04, "grad_norm": 1.2150804774546125, "learning_rate": 1.9998252822620373e-05, "loss": 1.3745, "step": 746 }, { "epoch": 0.04, "grad_norm": 1.2620322468289291, "learning_rate": 1.9998223582531386e-05, "loss": 1.3818, "step": 747 }, { "epoch": 0.04, "grad_norm": 1.3504234177561378, "learning_rate": 1.9998194099814913e-05, "loss": 1.1621, "step": 748 }, { "epoch": 0.04, "grad_norm": 1.3845840320273697, "learning_rate": 1.9998164374471673e-05, "loss": 1.2871, "step": 749 }, { "epoch": 0.04, "grad_norm": 1.2180432235543677, "learning_rate": 1.9998134406502384e-05, "loss": 1.2368, "step": 750 }, { "epoch": 0.04, "grad_norm": 1.566176899465898, "learning_rate": 1.999810419590778e-05, "loss": 1.5542, "step": 751 }, { "epoch": 0.04, "grad_norm": 1.8016178704670875, "learning_rate": 1.9998073742688592e-05, "loss": 1.3696, "step": 752 }, { "epoch": 0.04, "grad_norm": 1.3963695940306424, "learning_rate": 1.9998043046845555e-05, "loss": 1.2524, "step": 753 }, { "epoch": 0.04, "grad_norm": 1.2385839620087902, "learning_rate": 1.9998012108379417e-05, "loss": 1.0708, "step": 754 }, { "epoch": 0.04, "grad_norm": 0.9952189015926733, "learning_rate": 1.9997980927290928e-05, "loss": 1.2524, "step": 755 }, { "epoch": 0.04, "grad_norm": 1.898800908272537, "learning_rate": 1.9997949503580844e-05, "loss": 1.4141, "step": 756 }, { "epoch": 0.04, "grad_norm": 1.1096830610570383, "learning_rate": 1.999791783724993e-05, "loss": 1.209, "step": 757 }, { "epoch": 0.04, "grad_norm": 1.3326798442394288, "learning_rate": 1.999788592829895e-05, "loss": 1.4028, "step": 758 }, { "epoch": 0.04, "grad_norm": 1.193933965673473, "learning_rate": 1.999785377672869e-05, "loss": 1.2407, "step": 759 }, { "epoch": 0.04, "grad_norm": 1.2446478852012381, "learning_rate": 1.9997821382539914e-05, "loss": 1.3652, "step": 760 }, { "epoch": 0.04, "grad_norm": 1.083712490553, "learning_rate": 1.9997788745733415e-05, "loss": 1.2368, "step": 761 }, { "epoch": 0.04, "grad_norm": 1.395052434574553, "learning_rate": 1.9997755866309988e-05, "loss": 1.3125, "step": 762 }, { "epoch": 0.04, "grad_norm": 1.1210722001304796, "learning_rate": 1.999772274427043e-05, "loss": 1.4429, "step": 763 }, { "epoch": 0.04, "grad_norm": 1.3111291398429166, "learning_rate": 1.999768937961554e-05, "loss": 1.3164, "step": 764 }, { "epoch": 0.04, "grad_norm": 1.4478495588307805, "learning_rate": 1.9997655772346132e-05, "loss": 1.3882, "step": 765 }, { "epoch": 0.04, "grad_norm": 1.2147992529334042, "learning_rate": 1.999762192246302e-05, "loss": 1.5459, "step": 766 }, { "epoch": 0.04, "grad_norm": 0.972615469522833, "learning_rate": 1.9997587829967027e-05, "loss": 1.3262, "step": 767 }, { "epoch": 0.04, "grad_norm": 1.467078828162015, "learning_rate": 1.999755349485898e-05, "loss": 1.3052, "step": 768 }, { "epoch": 0.04, "grad_norm": 1.4327908569178656, "learning_rate": 1.999751891713971e-05, "loss": 1.2573, "step": 769 }, { "epoch": 0.04, "grad_norm": 1.3749308319642244, "learning_rate": 1.9997484096810054e-05, "loss": 1.3789, "step": 770 }, { "epoch": 0.04, "grad_norm": 1.7499466344693686, "learning_rate": 1.999744903387087e-05, "loss": 1.4399, "step": 771 }, { "epoch": 0.04, "grad_norm": 1.460209871209967, "learning_rate": 1.9997413728322992e-05, "loss": 1.3213, "step": 772 }, { "epoch": 0.04, "grad_norm": 1.5988281894137066, "learning_rate": 1.9997378180167285e-05, "loss": 1.52, "step": 773 }, { "epoch": 0.04, "grad_norm": 1.3665061231996976, "learning_rate": 1.999734238940461e-05, "loss": 1.3228, "step": 774 }, { "epoch": 0.04, "grad_norm": 1.686762339093321, "learning_rate": 1.9997306356035838e-05, "loss": 1.4863, "step": 775 }, { "epoch": 0.04, "grad_norm": 1.1976695871601748, "learning_rate": 1.9997270080061843e-05, "loss": 1.4702, "step": 776 }, { "epoch": 0.04, "grad_norm": 1.4044419557562702, "learning_rate": 1.9997233561483503e-05, "loss": 1.4351, "step": 777 }, { "epoch": 0.04, "grad_norm": 1.3966351961035797, "learning_rate": 1.9997196800301704e-05, "loss": 1.2368, "step": 778 }, { "epoch": 0.04, "grad_norm": 1.451908513577683, "learning_rate": 1.9997159796517342e-05, "loss": 1.4766, "step": 779 }, { "epoch": 0.04, "grad_norm": 1.3883391465324681, "learning_rate": 1.999712255013131e-05, "loss": 1.3765, "step": 780 }, { "epoch": 0.04, "grad_norm": 1.4875210138421147, "learning_rate": 1.9997085061144514e-05, "loss": 1.1914, "step": 781 }, { "epoch": 0.04, "grad_norm": 1.108144098880012, "learning_rate": 1.9997047329557867e-05, "loss": 1.3037, "step": 782 }, { "epoch": 0.04, "grad_norm": 1.2296080571831245, "learning_rate": 1.9997009355372276e-05, "loss": 1.3413, "step": 783 }, { "epoch": 0.04, "grad_norm": 1.247361737424989, "learning_rate": 1.9996971138588674e-05, "loss": 1.2534, "step": 784 }, { "epoch": 0.04, "grad_norm": 1.5941379594424552, "learning_rate": 1.9996932679207977e-05, "loss": 1.2617, "step": 785 }, { "epoch": 0.04, "grad_norm": 1.2165656713257629, "learning_rate": 1.999689397723113e-05, "loss": 1.3613, "step": 786 }, { "epoch": 0.04, "grad_norm": 1.3816984020156546, "learning_rate": 1.9996855032659065e-05, "loss": 1.3418, "step": 787 }, { "epoch": 0.04, "grad_norm": 1.3090585611372187, "learning_rate": 1.9996815845492722e-05, "loss": 1.2456, "step": 788 }, { "epoch": 0.04, "grad_norm": 1.2095302763845621, "learning_rate": 1.9996776415733063e-05, "loss": 1.3442, "step": 789 }, { "epoch": 0.04, "grad_norm": 1.3766679969399749, "learning_rate": 1.9996736743381037e-05, "loss": 1.2803, "step": 790 }, { "epoch": 0.04, "grad_norm": 1.3831792721401492, "learning_rate": 1.9996696828437613e-05, "loss": 1.1392, "step": 791 }, { "epoch": 0.04, "grad_norm": 1.6131839924470461, "learning_rate": 1.9996656670903753e-05, "loss": 1.3677, "step": 792 }, { "epoch": 0.04, "grad_norm": 1.4481877963153207, "learning_rate": 1.999661627078044e-05, "loss": 1.1931, "step": 793 }, { "epoch": 0.04, "grad_norm": 1.3615849668811302, "learning_rate": 1.9996575628068645e-05, "loss": 1.3618, "step": 794 }, { "epoch": 0.04, "grad_norm": 1.2446850940595102, "learning_rate": 1.9996534742769355e-05, "loss": 1.2017, "step": 795 }, { "epoch": 0.04, "grad_norm": 1.1381553891867735, "learning_rate": 1.999649361488357e-05, "loss": 1.3569, "step": 796 }, { "epoch": 0.04, "grad_norm": 1.1865100510548208, "learning_rate": 1.9996452244412285e-05, "loss": 1.2861, "step": 797 }, { "epoch": 0.04, "grad_norm": 0.9535137323021992, "learning_rate": 1.9996410631356496e-05, "loss": 1.1475, "step": 798 }, { "epoch": 0.04, "grad_norm": 1.2201821690479515, "learning_rate": 1.9996368775717228e-05, "loss": 1.3218, "step": 799 }, { "epoch": 0.04, "grad_norm": 1.31421135017808, "learning_rate": 1.9996326677495482e-05, "loss": 1.415, "step": 800 }, { "epoch": 0.04, "grad_norm": 1.6015538484040908, "learning_rate": 1.9996284336692286e-05, "loss": 1.2593, "step": 801 }, { "epoch": 0.04, "grad_norm": 1.000818877230247, "learning_rate": 1.9996241753308673e-05, "loss": 1.2202, "step": 802 }, { "epoch": 0.04, "grad_norm": 1.2117657084148095, "learning_rate": 1.9996198927345665e-05, "loss": 1.2285, "step": 803 }, { "epoch": 0.04, "grad_norm": 1.4748773898628027, "learning_rate": 1.9996155858804307e-05, "loss": 1.2739, "step": 804 }, { "epoch": 0.04, "grad_norm": 1.0239156855587495, "learning_rate": 1.9996112547685645e-05, "loss": 1.2827, "step": 805 }, { "epoch": 0.04, "grad_norm": 1.1834695143488219, "learning_rate": 1.9996068993990727e-05, "loss": 1.3262, "step": 806 }, { "epoch": 0.04, "grad_norm": 1.297795622687012, "learning_rate": 1.9996025197720615e-05, "loss": 1.3218, "step": 807 }, { "epoch": 0.04, "grad_norm": 1.268877234777004, "learning_rate": 1.999598115887637e-05, "loss": 1.3281, "step": 808 }, { "epoch": 0.04, "grad_norm": 1.6616671974043395, "learning_rate": 1.9995936877459053e-05, "loss": 1.2124, "step": 809 }, { "epoch": 0.04, "grad_norm": 1.2708749111895, "learning_rate": 1.999589235346975e-05, "loss": 1.4038, "step": 810 }, { "epoch": 0.04, "grad_norm": 1.0576109054973595, "learning_rate": 1.9995847586909537e-05, "loss": 1.4028, "step": 811 }, { "epoch": 0.04, "grad_norm": 1.30430405411384, "learning_rate": 1.9995802577779498e-05, "loss": 1.2432, "step": 812 }, { "epoch": 0.04, "grad_norm": 1.3926925114135567, "learning_rate": 1.9995757326080727e-05, "loss": 1.1865, "step": 813 }, { "epoch": 0.04, "grad_norm": 1.352358047134358, "learning_rate": 1.999571183181432e-05, "loss": 1.3599, "step": 814 }, { "epoch": 0.04, "grad_norm": 1.2840718057316223, "learning_rate": 1.9995666094981385e-05, "loss": 1.3545, "step": 815 }, { "epoch": 0.04, "grad_norm": 1.1359953155079805, "learning_rate": 1.9995620115583033e-05, "loss": 1.1272, "step": 816 }, { "epoch": 0.04, "grad_norm": 1.240916883898558, "learning_rate": 1.999557389362037e-05, "loss": 1.2437, "step": 817 }, { "epoch": 0.04, "grad_norm": 1.3739790811888415, "learning_rate": 1.9995527429094534e-05, "loss": 1.3506, "step": 818 }, { "epoch": 0.04, "grad_norm": 1.2912649872504953, "learning_rate": 1.9995480722006636e-05, "loss": 1.5347, "step": 819 }, { "epoch": 0.04, "grad_norm": 1.3565251578259534, "learning_rate": 1.999543377235782e-05, "loss": 1.1157, "step": 820 }, { "epoch": 0.04, "grad_norm": 1.6728270669624243, "learning_rate": 1.999538658014922e-05, "loss": 1.4023, "step": 821 }, { "epoch": 0.04, "grad_norm": 1.504838627499413, "learning_rate": 1.9995339145381982e-05, "loss": 1.2886, "step": 822 }, { "epoch": 0.04, "grad_norm": 1.2483112189876708, "learning_rate": 1.999529146805726e-05, "loss": 1.2349, "step": 823 }, { "epoch": 0.04, "grad_norm": 1.335802161261752, "learning_rate": 1.999524354817621e-05, "loss": 1.4175, "step": 824 }, { "epoch": 0.04, "grad_norm": 1.2295105602634124, "learning_rate": 1.999519538573999e-05, "loss": 1.3979, "step": 825 }, { "epoch": 0.04, "grad_norm": 1.0403257473152625, "learning_rate": 1.9995146980749777e-05, "loss": 1.1855, "step": 826 }, { "epoch": 0.04, "grad_norm": 1.259663128552189, "learning_rate": 1.999509833320674e-05, "loss": 1.3735, "step": 827 }, { "epoch": 0.04, "grad_norm": 1.516548450973032, "learning_rate": 1.999504944311206e-05, "loss": 1.4141, "step": 828 }, { "epoch": 0.04, "grad_norm": 1.3346623828301412, "learning_rate": 1.9995000310466927e-05, "loss": 1.3379, "step": 829 }, { "epoch": 0.04, "grad_norm": 1.3868452669160183, "learning_rate": 1.999495093527253e-05, "loss": 1.2559, "step": 830 }, { "epoch": 0.04, "grad_norm": 1.44761352191918, "learning_rate": 1.9994901317530067e-05, "loss": 1.3979, "step": 831 }, { "epoch": 0.04, "grad_norm": 1.3263519078193244, "learning_rate": 1.999485145724074e-05, "loss": 1.2021, "step": 832 }, { "epoch": 0.04, "grad_norm": 1.3812156406234244, "learning_rate": 1.9994801354405768e-05, "loss": 1.2783, "step": 833 }, { "epoch": 0.04, "grad_norm": 1.5427679508642556, "learning_rate": 1.9994751009026355e-05, "loss": 1.2104, "step": 834 }, { "epoch": 0.04, "grad_norm": 0.8903903131415463, "learning_rate": 1.9994700421103734e-05, "loss": 1.2075, "step": 835 }, { "epoch": 0.04, "grad_norm": 1.0041952882628502, "learning_rate": 1.9994649590639124e-05, "loss": 1.2192, "step": 836 }, { "epoch": 0.04, "grad_norm": 1.3180355062295364, "learning_rate": 1.999459851763376e-05, "loss": 1.4043, "step": 837 }, { "epoch": 0.04, "grad_norm": 1.2183398651805275, "learning_rate": 1.9994547202088886e-05, "loss": 1.2729, "step": 838 }, { "epoch": 0.04, "grad_norm": 1.2664641140378452, "learning_rate": 1.9994495644005746e-05, "loss": 1.3604, "step": 839 }, { "epoch": 0.04, "grad_norm": 1.1983467702710018, "learning_rate": 1.9994443843385583e-05, "loss": 1.3555, "step": 840 }, { "epoch": 0.04, "grad_norm": 1.320101803469919, "learning_rate": 1.9994391800229666e-05, "loss": 1.3906, "step": 841 }, { "epoch": 0.04, "grad_norm": 1.333045435217759, "learning_rate": 1.999433951453925e-05, "loss": 1.4414, "step": 842 }, { "epoch": 0.04, "grad_norm": 1.372392634761786, "learning_rate": 1.999428698631561e-05, "loss": 1.2549, "step": 843 }, { "epoch": 0.04, "grad_norm": 1.4853586461352777, "learning_rate": 1.9994234215560014e-05, "loss": 1.2729, "step": 844 }, { "epoch": 0.04, "grad_norm": 1.5212548236723744, "learning_rate": 1.9994181202273745e-05, "loss": 1.2544, "step": 845 }, { "epoch": 0.04, "grad_norm": 1.3969086957767294, "learning_rate": 1.999412794645809e-05, "loss": 1.2773, "step": 846 }, { "epoch": 0.04, "grad_norm": 1.1942228609319236, "learning_rate": 1.9994074448114342e-05, "loss": 1.3682, "step": 847 }, { "epoch": 0.04, "grad_norm": 1.2821108163165773, "learning_rate": 1.99940207072438e-05, "loss": 1.4014, "step": 848 }, { "epoch": 0.04, "grad_norm": 1.573689769344591, "learning_rate": 1.9993966723847766e-05, "loss": 1.4219, "step": 849 }, { "epoch": 0.04, "grad_norm": 1.032525793857006, "learning_rate": 1.999391249792755e-05, "loss": 1.2939, "step": 850 }, { "epoch": 0.04, "grad_norm": 1.3641609972298137, "learning_rate": 1.999385802948447e-05, "loss": 1.4458, "step": 851 }, { "epoch": 0.04, "grad_norm": 1.2748678745258037, "learning_rate": 1.9993803318519845e-05, "loss": 1.3652, "step": 852 }, { "epoch": 0.04, "grad_norm": 1.1495923719849077, "learning_rate": 1.9993748365035004e-05, "loss": 1.356, "step": 853 }, { "epoch": 0.04, "grad_norm": 1.4585380361442615, "learning_rate": 1.999369316903128e-05, "loss": 1.3691, "step": 854 }, { "epoch": 0.04, "grad_norm": 1.5971893874586247, "learning_rate": 1.999363773051002e-05, "loss": 1.335, "step": 855 }, { "epoch": 0.04, "grad_norm": 1.2782834138306551, "learning_rate": 1.9993582049472554e-05, "loss": 1.355, "step": 856 }, { "epoch": 0.04, "grad_norm": 1.3735370733867682, "learning_rate": 1.9993526125920245e-05, "loss": 1.2002, "step": 857 }, { "epoch": 0.04, "grad_norm": 1.4777198320293181, "learning_rate": 1.9993469959854445e-05, "loss": 1.4048, "step": 858 }, { "epoch": 0.04, "grad_norm": 1.4690241627416423, "learning_rate": 1.9993413551276523e-05, "loss": 1.2598, "step": 859 }, { "epoch": 0.04, "grad_norm": 1.5494065605653378, "learning_rate": 1.9993356900187843e-05, "loss": 1.4937, "step": 860 }, { "epoch": 0.04, "grad_norm": 1.2696606634811864, "learning_rate": 1.9993300006589775e-05, "loss": 1.3706, "step": 861 }, { "epoch": 0.04, "grad_norm": 1.4664322689010696, "learning_rate": 1.999324287048371e-05, "loss": 1.2827, "step": 862 }, { "epoch": 0.04, "grad_norm": 1.26354958260517, "learning_rate": 1.999318549187103e-05, "loss": 1.3359, "step": 863 }, { "epoch": 0.04, "grad_norm": 1.286147290071792, "learning_rate": 1.9993127870753123e-05, "loss": 1.5239, "step": 864 }, { "epoch": 0.04, "grad_norm": 1.5487074695479017, "learning_rate": 1.9993070007131395e-05, "loss": 1.5107, "step": 865 }, { "epoch": 0.04, "grad_norm": 1.3287771996486357, "learning_rate": 1.9993011901007245e-05, "loss": 1.395, "step": 866 }, { "epoch": 0.04, "grad_norm": 1.1780584078649199, "learning_rate": 1.9992953552382085e-05, "loss": 1.4663, "step": 867 }, { "epoch": 0.04, "grad_norm": 1.4292089563685784, "learning_rate": 1.9992894961257332e-05, "loss": 1.4868, "step": 868 }, { "epoch": 0.04, "grad_norm": 1.6383959056472281, "learning_rate": 1.9992836127634402e-05, "loss": 1.2896, "step": 869 }, { "epoch": 0.04, "grad_norm": 1.1862031528209118, "learning_rate": 1.999277705151473e-05, "loss": 1.2622, "step": 870 }, { "epoch": 0.04, "grad_norm": 1.358043693780465, "learning_rate": 1.9992717732899746e-05, "loss": 1.2949, "step": 871 }, { "epoch": 0.04, "grad_norm": 1.5875828308957856, "learning_rate": 1.9992658171790893e-05, "loss": 1.4966, "step": 872 }, { "epoch": 0.04, "grad_norm": 1.3187869290512495, "learning_rate": 1.999259836818961e-05, "loss": 1.2563, "step": 873 }, { "epoch": 0.04, "grad_norm": 1.2673226435682952, "learning_rate": 1.9992538322097352e-05, "loss": 1.374, "step": 874 }, { "epoch": 0.04, "grad_norm": 1.2413983095599295, "learning_rate": 1.9992478033515578e-05, "loss": 1.3076, "step": 875 }, { "epoch": 0.04, "grad_norm": 1.6455998392466549, "learning_rate": 1.9992417502445746e-05, "loss": 1.4648, "step": 876 }, { "epoch": 0.04, "grad_norm": 1.1873026079802493, "learning_rate": 1.9992356728889332e-05, "loss": 1.4175, "step": 877 }, { "epoch": 0.04, "grad_norm": 1.2235097035104556, "learning_rate": 1.9992295712847802e-05, "loss": 1.3604, "step": 878 }, { "epoch": 0.04, "grad_norm": 1.4914296178169537, "learning_rate": 1.9992234454322643e-05, "loss": 1.3325, "step": 879 }, { "epoch": 0.04, "grad_norm": 0.9991799371724274, "learning_rate": 1.9992172953315343e-05, "loss": 1.1416, "step": 880 }, { "epoch": 0.04, "grad_norm": 1.3410768541678202, "learning_rate": 1.9992111209827386e-05, "loss": 1.3164, "step": 881 }, { "epoch": 0.04, "grad_norm": 1.2598398976427776, "learning_rate": 1.999204922386028e-05, "loss": 1.2388, "step": 882 }, { "epoch": 0.04, "grad_norm": 1.3398894209183871, "learning_rate": 1.999198699541552e-05, "loss": 1.3042, "step": 883 }, { "epoch": 0.04, "grad_norm": 1.2918563458745362, "learning_rate": 1.999192452449463e-05, "loss": 1.3545, "step": 884 }, { "epoch": 0.04, "grad_norm": 1.4364657216682528, "learning_rate": 1.999186181109911e-05, "loss": 1.3535, "step": 885 }, { "epoch": 0.04, "grad_norm": 1.0458957581471489, "learning_rate": 1.999179885523049e-05, "loss": 1.2451, "step": 886 }, { "epoch": 0.04, "grad_norm": 1.1827555740301956, "learning_rate": 1.9991735656890293e-05, "loss": 1.3438, "step": 887 }, { "epoch": 0.04, "grad_norm": 1.2764834425040195, "learning_rate": 1.9991672216080064e-05, "loss": 1.1558, "step": 888 }, { "epoch": 0.04, "grad_norm": 1.58680902234929, "learning_rate": 1.999160853280133e-05, "loss": 1.4512, "step": 889 }, { "epoch": 0.04, "grad_norm": 1.2793645510398626, "learning_rate": 1.9991544607055642e-05, "loss": 1.3193, "step": 890 }, { "epoch": 0.04, "grad_norm": 1.3759798622513246, "learning_rate": 1.999148043884455e-05, "loss": 1.4072, "step": 891 }, { "epoch": 0.04, "grad_norm": 1.2371516768307143, "learning_rate": 1.9991416028169612e-05, "loss": 1.3638, "step": 892 }, { "epoch": 0.04, "grad_norm": 1.412787366157253, "learning_rate": 1.999135137503239e-05, "loss": 1.1118, "step": 893 }, { "epoch": 0.04, "grad_norm": 1.1218864924868144, "learning_rate": 1.9991286479434456e-05, "loss": 1.3115, "step": 894 }, { "epoch": 0.04, "grad_norm": 1.660083387521531, "learning_rate": 1.999122134137738e-05, "loss": 1.3022, "step": 895 }, { "epoch": 0.04, "grad_norm": 1.1623454098746682, "learning_rate": 1.9991155960862743e-05, "loss": 1.2378, "step": 896 }, { "epoch": 0.04, "grad_norm": 1.6288733435735185, "learning_rate": 1.999109033789214e-05, "loss": 1.3486, "step": 897 }, { "epoch": 0.04, "grad_norm": 1.3826216979703472, "learning_rate": 1.9991024472467156e-05, "loss": 1.3203, "step": 898 }, { "epoch": 0.04, "grad_norm": 1.313357889627867, "learning_rate": 1.9990958364589388e-05, "loss": 1.3354, "step": 899 }, { "epoch": 0.04, "grad_norm": 1.5228343365715902, "learning_rate": 1.999089201426044e-05, "loss": 1.3755, "step": 900 }, { "epoch": 0.04, "grad_norm": 1.3529503812458403, "learning_rate": 1.999082542148193e-05, "loss": 1.4292, "step": 901 }, { "epoch": 0.04, "grad_norm": 1.4265214448601857, "learning_rate": 1.9990758586255467e-05, "loss": 1.3364, "step": 902 }, { "epoch": 0.04, "grad_norm": 1.1650873108662794, "learning_rate": 1.9990691508582678e-05, "loss": 1.3408, "step": 903 }, { "epoch": 0.04, "grad_norm": 1.3896385042247748, "learning_rate": 1.9990624188465183e-05, "loss": 1.5249, "step": 904 }, { "epoch": 0.04, "grad_norm": 1.3120643062527504, "learning_rate": 1.9990556625904623e-05, "loss": 1.3193, "step": 905 }, { "epoch": 0.04, "grad_norm": 1.5959920097069034, "learning_rate": 1.9990488820902633e-05, "loss": 1.2778, "step": 906 }, { "epoch": 0.04, "grad_norm": 1.1769556939383408, "learning_rate": 1.9990420773460864e-05, "loss": 1.3765, "step": 907 }, { "epoch": 0.04, "grad_norm": 1.3830733122713008, "learning_rate": 1.999035248358096e-05, "loss": 1.3867, "step": 908 }, { "epoch": 0.04, "grad_norm": 1.3307473572812227, "learning_rate": 1.999028395126458e-05, "loss": 1.395, "step": 909 }, { "epoch": 0.04, "grad_norm": 1.280182797255347, "learning_rate": 1.9990215176513394e-05, "loss": 1.3325, "step": 910 }, { "epoch": 0.04, "grad_norm": 1.463418313649634, "learning_rate": 1.9990146159329065e-05, "loss": 1.3774, "step": 911 }, { "epoch": 0.04, "grad_norm": 1.3738754408926348, "learning_rate": 1.9990076899713268e-05, "loss": 1.4053, "step": 912 }, { "epoch": 0.04, "grad_norm": 1.3597670053585398, "learning_rate": 1.999000739766768e-05, "loss": 1.2642, "step": 913 }, { "epoch": 0.04, "grad_norm": 1.2817434933570393, "learning_rate": 1.9989937653193995e-05, "loss": 1.3853, "step": 914 }, { "epoch": 0.04, "grad_norm": 1.1579417826711578, "learning_rate": 1.9989867666293904e-05, "loss": 1.3838, "step": 915 }, { "epoch": 0.04, "grad_norm": 1.2545116621573145, "learning_rate": 1.9989797436969103e-05, "loss": 1.3091, "step": 916 }, { "epoch": 0.04, "grad_norm": 1.3782429480905622, "learning_rate": 1.9989726965221298e-05, "loss": 1.3906, "step": 917 }, { "epoch": 0.04, "grad_norm": 1.1879849344972098, "learning_rate": 1.99896562510522e-05, "loss": 1.3843, "step": 918 }, { "epoch": 0.04, "grad_norm": 1.471959636571595, "learning_rate": 1.9989585294463518e-05, "loss": 1.3091, "step": 919 }, { "epoch": 0.04, "grad_norm": 1.377240148762396, "learning_rate": 1.998951409545698e-05, "loss": 1.2275, "step": 920 }, { "epoch": 0.04, "grad_norm": 1.2101114339339933, "learning_rate": 1.9989442654034315e-05, "loss": 1.3799, "step": 921 }, { "epoch": 0.04, "grad_norm": 1.3648937558088416, "learning_rate": 1.9989370970197257e-05, "loss": 1.332, "step": 922 }, { "epoch": 0.04, "grad_norm": 1.702236328558717, "learning_rate": 1.998929904394754e-05, "loss": 1.3672, "step": 923 }, { "epoch": 0.04, "grad_norm": 1.6932267976650008, "learning_rate": 1.9989226875286913e-05, "loss": 1.4204, "step": 924 }, { "epoch": 0.04, "grad_norm": 0.9774939856923439, "learning_rate": 1.9989154464217128e-05, "loss": 1.252, "step": 925 }, { "epoch": 0.04, "grad_norm": 1.0934491069369945, "learning_rate": 1.9989081810739943e-05, "loss": 1.2261, "step": 926 }, { "epoch": 0.04, "grad_norm": 1.38808903886525, "learning_rate": 1.9989008914857115e-05, "loss": 1.4512, "step": 927 }, { "epoch": 0.04, "grad_norm": 1.3685048522869565, "learning_rate": 1.998893577657042e-05, "loss": 1.3633, "step": 928 }, { "epoch": 0.04, "grad_norm": 1.492911551380535, "learning_rate": 1.998886239588163e-05, "loss": 1.3516, "step": 929 }, { "epoch": 0.04, "grad_norm": 1.161716391501544, "learning_rate": 1.9988788772792523e-05, "loss": 1.2988, "step": 930 }, { "epoch": 0.04, "grad_norm": 1.3969554674670923, "learning_rate": 1.998871490730489e-05, "loss": 1.356, "step": 931 }, { "epoch": 0.04, "grad_norm": 1.2416667768295848, "learning_rate": 1.9988640799420524e-05, "loss": 0.9905, "step": 932 }, { "epoch": 0.04, "grad_norm": 1.3794216875013767, "learning_rate": 1.9988566449141223e-05, "loss": 1.3296, "step": 933 }, { "epoch": 0.04, "grad_norm": 1.3525392740303408, "learning_rate": 1.9988491856468785e-05, "loss": 1.3052, "step": 934 }, { "epoch": 0.04, "grad_norm": 1.4813995662256194, "learning_rate": 1.9988417021405027e-05, "loss": 1.2573, "step": 935 }, { "epoch": 0.05, "grad_norm": 1.5574296899146205, "learning_rate": 1.998834194395176e-05, "loss": 1.3887, "step": 936 }, { "epoch": 0.05, "grad_norm": 1.2777452053799485, "learning_rate": 1.9988266624110813e-05, "loss": 1.2407, "step": 937 }, { "epoch": 0.05, "grad_norm": 1.199796946396947, "learning_rate": 1.9988191061884005e-05, "loss": 1.2485, "step": 938 }, { "epoch": 0.05, "grad_norm": 1.5543231980934835, "learning_rate": 1.9988115257273176e-05, "loss": 1.3604, "step": 939 }, { "epoch": 0.05, "grad_norm": 1.4342811199367758, "learning_rate": 1.9988039210280167e-05, "loss": 1.3174, "step": 940 }, { "epoch": 0.05, "grad_norm": 1.2253449183968281, "learning_rate": 1.9987962920906816e-05, "loss": 1.3066, "step": 941 }, { "epoch": 0.05, "grad_norm": 1.061222547339291, "learning_rate": 1.998788638915498e-05, "loss": 1.3638, "step": 942 }, { "epoch": 0.05, "grad_norm": 1.2021105385477442, "learning_rate": 1.9987809615026513e-05, "loss": 1.3774, "step": 943 }, { "epoch": 0.05, "grad_norm": 1.25154549549052, "learning_rate": 1.998773259852328e-05, "loss": 1.1147, "step": 944 }, { "epoch": 0.05, "grad_norm": 1.3440440674566698, "learning_rate": 1.9987655339647153e-05, "loss": 1.415, "step": 945 }, { "epoch": 0.05, "grad_norm": 1.1973225464047557, "learning_rate": 1.99875778384e-05, "loss": 1.1968, "step": 946 }, { "epoch": 0.05, "grad_norm": 0.9558058021713994, "learning_rate": 1.998750009478371e-05, "loss": 1.3579, "step": 947 }, { "epoch": 0.05, "grad_norm": 1.152101376464106, "learning_rate": 1.9987422108800157e-05, "loss": 1.2856, "step": 948 }, { "epoch": 0.05, "grad_norm": 1.5544153708756336, "learning_rate": 1.998734388045125e-05, "loss": 1.3643, "step": 949 }, { "epoch": 0.05, "grad_norm": 1.6908365237396161, "learning_rate": 1.998726540973887e-05, "loss": 1.3037, "step": 950 }, { "epoch": 0.05, "grad_norm": 1.2068550869487893, "learning_rate": 1.9987186696664937e-05, "loss": 1.2026, "step": 951 }, { "epoch": 0.05, "grad_norm": 1.3372898382685539, "learning_rate": 1.9987107741231352e-05, "loss": 1.0938, "step": 952 }, { "epoch": 0.05, "grad_norm": 1.2692466474211135, "learning_rate": 1.9987028543440035e-05, "loss": 1.2944, "step": 953 }, { "epoch": 0.05, "grad_norm": 0.9370550324299994, "learning_rate": 1.9986949103292904e-05, "loss": 1.187, "step": 954 }, { "epoch": 0.05, "grad_norm": 1.2263779901944991, "learning_rate": 1.998686942079189e-05, "loss": 1.1904, "step": 955 }, { "epoch": 0.05, "grad_norm": 1.4355687661452312, "learning_rate": 1.9986789495938922e-05, "loss": 1.332, "step": 956 }, { "epoch": 0.05, "grad_norm": 1.0667375738621068, "learning_rate": 1.9986709328735944e-05, "loss": 1.3604, "step": 957 }, { "epoch": 0.05, "grad_norm": 0.8746824338351591, "learning_rate": 1.9986628919184904e-05, "loss": 1.2314, "step": 958 }, { "epoch": 0.05, "grad_norm": 1.59216158893926, "learning_rate": 1.9986548267287745e-05, "loss": 1.3662, "step": 959 }, { "epoch": 0.05, "grad_norm": 1.3438436318676739, "learning_rate": 1.998646737304643e-05, "loss": 1.2744, "step": 960 }, { "epoch": 0.05, "grad_norm": 1.2138084069927937, "learning_rate": 1.9986386236462926e-05, "loss": 1.4468, "step": 961 }, { "epoch": 0.05, "grad_norm": 1.120099811063583, "learning_rate": 1.998630485753919e-05, "loss": 1.2373, "step": 962 }, { "epoch": 0.05, "grad_norm": 1.1876037110201647, "learning_rate": 1.998622323627721e-05, "loss": 1.3599, "step": 963 }, { "epoch": 0.05, "grad_norm": 1.550154343452363, "learning_rate": 1.998614137267896e-05, "loss": 1.3286, "step": 964 }, { "epoch": 0.05, "grad_norm": 1.347473590275112, "learning_rate": 1.998605926674642e-05, "loss": 1.2539, "step": 965 }, { "epoch": 0.05, "grad_norm": 1.265254502880324, "learning_rate": 1.9985976918481595e-05, "loss": 1.3901, "step": 966 }, { "epoch": 0.05, "grad_norm": 1.384265812571524, "learning_rate": 1.9985894327886476e-05, "loss": 1.2671, "step": 967 }, { "epoch": 0.05, "grad_norm": 2.0054482477943014, "learning_rate": 1.9985811494963074e-05, "loss": 1.3184, "step": 968 }, { "epoch": 0.05, "grad_norm": 1.2901370282303524, "learning_rate": 1.9985728419713388e-05, "loss": 1.3042, "step": 969 }, { "epoch": 0.05, "grad_norm": 1.3917510134693893, "learning_rate": 1.998564510213944e-05, "loss": 1.2856, "step": 970 }, { "epoch": 0.05, "grad_norm": 1.5084494731023785, "learning_rate": 1.9985561542243256e-05, "loss": 1.2646, "step": 971 }, { "epoch": 0.05, "grad_norm": 1.2970310646779586, "learning_rate": 1.998547774002686e-05, "loss": 1.439, "step": 972 }, { "epoch": 0.05, "grad_norm": 1.8433551361630716, "learning_rate": 1.9985393695492283e-05, "loss": 1.5034, "step": 973 }, { "epoch": 0.05, "grad_norm": 1.4097486765759444, "learning_rate": 1.998530940864157e-05, "loss": 1.3135, "step": 974 }, { "epoch": 0.05, "grad_norm": 1.1637630868752868, "learning_rate": 1.998522487947676e-05, "loss": 1.3672, "step": 975 }, { "epoch": 0.05, "grad_norm": 1.328044158647447, "learning_rate": 1.9985140107999908e-05, "loss": 1.4014, "step": 976 }, { "epoch": 0.05, "grad_norm": 1.3562911381641412, "learning_rate": 1.9985055094213072e-05, "loss": 1.3242, "step": 977 }, { "epoch": 0.05, "grad_norm": 1.4796538737526004, "learning_rate": 1.9984969838118316e-05, "loss": 1.2754, "step": 978 }, { "epoch": 0.05, "grad_norm": 1.3693163340337076, "learning_rate": 1.9984884339717704e-05, "loss": 1.3291, "step": 979 }, { "epoch": 0.05, "grad_norm": 1.302252228077707, "learning_rate": 1.9984798599013315e-05, "loss": 1.3442, "step": 980 }, { "epoch": 0.05, "grad_norm": 1.447844203185186, "learning_rate": 1.9984712616007226e-05, "loss": 1.3872, "step": 981 }, { "epoch": 0.05, "grad_norm": 1.2426830482315911, "learning_rate": 1.998462639070153e-05, "loss": 1.293, "step": 982 }, { "epoch": 0.05, "grad_norm": 1.1556279308136506, "learning_rate": 1.998453992309831e-05, "loss": 1.1377, "step": 983 }, { "epoch": 0.05, "grad_norm": 1.159318491672372, "learning_rate": 1.9984453213199673e-05, "loss": 1.3896, "step": 984 }, { "epoch": 0.05, "grad_norm": 1.425591113281386, "learning_rate": 1.9984366261007716e-05, "loss": 1.2471, "step": 985 }, { "epoch": 0.05, "grad_norm": 1.2896206053205888, "learning_rate": 1.998427906652456e-05, "loss": 1.1997, "step": 986 }, { "epoch": 0.05, "grad_norm": 1.8103005747023315, "learning_rate": 1.9984191629752306e-05, "loss": 1.1523, "step": 987 }, { "epoch": 0.05, "grad_norm": 1.2809953192571162, "learning_rate": 1.998410395069309e-05, "loss": 1.3008, "step": 988 }, { "epoch": 0.05, "grad_norm": 1.2166003063089528, "learning_rate": 1.998401602934903e-05, "loss": 1.3438, "step": 989 }, { "epoch": 0.05, "grad_norm": 1.3678053627449558, "learning_rate": 1.9983927865722262e-05, "loss": 1.3481, "step": 990 }, { "epoch": 0.05, "grad_norm": 1.0182068184441464, "learning_rate": 1.9983839459814925e-05, "loss": 1.3633, "step": 991 }, { "epoch": 0.05, "grad_norm": 1.4960249567706507, "learning_rate": 1.998375081162917e-05, "loss": 1.3462, "step": 992 }, { "epoch": 0.05, "grad_norm": 1.3160283144370801, "learning_rate": 1.998366192116714e-05, "loss": 1.2021, "step": 993 }, { "epoch": 0.05, "grad_norm": 1.6112957595299955, "learning_rate": 1.9983572788431e-05, "loss": 1.4639, "step": 994 }, { "epoch": 0.05, "grad_norm": 1.4365455223312873, "learning_rate": 1.9983483413422907e-05, "loss": 1.2476, "step": 995 }, { "epoch": 0.05, "grad_norm": 1.426085820937853, "learning_rate": 1.998339379614503e-05, "loss": 1.3105, "step": 996 }, { "epoch": 0.05, "grad_norm": 1.1363633654065086, "learning_rate": 1.9983303936599553e-05, "loss": 1.2622, "step": 997 }, { "epoch": 0.05, "grad_norm": 1.4016170045063903, "learning_rate": 1.9983213834788643e-05, "loss": 1.3848, "step": 998 }, { "epoch": 0.05, "grad_norm": 0.9543836124841405, "learning_rate": 1.9983123490714492e-05, "loss": 1.2832, "step": 999 }, { "epoch": 0.05, "grad_norm": 1.3643368717839928, "learning_rate": 1.9983032904379296e-05, "loss": 1.2617, "step": 1000 }, { "epoch": 0.05, "grad_norm": 1.313726043429224, "learning_rate": 1.9982942075785247e-05, "loss": 1.2256, "step": 1001 }, { "epoch": 0.05, "grad_norm": 1.3126296883658601, "learning_rate": 1.9982851004934557e-05, "loss": 1.3042, "step": 1002 }, { "epoch": 0.05, "grad_norm": 1.4807697975938474, "learning_rate": 1.998275969182943e-05, "loss": 1.3877, "step": 1003 }, { "epoch": 0.05, "grad_norm": 0.5789813796548606, "learning_rate": 1.998266813647208e-05, "loss": 1.2773, "step": 1004 }, { "epoch": 0.05, "grad_norm": 1.2317452211289648, "learning_rate": 1.9982576338864738e-05, "loss": 1.4082, "step": 1005 }, { "epoch": 0.05, "grad_norm": 1.1037718118357647, "learning_rate": 1.9982484299009624e-05, "loss": 1.417, "step": 1006 }, { "epoch": 0.05, "grad_norm": 1.0666238869576732, "learning_rate": 1.9982392016908975e-05, "loss": 1.1797, "step": 1007 }, { "epoch": 0.05, "grad_norm": 1.5182293726400478, "learning_rate": 1.9982299492565028e-05, "loss": 1.3657, "step": 1008 }, { "epoch": 0.05, "grad_norm": 1.4607145946659057, "learning_rate": 1.9982206725980026e-05, "loss": 1.5088, "step": 1009 }, { "epoch": 0.05, "grad_norm": 1.087138765004644, "learning_rate": 1.9982113717156225e-05, "loss": 1.292, "step": 1010 }, { "epoch": 0.05, "grad_norm": 1.5472316566217363, "learning_rate": 1.9982020466095882e-05, "loss": 1.4028, "step": 1011 }, { "epoch": 0.05, "grad_norm": 1.4748639915074149, "learning_rate": 1.9981926972801257e-05, "loss": 1.3838, "step": 1012 }, { "epoch": 0.05, "grad_norm": 1.3029442810828897, "learning_rate": 1.998183323727462e-05, "loss": 1.1328, "step": 1013 }, { "epoch": 0.05, "grad_norm": 1.4078644633126354, "learning_rate": 1.9981739259518246e-05, "loss": 1.3418, "step": 1014 }, { "epoch": 0.05, "grad_norm": 1.288986994516389, "learning_rate": 1.9981645039534417e-05, "loss": 1.3682, "step": 1015 }, { "epoch": 0.05, "grad_norm": 0.9186980005404712, "learning_rate": 1.9981550577325417e-05, "loss": 1.01, "step": 1016 }, { "epoch": 0.05, "grad_norm": 1.0519750137112254, "learning_rate": 1.998145587289354e-05, "loss": 1.3555, "step": 1017 }, { "epoch": 0.05, "grad_norm": 1.2943064510542623, "learning_rate": 1.998136092624108e-05, "loss": 1.3589, "step": 1018 }, { "epoch": 0.05, "grad_norm": 1.3431852715908437, "learning_rate": 1.998126573737035e-05, "loss": 1.4341, "step": 1019 }, { "epoch": 0.05, "grad_norm": 1.017731704818828, "learning_rate": 1.9981170306283647e-05, "loss": 1.3594, "step": 1020 }, { "epoch": 0.05, "grad_norm": 1.279071739788195, "learning_rate": 1.99810746329833e-05, "loss": 1.2241, "step": 1021 }, { "epoch": 0.05, "grad_norm": 1.4187560427439891, "learning_rate": 1.998097871747162e-05, "loss": 1.1978, "step": 1022 }, { "epoch": 0.05, "grad_norm": 1.1759021008035346, "learning_rate": 1.9980882559750947e-05, "loss": 1.3184, "step": 1023 }, { "epoch": 0.05, "grad_norm": 1.6098924473742597, "learning_rate": 1.99807861598236e-05, "loss": 1.5024, "step": 1024 }, { "epoch": 0.05, "grad_norm": 1.2735511470590686, "learning_rate": 1.9980689517691928e-05, "loss": 1.166, "step": 1025 }, { "epoch": 0.05, "grad_norm": 1.458595476858746, "learning_rate": 1.9980592633358276e-05, "loss": 1.3711, "step": 1026 }, { "epoch": 0.05, "grad_norm": 1.3764389204819925, "learning_rate": 1.998049550682499e-05, "loss": 1.3354, "step": 1027 }, { "epoch": 0.05, "grad_norm": 1.1539612502573597, "learning_rate": 1.998039813809443e-05, "loss": 1.3408, "step": 1028 }, { "epoch": 0.05, "grad_norm": 1.7183748178002722, "learning_rate": 1.998030052716896e-05, "loss": 1.2192, "step": 1029 }, { "epoch": 0.05, "grad_norm": 0.9732579221380275, "learning_rate": 1.9980202674050945e-05, "loss": 1.2935, "step": 1030 }, { "epoch": 0.05, "grad_norm": 1.2033255681476656, "learning_rate": 1.9980104578742762e-05, "loss": 1.2563, "step": 1031 }, { "epoch": 0.05, "grad_norm": 1.2674235273687364, "learning_rate": 1.998000624124679e-05, "loss": 1.2842, "step": 1032 }, { "epoch": 0.05, "grad_norm": 1.5700028976483593, "learning_rate": 1.9979907661565418e-05, "loss": 1.4556, "step": 1033 }, { "epoch": 0.05, "grad_norm": 1.382150574205717, "learning_rate": 1.9979808839701037e-05, "loss": 1.4253, "step": 1034 }, { "epoch": 0.05, "grad_norm": 0.9809797771500793, "learning_rate": 1.9979709775656048e-05, "loss": 1.3013, "step": 1035 }, { "epoch": 0.05, "grad_norm": 1.4084645177251898, "learning_rate": 1.9979610469432847e-05, "loss": 1.2549, "step": 1036 }, { "epoch": 0.05, "grad_norm": 1.2275901556255258, "learning_rate": 1.997951092103385e-05, "loss": 1.1196, "step": 1037 }, { "epoch": 0.05, "grad_norm": 1.2991430386229006, "learning_rate": 1.9979411130461475e-05, "loss": 1.2676, "step": 1038 }, { "epoch": 0.05, "grad_norm": 1.3067145350572373, "learning_rate": 1.997931109771814e-05, "loss": 1.1641, "step": 1039 }, { "epoch": 0.05, "grad_norm": 1.2179772723047235, "learning_rate": 1.997921082280627e-05, "loss": 1.189, "step": 1040 }, { "epoch": 0.05, "grad_norm": 1.2364356944868469, "learning_rate": 1.99791103057283e-05, "loss": 1.2559, "step": 1041 }, { "epoch": 0.05, "grad_norm": 1.285349654671994, "learning_rate": 1.9979009546486675e-05, "loss": 1.3296, "step": 1042 }, { "epoch": 0.05, "grad_norm": 1.4674984757160858, "learning_rate": 1.9978908545083833e-05, "loss": 1.3198, "step": 1043 }, { "epoch": 0.05, "grad_norm": 1.3766417113687375, "learning_rate": 1.9978807301522226e-05, "loss": 1.1938, "step": 1044 }, { "epoch": 0.05, "grad_norm": 1.2660597611255526, "learning_rate": 1.9978705815804312e-05, "loss": 1.3877, "step": 1045 }, { "epoch": 0.05, "grad_norm": 1.4515855786147038, "learning_rate": 1.9978604087932557e-05, "loss": 1.2271, "step": 1046 }, { "epoch": 0.05, "grad_norm": 1.2249349618501544, "learning_rate": 1.997850211790943e-05, "loss": 1.4624, "step": 1047 }, { "epoch": 0.05, "grad_norm": 1.3737596195236799, "learning_rate": 1.9978399905737395e-05, "loss": 1.1377, "step": 1048 }, { "epoch": 0.05, "grad_norm": 1.410093234311074, "learning_rate": 1.9978297451418945e-05, "loss": 1.3413, "step": 1049 }, { "epoch": 0.05, "grad_norm": 1.385375473930903, "learning_rate": 1.9978194754956558e-05, "loss": 1.209, "step": 1050 }, { "epoch": 0.05, "grad_norm": 1.1556466116903177, "learning_rate": 1.997809181635273e-05, "loss": 1.1904, "step": 1051 }, { "epoch": 0.05, "grad_norm": 1.2832249875876516, "learning_rate": 1.9977988635609957e-05, "loss": 1.2808, "step": 1052 }, { "epoch": 0.05, "grad_norm": 1.3725402556608373, "learning_rate": 1.9977885212730745e-05, "loss": 1.1382, "step": 1053 }, { "epoch": 0.05, "grad_norm": 1.1436873448820741, "learning_rate": 1.9977781547717604e-05, "loss": 1.1851, "step": 1054 }, { "epoch": 0.05, "grad_norm": 1.2909307533555916, "learning_rate": 1.9977677640573048e-05, "loss": 1.353, "step": 1055 }, { "epoch": 0.05, "grad_norm": 1.3290123916926229, "learning_rate": 1.9977573491299597e-05, "loss": 1.2451, "step": 1056 }, { "epoch": 0.05, "grad_norm": 1.188744240707465, "learning_rate": 1.997746909989978e-05, "loss": 1.1792, "step": 1057 }, { "epoch": 0.05, "grad_norm": 1.5016420828983263, "learning_rate": 1.9977364466376135e-05, "loss": 1.2886, "step": 1058 }, { "epoch": 0.05, "grad_norm": 1.1400916703906432, "learning_rate": 1.9977259590731193e-05, "loss": 1.1101, "step": 1059 }, { "epoch": 0.05, "grad_norm": 1.0412176445321804, "learning_rate": 1.9977154472967504e-05, "loss": 1.2749, "step": 1060 }, { "epoch": 0.05, "grad_norm": 1.4585181162653846, "learning_rate": 1.9977049113087615e-05, "loss": 1.3857, "step": 1061 }, { "epoch": 0.05, "grad_norm": 1.148231450264873, "learning_rate": 1.997694351109409e-05, "loss": 1.4458, "step": 1062 }, { "epoch": 0.05, "grad_norm": 1.2899884577320724, "learning_rate": 1.9976837666989484e-05, "loss": 1.3418, "step": 1063 }, { "epoch": 0.05, "grad_norm": 1.3691101556164778, "learning_rate": 1.9976731580776373e-05, "loss": 1.1855, "step": 1064 }, { "epoch": 0.05, "grad_norm": 1.3425889477227961, "learning_rate": 1.9976625252457322e-05, "loss": 1.3872, "step": 1065 }, { "epoch": 0.05, "grad_norm": 1.4385858218654797, "learning_rate": 1.9976518682034917e-05, "loss": 1.2993, "step": 1066 }, { "epoch": 0.05, "grad_norm": 1.2327936107020323, "learning_rate": 1.9976411869511746e-05, "loss": 1.3301, "step": 1067 }, { "epoch": 0.05, "grad_norm": 1.8240592143896712, "learning_rate": 1.9976304814890396e-05, "loss": 1.6006, "step": 1068 }, { "epoch": 0.05, "grad_norm": 1.1715095353111415, "learning_rate": 1.997619751817347e-05, "loss": 1.2319, "step": 1069 }, { "epoch": 0.05, "grad_norm": 1.0711689194041374, "learning_rate": 1.9976089979363566e-05, "loss": 1.3857, "step": 1070 }, { "epoch": 0.05, "grad_norm": 1.521870325509901, "learning_rate": 1.99759821984633e-05, "loss": 1.2783, "step": 1071 }, { "epoch": 0.05, "grad_norm": 1.182169956894781, "learning_rate": 1.9975874175475284e-05, "loss": 1.3784, "step": 1072 }, { "epoch": 0.05, "grad_norm": 1.2241119043581783, "learning_rate": 1.997576591040214e-05, "loss": 1.2466, "step": 1073 }, { "epoch": 0.05, "grad_norm": 1.469065139808045, "learning_rate": 1.9975657403246492e-05, "loss": 1.4126, "step": 1074 }, { "epoch": 0.05, "grad_norm": 1.2721829997540537, "learning_rate": 1.9975548654010978e-05, "loss": 1.0464, "step": 1075 }, { "epoch": 0.05, "grad_norm": 1.205158339062666, "learning_rate": 1.9975439662698234e-05, "loss": 1.0979, "step": 1076 }, { "epoch": 0.05, "grad_norm": 1.4595869557900465, "learning_rate": 1.9975330429310908e-05, "loss": 1.3276, "step": 1077 }, { "epoch": 0.05, "grad_norm": 1.252300005952983, "learning_rate": 1.9975220953851648e-05, "loss": 1.2217, "step": 1078 }, { "epoch": 0.05, "grad_norm": 1.5120549022982526, "learning_rate": 1.9975111236323112e-05, "loss": 1.2842, "step": 1079 }, { "epoch": 0.05, "grad_norm": 1.6142693724098107, "learning_rate": 1.997500127672796e-05, "loss": 1.4463, "step": 1080 }, { "epoch": 0.05, "grad_norm": 1.47708318024681, "learning_rate": 1.9974891075068864e-05, "loss": 1.4375, "step": 1081 }, { "epoch": 0.05, "grad_norm": 1.079179240068141, "learning_rate": 1.9974780631348495e-05, "loss": 1.3496, "step": 1082 }, { "epoch": 0.05, "grad_norm": 1.1515354403904188, "learning_rate": 1.997466994556954e-05, "loss": 1.0122, "step": 1083 }, { "epoch": 0.05, "grad_norm": 1.3026402130868953, "learning_rate": 1.9974559017734676e-05, "loss": 1.2588, "step": 1084 }, { "epoch": 0.05, "grad_norm": 0.9327533019718179, "learning_rate": 1.99744478478466e-05, "loss": 1.5537, "step": 1085 }, { "epoch": 0.05, "grad_norm": 1.3194511426117472, "learning_rate": 1.9974336435908005e-05, "loss": 1.3076, "step": 1086 }, { "epoch": 0.05, "grad_norm": 0.8997824308649448, "learning_rate": 1.99742247819216e-05, "loss": 1.1567, "step": 1087 }, { "epoch": 0.05, "grad_norm": 1.097157962495064, "learning_rate": 1.9974112885890094e-05, "loss": 1.2153, "step": 1088 }, { "epoch": 0.05, "grad_norm": 1.720646379252307, "learning_rate": 1.9974000747816203e-05, "loss": 1.415, "step": 1089 }, { "epoch": 0.05, "grad_norm": 1.4475660506494223, "learning_rate": 1.9973888367702643e-05, "loss": 1.23, "step": 1090 }, { "epoch": 0.05, "grad_norm": 1.1928797359440777, "learning_rate": 1.9973775745552146e-05, "loss": 1.2847, "step": 1091 }, { "epoch": 0.05, "grad_norm": 1.2121004372091027, "learning_rate": 1.9973662881367442e-05, "loss": 1.4785, "step": 1092 }, { "epoch": 0.05, "grad_norm": 1.388496467105192, "learning_rate": 1.9973549775151273e-05, "loss": 1.2998, "step": 1093 }, { "epoch": 0.05, "grad_norm": 1.2015814484511962, "learning_rate": 1.997343642690638e-05, "loss": 1.3418, "step": 1094 }, { "epoch": 0.05, "grad_norm": 1.2580049448353066, "learning_rate": 1.9973322836635517e-05, "loss": 1.2227, "step": 1095 }, { "epoch": 0.05, "grad_norm": 1.5166618484331251, "learning_rate": 1.9973209004341442e-05, "loss": 1.312, "step": 1096 }, { "epoch": 0.05, "grad_norm": 1.5346874023872539, "learning_rate": 1.997309493002691e-05, "loss": 1.2261, "step": 1097 }, { "epoch": 0.05, "grad_norm": 1.0903940761602384, "learning_rate": 1.9972980613694698e-05, "loss": 1.1782, "step": 1098 }, { "epoch": 0.05, "grad_norm": 1.4082155893939632, "learning_rate": 1.9972866055347572e-05, "loss": 1.0967, "step": 1099 }, { "epoch": 0.05, "grad_norm": 1.3086608062767373, "learning_rate": 1.9972751254988317e-05, "loss": 1.3193, "step": 1100 }, { "epoch": 0.05, "grad_norm": 1.420563449658825, "learning_rate": 1.997263621261972e-05, "loss": 1.2358, "step": 1101 }, { "epoch": 0.05, "grad_norm": 1.4396599528523966, "learning_rate": 1.997252092824457e-05, "loss": 1.3574, "step": 1102 }, { "epoch": 0.05, "grad_norm": 1.2443797362079498, "learning_rate": 1.9972405401865663e-05, "loss": 1.3032, "step": 1103 }, { "epoch": 0.05, "grad_norm": 1.3214306726538065, "learning_rate": 1.997228963348581e-05, "loss": 1.1357, "step": 1104 }, { "epoch": 0.05, "grad_norm": 1.5002109099648913, "learning_rate": 1.997217362310781e-05, "loss": 1.3071, "step": 1105 }, { "epoch": 0.05, "grad_norm": 1.1375524972364377, "learning_rate": 1.9972057370734483e-05, "loss": 1.1943, "step": 1106 }, { "epoch": 0.05, "grad_norm": 1.5354869028515123, "learning_rate": 1.9971940876368653e-05, "loss": 1.3135, "step": 1107 }, { "epoch": 0.05, "grad_norm": 1.3774621372482108, "learning_rate": 1.9971824140013143e-05, "loss": 1.3682, "step": 1108 }, { "epoch": 0.05, "grad_norm": 1.2979932882725553, "learning_rate": 1.9971707161670786e-05, "loss": 1.334, "step": 1109 }, { "epoch": 0.05, "grad_norm": 1.083177672941417, "learning_rate": 1.997158994134442e-05, "loss": 1.2217, "step": 1110 }, { "epoch": 0.05, "grad_norm": 1.2409487754649744, "learning_rate": 1.99714724790369e-05, "loss": 1.3159, "step": 1111 }, { "epoch": 0.05, "grad_norm": 1.125500447027702, "learning_rate": 1.9971354774751063e-05, "loss": 1.3516, "step": 1112 }, { "epoch": 0.05, "grad_norm": 1.4519523497960383, "learning_rate": 1.9971236828489768e-05, "loss": 1.1709, "step": 1113 }, { "epoch": 0.05, "grad_norm": 1.5074835114732652, "learning_rate": 1.9971118640255883e-05, "loss": 1.3833, "step": 1114 }, { "epoch": 0.05, "grad_norm": 1.5595971352011893, "learning_rate": 1.997100021005227e-05, "loss": 1.3818, "step": 1115 }, { "epoch": 0.05, "grad_norm": 1.5218119961028296, "learning_rate": 1.997088153788181e-05, "loss": 1.3252, "step": 1116 }, { "epoch": 0.05, "grad_norm": 1.6747049093004815, "learning_rate": 1.9970762623747373e-05, "loss": 1.3853, "step": 1117 }, { "epoch": 0.05, "grad_norm": 0.7828877955130968, "learning_rate": 1.9970643467651853e-05, "loss": 1.1121, "step": 1118 }, { "epoch": 0.05, "grad_norm": 1.3288959626352281, "learning_rate": 1.9970524069598136e-05, "loss": 1.186, "step": 1119 }, { "epoch": 0.05, "grad_norm": 1.5918321671779148, "learning_rate": 1.9970404429589126e-05, "loss": 1.1863, "step": 1120 }, { "epoch": 0.05, "grad_norm": 1.6003154680307907, "learning_rate": 1.997028454762772e-05, "loss": 1.5312, "step": 1121 }, { "epoch": 0.05, "grad_norm": 1.3116972355784613, "learning_rate": 1.997016442371683e-05, "loss": 1.4238, "step": 1122 }, { "epoch": 0.05, "grad_norm": 1.3878890923202327, "learning_rate": 1.997004405785937e-05, "loss": 1.2744, "step": 1123 }, { "epoch": 0.05, "grad_norm": 0.9714710554799669, "learning_rate": 1.9969923450058264e-05, "loss": 1.2695, "step": 1124 }, { "epoch": 0.05, "grad_norm": 1.2695514060839288, "learning_rate": 1.9969802600316433e-05, "loss": 1.2212, "step": 1125 }, { "epoch": 0.05, "grad_norm": 1.5624113266494528, "learning_rate": 1.9969681508636814e-05, "loss": 1.3369, "step": 1126 }, { "epoch": 0.05, "grad_norm": 1.3926226602763765, "learning_rate": 1.9969560175022343e-05, "loss": 1.3213, "step": 1127 }, { "epoch": 0.05, "grad_norm": 1.1318689410201102, "learning_rate": 1.996943859947597e-05, "loss": 1.3164, "step": 1128 }, { "epoch": 0.05, "grad_norm": 1.2810954468263356, "learning_rate": 1.996931678200064e-05, "loss": 1.186, "step": 1129 }, { "epoch": 0.05, "grad_norm": 0.9818086438479263, "learning_rate": 1.996919472259931e-05, "loss": 1.2148, "step": 1130 }, { "epoch": 0.05, "grad_norm": 1.3148385191747365, "learning_rate": 1.996907242127494e-05, "loss": 1.1277, "step": 1131 }, { "epoch": 0.05, "grad_norm": 1.367787033337665, "learning_rate": 1.9968949878030503e-05, "loss": 1.1631, "step": 1132 }, { "epoch": 0.05, "grad_norm": 1.2825564224474144, "learning_rate": 1.996882709286897e-05, "loss": 1.2852, "step": 1133 }, { "epoch": 0.05, "grad_norm": 1.2546059153143287, "learning_rate": 1.996870406579332e-05, "loss": 1.2817, "step": 1134 }, { "epoch": 0.05, "grad_norm": 1.2373774497817236, "learning_rate": 1.9968580796806542e-05, "loss": 1.373, "step": 1135 }, { "epoch": 0.05, "grad_norm": 1.3912403623737093, "learning_rate": 1.9968457285911624e-05, "loss": 1.2969, "step": 1136 }, { "epoch": 0.05, "grad_norm": 1.230223280602699, "learning_rate": 1.996833353311156e-05, "loss": 1.2324, "step": 1137 }, { "epoch": 0.05, "grad_norm": 1.1424821971684167, "learning_rate": 1.996820953840936e-05, "loss": 1.3501, "step": 1138 }, { "epoch": 0.05, "grad_norm": 1.2089742676861317, "learning_rate": 1.9968085301808026e-05, "loss": 1.2222, "step": 1139 }, { "epoch": 0.05, "grad_norm": 1.3259714498879585, "learning_rate": 1.996796082331058e-05, "loss": 1.2773, "step": 1140 }, { "epoch": 0.05, "grad_norm": 1.2567871202981753, "learning_rate": 1.9967836102920043e-05, "loss": 1.2578, "step": 1141 }, { "epoch": 0.05, "grad_norm": 1.4823825681370333, "learning_rate": 1.996771114063943e-05, "loss": 1.3818, "step": 1142 }, { "epoch": 0.05, "grad_norm": 1.3685625825721024, "learning_rate": 1.996758593647179e-05, "loss": 1.3789, "step": 1143 }, { "epoch": 0.06, "grad_norm": 1.4202231952290987, "learning_rate": 1.996746049042015e-05, "loss": 1.4033, "step": 1144 }, { "epoch": 0.06, "grad_norm": 0.9503595399266787, "learning_rate": 1.9967334802487553e-05, "loss": 1.1909, "step": 1145 }, { "epoch": 0.06, "grad_norm": 1.22782020882523, "learning_rate": 1.996720887267706e-05, "loss": 1.2036, "step": 1146 }, { "epoch": 0.06, "grad_norm": 0.9689495329997446, "learning_rate": 1.9967082700991712e-05, "loss": 1.1216, "step": 1147 }, { "epoch": 0.06, "grad_norm": 1.2876398434548313, "learning_rate": 1.9966956287434586e-05, "loss": 1.269, "step": 1148 }, { "epoch": 0.06, "grad_norm": 1.2892987134470477, "learning_rate": 1.996682963200874e-05, "loss": 1.2012, "step": 1149 }, { "epoch": 0.06, "grad_norm": 1.7091973788178347, "learning_rate": 1.9966702734717248e-05, "loss": 1.3086, "step": 1150 }, { "epoch": 0.06, "grad_norm": 1.3708082196985756, "learning_rate": 1.9966575595563195e-05, "loss": 1.1763, "step": 1151 }, { "epoch": 0.06, "grad_norm": 1.2515434593652914, "learning_rate": 1.996644821454966e-05, "loss": 1.2568, "step": 1152 }, { "epoch": 0.06, "grad_norm": 1.0946773935871086, "learning_rate": 1.996632059167974e-05, "loss": 1.2261, "step": 1153 }, { "epoch": 0.06, "grad_norm": 1.2157228863881675, "learning_rate": 1.996619272695653e-05, "loss": 1.1108, "step": 1154 }, { "epoch": 0.06, "grad_norm": 1.6932113416384522, "learning_rate": 1.996606462038313e-05, "loss": 1.3428, "step": 1155 }, { "epoch": 0.06, "grad_norm": 1.4647696001651183, "learning_rate": 1.9965936271962652e-05, "loss": 1.2695, "step": 1156 }, { "epoch": 0.06, "grad_norm": 1.2295350215446725, "learning_rate": 1.9965807681698208e-05, "loss": 1.2803, "step": 1157 }, { "epoch": 0.06, "grad_norm": 1.373098498516551, "learning_rate": 1.996567884959292e-05, "loss": 1.1987, "step": 1158 }, { "epoch": 0.06, "grad_norm": 1.5807001091777488, "learning_rate": 1.9965549775649914e-05, "loss": 1.3135, "step": 1159 }, { "epoch": 0.06, "grad_norm": 1.3516132384332489, "learning_rate": 1.9965420459872325e-05, "loss": 1.3462, "step": 1160 }, { "epoch": 0.06, "grad_norm": 0.9376380215771913, "learning_rate": 1.9965290902263286e-05, "loss": 1.1313, "step": 1161 }, { "epoch": 0.06, "grad_norm": 1.5192124577801223, "learning_rate": 1.9965161102825944e-05, "loss": 1.2974, "step": 1162 }, { "epoch": 0.06, "grad_norm": 1.216461988499304, "learning_rate": 1.996503106156345e-05, "loss": 1.1021, "step": 1163 }, { "epoch": 0.06, "grad_norm": 1.3185857008412607, "learning_rate": 1.9964900778478958e-05, "loss": 1.1602, "step": 1164 }, { "epoch": 0.06, "grad_norm": 1.3167107658086623, "learning_rate": 1.996477025357563e-05, "loss": 1.3154, "step": 1165 }, { "epoch": 0.06, "grad_norm": 1.533119646432664, "learning_rate": 1.9964639486856634e-05, "loss": 1.3481, "step": 1166 }, { "epoch": 0.06, "grad_norm": 1.3475747914470217, "learning_rate": 1.996450847832514e-05, "loss": 1.3926, "step": 1167 }, { "epoch": 0.06, "grad_norm": 1.4695003677549798, "learning_rate": 1.996437722798433e-05, "loss": 1.3618, "step": 1168 }, { "epoch": 0.06, "grad_norm": 1.5164983395177585, "learning_rate": 1.996424573583739e-05, "loss": 1.2734, "step": 1169 }, { "epoch": 0.06, "grad_norm": 1.1819627237885333, "learning_rate": 1.996411400188751e-05, "loss": 1.4136, "step": 1170 }, { "epoch": 0.06, "grad_norm": 1.3064104902561453, "learning_rate": 1.9963982026137886e-05, "loss": 1.3745, "step": 1171 }, { "epoch": 0.06, "grad_norm": 1.2216111937511662, "learning_rate": 1.9963849808591723e-05, "loss": 1.2139, "step": 1172 }, { "epoch": 0.06, "grad_norm": 1.3494338413905522, "learning_rate": 1.9963717349252226e-05, "loss": 1.3394, "step": 1173 }, { "epoch": 0.06, "grad_norm": 1.5671959441253445, "learning_rate": 1.996358464812261e-05, "loss": 1.2563, "step": 1174 }, { "epoch": 0.06, "grad_norm": 1.4628730401414831, "learning_rate": 1.99634517052061e-05, "loss": 1.3271, "step": 1175 }, { "epoch": 0.06, "grad_norm": 1.1846291370121216, "learning_rate": 1.9963318520505915e-05, "loss": 1.1338, "step": 1176 }, { "epoch": 0.06, "grad_norm": 1.1379832590739252, "learning_rate": 1.9963185094025293e-05, "loss": 1.3252, "step": 1177 }, { "epoch": 0.06, "grad_norm": 1.07582844654449, "learning_rate": 1.996305142576747e-05, "loss": 1.3032, "step": 1178 }, { "epoch": 0.06, "grad_norm": 1.4479034270947602, "learning_rate": 1.9962917515735686e-05, "loss": 1.4873, "step": 1179 }, { "epoch": 0.06, "grad_norm": 1.291283301212534, "learning_rate": 1.9962783363933193e-05, "loss": 1.335, "step": 1180 }, { "epoch": 0.06, "grad_norm": 1.1647519211756345, "learning_rate": 1.996264897036325e-05, "loss": 1.3672, "step": 1181 }, { "epoch": 0.06, "grad_norm": 1.3937575401335844, "learning_rate": 1.9962514335029116e-05, "loss": 1.271, "step": 1182 }, { "epoch": 0.06, "grad_norm": 1.2190172506548969, "learning_rate": 1.9962379457934058e-05, "loss": 1.125, "step": 1183 }, { "epoch": 0.06, "grad_norm": 1.5111536041203253, "learning_rate": 1.9962244339081347e-05, "loss": 1.335, "step": 1184 }, { "epoch": 0.06, "grad_norm": 1.239181429722639, "learning_rate": 1.9962108978474265e-05, "loss": 1.1531, "step": 1185 }, { "epoch": 0.06, "grad_norm": 1.3292158173489255, "learning_rate": 1.9961973376116096e-05, "loss": 1.3589, "step": 1186 }, { "epoch": 0.06, "grad_norm": 1.4756829849163433, "learning_rate": 1.996183753201013e-05, "loss": 1.2295, "step": 1187 }, { "epoch": 0.06, "grad_norm": 1.2533272314373796, "learning_rate": 1.996170144615966e-05, "loss": 1.1562, "step": 1188 }, { "epoch": 0.06, "grad_norm": 1.195261892495741, "learning_rate": 1.9961565118567997e-05, "loss": 1.209, "step": 1189 }, { "epoch": 0.06, "grad_norm": 1.4115696169291663, "learning_rate": 1.9961428549238445e-05, "loss": 1.3379, "step": 1190 }, { "epoch": 0.06, "grad_norm": 1.0946129814718788, "learning_rate": 1.9961291738174316e-05, "loss": 1.2041, "step": 1191 }, { "epoch": 0.06, "grad_norm": 0.9573481612574324, "learning_rate": 1.996115468537893e-05, "loss": 1.3262, "step": 1192 }, { "epoch": 0.06, "grad_norm": 1.424991619755627, "learning_rate": 1.9961017390855616e-05, "loss": 1.417, "step": 1193 }, { "epoch": 0.06, "grad_norm": 1.1350965708477252, "learning_rate": 1.9960879854607702e-05, "loss": 1.2861, "step": 1194 }, { "epoch": 0.06, "grad_norm": 1.0689624805928288, "learning_rate": 1.996074207663853e-05, "loss": 1.3394, "step": 1195 }, { "epoch": 0.06, "grad_norm": 1.5539093781707936, "learning_rate": 1.9960604056951444e-05, "loss": 1.2012, "step": 1196 }, { "epoch": 0.06, "grad_norm": 1.225275624668329, "learning_rate": 1.9960465795549787e-05, "loss": 1.2031, "step": 1197 }, { "epoch": 0.06, "grad_norm": 1.473099900938001, "learning_rate": 1.996032729243692e-05, "loss": 1.4087, "step": 1198 }, { "epoch": 0.06, "grad_norm": 1.3366495219677799, "learning_rate": 1.99601885476162e-05, "loss": 1.3208, "step": 1199 }, { "epoch": 0.06, "grad_norm": 1.147001149538042, "learning_rate": 1.9960049561090995e-05, "loss": 1.2168, "step": 1200 }, { "epoch": 0.06, "grad_norm": 1.0803206246913872, "learning_rate": 1.995991033286468e-05, "loss": 1.1743, "step": 1201 }, { "epoch": 0.06, "grad_norm": 1.4648990783925437, "learning_rate": 1.9959770862940632e-05, "loss": 1.1528, "step": 1202 }, { "epoch": 0.06, "grad_norm": 1.3083019976676895, "learning_rate": 1.9959631151322235e-05, "loss": 1.0869, "step": 1203 }, { "epoch": 0.06, "grad_norm": 1.2110769472077494, "learning_rate": 1.995949119801288e-05, "loss": 1.0327, "step": 1204 }, { "epoch": 0.06, "grad_norm": 1.2123005534772617, "learning_rate": 1.995935100301597e-05, "loss": 1.2925, "step": 1205 }, { "epoch": 0.06, "grad_norm": 1.2951882506699464, "learning_rate": 1.9959210566334893e-05, "loss": 1.3853, "step": 1206 }, { "epoch": 0.06, "grad_norm": 1.1243474873559611, "learning_rate": 1.9959069887973067e-05, "loss": 1.2036, "step": 1207 }, { "epoch": 0.06, "grad_norm": 1.2277240140640617, "learning_rate": 1.9958928967933903e-05, "loss": 1.2793, "step": 1208 }, { "epoch": 0.06, "grad_norm": 1.3279887764039047, "learning_rate": 1.995878780622082e-05, "loss": 1.4385, "step": 1209 }, { "epoch": 0.06, "grad_norm": 1.370675130590996, "learning_rate": 1.9958646402837247e-05, "loss": 1.3022, "step": 1210 }, { "epoch": 0.06, "grad_norm": 1.1271506053195537, "learning_rate": 1.995850475778661e-05, "loss": 1.3257, "step": 1211 }, { "epoch": 0.06, "grad_norm": 1.1631018990266622, "learning_rate": 1.9958362871072353e-05, "loss": 1.2949, "step": 1212 }, { "epoch": 0.06, "grad_norm": 1.3891065799920526, "learning_rate": 1.9958220742697915e-05, "loss": 1.0747, "step": 1213 }, { "epoch": 0.06, "grad_norm": 1.470306328225646, "learning_rate": 1.995807837266674e-05, "loss": 1.2344, "step": 1214 }, { "epoch": 0.06, "grad_norm": 1.278263299225308, "learning_rate": 1.9957935760982297e-05, "loss": 1.207, "step": 1215 }, { "epoch": 0.06, "grad_norm": 0.9830605949860025, "learning_rate": 1.9957792907648033e-05, "loss": 1.3228, "step": 1216 }, { "epoch": 0.06, "grad_norm": 1.476941127801184, "learning_rate": 1.9957649812667422e-05, "loss": 1.3149, "step": 1217 }, { "epoch": 0.06, "grad_norm": 1.2887060110366275, "learning_rate": 1.9957506476043934e-05, "loss": 1.314, "step": 1218 }, { "epoch": 0.06, "grad_norm": 1.2211380058914756, "learning_rate": 1.995736289778105e-05, "loss": 1.1582, "step": 1219 }, { "epoch": 0.06, "grad_norm": 1.5017902688747176, "learning_rate": 1.9957219077882245e-05, "loss": 1.3203, "step": 1220 }, { "epoch": 0.06, "grad_norm": 1.3104990621999943, "learning_rate": 1.9957075016351023e-05, "loss": 1.1719, "step": 1221 }, { "epoch": 0.06, "grad_norm": 1.559348620012353, "learning_rate": 1.995693071319087e-05, "loss": 1.2964, "step": 1222 }, { "epoch": 0.06, "grad_norm": 1.2221872960263491, "learning_rate": 1.995678616840529e-05, "loss": 1.3496, "step": 1223 }, { "epoch": 0.06, "grad_norm": 1.4867455771993463, "learning_rate": 1.9956641381997795e-05, "loss": 1.3628, "step": 1224 }, { "epoch": 0.06, "grad_norm": 1.312437079989776, "learning_rate": 1.9956496353971894e-05, "loss": 1.1685, "step": 1225 }, { "epoch": 0.06, "grad_norm": 1.400428692330916, "learning_rate": 1.995635108433111e-05, "loss": 1.27, "step": 1226 }, { "epoch": 0.06, "grad_norm": 1.39554428381017, "learning_rate": 1.995620557307896e-05, "loss": 1.4028, "step": 1227 }, { "epoch": 0.06, "grad_norm": 1.4300370941024458, "learning_rate": 1.9956059820218982e-05, "loss": 1.3345, "step": 1228 }, { "epoch": 0.06, "grad_norm": 1.4501762115521841, "learning_rate": 1.9955913825754713e-05, "loss": 1.2832, "step": 1229 }, { "epoch": 0.06, "grad_norm": 1.263179748375819, "learning_rate": 1.9955767589689697e-05, "loss": 1.2065, "step": 1230 }, { "epoch": 0.06, "grad_norm": 1.4508581161707867, "learning_rate": 1.9955621112027476e-05, "loss": 1.1113, "step": 1231 }, { "epoch": 0.06, "grad_norm": 1.2902706477353545, "learning_rate": 1.995547439277161e-05, "loss": 1.2368, "step": 1232 }, { "epoch": 0.06, "grad_norm": 1.1376601519827074, "learning_rate": 1.9955327431925663e-05, "loss": 1.2046, "step": 1233 }, { "epoch": 0.06, "grad_norm": 1.1354097356870878, "learning_rate": 1.9955180229493193e-05, "loss": 1.1265, "step": 1234 }, { "epoch": 0.06, "grad_norm": 1.368987275813263, "learning_rate": 1.9955032785477778e-05, "loss": 1.207, "step": 1235 }, { "epoch": 0.06, "grad_norm": 1.2637365078810723, "learning_rate": 1.9954885099882992e-05, "loss": 1.2427, "step": 1236 }, { "epoch": 0.06, "grad_norm": 1.3107876140061971, "learning_rate": 1.9954737172712422e-05, "loss": 1.4326, "step": 1237 }, { "epoch": 0.06, "grad_norm": 1.042413046016357, "learning_rate": 1.9954589003969657e-05, "loss": 1.2153, "step": 1238 }, { "epoch": 0.06, "grad_norm": 1.16453364319218, "learning_rate": 1.9954440593658294e-05, "loss": 1.2295, "step": 1239 }, { "epoch": 0.06, "grad_norm": 1.4867303837130328, "learning_rate": 1.995429194178193e-05, "loss": 1.3428, "step": 1240 }, { "epoch": 0.06, "grad_norm": 1.3543670934561156, "learning_rate": 1.9954143048344175e-05, "loss": 1.2729, "step": 1241 }, { "epoch": 0.06, "grad_norm": 1.490986306613243, "learning_rate": 1.995399391334864e-05, "loss": 1.0986, "step": 1242 }, { "epoch": 0.06, "grad_norm": 1.6485574754422299, "learning_rate": 1.995384453679895e-05, "loss": 1.4683, "step": 1243 }, { "epoch": 0.06, "grad_norm": 1.6474747997173635, "learning_rate": 1.9953694918698726e-05, "loss": 1.3154, "step": 1244 }, { "epoch": 0.06, "grad_norm": 1.298660024384456, "learning_rate": 1.99535450590516e-05, "loss": 1.2156, "step": 1245 }, { "epoch": 0.06, "grad_norm": 1.4061305508015278, "learning_rate": 1.9953394957861206e-05, "loss": 1.3213, "step": 1246 }, { "epoch": 0.06, "grad_norm": 1.1603509108889312, "learning_rate": 1.9953244615131187e-05, "loss": 1.1973, "step": 1247 }, { "epoch": 0.06, "grad_norm": 1.2727357328249131, "learning_rate": 1.9953094030865197e-05, "loss": 1.2554, "step": 1248 }, { "epoch": 0.06, "grad_norm": 1.0464557773522094, "learning_rate": 1.995294320506688e-05, "loss": 1.2686, "step": 1249 }, { "epoch": 0.06, "grad_norm": 1.286866906481843, "learning_rate": 1.9952792137739908e-05, "loss": 1.2329, "step": 1250 }, { "epoch": 0.06, "grad_norm": 1.7802029262338075, "learning_rate": 1.995264082888794e-05, "loss": 1.3354, "step": 1251 }, { "epoch": 0.06, "grad_norm": 1.3584798556192952, "learning_rate": 1.9952489278514644e-05, "loss": 1.3096, "step": 1252 }, { "epoch": 0.06, "grad_norm": 1.1536646876550822, "learning_rate": 1.9952337486623704e-05, "loss": 1.2271, "step": 1253 }, { "epoch": 0.06, "grad_norm": 1.1733279836514077, "learning_rate": 1.9952185453218803e-05, "loss": 1.251, "step": 1254 }, { "epoch": 0.06, "grad_norm": 0.9328158037883202, "learning_rate": 1.9952033178303632e-05, "loss": 1.1709, "step": 1255 }, { "epoch": 0.06, "grad_norm": 1.2231153708462144, "learning_rate": 1.995188066188188e-05, "loss": 1.1377, "step": 1256 }, { "epoch": 0.06, "grad_norm": 1.365406926993071, "learning_rate": 1.9951727903957252e-05, "loss": 0.9807, "step": 1257 }, { "epoch": 0.06, "grad_norm": 1.393741522021695, "learning_rate": 1.9951574904533456e-05, "loss": 1.124, "step": 1258 }, { "epoch": 0.06, "grad_norm": 1.1703974884170252, "learning_rate": 1.9951421663614204e-05, "loss": 1.3628, "step": 1259 }, { "epoch": 0.06, "grad_norm": 1.3550930677001618, "learning_rate": 1.9951268181203213e-05, "loss": 1.4443, "step": 1260 }, { "epoch": 0.06, "grad_norm": 1.2995875089008637, "learning_rate": 1.995111445730421e-05, "loss": 1.4395, "step": 1261 }, { "epoch": 0.06, "grad_norm": 1.6041301104242178, "learning_rate": 1.9950960491920923e-05, "loss": 1.3838, "step": 1262 }, { "epoch": 0.06, "grad_norm": 1.3617842935784956, "learning_rate": 1.9950806285057092e-05, "loss": 1.4146, "step": 1263 }, { "epoch": 0.06, "grad_norm": 1.1917380968021707, "learning_rate": 1.9950651836716453e-05, "loss": 1.2012, "step": 1264 }, { "epoch": 0.06, "grad_norm": 1.4911871138398949, "learning_rate": 1.9950497146902757e-05, "loss": 1.3071, "step": 1265 }, { "epoch": 0.06, "grad_norm": 1.3596208582085392, "learning_rate": 1.9950342215619764e-05, "loss": 1.3115, "step": 1266 }, { "epoch": 0.06, "grad_norm": 1.2542492962960106, "learning_rate": 1.9950187042871226e-05, "loss": 1.3838, "step": 1267 }, { "epoch": 0.06, "grad_norm": 1.2812399628302293, "learning_rate": 1.995003162866091e-05, "loss": 1.208, "step": 1268 }, { "epoch": 0.06, "grad_norm": 1.297500216398593, "learning_rate": 1.994987597299259e-05, "loss": 1.082, "step": 1269 }, { "epoch": 0.06, "grad_norm": 1.3053800157913977, "learning_rate": 1.994972007587004e-05, "loss": 1.3872, "step": 1270 }, { "epoch": 0.06, "grad_norm": 1.5337097542258213, "learning_rate": 1.9949563937297045e-05, "loss": 1.3774, "step": 1271 }, { "epoch": 0.06, "grad_norm": 1.125004238135324, "learning_rate": 1.9949407557277394e-05, "loss": 1.3018, "step": 1272 }, { "epoch": 0.06, "grad_norm": 1.2074938600316385, "learning_rate": 1.9949250935814884e-05, "loss": 1.2852, "step": 1273 }, { "epoch": 0.06, "grad_norm": 1.4033995706433413, "learning_rate": 1.994909407291331e-05, "loss": 1.1458, "step": 1274 }, { "epoch": 0.06, "grad_norm": 1.2663365713129235, "learning_rate": 1.9948936968576483e-05, "loss": 1.2007, "step": 1275 }, { "epoch": 0.06, "grad_norm": 1.9732730010784716, "learning_rate": 1.9948779622808215e-05, "loss": 1.5381, "step": 1276 }, { "epoch": 0.06, "grad_norm": 1.2967888900004794, "learning_rate": 1.9948622035612326e-05, "loss": 1.167, "step": 1277 }, { "epoch": 0.06, "grad_norm": 1.0589116558130922, "learning_rate": 1.9948464206992635e-05, "loss": 1.2319, "step": 1278 }, { "epoch": 0.06, "grad_norm": 1.371921469653475, "learning_rate": 1.9948306136952976e-05, "loss": 1.3428, "step": 1279 }, { "epoch": 0.06, "grad_norm": 1.2649379702790597, "learning_rate": 1.9948147825497184e-05, "loss": 1.3208, "step": 1280 }, { "epoch": 0.06, "grad_norm": 1.0884103595084624, "learning_rate": 1.99479892726291e-05, "loss": 1.1987, "step": 1281 }, { "epoch": 0.06, "grad_norm": 1.396697050028248, "learning_rate": 1.9947830478352578e-05, "loss": 1.2285, "step": 1282 }, { "epoch": 0.06, "grad_norm": 1.4652892035840763, "learning_rate": 1.994767144267146e-05, "loss": 1.3389, "step": 1283 }, { "epoch": 0.06, "grad_norm": 1.5894635822180903, "learning_rate": 1.994751216558961e-05, "loss": 1.2773, "step": 1284 }, { "epoch": 0.06, "grad_norm": 1.5236110869070074, "learning_rate": 1.9947352647110895e-05, "loss": 1.3784, "step": 1285 }, { "epoch": 0.06, "grad_norm": 1.5149126773094834, "learning_rate": 1.9947192887239188e-05, "loss": 1.334, "step": 1286 }, { "epoch": 0.06, "grad_norm": 1.184195971447859, "learning_rate": 1.9947032885978365e-05, "loss": 1.248, "step": 1287 }, { "epoch": 0.06, "grad_norm": 1.4476136379552171, "learning_rate": 1.99468726433323e-05, "loss": 1.2671, "step": 1288 }, { "epoch": 0.06, "grad_norm": 1.2968907566338115, "learning_rate": 1.9946712159304894e-05, "loss": 1.4019, "step": 1289 }, { "epoch": 0.06, "grad_norm": 1.2321153544339347, "learning_rate": 1.9946551433900033e-05, "loss": 1.4419, "step": 1290 }, { "epoch": 0.06, "grad_norm": 1.1562539628242532, "learning_rate": 1.9946390467121627e-05, "loss": 1.23, "step": 1291 }, { "epoch": 0.06, "grad_norm": 1.2818727390920253, "learning_rate": 1.994622925897357e-05, "loss": 1.4551, "step": 1292 }, { "epoch": 0.06, "grad_norm": 1.439566200317379, "learning_rate": 1.994606780945978e-05, "loss": 1.2866, "step": 1293 }, { "epoch": 0.06, "grad_norm": 1.2248805255492867, "learning_rate": 1.9945906118584173e-05, "loss": 1.2651, "step": 1294 }, { "epoch": 0.06, "grad_norm": 1.2895242662428423, "learning_rate": 1.994574418635068e-05, "loss": 1.2354, "step": 1295 }, { "epoch": 0.06, "grad_norm": 1.2051327639930187, "learning_rate": 1.994558201276322e-05, "loss": 1.3589, "step": 1296 }, { "epoch": 0.06, "grad_norm": 1.2218824120117846, "learning_rate": 1.9945419597825736e-05, "loss": 1.3154, "step": 1297 }, { "epoch": 0.06, "grad_norm": 1.4508929015520482, "learning_rate": 1.9945256941542163e-05, "loss": 1.4048, "step": 1298 }, { "epoch": 0.06, "grad_norm": 0.8708801647261728, "learning_rate": 1.9945094043916456e-05, "loss": 1.3525, "step": 1299 }, { "epoch": 0.06, "grad_norm": 0.9579914326918091, "learning_rate": 1.9944930904952564e-05, "loss": 1.2129, "step": 1300 }, { "epoch": 0.06, "grad_norm": 1.0087790790346514, "learning_rate": 1.9944767524654446e-05, "loss": 1.166, "step": 1301 }, { "epoch": 0.06, "grad_norm": 1.2192589641900438, "learning_rate": 1.9944603903026064e-05, "loss": 1.2939, "step": 1302 }, { "epoch": 0.06, "grad_norm": 1.3659336768010804, "learning_rate": 1.9944440040071392e-05, "loss": 1.1621, "step": 1303 }, { "epoch": 0.06, "grad_norm": 1.472047190540106, "learning_rate": 1.9944275935794407e-05, "loss": 1.124, "step": 1304 }, { "epoch": 0.06, "grad_norm": 1.5450025048461162, "learning_rate": 1.9944111590199088e-05, "loss": 1.3579, "step": 1305 }, { "epoch": 0.06, "grad_norm": 1.3225064598105063, "learning_rate": 1.994394700328943e-05, "loss": 1.2134, "step": 1306 }, { "epoch": 0.06, "grad_norm": 1.1863429912280317, "learning_rate": 1.994378217506942e-05, "loss": 1.2075, "step": 1307 }, { "epoch": 0.06, "grad_norm": 1.2683223640801193, "learning_rate": 1.994361710554306e-05, "loss": 1.0977, "step": 1308 }, { "epoch": 0.06, "grad_norm": 1.472763852052605, "learning_rate": 1.9943451794714354e-05, "loss": 1.2231, "step": 1309 }, { "epoch": 0.06, "grad_norm": 1.2699309476000522, "learning_rate": 1.994328624258732e-05, "loss": 1.2026, "step": 1310 }, { "epoch": 0.06, "grad_norm": 1.2505712299051948, "learning_rate": 1.9943120449165963e-05, "loss": 1.1123, "step": 1311 }, { "epoch": 0.06, "grad_norm": 1.348282844968032, "learning_rate": 1.9942954414454322e-05, "loss": 1.2944, "step": 1312 }, { "epoch": 0.06, "grad_norm": 1.4110775760992165, "learning_rate": 1.9942788138456418e-05, "loss": 1.3853, "step": 1313 }, { "epoch": 0.06, "grad_norm": 1.4753986978839488, "learning_rate": 1.9942621621176283e-05, "loss": 1.2661, "step": 1314 }, { "epoch": 0.06, "grad_norm": 1.4816304914246252, "learning_rate": 1.9942454862617962e-05, "loss": 1.2441, "step": 1315 }, { "epoch": 0.06, "grad_norm": 1.4089683236478527, "learning_rate": 1.9942287862785502e-05, "loss": 1.3994, "step": 1316 }, { "epoch": 0.06, "grad_norm": 1.55150202861976, "learning_rate": 1.9942120621682957e-05, "loss": 1.3525, "step": 1317 }, { "epoch": 0.06, "grad_norm": 1.6127446011686273, "learning_rate": 1.994195313931438e-05, "loss": 1.3975, "step": 1318 }, { "epoch": 0.06, "grad_norm": 1.248655141878362, "learning_rate": 1.994178541568384e-05, "loss": 1.4741, "step": 1319 }, { "epoch": 0.06, "grad_norm": 1.6931543595463938, "learning_rate": 1.9941617450795406e-05, "loss": 1.3164, "step": 1320 }, { "epoch": 0.06, "grad_norm": 1.2494081960015793, "learning_rate": 1.9941449244653154e-05, "loss": 1.2188, "step": 1321 }, { "epoch": 0.06, "grad_norm": 0.9613072570667507, "learning_rate": 1.9941280797261163e-05, "loss": 1.0884, "step": 1322 }, { "epoch": 0.06, "grad_norm": 1.0406840732447598, "learning_rate": 1.9941112108623523e-05, "loss": 1.2163, "step": 1323 }, { "epoch": 0.06, "grad_norm": 1.1864671485546363, "learning_rate": 1.9940943178744333e-05, "loss": 1.2002, "step": 1324 }, { "epoch": 0.06, "grad_norm": 1.4703775677346624, "learning_rate": 1.9940774007627684e-05, "loss": 1.1768, "step": 1325 }, { "epoch": 0.06, "grad_norm": 1.5021683558422232, "learning_rate": 1.9940604595277687e-05, "loss": 1.3594, "step": 1326 }, { "epoch": 0.06, "grad_norm": 1.257417975740565, "learning_rate": 1.9940434941698447e-05, "loss": 1.3262, "step": 1327 }, { "epoch": 0.06, "grad_norm": 1.121121314135909, "learning_rate": 1.9940265046894086e-05, "loss": 1.2344, "step": 1328 }, { "epoch": 0.06, "grad_norm": 1.0915395672259784, "learning_rate": 1.9940094910868727e-05, "loss": 1.2251, "step": 1329 }, { "epoch": 0.06, "grad_norm": 1.28639539286163, "learning_rate": 1.9939924533626492e-05, "loss": 1.1787, "step": 1330 }, { "epoch": 0.06, "grad_norm": 1.435184693101793, "learning_rate": 1.993975391517153e-05, "loss": 1.1221, "step": 1331 }, { "epoch": 0.06, "grad_norm": 1.2957081159201869, "learning_rate": 1.9939583055507964e-05, "loss": 1.2109, "step": 1332 }, { "epoch": 0.06, "grad_norm": 1.3229321740622841, "learning_rate": 1.9939411954639953e-05, "loss": 1.4004, "step": 1333 }, { "epoch": 0.06, "grad_norm": 1.4300079959553285, "learning_rate": 1.9939240612571642e-05, "loss": 1.4131, "step": 1334 }, { "epoch": 0.06, "grad_norm": 1.6863993585990351, "learning_rate": 1.9939069029307193e-05, "loss": 1.293, "step": 1335 }, { "epoch": 0.06, "grad_norm": 1.2927305247390872, "learning_rate": 1.993889720485077e-05, "loss": 1.2075, "step": 1336 }, { "epoch": 0.06, "grad_norm": 1.3234304136185842, "learning_rate": 1.993872513920654e-05, "loss": 1.5342, "step": 1337 }, { "epoch": 0.06, "grad_norm": 1.171711948130348, "learning_rate": 1.993855283237868e-05, "loss": 1.1953, "step": 1338 }, { "epoch": 0.06, "grad_norm": 1.442382354987733, "learning_rate": 1.993838028437137e-05, "loss": 1.2437, "step": 1339 }, { "epoch": 0.06, "grad_norm": 1.449520592327356, "learning_rate": 1.9938207495188797e-05, "loss": 1.2607, "step": 1340 }, { "epoch": 0.06, "grad_norm": 1.0691742024503519, "learning_rate": 1.9938034464835155e-05, "loss": 1.1895, "step": 1341 }, { "epoch": 0.06, "grad_norm": 1.1548860976525601, "learning_rate": 1.9937861193314648e-05, "loss": 1.3018, "step": 1342 }, { "epoch": 0.06, "grad_norm": 1.142832178627285, "learning_rate": 1.9937687680631473e-05, "loss": 1.2114, "step": 1343 }, { "epoch": 0.06, "grad_norm": 1.2772702918666565, "learning_rate": 1.9937513926789843e-05, "loss": 1.3818, "step": 1344 }, { "epoch": 0.06, "grad_norm": 1.2234163539902638, "learning_rate": 1.9937339931793975e-05, "loss": 1.2837, "step": 1345 }, { "epoch": 0.06, "grad_norm": 1.0254636156558115, "learning_rate": 1.9937165695648092e-05, "loss": 1.3755, "step": 1346 }, { "epoch": 0.06, "grad_norm": 1.2212329869291185, "learning_rate": 1.993699121835642e-05, "loss": 1.1953, "step": 1347 }, { "epoch": 0.06, "grad_norm": 0.927080231663782, "learning_rate": 1.9936816499923198e-05, "loss": 1.3765, "step": 1348 }, { "epoch": 0.06, "grad_norm": 1.2762963244472094, "learning_rate": 1.9936641540352665e-05, "loss": 1.3057, "step": 1349 }, { "epoch": 0.06, "grad_norm": 1.3982538698603952, "learning_rate": 1.993646633964906e-05, "loss": 1.2593, "step": 1350 }, { "epoch": 0.06, "grad_norm": 1.1462402150254367, "learning_rate": 1.993629089781664e-05, "loss": 1.2104, "step": 1351 }, { "epoch": 0.07, "grad_norm": 1.0834354215367321, "learning_rate": 1.9936115214859663e-05, "loss": 1.1821, "step": 1352 }, { "epoch": 0.07, "grad_norm": 1.305800458690814, "learning_rate": 1.9935939290782386e-05, "loss": 1.0305, "step": 1353 }, { "epoch": 0.07, "grad_norm": 1.2161794332576066, "learning_rate": 1.9935763125589086e-05, "loss": 1.2417, "step": 1354 }, { "epoch": 0.07, "grad_norm": 1.2993021021247946, "learning_rate": 1.9935586719284036e-05, "loss": 1.3936, "step": 1355 }, { "epoch": 0.07, "grad_norm": 1.1982271054726785, "learning_rate": 1.9935410071871514e-05, "loss": 1.3086, "step": 1356 }, { "epoch": 0.07, "grad_norm": 1.037615113640751, "learning_rate": 1.993523318335581e-05, "loss": 1.3188, "step": 1357 }, { "epoch": 0.07, "grad_norm": 1.444077956757576, "learning_rate": 1.9935056053741216e-05, "loss": 1.2871, "step": 1358 }, { "epoch": 0.07, "grad_norm": 1.2543192523643627, "learning_rate": 1.9934878683032028e-05, "loss": 1.2144, "step": 1359 }, { "epoch": 0.07, "grad_norm": 1.2995480228195606, "learning_rate": 1.993470107123255e-05, "loss": 1.1572, "step": 1360 }, { "epoch": 0.07, "grad_norm": 1.4480398694941132, "learning_rate": 1.9934523218347096e-05, "loss": 1.3926, "step": 1361 }, { "epoch": 0.07, "grad_norm": 1.3825211370149932, "learning_rate": 1.993434512437998e-05, "loss": 1.3301, "step": 1362 }, { "epoch": 0.07, "grad_norm": 1.1622085955295531, "learning_rate": 1.9934166789335526e-05, "loss": 1.3032, "step": 1363 }, { "epoch": 0.07, "grad_norm": 1.2323791671836448, "learning_rate": 1.9933988213218054e-05, "loss": 1.1133, "step": 1364 }, { "epoch": 0.07, "grad_norm": 1.5556026620591226, "learning_rate": 1.9933809396031908e-05, "loss": 1.4131, "step": 1365 }, { "epoch": 0.07, "grad_norm": 1.211147726914795, "learning_rate": 1.993363033778142e-05, "loss": 1.3223, "step": 1366 }, { "epoch": 0.07, "grad_norm": 1.2275874073012218, "learning_rate": 1.9933451038470936e-05, "loss": 1.2915, "step": 1367 }, { "epoch": 0.07, "grad_norm": 1.5045309682465917, "learning_rate": 1.9933271498104808e-05, "loss": 1.354, "step": 1368 }, { "epoch": 0.07, "grad_norm": 1.3556982898542607, "learning_rate": 1.9933091716687397e-05, "loss": 1.1379, "step": 1369 }, { "epoch": 0.07, "grad_norm": 1.132605795114454, "learning_rate": 1.9932911694223064e-05, "loss": 1.2886, "step": 1370 }, { "epoch": 0.07, "grad_norm": 1.32993469859226, "learning_rate": 1.993273143071617e-05, "loss": 1.2256, "step": 1371 }, { "epoch": 0.07, "grad_norm": 1.3776664628802224, "learning_rate": 1.9932550926171096e-05, "loss": 1.4473, "step": 1372 }, { "epoch": 0.07, "grad_norm": 1.238845158569798, "learning_rate": 1.9932370180592226e-05, "loss": 1.3281, "step": 1373 }, { "epoch": 0.07, "grad_norm": 1.4682763463046296, "learning_rate": 1.9932189193983937e-05, "loss": 1.3276, "step": 1374 }, { "epoch": 0.07, "grad_norm": 1.2174430556961697, "learning_rate": 1.9932007966350627e-05, "loss": 1.1646, "step": 1375 }, { "epoch": 0.07, "grad_norm": 1.2076662667786797, "learning_rate": 1.9931826497696694e-05, "loss": 1.1763, "step": 1376 }, { "epoch": 0.07, "grad_norm": 1.2079638090979452, "learning_rate": 1.993164478802654e-05, "loss": 1.3047, "step": 1377 }, { "epoch": 0.07, "grad_norm": 1.2719426115206187, "learning_rate": 1.9931462837344578e-05, "loss": 1.124, "step": 1378 }, { "epoch": 0.07, "grad_norm": 1.5243411469129524, "learning_rate": 1.9931280645655216e-05, "loss": 1.3022, "step": 1379 }, { "epoch": 0.07, "grad_norm": 1.2152936829799958, "learning_rate": 1.993109821296288e-05, "loss": 1.2314, "step": 1380 }, { "epoch": 0.07, "grad_norm": 1.267029554317984, "learning_rate": 1.9930915539271996e-05, "loss": 1.2378, "step": 1381 }, { "epoch": 0.07, "grad_norm": 1.4557000863168827, "learning_rate": 1.9930732624587e-05, "loss": 1.377, "step": 1382 }, { "epoch": 0.07, "grad_norm": 1.4864072026830715, "learning_rate": 1.9930549468912326e-05, "loss": 1.2578, "step": 1383 }, { "epoch": 0.07, "grad_norm": 1.3776042993007556, "learning_rate": 1.9930366072252424e-05, "loss": 1.2178, "step": 1384 }, { "epoch": 0.07, "grad_norm": 1.383563413897334, "learning_rate": 1.9930182434611736e-05, "loss": 1.2134, "step": 1385 }, { "epoch": 0.07, "grad_norm": 1.3108136252968499, "learning_rate": 1.9929998555994732e-05, "loss": 1.3867, "step": 1386 }, { "epoch": 0.07, "grad_norm": 1.2363894243578233, "learning_rate": 1.992981443640586e-05, "loss": 1.4189, "step": 1387 }, { "epoch": 0.07, "grad_norm": 1.2714197773673799, "learning_rate": 1.9929630075849597e-05, "loss": 1.2007, "step": 1388 }, { "epoch": 0.07, "grad_norm": 1.6806202308037703, "learning_rate": 1.9929445474330413e-05, "loss": 1.2041, "step": 1389 }, { "epoch": 0.07, "grad_norm": 1.4196269843479719, "learning_rate": 1.9929260631852792e-05, "loss": 1.3335, "step": 1390 }, { "epoch": 0.07, "grad_norm": 1.4957491469644797, "learning_rate": 1.992907554842121e-05, "loss": 1.1992, "step": 1391 }, { "epoch": 0.07, "grad_norm": 1.3156499266476749, "learning_rate": 1.9928890224040168e-05, "loss": 1.2129, "step": 1392 }, { "epoch": 0.07, "grad_norm": 1.223826089667699, "learning_rate": 1.992870465871416e-05, "loss": 1.2004, "step": 1393 }, { "epoch": 0.07, "grad_norm": 1.334007652230572, "learning_rate": 1.992851885244769e-05, "loss": 1.2368, "step": 1394 }, { "epoch": 0.07, "grad_norm": 1.6228962034131904, "learning_rate": 1.9928332805245266e-05, "loss": 1.2549, "step": 1395 }, { "epoch": 0.07, "grad_norm": 1.5205183212992135, "learning_rate": 1.9928146517111404e-05, "loss": 1.3491, "step": 1396 }, { "epoch": 0.07, "grad_norm": 1.6062500598407505, "learning_rate": 1.9927959988050622e-05, "loss": 1.209, "step": 1397 }, { "epoch": 0.07, "grad_norm": 1.2325180103622548, "learning_rate": 1.992777321806745e-05, "loss": 1.1982, "step": 1398 }, { "epoch": 0.07, "grad_norm": 1.1152678341927806, "learning_rate": 1.9927586207166417e-05, "loss": 1.1309, "step": 1399 }, { "epoch": 0.07, "grad_norm": 1.4096037907596766, "learning_rate": 1.9927398955352062e-05, "loss": 1.3442, "step": 1400 }, { "epoch": 0.07, "grad_norm": 1.3007957112164257, "learning_rate": 1.992721146262893e-05, "loss": 1.3384, "step": 1401 }, { "epoch": 0.07, "grad_norm": 1.2546083146961347, "learning_rate": 1.992702372900157e-05, "loss": 1.252, "step": 1402 }, { "epoch": 0.07, "grad_norm": 1.3753501992500714, "learning_rate": 1.9926835754474543e-05, "loss": 1.4658, "step": 1403 }, { "epoch": 0.07, "grad_norm": 1.2164889583185936, "learning_rate": 1.9926647539052403e-05, "loss": 1.2334, "step": 1404 }, { "epoch": 0.07, "grad_norm": 1.1240347480776658, "learning_rate": 1.9926459082739717e-05, "loss": 1.3125, "step": 1405 }, { "epoch": 0.07, "grad_norm": 1.156936948968369, "learning_rate": 1.9926270385541067e-05, "loss": 1.1958, "step": 1406 }, { "epoch": 0.07, "grad_norm": 1.8440576725815776, "learning_rate": 1.9926081447461025e-05, "loss": 1.4282, "step": 1407 }, { "epoch": 0.07, "grad_norm": 1.28041784047807, "learning_rate": 1.9925892268504176e-05, "loss": 1.3638, "step": 1408 }, { "epoch": 0.07, "grad_norm": 1.4118294107265381, "learning_rate": 1.9925702848675117e-05, "loss": 1.5029, "step": 1409 }, { "epoch": 0.07, "grad_norm": 0.8418050887003885, "learning_rate": 1.9925513187978437e-05, "loss": 1.2466, "step": 1410 }, { "epoch": 0.07, "grad_norm": 1.26568793304336, "learning_rate": 1.992532328641874e-05, "loss": 1.1118, "step": 1411 }, { "epoch": 0.07, "grad_norm": 1.3990228932998787, "learning_rate": 1.9925133144000643e-05, "loss": 1.1753, "step": 1412 }, { "epoch": 0.07, "grad_norm": 1.1771885673990807, "learning_rate": 1.9924942760728748e-05, "loss": 1.436, "step": 1413 }, { "epoch": 0.07, "grad_norm": 1.0964211591170832, "learning_rate": 1.992475213660768e-05, "loss": 1.1934, "step": 1414 }, { "epoch": 0.07, "grad_norm": 1.5033229338026457, "learning_rate": 1.9924561271642066e-05, "loss": 1.3525, "step": 1415 }, { "epoch": 0.07, "grad_norm": 1.3266353831658246, "learning_rate": 1.992437016583654e-05, "loss": 1.2612, "step": 1416 }, { "epoch": 0.07, "grad_norm": 1.1138066027238473, "learning_rate": 1.9924178819195732e-05, "loss": 1.3315, "step": 1417 }, { "epoch": 0.07, "grad_norm": 1.2084269745740877, "learning_rate": 1.992398723172429e-05, "loss": 1.2661, "step": 1418 }, { "epoch": 0.07, "grad_norm": 1.4515500640698449, "learning_rate": 1.9923795403426865e-05, "loss": 1.334, "step": 1419 }, { "epoch": 0.07, "grad_norm": 1.4031900836937272, "learning_rate": 1.9923603334308114e-05, "loss": 1.4062, "step": 1420 }, { "epoch": 0.07, "grad_norm": 1.3790162630535476, "learning_rate": 1.992341102437269e-05, "loss": 1.2788, "step": 1421 }, { "epoch": 0.07, "grad_norm": 1.4016628014803134, "learning_rate": 1.9923218473625264e-05, "loss": 1.293, "step": 1422 }, { "epoch": 0.07, "grad_norm": 1.4946429940300798, "learning_rate": 1.992302568207051e-05, "loss": 1.2676, "step": 1423 }, { "epoch": 0.07, "grad_norm": 1.4223738012660017, "learning_rate": 1.9922832649713108e-05, "loss": 1.437, "step": 1424 }, { "epoch": 0.07, "grad_norm": 1.2134851088597227, "learning_rate": 1.9922639376557734e-05, "loss": 1.1807, "step": 1425 }, { "epoch": 0.07, "grad_norm": 1.626068459793825, "learning_rate": 1.9922445862609088e-05, "loss": 1.2466, "step": 1426 }, { "epoch": 0.07, "grad_norm": 1.2291064910557952, "learning_rate": 1.992225210787186e-05, "loss": 1.2739, "step": 1427 }, { "epoch": 0.07, "grad_norm": 1.1577841037186118, "learning_rate": 1.9922058112350754e-05, "loss": 1.0413, "step": 1428 }, { "epoch": 0.07, "grad_norm": 1.0278786394752673, "learning_rate": 1.992186387605048e-05, "loss": 1.3018, "step": 1429 }, { "epoch": 0.07, "grad_norm": 1.369540039125186, "learning_rate": 1.9921669398975745e-05, "loss": 1.2129, "step": 1430 }, { "epoch": 0.07, "grad_norm": 1.3628679703644266, "learning_rate": 1.9921474681131273e-05, "loss": 1.3179, "step": 1431 }, { "epoch": 0.07, "grad_norm": 1.137980072856824, "learning_rate": 1.992127972252179e-05, "loss": 1.1699, "step": 1432 }, { "epoch": 0.07, "grad_norm": 1.2337385382160426, "learning_rate": 1.992108452315202e-05, "loss": 1.2998, "step": 1433 }, { "epoch": 0.07, "grad_norm": 1.3849109007443743, "learning_rate": 1.9920889083026716e-05, "loss": 1.2856, "step": 1434 }, { "epoch": 0.07, "grad_norm": 1.2391311036418342, "learning_rate": 1.9920693402150604e-05, "loss": 1.2549, "step": 1435 }, { "epoch": 0.07, "grad_norm": 1.1789559684174638, "learning_rate": 1.992049748052844e-05, "loss": 1.2744, "step": 1436 }, { "epoch": 0.07, "grad_norm": 1.245715591223344, "learning_rate": 1.9920301318164978e-05, "loss": 1.334, "step": 1437 }, { "epoch": 0.07, "grad_norm": 1.295265660739528, "learning_rate": 1.9920104915064974e-05, "loss": 1.4155, "step": 1438 }, { "epoch": 0.07, "grad_norm": 1.4348875943250352, "learning_rate": 1.9919908271233198e-05, "loss": 1.2363, "step": 1439 }, { "epoch": 0.07, "grad_norm": 1.2527831010634491, "learning_rate": 1.9919711386674425e-05, "loss": 1.333, "step": 1440 }, { "epoch": 0.07, "grad_norm": 1.2241122136101408, "learning_rate": 1.991951426139343e-05, "loss": 1.1523, "step": 1441 }, { "epoch": 0.07, "grad_norm": 1.4375264190581758, "learning_rate": 1.9919316895394993e-05, "loss": 1.2871, "step": 1442 }, { "epoch": 0.07, "grad_norm": 1.1756348699771237, "learning_rate": 1.9919119288683908e-05, "loss": 1.2241, "step": 1443 }, { "epoch": 0.07, "grad_norm": 1.1770647408325017, "learning_rate": 1.9918921441264966e-05, "loss": 1.2041, "step": 1444 }, { "epoch": 0.07, "grad_norm": 1.251305447796246, "learning_rate": 1.9918723353142973e-05, "loss": 1.3398, "step": 1445 }, { "epoch": 0.07, "grad_norm": 1.0058090428602453, "learning_rate": 1.9918525024322738e-05, "loss": 1.2974, "step": 1446 }, { "epoch": 0.07, "grad_norm": 1.500605048075872, "learning_rate": 1.9918326454809066e-05, "loss": 1.4976, "step": 1447 }, { "epoch": 0.07, "grad_norm": 1.1913521669028075, "learning_rate": 1.991812764460678e-05, "loss": 1.0981, "step": 1448 }, { "epoch": 0.07, "grad_norm": 1.1976542593605477, "learning_rate": 1.9917928593720705e-05, "loss": 1.2393, "step": 1449 }, { "epoch": 0.07, "grad_norm": 1.3392154133794578, "learning_rate": 1.991772930215567e-05, "loss": 1.1665, "step": 1450 }, { "epoch": 0.07, "grad_norm": 1.3280140970821812, "learning_rate": 1.9917529769916513e-05, "loss": 1.4048, "step": 1451 }, { "epoch": 0.07, "grad_norm": 1.765722542644754, "learning_rate": 1.9917329997008075e-05, "loss": 1.2114, "step": 1452 }, { "epoch": 0.07, "grad_norm": 1.796473971202343, "learning_rate": 1.99171299834352e-05, "loss": 1.3452, "step": 1453 }, { "epoch": 0.07, "grad_norm": 1.2322445064718075, "learning_rate": 1.991692972920275e-05, "loss": 1.437, "step": 1454 }, { "epoch": 0.07, "grad_norm": 1.3784514298298651, "learning_rate": 1.991672923431558e-05, "loss": 1.1514, "step": 1455 }, { "epoch": 0.07, "grad_norm": 1.0411835252057078, "learning_rate": 1.9916528498778554e-05, "loss": 1.2026, "step": 1456 }, { "epoch": 0.07, "grad_norm": 1.4621490652590339, "learning_rate": 1.9916327522596545e-05, "loss": 1.2837, "step": 1457 }, { "epoch": 0.07, "grad_norm": 1.363472977369013, "learning_rate": 1.9916126305774427e-05, "loss": 1.2188, "step": 1458 }, { "epoch": 0.07, "grad_norm": 1.3042743372597452, "learning_rate": 1.991592484831709e-05, "loss": 1.2515, "step": 1459 }, { "epoch": 0.07, "grad_norm": 1.46964437455073, "learning_rate": 1.9915723150229417e-05, "loss": 1.3105, "step": 1460 }, { "epoch": 0.07, "grad_norm": 1.3973980507752068, "learning_rate": 1.9915521211516307e-05, "loss": 1.3276, "step": 1461 }, { "epoch": 0.07, "grad_norm": 1.4104210777910715, "learning_rate": 1.9915319032182655e-05, "loss": 1.2993, "step": 1462 }, { "epoch": 0.07, "grad_norm": 1.1720659592481752, "learning_rate": 1.9915116612233367e-05, "loss": 1.2026, "step": 1463 }, { "epoch": 0.07, "grad_norm": 1.0166718129232097, "learning_rate": 1.991491395167336e-05, "loss": 1.355, "step": 1464 }, { "epoch": 0.07, "grad_norm": 1.2034308260207902, "learning_rate": 1.9914711050507556e-05, "loss": 1.1782, "step": 1465 }, { "epoch": 0.07, "grad_norm": 1.3177553048763881, "learning_rate": 1.991450790874087e-05, "loss": 1.1797, "step": 1466 }, { "epoch": 0.07, "grad_norm": 1.4815300215000597, "learning_rate": 1.991430452637823e-05, "loss": 1.3452, "step": 1467 }, { "epoch": 0.07, "grad_norm": 1.4485677415618512, "learning_rate": 1.991410090342458e-05, "loss": 1.438, "step": 1468 }, { "epoch": 0.07, "grad_norm": 1.3351137610241712, "learning_rate": 1.9913897039884855e-05, "loss": 1.1685, "step": 1469 }, { "epoch": 0.07, "grad_norm": 1.645088674098112, "learning_rate": 1.9913692935764006e-05, "loss": 1.3628, "step": 1470 }, { "epoch": 0.07, "grad_norm": 1.5003155018901277, "learning_rate": 1.9913488591066986e-05, "loss": 1.3555, "step": 1471 }, { "epoch": 0.07, "grad_norm": 1.5718343162437676, "learning_rate": 1.991328400579875e-05, "loss": 1.2598, "step": 1472 }, { "epoch": 0.07, "grad_norm": 1.4812916850245716, "learning_rate": 1.9913079179964266e-05, "loss": 1.3745, "step": 1473 }, { "epoch": 0.07, "grad_norm": 1.1226854440297376, "learning_rate": 1.9912874113568503e-05, "loss": 1.3823, "step": 1474 }, { "epoch": 0.07, "grad_norm": 1.4075576260289377, "learning_rate": 1.9912668806616437e-05, "loss": 1.4629, "step": 1475 }, { "epoch": 0.07, "grad_norm": 1.216170486534676, "learning_rate": 1.9912463259113055e-05, "loss": 1.3877, "step": 1476 }, { "epoch": 0.07, "grad_norm": 1.2318996831006586, "learning_rate": 1.9912257471063338e-05, "loss": 1.1685, "step": 1477 }, { "epoch": 0.07, "grad_norm": 1.3947620009984845, "learning_rate": 1.9912051442472283e-05, "loss": 1.353, "step": 1478 }, { "epoch": 0.07, "grad_norm": 1.3001850006599116, "learning_rate": 1.9911845173344894e-05, "loss": 1.1216, "step": 1479 }, { "epoch": 0.07, "grad_norm": 0.9708094925111745, "learning_rate": 1.991163866368617e-05, "loss": 1.1738, "step": 1480 }, { "epoch": 0.07, "grad_norm": 1.2927166906501864, "learning_rate": 1.991143191350112e-05, "loss": 1.2583, "step": 1481 }, { "epoch": 0.07, "grad_norm": 1.4595692280622141, "learning_rate": 1.991122492279477e-05, "loss": 1.353, "step": 1482 }, { "epoch": 0.07, "grad_norm": 1.3604513442753057, "learning_rate": 1.991101769157214e-05, "loss": 1.3027, "step": 1483 }, { "epoch": 0.07, "grad_norm": 1.3354820794666604, "learning_rate": 1.9910810219838257e-05, "loss": 1.2949, "step": 1484 }, { "epoch": 0.07, "grad_norm": 1.2003770470780624, "learning_rate": 1.991060250759816e-05, "loss": 1.2935, "step": 1485 }, { "epoch": 0.07, "grad_norm": 1.359718359784763, "learning_rate": 1.991039455485688e-05, "loss": 1.2515, "step": 1486 }, { "epoch": 0.07, "grad_norm": 1.57738476612396, "learning_rate": 1.9910186361619473e-05, "loss": 1.3135, "step": 1487 }, { "epoch": 0.07, "grad_norm": 1.2345141618691469, "learning_rate": 1.9909977927890988e-05, "loss": 1.355, "step": 1488 }, { "epoch": 0.07, "grad_norm": 1.1773120429689279, "learning_rate": 1.990976925367648e-05, "loss": 1.2314, "step": 1489 }, { "epoch": 0.07, "grad_norm": 1.3689849382132173, "learning_rate": 1.9909560338981014e-05, "loss": 1.1895, "step": 1490 }, { "epoch": 0.07, "grad_norm": 1.2795907544594944, "learning_rate": 1.990935118380967e-05, "loss": 1.2905, "step": 1491 }, { "epoch": 0.07, "grad_norm": 1.2302644483881415, "learning_rate": 1.9909141788167506e-05, "loss": 1.3169, "step": 1492 }, { "epoch": 0.07, "grad_norm": 1.3145224086251992, "learning_rate": 1.9908932152059618e-05, "loss": 1.4199, "step": 1493 }, { "epoch": 0.07, "grad_norm": 1.381769964469777, "learning_rate": 1.9908722275491084e-05, "loss": 1.2549, "step": 1494 }, { "epoch": 0.07, "grad_norm": 1.7681629391811717, "learning_rate": 1.9908512158467007e-05, "loss": 1.4409, "step": 1495 }, { "epoch": 0.07, "grad_norm": 1.1959588208284817, "learning_rate": 1.9908301800992475e-05, "loss": 1.2363, "step": 1496 }, { "epoch": 0.07, "grad_norm": 1.0851397794592823, "learning_rate": 1.99080912030726e-05, "loss": 1.1836, "step": 1497 }, { "epoch": 0.07, "grad_norm": 1.6596823944501662, "learning_rate": 1.990788036471249e-05, "loss": 1.3906, "step": 1498 }, { "epoch": 0.07, "grad_norm": 1.2978717137036486, "learning_rate": 1.990766928591726e-05, "loss": 1.417, "step": 1499 }, { "epoch": 0.07, "grad_norm": 1.448104548787884, "learning_rate": 1.9907457966692036e-05, "loss": 1.2358, "step": 1500 }, { "epoch": 0.07, "grad_norm": 1.2223556211615152, "learning_rate": 1.990724640704194e-05, "loss": 1.3599, "step": 1501 }, { "epoch": 0.07, "grad_norm": 1.4549745382362926, "learning_rate": 1.990703460697211e-05, "loss": 1.3008, "step": 1502 }, { "epoch": 0.07, "grad_norm": 1.2613435305139584, "learning_rate": 1.990682256648769e-05, "loss": 1.3569, "step": 1503 }, { "epoch": 0.07, "grad_norm": 1.5226508216800463, "learning_rate": 1.990661028559382e-05, "loss": 1.2544, "step": 1504 }, { "epoch": 0.07, "grad_norm": 1.3951679675998976, "learning_rate": 1.990639776429565e-05, "loss": 1.2822, "step": 1505 }, { "epoch": 0.07, "grad_norm": 1.4698148431118634, "learning_rate": 1.9906185002598343e-05, "loss": 1.2959, "step": 1506 }, { "epoch": 0.07, "grad_norm": 1.0797488353148252, "learning_rate": 1.9905972000507057e-05, "loss": 1.0576, "step": 1507 }, { "epoch": 0.07, "grad_norm": 1.2917481352638471, "learning_rate": 1.9905758758026966e-05, "loss": 1.2959, "step": 1508 }, { "epoch": 0.07, "grad_norm": 1.2975900632757669, "learning_rate": 1.9905545275163235e-05, "loss": 1.1963, "step": 1509 }, { "epoch": 0.07, "grad_norm": 1.3098096659716842, "learning_rate": 1.9905331551921056e-05, "loss": 1.291, "step": 1510 }, { "epoch": 0.07, "grad_norm": 1.2133002311892283, "learning_rate": 1.9905117588305612e-05, "loss": 1.2266, "step": 1511 }, { "epoch": 0.07, "grad_norm": 0.9157456773977323, "learning_rate": 1.9904903384322095e-05, "loss": 1.2007, "step": 1512 }, { "epoch": 0.07, "grad_norm": 1.5340829574616839, "learning_rate": 1.9904688939975697e-05, "loss": 1.2354, "step": 1513 }, { "epoch": 0.07, "grad_norm": 1.0285143206847205, "learning_rate": 1.990447425527163e-05, "loss": 1.2441, "step": 1514 }, { "epoch": 0.07, "grad_norm": 1.2277967478275797, "learning_rate": 1.99042593302151e-05, "loss": 1.2134, "step": 1515 }, { "epoch": 0.07, "grad_norm": 1.5261108458169501, "learning_rate": 1.9904044164811325e-05, "loss": 1.1528, "step": 1516 }, { "epoch": 0.07, "grad_norm": 1.3669737565406528, "learning_rate": 1.9903828759065524e-05, "loss": 1.4238, "step": 1517 }, { "epoch": 0.07, "grad_norm": 0.901020546402158, "learning_rate": 1.9903613112982925e-05, "loss": 1.0947, "step": 1518 }, { "epoch": 0.07, "grad_norm": 1.361943732142787, "learning_rate": 1.990339722656876e-05, "loss": 1.2598, "step": 1519 }, { "epoch": 0.07, "grad_norm": 1.3270740173221431, "learning_rate": 1.990318109982827e-05, "loss": 1.2759, "step": 1520 }, { "epoch": 0.07, "grad_norm": 1.4063054821315533, "learning_rate": 1.9902964732766702e-05, "loss": 1.3301, "step": 1521 }, { "epoch": 0.07, "grad_norm": 1.4473955325036976, "learning_rate": 1.99027481253893e-05, "loss": 1.1973, "step": 1522 }, { "epoch": 0.07, "grad_norm": 1.2606933083358174, "learning_rate": 1.9902531277701323e-05, "loss": 1.3223, "step": 1523 }, { "epoch": 0.07, "grad_norm": 1.4957291588826798, "learning_rate": 1.9902314189708037e-05, "loss": 1.313, "step": 1524 }, { "epoch": 0.07, "grad_norm": 1.454478105778005, "learning_rate": 1.9902096861414706e-05, "loss": 1.0818, "step": 1525 }, { "epoch": 0.07, "grad_norm": 1.4365998776564808, "learning_rate": 1.9901879292826604e-05, "loss": 1.2764, "step": 1526 }, { "epoch": 0.07, "grad_norm": 1.3223192304811888, "learning_rate": 1.9901661483949015e-05, "loss": 1.3623, "step": 1527 }, { "epoch": 0.07, "grad_norm": 1.2613836098664017, "learning_rate": 1.990144343478722e-05, "loss": 1.2358, "step": 1528 }, { "epoch": 0.07, "grad_norm": 1.2032502647553969, "learning_rate": 1.990122514534651e-05, "loss": 1.3867, "step": 1529 }, { "epoch": 0.07, "grad_norm": 1.2687412034755952, "learning_rate": 1.9901006615632187e-05, "loss": 1.1367, "step": 1530 }, { "epoch": 0.07, "grad_norm": 1.304270888629178, "learning_rate": 1.9900787845649548e-05, "loss": 1.2539, "step": 1531 }, { "epoch": 0.07, "grad_norm": 1.454196134439052, "learning_rate": 1.990056883540391e-05, "loss": 1.4575, "step": 1532 }, { "epoch": 0.07, "grad_norm": 1.2623343702246448, "learning_rate": 1.990034958490058e-05, "loss": 1.3267, "step": 1533 }, { "epoch": 0.07, "grad_norm": 1.2099283886607295, "learning_rate": 1.990013009414488e-05, "loss": 1.23, "step": 1534 }, { "epoch": 0.07, "grad_norm": 1.227256255467023, "learning_rate": 1.989991036314214e-05, "loss": 1.4194, "step": 1535 }, { "epoch": 0.07, "grad_norm": 1.2908572103590255, "learning_rate": 1.9899690391897694e-05, "loss": 1.3613, "step": 1536 }, { "epoch": 0.07, "grad_norm": 1.2152102695042957, "learning_rate": 1.989947018041687e-05, "loss": 1.0488, "step": 1537 }, { "epoch": 0.07, "grad_norm": 1.4487791003308965, "learning_rate": 1.9899249728705018e-05, "loss": 1.1084, "step": 1538 }, { "epoch": 0.07, "grad_norm": 1.6983596650857669, "learning_rate": 1.989902903676749e-05, "loss": 1.3203, "step": 1539 }, { "epoch": 0.07, "grad_norm": 1.834983455073885, "learning_rate": 1.9898808104609638e-05, "loss": 1.1826, "step": 1540 }, { "epoch": 0.07, "grad_norm": 1.097634132284742, "learning_rate": 1.9898586932236826e-05, "loss": 1.2241, "step": 1541 }, { "epoch": 0.07, "grad_norm": 0.806012654348552, "learning_rate": 1.989836551965442e-05, "loss": 1.3599, "step": 1542 }, { "epoch": 0.07, "grad_norm": 1.223519648754178, "learning_rate": 1.9898143866867792e-05, "loss": 1.1421, "step": 1543 }, { "epoch": 0.07, "grad_norm": 1.4110700431343324, "learning_rate": 1.989792197388232e-05, "loss": 1.3213, "step": 1544 }, { "epoch": 0.07, "grad_norm": 1.202117398694309, "learning_rate": 1.9897699840703393e-05, "loss": 1.2373, "step": 1545 }, { "epoch": 0.07, "grad_norm": 1.339756995962155, "learning_rate": 1.98974774673364e-05, "loss": 1.2466, "step": 1546 }, { "epoch": 0.07, "grad_norm": 1.4554283893364546, "learning_rate": 1.9897254853786735e-05, "loss": 1.248, "step": 1547 }, { "epoch": 0.07, "grad_norm": 1.4564968460683352, "learning_rate": 1.98970320000598e-05, "loss": 1.3657, "step": 1548 }, { "epoch": 0.07, "grad_norm": 1.3010550588051382, "learning_rate": 1.9896808906161005e-05, "loss": 1.1069, "step": 1549 }, { "epoch": 0.07, "grad_norm": 1.233219549601465, "learning_rate": 1.9896585572095764e-05, "loss": 1.4097, "step": 1550 }, { "epoch": 0.07, "grad_norm": 1.17311531547124, "learning_rate": 1.9896361997869496e-05, "loss": 1.4214, "step": 1551 }, { "epoch": 0.07, "grad_norm": 1.3701272614900835, "learning_rate": 1.9896138183487626e-05, "loss": 1.3428, "step": 1552 }, { "epoch": 0.07, "grad_norm": 1.3973681044263002, "learning_rate": 1.9895914128955588e-05, "loss": 1.4731, "step": 1553 }, { "epoch": 0.07, "grad_norm": 1.4114786074005963, "learning_rate": 1.9895689834278813e-05, "loss": 1.1826, "step": 1554 }, { "epoch": 0.07, "grad_norm": 1.25859079407795, "learning_rate": 1.989546529946275e-05, "loss": 1.1895, "step": 1555 }, { "epoch": 0.07, "grad_norm": 1.5578364177212496, "learning_rate": 1.9895240524512845e-05, "loss": 1.2603, "step": 1556 }, { "epoch": 0.07, "grad_norm": 1.0054603939783162, "learning_rate": 1.9895015509434555e-05, "loss": 1.2095, "step": 1557 }, { "epoch": 0.07, "grad_norm": 1.590877295251649, "learning_rate": 1.9894790254233338e-05, "loss": 1.2598, "step": 1558 }, { "epoch": 0.07, "grad_norm": 1.3165696940664513, "learning_rate": 1.9894564758914662e-05, "loss": 1.2026, "step": 1559 }, { "epoch": 0.08, "grad_norm": 1.4457349129093637, "learning_rate": 1.9894339023484e-05, "loss": 1.2075, "step": 1560 }, { "epoch": 0.08, "grad_norm": 1.0539962160608323, "learning_rate": 1.989411304794682e-05, "loss": 1.1658, "step": 1561 }, { "epoch": 0.08, "grad_norm": 1.1036717345663354, "learning_rate": 1.989388683230862e-05, "loss": 1.4878, "step": 1562 }, { "epoch": 0.08, "grad_norm": 1.5203261302873377, "learning_rate": 1.9893660376574883e-05, "loss": 1.2847, "step": 1563 }, { "epoch": 0.08, "grad_norm": 1.1414078820252838, "learning_rate": 1.9893433680751105e-05, "loss": 1.2124, "step": 1564 }, { "epoch": 0.08, "grad_norm": 1.2244943552055296, "learning_rate": 1.9893206744842787e-05, "loss": 1.2896, "step": 1565 }, { "epoch": 0.08, "grad_norm": 1.4526670004325604, "learning_rate": 1.9892979568855435e-05, "loss": 1.3374, "step": 1566 }, { "epoch": 0.08, "grad_norm": 1.4901947070025887, "learning_rate": 1.9892752152794565e-05, "loss": 1.4336, "step": 1567 }, { "epoch": 0.08, "grad_norm": 1.1495692600295953, "learning_rate": 1.9892524496665692e-05, "loss": 1.1333, "step": 1568 }, { "epoch": 0.08, "grad_norm": 1.3362214175544957, "learning_rate": 1.989229660047434e-05, "loss": 1.3179, "step": 1569 }, { "epoch": 0.08, "grad_norm": 0.9101052924937294, "learning_rate": 1.9892068464226044e-05, "loss": 1.2139, "step": 1570 }, { "epoch": 0.08, "grad_norm": 1.2305954368645287, "learning_rate": 1.989184008792634e-05, "loss": 1.23, "step": 1571 }, { "epoch": 0.08, "grad_norm": 1.28461249039843, "learning_rate": 1.9891611471580767e-05, "loss": 1.335, "step": 1572 }, { "epoch": 0.08, "grad_norm": 1.243418036856141, "learning_rate": 1.989138261519487e-05, "loss": 1.074, "step": 1573 }, { "epoch": 0.08, "grad_norm": 1.4806900169399335, "learning_rate": 1.989115351877421e-05, "loss": 1.1506, "step": 1574 }, { "epoch": 0.08, "grad_norm": 1.6575453653293377, "learning_rate": 1.9890924182324345e-05, "loss": 1.3291, "step": 1575 }, { "epoch": 0.08, "grad_norm": 1.0139399294063387, "learning_rate": 1.989069460585083e-05, "loss": 1.2173, "step": 1576 }, { "epoch": 0.08, "grad_norm": 1.0789917896841428, "learning_rate": 1.9890464789359253e-05, "loss": 1.2812, "step": 1577 }, { "epoch": 0.08, "grad_norm": 1.2007249017241937, "learning_rate": 1.989023473285518e-05, "loss": 1.2251, "step": 1578 }, { "epoch": 0.08, "grad_norm": 1.2517010110202547, "learning_rate": 1.9890004436344197e-05, "loss": 1.2334, "step": 1579 }, { "epoch": 0.08, "grad_norm": 1.4170168069490596, "learning_rate": 1.988977389983189e-05, "loss": 1.3179, "step": 1580 }, { "epoch": 0.08, "grad_norm": 1.1579833466809972, "learning_rate": 1.9889543123323854e-05, "loss": 1.1509, "step": 1581 }, { "epoch": 0.08, "grad_norm": 1.1463972085249254, "learning_rate": 1.9889312106825694e-05, "loss": 1.3062, "step": 1582 }, { "epoch": 0.08, "grad_norm": 1.520664041023211, "learning_rate": 1.988908085034301e-05, "loss": 1.3545, "step": 1583 }, { "epoch": 0.08, "grad_norm": 1.6153801602091513, "learning_rate": 1.988884935388142e-05, "loss": 1.3633, "step": 1584 }, { "epoch": 0.08, "grad_norm": 1.1790341643428508, "learning_rate": 1.988861761744653e-05, "loss": 1.186, "step": 1585 }, { "epoch": 0.08, "grad_norm": 1.3991151533865573, "learning_rate": 1.988838564104398e-05, "loss": 1.165, "step": 1586 }, { "epoch": 0.08, "grad_norm": 1.3216074139577418, "learning_rate": 1.9888153424679387e-05, "loss": 1.2905, "step": 1587 }, { "epoch": 0.08, "grad_norm": 1.2769752196103168, "learning_rate": 1.9887920968358394e-05, "loss": 1.2935, "step": 1588 }, { "epoch": 0.08, "grad_norm": 1.0543367676367663, "learning_rate": 1.9887688272086637e-05, "loss": 1.2705, "step": 1589 }, { "epoch": 0.08, "grad_norm": 1.3940021071872246, "learning_rate": 1.9887455335869762e-05, "loss": 1.3506, "step": 1590 }, { "epoch": 0.08, "grad_norm": 1.2887218012159793, "learning_rate": 1.9887222159713427e-05, "loss": 1.2407, "step": 1591 }, { "epoch": 0.08, "grad_norm": 1.3359888801163557, "learning_rate": 1.9886988743623284e-05, "loss": 1.1309, "step": 1592 }, { "epoch": 0.08, "grad_norm": 1.361908692215102, "learning_rate": 1.9886755087605004e-05, "loss": 1.0789, "step": 1593 }, { "epoch": 0.08, "grad_norm": 1.435841711510411, "learning_rate": 1.9886521191664255e-05, "loss": 1.3628, "step": 1594 }, { "epoch": 0.08, "grad_norm": 1.4826435088788372, "learning_rate": 1.988628705580671e-05, "loss": 1.1841, "step": 1595 }, { "epoch": 0.08, "grad_norm": 1.5094290273500288, "learning_rate": 1.9886052680038048e-05, "loss": 1.3467, "step": 1596 }, { "epoch": 0.08, "grad_norm": 1.2516729119574375, "learning_rate": 1.9885818064363968e-05, "loss": 1.2466, "step": 1597 }, { "epoch": 0.08, "grad_norm": 1.231356623547569, "learning_rate": 1.9885583208790154e-05, "loss": 1.231, "step": 1598 }, { "epoch": 0.08, "grad_norm": 1.401862325435475, "learning_rate": 1.988534811332231e-05, "loss": 1.2988, "step": 1599 }, { "epoch": 0.08, "grad_norm": 1.2879223757687983, "learning_rate": 1.9885112777966135e-05, "loss": 1.3047, "step": 1600 }, { "epoch": 0.08, "grad_norm": 1.5887824075634995, "learning_rate": 1.9884877202727345e-05, "loss": 1.4385, "step": 1601 }, { "epoch": 0.08, "grad_norm": 1.17229564677236, "learning_rate": 1.988464138761166e-05, "loss": 0.9707, "step": 1602 }, { "epoch": 0.08, "grad_norm": 1.2709309503271358, "learning_rate": 1.9884405332624793e-05, "loss": 1.3823, "step": 1603 }, { "epoch": 0.08, "grad_norm": 1.4270151500846249, "learning_rate": 1.988416903777248e-05, "loss": 1.332, "step": 1604 }, { "epoch": 0.08, "grad_norm": 1.5303637365133647, "learning_rate": 1.9883932503060452e-05, "loss": 1.1738, "step": 1605 }, { "epoch": 0.08, "grad_norm": 1.1082389386754241, "learning_rate": 1.988369572849445e-05, "loss": 1.2852, "step": 1606 }, { "epoch": 0.08, "grad_norm": 1.5163589158538997, "learning_rate": 1.9883458714080222e-05, "loss": 1.2524, "step": 1607 }, { "epoch": 0.08, "grad_norm": 1.264746250459986, "learning_rate": 1.9883221459823515e-05, "loss": 1.2256, "step": 1608 }, { "epoch": 0.08, "grad_norm": 1.2397655318797134, "learning_rate": 1.9882983965730086e-05, "loss": 1.23, "step": 1609 }, { "epoch": 0.08, "grad_norm": 1.4039035081920999, "learning_rate": 1.9882746231805705e-05, "loss": 1.1587, "step": 1610 }, { "epoch": 0.08, "grad_norm": 1.1860489237763503, "learning_rate": 1.9882508258056136e-05, "loss": 1.1875, "step": 1611 }, { "epoch": 0.08, "grad_norm": 1.2153527259245853, "learning_rate": 1.9882270044487155e-05, "loss": 1.2852, "step": 1612 }, { "epoch": 0.08, "grad_norm": 1.2226321710437238, "learning_rate": 1.9882031591104543e-05, "loss": 1.2261, "step": 1613 }, { "epoch": 0.08, "grad_norm": 1.0394641102434825, "learning_rate": 1.9881792897914086e-05, "loss": 1.2559, "step": 1614 }, { "epoch": 0.08, "grad_norm": 1.3501411026326307, "learning_rate": 1.9881553964921574e-05, "loss": 1.2544, "step": 1615 }, { "epoch": 0.08, "grad_norm": 1.1872091019577975, "learning_rate": 1.9881314792132812e-05, "loss": 1.167, "step": 1616 }, { "epoch": 0.08, "grad_norm": 1.0945637575394478, "learning_rate": 1.9881075379553597e-05, "loss": 1.1665, "step": 1617 }, { "epoch": 0.08, "grad_norm": 1.2695999424164235, "learning_rate": 1.9880835727189742e-05, "loss": 1.3223, "step": 1618 }, { "epoch": 0.08, "grad_norm": 1.2291856405387098, "learning_rate": 1.9880595835047062e-05, "loss": 1.1958, "step": 1619 }, { "epoch": 0.08, "grad_norm": 1.538452209073263, "learning_rate": 1.988035570313138e-05, "loss": 1.4199, "step": 1620 }, { "epoch": 0.08, "grad_norm": 1.3443875217933037, "learning_rate": 1.9880115331448526e-05, "loss": 1.3145, "step": 1621 }, { "epoch": 0.08, "grad_norm": 1.3366903311264422, "learning_rate": 1.9879874720004326e-05, "loss": 1.2085, "step": 1622 }, { "epoch": 0.08, "grad_norm": 1.1615198994199438, "learning_rate": 1.9879633868804624e-05, "loss": 1.2158, "step": 1623 }, { "epoch": 0.08, "grad_norm": 1.2467508503223026, "learning_rate": 1.9879392777855258e-05, "loss": 1.2339, "step": 1624 }, { "epoch": 0.08, "grad_norm": 1.1879156511843596, "learning_rate": 1.9879151447162086e-05, "loss": 1.2954, "step": 1625 }, { "epoch": 0.08, "grad_norm": 1.0901017835414069, "learning_rate": 1.987890987673096e-05, "loss": 1.2397, "step": 1626 }, { "epoch": 0.08, "grad_norm": 1.196241581636429, "learning_rate": 1.987866806656775e-05, "loss": 1.2427, "step": 1627 }, { "epoch": 0.08, "grad_norm": 1.6871214355079287, "learning_rate": 1.9878426016678313e-05, "loss": 1.3296, "step": 1628 }, { "epoch": 0.08, "grad_norm": 1.3760034104199597, "learning_rate": 1.987818372706853e-05, "loss": 1.2065, "step": 1629 }, { "epoch": 0.08, "grad_norm": 1.4896311686300578, "learning_rate": 1.9877941197744277e-05, "loss": 1.2632, "step": 1630 }, { "epoch": 0.08, "grad_norm": 1.189490311276859, "learning_rate": 1.9877698428711444e-05, "loss": 1.3726, "step": 1631 }, { "epoch": 0.08, "grad_norm": 1.1495260741744449, "learning_rate": 1.9877455419975917e-05, "loss": 1.3813, "step": 1632 }, { "epoch": 0.08, "grad_norm": 1.4554475895087005, "learning_rate": 1.9877212171543595e-05, "loss": 1.375, "step": 1633 }, { "epoch": 0.08, "grad_norm": 1.4916905656242379, "learning_rate": 1.9876968683420384e-05, "loss": 1.2109, "step": 1634 }, { "epoch": 0.08, "grad_norm": 1.3254516615011172, "learning_rate": 1.9876724955612188e-05, "loss": 1.3076, "step": 1635 }, { "epoch": 0.08, "grad_norm": 1.3002319869064318, "learning_rate": 1.9876480988124923e-05, "loss": 1.2085, "step": 1636 }, { "epoch": 0.08, "grad_norm": 1.1348351019604566, "learning_rate": 1.9876236780964513e-05, "loss": 1.3828, "step": 1637 }, { "epoch": 0.08, "grad_norm": 1.1995198013855326, "learning_rate": 1.9875992334136878e-05, "loss": 1.2827, "step": 1638 }, { "epoch": 0.08, "grad_norm": 1.0529793886411385, "learning_rate": 1.9875747647647956e-05, "loss": 1.2666, "step": 1639 }, { "epoch": 0.08, "grad_norm": 0.9996034953533504, "learning_rate": 1.9875502721503683e-05, "loss": 0.968, "step": 1640 }, { "epoch": 0.08, "grad_norm": 1.2624351297586964, "learning_rate": 1.987525755571e-05, "loss": 1.2778, "step": 1641 }, { "epoch": 0.08, "grad_norm": 1.3316781822753583, "learning_rate": 1.987501215027286e-05, "loss": 1.1416, "step": 1642 }, { "epoch": 0.08, "grad_norm": 1.232776923526557, "learning_rate": 1.9874766505198214e-05, "loss": 1.2124, "step": 1643 }, { "epoch": 0.08, "grad_norm": 1.3890462211734886, "learning_rate": 1.9874520620492026e-05, "loss": 1.4321, "step": 1644 }, { "epoch": 0.08, "grad_norm": 1.3434850681980908, "learning_rate": 1.987427449616026e-05, "loss": 1.2729, "step": 1645 }, { "epoch": 0.08, "grad_norm": 1.2536702304110583, "learning_rate": 1.9874028132208897e-05, "loss": 1.1973, "step": 1646 }, { "epoch": 0.08, "grad_norm": 1.390632822548055, "learning_rate": 1.9873781528643905e-05, "loss": 1.2603, "step": 1647 }, { "epoch": 0.08, "grad_norm": 1.3883345110599374, "learning_rate": 1.9873534685471277e-05, "loss": 1.3755, "step": 1648 }, { "epoch": 0.08, "grad_norm": 1.1388837556808524, "learning_rate": 1.9873287602696996e-05, "loss": 1.1748, "step": 1649 }, { "epoch": 0.08, "grad_norm": 1.258550032275601, "learning_rate": 1.9873040280327062e-05, "loss": 1.2114, "step": 1650 }, { "epoch": 0.08, "grad_norm": 1.2582746824760067, "learning_rate": 1.9872792718367476e-05, "loss": 1.2334, "step": 1651 }, { "epoch": 0.08, "grad_norm": 2.087091504698617, "learning_rate": 1.9872544916824244e-05, "loss": 1.2114, "step": 1652 }, { "epoch": 0.08, "grad_norm": 1.307547305181814, "learning_rate": 1.987229687570338e-05, "loss": 1.2935, "step": 1653 }, { "epoch": 0.08, "grad_norm": 1.050906895485712, "learning_rate": 1.987204859501091e-05, "loss": 1.3115, "step": 1654 }, { "epoch": 0.08, "grad_norm": 1.1024383989189768, "learning_rate": 1.9871800074752848e-05, "loss": 1.3032, "step": 1655 }, { "epoch": 0.08, "grad_norm": 1.247800984742456, "learning_rate": 1.9871551314935233e-05, "loss": 1.3359, "step": 1656 }, { "epoch": 0.08, "grad_norm": 1.338746492487518, "learning_rate": 1.98713023155641e-05, "loss": 1.2397, "step": 1657 }, { "epoch": 0.08, "grad_norm": 1.2123242181888063, "learning_rate": 1.987105307664549e-05, "loss": 1.2178, "step": 1658 }, { "epoch": 0.08, "grad_norm": 0.9988686411556564, "learning_rate": 1.9870803598185446e-05, "loss": 1.3276, "step": 1659 }, { "epoch": 0.08, "grad_norm": 1.0665999153495263, "learning_rate": 1.9870553880190032e-05, "loss": 1.2241, "step": 1660 }, { "epoch": 0.08, "grad_norm": 1.7921682381594624, "learning_rate": 1.9870303922665305e-05, "loss": 1.4106, "step": 1661 }, { "epoch": 0.08, "grad_norm": 1.0860295802671933, "learning_rate": 1.9870053725617326e-05, "loss": 1.1519, "step": 1662 }, { "epoch": 0.08, "grad_norm": 1.4259572759552495, "learning_rate": 1.986980328905217e-05, "loss": 1.4238, "step": 1663 }, { "epoch": 0.08, "grad_norm": 1.1318832440347604, "learning_rate": 1.9869552612975917e-05, "loss": 1.3979, "step": 1664 }, { "epoch": 0.08, "grad_norm": 1.1599514792312433, "learning_rate": 1.9869301697394646e-05, "loss": 1.2583, "step": 1665 }, { "epoch": 0.08, "grad_norm": 1.273737456698302, "learning_rate": 1.9869050542314446e-05, "loss": 1.3564, "step": 1666 }, { "epoch": 0.08, "grad_norm": 1.3812323968356381, "learning_rate": 1.9868799147741417e-05, "loss": 1.188, "step": 1667 }, { "epoch": 0.08, "grad_norm": 1.25229318921687, "learning_rate": 1.986854751368165e-05, "loss": 1.1338, "step": 1668 }, { "epoch": 0.08, "grad_norm": 1.2867282300315708, "learning_rate": 1.986829564014126e-05, "loss": 1.2427, "step": 1669 }, { "epoch": 0.08, "grad_norm": 1.3142788160100252, "learning_rate": 1.9868043527126358e-05, "loss": 1.1284, "step": 1670 }, { "epoch": 0.08, "grad_norm": 1.407989324241409, "learning_rate": 1.9867791174643057e-05, "loss": 1.2314, "step": 1671 }, { "epoch": 0.08, "grad_norm": 1.2386292115262292, "learning_rate": 1.986753858269749e-05, "loss": 1.2891, "step": 1672 }, { "epoch": 0.08, "grad_norm": 1.008441866169437, "learning_rate": 1.986728575129578e-05, "loss": 1.188, "step": 1673 }, { "epoch": 0.08, "grad_norm": 1.4395956167137967, "learning_rate": 1.986703268044406e-05, "loss": 1.4038, "step": 1674 }, { "epoch": 0.08, "grad_norm": 1.0760815766206608, "learning_rate": 1.9866779370148475e-05, "loss": 1.1475, "step": 1675 }, { "epoch": 0.08, "grad_norm": 1.0606003357941316, "learning_rate": 1.9866525820415173e-05, "loss": 1.332, "step": 1676 }, { "epoch": 0.08, "grad_norm": 1.4274839216865483, "learning_rate": 1.98662720312503e-05, "loss": 1.377, "step": 1677 }, { "epoch": 0.08, "grad_norm": 1.3985423627355724, "learning_rate": 1.9866018002660027e-05, "loss": 1.145, "step": 1678 }, { "epoch": 0.08, "grad_norm": 1.4008991790051306, "learning_rate": 1.9865763734650514e-05, "loss": 1.3228, "step": 1679 }, { "epoch": 0.08, "grad_norm": 1.3981520424334788, "learning_rate": 1.9865509227227924e-05, "loss": 1.1333, "step": 1680 }, { "epoch": 0.08, "grad_norm": 1.1267362552651785, "learning_rate": 1.986525448039844e-05, "loss": 1.1572, "step": 1681 }, { "epoch": 0.08, "grad_norm": 1.4605626183875666, "learning_rate": 1.9864999494168245e-05, "loss": 1.2822, "step": 1682 }, { "epoch": 0.08, "grad_norm": 1.1863992658304676, "learning_rate": 1.9864744268543522e-05, "loss": 1.1021, "step": 1683 }, { "epoch": 0.08, "grad_norm": 1.350047894774326, "learning_rate": 1.9864488803530467e-05, "loss": 1.2588, "step": 1684 }, { "epoch": 0.08, "grad_norm": 1.1717315399856874, "learning_rate": 1.9864233099135278e-05, "loss": 1.3286, "step": 1685 }, { "epoch": 0.08, "grad_norm": 0.9509228839276824, "learning_rate": 1.9863977155364164e-05, "loss": 1.3027, "step": 1686 }, { "epoch": 0.08, "grad_norm": 1.6431046192161922, "learning_rate": 1.986372097222333e-05, "loss": 1.3428, "step": 1687 }, { "epoch": 0.08, "grad_norm": 1.2154091438616301, "learning_rate": 1.9863464549719e-05, "loss": 1.2646, "step": 1688 }, { "epoch": 0.08, "grad_norm": 1.2270820712795016, "learning_rate": 1.986320788785739e-05, "loss": 1.0159, "step": 1689 }, { "epoch": 0.08, "grad_norm": 1.1005047455452612, "learning_rate": 1.9862950986644732e-05, "loss": 1.1743, "step": 1690 }, { "epoch": 0.08, "grad_norm": 1.09309379584541, "learning_rate": 1.9862693846087258e-05, "loss": 1.27, "step": 1691 }, { "epoch": 0.08, "grad_norm": 1.2853589198248638, "learning_rate": 1.986243646619121e-05, "loss": 1.3306, "step": 1692 }, { "epoch": 0.08, "grad_norm": 1.1653208862261084, "learning_rate": 1.9862178846962837e-05, "loss": 1.3843, "step": 1693 }, { "epoch": 0.08, "grad_norm": 1.0989835615343597, "learning_rate": 1.9861920988408382e-05, "loss": 1.4165, "step": 1694 }, { "epoch": 0.08, "grad_norm": 1.3746966645217578, "learning_rate": 1.986166289053411e-05, "loss": 1.2471, "step": 1695 }, { "epoch": 0.08, "grad_norm": 1.1661207129785116, "learning_rate": 1.9861404553346282e-05, "loss": 1.3765, "step": 1696 }, { "epoch": 0.08, "grad_norm": 1.6329624589721, "learning_rate": 1.9861145976851167e-05, "loss": 1.2764, "step": 1697 }, { "epoch": 0.08, "grad_norm": 1.1422753020485674, "learning_rate": 1.986088716105504e-05, "loss": 1.2266, "step": 1698 }, { "epoch": 0.08, "grad_norm": 1.236680790160474, "learning_rate": 1.986062810596418e-05, "loss": 1.116, "step": 1699 }, { "epoch": 0.08, "grad_norm": 1.199952893622414, "learning_rate": 1.9860368811584875e-05, "loss": 1.1143, "step": 1700 }, { "epoch": 0.08, "grad_norm": 1.1771297493808275, "learning_rate": 1.9860109277923417e-05, "loss": 1.0007, "step": 1701 }, { "epoch": 0.08, "grad_norm": 1.2338716231711284, "learning_rate": 1.9859849504986105e-05, "loss": 1.2822, "step": 1702 }, { "epoch": 0.08, "grad_norm": 1.425362496656904, "learning_rate": 1.9859589492779244e-05, "loss": 1.332, "step": 1703 }, { "epoch": 0.08, "grad_norm": 0.9567276935318477, "learning_rate": 1.985932924130914e-05, "loss": 1.2632, "step": 1704 }, { "epoch": 0.08, "grad_norm": 1.2269417995299456, "learning_rate": 1.9859068750582112e-05, "loss": 1.2314, "step": 1705 }, { "epoch": 0.08, "grad_norm": 1.2536930256888825, "learning_rate": 1.985880802060448e-05, "loss": 1.377, "step": 1706 }, { "epoch": 0.08, "grad_norm": 1.4224238152340487, "learning_rate": 1.9858547051382565e-05, "loss": 1.3413, "step": 1707 }, { "epoch": 0.08, "grad_norm": 1.2259825440738519, "learning_rate": 1.9858285842922715e-05, "loss": 1.3301, "step": 1708 }, { "epoch": 0.08, "grad_norm": 1.4595370591453598, "learning_rate": 1.9858024395231256e-05, "loss": 1.1655, "step": 1709 }, { "epoch": 0.08, "grad_norm": 1.325258420400975, "learning_rate": 1.9857762708314535e-05, "loss": 1.2168, "step": 1710 }, { "epoch": 0.08, "grad_norm": 1.0285904602415692, "learning_rate": 1.9857500782178905e-05, "loss": 1.4336, "step": 1711 }, { "epoch": 0.08, "grad_norm": 1.2701582065726138, "learning_rate": 1.9857238616830718e-05, "loss": 1.2202, "step": 1712 }, { "epoch": 0.08, "grad_norm": 1.1025760437458634, "learning_rate": 1.9856976212276344e-05, "loss": 1.2827, "step": 1713 }, { "epoch": 0.08, "grad_norm": 1.467506877024449, "learning_rate": 1.9856713568522143e-05, "loss": 1.4424, "step": 1714 }, { "epoch": 0.08, "grad_norm": 0.8754132866254062, "learning_rate": 1.985645068557449e-05, "loss": 1.0776, "step": 1715 }, { "epoch": 0.08, "grad_norm": 1.104738135089978, "learning_rate": 1.985618756343977e-05, "loss": 1.1758, "step": 1716 }, { "epoch": 0.08, "grad_norm": 1.2047305424772685, "learning_rate": 1.9855924202124358e-05, "loss": 1.3252, "step": 1717 }, { "epoch": 0.08, "grad_norm": 1.322928053264383, "learning_rate": 1.9855660601634656e-05, "loss": 1.1968, "step": 1718 }, { "epoch": 0.08, "grad_norm": 1.7323570095694776, "learning_rate": 1.9855396761977052e-05, "loss": 1.3301, "step": 1719 }, { "epoch": 0.08, "grad_norm": 1.3841468423555108, "learning_rate": 1.985513268315795e-05, "loss": 1.1597, "step": 1720 }, { "epoch": 0.08, "grad_norm": 1.0837885105472314, "learning_rate": 1.985486836518377e-05, "loss": 1.314, "step": 1721 }, { "epoch": 0.08, "grad_norm": 1.343073132074367, "learning_rate": 1.9854603808060907e-05, "loss": 1.1477, "step": 1722 }, { "epoch": 0.08, "grad_norm": 1.3670448751058277, "learning_rate": 1.9854339011795795e-05, "loss": 1.4141, "step": 1723 }, { "epoch": 0.08, "grad_norm": 1.173810842612231, "learning_rate": 1.9854073976394858e-05, "loss": 1.252, "step": 1724 }, { "epoch": 0.08, "grad_norm": 1.3748869680961737, "learning_rate": 1.9853808701864522e-05, "loss": 1.2627, "step": 1725 }, { "epoch": 0.08, "grad_norm": 1.1308112006159756, "learning_rate": 1.985354318821123e-05, "loss": 1.2632, "step": 1726 }, { "epoch": 0.08, "grad_norm": 1.230740619326656, "learning_rate": 1.9853277435441422e-05, "loss": 1.1558, "step": 1727 }, { "epoch": 0.08, "grad_norm": 1.369306932536038, "learning_rate": 1.985301144356155e-05, "loss": 1.3789, "step": 1728 }, { "epoch": 0.08, "grad_norm": 1.257157813591945, "learning_rate": 1.9852745212578063e-05, "loss": 1.3359, "step": 1729 }, { "epoch": 0.08, "grad_norm": 1.2735745089701531, "learning_rate": 1.9852478742497426e-05, "loss": 1.2554, "step": 1730 }, { "epoch": 0.08, "grad_norm": 1.2966505055890094, "learning_rate": 1.985221203332611e-05, "loss": 1.3311, "step": 1731 }, { "epoch": 0.08, "grad_norm": 1.6055596929694471, "learning_rate": 1.985194508507058e-05, "loss": 1.2646, "step": 1732 }, { "epoch": 0.08, "grad_norm": 1.384939252942962, "learning_rate": 1.9851677897737314e-05, "loss": 1.2837, "step": 1733 }, { "epoch": 0.08, "grad_norm": 1.2763714375228592, "learning_rate": 1.98514104713328e-05, "loss": 1.3398, "step": 1734 }, { "epoch": 0.08, "grad_norm": 1.3062282027424854, "learning_rate": 1.9851142805863523e-05, "loss": 1.1411, "step": 1735 }, { "epoch": 0.08, "grad_norm": 1.2165095725823665, "learning_rate": 1.9850874901335984e-05, "loss": 1.0901, "step": 1736 }, { "epoch": 0.08, "grad_norm": 1.231108688971998, "learning_rate": 1.9850606757756683e-05, "loss": 1.2671, "step": 1737 }, { "epoch": 0.08, "grad_norm": 1.289500160671035, "learning_rate": 1.9850338375132125e-05, "loss": 1.3472, "step": 1738 }, { "epoch": 0.08, "grad_norm": 1.1194221539170175, "learning_rate": 1.985006975346882e-05, "loss": 1.0845, "step": 1739 }, { "epoch": 0.08, "grad_norm": 1.7074085507493169, "learning_rate": 1.9849800892773293e-05, "loss": 1.2471, "step": 1740 }, { "epoch": 0.08, "grad_norm": 1.5780239214584602, "learning_rate": 1.9849531793052064e-05, "loss": 1.1357, "step": 1741 }, { "epoch": 0.08, "grad_norm": 1.382660377420584, "learning_rate": 1.9849262454311663e-05, "loss": 1.2007, "step": 1742 }, { "epoch": 0.08, "grad_norm": 1.421342155171571, "learning_rate": 1.9848992876558633e-05, "loss": 1.2993, "step": 1743 }, { "epoch": 0.08, "grad_norm": 1.2706096746238078, "learning_rate": 1.9848723059799508e-05, "loss": 1.2686, "step": 1744 }, { "epoch": 0.08, "grad_norm": 0.655617420267706, "learning_rate": 1.9848453004040838e-05, "loss": 1.2148, "step": 1745 }, { "epoch": 0.08, "grad_norm": 1.2300310108755486, "learning_rate": 1.984818270928918e-05, "loss": 1.2744, "step": 1746 }, { "epoch": 0.08, "grad_norm": 1.2478262913144302, "learning_rate": 1.9847912175551085e-05, "loss": 1.2637, "step": 1747 }, { "epoch": 0.08, "grad_norm": 1.2687880338273334, "learning_rate": 1.9847641402833127e-05, "loss": 1.3018, "step": 1748 }, { "epoch": 0.08, "grad_norm": 1.400401039984681, "learning_rate": 1.9847370391141872e-05, "loss": 1.2183, "step": 1749 }, { "epoch": 0.08, "grad_norm": 1.1702973385522992, "learning_rate": 1.9847099140483896e-05, "loss": 1.0503, "step": 1750 }, { "epoch": 0.08, "grad_norm": 1.6880487717902481, "learning_rate": 1.9846827650865787e-05, "loss": 1.2178, "step": 1751 }, { "epoch": 0.08, "grad_norm": 1.3334919040360198, "learning_rate": 1.9846555922294123e-05, "loss": 1.1689, "step": 1752 }, { "epoch": 0.08, "grad_norm": 1.3801483334467752, "learning_rate": 1.984628395477551e-05, "loss": 1.3003, "step": 1753 }, { "epoch": 0.08, "grad_norm": 1.3913933444369806, "learning_rate": 1.984601174831654e-05, "loss": 1.3491, "step": 1754 }, { "epoch": 0.08, "grad_norm": 1.4331349726429305, "learning_rate": 1.9845739302923822e-05, "loss": 1.2676, "step": 1755 }, { "epoch": 0.08, "grad_norm": 1.1899476594072496, "learning_rate": 1.984546661860397e-05, "loss": 1.1714, "step": 1756 }, { "epoch": 0.08, "grad_norm": 1.123957126670351, "learning_rate": 1.9845193695363592e-05, "loss": 1.1875, "step": 1757 }, { "epoch": 0.08, "grad_norm": 1.4441281009373768, "learning_rate": 1.984492053320932e-05, "loss": 1.1602, "step": 1758 }, { "epoch": 0.08, "grad_norm": 1.3919848625473352, "learning_rate": 1.9844647132147773e-05, "loss": 1.2852, "step": 1759 }, { "epoch": 0.08, "grad_norm": 1.3349011256731225, "learning_rate": 1.98443734921856e-05, "loss": 1.1182, "step": 1760 }, { "epoch": 0.08, "grad_norm": 1.3500934566137928, "learning_rate": 1.984409961332943e-05, "loss": 1.2739, "step": 1761 }, { "epoch": 0.08, "grad_norm": 1.3002065222767116, "learning_rate": 1.9843825495585912e-05, "loss": 1.2202, "step": 1762 }, { "epoch": 0.08, "grad_norm": 0.9301056197056421, "learning_rate": 1.98435511389617e-05, "loss": 1.1401, "step": 1763 }, { "epoch": 0.08, "grad_norm": 1.3731401215903167, "learning_rate": 1.9843276543463452e-05, "loss": 1.2202, "step": 1764 }, { "epoch": 0.08, "grad_norm": 1.4753248084204624, "learning_rate": 1.9843001709097832e-05, "loss": 1.1294, "step": 1765 }, { "epoch": 0.08, "grad_norm": 1.4634661076421687, "learning_rate": 1.9842726635871503e-05, "loss": 1.4268, "step": 1766 }, { "epoch": 0.08, "grad_norm": 1.2622118966740314, "learning_rate": 1.9842451323791146e-05, "loss": 1.2544, "step": 1767 }, { "epoch": 0.09, "grad_norm": 1.003937793988478, "learning_rate": 1.9842175772863442e-05, "loss": 1.3047, "step": 1768 }, { "epoch": 0.09, "grad_norm": 1.4018160737830376, "learning_rate": 1.984189998309508e-05, "loss": 1.2236, "step": 1769 }, { "epoch": 0.09, "grad_norm": 1.5237917683404207, "learning_rate": 1.9841623954492744e-05, "loss": 1.0415, "step": 1770 }, { "epoch": 0.09, "grad_norm": 1.0315678916968165, "learning_rate": 1.984134768706314e-05, "loss": 1.3403, "step": 1771 }, { "epoch": 0.09, "grad_norm": 1.4584492678227996, "learning_rate": 1.9841071180812972e-05, "loss": 1.21, "step": 1772 }, { "epoch": 0.09, "grad_norm": 1.2569077078070172, "learning_rate": 1.9840794435748946e-05, "loss": 1.3501, "step": 1773 }, { "epoch": 0.09, "grad_norm": 1.4250537933506815, "learning_rate": 1.984051745187778e-05, "loss": 1.3262, "step": 1774 }, { "epoch": 0.09, "grad_norm": 1.4691990883754478, "learning_rate": 1.9840240229206194e-05, "loss": 1.4224, "step": 1775 }, { "epoch": 0.09, "grad_norm": 1.1503567405444488, "learning_rate": 1.9839962767740918e-05, "loss": 1.104, "step": 1776 }, { "epoch": 0.09, "grad_norm": 1.4069277259214634, "learning_rate": 1.9839685067488683e-05, "loss": 1.272, "step": 1777 }, { "epoch": 0.09, "grad_norm": 1.3570906265536857, "learning_rate": 1.983940712845623e-05, "loss": 1.3423, "step": 1778 }, { "epoch": 0.09, "grad_norm": 1.2968982949166132, "learning_rate": 1.9839128950650302e-05, "loss": 1.1843, "step": 1779 }, { "epoch": 0.09, "grad_norm": 1.440589170789255, "learning_rate": 1.9838850534077652e-05, "loss": 1.0952, "step": 1780 }, { "epoch": 0.09, "grad_norm": 1.4900470325711788, "learning_rate": 1.983857187874503e-05, "loss": 1.3486, "step": 1781 }, { "epoch": 0.09, "grad_norm": 1.3055924610428082, "learning_rate": 1.9838292984659207e-05, "loss": 1.1548, "step": 1782 }, { "epoch": 0.09, "grad_norm": 1.311845292910859, "learning_rate": 1.9838013851826948e-05, "loss": 1.2798, "step": 1783 }, { "epoch": 0.09, "grad_norm": 1.1315891682844044, "learning_rate": 1.983773448025502e-05, "loss": 1.2222, "step": 1784 }, { "epoch": 0.09, "grad_norm": 1.3755940003257658, "learning_rate": 1.983745486995021e-05, "loss": 1.2964, "step": 1785 }, { "epoch": 0.09, "grad_norm": 1.0437521118940725, "learning_rate": 1.9837175020919304e-05, "loss": 1.1621, "step": 1786 }, { "epoch": 0.09, "grad_norm": 1.3482654757471504, "learning_rate": 1.983689493316909e-05, "loss": 1.188, "step": 1787 }, { "epoch": 0.09, "grad_norm": 0.9508350029086002, "learning_rate": 1.983661460670636e-05, "loss": 1.1768, "step": 1788 }, { "epoch": 0.09, "grad_norm": 1.2362234811230115, "learning_rate": 1.9836334041537928e-05, "loss": 1.4507, "step": 1789 }, { "epoch": 0.09, "grad_norm": 1.195258275515163, "learning_rate": 1.9836053237670594e-05, "loss": 1.2319, "step": 1790 }, { "epoch": 0.09, "grad_norm": 1.362622719888542, "learning_rate": 1.9835772195111174e-05, "loss": 1.2119, "step": 1791 }, { "epoch": 0.09, "grad_norm": 1.6713560550560689, "learning_rate": 1.9835490913866492e-05, "loss": 1.3672, "step": 1792 }, { "epoch": 0.09, "grad_norm": 1.2060415800712028, "learning_rate": 1.9835209393943366e-05, "loss": 1.2549, "step": 1793 }, { "epoch": 0.09, "grad_norm": 1.1140079926468116, "learning_rate": 1.9834927635348635e-05, "loss": 1.4097, "step": 1794 }, { "epoch": 0.09, "grad_norm": 1.1289887483072814, "learning_rate": 1.983464563808913e-05, "loss": 1.2627, "step": 1795 }, { "epoch": 0.09, "grad_norm": 1.3942316271006572, "learning_rate": 1.9834363402171704e-05, "loss": 1.2837, "step": 1796 }, { "epoch": 0.09, "grad_norm": 1.214359073336569, "learning_rate": 1.9834080927603198e-05, "loss": 1.2261, "step": 1797 }, { "epoch": 0.09, "grad_norm": 1.335786196950659, "learning_rate": 1.9833798214390467e-05, "loss": 1.3462, "step": 1798 }, { "epoch": 0.09, "grad_norm": 0.952411291889435, "learning_rate": 1.9833515262540376e-05, "loss": 1.083, "step": 1799 }, { "epoch": 0.09, "grad_norm": 1.0824754526560696, "learning_rate": 1.9833232072059786e-05, "loss": 1.0635, "step": 1800 }, { "epoch": 0.09, "grad_norm": 1.1343186769665288, "learning_rate": 1.9832948642955574e-05, "loss": 1.2031, "step": 1801 }, { "epoch": 0.09, "grad_norm": 1.5606034603676007, "learning_rate": 1.983266497523462e-05, "loss": 1.2207, "step": 1802 }, { "epoch": 0.09, "grad_norm": 1.3328582285144686, "learning_rate": 1.9832381068903797e-05, "loss": 1.1987, "step": 1803 }, { "epoch": 0.09, "grad_norm": 1.1072247820328662, "learning_rate": 1.9832096923970002e-05, "loss": 1.2178, "step": 1804 }, { "epoch": 0.09, "grad_norm": 1.1737685891545009, "learning_rate": 1.983181254044013e-05, "loss": 1.1289, "step": 1805 }, { "epoch": 0.09, "grad_norm": 1.2881356953876921, "learning_rate": 1.983152791832109e-05, "loss": 1.1987, "step": 1806 }, { "epoch": 0.09, "grad_norm": 1.1960482924607332, "learning_rate": 1.9831243057619774e-05, "loss": 1.2588, "step": 1807 }, { "epoch": 0.09, "grad_norm": 1.161388606630917, "learning_rate": 1.98309579583431e-05, "loss": 1.1812, "step": 1808 }, { "epoch": 0.09, "grad_norm": 1.3470186324330802, "learning_rate": 1.983067262049799e-05, "loss": 1.2427, "step": 1809 }, { "epoch": 0.09, "grad_norm": 1.0795804952988468, "learning_rate": 1.983038704409137e-05, "loss": 1.2471, "step": 1810 }, { "epoch": 0.09, "grad_norm": 1.0960805566454825, "learning_rate": 1.9830101229130162e-05, "loss": 1.2769, "step": 1811 }, { "epoch": 0.09, "grad_norm": 1.5931898942851315, "learning_rate": 1.9829815175621306e-05, "loss": 1.2056, "step": 1812 }, { "epoch": 0.09, "grad_norm": 1.132318931297347, "learning_rate": 1.9829528883571747e-05, "loss": 1.2393, "step": 1813 }, { "epoch": 0.09, "grad_norm": 1.196680125022584, "learning_rate": 1.982924235298843e-05, "loss": 1.1641, "step": 1814 }, { "epoch": 0.09, "grad_norm": 1.7384397136070218, "learning_rate": 1.9828955583878306e-05, "loss": 1.3389, "step": 1815 }, { "epoch": 0.09, "grad_norm": 1.4234072559778765, "learning_rate": 1.9828668576248336e-05, "loss": 1.355, "step": 1816 }, { "epoch": 0.09, "grad_norm": 1.4004840104245582, "learning_rate": 1.9828381330105487e-05, "loss": 1.3145, "step": 1817 }, { "epoch": 0.09, "grad_norm": 1.430145483068612, "learning_rate": 1.9828093845456726e-05, "loss": 1.292, "step": 1818 }, { "epoch": 0.09, "grad_norm": 1.131864765010865, "learning_rate": 1.982780612230903e-05, "loss": 1.1602, "step": 1819 }, { "epoch": 0.09, "grad_norm": 1.2271419667502215, "learning_rate": 1.9827518160669382e-05, "loss": 1.0393, "step": 1820 }, { "epoch": 0.09, "grad_norm": 1.4873407662173224, "learning_rate": 1.9827229960544773e-05, "loss": 1.2061, "step": 1821 }, { "epoch": 0.09, "grad_norm": 1.4788968370922488, "learning_rate": 1.9826941521942187e-05, "loss": 1.3604, "step": 1822 }, { "epoch": 0.09, "grad_norm": 1.380609336499077, "learning_rate": 1.9826652844868636e-05, "loss": 1.2534, "step": 1823 }, { "epoch": 0.09, "grad_norm": 1.048810656379789, "learning_rate": 1.9826363929331118e-05, "loss": 1.0664, "step": 1824 }, { "epoch": 0.09, "grad_norm": 1.381017797331505, "learning_rate": 1.9826074775336645e-05, "loss": 1.4009, "step": 1825 }, { "epoch": 0.09, "grad_norm": 1.1808942145118297, "learning_rate": 1.9825785382892236e-05, "loss": 1.1921, "step": 1826 }, { "epoch": 0.09, "grad_norm": 1.18670521590121, "learning_rate": 1.9825495752004912e-05, "loss": 1.2437, "step": 1827 }, { "epoch": 0.09, "grad_norm": 1.2587701650784606, "learning_rate": 1.98252058826817e-05, "loss": 1.3994, "step": 1828 }, { "epoch": 0.09, "grad_norm": 1.1739826744407886, "learning_rate": 1.982491577492964e-05, "loss": 1.2363, "step": 1829 }, { "epoch": 0.09, "grad_norm": 0.7998882876890707, "learning_rate": 1.982462542875576e-05, "loss": 1.4238, "step": 1830 }, { "epoch": 0.09, "grad_norm": 1.2450536714957336, "learning_rate": 1.9824334844167122e-05, "loss": 1.2188, "step": 1831 }, { "epoch": 0.09, "grad_norm": 1.0456690526533845, "learning_rate": 1.982404402117077e-05, "loss": 1.3252, "step": 1832 }, { "epoch": 0.09, "grad_norm": 0.9356278345584655, "learning_rate": 1.9823752959773758e-05, "loss": 1.2603, "step": 1833 }, { "epoch": 0.09, "grad_norm": 1.3633069944597016, "learning_rate": 1.9823461659983154e-05, "loss": 1.2686, "step": 1834 }, { "epoch": 0.09, "grad_norm": 1.6136539881603067, "learning_rate": 1.9823170121806023e-05, "loss": 1.3042, "step": 1835 }, { "epoch": 0.09, "grad_norm": 1.2750514200637628, "learning_rate": 1.9822878345249443e-05, "loss": 1.27, "step": 1836 }, { "epoch": 0.09, "grad_norm": 1.4391801210518234, "learning_rate": 1.9822586330320492e-05, "loss": 1.3682, "step": 1837 }, { "epoch": 0.09, "grad_norm": 1.1536653593791943, "learning_rate": 1.982229407702626e-05, "loss": 1.0552, "step": 1838 }, { "epoch": 0.09, "grad_norm": 1.372185787930189, "learning_rate": 1.9822001585373835e-05, "loss": 1.3882, "step": 1839 }, { "epoch": 0.09, "grad_norm": 1.3925096135691213, "learning_rate": 1.9821708855370322e-05, "loss": 1.1118, "step": 1840 }, { "epoch": 0.09, "grad_norm": 1.0259853358389837, "learning_rate": 1.9821415887022813e-05, "loss": 1.2349, "step": 1841 }, { "epoch": 0.09, "grad_norm": 1.687585957117712, "learning_rate": 1.9821122680338428e-05, "loss": 1.3047, "step": 1842 }, { "epoch": 0.09, "grad_norm": 1.3350539595930961, "learning_rate": 1.982082923532428e-05, "loss": 1.1519, "step": 1843 }, { "epoch": 0.09, "grad_norm": 1.1884548795914591, "learning_rate": 1.9820535551987486e-05, "loss": 1.333, "step": 1844 }, { "epoch": 0.09, "grad_norm": 1.2306533442196632, "learning_rate": 1.9820241630335176e-05, "loss": 1.0474, "step": 1845 }, { "epoch": 0.09, "grad_norm": 1.3535765161711673, "learning_rate": 1.9819947470374484e-05, "loss": 1.3374, "step": 1846 }, { "epoch": 0.09, "grad_norm": 1.3616763642056717, "learning_rate": 1.981965307211254e-05, "loss": 1.3306, "step": 1847 }, { "epoch": 0.09, "grad_norm": 1.2960870503315047, "learning_rate": 1.98193584355565e-05, "loss": 1.3975, "step": 1848 }, { "epoch": 0.09, "grad_norm": 1.2024150132485367, "learning_rate": 1.9819063560713507e-05, "loss": 1.2456, "step": 1849 }, { "epoch": 0.09, "grad_norm": 1.1419363856404334, "learning_rate": 1.981876844759072e-05, "loss": 1.2158, "step": 1850 }, { "epoch": 0.09, "grad_norm": 1.2143021083560086, "learning_rate": 1.9818473096195295e-05, "loss": 1.1821, "step": 1851 }, { "epoch": 0.09, "grad_norm": 1.5311817229537181, "learning_rate": 1.9818177506534406e-05, "loss": 1.2852, "step": 1852 }, { "epoch": 0.09, "grad_norm": 1.0880649701378982, "learning_rate": 1.981788167861522e-05, "loss": 1.2422, "step": 1853 }, { "epoch": 0.09, "grad_norm": 1.3099342549073392, "learning_rate": 1.9817585612444923e-05, "loss": 1.3076, "step": 1854 }, { "epoch": 0.09, "grad_norm": 1.143384774778335, "learning_rate": 1.9817289308030695e-05, "loss": 1.4126, "step": 1855 }, { "epoch": 0.09, "grad_norm": 1.2933426037574447, "learning_rate": 1.9816992765379725e-05, "loss": 1.2622, "step": 1856 }, { "epoch": 0.09, "grad_norm": 1.3653525395177861, "learning_rate": 1.981669598449921e-05, "loss": 1.2393, "step": 1857 }, { "epoch": 0.09, "grad_norm": 1.1037763411359711, "learning_rate": 1.981639896539636e-05, "loss": 1.2681, "step": 1858 }, { "epoch": 0.09, "grad_norm": 1.2235161558151872, "learning_rate": 1.981610170807837e-05, "loss": 1.3755, "step": 1859 }, { "epoch": 0.09, "grad_norm": 1.1917725639178294, "learning_rate": 1.981580421255246e-05, "loss": 1.1958, "step": 1860 }, { "epoch": 0.09, "grad_norm": 1.463007154066213, "learning_rate": 1.981550647882585e-05, "loss": 1.3535, "step": 1861 }, { "epoch": 0.09, "grad_norm": 1.5289923588107923, "learning_rate": 1.9815208506905764e-05, "loss": 1.3887, "step": 1862 }, { "epoch": 0.09, "grad_norm": 1.1503786950060537, "learning_rate": 1.9814910296799436e-05, "loss": 1.1577, "step": 1863 }, { "epoch": 0.09, "grad_norm": 1.5339452811644811, "learning_rate": 1.9814611848514095e-05, "loss": 0.9851, "step": 1864 }, { "epoch": 0.09, "grad_norm": 1.6069252152410434, "learning_rate": 1.9814313162056993e-05, "loss": 1.4082, "step": 1865 }, { "epoch": 0.09, "grad_norm": 1.336403139218595, "learning_rate": 1.981401423743537e-05, "loss": 1.2539, "step": 1866 }, { "epoch": 0.09, "grad_norm": 1.143071807788205, "learning_rate": 1.9813715074656482e-05, "loss": 1.2822, "step": 1867 }, { "epoch": 0.09, "grad_norm": 1.271020837079988, "learning_rate": 1.9813415673727594e-05, "loss": 1.1118, "step": 1868 }, { "epoch": 0.09, "grad_norm": 1.3314967392576869, "learning_rate": 1.9813116034655966e-05, "loss": 1.3105, "step": 1869 }, { "epoch": 0.09, "grad_norm": 1.3462319275916297, "learning_rate": 1.9812816157448874e-05, "loss": 1.1533, "step": 1870 }, { "epoch": 0.09, "grad_norm": 1.096247330792168, "learning_rate": 1.9812516042113588e-05, "loss": 1.1172, "step": 1871 }, { "epoch": 0.09, "grad_norm": 1.1371645632359064, "learning_rate": 1.9812215688657397e-05, "loss": 1.2212, "step": 1872 }, { "epoch": 0.09, "grad_norm": 1.448132484891773, "learning_rate": 1.9811915097087587e-05, "loss": 1.1807, "step": 1873 }, { "epoch": 0.09, "grad_norm": 1.3924437232799183, "learning_rate": 1.9811614267411453e-05, "loss": 1.4575, "step": 1874 }, { "epoch": 0.09, "grad_norm": 1.32699154787606, "learning_rate": 1.98113131996363e-05, "loss": 1.3921, "step": 1875 }, { "epoch": 0.09, "grad_norm": 1.5497160891791912, "learning_rate": 1.9811011893769424e-05, "loss": 1.1748, "step": 1876 }, { "epoch": 0.09, "grad_norm": 1.1260457209155514, "learning_rate": 1.9810710349818147e-05, "loss": 1.3169, "step": 1877 }, { "epoch": 0.09, "grad_norm": 1.4889529331914144, "learning_rate": 1.9810408567789777e-05, "loss": 1.3105, "step": 1878 }, { "epoch": 0.09, "grad_norm": 1.241501955003818, "learning_rate": 1.9810106547691648e-05, "loss": 1.2632, "step": 1879 }, { "epoch": 0.09, "grad_norm": 1.3583733722203615, "learning_rate": 1.980980428953108e-05, "loss": 1.1792, "step": 1880 }, { "epoch": 0.09, "grad_norm": 1.341871972146023, "learning_rate": 1.9809501793315412e-05, "loss": 1.2075, "step": 1881 }, { "epoch": 0.09, "grad_norm": 1.3535527634985074, "learning_rate": 1.9809199059051987e-05, "loss": 1.1455, "step": 1882 }, { "epoch": 0.09, "grad_norm": 1.505812085624719, "learning_rate": 1.9808896086748146e-05, "loss": 1.2812, "step": 1883 }, { "epoch": 0.09, "grad_norm": 0.9410656715009651, "learning_rate": 1.9808592876411245e-05, "loss": 1.3184, "step": 1884 }, { "epoch": 0.09, "grad_norm": 1.1909379414074224, "learning_rate": 1.980828942804864e-05, "loss": 1.25, "step": 1885 }, { "epoch": 0.09, "grad_norm": 1.4323633281455233, "learning_rate": 1.9807985741667696e-05, "loss": 1.3062, "step": 1886 }, { "epoch": 0.09, "grad_norm": 1.3912976285764427, "learning_rate": 1.9807681817275783e-05, "loss": 1.3447, "step": 1887 }, { "epoch": 0.09, "grad_norm": 1.2244205229669156, "learning_rate": 1.9807377654880274e-05, "loss": 1.4263, "step": 1888 }, { "epoch": 0.09, "grad_norm": 1.4648441371114906, "learning_rate": 1.9807073254488554e-05, "loss": 1.395, "step": 1889 }, { "epoch": 0.09, "grad_norm": 1.1879106042254508, "learning_rate": 1.9806768616108006e-05, "loss": 1.2793, "step": 1890 }, { "epoch": 0.09, "grad_norm": 1.3341631637910782, "learning_rate": 1.9806463739746026e-05, "loss": 1.0142, "step": 1891 }, { "epoch": 0.09, "grad_norm": 0.8959301333413131, "learning_rate": 1.9806158625410008e-05, "loss": 1.3291, "step": 1892 }, { "epoch": 0.09, "grad_norm": 1.1413543119468637, "learning_rate": 1.980585327310736e-05, "loss": 1.3394, "step": 1893 }, { "epoch": 0.09, "grad_norm": 1.5555458544381386, "learning_rate": 1.9805547682845494e-05, "loss": 1.2676, "step": 1894 }, { "epoch": 0.09, "grad_norm": 1.3135357680091526, "learning_rate": 1.980524185463182e-05, "loss": 1.2974, "step": 1895 }, { "epoch": 0.09, "grad_norm": 1.2931462983151132, "learning_rate": 1.9804935788473765e-05, "loss": 1.1538, "step": 1896 }, { "epoch": 0.09, "grad_norm": 1.271293707715259, "learning_rate": 1.980462948437875e-05, "loss": 1.2432, "step": 1897 }, { "epoch": 0.09, "grad_norm": 1.2205407584606955, "learning_rate": 1.9804322942354216e-05, "loss": 1.3623, "step": 1898 }, { "epoch": 0.09, "grad_norm": 1.271260876053814, "learning_rate": 1.9804016162407593e-05, "loss": 1.2573, "step": 1899 }, { "epoch": 0.09, "grad_norm": 1.2050906094853715, "learning_rate": 1.980370914454633e-05, "loss": 1.3423, "step": 1900 }, { "epoch": 0.09, "grad_norm": 1.0436959750842918, "learning_rate": 1.9803401888777882e-05, "loss": 1.2832, "step": 1901 }, { "epoch": 0.09, "grad_norm": 1.5487180477430478, "learning_rate": 1.98030943951097e-05, "loss": 1.2437, "step": 1902 }, { "epoch": 0.09, "grad_norm": 1.2160795571778615, "learning_rate": 1.9802786663549244e-05, "loss": 1.251, "step": 1903 }, { "epoch": 0.09, "grad_norm": 1.4803574845508547, "learning_rate": 1.9802478694103987e-05, "loss": 1.1494, "step": 1904 }, { "epoch": 0.09, "grad_norm": 1.365936864911018, "learning_rate": 1.98021704867814e-05, "loss": 1.3901, "step": 1905 }, { "epoch": 0.09, "grad_norm": 1.2355988877387292, "learning_rate": 1.9801862041588966e-05, "loss": 1.2583, "step": 1906 }, { "epoch": 0.09, "grad_norm": 1.225360345430169, "learning_rate": 1.9801553358534157e-05, "loss": 1.249, "step": 1907 }, { "epoch": 0.09, "grad_norm": 1.3061305562978804, "learning_rate": 1.980124443762448e-05, "loss": 1.3647, "step": 1908 }, { "epoch": 0.09, "grad_norm": 1.2210649214891178, "learning_rate": 1.9800935278867425e-05, "loss": 1.1528, "step": 1909 }, { "epoch": 0.09, "grad_norm": 1.186099945768962, "learning_rate": 1.9800625882270493e-05, "loss": 1.2646, "step": 1910 }, { "epoch": 0.09, "grad_norm": 1.5234890191114479, "learning_rate": 1.9800316247841194e-05, "loss": 1.3188, "step": 1911 }, { "epoch": 0.09, "grad_norm": 1.1577257558176532, "learning_rate": 1.9800006375587043e-05, "loss": 1.1816, "step": 1912 }, { "epoch": 0.09, "grad_norm": 1.1232217587412756, "learning_rate": 1.9799696265515554e-05, "loss": 1.042, "step": 1913 }, { "epoch": 0.09, "grad_norm": 1.3539644189490196, "learning_rate": 1.9799385917634256e-05, "loss": 1.29, "step": 1914 }, { "epoch": 0.09, "grad_norm": 1.576710776008854, "learning_rate": 1.9799075331950687e-05, "loss": 1.2925, "step": 1915 }, { "epoch": 0.09, "grad_norm": 1.1629957241868367, "learning_rate": 1.9798764508472373e-05, "loss": 1.1528, "step": 1916 }, { "epoch": 0.09, "grad_norm": 1.2714239662622298, "learning_rate": 1.9798453447206862e-05, "loss": 1.2642, "step": 1917 }, { "epoch": 0.09, "grad_norm": 1.4317572925039406, "learning_rate": 1.9798142148161703e-05, "loss": 1.4307, "step": 1918 }, { "epoch": 0.09, "grad_norm": 0.8748751891435529, "learning_rate": 1.9797830611344448e-05, "loss": 1.2036, "step": 1919 }, { "epoch": 0.09, "grad_norm": 1.1420997441221192, "learning_rate": 1.979751883676266e-05, "loss": 1.0698, "step": 1920 }, { "epoch": 0.09, "grad_norm": 1.1107925757162522, "learning_rate": 1.9797206824423904e-05, "loss": 1.3062, "step": 1921 }, { "epoch": 0.09, "grad_norm": 1.3699510783233686, "learning_rate": 1.979689457433575e-05, "loss": 1.2842, "step": 1922 }, { "epoch": 0.09, "grad_norm": 1.2783085978969468, "learning_rate": 1.9796582086505774e-05, "loss": 1.3325, "step": 1923 }, { "epoch": 0.09, "grad_norm": 1.1116854271641414, "learning_rate": 1.9796269360941563e-05, "loss": 1.2915, "step": 1924 }, { "epoch": 0.09, "grad_norm": 1.1713580322894388, "learning_rate": 1.9795956397650708e-05, "loss": 1.2319, "step": 1925 }, { "epoch": 0.09, "grad_norm": 1.8197100156795338, "learning_rate": 1.9795643196640794e-05, "loss": 1.1592, "step": 1926 }, { "epoch": 0.09, "grad_norm": 1.201138460275207, "learning_rate": 1.9795329757919433e-05, "loss": 1.1328, "step": 1927 }, { "epoch": 0.09, "grad_norm": 1.180477836679222, "learning_rate": 1.9795016081494222e-05, "loss": 1.1226, "step": 1928 }, { "epoch": 0.09, "grad_norm": 0.9534884172599033, "learning_rate": 1.9794702167372776e-05, "loss": 1.1675, "step": 1929 }, { "epoch": 0.09, "grad_norm": 1.3825822589924321, "learning_rate": 1.9794388015562718e-05, "loss": 1.3335, "step": 1930 }, { "epoch": 0.09, "grad_norm": 1.420058194474751, "learning_rate": 1.9794073626071664e-05, "loss": 1.3564, "step": 1931 }, { "epoch": 0.09, "grad_norm": 1.3617818143415814, "learning_rate": 1.9793758998907248e-05, "loss": 1.3984, "step": 1932 }, { "epoch": 0.09, "grad_norm": 1.3508097019586434, "learning_rate": 1.9793444134077102e-05, "loss": 1.3389, "step": 1933 }, { "epoch": 0.09, "grad_norm": 1.1818094785205957, "learning_rate": 1.979312903158887e-05, "loss": 1.2568, "step": 1934 }, { "epoch": 0.09, "grad_norm": 1.1395130624512262, "learning_rate": 1.9792813691450193e-05, "loss": 1.1558, "step": 1935 }, { "epoch": 0.09, "grad_norm": 1.4412372225196295, "learning_rate": 1.9792498113668733e-05, "loss": 1.2866, "step": 1936 }, { "epoch": 0.09, "grad_norm": 1.5415974413723252, "learning_rate": 1.979218229825214e-05, "loss": 1.4028, "step": 1937 }, { "epoch": 0.09, "grad_norm": 0.9975591377603297, "learning_rate": 1.979186624520808e-05, "loss": 1.1362, "step": 1938 }, { "epoch": 0.09, "grad_norm": 1.3604283528299983, "learning_rate": 1.9791549954544226e-05, "loss": 1.2744, "step": 1939 }, { "epoch": 0.09, "grad_norm": 1.2611282165658084, "learning_rate": 1.9791233426268245e-05, "loss": 1.3066, "step": 1940 }, { "epoch": 0.09, "grad_norm": 1.3773837719073607, "learning_rate": 1.9790916660387825e-05, "loss": 1.3213, "step": 1941 }, { "epoch": 0.09, "grad_norm": 1.331772185699569, "learning_rate": 1.9790599656910653e-05, "loss": 1.2944, "step": 1942 }, { "epoch": 0.09, "grad_norm": 1.6354764154756627, "learning_rate": 1.9790282415844425e-05, "loss": 1.2729, "step": 1943 }, { "epoch": 0.09, "grad_norm": 1.3435888999656587, "learning_rate": 1.978996493719683e-05, "loss": 1.353, "step": 1944 }, { "epoch": 0.09, "grad_norm": 1.0170187415256409, "learning_rate": 1.9789647220975578e-05, "loss": 1.2197, "step": 1945 }, { "epoch": 0.09, "grad_norm": 1.3423722400733198, "learning_rate": 1.9789329267188378e-05, "loss": 1.0098, "step": 1946 }, { "epoch": 0.09, "grad_norm": 0.9092439866234423, "learning_rate": 1.9789011075842947e-05, "loss": 1.0308, "step": 1947 }, { "epoch": 0.09, "grad_norm": 1.0190733705070874, "learning_rate": 1.9788692646947e-05, "loss": 1.2646, "step": 1948 }, { "epoch": 0.09, "grad_norm": 1.335657136392883, "learning_rate": 1.978837398050828e-05, "loss": 1.2798, "step": 1949 }, { "epoch": 0.09, "grad_norm": 1.426907493007665, "learning_rate": 1.9788055076534504e-05, "loss": 1.3481, "step": 1950 }, { "epoch": 0.09, "grad_norm": 1.3543197094558357, "learning_rate": 1.9787735935033415e-05, "loss": 1.1167, "step": 1951 }, { "epoch": 0.09, "grad_norm": 1.440283363594555, "learning_rate": 1.9787416556012766e-05, "loss": 1.2124, "step": 1952 }, { "epoch": 0.09, "grad_norm": 1.2758173427514399, "learning_rate": 1.9787096939480296e-05, "loss": 1.3149, "step": 1953 }, { "epoch": 0.09, "grad_norm": 1.3577642509279624, "learning_rate": 1.9786777085443766e-05, "loss": 1.1011, "step": 1954 }, { "epoch": 0.09, "grad_norm": 1.3296651840060834, "learning_rate": 1.9786456993910938e-05, "loss": 1.335, "step": 1955 }, { "epoch": 0.09, "grad_norm": 1.274691401027712, "learning_rate": 1.9786136664889583e-05, "loss": 1.2183, "step": 1956 }, { "epoch": 0.09, "grad_norm": 1.3340410916063563, "learning_rate": 1.9785816098387468e-05, "loss": 1.1196, "step": 1957 }, { "epoch": 0.09, "grad_norm": 1.3834039904808135, "learning_rate": 1.9785495294412378e-05, "loss": 1.1489, "step": 1958 }, { "epoch": 0.09, "grad_norm": 1.3465260967343657, "learning_rate": 1.9785174252972092e-05, "loss": 1.2104, "step": 1959 }, { "epoch": 0.09, "grad_norm": 1.3693942367620284, "learning_rate": 1.9784852974074408e-05, "loss": 1.1582, "step": 1960 }, { "epoch": 0.09, "grad_norm": 1.5251741720063983, "learning_rate": 1.9784531457727116e-05, "loss": 1.3569, "step": 1961 }, { "epoch": 0.09, "grad_norm": 1.1505700713100273, "learning_rate": 1.978420970393802e-05, "loss": 1.3442, "step": 1962 }, { "epoch": 0.09, "grad_norm": 1.4424192531196682, "learning_rate": 1.978388771271493e-05, "loss": 1.3423, "step": 1963 }, { "epoch": 0.09, "grad_norm": 1.2637014815844476, "learning_rate": 1.9783565484065657e-05, "loss": 1.1538, "step": 1964 }, { "epoch": 0.09, "grad_norm": 1.2633761561461772, "learning_rate": 1.978324301799803e-05, "loss": 1.3345, "step": 1965 }, { "epoch": 0.09, "grad_norm": 0.7021856767812172, "learning_rate": 1.978292031451986e-05, "loss": 1.1279, "step": 1966 }, { "epoch": 0.09, "grad_norm": 1.115255513490482, "learning_rate": 1.978259737363898e-05, "loss": 1.2012, "step": 1967 }, { "epoch": 0.09, "grad_norm": 1.3305716134564918, "learning_rate": 1.9782274195363237e-05, "loss": 1.2954, "step": 1968 }, { "epoch": 0.09, "grad_norm": 1.3944257079307671, "learning_rate": 1.978195077970047e-05, "loss": 1.2012, "step": 1969 }, { "epoch": 0.09, "grad_norm": 1.4524427003404283, "learning_rate": 1.978162712665852e-05, "loss": 1.1606, "step": 1970 }, { "epoch": 0.09, "grad_norm": 1.2487084159904944, "learning_rate": 1.978130323624525e-05, "loss": 1.3584, "step": 1971 }, { "epoch": 0.09, "grad_norm": 1.4492066332832005, "learning_rate": 1.9780979108468513e-05, "loss": 1.2188, "step": 1972 }, { "epoch": 0.09, "grad_norm": 1.325234865682575, "learning_rate": 1.978065474333618e-05, "loss": 1.2339, "step": 1973 }, { "epoch": 0.09, "grad_norm": 1.4077117149483565, "learning_rate": 1.978033014085612e-05, "loss": 1.355, "step": 1974 }, { "epoch": 0.09, "grad_norm": 1.15703544827588, "learning_rate": 1.9780005301036208e-05, "loss": 1.3276, "step": 1975 }, { "epoch": 0.1, "grad_norm": 1.1049386815963613, "learning_rate": 1.9779680223884335e-05, "loss": 1.2349, "step": 1976 }, { "epoch": 0.1, "grad_norm": 1.2077627868885916, "learning_rate": 1.977935490940838e-05, "loss": 1.2646, "step": 1977 }, { "epoch": 0.1, "grad_norm": 1.3986305644768324, "learning_rate": 1.977902935761624e-05, "loss": 1.251, "step": 1978 }, { "epoch": 0.1, "grad_norm": 1.6066739875333151, "learning_rate": 1.977870356851582e-05, "loss": 1.2417, "step": 1979 }, { "epoch": 0.1, "grad_norm": 1.3034633542589464, "learning_rate": 1.9778377542115022e-05, "loss": 1.3931, "step": 1980 }, { "epoch": 0.1, "grad_norm": 1.291240417190449, "learning_rate": 1.9778051278421758e-05, "loss": 1.2734, "step": 1981 }, { "epoch": 0.1, "grad_norm": 1.121421271892713, "learning_rate": 1.9777724777443943e-05, "loss": 1.3477, "step": 1982 }, { "epoch": 0.1, "grad_norm": 0.7898969503456257, "learning_rate": 1.9777398039189507e-05, "loss": 1.1743, "step": 1983 }, { "epoch": 0.1, "grad_norm": 1.267344391691263, "learning_rate": 1.9777071063666372e-05, "loss": 1.2603, "step": 1984 }, { "epoch": 0.1, "grad_norm": 1.1057886422483736, "learning_rate": 1.9776743850882476e-05, "loss": 1.1899, "step": 1985 }, { "epoch": 0.1, "grad_norm": 1.252124578417424, "learning_rate": 1.977641640084576e-05, "loss": 1.2109, "step": 1986 }, { "epoch": 0.1, "grad_norm": 1.2469902171086902, "learning_rate": 1.977608871356417e-05, "loss": 1.208, "step": 1987 }, { "epoch": 0.1, "grad_norm": 1.234566485219563, "learning_rate": 1.9775760789045656e-05, "loss": 1.2529, "step": 1988 }, { "epoch": 0.1, "grad_norm": 1.1096364742127283, "learning_rate": 1.9775432627298175e-05, "loss": 1.3228, "step": 1989 }, { "epoch": 0.1, "grad_norm": 1.264457002539019, "learning_rate": 1.97751042283297e-05, "loss": 1.1265, "step": 1990 }, { "epoch": 0.1, "grad_norm": 1.0905838112933341, "learning_rate": 1.9774775592148185e-05, "loss": 1.103, "step": 1991 }, { "epoch": 0.1, "grad_norm": 1.5294572622315383, "learning_rate": 1.9774446718761616e-05, "loss": 1.3809, "step": 1992 }, { "epoch": 0.1, "grad_norm": 0.9072722587924513, "learning_rate": 1.9774117608177968e-05, "loss": 1.1846, "step": 1993 }, { "epoch": 0.1, "grad_norm": 1.4550173029197349, "learning_rate": 1.9773788260405237e-05, "loss": 1.2935, "step": 1994 }, { "epoch": 0.1, "grad_norm": 1.3806620182380174, "learning_rate": 1.9773458675451406e-05, "loss": 1.292, "step": 1995 }, { "epoch": 0.1, "grad_norm": 1.2539535465453435, "learning_rate": 1.977312885332447e-05, "loss": 1.0999, "step": 1996 }, { "epoch": 0.1, "grad_norm": 1.1418369914818083, "learning_rate": 1.9772798794032445e-05, "loss": 1.3149, "step": 1997 }, { "epoch": 0.1, "grad_norm": 1.189015858024471, "learning_rate": 1.977246849758333e-05, "loss": 1.2783, "step": 1998 }, { "epoch": 0.1, "grad_norm": 1.0220195531797676, "learning_rate": 1.977213796398515e-05, "loss": 1.3462, "step": 1999 }, { "epoch": 0.1, "grad_norm": 1.2476356362494831, "learning_rate": 1.9771807193245913e-05, "loss": 1.3086, "step": 2000 }, { "epoch": 0.1, "grad_norm": 1.3285604339072905, "learning_rate": 1.9771476185373658e-05, "loss": 1.4009, "step": 2001 }, { "epoch": 0.1, "grad_norm": 1.2975221481531984, "learning_rate": 1.9771144940376413e-05, "loss": 1.3105, "step": 2002 }, { "epoch": 0.1, "grad_norm": 1.2816241879339203, "learning_rate": 1.9770813458262212e-05, "loss": 1.3433, "step": 2003 }, { "epoch": 0.1, "grad_norm": 0.6431789835457682, "learning_rate": 1.977048173903911e-05, "loss": 1.1978, "step": 2004 }, { "epoch": 0.1, "grad_norm": 0.6431789835457682, "learning_rate": 1.977048173903911e-05, "loss": 1.3042, "step": 2005 }, { "epoch": 0.1, "grad_norm": 0.6431789835457682, "learning_rate": 1.977048173903911e-05, "loss": 1.2979, "step": 2006 }, { "epoch": 0.1, "grad_norm": 1.0425169825151315, "learning_rate": 1.9770149782715143e-05, "loss": 1.1895, "step": 2007 }, { "epoch": 0.1, "grad_norm": 1.811741374945619, "learning_rate": 1.9769817589298377e-05, "loss": 1.4292, "step": 2008 }, { "epoch": 0.1, "grad_norm": 1.302940854013643, "learning_rate": 1.976948515879687e-05, "loss": 1.2368, "step": 2009 }, { "epoch": 0.1, "grad_norm": 1.2686137054636135, "learning_rate": 1.976915249121869e-05, "loss": 1.0947, "step": 2010 }, { "epoch": 0.1, "grad_norm": 1.5734766186908669, "learning_rate": 1.9768819586571907e-05, "loss": 1.1758, "step": 2011 }, { "epoch": 0.1, "grad_norm": 1.6118039212175372, "learning_rate": 1.9768486444864605e-05, "loss": 1.3184, "step": 2012 }, { "epoch": 0.1, "grad_norm": 1.1596293889916438, "learning_rate": 1.9768153066104863e-05, "loss": 1.1724, "step": 2013 }, { "epoch": 0.1, "grad_norm": 1.151504379768896, "learning_rate": 1.9767819450300773e-05, "loss": 1.1367, "step": 2014 }, { "epoch": 0.1, "grad_norm": 1.4823393060304144, "learning_rate": 1.976748559746043e-05, "loss": 1.332, "step": 2015 }, { "epoch": 0.1, "grad_norm": 1.375982025313618, "learning_rate": 1.9767151507591943e-05, "loss": 1.2598, "step": 2016 }, { "epoch": 0.1, "grad_norm": 1.1952637599835143, "learning_rate": 1.976681718070341e-05, "loss": 1.21, "step": 2017 }, { "epoch": 0.1, "grad_norm": 1.2192532735607047, "learning_rate": 1.9766482616802943e-05, "loss": 1.2207, "step": 2018 }, { "epoch": 0.1, "grad_norm": 1.1588166900258954, "learning_rate": 1.9766147815898668e-05, "loss": 1.4526, "step": 2019 }, { "epoch": 0.1, "grad_norm": 1.0178724967355894, "learning_rate": 1.9765812777998704e-05, "loss": 1.2188, "step": 2020 }, { "epoch": 0.1, "grad_norm": 1.058335571937333, "learning_rate": 1.9765477503111187e-05, "loss": 1.3188, "step": 2021 }, { "epoch": 0.1, "grad_norm": 1.11593542253497, "learning_rate": 1.976514199124425e-05, "loss": 1.2622, "step": 2022 }, { "epoch": 0.1, "grad_norm": 1.2694036694685196, "learning_rate": 1.9764806242406034e-05, "loss": 1.2954, "step": 2023 }, { "epoch": 0.1, "grad_norm": 1.2511534529361326, "learning_rate": 1.976447025660469e-05, "loss": 1.2246, "step": 2024 }, { "epoch": 0.1, "grad_norm": 1.3675042023071473, "learning_rate": 1.9764134033848367e-05, "loss": 1.3057, "step": 2025 }, { "epoch": 0.1, "grad_norm": 1.374546723449986, "learning_rate": 1.9763797574145227e-05, "loss": 1.1943, "step": 2026 }, { "epoch": 0.1, "grad_norm": 1.301712629317597, "learning_rate": 1.9763460877503435e-05, "loss": 1.3218, "step": 2027 }, { "epoch": 0.1, "grad_norm": 1.0436843628174695, "learning_rate": 1.976312394393116e-05, "loss": 1.3774, "step": 2028 }, { "epoch": 0.1, "grad_norm": 1.1615515825629406, "learning_rate": 1.976278677343658e-05, "loss": 1.1602, "step": 2029 }, { "epoch": 0.1, "grad_norm": 1.2740689357527017, "learning_rate": 1.9762449366027873e-05, "loss": 1.2324, "step": 2030 }, { "epoch": 0.1, "grad_norm": 1.0471733195665198, "learning_rate": 1.9762111721713232e-05, "loss": 1.2847, "step": 2031 }, { "epoch": 0.1, "grad_norm": 1.2869648584857225, "learning_rate": 1.976177384050085e-05, "loss": 1.208, "step": 2032 }, { "epoch": 0.1, "grad_norm": 0.8759124624504705, "learning_rate": 1.9761435722398926e-05, "loss": 1.2861, "step": 2033 }, { "epoch": 0.1, "grad_norm": 1.6012159367671621, "learning_rate": 1.9761097367415663e-05, "loss": 1.3315, "step": 2034 }, { "epoch": 0.1, "grad_norm": 1.4941478423089558, "learning_rate": 1.9760758775559275e-05, "loss": 1.1357, "step": 2035 }, { "epoch": 0.1, "grad_norm": 1.267835879913572, "learning_rate": 1.9760419946837972e-05, "loss": 1.165, "step": 2036 }, { "epoch": 0.1, "grad_norm": 1.327553436482191, "learning_rate": 1.9760080881259986e-05, "loss": 1.1343, "step": 2037 }, { "epoch": 0.1, "grad_norm": 1.0747600695097697, "learning_rate": 1.975974157883354e-05, "loss": 1.168, "step": 2038 }, { "epoch": 0.1, "grad_norm": 0.9535143422121894, "learning_rate": 1.9759402039566865e-05, "loss": 1.1592, "step": 2039 }, { "epoch": 0.1, "grad_norm": 1.2257212165477773, "learning_rate": 1.9759062263468207e-05, "loss": 1.2427, "step": 2040 }, { "epoch": 0.1, "grad_norm": 1.33870772525154, "learning_rate": 1.9758722250545808e-05, "loss": 1.033, "step": 2041 }, { "epoch": 0.1, "grad_norm": 1.2690613040827619, "learning_rate": 1.9758382000807915e-05, "loss": 1.2939, "step": 2042 }, { "epoch": 0.1, "grad_norm": 1.1847167988261655, "learning_rate": 1.9758041514262795e-05, "loss": 1.1997, "step": 2043 }, { "epoch": 0.1, "grad_norm": 1.327837899753317, "learning_rate": 1.9757700790918704e-05, "loss": 1.2539, "step": 2044 }, { "epoch": 0.1, "grad_norm": 0.8751618093928132, "learning_rate": 1.9757359830783907e-05, "loss": 1.3491, "step": 2045 }, { "epoch": 0.1, "grad_norm": 0.9442029754260975, "learning_rate": 1.9757018633866688e-05, "loss": 1.3022, "step": 2046 }, { "epoch": 0.1, "grad_norm": 1.4035365147878054, "learning_rate": 1.9756677200175316e-05, "loss": 1.063, "step": 2047 }, { "epoch": 0.1, "grad_norm": 1.8211198988476747, "learning_rate": 1.9756335529718086e-05, "loss": 1.25, "step": 2048 }, { "epoch": 0.1, "grad_norm": 1.3777729027049, "learning_rate": 1.9755993622503283e-05, "loss": 1.0942, "step": 2049 }, { "epoch": 0.1, "grad_norm": 0.8899834118655995, "learning_rate": 1.9755651478539203e-05, "loss": 1.106, "step": 2050 }, { "epoch": 0.1, "grad_norm": 1.2674975470000245, "learning_rate": 1.975530909783416e-05, "loss": 1.4248, "step": 2051 }, { "epoch": 0.1, "grad_norm": 1.3766201367701416, "learning_rate": 1.9754966480396447e-05, "loss": 1.2812, "step": 2052 }, { "epoch": 0.1, "grad_norm": 1.4859819454376624, "learning_rate": 1.9754623626234387e-05, "loss": 1.4546, "step": 2053 }, { "epoch": 0.1, "grad_norm": 1.5759141603448967, "learning_rate": 1.9754280535356302e-05, "loss": 1.2925, "step": 2054 }, { "epoch": 0.1, "grad_norm": 1.170172428054944, "learning_rate": 1.9753937207770513e-05, "loss": 1.1431, "step": 2055 }, { "epoch": 0.1, "grad_norm": 1.1480184458850897, "learning_rate": 1.975359364348535e-05, "loss": 1.2285, "step": 2056 }, { "epoch": 0.1, "grad_norm": 1.2025079683706563, "learning_rate": 1.975324984250916e-05, "loss": 1.2104, "step": 2057 }, { "epoch": 0.1, "grad_norm": 1.1901110596937803, "learning_rate": 1.9752905804850275e-05, "loss": 1.3276, "step": 2058 }, { "epoch": 0.1, "grad_norm": 0.9789933871269692, "learning_rate": 1.975256153051705e-05, "loss": 1.2852, "step": 2059 }, { "epoch": 0.1, "grad_norm": 1.2673149648134425, "learning_rate": 1.9752217019517838e-05, "loss": 1.0425, "step": 2060 }, { "epoch": 0.1, "grad_norm": 1.145225361730154, "learning_rate": 1.9751872271860998e-05, "loss": 1.1748, "step": 2061 }, { "epoch": 0.1, "grad_norm": 1.050211635967624, "learning_rate": 1.9751527287554898e-05, "loss": 1.1855, "step": 2062 }, { "epoch": 0.1, "grad_norm": 1.2586377001323827, "learning_rate": 1.975118206660791e-05, "loss": 1.376, "step": 2063 }, { "epoch": 0.1, "grad_norm": 1.6282943310107802, "learning_rate": 1.975083660902841e-05, "loss": 1.1802, "step": 2064 }, { "epoch": 0.1, "grad_norm": 1.3840946754986514, "learning_rate": 1.975049091482478e-05, "loss": 1.1899, "step": 2065 }, { "epoch": 0.1, "grad_norm": 1.2440375351921649, "learning_rate": 1.975014498400541e-05, "loss": 1.1987, "step": 2066 }, { "epoch": 0.1, "grad_norm": 1.258269264829545, "learning_rate": 1.97497988165787e-05, "loss": 1.1309, "step": 2067 }, { "epoch": 0.1, "grad_norm": 1.3391240575611765, "learning_rate": 1.974945241255304e-05, "loss": 1.3579, "step": 2068 }, { "epoch": 0.1, "grad_norm": 1.352244351802837, "learning_rate": 1.974910577193685e-05, "loss": 1.2871, "step": 2069 }, { "epoch": 0.1, "grad_norm": 1.1585402461726408, "learning_rate": 1.974875889473853e-05, "loss": 1.3091, "step": 2070 }, { "epoch": 0.1, "grad_norm": 1.3295266585758037, "learning_rate": 1.9748411780966497e-05, "loss": 1.2925, "step": 2071 }, { "epoch": 0.1, "grad_norm": 1.3470523494497024, "learning_rate": 1.9748064430629185e-05, "loss": 1.1895, "step": 2072 }, { "epoch": 0.1, "grad_norm": 1.4811851783744878, "learning_rate": 1.9747716843735015e-05, "loss": 1.1597, "step": 2073 }, { "epoch": 0.1, "grad_norm": 1.136876600197557, "learning_rate": 1.9747369020292424e-05, "loss": 1.2231, "step": 2074 }, { "epoch": 0.1, "grad_norm": 1.4512039065338111, "learning_rate": 1.9747020960309852e-05, "loss": 1.3145, "step": 2075 }, { "epoch": 0.1, "grad_norm": 1.3581551911941208, "learning_rate": 1.9746672663795748e-05, "loss": 1.25, "step": 2076 }, { "epoch": 0.1, "grad_norm": 1.4253364179785786, "learning_rate": 1.9746324130758563e-05, "loss": 1.4414, "step": 2077 }, { "epoch": 0.1, "grad_norm": 1.3165962230892718, "learning_rate": 1.9745975361206753e-05, "loss": 1.2324, "step": 2078 }, { "epoch": 0.1, "grad_norm": 1.133620162564484, "learning_rate": 1.9745626355148782e-05, "loss": 1.2598, "step": 2079 }, { "epoch": 0.1, "grad_norm": 1.2880082756464366, "learning_rate": 1.974527711259312e-05, "loss": 1.2729, "step": 2080 }, { "epoch": 0.1, "grad_norm": 1.304017640875334, "learning_rate": 1.9744927633548246e-05, "loss": 1.3809, "step": 2081 }, { "epoch": 0.1, "grad_norm": 1.246161601012671, "learning_rate": 1.9744577918022637e-05, "loss": 1.1362, "step": 2082 }, { "epoch": 0.1, "grad_norm": 1.6868296538718328, "learning_rate": 1.9744227966024776e-05, "loss": 1.3516, "step": 2083 }, { "epoch": 0.1, "grad_norm": 1.2192448983173112, "learning_rate": 1.974387777756316e-05, "loss": 1.3057, "step": 2084 }, { "epoch": 0.1, "grad_norm": 1.4770540258135623, "learning_rate": 1.9743527352646286e-05, "loss": 1.1897, "step": 2085 }, { "epoch": 0.1, "grad_norm": 1.3566882731581809, "learning_rate": 1.974317669128266e-05, "loss": 1.3057, "step": 2086 }, { "epoch": 0.1, "grad_norm": 1.40642642558953, "learning_rate": 1.9742825793480785e-05, "loss": 1.1465, "step": 2087 }, { "epoch": 0.1, "grad_norm": 1.3383711792757, "learning_rate": 1.9742474659249183e-05, "loss": 1.1821, "step": 2088 }, { "epoch": 0.1, "grad_norm": 1.8507926201850724, "learning_rate": 1.9742123288596375e-05, "loss": 1.3882, "step": 2089 }, { "epoch": 0.1, "grad_norm": 0.937094559559786, "learning_rate": 1.974177168153088e-05, "loss": 1.2422, "step": 2090 }, { "epoch": 0.1, "grad_norm": 1.5027441288268308, "learning_rate": 1.974141983806124e-05, "loss": 1.1851, "step": 2091 }, { "epoch": 0.1, "grad_norm": 1.3965964756986515, "learning_rate": 1.974106775819599e-05, "loss": 1.2402, "step": 2092 }, { "epoch": 0.1, "grad_norm": 1.5253514816118088, "learning_rate": 1.974071544194367e-05, "loss": 1.1699, "step": 2093 }, { "epoch": 0.1, "grad_norm": 1.1816931807600346, "learning_rate": 1.9740362889312835e-05, "loss": 1.1284, "step": 2094 }, { "epoch": 0.1, "grad_norm": 1.6299425479368872, "learning_rate": 1.9740010100312036e-05, "loss": 1.4028, "step": 2095 }, { "epoch": 0.1, "grad_norm": 1.3689840410137184, "learning_rate": 1.9739657074949835e-05, "loss": 1.2056, "step": 2096 }, { "epoch": 0.1, "grad_norm": 1.292208207739903, "learning_rate": 1.9739303813234802e-05, "loss": 1.2471, "step": 2097 }, { "epoch": 0.1, "grad_norm": 1.151404044502586, "learning_rate": 1.973895031517551e-05, "loss": 1.2822, "step": 2098 }, { "epoch": 0.1, "grad_norm": 1.3827946539761797, "learning_rate": 1.973859658078053e-05, "loss": 1.2808, "step": 2099 }, { "epoch": 0.1, "grad_norm": 1.2385556342990627, "learning_rate": 1.9738242610058457e-05, "loss": 1.1689, "step": 2100 }, { "epoch": 0.1, "grad_norm": 1.2421587436161778, "learning_rate": 1.973788840301787e-05, "loss": 1.0898, "step": 2101 }, { "epoch": 0.1, "grad_norm": 0.9548569947090044, "learning_rate": 1.9737533959667373e-05, "loss": 1.228, "step": 2102 }, { "epoch": 0.1, "grad_norm": 1.3207046958723425, "learning_rate": 1.973717928001556e-05, "loss": 1.2935, "step": 2103 }, { "epoch": 0.1, "grad_norm": 1.2168872154894168, "learning_rate": 1.9736824364071047e-05, "loss": 1.1958, "step": 2104 }, { "epoch": 0.1, "grad_norm": 0.9753675355316361, "learning_rate": 1.9736469211842437e-05, "loss": 1.1328, "step": 2105 }, { "epoch": 0.1, "grad_norm": 1.2892610595489347, "learning_rate": 1.9736113823338357e-05, "loss": 1.0649, "step": 2106 }, { "epoch": 0.1, "grad_norm": 1.2332679278406518, "learning_rate": 1.9735758198567427e-05, "loss": 1.1318, "step": 2107 }, { "epoch": 0.1, "grad_norm": 1.1141534225552783, "learning_rate": 1.9735402337538275e-05, "loss": 1.2563, "step": 2108 }, { "epoch": 0.1, "grad_norm": 1.3548231220713212, "learning_rate": 1.973504624025954e-05, "loss": 1.293, "step": 2109 }, { "epoch": 0.1, "grad_norm": 1.2570493791585278, "learning_rate": 1.9734689906739862e-05, "loss": 1.1519, "step": 2110 }, { "epoch": 0.1, "grad_norm": 1.4854070157850352, "learning_rate": 1.9734333336987886e-05, "loss": 1.2329, "step": 2111 }, { "epoch": 0.1, "grad_norm": 1.403713357679581, "learning_rate": 1.9733976531012272e-05, "loss": 1.3428, "step": 2112 }, { "epoch": 0.1, "grad_norm": 1.1337088642976172, "learning_rate": 1.9733619488821673e-05, "loss": 1.3022, "step": 2113 }, { "epoch": 0.1, "grad_norm": 1.0579583369122714, "learning_rate": 1.9733262210424752e-05, "loss": 1.0232, "step": 2114 }, { "epoch": 0.1, "grad_norm": 1.273359711884571, "learning_rate": 1.9732904695830186e-05, "loss": 1.2739, "step": 2115 }, { "epoch": 0.1, "grad_norm": 1.338082259003379, "learning_rate": 1.9732546945046643e-05, "loss": 1.3257, "step": 2116 }, { "epoch": 0.1, "grad_norm": 1.2832025327284362, "learning_rate": 1.9732188958082813e-05, "loss": 1.2368, "step": 2117 }, { "epoch": 0.1, "grad_norm": 1.6326102404701082, "learning_rate": 1.9731830734947373e-05, "loss": 0.9412, "step": 2118 }, { "epoch": 0.1, "grad_norm": 1.2148110510365433, "learning_rate": 1.9731472275649023e-05, "loss": 1.2104, "step": 2119 }, { "epoch": 0.1, "grad_norm": 1.0166740413692146, "learning_rate": 1.9731113580196458e-05, "loss": 1.2202, "step": 2120 }, { "epoch": 0.1, "grad_norm": 1.9341972562232586, "learning_rate": 1.9730754648598388e-05, "loss": 1.2808, "step": 2121 }, { "epoch": 0.1, "grad_norm": 1.3533844202073724, "learning_rate": 1.973039548086352e-05, "loss": 1.1768, "step": 2122 }, { "epoch": 0.1, "grad_norm": 1.4614785310324496, "learning_rate": 1.9730036077000568e-05, "loss": 1.3877, "step": 2123 }, { "epoch": 0.1, "grad_norm": 1.0374640083242765, "learning_rate": 1.9729676437018256e-05, "loss": 1.1162, "step": 2124 }, { "epoch": 0.1, "grad_norm": 1.2556881264038768, "learning_rate": 1.972931656092531e-05, "loss": 1.2446, "step": 2125 }, { "epoch": 0.1, "grad_norm": 1.4977403772459381, "learning_rate": 1.9728956448730466e-05, "loss": 1.1123, "step": 2126 }, { "epoch": 0.1, "grad_norm": 1.2958075708469714, "learning_rate": 1.972859610044246e-05, "loss": 1.2554, "step": 2127 }, { "epoch": 0.1, "grad_norm": 1.5050797903707775, "learning_rate": 1.9728235516070037e-05, "loss": 1.1829, "step": 2128 }, { "epoch": 0.1, "grad_norm": 1.4070360104616044, "learning_rate": 1.9727874695621946e-05, "loss": 1.3149, "step": 2129 }, { "epoch": 0.1, "grad_norm": 1.2756179722337815, "learning_rate": 1.9727513639106948e-05, "loss": 1.2456, "step": 2130 }, { "epoch": 0.1, "grad_norm": 1.1904486641369034, "learning_rate": 1.9727152346533798e-05, "loss": 1.2983, "step": 2131 }, { "epoch": 0.1, "grad_norm": 1.3542080249190152, "learning_rate": 1.972679081791127e-05, "loss": 1.395, "step": 2132 }, { "epoch": 0.1, "grad_norm": 1.2319622520050943, "learning_rate": 1.972642905324813e-05, "loss": 1.249, "step": 2133 }, { "epoch": 0.1, "grad_norm": 1.3157071911880212, "learning_rate": 1.9726067052553167e-05, "loss": 1.3086, "step": 2134 }, { "epoch": 0.1, "grad_norm": 1.5315780072687646, "learning_rate": 1.9725704815835156e-05, "loss": 1.3633, "step": 2135 }, { "epoch": 0.1, "grad_norm": 1.2300932964116849, "learning_rate": 1.9725342343102897e-05, "loss": 1.2749, "step": 2136 }, { "epoch": 0.1, "grad_norm": 0.9929257144150984, "learning_rate": 1.9724979634365174e-05, "loss": 1.3149, "step": 2137 }, { "epoch": 0.1, "grad_norm": 1.4011339791909128, "learning_rate": 1.9724616689630798e-05, "loss": 1.1426, "step": 2138 }, { "epoch": 0.1, "grad_norm": 1.4323935656314877, "learning_rate": 1.9724253508908574e-05, "loss": 1.3105, "step": 2139 }, { "epoch": 0.1, "grad_norm": 1.0584622995563442, "learning_rate": 1.9723890092207317e-05, "loss": 1.2896, "step": 2140 }, { "epoch": 0.1, "grad_norm": 1.2755369057107113, "learning_rate": 1.972352643953584e-05, "loss": 1.2412, "step": 2141 }, { "epoch": 0.1, "grad_norm": 1.2500332080010053, "learning_rate": 1.9723162550902975e-05, "loss": 1.0994, "step": 2142 }, { "epoch": 0.1, "grad_norm": 1.25418154486793, "learning_rate": 1.9722798426317552e-05, "loss": 1.2861, "step": 2143 }, { "epoch": 0.1, "grad_norm": 1.4625784062922456, "learning_rate": 1.9722434065788398e-05, "loss": 1.2549, "step": 2144 }, { "epoch": 0.1, "grad_norm": 1.48333333155445, "learning_rate": 1.972206946932437e-05, "loss": 1.3779, "step": 2145 }, { "epoch": 0.1, "grad_norm": 1.3964587729054958, "learning_rate": 1.97217046369343e-05, "loss": 1.1909, "step": 2146 }, { "epoch": 0.1, "grad_norm": 1.191622960455367, "learning_rate": 1.9721339568627055e-05, "loss": 1.27, "step": 2147 }, { "epoch": 0.1, "grad_norm": 1.2775699739197994, "learning_rate": 1.9720974264411484e-05, "loss": 1.2368, "step": 2148 }, { "epoch": 0.1, "grad_norm": 1.4304155257018925, "learning_rate": 1.972060872429646e-05, "loss": 1.3389, "step": 2149 }, { "epoch": 0.1, "grad_norm": 1.3215539046233244, "learning_rate": 1.9720242948290847e-05, "loss": 1.2529, "step": 2150 }, { "epoch": 0.1, "grad_norm": 1.329016274850806, "learning_rate": 1.9719876936403524e-05, "loss": 0.9905, "step": 2151 }, { "epoch": 0.1, "grad_norm": 1.1632314250435223, "learning_rate": 1.9719510688643374e-05, "loss": 1.1396, "step": 2152 }, { "epoch": 0.1, "grad_norm": 1.1810842936387398, "learning_rate": 1.971914420501928e-05, "loss": 1.2329, "step": 2153 }, { "epoch": 0.1, "grad_norm": 1.2491477077365487, "learning_rate": 1.9718777485540145e-05, "loss": 0.9775, "step": 2154 }, { "epoch": 0.1, "grad_norm": 1.4565873287129338, "learning_rate": 1.9718410530214857e-05, "loss": 1.3506, "step": 2155 }, { "epoch": 0.1, "grad_norm": 1.0582235453189632, "learning_rate": 1.971804333905233e-05, "loss": 1.1216, "step": 2156 }, { "epoch": 0.1, "grad_norm": 0.8193773551486896, "learning_rate": 1.971767591206147e-05, "loss": 1.1626, "step": 2157 }, { "epoch": 0.1, "grad_norm": 1.2243326529487073, "learning_rate": 1.971730824925119e-05, "loss": 1.2705, "step": 2158 }, { "epoch": 0.1, "grad_norm": 1.3928725035388059, "learning_rate": 1.9716940350630424e-05, "loss": 1.2793, "step": 2159 }, { "epoch": 0.1, "grad_norm": 1.2580838735793547, "learning_rate": 1.9716572216208084e-05, "loss": 1.2393, "step": 2160 }, { "epoch": 0.1, "grad_norm": 1.3763429441908046, "learning_rate": 1.9716203845993117e-05, "loss": 1.168, "step": 2161 }, { "epoch": 0.1, "grad_norm": 1.3131380484082913, "learning_rate": 1.9715835239994457e-05, "loss": 1.2876, "step": 2162 }, { "epoch": 0.1, "grad_norm": 1.2889133253102536, "learning_rate": 1.971546639822105e-05, "loss": 1.2837, "step": 2163 }, { "epoch": 0.1, "grad_norm": 1.322283944970901, "learning_rate": 1.971509732068184e-05, "loss": 1.4043, "step": 2164 }, { "epoch": 0.1, "grad_norm": 1.0803193461755334, "learning_rate": 1.9714728007385795e-05, "loss": 1.2217, "step": 2165 }, { "epoch": 0.1, "grad_norm": 1.3071896599789303, "learning_rate": 1.9714358458341868e-05, "loss": 1.1089, "step": 2166 }, { "epoch": 0.1, "grad_norm": 0.8966287410798779, "learning_rate": 1.971398867355903e-05, "loss": 1.1191, "step": 2167 }, { "epoch": 0.1, "grad_norm": 1.4911874698166165, "learning_rate": 1.9713618653046258e-05, "loss": 1.2568, "step": 2168 }, { "epoch": 0.1, "grad_norm": 0.9340372935958634, "learning_rate": 1.9713248396812524e-05, "loss": 1.3438, "step": 2169 }, { "epoch": 0.1, "grad_norm": 1.1168161803944012, "learning_rate": 1.9712877904866818e-05, "loss": 1.3057, "step": 2170 }, { "epoch": 0.1, "grad_norm": 1.3448838702239219, "learning_rate": 1.971250717721813e-05, "loss": 1.251, "step": 2171 }, { "epoch": 0.1, "grad_norm": 1.40966509736875, "learning_rate": 1.971213621387546e-05, "loss": 1.3921, "step": 2172 }, { "epoch": 0.1, "grad_norm": 1.172476475615654, "learning_rate": 1.9711765014847804e-05, "loss": 1.3052, "step": 2173 }, { "epoch": 0.1, "grad_norm": 0.9991115048866612, "learning_rate": 1.9711393580144168e-05, "loss": 1.2124, "step": 2174 }, { "epoch": 0.1, "grad_norm": 1.0677529325172748, "learning_rate": 1.9711021909773574e-05, "loss": 1.1143, "step": 2175 }, { "epoch": 0.1, "grad_norm": 1.310085742242461, "learning_rate": 1.971065000374504e-05, "loss": 1.3862, "step": 2176 }, { "epoch": 0.1, "grad_norm": 1.4581366870295722, "learning_rate": 1.9710277862067583e-05, "loss": 1.2397, "step": 2177 }, { "epoch": 0.1, "grad_norm": 1.0506712722155953, "learning_rate": 1.9709905484750243e-05, "loss": 1.2871, "step": 2178 }, { "epoch": 0.1, "grad_norm": 1.050543208369896, "learning_rate": 1.970953287180205e-05, "loss": 1.0964, "step": 2179 }, { "epoch": 0.1, "grad_norm": 1.5287673107439805, "learning_rate": 1.9709160023232052e-05, "loss": 1.3682, "step": 2180 }, { "epoch": 0.1, "grad_norm": 1.4431057675230472, "learning_rate": 1.9708786939049292e-05, "loss": 1.2622, "step": 2181 }, { "epoch": 0.1, "grad_norm": 1.1044629381037343, "learning_rate": 1.9708413619262825e-05, "loss": 1.2661, "step": 2182 }, { "epoch": 0.1, "grad_norm": 1.3391312731493032, "learning_rate": 1.970804006388171e-05, "loss": 1.2534, "step": 2183 }, { "epoch": 0.11, "grad_norm": 1.2437614938960124, "learning_rate": 1.9707666272915016e-05, "loss": 1.0361, "step": 2184 }, { "epoch": 0.11, "grad_norm": 1.2731024331142595, "learning_rate": 1.970729224637181e-05, "loss": 1.2246, "step": 2185 }, { "epoch": 0.11, "grad_norm": 1.1721838975771055, "learning_rate": 1.9706917984261168e-05, "loss": 1.2979, "step": 2186 }, { "epoch": 0.11, "grad_norm": 1.1374349820330243, "learning_rate": 1.9706543486592174e-05, "loss": 1.207, "step": 2187 }, { "epoch": 0.11, "grad_norm": 1.258722432133754, "learning_rate": 1.9706168753373912e-05, "loss": 1.1509, "step": 2188 }, { "epoch": 0.11, "grad_norm": 1.3466829680110652, "learning_rate": 1.9705793784615487e-05, "loss": 1.3887, "step": 2189 }, { "epoch": 0.11, "grad_norm": 1.1578238661773583, "learning_rate": 1.9705418580325984e-05, "loss": 1.2056, "step": 2190 }, { "epoch": 0.11, "grad_norm": 1.34854711097351, "learning_rate": 1.970504314051452e-05, "loss": 1.3804, "step": 2191 }, { "epoch": 0.11, "grad_norm": 1.1965515210477877, "learning_rate": 1.9704667465190194e-05, "loss": 1.2246, "step": 2192 }, { "epoch": 0.11, "grad_norm": 1.4211826049396064, "learning_rate": 1.970429155436213e-05, "loss": 1.2729, "step": 2193 }, { "epoch": 0.11, "grad_norm": 1.1721021305457244, "learning_rate": 1.9703915408039454e-05, "loss": 1.0693, "step": 2194 }, { "epoch": 0.11, "grad_norm": 1.251026974392511, "learning_rate": 1.9703539026231288e-05, "loss": 1.1841, "step": 2195 }, { "epoch": 0.11, "grad_norm": 0.9687360002657329, "learning_rate": 1.9703162408946766e-05, "loss": 1.2485, "step": 2196 }, { "epoch": 0.11, "grad_norm": 1.492166201903374, "learning_rate": 1.9702785556195026e-05, "loss": 1.2319, "step": 2197 }, { "epoch": 0.11, "grad_norm": 1.333167487981378, "learning_rate": 1.970240846798522e-05, "loss": 1.3213, "step": 2198 }, { "epoch": 0.11, "grad_norm": 1.2279493293742691, "learning_rate": 1.970203114432649e-05, "loss": 1.2485, "step": 2199 }, { "epoch": 0.11, "grad_norm": 1.3160159940988119, "learning_rate": 1.9701653585228e-05, "loss": 1.2524, "step": 2200 }, { "epoch": 0.11, "grad_norm": 1.3336733004274464, "learning_rate": 1.9701275790698906e-05, "loss": 1.3032, "step": 2201 }, { "epoch": 0.11, "grad_norm": 1.1803000509169097, "learning_rate": 1.9700897760748382e-05, "loss": 1.208, "step": 2202 }, { "epoch": 0.11, "grad_norm": 1.0330360356642547, "learning_rate": 1.9700519495385597e-05, "loss": 1.2109, "step": 2203 }, { "epoch": 0.11, "grad_norm": 1.1054502766236436, "learning_rate": 1.970014099461973e-05, "loss": 1.1831, "step": 2204 }, { "epoch": 0.11, "grad_norm": 1.2300499083637746, "learning_rate": 1.969976225845997e-05, "loss": 1.2793, "step": 2205 }, { "epoch": 0.11, "grad_norm": 1.3317171863311308, "learning_rate": 1.969938328691551e-05, "loss": 1.4126, "step": 2206 }, { "epoch": 0.11, "grad_norm": 1.192453554819283, "learning_rate": 1.969900407999554e-05, "loss": 1.3057, "step": 2207 }, { "epoch": 0.11, "grad_norm": 1.3720659545412983, "learning_rate": 1.9698624637709263e-05, "loss": 1.2612, "step": 2208 }, { "epoch": 0.11, "grad_norm": 1.3099077790402383, "learning_rate": 1.969824496006589e-05, "loss": 1.1592, "step": 2209 }, { "epoch": 0.11, "grad_norm": 1.0654351700601843, "learning_rate": 1.9697865047074633e-05, "loss": 1.2603, "step": 2210 }, { "epoch": 0.11, "grad_norm": 1.1273138524777162, "learning_rate": 1.969748489874471e-05, "loss": 1.2202, "step": 2211 }, { "epoch": 0.11, "grad_norm": 1.459347742876156, "learning_rate": 1.969710451508535e-05, "loss": 1.3569, "step": 2212 }, { "epoch": 0.11, "grad_norm": 1.2605064193056073, "learning_rate": 1.9696723896105778e-05, "loss": 1.2285, "step": 2213 }, { "epoch": 0.11, "grad_norm": 1.3877413518052322, "learning_rate": 1.969634304181524e-05, "loss": 1.1885, "step": 2214 }, { "epoch": 0.11, "grad_norm": 1.236894217568725, "learning_rate": 1.9695961952222966e-05, "loss": 1.2544, "step": 2215 }, { "epoch": 0.11, "grad_norm": 1.544198763844224, "learning_rate": 1.9695580627338212e-05, "loss": 1.3823, "step": 2216 }, { "epoch": 0.11, "grad_norm": 1.5777650148308828, "learning_rate": 1.9695199067170236e-05, "loss": 1.2754, "step": 2217 }, { "epoch": 0.11, "grad_norm": 1.3059067871906982, "learning_rate": 1.9694817271728284e-05, "loss": 1.1226, "step": 2218 }, { "epoch": 0.11, "grad_norm": 1.3356583121424808, "learning_rate": 1.969443524102163e-05, "loss": 1.3682, "step": 2219 }, { "epoch": 0.11, "grad_norm": 1.220787371217912, "learning_rate": 1.9694052975059545e-05, "loss": 1.2158, "step": 2220 }, { "epoch": 0.11, "grad_norm": 1.0028701773898565, "learning_rate": 1.96936704738513e-05, "loss": 1.2271, "step": 2221 }, { "epoch": 0.11, "grad_norm": 1.7863584418465375, "learning_rate": 1.9693287737406183e-05, "loss": 1.4067, "step": 2222 }, { "epoch": 0.11, "grad_norm": 1.396771746324201, "learning_rate": 1.969290476573348e-05, "loss": 1.2817, "step": 2223 }, { "epoch": 0.11, "grad_norm": 1.3989268673438904, "learning_rate": 1.9692521558842485e-05, "loss": 1.2993, "step": 2224 }, { "epoch": 0.11, "grad_norm": 1.3431949081427952, "learning_rate": 1.9692138116742494e-05, "loss": 1.2207, "step": 2225 }, { "epoch": 0.11, "grad_norm": 1.282804624528867, "learning_rate": 1.9691754439442815e-05, "loss": 1.3584, "step": 2226 }, { "epoch": 0.11, "grad_norm": 1.355513042202185, "learning_rate": 1.9691370526952756e-05, "loss": 1.2163, "step": 2227 }, { "epoch": 0.11, "grad_norm": 1.5888231601312783, "learning_rate": 1.969098637928164e-05, "loss": 1.188, "step": 2228 }, { "epoch": 0.11, "grad_norm": 1.0706681794545767, "learning_rate": 1.9690601996438782e-05, "loss": 1.2349, "step": 2229 }, { "epoch": 0.11, "grad_norm": 1.4565796088073892, "learning_rate": 1.9690217378433514e-05, "loss": 1.3081, "step": 2230 }, { "epoch": 0.11, "grad_norm": 1.9345458699031222, "learning_rate": 1.9689832525275166e-05, "loss": 1.2964, "step": 2231 }, { "epoch": 0.11, "grad_norm": 1.4622029446393785, "learning_rate": 1.9689447436973083e-05, "loss": 1.311, "step": 2232 }, { "epoch": 0.11, "grad_norm": 1.300267452040358, "learning_rate": 1.9689062113536605e-05, "loss": 1.27, "step": 2233 }, { "epoch": 0.11, "grad_norm": 1.4333126415570174, "learning_rate": 1.968867655497508e-05, "loss": 1.1792, "step": 2234 }, { "epoch": 0.11, "grad_norm": 1.2399378047399128, "learning_rate": 1.9688290761297873e-05, "loss": 1.0454, "step": 2235 }, { "epoch": 0.11, "grad_norm": 1.274317290210688, "learning_rate": 1.968790473251434e-05, "loss": 1.1343, "step": 2236 }, { "epoch": 0.11, "grad_norm": 1.224776227250419, "learning_rate": 1.968751846863385e-05, "loss": 1.2456, "step": 2237 }, { "epoch": 0.11, "grad_norm": 1.111936081119711, "learning_rate": 1.968713196966578e-05, "loss": 1.2217, "step": 2238 }, { "epoch": 0.11, "grad_norm": 1.5881267740295038, "learning_rate": 1.96867452356195e-05, "loss": 1.1489, "step": 2239 }, { "epoch": 0.11, "grad_norm": 1.1864456119450237, "learning_rate": 1.9686358266504406e-05, "loss": 1.1997, "step": 2240 }, { "epoch": 0.11, "grad_norm": 1.1976903061205968, "learning_rate": 1.968597106232988e-05, "loss": 1.1089, "step": 2241 }, { "epoch": 0.11, "grad_norm": 1.0909978013744854, "learning_rate": 1.968558362310532e-05, "loss": 1.0474, "step": 2242 }, { "epoch": 0.11, "grad_norm": 1.3430512688099636, "learning_rate": 1.9685195948840137e-05, "loss": 1.248, "step": 2243 }, { "epoch": 0.11, "grad_norm": 1.4201138009128356, "learning_rate": 1.9684808039543727e-05, "loss": 1.2437, "step": 2244 }, { "epoch": 0.11, "grad_norm": 1.048907793099642, "learning_rate": 1.9684419895225506e-05, "loss": 1.2402, "step": 2245 }, { "epoch": 0.11, "grad_norm": 0.888413731853847, "learning_rate": 1.9684031515894898e-05, "loss": 1.1821, "step": 2246 }, { "epoch": 0.11, "grad_norm": 1.0960299248353569, "learning_rate": 1.9683642901561324e-05, "loss": 1.1504, "step": 2247 }, { "epoch": 0.11, "grad_norm": 1.1551932889225376, "learning_rate": 1.9683254052234217e-05, "loss": 1.1367, "step": 2248 }, { "epoch": 0.11, "grad_norm": 1.5400012488065769, "learning_rate": 1.9682864967923006e-05, "loss": 1.4224, "step": 2249 }, { "epoch": 0.11, "grad_norm": 1.3900434985216465, "learning_rate": 1.9682475648637145e-05, "loss": 1.2051, "step": 2250 }, { "epoch": 0.11, "grad_norm": 1.3674639054540176, "learning_rate": 1.968208609438607e-05, "loss": 1.1685, "step": 2251 }, { "epoch": 0.11, "grad_norm": 0.9724746960949947, "learning_rate": 1.9681696305179243e-05, "loss": 1.0908, "step": 2252 }, { "epoch": 0.11, "grad_norm": 1.361262627046411, "learning_rate": 1.9681306281026122e-05, "loss": 1.353, "step": 2253 }, { "epoch": 0.11, "grad_norm": 1.2299752503888868, "learning_rate": 1.968091602193616e-05, "loss": 1.1709, "step": 2254 }, { "epoch": 0.11, "grad_norm": 1.5067574226829261, "learning_rate": 1.9680525527918845e-05, "loss": 1.3696, "step": 2255 }, { "epoch": 0.11, "grad_norm": 1.3628436134244943, "learning_rate": 1.9680134798983642e-05, "loss": 1.3276, "step": 2256 }, { "epoch": 0.11, "grad_norm": 1.2824537324973393, "learning_rate": 1.9679743835140037e-05, "loss": 1.2139, "step": 2257 }, { "epoch": 0.11, "grad_norm": 0.9763746179502948, "learning_rate": 1.967935263639752e-05, "loss": 1.146, "step": 2258 }, { "epoch": 0.11, "grad_norm": 1.559862922004395, "learning_rate": 1.9678961202765572e-05, "loss": 1.4263, "step": 2259 }, { "epoch": 0.11, "grad_norm": 1.28215842267662, "learning_rate": 1.9678569534253706e-05, "loss": 1.2979, "step": 2260 }, { "epoch": 0.11, "grad_norm": 1.2817655063508682, "learning_rate": 1.9678177630871418e-05, "loss": 1.0942, "step": 2261 }, { "epoch": 0.11, "grad_norm": 1.3553024078036955, "learning_rate": 1.967778549262822e-05, "loss": 1.2251, "step": 2262 }, { "epoch": 0.11, "grad_norm": 1.2614573093453105, "learning_rate": 1.967739311953363e-05, "loss": 1.2983, "step": 2263 }, { "epoch": 0.11, "grad_norm": 1.4965479599135652, "learning_rate": 1.9677000511597175e-05, "loss": 1.25, "step": 2264 }, { "epoch": 0.11, "grad_norm": 1.4580527290430283, "learning_rate": 1.9676607668828372e-05, "loss": 1.2148, "step": 2265 }, { "epoch": 0.11, "grad_norm": 0.8617660615333492, "learning_rate": 1.967621459123676e-05, "loss": 1.3057, "step": 2266 }, { "epoch": 0.11, "grad_norm": 1.2357334569397314, "learning_rate": 1.9675821278831875e-05, "loss": 1.0193, "step": 2267 }, { "epoch": 0.11, "grad_norm": 1.537265712736465, "learning_rate": 1.9675427731623267e-05, "loss": 1.2319, "step": 2268 }, { "epoch": 0.11, "grad_norm": 1.189566844670059, "learning_rate": 1.967503394962048e-05, "loss": 1.0479, "step": 2269 }, { "epoch": 0.11, "grad_norm": 1.2242446437831958, "learning_rate": 1.9674639932833068e-05, "loss": 1.3169, "step": 2270 }, { "epoch": 0.11, "grad_norm": 1.518469922649539, "learning_rate": 1.9674245681270604e-05, "loss": 1.2017, "step": 2271 }, { "epoch": 0.11, "grad_norm": 1.2319310365549312, "learning_rate": 1.9673851194942645e-05, "loss": 1.3271, "step": 2272 }, { "epoch": 0.11, "grad_norm": 1.3541197604875195, "learning_rate": 1.9673456473858766e-05, "loss": 1.3716, "step": 2273 }, { "epoch": 0.11, "grad_norm": 1.4181693084536233, "learning_rate": 1.967306151802855e-05, "loss": 1.252, "step": 2274 }, { "epoch": 0.11, "grad_norm": 1.2591792338988002, "learning_rate": 1.9672666327461575e-05, "loss": 1.3774, "step": 2275 }, { "epoch": 0.11, "grad_norm": 1.5235686649094224, "learning_rate": 1.967227090216744e-05, "loss": 1.188, "step": 2276 }, { "epoch": 0.11, "grad_norm": 1.2435096529440157, "learning_rate": 1.967187524215573e-05, "loss": 1.0249, "step": 2277 }, { "epoch": 0.11, "grad_norm": 1.182257214397943, "learning_rate": 1.967147934743605e-05, "loss": 1.2178, "step": 2278 }, { "epoch": 0.11, "grad_norm": 1.61175548863758, "learning_rate": 1.9671083218018016e-05, "loss": 1.4038, "step": 2279 }, { "epoch": 0.11, "grad_norm": 1.1246976081152869, "learning_rate": 1.967068685391123e-05, "loss": 1.4409, "step": 2280 }, { "epoch": 0.11, "grad_norm": 1.3380067883932696, "learning_rate": 1.9670290255125314e-05, "loss": 1.2622, "step": 2281 }, { "epoch": 0.11, "grad_norm": 1.0116006149703554, "learning_rate": 1.9669893421669895e-05, "loss": 1.2983, "step": 2282 }, { "epoch": 0.11, "grad_norm": 1.42973602616009, "learning_rate": 1.96694963535546e-05, "loss": 1.3164, "step": 2283 }, { "epoch": 0.11, "grad_norm": 1.3509573256508152, "learning_rate": 1.9669099050789063e-05, "loss": 1.4326, "step": 2284 }, { "epoch": 0.11, "grad_norm": 1.2826141493545928, "learning_rate": 1.966870151338293e-05, "loss": 1.1489, "step": 2285 }, { "epoch": 0.11, "grad_norm": 1.6025426715316093, "learning_rate": 1.9668303741345845e-05, "loss": 1.3042, "step": 2286 }, { "epoch": 0.11, "grad_norm": 1.4540991916132446, "learning_rate": 1.9667905734687463e-05, "loss": 1.3066, "step": 2287 }, { "epoch": 0.11, "grad_norm": 1.206949412339831, "learning_rate": 1.966750749341744e-05, "loss": 1.2207, "step": 2288 }, { "epoch": 0.11, "grad_norm": 1.1944772778344892, "learning_rate": 1.9667109017545442e-05, "loss": 1.0554, "step": 2289 }, { "epoch": 0.11, "grad_norm": 1.2391235023992146, "learning_rate": 1.966671030708114e-05, "loss": 1.23, "step": 2290 }, { "epoch": 0.11, "grad_norm": 1.215193892088057, "learning_rate": 1.9666311362034203e-05, "loss": 0.9995, "step": 2291 }, { "epoch": 0.11, "grad_norm": 1.0536926179477066, "learning_rate": 1.966591218241432e-05, "loss": 1.1626, "step": 2292 }, { "epoch": 0.11, "grad_norm": 1.3804302271479423, "learning_rate": 1.9665512768231175e-05, "loss": 1.25, "step": 2293 }, { "epoch": 0.11, "grad_norm": 1.6398853729958296, "learning_rate": 1.966511311949446e-05, "loss": 1.2319, "step": 2294 }, { "epoch": 0.11, "grad_norm": 1.2827130434175404, "learning_rate": 1.966471323621387e-05, "loss": 1.2666, "step": 2295 }, { "epoch": 0.11, "grad_norm": 1.5117877906942554, "learning_rate": 1.966431311839912e-05, "loss": 1.2822, "step": 2296 }, { "epoch": 0.11, "grad_norm": 1.317221441650805, "learning_rate": 1.9663912766059904e-05, "loss": 1.2891, "step": 2297 }, { "epoch": 0.11, "grad_norm": 0.9899135406090642, "learning_rate": 1.966351217920595e-05, "loss": 1.3975, "step": 2298 }, { "epoch": 0.11, "grad_norm": 1.3283649830814415, "learning_rate": 1.9663111357846976e-05, "loss": 1.2109, "step": 2299 }, { "epoch": 0.11, "grad_norm": 1.09722163379172, "learning_rate": 1.9662710301992705e-05, "loss": 1.2212, "step": 2300 }, { "epoch": 0.11, "grad_norm": 1.4682823059941397, "learning_rate": 1.9662309011652874e-05, "loss": 1.373, "step": 2301 }, { "epoch": 0.11, "grad_norm": 1.2659435363782705, "learning_rate": 1.966190748683722e-05, "loss": 1.0723, "step": 2302 }, { "epoch": 0.11, "grad_norm": 1.098558617845344, "learning_rate": 1.9661505727555482e-05, "loss": 1.354, "step": 2303 }, { "epoch": 0.11, "grad_norm": 1.4785383862881551, "learning_rate": 1.9661103733817418e-05, "loss": 1.4214, "step": 2304 }, { "epoch": 0.11, "grad_norm": 1.5636660680093912, "learning_rate": 1.9660701505632773e-05, "loss": 1.3472, "step": 2305 }, { "epoch": 0.11, "grad_norm": 0.8536453487072471, "learning_rate": 1.9660299043011316e-05, "loss": 1.3354, "step": 2306 }, { "epoch": 0.11, "grad_norm": 1.2637424095773873, "learning_rate": 1.9659896345962815e-05, "loss": 1.2446, "step": 2307 }, { "epoch": 0.11, "grad_norm": 1.2131535028861988, "learning_rate": 1.9659493414497034e-05, "loss": 1.2046, "step": 2308 }, { "epoch": 0.11, "grad_norm": 1.4206976425747657, "learning_rate": 1.965909024862376e-05, "loss": 1.4697, "step": 2309 }, { "epoch": 0.11, "grad_norm": 1.056190471735001, "learning_rate": 1.9658686848352767e-05, "loss": 1.2178, "step": 2310 }, { "epoch": 0.11, "grad_norm": 1.4553576270112303, "learning_rate": 1.965828321369385e-05, "loss": 1.1528, "step": 2311 }, { "epoch": 0.11, "grad_norm": 1.3498112138390386, "learning_rate": 1.9657879344656804e-05, "loss": 1.3857, "step": 2312 }, { "epoch": 0.11, "grad_norm": 1.1622261227579345, "learning_rate": 1.965747524125143e-05, "loss": 1.1802, "step": 2313 }, { "epoch": 0.11, "grad_norm": 1.2096272035824487, "learning_rate": 1.9657070903487534e-05, "loss": 1.1445, "step": 2314 }, { "epoch": 0.11, "grad_norm": 1.0043944203194202, "learning_rate": 1.9656666331374927e-05, "loss": 1.2119, "step": 2315 }, { "epoch": 0.11, "grad_norm": 1.4284110934721865, "learning_rate": 1.9656261524923428e-05, "loss": 1.2739, "step": 2316 }, { "epoch": 0.11, "grad_norm": 1.3808852245701946, "learning_rate": 1.965585648414286e-05, "loss": 1.3408, "step": 2317 }, { "epoch": 0.11, "grad_norm": 1.3448842824304936, "learning_rate": 1.965545120904305e-05, "loss": 1.1221, "step": 2318 }, { "epoch": 0.11, "grad_norm": 1.7761480865008914, "learning_rate": 1.9655045699633836e-05, "loss": 1.2227, "step": 2319 }, { "epoch": 0.11, "grad_norm": 1.2377917117865687, "learning_rate": 1.9654639955925057e-05, "loss": 1.1733, "step": 2320 }, { "epoch": 0.11, "grad_norm": 1.361409411469485, "learning_rate": 1.9654233977926557e-05, "loss": 1.3677, "step": 2321 }, { "epoch": 0.11, "grad_norm": 1.4360314926580808, "learning_rate": 1.9653827765648194e-05, "loss": 1.1372, "step": 2322 }, { "epoch": 0.11, "grad_norm": 1.4070003835547202, "learning_rate": 1.965342131909982e-05, "loss": 1.3633, "step": 2323 }, { "epoch": 0.11, "grad_norm": 1.2537364996215397, "learning_rate": 1.9653014638291304e-05, "loss": 1.104, "step": 2324 }, { "epoch": 0.11, "grad_norm": 1.360258461326476, "learning_rate": 1.9652607723232507e-05, "loss": 1.3418, "step": 2325 }, { "epoch": 0.11, "grad_norm": 1.2048037774424762, "learning_rate": 1.965220057393331e-05, "loss": 1.1228, "step": 2326 }, { "epoch": 0.11, "grad_norm": 1.1449742820233493, "learning_rate": 1.9651793190403592e-05, "loss": 1.1362, "step": 2327 }, { "epoch": 0.11, "grad_norm": 1.3130055875721254, "learning_rate": 1.9651385572653235e-05, "loss": 1.3042, "step": 2328 }, { "epoch": 0.11, "grad_norm": 1.386455270687136, "learning_rate": 1.9650977720692138e-05, "loss": 1.3623, "step": 2329 }, { "epoch": 0.11, "grad_norm": 1.0417586423144762, "learning_rate": 1.965056963453019e-05, "loss": 1.0337, "step": 2330 }, { "epoch": 0.11, "grad_norm": 1.3789512530929682, "learning_rate": 1.96501613141773e-05, "loss": 1.3721, "step": 2331 }, { "epoch": 0.11, "grad_norm": 1.1465251128598535, "learning_rate": 1.9649752759643377e-05, "loss": 1.2163, "step": 2332 }, { "epoch": 0.11, "grad_norm": 1.2839212495289904, "learning_rate": 1.964934397093833e-05, "loss": 1.2729, "step": 2333 }, { "epoch": 0.11, "grad_norm": 1.6409262437267662, "learning_rate": 1.9648934948072086e-05, "loss": 1.2227, "step": 2334 }, { "epoch": 0.11, "grad_norm": 1.2363990136998246, "learning_rate": 1.9648525691054563e-05, "loss": 1.1709, "step": 2335 }, { "epoch": 0.11, "grad_norm": 1.2248303580444435, "learning_rate": 1.9648116199895703e-05, "loss": 1.2417, "step": 2336 }, { "epoch": 0.11, "grad_norm": 0.9194565592758108, "learning_rate": 1.964770647460543e-05, "loss": 1.2144, "step": 2337 }, { "epoch": 0.11, "grad_norm": 1.2503533949272256, "learning_rate": 1.96472965151937e-05, "loss": 1.189, "step": 2338 }, { "epoch": 0.11, "grad_norm": 1.2055128099450905, "learning_rate": 1.964688632167045e-05, "loss": 1.0854, "step": 2339 }, { "epoch": 0.11, "grad_norm": 1.2572758951314735, "learning_rate": 1.964647589404564e-05, "loss": 1.2446, "step": 2340 }, { "epoch": 0.11, "grad_norm": 1.105488729612356, "learning_rate": 1.964606523232923e-05, "loss": 1.1836, "step": 2341 }, { "epoch": 0.11, "grad_norm": 1.1067278312450484, "learning_rate": 1.964565433653119e-05, "loss": 1.2656, "step": 2342 }, { "epoch": 0.11, "grad_norm": 1.0888629241419383, "learning_rate": 1.964524320666148e-05, "loss": 1.1196, "step": 2343 }, { "epoch": 0.11, "grad_norm": 1.2574006426184852, "learning_rate": 1.9644831842730084e-05, "loss": 1.2524, "step": 2344 }, { "epoch": 0.11, "grad_norm": 1.2281574315340626, "learning_rate": 1.9644420244746984e-05, "loss": 1.3159, "step": 2345 }, { "epoch": 0.11, "grad_norm": 1.1589071922520195, "learning_rate": 1.964400841272217e-05, "loss": 1.0168, "step": 2346 }, { "epoch": 0.11, "grad_norm": 1.872261357443138, "learning_rate": 1.9643596346665634e-05, "loss": 1.2373, "step": 2347 }, { "epoch": 0.11, "grad_norm": 1.273959957222254, "learning_rate": 1.9643184046587373e-05, "loss": 1.0742, "step": 2348 }, { "epoch": 0.11, "grad_norm": 1.5004161463739305, "learning_rate": 1.9642771512497395e-05, "loss": 1.3921, "step": 2349 }, { "epoch": 0.11, "grad_norm": 1.4541454262176057, "learning_rate": 1.964235874440571e-05, "loss": 1.1753, "step": 2350 }, { "epoch": 0.11, "grad_norm": 1.09418371216279, "learning_rate": 1.9641945742322337e-05, "loss": 1.1978, "step": 2351 }, { "epoch": 0.11, "grad_norm": 1.5768573921220472, "learning_rate": 1.9641532506257297e-05, "loss": 1.2959, "step": 2352 }, { "epoch": 0.11, "grad_norm": 1.4616516763714056, "learning_rate": 1.9641119036220616e-05, "loss": 1.1401, "step": 2353 }, { "epoch": 0.11, "grad_norm": 1.2262769522045187, "learning_rate": 1.964070533222233e-05, "loss": 1.0522, "step": 2354 }, { "epoch": 0.11, "grad_norm": 1.0980319908008442, "learning_rate": 1.964029139427248e-05, "loss": 1.2446, "step": 2355 }, { "epoch": 0.11, "grad_norm": 0.9159828150683357, "learning_rate": 1.9639877222381106e-05, "loss": 1.1328, "step": 2356 }, { "epoch": 0.11, "grad_norm": 1.2356240241255108, "learning_rate": 1.9639462816558264e-05, "loss": 1.2852, "step": 2357 }, { "epoch": 0.11, "grad_norm": 1.3479498604917606, "learning_rate": 1.963904817681401e-05, "loss": 1.2354, "step": 2358 }, { "epoch": 0.11, "grad_norm": 1.295466568975589, "learning_rate": 1.96386333031584e-05, "loss": 1.3779, "step": 2359 }, { "epoch": 0.11, "grad_norm": 1.1534702425930583, "learning_rate": 1.9638218195601507e-05, "loss": 1.2104, "step": 2360 }, { "epoch": 0.11, "grad_norm": 1.171145138824874, "learning_rate": 1.9637802854153403e-05, "loss": 1.2295, "step": 2361 }, { "epoch": 0.11, "grad_norm": 1.2395792275796897, "learning_rate": 1.9637387278824168e-05, "loss": 1.1606, "step": 2362 }, { "epoch": 0.11, "grad_norm": 1.5297127580546614, "learning_rate": 1.9636971469623888e-05, "loss": 1.2178, "step": 2363 }, { "epoch": 0.11, "grad_norm": 1.4001134710145433, "learning_rate": 1.9636555426562653e-05, "loss": 1.3862, "step": 2364 }, { "epoch": 0.11, "grad_norm": 1.2612628193150617, "learning_rate": 1.963613914965055e-05, "loss": 1.3511, "step": 2365 }, { "epoch": 0.11, "grad_norm": 1.278201316498869, "learning_rate": 1.9635722638897697e-05, "loss": 1.1133, "step": 2366 }, { "epoch": 0.11, "grad_norm": 1.5234213502944443, "learning_rate": 1.963530589431419e-05, "loss": 1.0898, "step": 2367 }, { "epoch": 0.11, "grad_norm": 1.6250984455337114, "learning_rate": 1.9634888915910144e-05, "loss": 1.3267, "step": 2368 }, { "epoch": 0.11, "grad_norm": 1.287766571031839, "learning_rate": 1.963447170369568e-05, "loss": 1.3276, "step": 2369 }, { "epoch": 0.11, "grad_norm": 1.1912450340310021, "learning_rate": 1.9634054257680924e-05, "loss": 1.2661, "step": 2370 }, { "epoch": 0.11, "grad_norm": 1.5596823857482258, "learning_rate": 1.9633636577876004e-05, "loss": 1.2217, "step": 2371 }, { "epoch": 0.11, "grad_norm": 1.5257067616099147, "learning_rate": 1.963321866429105e-05, "loss": 1.1396, "step": 2372 }, { "epoch": 0.11, "grad_norm": 1.1229686921837245, "learning_rate": 1.9632800516936215e-05, "loss": 1.1982, "step": 2373 }, { "epoch": 0.11, "grad_norm": 1.3691876069949276, "learning_rate": 1.9632382135821638e-05, "loss": 1.1589, "step": 2374 }, { "epoch": 0.11, "grad_norm": 1.1890184812325273, "learning_rate": 1.9631963520957477e-05, "loss": 1.1812, "step": 2375 }, { "epoch": 0.11, "grad_norm": 1.1001247370953569, "learning_rate": 1.9631544672353886e-05, "loss": 1.2661, "step": 2376 }, { "epoch": 0.11, "grad_norm": 1.1264158280457661, "learning_rate": 1.963112559002103e-05, "loss": 1.2808, "step": 2377 }, { "epoch": 0.11, "grad_norm": 1.1766519217538112, "learning_rate": 1.963070627396908e-05, "loss": 1.2109, "step": 2378 }, { "epoch": 0.11, "grad_norm": 1.2647975176197777, "learning_rate": 1.9630286724208216e-05, "loss": 1.3291, "step": 2379 }, { "epoch": 0.11, "grad_norm": 1.3255061986928103, "learning_rate": 1.9629866940748612e-05, "loss": 1.3022, "step": 2380 }, { "epoch": 0.11, "grad_norm": 1.1895019992513642, "learning_rate": 1.9629446923600458e-05, "loss": 1.1436, "step": 2381 }, { "epoch": 0.11, "grad_norm": 1.144680760225059, "learning_rate": 1.9629026672773946e-05, "loss": 1.2271, "step": 2382 }, { "epoch": 0.11, "grad_norm": 1.312535077918102, "learning_rate": 1.9628606188279273e-05, "loss": 1.2485, "step": 2383 }, { "epoch": 0.11, "grad_norm": 1.0505008321765719, "learning_rate": 1.9628185470126645e-05, "loss": 1.3066, "step": 2384 }, { "epoch": 0.11, "grad_norm": 1.0705432413280491, "learning_rate": 1.9627764518326274e-05, "loss": 1.0874, "step": 2385 }, { "epoch": 0.11, "grad_norm": 1.24543877227386, "learning_rate": 1.962734333288837e-05, "loss": 1.1768, "step": 2386 }, { "epoch": 0.11, "grad_norm": 1.3343826179409273, "learning_rate": 1.9626921913823156e-05, "loss": 1.4805, "step": 2387 }, { "epoch": 0.11, "grad_norm": 1.155866728477415, "learning_rate": 1.962650026114086e-05, "loss": 1.2197, "step": 2388 }, { "epoch": 0.11, "grad_norm": 1.2072519443484715, "learning_rate": 1.9626078374851715e-05, "loss": 1.2627, "step": 2389 }, { "epoch": 0.11, "grad_norm": 1.313082703834362, "learning_rate": 1.9625656254965954e-05, "loss": 1.2471, "step": 2390 }, { "epoch": 0.12, "grad_norm": 1.2807905617250746, "learning_rate": 1.962523390149382e-05, "loss": 1.2446, "step": 2391 }, { "epoch": 0.12, "grad_norm": 1.004081603932383, "learning_rate": 1.9624811314445575e-05, "loss": 1.4062, "step": 2392 }, { "epoch": 0.12, "grad_norm": 0.9226988281900343, "learning_rate": 1.9624388493831462e-05, "loss": 1.2373, "step": 2393 }, { "epoch": 0.12, "grad_norm": 1.5818734194165454, "learning_rate": 1.962396543966174e-05, "loss": 1.3633, "step": 2394 }, { "epoch": 0.12, "grad_norm": 1.5113380432549346, "learning_rate": 1.9623542151946683e-05, "loss": 1.313, "step": 2395 }, { "epoch": 0.12, "grad_norm": 1.421060205152798, "learning_rate": 1.962311863069656e-05, "loss": 1.3955, "step": 2396 }, { "epoch": 0.12, "grad_norm": 1.096427553183146, "learning_rate": 1.962269487592165e-05, "loss": 1.2739, "step": 2397 }, { "epoch": 0.12, "grad_norm": 1.0203307315592112, "learning_rate": 1.9622270887632234e-05, "loss": 1.0977, "step": 2398 }, { "epoch": 0.12, "grad_norm": 1.2426730520849518, "learning_rate": 1.9621846665838598e-05, "loss": 1.2427, "step": 2399 }, { "epoch": 0.12, "grad_norm": 1.2766575791597585, "learning_rate": 1.9621422210551047e-05, "loss": 1.2705, "step": 2400 }, { "epoch": 0.12, "grad_norm": 1.1286198693191283, "learning_rate": 1.962099752177987e-05, "loss": 1.1636, "step": 2401 }, { "epoch": 0.12, "grad_norm": 1.4023808461006182, "learning_rate": 1.9620572599535378e-05, "loss": 1.2778, "step": 2402 }, { "epoch": 0.12, "grad_norm": 1.1617512577802296, "learning_rate": 1.962014744382788e-05, "loss": 1.2812, "step": 2403 }, { "epoch": 0.12, "grad_norm": 1.3379242629301378, "learning_rate": 1.9619722054667698e-05, "loss": 1.2178, "step": 2404 }, { "epoch": 0.12, "grad_norm": 1.2769647137193416, "learning_rate": 1.961929643206515e-05, "loss": 1.2612, "step": 2405 }, { "epoch": 0.12, "grad_norm": 1.19527223474733, "learning_rate": 1.961887057603057e-05, "loss": 1.3301, "step": 2406 }, { "epoch": 0.12, "grad_norm": 1.3087808311152336, "learning_rate": 1.9618444486574287e-05, "loss": 1.1763, "step": 2407 }, { "epoch": 0.12, "grad_norm": 1.1256135597904087, "learning_rate": 1.9618018163706644e-05, "loss": 1.0889, "step": 2408 }, { "epoch": 0.12, "grad_norm": 1.195483977549286, "learning_rate": 1.9617591607437988e-05, "loss": 1.2183, "step": 2409 }, { "epoch": 0.12, "grad_norm": 1.1485427155394363, "learning_rate": 1.9617164817778666e-05, "loss": 1.2388, "step": 2410 }, { "epoch": 0.12, "grad_norm": 1.30883342753104, "learning_rate": 1.9616737794739036e-05, "loss": 1.2437, "step": 2411 }, { "epoch": 0.12, "grad_norm": 1.2371062904454795, "learning_rate": 1.9616310538329463e-05, "loss": 1.3789, "step": 2412 }, { "epoch": 0.12, "grad_norm": 1.4021656225106407, "learning_rate": 1.961588304856031e-05, "loss": 1.2285, "step": 2413 }, { "epoch": 0.12, "grad_norm": 1.1731240121593056, "learning_rate": 1.961545532544196e-05, "loss": 1.2832, "step": 2414 }, { "epoch": 0.12, "grad_norm": 1.278818047464594, "learning_rate": 1.9615027368984785e-05, "loss": 1.291, "step": 2415 }, { "epoch": 0.12, "grad_norm": 1.1518043986831197, "learning_rate": 1.9614599179199172e-05, "loss": 0.9465, "step": 2416 }, { "epoch": 0.12, "grad_norm": 1.4806238248093422, "learning_rate": 1.961417075609551e-05, "loss": 1.4141, "step": 2417 }, { "epoch": 0.12, "grad_norm": 1.1285556910911094, "learning_rate": 1.9613742099684204e-05, "loss": 1.3057, "step": 2418 }, { "epoch": 0.12, "grad_norm": 1.3499216917200085, "learning_rate": 1.9613313209975645e-05, "loss": 1.2407, "step": 2419 }, { "epoch": 0.12, "grad_norm": 1.2679333929135839, "learning_rate": 1.961288408698025e-05, "loss": 1.1958, "step": 2420 }, { "epoch": 0.12, "grad_norm": 1.1483162991349474, "learning_rate": 1.961245473070843e-05, "loss": 1.2148, "step": 2421 }, { "epoch": 0.12, "grad_norm": 0.9214240803198256, "learning_rate": 1.96120251411706e-05, "loss": 1.2314, "step": 2422 }, { "epoch": 0.12, "grad_norm": 1.2529885235202343, "learning_rate": 1.9611595318377184e-05, "loss": 1.3672, "step": 2423 }, { "epoch": 0.12, "grad_norm": 1.4033944317545468, "learning_rate": 1.961116526233862e-05, "loss": 1.2632, "step": 2424 }, { "epoch": 0.12, "grad_norm": 1.1269224969593372, "learning_rate": 1.9610734973065342e-05, "loss": 1.3423, "step": 2425 }, { "epoch": 0.12, "grad_norm": 1.357224105931364, "learning_rate": 1.961030445056779e-05, "loss": 1.4346, "step": 2426 }, { "epoch": 0.12, "grad_norm": 1.3469847944195492, "learning_rate": 1.960987369485641e-05, "loss": 1.3721, "step": 2427 }, { "epoch": 0.12, "grad_norm": 1.30985738464524, "learning_rate": 1.960944270594166e-05, "loss": 1.2319, "step": 2428 }, { "epoch": 0.12, "grad_norm": 1.2812752012047819, "learning_rate": 1.9609011483833993e-05, "loss": 1.3027, "step": 2429 }, { "epoch": 0.12, "grad_norm": 1.560387929004423, "learning_rate": 1.9608580028543875e-05, "loss": 1.3882, "step": 2430 }, { "epoch": 0.12, "grad_norm": 1.0744188151971599, "learning_rate": 1.960814834008178e-05, "loss": 1.2866, "step": 2431 }, { "epoch": 0.12, "grad_norm": 1.0274341318740199, "learning_rate": 1.960771641845818e-05, "loss": 1.157, "step": 2432 }, { "epoch": 0.12, "grad_norm": 1.1871497694330204, "learning_rate": 1.960728426368356e-05, "loss": 1.1836, "step": 2433 }, { "epoch": 0.12, "grad_norm": 1.5638691628691719, "learning_rate": 1.9606851875768404e-05, "loss": 1.1543, "step": 2434 }, { "epoch": 0.12, "grad_norm": 1.4151425716564714, "learning_rate": 1.9606419254723205e-05, "loss": 1.0837, "step": 2435 }, { "epoch": 0.12, "grad_norm": 1.324307886979762, "learning_rate": 1.9605986400558462e-05, "loss": 1.23, "step": 2436 }, { "epoch": 0.12, "grad_norm": 0.9799720856040415, "learning_rate": 1.9605553313284682e-05, "loss": 1.0884, "step": 2437 }, { "epoch": 0.12, "grad_norm": 1.2656681114291513, "learning_rate": 1.9605119992912368e-05, "loss": 1.3472, "step": 2438 }, { "epoch": 0.12, "grad_norm": 1.2059794718373356, "learning_rate": 1.960468643945204e-05, "loss": 1.1846, "step": 2439 }, { "epoch": 0.12, "grad_norm": 1.3239270641480203, "learning_rate": 1.9604252652914222e-05, "loss": 1.2397, "step": 2440 }, { "epoch": 0.12, "grad_norm": 1.598138040968418, "learning_rate": 1.9603818633309434e-05, "loss": 1.3169, "step": 2441 }, { "epoch": 0.12, "grad_norm": 1.2665232835889386, "learning_rate": 1.9603384380648213e-05, "loss": 1.1663, "step": 2442 }, { "epoch": 0.12, "grad_norm": 1.4105574795427946, "learning_rate": 1.9602949894941096e-05, "loss": 1.2988, "step": 2443 }, { "epoch": 0.12, "grad_norm": 1.124746707470773, "learning_rate": 1.9602515176198623e-05, "loss": 1.3354, "step": 2444 }, { "epoch": 0.12, "grad_norm": 1.047550100685344, "learning_rate": 1.960208022443135e-05, "loss": 1.2676, "step": 2445 }, { "epoch": 0.12, "grad_norm": 1.4108711807779835, "learning_rate": 1.9601645039649828e-05, "loss": 1.2515, "step": 2446 }, { "epoch": 0.12, "grad_norm": 1.3328310165310426, "learning_rate": 1.9601209621864616e-05, "loss": 1.1851, "step": 2447 }, { "epoch": 0.12, "grad_norm": 1.229659908431658, "learning_rate": 1.9600773971086286e-05, "loss": 1.1709, "step": 2448 }, { "epoch": 0.12, "grad_norm": 1.225084249140839, "learning_rate": 1.9600338087325407e-05, "loss": 1.189, "step": 2449 }, { "epoch": 0.12, "grad_norm": 1.245548605235522, "learning_rate": 1.959990197059255e-05, "loss": 1.3105, "step": 2450 }, { "epoch": 0.12, "grad_norm": 1.53512254937094, "learning_rate": 1.9599465620898314e-05, "loss": 1.3086, "step": 2451 }, { "epoch": 0.12, "grad_norm": 1.1580350944690099, "learning_rate": 1.959902903825327e-05, "loss": 1.3223, "step": 2452 }, { "epoch": 0.12, "grad_norm": 1.531640929789945, "learning_rate": 1.959859222266803e-05, "loss": 1.2827, "step": 2453 }, { "epoch": 0.12, "grad_norm": 1.1337689347631477, "learning_rate": 1.9598155174153174e-05, "loss": 1.2603, "step": 2454 }, { "epoch": 0.12, "grad_norm": 1.2204928391586622, "learning_rate": 1.9597717892719326e-05, "loss": 1.2412, "step": 2455 }, { "epoch": 0.12, "grad_norm": 1.1612531697793014, "learning_rate": 1.9597280378377087e-05, "loss": 1.1694, "step": 2456 }, { "epoch": 0.12, "grad_norm": 1.3711515961128697, "learning_rate": 1.959684263113708e-05, "loss": 1.3076, "step": 2457 }, { "epoch": 0.12, "grad_norm": 1.434354278884517, "learning_rate": 1.9596404651009928e-05, "loss": 1.2007, "step": 2458 }, { "epoch": 0.12, "grad_norm": 1.3669500565608845, "learning_rate": 1.9595966438006253e-05, "loss": 1.3276, "step": 2459 }, { "epoch": 0.12, "grad_norm": 1.351034087300645, "learning_rate": 1.9595527992136697e-05, "loss": 1.231, "step": 2460 }, { "epoch": 0.12, "grad_norm": 1.2965194965792934, "learning_rate": 1.9595089313411892e-05, "loss": 1.2397, "step": 2461 }, { "epoch": 0.12, "grad_norm": 1.2544750643138867, "learning_rate": 1.9594650401842493e-05, "loss": 1.1113, "step": 2462 }, { "epoch": 0.12, "grad_norm": 1.302407483722702, "learning_rate": 1.959421125743914e-05, "loss": 1.1201, "step": 2463 }, { "epoch": 0.12, "grad_norm": 1.2766225793278472, "learning_rate": 1.9593771880212498e-05, "loss": 1.2822, "step": 2464 }, { "epoch": 0.12, "grad_norm": 1.5861913607961307, "learning_rate": 1.9593332270173225e-05, "loss": 1.1416, "step": 2465 }, { "epoch": 0.12, "grad_norm": 1.2984664062934461, "learning_rate": 1.9592892427331993e-05, "loss": 1.2148, "step": 2466 }, { "epoch": 0.12, "grad_norm": 1.5021415314302897, "learning_rate": 1.9592452351699475e-05, "loss": 1.2886, "step": 2467 }, { "epoch": 0.12, "grad_norm": 1.321530101180719, "learning_rate": 1.9592012043286342e-05, "loss": 1.2026, "step": 2468 }, { "epoch": 0.12, "grad_norm": 1.4072328595076355, "learning_rate": 1.9591571502103294e-05, "loss": 1.1577, "step": 2469 }, { "epoch": 0.12, "grad_norm": 1.2182644914677294, "learning_rate": 1.959113072816101e-05, "loss": 1.1343, "step": 2470 }, { "epoch": 0.12, "grad_norm": 1.4104888949377745, "learning_rate": 1.9590689721470188e-05, "loss": 1.2358, "step": 2471 }, { "epoch": 0.12, "grad_norm": 0.6961421274735642, "learning_rate": 1.9590248482041533e-05, "loss": 1.2969, "step": 2472 }, { "epoch": 0.12, "grad_norm": 1.3907948908161734, "learning_rate": 1.958980700988575e-05, "loss": 1.3198, "step": 2473 }, { "epoch": 0.12, "grad_norm": 1.2553004135796941, "learning_rate": 1.9589365305013556e-05, "loss": 1.2202, "step": 2474 }, { "epoch": 0.12, "grad_norm": 1.218533837696613, "learning_rate": 1.9588923367435667e-05, "loss": 1.2515, "step": 2475 }, { "epoch": 0.12, "grad_norm": 1.648522718900821, "learning_rate": 1.9588481197162804e-05, "loss": 1.1035, "step": 2476 }, { "epoch": 0.12, "grad_norm": 1.0967915039245528, "learning_rate": 1.9588038794205705e-05, "loss": 1.2847, "step": 2477 }, { "epoch": 0.12, "grad_norm": 1.2119963886914744, "learning_rate": 1.9587596158575102e-05, "loss": 1.1187, "step": 2478 }, { "epoch": 0.12, "grad_norm": 1.2669029385296082, "learning_rate": 1.9587153290281734e-05, "loss": 1.168, "step": 2479 }, { "epoch": 0.12, "grad_norm": 1.204797449252495, "learning_rate": 1.958671018933635e-05, "loss": 1.2051, "step": 2480 }, { "epoch": 0.12, "grad_norm": 1.0970592524335145, "learning_rate": 1.958626685574971e-05, "loss": 1.293, "step": 2481 }, { "epoch": 0.12, "grad_norm": 0.9473074251097754, "learning_rate": 1.9585823289532556e-05, "loss": 1.1812, "step": 2482 }, { "epoch": 0.12, "grad_norm": 1.6550291120624565, "learning_rate": 1.958537949069567e-05, "loss": 1.1382, "step": 2483 }, { "epoch": 0.12, "grad_norm": 1.338278912102079, "learning_rate": 1.9584935459249807e-05, "loss": 1.4204, "step": 2484 }, { "epoch": 0.12, "grad_norm": 1.311648532400192, "learning_rate": 1.958449119520575e-05, "loss": 1.3037, "step": 2485 }, { "epoch": 0.12, "grad_norm": 1.1005799797656695, "learning_rate": 1.9584046698574282e-05, "loss": 1.1064, "step": 2486 }, { "epoch": 0.12, "grad_norm": 1.0672076846271734, "learning_rate": 1.958360196936618e-05, "loss": 1.1934, "step": 2487 }, { "epoch": 0.12, "grad_norm": 1.1558947992208253, "learning_rate": 1.958315700759225e-05, "loss": 1.3779, "step": 2488 }, { "epoch": 0.12, "grad_norm": 1.574008515859585, "learning_rate": 1.9582711813263277e-05, "loss": 1.29, "step": 2489 }, { "epoch": 0.12, "grad_norm": 1.1865040571335295, "learning_rate": 1.9582266386390075e-05, "loss": 1.2261, "step": 2490 }, { "epoch": 0.12, "grad_norm": 1.2593440403947982, "learning_rate": 1.9581820726983443e-05, "loss": 1.2559, "step": 2491 }, { "epoch": 0.12, "grad_norm": 1.1742214226761714, "learning_rate": 1.9581374835054205e-05, "loss": 1.269, "step": 2492 }, { "epoch": 0.12, "grad_norm": 1.4270853656376172, "learning_rate": 1.9580928710613176e-05, "loss": 1.333, "step": 2493 }, { "epoch": 0.12, "grad_norm": 1.20208617408777, "learning_rate": 1.9580482353671184e-05, "loss": 1.002, "step": 2494 }, { "epoch": 0.12, "grad_norm": 1.298617437152622, "learning_rate": 1.958003576423906e-05, "loss": 1.0962, "step": 2495 }, { "epoch": 0.12, "grad_norm": 1.2116954482290827, "learning_rate": 1.9579588942327642e-05, "loss": 1.1411, "step": 2496 }, { "epoch": 0.12, "grad_norm": 0.9719017423123277, "learning_rate": 1.9579141887947772e-05, "loss": 1.1509, "step": 2497 }, { "epoch": 0.12, "grad_norm": 1.12164002510178, "learning_rate": 1.95786946011103e-05, "loss": 1.3472, "step": 2498 }, { "epoch": 0.12, "grad_norm": 1.4180597723917212, "learning_rate": 1.9578247081826083e-05, "loss": 1.186, "step": 2499 }, { "epoch": 0.12, "grad_norm": 1.3084173166108897, "learning_rate": 1.9577799330105973e-05, "loss": 1.2881, "step": 2500 }, { "epoch": 0.12, "grad_norm": 1.299736565275629, "learning_rate": 1.9577351345960845e-05, "loss": 1.3809, "step": 2501 }, { "epoch": 0.12, "grad_norm": 1.3746082732864824, "learning_rate": 1.9576903129401563e-05, "loss": 1.2676, "step": 2502 }, { "epoch": 0.12, "grad_norm": 1.2640632647327177, "learning_rate": 1.957645468043901e-05, "loss": 1.3125, "step": 2503 }, { "epoch": 0.12, "grad_norm": 1.2860940365915123, "learning_rate": 1.957600599908406e-05, "loss": 1.3188, "step": 2504 }, { "epoch": 0.12, "grad_norm": 1.1169321533739667, "learning_rate": 1.957555708534761e-05, "loss": 1.2637, "step": 2505 }, { "epoch": 0.12, "grad_norm": 1.360334653838007, "learning_rate": 1.9575107939240548e-05, "loss": 1.2456, "step": 2506 }, { "epoch": 0.12, "grad_norm": 1.390121526999914, "learning_rate": 1.957465856077378e-05, "loss": 1.3091, "step": 2507 }, { "epoch": 0.12, "grad_norm": 1.2463547431948927, "learning_rate": 1.9574208949958203e-05, "loss": 1.1924, "step": 2508 }, { "epoch": 0.12, "grad_norm": 1.3434113042787004, "learning_rate": 1.9573759106804732e-05, "loss": 1.2554, "step": 2509 }, { "epoch": 0.12, "grad_norm": 1.0404267395429299, "learning_rate": 1.9573309031324284e-05, "loss": 1.2856, "step": 2510 }, { "epoch": 0.12, "grad_norm": 1.193358636108902, "learning_rate": 1.957285872352778e-05, "loss": 1.2168, "step": 2511 }, { "epoch": 0.12, "grad_norm": 0.9772835024094433, "learning_rate": 1.9572408183426145e-05, "loss": 1.1079, "step": 2512 }, { "epoch": 0.12, "grad_norm": 1.2076002086011914, "learning_rate": 1.9571957411030318e-05, "loss": 1.0574, "step": 2513 }, { "epoch": 0.12, "grad_norm": 1.1905960519884033, "learning_rate": 1.9571506406351233e-05, "loss": 1.1367, "step": 2514 }, { "epoch": 0.12, "grad_norm": 1.0675195487722746, "learning_rate": 1.9571055169399837e-05, "loss": 0.959, "step": 2515 }, { "epoch": 0.12, "grad_norm": 1.0630188218764152, "learning_rate": 1.957060370018708e-05, "loss": 1.2651, "step": 2516 }, { "epoch": 0.12, "grad_norm": 1.135285453656186, "learning_rate": 1.9570151998723918e-05, "loss": 1.3384, "step": 2517 }, { "epoch": 0.12, "grad_norm": 1.1967640924909215, "learning_rate": 1.956970006502131e-05, "loss": 1.165, "step": 2518 }, { "epoch": 0.12, "grad_norm": 1.2657818878411677, "learning_rate": 1.956924789909022e-05, "loss": 1.2822, "step": 2519 }, { "epoch": 0.12, "grad_norm": 1.28813565836533, "learning_rate": 1.9568795500941635e-05, "loss": 1.2642, "step": 2520 }, { "epoch": 0.12, "grad_norm": 1.2742719865336056, "learning_rate": 1.956834287058652e-05, "loss": 1.249, "step": 2521 }, { "epoch": 0.12, "grad_norm": 1.2157443690434664, "learning_rate": 1.9567890008035865e-05, "loss": 1.2173, "step": 2522 }, { "epoch": 0.12, "grad_norm": 1.2288650982979246, "learning_rate": 1.956743691330065e-05, "loss": 1.0474, "step": 2523 }, { "epoch": 0.12, "grad_norm": 1.2431435582546921, "learning_rate": 1.9566983586391884e-05, "loss": 1.2158, "step": 2524 }, { "epoch": 0.12, "grad_norm": 1.406443882668918, "learning_rate": 1.956653002732056e-05, "loss": 1.1934, "step": 2525 }, { "epoch": 0.12, "grad_norm": 1.1933260696554162, "learning_rate": 1.9566076236097695e-05, "loss": 1.2246, "step": 2526 }, { "epoch": 0.12, "grad_norm": 1.229206244179249, "learning_rate": 1.956562221273428e-05, "loss": 1.0867, "step": 2527 }, { "epoch": 0.12, "grad_norm": 1.2948996800080066, "learning_rate": 1.9565167957241353e-05, "loss": 1.2993, "step": 2528 }, { "epoch": 0.12, "grad_norm": 1.3555341576612308, "learning_rate": 1.9564713469629928e-05, "loss": 1.2974, "step": 2529 }, { "epoch": 0.12, "grad_norm": 1.1326747749344523, "learning_rate": 1.9564258749911035e-05, "loss": 1.1313, "step": 2530 }, { "epoch": 0.12, "grad_norm": 1.0366328506288642, "learning_rate": 1.956380379809571e-05, "loss": 1.3643, "step": 2531 }, { "epoch": 0.12, "grad_norm": 1.2003266400327943, "learning_rate": 1.9563348614194992e-05, "loss": 1.168, "step": 2532 }, { "epoch": 0.12, "grad_norm": 1.1976393241519945, "learning_rate": 1.956289319821993e-05, "loss": 1.272, "step": 2533 }, { "epoch": 0.12, "grad_norm": 1.1002543056157317, "learning_rate": 1.9562437550181573e-05, "loss": 1.1284, "step": 2534 }, { "epoch": 0.12, "grad_norm": 1.8569249495359663, "learning_rate": 1.9561981670090978e-05, "loss": 1.3965, "step": 2535 }, { "epoch": 0.12, "grad_norm": 1.2969697636452504, "learning_rate": 1.9561525557959207e-05, "loss": 1.2036, "step": 2536 }, { "epoch": 0.12, "grad_norm": 1.2208261278774912, "learning_rate": 1.9561069213797333e-05, "loss": 1.2969, "step": 2537 }, { "epoch": 0.12, "grad_norm": 1.3067256060846895, "learning_rate": 1.9560612637616428e-05, "loss": 1.2017, "step": 2538 }, { "epoch": 0.12, "grad_norm": 1.3265013685582838, "learning_rate": 1.9560155829427567e-05, "loss": 1.1797, "step": 2539 }, { "epoch": 0.12, "grad_norm": 1.1594222004434382, "learning_rate": 1.9559698789241844e-05, "loss": 1.2778, "step": 2540 }, { "epoch": 0.12, "grad_norm": 1.3768017931854704, "learning_rate": 1.955924151707034e-05, "loss": 1.1724, "step": 2541 }, { "epoch": 0.12, "grad_norm": 0.973083906625703, "learning_rate": 1.955878401292416e-05, "loss": 1.2808, "step": 2542 }, { "epoch": 0.12, "grad_norm": 1.3339386892132308, "learning_rate": 1.955832627681441e-05, "loss": 1.2246, "step": 2543 }, { "epoch": 0.12, "grad_norm": 1.241590596089356, "learning_rate": 1.955786830875218e-05, "loss": 1.1182, "step": 2544 }, { "epoch": 0.12, "grad_norm": 1.1066817381920073, "learning_rate": 1.95574101087486e-05, "loss": 1.2676, "step": 2545 }, { "epoch": 0.12, "grad_norm": 1.1578904508740844, "learning_rate": 1.9556951676814787e-05, "loss": 1.1108, "step": 2546 }, { "epoch": 0.12, "grad_norm": 0.8413746696123369, "learning_rate": 1.9556493012961856e-05, "loss": 1.2534, "step": 2547 }, { "epoch": 0.12, "grad_norm": 1.3731716400194995, "learning_rate": 1.955603411720095e-05, "loss": 1.2393, "step": 2548 }, { "epoch": 0.12, "grad_norm": 1.3809773629016897, "learning_rate": 1.9555574989543197e-05, "loss": 1.3438, "step": 2549 }, { "epoch": 0.12, "grad_norm": 1.1396521329289346, "learning_rate": 1.9555115629999738e-05, "loss": 1.2383, "step": 2550 }, { "epoch": 0.12, "grad_norm": 1.1579492274205814, "learning_rate": 1.9554656038581728e-05, "loss": 1.3022, "step": 2551 }, { "epoch": 0.12, "grad_norm": 1.2294145735131206, "learning_rate": 1.9554196215300314e-05, "loss": 1.2847, "step": 2552 }, { "epoch": 0.12, "grad_norm": 1.0548017698656251, "learning_rate": 1.9553736160166657e-05, "loss": 1.2065, "step": 2553 }, { "epoch": 0.12, "grad_norm": 1.5035341990551603, "learning_rate": 1.9553275873191916e-05, "loss": 1.3442, "step": 2554 }, { "epoch": 0.12, "grad_norm": 1.2044633300176522, "learning_rate": 1.9552815354387267e-05, "loss": 1.0396, "step": 2555 }, { "epoch": 0.12, "grad_norm": 1.0050409787691135, "learning_rate": 1.9552354603763882e-05, "loss": 1.3657, "step": 2556 }, { "epoch": 0.12, "grad_norm": 1.1926393835383338, "learning_rate": 1.9551893621332944e-05, "loss": 1.2524, "step": 2557 }, { "epoch": 0.12, "grad_norm": 1.1322618278041925, "learning_rate": 1.9551432407105642e-05, "loss": 1.1406, "step": 2558 }, { "epoch": 0.12, "grad_norm": 1.2732769596000317, "learning_rate": 1.955097096109316e-05, "loss": 1.2109, "step": 2559 }, { "epoch": 0.12, "grad_norm": 1.3140435344672652, "learning_rate": 1.9550509283306703e-05, "loss": 1.2598, "step": 2560 }, { "epoch": 0.12, "grad_norm": 1.31704069217963, "learning_rate": 1.9550047373757475e-05, "loss": 1.3794, "step": 2561 }, { "epoch": 0.12, "grad_norm": 1.149483495781311, "learning_rate": 1.9549585232456682e-05, "loss": 1.1572, "step": 2562 }, { "epoch": 0.12, "grad_norm": 1.447439199725947, "learning_rate": 1.9549122859415538e-05, "loss": 1.2598, "step": 2563 }, { "epoch": 0.12, "grad_norm": 1.2720953620813826, "learning_rate": 1.9548660254645265e-05, "loss": 1.3188, "step": 2564 }, { "epoch": 0.12, "grad_norm": 1.20609021116678, "learning_rate": 1.954819741815709e-05, "loss": 1.1548, "step": 2565 }, { "epoch": 0.12, "grad_norm": 1.3947520685983186, "learning_rate": 1.9547734349962246e-05, "loss": 1.3662, "step": 2566 }, { "epoch": 0.12, "grad_norm": 1.3989359877201486, "learning_rate": 1.9547271050071965e-05, "loss": 1.2241, "step": 2567 }, { "epoch": 0.12, "grad_norm": 1.4325290086939035, "learning_rate": 1.9546807518497496e-05, "loss": 1.2622, "step": 2568 }, { "epoch": 0.12, "grad_norm": 1.3047474618513795, "learning_rate": 1.954634375525008e-05, "loss": 1.1157, "step": 2569 }, { "epoch": 0.12, "grad_norm": 1.414767017813904, "learning_rate": 1.9545879760340983e-05, "loss": 1.3994, "step": 2570 }, { "epoch": 0.12, "grad_norm": 1.2003062612738746, "learning_rate": 1.9545415533781453e-05, "loss": 1.2212, "step": 2571 }, { "epoch": 0.12, "grad_norm": 1.1903294018204333, "learning_rate": 1.954495107558276e-05, "loss": 1.271, "step": 2572 }, { "epoch": 0.12, "grad_norm": 1.1706819487490872, "learning_rate": 1.9544486385756176e-05, "loss": 1.1875, "step": 2573 }, { "epoch": 0.12, "grad_norm": 1.3106494686487498, "learning_rate": 1.9544021464312977e-05, "loss": 1.1704, "step": 2574 }, { "epoch": 0.12, "grad_norm": 1.1807119241232262, "learning_rate": 1.9543556311264445e-05, "loss": 1.1567, "step": 2575 }, { "epoch": 0.12, "grad_norm": 1.1699019314697918, "learning_rate": 1.954309092662187e-05, "loss": 1.1992, "step": 2576 }, { "epoch": 0.12, "grad_norm": 1.034930724978826, "learning_rate": 1.9542625310396538e-05, "loss": 0.9543, "step": 2577 }, { "epoch": 0.12, "grad_norm": 0.9398418428180848, "learning_rate": 1.9542159462599755e-05, "loss": 1.127, "step": 2578 }, { "epoch": 0.12, "grad_norm": 1.4324097547456225, "learning_rate": 1.954169338324283e-05, "loss": 1.3579, "step": 2579 }, { "epoch": 0.12, "grad_norm": 1.2763489788869873, "learning_rate": 1.954122707233706e-05, "loss": 1.2388, "step": 2580 }, { "epoch": 0.12, "grad_norm": 1.1688819639416301, "learning_rate": 1.954076052989377e-05, "loss": 1.1948, "step": 2581 }, { "epoch": 0.12, "grad_norm": 1.2252312817180646, "learning_rate": 1.9540293755924285e-05, "loss": 1.1094, "step": 2582 }, { "epoch": 0.12, "grad_norm": 0.918004253672766, "learning_rate": 1.9539826750439926e-05, "loss": 1.1992, "step": 2583 }, { "epoch": 0.12, "grad_norm": 1.3443329623597808, "learning_rate": 1.9539359513452026e-05, "loss": 1.3086, "step": 2584 }, { "epoch": 0.12, "grad_norm": 1.163462173070634, "learning_rate": 1.9538892044971925e-05, "loss": 1.2451, "step": 2585 }, { "epoch": 0.12, "grad_norm": 1.256946538894521, "learning_rate": 1.9538424345010968e-05, "loss": 1.2422, "step": 2586 }, { "epoch": 0.12, "grad_norm": 1.1091409836490176, "learning_rate": 1.9537956413580504e-05, "loss": 1.2983, "step": 2587 }, { "epoch": 0.12, "grad_norm": 1.2413419147935638, "learning_rate": 1.9537488250691884e-05, "loss": 1.1318, "step": 2588 }, { "epoch": 0.12, "grad_norm": 1.3492905223479534, "learning_rate": 1.9537019856356478e-05, "loss": 1.2739, "step": 2589 }, { "epoch": 0.12, "grad_norm": 1.1375947542570342, "learning_rate": 1.9536551230585643e-05, "loss": 1.0891, "step": 2590 }, { "epoch": 0.12, "grad_norm": 1.3212637437025099, "learning_rate": 1.953608237339076e-05, "loss": 1.2842, "step": 2591 }, { "epoch": 0.12, "grad_norm": 1.3761988711110171, "learning_rate": 1.95356132847832e-05, "loss": 1.2778, "step": 2592 }, { "epoch": 0.12, "grad_norm": 1.4514536335341264, "learning_rate": 1.953514396477435e-05, "loss": 1.1572, "step": 2593 }, { "epoch": 0.12, "grad_norm": 1.1903706105917298, "learning_rate": 1.9534674413375595e-05, "loss": 1.3467, "step": 2594 }, { "epoch": 0.12, "grad_norm": 1.1040056340807305, "learning_rate": 1.9534204630598334e-05, "loss": 1.2041, "step": 2595 }, { "epoch": 0.12, "grad_norm": 1.2644244620636267, "learning_rate": 1.953373461645397e-05, "loss": 1.1506, "step": 2596 }, { "epoch": 0.12, "grad_norm": 1.377398691695164, "learning_rate": 1.9533264370953898e-05, "loss": 1.2046, "step": 2597 }, { "epoch": 0.12, "grad_norm": 1.2565133448946917, "learning_rate": 1.953279389410954e-05, "loss": 1.3662, "step": 2598 }, { "epoch": 0.13, "grad_norm": 1.5464803177947397, "learning_rate": 1.9532323185932306e-05, "loss": 1.2217, "step": 2599 }, { "epoch": 0.13, "grad_norm": 1.301454939421954, "learning_rate": 1.953185224643362e-05, "loss": 1.0884, "step": 2600 }, { "epoch": 0.13, "grad_norm": 1.3985990883137358, "learning_rate": 1.953138107562492e-05, "loss": 1.1089, "step": 2601 }, { "epoch": 0.13, "grad_norm": 1.1090103385598904, "learning_rate": 1.953090967351763e-05, "loss": 1.1455, "step": 2602 }, { "epoch": 0.13, "grad_norm": 1.2274952346429011, "learning_rate": 1.9530438040123188e-05, "loss": 1.2515, "step": 2603 }, { "epoch": 0.13, "grad_norm": 1.3273102530417267, "learning_rate": 1.952996617545304e-05, "loss": 1.2134, "step": 2604 }, { "epoch": 0.13, "grad_norm": 1.383412273319498, "learning_rate": 1.9529494079518647e-05, "loss": 1.2109, "step": 2605 }, { "epoch": 0.13, "grad_norm": 1.4826394588138023, "learning_rate": 1.9529021752331455e-05, "loss": 1.4165, "step": 2606 }, { "epoch": 0.13, "grad_norm": 1.5892180018441058, "learning_rate": 1.9528549193902926e-05, "loss": 0.9695, "step": 2607 }, { "epoch": 0.13, "grad_norm": 1.4107540037903843, "learning_rate": 1.9528076404244537e-05, "loss": 1.1387, "step": 2608 }, { "epoch": 0.13, "grad_norm": 1.2886044296723107, "learning_rate": 1.952760338336775e-05, "loss": 1.2129, "step": 2609 }, { "epoch": 0.13, "grad_norm": 1.5333376630874707, "learning_rate": 1.952713013128405e-05, "loss": 1.0986, "step": 2610 }, { "epoch": 0.13, "grad_norm": 1.1946910437376104, "learning_rate": 1.9526656648004918e-05, "loss": 1.2456, "step": 2611 }, { "epoch": 0.13, "grad_norm": 1.168488957707521, "learning_rate": 1.952618293354185e-05, "loss": 1.0388, "step": 2612 }, { "epoch": 0.13, "grad_norm": 1.329239601506811, "learning_rate": 1.9525708987906334e-05, "loss": 1.1284, "step": 2613 }, { "epoch": 0.13, "grad_norm": 1.2847056575150058, "learning_rate": 1.9525234811109874e-05, "loss": 1.2842, "step": 2614 }, { "epoch": 0.13, "grad_norm": 1.3916592701412236, "learning_rate": 1.952476040316398e-05, "loss": 1.2749, "step": 2615 }, { "epoch": 0.13, "grad_norm": 1.0536584767894503, "learning_rate": 1.9524285764080166e-05, "loss": 1.2012, "step": 2616 }, { "epoch": 0.13, "grad_norm": 1.1068126620545042, "learning_rate": 1.9523810893869937e-05, "loss": 1.2588, "step": 2617 }, { "epoch": 0.13, "grad_norm": 1.381231981422122, "learning_rate": 1.9523335792544835e-05, "loss": 1.2227, "step": 2618 }, { "epoch": 0.13, "grad_norm": 1.419782463688498, "learning_rate": 1.9522860460116377e-05, "loss": 1.3232, "step": 2619 }, { "epoch": 0.13, "grad_norm": 0.8967362204817978, "learning_rate": 1.9522384896596102e-05, "loss": 1.1235, "step": 2620 }, { "epoch": 0.13, "grad_norm": 1.3903719482081585, "learning_rate": 1.952190910199555e-05, "loss": 1.3281, "step": 2621 }, { "epoch": 0.13, "grad_norm": 1.1788294324506943, "learning_rate": 1.9521433076326267e-05, "loss": 0.9014, "step": 2622 }, { "epoch": 0.13, "grad_norm": 1.238667270163528, "learning_rate": 1.9520956819599804e-05, "loss": 1.3223, "step": 2623 }, { "epoch": 0.13, "grad_norm": 1.3185670949971537, "learning_rate": 1.9520480331827718e-05, "loss": 1.0427, "step": 2624 }, { "epoch": 0.13, "grad_norm": 1.3208644992634633, "learning_rate": 1.9520003613021577e-05, "loss": 1.1436, "step": 2625 }, { "epoch": 0.13, "grad_norm": 1.1201621181408399, "learning_rate": 1.951952666319294e-05, "loss": 1.228, "step": 2626 }, { "epoch": 0.13, "grad_norm": 1.1443857375294177, "learning_rate": 1.9519049482353393e-05, "loss": 1.2354, "step": 2627 }, { "epoch": 0.13, "grad_norm": 1.390447214870896, "learning_rate": 1.9518572070514507e-05, "loss": 1.2109, "step": 2628 }, { "epoch": 0.13, "grad_norm": 1.2813332936432766, "learning_rate": 1.9518094427687866e-05, "loss": 1.2993, "step": 2629 }, { "epoch": 0.13, "grad_norm": 1.3516008541789637, "learning_rate": 1.951761655388507e-05, "loss": 1.1265, "step": 2630 }, { "epoch": 0.13, "grad_norm": 1.3861402566311032, "learning_rate": 1.9517138449117706e-05, "loss": 1.1309, "step": 2631 }, { "epoch": 0.13, "grad_norm": 1.277975067018898, "learning_rate": 1.9516660113397386e-05, "loss": 1.2993, "step": 2632 }, { "epoch": 0.13, "grad_norm": 1.2048315741015423, "learning_rate": 1.951618154673571e-05, "loss": 1.1948, "step": 2633 }, { "epoch": 0.13, "grad_norm": 1.4532587668465788, "learning_rate": 1.9515702749144293e-05, "loss": 1.2021, "step": 2634 }, { "epoch": 0.13, "grad_norm": 1.2551261815297083, "learning_rate": 1.951522372063476e-05, "loss": 1.2505, "step": 2635 }, { "epoch": 0.13, "grad_norm": 1.105726220711786, "learning_rate": 1.9514744461218725e-05, "loss": 1.1606, "step": 2636 }, { "epoch": 0.13, "grad_norm": 1.3688134028680958, "learning_rate": 1.9514264970907825e-05, "loss": 1.1997, "step": 2637 }, { "epoch": 0.13, "grad_norm": 1.3219094090564436, "learning_rate": 1.9513785249713697e-05, "loss": 1.1135, "step": 2638 }, { "epoch": 0.13, "grad_norm": 1.7853339886908384, "learning_rate": 1.9513305297647976e-05, "loss": 1.0913, "step": 2639 }, { "epoch": 0.13, "grad_norm": 1.5353443092519012, "learning_rate": 1.9512825114722314e-05, "loss": 1.1616, "step": 2640 }, { "epoch": 0.13, "grad_norm": 1.3129276225767958, "learning_rate": 1.9512344700948363e-05, "loss": 1.2583, "step": 2641 }, { "epoch": 0.13, "grad_norm": 1.1660160742699783, "learning_rate": 1.9511864056337784e-05, "loss": 1.2637, "step": 2642 }, { "epoch": 0.13, "grad_norm": 1.411010402149613, "learning_rate": 1.9511383180902237e-05, "loss": 1.2092, "step": 2643 }, { "epoch": 0.13, "grad_norm": 1.1203285779663665, "learning_rate": 1.951090207465339e-05, "loss": 1.1519, "step": 2644 }, { "epoch": 0.13, "grad_norm": 1.130191699555457, "learning_rate": 1.951042073760292e-05, "loss": 1.1143, "step": 2645 }, { "epoch": 0.13, "grad_norm": 1.4040264993422515, "learning_rate": 1.950993916976251e-05, "loss": 1.3159, "step": 2646 }, { "epoch": 0.13, "grad_norm": 1.3799814099078165, "learning_rate": 1.9509457371143843e-05, "loss": 1.2622, "step": 2647 }, { "epoch": 0.13, "grad_norm": 1.374255281661571, "learning_rate": 1.950897534175861e-05, "loss": 1.3154, "step": 2648 }, { "epoch": 0.13, "grad_norm": 1.207799743896137, "learning_rate": 1.9508493081618515e-05, "loss": 1.3262, "step": 2649 }, { "epoch": 0.13, "grad_norm": 1.3411659752367298, "learning_rate": 1.9508010590735252e-05, "loss": 1.3589, "step": 2650 }, { "epoch": 0.13, "grad_norm": 0.9885599863767687, "learning_rate": 1.9507527869120534e-05, "loss": 1.2959, "step": 2651 }, { "epoch": 0.13, "grad_norm": 1.1772614334998182, "learning_rate": 1.950704491678608e-05, "loss": 1.2778, "step": 2652 }, { "epoch": 0.13, "grad_norm": 1.2873046350426027, "learning_rate": 1.95065617337436e-05, "loss": 1.1743, "step": 2653 }, { "epoch": 0.13, "grad_norm": 1.0156569352513143, "learning_rate": 1.9506078320004825e-05, "loss": 1.2769, "step": 2654 }, { "epoch": 0.13, "grad_norm": 1.7578318469170935, "learning_rate": 1.950559467558149e-05, "loss": 1.3306, "step": 2655 }, { "epoch": 0.13, "grad_norm": 1.3665634164972262, "learning_rate": 1.9505110800485324e-05, "loss": 1.2495, "step": 2656 }, { "epoch": 0.13, "grad_norm": 1.1897784573473844, "learning_rate": 1.950462669472807e-05, "loss": 1.2061, "step": 2657 }, { "epoch": 0.13, "grad_norm": 1.6778824954422504, "learning_rate": 1.950414235832148e-05, "loss": 1.4326, "step": 2658 }, { "epoch": 0.13, "grad_norm": 0.953544044382961, "learning_rate": 1.950365779127731e-05, "loss": 1.2246, "step": 2659 }, { "epoch": 0.13, "grad_norm": 0.9635973363183532, "learning_rate": 1.950317299360731e-05, "loss": 1.1284, "step": 2660 }, { "epoch": 0.13, "grad_norm": 1.2991755335551114, "learning_rate": 1.950268796532325e-05, "loss": 1.2446, "step": 2661 }, { "epoch": 0.13, "grad_norm": 1.1850769031460728, "learning_rate": 1.95022027064369e-05, "loss": 1.2559, "step": 2662 }, { "epoch": 0.13, "grad_norm": 1.3261594403826105, "learning_rate": 1.9501717216960035e-05, "loss": 1.3042, "step": 2663 }, { "epoch": 0.13, "grad_norm": 1.3376705315092543, "learning_rate": 1.9501231496904435e-05, "loss": 1.1865, "step": 2664 }, { "epoch": 0.13, "grad_norm": 1.6840414577597647, "learning_rate": 1.9500745546281893e-05, "loss": 1.335, "step": 2665 }, { "epoch": 0.13, "grad_norm": 0.9630277393012481, "learning_rate": 1.9500259365104192e-05, "loss": 1.3535, "step": 2666 }, { "epoch": 0.13, "grad_norm": 1.1648728122722483, "learning_rate": 1.949977295338314e-05, "loss": 1.3013, "step": 2667 }, { "epoch": 0.13, "grad_norm": 1.3870353307330323, "learning_rate": 1.9499286311130533e-05, "loss": 1.2734, "step": 2668 }, { "epoch": 0.13, "grad_norm": 1.0051247445864258, "learning_rate": 1.9498799438358186e-05, "loss": 1.1218, "step": 2669 }, { "epoch": 0.13, "grad_norm": 1.4692828817837424, "learning_rate": 1.949831233507791e-05, "loss": 1.2891, "step": 2670 }, { "epoch": 0.13, "grad_norm": 1.1884340648055194, "learning_rate": 1.949782500130153e-05, "loss": 1.0845, "step": 2671 }, { "epoch": 0.13, "grad_norm": 1.1801397884745513, "learning_rate": 1.9497337437040867e-05, "loss": 1.168, "step": 2672 }, { "epoch": 0.13, "grad_norm": 1.1591175863288263, "learning_rate": 1.9496849642307754e-05, "loss": 1.1201, "step": 2673 }, { "epoch": 0.13, "grad_norm": 1.2845953082500707, "learning_rate": 1.949636161711403e-05, "loss": 1.1519, "step": 2674 }, { "epoch": 0.13, "grad_norm": 1.3130348856329377, "learning_rate": 1.9495873361471538e-05, "loss": 1.3472, "step": 2675 }, { "epoch": 0.13, "grad_norm": 1.6390066783489718, "learning_rate": 1.9495384875392125e-05, "loss": 1.2969, "step": 2676 }, { "epoch": 0.13, "grad_norm": 1.1702465578936851, "learning_rate": 1.9494896158887647e-05, "loss": 1.043, "step": 2677 }, { "epoch": 0.13, "grad_norm": 1.4083714326241368, "learning_rate": 1.949440721196996e-05, "loss": 1.1934, "step": 2678 }, { "epoch": 0.13, "grad_norm": 1.2217101762651914, "learning_rate": 1.9493918034650934e-05, "loss": 1.23, "step": 2679 }, { "epoch": 0.13, "grad_norm": 1.3457682326286258, "learning_rate": 1.9493428626942443e-05, "loss": 1.3691, "step": 2680 }, { "epoch": 0.13, "grad_norm": 1.2843865777156251, "learning_rate": 1.9492938988856354e-05, "loss": 1.3447, "step": 2681 }, { "epoch": 0.13, "grad_norm": 1.3189724785444232, "learning_rate": 1.949244912040455e-05, "loss": 1.3823, "step": 2682 }, { "epoch": 0.13, "grad_norm": 0.9085058924385045, "learning_rate": 1.9491959021598927e-05, "loss": 1.3394, "step": 2683 }, { "epoch": 0.13, "grad_norm": 1.2680059982484344, "learning_rate": 1.9491468692451373e-05, "loss": 1.1914, "step": 2684 }, { "epoch": 0.13, "grad_norm": 1.0906562293328212, "learning_rate": 1.9490978132973785e-05, "loss": 1.3433, "step": 2685 }, { "epoch": 0.13, "grad_norm": 1.3892974353480247, "learning_rate": 1.9490487343178072e-05, "loss": 1.3438, "step": 2686 }, { "epoch": 0.13, "grad_norm": 1.2405708105728084, "learning_rate": 1.948999632307614e-05, "loss": 1.2207, "step": 2687 }, { "epoch": 0.13, "grad_norm": 1.1794882816821655, "learning_rate": 1.9489505072679907e-05, "loss": 1.0903, "step": 2688 }, { "epoch": 0.13, "grad_norm": 1.125115722075324, "learning_rate": 1.9489013592001293e-05, "loss": 1.3315, "step": 2689 }, { "epoch": 0.13, "grad_norm": 1.2514326790760983, "learning_rate": 1.9488521881052225e-05, "loss": 1.3057, "step": 2690 }, { "epoch": 0.13, "grad_norm": 1.1834700329893788, "learning_rate": 1.9488029939844634e-05, "loss": 1.2227, "step": 2691 }, { "epoch": 0.13, "grad_norm": 1.0646793545606132, "learning_rate": 1.9487537768390465e-05, "loss": 1.0684, "step": 2692 }, { "epoch": 0.13, "grad_norm": 1.4078775873977178, "learning_rate": 1.9487045366701652e-05, "loss": 1.0378, "step": 2693 }, { "epoch": 0.13, "grad_norm": 1.4567505641646958, "learning_rate": 1.948655273479015e-05, "loss": 1.1978, "step": 2694 }, { "epoch": 0.13, "grad_norm": 1.3837350515519076, "learning_rate": 1.948605987266791e-05, "loss": 1.2319, "step": 2695 }, { "epoch": 0.13, "grad_norm": 1.6274504076448262, "learning_rate": 1.94855667803469e-05, "loss": 1.3555, "step": 2696 }, { "epoch": 0.13, "grad_norm": 1.2482950348343738, "learning_rate": 1.9485073457839072e-05, "loss": 1.1016, "step": 2697 }, { "epoch": 0.13, "grad_norm": 1.508445004528826, "learning_rate": 1.948457990515641e-05, "loss": 1.2202, "step": 2698 }, { "epoch": 0.13, "grad_norm": 1.5784204118832343, "learning_rate": 1.9484086122310887e-05, "loss": 1.3579, "step": 2699 }, { "epoch": 0.13, "grad_norm": 1.195766630008451, "learning_rate": 1.9483592109314487e-05, "loss": 1.0229, "step": 2700 }, { "epoch": 0.13, "grad_norm": 1.4892974489448472, "learning_rate": 1.9483097866179194e-05, "loss": 1.2578, "step": 2701 }, { "epoch": 0.13, "grad_norm": 1.2523307189241772, "learning_rate": 1.9482603392917006e-05, "loss": 1.2104, "step": 2702 }, { "epoch": 0.13, "grad_norm": 1.3213753338638972, "learning_rate": 1.948210868953992e-05, "loss": 1.2202, "step": 2703 }, { "epoch": 0.13, "grad_norm": 1.2371515699912812, "learning_rate": 1.9481613756059944e-05, "loss": 1.1626, "step": 2704 }, { "epoch": 0.13, "grad_norm": 1.1528619183220852, "learning_rate": 1.9481118592489086e-05, "loss": 1.241, "step": 2705 }, { "epoch": 0.13, "grad_norm": 1.2205704708623017, "learning_rate": 1.9480623198839362e-05, "loss": 1.0713, "step": 2706 }, { "epoch": 0.13, "grad_norm": 1.351149162062429, "learning_rate": 1.9480127575122795e-05, "loss": 1.1763, "step": 2707 }, { "epoch": 0.13, "grad_norm": 1.3660114624059205, "learning_rate": 1.9479631721351412e-05, "loss": 1.2842, "step": 2708 }, { "epoch": 0.13, "grad_norm": 1.1406711728619938, "learning_rate": 1.947913563753724e-05, "loss": 1.1855, "step": 2709 }, { "epoch": 0.13, "grad_norm": 1.526836677224904, "learning_rate": 1.947863932369233e-05, "loss": 1.2378, "step": 2710 }, { "epoch": 0.13, "grad_norm": 1.295356122452706, "learning_rate": 1.9478142779828717e-05, "loss": 1.1992, "step": 2711 }, { "epoch": 0.13, "grad_norm": 1.4371026241903622, "learning_rate": 1.9477646005958454e-05, "loss": 1.1907, "step": 2712 }, { "epoch": 0.13, "grad_norm": 1.293887046264935, "learning_rate": 1.9477149002093595e-05, "loss": 1.189, "step": 2713 }, { "epoch": 0.13, "grad_norm": 0.7677306817929203, "learning_rate": 1.9476651768246203e-05, "loss": 1.1489, "step": 2714 }, { "epoch": 0.13, "grad_norm": 1.1650026630874295, "learning_rate": 1.947615430442834e-05, "loss": 1.1592, "step": 2715 }, { "epoch": 0.13, "grad_norm": 1.3561259175171914, "learning_rate": 1.947565661065208e-05, "loss": 1.333, "step": 2716 }, { "epoch": 0.13, "grad_norm": 1.2982224443944543, "learning_rate": 1.9475158686929498e-05, "loss": 1.1304, "step": 2717 }, { "epoch": 0.13, "grad_norm": 1.2277092994660086, "learning_rate": 1.9474660533272684e-05, "loss": 1.3174, "step": 2718 }, { "epoch": 0.13, "grad_norm": 1.181934872824549, "learning_rate": 1.9474162149693724e-05, "loss": 1.0298, "step": 2719 }, { "epoch": 0.13, "grad_norm": 1.121761323390677, "learning_rate": 1.947366353620471e-05, "loss": 1.2305, "step": 2720 }, { "epoch": 0.13, "grad_norm": 1.0986682076128282, "learning_rate": 1.947316469281774e-05, "loss": 1.1812, "step": 2721 }, { "epoch": 0.13, "grad_norm": 1.2771962711350335, "learning_rate": 1.9472665619544927e-05, "loss": 1.0493, "step": 2722 }, { "epoch": 0.13, "grad_norm": 1.2470280695813276, "learning_rate": 1.9472166316398376e-05, "loss": 1.2861, "step": 2723 }, { "epoch": 0.13, "grad_norm": 1.248492657409618, "learning_rate": 1.9471666783390204e-05, "loss": 1.0554, "step": 2724 }, { "epoch": 0.13, "grad_norm": 1.4363602022184598, "learning_rate": 1.9471167020532533e-05, "loss": 1.1252, "step": 2725 }, { "epoch": 0.13, "grad_norm": 1.3951853943635082, "learning_rate": 1.9470667027837497e-05, "loss": 1.2505, "step": 2726 }, { "epoch": 0.13, "grad_norm": 1.3385909599332624, "learning_rate": 1.9470166805317217e-05, "loss": 1.1313, "step": 2727 }, { "epoch": 0.13, "grad_norm": 1.6498237244370961, "learning_rate": 1.9469666352983845e-05, "loss": 1.2231, "step": 2728 }, { "epoch": 0.13, "grad_norm": 1.3026633112911212, "learning_rate": 1.946916567084952e-05, "loss": 1.1807, "step": 2729 }, { "epoch": 0.13, "grad_norm": 1.2384827840532582, "learning_rate": 1.9468664758926393e-05, "loss": 1.2754, "step": 2730 }, { "epoch": 0.13, "grad_norm": 1.5013219580003692, "learning_rate": 1.9468163617226613e-05, "loss": 1.1187, "step": 2731 }, { "epoch": 0.13, "grad_norm": 1.2395617381795718, "learning_rate": 1.9467662245762354e-05, "loss": 1.1982, "step": 2732 }, { "epoch": 0.13, "grad_norm": 1.287145630682147, "learning_rate": 1.9467160644545767e-05, "loss": 1.3584, "step": 2733 }, { "epoch": 0.13, "grad_norm": 0.9605145961361162, "learning_rate": 1.946665881358904e-05, "loss": 1.2373, "step": 2734 }, { "epoch": 0.13, "grad_norm": 1.592310642093392, "learning_rate": 1.9466156752904344e-05, "loss": 1.3408, "step": 2735 }, { "epoch": 0.13, "grad_norm": 1.3522835239576694, "learning_rate": 1.9465654462503862e-05, "loss": 1.2944, "step": 2736 }, { "epoch": 0.13, "grad_norm": 1.0767142539538348, "learning_rate": 1.946515194239978e-05, "loss": 1.2373, "step": 2737 }, { "epoch": 0.13, "grad_norm": 1.2732301832076856, "learning_rate": 1.94646491926043e-05, "loss": 1.2183, "step": 2738 }, { "epoch": 0.13, "grad_norm": 1.3273572409543741, "learning_rate": 1.9464146213129615e-05, "loss": 1.249, "step": 2739 }, { "epoch": 0.13, "grad_norm": 1.2203196314608338, "learning_rate": 1.9463643003987938e-05, "loss": 1.2061, "step": 2740 }, { "epoch": 0.13, "grad_norm": 1.2888686118285542, "learning_rate": 1.9463139565191476e-05, "loss": 1.1279, "step": 2741 }, { "epoch": 0.13, "grad_norm": 1.6738485890431998, "learning_rate": 1.9462635896752448e-05, "loss": 1.1709, "step": 2742 }, { "epoch": 0.13, "grad_norm": 1.5179778655302758, "learning_rate": 1.9462131998683073e-05, "loss": 1.2949, "step": 2743 }, { "epoch": 0.13, "grad_norm": 1.2581998347242902, "learning_rate": 1.9461627870995585e-05, "loss": 1.1841, "step": 2744 }, { "epoch": 0.13, "grad_norm": 1.1706289263251706, "learning_rate": 1.9461123513702208e-05, "loss": 1.1514, "step": 2745 }, { "epoch": 0.13, "grad_norm": 1.3540656260335235, "learning_rate": 1.9460618926815195e-05, "loss": 1.1431, "step": 2746 }, { "epoch": 0.13, "grad_norm": 1.181371854506658, "learning_rate": 1.9460114110346775e-05, "loss": 1.3228, "step": 2747 }, { "epoch": 0.13, "grad_norm": 1.3060698985202708, "learning_rate": 1.9459609064309212e-05, "loss": 1.2285, "step": 2748 }, { "epoch": 0.13, "grad_norm": 1.2193820099135526, "learning_rate": 1.9459103788714756e-05, "loss": 1.3706, "step": 2749 }, { "epoch": 0.13, "grad_norm": 1.2632294251288847, "learning_rate": 1.945859828357567e-05, "loss": 1.2524, "step": 2750 }, { "epoch": 0.13, "grad_norm": 1.292272289006726, "learning_rate": 1.945809254890422e-05, "loss": 1.231, "step": 2751 }, { "epoch": 0.13, "grad_norm": 1.285518832559783, "learning_rate": 1.9457586584712678e-05, "loss": 1.2417, "step": 2752 }, { "epoch": 0.13, "grad_norm": 1.377721273313054, "learning_rate": 1.9457080391013325e-05, "loss": 1.3311, "step": 2753 }, { "epoch": 0.13, "grad_norm": 1.3065472062513732, "learning_rate": 1.945657396781844e-05, "loss": 1.2876, "step": 2754 }, { "epoch": 0.13, "grad_norm": 1.075218528309244, "learning_rate": 1.945606731514032e-05, "loss": 1.252, "step": 2755 }, { "epoch": 0.13, "grad_norm": 1.1053559904967885, "learning_rate": 1.9455560432991253e-05, "loss": 1.1362, "step": 2756 }, { "epoch": 0.13, "grad_norm": 1.1445442057947144, "learning_rate": 1.9455053321383542e-05, "loss": 1.27, "step": 2757 }, { "epoch": 0.13, "grad_norm": 1.3036088298767414, "learning_rate": 1.9454545980329493e-05, "loss": 1.271, "step": 2758 }, { "epoch": 0.13, "grad_norm": 1.2492236477141547, "learning_rate": 1.945403840984142e-05, "loss": 1.2715, "step": 2759 }, { "epoch": 0.13, "grad_norm": 1.1645602544295508, "learning_rate": 1.9453530609931635e-05, "loss": 1.1592, "step": 2760 }, { "epoch": 0.13, "grad_norm": 1.3344633931376393, "learning_rate": 1.9453022580612468e-05, "loss": 1.3325, "step": 2761 }, { "epoch": 0.13, "grad_norm": 1.500064226615529, "learning_rate": 1.9452514321896242e-05, "loss": 1.2261, "step": 2762 }, { "epoch": 0.13, "grad_norm": 1.3835826017292094, "learning_rate": 1.945200583379529e-05, "loss": 1.0522, "step": 2763 }, { "epoch": 0.13, "grad_norm": 1.4977693613637881, "learning_rate": 1.9451497116321954e-05, "loss": 1.1924, "step": 2764 }, { "epoch": 0.13, "grad_norm": 1.4334526378868404, "learning_rate": 1.9450988169488577e-05, "loss": 1.2202, "step": 2765 }, { "epoch": 0.13, "grad_norm": 1.1886315228731148, "learning_rate": 1.9450478993307517e-05, "loss": 1.2563, "step": 2766 }, { "epoch": 0.13, "grad_norm": 1.4960322913632391, "learning_rate": 1.944996958779112e-05, "loss": 1.3008, "step": 2767 }, { "epoch": 0.13, "grad_norm": 1.2168292510914869, "learning_rate": 1.9449459952951756e-05, "loss": 1.124, "step": 2768 }, { "epoch": 0.13, "grad_norm": 1.2263149321092663, "learning_rate": 1.944895008880179e-05, "loss": 1.2905, "step": 2769 }, { "epoch": 0.13, "grad_norm": 1.1176357673464454, "learning_rate": 1.944843999535359e-05, "loss": 1.2422, "step": 2770 }, { "epoch": 0.13, "grad_norm": 1.8034217207186087, "learning_rate": 1.944792967261954e-05, "loss": 1.6475, "step": 2771 }, { "epoch": 0.13, "grad_norm": 1.1444175143709956, "learning_rate": 1.9447419120612018e-05, "loss": 1.3428, "step": 2772 }, { "epoch": 0.13, "grad_norm": 1.0889057681188112, "learning_rate": 1.9446908339343422e-05, "loss": 1.1899, "step": 2773 }, { "epoch": 0.13, "grad_norm": 1.2913008673174342, "learning_rate": 1.9446397328826145e-05, "loss": 1.3525, "step": 2774 }, { "epoch": 0.13, "grad_norm": 1.314183139570361, "learning_rate": 1.944588608907258e-05, "loss": 1.2114, "step": 2775 }, { "epoch": 0.13, "grad_norm": 1.2383782063969953, "learning_rate": 1.9445374620095142e-05, "loss": 1.1538, "step": 2776 }, { "epoch": 0.13, "grad_norm": 1.433031193669727, "learning_rate": 1.944486292190624e-05, "loss": 1.2524, "step": 2777 }, { "epoch": 0.13, "grad_norm": 1.5220803640520921, "learning_rate": 1.944435099451829e-05, "loss": 1.2231, "step": 2778 }, { "epoch": 0.13, "grad_norm": 1.348756151540924, "learning_rate": 1.9443838837943717e-05, "loss": 1.1802, "step": 2779 }, { "epoch": 0.13, "grad_norm": 1.230658827031683, "learning_rate": 1.9443326452194948e-05, "loss": 1.1519, "step": 2780 }, { "epoch": 0.13, "grad_norm": 1.4795190677676013, "learning_rate": 1.9442813837284416e-05, "loss": 1.2837, "step": 2781 }, { "epoch": 0.13, "grad_norm": 1.2253175063018085, "learning_rate": 1.9442300993224568e-05, "loss": 1.1782, "step": 2782 }, { "epoch": 0.13, "grad_norm": 1.1116310288251972, "learning_rate": 1.9441787920027843e-05, "loss": 1.1006, "step": 2783 }, { "epoch": 0.13, "grad_norm": 1.185254908474156, "learning_rate": 1.944127461770669e-05, "loss": 1.0732, "step": 2784 }, { "epoch": 0.13, "grad_norm": 1.2034505974271008, "learning_rate": 1.9440761086273564e-05, "loss": 1.3276, "step": 2785 }, { "epoch": 0.13, "grad_norm": 1.1833301613336273, "learning_rate": 1.944024732574094e-05, "loss": 1.2153, "step": 2786 }, { "epoch": 0.13, "grad_norm": 1.324195497593842, "learning_rate": 1.9439733336121267e-05, "loss": 1.2314, "step": 2787 }, { "epoch": 0.13, "grad_norm": 1.2966746003677356, "learning_rate": 1.9439219117427034e-05, "loss": 1.3203, "step": 2788 }, { "epoch": 0.13, "grad_norm": 1.0775020684596006, "learning_rate": 1.943870466967071e-05, "loss": 1.2168, "step": 2789 }, { "epoch": 0.13, "grad_norm": 1.2704756976158058, "learning_rate": 1.943818999286478e-05, "loss": 1.2061, "step": 2790 }, { "epoch": 0.13, "grad_norm": 1.2884705986945755, "learning_rate": 1.9437675087021736e-05, "loss": 1.1616, "step": 2791 }, { "epoch": 0.13, "grad_norm": 1.2689655803543933, "learning_rate": 1.9437159952154078e-05, "loss": 1.2549, "step": 2792 }, { "epoch": 0.13, "grad_norm": 1.1270979399452048, "learning_rate": 1.9436644588274295e-05, "loss": 1.2104, "step": 2793 }, { "epoch": 0.13, "grad_norm": 1.3237854685338393, "learning_rate": 1.9436128995394903e-05, "loss": 1.3237, "step": 2794 }, { "epoch": 0.13, "grad_norm": 1.3714890476815498, "learning_rate": 1.943561317352841e-05, "loss": 1.2314, "step": 2795 }, { "epoch": 0.13, "grad_norm": 1.2147327364398894, "learning_rate": 1.9435097122687337e-05, "loss": 1.0242, "step": 2796 }, { "epoch": 0.13, "grad_norm": 1.2251699293128333, "learning_rate": 1.94345808428842e-05, "loss": 1.144, "step": 2797 }, { "epoch": 0.13, "grad_norm": 1.0229270236297132, "learning_rate": 1.943406433413154e-05, "loss": 1.0718, "step": 2798 }, { "epoch": 0.13, "grad_norm": 1.3890659210640006, "learning_rate": 1.9433547596441877e-05, "loss": 1.3267, "step": 2799 }, { "epoch": 0.13, "grad_norm": 0.9435971767170067, "learning_rate": 1.9433030629827757e-05, "loss": 1.1553, "step": 2800 }, { "epoch": 0.13, "grad_norm": 1.016031788757542, "learning_rate": 1.9432513434301727e-05, "loss": 1.2246, "step": 2801 }, { "epoch": 0.13, "grad_norm": 1.1638690201680657, "learning_rate": 1.943199600987633e-05, "loss": 1.3799, "step": 2802 }, { "epoch": 0.13, "grad_norm": 1.4807006934398304, "learning_rate": 1.943147835656414e-05, "loss": 1.1758, "step": 2803 }, { "epoch": 0.13, "grad_norm": 1.2804400313523154, "learning_rate": 1.9430960474377697e-05, "loss": 1.2778, "step": 2804 }, { "epoch": 0.13, "grad_norm": 1.4479869707589912, "learning_rate": 1.9430442363329583e-05, "loss": 1.3037, "step": 2805 }, { "epoch": 0.13, "grad_norm": 1.268016148060741, "learning_rate": 1.9429924023432364e-05, "loss": 1.1455, "step": 2806 }, { "epoch": 0.14, "grad_norm": 1.1999081222443693, "learning_rate": 1.9429405454698624e-05, "loss": 1.1465, "step": 2807 }, { "epoch": 0.14, "grad_norm": 1.344376244059421, "learning_rate": 1.9428886657140945e-05, "loss": 1.2593, "step": 2808 }, { "epoch": 0.14, "grad_norm": 1.4701314379464177, "learning_rate": 1.9428367630771915e-05, "loss": 1.1157, "step": 2809 }, { "epoch": 0.14, "grad_norm": 1.1431747322150458, "learning_rate": 1.942784837560413e-05, "loss": 1.2329, "step": 2810 }, { "epoch": 0.14, "grad_norm": 1.1353575488991032, "learning_rate": 1.9427328891650194e-05, "loss": 1.1895, "step": 2811 }, { "epoch": 0.14, "grad_norm": 1.223411346174628, "learning_rate": 1.9426809178922706e-05, "loss": 1.3247, "step": 2812 }, { "epoch": 0.14, "grad_norm": 1.4771901573125445, "learning_rate": 1.9426289237434286e-05, "loss": 1.2012, "step": 2813 }, { "epoch": 0.14, "grad_norm": 1.4287777913212347, "learning_rate": 1.9425769067197548e-05, "loss": 1.3433, "step": 2814 }, { "epoch": 0.14, "grad_norm": 1.0946886583466218, "learning_rate": 1.942524866822511e-05, "loss": 1.0933, "step": 2815 }, { "epoch": 0.14, "grad_norm": 1.3088641905831098, "learning_rate": 1.9424728040529612e-05, "loss": 1.0471, "step": 2816 }, { "epoch": 0.14, "grad_norm": 1.2290919181640083, "learning_rate": 1.9424207184123677e-05, "loss": 1.3198, "step": 2817 }, { "epoch": 0.14, "grad_norm": 1.0831557333734951, "learning_rate": 1.942368609901995e-05, "loss": 0.9758, "step": 2818 }, { "epoch": 0.14, "grad_norm": 1.3183986851601692, "learning_rate": 1.9423164785231078e-05, "loss": 1.2158, "step": 2819 }, { "epoch": 0.14, "grad_norm": 1.1323878749227911, "learning_rate": 1.942264324276971e-05, "loss": 1.022, "step": 2820 }, { "epoch": 0.14, "grad_norm": 1.1829668326401326, "learning_rate": 1.94221214716485e-05, "loss": 1.0972, "step": 2821 }, { "epoch": 0.14, "grad_norm": 1.2783245566215902, "learning_rate": 1.9421599471880108e-05, "loss": 1.21, "step": 2822 }, { "epoch": 0.14, "grad_norm": 1.2009905850972302, "learning_rate": 1.9421077243477208e-05, "loss": 1.3799, "step": 2823 }, { "epoch": 0.14, "grad_norm": 1.2397979054599495, "learning_rate": 1.942055478645247e-05, "loss": 1.3169, "step": 2824 }, { "epoch": 0.14, "grad_norm": 1.180182993732635, "learning_rate": 1.942003210081857e-05, "loss": 1.2998, "step": 2825 }, { "epoch": 0.14, "grad_norm": 1.631284546716238, "learning_rate": 1.9419509186588196e-05, "loss": 1.272, "step": 2826 }, { "epoch": 0.14, "grad_norm": 1.0786286689935665, "learning_rate": 1.9418986043774036e-05, "loss": 1.1714, "step": 2827 }, { "epoch": 0.14, "grad_norm": 1.2827376010283227, "learning_rate": 1.9418462672388784e-05, "loss": 1.2622, "step": 2828 }, { "epoch": 0.14, "grad_norm": 1.1964889851275509, "learning_rate": 1.941793907244514e-05, "loss": 1.1714, "step": 2829 }, { "epoch": 0.14, "grad_norm": 1.4398068155600379, "learning_rate": 1.941741524395581e-05, "loss": 1.3882, "step": 2830 }, { "epoch": 0.14, "grad_norm": 1.2662431762458628, "learning_rate": 1.9416891186933516e-05, "loss": 1.0977, "step": 2831 }, { "epoch": 0.14, "grad_norm": 1.3637622733023576, "learning_rate": 1.941636690139096e-05, "loss": 1.1455, "step": 2832 }, { "epoch": 0.14, "grad_norm": 1.1228120769985024, "learning_rate": 1.9415842387340875e-05, "loss": 1.1284, "step": 2833 }, { "epoch": 0.14, "grad_norm": 1.2381627564542637, "learning_rate": 1.9415317644795984e-05, "loss": 1.334, "step": 2834 }, { "epoch": 0.14, "grad_norm": 1.614476748504299, "learning_rate": 1.9414792673769028e-05, "loss": 1.2759, "step": 2835 }, { "epoch": 0.14, "grad_norm": 0.8831736471113798, "learning_rate": 1.9414267474272735e-05, "loss": 1.1509, "step": 2836 }, { "epoch": 0.14, "grad_norm": 1.1343365978856845, "learning_rate": 1.9413742046319863e-05, "loss": 1.1235, "step": 2837 }, { "epoch": 0.14, "grad_norm": 1.1277340059972034, "learning_rate": 1.941321638992315e-05, "loss": 1.1104, "step": 2838 }, { "epoch": 0.14, "grad_norm": 1.3081434152123395, "learning_rate": 1.9412690505095363e-05, "loss": 1.3784, "step": 2839 }, { "epoch": 0.14, "grad_norm": 1.1820263421719863, "learning_rate": 1.941216439184926e-05, "loss": 1.1406, "step": 2840 }, { "epoch": 0.14, "grad_norm": 1.522730300341463, "learning_rate": 1.9411638050197605e-05, "loss": 1.4102, "step": 2841 }, { "epoch": 0.14, "grad_norm": 1.2497431120805444, "learning_rate": 1.9411111480153174e-05, "loss": 1.2256, "step": 2842 }, { "epoch": 0.14, "grad_norm": 1.3342194215737273, "learning_rate": 1.9410584681728745e-05, "loss": 1.2856, "step": 2843 }, { "epoch": 0.14, "grad_norm": 1.3671614051186127, "learning_rate": 1.94100576549371e-05, "loss": 1.1626, "step": 2844 }, { "epoch": 0.14, "grad_norm": 1.1948901482433931, "learning_rate": 1.9409530399791026e-05, "loss": 1.2554, "step": 2845 }, { "epoch": 0.14, "grad_norm": 1.1420759416532227, "learning_rate": 1.940900291630333e-05, "loss": 1.1836, "step": 2846 }, { "epoch": 0.14, "grad_norm": 1.2146583496516814, "learning_rate": 1.94084752044868e-05, "loss": 1.2368, "step": 2847 }, { "epoch": 0.14, "grad_norm": 1.1691374353048996, "learning_rate": 1.9407947264354242e-05, "loss": 1.4341, "step": 2848 }, { "epoch": 0.14, "grad_norm": 1.3914484795982989, "learning_rate": 1.9407419095918477e-05, "loss": 1.3584, "step": 2849 }, { "epoch": 0.14, "grad_norm": 1.2906483851568866, "learning_rate": 1.9406890699192316e-05, "loss": 1.4043, "step": 2850 }, { "epoch": 0.14, "grad_norm": 1.2314056440264272, "learning_rate": 1.9406362074188584e-05, "loss": 1.1089, "step": 2851 }, { "epoch": 0.14, "grad_norm": 1.387199764006199, "learning_rate": 1.9405833220920104e-05, "loss": 1.3174, "step": 2852 }, { "epoch": 0.14, "grad_norm": 1.1937886869506873, "learning_rate": 1.9405304139399715e-05, "loss": 1.1172, "step": 2853 }, { "epoch": 0.14, "grad_norm": 1.0330574898950282, "learning_rate": 1.9404774829640254e-05, "loss": 1.2061, "step": 2854 }, { "epoch": 0.14, "grad_norm": 1.435143103603759, "learning_rate": 1.9404245291654568e-05, "loss": 1.2505, "step": 2855 }, { "epoch": 0.14, "grad_norm": 1.1900085016834696, "learning_rate": 1.9403715525455503e-05, "loss": 1.1272, "step": 2856 }, { "epoch": 0.14, "grad_norm": 1.1741867093081544, "learning_rate": 1.9403185531055915e-05, "loss": 1.2363, "step": 2857 }, { "epoch": 0.14, "grad_norm": 0.9827998480717662, "learning_rate": 1.9402655308468678e-05, "loss": 1.2646, "step": 2858 }, { "epoch": 0.14, "grad_norm": 1.3321912662631679, "learning_rate": 1.940212485770664e-05, "loss": 1.2554, "step": 2859 }, { "epoch": 0.14, "grad_norm": 1.4227229296680717, "learning_rate": 1.9401594178782686e-05, "loss": 1.1904, "step": 2860 }, { "epoch": 0.14, "grad_norm": 1.2650474875443942, "learning_rate": 1.940106327170969e-05, "loss": 1.2798, "step": 2861 }, { "epoch": 0.14, "grad_norm": 1.295340436165034, "learning_rate": 1.940053213650053e-05, "loss": 1.1738, "step": 2862 }, { "epoch": 0.14, "grad_norm": 1.3602678315128385, "learning_rate": 1.9400000773168107e-05, "loss": 1.2959, "step": 2863 }, { "epoch": 0.14, "grad_norm": 1.1744440207663347, "learning_rate": 1.939946918172531e-05, "loss": 1.2168, "step": 2864 }, { "epoch": 0.14, "grad_norm": 1.2350274681748403, "learning_rate": 1.9398937362185037e-05, "loss": 1.1997, "step": 2865 }, { "epoch": 0.14, "grad_norm": 1.1954224388447448, "learning_rate": 1.9398405314560197e-05, "loss": 1.2983, "step": 2866 }, { "epoch": 0.14, "grad_norm": 0.6766409597139228, "learning_rate": 1.93978730388637e-05, "loss": 1.1377, "step": 2867 }, { "epoch": 0.14, "grad_norm": 1.287479590998543, "learning_rate": 1.939734053510846e-05, "loss": 1.377, "step": 2868 }, { "epoch": 0.14, "grad_norm": 1.3014899293005142, "learning_rate": 1.9396807803307405e-05, "loss": 1.3901, "step": 2869 }, { "epoch": 0.14, "grad_norm": 1.2910748736843476, "learning_rate": 1.9396274843473455e-05, "loss": 1.1484, "step": 2870 }, { "epoch": 0.14, "grad_norm": 1.2284481350730798, "learning_rate": 1.9395741655619554e-05, "loss": 1.084, "step": 2871 }, { "epoch": 0.14, "grad_norm": 1.0081170246234077, "learning_rate": 1.939520823975863e-05, "loss": 1.2383, "step": 2872 }, { "epoch": 0.14, "grad_norm": 1.0848551407598979, "learning_rate": 1.9394674595903635e-05, "loss": 1.2505, "step": 2873 }, { "epoch": 0.14, "grad_norm": 1.3138431610009718, "learning_rate": 1.9394140724067515e-05, "loss": 1.2227, "step": 2874 }, { "epoch": 0.14, "grad_norm": 1.2475545719264889, "learning_rate": 1.9393606624263228e-05, "loss": 1.3516, "step": 2875 }, { "epoch": 0.14, "grad_norm": 1.223908275107291, "learning_rate": 1.9393072296503733e-05, "loss": 1.1172, "step": 2876 }, { "epoch": 0.14, "grad_norm": 1.207440857658899, "learning_rate": 1.9392537740801997e-05, "loss": 1.1533, "step": 2877 }, { "epoch": 0.14, "grad_norm": 0.849340513623634, "learning_rate": 1.9392002957170994e-05, "loss": 1.0964, "step": 2878 }, { "epoch": 0.14, "grad_norm": 1.2255856122266813, "learning_rate": 1.9391467945623698e-05, "loss": 1.1108, "step": 2879 }, { "epoch": 0.14, "grad_norm": 1.3291703189901876, "learning_rate": 1.9390932706173095e-05, "loss": 1.1284, "step": 2880 }, { "epoch": 0.14, "grad_norm": 1.2074881202673384, "learning_rate": 1.9390397238832173e-05, "loss": 1.2275, "step": 2881 }, { "epoch": 0.14, "grad_norm": 1.3632766844384925, "learning_rate": 1.9389861543613927e-05, "loss": 1.0986, "step": 2882 }, { "epoch": 0.14, "grad_norm": 1.5500235121861214, "learning_rate": 1.9389325620531356e-05, "loss": 1.355, "step": 2883 }, { "epoch": 0.14, "grad_norm": 1.4438013724127303, "learning_rate": 1.9388789469597464e-05, "loss": 1.1875, "step": 2884 }, { "epoch": 0.14, "grad_norm": 1.381271555245736, "learning_rate": 1.9388253090825263e-05, "loss": 1.2827, "step": 2885 }, { "epoch": 0.14, "grad_norm": 1.33160012904605, "learning_rate": 1.938771648422777e-05, "loss": 1.25, "step": 2886 }, { "epoch": 0.14, "grad_norm": 1.2600183244002516, "learning_rate": 1.9387179649818007e-05, "loss": 1.1997, "step": 2887 }, { "epoch": 0.14, "grad_norm": 1.2691824222201216, "learning_rate": 1.9386642587609002e-05, "loss": 1.1123, "step": 2888 }, { "epoch": 0.14, "grad_norm": 1.3222823154899834, "learning_rate": 1.9386105297613782e-05, "loss": 1.2236, "step": 2889 }, { "epoch": 0.14, "grad_norm": 1.727656873841732, "learning_rate": 1.9385567779845392e-05, "loss": 1.397, "step": 2890 }, { "epoch": 0.14, "grad_norm": 1.1716748045652734, "learning_rate": 1.9385030034316873e-05, "loss": 1.271, "step": 2891 }, { "epoch": 0.14, "grad_norm": 1.0743448966645597, "learning_rate": 1.938449206104128e-05, "loss": 1.2446, "step": 2892 }, { "epoch": 0.14, "grad_norm": 1.2440447271779054, "learning_rate": 1.9383953860031658e-05, "loss": 1.2192, "step": 2893 }, { "epoch": 0.14, "grad_norm": 1.2234636328430466, "learning_rate": 1.9383415431301075e-05, "loss": 1.354, "step": 2894 }, { "epoch": 0.14, "grad_norm": 1.2926155694284962, "learning_rate": 1.9382876774862594e-05, "loss": 1.2871, "step": 2895 }, { "epoch": 0.14, "grad_norm": 1.1247259686362867, "learning_rate": 1.938233789072929e-05, "loss": 1.1035, "step": 2896 }, { "epoch": 0.14, "grad_norm": 1.2908942394913634, "learning_rate": 1.938179877891424e-05, "loss": 1.2627, "step": 2897 }, { "epoch": 0.14, "grad_norm": 1.0951613496511399, "learning_rate": 1.9381259439430517e-05, "loss": 1.2178, "step": 2898 }, { "epoch": 0.14, "grad_norm": 1.4864806510464303, "learning_rate": 1.9380719872291223e-05, "loss": 1.3579, "step": 2899 }, { "epoch": 0.14, "grad_norm": 1.5601123950092, "learning_rate": 1.9380180077509444e-05, "loss": 1.2017, "step": 2900 }, { "epoch": 0.14, "grad_norm": 1.1336975833300569, "learning_rate": 1.937964005509828e-05, "loss": 1.248, "step": 2901 }, { "epoch": 0.14, "grad_norm": 1.3093503858395548, "learning_rate": 1.9379099805070836e-05, "loss": 1.1987, "step": 2902 }, { "epoch": 0.14, "grad_norm": 1.2516376160662592, "learning_rate": 1.9378559327440222e-05, "loss": 1.2012, "step": 2903 }, { "epoch": 0.14, "grad_norm": 1.3418745248182566, "learning_rate": 1.9378018622219557e-05, "loss": 1.1094, "step": 2904 }, { "epoch": 0.14, "grad_norm": 1.3590443223004038, "learning_rate": 1.9377477689421958e-05, "loss": 1.2173, "step": 2905 }, { "epoch": 0.14, "grad_norm": 1.3145202139020222, "learning_rate": 1.9376936529060556e-05, "loss": 1.2744, "step": 2906 }, { "epoch": 0.14, "grad_norm": 1.3943386221279719, "learning_rate": 1.9376395141148475e-05, "loss": 1.2988, "step": 2907 }, { "epoch": 0.14, "grad_norm": 1.3135279656858143, "learning_rate": 1.9375853525698866e-05, "loss": 1.4087, "step": 2908 }, { "epoch": 0.14, "grad_norm": 1.6335412643943827, "learning_rate": 1.9375311682724863e-05, "loss": 1.2163, "step": 2909 }, { "epoch": 0.14, "grad_norm": 1.1971603231447447, "learning_rate": 1.9374769612239617e-05, "loss": 1.1895, "step": 2910 }, { "epoch": 0.14, "grad_norm": 1.241647064541242, "learning_rate": 1.937422731425628e-05, "loss": 1.2295, "step": 2911 }, { "epoch": 0.14, "grad_norm": 1.6091302461929209, "learning_rate": 1.9373684788788018e-05, "loss": 1.1665, "step": 2912 }, { "epoch": 0.14, "grad_norm": 1.526103089677711, "learning_rate": 1.937314203584799e-05, "loss": 1.3149, "step": 2913 }, { "epoch": 0.14, "grad_norm": 1.3940387424809453, "learning_rate": 1.9372599055449374e-05, "loss": 1.2944, "step": 2914 }, { "epoch": 0.14, "grad_norm": 1.2232665457251142, "learning_rate": 1.937205584760534e-05, "loss": 1.0942, "step": 2915 }, { "epoch": 0.14, "grad_norm": 0.869537510745098, "learning_rate": 1.9371512412329078e-05, "loss": 1.1807, "step": 2916 }, { "epoch": 0.14, "grad_norm": 1.3508680641481063, "learning_rate": 1.9370968749633764e-05, "loss": 1.2188, "step": 2917 }, { "epoch": 0.14, "grad_norm": 1.532186470029494, "learning_rate": 1.9370424859532598e-05, "loss": 1.1377, "step": 2918 }, { "epoch": 0.14, "grad_norm": 1.5170045376298094, "learning_rate": 1.9369880742038783e-05, "loss": 1.4053, "step": 2919 }, { "epoch": 0.14, "grad_norm": 1.2588222663125679, "learning_rate": 1.9369336397165512e-05, "loss": 1.2554, "step": 2920 }, { "epoch": 0.14, "grad_norm": 1.4576297768308812, "learning_rate": 1.9368791824926006e-05, "loss": 1.2891, "step": 2921 }, { "epoch": 0.14, "grad_norm": 1.2020014702773025, "learning_rate": 1.9368247025333472e-05, "loss": 1.3325, "step": 2922 }, { "epoch": 0.14, "grad_norm": 1.210431747429196, "learning_rate": 1.9367701998401137e-05, "loss": 1.2207, "step": 2923 }, { "epoch": 0.14, "grad_norm": 1.2691949457388636, "learning_rate": 1.9367156744142218e-05, "loss": 1.2441, "step": 2924 }, { "epoch": 0.14, "grad_norm": 1.246384634578654, "learning_rate": 1.936661126256996e-05, "loss": 1.1211, "step": 2925 }, { "epoch": 0.14, "grad_norm": 1.396193353152672, "learning_rate": 1.9366065553697586e-05, "loss": 1.1299, "step": 2926 }, { "epoch": 0.14, "grad_norm": 1.279349403380316, "learning_rate": 1.936551961753835e-05, "loss": 1.1035, "step": 2927 }, { "epoch": 0.14, "grad_norm": 1.0620824453173423, "learning_rate": 1.9364973454105494e-05, "loss": 1.2095, "step": 2928 }, { "epoch": 0.14, "grad_norm": 1.370856057630793, "learning_rate": 1.9364427063412276e-05, "loss": 1.2896, "step": 2929 }, { "epoch": 0.14, "grad_norm": 1.3752984633077525, "learning_rate": 1.936388044547195e-05, "loss": 1.3745, "step": 2930 }, { "epoch": 0.14, "grad_norm": 1.4670274920233504, "learning_rate": 1.9363333600297784e-05, "loss": 1.1851, "step": 2931 }, { "epoch": 0.14, "grad_norm": 1.4387603088539793, "learning_rate": 1.936278652790305e-05, "loss": 1.2393, "step": 2932 }, { "epoch": 0.14, "grad_norm": 1.1919158677693653, "learning_rate": 1.9362239228301023e-05, "loss": 1.2422, "step": 2933 }, { "epoch": 0.14, "grad_norm": 1.2441899473788247, "learning_rate": 1.936169170150498e-05, "loss": 1.1548, "step": 2934 }, { "epoch": 0.14, "grad_norm": 1.084504634501322, "learning_rate": 1.936114394752821e-05, "loss": 1.3555, "step": 2935 }, { "epoch": 0.14, "grad_norm": 1.9528109730223289, "learning_rate": 1.936059596638401e-05, "loss": 1.2139, "step": 2936 }, { "epoch": 0.14, "grad_norm": 1.3467692265002342, "learning_rate": 1.9360047758085675e-05, "loss": 1.2109, "step": 2937 }, { "epoch": 0.14, "grad_norm": 1.0982701834788784, "learning_rate": 1.9359499322646505e-05, "loss": 1.0618, "step": 2938 }, { "epoch": 0.14, "grad_norm": 1.3195073368046937, "learning_rate": 1.9358950660079815e-05, "loss": 1.2686, "step": 2939 }, { "epoch": 0.14, "grad_norm": 1.560928375363077, "learning_rate": 1.9358401770398912e-05, "loss": 1.3193, "step": 2940 }, { "epoch": 0.14, "grad_norm": 1.101672521039128, "learning_rate": 1.9357852653617123e-05, "loss": 1.2524, "step": 2941 }, { "epoch": 0.14, "grad_norm": 1.2595039615382313, "learning_rate": 1.9357303309747772e-05, "loss": 1.2729, "step": 2942 }, { "epoch": 0.14, "grad_norm": 1.3034737541725454, "learning_rate": 1.9356753738804185e-05, "loss": 0.9446, "step": 2943 }, { "epoch": 0.14, "grad_norm": 1.1626915117429417, "learning_rate": 1.9356203940799702e-05, "loss": 1.0962, "step": 2944 }, { "epoch": 0.14, "grad_norm": 1.314753080722927, "learning_rate": 1.9355653915747668e-05, "loss": 1.3486, "step": 2945 }, { "epoch": 0.14, "grad_norm": 1.4003058438943607, "learning_rate": 1.9355103663661426e-05, "loss": 1.3301, "step": 2946 }, { "epoch": 0.14, "grad_norm": 1.145373178713599, "learning_rate": 1.935455318455433e-05, "loss": 1.3735, "step": 2947 }, { "epoch": 0.14, "grad_norm": 1.2853894647528195, "learning_rate": 1.935400247843974e-05, "loss": 1.0918, "step": 2948 }, { "epoch": 0.14, "grad_norm": 1.5485365002981988, "learning_rate": 1.935345154533102e-05, "loss": 1.2358, "step": 2949 }, { "epoch": 0.14, "grad_norm": 1.1168567865763082, "learning_rate": 1.9352900385241534e-05, "loss": 1.2935, "step": 2950 }, { "epoch": 0.14, "grad_norm": 1.4933172406138409, "learning_rate": 1.9352348998184664e-05, "loss": 1.3511, "step": 2951 }, { "epoch": 0.14, "grad_norm": 0.9939718091189073, "learning_rate": 1.9351797384173787e-05, "loss": 1.2573, "step": 2952 }, { "epoch": 0.14, "grad_norm": 1.1678461605219512, "learning_rate": 1.9351245543222292e-05, "loss": 1.2798, "step": 2953 }, { "epoch": 0.14, "grad_norm": 1.4173948098542624, "learning_rate": 1.935069347534357e-05, "loss": 1.3101, "step": 2954 }, { "epoch": 0.14, "grad_norm": 1.3002542311896452, "learning_rate": 1.9350141180551014e-05, "loss": 1.209, "step": 2955 }, { "epoch": 0.14, "grad_norm": 1.2297739833424703, "learning_rate": 1.934958865885803e-05, "loss": 1.1694, "step": 2956 }, { "epoch": 0.14, "grad_norm": 1.162918653752359, "learning_rate": 1.9349035910278027e-05, "loss": 1.2207, "step": 2957 }, { "epoch": 0.14, "grad_norm": 1.4716188212053143, "learning_rate": 1.9348482934824413e-05, "loss": 1.1709, "step": 2958 }, { "epoch": 0.14, "grad_norm": 1.1016008588384305, "learning_rate": 1.9347929732510614e-05, "loss": 1.1641, "step": 2959 }, { "epoch": 0.14, "grad_norm": 1.2505792230068842, "learning_rate": 1.9347376303350056e-05, "loss": 1.1233, "step": 2960 }, { "epoch": 0.14, "grad_norm": 1.150845883725662, "learning_rate": 1.9346822647356158e-05, "loss": 1.1294, "step": 2961 }, { "epoch": 0.14, "grad_norm": 1.5674348944751384, "learning_rate": 1.934626876454236e-05, "loss": 1.2881, "step": 2962 }, { "epoch": 0.14, "grad_norm": 1.2399475778333615, "learning_rate": 1.9345714654922112e-05, "loss": 1.1709, "step": 2963 }, { "epoch": 0.14, "grad_norm": 1.4075367647942583, "learning_rate": 1.9345160318508853e-05, "loss": 1.2729, "step": 2964 }, { "epoch": 0.14, "grad_norm": 1.774775440964435, "learning_rate": 1.9344605755316035e-05, "loss": 1.25, "step": 2965 }, { "epoch": 0.14, "grad_norm": 1.1854452926484105, "learning_rate": 1.9344050965357117e-05, "loss": 1.1855, "step": 2966 }, { "epoch": 0.14, "grad_norm": 1.57962807882461, "learning_rate": 1.9343495948645562e-05, "loss": 1.1899, "step": 2967 }, { "epoch": 0.14, "grad_norm": 1.249092047556739, "learning_rate": 1.9342940705194838e-05, "loss": 1.188, "step": 2968 }, { "epoch": 0.14, "grad_norm": 1.112430103836025, "learning_rate": 1.934238523501842e-05, "loss": 1.167, "step": 2969 }, { "epoch": 0.14, "grad_norm": 1.228684222359549, "learning_rate": 1.9341829538129787e-05, "loss": 1.2031, "step": 2970 }, { "epoch": 0.14, "grad_norm": 1.458894171360412, "learning_rate": 1.9341273614542427e-05, "loss": 1.1392, "step": 2971 }, { "epoch": 0.14, "grad_norm": 1.3175523296336085, "learning_rate": 1.9340717464269823e-05, "loss": 1.0886, "step": 2972 }, { "epoch": 0.14, "grad_norm": 1.2211189764412265, "learning_rate": 1.9340161087325483e-05, "loss": 1.1416, "step": 2973 }, { "epoch": 0.14, "grad_norm": 0.9998661829261598, "learning_rate": 1.9339604483722896e-05, "loss": 1.1709, "step": 2974 }, { "epoch": 0.14, "grad_norm": 1.4087249631872492, "learning_rate": 1.933904765347558e-05, "loss": 1.3682, "step": 2975 }, { "epoch": 0.14, "grad_norm": 1.3070761991045476, "learning_rate": 1.933849059659704e-05, "loss": 1.2407, "step": 2976 }, { "epoch": 0.14, "grad_norm": 1.6598449357375045, "learning_rate": 1.9337933313100793e-05, "loss": 1.3931, "step": 2977 }, { "epoch": 0.14, "grad_norm": 1.255462805231643, "learning_rate": 1.933737580300037e-05, "loss": 1.2251, "step": 2978 }, { "epoch": 0.14, "grad_norm": 1.3894189553612528, "learning_rate": 1.9336818066309297e-05, "loss": 1.2114, "step": 2979 }, { "epoch": 0.14, "grad_norm": 1.1741196141600498, "learning_rate": 1.9336260103041108e-05, "loss": 1.2334, "step": 2980 }, { "epoch": 0.14, "grad_norm": 1.4301647264329325, "learning_rate": 1.9335701913209342e-05, "loss": 1.2114, "step": 2981 }, { "epoch": 0.14, "grad_norm": 1.1921892421819977, "learning_rate": 1.9335143496827546e-05, "loss": 1.2368, "step": 2982 }, { "epoch": 0.14, "grad_norm": 1.1651282835990595, "learning_rate": 1.933458485390927e-05, "loss": 1.2944, "step": 2983 }, { "epoch": 0.14, "grad_norm": 1.6254933937416454, "learning_rate": 1.9334025984468075e-05, "loss": 1.4185, "step": 2984 }, { "epoch": 0.14, "grad_norm": 1.1498636995020743, "learning_rate": 1.9333466888517518e-05, "loss": 1.3096, "step": 2985 }, { "epoch": 0.14, "grad_norm": 1.317404524453681, "learning_rate": 1.9332907566071168e-05, "loss": 1.2007, "step": 2986 }, { "epoch": 0.14, "grad_norm": 1.3223614714945782, "learning_rate": 1.9332348017142598e-05, "loss": 1.0515, "step": 2987 }, { "epoch": 0.14, "grad_norm": 1.509033766563693, "learning_rate": 1.933178824174539e-05, "loss": 1.1147, "step": 2988 }, { "epoch": 0.14, "grad_norm": 1.4603826300014386, "learning_rate": 1.933122823989312e-05, "loss": 1.1323, "step": 2989 }, { "epoch": 0.14, "grad_norm": 1.2058278933823126, "learning_rate": 1.9330668011599388e-05, "loss": 1.1392, "step": 2990 }, { "epoch": 0.14, "grad_norm": 1.4506088713710252, "learning_rate": 1.933010755687778e-05, "loss": 1.3208, "step": 2991 }, { "epoch": 0.14, "grad_norm": 1.2862844930151287, "learning_rate": 1.9329546875741904e-05, "loss": 1.1802, "step": 2992 }, { "epoch": 0.14, "grad_norm": 1.227107346000838, "learning_rate": 1.932898596820536e-05, "loss": 1.1147, "step": 2993 }, { "epoch": 0.14, "grad_norm": 1.1452125720172628, "learning_rate": 1.9328424834281763e-05, "loss": 1.3101, "step": 2994 }, { "epoch": 0.14, "grad_norm": 1.231417952959909, "learning_rate": 1.932786347398473e-05, "loss": 1.2373, "step": 2995 }, { "epoch": 0.14, "grad_norm": 1.3664720052018267, "learning_rate": 1.932730188732788e-05, "loss": 1.314, "step": 2996 }, { "epoch": 0.14, "grad_norm": 1.1901767384229063, "learning_rate": 1.9326740074324846e-05, "loss": 1.168, "step": 2997 }, { "epoch": 0.14, "grad_norm": 1.4373672076317907, "learning_rate": 1.932617803498926e-05, "loss": 1.2534, "step": 2998 }, { "epoch": 0.14, "grad_norm": 1.1085511355035569, "learning_rate": 1.9325615769334755e-05, "loss": 1.2783, "step": 2999 }, { "epoch": 0.14, "grad_norm": 1.157548910940475, "learning_rate": 1.9325053277374986e-05, "loss": 1.1382, "step": 3000 }, { "epoch": 0.14, "grad_norm": 1.139544924294377, "learning_rate": 1.93244905591236e-05, "loss": 1.2803, "step": 3001 }, { "epoch": 0.14, "grad_norm": 1.2337387773548278, "learning_rate": 1.9323927614594245e-05, "loss": 1.2827, "step": 3002 }, { "epoch": 0.14, "grad_norm": 1.2006878656236861, "learning_rate": 1.932336444380059e-05, "loss": 1.3169, "step": 3003 }, { "epoch": 0.14, "grad_norm": 1.4002106224079722, "learning_rate": 1.93228010467563e-05, "loss": 1.1323, "step": 3004 }, { "epoch": 0.14, "grad_norm": 1.2013557568883364, "learning_rate": 1.9322237423475044e-05, "loss": 1.2974, "step": 3005 }, { "epoch": 0.14, "grad_norm": 0.5322585269629087, "learning_rate": 1.9321673573970502e-05, "loss": 1.1294, "step": 3006 }, { "epoch": 0.14, "grad_norm": 0.5322585269629087, "learning_rate": 1.9321673573970502e-05, "loss": 1.2109, "step": 3007 }, { "epoch": 0.14, "grad_norm": 0.5322585269629087, "learning_rate": 1.9321673573970502e-05, "loss": 1.3091, "step": 3008 }, { "epoch": 0.14, "grad_norm": 1.4317125877611152, "learning_rate": 1.932110949825636e-05, "loss": 1.3071, "step": 3009 }, { "epoch": 0.14, "grad_norm": 1.2030898317417371, "learning_rate": 1.9320545196346295e-05, "loss": 1.2334, "step": 3010 }, { "epoch": 0.14, "grad_norm": 1.282216751222411, "learning_rate": 1.9319980668254016e-05, "loss": 1.1538, "step": 3011 }, { "epoch": 0.14, "grad_norm": 1.3224892439898936, "learning_rate": 1.9319415913993214e-05, "loss": 1.2397, "step": 3012 }, { "epoch": 0.14, "grad_norm": 1.4633256072031364, "learning_rate": 1.9318850933577592e-05, "loss": 1.2607, "step": 3013 }, { "epoch": 0.14, "grad_norm": 1.3104788993236278, "learning_rate": 1.9318285727020867e-05, "loss": 1.2339, "step": 3014 }, { "epoch": 0.15, "grad_norm": 1.6216098802296015, "learning_rate": 1.9317720294336747e-05, "loss": 1.0896, "step": 3015 }, { "epoch": 0.15, "grad_norm": 1.225382644795014, "learning_rate": 1.9317154635538964e-05, "loss": 1.1567, "step": 3016 }, { "epoch": 0.15, "grad_norm": 1.04132229071204, "learning_rate": 1.9316588750641233e-05, "loss": 1.126, "step": 3017 }, { "epoch": 0.15, "grad_norm": 1.3100428993386624, "learning_rate": 1.9316022639657296e-05, "loss": 1.2412, "step": 3018 }, { "epoch": 0.15, "grad_norm": 1.3632123125628408, "learning_rate": 1.9315456302600885e-05, "loss": 1.2988, "step": 3019 }, { "epoch": 0.15, "grad_norm": 1.2558219669347763, "learning_rate": 1.9314889739485747e-05, "loss": 1.2681, "step": 3020 }, { "epoch": 0.15, "grad_norm": 1.800040031953831, "learning_rate": 1.931432295032563e-05, "loss": 1.3394, "step": 3021 }, { "epoch": 0.15, "grad_norm": 1.3846538708489489, "learning_rate": 1.9313755935134286e-05, "loss": 1.209, "step": 3022 }, { "epoch": 0.15, "grad_norm": 1.3409148319535897, "learning_rate": 1.9313188693925475e-05, "loss": 1.3911, "step": 3023 }, { "epoch": 0.15, "grad_norm": 1.5697181266106894, "learning_rate": 1.9312621226712962e-05, "loss": 1.2046, "step": 3024 }, { "epoch": 0.15, "grad_norm": 1.2183770340015154, "learning_rate": 1.9312053533510525e-05, "loss": 1.126, "step": 3025 }, { "epoch": 0.15, "grad_norm": 1.3590076274993719, "learning_rate": 1.9311485614331928e-05, "loss": 1.1152, "step": 3026 }, { "epoch": 0.15, "grad_norm": 1.079184566109758, "learning_rate": 1.9310917469190965e-05, "loss": 1.084, "step": 3027 }, { "epoch": 0.15, "grad_norm": 1.5564598895017048, "learning_rate": 1.9310349098101412e-05, "loss": 1.2583, "step": 3028 }, { "epoch": 0.15, "grad_norm": 1.1829024292556005, "learning_rate": 1.930978050107707e-05, "loss": 1.1938, "step": 3029 }, { "epoch": 0.15, "grad_norm": 1.4003044748318692, "learning_rate": 1.9309211678131733e-05, "loss": 1.209, "step": 3030 }, { "epoch": 0.15, "grad_norm": 1.2993091693882184, "learning_rate": 1.930864262927921e-05, "loss": 1.2217, "step": 3031 }, { "epoch": 0.15, "grad_norm": 1.6874019309077977, "learning_rate": 1.9308073354533302e-05, "loss": 1.2622, "step": 3032 }, { "epoch": 0.15, "grad_norm": 1.3656694550501622, "learning_rate": 1.9307503853907832e-05, "loss": 1.1724, "step": 3033 }, { "epoch": 0.15, "grad_norm": 1.131890075012984, "learning_rate": 1.930693412741661e-05, "loss": 1.1562, "step": 3034 }, { "epoch": 0.15, "grad_norm": 1.2308945910026752, "learning_rate": 1.9306364175073474e-05, "loss": 1.2085, "step": 3035 }, { "epoch": 0.15, "grad_norm": 1.1345609459101549, "learning_rate": 1.9305793996892244e-05, "loss": 1.2339, "step": 3036 }, { "epoch": 0.15, "grad_norm": 1.1888340436519842, "learning_rate": 1.930522359288676e-05, "loss": 1.1582, "step": 3037 }, { "epoch": 0.15, "grad_norm": 1.3400514732964528, "learning_rate": 1.9304652963070868e-05, "loss": 1.2197, "step": 3038 }, { "epoch": 0.15, "grad_norm": 1.4634561100482675, "learning_rate": 1.9304082107458412e-05, "loss": 1.293, "step": 3039 }, { "epoch": 0.15, "grad_norm": 1.3485909064911454, "learning_rate": 1.9303511026063244e-05, "loss": 1.125, "step": 3040 }, { "epoch": 0.15, "grad_norm": 1.2571479310049944, "learning_rate": 1.930293971889923e-05, "loss": 1.1006, "step": 3041 }, { "epoch": 0.15, "grad_norm": 1.749790476366772, "learning_rate": 1.9302368185980218e-05, "loss": 1.3413, "step": 3042 }, { "epoch": 0.15, "grad_norm": 1.3457894161381458, "learning_rate": 1.9301796427320093e-05, "loss": 1.249, "step": 3043 }, { "epoch": 0.15, "grad_norm": 1.3305815695479835, "learning_rate": 1.9301224442932725e-05, "loss": 1.2437, "step": 3044 }, { "epoch": 0.15, "grad_norm": 1.1352367869023638, "learning_rate": 1.930065223283199e-05, "loss": 1.208, "step": 3045 }, { "epoch": 0.15, "grad_norm": 1.302432789974811, "learning_rate": 1.930007979703178e-05, "loss": 1.2026, "step": 3046 }, { "epoch": 0.15, "grad_norm": 1.5346401054207415, "learning_rate": 1.9299507135545986e-05, "loss": 1.1372, "step": 3047 }, { "epoch": 0.15, "grad_norm": 1.2564328599429304, "learning_rate": 1.92989342483885e-05, "loss": 1.1987, "step": 3048 }, { "epoch": 0.15, "grad_norm": 1.3151406041977647, "learning_rate": 1.929836113557323e-05, "loss": 1.2031, "step": 3049 }, { "epoch": 0.15, "grad_norm": 1.3495106141485107, "learning_rate": 1.9297787797114078e-05, "loss": 1.2422, "step": 3050 }, { "epoch": 0.15, "grad_norm": 1.55584606587957, "learning_rate": 1.929721423302496e-05, "loss": 1.2715, "step": 3051 }, { "epoch": 0.15, "grad_norm": 1.2290724009427396, "learning_rate": 1.9296640443319793e-05, "loss": 1.1138, "step": 3052 }, { "epoch": 0.15, "grad_norm": 1.3271959717051078, "learning_rate": 1.9296066428012508e-05, "loss": 1.167, "step": 3053 }, { "epoch": 0.15, "grad_norm": 1.19984137185531, "learning_rate": 1.9295492187117025e-05, "loss": 1.2256, "step": 3054 }, { "epoch": 0.15, "grad_norm": 1.732776271032898, "learning_rate": 1.9294917720647287e-05, "loss": 1.2617, "step": 3055 }, { "epoch": 0.15, "grad_norm": 1.2120947721655226, "learning_rate": 1.929434302861723e-05, "loss": 1.0796, "step": 3056 }, { "epoch": 0.15, "grad_norm": 1.1441187832211681, "learning_rate": 1.92937681110408e-05, "loss": 1.3364, "step": 3057 }, { "epoch": 0.15, "grad_norm": 1.8905853520566493, "learning_rate": 1.929319296793195e-05, "loss": 1.3657, "step": 3058 }, { "epoch": 0.15, "grad_norm": 1.1199066147268264, "learning_rate": 1.929261759930464e-05, "loss": 1.1475, "step": 3059 }, { "epoch": 0.15, "grad_norm": 1.4527253714544248, "learning_rate": 1.9292042005172823e-05, "loss": 1.186, "step": 3060 }, { "epoch": 0.15, "grad_norm": 1.3284500574964342, "learning_rate": 1.9291466185550482e-05, "loss": 1.2617, "step": 3061 }, { "epoch": 0.15, "grad_norm": 1.245207818308442, "learning_rate": 1.9290890140451576e-05, "loss": 1.2632, "step": 3062 }, { "epoch": 0.15, "grad_norm": 1.6666940216527228, "learning_rate": 1.929031386989009e-05, "loss": 1.231, "step": 3063 }, { "epoch": 0.15, "grad_norm": 1.503844207028459, "learning_rate": 1.928973737388001e-05, "loss": 1.2227, "step": 3064 }, { "epoch": 0.15, "grad_norm": 1.3121861701723743, "learning_rate": 1.9289160652435326e-05, "loss": 1.3374, "step": 3065 }, { "epoch": 0.15, "grad_norm": 1.242909065753283, "learning_rate": 1.9288583705570026e-05, "loss": 1.1406, "step": 3066 }, { "epoch": 0.15, "grad_norm": 1.1336518066320826, "learning_rate": 1.928800653329812e-05, "loss": 1.1514, "step": 3067 }, { "epoch": 0.15, "grad_norm": 1.3418252311107524, "learning_rate": 1.928742913563361e-05, "loss": 1.3364, "step": 3068 }, { "epoch": 0.15, "grad_norm": 1.3156332343820873, "learning_rate": 1.928685151259051e-05, "loss": 1.3154, "step": 3069 }, { "epoch": 0.15, "grad_norm": 1.142414407158179, "learning_rate": 1.9286273664182832e-05, "loss": 1.1123, "step": 3070 }, { "epoch": 0.15, "grad_norm": 1.111022767972263, "learning_rate": 1.9285695590424604e-05, "loss": 1.3198, "step": 3071 }, { "epoch": 0.15, "grad_norm": 1.2429427554912134, "learning_rate": 1.9285117291329853e-05, "loss": 1.2422, "step": 3072 }, { "epoch": 0.15, "grad_norm": 1.2033406194319491, "learning_rate": 1.9284538766912608e-05, "loss": 1.2271, "step": 3073 }, { "epoch": 0.15, "grad_norm": 1.1394314698421877, "learning_rate": 1.9283960017186916e-05, "loss": 1.1772, "step": 3074 }, { "epoch": 0.15, "grad_norm": 1.1019240550098173, "learning_rate": 1.928338104216682e-05, "loss": 1.2744, "step": 3075 }, { "epoch": 0.15, "grad_norm": 1.2161661553620706, "learning_rate": 1.928280184186636e-05, "loss": 1.3145, "step": 3076 }, { "epoch": 0.15, "grad_norm": 0.9252672391972474, "learning_rate": 1.9282222416299604e-05, "loss": 1.2163, "step": 3077 }, { "epoch": 0.15, "grad_norm": 1.4118151234177778, "learning_rate": 1.9281642765480605e-05, "loss": 1.1519, "step": 3078 }, { "epoch": 0.15, "grad_norm": 1.066121836601918, "learning_rate": 1.9281062889423436e-05, "loss": 1.1865, "step": 3079 }, { "epoch": 0.15, "grad_norm": 1.2096180533259848, "learning_rate": 1.9280482788142162e-05, "loss": 1.1118, "step": 3080 }, { "epoch": 0.15, "grad_norm": 0.9950144971291458, "learning_rate": 1.9279902461650866e-05, "loss": 1.1953, "step": 3081 }, { "epoch": 0.15, "grad_norm": 1.4284363977199883, "learning_rate": 1.9279321909963627e-05, "loss": 1.3081, "step": 3082 }, { "epoch": 0.15, "grad_norm": 1.324176406180008, "learning_rate": 1.9278741133094535e-05, "loss": 1.1279, "step": 3083 }, { "epoch": 0.15, "grad_norm": 1.1550629392408664, "learning_rate": 1.9278160131057686e-05, "loss": 1.2085, "step": 3084 }, { "epoch": 0.15, "grad_norm": 1.4150246774139525, "learning_rate": 1.9277578903867174e-05, "loss": 1.2227, "step": 3085 }, { "epoch": 0.15, "grad_norm": 1.3001586835067318, "learning_rate": 1.9276997451537107e-05, "loss": 1.147, "step": 3086 }, { "epoch": 0.15, "grad_norm": 1.1953763853986643, "learning_rate": 1.9276415774081593e-05, "loss": 1.2827, "step": 3087 }, { "epoch": 0.15, "grad_norm": 1.4822112528539182, "learning_rate": 1.927583387151475e-05, "loss": 1.29, "step": 3088 }, { "epoch": 0.15, "grad_norm": 1.198334612973982, "learning_rate": 1.92752517438507e-05, "loss": 1.1816, "step": 3089 }, { "epoch": 0.15, "grad_norm": 1.4156043323848013, "learning_rate": 1.9274669391103567e-05, "loss": 1.438, "step": 3090 }, { "epoch": 0.15, "grad_norm": 1.3182684786654073, "learning_rate": 1.9274086813287484e-05, "loss": 1.1768, "step": 3091 }, { "epoch": 0.15, "grad_norm": 1.5091170899007402, "learning_rate": 1.9273504010416586e-05, "loss": 1.2178, "step": 3092 }, { "epoch": 0.15, "grad_norm": 1.124929640265413, "learning_rate": 1.927292098250502e-05, "loss": 1.2051, "step": 3093 }, { "epoch": 0.15, "grad_norm": 1.1630835115651765, "learning_rate": 1.9272337729566933e-05, "loss": 1.0693, "step": 3094 }, { "epoch": 0.15, "grad_norm": 1.3969823191465958, "learning_rate": 1.9271754251616475e-05, "loss": 1.3315, "step": 3095 }, { "epoch": 0.15, "grad_norm": 1.363496141846321, "learning_rate": 1.927117054866781e-05, "loss": 1.1553, "step": 3096 }, { "epoch": 0.15, "grad_norm": 0.9245227065643407, "learning_rate": 1.9270586620735102e-05, "loss": 1.1733, "step": 3097 }, { "epoch": 0.15, "grad_norm": 1.7068560028724749, "learning_rate": 1.927000246783252e-05, "loss": 1.3232, "step": 3098 }, { "epoch": 0.15, "grad_norm": 1.2594261245120757, "learning_rate": 1.926941808997424e-05, "loss": 1.1392, "step": 3099 }, { "epoch": 0.15, "grad_norm": 1.1124782573931644, "learning_rate": 1.9268833487174447e-05, "loss": 1.2847, "step": 3100 }, { "epoch": 0.15, "grad_norm": 1.3143097613782713, "learning_rate": 1.926824865944732e-05, "loss": 1.1865, "step": 3101 }, { "epoch": 0.15, "grad_norm": 1.4781615678028057, "learning_rate": 1.9267663606807055e-05, "loss": 1.272, "step": 3102 }, { "epoch": 0.15, "grad_norm": 1.1998999541658504, "learning_rate": 1.9267078329267853e-05, "loss": 1.2451, "step": 3103 }, { "epoch": 0.15, "grad_norm": 1.4414163284251973, "learning_rate": 1.926649282684391e-05, "loss": 1.2676, "step": 3104 }, { "epoch": 0.15, "grad_norm": 1.3749570973261536, "learning_rate": 1.9265907099549438e-05, "loss": 1.2466, "step": 3105 }, { "epoch": 0.15, "grad_norm": 1.155045251366464, "learning_rate": 1.926532114739865e-05, "loss": 1.2646, "step": 3106 }, { "epoch": 0.15, "grad_norm": 1.5687319012631566, "learning_rate": 1.926473497040577e-05, "loss": 1.3579, "step": 3107 }, { "epoch": 0.15, "grad_norm": 1.4297536606604089, "learning_rate": 1.9264148568585013e-05, "loss": 1.2329, "step": 3108 }, { "epoch": 0.15, "grad_norm": 1.0374112589216304, "learning_rate": 1.9263561941950622e-05, "loss": 1.0117, "step": 3109 }, { "epoch": 0.15, "grad_norm": 1.4395775548875753, "learning_rate": 1.926297509051682e-05, "loss": 1.2954, "step": 3110 }, { "epoch": 0.15, "grad_norm": 1.3459763310597939, "learning_rate": 1.926238801429786e-05, "loss": 1.0918, "step": 3111 }, { "epoch": 0.15, "grad_norm": 1.2439806121689414, "learning_rate": 1.926180071330798e-05, "loss": 1.354, "step": 3112 }, { "epoch": 0.15, "grad_norm": 1.3279591559033619, "learning_rate": 1.9261213187561433e-05, "loss": 1.2417, "step": 3113 }, { "epoch": 0.15, "grad_norm": 1.2676529333547255, "learning_rate": 1.926062543707248e-05, "loss": 1.2271, "step": 3114 }, { "epoch": 0.15, "grad_norm": 1.2748020755836789, "learning_rate": 1.926003746185538e-05, "loss": 1.2222, "step": 3115 }, { "epoch": 0.15, "grad_norm": 1.267040859295656, "learning_rate": 1.9259449261924405e-05, "loss": 1.2974, "step": 3116 }, { "epoch": 0.15, "grad_norm": 1.0271989735355551, "learning_rate": 1.9258860837293824e-05, "loss": 1.2529, "step": 3117 }, { "epoch": 0.15, "grad_norm": 1.792449229331658, "learning_rate": 1.9258272187977924e-05, "loss": 1.4697, "step": 3118 }, { "epoch": 0.15, "grad_norm": 1.2440361952478378, "learning_rate": 1.9257683313990984e-05, "loss": 1.2104, "step": 3119 }, { "epoch": 0.15, "grad_norm": 1.3422039327856625, "learning_rate": 1.9257094215347298e-05, "loss": 1.2109, "step": 3120 }, { "epoch": 0.15, "grad_norm": 1.0496451227909271, "learning_rate": 1.9256504892061156e-05, "loss": 1.1948, "step": 3121 }, { "epoch": 0.15, "grad_norm": 1.1990741599890107, "learning_rate": 1.9255915344146865e-05, "loss": 1.2148, "step": 3122 }, { "epoch": 0.15, "grad_norm": 1.2063977158147765, "learning_rate": 1.9255325571618728e-05, "loss": 1.2446, "step": 3123 }, { "epoch": 0.15, "grad_norm": 1.3698891231489732, "learning_rate": 1.925473557449106e-05, "loss": 1.0845, "step": 3124 }, { "epoch": 0.15, "grad_norm": 1.5576477536123707, "learning_rate": 1.9254145352778176e-05, "loss": 1.2734, "step": 3125 }, { "epoch": 0.15, "grad_norm": 1.139878549160004, "learning_rate": 1.92535549064944e-05, "loss": 1.2612, "step": 3126 }, { "epoch": 0.15, "grad_norm": 1.2229965694378526, "learning_rate": 1.9252964235654058e-05, "loss": 1.1245, "step": 3127 }, { "epoch": 0.15, "grad_norm": 1.373472162683065, "learning_rate": 1.925237334027149e-05, "loss": 1.2461, "step": 3128 }, { "epoch": 0.15, "grad_norm": 1.1179867050881462, "learning_rate": 1.9251782220361027e-05, "loss": 1.2178, "step": 3129 }, { "epoch": 0.15, "grad_norm": 0.9395920992359477, "learning_rate": 1.9251190875937024e-05, "loss": 1.2295, "step": 3130 }, { "epoch": 0.15, "grad_norm": 1.4135131924545745, "learning_rate": 1.925059930701382e-05, "loss": 1.2373, "step": 3131 }, { "epoch": 0.15, "grad_norm": 1.0546772182750832, "learning_rate": 1.9250007513605776e-05, "loss": 1.3164, "step": 3132 }, { "epoch": 0.15, "grad_norm": 1.1336359511783607, "learning_rate": 1.9249415495727252e-05, "loss": 1.1582, "step": 3133 }, { "epoch": 0.15, "grad_norm": 1.2106308581225789, "learning_rate": 1.924882325339262e-05, "loss": 1.0952, "step": 3134 }, { "epoch": 0.15, "grad_norm": 1.3427650642482178, "learning_rate": 1.9248230786616244e-05, "loss": 1.2563, "step": 3135 }, { "epoch": 0.15, "grad_norm": 1.286248344493991, "learning_rate": 1.9247638095412508e-05, "loss": 1.2607, "step": 3136 }, { "epoch": 0.15, "grad_norm": 1.2003174582953844, "learning_rate": 1.9247045179795788e-05, "loss": 1.4019, "step": 3137 }, { "epoch": 0.15, "grad_norm": 1.46540067797969, "learning_rate": 1.924645203978048e-05, "loss": 1.2788, "step": 3138 }, { "epoch": 0.15, "grad_norm": 1.4160554516427803, "learning_rate": 1.924585867538097e-05, "loss": 1.0605, "step": 3139 }, { "epoch": 0.15, "grad_norm": 1.0421158950573985, "learning_rate": 1.924526508661166e-05, "loss": 1.2295, "step": 3140 }, { "epoch": 0.15, "grad_norm": 1.2714392998800266, "learning_rate": 1.9244671273486962e-05, "loss": 1.1147, "step": 3141 }, { "epoch": 0.15, "grad_norm": 1.2094020607601468, "learning_rate": 1.9244077236021273e-05, "loss": 1.2188, "step": 3142 }, { "epoch": 0.15, "grad_norm": 1.1503982152000995, "learning_rate": 1.924348297422902e-05, "loss": 1.1938, "step": 3143 }, { "epoch": 0.15, "grad_norm": 1.154485425668514, "learning_rate": 1.924288848812462e-05, "loss": 1.1836, "step": 3144 }, { "epoch": 0.15, "grad_norm": 1.7466091631814367, "learning_rate": 1.9242293777722496e-05, "loss": 1.4111, "step": 3145 }, { "epoch": 0.15, "grad_norm": 1.3134332695684225, "learning_rate": 1.9241698843037083e-05, "loss": 1.3667, "step": 3146 }, { "epoch": 0.15, "grad_norm": 1.4891651862530928, "learning_rate": 1.9241103684082815e-05, "loss": 1.3564, "step": 3147 }, { "epoch": 0.15, "grad_norm": 1.3313770338366688, "learning_rate": 1.9240508300874145e-05, "loss": 1.2793, "step": 3148 }, { "epoch": 0.15, "grad_norm": 1.2272705064888552, "learning_rate": 1.9239912693425506e-05, "loss": 1.0977, "step": 3149 }, { "epoch": 0.15, "grad_norm": 1.2976749950124493, "learning_rate": 1.9239316861751365e-05, "loss": 1.3325, "step": 3150 }, { "epoch": 0.15, "grad_norm": 1.2021299616085872, "learning_rate": 1.9238720805866174e-05, "loss": 1.1675, "step": 3151 }, { "epoch": 0.15, "grad_norm": 1.2187129141954007, "learning_rate": 1.92381245257844e-05, "loss": 1.2246, "step": 3152 }, { "epoch": 0.15, "grad_norm": 1.3049319861624888, "learning_rate": 1.9237528021520512e-05, "loss": 1.0996, "step": 3153 }, { "epoch": 0.15, "grad_norm": 1.2211786156344366, "learning_rate": 1.9236931293088982e-05, "loss": 1.165, "step": 3154 }, { "epoch": 0.15, "grad_norm": 1.3603753566213534, "learning_rate": 1.9236334340504298e-05, "loss": 1.23, "step": 3155 }, { "epoch": 0.15, "grad_norm": 1.2725965678540356, "learning_rate": 1.923573716378094e-05, "loss": 1.1802, "step": 3156 }, { "epoch": 0.15, "grad_norm": 1.2838763776741542, "learning_rate": 1.9235139762933402e-05, "loss": 1.3286, "step": 3157 }, { "epoch": 0.15, "grad_norm": 1.6352114820112145, "learning_rate": 1.9234542137976184e-05, "loss": 1.2998, "step": 3158 }, { "epoch": 0.15, "grad_norm": 1.2617672335639216, "learning_rate": 1.9233944288923788e-05, "loss": 1.2168, "step": 3159 }, { "epoch": 0.15, "grad_norm": 1.258115143542949, "learning_rate": 1.9233346215790717e-05, "loss": 1.2354, "step": 3160 }, { "epoch": 0.15, "grad_norm": 1.3685587732601576, "learning_rate": 1.9232747918591488e-05, "loss": 1.1846, "step": 3161 }, { "epoch": 0.15, "grad_norm": 1.3526818515933696, "learning_rate": 1.923214939734062e-05, "loss": 1.2617, "step": 3162 }, { "epoch": 0.15, "grad_norm": 0.9608276419574779, "learning_rate": 1.9231550652052635e-05, "loss": 1.2012, "step": 3163 }, { "epoch": 0.15, "grad_norm": 1.2813981932115546, "learning_rate": 1.9230951682742066e-05, "loss": 1.1704, "step": 3164 }, { "epoch": 0.15, "grad_norm": 1.728844453850068, "learning_rate": 1.923035248942345e-05, "loss": 1.2803, "step": 3165 }, { "epoch": 0.15, "grad_norm": 1.3070559898785106, "learning_rate": 1.9229753072111325e-05, "loss": 1.3267, "step": 3166 }, { "epoch": 0.15, "grad_norm": 1.4232696991712674, "learning_rate": 1.9229153430820232e-05, "loss": 1.2114, "step": 3167 }, { "epoch": 0.15, "grad_norm": 1.1294274047683792, "learning_rate": 1.9228553565564728e-05, "loss": 1.0342, "step": 3168 }, { "epoch": 0.15, "grad_norm": 1.1525029386314793, "learning_rate": 1.922795347635937e-05, "loss": 1.2515, "step": 3169 }, { "epoch": 0.15, "grad_norm": 1.2001799199146916, "learning_rate": 1.922735316321872e-05, "loss": 1.249, "step": 3170 }, { "epoch": 0.15, "grad_norm": 1.109044445476171, "learning_rate": 1.9226752626157345e-05, "loss": 1.2275, "step": 3171 }, { "epoch": 0.15, "grad_norm": 1.0981249628319258, "learning_rate": 1.922615186518982e-05, "loss": 1.1602, "step": 3172 }, { "epoch": 0.15, "grad_norm": 1.2357928005924046, "learning_rate": 1.9225550880330718e-05, "loss": 1.293, "step": 3173 }, { "epoch": 0.15, "grad_norm": 1.7749866974864135, "learning_rate": 1.9224949671594633e-05, "loss": 1.166, "step": 3174 }, { "epoch": 0.15, "grad_norm": 1.581812927274863, "learning_rate": 1.9224348238996146e-05, "loss": 1.1699, "step": 3175 }, { "epoch": 0.15, "grad_norm": 0.9773446428989382, "learning_rate": 1.9223746582549853e-05, "loss": 1.0918, "step": 3176 }, { "epoch": 0.15, "grad_norm": 1.4359502768822099, "learning_rate": 1.922314470227036e-05, "loss": 1.2734, "step": 3177 }, { "epoch": 0.15, "grad_norm": 1.1911741162362157, "learning_rate": 1.9222542598172268e-05, "loss": 1.1494, "step": 3178 }, { "epoch": 0.15, "grad_norm": 1.1722405371531426, "learning_rate": 1.922194027027019e-05, "loss": 1.1399, "step": 3179 }, { "epoch": 0.15, "grad_norm": 1.48895139079885, "learning_rate": 1.9221337718578744e-05, "loss": 1.2114, "step": 3180 }, { "epoch": 0.15, "grad_norm": 1.1132858933528669, "learning_rate": 1.922073494311255e-05, "loss": 1.1177, "step": 3181 }, { "epoch": 0.15, "grad_norm": 1.1976209934127564, "learning_rate": 1.9220131943886232e-05, "loss": 1.1191, "step": 3182 }, { "epoch": 0.15, "grad_norm": 1.3777538622926346, "learning_rate": 1.9219528720914432e-05, "loss": 1.1719, "step": 3183 }, { "epoch": 0.15, "grad_norm": 1.211925304529795, "learning_rate": 1.921892527421178e-05, "loss": 1.2861, "step": 3184 }, { "epoch": 0.15, "grad_norm": 0.984252346658007, "learning_rate": 1.9218321603792928e-05, "loss": 1.1294, "step": 3185 }, { "epoch": 0.15, "grad_norm": 1.5122573943301294, "learning_rate": 1.921771770967252e-05, "loss": 1.2056, "step": 3186 }, { "epoch": 0.15, "grad_norm": 1.1989577976770938, "learning_rate": 1.921711359186521e-05, "loss": 1.1526, "step": 3187 }, { "epoch": 0.15, "grad_norm": 1.0578294558765375, "learning_rate": 1.921650925038566e-05, "loss": 1.0903, "step": 3188 }, { "epoch": 0.15, "grad_norm": 1.5652049626009776, "learning_rate": 1.9215904685248534e-05, "loss": 1.1367, "step": 3189 }, { "epoch": 0.15, "grad_norm": 1.2948561383581576, "learning_rate": 1.9215299896468503e-05, "loss": 1.2461, "step": 3190 }, { "epoch": 0.15, "grad_norm": 1.2023739370529565, "learning_rate": 1.9214694884060248e-05, "loss": 1.2451, "step": 3191 }, { "epoch": 0.15, "grad_norm": 1.2530020045322814, "learning_rate": 1.9214089648038446e-05, "loss": 1.0586, "step": 3192 }, { "epoch": 0.15, "grad_norm": 1.2231694711019683, "learning_rate": 1.9213484188417788e-05, "loss": 1.2256, "step": 3193 }, { "epoch": 0.15, "grad_norm": 1.3042288474627586, "learning_rate": 1.921287850521296e-05, "loss": 1.2632, "step": 3194 }, { "epoch": 0.15, "grad_norm": 1.2633247232384919, "learning_rate": 1.921227259843867e-05, "loss": 1.4316, "step": 3195 }, { "epoch": 0.15, "grad_norm": 1.5081471647328097, "learning_rate": 1.9211666468109612e-05, "loss": 1.1204, "step": 3196 }, { "epoch": 0.15, "grad_norm": 1.354955238405076, "learning_rate": 1.9211060114240503e-05, "loss": 1.3384, "step": 3197 }, { "epoch": 0.15, "grad_norm": 1.3654795336445933, "learning_rate": 1.9210453536846053e-05, "loss": 1.1992, "step": 3198 }, { "epoch": 0.15, "grad_norm": 1.4513065860578782, "learning_rate": 1.920984673594098e-05, "loss": 1.3887, "step": 3199 }, { "epoch": 0.15, "grad_norm": 1.3096306225998307, "learning_rate": 1.9209239711540014e-05, "loss": 1.2192, "step": 3200 }, { "epoch": 0.15, "grad_norm": 1.4120287328633923, "learning_rate": 1.9208632463657885e-05, "loss": 1.3345, "step": 3201 }, { "epoch": 0.15, "grad_norm": 1.197070760340014, "learning_rate": 1.9208024992309325e-05, "loss": 1.207, "step": 3202 }, { "epoch": 0.15, "grad_norm": 1.511202019841012, "learning_rate": 1.920741729750908e-05, "loss": 1.1987, "step": 3203 }, { "epoch": 0.15, "grad_norm": 1.4149024814629425, "learning_rate": 1.9206809379271892e-05, "loss": 1.0085, "step": 3204 }, { "epoch": 0.15, "grad_norm": 1.1330514407917776, "learning_rate": 1.920620123761252e-05, "loss": 1.2007, "step": 3205 }, { "epoch": 0.15, "grad_norm": 1.1489426366975175, "learning_rate": 1.920559287254572e-05, "loss": 1.0903, "step": 3206 }, { "epoch": 0.15, "grad_norm": 1.3760573644115506, "learning_rate": 1.920498428408625e-05, "loss": 1.2456, "step": 3207 }, { "epoch": 0.15, "grad_norm": 1.2645293339026886, "learning_rate": 1.9204375472248885e-05, "loss": 1.0312, "step": 3208 }, { "epoch": 0.15, "grad_norm": 1.2354487232065399, "learning_rate": 1.9203766437048395e-05, "loss": 1.2031, "step": 3209 }, { "epoch": 0.15, "grad_norm": 1.2034704083831596, "learning_rate": 1.920315717849956e-05, "loss": 1.0881, "step": 3210 }, { "epoch": 0.15, "grad_norm": 1.151322822918863, "learning_rate": 1.9202547696617165e-05, "loss": 1.2607, "step": 3211 }, { "epoch": 0.15, "grad_norm": 1.2702232168040244, "learning_rate": 1.9201937991416003e-05, "loss": 1.2212, "step": 3212 }, { "epoch": 0.15, "grad_norm": 1.3044355176258788, "learning_rate": 1.920132806291087e-05, "loss": 1.1333, "step": 3213 }, { "epoch": 0.15, "grad_norm": 1.041827686713765, "learning_rate": 1.9200717911116564e-05, "loss": 1.1729, "step": 3214 }, { "epoch": 0.15, "grad_norm": 1.3204559185451559, "learning_rate": 1.920010753604789e-05, "loss": 1.3037, "step": 3215 }, { "epoch": 0.15, "grad_norm": 1.0896720324648572, "learning_rate": 1.9199496937719663e-05, "loss": 1.229, "step": 3216 }, { "epoch": 0.15, "grad_norm": 0.9805985214621347, "learning_rate": 1.91988861161467e-05, "loss": 1.1577, "step": 3217 }, { "epoch": 0.15, "grad_norm": 1.4020151660195619, "learning_rate": 1.9198275071343827e-05, "loss": 1.2295, "step": 3218 }, { "epoch": 0.15, "grad_norm": 1.2302906356804546, "learning_rate": 1.9197663803325867e-05, "loss": 1.1348, "step": 3219 }, { "epoch": 0.15, "grad_norm": 1.3100800663201668, "learning_rate": 1.9197052312107655e-05, "loss": 1.1738, "step": 3220 }, { "epoch": 0.15, "grad_norm": 1.3234478958093208, "learning_rate": 1.9196440597704033e-05, "loss": 1.3457, "step": 3221 }, { "epoch": 0.15, "grad_norm": 1.8482811441299494, "learning_rate": 1.9195828660129842e-05, "loss": 1.2188, "step": 3222 }, { "epoch": 0.16, "grad_norm": 1.227203709863614, "learning_rate": 1.9195216499399932e-05, "loss": 1.2363, "step": 3223 }, { "epoch": 0.16, "grad_norm": 1.3155049772863951, "learning_rate": 1.9194604115529163e-05, "loss": 1.2314, "step": 3224 }, { "epoch": 0.16, "grad_norm": 1.4861482994324133, "learning_rate": 1.919399150853239e-05, "loss": 1.3667, "step": 3225 }, { "epoch": 0.16, "grad_norm": 1.128708611271439, "learning_rate": 1.9193378678424484e-05, "loss": 1.3989, "step": 3226 }, { "epoch": 0.16, "grad_norm": 1.1681202428452513, "learning_rate": 1.9192765625220312e-05, "loss": 1.0029, "step": 3227 }, { "epoch": 0.16, "grad_norm": 1.425401977458417, "learning_rate": 1.9192152348934753e-05, "loss": 1.353, "step": 3228 }, { "epoch": 0.16, "grad_norm": 1.0801798752729985, "learning_rate": 1.919153884958269e-05, "loss": 1.0732, "step": 3229 }, { "epoch": 0.16, "grad_norm": 1.0796260488188545, "learning_rate": 1.9190925127179013e-05, "loss": 1.1543, "step": 3230 }, { "epoch": 0.16, "grad_norm": 1.2012466703980946, "learning_rate": 1.919031118173861e-05, "loss": 1.3218, "step": 3231 }, { "epoch": 0.16, "grad_norm": 1.1982596996210904, "learning_rate": 1.9189697013276386e-05, "loss": 1.229, "step": 3232 }, { "epoch": 0.16, "grad_norm": 1.2499002436871722, "learning_rate": 1.9189082621807235e-05, "loss": 1.0449, "step": 3233 }, { "epoch": 0.16, "grad_norm": 1.3996070553498183, "learning_rate": 1.918846800734608e-05, "loss": 1.2314, "step": 3234 }, { "epoch": 0.16, "grad_norm": 0.915790367861212, "learning_rate": 1.9187853169907824e-05, "loss": 1.1587, "step": 3235 }, { "epoch": 0.16, "grad_norm": 1.436699776015021, "learning_rate": 1.9187238109507393e-05, "loss": 1.2861, "step": 3236 }, { "epoch": 0.16, "grad_norm": 1.2735185512935436, "learning_rate": 1.918662282615971e-05, "loss": 1.2129, "step": 3237 }, { "epoch": 0.16, "grad_norm": 1.0077080237119949, "learning_rate": 1.9186007319879714e-05, "loss": 1.0166, "step": 3238 }, { "epoch": 0.16, "grad_norm": 1.0663724828074437, "learning_rate": 1.918539159068233e-05, "loss": 1.2383, "step": 3239 }, { "epoch": 0.16, "grad_norm": 1.2079140395571206, "learning_rate": 1.9184775638582508e-05, "loss": 1.1831, "step": 3240 }, { "epoch": 0.16, "grad_norm": 1.2289598940511517, "learning_rate": 1.918415946359519e-05, "loss": 1.3481, "step": 3241 }, { "epoch": 0.16, "grad_norm": 1.286769503110253, "learning_rate": 1.918354306573533e-05, "loss": 1.106, "step": 3242 }, { "epoch": 0.16, "grad_norm": 1.6097444218054162, "learning_rate": 1.9182926445017893e-05, "loss": 1.146, "step": 3243 }, { "epoch": 0.16, "grad_norm": 1.1518526248298382, "learning_rate": 1.9182309601457837e-05, "loss": 1.1851, "step": 3244 }, { "epoch": 0.16, "grad_norm": 1.2750674550853311, "learning_rate": 1.9181692535070128e-05, "loss": 1.2788, "step": 3245 }, { "epoch": 0.16, "grad_norm": 1.1901764445950018, "learning_rate": 1.9181075245869744e-05, "loss": 1.1104, "step": 3246 }, { "epoch": 0.16, "grad_norm": 1.4533642433564156, "learning_rate": 1.9180457733871666e-05, "loss": 1.4009, "step": 3247 }, { "epoch": 0.16, "grad_norm": 1.4421781102816449, "learning_rate": 1.9179839999090874e-05, "loss": 1.085, "step": 3248 }, { "epoch": 0.16, "grad_norm": 1.2823452074289379, "learning_rate": 1.9179222041542366e-05, "loss": 1.1577, "step": 3249 }, { "epoch": 0.16, "grad_norm": 1.3410178346160073, "learning_rate": 1.9178603861241133e-05, "loss": 1.1592, "step": 3250 }, { "epoch": 0.16, "grad_norm": 1.1848751584645898, "learning_rate": 1.917798545820218e-05, "loss": 1.1577, "step": 3251 }, { "epoch": 0.16, "grad_norm": 1.169865695684074, "learning_rate": 1.9177366832440505e-05, "loss": 1.188, "step": 3252 }, { "epoch": 0.16, "grad_norm": 1.311134508701534, "learning_rate": 1.917674798397113e-05, "loss": 1.1409, "step": 3253 }, { "epoch": 0.16, "grad_norm": 1.3934566916824618, "learning_rate": 1.917612891280907e-05, "loss": 1.0288, "step": 3254 }, { "epoch": 0.16, "grad_norm": 1.4714124528990467, "learning_rate": 1.9175509618969347e-05, "loss": 1.2725, "step": 3255 }, { "epoch": 0.16, "grad_norm": 1.2340181903968557, "learning_rate": 1.917489010246699e-05, "loss": 1.2856, "step": 3256 }, { "epoch": 0.16, "grad_norm": 1.2377763824761008, "learning_rate": 1.917427036331703e-05, "loss": 1.0815, "step": 3257 }, { "epoch": 0.16, "grad_norm": 1.2362250461104585, "learning_rate": 1.9173650401534514e-05, "loss": 1.1367, "step": 3258 }, { "epoch": 0.16, "grad_norm": 1.340782819488919, "learning_rate": 1.917303021713448e-05, "loss": 1.2627, "step": 3259 }, { "epoch": 0.16, "grad_norm": 0.9819302563156266, "learning_rate": 1.9172409810131975e-05, "loss": 1.1992, "step": 3260 }, { "epoch": 0.16, "grad_norm": 1.138762679077594, "learning_rate": 1.9171789180542066e-05, "loss": 1.3281, "step": 3261 }, { "epoch": 0.16, "grad_norm": 1.160705982941046, "learning_rate": 1.9171168328379803e-05, "loss": 1.2065, "step": 3262 }, { "epoch": 0.16, "grad_norm": 1.1628760230182205, "learning_rate": 1.9170547253660253e-05, "loss": 1.1631, "step": 3263 }, { "epoch": 0.16, "grad_norm": 1.3011917238331285, "learning_rate": 1.9169925956398497e-05, "loss": 1.2812, "step": 3264 }, { "epoch": 0.16, "grad_norm": 1.1834111112133472, "learning_rate": 1.91693044366096e-05, "loss": 1.1431, "step": 3265 }, { "epoch": 0.16, "grad_norm": 1.4338876265972165, "learning_rate": 1.9168682694308654e-05, "loss": 1.2358, "step": 3266 }, { "epoch": 0.16, "grad_norm": 1.301993699706082, "learning_rate": 1.9168060729510742e-05, "loss": 1.2573, "step": 3267 }, { "epoch": 0.16, "grad_norm": 1.141202197193616, "learning_rate": 1.916743854223096e-05, "loss": 1.1621, "step": 3268 }, { "epoch": 0.16, "grad_norm": 1.2959983736052776, "learning_rate": 1.9166816132484404e-05, "loss": 1.1743, "step": 3269 }, { "epoch": 0.16, "grad_norm": 1.0994182877608496, "learning_rate": 1.9166193500286177e-05, "loss": 1.2266, "step": 3270 }, { "epoch": 0.16, "grad_norm": 1.2729245894586005, "learning_rate": 1.9165570645651392e-05, "loss": 1.3574, "step": 3271 }, { "epoch": 0.16, "grad_norm": 1.198575949537184, "learning_rate": 1.9164947568595164e-05, "loss": 1.0708, "step": 3272 }, { "epoch": 0.16, "grad_norm": 1.0355880960606632, "learning_rate": 1.9164324269132608e-05, "loss": 1.312, "step": 3273 }, { "epoch": 0.16, "grad_norm": 1.5012673483608803, "learning_rate": 1.9163700747278857e-05, "loss": 1.252, "step": 3274 }, { "epoch": 0.16, "grad_norm": 1.3902932645488821, "learning_rate": 1.9163077003049037e-05, "loss": 1.1489, "step": 3275 }, { "epoch": 0.16, "grad_norm": 1.3697016974445062, "learning_rate": 1.9162453036458287e-05, "loss": 1.1445, "step": 3276 }, { "epoch": 0.16, "grad_norm": 1.384346632723937, "learning_rate": 1.9161828847521743e-05, "loss": 1.1768, "step": 3277 }, { "epoch": 0.16, "grad_norm": 1.4532367074046704, "learning_rate": 1.9161204436254565e-05, "loss": 1.1938, "step": 3278 }, { "epoch": 0.16, "grad_norm": 0.8107543584463126, "learning_rate": 1.916057980267189e-05, "loss": 1.2822, "step": 3279 }, { "epoch": 0.16, "grad_norm": 1.1403544888436512, "learning_rate": 1.915995494678889e-05, "loss": 1.1475, "step": 3280 }, { "epoch": 0.16, "grad_norm": 1.3016194371269918, "learning_rate": 1.9159329868620714e-05, "loss": 1.2388, "step": 3281 }, { "epoch": 0.16, "grad_norm": 0.9402640749686217, "learning_rate": 1.9158704568182543e-05, "loss": 1.3003, "step": 3282 }, { "epoch": 0.16, "grad_norm": 1.3976297775050865, "learning_rate": 1.9158079045489547e-05, "loss": 1.3145, "step": 3283 }, { "epoch": 0.16, "grad_norm": 1.3886812138794298, "learning_rate": 1.9157453300556904e-05, "loss": 1.3237, "step": 3284 }, { "epoch": 0.16, "grad_norm": 1.0848237459843737, "learning_rate": 1.9156827333399805e-05, "loss": 1.2085, "step": 3285 }, { "epoch": 0.16, "grad_norm": 1.3390284017502314, "learning_rate": 1.915620114403343e-05, "loss": 1.1001, "step": 3286 }, { "epoch": 0.16, "grad_norm": 1.300741096498043, "learning_rate": 1.9155574732472983e-05, "loss": 1.189, "step": 3287 }, { "epoch": 0.16, "grad_norm": 1.5626760176984218, "learning_rate": 1.9154948098733663e-05, "loss": 1.3203, "step": 3288 }, { "epoch": 0.16, "grad_norm": 0.983084039685459, "learning_rate": 1.9154321242830676e-05, "loss": 1.1675, "step": 3289 }, { "epoch": 0.16, "grad_norm": 1.1459107357901923, "learning_rate": 1.9153694164779234e-05, "loss": 1.2236, "step": 3290 }, { "epoch": 0.16, "grad_norm": 1.2901140455108338, "learning_rate": 1.9153066864594558e-05, "loss": 1.1621, "step": 3291 }, { "epoch": 0.16, "grad_norm": 1.3600450985762282, "learning_rate": 1.9152439342291865e-05, "loss": 1.2505, "step": 3292 }, { "epoch": 0.16, "grad_norm": 1.1335627276282472, "learning_rate": 1.9151811597886383e-05, "loss": 1.1792, "step": 3293 }, { "epoch": 0.16, "grad_norm": 1.4808421324251864, "learning_rate": 1.9151183631393352e-05, "loss": 1.2617, "step": 3294 }, { "epoch": 0.16, "grad_norm": 1.3401465204735845, "learning_rate": 1.9150555442828004e-05, "loss": 1.2158, "step": 3295 }, { "epoch": 0.16, "grad_norm": 1.4346504818605406, "learning_rate": 1.914992703220559e-05, "loss": 1.2666, "step": 3296 }, { "epoch": 0.16, "grad_norm": 1.0656054663693846, "learning_rate": 1.9149298399541353e-05, "loss": 1.3208, "step": 3297 }, { "epoch": 0.16, "grad_norm": 1.1598502478700552, "learning_rate": 1.9148669544850552e-05, "loss": 1.1533, "step": 3298 }, { "epoch": 0.16, "grad_norm": 1.0593546186396363, "learning_rate": 1.9148040468148442e-05, "loss": 1.2222, "step": 3299 }, { "epoch": 0.16, "grad_norm": 1.0547919593110826, "learning_rate": 1.9147411169450302e-05, "loss": 1.1003, "step": 3300 }, { "epoch": 0.16, "grad_norm": 1.1105855910899944, "learning_rate": 1.9146781648771387e-05, "loss": 1.2837, "step": 3301 }, { "epoch": 0.16, "grad_norm": 1.310590119840075, "learning_rate": 1.9146151906126983e-05, "loss": 1.1729, "step": 3302 }, { "epoch": 0.16, "grad_norm": 1.4155347430441267, "learning_rate": 1.9145521941532374e-05, "loss": 1.2896, "step": 3303 }, { "epoch": 0.16, "grad_norm": 1.4429837405809112, "learning_rate": 1.9144891755002837e-05, "loss": 1.1191, "step": 3304 }, { "epoch": 0.16, "grad_norm": 1.1962297401111752, "learning_rate": 1.9144261346553677e-05, "loss": 1.0203, "step": 3305 }, { "epoch": 0.16, "grad_norm": 1.0701963695661167, "learning_rate": 1.9143630716200184e-05, "loss": 1.1548, "step": 3306 }, { "epoch": 0.16, "grad_norm": 1.26291148260121, "learning_rate": 1.9142999863957662e-05, "loss": 1.1709, "step": 3307 }, { "epoch": 0.16, "grad_norm": 1.3717698283498643, "learning_rate": 1.9142368789841422e-05, "loss": 1.0994, "step": 3308 }, { "epoch": 0.16, "grad_norm": 1.3876681956263295, "learning_rate": 1.914173749386678e-05, "loss": 1.1816, "step": 3309 }, { "epoch": 0.16, "grad_norm": 1.319963056145712, "learning_rate": 1.9141105976049054e-05, "loss": 1.0542, "step": 3310 }, { "epoch": 0.16, "grad_norm": 1.1941066140005174, "learning_rate": 1.914047423640357e-05, "loss": 1.1282, "step": 3311 }, { "epoch": 0.16, "grad_norm": 1.0840098191209004, "learning_rate": 1.9139842274945655e-05, "loss": 1.123, "step": 3312 }, { "epoch": 0.16, "grad_norm": 1.3403826422183485, "learning_rate": 1.913921009169065e-05, "loss": 1.1743, "step": 3313 }, { "epoch": 0.16, "grad_norm": 1.369176435098339, "learning_rate": 1.9138577686653893e-05, "loss": 1.3311, "step": 3314 }, { "epoch": 0.16, "grad_norm": 1.461684735555145, "learning_rate": 1.913794505985073e-05, "loss": 1.2915, "step": 3315 }, { "epoch": 0.16, "grad_norm": 1.4514423035405328, "learning_rate": 1.9137312211296516e-05, "loss": 1.2329, "step": 3316 }, { "epoch": 0.16, "grad_norm": 1.3081437474948776, "learning_rate": 1.9136679141006603e-05, "loss": 1.2197, "step": 3317 }, { "epoch": 0.16, "grad_norm": 1.4101289320201498, "learning_rate": 1.9136045848996357e-05, "loss": 1.2383, "step": 3318 }, { "epoch": 0.16, "grad_norm": 1.3771496002505703, "learning_rate": 1.9135412335281152e-05, "loss": 1.3154, "step": 3319 }, { "epoch": 0.16, "grad_norm": 1.1009254004735336, "learning_rate": 1.913477859987635e-05, "loss": 1.1445, "step": 3320 }, { "epoch": 0.16, "grad_norm": 1.262282660201594, "learning_rate": 1.913414464279734e-05, "loss": 1.1157, "step": 3321 }, { "epoch": 0.16, "grad_norm": 1.3156485384147394, "learning_rate": 1.91335104640595e-05, "loss": 1.2915, "step": 3322 }, { "epoch": 0.16, "grad_norm": 1.3776080767379462, "learning_rate": 1.913287606367822e-05, "loss": 1.3965, "step": 3323 }, { "epoch": 0.16, "grad_norm": 1.1337973942644777, "learning_rate": 1.9132241441668903e-05, "loss": 1.3291, "step": 3324 }, { "epoch": 0.16, "grad_norm": 1.188572054739497, "learning_rate": 1.9131606598046936e-05, "loss": 1.2603, "step": 3325 }, { "epoch": 0.16, "grad_norm": 1.3734833200805108, "learning_rate": 1.9130971532827737e-05, "loss": 1.2915, "step": 3326 }, { "epoch": 0.16, "grad_norm": 0.9970961097642511, "learning_rate": 1.9130336246026707e-05, "loss": 1.1426, "step": 3327 }, { "epoch": 0.16, "grad_norm": 1.5458997142137034, "learning_rate": 1.9129700737659273e-05, "loss": 1.0947, "step": 3328 }, { "epoch": 0.16, "grad_norm": 1.4720049727342825, "learning_rate": 1.9129065007740848e-05, "loss": 1.1309, "step": 3329 }, { "epoch": 0.16, "grad_norm": 1.466103566596986, "learning_rate": 1.9128429056286865e-05, "loss": 1.1992, "step": 3330 }, { "epoch": 0.16, "grad_norm": 1.470948772150686, "learning_rate": 1.9127792883312756e-05, "loss": 1.2915, "step": 3331 }, { "epoch": 0.16, "grad_norm": 1.4086016961772665, "learning_rate": 1.912715648883395e-05, "loss": 1.271, "step": 3332 }, { "epoch": 0.16, "grad_norm": 1.2065711039219162, "learning_rate": 1.9126519872865906e-05, "loss": 1.1904, "step": 3333 }, { "epoch": 0.16, "grad_norm": 1.4024989805028794, "learning_rate": 1.9125883035424062e-05, "loss": 1.2339, "step": 3334 }, { "epoch": 0.16, "grad_norm": 1.0098965844345091, "learning_rate": 1.9125245976523876e-05, "loss": 1.2573, "step": 3335 }, { "epoch": 0.16, "grad_norm": 1.418633570773244, "learning_rate": 1.9124608696180806e-05, "loss": 1.2944, "step": 3336 }, { "epoch": 0.16, "grad_norm": 0.9870543517266978, "learning_rate": 1.9123971194410317e-05, "loss": 1.2388, "step": 3337 }, { "epoch": 0.16, "grad_norm": 1.3934344769329365, "learning_rate": 1.912333347122788e-05, "loss": 1.0916, "step": 3338 }, { "epoch": 0.16, "grad_norm": 0.8544380033941715, "learning_rate": 1.9122695526648968e-05, "loss": 1.1729, "step": 3339 }, { "epoch": 0.16, "grad_norm": 1.2578918172698272, "learning_rate": 1.912205736068907e-05, "loss": 1.2222, "step": 3340 }, { "epoch": 0.16, "grad_norm": 1.1303585863480128, "learning_rate": 1.912141897336366e-05, "loss": 1.0295, "step": 3341 }, { "epoch": 0.16, "grad_norm": 1.0592746270955613, "learning_rate": 1.9120780364688243e-05, "loss": 1.2349, "step": 3342 }, { "epoch": 0.16, "grad_norm": 1.4202475589515526, "learning_rate": 1.912014153467831e-05, "loss": 1.2334, "step": 3343 }, { "epoch": 0.16, "grad_norm": 1.455280609476356, "learning_rate": 1.9119502483349357e-05, "loss": 1.2881, "step": 3344 }, { "epoch": 0.16, "grad_norm": 1.5081191935038936, "learning_rate": 1.91188632107169e-05, "loss": 1.2944, "step": 3345 }, { "epoch": 0.16, "grad_norm": 1.2659974987489735, "learning_rate": 1.9118223716796453e-05, "loss": 1.0742, "step": 3346 }, { "epoch": 0.16, "grad_norm": 1.3465612730953058, "learning_rate": 1.9117584001603533e-05, "loss": 1.2485, "step": 3347 }, { "epoch": 0.16, "grad_norm": 1.2356006268914674, "learning_rate": 1.911694406515366e-05, "loss": 1.1958, "step": 3348 }, { "epoch": 0.16, "grad_norm": 1.2902679127789012, "learning_rate": 1.9116303907462365e-05, "loss": 1.189, "step": 3349 }, { "epoch": 0.16, "grad_norm": 1.2014811906424085, "learning_rate": 1.911566352854519e-05, "loss": 1.231, "step": 3350 }, { "epoch": 0.16, "grad_norm": 1.0184713161248993, "learning_rate": 1.9115022928417664e-05, "loss": 1.1655, "step": 3351 }, { "epoch": 0.16, "grad_norm": 1.281688394174819, "learning_rate": 1.911438210709534e-05, "loss": 1.1758, "step": 3352 }, { "epoch": 0.16, "grad_norm": 1.2874084308017681, "learning_rate": 1.911374106459377e-05, "loss": 1.2344, "step": 3353 }, { "epoch": 0.16, "grad_norm": 1.200483750801493, "learning_rate": 1.9113099800928502e-05, "loss": 1.1272, "step": 3354 }, { "epoch": 0.16, "grad_norm": 1.1353626533943664, "learning_rate": 1.9112458316115107e-05, "loss": 1.2002, "step": 3355 }, { "epoch": 0.16, "grad_norm": 1.2683902467615629, "learning_rate": 1.911181661016914e-05, "loss": 1.3447, "step": 3356 }, { "epoch": 0.16, "grad_norm": 1.3513511814970065, "learning_rate": 1.911117468310619e-05, "loss": 1.2573, "step": 3357 }, { "epoch": 0.16, "grad_norm": 1.0262668330225175, "learning_rate": 1.9110532534941822e-05, "loss": 1.3013, "step": 3358 }, { "epoch": 0.16, "grad_norm": 1.4931484525455858, "learning_rate": 1.910989016569162e-05, "loss": 1.2612, "step": 3359 }, { "epoch": 0.16, "grad_norm": 1.5719462552209862, "learning_rate": 1.9109247575371177e-05, "loss": 1.1987, "step": 3360 }, { "epoch": 0.16, "grad_norm": 1.3027737470672192, "learning_rate": 1.9108604763996084e-05, "loss": 1.2505, "step": 3361 }, { "epoch": 0.16, "grad_norm": 1.3514448675497033, "learning_rate": 1.910796173158194e-05, "loss": 1.293, "step": 3362 }, { "epoch": 0.16, "grad_norm": 1.1191405447678988, "learning_rate": 1.910731847814435e-05, "loss": 1.1489, "step": 3363 }, { "epoch": 0.16, "grad_norm": 1.2889240736779, "learning_rate": 1.9106675003698928e-05, "loss": 1.2178, "step": 3364 }, { "epoch": 0.16, "grad_norm": 1.2078311253226441, "learning_rate": 1.9106031308261284e-05, "loss": 1.1401, "step": 3365 }, { "epoch": 0.16, "grad_norm": 1.334502031894775, "learning_rate": 1.9105387391847036e-05, "loss": 1.3076, "step": 3366 }, { "epoch": 0.16, "grad_norm": 1.482265548849895, "learning_rate": 1.910474325447182e-05, "loss": 1.4399, "step": 3367 }, { "epoch": 0.16, "grad_norm": 1.3375032312834523, "learning_rate": 1.9104098896151256e-05, "loss": 1.2168, "step": 3368 }, { "epoch": 0.16, "grad_norm": 1.1613342978176464, "learning_rate": 1.9103454316900987e-05, "loss": 1.1465, "step": 3369 }, { "epoch": 0.16, "grad_norm": 1.2513884140212268, "learning_rate": 1.9102809516736655e-05, "loss": 1.1582, "step": 3370 }, { "epoch": 0.16, "grad_norm": 1.3913983078261047, "learning_rate": 1.9102164495673906e-05, "loss": 1.2876, "step": 3371 }, { "epoch": 0.16, "grad_norm": 1.0604715227880055, "learning_rate": 1.9101519253728396e-05, "loss": 1.1562, "step": 3372 }, { "epoch": 0.16, "grad_norm": 1.314723562682753, "learning_rate": 1.9100873790915776e-05, "loss": 1.3174, "step": 3373 }, { "epoch": 0.16, "grad_norm": 1.1473104216606567, "learning_rate": 1.9100228107251715e-05, "loss": 1.1997, "step": 3374 }, { "epoch": 0.16, "grad_norm": 1.1765591741576595, "learning_rate": 1.9099582202751885e-05, "loss": 1.2642, "step": 3375 }, { "epoch": 0.16, "grad_norm": 1.233614072002368, "learning_rate": 1.9098936077431953e-05, "loss": 1.1465, "step": 3376 }, { "epoch": 0.16, "grad_norm": 1.262141563210866, "learning_rate": 1.90982897313076e-05, "loss": 1.3247, "step": 3377 }, { "epoch": 0.16, "grad_norm": 1.4277861144132267, "learning_rate": 1.9097643164394512e-05, "loss": 1.2842, "step": 3378 }, { "epoch": 0.16, "grad_norm": 1.218187941133866, "learning_rate": 1.9096996376708382e-05, "loss": 1.2104, "step": 3379 }, { "epoch": 0.16, "grad_norm": 1.3927349486467182, "learning_rate": 1.9096349368264904e-05, "loss": 1.2695, "step": 3380 }, { "epoch": 0.16, "grad_norm": 1.0900937678011182, "learning_rate": 1.9095702139079774e-05, "loss": 1.2534, "step": 3381 }, { "epoch": 0.16, "grad_norm": 1.7369519439630712, "learning_rate": 1.9095054689168707e-05, "loss": 1.3662, "step": 3382 }, { "epoch": 0.16, "grad_norm": 1.3570911860418593, "learning_rate": 1.9094407018547406e-05, "loss": 1.3281, "step": 3383 }, { "epoch": 0.16, "grad_norm": 1.1906135010798118, "learning_rate": 1.9093759127231594e-05, "loss": 1.0847, "step": 3384 }, { "epoch": 0.16, "grad_norm": 1.0563427075282246, "learning_rate": 1.9093111015236993e-05, "loss": 1.186, "step": 3385 }, { "epoch": 0.16, "grad_norm": 1.5732127759750896, "learning_rate": 1.909246268257933e-05, "loss": 1.2764, "step": 3386 }, { "epoch": 0.16, "grad_norm": 0.8099677381674971, "learning_rate": 1.9091814129274335e-05, "loss": 1.1763, "step": 3387 }, { "epoch": 0.16, "grad_norm": 1.3280054255515636, "learning_rate": 1.9091165355337754e-05, "loss": 1.314, "step": 3388 }, { "epoch": 0.16, "grad_norm": 1.4297321033066963, "learning_rate": 1.9090516360785322e-05, "loss": 1.2827, "step": 3389 }, { "epoch": 0.16, "grad_norm": 1.3074057623837756, "learning_rate": 1.908986714563279e-05, "loss": 1.2441, "step": 3390 }, { "epoch": 0.16, "grad_norm": 1.2139888822239169, "learning_rate": 1.9089217709895918e-05, "loss": 1.2158, "step": 3391 }, { "epoch": 0.16, "grad_norm": 1.1846089998682012, "learning_rate": 1.908856805359046e-05, "loss": 1.0352, "step": 3392 }, { "epoch": 0.16, "grad_norm": 1.1576000245559765, "learning_rate": 1.9087918176732188e-05, "loss": 1.2886, "step": 3393 }, { "epoch": 0.16, "grad_norm": 1.2721878378160747, "learning_rate": 1.9087268079336865e-05, "loss": 1.207, "step": 3394 }, { "epoch": 0.16, "grad_norm": 1.1974007589573203, "learning_rate": 1.9086617761420272e-05, "loss": 1.208, "step": 3395 }, { "epoch": 0.16, "grad_norm": 1.3530979246449515, "learning_rate": 1.908596722299819e-05, "loss": 0.9688, "step": 3396 }, { "epoch": 0.16, "grad_norm": 1.3938382471400188, "learning_rate": 1.9085316464086403e-05, "loss": 1.2896, "step": 3397 }, { "epoch": 0.16, "grad_norm": 1.1536037564042814, "learning_rate": 1.9084665484700704e-05, "loss": 1.1504, "step": 3398 }, { "epoch": 0.16, "grad_norm": 1.504639338073859, "learning_rate": 1.908401428485689e-05, "loss": 1.4126, "step": 3399 }, { "epoch": 0.16, "grad_norm": 1.3060017549578955, "learning_rate": 1.9083362864570768e-05, "loss": 1.3384, "step": 3400 }, { "epoch": 0.16, "grad_norm": 1.1782226634862971, "learning_rate": 1.9082711223858136e-05, "loss": 1.1704, "step": 3401 }, { "epoch": 0.16, "grad_norm": 1.5818991824012074, "learning_rate": 1.9082059362734822e-05, "loss": 1.3936, "step": 3402 }, { "epoch": 0.16, "grad_norm": 1.3790135223863071, "learning_rate": 1.908140728121663e-05, "loss": 1.2744, "step": 3403 }, { "epoch": 0.16, "grad_norm": 1.5163021351939618, "learning_rate": 1.9080754979319395e-05, "loss": 1.3569, "step": 3404 }, { "epoch": 0.16, "grad_norm": 1.2562461486340941, "learning_rate": 1.908010245705894e-05, "loss": 1.2373, "step": 3405 }, { "epoch": 0.16, "grad_norm": 1.3459506691829743, "learning_rate": 1.90794497144511e-05, "loss": 1.2646, "step": 3406 }, { "epoch": 0.16, "grad_norm": 1.2814780467452862, "learning_rate": 1.907879675151172e-05, "loss": 1.2456, "step": 3407 }, { "epoch": 0.16, "grad_norm": 1.135136988082013, "learning_rate": 1.9078143568256644e-05, "loss": 1.313, "step": 3408 }, { "epoch": 0.16, "grad_norm": 1.1906305940110877, "learning_rate": 1.907749016470172e-05, "loss": 1.2715, "step": 3409 }, { "epoch": 0.16, "grad_norm": 1.340748236076362, "learning_rate": 1.9076836540862804e-05, "loss": 1.0942, "step": 3410 }, { "epoch": 0.16, "grad_norm": 1.1704827619637361, "learning_rate": 1.907618269675576e-05, "loss": 1.1101, "step": 3411 }, { "epoch": 0.16, "grad_norm": 1.164027208587502, "learning_rate": 1.9075528632396456e-05, "loss": 1.0806, "step": 3412 }, { "epoch": 0.16, "grad_norm": 1.3753563549328363, "learning_rate": 1.907487434780076e-05, "loss": 1.0962, "step": 3413 }, { "epoch": 0.16, "grad_norm": 1.0827122609609527, "learning_rate": 1.9074219842984553e-05, "loss": 1.1621, "step": 3414 }, { "epoch": 0.16, "grad_norm": 1.1002823442021386, "learning_rate": 1.9073565117963714e-05, "loss": 1.187, "step": 3415 }, { "epoch": 0.16, "grad_norm": 0.9161460015603792, "learning_rate": 1.9072910172754137e-05, "loss": 1.29, "step": 3416 }, { "epoch": 0.16, "grad_norm": 1.3085559713220312, "learning_rate": 1.9072255007371715e-05, "loss": 1.2495, "step": 3417 }, { "epoch": 0.16, "grad_norm": 0.8976772182946938, "learning_rate": 1.9071599621832338e-05, "loss": 1.0964, "step": 3418 }, { "epoch": 0.16, "grad_norm": 1.1313603954949405, "learning_rate": 1.9070944016151923e-05, "loss": 1.1465, "step": 3419 }, { "epoch": 0.16, "grad_norm": 1.1601825377628743, "learning_rate": 1.907028819034637e-05, "loss": 1.3013, "step": 3420 }, { "epoch": 0.16, "grad_norm": 1.46035244829555, "learning_rate": 1.9069632144431595e-05, "loss": 1.2744, "step": 3421 }, { "epoch": 0.16, "grad_norm": 1.269634538282727, "learning_rate": 1.9068975878423526e-05, "loss": 1.2168, "step": 3422 }, { "epoch": 0.16, "grad_norm": 1.414267580638929, "learning_rate": 1.9068319392338082e-05, "loss": 1.3008, "step": 3423 }, { "epoch": 0.16, "grad_norm": 1.134663738953098, "learning_rate": 1.9067662686191192e-05, "loss": 1.2153, "step": 3424 }, { "epoch": 0.16, "grad_norm": 1.3283651410892432, "learning_rate": 1.9067005759998797e-05, "loss": 1.1401, "step": 3425 }, { "epoch": 0.16, "grad_norm": 1.3301722702616716, "learning_rate": 1.906634861377684e-05, "loss": 1.1577, "step": 3426 }, { "epoch": 0.16, "grad_norm": 1.374026038292277, "learning_rate": 1.9065691247541264e-05, "loss": 1.1328, "step": 3427 }, { "epoch": 0.16, "grad_norm": 1.3808860229369102, "learning_rate": 1.9065033661308022e-05, "loss": 1.2217, "step": 3428 }, { "epoch": 0.16, "grad_norm": 1.1289704402054235, "learning_rate": 1.906437585509307e-05, "loss": 1.1523, "step": 3429 }, { "epoch": 0.16, "grad_norm": 1.1878133140223652, "learning_rate": 1.9063717828912376e-05, "loss": 1.2261, "step": 3430 }, { "epoch": 0.17, "grad_norm": 1.435088834030497, "learning_rate": 1.9063059582781905e-05, "loss": 1.2427, "step": 3431 }, { "epoch": 0.17, "grad_norm": 1.1468859055752265, "learning_rate": 1.9062401116717635e-05, "loss": 1.1221, "step": 3432 }, { "epoch": 0.17, "grad_norm": 1.2690723415247267, "learning_rate": 1.9061742430735538e-05, "loss": 1.3877, "step": 3433 }, { "epoch": 0.17, "grad_norm": 1.1763833657108405, "learning_rate": 1.90610835248516e-05, "loss": 1.1272, "step": 3434 }, { "epoch": 0.17, "grad_norm": 1.253049845792166, "learning_rate": 1.9060424399081816e-05, "loss": 1.3101, "step": 3435 }, { "epoch": 0.17, "grad_norm": 1.3901398722097753, "learning_rate": 1.9059765053442176e-05, "loss": 1.3579, "step": 3436 }, { "epoch": 0.17, "grad_norm": 1.2092532252360655, "learning_rate": 1.9059105487948683e-05, "loss": 1.0681, "step": 3437 }, { "epoch": 0.17, "grad_norm": 1.126670970926636, "learning_rate": 1.905844570261734e-05, "loss": 1.1064, "step": 3438 }, { "epoch": 0.17, "grad_norm": 1.475431887500409, "learning_rate": 1.9057785697464162e-05, "loss": 1.1992, "step": 3439 }, { "epoch": 0.17, "grad_norm": 1.259728378055384, "learning_rate": 1.9057125472505162e-05, "loss": 1.1699, "step": 3440 }, { "epoch": 0.17, "grad_norm": 1.2203236023076494, "learning_rate": 1.9056465027756362e-05, "loss": 1.1802, "step": 3441 }, { "epoch": 0.17, "grad_norm": 1.1639194585140666, "learning_rate": 1.905580436323379e-05, "loss": 1.1514, "step": 3442 }, { "epoch": 0.17, "grad_norm": 1.3793136599864775, "learning_rate": 1.905514347895348e-05, "loss": 1.2661, "step": 3443 }, { "epoch": 0.17, "grad_norm": 1.3067493388006646, "learning_rate": 1.905448237493147e-05, "loss": 1.1523, "step": 3444 }, { "epoch": 0.17, "grad_norm": 1.243098571736717, "learning_rate": 1.9053821051183797e-05, "loss": 1.2349, "step": 3445 }, { "epoch": 0.17, "grad_norm": 1.2056901232127195, "learning_rate": 1.9053159507726514e-05, "loss": 1.2588, "step": 3446 }, { "epoch": 0.17, "grad_norm": 1.0236529168045698, "learning_rate": 1.9052497744575675e-05, "loss": 1.2441, "step": 3447 }, { "epoch": 0.17, "grad_norm": 1.2058679998078612, "learning_rate": 1.9051835761747336e-05, "loss": 1.1738, "step": 3448 }, { "epoch": 0.17, "grad_norm": 0.9569810327051237, "learning_rate": 1.9051173559257567e-05, "loss": 1.0222, "step": 3449 }, { "epoch": 0.17, "grad_norm": 1.2864669294748858, "learning_rate": 1.905051113712243e-05, "loss": 1.2627, "step": 3450 }, { "epoch": 0.17, "grad_norm": 1.1350427974720596, "learning_rate": 1.9049848495358006e-05, "loss": 1.0508, "step": 3451 }, { "epoch": 0.17, "grad_norm": 1.186515849698308, "learning_rate": 1.9049185633980376e-05, "loss": 1.187, "step": 3452 }, { "epoch": 0.17, "grad_norm": 1.3057233069249745, "learning_rate": 1.904852255300562e-05, "loss": 1.269, "step": 3453 }, { "epoch": 0.17, "grad_norm": 1.0846472267219252, "learning_rate": 1.9047859252449837e-05, "loss": 1.1689, "step": 3454 }, { "epoch": 0.17, "grad_norm": 1.1582830338425567, "learning_rate": 1.9047195732329117e-05, "loss": 1.1782, "step": 3455 }, { "epoch": 0.17, "grad_norm": 1.6229961918120983, "learning_rate": 1.904653199265956e-05, "loss": 1.3447, "step": 3456 }, { "epoch": 0.17, "grad_norm": 1.3661231189533265, "learning_rate": 1.904586803345728e-05, "loss": 1.2031, "step": 3457 }, { "epoch": 0.17, "grad_norm": 1.2339232710239585, "learning_rate": 1.9045203854738386e-05, "loss": 1.2183, "step": 3458 }, { "epoch": 0.17, "grad_norm": 1.3426602344960223, "learning_rate": 1.9044539456518992e-05, "loss": 1.2925, "step": 3459 }, { "epoch": 0.17, "grad_norm": 1.2698438806140884, "learning_rate": 1.9043874838815225e-05, "loss": 1.1416, "step": 3460 }, { "epoch": 0.17, "grad_norm": 1.6318930615394507, "learning_rate": 1.9043210001643215e-05, "loss": 1.3799, "step": 3461 }, { "epoch": 0.17, "grad_norm": 1.0593470016947977, "learning_rate": 1.9042544945019094e-05, "loss": 1.105, "step": 3462 }, { "epoch": 0.17, "grad_norm": 1.042632124024118, "learning_rate": 1.9041879668959e-05, "loss": 1.2153, "step": 3463 }, { "epoch": 0.17, "grad_norm": 1.4340591605153423, "learning_rate": 1.9041214173479075e-05, "loss": 1.3052, "step": 3464 }, { "epoch": 0.17, "grad_norm": 1.304950280103722, "learning_rate": 1.9040548458595472e-05, "loss": 1.165, "step": 3465 }, { "epoch": 0.17, "grad_norm": 1.1239623083973622, "learning_rate": 1.9039882524324346e-05, "loss": 0.9529, "step": 3466 }, { "epoch": 0.17, "grad_norm": 1.092030026245719, "learning_rate": 1.9039216370681858e-05, "loss": 1.0654, "step": 3467 }, { "epoch": 0.17, "grad_norm": 1.433344471504051, "learning_rate": 1.903854999768417e-05, "loss": 1.2397, "step": 3468 }, { "epoch": 0.17, "grad_norm": 1.2220771015752594, "learning_rate": 1.9037883405347454e-05, "loss": 1.1362, "step": 3469 }, { "epoch": 0.17, "grad_norm": 1.3435392218176976, "learning_rate": 1.903721659368789e-05, "loss": 1.1895, "step": 3470 }, { "epoch": 0.17, "grad_norm": 1.3643229528333654, "learning_rate": 1.9036549562721657e-05, "loss": 1.3164, "step": 3471 }, { "epoch": 0.17, "grad_norm": 1.6660096029887215, "learning_rate": 1.9035882312464938e-05, "loss": 1.2007, "step": 3472 }, { "epoch": 0.17, "grad_norm": 1.3121658753653158, "learning_rate": 1.903521484293393e-05, "loss": 1.1777, "step": 3473 }, { "epoch": 0.17, "grad_norm": 1.402801044668341, "learning_rate": 1.9034547154144832e-05, "loss": 1.2593, "step": 3474 }, { "epoch": 0.17, "grad_norm": 1.3527465012212114, "learning_rate": 1.9033879246113842e-05, "loss": 1.2583, "step": 3475 }, { "epoch": 0.17, "grad_norm": 1.4168580586071178, "learning_rate": 1.903321111885717e-05, "loss": 1.415, "step": 3476 }, { "epoch": 0.17, "grad_norm": 1.2701370813552848, "learning_rate": 1.903254277239103e-05, "loss": 1.3804, "step": 3477 }, { "epoch": 0.17, "grad_norm": 1.0567543778320632, "learning_rate": 1.9031874206731644e-05, "loss": 1.2798, "step": 3478 }, { "epoch": 0.17, "grad_norm": 1.4409476896058968, "learning_rate": 1.903120542189523e-05, "loss": 1.0874, "step": 3479 }, { "epoch": 0.17, "grad_norm": 1.4250590688424878, "learning_rate": 1.903053641789802e-05, "loss": 1.1982, "step": 3480 }, { "epoch": 0.17, "grad_norm": 1.3117211229675227, "learning_rate": 1.9029867194756248e-05, "loss": 1.3174, "step": 3481 }, { "epoch": 0.17, "grad_norm": 1.2311488687538699, "learning_rate": 1.902919775248616e-05, "loss": 1.1777, "step": 3482 }, { "epoch": 0.17, "grad_norm": 1.2705094076186667, "learning_rate": 1.902852809110399e-05, "loss": 1.2378, "step": 3483 }, { "epoch": 0.17, "grad_norm": 1.283364904603143, "learning_rate": 1.9027858210626e-05, "loss": 1.2866, "step": 3484 }, { "epoch": 0.17, "grad_norm": 1.2668388834029731, "learning_rate": 1.902718811106844e-05, "loss": 1.2944, "step": 3485 }, { "epoch": 0.17, "grad_norm": 1.309092563470368, "learning_rate": 1.902651779244757e-05, "loss": 1.207, "step": 3486 }, { "epoch": 0.17, "grad_norm": 1.25982773621972, "learning_rate": 1.9025847254779662e-05, "loss": 1.1299, "step": 3487 }, { "epoch": 0.17, "grad_norm": 1.0265128470780536, "learning_rate": 1.9025176498080987e-05, "loss": 1.3545, "step": 3488 }, { "epoch": 0.17, "grad_norm": 1.1715510290275362, "learning_rate": 1.9024505522367818e-05, "loss": 1.1064, "step": 3489 }, { "epoch": 0.17, "grad_norm": 1.3384391309013637, "learning_rate": 1.902383432765644e-05, "loss": 1.1753, "step": 3490 }, { "epoch": 0.17, "grad_norm": 1.2530482532800944, "learning_rate": 1.9023162913963136e-05, "loss": 1.1685, "step": 3491 }, { "epoch": 0.17, "grad_norm": 1.2967187557760993, "learning_rate": 1.9022491281304214e-05, "loss": 1.2051, "step": 3492 }, { "epoch": 0.17, "grad_norm": 1.5705028665339298, "learning_rate": 1.9021819429695955e-05, "loss": 1.3745, "step": 3493 }, { "epoch": 0.17, "grad_norm": 1.4197195753828622, "learning_rate": 1.9021147359154674e-05, "loss": 1.2056, "step": 3494 }, { "epoch": 0.17, "grad_norm": 1.3579660047883084, "learning_rate": 1.9020475069696676e-05, "loss": 1.2007, "step": 3495 }, { "epoch": 0.17, "grad_norm": 1.1106877664985146, "learning_rate": 1.9019802561338278e-05, "loss": 1.0825, "step": 3496 }, { "epoch": 0.17, "grad_norm": 0.9691338841231183, "learning_rate": 1.9019129834095798e-05, "loss": 1.231, "step": 3497 }, { "epoch": 0.17, "grad_norm": 1.111083004103819, "learning_rate": 1.9018456887985558e-05, "loss": 1.2393, "step": 3498 }, { "epoch": 0.17, "grad_norm": 1.1044886069304185, "learning_rate": 1.9017783723023895e-05, "loss": 1.1514, "step": 3499 }, { "epoch": 0.17, "grad_norm": 1.568967648481884, "learning_rate": 1.901711033922714e-05, "loss": 1.3447, "step": 3500 }, { "epoch": 0.17, "grad_norm": 1.2882593593769245, "learning_rate": 1.9016436736611637e-05, "loss": 1.2881, "step": 3501 }, { "epoch": 0.17, "grad_norm": 1.5871391928864795, "learning_rate": 1.9015762915193727e-05, "loss": 1.3506, "step": 3502 }, { "epoch": 0.17, "grad_norm": 1.3148739027941356, "learning_rate": 1.901508887498977e-05, "loss": 1.1252, "step": 3503 }, { "epoch": 0.17, "grad_norm": 1.3340529043604683, "learning_rate": 1.9014414616016116e-05, "loss": 1.2192, "step": 3504 }, { "epoch": 0.17, "grad_norm": 0.9483817732541353, "learning_rate": 1.901374013828913e-05, "loss": 1.1694, "step": 3505 }, { "epoch": 0.17, "grad_norm": 1.2427106132764631, "learning_rate": 1.901306544182518e-05, "loss": 1.2344, "step": 3506 }, { "epoch": 0.17, "grad_norm": 1.7107141636413918, "learning_rate": 1.9012390526640638e-05, "loss": 1.3071, "step": 3507 }, { "epoch": 0.17, "grad_norm": 1.5163754552813258, "learning_rate": 1.9011715392751882e-05, "loss": 1.167, "step": 3508 }, { "epoch": 0.17, "grad_norm": 1.1507178230866322, "learning_rate": 1.9011040040175295e-05, "loss": 1.186, "step": 3509 }, { "epoch": 0.17, "grad_norm": 1.1312044338316605, "learning_rate": 1.9010364468927267e-05, "loss": 1.2285, "step": 3510 }, { "epoch": 0.17, "grad_norm": 1.2565688955407572, "learning_rate": 1.900968867902419e-05, "loss": 1.2886, "step": 3511 }, { "epoch": 0.17, "grad_norm": 1.2056617483792658, "learning_rate": 1.900901267048247e-05, "loss": 1.1968, "step": 3512 }, { "epoch": 0.17, "grad_norm": 1.2219931676063458, "learning_rate": 1.9008336443318505e-05, "loss": 1.2271, "step": 3513 }, { "epoch": 0.17, "grad_norm": 1.793085385712175, "learning_rate": 1.900765999754871e-05, "loss": 1.2383, "step": 3514 }, { "epoch": 0.17, "grad_norm": 1.226931841342433, "learning_rate": 1.9006983333189493e-05, "loss": 1.2441, "step": 3515 }, { "epoch": 0.17, "grad_norm": 1.3629737279476966, "learning_rate": 1.9006306450257278e-05, "loss": 1.3965, "step": 3516 }, { "epoch": 0.17, "grad_norm": 1.192179781260468, "learning_rate": 1.90056293487685e-05, "loss": 1.2568, "step": 3517 }, { "epoch": 0.17, "grad_norm": 1.3823821965532772, "learning_rate": 1.900495202873957e-05, "loss": 1.1807, "step": 3518 }, { "epoch": 0.17, "grad_norm": 1.2191159058335401, "learning_rate": 1.9004274490186944e-05, "loss": 1.249, "step": 3519 }, { "epoch": 0.17, "grad_norm": 1.277065057424459, "learning_rate": 1.9003596733127058e-05, "loss": 1.208, "step": 3520 }, { "epoch": 0.17, "grad_norm": 1.31355811000581, "learning_rate": 1.9002918757576358e-05, "loss": 1.3193, "step": 3521 }, { "epoch": 0.17, "grad_norm": 1.3820486167605102, "learning_rate": 1.9002240563551293e-05, "loss": 1.4141, "step": 3522 }, { "epoch": 0.17, "grad_norm": 1.436566153388607, "learning_rate": 1.9001562151068327e-05, "loss": 1.3042, "step": 3523 }, { "epoch": 0.17, "grad_norm": 1.4006583375161052, "learning_rate": 1.900088352014392e-05, "loss": 1.3613, "step": 3524 }, { "epoch": 0.17, "grad_norm": 1.2369682909837802, "learning_rate": 1.900020467079454e-05, "loss": 1.2388, "step": 3525 }, { "epoch": 0.17, "grad_norm": 1.3149014128965715, "learning_rate": 1.899952560303666e-05, "loss": 1.2471, "step": 3526 }, { "epoch": 0.17, "grad_norm": 1.1909194974878563, "learning_rate": 1.899884631688676e-05, "loss": 1.2017, "step": 3527 }, { "epoch": 0.17, "grad_norm": 1.2394565843489909, "learning_rate": 1.8998166812361325e-05, "loss": 1.1646, "step": 3528 }, { "epoch": 0.17, "grad_norm": 1.2224360132467993, "learning_rate": 1.8997487089476844e-05, "loss": 1.167, "step": 3529 }, { "epoch": 0.17, "grad_norm": 1.1449503131802734, "learning_rate": 1.8996807148249817e-05, "loss": 1.2485, "step": 3530 }, { "epoch": 0.17, "grad_norm": 1.7713327005071833, "learning_rate": 1.899612698869673e-05, "loss": 1.3789, "step": 3531 }, { "epoch": 0.17, "grad_norm": 1.195505679817318, "learning_rate": 1.8995446610834104e-05, "loss": 1.4048, "step": 3532 }, { "epoch": 0.17, "grad_norm": 1.171285195268107, "learning_rate": 1.8994766014678444e-05, "loss": 1.1606, "step": 3533 }, { "epoch": 0.17, "grad_norm": 1.1718742921141723, "learning_rate": 1.8994085200246263e-05, "loss": 1.1362, "step": 3534 }, { "epoch": 0.17, "grad_norm": 1.2819988648831853, "learning_rate": 1.8993404167554085e-05, "loss": 1.2344, "step": 3535 }, { "epoch": 0.17, "grad_norm": 1.6577945550692839, "learning_rate": 1.899272291661844e-05, "loss": 1.3525, "step": 3536 }, { "epoch": 0.17, "grad_norm": 1.2369649973696033, "learning_rate": 1.899204144745585e-05, "loss": 1.2676, "step": 3537 }, { "epoch": 0.17, "grad_norm": 0.9052190857530272, "learning_rate": 1.8991359760082864e-05, "loss": 1.2368, "step": 3538 }, { "epoch": 0.17, "grad_norm": 1.7076724212876317, "learning_rate": 1.8990677854516017e-05, "loss": 1.5767, "step": 3539 }, { "epoch": 0.17, "grad_norm": 1.4706620336401766, "learning_rate": 1.898999573077186e-05, "loss": 1.1851, "step": 3540 }, { "epoch": 0.17, "grad_norm": 1.7024157383228438, "learning_rate": 1.8989313388866944e-05, "loss": 1.3984, "step": 3541 }, { "epoch": 0.17, "grad_norm": 1.3204783535091573, "learning_rate": 1.8988630828817827e-05, "loss": 1.2314, "step": 3542 }, { "epoch": 0.17, "grad_norm": 1.2481863061470888, "learning_rate": 1.898794805064108e-05, "loss": 1.2471, "step": 3543 }, { "epoch": 0.17, "grad_norm": 1.4312016194235124, "learning_rate": 1.8987265054353262e-05, "loss": 1.314, "step": 3544 }, { "epoch": 0.17, "grad_norm": 1.3460563758169157, "learning_rate": 1.8986581839970956e-05, "loss": 1.2046, "step": 3545 }, { "epoch": 0.17, "grad_norm": 1.1047090815518474, "learning_rate": 1.898589840751073e-05, "loss": 1.2437, "step": 3546 }, { "epoch": 0.17, "grad_norm": 1.3928567406062362, "learning_rate": 1.898521475698918e-05, "loss": 1.2544, "step": 3547 }, { "epoch": 0.17, "grad_norm": 1.4400888953585051, "learning_rate": 1.8984530888422897e-05, "loss": 1.1929, "step": 3548 }, { "epoch": 0.17, "grad_norm": 1.197208644107919, "learning_rate": 1.8983846801828468e-05, "loss": 1.2246, "step": 3549 }, { "epoch": 0.17, "grad_norm": 1.2905525034008316, "learning_rate": 1.8983162497222497e-05, "loss": 1.0415, "step": 3550 }, { "epoch": 0.17, "grad_norm": 1.3094268738408616, "learning_rate": 1.898247797462159e-05, "loss": 1.2783, "step": 3551 }, { "epoch": 0.17, "grad_norm": 1.6058081925737373, "learning_rate": 1.8981793234042362e-05, "loss": 1.0774, "step": 3552 }, { "epoch": 0.17, "grad_norm": 1.3552176275314036, "learning_rate": 1.8981108275501422e-05, "loss": 1.0388, "step": 3553 }, { "epoch": 0.17, "grad_norm": 1.3166927931567363, "learning_rate": 1.8980423099015402e-05, "loss": 1.3281, "step": 3554 }, { "epoch": 0.17, "grad_norm": 1.2069203959085173, "learning_rate": 1.8979737704600923e-05, "loss": 1.2607, "step": 3555 }, { "epoch": 0.17, "grad_norm": 1.2242322131968812, "learning_rate": 1.8979052092274615e-05, "loss": 1.3237, "step": 3556 }, { "epoch": 0.17, "grad_norm": 1.2284040713645001, "learning_rate": 1.897836626205312e-05, "loss": 1.1724, "step": 3557 }, { "epoch": 0.17, "grad_norm": 1.0548807383135015, "learning_rate": 1.8977680213953084e-05, "loss": 1.2734, "step": 3558 }, { "epoch": 0.17, "grad_norm": 1.5177787557455222, "learning_rate": 1.8976993947991145e-05, "loss": 1.2666, "step": 3559 }, { "epoch": 0.17, "grad_norm": 1.2641678959368694, "learning_rate": 1.897630746418397e-05, "loss": 1.2617, "step": 3560 }, { "epoch": 0.17, "grad_norm": 1.3905737912018168, "learning_rate": 1.8975620762548207e-05, "loss": 1.188, "step": 3561 }, { "epoch": 0.17, "grad_norm": 1.061550015247218, "learning_rate": 1.8974933843100526e-05, "loss": 0.99, "step": 3562 }, { "epoch": 0.17, "grad_norm": 0.9708175444710299, "learning_rate": 1.8974246705857594e-05, "loss": 1.2656, "step": 3563 }, { "epoch": 0.17, "grad_norm": 1.4653176366219918, "learning_rate": 1.8973559350836092e-05, "loss": 1.0999, "step": 3564 }, { "epoch": 0.17, "grad_norm": 1.3604778417732646, "learning_rate": 1.8972871778052688e-05, "loss": 1.1616, "step": 3565 }, { "epoch": 0.17, "grad_norm": 0.9276852885748215, "learning_rate": 1.8972183987524076e-05, "loss": 1.1377, "step": 3566 }, { "epoch": 0.17, "grad_norm": 1.3235576841209322, "learning_rate": 1.8971495979266946e-05, "loss": 1.1748, "step": 3567 }, { "epoch": 0.17, "grad_norm": 1.2058984556877652, "learning_rate": 1.897080775329799e-05, "loss": 1.1245, "step": 3568 }, { "epoch": 0.17, "grad_norm": 1.1789568403336739, "learning_rate": 1.8970119309633916e-05, "loss": 1.1919, "step": 3569 }, { "epoch": 0.17, "grad_norm": 1.4182993377699844, "learning_rate": 1.8969430648291425e-05, "loss": 1.2188, "step": 3570 }, { "epoch": 0.17, "grad_norm": 1.5400017767698304, "learning_rate": 1.896874176928723e-05, "loss": 1.2949, "step": 3571 }, { "epoch": 0.17, "grad_norm": 1.158846839349765, "learning_rate": 1.896805267263805e-05, "loss": 0.9419, "step": 3572 }, { "epoch": 0.17, "grad_norm": 1.2360413200287725, "learning_rate": 1.89673633583606e-05, "loss": 1.2114, "step": 3573 }, { "epoch": 0.17, "grad_norm": 1.2518618639230985, "learning_rate": 1.896667382647162e-05, "loss": 1.1694, "step": 3574 }, { "epoch": 0.17, "grad_norm": 1.259223721267541, "learning_rate": 1.8965984076987835e-05, "loss": 1.2622, "step": 3575 }, { "epoch": 0.17, "grad_norm": 1.0783113239056321, "learning_rate": 1.8965294109925984e-05, "loss": 1.1772, "step": 3576 }, { "epoch": 0.17, "grad_norm": 1.1785923503817366, "learning_rate": 1.8964603925302813e-05, "loss": 1.2212, "step": 3577 }, { "epoch": 0.17, "grad_norm": 1.2877150923147622, "learning_rate": 1.8963913523135062e-05, "loss": 1.0962, "step": 3578 }, { "epoch": 0.17, "grad_norm": 1.3956768856720954, "learning_rate": 1.8963222903439495e-05, "loss": 1.3525, "step": 3579 }, { "epoch": 0.17, "grad_norm": 1.4754851269973712, "learning_rate": 1.896253206623287e-05, "loss": 1.1655, "step": 3580 }, { "epoch": 0.17, "grad_norm": 1.28801775962986, "learning_rate": 1.8961841011531948e-05, "loss": 1.2334, "step": 3581 }, { "epoch": 0.17, "grad_norm": 1.1022009828558093, "learning_rate": 1.8961149739353502e-05, "loss": 1.1558, "step": 3582 }, { "epoch": 0.17, "grad_norm": 1.3058212216231804, "learning_rate": 1.8960458249714303e-05, "loss": 1.2021, "step": 3583 }, { "epoch": 0.17, "grad_norm": 1.373422130822137, "learning_rate": 1.8959766542631137e-05, "loss": 1.2388, "step": 3584 }, { "epoch": 0.17, "grad_norm": 1.2168947645536965, "learning_rate": 1.8959074618120784e-05, "loss": 1.1143, "step": 3585 }, { "epoch": 0.17, "grad_norm": 1.1435409271267243, "learning_rate": 1.8958382476200038e-05, "loss": 1.1816, "step": 3586 }, { "epoch": 0.17, "grad_norm": 1.1310150354771937, "learning_rate": 1.8957690116885697e-05, "loss": 1.0913, "step": 3587 }, { "epoch": 0.17, "grad_norm": 1.3291267264748485, "learning_rate": 1.895699754019456e-05, "loss": 1.2051, "step": 3588 }, { "epoch": 0.17, "grad_norm": 1.4400889258373317, "learning_rate": 1.8956304746143433e-05, "loss": 1.2808, "step": 3589 }, { "epoch": 0.17, "grad_norm": 1.283446745301071, "learning_rate": 1.895561173474913e-05, "loss": 1.2095, "step": 3590 }, { "epoch": 0.17, "grad_norm": 1.2154893123256345, "learning_rate": 1.8954918506028467e-05, "loss": 1.2456, "step": 3591 }, { "epoch": 0.17, "grad_norm": 1.5662101578989225, "learning_rate": 1.895422505999827e-05, "loss": 1.3628, "step": 3592 }, { "epoch": 0.17, "grad_norm": 1.2758310255591814, "learning_rate": 1.895353139667536e-05, "loss": 1.271, "step": 3593 }, { "epoch": 0.17, "grad_norm": 1.3280614545300653, "learning_rate": 1.895283751607658e-05, "loss": 1.4087, "step": 3594 }, { "epoch": 0.17, "grad_norm": 1.2326827291385223, "learning_rate": 1.895214341821876e-05, "loss": 1.3315, "step": 3595 }, { "epoch": 0.17, "grad_norm": 1.1553501120457372, "learning_rate": 1.8951449103118743e-05, "loss": 1.2036, "step": 3596 }, { "epoch": 0.17, "grad_norm": 0.9376955759375059, "learning_rate": 1.8950754570793384e-05, "loss": 1.1777, "step": 3597 }, { "epoch": 0.17, "grad_norm": 1.3048526908850409, "learning_rate": 1.8950059821259535e-05, "loss": 1.3281, "step": 3598 }, { "epoch": 0.17, "grad_norm": 1.1420596151614748, "learning_rate": 1.8949364854534054e-05, "loss": 1.3262, "step": 3599 }, { "epoch": 0.17, "grad_norm": 1.0154276464122167, "learning_rate": 1.8948669670633812e-05, "loss": 1.1226, "step": 3600 }, { "epoch": 0.17, "grad_norm": 1.2413960124425851, "learning_rate": 1.894797426957567e-05, "loss": 1.1206, "step": 3601 }, { "epoch": 0.17, "grad_norm": 1.2656053569812258, "learning_rate": 1.894727865137651e-05, "loss": 1.144, "step": 3602 }, { "epoch": 0.17, "grad_norm": 1.4954938300722012, "learning_rate": 1.894658281605321e-05, "loss": 1.3447, "step": 3603 }, { "epoch": 0.17, "grad_norm": 1.3261257661775274, "learning_rate": 1.894588676362265e-05, "loss": 1.2837, "step": 3604 }, { "epoch": 0.17, "grad_norm": 0.9853776149677267, "learning_rate": 1.8945190494101734e-05, "loss": 1.1685, "step": 3605 }, { "epoch": 0.17, "grad_norm": 1.2902588886626283, "learning_rate": 1.894449400750735e-05, "loss": 1.1929, "step": 3606 }, { "epoch": 0.17, "grad_norm": 1.2780292976969228, "learning_rate": 1.89437973038564e-05, "loss": 1.2656, "step": 3607 }, { "epoch": 0.17, "grad_norm": 1.1931754662349552, "learning_rate": 1.8943100383165794e-05, "loss": 1.1729, "step": 3608 }, { "epoch": 0.17, "grad_norm": 1.3640246593042178, "learning_rate": 1.894240324545244e-05, "loss": 1.1162, "step": 3609 }, { "epoch": 0.17, "grad_norm": 1.2492943513402945, "learning_rate": 1.894170589073326e-05, "loss": 1.0857, "step": 3610 }, { "epoch": 0.17, "grad_norm": 1.0731786207130478, "learning_rate": 1.8941008319025174e-05, "loss": 1.1665, "step": 3611 }, { "epoch": 0.17, "grad_norm": 1.3320710137359644, "learning_rate": 1.894031053034511e-05, "loss": 1.1431, "step": 3612 }, { "epoch": 0.17, "grad_norm": 1.5379752790524126, "learning_rate": 1.8939612524710003e-05, "loss": 1.1792, "step": 3613 }, { "epoch": 0.17, "grad_norm": 1.0549033594565047, "learning_rate": 1.8938914302136792e-05, "loss": 1.0781, "step": 3614 }, { "epoch": 0.17, "grad_norm": 1.2449546713246005, "learning_rate": 1.893821586264241e-05, "loss": 1.2656, "step": 3615 }, { "epoch": 0.17, "grad_norm": 1.1972093397998964, "learning_rate": 1.8937517206243828e-05, "loss": 1.2036, "step": 3616 }, { "epoch": 0.17, "grad_norm": 1.3089700578820127, "learning_rate": 1.8936818332957983e-05, "loss": 1.061, "step": 3617 }, { "epoch": 0.17, "grad_norm": 1.0829759536104209, "learning_rate": 1.8936119242801837e-05, "loss": 1.1758, "step": 3618 }, { "epoch": 0.17, "grad_norm": 1.5780819327370337, "learning_rate": 1.8935419935792358e-05, "loss": 1.0601, "step": 3619 }, { "epoch": 0.17, "grad_norm": 1.3071436132541263, "learning_rate": 1.8934720411946513e-05, "loss": 1.0972, "step": 3620 }, { "epoch": 0.17, "grad_norm": 1.181345352197252, "learning_rate": 1.8934020671281285e-05, "loss": 1.0649, "step": 3621 }, { "epoch": 0.17, "grad_norm": 1.232643308269587, "learning_rate": 1.8933320713813647e-05, "loss": 1.0249, "step": 3622 }, { "epoch": 0.17, "grad_norm": 1.2484904192824775, "learning_rate": 1.8932620539560587e-05, "loss": 1.2412, "step": 3623 }, { "epoch": 0.17, "grad_norm": 1.2471219870753092, "learning_rate": 1.8931920148539098e-05, "loss": 1.1865, "step": 3624 }, { "epoch": 0.17, "grad_norm": 1.3653289599732004, "learning_rate": 1.8931219540766174e-05, "loss": 1.2881, "step": 3625 }, { "epoch": 0.17, "grad_norm": 0.9769790700111366, "learning_rate": 1.8930518716258816e-05, "loss": 1.2764, "step": 3626 }, { "epoch": 0.17, "grad_norm": 1.4968585655888613, "learning_rate": 1.8929817675034033e-05, "loss": 1.2822, "step": 3627 }, { "epoch": 0.17, "grad_norm": 0.8731129049381064, "learning_rate": 1.8929116417108837e-05, "loss": 1.2368, "step": 3628 }, { "epoch": 0.17, "grad_norm": 1.559846542338931, "learning_rate": 1.8928414942500243e-05, "loss": 1.1904, "step": 3629 }, { "epoch": 0.17, "grad_norm": 1.0816138495478058, "learning_rate": 1.892771325122528e-05, "loss": 1.1426, "step": 3630 }, { "epoch": 0.17, "grad_norm": 1.4059530599081784, "learning_rate": 1.892701134330097e-05, "loss": 1.417, "step": 3631 }, { "epoch": 0.17, "grad_norm": 1.0538247257106157, "learning_rate": 1.8926309218744348e-05, "loss": 1.229, "step": 3632 }, { "epoch": 0.17, "grad_norm": 1.26765068040345, "learning_rate": 1.892560687757245e-05, "loss": 1.2139, "step": 3633 }, { "epoch": 0.17, "grad_norm": 1.304834710329461, "learning_rate": 1.8924904319802327e-05, "loss": 1.3115, "step": 3634 }, { "epoch": 0.17, "grad_norm": 1.4661961393931422, "learning_rate": 1.892420154545102e-05, "loss": 1.1284, "step": 3635 }, { "epoch": 0.17, "grad_norm": 1.3541217377736774, "learning_rate": 1.892349855453559e-05, "loss": 1.1138, "step": 3636 }, { "epoch": 0.17, "grad_norm": 1.0806088054347471, "learning_rate": 1.892279534707309e-05, "loss": 1.2568, "step": 3637 }, { "epoch": 0.17, "grad_norm": 1.39767776903962, "learning_rate": 1.892209192308059e-05, "loss": 1.2871, "step": 3638 }, { "epoch": 0.18, "grad_norm": 1.2489802188673031, "learning_rate": 1.8921388282575157e-05, "loss": 1.0835, "step": 3639 }, { "epoch": 0.18, "grad_norm": 0.8472664305747158, "learning_rate": 1.8920684425573865e-05, "loss": 1.1353, "step": 3640 }, { "epoch": 0.18, "grad_norm": 1.132805781721083, "learning_rate": 1.8919980352093802e-05, "loss": 1.0493, "step": 3641 }, { "epoch": 0.18, "grad_norm": 1.1077117143097983, "learning_rate": 1.8919276062152043e-05, "loss": 1.1758, "step": 3642 }, { "epoch": 0.18, "grad_norm": 1.0509894199531091, "learning_rate": 1.891857155576569e-05, "loss": 1.0325, "step": 3643 }, { "epoch": 0.18, "grad_norm": 0.9711251239399257, "learning_rate": 1.891786683295183e-05, "loss": 1.2583, "step": 3644 }, { "epoch": 0.18, "grad_norm": 1.3158561594565725, "learning_rate": 1.891716189372757e-05, "loss": 1.2573, "step": 3645 }, { "epoch": 0.18, "grad_norm": 1.0916462694002096, "learning_rate": 1.8916456738110013e-05, "loss": 1.2935, "step": 3646 }, { "epoch": 0.18, "grad_norm": 0.9478399918198038, "learning_rate": 1.8915751366116275e-05, "loss": 1.1636, "step": 3647 }, { "epoch": 0.18, "grad_norm": 1.7001234054145673, "learning_rate": 1.891504577776347e-05, "loss": 1.3154, "step": 3648 }, { "epoch": 0.18, "grad_norm": 1.1940893897843095, "learning_rate": 1.8914339973068725e-05, "loss": 1.0752, "step": 3649 }, { "epoch": 0.18, "grad_norm": 0.9028736489994187, "learning_rate": 1.8913633952049168e-05, "loss": 1.3711, "step": 3650 }, { "epoch": 0.18, "grad_norm": 0.8516567756285512, "learning_rate": 1.8912927714721922e-05, "loss": 1.1704, "step": 3651 }, { "epoch": 0.18, "grad_norm": 1.252034486717095, "learning_rate": 1.8912221261104136e-05, "loss": 1.2583, "step": 3652 }, { "epoch": 0.18, "grad_norm": 0.8309900414580386, "learning_rate": 1.891151459121295e-05, "loss": 1.1611, "step": 3653 }, { "epoch": 0.18, "grad_norm": 1.2492653827046116, "learning_rate": 1.8910807705065514e-05, "loss": 1.1101, "step": 3654 }, { "epoch": 0.18, "grad_norm": 1.3049209212974258, "learning_rate": 1.8910100602678976e-05, "loss": 1.2456, "step": 3655 }, { "epoch": 0.18, "grad_norm": 1.2010094897817882, "learning_rate": 1.89093932840705e-05, "loss": 1.2217, "step": 3656 }, { "epoch": 0.18, "grad_norm": 1.2563973267924362, "learning_rate": 1.8908685749257257e-05, "loss": 1.3003, "step": 3657 }, { "epoch": 0.18, "grad_norm": 1.316051770696901, "learning_rate": 1.8907977998256405e-05, "loss": 1.228, "step": 3658 }, { "epoch": 0.18, "grad_norm": 1.1421829775841765, "learning_rate": 1.890727003108513e-05, "loss": 1.2207, "step": 3659 }, { "epoch": 0.18, "grad_norm": 1.1131430822264827, "learning_rate": 1.89065618477606e-05, "loss": 1.2241, "step": 3660 }, { "epoch": 0.18, "grad_norm": 0.6918465772015077, "learning_rate": 1.890585344830001e-05, "loss": 1.249, "step": 3661 }, { "epoch": 0.18, "grad_norm": 1.57902913654736, "learning_rate": 1.8905144832720547e-05, "loss": 1.3584, "step": 3662 }, { "epoch": 0.18, "grad_norm": 1.1870140476364872, "learning_rate": 1.8904436001039408e-05, "loss": 1.2651, "step": 3663 }, { "epoch": 0.18, "grad_norm": 1.2188930192951837, "learning_rate": 1.8903726953273794e-05, "loss": 1.1948, "step": 3664 }, { "epoch": 0.18, "grad_norm": 1.3877812380598347, "learning_rate": 1.8903017689440914e-05, "loss": 1.2407, "step": 3665 }, { "epoch": 0.18, "grad_norm": 1.4908036406515537, "learning_rate": 1.8902308209557976e-05, "loss": 1.1934, "step": 3666 }, { "epoch": 0.18, "grad_norm": 1.2857774425289568, "learning_rate": 1.89015985136422e-05, "loss": 1.25, "step": 3667 }, { "epoch": 0.18, "grad_norm": 1.0597580582070834, "learning_rate": 1.8900888601710804e-05, "loss": 1.0247, "step": 3668 }, { "epoch": 0.18, "grad_norm": 0.9230086729612763, "learning_rate": 1.890017847378102e-05, "loss": 1.0859, "step": 3669 }, { "epoch": 0.18, "grad_norm": 1.264643691157165, "learning_rate": 1.889946812987008e-05, "loss": 1.3506, "step": 3670 }, { "epoch": 0.18, "grad_norm": 1.580986450900499, "learning_rate": 1.8898757569995218e-05, "loss": 1.1865, "step": 3671 }, { "epoch": 0.18, "grad_norm": 1.491744764326917, "learning_rate": 1.8898046794173684e-05, "loss": 1.23, "step": 3672 }, { "epoch": 0.18, "grad_norm": 1.253224421154269, "learning_rate": 1.889733580242272e-05, "loss": 1.2017, "step": 3673 }, { "epoch": 0.18, "grad_norm": 1.3133833148737182, "learning_rate": 1.8896624594759583e-05, "loss": 1.1631, "step": 3674 }, { "epoch": 0.18, "grad_norm": 1.4663772417957828, "learning_rate": 1.889591317120153e-05, "loss": 1.5039, "step": 3675 }, { "epoch": 0.18, "grad_norm": 1.6223238572187488, "learning_rate": 1.889520153176583e-05, "loss": 1.2764, "step": 3676 }, { "epoch": 0.18, "grad_norm": 1.2070971492301161, "learning_rate": 1.889448967646974e-05, "loss": 1.2271, "step": 3677 }, { "epoch": 0.18, "grad_norm": 1.3014235975740673, "learning_rate": 1.8893777605330553e-05, "loss": 1.1318, "step": 3678 }, { "epoch": 0.18, "grad_norm": 0.9208121000618421, "learning_rate": 1.8893065318365534e-05, "loss": 1.0352, "step": 3679 }, { "epoch": 0.18, "grad_norm": 1.1522134695093353, "learning_rate": 1.8892352815591973e-05, "loss": 1.0713, "step": 3680 }, { "epoch": 0.18, "grad_norm": 1.0639145441085855, "learning_rate": 1.8891640097027163e-05, "loss": 1.2422, "step": 3681 }, { "epoch": 0.18, "grad_norm": 1.3138911940540887, "learning_rate": 1.8890927162688398e-05, "loss": 1.2266, "step": 3682 }, { "epoch": 0.18, "grad_norm": 1.1737051108063394, "learning_rate": 1.8890214012592977e-05, "loss": 1.1069, "step": 3683 }, { "epoch": 0.18, "grad_norm": 1.3586156443044153, "learning_rate": 1.8889500646758203e-05, "loss": 1.1348, "step": 3684 }, { "epoch": 0.18, "grad_norm": 1.1501428815039894, "learning_rate": 1.8888787065201393e-05, "loss": 1.1577, "step": 3685 }, { "epoch": 0.18, "grad_norm": 1.1280832333763993, "learning_rate": 1.8888073267939865e-05, "loss": 1.1606, "step": 3686 }, { "epoch": 0.18, "grad_norm": 0.9736109700976114, "learning_rate": 1.8887359254990937e-05, "loss": 1.356, "step": 3687 }, { "epoch": 0.18, "grad_norm": 1.351859816499735, "learning_rate": 1.8886645026371937e-05, "loss": 1.1782, "step": 3688 }, { "epoch": 0.18, "grad_norm": 1.0222057352188998, "learning_rate": 1.8885930582100195e-05, "loss": 1.292, "step": 3689 }, { "epoch": 0.18, "grad_norm": 1.3108316496931331, "learning_rate": 1.888521592219305e-05, "loss": 1.2314, "step": 3690 }, { "epoch": 0.18, "grad_norm": 1.2772211534972377, "learning_rate": 1.8884501046667847e-05, "loss": 1.1333, "step": 3691 }, { "epoch": 0.18, "grad_norm": 1.2364247972408073, "learning_rate": 1.888378595554193e-05, "loss": 1.2129, "step": 3692 }, { "epoch": 0.18, "grad_norm": 1.1219474513970653, "learning_rate": 1.888307064883266e-05, "loss": 1.1914, "step": 3693 }, { "epoch": 0.18, "grad_norm": 1.160069885816394, "learning_rate": 1.8882355126557382e-05, "loss": 1.2056, "step": 3694 }, { "epoch": 0.18, "grad_norm": 1.3341110907576428, "learning_rate": 1.8881639388733468e-05, "loss": 1.1699, "step": 3695 }, { "epoch": 0.18, "grad_norm": 1.1767751648654203, "learning_rate": 1.888092343537829e-05, "loss": 1.1108, "step": 3696 }, { "epoch": 0.18, "grad_norm": 1.149779115536934, "learning_rate": 1.8880207266509215e-05, "loss": 1.2134, "step": 3697 }, { "epoch": 0.18, "grad_norm": 1.3572112889100154, "learning_rate": 1.887949088214363e-05, "loss": 1.3096, "step": 3698 }, { "epoch": 0.18, "grad_norm": 1.1029074734937117, "learning_rate": 1.887877428229891e-05, "loss": 1.1587, "step": 3699 }, { "epoch": 0.18, "grad_norm": 1.405590892741505, "learning_rate": 1.887805746699245e-05, "loss": 1.3447, "step": 3700 }, { "epoch": 0.18, "grad_norm": 1.4401741644069757, "learning_rate": 1.8877340436241645e-05, "loss": 1.4341, "step": 3701 }, { "epoch": 0.18, "grad_norm": 1.2997791277392416, "learning_rate": 1.8876623190063898e-05, "loss": 1.1738, "step": 3702 }, { "epoch": 0.18, "grad_norm": 1.2716386359224923, "learning_rate": 1.887590572847661e-05, "loss": 1.1953, "step": 3703 }, { "epoch": 0.18, "grad_norm": 1.0667280793171918, "learning_rate": 1.887518805149719e-05, "loss": 1.0391, "step": 3704 }, { "epoch": 0.18, "grad_norm": 1.1378099978690308, "learning_rate": 1.887447015914306e-05, "loss": 1.1694, "step": 3705 }, { "epoch": 0.18, "grad_norm": 1.2474015827841545, "learning_rate": 1.8873752051431635e-05, "loss": 1.0288, "step": 3706 }, { "epoch": 0.18, "grad_norm": 1.2208256301025968, "learning_rate": 1.887303372838035e-05, "loss": 1.1328, "step": 3707 }, { "epoch": 0.18, "grad_norm": 1.2618329431392548, "learning_rate": 1.8872315190006624e-05, "loss": 1.3228, "step": 3708 }, { "epoch": 0.18, "grad_norm": 0.9863934260957063, "learning_rate": 1.8871596436327908e-05, "loss": 1.1633, "step": 3709 }, { "epoch": 0.18, "grad_norm": 1.1554600980134562, "learning_rate": 1.8870877467361633e-05, "loss": 1.2202, "step": 3710 }, { "epoch": 0.18, "grad_norm": 1.2883754057537926, "learning_rate": 1.887015828312525e-05, "loss": 1.1479, "step": 3711 }, { "epoch": 0.18, "grad_norm": 1.2102941619254475, "learning_rate": 1.8869438883636212e-05, "loss": 1.2695, "step": 3712 }, { "epoch": 0.18, "grad_norm": 0.863423135284559, "learning_rate": 1.8868719268911978e-05, "loss": 1.1611, "step": 3713 }, { "epoch": 0.18, "grad_norm": 1.0259796841418751, "learning_rate": 1.8867999438970007e-05, "loss": 1.3765, "step": 3714 }, { "epoch": 0.18, "grad_norm": 1.0722650926952464, "learning_rate": 1.886727939382777e-05, "loss": 1.3281, "step": 3715 }, { "epoch": 0.18, "grad_norm": 1.2082455332211228, "learning_rate": 1.886655913350274e-05, "loss": 1.1191, "step": 3716 }, { "epoch": 0.18, "grad_norm": 1.1157686428334062, "learning_rate": 1.8865838658012397e-05, "loss": 1.1606, "step": 3717 }, { "epoch": 0.18, "grad_norm": 1.1748478839337293, "learning_rate": 1.8865117967374225e-05, "loss": 1.2896, "step": 3718 }, { "epoch": 0.18, "grad_norm": 1.2611356543925225, "learning_rate": 1.886439706160571e-05, "loss": 1.2612, "step": 3719 }, { "epoch": 0.18, "grad_norm": 1.242920559591456, "learning_rate": 1.8863675940724344e-05, "loss": 1.2227, "step": 3720 }, { "epoch": 0.18, "grad_norm": 1.2432927870940162, "learning_rate": 1.886295460474763e-05, "loss": 1.312, "step": 3721 }, { "epoch": 0.18, "grad_norm": 1.462449010154458, "learning_rate": 1.8862233053693074e-05, "loss": 1.4336, "step": 3722 }, { "epoch": 0.18, "grad_norm": 1.4179011200585068, "learning_rate": 1.8861511287578188e-05, "loss": 1.2515, "step": 3723 }, { "epoch": 0.18, "grad_norm": 1.4889714384957256, "learning_rate": 1.8860789306420478e-05, "loss": 1.2163, "step": 3724 }, { "epoch": 0.18, "grad_norm": 1.16835799299909, "learning_rate": 1.8860067110237478e-05, "loss": 1.2334, "step": 3725 }, { "epoch": 0.18, "grad_norm": 1.5256834674882724, "learning_rate": 1.88593446990467e-05, "loss": 1.3501, "step": 3726 }, { "epoch": 0.18, "grad_norm": 1.2079897914234985, "learning_rate": 1.885862207286568e-05, "loss": 1.2061, "step": 3727 }, { "epoch": 0.18, "grad_norm": 1.0972876049522948, "learning_rate": 1.8857899231711956e-05, "loss": 1.1016, "step": 3728 }, { "epoch": 0.18, "grad_norm": 1.0947956419622145, "learning_rate": 1.885717617560307e-05, "loss": 1.2715, "step": 3729 }, { "epoch": 0.18, "grad_norm": 1.234416509562634, "learning_rate": 1.8856452904556564e-05, "loss": 1.5317, "step": 3730 }, { "epoch": 0.18, "grad_norm": 1.337767371652006, "learning_rate": 1.8855729418589994e-05, "loss": 1.2173, "step": 3731 }, { "epoch": 0.18, "grad_norm": 1.2032048901301533, "learning_rate": 1.885500571772091e-05, "loss": 1.0674, "step": 3732 }, { "epoch": 0.18, "grad_norm": 1.2379614735665712, "learning_rate": 1.8854281801966886e-05, "loss": 1.1455, "step": 3733 }, { "epoch": 0.18, "grad_norm": 1.1658780991774809, "learning_rate": 1.8853557671345477e-05, "loss": 1.127, "step": 3734 }, { "epoch": 0.18, "grad_norm": 1.4071086124357972, "learning_rate": 1.885283332587426e-05, "loss": 1.334, "step": 3735 }, { "epoch": 0.18, "grad_norm": 1.3444186444640516, "learning_rate": 1.8852108765570814e-05, "loss": 1.3262, "step": 3736 }, { "epoch": 0.18, "grad_norm": 1.3019296465288477, "learning_rate": 1.8851383990452726e-05, "loss": 1.0913, "step": 3737 }, { "epoch": 0.18, "grad_norm": 1.5475351121046275, "learning_rate": 1.8850659000537575e-05, "loss": 1.1741, "step": 3738 }, { "epoch": 0.18, "grad_norm": 1.2504138840101464, "learning_rate": 1.884993379584296e-05, "loss": 1.2012, "step": 3739 }, { "epoch": 0.18, "grad_norm": 1.4442662620732705, "learning_rate": 1.8849208376386477e-05, "loss": 1.0894, "step": 3740 }, { "epoch": 0.18, "grad_norm": 1.5679425089521672, "learning_rate": 1.8848482742185737e-05, "loss": 1.1177, "step": 3741 }, { "epoch": 0.18, "grad_norm": 1.429682427297351, "learning_rate": 1.8847756893258336e-05, "loss": 1.3677, "step": 3742 }, { "epoch": 0.18, "grad_norm": 1.2266382132361986, "learning_rate": 1.88470308296219e-05, "loss": 1.2163, "step": 3743 }, { "epoch": 0.18, "grad_norm": 1.3613235197136249, "learning_rate": 1.884630455129404e-05, "loss": 1.3115, "step": 3744 }, { "epoch": 0.18, "grad_norm": 1.439085301882927, "learning_rate": 1.8845578058292392e-05, "loss": 1.3105, "step": 3745 }, { "epoch": 0.18, "grad_norm": 1.418423496826268, "learning_rate": 1.8844851350634573e-05, "loss": 1.062, "step": 3746 }, { "epoch": 0.18, "grad_norm": 1.1752753028903469, "learning_rate": 1.8844124428338223e-05, "loss": 1.2651, "step": 3747 }, { "epoch": 0.18, "grad_norm": 1.310704397389374, "learning_rate": 1.8843397291420982e-05, "loss": 1.1152, "step": 3748 }, { "epoch": 0.18, "grad_norm": 1.3333521281252285, "learning_rate": 1.88426699399005e-05, "loss": 1.2134, "step": 3749 }, { "epoch": 0.18, "grad_norm": 1.2237217081444318, "learning_rate": 1.884194237379442e-05, "loss": 1.2729, "step": 3750 }, { "epoch": 0.18, "grad_norm": 1.4067633171011622, "learning_rate": 1.8841214593120405e-05, "loss": 1.1343, "step": 3751 }, { "epoch": 0.18, "grad_norm": 1.030748578560186, "learning_rate": 1.8840486597896114e-05, "loss": 1.3579, "step": 3752 }, { "epoch": 0.18, "grad_norm": 1.3373837197422835, "learning_rate": 1.8839758388139208e-05, "loss": 1.2285, "step": 3753 }, { "epoch": 0.18, "grad_norm": 1.161523665885835, "learning_rate": 1.8839029963867362e-05, "loss": 1.146, "step": 3754 }, { "epoch": 0.18, "grad_norm": 1.1488379132663662, "learning_rate": 1.883830132509826e-05, "loss": 1.1396, "step": 3755 }, { "epoch": 0.18, "grad_norm": 1.283233218592384, "learning_rate": 1.8837572471849574e-05, "loss": 1.1816, "step": 3756 }, { "epoch": 0.18, "grad_norm": 1.305138447674614, "learning_rate": 1.8836843404138992e-05, "loss": 1.1074, "step": 3757 }, { "epoch": 0.18, "grad_norm": 1.149463996054947, "learning_rate": 1.883611412198421e-05, "loss": 1.2524, "step": 3758 }, { "epoch": 0.18, "grad_norm": 1.3319400841835491, "learning_rate": 1.8835384625402927e-05, "loss": 1.2778, "step": 3759 }, { "epoch": 0.18, "grad_norm": 1.1460389568818394, "learning_rate": 1.883465491441284e-05, "loss": 1.1016, "step": 3760 }, { "epoch": 0.18, "grad_norm": 1.3692038818273549, "learning_rate": 1.8833924989031663e-05, "loss": 1.2842, "step": 3761 }, { "epoch": 0.18, "grad_norm": 1.1918474248417137, "learning_rate": 1.8833194849277105e-05, "loss": 1.2241, "step": 3762 }, { "epoch": 0.18, "grad_norm": 1.56327597800025, "learning_rate": 1.8832464495166883e-05, "loss": 1.2397, "step": 3763 }, { "epoch": 0.18, "grad_norm": 1.1632329712146787, "learning_rate": 1.8831733926718725e-05, "loss": 1.1328, "step": 3764 }, { "epoch": 0.18, "grad_norm": 1.1770576580086047, "learning_rate": 1.8831003143950357e-05, "loss": 1.2446, "step": 3765 }, { "epoch": 0.18, "grad_norm": 1.1730230019162593, "learning_rate": 1.8830272146879513e-05, "loss": 1.1033, "step": 3766 }, { "epoch": 0.18, "grad_norm": 1.1005964851702894, "learning_rate": 1.882954093552394e-05, "loss": 1.2847, "step": 3767 }, { "epoch": 0.18, "grad_norm": 1.5140957230136651, "learning_rate": 1.8828809509901366e-05, "loss": 1.1431, "step": 3768 }, { "epoch": 0.18, "grad_norm": 1.3434076548471803, "learning_rate": 1.8828077870029554e-05, "loss": 1.1401, "step": 3769 }, { "epoch": 0.18, "grad_norm": 1.3278875953104918, "learning_rate": 1.8827346015926253e-05, "loss": 1.3286, "step": 3770 }, { "epoch": 0.18, "grad_norm": 1.30623818909198, "learning_rate": 1.8826613947609225e-05, "loss": 1.2637, "step": 3771 }, { "epoch": 0.18, "grad_norm": 1.6193891024745923, "learning_rate": 1.8825881665096237e-05, "loss": 1.2202, "step": 3772 }, { "epoch": 0.18, "grad_norm": 1.303574844509896, "learning_rate": 1.8825149168405055e-05, "loss": 1.335, "step": 3773 }, { "epoch": 0.18, "grad_norm": 1.333488492486229, "learning_rate": 1.8824416457553455e-05, "loss": 1.2764, "step": 3774 }, { "epoch": 0.18, "grad_norm": 1.1483100902172072, "learning_rate": 1.882368353255922e-05, "loss": 1.1299, "step": 3775 }, { "epoch": 0.18, "grad_norm": 1.0690048071825466, "learning_rate": 1.8822950393440135e-05, "loss": 1.1426, "step": 3776 }, { "epoch": 0.18, "grad_norm": 1.073616231866021, "learning_rate": 1.8822217040213994e-05, "loss": 1.2705, "step": 3777 }, { "epoch": 0.18, "grad_norm": 1.439643442987364, "learning_rate": 1.882148347289859e-05, "loss": 1.2148, "step": 3778 }, { "epoch": 0.18, "grad_norm": 1.3650361032701719, "learning_rate": 1.8820749691511723e-05, "loss": 1.3389, "step": 3779 }, { "epoch": 0.18, "grad_norm": 1.3163335386066355, "learning_rate": 1.8820015696071202e-05, "loss": 1.2085, "step": 3780 }, { "epoch": 0.18, "grad_norm": 1.0858565596555425, "learning_rate": 1.881928148659484e-05, "loss": 1.2026, "step": 3781 }, { "epoch": 0.18, "grad_norm": 1.498516697511471, "learning_rate": 1.8818547063100453e-05, "loss": 1.313, "step": 3782 }, { "epoch": 0.18, "grad_norm": 1.1377353541692377, "learning_rate": 1.8817812425605864e-05, "loss": 1.2095, "step": 3783 }, { "epoch": 0.18, "grad_norm": 1.4100640063579568, "learning_rate": 1.88170775741289e-05, "loss": 1.2461, "step": 3784 }, { "epoch": 0.18, "grad_norm": 1.1440090994169752, "learning_rate": 1.8816342508687388e-05, "loss": 1.2305, "step": 3785 }, { "epoch": 0.18, "grad_norm": 1.3787698783532052, "learning_rate": 1.8815607229299177e-05, "loss": 1.311, "step": 3786 }, { "epoch": 0.18, "grad_norm": 1.3737421671533245, "learning_rate": 1.8814871735982102e-05, "loss": 1.1519, "step": 3787 }, { "epoch": 0.18, "grad_norm": 1.2524807661501367, "learning_rate": 1.8814136028754016e-05, "loss": 1.1626, "step": 3788 }, { "epoch": 0.18, "grad_norm": 1.337601149623135, "learning_rate": 1.8813400107632763e-05, "loss": 1.2817, "step": 3789 }, { "epoch": 0.18, "grad_norm": 1.1558400418305887, "learning_rate": 1.881266397263622e-05, "loss": 1.1753, "step": 3790 }, { "epoch": 0.18, "grad_norm": 1.3364311218657992, "learning_rate": 1.8811927623782227e-05, "loss": 1.2715, "step": 3791 }, { "epoch": 0.18, "grad_norm": 1.5076759390985712, "learning_rate": 1.8811191061088673e-05, "loss": 1.23, "step": 3792 }, { "epoch": 0.18, "grad_norm": 1.1204120136565816, "learning_rate": 1.8810454284573424e-05, "loss": 1.0903, "step": 3793 }, { "epoch": 0.18, "grad_norm": 1.6802007416923506, "learning_rate": 1.880971729425436e-05, "loss": 1.2148, "step": 3794 }, { "epoch": 0.18, "grad_norm": 1.454270523731031, "learning_rate": 1.880898009014937e-05, "loss": 1.21, "step": 3795 }, { "epoch": 0.18, "grad_norm": 1.3459560743645265, "learning_rate": 1.880824267227633e-05, "loss": 1.2773, "step": 3796 }, { "epoch": 0.18, "grad_norm": 1.1387966852435205, "learning_rate": 1.880750504065315e-05, "loss": 1.1646, "step": 3797 }, { "epoch": 0.18, "grad_norm": 1.3375672706286275, "learning_rate": 1.8806767195297724e-05, "loss": 1.2461, "step": 3798 }, { "epoch": 0.18, "grad_norm": 1.3050725420711256, "learning_rate": 1.880602913622796e-05, "loss": 1.0225, "step": 3799 }, { "epoch": 0.18, "grad_norm": 1.4088485801730106, "learning_rate": 1.880529086346176e-05, "loss": 1.3271, "step": 3800 }, { "epoch": 0.18, "grad_norm": 1.5373577673294068, "learning_rate": 1.880455237701705e-05, "loss": 1.1387, "step": 3801 }, { "epoch": 0.18, "grad_norm": 1.1999183668612758, "learning_rate": 1.880381367691175e-05, "loss": 1.168, "step": 3802 }, { "epoch": 0.18, "grad_norm": 1.1043503335111857, "learning_rate": 1.880307476316378e-05, "loss": 1.3496, "step": 3803 }, { "epoch": 0.18, "grad_norm": 1.4458193677077311, "learning_rate": 1.8802335635791076e-05, "loss": 1.3052, "step": 3804 }, { "epoch": 0.18, "grad_norm": 1.2558432906925352, "learning_rate": 1.880159629481157e-05, "loss": 1.1895, "step": 3805 }, { "epoch": 0.18, "grad_norm": 1.1388344337073755, "learning_rate": 1.880085674024321e-05, "loss": 1.1035, "step": 3806 }, { "epoch": 0.18, "grad_norm": 1.4996358351220862, "learning_rate": 1.8800116972103935e-05, "loss": 1.2993, "step": 3807 }, { "epoch": 0.18, "grad_norm": 1.1338488703690763, "learning_rate": 1.8799376990411708e-05, "loss": 1.071, "step": 3808 }, { "epoch": 0.18, "grad_norm": 1.2807907645427477, "learning_rate": 1.8798636795184473e-05, "loss": 1.3374, "step": 3809 }, { "epoch": 0.18, "grad_norm": 1.4471101731064926, "learning_rate": 1.8797896386440202e-05, "loss": 1.2466, "step": 3810 }, { "epoch": 0.18, "grad_norm": 1.3026443115209179, "learning_rate": 1.879715576419686e-05, "loss": 1.1382, "step": 3811 }, { "epoch": 0.18, "grad_norm": 1.4583755041022222, "learning_rate": 1.8796414928472417e-05, "loss": 1.187, "step": 3812 }, { "epoch": 0.18, "grad_norm": 1.0043399525900796, "learning_rate": 1.879567387928485e-05, "loss": 1.3135, "step": 3813 }, { "epoch": 0.18, "grad_norm": 1.2031043274170252, "learning_rate": 1.8794932616652152e-05, "loss": 1.1592, "step": 3814 }, { "epoch": 0.18, "grad_norm": 1.382158804800577, "learning_rate": 1.8794191140592303e-05, "loss": 1.1953, "step": 3815 }, { "epoch": 0.18, "grad_norm": 1.417447970303134, "learning_rate": 1.8793449451123296e-05, "loss": 1.1924, "step": 3816 }, { "epoch": 0.18, "grad_norm": 1.1792468720277238, "learning_rate": 1.879270754826313e-05, "loss": 1.0928, "step": 3817 }, { "epoch": 0.18, "grad_norm": 1.0697752745825342, "learning_rate": 1.8791965432029812e-05, "loss": 1.3208, "step": 3818 }, { "epoch": 0.18, "grad_norm": 1.3039658221473143, "learning_rate": 1.8791223102441347e-05, "loss": 1.207, "step": 3819 }, { "epoch": 0.18, "grad_norm": 1.2643289308563401, "learning_rate": 1.8790480559515756e-05, "loss": 1.2407, "step": 3820 }, { "epoch": 0.18, "grad_norm": 1.0489146698141427, "learning_rate": 1.878973780327105e-05, "loss": 1.3149, "step": 3821 }, { "epoch": 0.18, "grad_norm": 1.0460145907126146, "learning_rate": 1.878899483372526e-05, "loss": 0.9519, "step": 3822 }, { "epoch": 0.18, "grad_norm": 0.8675795702203667, "learning_rate": 1.8788251650896407e-05, "loss": 1.2515, "step": 3823 }, { "epoch": 0.18, "grad_norm": 1.3120237838550692, "learning_rate": 1.8787508254802538e-05, "loss": 1.2598, "step": 3824 }, { "epoch": 0.18, "grad_norm": 1.1427240029204004, "learning_rate": 1.8786764645461684e-05, "loss": 1.187, "step": 3825 }, { "epoch": 0.18, "grad_norm": 1.3074253271676255, "learning_rate": 1.8786020822891892e-05, "loss": 1.3105, "step": 3826 }, { "epoch": 0.18, "grad_norm": 0.9681325250284729, "learning_rate": 1.8785276787111216e-05, "loss": 1.3135, "step": 3827 }, { "epoch": 0.18, "grad_norm": 1.2734646269311545, "learning_rate": 1.878453253813771e-05, "loss": 1.2324, "step": 3828 }, { "epoch": 0.18, "grad_norm": 1.235195368426881, "learning_rate": 1.878378807598943e-05, "loss": 1.2749, "step": 3829 }, { "epoch": 0.18, "grad_norm": 1.3021276837479405, "learning_rate": 1.8783043400684447e-05, "loss": 1.3218, "step": 3830 }, { "epoch": 0.18, "grad_norm": 1.3404915641881334, "learning_rate": 1.878229851224083e-05, "loss": 1.2339, "step": 3831 }, { "epoch": 0.18, "grad_norm": 1.2111111755003627, "learning_rate": 1.8781553410676658e-05, "loss": 1.1831, "step": 3832 }, { "epoch": 0.18, "grad_norm": 1.2031021206989294, "learning_rate": 1.878080809601001e-05, "loss": 1.1899, "step": 3833 }, { "epoch": 0.18, "grad_norm": 1.1719337081564711, "learning_rate": 1.878006256825897e-05, "loss": 1.1152, "step": 3834 }, { "epoch": 0.18, "grad_norm": 1.215657390160487, "learning_rate": 1.8779316827441636e-05, "loss": 1.1465, "step": 3835 }, { "epoch": 0.18, "grad_norm": 1.160060174894915, "learning_rate": 1.87785708735761e-05, "loss": 1.095, "step": 3836 }, { "epoch": 0.18, "grad_norm": 1.4826166128691558, "learning_rate": 1.8777824706680466e-05, "loss": 1.3706, "step": 3837 }, { "epoch": 0.18, "grad_norm": 1.5238786812581249, "learning_rate": 1.8777078326772843e-05, "loss": 1.2354, "step": 3838 }, { "epoch": 0.18, "grad_norm": 1.1781480276801504, "learning_rate": 1.877633173387134e-05, "loss": 1.2222, "step": 3839 }, { "epoch": 0.18, "grad_norm": 1.2353840607166555, "learning_rate": 1.8775584927994074e-05, "loss": 1.2397, "step": 3840 }, { "epoch": 0.18, "grad_norm": 1.3523137596210626, "learning_rate": 1.877483790915917e-05, "loss": 1.1323, "step": 3841 }, { "epoch": 0.18, "grad_norm": 1.1636779975353855, "learning_rate": 1.877409067738476e-05, "loss": 1.0442, "step": 3842 }, { "epoch": 0.18, "grad_norm": 1.3061011866152812, "learning_rate": 1.877334323268897e-05, "loss": 1.2549, "step": 3843 }, { "epoch": 0.18, "grad_norm": 1.4300377485575873, "learning_rate": 1.8772595575089943e-05, "loss": 1.1504, "step": 3844 }, { "epoch": 0.18, "grad_norm": 1.2268235120723918, "learning_rate": 1.8771847704605818e-05, "loss": 1.0898, "step": 3845 }, { "epoch": 0.18, "grad_norm": 0.884719747692226, "learning_rate": 1.8771099621254748e-05, "loss": 1.3413, "step": 3846 }, { "epoch": 0.19, "grad_norm": 1.344024477694934, "learning_rate": 1.8770351325054882e-05, "loss": 1.1621, "step": 3847 }, { "epoch": 0.19, "grad_norm": 1.2124190722235857, "learning_rate": 1.876960281602439e-05, "loss": 1.332, "step": 3848 }, { "epoch": 0.19, "grad_norm": 1.0502453824008753, "learning_rate": 1.876885409418142e-05, "loss": 1.1968, "step": 3849 }, { "epoch": 0.19, "grad_norm": 1.2112787750863305, "learning_rate": 1.8768105159544152e-05, "loss": 1.2432, "step": 3850 }, { "epoch": 0.19, "grad_norm": 1.3042155393188326, "learning_rate": 1.8767356012130758e-05, "loss": 1.2578, "step": 3851 }, { "epoch": 0.19, "grad_norm": 1.2534735628451261, "learning_rate": 1.8766606651959417e-05, "loss": 1.2749, "step": 3852 }, { "epoch": 0.19, "grad_norm": 1.1699911680327308, "learning_rate": 1.876585707904831e-05, "loss": 1.2456, "step": 3853 }, { "epoch": 0.19, "grad_norm": 1.1870813396675193, "learning_rate": 1.876510729341564e-05, "loss": 1.2734, "step": 3854 }, { "epoch": 0.19, "grad_norm": 1.6277851196544832, "learning_rate": 1.876435729507959e-05, "loss": 1.1045, "step": 3855 }, { "epoch": 0.19, "grad_norm": 1.238931215704037, "learning_rate": 1.876360708405836e-05, "loss": 1.167, "step": 3856 }, { "epoch": 0.19, "grad_norm": 0.9338497411819181, "learning_rate": 1.8762856660370165e-05, "loss": 1.2227, "step": 3857 }, { "epoch": 0.19, "grad_norm": 1.2282635897015475, "learning_rate": 1.8762106024033206e-05, "loss": 1.3608, "step": 3858 }, { "epoch": 0.19, "grad_norm": 1.228309933636235, "learning_rate": 1.87613551750657e-05, "loss": 1.2007, "step": 3859 }, { "epoch": 0.19, "grad_norm": 1.3023676612436272, "learning_rate": 1.8760604113485873e-05, "loss": 1.2544, "step": 3860 }, { "epoch": 0.19, "grad_norm": 1.6230074189041543, "learning_rate": 1.8759852839311946e-05, "loss": 1.3442, "step": 3861 }, { "epoch": 0.19, "grad_norm": 1.4622782613613992, "learning_rate": 1.8759101352562154e-05, "loss": 1.3257, "step": 3862 }, { "epoch": 0.19, "grad_norm": 1.278946359539599, "learning_rate": 1.8758349653254733e-05, "loss": 1.2207, "step": 3863 }, { "epoch": 0.19, "grad_norm": 1.315382894285985, "learning_rate": 1.8757597741407922e-05, "loss": 1.1357, "step": 3864 }, { "epoch": 0.19, "grad_norm": 1.2066435817690058, "learning_rate": 1.875684561703997e-05, "loss": 1.052, "step": 3865 }, { "epoch": 0.19, "grad_norm": 1.1928937325703166, "learning_rate": 1.8756093280169126e-05, "loss": 1.2866, "step": 3866 }, { "epoch": 0.19, "grad_norm": 1.1426461835134458, "learning_rate": 1.875534073081365e-05, "loss": 1.126, "step": 3867 }, { "epoch": 0.19, "grad_norm": 1.331965711635815, "learning_rate": 1.8754587968991803e-05, "loss": 0.9929, "step": 3868 }, { "epoch": 0.19, "grad_norm": 1.5635288817577517, "learning_rate": 1.8753834994721852e-05, "loss": 1.3032, "step": 3869 }, { "epoch": 0.19, "grad_norm": 1.6735194966695957, "learning_rate": 1.8753081808022065e-05, "loss": 1.1265, "step": 3870 }, { "epoch": 0.19, "grad_norm": 1.3229242919674427, "learning_rate": 1.8752328408910732e-05, "loss": 1.0933, "step": 3871 }, { "epoch": 0.19, "grad_norm": 1.5871042059219043, "learning_rate": 1.8751574797406124e-05, "loss": 1.3804, "step": 3872 }, { "epoch": 0.19, "grad_norm": 1.3127714773241774, "learning_rate": 1.8750820973526535e-05, "loss": 1.2646, "step": 3873 }, { "epoch": 0.19, "grad_norm": 1.1817012313084219, "learning_rate": 1.8750066937290256e-05, "loss": 1.1924, "step": 3874 }, { "epoch": 0.19, "grad_norm": 1.3522395479196476, "learning_rate": 1.8749312688715587e-05, "loss": 1.1484, "step": 3875 }, { "epoch": 0.19, "grad_norm": 1.2757339847063416, "learning_rate": 1.8748558227820828e-05, "loss": 1.2344, "step": 3876 }, { "epoch": 0.19, "grad_norm": 0.8481924274997216, "learning_rate": 1.8747803554624287e-05, "loss": 0.9304, "step": 3877 }, { "epoch": 0.19, "grad_norm": 1.3533843298319492, "learning_rate": 1.874704866914428e-05, "loss": 1.2896, "step": 3878 }, { "epoch": 0.19, "grad_norm": 1.7211111753068293, "learning_rate": 1.874629357139913e-05, "loss": 1.2041, "step": 3879 }, { "epoch": 0.19, "grad_norm": 1.19081771796767, "learning_rate": 1.8745538261407157e-05, "loss": 1.2241, "step": 3880 }, { "epoch": 0.19, "grad_norm": 1.2209056245156116, "learning_rate": 1.8744782739186688e-05, "loss": 1.1328, "step": 3881 }, { "epoch": 0.19, "grad_norm": 1.4811144373647276, "learning_rate": 1.874402700475606e-05, "loss": 1.3232, "step": 3882 }, { "epoch": 0.19, "grad_norm": 1.5018082171913014, "learning_rate": 1.874327105813361e-05, "loss": 1.3926, "step": 3883 }, { "epoch": 0.19, "grad_norm": 1.1395584637375353, "learning_rate": 1.874251489933769e-05, "loss": 0.9792, "step": 3884 }, { "epoch": 0.19, "grad_norm": 1.3944578863904746, "learning_rate": 1.874175852838664e-05, "loss": 1.1118, "step": 3885 }, { "epoch": 0.19, "grad_norm": 0.8916927462233674, "learning_rate": 1.8741001945298817e-05, "loss": 1.1802, "step": 3886 }, { "epoch": 0.19, "grad_norm": 1.3135201781873118, "learning_rate": 1.8740245150092585e-05, "loss": 1.252, "step": 3887 }, { "epoch": 0.19, "grad_norm": 1.1443115260373942, "learning_rate": 1.873948814278631e-05, "loss": 1.1738, "step": 3888 }, { "epoch": 0.19, "grad_norm": 1.3461935481077292, "learning_rate": 1.8738730923398357e-05, "loss": 1.3398, "step": 3889 }, { "epoch": 0.19, "grad_norm": 1.0615053787453288, "learning_rate": 1.8737973491947102e-05, "loss": 1.2036, "step": 3890 }, { "epoch": 0.19, "grad_norm": 1.0188809157873129, "learning_rate": 1.8737215848450933e-05, "loss": 1.1924, "step": 3891 }, { "epoch": 0.19, "grad_norm": 1.5045985881628183, "learning_rate": 1.8736457992928228e-05, "loss": 1.3145, "step": 3892 }, { "epoch": 0.19, "grad_norm": 1.1357893377651747, "learning_rate": 1.873569992539738e-05, "loss": 1.104, "step": 3893 }, { "epoch": 0.19, "grad_norm": 1.3037439217132705, "learning_rate": 1.8734941645876786e-05, "loss": 1.3403, "step": 3894 }, { "epoch": 0.19, "grad_norm": 1.169668532029814, "learning_rate": 1.8734183154384848e-05, "loss": 1.0701, "step": 3895 }, { "epoch": 0.19, "grad_norm": 1.2517172934027079, "learning_rate": 1.873342445093997e-05, "loss": 1.2075, "step": 3896 }, { "epoch": 0.19, "grad_norm": 1.6314924205382637, "learning_rate": 1.8732665535560564e-05, "loss": 1.3081, "step": 3897 }, { "epoch": 0.19, "grad_norm": 1.2119416132223482, "learning_rate": 1.873190640826505e-05, "loss": 1.1509, "step": 3898 }, { "epoch": 0.19, "grad_norm": 1.1319996684413565, "learning_rate": 1.8731147069071843e-05, "loss": 1.0159, "step": 3899 }, { "epoch": 0.19, "grad_norm": 1.0653718539190122, "learning_rate": 1.8730387517999378e-05, "loss": 1.1123, "step": 3900 }, { "epoch": 0.19, "grad_norm": 0.8359395207604404, "learning_rate": 1.8729627755066082e-05, "loss": 1.1758, "step": 3901 }, { "epoch": 0.19, "grad_norm": 1.2224139068867883, "learning_rate": 1.8728867780290393e-05, "loss": 1.1802, "step": 3902 }, { "epoch": 0.19, "grad_norm": 1.2362163227668923, "learning_rate": 1.8728107593690753e-05, "loss": 1.2378, "step": 3903 }, { "epoch": 0.19, "grad_norm": 1.1327031168571218, "learning_rate": 1.872734719528561e-05, "loss": 1.0459, "step": 3904 }, { "epoch": 0.19, "grad_norm": 1.3659739168261704, "learning_rate": 1.872658658509342e-05, "loss": 1.1211, "step": 3905 }, { "epoch": 0.19, "grad_norm": 1.2708763017008287, "learning_rate": 1.872582576313263e-05, "loss": 1.2827, "step": 3906 }, { "epoch": 0.19, "grad_norm": 0.9610992766299828, "learning_rate": 1.8725064729421717e-05, "loss": 1.1343, "step": 3907 }, { "epoch": 0.19, "grad_norm": 1.1744418965325993, "learning_rate": 1.872430348397914e-05, "loss": 1.3696, "step": 3908 }, { "epoch": 0.19, "grad_norm": 1.163794860534079, "learning_rate": 1.8723542026823375e-05, "loss": 1.2505, "step": 3909 }, { "epoch": 0.19, "grad_norm": 1.1645938154070419, "learning_rate": 1.8722780357972903e-05, "loss": 1.1431, "step": 3910 }, { "epoch": 0.19, "grad_norm": 1.3756586166558558, "learning_rate": 1.87220184774462e-05, "loss": 1.1167, "step": 3911 }, { "epoch": 0.19, "grad_norm": 1.4178143226225328, "learning_rate": 1.872125638526176e-05, "loss": 1.1719, "step": 3912 }, { "epoch": 0.19, "grad_norm": 1.8714871332099345, "learning_rate": 1.872049408143808e-05, "loss": 1.2051, "step": 3913 }, { "epoch": 0.19, "grad_norm": 1.3358384519871518, "learning_rate": 1.8719731565993647e-05, "loss": 1.2764, "step": 3914 }, { "epoch": 0.19, "grad_norm": 1.3332416938948266, "learning_rate": 1.871896883894698e-05, "loss": 1.2144, "step": 3915 }, { "epoch": 0.19, "grad_norm": 1.1754403372366058, "learning_rate": 1.8718205900316578e-05, "loss": 1.2646, "step": 3916 }, { "epoch": 0.19, "grad_norm": 1.2972778522489423, "learning_rate": 1.8717442750120956e-05, "loss": 1.2939, "step": 3917 }, { "epoch": 0.19, "grad_norm": 1.5763964280058587, "learning_rate": 1.871667938837864e-05, "loss": 1.2205, "step": 3918 }, { "epoch": 0.19, "grad_norm": 1.2942851967515048, "learning_rate": 1.871591581510815e-05, "loss": 1.126, "step": 3919 }, { "epoch": 0.19, "grad_norm": 1.415956311788413, "learning_rate": 1.871515203032801e-05, "loss": 1.355, "step": 3920 }, { "epoch": 0.19, "grad_norm": 1.352519506332349, "learning_rate": 1.8714388034056764e-05, "loss": 1.1724, "step": 3921 }, { "epoch": 0.19, "grad_norm": 0.9014310260347684, "learning_rate": 1.871362382631295e-05, "loss": 1.2583, "step": 3922 }, { "epoch": 0.19, "grad_norm": 0.8473281680322229, "learning_rate": 1.871285940711511e-05, "loss": 1.0483, "step": 3923 }, { "epoch": 0.19, "grad_norm": 1.131629680354648, "learning_rate": 1.8712094776481798e-05, "loss": 1.3716, "step": 3924 }, { "epoch": 0.19, "grad_norm": 1.1668819565405113, "learning_rate": 1.8711329934431563e-05, "loss": 1.209, "step": 3925 }, { "epoch": 0.19, "grad_norm": 1.1365089860312, "learning_rate": 1.8710564880982975e-05, "loss": 1.1704, "step": 3926 }, { "epoch": 0.19, "grad_norm": 1.24820267432663, "learning_rate": 1.8709799616154587e-05, "loss": 1.2124, "step": 3927 }, { "epoch": 0.19, "grad_norm": 1.056203779023951, "learning_rate": 1.870903413996498e-05, "loss": 1.2036, "step": 3928 }, { "epoch": 0.19, "grad_norm": 1.346414462677292, "learning_rate": 1.870826845243273e-05, "loss": 1.249, "step": 3929 }, { "epoch": 0.19, "grad_norm": 1.5574945850531015, "learning_rate": 1.870750255357641e-05, "loss": 1.3755, "step": 3930 }, { "epoch": 0.19, "grad_norm": 1.3534182082820945, "learning_rate": 1.8706736443414616e-05, "loss": 1.3037, "step": 3931 }, { "epoch": 0.19, "grad_norm": 1.292932590349432, "learning_rate": 1.870597012196593e-05, "loss": 1.3325, "step": 3932 }, { "epoch": 0.19, "grad_norm": 1.3016419000077595, "learning_rate": 1.8705203589248953e-05, "loss": 1.2383, "step": 3933 }, { "epoch": 0.19, "grad_norm": 1.4711409634459682, "learning_rate": 1.8704436845282288e-05, "loss": 1.2305, "step": 3934 }, { "epoch": 0.19, "grad_norm": 1.3201087854345772, "learning_rate": 1.8703669890084536e-05, "loss": 1.146, "step": 3935 }, { "epoch": 0.19, "grad_norm": 1.2880522836509962, "learning_rate": 1.8702902723674317e-05, "loss": 1.0225, "step": 3936 }, { "epoch": 0.19, "grad_norm": 1.2073706475243196, "learning_rate": 1.870213534607024e-05, "loss": 1.124, "step": 3937 }, { "epoch": 0.19, "grad_norm": 1.3722918500196595, "learning_rate": 1.870136775729093e-05, "loss": 1.3149, "step": 3938 }, { "epoch": 0.19, "grad_norm": 1.1412119436060875, "learning_rate": 1.8700599957355017e-05, "loss": 1.1606, "step": 3939 }, { "epoch": 0.19, "grad_norm": 1.2770070656688284, "learning_rate": 1.8699831946281127e-05, "loss": 1.0574, "step": 3940 }, { "epoch": 0.19, "grad_norm": 1.1895870246195248, "learning_rate": 1.8699063724087905e-05, "loss": 1.1846, "step": 3941 }, { "epoch": 0.19, "grad_norm": 1.250305219746284, "learning_rate": 1.869829529079399e-05, "loss": 1.1406, "step": 3942 }, { "epoch": 0.19, "grad_norm": 1.4379638756083881, "learning_rate": 1.869752664641802e-05, "loss": 1.2324, "step": 3943 }, { "epoch": 0.19, "grad_norm": 1.4047790257549924, "learning_rate": 1.8696757790978668e-05, "loss": 1.3726, "step": 3944 }, { "epoch": 0.19, "grad_norm": 0.991124058493983, "learning_rate": 1.8695988724494577e-05, "loss": 1.1021, "step": 3945 }, { "epoch": 0.19, "grad_norm": 1.2007610949203509, "learning_rate": 1.869521944698441e-05, "loss": 1.21, "step": 3946 }, { "epoch": 0.19, "grad_norm": 1.3027661511695565, "learning_rate": 1.869444995846684e-05, "loss": 1.1602, "step": 3947 }, { "epoch": 0.19, "grad_norm": 1.5304224868907503, "learning_rate": 1.8693680258960543e-05, "loss": 1.375, "step": 3948 }, { "epoch": 0.19, "grad_norm": 1.1395084335689283, "learning_rate": 1.869291034848419e-05, "loss": 1.2578, "step": 3949 }, { "epoch": 0.19, "grad_norm": 1.1816648026309864, "learning_rate": 1.8692140227056468e-05, "loss": 1.0859, "step": 3950 }, { "epoch": 0.19, "grad_norm": 1.2253997178548492, "learning_rate": 1.8691369894696064e-05, "loss": 1.2661, "step": 3951 }, { "epoch": 0.19, "grad_norm": 1.5235399730674277, "learning_rate": 1.8690599351421675e-05, "loss": 1.1543, "step": 3952 }, { "epoch": 0.19, "grad_norm": 1.2679788543780428, "learning_rate": 1.8689828597252e-05, "loss": 1.2314, "step": 3953 }, { "epoch": 0.19, "grad_norm": 1.107975330301084, "learning_rate": 1.8689057632205737e-05, "loss": 1.1973, "step": 3954 }, { "epoch": 0.19, "grad_norm": 1.2353064109019314, "learning_rate": 1.8688286456301602e-05, "loss": 1.228, "step": 3955 }, { "epoch": 0.19, "grad_norm": 1.000964267296076, "learning_rate": 1.8687515069558303e-05, "loss": 1.0698, "step": 3956 }, { "epoch": 0.19, "grad_norm": 1.3057040228059902, "learning_rate": 1.8686743471994564e-05, "loss": 1.1865, "step": 3957 }, { "epoch": 0.19, "grad_norm": 1.2506485663763338, "learning_rate": 1.868597166362911e-05, "loss": 1.0251, "step": 3958 }, { "epoch": 0.19, "grad_norm": 1.2267620789141656, "learning_rate": 1.8685199644480663e-05, "loss": 1.1421, "step": 3959 }, { "epoch": 0.19, "grad_norm": 0.9289221285820728, "learning_rate": 1.8684427414567964e-05, "loss": 1.1846, "step": 3960 }, { "epoch": 0.19, "grad_norm": 1.5458055611704382, "learning_rate": 1.8683654973909754e-05, "loss": 1.0938, "step": 3961 }, { "epoch": 0.19, "grad_norm": 1.0991711553182602, "learning_rate": 1.8682882322524777e-05, "loss": 1.1792, "step": 3962 }, { "epoch": 0.19, "grad_norm": 1.4460814595186802, "learning_rate": 1.8682109460431775e-05, "loss": 1.2495, "step": 3963 }, { "epoch": 0.19, "grad_norm": 1.226625539794446, "learning_rate": 1.8681336387649516e-05, "loss": 1.1865, "step": 3964 }, { "epoch": 0.19, "grad_norm": 1.328142744885303, "learning_rate": 1.8680563104196753e-05, "loss": 1.0723, "step": 3965 }, { "epoch": 0.19, "grad_norm": 1.3428319480915605, "learning_rate": 1.867978961009225e-05, "loss": 1.2554, "step": 3966 }, { "epoch": 0.19, "grad_norm": 1.160441924824465, "learning_rate": 1.8679015905354777e-05, "loss": 1.189, "step": 3967 }, { "epoch": 0.19, "grad_norm": 1.4509386785671345, "learning_rate": 1.8678241990003116e-05, "loss": 1.4263, "step": 3968 }, { "epoch": 0.19, "grad_norm": 1.1447097283768006, "learning_rate": 1.8677467864056045e-05, "loss": 1.2588, "step": 3969 }, { "epoch": 0.19, "grad_norm": 1.3178983289632742, "learning_rate": 1.8676693527532344e-05, "loss": 1.3096, "step": 3970 }, { "epoch": 0.19, "grad_norm": 1.3022997373755076, "learning_rate": 1.8675918980450812e-05, "loss": 1.1208, "step": 3971 }, { "epoch": 0.19, "grad_norm": 1.1572461321362646, "learning_rate": 1.8675144222830242e-05, "loss": 1.2539, "step": 3972 }, { "epoch": 0.19, "grad_norm": 1.4055783999786589, "learning_rate": 1.867436925468943e-05, "loss": 1.3281, "step": 3973 }, { "epoch": 0.19, "grad_norm": 1.3462481291131096, "learning_rate": 1.867359407604719e-05, "loss": 1.2378, "step": 3974 }, { "epoch": 0.19, "grad_norm": 1.230309887761091, "learning_rate": 1.867281868692233e-05, "loss": 1.3135, "step": 3975 }, { "epoch": 0.19, "grad_norm": 1.6730470878317056, "learning_rate": 1.8672043087333662e-05, "loss": 1.2915, "step": 3976 }, { "epoch": 0.19, "grad_norm": 1.0625494286405814, "learning_rate": 1.8671267277300015e-05, "loss": 1.1577, "step": 3977 }, { "epoch": 0.19, "grad_norm": 1.1916740770839762, "learning_rate": 1.8670491256840212e-05, "loss": 1.1558, "step": 3978 }, { "epoch": 0.19, "grad_norm": 1.136398315892802, "learning_rate": 1.866971502597309e-05, "loss": 1.1821, "step": 3979 }, { "epoch": 0.19, "grad_norm": 1.4434610840204503, "learning_rate": 1.8668938584717473e-05, "loss": 1.1987, "step": 3980 }, { "epoch": 0.19, "grad_norm": 1.2612515445238641, "learning_rate": 1.8668161933092218e-05, "loss": 1.2036, "step": 3981 }, { "epoch": 0.19, "grad_norm": 1.1360093747059894, "learning_rate": 1.8667385071116157e-05, "loss": 1.1807, "step": 3982 }, { "epoch": 0.19, "grad_norm": 0.9340181030742526, "learning_rate": 1.8666607998808157e-05, "loss": 1.1294, "step": 3983 }, { "epoch": 0.19, "grad_norm": 1.0110360650430625, "learning_rate": 1.8665830716187064e-05, "loss": 1.0996, "step": 3984 }, { "epoch": 0.19, "grad_norm": 0.9615981851748967, "learning_rate": 1.866505322327175e-05, "loss": 1.1831, "step": 3985 }, { "epoch": 0.19, "grad_norm": 1.6591517773470519, "learning_rate": 1.866427552008107e-05, "loss": 1.3486, "step": 3986 }, { "epoch": 0.19, "grad_norm": 1.166510244451287, "learning_rate": 1.866349760663391e-05, "loss": 1.1294, "step": 3987 }, { "epoch": 0.19, "grad_norm": 1.4422220350059793, "learning_rate": 1.8662719482949142e-05, "loss": 1.0378, "step": 3988 }, { "epoch": 0.19, "grad_norm": 0.9679712752166828, "learning_rate": 1.8661941149045646e-05, "loss": 1.1777, "step": 3989 }, { "epoch": 0.19, "grad_norm": 1.3790346285161186, "learning_rate": 1.866116260494231e-05, "loss": 1.2129, "step": 3990 }, { "epoch": 0.19, "grad_norm": 1.27910918646187, "learning_rate": 1.8660383850658033e-05, "loss": 1.2549, "step": 3991 }, { "epoch": 0.19, "grad_norm": 1.3701733361146509, "learning_rate": 1.8659604886211705e-05, "loss": 1.0586, "step": 3992 }, { "epoch": 0.19, "grad_norm": 1.219742831097941, "learning_rate": 1.865882571162224e-05, "loss": 1.033, "step": 3993 }, { "epoch": 0.19, "grad_norm": 1.460602917698877, "learning_rate": 1.8658046326908533e-05, "loss": 1.2861, "step": 3994 }, { "epoch": 0.19, "grad_norm": 1.194225352857267, "learning_rate": 1.8657266732089508e-05, "loss": 1.4536, "step": 3995 }, { "epoch": 0.19, "grad_norm": 1.28681962828183, "learning_rate": 1.865648692718408e-05, "loss": 1.0637, "step": 3996 }, { "epoch": 0.19, "grad_norm": 1.4701572913356562, "learning_rate": 1.865570691221117e-05, "loss": 1.1587, "step": 3997 }, { "epoch": 0.19, "grad_norm": 1.0219382138147313, "learning_rate": 1.8654926687189706e-05, "loss": 1.2065, "step": 3998 }, { "epoch": 0.19, "grad_norm": 1.4286972976619932, "learning_rate": 1.865414625213863e-05, "loss": 1.2373, "step": 3999 }, { "epoch": 0.19, "grad_norm": 1.3527938271436013, "learning_rate": 1.8653365607076873e-05, "loss": 0.9985, "step": 4000 }, { "epoch": 0.19, "grad_norm": 1.180975451229571, "learning_rate": 1.865258475202338e-05, "loss": 1.1411, "step": 4001 }, { "epoch": 0.19, "grad_norm": 1.1307559834843286, "learning_rate": 1.86518036869971e-05, "loss": 1.1685, "step": 4002 }, { "epoch": 0.19, "grad_norm": 1.0874404911391171, "learning_rate": 1.8651022412016993e-05, "loss": 1.1042, "step": 4003 }, { "epoch": 0.19, "grad_norm": 1.2390577376748106, "learning_rate": 1.8650240927102012e-05, "loss": 1.2734, "step": 4004 }, { "epoch": 0.19, "grad_norm": 1.7721751493529376, "learning_rate": 1.8649459232271124e-05, "loss": 1.3252, "step": 4005 }, { "epoch": 0.19, "grad_norm": 1.0920328255182932, "learning_rate": 1.8648677327543297e-05, "loss": 1.2598, "step": 4006 }, { "epoch": 0.19, "grad_norm": 1.2267799807229007, "learning_rate": 1.8647895212937504e-05, "loss": 1.2944, "step": 4007 }, { "epoch": 0.19, "grad_norm": 0.6985251899901863, "learning_rate": 1.8647112888472732e-05, "loss": 1.3057, "step": 4008 }, { "epoch": 0.19, "grad_norm": 0.6985251899901863, "learning_rate": 1.8647112888472732e-05, "loss": 1.292, "step": 4009 }, { "epoch": 0.19, "grad_norm": 0.6985251899901863, "learning_rate": 1.8647112888472732e-05, "loss": 1.3335, "step": 4010 }, { "epoch": 0.19, "grad_norm": 1.2072504222586309, "learning_rate": 1.8646330354167955e-05, "loss": 1.2251, "step": 4011 }, { "epoch": 0.19, "grad_norm": 0.9971013369200885, "learning_rate": 1.8645547610042173e-05, "loss": 1.2759, "step": 4012 }, { "epoch": 0.19, "grad_norm": 1.1713869399847658, "learning_rate": 1.864476465611437e-05, "loss": 1.2954, "step": 4013 }, { "epoch": 0.19, "grad_norm": 1.2119973031274707, "learning_rate": 1.8643981492403557e-05, "loss": 1.4028, "step": 4014 }, { "epoch": 0.19, "grad_norm": 1.5477807570315671, "learning_rate": 1.864319811892873e-05, "loss": 1.3501, "step": 4015 }, { "epoch": 0.19, "grad_norm": 1.31246692751042, "learning_rate": 1.864241453570891e-05, "loss": 1.2021, "step": 4016 }, { "epoch": 0.19, "grad_norm": 1.3950640748743661, "learning_rate": 1.86416307427631e-05, "loss": 1.1855, "step": 4017 }, { "epoch": 0.19, "grad_norm": 1.1711374395497032, "learning_rate": 1.8640846740110327e-05, "loss": 1.2178, "step": 4018 }, { "epoch": 0.19, "grad_norm": 1.4784912898470401, "learning_rate": 1.8640062527769615e-05, "loss": 1.2334, "step": 4019 }, { "epoch": 0.19, "grad_norm": 1.1598381009710796, "learning_rate": 1.8639278105759998e-05, "loss": 1.1895, "step": 4020 }, { "epoch": 0.19, "grad_norm": 1.2900965614272344, "learning_rate": 1.8638493474100507e-05, "loss": 1.3613, "step": 4021 }, { "epoch": 0.19, "grad_norm": 1.5011005466112193, "learning_rate": 1.8637708632810185e-05, "loss": 1.2202, "step": 4022 }, { "epoch": 0.19, "grad_norm": 1.1908129171224233, "learning_rate": 1.8636923581908074e-05, "loss": 1.1436, "step": 4023 }, { "epoch": 0.19, "grad_norm": 1.3704352330799328, "learning_rate": 1.863613832141323e-05, "loss": 1.1973, "step": 4024 }, { "epoch": 0.19, "grad_norm": 1.5780296807059113, "learning_rate": 1.8635352851344707e-05, "loss": 1.1587, "step": 4025 }, { "epoch": 0.19, "grad_norm": 1.3755307848363783, "learning_rate": 1.8634567171721567e-05, "loss": 1.1958, "step": 4026 }, { "epoch": 0.19, "grad_norm": 1.0196600743124304, "learning_rate": 1.8633781282562875e-05, "loss": 1.1816, "step": 4027 }, { "epoch": 0.19, "grad_norm": 1.1135500516869925, "learning_rate": 1.8632995183887697e-05, "loss": 1.1689, "step": 4028 }, { "epoch": 0.19, "grad_norm": 1.5794191723626292, "learning_rate": 1.8632208875715122e-05, "loss": 1.3574, "step": 4029 }, { "epoch": 0.19, "grad_norm": 1.2853998276527956, "learning_rate": 1.8631422358064218e-05, "loss": 1.2651, "step": 4030 }, { "epoch": 0.19, "grad_norm": 1.2070200690549162, "learning_rate": 1.8630635630954083e-05, "loss": 1.2324, "step": 4031 }, { "epoch": 0.19, "grad_norm": 0.8296746451782641, "learning_rate": 1.86298486944038e-05, "loss": 1.189, "step": 4032 }, { "epoch": 0.19, "grad_norm": 1.1361775403737224, "learning_rate": 1.862906154843247e-05, "loss": 1.2124, "step": 4033 }, { "epoch": 0.19, "grad_norm": 1.277653276869562, "learning_rate": 1.8628274193059193e-05, "loss": 1.4023, "step": 4034 }, { "epoch": 0.19, "grad_norm": 1.1822094866439385, "learning_rate": 1.8627486628303076e-05, "loss": 1.3003, "step": 4035 }, { "epoch": 0.19, "grad_norm": 1.7984650888984377, "learning_rate": 1.862669885418323e-05, "loss": 1.4639, "step": 4036 }, { "epoch": 0.19, "grad_norm": 1.211996793703571, "learning_rate": 1.8625910870718775e-05, "loss": 1.2715, "step": 4037 }, { "epoch": 0.19, "grad_norm": 1.3122843321113327, "learning_rate": 1.862512267792883e-05, "loss": 1.208, "step": 4038 }, { "epoch": 0.19, "grad_norm": 1.270237328766837, "learning_rate": 1.8624334275832522e-05, "loss": 1.272, "step": 4039 }, { "epoch": 0.19, "grad_norm": 1.2435806751164216, "learning_rate": 1.8623545664448987e-05, "loss": 1.1172, "step": 4040 }, { "epoch": 0.19, "grad_norm": 0.9365948884497056, "learning_rate": 1.8622756843797356e-05, "loss": 1.2339, "step": 4041 }, { "epoch": 0.19, "grad_norm": 1.1310258572948733, "learning_rate": 1.8621967813896776e-05, "loss": 1.1899, "step": 4042 }, { "epoch": 0.19, "grad_norm": 1.147296773685079, "learning_rate": 1.8621178574766397e-05, "loss": 1.2378, "step": 4043 }, { "epoch": 0.19, "grad_norm": 1.156570515712008, "learning_rate": 1.8620389126425365e-05, "loss": 1.168, "step": 4044 }, { "epoch": 0.19, "grad_norm": 1.085455703315342, "learning_rate": 1.861959946889284e-05, "loss": 1.1611, "step": 4045 }, { "epoch": 0.19, "grad_norm": 1.0750719752527533, "learning_rate": 1.8618809602187987e-05, "loss": 1.2363, "step": 4046 }, { "epoch": 0.19, "grad_norm": 1.4296058674166023, "learning_rate": 1.861801952632997e-05, "loss": 1.4385, "step": 4047 }, { "epoch": 0.19, "grad_norm": 1.2628358213212882, "learning_rate": 1.8617229241337967e-05, "loss": 1.186, "step": 4048 }, { "epoch": 0.19, "grad_norm": 1.2992977698904784, "learning_rate": 1.861643874723115e-05, "loss": 1.0522, "step": 4049 }, { "epoch": 0.19, "grad_norm": 1.1832575411770572, "learning_rate": 1.86156480440287e-05, "loss": 1.1738, "step": 4050 }, { "epoch": 0.19, "grad_norm": 1.176487909262996, "learning_rate": 1.8614857131749818e-05, "loss": 1.0251, "step": 4051 }, { "epoch": 0.19, "grad_norm": 1.3183352762968201, "learning_rate": 1.8614066010413686e-05, "loss": 1.1968, "step": 4052 }, { "epoch": 0.19, "grad_norm": 1.299496089846326, "learning_rate": 1.8613274680039506e-05, "loss": 1.0444, "step": 4053 }, { "epoch": 0.19, "grad_norm": 1.3557618564047709, "learning_rate": 1.861248314064648e-05, "loss": 1.3291, "step": 4054 }, { "epoch": 0.2, "grad_norm": 1.7274637403124014, "learning_rate": 1.8611691392253814e-05, "loss": 1.2808, "step": 4055 }, { "epoch": 0.2, "grad_norm": 1.3819401540973475, "learning_rate": 1.8610899434880724e-05, "loss": 1.2432, "step": 4056 }, { "epoch": 0.2, "grad_norm": 1.3127849647201817, "learning_rate": 1.861010726854643e-05, "loss": 1.1602, "step": 4057 }, { "epoch": 0.2, "grad_norm": 1.222572807489657, "learning_rate": 1.8609314893270155e-05, "loss": 1.0508, "step": 4058 }, { "epoch": 0.2, "grad_norm": 1.5190225340208345, "learning_rate": 1.8608522309071128e-05, "loss": 1.2974, "step": 4059 }, { "epoch": 0.2, "grad_norm": 1.0858464472728957, "learning_rate": 1.860772951596858e-05, "loss": 1.1238, "step": 4060 }, { "epoch": 0.2, "grad_norm": 1.1799474681391802, "learning_rate": 1.8606936513981745e-05, "loss": 1.1465, "step": 4061 }, { "epoch": 0.2, "grad_norm": 1.1793517171025751, "learning_rate": 1.860614330312988e-05, "loss": 1.1929, "step": 4062 }, { "epoch": 0.2, "grad_norm": 1.0906440698647153, "learning_rate": 1.8605349883432223e-05, "loss": 1.1353, "step": 4063 }, { "epoch": 0.2, "grad_norm": 1.17589073592679, "learning_rate": 1.8604556254908034e-05, "loss": 1.0181, "step": 4064 }, { "epoch": 0.2, "grad_norm": 1.1227455590242628, "learning_rate": 1.860376241757657e-05, "loss": 1.2456, "step": 4065 }, { "epoch": 0.2, "grad_norm": 1.4229203457135131, "learning_rate": 1.8602968371457094e-05, "loss": 1.1938, "step": 4066 }, { "epoch": 0.2, "grad_norm": 1.1744472829555495, "learning_rate": 1.8602174116568876e-05, "loss": 1.2485, "step": 4067 }, { "epoch": 0.2, "grad_norm": 1.2439463105745132, "learning_rate": 1.860137965293119e-05, "loss": 1.1721, "step": 4068 }, { "epoch": 0.2, "grad_norm": 1.1310561592468147, "learning_rate": 1.8600584980563316e-05, "loss": 1.0874, "step": 4069 }, { "epoch": 0.2, "grad_norm": 1.1025239353376606, "learning_rate": 1.859979009948454e-05, "loss": 1.1265, "step": 4070 }, { "epoch": 0.2, "grad_norm": 1.3759714244344854, "learning_rate": 1.8598995009714145e-05, "loss": 1.2119, "step": 4071 }, { "epoch": 0.2, "grad_norm": 1.3976468760554495, "learning_rate": 1.8598199711271433e-05, "loss": 1.2642, "step": 4072 }, { "epoch": 0.2, "grad_norm": 1.1419146594021978, "learning_rate": 1.85974042041757e-05, "loss": 1.126, "step": 4073 }, { "epoch": 0.2, "grad_norm": 1.3440392288596028, "learning_rate": 1.859660848844625e-05, "loss": 1.3516, "step": 4074 }, { "epoch": 0.2, "grad_norm": 1.2532012475804237, "learning_rate": 1.8595812564102393e-05, "loss": 1.2163, "step": 4075 }, { "epoch": 0.2, "grad_norm": 1.1395847342928271, "learning_rate": 1.8595016431163448e-05, "loss": 1.0503, "step": 4076 }, { "epoch": 0.2, "grad_norm": 1.289748910948621, "learning_rate": 1.8594220089648727e-05, "loss": 1.2983, "step": 4077 }, { "epoch": 0.2, "grad_norm": 1.2032993748683838, "learning_rate": 1.8593423539577565e-05, "loss": 1.2114, "step": 4078 }, { "epoch": 0.2, "grad_norm": 1.0451257690461093, "learning_rate": 1.859262678096928e-05, "loss": 1.292, "step": 4079 }, { "epoch": 0.2, "grad_norm": 1.130293144287229, "learning_rate": 1.8591829813843215e-05, "loss": 1.1836, "step": 4080 }, { "epoch": 0.2, "grad_norm": 1.282573577566955, "learning_rate": 1.859103263821871e-05, "loss": 1.2993, "step": 4081 }, { "epoch": 0.2, "grad_norm": 1.5304038829234583, "learning_rate": 1.8590235254115105e-05, "loss": 1.3652, "step": 4082 }, { "epoch": 0.2, "grad_norm": 1.3743093601276695, "learning_rate": 1.8589437661551756e-05, "loss": 1.1895, "step": 4083 }, { "epoch": 0.2, "grad_norm": 1.018150619419648, "learning_rate": 1.8588639860548012e-05, "loss": 1.1802, "step": 4084 }, { "epoch": 0.2, "grad_norm": 1.0584588759529323, "learning_rate": 1.858784185112324e-05, "loss": 1.0977, "step": 4085 }, { "epoch": 0.2, "grad_norm": 1.282416211463359, "learning_rate": 1.85870436332968e-05, "loss": 1.1113, "step": 4086 }, { "epoch": 0.2, "grad_norm": 1.377231446417847, "learning_rate": 1.8586245207088068e-05, "loss": 1.2441, "step": 4087 }, { "epoch": 0.2, "grad_norm": 1.1053607513835888, "learning_rate": 1.8585446572516416e-05, "loss": 0.9426, "step": 4088 }, { "epoch": 0.2, "grad_norm": 1.444169813642195, "learning_rate": 1.858464772960122e-05, "loss": 1.2349, "step": 4089 }, { "epoch": 0.2, "grad_norm": 1.3352340136898877, "learning_rate": 1.858384867836187e-05, "loss": 1.1543, "step": 4090 }, { "epoch": 0.2, "grad_norm": 1.17589012656732, "learning_rate": 1.8583049418817764e-05, "loss": 1.0894, "step": 4091 }, { "epoch": 0.2, "grad_norm": 1.33890680327144, "learning_rate": 1.8582249950988285e-05, "loss": 1.2612, "step": 4092 }, { "epoch": 0.2, "grad_norm": 1.3516983841445027, "learning_rate": 1.8581450274892842e-05, "loss": 1.1357, "step": 4093 }, { "epoch": 0.2, "grad_norm": 1.2519027242997316, "learning_rate": 1.8580650390550835e-05, "loss": 1.1287, "step": 4094 }, { "epoch": 0.2, "grad_norm": 1.548654717704127, "learning_rate": 1.857985029798168e-05, "loss": 1.3433, "step": 4095 }, { "epoch": 0.2, "grad_norm": 1.3082021829750532, "learning_rate": 1.857904999720479e-05, "loss": 1.1816, "step": 4096 }, { "epoch": 0.2, "grad_norm": 1.0011880186548685, "learning_rate": 1.8578249488239584e-05, "loss": 1.1895, "step": 4097 }, { "epoch": 0.2, "grad_norm": 1.1643480642299666, "learning_rate": 1.8577448771105494e-05, "loss": 1.2129, "step": 4098 }, { "epoch": 0.2, "grad_norm": 1.1465602586939319, "learning_rate": 1.8576647845821947e-05, "loss": 1.1309, "step": 4099 }, { "epoch": 0.2, "grad_norm": 1.4399496754213852, "learning_rate": 1.857584671240838e-05, "loss": 1.2363, "step": 4100 }, { "epoch": 0.2, "grad_norm": 1.2431984412687682, "learning_rate": 1.8575045370884232e-05, "loss": 1.0767, "step": 4101 }, { "epoch": 0.2, "grad_norm": 1.1150801915026713, "learning_rate": 1.8574243821268953e-05, "loss": 1.1016, "step": 4102 }, { "epoch": 0.2, "grad_norm": 1.5024062819185104, "learning_rate": 1.857344206358199e-05, "loss": 1.1899, "step": 4103 }, { "epoch": 0.2, "grad_norm": 1.258775897807646, "learning_rate": 1.8572640097842804e-05, "loss": 1.0237, "step": 4104 }, { "epoch": 0.2, "grad_norm": 1.1999899557197724, "learning_rate": 1.8571837924070853e-05, "loss": 1.1973, "step": 4105 }, { "epoch": 0.2, "grad_norm": 1.3318212018843154, "learning_rate": 1.857103554228561e-05, "loss": 1.2231, "step": 4106 }, { "epoch": 0.2, "grad_norm": 1.3423468104856457, "learning_rate": 1.857023295250653e-05, "loss": 1.3101, "step": 4107 }, { "epoch": 0.2, "grad_norm": 1.1308608711999477, "learning_rate": 1.856943015475311e-05, "loss": 1.1401, "step": 4108 }, { "epoch": 0.2, "grad_norm": 1.3013475580474347, "learning_rate": 1.856862714904482e-05, "loss": 1.2319, "step": 4109 }, { "epoch": 0.2, "grad_norm": 1.443788370474475, "learning_rate": 1.8567823935401145e-05, "loss": 1.3726, "step": 4110 }, { "epoch": 0.2, "grad_norm": 1.1322124424621796, "learning_rate": 1.8567020513841582e-05, "loss": 1.2368, "step": 4111 }, { "epoch": 0.2, "grad_norm": 0.8735615807465672, "learning_rate": 1.8566216884385625e-05, "loss": 1.1838, "step": 4112 }, { "epoch": 0.2, "grad_norm": 1.3664829469756556, "learning_rate": 1.8565413047052778e-05, "loss": 1.2979, "step": 4113 }, { "epoch": 0.2, "grad_norm": 1.2550115910719717, "learning_rate": 1.8564609001862547e-05, "loss": 1.3135, "step": 4114 }, { "epoch": 0.2, "grad_norm": 1.3225248418499602, "learning_rate": 1.856380474883444e-05, "loss": 1.2725, "step": 4115 }, { "epoch": 0.2, "grad_norm": 1.3487911200666083, "learning_rate": 1.856300028798798e-05, "loss": 1.2432, "step": 4116 }, { "epoch": 0.2, "grad_norm": 1.3151088171468028, "learning_rate": 1.8562195619342684e-05, "loss": 1.25, "step": 4117 }, { "epoch": 0.2, "grad_norm": 1.0509979454235223, "learning_rate": 1.856139074291808e-05, "loss": 1.1836, "step": 4118 }, { "epoch": 0.2, "grad_norm": 1.1790086887586955, "learning_rate": 1.8560585658733707e-05, "loss": 1.1484, "step": 4119 }, { "epoch": 0.2, "grad_norm": 1.1937663594067485, "learning_rate": 1.855978036680909e-05, "loss": 1.1958, "step": 4120 }, { "epoch": 0.2, "grad_norm": 1.0342488538230754, "learning_rate": 1.8558974867163778e-05, "loss": 1.0681, "step": 4121 }, { "epoch": 0.2, "grad_norm": 1.2401356405511152, "learning_rate": 1.8558169159817316e-05, "loss": 1.2666, "step": 4122 }, { "epoch": 0.2, "grad_norm": 1.2353932889679449, "learning_rate": 1.855736324478926e-05, "loss": 1.1138, "step": 4123 }, { "epoch": 0.2, "grad_norm": 1.188501278773705, "learning_rate": 1.8556557122099163e-05, "loss": 1.1626, "step": 4124 }, { "epoch": 0.2, "grad_norm": 1.241308529981415, "learning_rate": 1.8555750791766588e-05, "loss": 1.127, "step": 4125 }, { "epoch": 0.2, "grad_norm": 0.9194257544323624, "learning_rate": 1.8554944253811103e-05, "loss": 1.0869, "step": 4126 }, { "epoch": 0.2, "grad_norm": 1.3874513770403745, "learning_rate": 1.8554137508252278e-05, "loss": 1.1812, "step": 4127 }, { "epoch": 0.2, "grad_norm": 1.6114129529297765, "learning_rate": 1.8553330555109696e-05, "loss": 1.021, "step": 4128 }, { "epoch": 0.2, "grad_norm": 0.9140859211868847, "learning_rate": 1.8552523394402933e-05, "loss": 1.2339, "step": 4129 }, { "epoch": 0.2, "grad_norm": 1.0200767486149613, "learning_rate": 1.8551716026151584e-05, "loss": 1.2285, "step": 4130 }, { "epoch": 0.2, "grad_norm": 1.1360759961305973, "learning_rate": 1.8550908450375232e-05, "loss": 1.2036, "step": 4131 }, { "epoch": 0.2, "grad_norm": 1.334897294547008, "learning_rate": 1.855010066709348e-05, "loss": 1.2817, "step": 4132 }, { "epoch": 0.2, "grad_norm": 1.0445626300196393, "learning_rate": 1.8549292676325935e-05, "loss": 1.2119, "step": 4133 }, { "epoch": 0.2, "grad_norm": 1.1694187508678697, "learning_rate": 1.8548484478092192e-05, "loss": 1.2539, "step": 4134 }, { "epoch": 0.2, "grad_norm": 1.3204599814527618, "learning_rate": 1.8547676072411874e-05, "loss": 1.2715, "step": 4135 }, { "epoch": 0.2, "grad_norm": 1.267508517666056, "learning_rate": 1.8546867459304595e-05, "loss": 1.0735, "step": 4136 }, { "epoch": 0.2, "grad_norm": 1.3529333658156226, "learning_rate": 1.8546058638789982e-05, "loss": 1.022, "step": 4137 }, { "epoch": 0.2, "grad_norm": 1.3988577317646531, "learning_rate": 1.8545249610887653e-05, "loss": 1.1865, "step": 4138 }, { "epoch": 0.2, "grad_norm": 1.3622125402700604, "learning_rate": 1.854444037561725e-05, "loss": 1.3599, "step": 4139 }, { "epoch": 0.2, "grad_norm": 1.268202136096835, "learning_rate": 1.8543630932998408e-05, "loss": 1.2241, "step": 4140 }, { "epoch": 0.2, "grad_norm": 1.2668940880671669, "learning_rate": 1.8542821283050766e-05, "loss": 1.2798, "step": 4141 }, { "epoch": 0.2, "grad_norm": 1.1186714887735982, "learning_rate": 1.8542011425793976e-05, "loss": 1.0989, "step": 4142 }, { "epoch": 0.2, "grad_norm": 1.1720288135966577, "learning_rate": 1.854120136124769e-05, "loss": 1.3032, "step": 4143 }, { "epoch": 0.2, "grad_norm": 1.3015133164185093, "learning_rate": 1.8540391089431566e-05, "loss": 1.0559, "step": 4144 }, { "epoch": 0.2, "grad_norm": 1.14830781398766, "learning_rate": 1.8539580610365267e-05, "loss": 1.2021, "step": 4145 }, { "epoch": 0.2, "grad_norm": 1.3145868499971776, "learning_rate": 1.853876992406846e-05, "loss": 1.1064, "step": 4146 }, { "epoch": 0.2, "grad_norm": 1.2347419765602072, "learning_rate": 1.853795903056082e-05, "loss": 1.1523, "step": 4147 }, { "epoch": 0.2, "grad_norm": 1.330156791648631, "learning_rate": 1.8537147929862023e-05, "loss": 1.2114, "step": 4148 }, { "epoch": 0.2, "grad_norm": 1.331656795511666, "learning_rate": 1.853633662199175e-05, "loss": 1.249, "step": 4149 }, { "epoch": 0.2, "grad_norm": 1.3549114614257127, "learning_rate": 1.8535525106969694e-05, "loss": 1.2676, "step": 4150 }, { "epoch": 0.2, "grad_norm": 1.2517788727714279, "learning_rate": 1.8534713384815548e-05, "loss": 0.9663, "step": 4151 }, { "epoch": 0.2, "grad_norm": 0.9985303710454936, "learning_rate": 1.8533901455549005e-05, "loss": 1.2173, "step": 4152 }, { "epoch": 0.2, "grad_norm": 1.4754270682675756, "learning_rate": 1.853308931918977e-05, "loss": 1.228, "step": 4153 }, { "epoch": 0.2, "grad_norm": 1.086503077286127, "learning_rate": 1.8532276975757554e-05, "loss": 1.3154, "step": 4154 }, { "epoch": 0.2, "grad_norm": 1.1673689470719117, "learning_rate": 1.8531464425272067e-05, "loss": 1.3208, "step": 4155 }, { "epoch": 0.2, "grad_norm": 1.2607448923119657, "learning_rate": 1.853065166775303e-05, "loss": 1.2144, "step": 4156 }, { "epoch": 0.2, "grad_norm": 1.1611744516404576, "learning_rate": 1.8529838703220164e-05, "loss": 1.229, "step": 4157 }, { "epoch": 0.2, "grad_norm": 1.4408591685869967, "learning_rate": 1.85290255316932e-05, "loss": 1.1816, "step": 4158 }, { "epoch": 0.2, "grad_norm": 1.4020591277558896, "learning_rate": 1.8528212153191868e-05, "loss": 1.1772, "step": 4159 }, { "epoch": 0.2, "grad_norm": 1.2194187538832462, "learning_rate": 1.8527398567735904e-05, "loss": 1.395, "step": 4160 }, { "epoch": 0.2, "grad_norm": 1.067501965280121, "learning_rate": 1.852658477534506e-05, "loss": 1.1577, "step": 4161 }, { "epoch": 0.2, "grad_norm": 0.9527127994954667, "learning_rate": 1.8525770776039077e-05, "loss": 1.2627, "step": 4162 }, { "epoch": 0.2, "grad_norm": 1.2850949171434338, "learning_rate": 1.852495656983771e-05, "loss": 1.2603, "step": 4163 }, { "epoch": 0.2, "grad_norm": 1.359353225780469, "learning_rate": 1.852414215676072e-05, "loss": 1.25, "step": 4164 }, { "epoch": 0.2, "grad_norm": 1.2000712052211069, "learning_rate": 1.8523327536827866e-05, "loss": 1.1538, "step": 4165 }, { "epoch": 0.2, "grad_norm": 0.9872529213860134, "learning_rate": 1.852251271005892e-05, "loss": 1.3633, "step": 4166 }, { "epoch": 0.2, "grad_norm": 1.2057145478190794, "learning_rate": 1.852169767647366e-05, "loss": 1.2622, "step": 4167 }, { "epoch": 0.2, "grad_norm": 1.4214380282252763, "learning_rate": 1.852088243609185e-05, "loss": 1.3071, "step": 4168 }, { "epoch": 0.2, "grad_norm": 1.0650573271121984, "learning_rate": 1.8520066988933286e-05, "loss": 1.1367, "step": 4169 }, { "epoch": 0.2, "grad_norm": 1.2725866491690976, "learning_rate": 1.8519251335017753e-05, "loss": 1.2646, "step": 4170 }, { "epoch": 0.2, "grad_norm": 1.2355227136123728, "learning_rate": 1.8518435474365045e-05, "loss": 1.1875, "step": 4171 }, { "epoch": 0.2, "grad_norm": 1.1069896540290733, "learning_rate": 1.8517619406994956e-05, "loss": 1.2646, "step": 4172 }, { "epoch": 0.2, "grad_norm": 1.0901824238326834, "learning_rate": 1.8516803132927296e-05, "loss": 1.1021, "step": 4173 }, { "epoch": 0.2, "grad_norm": 1.3446969946823994, "learning_rate": 1.8515986652181873e-05, "loss": 1.272, "step": 4174 }, { "epoch": 0.2, "grad_norm": 1.1282039494541214, "learning_rate": 1.8515169964778496e-05, "loss": 1.0696, "step": 4175 }, { "epoch": 0.2, "grad_norm": 1.1888091573612913, "learning_rate": 1.8514353070736987e-05, "loss": 1.1533, "step": 4176 }, { "epoch": 0.2, "grad_norm": 1.4288224317460807, "learning_rate": 1.851353597007717e-05, "loss": 1.1772, "step": 4177 }, { "epoch": 0.2, "grad_norm": 1.5190094185248741, "learning_rate": 1.851271866281887e-05, "loss": 1.1865, "step": 4178 }, { "epoch": 0.2, "grad_norm": 1.3632856713110568, "learning_rate": 1.8511901148981922e-05, "loss": 1.0034, "step": 4179 }, { "epoch": 0.2, "grad_norm": 1.2095574863012208, "learning_rate": 1.8511083428586172e-05, "loss": 1.1704, "step": 4180 }, { "epoch": 0.2, "grad_norm": 1.1804983312626556, "learning_rate": 1.8510265501651454e-05, "loss": 1.1562, "step": 4181 }, { "epoch": 0.2, "grad_norm": 1.2736008950834392, "learning_rate": 1.8509447368197617e-05, "loss": 1.1646, "step": 4182 }, { "epoch": 0.2, "grad_norm": 1.391396046968214, "learning_rate": 1.850862902824452e-05, "loss": 1.1455, "step": 4183 }, { "epoch": 0.2, "grad_norm": 1.279392491854659, "learning_rate": 1.850781048181202e-05, "loss": 1.2148, "step": 4184 }, { "epoch": 0.2, "grad_norm": 1.3934675451867045, "learning_rate": 1.850699172891998e-05, "loss": 1.29, "step": 4185 }, { "epoch": 0.2, "grad_norm": 1.399065926106856, "learning_rate": 1.850617276958827e-05, "loss": 1.3066, "step": 4186 }, { "epoch": 0.2, "grad_norm": 1.342277947271469, "learning_rate": 1.8505353603836756e-05, "loss": 1.2593, "step": 4187 }, { "epoch": 0.2, "grad_norm": 1.2433572565931112, "learning_rate": 1.8504534231685332e-05, "loss": 1.27, "step": 4188 }, { "epoch": 0.2, "grad_norm": 1.5413421882822174, "learning_rate": 1.8503714653153867e-05, "loss": 1.2002, "step": 4189 }, { "epoch": 0.2, "grad_norm": 1.5940626006428704, "learning_rate": 1.850289486826226e-05, "loss": 1.4697, "step": 4190 }, { "epoch": 0.2, "grad_norm": 1.2007115894130946, "learning_rate": 1.85020748770304e-05, "loss": 1.1597, "step": 4191 }, { "epoch": 0.2, "grad_norm": 1.2748548530703248, "learning_rate": 1.8501254679478188e-05, "loss": 1.2124, "step": 4192 }, { "epoch": 0.2, "grad_norm": 1.3580175101456484, "learning_rate": 1.850043427562552e-05, "loss": 1.3315, "step": 4193 }, { "epoch": 0.2, "grad_norm": 1.2679041459496958, "learning_rate": 1.8499613665492317e-05, "loss": 1.1445, "step": 4194 }, { "epoch": 0.2, "grad_norm": 1.308714992695532, "learning_rate": 1.8498792849098482e-05, "loss": 1.1714, "step": 4195 }, { "epoch": 0.2, "grad_norm": 1.1419236233975938, "learning_rate": 1.8497971826463943e-05, "loss": 1.1387, "step": 4196 }, { "epoch": 0.2, "grad_norm": 1.3680264206537802, "learning_rate": 1.849715059760862e-05, "loss": 1.0938, "step": 4197 }, { "epoch": 0.2, "grad_norm": 1.0554024630881416, "learning_rate": 1.8496329162552437e-05, "loss": 1.1909, "step": 4198 }, { "epoch": 0.2, "grad_norm": 1.3247361815669028, "learning_rate": 1.8495507521315333e-05, "loss": 1.3242, "step": 4199 }, { "epoch": 0.2, "grad_norm": 1.3691684259102028, "learning_rate": 1.8494685673917248e-05, "loss": 1.1631, "step": 4200 }, { "epoch": 0.2, "grad_norm": 0.9816598401959337, "learning_rate": 1.8493863620378123e-05, "loss": 1.0603, "step": 4201 }, { "epoch": 0.2, "grad_norm": 1.302983550842362, "learning_rate": 1.8493041360717908e-05, "loss": 1.0918, "step": 4202 }, { "epoch": 0.2, "grad_norm": 1.140101638460897, "learning_rate": 1.8492218894956555e-05, "loss": 1.0808, "step": 4203 }, { "epoch": 0.2, "grad_norm": 1.1254497839665953, "learning_rate": 1.8491396223114024e-05, "loss": 1.083, "step": 4204 }, { "epoch": 0.2, "grad_norm": 1.2468739506945612, "learning_rate": 1.849057334521028e-05, "loss": 1.1709, "step": 4205 }, { "epoch": 0.2, "grad_norm": 1.2713147208405864, "learning_rate": 1.848975026126529e-05, "loss": 1.0715, "step": 4206 }, { "epoch": 0.2, "grad_norm": 1.4647761320819583, "learning_rate": 1.848892697129903e-05, "loss": 1.1387, "step": 4207 }, { "epoch": 0.2, "grad_norm": 1.451367144306609, "learning_rate": 1.8488103475331476e-05, "loss": 1.1494, "step": 4208 }, { "epoch": 0.2, "grad_norm": 1.1748359046691985, "learning_rate": 1.8487279773382613e-05, "loss": 1.252, "step": 4209 }, { "epoch": 0.2, "grad_norm": 1.451547846636832, "learning_rate": 1.848645586547243e-05, "loss": 1.3037, "step": 4210 }, { "epoch": 0.2, "grad_norm": 1.071505911075315, "learning_rate": 1.848563175162092e-05, "loss": 1.1069, "step": 4211 }, { "epoch": 0.2, "grad_norm": 1.298805880463429, "learning_rate": 1.8484807431848085e-05, "loss": 1.1328, "step": 4212 }, { "epoch": 0.2, "grad_norm": 1.3489296035658007, "learning_rate": 1.8483982906173928e-05, "loss": 1.2246, "step": 4213 }, { "epoch": 0.2, "grad_norm": 1.7049457628546592, "learning_rate": 1.8483158174618456e-05, "loss": 1.2197, "step": 4214 }, { "epoch": 0.2, "grad_norm": 1.3715469652769137, "learning_rate": 1.8482333237201678e-05, "loss": 1.2021, "step": 4215 }, { "epoch": 0.2, "grad_norm": 1.2749093909537395, "learning_rate": 1.8481508093943622e-05, "loss": 1.0974, "step": 4216 }, { "epoch": 0.2, "grad_norm": 1.5052534009984304, "learning_rate": 1.8480682744864306e-05, "loss": 1.2832, "step": 4217 }, { "epoch": 0.2, "grad_norm": 1.1086398386383465, "learning_rate": 1.8479857189983762e-05, "loss": 1.2014, "step": 4218 }, { "epoch": 0.2, "grad_norm": 1.1528936808295647, "learning_rate": 1.8479031429322022e-05, "loss": 1.127, "step": 4219 }, { "epoch": 0.2, "grad_norm": 1.2366414510234975, "learning_rate": 1.8478205462899127e-05, "loss": 1.3101, "step": 4220 }, { "epoch": 0.2, "grad_norm": 1.3126704712487294, "learning_rate": 1.8477379290735117e-05, "loss": 1.3428, "step": 4221 }, { "epoch": 0.2, "grad_norm": 1.267665747581158, "learning_rate": 1.8476552912850044e-05, "loss": 1.0806, "step": 4222 }, { "epoch": 0.2, "grad_norm": 1.207512205747138, "learning_rate": 1.8475726329263958e-05, "loss": 1.207, "step": 4223 }, { "epoch": 0.2, "grad_norm": 1.547937287442803, "learning_rate": 1.8474899539996923e-05, "loss": 1.2681, "step": 4224 }, { "epoch": 0.2, "grad_norm": 1.167564111040716, "learning_rate": 1.8474072545068998e-05, "loss": 1.2451, "step": 4225 }, { "epoch": 0.2, "grad_norm": 1.3219273124657043, "learning_rate": 1.8473245344500254e-05, "loss": 1.2969, "step": 4226 }, { "epoch": 0.2, "grad_norm": 1.5052245514323321, "learning_rate": 1.8472417938310765e-05, "loss": 1.2646, "step": 4227 }, { "epoch": 0.2, "grad_norm": 1.4758628237977864, "learning_rate": 1.8471590326520607e-05, "loss": 1.2202, "step": 4228 }, { "epoch": 0.2, "grad_norm": 1.8327802442789343, "learning_rate": 1.8470762509149867e-05, "loss": 1.4146, "step": 4229 }, { "epoch": 0.2, "grad_norm": 1.282405552669812, "learning_rate": 1.8469934486218634e-05, "loss": 1.3384, "step": 4230 }, { "epoch": 0.2, "grad_norm": 1.0689462947001933, "learning_rate": 1.8469106257747002e-05, "loss": 1.3042, "step": 4231 }, { "epoch": 0.2, "grad_norm": 1.4404399510897399, "learning_rate": 1.846827782375506e-05, "loss": 1.3667, "step": 4232 }, { "epoch": 0.2, "grad_norm": 1.335211588858977, "learning_rate": 1.8467449184262927e-05, "loss": 1.23, "step": 4233 }, { "epoch": 0.2, "grad_norm": 1.2171570851601994, "learning_rate": 1.84666203392907e-05, "loss": 1.1084, "step": 4234 }, { "epoch": 0.2, "grad_norm": 1.2848395775847028, "learning_rate": 1.84657912888585e-05, "loss": 0.9661, "step": 4235 }, { "epoch": 0.2, "grad_norm": 1.402225621396294, "learning_rate": 1.846496203298644e-05, "loss": 1.0029, "step": 4236 }, { "epoch": 0.2, "grad_norm": 0.878622540245646, "learning_rate": 1.8464132571694648e-05, "loss": 1.2715, "step": 4237 }, { "epoch": 0.2, "grad_norm": 1.0552685633034466, "learning_rate": 1.8463302905003247e-05, "loss": 1.2026, "step": 4238 }, { "epoch": 0.2, "grad_norm": 1.1661218772978283, "learning_rate": 1.846247303293238e-05, "loss": 1.1919, "step": 4239 }, { "epoch": 0.2, "grad_norm": 1.2956341356750836, "learning_rate": 1.846164295550218e-05, "loss": 1.2583, "step": 4240 }, { "epoch": 0.2, "grad_norm": 1.3619675717698394, "learning_rate": 1.846081267273278e-05, "loss": 1.2363, "step": 4241 }, { "epoch": 0.2, "grad_norm": 1.3114334856959564, "learning_rate": 1.845998218464435e-05, "loss": 1.2212, "step": 4242 }, { "epoch": 0.2, "grad_norm": 1.2328288411861021, "learning_rate": 1.845915149125703e-05, "loss": 1.1138, "step": 4243 }, { "epoch": 0.2, "grad_norm": 1.377982512840893, "learning_rate": 1.8458320592590976e-05, "loss": 1.1934, "step": 4244 }, { "epoch": 0.2, "grad_norm": 1.4516065412242682, "learning_rate": 1.845748948866636e-05, "loss": 1.394, "step": 4245 }, { "epoch": 0.2, "grad_norm": 1.2077974107751783, "learning_rate": 1.8456658179503345e-05, "loss": 1.2158, "step": 4246 }, { "epoch": 0.2, "grad_norm": 1.285646335993893, "learning_rate": 1.8455826665122107e-05, "loss": 1.2061, "step": 4247 }, { "epoch": 0.2, "grad_norm": 1.1520888614894382, "learning_rate": 1.845499494554282e-05, "loss": 1.1338, "step": 4248 }, { "epoch": 0.2, "grad_norm": 1.4433784518574095, "learning_rate": 1.8454163020785676e-05, "loss": 1.1768, "step": 4249 }, { "epoch": 0.2, "grad_norm": 1.5148379588501744, "learning_rate": 1.8453330890870857e-05, "loss": 1.2285, "step": 4250 }, { "epoch": 0.2, "grad_norm": 1.3808642715745274, "learning_rate": 1.8452498555818556e-05, "loss": 1.2764, "step": 4251 }, { "epoch": 0.2, "grad_norm": 1.3902208129784777, "learning_rate": 1.8451666015648976e-05, "loss": 1.2715, "step": 4252 }, { "epoch": 0.2, "grad_norm": 1.303100768600207, "learning_rate": 1.8450833270382312e-05, "loss": 1.271, "step": 4253 }, { "epoch": 0.2, "grad_norm": 1.3042622307791834, "learning_rate": 1.845000032003878e-05, "loss": 1.2422, "step": 4254 }, { "epoch": 0.2, "grad_norm": 1.3324718978215424, "learning_rate": 1.8449167164638596e-05, "loss": 1.3647, "step": 4255 }, { "epoch": 0.2, "grad_norm": 1.5055500671418578, "learning_rate": 1.844833380420197e-05, "loss": 1.1782, "step": 4256 }, { "epoch": 0.2, "grad_norm": 1.2260888770474974, "learning_rate": 1.8447500238749124e-05, "loss": 1.2114, "step": 4257 }, { "epoch": 0.2, "grad_norm": 1.145822775444332, "learning_rate": 1.8446666468300292e-05, "loss": 1.3066, "step": 4258 }, { "epoch": 0.2, "grad_norm": 1.1795511210437146, "learning_rate": 1.844583249287571e-05, "loss": 1.2075, "step": 4259 }, { "epoch": 0.2, "grad_norm": 1.0221081799664247, "learning_rate": 1.844499831249561e-05, "loss": 1.1997, "step": 4260 }, { "epoch": 0.2, "grad_norm": 1.1775765311042579, "learning_rate": 1.8444163927180232e-05, "loss": 1.2256, "step": 4261 }, { "epoch": 0.2, "grad_norm": 1.050419383848904, "learning_rate": 1.8443329336949835e-05, "loss": 1.1128, "step": 4262 }, { "epoch": 0.21, "grad_norm": 0.9692025963376726, "learning_rate": 1.844249454182466e-05, "loss": 1.0955, "step": 4263 }, { "epoch": 0.21, "grad_norm": 1.0436446747053556, "learning_rate": 1.8441659541824975e-05, "loss": 1.1665, "step": 4264 }, { "epoch": 0.21, "grad_norm": 1.2716823928322454, "learning_rate": 1.844082433697104e-05, "loss": 0.9268, "step": 4265 }, { "epoch": 0.21, "grad_norm": 1.1860696355805678, "learning_rate": 1.843998892728312e-05, "loss": 1.0767, "step": 4266 }, { "epoch": 0.21, "grad_norm": 1.222307748183581, "learning_rate": 1.8439153312781487e-05, "loss": 1.167, "step": 4267 }, { "epoch": 0.21, "grad_norm": 1.1742592929300073, "learning_rate": 1.8438317493486426e-05, "loss": 1.1753, "step": 4268 }, { "epoch": 0.21, "grad_norm": 1.2179130268814071, "learning_rate": 1.843748146941821e-05, "loss": 1.0601, "step": 4269 }, { "epoch": 0.21, "grad_norm": 1.2155805552549275, "learning_rate": 1.843664524059714e-05, "loss": 1.3125, "step": 4270 }, { "epoch": 0.21, "grad_norm": 0.9213401624689068, "learning_rate": 1.8435808807043498e-05, "loss": 1.1763, "step": 4271 }, { "epoch": 0.21, "grad_norm": 1.459907473794051, "learning_rate": 1.8434972168777585e-05, "loss": 1.3271, "step": 4272 }, { "epoch": 0.21, "grad_norm": 1.1873806966122182, "learning_rate": 1.8434135325819703e-05, "loss": 1.1802, "step": 4273 }, { "epoch": 0.21, "grad_norm": 1.1846934727522582, "learning_rate": 1.8433298278190163e-05, "loss": 1.2354, "step": 4274 }, { "epoch": 0.21, "grad_norm": 1.4761188850611127, "learning_rate": 1.843246102590927e-05, "loss": 1.1953, "step": 4275 }, { "epoch": 0.21, "grad_norm": 1.441773373996717, "learning_rate": 1.843162356899735e-05, "loss": 1.1455, "step": 4276 }, { "epoch": 0.21, "grad_norm": 1.400691306437527, "learning_rate": 1.8430785907474727e-05, "loss": 1.3354, "step": 4277 }, { "epoch": 0.21, "grad_norm": 1.1445927064733374, "learning_rate": 1.842994804136172e-05, "loss": 1.1113, "step": 4278 }, { "epoch": 0.21, "grad_norm": 1.293525562603318, "learning_rate": 1.8429109970678664e-05, "loss": 1.1482, "step": 4279 }, { "epoch": 0.21, "grad_norm": 1.3635538417803064, "learning_rate": 1.8428271695445903e-05, "loss": 1.2803, "step": 4280 }, { "epoch": 0.21, "grad_norm": 1.3405343655640127, "learning_rate": 1.842743321568377e-05, "loss": 1.0874, "step": 4281 }, { "epoch": 0.21, "grad_norm": 1.5075261459911644, "learning_rate": 1.842659453141262e-05, "loss": 1.2512, "step": 4282 }, { "epoch": 0.21, "grad_norm": 1.0840194397371827, "learning_rate": 1.8425755642652797e-05, "loss": 1.186, "step": 4283 }, { "epoch": 0.21, "grad_norm": 1.262757500225548, "learning_rate": 1.842491654942467e-05, "loss": 1.0442, "step": 4284 }, { "epoch": 0.21, "grad_norm": 1.0120831136887372, "learning_rate": 1.8424077251748593e-05, "loss": 1.1099, "step": 4285 }, { "epoch": 0.21, "grad_norm": 1.2785094421996668, "learning_rate": 1.8423237749644936e-05, "loss": 1.1055, "step": 4286 }, { "epoch": 0.21, "grad_norm": 1.268028033798254, "learning_rate": 1.8422398043134068e-05, "loss": 1.1396, "step": 4287 }, { "epoch": 0.21, "grad_norm": 1.1934523587156094, "learning_rate": 1.842155813223637e-05, "loss": 1.1855, "step": 4288 }, { "epoch": 0.21, "grad_norm": 1.3428325561855743, "learning_rate": 1.8420718016972227e-05, "loss": 1.2622, "step": 4289 }, { "epoch": 0.21, "grad_norm": 1.600228392755461, "learning_rate": 1.8419877697362015e-05, "loss": 1.479, "step": 4290 }, { "epoch": 0.21, "grad_norm": 1.2255156357671224, "learning_rate": 1.841903717342614e-05, "loss": 0.9629, "step": 4291 }, { "epoch": 0.21, "grad_norm": 1.281119136565635, "learning_rate": 1.8418196445184988e-05, "loss": 1.124, "step": 4292 }, { "epoch": 0.21, "grad_norm": 1.0545532408771852, "learning_rate": 1.841735551265897e-05, "loss": 1.2363, "step": 4293 }, { "epoch": 0.21, "grad_norm": 1.2960023816568036, "learning_rate": 1.8416514375868482e-05, "loss": 1.1533, "step": 4294 }, { "epoch": 0.21, "grad_norm": 1.7684732434569443, "learning_rate": 1.8415673034833945e-05, "loss": 1.5186, "step": 4295 }, { "epoch": 0.21, "grad_norm": 1.5952701570377055, "learning_rate": 1.8414831489575774e-05, "loss": 1.272, "step": 4296 }, { "epoch": 0.21, "grad_norm": 1.2585022929312808, "learning_rate": 1.841398974011439e-05, "loss": 1.1172, "step": 4297 }, { "epoch": 0.21, "grad_norm": 1.1833482149937713, "learning_rate": 1.8413147786470217e-05, "loss": 1.0635, "step": 4298 }, { "epoch": 0.21, "grad_norm": 1.2330734219595594, "learning_rate": 1.8412305628663693e-05, "loss": 0.9761, "step": 4299 }, { "epoch": 0.21, "grad_norm": 0.8959577688950373, "learning_rate": 1.841146326671525e-05, "loss": 1.104, "step": 4300 }, { "epoch": 0.21, "grad_norm": 0.9921477615481038, "learning_rate": 1.841062070064533e-05, "loss": 1.2764, "step": 4301 }, { "epoch": 0.21, "grad_norm": 1.1861829273809994, "learning_rate": 1.840977793047438e-05, "loss": 1.1675, "step": 4302 }, { "epoch": 0.21, "grad_norm": 1.089692818469624, "learning_rate": 1.8408934956222855e-05, "loss": 1.1255, "step": 4303 }, { "epoch": 0.21, "grad_norm": 1.3214412264249773, "learning_rate": 1.8408091777911203e-05, "loss": 1.1377, "step": 4304 }, { "epoch": 0.21, "grad_norm": 1.2597300655607568, "learning_rate": 1.8407248395559897e-05, "loss": 1.249, "step": 4305 }, { "epoch": 0.21, "grad_norm": 1.4200508896652564, "learning_rate": 1.840640480918939e-05, "loss": 1.3115, "step": 4306 }, { "epoch": 0.21, "grad_norm": 1.3800727277787403, "learning_rate": 1.8405561018820166e-05, "loss": 1.2559, "step": 4307 }, { "epoch": 0.21, "grad_norm": 1.2455103069388707, "learning_rate": 1.8404717024472696e-05, "loss": 1.2607, "step": 4308 }, { "epoch": 0.21, "grad_norm": 1.192796447923225, "learning_rate": 1.8403872826167458e-05, "loss": 1.1665, "step": 4309 }, { "epoch": 0.21, "grad_norm": 1.1755147627656064, "learning_rate": 1.8403028423924943e-05, "loss": 1.1387, "step": 4310 }, { "epoch": 0.21, "grad_norm": 1.1334124926798785, "learning_rate": 1.8402183817765643e-05, "loss": 1.1079, "step": 4311 }, { "epoch": 0.21, "grad_norm": 1.265548475637155, "learning_rate": 1.840133900771005e-05, "loss": 1.1797, "step": 4312 }, { "epoch": 0.21, "grad_norm": 0.9923242303422974, "learning_rate": 1.8400493993778666e-05, "loss": 1.1479, "step": 4313 }, { "epoch": 0.21, "grad_norm": 1.3480077193876068, "learning_rate": 1.8399648775991996e-05, "loss": 1.4546, "step": 4314 }, { "epoch": 0.21, "grad_norm": 1.519711538996555, "learning_rate": 1.8398803354370554e-05, "loss": 1.1895, "step": 4315 }, { "epoch": 0.21, "grad_norm": 1.2088068685152145, "learning_rate": 1.8397957728934855e-05, "loss": 1.1831, "step": 4316 }, { "epoch": 0.21, "grad_norm": 1.3965213875404816, "learning_rate": 1.839711189970542e-05, "loss": 1.1851, "step": 4317 }, { "epoch": 0.21, "grad_norm": 1.3171397259987716, "learning_rate": 1.8396265866702773e-05, "loss": 1.2612, "step": 4318 }, { "epoch": 0.21, "grad_norm": 1.0823642947845018, "learning_rate": 1.8395419629947448e-05, "loss": 1.2339, "step": 4319 }, { "epoch": 0.21, "grad_norm": 0.9601873798509978, "learning_rate": 1.8394573189459977e-05, "loss": 1.2935, "step": 4320 }, { "epoch": 0.21, "grad_norm": 0.9252434598722146, "learning_rate": 1.8393726545260903e-05, "loss": 1.2144, "step": 4321 }, { "epoch": 0.21, "grad_norm": 1.159528182178332, "learning_rate": 1.839287969737077e-05, "loss": 1.1533, "step": 4322 }, { "epoch": 0.21, "grad_norm": 1.1427304286664348, "learning_rate": 1.839203264581013e-05, "loss": 1.1567, "step": 4323 }, { "epoch": 0.21, "grad_norm": 1.1977915599653646, "learning_rate": 1.8391185390599537e-05, "loss": 1.1309, "step": 4324 }, { "epoch": 0.21, "grad_norm": 1.1825227859076293, "learning_rate": 1.8390337931759553e-05, "loss": 1.1895, "step": 4325 }, { "epoch": 0.21, "grad_norm": 0.8859841562494186, "learning_rate": 1.8389490269310744e-05, "loss": 1.2598, "step": 4326 }, { "epoch": 0.21, "grad_norm": 1.3251669424448402, "learning_rate": 1.8388642403273674e-05, "loss": 1.1821, "step": 4327 }, { "epoch": 0.21, "grad_norm": 1.2576512394049328, "learning_rate": 1.8387794333668928e-05, "loss": 1.2368, "step": 4328 }, { "epoch": 0.21, "grad_norm": 1.1089926554755873, "learning_rate": 1.838694606051708e-05, "loss": 1.4102, "step": 4329 }, { "epoch": 0.21, "grad_norm": 1.6859834834249956, "learning_rate": 1.8386097583838714e-05, "loss": 1.2588, "step": 4330 }, { "epoch": 0.21, "grad_norm": 0.8857354529267414, "learning_rate": 1.8385248903654423e-05, "loss": 1.1338, "step": 4331 }, { "epoch": 0.21, "grad_norm": 1.1006670809197643, "learning_rate": 1.83844000199848e-05, "loss": 1.0825, "step": 4332 }, { "epoch": 0.21, "grad_norm": 1.3480711138896766, "learning_rate": 1.838355093285045e-05, "loss": 1.3486, "step": 4333 }, { "epoch": 0.21, "grad_norm": 1.064301777815102, "learning_rate": 1.838270164227197e-05, "loss": 1.167, "step": 4334 }, { "epoch": 0.21, "grad_norm": 1.1299314705998726, "learning_rate": 1.8381852148269976e-05, "loss": 1.2866, "step": 4335 }, { "epoch": 0.21, "grad_norm": 1.1111686717881117, "learning_rate": 1.8381002450865078e-05, "loss": 1.1899, "step": 4336 }, { "epoch": 0.21, "grad_norm": 1.0762080526460733, "learning_rate": 1.8380152550077903e-05, "loss": 1.168, "step": 4337 }, { "epoch": 0.21, "grad_norm": 1.69634102660346, "learning_rate": 1.8379302445929068e-05, "loss": 1.3389, "step": 4338 }, { "epoch": 0.21, "grad_norm": 1.1069750803256408, "learning_rate": 1.8378452138439206e-05, "loss": 1.1245, "step": 4339 }, { "epoch": 0.21, "grad_norm": 1.5198919999997635, "learning_rate": 1.8377601627628952e-05, "loss": 1.2117, "step": 4340 }, { "epoch": 0.21, "grad_norm": 1.2849232093457805, "learning_rate": 1.837675091351894e-05, "loss": 1.3276, "step": 4341 }, { "epoch": 0.21, "grad_norm": 1.3245178907534891, "learning_rate": 1.8375899996129823e-05, "loss": 1.2334, "step": 4342 }, { "epoch": 0.21, "grad_norm": 1.1710587072978353, "learning_rate": 1.837504887548224e-05, "loss": 1.3115, "step": 4343 }, { "epoch": 0.21, "grad_norm": 1.1711341077635804, "learning_rate": 1.8374197551596857e-05, "loss": 1.2407, "step": 4344 }, { "epoch": 0.21, "grad_norm": 1.351122142769699, "learning_rate": 1.8373346024494324e-05, "loss": 1.2139, "step": 4345 }, { "epoch": 0.21, "grad_norm": 1.291248686494472, "learning_rate": 1.8372494294195306e-05, "loss": 1.2114, "step": 4346 }, { "epoch": 0.21, "grad_norm": 1.313426605180628, "learning_rate": 1.837164236072048e-05, "loss": 1.2026, "step": 4347 }, { "epoch": 0.21, "grad_norm": 1.2803192542937483, "learning_rate": 1.8370790224090508e-05, "loss": 1.0732, "step": 4348 }, { "epoch": 0.21, "grad_norm": 1.3237252403933644, "learning_rate": 1.836993788432608e-05, "loss": 1.252, "step": 4349 }, { "epoch": 0.21, "grad_norm": 1.2718012814451056, "learning_rate": 1.836908534144787e-05, "loss": 1.2681, "step": 4350 }, { "epoch": 0.21, "grad_norm": 1.0384780528307402, "learning_rate": 1.8368232595476575e-05, "loss": 1.022, "step": 4351 }, { "epoch": 0.21, "grad_norm": 1.37726427042209, "learning_rate": 1.8367379646432884e-05, "loss": 1.311, "step": 4352 }, { "epoch": 0.21, "grad_norm": 1.346598331897287, "learning_rate": 1.8366526494337497e-05, "loss": 1.0469, "step": 4353 }, { "epoch": 0.21, "grad_norm": 1.354594778260442, "learning_rate": 1.8365673139211114e-05, "loss": 1.2026, "step": 4354 }, { "epoch": 0.21, "grad_norm": 1.3065495673237775, "learning_rate": 1.8364819581074447e-05, "loss": 1.1719, "step": 4355 }, { "epoch": 0.21, "grad_norm": 1.3949235441545145, "learning_rate": 1.836396581994821e-05, "loss": 1.3105, "step": 4356 }, { "epoch": 0.21, "grad_norm": 1.1343675057281937, "learning_rate": 1.8363111855853122e-05, "loss": 1.0754, "step": 4357 }, { "epoch": 0.21, "grad_norm": 1.0403631129789366, "learning_rate": 1.8362257688809904e-05, "loss": 1.125, "step": 4358 }, { "epoch": 0.21, "grad_norm": 1.2588665415334734, "learning_rate": 1.8361403318839283e-05, "loss": 1.1626, "step": 4359 }, { "epoch": 0.21, "grad_norm": 1.1924071091446717, "learning_rate": 1.8360548745961994e-05, "loss": 1.0679, "step": 4360 }, { "epoch": 0.21, "grad_norm": 1.1521465689194352, "learning_rate": 1.8359693970198772e-05, "loss": 1.1025, "step": 4361 }, { "epoch": 0.21, "grad_norm": 1.2422747844506365, "learning_rate": 1.8358838991570363e-05, "loss": 1.1797, "step": 4362 }, { "epoch": 0.21, "grad_norm": 1.3327084422214552, "learning_rate": 1.835798381009752e-05, "loss": 1.2656, "step": 4363 }, { "epoch": 0.21, "grad_norm": 1.1635719403038898, "learning_rate": 1.8357128425800983e-05, "loss": 1.2041, "step": 4364 }, { "epoch": 0.21, "grad_norm": 1.205936391738759, "learning_rate": 1.835627283870152e-05, "loss": 1.1421, "step": 4365 }, { "epoch": 0.21, "grad_norm": 1.1358011217877584, "learning_rate": 1.835541704881989e-05, "loss": 1.2051, "step": 4366 }, { "epoch": 0.21, "grad_norm": 1.2603171946899037, "learning_rate": 1.8354561056176857e-05, "loss": 1.2041, "step": 4367 }, { "epoch": 0.21, "grad_norm": 1.3172620041735963, "learning_rate": 1.8353704860793202e-05, "loss": 1.3438, "step": 4368 }, { "epoch": 0.21, "grad_norm": 1.2569577749121277, "learning_rate": 1.8352848462689694e-05, "loss": 1.2485, "step": 4369 }, { "epoch": 0.21, "grad_norm": 1.112500915435981, "learning_rate": 1.835199186188712e-05, "loss": 1.0049, "step": 4370 }, { "epoch": 0.21, "grad_norm": 1.3335744300242611, "learning_rate": 1.8351135058406266e-05, "loss": 1.23, "step": 4371 }, { "epoch": 0.21, "grad_norm": 1.2493962448341316, "learning_rate": 1.8350278052267922e-05, "loss": 1.251, "step": 4372 }, { "epoch": 0.21, "grad_norm": 1.1394376322426596, "learning_rate": 1.834942084349289e-05, "loss": 1.1641, "step": 4373 }, { "epoch": 0.21, "grad_norm": 1.2384156789301703, "learning_rate": 1.8348563432101967e-05, "loss": 1.2847, "step": 4374 }, { "epoch": 0.21, "grad_norm": 1.294771402108518, "learning_rate": 1.834770581811596e-05, "loss": 1.1196, "step": 4375 }, { "epoch": 0.21, "grad_norm": 1.3594825144871425, "learning_rate": 1.8346848001555688e-05, "loss": 1.2256, "step": 4376 }, { "epoch": 0.21, "grad_norm": 1.5877083077854959, "learning_rate": 1.8345989982441955e-05, "loss": 1.1758, "step": 4377 }, { "epoch": 0.21, "grad_norm": 1.3651050024365659, "learning_rate": 1.8345131760795598e-05, "loss": 1.093, "step": 4378 }, { "epoch": 0.21, "grad_norm": 1.2404091384231468, "learning_rate": 1.834427333663743e-05, "loss": 1.2524, "step": 4379 }, { "epoch": 0.21, "grad_norm": 1.1032470694468395, "learning_rate": 1.8343414709988288e-05, "loss": 1.2437, "step": 4380 }, { "epoch": 0.21, "grad_norm": 1.553879186412182, "learning_rate": 1.834255588086901e-05, "loss": 1.2642, "step": 4381 }, { "epoch": 0.21, "grad_norm": 1.082103365663125, "learning_rate": 1.834169684930043e-05, "loss": 1.1377, "step": 4382 }, { "epoch": 0.21, "grad_norm": 1.233339196372351, "learning_rate": 1.8340837615303405e-05, "loss": 1.0454, "step": 4383 }, { "epoch": 0.21, "grad_norm": 1.163908360191841, "learning_rate": 1.833997817889878e-05, "loss": 1.1973, "step": 4384 }, { "epoch": 0.21, "grad_norm": 1.2801311246224156, "learning_rate": 1.833911854010741e-05, "loss": 0.9805, "step": 4385 }, { "epoch": 0.21, "grad_norm": 1.1100500031733034, "learning_rate": 1.833825869895016e-05, "loss": 1.2002, "step": 4386 }, { "epoch": 0.21, "grad_norm": 1.3607910660401024, "learning_rate": 1.8337398655447894e-05, "loss": 1.1875, "step": 4387 }, { "epoch": 0.21, "grad_norm": 1.0935972240834675, "learning_rate": 1.8336538409621474e-05, "loss": 1.0972, "step": 4388 }, { "epoch": 0.21, "grad_norm": 0.85099296303356, "learning_rate": 1.8335677961491793e-05, "loss": 1.2363, "step": 4389 }, { "epoch": 0.21, "grad_norm": 1.1068701609854668, "learning_rate": 1.833481731107972e-05, "loss": 1.2803, "step": 4390 }, { "epoch": 0.21, "grad_norm": 1.2710857691538058, "learning_rate": 1.833395645840614e-05, "loss": 1.1089, "step": 4391 }, { "epoch": 0.21, "grad_norm": 1.274529249606377, "learning_rate": 1.8333095403491946e-05, "loss": 1.2852, "step": 4392 }, { "epoch": 0.21, "grad_norm": 1.4841455435658981, "learning_rate": 1.8332234146358034e-05, "loss": 1.0542, "step": 4393 }, { "epoch": 0.21, "grad_norm": 1.1400405150411008, "learning_rate": 1.8331372687025305e-05, "loss": 1.228, "step": 4394 }, { "epoch": 0.21, "grad_norm": 1.001509514333863, "learning_rate": 1.8330511025514662e-05, "loss": 1.1963, "step": 4395 }, { "epoch": 0.21, "grad_norm": 1.4162009773935669, "learning_rate": 1.8329649161847016e-05, "loss": 1.2334, "step": 4396 }, { "epoch": 0.21, "grad_norm": 1.4279910557608084, "learning_rate": 1.8328787096043278e-05, "loss": 1.2539, "step": 4397 }, { "epoch": 0.21, "grad_norm": 1.0916459919720378, "learning_rate": 1.8327924828124377e-05, "loss": 1.3433, "step": 4398 }, { "epoch": 0.21, "grad_norm": 1.3709913981084314, "learning_rate": 1.8327062358111228e-05, "loss": 1.1665, "step": 4399 }, { "epoch": 0.21, "grad_norm": 1.2604488420483582, "learning_rate": 1.8326199686024765e-05, "loss": 1.2236, "step": 4400 }, { "epoch": 0.21, "grad_norm": 1.1783845316970833, "learning_rate": 1.8325336811885926e-05, "loss": 1.1772, "step": 4401 }, { "epoch": 0.21, "grad_norm": 1.5155437721199205, "learning_rate": 1.8324473735715643e-05, "loss": 1.2871, "step": 4402 }, { "epoch": 0.21, "grad_norm": 1.2126955865557882, "learning_rate": 1.832361045753486e-05, "loss": 1.2124, "step": 4403 }, { "epoch": 0.21, "grad_norm": 1.1462010516925394, "learning_rate": 1.8322746977364537e-05, "loss": 1.2578, "step": 4404 }, { "epoch": 0.21, "grad_norm": 1.4348288162101668, "learning_rate": 1.8321883295225617e-05, "loss": 1.1934, "step": 4405 }, { "epoch": 0.21, "grad_norm": 1.341410701654294, "learning_rate": 1.8321019411139064e-05, "loss": 1.3447, "step": 4406 }, { "epoch": 0.21, "grad_norm": 1.2265451926352096, "learning_rate": 1.8320155325125843e-05, "loss": 1.0771, "step": 4407 }, { "epoch": 0.21, "grad_norm": 1.2818728311861178, "learning_rate": 1.831929103720692e-05, "loss": 1.187, "step": 4408 }, { "epoch": 0.21, "grad_norm": 1.053947176561706, "learning_rate": 1.831842654740327e-05, "loss": 1.2222, "step": 4409 }, { "epoch": 0.21, "grad_norm": 1.1175363361114774, "learning_rate": 1.8317561855735867e-05, "loss": 1.2583, "step": 4410 }, { "epoch": 0.21, "grad_norm": 1.194162504044974, "learning_rate": 1.8316696962225704e-05, "loss": 1.2563, "step": 4411 }, { "epoch": 0.21, "grad_norm": 1.2124599838835748, "learning_rate": 1.8315831866893762e-05, "loss": 1.0986, "step": 4412 }, { "epoch": 0.21, "grad_norm": 1.0204118523751704, "learning_rate": 1.831496656976104e-05, "loss": 1.2178, "step": 4413 }, { "epoch": 0.21, "grad_norm": 1.294761487971946, "learning_rate": 1.8314101070848527e-05, "loss": 1.2651, "step": 4414 }, { "epoch": 0.21, "grad_norm": 1.4593616346792158, "learning_rate": 1.8313235370177235e-05, "loss": 1.3716, "step": 4415 }, { "epoch": 0.21, "grad_norm": 0.8286899143410447, "learning_rate": 1.8312369467768168e-05, "loss": 1.1748, "step": 4416 }, { "epoch": 0.21, "grad_norm": 1.1133376696373642, "learning_rate": 1.831150336364234e-05, "loss": 1.2505, "step": 4417 }, { "epoch": 0.21, "grad_norm": 1.1445535024330389, "learning_rate": 1.831063705782077e-05, "loss": 1.188, "step": 4418 }, { "epoch": 0.21, "grad_norm": 1.3372278208720862, "learning_rate": 1.830977055032448e-05, "loss": 1.1475, "step": 4419 }, { "epoch": 0.21, "grad_norm": 1.1727120318061122, "learning_rate": 1.8308903841174493e-05, "loss": 1.1865, "step": 4420 }, { "epoch": 0.21, "grad_norm": 1.177387747709442, "learning_rate": 1.8308036930391848e-05, "loss": 1.1919, "step": 4421 }, { "epoch": 0.21, "grad_norm": 1.1436942508698513, "learning_rate": 1.830716981799758e-05, "loss": 1.1729, "step": 4422 }, { "epoch": 0.21, "grad_norm": 1.1260921876531866, "learning_rate": 1.8306302504012732e-05, "loss": 1.2432, "step": 4423 }, { "epoch": 0.21, "grad_norm": 1.4202033452306044, "learning_rate": 1.8305434988458348e-05, "loss": 1.2954, "step": 4424 }, { "epoch": 0.21, "grad_norm": 1.4152252637122622, "learning_rate": 1.8304567271355482e-05, "loss": 1.2681, "step": 4425 }, { "epoch": 0.21, "grad_norm": 1.3364826784807082, "learning_rate": 1.8303699352725193e-05, "loss": 1.1973, "step": 4426 }, { "epoch": 0.21, "grad_norm": 1.2497867957997453, "learning_rate": 1.830283123258854e-05, "loss": 1.2539, "step": 4427 }, { "epoch": 0.21, "grad_norm": 1.5164324067681045, "learning_rate": 1.8301962910966592e-05, "loss": 1.231, "step": 4428 }, { "epoch": 0.21, "grad_norm": 1.3714908596926016, "learning_rate": 1.830109438788042e-05, "loss": 1.1899, "step": 4429 }, { "epoch": 0.21, "grad_norm": 1.156096259997161, "learning_rate": 1.8300225663351098e-05, "loss": 1.1167, "step": 4430 }, { "epoch": 0.21, "grad_norm": 1.1921371159765468, "learning_rate": 1.8299356737399707e-05, "loss": 1.1172, "step": 4431 }, { "epoch": 0.21, "grad_norm": 1.3109963307738512, "learning_rate": 1.8298487610047337e-05, "loss": 1.1978, "step": 4432 }, { "epoch": 0.21, "grad_norm": 1.3561817855462865, "learning_rate": 1.829761828131508e-05, "loss": 1.2358, "step": 4433 }, { "epoch": 0.21, "grad_norm": 1.2272859334251314, "learning_rate": 1.829674875122403e-05, "loss": 1.1509, "step": 4434 }, { "epoch": 0.21, "grad_norm": 1.2237822280882102, "learning_rate": 1.8295879019795283e-05, "loss": 1.2363, "step": 4435 }, { "epoch": 0.21, "grad_norm": 1.4625865557257534, "learning_rate": 1.8295009087049954e-05, "loss": 1.1013, "step": 4436 }, { "epoch": 0.21, "grad_norm": 1.1068670832215481, "learning_rate": 1.8294138953009145e-05, "loss": 1.2744, "step": 4437 }, { "epoch": 0.21, "grad_norm": 1.3166042983804738, "learning_rate": 1.829326861769398e-05, "loss": 1.312, "step": 4438 }, { "epoch": 0.21, "grad_norm": 1.1222240968576276, "learning_rate": 1.8292398081125572e-05, "loss": 1.29, "step": 4439 }, { "epoch": 0.21, "grad_norm": 1.2425263673217237, "learning_rate": 1.8291527343325052e-05, "loss": 1.1787, "step": 4440 }, { "epoch": 0.21, "grad_norm": 1.2660095726734806, "learning_rate": 1.8290656404313546e-05, "loss": 1.2866, "step": 4441 }, { "epoch": 0.21, "grad_norm": 1.5966594972773576, "learning_rate": 1.8289785264112193e-05, "loss": 1.0398, "step": 4442 }, { "epoch": 0.21, "grad_norm": 1.3912000831615057, "learning_rate": 1.8288913922742134e-05, "loss": 1.3228, "step": 4443 }, { "epoch": 0.21, "grad_norm": 1.0244953965039716, "learning_rate": 1.8288042380224508e-05, "loss": 1.2705, "step": 4444 }, { "epoch": 0.21, "grad_norm": 1.1616114023164374, "learning_rate": 1.8287170636580464e-05, "loss": 1.1914, "step": 4445 }, { "epoch": 0.21, "grad_norm": 1.2402760735307439, "learning_rate": 1.8286298691831164e-05, "loss": 1.2319, "step": 4446 }, { "epoch": 0.21, "grad_norm": 1.3949685144614963, "learning_rate": 1.8285426545997764e-05, "loss": 1.3271, "step": 4447 }, { "epoch": 0.21, "grad_norm": 1.1685433055251706, "learning_rate": 1.828455419910143e-05, "loss": 1.1719, "step": 4448 }, { "epoch": 0.21, "grad_norm": 1.1237463669299028, "learning_rate": 1.8283681651163324e-05, "loss": 1.1997, "step": 4449 }, { "epoch": 0.21, "grad_norm": 1.158733318569599, "learning_rate": 1.8282808902204627e-05, "loss": 1.124, "step": 4450 }, { "epoch": 0.21, "grad_norm": 0.9612133330125446, "learning_rate": 1.828193595224652e-05, "loss": 1.1875, "step": 4451 }, { "epoch": 0.21, "grad_norm": 1.3678092758865583, "learning_rate": 1.828106280131018e-05, "loss": 1.1924, "step": 4452 }, { "epoch": 0.21, "grad_norm": 0.9879781232005463, "learning_rate": 1.8280189449416805e-05, "loss": 0.9697, "step": 4453 }, { "epoch": 0.21, "grad_norm": 1.1315172097835489, "learning_rate": 1.827931589658758e-05, "loss": 0.947, "step": 4454 }, { "epoch": 0.21, "grad_norm": 1.3802500703391178, "learning_rate": 1.8278442142843703e-05, "loss": 1.1353, "step": 4455 }, { "epoch": 0.21, "grad_norm": 1.1964050042532848, "learning_rate": 1.8277568188206386e-05, "loss": 1.2593, "step": 4456 }, { "epoch": 0.21, "grad_norm": 1.259133140742844, "learning_rate": 1.8276694032696835e-05, "loss": 1.2256, "step": 4457 }, { "epoch": 0.21, "grad_norm": 1.2368374113655682, "learning_rate": 1.8275819676336256e-05, "loss": 1.2285, "step": 4458 }, { "epoch": 0.21, "grad_norm": 1.216916536156771, "learning_rate": 1.827494511914587e-05, "loss": 1.1689, "step": 4459 }, { "epoch": 0.21, "grad_norm": 1.4479300444283665, "learning_rate": 1.8274070361146906e-05, "loss": 1.1235, "step": 4460 }, { "epoch": 0.21, "grad_norm": 1.2965212773819812, "learning_rate": 1.8273195402360585e-05, "loss": 1.2441, "step": 4461 }, { "epoch": 0.21, "grad_norm": 1.2601135581933998, "learning_rate": 1.8272320242808143e-05, "loss": 1.231, "step": 4462 }, { "epoch": 0.21, "grad_norm": 1.145110579727934, "learning_rate": 1.827144488251082e-05, "loss": 1.0388, "step": 4463 }, { "epoch": 0.21, "grad_norm": 1.2187248381767366, "learning_rate": 1.827056932148985e-05, "loss": 1.1743, "step": 4464 }, { "epoch": 0.21, "grad_norm": 1.1462336378133944, "learning_rate": 1.8269693559766487e-05, "loss": 1.1704, "step": 4465 }, { "epoch": 0.21, "grad_norm": 1.3039185887070022, "learning_rate": 1.8268817597361983e-05, "loss": 1.2729, "step": 4466 }, { "epoch": 0.21, "grad_norm": 1.32158738655301, "learning_rate": 1.8267941434297594e-05, "loss": 1.1772, "step": 4467 }, { "epoch": 0.21, "grad_norm": 1.3844879297362085, "learning_rate": 1.826706507059458e-05, "loss": 1.1792, "step": 4468 }, { "epoch": 0.21, "grad_norm": 1.198188707613644, "learning_rate": 1.826618850627421e-05, "loss": 1.2822, "step": 4469 }, { "epoch": 0.21, "grad_norm": 1.3114276053440401, "learning_rate": 1.8265311741357753e-05, "loss": 1.1748, "step": 4470 }, { "epoch": 0.22, "grad_norm": 0.9250861616109476, "learning_rate": 1.826443477586649e-05, "loss": 1.1382, "step": 4471 }, { "epoch": 0.22, "grad_norm": 1.2842128501267753, "learning_rate": 1.82635576098217e-05, "loss": 1.1343, "step": 4472 }, { "epoch": 0.22, "grad_norm": 1.3135278098643988, "learning_rate": 1.826268024324467e-05, "loss": 1.2627, "step": 4473 }, { "epoch": 0.22, "grad_norm": 1.2634191162276973, "learning_rate": 1.8261802676156685e-05, "loss": 1.3154, "step": 4474 }, { "epoch": 0.22, "grad_norm": 1.4024666984830052, "learning_rate": 1.826092490857905e-05, "loss": 1.1704, "step": 4475 }, { "epoch": 0.22, "grad_norm": 1.1964876183556148, "learning_rate": 1.8260046940533063e-05, "loss": 1.2637, "step": 4476 }, { "epoch": 0.22, "grad_norm": 1.5453843746772333, "learning_rate": 1.8259168772040027e-05, "loss": 1.1426, "step": 4477 }, { "epoch": 0.22, "grad_norm": 1.8381837480923824, "learning_rate": 1.8258290403121252e-05, "loss": 1.4521, "step": 4478 }, { "epoch": 0.22, "grad_norm": 1.095843465479473, "learning_rate": 1.825741183379806e-05, "loss": 1.2646, "step": 4479 }, { "epoch": 0.22, "grad_norm": 1.4047802635390465, "learning_rate": 1.8256533064091765e-05, "loss": 1.2231, "step": 4480 }, { "epoch": 0.22, "grad_norm": 1.1197960187250817, "learning_rate": 1.8255654094023692e-05, "loss": 1.0601, "step": 4481 }, { "epoch": 0.22, "grad_norm": 1.555085623641309, "learning_rate": 1.8254774923615177e-05, "loss": 1.4136, "step": 4482 }, { "epoch": 0.22, "grad_norm": 1.1976674896456259, "learning_rate": 1.8253895552887547e-05, "loss": 1.2017, "step": 4483 }, { "epoch": 0.22, "grad_norm": 0.9212330799176107, "learning_rate": 1.825301598186215e-05, "loss": 1.269, "step": 4484 }, { "epoch": 0.22, "grad_norm": 1.288832936900788, "learning_rate": 1.825213621056032e-05, "loss": 1.0784, "step": 4485 }, { "epoch": 0.22, "grad_norm": 1.1812441029269136, "learning_rate": 1.825125623900342e-05, "loss": 1.2695, "step": 4486 }, { "epoch": 0.22, "grad_norm": 1.22825994489677, "learning_rate": 1.8250376067212794e-05, "loss": 1.2065, "step": 4487 }, { "epoch": 0.22, "grad_norm": 1.2918690996063715, "learning_rate": 1.8249495695209805e-05, "loss": 1.189, "step": 4488 }, { "epoch": 0.22, "grad_norm": 1.1649445923779318, "learning_rate": 1.8248615123015816e-05, "loss": 1.2124, "step": 4489 }, { "epoch": 0.22, "grad_norm": 1.6552548310659787, "learning_rate": 1.8247734350652197e-05, "loss": 1.3071, "step": 4490 }, { "epoch": 0.22, "grad_norm": 0.9589697721067002, "learning_rate": 1.824685337814032e-05, "loss": 1.2197, "step": 4491 }, { "epoch": 0.22, "grad_norm": 1.4550110427528729, "learning_rate": 1.8245972205501565e-05, "loss": 1.1973, "step": 4492 }, { "epoch": 0.22, "grad_norm": 1.2470638515193744, "learning_rate": 1.8245090832757317e-05, "loss": 1.2144, "step": 4493 }, { "epoch": 0.22, "grad_norm": 1.1176478343002583, "learning_rate": 1.824420925992896e-05, "loss": 1.123, "step": 4494 }, { "epoch": 0.22, "grad_norm": 1.1761169999359093, "learning_rate": 1.824332748703789e-05, "loss": 1.2896, "step": 4495 }, { "epoch": 0.22, "grad_norm": 1.121247774541991, "learning_rate": 1.8242445514105505e-05, "loss": 1.0864, "step": 4496 }, { "epoch": 0.22, "grad_norm": 1.1827644944555011, "learning_rate": 1.8241563341153203e-05, "loss": 1.1968, "step": 4497 }, { "epoch": 0.22, "grad_norm": 1.1911362907737977, "learning_rate": 1.82406809682024e-05, "loss": 1.3081, "step": 4498 }, { "epoch": 0.22, "grad_norm": 1.1524727482387132, "learning_rate": 1.8239798395274507e-05, "loss": 1.2407, "step": 4499 }, { "epoch": 0.22, "grad_norm": 1.3638494421798013, "learning_rate": 1.823891562239094e-05, "loss": 1.1768, "step": 4500 }, { "epoch": 0.22, "grad_norm": 1.2868339439180396, "learning_rate": 1.8238032649573116e-05, "loss": 1.1113, "step": 4501 }, { "epoch": 0.22, "grad_norm": 0.9042157059120498, "learning_rate": 1.8237149476842472e-05, "loss": 1.2207, "step": 4502 }, { "epoch": 0.22, "grad_norm": 1.4403237455833862, "learning_rate": 1.8236266104220432e-05, "loss": 1.2915, "step": 4503 }, { "epoch": 0.22, "grad_norm": 1.1798528507888026, "learning_rate": 1.8235382531728435e-05, "loss": 1.271, "step": 4504 }, { "epoch": 0.22, "grad_norm": 1.4238698437276276, "learning_rate": 1.8234498759387925e-05, "loss": 1.1646, "step": 4505 }, { "epoch": 0.22, "grad_norm": 1.2189178542394283, "learning_rate": 1.8233614787220345e-05, "loss": 1.0879, "step": 4506 }, { "epoch": 0.22, "grad_norm": 1.2342390470652775, "learning_rate": 1.8232730615247146e-05, "loss": 1.3276, "step": 4507 }, { "epoch": 0.22, "grad_norm": 0.9358752241332112, "learning_rate": 1.823184624348979e-05, "loss": 1.1597, "step": 4508 }, { "epoch": 0.22, "grad_norm": 1.5266393128380753, "learning_rate": 1.8230961671969735e-05, "loss": 1.2549, "step": 4509 }, { "epoch": 0.22, "grad_norm": 1.207426722133219, "learning_rate": 1.8230076900708447e-05, "loss": 1.1831, "step": 4510 }, { "epoch": 0.22, "grad_norm": 1.2005922361277106, "learning_rate": 1.8229191929727395e-05, "loss": 1.1636, "step": 4511 }, { "epoch": 0.22, "grad_norm": 1.1681189171275623, "learning_rate": 1.8228306759048057e-05, "loss": 1.1938, "step": 4512 }, { "epoch": 0.22, "grad_norm": 1.145374224064277, "learning_rate": 1.8227421388691912e-05, "loss": 1.1509, "step": 4513 }, { "epoch": 0.22, "grad_norm": 1.4923910564979537, "learning_rate": 1.8226535818680445e-05, "loss": 1.3838, "step": 4514 }, { "epoch": 0.22, "grad_norm": 1.2508198993250346, "learning_rate": 1.8225650049035148e-05, "loss": 1.1943, "step": 4515 }, { "epoch": 0.22, "grad_norm": 1.2538243019994604, "learning_rate": 1.8224764079777514e-05, "loss": 1.1777, "step": 4516 }, { "epoch": 0.22, "grad_norm": 1.3167587067297597, "learning_rate": 1.8223877910929048e-05, "loss": 1.3594, "step": 4517 }, { "epoch": 0.22, "grad_norm": 1.3193928811673092, "learning_rate": 1.8222991542511247e-05, "loss": 1.0942, "step": 4518 }, { "epoch": 0.22, "grad_norm": 1.2651289216691577, "learning_rate": 1.8222104974545623e-05, "loss": 1.1689, "step": 4519 }, { "epoch": 0.22, "grad_norm": 1.2098304978889205, "learning_rate": 1.8221218207053694e-05, "loss": 1.3418, "step": 4520 }, { "epoch": 0.22, "grad_norm": 1.0102883189265233, "learning_rate": 1.8220331240056974e-05, "loss": 1.0869, "step": 4521 }, { "epoch": 0.22, "grad_norm": 1.2367135995282215, "learning_rate": 1.8219444073576993e-05, "loss": 1.2637, "step": 4522 }, { "epoch": 0.22, "grad_norm": 1.2091978162114032, "learning_rate": 1.8218556707635277e-05, "loss": 1.3296, "step": 4523 }, { "epoch": 0.22, "grad_norm": 1.3074445822917833, "learning_rate": 1.8217669142253358e-05, "loss": 1.3042, "step": 4524 }, { "epoch": 0.22, "grad_norm": 0.9470282554386227, "learning_rate": 1.8216781377452775e-05, "loss": 1.2642, "step": 4525 }, { "epoch": 0.22, "grad_norm": 1.1356587202134663, "learning_rate": 1.8215893413255073e-05, "loss": 1.1387, "step": 4526 }, { "epoch": 0.22, "grad_norm": 1.2435357012454047, "learning_rate": 1.8215005249681804e-05, "loss": 1.1982, "step": 4527 }, { "epoch": 0.22, "grad_norm": 0.9305433027310314, "learning_rate": 1.8214116886754513e-05, "loss": 1.2915, "step": 4528 }, { "epoch": 0.22, "grad_norm": 1.2849112769256064, "learning_rate": 1.8213228324494765e-05, "loss": 1.1074, "step": 4529 }, { "epoch": 0.22, "grad_norm": 1.1825432861288747, "learning_rate": 1.821233956292412e-05, "loss": 1.1133, "step": 4530 }, { "epoch": 0.22, "grad_norm": 1.3555059100317235, "learning_rate": 1.821145060206414e-05, "loss": 1.2925, "step": 4531 }, { "epoch": 0.22, "grad_norm": 1.154147526166616, "learning_rate": 1.8210561441936406e-05, "loss": 0.9744, "step": 4532 }, { "epoch": 0.22, "grad_norm": 1.4893802260523012, "learning_rate": 1.8209672082562496e-05, "loss": 1.3159, "step": 4533 }, { "epoch": 0.22, "grad_norm": 1.1768601753751666, "learning_rate": 1.8208782523963985e-05, "loss": 1.0874, "step": 4534 }, { "epoch": 0.22, "grad_norm": 1.283298926868818, "learning_rate": 1.8207892766162463e-05, "loss": 1.1973, "step": 4535 }, { "epoch": 0.22, "grad_norm": 1.1767595510160582, "learning_rate": 1.820700280917952e-05, "loss": 1.1729, "step": 4536 }, { "epoch": 0.22, "grad_norm": 1.5455150919020944, "learning_rate": 1.820611265303676e-05, "loss": 1.2715, "step": 4537 }, { "epoch": 0.22, "grad_norm": 1.19630592465284, "learning_rate": 1.8205222297755774e-05, "loss": 1.1665, "step": 4538 }, { "epoch": 0.22, "grad_norm": 1.5881166850367703, "learning_rate": 1.820433174335818e-05, "loss": 1.2129, "step": 4539 }, { "epoch": 0.22, "grad_norm": 1.2169017990956086, "learning_rate": 1.8203440989865577e-05, "loss": 1.2842, "step": 4540 }, { "epoch": 0.22, "grad_norm": 1.3138389524247547, "learning_rate": 1.8202550037299588e-05, "loss": 1.1362, "step": 4541 }, { "epoch": 0.22, "grad_norm": 1.4023005408257163, "learning_rate": 1.820165888568183e-05, "loss": 1.189, "step": 4542 }, { "epoch": 0.22, "grad_norm": 1.2829723945837808, "learning_rate": 1.8200767535033938e-05, "loss": 1.1262, "step": 4543 }, { "epoch": 0.22, "grad_norm": 1.257113965490729, "learning_rate": 1.819987598537753e-05, "loss": 1.1133, "step": 4544 }, { "epoch": 0.22, "grad_norm": 1.3105959400257408, "learning_rate": 1.8198984236734246e-05, "loss": 1.2603, "step": 4545 }, { "epoch": 0.22, "grad_norm": 1.3689214589749619, "learning_rate": 1.819809228912573e-05, "loss": 1.1714, "step": 4546 }, { "epoch": 0.22, "grad_norm": 1.1067254396759134, "learning_rate": 1.8197200142573625e-05, "loss": 1.1636, "step": 4547 }, { "epoch": 0.22, "grad_norm": 1.0566380197437628, "learning_rate": 1.819630779709958e-05, "loss": 1.3125, "step": 4548 }, { "epoch": 0.22, "grad_norm": 1.2223831176440698, "learning_rate": 1.8195415252725242e-05, "loss": 1.2397, "step": 4549 }, { "epoch": 0.22, "grad_norm": 1.3573172171336447, "learning_rate": 1.8194522509472283e-05, "loss": 1.1211, "step": 4550 }, { "epoch": 0.22, "grad_norm": 1.0505272535023944, "learning_rate": 1.819362956736236e-05, "loss": 1.146, "step": 4551 }, { "epoch": 0.22, "grad_norm": 1.240758795350619, "learning_rate": 1.8192736426417146e-05, "loss": 1.2788, "step": 4552 }, { "epoch": 0.22, "grad_norm": 1.1781434626497482, "learning_rate": 1.8191843086658313e-05, "loss": 1.166, "step": 4553 }, { "epoch": 0.22, "grad_norm": 1.1669323503573172, "learning_rate": 1.819094954810754e-05, "loss": 1.2974, "step": 4554 }, { "epoch": 0.22, "grad_norm": 1.4065541957388346, "learning_rate": 1.8190055810786507e-05, "loss": 1.231, "step": 4555 }, { "epoch": 0.22, "grad_norm": 1.5062359846487385, "learning_rate": 1.818916187471691e-05, "loss": 1.2437, "step": 4556 }, { "epoch": 0.22, "grad_norm": 1.2409215413689512, "learning_rate": 1.8188267739920433e-05, "loss": 1.2661, "step": 4557 }, { "epoch": 0.22, "grad_norm": 1.447612927903066, "learning_rate": 1.8187373406418782e-05, "loss": 1.1636, "step": 4558 }, { "epoch": 0.22, "grad_norm": 1.1464625403950617, "learning_rate": 1.8186478874233655e-05, "loss": 1.1262, "step": 4559 }, { "epoch": 0.22, "grad_norm": 1.254938830341179, "learning_rate": 1.8185584143386764e-05, "loss": 1.27, "step": 4560 }, { "epoch": 0.22, "grad_norm": 1.2202923558945054, "learning_rate": 1.8184689213899816e-05, "loss": 1.1104, "step": 4561 }, { "epoch": 0.22, "grad_norm": 1.381893860733172, "learning_rate": 1.818379408579453e-05, "loss": 1.2466, "step": 4562 }, { "epoch": 0.22, "grad_norm": 1.1773444391803471, "learning_rate": 1.818289875909263e-05, "loss": 0.9851, "step": 4563 }, { "epoch": 0.22, "grad_norm": 1.3159040180146608, "learning_rate": 1.8182003233815847e-05, "loss": 1.2212, "step": 4564 }, { "epoch": 0.22, "grad_norm": 1.236711597280253, "learning_rate": 1.8181107509985903e-05, "loss": 1.2227, "step": 4565 }, { "epoch": 0.22, "grad_norm": 1.2043327331817868, "learning_rate": 1.8180211587624543e-05, "loss": 1.1919, "step": 4566 }, { "epoch": 0.22, "grad_norm": 1.4056949238101222, "learning_rate": 1.81793154667535e-05, "loss": 1.3003, "step": 4567 }, { "epoch": 0.22, "grad_norm": 0.8639074970296128, "learning_rate": 1.8178419147394528e-05, "loss": 1.1616, "step": 4568 }, { "epoch": 0.22, "grad_norm": 0.8593021626952934, "learning_rate": 1.8177522629569375e-05, "loss": 1.1929, "step": 4569 }, { "epoch": 0.22, "grad_norm": 1.3233087136727573, "learning_rate": 1.8176625913299797e-05, "loss": 1.1475, "step": 4570 }, { "epoch": 0.22, "grad_norm": 1.3104274036697134, "learning_rate": 1.8175728998607555e-05, "loss": 1.2021, "step": 4571 }, { "epoch": 0.22, "grad_norm": 1.3861350468006663, "learning_rate": 1.8174831885514415e-05, "loss": 1.1992, "step": 4572 }, { "epoch": 0.22, "grad_norm": 1.0633115593324878, "learning_rate": 1.817393457404214e-05, "loss": 1.0625, "step": 4573 }, { "epoch": 0.22, "grad_norm": 1.433954381429989, "learning_rate": 1.8173037064212522e-05, "loss": 1.2695, "step": 4574 }, { "epoch": 0.22, "grad_norm": 1.1343511597634355, "learning_rate": 1.8172139356047323e-05, "loss": 1.1953, "step": 4575 }, { "epoch": 0.22, "grad_norm": 1.163750207033128, "learning_rate": 1.817124144956834e-05, "loss": 1.1265, "step": 4576 }, { "epoch": 0.22, "grad_norm": 1.1815829047351063, "learning_rate": 1.8170343344797354e-05, "loss": 1.2119, "step": 4577 }, { "epoch": 0.22, "grad_norm": 1.131773265841813, "learning_rate": 1.8169445041756165e-05, "loss": 1.2339, "step": 4578 }, { "epoch": 0.22, "grad_norm": 1.252572049031637, "learning_rate": 1.8168546540466567e-05, "loss": 1.1929, "step": 4579 }, { "epoch": 0.22, "grad_norm": 1.4860152201521228, "learning_rate": 1.816764784095037e-05, "loss": 1.1296, "step": 4580 }, { "epoch": 0.22, "grad_norm": 1.2703307768621983, "learning_rate": 1.816674894322938e-05, "loss": 1.2686, "step": 4581 }, { "epoch": 0.22, "grad_norm": 0.9938050614995264, "learning_rate": 1.8165849847325413e-05, "loss": 1.1475, "step": 4582 }, { "epoch": 0.22, "grad_norm": 1.2557570803082503, "learning_rate": 1.816495055326028e-05, "loss": 1.1729, "step": 4583 }, { "epoch": 0.22, "grad_norm": 1.163541368273416, "learning_rate": 1.8164051061055812e-05, "loss": 1.1826, "step": 4584 }, { "epoch": 0.22, "grad_norm": 1.2462238711394678, "learning_rate": 1.8163151370733838e-05, "loss": 1.0959, "step": 4585 }, { "epoch": 0.22, "grad_norm": 1.0155756247208563, "learning_rate": 1.8162251482316186e-05, "loss": 1.2344, "step": 4586 }, { "epoch": 0.22, "grad_norm": 1.17158040399833, "learning_rate": 1.8161351395824688e-05, "loss": 1.0554, "step": 4587 }, { "epoch": 0.22, "grad_norm": 1.3400881649873202, "learning_rate": 1.8160451111281202e-05, "loss": 1.1655, "step": 4588 }, { "epoch": 0.22, "grad_norm": 1.4088595037933995, "learning_rate": 1.815955062870756e-05, "loss": 1.2529, "step": 4589 }, { "epoch": 0.22, "grad_norm": 1.0331386747422902, "learning_rate": 1.815864994812562e-05, "loss": 1.2554, "step": 4590 }, { "epoch": 0.22, "grad_norm": 1.2915819402640634, "learning_rate": 1.8157749069557246e-05, "loss": 1.2915, "step": 4591 }, { "epoch": 0.22, "grad_norm": 1.2295079151815633, "learning_rate": 1.815684799302429e-05, "loss": 1.2319, "step": 4592 }, { "epoch": 0.22, "grad_norm": 1.450097118179294, "learning_rate": 1.815594671854862e-05, "loss": 1.2778, "step": 4593 }, { "epoch": 0.22, "grad_norm": 1.6400034541951787, "learning_rate": 1.8155045246152113e-05, "loss": 1.3809, "step": 4594 }, { "epoch": 0.22, "grad_norm": 1.2738217663270686, "learning_rate": 1.8154143575856634e-05, "loss": 1.1165, "step": 4595 }, { "epoch": 0.22, "grad_norm": 1.2388055532968931, "learning_rate": 1.8153241707684077e-05, "loss": 1.2412, "step": 4596 }, { "epoch": 0.22, "grad_norm": 1.3627463398397146, "learning_rate": 1.815233964165632e-05, "loss": 1.1597, "step": 4597 }, { "epoch": 0.22, "grad_norm": 1.6635038564195472, "learning_rate": 1.8151437377795256e-05, "loss": 1.3872, "step": 4598 }, { "epoch": 0.22, "grad_norm": 1.4666642456214967, "learning_rate": 1.8150534916122777e-05, "loss": 1.1501, "step": 4599 }, { "epoch": 0.22, "grad_norm": 1.1958517283179682, "learning_rate": 1.814963225666079e-05, "loss": 1.1245, "step": 4600 }, { "epoch": 0.22, "grad_norm": 1.1377236677950153, "learning_rate": 1.8148729399431187e-05, "loss": 1.2949, "step": 4601 }, { "epoch": 0.22, "grad_norm": 1.1023934242931985, "learning_rate": 1.8147826344455893e-05, "loss": 1.1028, "step": 4602 }, { "epoch": 0.22, "grad_norm": 1.3928060635674817, "learning_rate": 1.8146923091756813e-05, "loss": 1.2632, "step": 4603 }, { "epoch": 0.22, "grad_norm": 1.629824777921582, "learning_rate": 1.814601964135587e-05, "loss": 1.2871, "step": 4604 }, { "epoch": 0.22, "grad_norm": 1.2174438834394472, "learning_rate": 1.8145115993274986e-05, "loss": 1.2236, "step": 4605 }, { "epoch": 0.22, "grad_norm": 1.338686371469044, "learning_rate": 1.814421214753609e-05, "loss": 1.3105, "step": 4606 }, { "epoch": 0.22, "grad_norm": 1.3490404595455674, "learning_rate": 1.8143308104161117e-05, "loss": 1.1128, "step": 4607 }, { "epoch": 0.22, "grad_norm": 1.0294270664527179, "learning_rate": 1.8142403863172007e-05, "loss": 1.1577, "step": 4608 }, { "epoch": 0.22, "grad_norm": 1.1953509373105422, "learning_rate": 1.81414994245907e-05, "loss": 1.2612, "step": 4609 }, { "epoch": 0.22, "grad_norm": 1.4910919606664206, "learning_rate": 1.8140594788439142e-05, "loss": 1.3237, "step": 4610 }, { "epoch": 0.22, "grad_norm": 1.2718684865838354, "learning_rate": 1.8139689954739294e-05, "loss": 1.1841, "step": 4611 }, { "epoch": 0.22, "grad_norm": 1.166179800983941, "learning_rate": 1.8138784923513107e-05, "loss": 1.1611, "step": 4612 }, { "epoch": 0.22, "grad_norm": 1.2310101381141216, "learning_rate": 1.8137879694782543e-05, "loss": 1.2954, "step": 4613 }, { "epoch": 0.22, "grad_norm": 1.0405853906149551, "learning_rate": 1.8136974268569575e-05, "loss": 1.1548, "step": 4614 }, { "epoch": 0.22, "grad_norm": 1.1212238706301387, "learning_rate": 1.813606864489617e-05, "loss": 1.2681, "step": 4615 }, { "epoch": 0.22, "grad_norm": 1.3154975022881827, "learning_rate": 1.8135162823784303e-05, "loss": 1.2627, "step": 4616 }, { "epoch": 0.22, "grad_norm": 1.2191234161168363, "learning_rate": 1.8134256805255964e-05, "loss": 1.0776, "step": 4617 }, { "epoch": 0.22, "grad_norm": 1.0208052385445114, "learning_rate": 1.8133350589333136e-05, "loss": 1.1079, "step": 4618 }, { "epoch": 0.22, "grad_norm": 1.7370577678234242, "learning_rate": 1.8132444176037803e-05, "loss": 1.3447, "step": 4619 }, { "epoch": 0.22, "grad_norm": 1.493415811044422, "learning_rate": 1.8131537565391967e-05, "loss": 1.2651, "step": 4620 }, { "epoch": 0.22, "grad_norm": 1.1911851356491736, "learning_rate": 1.8130630757417628e-05, "loss": 1.1416, "step": 4621 }, { "epoch": 0.22, "grad_norm": 1.2023250058361632, "learning_rate": 1.8129723752136797e-05, "loss": 1.2549, "step": 4622 }, { "epoch": 0.22, "grad_norm": 1.3678320258723224, "learning_rate": 1.8128816549571472e-05, "loss": 1.1433, "step": 4623 }, { "epoch": 0.22, "grad_norm": 0.9700521748477612, "learning_rate": 1.812790914974368e-05, "loss": 1.1104, "step": 4624 }, { "epoch": 0.22, "grad_norm": 1.4602357639415833, "learning_rate": 1.8127001552675436e-05, "loss": 1.2271, "step": 4625 }, { "epoch": 0.22, "grad_norm": 1.0263956005285435, "learning_rate": 1.8126093758388764e-05, "loss": 1.1777, "step": 4626 }, { "epoch": 0.22, "grad_norm": 1.1572182002603058, "learning_rate": 1.8125185766905697e-05, "loss": 1.1597, "step": 4627 }, { "epoch": 0.22, "grad_norm": 1.5627797461937931, "learning_rate": 1.812427757824826e-05, "loss": 1.291, "step": 4628 }, { "epoch": 0.22, "grad_norm": 1.5197939867251296, "learning_rate": 1.8123369192438508e-05, "loss": 1.3496, "step": 4629 }, { "epoch": 0.22, "grad_norm": 1.3386229114356023, "learning_rate": 1.812246060949847e-05, "loss": 1.2944, "step": 4630 }, { "epoch": 0.22, "grad_norm": 1.1531589280809833, "learning_rate": 1.81215518294502e-05, "loss": 1.0706, "step": 4631 }, { "epoch": 0.22, "grad_norm": 1.4108602737109173, "learning_rate": 1.812064285231575e-05, "loss": 1.334, "step": 4632 }, { "epoch": 0.22, "grad_norm": 1.6144806203569138, "learning_rate": 1.8119733678117185e-05, "loss": 1.27, "step": 4633 }, { "epoch": 0.22, "grad_norm": 1.0268770566122665, "learning_rate": 1.811882430687656e-05, "loss": 1.0422, "step": 4634 }, { "epoch": 0.22, "grad_norm": 1.142928670678361, "learning_rate": 1.811791473861595e-05, "loss": 1.0596, "step": 4635 }, { "epoch": 0.22, "grad_norm": 1.3579603648545955, "learning_rate": 1.811700497335742e-05, "loss": 1.2012, "step": 4636 }, { "epoch": 0.22, "grad_norm": 1.1758436186801722, "learning_rate": 1.811609501112305e-05, "loss": 1.2964, "step": 4637 }, { "epoch": 0.22, "grad_norm": 1.2541432638474839, "learning_rate": 1.8115184851934922e-05, "loss": 1.2681, "step": 4638 }, { "epoch": 0.22, "grad_norm": 1.1100636068886438, "learning_rate": 1.8114274495815123e-05, "loss": 1.1328, "step": 4639 }, { "epoch": 0.22, "grad_norm": 1.2897330910440368, "learning_rate": 1.8113363942785747e-05, "loss": 0.9832, "step": 4640 }, { "epoch": 0.22, "grad_norm": 1.1791665593102456, "learning_rate": 1.811245319286889e-05, "loss": 1.2139, "step": 4641 }, { "epoch": 0.22, "grad_norm": 1.1231716784049621, "learning_rate": 1.811154224608665e-05, "loss": 1.293, "step": 4642 }, { "epoch": 0.22, "grad_norm": 1.5772101956715967, "learning_rate": 1.8110631102461134e-05, "loss": 1.2188, "step": 4643 }, { "epoch": 0.22, "grad_norm": 1.678270196969465, "learning_rate": 1.8109719762014454e-05, "loss": 1.3218, "step": 4644 }, { "epoch": 0.22, "grad_norm": 1.380857222522095, "learning_rate": 1.8108808224768724e-05, "loss": 1.2241, "step": 4645 }, { "epoch": 0.22, "grad_norm": 1.331295903625861, "learning_rate": 1.8107896490746067e-05, "loss": 1.3525, "step": 4646 }, { "epoch": 0.22, "grad_norm": 1.2217658418801198, "learning_rate": 1.810698455996861e-05, "loss": 1.1104, "step": 4647 }, { "epoch": 0.22, "grad_norm": 0.9478161645158517, "learning_rate": 1.8106072432458478e-05, "loss": 1.3071, "step": 4648 }, { "epoch": 0.22, "grad_norm": 1.2996884044059291, "learning_rate": 1.8105160108237805e-05, "loss": 1.2095, "step": 4649 }, { "epoch": 0.22, "grad_norm": 1.3422511021133121, "learning_rate": 1.8104247587328733e-05, "loss": 1.2427, "step": 4650 }, { "epoch": 0.22, "grad_norm": 1.1943585443760827, "learning_rate": 1.8103334869753406e-05, "loss": 1.2417, "step": 4651 }, { "epoch": 0.22, "grad_norm": 1.500404145338423, "learning_rate": 1.8102421955533974e-05, "loss": 1.1416, "step": 4652 }, { "epoch": 0.22, "grad_norm": 1.2650365621977697, "learning_rate": 1.8101508844692586e-05, "loss": 1.3857, "step": 4653 }, { "epoch": 0.22, "grad_norm": 0.9260314947208633, "learning_rate": 1.810059553725141e-05, "loss": 1.1357, "step": 4654 }, { "epoch": 0.22, "grad_norm": 0.8374162154043597, "learning_rate": 1.80996820332326e-05, "loss": 1.1787, "step": 4655 }, { "epoch": 0.22, "grad_norm": 1.352659063077424, "learning_rate": 1.8098768332658325e-05, "loss": 1.272, "step": 4656 }, { "epoch": 0.22, "grad_norm": 0.9113014214280893, "learning_rate": 1.809785443555076e-05, "loss": 1.0488, "step": 4657 }, { "epoch": 0.22, "grad_norm": 1.27989336755472, "learning_rate": 1.809694034193209e-05, "loss": 1.165, "step": 4658 }, { "epoch": 0.22, "grad_norm": 1.1974905411158188, "learning_rate": 1.8096026051824483e-05, "loss": 1.1865, "step": 4659 }, { "epoch": 0.22, "grad_norm": 1.1429835704058984, "learning_rate": 1.8095111565250137e-05, "loss": 1.145, "step": 4660 }, { "epoch": 0.22, "grad_norm": 1.1218077237322053, "learning_rate": 1.8094196882231235e-05, "loss": 0.9109, "step": 4661 }, { "epoch": 0.22, "grad_norm": 1.1574367209299006, "learning_rate": 1.8093282002789983e-05, "loss": 1.1396, "step": 4662 }, { "epoch": 0.22, "grad_norm": 1.3280984293424698, "learning_rate": 1.8092366926948578e-05, "loss": 1.2305, "step": 4663 }, { "epoch": 0.22, "grad_norm": 1.2223373518221028, "learning_rate": 1.8091451654729225e-05, "loss": 1.1841, "step": 4664 }, { "epoch": 0.22, "grad_norm": 1.1678002358034585, "learning_rate": 1.8090536186154143e-05, "loss": 1.1987, "step": 4665 }, { "epoch": 0.22, "grad_norm": 1.2867389150206026, "learning_rate": 1.8089620521245534e-05, "loss": 1.1099, "step": 4666 }, { "epoch": 0.22, "grad_norm": 1.2484091593722442, "learning_rate": 1.8088704660025626e-05, "loss": 1.4219, "step": 4667 }, { "epoch": 0.22, "grad_norm": 1.3487127972814297, "learning_rate": 1.8087788602516643e-05, "loss": 1.249, "step": 4668 }, { "epoch": 0.22, "grad_norm": 1.312864461818492, "learning_rate": 1.808687234874082e-05, "loss": 1.3477, "step": 4669 }, { "epoch": 0.22, "grad_norm": 1.7293317329192608, "learning_rate": 1.8085955898720388e-05, "loss": 1.1082, "step": 4670 }, { "epoch": 0.22, "grad_norm": 1.6595396477992443, "learning_rate": 1.8085039252477584e-05, "loss": 1.2505, "step": 4671 }, { "epoch": 0.22, "grad_norm": 1.293217248766018, "learning_rate": 1.8084122410034655e-05, "loss": 1.2881, "step": 4672 }, { "epoch": 0.22, "grad_norm": 1.3450871286105295, "learning_rate": 1.808320537141385e-05, "loss": 1.3066, "step": 4673 }, { "epoch": 0.22, "grad_norm": 1.424562524846477, "learning_rate": 1.808228813663742e-05, "loss": 1.1279, "step": 4674 }, { "epoch": 0.22, "grad_norm": 1.1907554194828616, "learning_rate": 1.8081370705727632e-05, "loss": 1.0933, "step": 4675 }, { "epoch": 0.22, "grad_norm": 1.398530463177658, "learning_rate": 1.8080453078706737e-05, "loss": 1.3457, "step": 4676 }, { "epoch": 0.22, "grad_norm": 1.2875249787611744, "learning_rate": 1.8079535255597014e-05, "loss": 1.2524, "step": 4677 }, { "epoch": 0.23, "grad_norm": 1.0729790530513408, "learning_rate": 1.807861723642073e-05, "loss": 1.1504, "step": 4678 }, { "epoch": 0.23, "grad_norm": 1.1350773964669618, "learning_rate": 1.8077699021200163e-05, "loss": 1.0835, "step": 4679 }, { "epoch": 0.23, "grad_norm": 1.3236305609677372, "learning_rate": 1.80767806099576e-05, "loss": 1.2319, "step": 4680 }, { "epoch": 0.23, "grad_norm": 1.1780538090194632, "learning_rate": 1.807586200271532e-05, "loss": 1.1465, "step": 4681 }, { "epoch": 0.23, "grad_norm": 0.9526330233164249, "learning_rate": 1.8074943199495622e-05, "loss": 1.0022, "step": 4682 }, { "epoch": 0.23, "grad_norm": 1.1646270808452792, "learning_rate": 1.8074024200320797e-05, "loss": 1.1699, "step": 4683 }, { "epoch": 0.23, "grad_norm": 1.3649003790727408, "learning_rate": 1.8073105005213154e-05, "loss": 1.2568, "step": 4684 }, { "epoch": 0.23, "grad_norm": 1.275701296985823, "learning_rate": 1.807218561419499e-05, "loss": 1.3047, "step": 4685 }, { "epoch": 0.23, "grad_norm": 1.0909534141027, "learning_rate": 1.8071266027288625e-05, "loss": 1.2085, "step": 4686 }, { "epoch": 0.23, "grad_norm": 1.238954844341487, "learning_rate": 1.8070346244516367e-05, "loss": 1.3262, "step": 4687 }, { "epoch": 0.23, "grad_norm": 1.0873599920950383, "learning_rate": 1.806942626590054e-05, "loss": 1.1831, "step": 4688 }, { "epoch": 0.23, "grad_norm": 1.1494026566616067, "learning_rate": 1.8068506091463473e-05, "loss": 1.0579, "step": 4689 }, { "epoch": 0.23, "grad_norm": 1.4794095107274565, "learning_rate": 1.806758572122749e-05, "loss": 1.2139, "step": 4690 }, { "epoch": 0.23, "grad_norm": 1.3590725589600858, "learning_rate": 1.806666515521492e-05, "loss": 1.2275, "step": 4691 }, { "epoch": 0.23, "grad_norm": 1.1980065702956226, "learning_rate": 1.8065744393448118e-05, "loss": 1.1875, "step": 4692 }, { "epoch": 0.23, "grad_norm": 1.3513001456924731, "learning_rate": 1.806482343594942e-05, "loss": 1.2705, "step": 4693 }, { "epoch": 0.23, "grad_norm": 1.0791575345875006, "learning_rate": 1.806390228274117e-05, "loss": 1.1836, "step": 4694 }, { "epoch": 0.23, "grad_norm": 1.204674006048473, "learning_rate": 1.8062980933845732e-05, "loss": 1.2476, "step": 4695 }, { "epoch": 0.23, "grad_norm": 1.076139799641576, "learning_rate": 1.8062059389285455e-05, "loss": 0.9609, "step": 4696 }, { "epoch": 0.23, "grad_norm": 1.5047779205361997, "learning_rate": 1.806113764908271e-05, "loss": 1.2627, "step": 4697 }, { "epoch": 0.23, "grad_norm": 1.1404898331352722, "learning_rate": 1.8060215713259856e-05, "loss": 1.1719, "step": 4698 }, { "epoch": 0.23, "grad_norm": 1.1329321419713207, "learning_rate": 1.8059293581839277e-05, "loss": 1.0942, "step": 4699 }, { "epoch": 0.23, "grad_norm": 1.1406016954405083, "learning_rate": 1.805837125484334e-05, "loss": 1.3154, "step": 4700 }, { "epoch": 0.23, "grad_norm": 0.9965141206771454, "learning_rate": 1.8057448732294432e-05, "loss": 1.0796, "step": 4701 }, { "epoch": 0.23, "grad_norm": 0.9782328337948923, "learning_rate": 1.8056526014214944e-05, "loss": 1.2197, "step": 4702 }, { "epoch": 0.23, "grad_norm": 1.1966014738920447, "learning_rate": 1.805560310062726e-05, "loss": 1.2222, "step": 4703 }, { "epoch": 0.23, "grad_norm": 1.3818728511207918, "learning_rate": 1.8054679991553777e-05, "loss": 1.1831, "step": 4704 }, { "epoch": 0.23, "grad_norm": 1.2504069740202592, "learning_rate": 1.80537566870169e-05, "loss": 1.1816, "step": 4705 }, { "epoch": 0.23, "grad_norm": 1.1062159558297529, "learning_rate": 1.805283318703903e-05, "loss": 1.2119, "step": 4706 }, { "epoch": 0.23, "grad_norm": 1.1956071527943828, "learning_rate": 1.805190949164259e-05, "loss": 1.0005, "step": 4707 }, { "epoch": 0.23, "grad_norm": 1.1017141840320077, "learning_rate": 1.805098560084998e-05, "loss": 1.2017, "step": 4708 }, { "epoch": 0.23, "grad_norm": 1.2509543503500475, "learning_rate": 1.8050061514683624e-05, "loss": 1.0874, "step": 4709 }, { "epoch": 0.23, "grad_norm": 1.2154513707525691, "learning_rate": 1.8049137233165955e-05, "loss": 1.2056, "step": 4710 }, { "epoch": 0.23, "grad_norm": 1.36147357569537, "learning_rate": 1.8048212756319395e-05, "loss": 1.2837, "step": 4711 }, { "epoch": 0.23, "grad_norm": 1.279927035136454, "learning_rate": 1.804728808416638e-05, "loss": 1.1895, "step": 4712 }, { "epoch": 0.23, "grad_norm": 1.3683547969348335, "learning_rate": 1.8046363216729354e-05, "loss": 1.0684, "step": 4713 }, { "epoch": 0.23, "grad_norm": 1.3404813937571631, "learning_rate": 1.8045438154030752e-05, "loss": 1.2148, "step": 4714 }, { "epoch": 0.23, "grad_norm": 1.2152566777538842, "learning_rate": 1.8044512896093027e-05, "loss": 0.9871, "step": 4715 }, { "epoch": 0.23, "grad_norm": 1.2983493509408623, "learning_rate": 1.8043587442938633e-05, "loss": 1.168, "step": 4716 }, { "epoch": 0.23, "grad_norm": 1.1635304243486215, "learning_rate": 1.8042661794590023e-05, "loss": 1.1123, "step": 4717 }, { "epoch": 0.23, "grad_norm": 0.9551003047991745, "learning_rate": 1.804173595106967e-05, "loss": 1.2651, "step": 4718 }, { "epoch": 0.23, "grad_norm": 1.3840139914082743, "learning_rate": 1.804080991240003e-05, "loss": 1.3867, "step": 4719 }, { "epoch": 0.23, "grad_norm": 1.186623851955907, "learning_rate": 1.8039883678603583e-05, "loss": 1.0918, "step": 4720 }, { "epoch": 0.23, "grad_norm": 1.1163932318198322, "learning_rate": 1.8038957249702806e-05, "loss": 1.2358, "step": 4721 }, { "epoch": 0.23, "grad_norm": 1.3966442448731171, "learning_rate": 1.8038030625720173e-05, "loss": 1.1479, "step": 4722 }, { "epoch": 0.23, "grad_norm": 1.4219015614645245, "learning_rate": 1.803710380667818e-05, "loss": 1.0059, "step": 4723 }, { "epoch": 0.23, "grad_norm": 1.379041125802112, "learning_rate": 1.8036176792599313e-05, "loss": 1.3364, "step": 4724 }, { "epoch": 0.23, "grad_norm": 1.1356256732697534, "learning_rate": 1.803524958350607e-05, "loss": 1.1646, "step": 4725 }, { "epoch": 0.23, "grad_norm": 1.2266014799152656, "learning_rate": 1.8034322179420946e-05, "loss": 1.0283, "step": 4726 }, { "epoch": 0.23, "grad_norm": 1.3864735664591499, "learning_rate": 1.8033394580366453e-05, "loss": 1.1938, "step": 4727 }, { "epoch": 0.23, "grad_norm": 1.582256514201053, "learning_rate": 1.8032466786365098e-05, "loss": 1.2363, "step": 4728 }, { "epoch": 0.23, "grad_norm": 1.3057015405120276, "learning_rate": 1.80315387974394e-05, "loss": 1.2915, "step": 4729 }, { "epoch": 0.23, "grad_norm": 1.3958838152400168, "learning_rate": 1.8030610613611874e-05, "loss": 1.1582, "step": 4730 }, { "epoch": 0.23, "grad_norm": 1.1065903287778764, "learning_rate": 1.8029682234905044e-05, "loss": 1.2671, "step": 4731 }, { "epoch": 0.23, "grad_norm": 0.7720622182915661, "learning_rate": 1.8028753661341442e-05, "loss": 1.0376, "step": 4732 }, { "epoch": 0.23, "grad_norm": 1.410948277583298, "learning_rate": 1.80278248929436e-05, "loss": 1.3672, "step": 4733 }, { "epoch": 0.23, "grad_norm": 0.9547728565598028, "learning_rate": 1.8026895929734057e-05, "loss": 0.9995, "step": 4734 }, { "epoch": 0.23, "grad_norm": 1.0832624512365303, "learning_rate": 1.8025966771735354e-05, "loss": 0.9922, "step": 4735 }, { "epoch": 0.23, "grad_norm": 1.433393149373672, "learning_rate": 1.8025037418970047e-05, "loss": 1.3467, "step": 4736 }, { "epoch": 0.23, "grad_norm": 1.2307513808084152, "learning_rate": 1.8024107871460678e-05, "loss": 1.0269, "step": 4737 }, { "epoch": 0.23, "grad_norm": 1.4384987513189464, "learning_rate": 1.8023178129229808e-05, "loss": 1.2583, "step": 4738 }, { "epoch": 0.23, "grad_norm": 1.4057937018469655, "learning_rate": 1.8022248192300002e-05, "loss": 1.0439, "step": 4739 }, { "epoch": 0.23, "grad_norm": 0.8866770934228514, "learning_rate": 1.8021318060693823e-05, "loss": 1.1943, "step": 4740 }, { "epoch": 0.23, "grad_norm": 1.2165626877882418, "learning_rate": 1.8020387734433848e-05, "loss": 1.2075, "step": 4741 }, { "epoch": 0.23, "grad_norm": 1.2185663914878777, "learning_rate": 1.801945721354265e-05, "loss": 0.9609, "step": 4742 }, { "epoch": 0.23, "grad_norm": 1.1869525692472314, "learning_rate": 1.8018526498042805e-05, "loss": 1.1023, "step": 4743 }, { "epoch": 0.23, "grad_norm": 1.5349550653870092, "learning_rate": 1.8017595587956907e-05, "loss": 1.3047, "step": 4744 }, { "epoch": 0.23, "grad_norm": 1.1776730807964357, "learning_rate": 1.801666448330754e-05, "loss": 1.1792, "step": 4745 }, { "epoch": 0.23, "grad_norm": 1.1159289533317938, "learning_rate": 1.8015733184117307e-05, "loss": 1.3286, "step": 4746 }, { "epoch": 0.23, "grad_norm": 1.1316172821772992, "learning_rate": 1.8014801690408798e-05, "loss": 1.2197, "step": 4747 }, { "epoch": 0.23, "grad_norm": 1.1723004652094446, "learning_rate": 1.8013870002204625e-05, "loss": 1.2144, "step": 4748 }, { "epoch": 0.23, "grad_norm": 1.3347042350890088, "learning_rate": 1.8012938119527396e-05, "loss": 1.1846, "step": 4749 }, { "epoch": 0.23, "grad_norm": 1.284119406691614, "learning_rate": 1.8012006042399724e-05, "loss": 1.1831, "step": 4750 }, { "epoch": 0.23, "grad_norm": 1.1480390619709482, "learning_rate": 1.8011073770844225e-05, "loss": 1.186, "step": 4751 }, { "epoch": 0.23, "grad_norm": 1.29659185452966, "learning_rate": 1.8010141304883527e-05, "loss": 1.1284, "step": 4752 }, { "epoch": 0.23, "grad_norm": 1.2409893403238503, "learning_rate": 1.800920864454026e-05, "loss": 1.1631, "step": 4753 }, { "epoch": 0.23, "grad_norm": 1.110250623152587, "learning_rate": 1.8008275789837047e-05, "loss": 1.0918, "step": 4754 }, { "epoch": 0.23, "grad_norm": 1.3717125788767786, "learning_rate": 1.8007342740796538e-05, "loss": 1.1348, "step": 4755 }, { "epoch": 0.23, "grad_norm": 1.5619640843294615, "learning_rate": 1.8006409497441364e-05, "loss": 1.3594, "step": 4756 }, { "epoch": 0.23, "grad_norm": 1.6241318112234875, "learning_rate": 1.8005476059794183e-05, "loss": 1.2324, "step": 4757 }, { "epoch": 0.23, "grad_norm": 1.1312551821153949, "learning_rate": 1.8004542427877635e-05, "loss": 1.2104, "step": 4758 }, { "epoch": 0.23, "grad_norm": 1.2075139994914001, "learning_rate": 1.8003608601714388e-05, "loss": 1.1006, "step": 4759 }, { "epoch": 0.23, "grad_norm": 1.2999486701092993, "learning_rate": 1.8002674581327096e-05, "loss": 1.2046, "step": 4760 }, { "epoch": 0.23, "grad_norm": 1.2247780356224924, "learning_rate": 1.8001740366738426e-05, "loss": 1.1016, "step": 4761 }, { "epoch": 0.23, "grad_norm": 1.2123503978391823, "learning_rate": 1.8000805957971054e-05, "loss": 1.1504, "step": 4762 }, { "epoch": 0.23, "grad_norm": 1.2475202729322248, "learning_rate": 1.7999871355047647e-05, "loss": 1.1982, "step": 4763 }, { "epoch": 0.23, "grad_norm": 1.3599648169897485, "learning_rate": 1.799893655799089e-05, "loss": 1.3276, "step": 4764 }, { "epoch": 0.23, "grad_norm": 1.2323664153259568, "learning_rate": 1.7998001566823466e-05, "loss": 1.2236, "step": 4765 }, { "epoch": 0.23, "grad_norm": 1.1606424902794972, "learning_rate": 1.7997066381568066e-05, "loss": 1.2412, "step": 4766 }, { "epoch": 0.23, "grad_norm": 1.1299954595629869, "learning_rate": 1.7996131002247382e-05, "loss": 1.2407, "step": 4767 }, { "epoch": 0.23, "grad_norm": 1.1064547574625627, "learning_rate": 1.7995195428884114e-05, "loss": 1.1333, "step": 4768 }, { "epoch": 0.23, "grad_norm": 1.3252188493516122, "learning_rate": 1.7994259661500967e-05, "loss": 1.1938, "step": 4769 }, { "epoch": 0.23, "grad_norm": 1.2156675392033995, "learning_rate": 1.7993323700120648e-05, "loss": 1.2422, "step": 4770 }, { "epoch": 0.23, "grad_norm": 1.295932156209699, "learning_rate": 1.7992387544765874e-05, "loss": 1.0852, "step": 4771 }, { "epoch": 0.23, "grad_norm": 1.4342274391199086, "learning_rate": 1.7991451195459356e-05, "loss": 1.2583, "step": 4772 }, { "epoch": 0.23, "grad_norm": 0.9798805677724934, "learning_rate": 1.7990514652223818e-05, "loss": 1.2061, "step": 4773 }, { "epoch": 0.23, "grad_norm": 1.2468675401289773, "learning_rate": 1.798957791508199e-05, "loss": 1.1191, "step": 4774 }, { "epoch": 0.23, "grad_norm": 1.452404609825306, "learning_rate": 1.7988640984056602e-05, "loss": 1.293, "step": 4775 }, { "epoch": 0.23, "grad_norm": 1.0065926599060373, "learning_rate": 1.7987703859170393e-05, "loss": 1.1562, "step": 4776 }, { "epoch": 0.23, "grad_norm": 1.3762877160114013, "learning_rate": 1.79867665404461e-05, "loss": 1.2427, "step": 4777 }, { "epoch": 0.23, "grad_norm": 1.1571259337010766, "learning_rate": 1.7985829027906475e-05, "loss": 1.1548, "step": 4778 }, { "epoch": 0.23, "grad_norm": 1.1624275712190564, "learning_rate": 1.798489132157426e-05, "loss": 1.1709, "step": 4779 }, { "epoch": 0.23, "grad_norm": 1.2628034483910522, "learning_rate": 1.798395342147222e-05, "loss": 1.2441, "step": 4780 }, { "epoch": 0.23, "grad_norm": 1.0570751089905746, "learning_rate": 1.798301532762311e-05, "loss": 1.1772, "step": 4781 }, { "epoch": 0.23, "grad_norm": 1.1724390717228557, "learning_rate": 1.7982077040049692e-05, "loss": 0.9526, "step": 4782 }, { "epoch": 0.23, "grad_norm": 1.3380807932209657, "learning_rate": 1.7981138558774737e-05, "loss": 1.1899, "step": 4783 }, { "epoch": 0.23, "grad_norm": 1.175950190282243, "learning_rate": 1.7980199883821026e-05, "loss": 1.187, "step": 4784 }, { "epoch": 0.23, "grad_norm": 1.3960075398347727, "learning_rate": 1.797926101521133e-05, "loss": 1.2534, "step": 4785 }, { "epoch": 0.23, "grad_norm": 1.1427773343251844, "learning_rate": 1.7978321952968435e-05, "loss": 1.187, "step": 4786 }, { "epoch": 0.23, "grad_norm": 1.328917360343138, "learning_rate": 1.797738269711513e-05, "loss": 1.2603, "step": 4787 }, { "epoch": 0.23, "grad_norm": 1.3513627504284176, "learning_rate": 1.797644324767421e-05, "loss": 1.2188, "step": 4788 }, { "epoch": 0.23, "grad_norm": 1.0366685966583733, "learning_rate": 1.7975503604668468e-05, "loss": 1.25, "step": 4789 }, { "epoch": 0.23, "grad_norm": 1.3898379720692817, "learning_rate": 1.797456376812071e-05, "loss": 1.2241, "step": 4790 }, { "epoch": 0.23, "grad_norm": 0.9445117226727996, "learning_rate": 1.797362373805374e-05, "loss": 1.2661, "step": 4791 }, { "epoch": 0.23, "grad_norm": 1.314243341061258, "learning_rate": 1.7972683514490372e-05, "loss": 1.123, "step": 4792 }, { "epoch": 0.23, "grad_norm": 1.3616706015419788, "learning_rate": 1.797174309745342e-05, "loss": 1.3003, "step": 4793 }, { "epoch": 0.23, "grad_norm": 1.201533515826434, "learning_rate": 1.7970802486965712e-05, "loss": 1.0708, "step": 4794 }, { "epoch": 0.23, "grad_norm": 1.1441043823401753, "learning_rate": 1.7969861683050064e-05, "loss": 1.1226, "step": 4795 }, { "epoch": 0.23, "grad_norm": 1.5796538111838478, "learning_rate": 1.7968920685729317e-05, "loss": 1.1914, "step": 4796 }, { "epoch": 0.23, "grad_norm": 1.2639130581718852, "learning_rate": 1.7967979495026296e-05, "loss": 1.2871, "step": 4797 }, { "epoch": 0.23, "grad_norm": 1.16603459755483, "learning_rate": 1.7967038110963847e-05, "loss": 1.1216, "step": 4798 }, { "epoch": 0.23, "grad_norm": 1.0547743515031858, "learning_rate": 1.7966096533564813e-05, "loss": 1.1396, "step": 4799 }, { "epoch": 0.23, "grad_norm": 1.2066425090812345, "learning_rate": 1.7965154762852044e-05, "loss": 1.2417, "step": 4800 }, { "epoch": 0.23, "grad_norm": 1.314674221179987, "learning_rate": 1.7964212798848393e-05, "loss": 1.1577, "step": 4801 }, { "epoch": 0.23, "grad_norm": 1.6450692987135276, "learning_rate": 1.796327064157672e-05, "loss": 1.269, "step": 4802 }, { "epoch": 0.23, "grad_norm": 1.6003494322657144, "learning_rate": 1.7962328291059886e-05, "loss": 1.1396, "step": 4803 }, { "epoch": 0.23, "grad_norm": 1.4306269225126373, "learning_rate": 1.7961385747320763e-05, "loss": 1.1934, "step": 4804 }, { "epoch": 0.23, "grad_norm": 1.468541914484602, "learning_rate": 1.796044301038222e-05, "loss": 1.3286, "step": 4805 }, { "epoch": 0.23, "grad_norm": 1.2703703708065843, "learning_rate": 1.795950008026714e-05, "loss": 1.0669, "step": 4806 }, { "epoch": 0.23, "grad_norm": 1.6668995399979225, "learning_rate": 1.7958556956998397e-05, "loss": 1.3218, "step": 4807 }, { "epoch": 0.23, "grad_norm": 1.1741470114324215, "learning_rate": 1.795761364059888e-05, "loss": 1.1846, "step": 4808 }, { "epoch": 0.23, "grad_norm": 1.1277221931292671, "learning_rate": 1.7956670131091486e-05, "loss": 1.2593, "step": 4809 }, { "epoch": 0.23, "grad_norm": 1.5172018310534539, "learning_rate": 1.7955726428499107e-05, "loss": 1.2334, "step": 4810 }, { "epoch": 0.23, "grad_norm": 1.252945819130846, "learning_rate": 1.7954782532844643e-05, "loss": 1.1846, "step": 4811 }, { "epoch": 0.23, "grad_norm": 1.5275761624829902, "learning_rate": 1.7953838444151004e-05, "loss": 1.1392, "step": 4812 }, { "epoch": 0.23, "grad_norm": 1.209969705119666, "learning_rate": 1.7952894162441094e-05, "loss": 1.2251, "step": 4813 }, { "epoch": 0.23, "grad_norm": 1.3100477949217855, "learning_rate": 1.795194968773783e-05, "loss": 1.1064, "step": 4814 }, { "epoch": 0.23, "grad_norm": 1.2553842089208935, "learning_rate": 1.7951005020064142e-05, "loss": 1.2485, "step": 4815 }, { "epoch": 0.23, "grad_norm": 1.1107125769051212, "learning_rate": 1.7950060159442934e-05, "loss": 1.2549, "step": 4816 }, { "epoch": 0.23, "grad_norm": 1.2053836021335163, "learning_rate": 1.7949115105897155e-05, "loss": 1.165, "step": 4817 }, { "epoch": 0.23, "grad_norm": 1.442472508822967, "learning_rate": 1.7948169859449726e-05, "loss": 1.2046, "step": 4818 }, { "epoch": 0.23, "grad_norm": 1.128925272271494, "learning_rate": 1.7947224420123587e-05, "loss": 1.1099, "step": 4819 }, { "epoch": 0.23, "grad_norm": 1.3313418654082176, "learning_rate": 1.7946278787941687e-05, "loss": 1.1851, "step": 4820 }, { "epoch": 0.23, "grad_norm": 1.311067265840723, "learning_rate": 1.7945332962926966e-05, "loss": 1.1807, "step": 4821 }, { "epoch": 0.23, "grad_norm": 1.602106963558663, "learning_rate": 1.7944386945102387e-05, "loss": 1.3335, "step": 4822 }, { "epoch": 0.23, "grad_norm": 1.2347212191041443, "learning_rate": 1.7943440734490893e-05, "loss": 1.2441, "step": 4823 }, { "epoch": 0.23, "grad_norm": 1.261935041212208, "learning_rate": 1.794249433111546e-05, "loss": 1.2686, "step": 4824 }, { "epoch": 0.23, "grad_norm": 1.2452358253451232, "learning_rate": 1.7941547734999043e-05, "loss": 1.2002, "step": 4825 }, { "epoch": 0.23, "grad_norm": 1.4188010894400374, "learning_rate": 1.794060094616462e-05, "loss": 1.3076, "step": 4826 }, { "epoch": 0.23, "grad_norm": 1.232720977092351, "learning_rate": 1.7939653964635163e-05, "loss": 1.1558, "step": 4827 }, { "epoch": 0.23, "grad_norm": 1.1606859077341687, "learning_rate": 1.7938706790433655e-05, "loss": 1.228, "step": 4828 }, { "epoch": 0.23, "grad_norm": 0.9413895400191855, "learning_rate": 1.793775942358308e-05, "loss": 1.0776, "step": 4829 }, { "epoch": 0.23, "grad_norm": 1.3247558943150954, "learning_rate": 1.7936811864106425e-05, "loss": 1.1338, "step": 4830 }, { "epoch": 0.23, "grad_norm": 1.2420599709159084, "learning_rate": 1.793586411202669e-05, "loss": 1.1748, "step": 4831 }, { "epoch": 0.23, "grad_norm": 1.1182037019886841, "learning_rate": 1.793491616736687e-05, "loss": 1.1768, "step": 4832 }, { "epoch": 0.23, "grad_norm": 1.2873283607637003, "learning_rate": 1.7933968030149972e-05, "loss": 1.0615, "step": 4833 }, { "epoch": 0.23, "grad_norm": 1.3958028992618157, "learning_rate": 1.7933019700399006e-05, "loss": 1.2744, "step": 4834 }, { "epoch": 0.23, "grad_norm": 1.6302902440836315, "learning_rate": 1.793207117813698e-05, "loss": 1.2744, "step": 4835 }, { "epoch": 0.23, "grad_norm": 1.3365901088581946, "learning_rate": 1.793112246338691e-05, "loss": 1.2334, "step": 4836 }, { "epoch": 0.23, "grad_norm": 1.2052241546992237, "learning_rate": 1.7930173556171824e-05, "loss": 1.0522, "step": 4837 }, { "epoch": 0.23, "grad_norm": 1.2874896991537066, "learning_rate": 1.792922445651475e-05, "loss": 1.1042, "step": 4838 }, { "epoch": 0.23, "grad_norm": 1.0761764553702429, "learning_rate": 1.7928275164438715e-05, "loss": 1.1743, "step": 4839 }, { "epoch": 0.23, "grad_norm": 1.2762026316667618, "learning_rate": 1.792732567996676e-05, "loss": 1.2715, "step": 4840 }, { "epoch": 0.23, "grad_norm": 1.239287824799206, "learning_rate": 1.7926376003121922e-05, "loss": 1.1602, "step": 4841 }, { "epoch": 0.23, "grad_norm": 1.6974471067355485, "learning_rate": 1.792542613392725e-05, "loss": 1.3389, "step": 4842 }, { "epoch": 0.23, "grad_norm": 1.2385153571119034, "learning_rate": 1.7924476072405795e-05, "loss": 1.2061, "step": 4843 }, { "epoch": 0.23, "grad_norm": 1.490074876952386, "learning_rate": 1.792352581858061e-05, "loss": 1.2109, "step": 4844 }, { "epoch": 0.23, "grad_norm": 1.4414356011566127, "learning_rate": 1.7922575372474755e-05, "loss": 1.2192, "step": 4845 }, { "epoch": 0.23, "grad_norm": 0.8255915598651289, "learning_rate": 1.7921624734111292e-05, "loss": 1.094, "step": 4846 }, { "epoch": 0.23, "grad_norm": 1.1501745247902115, "learning_rate": 1.79206739035133e-05, "loss": 1.272, "step": 4847 }, { "epoch": 0.23, "grad_norm": 1.1884486796434799, "learning_rate": 1.7919722880703843e-05, "loss": 1.3252, "step": 4848 }, { "epoch": 0.23, "grad_norm": 1.4740860411457277, "learning_rate": 1.7918771665706e-05, "loss": 1.186, "step": 4849 }, { "epoch": 0.23, "grad_norm": 1.2783430253409893, "learning_rate": 1.7917820258542863e-05, "loss": 1.1904, "step": 4850 }, { "epoch": 0.23, "grad_norm": 1.4131829380239787, "learning_rate": 1.791686865923751e-05, "loss": 1.2471, "step": 4851 }, { "epoch": 0.23, "grad_norm": 1.0272441516966353, "learning_rate": 1.7915916867813037e-05, "loss": 1.2832, "step": 4852 }, { "epoch": 0.23, "grad_norm": 1.4023420395124582, "learning_rate": 1.7914964884292543e-05, "loss": 1.2075, "step": 4853 }, { "epoch": 0.23, "grad_norm": 1.2076652210030183, "learning_rate": 1.7914012708699126e-05, "loss": 1.1355, "step": 4854 }, { "epoch": 0.23, "grad_norm": 1.3869291607103191, "learning_rate": 1.7913060341055895e-05, "loss": 1.3179, "step": 4855 }, { "epoch": 0.23, "grad_norm": 1.08342390083113, "learning_rate": 1.7912107781385963e-05, "loss": 1.2021, "step": 4856 }, { "epoch": 0.23, "grad_norm": 1.206832994213776, "learning_rate": 1.7911155029712444e-05, "loss": 1.1333, "step": 4857 }, { "epoch": 0.23, "grad_norm": 1.1264510552471978, "learning_rate": 1.7910202086058458e-05, "loss": 1.0659, "step": 4858 }, { "epoch": 0.23, "grad_norm": 1.2905997997978627, "learning_rate": 1.790924895044713e-05, "loss": 1.1777, "step": 4859 }, { "epoch": 0.23, "grad_norm": 1.3101097016670136, "learning_rate": 1.790829562290159e-05, "loss": 1.21, "step": 4860 }, { "epoch": 0.23, "grad_norm": 1.2403979649533146, "learning_rate": 1.7907342103444975e-05, "loss": 1.1846, "step": 4861 }, { "epoch": 0.23, "grad_norm": 1.2746990628386623, "learning_rate": 1.790638839210042e-05, "loss": 1.3071, "step": 4862 }, { "epoch": 0.23, "grad_norm": 1.2747284607947689, "learning_rate": 1.790543448889107e-05, "loss": 1.1499, "step": 4863 }, { "epoch": 0.23, "grad_norm": 1.1685847828910976, "learning_rate": 1.7904480393840074e-05, "loss": 1.2178, "step": 4864 }, { "epoch": 0.23, "grad_norm": 1.1109384979867274, "learning_rate": 1.7903526106970585e-05, "loss": 1.0681, "step": 4865 }, { "epoch": 0.23, "grad_norm": 1.2436524915169016, "learning_rate": 1.790257162830576e-05, "loss": 1.1724, "step": 4866 }, { "epoch": 0.23, "grad_norm": 1.31575919496647, "learning_rate": 1.7901616957868766e-05, "loss": 1.0962, "step": 4867 }, { "epoch": 0.23, "grad_norm": 1.253257180362507, "learning_rate": 1.7900662095682762e-05, "loss": 1.2891, "step": 4868 }, { "epoch": 0.23, "grad_norm": 1.0411566895070299, "learning_rate": 1.7899707041770925e-05, "loss": 1.2373, "step": 4869 }, { "epoch": 0.23, "grad_norm": 1.213919844737081, "learning_rate": 1.7898751796156433e-05, "loss": 1.1631, "step": 4870 }, { "epoch": 0.23, "grad_norm": 1.3632067434132507, "learning_rate": 1.789779635886246e-05, "loss": 1.0728, "step": 4871 }, { "epoch": 0.23, "grad_norm": 1.1393019543225746, "learning_rate": 1.7896840729912198e-05, "loss": 1.1274, "step": 4872 }, { "epoch": 0.23, "grad_norm": 1.237438298901906, "learning_rate": 1.7895884909328835e-05, "loss": 1.0305, "step": 4873 }, { "epoch": 0.23, "grad_norm": 1.1540523427972924, "learning_rate": 1.789492889713557e-05, "loss": 1.2158, "step": 4874 }, { "epoch": 0.23, "grad_norm": 1.5325534815204809, "learning_rate": 1.7893972693355595e-05, "loss": 1.2212, "step": 4875 }, { "epoch": 0.23, "grad_norm": 1.240533783595969, "learning_rate": 1.7893016298012117e-05, "loss": 1.0847, "step": 4876 }, { "epoch": 0.23, "grad_norm": 1.4304131228610446, "learning_rate": 1.7892059711128346e-05, "loss": 1.2085, "step": 4877 }, { "epoch": 0.23, "grad_norm": 1.3855019199928713, "learning_rate": 1.78911029327275e-05, "loss": 1.1375, "step": 4878 }, { "epoch": 0.23, "grad_norm": 1.309576136328083, "learning_rate": 1.7890145962832786e-05, "loss": 1.219, "step": 4879 }, { "epoch": 0.23, "grad_norm": 1.4506581637571443, "learning_rate": 1.788918880146744e-05, "loss": 1.332, "step": 4880 }, { "epoch": 0.23, "grad_norm": 1.2916938964173776, "learning_rate": 1.788823144865468e-05, "loss": 1.2129, "step": 4881 }, { "epoch": 0.23, "grad_norm": 1.10011917355303, "learning_rate": 1.7887273904417742e-05, "loss": 1.1069, "step": 4882 }, { "epoch": 0.23, "grad_norm": 1.3572908076976662, "learning_rate": 1.7886316168779862e-05, "loss": 1.0693, "step": 4883 }, { "epoch": 0.23, "grad_norm": 1.1720517729891105, "learning_rate": 1.7885358241764282e-05, "loss": 1.0847, "step": 4884 }, { "epoch": 0.23, "grad_norm": 1.4090624946605568, "learning_rate": 1.7884400123394243e-05, "loss": 1.1606, "step": 4885 }, { "epoch": 0.24, "grad_norm": 1.342055959668762, "learning_rate": 1.7883441813693006e-05, "loss": 1.3096, "step": 4886 }, { "epoch": 0.24, "grad_norm": 1.2033290836822448, "learning_rate": 1.7882483312683816e-05, "loss": 1.187, "step": 4887 }, { "epoch": 0.24, "grad_norm": 1.2271961206823496, "learning_rate": 1.788152462038994e-05, "loss": 1.1162, "step": 4888 }, { "epoch": 0.24, "grad_norm": 1.325104086239715, "learning_rate": 1.7880565736834642e-05, "loss": 1.1484, "step": 4889 }, { "epoch": 0.24, "grad_norm": 1.2300055391952998, "learning_rate": 1.7879606662041186e-05, "loss": 1.0547, "step": 4890 }, { "epoch": 0.24, "grad_norm": 1.2248587362007275, "learning_rate": 1.787864739603285e-05, "loss": 1.2417, "step": 4891 }, { "epoch": 0.24, "grad_norm": 1.1078914218472715, "learning_rate": 1.7877687938832915e-05, "loss": 1.3486, "step": 4892 }, { "epoch": 0.24, "grad_norm": 1.0592133924622917, "learning_rate": 1.7876728290464658e-05, "loss": 1.1431, "step": 4893 }, { "epoch": 0.24, "grad_norm": 1.3412408214458693, "learning_rate": 1.787576845095137e-05, "loss": 1.2339, "step": 4894 }, { "epoch": 0.24, "grad_norm": 1.1061983543608782, "learning_rate": 1.7874808420316345e-05, "loss": 0.8818, "step": 4895 }, { "epoch": 0.24, "grad_norm": 1.396994611076, "learning_rate": 1.787384819858288e-05, "loss": 1.2852, "step": 4896 }, { "epoch": 0.24, "grad_norm": 1.2108125250907351, "learning_rate": 1.787288778577427e-05, "loss": 1.3237, "step": 4897 }, { "epoch": 0.24, "grad_norm": 1.0203444495536762, "learning_rate": 1.7871927181913832e-05, "loss": 1.1597, "step": 4898 }, { "epoch": 0.24, "grad_norm": 1.1473891799165707, "learning_rate": 1.787096638702487e-05, "loss": 1.2539, "step": 4899 }, { "epoch": 0.24, "grad_norm": 1.0423639691788777, "learning_rate": 1.78700054011307e-05, "loss": 1.2891, "step": 4900 }, { "epoch": 0.24, "grad_norm": 1.3553928839144185, "learning_rate": 1.7869044224254648e-05, "loss": 1.2939, "step": 4901 }, { "epoch": 0.24, "grad_norm": 1.6269742100185638, "learning_rate": 1.786808285642003e-05, "loss": 1.3418, "step": 4902 }, { "epoch": 0.24, "grad_norm": 1.2910689119201095, "learning_rate": 1.7867121297650184e-05, "loss": 1.1641, "step": 4903 }, { "epoch": 0.24, "grad_norm": 1.372005169159716, "learning_rate": 1.786615954796844e-05, "loss": 1.334, "step": 4904 }, { "epoch": 0.24, "grad_norm": 1.1887744708047752, "learning_rate": 1.7865197607398133e-05, "loss": 1.23, "step": 4905 }, { "epoch": 0.24, "grad_norm": 1.3319805544671728, "learning_rate": 1.7864235475962616e-05, "loss": 1.2383, "step": 4906 }, { "epoch": 0.24, "grad_norm": 1.3453655278083003, "learning_rate": 1.786327315368523e-05, "loss": 1.1528, "step": 4907 }, { "epoch": 0.24, "grad_norm": 1.0449369142233305, "learning_rate": 1.7862310640589328e-05, "loss": 1.2539, "step": 4908 }, { "epoch": 0.24, "grad_norm": 1.1801755222964156, "learning_rate": 1.786134793669827e-05, "loss": 1.0686, "step": 4909 }, { "epoch": 0.24, "grad_norm": 1.2405030857612542, "learning_rate": 1.7860385042035418e-05, "loss": 1.2139, "step": 4910 }, { "epoch": 0.24, "grad_norm": 1.3638758352578797, "learning_rate": 1.7859421956624135e-05, "loss": 1.2612, "step": 4911 }, { "epoch": 0.24, "grad_norm": 1.286899891343759, "learning_rate": 1.7858458680487798e-05, "loss": 1.3159, "step": 4912 }, { "epoch": 0.24, "grad_norm": 1.263679968761987, "learning_rate": 1.785749521364978e-05, "loss": 1.2539, "step": 4913 }, { "epoch": 0.24, "grad_norm": 1.1716779480091566, "learning_rate": 1.7856531556133457e-05, "loss": 1.3159, "step": 4914 }, { "epoch": 0.24, "grad_norm": 1.281444551446763, "learning_rate": 1.785556770796222e-05, "loss": 1.1353, "step": 4915 }, { "epoch": 0.24, "grad_norm": 1.2561890182054964, "learning_rate": 1.785460366915946e-05, "loss": 1.3096, "step": 4916 }, { "epoch": 0.24, "grad_norm": 1.4511476586583223, "learning_rate": 1.7853639439748564e-05, "loss": 1.0789, "step": 4917 }, { "epoch": 0.24, "grad_norm": 1.1232233001210574, "learning_rate": 1.785267501975294e-05, "loss": 1.2651, "step": 4918 }, { "epoch": 0.24, "grad_norm": 1.066829476618541, "learning_rate": 1.7851710409195987e-05, "loss": 1.1602, "step": 4919 }, { "epoch": 0.24, "grad_norm": 0.8674621841001435, "learning_rate": 1.785074560810111e-05, "loss": 1.0442, "step": 4920 }, { "epoch": 0.24, "grad_norm": 1.3115864251158849, "learning_rate": 1.7849780616491726e-05, "loss": 1.2861, "step": 4921 }, { "epoch": 0.24, "grad_norm": 1.322655778231526, "learning_rate": 1.7848815434391254e-05, "loss": 1.3882, "step": 4922 }, { "epoch": 0.24, "grad_norm": 1.433923179026627, "learning_rate": 1.784785006182311e-05, "loss": 1.2983, "step": 4923 }, { "epoch": 0.24, "grad_norm": 1.288801606226345, "learning_rate": 1.784688449881073e-05, "loss": 1.168, "step": 4924 }, { "epoch": 0.24, "grad_norm": 1.1470195672799404, "learning_rate": 1.784591874537754e-05, "loss": 1.2241, "step": 4925 }, { "epoch": 0.24, "grad_norm": 1.3686436557433157, "learning_rate": 1.784495280154697e-05, "loss": 1.2354, "step": 4926 }, { "epoch": 0.24, "grad_norm": 1.335580331481445, "learning_rate": 1.784398666734247e-05, "loss": 1.2065, "step": 4927 }, { "epoch": 0.24, "grad_norm": 1.3334503623163296, "learning_rate": 1.784302034278748e-05, "loss": 1.1118, "step": 4928 }, { "epoch": 0.24, "grad_norm": 1.3919412200538406, "learning_rate": 1.7842053827905457e-05, "loss": 1.2812, "step": 4929 }, { "epoch": 0.24, "grad_norm": 1.0850334105205488, "learning_rate": 1.784108712271985e-05, "loss": 1.0513, "step": 4930 }, { "epoch": 0.24, "grad_norm": 1.1149797987981631, "learning_rate": 1.7840120227254115e-05, "loss": 1.0723, "step": 4931 }, { "epoch": 0.24, "grad_norm": 1.5002103952248196, "learning_rate": 1.783915314153172e-05, "loss": 1.3828, "step": 4932 }, { "epoch": 0.24, "grad_norm": 1.27629226848574, "learning_rate": 1.783818586557613e-05, "loss": 1.0608, "step": 4933 }, { "epoch": 0.24, "grad_norm": 1.0764915894362816, "learning_rate": 1.7837218399410822e-05, "loss": 1.1724, "step": 4934 }, { "epoch": 0.24, "grad_norm": 1.2377380650314207, "learning_rate": 1.783625074305927e-05, "loss": 1.2451, "step": 4935 }, { "epoch": 0.24, "grad_norm": 1.0683279959189522, "learning_rate": 1.7835282896544963e-05, "loss": 1.0791, "step": 4936 }, { "epoch": 0.24, "grad_norm": 1.2937508499973909, "learning_rate": 1.783431485989138e-05, "loss": 1.2378, "step": 4937 }, { "epoch": 0.24, "grad_norm": 1.8520149462321878, "learning_rate": 1.7833346633122013e-05, "loss": 1.1221, "step": 4938 }, { "epoch": 0.24, "grad_norm": 1.3689484250388382, "learning_rate": 1.7832378216260365e-05, "loss": 1.3691, "step": 4939 }, { "epoch": 0.24, "grad_norm": 1.6504506682226088, "learning_rate": 1.7831409609329927e-05, "loss": 1.3696, "step": 4940 }, { "epoch": 0.24, "grad_norm": 1.3262818769406042, "learning_rate": 1.7830440812354216e-05, "loss": 1.1694, "step": 4941 }, { "epoch": 0.24, "grad_norm": 1.4552880071623104, "learning_rate": 1.782947182535673e-05, "loss": 1.2402, "step": 4942 }, { "epoch": 0.24, "grad_norm": 0.9688034010685299, "learning_rate": 1.782850264836099e-05, "loss": 1.2788, "step": 4943 }, { "epoch": 0.24, "grad_norm": 1.3676749455560824, "learning_rate": 1.782753328139051e-05, "loss": 1.311, "step": 4944 }, { "epoch": 0.24, "grad_norm": 1.29895755491313, "learning_rate": 1.782656372446882e-05, "loss": 1.1992, "step": 4945 }, { "epoch": 0.24, "grad_norm": 1.1218307853685279, "learning_rate": 1.7825593977619443e-05, "loss": 1.2095, "step": 4946 }, { "epoch": 0.24, "grad_norm": 1.180209126800921, "learning_rate": 1.782462404086592e-05, "loss": 1.2866, "step": 4947 }, { "epoch": 0.24, "grad_norm": 1.487061209111331, "learning_rate": 1.782365391423178e-05, "loss": 1.021, "step": 4948 }, { "epoch": 0.24, "grad_norm": 1.2380517682839416, "learning_rate": 1.7822683597740568e-05, "loss": 1.1333, "step": 4949 }, { "epoch": 0.24, "grad_norm": 1.3501835005580012, "learning_rate": 1.782171309141583e-05, "loss": 1.1455, "step": 4950 }, { "epoch": 0.24, "grad_norm": 0.9397772784857986, "learning_rate": 1.782074239528112e-05, "loss": 1.271, "step": 4951 }, { "epoch": 0.24, "grad_norm": 1.4412547854375268, "learning_rate": 1.781977150935999e-05, "loss": 1.189, "step": 4952 }, { "epoch": 0.24, "grad_norm": 1.0772538275516599, "learning_rate": 1.7818800433676e-05, "loss": 1.0056, "step": 4953 }, { "epoch": 0.24, "grad_norm": 1.3155811646100826, "learning_rate": 1.781782916825272e-05, "loss": 1.1709, "step": 4954 }, { "epoch": 0.24, "grad_norm": 1.4229297353596286, "learning_rate": 1.781685771311372e-05, "loss": 1.3018, "step": 4955 }, { "epoch": 0.24, "grad_norm": 1.2676701758879354, "learning_rate": 1.781588606828257e-05, "loss": 1.2173, "step": 4956 }, { "epoch": 0.24, "grad_norm": 0.9799388514532664, "learning_rate": 1.7814914233782848e-05, "loss": 1.2622, "step": 4957 }, { "epoch": 0.24, "grad_norm": 0.895089975317318, "learning_rate": 1.7813942209638148e-05, "loss": 1.1523, "step": 4958 }, { "epoch": 0.24, "grad_norm": 1.733938010659696, "learning_rate": 1.7812969995872044e-05, "loss": 1.4243, "step": 4959 }, { "epoch": 0.24, "grad_norm": 1.242460300113702, "learning_rate": 1.781199759250814e-05, "loss": 1.1504, "step": 4960 }, { "epoch": 0.24, "grad_norm": 1.2739475123840172, "learning_rate": 1.7811024999570023e-05, "loss": 1.2847, "step": 4961 }, { "epoch": 0.24, "grad_norm": 1.1079936455784112, "learning_rate": 1.7810052217081306e-05, "loss": 1.2529, "step": 4962 }, { "epoch": 0.24, "grad_norm": 1.1726863997792616, "learning_rate": 1.7809079245065586e-05, "loss": 1.1323, "step": 4963 }, { "epoch": 0.24, "grad_norm": 1.2875414212580971, "learning_rate": 1.7808106083546478e-05, "loss": 1.1851, "step": 4964 }, { "epoch": 0.24, "grad_norm": 1.504052652793305, "learning_rate": 1.7807132732547603e-05, "loss": 1.272, "step": 4965 }, { "epoch": 0.24, "grad_norm": 1.2237363519298097, "learning_rate": 1.7806159192092575e-05, "loss": 1.1836, "step": 4966 }, { "epoch": 0.24, "grad_norm": 1.1819063182836653, "learning_rate": 1.7805185462205018e-05, "loss": 1.147, "step": 4967 }, { "epoch": 0.24, "grad_norm": 1.192661765064858, "learning_rate": 1.7804211542908568e-05, "loss": 0.9597, "step": 4968 }, { "epoch": 0.24, "grad_norm": 1.540314064050184, "learning_rate": 1.780323743422685e-05, "loss": 1.1479, "step": 4969 }, { "epoch": 0.24, "grad_norm": 0.6709026019589093, "learning_rate": 1.7802263136183514e-05, "loss": 1.1641, "step": 4970 }, { "epoch": 0.24, "grad_norm": 1.4903600438363398, "learning_rate": 1.780128864880219e-05, "loss": 1.1523, "step": 4971 }, { "epoch": 0.24, "grad_norm": 1.1248822665318277, "learning_rate": 1.780031397210654e-05, "loss": 1.0879, "step": 4972 }, { "epoch": 0.24, "grad_norm": 0.8927747292997149, "learning_rate": 1.7799339106120205e-05, "loss": 1.2217, "step": 4973 }, { "epoch": 0.24, "grad_norm": 1.2911374662243948, "learning_rate": 1.7798364050866853e-05, "loss": 1.3936, "step": 4974 }, { "epoch": 0.24, "grad_norm": 1.2146881223244403, "learning_rate": 1.7797388806370132e-05, "loss": 1.2178, "step": 4975 }, { "epoch": 0.24, "grad_norm": 1.308162937555163, "learning_rate": 1.779641337265372e-05, "loss": 1.1597, "step": 4976 }, { "epoch": 0.24, "grad_norm": 1.252103453433163, "learning_rate": 1.7795437749741283e-05, "loss": 1.1172, "step": 4977 }, { "epoch": 0.24, "grad_norm": 1.2446202226452117, "learning_rate": 1.77944619376565e-05, "loss": 1.2456, "step": 4978 }, { "epoch": 0.24, "grad_norm": 1.2668285964413382, "learning_rate": 1.7793485936423045e-05, "loss": 1.1309, "step": 4979 }, { "epoch": 0.24, "grad_norm": 1.041425536357868, "learning_rate": 1.7792509746064608e-05, "loss": 1.1221, "step": 4980 }, { "epoch": 0.24, "grad_norm": 1.226855119693531, "learning_rate": 1.7791533366604876e-05, "loss": 1.0796, "step": 4981 }, { "epoch": 0.24, "grad_norm": 1.283223034793816, "learning_rate": 1.7790556798067543e-05, "loss": 1.1938, "step": 4982 }, { "epoch": 0.24, "grad_norm": 1.207181001199412, "learning_rate": 1.7789580040476305e-05, "loss": 1.3042, "step": 4983 }, { "epoch": 0.24, "grad_norm": 1.3137804938992095, "learning_rate": 1.7788603093854872e-05, "loss": 1.1812, "step": 4984 }, { "epoch": 0.24, "grad_norm": 1.1236509437566278, "learning_rate": 1.7787625958226947e-05, "loss": 1.1831, "step": 4985 }, { "epoch": 0.24, "grad_norm": 0.9347268155601564, "learning_rate": 1.778664863361624e-05, "loss": 1.1436, "step": 4986 }, { "epoch": 0.24, "grad_norm": 1.3955904084815802, "learning_rate": 1.7785671120046472e-05, "loss": 1.2495, "step": 4987 }, { "epoch": 0.24, "grad_norm": 1.2040355581246427, "learning_rate": 1.7784693417541364e-05, "loss": 1.1831, "step": 4988 }, { "epoch": 0.24, "grad_norm": 1.2946661412713283, "learning_rate": 1.7783715526124637e-05, "loss": 1.2144, "step": 4989 }, { "epoch": 0.24, "grad_norm": 1.182121091033394, "learning_rate": 1.778273744582003e-05, "loss": 1.1416, "step": 4990 }, { "epoch": 0.24, "grad_norm": 1.227929835923367, "learning_rate": 1.7781759176651273e-05, "loss": 1.145, "step": 4991 }, { "epoch": 0.24, "grad_norm": 1.5166415741911123, "learning_rate": 1.77807807186421e-05, "loss": 1.1904, "step": 4992 }, { "epoch": 0.24, "grad_norm": 1.1879650104998185, "learning_rate": 1.777980207181627e-05, "loss": 1.2212, "step": 4993 }, { "epoch": 0.24, "grad_norm": 1.525328249469443, "learning_rate": 1.7778823236197515e-05, "loss": 1.4395, "step": 4994 }, { "epoch": 0.24, "grad_norm": 1.4704145207756008, "learning_rate": 1.7777844211809602e-05, "loss": 1.3623, "step": 4995 }, { "epoch": 0.24, "grad_norm": 1.4389811187367623, "learning_rate": 1.7776864998676284e-05, "loss": 1.0876, "step": 4996 }, { "epoch": 0.24, "grad_norm": 1.496230803623764, "learning_rate": 1.777588559682132e-05, "loss": 1.1997, "step": 4997 }, { "epoch": 0.24, "grad_norm": 1.2646560804873412, "learning_rate": 1.7774906006268482e-05, "loss": 1.1143, "step": 4998 }, { "epoch": 0.24, "grad_norm": 1.2233090355096585, "learning_rate": 1.777392622704154e-05, "loss": 1.2925, "step": 4999 }, { "epoch": 0.24, "grad_norm": 1.0835058930794512, "learning_rate": 1.7772946259164272e-05, "loss": 1.0217, "step": 5000 } ], "logging_steps": 1.0, "max_steps": 20791, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "total_flos": 245260153888768.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }