diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,35021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2404886729835025, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 5.238859304449232, + "learning_rate": 3.205128205128205e-08, + "loss": 1.9292, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 5.154761972828117, + "learning_rate": 6.41025641025641e-08, + "loss": 1.9106, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 5.154761972828117, + "learning_rate": 6.41025641025641e-08, + "loss": 2.0762, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 5.154761972828117, + "learning_rate": 6.41025641025641e-08, + "loss": 1.9214, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 4.5501767239225925, + "learning_rate": 9.615384615384617e-08, + "loss": 1.9214, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 8.121504141000438, + "learning_rate": 1.282051282051282e-07, + "loss": 1.9585, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 5.023363133110595, + "learning_rate": 1.6025641025641025e-07, + "loss": 1.9023, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 6.753326612596782, + "learning_rate": 1.9230769230769234e-07, + "loss": 2.0918, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 5.96045460748767, + "learning_rate": 2.2435897435897438e-07, + "loss": 1.9512, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 4.114270738139542, + "learning_rate": 2.564102564102564e-07, + "loss": 1.7202, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 4.684535049833846, + "learning_rate": 2.884615384615385e-07, + "loss": 1.9663, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 6.072578005006582, + "learning_rate": 3.205128205128205e-07, + "loss": 1.749, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 7.46480114791522, + "learning_rate": 3.525641025641026e-07, + "loss": 1.874, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 5.262086839396571, + "learning_rate": 3.846153846153847e-07, + "loss": 2.1279, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 5.967246536097301, + "learning_rate": 4.1666666666666667e-07, + "loss": 1.978, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 5.292134801724384, + "learning_rate": 4.4871794871794876e-07, + "loss": 1.8726, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 5.840441578239468, + "learning_rate": 4.807692307692308e-07, + "loss": 1.7012, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 6.883530970083456, + "learning_rate": 5.128205128205128e-07, + "loss": 1.9629, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 5.941981671382089, + "learning_rate": 5.448717948717949e-07, + "loss": 1.8174, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 5.148089740413392, + "learning_rate": 5.76923076923077e-07, + "loss": 2.0889, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 3.2526377076981627, + "learning_rate": 6.08974358974359e-07, + "loss": 1.7114, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 5.0623854154859576, + "learning_rate": 6.41025641025641e-07, + "loss": 1.749, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 4.486633223129077, + "learning_rate": 6.730769230769231e-07, + "loss": 2.0391, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 4.372206281873816, + "learning_rate": 7.051282051282052e-07, + "loss": 2.0615, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 5.245200878071032, + "learning_rate": 7.371794871794873e-07, + "loss": 1.8257, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 5.003738392680743, + "learning_rate": 7.692307692307694e-07, + "loss": 1.9463, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 4.701079464001506, + "learning_rate": 8.012820512820515e-07, + "loss": 1.9487, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 2.9499281691275945, + "learning_rate": 8.333333333333333e-07, + "loss": 1.792, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 4.069030482849168, + "learning_rate": 8.653846153846154e-07, + "loss": 1.9136, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 3.9368014731570184, + "learning_rate": 8.974358974358975e-07, + "loss": 1.9771, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.2485080466515015, + "learning_rate": 9.294871794871796e-07, + "loss": 1.9028, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 3.1622578885650086, + "learning_rate": 9.615384615384617e-07, + "loss": 1.7256, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 3.3739906268828004, + "learning_rate": 9.935897435897436e-07, + "loss": 1.8687, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 2.896288258241709, + "learning_rate": 1.0256410256410257e-06, + "loss": 1.6704, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 3.3707774957236416, + "learning_rate": 1.0576923076923078e-06, + "loss": 1.8804, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 3.3426838743476717, + "learning_rate": 1.0897435897435899e-06, + "loss": 1.6968, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 3.6068389746435177, + "learning_rate": 1.121794871794872e-06, + "loss": 1.894, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 3.521477257520405, + "learning_rate": 1.153846153846154e-06, + "loss": 1.8633, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 2.776485593912886, + "learning_rate": 1.185897435897436e-06, + "loss": 1.9204, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 2.7681508560658163, + "learning_rate": 1.217948717948718e-06, + "loss": 1.7188, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 2.4439946119342766, + "learning_rate": 1.25e-06, + "loss": 1.6919, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 3.4557738645575014, + "learning_rate": 1.282051282051282e-06, + "loss": 1.8833, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 3.258303676978315, + "learning_rate": 1.3141025641025643e-06, + "loss": 1.8071, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 2.458784669216295, + "learning_rate": 1.3461538461538462e-06, + "loss": 1.6909, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 2.504431892544747, + "learning_rate": 1.3782051282051285e-06, + "loss": 1.8086, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 2.4555502780430674, + "learning_rate": 1.4102564102564104e-06, + "loss": 1.751, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 2.751022634659861, + "learning_rate": 1.4423076923076922e-06, + "loss": 1.8057, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 2.5738415737254643, + "learning_rate": 1.4743589743589745e-06, + "loss": 1.9253, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 2.8124133820243595, + "learning_rate": 1.5064102564102564e-06, + "loss": 1.793, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 2.463335400346117, + "learning_rate": 1.5384615384615387e-06, + "loss": 1.7407, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 2.8940656658334207, + "learning_rate": 1.5705128205128206e-06, + "loss": 1.8716, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 2.584846431211917, + "learning_rate": 1.602564102564103e-06, + "loss": 1.9146, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 2.1515635295651547, + "learning_rate": 1.6346153846153848e-06, + "loss": 1.5991, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 2.3483985097141886, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.7856, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 2.1146516926157024, + "learning_rate": 1.698717948717949e-06, + "loss": 1.7661, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 2.3653234986460254, + "learning_rate": 1.7307692307692308e-06, + "loss": 1.731, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 2.21651123257784, + "learning_rate": 1.7628205128205131e-06, + "loss": 1.5532, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 2.3531455091700266, + "learning_rate": 1.794871794871795e-06, + "loss": 1.7275, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 2.2454207897169183, + "learning_rate": 1.826923076923077e-06, + "loss": 1.6841, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 2.911505925139179, + "learning_rate": 1.8589743589743592e-06, + "loss": 1.5664, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 2.3580392311833527, + "learning_rate": 1.891025641025641e-06, + "loss": 1.7896, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 2.2925212989802692, + "learning_rate": 1.9230769230769234e-06, + "loss": 1.8101, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 2.1153093773918763, + "learning_rate": 1.9551282051282055e-06, + "loss": 1.5747, + "step": 63 + }, + { + "epoch": 0.0, + "grad_norm": 2.82147569579739, + "learning_rate": 1.987179487179487e-06, + "loss": 1.9238, + "step": 64 + }, + { + "epoch": 0.0, + "grad_norm": 2.2045873509725165, + "learning_rate": 2.0192307692307692e-06, + "loss": 1.6338, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 1.8958181368269862, + "learning_rate": 2.0512820512820513e-06, + "loss": 1.6543, + "step": 66 + }, + { + "epoch": 0.0, + "grad_norm": 2.4962681332761973, + "learning_rate": 2.0833333333333334e-06, + "loss": 1.7227, + "step": 67 + }, + { + "epoch": 0.0, + "grad_norm": 2.2904246633721383, + "learning_rate": 2.1153846153846155e-06, + "loss": 1.7822, + "step": 68 + }, + { + "epoch": 0.0, + "grad_norm": 2.0946295372079704, + "learning_rate": 2.1474358974358976e-06, + "loss": 1.7529, + "step": 69 + }, + { + "epoch": 0.0, + "grad_norm": 1.9129728821010556, + "learning_rate": 2.1794871794871797e-06, + "loss": 1.7168, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 2.243146448623675, + "learning_rate": 2.211538461538462e-06, + "loss": 1.5776, + "step": 71 + }, + { + "epoch": 0.0, + "grad_norm": 2.3234162989867078, + "learning_rate": 2.243589743589744e-06, + "loss": 1.7202, + "step": 72 + }, + { + "epoch": 0.0, + "grad_norm": 2.2101029603221445, + "learning_rate": 2.275641025641026e-06, + "loss": 1.6567, + "step": 73 + }, + { + "epoch": 0.0, + "grad_norm": 1.7394617855897616, + "learning_rate": 2.307692307692308e-06, + "loss": 1.6265, + "step": 74 + }, + { + "epoch": 0.0, + "grad_norm": 2.1688691088118355, + "learning_rate": 2.3397435897435897e-06, + "loss": 1.7217, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 2.125923801614282, + "learning_rate": 2.371794871794872e-06, + "loss": 1.6421, + "step": 76 + }, + { + "epoch": 0.0, + "grad_norm": 1.922290559965317, + "learning_rate": 2.403846153846154e-06, + "loss": 1.7417, + "step": 77 + }, + { + "epoch": 0.0, + "grad_norm": 1.8260537731707283, + "learning_rate": 2.435897435897436e-06, + "loss": 1.7046, + "step": 78 + }, + { + "epoch": 0.0, + "grad_norm": 1.901577592787987, + "learning_rate": 2.467948717948718e-06, + "loss": 1.6465, + "step": 79 + }, + { + "epoch": 0.0, + "grad_norm": 2.2404366777034657, + "learning_rate": 2.5e-06, + "loss": 1.9019, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 2.3241381789925084, + "learning_rate": 2.5320512820512823e-06, + "loss": 1.9023, + "step": 81 + }, + { + "epoch": 0.0, + "grad_norm": 1.6016055313372246, + "learning_rate": 2.564102564102564e-06, + "loss": 1.5063, + "step": 82 + }, + { + "epoch": 0.0, + "grad_norm": 1.9570960890714142, + "learning_rate": 2.5961538461538465e-06, + "loss": 1.6626, + "step": 83 + }, + { + "epoch": 0.0, + "grad_norm": 2.173041714414597, + "learning_rate": 2.6282051282051286e-06, + "loss": 1.8271, + "step": 84 + }, + { + "epoch": 0.0, + "grad_norm": 1.7100248085650758, + "learning_rate": 2.6602564102564107e-06, + "loss": 1.6792, + "step": 85 + }, + { + "epoch": 0.0, + "grad_norm": 2.0012255354709096, + "learning_rate": 2.6923076923076923e-06, + "loss": 1.6572, + "step": 86 + }, + { + "epoch": 0.0, + "grad_norm": 1.9137825517030291, + "learning_rate": 2.7243589743589744e-06, + "loss": 1.5825, + "step": 87 + }, + { + "epoch": 0.0, + "grad_norm": 1.8189687481089372, + "learning_rate": 2.756410256410257e-06, + "loss": 1.6406, + "step": 88 + }, + { + "epoch": 0.0, + "grad_norm": 2.005488580790363, + "learning_rate": 2.7884615384615386e-06, + "loss": 1.4351, + "step": 89 + }, + { + "epoch": 0.0, + "grad_norm": 1.5332368401576102, + "learning_rate": 2.8205128205128207e-06, + "loss": 1.6577, + "step": 90 + }, + { + "epoch": 0.0, + "grad_norm": 1.8507605450940214, + "learning_rate": 2.852564102564103e-06, + "loss": 1.6392, + "step": 91 + }, + { + "epoch": 0.0, + "grad_norm": 1.5910068501827332, + "learning_rate": 2.8846153846153845e-06, + "loss": 1.4976, + "step": 92 + }, + { + "epoch": 0.0, + "grad_norm": 2.0608841750304494, + "learning_rate": 2.916666666666667e-06, + "loss": 1.5547, + "step": 93 + }, + { + "epoch": 0.0, + "grad_norm": 2.1355602748315436, + "learning_rate": 2.948717948717949e-06, + "loss": 1.7173, + "step": 94 + }, + { + "epoch": 0.0, + "grad_norm": 1.9032261555786585, + "learning_rate": 2.980769230769231e-06, + "loss": 1.5654, + "step": 95 + }, + { + "epoch": 0.0, + "grad_norm": 1.846073015872819, + "learning_rate": 3.012820512820513e-06, + "loss": 1.6216, + "step": 96 + }, + { + "epoch": 0.0, + "grad_norm": 1.6487987622267262, + "learning_rate": 3.044871794871795e-06, + "loss": 1.4878, + "step": 97 + }, + { + "epoch": 0.0, + "grad_norm": 2.322763298152455, + "learning_rate": 3.0769230769230774e-06, + "loss": 1.792, + "step": 98 + }, + { + "epoch": 0.0, + "grad_norm": 1.8121063877418546, + "learning_rate": 3.108974358974359e-06, + "loss": 1.7222, + "step": 99 + }, + { + "epoch": 0.0, + "grad_norm": 1.6744336508338604, + "learning_rate": 3.141025641025641e-06, + "loss": 1.6226, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 1.7247634842466844, + "learning_rate": 3.1730769230769233e-06, + "loss": 1.5952, + "step": 101 + }, + { + "epoch": 0.0, + "grad_norm": 1.740366338268637, + "learning_rate": 3.205128205128206e-06, + "loss": 1.5981, + "step": 102 + }, + { + "epoch": 0.0, + "grad_norm": 1.728970560061163, + "learning_rate": 3.2371794871794875e-06, + "loss": 1.5791, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 1.858549928240061, + "learning_rate": 3.2692307692307696e-06, + "loss": 1.6162, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 2.005425495184849, + "learning_rate": 3.3012820512820517e-06, + "loss": 1.7373, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 1.9693982346018715, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.5737, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 1.5704486648348535, + "learning_rate": 3.365384615384616e-06, + "loss": 1.5737, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 1.7219111956395152, + "learning_rate": 3.397435897435898e-06, + "loss": 1.4395, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.8107668921480111, + "learning_rate": 3.4294871794871796e-06, + "loss": 1.6245, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 2.060649729714151, + "learning_rate": 3.4615384615384617e-06, + "loss": 1.6484, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 1.8211651888472016, + "learning_rate": 3.4935897435897438e-06, + "loss": 1.6504, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.736650925195897, + "learning_rate": 3.5256410256410263e-06, + "loss": 1.5811, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 2.0270331778194097, + "learning_rate": 3.557692307692308e-06, + "loss": 1.5796, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 1.7649606794029367, + "learning_rate": 3.58974358974359e-06, + "loss": 1.6763, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 1.6674123636417442, + "learning_rate": 3.621794871794872e-06, + "loss": 1.4824, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 2.0434484011727907, + "learning_rate": 3.653846153846154e-06, + "loss": 1.6177, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.5250602686441213, + "learning_rate": 3.6858974358974363e-06, + "loss": 1.4253, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 1.6326566506759765, + "learning_rate": 3.7179487179487184e-06, + "loss": 1.6128, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 1.7267931799000755, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.7021, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 1.9640148089571143, + "learning_rate": 3.782051282051282e-06, + "loss": 1.4609, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 1.667202401374494, + "learning_rate": 3.8141025641025643e-06, + "loss": 1.4727, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 1.8491336858211664, + "learning_rate": 3.846153846153847e-06, + "loss": 1.4805, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 1.970502990992734, + "learning_rate": 3.878205128205129e-06, + "loss": 1.5898, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 1.8106782959667795, + "learning_rate": 3.910256410256411e-06, + "loss": 1.3916, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 1.5795454248785767, + "learning_rate": 3.942307692307692e-06, + "loss": 1.3828, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 1.9710407871965574, + "learning_rate": 3.974358974358974e-06, + "loss": 1.5918, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 1.720077657883205, + "learning_rate": 4.006410256410257e-06, + "loss": 1.5762, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 1.5653310197103396, + "learning_rate": 4.0384615384615385e-06, + "loss": 1.4663, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.960254201027659, + "learning_rate": 4.070512820512821e-06, + "loss": 1.585, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 1.4653973577728605, + "learning_rate": 4.102564102564103e-06, + "loss": 1.291, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 1.8782298332980036, + "learning_rate": 4.134615384615385e-06, + "loss": 1.6655, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 1.926146971282728, + "learning_rate": 4.166666666666667e-06, + "loss": 1.6191, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.9142058047642427, + "learning_rate": 4.198717948717949e-06, + "loss": 1.4238, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 1.547722030633865, + "learning_rate": 4.230769230769231e-06, + "loss": 1.5288, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 1.579733252201955, + "learning_rate": 4.262820512820513e-06, + "loss": 1.3857, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 1.667805532959561, + "learning_rate": 4.294871794871795e-06, + "loss": 1.582, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 1.5200512406347286, + "learning_rate": 4.326923076923077e-06, + "loss": 1.6738, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 1.614965374449281, + "learning_rate": 4.358974358974359e-06, + "loss": 1.6611, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 1.9691385979784952, + "learning_rate": 4.3910256410256415e-06, + "loss": 1.6792, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 1.544450796833931, + "learning_rate": 4.423076923076924e-06, + "loss": 1.5615, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.7221654662587071, + "learning_rate": 4.455128205128206e-06, + "loss": 1.4062, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 1.5471251537385284, + "learning_rate": 4.487179487179488e-06, + "loss": 1.4097, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 1.8635808812969707, + "learning_rate": 4.51923076923077e-06, + "loss": 1.5171, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 1.535707715049538, + "learning_rate": 4.551282051282052e-06, + "loss": 1.6353, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 1.7429306574282821, + "learning_rate": 4.583333333333333e-06, + "loss": 1.5894, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 1.4567582558845829, + "learning_rate": 4.615384615384616e-06, + "loss": 1.3931, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 1.573101841721847, + "learning_rate": 4.647435897435898e-06, + "loss": 1.4629, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 1.5197795284379594, + "learning_rate": 4.6794871794871795e-06, + "loss": 1.6362, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.584866687090061, + "learning_rate": 4.711538461538462e-06, + "loss": 1.7417, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 1.8414381776962798, + "learning_rate": 4.743589743589744e-06, + "loss": 1.4585, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 1.3521917098457124, + "learning_rate": 4.775641025641027e-06, + "loss": 1.5596, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 2.2788563573731735, + "learning_rate": 4.807692307692308e-06, + "loss": 1.5112, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 1.2635584746274662, + "learning_rate": 4.83974358974359e-06, + "loss": 1.3804, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 1.5532220592334811, + "learning_rate": 4.871794871794872e-06, + "loss": 1.6523, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 1.5664650376466702, + "learning_rate": 4.903846153846154e-06, + "loss": 1.4224, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 1.656054619357628, + "learning_rate": 4.935897435897436e-06, + "loss": 1.6069, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 1.5232097696566647, + "learning_rate": 4.967948717948718e-06, + "loss": 1.3662, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 1.8399211171546899, + "learning_rate": 5e-06, + "loss": 1.5405, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 1.5505603061874536, + "learning_rate": 5.0320512820512825e-06, + "loss": 1.3809, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 1.8997162921701884, + "learning_rate": 5.064102564102565e-06, + "loss": 1.7207, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 1.7575716528566327, + "learning_rate": 5.096153846153846e-06, + "loss": 1.6133, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 1.716110507127467, + "learning_rate": 5.128205128205128e-06, + "loss": 1.5044, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 1.7126143944879049, + "learning_rate": 5.160256410256411e-06, + "loss": 1.5459, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 1.7034117954416954, + "learning_rate": 5.192307692307693e-06, + "loss": 1.4263, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 1.8808184053448647, + "learning_rate": 5.224358974358975e-06, + "loss": 1.646, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 1.8256974251245903, + "learning_rate": 5.256410256410257e-06, + "loss": 1.4595, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 1.5332772616069503, + "learning_rate": 5.288461538461539e-06, + "loss": 1.605, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 1.5439634014517032, + "learning_rate": 5.320512820512821e-06, + "loss": 1.6392, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 1.727159551689649, + "learning_rate": 5.3525641025641026e-06, + "loss": 1.625, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 1.7538202720011002, + "learning_rate": 5.384615384615385e-06, + "loss": 1.4429, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 1.4133140212159618, + "learning_rate": 5.416666666666667e-06, + "loss": 1.4824, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 1.3842900268580889, + "learning_rate": 5.448717948717949e-06, + "loss": 1.4517, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 1.5202581050560613, + "learning_rate": 5.480769230769232e-06, + "loss": 1.564, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 1.3648753550732138, + "learning_rate": 5.512820512820514e-06, + "loss": 1.4131, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 1.3379157005922668, + "learning_rate": 5.544871794871796e-06, + "loss": 1.3105, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 1.919194636838991, + "learning_rate": 5.576923076923077e-06, + "loss": 1.4619, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 1.879199960482607, + "learning_rate": 5.608974358974359e-06, + "loss": 1.3032, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 1.4657606490120825, + "learning_rate": 5.641025641025641e-06, + "loss": 1.2827, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 1.666882178228744, + "learning_rate": 5.6730769230769235e-06, + "loss": 1.4805, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 1.6258710061186548, + "learning_rate": 5.705128205128206e-06, + "loss": 1.3057, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 1.7548011353711834, + "learning_rate": 5.737179487179487e-06, + "loss": 1.585, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 1.5869944614054485, + "learning_rate": 5.769230769230769e-06, + "loss": 1.7427, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 1.5335816636189274, + "learning_rate": 5.801282051282052e-06, + "loss": 1.5596, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 1.4391860307780704, + "learning_rate": 5.833333333333334e-06, + "loss": 1.4199, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 1.5469212526234435, + "learning_rate": 5.865384615384616e-06, + "loss": 1.5176, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 1.5003501660650924, + "learning_rate": 5.897435897435898e-06, + "loss": 1.4272, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 1.508234576270832, + "learning_rate": 5.92948717948718e-06, + "loss": 1.1782, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 1.4748925618138098, + "learning_rate": 5.961538461538462e-06, + "loss": 1.5923, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 1.4737944848710263, + "learning_rate": 5.9935897435897436e-06, + "loss": 1.4497, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 1.459889107664886, + "learning_rate": 6.025641025641026e-06, + "loss": 1.4287, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 1.3869579848195994, + "learning_rate": 6.057692307692308e-06, + "loss": 1.3354, + "step": 191 + }, + { + "epoch": 0.01, + "grad_norm": 1.665655627688541, + "learning_rate": 6.08974358974359e-06, + "loss": 1.4106, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 1.3134424580248774, + "learning_rate": 6.121794871794873e-06, + "loss": 1.4424, + "step": 193 + }, + { + "epoch": 0.01, + "grad_norm": 1.415895492354165, + "learning_rate": 6.153846153846155e-06, + "loss": 1.4951, + "step": 194 + }, + { + "epoch": 0.01, + "grad_norm": 1.371996146521701, + "learning_rate": 6.185897435897437e-06, + "loss": 1.3091, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 1.6086430794753375, + "learning_rate": 6.217948717948718e-06, + "loss": 1.5337, + "step": 196 + }, + { + "epoch": 0.01, + "grad_norm": 1.8127065466411951, + "learning_rate": 6.25e-06, + "loss": 1.4907, + "step": 197 + }, + { + "epoch": 0.01, + "grad_norm": 1.384664309264018, + "learning_rate": 6.282051282051282e-06, + "loss": 1.3706, + "step": 198 + }, + { + "epoch": 0.01, + "grad_norm": 1.5235422620822068, + "learning_rate": 6.3141025641025645e-06, + "loss": 1.4858, + "step": 199 + }, + { + "epoch": 0.01, + "grad_norm": 1.2885597216980793, + "learning_rate": 6.3461538461538466e-06, + "loss": 1.3149, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 1.7298762167347572, + "learning_rate": 6.378205128205129e-06, + "loss": 1.4976, + "step": 201 + }, + { + "epoch": 0.01, + "grad_norm": 1.750337640265624, + "learning_rate": 6.410256410256412e-06, + "loss": 1.6279, + "step": 202 + }, + { + "epoch": 0.01, + "grad_norm": 1.41841869268125, + "learning_rate": 6.442307692307693e-06, + "loss": 1.498, + "step": 203 + }, + { + "epoch": 0.01, + "grad_norm": 1.6272909041184738, + "learning_rate": 6.474358974358975e-06, + "loss": 1.3926, + "step": 204 + }, + { + "epoch": 0.01, + "grad_norm": 1.5477750847250453, + "learning_rate": 6.506410256410257e-06, + "loss": 1.5244, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 1.5931696147327243, + "learning_rate": 6.538461538461539e-06, + "loss": 1.377, + "step": 206 + }, + { + "epoch": 0.01, + "grad_norm": 1.4181826250987062, + "learning_rate": 6.570512820512821e-06, + "loss": 1.5122, + "step": 207 + }, + { + "epoch": 0.01, + "grad_norm": 1.7443415454000577, + "learning_rate": 6.602564102564103e-06, + "loss": 1.519, + "step": 208 + }, + { + "epoch": 0.01, + "grad_norm": 1.4180895910964568, + "learning_rate": 6.6346153846153846e-06, + "loss": 1.5557, + "step": 209 + }, + { + "epoch": 0.01, + "grad_norm": 1.518469966973389, + "learning_rate": 6.666666666666667e-06, + "loss": 1.4531, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 1.5601379784815306, + "learning_rate": 6.698717948717949e-06, + "loss": 1.5273, + "step": 211 + }, + { + "epoch": 0.01, + "grad_norm": 1.5207608552390224, + "learning_rate": 6.730769230769232e-06, + "loss": 1.4692, + "step": 212 + }, + { + "epoch": 0.01, + "grad_norm": 1.4720863691945865, + "learning_rate": 6.762820512820514e-06, + "loss": 1.5347, + "step": 213 + }, + { + "epoch": 0.01, + "grad_norm": 1.371194906239484, + "learning_rate": 6.794871794871796e-06, + "loss": 1.5654, + "step": 214 + }, + { + "epoch": 0.01, + "grad_norm": 1.7043644796428705, + "learning_rate": 6.826923076923078e-06, + "loss": 1.6494, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 1.4816296880348712, + "learning_rate": 6.858974358974359e-06, + "loss": 1.4424, + "step": 216 + }, + { + "epoch": 0.01, + "grad_norm": 1.817439747866579, + "learning_rate": 6.891025641025641e-06, + "loss": 1.6362, + "step": 217 + }, + { + "epoch": 0.01, + "grad_norm": 1.5786979457571388, + "learning_rate": 6.923076923076923e-06, + "loss": 1.6304, + "step": 218 + }, + { + "epoch": 0.01, + "grad_norm": 1.3133870079960415, + "learning_rate": 6.9551282051282055e-06, + "loss": 1.2573, + "step": 219 + }, + { + "epoch": 0.01, + "grad_norm": 1.5132232235577348, + "learning_rate": 6.9871794871794876e-06, + "loss": 1.3345, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 1.4274882147184411, + "learning_rate": 7.01923076923077e-06, + "loss": 1.4512, + "step": 221 + }, + { + "epoch": 0.01, + "grad_norm": 1.5195232372898413, + "learning_rate": 7.051282051282053e-06, + "loss": 1.5044, + "step": 222 + }, + { + "epoch": 0.01, + "grad_norm": 1.3974543525216674, + "learning_rate": 7.083333333333335e-06, + "loss": 1.4976, + "step": 223 + }, + { + "epoch": 0.01, + "grad_norm": 1.4188522801923789, + "learning_rate": 7.115384615384616e-06, + "loss": 1.3765, + "step": 224 + }, + { + "epoch": 0.01, + "grad_norm": 1.4453431214143329, + "learning_rate": 7.147435897435898e-06, + "loss": 1.5845, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 1.3360802149976794, + "learning_rate": 7.17948717948718e-06, + "loss": 1.729, + "step": 226 + }, + { + "epoch": 0.01, + "grad_norm": 1.3508905049958093, + "learning_rate": 7.211538461538462e-06, + "loss": 1.4604, + "step": 227 + }, + { + "epoch": 0.01, + "grad_norm": 1.7696584130328374, + "learning_rate": 7.243589743589744e-06, + "loss": 1.5024, + "step": 228 + }, + { + "epoch": 0.01, + "grad_norm": 1.4635829385660106, + "learning_rate": 7.2756410256410255e-06, + "loss": 1.6235, + "step": 229 + }, + { + "epoch": 0.01, + "grad_norm": 1.669299411343517, + "learning_rate": 7.307692307692308e-06, + "loss": 1.6665, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 1.6483065886016006, + "learning_rate": 7.33974358974359e-06, + "loss": 1.4634, + "step": 231 + }, + { + "epoch": 0.01, + "grad_norm": 1.3387448202439354, + "learning_rate": 7.371794871794873e-06, + "loss": 1.6221, + "step": 232 + }, + { + "epoch": 0.01, + "grad_norm": 1.437370038669659, + "learning_rate": 7.403846153846155e-06, + "loss": 1.4688, + "step": 233 + }, + { + "epoch": 0.01, + "grad_norm": 1.7931577095568545, + "learning_rate": 7.435897435897437e-06, + "loss": 1.6265, + "step": 234 + }, + { + "epoch": 0.01, + "grad_norm": 1.6064451616276418, + "learning_rate": 7.467948717948719e-06, + "loss": 1.3843, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 1.3907874603976877, + "learning_rate": 7.500000000000001e-06, + "loss": 1.3608, + "step": 236 + }, + { + "epoch": 0.01, + "grad_norm": 1.3369647395143316, + "learning_rate": 7.532051282051282e-06, + "loss": 1.4453, + "step": 237 + }, + { + "epoch": 0.01, + "grad_norm": 1.698987018791686, + "learning_rate": 7.564102564102564e-06, + "loss": 1.415, + "step": 238 + }, + { + "epoch": 0.01, + "grad_norm": 1.3649721263541053, + "learning_rate": 7.5961538461538465e-06, + "loss": 1.3789, + "step": 239 + }, + { + "epoch": 0.01, + "grad_norm": 1.4088385190522619, + "learning_rate": 7.6282051282051286e-06, + "loss": 1.2163, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 1.5219404939043188, + "learning_rate": 7.660256410256411e-06, + "loss": 1.4097, + "step": 241 + }, + { + "epoch": 0.01, + "grad_norm": 1.5724354127233526, + "learning_rate": 7.692307692307694e-06, + "loss": 1.6787, + "step": 242 + }, + { + "epoch": 0.01, + "grad_norm": 1.3970694181480925, + "learning_rate": 7.724358974358976e-06, + "loss": 1.4629, + "step": 243 + }, + { + "epoch": 0.01, + "grad_norm": 1.3734568487301586, + "learning_rate": 7.756410256410258e-06, + "loss": 1.4873, + "step": 244 + }, + { + "epoch": 0.01, + "grad_norm": 1.3863321702985334, + "learning_rate": 7.78846153846154e-06, + "loss": 1.5269, + "step": 245 + }, + { + "epoch": 0.01, + "grad_norm": 1.8121022125386985, + "learning_rate": 7.820512820512822e-06, + "loss": 1.4814, + "step": 246 + }, + { + "epoch": 0.01, + "grad_norm": 1.538933058041442, + "learning_rate": 7.852564102564102e-06, + "loss": 1.583, + "step": 247 + }, + { + "epoch": 0.01, + "grad_norm": 1.395100966161363, + "learning_rate": 7.884615384615384e-06, + "loss": 1.4912, + "step": 248 + }, + { + "epoch": 0.01, + "grad_norm": 1.4133233925764965, + "learning_rate": 7.916666666666667e-06, + "loss": 1.481, + "step": 249 + }, + { + "epoch": 0.01, + "grad_norm": 1.3687218405311699, + "learning_rate": 7.948717948717949e-06, + "loss": 1.4595, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 1.464734174966439, + "learning_rate": 7.980769230769232e-06, + "loss": 1.4932, + "step": 251 + }, + { + "epoch": 0.01, + "grad_norm": 1.1884946092673951, + "learning_rate": 8.012820512820515e-06, + "loss": 1.4556, + "step": 252 + }, + { + "epoch": 0.01, + "grad_norm": 1.54451676830231, + "learning_rate": 8.044871794871797e-06, + "loss": 1.6221, + "step": 253 + }, + { + "epoch": 0.01, + "grad_norm": 1.754688343231471, + "learning_rate": 8.076923076923077e-06, + "loss": 1.4512, + "step": 254 + }, + { + "epoch": 0.01, + "grad_norm": 1.3322302493409486, + "learning_rate": 8.108974358974359e-06, + "loss": 1.5825, + "step": 255 + }, + { + "epoch": 0.01, + "grad_norm": 1.4674730077487839, + "learning_rate": 8.141025641025641e-06, + "loss": 1.5005, + "step": 256 + }, + { + "epoch": 0.01, + "grad_norm": 1.2167921588898725, + "learning_rate": 8.173076923076923e-06, + "loss": 1.2827, + "step": 257 + }, + { + "epoch": 0.01, + "grad_norm": 1.168540596036068, + "learning_rate": 8.205128205128205e-06, + "loss": 1.3428, + "step": 258 + }, + { + "epoch": 0.01, + "grad_norm": 1.6789165955553473, + "learning_rate": 8.237179487179487e-06, + "loss": 1.5195, + "step": 259 + }, + { + "epoch": 0.01, + "grad_norm": 1.264218952301381, + "learning_rate": 8.26923076923077e-06, + "loss": 1.4189, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 1.7624012666345472, + "learning_rate": 8.301282051282052e-06, + "loss": 1.5059, + "step": 261 + }, + { + "epoch": 0.01, + "grad_norm": 1.4194250760848537, + "learning_rate": 8.333333333333334e-06, + "loss": 1.5103, + "step": 262 + }, + { + "epoch": 0.01, + "grad_norm": 1.5397114149208282, + "learning_rate": 8.365384615384616e-06, + "loss": 1.3716, + "step": 263 + }, + { + "epoch": 0.01, + "grad_norm": 1.8412153258409343, + "learning_rate": 8.397435897435898e-06, + "loss": 1.4932, + "step": 264 + }, + { + "epoch": 0.01, + "grad_norm": 1.6802563340508783, + "learning_rate": 8.42948717948718e-06, + "loss": 1.3311, + "step": 265 + }, + { + "epoch": 0.01, + "grad_norm": 1.670878605132347, + "learning_rate": 8.461538461538462e-06, + "loss": 1.521, + "step": 266 + }, + { + "epoch": 0.01, + "grad_norm": 1.2395749877315518, + "learning_rate": 8.493589743589744e-06, + "loss": 1.3379, + "step": 267 + }, + { + "epoch": 0.01, + "grad_norm": 1.4818439873500375, + "learning_rate": 8.525641025641026e-06, + "loss": 1.3052, + "step": 268 + }, + { + "epoch": 0.01, + "grad_norm": 1.9952198811090225, + "learning_rate": 8.557692307692308e-06, + "loss": 1.6255, + "step": 269 + }, + { + "epoch": 0.01, + "grad_norm": 1.3128031554812432, + "learning_rate": 8.58974358974359e-06, + "loss": 1.4761, + "step": 270 + }, + { + "epoch": 0.01, + "grad_norm": 1.3833992687798677, + "learning_rate": 8.621794871794873e-06, + "loss": 1.3945, + "step": 271 + }, + { + "epoch": 0.01, + "grad_norm": 1.5293115012028113, + "learning_rate": 8.653846153846155e-06, + "loss": 1.3755, + "step": 272 + }, + { + "epoch": 0.01, + "grad_norm": 1.482360230970466, + "learning_rate": 8.685897435897437e-06, + "loss": 1.4541, + "step": 273 + }, + { + "epoch": 0.01, + "grad_norm": 1.4286930588039524, + "learning_rate": 8.717948717948719e-06, + "loss": 1.3594, + "step": 274 + }, + { + "epoch": 0.01, + "grad_norm": 1.588811858248963, + "learning_rate": 8.750000000000001e-06, + "loss": 1.354, + "step": 275 + }, + { + "epoch": 0.01, + "grad_norm": 1.374045211156847, + "learning_rate": 8.782051282051283e-06, + "loss": 1.3535, + "step": 276 + }, + { + "epoch": 0.01, + "grad_norm": 1.4308512665840407, + "learning_rate": 8.814102564102565e-06, + "loss": 1.4482, + "step": 277 + }, + { + "epoch": 0.01, + "grad_norm": 1.403680196525793, + "learning_rate": 8.846153846153847e-06, + "loss": 1.4131, + "step": 278 + }, + { + "epoch": 0.01, + "grad_norm": 1.5477001210895838, + "learning_rate": 8.87820512820513e-06, + "loss": 1.5146, + "step": 279 + }, + { + "epoch": 0.01, + "grad_norm": 1.5212020849365218, + "learning_rate": 8.910256410256411e-06, + "loss": 1.5601, + "step": 280 + }, + { + "epoch": 0.01, + "grad_norm": 1.225342389365309, + "learning_rate": 8.942307692307693e-06, + "loss": 1.2358, + "step": 281 + }, + { + "epoch": 0.01, + "grad_norm": 1.327043026339369, + "learning_rate": 8.974358974358976e-06, + "loss": 1.478, + "step": 282 + }, + { + "epoch": 0.01, + "grad_norm": 1.4647986744960786, + "learning_rate": 9.006410256410258e-06, + "loss": 1.3413, + "step": 283 + }, + { + "epoch": 0.01, + "grad_norm": 1.2104548586621704, + "learning_rate": 9.03846153846154e-06, + "loss": 1.4502, + "step": 284 + }, + { + "epoch": 0.01, + "grad_norm": 1.8286186993485871, + "learning_rate": 9.070512820512822e-06, + "loss": 1.5854, + "step": 285 + }, + { + "epoch": 0.01, + "grad_norm": 1.5998701932811834, + "learning_rate": 9.102564102564104e-06, + "loss": 1.4468, + "step": 286 + }, + { + "epoch": 0.01, + "grad_norm": 1.6678199201254207, + "learning_rate": 9.134615384615384e-06, + "loss": 1.3965, + "step": 287 + }, + { + "epoch": 0.01, + "grad_norm": 1.3070061722415838, + "learning_rate": 9.166666666666666e-06, + "loss": 1.6094, + "step": 288 + }, + { + "epoch": 0.01, + "grad_norm": 1.225972205579119, + "learning_rate": 9.198717948717949e-06, + "loss": 1.3271, + "step": 289 + }, + { + "epoch": 0.01, + "grad_norm": 1.3148158638144016, + "learning_rate": 9.230769230769232e-06, + "loss": 1.3511, + "step": 290 + }, + { + "epoch": 0.01, + "grad_norm": 1.4129495542533224, + "learning_rate": 9.262820512820514e-06, + "loss": 1.5039, + "step": 291 + }, + { + "epoch": 0.01, + "grad_norm": 1.4576480389490902, + "learning_rate": 9.294871794871796e-06, + "loss": 1.3574, + "step": 292 + }, + { + "epoch": 0.01, + "grad_norm": 1.3393706825067717, + "learning_rate": 9.326923076923079e-06, + "loss": 1.6074, + "step": 293 + }, + { + "epoch": 0.01, + "grad_norm": 1.5610340026880416, + "learning_rate": 9.358974358974359e-06, + "loss": 1.2837, + "step": 294 + }, + { + "epoch": 0.01, + "grad_norm": 1.3423965904877306, + "learning_rate": 9.391025641025641e-06, + "loss": 1.3682, + "step": 295 + }, + { + "epoch": 0.01, + "grad_norm": 1.4485470387685437, + "learning_rate": 9.423076923076923e-06, + "loss": 1.4712, + "step": 296 + }, + { + "epoch": 0.01, + "grad_norm": 1.5008676191378, + "learning_rate": 9.455128205128205e-06, + "loss": 1.4629, + "step": 297 + }, + { + "epoch": 0.01, + "grad_norm": 1.7676816459022766, + "learning_rate": 9.487179487179487e-06, + "loss": 1.5156, + "step": 298 + }, + { + "epoch": 0.01, + "grad_norm": 2.1699144889775472, + "learning_rate": 9.51923076923077e-06, + "loss": 1.6279, + "step": 299 + }, + { + "epoch": 0.01, + "grad_norm": 1.3484712772438978, + "learning_rate": 9.551282051282053e-06, + "loss": 1.4512, + "step": 300 + }, + { + "epoch": 0.01, + "grad_norm": 1.3503855576206905, + "learning_rate": 9.583333333333335e-06, + "loss": 1.4546, + "step": 301 + }, + { + "epoch": 0.01, + "grad_norm": 1.2700300079713553, + "learning_rate": 9.615384615384616e-06, + "loss": 1.3384, + "step": 302 + }, + { + "epoch": 0.01, + "grad_norm": 1.465131921505914, + "learning_rate": 9.647435897435898e-06, + "loss": 1.3301, + "step": 303 + }, + { + "epoch": 0.01, + "grad_norm": 1.3338387423956481, + "learning_rate": 9.67948717948718e-06, + "loss": 1.5278, + "step": 304 + }, + { + "epoch": 0.01, + "grad_norm": 1.8912144068383967, + "learning_rate": 9.711538461538462e-06, + "loss": 1.481, + "step": 305 + }, + { + "epoch": 0.01, + "grad_norm": 1.5121533642594989, + "learning_rate": 9.743589743589744e-06, + "loss": 1.3169, + "step": 306 + }, + { + "epoch": 0.01, + "grad_norm": 1.433347119046012, + "learning_rate": 9.775641025641026e-06, + "loss": 1.2729, + "step": 307 + }, + { + "epoch": 0.01, + "grad_norm": 1.186288832857276, + "learning_rate": 9.807692307692308e-06, + "loss": 1.4258, + "step": 308 + }, + { + "epoch": 0.01, + "grad_norm": 1.5794770512780545, + "learning_rate": 9.83974358974359e-06, + "loss": 1.4204, + "step": 309 + }, + { + "epoch": 0.01, + "grad_norm": 1.3527863244606433, + "learning_rate": 9.871794871794872e-06, + "loss": 1.4858, + "step": 310 + }, + { + "epoch": 0.01, + "grad_norm": 1.326924198253285, + "learning_rate": 9.903846153846155e-06, + "loss": 1.5527, + "step": 311 + }, + { + "epoch": 0.02, + "grad_norm": 1.6396670891042595, + "learning_rate": 9.935897435897437e-06, + "loss": 1.4624, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 1.4244566829672403, + "learning_rate": 9.967948717948719e-06, + "loss": 1.4019, + "step": 313 + }, + { + "epoch": 0.02, + "grad_norm": 1.236435094721605, + "learning_rate": 1e-05, + "loss": 1.3257, + "step": 314 + }, + { + "epoch": 0.02, + "grad_norm": 1.1019481324141696, + "learning_rate": 1.0032051282051283e-05, + "loss": 1.4155, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 1.4697759039594815, + "learning_rate": 1.0064102564102565e-05, + "loss": 1.4448, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 1.6143069471150746, + "learning_rate": 1.0096153846153847e-05, + "loss": 1.3887, + "step": 317 + }, + { + "epoch": 0.02, + "grad_norm": 1.464365054230673, + "learning_rate": 1.012820512820513e-05, + "loss": 1.498, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 1.2608191515062133, + "learning_rate": 1.0160256410256411e-05, + "loss": 1.3281, + "step": 319 + }, + { + "epoch": 0.02, + "grad_norm": 1.302480276204567, + "learning_rate": 1.0192307692307692e-05, + "loss": 1.3628, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 1.942889152987852, + "learning_rate": 1.0224358974358974e-05, + "loss": 1.605, + "step": 321 + }, + { + "epoch": 0.02, + "grad_norm": 1.5261387927655037, + "learning_rate": 1.0256410256410256e-05, + "loss": 1.4165, + "step": 322 + }, + { + "epoch": 0.02, + "grad_norm": 1.206259089764386, + "learning_rate": 1.0288461538461538e-05, + "loss": 1.457, + "step": 323 + }, + { + "epoch": 0.02, + "grad_norm": 1.2720669728637408, + "learning_rate": 1.0320512820512822e-05, + "loss": 1.3384, + "step": 324 + }, + { + "epoch": 0.02, + "grad_norm": 1.3504509929969915, + "learning_rate": 1.0352564102564104e-05, + "loss": 1.3066, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 1.265338651108809, + "learning_rate": 1.0384615384615386e-05, + "loss": 1.5186, + "step": 326 + }, + { + "epoch": 0.02, + "grad_norm": 1.409351609089138, + "learning_rate": 1.0416666666666668e-05, + "loss": 1.4336, + "step": 327 + }, + { + "epoch": 0.02, + "grad_norm": 1.1841520862995354, + "learning_rate": 1.044871794871795e-05, + "loss": 1.4272, + "step": 328 + }, + { + "epoch": 0.02, + "grad_norm": 1.5794139564525111, + "learning_rate": 1.0480769230769232e-05, + "loss": 1.416, + "step": 329 + }, + { + "epoch": 0.02, + "grad_norm": 1.6907594388650082, + "learning_rate": 1.0512820512820514e-05, + "loss": 1.3145, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 1.0994459480533283, + "learning_rate": 1.0544871794871796e-05, + "loss": 1.3394, + "step": 331 + }, + { + "epoch": 0.02, + "grad_norm": 1.377055170677683, + "learning_rate": 1.0576923076923078e-05, + "loss": 1.4604, + "step": 332 + }, + { + "epoch": 0.02, + "grad_norm": 1.3129708771716166, + "learning_rate": 1.060897435897436e-05, + "loss": 1.3726, + "step": 333 + }, + { + "epoch": 0.02, + "grad_norm": 1.5016114605414075, + "learning_rate": 1.0641025641025643e-05, + "loss": 1.4448, + "step": 334 + }, + { + "epoch": 0.02, + "grad_norm": 1.4403819506870998, + "learning_rate": 1.0673076923076923e-05, + "loss": 1.478, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 1.2893505889314858, + "learning_rate": 1.0705128205128205e-05, + "loss": 1.4937, + "step": 336 + }, + { + "epoch": 0.02, + "grad_norm": 1.2944883906385094, + "learning_rate": 1.0737179487179487e-05, + "loss": 1.3599, + "step": 337 + }, + { + "epoch": 0.02, + "grad_norm": 1.0415205266579957, + "learning_rate": 1.076923076923077e-05, + "loss": 1.3447, + "step": 338 + }, + { + "epoch": 0.02, + "grad_norm": 1.6037439814419903, + "learning_rate": 1.0801282051282051e-05, + "loss": 1.563, + "step": 339 + }, + { + "epoch": 0.02, + "grad_norm": 1.38426018047868, + "learning_rate": 1.0833333333333334e-05, + "loss": 1.3701, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 1.1618201751547155, + "learning_rate": 1.0865384615384616e-05, + "loss": 1.3154, + "step": 341 + }, + { + "epoch": 0.02, + "grad_norm": 1.5823963489019184, + "learning_rate": 1.0897435897435898e-05, + "loss": 1.5928, + "step": 342 + }, + { + "epoch": 0.02, + "grad_norm": 1.576241294933178, + "learning_rate": 1.092948717948718e-05, + "loss": 1.5718, + "step": 343 + }, + { + "epoch": 0.02, + "grad_norm": 1.388073717584795, + "learning_rate": 1.0961538461538464e-05, + "loss": 1.3481, + "step": 344 + }, + { + "epoch": 0.02, + "grad_norm": 1.2535951872034437, + "learning_rate": 1.0993589743589746e-05, + "loss": 1.3926, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 1.2944732193698896, + "learning_rate": 1.1025641025641028e-05, + "loss": 1.436, + "step": 346 + }, + { + "epoch": 0.02, + "grad_norm": 1.2445746907666893, + "learning_rate": 1.105769230769231e-05, + "loss": 1.4834, + "step": 347 + }, + { + "epoch": 0.02, + "grad_norm": 1.196209356878662, + "learning_rate": 1.1089743589743592e-05, + "loss": 1.5166, + "step": 348 + }, + { + "epoch": 0.02, + "grad_norm": 1.6188803860018033, + "learning_rate": 1.1121794871794872e-05, + "loss": 1.46, + "step": 349 + }, + { + "epoch": 0.02, + "grad_norm": 1.5883844024491693, + "learning_rate": 1.1153846153846154e-05, + "loss": 1.2119, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 1.3706784869709538, + "learning_rate": 1.1185897435897437e-05, + "loss": 1.5693, + "step": 351 + }, + { + "epoch": 0.02, + "grad_norm": 1.5282244569689725, + "learning_rate": 1.1217948717948719e-05, + "loss": 1.4248, + "step": 352 + }, + { + "epoch": 0.02, + "grad_norm": 1.1965002388309016, + "learning_rate": 1.125e-05, + "loss": 1.4424, + "step": 353 + }, + { + "epoch": 0.02, + "grad_norm": 1.4859550934148806, + "learning_rate": 1.1282051282051283e-05, + "loss": 1.377, + "step": 354 + }, + { + "epoch": 0.02, + "grad_norm": 1.3428247798766801, + "learning_rate": 1.1314102564102565e-05, + "loss": 1.3701, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 1.5244567142412975, + "learning_rate": 1.1346153846153847e-05, + "loss": 1.4443, + "step": 356 + }, + { + "epoch": 0.02, + "grad_norm": 1.39242450188167, + "learning_rate": 1.1378205128205129e-05, + "loss": 1.4053, + "step": 357 + }, + { + "epoch": 0.02, + "grad_norm": 1.365903690611676, + "learning_rate": 1.1410256410256411e-05, + "loss": 1.3633, + "step": 358 + }, + { + "epoch": 0.02, + "grad_norm": 1.3300293389531603, + "learning_rate": 1.1442307692307693e-05, + "loss": 1.3662, + "step": 359 + }, + { + "epoch": 0.02, + "grad_norm": 1.4397452366690222, + "learning_rate": 1.1474358974358974e-05, + "loss": 1.4614, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 1.543567048299789, + "learning_rate": 1.1506410256410256e-05, + "loss": 1.5532, + "step": 361 + }, + { + "epoch": 0.02, + "grad_norm": 1.41321670314368, + "learning_rate": 1.1538461538461538e-05, + "loss": 1.3584, + "step": 362 + }, + { + "epoch": 0.02, + "grad_norm": 1.1748040935502257, + "learning_rate": 1.1570512820512823e-05, + "loss": 1.313, + "step": 363 + }, + { + "epoch": 0.02, + "grad_norm": 1.3532571008432337, + "learning_rate": 1.1602564102564104e-05, + "loss": 1.2495, + "step": 364 + }, + { + "epoch": 0.02, + "grad_norm": 1.3518482258480287, + "learning_rate": 1.1634615384615386e-05, + "loss": 1.3379, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 1.332411463979091, + "learning_rate": 1.1666666666666668e-05, + "loss": 1.3198, + "step": 366 + }, + { + "epoch": 0.02, + "grad_norm": 1.0707442437044281, + "learning_rate": 1.169871794871795e-05, + "loss": 1.0234, + "step": 367 + }, + { + "epoch": 0.02, + "grad_norm": 1.6457862664444303, + "learning_rate": 1.1730769230769232e-05, + "loss": 1.5063, + "step": 368 + }, + { + "epoch": 0.02, + "grad_norm": 1.476115170967113, + "learning_rate": 1.1762820512820514e-05, + "loss": 1.5054, + "step": 369 + }, + { + "epoch": 0.02, + "grad_norm": 1.3399981445100086, + "learning_rate": 1.1794871794871796e-05, + "loss": 1.4072, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 1.6543183279593394, + "learning_rate": 1.1826923076923078e-05, + "loss": 1.519, + "step": 371 + }, + { + "epoch": 0.02, + "grad_norm": 1.4376527933589291, + "learning_rate": 1.185897435897436e-05, + "loss": 1.3296, + "step": 372 + }, + { + "epoch": 0.02, + "grad_norm": 1.8176659158068955, + "learning_rate": 1.1891025641025643e-05, + "loss": 1.4937, + "step": 373 + }, + { + "epoch": 0.02, + "grad_norm": 1.5771949777504417, + "learning_rate": 1.1923076923076925e-05, + "loss": 1.3008, + "step": 374 + }, + { + "epoch": 0.02, + "grad_norm": 1.4240240847785275, + "learning_rate": 1.1955128205128205e-05, + "loss": 1.415, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 1.4991960350937326, + "learning_rate": 1.1987179487179487e-05, + "loss": 1.4409, + "step": 376 + }, + { + "epoch": 0.02, + "grad_norm": 1.273472829846749, + "learning_rate": 1.201923076923077e-05, + "loss": 1.481, + "step": 377 + }, + { + "epoch": 0.02, + "grad_norm": 1.5817076265575158, + "learning_rate": 1.2051282051282051e-05, + "loss": 1.3716, + "step": 378 + }, + { + "epoch": 0.02, + "grad_norm": 1.2303190554901131, + "learning_rate": 1.2083333333333333e-05, + "loss": 1.4932, + "step": 379 + }, + { + "epoch": 0.02, + "grad_norm": 1.3208287584759297, + "learning_rate": 1.2115384615384615e-05, + "loss": 1.3291, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 1.2963875519764945, + "learning_rate": 1.2147435897435898e-05, + "loss": 1.3232, + "step": 381 + }, + { + "epoch": 0.02, + "grad_norm": 1.3919491990970962, + "learning_rate": 1.217948717948718e-05, + "loss": 1.2808, + "step": 382 + }, + { + "epoch": 0.02, + "grad_norm": 1.2849178005288528, + "learning_rate": 1.2211538461538463e-05, + "loss": 1.2964, + "step": 383 + }, + { + "epoch": 0.02, + "grad_norm": 1.3675399517474263, + "learning_rate": 1.2243589743589746e-05, + "loss": 1.5195, + "step": 384 + }, + { + "epoch": 0.02, + "grad_norm": 1.3977606893412815, + "learning_rate": 1.2275641025641028e-05, + "loss": 1.3066, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 1.2562214350443868, + "learning_rate": 1.230769230769231e-05, + "loss": 1.3281, + "step": 386 + }, + { + "epoch": 0.02, + "grad_norm": 1.5722753928615885, + "learning_rate": 1.2339743589743592e-05, + "loss": 1.3892, + "step": 387 + }, + { + "epoch": 0.02, + "grad_norm": 1.2486043672664813, + "learning_rate": 1.2371794871794874e-05, + "loss": 1.397, + "step": 388 + }, + { + "epoch": 0.02, + "grad_norm": 1.2850751024473135, + "learning_rate": 1.2403846153846156e-05, + "loss": 1.4736, + "step": 389 + }, + { + "epoch": 0.02, + "grad_norm": 1.4071165180325196, + "learning_rate": 1.2435897435897436e-05, + "loss": 1.353, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 2.0851094197433793, + "learning_rate": 1.2467948717948719e-05, + "loss": 1.5176, + "step": 391 + }, + { + "epoch": 0.02, + "grad_norm": 1.3229683183230674, + "learning_rate": 1.25e-05, + "loss": 1.4102, + "step": 392 + }, + { + "epoch": 0.02, + "grad_norm": 1.4853370638473593, + "learning_rate": 1.2532051282051283e-05, + "loss": 1.2668, + "step": 393 + }, + { + "epoch": 0.02, + "grad_norm": 1.1870355905247647, + "learning_rate": 1.2564102564102565e-05, + "loss": 1.3403, + "step": 394 + }, + { + "epoch": 0.02, + "grad_norm": 1.504089343823328, + "learning_rate": 1.2596153846153847e-05, + "loss": 1.332, + "step": 395 + }, + { + "epoch": 0.02, + "grad_norm": 1.5321805668699346, + "learning_rate": 1.2628205128205129e-05, + "loss": 1.3467, + "step": 396 + }, + { + "epoch": 0.02, + "grad_norm": 1.5823430646814343, + "learning_rate": 1.2660256410256411e-05, + "loss": 1.5112, + "step": 397 + }, + { + "epoch": 0.02, + "grad_norm": 1.2761497624577036, + "learning_rate": 1.2692307692307693e-05, + "loss": 1.3232, + "step": 398 + }, + { + "epoch": 0.02, + "grad_norm": 1.1549200773500554, + "learning_rate": 1.2724358974358975e-05, + "loss": 1.396, + "step": 399 + }, + { + "epoch": 0.02, + "grad_norm": 1.267847493836026, + "learning_rate": 1.2756410256410257e-05, + "loss": 1.2495, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 1.3184570422607709, + "learning_rate": 1.2788461538461538e-05, + "loss": 1.3213, + "step": 401 + }, + { + "epoch": 0.02, + "grad_norm": 1.2403072183011512, + "learning_rate": 1.2820512820512823e-05, + "loss": 1.3247, + "step": 402 + }, + { + "epoch": 0.02, + "grad_norm": 1.3597493157154914, + "learning_rate": 1.2852564102564105e-05, + "loss": 1.6499, + "step": 403 + }, + { + "epoch": 0.02, + "grad_norm": 1.214172883747685, + "learning_rate": 1.2884615384615386e-05, + "loss": 1.4644, + "step": 404 + }, + { + "epoch": 0.02, + "grad_norm": 1.3371797055041754, + "learning_rate": 1.2916666666666668e-05, + "loss": 1.3667, + "step": 405 + }, + { + "epoch": 0.02, + "grad_norm": 1.0974180779714644, + "learning_rate": 1.294871794871795e-05, + "loss": 1.3931, + "step": 406 + }, + { + "epoch": 0.02, + "grad_norm": 1.218185854267764, + "learning_rate": 1.2980769230769232e-05, + "loss": 1.394, + "step": 407 + }, + { + "epoch": 0.02, + "grad_norm": 1.300208779188169, + "learning_rate": 1.3012820512820514e-05, + "loss": 1.332, + "step": 408 + }, + { + "epoch": 0.02, + "grad_norm": 1.276319102582932, + "learning_rate": 1.3044871794871796e-05, + "loss": 1.0845, + "step": 409 + }, + { + "epoch": 0.02, + "grad_norm": 1.23973437135605, + "learning_rate": 1.3076923076923078e-05, + "loss": 1.3481, + "step": 410 + }, + { + "epoch": 0.02, + "grad_norm": 1.3170565442346465, + "learning_rate": 1.310897435897436e-05, + "loss": 1.4185, + "step": 411 + }, + { + "epoch": 0.02, + "grad_norm": 1.4969675783159206, + "learning_rate": 1.3141025641025642e-05, + "loss": 1.354, + "step": 412 + }, + { + "epoch": 0.02, + "grad_norm": 1.7639450193235466, + "learning_rate": 1.3173076923076925e-05, + "loss": 1.3486, + "step": 413 + }, + { + "epoch": 0.02, + "grad_norm": 1.9263564012989225, + "learning_rate": 1.3205128205128207e-05, + "loss": 1.6157, + "step": 414 + }, + { + "epoch": 0.02, + "grad_norm": 1.2579722293146822, + "learning_rate": 1.3237179487179487e-05, + "loss": 1.2314, + "step": 415 + }, + { + "epoch": 0.02, + "grad_norm": 1.5875289321322101, + "learning_rate": 1.3269230769230769e-05, + "loss": 1.3984, + "step": 416 + }, + { + "epoch": 0.02, + "grad_norm": 1.4036819396205769, + "learning_rate": 1.3301282051282051e-05, + "loss": 1.2651, + "step": 417 + }, + { + "epoch": 0.02, + "grad_norm": 1.3607301170192805, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4873, + "step": 418 + }, + { + "epoch": 0.02, + "grad_norm": 1.2216593704601035, + "learning_rate": 1.3365384615384615e-05, + "loss": 1.2241, + "step": 419 + }, + { + "epoch": 0.02, + "grad_norm": 1.2389080776413752, + "learning_rate": 1.3397435897435897e-05, + "loss": 1.2642, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 1.117370907316905, + "learning_rate": 1.342948717948718e-05, + "loss": 1.2358, + "step": 421 + }, + { + "epoch": 0.02, + "grad_norm": 1.118087185561451, + "learning_rate": 1.3461538461538463e-05, + "loss": 1.2661, + "step": 422 + }, + { + "epoch": 0.02, + "grad_norm": 1.2654215672483675, + "learning_rate": 1.3493589743589745e-05, + "loss": 1.3218, + "step": 423 + }, + { + "epoch": 0.02, + "grad_norm": 1.447322052774496, + "learning_rate": 1.3525641025641028e-05, + "loss": 1.5986, + "step": 424 + }, + { + "epoch": 0.02, + "grad_norm": 1.2858801812393017, + "learning_rate": 1.355769230769231e-05, + "loss": 1.3184, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 1.3029973554102459, + "learning_rate": 1.3589743589743592e-05, + "loss": 1.3193, + "step": 426 + }, + { + "epoch": 0.02, + "grad_norm": 1.3263639292111544, + "learning_rate": 1.3621794871794874e-05, + "loss": 1.3906, + "step": 427 + }, + { + "epoch": 0.02, + "grad_norm": 1.059254631454006, + "learning_rate": 1.3653846153846156e-05, + "loss": 1.2583, + "step": 428 + }, + { + "epoch": 0.02, + "grad_norm": 1.2836032506218642, + "learning_rate": 1.3685897435897438e-05, + "loss": 1.3071, + "step": 429 + }, + { + "epoch": 0.02, + "grad_norm": 1.2337693323714396, + "learning_rate": 1.3717948717948718e-05, + "loss": 1.4712, + "step": 430 + }, + { + "epoch": 0.02, + "grad_norm": 1.492108564766739, + "learning_rate": 1.375e-05, + "loss": 1.4907, + "step": 431 + }, + { + "epoch": 0.02, + "grad_norm": 1.3307189173873828, + "learning_rate": 1.3782051282051283e-05, + "loss": 1.2534, + "step": 432 + }, + { + "epoch": 0.02, + "grad_norm": 1.2501749388020176, + "learning_rate": 1.3814102564102565e-05, + "loss": 1.2495, + "step": 433 + }, + { + "epoch": 0.02, + "grad_norm": 1.4581552746325526, + "learning_rate": 1.3846153846153847e-05, + "loss": 1.3721, + "step": 434 + }, + { + "epoch": 0.02, + "grad_norm": 1.5518329244245812, + "learning_rate": 1.3878205128205129e-05, + "loss": 1.3267, + "step": 435 + }, + { + "epoch": 0.02, + "grad_norm": 1.8797997336582097, + "learning_rate": 1.3910256410256411e-05, + "loss": 1.5161, + "step": 436 + }, + { + "epoch": 0.02, + "grad_norm": 1.1566499953624272, + "learning_rate": 1.3942307692307693e-05, + "loss": 1.4575, + "step": 437 + }, + { + "epoch": 0.02, + "grad_norm": 1.4273618093245022, + "learning_rate": 1.3974358974358975e-05, + "loss": 1.4155, + "step": 438 + }, + { + "epoch": 0.02, + "grad_norm": 1.418291971683312, + "learning_rate": 1.4006410256410257e-05, + "loss": 1.4375, + "step": 439 + }, + { + "epoch": 0.02, + "grad_norm": 1.2541249133240433, + "learning_rate": 1.403846153846154e-05, + "loss": 1.4771, + "step": 440 + }, + { + "epoch": 0.02, + "grad_norm": 1.1462392922714209, + "learning_rate": 1.4070512820512823e-05, + "loss": 1.3403, + "step": 441 + }, + { + "epoch": 0.02, + "grad_norm": 1.2094459866707437, + "learning_rate": 1.4102564102564105e-05, + "loss": 1.4746, + "step": 442 + }, + { + "epoch": 0.02, + "grad_norm": 1.391059730506335, + "learning_rate": 1.4134615384615387e-05, + "loss": 1.4907, + "step": 443 + }, + { + "epoch": 0.02, + "grad_norm": 1.4291849581594014, + "learning_rate": 1.416666666666667e-05, + "loss": 1.4214, + "step": 444 + }, + { + "epoch": 0.02, + "grad_norm": 1.3849603958342436, + "learning_rate": 1.419871794871795e-05, + "loss": 1.4814, + "step": 445 + }, + { + "epoch": 0.02, + "grad_norm": 1.4123325940874079, + "learning_rate": 1.4230769230769232e-05, + "loss": 1.3867, + "step": 446 + }, + { + "epoch": 0.02, + "grad_norm": 1.2491519301399518, + "learning_rate": 1.4262820512820514e-05, + "loss": 1.4355, + "step": 447 + }, + { + "epoch": 0.02, + "grad_norm": 1.385088745064307, + "learning_rate": 1.4294871794871796e-05, + "loss": 1.4468, + "step": 448 + }, + { + "epoch": 0.02, + "grad_norm": 1.4034533875763116, + "learning_rate": 1.4326923076923078e-05, + "loss": 1.4443, + "step": 449 + }, + { + "epoch": 0.02, + "grad_norm": 1.6690925808258177, + "learning_rate": 1.435897435897436e-05, + "loss": 1.394, + "step": 450 + }, + { + "epoch": 0.02, + "grad_norm": 1.382429368409419, + "learning_rate": 1.4391025641025642e-05, + "loss": 1.4604, + "step": 451 + }, + { + "epoch": 0.02, + "grad_norm": 1.3383284079637572, + "learning_rate": 1.4423076923076924e-05, + "loss": 1.1294, + "step": 452 + }, + { + "epoch": 0.02, + "grad_norm": 1.2705889899306166, + "learning_rate": 1.4455128205128207e-05, + "loss": 1.376, + "step": 453 + }, + { + "epoch": 0.02, + "grad_norm": 1.1686124888717497, + "learning_rate": 1.4487179487179489e-05, + "loss": 1.2856, + "step": 454 + }, + { + "epoch": 0.02, + "grad_norm": 1.2749741389421876, + "learning_rate": 1.451923076923077e-05, + "loss": 1.2329, + "step": 455 + }, + { + "epoch": 0.02, + "grad_norm": 1.2580765489359371, + "learning_rate": 1.4551282051282051e-05, + "loss": 1.2793, + "step": 456 + }, + { + "epoch": 0.02, + "grad_norm": 1.3592702221041502, + "learning_rate": 1.4583333333333333e-05, + "loss": 1.3877, + "step": 457 + }, + { + "epoch": 0.02, + "grad_norm": 1.3895434960715267, + "learning_rate": 1.4615384615384615e-05, + "loss": 1.4829, + "step": 458 + }, + { + "epoch": 0.02, + "grad_norm": 1.7269809081369298, + "learning_rate": 1.4647435897435897e-05, + "loss": 1.4624, + "step": 459 + }, + { + "epoch": 0.02, + "grad_norm": 1.5239890261842506, + "learning_rate": 1.467948717948718e-05, + "loss": 1.4985, + "step": 460 + }, + { + "epoch": 0.02, + "grad_norm": 1.4313301615232084, + "learning_rate": 1.4711538461538463e-05, + "loss": 1.3398, + "step": 461 + }, + { + "epoch": 0.02, + "grad_norm": 1.2783271879799631, + "learning_rate": 1.4743589743589745e-05, + "loss": 1.3608, + "step": 462 + }, + { + "epoch": 0.02, + "grad_norm": 1.3457406427324083, + "learning_rate": 1.4775641025641027e-05, + "loss": 1.418, + "step": 463 + }, + { + "epoch": 0.02, + "grad_norm": 1.3931762446456175, + "learning_rate": 1.480769230769231e-05, + "loss": 1.3604, + "step": 464 + }, + { + "epoch": 0.02, + "grad_norm": 1.4288505584678168, + "learning_rate": 1.4839743589743592e-05, + "loss": 1.4463, + "step": 465 + }, + { + "epoch": 0.02, + "grad_norm": 1.342985301820977, + "learning_rate": 1.4871794871794874e-05, + "loss": 1.1196, + "step": 466 + }, + { + "epoch": 0.02, + "grad_norm": 1.4593839183430386, + "learning_rate": 1.4903846153846156e-05, + "loss": 1.3877, + "step": 467 + }, + { + "epoch": 0.02, + "grad_norm": 1.4032202932228555, + "learning_rate": 1.4935897435897438e-05, + "loss": 1.2837, + "step": 468 + }, + { + "epoch": 0.02, + "grad_norm": 1.5592303289765068, + "learning_rate": 1.496794871794872e-05, + "loss": 1.584, + "step": 469 + }, + { + "epoch": 0.02, + "grad_norm": 1.0671623940891528, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.3833, + "step": 470 + }, + { + "epoch": 0.02, + "grad_norm": 1.688656676510716, + "learning_rate": 1.5032051282051282e-05, + "loss": 1.3125, + "step": 471 + }, + { + "epoch": 0.02, + "grad_norm": 1.3259426919079789, + "learning_rate": 1.5064102564102565e-05, + "loss": 1.3774, + "step": 472 + }, + { + "epoch": 0.02, + "grad_norm": 1.5122403593391645, + "learning_rate": 1.5096153846153847e-05, + "loss": 1.3193, + "step": 473 + }, + { + "epoch": 0.02, + "grad_norm": 1.298995478132946, + "learning_rate": 1.5128205128205129e-05, + "loss": 1.3608, + "step": 474 + }, + { + "epoch": 0.02, + "grad_norm": 1.453250029728073, + "learning_rate": 1.516025641025641e-05, + "loss": 1.3594, + "step": 475 + }, + { + "epoch": 0.02, + "grad_norm": 1.50172762292981, + "learning_rate": 1.5192307692307693e-05, + "loss": 1.4419, + "step": 476 + }, + { + "epoch": 0.02, + "grad_norm": 1.2655914489475526, + "learning_rate": 1.5224358974358975e-05, + "loss": 1.3872, + "step": 477 + }, + { + "epoch": 0.02, + "grad_norm": 1.5007388470501197, + "learning_rate": 1.5256410256410257e-05, + "loss": 1.4858, + "step": 478 + }, + { + "epoch": 0.02, + "grad_norm": 1.3411706083877026, + "learning_rate": 1.528846153846154e-05, + "loss": 1.4971, + "step": 479 + }, + { + "epoch": 0.02, + "grad_norm": 1.6111704141004897, + "learning_rate": 1.5320512820512823e-05, + "loss": 1.5166, + "step": 480 + }, + { + "epoch": 0.02, + "grad_norm": 1.2560735029638563, + "learning_rate": 1.5352564102564103e-05, + "loss": 1.2227, + "step": 481 + }, + { + "epoch": 0.02, + "grad_norm": 1.710631494146579, + "learning_rate": 1.5384615384615387e-05, + "loss": 1.4624, + "step": 482 + }, + { + "epoch": 0.02, + "grad_norm": 1.4471676155677404, + "learning_rate": 1.5416666666666668e-05, + "loss": 1.4858, + "step": 483 + }, + { + "epoch": 0.02, + "grad_norm": 1.2683673322986313, + "learning_rate": 1.544871794871795e-05, + "loss": 1.3262, + "step": 484 + }, + { + "epoch": 0.02, + "grad_norm": 1.3431884638640537, + "learning_rate": 1.5480769230769232e-05, + "loss": 1.3931, + "step": 485 + }, + { + "epoch": 0.02, + "grad_norm": 1.5859419898497757, + "learning_rate": 1.5512820512820516e-05, + "loss": 1.5361, + "step": 486 + }, + { + "epoch": 0.02, + "grad_norm": 1.1940940906097317, + "learning_rate": 1.5544871794871796e-05, + "loss": 1.3555, + "step": 487 + }, + { + "epoch": 0.02, + "grad_norm": 1.208184860037386, + "learning_rate": 1.557692307692308e-05, + "loss": 1.3296, + "step": 488 + }, + { + "epoch": 0.02, + "grad_norm": 1.6571858267866035, + "learning_rate": 1.560897435897436e-05, + "loss": 1.5972, + "step": 489 + }, + { + "epoch": 0.02, + "grad_norm": 1.2572573420259763, + "learning_rate": 1.5641025641025644e-05, + "loss": 1.3457, + "step": 490 + }, + { + "epoch": 0.02, + "grad_norm": 1.129851551616406, + "learning_rate": 1.5673076923076924e-05, + "loss": 1.2217, + "step": 491 + }, + { + "epoch": 0.02, + "grad_norm": 1.503313161081039, + "learning_rate": 1.5705128205128205e-05, + "loss": 1.3223, + "step": 492 + }, + { + "epoch": 0.02, + "grad_norm": 1.2548475152856144, + "learning_rate": 1.573717948717949e-05, + "loss": 1.4355, + "step": 493 + }, + { + "epoch": 0.02, + "grad_norm": 1.3378438528405046, + "learning_rate": 1.576923076923077e-05, + "loss": 1.3794, + "step": 494 + }, + { + "epoch": 0.02, + "grad_norm": 1.3908827505746522, + "learning_rate": 1.5801282051282053e-05, + "loss": 1.313, + "step": 495 + }, + { + "epoch": 0.02, + "grad_norm": 1.5951990819483384, + "learning_rate": 1.5833333333333333e-05, + "loss": 1.2749, + "step": 496 + }, + { + "epoch": 0.02, + "grad_norm": 1.4305845608831436, + "learning_rate": 1.5865384615384617e-05, + "loss": 1.3608, + "step": 497 + }, + { + "epoch": 0.02, + "grad_norm": 1.4012784517118198, + "learning_rate": 1.5897435897435897e-05, + "loss": 1.356, + "step": 498 + }, + { + "epoch": 0.02, + "grad_norm": 1.5294506791195728, + "learning_rate": 1.592948717948718e-05, + "loss": 1.4619, + "step": 499 + }, + { + "epoch": 0.02, + "grad_norm": 1.3985439146611949, + "learning_rate": 1.5961538461538465e-05, + "loss": 1.46, + "step": 500 + }, + { + "epoch": 0.02, + "grad_norm": 1.6209106621364877, + "learning_rate": 1.5993589743589745e-05, + "loss": 1.4336, + "step": 501 + }, + { + "epoch": 0.02, + "grad_norm": 1.19239001982262, + "learning_rate": 1.602564102564103e-05, + "loss": 1.3608, + "step": 502 + }, + { + "epoch": 0.02, + "grad_norm": 1.4225331684582514, + "learning_rate": 1.605769230769231e-05, + "loss": 1.4028, + "step": 503 + }, + { + "epoch": 0.02, + "grad_norm": 1.3313304414095088, + "learning_rate": 1.6089743589743593e-05, + "loss": 1.1594, + "step": 504 + }, + { + "epoch": 0.02, + "grad_norm": 1.2610625192883973, + "learning_rate": 1.6121794871794874e-05, + "loss": 1.3315, + "step": 505 + }, + { + "epoch": 0.02, + "grad_norm": 1.1458905809275148, + "learning_rate": 1.6153846153846154e-05, + "loss": 1.437, + "step": 506 + }, + { + "epoch": 0.02, + "grad_norm": 1.640721088627141, + "learning_rate": 1.6185897435897438e-05, + "loss": 1.5537, + "step": 507 + }, + { + "epoch": 0.02, + "grad_norm": 1.281672844333867, + "learning_rate": 1.6217948717948718e-05, + "loss": 1.1401, + "step": 508 + }, + { + "epoch": 0.02, + "grad_norm": 1.305292744782369, + "learning_rate": 1.6250000000000002e-05, + "loss": 1.1514, + "step": 509 + }, + { + "epoch": 0.02, + "grad_norm": 1.0516093435593628, + "learning_rate": 1.6282051282051282e-05, + "loss": 1.2852, + "step": 510 + }, + { + "epoch": 0.02, + "grad_norm": 1.2679003136024116, + "learning_rate": 1.6314102564102566e-05, + "loss": 1.4624, + "step": 511 + }, + { + "epoch": 0.02, + "grad_norm": 1.3047158184357295, + "learning_rate": 1.6346153846153847e-05, + "loss": 1.4932, + "step": 512 + }, + { + "epoch": 0.02, + "grad_norm": 1.3715912148181022, + "learning_rate": 1.637820512820513e-05, + "loss": 1.3623, + "step": 513 + }, + { + "epoch": 0.02, + "grad_norm": 1.4661172391319894, + "learning_rate": 1.641025641025641e-05, + "loss": 1.4819, + "step": 514 + }, + { + "epoch": 0.02, + "grad_norm": 1.428185708269545, + "learning_rate": 1.6442307692307695e-05, + "loss": 1.2607, + "step": 515 + }, + { + "epoch": 0.02, + "grad_norm": 1.421599428979329, + "learning_rate": 1.6474358974358975e-05, + "loss": 1.1675, + "step": 516 + }, + { + "epoch": 0.02, + "grad_norm": 1.4434464214298612, + "learning_rate": 1.6506410256410255e-05, + "loss": 1.2471, + "step": 517 + }, + { + "epoch": 0.02, + "grad_norm": 1.2912728677963612, + "learning_rate": 1.653846153846154e-05, + "loss": 1.4097, + "step": 518 + }, + { + "epoch": 0.02, + "grad_norm": 1.37928644520339, + "learning_rate": 1.6570512820512823e-05, + "loss": 1.3325, + "step": 519 + }, + { + "epoch": 0.03, + "grad_norm": 1.5243439790781024, + "learning_rate": 1.6602564102564103e-05, + "loss": 1.4595, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 1.3872808014398006, + "learning_rate": 1.6634615384615387e-05, + "loss": 1.416, + "step": 521 + }, + { + "epoch": 0.03, + "grad_norm": 1.252778685941513, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.3066, + "step": 522 + }, + { + "epoch": 0.03, + "grad_norm": 1.4097986795543291, + "learning_rate": 1.669871794871795e-05, + "loss": 1.3564, + "step": 523 + }, + { + "epoch": 0.03, + "grad_norm": 1.5960003013699287, + "learning_rate": 1.673076923076923e-05, + "loss": 1.3701, + "step": 524 + }, + { + "epoch": 0.03, + "grad_norm": 1.215067235980445, + "learning_rate": 1.6762820512820515e-05, + "loss": 1.3301, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 1.6221162089496706, + "learning_rate": 1.6794871794871796e-05, + "loss": 1.3198, + "step": 526 + }, + { + "epoch": 0.03, + "grad_norm": 1.169658269624973, + "learning_rate": 1.682692307692308e-05, + "loss": 1.2725, + "step": 527 + }, + { + "epoch": 0.03, + "grad_norm": 1.2641198357132362, + "learning_rate": 1.685897435897436e-05, + "loss": 1.3477, + "step": 528 + }, + { + "epoch": 0.03, + "grad_norm": 1.3383105949534142, + "learning_rate": 1.6891025641025644e-05, + "loss": 1.3906, + "step": 529 + }, + { + "epoch": 0.03, + "grad_norm": 1.2125034834919766, + "learning_rate": 1.6923076923076924e-05, + "loss": 1.2959, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 1.2193697422118333, + "learning_rate": 1.6955128205128205e-05, + "loss": 1.1348, + "step": 531 + }, + { + "epoch": 0.03, + "grad_norm": 1.0483726110681362, + "learning_rate": 1.698717948717949e-05, + "loss": 1.125, + "step": 532 + }, + { + "epoch": 0.03, + "grad_norm": 1.301435717124425, + "learning_rate": 1.701923076923077e-05, + "loss": 1.397, + "step": 533 + }, + { + "epoch": 0.03, + "grad_norm": 1.4560095870444878, + "learning_rate": 1.7051282051282053e-05, + "loss": 1.4502, + "step": 534 + }, + { + "epoch": 0.03, + "grad_norm": 1.5803969973354353, + "learning_rate": 1.7083333333333333e-05, + "loss": 1.479, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 1.1313987662572405, + "learning_rate": 1.7115384615384617e-05, + "loss": 1.3579, + "step": 536 + }, + { + "epoch": 0.03, + "grad_norm": 1.4399778764378397, + "learning_rate": 1.7147435897435897e-05, + "loss": 1.4668, + "step": 537 + }, + { + "epoch": 0.03, + "grad_norm": 1.0867392422924127, + "learning_rate": 1.717948717948718e-05, + "loss": 1.3281, + "step": 538 + }, + { + "epoch": 0.03, + "grad_norm": 1.3560007508174348, + "learning_rate": 1.7211538461538465e-05, + "loss": 1.3706, + "step": 539 + }, + { + "epoch": 0.03, + "grad_norm": 1.577184214804529, + "learning_rate": 1.7243589743589745e-05, + "loss": 1.4482, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 1.27974949664638, + "learning_rate": 1.727564102564103e-05, + "loss": 1.272, + "step": 541 + }, + { + "epoch": 0.03, + "grad_norm": 1.649803265845458, + "learning_rate": 1.730769230769231e-05, + "loss": 1.5264, + "step": 542 + }, + { + "epoch": 0.03, + "grad_norm": 1.0330544965306192, + "learning_rate": 1.7339743589743593e-05, + "loss": 1.293, + "step": 543 + }, + { + "epoch": 0.03, + "grad_norm": 1.2495061811930788, + "learning_rate": 1.7371794871794873e-05, + "loss": 1.312, + "step": 544 + }, + { + "epoch": 0.03, + "grad_norm": 1.4141018467100808, + "learning_rate": 1.7403846153846157e-05, + "loss": 1.4893, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 1.4150550588689295, + "learning_rate": 1.7435897435897438e-05, + "loss": 1.3452, + "step": 546 + }, + { + "epoch": 0.03, + "grad_norm": 1.3233757069406549, + "learning_rate": 1.7467948717948718e-05, + "loss": 1.2437, + "step": 547 + }, + { + "epoch": 0.03, + "grad_norm": 1.3788723528022941, + "learning_rate": 1.7500000000000002e-05, + "loss": 1.4556, + "step": 548 + }, + { + "epoch": 0.03, + "grad_norm": 1.525281909384566, + "learning_rate": 1.7532051282051282e-05, + "loss": 1.3765, + "step": 549 + }, + { + "epoch": 0.03, + "grad_norm": 1.6300224651437343, + "learning_rate": 1.7564102564102566e-05, + "loss": 1.3359, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 0.9827583501770526, + "learning_rate": 1.7596153846153846e-05, + "loss": 1.4575, + "step": 551 + }, + { + "epoch": 0.03, + "grad_norm": 1.438150843256687, + "learning_rate": 1.762820512820513e-05, + "loss": 1.4668, + "step": 552 + }, + { + "epoch": 0.03, + "grad_norm": 1.3881294157609494, + "learning_rate": 1.766025641025641e-05, + "loss": 1.4653, + "step": 553 + }, + { + "epoch": 0.03, + "grad_norm": 1.3573317383984584, + "learning_rate": 1.7692307692307694e-05, + "loss": 1.4028, + "step": 554 + }, + { + "epoch": 0.03, + "grad_norm": 1.2324225516442557, + "learning_rate": 1.7724358974358975e-05, + "loss": 1.2417, + "step": 555 + }, + { + "epoch": 0.03, + "grad_norm": 1.6460870836234336, + "learning_rate": 1.775641025641026e-05, + "loss": 1.2334, + "step": 556 + }, + { + "epoch": 0.03, + "grad_norm": 1.7766022952503593, + "learning_rate": 1.778846153846154e-05, + "loss": 1.457, + "step": 557 + }, + { + "epoch": 0.03, + "grad_norm": 1.5932577093243538, + "learning_rate": 1.7820512820512823e-05, + "loss": 1.4517, + "step": 558 + }, + { + "epoch": 0.03, + "grad_norm": 1.7265563714545993, + "learning_rate": 1.7852564102564107e-05, + "loss": 1.5249, + "step": 559 + }, + { + "epoch": 0.03, + "grad_norm": 1.1021410412480328, + "learning_rate": 1.7884615384615387e-05, + "loss": 1.4058, + "step": 560 + }, + { + "epoch": 0.03, + "grad_norm": 1.409330012836106, + "learning_rate": 1.7916666666666667e-05, + "loss": 1.4609, + "step": 561 + }, + { + "epoch": 0.03, + "grad_norm": 1.2447276693231564, + "learning_rate": 1.794871794871795e-05, + "loss": 1.1631, + "step": 562 + }, + { + "epoch": 0.03, + "grad_norm": 1.326329883052271, + "learning_rate": 1.798076923076923e-05, + "loss": 1.2812, + "step": 563 + }, + { + "epoch": 0.03, + "grad_norm": 1.442058547769068, + "learning_rate": 1.8012820512820515e-05, + "loss": 1.4717, + "step": 564 + }, + { + "epoch": 0.03, + "grad_norm": 1.3942885748480063, + "learning_rate": 1.8044871794871796e-05, + "loss": 1.248, + "step": 565 + }, + { + "epoch": 0.03, + "grad_norm": 1.2460295677194884, + "learning_rate": 1.807692307692308e-05, + "loss": 1.3457, + "step": 566 + }, + { + "epoch": 0.03, + "grad_norm": 1.2313538335007477, + "learning_rate": 1.810897435897436e-05, + "loss": 1.3013, + "step": 567 + }, + { + "epoch": 0.03, + "grad_norm": 1.405532423083906, + "learning_rate": 1.8141025641025644e-05, + "loss": 1.4443, + "step": 568 + }, + { + "epoch": 0.03, + "grad_norm": 1.3862902484466386, + "learning_rate": 1.8173076923076924e-05, + "loss": 1.4473, + "step": 569 + }, + { + "epoch": 0.03, + "grad_norm": 1.5067306561733516, + "learning_rate": 1.8205128205128208e-05, + "loss": 1.3247, + "step": 570 + }, + { + "epoch": 0.03, + "grad_norm": 1.168314966107389, + "learning_rate": 1.8237179487179488e-05, + "loss": 1.4487, + "step": 571 + }, + { + "epoch": 0.03, + "grad_norm": 1.0320831440647809, + "learning_rate": 1.826923076923077e-05, + "loss": 1.3472, + "step": 572 + }, + { + "epoch": 0.03, + "grad_norm": 1.4347909403959687, + "learning_rate": 1.8301282051282052e-05, + "loss": 1.3281, + "step": 573 + }, + { + "epoch": 0.03, + "grad_norm": 1.6358956841756134, + "learning_rate": 1.8333333333333333e-05, + "loss": 1.1406, + "step": 574 + }, + { + "epoch": 0.03, + "grad_norm": 1.5224740569058406, + "learning_rate": 1.8365384615384617e-05, + "loss": 1.2402, + "step": 575 + }, + { + "epoch": 0.03, + "grad_norm": 1.472174960466282, + "learning_rate": 1.8397435897435897e-05, + "loss": 1.4648, + "step": 576 + }, + { + "epoch": 0.03, + "grad_norm": 1.0629197960929988, + "learning_rate": 1.842948717948718e-05, + "loss": 1.2329, + "step": 577 + }, + { + "epoch": 0.03, + "grad_norm": 1.3327914278952546, + "learning_rate": 1.8461538461538465e-05, + "loss": 1.3281, + "step": 578 + }, + { + "epoch": 0.03, + "grad_norm": 1.2647857300728194, + "learning_rate": 1.8493589743589745e-05, + "loss": 1.5117, + "step": 579 + }, + { + "epoch": 0.03, + "grad_norm": 1.675142820118515, + "learning_rate": 1.852564102564103e-05, + "loss": 1.4766, + "step": 580 + }, + { + "epoch": 0.03, + "grad_norm": 1.3347226546451025, + "learning_rate": 1.855769230769231e-05, + "loss": 1.3975, + "step": 581 + }, + { + "epoch": 0.03, + "grad_norm": 1.4814023981099653, + "learning_rate": 1.8589743589743593e-05, + "loss": 1.2139, + "step": 582 + }, + { + "epoch": 0.03, + "grad_norm": 1.3236718773254184, + "learning_rate": 1.8621794871794873e-05, + "loss": 1.2974, + "step": 583 + }, + { + "epoch": 0.03, + "grad_norm": 1.406576698027228, + "learning_rate": 1.8653846153846157e-05, + "loss": 1.4541, + "step": 584 + }, + { + "epoch": 0.03, + "grad_norm": 1.1886971561706663, + "learning_rate": 1.8685897435897438e-05, + "loss": 1.3018, + "step": 585 + }, + { + "epoch": 0.03, + "grad_norm": 1.607295599832651, + "learning_rate": 1.8717948717948718e-05, + "loss": 1.4907, + "step": 586 + }, + { + "epoch": 0.03, + "grad_norm": 1.427253983843186, + "learning_rate": 1.8750000000000002e-05, + "loss": 1.4482, + "step": 587 + }, + { + "epoch": 0.03, + "grad_norm": 1.4043818712862395, + "learning_rate": 1.8782051282051282e-05, + "loss": 1.4121, + "step": 588 + }, + { + "epoch": 0.03, + "grad_norm": 1.1943086528526208, + "learning_rate": 1.8814102564102566e-05, + "loss": 1.1504, + "step": 589 + }, + { + "epoch": 0.03, + "grad_norm": 1.38900044691441, + "learning_rate": 1.8846153846153846e-05, + "loss": 1.3633, + "step": 590 + }, + { + "epoch": 0.03, + "grad_norm": 1.2721154130985086, + "learning_rate": 1.887820512820513e-05, + "loss": 1.3403, + "step": 591 + }, + { + "epoch": 0.03, + "grad_norm": 1.2023222382446153, + "learning_rate": 1.891025641025641e-05, + "loss": 1.3726, + "step": 592 + }, + { + "epoch": 0.03, + "grad_norm": 0.981422317443676, + "learning_rate": 1.8942307692307694e-05, + "loss": 1.2163, + "step": 593 + }, + { + "epoch": 0.03, + "grad_norm": 1.4300202234460881, + "learning_rate": 1.8974358974358975e-05, + "loss": 1.4526, + "step": 594 + }, + { + "epoch": 0.03, + "grad_norm": 1.3870582048133047, + "learning_rate": 1.900641025641026e-05, + "loss": 1.4004, + "step": 595 + }, + { + "epoch": 0.03, + "grad_norm": 1.3147265536325958, + "learning_rate": 1.903846153846154e-05, + "loss": 1.438, + "step": 596 + }, + { + "epoch": 0.03, + "grad_norm": 1.2256434276808312, + "learning_rate": 1.9070512820512823e-05, + "loss": 1.3521, + "step": 597 + }, + { + "epoch": 0.03, + "grad_norm": 1.708185371555497, + "learning_rate": 1.9102564102564106e-05, + "loss": 1.2197, + "step": 598 + }, + { + "epoch": 0.03, + "grad_norm": 1.3909140275716405, + "learning_rate": 1.9134615384615387e-05, + "loss": 1.23, + "step": 599 + }, + { + "epoch": 0.03, + "grad_norm": 1.6289224779159834, + "learning_rate": 1.916666666666667e-05, + "loss": 1.3921, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 1.4880436944566162, + "learning_rate": 1.919871794871795e-05, + "loss": 1.2505, + "step": 601 + }, + { + "epoch": 0.03, + "grad_norm": 1.6331105528274013, + "learning_rate": 1.923076923076923e-05, + "loss": 1.3555, + "step": 602 + }, + { + "epoch": 0.03, + "grad_norm": 1.254716190535359, + "learning_rate": 1.9262820512820515e-05, + "loss": 1.4829, + "step": 603 + }, + { + "epoch": 0.03, + "grad_norm": 1.4288505474526056, + "learning_rate": 1.9294871794871796e-05, + "loss": 1.4751, + "step": 604 + }, + { + "epoch": 0.03, + "grad_norm": 1.8105580735154534, + "learning_rate": 1.932692307692308e-05, + "loss": 1.4346, + "step": 605 + }, + { + "epoch": 0.03, + "grad_norm": 1.2210290966783124, + "learning_rate": 1.935897435897436e-05, + "loss": 1.4565, + "step": 606 + }, + { + "epoch": 0.03, + "grad_norm": 1.8079497280637575, + "learning_rate": 1.9391025641025644e-05, + "loss": 1.3965, + "step": 607 + }, + { + "epoch": 0.03, + "grad_norm": 1.0564735052393015, + "learning_rate": 1.9423076923076924e-05, + "loss": 1.3125, + "step": 608 + }, + { + "epoch": 0.03, + "grad_norm": 0.9849223233888563, + "learning_rate": 1.9455128205128208e-05, + "loss": 1.1646, + "step": 609 + }, + { + "epoch": 0.03, + "grad_norm": 1.295635147333379, + "learning_rate": 1.9487179487179488e-05, + "loss": 1.2998, + "step": 610 + }, + { + "epoch": 0.03, + "grad_norm": 1.3341206002337973, + "learning_rate": 1.9519230769230772e-05, + "loss": 1.3325, + "step": 611 + }, + { + "epoch": 0.03, + "grad_norm": 1.6232397344974392, + "learning_rate": 1.9551282051282052e-05, + "loss": 1.4453, + "step": 612 + }, + { + "epoch": 0.03, + "grad_norm": 1.3249373679083745, + "learning_rate": 1.9583333333333333e-05, + "loss": 1.2539, + "step": 613 + }, + { + "epoch": 0.03, + "grad_norm": 1.3100531949710184, + "learning_rate": 1.9615384615384617e-05, + "loss": 1.4902, + "step": 614 + }, + { + "epoch": 0.03, + "grad_norm": 1.3483583329678617, + "learning_rate": 1.9647435897435897e-05, + "loss": 1.3384, + "step": 615 + }, + { + "epoch": 0.03, + "grad_norm": 1.0000197179180486, + "learning_rate": 1.967948717948718e-05, + "loss": 1.5186, + "step": 616 + }, + { + "epoch": 0.03, + "grad_norm": 1.1997745547733292, + "learning_rate": 1.9711538461538465e-05, + "loss": 1.2812, + "step": 617 + }, + { + "epoch": 0.03, + "grad_norm": 1.6149044939941823, + "learning_rate": 1.9743589743589745e-05, + "loss": 1.3481, + "step": 618 + }, + { + "epoch": 0.03, + "grad_norm": 1.5312817527254543, + "learning_rate": 1.977564102564103e-05, + "loss": 1.4795, + "step": 619 + }, + { + "epoch": 0.03, + "grad_norm": 1.163174928475769, + "learning_rate": 1.980769230769231e-05, + "loss": 1.354, + "step": 620 + }, + { + "epoch": 0.03, + "grad_norm": 1.3993322421382928, + "learning_rate": 1.9839743589743593e-05, + "loss": 1.2803, + "step": 621 + }, + { + "epoch": 0.03, + "grad_norm": 1.0781318823407806, + "learning_rate": 1.9871794871794873e-05, + "loss": 1.3472, + "step": 622 + }, + { + "epoch": 0.03, + "grad_norm": 1.2750709065044106, + "learning_rate": 1.9903846153846157e-05, + "loss": 1.3125, + "step": 623 + }, + { + "epoch": 0.03, + "grad_norm": 1.3771088269297513, + "learning_rate": 1.9935897435897437e-05, + "loss": 1.3945, + "step": 624 + }, + { + "epoch": 0.03, + "grad_norm": 1.201338877783735, + "learning_rate": 1.996794871794872e-05, + "loss": 1.334, + "step": 625 + }, + { + "epoch": 0.03, + "grad_norm": 1.27337359345505, + "learning_rate": 2e-05, + "loss": 1.2593, + "step": 626 + }, + { + "epoch": 0.03, + "grad_norm": 1.2154574733109216, + "learning_rate": 1.9999999878664707e-05, + "loss": 1.2593, + "step": 627 + }, + { + "epoch": 0.03, + "grad_norm": 1.4885944471570738, + "learning_rate": 1.999999951465882e-05, + "loss": 1.3374, + "step": 628 + }, + { + "epoch": 0.03, + "grad_norm": 1.4201872667389484, + "learning_rate": 1.999999890798236e-05, + "loss": 1.2197, + "step": 629 + }, + { + "epoch": 0.03, + "grad_norm": 1.381681285072961, + "learning_rate": 1.9999998058635333e-05, + "loss": 1.353, + "step": 630 + }, + { + "epoch": 0.03, + "grad_norm": 1.780615297047386, + "learning_rate": 1.999999696661776e-05, + "loss": 1.2563, + "step": 631 + }, + { + "epoch": 0.03, + "grad_norm": 1.4852794359538266, + "learning_rate": 1.999999563192967e-05, + "loss": 1.3301, + "step": 632 + }, + { + "epoch": 0.03, + "grad_norm": 1.0576917547511393, + "learning_rate": 1.9999994054571096e-05, + "loss": 1.2632, + "step": 633 + }, + { + "epoch": 0.03, + "grad_norm": 1.2324506864144815, + "learning_rate": 1.9999992234542078e-05, + "loss": 1.335, + "step": 634 + }, + { + "epoch": 0.03, + "grad_norm": 1.7130166185057643, + "learning_rate": 1.9999990171842654e-05, + "loss": 1.3242, + "step": 635 + }, + { + "epoch": 0.03, + "grad_norm": 1.638255887316567, + "learning_rate": 1.9999987866472878e-05, + "loss": 1.3271, + "step": 636 + }, + { + "epoch": 0.03, + "grad_norm": 1.13777185853665, + "learning_rate": 1.9999985318432804e-05, + "loss": 1.3618, + "step": 637 + }, + { + "epoch": 0.03, + "grad_norm": 1.3348508162826425, + "learning_rate": 1.9999982527722498e-05, + "loss": 1.3687, + "step": 638 + }, + { + "epoch": 0.03, + "grad_norm": 1.292891647696483, + "learning_rate": 1.9999979494342022e-05, + "loss": 1.2964, + "step": 639 + }, + { + "epoch": 0.03, + "grad_norm": 1.3064786446724772, + "learning_rate": 1.9999976218291455e-05, + "loss": 1.3174, + "step": 640 + }, + { + "epoch": 0.03, + "grad_norm": 1.3228995536700574, + "learning_rate": 1.9999972699570876e-05, + "loss": 1.3315, + "step": 641 + }, + { + "epoch": 0.03, + "grad_norm": 1.4839310999858324, + "learning_rate": 1.9999968938180364e-05, + "loss": 1.1321, + "step": 642 + }, + { + "epoch": 0.03, + "grad_norm": 1.3113297027999637, + "learning_rate": 1.9999964934120016e-05, + "loss": 1.4497, + "step": 643 + }, + { + "epoch": 0.03, + "grad_norm": 1.2262440934990706, + "learning_rate": 1.999996068738993e-05, + "loss": 1.3208, + "step": 644 + }, + { + "epoch": 0.03, + "grad_norm": 1.6094160007058462, + "learning_rate": 1.9999956197990205e-05, + "loss": 1.3833, + "step": 645 + }, + { + "epoch": 0.03, + "grad_norm": 1.40210192892748, + "learning_rate": 1.9999951465920953e-05, + "loss": 1.2588, + "step": 646 + }, + { + "epoch": 0.03, + "grad_norm": 1.3510930685282643, + "learning_rate": 1.9999946491182284e-05, + "loss": 1.1685, + "step": 647 + }, + { + "epoch": 0.03, + "grad_norm": 1.4178396760476157, + "learning_rate": 1.9999941273774327e-05, + "loss": 1.2607, + "step": 648 + }, + { + "epoch": 0.03, + "grad_norm": 1.3372679427060812, + "learning_rate": 1.9999935813697204e-05, + "loss": 1.2666, + "step": 649 + }, + { + "epoch": 0.03, + "grad_norm": 1.3955429564424948, + "learning_rate": 1.9999930110951043e-05, + "loss": 1.3135, + "step": 650 + }, + { + "epoch": 0.03, + "grad_norm": 1.513719089547528, + "learning_rate": 1.999992416553599e-05, + "loss": 1.3252, + "step": 651 + }, + { + "epoch": 0.03, + "grad_norm": 1.125039415775943, + "learning_rate": 1.9999917977452187e-05, + "loss": 1.125, + "step": 652 + }, + { + "epoch": 0.03, + "grad_norm": 1.1144443787965195, + "learning_rate": 1.9999911546699785e-05, + "loss": 1.2812, + "step": 653 + }, + { + "epoch": 0.03, + "grad_norm": 1.8090228940634654, + "learning_rate": 1.9999904873278933e-05, + "loss": 1.6245, + "step": 654 + }, + { + "epoch": 0.03, + "grad_norm": 1.3638275169155951, + "learning_rate": 1.9999897957189802e-05, + "loss": 1.2183, + "step": 655 + }, + { + "epoch": 0.03, + "grad_norm": 1.099661743913719, + "learning_rate": 1.9999890798432556e-05, + "loss": 1.4766, + "step": 656 + }, + { + "epoch": 0.03, + "grad_norm": 1.2569381199188554, + "learning_rate": 1.9999883397007366e-05, + "loss": 1.4209, + "step": 657 + }, + { + "epoch": 0.03, + "grad_norm": 1.3354351313729897, + "learning_rate": 1.999987575291442e-05, + "loss": 1.4453, + "step": 658 + }, + { + "epoch": 0.03, + "grad_norm": 1.5904187692645484, + "learning_rate": 1.9999867866153894e-05, + "loss": 1.3887, + "step": 659 + }, + { + "epoch": 0.03, + "grad_norm": 1.3520718152655091, + "learning_rate": 1.9999859736725984e-05, + "loss": 1.2485, + "step": 660 + }, + { + "epoch": 0.03, + "grad_norm": 1.0832849240843234, + "learning_rate": 1.9999851364630886e-05, + "loss": 1.3101, + "step": 661 + }, + { + "epoch": 0.03, + "grad_norm": 1.4484834850433754, + "learning_rate": 1.9999842749868808e-05, + "loss": 1.3623, + "step": 662 + }, + { + "epoch": 0.03, + "grad_norm": 1.3835434957454869, + "learning_rate": 1.9999833892439952e-05, + "loss": 1.458, + "step": 663 + }, + { + "epoch": 0.03, + "grad_norm": 1.4387506514228017, + "learning_rate": 1.9999824792344536e-05, + "loss": 1.3086, + "step": 664 + }, + { + "epoch": 0.03, + "grad_norm": 1.633722983139014, + "learning_rate": 1.999981544958278e-05, + "loss": 1.4619, + "step": 665 + }, + { + "epoch": 0.03, + "grad_norm": 1.4104499077626325, + "learning_rate": 1.9999805864154913e-05, + "loss": 1.4883, + "step": 666 + }, + { + "epoch": 0.03, + "grad_norm": 1.4024389487281625, + "learning_rate": 1.9999796036061164e-05, + "loss": 1.3306, + "step": 667 + }, + { + "epoch": 0.03, + "grad_norm": 1.4196268951832895, + "learning_rate": 1.9999785965301776e-05, + "loss": 1.334, + "step": 668 + }, + { + "epoch": 0.03, + "grad_norm": 1.5241322504684633, + "learning_rate": 1.999977565187699e-05, + "loss": 1.3804, + "step": 669 + }, + { + "epoch": 0.03, + "grad_norm": 1.322377393231278, + "learning_rate": 1.9999765095787055e-05, + "loss": 1.4146, + "step": 670 + }, + { + "epoch": 0.03, + "grad_norm": 1.2112986037417892, + "learning_rate": 1.999975429703223e-05, + "loss": 1.4146, + "step": 671 + }, + { + "epoch": 0.03, + "grad_norm": 1.7613183633276996, + "learning_rate": 1.999974325561278e-05, + "loss": 1.4287, + "step": 672 + }, + { + "epoch": 0.03, + "grad_norm": 1.415522763797125, + "learning_rate": 1.9999731971528965e-05, + "loss": 1.1929, + "step": 673 + }, + { + "epoch": 0.03, + "grad_norm": 1.2058028589701828, + "learning_rate": 1.999972044478107e-05, + "loss": 1.2112, + "step": 674 + }, + { + "epoch": 0.03, + "grad_norm": 1.2016766692285767, + "learning_rate": 1.999970867536936e-05, + "loss": 1.2207, + "step": 675 + }, + { + "epoch": 0.03, + "grad_norm": 1.3119343135913277, + "learning_rate": 1.9999696663294133e-05, + "loss": 1.3203, + "step": 676 + }, + { + "epoch": 0.03, + "grad_norm": 1.4714608051845939, + "learning_rate": 1.9999684408555673e-05, + "loss": 1.2769, + "step": 677 + }, + { + "epoch": 0.03, + "grad_norm": 1.2199275383656834, + "learning_rate": 1.9999671911154285e-05, + "loss": 1.2964, + "step": 678 + }, + { + "epoch": 0.03, + "grad_norm": 1.3147058269536414, + "learning_rate": 1.9999659171090263e-05, + "loss": 1.1973, + "step": 679 + }, + { + "epoch": 0.03, + "grad_norm": 1.7357123247497765, + "learning_rate": 1.9999646188363925e-05, + "loss": 1.4092, + "step": 680 + }, + { + "epoch": 0.03, + "grad_norm": 1.4919718189660889, + "learning_rate": 1.9999632962975578e-05, + "loss": 1.3359, + "step": 681 + }, + { + "epoch": 0.03, + "grad_norm": 1.2500333119084206, + "learning_rate": 1.999961949492555e-05, + "loss": 1.2661, + "step": 682 + }, + { + "epoch": 0.03, + "grad_norm": 1.3391601982711716, + "learning_rate": 1.999960578421416e-05, + "loss": 1.4282, + "step": 683 + }, + { + "epoch": 0.03, + "grad_norm": 1.1437023094598027, + "learning_rate": 1.999959183084175e-05, + "loss": 1.3047, + "step": 684 + }, + { + "epoch": 0.03, + "grad_norm": 1.2939766728937352, + "learning_rate": 1.999957763480865e-05, + "loss": 1.2656, + "step": 685 + }, + { + "epoch": 0.03, + "grad_norm": 1.3486653278963208, + "learning_rate": 1.999956319611521e-05, + "loss": 1.2534, + "step": 686 + }, + { + "epoch": 0.03, + "grad_norm": 1.2827714385498983, + "learning_rate": 1.9999548514761785e-05, + "loss": 1.3911, + "step": 687 + }, + { + "epoch": 0.03, + "grad_norm": 1.2473277660393025, + "learning_rate": 1.999953359074872e-05, + "loss": 1.292, + "step": 688 + }, + { + "epoch": 0.03, + "grad_norm": 1.6688484502239775, + "learning_rate": 1.999951842407638e-05, + "loss": 1.2451, + "step": 689 + }, + { + "epoch": 0.03, + "grad_norm": 1.4111806761504817, + "learning_rate": 1.9999503014745138e-05, + "loss": 1.4971, + "step": 690 + }, + { + "epoch": 0.03, + "grad_norm": 1.1807885396091584, + "learning_rate": 1.9999487362755366e-05, + "loss": 1.3833, + "step": 691 + }, + { + "epoch": 0.03, + "grad_norm": 1.229729630809447, + "learning_rate": 1.9999471468107444e-05, + "loss": 1.2485, + "step": 692 + }, + { + "epoch": 0.03, + "grad_norm": 1.3521840672967698, + "learning_rate": 1.9999455330801752e-05, + "loss": 1.1895, + "step": 693 + }, + { + "epoch": 0.03, + "grad_norm": 1.255910390114501, + "learning_rate": 1.999943895083869e-05, + "loss": 1.4165, + "step": 694 + }, + { + "epoch": 0.03, + "grad_norm": 1.1220685824056627, + "learning_rate": 1.999942232821865e-05, + "loss": 1.2554, + "step": 695 + }, + { + "epoch": 0.03, + "grad_norm": 1.3368643758760377, + "learning_rate": 1.999940546294204e-05, + "loss": 1.2964, + "step": 696 + }, + { + "epoch": 0.03, + "grad_norm": 1.2558627009782874, + "learning_rate": 1.9999388355009266e-05, + "loss": 1.4546, + "step": 697 + }, + { + "epoch": 0.03, + "grad_norm": 1.040379869762896, + "learning_rate": 1.9999371004420744e-05, + "loss": 1.2788, + "step": 698 + }, + { + "epoch": 0.03, + "grad_norm": 1.5726006030407322, + "learning_rate": 1.9999353411176893e-05, + "loss": 1.355, + "step": 699 + }, + { + "epoch": 0.03, + "grad_norm": 1.4255594994564806, + "learning_rate": 1.9999335575278143e-05, + "loss": 1.1953, + "step": 700 + }, + { + "epoch": 0.03, + "grad_norm": 1.0974653573582915, + "learning_rate": 1.9999317496724925e-05, + "loss": 1.3413, + "step": 701 + }, + { + "epoch": 0.03, + "grad_norm": 1.1409607887399758, + "learning_rate": 1.999929917551768e-05, + "loss": 1.4116, + "step": 702 + }, + { + "epoch": 0.03, + "grad_norm": 1.5090099684549179, + "learning_rate": 1.999928061165685e-05, + "loss": 1.4741, + "step": 703 + }, + { + "epoch": 0.03, + "grad_norm": 1.3732488839471322, + "learning_rate": 1.9999261805142885e-05, + "loss": 1.3623, + "step": 704 + }, + { + "epoch": 0.03, + "grad_norm": 1.2991165014985107, + "learning_rate": 1.9999242755976246e-05, + "loss": 1.2764, + "step": 705 + }, + { + "epoch": 0.03, + "grad_norm": 1.6110916976694905, + "learning_rate": 1.999922346415739e-05, + "loss": 1.3003, + "step": 706 + }, + { + "epoch": 0.03, + "grad_norm": 1.618427640991831, + "learning_rate": 1.9999203929686786e-05, + "loss": 1.3882, + "step": 707 + }, + { + "epoch": 0.03, + "grad_norm": 1.1866907206713557, + "learning_rate": 1.9999184152564907e-05, + "loss": 1.1997, + "step": 708 + }, + { + "epoch": 0.03, + "grad_norm": 1.468406094681342, + "learning_rate": 1.999916413279224e-05, + "loss": 1.4199, + "step": 709 + }, + { + "epoch": 0.03, + "grad_norm": 1.460738199686398, + "learning_rate": 1.9999143870369265e-05, + "loss": 1.1748, + "step": 710 + }, + { + "epoch": 0.03, + "grad_norm": 1.1683732039943022, + "learning_rate": 1.9999123365296473e-05, + "loss": 1.4941, + "step": 711 + }, + { + "epoch": 0.03, + "grad_norm": 1.6774087365998513, + "learning_rate": 1.9999102617574366e-05, + "loss": 1.4448, + "step": 712 + }, + { + "epoch": 0.03, + "grad_norm": 1.2136062351323862, + "learning_rate": 1.999908162720344e-05, + "loss": 1.3315, + "step": 713 + }, + { + "epoch": 0.03, + "grad_norm": 1.8453377615444224, + "learning_rate": 1.9999060394184214e-05, + "loss": 1.4028, + "step": 714 + }, + { + "epoch": 0.03, + "grad_norm": 1.900708096988948, + "learning_rate": 1.99990389185172e-05, + "loss": 1.561, + "step": 715 + }, + { + "epoch": 0.03, + "grad_norm": 1.3492499743946267, + "learning_rate": 1.999901720020291e-05, + "loss": 1.3774, + "step": 716 + }, + { + "epoch": 0.03, + "grad_norm": 1.3184260429591028, + "learning_rate": 1.9998995239241883e-05, + "loss": 1.376, + "step": 717 + }, + { + "epoch": 0.03, + "grad_norm": 1.2042285722651176, + "learning_rate": 1.9998973035634648e-05, + "loss": 1.3232, + "step": 718 + }, + { + "epoch": 0.03, + "grad_norm": 1.3413778800098985, + "learning_rate": 1.9998950589381743e-05, + "loss": 1.4038, + "step": 719 + }, + { + "epoch": 0.03, + "grad_norm": 1.2997924514913326, + "learning_rate": 1.9998927900483714e-05, + "loss": 1.2173, + "step": 720 + }, + { + "epoch": 0.03, + "grad_norm": 1.300975241832525, + "learning_rate": 1.9998904968941107e-05, + "loss": 1.3589, + "step": 721 + }, + { + "epoch": 0.03, + "grad_norm": 1.2371027962212908, + "learning_rate": 1.9998881794754484e-05, + "loss": 1.2871, + "step": 722 + }, + { + "epoch": 0.03, + "grad_norm": 1.2715649525142014, + "learning_rate": 1.9998858377924408e-05, + "loss": 1.4395, + "step": 723 + }, + { + "epoch": 0.03, + "grad_norm": 1.5480367133856647, + "learning_rate": 1.9998834718451444e-05, + "loss": 1.3467, + "step": 724 + }, + { + "epoch": 0.03, + "grad_norm": 1.3262576734297975, + "learning_rate": 1.999881081633616e-05, + "loss": 1.3428, + "step": 725 + }, + { + "epoch": 0.03, + "grad_norm": 1.0445114841380974, + "learning_rate": 1.999878667157915e-05, + "loss": 1.3618, + "step": 726 + }, + { + "epoch": 0.03, + "grad_norm": 1.4900141606877817, + "learning_rate": 1.9998762284180996e-05, + "loss": 1.4702, + "step": 727 + }, + { + "epoch": 0.04, + "grad_norm": 1.4158124879116423, + "learning_rate": 1.999873765414228e-05, + "loss": 1.3306, + "step": 728 + }, + { + "epoch": 0.04, + "grad_norm": 1.3306758153136669, + "learning_rate": 1.999871278146361e-05, + "loss": 1.4717, + "step": 729 + }, + { + "epoch": 0.04, + "grad_norm": 1.2493462164222073, + "learning_rate": 1.9998687666145585e-05, + "loss": 1.2705, + "step": 730 + }, + { + "epoch": 0.04, + "grad_norm": 1.2819037431166764, + "learning_rate": 1.9998662308188813e-05, + "loss": 1.3662, + "step": 731 + }, + { + "epoch": 0.04, + "grad_norm": 1.5485694140172468, + "learning_rate": 1.9998636707593913e-05, + "loss": 1.3242, + "step": 732 + }, + { + "epoch": 0.04, + "grad_norm": 1.6726187063486977, + "learning_rate": 1.9998610864361506e-05, + "loss": 1.2388, + "step": 733 + }, + { + "epoch": 0.04, + "grad_norm": 1.3027365315179606, + "learning_rate": 1.999858477849222e-05, + "loss": 1.394, + "step": 734 + }, + { + "epoch": 0.04, + "grad_norm": 1.63117591246242, + "learning_rate": 1.9998558449986684e-05, + "loss": 1.2656, + "step": 735 + }, + { + "epoch": 0.04, + "grad_norm": 1.4905958138989042, + "learning_rate": 1.9998531878845536e-05, + "loss": 1.437, + "step": 736 + }, + { + "epoch": 0.04, + "grad_norm": 1.4233560041293754, + "learning_rate": 1.999850506506943e-05, + "loss": 1.1279, + "step": 737 + }, + { + "epoch": 0.04, + "grad_norm": 1.2237265125736674, + "learning_rate": 1.999847800865901e-05, + "loss": 1.4512, + "step": 738 + }, + { + "epoch": 0.04, + "grad_norm": 1.2094435678456228, + "learning_rate": 1.9998450709614928e-05, + "loss": 1.3623, + "step": 739 + }, + { + "epoch": 0.04, + "grad_norm": 1.098222569374182, + "learning_rate": 1.9998423167937852e-05, + "loss": 1.4009, + "step": 740 + }, + { + "epoch": 0.04, + "grad_norm": 1.3836299143956212, + "learning_rate": 1.9998395383628457e-05, + "loss": 1.4668, + "step": 741 + }, + { + "epoch": 0.04, + "grad_norm": 1.3724904057857936, + "learning_rate": 1.9998367356687405e-05, + "loss": 1.3413, + "step": 742 + }, + { + "epoch": 0.04, + "grad_norm": 1.4922691058508695, + "learning_rate": 1.9998339087115378e-05, + "loss": 1.3569, + "step": 743 + }, + { + "epoch": 0.04, + "grad_norm": 1.4903142168930235, + "learning_rate": 1.9998310574913074e-05, + "loss": 1.4736, + "step": 744 + }, + { + "epoch": 0.04, + "grad_norm": 1.2572580244976974, + "learning_rate": 1.999828182008117e-05, + "loss": 1.1865, + "step": 745 + }, + { + "epoch": 0.04, + "grad_norm": 1.2150804774546125, + "learning_rate": 1.9998252822620373e-05, + "loss": 1.3745, + "step": 746 + }, + { + "epoch": 0.04, + "grad_norm": 1.2620322468289291, + "learning_rate": 1.9998223582531386e-05, + "loss": 1.3818, + "step": 747 + }, + { + "epoch": 0.04, + "grad_norm": 1.3504234177561378, + "learning_rate": 1.9998194099814913e-05, + "loss": 1.1621, + "step": 748 + }, + { + "epoch": 0.04, + "grad_norm": 1.3845840320273697, + "learning_rate": 1.9998164374471673e-05, + "loss": 1.2871, + "step": 749 + }, + { + "epoch": 0.04, + "grad_norm": 1.2180432235543677, + "learning_rate": 1.9998134406502384e-05, + "loss": 1.2368, + "step": 750 + }, + { + "epoch": 0.04, + "grad_norm": 1.566176899465898, + "learning_rate": 1.999810419590778e-05, + "loss": 1.5542, + "step": 751 + }, + { + "epoch": 0.04, + "grad_norm": 1.8016178704670875, + "learning_rate": 1.9998073742688592e-05, + "loss": 1.3696, + "step": 752 + }, + { + "epoch": 0.04, + "grad_norm": 1.3963695940306424, + "learning_rate": 1.9998043046845555e-05, + "loss": 1.2524, + "step": 753 + }, + { + "epoch": 0.04, + "grad_norm": 1.2385839620087902, + "learning_rate": 1.9998012108379417e-05, + "loss": 1.0708, + "step": 754 + }, + { + "epoch": 0.04, + "grad_norm": 0.9952189015926733, + "learning_rate": 1.9997980927290928e-05, + "loss": 1.2524, + "step": 755 + }, + { + "epoch": 0.04, + "grad_norm": 1.898800908272537, + "learning_rate": 1.9997949503580844e-05, + "loss": 1.4141, + "step": 756 + }, + { + "epoch": 0.04, + "grad_norm": 1.1096830610570383, + "learning_rate": 1.999791783724993e-05, + "loss": 1.209, + "step": 757 + }, + { + "epoch": 0.04, + "grad_norm": 1.3326798442394288, + "learning_rate": 1.999788592829895e-05, + "loss": 1.4028, + "step": 758 + }, + { + "epoch": 0.04, + "grad_norm": 1.193933965673473, + "learning_rate": 1.999785377672869e-05, + "loss": 1.2407, + "step": 759 + }, + { + "epoch": 0.04, + "grad_norm": 1.2446478852012381, + "learning_rate": 1.9997821382539914e-05, + "loss": 1.3652, + "step": 760 + }, + { + "epoch": 0.04, + "grad_norm": 1.083712490553, + "learning_rate": 1.9997788745733415e-05, + "loss": 1.2368, + "step": 761 + }, + { + "epoch": 0.04, + "grad_norm": 1.395052434574553, + "learning_rate": 1.9997755866309988e-05, + "loss": 1.3125, + "step": 762 + }, + { + "epoch": 0.04, + "grad_norm": 1.1210722001304796, + "learning_rate": 1.999772274427043e-05, + "loss": 1.4429, + "step": 763 + }, + { + "epoch": 0.04, + "grad_norm": 1.3111291398429166, + "learning_rate": 1.999768937961554e-05, + "loss": 1.3164, + "step": 764 + }, + { + "epoch": 0.04, + "grad_norm": 1.4478495588307805, + "learning_rate": 1.9997655772346132e-05, + "loss": 1.3882, + "step": 765 + }, + { + "epoch": 0.04, + "grad_norm": 1.2147992529334042, + "learning_rate": 1.999762192246302e-05, + "loss": 1.5459, + "step": 766 + }, + { + "epoch": 0.04, + "grad_norm": 0.972615469522833, + "learning_rate": 1.9997587829967027e-05, + "loss": 1.3262, + "step": 767 + }, + { + "epoch": 0.04, + "grad_norm": 1.467078828162015, + "learning_rate": 1.999755349485898e-05, + "loss": 1.3052, + "step": 768 + }, + { + "epoch": 0.04, + "grad_norm": 1.4327908569178656, + "learning_rate": 1.999751891713971e-05, + "loss": 1.2573, + "step": 769 + }, + { + "epoch": 0.04, + "grad_norm": 1.3749308319642244, + "learning_rate": 1.9997484096810054e-05, + "loss": 1.3789, + "step": 770 + }, + { + "epoch": 0.04, + "grad_norm": 1.7499466344693686, + "learning_rate": 1.999744903387087e-05, + "loss": 1.4399, + "step": 771 + }, + { + "epoch": 0.04, + "grad_norm": 1.460209871209967, + "learning_rate": 1.9997413728322992e-05, + "loss": 1.3213, + "step": 772 + }, + { + "epoch": 0.04, + "grad_norm": 1.5988281894137066, + "learning_rate": 1.9997378180167285e-05, + "loss": 1.52, + "step": 773 + }, + { + "epoch": 0.04, + "grad_norm": 1.3665061231996976, + "learning_rate": 1.999734238940461e-05, + "loss": 1.3228, + "step": 774 + }, + { + "epoch": 0.04, + "grad_norm": 1.686762339093321, + "learning_rate": 1.9997306356035838e-05, + "loss": 1.4863, + "step": 775 + }, + { + "epoch": 0.04, + "grad_norm": 1.1976695871601748, + "learning_rate": 1.9997270080061843e-05, + "loss": 1.4702, + "step": 776 + }, + { + "epoch": 0.04, + "grad_norm": 1.4044419557562702, + "learning_rate": 1.9997233561483503e-05, + "loss": 1.4351, + "step": 777 + }, + { + "epoch": 0.04, + "grad_norm": 1.3966351961035797, + "learning_rate": 1.9997196800301704e-05, + "loss": 1.2368, + "step": 778 + }, + { + "epoch": 0.04, + "grad_norm": 1.451908513577683, + "learning_rate": 1.9997159796517342e-05, + "loss": 1.4766, + "step": 779 + }, + { + "epoch": 0.04, + "grad_norm": 1.3883391465324681, + "learning_rate": 1.999712255013131e-05, + "loss": 1.3765, + "step": 780 + }, + { + "epoch": 0.04, + "grad_norm": 1.4875210138421147, + "learning_rate": 1.9997085061144514e-05, + "loss": 1.1914, + "step": 781 + }, + { + "epoch": 0.04, + "grad_norm": 1.108144098880012, + "learning_rate": 1.9997047329557867e-05, + "loss": 1.3037, + "step": 782 + }, + { + "epoch": 0.04, + "grad_norm": 1.2296080571831245, + "learning_rate": 1.9997009355372276e-05, + "loss": 1.3413, + "step": 783 + }, + { + "epoch": 0.04, + "grad_norm": 1.247361737424989, + "learning_rate": 1.9996971138588674e-05, + "loss": 1.2534, + "step": 784 + }, + { + "epoch": 0.04, + "grad_norm": 1.5941379594424552, + "learning_rate": 1.9996932679207977e-05, + "loss": 1.2617, + "step": 785 + }, + { + "epoch": 0.04, + "grad_norm": 1.2165656713257629, + "learning_rate": 1.999689397723113e-05, + "loss": 1.3613, + "step": 786 + }, + { + "epoch": 0.04, + "grad_norm": 1.3816984020156546, + "learning_rate": 1.9996855032659065e-05, + "loss": 1.3418, + "step": 787 + }, + { + "epoch": 0.04, + "grad_norm": 1.3090585611372187, + "learning_rate": 1.9996815845492722e-05, + "loss": 1.2456, + "step": 788 + }, + { + "epoch": 0.04, + "grad_norm": 1.2095302763845621, + "learning_rate": 1.9996776415733063e-05, + "loss": 1.3442, + "step": 789 + }, + { + "epoch": 0.04, + "grad_norm": 1.3766679969399749, + "learning_rate": 1.9996736743381037e-05, + "loss": 1.2803, + "step": 790 + }, + { + "epoch": 0.04, + "grad_norm": 1.3831792721401492, + "learning_rate": 1.9996696828437613e-05, + "loss": 1.1392, + "step": 791 + }, + { + "epoch": 0.04, + "grad_norm": 1.6131839924470461, + "learning_rate": 1.9996656670903753e-05, + "loss": 1.3677, + "step": 792 + }, + { + "epoch": 0.04, + "grad_norm": 1.4481877963153207, + "learning_rate": 1.999661627078044e-05, + "loss": 1.1931, + "step": 793 + }, + { + "epoch": 0.04, + "grad_norm": 1.3615849668811302, + "learning_rate": 1.9996575628068645e-05, + "loss": 1.3618, + "step": 794 + }, + { + "epoch": 0.04, + "grad_norm": 1.2446850940595102, + "learning_rate": 1.9996534742769355e-05, + "loss": 1.2017, + "step": 795 + }, + { + "epoch": 0.04, + "grad_norm": 1.1381553891867735, + "learning_rate": 1.999649361488357e-05, + "loss": 1.3569, + "step": 796 + }, + { + "epoch": 0.04, + "grad_norm": 1.1865100510548208, + "learning_rate": 1.9996452244412285e-05, + "loss": 1.2861, + "step": 797 + }, + { + "epoch": 0.04, + "grad_norm": 0.9535137323021992, + "learning_rate": 1.9996410631356496e-05, + "loss": 1.1475, + "step": 798 + }, + { + "epoch": 0.04, + "grad_norm": 1.2201821690479515, + "learning_rate": 1.9996368775717228e-05, + "loss": 1.3218, + "step": 799 + }, + { + "epoch": 0.04, + "grad_norm": 1.31421135017808, + "learning_rate": 1.9996326677495482e-05, + "loss": 1.415, + "step": 800 + }, + { + "epoch": 0.04, + "grad_norm": 1.6015538484040908, + "learning_rate": 1.9996284336692286e-05, + "loss": 1.2593, + "step": 801 + }, + { + "epoch": 0.04, + "grad_norm": 1.000818877230247, + "learning_rate": 1.9996241753308673e-05, + "loss": 1.2202, + "step": 802 + }, + { + "epoch": 0.04, + "grad_norm": 1.2117657084148095, + "learning_rate": 1.9996198927345665e-05, + "loss": 1.2285, + "step": 803 + }, + { + "epoch": 0.04, + "grad_norm": 1.4748773898628027, + "learning_rate": 1.9996155858804307e-05, + "loss": 1.2739, + "step": 804 + }, + { + "epoch": 0.04, + "grad_norm": 1.0239156855587495, + "learning_rate": 1.9996112547685645e-05, + "loss": 1.2827, + "step": 805 + }, + { + "epoch": 0.04, + "grad_norm": 1.1834695143488219, + "learning_rate": 1.9996068993990727e-05, + "loss": 1.3262, + "step": 806 + }, + { + "epoch": 0.04, + "grad_norm": 1.297795622687012, + "learning_rate": 1.9996025197720615e-05, + "loss": 1.3218, + "step": 807 + }, + { + "epoch": 0.04, + "grad_norm": 1.268877234777004, + "learning_rate": 1.999598115887637e-05, + "loss": 1.3281, + "step": 808 + }, + { + "epoch": 0.04, + "grad_norm": 1.6616671974043395, + "learning_rate": 1.9995936877459053e-05, + "loss": 1.2124, + "step": 809 + }, + { + "epoch": 0.04, + "grad_norm": 1.2708749111895, + "learning_rate": 1.999589235346975e-05, + "loss": 1.4038, + "step": 810 + }, + { + "epoch": 0.04, + "grad_norm": 1.0576109054973595, + "learning_rate": 1.9995847586909537e-05, + "loss": 1.4028, + "step": 811 + }, + { + "epoch": 0.04, + "grad_norm": 1.30430405411384, + "learning_rate": 1.9995802577779498e-05, + "loss": 1.2432, + "step": 812 + }, + { + "epoch": 0.04, + "grad_norm": 1.3926925114135567, + "learning_rate": 1.9995757326080727e-05, + "loss": 1.1865, + "step": 813 + }, + { + "epoch": 0.04, + "grad_norm": 1.352358047134358, + "learning_rate": 1.999571183181432e-05, + "loss": 1.3599, + "step": 814 + }, + { + "epoch": 0.04, + "grad_norm": 1.2840718057316223, + "learning_rate": 1.9995666094981385e-05, + "loss": 1.3545, + "step": 815 + }, + { + "epoch": 0.04, + "grad_norm": 1.1359953155079805, + "learning_rate": 1.9995620115583033e-05, + "loss": 1.1272, + "step": 816 + }, + { + "epoch": 0.04, + "grad_norm": 1.240916883898558, + "learning_rate": 1.999557389362037e-05, + "loss": 1.2437, + "step": 817 + }, + { + "epoch": 0.04, + "grad_norm": 1.3739790811888415, + "learning_rate": 1.9995527429094534e-05, + "loss": 1.3506, + "step": 818 + }, + { + "epoch": 0.04, + "grad_norm": 1.2912649872504953, + "learning_rate": 1.9995480722006636e-05, + "loss": 1.5347, + "step": 819 + }, + { + "epoch": 0.04, + "grad_norm": 1.3565251578259534, + "learning_rate": 1.999543377235782e-05, + "loss": 1.1157, + "step": 820 + }, + { + "epoch": 0.04, + "grad_norm": 1.6728270669624243, + "learning_rate": 1.999538658014922e-05, + "loss": 1.4023, + "step": 821 + }, + { + "epoch": 0.04, + "grad_norm": 1.504838627499413, + "learning_rate": 1.9995339145381982e-05, + "loss": 1.2886, + "step": 822 + }, + { + "epoch": 0.04, + "grad_norm": 1.2483112189876708, + "learning_rate": 1.999529146805726e-05, + "loss": 1.2349, + "step": 823 + }, + { + "epoch": 0.04, + "grad_norm": 1.335802161261752, + "learning_rate": 1.999524354817621e-05, + "loss": 1.4175, + "step": 824 + }, + { + "epoch": 0.04, + "grad_norm": 1.2295105602634124, + "learning_rate": 1.999519538573999e-05, + "loss": 1.3979, + "step": 825 + }, + { + "epoch": 0.04, + "grad_norm": 1.0403257473152625, + "learning_rate": 1.9995146980749777e-05, + "loss": 1.1855, + "step": 826 + }, + { + "epoch": 0.04, + "grad_norm": 1.259663128552189, + "learning_rate": 1.999509833320674e-05, + "loss": 1.3735, + "step": 827 + }, + { + "epoch": 0.04, + "grad_norm": 1.516548450973032, + "learning_rate": 1.999504944311206e-05, + "loss": 1.4141, + "step": 828 + }, + { + "epoch": 0.04, + "grad_norm": 1.3346623828301412, + "learning_rate": 1.9995000310466927e-05, + "loss": 1.3379, + "step": 829 + }, + { + "epoch": 0.04, + "grad_norm": 1.3868452669160183, + "learning_rate": 1.999495093527253e-05, + "loss": 1.2559, + "step": 830 + }, + { + "epoch": 0.04, + "grad_norm": 1.44761352191918, + "learning_rate": 1.9994901317530067e-05, + "loss": 1.3979, + "step": 831 + }, + { + "epoch": 0.04, + "grad_norm": 1.3263519078193244, + "learning_rate": 1.999485145724074e-05, + "loss": 1.2021, + "step": 832 + }, + { + "epoch": 0.04, + "grad_norm": 1.3812156406234244, + "learning_rate": 1.9994801354405768e-05, + "loss": 1.2783, + "step": 833 + }, + { + "epoch": 0.04, + "grad_norm": 1.5427679508642556, + "learning_rate": 1.9994751009026355e-05, + "loss": 1.2104, + "step": 834 + }, + { + "epoch": 0.04, + "grad_norm": 0.8903903131415463, + "learning_rate": 1.9994700421103734e-05, + "loss": 1.2075, + "step": 835 + }, + { + "epoch": 0.04, + "grad_norm": 1.0041952882628502, + "learning_rate": 1.9994649590639124e-05, + "loss": 1.2192, + "step": 836 + }, + { + "epoch": 0.04, + "grad_norm": 1.3180355062295364, + "learning_rate": 1.999459851763376e-05, + "loss": 1.4043, + "step": 837 + }, + { + "epoch": 0.04, + "grad_norm": 1.2183398651805275, + "learning_rate": 1.9994547202088886e-05, + "loss": 1.2729, + "step": 838 + }, + { + "epoch": 0.04, + "grad_norm": 1.2664641140378452, + "learning_rate": 1.9994495644005746e-05, + "loss": 1.3604, + "step": 839 + }, + { + "epoch": 0.04, + "grad_norm": 1.1983467702710018, + "learning_rate": 1.9994443843385583e-05, + "loss": 1.3555, + "step": 840 + }, + { + "epoch": 0.04, + "grad_norm": 1.320101803469919, + "learning_rate": 1.9994391800229666e-05, + "loss": 1.3906, + "step": 841 + }, + { + "epoch": 0.04, + "grad_norm": 1.333045435217759, + "learning_rate": 1.999433951453925e-05, + "loss": 1.4414, + "step": 842 + }, + { + "epoch": 0.04, + "grad_norm": 1.372392634761786, + "learning_rate": 1.999428698631561e-05, + "loss": 1.2549, + "step": 843 + }, + { + "epoch": 0.04, + "grad_norm": 1.4853586461352777, + "learning_rate": 1.9994234215560014e-05, + "loss": 1.2729, + "step": 844 + }, + { + "epoch": 0.04, + "grad_norm": 1.5212548236723744, + "learning_rate": 1.9994181202273745e-05, + "loss": 1.2544, + "step": 845 + }, + { + "epoch": 0.04, + "grad_norm": 1.3969086957767294, + "learning_rate": 1.999412794645809e-05, + "loss": 1.2773, + "step": 846 + }, + { + "epoch": 0.04, + "grad_norm": 1.1942228609319236, + "learning_rate": 1.9994074448114342e-05, + "loss": 1.3682, + "step": 847 + }, + { + "epoch": 0.04, + "grad_norm": 1.2821108163165773, + "learning_rate": 1.99940207072438e-05, + "loss": 1.4014, + "step": 848 + }, + { + "epoch": 0.04, + "grad_norm": 1.573689769344591, + "learning_rate": 1.9993966723847766e-05, + "loss": 1.4219, + "step": 849 + }, + { + "epoch": 0.04, + "grad_norm": 1.032525793857006, + "learning_rate": 1.999391249792755e-05, + "loss": 1.2939, + "step": 850 + }, + { + "epoch": 0.04, + "grad_norm": 1.3641609972298137, + "learning_rate": 1.999385802948447e-05, + "loss": 1.4458, + "step": 851 + }, + { + "epoch": 0.04, + "grad_norm": 1.2748678745258037, + "learning_rate": 1.9993803318519845e-05, + "loss": 1.3652, + "step": 852 + }, + { + "epoch": 0.04, + "grad_norm": 1.1495923719849077, + "learning_rate": 1.9993748365035004e-05, + "loss": 1.356, + "step": 853 + }, + { + "epoch": 0.04, + "grad_norm": 1.4585380361442615, + "learning_rate": 1.999369316903128e-05, + "loss": 1.3691, + "step": 854 + }, + { + "epoch": 0.04, + "grad_norm": 1.5971893874586247, + "learning_rate": 1.999363773051002e-05, + "loss": 1.335, + "step": 855 + }, + { + "epoch": 0.04, + "grad_norm": 1.2782834138306551, + "learning_rate": 1.9993582049472554e-05, + "loss": 1.355, + "step": 856 + }, + { + "epoch": 0.04, + "grad_norm": 1.3735370733867682, + "learning_rate": 1.9993526125920245e-05, + "loss": 1.2002, + "step": 857 + }, + { + "epoch": 0.04, + "grad_norm": 1.4777198320293181, + "learning_rate": 1.9993469959854445e-05, + "loss": 1.4048, + "step": 858 + }, + { + "epoch": 0.04, + "grad_norm": 1.4690241627416423, + "learning_rate": 1.9993413551276523e-05, + "loss": 1.2598, + "step": 859 + }, + { + "epoch": 0.04, + "grad_norm": 1.5494065605653378, + "learning_rate": 1.9993356900187843e-05, + "loss": 1.4937, + "step": 860 + }, + { + "epoch": 0.04, + "grad_norm": 1.2696606634811864, + "learning_rate": 1.9993300006589775e-05, + "loss": 1.3706, + "step": 861 + }, + { + "epoch": 0.04, + "grad_norm": 1.4664322689010696, + "learning_rate": 1.999324287048371e-05, + "loss": 1.2827, + "step": 862 + }, + { + "epoch": 0.04, + "grad_norm": 1.26354958260517, + "learning_rate": 1.999318549187103e-05, + "loss": 1.3359, + "step": 863 + }, + { + "epoch": 0.04, + "grad_norm": 1.286147290071792, + "learning_rate": 1.9993127870753123e-05, + "loss": 1.5239, + "step": 864 + }, + { + "epoch": 0.04, + "grad_norm": 1.5487074695479017, + "learning_rate": 1.9993070007131395e-05, + "loss": 1.5107, + "step": 865 + }, + { + "epoch": 0.04, + "grad_norm": 1.3287771996486357, + "learning_rate": 1.9993011901007245e-05, + "loss": 1.395, + "step": 866 + }, + { + "epoch": 0.04, + "grad_norm": 1.1780584078649199, + "learning_rate": 1.9992953552382085e-05, + "loss": 1.4663, + "step": 867 + }, + { + "epoch": 0.04, + "grad_norm": 1.4292089563685784, + "learning_rate": 1.9992894961257332e-05, + "loss": 1.4868, + "step": 868 + }, + { + "epoch": 0.04, + "grad_norm": 1.6383959056472281, + "learning_rate": 1.9992836127634402e-05, + "loss": 1.2896, + "step": 869 + }, + { + "epoch": 0.04, + "grad_norm": 1.1862031528209118, + "learning_rate": 1.999277705151473e-05, + "loss": 1.2622, + "step": 870 + }, + { + "epoch": 0.04, + "grad_norm": 1.358043693780465, + "learning_rate": 1.9992717732899746e-05, + "loss": 1.2949, + "step": 871 + }, + { + "epoch": 0.04, + "grad_norm": 1.5875828308957856, + "learning_rate": 1.9992658171790893e-05, + "loss": 1.4966, + "step": 872 + }, + { + "epoch": 0.04, + "grad_norm": 1.3187869290512495, + "learning_rate": 1.999259836818961e-05, + "loss": 1.2563, + "step": 873 + }, + { + "epoch": 0.04, + "grad_norm": 1.2673226435682952, + "learning_rate": 1.9992538322097352e-05, + "loss": 1.374, + "step": 874 + }, + { + "epoch": 0.04, + "grad_norm": 1.2413983095599295, + "learning_rate": 1.9992478033515578e-05, + "loss": 1.3076, + "step": 875 + }, + { + "epoch": 0.04, + "grad_norm": 1.6455998392466549, + "learning_rate": 1.9992417502445746e-05, + "loss": 1.4648, + "step": 876 + }, + { + "epoch": 0.04, + "grad_norm": 1.1873026079802493, + "learning_rate": 1.9992356728889332e-05, + "loss": 1.4175, + "step": 877 + }, + { + "epoch": 0.04, + "grad_norm": 1.2235097035104556, + "learning_rate": 1.9992295712847802e-05, + "loss": 1.3604, + "step": 878 + }, + { + "epoch": 0.04, + "grad_norm": 1.4914296178169537, + "learning_rate": 1.9992234454322643e-05, + "loss": 1.3325, + "step": 879 + }, + { + "epoch": 0.04, + "grad_norm": 0.9991799371724274, + "learning_rate": 1.9992172953315343e-05, + "loss": 1.1416, + "step": 880 + }, + { + "epoch": 0.04, + "grad_norm": 1.3410768541678202, + "learning_rate": 1.9992111209827386e-05, + "loss": 1.3164, + "step": 881 + }, + { + "epoch": 0.04, + "grad_norm": 1.2598398976427776, + "learning_rate": 1.999204922386028e-05, + "loss": 1.2388, + "step": 882 + }, + { + "epoch": 0.04, + "grad_norm": 1.3398894209183871, + "learning_rate": 1.999198699541552e-05, + "loss": 1.3042, + "step": 883 + }, + { + "epoch": 0.04, + "grad_norm": 1.2918563458745362, + "learning_rate": 1.999192452449463e-05, + "loss": 1.3545, + "step": 884 + }, + { + "epoch": 0.04, + "grad_norm": 1.4364657216682528, + "learning_rate": 1.999186181109911e-05, + "loss": 1.3535, + "step": 885 + }, + { + "epoch": 0.04, + "grad_norm": 1.0458957581471489, + "learning_rate": 1.999179885523049e-05, + "loss": 1.2451, + "step": 886 + }, + { + "epoch": 0.04, + "grad_norm": 1.1827555740301956, + "learning_rate": 1.9991735656890293e-05, + "loss": 1.3438, + "step": 887 + }, + { + "epoch": 0.04, + "grad_norm": 1.2764834425040195, + "learning_rate": 1.9991672216080064e-05, + "loss": 1.1558, + "step": 888 + }, + { + "epoch": 0.04, + "grad_norm": 1.58680902234929, + "learning_rate": 1.999160853280133e-05, + "loss": 1.4512, + "step": 889 + }, + { + "epoch": 0.04, + "grad_norm": 1.2793645510398626, + "learning_rate": 1.9991544607055642e-05, + "loss": 1.3193, + "step": 890 + }, + { + "epoch": 0.04, + "grad_norm": 1.3759798622513246, + "learning_rate": 1.999148043884455e-05, + "loss": 1.4072, + "step": 891 + }, + { + "epoch": 0.04, + "grad_norm": 1.2371516768307143, + "learning_rate": 1.9991416028169612e-05, + "loss": 1.3638, + "step": 892 + }, + { + "epoch": 0.04, + "grad_norm": 1.412787366157253, + "learning_rate": 1.999135137503239e-05, + "loss": 1.1118, + "step": 893 + }, + { + "epoch": 0.04, + "grad_norm": 1.1218864924868144, + "learning_rate": 1.9991286479434456e-05, + "loss": 1.3115, + "step": 894 + }, + { + "epoch": 0.04, + "grad_norm": 1.660083387521531, + "learning_rate": 1.999122134137738e-05, + "loss": 1.3022, + "step": 895 + }, + { + "epoch": 0.04, + "grad_norm": 1.1623454098746682, + "learning_rate": 1.9991155960862743e-05, + "loss": 1.2378, + "step": 896 + }, + { + "epoch": 0.04, + "grad_norm": 1.6288733435735185, + "learning_rate": 1.999109033789214e-05, + "loss": 1.3486, + "step": 897 + }, + { + "epoch": 0.04, + "grad_norm": 1.3826216979703472, + "learning_rate": 1.9991024472467156e-05, + "loss": 1.3203, + "step": 898 + }, + { + "epoch": 0.04, + "grad_norm": 1.313357889627867, + "learning_rate": 1.9990958364589388e-05, + "loss": 1.3354, + "step": 899 + }, + { + "epoch": 0.04, + "grad_norm": 1.5228343365715902, + "learning_rate": 1.999089201426044e-05, + "loss": 1.3755, + "step": 900 + }, + { + "epoch": 0.04, + "grad_norm": 1.3529503812458403, + "learning_rate": 1.999082542148193e-05, + "loss": 1.4292, + "step": 901 + }, + { + "epoch": 0.04, + "grad_norm": 1.4265214448601857, + "learning_rate": 1.9990758586255467e-05, + "loss": 1.3364, + "step": 902 + }, + { + "epoch": 0.04, + "grad_norm": 1.1650873108662794, + "learning_rate": 1.9990691508582678e-05, + "loss": 1.3408, + "step": 903 + }, + { + "epoch": 0.04, + "grad_norm": 1.3896385042247748, + "learning_rate": 1.9990624188465183e-05, + "loss": 1.5249, + "step": 904 + }, + { + "epoch": 0.04, + "grad_norm": 1.3120643062527504, + "learning_rate": 1.9990556625904623e-05, + "loss": 1.3193, + "step": 905 + }, + { + "epoch": 0.04, + "grad_norm": 1.5959920097069034, + "learning_rate": 1.9990488820902633e-05, + "loss": 1.2778, + "step": 906 + }, + { + "epoch": 0.04, + "grad_norm": 1.1769556939383408, + "learning_rate": 1.9990420773460864e-05, + "loss": 1.3765, + "step": 907 + }, + { + "epoch": 0.04, + "grad_norm": 1.3830733122713008, + "learning_rate": 1.999035248358096e-05, + "loss": 1.3867, + "step": 908 + }, + { + "epoch": 0.04, + "grad_norm": 1.3307473572812227, + "learning_rate": 1.999028395126458e-05, + "loss": 1.395, + "step": 909 + }, + { + "epoch": 0.04, + "grad_norm": 1.280182797255347, + "learning_rate": 1.9990215176513394e-05, + "loss": 1.3325, + "step": 910 + }, + { + "epoch": 0.04, + "grad_norm": 1.463418313649634, + "learning_rate": 1.9990146159329065e-05, + "loss": 1.3774, + "step": 911 + }, + { + "epoch": 0.04, + "grad_norm": 1.3738754408926348, + "learning_rate": 1.9990076899713268e-05, + "loss": 1.4053, + "step": 912 + }, + { + "epoch": 0.04, + "grad_norm": 1.3597670053585398, + "learning_rate": 1.999000739766768e-05, + "loss": 1.2642, + "step": 913 + }, + { + "epoch": 0.04, + "grad_norm": 1.2817434933570393, + "learning_rate": 1.9989937653193995e-05, + "loss": 1.3853, + "step": 914 + }, + { + "epoch": 0.04, + "grad_norm": 1.1579417826711578, + "learning_rate": 1.9989867666293904e-05, + "loss": 1.3838, + "step": 915 + }, + { + "epoch": 0.04, + "grad_norm": 1.2545116621573145, + "learning_rate": 1.9989797436969103e-05, + "loss": 1.3091, + "step": 916 + }, + { + "epoch": 0.04, + "grad_norm": 1.3782429480905622, + "learning_rate": 1.9989726965221298e-05, + "loss": 1.3906, + "step": 917 + }, + { + "epoch": 0.04, + "grad_norm": 1.1879849344972098, + "learning_rate": 1.99896562510522e-05, + "loss": 1.3843, + "step": 918 + }, + { + "epoch": 0.04, + "grad_norm": 1.471959636571595, + "learning_rate": 1.9989585294463518e-05, + "loss": 1.3091, + "step": 919 + }, + { + "epoch": 0.04, + "grad_norm": 1.377240148762396, + "learning_rate": 1.998951409545698e-05, + "loss": 1.2275, + "step": 920 + }, + { + "epoch": 0.04, + "grad_norm": 1.2101114339339933, + "learning_rate": 1.9989442654034315e-05, + "loss": 1.3799, + "step": 921 + }, + { + "epoch": 0.04, + "grad_norm": 1.3648937558088416, + "learning_rate": 1.9989370970197257e-05, + "loss": 1.332, + "step": 922 + }, + { + "epoch": 0.04, + "grad_norm": 1.702236328558717, + "learning_rate": 1.998929904394754e-05, + "loss": 1.3672, + "step": 923 + }, + { + "epoch": 0.04, + "grad_norm": 1.6932267976650008, + "learning_rate": 1.9989226875286913e-05, + "loss": 1.4204, + "step": 924 + }, + { + "epoch": 0.04, + "grad_norm": 0.9774939856923439, + "learning_rate": 1.9989154464217128e-05, + "loss": 1.252, + "step": 925 + }, + { + "epoch": 0.04, + "grad_norm": 1.0934491069369945, + "learning_rate": 1.9989081810739943e-05, + "loss": 1.2261, + "step": 926 + }, + { + "epoch": 0.04, + "grad_norm": 1.38808903886525, + "learning_rate": 1.9989008914857115e-05, + "loss": 1.4512, + "step": 927 + }, + { + "epoch": 0.04, + "grad_norm": 1.3685048522869565, + "learning_rate": 1.998893577657042e-05, + "loss": 1.3633, + "step": 928 + }, + { + "epoch": 0.04, + "grad_norm": 1.492911551380535, + "learning_rate": 1.998886239588163e-05, + "loss": 1.3516, + "step": 929 + }, + { + "epoch": 0.04, + "grad_norm": 1.161716391501544, + "learning_rate": 1.9988788772792523e-05, + "loss": 1.2988, + "step": 930 + }, + { + "epoch": 0.04, + "grad_norm": 1.3969554674670923, + "learning_rate": 1.998871490730489e-05, + "loss": 1.356, + "step": 931 + }, + { + "epoch": 0.04, + "grad_norm": 1.2416667768295848, + "learning_rate": 1.9988640799420524e-05, + "loss": 0.9905, + "step": 932 + }, + { + "epoch": 0.04, + "grad_norm": 1.3794216875013767, + "learning_rate": 1.9988566449141223e-05, + "loss": 1.3296, + "step": 933 + }, + { + "epoch": 0.04, + "grad_norm": 1.3525392740303408, + "learning_rate": 1.9988491856468785e-05, + "loss": 1.3052, + "step": 934 + }, + { + "epoch": 0.04, + "grad_norm": 1.4813995662256194, + "learning_rate": 1.9988417021405027e-05, + "loss": 1.2573, + "step": 935 + }, + { + "epoch": 0.05, + "grad_norm": 1.5574296899146205, + "learning_rate": 1.998834194395176e-05, + "loss": 1.3887, + "step": 936 + }, + { + "epoch": 0.05, + "grad_norm": 1.2777452053799485, + "learning_rate": 1.9988266624110813e-05, + "loss": 1.2407, + "step": 937 + }, + { + "epoch": 0.05, + "grad_norm": 1.199796946396947, + "learning_rate": 1.9988191061884005e-05, + "loss": 1.2485, + "step": 938 + }, + { + "epoch": 0.05, + "grad_norm": 1.5543231980934835, + "learning_rate": 1.9988115257273176e-05, + "loss": 1.3604, + "step": 939 + }, + { + "epoch": 0.05, + "grad_norm": 1.4342811199367758, + "learning_rate": 1.9988039210280167e-05, + "loss": 1.3174, + "step": 940 + }, + { + "epoch": 0.05, + "grad_norm": 1.2253449183968281, + "learning_rate": 1.9987962920906816e-05, + "loss": 1.3066, + "step": 941 + }, + { + "epoch": 0.05, + "grad_norm": 1.061222547339291, + "learning_rate": 1.998788638915498e-05, + "loss": 1.3638, + "step": 942 + }, + { + "epoch": 0.05, + "grad_norm": 1.2021105385477442, + "learning_rate": 1.9987809615026513e-05, + "loss": 1.3774, + "step": 943 + }, + { + "epoch": 0.05, + "grad_norm": 1.25154549549052, + "learning_rate": 1.998773259852328e-05, + "loss": 1.1147, + "step": 944 + }, + { + "epoch": 0.05, + "grad_norm": 1.3440440674566698, + "learning_rate": 1.9987655339647153e-05, + "loss": 1.415, + "step": 945 + }, + { + "epoch": 0.05, + "grad_norm": 1.1973225464047557, + "learning_rate": 1.99875778384e-05, + "loss": 1.1968, + "step": 946 + }, + { + "epoch": 0.05, + "grad_norm": 0.9558058021713994, + "learning_rate": 1.998750009478371e-05, + "loss": 1.3579, + "step": 947 + }, + { + "epoch": 0.05, + "grad_norm": 1.152101376464106, + "learning_rate": 1.9987422108800157e-05, + "loss": 1.2856, + "step": 948 + }, + { + "epoch": 0.05, + "grad_norm": 1.5544153708756336, + "learning_rate": 1.998734388045125e-05, + "loss": 1.3643, + "step": 949 + }, + { + "epoch": 0.05, + "grad_norm": 1.6908365237396161, + "learning_rate": 1.998726540973887e-05, + "loss": 1.3037, + "step": 950 + }, + { + "epoch": 0.05, + "grad_norm": 1.2068550869487893, + "learning_rate": 1.9987186696664937e-05, + "loss": 1.2026, + "step": 951 + }, + { + "epoch": 0.05, + "grad_norm": 1.3372898382685539, + "learning_rate": 1.9987107741231352e-05, + "loss": 1.0938, + "step": 952 + }, + { + "epoch": 0.05, + "grad_norm": 1.2692466474211135, + "learning_rate": 1.9987028543440035e-05, + "loss": 1.2944, + "step": 953 + }, + { + "epoch": 0.05, + "grad_norm": 0.9370550324299994, + "learning_rate": 1.9986949103292904e-05, + "loss": 1.187, + "step": 954 + }, + { + "epoch": 0.05, + "grad_norm": 1.2263779901944991, + "learning_rate": 1.998686942079189e-05, + "loss": 1.1904, + "step": 955 + }, + { + "epoch": 0.05, + "grad_norm": 1.4355687661452312, + "learning_rate": 1.9986789495938922e-05, + "loss": 1.332, + "step": 956 + }, + { + "epoch": 0.05, + "grad_norm": 1.0667375738621068, + "learning_rate": 1.9986709328735944e-05, + "loss": 1.3604, + "step": 957 + }, + { + "epoch": 0.05, + "grad_norm": 0.8746824338351591, + "learning_rate": 1.9986628919184904e-05, + "loss": 1.2314, + "step": 958 + }, + { + "epoch": 0.05, + "grad_norm": 1.59216158893926, + "learning_rate": 1.9986548267287745e-05, + "loss": 1.3662, + "step": 959 + }, + { + "epoch": 0.05, + "grad_norm": 1.3438436318676739, + "learning_rate": 1.998646737304643e-05, + "loss": 1.2744, + "step": 960 + }, + { + "epoch": 0.05, + "grad_norm": 1.2138084069927937, + "learning_rate": 1.9986386236462926e-05, + "loss": 1.4468, + "step": 961 + }, + { + "epoch": 0.05, + "grad_norm": 1.120099811063583, + "learning_rate": 1.998630485753919e-05, + "loss": 1.2373, + "step": 962 + }, + { + "epoch": 0.05, + "grad_norm": 1.1876037110201647, + "learning_rate": 1.998622323627721e-05, + "loss": 1.3599, + "step": 963 + }, + { + "epoch": 0.05, + "grad_norm": 1.550154343452363, + "learning_rate": 1.998614137267896e-05, + "loss": 1.3286, + "step": 964 + }, + { + "epoch": 0.05, + "grad_norm": 1.347473590275112, + "learning_rate": 1.998605926674642e-05, + "loss": 1.2539, + "step": 965 + }, + { + "epoch": 0.05, + "grad_norm": 1.265254502880324, + "learning_rate": 1.9985976918481595e-05, + "loss": 1.3901, + "step": 966 + }, + { + "epoch": 0.05, + "grad_norm": 1.384265812571524, + "learning_rate": 1.9985894327886476e-05, + "loss": 1.2671, + "step": 967 + }, + { + "epoch": 0.05, + "grad_norm": 2.0054482477943014, + "learning_rate": 1.9985811494963074e-05, + "loss": 1.3184, + "step": 968 + }, + { + "epoch": 0.05, + "grad_norm": 1.2901370282303524, + "learning_rate": 1.9985728419713388e-05, + "loss": 1.3042, + "step": 969 + }, + { + "epoch": 0.05, + "grad_norm": 1.3917510134693893, + "learning_rate": 1.998564510213944e-05, + "loss": 1.2856, + "step": 970 + }, + { + "epoch": 0.05, + "grad_norm": 1.5084494731023785, + "learning_rate": 1.9985561542243256e-05, + "loss": 1.2646, + "step": 971 + }, + { + "epoch": 0.05, + "grad_norm": 1.2970310646779586, + "learning_rate": 1.998547774002686e-05, + "loss": 1.439, + "step": 972 + }, + { + "epoch": 0.05, + "grad_norm": 1.8433551361630716, + "learning_rate": 1.9985393695492283e-05, + "loss": 1.5034, + "step": 973 + }, + { + "epoch": 0.05, + "grad_norm": 1.4097486765759444, + "learning_rate": 1.998530940864157e-05, + "loss": 1.3135, + "step": 974 + }, + { + "epoch": 0.05, + "grad_norm": 1.1637630868752868, + "learning_rate": 1.998522487947676e-05, + "loss": 1.3672, + "step": 975 + }, + { + "epoch": 0.05, + "grad_norm": 1.328044158647447, + "learning_rate": 1.9985140107999908e-05, + "loss": 1.4014, + "step": 976 + }, + { + "epoch": 0.05, + "grad_norm": 1.3562911381641412, + "learning_rate": 1.9985055094213072e-05, + "loss": 1.3242, + "step": 977 + }, + { + "epoch": 0.05, + "grad_norm": 1.4796538737526004, + "learning_rate": 1.9984969838118316e-05, + "loss": 1.2754, + "step": 978 + }, + { + "epoch": 0.05, + "grad_norm": 1.3693163340337076, + "learning_rate": 1.9984884339717704e-05, + "loss": 1.3291, + "step": 979 + }, + { + "epoch": 0.05, + "grad_norm": 1.302252228077707, + "learning_rate": 1.9984798599013315e-05, + "loss": 1.3442, + "step": 980 + }, + { + "epoch": 0.05, + "grad_norm": 1.447844203185186, + "learning_rate": 1.9984712616007226e-05, + "loss": 1.3872, + "step": 981 + }, + { + "epoch": 0.05, + "grad_norm": 1.2426830482315911, + "learning_rate": 1.998462639070153e-05, + "loss": 1.293, + "step": 982 + }, + { + "epoch": 0.05, + "grad_norm": 1.1556279308136506, + "learning_rate": 1.998453992309831e-05, + "loss": 1.1377, + "step": 983 + }, + { + "epoch": 0.05, + "grad_norm": 1.159318491672372, + "learning_rate": 1.9984453213199673e-05, + "loss": 1.3896, + "step": 984 + }, + { + "epoch": 0.05, + "grad_norm": 1.425591113281386, + "learning_rate": 1.9984366261007716e-05, + "loss": 1.2471, + "step": 985 + }, + { + "epoch": 0.05, + "grad_norm": 1.2896206053205888, + "learning_rate": 1.998427906652456e-05, + "loss": 1.1997, + "step": 986 + }, + { + "epoch": 0.05, + "grad_norm": 1.8103005747023315, + "learning_rate": 1.9984191629752306e-05, + "loss": 1.1523, + "step": 987 + }, + { + "epoch": 0.05, + "grad_norm": 1.2809953192571162, + "learning_rate": 1.998410395069309e-05, + "loss": 1.3008, + "step": 988 + }, + { + "epoch": 0.05, + "grad_norm": 1.2166003063089528, + "learning_rate": 1.998401602934903e-05, + "loss": 1.3438, + "step": 989 + }, + { + "epoch": 0.05, + "grad_norm": 1.3678053627449558, + "learning_rate": 1.9983927865722262e-05, + "loss": 1.3481, + "step": 990 + }, + { + "epoch": 0.05, + "grad_norm": 1.0182068184441464, + "learning_rate": 1.9983839459814925e-05, + "loss": 1.3633, + "step": 991 + }, + { + "epoch": 0.05, + "grad_norm": 1.4960249567706507, + "learning_rate": 1.998375081162917e-05, + "loss": 1.3462, + "step": 992 + }, + { + "epoch": 0.05, + "grad_norm": 1.3160283144370801, + "learning_rate": 1.998366192116714e-05, + "loss": 1.2021, + "step": 993 + }, + { + "epoch": 0.05, + "grad_norm": 1.6112957595299955, + "learning_rate": 1.9983572788431e-05, + "loss": 1.4639, + "step": 994 + }, + { + "epoch": 0.05, + "grad_norm": 1.4365455223312873, + "learning_rate": 1.9983483413422907e-05, + "loss": 1.2476, + "step": 995 + }, + { + "epoch": 0.05, + "grad_norm": 1.426085820937853, + "learning_rate": 1.998339379614503e-05, + "loss": 1.3105, + "step": 996 + }, + { + "epoch": 0.05, + "grad_norm": 1.1363633654065086, + "learning_rate": 1.9983303936599553e-05, + "loss": 1.2622, + "step": 997 + }, + { + "epoch": 0.05, + "grad_norm": 1.4016170045063903, + "learning_rate": 1.9983213834788643e-05, + "loss": 1.3848, + "step": 998 + }, + { + "epoch": 0.05, + "grad_norm": 0.9543836124841405, + "learning_rate": 1.9983123490714492e-05, + "loss": 1.2832, + "step": 999 + }, + { + "epoch": 0.05, + "grad_norm": 1.3643368717839928, + "learning_rate": 1.9983032904379296e-05, + "loss": 1.2617, + "step": 1000 + }, + { + "epoch": 0.05, + "grad_norm": 1.313726043429224, + "learning_rate": 1.9982942075785247e-05, + "loss": 1.2256, + "step": 1001 + }, + { + "epoch": 0.05, + "grad_norm": 1.3126296883658601, + "learning_rate": 1.9982851004934557e-05, + "loss": 1.3042, + "step": 1002 + }, + { + "epoch": 0.05, + "grad_norm": 1.4807697975938474, + "learning_rate": 1.998275969182943e-05, + "loss": 1.3877, + "step": 1003 + }, + { + "epoch": 0.05, + "grad_norm": 0.5789813796548606, + "learning_rate": 1.998266813647208e-05, + "loss": 1.2773, + "step": 1004 + }, + { + "epoch": 0.05, + "grad_norm": 1.2317452211289648, + "learning_rate": 1.9982576338864738e-05, + "loss": 1.4082, + "step": 1005 + }, + { + "epoch": 0.05, + "grad_norm": 1.1037718118357647, + "learning_rate": 1.9982484299009624e-05, + "loss": 1.417, + "step": 1006 + }, + { + "epoch": 0.05, + "grad_norm": 1.0666238869576732, + "learning_rate": 1.9982392016908975e-05, + "loss": 1.1797, + "step": 1007 + }, + { + "epoch": 0.05, + "grad_norm": 1.5182293726400478, + "learning_rate": 1.9982299492565028e-05, + "loss": 1.3657, + "step": 1008 + }, + { + "epoch": 0.05, + "grad_norm": 1.4607145946659057, + "learning_rate": 1.9982206725980026e-05, + "loss": 1.5088, + "step": 1009 + }, + { + "epoch": 0.05, + "grad_norm": 1.087138765004644, + "learning_rate": 1.9982113717156225e-05, + "loss": 1.292, + "step": 1010 + }, + { + "epoch": 0.05, + "grad_norm": 1.5472316566217363, + "learning_rate": 1.9982020466095882e-05, + "loss": 1.4028, + "step": 1011 + }, + { + "epoch": 0.05, + "grad_norm": 1.4748639915074149, + "learning_rate": 1.9981926972801257e-05, + "loss": 1.3838, + "step": 1012 + }, + { + "epoch": 0.05, + "grad_norm": 1.3029442810828897, + "learning_rate": 1.998183323727462e-05, + "loss": 1.1328, + "step": 1013 + }, + { + "epoch": 0.05, + "grad_norm": 1.4078644633126354, + "learning_rate": 1.9981739259518246e-05, + "loss": 1.3418, + "step": 1014 + }, + { + "epoch": 0.05, + "grad_norm": 1.288986994516389, + "learning_rate": 1.9981645039534417e-05, + "loss": 1.3682, + "step": 1015 + }, + { + "epoch": 0.05, + "grad_norm": 0.9186980005404712, + "learning_rate": 1.9981550577325417e-05, + "loss": 1.01, + "step": 1016 + }, + { + "epoch": 0.05, + "grad_norm": 1.0519750137112254, + "learning_rate": 1.998145587289354e-05, + "loss": 1.3555, + "step": 1017 + }, + { + "epoch": 0.05, + "grad_norm": 1.2943064510542623, + "learning_rate": 1.998136092624108e-05, + "loss": 1.3589, + "step": 1018 + }, + { + "epoch": 0.05, + "grad_norm": 1.3431852715908437, + "learning_rate": 1.998126573737035e-05, + "loss": 1.4341, + "step": 1019 + }, + { + "epoch": 0.05, + "grad_norm": 1.017731704818828, + "learning_rate": 1.9981170306283647e-05, + "loss": 1.3594, + "step": 1020 + }, + { + "epoch": 0.05, + "grad_norm": 1.279071739788195, + "learning_rate": 1.99810746329833e-05, + "loss": 1.2241, + "step": 1021 + }, + { + "epoch": 0.05, + "grad_norm": 1.4187560427439891, + "learning_rate": 1.998097871747162e-05, + "loss": 1.1978, + "step": 1022 + }, + { + "epoch": 0.05, + "grad_norm": 1.1759021008035346, + "learning_rate": 1.9980882559750947e-05, + "loss": 1.3184, + "step": 1023 + }, + { + "epoch": 0.05, + "grad_norm": 1.6098924473742597, + "learning_rate": 1.99807861598236e-05, + "loss": 1.5024, + "step": 1024 + }, + { + "epoch": 0.05, + "grad_norm": 1.2735511470590686, + "learning_rate": 1.9980689517691928e-05, + "loss": 1.166, + "step": 1025 + }, + { + "epoch": 0.05, + "grad_norm": 1.458595476858746, + "learning_rate": 1.9980592633358276e-05, + "loss": 1.3711, + "step": 1026 + }, + { + "epoch": 0.05, + "grad_norm": 1.3764389204819925, + "learning_rate": 1.998049550682499e-05, + "loss": 1.3354, + "step": 1027 + }, + { + "epoch": 0.05, + "grad_norm": 1.1539612502573597, + "learning_rate": 1.998039813809443e-05, + "loss": 1.3408, + "step": 1028 + }, + { + "epoch": 0.05, + "grad_norm": 1.7183748178002722, + "learning_rate": 1.998030052716896e-05, + "loss": 1.2192, + "step": 1029 + }, + { + "epoch": 0.05, + "grad_norm": 0.9732579221380275, + "learning_rate": 1.9980202674050945e-05, + "loss": 1.2935, + "step": 1030 + }, + { + "epoch": 0.05, + "grad_norm": 1.2033255681476656, + "learning_rate": 1.9980104578742762e-05, + "loss": 1.2563, + "step": 1031 + }, + { + "epoch": 0.05, + "grad_norm": 1.2674235273687364, + "learning_rate": 1.998000624124679e-05, + "loss": 1.2842, + "step": 1032 + }, + { + "epoch": 0.05, + "grad_norm": 1.5700028976483593, + "learning_rate": 1.9979907661565418e-05, + "loss": 1.4556, + "step": 1033 + }, + { + "epoch": 0.05, + "grad_norm": 1.382150574205717, + "learning_rate": 1.9979808839701037e-05, + "loss": 1.4253, + "step": 1034 + }, + { + "epoch": 0.05, + "grad_norm": 0.9809797771500793, + "learning_rate": 1.9979709775656048e-05, + "loss": 1.3013, + "step": 1035 + }, + { + "epoch": 0.05, + "grad_norm": 1.4084645177251898, + "learning_rate": 1.9979610469432847e-05, + "loss": 1.2549, + "step": 1036 + }, + { + "epoch": 0.05, + "grad_norm": 1.2275901556255258, + "learning_rate": 1.997951092103385e-05, + "loss": 1.1196, + "step": 1037 + }, + { + "epoch": 0.05, + "grad_norm": 1.2991430386229006, + "learning_rate": 1.9979411130461475e-05, + "loss": 1.2676, + "step": 1038 + }, + { + "epoch": 0.05, + "grad_norm": 1.3067145350572373, + "learning_rate": 1.997931109771814e-05, + "loss": 1.1641, + "step": 1039 + }, + { + "epoch": 0.05, + "grad_norm": 1.2179772723047235, + "learning_rate": 1.997921082280627e-05, + "loss": 1.189, + "step": 1040 + }, + { + "epoch": 0.05, + "grad_norm": 1.2364356944868469, + "learning_rate": 1.99791103057283e-05, + "loss": 1.2559, + "step": 1041 + }, + { + "epoch": 0.05, + "grad_norm": 1.285349654671994, + "learning_rate": 1.9979009546486675e-05, + "loss": 1.3296, + "step": 1042 + }, + { + "epoch": 0.05, + "grad_norm": 1.4674984757160858, + "learning_rate": 1.9978908545083833e-05, + "loss": 1.3198, + "step": 1043 + }, + { + "epoch": 0.05, + "grad_norm": 1.3766417113687375, + "learning_rate": 1.9978807301522226e-05, + "loss": 1.1938, + "step": 1044 + }, + { + "epoch": 0.05, + "grad_norm": 1.2660597611255526, + "learning_rate": 1.9978705815804312e-05, + "loss": 1.3877, + "step": 1045 + }, + { + "epoch": 0.05, + "grad_norm": 1.4515855786147038, + "learning_rate": 1.9978604087932557e-05, + "loss": 1.2271, + "step": 1046 + }, + { + "epoch": 0.05, + "grad_norm": 1.2249349618501544, + "learning_rate": 1.997850211790943e-05, + "loss": 1.4624, + "step": 1047 + }, + { + "epoch": 0.05, + "grad_norm": 1.3737596195236799, + "learning_rate": 1.9978399905737395e-05, + "loss": 1.1377, + "step": 1048 + }, + { + "epoch": 0.05, + "grad_norm": 1.410093234311074, + "learning_rate": 1.9978297451418945e-05, + "loss": 1.3413, + "step": 1049 + }, + { + "epoch": 0.05, + "grad_norm": 1.385375473930903, + "learning_rate": 1.9978194754956558e-05, + "loss": 1.209, + "step": 1050 + }, + { + "epoch": 0.05, + "grad_norm": 1.1556466116903177, + "learning_rate": 1.997809181635273e-05, + "loss": 1.1904, + "step": 1051 + }, + { + "epoch": 0.05, + "grad_norm": 1.2832249875876516, + "learning_rate": 1.9977988635609957e-05, + "loss": 1.2808, + "step": 1052 + }, + { + "epoch": 0.05, + "grad_norm": 1.3725402556608373, + "learning_rate": 1.9977885212730745e-05, + "loss": 1.1382, + "step": 1053 + }, + { + "epoch": 0.05, + "grad_norm": 1.1436873448820741, + "learning_rate": 1.9977781547717604e-05, + "loss": 1.1851, + "step": 1054 + }, + { + "epoch": 0.05, + "grad_norm": 1.2909307533555916, + "learning_rate": 1.9977677640573048e-05, + "loss": 1.353, + "step": 1055 + }, + { + "epoch": 0.05, + "grad_norm": 1.3290123916926229, + "learning_rate": 1.9977573491299597e-05, + "loss": 1.2451, + "step": 1056 + }, + { + "epoch": 0.05, + "grad_norm": 1.188744240707465, + "learning_rate": 1.997746909989978e-05, + "loss": 1.1792, + "step": 1057 + }, + { + "epoch": 0.05, + "grad_norm": 1.5016420828983263, + "learning_rate": 1.9977364466376135e-05, + "loss": 1.2886, + "step": 1058 + }, + { + "epoch": 0.05, + "grad_norm": 1.1400916703906432, + "learning_rate": 1.9977259590731193e-05, + "loss": 1.1101, + "step": 1059 + }, + { + "epoch": 0.05, + "grad_norm": 1.0412176445321804, + "learning_rate": 1.9977154472967504e-05, + "loss": 1.2749, + "step": 1060 + }, + { + "epoch": 0.05, + "grad_norm": 1.4585181162653846, + "learning_rate": 1.9977049113087615e-05, + "loss": 1.3857, + "step": 1061 + }, + { + "epoch": 0.05, + "grad_norm": 1.148231450264873, + "learning_rate": 1.997694351109409e-05, + "loss": 1.4458, + "step": 1062 + }, + { + "epoch": 0.05, + "grad_norm": 1.2899884577320724, + "learning_rate": 1.9976837666989484e-05, + "loss": 1.3418, + "step": 1063 + }, + { + "epoch": 0.05, + "grad_norm": 1.3691101556164778, + "learning_rate": 1.9976731580776373e-05, + "loss": 1.1855, + "step": 1064 + }, + { + "epoch": 0.05, + "grad_norm": 1.3425889477227961, + "learning_rate": 1.9976625252457322e-05, + "loss": 1.3872, + "step": 1065 + }, + { + "epoch": 0.05, + "grad_norm": 1.4385858218654797, + "learning_rate": 1.9976518682034917e-05, + "loss": 1.2993, + "step": 1066 + }, + { + "epoch": 0.05, + "grad_norm": 1.2327936107020323, + "learning_rate": 1.9976411869511746e-05, + "loss": 1.3301, + "step": 1067 + }, + { + "epoch": 0.05, + "grad_norm": 1.8240592143896712, + "learning_rate": 1.9976304814890396e-05, + "loss": 1.6006, + "step": 1068 + }, + { + "epoch": 0.05, + "grad_norm": 1.1715095353111415, + "learning_rate": 1.997619751817347e-05, + "loss": 1.2319, + "step": 1069 + }, + { + "epoch": 0.05, + "grad_norm": 1.0711689194041374, + "learning_rate": 1.9976089979363566e-05, + "loss": 1.3857, + "step": 1070 + }, + { + "epoch": 0.05, + "grad_norm": 1.521870325509901, + "learning_rate": 1.99759821984633e-05, + "loss": 1.2783, + "step": 1071 + }, + { + "epoch": 0.05, + "grad_norm": 1.182169956894781, + "learning_rate": 1.9975874175475284e-05, + "loss": 1.3784, + "step": 1072 + }, + { + "epoch": 0.05, + "grad_norm": 1.2241119043581783, + "learning_rate": 1.997576591040214e-05, + "loss": 1.2466, + "step": 1073 + }, + { + "epoch": 0.05, + "grad_norm": 1.469065139808045, + "learning_rate": 1.9975657403246492e-05, + "loss": 1.4126, + "step": 1074 + }, + { + "epoch": 0.05, + "grad_norm": 1.2721829997540537, + "learning_rate": 1.9975548654010978e-05, + "loss": 1.0464, + "step": 1075 + }, + { + "epoch": 0.05, + "grad_norm": 1.205158339062666, + "learning_rate": 1.9975439662698234e-05, + "loss": 1.0979, + "step": 1076 + }, + { + "epoch": 0.05, + "grad_norm": 1.4595869557900465, + "learning_rate": 1.9975330429310908e-05, + "loss": 1.3276, + "step": 1077 + }, + { + "epoch": 0.05, + "grad_norm": 1.252300005952983, + "learning_rate": 1.9975220953851648e-05, + "loss": 1.2217, + "step": 1078 + }, + { + "epoch": 0.05, + "grad_norm": 1.5120549022982526, + "learning_rate": 1.9975111236323112e-05, + "loss": 1.2842, + "step": 1079 + }, + { + "epoch": 0.05, + "grad_norm": 1.6142693724098107, + "learning_rate": 1.997500127672796e-05, + "loss": 1.4463, + "step": 1080 + }, + { + "epoch": 0.05, + "grad_norm": 1.47708318024681, + "learning_rate": 1.9974891075068864e-05, + "loss": 1.4375, + "step": 1081 + }, + { + "epoch": 0.05, + "grad_norm": 1.079179240068141, + "learning_rate": 1.9974780631348495e-05, + "loss": 1.3496, + "step": 1082 + }, + { + "epoch": 0.05, + "grad_norm": 1.1515354403904188, + "learning_rate": 1.997466994556954e-05, + "loss": 1.0122, + "step": 1083 + }, + { + "epoch": 0.05, + "grad_norm": 1.3026402130868953, + "learning_rate": 1.9974559017734676e-05, + "loss": 1.2588, + "step": 1084 + }, + { + "epoch": 0.05, + "grad_norm": 0.9327533019718179, + "learning_rate": 1.99744478478466e-05, + "loss": 1.5537, + "step": 1085 + }, + { + "epoch": 0.05, + "grad_norm": 1.3194511426117472, + "learning_rate": 1.9974336435908005e-05, + "loss": 1.3076, + "step": 1086 + }, + { + "epoch": 0.05, + "grad_norm": 0.8997824308649448, + "learning_rate": 1.99742247819216e-05, + "loss": 1.1567, + "step": 1087 + }, + { + "epoch": 0.05, + "grad_norm": 1.097157962495064, + "learning_rate": 1.9974112885890094e-05, + "loss": 1.2153, + "step": 1088 + }, + { + "epoch": 0.05, + "grad_norm": 1.720646379252307, + "learning_rate": 1.9974000747816203e-05, + "loss": 1.415, + "step": 1089 + }, + { + "epoch": 0.05, + "grad_norm": 1.4475660506494223, + "learning_rate": 1.9973888367702643e-05, + "loss": 1.23, + "step": 1090 + }, + { + "epoch": 0.05, + "grad_norm": 1.1928797359440777, + "learning_rate": 1.9973775745552146e-05, + "loss": 1.2847, + "step": 1091 + }, + { + "epoch": 0.05, + "grad_norm": 1.2121004372091027, + "learning_rate": 1.9973662881367442e-05, + "loss": 1.4785, + "step": 1092 + }, + { + "epoch": 0.05, + "grad_norm": 1.388496467105192, + "learning_rate": 1.9973549775151273e-05, + "loss": 1.2998, + "step": 1093 + }, + { + "epoch": 0.05, + "grad_norm": 1.2015814484511962, + "learning_rate": 1.997343642690638e-05, + "loss": 1.3418, + "step": 1094 + }, + { + "epoch": 0.05, + "grad_norm": 1.2580049448353066, + "learning_rate": 1.9973322836635517e-05, + "loss": 1.2227, + "step": 1095 + }, + { + "epoch": 0.05, + "grad_norm": 1.5166618484331251, + "learning_rate": 1.9973209004341442e-05, + "loss": 1.312, + "step": 1096 + }, + { + "epoch": 0.05, + "grad_norm": 1.5346874023872539, + "learning_rate": 1.997309493002691e-05, + "loss": 1.2261, + "step": 1097 + }, + { + "epoch": 0.05, + "grad_norm": 1.0903940761602384, + "learning_rate": 1.9972980613694698e-05, + "loss": 1.1782, + "step": 1098 + }, + { + "epoch": 0.05, + "grad_norm": 1.4082155893939632, + "learning_rate": 1.9972866055347572e-05, + "loss": 1.0967, + "step": 1099 + }, + { + "epoch": 0.05, + "grad_norm": 1.3086608062767373, + "learning_rate": 1.9972751254988317e-05, + "loss": 1.3193, + "step": 1100 + }, + { + "epoch": 0.05, + "grad_norm": 1.420563449658825, + "learning_rate": 1.997263621261972e-05, + "loss": 1.2358, + "step": 1101 + }, + { + "epoch": 0.05, + "grad_norm": 1.4396599528523966, + "learning_rate": 1.997252092824457e-05, + "loss": 1.3574, + "step": 1102 + }, + { + "epoch": 0.05, + "grad_norm": 1.2443797362079498, + "learning_rate": 1.9972405401865663e-05, + "loss": 1.3032, + "step": 1103 + }, + { + "epoch": 0.05, + "grad_norm": 1.3214306726538065, + "learning_rate": 1.997228963348581e-05, + "loss": 1.1357, + "step": 1104 + }, + { + "epoch": 0.05, + "grad_norm": 1.5002109099648913, + "learning_rate": 1.997217362310781e-05, + "loss": 1.3071, + "step": 1105 + }, + { + "epoch": 0.05, + "grad_norm": 1.1375524972364377, + "learning_rate": 1.9972057370734483e-05, + "loss": 1.1943, + "step": 1106 + }, + { + "epoch": 0.05, + "grad_norm": 1.5354869028515123, + "learning_rate": 1.9971940876368653e-05, + "loss": 1.3135, + "step": 1107 + }, + { + "epoch": 0.05, + "grad_norm": 1.3774621372482108, + "learning_rate": 1.9971824140013143e-05, + "loss": 1.3682, + "step": 1108 + }, + { + "epoch": 0.05, + "grad_norm": 1.2979932882725553, + "learning_rate": 1.9971707161670786e-05, + "loss": 1.334, + "step": 1109 + }, + { + "epoch": 0.05, + "grad_norm": 1.083177672941417, + "learning_rate": 1.997158994134442e-05, + "loss": 1.2217, + "step": 1110 + }, + { + "epoch": 0.05, + "grad_norm": 1.2409487754649744, + "learning_rate": 1.99714724790369e-05, + "loss": 1.3159, + "step": 1111 + }, + { + "epoch": 0.05, + "grad_norm": 1.125500447027702, + "learning_rate": 1.9971354774751063e-05, + "loss": 1.3516, + "step": 1112 + }, + { + "epoch": 0.05, + "grad_norm": 1.4519523497960383, + "learning_rate": 1.9971236828489768e-05, + "loss": 1.1709, + "step": 1113 + }, + { + "epoch": 0.05, + "grad_norm": 1.5074835114732652, + "learning_rate": 1.9971118640255883e-05, + "loss": 1.3833, + "step": 1114 + }, + { + "epoch": 0.05, + "grad_norm": 1.5595971352011893, + "learning_rate": 1.997100021005227e-05, + "loss": 1.3818, + "step": 1115 + }, + { + "epoch": 0.05, + "grad_norm": 1.5218119961028296, + "learning_rate": 1.997088153788181e-05, + "loss": 1.3252, + "step": 1116 + }, + { + "epoch": 0.05, + "grad_norm": 1.6747049093004815, + "learning_rate": 1.9970762623747373e-05, + "loss": 1.3853, + "step": 1117 + }, + { + "epoch": 0.05, + "grad_norm": 0.7828877955130968, + "learning_rate": 1.9970643467651853e-05, + "loss": 1.1121, + "step": 1118 + }, + { + "epoch": 0.05, + "grad_norm": 1.3288959626352281, + "learning_rate": 1.9970524069598136e-05, + "loss": 1.186, + "step": 1119 + }, + { + "epoch": 0.05, + "grad_norm": 1.5918321671779148, + "learning_rate": 1.9970404429589126e-05, + "loss": 1.1863, + "step": 1120 + }, + { + "epoch": 0.05, + "grad_norm": 1.6003154680307907, + "learning_rate": 1.997028454762772e-05, + "loss": 1.5312, + "step": 1121 + }, + { + "epoch": 0.05, + "grad_norm": 1.3116972355784613, + "learning_rate": 1.997016442371683e-05, + "loss": 1.4238, + "step": 1122 + }, + { + "epoch": 0.05, + "grad_norm": 1.3878890923202327, + "learning_rate": 1.997004405785937e-05, + "loss": 1.2744, + "step": 1123 + }, + { + "epoch": 0.05, + "grad_norm": 0.9714710554799669, + "learning_rate": 1.9969923450058264e-05, + "loss": 1.2695, + "step": 1124 + }, + { + "epoch": 0.05, + "grad_norm": 1.2695514060839288, + "learning_rate": 1.9969802600316433e-05, + "loss": 1.2212, + "step": 1125 + }, + { + "epoch": 0.05, + "grad_norm": 1.5624113266494528, + "learning_rate": 1.9969681508636814e-05, + "loss": 1.3369, + "step": 1126 + }, + { + "epoch": 0.05, + "grad_norm": 1.3926226602763765, + "learning_rate": 1.9969560175022343e-05, + "loss": 1.3213, + "step": 1127 + }, + { + "epoch": 0.05, + "grad_norm": 1.1318689410201102, + "learning_rate": 1.996943859947597e-05, + "loss": 1.3164, + "step": 1128 + }, + { + "epoch": 0.05, + "grad_norm": 1.2810954468263356, + "learning_rate": 1.996931678200064e-05, + "loss": 1.186, + "step": 1129 + }, + { + "epoch": 0.05, + "grad_norm": 0.9818086438479263, + "learning_rate": 1.996919472259931e-05, + "loss": 1.2148, + "step": 1130 + }, + { + "epoch": 0.05, + "grad_norm": 1.3148385191747365, + "learning_rate": 1.996907242127494e-05, + "loss": 1.1277, + "step": 1131 + }, + { + "epoch": 0.05, + "grad_norm": 1.367787033337665, + "learning_rate": 1.9968949878030503e-05, + "loss": 1.1631, + "step": 1132 + }, + { + "epoch": 0.05, + "grad_norm": 1.2825564224474144, + "learning_rate": 1.996882709286897e-05, + "loss": 1.2852, + "step": 1133 + }, + { + "epoch": 0.05, + "grad_norm": 1.2546059153143287, + "learning_rate": 1.996870406579332e-05, + "loss": 1.2817, + "step": 1134 + }, + { + "epoch": 0.05, + "grad_norm": 1.2373774497817236, + "learning_rate": 1.9968580796806542e-05, + "loss": 1.373, + "step": 1135 + }, + { + "epoch": 0.05, + "grad_norm": 1.3912403623737093, + "learning_rate": 1.9968457285911624e-05, + "loss": 1.2969, + "step": 1136 + }, + { + "epoch": 0.05, + "grad_norm": 1.230223280602699, + "learning_rate": 1.996833353311156e-05, + "loss": 1.2324, + "step": 1137 + }, + { + "epoch": 0.05, + "grad_norm": 1.1424821971684167, + "learning_rate": 1.996820953840936e-05, + "loss": 1.3501, + "step": 1138 + }, + { + "epoch": 0.05, + "grad_norm": 1.2089742676861317, + "learning_rate": 1.9968085301808026e-05, + "loss": 1.2222, + "step": 1139 + }, + { + "epoch": 0.05, + "grad_norm": 1.3259714498879585, + "learning_rate": 1.996796082331058e-05, + "loss": 1.2773, + "step": 1140 + }, + { + "epoch": 0.05, + "grad_norm": 1.2567871202981753, + "learning_rate": 1.9967836102920043e-05, + "loss": 1.2578, + "step": 1141 + }, + { + "epoch": 0.05, + "grad_norm": 1.4823825681370333, + "learning_rate": 1.996771114063943e-05, + "loss": 1.3818, + "step": 1142 + }, + { + "epoch": 0.05, + "grad_norm": 1.3685625825721024, + "learning_rate": 1.996758593647179e-05, + "loss": 1.3789, + "step": 1143 + }, + { + "epoch": 0.06, + "grad_norm": 1.4202231952290987, + "learning_rate": 1.996746049042015e-05, + "loss": 1.4033, + "step": 1144 + }, + { + "epoch": 0.06, + "grad_norm": 0.9503595399266787, + "learning_rate": 1.9967334802487553e-05, + "loss": 1.1909, + "step": 1145 + }, + { + "epoch": 0.06, + "grad_norm": 1.22782020882523, + "learning_rate": 1.996720887267706e-05, + "loss": 1.2036, + "step": 1146 + }, + { + "epoch": 0.06, + "grad_norm": 0.9689495329997446, + "learning_rate": 1.9967082700991712e-05, + "loss": 1.1216, + "step": 1147 + }, + { + "epoch": 0.06, + "grad_norm": 1.2876398434548313, + "learning_rate": 1.9966956287434586e-05, + "loss": 1.269, + "step": 1148 + }, + { + "epoch": 0.06, + "grad_norm": 1.2892987134470477, + "learning_rate": 1.996682963200874e-05, + "loss": 1.2012, + "step": 1149 + }, + { + "epoch": 0.06, + "grad_norm": 1.7091973788178347, + "learning_rate": 1.9966702734717248e-05, + "loss": 1.3086, + "step": 1150 + }, + { + "epoch": 0.06, + "grad_norm": 1.3708082196985756, + "learning_rate": 1.9966575595563195e-05, + "loss": 1.1763, + "step": 1151 + }, + { + "epoch": 0.06, + "grad_norm": 1.2515434593652914, + "learning_rate": 1.996644821454966e-05, + "loss": 1.2568, + "step": 1152 + }, + { + "epoch": 0.06, + "grad_norm": 1.0946773935871086, + "learning_rate": 1.996632059167974e-05, + "loss": 1.2261, + "step": 1153 + }, + { + "epoch": 0.06, + "grad_norm": 1.2157228863881675, + "learning_rate": 1.996619272695653e-05, + "loss": 1.1108, + "step": 1154 + }, + { + "epoch": 0.06, + "grad_norm": 1.6932113416384522, + "learning_rate": 1.996606462038313e-05, + "loss": 1.3428, + "step": 1155 + }, + { + "epoch": 0.06, + "grad_norm": 1.4647696001651183, + "learning_rate": 1.9965936271962652e-05, + "loss": 1.2695, + "step": 1156 + }, + { + "epoch": 0.06, + "grad_norm": 1.2295350215446725, + "learning_rate": 1.9965807681698208e-05, + "loss": 1.2803, + "step": 1157 + }, + { + "epoch": 0.06, + "grad_norm": 1.373098498516551, + "learning_rate": 1.996567884959292e-05, + "loss": 1.1987, + "step": 1158 + }, + { + "epoch": 0.06, + "grad_norm": 1.5807001091777488, + "learning_rate": 1.9965549775649914e-05, + "loss": 1.3135, + "step": 1159 + }, + { + "epoch": 0.06, + "grad_norm": 1.3516132384332489, + "learning_rate": 1.9965420459872325e-05, + "loss": 1.3462, + "step": 1160 + }, + { + "epoch": 0.06, + "grad_norm": 0.9376380215771913, + "learning_rate": 1.9965290902263286e-05, + "loss": 1.1313, + "step": 1161 + }, + { + "epoch": 0.06, + "grad_norm": 1.5192124577801223, + "learning_rate": 1.9965161102825944e-05, + "loss": 1.2974, + "step": 1162 + }, + { + "epoch": 0.06, + "grad_norm": 1.216461988499304, + "learning_rate": 1.996503106156345e-05, + "loss": 1.1021, + "step": 1163 + }, + { + "epoch": 0.06, + "grad_norm": 1.3185857008412607, + "learning_rate": 1.9964900778478958e-05, + "loss": 1.1602, + "step": 1164 + }, + { + "epoch": 0.06, + "grad_norm": 1.3167107658086623, + "learning_rate": 1.996477025357563e-05, + "loss": 1.3154, + "step": 1165 + }, + { + "epoch": 0.06, + "grad_norm": 1.533119646432664, + "learning_rate": 1.9964639486856634e-05, + "loss": 1.3481, + "step": 1166 + }, + { + "epoch": 0.06, + "grad_norm": 1.3475747914470217, + "learning_rate": 1.996450847832514e-05, + "loss": 1.3926, + "step": 1167 + }, + { + "epoch": 0.06, + "grad_norm": 1.4695003677549798, + "learning_rate": 1.996437722798433e-05, + "loss": 1.3618, + "step": 1168 + }, + { + "epoch": 0.06, + "grad_norm": 1.5164983395177585, + "learning_rate": 1.996424573583739e-05, + "loss": 1.2734, + "step": 1169 + }, + { + "epoch": 0.06, + "grad_norm": 1.1819627237885333, + "learning_rate": 1.996411400188751e-05, + "loss": 1.4136, + "step": 1170 + }, + { + "epoch": 0.06, + "grad_norm": 1.3064104902561453, + "learning_rate": 1.9963982026137886e-05, + "loss": 1.3745, + "step": 1171 + }, + { + "epoch": 0.06, + "grad_norm": 1.2216111937511662, + "learning_rate": 1.9963849808591723e-05, + "loss": 1.2139, + "step": 1172 + }, + { + "epoch": 0.06, + "grad_norm": 1.3494338413905522, + "learning_rate": 1.9963717349252226e-05, + "loss": 1.3394, + "step": 1173 + }, + { + "epoch": 0.06, + "grad_norm": 1.5671959441253445, + "learning_rate": 1.996358464812261e-05, + "loss": 1.2563, + "step": 1174 + }, + { + "epoch": 0.06, + "grad_norm": 1.4628730401414831, + "learning_rate": 1.99634517052061e-05, + "loss": 1.3271, + "step": 1175 + }, + { + "epoch": 0.06, + "grad_norm": 1.1846291370121216, + "learning_rate": 1.9963318520505915e-05, + "loss": 1.1338, + "step": 1176 + }, + { + "epoch": 0.06, + "grad_norm": 1.1379832590739252, + "learning_rate": 1.9963185094025293e-05, + "loss": 1.3252, + "step": 1177 + }, + { + "epoch": 0.06, + "grad_norm": 1.07582844654449, + "learning_rate": 1.996305142576747e-05, + "loss": 1.3032, + "step": 1178 + }, + { + "epoch": 0.06, + "grad_norm": 1.4479034270947602, + "learning_rate": 1.9962917515735686e-05, + "loss": 1.4873, + "step": 1179 + }, + { + "epoch": 0.06, + "grad_norm": 1.291283301212534, + "learning_rate": 1.9962783363933193e-05, + "loss": 1.335, + "step": 1180 + }, + { + "epoch": 0.06, + "grad_norm": 1.1647519211756345, + "learning_rate": 1.996264897036325e-05, + "loss": 1.3672, + "step": 1181 + }, + { + "epoch": 0.06, + "grad_norm": 1.3937575401335844, + "learning_rate": 1.9962514335029116e-05, + "loss": 1.271, + "step": 1182 + }, + { + "epoch": 0.06, + "grad_norm": 1.2190172506548969, + "learning_rate": 1.9962379457934058e-05, + "loss": 1.125, + "step": 1183 + }, + { + "epoch": 0.06, + "grad_norm": 1.5111536041203253, + "learning_rate": 1.9962244339081347e-05, + "loss": 1.335, + "step": 1184 + }, + { + "epoch": 0.06, + "grad_norm": 1.239181429722639, + "learning_rate": 1.9962108978474265e-05, + "loss": 1.1531, + "step": 1185 + }, + { + "epoch": 0.06, + "grad_norm": 1.3292158173489255, + "learning_rate": 1.9961973376116096e-05, + "loss": 1.3589, + "step": 1186 + }, + { + "epoch": 0.06, + "grad_norm": 1.4756829849163433, + "learning_rate": 1.996183753201013e-05, + "loss": 1.2295, + "step": 1187 + }, + { + "epoch": 0.06, + "grad_norm": 1.2533272314373796, + "learning_rate": 1.996170144615966e-05, + "loss": 1.1562, + "step": 1188 + }, + { + "epoch": 0.06, + "grad_norm": 1.195261892495741, + "learning_rate": 1.9961565118567997e-05, + "loss": 1.209, + "step": 1189 + }, + { + "epoch": 0.06, + "grad_norm": 1.4115696169291663, + "learning_rate": 1.9961428549238445e-05, + "loss": 1.3379, + "step": 1190 + }, + { + "epoch": 0.06, + "grad_norm": 1.0946129814718788, + "learning_rate": 1.9961291738174316e-05, + "loss": 1.2041, + "step": 1191 + }, + { + "epoch": 0.06, + "grad_norm": 0.9573481612574324, + "learning_rate": 1.996115468537893e-05, + "loss": 1.3262, + "step": 1192 + }, + { + "epoch": 0.06, + "grad_norm": 1.424991619755627, + "learning_rate": 1.9961017390855616e-05, + "loss": 1.417, + "step": 1193 + }, + { + "epoch": 0.06, + "grad_norm": 1.1350965708477252, + "learning_rate": 1.9960879854607702e-05, + "loss": 1.2861, + "step": 1194 + }, + { + "epoch": 0.06, + "grad_norm": 1.0689624805928288, + "learning_rate": 1.996074207663853e-05, + "loss": 1.3394, + "step": 1195 + }, + { + "epoch": 0.06, + "grad_norm": 1.5539093781707936, + "learning_rate": 1.9960604056951444e-05, + "loss": 1.2012, + "step": 1196 + }, + { + "epoch": 0.06, + "grad_norm": 1.225275624668329, + "learning_rate": 1.9960465795549787e-05, + "loss": 1.2031, + "step": 1197 + }, + { + "epoch": 0.06, + "grad_norm": 1.473099900938001, + "learning_rate": 1.996032729243692e-05, + "loss": 1.4087, + "step": 1198 + }, + { + "epoch": 0.06, + "grad_norm": 1.3366495219677799, + "learning_rate": 1.99601885476162e-05, + "loss": 1.3208, + "step": 1199 + }, + { + "epoch": 0.06, + "grad_norm": 1.147001149538042, + "learning_rate": 1.9960049561090995e-05, + "loss": 1.2168, + "step": 1200 + }, + { + "epoch": 0.06, + "grad_norm": 1.0803206246913872, + "learning_rate": 1.995991033286468e-05, + "loss": 1.1743, + "step": 1201 + }, + { + "epoch": 0.06, + "grad_norm": 1.4648990783925437, + "learning_rate": 1.9959770862940632e-05, + "loss": 1.1528, + "step": 1202 + }, + { + "epoch": 0.06, + "grad_norm": 1.3083019976676895, + "learning_rate": 1.9959631151322235e-05, + "loss": 1.0869, + "step": 1203 + }, + { + "epoch": 0.06, + "grad_norm": 1.2110769472077494, + "learning_rate": 1.995949119801288e-05, + "loss": 1.0327, + "step": 1204 + }, + { + "epoch": 0.06, + "grad_norm": 1.2123005534772617, + "learning_rate": 1.995935100301597e-05, + "loss": 1.2925, + "step": 1205 + }, + { + "epoch": 0.06, + "grad_norm": 1.2951882506699464, + "learning_rate": 1.9959210566334893e-05, + "loss": 1.3853, + "step": 1206 + }, + { + "epoch": 0.06, + "grad_norm": 1.1243474873559611, + "learning_rate": 1.9959069887973067e-05, + "loss": 1.2036, + "step": 1207 + }, + { + "epoch": 0.06, + "grad_norm": 1.2277240140640617, + "learning_rate": 1.9958928967933903e-05, + "loss": 1.2793, + "step": 1208 + }, + { + "epoch": 0.06, + "grad_norm": 1.3279887764039047, + "learning_rate": 1.995878780622082e-05, + "loss": 1.4385, + "step": 1209 + }, + { + "epoch": 0.06, + "grad_norm": 1.370675130590996, + "learning_rate": 1.9958646402837247e-05, + "loss": 1.3022, + "step": 1210 + }, + { + "epoch": 0.06, + "grad_norm": 1.1271506053195537, + "learning_rate": 1.995850475778661e-05, + "loss": 1.3257, + "step": 1211 + }, + { + "epoch": 0.06, + "grad_norm": 1.1631018990266622, + "learning_rate": 1.9958362871072353e-05, + "loss": 1.2949, + "step": 1212 + }, + { + "epoch": 0.06, + "grad_norm": 1.3891065799920526, + "learning_rate": 1.9958220742697915e-05, + "loss": 1.0747, + "step": 1213 + }, + { + "epoch": 0.06, + "grad_norm": 1.470306328225646, + "learning_rate": 1.995807837266674e-05, + "loss": 1.2344, + "step": 1214 + }, + { + "epoch": 0.06, + "grad_norm": 1.278263299225308, + "learning_rate": 1.9957935760982297e-05, + "loss": 1.207, + "step": 1215 + }, + { + "epoch": 0.06, + "grad_norm": 0.9830605949860025, + "learning_rate": 1.9957792907648033e-05, + "loss": 1.3228, + "step": 1216 + }, + { + "epoch": 0.06, + "grad_norm": 1.476941127801184, + "learning_rate": 1.9957649812667422e-05, + "loss": 1.3149, + "step": 1217 + }, + { + "epoch": 0.06, + "grad_norm": 1.2887060110366275, + "learning_rate": 1.9957506476043934e-05, + "loss": 1.314, + "step": 1218 + }, + { + "epoch": 0.06, + "grad_norm": 1.2211380058914756, + "learning_rate": 1.995736289778105e-05, + "loss": 1.1582, + "step": 1219 + }, + { + "epoch": 0.06, + "grad_norm": 1.5017902688747176, + "learning_rate": 1.9957219077882245e-05, + "loss": 1.3203, + "step": 1220 + }, + { + "epoch": 0.06, + "grad_norm": 1.3104990621999943, + "learning_rate": 1.9957075016351023e-05, + "loss": 1.1719, + "step": 1221 + }, + { + "epoch": 0.06, + "grad_norm": 1.559348620012353, + "learning_rate": 1.995693071319087e-05, + "loss": 1.2964, + "step": 1222 + }, + { + "epoch": 0.06, + "grad_norm": 1.2221872960263491, + "learning_rate": 1.995678616840529e-05, + "loss": 1.3496, + "step": 1223 + }, + { + "epoch": 0.06, + "grad_norm": 1.4867455771993463, + "learning_rate": 1.9956641381997795e-05, + "loss": 1.3628, + "step": 1224 + }, + { + "epoch": 0.06, + "grad_norm": 1.312437079989776, + "learning_rate": 1.9956496353971894e-05, + "loss": 1.1685, + "step": 1225 + }, + { + "epoch": 0.06, + "grad_norm": 1.400428692330916, + "learning_rate": 1.995635108433111e-05, + "loss": 1.27, + "step": 1226 + }, + { + "epoch": 0.06, + "grad_norm": 1.39554428381017, + "learning_rate": 1.995620557307896e-05, + "loss": 1.4028, + "step": 1227 + }, + { + "epoch": 0.06, + "grad_norm": 1.4300370941024458, + "learning_rate": 1.9956059820218982e-05, + "loss": 1.3345, + "step": 1228 + }, + { + "epoch": 0.06, + "grad_norm": 1.4501762115521841, + "learning_rate": 1.9955913825754713e-05, + "loss": 1.2832, + "step": 1229 + }, + { + "epoch": 0.06, + "grad_norm": 1.263179748375819, + "learning_rate": 1.9955767589689697e-05, + "loss": 1.2065, + "step": 1230 + }, + { + "epoch": 0.06, + "grad_norm": 1.4508581161707867, + "learning_rate": 1.9955621112027476e-05, + "loss": 1.1113, + "step": 1231 + }, + { + "epoch": 0.06, + "grad_norm": 1.2902706477353545, + "learning_rate": 1.995547439277161e-05, + "loss": 1.2368, + "step": 1232 + }, + { + "epoch": 0.06, + "grad_norm": 1.1376601519827074, + "learning_rate": 1.9955327431925663e-05, + "loss": 1.2046, + "step": 1233 + }, + { + "epoch": 0.06, + "grad_norm": 1.1354097356870878, + "learning_rate": 1.9955180229493193e-05, + "loss": 1.1265, + "step": 1234 + }, + { + "epoch": 0.06, + "grad_norm": 1.368987275813263, + "learning_rate": 1.9955032785477778e-05, + "loss": 1.207, + "step": 1235 + }, + { + "epoch": 0.06, + "grad_norm": 1.2637365078810723, + "learning_rate": 1.9954885099882992e-05, + "loss": 1.2427, + "step": 1236 + }, + { + "epoch": 0.06, + "grad_norm": 1.3107876140061971, + "learning_rate": 1.9954737172712422e-05, + "loss": 1.4326, + "step": 1237 + }, + { + "epoch": 0.06, + "grad_norm": 1.042413046016357, + "learning_rate": 1.9954589003969657e-05, + "loss": 1.2153, + "step": 1238 + }, + { + "epoch": 0.06, + "grad_norm": 1.16453364319218, + "learning_rate": 1.9954440593658294e-05, + "loss": 1.2295, + "step": 1239 + }, + { + "epoch": 0.06, + "grad_norm": 1.4867303837130328, + "learning_rate": 1.995429194178193e-05, + "loss": 1.3428, + "step": 1240 + }, + { + "epoch": 0.06, + "grad_norm": 1.3543670934561156, + "learning_rate": 1.9954143048344175e-05, + "loss": 1.2729, + "step": 1241 + }, + { + "epoch": 0.06, + "grad_norm": 1.490986306613243, + "learning_rate": 1.995399391334864e-05, + "loss": 1.0986, + "step": 1242 + }, + { + "epoch": 0.06, + "grad_norm": 1.6485574754422299, + "learning_rate": 1.995384453679895e-05, + "loss": 1.4683, + "step": 1243 + }, + { + "epoch": 0.06, + "grad_norm": 1.6474747997173635, + "learning_rate": 1.9953694918698726e-05, + "loss": 1.3154, + "step": 1244 + }, + { + "epoch": 0.06, + "grad_norm": 1.298660024384456, + "learning_rate": 1.99535450590516e-05, + "loss": 1.2156, + "step": 1245 + }, + { + "epoch": 0.06, + "grad_norm": 1.4061305508015278, + "learning_rate": 1.9953394957861206e-05, + "loss": 1.3213, + "step": 1246 + }, + { + "epoch": 0.06, + "grad_norm": 1.1603509108889312, + "learning_rate": 1.9953244615131187e-05, + "loss": 1.1973, + "step": 1247 + }, + { + "epoch": 0.06, + "grad_norm": 1.2727357328249131, + "learning_rate": 1.9953094030865197e-05, + "loss": 1.2554, + "step": 1248 + }, + { + "epoch": 0.06, + "grad_norm": 1.0464557773522094, + "learning_rate": 1.995294320506688e-05, + "loss": 1.2686, + "step": 1249 + }, + { + "epoch": 0.06, + "grad_norm": 1.286866906481843, + "learning_rate": 1.9952792137739908e-05, + "loss": 1.2329, + "step": 1250 + }, + { + "epoch": 0.06, + "grad_norm": 1.7802029262338075, + "learning_rate": 1.995264082888794e-05, + "loss": 1.3354, + "step": 1251 + }, + { + "epoch": 0.06, + "grad_norm": 1.3584798556192952, + "learning_rate": 1.9952489278514644e-05, + "loss": 1.3096, + "step": 1252 + }, + { + "epoch": 0.06, + "grad_norm": 1.1536646876550822, + "learning_rate": 1.9952337486623704e-05, + "loss": 1.2271, + "step": 1253 + }, + { + "epoch": 0.06, + "grad_norm": 1.1733279836514077, + "learning_rate": 1.9952185453218803e-05, + "loss": 1.251, + "step": 1254 + }, + { + "epoch": 0.06, + "grad_norm": 0.9328158037883202, + "learning_rate": 1.9952033178303632e-05, + "loss": 1.1709, + "step": 1255 + }, + { + "epoch": 0.06, + "grad_norm": 1.2231153708462144, + "learning_rate": 1.995188066188188e-05, + "loss": 1.1377, + "step": 1256 + }, + { + "epoch": 0.06, + "grad_norm": 1.365406926993071, + "learning_rate": 1.9951727903957252e-05, + "loss": 0.9807, + "step": 1257 + }, + { + "epoch": 0.06, + "grad_norm": 1.393741522021695, + "learning_rate": 1.9951574904533456e-05, + "loss": 1.124, + "step": 1258 + }, + { + "epoch": 0.06, + "grad_norm": 1.1703974884170252, + "learning_rate": 1.9951421663614204e-05, + "loss": 1.3628, + "step": 1259 + }, + { + "epoch": 0.06, + "grad_norm": 1.3550930677001618, + "learning_rate": 1.9951268181203213e-05, + "loss": 1.4443, + "step": 1260 + }, + { + "epoch": 0.06, + "grad_norm": 1.2995875089008637, + "learning_rate": 1.995111445730421e-05, + "loss": 1.4395, + "step": 1261 + }, + { + "epoch": 0.06, + "grad_norm": 1.6041301104242178, + "learning_rate": 1.9950960491920923e-05, + "loss": 1.3838, + "step": 1262 + }, + { + "epoch": 0.06, + "grad_norm": 1.3617842935784956, + "learning_rate": 1.9950806285057092e-05, + "loss": 1.4146, + "step": 1263 + }, + { + "epoch": 0.06, + "grad_norm": 1.1917380968021707, + "learning_rate": 1.9950651836716453e-05, + "loss": 1.2012, + "step": 1264 + }, + { + "epoch": 0.06, + "grad_norm": 1.4911871138398949, + "learning_rate": 1.9950497146902757e-05, + "loss": 1.3071, + "step": 1265 + }, + { + "epoch": 0.06, + "grad_norm": 1.3596208582085392, + "learning_rate": 1.9950342215619764e-05, + "loss": 1.3115, + "step": 1266 + }, + { + "epoch": 0.06, + "grad_norm": 1.2542492962960106, + "learning_rate": 1.9950187042871226e-05, + "loss": 1.3838, + "step": 1267 + }, + { + "epoch": 0.06, + "grad_norm": 1.2812399628302293, + "learning_rate": 1.995003162866091e-05, + "loss": 1.208, + "step": 1268 + }, + { + "epoch": 0.06, + "grad_norm": 1.297500216398593, + "learning_rate": 1.994987597299259e-05, + "loss": 1.082, + "step": 1269 + }, + { + "epoch": 0.06, + "grad_norm": 1.3053800157913977, + "learning_rate": 1.994972007587004e-05, + "loss": 1.3872, + "step": 1270 + }, + { + "epoch": 0.06, + "grad_norm": 1.5337097542258213, + "learning_rate": 1.9949563937297045e-05, + "loss": 1.3774, + "step": 1271 + }, + { + "epoch": 0.06, + "grad_norm": 1.125004238135324, + "learning_rate": 1.9949407557277394e-05, + "loss": 1.3018, + "step": 1272 + }, + { + "epoch": 0.06, + "grad_norm": 1.2074938600316385, + "learning_rate": 1.9949250935814884e-05, + "loss": 1.2852, + "step": 1273 + }, + { + "epoch": 0.06, + "grad_norm": 1.4033995706433413, + "learning_rate": 1.994909407291331e-05, + "loss": 1.1458, + "step": 1274 + }, + { + "epoch": 0.06, + "grad_norm": 1.2663365713129235, + "learning_rate": 1.9948936968576483e-05, + "loss": 1.2007, + "step": 1275 + }, + { + "epoch": 0.06, + "grad_norm": 1.9732730010784716, + "learning_rate": 1.9948779622808215e-05, + "loss": 1.5381, + "step": 1276 + }, + { + "epoch": 0.06, + "grad_norm": 1.2967888900004794, + "learning_rate": 1.9948622035612326e-05, + "loss": 1.167, + "step": 1277 + }, + { + "epoch": 0.06, + "grad_norm": 1.0589116558130922, + "learning_rate": 1.9948464206992635e-05, + "loss": 1.2319, + "step": 1278 + }, + { + "epoch": 0.06, + "grad_norm": 1.371921469653475, + "learning_rate": 1.9948306136952976e-05, + "loss": 1.3428, + "step": 1279 + }, + { + "epoch": 0.06, + "grad_norm": 1.2649379702790597, + "learning_rate": 1.9948147825497184e-05, + "loss": 1.3208, + "step": 1280 + }, + { + "epoch": 0.06, + "grad_norm": 1.0884103595084624, + "learning_rate": 1.99479892726291e-05, + "loss": 1.1987, + "step": 1281 + }, + { + "epoch": 0.06, + "grad_norm": 1.396697050028248, + "learning_rate": 1.9947830478352578e-05, + "loss": 1.2285, + "step": 1282 + }, + { + "epoch": 0.06, + "grad_norm": 1.4652892035840763, + "learning_rate": 1.994767144267146e-05, + "loss": 1.3389, + "step": 1283 + }, + { + "epoch": 0.06, + "grad_norm": 1.5894635822180903, + "learning_rate": 1.994751216558961e-05, + "loss": 1.2773, + "step": 1284 + }, + { + "epoch": 0.06, + "grad_norm": 1.5236110869070074, + "learning_rate": 1.9947352647110895e-05, + "loss": 1.3784, + "step": 1285 + }, + { + "epoch": 0.06, + "grad_norm": 1.5149126773094834, + "learning_rate": 1.9947192887239188e-05, + "loss": 1.334, + "step": 1286 + }, + { + "epoch": 0.06, + "grad_norm": 1.184195971447859, + "learning_rate": 1.9947032885978365e-05, + "loss": 1.248, + "step": 1287 + }, + { + "epoch": 0.06, + "grad_norm": 1.4476136379552171, + "learning_rate": 1.99468726433323e-05, + "loss": 1.2671, + "step": 1288 + }, + { + "epoch": 0.06, + "grad_norm": 1.2968907566338115, + "learning_rate": 1.9946712159304894e-05, + "loss": 1.4019, + "step": 1289 + }, + { + "epoch": 0.06, + "grad_norm": 1.2321153544339347, + "learning_rate": 1.9946551433900033e-05, + "loss": 1.4419, + "step": 1290 + }, + { + "epoch": 0.06, + "grad_norm": 1.1562539628242532, + "learning_rate": 1.9946390467121627e-05, + "loss": 1.23, + "step": 1291 + }, + { + "epoch": 0.06, + "grad_norm": 1.2818727390920253, + "learning_rate": 1.994622925897357e-05, + "loss": 1.4551, + "step": 1292 + }, + { + "epoch": 0.06, + "grad_norm": 1.439566200317379, + "learning_rate": 1.994606780945978e-05, + "loss": 1.2866, + "step": 1293 + }, + { + "epoch": 0.06, + "grad_norm": 1.2248805255492867, + "learning_rate": 1.9945906118584173e-05, + "loss": 1.2651, + "step": 1294 + }, + { + "epoch": 0.06, + "grad_norm": 1.2895242662428423, + "learning_rate": 1.994574418635068e-05, + "loss": 1.2354, + "step": 1295 + }, + { + "epoch": 0.06, + "grad_norm": 1.2051327639930187, + "learning_rate": 1.994558201276322e-05, + "loss": 1.3589, + "step": 1296 + }, + { + "epoch": 0.06, + "grad_norm": 1.2218824120117846, + "learning_rate": 1.9945419597825736e-05, + "loss": 1.3154, + "step": 1297 + }, + { + "epoch": 0.06, + "grad_norm": 1.4508929015520482, + "learning_rate": 1.9945256941542163e-05, + "loss": 1.4048, + "step": 1298 + }, + { + "epoch": 0.06, + "grad_norm": 0.8708801647261728, + "learning_rate": 1.9945094043916456e-05, + "loss": 1.3525, + "step": 1299 + }, + { + "epoch": 0.06, + "grad_norm": 0.9579914326918091, + "learning_rate": 1.9944930904952564e-05, + "loss": 1.2129, + "step": 1300 + }, + { + "epoch": 0.06, + "grad_norm": 1.0087790790346514, + "learning_rate": 1.9944767524654446e-05, + "loss": 1.166, + "step": 1301 + }, + { + "epoch": 0.06, + "grad_norm": 1.2192589641900438, + "learning_rate": 1.9944603903026064e-05, + "loss": 1.2939, + "step": 1302 + }, + { + "epoch": 0.06, + "grad_norm": 1.3659336768010804, + "learning_rate": 1.9944440040071392e-05, + "loss": 1.1621, + "step": 1303 + }, + { + "epoch": 0.06, + "grad_norm": 1.472047190540106, + "learning_rate": 1.9944275935794407e-05, + "loss": 1.124, + "step": 1304 + }, + { + "epoch": 0.06, + "grad_norm": 1.5450025048461162, + "learning_rate": 1.9944111590199088e-05, + "loss": 1.3579, + "step": 1305 + }, + { + "epoch": 0.06, + "grad_norm": 1.3225064598105063, + "learning_rate": 1.994394700328943e-05, + "loss": 1.2134, + "step": 1306 + }, + { + "epoch": 0.06, + "grad_norm": 1.1863429912280317, + "learning_rate": 1.994378217506942e-05, + "loss": 1.2075, + "step": 1307 + }, + { + "epoch": 0.06, + "grad_norm": 1.2683223640801193, + "learning_rate": 1.994361710554306e-05, + "loss": 1.0977, + "step": 1308 + }, + { + "epoch": 0.06, + "grad_norm": 1.472763852052605, + "learning_rate": 1.9943451794714354e-05, + "loss": 1.2231, + "step": 1309 + }, + { + "epoch": 0.06, + "grad_norm": 1.2699309476000522, + "learning_rate": 1.994328624258732e-05, + "loss": 1.2026, + "step": 1310 + }, + { + "epoch": 0.06, + "grad_norm": 1.2505712299051948, + "learning_rate": 1.9943120449165963e-05, + "loss": 1.1123, + "step": 1311 + }, + { + "epoch": 0.06, + "grad_norm": 1.348282844968032, + "learning_rate": 1.9942954414454322e-05, + "loss": 1.2944, + "step": 1312 + }, + { + "epoch": 0.06, + "grad_norm": 1.4110775760992165, + "learning_rate": 1.9942788138456418e-05, + "loss": 1.3853, + "step": 1313 + }, + { + "epoch": 0.06, + "grad_norm": 1.4753986978839488, + "learning_rate": 1.9942621621176283e-05, + "loss": 1.2661, + "step": 1314 + }, + { + "epoch": 0.06, + "grad_norm": 1.4816304914246252, + "learning_rate": 1.9942454862617962e-05, + "loss": 1.2441, + "step": 1315 + }, + { + "epoch": 0.06, + "grad_norm": 1.4089683236478527, + "learning_rate": 1.9942287862785502e-05, + "loss": 1.3994, + "step": 1316 + }, + { + "epoch": 0.06, + "grad_norm": 1.55150202861976, + "learning_rate": 1.9942120621682957e-05, + "loss": 1.3525, + "step": 1317 + }, + { + "epoch": 0.06, + "grad_norm": 1.6127446011686273, + "learning_rate": 1.994195313931438e-05, + "loss": 1.3975, + "step": 1318 + }, + { + "epoch": 0.06, + "grad_norm": 1.248655141878362, + "learning_rate": 1.994178541568384e-05, + "loss": 1.4741, + "step": 1319 + }, + { + "epoch": 0.06, + "grad_norm": 1.6931543595463938, + "learning_rate": 1.9941617450795406e-05, + "loss": 1.3164, + "step": 1320 + }, + { + "epoch": 0.06, + "grad_norm": 1.2494081960015793, + "learning_rate": 1.9941449244653154e-05, + "loss": 1.2188, + "step": 1321 + }, + { + "epoch": 0.06, + "grad_norm": 0.9613072570667507, + "learning_rate": 1.9941280797261163e-05, + "loss": 1.0884, + "step": 1322 + }, + { + "epoch": 0.06, + "grad_norm": 1.0406840732447598, + "learning_rate": 1.9941112108623523e-05, + "loss": 1.2163, + "step": 1323 + }, + { + "epoch": 0.06, + "grad_norm": 1.1864671485546363, + "learning_rate": 1.9940943178744333e-05, + "loss": 1.2002, + "step": 1324 + }, + { + "epoch": 0.06, + "grad_norm": 1.4703775677346624, + "learning_rate": 1.9940774007627684e-05, + "loss": 1.1768, + "step": 1325 + }, + { + "epoch": 0.06, + "grad_norm": 1.5021683558422232, + "learning_rate": 1.9940604595277687e-05, + "loss": 1.3594, + "step": 1326 + }, + { + "epoch": 0.06, + "grad_norm": 1.257417975740565, + "learning_rate": 1.9940434941698447e-05, + "loss": 1.3262, + "step": 1327 + }, + { + "epoch": 0.06, + "grad_norm": 1.121121314135909, + "learning_rate": 1.9940265046894086e-05, + "loss": 1.2344, + "step": 1328 + }, + { + "epoch": 0.06, + "grad_norm": 1.0915395672259784, + "learning_rate": 1.9940094910868727e-05, + "loss": 1.2251, + "step": 1329 + }, + { + "epoch": 0.06, + "grad_norm": 1.28639539286163, + "learning_rate": 1.9939924533626492e-05, + "loss": 1.1787, + "step": 1330 + }, + { + "epoch": 0.06, + "grad_norm": 1.435184693101793, + "learning_rate": 1.993975391517153e-05, + "loss": 1.1221, + "step": 1331 + }, + { + "epoch": 0.06, + "grad_norm": 1.2957081159201869, + "learning_rate": 1.9939583055507964e-05, + "loss": 1.2109, + "step": 1332 + }, + { + "epoch": 0.06, + "grad_norm": 1.3229321740622841, + "learning_rate": 1.9939411954639953e-05, + "loss": 1.4004, + "step": 1333 + }, + { + "epoch": 0.06, + "grad_norm": 1.4300079959553285, + "learning_rate": 1.9939240612571642e-05, + "loss": 1.4131, + "step": 1334 + }, + { + "epoch": 0.06, + "grad_norm": 1.6863993585990351, + "learning_rate": 1.9939069029307193e-05, + "loss": 1.293, + "step": 1335 + }, + { + "epoch": 0.06, + "grad_norm": 1.2927305247390872, + "learning_rate": 1.993889720485077e-05, + "loss": 1.2075, + "step": 1336 + }, + { + "epoch": 0.06, + "grad_norm": 1.3234304136185842, + "learning_rate": 1.993872513920654e-05, + "loss": 1.5342, + "step": 1337 + }, + { + "epoch": 0.06, + "grad_norm": 1.171711948130348, + "learning_rate": 1.993855283237868e-05, + "loss": 1.1953, + "step": 1338 + }, + { + "epoch": 0.06, + "grad_norm": 1.442382354987733, + "learning_rate": 1.993838028437137e-05, + "loss": 1.2437, + "step": 1339 + }, + { + "epoch": 0.06, + "grad_norm": 1.449520592327356, + "learning_rate": 1.9938207495188797e-05, + "loss": 1.2607, + "step": 1340 + }, + { + "epoch": 0.06, + "grad_norm": 1.0691742024503519, + "learning_rate": 1.9938034464835155e-05, + "loss": 1.1895, + "step": 1341 + }, + { + "epoch": 0.06, + "grad_norm": 1.1548860976525601, + "learning_rate": 1.9937861193314648e-05, + "loss": 1.3018, + "step": 1342 + }, + { + "epoch": 0.06, + "grad_norm": 1.142832178627285, + "learning_rate": 1.9937687680631473e-05, + "loss": 1.2114, + "step": 1343 + }, + { + "epoch": 0.06, + "grad_norm": 1.2772702918666565, + "learning_rate": 1.9937513926789843e-05, + "loss": 1.3818, + "step": 1344 + }, + { + "epoch": 0.06, + "grad_norm": 1.2234163539902638, + "learning_rate": 1.9937339931793975e-05, + "loss": 1.2837, + "step": 1345 + }, + { + "epoch": 0.06, + "grad_norm": 1.0254636156558115, + "learning_rate": 1.9937165695648092e-05, + "loss": 1.3755, + "step": 1346 + }, + { + "epoch": 0.06, + "grad_norm": 1.2212329869291185, + "learning_rate": 1.993699121835642e-05, + "loss": 1.1953, + "step": 1347 + }, + { + "epoch": 0.06, + "grad_norm": 0.927080231663782, + "learning_rate": 1.9936816499923198e-05, + "loss": 1.3765, + "step": 1348 + }, + { + "epoch": 0.06, + "grad_norm": 1.2762963244472094, + "learning_rate": 1.9936641540352665e-05, + "loss": 1.3057, + "step": 1349 + }, + { + "epoch": 0.06, + "grad_norm": 1.3982538698603952, + "learning_rate": 1.993646633964906e-05, + "loss": 1.2593, + "step": 1350 + }, + { + "epoch": 0.06, + "grad_norm": 1.1462402150254367, + "learning_rate": 1.993629089781664e-05, + "loss": 1.2104, + "step": 1351 + }, + { + "epoch": 0.07, + "grad_norm": 1.0834354215367321, + "learning_rate": 1.9936115214859663e-05, + "loss": 1.1821, + "step": 1352 + }, + { + "epoch": 0.07, + "grad_norm": 1.305800458690814, + "learning_rate": 1.9935939290782386e-05, + "loss": 1.0305, + "step": 1353 + }, + { + "epoch": 0.07, + "grad_norm": 1.2161794332576066, + "learning_rate": 1.9935763125589086e-05, + "loss": 1.2417, + "step": 1354 + }, + { + "epoch": 0.07, + "grad_norm": 1.2993021021247946, + "learning_rate": 1.9935586719284036e-05, + "loss": 1.3936, + "step": 1355 + }, + { + "epoch": 0.07, + "grad_norm": 1.1982271054726785, + "learning_rate": 1.9935410071871514e-05, + "loss": 1.3086, + "step": 1356 + }, + { + "epoch": 0.07, + "grad_norm": 1.037615113640751, + "learning_rate": 1.993523318335581e-05, + "loss": 1.3188, + "step": 1357 + }, + { + "epoch": 0.07, + "grad_norm": 1.444077956757576, + "learning_rate": 1.9935056053741216e-05, + "loss": 1.2871, + "step": 1358 + }, + { + "epoch": 0.07, + "grad_norm": 1.2543192523643627, + "learning_rate": 1.9934878683032028e-05, + "loss": 1.2144, + "step": 1359 + }, + { + "epoch": 0.07, + "grad_norm": 1.2995480228195606, + "learning_rate": 1.993470107123255e-05, + "loss": 1.1572, + "step": 1360 + }, + { + "epoch": 0.07, + "grad_norm": 1.4480398694941132, + "learning_rate": 1.9934523218347096e-05, + "loss": 1.3926, + "step": 1361 + }, + { + "epoch": 0.07, + "grad_norm": 1.3825211370149932, + "learning_rate": 1.993434512437998e-05, + "loss": 1.3301, + "step": 1362 + }, + { + "epoch": 0.07, + "grad_norm": 1.1622085955295531, + "learning_rate": 1.9934166789335526e-05, + "loss": 1.3032, + "step": 1363 + }, + { + "epoch": 0.07, + "grad_norm": 1.2323791671836448, + "learning_rate": 1.9933988213218054e-05, + "loss": 1.1133, + "step": 1364 + }, + { + "epoch": 0.07, + "grad_norm": 1.5556026620591226, + "learning_rate": 1.9933809396031908e-05, + "loss": 1.4131, + "step": 1365 + }, + { + "epoch": 0.07, + "grad_norm": 1.211147726914795, + "learning_rate": 1.993363033778142e-05, + "loss": 1.3223, + "step": 1366 + }, + { + "epoch": 0.07, + "grad_norm": 1.2275874073012218, + "learning_rate": 1.9933451038470936e-05, + "loss": 1.2915, + "step": 1367 + }, + { + "epoch": 0.07, + "grad_norm": 1.5045309682465917, + "learning_rate": 1.9933271498104808e-05, + "loss": 1.354, + "step": 1368 + }, + { + "epoch": 0.07, + "grad_norm": 1.3556982898542607, + "learning_rate": 1.9933091716687397e-05, + "loss": 1.1379, + "step": 1369 + }, + { + "epoch": 0.07, + "grad_norm": 1.132605795114454, + "learning_rate": 1.9932911694223064e-05, + "loss": 1.2886, + "step": 1370 + }, + { + "epoch": 0.07, + "grad_norm": 1.32993469859226, + "learning_rate": 1.993273143071617e-05, + "loss": 1.2256, + "step": 1371 + }, + { + "epoch": 0.07, + "grad_norm": 1.3776664628802224, + "learning_rate": 1.9932550926171096e-05, + "loss": 1.4473, + "step": 1372 + }, + { + "epoch": 0.07, + "grad_norm": 1.238845158569798, + "learning_rate": 1.9932370180592226e-05, + "loss": 1.3281, + "step": 1373 + }, + { + "epoch": 0.07, + "grad_norm": 1.4682763463046296, + "learning_rate": 1.9932189193983937e-05, + "loss": 1.3276, + "step": 1374 + }, + { + "epoch": 0.07, + "grad_norm": 1.2174430556961697, + "learning_rate": 1.9932007966350627e-05, + "loss": 1.1646, + "step": 1375 + }, + { + "epoch": 0.07, + "grad_norm": 1.2076662667786797, + "learning_rate": 1.9931826497696694e-05, + "loss": 1.1763, + "step": 1376 + }, + { + "epoch": 0.07, + "grad_norm": 1.2079638090979452, + "learning_rate": 1.993164478802654e-05, + "loss": 1.3047, + "step": 1377 + }, + { + "epoch": 0.07, + "grad_norm": 1.2719426115206187, + "learning_rate": 1.9931462837344578e-05, + "loss": 1.124, + "step": 1378 + }, + { + "epoch": 0.07, + "grad_norm": 1.5243411469129524, + "learning_rate": 1.9931280645655216e-05, + "loss": 1.3022, + "step": 1379 + }, + { + "epoch": 0.07, + "grad_norm": 1.2152936829799958, + "learning_rate": 1.993109821296288e-05, + "loss": 1.2314, + "step": 1380 + }, + { + "epoch": 0.07, + "grad_norm": 1.267029554317984, + "learning_rate": 1.9930915539271996e-05, + "loss": 1.2378, + "step": 1381 + }, + { + "epoch": 0.07, + "grad_norm": 1.4557000863168827, + "learning_rate": 1.9930732624587e-05, + "loss": 1.377, + "step": 1382 + }, + { + "epoch": 0.07, + "grad_norm": 1.4864072026830715, + "learning_rate": 1.9930549468912326e-05, + "loss": 1.2578, + "step": 1383 + }, + { + "epoch": 0.07, + "grad_norm": 1.3776042993007556, + "learning_rate": 1.9930366072252424e-05, + "loss": 1.2178, + "step": 1384 + }, + { + "epoch": 0.07, + "grad_norm": 1.383563413897334, + "learning_rate": 1.9930182434611736e-05, + "loss": 1.2134, + "step": 1385 + }, + { + "epoch": 0.07, + "grad_norm": 1.3108136252968499, + "learning_rate": 1.9929998555994732e-05, + "loss": 1.3867, + "step": 1386 + }, + { + "epoch": 0.07, + "grad_norm": 1.2363894243578233, + "learning_rate": 1.992981443640586e-05, + "loss": 1.4189, + "step": 1387 + }, + { + "epoch": 0.07, + "grad_norm": 1.2714197773673799, + "learning_rate": 1.9929630075849597e-05, + "loss": 1.2007, + "step": 1388 + }, + { + "epoch": 0.07, + "grad_norm": 1.6806202308037703, + "learning_rate": 1.9929445474330413e-05, + "loss": 1.2041, + "step": 1389 + }, + { + "epoch": 0.07, + "grad_norm": 1.4196269843479719, + "learning_rate": 1.9929260631852792e-05, + "loss": 1.3335, + "step": 1390 + }, + { + "epoch": 0.07, + "grad_norm": 1.4957491469644797, + "learning_rate": 1.992907554842121e-05, + "loss": 1.1992, + "step": 1391 + }, + { + "epoch": 0.07, + "grad_norm": 1.3156499266476749, + "learning_rate": 1.9928890224040168e-05, + "loss": 1.2129, + "step": 1392 + }, + { + "epoch": 0.07, + "grad_norm": 1.223826089667699, + "learning_rate": 1.992870465871416e-05, + "loss": 1.2004, + "step": 1393 + }, + { + "epoch": 0.07, + "grad_norm": 1.334007652230572, + "learning_rate": 1.992851885244769e-05, + "loss": 1.2368, + "step": 1394 + }, + { + "epoch": 0.07, + "grad_norm": 1.6228962034131904, + "learning_rate": 1.9928332805245266e-05, + "loss": 1.2549, + "step": 1395 + }, + { + "epoch": 0.07, + "grad_norm": 1.5205183212992135, + "learning_rate": 1.9928146517111404e-05, + "loss": 1.3491, + "step": 1396 + }, + { + "epoch": 0.07, + "grad_norm": 1.6062500598407505, + "learning_rate": 1.9927959988050622e-05, + "loss": 1.209, + "step": 1397 + }, + { + "epoch": 0.07, + "grad_norm": 1.2325180103622548, + "learning_rate": 1.992777321806745e-05, + "loss": 1.1982, + "step": 1398 + }, + { + "epoch": 0.07, + "grad_norm": 1.1152678341927806, + "learning_rate": 1.9927586207166417e-05, + "loss": 1.1309, + "step": 1399 + }, + { + "epoch": 0.07, + "grad_norm": 1.4096037907596766, + "learning_rate": 1.9927398955352062e-05, + "loss": 1.3442, + "step": 1400 + }, + { + "epoch": 0.07, + "grad_norm": 1.3007957112164257, + "learning_rate": 1.992721146262893e-05, + "loss": 1.3384, + "step": 1401 + }, + { + "epoch": 0.07, + "grad_norm": 1.2546083146961347, + "learning_rate": 1.992702372900157e-05, + "loss": 1.252, + "step": 1402 + }, + { + "epoch": 0.07, + "grad_norm": 1.3753501992500714, + "learning_rate": 1.9926835754474543e-05, + "loss": 1.4658, + "step": 1403 + }, + { + "epoch": 0.07, + "grad_norm": 1.2164889583185936, + "learning_rate": 1.9926647539052403e-05, + "loss": 1.2334, + "step": 1404 + }, + { + "epoch": 0.07, + "grad_norm": 1.1240347480776658, + "learning_rate": 1.9926459082739717e-05, + "loss": 1.3125, + "step": 1405 + }, + { + "epoch": 0.07, + "grad_norm": 1.156936948968369, + "learning_rate": 1.9926270385541067e-05, + "loss": 1.1958, + "step": 1406 + }, + { + "epoch": 0.07, + "grad_norm": 1.8440576725815776, + "learning_rate": 1.9926081447461025e-05, + "loss": 1.4282, + "step": 1407 + }, + { + "epoch": 0.07, + "grad_norm": 1.28041784047807, + "learning_rate": 1.9925892268504176e-05, + "loss": 1.3638, + "step": 1408 + }, + { + "epoch": 0.07, + "grad_norm": 1.4118294107265381, + "learning_rate": 1.9925702848675117e-05, + "loss": 1.5029, + "step": 1409 + }, + { + "epoch": 0.07, + "grad_norm": 0.8418050887003885, + "learning_rate": 1.9925513187978437e-05, + "loss": 1.2466, + "step": 1410 + }, + { + "epoch": 0.07, + "grad_norm": 1.26568793304336, + "learning_rate": 1.992532328641874e-05, + "loss": 1.1118, + "step": 1411 + }, + { + "epoch": 0.07, + "grad_norm": 1.3990228932998787, + "learning_rate": 1.9925133144000643e-05, + "loss": 1.1753, + "step": 1412 + }, + { + "epoch": 0.07, + "grad_norm": 1.1771885673990807, + "learning_rate": 1.9924942760728748e-05, + "loss": 1.436, + "step": 1413 + }, + { + "epoch": 0.07, + "grad_norm": 1.0964211591170832, + "learning_rate": 1.992475213660768e-05, + "loss": 1.1934, + "step": 1414 + }, + { + "epoch": 0.07, + "grad_norm": 1.5033229338026457, + "learning_rate": 1.9924561271642066e-05, + "loss": 1.3525, + "step": 1415 + }, + { + "epoch": 0.07, + "grad_norm": 1.3266353831658246, + "learning_rate": 1.992437016583654e-05, + "loss": 1.2612, + "step": 1416 + }, + { + "epoch": 0.07, + "grad_norm": 1.1138066027238473, + "learning_rate": 1.9924178819195732e-05, + "loss": 1.3315, + "step": 1417 + }, + { + "epoch": 0.07, + "grad_norm": 1.2084269745740877, + "learning_rate": 1.992398723172429e-05, + "loss": 1.2661, + "step": 1418 + }, + { + "epoch": 0.07, + "grad_norm": 1.4515500640698449, + "learning_rate": 1.9923795403426865e-05, + "loss": 1.334, + "step": 1419 + }, + { + "epoch": 0.07, + "grad_norm": 1.4031900836937272, + "learning_rate": 1.9923603334308114e-05, + "loss": 1.4062, + "step": 1420 + }, + { + "epoch": 0.07, + "grad_norm": 1.3790162630535476, + "learning_rate": 1.992341102437269e-05, + "loss": 1.2788, + "step": 1421 + }, + { + "epoch": 0.07, + "grad_norm": 1.4016628014803134, + "learning_rate": 1.9923218473625264e-05, + "loss": 1.293, + "step": 1422 + }, + { + "epoch": 0.07, + "grad_norm": 1.4946429940300798, + "learning_rate": 1.992302568207051e-05, + "loss": 1.2676, + "step": 1423 + }, + { + "epoch": 0.07, + "grad_norm": 1.4223738012660017, + "learning_rate": 1.9922832649713108e-05, + "loss": 1.437, + "step": 1424 + }, + { + "epoch": 0.07, + "grad_norm": 1.2134851088597227, + "learning_rate": 1.9922639376557734e-05, + "loss": 1.1807, + "step": 1425 + }, + { + "epoch": 0.07, + "grad_norm": 1.626068459793825, + "learning_rate": 1.9922445862609088e-05, + "loss": 1.2466, + "step": 1426 + }, + { + "epoch": 0.07, + "grad_norm": 1.2291064910557952, + "learning_rate": 1.992225210787186e-05, + "loss": 1.2739, + "step": 1427 + }, + { + "epoch": 0.07, + "grad_norm": 1.1577841037186118, + "learning_rate": 1.9922058112350754e-05, + "loss": 1.0413, + "step": 1428 + }, + { + "epoch": 0.07, + "grad_norm": 1.0278786394752673, + "learning_rate": 1.992186387605048e-05, + "loss": 1.3018, + "step": 1429 + }, + { + "epoch": 0.07, + "grad_norm": 1.369540039125186, + "learning_rate": 1.9921669398975745e-05, + "loss": 1.2129, + "step": 1430 + }, + { + "epoch": 0.07, + "grad_norm": 1.3628679703644266, + "learning_rate": 1.9921474681131273e-05, + "loss": 1.3179, + "step": 1431 + }, + { + "epoch": 0.07, + "grad_norm": 1.137980072856824, + "learning_rate": 1.992127972252179e-05, + "loss": 1.1699, + "step": 1432 + }, + { + "epoch": 0.07, + "grad_norm": 1.2337385382160426, + "learning_rate": 1.992108452315202e-05, + "loss": 1.2998, + "step": 1433 + }, + { + "epoch": 0.07, + "grad_norm": 1.3849109007443743, + "learning_rate": 1.9920889083026716e-05, + "loss": 1.2856, + "step": 1434 + }, + { + "epoch": 0.07, + "grad_norm": 1.2391311036418342, + "learning_rate": 1.9920693402150604e-05, + "loss": 1.2549, + "step": 1435 + }, + { + "epoch": 0.07, + "grad_norm": 1.1789559684174638, + "learning_rate": 1.992049748052844e-05, + "loss": 1.2744, + "step": 1436 + }, + { + "epoch": 0.07, + "grad_norm": 1.245715591223344, + "learning_rate": 1.9920301318164978e-05, + "loss": 1.334, + "step": 1437 + }, + { + "epoch": 0.07, + "grad_norm": 1.295265660739528, + "learning_rate": 1.9920104915064974e-05, + "loss": 1.4155, + "step": 1438 + }, + { + "epoch": 0.07, + "grad_norm": 1.4348875943250352, + "learning_rate": 1.9919908271233198e-05, + "loss": 1.2363, + "step": 1439 + }, + { + "epoch": 0.07, + "grad_norm": 1.2527831010634491, + "learning_rate": 1.9919711386674425e-05, + "loss": 1.333, + "step": 1440 + }, + { + "epoch": 0.07, + "grad_norm": 1.2241122136101408, + "learning_rate": 1.991951426139343e-05, + "loss": 1.1523, + "step": 1441 + }, + { + "epoch": 0.07, + "grad_norm": 1.4375264190581758, + "learning_rate": 1.9919316895394993e-05, + "loss": 1.2871, + "step": 1442 + }, + { + "epoch": 0.07, + "grad_norm": 1.1756348699771237, + "learning_rate": 1.9919119288683908e-05, + "loss": 1.2241, + "step": 1443 + }, + { + "epoch": 0.07, + "grad_norm": 1.1770647408325017, + "learning_rate": 1.9918921441264966e-05, + "loss": 1.2041, + "step": 1444 + }, + { + "epoch": 0.07, + "grad_norm": 1.251305447796246, + "learning_rate": 1.9918723353142973e-05, + "loss": 1.3398, + "step": 1445 + }, + { + "epoch": 0.07, + "grad_norm": 1.0058090428602453, + "learning_rate": 1.9918525024322738e-05, + "loss": 1.2974, + "step": 1446 + }, + { + "epoch": 0.07, + "grad_norm": 1.500605048075872, + "learning_rate": 1.9918326454809066e-05, + "loss": 1.4976, + "step": 1447 + }, + { + "epoch": 0.07, + "grad_norm": 1.1913521669028075, + "learning_rate": 1.991812764460678e-05, + "loss": 1.0981, + "step": 1448 + }, + { + "epoch": 0.07, + "grad_norm": 1.1976542593605477, + "learning_rate": 1.9917928593720705e-05, + "loss": 1.2393, + "step": 1449 + }, + { + "epoch": 0.07, + "grad_norm": 1.3392154133794578, + "learning_rate": 1.991772930215567e-05, + "loss": 1.1665, + "step": 1450 + }, + { + "epoch": 0.07, + "grad_norm": 1.3280140970821812, + "learning_rate": 1.9917529769916513e-05, + "loss": 1.4048, + "step": 1451 + }, + { + "epoch": 0.07, + "grad_norm": 1.765722542644754, + "learning_rate": 1.9917329997008075e-05, + "loss": 1.2114, + "step": 1452 + }, + { + "epoch": 0.07, + "grad_norm": 1.796473971202343, + "learning_rate": 1.99171299834352e-05, + "loss": 1.3452, + "step": 1453 + }, + { + "epoch": 0.07, + "grad_norm": 1.2322445064718075, + "learning_rate": 1.991692972920275e-05, + "loss": 1.437, + "step": 1454 + }, + { + "epoch": 0.07, + "grad_norm": 1.3784514298298651, + "learning_rate": 1.991672923431558e-05, + "loss": 1.1514, + "step": 1455 + }, + { + "epoch": 0.07, + "grad_norm": 1.0411835252057078, + "learning_rate": 1.9916528498778554e-05, + "loss": 1.2026, + "step": 1456 + }, + { + "epoch": 0.07, + "grad_norm": 1.4621490652590339, + "learning_rate": 1.9916327522596545e-05, + "loss": 1.2837, + "step": 1457 + }, + { + "epoch": 0.07, + "grad_norm": 1.363472977369013, + "learning_rate": 1.9916126305774427e-05, + "loss": 1.2188, + "step": 1458 + }, + { + "epoch": 0.07, + "grad_norm": 1.3042743372597452, + "learning_rate": 1.991592484831709e-05, + "loss": 1.2515, + "step": 1459 + }, + { + "epoch": 0.07, + "grad_norm": 1.46964437455073, + "learning_rate": 1.9915723150229417e-05, + "loss": 1.3105, + "step": 1460 + }, + { + "epoch": 0.07, + "grad_norm": 1.3973980507752068, + "learning_rate": 1.9915521211516307e-05, + "loss": 1.3276, + "step": 1461 + }, + { + "epoch": 0.07, + "grad_norm": 1.4104210777910715, + "learning_rate": 1.9915319032182655e-05, + "loss": 1.2993, + "step": 1462 + }, + { + "epoch": 0.07, + "grad_norm": 1.1720659592481752, + "learning_rate": 1.9915116612233367e-05, + "loss": 1.2026, + "step": 1463 + }, + { + "epoch": 0.07, + "grad_norm": 1.0166718129232097, + "learning_rate": 1.991491395167336e-05, + "loss": 1.355, + "step": 1464 + }, + { + "epoch": 0.07, + "grad_norm": 1.2034308260207902, + "learning_rate": 1.9914711050507556e-05, + "loss": 1.1782, + "step": 1465 + }, + { + "epoch": 0.07, + "grad_norm": 1.3177553048763881, + "learning_rate": 1.991450790874087e-05, + "loss": 1.1797, + "step": 1466 + }, + { + "epoch": 0.07, + "grad_norm": 1.4815300215000597, + "learning_rate": 1.991430452637823e-05, + "loss": 1.3452, + "step": 1467 + }, + { + "epoch": 0.07, + "grad_norm": 1.4485677415618512, + "learning_rate": 1.991410090342458e-05, + "loss": 1.438, + "step": 1468 + }, + { + "epoch": 0.07, + "grad_norm": 1.3351137610241712, + "learning_rate": 1.9913897039884855e-05, + "loss": 1.1685, + "step": 1469 + }, + { + "epoch": 0.07, + "grad_norm": 1.645088674098112, + "learning_rate": 1.9913692935764006e-05, + "loss": 1.3628, + "step": 1470 + }, + { + "epoch": 0.07, + "grad_norm": 1.5003155018901277, + "learning_rate": 1.9913488591066986e-05, + "loss": 1.3555, + "step": 1471 + }, + { + "epoch": 0.07, + "grad_norm": 1.5718343162437676, + "learning_rate": 1.991328400579875e-05, + "loss": 1.2598, + "step": 1472 + }, + { + "epoch": 0.07, + "grad_norm": 1.4812916850245716, + "learning_rate": 1.9913079179964266e-05, + "loss": 1.3745, + "step": 1473 + }, + { + "epoch": 0.07, + "grad_norm": 1.1226854440297376, + "learning_rate": 1.9912874113568503e-05, + "loss": 1.3823, + "step": 1474 + }, + { + "epoch": 0.07, + "grad_norm": 1.4075576260289377, + "learning_rate": 1.9912668806616437e-05, + "loss": 1.4629, + "step": 1475 + }, + { + "epoch": 0.07, + "grad_norm": 1.216170486534676, + "learning_rate": 1.9912463259113055e-05, + "loss": 1.3877, + "step": 1476 + }, + { + "epoch": 0.07, + "grad_norm": 1.2318996831006586, + "learning_rate": 1.9912257471063338e-05, + "loss": 1.1685, + "step": 1477 + }, + { + "epoch": 0.07, + "grad_norm": 1.3947620009984845, + "learning_rate": 1.9912051442472283e-05, + "loss": 1.353, + "step": 1478 + }, + { + "epoch": 0.07, + "grad_norm": 1.3001850006599116, + "learning_rate": 1.9911845173344894e-05, + "loss": 1.1216, + "step": 1479 + }, + { + "epoch": 0.07, + "grad_norm": 0.9708094925111745, + "learning_rate": 1.991163866368617e-05, + "loss": 1.1738, + "step": 1480 + }, + { + "epoch": 0.07, + "grad_norm": 1.2927166906501864, + "learning_rate": 1.991143191350112e-05, + "loss": 1.2583, + "step": 1481 + }, + { + "epoch": 0.07, + "grad_norm": 1.4595692280622141, + "learning_rate": 1.991122492279477e-05, + "loss": 1.353, + "step": 1482 + }, + { + "epoch": 0.07, + "grad_norm": 1.3604513442753057, + "learning_rate": 1.991101769157214e-05, + "loss": 1.3027, + "step": 1483 + }, + { + "epoch": 0.07, + "grad_norm": 1.3354820794666604, + "learning_rate": 1.9910810219838257e-05, + "loss": 1.2949, + "step": 1484 + }, + { + "epoch": 0.07, + "grad_norm": 1.2003770470780624, + "learning_rate": 1.991060250759816e-05, + "loss": 1.2935, + "step": 1485 + }, + { + "epoch": 0.07, + "grad_norm": 1.359718359784763, + "learning_rate": 1.991039455485688e-05, + "loss": 1.2515, + "step": 1486 + }, + { + "epoch": 0.07, + "grad_norm": 1.57738476612396, + "learning_rate": 1.9910186361619473e-05, + "loss": 1.3135, + "step": 1487 + }, + { + "epoch": 0.07, + "grad_norm": 1.2345141618691469, + "learning_rate": 1.9909977927890988e-05, + "loss": 1.355, + "step": 1488 + }, + { + "epoch": 0.07, + "grad_norm": 1.1773120429689279, + "learning_rate": 1.990976925367648e-05, + "loss": 1.2314, + "step": 1489 + }, + { + "epoch": 0.07, + "grad_norm": 1.3689849382132173, + "learning_rate": 1.9909560338981014e-05, + "loss": 1.1895, + "step": 1490 + }, + { + "epoch": 0.07, + "grad_norm": 1.2795907544594944, + "learning_rate": 1.990935118380967e-05, + "loss": 1.2905, + "step": 1491 + }, + { + "epoch": 0.07, + "grad_norm": 1.2302644483881415, + "learning_rate": 1.9909141788167506e-05, + "loss": 1.3169, + "step": 1492 + }, + { + "epoch": 0.07, + "grad_norm": 1.3145224086251992, + "learning_rate": 1.9908932152059618e-05, + "loss": 1.4199, + "step": 1493 + }, + { + "epoch": 0.07, + "grad_norm": 1.381769964469777, + "learning_rate": 1.9908722275491084e-05, + "loss": 1.2549, + "step": 1494 + }, + { + "epoch": 0.07, + "grad_norm": 1.7681629391811717, + "learning_rate": 1.9908512158467007e-05, + "loss": 1.4409, + "step": 1495 + }, + { + "epoch": 0.07, + "grad_norm": 1.1959588208284817, + "learning_rate": 1.9908301800992475e-05, + "loss": 1.2363, + "step": 1496 + }, + { + "epoch": 0.07, + "grad_norm": 1.0851397794592823, + "learning_rate": 1.99080912030726e-05, + "loss": 1.1836, + "step": 1497 + }, + { + "epoch": 0.07, + "grad_norm": 1.6596823944501662, + "learning_rate": 1.990788036471249e-05, + "loss": 1.3906, + "step": 1498 + }, + { + "epoch": 0.07, + "grad_norm": 1.2978717137036486, + "learning_rate": 1.990766928591726e-05, + "loss": 1.417, + "step": 1499 + }, + { + "epoch": 0.07, + "grad_norm": 1.448104548787884, + "learning_rate": 1.9907457966692036e-05, + "loss": 1.2358, + "step": 1500 + }, + { + "epoch": 0.07, + "grad_norm": 1.2223556211615152, + "learning_rate": 1.990724640704194e-05, + "loss": 1.3599, + "step": 1501 + }, + { + "epoch": 0.07, + "grad_norm": 1.4549745382362926, + "learning_rate": 1.990703460697211e-05, + "loss": 1.3008, + "step": 1502 + }, + { + "epoch": 0.07, + "grad_norm": 1.2613435305139584, + "learning_rate": 1.990682256648769e-05, + "loss": 1.3569, + "step": 1503 + }, + { + "epoch": 0.07, + "grad_norm": 1.5226508216800463, + "learning_rate": 1.990661028559382e-05, + "loss": 1.2544, + "step": 1504 + }, + { + "epoch": 0.07, + "grad_norm": 1.3951679675998976, + "learning_rate": 1.990639776429565e-05, + "loss": 1.2822, + "step": 1505 + }, + { + "epoch": 0.07, + "grad_norm": 1.4698148431118634, + "learning_rate": 1.9906185002598343e-05, + "loss": 1.2959, + "step": 1506 + }, + { + "epoch": 0.07, + "grad_norm": 1.0797488353148252, + "learning_rate": 1.9905972000507057e-05, + "loss": 1.0576, + "step": 1507 + }, + { + "epoch": 0.07, + "grad_norm": 1.2917481352638471, + "learning_rate": 1.9905758758026966e-05, + "loss": 1.2959, + "step": 1508 + }, + { + "epoch": 0.07, + "grad_norm": 1.2975900632757669, + "learning_rate": 1.9905545275163235e-05, + "loss": 1.1963, + "step": 1509 + }, + { + "epoch": 0.07, + "grad_norm": 1.3098096659716842, + "learning_rate": 1.9905331551921056e-05, + "loss": 1.291, + "step": 1510 + }, + { + "epoch": 0.07, + "grad_norm": 1.2133002311892283, + "learning_rate": 1.9905117588305612e-05, + "loss": 1.2266, + "step": 1511 + }, + { + "epoch": 0.07, + "grad_norm": 0.9157456773977323, + "learning_rate": 1.9904903384322095e-05, + "loss": 1.2007, + "step": 1512 + }, + { + "epoch": 0.07, + "grad_norm": 1.5340829574616839, + "learning_rate": 1.9904688939975697e-05, + "loss": 1.2354, + "step": 1513 + }, + { + "epoch": 0.07, + "grad_norm": 1.0285143206847205, + "learning_rate": 1.990447425527163e-05, + "loss": 1.2441, + "step": 1514 + }, + { + "epoch": 0.07, + "grad_norm": 1.2277967478275797, + "learning_rate": 1.99042593302151e-05, + "loss": 1.2134, + "step": 1515 + }, + { + "epoch": 0.07, + "grad_norm": 1.5261108458169501, + "learning_rate": 1.9904044164811325e-05, + "loss": 1.1528, + "step": 1516 + }, + { + "epoch": 0.07, + "grad_norm": 1.3669737565406528, + "learning_rate": 1.9903828759065524e-05, + "loss": 1.4238, + "step": 1517 + }, + { + "epoch": 0.07, + "grad_norm": 0.901020546402158, + "learning_rate": 1.9903613112982925e-05, + "loss": 1.0947, + "step": 1518 + }, + { + "epoch": 0.07, + "grad_norm": 1.361943732142787, + "learning_rate": 1.990339722656876e-05, + "loss": 1.2598, + "step": 1519 + }, + { + "epoch": 0.07, + "grad_norm": 1.3270740173221431, + "learning_rate": 1.990318109982827e-05, + "loss": 1.2759, + "step": 1520 + }, + { + "epoch": 0.07, + "grad_norm": 1.4063054821315533, + "learning_rate": 1.9902964732766702e-05, + "loss": 1.3301, + "step": 1521 + }, + { + "epoch": 0.07, + "grad_norm": 1.4473955325036976, + "learning_rate": 1.99027481253893e-05, + "loss": 1.1973, + "step": 1522 + }, + { + "epoch": 0.07, + "grad_norm": 1.2606933083358174, + "learning_rate": 1.9902531277701323e-05, + "loss": 1.3223, + "step": 1523 + }, + { + "epoch": 0.07, + "grad_norm": 1.4957291588826798, + "learning_rate": 1.9902314189708037e-05, + "loss": 1.313, + "step": 1524 + }, + { + "epoch": 0.07, + "grad_norm": 1.454478105778005, + "learning_rate": 1.9902096861414706e-05, + "loss": 1.0818, + "step": 1525 + }, + { + "epoch": 0.07, + "grad_norm": 1.4365998776564808, + "learning_rate": 1.9901879292826604e-05, + "loss": 1.2764, + "step": 1526 + }, + { + "epoch": 0.07, + "grad_norm": 1.3223192304811888, + "learning_rate": 1.9901661483949015e-05, + "loss": 1.3623, + "step": 1527 + }, + { + "epoch": 0.07, + "grad_norm": 1.2613836098664017, + "learning_rate": 1.990144343478722e-05, + "loss": 1.2358, + "step": 1528 + }, + { + "epoch": 0.07, + "grad_norm": 1.2032502647553969, + "learning_rate": 1.990122514534651e-05, + "loss": 1.3867, + "step": 1529 + }, + { + "epoch": 0.07, + "grad_norm": 1.2687412034755952, + "learning_rate": 1.9901006615632187e-05, + "loss": 1.1367, + "step": 1530 + }, + { + "epoch": 0.07, + "grad_norm": 1.304270888629178, + "learning_rate": 1.9900787845649548e-05, + "loss": 1.2539, + "step": 1531 + }, + { + "epoch": 0.07, + "grad_norm": 1.454196134439052, + "learning_rate": 1.990056883540391e-05, + "loss": 1.4575, + "step": 1532 + }, + { + "epoch": 0.07, + "grad_norm": 1.2623343702246448, + "learning_rate": 1.990034958490058e-05, + "loss": 1.3267, + "step": 1533 + }, + { + "epoch": 0.07, + "grad_norm": 1.2099283886607295, + "learning_rate": 1.990013009414488e-05, + "loss": 1.23, + "step": 1534 + }, + { + "epoch": 0.07, + "grad_norm": 1.227256255467023, + "learning_rate": 1.989991036314214e-05, + "loss": 1.4194, + "step": 1535 + }, + { + "epoch": 0.07, + "grad_norm": 1.2908572103590255, + "learning_rate": 1.9899690391897694e-05, + "loss": 1.3613, + "step": 1536 + }, + { + "epoch": 0.07, + "grad_norm": 1.2152102695042957, + "learning_rate": 1.989947018041687e-05, + "loss": 1.0488, + "step": 1537 + }, + { + "epoch": 0.07, + "grad_norm": 1.4487791003308965, + "learning_rate": 1.9899249728705018e-05, + "loss": 1.1084, + "step": 1538 + }, + { + "epoch": 0.07, + "grad_norm": 1.6983596650857669, + "learning_rate": 1.989902903676749e-05, + "loss": 1.3203, + "step": 1539 + }, + { + "epoch": 0.07, + "grad_norm": 1.834983455073885, + "learning_rate": 1.9898808104609638e-05, + "loss": 1.1826, + "step": 1540 + }, + { + "epoch": 0.07, + "grad_norm": 1.097634132284742, + "learning_rate": 1.9898586932236826e-05, + "loss": 1.2241, + "step": 1541 + }, + { + "epoch": 0.07, + "grad_norm": 0.806012654348552, + "learning_rate": 1.989836551965442e-05, + "loss": 1.3599, + "step": 1542 + }, + { + "epoch": 0.07, + "grad_norm": 1.223519648754178, + "learning_rate": 1.9898143866867792e-05, + "loss": 1.1421, + "step": 1543 + }, + { + "epoch": 0.07, + "grad_norm": 1.4110700431343324, + "learning_rate": 1.989792197388232e-05, + "loss": 1.3213, + "step": 1544 + }, + { + "epoch": 0.07, + "grad_norm": 1.202117398694309, + "learning_rate": 1.9897699840703393e-05, + "loss": 1.2373, + "step": 1545 + }, + { + "epoch": 0.07, + "grad_norm": 1.339756995962155, + "learning_rate": 1.98974774673364e-05, + "loss": 1.2466, + "step": 1546 + }, + { + "epoch": 0.07, + "grad_norm": 1.4554283893364546, + "learning_rate": 1.9897254853786735e-05, + "loss": 1.248, + "step": 1547 + }, + { + "epoch": 0.07, + "grad_norm": 1.4564968460683352, + "learning_rate": 1.98970320000598e-05, + "loss": 1.3657, + "step": 1548 + }, + { + "epoch": 0.07, + "grad_norm": 1.3010550588051382, + "learning_rate": 1.9896808906161005e-05, + "loss": 1.1069, + "step": 1549 + }, + { + "epoch": 0.07, + "grad_norm": 1.233219549601465, + "learning_rate": 1.9896585572095764e-05, + "loss": 1.4097, + "step": 1550 + }, + { + "epoch": 0.07, + "grad_norm": 1.17311531547124, + "learning_rate": 1.9896361997869496e-05, + "loss": 1.4214, + "step": 1551 + }, + { + "epoch": 0.07, + "grad_norm": 1.3701272614900835, + "learning_rate": 1.9896138183487626e-05, + "loss": 1.3428, + "step": 1552 + }, + { + "epoch": 0.07, + "grad_norm": 1.3973681044263002, + "learning_rate": 1.9895914128955588e-05, + "loss": 1.4731, + "step": 1553 + }, + { + "epoch": 0.07, + "grad_norm": 1.4114786074005963, + "learning_rate": 1.9895689834278813e-05, + "loss": 1.1826, + "step": 1554 + }, + { + "epoch": 0.07, + "grad_norm": 1.25859079407795, + "learning_rate": 1.989546529946275e-05, + "loss": 1.1895, + "step": 1555 + }, + { + "epoch": 0.07, + "grad_norm": 1.5578364177212496, + "learning_rate": 1.9895240524512845e-05, + "loss": 1.2603, + "step": 1556 + }, + { + "epoch": 0.07, + "grad_norm": 1.0054603939783162, + "learning_rate": 1.9895015509434555e-05, + "loss": 1.2095, + "step": 1557 + }, + { + "epoch": 0.07, + "grad_norm": 1.590877295251649, + "learning_rate": 1.9894790254233338e-05, + "loss": 1.2598, + "step": 1558 + }, + { + "epoch": 0.07, + "grad_norm": 1.3165696940664513, + "learning_rate": 1.9894564758914662e-05, + "loss": 1.2026, + "step": 1559 + }, + { + "epoch": 0.08, + "grad_norm": 1.4457349129093637, + "learning_rate": 1.9894339023484e-05, + "loss": 1.2075, + "step": 1560 + }, + { + "epoch": 0.08, + "grad_norm": 1.0539962160608323, + "learning_rate": 1.989411304794682e-05, + "loss": 1.1658, + "step": 1561 + }, + { + "epoch": 0.08, + "grad_norm": 1.1036717345663354, + "learning_rate": 1.989388683230862e-05, + "loss": 1.4878, + "step": 1562 + }, + { + "epoch": 0.08, + "grad_norm": 1.5203261302873377, + "learning_rate": 1.9893660376574883e-05, + "loss": 1.2847, + "step": 1563 + }, + { + "epoch": 0.08, + "grad_norm": 1.1414078820252838, + "learning_rate": 1.9893433680751105e-05, + "loss": 1.2124, + "step": 1564 + }, + { + "epoch": 0.08, + "grad_norm": 1.2244943552055296, + "learning_rate": 1.9893206744842787e-05, + "loss": 1.2896, + "step": 1565 + }, + { + "epoch": 0.08, + "grad_norm": 1.4526670004325604, + "learning_rate": 1.9892979568855435e-05, + "loss": 1.3374, + "step": 1566 + }, + { + "epoch": 0.08, + "grad_norm": 1.4901947070025887, + "learning_rate": 1.9892752152794565e-05, + "loss": 1.4336, + "step": 1567 + }, + { + "epoch": 0.08, + "grad_norm": 1.1495692600295953, + "learning_rate": 1.9892524496665692e-05, + "loss": 1.1333, + "step": 1568 + }, + { + "epoch": 0.08, + "grad_norm": 1.3362214175544957, + "learning_rate": 1.989229660047434e-05, + "loss": 1.3179, + "step": 1569 + }, + { + "epoch": 0.08, + "grad_norm": 0.9101052924937294, + "learning_rate": 1.9892068464226044e-05, + "loss": 1.2139, + "step": 1570 + }, + { + "epoch": 0.08, + "grad_norm": 1.2305954368645287, + "learning_rate": 1.989184008792634e-05, + "loss": 1.23, + "step": 1571 + }, + { + "epoch": 0.08, + "grad_norm": 1.28461249039843, + "learning_rate": 1.9891611471580767e-05, + "loss": 1.335, + "step": 1572 + }, + { + "epoch": 0.08, + "grad_norm": 1.243418036856141, + "learning_rate": 1.989138261519487e-05, + "loss": 1.074, + "step": 1573 + }, + { + "epoch": 0.08, + "grad_norm": 1.4806900169399335, + "learning_rate": 1.989115351877421e-05, + "loss": 1.1506, + "step": 1574 + }, + { + "epoch": 0.08, + "grad_norm": 1.6575453653293377, + "learning_rate": 1.9890924182324345e-05, + "loss": 1.3291, + "step": 1575 + }, + { + "epoch": 0.08, + "grad_norm": 1.0139399294063387, + "learning_rate": 1.989069460585083e-05, + "loss": 1.2173, + "step": 1576 + }, + { + "epoch": 0.08, + "grad_norm": 1.0789917896841428, + "learning_rate": 1.9890464789359253e-05, + "loss": 1.2812, + "step": 1577 + }, + { + "epoch": 0.08, + "grad_norm": 1.2007249017241937, + "learning_rate": 1.989023473285518e-05, + "loss": 1.2251, + "step": 1578 + }, + { + "epoch": 0.08, + "grad_norm": 1.2517010110202547, + "learning_rate": 1.9890004436344197e-05, + "loss": 1.2334, + "step": 1579 + }, + { + "epoch": 0.08, + "grad_norm": 1.4170168069490596, + "learning_rate": 1.988977389983189e-05, + "loss": 1.3179, + "step": 1580 + }, + { + "epoch": 0.08, + "grad_norm": 1.1579833466809972, + "learning_rate": 1.9889543123323854e-05, + "loss": 1.1509, + "step": 1581 + }, + { + "epoch": 0.08, + "grad_norm": 1.1463972085249254, + "learning_rate": 1.9889312106825694e-05, + "loss": 1.3062, + "step": 1582 + }, + { + "epoch": 0.08, + "grad_norm": 1.520664041023211, + "learning_rate": 1.988908085034301e-05, + "loss": 1.3545, + "step": 1583 + }, + { + "epoch": 0.08, + "grad_norm": 1.6153801602091513, + "learning_rate": 1.988884935388142e-05, + "loss": 1.3633, + "step": 1584 + }, + { + "epoch": 0.08, + "grad_norm": 1.1790341643428508, + "learning_rate": 1.988861761744653e-05, + "loss": 1.186, + "step": 1585 + }, + { + "epoch": 0.08, + "grad_norm": 1.3991151533865573, + "learning_rate": 1.988838564104398e-05, + "loss": 1.165, + "step": 1586 + }, + { + "epoch": 0.08, + "grad_norm": 1.3216074139577418, + "learning_rate": 1.9888153424679387e-05, + "loss": 1.2905, + "step": 1587 + }, + { + "epoch": 0.08, + "grad_norm": 1.2769752196103168, + "learning_rate": 1.9887920968358394e-05, + "loss": 1.2935, + "step": 1588 + }, + { + "epoch": 0.08, + "grad_norm": 1.0543367676367663, + "learning_rate": 1.9887688272086637e-05, + "loss": 1.2705, + "step": 1589 + }, + { + "epoch": 0.08, + "grad_norm": 1.3940021071872246, + "learning_rate": 1.9887455335869762e-05, + "loss": 1.3506, + "step": 1590 + }, + { + "epoch": 0.08, + "grad_norm": 1.2887218012159793, + "learning_rate": 1.9887222159713427e-05, + "loss": 1.2407, + "step": 1591 + }, + { + "epoch": 0.08, + "grad_norm": 1.3359888801163557, + "learning_rate": 1.9886988743623284e-05, + "loss": 1.1309, + "step": 1592 + }, + { + "epoch": 0.08, + "grad_norm": 1.361908692215102, + "learning_rate": 1.9886755087605004e-05, + "loss": 1.0789, + "step": 1593 + }, + { + "epoch": 0.08, + "grad_norm": 1.435841711510411, + "learning_rate": 1.9886521191664255e-05, + "loss": 1.3628, + "step": 1594 + }, + { + "epoch": 0.08, + "grad_norm": 1.4826435088788372, + "learning_rate": 1.988628705580671e-05, + "loss": 1.1841, + "step": 1595 + }, + { + "epoch": 0.08, + "grad_norm": 1.5094290273500288, + "learning_rate": 1.9886052680038048e-05, + "loss": 1.3467, + "step": 1596 + }, + { + "epoch": 0.08, + "grad_norm": 1.2516729119574375, + "learning_rate": 1.9885818064363968e-05, + "loss": 1.2466, + "step": 1597 + }, + { + "epoch": 0.08, + "grad_norm": 1.231356623547569, + "learning_rate": 1.9885583208790154e-05, + "loss": 1.231, + "step": 1598 + }, + { + "epoch": 0.08, + "grad_norm": 1.401862325435475, + "learning_rate": 1.988534811332231e-05, + "loss": 1.2988, + "step": 1599 + }, + { + "epoch": 0.08, + "grad_norm": 1.2879223757687983, + "learning_rate": 1.9885112777966135e-05, + "loss": 1.3047, + "step": 1600 + }, + { + "epoch": 0.08, + "grad_norm": 1.5887824075634995, + "learning_rate": 1.9884877202727345e-05, + "loss": 1.4385, + "step": 1601 + }, + { + "epoch": 0.08, + "grad_norm": 1.17229564677236, + "learning_rate": 1.988464138761166e-05, + "loss": 0.9707, + "step": 1602 + }, + { + "epoch": 0.08, + "grad_norm": 1.2709309503271358, + "learning_rate": 1.9884405332624793e-05, + "loss": 1.3823, + "step": 1603 + }, + { + "epoch": 0.08, + "grad_norm": 1.4270151500846249, + "learning_rate": 1.988416903777248e-05, + "loss": 1.332, + "step": 1604 + }, + { + "epoch": 0.08, + "grad_norm": 1.5303637365133647, + "learning_rate": 1.9883932503060452e-05, + "loss": 1.1738, + "step": 1605 + }, + { + "epoch": 0.08, + "grad_norm": 1.1082389386754241, + "learning_rate": 1.988369572849445e-05, + "loss": 1.2852, + "step": 1606 + }, + { + "epoch": 0.08, + "grad_norm": 1.5163589158538997, + "learning_rate": 1.9883458714080222e-05, + "loss": 1.2524, + "step": 1607 + }, + { + "epoch": 0.08, + "grad_norm": 1.264746250459986, + "learning_rate": 1.9883221459823515e-05, + "loss": 1.2256, + "step": 1608 + }, + { + "epoch": 0.08, + "grad_norm": 1.2397655318797134, + "learning_rate": 1.9882983965730086e-05, + "loss": 1.23, + "step": 1609 + }, + { + "epoch": 0.08, + "grad_norm": 1.4039035081920999, + "learning_rate": 1.9882746231805705e-05, + "loss": 1.1587, + "step": 1610 + }, + { + "epoch": 0.08, + "grad_norm": 1.1860489237763503, + "learning_rate": 1.9882508258056136e-05, + "loss": 1.1875, + "step": 1611 + }, + { + "epoch": 0.08, + "grad_norm": 1.2153527259245853, + "learning_rate": 1.9882270044487155e-05, + "loss": 1.2852, + "step": 1612 + }, + { + "epoch": 0.08, + "grad_norm": 1.2226321710437238, + "learning_rate": 1.9882031591104543e-05, + "loss": 1.2261, + "step": 1613 + }, + { + "epoch": 0.08, + "grad_norm": 1.0394641102434825, + "learning_rate": 1.9881792897914086e-05, + "loss": 1.2559, + "step": 1614 + }, + { + "epoch": 0.08, + "grad_norm": 1.3501411026326307, + "learning_rate": 1.9881553964921574e-05, + "loss": 1.2544, + "step": 1615 + }, + { + "epoch": 0.08, + "grad_norm": 1.1872091019577975, + "learning_rate": 1.9881314792132812e-05, + "loss": 1.167, + "step": 1616 + }, + { + "epoch": 0.08, + "grad_norm": 1.0945637575394478, + "learning_rate": 1.9881075379553597e-05, + "loss": 1.1665, + "step": 1617 + }, + { + "epoch": 0.08, + "grad_norm": 1.2695999424164235, + "learning_rate": 1.9880835727189742e-05, + "loss": 1.3223, + "step": 1618 + }, + { + "epoch": 0.08, + "grad_norm": 1.2291856405387098, + "learning_rate": 1.9880595835047062e-05, + "loss": 1.1958, + "step": 1619 + }, + { + "epoch": 0.08, + "grad_norm": 1.538452209073263, + "learning_rate": 1.988035570313138e-05, + "loss": 1.4199, + "step": 1620 + }, + { + "epoch": 0.08, + "grad_norm": 1.3443875217933037, + "learning_rate": 1.9880115331448526e-05, + "loss": 1.3145, + "step": 1621 + }, + { + "epoch": 0.08, + "grad_norm": 1.3366903311264422, + "learning_rate": 1.9879874720004326e-05, + "loss": 1.2085, + "step": 1622 + }, + { + "epoch": 0.08, + "grad_norm": 1.1615198994199438, + "learning_rate": 1.9879633868804624e-05, + "loss": 1.2158, + "step": 1623 + }, + { + "epoch": 0.08, + "grad_norm": 1.2467508503223026, + "learning_rate": 1.9879392777855258e-05, + "loss": 1.2339, + "step": 1624 + }, + { + "epoch": 0.08, + "grad_norm": 1.1879156511843596, + "learning_rate": 1.9879151447162086e-05, + "loss": 1.2954, + "step": 1625 + }, + { + "epoch": 0.08, + "grad_norm": 1.0901017835414069, + "learning_rate": 1.987890987673096e-05, + "loss": 1.2397, + "step": 1626 + }, + { + "epoch": 0.08, + "grad_norm": 1.196241581636429, + "learning_rate": 1.987866806656775e-05, + "loss": 1.2427, + "step": 1627 + }, + { + "epoch": 0.08, + "grad_norm": 1.6871214355079287, + "learning_rate": 1.9878426016678313e-05, + "loss": 1.3296, + "step": 1628 + }, + { + "epoch": 0.08, + "grad_norm": 1.3760034104199597, + "learning_rate": 1.987818372706853e-05, + "loss": 1.2065, + "step": 1629 + }, + { + "epoch": 0.08, + "grad_norm": 1.4896311686300578, + "learning_rate": 1.9877941197744277e-05, + "loss": 1.2632, + "step": 1630 + }, + { + "epoch": 0.08, + "grad_norm": 1.189490311276859, + "learning_rate": 1.9877698428711444e-05, + "loss": 1.3726, + "step": 1631 + }, + { + "epoch": 0.08, + "grad_norm": 1.1495260741744449, + "learning_rate": 1.9877455419975917e-05, + "loss": 1.3813, + "step": 1632 + }, + { + "epoch": 0.08, + "grad_norm": 1.4554475895087005, + "learning_rate": 1.9877212171543595e-05, + "loss": 1.375, + "step": 1633 + }, + { + "epoch": 0.08, + "grad_norm": 1.4916905656242379, + "learning_rate": 1.9876968683420384e-05, + "loss": 1.2109, + "step": 1634 + }, + { + "epoch": 0.08, + "grad_norm": 1.3254516615011172, + "learning_rate": 1.9876724955612188e-05, + "loss": 1.3076, + "step": 1635 + }, + { + "epoch": 0.08, + "grad_norm": 1.3002319869064318, + "learning_rate": 1.9876480988124923e-05, + "loss": 1.2085, + "step": 1636 + }, + { + "epoch": 0.08, + "grad_norm": 1.1348351019604566, + "learning_rate": 1.9876236780964513e-05, + "loss": 1.3828, + "step": 1637 + }, + { + "epoch": 0.08, + "grad_norm": 1.1995198013855326, + "learning_rate": 1.9875992334136878e-05, + "loss": 1.2827, + "step": 1638 + }, + { + "epoch": 0.08, + "grad_norm": 1.0529793886411385, + "learning_rate": 1.9875747647647956e-05, + "loss": 1.2666, + "step": 1639 + }, + { + "epoch": 0.08, + "grad_norm": 0.9996034953533504, + "learning_rate": 1.9875502721503683e-05, + "loss": 0.968, + "step": 1640 + }, + { + "epoch": 0.08, + "grad_norm": 1.2624351297586964, + "learning_rate": 1.987525755571e-05, + "loss": 1.2778, + "step": 1641 + }, + { + "epoch": 0.08, + "grad_norm": 1.3316781822753583, + "learning_rate": 1.987501215027286e-05, + "loss": 1.1416, + "step": 1642 + }, + { + "epoch": 0.08, + "grad_norm": 1.232776923526557, + "learning_rate": 1.9874766505198214e-05, + "loss": 1.2124, + "step": 1643 + }, + { + "epoch": 0.08, + "grad_norm": 1.3890462211734886, + "learning_rate": 1.9874520620492026e-05, + "loss": 1.4321, + "step": 1644 + }, + { + "epoch": 0.08, + "grad_norm": 1.3434850681980908, + "learning_rate": 1.987427449616026e-05, + "loss": 1.2729, + "step": 1645 + }, + { + "epoch": 0.08, + "grad_norm": 1.2536702304110583, + "learning_rate": 1.9874028132208897e-05, + "loss": 1.1973, + "step": 1646 + }, + { + "epoch": 0.08, + "grad_norm": 1.390632822548055, + "learning_rate": 1.9873781528643905e-05, + "loss": 1.2603, + "step": 1647 + }, + { + "epoch": 0.08, + "grad_norm": 1.3883345110599374, + "learning_rate": 1.9873534685471277e-05, + "loss": 1.3755, + "step": 1648 + }, + { + "epoch": 0.08, + "grad_norm": 1.1388837556808524, + "learning_rate": 1.9873287602696996e-05, + "loss": 1.1748, + "step": 1649 + }, + { + "epoch": 0.08, + "grad_norm": 1.258550032275601, + "learning_rate": 1.9873040280327062e-05, + "loss": 1.2114, + "step": 1650 + }, + { + "epoch": 0.08, + "grad_norm": 1.2582746824760067, + "learning_rate": 1.9872792718367476e-05, + "loss": 1.2334, + "step": 1651 + }, + { + "epoch": 0.08, + "grad_norm": 2.087091504698617, + "learning_rate": 1.9872544916824244e-05, + "loss": 1.2114, + "step": 1652 + }, + { + "epoch": 0.08, + "grad_norm": 1.307547305181814, + "learning_rate": 1.987229687570338e-05, + "loss": 1.2935, + "step": 1653 + }, + { + "epoch": 0.08, + "grad_norm": 1.050906895485712, + "learning_rate": 1.987204859501091e-05, + "loss": 1.3115, + "step": 1654 + }, + { + "epoch": 0.08, + "grad_norm": 1.1024383989189768, + "learning_rate": 1.9871800074752848e-05, + "loss": 1.3032, + "step": 1655 + }, + { + "epoch": 0.08, + "grad_norm": 1.247800984742456, + "learning_rate": 1.9871551314935233e-05, + "loss": 1.3359, + "step": 1656 + }, + { + "epoch": 0.08, + "grad_norm": 1.338746492487518, + "learning_rate": 1.98713023155641e-05, + "loss": 1.2397, + "step": 1657 + }, + { + "epoch": 0.08, + "grad_norm": 1.2123242181888063, + "learning_rate": 1.987105307664549e-05, + "loss": 1.2178, + "step": 1658 + }, + { + "epoch": 0.08, + "grad_norm": 0.9988686411556564, + "learning_rate": 1.9870803598185446e-05, + "loss": 1.3276, + "step": 1659 + }, + { + "epoch": 0.08, + "grad_norm": 1.0665999153495263, + "learning_rate": 1.9870553880190032e-05, + "loss": 1.2241, + "step": 1660 + }, + { + "epoch": 0.08, + "grad_norm": 1.7921682381594624, + "learning_rate": 1.9870303922665305e-05, + "loss": 1.4106, + "step": 1661 + }, + { + "epoch": 0.08, + "grad_norm": 1.0860295802671933, + "learning_rate": 1.9870053725617326e-05, + "loss": 1.1519, + "step": 1662 + }, + { + "epoch": 0.08, + "grad_norm": 1.4259572759552495, + "learning_rate": 1.986980328905217e-05, + "loss": 1.4238, + "step": 1663 + }, + { + "epoch": 0.08, + "grad_norm": 1.1318832440347604, + "learning_rate": 1.9869552612975917e-05, + "loss": 1.3979, + "step": 1664 + }, + { + "epoch": 0.08, + "grad_norm": 1.1599514792312433, + "learning_rate": 1.9869301697394646e-05, + "loss": 1.2583, + "step": 1665 + }, + { + "epoch": 0.08, + "grad_norm": 1.273737456698302, + "learning_rate": 1.9869050542314446e-05, + "loss": 1.3564, + "step": 1666 + }, + { + "epoch": 0.08, + "grad_norm": 1.3812323968356381, + "learning_rate": 1.9868799147741417e-05, + "loss": 1.188, + "step": 1667 + }, + { + "epoch": 0.08, + "grad_norm": 1.25229318921687, + "learning_rate": 1.986854751368165e-05, + "loss": 1.1338, + "step": 1668 + }, + { + "epoch": 0.08, + "grad_norm": 1.2867282300315708, + "learning_rate": 1.986829564014126e-05, + "loss": 1.2427, + "step": 1669 + }, + { + "epoch": 0.08, + "grad_norm": 1.3142788160100252, + "learning_rate": 1.9868043527126358e-05, + "loss": 1.1284, + "step": 1670 + }, + { + "epoch": 0.08, + "grad_norm": 1.407989324241409, + "learning_rate": 1.9867791174643057e-05, + "loss": 1.2314, + "step": 1671 + }, + { + "epoch": 0.08, + "grad_norm": 1.2386292115262292, + "learning_rate": 1.986753858269749e-05, + "loss": 1.2891, + "step": 1672 + }, + { + "epoch": 0.08, + "grad_norm": 1.008441866169437, + "learning_rate": 1.986728575129578e-05, + "loss": 1.188, + "step": 1673 + }, + { + "epoch": 0.08, + "grad_norm": 1.4395956167137967, + "learning_rate": 1.986703268044406e-05, + "loss": 1.4038, + "step": 1674 + }, + { + "epoch": 0.08, + "grad_norm": 1.0760815766206608, + "learning_rate": 1.9866779370148475e-05, + "loss": 1.1475, + "step": 1675 + }, + { + "epoch": 0.08, + "grad_norm": 1.0606003357941316, + "learning_rate": 1.9866525820415173e-05, + "loss": 1.332, + "step": 1676 + }, + { + "epoch": 0.08, + "grad_norm": 1.4274839216865483, + "learning_rate": 1.98662720312503e-05, + "loss": 1.377, + "step": 1677 + }, + { + "epoch": 0.08, + "grad_norm": 1.3985423627355724, + "learning_rate": 1.9866018002660027e-05, + "loss": 1.145, + "step": 1678 + }, + { + "epoch": 0.08, + "grad_norm": 1.4008991790051306, + "learning_rate": 1.9865763734650514e-05, + "loss": 1.3228, + "step": 1679 + }, + { + "epoch": 0.08, + "grad_norm": 1.3981520424334788, + "learning_rate": 1.9865509227227924e-05, + "loss": 1.1333, + "step": 1680 + }, + { + "epoch": 0.08, + "grad_norm": 1.1267362552651785, + "learning_rate": 1.986525448039844e-05, + "loss": 1.1572, + "step": 1681 + }, + { + "epoch": 0.08, + "grad_norm": 1.4605626183875666, + "learning_rate": 1.9864999494168245e-05, + "loss": 1.2822, + "step": 1682 + }, + { + "epoch": 0.08, + "grad_norm": 1.1863992658304676, + "learning_rate": 1.9864744268543522e-05, + "loss": 1.1021, + "step": 1683 + }, + { + "epoch": 0.08, + "grad_norm": 1.350047894774326, + "learning_rate": 1.9864488803530467e-05, + "loss": 1.2588, + "step": 1684 + }, + { + "epoch": 0.08, + "grad_norm": 1.1717315399856874, + "learning_rate": 1.9864233099135278e-05, + "loss": 1.3286, + "step": 1685 + }, + { + "epoch": 0.08, + "grad_norm": 0.9509228839276824, + "learning_rate": 1.9863977155364164e-05, + "loss": 1.3027, + "step": 1686 + }, + { + "epoch": 0.08, + "grad_norm": 1.6431046192161922, + "learning_rate": 1.986372097222333e-05, + "loss": 1.3428, + "step": 1687 + }, + { + "epoch": 0.08, + "grad_norm": 1.2154091438616301, + "learning_rate": 1.9863464549719e-05, + "loss": 1.2646, + "step": 1688 + }, + { + "epoch": 0.08, + "grad_norm": 1.2270820712795016, + "learning_rate": 1.986320788785739e-05, + "loss": 1.0159, + "step": 1689 + }, + { + "epoch": 0.08, + "grad_norm": 1.1005047455452612, + "learning_rate": 1.9862950986644732e-05, + "loss": 1.1743, + "step": 1690 + }, + { + "epoch": 0.08, + "grad_norm": 1.09309379584541, + "learning_rate": 1.9862693846087258e-05, + "loss": 1.27, + "step": 1691 + }, + { + "epoch": 0.08, + "grad_norm": 1.2853589198248638, + "learning_rate": 1.986243646619121e-05, + "loss": 1.3306, + "step": 1692 + }, + { + "epoch": 0.08, + "grad_norm": 1.1653208862261084, + "learning_rate": 1.9862178846962837e-05, + "loss": 1.3843, + "step": 1693 + }, + { + "epoch": 0.08, + "grad_norm": 1.0989835615343597, + "learning_rate": 1.9861920988408382e-05, + "loss": 1.4165, + "step": 1694 + }, + { + "epoch": 0.08, + "grad_norm": 1.3746966645217578, + "learning_rate": 1.986166289053411e-05, + "loss": 1.2471, + "step": 1695 + }, + { + "epoch": 0.08, + "grad_norm": 1.1661207129785116, + "learning_rate": 1.9861404553346282e-05, + "loss": 1.3765, + "step": 1696 + }, + { + "epoch": 0.08, + "grad_norm": 1.6329624589721, + "learning_rate": 1.9861145976851167e-05, + "loss": 1.2764, + "step": 1697 + }, + { + "epoch": 0.08, + "grad_norm": 1.1422753020485674, + "learning_rate": 1.986088716105504e-05, + "loss": 1.2266, + "step": 1698 + }, + { + "epoch": 0.08, + "grad_norm": 1.236680790160474, + "learning_rate": 1.986062810596418e-05, + "loss": 1.116, + "step": 1699 + }, + { + "epoch": 0.08, + "grad_norm": 1.199952893622414, + "learning_rate": 1.9860368811584875e-05, + "loss": 1.1143, + "step": 1700 + }, + { + "epoch": 0.08, + "grad_norm": 1.1771297493808275, + "learning_rate": 1.9860109277923417e-05, + "loss": 1.0007, + "step": 1701 + }, + { + "epoch": 0.08, + "grad_norm": 1.2338716231711284, + "learning_rate": 1.9859849504986105e-05, + "loss": 1.2822, + "step": 1702 + }, + { + "epoch": 0.08, + "grad_norm": 1.425362496656904, + "learning_rate": 1.9859589492779244e-05, + "loss": 1.332, + "step": 1703 + }, + { + "epoch": 0.08, + "grad_norm": 0.9567276935318477, + "learning_rate": 1.985932924130914e-05, + "loss": 1.2632, + "step": 1704 + }, + { + "epoch": 0.08, + "grad_norm": 1.2269417995299456, + "learning_rate": 1.9859068750582112e-05, + "loss": 1.2314, + "step": 1705 + }, + { + "epoch": 0.08, + "grad_norm": 1.2536930256888825, + "learning_rate": 1.985880802060448e-05, + "loss": 1.377, + "step": 1706 + }, + { + "epoch": 0.08, + "grad_norm": 1.4224238152340487, + "learning_rate": 1.9858547051382565e-05, + "loss": 1.3413, + "step": 1707 + }, + { + "epoch": 0.08, + "grad_norm": 1.2259825440738519, + "learning_rate": 1.9858285842922715e-05, + "loss": 1.3301, + "step": 1708 + }, + { + "epoch": 0.08, + "grad_norm": 1.4595370591453598, + "learning_rate": 1.9858024395231256e-05, + "loss": 1.1655, + "step": 1709 + }, + { + "epoch": 0.08, + "grad_norm": 1.325258420400975, + "learning_rate": 1.9857762708314535e-05, + "loss": 1.2168, + "step": 1710 + }, + { + "epoch": 0.08, + "grad_norm": 1.0285904602415692, + "learning_rate": 1.9857500782178905e-05, + "loss": 1.4336, + "step": 1711 + }, + { + "epoch": 0.08, + "grad_norm": 1.2701582065726138, + "learning_rate": 1.9857238616830718e-05, + "loss": 1.2202, + "step": 1712 + }, + { + "epoch": 0.08, + "grad_norm": 1.1025760437458634, + "learning_rate": 1.9856976212276344e-05, + "loss": 1.2827, + "step": 1713 + }, + { + "epoch": 0.08, + "grad_norm": 1.467506877024449, + "learning_rate": 1.9856713568522143e-05, + "loss": 1.4424, + "step": 1714 + }, + { + "epoch": 0.08, + "grad_norm": 0.8754132866254062, + "learning_rate": 1.985645068557449e-05, + "loss": 1.0776, + "step": 1715 + }, + { + "epoch": 0.08, + "grad_norm": 1.104738135089978, + "learning_rate": 1.985618756343977e-05, + "loss": 1.1758, + "step": 1716 + }, + { + "epoch": 0.08, + "grad_norm": 1.2047305424772685, + "learning_rate": 1.9855924202124358e-05, + "loss": 1.3252, + "step": 1717 + }, + { + "epoch": 0.08, + "grad_norm": 1.322928053264383, + "learning_rate": 1.9855660601634656e-05, + "loss": 1.1968, + "step": 1718 + }, + { + "epoch": 0.08, + "grad_norm": 1.7323570095694776, + "learning_rate": 1.9855396761977052e-05, + "loss": 1.3301, + "step": 1719 + }, + { + "epoch": 0.08, + "grad_norm": 1.3841468423555108, + "learning_rate": 1.985513268315795e-05, + "loss": 1.1597, + "step": 1720 + }, + { + "epoch": 0.08, + "grad_norm": 1.0837885105472314, + "learning_rate": 1.985486836518377e-05, + "loss": 1.314, + "step": 1721 + }, + { + "epoch": 0.08, + "grad_norm": 1.343073132074367, + "learning_rate": 1.9854603808060907e-05, + "loss": 1.1477, + "step": 1722 + }, + { + "epoch": 0.08, + "grad_norm": 1.3670448751058277, + "learning_rate": 1.9854339011795795e-05, + "loss": 1.4141, + "step": 1723 + }, + { + "epoch": 0.08, + "grad_norm": 1.173810842612231, + "learning_rate": 1.9854073976394858e-05, + "loss": 1.252, + "step": 1724 + }, + { + "epoch": 0.08, + "grad_norm": 1.3748869680961737, + "learning_rate": 1.9853808701864522e-05, + "loss": 1.2627, + "step": 1725 + }, + { + "epoch": 0.08, + "grad_norm": 1.1308112006159756, + "learning_rate": 1.985354318821123e-05, + "loss": 1.2632, + "step": 1726 + }, + { + "epoch": 0.08, + "grad_norm": 1.230740619326656, + "learning_rate": 1.9853277435441422e-05, + "loss": 1.1558, + "step": 1727 + }, + { + "epoch": 0.08, + "grad_norm": 1.369306932536038, + "learning_rate": 1.985301144356155e-05, + "loss": 1.3789, + "step": 1728 + }, + { + "epoch": 0.08, + "grad_norm": 1.257157813591945, + "learning_rate": 1.9852745212578063e-05, + "loss": 1.3359, + "step": 1729 + }, + { + "epoch": 0.08, + "grad_norm": 1.2735745089701531, + "learning_rate": 1.9852478742497426e-05, + "loss": 1.2554, + "step": 1730 + }, + { + "epoch": 0.08, + "grad_norm": 1.2966505055890094, + "learning_rate": 1.985221203332611e-05, + "loss": 1.3311, + "step": 1731 + }, + { + "epoch": 0.08, + "grad_norm": 1.6055596929694471, + "learning_rate": 1.985194508507058e-05, + "loss": 1.2646, + "step": 1732 + }, + { + "epoch": 0.08, + "grad_norm": 1.384939252942962, + "learning_rate": 1.9851677897737314e-05, + "loss": 1.2837, + "step": 1733 + }, + { + "epoch": 0.08, + "grad_norm": 1.2763714375228592, + "learning_rate": 1.98514104713328e-05, + "loss": 1.3398, + "step": 1734 + }, + { + "epoch": 0.08, + "grad_norm": 1.3062282027424854, + "learning_rate": 1.9851142805863523e-05, + "loss": 1.1411, + "step": 1735 + }, + { + "epoch": 0.08, + "grad_norm": 1.2165095725823665, + "learning_rate": 1.9850874901335984e-05, + "loss": 1.0901, + "step": 1736 + }, + { + "epoch": 0.08, + "grad_norm": 1.231108688971998, + "learning_rate": 1.9850606757756683e-05, + "loss": 1.2671, + "step": 1737 + }, + { + "epoch": 0.08, + "grad_norm": 1.289500160671035, + "learning_rate": 1.9850338375132125e-05, + "loss": 1.3472, + "step": 1738 + }, + { + "epoch": 0.08, + "grad_norm": 1.1194221539170175, + "learning_rate": 1.985006975346882e-05, + "loss": 1.0845, + "step": 1739 + }, + { + "epoch": 0.08, + "grad_norm": 1.7074085507493169, + "learning_rate": 1.9849800892773293e-05, + "loss": 1.2471, + "step": 1740 + }, + { + "epoch": 0.08, + "grad_norm": 1.5780239214584602, + "learning_rate": 1.9849531793052064e-05, + "loss": 1.1357, + "step": 1741 + }, + { + "epoch": 0.08, + "grad_norm": 1.382660377420584, + "learning_rate": 1.9849262454311663e-05, + "loss": 1.2007, + "step": 1742 + }, + { + "epoch": 0.08, + "grad_norm": 1.421342155171571, + "learning_rate": 1.9848992876558633e-05, + "loss": 1.2993, + "step": 1743 + }, + { + "epoch": 0.08, + "grad_norm": 1.2706096746238078, + "learning_rate": 1.9848723059799508e-05, + "loss": 1.2686, + "step": 1744 + }, + { + "epoch": 0.08, + "grad_norm": 0.655617420267706, + "learning_rate": 1.9848453004040838e-05, + "loss": 1.2148, + "step": 1745 + }, + { + "epoch": 0.08, + "grad_norm": 1.2300310108755486, + "learning_rate": 1.984818270928918e-05, + "loss": 1.2744, + "step": 1746 + }, + { + "epoch": 0.08, + "grad_norm": 1.2478262913144302, + "learning_rate": 1.9847912175551085e-05, + "loss": 1.2637, + "step": 1747 + }, + { + "epoch": 0.08, + "grad_norm": 1.2687880338273334, + "learning_rate": 1.9847641402833127e-05, + "loss": 1.3018, + "step": 1748 + }, + { + "epoch": 0.08, + "grad_norm": 1.400401039984681, + "learning_rate": 1.9847370391141872e-05, + "loss": 1.2183, + "step": 1749 + }, + { + "epoch": 0.08, + "grad_norm": 1.1702973385522992, + "learning_rate": 1.9847099140483896e-05, + "loss": 1.0503, + "step": 1750 + }, + { + "epoch": 0.08, + "grad_norm": 1.6880487717902481, + "learning_rate": 1.9846827650865787e-05, + "loss": 1.2178, + "step": 1751 + }, + { + "epoch": 0.08, + "grad_norm": 1.3334919040360198, + "learning_rate": 1.9846555922294123e-05, + "loss": 1.1689, + "step": 1752 + }, + { + "epoch": 0.08, + "grad_norm": 1.3801483334467752, + "learning_rate": 1.984628395477551e-05, + "loss": 1.3003, + "step": 1753 + }, + { + "epoch": 0.08, + "grad_norm": 1.3913933444369806, + "learning_rate": 1.984601174831654e-05, + "loss": 1.3491, + "step": 1754 + }, + { + "epoch": 0.08, + "grad_norm": 1.4331349726429305, + "learning_rate": 1.9845739302923822e-05, + "loss": 1.2676, + "step": 1755 + }, + { + "epoch": 0.08, + "grad_norm": 1.1899476594072496, + "learning_rate": 1.984546661860397e-05, + "loss": 1.1714, + "step": 1756 + }, + { + "epoch": 0.08, + "grad_norm": 1.123957126670351, + "learning_rate": 1.9845193695363592e-05, + "loss": 1.1875, + "step": 1757 + }, + { + "epoch": 0.08, + "grad_norm": 1.4441281009373768, + "learning_rate": 1.984492053320932e-05, + "loss": 1.1602, + "step": 1758 + }, + { + "epoch": 0.08, + "grad_norm": 1.3919848625473352, + "learning_rate": 1.9844647132147773e-05, + "loss": 1.2852, + "step": 1759 + }, + { + "epoch": 0.08, + "grad_norm": 1.3349011256731225, + "learning_rate": 1.98443734921856e-05, + "loss": 1.1182, + "step": 1760 + }, + { + "epoch": 0.08, + "grad_norm": 1.3500934566137928, + "learning_rate": 1.984409961332943e-05, + "loss": 1.2739, + "step": 1761 + }, + { + "epoch": 0.08, + "grad_norm": 1.3002065222767116, + "learning_rate": 1.9843825495585912e-05, + "loss": 1.2202, + "step": 1762 + }, + { + "epoch": 0.08, + "grad_norm": 0.9301056197056421, + "learning_rate": 1.98435511389617e-05, + "loss": 1.1401, + "step": 1763 + }, + { + "epoch": 0.08, + "grad_norm": 1.3731401215903167, + "learning_rate": 1.9843276543463452e-05, + "loss": 1.2202, + "step": 1764 + }, + { + "epoch": 0.08, + "grad_norm": 1.4753248084204624, + "learning_rate": 1.9843001709097832e-05, + "loss": 1.1294, + "step": 1765 + }, + { + "epoch": 0.08, + "grad_norm": 1.4634661076421687, + "learning_rate": 1.9842726635871503e-05, + "loss": 1.4268, + "step": 1766 + }, + { + "epoch": 0.08, + "grad_norm": 1.2622118966740314, + "learning_rate": 1.9842451323791146e-05, + "loss": 1.2544, + "step": 1767 + }, + { + "epoch": 0.09, + "grad_norm": 1.003937793988478, + "learning_rate": 1.9842175772863442e-05, + "loss": 1.3047, + "step": 1768 + }, + { + "epoch": 0.09, + "grad_norm": 1.4018160737830376, + "learning_rate": 1.984189998309508e-05, + "loss": 1.2236, + "step": 1769 + }, + { + "epoch": 0.09, + "grad_norm": 1.5237917683404207, + "learning_rate": 1.9841623954492744e-05, + "loss": 1.0415, + "step": 1770 + }, + { + "epoch": 0.09, + "grad_norm": 1.0315678916968165, + "learning_rate": 1.984134768706314e-05, + "loss": 1.3403, + "step": 1771 + }, + { + "epoch": 0.09, + "grad_norm": 1.4584492678227996, + "learning_rate": 1.9841071180812972e-05, + "loss": 1.21, + "step": 1772 + }, + { + "epoch": 0.09, + "grad_norm": 1.2569077078070172, + "learning_rate": 1.9840794435748946e-05, + "loss": 1.3501, + "step": 1773 + }, + { + "epoch": 0.09, + "grad_norm": 1.4250537933506815, + "learning_rate": 1.984051745187778e-05, + "loss": 1.3262, + "step": 1774 + }, + { + "epoch": 0.09, + "grad_norm": 1.4691990883754478, + "learning_rate": 1.9840240229206194e-05, + "loss": 1.4224, + "step": 1775 + }, + { + "epoch": 0.09, + "grad_norm": 1.1503567405444488, + "learning_rate": 1.9839962767740918e-05, + "loss": 1.104, + "step": 1776 + }, + { + "epoch": 0.09, + "grad_norm": 1.4069277259214634, + "learning_rate": 1.9839685067488683e-05, + "loss": 1.272, + "step": 1777 + }, + { + "epoch": 0.09, + "grad_norm": 1.3570906265536857, + "learning_rate": 1.983940712845623e-05, + "loss": 1.3423, + "step": 1778 + }, + { + "epoch": 0.09, + "grad_norm": 1.2968982949166132, + "learning_rate": 1.9839128950650302e-05, + "loss": 1.1843, + "step": 1779 + }, + { + "epoch": 0.09, + "grad_norm": 1.440589170789255, + "learning_rate": 1.9838850534077652e-05, + "loss": 1.0952, + "step": 1780 + }, + { + "epoch": 0.09, + "grad_norm": 1.4900470325711788, + "learning_rate": 1.983857187874503e-05, + "loss": 1.3486, + "step": 1781 + }, + { + "epoch": 0.09, + "grad_norm": 1.3055924610428082, + "learning_rate": 1.9838292984659207e-05, + "loss": 1.1548, + "step": 1782 + }, + { + "epoch": 0.09, + "grad_norm": 1.311845292910859, + "learning_rate": 1.9838013851826948e-05, + "loss": 1.2798, + "step": 1783 + }, + { + "epoch": 0.09, + "grad_norm": 1.1315891682844044, + "learning_rate": 1.983773448025502e-05, + "loss": 1.2222, + "step": 1784 + }, + { + "epoch": 0.09, + "grad_norm": 1.3755940003257658, + "learning_rate": 1.983745486995021e-05, + "loss": 1.2964, + "step": 1785 + }, + { + "epoch": 0.09, + "grad_norm": 1.0437521118940725, + "learning_rate": 1.9837175020919304e-05, + "loss": 1.1621, + "step": 1786 + }, + { + "epoch": 0.09, + "grad_norm": 1.3482654757471504, + "learning_rate": 1.983689493316909e-05, + "loss": 1.188, + "step": 1787 + }, + { + "epoch": 0.09, + "grad_norm": 0.9508350029086002, + "learning_rate": 1.983661460670636e-05, + "loss": 1.1768, + "step": 1788 + }, + { + "epoch": 0.09, + "grad_norm": 1.2362234811230115, + "learning_rate": 1.9836334041537928e-05, + "loss": 1.4507, + "step": 1789 + }, + { + "epoch": 0.09, + "grad_norm": 1.195258275515163, + "learning_rate": 1.9836053237670594e-05, + "loss": 1.2319, + "step": 1790 + }, + { + "epoch": 0.09, + "grad_norm": 1.362622719888542, + "learning_rate": 1.9835772195111174e-05, + "loss": 1.2119, + "step": 1791 + }, + { + "epoch": 0.09, + "grad_norm": 1.6713560550560689, + "learning_rate": 1.9835490913866492e-05, + "loss": 1.3672, + "step": 1792 + }, + { + "epoch": 0.09, + "grad_norm": 1.2060415800712028, + "learning_rate": 1.9835209393943366e-05, + "loss": 1.2549, + "step": 1793 + }, + { + "epoch": 0.09, + "grad_norm": 1.1140079926468116, + "learning_rate": 1.9834927635348635e-05, + "loss": 1.4097, + "step": 1794 + }, + { + "epoch": 0.09, + "grad_norm": 1.1289887483072814, + "learning_rate": 1.983464563808913e-05, + "loss": 1.2627, + "step": 1795 + }, + { + "epoch": 0.09, + "grad_norm": 1.3942316271006572, + "learning_rate": 1.9834363402171704e-05, + "loss": 1.2837, + "step": 1796 + }, + { + "epoch": 0.09, + "grad_norm": 1.214359073336569, + "learning_rate": 1.9834080927603198e-05, + "loss": 1.2261, + "step": 1797 + }, + { + "epoch": 0.09, + "grad_norm": 1.335786196950659, + "learning_rate": 1.9833798214390467e-05, + "loss": 1.3462, + "step": 1798 + }, + { + "epoch": 0.09, + "grad_norm": 0.952411291889435, + "learning_rate": 1.9833515262540376e-05, + "loss": 1.083, + "step": 1799 + }, + { + "epoch": 0.09, + "grad_norm": 1.0824754526560696, + "learning_rate": 1.9833232072059786e-05, + "loss": 1.0635, + "step": 1800 + }, + { + "epoch": 0.09, + "grad_norm": 1.1343186769665288, + "learning_rate": 1.9832948642955574e-05, + "loss": 1.2031, + "step": 1801 + }, + { + "epoch": 0.09, + "grad_norm": 1.5606034603676007, + "learning_rate": 1.983266497523462e-05, + "loss": 1.2207, + "step": 1802 + }, + { + "epoch": 0.09, + "grad_norm": 1.3328582285144686, + "learning_rate": 1.9832381068903797e-05, + "loss": 1.1987, + "step": 1803 + }, + { + "epoch": 0.09, + "grad_norm": 1.1072247820328662, + "learning_rate": 1.9832096923970002e-05, + "loss": 1.2178, + "step": 1804 + }, + { + "epoch": 0.09, + "grad_norm": 1.1737685891545009, + "learning_rate": 1.983181254044013e-05, + "loss": 1.1289, + "step": 1805 + }, + { + "epoch": 0.09, + "grad_norm": 1.2881356953876921, + "learning_rate": 1.983152791832109e-05, + "loss": 1.1987, + "step": 1806 + }, + { + "epoch": 0.09, + "grad_norm": 1.1960482924607332, + "learning_rate": 1.9831243057619774e-05, + "loss": 1.2588, + "step": 1807 + }, + { + "epoch": 0.09, + "grad_norm": 1.161388606630917, + "learning_rate": 1.98309579583431e-05, + "loss": 1.1812, + "step": 1808 + }, + { + "epoch": 0.09, + "grad_norm": 1.3470186324330802, + "learning_rate": 1.983067262049799e-05, + "loss": 1.2427, + "step": 1809 + }, + { + "epoch": 0.09, + "grad_norm": 1.0795804952988468, + "learning_rate": 1.983038704409137e-05, + "loss": 1.2471, + "step": 1810 + }, + { + "epoch": 0.09, + "grad_norm": 1.0960805566454825, + "learning_rate": 1.9830101229130162e-05, + "loss": 1.2769, + "step": 1811 + }, + { + "epoch": 0.09, + "grad_norm": 1.5931898942851315, + "learning_rate": 1.9829815175621306e-05, + "loss": 1.2056, + "step": 1812 + }, + { + "epoch": 0.09, + "grad_norm": 1.132318931297347, + "learning_rate": 1.9829528883571747e-05, + "loss": 1.2393, + "step": 1813 + }, + { + "epoch": 0.09, + "grad_norm": 1.196680125022584, + "learning_rate": 1.982924235298843e-05, + "loss": 1.1641, + "step": 1814 + }, + { + "epoch": 0.09, + "grad_norm": 1.7384397136070218, + "learning_rate": 1.9828955583878306e-05, + "loss": 1.3389, + "step": 1815 + }, + { + "epoch": 0.09, + "grad_norm": 1.4234072559778765, + "learning_rate": 1.9828668576248336e-05, + "loss": 1.355, + "step": 1816 + }, + { + "epoch": 0.09, + "grad_norm": 1.4004840104245582, + "learning_rate": 1.9828381330105487e-05, + "loss": 1.3145, + "step": 1817 + }, + { + "epoch": 0.09, + "grad_norm": 1.430145483068612, + "learning_rate": 1.9828093845456726e-05, + "loss": 1.292, + "step": 1818 + }, + { + "epoch": 0.09, + "grad_norm": 1.131864765010865, + "learning_rate": 1.982780612230903e-05, + "loss": 1.1602, + "step": 1819 + }, + { + "epoch": 0.09, + "grad_norm": 1.2271419667502215, + "learning_rate": 1.9827518160669382e-05, + "loss": 1.0393, + "step": 1820 + }, + { + "epoch": 0.09, + "grad_norm": 1.4873407662173224, + "learning_rate": 1.9827229960544773e-05, + "loss": 1.2061, + "step": 1821 + }, + { + "epoch": 0.09, + "grad_norm": 1.4788968370922488, + "learning_rate": 1.9826941521942187e-05, + "loss": 1.3604, + "step": 1822 + }, + { + "epoch": 0.09, + "grad_norm": 1.380609336499077, + "learning_rate": 1.9826652844868636e-05, + "loss": 1.2534, + "step": 1823 + }, + { + "epoch": 0.09, + "grad_norm": 1.048810656379789, + "learning_rate": 1.9826363929331118e-05, + "loss": 1.0664, + "step": 1824 + }, + { + "epoch": 0.09, + "grad_norm": 1.381017797331505, + "learning_rate": 1.9826074775336645e-05, + "loss": 1.4009, + "step": 1825 + }, + { + "epoch": 0.09, + "grad_norm": 1.1808942145118297, + "learning_rate": 1.9825785382892236e-05, + "loss": 1.1921, + "step": 1826 + }, + { + "epoch": 0.09, + "grad_norm": 1.18670521590121, + "learning_rate": 1.9825495752004912e-05, + "loss": 1.2437, + "step": 1827 + }, + { + "epoch": 0.09, + "grad_norm": 1.2587701650784606, + "learning_rate": 1.98252058826817e-05, + "loss": 1.3994, + "step": 1828 + }, + { + "epoch": 0.09, + "grad_norm": 1.1739826744407886, + "learning_rate": 1.982491577492964e-05, + "loss": 1.2363, + "step": 1829 + }, + { + "epoch": 0.09, + "grad_norm": 0.7998882876890707, + "learning_rate": 1.982462542875576e-05, + "loss": 1.4238, + "step": 1830 + }, + { + "epoch": 0.09, + "grad_norm": 1.2450536714957336, + "learning_rate": 1.9824334844167122e-05, + "loss": 1.2188, + "step": 1831 + }, + { + "epoch": 0.09, + "grad_norm": 1.0456690526533845, + "learning_rate": 1.982404402117077e-05, + "loss": 1.3252, + "step": 1832 + }, + { + "epoch": 0.09, + "grad_norm": 0.9356278345584655, + "learning_rate": 1.9823752959773758e-05, + "loss": 1.2603, + "step": 1833 + }, + { + "epoch": 0.09, + "grad_norm": 1.3633069944597016, + "learning_rate": 1.9823461659983154e-05, + "loss": 1.2686, + "step": 1834 + }, + { + "epoch": 0.09, + "grad_norm": 1.6136539881603067, + "learning_rate": 1.9823170121806023e-05, + "loss": 1.3042, + "step": 1835 + }, + { + "epoch": 0.09, + "grad_norm": 1.2750514200637628, + "learning_rate": 1.9822878345249443e-05, + "loss": 1.27, + "step": 1836 + }, + { + "epoch": 0.09, + "grad_norm": 1.4391801210518234, + "learning_rate": 1.9822586330320492e-05, + "loss": 1.3682, + "step": 1837 + }, + { + "epoch": 0.09, + "grad_norm": 1.1536653593791943, + "learning_rate": 1.982229407702626e-05, + "loss": 1.0552, + "step": 1838 + }, + { + "epoch": 0.09, + "grad_norm": 1.372185787930189, + "learning_rate": 1.9822001585373835e-05, + "loss": 1.3882, + "step": 1839 + }, + { + "epoch": 0.09, + "grad_norm": 1.3925096135691213, + "learning_rate": 1.9821708855370322e-05, + "loss": 1.1118, + "step": 1840 + }, + { + "epoch": 0.09, + "grad_norm": 1.0259853358389837, + "learning_rate": 1.9821415887022813e-05, + "loss": 1.2349, + "step": 1841 + }, + { + "epoch": 0.09, + "grad_norm": 1.687585957117712, + "learning_rate": 1.9821122680338428e-05, + "loss": 1.3047, + "step": 1842 + }, + { + "epoch": 0.09, + "grad_norm": 1.3350539595930961, + "learning_rate": 1.982082923532428e-05, + "loss": 1.1519, + "step": 1843 + }, + { + "epoch": 0.09, + "grad_norm": 1.1884548795914591, + "learning_rate": 1.9820535551987486e-05, + "loss": 1.333, + "step": 1844 + }, + { + "epoch": 0.09, + "grad_norm": 1.2306533442196632, + "learning_rate": 1.9820241630335176e-05, + "loss": 1.0474, + "step": 1845 + }, + { + "epoch": 0.09, + "grad_norm": 1.3535765161711673, + "learning_rate": 1.9819947470374484e-05, + "loss": 1.3374, + "step": 1846 + }, + { + "epoch": 0.09, + "grad_norm": 1.3616763642056717, + "learning_rate": 1.981965307211254e-05, + "loss": 1.3306, + "step": 1847 + }, + { + "epoch": 0.09, + "grad_norm": 1.2960870503315047, + "learning_rate": 1.98193584355565e-05, + "loss": 1.3975, + "step": 1848 + }, + { + "epoch": 0.09, + "grad_norm": 1.2024150132485367, + "learning_rate": 1.9819063560713507e-05, + "loss": 1.2456, + "step": 1849 + }, + { + "epoch": 0.09, + "grad_norm": 1.1419363856404334, + "learning_rate": 1.981876844759072e-05, + "loss": 1.2158, + "step": 1850 + }, + { + "epoch": 0.09, + "grad_norm": 1.2143021083560086, + "learning_rate": 1.9818473096195295e-05, + "loss": 1.1821, + "step": 1851 + }, + { + "epoch": 0.09, + "grad_norm": 1.5311817229537181, + "learning_rate": 1.9818177506534406e-05, + "loss": 1.2852, + "step": 1852 + }, + { + "epoch": 0.09, + "grad_norm": 1.0880649701378982, + "learning_rate": 1.981788167861522e-05, + "loss": 1.2422, + "step": 1853 + }, + { + "epoch": 0.09, + "grad_norm": 1.3099342549073392, + "learning_rate": 1.9817585612444923e-05, + "loss": 1.3076, + "step": 1854 + }, + { + "epoch": 0.09, + "grad_norm": 1.143384774778335, + "learning_rate": 1.9817289308030695e-05, + "loss": 1.4126, + "step": 1855 + }, + { + "epoch": 0.09, + "grad_norm": 1.2933426037574447, + "learning_rate": 1.9816992765379725e-05, + "loss": 1.2622, + "step": 1856 + }, + { + "epoch": 0.09, + "grad_norm": 1.3653525395177861, + "learning_rate": 1.981669598449921e-05, + "loss": 1.2393, + "step": 1857 + }, + { + "epoch": 0.09, + "grad_norm": 1.1037763411359711, + "learning_rate": 1.981639896539636e-05, + "loss": 1.2681, + "step": 1858 + }, + { + "epoch": 0.09, + "grad_norm": 1.2235161558151872, + "learning_rate": 1.981610170807837e-05, + "loss": 1.3755, + "step": 1859 + }, + { + "epoch": 0.09, + "grad_norm": 1.1917725639178294, + "learning_rate": 1.981580421255246e-05, + "loss": 1.1958, + "step": 1860 + }, + { + "epoch": 0.09, + "grad_norm": 1.463007154066213, + "learning_rate": 1.981550647882585e-05, + "loss": 1.3535, + "step": 1861 + }, + { + "epoch": 0.09, + "grad_norm": 1.5289923588107923, + "learning_rate": 1.9815208506905764e-05, + "loss": 1.3887, + "step": 1862 + }, + { + "epoch": 0.09, + "grad_norm": 1.1503786950060537, + "learning_rate": 1.9814910296799436e-05, + "loss": 1.1577, + "step": 1863 + }, + { + "epoch": 0.09, + "grad_norm": 1.5339452811644811, + "learning_rate": 1.9814611848514095e-05, + "loss": 0.9851, + "step": 1864 + }, + { + "epoch": 0.09, + "grad_norm": 1.6069252152410434, + "learning_rate": 1.9814313162056993e-05, + "loss": 1.4082, + "step": 1865 + }, + { + "epoch": 0.09, + "grad_norm": 1.336403139218595, + "learning_rate": 1.981401423743537e-05, + "loss": 1.2539, + "step": 1866 + }, + { + "epoch": 0.09, + "grad_norm": 1.143071807788205, + "learning_rate": 1.9813715074656482e-05, + "loss": 1.2822, + "step": 1867 + }, + { + "epoch": 0.09, + "grad_norm": 1.271020837079988, + "learning_rate": 1.9813415673727594e-05, + "loss": 1.1118, + "step": 1868 + }, + { + "epoch": 0.09, + "grad_norm": 1.3314967392576869, + "learning_rate": 1.9813116034655966e-05, + "loss": 1.3105, + "step": 1869 + }, + { + "epoch": 0.09, + "grad_norm": 1.3462319275916297, + "learning_rate": 1.9812816157448874e-05, + "loss": 1.1533, + "step": 1870 + }, + { + "epoch": 0.09, + "grad_norm": 1.096247330792168, + "learning_rate": 1.9812516042113588e-05, + "loss": 1.1172, + "step": 1871 + }, + { + "epoch": 0.09, + "grad_norm": 1.1371645632359064, + "learning_rate": 1.9812215688657397e-05, + "loss": 1.2212, + "step": 1872 + }, + { + "epoch": 0.09, + "grad_norm": 1.448132484891773, + "learning_rate": 1.9811915097087587e-05, + "loss": 1.1807, + "step": 1873 + }, + { + "epoch": 0.09, + "grad_norm": 1.3924437232799183, + "learning_rate": 1.9811614267411453e-05, + "loss": 1.4575, + "step": 1874 + }, + { + "epoch": 0.09, + "grad_norm": 1.32699154787606, + "learning_rate": 1.98113131996363e-05, + "loss": 1.3921, + "step": 1875 + }, + { + "epoch": 0.09, + "grad_norm": 1.5497160891791912, + "learning_rate": 1.9811011893769424e-05, + "loss": 1.1748, + "step": 1876 + }, + { + "epoch": 0.09, + "grad_norm": 1.1260457209155514, + "learning_rate": 1.9810710349818147e-05, + "loss": 1.3169, + "step": 1877 + }, + { + "epoch": 0.09, + "grad_norm": 1.4889529331914144, + "learning_rate": 1.9810408567789777e-05, + "loss": 1.3105, + "step": 1878 + }, + { + "epoch": 0.09, + "grad_norm": 1.241501955003818, + "learning_rate": 1.9810106547691648e-05, + "loss": 1.2632, + "step": 1879 + }, + { + "epoch": 0.09, + "grad_norm": 1.3583733722203615, + "learning_rate": 1.980980428953108e-05, + "loss": 1.1792, + "step": 1880 + }, + { + "epoch": 0.09, + "grad_norm": 1.341871972146023, + "learning_rate": 1.9809501793315412e-05, + "loss": 1.2075, + "step": 1881 + }, + { + "epoch": 0.09, + "grad_norm": 1.3535527634985074, + "learning_rate": 1.9809199059051987e-05, + "loss": 1.1455, + "step": 1882 + }, + { + "epoch": 0.09, + "grad_norm": 1.505812085624719, + "learning_rate": 1.9808896086748146e-05, + "loss": 1.2812, + "step": 1883 + }, + { + "epoch": 0.09, + "grad_norm": 0.9410656715009651, + "learning_rate": 1.9808592876411245e-05, + "loss": 1.3184, + "step": 1884 + }, + { + "epoch": 0.09, + "grad_norm": 1.1909379414074224, + "learning_rate": 1.980828942804864e-05, + "loss": 1.25, + "step": 1885 + }, + { + "epoch": 0.09, + "grad_norm": 1.4323633281455233, + "learning_rate": 1.9807985741667696e-05, + "loss": 1.3062, + "step": 1886 + }, + { + "epoch": 0.09, + "grad_norm": 1.3912976285764427, + "learning_rate": 1.9807681817275783e-05, + "loss": 1.3447, + "step": 1887 + }, + { + "epoch": 0.09, + "grad_norm": 1.2244205229669156, + "learning_rate": 1.9807377654880274e-05, + "loss": 1.4263, + "step": 1888 + }, + { + "epoch": 0.09, + "grad_norm": 1.4648441371114906, + "learning_rate": 1.9807073254488554e-05, + "loss": 1.395, + "step": 1889 + }, + { + "epoch": 0.09, + "grad_norm": 1.1879106042254508, + "learning_rate": 1.9806768616108006e-05, + "loss": 1.2793, + "step": 1890 + }, + { + "epoch": 0.09, + "grad_norm": 1.3341631637910782, + "learning_rate": 1.9806463739746026e-05, + "loss": 1.0142, + "step": 1891 + }, + { + "epoch": 0.09, + "grad_norm": 0.8959301333413131, + "learning_rate": 1.9806158625410008e-05, + "loss": 1.3291, + "step": 1892 + }, + { + "epoch": 0.09, + "grad_norm": 1.1413543119468637, + "learning_rate": 1.980585327310736e-05, + "loss": 1.3394, + "step": 1893 + }, + { + "epoch": 0.09, + "grad_norm": 1.5555458544381386, + "learning_rate": 1.9805547682845494e-05, + "loss": 1.2676, + "step": 1894 + }, + { + "epoch": 0.09, + "grad_norm": 1.3135357680091526, + "learning_rate": 1.980524185463182e-05, + "loss": 1.2974, + "step": 1895 + }, + { + "epoch": 0.09, + "grad_norm": 1.2931462983151132, + "learning_rate": 1.9804935788473765e-05, + "loss": 1.1538, + "step": 1896 + }, + { + "epoch": 0.09, + "grad_norm": 1.271293707715259, + "learning_rate": 1.980462948437875e-05, + "loss": 1.2432, + "step": 1897 + }, + { + "epoch": 0.09, + "grad_norm": 1.2205407584606955, + "learning_rate": 1.9804322942354216e-05, + "loss": 1.3623, + "step": 1898 + }, + { + "epoch": 0.09, + "grad_norm": 1.271260876053814, + "learning_rate": 1.9804016162407593e-05, + "loss": 1.2573, + "step": 1899 + }, + { + "epoch": 0.09, + "grad_norm": 1.2050906094853715, + "learning_rate": 1.980370914454633e-05, + "loss": 1.3423, + "step": 1900 + }, + { + "epoch": 0.09, + "grad_norm": 1.0436959750842918, + "learning_rate": 1.9803401888777882e-05, + "loss": 1.2832, + "step": 1901 + }, + { + "epoch": 0.09, + "grad_norm": 1.5487180477430478, + "learning_rate": 1.98030943951097e-05, + "loss": 1.2437, + "step": 1902 + }, + { + "epoch": 0.09, + "grad_norm": 1.2160795571778615, + "learning_rate": 1.9802786663549244e-05, + "loss": 1.251, + "step": 1903 + }, + { + "epoch": 0.09, + "grad_norm": 1.4803574845508547, + "learning_rate": 1.9802478694103987e-05, + "loss": 1.1494, + "step": 1904 + }, + { + "epoch": 0.09, + "grad_norm": 1.365936864911018, + "learning_rate": 1.98021704867814e-05, + "loss": 1.3901, + "step": 1905 + }, + { + "epoch": 0.09, + "grad_norm": 1.2355988877387292, + "learning_rate": 1.9801862041588966e-05, + "loss": 1.2583, + "step": 1906 + }, + { + "epoch": 0.09, + "grad_norm": 1.225360345430169, + "learning_rate": 1.9801553358534157e-05, + "loss": 1.249, + "step": 1907 + }, + { + "epoch": 0.09, + "grad_norm": 1.3061305562978804, + "learning_rate": 1.980124443762448e-05, + "loss": 1.3647, + "step": 1908 + }, + { + "epoch": 0.09, + "grad_norm": 1.2210649214891178, + "learning_rate": 1.9800935278867425e-05, + "loss": 1.1528, + "step": 1909 + }, + { + "epoch": 0.09, + "grad_norm": 1.186099945768962, + "learning_rate": 1.9800625882270493e-05, + "loss": 1.2646, + "step": 1910 + }, + { + "epoch": 0.09, + "grad_norm": 1.5234890191114479, + "learning_rate": 1.9800316247841194e-05, + "loss": 1.3188, + "step": 1911 + }, + { + "epoch": 0.09, + "grad_norm": 1.1577257558176532, + "learning_rate": 1.9800006375587043e-05, + "loss": 1.1816, + "step": 1912 + }, + { + "epoch": 0.09, + "grad_norm": 1.1232217587412756, + "learning_rate": 1.9799696265515554e-05, + "loss": 1.042, + "step": 1913 + }, + { + "epoch": 0.09, + "grad_norm": 1.3539644189490196, + "learning_rate": 1.9799385917634256e-05, + "loss": 1.29, + "step": 1914 + }, + { + "epoch": 0.09, + "grad_norm": 1.576710776008854, + "learning_rate": 1.9799075331950687e-05, + "loss": 1.2925, + "step": 1915 + }, + { + "epoch": 0.09, + "grad_norm": 1.1629957241868367, + "learning_rate": 1.9798764508472373e-05, + "loss": 1.1528, + "step": 1916 + }, + { + "epoch": 0.09, + "grad_norm": 1.2714239662622298, + "learning_rate": 1.9798453447206862e-05, + "loss": 1.2642, + "step": 1917 + }, + { + "epoch": 0.09, + "grad_norm": 1.4317572925039406, + "learning_rate": 1.9798142148161703e-05, + "loss": 1.4307, + "step": 1918 + }, + { + "epoch": 0.09, + "grad_norm": 0.8748751891435529, + "learning_rate": 1.9797830611344448e-05, + "loss": 1.2036, + "step": 1919 + }, + { + "epoch": 0.09, + "grad_norm": 1.1420997441221192, + "learning_rate": 1.979751883676266e-05, + "loss": 1.0698, + "step": 1920 + }, + { + "epoch": 0.09, + "grad_norm": 1.1107925757162522, + "learning_rate": 1.9797206824423904e-05, + "loss": 1.3062, + "step": 1921 + }, + { + "epoch": 0.09, + "grad_norm": 1.3699510783233686, + "learning_rate": 1.979689457433575e-05, + "loss": 1.2842, + "step": 1922 + }, + { + "epoch": 0.09, + "grad_norm": 1.2783085978969468, + "learning_rate": 1.9796582086505774e-05, + "loss": 1.3325, + "step": 1923 + }, + { + "epoch": 0.09, + "grad_norm": 1.1116854271641414, + "learning_rate": 1.9796269360941563e-05, + "loss": 1.2915, + "step": 1924 + }, + { + "epoch": 0.09, + "grad_norm": 1.1713580322894388, + "learning_rate": 1.9795956397650708e-05, + "loss": 1.2319, + "step": 1925 + }, + { + "epoch": 0.09, + "grad_norm": 1.8197100156795338, + "learning_rate": 1.9795643196640794e-05, + "loss": 1.1592, + "step": 1926 + }, + { + "epoch": 0.09, + "grad_norm": 1.201138460275207, + "learning_rate": 1.9795329757919433e-05, + "loss": 1.1328, + "step": 1927 + }, + { + "epoch": 0.09, + "grad_norm": 1.180477836679222, + "learning_rate": 1.9795016081494222e-05, + "loss": 1.1226, + "step": 1928 + }, + { + "epoch": 0.09, + "grad_norm": 0.9534884172599033, + "learning_rate": 1.9794702167372776e-05, + "loss": 1.1675, + "step": 1929 + }, + { + "epoch": 0.09, + "grad_norm": 1.3825822589924321, + "learning_rate": 1.9794388015562718e-05, + "loss": 1.3335, + "step": 1930 + }, + { + "epoch": 0.09, + "grad_norm": 1.420058194474751, + "learning_rate": 1.9794073626071664e-05, + "loss": 1.3564, + "step": 1931 + }, + { + "epoch": 0.09, + "grad_norm": 1.3617818143415814, + "learning_rate": 1.9793758998907248e-05, + "loss": 1.3984, + "step": 1932 + }, + { + "epoch": 0.09, + "grad_norm": 1.3508097019586434, + "learning_rate": 1.9793444134077102e-05, + "loss": 1.3389, + "step": 1933 + }, + { + "epoch": 0.09, + "grad_norm": 1.1818094785205957, + "learning_rate": 1.979312903158887e-05, + "loss": 1.2568, + "step": 1934 + }, + { + "epoch": 0.09, + "grad_norm": 1.1395130624512262, + "learning_rate": 1.9792813691450193e-05, + "loss": 1.1558, + "step": 1935 + }, + { + "epoch": 0.09, + "grad_norm": 1.4412372225196295, + "learning_rate": 1.9792498113668733e-05, + "loss": 1.2866, + "step": 1936 + }, + { + "epoch": 0.09, + "grad_norm": 1.5415974413723252, + "learning_rate": 1.979218229825214e-05, + "loss": 1.4028, + "step": 1937 + }, + { + "epoch": 0.09, + "grad_norm": 0.9975591377603297, + "learning_rate": 1.979186624520808e-05, + "loss": 1.1362, + "step": 1938 + }, + { + "epoch": 0.09, + "grad_norm": 1.3604283528299983, + "learning_rate": 1.9791549954544226e-05, + "loss": 1.2744, + "step": 1939 + }, + { + "epoch": 0.09, + "grad_norm": 1.2611282165658084, + "learning_rate": 1.9791233426268245e-05, + "loss": 1.3066, + "step": 1940 + }, + { + "epoch": 0.09, + "grad_norm": 1.3773837719073607, + "learning_rate": 1.9790916660387825e-05, + "loss": 1.3213, + "step": 1941 + }, + { + "epoch": 0.09, + "grad_norm": 1.331772185699569, + "learning_rate": 1.9790599656910653e-05, + "loss": 1.2944, + "step": 1942 + }, + { + "epoch": 0.09, + "grad_norm": 1.6354764154756627, + "learning_rate": 1.9790282415844425e-05, + "loss": 1.2729, + "step": 1943 + }, + { + "epoch": 0.09, + "grad_norm": 1.3435888999656587, + "learning_rate": 1.978996493719683e-05, + "loss": 1.353, + "step": 1944 + }, + { + "epoch": 0.09, + "grad_norm": 1.0170187415256409, + "learning_rate": 1.9789647220975578e-05, + "loss": 1.2197, + "step": 1945 + }, + { + "epoch": 0.09, + "grad_norm": 1.3423722400733198, + "learning_rate": 1.9789329267188378e-05, + "loss": 1.0098, + "step": 1946 + }, + { + "epoch": 0.09, + "grad_norm": 0.9092439866234423, + "learning_rate": 1.9789011075842947e-05, + "loss": 1.0308, + "step": 1947 + }, + { + "epoch": 0.09, + "grad_norm": 1.0190733705070874, + "learning_rate": 1.9788692646947e-05, + "loss": 1.2646, + "step": 1948 + }, + { + "epoch": 0.09, + "grad_norm": 1.335657136392883, + "learning_rate": 1.978837398050828e-05, + "loss": 1.2798, + "step": 1949 + }, + { + "epoch": 0.09, + "grad_norm": 1.426907493007665, + "learning_rate": 1.9788055076534504e-05, + "loss": 1.3481, + "step": 1950 + }, + { + "epoch": 0.09, + "grad_norm": 1.3543197094558357, + "learning_rate": 1.9787735935033415e-05, + "loss": 1.1167, + "step": 1951 + }, + { + "epoch": 0.09, + "grad_norm": 1.440283363594555, + "learning_rate": 1.9787416556012766e-05, + "loss": 1.2124, + "step": 1952 + }, + { + "epoch": 0.09, + "grad_norm": 1.2758173427514399, + "learning_rate": 1.9787096939480296e-05, + "loss": 1.3149, + "step": 1953 + }, + { + "epoch": 0.09, + "grad_norm": 1.3577642509279624, + "learning_rate": 1.9786777085443766e-05, + "loss": 1.1011, + "step": 1954 + }, + { + "epoch": 0.09, + "grad_norm": 1.3296651840060834, + "learning_rate": 1.9786456993910938e-05, + "loss": 1.335, + "step": 1955 + }, + { + "epoch": 0.09, + "grad_norm": 1.274691401027712, + "learning_rate": 1.9786136664889583e-05, + "loss": 1.2183, + "step": 1956 + }, + { + "epoch": 0.09, + "grad_norm": 1.3340410916063563, + "learning_rate": 1.9785816098387468e-05, + "loss": 1.1196, + "step": 1957 + }, + { + "epoch": 0.09, + "grad_norm": 1.3834039904808135, + "learning_rate": 1.9785495294412378e-05, + "loss": 1.1489, + "step": 1958 + }, + { + "epoch": 0.09, + "grad_norm": 1.3465260967343657, + "learning_rate": 1.9785174252972092e-05, + "loss": 1.2104, + "step": 1959 + }, + { + "epoch": 0.09, + "grad_norm": 1.3693942367620284, + "learning_rate": 1.9784852974074408e-05, + "loss": 1.1582, + "step": 1960 + }, + { + "epoch": 0.09, + "grad_norm": 1.5251741720063983, + "learning_rate": 1.9784531457727116e-05, + "loss": 1.3569, + "step": 1961 + }, + { + "epoch": 0.09, + "grad_norm": 1.1505700713100273, + "learning_rate": 1.978420970393802e-05, + "loss": 1.3442, + "step": 1962 + }, + { + "epoch": 0.09, + "grad_norm": 1.4424192531196682, + "learning_rate": 1.978388771271493e-05, + "loss": 1.3423, + "step": 1963 + }, + { + "epoch": 0.09, + "grad_norm": 1.2637014815844476, + "learning_rate": 1.9783565484065657e-05, + "loss": 1.1538, + "step": 1964 + }, + { + "epoch": 0.09, + "grad_norm": 1.2633761561461772, + "learning_rate": 1.978324301799803e-05, + "loss": 1.3345, + "step": 1965 + }, + { + "epoch": 0.09, + "grad_norm": 0.7021856767812172, + "learning_rate": 1.978292031451986e-05, + "loss": 1.1279, + "step": 1966 + }, + { + "epoch": 0.09, + "grad_norm": 1.115255513490482, + "learning_rate": 1.978259737363898e-05, + "loss": 1.2012, + "step": 1967 + }, + { + "epoch": 0.09, + "grad_norm": 1.3305716134564918, + "learning_rate": 1.9782274195363237e-05, + "loss": 1.2954, + "step": 1968 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944257079307671, + "learning_rate": 1.978195077970047e-05, + "loss": 1.2012, + "step": 1969 + }, + { + "epoch": 0.09, + "grad_norm": 1.4524427003404283, + "learning_rate": 1.978162712665852e-05, + "loss": 1.1606, + "step": 1970 + }, + { + "epoch": 0.09, + "grad_norm": 1.2487084159904944, + "learning_rate": 1.978130323624525e-05, + "loss": 1.3584, + "step": 1971 + }, + { + "epoch": 0.09, + "grad_norm": 1.4492066332832005, + "learning_rate": 1.9780979108468513e-05, + "loss": 1.2188, + "step": 1972 + }, + { + "epoch": 0.09, + "grad_norm": 1.325234865682575, + "learning_rate": 1.978065474333618e-05, + "loss": 1.2339, + "step": 1973 + }, + { + "epoch": 0.09, + "grad_norm": 1.4077117149483565, + "learning_rate": 1.978033014085612e-05, + "loss": 1.355, + "step": 1974 + }, + { + "epoch": 0.09, + "grad_norm": 1.15703544827588, + "learning_rate": 1.9780005301036208e-05, + "loss": 1.3276, + "step": 1975 + }, + { + "epoch": 0.1, + "grad_norm": 1.1049386815963613, + "learning_rate": 1.9779680223884335e-05, + "loss": 1.2349, + "step": 1976 + }, + { + "epoch": 0.1, + "grad_norm": 1.2077627868885916, + "learning_rate": 1.977935490940838e-05, + "loss": 1.2646, + "step": 1977 + }, + { + "epoch": 0.1, + "grad_norm": 1.3986305644768324, + "learning_rate": 1.977902935761624e-05, + "loss": 1.251, + "step": 1978 + }, + { + "epoch": 0.1, + "grad_norm": 1.6066739875333151, + "learning_rate": 1.977870356851582e-05, + "loss": 1.2417, + "step": 1979 + }, + { + "epoch": 0.1, + "grad_norm": 1.3034633542589464, + "learning_rate": 1.9778377542115022e-05, + "loss": 1.3931, + "step": 1980 + }, + { + "epoch": 0.1, + "grad_norm": 1.291240417190449, + "learning_rate": 1.9778051278421758e-05, + "loss": 1.2734, + "step": 1981 + }, + { + "epoch": 0.1, + "grad_norm": 1.121421271892713, + "learning_rate": 1.9777724777443943e-05, + "loss": 1.3477, + "step": 1982 + }, + { + "epoch": 0.1, + "grad_norm": 0.7898969503456257, + "learning_rate": 1.9777398039189507e-05, + "loss": 1.1743, + "step": 1983 + }, + { + "epoch": 0.1, + "grad_norm": 1.267344391691263, + "learning_rate": 1.9777071063666372e-05, + "loss": 1.2603, + "step": 1984 + }, + { + "epoch": 0.1, + "grad_norm": 1.1057886422483736, + "learning_rate": 1.9776743850882476e-05, + "loss": 1.1899, + "step": 1985 + }, + { + "epoch": 0.1, + "grad_norm": 1.252124578417424, + "learning_rate": 1.977641640084576e-05, + "loss": 1.2109, + "step": 1986 + }, + { + "epoch": 0.1, + "grad_norm": 1.2469902171086902, + "learning_rate": 1.977608871356417e-05, + "loss": 1.208, + "step": 1987 + }, + { + "epoch": 0.1, + "grad_norm": 1.234566485219563, + "learning_rate": 1.9775760789045656e-05, + "loss": 1.2529, + "step": 1988 + }, + { + "epoch": 0.1, + "grad_norm": 1.1096364742127283, + "learning_rate": 1.9775432627298175e-05, + "loss": 1.3228, + "step": 1989 + }, + { + "epoch": 0.1, + "grad_norm": 1.264457002539019, + "learning_rate": 1.97751042283297e-05, + "loss": 1.1265, + "step": 1990 + }, + { + "epoch": 0.1, + "grad_norm": 1.0905838112933341, + "learning_rate": 1.9774775592148185e-05, + "loss": 1.103, + "step": 1991 + }, + { + "epoch": 0.1, + "grad_norm": 1.5294572622315383, + "learning_rate": 1.9774446718761616e-05, + "loss": 1.3809, + "step": 1992 + }, + { + "epoch": 0.1, + "grad_norm": 0.9072722587924513, + "learning_rate": 1.9774117608177968e-05, + "loss": 1.1846, + "step": 1993 + }, + { + "epoch": 0.1, + "grad_norm": 1.4550173029197349, + "learning_rate": 1.9773788260405237e-05, + "loss": 1.2935, + "step": 1994 + }, + { + "epoch": 0.1, + "grad_norm": 1.3806620182380174, + "learning_rate": 1.9773458675451406e-05, + "loss": 1.292, + "step": 1995 + }, + { + "epoch": 0.1, + "grad_norm": 1.2539535465453435, + "learning_rate": 1.977312885332447e-05, + "loss": 1.0999, + "step": 1996 + }, + { + "epoch": 0.1, + "grad_norm": 1.1418369914818083, + "learning_rate": 1.9772798794032445e-05, + "loss": 1.3149, + "step": 1997 + }, + { + "epoch": 0.1, + "grad_norm": 1.189015858024471, + "learning_rate": 1.977246849758333e-05, + "loss": 1.2783, + "step": 1998 + }, + { + "epoch": 0.1, + "grad_norm": 1.0220195531797676, + "learning_rate": 1.977213796398515e-05, + "loss": 1.3462, + "step": 1999 + }, + { + "epoch": 0.1, + "grad_norm": 1.2476356362494831, + "learning_rate": 1.9771807193245913e-05, + "loss": 1.3086, + "step": 2000 + }, + { + "epoch": 0.1, + "grad_norm": 1.3285604339072905, + "learning_rate": 1.9771476185373658e-05, + "loss": 1.4009, + "step": 2001 + }, + { + "epoch": 0.1, + "grad_norm": 1.2975221481531984, + "learning_rate": 1.9771144940376413e-05, + "loss": 1.3105, + "step": 2002 + }, + { + "epoch": 0.1, + "grad_norm": 1.2816241879339203, + "learning_rate": 1.9770813458262212e-05, + "loss": 1.3433, + "step": 2003 + }, + { + "epoch": 0.1, + "grad_norm": 0.6431789835457682, + "learning_rate": 1.977048173903911e-05, + "loss": 1.1978, + "step": 2004 + }, + { + "epoch": 0.1, + "grad_norm": 0.6431789835457682, + "learning_rate": 1.977048173903911e-05, + "loss": 1.3042, + "step": 2005 + }, + { + "epoch": 0.1, + "grad_norm": 0.6431789835457682, + "learning_rate": 1.977048173903911e-05, + "loss": 1.2979, + "step": 2006 + }, + { + "epoch": 0.1, + "grad_norm": 1.0425169825151315, + "learning_rate": 1.9770149782715143e-05, + "loss": 1.1895, + "step": 2007 + }, + { + "epoch": 0.1, + "grad_norm": 1.811741374945619, + "learning_rate": 1.9769817589298377e-05, + "loss": 1.4292, + "step": 2008 + }, + { + "epoch": 0.1, + "grad_norm": 1.302940854013643, + "learning_rate": 1.976948515879687e-05, + "loss": 1.2368, + "step": 2009 + }, + { + "epoch": 0.1, + "grad_norm": 1.2686137054636135, + "learning_rate": 1.976915249121869e-05, + "loss": 1.0947, + "step": 2010 + }, + { + "epoch": 0.1, + "grad_norm": 1.5734766186908669, + "learning_rate": 1.9768819586571907e-05, + "loss": 1.1758, + "step": 2011 + }, + { + "epoch": 0.1, + "grad_norm": 1.6118039212175372, + "learning_rate": 1.9768486444864605e-05, + "loss": 1.3184, + "step": 2012 + }, + { + "epoch": 0.1, + "grad_norm": 1.1596293889916438, + "learning_rate": 1.9768153066104863e-05, + "loss": 1.1724, + "step": 2013 + }, + { + "epoch": 0.1, + "grad_norm": 1.151504379768896, + "learning_rate": 1.9767819450300773e-05, + "loss": 1.1367, + "step": 2014 + }, + { + "epoch": 0.1, + "grad_norm": 1.4823393060304144, + "learning_rate": 1.976748559746043e-05, + "loss": 1.332, + "step": 2015 + }, + { + "epoch": 0.1, + "grad_norm": 1.375982025313618, + "learning_rate": 1.9767151507591943e-05, + "loss": 1.2598, + "step": 2016 + }, + { + "epoch": 0.1, + "grad_norm": 1.1952637599835143, + "learning_rate": 1.976681718070341e-05, + "loss": 1.21, + "step": 2017 + }, + { + "epoch": 0.1, + "grad_norm": 1.2192532735607047, + "learning_rate": 1.9766482616802943e-05, + "loss": 1.2207, + "step": 2018 + }, + { + "epoch": 0.1, + "grad_norm": 1.1588166900258954, + "learning_rate": 1.9766147815898668e-05, + "loss": 1.4526, + "step": 2019 + }, + { + "epoch": 0.1, + "grad_norm": 1.0178724967355894, + "learning_rate": 1.9765812777998704e-05, + "loss": 1.2188, + "step": 2020 + }, + { + "epoch": 0.1, + "grad_norm": 1.058335571937333, + "learning_rate": 1.9765477503111187e-05, + "loss": 1.3188, + "step": 2021 + }, + { + "epoch": 0.1, + "grad_norm": 1.11593542253497, + "learning_rate": 1.976514199124425e-05, + "loss": 1.2622, + "step": 2022 + }, + { + "epoch": 0.1, + "grad_norm": 1.2694036694685196, + "learning_rate": 1.9764806242406034e-05, + "loss": 1.2954, + "step": 2023 + }, + { + "epoch": 0.1, + "grad_norm": 1.2511534529361326, + "learning_rate": 1.976447025660469e-05, + "loss": 1.2246, + "step": 2024 + }, + { + "epoch": 0.1, + "grad_norm": 1.3675042023071473, + "learning_rate": 1.9764134033848367e-05, + "loss": 1.3057, + "step": 2025 + }, + { + "epoch": 0.1, + "grad_norm": 1.374546723449986, + "learning_rate": 1.9763797574145227e-05, + "loss": 1.1943, + "step": 2026 + }, + { + "epoch": 0.1, + "grad_norm": 1.301712629317597, + "learning_rate": 1.9763460877503435e-05, + "loss": 1.3218, + "step": 2027 + }, + { + "epoch": 0.1, + "grad_norm": 1.0436843628174695, + "learning_rate": 1.976312394393116e-05, + "loss": 1.3774, + "step": 2028 + }, + { + "epoch": 0.1, + "grad_norm": 1.1615515825629406, + "learning_rate": 1.976278677343658e-05, + "loss": 1.1602, + "step": 2029 + }, + { + "epoch": 0.1, + "grad_norm": 1.2740689357527017, + "learning_rate": 1.9762449366027873e-05, + "loss": 1.2324, + "step": 2030 + }, + { + "epoch": 0.1, + "grad_norm": 1.0471733195665198, + "learning_rate": 1.9762111721713232e-05, + "loss": 1.2847, + "step": 2031 + }, + { + "epoch": 0.1, + "grad_norm": 1.2869648584857225, + "learning_rate": 1.976177384050085e-05, + "loss": 1.208, + "step": 2032 + }, + { + "epoch": 0.1, + "grad_norm": 0.8759124624504705, + "learning_rate": 1.9761435722398926e-05, + "loss": 1.2861, + "step": 2033 + }, + { + "epoch": 0.1, + "grad_norm": 1.6012159367671621, + "learning_rate": 1.9761097367415663e-05, + "loss": 1.3315, + "step": 2034 + }, + { + "epoch": 0.1, + "grad_norm": 1.4941478423089558, + "learning_rate": 1.9760758775559275e-05, + "loss": 1.1357, + "step": 2035 + }, + { + "epoch": 0.1, + "grad_norm": 1.267835879913572, + "learning_rate": 1.9760419946837972e-05, + "loss": 1.165, + "step": 2036 + }, + { + "epoch": 0.1, + "grad_norm": 1.327553436482191, + "learning_rate": 1.9760080881259986e-05, + "loss": 1.1343, + "step": 2037 + }, + { + "epoch": 0.1, + "grad_norm": 1.0747600695097697, + "learning_rate": 1.975974157883354e-05, + "loss": 1.168, + "step": 2038 + }, + { + "epoch": 0.1, + "grad_norm": 0.9535143422121894, + "learning_rate": 1.9759402039566865e-05, + "loss": 1.1592, + "step": 2039 + }, + { + "epoch": 0.1, + "grad_norm": 1.2257212165477773, + "learning_rate": 1.9759062263468207e-05, + "loss": 1.2427, + "step": 2040 + }, + { + "epoch": 0.1, + "grad_norm": 1.33870772525154, + "learning_rate": 1.9758722250545808e-05, + "loss": 1.033, + "step": 2041 + }, + { + "epoch": 0.1, + "grad_norm": 1.2690613040827619, + "learning_rate": 1.9758382000807915e-05, + "loss": 1.2939, + "step": 2042 + }, + { + "epoch": 0.1, + "grad_norm": 1.1847167988261655, + "learning_rate": 1.9758041514262795e-05, + "loss": 1.1997, + "step": 2043 + }, + { + "epoch": 0.1, + "grad_norm": 1.327837899753317, + "learning_rate": 1.9757700790918704e-05, + "loss": 1.2539, + "step": 2044 + }, + { + "epoch": 0.1, + "grad_norm": 0.8751618093928132, + "learning_rate": 1.9757359830783907e-05, + "loss": 1.3491, + "step": 2045 + }, + { + "epoch": 0.1, + "grad_norm": 0.9442029754260975, + "learning_rate": 1.9757018633866688e-05, + "loss": 1.3022, + "step": 2046 + }, + { + "epoch": 0.1, + "grad_norm": 1.4035365147878054, + "learning_rate": 1.9756677200175316e-05, + "loss": 1.063, + "step": 2047 + }, + { + "epoch": 0.1, + "grad_norm": 1.8211198988476747, + "learning_rate": 1.9756335529718086e-05, + "loss": 1.25, + "step": 2048 + }, + { + "epoch": 0.1, + "grad_norm": 1.3777729027049, + "learning_rate": 1.9755993622503283e-05, + "loss": 1.0942, + "step": 2049 + }, + { + "epoch": 0.1, + "grad_norm": 0.8899834118655995, + "learning_rate": 1.9755651478539203e-05, + "loss": 1.106, + "step": 2050 + }, + { + "epoch": 0.1, + "grad_norm": 1.2674975470000245, + "learning_rate": 1.975530909783416e-05, + "loss": 1.4248, + "step": 2051 + }, + { + "epoch": 0.1, + "grad_norm": 1.3766201367701416, + "learning_rate": 1.9754966480396447e-05, + "loss": 1.2812, + "step": 2052 + }, + { + "epoch": 0.1, + "grad_norm": 1.4859819454376624, + "learning_rate": 1.9754623626234387e-05, + "loss": 1.4546, + "step": 2053 + }, + { + "epoch": 0.1, + "grad_norm": 1.5759141603448967, + "learning_rate": 1.9754280535356302e-05, + "loss": 1.2925, + "step": 2054 + }, + { + "epoch": 0.1, + "grad_norm": 1.170172428054944, + "learning_rate": 1.9753937207770513e-05, + "loss": 1.1431, + "step": 2055 + }, + { + "epoch": 0.1, + "grad_norm": 1.1480184458850897, + "learning_rate": 1.975359364348535e-05, + "loss": 1.2285, + "step": 2056 + }, + { + "epoch": 0.1, + "grad_norm": 1.2025079683706563, + "learning_rate": 1.975324984250916e-05, + "loss": 1.2104, + "step": 2057 + }, + { + "epoch": 0.1, + "grad_norm": 1.1901110596937803, + "learning_rate": 1.9752905804850275e-05, + "loss": 1.3276, + "step": 2058 + }, + { + "epoch": 0.1, + "grad_norm": 0.9789933871269692, + "learning_rate": 1.975256153051705e-05, + "loss": 1.2852, + "step": 2059 + }, + { + "epoch": 0.1, + "grad_norm": 1.2673149648134425, + "learning_rate": 1.9752217019517838e-05, + "loss": 1.0425, + "step": 2060 + }, + { + "epoch": 0.1, + "grad_norm": 1.145225361730154, + "learning_rate": 1.9751872271860998e-05, + "loss": 1.1748, + "step": 2061 + }, + { + "epoch": 0.1, + "grad_norm": 1.050211635967624, + "learning_rate": 1.9751527287554898e-05, + "loss": 1.1855, + "step": 2062 + }, + { + "epoch": 0.1, + "grad_norm": 1.2586377001323827, + "learning_rate": 1.975118206660791e-05, + "loss": 1.376, + "step": 2063 + }, + { + "epoch": 0.1, + "grad_norm": 1.6282943310107802, + "learning_rate": 1.975083660902841e-05, + "loss": 1.1802, + "step": 2064 + }, + { + "epoch": 0.1, + "grad_norm": 1.3840946754986514, + "learning_rate": 1.975049091482478e-05, + "loss": 1.1899, + "step": 2065 + }, + { + "epoch": 0.1, + "grad_norm": 1.2440375351921649, + "learning_rate": 1.975014498400541e-05, + "loss": 1.1987, + "step": 2066 + }, + { + "epoch": 0.1, + "grad_norm": 1.258269264829545, + "learning_rate": 1.97497988165787e-05, + "loss": 1.1309, + "step": 2067 + }, + { + "epoch": 0.1, + "grad_norm": 1.3391240575611765, + "learning_rate": 1.974945241255304e-05, + "loss": 1.3579, + "step": 2068 + }, + { + "epoch": 0.1, + "grad_norm": 1.352244351802837, + "learning_rate": 1.974910577193685e-05, + "loss": 1.2871, + "step": 2069 + }, + { + "epoch": 0.1, + "grad_norm": 1.1585402461726408, + "learning_rate": 1.974875889473853e-05, + "loss": 1.3091, + "step": 2070 + }, + { + "epoch": 0.1, + "grad_norm": 1.3295266585758037, + "learning_rate": 1.9748411780966497e-05, + "loss": 1.2925, + "step": 2071 + }, + { + "epoch": 0.1, + "grad_norm": 1.3470523494497024, + "learning_rate": 1.9748064430629185e-05, + "loss": 1.1895, + "step": 2072 + }, + { + "epoch": 0.1, + "grad_norm": 1.4811851783744878, + "learning_rate": 1.9747716843735015e-05, + "loss": 1.1597, + "step": 2073 + }, + { + "epoch": 0.1, + "grad_norm": 1.136876600197557, + "learning_rate": 1.9747369020292424e-05, + "loss": 1.2231, + "step": 2074 + }, + { + "epoch": 0.1, + "grad_norm": 1.4512039065338111, + "learning_rate": 1.9747020960309852e-05, + "loss": 1.3145, + "step": 2075 + }, + { + "epoch": 0.1, + "grad_norm": 1.3581551911941208, + "learning_rate": 1.9746672663795748e-05, + "loss": 1.25, + "step": 2076 + }, + { + "epoch": 0.1, + "grad_norm": 1.4253364179785786, + "learning_rate": 1.9746324130758563e-05, + "loss": 1.4414, + "step": 2077 + }, + { + "epoch": 0.1, + "grad_norm": 1.3165962230892718, + "learning_rate": 1.9745975361206753e-05, + "loss": 1.2324, + "step": 2078 + }, + { + "epoch": 0.1, + "grad_norm": 1.133620162564484, + "learning_rate": 1.9745626355148782e-05, + "loss": 1.2598, + "step": 2079 + }, + { + "epoch": 0.1, + "grad_norm": 1.2880082756464366, + "learning_rate": 1.974527711259312e-05, + "loss": 1.2729, + "step": 2080 + }, + { + "epoch": 0.1, + "grad_norm": 1.304017640875334, + "learning_rate": 1.9744927633548246e-05, + "loss": 1.3809, + "step": 2081 + }, + { + "epoch": 0.1, + "grad_norm": 1.246161601012671, + "learning_rate": 1.9744577918022637e-05, + "loss": 1.1362, + "step": 2082 + }, + { + "epoch": 0.1, + "grad_norm": 1.6868296538718328, + "learning_rate": 1.9744227966024776e-05, + "loss": 1.3516, + "step": 2083 + }, + { + "epoch": 0.1, + "grad_norm": 1.2192448983173112, + "learning_rate": 1.974387777756316e-05, + "loss": 1.3057, + "step": 2084 + }, + { + "epoch": 0.1, + "grad_norm": 1.4770540258135623, + "learning_rate": 1.9743527352646286e-05, + "loss": 1.1897, + "step": 2085 + }, + { + "epoch": 0.1, + "grad_norm": 1.3566882731581809, + "learning_rate": 1.974317669128266e-05, + "loss": 1.3057, + "step": 2086 + }, + { + "epoch": 0.1, + "grad_norm": 1.40642642558953, + "learning_rate": 1.9742825793480785e-05, + "loss": 1.1465, + "step": 2087 + }, + { + "epoch": 0.1, + "grad_norm": 1.3383711792757, + "learning_rate": 1.9742474659249183e-05, + "loss": 1.1821, + "step": 2088 + }, + { + "epoch": 0.1, + "grad_norm": 1.8507926201850724, + "learning_rate": 1.9742123288596375e-05, + "loss": 1.3882, + "step": 2089 + }, + { + "epoch": 0.1, + "grad_norm": 0.937094559559786, + "learning_rate": 1.974177168153088e-05, + "loss": 1.2422, + "step": 2090 + }, + { + "epoch": 0.1, + "grad_norm": 1.5027441288268308, + "learning_rate": 1.974141983806124e-05, + "loss": 1.1851, + "step": 2091 + }, + { + "epoch": 0.1, + "grad_norm": 1.3965964756986515, + "learning_rate": 1.974106775819599e-05, + "loss": 1.2402, + "step": 2092 + }, + { + "epoch": 0.1, + "grad_norm": 1.5253514816118088, + "learning_rate": 1.974071544194367e-05, + "loss": 1.1699, + "step": 2093 + }, + { + "epoch": 0.1, + "grad_norm": 1.1816931807600346, + "learning_rate": 1.9740362889312835e-05, + "loss": 1.1284, + "step": 2094 + }, + { + "epoch": 0.1, + "grad_norm": 1.6299425479368872, + "learning_rate": 1.9740010100312036e-05, + "loss": 1.4028, + "step": 2095 + }, + { + "epoch": 0.1, + "grad_norm": 1.3689840410137184, + "learning_rate": 1.9739657074949835e-05, + "loss": 1.2056, + "step": 2096 + }, + { + "epoch": 0.1, + "grad_norm": 1.292208207739903, + "learning_rate": 1.9739303813234802e-05, + "loss": 1.2471, + "step": 2097 + }, + { + "epoch": 0.1, + "grad_norm": 1.151404044502586, + "learning_rate": 1.973895031517551e-05, + "loss": 1.2822, + "step": 2098 + }, + { + "epoch": 0.1, + "grad_norm": 1.3827946539761797, + "learning_rate": 1.973859658078053e-05, + "loss": 1.2808, + "step": 2099 + }, + { + "epoch": 0.1, + "grad_norm": 1.2385556342990627, + "learning_rate": 1.9738242610058457e-05, + "loss": 1.1689, + "step": 2100 + }, + { + "epoch": 0.1, + "grad_norm": 1.2421587436161778, + "learning_rate": 1.973788840301787e-05, + "loss": 1.0898, + "step": 2101 + }, + { + "epoch": 0.1, + "grad_norm": 0.9548569947090044, + "learning_rate": 1.9737533959667373e-05, + "loss": 1.228, + "step": 2102 + }, + { + "epoch": 0.1, + "grad_norm": 1.3207046958723425, + "learning_rate": 1.973717928001556e-05, + "loss": 1.2935, + "step": 2103 + }, + { + "epoch": 0.1, + "grad_norm": 1.2168872154894168, + "learning_rate": 1.9736824364071047e-05, + "loss": 1.1958, + "step": 2104 + }, + { + "epoch": 0.1, + "grad_norm": 0.9753675355316361, + "learning_rate": 1.9736469211842437e-05, + "loss": 1.1328, + "step": 2105 + }, + { + "epoch": 0.1, + "grad_norm": 1.2892610595489347, + "learning_rate": 1.9736113823338357e-05, + "loss": 1.0649, + "step": 2106 + }, + { + "epoch": 0.1, + "grad_norm": 1.2332679278406518, + "learning_rate": 1.9735758198567427e-05, + "loss": 1.1318, + "step": 2107 + }, + { + "epoch": 0.1, + "grad_norm": 1.1141534225552783, + "learning_rate": 1.9735402337538275e-05, + "loss": 1.2563, + "step": 2108 + }, + { + "epoch": 0.1, + "grad_norm": 1.3548231220713212, + "learning_rate": 1.973504624025954e-05, + "loss": 1.293, + "step": 2109 + }, + { + "epoch": 0.1, + "grad_norm": 1.2570493791585278, + "learning_rate": 1.9734689906739862e-05, + "loss": 1.1519, + "step": 2110 + }, + { + "epoch": 0.1, + "grad_norm": 1.4854070157850352, + "learning_rate": 1.9734333336987886e-05, + "loss": 1.2329, + "step": 2111 + }, + { + "epoch": 0.1, + "grad_norm": 1.403713357679581, + "learning_rate": 1.9733976531012272e-05, + "loss": 1.3428, + "step": 2112 + }, + { + "epoch": 0.1, + "grad_norm": 1.1337088642976172, + "learning_rate": 1.9733619488821673e-05, + "loss": 1.3022, + "step": 2113 + }, + { + "epoch": 0.1, + "grad_norm": 1.0579583369122714, + "learning_rate": 1.9733262210424752e-05, + "loss": 1.0232, + "step": 2114 + }, + { + "epoch": 0.1, + "grad_norm": 1.273359711884571, + "learning_rate": 1.9732904695830186e-05, + "loss": 1.2739, + "step": 2115 + }, + { + "epoch": 0.1, + "grad_norm": 1.338082259003379, + "learning_rate": 1.9732546945046643e-05, + "loss": 1.3257, + "step": 2116 + }, + { + "epoch": 0.1, + "grad_norm": 1.2832025327284362, + "learning_rate": 1.9732188958082813e-05, + "loss": 1.2368, + "step": 2117 + }, + { + "epoch": 0.1, + "grad_norm": 1.6326102404701082, + "learning_rate": 1.9731830734947373e-05, + "loss": 0.9412, + "step": 2118 + }, + { + "epoch": 0.1, + "grad_norm": 1.2148110510365433, + "learning_rate": 1.9731472275649023e-05, + "loss": 1.2104, + "step": 2119 + }, + { + "epoch": 0.1, + "grad_norm": 1.0166740413692146, + "learning_rate": 1.9731113580196458e-05, + "loss": 1.2202, + "step": 2120 + }, + { + "epoch": 0.1, + "grad_norm": 1.9341972562232586, + "learning_rate": 1.9730754648598388e-05, + "loss": 1.2808, + "step": 2121 + }, + { + "epoch": 0.1, + "grad_norm": 1.3533844202073724, + "learning_rate": 1.973039548086352e-05, + "loss": 1.1768, + "step": 2122 + }, + { + "epoch": 0.1, + "grad_norm": 1.4614785310324496, + "learning_rate": 1.9730036077000568e-05, + "loss": 1.3877, + "step": 2123 + }, + { + "epoch": 0.1, + "grad_norm": 1.0374640083242765, + "learning_rate": 1.9729676437018256e-05, + "loss": 1.1162, + "step": 2124 + }, + { + "epoch": 0.1, + "grad_norm": 1.2556881264038768, + "learning_rate": 1.972931656092531e-05, + "loss": 1.2446, + "step": 2125 + }, + { + "epoch": 0.1, + "grad_norm": 1.4977403772459381, + "learning_rate": 1.9728956448730466e-05, + "loss": 1.1123, + "step": 2126 + }, + { + "epoch": 0.1, + "grad_norm": 1.2958075708469714, + "learning_rate": 1.972859610044246e-05, + "loss": 1.2554, + "step": 2127 + }, + { + "epoch": 0.1, + "grad_norm": 1.5050797903707775, + "learning_rate": 1.9728235516070037e-05, + "loss": 1.1829, + "step": 2128 + }, + { + "epoch": 0.1, + "grad_norm": 1.4070360104616044, + "learning_rate": 1.9727874695621946e-05, + "loss": 1.3149, + "step": 2129 + }, + { + "epoch": 0.1, + "grad_norm": 1.2756179722337815, + "learning_rate": 1.9727513639106948e-05, + "loss": 1.2456, + "step": 2130 + }, + { + "epoch": 0.1, + "grad_norm": 1.1904486641369034, + "learning_rate": 1.9727152346533798e-05, + "loss": 1.2983, + "step": 2131 + }, + { + "epoch": 0.1, + "grad_norm": 1.3542080249190152, + "learning_rate": 1.972679081791127e-05, + "loss": 1.395, + "step": 2132 + }, + { + "epoch": 0.1, + "grad_norm": 1.2319622520050943, + "learning_rate": 1.972642905324813e-05, + "loss": 1.249, + "step": 2133 + }, + { + "epoch": 0.1, + "grad_norm": 1.3157071911880212, + "learning_rate": 1.9726067052553167e-05, + "loss": 1.3086, + "step": 2134 + }, + { + "epoch": 0.1, + "grad_norm": 1.5315780072687646, + "learning_rate": 1.9725704815835156e-05, + "loss": 1.3633, + "step": 2135 + }, + { + "epoch": 0.1, + "grad_norm": 1.2300932964116849, + "learning_rate": 1.9725342343102897e-05, + "loss": 1.2749, + "step": 2136 + }, + { + "epoch": 0.1, + "grad_norm": 0.9929257144150984, + "learning_rate": 1.9724979634365174e-05, + "loss": 1.3149, + "step": 2137 + }, + { + "epoch": 0.1, + "grad_norm": 1.4011339791909128, + "learning_rate": 1.9724616689630798e-05, + "loss": 1.1426, + "step": 2138 + }, + { + "epoch": 0.1, + "grad_norm": 1.4323935656314877, + "learning_rate": 1.9724253508908574e-05, + "loss": 1.3105, + "step": 2139 + }, + { + "epoch": 0.1, + "grad_norm": 1.0584622995563442, + "learning_rate": 1.9723890092207317e-05, + "loss": 1.2896, + "step": 2140 + }, + { + "epoch": 0.1, + "grad_norm": 1.2755369057107113, + "learning_rate": 1.972352643953584e-05, + "loss": 1.2412, + "step": 2141 + }, + { + "epoch": 0.1, + "grad_norm": 1.2500332080010053, + "learning_rate": 1.9723162550902975e-05, + "loss": 1.0994, + "step": 2142 + }, + { + "epoch": 0.1, + "grad_norm": 1.25418154486793, + "learning_rate": 1.9722798426317552e-05, + "loss": 1.2861, + "step": 2143 + }, + { + "epoch": 0.1, + "grad_norm": 1.4625784062922456, + "learning_rate": 1.9722434065788398e-05, + "loss": 1.2549, + "step": 2144 + }, + { + "epoch": 0.1, + "grad_norm": 1.48333333155445, + "learning_rate": 1.972206946932437e-05, + "loss": 1.3779, + "step": 2145 + }, + { + "epoch": 0.1, + "grad_norm": 1.3964587729054958, + "learning_rate": 1.97217046369343e-05, + "loss": 1.1909, + "step": 2146 + }, + { + "epoch": 0.1, + "grad_norm": 1.191622960455367, + "learning_rate": 1.9721339568627055e-05, + "loss": 1.27, + "step": 2147 + }, + { + "epoch": 0.1, + "grad_norm": 1.2775699739197994, + "learning_rate": 1.9720974264411484e-05, + "loss": 1.2368, + "step": 2148 + }, + { + "epoch": 0.1, + "grad_norm": 1.4304155257018925, + "learning_rate": 1.972060872429646e-05, + "loss": 1.3389, + "step": 2149 + }, + { + "epoch": 0.1, + "grad_norm": 1.3215539046233244, + "learning_rate": 1.9720242948290847e-05, + "loss": 1.2529, + "step": 2150 + }, + { + "epoch": 0.1, + "grad_norm": 1.329016274850806, + "learning_rate": 1.9719876936403524e-05, + "loss": 0.9905, + "step": 2151 + }, + { + "epoch": 0.1, + "grad_norm": 1.1632314250435223, + "learning_rate": 1.9719510688643374e-05, + "loss": 1.1396, + "step": 2152 + }, + { + "epoch": 0.1, + "grad_norm": 1.1810842936387398, + "learning_rate": 1.971914420501928e-05, + "loss": 1.2329, + "step": 2153 + }, + { + "epoch": 0.1, + "grad_norm": 1.2491477077365487, + "learning_rate": 1.9718777485540145e-05, + "loss": 0.9775, + "step": 2154 + }, + { + "epoch": 0.1, + "grad_norm": 1.4565873287129338, + "learning_rate": 1.9718410530214857e-05, + "loss": 1.3506, + "step": 2155 + }, + { + "epoch": 0.1, + "grad_norm": 1.0582235453189632, + "learning_rate": 1.971804333905233e-05, + "loss": 1.1216, + "step": 2156 + }, + { + "epoch": 0.1, + "grad_norm": 0.8193773551486896, + "learning_rate": 1.971767591206147e-05, + "loss": 1.1626, + "step": 2157 + }, + { + "epoch": 0.1, + "grad_norm": 1.2243326529487073, + "learning_rate": 1.971730824925119e-05, + "loss": 1.2705, + "step": 2158 + }, + { + "epoch": 0.1, + "grad_norm": 1.3928725035388059, + "learning_rate": 1.9716940350630424e-05, + "loss": 1.2793, + "step": 2159 + }, + { + "epoch": 0.1, + "grad_norm": 1.2580838735793547, + "learning_rate": 1.9716572216208084e-05, + "loss": 1.2393, + "step": 2160 + }, + { + "epoch": 0.1, + "grad_norm": 1.3763429441908046, + "learning_rate": 1.9716203845993117e-05, + "loss": 1.168, + "step": 2161 + }, + { + "epoch": 0.1, + "grad_norm": 1.3131380484082913, + "learning_rate": 1.9715835239994457e-05, + "loss": 1.2876, + "step": 2162 + }, + { + "epoch": 0.1, + "grad_norm": 1.2889133253102536, + "learning_rate": 1.971546639822105e-05, + "loss": 1.2837, + "step": 2163 + }, + { + "epoch": 0.1, + "grad_norm": 1.322283944970901, + "learning_rate": 1.971509732068184e-05, + "loss": 1.4043, + "step": 2164 + }, + { + "epoch": 0.1, + "grad_norm": 1.0803193461755334, + "learning_rate": 1.9714728007385795e-05, + "loss": 1.2217, + "step": 2165 + }, + { + "epoch": 0.1, + "grad_norm": 1.3071896599789303, + "learning_rate": 1.9714358458341868e-05, + "loss": 1.1089, + "step": 2166 + }, + { + "epoch": 0.1, + "grad_norm": 0.8966287410798779, + "learning_rate": 1.971398867355903e-05, + "loss": 1.1191, + "step": 2167 + }, + { + "epoch": 0.1, + "grad_norm": 1.4911874698166165, + "learning_rate": 1.9713618653046258e-05, + "loss": 1.2568, + "step": 2168 + }, + { + "epoch": 0.1, + "grad_norm": 0.9340372935958634, + "learning_rate": 1.9713248396812524e-05, + "loss": 1.3438, + "step": 2169 + }, + { + "epoch": 0.1, + "grad_norm": 1.1168161803944012, + "learning_rate": 1.9712877904866818e-05, + "loss": 1.3057, + "step": 2170 + }, + { + "epoch": 0.1, + "grad_norm": 1.3448838702239219, + "learning_rate": 1.971250717721813e-05, + "loss": 1.251, + "step": 2171 + }, + { + "epoch": 0.1, + "grad_norm": 1.40966509736875, + "learning_rate": 1.971213621387546e-05, + "loss": 1.3921, + "step": 2172 + }, + { + "epoch": 0.1, + "grad_norm": 1.172476475615654, + "learning_rate": 1.9711765014847804e-05, + "loss": 1.3052, + "step": 2173 + }, + { + "epoch": 0.1, + "grad_norm": 0.9991115048866612, + "learning_rate": 1.9711393580144168e-05, + "loss": 1.2124, + "step": 2174 + }, + { + "epoch": 0.1, + "grad_norm": 1.0677529325172748, + "learning_rate": 1.9711021909773574e-05, + "loss": 1.1143, + "step": 2175 + }, + { + "epoch": 0.1, + "grad_norm": 1.310085742242461, + "learning_rate": 1.971065000374504e-05, + "loss": 1.3862, + "step": 2176 + }, + { + "epoch": 0.1, + "grad_norm": 1.4581366870295722, + "learning_rate": 1.9710277862067583e-05, + "loss": 1.2397, + "step": 2177 + }, + { + "epoch": 0.1, + "grad_norm": 1.0506712722155953, + "learning_rate": 1.9709905484750243e-05, + "loss": 1.2871, + "step": 2178 + }, + { + "epoch": 0.1, + "grad_norm": 1.050543208369896, + "learning_rate": 1.970953287180205e-05, + "loss": 1.0964, + "step": 2179 + }, + { + "epoch": 0.1, + "grad_norm": 1.5287673107439805, + "learning_rate": 1.9709160023232052e-05, + "loss": 1.3682, + "step": 2180 + }, + { + "epoch": 0.1, + "grad_norm": 1.4431057675230472, + "learning_rate": 1.9708786939049292e-05, + "loss": 1.2622, + "step": 2181 + }, + { + "epoch": 0.1, + "grad_norm": 1.1044629381037343, + "learning_rate": 1.9708413619262825e-05, + "loss": 1.2661, + "step": 2182 + }, + { + "epoch": 0.1, + "grad_norm": 1.3391312731493032, + "learning_rate": 1.970804006388171e-05, + "loss": 1.2534, + "step": 2183 + }, + { + "epoch": 0.11, + "grad_norm": 1.2437614938960124, + "learning_rate": 1.9707666272915016e-05, + "loss": 1.0361, + "step": 2184 + }, + { + "epoch": 0.11, + "grad_norm": 1.2731024331142595, + "learning_rate": 1.970729224637181e-05, + "loss": 1.2246, + "step": 2185 + }, + { + "epoch": 0.11, + "grad_norm": 1.1721838975771055, + "learning_rate": 1.9706917984261168e-05, + "loss": 1.2979, + "step": 2186 + }, + { + "epoch": 0.11, + "grad_norm": 1.1374349820330243, + "learning_rate": 1.9706543486592174e-05, + "loss": 1.207, + "step": 2187 + }, + { + "epoch": 0.11, + "grad_norm": 1.258722432133754, + "learning_rate": 1.9706168753373912e-05, + "loss": 1.1509, + "step": 2188 + }, + { + "epoch": 0.11, + "grad_norm": 1.3466829680110652, + "learning_rate": 1.9705793784615487e-05, + "loss": 1.3887, + "step": 2189 + }, + { + "epoch": 0.11, + "grad_norm": 1.1578238661773583, + "learning_rate": 1.9705418580325984e-05, + "loss": 1.2056, + "step": 2190 + }, + { + "epoch": 0.11, + "grad_norm": 1.34854711097351, + "learning_rate": 1.970504314051452e-05, + "loss": 1.3804, + "step": 2191 + }, + { + "epoch": 0.11, + "grad_norm": 1.1965515210477877, + "learning_rate": 1.9704667465190194e-05, + "loss": 1.2246, + "step": 2192 + }, + { + "epoch": 0.11, + "grad_norm": 1.4211826049396064, + "learning_rate": 1.970429155436213e-05, + "loss": 1.2729, + "step": 2193 + }, + { + "epoch": 0.11, + "grad_norm": 1.1721021305457244, + "learning_rate": 1.9703915408039454e-05, + "loss": 1.0693, + "step": 2194 + }, + { + "epoch": 0.11, + "grad_norm": 1.251026974392511, + "learning_rate": 1.9703539026231288e-05, + "loss": 1.1841, + "step": 2195 + }, + { + "epoch": 0.11, + "grad_norm": 0.9687360002657329, + "learning_rate": 1.9703162408946766e-05, + "loss": 1.2485, + "step": 2196 + }, + { + "epoch": 0.11, + "grad_norm": 1.492166201903374, + "learning_rate": 1.9702785556195026e-05, + "loss": 1.2319, + "step": 2197 + }, + { + "epoch": 0.11, + "grad_norm": 1.333167487981378, + "learning_rate": 1.970240846798522e-05, + "loss": 1.3213, + "step": 2198 + }, + { + "epoch": 0.11, + "grad_norm": 1.2279493293742691, + "learning_rate": 1.970203114432649e-05, + "loss": 1.2485, + "step": 2199 + }, + { + "epoch": 0.11, + "grad_norm": 1.3160159940988119, + "learning_rate": 1.9701653585228e-05, + "loss": 1.2524, + "step": 2200 + }, + { + "epoch": 0.11, + "grad_norm": 1.3336733004274464, + "learning_rate": 1.9701275790698906e-05, + "loss": 1.3032, + "step": 2201 + }, + { + "epoch": 0.11, + "grad_norm": 1.1803000509169097, + "learning_rate": 1.9700897760748382e-05, + "loss": 1.208, + "step": 2202 + }, + { + "epoch": 0.11, + "grad_norm": 1.0330360356642547, + "learning_rate": 1.9700519495385597e-05, + "loss": 1.2109, + "step": 2203 + }, + { + "epoch": 0.11, + "grad_norm": 1.1054502766236436, + "learning_rate": 1.970014099461973e-05, + "loss": 1.1831, + "step": 2204 + }, + { + "epoch": 0.11, + "grad_norm": 1.2300499083637746, + "learning_rate": 1.969976225845997e-05, + "loss": 1.2793, + "step": 2205 + }, + { + "epoch": 0.11, + "grad_norm": 1.3317171863311308, + "learning_rate": 1.969938328691551e-05, + "loss": 1.4126, + "step": 2206 + }, + { + "epoch": 0.11, + "grad_norm": 1.192453554819283, + "learning_rate": 1.969900407999554e-05, + "loss": 1.3057, + "step": 2207 + }, + { + "epoch": 0.11, + "grad_norm": 1.3720659545412983, + "learning_rate": 1.9698624637709263e-05, + "loss": 1.2612, + "step": 2208 + }, + { + "epoch": 0.11, + "grad_norm": 1.3099077790402383, + "learning_rate": 1.969824496006589e-05, + "loss": 1.1592, + "step": 2209 + }, + { + "epoch": 0.11, + "grad_norm": 1.0654351700601843, + "learning_rate": 1.9697865047074633e-05, + "loss": 1.2603, + "step": 2210 + }, + { + "epoch": 0.11, + "grad_norm": 1.1273138524777162, + "learning_rate": 1.969748489874471e-05, + "loss": 1.2202, + "step": 2211 + }, + { + "epoch": 0.11, + "grad_norm": 1.459347742876156, + "learning_rate": 1.969710451508535e-05, + "loss": 1.3569, + "step": 2212 + }, + { + "epoch": 0.11, + "grad_norm": 1.2605064193056073, + "learning_rate": 1.9696723896105778e-05, + "loss": 1.2285, + "step": 2213 + }, + { + "epoch": 0.11, + "grad_norm": 1.3877413518052322, + "learning_rate": 1.969634304181524e-05, + "loss": 1.1885, + "step": 2214 + }, + { + "epoch": 0.11, + "grad_norm": 1.236894217568725, + "learning_rate": 1.9695961952222966e-05, + "loss": 1.2544, + "step": 2215 + }, + { + "epoch": 0.11, + "grad_norm": 1.544198763844224, + "learning_rate": 1.9695580627338212e-05, + "loss": 1.3823, + "step": 2216 + }, + { + "epoch": 0.11, + "grad_norm": 1.5777650148308828, + "learning_rate": 1.9695199067170236e-05, + "loss": 1.2754, + "step": 2217 + }, + { + "epoch": 0.11, + "grad_norm": 1.3059067871906982, + "learning_rate": 1.9694817271728284e-05, + "loss": 1.1226, + "step": 2218 + }, + { + "epoch": 0.11, + "grad_norm": 1.3356583121424808, + "learning_rate": 1.969443524102163e-05, + "loss": 1.3682, + "step": 2219 + }, + { + "epoch": 0.11, + "grad_norm": 1.220787371217912, + "learning_rate": 1.9694052975059545e-05, + "loss": 1.2158, + "step": 2220 + }, + { + "epoch": 0.11, + "grad_norm": 1.0028701773898565, + "learning_rate": 1.96936704738513e-05, + "loss": 1.2271, + "step": 2221 + }, + { + "epoch": 0.11, + "grad_norm": 1.7863584418465375, + "learning_rate": 1.9693287737406183e-05, + "loss": 1.4067, + "step": 2222 + }, + { + "epoch": 0.11, + "grad_norm": 1.396771746324201, + "learning_rate": 1.969290476573348e-05, + "loss": 1.2817, + "step": 2223 + }, + { + "epoch": 0.11, + "grad_norm": 1.3989268673438904, + "learning_rate": 1.9692521558842485e-05, + "loss": 1.2993, + "step": 2224 + }, + { + "epoch": 0.11, + "grad_norm": 1.3431949081427952, + "learning_rate": 1.9692138116742494e-05, + "loss": 1.2207, + "step": 2225 + }, + { + "epoch": 0.11, + "grad_norm": 1.282804624528867, + "learning_rate": 1.9691754439442815e-05, + "loss": 1.3584, + "step": 2226 + }, + { + "epoch": 0.11, + "grad_norm": 1.355513042202185, + "learning_rate": 1.9691370526952756e-05, + "loss": 1.2163, + "step": 2227 + }, + { + "epoch": 0.11, + "grad_norm": 1.5888231601312783, + "learning_rate": 1.969098637928164e-05, + "loss": 1.188, + "step": 2228 + }, + { + "epoch": 0.11, + "grad_norm": 1.0706681794545767, + "learning_rate": 1.9690601996438782e-05, + "loss": 1.2349, + "step": 2229 + }, + { + "epoch": 0.11, + "grad_norm": 1.4565796088073892, + "learning_rate": 1.9690217378433514e-05, + "loss": 1.3081, + "step": 2230 + }, + { + "epoch": 0.11, + "grad_norm": 1.9345458699031222, + "learning_rate": 1.9689832525275166e-05, + "loss": 1.2964, + "step": 2231 + }, + { + "epoch": 0.11, + "grad_norm": 1.4622029446393785, + "learning_rate": 1.9689447436973083e-05, + "loss": 1.311, + "step": 2232 + }, + { + "epoch": 0.11, + "grad_norm": 1.300267452040358, + "learning_rate": 1.9689062113536605e-05, + "loss": 1.27, + "step": 2233 + }, + { + "epoch": 0.11, + "grad_norm": 1.4333126415570174, + "learning_rate": 1.968867655497508e-05, + "loss": 1.1792, + "step": 2234 + }, + { + "epoch": 0.11, + "grad_norm": 1.2399378047399128, + "learning_rate": 1.9688290761297873e-05, + "loss": 1.0454, + "step": 2235 + }, + { + "epoch": 0.11, + "grad_norm": 1.274317290210688, + "learning_rate": 1.968790473251434e-05, + "loss": 1.1343, + "step": 2236 + }, + { + "epoch": 0.11, + "grad_norm": 1.224776227250419, + "learning_rate": 1.968751846863385e-05, + "loss": 1.2456, + "step": 2237 + }, + { + "epoch": 0.11, + "grad_norm": 1.111936081119711, + "learning_rate": 1.968713196966578e-05, + "loss": 1.2217, + "step": 2238 + }, + { + "epoch": 0.11, + "grad_norm": 1.5881267740295038, + "learning_rate": 1.96867452356195e-05, + "loss": 1.1489, + "step": 2239 + }, + { + "epoch": 0.11, + "grad_norm": 1.1864456119450237, + "learning_rate": 1.9686358266504406e-05, + "loss": 1.1997, + "step": 2240 + }, + { + "epoch": 0.11, + "grad_norm": 1.1976903061205968, + "learning_rate": 1.968597106232988e-05, + "loss": 1.1089, + "step": 2241 + }, + { + "epoch": 0.11, + "grad_norm": 1.0909978013744854, + "learning_rate": 1.968558362310532e-05, + "loss": 1.0474, + "step": 2242 + }, + { + "epoch": 0.11, + "grad_norm": 1.3430512688099636, + "learning_rate": 1.9685195948840137e-05, + "loss": 1.248, + "step": 2243 + }, + { + "epoch": 0.11, + "grad_norm": 1.4201138009128356, + "learning_rate": 1.9684808039543727e-05, + "loss": 1.2437, + "step": 2244 + }, + { + "epoch": 0.11, + "grad_norm": 1.048907793099642, + "learning_rate": 1.9684419895225506e-05, + "loss": 1.2402, + "step": 2245 + }, + { + "epoch": 0.11, + "grad_norm": 0.888413731853847, + "learning_rate": 1.9684031515894898e-05, + "loss": 1.1821, + "step": 2246 + }, + { + "epoch": 0.11, + "grad_norm": 1.0960299248353569, + "learning_rate": 1.9683642901561324e-05, + "loss": 1.1504, + "step": 2247 + }, + { + "epoch": 0.11, + "grad_norm": 1.1551932889225376, + "learning_rate": 1.9683254052234217e-05, + "loss": 1.1367, + "step": 2248 + }, + { + "epoch": 0.11, + "grad_norm": 1.5400012488065769, + "learning_rate": 1.9682864967923006e-05, + "loss": 1.4224, + "step": 2249 + }, + { + "epoch": 0.11, + "grad_norm": 1.3900434985216465, + "learning_rate": 1.9682475648637145e-05, + "loss": 1.2051, + "step": 2250 + }, + { + "epoch": 0.11, + "grad_norm": 1.3674639054540176, + "learning_rate": 1.968208609438607e-05, + "loss": 1.1685, + "step": 2251 + }, + { + "epoch": 0.11, + "grad_norm": 0.9724746960949947, + "learning_rate": 1.9681696305179243e-05, + "loss": 1.0908, + "step": 2252 + }, + { + "epoch": 0.11, + "grad_norm": 1.361262627046411, + "learning_rate": 1.9681306281026122e-05, + "loss": 1.353, + "step": 2253 + }, + { + "epoch": 0.11, + "grad_norm": 1.2299752503888868, + "learning_rate": 1.968091602193616e-05, + "loss": 1.1709, + "step": 2254 + }, + { + "epoch": 0.11, + "grad_norm": 1.5067574226829261, + "learning_rate": 1.9680525527918845e-05, + "loss": 1.3696, + "step": 2255 + }, + { + "epoch": 0.11, + "grad_norm": 1.3628436134244943, + "learning_rate": 1.9680134798983642e-05, + "loss": 1.3276, + "step": 2256 + }, + { + "epoch": 0.11, + "grad_norm": 1.2824537324973393, + "learning_rate": 1.9679743835140037e-05, + "loss": 1.2139, + "step": 2257 + }, + { + "epoch": 0.11, + "grad_norm": 0.9763746179502948, + "learning_rate": 1.967935263639752e-05, + "loss": 1.146, + "step": 2258 + }, + { + "epoch": 0.11, + "grad_norm": 1.559862922004395, + "learning_rate": 1.9678961202765572e-05, + "loss": 1.4263, + "step": 2259 + }, + { + "epoch": 0.11, + "grad_norm": 1.28215842267662, + "learning_rate": 1.9678569534253706e-05, + "loss": 1.2979, + "step": 2260 + }, + { + "epoch": 0.11, + "grad_norm": 1.2817655063508682, + "learning_rate": 1.9678177630871418e-05, + "loss": 1.0942, + "step": 2261 + }, + { + "epoch": 0.11, + "grad_norm": 1.3553024078036955, + "learning_rate": 1.967778549262822e-05, + "loss": 1.2251, + "step": 2262 + }, + { + "epoch": 0.11, + "grad_norm": 1.2614573093453105, + "learning_rate": 1.967739311953363e-05, + "loss": 1.2983, + "step": 2263 + }, + { + "epoch": 0.11, + "grad_norm": 1.4965479599135652, + "learning_rate": 1.9677000511597175e-05, + "loss": 1.25, + "step": 2264 + }, + { + "epoch": 0.11, + "grad_norm": 1.4580527290430283, + "learning_rate": 1.9676607668828372e-05, + "loss": 1.2148, + "step": 2265 + }, + { + "epoch": 0.11, + "grad_norm": 0.8617660615333492, + "learning_rate": 1.967621459123676e-05, + "loss": 1.3057, + "step": 2266 + }, + { + "epoch": 0.11, + "grad_norm": 1.2357334569397314, + "learning_rate": 1.9675821278831875e-05, + "loss": 1.0193, + "step": 2267 + }, + { + "epoch": 0.11, + "grad_norm": 1.537265712736465, + "learning_rate": 1.9675427731623267e-05, + "loss": 1.2319, + "step": 2268 + }, + { + "epoch": 0.11, + "grad_norm": 1.189566844670059, + "learning_rate": 1.967503394962048e-05, + "loss": 1.0479, + "step": 2269 + }, + { + "epoch": 0.11, + "grad_norm": 1.2242446437831958, + "learning_rate": 1.9674639932833068e-05, + "loss": 1.3169, + "step": 2270 + }, + { + "epoch": 0.11, + "grad_norm": 1.518469922649539, + "learning_rate": 1.9674245681270604e-05, + "loss": 1.2017, + "step": 2271 + }, + { + "epoch": 0.11, + "grad_norm": 1.2319310365549312, + "learning_rate": 1.9673851194942645e-05, + "loss": 1.3271, + "step": 2272 + }, + { + "epoch": 0.11, + "grad_norm": 1.3541197604875195, + "learning_rate": 1.9673456473858766e-05, + "loss": 1.3716, + "step": 2273 + }, + { + "epoch": 0.11, + "grad_norm": 1.4181693084536233, + "learning_rate": 1.967306151802855e-05, + "loss": 1.252, + "step": 2274 + }, + { + "epoch": 0.11, + "grad_norm": 1.2591792338988002, + "learning_rate": 1.9672666327461575e-05, + "loss": 1.3774, + "step": 2275 + }, + { + "epoch": 0.11, + "grad_norm": 1.5235686649094224, + "learning_rate": 1.967227090216744e-05, + "loss": 1.188, + "step": 2276 + }, + { + "epoch": 0.11, + "grad_norm": 1.2435096529440157, + "learning_rate": 1.967187524215573e-05, + "loss": 1.0249, + "step": 2277 + }, + { + "epoch": 0.11, + "grad_norm": 1.182257214397943, + "learning_rate": 1.967147934743605e-05, + "loss": 1.2178, + "step": 2278 + }, + { + "epoch": 0.11, + "grad_norm": 1.61175548863758, + "learning_rate": 1.9671083218018016e-05, + "loss": 1.4038, + "step": 2279 + }, + { + "epoch": 0.11, + "grad_norm": 1.1246976081152869, + "learning_rate": 1.967068685391123e-05, + "loss": 1.4409, + "step": 2280 + }, + { + "epoch": 0.11, + "grad_norm": 1.3380067883932696, + "learning_rate": 1.9670290255125314e-05, + "loss": 1.2622, + "step": 2281 + }, + { + "epoch": 0.11, + "grad_norm": 1.0116006149703554, + "learning_rate": 1.9669893421669895e-05, + "loss": 1.2983, + "step": 2282 + }, + { + "epoch": 0.11, + "grad_norm": 1.42973602616009, + "learning_rate": 1.96694963535546e-05, + "loss": 1.3164, + "step": 2283 + }, + { + "epoch": 0.11, + "grad_norm": 1.3509573256508152, + "learning_rate": 1.9669099050789063e-05, + "loss": 1.4326, + "step": 2284 + }, + { + "epoch": 0.11, + "grad_norm": 1.2826141493545928, + "learning_rate": 1.966870151338293e-05, + "loss": 1.1489, + "step": 2285 + }, + { + "epoch": 0.11, + "grad_norm": 1.6025426715316093, + "learning_rate": 1.9668303741345845e-05, + "loss": 1.3042, + "step": 2286 + }, + { + "epoch": 0.11, + "grad_norm": 1.4540991916132446, + "learning_rate": 1.9667905734687463e-05, + "loss": 1.3066, + "step": 2287 + }, + { + "epoch": 0.11, + "grad_norm": 1.206949412339831, + "learning_rate": 1.966750749341744e-05, + "loss": 1.2207, + "step": 2288 + }, + { + "epoch": 0.11, + "grad_norm": 1.1944772778344892, + "learning_rate": 1.9667109017545442e-05, + "loss": 1.0554, + "step": 2289 + }, + { + "epoch": 0.11, + "grad_norm": 1.2391235023992146, + "learning_rate": 1.966671030708114e-05, + "loss": 1.23, + "step": 2290 + }, + { + "epoch": 0.11, + "grad_norm": 1.215193892088057, + "learning_rate": 1.9666311362034203e-05, + "loss": 0.9995, + "step": 2291 + }, + { + "epoch": 0.11, + "grad_norm": 1.0536926179477066, + "learning_rate": 1.966591218241432e-05, + "loss": 1.1626, + "step": 2292 + }, + { + "epoch": 0.11, + "grad_norm": 1.3804302271479423, + "learning_rate": 1.9665512768231175e-05, + "loss": 1.25, + "step": 2293 + }, + { + "epoch": 0.11, + "grad_norm": 1.6398853729958296, + "learning_rate": 1.966511311949446e-05, + "loss": 1.2319, + "step": 2294 + }, + { + "epoch": 0.11, + "grad_norm": 1.2827130434175404, + "learning_rate": 1.966471323621387e-05, + "loss": 1.2666, + "step": 2295 + }, + { + "epoch": 0.11, + "grad_norm": 1.5117877906942554, + "learning_rate": 1.966431311839912e-05, + "loss": 1.2822, + "step": 2296 + }, + { + "epoch": 0.11, + "grad_norm": 1.317221441650805, + "learning_rate": 1.9663912766059904e-05, + "loss": 1.2891, + "step": 2297 + }, + { + "epoch": 0.11, + "grad_norm": 0.9899135406090642, + "learning_rate": 1.966351217920595e-05, + "loss": 1.3975, + "step": 2298 + }, + { + "epoch": 0.11, + "grad_norm": 1.3283649830814415, + "learning_rate": 1.9663111357846976e-05, + "loss": 1.2109, + "step": 2299 + }, + { + "epoch": 0.11, + "grad_norm": 1.09722163379172, + "learning_rate": 1.9662710301992705e-05, + "loss": 1.2212, + "step": 2300 + }, + { + "epoch": 0.11, + "grad_norm": 1.4682823059941397, + "learning_rate": 1.9662309011652874e-05, + "loss": 1.373, + "step": 2301 + }, + { + "epoch": 0.11, + "grad_norm": 1.2659435363782705, + "learning_rate": 1.966190748683722e-05, + "loss": 1.0723, + "step": 2302 + }, + { + "epoch": 0.11, + "grad_norm": 1.098558617845344, + "learning_rate": 1.9661505727555482e-05, + "loss": 1.354, + "step": 2303 + }, + { + "epoch": 0.11, + "grad_norm": 1.4785383862881551, + "learning_rate": 1.9661103733817418e-05, + "loss": 1.4214, + "step": 2304 + }, + { + "epoch": 0.11, + "grad_norm": 1.5636660680093912, + "learning_rate": 1.9660701505632773e-05, + "loss": 1.3472, + "step": 2305 + }, + { + "epoch": 0.11, + "grad_norm": 0.8536453487072471, + "learning_rate": 1.9660299043011316e-05, + "loss": 1.3354, + "step": 2306 + }, + { + "epoch": 0.11, + "grad_norm": 1.2637424095773873, + "learning_rate": 1.9659896345962815e-05, + "loss": 1.2446, + "step": 2307 + }, + { + "epoch": 0.11, + "grad_norm": 1.2131535028861988, + "learning_rate": 1.9659493414497034e-05, + "loss": 1.2046, + "step": 2308 + }, + { + "epoch": 0.11, + "grad_norm": 1.4206976425747657, + "learning_rate": 1.965909024862376e-05, + "loss": 1.4697, + "step": 2309 + }, + { + "epoch": 0.11, + "grad_norm": 1.056190471735001, + "learning_rate": 1.9658686848352767e-05, + "loss": 1.2178, + "step": 2310 + }, + { + "epoch": 0.11, + "grad_norm": 1.4553576270112303, + "learning_rate": 1.965828321369385e-05, + "loss": 1.1528, + "step": 2311 + }, + { + "epoch": 0.11, + "grad_norm": 1.3498112138390386, + "learning_rate": 1.9657879344656804e-05, + "loss": 1.3857, + "step": 2312 + }, + { + "epoch": 0.11, + "grad_norm": 1.1622261227579345, + "learning_rate": 1.965747524125143e-05, + "loss": 1.1802, + "step": 2313 + }, + { + "epoch": 0.11, + "grad_norm": 1.2096272035824487, + "learning_rate": 1.9657070903487534e-05, + "loss": 1.1445, + "step": 2314 + }, + { + "epoch": 0.11, + "grad_norm": 1.0043944203194202, + "learning_rate": 1.9656666331374927e-05, + "loss": 1.2119, + "step": 2315 + }, + { + "epoch": 0.11, + "grad_norm": 1.4284110934721865, + "learning_rate": 1.9656261524923428e-05, + "loss": 1.2739, + "step": 2316 + }, + { + "epoch": 0.11, + "grad_norm": 1.3808852245701946, + "learning_rate": 1.965585648414286e-05, + "loss": 1.3408, + "step": 2317 + }, + { + "epoch": 0.11, + "grad_norm": 1.3448842824304936, + "learning_rate": 1.965545120904305e-05, + "loss": 1.1221, + "step": 2318 + }, + { + "epoch": 0.11, + "grad_norm": 1.7761480865008914, + "learning_rate": 1.9655045699633836e-05, + "loss": 1.2227, + "step": 2319 + }, + { + "epoch": 0.11, + "grad_norm": 1.2377917117865687, + "learning_rate": 1.9654639955925057e-05, + "loss": 1.1733, + "step": 2320 + }, + { + "epoch": 0.11, + "grad_norm": 1.361409411469485, + "learning_rate": 1.9654233977926557e-05, + "loss": 1.3677, + "step": 2321 + }, + { + "epoch": 0.11, + "grad_norm": 1.4360314926580808, + "learning_rate": 1.9653827765648194e-05, + "loss": 1.1372, + "step": 2322 + }, + { + "epoch": 0.11, + "grad_norm": 1.4070003835547202, + "learning_rate": 1.965342131909982e-05, + "loss": 1.3633, + "step": 2323 + }, + { + "epoch": 0.11, + "grad_norm": 1.2537364996215397, + "learning_rate": 1.9653014638291304e-05, + "loss": 1.104, + "step": 2324 + }, + { + "epoch": 0.11, + "grad_norm": 1.360258461326476, + "learning_rate": 1.9652607723232507e-05, + "loss": 1.3418, + "step": 2325 + }, + { + "epoch": 0.11, + "grad_norm": 1.2048037774424762, + "learning_rate": 1.965220057393331e-05, + "loss": 1.1228, + "step": 2326 + }, + { + "epoch": 0.11, + "grad_norm": 1.1449742820233493, + "learning_rate": 1.9651793190403592e-05, + "loss": 1.1362, + "step": 2327 + }, + { + "epoch": 0.11, + "grad_norm": 1.3130055875721254, + "learning_rate": 1.9651385572653235e-05, + "loss": 1.3042, + "step": 2328 + }, + { + "epoch": 0.11, + "grad_norm": 1.386455270687136, + "learning_rate": 1.9650977720692138e-05, + "loss": 1.3623, + "step": 2329 + }, + { + "epoch": 0.11, + "grad_norm": 1.0417586423144762, + "learning_rate": 1.965056963453019e-05, + "loss": 1.0337, + "step": 2330 + }, + { + "epoch": 0.11, + "grad_norm": 1.3789512530929682, + "learning_rate": 1.96501613141773e-05, + "loss": 1.3721, + "step": 2331 + }, + { + "epoch": 0.11, + "grad_norm": 1.1465251128598535, + "learning_rate": 1.9649752759643377e-05, + "loss": 1.2163, + "step": 2332 + }, + { + "epoch": 0.11, + "grad_norm": 1.2839212495289904, + "learning_rate": 1.964934397093833e-05, + "loss": 1.2729, + "step": 2333 + }, + { + "epoch": 0.11, + "grad_norm": 1.6409262437267662, + "learning_rate": 1.9648934948072086e-05, + "loss": 1.2227, + "step": 2334 + }, + { + "epoch": 0.11, + "grad_norm": 1.2363990136998246, + "learning_rate": 1.9648525691054563e-05, + "loss": 1.1709, + "step": 2335 + }, + { + "epoch": 0.11, + "grad_norm": 1.2248303580444435, + "learning_rate": 1.9648116199895703e-05, + "loss": 1.2417, + "step": 2336 + }, + { + "epoch": 0.11, + "grad_norm": 0.9194565592758108, + "learning_rate": 1.964770647460543e-05, + "loss": 1.2144, + "step": 2337 + }, + { + "epoch": 0.11, + "grad_norm": 1.2503533949272256, + "learning_rate": 1.96472965151937e-05, + "loss": 1.189, + "step": 2338 + }, + { + "epoch": 0.11, + "grad_norm": 1.2055128099450905, + "learning_rate": 1.964688632167045e-05, + "loss": 1.0854, + "step": 2339 + }, + { + "epoch": 0.11, + "grad_norm": 1.2572758951314735, + "learning_rate": 1.964647589404564e-05, + "loss": 1.2446, + "step": 2340 + }, + { + "epoch": 0.11, + "grad_norm": 1.105488729612356, + "learning_rate": 1.964606523232923e-05, + "loss": 1.1836, + "step": 2341 + }, + { + "epoch": 0.11, + "grad_norm": 1.1067278312450484, + "learning_rate": 1.964565433653119e-05, + "loss": 1.2656, + "step": 2342 + }, + { + "epoch": 0.11, + "grad_norm": 1.0888629241419383, + "learning_rate": 1.964524320666148e-05, + "loss": 1.1196, + "step": 2343 + }, + { + "epoch": 0.11, + "grad_norm": 1.2574006426184852, + "learning_rate": 1.9644831842730084e-05, + "loss": 1.2524, + "step": 2344 + }, + { + "epoch": 0.11, + "grad_norm": 1.2281574315340626, + "learning_rate": 1.9644420244746984e-05, + "loss": 1.3159, + "step": 2345 + }, + { + "epoch": 0.11, + "grad_norm": 1.1589071922520195, + "learning_rate": 1.964400841272217e-05, + "loss": 1.0168, + "step": 2346 + }, + { + "epoch": 0.11, + "grad_norm": 1.872261357443138, + "learning_rate": 1.9643596346665634e-05, + "loss": 1.2373, + "step": 2347 + }, + { + "epoch": 0.11, + "grad_norm": 1.273959957222254, + "learning_rate": 1.9643184046587373e-05, + "loss": 1.0742, + "step": 2348 + }, + { + "epoch": 0.11, + "grad_norm": 1.5004161463739305, + "learning_rate": 1.9642771512497395e-05, + "loss": 1.3921, + "step": 2349 + }, + { + "epoch": 0.11, + "grad_norm": 1.4541454262176057, + "learning_rate": 1.964235874440571e-05, + "loss": 1.1753, + "step": 2350 + }, + { + "epoch": 0.11, + "grad_norm": 1.09418371216279, + "learning_rate": 1.9641945742322337e-05, + "loss": 1.1978, + "step": 2351 + }, + { + "epoch": 0.11, + "grad_norm": 1.5768573921220472, + "learning_rate": 1.9641532506257297e-05, + "loss": 1.2959, + "step": 2352 + }, + { + "epoch": 0.11, + "grad_norm": 1.4616516763714056, + "learning_rate": 1.9641119036220616e-05, + "loss": 1.1401, + "step": 2353 + }, + { + "epoch": 0.11, + "grad_norm": 1.2262769522045187, + "learning_rate": 1.964070533222233e-05, + "loss": 1.0522, + "step": 2354 + }, + { + "epoch": 0.11, + "grad_norm": 1.0980319908008442, + "learning_rate": 1.964029139427248e-05, + "loss": 1.2446, + "step": 2355 + }, + { + "epoch": 0.11, + "grad_norm": 0.9159828150683357, + "learning_rate": 1.9639877222381106e-05, + "loss": 1.1328, + "step": 2356 + }, + { + "epoch": 0.11, + "grad_norm": 1.2356240241255108, + "learning_rate": 1.9639462816558264e-05, + "loss": 1.2852, + "step": 2357 + }, + { + "epoch": 0.11, + "grad_norm": 1.3479498604917606, + "learning_rate": 1.963904817681401e-05, + "loss": 1.2354, + "step": 2358 + }, + { + "epoch": 0.11, + "grad_norm": 1.295466568975589, + "learning_rate": 1.96386333031584e-05, + "loss": 1.3779, + "step": 2359 + }, + { + "epoch": 0.11, + "grad_norm": 1.1534702425930583, + "learning_rate": 1.9638218195601507e-05, + "loss": 1.2104, + "step": 2360 + }, + { + "epoch": 0.11, + "grad_norm": 1.171145138824874, + "learning_rate": 1.9637802854153403e-05, + "loss": 1.2295, + "step": 2361 + }, + { + "epoch": 0.11, + "grad_norm": 1.2395792275796897, + "learning_rate": 1.9637387278824168e-05, + "loss": 1.1606, + "step": 2362 + }, + { + "epoch": 0.11, + "grad_norm": 1.5297127580546614, + "learning_rate": 1.9636971469623888e-05, + "loss": 1.2178, + "step": 2363 + }, + { + "epoch": 0.11, + "grad_norm": 1.4001134710145433, + "learning_rate": 1.9636555426562653e-05, + "loss": 1.3862, + "step": 2364 + }, + { + "epoch": 0.11, + "grad_norm": 1.2612628193150617, + "learning_rate": 1.963613914965055e-05, + "loss": 1.3511, + "step": 2365 + }, + { + "epoch": 0.11, + "grad_norm": 1.278201316498869, + "learning_rate": 1.9635722638897697e-05, + "loss": 1.1133, + "step": 2366 + }, + { + "epoch": 0.11, + "grad_norm": 1.5234213502944443, + "learning_rate": 1.963530589431419e-05, + "loss": 1.0898, + "step": 2367 + }, + { + "epoch": 0.11, + "grad_norm": 1.6250984455337114, + "learning_rate": 1.9634888915910144e-05, + "loss": 1.3267, + "step": 2368 + }, + { + "epoch": 0.11, + "grad_norm": 1.287766571031839, + "learning_rate": 1.963447170369568e-05, + "loss": 1.3276, + "step": 2369 + }, + { + "epoch": 0.11, + "grad_norm": 1.1912450340310021, + "learning_rate": 1.9634054257680924e-05, + "loss": 1.2661, + "step": 2370 + }, + { + "epoch": 0.11, + "grad_norm": 1.5596823857482258, + "learning_rate": 1.9633636577876004e-05, + "loss": 1.2217, + "step": 2371 + }, + { + "epoch": 0.11, + "grad_norm": 1.5257067616099147, + "learning_rate": 1.963321866429105e-05, + "loss": 1.1396, + "step": 2372 + }, + { + "epoch": 0.11, + "grad_norm": 1.1229686921837245, + "learning_rate": 1.9632800516936215e-05, + "loss": 1.1982, + "step": 2373 + }, + { + "epoch": 0.11, + "grad_norm": 1.3691876069949276, + "learning_rate": 1.9632382135821638e-05, + "loss": 1.1589, + "step": 2374 + }, + { + "epoch": 0.11, + "grad_norm": 1.1890184812325273, + "learning_rate": 1.9631963520957477e-05, + "loss": 1.1812, + "step": 2375 + }, + { + "epoch": 0.11, + "grad_norm": 1.1001247370953569, + "learning_rate": 1.9631544672353886e-05, + "loss": 1.2661, + "step": 2376 + }, + { + "epoch": 0.11, + "grad_norm": 1.1264158280457661, + "learning_rate": 1.963112559002103e-05, + "loss": 1.2808, + "step": 2377 + }, + { + "epoch": 0.11, + "grad_norm": 1.1766519217538112, + "learning_rate": 1.963070627396908e-05, + "loss": 1.2109, + "step": 2378 + }, + { + "epoch": 0.11, + "grad_norm": 1.2647975176197777, + "learning_rate": 1.9630286724208216e-05, + "loss": 1.3291, + "step": 2379 + }, + { + "epoch": 0.11, + "grad_norm": 1.3255061986928103, + "learning_rate": 1.9629866940748612e-05, + "loss": 1.3022, + "step": 2380 + }, + { + "epoch": 0.11, + "grad_norm": 1.1895019992513642, + "learning_rate": 1.9629446923600458e-05, + "loss": 1.1436, + "step": 2381 + }, + { + "epoch": 0.11, + "grad_norm": 1.144680760225059, + "learning_rate": 1.9629026672773946e-05, + "loss": 1.2271, + "step": 2382 + }, + { + "epoch": 0.11, + "grad_norm": 1.312535077918102, + "learning_rate": 1.9628606188279273e-05, + "loss": 1.2485, + "step": 2383 + }, + { + "epoch": 0.11, + "grad_norm": 1.0505008321765719, + "learning_rate": 1.9628185470126645e-05, + "loss": 1.3066, + "step": 2384 + }, + { + "epoch": 0.11, + "grad_norm": 1.0705432413280491, + "learning_rate": 1.9627764518326274e-05, + "loss": 1.0874, + "step": 2385 + }, + { + "epoch": 0.11, + "grad_norm": 1.24543877227386, + "learning_rate": 1.962734333288837e-05, + "loss": 1.1768, + "step": 2386 + }, + { + "epoch": 0.11, + "grad_norm": 1.3343826179409273, + "learning_rate": 1.9626921913823156e-05, + "loss": 1.4805, + "step": 2387 + }, + { + "epoch": 0.11, + "grad_norm": 1.155866728477415, + "learning_rate": 1.962650026114086e-05, + "loss": 1.2197, + "step": 2388 + }, + { + "epoch": 0.11, + "grad_norm": 1.2072519443484715, + "learning_rate": 1.9626078374851715e-05, + "loss": 1.2627, + "step": 2389 + }, + { + "epoch": 0.11, + "grad_norm": 1.313082703834362, + "learning_rate": 1.9625656254965954e-05, + "loss": 1.2471, + "step": 2390 + }, + { + "epoch": 0.12, + "grad_norm": 1.2807905617250746, + "learning_rate": 1.962523390149382e-05, + "loss": 1.2446, + "step": 2391 + }, + { + "epoch": 0.12, + "grad_norm": 1.004081603932383, + "learning_rate": 1.9624811314445575e-05, + "loss": 1.4062, + "step": 2392 + }, + { + "epoch": 0.12, + "grad_norm": 0.9226988281900343, + "learning_rate": 1.9624388493831462e-05, + "loss": 1.2373, + "step": 2393 + }, + { + "epoch": 0.12, + "grad_norm": 1.5818734194165454, + "learning_rate": 1.962396543966174e-05, + "loss": 1.3633, + "step": 2394 + }, + { + "epoch": 0.12, + "grad_norm": 1.5113380432549346, + "learning_rate": 1.9623542151946683e-05, + "loss": 1.313, + "step": 2395 + }, + { + "epoch": 0.12, + "grad_norm": 1.421060205152798, + "learning_rate": 1.962311863069656e-05, + "loss": 1.3955, + "step": 2396 + }, + { + "epoch": 0.12, + "grad_norm": 1.096427553183146, + "learning_rate": 1.962269487592165e-05, + "loss": 1.2739, + "step": 2397 + }, + { + "epoch": 0.12, + "grad_norm": 1.0203307315592112, + "learning_rate": 1.9622270887632234e-05, + "loss": 1.0977, + "step": 2398 + }, + { + "epoch": 0.12, + "grad_norm": 1.2426730520849518, + "learning_rate": 1.9621846665838598e-05, + "loss": 1.2427, + "step": 2399 + }, + { + "epoch": 0.12, + "grad_norm": 1.2766575791597585, + "learning_rate": 1.9621422210551047e-05, + "loss": 1.2705, + "step": 2400 + }, + { + "epoch": 0.12, + "grad_norm": 1.1286198693191283, + "learning_rate": 1.962099752177987e-05, + "loss": 1.1636, + "step": 2401 + }, + { + "epoch": 0.12, + "grad_norm": 1.4023808461006182, + "learning_rate": 1.9620572599535378e-05, + "loss": 1.2778, + "step": 2402 + }, + { + "epoch": 0.12, + "grad_norm": 1.1617512577802296, + "learning_rate": 1.962014744382788e-05, + "loss": 1.2812, + "step": 2403 + }, + { + "epoch": 0.12, + "grad_norm": 1.3379242629301378, + "learning_rate": 1.9619722054667698e-05, + "loss": 1.2178, + "step": 2404 + }, + { + "epoch": 0.12, + "grad_norm": 1.2769647137193416, + "learning_rate": 1.961929643206515e-05, + "loss": 1.2612, + "step": 2405 + }, + { + "epoch": 0.12, + "grad_norm": 1.19527223474733, + "learning_rate": 1.961887057603057e-05, + "loss": 1.3301, + "step": 2406 + }, + { + "epoch": 0.12, + "grad_norm": 1.3087808311152336, + "learning_rate": 1.9618444486574287e-05, + "loss": 1.1763, + "step": 2407 + }, + { + "epoch": 0.12, + "grad_norm": 1.1256135597904087, + "learning_rate": 1.9618018163706644e-05, + "loss": 1.0889, + "step": 2408 + }, + { + "epoch": 0.12, + "grad_norm": 1.195483977549286, + "learning_rate": 1.9617591607437988e-05, + "loss": 1.2183, + "step": 2409 + }, + { + "epoch": 0.12, + "grad_norm": 1.1485427155394363, + "learning_rate": 1.9617164817778666e-05, + "loss": 1.2388, + "step": 2410 + }, + { + "epoch": 0.12, + "grad_norm": 1.30883342753104, + "learning_rate": 1.9616737794739036e-05, + "loss": 1.2437, + "step": 2411 + }, + { + "epoch": 0.12, + "grad_norm": 1.2371062904454795, + "learning_rate": 1.9616310538329463e-05, + "loss": 1.3789, + "step": 2412 + }, + { + "epoch": 0.12, + "grad_norm": 1.4021656225106407, + "learning_rate": 1.961588304856031e-05, + "loss": 1.2285, + "step": 2413 + }, + { + "epoch": 0.12, + "grad_norm": 1.1731240121593056, + "learning_rate": 1.961545532544196e-05, + "loss": 1.2832, + "step": 2414 + }, + { + "epoch": 0.12, + "grad_norm": 1.278818047464594, + "learning_rate": 1.9615027368984785e-05, + "loss": 1.291, + "step": 2415 + }, + { + "epoch": 0.12, + "grad_norm": 1.1518043986831197, + "learning_rate": 1.9614599179199172e-05, + "loss": 0.9465, + "step": 2416 + }, + { + "epoch": 0.12, + "grad_norm": 1.4806238248093422, + "learning_rate": 1.961417075609551e-05, + "loss": 1.4141, + "step": 2417 + }, + { + "epoch": 0.12, + "grad_norm": 1.1285556910911094, + "learning_rate": 1.9613742099684204e-05, + "loss": 1.3057, + "step": 2418 + }, + { + "epoch": 0.12, + "grad_norm": 1.3499216917200085, + "learning_rate": 1.9613313209975645e-05, + "loss": 1.2407, + "step": 2419 + }, + { + "epoch": 0.12, + "grad_norm": 1.2679333929135839, + "learning_rate": 1.961288408698025e-05, + "loss": 1.1958, + "step": 2420 + }, + { + "epoch": 0.12, + "grad_norm": 1.1483162991349474, + "learning_rate": 1.961245473070843e-05, + "loss": 1.2148, + "step": 2421 + }, + { + "epoch": 0.12, + "grad_norm": 0.9214240803198256, + "learning_rate": 1.96120251411706e-05, + "loss": 1.2314, + "step": 2422 + }, + { + "epoch": 0.12, + "grad_norm": 1.2529885235202343, + "learning_rate": 1.9611595318377184e-05, + "loss": 1.3672, + "step": 2423 + }, + { + "epoch": 0.12, + "grad_norm": 1.4033944317545468, + "learning_rate": 1.961116526233862e-05, + "loss": 1.2632, + "step": 2424 + }, + { + "epoch": 0.12, + "grad_norm": 1.1269224969593372, + "learning_rate": 1.9610734973065342e-05, + "loss": 1.3423, + "step": 2425 + }, + { + "epoch": 0.12, + "grad_norm": 1.357224105931364, + "learning_rate": 1.961030445056779e-05, + "loss": 1.4346, + "step": 2426 + }, + { + "epoch": 0.12, + "grad_norm": 1.3469847944195492, + "learning_rate": 1.960987369485641e-05, + "loss": 1.3721, + "step": 2427 + }, + { + "epoch": 0.12, + "grad_norm": 1.30985738464524, + "learning_rate": 1.960944270594166e-05, + "loss": 1.2319, + "step": 2428 + }, + { + "epoch": 0.12, + "grad_norm": 1.2812752012047819, + "learning_rate": 1.9609011483833993e-05, + "loss": 1.3027, + "step": 2429 + }, + { + "epoch": 0.12, + "grad_norm": 1.560387929004423, + "learning_rate": 1.9608580028543875e-05, + "loss": 1.3882, + "step": 2430 + }, + { + "epoch": 0.12, + "grad_norm": 1.0744188151971599, + "learning_rate": 1.960814834008178e-05, + "loss": 1.2866, + "step": 2431 + }, + { + "epoch": 0.12, + "grad_norm": 1.0274341318740199, + "learning_rate": 1.960771641845818e-05, + "loss": 1.157, + "step": 2432 + }, + { + "epoch": 0.12, + "grad_norm": 1.1871497694330204, + "learning_rate": 1.960728426368356e-05, + "loss": 1.1836, + "step": 2433 + }, + { + "epoch": 0.12, + "grad_norm": 1.5638691628691719, + "learning_rate": 1.9606851875768404e-05, + "loss": 1.1543, + "step": 2434 + }, + { + "epoch": 0.12, + "grad_norm": 1.4151425716564714, + "learning_rate": 1.9606419254723205e-05, + "loss": 1.0837, + "step": 2435 + }, + { + "epoch": 0.12, + "grad_norm": 1.324307886979762, + "learning_rate": 1.9605986400558462e-05, + "loss": 1.23, + "step": 2436 + }, + { + "epoch": 0.12, + "grad_norm": 0.9799720856040415, + "learning_rate": 1.9605553313284682e-05, + "loss": 1.0884, + "step": 2437 + }, + { + "epoch": 0.12, + "grad_norm": 1.2656681114291513, + "learning_rate": 1.9605119992912368e-05, + "loss": 1.3472, + "step": 2438 + }, + { + "epoch": 0.12, + "grad_norm": 1.2059794718373356, + "learning_rate": 1.960468643945204e-05, + "loss": 1.1846, + "step": 2439 + }, + { + "epoch": 0.12, + "grad_norm": 1.3239270641480203, + "learning_rate": 1.9604252652914222e-05, + "loss": 1.2397, + "step": 2440 + }, + { + "epoch": 0.12, + "grad_norm": 1.598138040968418, + "learning_rate": 1.9603818633309434e-05, + "loss": 1.3169, + "step": 2441 + }, + { + "epoch": 0.12, + "grad_norm": 1.2665232835889386, + "learning_rate": 1.9603384380648213e-05, + "loss": 1.1663, + "step": 2442 + }, + { + "epoch": 0.12, + "grad_norm": 1.4105574795427946, + "learning_rate": 1.9602949894941096e-05, + "loss": 1.2988, + "step": 2443 + }, + { + "epoch": 0.12, + "grad_norm": 1.124746707470773, + "learning_rate": 1.9602515176198623e-05, + "loss": 1.3354, + "step": 2444 + }, + { + "epoch": 0.12, + "grad_norm": 1.047550100685344, + "learning_rate": 1.960208022443135e-05, + "loss": 1.2676, + "step": 2445 + }, + { + "epoch": 0.12, + "grad_norm": 1.4108711807779835, + "learning_rate": 1.9601645039649828e-05, + "loss": 1.2515, + "step": 2446 + }, + { + "epoch": 0.12, + "grad_norm": 1.3328310165310426, + "learning_rate": 1.9601209621864616e-05, + "loss": 1.1851, + "step": 2447 + }, + { + "epoch": 0.12, + "grad_norm": 1.229659908431658, + "learning_rate": 1.9600773971086286e-05, + "loss": 1.1709, + "step": 2448 + }, + { + "epoch": 0.12, + "grad_norm": 1.225084249140839, + "learning_rate": 1.9600338087325407e-05, + "loss": 1.189, + "step": 2449 + }, + { + "epoch": 0.12, + "grad_norm": 1.245548605235522, + "learning_rate": 1.959990197059255e-05, + "loss": 1.3105, + "step": 2450 + }, + { + "epoch": 0.12, + "grad_norm": 1.53512254937094, + "learning_rate": 1.9599465620898314e-05, + "loss": 1.3086, + "step": 2451 + }, + { + "epoch": 0.12, + "grad_norm": 1.1580350944690099, + "learning_rate": 1.959902903825327e-05, + "loss": 1.3223, + "step": 2452 + }, + { + "epoch": 0.12, + "grad_norm": 1.531640929789945, + "learning_rate": 1.959859222266803e-05, + "loss": 1.2827, + "step": 2453 + }, + { + "epoch": 0.12, + "grad_norm": 1.1337689347631477, + "learning_rate": 1.9598155174153174e-05, + "loss": 1.2603, + "step": 2454 + }, + { + "epoch": 0.12, + "grad_norm": 1.2204928391586622, + "learning_rate": 1.9597717892719326e-05, + "loss": 1.2412, + "step": 2455 + }, + { + "epoch": 0.12, + "grad_norm": 1.1612531697793014, + "learning_rate": 1.9597280378377087e-05, + "loss": 1.1694, + "step": 2456 + }, + { + "epoch": 0.12, + "grad_norm": 1.3711515961128697, + "learning_rate": 1.959684263113708e-05, + "loss": 1.3076, + "step": 2457 + }, + { + "epoch": 0.12, + "grad_norm": 1.434354278884517, + "learning_rate": 1.9596404651009928e-05, + "loss": 1.2007, + "step": 2458 + }, + { + "epoch": 0.12, + "grad_norm": 1.3669500565608845, + "learning_rate": 1.9595966438006253e-05, + "loss": 1.3276, + "step": 2459 + }, + { + "epoch": 0.12, + "grad_norm": 1.351034087300645, + "learning_rate": 1.9595527992136697e-05, + "loss": 1.231, + "step": 2460 + }, + { + "epoch": 0.12, + "grad_norm": 1.2965194965792934, + "learning_rate": 1.9595089313411892e-05, + "loss": 1.2397, + "step": 2461 + }, + { + "epoch": 0.12, + "grad_norm": 1.2544750643138867, + "learning_rate": 1.9594650401842493e-05, + "loss": 1.1113, + "step": 2462 + }, + { + "epoch": 0.12, + "grad_norm": 1.302407483722702, + "learning_rate": 1.959421125743914e-05, + "loss": 1.1201, + "step": 2463 + }, + { + "epoch": 0.12, + "grad_norm": 1.2766225793278472, + "learning_rate": 1.9593771880212498e-05, + "loss": 1.2822, + "step": 2464 + }, + { + "epoch": 0.12, + "grad_norm": 1.5861913607961307, + "learning_rate": 1.9593332270173225e-05, + "loss": 1.1416, + "step": 2465 + }, + { + "epoch": 0.12, + "grad_norm": 1.2984664062934461, + "learning_rate": 1.9592892427331993e-05, + "loss": 1.2148, + "step": 2466 + }, + { + "epoch": 0.12, + "grad_norm": 1.5021415314302897, + "learning_rate": 1.9592452351699475e-05, + "loss": 1.2886, + "step": 2467 + }, + { + "epoch": 0.12, + "grad_norm": 1.321530101180719, + "learning_rate": 1.9592012043286342e-05, + "loss": 1.2026, + "step": 2468 + }, + { + "epoch": 0.12, + "grad_norm": 1.4072328595076355, + "learning_rate": 1.9591571502103294e-05, + "loss": 1.1577, + "step": 2469 + }, + { + "epoch": 0.12, + "grad_norm": 1.2182644914677294, + "learning_rate": 1.959113072816101e-05, + "loss": 1.1343, + "step": 2470 + }, + { + "epoch": 0.12, + "grad_norm": 1.4104888949377745, + "learning_rate": 1.9590689721470188e-05, + "loss": 1.2358, + "step": 2471 + }, + { + "epoch": 0.12, + "grad_norm": 0.6961421274735642, + "learning_rate": 1.9590248482041533e-05, + "loss": 1.2969, + "step": 2472 + }, + { + "epoch": 0.12, + "grad_norm": 1.3907948908161734, + "learning_rate": 1.958980700988575e-05, + "loss": 1.3198, + "step": 2473 + }, + { + "epoch": 0.12, + "grad_norm": 1.2553004135796941, + "learning_rate": 1.9589365305013556e-05, + "loss": 1.2202, + "step": 2474 + }, + { + "epoch": 0.12, + "grad_norm": 1.218533837696613, + "learning_rate": 1.9588923367435667e-05, + "loss": 1.2515, + "step": 2475 + }, + { + "epoch": 0.12, + "grad_norm": 1.648522718900821, + "learning_rate": 1.9588481197162804e-05, + "loss": 1.1035, + "step": 2476 + }, + { + "epoch": 0.12, + "grad_norm": 1.0967915039245528, + "learning_rate": 1.9588038794205705e-05, + "loss": 1.2847, + "step": 2477 + }, + { + "epoch": 0.12, + "grad_norm": 1.2119963886914744, + "learning_rate": 1.9587596158575102e-05, + "loss": 1.1187, + "step": 2478 + }, + { + "epoch": 0.12, + "grad_norm": 1.2669029385296082, + "learning_rate": 1.9587153290281734e-05, + "loss": 1.168, + "step": 2479 + }, + { + "epoch": 0.12, + "grad_norm": 1.204797449252495, + "learning_rate": 1.958671018933635e-05, + "loss": 1.2051, + "step": 2480 + }, + { + "epoch": 0.12, + "grad_norm": 1.0970592524335145, + "learning_rate": 1.958626685574971e-05, + "loss": 1.293, + "step": 2481 + }, + { + "epoch": 0.12, + "grad_norm": 0.9473074251097754, + "learning_rate": 1.9585823289532556e-05, + "loss": 1.1812, + "step": 2482 + }, + { + "epoch": 0.12, + "grad_norm": 1.6550291120624565, + "learning_rate": 1.958537949069567e-05, + "loss": 1.1382, + "step": 2483 + }, + { + "epoch": 0.12, + "grad_norm": 1.338278912102079, + "learning_rate": 1.9584935459249807e-05, + "loss": 1.4204, + "step": 2484 + }, + { + "epoch": 0.12, + "grad_norm": 1.311648532400192, + "learning_rate": 1.958449119520575e-05, + "loss": 1.3037, + "step": 2485 + }, + { + "epoch": 0.12, + "grad_norm": 1.1005799797656695, + "learning_rate": 1.9584046698574282e-05, + "loss": 1.1064, + "step": 2486 + }, + { + "epoch": 0.12, + "grad_norm": 1.0672076846271734, + "learning_rate": 1.958360196936618e-05, + "loss": 1.1934, + "step": 2487 + }, + { + "epoch": 0.12, + "grad_norm": 1.1558947992208253, + "learning_rate": 1.958315700759225e-05, + "loss": 1.3779, + "step": 2488 + }, + { + "epoch": 0.12, + "grad_norm": 1.574008515859585, + "learning_rate": 1.9582711813263277e-05, + "loss": 1.29, + "step": 2489 + }, + { + "epoch": 0.12, + "grad_norm": 1.1865040571335295, + "learning_rate": 1.9582266386390075e-05, + "loss": 1.2261, + "step": 2490 + }, + { + "epoch": 0.12, + "grad_norm": 1.2593440403947982, + "learning_rate": 1.9581820726983443e-05, + "loss": 1.2559, + "step": 2491 + }, + { + "epoch": 0.12, + "grad_norm": 1.1742214226761714, + "learning_rate": 1.9581374835054205e-05, + "loss": 1.269, + "step": 2492 + }, + { + "epoch": 0.12, + "grad_norm": 1.4270853656376172, + "learning_rate": 1.9580928710613176e-05, + "loss": 1.333, + "step": 2493 + }, + { + "epoch": 0.12, + "grad_norm": 1.20208617408777, + "learning_rate": 1.9580482353671184e-05, + "loss": 1.002, + "step": 2494 + }, + { + "epoch": 0.12, + "grad_norm": 1.298617437152622, + "learning_rate": 1.958003576423906e-05, + "loss": 1.0962, + "step": 2495 + }, + { + "epoch": 0.12, + "grad_norm": 1.2116954482290827, + "learning_rate": 1.9579588942327642e-05, + "loss": 1.1411, + "step": 2496 + }, + { + "epoch": 0.12, + "grad_norm": 0.9719017423123277, + "learning_rate": 1.9579141887947772e-05, + "loss": 1.1509, + "step": 2497 + }, + { + "epoch": 0.12, + "grad_norm": 1.12164002510178, + "learning_rate": 1.95786946011103e-05, + "loss": 1.3472, + "step": 2498 + }, + { + "epoch": 0.12, + "grad_norm": 1.4180597723917212, + "learning_rate": 1.9578247081826083e-05, + "loss": 1.186, + "step": 2499 + }, + { + "epoch": 0.12, + "grad_norm": 1.3084173166108897, + "learning_rate": 1.9577799330105973e-05, + "loss": 1.2881, + "step": 2500 + }, + { + "epoch": 0.12, + "grad_norm": 1.299736565275629, + "learning_rate": 1.9577351345960845e-05, + "loss": 1.3809, + "step": 2501 + }, + { + "epoch": 0.12, + "grad_norm": 1.3746082732864824, + "learning_rate": 1.9576903129401563e-05, + "loss": 1.2676, + "step": 2502 + }, + { + "epoch": 0.12, + "grad_norm": 1.2640632647327177, + "learning_rate": 1.957645468043901e-05, + "loss": 1.3125, + "step": 2503 + }, + { + "epoch": 0.12, + "grad_norm": 1.2860940365915123, + "learning_rate": 1.957600599908406e-05, + "loss": 1.3188, + "step": 2504 + }, + { + "epoch": 0.12, + "grad_norm": 1.1169321533739667, + "learning_rate": 1.957555708534761e-05, + "loss": 1.2637, + "step": 2505 + }, + { + "epoch": 0.12, + "grad_norm": 1.360334653838007, + "learning_rate": 1.9575107939240548e-05, + "loss": 1.2456, + "step": 2506 + }, + { + "epoch": 0.12, + "grad_norm": 1.390121526999914, + "learning_rate": 1.957465856077378e-05, + "loss": 1.3091, + "step": 2507 + }, + { + "epoch": 0.12, + "grad_norm": 1.2463547431948927, + "learning_rate": 1.9574208949958203e-05, + "loss": 1.1924, + "step": 2508 + }, + { + "epoch": 0.12, + "grad_norm": 1.3434113042787004, + "learning_rate": 1.9573759106804732e-05, + "loss": 1.2554, + "step": 2509 + }, + { + "epoch": 0.12, + "grad_norm": 1.0404267395429299, + "learning_rate": 1.9573309031324284e-05, + "loss": 1.2856, + "step": 2510 + }, + { + "epoch": 0.12, + "grad_norm": 1.193358636108902, + "learning_rate": 1.957285872352778e-05, + "loss": 1.2168, + "step": 2511 + }, + { + "epoch": 0.12, + "grad_norm": 0.9772835024094433, + "learning_rate": 1.9572408183426145e-05, + "loss": 1.1079, + "step": 2512 + }, + { + "epoch": 0.12, + "grad_norm": 1.2076002086011914, + "learning_rate": 1.9571957411030318e-05, + "loss": 1.0574, + "step": 2513 + }, + { + "epoch": 0.12, + "grad_norm": 1.1905960519884033, + "learning_rate": 1.9571506406351233e-05, + "loss": 1.1367, + "step": 2514 + }, + { + "epoch": 0.12, + "grad_norm": 1.0675195487722746, + "learning_rate": 1.9571055169399837e-05, + "loss": 0.959, + "step": 2515 + }, + { + "epoch": 0.12, + "grad_norm": 1.0630188218764152, + "learning_rate": 1.957060370018708e-05, + "loss": 1.2651, + "step": 2516 + }, + { + "epoch": 0.12, + "grad_norm": 1.135285453656186, + "learning_rate": 1.9570151998723918e-05, + "loss": 1.3384, + "step": 2517 + }, + { + "epoch": 0.12, + "grad_norm": 1.1967640924909215, + "learning_rate": 1.956970006502131e-05, + "loss": 1.165, + "step": 2518 + }, + { + "epoch": 0.12, + "grad_norm": 1.2657818878411677, + "learning_rate": 1.956924789909022e-05, + "loss": 1.2822, + "step": 2519 + }, + { + "epoch": 0.12, + "grad_norm": 1.28813565836533, + "learning_rate": 1.9568795500941635e-05, + "loss": 1.2642, + "step": 2520 + }, + { + "epoch": 0.12, + "grad_norm": 1.2742719865336056, + "learning_rate": 1.956834287058652e-05, + "loss": 1.249, + "step": 2521 + }, + { + "epoch": 0.12, + "grad_norm": 1.2157443690434664, + "learning_rate": 1.9567890008035865e-05, + "loss": 1.2173, + "step": 2522 + }, + { + "epoch": 0.12, + "grad_norm": 1.2288650982979246, + "learning_rate": 1.956743691330065e-05, + "loss": 1.0474, + "step": 2523 + }, + { + "epoch": 0.12, + "grad_norm": 1.2431435582546921, + "learning_rate": 1.9566983586391884e-05, + "loss": 1.2158, + "step": 2524 + }, + { + "epoch": 0.12, + "grad_norm": 1.406443882668918, + "learning_rate": 1.956653002732056e-05, + "loss": 1.1934, + "step": 2525 + }, + { + "epoch": 0.12, + "grad_norm": 1.1933260696554162, + "learning_rate": 1.9566076236097695e-05, + "loss": 1.2246, + "step": 2526 + }, + { + "epoch": 0.12, + "grad_norm": 1.229206244179249, + "learning_rate": 1.956562221273428e-05, + "loss": 1.0867, + "step": 2527 + }, + { + "epoch": 0.12, + "grad_norm": 1.2948996800080066, + "learning_rate": 1.9565167957241353e-05, + "loss": 1.2993, + "step": 2528 + }, + { + "epoch": 0.12, + "grad_norm": 1.3555341576612308, + "learning_rate": 1.9564713469629928e-05, + "loss": 1.2974, + "step": 2529 + }, + { + "epoch": 0.12, + "grad_norm": 1.1326747749344523, + "learning_rate": 1.9564258749911035e-05, + "loss": 1.1313, + "step": 2530 + }, + { + "epoch": 0.12, + "grad_norm": 1.0366328506288642, + "learning_rate": 1.956380379809571e-05, + "loss": 1.3643, + "step": 2531 + }, + { + "epoch": 0.12, + "grad_norm": 1.2003266400327943, + "learning_rate": 1.9563348614194992e-05, + "loss": 1.168, + "step": 2532 + }, + { + "epoch": 0.12, + "grad_norm": 1.1976393241519945, + "learning_rate": 1.956289319821993e-05, + "loss": 1.272, + "step": 2533 + }, + { + "epoch": 0.12, + "grad_norm": 1.1002543056157317, + "learning_rate": 1.9562437550181573e-05, + "loss": 1.1284, + "step": 2534 + }, + { + "epoch": 0.12, + "grad_norm": 1.8569249495359663, + "learning_rate": 1.9561981670090978e-05, + "loss": 1.3965, + "step": 2535 + }, + { + "epoch": 0.12, + "grad_norm": 1.2969697636452504, + "learning_rate": 1.9561525557959207e-05, + "loss": 1.2036, + "step": 2536 + }, + { + "epoch": 0.12, + "grad_norm": 1.2208261278774912, + "learning_rate": 1.9561069213797333e-05, + "loss": 1.2969, + "step": 2537 + }, + { + "epoch": 0.12, + "grad_norm": 1.3067256060846895, + "learning_rate": 1.9560612637616428e-05, + "loss": 1.2017, + "step": 2538 + }, + { + "epoch": 0.12, + "grad_norm": 1.3265013685582838, + "learning_rate": 1.9560155829427567e-05, + "loss": 1.1797, + "step": 2539 + }, + { + "epoch": 0.12, + "grad_norm": 1.1594222004434382, + "learning_rate": 1.9559698789241844e-05, + "loss": 1.2778, + "step": 2540 + }, + { + "epoch": 0.12, + "grad_norm": 1.3768017931854704, + "learning_rate": 1.955924151707034e-05, + "loss": 1.1724, + "step": 2541 + }, + { + "epoch": 0.12, + "grad_norm": 0.973083906625703, + "learning_rate": 1.955878401292416e-05, + "loss": 1.2808, + "step": 2542 + }, + { + "epoch": 0.12, + "grad_norm": 1.3339386892132308, + "learning_rate": 1.955832627681441e-05, + "loss": 1.2246, + "step": 2543 + }, + { + "epoch": 0.12, + "grad_norm": 1.241590596089356, + "learning_rate": 1.955786830875218e-05, + "loss": 1.1182, + "step": 2544 + }, + { + "epoch": 0.12, + "grad_norm": 1.1066817381920073, + "learning_rate": 1.95574101087486e-05, + "loss": 1.2676, + "step": 2545 + }, + { + "epoch": 0.12, + "grad_norm": 1.1578904508740844, + "learning_rate": 1.9556951676814787e-05, + "loss": 1.1108, + "step": 2546 + }, + { + "epoch": 0.12, + "grad_norm": 0.8413746696123369, + "learning_rate": 1.9556493012961856e-05, + "loss": 1.2534, + "step": 2547 + }, + { + "epoch": 0.12, + "grad_norm": 1.3731716400194995, + "learning_rate": 1.955603411720095e-05, + "loss": 1.2393, + "step": 2548 + }, + { + "epoch": 0.12, + "grad_norm": 1.3809773629016897, + "learning_rate": 1.9555574989543197e-05, + "loss": 1.3438, + "step": 2549 + }, + { + "epoch": 0.12, + "grad_norm": 1.1396521329289346, + "learning_rate": 1.9555115629999738e-05, + "loss": 1.2383, + "step": 2550 + }, + { + "epoch": 0.12, + "grad_norm": 1.1579492274205814, + "learning_rate": 1.9554656038581728e-05, + "loss": 1.3022, + "step": 2551 + }, + { + "epoch": 0.12, + "grad_norm": 1.2294145735131206, + "learning_rate": 1.9554196215300314e-05, + "loss": 1.2847, + "step": 2552 + }, + { + "epoch": 0.12, + "grad_norm": 1.0548017698656251, + "learning_rate": 1.9553736160166657e-05, + "loss": 1.2065, + "step": 2553 + }, + { + "epoch": 0.12, + "grad_norm": 1.5035341990551603, + "learning_rate": 1.9553275873191916e-05, + "loss": 1.3442, + "step": 2554 + }, + { + "epoch": 0.12, + "grad_norm": 1.2044633300176522, + "learning_rate": 1.9552815354387267e-05, + "loss": 1.0396, + "step": 2555 + }, + { + "epoch": 0.12, + "grad_norm": 1.0050409787691135, + "learning_rate": 1.9552354603763882e-05, + "loss": 1.3657, + "step": 2556 + }, + { + "epoch": 0.12, + "grad_norm": 1.1926393835383338, + "learning_rate": 1.9551893621332944e-05, + "loss": 1.2524, + "step": 2557 + }, + { + "epoch": 0.12, + "grad_norm": 1.1322618278041925, + "learning_rate": 1.9551432407105642e-05, + "loss": 1.1406, + "step": 2558 + }, + { + "epoch": 0.12, + "grad_norm": 1.2732769596000317, + "learning_rate": 1.955097096109316e-05, + "loss": 1.2109, + "step": 2559 + }, + { + "epoch": 0.12, + "grad_norm": 1.3140435344672652, + "learning_rate": 1.9550509283306703e-05, + "loss": 1.2598, + "step": 2560 + }, + { + "epoch": 0.12, + "grad_norm": 1.31704069217963, + "learning_rate": 1.9550047373757475e-05, + "loss": 1.3794, + "step": 2561 + }, + { + "epoch": 0.12, + "grad_norm": 1.149483495781311, + "learning_rate": 1.9549585232456682e-05, + "loss": 1.1572, + "step": 2562 + }, + { + "epoch": 0.12, + "grad_norm": 1.447439199725947, + "learning_rate": 1.9549122859415538e-05, + "loss": 1.2598, + "step": 2563 + }, + { + "epoch": 0.12, + "grad_norm": 1.2720953620813826, + "learning_rate": 1.9548660254645265e-05, + "loss": 1.3188, + "step": 2564 + }, + { + "epoch": 0.12, + "grad_norm": 1.20609021116678, + "learning_rate": 1.954819741815709e-05, + "loss": 1.1548, + "step": 2565 + }, + { + "epoch": 0.12, + "grad_norm": 1.3947520685983186, + "learning_rate": 1.9547734349962246e-05, + "loss": 1.3662, + "step": 2566 + }, + { + "epoch": 0.12, + "grad_norm": 1.3989359877201486, + "learning_rate": 1.9547271050071965e-05, + "loss": 1.2241, + "step": 2567 + }, + { + "epoch": 0.12, + "grad_norm": 1.4325290086939035, + "learning_rate": 1.9546807518497496e-05, + "loss": 1.2622, + "step": 2568 + }, + { + "epoch": 0.12, + "grad_norm": 1.3047474618513795, + "learning_rate": 1.954634375525008e-05, + "loss": 1.1157, + "step": 2569 + }, + { + "epoch": 0.12, + "grad_norm": 1.414767017813904, + "learning_rate": 1.9545879760340983e-05, + "loss": 1.3994, + "step": 2570 + }, + { + "epoch": 0.12, + "grad_norm": 1.2003062612738746, + "learning_rate": 1.9545415533781453e-05, + "loss": 1.2212, + "step": 2571 + }, + { + "epoch": 0.12, + "grad_norm": 1.1903294018204333, + "learning_rate": 1.954495107558276e-05, + "loss": 1.271, + "step": 2572 + }, + { + "epoch": 0.12, + "grad_norm": 1.1706819487490872, + "learning_rate": 1.9544486385756176e-05, + "loss": 1.1875, + "step": 2573 + }, + { + "epoch": 0.12, + "grad_norm": 1.3106494686487498, + "learning_rate": 1.9544021464312977e-05, + "loss": 1.1704, + "step": 2574 + }, + { + "epoch": 0.12, + "grad_norm": 1.1807119241232262, + "learning_rate": 1.9543556311264445e-05, + "loss": 1.1567, + "step": 2575 + }, + { + "epoch": 0.12, + "grad_norm": 1.1699019314697918, + "learning_rate": 1.954309092662187e-05, + "loss": 1.1992, + "step": 2576 + }, + { + "epoch": 0.12, + "grad_norm": 1.034930724978826, + "learning_rate": 1.9542625310396538e-05, + "loss": 0.9543, + "step": 2577 + }, + { + "epoch": 0.12, + "grad_norm": 0.9398418428180848, + "learning_rate": 1.9542159462599755e-05, + "loss": 1.127, + "step": 2578 + }, + { + "epoch": 0.12, + "grad_norm": 1.4324097547456225, + "learning_rate": 1.954169338324283e-05, + "loss": 1.3579, + "step": 2579 + }, + { + "epoch": 0.12, + "grad_norm": 1.2763489788869873, + "learning_rate": 1.954122707233706e-05, + "loss": 1.2388, + "step": 2580 + }, + { + "epoch": 0.12, + "grad_norm": 1.1688819639416301, + "learning_rate": 1.954076052989377e-05, + "loss": 1.1948, + "step": 2581 + }, + { + "epoch": 0.12, + "grad_norm": 1.2252312817180646, + "learning_rate": 1.9540293755924285e-05, + "loss": 1.1094, + "step": 2582 + }, + { + "epoch": 0.12, + "grad_norm": 0.918004253672766, + "learning_rate": 1.9539826750439926e-05, + "loss": 1.1992, + "step": 2583 + }, + { + "epoch": 0.12, + "grad_norm": 1.3443329623597808, + "learning_rate": 1.9539359513452026e-05, + "loss": 1.3086, + "step": 2584 + }, + { + "epoch": 0.12, + "grad_norm": 1.163462173070634, + "learning_rate": 1.9538892044971925e-05, + "loss": 1.2451, + "step": 2585 + }, + { + "epoch": 0.12, + "grad_norm": 1.256946538894521, + "learning_rate": 1.9538424345010968e-05, + "loss": 1.2422, + "step": 2586 + }, + { + "epoch": 0.12, + "grad_norm": 1.1091409836490176, + "learning_rate": 1.9537956413580504e-05, + "loss": 1.2983, + "step": 2587 + }, + { + "epoch": 0.12, + "grad_norm": 1.2413419147935638, + "learning_rate": 1.9537488250691884e-05, + "loss": 1.1318, + "step": 2588 + }, + { + "epoch": 0.12, + "grad_norm": 1.3492905223479534, + "learning_rate": 1.9537019856356478e-05, + "loss": 1.2739, + "step": 2589 + }, + { + "epoch": 0.12, + "grad_norm": 1.1375947542570342, + "learning_rate": 1.9536551230585643e-05, + "loss": 1.0891, + "step": 2590 + }, + { + "epoch": 0.12, + "grad_norm": 1.3212637437025099, + "learning_rate": 1.953608237339076e-05, + "loss": 1.2842, + "step": 2591 + }, + { + "epoch": 0.12, + "grad_norm": 1.3761988711110171, + "learning_rate": 1.95356132847832e-05, + "loss": 1.2778, + "step": 2592 + }, + { + "epoch": 0.12, + "grad_norm": 1.4514536335341264, + "learning_rate": 1.953514396477435e-05, + "loss": 1.1572, + "step": 2593 + }, + { + "epoch": 0.12, + "grad_norm": 1.1903706105917298, + "learning_rate": 1.9534674413375595e-05, + "loss": 1.3467, + "step": 2594 + }, + { + "epoch": 0.12, + "grad_norm": 1.1040056340807305, + "learning_rate": 1.9534204630598334e-05, + "loss": 1.2041, + "step": 2595 + }, + { + "epoch": 0.12, + "grad_norm": 1.2644244620636267, + "learning_rate": 1.953373461645397e-05, + "loss": 1.1506, + "step": 2596 + }, + { + "epoch": 0.12, + "grad_norm": 1.377398691695164, + "learning_rate": 1.9533264370953898e-05, + "loss": 1.2046, + "step": 2597 + }, + { + "epoch": 0.12, + "grad_norm": 1.2565133448946917, + "learning_rate": 1.953279389410954e-05, + "loss": 1.3662, + "step": 2598 + }, + { + "epoch": 0.13, + "grad_norm": 1.5464803177947397, + "learning_rate": 1.9532323185932306e-05, + "loss": 1.2217, + "step": 2599 + }, + { + "epoch": 0.13, + "grad_norm": 1.301454939421954, + "learning_rate": 1.953185224643362e-05, + "loss": 1.0884, + "step": 2600 + }, + { + "epoch": 0.13, + "grad_norm": 1.3985990883137358, + "learning_rate": 1.953138107562492e-05, + "loss": 1.1089, + "step": 2601 + }, + { + "epoch": 0.13, + "grad_norm": 1.1090103385598904, + "learning_rate": 1.953090967351763e-05, + "loss": 1.1455, + "step": 2602 + }, + { + "epoch": 0.13, + "grad_norm": 1.2274952346429011, + "learning_rate": 1.9530438040123188e-05, + "loss": 1.2515, + "step": 2603 + }, + { + "epoch": 0.13, + "grad_norm": 1.3273102530417267, + "learning_rate": 1.952996617545304e-05, + "loss": 1.2134, + "step": 2604 + }, + { + "epoch": 0.13, + "grad_norm": 1.383412273319498, + "learning_rate": 1.9529494079518647e-05, + "loss": 1.2109, + "step": 2605 + }, + { + "epoch": 0.13, + "grad_norm": 1.4826394588138023, + "learning_rate": 1.9529021752331455e-05, + "loss": 1.4165, + "step": 2606 + }, + { + "epoch": 0.13, + "grad_norm": 1.5892180018441058, + "learning_rate": 1.9528549193902926e-05, + "loss": 0.9695, + "step": 2607 + }, + { + "epoch": 0.13, + "grad_norm": 1.4107540037903843, + "learning_rate": 1.9528076404244537e-05, + "loss": 1.1387, + "step": 2608 + }, + { + "epoch": 0.13, + "grad_norm": 1.2886044296723107, + "learning_rate": 1.952760338336775e-05, + "loss": 1.2129, + "step": 2609 + }, + { + "epoch": 0.13, + "grad_norm": 1.5333376630874707, + "learning_rate": 1.952713013128405e-05, + "loss": 1.0986, + "step": 2610 + }, + { + "epoch": 0.13, + "grad_norm": 1.1946910437376104, + "learning_rate": 1.9526656648004918e-05, + "loss": 1.2456, + "step": 2611 + }, + { + "epoch": 0.13, + "grad_norm": 1.168488957707521, + "learning_rate": 1.952618293354185e-05, + "loss": 1.0388, + "step": 2612 + }, + { + "epoch": 0.13, + "grad_norm": 1.329239601506811, + "learning_rate": 1.9525708987906334e-05, + "loss": 1.1284, + "step": 2613 + }, + { + "epoch": 0.13, + "grad_norm": 1.2847056575150058, + "learning_rate": 1.9525234811109874e-05, + "loss": 1.2842, + "step": 2614 + }, + { + "epoch": 0.13, + "grad_norm": 1.3916592701412236, + "learning_rate": 1.952476040316398e-05, + "loss": 1.2749, + "step": 2615 + }, + { + "epoch": 0.13, + "grad_norm": 1.0536584767894503, + "learning_rate": 1.9524285764080166e-05, + "loss": 1.2012, + "step": 2616 + }, + { + "epoch": 0.13, + "grad_norm": 1.1068126620545042, + "learning_rate": 1.9523810893869937e-05, + "loss": 1.2588, + "step": 2617 + }, + { + "epoch": 0.13, + "grad_norm": 1.381231981422122, + "learning_rate": 1.9523335792544835e-05, + "loss": 1.2227, + "step": 2618 + }, + { + "epoch": 0.13, + "grad_norm": 1.419782463688498, + "learning_rate": 1.9522860460116377e-05, + "loss": 1.3232, + "step": 2619 + }, + { + "epoch": 0.13, + "grad_norm": 0.8967362204817978, + "learning_rate": 1.9522384896596102e-05, + "loss": 1.1235, + "step": 2620 + }, + { + "epoch": 0.13, + "grad_norm": 1.3903719482081585, + "learning_rate": 1.952190910199555e-05, + "loss": 1.3281, + "step": 2621 + }, + { + "epoch": 0.13, + "grad_norm": 1.1788294324506943, + "learning_rate": 1.9521433076326267e-05, + "loss": 0.9014, + "step": 2622 + }, + { + "epoch": 0.13, + "grad_norm": 1.238667270163528, + "learning_rate": 1.9520956819599804e-05, + "loss": 1.3223, + "step": 2623 + }, + { + "epoch": 0.13, + "grad_norm": 1.3185670949971537, + "learning_rate": 1.9520480331827718e-05, + "loss": 1.0427, + "step": 2624 + }, + { + "epoch": 0.13, + "grad_norm": 1.3208644992634633, + "learning_rate": 1.9520003613021577e-05, + "loss": 1.1436, + "step": 2625 + }, + { + "epoch": 0.13, + "grad_norm": 1.1201621181408399, + "learning_rate": 1.951952666319294e-05, + "loss": 1.228, + "step": 2626 + }, + { + "epoch": 0.13, + "grad_norm": 1.1443857375294177, + "learning_rate": 1.9519049482353393e-05, + "loss": 1.2354, + "step": 2627 + }, + { + "epoch": 0.13, + "grad_norm": 1.390447214870896, + "learning_rate": 1.9518572070514507e-05, + "loss": 1.2109, + "step": 2628 + }, + { + "epoch": 0.13, + "grad_norm": 1.2813332936432766, + "learning_rate": 1.9518094427687866e-05, + "loss": 1.2993, + "step": 2629 + }, + { + "epoch": 0.13, + "grad_norm": 1.3516008541789637, + "learning_rate": 1.951761655388507e-05, + "loss": 1.1265, + "step": 2630 + }, + { + "epoch": 0.13, + "grad_norm": 1.3861402566311032, + "learning_rate": 1.9517138449117706e-05, + "loss": 1.1309, + "step": 2631 + }, + { + "epoch": 0.13, + "grad_norm": 1.277975067018898, + "learning_rate": 1.9516660113397386e-05, + "loss": 1.2993, + "step": 2632 + }, + { + "epoch": 0.13, + "grad_norm": 1.2048315741015423, + "learning_rate": 1.951618154673571e-05, + "loss": 1.1948, + "step": 2633 + }, + { + "epoch": 0.13, + "grad_norm": 1.4532587668465788, + "learning_rate": 1.9515702749144293e-05, + "loss": 1.2021, + "step": 2634 + }, + { + "epoch": 0.13, + "grad_norm": 1.2551261815297083, + "learning_rate": 1.951522372063476e-05, + "loss": 1.2505, + "step": 2635 + }, + { + "epoch": 0.13, + "grad_norm": 1.105726220711786, + "learning_rate": 1.9514744461218725e-05, + "loss": 1.1606, + "step": 2636 + }, + { + "epoch": 0.13, + "grad_norm": 1.3688134028680958, + "learning_rate": 1.9514264970907825e-05, + "loss": 1.1997, + "step": 2637 + }, + { + "epoch": 0.13, + "grad_norm": 1.3219094090564436, + "learning_rate": 1.9513785249713697e-05, + "loss": 1.1135, + "step": 2638 + }, + { + "epoch": 0.13, + "grad_norm": 1.7853339886908384, + "learning_rate": 1.9513305297647976e-05, + "loss": 1.0913, + "step": 2639 + }, + { + "epoch": 0.13, + "grad_norm": 1.5353443092519012, + "learning_rate": 1.9512825114722314e-05, + "loss": 1.1616, + "step": 2640 + }, + { + "epoch": 0.13, + "grad_norm": 1.3129276225767958, + "learning_rate": 1.9512344700948363e-05, + "loss": 1.2583, + "step": 2641 + }, + { + "epoch": 0.13, + "grad_norm": 1.1660160742699783, + "learning_rate": 1.9511864056337784e-05, + "loss": 1.2637, + "step": 2642 + }, + { + "epoch": 0.13, + "grad_norm": 1.411010402149613, + "learning_rate": 1.9511383180902237e-05, + "loss": 1.2092, + "step": 2643 + }, + { + "epoch": 0.13, + "grad_norm": 1.1203285779663665, + "learning_rate": 1.951090207465339e-05, + "loss": 1.1519, + "step": 2644 + }, + { + "epoch": 0.13, + "grad_norm": 1.130191699555457, + "learning_rate": 1.951042073760292e-05, + "loss": 1.1143, + "step": 2645 + }, + { + "epoch": 0.13, + "grad_norm": 1.4040264993422515, + "learning_rate": 1.950993916976251e-05, + "loss": 1.3159, + "step": 2646 + }, + { + "epoch": 0.13, + "grad_norm": 1.3799814099078165, + "learning_rate": 1.9509457371143843e-05, + "loss": 1.2622, + "step": 2647 + }, + { + "epoch": 0.13, + "grad_norm": 1.374255281661571, + "learning_rate": 1.950897534175861e-05, + "loss": 1.3154, + "step": 2648 + }, + { + "epoch": 0.13, + "grad_norm": 1.207799743896137, + "learning_rate": 1.9508493081618515e-05, + "loss": 1.3262, + "step": 2649 + }, + { + "epoch": 0.13, + "grad_norm": 1.3411659752367298, + "learning_rate": 1.9508010590735252e-05, + "loss": 1.3589, + "step": 2650 + }, + { + "epoch": 0.13, + "grad_norm": 0.9885599863767687, + "learning_rate": 1.9507527869120534e-05, + "loss": 1.2959, + "step": 2651 + }, + { + "epoch": 0.13, + "grad_norm": 1.1772614334998182, + "learning_rate": 1.950704491678608e-05, + "loss": 1.2778, + "step": 2652 + }, + { + "epoch": 0.13, + "grad_norm": 1.2873046350426027, + "learning_rate": 1.95065617337436e-05, + "loss": 1.1743, + "step": 2653 + }, + { + "epoch": 0.13, + "grad_norm": 1.0156569352513143, + "learning_rate": 1.9506078320004825e-05, + "loss": 1.2769, + "step": 2654 + }, + { + "epoch": 0.13, + "grad_norm": 1.7578318469170935, + "learning_rate": 1.950559467558149e-05, + "loss": 1.3306, + "step": 2655 + }, + { + "epoch": 0.13, + "grad_norm": 1.3665634164972262, + "learning_rate": 1.9505110800485324e-05, + "loss": 1.2495, + "step": 2656 + }, + { + "epoch": 0.13, + "grad_norm": 1.1897784573473844, + "learning_rate": 1.950462669472807e-05, + "loss": 1.2061, + "step": 2657 + }, + { + "epoch": 0.13, + "grad_norm": 1.6778824954422504, + "learning_rate": 1.950414235832148e-05, + "loss": 1.4326, + "step": 2658 + }, + { + "epoch": 0.13, + "grad_norm": 0.953544044382961, + "learning_rate": 1.950365779127731e-05, + "loss": 1.2246, + "step": 2659 + }, + { + "epoch": 0.13, + "grad_norm": 0.9635973363183532, + "learning_rate": 1.950317299360731e-05, + "loss": 1.1284, + "step": 2660 + }, + { + "epoch": 0.13, + "grad_norm": 1.2991755335551114, + "learning_rate": 1.950268796532325e-05, + "loss": 1.2446, + "step": 2661 + }, + { + "epoch": 0.13, + "grad_norm": 1.1850769031460728, + "learning_rate": 1.95022027064369e-05, + "loss": 1.2559, + "step": 2662 + }, + { + "epoch": 0.13, + "grad_norm": 1.3261594403826105, + "learning_rate": 1.9501717216960035e-05, + "loss": 1.3042, + "step": 2663 + }, + { + "epoch": 0.13, + "grad_norm": 1.3376705315092543, + "learning_rate": 1.9501231496904435e-05, + "loss": 1.1865, + "step": 2664 + }, + { + "epoch": 0.13, + "grad_norm": 1.6840414577597647, + "learning_rate": 1.9500745546281893e-05, + "loss": 1.335, + "step": 2665 + }, + { + "epoch": 0.13, + "grad_norm": 0.9630277393012481, + "learning_rate": 1.9500259365104192e-05, + "loss": 1.3535, + "step": 2666 + }, + { + "epoch": 0.13, + "grad_norm": 1.1648728122722483, + "learning_rate": 1.949977295338314e-05, + "loss": 1.3013, + "step": 2667 + }, + { + "epoch": 0.13, + "grad_norm": 1.3870353307330323, + "learning_rate": 1.9499286311130533e-05, + "loss": 1.2734, + "step": 2668 + }, + { + "epoch": 0.13, + "grad_norm": 1.0051247445864258, + "learning_rate": 1.9498799438358186e-05, + "loss": 1.1218, + "step": 2669 + }, + { + "epoch": 0.13, + "grad_norm": 1.4692828817837424, + "learning_rate": 1.949831233507791e-05, + "loss": 1.2891, + "step": 2670 + }, + { + "epoch": 0.13, + "grad_norm": 1.1884340648055194, + "learning_rate": 1.949782500130153e-05, + "loss": 1.0845, + "step": 2671 + }, + { + "epoch": 0.13, + "grad_norm": 1.1801397884745513, + "learning_rate": 1.9497337437040867e-05, + "loss": 1.168, + "step": 2672 + }, + { + "epoch": 0.13, + "grad_norm": 1.1591175863288263, + "learning_rate": 1.9496849642307754e-05, + "loss": 1.1201, + "step": 2673 + }, + { + "epoch": 0.13, + "grad_norm": 1.2845953082500707, + "learning_rate": 1.949636161711403e-05, + "loss": 1.1519, + "step": 2674 + }, + { + "epoch": 0.13, + "grad_norm": 1.3130348856329377, + "learning_rate": 1.9495873361471538e-05, + "loss": 1.3472, + "step": 2675 + }, + { + "epoch": 0.13, + "grad_norm": 1.6390066783489718, + "learning_rate": 1.9495384875392125e-05, + "loss": 1.2969, + "step": 2676 + }, + { + "epoch": 0.13, + "grad_norm": 1.1702465578936851, + "learning_rate": 1.9494896158887647e-05, + "loss": 1.043, + "step": 2677 + }, + { + "epoch": 0.13, + "grad_norm": 1.4083714326241368, + "learning_rate": 1.949440721196996e-05, + "loss": 1.1934, + "step": 2678 + }, + { + "epoch": 0.13, + "grad_norm": 1.2217101762651914, + "learning_rate": 1.9493918034650934e-05, + "loss": 1.23, + "step": 2679 + }, + { + "epoch": 0.13, + "grad_norm": 1.3457682326286258, + "learning_rate": 1.9493428626942443e-05, + "loss": 1.3691, + "step": 2680 + }, + { + "epoch": 0.13, + "grad_norm": 1.2843865777156251, + "learning_rate": 1.9492938988856354e-05, + "loss": 1.3447, + "step": 2681 + }, + { + "epoch": 0.13, + "grad_norm": 1.3189724785444232, + "learning_rate": 1.949244912040455e-05, + "loss": 1.3823, + "step": 2682 + }, + { + "epoch": 0.13, + "grad_norm": 0.9085058924385045, + "learning_rate": 1.9491959021598927e-05, + "loss": 1.3394, + "step": 2683 + }, + { + "epoch": 0.13, + "grad_norm": 1.2680059982484344, + "learning_rate": 1.9491468692451373e-05, + "loss": 1.1914, + "step": 2684 + }, + { + "epoch": 0.13, + "grad_norm": 1.0906562293328212, + "learning_rate": 1.9490978132973785e-05, + "loss": 1.3433, + "step": 2685 + }, + { + "epoch": 0.13, + "grad_norm": 1.3892974353480247, + "learning_rate": 1.9490487343178072e-05, + "loss": 1.3438, + "step": 2686 + }, + { + "epoch": 0.13, + "grad_norm": 1.2405708105728084, + "learning_rate": 1.948999632307614e-05, + "loss": 1.2207, + "step": 2687 + }, + { + "epoch": 0.13, + "grad_norm": 1.1794882816821655, + "learning_rate": 1.9489505072679907e-05, + "loss": 1.0903, + "step": 2688 + }, + { + "epoch": 0.13, + "grad_norm": 1.125115722075324, + "learning_rate": 1.9489013592001293e-05, + "loss": 1.3315, + "step": 2689 + }, + { + "epoch": 0.13, + "grad_norm": 1.2514326790760983, + "learning_rate": 1.9488521881052225e-05, + "loss": 1.3057, + "step": 2690 + }, + { + "epoch": 0.13, + "grad_norm": 1.1834700329893788, + "learning_rate": 1.9488029939844634e-05, + "loss": 1.2227, + "step": 2691 + }, + { + "epoch": 0.13, + "grad_norm": 1.0646793545606132, + "learning_rate": 1.9487537768390465e-05, + "loss": 1.0684, + "step": 2692 + }, + { + "epoch": 0.13, + "grad_norm": 1.4078775873977178, + "learning_rate": 1.9487045366701652e-05, + "loss": 1.0378, + "step": 2693 + }, + { + "epoch": 0.13, + "grad_norm": 1.4567505641646958, + "learning_rate": 1.948655273479015e-05, + "loss": 1.1978, + "step": 2694 + }, + { + "epoch": 0.13, + "grad_norm": 1.3837350515519076, + "learning_rate": 1.948605987266791e-05, + "loss": 1.2319, + "step": 2695 + }, + { + "epoch": 0.13, + "grad_norm": 1.6274504076448262, + "learning_rate": 1.94855667803469e-05, + "loss": 1.3555, + "step": 2696 + }, + { + "epoch": 0.13, + "grad_norm": 1.2482950348343738, + "learning_rate": 1.9485073457839072e-05, + "loss": 1.1016, + "step": 2697 + }, + { + "epoch": 0.13, + "grad_norm": 1.508445004528826, + "learning_rate": 1.948457990515641e-05, + "loss": 1.2202, + "step": 2698 + }, + { + "epoch": 0.13, + "grad_norm": 1.5784204118832343, + "learning_rate": 1.9484086122310887e-05, + "loss": 1.3579, + "step": 2699 + }, + { + "epoch": 0.13, + "grad_norm": 1.195766630008451, + "learning_rate": 1.9483592109314487e-05, + "loss": 1.0229, + "step": 2700 + }, + { + "epoch": 0.13, + "grad_norm": 1.4892974489448472, + "learning_rate": 1.9483097866179194e-05, + "loss": 1.2578, + "step": 2701 + }, + { + "epoch": 0.13, + "grad_norm": 1.2523307189241772, + "learning_rate": 1.9482603392917006e-05, + "loss": 1.2104, + "step": 2702 + }, + { + "epoch": 0.13, + "grad_norm": 1.3213753338638972, + "learning_rate": 1.948210868953992e-05, + "loss": 1.2202, + "step": 2703 + }, + { + "epoch": 0.13, + "grad_norm": 1.2371515699912812, + "learning_rate": 1.9481613756059944e-05, + "loss": 1.1626, + "step": 2704 + }, + { + "epoch": 0.13, + "grad_norm": 1.1528619183220852, + "learning_rate": 1.9481118592489086e-05, + "loss": 1.241, + "step": 2705 + }, + { + "epoch": 0.13, + "grad_norm": 1.2205704708623017, + "learning_rate": 1.9480623198839362e-05, + "loss": 1.0713, + "step": 2706 + }, + { + "epoch": 0.13, + "grad_norm": 1.351149162062429, + "learning_rate": 1.9480127575122795e-05, + "loss": 1.1763, + "step": 2707 + }, + { + "epoch": 0.13, + "grad_norm": 1.3660114624059205, + "learning_rate": 1.9479631721351412e-05, + "loss": 1.2842, + "step": 2708 + }, + { + "epoch": 0.13, + "grad_norm": 1.1406711728619938, + "learning_rate": 1.947913563753724e-05, + "loss": 1.1855, + "step": 2709 + }, + { + "epoch": 0.13, + "grad_norm": 1.526836677224904, + "learning_rate": 1.947863932369233e-05, + "loss": 1.2378, + "step": 2710 + }, + { + "epoch": 0.13, + "grad_norm": 1.295356122452706, + "learning_rate": 1.9478142779828717e-05, + "loss": 1.1992, + "step": 2711 + }, + { + "epoch": 0.13, + "grad_norm": 1.4371026241903622, + "learning_rate": 1.9477646005958454e-05, + "loss": 1.1907, + "step": 2712 + }, + { + "epoch": 0.13, + "grad_norm": 1.293887046264935, + "learning_rate": 1.9477149002093595e-05, + "loss": 1.189, + "step": 2713 + }, + { + "epoch": 0.13, + "grad_norm": 0.7677306817929203, + "learning_rate": 1.9476651768246203e-05, + "loss": 1.1489, + "step": 2714 + }, + { + "epoch": 0.13, + "grad_norm": 1.1650026630874295, + "learning_rate": 1.947615430442834e-05, + "loss": 1.1592, + "step": 2715 + }, + { + "epoch": 0.13, + "grad_norm": 1.3561259175171914, + "learning_rate": 1.947565661065208e-05, + "loss": 1.333, + "step": 2716 + }, + { + "epoch": 0.13, + "grad_norm": 1.2982224443944543, + "learning_rate": 1.9475158686929498e-05, + "loss": 1.1304, + "step": 2717 + }, + { + "epoch": 0.13, + "grad_norm": 1.2277092994660086, + "learning_rate": 1.9474660533272684e-05, + "loss": 1.3174, + "step": 2718 + }, + { + "epoch": 0.13, + "grad_norm": 1.181934872824549, + "learning_rate": 1.9474162149693724e-05, + "loss": 1.0298, + "step": 2719 + }, + { + "epoch": 0.13, + "grad_norm": 1.121761323390677, + "learning_rate": 1.947366353620471e-05, + "loss": 1.2305, + "step": 2720 + }, + { + "epoch": 0.13, + "grad_norm": 1.0986682076128282, + "learning_rate": 1.947316469281774e-05, + "loss": 1.1812, + "step": 2721 + }, + { + "epoch": 0.13, + "grad_norm": 1.2771962711350335, + "learning_rate": 1.9472665619544927e-05, + "loss": 1.0493, + "step": 2722 + }, + { + "epoch": 0.13, + "grad_norm": 1.2470280695813276, + "learning_rate": 1.9472166316398376e-05, + "loss": 1.2861, + "step": 2723 + }, + { + "epoch": 0.13, + "grad_norm": 1.248492657409618, + "learning_rate": 1.9471666783390204e-05, + "loss": 1.0554, + "step": 2724 + }, + { + "epoch": 0.13, + "grad_norm": 1.4363602022184598, + "learning_rate": 1.9471167020532533e-05, + "loss": 1.1252, + "step": 2725 + }, + { + "epoch": 0.13, + "grad_norm": 1.3951853943635082, + "learning_rate": 1.9470667027837497e-05, + "loss": 1.2505, + "step": 2726 + }, + { + "epoch": 0.13, + "grad_norm": 1.3385909599332624, + "learning_rate": 1.9470166805317217e-05, + "loss": 1.1313, + "step": 2727 + }, + { + "epoch": 0.13, + "grad_norm": 1.6498237244370961, + "learning_rate": 1.9469666352983845e-05, + "loss": 1.2231, + "step": 2728 + }, + { + "epoch": 0.13, + "grad_norm": 1.3026633112911212, + "learning_rate": 1.946916567084952e-05, + "loss": 1.1807, + "step": 2729 + }, + { + "epoch": 0.13, + "grad_norm": 1.2384827840532582, + "learning_rate": 1.9468664758926393e-05, + "loss": 1.2754, + "step": 2730 + }, + { + "epoch": 0.13, + "grad_norm": 1.5013219580003692, + "learning_rate": 1.9468163617226613e-05, + "loss": 1.1187, + "step": 2731 + }, + { + "epoch": 0.13, + "grad_norm": 1.2395617381795718, + "learning_rate": 1.9467662245762354e-05, + "loss": 1.1982, + "step": 2732 + }, + { + "epoch": 0.13, + "grad_norm": 1.287145630682147, + "learning_rate": 1.9467160644545767e-05, + "loss": 1.3584, + "step": 2733 + }, + { + "epoch": 0.13, + "grad_norm": 0.9605145961361162, + "learning_rate": 1.946665881358904e-05, + "loss": 1.2373, + "step": 2734 + }, + { + "epoch": 0.13, + "grad_norm": 1.592310642093392, + "learning_rate": 1.9466156752904344e-05, + "loss": 1.3408, + "step": 2735 + }, + { + "epoch": 0.13, + "grad_norm": 1.3522835239576694, + "learning_rate": 1.9465654462503862e-05, + "loss": 1.2944, + "step": 2736 + }, + { + "epoch": 0.13, + "grad_norm": 1.0767142539538348, + "learning_rate": 1.946515194239978e-05, + "loss": 1.2373, + "step": 2737 + }, + { + "epoch": 0.13, + "grad_norm": 1.2732301832076856, + "learning_rate": 1.94646491926043e-05, + "loss": 1.2183, + "step": 2738 + }, + { + "epoch": 0.13, + "grad_norm": 1.3273572409543741, + "learning_rate": 1.9464146213129615e-05, + "loss": 1.249, + "step": 2739 + }, + { + "epoch": 0.13, + "grad_norm": 1.2203196314608338, + "learning_rate": 1.9463643003987938e-05, + "loss": 1.2061, + "step": 2740 + }, + { + "epoch": 0.13, + "grad_norm": 1.2888686118285542, + "learning_rate": 1.9463139565191476e-05, + "loss": 1.1279, + "step": 2741 + }, + { + "epoch": 0.13, + "grad_norm": 1.6738485890431998, + "learning_rate": 1.9462635896752448e-05, + "loss": 1.1709, + "step": 2742 + }, + { + "epoch": 0.13, + "grad_norm": 1.5179778655302758, + "learning_rate": 1.9462131998683073e-05, + "loss": 1.2949, + "step": 2743 + }, + { + "epoch": 0.13, + "grad_norm": 1.2581998347242902, + "learning_rate": 1.9461627870995585e-05, + "loss": 1.1841, + "step": 2744 + }, + { + "epoch": 0.13, + "grad_norm": 1.1706289263251706, + "learning_rate": 1.9461123513702208e-05, + "loss": 1.1514, + "step": 2745 + }, + { + "epoch": 0.13, + "grad_norm": 1.3540656260335235, + "learning_rate": 1.9460618926815195e-05, + "loss": 1.1431, + "step": 2746 + }, + { + "epoch": 0.13, + "grad_norm": 1.181371854506658, + "learning_rate": 1.9460114110346775e-05, + "loss": 1.3228, + "step": 2747 + }, + { + "epoch": 0.13, + "grad_norm": 1.3060698985202708, + "learning_rate": 1.9459609064309212e-05, + "loss": 1.2285, + "step": 2748 + }, + { + "epoch": 0.13, + "grad_norm": 1.2193820099135526, + "learning_rate": 1.9459103788714756e-05, + "loss": 1.3706, + "step": 2749 + }, + { + "epoch": 0.13, + "grad_norm": 1.2632294251288847, + "learning_rate": 1.945859828357567e-05, + "loss": 1.2524, + "step": 2750 + }, + { + "epoch": 0.13, + "grad_norm": 1.292272289006726, + "learning_rate": 1.945809254890422e-05, + "loss": 1.231, + "step": 2751 + }, + { + "epoch": 0.13, + "grad_norm": 1.285518832559783, + "learning_rate": 1.9457586584712678e-05, + "loss": 1.2417, + "step": 2752 + }, + { + "epoch": 0.13, + "grad_norm": 1.377721273313054, + "learning_rate": 1.9457080391013325e-05, + "loss": 1.3311, + "step": 2753 + }, + { + "epoch": 0.13, + "grad_norm": 1.3065472062513732, + "learning_rate": 1.945657396781844e-05, + "loss": 1.2876, + "step": 2754 + }, + { + "epoch": 0.13, + "grad_norm": 1.075218528309244, + "learning_rate": 1.945606731514032e-05, + "loss": 1.252, + "step": 2755 + }, + { + "epoch": 0.13, + "grad_norm": 1.1053559904967885, + "learning_rate": 1.9455560432991253e-05, + "loss": 1.1362, + "step": 2756 + }, + { + "epoch": 0.13, + "grad_norm": 1.1445442057947144, + "learning_rate": 1.9455053321383542e-05, + "loss": 1.27, + "step": 2757 + }, + { + "epoch": 0.13, + "grad_norm": 1.3036088298767414, + "learning_rate": 1.9454545980329493e-05, + "loss": 1.271, + "step": 2758 + }, + { + "epoch": 0.13, + "grad_norm": 1.2492236477141547, + "learning_rate": 1.945403840984142e-05, + "loss": 1.2715, + "step": 2759 + }, + { + "epoch": 0.13, + "grad_norm": 1.1645602544295508, + "learning_rate": 1.9453530609931635e-05, + "loss": 1.1592, + "step": 2760 + }, + { + "epoch": 0.13, + "grad_norm": 1.3344633931376393, + "learning_rate": 1.9453022580612468e-05, + "loss": 1.3325, + "step": 2761 + }, + { + "epoch": 0.13, + "grad_norm": 1.500064226615529, + "learning_rate": 1.9452514321896242e-05, + "loss": 1.2261, + "step": 2762 + }, + { + "epoch": 0.13, + "grad_norm": 1.3835826017292094, + "learning_rate": 1.945200583379529e-05, + "loss": 1.0522, + "step": 2763 + }, + { + "epoch": 0.13, + "grad_norm": 1.4977693613637881, + "learning_rate": 1.9451497116321954e-05, + "loss": 1.1924, + "step": 2764 + }, + { + "epoch": 0.13, + "grad_norm": 1.4334526378868404, + "learning_rate": 1.9450988169488577e-05, + "loss": 1.2202, + "step": 2765 + }, + { + "epoch": 0.13, + "grad_norm": 1.1886315228731148, + "learning_rate": 1.9450478993307517e-05, + "loss": 1.2563, + "step": 2766 + }, + { + "epoch": 0.13, + "grad_norm": 1.4960322913632391, + "learning_rate": 1.944996958779112e-05, + "loss": 1.3008, + "step": 2767 + }, + { + "epoch": 0.13, + "grad_norm": 1.2168292510914869, + "learning_rate": 1.9449459952951756e-05, + "loss": 1.124, + "step": 2768 + }, + { + "epoch": 0.13, + "grad_norm": 1.2263149321092663, + "learning_rate": 1.944895008880179e-05, + "loss": 1.2905, + "step": 2769 + }, + { + "epoch": 0.13, + "grad_norm": 1.1176357673464454, + "learning_rate": 1.944843999535359e-05, + "loss": 1.2422, + "step": 2770 + }, + { + "epoch": 0.13, + "grad_norm": 1.8034217207186087, + "learning_rate": 1.944792967261954e-05, + "loss": 1.6475, + "step": 2771 + }, + { + "epoch": 0.13, + "grad_norm": 1.1444175143709956, + "learning_rate": 1.9447419120612018e-05, + "loss": 1.3428, + "step": 2772 + }, + { + "epoch": 0.13, + "grad_norm": 1.0889057681188112, + "learning_rate": 1.9446908339343422e-05, + "loss": 1.1899, + "step": 2773 + }, + { + "epoch": 0.13, + "grad_norm": 1.2913008673174342, + "learning_rate": 1.9446397328826145e-05, + "loss": 1.3525, + "step": 2774 + }, + { + "epoch": 0.13, + "grad_norm": 1.314183139570361, + "learning_rate": 1.944588608907258e-05, + "loss": 1.2114, + "step": 2775 + }, + { + "epoch": 0.13, + "grad_norm": 1.2383782063969953, + "learning_rate": 1.9445374620095142e-05, + "loss": 1.1538, + "step": 2776 + }, + { + "epoch": 0.13, + "grad_norm": 1.433031193669727, + "learning_rate": 1.944486292190624e-05, + "loss": 1.2524, + "step": 2777 + }, + { + "epoch": 0.13, + "grad_norm": 1.5220803640520921, + "learning_rate": 1.944435099451829e-05, + "loss": 1.2231, + "step": 2778 + }, + { + "epoch": 0.13, + "grad_norm": 1.348756151540924, + "learning_rate": 1.9443838837943717e-05, + "loss": 1.1802, + "step": 2779 + }, + { + "epoch": 0.13, + "grad_norm": 1.230658827031683, + "learning_rate": 1.9443326452194948e-05, + "loss": 1.1519, + "step": 2780 + }, + { + "epoch": 0.13, + "grad_norm": 1.4795190677676013, + "learning_rate": 1.9442813837284416e-05, + "loss": 1.2837, + "step": 2781 + }, + { + "epoch": 0.13, + "grad_norm": 1.2253175063018085, + "learning_rate": 1.9442300993224568e-05, + "loss": 1.1782, + "step": 2782 + }, + { + "epoch": 0.13, + "grad_norm": 1.1116310288251972, + "learning_rate": 1.9441787920027843e-05, + "loss": 1.1006, + "step": 2783 + }, + { + "epoch": 0.13, + "grad_norm": 1.185254908474156, + "learning_rate": 1.944127461770669e-05, + "loss": 1.0732, + "step": 2784 + }, + { + "epoch": 0.13, + "grad_norm": 1.2034505974271008, + "learning_rate": 1.9440761086273564e-05, + "loss": 1.3276, + "step": 2785 + }, + { + "epoch": 0.13, + "grad_norm": 1.1833301613336273, + "learning_rate": 1.944024732574094e-05, + "loss": 1.2153, + "step": 2786 + }, + { + "epoch": 0.13, + "grad_norm": 1.324195497593842, + "learning_rate": 1.9439733336121267e-05, + "loss": 1.2314, + "step": 2787 + }, + { + "epoch": 0.13, + "grad_norm": 1.2966746003677356, + "learning_rate": 1.9439219117427034e-05, + "loss": 1.3203, + "step": 2788 + }, + { + "epoch": 0.13, + "grad_norm": 1.0775020684596006, + "learning_rate": 1.943870466967071e-05, + "loss": 1.2168, + "step": 2789 + }, + { + "epoch": 0.13, + "grad_norm": 1.2704756976158058, + "learning_rate": 1.943818999286478e-05, + "loss": 1.2061, + "step": 2790 + }, + { + "epoch": 0.13, + "grad_norm": 1.2884705986945755, + "learning_rate": 1.9437675087021736e-05, + "loss": 1.1616, + "step": 2791 + }, + { + "epoch": 0.13, + "grad_norm": 1.2689655803543933, + "learning_rate": 1.9437159952154078e-05, + "loss": 1.2549, + "step": 2792 + }, + { + "epoch": 0.13, + "grad_norm": 1.1270979399452048, + "learning_rate": 1.9436644588274295e-05, + "loss": 1.2104, + "step": 2793 + }, + { + "epoch": 0.13, + "grad_norm": 1.3237854685338393, + "learning_rate": 1.9436128995394903e-05, + "loss": 1.3237, + "step": 2794 + }, + { + "epoch": 0.13, + "grad_norm": 1.3714890476815498, + "learning_rate": 1.943561317352841e-05, + "loss": 1.2314, + "step": 2795 + }, + { + "epoch": 0.13, + "grad_norm": 1.2147327364398894, + "learning_rate": 1.9435097122687337e-05, + "loss": 1.0242, + "step": 2796 + }, + { + "epoch": 0.13, + "grad_norm": 1.2251699293128333, + "learning_rate": 1.94345808428842e-05, + "loss": 1.144, + "step": 2797 + }, + { + "epoch": 0.13, + "grad_norm": 1.0229270236297132, + "learning_rate": 1.943406433413154e-05, + "loss": 1.0718, + "step": 2798 + }, + { + "epoch": 0.13, + "grad_norm": 1.3890659210640006, + "learning_rate": 1.9433547596441877e-05, + "loss": 1.3267, + "step": 2799 + }, + { + "epoch": 0.13, + "grad_norm": 0.9435971767170067, + "learning_rate": 1.9433030629827757e-05, + "loss": 1.1553, + "step": 2800 + }, + { + "epoch": 0.13, + "grad_norm": 1.016031788757542, + "learning_rate": 1.9432513434301727e-05, + "loss": 1.2246, + "step": 2801 + }, + { + "epoch": 0.13, + "grad_norm": 1.1638690201680657, + "learning_rate": 1.943199600987633e-05, + "loss": 1.3799, + "step": 2802 + }, + { + "epoch": 0.13, + "grad_norm": 1.4807006934398304, + "learning_rate": 1.943147835656414e-05, + "loss": 1.1758, + "step": 2803 + }, + { + "epoch": 0.13, + "grad_norm": 1.2804400313523154, + "learning_rate": 1.9430960474377697e-05, + "loss": 1.2778, + "step": 2804 + }, + { + "epoch": 0.13, + "grad_norm": 1.4479869707589912, + "learning_rate": 1.9430442363329583e-05, + "loss": 1.3037, + "step": 2805 + }, + { + "epoch": 0.13, + "grad_norm": 1.268016148060741, + "learning_rate": 1.9429924023432364e-05, + "loss": 1.1455, + "step": 2806 + }, + { + "epoch": 0.14, + "grad_norm": 1.1999081222443693, + "learning_rate": 1.9429405454698624e-05, + "loss": 1.1465, + "step": 2807 + }, + { + "epoch": 0.14, + "grad_norm": 1.344376244059421, + "learning_rate": 1.9428886657140945e-05, + "loss": 1.2593, + "step": 2808 + }, + { + "epoch": 0.14, + "grad_norm": 1.4701314379464177, + "learning_rate": 1.9428367630771915e-05, + "loss": 1.1157, + "step": 2809 + }, + { + "epoch": 0.14, + "grad_norm": 1.1431747322150458, + "learning_rate": 1.942784837560413e-05, + "loss": 1.2329, + "step": 2810 + }, + { + "epoch": 0.14, + "grad_norm": 1.1353575488991032, + "learning_rate": 1.9427328891650194e-05, + "loss": 1.1895, + "step": 2811 + }, + { + "epoch": 0.14, + "grad_norm": 1.223411346174628, + "learning_rate": 1.9426809178922706e-05, + "loss": 1.3247, + "step": 2812 + }, + { + "epoch": 0.14, + "grad_norm": 1.4771901573125445, + "learning_rate": 1.9426289237434286e-05, + "loss": 1.2012, + "step": 2813 + }, + { + "epoch": 0.14, + "grad_norm": 1.4287777913212347, + "learning_rate": 1.9425769067197548e-05, + "loss": 1.3433, + "step": 2814 + }, + { + "epoch": 0.14, + "grad_norm": 1.0946886583466218, + "learning_rate": 1.942524866822511e-05, + "loss": 1.0933, + "step": 2815 + }, + { + "epoch": 0.14, + "grad_norm": 1.3088641905831098, + "learning_rate": 1.9424728040529612e-05, + "loss": 1.0471, + "step": 2816 + }, + { + "epoch": 0.14, + "grad_norm": 1.2290919181640083, + "learning_rate": 1.9424207184123677e-05, + "loss": 1.3198, + "step": 2817 + }, + { + "epoch": 0.14, + "grad_norm": 1.0831557333734951, + "learning_rate": 1.942368609901995e-05, + "loss": 0.9758, + "step": 2818 + }, + { + "epoch": 0.14, + "grad_norm": 1.3183986851601692, + "learning_rate": 1.9423164785231078e-05, + "loss": 1.2158, + "step": 2819 + }, + { + "epoch": 0.14, + "grad_norm": 1.1323878749227911, + "learning_rate": 1.942264324276971e-05, + "loss": 1.022, + "step": 2820 + }, + { + "epoch": 0.14, + "grad_norm": 1.1829668326401326, + "learning_rate": 1.94221214716485e-05, + "loss": 1.0972, + "step": 2821 + }, + { + "epoch": 0.14, + "grad_norm": 1.2783245566215902, + "learning_rate": 1.9421599471880108e-05, + "loss": 1.21, + "step": 2822 + }, + { + "epoch": 0.14, + "grad_norm": 1.2009905850972302, + "learning_rate": 1.9421077243477208e-05, + "loss": 1.3799, + "step": 2823 + }, + { + "epoch": 0.14, + "grad_norm": 1.2397979054599495, + "learning_rate": 1.942055478645247e-05, + "loss": 1.3169, + "step": 2824 + }, + { + "epoch": 0.14, + "grad_norm": 1.180182993732635, + "learning_rate": 1.942003210081857e-05, + "loss": 1.2998, + "step": 2825 + }, + { + "epoch": 0.14, + "grad_norm": 1.631284546716238, + "learning_rate": 1.9419509186588196e-05, + "loss": 1.272, + "step": 2826 + }, + { + "epoch": 0.14, + "grad_norm": 1.0786286689935665, + "learning_rate": 1.9418986043774036e-05, + "loss": 1.1714, + "step": 2827 + }, + { + "epoch": 0.14, + "grad_norm": 1.2827376010283227, + "learning_rate": 1.9418462672388784e-05, + "loss": 1.2622, + "step": 2828 + }, + { + "epoch": 0.14, + "grad_norm": 1.1964889851275509, + "learning_rate": 1.941793907244514e-05, + "loss": 1.1714, + "step": 2829 + }, + { + "epoch": 0.14, + "grad_norm": 1.4398068155600379, + "learning_rate": 1.941741524395581e-05, + "loss": 1.3882, + "step": 2830 + }, + { + "epoch": 0.14, + "grad_norm": 1.2662431762458628, + "learning_rate": 1.9416891186933516e-05, + "loss": 1.0977, + "step": 2831 + }, + { + "epoch": 0.14, + "grad_norm": 1.3637622733023576, + "learning_rate": 1.941636690139096e-05, + "loss": 1.1455, + "step": 2832 + }, + { + "epoch": 0.14, + "grad_norm": 1.1228120769985024, + "learning_rate": 1.9415842387340875e-05, + "loss": 1.1284, + "step": 2833 + }, + { + "epoch": 0.14, + "grad_norm": 1.2381627564542637, + "learning_rate": 1.9415317644795984e-05, + "loss": 1.334, + "step": 2834 + }, + { + "epoch": 0.14, + "grad_norm": 1.614476748504299, + "learning_rate": 1.9414792673769028e-05, + "loss": 1.2759, + "step": 2835 + }, + { + "epoch": 0.14, + "grad_norm": 0.8831736471113798, + "learning_rate": 1.9414267474272735e-05, + "loss": 1.1509, + "step": 2836 + }, + { + "epoch": 0.14, + "grad_norm": 1.1343365978856845, + "learning_rate": 1.9413742046319863e-05, + "loss": 1.1235, + "step": 2837 + }, + { + "epoch": 0.14, + "grad_norm": 1.1277340059972034, + "learning_rate": 1.941321638992315e-05, + "loss": 1.1104, + "step": 2838 + }, + { + "epoch": 0.14, + "grad_norm": 1.3081434152123395, + "learning_rate": 1.9412690505095363e-05, + "loss": 1.3784, + "step": 2839 + }, + { + "epoch": 0.14, + "grad_norm": 1.1820263421719863, + "learning_rate": 1.941216439184926e-05, + "loss": 1.1406, + "step": 2840 + }, + { + "epoch": 0.14, + "grad_norm": 1.522730300341463, + "learning_rate": 1.9411638050197605e-05, + "loss": 1.4102, + "step": 2841 + }, + { + "epoch": 0.14, + "grad_norm": 1.2497431120805444, + "learning_rate": 1.9411111480153174e-05, + "loss": 1.2256, + "step": 2842 + }, + { + "epoch": 0.14, + "grad_norm": 1.3342194215737273, + "learning_rate": 1.9410584681728745e-05, + "loss": 1.2856, + "step": 2843 + }, + { + "epoch": 0.14, + "grad_norm": 1.3671614051186127, + "learning_rate": 1.94100576549371e-05, + "loss": 1.1626, + "step": 2844 + }, + { + "epoch": 0.14, + "grad_norm": 1.1948901482433931, + "learning_rate": 1.9409530399791026e-05, + "loss": 1.2554, + "step": 2845 + }, + { + "epoch": 0.14, + "grad_norm": 1.1420759416532227, + "learning_rate": 1.940900291630333e-05, + "loss": 1.1836, + "step": 2846 + }, + { + "epoch": 0.14, + "grad_norm": 1.2146583496516814, + "learning_rate": 1.94084752044868e-05, + "loss": 1.2368, + "step": 2847 + }, + { + "epoch": 0.14, + "grad_norm": 1.1691374353048996, + "learning_rate": 1.9407947264354242e-05, + "loss": 1.4341, + "step": 2848 + }, + { + "epoch": 0.14, + "grad_norm": 1.3914484795982989, + "learning_rate": 1.9407419095918477e-05, + "loss": 1.3584, + "step": 2849 + }, + { + "epoch": 0.14, + "grad_norm": 1.2906483851568866, + "learning_rate": 1.9406890699192316e-05, + "loss": 1.4043, + "step": 2850 + }, + { + "epoch": 0.14, + "grad_norm": 1.2314056440264272, + "learning_rate": 1.9406362074188584e-05, + "loss": 1.1089, + "step": 2851 + }, + { + "epoch": 0.14, + "grad_norm": 1.387199764006199, + "learning_rate": 1.9405833220920104e-05, + "loss": 1.3174, + "step": 2852 + }, + { + "epoch": 0.14, + "grad_norm": 1.1937886869506873, + "learning_rate": 1.9405304139399715e-05, + "loss": 1.1172, + "step": 2853 + }, + { + "epoch": 0.14, + "grad_norm": 1.0330574898950282, + "learning_rate": 1.9404774829640254e-05, + "loss": 1.2061, + "step": 2854 + }, + { + "epoch": 0.14, + "grad_norm": 1.435143103603759, + "learning_rate": 1.9404245291654568e-05, + "loss": 1.2505, + "step": 2855 + }, + { + "epoch": 0.14, + "grad_norm": 1.1900085016834696, + "learning_rate": 1.9403715525455503e-05, + "loss": 1.1272, + "step": 2856 + }, + { + "epoch": 0.14, + "grad_norm": 1.1741867093081544, + "learning_rate": 1.9403185531055915e-05, + "loss": 1.2363, + "step": 2857 + }, + { + "epoch": 0.14, + "grad_norm": 0.9827998480717662, + "learning_rate": 1.9402655308468678e-05, + "loss": 1.2646, + "step": 2858 + }, + { + "epoch": 0.14, + "grad_norm": 1.3321912662631679, + "learning_rate": 1.940212485770664e-05, + "loss": 1.2554, + "step": 2859 + }, + { + "epoch": 0.14, + "grad_norm": 1.4227229296680717, + "learning_rate": 1.9401594178782686e-05, + "loss": 1.1904, + "step": 2860 + }, + { + "epoch": 0.14, + "grad_norm": 1.2650474875443942, + "learning_rate": 1.940106327170969e-05, + "loss": 1.2798, + "step": 2861 + }, + { + "epoch": 0.14, + "grad_norm": 1.295340436165034, + "learning_rate": 1.940053213650053e-05, + "loss": 1.1738, + "step": 2862 + }, + { + "epoch": 0.14, + "grad_norm": 1.3602678315128385, + "learning_rate": 1.9400000773168107e-05, + "loss": 1.2959, + "step": 2863 + }, + { + "epoch": 0.14, + "grad_norm": 1.1744440207663347, + "learning_rate": 1.939946918172531e-05, + "loss": 1.2168, + "step": 2864 + }, + { + "epoch": 0.14, + "grad_norm": 1.2350274681748403, + "learning_rate": 1.9398937362185037e-05, + "loss": 1.1997, + "step": 2865 + }, + { + "epoch": 0.14, + "grad_norm": 1.1954224388447448, + "learning_rate": 1.9398405314560197e-05, + "loss": 1.2983, + "step": 2866 + }, + { + "epoch": 0.14, + "grad_norm": 0.6766409597139228, + "learning_rate": 1.93978730388637e-05, + "loss": 1.1377, + "step": 2867 + }, + { + "epoch": 0.14, + "grad_norm": 1.287479590998543, + "learning_rate": 1.939734053510846e-05, + "loss": 1.377, + "step": 2868 + }, + { + "epoch": 0.14, + "grad_norm": 1.3014899293005142, + "learning_rate": 1.9396807803307405e-05, + "loss": 1.3901, + "step": 2869 + }, + { + "epoch": 0.14, + "grad_norm": 1.2910748736843476, + "learning_rate": 1.9396274843473455e-05, + "loss": 1.1484, + "step": 2870 + }, + { + "epoch": 0.14, + "grad_norm": 1.2284481350730798, + "learning_rate": 1.9395741655619554e-05, + "loss": 1.084, + "step": 2871 + }, + { + "epoch": 0.14, + "grad_norm": 1.0081170246234077, + "learning_rate": 1.939520823975863e-05, + "loss": 1.2383, + "step": 2872 + }, + { + "epoch": 0.14, + "grad_norm": 1.0848551407598979, + "learning_rate": 1.9394674595903635e-05, + "loss": 1.2505, + "step": 2873 + }, + { + "epoch": 0.14, + "grad_norm": 1.3138431610009718, + "learning_rate": 1.9394140724067515e-05, + "loss": 1.2227, + "step": 2874 + }, + { + "epoch": 0.14, + "grad_norm": 1.2475545719264889, + "learning_rate": 1.9393606624263228e-05, + "loss": 1.3516, + "step": 2875 + }, + { + "epoch": 0.14, + "grad_norm": 1.223908275107291, + "learning_rate": 1.9393072296503733e-05, + "loss": 1.1172, + "step": 2876 + }, + { + "epoch": 0.14, + "grad_norm": 1.207440857658899, + "learning_rate": 1.9392537740801997e-05, + "loss": 1.1533, + "step": 2877 + }, + { + "epoch": 0.14, + "grad_norm": 0.849340513623634, + "learning_rate": 1.9392002957170994e-05, + "loss": 1.0964, + "step": 2878 + }, + { + "epoch": 0.14, + "grad_norm": 1.2255856122266813, + "learning_rate": 1.9391467945623698e-05, + "loss": 1.1108, + "step": 2879 + }, + { + "epoch": 0.14, + "grad_norm": 1.3291703189901876, + "learning_rate": 1.9390932706173095e-05, + "loss": 1.1284, + "step": 2880 + }, + { + "epoch": 0.14, + "grad_norm": 1.2074881202673384, + "learning_rate": 1.9390397238832173e-05, + "loss": 1.2275, + "step": 2881 + }, + { + "epoch": 0.14, + "grad_norm": 1.3632766844384925, + "learning_rate": 1.9389861543613927e-05, + "loss": 1.0986, + "step": 2882 + }, + { + "epoch": 0.14, + "grad_norm": 1.5500235121861214, + "learning_rate": 1.9389325620531356e-05, + "loss": 1.355, + "step": 2883 + }, + { + "epoch": 0.14, + "grad_norm": 1.4438013724127303, + "learning_rate": 1.9388789469597464e-05, + "loss": 1.1875, + "step": 2884 + }, + { + "epoch": 0.14, + "grad_norm": 1.381271555245736, + "learning_rate": 1.9388253090825263e-05, + "loss": 1.2827, + "step": 2885 + }, + { + "epoch": 0.14, + "grad_norm": 1.33160012904605, + "learning_rate": 1.938771648422777e-05, + "loss": 1.25, + "step": 2886 + }, + { + "epoch": 0.14, + "grad_norm": 1.2600183244002516, + "learning_rate": 1.9387179649818007e-05, + "loss": 1.1997, + "step": 2887 + }, + { + "epoch": 0.14, + "grad_norm": 1.2691824222201216, + "learning_rate": 1.9386642587609002e-05, + "loss": 1.1123, + "step": 2888 + }, + { + "epoch": 0.14, + "grad_norm": 1.3222823154899834, + "learning_rate": 1.9386105297613782e-05, + "loss": 1.2236, + "step": 2889 + }, + { + "epoch": 0.14, + "grad_norm": 1.727656873841732, + "learning_rate": 1.9385567779845392e-05, + "loss": 1.397, + "step": 2890 + }, + { + "epoch": 0.14, + "grad_norm": 1.1716748045652734, + "learning_rate": 1.9385030034316873e-05, + "loss": 1.271, + "step": 2891 + }, + { + "epoch": 0.14, + "grad_norm": 1.0743448966645597, + "learning_rate": 1.938449206104128e-05, + "loss": 1.2446, + "step": 2892 + }, + { + "epoch": 0.14, + "grad_norm": 1.2440447271779054, + "learning_rate": 1.9383953860031658e-05, + "loss": 1.2192, + "step": 2893 + }, + { + "epoch": 0.14, + "grad_norm": 1.2234636328430466, + "learning_rate": 1.9383415431301075e-05, + "loss": 1.354, + "step": 2894 + }, + { + "epoch": 0.14, + "grad_norm": 1.2926155694284962, + "learning_rate": 1.9382876774862594e-05, + "loss": 1.2871, + "step": 2895 + }, + { + "epoch": 0.14, + "grad_norm": 1.1247259686362867, + "learning_rate": 1.938233789072929e-05, + "loss": 1.1035, + "step": 2896 + }, + { + "epoch": 0.14, + "grad_norm": 1.2908942394913634, + "learning_rate": 1.938179877891424e-05, + "loss": 1.2627, + "step": 2897 + }, + { + "epoch": 0.14, + "grad_norm": 1.0951613496511399, + "learning_rate": 1.9381259439430517e-05, + "loss": 1.2178, + "step": 2898 + }, + { + "epoch": 0.14, + "grad_norm": 1.4864806510464303, + "learning_rate": 1.9380719872291223e-05, + "loss": 1.3579, + "step": 2899 + }, + { + "epoch": 0.14, + "grad_norm": 1.5601123950092, + "learning_rate": 1.9380180077509444e-05, + "loss": 1.2017, + "step": 2900 + }, + { + "epoch": 0.14, + "grad_norm": 1.1336975833300569, + "learning_rate": 1.937964005509828e-05, + "loss": 1.248, + "step": 2901 + }, + { + "epoch": 0.14, + "grad_norm": 1.3093503858395548, + "learning_rate": 1.9379099805070836e-05, + "loss": 1.1987, + "step": 2902 + }, + { + "epoch": 0.14, + "grad_norm": 1.2516376160662592, + "learning_rate": 1.9378559327440222e-05, + "loss": 1.2012, + "step": 2903 + }, + { + "epoch": 0.14, + "grad_norm": 1.3418745248182566, + "learning_rate": 1.9378018622219557e-05, + "loss": 1.1094, + "step": 2904 + }, + { + "epoch": 0.14, + "grad_norm": 1.3590443223004038, + "learning_rate": 1.9377477689421958e-05, + "loss": 1.2173, + "step": 2905 + }, + { + "epoch": 0.14, + "grad_norm": 1.3145202139020222, + "learning_rate": 1.9376936529060556e-05, + "loss": 1.2744, + "step": 2906 + }, + { + "epoch": 0.14, + "grad_norm": 1.3943386221279719, + "learning_rate": 1.9376395141148475e-05, + "loss": 1.2988, + "step": 2907 + }, + { + "epoch": 0.14, + "grad_norm": 1.3135279656858143, + "learning_rate": 1.9375853525698866e-05, + "loss": 1.4087, + "step": 2908 + }, + { + "epoch": 0.14, + "grad_norm": 1.6335412643943827, + "learning_rate": 1.9375311682724863e-05, + "loss": 1.2163, + "step": 2909 + }, + { + "epoch": 0.14, + "grad_norm": 1.1971603231447447, + "learning_rate": 1.9374769612239617e-05, + "loss": 1.1895, + "step": 2910 + }, + { + "epoch": 0.14, + "grad_norm": 1.241647064541242, + "learning_rate": 1.937422731425628e-05, + "loss": 1.2295, + "step": 2911 + }, + { + "epoch": 0.14, + "grad_norm": 1.6091302461929209, + "learning_rate": 1.9373684788788018e-05, + "loss": 1.1665, + "step": 2912 + }, + { + "epoch": 0.14, + "grad_norm": 1.526103089677711, + "learning_rate": 1.937314203584799e-05, + "loss": 1.3149, + "step": 2913 + }, + { + "epoch": 0.14, + "grad_norm": 1.3940387424809453, + "learning_rate": 1.9372599055449374e-05, + "loss": 1.2944, + "step": 2914 + }, + { + "epoch": 0.14, + "grad_norm": 1.2232665457251142, + "learning_rate": 1.937205584760534e-05, + "loss": 1.0942, + "step": 2915 + }, + { + "epoch": 0.14, + "grad_norm": 0.869537510745098, + "learning_rate": 1.9371512412329078e-05, + "loss": 1.1807, + "step": 2916 + }, + { + "epoch": 0.14, + "grad_norm": 1.3508680641481063, + "learning_rate": 1.9370968749633764e-05, + "loss": 1.2188, + "step": 2917 + }, + { + "epoch": 0.14, + "grad_norm": 1.532186470029494, + "learning_rate": 1.9370424859532598e-05, + "loss": 1.1377, + "step": 2918 + }, + { + "epoch": 0.14, + "grad_norm": 1.5170045376298094, + "learning_rate": 1.9369880742038783e-05, + "loss": 1.4053, + "step": 2919 + }, + { + "epoch": 0.14, + "grad_norm": 1.2588222663125679, + "learning_rate": 1.9369336397165512e-05, + "loss": 1.2554, + "step": 2920 + }, + { + "epoch": 0.14, + "grad_norm": 1.4576297768308812, + "learning_rate": 1.9368791824926006e-05, + "loss": 1.2891, + "step": 2921 + }, + { + "epoch": 0.14, + "grad_norm": 1.2020014702773025, + "learning_rate": 1.9368247025333472e-05, + "loss": 1.3325, + "step": 2922 + }, + { + "epoch": 0.14, + "grad_norm": 1.210431747429196, + "learning_rate": 1.9367701998401137e-05, + "loss": 1.2207, + "step": 2923 + }, + { + "epoch": 0.14, + "grad_norm": 1.2691949457388636, + "learning_rate": 1.9367156744142218e-05, + "loss": 1.2441, + "step": 2924 + }, + { + "epoch": 0.14, + "grad_norm": 1.246384634578654, + "learning_rate": 1.936661126256996e-05, + "loss": 1.1211, + "step": 2925 + }, + { + "epoch": 0.14, + "grad_norm": 1.396193353152672, + "learning_rate": 1.9366065553697586e-05, + "loss": 1.1299, + "step": 2926 + }, + { + "epoch": 0.14, + "grad_norm": 1.279349403380316, + "learning_rate": 1.936551961753835e-05, + "loss": 1.1035, + "step": 2927 + }, + { + "epoch": 0.14, + "grad_norm": 1.0620824453173423, + "learning_rate": 1.9364973454105494e-05, + "loss": 1.2095, + "step": 2928 + }, + { + "epoch": 0.14, + "grad_norm": 1.370856057630793, + "learning_rate": 1.9364427063412276e-05, + "loss": 1.2896, + "step": 2929 + }, + { + "epoch": 0.14, + "grad_norm": 1.3752984633077525, + "learning_rate": 1.936388044547195e-05, + "loss": 1.3745, + "step": 2930 + }, + { + "epoch": 0.14, + "grad_norm": 1.4670274920233504, + "learning_rate": 1.9363333600297784e-05, + "loss": 1.1851, + "step": 2931 + }, + { + "epoch": 0.14, + "grad_norm": 1.4387603088539793, + "learning_rate": 1.936278652790305e-05, + "loss": 1.2393, + "step": 2932 + }, + { + "epoch": 0.14, + "grad_norm": 1.1919158677693653, + "learning_rate": 1.9362239228301023e-05, + "loss": 1.2422, + "step": 2933 + }, + { + "epoch": 0.14, + "grad_norm": 1.2441899473788247, + "learning_rate": 1.936169170150498e-05, + "loss": 1.1548, + "step": 2934 + }, + { + "epoch": 0.14, + "grad_norm": 1.084504634501322, + "learning_rate": 1.936114394752821e-05, + "loss": 1.3555, + "step": 2935 + }, + { + "epoch": 0.14, + "grad_norm": 1.9528109730223289, + "learning_rate": 1.936059596638401e-05, + "loss": 1.2139, + "step": 2936 + }, + { + "epoch": 0.14, + "grad_norm": 1.3467692265002342, + "learning_rate": 1.9360047758085675e-05, + "loss": 1.2109, + "step": 2937 + }, + { + "epoch": 0.14, + "grad_norm": 1.0982701834788784, + "learning_rate": 1.9359499322646505e-05, + "loss": 1.0618, + "step": 2938 + }, + { + "epoch": 0.14, + "grad_norm": 1.3195073368046937, + "learning_rate": 1.9358950660079815e-05, + "loss": 1.2686, + "step": 2939 + }, + { + "epoch": 0.14, + "grad_norm": 1.560928375363077, + "learning_rate": 1.9358401770398912e-05, + "loss": 1.3193, + "step": 2940 + }, + { + "epoch": 0.14, + "grad_norm": 1.101672521039128, + "learning_rate": 1.9357852653617123e-05, + "loss": 1.2524, + "step": 2941 + }, + { + "epoch": 0.14, + "grad_norm": 1.2595039615382313, + "learning_rate": 1.9357303309747772e-05, + "loss": 1.2729, + "step": 2942 + }, + { + "epoch": 0.14, + "grad_norm": 1.3034737541725454, + "learning_rate": 1.9356753738804185e-05, + "loss": 0.9446, + "step": 2943 + }, + { + "epoch": 0.14, + "grad_norm": 1.1626915117429417, + "learning_rate": 1.9356203940799702e-05, + "loss": 1.0962, + "step": 2944 + }, + { + "epoch": 0.14, + "grad_norm": 1.314753080722927, + "learning_rate": 1.9355653915747668e-05, + "loss": 1.3486, + "step": 2945 + }, + { + "epoch": 0.14, + "grad_norm": 1.4003058438943607, + "learning_rate": 1.9355103663661426e-05, + "loss": 1.3301, + "step": 2946 + }, + { + "epoch": 0.14, + "grad_norm": 1.145373178713599, + "learning_rate": 1.935455318455433e-05, + "loss": 1.3735, + "step": 2947 + }, + { + "epoch": 0.14, + "grad_norm": 1.2853894647528195, + "learning_rate": 1.935400247843974e-05, + "loss": 1.0918, + "step": 2948 + }, + { + "epoch": 0.14, + "grad_norm": 1.5485365002981988, + "learning_rate": 1.935345154533102e-05, + "loss": 1.2358, + "step": 2949 + }, + { + "epoch": 0.14, + "grad_norm": 1.1168567865763082, + "learning_rate": 1.9352900385241534e-05, + "loss": 1.2935, + "step": 2950 + }, + { + "epoch": 0.14, + "grad_norm": 1.4933172406138409, + "learning_rate": 1.9352348998184664e-05, + "loss": 1.3511, + "step": 2951 + }, + { + "epoch": 0.14, + "grad_norm": 0.9939718091189073, + "learning_rate": 1.9351797384173787e-05, + "loss": 1.2573, + "step": 2952 + }, + { + "epoch": 0.14, + "grad_norm": 1.1678461605219512, + "learning_rate": 1.9351245543222292e-05, + "loss": 1.2798, + "step": 2953 + }, + { + "epoch": 0.14, + "grad_norm": 1.4173948098542624, + "learning_rate": 1.935069347534357e-05, + "loss": 1.3101, + "step": 2954 + }, + { + "epoch": 0.14, + "grad_norm": 1.3002542311896452, + "learning_rate": 1.9350141180551014e-05, + "loss": 1.209, + "step": 2955 + }, + { + "epoch": 0.14, + "grad_norm": 1.2297739833424703, + "learning_rate": 1.934958865885803e-05, + "loss": 1.1694, + "step": 2956 + }, + { + "epoch": 0.14, + "grad_norm": 1.162918653752359, + "learning_rate": 1.9349035910278027e-05, + "loss": 1.2207, + "step": 2957 + }, + { + "epoch": 0.14, + "grad_norm": 1.4716188212053143, + "learning_rate": 1.9348482934824413e-05, + "loss": 1.1709, + "step": 2958 + }, + { + "epoch": 0.14, + "grad_norm": 1.1016008588384305, + "learning_rate": 1.9347929732510614e-05, + "loss": 1.1641, + "step": 2959 + }, + { + "epoch": 0.14, + "grad_norm": 1.2505792230068842, + "learning_rate": 1.9347376303350056e-05, + "loss": 1.1233, + "step": 2960 + }, + { + "epoch": 0.14, + "grad_norm": 1.150845883725662, + "learning_rate": 1.9346822647356158e-05, + "loss": 1.1294, + "step": 2961 + }, + { + "epoch": 0.14, + "grad_norm": 1.5674348944751384, + "learning_rate": 1.934626876454236e-05, + "loss": 1.2881, + "step": 2962 + }, + { + "epoch": 0.14, + "grad_norm": 1.2399475778333615, + "learning_rate": 1.9345714654922112e-05, + "loss": 1.1709, + "step": 2963 + }, + { + "epoch": 0.14, + "grad_norm": 1.4075367647942583, + "learning_rate": 1.9345160318508853e-05, + "loss": 1.2729, + "step": 2964 + }, + { + "epoch": 0.14, + "grad_norm": 1.774775440964435, + "learning_rate": 1.9344605755316035e-05, + "loss": 1.25, + "step": 2965 + }, + { + "epoch": 0.14, + "grad_norm": 1.1854452926484105, + "learning_rate": 1.9344050965357117e-05, + "loss": 1.1855, + "step": 2966 + }, + { + "epoch": 0.14, + "grad_norm": 1.57962807882461, + "learning_rate": 1.9343495948645562e-05, + "loss": 1.1899, + "step": 2967 + }, + { + "epoch": 0.14, + "grad_norm": 1.249092047556739, + "learning_rate": 1.9342940705194838e-05, + "loss": 1.188, + "step": 2968 + }, + { + "epoch": 0.14, + "grad_norm": 1.112430103836025, + "learning_rate": 1.934238523501842e-05, + "loss": 1.167, + "step": 2969 + }, + { + "epoch": 0.14, + "grad_norm": 1.228684222359549, + "learning_rate": 1.9341829538129787e-05, + "loss": 1.2031, + "step": 2970 + }, + { + "epoch": 0.14, + "grad_norm": 1.458894171360412, + "learning_rate": 1.9341273614542427e-05, + "loss": 1.1392, + "step": 2971 + }, + { + "epoch": 0.14, + "grad_norm": 1.3175523296336085, + "learning_rate": 1.9340717464269823e-05, + "loss": 1.0886, + "step": 2972 + }, + { + "epoch": 0.14, + "grad_norm": 1.2211189764412265, + "learning_rate": 1.9340161087325483e-05, + "loss": 1.1416, + "step": 2973 + }, + { + "epoch": 0.14, + "grad_norm": 0.9998661829261598, + "learning_rate": 1.9339604483722896e-05, + "loss": 1.1709, + "step": 2974 + }, + { + "epoch": 0.14, + "grad_norm": 1.4087249631872492, + "learning_rate": 1.933904765347558e-05, + "loss": 1.3682, + "step": 2975 + }, + { + "epoch": 0.14, + "grad_norm": 1.3070761991045476, + "learning_rate": 1.933849059659704e-05, + "loss": 1.2407, + "step": 2976 + }, + { + "epoch": 0.14, + "grad_norm": 1.6598449357375045, + "learning_rate": 1.9337933313100793e-05, + "loss": 1.3931, + "step": 2977 + }, + { + "epoch": 0.14, + "grad_norm": 1.255462805231643, + "learning_rate": 1.933737580300037e-05, + "loss": 1.2251, + "step": 2978 + }, + { + "epoch": 0.14, + "grad_norm": 1.3894189553612528, + "learning_rate": 1.9336818066309297e-05, + "loss": 1.2114, + "step": 2979 + }, + { + "epoch": 0.14, + "grad_norm": 1.1741196141600498, + "learning_rate": 1.9336260103041108e-05, + "loss": 1.2334, + "step": 2980 + }, + { + "epoch": 0.14, + "grad_norm": 1.4301647264329325, + "learning_rate": 1.9335701913209342e-05, + "loss": 1.2114, + "step": 2981 + }, + { + "epoch": 0.14, + "grad_norm": 1.1921892421819977, + "learning_rate": 1.9335143496827546e-05, + "loss": 1.2368, + "step": 2982 + }, + { + "epoch": 0.14, + "grad_norm": 1.1651282835990595, + "learning_rate": 1.933458485390927e-05, + "loss": 1.2944, + "step": 2983 + }, + { + "epoch": 0.14, + "grad_norm": 1.6254933937416454, + "learning_rate": 1.9334025984468075e-05, + "loss": 1.4185, + "step": 2984 + }, + { + "epoch": 0.14, + "grad_norm": 1.1498636995020743, + "learning_rate": 1.9333466888517518e-05, + "loss": 1.3096, + "step": 2985 + }, + { + "epoch": 0.14, + "grad_norm": 1.317404524453681, + "learning_rate": 1.9332907566071168e-05, + "loss": 1.2007, + "step": 2986 + }, + { + "epoch": 0.14, + "grad_norm": 1.3223614714945782, + "learning_rate": 1.9332348017142598e-05, + "loss": 1.0515, + "step": 2987 + }, + { + "epoch": 0.14, + "grad_norm": 1.509033766563693, + "learning_rate": 1.933178824174539e-05, + "loss": 1.1147, + "step": 2988 + }, + { + "epoch": 0.14, + "grad_norm": 1.4603826300014386, + "learning_rate": 1.933122823989312e-05, + "loss": 1.1323, + "step": 2989 + }, + { + "epoch": 0.14, + "grad_norm": 1.2058278933823126, + "learning_rate": 1.9330668011599388e-05, + "loss": 1.1392, + "step": 2990 + }, + { + "epoch": 0.14, + "grad_norm": 1.4506088713710252, + "learning_rate": 1.933010755687778e-05, + "loss": 1.3208, + "step": 2991 + }, + { + "epoch": 0.14, + "grad_norm": 1.2862844930151287, + "learning_rate": 1.9329546875741904e-05, + "loss": 1.1802, + "step": 2992 + }, + { + "epoch": 0.14, + "grad_norm": 1.227107346000838, + "learning_rate": 1.932898596820536e-05, + "loss": 1.1147, + "step": 2993 + }, + { + "epoch": 0.14, + "grad_norm": 1.1452125720172628, + "learning_rate": 1.9328424834281763e-05, + "loss": 1.3101, + "step": 2994 + }, + { + "epoch": 0.14, + "grad_norm": 1.231417952959909, + "learning_rate": 1.932786347398473e-05, + "loss": 1.2373, + "step": 2995 + }, + { + "epoch": 0.14, + "grad_norm": 1.3664720052018267, + "learning_rate": 1.932730188732788e-05, + "loss": 1.314, + "step": 2996 + }, + { + "epoch": 0.14, + "grad_norm": 1.1901767384229063, + "learning_rate": 1.9326740074324846e-05, + "loss": 1.168, + "step": 2997 + }, + { + "epoch": 0.14, + "grad_norm": 1.4373672076317907, + "learning_rate": 1.932617803498926e-05, + "loss": 1.2534, + "step": 2998 + }, + { + "epoch": 0.14, + "grad_norm": 1.1085511355035569, + "learning_rate": 1.9325615769334755e-05, + "loss": 1.2783, + "step": 2999 + }, + { + "epoch": 0.14, + "grad_norm": 1.157548910940475, + "learning_rate": 1.9325053277374986e-05, + "loss": 1.1382, + "step": 3000 + }, + { + "epoch": 0.14, + "grad_norm": 1.139544924294377, + "learning_rate": 1.93244905591236e-05, + "loss": 1.2803, + "step": 3001 + }, + { + "epoch": 0.14, + "grad_norm": 1.2337387773548278, + "learning_rate": 1.9323927614594245e-05, + "loss": 1.2827, + "step": 3002 + }, + { + "epoch": 0.14, + "grad_norm": 1.2006878656236861, + "learning_rate": 1.932336444380059e-05, + "loss": 1.3169, + "step": 3003 + }, + { + "epoch": 0.14, + "grad_norm": 1.4002106224079722, + "learning_rate": 1.93228010467563e-05, + "loss": 1.1323, + "step": 3004 + }, + { + "epoch": 0.14, + "grad_norm": 1.2013557568883364, + "learning_rate": 1.9322237423475044e-05, + "loss": 1.2974, + "step": 3005 + }, + { + "epoch": 0.14, + "grad_norm": 0.5322585269629087, + "learning_rate": 1.9321673573970502e-05, + "loss": 1.1294, + "step": 3006 + }, + { + "epoch": 0.14, + "grad_norm": 0.5322585269629087, + "learning_rate": 1.9321673573970502e-05, + "loss": 1.2109, + "step": 3007 + }, + { + "epoch": 0.14, + "grad_norm": 0.5322585269629087, + "learning_rate": 1.9321673573970502e-05, + "loss": 1.3091, + "step": 3008 + }, + { + "epoch": 0.14, + "grad_norm": 1.4317125877611152, + "learning_rate": 1.932110949825636e-05, + "loss": 1.3071, + "step": 3009 + }, + { + "epoch": 0.14, + "grad_norm": 1.2030898317417371, + "learning_rate": 1.9320545196346295e-05, + "loss": 1.2334, + "step": 3010 + }, + { + "epoch": 0.14, + "grad_norm": 1.282216751222411, + "learning_rate": 1.9319980668254016e-05, + "loss": 1.1538, + "step": 3011 + }, + { + "epoch": 0.14, + "grad_norm": 1.3224892439898936, + "learning_rate": 1.9319415913993214e-05, + "loss": 1.2397, + "step": 3012 + }, + { + "epoch": 0.14, + "grad_norm": 1.4633256072031364, + "learning_rate": 1.9318850933577592e-05, + "loss": 1.2607, + "step": 3013 + }, + { + "epoch": 0.14, + "grad_norm": 1.3104788993236278, + "learning_rate": 1.9318285727020867e-05, + "loss": 1.2339, + "step": 3014 + }, + { + "epoch": 0.15, + "grad_norm": 1.6216098802296015, + "learning_rate": 1.9317720294336747e-05, + "loss": 1.0896, + "step": 3015 + }, + { + "epoch": 0.15, + "grad_norm": 1.225382644795014, + "learning_rate": 1.9317154635538964e-05, + "loss": 1.1567, + "step": 3016 + }, + { + "epoch": 0.15, + "grad_norm": 1.04132229071204, + "learning_rate": 1.9316588750641233e-05, + "loss": 1.126, + "step": 3017 + }, + { + "epoch": 0.15, + "grad_norm": 1.3100428993386624, + "learning_rate": 1.9316022639657296e-05, + "loss": 1.2412, + "step": 3018 + }, + { + "epoch": 0.15, + "grad_norm": 1.3632123125628408, + "learning_rate": 1.9315456302600885e-05, + "loss": 1.2988, + "step": 3019 + }, + { + "epoch": 0.15, + "grad_norm": 1.2558219669347763, + "learning_rate": 1.9314889739485747e-05, + "loss": 1.2681, + "step": 3020 + }, + { + "epoch": 0.15, + "grad_norm": 1.800040031953831, + "learning_rate": 1.931432295032563e-05, + "loss": 1.3394, + "step": 3021 + }, + { + "epoch": 0.15, + "grad_norm": 1.3846538708489489, + "learning_rate": 1.9313755935134286e-05, + "loss": 1.209, + "step": 3022 + }, + { + "epoch": 0.15, + "grad_norm": 1.3409148319535897, + "learning_rate": 1.9313188693925475e-05, + "loss": 1.3911, + "step": 3023 + }, + { + "epoch": 0.15, + "grad_norm": 1.5697181266106894, + "learning_rate": 1.9312621226712962e-05, + "loss": 1.2046, + "step": 3024 + }, + { + "epoch": 0.15, + "grad_norm": 1.2183770340015154, + "learning_rate": 1.9312053533510525e-05, + "loss": 1.126, + "step": 3025 + }, + { + "epoch": 0.15, + "grad_norm": 1.3590076274993719, + "learning_rate": 1.9311485614331928e-05, + "loss": 1.1152, + "step": 3026 + }, + { + "epoch": 0.15, + "grad_norm": 1.079184566109758, + "learning_rate": 1.9310917469190965e-05, + "loss": 1.084, + "step": 3027 + }, + { + "epoch": 0.15, + "grad_norm": 1.5564598895017048, + "learning_rate": 1.9310349098101412e-05, + "loss": 1.2583, + "step": 3028 + }, + { + "epoch": 0.15, + "grad_norm": 1.1829024292556005, + "learning_rate": 1.930978050107707e-05, + "loss": 1.1938, + "step": 3029 + }, + { + "epoch": 0.15, + "grad_norm": 1.4003044748318692, + "learning_rate": 1.9309211678131733e-05, + "loss": 1.209, + "step": 3030 + }, + { + "epoch": 0.15, + "grad_norm": 1.2993091693882184, + "learning_rate": 1.930864262927921e-05, + "loss": 1.2217, + "step": 3031 + }, + { + "epoch": 0.15, + "grad_norm": 1.6874019309077977, + "learning_rate": 1.9308073354533302e-05, + "loss": 1.2622, + "step": 3032 + }, + { + "epoch": 0.15, + "grad_norm": 1.3656694550501622, + "learning_rate": 1.9307503853907832e-05, + "loss": 1.1724, + "step": 3033 + }, + { + "epoch": 0.15, + "grad_norm": 1.131890075012984, + "learning_rate": 1.930693412741661e-05, + "loss": 1.1562, + "step": 3034 + }, + { + "epoch": 0.15, + "grad_norm": 1.2308945910026752, + "learning_rate": 1.9306364175073474e-05, + "loss": 1.2085, + "step": 3035 + }, + { + "epoch": 0.15, + "grad_norm": 1.1345609459101549, + "learning_rate": 1.9305793996892244e-05, + "loss": 1.2339, + "step": 3036 + }, + { + "epoch": 0.15, + "grad_norm": 1.1888340436519842, + "learning_rate": 1.930522359288676e-05, + "loss": 1.1582, + "step": 3037 + }, + { + "epoch": 0.15, + "grad_norm": 1.3400514732964528, + "learning_rate": 1.9304652963070868e-05, + "loss": 1.2197, + "step": 3038 + }, + { + "epoch": 0.15, + "grad_norm": 1.4634561100482675, + "learning_rate": 1.9304082107458412e-05, + "loss": 1.293, + "step": 3039 + }, + { + "epoch": 0.15, + "grad_norm": 1.3485909064911454, + "learning_rate": 1.9303511026063244e-05, + "loss": 1.125, + "step": 3040 + }, + { + "epoch": 0.15, + "grad_norm": 1.2571479310049944, + "learning_rate": 1.930293971889923e-05, + "loss": 1.1006, + "step": 3041 + }, + { + "epoch": 0.15, + "grad_norm": 1.749790476366772, + "learning_rate": 1.9302368185980218e-05, + "loss": 1.3413, + "step": 3042 + }, + { + "epoch": 0.15, + "grad_norm": 1.3457894161381458, + "learning_rate": 1.9301796427320093e-05, + "loss": 1.249, + "step": 3043 + }, + { + "epoch": 0.15, + "grad_norm": 1.3305815695479835, + "learning_rate": 1.9301224442932725e-05, + "loss": 1.2437, + "step": 3044 + }, + { + "epoch": 0.15, + "grad_norm": 1.1352367869023638, + "learning_rate": 1.930065223283199e-05, + "loss": 1.208, + "step": 3045 + }, + { + "epoch": 0.15, + "grad_norm": 1.302432789974811, + "learning_rate": 1.930007979703178e-05, + "loss": 1.2026, + "step": 3046 + }, + { + "epoch": 0.15, + "grad_norm": 1.5346401054207415, + "learning_rate": 1.9299507135545986e-05, + "loss": 1.1372, + "step": 3047 + }, + { + "epoch": 0.15, + "grad_norm": 1.2564328599429304, + "learning_rate": 1.92989342483885e-05, + "loss": 1.1987, + "step": 3048 + }, + { + "epoch": 0.15, + "grad_norm": 1.3151406041977647, + "learning_rate": 1.929836113557323e-05, + "loss": 1.2031, + "step": 3049 + }, + { + "epoch": 0.15, + "grad_norm": 1.3495106141485107, + "learning_rate": 1.9297787797114078e-05, + "loss": 1.2422, + "step": 3050 + }, + { + "epoch": 0.15, + "grad_norm": 1.55584606587957, + "learning_rate": 1.929721423302496e-05, + "loss": 1.2715, + "step": 3051 + }, + { + "epoch": 0.15, + "grad_norm": 1.2290724009427396, + "learning_rate": 1.9296640443319793e-05, + "loss": 1.1138, + "step": 3052 + }, + { + "epoch": 0.15, + "grad_norm": 1.3271959717051078, + "learning_rate": 1.9296066428012508e-05, + "loss": 1.167, + "step": 3053 + }, + { + "epoch": 0.15, + "grad_norm": 1.19984137185531, + "learning_rate": 1.9295492187117025e-05, + "loss": 1.2256, + "step": 3054 + }, + { + "epoch": 0.15, + "grad_norm": 1.732776271032898, + "learning_rate": 1.9294917720647287e-05, + "loss": 1.2617, + "step": 3055 + }, + { + "epoch": 0.15, + "grad_norm": 1.2120947721655226, + "learning_rate": 1.929434302861723e-05, + "loss": 1.0796, + "step": 3056 + }, + { + "epoch": 0.15, + "grad_norm": 1.1441187832211681, + "learning_rate": 1.92937681110408e-05, + "loss": 1.3364, + "step": 3057 + }, + { + "epoch": 0.15, + "grad_norm": 1.8905853520566493, + "learning_rate": 1.929319296793195e-05, + "loss": 1.3657, + "step": 3058 + }, + { + "epoch": 0.15, + "grad_norm": 1.1199066147268264, + "learning_rate": 1.929261759930464e-05, + "loss": 1.1475, + "step": 3059 + }, + { + "epoch": 0.15, + "grad_norm": 1.4527253714544248, + "learning_rate": 1.9292042005172823e-05, + "loss": 1.186, + "step": 3060 + }, + { + "epoch": 0.15, + "grad_norm": 1.3284500574964342, + "learning_rate": 1.9291466185550482e-05, + "loss": 1.2617, + "step": 3061 + }, + { + "epoch": 0.15, + "grad_norm": 1.245207818308442, + "learning_rate": 1.9290890140451576e-05, + "loss": 1.2632, + "step": 3062 + }, + { + "epoch": 0.15, + "grad_norm": 1.6666940216527228, + "learning_rate": 1.929031386989009e-05, + "loss": 1.231, + "step": 3063 + }, + { + "epoch": 0.15, + "grad_norm": 1.503844207028459, + "learning_rate": 1.928973737388001e-05, + "loss": 1.2227, + "step": 3064 + }, + { + "epoch": 0.15, + "grad_norm": 1.3121861701723743, + "learning_rate": 1.9289160652435326e-05, + "loss": 1.3374, + "step": 3065 + }, + { + "epoch": 0.15, + "grad_norm": 1.242909065753283, + "learning_rate": 1.9288583705570026e-05, + "loss": 1.1406, + "step": 3066 + }, + { + "epoch": 0.15, + "grad_norm": 1.1336518066320826, + "learning_rate": 1.928800653329812e-05, + "loss": 1.1514, + "step": 3067 + }, + { + "epoch": 0.15, + "grad_norm": 1.3418252311107524, + "learning_rate": 1.928742913563361e-05, + "loss": 1.3364, + "step": 3068 + }, + { + "epoch": 0.15, + "grad_norm": 1.3156332343820873, + "learning_rate": 1.928685151259051e-05, + "loss": 1.3154, + "step": 3069 + }, + { + "epoch": 0.15, + "grad_norm": 1.142414407158179, + "learning_rate": 1.9286273664182832e-05, + "loss": 1.1123, + "step": 3070 + }, + { + "epoch": 0.15, + "grad_norm": 1.111022767972263, + "learning_rate": 1.9285695590424604e-05, + "loss": 1.3198, + "step": 3071 + }, + { + "epoch": 0.15, + "grad_norm": 1.2429427554912134, + "learning_rate": 1.9285117291329853e-05, + "loss": 1.2422, + "step": 3072 + }, + { + "epoch": 0.15, + "grad_norm": 1.2033406194319491, + "learning_rate": 1.9284538766912608e-05, + "loss": 1.2271, + "step": 3073 + }, + { + "epoch": 0.15, + "grad_norm": 1.1394314698421877, + "learning_rate": 1.9283960017186916e-05, + "loss": 1.1772, + "step": 3074 + }, + { + "epoch": 0.15, + "grad_norm": 1.1019240550098173, + "learning_rate": 1.928338104216682e-05, + "loss": 1.2744, + "step": 3075 + }, + { + "epoch": 0.15, + "grad_norm": 1.2161661553620706, + "learning_rate": 1.928280184186636e-05, + "loss": 1.3145, + "step": 3076 + }, + { + "epoch": 0.15, + "grad_norm": 0.9252672391972474, + "learning_rate": 1.9282222416299604e-05, + "loss": 1.2163, + "step": 3077 + }, + { + "epoch": 0.15, + "grad_norm": 1.4118151234177778, + "learning_rate": 1.9281642765480605e-05, + "loss": 1.1519, + "step": 3078 + }, + { + "epoch": 0.15, + "grad_norm": 1.066121836601918, + "learning_rate": 1.9281062889423436e-05, + "loss": 1.1865, + "step": 3079 + }, + { + "epoch": 0.15, + "grad_norm": 1.2096180533259848, + "learning_rate": 1.9280482788142162e-05, + "loss": 1.1118, + "step": 3080 + }, + { + "epoch": 0.15, + "grad_norm": 0.9950144971291458, + "learning_rate": 1.9279902461650866e-05, + "loss": 1.1953, + "step": 3081 + }, + { + "epoch": 0.15, + "grad_norm": 1.4284363977199883, + "learning_rate": 1.9279321909963627e-05, + "loss": 1.3081, + "step": 3082 + }, + { + "epoch": 0.15, + "grad_norm": 1.324176406180008, + "learning_rate": 1.9278741133094535e-05, + "loss": 1.1279, + "step": 3083 + }, + { + "epoch": 0.15, + "grad_norm": 1.1550629392408664, + "learning_rate": 1.9278160131057686e-05, + "loss": 1.2085, + "step": 3084 + }, + { + "epoch": 0.15, + "grad_norm": 1.4150246774139525, + "learning_rate": 1.9277578903867174e-05, + "loss": 1.2227, + "step": 3085 + }, + { + "epoch": 0.15, + "grad_norm": 1.3001586835067318, + "learning_rate": 1.9276997451537107e-05, + "loss": 1.147, + "step": 3086 + }, + { + "epoch": 0.15, + "grad_norm": 1.1953763853986643, + "learning_rate": 1.9276415774081593e-05, + "loss": 1.2827, + "step": 3087 + }, + { + "epoch": 0.15, + "grad_norm": 1.4822112528539182, + "learning_rate": 1.927583387151475e-05, + "loss": 1.29, + "step": 3088 + }, + { + "epoch": 0.15, + "grad_norm": 1.198334612973982, + "learning_rate": 1.92752517438507e-05, + "loss": 1.1816, + "step": 3089 + }, + { + "epoch": 0.15, + "grad_norm": 1.4156043323848013, + "learning_rate": 1.9274669391103567e-05, + "loss": 1.438, + "step": 3090 + }, + { + "epoch": 0.15, + "grad_norm": 1.3182684786654073, + "learning_rate": 1.9274086813287484e-05, + "loss": 1.1768, + "step": 3091 + }, + { + "epoch": 0.15, + "grad_norm": 1.5091170899007402, + "learning_rate": 1.9273504010416586e-05, + "loss": 1.2178, + "step": 3092 + }, + { + "epoch": 0.15, + "grad_norm": 1.124929640265413, + "learning_rate": 1.927292098250502e-05, + "loss": 1.2051, + "step": 3093 + }, + { + "epoch": 0.15, + "grad_norm": 1.1630835115651765, + "learning_rate": 1.9272337729566933e-05, + "loss": 1.0693, + "step": 3094 + }, + { + "epoch": 0.15, + "grad_norm": 1.3969823191465958, + "learning_rate": 1.9271754251616475e-05, + "loss": 1.3315, + "step": 3095 + }, + { + "epoch": 0.15, + "grad_norm": 1.363496141846321, + "learning_rate": 1.927117054866781e-05, + "loss": 1.1553, + "step": 3096 + }, + { + "epoch": 0.15, + "grad_norm": 0.9245227065643407, + "learning_rate": 1.9270586620735102e-05, + "loss": 1.1733, + "step": 3097 + }, + { + "epoch": 0.15, + "grad_norm": 1.7068560028724749, + "learning_rate": 1.927000246783252e-05, + "loss": 1.3232, + "step": 3098 + }, + { + "epoch": 0.15, + "grad_norm": 1.2594261245120757, + "learning_rate": 1.926941808997424e-05, + "loss": 1.1392, + "step": 3099 + }, + { + "epoch": 0.15, + "grad_norm": 1.1124782573931644, + "learning_rate": 1.9268833487174447e-05, + "loss": 1.2847, + "step": 3100 + }, + { + "epoch": 0.15, + "grad_norm": 1.3143097613782713, + "learning_rate": 1.926824865944732e-05, + "loss": 1.1865, + "step": 3101 + }, + { + "epoch": 0.15, + "grad_norm": 1.4781615678028057, + "learning_rate": 1.9267663606807055e-05, + "loss": 1.272, + "step": 3102 + }, + { + "epoch": 0.15, + "grad_norm": 1.1998999541658504, + "learning_rate": 1.9267078329267853e-05, + "loss": 1.2451, + "step": 3103 + }, + { + "epoch": 0.15, + "grad_norm": 1.4414163284251973, + "learning_rate": 1.926649282684391e-05, + "loss": 1.2676, + "step": 3104 + }, + { + "epoch": 0.15, + "grad_norm": 1.3749570973261536, + "learning_rate": 1.9265907099549438e-05, + "loss": 1.2466, + "step": 3105 + }, + { + "epoch": 0.15, + "grad_norm": 1.155045251366464, + "learning_rate": 1.926532114739865e-05, + "loss": 1.2646, + "step": 3106 + }, + { + "epoch": 0.15, + "grad_norm": 1.5687319012631566, + "learning_rate": 1.926473497040577e-05, + "loss": 1.3579, + "step": 3107 + }, + { + "epoch": 0.15, + "grad_norm": 1.4297536606604089, + "learning_rate": 1.9264148568585013e-05, + "loss": 1.2329, + "step": 3108 + }, + { + "epoch": 0.15, + "grad_norm": 1.0374112589216304, + "learning_rate": 1.9263561941950622e-05, + "loss": 1.0117, + "step": 3109 + }, + { + "epoch": 0.15, + "grad_norm": 1.4395775548875753, + "learning_rate": 1.926297509051682e-05, + "loss": 1.2954, + "step": 3110 + }, + { + "epoch": 0.15, + "grad_norm": 1.3459763310597939, + "learning_rate": 1.926238801429786e-05, + "loss": 1.0918, + "step": 3111 + }, + { + "epoch": 0.15, + "grad_norm": 1.2439806121689414, + "learning_rate": 1.926180071330798e-05, + "loss": 1.354, + "step": 3112 + }, + { + "epoch": 0.15, + "grad_norm": 1.3279591559033619, + "learning_rate": 1.9261213187561433e-05, + "loss": 1.2417, + "step": 3113 + }, + { + "epoch": 0.15, + "grad_norm": 1.2676529333547255, + "learning_rate": 1.926062543707248e-05, + "loss": 1.2271, + "step": 3114 + }, + { + "epoch": 0.15, + "grad_norm": 1.2748020755836789, + "learning_rate": 1.926003746185538e-05, + "loss": 1.2222, + "step": 3115 + }, + { + "epoch": 0.15, + "grad_norm": 1.267040859295656, + "learning_rate": 1.9259449261924405e-05, + "loss": 1.2974, + "step": 3116 + }, + { + "epoch": 0.15, + "grad_norm": 1.0271989735355551, + "learning_rate": 1.9258860837293824e-05, + "loss": 1.2529, + "step": 3117 + }, + { + "epoch": 0.15, + "grad_norm": 1.792449229331658, + "learning_rate": 1.9258272187977924e-05, + "loss": 1.4697, + "step": 3118 + }, + { + "epoch": 0.15, + "grad_norm": 1.2440361952478378, + "learning_rate": 1.9257683313990984e-05, + "loss": 1.2104, + "step": 3119 + }, + { + "epoch": 0.15, + "grad_norm": 1.3422039327856625, + "learning_rate": 1.9257094215347298e-05, + "loss": 1.2109, + "step": 3120 + }, + { + "epoch": 0.15, + "grad_norm": 1.0496451227909271, + "learning_rate": 1.9256504892061156e-05, + "loss": 1.1948, + "step": 3121 + }, + { + "epoch": 0.15, + "grad_norm": 1.1990741599890107, + "learning_rate": 1.9255915344146865e-05, + "loss": 1.2148, + "step": 3122 + }, + { + "epoch": 0.15, + "grad_norm": 1.2063977158147765, + "learning_rate": 1.9255325571618728e-05, + "loss": 1.2446, + "step": 3123 + }, + { + "epoch": 0.15, + "grad_norm": 1.3698891231489732, + "learning_rate": 1.925473557449106e-05, + "loss": 1.0845, + "step": 3124 + }, + { + "epoch": 0.15, + "grad_norm": 1.5576477536123707, + "learning_rate": 1.9254145352778176e-05, + "loss": 1.2734, + "step": 3125 + }, + { + "epoch": 0.15, + "grad_norm": 1.139878549160004, + "learning_rate": 1.92535549064944e-05, + "loss": 1.2612, + "step": 3126 + }, + { + "epoch": 0.15, + "grad_norm": 1.2229965694378526, + "learning_rate": 1.9252964235654058e-05, + "loss": 1.1245, + "step": 3127 + }, + { + "epoch": 0.15, + "grad_norm": 1.373472162683065, + "learning_rate": 1.925237334027149e-05, + "loss": 1.2461, + "step": 3128 + }, + { + "epoch": 0.15, + "grad_norm": 1.1179867050881462, + "learning_rate": 1.9251782220361027e-05, + "loss": 1.2178, + "step": 3129 + }, + { + "epoch": 0.15, + "grad_norm": 0.9395920992359477, + "learning_rate": 1.9251190875937024e-05, + "loss": 1.2295, + "step": 3130 + }, + { + "epoch": 0.15, + "grad_norm": 1.4135131924545745, + "learning_rate": 1.925059930701382e-05, + "loss": 1.2373, + "step": 3131 + }, + { + "epoch": 0.15, + "grad_norm": 1.0546772182750832, + "learning_rate": 1.9250007513605776e-05, + "loss": 1.3164, + "step": 3132 + }, + { + "epoch": 0.15, + "grad_norm": 1.1336359511783607, + "learning_rate": 1.9249415495727252e-05, + "loss": 1.1582, + "step": 3133 + }, + { + "epoch": 0.15, + "grad_norm": 1.2106308581225789, + "learning_rate": 1.924882325339262e-05, + "loss": 1.0952, + "step": 3134 + }, + { + "epoch": 0.15, + "grad_norm": 1.3427650642482178, + "learning_rate": 1.9248230786616244e-05, + "loss": 1.2563, + "step": 3135 + }, + { + "epoch": 0.15, + "grad_norm": 1.286248344493991, + "learning_rate": 1.9247638095412508e-05, + "loss": 1.2607, + "step": 3136 + }, + { + "epoch": 0.15, + "grad_norm": 1.2003174582953844, + "learning_rate": 1.9247045179795788e-05, + "loss": 1.4019, + "step": 3137 + }, + { + "epoch": 0.15, + "grad_norm": 1.46540067797969, + "learning_rate": 1.924645203978048e-05, + "loss": 1.2788, + "step": 3138 + }, + { + "epoch": 0.15, + "grad_norm": 1.4160554516427803, + "learning_rate": 1.924585867538097e-05, + "loss": 1.0605, + "step": 3139 + }, + { + "epoch": 0.15, + "grad_norm": 1.0421158950573985, + "learning_rate": 1.924526508661166e-05, + "loss": 1.2295, + "step": 3140 + }, + { + "epoch": 0.15, + "grad_norm": 1.2714392998800266, + "learning_rate": 1.9244671273486962e-05, + "loss": 1.1147, + "step": 3141 + }, + { + "epoch": 0.15, + "grad_norm": 1.2094020607601468, + "learning_rate": 1.9244077236021273e-05, + "loss": 1.2188, + "step": 3142 + }, + { + "epoch": 0.15, + "grad_norm": 1.1503982152000995, + "learning_rate": 1.924348297422902e-05, + "loss": 1.1938, + "step": 3143 + }, + { + "epoch": 0.15, + "grad_norm": 1.154485425668514, + "learning_rate": 1.924288848812462e-05, + "loss": 1.1836, + "step": 3144 + }, + { + "epoch": 0.15, + "grad_norm": 1.7466091631814367, + "learning_rate": 1.9242293777722496e-05, + "loss": 1.4111, + "step": 3145 + }, + { + "epoch": 0.15, + "grad_norm": 1.3134332695684225, + "learning_rate": 1.9241698843037083e-05, + "loss": 1.3667, + "step": 3146 + }, + { + "epoch": 0.15, + "grad_norm": 1.4891651862530928, + "learning_rate": 1.9241103684082815e-05, + "loss": 1.3564, + "step": 3147 + }, + { + "epoch": 0.15, + "grad_norm": 1.3313770338366688, + "learning_rate": 1.9240508300874145e-05, + "loss": 1.2793, + "step": 3148 + }, + { + "epoch": 0.15, + "grad_norm": 1.2272705064888552, + "learning_rate": 1.9239912693425506e-05, + "loss": 1.0977, + "step": 3149 + }, + { + "epoch": 0.15, + "grad_norm": 1.2976749950124493, + "learning_rate": 1.9239316861751365e-05, + "loss": 1.3325, + "step": 3150 + }, + { + "epoch": 0.15, + "grad_norm": 1.2021299616085872, + "learning_rate": 1.9238720805866174e-05, + "loss": 1.1675, + "step": 3151 + }, + { + "epoch": 0.15, + "grad_norm": 1.2187129141954007, + "learning_rate": 1.92381245257844e-05, + "loss": 1.2246, + "step": 3152 + }, + { + "epoch": 0.15, + "grad_norm": 1.3049319861624888, + "learning_rate": 1.9237528021520512e-05, + "loss": 1.0996, + "step": 3153 + }, + { + "epoch": 0.15, + "grad_norm": 1.2211786156344366, + "learning_rate": 1.9236931293088982e-05, + "loss": 1.165, + "step": 3154 + }, + { + "epoch": 0.15, + "grad_norm": 1.3603753566213534, + "learning_rate": 1.9236334340504298e-05, + "loss": 1.23, + "step": 3155 + }, + { + "epoch": 0.15, + "grad_norm": 1.2725965678540356, + "learning_rate": 1.923573716378094e-05, + "loss": 1.1802, + "step": 3156 + }, + { + "epoch": 0.15, + "grad_norm": 1.2838763776741542, + "learning_rate": 1.9235139762933402e-05, + "loss": 1.3286, + "step": 3157 + }, + { + "epoch": 0.15, + "grad_norm": 1.6352114820112145, + "learning_rate": 1.9234542137976184e-05, + "loss": 1.2998, + "step": 3158 + }, + { + "epoch": 0.15, + "grad_norm": 1.2617672335639216, + "learning_rate": 1.9233944288923788e-05, + "loss": 1.2168, + "step": 3159 + }, + { + "epoch": 0.15, + "grad_norm": 1.258115143542949, + "learning_rate": 1.9233346215790717e-05, + "loss": 1.2354, + "step": 3160 + }, + { + "epoch": 0.15, + "grad_norm": 1.3685587732601576, + "learning_rate": 1.9232747918591488e-05, + "loss": 1.1846, + "step": 3161 + }, + { + "epoch": 0.15, + "grad_norm": 1.3526818515933696, + "learning_rate": 1.923214939734062e-05, + "loss": 1.2617, + "step": 3162 + }, + { + "epoch": 0.15, + "grad_norm": 0.9608276419574779, + "learning_rate": 1.9231550652052635e-05, + "loss": 1.2012, + "step": 3163 + }, + { + "epoch": 0.15, + "grad_norm": 1.2813981932115546, + "learning_rate": 1.9230951682742066e-05, + "loss": 1.1704, + "step": 3164 + }, + { + "epoch": 0.15, + "grad_norm": 1.728844453850068, + "learning_rate": 1.923035248942345e-05, + "loss": 1.2803, + "step": 3165 + }, + { + "epoch": 0.15, + "grad_norm": 1.3070559898785106, + "learning_rate": 1.9229753072111325e-05, + "loss": 1.3267, + "step": 3166 + }, + { + "epoch": 0.15, + "grad_norm": 1.4232696991712674, + "learning_rate": 1.9229153430820232e-05, + "loss": 1.2114, + "step": 3167 + }, + { + "epoch": 0.15, + "grad_norm": 1.1294274047683792, + "learning_rate": 1.9228553565564728e-05, + "loss": 1.0342, + "step": 3168 + }, + { + "epoch": 0.15, + "grad_norm": 1.1525029386314793, + "learning_rate": 1.922795347635937e-05, + "loss": 1.2515, + "step": 3169 + }, + { + "epoch": 0.15, + "grad_norm": 1.2001799199146916, + "learning_rate": 1.922735316321872e-05, + "loss": 1.249, + "step": 3170 + }, + { + "epoch": 0.15, + "grad_norm": 1.109044445476171, + "learning_rate": 1.9226752626157345e-05, + "loss": 1.2275, + "step": 3171 + }, + { + "epoch": 0.15, + "grad_norm": 1.0981249628319258, + "learning_rate": 1.922615186518982e-05, + "loss": 1.1602, + "step": 3172 + }, + { + "epoch": 0.15, + "grad_norm": 1.2357928005924046, + "learning_rate": 1.9225550880330718e-05, + "loss": 1.293, + "step": 3173 + }, + { + "epoch": 0.15, + "grad_norm": 1.7749866974864135, + "learning_rate": 1.9224949671594633e-05, + "loss": 1.166, + "step": 3174 + }, + { + "epoch": 0.15, + "grad_norm": 1.581812927274863, + "learning_rate": 1.9224348238996146e-05, + "loss": 1.1699, + "step": 3175 + }, + { + "epoch": 0.15, + "grad_norm": 0.9773446428989382, + "learning_rate": 1.9223746582549853e-05, + "loss": 1.0918, + "step": 3176 + }, + { + "epoch": 0.15, + "grad_norm": 1.4359502768822099, + "learning_rate": 1.922314470227036e-05, + "loss": 1.2734, + "step": 3177 + }, + { + "epoch": 0.15, + "grad_norm": 1.1911741162362157, + "learning_rate": 1.9222542598172268e-05, + "loss": 1.1494, + "step": 3178 + }, + { + "epoch": 0.15, + "grad_norm": 1.1722405371531426, + "learning_rate": 1.922194027027019e-05, + "loss": 1.1399, + "step": 3179 + }, + { + "epoch": 0.15, + "grad_norm": 1.48895139079885, + "learning_rate": 1.9221337718578744e-05, + "loss": 1.2114, + "step": 3180 + }, + { + "epoch": 0.15, + "grad_norm": 1.1132858933528669, + "learning_rate": 1.922073494311255e-05, + "loss": 1.1177, + "step": 3181 + }, + { + "epoch": 0.15, + "grad_norm": 1.1976209934127564, + "learning_rate": 1.9220131943886232e-05, + "loss": 1.1191, + "step": 3182 + }, + { + "epoch": 0.15, + "grad_norm": 1.3777538622926346, + "learning_rate": 1.9219528720914432e-05, + "loss": 1.1719, + "step": 3183 + }, + { + "epoch": 0.15, + "grad_norm": 1.211925304529795, + "learning_rate": 1.921892527421178e-05, + "loss": 1.2861, + "step": 3184 + }, + { + "epoch": 0.15, + "grad_norm": 0.984252346658007, + "learning_rate": 1.9218321603792928e-05, + "loss": 1.1294, + "step": 3185 + }, + { + "epoch": 0.15, + "grad_norm": 1.5122573943301294, + "learning_rate": 1.921771770967252e-05, + "loss": 1.2056, + "step": 3186 + }, + { + "epoch": 0.15, + "grad_norm": 1.1989577976770938, + "learning_rate": 1.921711359186521e-05, + "loss": 1.1526, + "step": 3187 + }, + { + "epoch": 0.15, + "grad_norm": 1.0578294558765375, + "learning_rate": 1.921650925038566e-05, + "loss": 1.0903, + "step": 3188 + }, + { + "epoch": 0.15, + "grad_norm": 1.5652049626009776, + "learning_rate": 1.9215904685248534e-05, + "loss": 1.1367, + "step": 3189 + }, + { + "epoch": 0.15, + "grad_norm": 1.2948561383581576, + "learning_rate": 1.9215299896468503e-05, + "loss": 1.2461, + "step": 3190 + }, + { + "epoch": 0.15, + "grad_norm": 1.2023739370529565, + "learning_rate": 1.9214694884060248e-05, + "loss": 1.2451, + "step": 3191 + }, + { + "epoch": 0.15, + "grad_norm": 1.2530020045322814, + "learning_rate": 1.9214089648038446e-05, + "loss": 1.0586, + "step": 3192 + }, + { + "epoch": 0.15, + "grad_norm": 1.2231694711019683, + "learning_rate": 1.9213484188417788e-05, + "loss": 1.2256, + "step": 3193 + }, + { + "epoch": 0.15, + "grad_norm": 1.3042288474627586, + "learning_rate": 1.921287850521296e-05, + "loss": 1.2632, + "step": 3194 + }, + { + "epoch": 0.15, + "grad_norm": 1.2633247232384919, + "learning_rate": 1.921227259843867e-05, + "loss": 1.4316, + "step": 3195 + }, + { + "epoch": 0.15, + "grad_norm": 1.5081471647328097, + "learning_rate": 1.9211666468109612e-05, + "loss": 1.1204, + "step": 3196 + }, + { + "epoch": 0.15, + "grad_norm": 1.354955238405076, + "learning_rate": 1.9211060114240503e-05, + "loss": 1.3384, + "step": 3197 + }, + { + "epoch": 0.15, + "grad_norm": 1.3654795336445933, + "learning_rate": 1.9210453536846053e-05, + "loss": 1.1992, + "step": 3198 + }, + { + "epoch": 0.15, + "grad_norm": 1.4513065860578782, + "learning_rate": 1.920984673594098e-05, + "loss": 1.3887, + "step": 3199 + }, + { + "epoch": 0.15, + "grad_norm": 1.3096306225998307, + "learning_rate": 1.9209239711540014e-05, + "loss": 1.2192, + "step": 3200 + }, + { + "epoch": 0.15, + "grad_norm": 1.4120287328633923, + "learning_rate": 1.9208632463657885e-05, + "loss": 1.3345, + "step": 3201 + }, + { + "epoch": 0.15, + "grad_norm": 1.197070760340014, + "learning_rate": 1.9208024992309325e-05, + "loss": 1.207, + "step": 3202 + }, + { + "epoch": 0.15, + "grad_norm": 1.511202019841012, + "learning_rate": 1.920741729750908e-05, + "loss": 1.1987, + "step": 3203 + }, + { + "epoch": 0.15, + "grad_norm": 1.4149024814629425, + "learning_rate": 1.9206809379271892e-05, + "loss": 1.0085, + "step": 3204 + }, + { + "epoch": 0.15, + "grad_norm": 1.1330514407917776, + "learning_rate": 1.920620123761252e-05, + "loss": 1.2007, + "step": 3205 + }, + { + "epoch": 0.15, + "grad_norm": 1.1489426366975175, + "learning_rate": 1.920559287254572e-05, + "loss": 1.0903, + "step": 3206 + }, + { + "epoch": 0.15, + "grad_norm": 1.3760573644115506, + "learning_rate": 1.920498428408625e-05, + "loss": 1.2456, + "step": 3207 + }, + { + "epoch": 0.15, + "grad_norm": 1.2645293339026886, + "learning_rate": 1.9204375472248885e-05, + "loss": 1.0312, + "step": 3208 + }, + { + "epoch": 0.15, + "grad_norm": 1.2354487232065399, + "learning_rate": 1.9203766437048395e-05, + "loss": 1.2031, + "step": 3209 + }, + { + "epoch": 0.15, + "grad_norm": 1.2034704083831596, + "learning_rate": 1.920315717849956e-05, + "loss": 1.0881, + "step": 3210 + }, + { + "epoch": 0.15, + "grad_norm": 1.151322822918863, + "learning_rate": 1.9202547696617165e-05, + "loss": 1.2607, + "step": 3211 + }, + { + "epoch": 0.15, + "grad_norm": 1.2702232168040244, + "learning_rate": 1.9201937991416003e-05, + "loss": 1.2212, + "step": 3212 + }, + { + "epoch": 0.15, + "grad_norm": 1.3044355176258788, + "learning_rate": 1.920132806291087e-05, + "loss": 1.1333, + "step": 3213 + }, + { + "epoch": 0.15, + "grad_norm": 1.041827686713765, + "learning_rate": 1.9200717911116564e-05, + "loss": 1.1729, + "step": 3214 + }, + { + "epoch": 0.15, + "grad_norm": 1.3204559185451559, + "learning_rate": 1.920010753604789e-05, + "loss": 1.3037, + "step": 3215 + }, + { + "epoch": 0.15, + "grad_norm": 1.0896720324648572, + "learning_rate": 1.9199496937719663e-05, + "loss": 1.229, + "step": 3216 + }, + { + "epoch": 0.15, + "grad_norm": 0.9805985214621347, + "learning_rate": 1.91988861161467e-05, + "loss": 1.1577, + "step": 3217 + }, + { + "epoch": 0.15, + "grad_norm": 1.4020151660195619, + "learning_rate": 1.9198275071343827e-05, + "loss": 1.2295, + "step": 3218 + }, + { + "epoch": 0.15, + "grad_norm": 1.2302906356804546, + "learning_rate": 1.9197663803325867e-05, + "loss": 1.1348, + "step": 3219 + }, + { + "epoch": 0.15, + "grad_norm": 1.3100800663201668, + "learning_rate": 1.9197052312107655e-05, + "loss": 1.1738, + "step": 3220 + }, + { + "epoch": 0.15, + "grad_norm": 1.3234478958093208, + "learning_rate": 1.9196440597704033e-05, + "loss": 1.3457, + "step": 3221 + }, + { + "epoch": 0.15, + "grad_norm": 1.8482811441299494, + "learning_rate": 1.9195828660129842e-05, + "loss": 1.2188, + "step": 3222 + }, + { + "epoch": 0.16, + "grad_norm": 1.227203709863614, + "learning_rate": 1.9195216499399932e-05, + "loss": 1.2363, + "step": 3223 + }, + { + "epoch": 0.16, + "grad_norm": 1.3155049772863951, + "learning_rate": 1.9194604115529163e-05, + "loss": 1.2314, + "step": 3224 + }, + { + "epoch": 0.16, + "grad_norm": 1.4861482994324133, + "learning_rate": 1.919399150853239e-05, + "loss": 1.3667, + "step": 3225 + }, + { + "epoch": 0.16, + "grad_norm": 1.128708611271439, + "learning_rate": 1.9193378678424484e-05, + "loss": 1.3989, + "step": 3226 + }, + { + "epoch": 0.16, + "grad_norm": 1.1681202428452513, + "learning_rate": 1.9192765625220312e-05, + "loss": 1.0029, + "step": 3227 + }, + { + "epoch": 0.16, + "grad_norm": 1.425401977458417, + "learning_rate": 1.9192152348934753e-05, + "loss": 1.353, + "step": 3228 + }, + { + "epoch": 0.16, + "grad_norm": 1.0801798752729985, + "learning_rate": 1.919153884958269e-05, + "loss": 1.0732, + "step": 3229 + }, + { + "epoch": 0.16, + "grad_norm": 1.0796260488188545, + "learning_rate": 1.9190925127179013e-05, + "loss": 1.1543, + "step": 3230 + }, + { + "epoch": 0.16, + "grad_norm": 1.2012466703980946, + "learning_rate": 1.919031118173861e-05, + "loss": 1.3218, + "step": 3231 + }, + { + "epoch": 0.16, + "grad_norm": 1.1982596996210904, + "learning_rate": 1.9189697013276386e-05, + "loss": 1.229, + "step": 3232 + }, + { + "epoch": 0.16, + "grad_norm": 1.2499002436871722, + "learning_rate": 1.9189082621807235e-05, + "loss": 1.0449, + "step": 3233 + }, + { + "epoch": 0.16, + "grad_norm": 1.3996070553498183, + "learning_rate": 1.918846800734608e-05, + "loss": 1.2314, + "step": 3234 + }, + { + "epoch": 0.16, + "grad_norm": 0.915790367861212, + "learning_rate": 1.9187853169907824e-05, + "loss": 1.1587, + "step": 3235 + }, + { + "epoch": 0.16, + "grad_norm": 1.436699776015021, + "learning_rate": 1.9187238109507393e-05, + "loss": 1.2861, + "step": 3236 + }, + { + "epoch": 0.16, + "grad_norm": 1.2735185512935436, + "learning_rate": 1.918662282615971e-05, + "loss": 1.2129, + "step": 3237 + }, + { + "epoch": 0.16, + "grad_norm": 1.0077080237119949, + "learning_rate": 1.9186007319879714e-05, + "loss": 1.0166, + "step": 3238 + }, + { + "epoch": 0.16, + "grad_norm": 1.0663724828074437, + "learning_rate": 1.918539159068233e-05, + "loss": 1.2383, + "step": 3239 + }, + { + "epoch": 0.16, + "grad_norm": 1.2079140395571206, + "learning_rate": 1.9184775638582508e-05, + "loss": 1.1831, + "step": 3240 + }, + { + "epoch": 0.16, + "grad_norm": 1.2289598940511517, + "learning_rate": 1.918415946359519e-05, + "loss": 1.3481, + "step": 3241 + }, + { + "epoch": 0.16, + "grad_norm": 1.286769503110253, + "learning_rate": 1.918354306573533e-05, + "loss": 1.106, + "step": 3242 + }, + { + "epoch": 0.16, + "grad_norm": 1.6097444218054162, + "learning_rate": 1.9182926445017893e-05, + "loss": 1.146, + "step": 3243 + }, + { + "epoch": 0.16, + "grad_norm": 1.1518526248298382, + "learning_rate": 1.9182309601457837e-05, + "loss": 1.1851, + "step": 3244 + }, + { + "epoch": 0.16, + "grad_norm": 1.2750674550853311, + "learning_rate": 1.9181692535070128e-05, + "loss": 1.2788, + "step": 3245 + }, + { + "epoch": 0.16, + "grad_norm": 1.1901764445950018, + "learning_rate": 1.9181075245869744e-05, + "loss": 1.1104, + "step": 3246 + }, + { + "epoch": 0.16, + "grad_norm": 1.4533642433564156, + "learning_rate": 1.9180457733871666e-05, + "loss": 1.4009, + "step": 3247 + }, + { + "epoch": 0.16, + "grad_norm": 1.4421781102816449, + "learning_rate": 1.9179839999090874e-05, + "loss": 1.085, + "step": 3248 + }, + { + "epoch": 0.16, + "grad_norm": 1.2823452074289379, + "learning_rate": 1.9179222041542366e-05, + "loss": 1.1577, + "step": 3249 + }, + { + "epoch": 0.16, + "grad_norm": 1.3410178346160073, + "learning_rate": 1.9178603861241133e-05, + "loss": 1.1592, + "step": 3250 + }, + { + "epoch": 0.16, + "grad_norm": 1.1848751584645898, + "learning_rate": 1.917798545820218e-05, + "loss": 1.1577, + "step": 3251 + }, + { + "epoch": 0.16, + "grad_norm": 1.169865695684074, + "learning_rate": 1.9177366832440505e-05, + "loss": 1.188, + "step": 3252 + }, + { + "epoch": 0.16, + "grad_norm": 1.311134508701534, + "learning_rate": 1.917674798397113e-05, + "loss": 1.1409, + "step": 3253 + }, + { + "epoch": 0.16, + "grad_norm": 1.3934566916824618, + "learning_rate": 1.917612891280907e-05, + "loss": 1.0288, + "step": 3254 + }, + { + "epoch": 0.16, + "grad_norm": 1.4714124528990467, + "learning_rate": 1.9175509618969347e-05, + "loss": 1.2725, + "step": 3255 + }, + { + "epoch": 0.16, + "grad_norm": 1.2340181903968557, + "learning_rate": 1.917489010246699e-05, + "loss": 1.2856, + "step": 3256 + }, + { + "epoch": 0.16, + "grad_norm": 1.2377763824761008, + "learning_rate": 1.917427036331703e-05, + "loss": 1.0815, + "step": 3257 + }, + { + "epoch": 0.16, + "grad_norm": 1.2362250461104585, + "learning_rate": 1.9173650401534514e-05, + "loss": 1.1367, + "step": 3258 + }, + { + "epoch": 0.16, + "grad_norm": 1.340782819488919, + "learning_rate": 1.917303021713448e-05, + "loss": 1.2627, + "step": 3259 + }, + { + "epoch": 0.16, + "grad_norm": 0.9819302563156266, + "learning_rate": 1.9172409810131975e-05, + "loss": 1.1992, + "step": 3260 + }, + { + "epoch": 0.16, + "grad_norm": 1.138762679077594, + "learning_rate": 1.9171789180542066e-05, + "loss": 1.3281, + "step": 3261 + }, + { + "epoch": 0.16, + "grad_norm": 1.160705982941046, + "learning_rate": 1.9171168328379803e-05, + "loss": 1.2065, + "step": 3262 + }, + { + "epoch": 0.16, + "grad_norm": 1.1628760230182205, + "learning_rate": 1.9170547253660253e-05, + "loss": 1.1631, + "step": 3263 + }, + { + "epoch": 0.16, + "grad_norm": 1.3011917238331285, + "learning_rate": 1.9169925956398497e-05, + "loss": 1.2812, + "step": 3264 + }, + { + "epoch": 0.16, + "grad_norm": 1.1834111112133472, + "learning_rate": 1.91693044366096e-05, + "loss": 1.1431, + "step": 3265 + }, + { + "epoch": 0.16, + "grad_norm": 1.4338876265972165, + "learning_rate": 1.9168682694308654e-05, + "loss": 1.2358, + "step": 3266 + }, + { + "epoch": 0.16, + "grad_norm": 1.301993699706082, + "learning_rate": 1.9168060729510742e-05, + "loss": 1.2573, + "step": 3267 + }, + { + "epoch": 0.16, + "grad_norm": 1.141202197193616, + "learning_rate": 1.916743854223096e-05, + "loss": 1.1621, + "step": 3268 + }, + { + "epoch": 0.16, + "grad_norm": 1.2959983736052776, + "learning_rate": 1.9166816132484404e-05, + "loss": 1.1743, + "step": 3269 + }, + { + "epoch": 0.16, + "grad_norm": 1.0994182877608496, + "learning_rate": 1.9166193500286177e-05, + "loss": 1.2266, + "step": 3270 + }, + { + "epoch": 0.16, + "grad_norm": 1.2729245894586005, + "learning_rate": 1.9165570645651392e-05, + "loss": 1.3574, + "step": 3271 + }, + { + "epoch": 0.16, + "grad_norm": 1.198575949537184, + "learning_rate": 1.9164947568595164e-05, + "loss": 1.0708, + "step": 3272 + }, + { + "epoch": 0.16, + "grad_norm": 1.0355880960606632, + "learning_rate": 1.9164324269132608e-05, + "loss": 1.312, + "step": 3273 + }, + { + "epoch": 0.16, + "grad_norm": 1.5012673483608803, + "learning_rate": 1.9163700747278857e-05, + "loss": 1.252, + "step": 3274 + }, + { + "epoch": 0.16, + "grad_norm": 1.3902932645488821, + "learning_rate": 1.9163077003049037e-05, + "loss": 1.1489, + "step": 3275 + }, + { + "epoch": 0.16, + "grad_norm": 1.3697016974445062, + "learning_rate": 1.9162453036458287e-05, + "loss": 1.1445, + "step": 3276 + }, + { + "epoch": 0.16, + "grad_norm": 1.384346632723937, + "learning_rate": 1.9161828847521743e-05, + "loss": 1.1768, + "step": 3277 + }, + { + "epoch": 0.16, + "grad_norm": 1.4532367074046704, + "learning_rate": 1.9161204436254565e-05, + "loss": 1.1938, + "step": 3278 + }, + { + "epoch": 0.16, + "grad_norm": 0.8107543584463126, + "learning_rate": 1.916057980267189e-05, + "loss": 1.2822, + "step": 3279 + }, + { + "epoch": 0.16, + "grad_norm": 1.1403544888436512, + "learning_rate": 1.915995494678889e-05, + "loss": 1.1475, + "step": 3280 + }, + { + "epoch": 0.16, + "grad_norm": 1.3016194371269918, + "learning_rate": 1.9159329868620714e-05, + "loss": 1.2388, + "step": 3281 + }, + { + "epoch": 0.16, + "grad_norm": 0.9402640749686217, + "learning_rate": 1.9158704568182543e-05, + "loss": 1.3003, + "step": 3282 + }, + { + "epoch": 0.16, + "grad_norm": 1.3976297775050865, + "learning_rate": 1.9158079045489547e-05, + "loss": 1.3145, + "step": 3283 + }, + { + "epoch": 0.16, + "grad_norm": 1.3886812138794298, + "learning_rate": 1.9157453300556904e-05, + "loss": 1.3237, + "step": 3284 + }, + { + "epoch": 0.16, + "grad_norm": 1.0848237459843737, + "learning_rate": 1.9156827333399805e-05, + "loss": 1.2085, + "step": 3285 + }, + { + "epoch": 0.16, + "grad_norm": 1.3390284017502314, + "learning_rate": 1.915620114403343e-05, + "loss": 1.1001, + "step": 3286 + }, + { + "epoch": 0.16, + "grad_norm": 1.300741096498043, + "learning_rate": 1.9155574732472983e-05, + "loss": 1.189, + "step": 3287 + }, + { + "epoch": 0.16, + "grad_norm": 1.5626760176984218, + "learning_rate": 1.9154948098733663e-05, + "loss": 1.3203, + "step": 3288 + }, + { + "epoch": 0.16, + "grad_norm": 0.983084039685459, + "learning_rate": 1.9154321242830676e-05, + "loss": 1.1675, + "step": 3289 + }, + { + "epoch": 0.16, + "grad_norm": 1.1459107357901923, + "learning_rate": 1.9153694164779234e-05, + "loss": 1.2236, + "step": 3290 + }, + { + "epoch": 0.16, + "grad_norm": 1.2901140455108338, + "learning_rate": 1.9153066864594558e-05, + "loss": 1.1621, + "step": 3291 + }, + { + "epoch": 0.16, + "grad_norm": 1.3600450985762282, + "learning_rate": 1.9152439342291865e-05, + "loss": 1.2505, + "step": 3292 + }, + { + "epoch": 0.16, + "grad_norm": 1.1335627276282472, + "learning_rate": 1.9151811597886383e-05, + "loss": 1.1792, + "step": 3293 + }, + { + "epoch": 0.16, + "grad_norm": 1.4808421324251864, + "learning_rate": 1.9151183631393352e-05, + "loss": 1.2617, + "step": 3294 + }, + { + "epoch": 0.16, + "grad_norm": 1.3401465204735845, + "learning_rate": 1.9150555442828004e-05, + "loss": 1.2158, + "step": 3295 + }, + { + "epoch": 0.16, + "grad_norm": 1.4346504818605406, + "learning_rate": 1.914992703220559e-05, + "loss": 1.2666, + "step": 3296 + }, + { + "epoch": 0.16, + "grad_norm": 1.0656054663693846, + "learning_rate": 1.9149298399541353e-05, + "loss": 1.3208, + "step": 3297 + }, + { + "epoch": 0.16, + "grad_norm": 1.1598502478700552, + "learning_rate": 1.9148669544850552e-05, + "loss": 1.1533, + "step": 3298 + }, + { + "epoch": 0.16, + "grad_norm": 1.0593546186396363, + "learning_rate": 1.9148040468148442e-05, + "loss": 1.2222, + "step": 3299 + }, + { + "epoch": 0.16, + "grad_norm": 1.0547919593110826, + "learning_rate": 1.9147411169450302e-05, + "loss": 1.1003, + "step": 3300 + }, + { + "epoch": 0.16, + "grad_norm": 1.1105855910899944, + "learning_rate": 1.9146781648771387e-05, + "loss": 1.2837, + "step": 3301 + }, + { + "epoch": 0.16, + "grad_norm": 1.310590119840075, + "learning_rate": 1.9146151906126983e-05, + "loss": 1.1729, + "step": 3302 + }, + { + "epoch": 0.16, + "grad_norm": 1.4155347430441267, + "learning_rate": 1.9145521941532374e-05, + "loss": 1.2896, + "step": 3303 + }, + { + "epoch": 0.16, + "grad_norm": 1.4429837405809112, + "learning_rate": 1.9144891755002837e-05, + "loss": 1.1191, + "step": 3304 + }, + { + "epoch": 0.16, + "grad_norm": 1.1962297401111752, + "learning_rate": 1.9144261346553677e-05, + "loss": 1.0203, + "step": 3305 + }, + { + "epoch": 0.16, + "grad_norm": 1.0701963695661167, + "learning_rate": 1.9143630716200184e-05, + "loss": 1.1548, + "step": 3306 + }, + { + "epoch": 0.16, + "grad_norm": 1.26291148260121, + "learning_rate": 1.9142999863957662e-05, + "loss": 1.1709, + "step": 3307 + }, + { + "epoch": 0.16, + "grad_norm": 1.3717698283498643, + "learning_rate": 1.9142368789841422e-05, + "loss": 1.0994, + "step": 3308 + }, + { + "epoch": 0.16, + "grad_norm": 1.3876681956263295, + "learning_rate": 1.914173749386678e-05, + "loss": 1.1816, + "step": 3309 + }, + { + "epoch": 0.16, + "grad_norm": 1.319963056145712, + "learning_rate": 1.9141105976049054e-05, + "loss": 1.0542, + "step": 3310 + }, + { + "epoch": 0.16, + "grad_norm": 1.1941066140005174, + "learning_rate": 1.914047423640357e-05, + "loss": 1.1282, + "step": 3311 + }, + { + "epoch": 0.16, + "grad_norm": 1.0840098191209004, + "learning_rate": 1.9139842274945655e-05, + "loss": 1.123, + "step": 3312 + }, + { + "epoch": 0.16, + "grad_norm": 1.3403826422183485, + "learning_rate": 1.913921009169065e-05, + "loss": 1.1743, + "step": 3313 + }, + { + "epoch": 0.16, + "grad_norm": 1.369176435098339, + "learning_rate": 1.9138577686653893e-05, + "loss": 1.3311, + "step": 3314 + }, + { + "epoch": 0.16, + "grad_norm": 1.461684735555145, + "learning_rate": 1.913794505985073e-05, + "loss": 1.2915, + "step": 3315 + }, + { + "epoch": 0.16, + "grad_norm": 1.4514423035405328, + "learning_rate": 1.9137312211296516e-05, + "loss": 1.2329, + "step": 3316 + }, + { + "epoch": 0.16, + "grad_norm": 1.3081437474948776, + "learning_rate": 1.9136679141006603e-05, + "loss": 1.2197, + "step": 3317 + }, + { + "epoch": 0.16, + "grad_norm": 1.4101289320201498, + "learning_rate": 1.9136045848996357e-05, + "loss": 1.2383, + "step": 3318 + }, + { + "epoch": 0.16, + "grad_norm": 1.3771496002505703, + "learning_rate": 1.9135412335281152e-05, + "loss": 1.3154, + "step": 3319 + }, + { + "epoch": 0.16, + "grad_norm": 1.1009254004735336, + "learning_rate": 1.913477859987635e-05, + "loss": 1.1445, + "step": 3320 + }, + { + "epoch": 0.16, + "grad_norm": 1.262282660201594, + "learning_rate": 1.913414464279734e-05, + "loss": 1.1157, + "step": 3321 + }, + { + "epoch": 0.16, + "grad_norm": 1.3156485384147394, + "learning_rate": 1.91335104640595e-05, + "loss": 1.2915, + "step": 3322 + }, + { + "epoch": 0.16, + "grad_norm": 1.3776080767379462, + "learning_rate": 1.913287606367822e-05, + "loss": 1.3965, + "step": 3323 + }, + { + "epoch": 0.16, + "grad_norm": 1.1337973942644777, + "learning_rate": 1.9132241441668903e-05, + "loss": 1.3291, + "step": 3324 + }, + { + "epoch": 0.16, + "grad_norm": 1.188572054739497, + "learning_rate": 1.9131606598046936e-05, + "loss": 1.2603, + "step": 3325 + }, + { + "epoch": 0.16, + "grad_norm": 1.3734833200805108, + "learning_rate": 1.9130971532827737e-05, + "loss": 1.2915, + "step": 3326 + }, + { + "epoch": 0.16, + "grad_norm": 0.9970961097642511, + "learning_rate": 1.9130336246026707e-05, + "loss": 1.1426, + "step": 3327 + }, + { + "epoch": 0.16, + "grad_norm": 1.5458997142137034, + "learning_rate": 1.9129700737659273e-05, + "loss": 1.0947, + "step": 3328 + }, + { + "epoch": 0.16, + "grad_norm": 1.4720049727342825, + "learning_rate": 1.9129065007740848e-05, + "loss": 1.1309, + "step": 3329 + }, + { + "epoch": 0.16, + "grad_norm": 1.466103566596986, + "learning_rate": 1.9128429056286865e-05, + "loss": 1.1992, + "step": 3330 + }, + { + "epoch": 0.16, + "grad_norm": 1.470948772150686, + "learning_rate": 1.9127792883312756e-05, + "loss": 1.2915, + "step": 3331 + }, + { + "epoch": 0.16, + "grad_norm": 1.4086016961772665, + "learning_rate": 1.912715648883395e-05, + "loss": 1.271, + "step": 3332 + }, + { + "epoch": 0.16, + "grad_norm": 1.2065711039219162, + "learning_rate": 1.9126519872865906e-05, + "loss": 1.1904, + "step": 3333 + }, + { + "epoch": 0.16, + "grad_norm": 1.4024989805028794, + "learning_rate": 1.9125883035424062e-05, + "loss": 1.2339, + "step": 3334 + }, + { + "epoch": 0.16, + "grad_norm": 1.0098965844345091, + "learning_rate": 1.9125245976523876e-05, + "loss": 1.2573, + "step": 3335 + }, + { + "epoch": 0.16, + "grad_norm": 1.418633570773244, + "learning_rate": 1.9124608696180806e-05, + "loss": 1.2944, + "step": 3336 + }, + { + "epoch": 0.16, + "grad_norm": 0.9870543517266978, + "learning_rate": 1.9123971194410317e-05, + "loss": 1.2388, + "step": 3337 + }, + { + "epoch": 0.16, + "grad_norm": 1.3934344769329365, + "learning_rate": 1.912333347122788e-05, + "loss": 1.0916, + "step": 3338 + }, + { + "epoch": 0.16, + "grad_norm": 0.8544380033941715, + "learning_rate": 1.9122695526648968e-05, + "loss": 1.1729, + "step": 3339 + }, + { + "epoch": 0.16, + "grad_norm": 1.2578918172698272, + "learning_rate": 1.912205736068907e-05, + "loss": 1.2222, + "step": 3340 + }, + { + "epoch": 0.16, + "grad_norm": 1.1303585863480128, + "learning_rate": 1.912141897336366e-05, + "loss": 1.0295, + "step": 3341 + }, + { + "epoch": 0.16, + "grad_norm": 1.0592746270955613, + "learning_rate": 1.9120780364688243e-05, + "loss": 1.2349, + "step": 3342 + }, + { + "epoch": 0.16, + "grad_norm": 1.4202475589515526, + "learning_rate": 1.912014153467831e-05, + "loss": 1.2334, + "step": 3343 + }, + { + "epoch": 0.16, + "grad_norm": 1.455280609476356, + "learning_rate": 1.9119502483349357e-05, + "loss": 1.2881, + "step": 3344 + }, + { + "epoch": 0.16, + "grad_norm": 1.5081191935038936, + "learning_rate": 1.91188632107169e-05, + "loss": 1.2944, + "step": 3345 + }, + { + "epoch": 0.16, + "grad_norm": 1.2659974987489735, + "learning_rate": 1.9118223716796453e-05, + "loss": 1.0742, + "step": 3346 + }, + { + "epoch": 0.16, + "grad_norm": 1.3465612730953058, + "learning_rate": 1.9117584001603533e-05, + "loss": 1.2485, + "step": 3347 + }, + { + "epoch": 0.16, + "grad_norm": 1.2356006268914674, + "learning_rate": 1.911694406515366e-05, + "loss": 1.1958, + "step": 3348 + }, + { + "epoch": 0.16, + "grad_norm": 1.2902679127789012, + "learning_rate": 1.9116303907462365e-05, + "loss": 1.189, + "step": 3349 + }, + { + "epoch": 0.16, + "grad_norm": 1.2014811906424085, + "learning_rate": 1.911566352854519e-05, + "loss": 1.231, + "step": 3350 + }, + { + "epoch": 0.16, + "grad_norm": 1.0184713161248993, + "learning_rate": 1.9115022928417664e-05, + "loss": 1.1655, + "step": 3351 + }, + { + "epoch": 0.16, + "grad_norm": 1.281688394174819, + "learning_rate": 1.911438210709534e-05, + "loss": 1.1758, + "step": 3352 + }, + { + "epoch": 0.16, + "grad_norm": 1.2874084308017681, + "learning_rate": 1.911374106459377e-05, + "loss": 1.2344, + "step": 3353 + }, + { + "epoch": 0.16, + "grad_norm": 1.200483750801493, + "learning_rate": 1.9113099800928502e-05, + "loss": 1.1272, + "step": 3354 + }, + { + "epoch": 0.16, + "grad_norm": 1.1353626533943664, + "learning_rate": 1.9112458316115107e-05, + "loss": 1.2002, + "step": 3355 + }, + { + "epoch": 0.16, + "grad_norm": 1.2683902467615629, + "learning_rate": 1.911181661016914e-05, + "loss": 1.3447, + "step": 3356 + }, + { + "epoch": 0.16, + "grad_norm": 1.3513511814970065, + "learning_rate": 1.911117468310619e-05, + "loss": 1.2573, + "step": 3357 + }, + { + "epoch": 0.16, + "grad_norm": 1.0262668330225175, + "learning_rate": 1.9110532534941822e-05, + "loss": 1.3013, + "step": 3358 + }, + { + "epoch": 0.16, + "grad_norm": 1.4931484525455858, + "learning_rate": 1.910989016569162e-05, + "loss": 1.2612, + "step": 3359 + }, + { + "epoch": 0.16, + "grad_norm": 1.5719462552209862, + "learning_rate": 1.9109247575371177e-05, + "loss": 1.1987, + "step": 3360 + }, + { + "epoch": 0.16, + "grad_norm": 1.3027737470672192, + "learning_rate": 1.9108604763996084e-05, + "loss": 1.2505, + "step": 3361 + }, + { + "epoch": 0.16, + "grad_norm": 1.3514448675497033, + "learning_rate": 1.910796173158194e-05, + "loss": 1.293, + "step": 3362 + }, + { + "epoch": 0.16, + "grad_norm": 1.1191405447678988, + "learning_rate": 1.910731847814435e-05, + "loss": 1.1489, + "step": 3363 + }, + { + "epoch": 0.16, + "grad_norm": 1.2889240736779, + "learning_rate": 1.9106675003698928e-05, + "loss": 1.2178, + "step": 3364 + }, + { + "epoch": 0.16, + "grad_norm": 1.2078311253226441, + "learning_rate": 1.9106031308261284e-05, + "loss": 1.1401, + "step": 3365 + }, + { + "epoch": 0.16, + "grad_norm": 1.334502031894775, + "learning_rate": 1.9105387391847036e-05, + "loss": 1.3076, + "step": 3366 + }, + { + "epoch": 0.16, + "grad_norm": 1.482265548849895, + "learning_rate": 1.910474325447182e-05, + "loss": 1.4399, + "step": 3367 + }, + { + "epoch": 0.16, + "grad_norm": 1.3375032312834523, + "learning_rate": 1.9104098896151256e-05, + "loss": 1.2168, + "step": 3368 + }, + { + "epoch": 0.16, + "grad_norm": 1.1613342978176464, + "learning_rate": 1.9103454316900987e-05, + "loss": 1.1465, + "step": 3369 + }, + { + "epoch": 0.16, + "grad_norm": 1.2513884140212268, + "learning_rate": 1.9102809516736655e-05, + "loss": 1.1582, + "step": 3370 + }, + { + "epoch": 0.16, + "grad_norm": 1.3913983078261047, + "learning_rate": 1.9102164495673906e-05, + "loss": 1.2876, + "step": 3371 + }, + { + "epoch": 0.16, + "grad_norm": 1.0604715227880055, + "learning_rate": 1.9101519253728396e-05, + "loss": 1.1562, + "step": 3372 + }, + { + "epoch": 0.16, + "grad_norm": 1.314723562682753, + "learning_rate": 1.9100873790915776e-05, + "loss": 1.3174, + "step": 3373 + }, + { + "epoch": 0.16, + "grad_norm": 1.1473104216606567, + "learning_rate": 1.9100228107251715e-05, + "loss": 1.1997, + "step": 3374 + }, + { + "epoch": 0.16, + "grad_norm": 1.1765591741576595, + "learning_rate": 1.9099582202751885e-05, + "loss": 1.2642, + "step": 3375 + }, + { + "epoch": 0.16, + "grad_norm": 1.233614072002368, + "learning_rate": 1.9098936077431953e-05, + "loss": 1.1465, + "step": 3376 + }, + { + "epoch": 0.16, + "grad_norm": 1.262141563210866, + "learning_rate": 1.90982897313076e-05, + "loss": 1.3247, + "step": 3377 + }, + { + "epoch": 0.16, + "grad_norm": 1.4277861144132267, + "learning_rate": 1.9097643164394512e-05, + "loss": 1.2842, + "step": 3378 + }, + { + "epoch": 0.16, + "grad_norm": 1.218187941133866, + "learning_rate": 1.9096996376708382e-05, + "loss": 1.2104, + "step": 3379 + }, + { + "epoch": 0.16, + "grad_norm": 1.3927349486467182, + "learning_rate": 1.9096349368264904e-05, + "loss": 1.2695, + "step": 3380 + }, + { + "epoch": 0.16, + "grad_norm": 1.0900937678011182, + "learning_rate": 1.9095702139079774e-05, + "loss": 1.2534, + "step": 3381 + }, + { + "epoch": 0.16, + "grad_norm": 1.7369519439630712, + "learning_rate": 1.9095054689168707e-05, + "loss": 1.3662, + "step": 3382 + }, + { + "epoch": 0.16, + "grad_norm": 1.3570911860418593, + "learning_rate": 1.9094407018547406e-05, + "loss": 1.3281, + "step": 3383 + }, + { + "epoch": 0.16, + "grad_norm": 1.1906135010798118, + "learning_rate": 1.9093759127231594e-05, + "loss": 1.0847, + "step": 3384 + }, + { + "epoch": 0.16, + "grad_norm": 1.0563427075282246, + "learning_rate": 1.9093111015236993e-05, + "loss": 1.186, + "step": 3385 + }, + { + "epoch": 0.16, + "grad_norm": 1.5732127759750896, + "learning_rate": 1.909246268257933e-05, + "loss": 1.2764, + "step": 3386 + }, + { + "epoch": 0.16, + "grad_norm": 0.8099677381674971, + "learning_rate": 1.9091814129274335e-05, + "loss": 1.1763, + "step": 3387 + }, + { + "epoch": 0.16, + "grad_norm": 1.3280054255515636, + "learning_rate": 1.9091165355337754e-05, + "loss": 1.314, + "step": 3388 + }, + { + "epoch": 0.16, + "grad_norm": 1.4297321033066963, + "learning_rate": 1.9090516360785322e-05, + "loss": 1.2827, + "step": 3389 + }, + { + "epoch": 0.16, + "grad_norm": 1.3074057623837756, + "learning_rate": 1.908986714563279e-05, + "loss": 1.2441, + "step": 3390 + }, + { + "epoch": 0.16, + "grad_norm": 1.2139888822239169, + "learning_rate": 1.9089217709895918e-05, + "loss": 1.2158, + "step": 3391 + }, + { + "epoch": 0.16, + "grad_norm": 1.1846089998682012, + "learning_rate": 1.908856805359046e-05, + "loss": 1.0352, + "step": 3392 + }, + { + "epoch": 0.16, + "grad_norm": 1.1576000245559765, + "learning_rate": 1.9087918176732188e-05, + "loss": 1.2886, + "step": 3393 + }, + { + "epoch": 0.16, + "grad_norm": 1.2721878378160747, + "learning_rate": 1.9087268079336865e-05, + "loss": 1.207, + "step": 3394 + }, + { + "epoch": 0.16, + "grad_norm": 1.1974007589573203, + "learning_rate": 1.9086617761420272e-05, + "loss": 1.208, + "step": 3395 + }, + { + "epoch": 0.16, + "grad_norm": 1.3530979246449515, + "learning_rate": 1.908596722299819e-05, + "loss": 0.9688, + "step": 3396 + }, + { + "epoch": 0.16, + "grad_norm": 1.3938382471400188, + "learning_rate": 1.9085316464086403e-05, + "loss": 1.2896, + "step": 3397 + }, + { + "epoch": 0.16, + "grad_norm": 1.1536037564042814, + "learning_rate": 1.9084665484700704e-05, + "loss": 1.1504, + "step": 3398 + }, + { + "epoch": 0.16, + "grad_norm": 1.504639338073859, + "learning_rate": 1.908401428485689e-05, + "loss": 1.4126, + "step": 3399 + }, + { + "epoch": 0.16, + "grad_norm": 1.3060017549578955, + "learning_rate": 1.9083362864570768e-05, + "loss": 1.3384, + "step": 3400 + }, + { + "epoch": 0.16, + "grad_norm": 1.1782226634862971, + "learning_rate": 1.9082711223858136e-05, + "loss": 1.1704, + "step": 3401 + }, + { + "epoch": 0.16, + "grad_norm": 1.5818991824012074, + "learning_rate": 1.9082059362734822e-05, + "loss": 1.3936, + "step": 3402 + }, + { + "epoch": 0.16, + "grad_norm": 1.3790135223863071, + "learning_rate": 1.908140728121663e-05, + "loss": 1.2744, + "step": 3403 + }, + { + "epoch": 0.16, + "grad_norm": 1.5163021351939618, + "learning_rate": 1.9080754979319395e-05, + "loss": 1.3569, + "step": 3404 + }, + { + "epoch": 0.16, + "grad_norm": 1.2562461486340941, + "learning_rate": 1.908010245705894e-05, + "loss": 1.2373, + "step": 3405 + }, + { + "epoch": 0.16, + "grad_norm": 1.3459506691829743, + "learning_rate": 1.90794497144511e-05, + "loss": 1.2646, + "step": 3406 + }, + { + "epoch": 0.16, + "grad_norm": 1.2814780467452862, + "learning_rate": 1.907879675151172e-05, + "loss": 1.2456, + "step": 3407 + }, + { + "epoch": 0.16, + "grad_norm": 1.135136988082013, + "learning_rate": 1.9078143568256644e-05, + "loss": 1.313, + "step": 3408 + }, + { + "epoch": 0.16, + "grad_norm": 1.1906305940110877, + "learning_rate": 1.907749016470172e-05, + "loss": 1.2715, + "step": 3409 + }, + { + "epoch": 0.16, + "grad_norm": 1.340748236076362, + "learning_rate": 1.9076836540862804e-05, + "loss": 1.0942, + "step": 3410 + }, + { + "epoch": 0.16, + "grad_norm": 1.1704827619637361, + "learning_rate": 1.907618269675576e-05, + "loss": 1.1101, + "step": 3411 + }, + { + "epoch": 0.16, + "grad_norm": 1.164027208587502, + "learning_rate": 1.9075528632396456e-05, + "loss": 1.0806, + "step": 3412 + }, + { + "epoch": 0.16, + "grad_norm": 1.3753563549328363, + "learning_rate": 1.907487434780076e-05, + "loss": 1.0962, + "step": 3413 + }, + { + "epoch": 0.16, + "grad_norm": 1.0827122609609527, + "learning_rate": 1.9074219842984553e-05, + "loss": 1.1621, + "step": 3414 + }, + { + "epoch": 0.16, + "grad_norm": 1.1002823442021386, + "learning_rate": 1.9073565117963714e-05, + "loss": 1.187, + "step": 3415 + }, + { + "epoch": 0.16, + "grad_norm": 0.9161460015603792, + "learning_rate": 1.9072910172754137e-05, + "loss": 1.29, + "step": 3416 + }, + { + "epoch": 0.16, + "grad_norm": 1.3085559713220312, + "learning_rate": 1.9072255007371715e-05, + "loss": 1.2495, + "step": 3417 + }, + { + "epoch": 0.16, + "grad_norm": 0.8976772182946938, + "learning_rate": 1.9071599621832338e-05, + "loss": 1.0964, + "step": 3418 + }, + { + "epoch": 0.16, + "grad_norm": 1.1313603954949405, + "learning_rate": 1.9070944016151923e-05, + "loss": 1.1465, + "step": 3419 + }, + { + "epoch": 0.16, + "grad_norm": 1.1601825377628743, + "learning_rate": 1.907028819034637e-05, + "loss": 1.3013, + "step": 3420 + }, + { + "epoch": 0.16, + "grad_norm": 1.46035244829555, + "learning_rate": 1.9069632144431595e-05, + "loss": 1.2744, + "step": 3421 + }, + { + "epoch": 0.16, + "grad_norm": 1.269634538282727, + "learning_rate": 1.9068975878423526e-05, + "loss": 1.2168, + "step": 3422 + }, + { + "epoch": 0.16, + "grad_norm": 1.414267580638929, + "learning_rate": 1.9068319392338082e-05, + "loss": 1.3008, + "step": 3423 + }, + { + "epoch": 0.16, + "grad_norm": 1.134663738953098, + "learning_rate": 1.9067662686191192e-05, + "loss": 1.2153, + "step": 3424 + }, + { + "epoch": 0.16, + "grad_norm": 1.3283651410892432, + "learning_rate": 1.9067005759998797e-05, + "loss": 1.1401, + "step": 3425 + }, + { + "epoch": 0.16, + "grad_norm": 1.3301722702616716, + "learning_rate": 1.906634861377684e-05, + "loss": 1.1577, + "step": 3426 + }, + { + "epoch": 0.16, + "grad_norm": 1.374026038292277, + "learning_rate": 1.9065691247541264e-05, + "loss": 1.1328, + "step": 3427 + }, + { + "epoch": 0.16, + "grad_norm": 1.3808860229369102, + "learning_rate": 1.9065033661308022e-05, + "loss": 1.2217, + "step": 3428 + }, + { + "epoch": 0.16, + "grad_norm": 1.1289704402054235, + "learning_rate": 1.906437585509307e-05, + "loss": 1.1523, + "step": 3429 + }, + { + "epoch": 0.16, + "grad_norm": 1.1878133140223652, + "learning_rate": 1.9063717828912376e-05, + "loss": 1.2261, + "step": 3430 + }, + { + "epoch": 0.17, + "grad_norm": 1.435088834030497, + "learning_rate": 1.9063059582781905e-05, + "loss": 1.2427, + "step": 3431 + }, + { + "epoch": 0.17, + "grad_norm": 1.1468859055752265, + "learning_rate": 1.9062401116717635e-05, + "loss": 1.1221, + "step": 3432 + }, + { + "epoch": 0.17, + "grad_norm": 1.2690723415247267, + "learning_rate": 1.9061742430735538e-05, + "loss": 1.3877, + "step": 3433 + }, + { + "epoch": 0.17, + "grad_norm": 1.1763833657108405, + "learning_rate": 1.90610835248516e-05, + "loss": 1.1272, + "step": 3434 + }, + { + "epoch": 0.17, + "grad_norm": 1.253049845792166, + "learning_rate": 1.9060424399081816e-05, + "loss": 1.3101, + "step": 3435 + }, + { + "epoch": 0.17, + "grad_norm": 1.3901398722097753, + "learning_rate": 1.9059765053442176e-05, + "loss": 1.3579, + "step": 3436 + }, + { + "epoch": 0.17, + "grad_norm": 1.2092532252360655, + "learning_rate": 1.9059105487948683e-05, + "loss": 1.0681, + "step": 3437 + }, + { + "epoch": 0.17, + "grad_norm": 1.126670970926636, + "learning_rate": 1.905844570261734e-05, + "loss": 1.1064, + "step": 3438 + }, + { + "epoch": 0.17, + "grad_norm": 1.475431887500409, + "learning_rate": 1.9057785697464162e-05, + "loss": 1.1992, + "step": 3439 + }, + { + "epoch": 0.17, + "grad_norm": 1.259728378055384, + "learning_rate": 1.9057125472505162e-05, + "loss": 1.1699, + "step": 3440 + }, + { + "epoch": 0.17, + "grad_norm": 1.2203236023076494, + "learning_rate": 1.9056465027756362e-05, + "loss": 1.1802, + "step": 3441 + }, + { + "epoch": 0.17, + "grad_norm": 1.1639194585140666, + "learning_rate": 1.905580436323379e-05, + "loss": 1.1514, + "step": 3442 + }, + { + "epoch": 0.17, + "grad_norm": 1.3793136599864775, + "learning_rate": 1.905514347895348e-05, + "loss": 1.2661, + "step": 3443 + }, + { + "epoch": 0.17, + "grad_norm": 1.3067493388006646, + "learning_rate": 1.905448237493147e-05, + "loss": 1.1523, + "step": 3444 + }, + { + "epoch": 0.17, + "grad_norm": 1.243098571736717, + "learning_rate": 1.9053821051183797e-05, + "loss": 1.2349, + "step": 3445 + }, + { + "epoch": 0.17, + "grad_norm": 1.2056901232127195, + "learning_rate": 1.9053159507726514e-05, + "loss": 1.2588, + "step": 3446 + }, + { + "epoch": 0.17, + "grad_norm": 1.0236529168045698, + "learning_rate": 1.9052497744575675e-05, + "loss": 1.2441, + "step": 3447 + }, + { + "epoch": 0.17, + "grad_norm": 1.2058679998078612, + "learning_rate": 1.9051835761747336e-05, + "loss": 1.1738, + "step": 3448 + }, + { + "epoch": 0.17, + "grad_norm": 0.9569810327051237, + "learning_rate": 1.9051173559257567e-05, + "loss": 1.0222, + "step": 3449 + }, + { + "epoch": 0.17, + "grad_norm": 1.2864669294748858, + "learning_rate": 1.905051113712243e-05, + "loss": 1.2627, + "step": 3450 + }, + { + "epoch": 0.17, + "grad_norm": 1.1350427974720596, + "learning_rate": 1.9049848495358006e-05, + "loss": 1.0508, + "step": 3451 + }, + { + "epoch": 0.17, + "grad_norm": 1.186515849698308, + "learning_rate": 1.9049185633980376e-05, + "loss": 1.187, + "step": 3452 + }, + { + "epoch": 0.17, + "grad_norm": 1.3057233069249745, + "learning_rate": 1.904852255300562e-05, + "loss": 1.269, + "step": 3453 + }, + { + "epoch": 0.17, + "grad_norm": 1.0846472267219252, + "learning_rate": 1.9047859252449837e-05, + "loss": 1.1689, + "step": 3454 + }, + { + "epoch": 0.17, + "grad_norm": 1.1582830338425567, + "learning_rate": 1.9047195732329117e-05, + "loss": 1.1782, + "step": 3455 + }, + { + "epoch": 0.17, + "grad_norm": 1.6229961918120983, + "learning_rate": 1.904653199265956e-05, + "loss": 1.3447, + "step": 3456 + }, + { + "epoch": 0.17, + "grad_norm": 1.3661231189533265, + "learning_rate": 1.904586803345728e-05, + "loss": 1.2031, + "step": 3457 + }, + { + "epoch": 0.17, + "grad_norm": 1.2339232710239585, + "learning_rate": 1.9045203854738386e-05, + "loss": 1.2183, + "step": 3458 + }, + { + "epoch": 0.17, + "grad_norm": 1.3426602344960223, + "learning_rate": 1.9044539456518992e-05, + "loss": 1.2925, + "step": 3459 + }, + { + "epoch": 0.17, + "grad_norm": 1.2698438806140884, + "learning_rate": 1.9043874838815225e-05, + "loss": 1.1416, + "step": 3460 + }, + { + "epoch": 0.17, + "grad_norm": 1.6318930615394507, + "learning_rate": 1.9043210001643215e-05, + "loss": 1.3799, + "step": 3461 + }, + { + "epoch": 0.17, + "grad_norm": 1.0593470016947977, + "learning_rate": 1.9042544945019094e-05, + "loss": 1.105, + "step": 3462 + }, + { + "epoch": 0.17, + "grad_norm": 1.042632124024118, + "learning_rate": 1.9041879668959e-05, + "loss": 1.2153, + "step": 3463 + }, + { + "epoch": 0.17, + "grad_norm": 1.4340591605153423, + "learning_rate": 1.9041214173479075e-05, + "loss": 1.3052, + "step": 3464 + }, + { + "epoch": 0.17, + "grad_norm": 1.304950280103722, + "learning_rate": 1.9040548458595472e-05, + "loss": 1.165, + "step": 3465 + }, + { + "epoch": 0.17, + "grad_norm": 1.1239623083973622, + "learning_rate": 1.9039882524324346e-05, + "loss": 0.9529, + "step": 3466 + }, + { + "epoch": 0.17, + "grad_norm": 1.092030026245719, + "learning_rate": 1.9039216370681858e-05, + "loss": 1.0654, + "step": 3467 + }, + { + "epoch": 0.17, + "grad_norm": 1.433344471504051, + "learning_rate": 1.903854999768417e-05, + "loss": 1.2397, + "step": 3468 + }, + { + "epoch": 0.17, + "grad_norm": 1.2220771015752594, + "learning_rate": 1.9037883405347454e-05, + "loss": 1.1362, + "step": 3469 + }, + { + "epoch": 0.17, + "grad_norm": 1.3435392218176976, + "learning_rate": 1.903721659368789e-05, + "loss": 1.1895, + "step": 3470 + }, + { + "epoch": 0.17, + "grad_norm": 1.3643229528333654, + "learning_rate": 1.9036549562721657e-05, + "loss": 1.3164, + "step": 3471 + }, + { + "epoch": 0.17, + "grad_norm": 1.6660096029887215, + "learning_rate": 1.9035882312464938e-05, + "loss": 1.2007, + "step": 3472 + }, + { + "epoch": 0.17, + "grad_norm": 1.3121658753653158, + "learning_rate": 1.903521484293393e-05, + "loss": 1.1777, + "step": 3473 + }, + { + "epoch": 0.17, + "grad_norm": 1.402801044668341, + "learning_rate": 1.9034547154144832e-05, + "loss": 1.2593, + "step": 3474 + }, + { + "epoch": 0.17, + "grad_norm": 1.3527465012212114, + "learning_rate": 1.9033879246113842e-05, + "loss": 1.2583, + "step": 3475 + }, + { + "epoch": 0.17, + "grad_norm": 1.4168580586071178, + "learning_rate": 1.903321111885717e-05, + "loss": 1.415, + "step": 3476 + }, + { + "epoch": 0.17, + "grad_norm": 1.2701370813552848, + "learning_rate": 1.903254277239103e-05, + "loss": 1.3804, + "step": 3477 + }, + { + "epoch": 0.17, + "grad_norm": 1.0567543778320632, + "learning_rate": 1.9031874206731644e-05, + "loss": 1.2798, + "step": 3478 + }, + { + "epoch": 0.17, + "grad_norm": 1.4409476896058968, + "learning_rate": 1.903120542189523e-05, + "loss": 1.0874, + "step": 3479 + }, + { + "epoch": 0.17, + "grad_norm": 1.4250590688424878, + "learning_rate": 1.903053641789802e-05, + "loss": 1.1982, + "step": 3480 + }, + { + "epoch": 0.17, + "grad_norm": 1.3117211229675227, + "learning_rate": 1.9029867194756248e-05, + "loss": 1.3174, + "step": 3481 + }, + { + "epoch": 0.17, + "grad_norm": 1.2311488687538699, + "learning_rate": 1.902919775248616e-05, + "loss": 1.1777, + "step": 3482 + }, + { + "epoch": 0.17, + "grad_norm": 1.2705094076186667, + "learning_rate": 1.902852809110399e-05, + "loss": 1.2378, + "step": 3483 + }, + { + "epoch": 0.17, + "grad_norm": 1.283364904603143, + "learning_rate": 1.9027858210626e-05, + "loss": 1.2866, + "step": 3484 + }, + { + "epoch": 0.17, + "grad_norm": 1.2668388834029731, + "learning_rate": 1.902718811106844e-05, + "loss": 1.2944, + "step": 3485 + }, + { + "epoch": 0.17, + "grad_norm": 1.309092563470368, + "learning_rate": 1.902651779244757e-05, + "loss": 1.207, + "step": 3486 + }, + { + "epoch": 0.17, + "grad_norm": 1.25982773621972, + "learning_rate": 1.9025847254779662e-05, + "loss": 1.1299, + "step": 3487 + }, + { + "epoch": 0.17, + "grad_norm": 1.0265128470780536, + "learning_rate": 1.9025176498080987e-05, + "loss": 1.3545, + "step": 3488 + }, + { + "epoch": 0.17, + "grad_norm": 1.1715510290275362, + "learning_rate": 1.9024505522367818e-05, + "loss": 1.1064, + "step": 3489 + }, + { + "epoch": 0.17, + "grad_norm": 1.3384391309013637, + "learning_rate": 1.902383432765644e-05, + "loss": 1.1753, + "step": 3490 + }, + { + "epoch": 0.17, + "grad_norm": 1.2530482532800944, + "learning_rate": 1.9023162913963136e-05, + "loss": 1.1685, + "step": 3491 + }, + { + "epoch": 0.17, + "grad_norm": 1.2967187557760993, + "learning_rate": 1.9022491281304214e-05, + "loss": 1.2051, + "step": 3492 + }, + { + "epoch": 0.17, + "grad_norm": 1.5705028665339298, + "learning_rate": 1.9021819429695955e-05, + "loss": 1.3745, + "step": 3493 + }, + { + "epoch": 0.17, + "grad_norm": 1.4197195753828622, + "learning_rate": 1.9021147359154674e-05, + "loss": 1.2056, + "step": 3494 + }, + { + "epoch": 0.17, + "grad_norm": 1.3579660047883084, + "learning_rate": 1.9020475069696676e-05, + "loss": 1.2007, + "step": 3495 + }, + { + "epoch": 0.17, + "grad_norm": 1.1106877664985146, + "learning_rate": 1.9019802561338278e-05, + "loss": 1.0825, + "step": 3496 + }, + { + "epoch": 0.17, + "grad_norm": 0.9691338841231183, + "learning_rate": 1.9019129834095798e-05, + "loss": 1.231, + "step": 3497 + }, + { + "epoch": 0.17, + "grad_norm": 1.111083004103819, + "learning_rate": 1.9018456887985558e-05, + "loss": 1.2393, + "step": 3498 + }, + { + "epoch": 0.17, + "grad_norm": 1.1044886069304185, + "learning_rate": 1.9017783723023895e-05, + "loss": 1.1514, + "step": 3499 + }, + { + "epoch": 0.17, + "grad_norm": 1.568967648481884, + "learning_rate": 1.901711033922714e-05, + "loss": 1.3447, + "step": 3500 + }, + { + "epoch": 0.17, + "grad_norm": 1.2882593593769245, + "learning_rate": 1.9016436736611637e-05, + "loss": 1.2881, + "step": 3501 + }, + { + "epoch": 0.17, + "grad_norm": 1.5871391928864795, + "learning_rate": 1.9015762915193727e-05, + "loss": 1.3506, + "step": 3502 + }, + { + "epoch": 0.17, + "grad_norm": 1.3148739027941356, + "learning_rate": 1.901508887498977e-05, + "loss": 1.1252, + "step": 3503 + }, + { + "epoch": 0.17, + "grad_norm": 1.3340529043604683, + "learning_rate": 1.9014414616016116e-05, + "loss": 1.2192, + "step": 3504 + }, + { + "epoch": 0.17, + "grad_norm": 0.9483817732541353, + "learning_rate": 1.901374013828913e-05, + "loss": 1.1694, + "step": 3505 + }, + { + "epoch": 0.17, + "grad_norm": 1.2427106132764631, + "learning_rate": 1.901306544182518e-05, + "loss": 1.2344, + "step": 3506 + }, + { + "epoch": 0.17, + "grad_norm": 1.7107141636413918, + "learning_rate": 1.9012390526640638e-05, + "loss": 1.3071, + "step": 3507 + }, + { + "epoch": 0.17, + "grad_norm": 1.5163754552813258, + "learning_rate": 1.9011715392751882e-05, + "loss": 1.167, + "step": 3508 + }, + { + "epoch": 0.17, + "grad_norm": 1.1507178230866322, + "learning_rate": 1.9011040040175295e-05, + "loss": 1.186, + "step": 3509 + }, + { + "epoch": 0.17, + "grad_norm": 1.1312044338316605, + "learning_rate": 1.9010364468927267e-05, + "loss": 1.2285, + "step": 3510 + }, + { + "epoch": 0.17, + "grad_norm": 1.2565688955407572, + "learning_rate": 1.900968867902419e-05, + "loss": 1.2886, + "step": 3511 + }, + { + "epoch": 0.17, + "grad_norm": 1.2056617483792658, + "learning_rate": 1.900901267048247e-05, + "loss": 1.1968, + "step": 3512 + }, + { + "epoch": 0.17, + "grad_norm": 1.2219931676063458, + "learning_rate": 1.9008336443318505e-05, + "loss": 1.2271, + "step": 3513 + }, + { + "epoch": 0.17, + "grad_norm": 1.793085385712175, + "learning_rate": 1.900765999754871e-05, + "loss": 1.2383, + "step": 3514 + }, + { + "epoch": 0.17, + "grad_norm": 1.226931841342433, + "learning_rate": 1.9006983333189493e-05, + "loss": 1.2441, + "step": 3515 + }, + { + "epoch": 0.17, + "grad_norm": 1.3629737279476966, + "learning_rate": 1.9006306450257278e-05, + "loss": 1.3965, + "step": 3516 + }, + { + "epoch": 0.17, + "grad_norm": 1.192179781260468, + "learning_rate": 1.90056293487685e-05, + "loss": 1.2568, + "step": 3517 + }, + { + "epoch": 0.17, + "grad_norm": 1.3823821965532772, + "learning_rate": 1.900495202873957e-05, + "loss": 1.1807, + "step": 3518 + }, + { + "epoch": 0.17, + "grad_norm": 1.2191159058335401, + "learning_rate": 1.9004274490186944e-05, + "loss": 1.249, + "step": 3519 + }, + { + "epoch": 0.17, + "grad_norm": 1.277065057424459, + "learning_rate": 1.9003596733127058e-05, + "loss": 1.208, + "step": 3520 + }, + { + "epoch": 0.17, + "grad_norm": 1.31355811000581, + "learning_rate": 1.9002918757576358e-05, + "loss": 1.3193, + "step": 3521 + }, + { + "epoch": 0.17, + "grad_norm": 1.3820486167605102, + "learning_rate": 1.9002240563551293e-05, + "loss": 1.4141, + "step": 3522 + }, + { + "epoch": 0.17, + "grad_norm": 1.436566153388607, + "learning_rate": 1.9001562151068327e-05, + "loss": 1.3042, + "step": 3523 + }, + { + "epoch": 0.17, + "grad_norm": 1.4006583375161052, + "learning_rate": 1.900088352014392e-05, + "loss": 1.3613, + "step": 3524 + }, + { + "epoch": 0.17, + "grad_norm": 1.2369682909837802, + "learning_rate": 1.900020467079454e-05, + "loss": 1.2388, + "step": 3525 + }, + { + "epoch": 0.17, + "grad_norm": 1.3149014128965715, + "learning_rate": 1.899952560303666e-05, + "loss": 1.2471, + "step": 3526 + }, + { + "epoch": 0.17, + "grad_norm": 1.1909194974878563, + "learning_rate": 1.899884631688676e-05, + "loss": 1.2017, + "step": 3527 + }, + { + "epoch": 0.17, + "grad_norm": 1.2394565843489909, + "learning_rate": 1.8998166812361325e-05, + "loss": 1.1646, + "step": 3528 + }, + { + "epoch": 0.17, + "grad_norm": 1.2224360132467993, + "learning_rate": 1.8997487089476844e-05, + "loss": 1.167, + "step": 3529 + }, + { + "epoch": 0.17, + "grad_norm": 1.1449503131802734, + "learning_rate": 1.8996807148249817e-05, + "loss": 1.2485, + "step": 3530 + }, + { + "epoch": 0.17, + "grad_norm": 1.7713327005071833, + "learning_rate": 1.899612698869673e-05, + "loss": 1.3789, + "step": 3531 + }, + { + "epoch": 0.17, + "grad_norm": 1.195505679817318, + "learning_rate": 1.8995446610834104e-05, + "loss": 1.4048, + "step": 3532 + }, + { + "epoch": 0.17, + "grad_norm": 1.171285195268107, + "learning_rate": 1.8994766014678444e-05, + "loss": 1.1606, + "step": 3533 + }, + { + "epoch": 0.17, + "grad_norm": 1.1718742921141723, + "learning_rate": 1.8994085200246263e-05, + "loss": 1.1362, + "step": 3534 + }, + { + "epoch": 0.17, + "grad_norm": 1.2819988648831853, + "learning_rate": 1.8993404167554085e-05, + "loss": 1.2344, + "step": 3535 + }, + { + "epoch": 0.17, + "grad_norm": 1.6577945550692839, + "learning_rate": 1.899272291661844e-05, + "loss": 1.3525, + "step": 3536 + }, + { + "epoch": 0.17, + "grad_norm": 1.2369649973696033, + "learning_rate": 1.899204144745585e-05, + "loss": 1.2676, + "step": 3537 + }, + { + "epoch": 0.17, + "grad_norm": 0.9052190857530272, + "learning_rate": 1.8991359760082864e-05, + "loss": 1.2368, + "step": 3538 + }, + { + "epoch": 0.17, + "grad_norm": 1.7076724212876317, + "learning_rate": 1.8990677854516017e-05, + "loss": 1.5767, + "step": 3539 + }, + { + "epoch": 0.17, + "grad_norm": 1.4706620336401766, + "learning_rate": 1.898999573077186e-05, + "loss": 1.1851, + "step": 3540 + }, + { + "epoch": 0.17, + "grad_norm": 1.7024157383228438, + "learning_rate": 1.8989313388866944e-05, + "loss": 1.3984, + "step": 3541 + }, + { + "epoch": 0.17, + "grad_norm": 1.3204783535091573, + "learning_rate": 1.8988630828817827e-05, + "loss": 1.2314, + "step": 3542 + }, + { + "epoch": 0.17, + "grad_norm": 1.2481863061470888, + "learning_rate": 1.898794805064108e-05, + "loss": 1.2471, + "step": 3543 + }, + { + "epoch": 0.17, + "grad_norm": 1.4312016194235124, + "learning_rate": 1.8987265054353262e-05, + "loss": 1.314, + "step": 3544 + }, + { + "epoch": 0.17, + "grad_norm": 1.3460563758169157, + "learning_rate": 1.8986581839970956e-05, + "loss": 1.2046, + "step": 3545 + }, + { + "epoch": 0.17, + "grad_norm": 1.1047090815518474, + "learning_rate": 1.898589840751073e-05, + "loss": 1.2437, + "step": 3546 + }, + { + "epoch": 0.17, + "grad_norm": 1.3928567406062362, + "learning_rate": 1.898521475698918e-05, + "loss": 1.2544, + "step": 3547 + }, + { + "epoch": 0.17, + "grad_norm": 1.4400888953585051, + "learning_rate": 1.8984530888422897e-05, + "loss": 1.1929, + "step": 3548 + }, + { + "epoch": 0.17, + "grad_norm": 1.197208644107919, + "learning_rate": 1.8983846801828468e-05, + "loss": 1.2246, + "step": 3549 + }, + { + "epoch": 0.17, + "grad_norm": 1.2905525034008316, + "learning_rate": 1.8983162497222497e-05, + "loss": 1.0415, + "step": 3550 + }, + { + "epoch": 0.17, + "grad_norm": 1.3094268738408616, + "learning_rate": 1.898247797462159e-05, + "loss": 1.2783, + "step": 3551 + }, + { + "epoch": 0.17, + "grad_norm": 1.6058081925737373, + "learning_rate": 1.8981793234042362e-05, + "loss": 1.0774, + "step": 3552 + }, + { + "epoch": 0.17, + "grad_norm": 1.3552176275314036, + "learning_rate": 1.8981108275501422e-05, + "loss": 1.0388, + "step": 3553 + }, + { + "epoch": 0.17, + "grad_norm": 1.3166927931567363, + "learning_rate": 1.8980423099015402e-05, + "loss": 1.3281, + "step": 3554 + }, + { + "epoch": 0.17, + "grad_norm": 1.2069203959085173, + "learning_rate": 1.8979737704600923e-05, + "loss": 1.2607, + "step": 3555 + }, + { + "epoch": 0.17, + "grad_norm": 1.2242322131968812, + "learning_rate": 1.8979052092274615e-05, + "loss": 1.3237, + "step": 3556 + }, + { + "epoch": 0.17, + "grad_norm": 1.2284040713645001, + "learning_rate": 1.897836626205312e-05, + "loss": 1.1724, + "step": 3557 + }, + { + "epoch": 0.17, + "grad_norm": 1.0548807383135015, + "learning_rate": 1.8977680213953084e-05, + "loss": 1.2734, + "step": 3558 + }, + { + "epoch": 0.17, + "grad_norm": 1.5177787557455222, + "learning_rate": 1.8976993947991145e-05, + "loss": 1.2666, + "step": 3559 + }, + { + "epoch": 0.17, + "grad_norm": 1.2641678959368694, + "learning_rate": 1.897630746418397e-05, + "loss": 1.2617, + "step": 3560 + }, + { + "epoch": 0.17, + "grad_norm": 1.3905737912018168, + "learning_rate": 1.8975620762548207e-05, + "loss": 1.188, + "step": 3561 + }, + { + "epoch": 0.17, + "grad_norm": 1.061550015247218, + "learning_rate": 1.8974933843100526e-05, + "loss": 0.99, + "step": 3562 + }, + { + "epoch": 0.17, + "grad_norm": 0.9708175444710299, + "learning_rate": 1.8974246705857594e-05, + "loss": 1.2656, + "step": 3563 + }, + { + "epoch": 0.17, + "grad_norm": 1.4653176366219918, + "learning_rate": 1.8973559350836092e-05, + "loss": 1.0999, + "step": 3564 + }, + { + "epoch": 0.17, + "grad_norm": 1.3604778417732646, + "learning_rate": 1.8972871778052688e-05, + "loss": 1.1616, + "step": 3565 + }, + { + "epoch": 0.17, + "grad_norm": 0.9276852885748215, + "learning_rate": 1.8972183987524076e-05, + "loss": 1.1377, + "step": 3566 + }, + { + "epoch": 0.17, + "grad_norm": 1.3235576841209322, + "learning_rate": 1.8971495979266946e-05, + "loss": 1.1748, + "step": 3567 + }, + { + "epoch": 0.17, + "grad_norm": 1.2058984556877652, + "learning_rate": 1.897080775329799e-05, + "loss": 1.1245, + "step": 3568 + }, + { + "epoch": 0.17, + "grad_norm": 1.1789568403336739, + "learning_rate": 1.8970119309633916e-05, + "loss": 1.1919, + "step": 3569 + }, + { + "epoch": 0.17, + "grad_norm": 1.4182993377699844, + "learning_rate": 1.8969430648291425e-05, + "loss": 1.2188, + "step": 3570 + }, + { + "epoch": 0.17, + "grad_norm": 1.5400017767698304, + "learning_rate": 1.896874176928723e-05, + "loss": 1.2949, + "step": 3571 + }, + { + "epoch": 0.17, + "grad_norm": 1.158846839349765, + "learning_rate": 1.896805267263805e-05, + "loss": 0.9419, + "step": 3572 + }, + { + "epoch": 0.17, + "grad_norm": 1.2360413200287725, + "learning_rate": 1.89673633583606e-05, + "loss": 1.2114, + "step": 3573 + }, + { + "epoch": 0.17, + "grad_norm": 1.2518618639230985, + "learning_rate": 1.896667382647162e-05, + "loss": 1.1694, + "step": 3574 + }, + { + "epoch": 0.17, + "grad_norm": 1.259223721267541, + "learning_rate": 1.8965984076987835e-05, + "loss": 1.2622, + "step": 3575 + }, + { + "epoch": 0.17, + "grad_norm": 1.0783113239056321, + "learning_rate": 1.8965294109925984e-05, + "loss": 1.1772, + "step": 3576 + }, + { + "epoch": 0.17, + "grad_norm": 1.1785923503817366, + "learning_rate": 1.8964603925302813e-05, + "loss": 1.2212, + "step": 3577 + }, + { + "epoch": 0.17, + "grad_norm": 1.2877150923147622, + "learning_rate": 1.8963913523135062e-05, + "loss": 1.0962, + "step": 3578 + }, + { + "epoch": 0.17, + "grad_norm": 1.3956768856720954, + "learning_rate": 1.8963222903439495e-05, + "loss": 1.3525, + "step": 3579 + }, + { + "epoch": 0.17, + "grad_norm": 1.4754851269973712, + "learning_rate": 1.896253206623287e-05, + "loss": 1.1655, + "step": 3580 + }, + { + "epoch": 0.17, + "grad_norm": 1.28801775962986, + "learning_rate": 1.8961841011531948e-05, + "loss": 1.2334, + "step": 3581 + }, + { + "epoch": 0.17, + "grad_norm": 1.1022009828558093, + "learning_rate": 1.8961149739353502e-05, + "loss": 1.1558, + "step": 3582 + }, + { + "epoch": 0.17, + "grad_norm": 1.3058212216231804, + "learning_rate": 1.8960458249714303e-05, + "loss": 1.2021, + "step": 3583 + }, + { + "epoch": 0.17, + "grad_norm": 1.373422130822137, + "learning_rate": 1.8959766542631137e-05, + "loss": 1.2388, + "step": 3584 + }, + { + "epoch": 0.17, + "grad_norm": 1.2168947645536965, + "learning_rate": 1.8959074618120784e-05, + "loss": 1.1143, + "step": 3585 + }, + { + "epoch": 0.17, + "grad_norm": 1.1435409271267243, + "learning_rate": 1.8958382476200038e-05, + "loss": 1.1816, + "step": 3586 + }, + { + "epoch": 0.17, + "grad_norm": 1.1310150354771937, + "learning_rate": 1.8957690116885697e-05, + "loss": 1.0913, + "step": 3587 + }, + { + "epoch": 0.17, + "grad_norm": 1.3291267264748485, + "learning_rate": 1.895699754019456e-05, + "loss": 1.2051, + "step": 3588 + }, + { + "epoch": 0.17, + "grad_norm": 1.4400889258373317, + "learning_rate": 1.8956304746143433e-05, + "loss": 1.2808, + "step": 3589 + }, + { + "epoch": 0.17, + "grad_norm": 1.283446745301071, + "learning_rate": 1.895561173474913e-05, + "loss": 1.2095, + "step": 3590 + }, + { + "epoch": 0.17, + "grad_norm": 1.2154893123256345, + "learning_rate": 1.8954918506028467e-05, + "loss": 1.2456, + "step": 3591 + }, + { + "epoch": 0.17, + "grad_norm": 1.5662101578989225, + "learning_rate": 1.895422505999827e-05, + "loss": 1.3628, + "step": 3592 + }, + { + "epoch": 0.17, + "grad_norm": 1.2758310255591814, + "learning_rate": 1.895353139667536e-05, + "loss": 1.271, + "step": 3593 + }, + { + "epoch": 0.17, + "grad_norm": 1.3280614545300653, + "learning_rate": 1.895283751607658e-05, + "loss": 1.4087, + "step": 3594 + }, + { + "epoch": 0.17, + "grad_norm": 1.2326827291385223, + "learning_rate": 1.895214341821876e-05, + "loss": 1.3315, + "step": 3595 + }, + { + "epoch": 0.17, + "grad_norm": 1.1553501120457372, + "learning_rate": 1.8951449103118743e-05, + "loss": 1.2036, + "step": 3596 + }, + { + "epoch": 0.17, + "grad_norm": 0.9376955759375059, + "learning_rate": 1.8950754570793384e-05, + "loss": 1.1777, + "step": 3597 + }, + { + "epoch": 0.17, + "grad_norm": 1.3048526908850409, + "learning_rate": 1.8950059821259535e-05, + "loss": 1.3281, + "step": 3598 + }, + { + "epoch": 0.17, + "grad_norm": 1.1420596151614748, + "learning_rate": 1.8949364854534054e-05, + "loss": 1.3262, + "step": 3599 + }, + { + "epoch": 0.17, + "grad_norm": 1.0154276464122167, + "learning_rate": 1.8948669670633812e-05, + "loss": 1.1226, + "step": 3600 + }, + { + "epoch": 0.17, + "grad_norm": 1.2413960124425851, + "learning_rate": 1.894797426957567e-05, + "loss": 1.1206, + "step": 3601 + }, + { + "epoch": 0.17, + "grad_norm": 1.2656053569812258, + "learning_rate": 1.894727865137651e-05, + "loss": 1.144, + "step": 3602 + }, + { + "epoch": 0.17, + "grad_norm": 1.4954938300722012, + "learning_rate": 1.894658281605321e-05, + "loss": 1.3447, + "step": 3603 + }, + { + "epoch": 0.17, + "grad_norm": 1.3261257661775274, + "learning_rate": 1.894588676362265e-05, + "loss": 1.2837, + "step": 3604 + }, + { + "epoch": 0.17, + "grad_norm": 0.9853776149677267, + "learning_rate": 1.8945190494101734e-05, + "loss": 1.1685, + "step": 3605 + }, + { + "epoch": 0.17, + "grad_norm": 1.2902588886626283, + "learning_rate": 1.894449400750735e-05, + "loss": 1.1929, + "step": 3606 + }, + { + "epoch": 0.17, + "grad_norm": 1.2780292976969228, + "learning_rate": 1.89437973038564e-05, + "loss": 1.2656, + "step": 3607 + }, + { + "epoch": 0.17, + "grad_norm": 1.1931754662349552, + "learning_rate": 1.8943100383165794e-05, + "loss": 1.1729, + "step": 3608 + }, + { + "epoch": 0.17, + "grad_norm": 1.3640246593042178, + "learning_rate": 1.894240324545244e-05, + "loss": 1.1162, + "step": 3609 + }, + { + "epoch": 0.17, + "grad_norm": 1.2492943513402945, + "learning_rate": 1.894170589073326e-05, + "loss": 1.0857, + "step": 3610 + }, + { + "epoch": 0.17, + "grad_norm": 1.0731786207130478, + "learning_rate": 1.8941008319025174e-05, + "loss": 1.1665, + "step": 3611 + }, + { + "epoch": 0.17, + "grad_norm": 1.3320710137359644, + "learning_rate": 1.894031053034511e-05, + "loss": 1.1431, + "step": 3612 + }, + { + "epoch": 0.17, + "grad_norm": 1.5379752790524126, + "learning_rate": 1.8939612524710003e-05, + "loss": 1.1792, + "step": 3613 + }, + { + "epoch": 0.17, + "grad_norm": 1.0549033594565047, + "learning_rate": 1.8938914302136792e-05, + "loss": 1.0781, + "step": 3614 + }, + { + "epoch": 0.17, + "grad_norm": 1.2449546713246005, + "learning_rate": 1.893821586264241e-05, + "loss": 1.2656, + "step": 3615 + }, + { + "epoch": 0.17, + "grad_norm": 1.1972093397998964, + "learning_rate": 1.8937517206243828e-05, + "loss": 1.2036, + "step": 3616 + }, + { + "epoch": 0.17, + "grad_norm": 1.3089700578820127, + "learning_rate": 1.8936818332957983e-05, + "loss": 1.061, + "step": 3617 + }, + { + "epoch": 0.17, + "grad_norm": 1.0829759536104209, + "learning_rate": 1.8936119242801837e-05, + "loss": 1.1758, + "step": 3618 + }, + { + "epoch": 0.17, + "grad_norm": 1.5780819327370337, + "learning_rate": 1.8935419935792358e-05, + "loss": 1.0601, + "step": 3619 + }, + { + "epoch": 0.17, + "grad_norm": 1.3071436132541263, + "learning_rate": 1.8934720411946513e-05, + "loss": 1.0972, + "step": 3620 + }, + { + "epoch": 0.17, + "grad_norm": 1.181345352197252, + "learning_rate": 1.8934020671281285e-05, + "loss": 1.0649, + "step": 3621 + }, + { + "epoch": 0.17, + "grad_norm": 1.232643308269587, + "learning_rate": 1.8933320713813647e-05, + "loss": 1.0249, + "step": 3622 + }, + { + "epoch": 0.17, + "grad_norm": 1.2484904192824775, + "learning_rate": 1.8932620539560587e-05, + "loss": 1.2412, + "step": 3623 + }, + { + "epoch": 0.17, + "grad_norm": 1.2471219870753092, + "learning_rate": 1.8931920148539098e-05, + "loss": 1.1865, + "step": 3624 + }, + { + "epoch": 0.17, + "grad_norm": 1.3653289599732004, + "learning_rate": 1.8931219540766174e-05, + "loss": 1.2881, + "step": 3625 + }, + { + "epoch": 0.17, + "grad_norm": 0.9769790700111366, + "learning_rate": 1.8930518716258816e-05, + "loss": 1.2764, + "step": 3626 + }, + { + "epoch": 0.17, + "grad_norm": 1.4968585655888613, + "learning_rate": 1.8929817675034033e-05, + "loss": 1.2822, + "step": 3627 + }, + { + "epoch": 0.17, + "grad_norm": 0.8731129049381064, + "learning_rate": 1.8929116417108837e-05, + "loss": 1.2368, + "step": 3628 + }, + { + "epoch": 0.17, + "grad_norm": 1.559846542338931, + "learning_rate": 1.8928414942500243e-05, + "loss": 1.1904, + "step": 3629 + }, + { + "epoch": 0.17, + "grad_norm": 1.0816138495478058, + "learning_rate": 1.892771325122528e-05, + "loss": 1.1426, + "step": 3630 + }, + { + "epoch": 0.17, + "grad_norm": 1.4059530599081784, + "learning_rate": 1.892701134330097e-05, + "loss": 1.417, + "step": 3631 + }, + { + "epoch": 0.17, + "grad_norm": 1.0538247257106157, + "learning_rate": 1.8926309218744348e-05, + "loss": 1.229, + "step": 3632 + }, + { + "epoch": 0.17, + "grad_norm": 1.26765068040345, + "learning_rate": 1.892560687757245e-05, + "loss": 1.2139, + "step": 3633 + }, + { + "epoch": 0.17, + "grad_norm": 1.304834710329461, + "learning_rate": 1.8924904319802327e-05, + "loss": 1.3115, + "step": 3634 + }, + { + "epoch": 0.17, + "grad_norm": 1.4661961393931422, + "learning_rate": 1.892420154545102e-05, + "loss": 1.1284, + "step": 3635 + }, + { + "epoch": 0.17, + "grad_norm": 1.3541217377736774, + "learning_rate": 1.892349855453559e-05, + "loss": 1.1138, + "step": 3636 + }, + { + "epoch": 0.17, + "grad_norm": 1.0806088054347471, + "learning_rate": 1.892279534707309e-05, + "loss": 1.2568, + "step": 3637 + }, + { + "epoch": 0.17, + "grad_norm": 1.39767776903962, + "learning_rate": 1.892209192308059e-05, + "loss": 1.2871, + "step": 3638 + }, + { + "epoch": 0.18, + "grad_norm": 1.2489802188673031, + "learning_rate": 1.8921388282575157e-05, + "loss": 1.0835, + "step": 3639 + }, + { + "epoch": 0.18, + "grad_norm": 0.8472664305747158, + "learning_rate": 1.8920684425573865e-05, + "loss": 1.1353, + "step": 3640 + }, + { + "epoch": 0.18, + "grad_norm": 1.132805781721083, + "learning_rate": 1.8919980352093802e-05, + "loss": 1.0493, + "step": 3641 + }, + { + "epoch": 0.18, + "grad_norm": 1.1077117143097983, + "learning_rate": 1.8919276062152043e-05, + "loss": 1.1758, + "step": 3642 + }, + { + "epoch": 0.18, + "grad_norm": 1.0509894199531091, + "learning_rate": 1.891857155576569e-05, + "loss": 1.0325, + "step": 3643 + }, + { + "epoch": 0.18, + "grad_norm": 0.9711251239399257, + "learning_rate": 1.891786683295183e-05, + "loss": 1.2583, + "step": 3644 + }, + { + "epoch": 0.18, + "grad_norm": 1.3158561594565725, + "learning_rate": 1.891716189372757e-05, + "loss": 1.2573, + "step": 3645 + }, + { + "epoch": 0.18, + "grad_norm": 1.0916462694002096, + "learning_rate": 1.8916456738110013e-05, + "loss": 1.2935, + "step": 3646 + }, + { + "epoch": 0.18, + "grad_norm": 0.9478399918198038, + "learning_rate": 1.8915751366116275e-05, + "loss": 1.1636, + "step": 3647 + }, + { + "epoch": 0.18, + "grad_norm": 1.7001234054145673, + "learning_rate": 1.891504577776347e-05, + "loss": 1.3154, + "step": 3648 + }, + { + "epoch": 0.18, + "grad_norm": 1.1940893897843095, + "learning_rate": 1.8914339973068725e-05, + "loss": 1.0752, + "step": 3649 + }, + { + "epoch": 0.18, + "grad_norm": 0.9028736489994187, + "learning_rate": 1.8913633952049168e-05, + "loss": 1.3711, + "step": 3650 + }, + { + "epoch": 0.18, + "grad_norm": 0.8516567756285512, + "learning_rate": 1.8912927714721922e-05, + "loss": 1.1704, + "step": 3651 + }, + { + "epoch": 0.18, + "grad_norm": 1.252034486717095, + "learning_rate": 1.8912221261104136e-05, + "loss": 1.2583, + "step": 3652 + }, + { + "epoch": 0.18, + "grad_norm": 0.8309900414580386, + "learning_rate": 1.891151459121295e-05, + "loss": 1.1611, + "step": 3653 + }, + { + "epoch": 0.18, + "grad_norm": 1.2492653827046116, + "learning_rate": 1.8910807705065514e-05, + "loss": 1.1101, + "step": 3654 + }, + { + "epoch": 0.18, + "grad_norm": 1.3049209212974258, + "learning_rate": 1.8910100602678976e-05, + "loss": 1.2456, + "step": 3655 + }, + { + "epoch": 0.18, + "grad_norm": 1.2010094897817882, + "learning_rate": 1.89093932840705e-05, + "loss": 1.2217, + "step": 3656 + }, + { + "epoch": 0.18, + "grad_norm": 1.2563973267924362, + "learning_rate": 1.8908685749257257e-05, + "loss": 1.3003, + "step": 3657 + }, + { + "epoch": 0.18, + "grad_norm": 1.316051770696901, + "learning_rate": 1.8907977998256405e-05, + "loss": 1.228, + "step": 3658 + }, + { + "epoch": 0.18, + "grad_norm": 1.1421829775841765, + "learning_rate": 1.890727003108513e-05, + "loss": 1.2207, + "step": 3659 + }, + { + "epoch": 0.18, + "grad_norm": 1.1131430822264827, + "learning_rate": 1.89065618477606e-05, + "loss": 1.2241, + "step": 3660 + }, + { + "epoch": 0.18, + "grad_norm": 0.6918465772015077, + "learning_rate": 1.890585344830001e-05, + "loss": 1.249, + "step": 3661 + }, + { + "epoch": 0.18, + "grad_norm": 1.57902913654736, + "learning_rate": 1.8905144832720547e-05, + "loss": 1.3584, + "step": 3662 + }, + { + "epoch": 0.18, + "grad_norm": 1.1870140476364872, + "learning_rate": 1.8904436001039408e-05, + "loss": 1.2651, + "step": 3663 + }, + { + "epoch": 0.18, + "grad_norm": 1.2188930192951837, + "learning_rate": 1.8903726953273794e-05, + "loss": 1.1948, + "step": 3664 + }, + { + "epoch": 0.18, + "grad_norm": 1.3877812380598347, + "learning_rate": 1.8903017689440914e-05, + "loss": 1.2407, + "step": 3665 + }, + { + "epoch": 0.18, + "grad_norm": 1.4908036406515537, + "learning_rate": 1.8902308209557976e-05, + "loss": 1.1934, + "step": 3666 + }, + { + "epoch": 0.18, + "grad_norm": 1.2857774425289568, + "learning_rate": 1.89015985136422e-05, + "loss": 1.25, + "step": 3667 + }, + { + "epoch": 0.18, + "grad_norm": 1.0597580582070834, + "learning_rate": 1.8900888601710804e-05, + "loss": 1.0247, + "step": 3668 + }, + { + "epoch": 0.18, + "grad_norm": 0.9230086729612763, + "learning_rate": 1.890017847378102e-05, + "loss": 1.0859, + "step": 3669 + }, + { + "epoch": 0.18, + "grad_norm": 1.264643691157165, + "learning_rate": 1.889946812987008e-05, + "loss": 1.3506, + "step": 3670 + }, + { + "epoch": 0.18, + "grad_norm": 1.580986450900499, + "learning_rate": 1.8898757569995218e-05, + "loss": 1.1865, + "step": 3671 + }, + { + "epoch": 0.18, + "grad_norm": 1.491744764326917, + "learning_rate": 1.8898046794173684e-05, + "loss": 1.23, + "step": 3672 + }, + { + "epoch": 0.18, + "grad_norm": 1.253224421154269, + "learning_rate": 1.889733580242272e-05, + "loss": 1.2017, + "step": 3673 + }, + { + "epoch": 0.18, + "grad_norm": 1.3133833148737182, + "learning_rate": 1.8896624594759583e-05, + "loss": 1.1631, + "step": 3674 + }, + { + "epoch": 0.18, + "grad_norm": 1.4663772417957828, + "learning_rate": 1.889591317120153e-05, + "loss": 1.5039, + "step": 3675 + }, + { + "epoch": 0.18, + "grad_norm": 1.6223238572187488, + "learning_rate": 1.889520153176583e-05, + "loss": 1.2764, + "step": 3676 + }, + { + "epoch": 0.18, + "grad_norm": 1.2070971492301161, + "learning_rate": 1.889448967646974e-05, + "loss": 1.2271, + "step": 3677 + }, + { + "epoch": 0.18, + "grad_norm": 1.3014235975740673, + "learning_rate": 1.8893777605330553e-05, + "loss": 1.1318, + "step": 3678 + }, + { + "epoch": 0.18, + "grad_norm": 0.9208121000618421, + "learning_rate": 1.8893065318365534e-05, + "loss": 1.0352, + "step": 3679 + }, + { + "epoch": 0.18, + "grad_norm": 1.1522134695093353, + "learning_rate": 1.8892352815591973e-05, + "loss": 1.0713, + "step": 3680 + }, + { + "epoch": 0.18, + "grad_norm": 1.0639145441085855, + "learning_rate": 1.8891640097027163e-05, + "loss": 1.2422, + "step": 3681 + }, + { + "epoch": 0.18, + "grad_norm": 1.3138911940540887, + "learning_rate": 1.8890927162688398e-05, + "loss": 1.2266, + "step": 3682 + }, + { + "epoch": 0.18, + "grad_norm": 1.1737051108063394, + "learning_rate": 1.8890214012592977e-05, + "loss": 1.1069, + "step": 3683 + }, + { + "epoch": 0.18, + "grad_norm": 1.3586156443044153, + "learning_rate": 1.8889500646758203e-05, + "loss": 1.1348, + "step": 3684 + }, + { + "epoch": 0.18, + "grad_norm": 1.1501428815039894, + "learning_rate": 1.8888787065201393e-05, + "loss": 1.1577, + "step": 3685 + }, + { + "epoch": 0.18, + "grad_norm": 1.1280832333763993, + "learning_rate": 1.8888073267939865e-05, + "loss": 1.1606, + "step": 3686 + }, + { + "epoch": 0.18, + "grad_norm": 0.9736109700976114, + "learning_rate": 1.8887359254990937e-05, + "loss": 1.356, + "step": 3687 + }, + { + "epoch": 0.18, + "grad_norm": 1.351859816499735, + "learning_rate": 1.8886645026371937e-05, + "loss": 1.1782, + "step": 3688 + }, + { + "epoch": 0.18, + "grad_norm": 1.0222057352188998, + "learning_rate": 1.8885930582100195e-05, + "loss": 1.292, + "step": 3689 + }, + { + "epoch": 0.18, + "grad_norm": 1.3108316496931331, + "learning_rate": 1.888521592219305e-05, + "loss": 1.2314, + "step": 3690 + }, + { + "epoch": 0.18, + "grad_norm": 1.2772211534972377, + "learning_rate": 1.8884501046667847e-05, + "loss": 1.1333, + "step": 3691 + }, + { + "epoch": 0.18, + "grad_norm": 1.2364247972408073, + "learning_rate": 1.888378595554193e-05, + "loss": 1.2129, + "step": 3692 + }, + { + "epoch": 0.18, + "grad_norm": 1.1219474513970653, + "learning_rate": 1.888307064883266e-05, + "loss": 1.1914, + "step": 3693 + }, + { + "epoch": 0.18, + "grad_norm": 1.160069885816394, + "learning_rate": 1.8882355126557382e-05, + "loss": 1.2056, + "step": 3694 + }, + { + "epoch": 0.18, + "grad_norm": 1.3341110907576428, + "learning_rate": 1.8881639388733468e-05, + "loss": 1.1699, + "step": 3695 + }, + { + "epoch": 0.18, + "grad_norm": 1.1767751648654203, + "learning_rate": 1.888092343537829e-05, + "loss": 1.1108, + "step": 3696 + }, + { + "epoch": 0.18, + "grad_norm": 1.149779115536934, + "learning_rate": 1.8880207266509215e-05, + "loss": 1.2134, + "step": 3697 + }, + { + "epoch": 0.18, + "grad_norm": 1.3572112889100154, + "learning_rate": 1.887949088214363e-05, + "loss": 1.3096, + "step": 3698 + }, + { + "epoch": 0.18, + "grad_norm": 1.1029074734937117, + "learning_rate": 1.887877428229891e-05, + "loss": 1.1587, + "step": 3699 + }, + { + "epoch": 0.18, + "grad_norm": 1.405590892741505, + "learning_rate": 1.887805746699245e-05, + "loss": 1.3447, + "step": 3700 + }, + { + "epoch": 0.18, + "grad_norm": 1.4401741644069757, + "learning_rate": 1.8877340436241645e-05, + "loss": 1.4341, + "step": 3701 + }, + { + "epoch": 0.18, + "grad_norm": 1.2997791277392416, + "learning_rate": 1.8876623190063898e-05, + "loss": 1.1738, + "step": 3702 + }, + { + "epoch": 0.18, + "grad_norm": 1.2716386359224923, + "learning_rate": 1.887590572847661e-05, + "loss": 1.1953, + "step": 3703 + }, + { + "epoch": 0.18, + "grad_norm": 1.0667280793171918, + "learning_rate": 1.887518805149719e-05, + "loss": 1.0391, + "step": 3704 + }, + { + "epoch": 0.18, + "grad_norm": 1.1378099978690308, + "learning_rate": 1.887447015914306e-05, + "loss": 1.1694, + "step": 3705 + }, + { + "epoch": 0.18, + "grad_norm": 1.2474015827841545, + "learning_rate": 1.8873752051431635e-05, + "loss": 1.0288, + "step": 3706 + }, + { + "epoch": 0.18, + "grad_norm": 1.2208256301025968, + "learning_rate": 1.887303372838035e-05, + "loss": 1.1328, + "step": 3707 + }, + { + "epoch": 0.18, + "grad_norm": 1.2618329431392548, + "learning_rate": 1.8872315190006624e-05, + "loss": 1.3228, + "step": 3708 + }, + { + "epoch": 0.18, + "grad_norm": 0.9863934260957063, + "learning_rate": 1.8871596436327908e-05, + "loss": 1.1633, + "step": 3709 + }, + { + "epoch": 0.18, + "grad_norm": 1.1554600980134562, + "learning_rate": 1.8870877467361633e-05, + "loss": 1.2202, + "step": 3710 + }, + { + "epoch": 0.18, + "grad_norm": 1.2883754057537926, + "learning_rate": 1.887015828312525e-05, + "loss": 1.1479, + "step": 3711 + }, + { + "epoch": 0.18, + "grad_norm": 1.2102941619254475, + "learning_rate": 1.8869438883636212e-05, + "loss": 1.2695, + "step": 3712 + }, + { + "epoch": 0.18, + "grad_norm": 0.863423135284559, + "learning_rate": 1.8868719268911978e-05, + "loss": 1.1611, + "step": 3713 + }, + { + "epoch": 0.18, + "grad_norm": 1.0259796841418751, + "learning_rate": 1.8867999438970007e-05, + "loss": 1.3765, + "step": 3714 + }, + { + "epoch": 0.18, + "grad_norm": 1.0722650926952464, + "learning_rate": 1.886727939382777e-05, + "loss": 1.3281, + "step": 3715 + }, + { + "epoch": 0.18, + "grad_norm": 1.2082455332211228, + "learning_rate": 1.886655913350274e-05, + "loss": 1.1191, + "step": 3716 + }, + { + "epoch": 0.18, + "grad_norm": 1.1157686428334062, + "learning_rate": 1.8865838658012397e-05, + "loss": 1.1606, + "step": 3717 + }, + { + "epoch": 0.18, + "grad_norm": 1.1748478839337293, + "learning_rate": 1.8865117967374225e-05, + "loss": 1.2896, + "step": 3718 + }, + { + "epoch": 0.18, + "grad_norm": 1.2611356543925225, + "learning_rate": 1.886439706160571e-05, + "loss": 1.2612, + "step": 3719 + }, + { + "epoch": 0.18, + "grad_norm": 1.242920559591456, + "learning_rate": 1.8863675940724344e-05, + "loss": 1.2227, + "step": 3720 + }, + { + "epoch": 0.18, + "grad_norm": 1.2432927870940162, + "learning_rate": 1.886295460474763e-05, + "loss": 1.312, + "step": 3721 + }, + { + "epoch": 0.18, + "grad_norm": 1.462449010154458, + "learning_rate": 1.8862233053693074e-05, + "loss": 1.4336, + "step": 3722 + }, + { + "epoch": 0.18, + "grad_norm": 1.4179011200585068, + "learning_rate": 1.8861511287578188e-05, + "loss": 1.2515, + "step": 3723 + }, + { + "epoch": 0.18, + "grad_norm": 1.4889714384957256, + "learning_rate": 1.8860789306420478e-05, + "loss": 1.2163, + "step": 3724 + }, + { + "epoch": 0.18, + "grad_norm": 1.16835799299909, + "learning_rate": 1.8860067110237478e-05, + "loss": 1.2334, + "step": 3725 + }, + { + "epoch": 0.18, + "grad_norm": 1.5256834674882724, + "learning_rate": 1.88593446990467e-05, + "loss": 1.3501, + "step": 3726 + }, + { + "epoch": 0.18, + "grad_norm": 1.2079897914234985, + "learning_rate": 1.885862207286568e-05, + "loss": 1.2061, + "step": 3727 + }, + { + "epoch": 0.18, + "grad_norm": 1.0972876049522948, + "learning_rate": 1.8857899231711956e-05, + "loss": 1.1016, + "step": 3728 + }, + { + "epoch": 0.18, + "grad_norm": 1.0947956419622145, + "learning_rate": 1.885717617560307e-05, + "loss": 1.2715, + "step": 3729 + }, + { + "epoch": 0.18, + "grad_norm": 1.234416509562634, + "learning_rate": 1.8856452904556564e-05, + "loss": 1.5317, + "step": 3730 + }, + { + "epoch": 0.18, + "grad_norm": 1.337767371652006, + "learning_rate": 1.8855729418589994e-05, + "loss": 1.2173, + "step": 3731 + }, + { + "epoch": 0.18, + "grad_norm": 1.2032048901301533, + "learning_rate": 1.885500571772091e-05, + "loss": 1.0674, + "step": 3732 + }, + { + "epoch": 0.18, + "grad_norm": 1.2379614735665712, + "learning_rate": 1.8854281801966886e-05, + "loss": 1.1455, + "step": 3733 + }, + { + "epoch": 0.18, + "grad_norm": 1.1658780991774809, + "learning_rate": 1.8853557671345477e-05, + "loss": 1.127, + "step": 3734 + }, + { + "epoch": 0.18, + "grad_norm": 1.4071086124357972, + "learning_rate": 1.885283332587426e-05, + "loss": 1.334, + "step": 3735 + }, + { + "epoch": 0.18, + "grad_norm": 1.3444186444640516, + "learning_rate": 1.8852108765570814e-05, + "loss": 1.3262, + "step": 3736 + }, + { + "epoch": 0.18, + "grad_norm": 1.3019296465288477, + "learning_rate": 1.8851383990452726e-05, + "loss": 1.0913, + "step": 3737 + }, + { + "epoch": 0.18, + "grad_norm": 1.5475351121046275, + "learning_rate": 1.8850659000537575e-05, + "loss": 1.1741, + "step": 3738 + }, + { + "epoch": 0.18, + "grad_norm": 1.2504138840101464, + "learning_rate": 1.884993379584296e-05, + "loss": 1.2012, + "step": 3739 + }, + { + "epoch": 0.18, + "grad_norm": 1.4442662620732705, + "learning_rate": 1.8849208376386477e-05, + "loss": 1.0894, + "step": 3740 + }, + { + "epoch": 0.18, + "grad_norm": 1.5679425089521672, + "learning_rate": 1.8848482742185737e-05, + "loss": 1.1177, + "step": 3741 + }, + { + "epoch": 0.18, + "grad_norm": 1.429682427297351, + "learning_rate": 1.8847756893258336e-05, + "loss": 1.3677, + "step": 3742 + }, + { + "epoch": 0.18, + "grad_norm": 1.2266382132361986, + "learning_rate": 1.88470308296219e-05, + "loss": 1.2163, + "step": 3743 + }, + { + "epoch": 0.18, + "grad_norm": 1.3613235197136249, + "learning_rate": 1.884630455129404e-05, + "loss": 1.3115, + "step": 3744 + }, + { + "epoch": 0.18, + "grad_norm": 1.439085301882927, + "learning_rate": 1.8845578058292392e-05, + "loss": 1.3105, + "step": 3745 + }, + { + "epoch": 0.18, + "grad_norm": 1.418423496826268, + "learning_rate": 1.8844851350634573e-05, + "loss": 1.062, + "step": 3746 + }, + { + "epoch": 0.18, + "grad_norm": 1.1752753028903469, + "learning_rate": 1.8844124428338223e-05, + "loss": 1.2651, + "step": 3747 + }, + { + "epoch": 0.18, + "grad_norm": 1.310704397389374, + "learning_rate": 1.8843397291420982e-05, + "loss": 1.1152, + "step": 3748 + }, + { + "epoch": 0.18, + "grad_norm": 1.3333521281252285, + "learning_rate": 1.88426699399005e-05, + "loss": 1.2134, + "step": 3749 + }, + { + "epoch": 0.18, + "grad_norm": 1.2237217081444318, + "learning_rate": 1.884194237379442e-05, + "loss": 1.2729, + "step": 3750 + }, + { + "epoch": 0.18, + "grad_norm": 1.4067633171011622, + "learning_rate": 1.8841214593120405e-05, + "loss": 1.1343, + "step": 3751 + }, + { + "epoch": 0.18, + "grad_norm": 1.030748578560186, + "learning_rate": 1.8840486597896114e-05, + "loss": 1.3579, + "step": 3752 + }, + { + "epoch": 0.18, + "grad_norm": 1.3373837197422835, + "learning_rate": 1.8839758388139208e-05, + "loss": 1.2285, + "step": 3753 + }, + { + "epoch": 0.18, + "grad_norm": 1.161523665885835, + "learning_rate": 1.8839029963867362e-05, + "loss": 1.146, + "step": 3754 + }, + { + "epoch": 0.18, + "grad_norm": 1.1488379132663662, + "learning_rate": 1.883830132509826e-05, + "loss": 1.1396, + "step": 3755 + }, + { + "epoch": 0.18, + "grad_norm": 1.283233218592384, + "learning_rate": 1.8837572471849574e-05, + "loss": 1.1816, + "step": 3756 + }, + { + "epoch": 0.18, + "grad_norm": 1.305138447674614, + "learning_rate": 1.8836843404138992e-05, + "loss": 1.1074, + "step": 3757 + }, + { + "epoch": 0.18, + "grad_norm": 1.149463996054947, + "learning_rate": 1.883611412198421e-05, + "loss": 1.2524, + "step": 3758 + }, + { + "epoch": 0.18, + "grad_norm": 1.3319400841835491, + "learning_rate": 1.8835384625402927e-05, + "loss": 1.2778, + "step": 3759 + }, + { + "epoch": 0.18, + "grad_norm": 1.1460389568818394, + "learning_rate": 1.883465491441284e-05, + "loss": 1.1016, + "step": 3760 + }, + { + "epoch": 0.18, + "grad_norm": 1.3692038818273549, + "learning_rate": 1.8833924989031663e-05, + "loss": 1.2842, + "step": 3761 + }, + { + "epoch": 0.18, + "grad_norm": 1.1918474248417137, + "learning_rate": 1.8833194849277105e-05, + "loss": 1.2241, + "step": 3762 + }, + { + "epoch": 0.18, + "grad_norm": 1.56327597800025, + "learning_rate": 1.8832464495166883e-05, + "loss": 1.2397, + "step": 3763 + }, + { + "epoch": 0.18, + "grad_norm": 1.1632329712146787, + "learning_rate": 1.8831733926718725e-05, + "loss": 1.1328, + "step": 3764 + }, + { + "epoch": 0.18, + "grad_norm": 1.1770576580086047, + "learning_rate": 1.8831003143950357e-05, + "loss": 1.2446, + "step": 3765 + }, + { + "epoch": 0.18, + "grad_norm": 1.1730230019162593, + "learning_rate": 1.8830272146879513e-05, + "loss": 1.1033, + "step": 3766 + }, + { + "epoch": 0.18, + "grad_norm": 1.1005964851702894, + "learning_rate": 1.882954093552394e-05, + "loss": 1.2847, + "step": 3767 + }, + { + "epoch": 0.18, + "grad_norm": 1.5140957230136651, + "learning_rate": 1.8828809509901366e-05, + "loss": 1.1431, + "step": 3768 + }, + { + "epoch": 0.18, + "grad_norm": 1.3434076548471803, + "learning_rate": 1.8828077870029554e-05, + "loss": 1.1401, + "step": 3769 + }, + { + "epoch": 0.18, + "grad_norm": 1.3278875953104918, + "learning_rate": 1.8827346015926253e-05, + "loss": 1.3286, + "step": 3770 + }, + { + "epoch": 0.18, + "grad_norm": 1.30623818909198, + "learning_rate": 1.8826613947609225e-05, + "loss": 1.2637, + "step": 3771 + }, + { + "epoch": 0.18, + "grad_norm": 1.6193891024745923, + "learning_rate": 1.8825881665096237e-05, + "loss": 1.2202, + "step": 3772 + }, + { + "epoch": 0.18, + "grad_norm": 1.303574844509896, + "learning_rate": 1.8825149168405055e-05, + "loss": 1.335, + "step": 3773 + }, + { + "epoch": 0.18, + "grad_norm": 1.333488492486229, + "learning_rate": 1.8824416457553455e-05, + "loss": 1.2764, + "step": 3774 + }, + { + "epoch": 0.18, + "grad_norm": 1.1483100902172072, + "learning_rate": 1.882368353255922e-05, + "loss": 1.1299, + "step": 3775 + }, + { + "epoch": 0.18, + "grad_norm": 1.0690048071825466, + "learning_rate": 1.8822950393440135e-05, + "loss": 1.1426, + "step": 3776 + }, + { + "epoch": 0.18, + "grad_norm": 1.073616231866021, + "learning_rate": 1.8822217040213994e-05, + "loss": 1.2705, + "step": 3777 + }, + { + "epoch": 0.18, + "grad_norm": 1.439643442987364, + "learning_rate": 1.882148347289859e-05, + "loss": 1.2148, + "step": 3778 + }, + { + "epoch": 0.18, + "grad_norm": 1.3650361032701719, + "learning_rate": 1.8820749691511723e-05, + "loss": 1.3389, + "step": 3779 + }, + { + "epoch": 0.18, + "grad_norm": 1.3163335386066355, + "learning_rate": 1.8820015696071202e-05, + "loss": 1.2085, + "step": 3780 + }, + { + "epoch": 0.18, + "grad_norm": 1.0858565596555425, + "learning_rate": 1.881928148659484e-05, + "loss": 1.2026, + "step": 3781 + }, + { + "epoch": 0.18, + "grad_norm": 1.498516697511471, + "learning_rate": 1.8818547063100453e-05, + "loss": 1.313, + "step": 3782 + }, + { + "epoch": 0.18, + "grad_norm": 1.1377353541692377, + "learning_rate": 1.8817812425605864e-05, + "loss": 1.2095, + "step": 3783 + }, + { + "epoch": 0.18, + "grad_norm": 1.4100640063579568, + "learning_rate": 1.88170775741289e-05, + "loss": 1.2461, + "step": 3784 + }, + { + "epoch": 0.18, + "grad_norm": 1.1440090994169752, + "learning_rate": 1.8816342508687388e-05, + "loss": 1.2305, + "step": 3785 + }, + { + "epoch": 0.18, + "grad_norm": 1.3787698783532052, + "learning_rate": 1.8815607229299177e-05, + "loss": 1.311, + "step": 3786 + }, + { + "epoch": 0.18, + "grad_norm": 1.3737421671533245, + "learning_rate": 1.8814871735982102e-05, + "loss": 1.1519, + "step": 3787 + }, + { + "epoch": 0.18, + "grad_norm": 1.2524807661501367, + "learning_rate": 1.8814136028754016e-05, + "loss": 1.1626, + "step": 3788 + }, + { + "epoch": 0.18, + "grad_norm": 1.337601149623135, + "learning_rate": 1.8813400107632763e-05, + "loss": 1.2817, + "step": 3789 + }, + { + "epoch": 0.18, + "grad_norm": 1.1558400418305887, + "learning_rate": 1.881266397263622e-05, + "loss": 1.1753, + "step": 3790 + }, + { + "epoch": 0.18, + "grad_norm": 1.3364311218657992, + "learning_rate": 1.8811927623782227e-05, + "loss": 1.2715, + "step": 3791 + }, + { + "epoch": 0.18, + "grad_norm": 1.5076759390985712, + "learning_rate": 1.8811191061088673e-05, + "loss": 1.23, + "step": 3792 + }, + { + "epoch": 0.18, + "grad_norm": 1.1204120136565816, + "learning_rate": 1.8810454284573424e-05, + "loss": 1.0903, + "step": 3793 + }, + { + "epoch": 0.18, + "grad_norm": 1.6802007416923506, + "learning_rate": 1.880971729425436e-05, + "loss": 1.2148, + "step": 3794 + }, + { + "epoch": 0.18, + "grad_norm": 1.454270523731031, + "learning_rate": 1.880898009014937e-05, + "loss": 1.21, + "step": 3795 + }, + { + "epoch": 0.18, + "grad_norm": 1.3459560743645265, + "learning_rate": 1.880824267227633e-05, + "loss": 1.2773, + "step": 3796 + }, + { + "epoch": 0.18, + "grad_norm": 1.1387966852435205, + "learning_rate": 1.880750504065315e-05, + "loss": 1.1646, + "step": 3797 + }, + { + "epoch": 0.18, + "grad_norm": 1.3375672706286275, + "learning_rate": 1.8806767195297724e-05, + "loss": 1.2461, + "step": 3798 + }, + { + "epoch": 0.18, + "grad_norm": 1.3050725420711256, + "learning_rate": 1.880602913622796e-05, + "loss": 1.0225, + "step": 3799 + }, + { + "epoch": 0.18, + "grad_norm": 1.4088485801730106, + "learning_rate": 1.880529086346176e-05, + "loss": 1.3271, + "step": 3800 + }, + { + "epoch": 0.18, + "grad_norm": 1.5373577673294068, + "learning_rate": 1.880455237701705e-05, + "loss": 1.1387, + "step": 3801 + }, + { + "epoch": 0.18, + "grad_norm": 1.1999183668612758, + "learning_rate": 1.880381367691175e-05, + "loss": 1.168, + "step": 3802 + }, + { + "epoch": 0.18, + "grad_norm": 1.1043503335111857, + "learning_rate": 1.880307476316378e-05, + "loss": 1.3496, + "step": 3803 + }, + { + "epoch": 0.18, + "grad_norm": 1.4458193677077311, + "learning_rate": 1.8802335635791076e-05, + "loss": 1.3052, + "step": 3804 + }, + { + "epoch": 0.18, + "grad_norm": 1.2558432906925352, + "learning_rate": 1.880159629481157e-05, + "loss": 1.1895, + "step": 3805 + }, + { + "epoch": 0.18, + "grad_norm": 1.1388344337073755, + "learning_rate": 1.880085674024321e-05, + "loss": 1.1035, + "step": 3806 + }, + { + "epoch": 0.18, + "grad_norm": 1.4996358351220862, + "learning_rate": 1.8800116972103935e-05, + "loss": 1.2993, + "step": 3807 + }, + { + "epoch": 0.18, + "grad_norm": 1.1338488703690763, + "learning_rate": 1.8799376990411708e-05, + "loss": 1.071, + "step": 3808 + }, + { + "epoch": 0.18, + "grad_norm": 1.2807907645427477, + "learning_rate": 1.8798636795184473e-05, + "loss": 1.3374, + "step": 3809 + }, + { + "epoch": 0.18, + "grad_norm": 1.4471101731064926, + "learning_rate": 1.8797896386440202e-05, + "loss": 1.2466, + "step": 3810 + }, + { + "epoch": 0.18, + "grad_norm": 1.3026443115209179, + "learning_rate": 1.879715576419686e-05, + "loss": 1.1382, + "step": 3811 + }, + { + "epoch": 0.18, + "grad_norm": 1.4583755041022222, + "learning_rate": 1.8796414928472417e-05, + "loss": 1.187, + "step": 3812 + }, + { + "epoch": 0.18, + "grad_norm": 1.0043399525900796, + "learning_rate": 1.879567387928485e-05, + "loss": 1.3135, + "step": 3813 + }, + { + "epoch": 0.18, + "grad_norm": 1.2031043274170252, + "learning_rate": 1.8794932616652152e-05, + "loss": 1.1592, + "step": 3814 + }, + { + "epoch": 0.18, + "grad_norm": 1.382158804800577, + "learning_rate": 1.8794191140592303e-05, + "loss": 1.1953, + "step": 3815 + }, + { + "epoch": 0.18, + "grad_norm": 1.417447970303134, + "learning_rate": 1.8793449451123296e-05, + "loss": 1.1924, + "step": 3816 + }, + { + "epoch": 0.18, + "grad_norm": 1.1792468720277238, + "learning_rate": 1.879270754826313e-05, + "loss": 1.0928, + "step": 3817 + }, + { + "epoch": 0.18, + "grad_norm": 1.0697752745825342, + "learning_rate": 1.8791965432029812e-05, + "loss": 1.3208, + "step": 3818 + }, + { + "epoch": 0.18, + "grad_norm": 1.3039658221473143, + "learning_rate": 1.8791223102441347e-05, + "loss": 1.207, + "step": 3819 + }, + { + "epoch": 0.18, + "grad_norm": 1.2643289308563401, + "learning_rate": 1.8790480559515756e-05, + "loss": 1.2407, + "step": 3820 + }, + { + "epoch": 0.18, + "grad_norm": 1.0489146698141427, + "learning_rate": 1.878973780327105e-05, + "loss": 1.3149, + "step": 3821 + }, + { + "epoch": 0.18, + "grad_norm": 1.0460145907126146, + "learning_rate": 1.878899483372526e-05, + "loss": 0.9519, + "step": 3822 + }, + { + "epoch": 0.18, + "grad_norm": 0.8675795702203667, + "learning_rate": 1.8788251650896407e-05, + "loss": 1.2515, + "step": 3823 + }, + { + "epoch": 0.18, + "grad_norm": 1.3120237838550692, + "learning_rate": 1.8787508254802538e-05, + "loss": 1.2598, + "step": 3824 + }, + { + "epoch": 0.18, + "grad_norm": 1.1427240029204004, + "learning_rate": 1.8786764645461684e-05, + "loss": 1.187, + "step": 3825 + }, + { + "epoch": 0.18, + "grad_norm": 1.3074253271676255, + "learning_rate": 1.8786020822891892e-05, + "loss": 1.3105, + "step": 3826 + }, + { + "epoch": 0.18, + "grad_norm": 0.9681325250284729, + "learning_rate": 1.8785276787111216e-05, + "loss": 1.3135, + "step": 3827 + }, + { + "epoch": 0.18, + "grad_norm": 1.2734646269311545, + "learning_rate": 1.878453253813771e-05, + "loss": 1.2324, + "step": 3828 + }, + { + "epoch": 0.18, + "grad_norm": 1.235195368426881, + "learning_rate": 1.878378807598943e-05, + "loss": 1.2749, + "step": 3829 + }, + { + "epoch": 0.18, + "grad_norm": 1.3021276837479405, + "learning_rate": 1.8783043400684447e-05, + "loss": 1.3218, + "step": 3830 + }, + { + "epoch": 0.18, + "grad_norm": 1.3404915641881334, + "learning_rate": 1.878229851224083e-05, + "loss": 1.2339, + "step": 3831 + }, + { + "epoch": 0.18, + "grad_norm": 1.2111111755003627, + "learning_rate": 1.8781553410676658e-05, + "loss": 1.1831, + "step": 3832 + }, + { + "epoch": 0.18, + "grad_norm": 1.2031021206989294, + "learning_rate": 1.878080809601001e-05, + "loss": 1.1899, + "step": 3833 + }, + { + "epoch": 0.18, + "grad_norm": 1.1719337081564711, + "learning_rate": 1.878006256825897e-05, + "loss": 1.1152, + "step": 3834 + }, + { + "epoch": 0.18, + "grad_norm": 1.215657390160487, + "learning_rate": 1.8779316827441636e-05, + "loss": 1.1465, + "step": 3835 + }, + { + "epoch": 0.18, + "grad_norm": 1.160060174894915, + "learning_rate": 1.87785708735761e-05, + "loss": 1.095, + "step": 3836 + }, + { + "epoch": 0.18, + "grad_norm": 1.4826166128691558, + "learning_rate": 1.8777824706680466e-05, + "loss": 1.3706, + "step": 3837 + }, + { + "epoch": 0.18, + "grad_norm": 1.5238786812581249, + "learning_rate": 1.8777078326772843e-05, + "loss": 1.2354, + "step": 3838 + }, + { + "epoch": 0.18, + "grad_norm": 1.1781480276801504, + "learning_rate": 1.877633173387134e-05, + "loss": 1.2222, + "step": 3839 + }, + { + "epoch": 0.18, + "grad_norm": 1.2353840607166555, + "learning_rate": 1.8775584927994074e-05, + "loss": 1.2397, + "step": 3840 + }, + { + "epoch": 0.18, + "grad_norm": 1.3523137596210626, + "learning_rate": 1.877483790915917e-05, + "loss": 1.1323, + "step": 3841 + }, + { + "epoch": 0.18, + "grad_norm": 1.1636779975353855, + "learning_rate": 1.877409067738476e-05, + "loss": 1.0442, + "step": 3842 + }, + { + "epoch": 0.18, + "grad_norm": 1.3061011866152812, + "learning_rate": 1.877334323268897e-05, + "loss": 1.2549, + "step": 3843 + }, + { + "epoch": 0.18, + "grad_norm": 1.4300377485575873, + "learning_rate": 1.8772595575089943e-05, + "loss": 1.1504, + "step": 3844 + }, + { + "epoch": 0.18, + "grad_norm": 1.2268235120723918, + "learning_rate": 1.8771847704605818e-05, + "loss": 1.0898, + "step": 3845 + }, + { + "epoch": 0.18, + "grad_norm": 0.884719747692226, + "learning_rate": 1.8771099621254748e-05, + "loss": 1.3413, + "step": 3846 + }, + { + "epoch": 0.19, + "grad_norm": 1.344024477694934, + "learning_rate": 1.8770351325054882e-05, + "loss": 1.1621, + "step": 3847 + }, + { + "epoch": 0.19, + "grad_norm": 1.2124190722235857, + "learning_rate": 1.876960281602439e-05, + "loss": 1.332, + "step": 3848 + }, + { + "epoch": 0.19, + "grad_norm": 1.0502453824008753, + "learning_rate": 1.876885409418142e-05, + "loss": 1.1968, + "step": 3849 + }, + { + "epoch": 0.19, + "grad_norm": 1.2112787750863305, + "learning_rate": 1.8768105159544152e-05, + "loss": 1.2432, + "step": 3850 + }, + { + "epoch": 0.19, + "grad_norm": 1.3042155393188326, + "learning_rate": 1.8767356012130758e-05, + "loss": 1.2578, + "step": 3851 + }, + { + "epoch": 0.19, + "grad_norm": 1.2534735628451261, + "learning_rate": 1.8766606651959417e-05, + "loss": 1.2749, + "step": 3852 + }, + { + "epoch": 0.19, + "grad_norm": 1.1699911680327308, + "learning_rate": 1.876585707904831e-05, + "loss": 1.2456, + "step": 3853 + }, + { + "epoch": 0.19, + "grad_norm": 1.1870813396675193, + "learning_rate": 1.876510729341564e-05, + "loss": 1.2734, + "step": 3854 + }, + { + "epoch": 0.19, + "grad_norm": 1.6277851196544832, + "learning_rate": 1.876435729507959e-05, + "loss": 1.1045, + "step": 3855 + }, + { + "epoch": 0.19, + "grad_norm": 1.238931215704037, + "learning_rate": 1.876360708405836e-05, + "loss": 1.167, + "step": 3856 + }, + { + "epoch": 0.19, + "grad_norm": 0.9338497411819181, + "learning_rate": 1.8762856660370165e-05, + "loss": 1.2227, + "step": 3857 + }, + { + "epoch": 0.19, + "grad_norm": 1.2282635897015475, + "learning_rate": 1.8762106024033206e-05, + "loss": 1.3608, + "step": 3858 + }, + { + "epoch": 0.19, + "grad_norm": 1.228309933636235, + "learning_rate": 1.87613551750657e-05, + "loss": 1.2007, + "step": 3859 + }, + { + "epoch": 0.19, + "grad_norm": 1.3023676612436272, + "learning_rate": 1.8760604113485873e-05, + "loss": 1.2544, + "step": 3860 + }, + { + "epoch": 0.19, + "grad_norm": 1.6230074189041543, + "learning_rate": 1.8759852839311946e-05, + "loss": 1.3442, + "step": 3861 + }, + { + "epoch": 0.19, + "grad_norm": 1.4622782613613992, + "learning_rate": 1.8759101352562154e-05, + "loss": 1.3257, + "step": 3862 + }, + { + "epoch": 0.19, + "grad_norm": 1.278946359539599, + "learning_rate": 1.8758349653254733e-05, + "loss": 1.2207, + "step": 3863 + }, + { + "epoch": 0.19, + "grad_norm": 1.315382894285985, + "learning_rate": 1.8757597741407922e-05, + "loss": 1.1357, + "step": 3864 + }, + { + "epoch": 0.19, + "grad_norm": 1.2066435817690058, + "learning_rate": 1.875684561703997e-05, + "loss": 1.052, + "step": 3865 + }, + { + "epoch": 0.19, + "grad_norm": 1.1928937325703166, + "learning_rate": 1.8756093280169126e-05, + "loss": 1.2866, + "step": 3866 + }, + { + "epoch": 0.19, + "grad_norm": 1.1426461835134458, + "learning_rate": 1.875534073081365e-05, + "loss": 1.126, + "step": 3867 + }, + { + "epoch": 0.19, + "grad_norm": 1.331965711635815, + "learning_rate": 1.8754587968991803e-05, + "loss": 0.9929, + "step": 3868 + }, + { + "epoch": 0.19, + "grad_norm": 1.5635288817577517, + "learning_rate": 1.8753834994721852e-05, + "loss": 1.3032, + "step": 3869 + }, + { + "epoch": 0.19, + "grad_norm": 1.6735194966695957, + "learning_rate": 1.8753081808022065e-05, + "loss": 1.1265, + "step": 3870 + }, + { + "epoch": 0.19, + "grad_norm": 1.3229242919674427, + "learning_rate": 1.8752328408910732e-05, + "loss": 1.0933, + "step": 3871 + }, + { + "epoch": 0.19, + "grad_norm": 1.5871042059219043, + "learning_rate": 1.8751574797406124e-05, + "loss": 1.3804, + "step": 3872 + }, + { + "epoch": 0.19, + "grad_norm": 1.3127714773241774, + "learning_rate": 1.8750820973526535e-05, + "loss": 1.2646, + "step": 3873 + }, + { + "epoch": 0.19, + "grad_norm": 1.1817012313084219, + "learning_rate": 1.8750066937290256e-05, + "loss": 1.1924, + "step": 3874 + }, + { + "epoch": 0.19, + "grad_norm": 1.3522395479196476, + "learning_rate": 1.8749312688715587e-05, + "loss": 1.1484, + "step": 3875 + }, + { + "epoch": 0.19, + "grad_norm": 1.2757339847063416, + "learning_rate": 1.8748558227820828e-05, + "loss": 1.2344, + "step": 3876 + }, + { + "epoch": 0.19, + "grad_norm": 0.8481924274997216, + "learning_rate": 1.8747803554624287e-05, + "loss": 0.9304, + "step": 3877 + }, + { + "epoch": 0.19, + "grad_norm": 1.3533843298319492, + "learning_rate": 1.874704866914428e-05, + "loss": 1.2896, + "step": 3878 + }, + { + "epoch": 0.19, + "grad_norm": 1.7211111753068293, + "learning_rate": 1.874629357139913e-05, + "loss": 1.2041, + "step": 3879 + }, + { + "epoch": 0.19, + "grad_norm": 1.19081771796767, + "learning_rate": 1.8745538261407157e-05, + "loss": 1.2241, + "step": 3880 + }, + { + "epoch": 0.19, + "grad_norm": 1.2209056245156116, + "learning_rate": 1.8744782739186688e-05, + "loss": 1.1328, + "step": 3881 + }, + { + "epoch": 0.19, + "grad_norm": 1.4811144373647276, + "learning_rate": 1.874402700475606e-05, + "loss": 1.3232, + "step": 3882 + }, + { + "epoch": 0.19, + "grad_norm": 1.5018082171913014, + "learning_rate": 1.874327105813361e-05, + "loss": 1.3926, + "step": 3883 + }, + { + "epoch": 0.19, + "grad_norm": 1.1395584637375353, + "learning_rate": 1.874251489933769e-05, + "loss": 0.9792, + "step": 3884 + }, + { + "epoch": 0.19, + "grad_norm": 1.3944578863904746, + "learning_rate": 1.874175852838664e-05, + "loss": 1.1118, + "step": 3885 + }, + { + "epoch": 0.19, + "grad_norm": 0.8916927462233674, + "learning_rate": 1.8741001945298817e-05, + "loss": 1.1802, + "step": 3886 + }, + { + "epoch": 0.19, + "grad_norm": 1.3135201781873118, + "learning_rate": 1.8740245150092585e-05, + "loss": 1.252, + "step": 3887 + }, + { + "epoch": 0.19, + "grad_norm": 1.1443115260373942, + "learning_rate": 1.873948814278631e-05, + "loss": 1.1738, + "step": 3888 + }, + { + "epoch": 0.19, + "grad_norm": 1.3461935481077292, + "learning_rate": 1.8738730923398357e-05, + "loss": 1.3398, + "step": 3889 + }, + { + "epoch": 0.19, + "grad_norm": 1.0615053787453288, + "learning_rate": 1.8737973491947102e-05, + "loss": 1.2036, + "step": 3890 + }, + { + "epoch": 0.19, + "grad_norm": 1.0188809157873129, + "learning_rate": 1.8737215848450933e-05, + "loss": 1.1924, + "step": 3891 + }, + { + "epoch": 0.19, + "grad_norm": 1.5045985881628183, + "learning_rate": 1.8736457992928228e-05, + "loss": 1.3145, + "step": 3892 + }, + { + "epoch": 0.19, + "grad_norm": 1.1357893377651747, + "learning_rate": 1.873569992539738e-05, + "loss": 1.104, + "step": 3893 + }, + { + "epoch": 0.19, + "grad_norm": 1.3037439217132705, + "learning_rate": 1.8734941645876786e-05, + "loss": 1.3403, + "step": 3894 + }, + { + "epoch": 0.19, + "grad_norm": 1.169668532029814, + "learning_rate": 1.8734183154384848e-05, + "loss": 1.0701, + "step": 3895 + }, + { + "epoch": 0.19, + "grad_norm": 1.2517172934027079, + "learning_rate": 1.873342445093997e-05, + "loss": 1.2075, + "step": 3896 + }, + { + "epoch": 0.19, + "grad_norm": 1.6314924205382637, + "learning_rate": 1.8732665535560564e-05, + "loss": 1.3081, + "step": 3897 + }, + { + "epoch": 0.19, + "grad_norm": 1.2119416132223482, + "learning_rate": 1.873190640826505e-05, + "loss": 1.1509, + "step": 3898 + }, + { + "epoch": 0.19, + "grad_norm": 1.1319996684413565, + "learning_rate": 1.8731147069071843e-05, + "loss": 1.0159, + "step": 3899 + }, + { + "epoch": 0.19, + "grad_norm": 1.0653718539190122, + "learning_rate": 1.8730387517999378e-05, + "loss": 1.1123, + "step": 3900 + }, + { + "epoch": 0.19, + "grad_norm": 0.8359395207604404, + "learning_rate": 1.8729627755066082e-05, + "loss": 1.1758, + "step": 3901 + }, + { + "epoch": 0.19, + "grad_norm": 1.2224139068867883, + "learning_rate": 1.8728867780290393e-05, + "loss": 1.1802, + "step": 3902 + }, + { + "epoch": 0.19, + "grad_norm": 1.2362163227668923, + "learning_rate": 1.8728107593690753e-05, + "loss": 1.2378, + "step": 3903 + }, + { + "epoch": 0.19, + "grad_norm": 1.1327031168571218, + "learning_rate": 1.872734719528561e-05, + "loss": 1.0459, + "step": 3904 + }, + { + "epoch": 0.19, + "grad_norm": 1.3659739168261704, + "learning_rate": 1.872658658509342e-05, + "loss": 1.1211, + "step": 3905 + }, + { + "epoch": 0.19, + "grad_norm": 1.2708763017008287, + "learning_rate": 1.872582576313263e-05, + "loss": 1.2827, + "step": 3906 + }, + { + "epoch": 0.19, + "grad_norm": 0.9610992766299828, + "learning_rate": 1.8725064729421717e-05, + "loss": 1.1343, + "step": 3907 + }, + { + "epoch": 0.19, + "grad_norm": 1.1744418965325993, + "learning_rate": 1.872430348397914e-05, + "loss": 1.3696, + "step": 3908 + }, + { + "epoch": 0.19, + "grad_norm": 1.163794860534079, + "learning_rate": 1.8723542026823375e-05, + "loss": 1.2505, + "step": 3909 + }, + { + "epoch": 0.19, + "grad_norm": 1.1645938154070419, + "learning_rate": 1.8722780357972903e-05, + "loss": 1.1431, + "step": 3910 + }, + { + "epoch": 0.19, + "grad_norm": 1.3756586166558558, + "learning_rate": 1.87220184774462e-05, + "loss": 1.1167, + "step": 3911 + }, + { + "epoch": 0.19, + "grad_norm": 1.4178143226225328, + "learning_rate": 1.872125638526176e-05, + "loss": 1.1719, + "step": 3912 + }, + { + "epoch": 0.19, + "grad_norm": 1.8714871332099345, + "learning_rate": 1.872049408143808e-05, + "loss": 1.2051, + "step": 3913 + }, + { + "epoch": 0.19, + "grad_norm": 1.3358384519871518, + "learning_rate": 1.8719731565993647e-05, + "loss": 1.2764, + "step": 3914 + }, + { + "epoch": 0.19, + "grad_norm": 1.3332416938948266, + "learning_rate": 1.871896883894698e-05, + "loss": 1.2144, + "step": 3915 + }, + { + "epoch": 0.19, + "grad_norm": 1.1754403372366058, + "learning_rate": 1.8718205900316578e-05, + "loss": 1.2646, + "step": 3916 + }, + { + "epoch": 0.19, + "grad_norm": 1.2972778522489423, + "learning_rate": 1.8717442750120956e-05, + "loss": 1.2939, + "step": 3917 + }, + { + "epoch": 0.19, + "grad_norm": 1.5763964280058587, + "learning_rate": 1.871667938837864e-05, + "loss": 1.2205, + "step": 3918 + }, + { + "epoch": 0.19, + "grad_norm": 1.2942851967515048, + "learning_rate": 1.871591581510815e-05, + "loss": 1.126, + "step": 3919 + }, + { + "epoch": 0.19, + "grad_norm": 1.415956311788413, + "learning_rate": 1.871515203032801e-05, + "loss": 1.355, + "step": 3920 + }, + { + "epoch": 0.19, + "grad_norm": 1.352519506332349, + "learning_rate": 1.8714388034056764e-05, + "loss": 1.1724, + "step": 3921 + }, + { + "epoch": 0.19, + "grad_norm": 0.9014310260347684, + "learning_rate": 1.871362382631295e-05, + "loss": 1.2583, + "step": 3922 + }, + { + "epoch": 0.19, + "grad_norm": 0.8473281680322229, + "learning_rate": 1.871285940711511e-05, + "loss": 1.0483, + "step": 3923 + }, + { + "epoch": 0.19, + "grad_norm": 1.131629680354648, + "learning_rate": 1.8712094776481798e-05, + "loss": 1.3716, + "step": 3924 + }, + { + "epoch": 0.19, + "grad_norm": 1.1668819565405113, + "learning_rate": 1.8711329934431563e-05, + "loss": 1.209, + "step": 3925 + }, + { + "epoch": 0.19, + "grad_norm": 1.1365089860312, + "learning_rate": 1.8710564880982975e-05, + "loss": 1.1704, + "step": 3926 + }, + { + "epoch": 0.19, + "grad_norm": 1.24820267432663, + "learning_rate": 1.8709799616154587e-05, + "loss": 1.2124, + "step": 3927 + }, + { + "epoch": 0.19, + "grad_norm": 1.056203779023951, + "learning_rate": 1.870903413996498e-05, + "loss": 1.2036, + "step": 3928 + }, + { + "epoch": 0.19, + "grad_norm": 1.346414462677292, + "learning_rate": 1.870826845243273e-05, + "loss": 1.249, + "step": 3929 + }, + { + "epoch": 0.19, + "grad_norm": 1.5574945850531015, + "learning_rate": 1.870750255357641e-05, + "loss": 1.3755, + "step": 3930 + }, + { + "epoch": 0.19, + "grad_norm": 1.3534182082820945, + "learning_rate": 1.8706736443414616e-05, + "loss": 1.3037, + "step": 3931 + }, + { + "epoch": 0.19, + "grad_norm": 1.292932590349432, + "learning_rate": 1.870597012196593e-05, + "loss": 1.3325, + "step": 3932 + }, + { + "epoch": 0.19, + "grad_norm": 1.3016419000077595, + "learning_rate": 1.8705203589248953e-05, + "loss": 1.2383, + "step": 3933 + }, + { + "epoch": 0.19, + "grad_norm": 1.4711409634459682, + "learning_rate": 1.8704436845282288e-05, + "loss": 1.2305, + "step": 3934 + }, + { + "epoch": 0.19, + "grad_norm": 1.3201087854345772, + "learning_rate": 1.8703669890084536e-05, + "loss": 1.146, + "step": 3935 + }, + { + "epoch": 0.19, + "grad_norm": 1.2880522836509962, + "learning_rate": 1.8702902723674317e-05, + "loss": 1.0225, + "step": 3936 + }, + { + "epoch": 0.19, + "grad_norm": 1.2073706475243196, + "learning_rate": 1.870213534607024e-05, + "loss": 1.124, + "step": 3937 + }, + { + "epoch": 0.19, + "grad_norm": 1.3722918500196595, + "learning_rate": 1.870136775729093e-05, + "loss": 1.3149, + "step": 3938 + }, + { + "epoch": 0.19, + "grad_norm": 1.1412119436060875, + "learning_rate": 1.8700599957355017e-05, + "loss": 1.1606, + "step": 3939 + }, + { + "epoch": 0.19, + "grad_norm": 1.2770070656688284, + "learning_rate": 1.8699831946281127e-05, + "loss": 1.0574, + "step": 3940 + }, + { + "epoch": 0.19, + "grad_norm": 1.1895870246195248, + "learning_rate": 1.8699063724087905e-05, + "loss": 1.1846, + "step": 3941 + }, + { + "epoch": 0.19, + "grad_norm": 1.250305219746284, + "learning_rate": 1.869829529079399e-05, + "loss": 1.1406, + "step": 3942 + }, + { + "epoch": 0.19, + "grad_norm": 1.4379638756083881, + "learning_rate": 1.869752664641802e-05, + "loss": 1.2324, + "step": 3943 + }, + { + "epoch": 0.19, + "grad_norm": 1.4047790257549924, + "learning_rate": 1.8696757790978668e-05, + "loss": 1.3726, + "step": 3944 + }, + { + "epoch": 0.19, + "grad_norm": 0.991124058493983, + "learning_rate": 1.8695988724494577e-05, + "loss": 1.1021, + "step": 3945 + }, + { + "epoch": 0.19, + "grad_norm": 1.2007610949203509, + "learning_rate": 1.869521944698441e-05, + "loss": 1.21, + "step": 3946 + }, + { + "epoch": 0.19, + "grad_norm": 1.3027661511695565, + "learning_rate": 1.869444995846684e-05, + "loss": 1.1602, + "step": 3947 + }, + { + "epoch": 0.19, + "grad_norm": 1.5304224868907503, + "learning_rate": 1.8693680258960543e-05, + "loss": 1.375, + "step": 3948 + }, + { + "epoch": 0.19, + "grad_norm": 1.1395084335689283, + "learning_rate": 1.869291034848419e-05, + "loss": 1.2578, + "step": 3949 + }, + { + "epoch": 0.19, + "grad_norm": 1.1816648026309864, + "learning_rate": 1.8692140227056468e-05, + "loss": 1.0859, + "step": 3950 + }, + { + "epoch": 0.19, + "grad_norm": 1.2253997178548492, + "learning_rate": 1.8691369894696064e-05, + "loss": 1.2661, + "step": 3951 + }, + { + "epoch": 0.19, + "grad_norm": 1.5235399730674277, + "learning_rate": 1.8690599351421675e-05, + "loss": 1.1543, + "step": 3952 + }, + { + "epoch": 0.19, + "grad_norm": 1.2679788543780428, + "learning_rate": 1.8689828597252e-05, + "loss": 1.2314, + "step": 3953 + }, + { + "epoch": 0.19, + "grad_norm": 1.107975330301084, + "learning_rate": 1.8689057632205737e-05, + "loss": 1.1973, + "step": 3954 + }, + { + "epoch": 0.19, + "grad_norm": 1.2353064109019314, + "learning_rate": 1.8688286456301602e-05, + "loss": 1.228, + "step": 3955 + }, + { + "epoch": 0.19, + "grad_norm": 1.000964267296076, + "learning_rate": 1.8687515069558303e-05, + "loss": 1.0698, + "step": 3956 + }, + { + "epoch": 0.19, + "grad_norm": 1.3057040228059902, + "learning_rate": 1.8686743471994564e-05, + "loss": 1.1865, + "step": 3957 + }, + { + "epoch": 0.19, + "grad_norm": 1.2506485663763338, + "learning_rate": 1.868597166362911e-05, + "loss": 1.0251, + "step": 3958 + }, + { + "epoch": 0.19, + "grad_norm": 1.2267620789141656, + "learning_rate": 1.8685199644480663e-05, + "loss": 1.1421, + "step": 3959 + }, + { + "epoch": 0.19, + "grad_norm": 0.9289221285820728, + "learning_rate": 1.8684427414567964e-05, + "loss": 1.1846, + "step": 3960 + }, + { + "epoch": 0.19, + "grad_norm": 1.5458055611704382, + "learning_rate": 1.8683654973909754e-05, + "loss": 1.0938, + "step": 3961 + }, + { + "epoch": 0.19, + "grad_norm": 1.0991711553182602, + "learning_rate": 1.8682882322524777e-05, + "loss": 1.1792, + "step": 3962 + }, + { + "epoch": 0.19, + "grad_norm": 1.4460814595186802, + "learning_rate": 1.8682109460431775e-05, + "loss": 1.2495, + "step": 3963 + }, + { + "epoch": 0.19, + "grad_norm": 1.226625539794446, + "learning_rate": 1.8681336387649516e-05, + "loss": 1.1865, + "step": 3964 + }, + { + "epoch": 0.19, + "grad_norm": 1.328142744885303, + "learning_rate": 1.8680563104196753e-05, + "loss": 1.0723, + "step": 3965 + }, + { + "epoch": 0.19, + "grad_norm": 1.3428319480915605, + "learning_rate": 1.867978961009225e-05, + "loss": 1.2554, + "step": 3966 + }, + { + "epoch": 0.19, + "grad_norm": 1.160441924824465, + "learning_rate": 1.8679015905354777e-05, + "loss": 1.189, + "step": 3967 + }, + { + "epoch": 0.19, + "grad_norm": 1.4509386785671345, + "learning_rate": 1.8678241990003116e-05, + "loss": 1.4263, + "step": 3968 + }, + { + "epoch": 0.19, + "grad_norm": 1.1447097283768006, + "learning_rate": 1.8677467864056045e-05, + "loss": 1.2588, + "step": 3969 + }, + { + "epoch": 0.19, + "grad_norm": 1.3178983289632742, + "learning_rate": 1.8676693527532344e-05, + "loss": 1.3096, + "step": 3970 + }, + { + "epoch": 0.19, + "grad_norm": 1.3022997373755076, + "learning_rate": 1.8675918980450812e-05, + "loss": 1.1208, + "step": 3971 + }, + { + "epoch": 0.19, + "grad_norm": 1.1572461321362646, + "learning_rate": 1.8675144222830242e-05, + "loss": 1.2539, + "step": 3972 + }, + { + "epoch": 0.19, + "grad_norm": 1.4055783999786589, + "learning_rate": 1.867436925468943e-05, + "loss": 1.3281, + "step": 3973 + }, + { + "epoch": 0.19, + "grad_norm": 1.3462481291131096, + "learning_rate": 1.867359407604719e-05, + "loss": 1.2378, + "step": 3974 + }, + { + "epoch": 0.19, + "grad_norm": 1.230309887761091, + "learning_rate": 1.867281868692233e-05, + "loss": 1.3135, + "step": 3975 + }, + { + "epoch": 0.19, + "grad_norm": 1.6730470878317056, + "learning_rate": 1.8672043087333662e-05, + "loss": 1.2915, + "step": 3976 + }, + { + "epoch": 0.19, + "grad_norm": 1.0625494286405814, + "learning_rate": 1.8671267277300015e-05, + "loss": 1.1577, + "step": 3977 + }, + { + "epoch": 0.19, + "grad_norm": 1.1916740770839762, + "learning_rate": 1.8670491256840212e-05, + "loss": 1.1558, + "step": 3978 + }, + { + "epoch": 0.19, + "grad_norm": 1.136398315892802, + "learning_rate": 1.866971502597309e-05, + "loss": 1.1821, + "step": 3979 + }, + { + "epoch": 0.19, + "grad_norm": 1.4434610840204503, + "learning_rate": 1.8668938584717473e-05, + "loss": 1.1987, + "step": 3980 + }, + { + "epoch": 0.19, + "grad_norm": 1.2612515445238641, + "learning_rate": 1.8668161933092218e-05, + "loss": 1.2036, + "step": 3981 + }, + { + "epoch": 0.19, + "grad_norm": 1.1360093747059894, + "learning_rate": 1.8667385071116157e-05, + "loss": 1.1807, + "step": 3982 + }, + { + "epoch": 0.19, + "grad_norm": 0.9340181030742526, + "learning_rate": 1.8666607998808157e-05, + "loss": 1.1294, + "step": 3983 + }, + { + "epoch": 0.19, + "grad_norm": 1.0110360650430625, + "learning_rate": 1.8665830716187064e-05, + "loss": 1.0996, + "step": 3984 + }, + { + "epoch": 0.19, + "grad_norm": 0.9615981851748967, + "learning_rate": 1.866505322327175e-05, + "loss": 1.1831, + "step": 3985 + }, + { + "epoch": 0.19, + "grad_norm": 1.6591517773470519, + "learning_rate": 1.866427552008107e-05, + "loss": 1.3486, + "step": 3986 + }, + { + "epoch": 0.19, + "grad_norm": 1.166510244451287, + "learning_rate": 1.866349760663391e-05, + "loss": 1.1294, + "step": 3987 + }, + { + "epoch": 0.19, + "grad_norm": 1.4422220350059793, + "learning_rate": 1.8662719482949142e-05, + "loss": 1.0378, + "step": 3988 + }, + { + "epoch": 0.19, + "grad_norm": 0.9679712752166828, + "learning_rate": 1.8661941149045646e-05, + "loss": 1.1777, + "step": 3989 + }, + { + "epoch": 0.19, + "grad_norm": 1.3790346285161186, + "learning_rate": 1.866116260494231e-05, + "loss": 1.2129, + "step": 3990 + }, + { + "epoch": 0.19, + "grad_norm": 1.27910918646187, + "learning_rate": 1.8660383850658033e-05, + "loss": 1.2549, + "step": 3991 + }, + { + "epoch": 0.19, + "grad_norm": 1.3701733361146509, + "learning_rate": 1.8659604886211705e-05, + "loss": 1.0586, + "step": 3992 + }, + { + "epoch": 0.19, + "grad_norm": 1.219742831097941, + "learning_rate": 1.865882571162224e-05, + "loss": 1.033, + "step": 3993 + }, + { + "epoch": 0.19, + "grad_norm": 1.460602917698877, + "learning_rate": 1.8658046326908533e-05, + "loss": 1.2861, + "step": 3994 + }, + { + "epoch": 0.19, + "grad_norm": 1.194225352857267, + "learning_rate": 1.8657266732089508e-05, + "loss": 1.4536, + "step": 3995 + }, + { + "epoch": 0.19, + "grad_norm": 1.28681962828183, + "learning_rate": 1.865648692718408e-05, + "loss": 1.0637, + "step": 3996 + }, + { + "epoch": 0.19, + "grad_norm": 1.4701572913356562, + "learning_rate": 1.865570691221117e-05, + "loss": 1.1587, + "step": 3997 + }, + { + "epoch": 0.19, + "grad_norm": 1.0219382138147313, + "learning_rate": 1.8654926687189706e-05, + "loss": 1.2065, + "step": 3998 + }, + { + "epoch": 0.19, + "grad_norm": 1.4286972976619932, + "learning_rate": 1.865414625213863e-05, + "loss": 1.2373, + "step": 3999 + }, + { + "epoch": 0.19, + "grad_norm": 1.3527938271436013, + "learning_rate": 1.8653365607076873e-05, + "loss": 0.9985, + "step": 4000 + }, + { + "epoch": 0.19, + "grad_norm": 1.180975451229571, + "learning_rate": 1.865258475202338e-05, + "loss": 1.1411, + "step": 4001 + }, + { + "epoch": 0.19, + "grad_norm": 1.1307559834843286, + "learning_rate": 1.86518036869971e-05, + "loss": 1.1685, + "step": 4002 + }, + { + "epoch": 0.19, + "grad_norm": 1.0874404911391171, + "learning_rate": 1.8651022412016993e-05, + "loss": 1.1042, + "step": 4003 + }, + { + "epoch": 0.19, + "grad_norm": 1.2390577376748106, + "learning_rate": 1.8650240927102012e-05, + "loss": 1.2734, + "step": 4004 + }, + { + "epoch": 0.19, + "grad_norm": 1.7721751493529376, + "learning_rate": 1.8649459232271124e-05, + "loss": 1.3252, + "step": 4005 + }, + { + "epoch": 0.19, + "grad_norm": 1.0920328255182932, + "learning_rate": 1.8648677327543297e-05, + "loss": 1.2598, + "step": 4006 + }, + { + "epoch": 0.19, + "grad_norm": 1.2267799807229007, + "learning_rate": 1.8647895212937504e-05, + "loss": 1.2944, + "step": 4007 + }, + { + "epoch": 0.19, + "grad_norm": 0.6985251899901863, + "learning_rate": 1.8647112888472732e-05, + "loss": 1.3057, + "step": 4008 + }, + { + "epoch": 0.19, + "grad_norm": 0.6985251899901863, + "learning_rate": 1.8647112888472732e-05, + "loss": 1.292, + "step": 4009 + }, + { + "epoch": 0.19, + "grad_norm": 0.6985251899901863, + "learning_rate": 1.8647112888472732e-05, + "loss": 1.3335, + "step": 4010 + }, + { + "epoch": 0.19, + "grad_norm": 1.2072504222586309, + "learning_rate": 1.8646330354167955e-05, + "loss": 1.2251, + "step": 4011 + }, + { + "epoch": 0.19, + "grad_norm": 0.9971013369200885, + "learning_rate": 1.8645547610042173e-05, + "loss": 1.2759, + "step": 4012 + }, + { + "epoch": 0.19, + "grad_norm": 1.1713869399847658, + "learning_rate": 1.864476465611437e-05, + "loss": 1.2954, + "step": 4013 + }, + { + "epoch": 0.19, + "grad_norm": 1.2119973031274707, + "learning_rate": 1.8643981492403557e-05, + "loss": 1.4028, + "step": 4014 + }, + { + "epoch": 0.19, + "grad_norm": 1.5477807570315671, + "learning_rate": 1.864319811892873e-05, + "loss": 1.3501, + "step": 4015 + }, + { + "epoch": 0.19, + "grad_norm": 1.31246692751042, + "learning_rate": 1.864241453570891e-05, + "loss": 1.2021, + "step": 4016 + }, + { + "epoch": 0.19, + "grad_norm": 1.3950640748743661, + "learning_rate": 1.86416307427631e-05, + "loss": 1.1855, + "step": 4017 + }, + { + "epoch": 0.19, + "grad_norm": 1.1711374395497032, + "learning_rate": 1.8640846740110327e-05, + "loss": 1.2178, + "step": 4018 + }, + { + "epoch": 0.19, + "grad_norm": 1.4784912898470401, + "learning_rate": 1.8640062527769615e-05, + "loss": 1.2334, + "step": 4019 + }, + { + "epoch": 0.19, + "grad_norm": 1.1598381009710796, + "learning_rate": 1.8639278105759998e-05, + "loss": 1.1895, + "step": 4020 + }, + { + "epoch": 0.19, + "grad_norm": 1.2900965614272344, + "learning_rate": 1.8638493474100507e-05, + "loss": 1.3613, + "step": 4021 + }, + { + "epoch": 0.19, + "grad_norm": 1.5011005466112193, + "learning_rate": 1.8637708632810185e-05, + "loss": 1.2202, + "step": 4022 + }, + { + "epoch": 0.19, + "grad_norm": 1.1908129171224233, + "learning_rate": 1.8636923581908074e-05, + "loss": 1.1436, + "step": 4023 + }, + { + "epoch": 0.19, + "grad_norm": 1.3704352330799328, + "learning_rate": 1.863613832141323e-05, + "loss": 1.1973, + "step": 4024 + }, + { + "epoch": 0.19, + "grad_norm": 1.5780296807059113, + "learning_rate": 1.8635352851344707e-05, + "loss": 1.1587, + "step": 4025 + }, + { + "epoch": 0.19, + "grad_norm": 1.3755307848363783, + "learning_rate": 1.8634567171721567e-05, + "loss": 1.1958, + "step": 4026 + }, + { + "epoch": 0.19, + "grad_norm": 1.0196600743124304, + "learning_rate": 1.8633781282562875e-05, + "loss": 1.1816, + "step": 4027 + }, + { + "epoch": 0.19, + "grad_norm": 1.1135500516869925, + "learning_rate": 1.8632995183887697e-05, + "loss": 1.1689, + "step": 4028 + }, + { + "epoch": 0.19, + "grad_norm": 1.5794191723626292, + "learning_rate": 1.8632208875715122e-05, + "loss": 1.3574, + "step": 4029 + }, + { + "epoch": 0.19, + "grad_norm": 1.2853998276527956, + "learning_rate": 1.8631422358064218e-05, + "loss": 1.2651, + "step": 4030 + }, + { + "epoch": 0.19, + "grad_norm": 1.2070200690549162, + "learning_rate": 1.8630635630954083e-05, + "loss": 1.2324, + "step": 4031 + }, + { + "epoch": 0.19, + "grad_norm": 0.8296746451782641, + "learning_rate": 1.86298486944038e-05, + "loss": 1.189, + "step": 4032 + }, + { + "epoch": 0.19, + "grad_norm": 1.1361775403737224, + "learning_rate": 1.862906154843247e-05, + "loss": 1.2124, + "step": 4033 + }, + { + "epoch": 0.19, + "grad_norm": 1.277653276869562, + "learning_rate": 1.8628274193059193e-05, + "loss": 1.4023, + "step": 4034 + }, + { + "epoch": 0.19, + "grad_norm": 1.1822094866439385, + "learning_rate": 1.8627486628303076e-05, + "loss": 1.3003, + "step": 4035 + }, + { + "epoch": 0.19, + "grad_norm": 1.7984650888984377, + "learning_rate": 1.862669885418323e-05, + "loss": 1.4639, + "step": 4036 + }, + { + "epoch": 0.19, + "grad_norm": 1.211996793703571, + "learning_rate": 1.8625910870718775e-05, + "loss": 1.2715, + "step": 4037 + }, + { + "epoch": 0.19, + "grad_norm": 1.3122843321113327, + "learning_rate": 1.862512267792883e-05, + "loss": 1.208, + "step": 4038 + }, + { + "epoch": 0.19, + "grad_norm": 1.270237328766837, + "learning_rate": 1.8624334275832522e-05, + "loss": 1.272, + "step": 4039 + }, + { + "epoch": 0.19, + "grad_norm": 1.2435806751164216, + "learning_rate": 1.8623545664448987e-05, + "loss": 1.1172, + "step": 4040 + }, + { + "epoch": 0.19, + "grad_norm": 0.9365948884497056, + "learning_rate": 1.8622756843797356e-05, + "loss": 1.2339, + "step": 4041 + }, + { + "epoch": 0.19, + "grad_norm": 1.1310258572948733, + "learning_rate": 1.8621967813896776e-05, + "loss": 1.1899, + "step": 4042 + }, + { + "epoch": 0.19, + "grad_norm": 1.147296773685079, + "learning_rate": 1.8621178574766397e-05, + "loss": 1.2378, + "step": 4043 + }, + { + "epoch": 0.19, + "grad_norm": 1.156570515712008, + "learning_rate": 1.8620389126425365e-05, + "loss": 1.168, + "step": 4044 + }, + { + "epoch": 0.19, + "grad_norm": 1.085455703315342, + "learning_rate": 1.861959946889284e-05, + "loss": 1.1611, + "step": 4045 + }, + { + "epoch": 0.19, + "grad_norm": 1.0750719752527533, + "learning_rate": 1.8618809602187987e-05, + "loss": 1.2363, + "step": 4046 + }, + { + "epoch": 0.19, + "grad_norm": 1.4296058674166023, + "learning_rate": 1.861801952632997e-05, + "loss": 1.4385, + "step": 4047 + }, + { + "epoch": 0.19, + "grad_norm": 1.2628358213212882, + "learning_rate": 1.8617229241337967e-05, + "loss": 1.186, + "step": 4048 + }, + { + "epoch": 0.19, + "grad_norm": 1.2992977698904784, + "learning_rate": 1.861643874723115e-05, + "loss": 1.0522, + "step": 4049 + }, + { + "epoch": 0.19, + "grad_norm": 1.1832575411770572, + "learning_rate": 1.86156480440287e-05, + "loss": 1.1738, + "step": 4050 + }, + { + "epoch": 0.19, + "grad_norm": 1.176487909262996, + "learning_rate": 1.8614857131749818e-05, + "loss": 1.0251, + "step": 4051 + }, + { + "epoch": 0.19, + "grad_norm": 1.3183352762968201, + "learning_rate": 1.8614066010413686e-05, + "loss": 1.1968, + "step": 4052 + }, + { + "epoch": 0.19, + "grad_norm": 1.299496089846326, + "learning_rate": 1.8613274680039506e-05, + "loss": 1.0444, + "step": 4053 + }, + { + "epoch": 0.19, + "grad_norm": 1.3557618564047709, + "learning_rate": 1.861248314064648e-05, + "loss": 1.3291, + "step": 4054 + }, + { + "epoch": 0.2, + "grad_norm": 1.7274637403124014, + "learning_rate": 1.8611691392253814e-05, + "loss": 1.2808, + "step": 4055 + }, + { + "epoch": 0.2, + "grad_norm": 1.3819401540973475, + "learning_rate": 1.8610899434880724e-05, + "loss": 1.2432, + "step": 4056 + }, + { + "epoch": 0.2, + "grad_norm": 1.3127849647201817, + "learning_rate": 1.861010726854643e-05, + "loss": 1.1602, + "step": 4057 + }, + { + "epoch": 0.2, + "grad_norm": 1.222572807489657, + "learning_rate": 1.8609314893270155e-05, + "loss": 1.0508, + "step": 4058 + }, + { + "epoch": 0.2, + "grad_norm": 1.5190225340208345, + "learning_rate": 1.8608522309071128e-05, + "loss": 1.2974, + "step": 4059 + }, + { + "epoch": 0.2, + "grad_norm": 1.0858464472728957, + "learning_rate": 1.860772951596858e-05, + "loss": 1.1238, + "step": 4060 + }, + { + "epoch": 0.2, + "grad_norm": 1.1799474681391802, + "learning_rate": 1.8606936513981745e-05, + "loss": 1.1465, + "step": 4061 + }, + { + "epoch": 0.2, + "grad_norm": 1.1793517171025751, + "learning_rate": 1.860614330312988e-05, + "loss": 1.1929, + "step": 4062 + }, + { + "epoch": 0.2, + "grad_norm": 1.0906440698647153, + "learning_rate": 1.8605349883432223e-05, + "loss": 1.1353, + "step": 4063 + }, + { + "epoch": 0.2, + "grad_norm": 1.17589073592679, + "learning_rate": 1.8604556254908034e-05, + "loss": 1.0181, + "step": 4064 + }, + { + "epoch": 0.2, + "grad_norm": 1.1227455590242628, + "learning_rate": 1.860376241757657e-05, + "loss": 1.2456, + "step": 4065 + }, + { + "epoch": 0.2, + "grad_norm": 1.4229203457135131, + "learning_rate": 1.8602968371457094e-05, + "loss": 1.1938, + "step": 4066 + }, + { + "epoch": 0.2, + "grad_norm": 1.1744472829555495, + "learning_rate": 1.8602174116568876e-05, + "loss": 1.2485, + "step": 4067 + }, + { + "epoch": 0.2, + "grad_norm": 1.2439463105745132, + "learning_rate": 1.860137965293119e-05, + "loss": 1.1721, + "step": 4068 + }, + { + "epoch": 0.2, + "grad_norm": 1.1310561592468147, + "learning_rate": 1.8600584980563316e-05, + "loss": 1.0874, + "step": 4069 + }, + { + "epoch": 0.2, + "grad_norm": 1.1025239353376606, + "learning_rate": 1.859979009948454e-05, + "loss": 1.1265, + "step": 4070 + }, + { + "epoch": 0.2, + "grad_norm": 1.3759714244344854, + "learning_rate": 1.8598995009714145e-05, + "loss": 1.2119, + "step": 4071 + }, + { + "epoch": 0.2, + "grad_norm": 1.3976468760554495, + "learning_rate": 1.8598199711271433e-05, + "loss": 1.2642, + "step": 4072 + }, + { + "epoch": 0.2, + "grad_norm": 1.1419146594021978, + "learning_rate": 1.85974042041757e-05, + "loss": 1.126, + "step": 4073 + }, + { + "epoch": 0.2, + "grad_norm": 1.3440392288596028, + "learning_rate": 1.859660848844625e-05, + "loss": 1.3516, + "step": 4074 + }, + { + "epoch": 0.2, + "grad_norm": 1.2532012475804237, + "learning_rate": 1.8595812564102393e-05, + "loss": 1.2163, + "step": 4075 + }, + { + "epoch": 0.2, + "grad_norm": 1.1395847342928271, + "learning_rate": 1.8595016431163448e-05, + "loss": 1.0503, + "step": 4076 + }, + { + "epoch": 0.2, + "grad_norm": 1.289748910948621, + "learning_rate": 1.8594220089648727e-05, + "loss": 1.2983, + "step": 4077 + }, + { + "epoch": 0.2, + "grad_norm": 1.2032993748683838, + "learning_rate": 1.8593423539577565e-05, + "loss": 1.2114, + "step": 4078 + }, + { + "epoch": 0.2, + "grad_norm": 1.0451257690461093, + "learning_rate": 1.859262678096928e-05, + "loss": 1.292, + "step": 4079 + }, + { + "epoch": 0.2, + "grad_norm": 1.130293144287229, + "learning_rate": 1.8591829813843215e-05, + "loss": 1.1836, + "step": 4080 + }, + { + "epoch": 0.2, + "grad_norm": 1.282573577566955, + "learning_rate": 1.859103263821871e-05, + "loss": 1.2993, + "step": 4081 + }, + { + "epoch": 0.2, + "grad_norm": 1.5304038829234583, + "learning_rate": 1.8590235254115105e-05, + "loss": 1.3652, + "step": 4082 + }, + { + "epoch": 0.2, + "grad_norm": 1.3743093601276695, + "learning_rate": 1.8589437661551756e-05, + "loss": 1.1895, + "step": 4083 + }, + { + "epoch": 0.2, + "grad_norm": 1.018150619419648, + "learning_rate": 1.8588639860548012e-05, + "loss": 1.1802, + "step": 4084 + }, + { + "epoch": 0.2, + "grad_norm": 1.0584588759529323, + "learning_rate": 1.858784185112324e-05, + "loss": 1.0977, + "step": 4085 + }, + { + "epoch": 0.2, + "grad_norm": 1.282416211463359, + "learning_rate": 1.85870436332968e-05, + "loss": 1.1113, + "step": 4086 + }, + { + "epoch": 0.2, + "grad_norm": 1.377231446417847, + "learning_rate": 1.8586245207088068e-05, + "loss": 1.2441, + "step": 4087 + }, + { + "epoch": 0.2, + "grad_norm": 1.1053607513835888, + "learning_rate": 1.8585446572516416e-05, + "loss": 0.9426, + "step": 4088 + }, + { + "epoch": 0.2, + "grad_norm": 1.444169813642195, + "learning_rate": 1.858464772960122e-05, + "loss": 1.2349, + "step": 4089 + }, + { + "epoch": 0.2, + "grad_norm": 1.3352340136898877, + "learning_rate": 1.858384867836187e-05, + "loss": 1.1543, + "step": 4090 + }, + { + "epoch": 0.2, + "grad_norm": 1.17589012656732, + "learning_rate": 1.8583049418817764e-05, + "loss": 1.0894, + "step": 4091 + }, + { + "epoch": 0.2, + "grad_norm": 1.33890680327144, + "learning_rate": 1.8582249950988285e-05, + "loss": 1.2612, + "step": 4092 + }, + { + "epoch": 0.2, + "grad_norm": 1.3516983841445027, + "learning_rate": 1.8581450274892842e-05, + "loss": 1.1357, + "step": 4093 + }, + { + "epoch": 0.2, + "grad_norm": 1.2519027242997316, + "learning_rate": 1.8580650390550835e-05, + "loss": 1.1287, + "step": 4094 + }, + { + "epoch": 0.2, + "grad_norm": 1.548654717704127, + "learning_rate": 1.857985029798168e-05, + "loss": 1.3433, + "step": 4095 + }, + { + "epoch": 0.2, + "grad_norm": 1.3082021829750532, + "learning_rate": 1.857904999720479e-05, + "loss": 1.1816, + "step": 4096 + }, + { + "epoch": 0.2, + "grad_norm": 1.0011880186548685, + "learning_rate": 1.8578249488239584e-05, + "loss": 1.1895, + "step": 4097 + }, + { + "epoch": 0.2, + "grad_norm": 1.1643480642299666, + "learning_rate": 1.8577448771105494e-05, + "loss": 1.2129, + "step": 4098 + }, + { + "epoch": 0.2, + "grad_norm": 1.1465602586939319, + "learning_rate": 1.8576647845821947e-05, + "loss": 1.1309, + "step": 4099 + }, + { + "epoch": 0.2, + "grad_norm": 1.4399496754213852, + "learning_rate": 1.857584671240838e-05, + "loss": 1.2363, + "step": 4100 + }, + { + "epoch": 0.2, + "grad_norm": 1.2431984412687682, + "learning_rate": 1.8575045370884232e-05, + "loss": 1.0767, + "step": 4101 + }, + { + "epoch": 0.2, + "grad_norm": 1.1150801915026713, + "learning_rate": 1.8574243821268953e-05, + "loss": 1.1016, + "step": 4102 + }, + { + "epoch": 0.2, + "grad_norm": 1.5024062819185104, + "learning_rate": 1.857344206358199e-05, + "loss": 1.1899, + "step": 4103 + }, + { + "epoch": 0.2, + "grad_norm": 1.258775897807646, + "learning_rate": 1.8572640097842804e-05, + "loss": 1.0237, + "step": 4104 + }, + { + "epoch": 0.2, + "grad_norm": 1.1999899557197724, + "learning_rate": 1.8571837924070853e-05, + "loss": 1.1973, + "step": 4105 + }, + { + "epoch": 0.2, + "grad_norm": 1.3318212018843154, + "learning_rate": 1.857103554228561e-05, + "loss": 1.2231, + "step": 4106 + }, + { + "epoch": 0.2, + "grad_norm": 1.3423468104856457, + "learning_rate": 1.857023295250653e-05, + "loss": 1.3101, + "step": 4107 + }, + { + "epoch": 0.2, + "grad_norm": 1.1308608711999477, + "learning_rate": 1.856943015475311e-05, + "loss": 1.1401, + "step": 4108 + }, + { + "epoch": 0.2, + "grad_norm": 1.3013475580474347, + "learning_rate": 1.856862714904482e-05, + "loss": 1.2319, + "step": 4109 + }, + { + "epoch": 0.2, + "grad_norm": 1.443788370474475, + "learning_rate": 1.8567823935401145e-05, + "loss": 1.3726, + "step": 4110 + }, + { + "epoch": 0.2, + "grad_norm": 1.1322124424621796, + "learning_rate": 1.8567020513841582e-05, + "loss": 1.2368, + "step": 4111 + }, + { + "epoch": 0.2, + "grad_norm": 0.8735615807465672, + "learning_rate": 1.8566216884385625e-05, + "loss": 1.1838, + "step": 4112 + }, + { + "epoch": 0.2, + "grad_norm": 1.3664829469756556, + "learning_rate": 1.8565413047052778e-05, + "loss": 1.2979, + "step": 4113 + }, + { + "epoch": 0.2, + "grad_norm": 1.2550115910719717, + "learning_rate": 1.8564609001862547e-05, + "loss": 1.3135, + "step": 4114 + }, + { + "epoch": 0.2, + "grad_norm": 1.3225248418499602, + "learning_rate": 1.856380474883444e-05, + "loss": 1.2725, + "step": 4115 + }, + { + "epoch": 0.2, + "grad_norm": 1.3487911200666083, + "learning_rate": 1.856300028798798e-05, + "loss": 1.2432, + "step": 4116 + }, + { + "epoch": 0.2, + "grad_norm": 1.3151088171468028, + "learning_rate": 1.8562195619342684e-05, + "loss": 1.25, + "step": 4117 + }, + { + "epoch": 0.2, + "grad_norm": 1.0509979454235223, + "learning_rate": 1.856139074291808e-05, + "loss": 1.1836, + "step": 4118 + }, + { + "epoch": 0.2, + "grad_norm": 1.1790086887586955, + "learning_rate": 1.8560585658733707e-05, + "loss": 1.1484, + "step": 4119 + }, + { + "epoch": 0.2, + "grad_norm": 1.1937663594067485, + "learning_rate": 1.855978036680909e-05, + "loss": 1.1958, + "step": 4120 + }, + { + "epoch": 0.2, + "grad_norm": 1.0342488538230754, + "learning_rate": 1.8558974867163778e-05, + "loss": 1.0681, + "step": 4121 + }, + { + "epoch": 0.2, + "grad_norm": 1.2401356405511152, + "learning_rate": 1.8558169159817316e-05, + "loss": 1.2666, + "step": 4122 + }, + { + "epoch": 0.2, + "grad_norm": 1.2353932889679449, + "learning_rate": 1.855736324478926e-05, + "loss": 1.1138, + "step": 4123 + }, + { + "epoch": 0.2, + "grad_norm": 1.188501278773705, + "learning_rate": 1.8556557122099163e-05, + "loss": 1.1626, + "step": 4124 + }, + { + "epoch": 0.2, + "grad_norm": 1.241308529981415, + "learning_rate": 1.8555750791766588e-05, + "loss": 1.127, + "step": 4125 + }, + { + "epoch": 0.2, + "grad_norm": 0.9194257544323624, + "learning_rate": 1.8554944253811103e-05, + "loss": 1.0869, + "step": 4126 + }, + { + "epoch": 0.2, + "grad_norm": 1.3874513770403745, + "learning_rate": 1.8554137508252278e-05, + "loss": 1.1812, + "step": 4127 + }, + { + "epoch": 0.2, + "grad_norm": 1.6114129529297765, + "learning_rate": 1.8553330555109696e-05, + "loss": 1.021, + "step": 4128 + }, + { + "epoch": 0.2, + "grad_norm": 0.9140859211868847, + "learning_rate": 1.8552523394402933e-05, + "loss": 1.2339, + "step": 4129 + }, + { + "epoch": 0.2, + "grad_norm": 1.0200767486149613, + "learning_rate": 1.8551716026151584e-05, + "loss": 1.2285, + "step": 4130 + }, + { + "epoch": 0.2, + "grad_norm": 1.1360759961305973, + "learning_rate": 1.8550908450375232e-05, + "loss": 1.2036, + "step": 4131 + }, + { + "epoch": 0.2, + "grad_norm": 1.334897294547008, + "learning_rate": 1.855010066709348e-05, + "loss": 1.2817, + "step": 4132 + }, + { + "epoch": 0.2, + "grad_norm": 1.0445626300196393, + "learning_rate": 1.8549292676325935e-05, + "loss": 1.2119, + "step": 4133 + }, + { + "epoch": 0.2, + "grad_norm": 1.1694187508678697, + "learning_rate": 1.8548484478092192e-05, + "loss": 1.2539, + "step": 4134 + }, + { + "epoch": 0.2, + "grad_norm": 1.3204599814527618, + "learning_rate": 1.8547676072411874e-05, + "loss": 1.2715, + "step": 4135 + }, + { + "epoch": 0.2, + "grad_norm": 1.267508517666056, + "learning_rate": 1.8546867459304595e-05, + "loss": 1.0735, + "step": 4136 + }, + { + "epoch": 0.2, + "grad_norm": 1.3529333658156226, + "learning_rate": 1.8546058638789982e-05, + "loss": 1.022, + "step": 4137 + }, + { + "epoch": 0.2, + "grad_norm": 1.3988577317646531, + "learning_rate": 1.8545249610887653e-05, + "loss": 1.1865, + "step": 4138 + }, + { + "epoch": 0.2, + "grad_norm": 1.3622125402700604, + "learning_rate": 1.854444037561725e-05, + "loss": 1.3599, + "step": 4139 + }, + { + "epoch": 0.2, + "grad_norm": 1.268202136096835, + "learning_rate": 1.8543630932998408e-05, + "loss": 1.2241, + "step": 4140 + }, + { + "epoch": 0.2, + "grad_norm": 1.2668940880671669, + "learning_rate": 1.8542821283050766e-05, + "loss": 1.2798, + "step": 4141 + }, + { + "epoch": 0.2, + "grad_norm": 1.1186714887735982, + "learning_rate": 1.8542011425793976e-05, + "loss": 1.0989, + "step": 4142 + }, + { + "epoch": 0.2, + "grad_norm": 1.1720288135966577, + "learning_rate": 1.854120136124769e-05, + "loss": 1.3032, + "step": 4143 + }, + { + "epoch": 0.2, + "grad_norm": 1.3015133164185093, + "learning_rate": 1.8540391089431566e-05, + "loss": 1.0559, + "step": 4144 + }, + { + "epoch": 0.2, + "grad_norm": 1.14830781398766, + "learning_rate": 1.8539580610365267e-05, + "loss": 1.2021, + "step": 4145 + }, + { + "epoch": 0.2, + "grad_norm": 1.3145868499971776, + "learning_rate": 1.853876992406846e-05, + "loss": 1.1064, + "step": 4146 + }, + { + "epoch": 0.2, + "grad_norm": 1.2347419765602072, + "learning_rate": 1.853795903056082e-05, + "loss": 1.1523, + "step": 4147 + }, + { + "epoch": 0.2, + "grad_norm": 1.330156791648631, + "learning_rate": 1.8537147929862023e-05, + "loss": 1.2114, + "step": 4148 + }, + { + "epoch": 0.2, + "grad_norm": 1.331656795511666, + "learning_rate": 1.853633662199175e-05, + "loss": 1.249, + "step": 4149 + }, + { + "epoch": 0.2, + "grad_norm": 1.3549114614257127, + "learning_rate": 1.8535525106969694e-05, + "loss": 1.2676, + "step": 4150 + }, + { + "epoch": 0.2, + "grad_norm": 1.2517788727714279, + "learning_rate": 1.8534713384815548e-05, + "loss": 0.9663, + "step": 4151 + }, + { + "epoch": 0.2, + "grad_norm": 0.9985303710454936, + "learning_rate": 1.8533901455549005e-05, + "loss": 1.2173, + "step": 4152 + }, + { + "epoch": 0.2, + "grad_norm": 1.4754270682675756, + "learning_rate": 1.853308931918977e-05, + "loss": 1.228, + "step": 4153 + }, + { + "epoch": 0.2, + "grad_norm": 1.086503077286127, + "learning_rate": 1.8532276975757554e-05, + "loss": 1.3154, + "step": 4154 + }, + { + "epoch": 0.2, + "grad_norm": 1.1673689470719117, + "learning_rate": 1.8531464425272067e-05, + "loss": 1.3208, + "step": 4155 + }, + { + "epoch": 0.2, + "grad_norm": 1.2607448923119657, + "learning_rate": 1.853065166775303e-05, + "loss": 1.2144, + "step": 4156 + }, + { + "epoch": 0.2, + "grad_norm": 1.1611744516404576, + "learning_rate": 1.8529838703220164e-05, + "loss": 1.229, + "step": 4157 + }, + { + "epoch": 0.2, + "grad_norm": 1.4408591685869967, + "learning_rate": 1.85290255316932e-05, + "loss": 1.1816, + "step": 4158 + }, + { + "epoch": 0.2, + "grad_norm": 1.4020591277558896, + "learning_rate": 1.8528212153191868e-05, + "loss": 1.1772, + "step": 4159 + }, + { + "epoch": 0.2, + "grad_norm": 1.2194187538832462, + "learning_rate": 1.8527398567735904e-05, + "loss": 1.395, + "step": 4160 + }, + { + "epoch": 0.2, + "grad_norm": 1.067501965280121, + "learning_rate": 1.852658477534506e-05, + "loss": 1.1577, + "step": 4161 + }, + { + "epoch": 0.2, + "grad_norm": 0.9527127994954667, + "learning_rate": 1.8525770776039077e-05, + "loss": 1.2627, + "step": 4162 + }, + { + "epoch": 0.2, + "grad_norm": 1.2850949171434338, + "learning_rate": 1.852495656983771e-05, + "loss": 1.2603, + "step": 4163 + }, + { + "epoch": 0.2, + "grad_norm": 1.359353225780469, + "learning_rate": 1.852414215676072e-05, + "loss": 1.25, + "step": 4164 + }, + { + "epoch": 0.2, + "grad_norm": 1.2000712052211069, + "learning_rate": 1.8523327536827866e-05, + "loss": 1.1538, + "step": 4165 + }, + { + "epoch": 0.2, + "grad_norm": 0.9872529213860134, + "learning_rate": 1.852251271005892e-05, + "loss": 1.3633, + "step": 4166 + }, + { + "epoch": 0.2, + "grad_norm": 1.2057145478190794, + "learning_rate": 1.852169767647366e-05, + "loss": 1.2622, + "step": 4167 + }, + { + "epoch": 0.2, + "grad_norm": 1.4214380282252763, + "learning_rate": 1.852088243609185e-05, + "loss": 1.3071, + "step": 4168 + }, + { + "epoch": 0.2, + "grad_norm": 1.0650573271121984, + "learning_rate": 1.8520066988933286e-05, + "loss": 1.1367, + "step": 4169 + }, + { + "epoch": 0.2, + "grad_norm": 1.2725866491690976, + "learning_rate": 1.8519251335017753e-05, + "loss": 1.2646, + "step": 4170 + }, + { + "epoch": 0.2, + "grad_norm": 1.2355227136123728, + "learning_rate": 1.8518435474365045e-05, + "loss": 1.1875, + "step": 4171 + }, + { + "epoch": 0.2, + "grad_norm": 1.1069896540290733, + "learning_rate": 1.8517619406994956e-05, + "loss": 1.2646, + "step": 4172 + }, + { + "epoch": 0.2, + "grad_norm": 1.0901824238326834, + "learning_rate": 1.8516803132927296e-05, + "loss": 1.1021, + "step": 4173 + }, + { + "epoch": 0.2, + "grad_norm": 1.3446969946823994, + "learning_rate": 1.8515986652181873e-05, + "loss": 1.272, + "step": 4174 + }, + { + "epoch": 0.2, + "grad_norm": 1.1282039494541214, + "learning_rate": 1.8515169964778496e-05, + "loss": 1.0696, + "step": 4175 + }, + { + "epoch": 0.2, + "grad_norm": 1.1888091573612913, + "learning_rate": 1.8514353070736987e-05, + "loss": 1.1533, + "step": 4176 + }, + { + "epoch": 0.2, + "grad_norm": 1.4288224317460807, + "learning_rate": 1.851353597007717e-05, + "loss": 1.1772, + "step": 4177 + }, + { + "epoch": 0.2, + "grad_norm": 1.5190094185248741, + "learning_rate": 1.851271866281887e-05, + "loss": 1.1865, + "step": 4178 + }, + { + "epoch": 0.2, + "grad_norm": 1.3632856713110568, + "learning_rate": 1.8511901148981922e-05, + "loss": 1.0034, + "step": 4179 + }, + { + "epoch": 0.2, + "grad_norm": 1.2095574863012208, + "learning_rate": 1.8511083428586172e-05, + "loss": 1.1704, + "step": 4180 + }, + { + "epoch": 0.2, + "grad_norm": 1.1804983312626556, + "learning_rate": 1.8510265501651454e-05, + "loss": 1.1562, + "step": 4181 + }, + { + "epoch": 0.2, + "grad_norm": 1.2736008950834392, + "learning_rate": 1.8509447368197617e-05, + "loss": 1.1646, + "step": 4182 + }, + { + "epoch": 0.2, + "grad_norm": 1.391396046968214, + "learning_rate": 1.850862902824452e-05, + "loss": 1.1455, + "step": 4183 + }, + { + "epoch": 0.2, + "grad_norm": 1.279392491854659, + "learning_rate": 1.850781048181202e-05, + "loss": 1.2148, + "step": 4184 + }, + { + "epoch": 0.2, + "grad_norm": 1.3934675451867045, + "learning_rate": 1.850699172891998e-05, + "loss": 1.29, + "step": 4185 + }, + { + "epoch": 0.2, + "grad_norm": 1.399065926106856, + "learning_rate": 1.850617276958827e-05, + "loss": 1.3066, + "step": 4186 + }, + { + "epoch": 0.2, + "grad_norm": 1.342277947271469, + "learning_rate": 1.8505353603836756e-05, + "loss": 1.2593, + "step": 4187 + }, + { + "epoch": 0.2, + "grad_norm": 1.2433572565931112, + "learning_rate": 1.8504534231685332e-05, + "loss": 1.27, + "step": 4188 + }, + { + "epoch": 0.2, + "grad_norm": 1.5413421882822174, + "learning_rate": 1.8503714653153867e-05, + "loss": 1.2002, + "step": 4189 + }, + { + "epoch": 0.2, + "grad_norm": 1.5940626006428704, + "learning_rate": 1.850289486826226e-05, + "loss": 1.4697, + "step": 4190 + }, + { + "epoch": 0.2, + "grad_norm": 1.2007115894130946, + "learning_rate": 1.85020748770304e-05, + "loss": 1.1597, + "step": 4191 + }, + { + "epoch": 0.2, + "grad_norm": 1.2748548530703248, + "learning_rate": 1.8501254679478188e-05, + "loss": 1.2124, + "step": 4192 + }, + { + "epoch": 0.2, + "grad_norm": 1.3580175101456484, + "learning_rate": 1.850043427562552e-05, + "loss": 1.3315, + "step": 4193 + }, + { + "epoch": 0.2, + "grad_norm": 1.2679041459496958, + "learning_rate": 1.8499613665492317e-05, + "loss": 1.1445, + "step": 4194 + }, + { + "epoch": 0.2, + "grad_norm": 1.308714992695532, + "learning_rate": 1.8498792849098482e-05, + "loss": 1.1714, + "step": 4195 + }, + { + "epoch": 0.2, + "grad_norm": 1.1419236233975938, + "learning_rate": 1.8497971826463943e-05, + "loss": 1.1387, + "step": 4196 + }, + { + "epoch": 0.2, + "grad_norm": 1.3680264206537802, + "learning_rate": 1.849715059760862e-05, + "loss": 1.0938, + "step": 4197 + }, + { + "epoch": 0.2, + "grad_norm": 1.0554024630881416, + "learning_rate": 1.8496329162552437e-05, + "loss": 1.1909, + "step": 4198 + }, + { + "epoch": 0.2, + "grad_norm": 1.3247361815669028, + "learning_rate": 1.8495507521315333e-05, + "loss": 1.3242, + "step": 4199 + }, + { + "epoch": 0.2, + "grad_norm": 1.3691684259102028, + "learning_rate": 1.8494685673917248e-05, + "loss": 1.1631, + "step": 4200 + }, + { + "epoch": 0.2, + "grad_norm": 0.9816598401959337, + "learning_rate": 1.8493863620378123e-05, + "loss": 1.0603, + "step": 4201 + }, + { + "epoch": 0.2, + "grad_norm": 1.302983550842362, + "learning_rate": 1.8493041360717908e-05, + "loss": 1.0918, + "step": 4202 + }, + { + "epoch": 0.2, + "grad_norm": 1.140101638460897, + "learning_rate": 1.8492218894956555e-05, + "loss": 1.0808, + "step": 4203 + }, + { + "epoch": 0.2, + "grad_norm": 1.1254497839665953, + "learning_rate": 1.8491396223114024e-05, + "loss": 1.083, + "step": 4204 + }, + { + "epoch": 0.2, + "grad_norm": 1.2468739506945612, + "learning_rate": 1.849057334521028e-05, + "loss": 1.1709, + "step": 4205 + }, + { + "epoch": 0.2, + "grad_norm": 1.2713147208405864, + "learning_rate": 1.848975026126529e-05, + "loss": 1.0715, + "step": 4206 + }, + { + "epoch": 0.2, + "grad_norm": 1.4647761320819583, + "learning_rate": 1.848892697129903e-05, + "loss": 1.1387, + "step": 4207 + }, + { + "epoch": 0.2, + "grad_norm": 1.451367144306609, + "learning_rate": 1.8488103475331476e-05, + "loss": 1.1494, + "step": 4208 + }, + { + "epoch": 0.2, + "grad_norm": 1.1748359046691985, + "learning_rate": 1.8487279773382613e-05, + "loss": 1.252, + "step": 4209 + }, + { + "epoch": 0.2, + "grad_norm": 1.451547846636832, + "learning_rate": 1.848645586547243e-05, + "loss": 1.3037, + "step": 4210 + }, + { + "epoch": 0.2, + "grad_norm": 1.071505911075315, + "learning_rate": 1.848563175162092e-05, + "loss": 1.1069, + "step": 4211 + }, + { + "epoch": 0.2, + "grad_norm": 1.298805880463429, + "learning_rate": 1.8484807431848085e-05, + "loss": 1.1328, + "step": 4212 + }, + { + "epoch": 0.2, + "grad_norm": 1.3489296035658007, + "learning_rate": 1.8483982906173928e-05, + "loss": 1.2246, + "step": 4213 + }, + { + "epoch": 0.2, + "grad_norm": 1.7049457628546592, + "learning_rate": 1.8483158174618456e-05, + "loss": 1.2197, + "step": 4214 + }, + { + "epoch": 0.2, + "grad_norm": 1.3715469652769137, + "learning_rate": 1.8482333237201678e-05, + "loss": 1.2021, + "step": 4215 + }, + { + "epoch": 0.2, + "grad_norm": 1.2749093909537395, + "learning_rate": 1.8481508093943622e-05, + "loss": 1.0974, + "step": 4216 + }, + { + "epoch": 0.2, + "grad_norm": 1.5052534009984304, + "learning_rate": 1.8480682744864306e-05, + "loss": 1.2832, + "step": 4217 + }, + { + "epoch": 0.2, + "grad_norm": 1.1086398386383465, + "learning_rate": 1.8479857189983762e-05, + "loss": 1.2014, + "step": 4218 + }, + { + "epoch": 0.2, + "grad_norm": 1.1528936808295647, + "learning_rate": 1.8479031429322022e-05, + "loss": 1.127, + "step": 4219 + }, + { + "epoch": 0.2, + "grad_norm": 1.2366414510234975, + "learning_rate": 1.8478205462899127e-05, + "loss": 1.3101, + "step": 4220 + }, + { + "epoch": 0.2, + "grad_norm": 1.3126704712487294, + "learning_rate": 1.8477379290735117e-05, + "loss": 1.3428, + "step": 4221 + }, + { + "epoch": 0.2, + "grad_norm": 1.267665747581158, + "learning_rate": 1.8476552912850044e-05, + "loss": 1.0806, + "step": 4222 + }, + { + "epoch": 0.2, + "grad_norm": 1.207512205747138, + "learning_rate": 1.8475726329263958e-05, + "loss": 1.207, + "step": 4223 + }, + { + "epoch": 0.2, + "grad_norm": 1.547937287442803, + "learning_rate": 1.8474899539996923e-05, + "loss": 1.2681, + "step": 4224 + }, + { + "epoch": 0.2, + "grad_norm": 1.167564111040716, + "learning_rate": 1.8474072545068998e-05, + "loss": 1.2451, + "step": 4225 + }, + { + "epoch": 0.2, + "grad_norm": 1.3219273124657043, + "learning_rate": 1.8473245344500254e-05, + "loss": 1.2969, + "step": 4226 + }, + { + "epoch": 0.2, + "grad_norm": 1.5052245514323321, + "learning_rate": 1.8472417938310765e-05, + "loss": 1.2646, + "step": 4227 + }, + { + "epoch": 0.2, + "grad_norm": 1.4758628237977864, + "learning_rate": 1.8471590326520607e-05, + "loss": 1.2202, + "step": 4228 + }, + { + "epoch": 0.2, + "grad_norm": 1.8327802442789343, + "learning_rate": 1.8470762509149867e-05, + "loss": 1.4146, + "step": 4229 + }, + { + "epoch": 0.2, + "grad_norm": 1.282405552669812, + "learning_rate": 1.8469934486218634e-05, + "loss": 1.3384, + "step": 4230 + }, + { + "epoch": 0.2, + "grad_norm": 1.0689462947001933, + "learning_rate": 1.8469106257747002e-05, + "loss": 1.3042, + "step": 4231 + }, + { + "epoch": 0.2, + "grad_norm": 1.4404399510897399, + "learning_rate": 1.846827782375506e-05, + "loss": 1.3667, + "step": 4232 + }, + { + "epoch": 0.2, + "grad_norm": 1.335211588858977, + "learning_rate": 1.8467449184262927e-05, + "loss": 1.23, + "step": 4233 + }, + { + "epoch": 0.2, + "grad_norm": 1.2171570851601994, + "learning_rate": 1.84666203392907e-05, + "loss": 1.1084, + "step": 4234 + }, + { + "epoch": 0.2, + "grad_norm": 1.2848395775847028, + "learning_rate": 1.84657912888585e-05, + "loss": 0.9661, + "step": 4235 + }, + { + "epoch": 0.2, + "grad_norm": 1.402225621396294, + "learning_rate": 1.846496203298644e-05, + "loss": 1.0029, + "step": 4236 + }, + { + "epoch": 0.2, + "grad_norm": 0.878622540245646, + "learning_rate": 1.8464132571694648e-05, + "loss": 1.2715, + "step": 4237 + }, + { + "epoch": 0.2, + "grad_norm": 1.0552685633034466, + "learning_rate": 1.8463302905003247e-05, + "loss": 1.2026, + "step": 4238 + }, + { + "epoch": 0.2, + "grad_norm": 1.1661218772978283, + "learning_rate": 1.846247303293238e-05, + "loss": 1.1919, + "step": 4239 + }, + { + "epoch": 0.2, + "grad_norm": 1.2956341356750836, + "learning_rate": 1.846164295550218e-05, + "loss": 1.2583, + "step": 4240 + }, + { + "epoch": 0.2, + "grad_norm": 1.3619675717698394, + "learning_rate": 1.846081267273278e-05, + "loss": 1.2363, + "step": 4241 + }, + { + "epoch": 0.2, + "grad_norm": 1.3114334856959564, + "learning_rate": 1.845998218464435e-05, + "loss": 1.2212, + "step": 4242 + }, + { + "epoch": 0.2, + "grad_norm": 1.2328288411861021, + "learning_rate": 1.845915149125703e-05, + "loss": 1.1138, + "step": 4243 + }, + { + "epoch": 0.2, + "grad_norm": 1.377982512840893, + "learning_rate": 1.8458320592590976e-05, + "loss": 1.1934, + "step": 4244 + }, + { + "epoch": 0.2, + "grad_norm": 1.4516065412242682, + "learning_rate": 1.845748948866636e-05, + "loss": 1.394, + "step": 4245 + }, + { + "epoch": 0.2, + "grad_norm": 1.2077974107751783, + "learning_rate": 1.8456658179503345e-05, + "loss": 1.2158, + "step": 4246 + }, + { + "epoch": 0.2, + "grad_norm": 1.285646335993893, + "learning_rate": 1.8455826665122107e-05, + "loss": 1.2061, + "step": 4247 + }, + { + "epoch": 0.2, + "grad_norm": 1.1520888614894382, + "learning_rate": 1.845499494554282e-05, + "loss": 1.1338, + "step": 4248 + }, + { + "epoch": 0.2, + "grad_norm": 1.4433784518574095, + "learning_rate": 1.8454163020785676e-05, + "loss": 1.1768, + "step": 4249 + }, + { + "epoch": 0.2, + "grad_norm": 1.5148379588501744, + "learning_rate": 1.8453330890870857e-05, + "loss": 1.2285, + "step": 4250 + }, + { + "epoch": 0.2, + "grad_norm": 1.3808642715745274, + "learning_rate": 1.8452498555818556e-05, + "loss": 1.2764, + "step": 4251 + }, + { + "epoch": 0.2, + "grad_norm": 1.3902208129784777, + "learning_rate": 1.8451666015648976e-05, + "loss": 1.2715, + "step": 4252 + }, + { + "epoch": 0.2, + "grad_norm": 1.303100768600207, + "learning_rate": 1.8450833270382312e-05, + "loss": 1.271, + "step": 4253 + }, + { + "epoch": 0.2, + "grad_norm": 1.3042622307791834, + "learning_rate": 1.845000032003878e-05, + "loss": 1.2422, + "step": 4254 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324718978215424, + "learning_rate": 1.8449167164638596e-05, + "loss": 1.3647, + "step": 4255 + }, + { + "epoch": 0.2, + "grad_norm": 1.5055500671418578, + "learning_rate": 1.844833380420197e-05, + "loss": 1.1782, + "step": 4256 + }, + { + "epoch": 0.2, + "grad_norm": 1.2260888770474974, + "learning_rate": 1.8447500238749124e-05, + "loss": 1.2114, + "step": 4257 + }, + { + "epoch": 0.2, + "grad_norm": 1.145822775444332, + "learning_rate": 1.8446666468300292e-05, + "loss": 1.3066, + "step": 4258 + }, + { + "epoch": 0.2, + "grad_norm": 1.1795511210437146, + "learning_rate": 1.844583249287571e-05, + "loss": 1.2075, + "step": 4259 + }, + { + "epoch": 0.2, + "grad_norm": 1.0221081799664247, + "learning_rate": 1.844499831249561e-05, + "loss": 1.1997, + "step": 4260 + }, + { + "epoch": 0.2, + "grad_norm": 1.1775765311042579, + "learning_rate": 1.8444163927180232e-05, + "loss": 1.2256, + "step": 4261 + }, + { + "epoch": 0.2, + "grad_norm": 1.050419383848904, + "learning_rate": 1.8443329336949835e-05, + "loss": 1.1128, + "step": 4262 + }, + { + "epoch": 0.21, + "grad_norm": 0.9692025963376726, + "learning_rate": 1.844249454182466e-05, + "loss": 1.0955, + "step": 4263 + }, + { + "epoch": 0.21, + "grad_norm": 1.0436446747053556, + "learning_rate": 1.8441659541824975e-05, + "loss": 1.1665, + "step": 4264 + }, + { + "epoch": 0.21, + "grad_norm": 1.2716823928322454, + "learning_rate": 1.844082433697104e-05, + "loss": 0.9268, + "step": 4265 + }, + { + "epoch": 0.21, + "grad_norm": 1.1860696355805678, + "learning_rate": 1.843998892728312e-05, + "loss": 1.0767, + "step": 4266 + }, + { + "epoch": 0.21, + "grad_norm": 1.222307748183581, + "learning_rate": 1.8439153312781487e-05, + "loss": 1.167, + "step": 4267 + }, + { + "epoch": 0.21, + "grad_norm": 1.1742592929300073, + "learning_rate": 1.8438317493486426e-05, + "loss": 1.1753, + "step": 4268 + }, + { + "epoch": 0.21, + "grad_norm": 1.2179130268814071, + "learning_rate": 1.843748146941821e-05, + "loss": 1.0601, + "step": 4269 + }, + { + "epoch": 0.21, + "grad_norm": 1.2155805552549275, + "learning_rate": 1.843664524059714e-05, + "loss": 1.3125, + "step": 4270 + }, + { + "epoch": 0.21, + "grad_norm": 0.9213401624689068, + "learning_rate": 1.8435808807043498e-05, + "loss": 1.1763, + "step": 4271 + }, + { + "epoch": 0.21, + "grad_norm": 1.459907473794051, + "learning_rate": 1.8434972168777585e-05, + "loss": 1.3271, + "step": 4272 + }, + { + "epoch": 0.21, + "grad_norm": 1.1873806966122182, + "learning_rate": 1.8434135325819703e-05, + "loss": 1.1802, + "step": 4273 + }, + { + "epoch": 0.21, + "grad_norm": 1.1846934727522582, + "learning_rate": 1.8433298278190163e-05, + "loss": 1.2354, + "step": 4274 + }, + { + "epoch": 0.21, + "grad_norm": 1.4761188850611127, + "learning_rate": 1.843246102590927e-05, + "loss": 1.1953, + "step": 4275 + }, + { + "epoch": 0.21, + "grad_norm": 1.441773373996717, + "learning_rate": 1.843162356899735e-05, + "loss": 1.1455, + "step": 4276 + }, + { + "epoch": 0.21, + "grad_norm": 1.400691306437527, + "learning_rate": 1.8430785907474727e-05, + "loss": 1.3354, + "step": 4277 + }, + { + "epoch": 0.21, + "grad_norm": 1.1445927064733374, + "learning_rate": 1.842994804136172e-05, + "loss": 1.1113, + "step": 4278 + }, + { + "epoch": 0.21, + "grad_norm": 1.293525562603318, + "learning_rate": 1.8429109970678664e-05, + "loss": 1.1482, + "step": 4279 + }, + { + "epoch": 0.21, + "grad_norm": 1.3635538417803064, + "learning_rate": 1.8428271695445903e-05, + "loss": 1.2803, + "step": 4280 + }, + { + "epoch": 0.21, + "grad_norm": 1.3405343655640127, + "learning_rate": 1.842743321568377e-05, + "loss": 1.0874, + "step": 4281 + }, + { + "epoch": 0.21, + "grad_norm": 1.5075261459911644, + "learning_rate": 1.842659453141262e-05, + "loss": 1.2512, + "step": 4282 + }, + { + "epoch": 0.21, + "grad_norm": 1.0840194397371827, + "learning_rate": 1.8425755642652797e-05, + "loss": 1.186, + "step": 4283 + }, + { + "epoch": 0.21, + "grad_norm": 1.262757500225548, + "learning_rate": 1.842491654942467e-05, + "loss": 1.0442, + "step": 4284 + }, + { + "epoch": 0.21, + "grad_norm": 1.0120831136887372, + "learning_rate": 1.8424077251748593e-05, + "loss": 1.1099, + "step": 4285 + }, + { + "epoch": 0.21, + "grad_norm": 1.2785094421996668, + "learning_rate": 1.8423237749644936e-05, + "loss": 1.1055, + "step": 4286 + }, + { + "epoch": 0.21, + "grad_norm": 1.268028033798254, + "learning_rate": 1.8422398043134068e-05, + "loss": 1.1396, + "step": 4287 + }, + { + "epoch": 0.21, + "grad_norm": 1.1934523587156094, + "learning_rate": 1.842155813223637e-05, + "loss": 1.1855, + "step": 4288 + }, + { + "epoch": 0.21, + "grad_norm": 1.3428325561855743, + "learning_rate": 1.8420718016972227e-05, + "loss": 1.2622, + "step": 4289 + }, + { + "epoch": 0.21, + "grad_norm": 1.600228392755461, + "learning_rate": 1.8419877697362015e-05, + "loss": 1.479, + "step": 4290 + }, + { + "epoch": 0.21, + "grad_norm": 1.2255156357671224, + "learning_rate": 1.841903717342614e-05, + "loss": 0.9629, + "step": 4291 + }, + { + "epoch": 0.21, + "grad_norm": 1.281119136565635, + "learning_rate": 1.8418196445184988e-05, + "loss": 1.124, + "step": 4292 + }, + { + "epoch": 0.21, + "grad_norm": 1.0545532408771852, + "learning_rate": 1.841735551265897e-05, + "loss": 1.2363, + "step": 4293 + }, + { + "epoch": 0.21, + "grad_norm": 1.2960023816568036, + "learning_rate": 1.8416514375868482e-05, + "loss": 1.1533, + "step": 4294 + }, + { + "epoch": 0.21, + "grad_norm": 1.7684732434569443, + "learning_rate": 1.8415673034833945e-05, + "loss": 1.5186, + "step": 4295 + }, + { + "epoch": 0.21, + "grad_norm": 1.5952701570377055, + "learning_rate": 1.8414831489575774e-05, + "loss": 1.272, + "step": 4296 + }, + { + "epoch": 0.21, + "grad_norm": 1.2585022929312808, + "learning_rate": 1.841398974011439e-05, + "loss": 1.1172, + "step": 4297 + }, + { + "epoch": 0.21, + "grad_norm": 1.1833482149937713, + "learning_rate": 1.8413147786470217e-05, + "loss": 1.0635, + "step": 4298 + }, + { + "epoch": 0.21, + "grad_norm": 1.2330734219595594, + "learning_rate": 1.8412305628663693e-05, + "loss": 0.9761, + "step": 4299 + }, + { + "epoch": 0.21, + "grad_norm": 0.8959577688950373, + "learning_rate": 1.841146326671525e-05, + "loss": 1.104, + "step": 4300 + }, + { + "epoch": 0.21, + "grad_norm": 0.9921477615481038, + "learning_rate": 1.841062070064533e-05, + "loss": 1.2764, + "step": 4301 + }, + { + "epoch": 0.21, + "grad_norm": 1.1861829273809994, + "learning_rate": 1.840977793047438e-05, + "loss": 1.1675, + "step": 4302 + }, + { + "epoch": 0.21, + "grad_norm": 1.089692818469624, + "learning_rate": 1.8408934956222855e-05, + "loss": 1.1255, + "step": 4303 + }, + { + "epoch": 0.21, + "grad_norm": 1.3214412264249773, + "learning_rate": 1.8408091777911203e-05, + "loss": 1.1377, + "step": 4304 + }, + { + "epoch": 0.21, + "grad_norm": 1.2597300655607568, + "learning_rate": 1.8407248395559897e-05, + "loss": 1.249, + "step": 4305 + }, + { + "epoch": 0.21, + "grad_norm": 1.4200508896652564, + "learning_rate": 1.840640480918939e-05, + "loss": 1.3115, + "step": 4306 + }, + { + "epoch": 0.21, + "grad_norm": 1.3800727277787403, + "learning_rate": 1.8405561018820166e-05, + "loss": 1.2559, + "step": 4307 + }, + { + "epoch": 0.21, + "grad_norm": 1.2455103069388707, + "learning_rate": 1.8404717024472696e-05, + "loss": 1.2607, + "step": 4308 + }, + { + "epoch": 0.21, + "grad_norm": 1.192796447923225, + "learning_rate": 1.8403872826167458e-05, + "loss": 1.1665, + "step": 4309 + }, + { + "epoch": 0.21, + "grad_norm": 1.1755147627656064, + "learning_rate": 1.8403028423924943e-05, + "loss": 1.1387, + "step": 4310 + }, + { + "epoch": 0.21, + "grad_norm": 1.1334124926798785, + "learning_rate": 1.8402183817765643e-05, + "loss": 1.1079, + "step": 4311 + }, + { + "epoch": 0.21, + "grad_norm": 1.265548475637155, + "learning_rate": 1.840133900771005e-05, + "loss": 1.1797, + "step": 4312 + }, + { + "epoch": 0.21, + "grad_norm": 0.9923242303422974, + "learning_rate": 1.8400493993778666e-05, + "loss": 1.1479, + "step": 4313 + }, + { + "epoch": 0.21, + "grad_norm": 1.3480077193876068, + "learning_rate": 1.8399648775991996e-05, + "loss": 1.4546, + "step": 4314 + }, + { + "epoch": 0.21, + "grad_norm": 1.519711538996555, + "learning_rate": 1.8398803354370554e-05, + "loss": 1.1895, + "step": 4315 + }, + { + "epoch": 0.21, + "grad_norm": 1.2088068685152145, + "learning_rate": 1.8397957728934855e-05, + "loss": 1.1831, + "step": 4316 + }, + { + "epoch": 0.21, + "grad_norm": 1.3965213875404816, + "learning_rate": 1.839711189970542e-05, + "loss": 1.1851, + "step": 4317 + }, + { + "epoch": 0.21, + "grad_norm": 1.3171397259987716, + "learning_rate": 1.8396265866702773e-05, + "loss": 1.2612, + "step": 4318 + }, + { + "epoch": 0.21, + "grad_norm": 1.0823642947845018, + "learning_rate": 1.8395419629947448e-05, + "loss": 1.2339, + "step": 4319 + }, + { + "epoch": 0.21, + "grad_norm": 0.9601873798509978, + "learning_rate": 1.8394573189459977e-05, + "loss": 1.2935, + "step": 4320 + }, + { + "epoch": 0.21, + "grad_norm": 0.9252434598722146, + "learning_rate": 1.8393726545260903e-05, + "loss": 1.2144, + "step": 4321 + }, + { + "epoch": 0.21, + "grad_norm": 1.159528182178332, + "learning_rate": 1.839287969737077e-05, + "loss": 1.1533, + "step": 4322 + }, + { + "epoch": 0.21, + "grad_norm": 1.1427304286664348, + "learning_rate": 1.839203264581013e-05, + "loss": 1.1567, + "step": 4323 + }, + { + "epoch": 0.21, + "grad_norm": 1.1977915599653646, + "learning_rate": 1.8391185390599537e-05, + "loss": 1.1309, + "step": 4324 + }, + { + "epoch": 0.21, + "grad_norm": 1.1825227859076293, + "learning_rate": 1.8390337931759553e-05, + "loss": 1.1895, + "step": 4325 + }, + { + "epoch": 0.21, + "grad_norm": 0.8859841562494186, + "learning_rate": 1.8389490269310744e-05, + "loss": 1.2598, + "step": 4326 + }, + { + "epoch": 0.21, + "grad_norm": 1.3251669424448402, + "learning_rate": 1.8388642403273674e-05, + "loss": 1.1821, + "step": 4327 + }, + { + "epoch": 0.21, + "grad_norm": 1.2576512394049328, + "learning_rate": 1.8387794333668928e-05, + "loss": 1.2368, + "step": 4328 + }, + { + "epoch": 0.21, + "grad_norm": 1.1089926554755873, + "learning_rate": 1.838694606051708e-05, + "loss": 1.4102, + "step": 4329 + }, + { + "epoch": 0.21, + "grad_norm": 1.6859834834249956, + "learning_rate": 1.8386097583838714e-05, + "loss": 1.2588, + "step": 4330 + }, + { + "epoch": 0.21, + "grad_norm": 0.8857354529267414, + "learning_rate": 1.8385248903654423e-05, + "loss": 1.1338, + "step": 4331 + }, + { + "epoch": 0.21, + "grad_norm": 1.1006670809197643, + "learning_rate": 1.83844000199848e-05, + "loss": 1.0825, + "step": 4332 + }, + { + "epoch": 0.21, + "grad_norm": 1.3480711138896766, + "learning_rate": 1.838355093285045e-05, + "loss": 1.3486, + "step": 4333 + }, + { + "epoch": 0.21, + "grad_norm": 1.064301777815102, + "learning_rate": 1.838270164227197e-05, + "loss": 1.167, + "step": 4334 + }, + { + "epoch": 0.21, + "grad_norm": 1.1299314705998726, + "learning_rate": 1.8381852148269976e-05, + "loss": 1.2866, + "step": 4335 + }, + { + "epoch": 0.21, + "grad_norm": 1.1111686717881117, + "learning_rate": 1.8381002450865078e-05, + "loss": 1.1899, + "step": 4336 + }, + { + "epoch": 0.21, + "grad_norm": 1.0762080526460733, + "learning_rate": 1.8380152550077903e-05, + "loss": 1.168, + "step": 4337 + }, + { + "epoch": 0.21, + "grad_norm": 1.69634102660346, + "learning_rate": 1.8379302445929068e-05, + "loss": 1.3389, + "step": 4338 + }, + { + "epoch": 0.21, + "grad_norm": 1.1069750803256408, + "learning_rate": 1.8378452138439206e-05, + "loss": 1.1245, + "step": 4339 + }, + { + "epoch": 0.21, + "grad_norm": 1.5198919999997635, + "learning_rate": 1.8377601627628952e-05, + "loss": 1.2117, + "step": 4340 + }, + { + "epoch": 0.21, + "grad_norm": 1.2849232093457805, + "learning_rate": 1.837675091351894e-05, + "loss": 1.3276, + "step": 4341 + }, + { + "epoch": 0.21, + "grad_norm": 1.3245178907534891, + "learning_rate": 1.8375899996129823e-05, + "loss": 1.2334, + "step": 4342 + }, + { + "epoch": 0.21, + "grad_norm": 1.1710587072978353, + "learning_rate": 1.837504887548224e-05, + "loss": 1.3115, + "step": 4343 + }, + { + "epoch": 0.21, + "grad_norm": 1.1711341077635804, + "learning_rate": 1.8374197551596857e-05, + "loss": 1.2407, + "step": 4344 + }, + { + "epoch": 0.21, + "grad_norm": 1.351122142769699, + "learning_rate": 1.8373346024494324e-05, + "loss": 1.2139, + "step": 4345 + }, + { + "epoch": 0.21, + "grad_norm": 1.291248686494472, + "learning_rate": 1.8372494294195306e-05, + "loss": 1.2114, + "step": 4346 + }, + { + "epoch": 0.21, + "grad_norm": 1.313426605180628, + "learning_rate": 1.837164236072048e-05, + "loss": 1.2026, + "step": 4347 + }, + { + "epoch": 0.21, + "grad_norm": 1.2803192542937483, + "learning_rate": 1.8370790224090508e-05, + "loss": 1.0732, + "step": 4348 + }, + { + "epoch": 0.21, + "grad_norm": 1.3237252403933644, + "learning_rate": 1.836993788432608e-05, + "loss": 1.252, + "step": 4349 + }, + { + "epoch": 0.21, + "grad_norm": 1.2718012814451056, + "learning_rate": 1.836908534144787e-05, + "loss": 1.2681, + "step": 4350 + }, + { + "epoch": 0.21, + "grad_norm": 1.0384780528307402, + "learning_rate": 1.8368232595476575e-05, + "loss": 1.022, + "step": 4351 + }, + { + "epoch": 0.21, + "grad_norm": 1.37726427042209, + "learning_rate": 1.8367379646432884e-05, + "loss": 1.311, + "step": 4352 + }, + { + "epoch": 0.21, + "grad_norm": 1.346598331897287, + "learning_rate": 1.8366526494337497e-05, + "loss": 1.0469, + "step": 4353 + }, + { + "epoch": 0.21, + "grad_norm": 1.354594778260442, + "learning_rate": 1.8365673139211114e-05, + "loss": 1.2026, + "step": 4354 + }, + { + "epoch": 0.21, + "grad_norm": 1.3065495673237775, + "learning_rate": 1.8364819581074447e-05, + "loss": 1.1719, + "step": 4355 + }, + { + "epoch": 0.21, + "grad_norm": 1.3949235441545145, + "learning_rate": 1.836396581994821e-05, + "loss": 1.3105, + "step": 4356 + }, + { + "epoch": 0.21, + "grad_norm": 1.1343675057281937, + "learning_rate": 1.8363111855853122e-05, + "loss": 1.0754, + "step": 4357 + }, + { + "epoch": 0.21, + "grad_norm": 1.0403631129789366, + "learning_rate": 1.8362257688809904e-05, + "loss": 1.125, + "step": 4358 + }, + { + "epoch": 0.21, + "grad_norm": 1.2588665415334734, + "learning_rate": 1.8361403318839283e-05, + "loss": 1.1626, + "step": 4359 + }, + { + "epoch": 0.21, + "grad_norm": 1.1924071091446717, + "learning_rate": 1.8360548745961994e-05, + "loss": 1.0679, + "step": 4360 + }, + { + "epoch": 0.21, + "grad_norm": 1.1521465689194352, + "learning_rate": 1.8359693970198772e-05, + "loss": 1.1025, + "step": 4361 + }, + { + "epoch": 0.21, + "grad_norm": 1.2422747844506365, + "learning_rate": 1.8358838991570363e-05, + "loss": 1.1797, + "step": 4362 + }, + { + "epoch": 0.21, + "grad_norm": 1.3327084422214552, + "learning_rate": 1.835798381009752e-05, + "loss": 1.2656, + "step": 4363 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635719403038898, + "learning_rate": 1.8357128425800983e-05, + "loss": 1.2041, + "step": 4364 + }, + { + "epoch": 0.21, + "grad_norm": 1.205936391738759, + "learning_rate": 1.835627283870152e-05, + "loss": 1.1421, + "step": 4365 + }, + { + "epoch": 0.21, + "grad_norm": 1.1358011217877584, + "learning_rate": 1.835541704881989e-05, + "loss": 1.2051, + "step": 4366 + }, + { + "epoch": 0.21, + "grad_norm": 1.2603171946899037, + "learning_rate": 1.8354561056176857e-05, + "loss": 1.2041, + "step": 4367 + }, + { + "epoch": 0.21, + "grad_norm": 1.3172620041735963, + "learning_rate": 1.8353704860793202e-05, + "loss": 1.3438, + "step": 4368 + }, + { + "epoch": 0.21, + "grad_norm": 1.2569577749121277, + "learning_rate": 1.8352848462689694e-05, + "loss": 1.2485, + "step": 4369 + }, + { + "epoch": 0.21, + "grad_norm": 1.112500915435981, + "learning_rate": 1.835199186188712e-05, + "loss": 1.0049, + "step": 4370 + }, + { + "epoch": 0.21, + "grad_norm": 1.3335744300242611, + "learning_rate": 1.8351135058406266e-05, + "loss": 1.23, + "step": 4371 + }, + { + "epoch": 0.21, + "grad_norm": 1.2493962448341316, + "learning_rate": 1.8350278052267922e-05, + "loss": 1.251, + "step": 4372 + }, + { + "epoch": 0.21, + "grad_norm": 1.1394376322426596, + "learning_rate": 1.834942084349289e-05, + "loss": 1.1641, + "step": 4373 + }, + { + "epoch": 0.21, + "grad_norm": 1.2384156789301703, + "learning_rate": 1.8348563432101967e-05, + "loss": 1.2847, + "step": 4374 + }, + { + "epoch": 0.21, + "grad_norm": 1.294771402108518, + "learning_rate": 1.834770581811596e-05, + "loss": 1.1196, + "step": 4375 + }, + { + "epoch": 0.21, + "grad_norm": 1.3594825144871425, + "learning_rate": 1.8346848001555688e-05, + "loss": 1.2256, + "step": 4376 + }, + { + "epoch": 0.21, + "grad_norm": 1.5877083077854959, + "learning_rate": 1.8345989982441955e-05, + "loss": 1.1758, + "step": 4377 + }, + { + "epoch": 0.21, + "grad_norm": 1.3651050024365659, + "learning_rate": 1.8345131760795598e-05, + "loss": 1.093, + "step": 4378 + }, + { + "epoch": 0.21, + "grad_norm": 1.2404091384231468, + "learning_rate": 1.834427333663743e-05, + "loss": 1.2524, + "step": 4379 + }, + { + "epoch": 0.21, + "grad_norm": 1.1032470694468395, + "learning_rate": 1.8343414709988288e-05, + "loss": 1.2437, + "step": 4380 + }, + { + "epoch": 0.21, + "grad_norm": 1.553879186412182, + "learning_rate": 1.834255588086901e-05, + "loss": 1.2642, + "step": 4381 + }, + { + "epoch": 0.21, + "grad_norm": 1.082103365663125, + "learning_rate": 1.834169684930043e-05, + "loss": 1.1377, + "step": 4382 + }, + { + "epoch": 0.21, + "grad_norm": 1.233339196372351, + "learning_rate": 1.8340837615303405e-05, + "loss": 1.0454, + "step": 4383 + }, + { + "epoch": 0.21, + "grad_norm": 1.163908360191841, + "learning_rate": 1.833997817889878e-05, + "loss": 1.1973, + "step": 4384 + }, + { + "epoch": 0.21, + "grad_norm": 1.2801311246224156, + "learning_rate": 1.833911854010741e-05, + "loss": 0.9805, + "step": 4385 + }, + { + "epoch": 0.21, + "grad_norm": 1.1100500031733034, + "learning_rate": 1.833825869895016e-05, + "loss": 1.2002, + "step": 4386 + }, + { + "epoch": 0.21, + "grad_norm": 1.3607910660401024, + "learning_rate": 1.8337398655447894e-05, + "loss": 1.1875, + "step": 4387 + }, + { + "epoch": 0.21, + "grad_norm": 1.0935972240834675, + "learning_rate": 1.8336538409621474e-05, + "loss": 1.0972, + "step": 4388 + }, + { + "epoch": 0.21, + "grad_norm": 0.85099296303356, + "learning_rate": 1.8335677961491793e-05, + "loss": 1.2363, + "step": 4389 + }, + { + "epoch": 0.21, + "grad_norm": 1.1068701609854668, + "learning_rate": 1.833481731107972e-05, + "loss": 1.2803, + "step": 4390 + }, + { + "epoch": 0.21, + "grad_norm": 1.2710857691538058, + "learning_rate": 1.833395645840614e-05, + "loss": 1.1089, + "step": 4391 + }, + { + "epoch": 0.21, + "grad_norm": 1.274529249606377, + "learning_rate": 1.8333095403491946e-05, + "loss": 1.2852, + "step": 4392 + }, + { + "epoch": 0.21, + "grad_norm": 1.4841455435658981, + "learning_rate": 1.8332234146358034e-05, + "loss": 1.0542, + "step": 4393 + }, + { + "epoch": 0.21, + "grad_norm": 1.1400405150411008, + "learning_rate": 1.8331372687025305e-05, + "loss": 1.228, + "step": 4394 + }, + { + "epoch": 0.21, + "grad_norm": 1.001509514333863, + "learning_rate": 1.8330511025514662e-05, + "loss": 1.1963, + "step": 4395 + }, + { + "epoch": 0.21, + "grad_norm": 1.4162009773935669, + "learning_rate": 1.8329649161847016e-05, + "loss": 1.2334, + "step": 4396 + }, + { + "epoch": 0.21, + "grad_norm": 1.4279910557608084, + "learning_rate": 1.8328787096043278e-05, + "loss": 1.2539, + "step": 4397 + }, + { + "epoch": 0.21, + "grad_norm": 1.0916459919720378, + "learning_rate": 1.8327924828124377e-05, + "loss": 1.3433, + "step": 4398 + }, + { + "epoch": 0.21, + "grad_norm": 1.3709913981084314, + "learning_rate": 1.8327062358111228e-05, + "loss": 1.1665, + "step": 4399 + }, + { + "epoch": 0.21, + "grad_norm": 1.2604488420483582, + "learning_rate": 1.8326199686024765e-05, + "loss": 1.2236, + "step": 4400 + }, + { + "epoch": 0.21, + "grad_norm": 1.1783845316970833, + "learning_rate": 1.8325336811885926e-05, + "loss": 1.1772, + "step": 4401 + }, + { + "epoch": 0.21, + "grad_norm": 1.5155437721199205, + "learning_rate": 1.8324473735715643e-05, + "loss": 1.2871, + "step": 4402 + }, + { + "epoch": 0.21, + "grad_norm": 1.2126955865557882, + "learning_rate": 1.832361045753486e-05, + "loss": 1.2124, + "step": 4403 + }, + { + "epoch": 0.21, + "grad_norm": 1.1462010516925394, + "learning_rate": 1.8322746977364537e-05, + "loss": 1.2578, + "step": 4404 + }, + { + "epoch": 0.21, + "grad_norm": 1.4348288162101668, + "learning_rate": 1.8321883295225617e-05, + "loss": 1.1934, + "step": 4405 + }, + { + "epoch": 0.21, + "grad_norm": 1.341410701654294, + "learning_rate": 1.8321019411139064e-05, + "loss": 1.3447, + "step": 4406 + }, + { + "epoch": 0.21, + "grad_norm": 1.2265451926352096, + "learning_rate": 1.8320155325125843e-05, + "loss": 1.0771, + "step": 4407 + }, + { + "epoch": 0.21, + "grad_norm": 1.2818728311861178, + "learning_rate": 1.831929103720692e-05, + "loss": 1.187, + "step": 4408 + }, + { + "epoch": 0.21, + "grad_norm": 1.053947176561706, + "learning_rate": 1.831842654740327e-05, + "loss": 1.2222, + "step": 4409 + }, + { + "epoch": 0.21, + "grad_norm": 1.1175363361114774, + "learning_rate": 1.8317561855735867e-05, + "loss": 1.2583, + "step": 4410 + }, + { + "epoch": 0.21, + "grad_norm": 1.194162504044974, + "learning_rate": 1.8316696962225704e-05, + "loss": 1.2563, + "step": 4411 + }, + { + "epoch": 0.21, + "grad_norm": 1.2124599838835748, + "learning_rate": 1.8315831866893762e-05, + "loss": 1.0986, + "step": 4412 + }, + { + "epoch": 0.21, + "grad_norm": 1.0204118523751704, + "learning_rate": 1.831496656976104e-05, + "loss": 1.2178, + "step": 4413 + }, + { + "epoch": 0.21, + "grad_norm": 1.294761487971946, + "learning_rate": 1.8314101070848527e-05, + "loss": 1.2651, + "step": 4414 + }, + { + "epoch": 0.21, + "grad_norm": 1.4593616346792158, + "learning_rate": 1.8313235370177235e-05, + "loss": 1.3716, + "step": 4415 + }, + { + "epoch": 0.21, + "grad_norm": 0.8286899143410447, + "learning_rate": 1.8312369467768168e-05, + "loss": 1.1748, + "step": 4416 + }, + { + "epoch": 0.21, + "grad_norm": 1.1133376696373642, + "learning_rate": 1.831150336364234e-05, + "loss": 1.2505, + "step": 4417 + }, + { + "epoch": 0.21, + "grad_norm": 1.1445535024330389, + "learning_rate": 1.831063705782077e-05, + "loss": 1.188, + "step": 4418 + }, + { + "epoch": 0.21, + "grad_norm": 1.3372278208720862, + "learning_rate": 1.830977055032448e-05, + "loss": 1.1475, + "step": 4419 + }, + { + "epoch": 0.21, + "grad_norm": 1.1727120318061122, + "learning_rate": 1.8308903841174493e-05, + "loss": 1.1865, + "step": 4420 + }, + { + "epoch": 0.21, + "grad_norm": 1.177387747709442, + "learning_rate": 1.8308036930391848e-05, + "loss": 1.1919, + "step": 4421 + }, + { + "epoch": 0.21, + "grad_norm": 1.1436942508698513, + "learning_rate": 1.830716981799758e-05, + "loss": 1.1729, + "step": 4422 + }, + { + "epoch": 0.21, + "grad_norm": 1.1260921876531866, + "learning_rate": 1.8306302504012732e-05, + "loss": 1.2432, + "step": 4423 + }, + { + "epoch": 0.21, + "grad_norm": 1.4202033452306044, + "learning_rate": 1.8305434988458348e-05, + "loss": 1.2954, + "step": 4424 + }, + { + "epoch": 0.21, + "grad_norm": 1.4152252637122622, + "learning_rate": 1.8304567271355482e-05, + "loss": 1.2681, + "step": 4425 + }, + { + "epoch": 0.21, + "grad_norm": 1.3364826784807082, + "learning_rate": 1.8303699352725193e-05, + "loss": 1.1973, + "step": 4426 + }, + { + "epoch": 0.21, + "grad_norm": 1.2497867957997453, + "learning_rate": 1.830283123258854e-05, + "loss": 1.2539, + "step": 4427 + }, + { + "epoch": 0.21, + "grad_norm": 1.5164324067681045, + "learning_rate": 1.8301962910966592e-05, + "loss": 1.231, + "step": 4428 + }, + { + "epoch": 0.21, + "grad_norm": 1.3714908596926016, + "learning_rate": 1.830109438788042e-05, + "loss": 1.1899, + "step": 4429 + }, + { + "epoch": 0.21, + "grad_norm": 1.156096259997161, + "learning_rate": 1.8300225663351098e-05, + "loss": 1.1167, + "step": 4430 + }, + { + "epoch": 0.21, + "grad_norm": 1.1921371159765468, + "learning_rate": 1.8299356737399707e-05, + "loss": 1.1172, + "step": 4431 + }, + { + "epoch": 0.21, + "grad_norm": 1.3109963307738512, + "learning_rate": 1.8298487610047337e-05, + "loss": 1.1978, + "step": 4432 + }, + { + "epoch": 0.21, + "grad_norm": 1.3561817855462865, + "learning_rate": 1.829761828131508e-05, + "loss": 1.2358, + "step": 4433 + }, + { + "epoch": 0.21, + "grad_norm": 1.2272859334251314, + "learning_rate": 1.829674875122403e-05, + "loss": 1.1509, + "step": 4434 + }, + { + "epoch": 0.21, + "grad_norm": 1.2237822280882102, + "learning_rate": 1.8295879019795283e-05, + "loss": 1.2363, + "step": 4435 + }, + { + "epoch": 0.21, + "grad_norm": 1.4625865557257534, + "learning_rate": 1.8295009087049954e-05, + "loss": 1.1013, + "step": 4436 + }, + { + "epoch": 0.21, + "grad_norm": 1.1068670832215481, + "learning_rate": 1.8294138953009145e-05, + "loss": 1.2744, + "step": 4437 + }, + { + "epoch": 0.21, + "grad_norm": 1.3166042983804738, + "learning_rate": 1.829326861769398e-05, + "loss": 1.312, + "step": 4438 + }, + { + "epoch": 0.21, + "grad_norm": 1.1222240968576276, + "learning_rate": 1.8292398081125572e-05, + "loss": 1.29, + "step": 4439 + }, + { + "epoch": 0.21, + "grad_norm": 1.2425263673217237, + "learning_rate": 1.8291527343325052e-05, + "loss": 1.1787, + "step": 4440 + }, + { + "epoch": 0.21, + "grad_norm": 1.2660095726734806, + "learning_rate": 1.8290656404313546e-05, + "loss": 1.2866, + "step": 4441 + }, + { + "epoch": 0.21, + "grad_norm": 1.5966594972773576, + "learning_rate": 1.8289785264112193e-05, + "loss": 1.0398, + "step": 4442 + }, + { + "epoch": 0.21, + "grad_norm": 1.3912000831615057, + "learning_rate": 1.8288913922742134e-05, + "loss": 1.3228, + "step": 4443 + }, + { + "epoch": 0.21, + "grad_norm": 1.0244953965039716, + "learning_rate": 1.8288042380224508e-05, + "loss": 1.2705, + "step": 4444 + }, + { + "epoch": 0.21, + "grad_norm": 1.1616114023164374, + "learning_rate": 1.8287170636580464e-05, + "loss": 1.1914, + "step": 4445 + }, + { + "epoch": 0.21, + "grad_norm": 1.2402760735307439, + "learning_rate": 1.8286298691831164e-05, + "loss": 1.2319, + "step": 4446 + }, + { + "epoch": 0.21, + "grad_norm": 1.3949685144614963, + "learning_rate": 1.8285426545997764e-05, + "loss": 1.3271, + "step": 4447 + }, + { + "epoch": 0.21, + "grad_norm": 1.1685433055251706, + "learning_rate": 1.828455419910143e-05, + "loss": 1.1719, + "step": 4448 + }, + { + "epoch": 0.21, + "grad_norm": 1.1237463669299028, + "learning_rate": 1.8283681651163324e-05, + "loss": 1.1997, + "step": 4449 + }, + { + "epoch": 0.21, + "grad_norm": 1.158733318569599, + "learning_rate": 1.8282808902204627e-05, + "loss": 1.124, + "step": 4450 + }, + { + "epoch": 0.21, + "grad_norm": 0.9612133330125446, + "learning_rate": 1.828193595224652e-05, + "loss": 1.1875, + "step": 4451 + }, + { + "epoch": 0.21, + "grad_norm": 1.3678092758865583, + "learning_rate": 1.828106280131018e-05, + "loss": 1.1924, + "step": 4452 + }, + { + "epoch": 0.21, + "grad_norm": 0.9879781232005463, + "learning_rate": 1.8280189449416805e-05, + "loss": 0.9697, + "step": 4453 + }, + { + "epoch": 0.21, + "grad_norm": 1.1315172097835489, + "learning_rate": 1.827931589658758e-05, + "loss": 0.947, + "step": 4454 + }, + { + "epoch": 0.21, + "grad_norm": 1.3802500703391178, + "learning_rate": 1.8278442142843703e-05, + "loss": 1.1353, + "step": 4455 + }, + { + "epoch": 0.21, + "grad_norm": 1.1964050042532848, + "learning_rate": 1.8277568188206386e-05, + "loss": 1.2593, + "step": 4456 + }, + { + "epoch": 0.21, + "grad_norm": 1.259133140742844, + "learning_rate": 1.8276694032696835e-05, + "loss": 1.2256, + "step": 4457 + }, + { + "epoch": 0.21, + "grad_norm": 1.2368374113655682, + "learning_rate": 1.8275819676336256e-05, + "loss": 1.2285, + "step": 4458 + }, + { + "epoch": 0.21, + "grad_norm": 1.216916536156771, + "learning_rate": 1.827494511914587e-05, + "loss": 1.1689, + "step": 4459 + }, + { + "epoch": 0.21, + "grad_norm": 1.4479300444283665, + "learning_rate": 1.8274070361146906e-05, + "loss": 1.1235, + "step": 4460 + }, + { + "epoch": 0.21, + "grad_norm": 1.2965212773819812, + "learning_rate": 1.8273195402360585e-05, + "loss": 1.2441, + "step": 4461 + }, + { + "epoch": 0.21, + "grad_norm": 1.2601135581933998, + "learning_rate": 1.8272320242808143e-05, + "loss": 1.231, + "step": 4462 + }, + { + "epoch": 0.21, + "grad_norm": 1.145110579727934, + "learning_rate": 1.827144488251082e-05, + "loss": 1.0388, + "step": 4463 + }, + { + "epoch": 0.21, + "grad_norm": 1.2187248381767366, + "learning_rate": 1.827056932148985e-05, + "loss": 1.1743, + "step": 4464 + }, + { + "epoch": 0.21, + "grad_norm": 1.1462336378133944, + "learning_rate": 1.8269693559766487e-05, + "loss": 1.1704, + "step": 4465 + }, + { + "epoch": 0.21, + "grad_norm": 1.3039185887070022, + "learning_rate": 1.8268817597361983e-05, + "loss": 1.2729, + "step": 4466 + }, + { + "epoch": 0.21, + "grad_norm": 1.32158738655301, + "learning_rate": 1.8267941434297594e-05, + "loss": 1.1772, + "step": 4467 + }, + { + "epoch": 0.21, + "grad_norm": 1.3844879297362085, + "learning_rate": 1.826706507059458e-05, + "loss": 1.1792, + "step": 4468 + }, + { + "epoch": 0.21, + "grad_norm": 1.198188707613644, + "learning_rate": 1.826618850627421e-05, + "loss": 1.2822, + "step": 4469 + }, + { + "epoch": 0.21, + "grad_norm": 1.3114276053440401, + "learning_rate": 1.8265311741357753e-05, + "loss": 1.1748, + "step": 4470 + }, + { + "epoch": 0.22, + "grad_norm": 0.9250861616109476, + "learning_rate": 1.826443477586649e-05, + "loss": 1.1382, + "step": 4471 + }, + { + "epoch": 0.22, + "grad_norm": 1.2842128501267753, + "learning_rate": 1.82635576098217e-05, + "loss": 1.1343, + "step": 4472 + }, + { + "epoch": 0.22, + "grad_norm": 1.3135278098643988, + "learning_rate": 1.826268024324467e-05, + "loss": 1.2627, + "step": 4473 + }, + { + "epoch": 0.22, + "grad_norm": 1.2634191162276973, + "learning_rate": 1.8261802676156685e-05, + "loss": 1.3154, + "step": 4474 + }, + { + "epoch": 0.22, + "grad_norm": 1.4024666984830052, + "learning_rate": 1.826092490857905e-05, + "loss": 1.1704, + "step": 4475 + }, + { + "epoch": 0.22, + "grad_norm": 1.1964876183556148, + "learning_rate": 1.8260046940533063e-05, + "loss": 1.2637, + "step": 4476 + }, + { + "epoch": 0.22, + "grad_norm": 1.5453843746772333, + "learning_rate": 1.8259168772040027e-05, + "loss": 1.1426, + "step": 4477 + }, + { + "epoch": 0.22, + "grad_norm": 1.8381837480923824, + "learning_rate": 1.8258290403121252e-05, + "loss": 1.4521, + "step": 4478 + }, + { + "epoch": 0.22, + "grad_norm": 1.095843465479473, + "learning_rate": 1.825741183379806e-05, + "loss": 1.2646, + "step": 4479 + }, + { + "epoch": 0.22, + "grad_norm": 1.4047802635390465, + "learning_rate": 1.8256533064091765e-05, + "loss": 1.2231, + "step": 4480 + }, + { + "epoch": 0.22, + "grad_norm": 1.1197960187250817, + "learning_rate": 1.8255654094023692e-05, + "loss": 1.0601, + "step": 4481 + }, + { + "epoch": 0.22, + "grad_norm": 1.555085623641309, + "learning_rate": 1.8254774923615177e-05, + "loss": 1.4136, + "step": 4482 + }, + { + "epoch": 0.22, + "grad_norm": 1.1976674896456259, + "learning_rate": 1.8253895552887547e-05, + "loss": 1.2017, + "step": 4483 + }, + { + "epoch": 0.22, + "grad_norm": 0.9212330799176107, + "learning_rate": 1.825301598186215e-05, + "loss": 1.269, + "step": 4484 + }, + { + "epoch": 0.22, + "grad_norm": 1.288832936900788, + "learning_rate": 1.825213621056032e-05, + "loss": 1.0784, + "step": 4485 + }, + { + "epoch": 0.22, + "grad_norm": 1.1812441029269136, + "learning_rate": 1.825125623900342e-05, + "loss": 1.2695, + "step": 4486 + }, + { + "epoch": 0.22, + "grad_norm": 1.22825994489677, + "learning_rate": 1.8250376067212794e-05, + "loss": 1.2065, + "step": 4487 + }, + { + "epoch": 0.22, + "grad_norm": 1.2918690996063715, + "learning_rate": 1.8249495695209805e-05, + "loss": 1.189, + "step": 4488 + }, + { + "epoch": 0.22, + "grad_norm": 1.1649445923779318, + "learning_rate": 1.8248615123015816e-05, + "loss": 1.2124, + "step": 4489 + }, + { + "epoch": 0.22, + "grad_norm": 1.6552548310659787, + "learning_rate": 1.8247734350652197e-05, + "loss": 1.3071, + "step": 4490 + }, + { + "epoch": 0.22, + "grad_norm": 0.9589697721067002, + "learning_rate": 1.824685337814032e-05, + "loss": 1.2197, + "step": 4491 + }, + { + "epoch": 0.22, + "grad_norm": 1.4550110427528729, + "learning_rate": 1.8245972205501565e-05, + "loss": 1.1973, + "step": 4492 + }, + { + "epoch": 0.22, + "grad_norm": 1.2470638515193744, + "learning_rate": 1.8245090832757317e-05, + "loss": 1.2144, + "step": 4493 + }, + { + "epoch": 0.22, + "grad_norm": 1.1176478343002583, + "learning_rate": 1.824420925992896e-05, + "loss": 1.123, + "step": 4494 + }, + { + "epoch": 0.22, + "grad_norm": 1.1761169999359093, + "learning_rate": 1.824332748703789e-05, + "loss": 1.2896, + "step": 4495 + }, + { + "epoch": 0.22, + "grad_norm": 1.121247774541991, + "learning_rate": 1.8242445514105505e-05, + "loss": 1.0864, + "step": 4496 + }, + { + "epoch": 0.22, + "grad_norm": 1.1827644944555011, + "learning_rate": 1.8241563341153203e-05, + "loss": 1.1968, + "step": 4497 + }, + { + "epoch": 0.22, + "grad_norm": 1.1911362907737977, + "learning_rate": 1.82406809682024e-05, + "loss": 1.3081, + "step": 4498 + }, + { + "epoch": 0.22, + "grad_norm": 1.1524727482387132, + "learning_rate": 1.8239798395274507e-05, + "loss": 1.2407, + "step": 4499 + }, + { + "epoch": 0.22, + "grad_norm": 1.3638494421798013, + "learning_rate": 1.823891562239094e-05, + "loss": 1.1768, + "step": 4500 + }, + { + "epoch": 0.22, + "grad_norm": 1.2868339439180396, + "learning_rate": 1.8238032649573116e-05, + "loss": 1.1113, + "step": 4501 + }, + { + "epoch": 0.22, + "grad_norm": 0.9042157059120498, + "learning_rate": 1.8237149476842472e-05, + "loss": 1.2207, + "step": 4502 + }, + { + "epoch": 0.22, + "grad_norm": 1.4403237455833862, + "learning_rate": 1.8236266104220432e-05, + "loss": 1.2915, + "step": 4503 + }, + { + "epoch": 0.22, + "grad_norm": 1.1798528507888026, + "learning_rate": 1.8235382531728435e-05, + "loss": 1.271, + "step": 4504 + }, + { + "epoch": 0.22, + "grad_norm": 1.4238698437276276, + "learning_rate": 1.8234498759387925e-05, + "loss": 1.1646, + "step": 4505 + }, + { + "epoch": 0.22, + "grad_norm": 1.2189178542394283, + "learning_rate": 1.8233614787220345e-05, + "loss": 1.0879, + "step": 4506 + }, + { + "epoch": 0.22, + "grad_norm": 1.2342390470652775, + "learning_rate": 1.8232730615247146e-05, + "loss": 1.3276, + "step": 4507 + }, + { + "epoch": 0.22, + "grad_norm": 0.9358752241332112, + "learning_rate": 1.823184624348979e-05, + "loss": 1.1597, + "step": 4508 + }, + { + "epoch": 0.22, + "grad_norm": 1.5266393128380753, + "learning_rate": 1.8230961671969735e-05, + "loss": 1.2549, + "step": 4509 + }, + { + "epoch": 0.22, + "grad_norm": 1.207426722133219, + "learning_rate": 1.8230076900708447e-05, + "loss": 1.1831, + "step": 4510 + }, + { + "epoch": 0.22, + "grad_norm": 1.2005922361277106, + "learning_rate": 1.8229191929727395e-05, + "loss": 1.1636, + "step": 4511 + }, + { + "epoch": 0.22, + "grad_norm": 1.1681189171275623, + "learning_rate": 1.8228306759048057e-05, + "loss": 1.1938, + "step": 4512 + }, + { + "epoch": 0.22, + "grad_norm": 1.145374224064277, + "learning_rate": 1.8227421388691912e-05, + "loss": 1.1509, + "step": 4513 + }, + { + "epoch": 0.22, + "grad_norm": 1.4923910564979537, + "learning_rate": 1.8226535818680445e-05, + "loss": 1.3838, + "step": 4514 + }, + { + "epoch": 0.22, + "grad_norm": 1.2508198993250346, + "learning_rate": 1.8225650049035148e-05, + "loss": 1.1943, + "step": 4515 + }, + { + "epoch": 0.22, + "grad_norm": 1.2538243019994604, + "learning_rate": 1.8224764079777514e-05, + "loss": 1.1777, + "step": 4516 + }, + { + "epoch": 0.22, + "grad_norm": 1.3167587067297597, + "learning_rate": 1.8223877910929048e-05, + "loss": 1.3594, + "step": 4517 + }, + { + "epoch": 0.22, + "grad_norm": 1.3193928811673092, + "learning_rate": 1.8222991542511247e-05, + "loss": 1.0942, + "step": 4518 + }, + { + "epoch": 0.22, + "grad_norm": 1.2651289216691577, + "learning_rate": 1.8222104974545623e-05, + "loss": 1.1689, + "step": 4519 + }, + { + "epoch": 0.22, + "grad_norm": 1.2098304978889205, + "learning_rate": 1.8221218207053694e-05, + "loss": 1.3418, + "step": 4520 + }, + { + "epoch": 0.22, + "grad_norm": 1.0102883189265233, + "learning_rate": 1.8220331240056974e-05, + "loss": 1.0869, + "step": 4521 + }, + { + "epoch": 0.22, + "grad_norm": 1.2367135995282215, + "learning_rate": 1.8219444073576993e-05, + "loss": 1.2637, + "step": 4522 + }, + { + "epoch": 0.22, + "grad_norm": 1.2091978162114032, + "learning_rate": 1.8218556707635277e-05, + "loss": 1.3296, + "step": 4523 + }, + { + "epoch": 0.22, + "grad_norm": 1.3074445822917833, + "learning_rate": 1.8217669142253358e-05, + "loss": 1.3042, + "step": 4524 + }, + { + "epoch": 0.22, + "grad_norm": 0.9470282554386227, + "learning_rate": 1.8216781377452775e-05, + "loss": 1.2642, + "step": 4525 + }, + { + "epoch": 0.22, + "grad_norm": 1.1356587202134663, + "learning_rate": 1.8215893413255073e-05, + "loss": 1.1387, + "step": 4526 + }, + { + "epoch": 0.22, + "grad_norm": 1.2435357012454047, + "learning_rate": 1.8215005249681804e-05, + "loss": 1.1982, + "step": 4527 + }, + { + "epoch": 0.22, + "grad_norm": 0.9305433027310314, + "learning_rate": 1.8214116886754513e-05, + "loss": 1.2915, + "step": 4528 + }, + { + "epoch": 0.22, + "grad_norm": 1.2849112769256064, + "learning_rate": 1.8213228324494765e-05, + "loss": 1.1074, + "step": 4529 + }, + { + "epoch": 0.22, + "grad_norm": 1.1825432861288747, + "learning_rate": 1.821233956292412e-05, + "loss": 1.1133, + "step": 4530 + }, + { + "epoch": 0.22, + "grad_norm": 1.3555059100317235, + "learning_rate": 1.821145060206414e-05, + "loss": 1.2925, + "step": 4531 + }, + { + "epoch": 0.22, + "grad_norm": 1.154147526166616, + "learning_rate": 1.8210561441936406e-05, + "loss": 0.9744, + "step": 4532 + }, + { + "epoch": 0.22, + "grad_norm": 1.4893802260523012, + "learning_rate": 1.8209672082562496e-05, + "loss": 1.3159, + "step": 4533 + }, + { + "epoch": 0.22, + "grad_norm": 1.1768601753751666, + "learning_rate": 1.8208782523963985e-05, + "loss": 1.0874, + "step": 4534 + }, + { + "epoch": 0.22, + "grad_norm": 1.283298926868818, + "learning_rate": 1.8207892766162463e-05, + "loss": 1.1973, + "step": 4535 + }, + { + "epoch": 0.22, + "grad_norm": 1.1767595510160582, + "learning_rate": 1.820700280917952e-05, + "loss": 1.1729, + "step": 4536 + }, + { + "epoch": 0.22, + "grad_norm": 1.5455150919020944, + "learning_rate": 1.820611265303676e-05, + "loss": 1.2715, + "step": 4537 + }, + { + "epoch": 0.22, + "grad_norm": 1.19630592465284, + "learning_rate": 1.8205222297755774e-05, + "loss": 1.1665, + "step": 4538 + }, + { + "epoch": 0.22, + "grad_norm": 1.5881166850367703, + "learning_rate": 1.820433174335818e-05, + "loss": 1.2129, + "step": 4539 + }, + { + "epoch": 0.22, + "grad_norm": 1.2169017990956086, + "learning_rate": 1.8203440989865577e-05, + "loss": 1.2842, + "step": 4540 + }, + { + "epoch": 0.22, + "grad_norm": 1.3138389524247547, + "learning_rate": 1.8202550037299588e-05, + "loss": 1.1362, + "step": 4541 + }, + { + "epoch": 0.22, + "grad_norm": 1.4023005408257163, + "learning_rate": 1.820165888568183e-05, + "loss": 1.189, + "step": 4542 + }, + { + "epoch": 0.22, + "grad_norm": 1.2829723945837808, + "learning_rate": 1.8200767535033938e-05, + "loss": 1.1262, + "step": 4543 + }, + { + "epoch": 0.22, + "grad_norm": 1.257113965490729, + "learning_rate": 1.819987598537753e-05, + "loss": 1.1133, + "step": 4544 + }, + { + "epoch": 0.22, + "grad_norm": 1.3105959400257408, + "learning_rate": 1.8198984236734246e-05, + "loss": 1.2603, + "step": 4545 + }, + { + "epoch": 0.22, + "grad_norm": 1.3689214589749619, + "learning_rate": 1.819809228912573e-05, + "loss": 1.1714, + "step": 4546 + }, + { + "epoch": 0.22, + "grad_norm": 1.1067254396759134, + "learning_rate": 1.8197200142573625e-05, + "loss": 1.1636, + "step": 4547 + }, + { + "epoch": 0.22, + "grad_norm": 1.0566380197437628, + "learning_rate": 1.819630779709958e-05, + "loss": 1.3125, + "step": 4548 + }, + { + "epoch": 0.22, + "grad_norm": 1.2223831176440698, + "learning_rate": 1.8195415252725242e-05, + "loss": 1.2397, + "step": 4549 + }, + { + "epoch": 0.22, + "grad_norm": 1.3573172171336447, + "learning_rate": 1.8194522509472283e-05, + "loss": 1.1211, + "step": 4550 + }, + { + "epoch": 0.22, + "grad_norm": 1.0505272535023944, + "learning_rate": 1.819362956736236e-05, + "loss": 1.146, + "step": 4551 + }, + { + "epoch": 0.22, + "grad_norm": 1.240758795350619, + "learning_rate": 1.8192736426417146e-05, + "loss": 1.2788, + "step": 4552 + }, + { + "epoch": 0.22, + "grad_norm": 1.1781434626497482, + "learning_rate": 1.8191843086658313e-05, + "loss": 1.166, + "step": 4553 + }, + { + "epoch": 0.22, + "grad_norm": 1.1669323503573172, + "learning_rate": 1.819094954810754e-05, + "loss": 1.2974, + "step": 4554 + }, + { + "epoch": 0.22, + "grad_norm": 1.4065541957388346, + "learning_rate": 1.8190055810786507e-05, + "loss": 1.231, + "step": 4555 + }, + { + "epoch": 0.22, + "grad_norm": 1.5062359846487385, + "learning_rate": 1.818916187471691e-05, + "loss": 1.2437, + "step": 4556 + }, + { + "epoch": 0.22, + "grad_norm": 1.2409215413689512, + "learning_rate": 1.8188267739920433e-05, + "loss": 1.2661, + "step": 4557 + }, + { + "epoch": 0.22, + "grad_norm": 1.447612927903066, + "learning_rate": 1.8187373406418782e-05, + "loss": 1.1636, + "step": 4558 + }, + { + "epoch": 0.22, + "grad_norm": 1.1464625403950617, + "learning_rate": 1.8186478874233655e-05, + "loss": 1.1262, + "step": 4559 + }, + { + "epoch": 0.22, + "grad_norm": 1.254938830341179, + "learning_rate": 1.8185584143386764e-05, + "loss": 1.27, + "step": 4560 + }, + { + "epoch": 0.22, + "grad_norm": 1.2202923558945054, + "learning_rate": 1.8184689213899816e-05, + "loss": 1.1104, + "step": 4561 + }, + { + "epoch": 0.22, + "grad_norm": 1.381893860733172, + "learning_rate": 1.818379408579453e-05, + "loss": 1.2466, + "step": 4562 + }, + { + "epoch": 0.22, + "grad_norm": 1.1773444391803471, + "learning_rate": 1.818289875909263e-05, + "loss": 0.9851, + "step": 4563 + }, + { + "epoch": 0.22, + "grad_norm": 1.3159040180146608, + "learning_rate": 1.8182003233815847e-05, + "loss": 1.2212, + "step": 4564 + }, + { + "epoch": 0.22, + "grad_norm": 1.236711597280253, + "learning_rate": 1.8181107509985903e-05, + "loss": 1.2227, + "step": 4565 + }, + { + "epoch": 0.22, + "grad_norm": 1.2043327331817868, + "learning_rate": 1.8180211587624543e-05, + "loss": 1.1919, + "step": 4566 + }, + { + "epoch": 0.22, + "grad_norm": 1.4056949238101222, + "learning_rate": 1.81793154667535e-05, + "loss": 1.3003, + "step": 4567 + }, + { + "epoch": 0.22, + "grad_norm": 0.8639074970296128, + "learning_rate": 1.8178419147394528e-05, + "loss": 1.1616, + "step": 4568 + }, + { + "epoch": 0.22, + "grad_norm": 0.8593021626952934, + "learning_rate": 1.8177522629569375e-05, + "loss": 1.1929, + "step": 4569 + }, + { + "epoch": 0.22, + "grad_norm": 1.3233087136727573, + "learning_rate": 1.8176625913299797e-05, + "loss": 1.1475, + "step": 4570 + }, + { + "epoch": 0.22, + "grad_norm": 1.3104274036697134, + "learning_rate": 1.8175728998607555e-05, + "loss": 1.2021, + "step": 4571 + }, + { + "epoch": 0.22, + "grad_norm": 1.3861350468006663, + "learning_rate": 1.8174831885514415e-05, + "loss": 1.1992, + "step": 4572 + }, + { + "epoch": 0.22, + "grad_norm": 1.0633115593324878, + "learning_rate": 1.817393457404214e-05, + "loss": 1.0625, + "step": 4573 + }, + { + "epoch": 0.22, + "grad_norm": 1.433954381429989, + "learning_rate": 1.8173037064212522e-05, + "loss": 1.2695, + "step": 4574 + }, + { + "epoch": 0.22, + "grad_norm": 1.1343511597634355, + "learning_rate": 1.8172139356047323e-05, + "loss": 1.1953, + "step": 4575 + }, + { + "epoch": 0.22, + "grad_norm": 1.163750207033128, + "learning_rate": 1.817124144956834e-05, + "loss": 1.1265, + "step": 4576 + }, + { + "epoch": 0.22, + "grad_norm": 1.1815829047351063, + "learning_rate": 1.8170343344797354e-05, + "loss": 1.2119, + "step": 4577 + }, + { + "epoch": 0.22, + "grad_norm": 1.131773265841813, + "learning_rate": 1.8169445041756165e-05, + "loss": 1.2339, + "step": 4578 + }, + { + "epoch": 0.22, + "grad_norm": 1.252572049031637, + "learning_rate": 1.8168546540466567e-05, + "loss": 1.1929, + "step": 4579 + }, + { + "epoch": 0.22, + "grad_norm": 1.4860152201521228, + "learning_rate": 1.816764784095037e-05, + "loss": 1.1296, + "step": 4580 + }, + { + "epoch": 0.22, + "grad_norm": 1.2703307768621983, + "learning_rate": 1.816674894322938e-05, + "loss": 1.2686, + "step": 4581 + }, + { + "epoch": 0.22, + "grad_norm": 0.9938050614995264, + "learning_rate": 1.8165849847325413e-05, + "loss": 1.1475, + "step": 4582 + }, + { + "epoch": 0.22, + "grad_norm": 1.2557570803082503, + "learning_rate": 1.816495055326028e-05, + "loss": 1.1729, + "step": 4583 + }, + { + "epoch": 0.22, + "grad_norm": 1.163541368273416, + "learning_rate": 1.8164051061055812e-05, + "loss": 1.1826, + "step": 4584 + }, + { + "epoch": 0.22, + "grad_norm": 1.2462238711394678, + "learning_rate": 1.8163151370733838e-05, + "loss": 1.0959, + "step": 4585 + }, + { + "epoch": 0.22, + "grad_norm": 1.0155756247208563, + "learning_rate": 1.8162251482316186e-05, + "loss": 1.2344, + "step": 4586 + }, + { + "epoch": 0.22, + "grad_norm": 1.17158040399833, + "learning_rate": 1.8161351395824688e-05, + "loss": 1.0554, + "step": 4587 + }, + { + "epoch": 0.22, + "grad_norm": 1.3400881649873202, + "learning_rate": 1.8160451111281202e-05, + "loss": 1.1655, + "step": 4588 + }, + { + "epoch": 0.22, + "grad_norm": 1.4088595037933995, + "learning_rate": 1.815955062870756e-05, + "loss": 1.2529, + "step": 4589 + }, + { + "epoch": 0.22, + "grad_norm": 1.0331386747422902, + "learning_rate": 1.815864994812562e-05, + "loss": 1.2554, + "step": 4590 + }, + { + "epoch": 0.22, + "grad_norm": 1.2915819402640634, + "learning_rate": 1.8157749069557246e-05, + "loss": 1.2915, + "step": 4591 + }, + { + "epoch": 0.22, + "grad_norm": 1.2295079151815633, + "learning_rate": 1.815684799302429e-05, + "loss": 1.2319, + "step": 4592 + }, + { + "epoch": 0.22, + "grad_norm": 1.450097118179294, + "learning_rate": 1.815594671854862e-05, + "loss": 1.2778, + "step": 4593 + }, + { + "epoch": 0.22, + "grad_norm": 1.6400034541951787, + "learning_rate": 1.8155045246152113e-05, + "loss": 1.3809, + "step": 4594 + }, + { + "epoch": 0.22, + "grad_norm": 1.2738217663270686, + "learning_rate": 1.8154143575856634e-05, + "loss": 1.1165, + "step": 4595 + }, + { + "epoch": 0.22, + "grad_norm": 1.2388055532968931, + "learning_rate": 1.8153241707684077e-05, + "loss": 1.2412, + "step": 4596 + }, + { + "epoch": 0.22, + "grad_norm": 1.3627463398397146, + "learning_rate": 1.815233964165632e-05, + "loss": 1.1597, + "step": 4597 + }, + { + "epoch": 0.22, + "grad_norm": 1.6635038564195472, + "learning_rate": 1.8151437377795256e-05, + "loss": 1.3872, + "step": 4598 + }, + { + "epoch": 0.22, + "grad_norm": 1.4666642456214967, + "learning_rate": 1.8150534916122777e-05, + "loss": 1.1501, + "step": 4599 + }, + { + "epoch": 0.22, + "grad_norm": 1.1958517283179682, + "learning_rate": 1.814963225666079e-05, + "loss": 1.1245, + "step": 4600 + }, + { + "epoch": 0.22, + "grad_norm": 1.1377236677950153, + "learning_rate": 1.8148729399431187e-05, + "loss": 1.2949, + "step": 4601 + }, + { + "epoch": 0.22, + "grad_norm": 1.1023934242931985, + "learning_rate": 1.8147826344455893e-05, + "loss": 1.1028, + "step": 4602 + }, + { + "epoch": 0.22, + "grad_norm": 1.3928060635674817, + "learning_rate": 1.8146923091756813e-05, + "loss": 1.2632, + "step": 4603 + }, + { + "epoch": 0.22, + "grad_norm": 1.629824777921582, + "learning_rate": 1.814601964135587e-05, + "loss": 1.2871, + "step": 4604 + }, + { + "epoch": 0.22, + "grad_norm": 1.2174438834394472, + "learning_rate": 1.8145115993274986e-05, + "loss": 1.2236, + "step": 4605 + }, + { + "epoch": 0.22, + "grad_norm": 1.338686371469044, + "learning_rate": 1.814421214753609e-05, + "loss": 1.3105, + "step": 4606 + }, + { + "epoch": 0.22, + "grad_norm": 1.3490404595455674, + "learning_rate": 1.8143308104161117e-05, + "loss": 1.1128, + "step": 4607 + }, + { + "epoch": 0.22, + "grad_norm": 1.0294270664527179, + "learning_rate": 1.8142403863172007e-05, + "loss": 1.1577, + "step": 4608 + }, + { + "epoch": 0.22, + "grad_norm": 1.1953509373105422, + "learning_rate": 1.81414994245907e-05, + "loss": 1.2612, + "step": 4609 + }, + { + "epoch": 0.22, + "grad_norm": 1.4910919606664206, + "learning_rate": 1.8140594788439142e-05, + "loss": 1.3237, + "step": 4610 + }, + { + "epoch": 0.22, + "grad_norm": 1.2718684865838354, + "learning_rate": 1.8139689954739294e-05, + "loss": 1.1841, + "step": 4611 + }, + { + "epoch": 0.22, + "grad_norm": 1.166179800983941, + "learning_rate": 1.8138784923513107e-05, + "loss": 1.1611, + "step": 4612 + }, + { + "epoch": 0.22, + "grad_norm": 1.2310101381141216, + "learning_rate": 1.8137879694782543e-05, + "loss": 1.2954, + "step": 4613 + }, + { + "epoch": 0.22, + "grad_norm": 1.0405853906149551, + "learning_rate": 1.8136974268569575e-05, + "loss": 1.1548, + "step": 4614 + }, + { + "epoch": 0.22, + "grad_norm": 1.1212238706301387, + "learning_rate": 1.813606864489617e-05, + "loss": 1.2681, + "step": 4615 + }, + { + "epoch": 0.22, + "grad_norm": 1.3154975022881827, + "learning_rate": 1.8135162823784303e-05, + "loss": 1.2627, + "step": 4616 + }, + { + "epoch": 0.22, + "grad_norm": 1.2191234161168363, + "learning_rate": 1.8134256805255964e-05, + "loss": 1.0776, + "step": 4617 + }, + { + "epoch": 0.22, + "grad_norm": 1.0208052385445114, + "learning_rate": 1.8133350589333136e-05, + "loss": 1.1079, + "step": 4618 + }, + { + "epoch": 0.22, + "grad_norm": 1.7370577678234242, + "learning_rate": 1.8132444176037803e-05, + "loss": 1.3447, + "step": 4619 + }, + { + "epoch": 0.22, + "grad_norm": 1.493415811044422, + "learning_rate": 1.8131537565391967e-05, + "loss": 1.2651, + "step": 4620 + }, + { + "epoch": 0.22, + "grad_norm": 1.1911851356491736, + "learning_rate": 1.8130630757417628e-05, + "loss": 1.1416, + "step": 4621 + }, + { + "epoch": 0.22, + "grad_norm": 1.2023250058361632, + "learning_rate": 1.8129723752136797e-05, + "loss": 1.2549, + "step": 4622 + }, + { + "epoch": 0.22, + "grad_norm": 1.3678320258723224, + "learning_rate": 1.8128816549571472e-05, + "loss": 1.1433, + "step": 4623 + }, + { + "epoch": 0.22, + "grad_norm": 0.9700521748477612, + "learning_rate": 1.812790914974368e-05, + "loss": 1.1104, + "step": 4624 + }, + { + "epoch": 0.22, + "grad_norm": 1.4602357639415833, + "learning_rate": 1.8127001552675436e-05, + "loss": 1.2271, + "step": 4625 + }, + { + "epoch": 0.22, + "grad_norm": 1.0263956005285435, + "learning_rate": 1.8126093758388764e-05, + "loss": 1.1777, + "step": 4626 + }, + { + "epoch": 0.22, + "grad_norm": 1.1572182002603058, + "learning_rate": 1.8125185766905697e-05, + "loss": 1.1597, + "step": 4627 + }, + { + "epoch": 0.22, + "grad_norm": 1.5627797461937931, + "learning_rate": 1.812427757824826e-05, + "loss": 1.291, + "step": 4628 + }, + { + "epoch": 0.22, + "grad_norm": 1.5197939867251296, + "learning_rate": 1.8123369192438508e-05, + "loss": 1.3496, + "step": 4629 + }, + { + "epoch": 0.22, + "grad_norm": 1.3386229114356023, + "learning_rate": 1.812246060949847e-05, + "loss": 1.2944, + "step": 4630 + }, + { + "epoch": 0.22, + "grad_norm": 1.1531589280809833, + "learning_rate": 1.81215518294502e-05, + "loss": 1.0706, + "step": 4631 + }, + { + "epoch": 0.22, + "grad_norm": 1.4108602737109173, + "learning_rate": 1.812064285231575e-05, + "loss": 1.334, + "step": 4632 + }, + { + "epoch": 0.22, + "grad_norm": 1.6144806203569138, + "learning_rate": 1.8119733678117185e-05, + "loss": 1.27, + "step": 4633 + }, + { + "epoch": 0.22, + "grad_norm": 1.0268770566122665, + "learning_rate": 1.811882430687656e-05, + "loss": 1.0422, + "step": 4634 + }, + { + "epoch": 0.22, + "grad_norm": 1.142928670678361, + "learning_rate": 1.811791473861595e-05, + "loss": 1.0596, + "step": 4635 + }, + { + "epoch": 0.22, + "grad_norm": 1.3579603648545955, + "learning_rate": 1.811700497335742e-05, + "loss": 1.2012, + "step": 4636 + }, + { + "epoch": 0.22, + "grad_norm": 1.1758436186801722, + "learning_rate": 1.811609501112305e-05, + "loss": 1.2964, + "step": 4637 + }, + { + "epoch": 0.22, + "grad_norm": 1.2541432638474839, + "learning_rate": 1.8115184851934922e-05, + "loss": 1.2681, + "step": 4638 + }, + { + "epoch": 0.22, + "grad_norm": 1.1100636068886438, + "learning_rate": 1.8114274495815123e-05, + "loss": 1.1328, + "step": 4639 + }, + { + "epoch": 0.22, + "grad_norm": 1.2897330910440368, + "learning_rate": 1.8113363942785747e-05, + "loss": 0.9832, + "step": 4640 + }, + { + "epoch": 0.22, + "grad_norm": 1.1791665593102456, + "learning_rate": 1.811245319286889e-05, + "loss": 1.2139, + "step": 4641 + }, + { + "epoch": 0.22, + "grad_norm": 1.1231716784049621, + "learning_rate": 1.811154224608665e-05, + "loss": 1.293, + "step": 4642 + }, + { + "epoch": 0.22, + "grad_norm": 1.5772101956715967, + "learning_rate": 1.8110631102461134e-05, + "loss": 1.2188, + "step": 4643 + }, + { + "epoch": 0.22, + "grad_norm": 1.678270196969465, + "learning_rate": 1.8109719762014454e-05, + "loss": 1.3218, + "step": 4644 + }, + { + "epoch": 0.22, + "grad_norm": 1.380857222522095, + "learning_rate": 1.8108808224768724e-05, + "loss": 1.2241, + "step": 4645 + }, + { + "epoch": 0.22, + "grad_norm": 1.331295903625861, + "learning_rate": 1.8107896490746067e-05, + "loss": 1.3525, + "step": 4646 + }, + { + "epoch": 0.22, + "grad_norm": 1.2217658418801198, + "learning_rate": 1.810698455996861e-05, + "loss": 1.1104, + "step": 4647 + }, + { + "epoch": 0.22, + "grad_norm": 0.9478161645158517, + "learning_rate": 1.8106072432458478e-05, + "loss": 1.3071, + "step": 4648 + }, + { + "epoch": 0.22, + "grad_norm": 1.2996884044059291, + "learning_rate": 1.8105160108237805e-05, + "loss": 1.2095, + "step": 4649 + }, + { + "epoch": 0.22, + "grad_norm": 1.3422511021133121, + "learning_rate": 1.8104247587328733e-05, + "loss": 1.2427, + "step": 4650 + }, + { + "epoch": 0.22, + "grad_norm": 1.1943585443760827, + "learning_rate": 1.8103334869753406e-05, + "loss": 1.2417, + "step": 4651 + }, + { + "epoch": 0.22, + "grad_norm": 1.500404145338423, + "learning_rate": 1.8102421955533974e-05, + "loss": 1.1416, + "step": 4652 + }, + { + "epoch": 0.22, + "grad_norm": 1.2650365621977697, + "learning_rate": 1.8101508844692586e-05, + "loss": 1.3857, + "step": 4653 + }, + { + "epoch": 0.22, + "grad_norm": 0.9260314947208633, + "learning_rate": 1.810059553725141e-05, + "loss": 1.1357, + "step": 4654 + }, + { + "epoch": 0.22, + "grad_norm": 0.8374162154043597, + "learning_rate": 1.80996820332326e-05, + "loss": 1.1787, + "step": 4655 + }, + { + "epoch": 0.22, + "grad_norm": 1.352659063077424, + "learning_rate": 1.8098768332658325e-05, + "loss": 1.272, + "step": 4656 + }, + { + "epoch": 0.22, + "grad_norm": 0.9113014214280893, + "learning_rate": 1.809785443555076e-05, + "loss": 1.0488, + "step": 4657 + }, + { + "epoch": 0.22, + "grad_norm": 1.27989336755472, + "learning_rate": 1.809694034193209e-05, + "loss": 1.165, + "step": 4658 + }, + { + "epoch": 0.22, + "grad_norm": 1.1974905411158188, + "learning_rate": 1.8096026051824483e-05, + "loss": 1.1865, + "step": 4659 + }, + { + "epoch": 0.22, + "grad_norm": 1.1429835704058984, + "learning_rate": 1.8095111565250137e-05, + "loss": 1.145, + "step": 4660 + }, + { + "epoch": 0.22, + "grad_norm": 1.1218077237322053, + "learning_rate": 1.8094196882231235e-05, + "loss": 0.9109, + "step": 4661 + }, + { + "epoch": 0.22, + "grad_norm": 1.1574367209299006, + "learning_rate": 1.8093282002789983e-05, + "loss": 1.1396, + "step": 4662 + }, + { + "epoch": 0.22, + "grad_norm": 1.3280984293424698, + "learning_rate": 1.8092366926948578e-05, + "loss": 1.2305, + "step": 4663 + }, + { + "epoch": 0.22, + "grad_norm": 1.2223373518221028, + "learning_rate": 1.8091451654729225e-05, + "loss": 1.1841, + "step": 4664 + }, + { + "epoch": 0.22, + "grad_norm": 1.1678002358034585, + "learning_rate": 1.8090536186154143e-05, + "loss": 1.1987, + "step": 4665 + }, + { + "epoch": 0.22, + "grad_norm": 1.2867389150206026, + "learning_rate": 1.8089620521245534e-05, + "loss": 1.1099, + "step": 4666 + }, + { + "epoch": 0.22, + "grad_norm": 1.2484091593722442, + "learning_rate": 1.8088704660025626e-05, + "loss": 1.4219, + "step": 4667 + }, + { + "epoch": 0.22, + "grad_norm": 1.3487127972814297, + "learning_rate": 1.8087788602516643e-05, + "loss": 1.249, + "step": 4668 + }, + { + "epoch": 0.22, + "grad_norm": 1.312864461818492, + "learning_rate": 1.808687234874082e-05, + "loss": 1.3477, + "step": 4669 + }, + { + "epoch": 0.22, + "grad_norm": 1.7293317329192608, + "learning_rate": 1.8085955898720388e-05, + "loss": 1.1082, + "step": 4670 + }, + { + "epoch": 0.22, + "grad_norm": 1.6595396477992443, + "learning_rate": 1.8085039252477584e-05, + "loss": 1.2505, + "step": 4671 + }, + { + "epoch": 0.22, + "grad_norm": 1.293217248766018, + "learning_rate": 1.8084122410034655e-05, + "loss": 1.2881, + "step": 4672 + }, + { + "epoch": 0.22, + "grad_norm": 1.3450871286105295, + "learning_rate": 1.808320537141385e-05, + "loss": 1.3066, + "step": 4673 + }, + { + "epoch": 0.22, + "grad_norm": 1.424562524846477, + "learning_rate": 1.808228813663742e-05, + "loss": 1.1279, + "step": 4674 + }, + { + "epoch": 0.22, + "grad_norm": 1.1907554194828616, + "learning_rate": 1.8081370705727632e-05, + "loss": 1.0933, + "step": 4675 + }, + { + "epoch": 0.22, + "grad_norm": 1.398530463177658, + "learning_rate": 1.8080453078706737e-05, + "loss": 1.3457, + "step": 4676 + }, + { + "epoch": 0.22, + "grad_norm": 1.2875249787611744, + "learning_rate": 1.8079535255597014e-05, + "loss": 1.2524, + "step": 4677 + }, + { + "epoch": 0.23, + "grad_norm": 1.0729790530513408, + "learning_rate": 1.807861723642073e-05, + "loss": 1.1504, + "step": 4678 + }, + { + "epoch": 0.23, + "grad_norm": 1.1350773964669618, + "learning_rate": 1.8077699021200163e-05, + "loss": 1.0835, + "step": 4679 + }, + { + "epoch": 0.23, + "grad_norm": 1.3236305609677372, + "learning_rate": 1.80767806099576e-05, + "loss": 1.2319, + "step": 4680 + }, + { + "epoch": 0.23, + "grad_norm": 1.1780538090194632, + "learning_rate": 1.807586200271532e-05, + "loss": 1.1465, + "step": 4681 + }, + { + "epoch": 0.23, + "grad_norm": 0.9526330233164249, + "learning_rate": 1.8074943199495622e-05, + "loss": 1.0022, + "step": 4682 + }, + { + "epoch": 0.23, + "grad_norm": 1.1646270808452792, + "learning_rate": 1.8074024200320797e-05, + "loss": 1.1699, + "step": 4683 + }, + { + "epoch": 0.23, + "grad_norm": 1.3649003790727408, + "learning_rate": 1.8073105005213154e-05, + "loss": 1.2568, + "step": 4684 + }, + { + "epoch": 0.23, + "grad_norm": 1.275701296985823, + "learning_rate": 1.807218561419499e-05, + "loss": 1.3047, + "step": 4685 + }, + { + "epoch": 0.23, + "grad_norm": 1.0909534141027, + "learning_rate": 1.8071266027288625e-05, + "loss": 1.2085, + "step": 4686 + }, + { + "epoch": 0.23, + "grad_norm": 1.238954844341487, + "learning_rate": 1.8070346244516367e-05, + "loss": 1.3262, + "step": 4687 + }, + { + "epoch": 0.23, + "grad_norm": 1.0873599920950383, + "learning_rate": 1.806942626590054e-05, + "loss": 1.1831, + "step": 4688 + }, + { + "epoch": 0.23, + "grad_norm": 1.1494026566616067, + "learning_rate": 1.8068506091463473e-05, + "loss": 1.0579, + "step": 4689 + }, + { + "epoch": 0.23, + "grad_norm": 1.4794095107274565, + "learning_rate": 1.806758572122749e-05, + "loss": 1.2139, + "step": 4690 + }, + { + "epoch": 0.23, + "grad_norm": 1.3590725589600858, + "learning_rate": 1.806666515521492e-05, + "loss": 1.2275, + "step": 4691 + }, + { + "epoch": 0.23, + "grad_norm": 1.1980065702956226, + "learning_rate": 1.8065744393448118e-05, + "loss": 1.1875, + "step": 4692 + }, + { + "epoch": 0.23, + "grad_norm": 1.3513001456924731, + "learning_rate": 1.806482343594942e-05, + "loss": 1.2705, + "step": 4693 + }, + { + "epoch": 0.23, + "grad_norm": 1.0791575345875006, + "learning_rate": 1.806390228274117e-05, + "loss": 1.1836, + "step": 4694 + }, + { + "epoch": 0.23, + "grad_norm": 1.204674006048473, + "learning_rate": 1.8062980933845732e-05, + "loss": 1.2476, + "step": 4695 + }, + { + "epoch": 0.23, + "grad_norm": 1.076139799641576, + "learning_rate": 1.8062059389285455e-05, + "loss": 0.9609, + "step": 4696 + }, + { + "epoch": 0.23, + "grad_norm": 1.5047779205361997, + "learning_rate": 1.806113764908271e-05, + "loss": 1.2627, + "step": 4697 + }, + { + "epoch": 0.23, + "grad_norm": 1.1404898331352722, + "learning_rate": 1.8060215713259856e-05, + "loss": 1.1719, + "step": 4698 + }, + { + "epoch": 0.23, + "grad_norm": 1.1329321419713207, + "learning_rate": 1.8059293581839277e-05, + "loss": 1.0942, + "step": 4699 + }, + { + "epoch": 0.23, + "grad_norm": 1.1406016954405083, + "learning_rate": 1.805837125484334e-05, + "loss": 1.3154, + "step": 4700 + }, + { + "epoch": 0.23, + "grad_norm": 0.9965141206771454, + "learning_rate": 1.8057448732294432e-05, + "loss": 1.0796, + "step": 4701 + }, + { + "epoch": 0.23, + "grad_norm": 0.9782328337948923, + "learning_rate": 1.8056526014214944e-05, + "loss": 1.2197, + "step": 4702 + }, + { + "epoch": 0.23, + "grad_norm": 1.1966014738920447, + "learning_rate": 1.805560310062726e-05, + "loss": 1.2222, + "step": 4703 + }, + { + "epoch": 0.23, + "grad_norm": 1.3818728511207918, + "learning_rate": 1.8054679991553777e-05, + "loss": 1.1831, + "step": 4704 + }, + { + "epoch": 0.23, + "grad_norm": 1.2504069740202592, + "learning_rate": 1.80537566870169e-05, + "loss": 1.1816, + "step": 4705 + }, + { + "epoch": 0.23, + "grad_norm": 1.1062159558297529, + "learning_rate": 1.805283318703903e-05, + "loss": 1.2119, + "step": 4706 + }, + { + "epoch": 0.23, + "grad_norm": 1.1956071527943828, + "learning_rate": 1.805190949164259e-05, + "loss": 1.0005, + "step": 4707 + }, + { + "epoch": 0.23, + "grad_norm": 1.1017141840320077, + "learning_rate": 1.805098560084998e-05, + "loss": 1.2017, + "step": 4708 + }, + { + "epoch": 0.23, + "grad_norm": 1.2509543503500475, + "learning_rate": 1.8050061514683624e-05, + "loss": 1.0874, + "step": 4709 + }, + { + "epoch": 0.23, + "grad_norm": 1.2154513707525691, + "learning_rate": 1.8049137233165955e-05, + "loss": 1.2056, + "step": 4710 + }, + { + "epoch": 0.23, + "grad_norm": 1.36147357569537, + "learning_rate": 1.8048212756319395e-05, + "loss": 1.2837, + "step": 4711 + }, + { + "epoch": 0.23, + "grad_norm": 1.279927035136454, + "learning_rate": 1.804728808416638e-05, + "loss": 1.1895, + "step": 4712 + }, + { + "epoch": 0.23, + "grad_norm": 1.3683547969348335, + "learning_rate": 1.8046363216729354e-05, + "loss": 1.0684, + "step": 4713 + }, + { + "epoch": 0.23, + "grad_norm": 1.3404813937571631, + "learning_rate": 1.8045438154030752e-05, + "loss": 1.2148, + "step": 4714 + }, + { + "epoch": 0.23, + "grad_norm": 1.2152566777538842, + "learning_rate": 1.8044512896093027e-05, + "loss": 0.9871, + "step": 4715 + }, + { + "epoch": 0.23, + "grad_norm": 1.2983493509408623, + "learning_rate": 1.8043587442938633e-05, + "loss": 1.168, + "step": 4716 + }, + { + "epoch": 0.23, + "grad_norm": 1.1635304243486215, + "learning_rate": 1.8042661794590023e-05, + "loss": 1.1123, + "step": 4717 + }, + { + "epoch": 0.23, + "grad_norm": 0.9551003047991745, + "learning_rate": 1.804173595106967e-05, + "loss": 1.2651, + "step": 4718 + }, + { + "epoch": 0.23, + "grad_norm": 1.3840139914082743, + "learning_rate": 1.804080991240003e-05, + "loss": 1.3867, + "step": 4719 + }, + { + "epoch": 0.23, + "grad_norm": 1.186623851955907, + "learning_rate": 1.8039883678603583e-05, + "loss": 1.0918, + "step": 4720 + }, + { + "epoch": 0.23, + "grad_norm": 1.1163932318198322, + "learning_rate": 1.8038957249702806e-05, + "loss": 1.2358, + "step": 4721 + }, + { + "epoch": 0.23, + "grad_norm": 1.3966442448731171, + "learning_rate": 1.8038030625720173e-05, + "loss": 1.1479, + "step": 4722 + }, + { + "epoch": 0.23, + "grad_norm": 1.4219015614645245, + "learning_rate": 1.803710380667818e-05, + "loss": 1.0059, + "step": 4723 + }, + { + "epoch": 0.23, + "grad_norm": 1.379041125802112, + "learning_rate": 1.8036176792599313e-05, + "loss": 1.3364, + "step": 4724 + }, + { + "epoch": 0.23, + "grad_norm": 1.1356256732697534, + "learning_rate": 1.803524958350607e-05, + "loss": 1.1646, + "step": 4725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2266014799152656, + "learning_rate": 1.8034322179420946e-05, + "loss": 1.0283, + "step": 4726 + }, + { + "epoch": 0.23, + "grad_norm": 1.3864735664591499, + "learning_rate": 1.8033394580366453e-05, + "loss": 1.1938, + "step": 4727 + }, + { + "epoch": 0.23, + "grad_norm": 1.582256514201053, + "learning_rate": 1.8032466786365098e-05, + "loss": 1.2363, + "step": 4728 + }, + { + "epoch": 0.23, + "grad_norm": 1.3057015405120276, + "learning_rate": 1.80315387974394e-05, + "loss": 1.2915, + "step": 4729 + }, + { + "epoch": 0.23, + "grad_norm": 1.3958838152400168, + "learning_rate": 1.8030610613611874e-05, + "loss": 1.1582, + "step": 4730 + }, + { + "epoch": 0.23, + "grad_norm": 1.1065903287778764, + "learning_rate": 1.8029682234905044e-05, + "loss": 1.2671, + "step": 4731 + }, + { + "epoch": 0.23, + "grad_norm": 0.7720622182915661, + "learning_rate": 1.8028753661341442e-05, + "loss": 1.0376, + "step": 4732 + }, + { + "epoch": 0.23, + "grad_norm": 1.410948277583298, + "learning_rate": 1.80278248929436e-05, + "loss": 1.3672, + "step": 4733 + }, + { + "epoch": 0.23, + "grad_norm": 0.9547728565598028, + "learning_rate": 1.8026895929734057e-05, + "loss": 0.9995, + "step": 4734 + }, + { + "epoch": 0.23, + "grad_norm": 1.0832624512365303, + "learning_rate": 1.8025966771735354e-05, + "loss": 0.9922, + "step": 4735 + }, + { + "epoch": 0.23, + "grad_norm": 1.433393149373672, + "learning_rate": 1.8025037418970047e-05, + "loss": 1.3467, + "step": 4736 + }, + { + "epoch": 0.23, + "grad_norm": 1.2307513808084152, + "learning_rate": 1.8024107871460678e-05, + "loss": 1.0269, + "step": 4737 + }, + { + "epoch": 0.23, + "grad_norm": 1.4384987513189464, + "learning_rate": 1.8023178129229808e-05, + "loss": 1.2583, + "step": 4738 + }, + { + "epoch": 0.23, + "grad_norm": 1.4057937018469655, + "learning_rate": 1.8022248192300002e-05, + "loss": 1.0439, + "step": 4739 + }, + { + "epoch": 0.23, + "grad_norm": 0.8866770934228514, + "learning_rate": 1.8021318060693823e-05, + "loss": 1.1943, + "step": 4740 + }, + { + "epoch": 0.23, + "grad_norm": 1.2165626877882418, + "learning_rate": 1.8020387734433848e-05, + "loss": 1.2075, + "step": 4741 + }, + { + "epoch": 0.23, + "grad_norm": 1.2185663914878777, + "learning_rate": 1.801945721354265e-05, + "loss": 0.9609, + "step": 4742 + }, + { + "epoch": 0.23, + "grad_norm": 1.1869525692472314, + "learning_rate": 1.8018526498042805e-05, + "loss": 1.1023, + "step": 4743 + }, + { + "epoch": 0.23, + "grad_norm": 1.5349550653870092, + "learning_rate": 1.8017595587956907e-05, + "loss": 1.3047, + "step": 4744 + }, + { + "epoch": 0.23, + "grad_norm": 1.1776730807964357, + "learning_rate": 1.801666448330754e-05, + "loss": 1.1792, + "step": 4745 + }, + { + "epoch": 0.23, + "grad_norm": 1.1159289533317938, + "learning_rate": 1.8015733184117307e-05, + "loss": 1.3286, + "step": 4746 + }, + { + "epoch": 0.23, + "grad_norm": 1.1316172821772992, + "learning_rate": 1.8014801690408798e-05, + "loss": 1.2197, + "step": 4747 + }, + { + "epoch": 0.23, + "grad_norm": 1.1723004652094446, + "learning_rate": 1.8013870002204625e-05, + "loss": 1.2144, + "step": 4748 + }, + { + "epoch": 0.23, + "grad_norm": 1.3347042350890088, + "learning_rate": 1.8012938119527396e-05, + "loss": 1.1846, + "step": 4749 + }, + { + "epoch": 0.23, + "grad_norm": 1.284119406691614, + "learning_rate": 1.8012006042399724e-05, + "loss": 1.1831, + "step": 4750 + }, + { + "epoch": 0.23, + "grad_norm": 1.1480390619709482, + "learning_rate": 1.8011073770844225e-05, + "loss": 1.186, + "step": 4751 + }, + { + "epoch": 0.23, + "grad_norm": 1.29659185452966, + "learning_rate": 1.8010141304883527e-05, + "loss": 1.1284, + "step": 4752 + }, + { + "epoch": 0.23, + "grad_norm": 1.2409893403238503, + "learning_rate": 1.800920864454026e-05, + "loss": 1.1631, + "step": 4753 + }, + { + "epoch": 0.23, + "grad_norm": 1.110250623152587, + "learning_rate": 1.8008275789837047e-05, + "loss": 1.0918, + "step": 4754 + }, + { + "epoch": 0.23, + "grad_norm": 1.3717125788767786, + "learning_rate": 1.8007342740796538e-05, + "loss": 1.1348, + "step": 4755 + }, + { + "epoch": 0.23, + "grad_norm": 1.5619640843294615, + "learning_rate": 1.8006409497441364e-05, + "loss": 1.3594, + "step": 4756 + }, + { + "epoch": 0.23, + "grad_norm": 1.6241318112234875, + "learning_rate": 1.8005476059794183e-05, + "loss": 1.2324, + "step": 4757 + }, + { + "epoch": 0.23, + "grad_norm": 1.1312551821153949, + "learning_rate": 1.8004542427877635e-05, + "loss": 1.2104, + "step": 4758 + }, + { + "epoch": 0.23, + "grad_norm": 1.2075139994914001, + "learning_rate": 1.8003608601714388e-05, + "loss": 1.1006, + "step": 4759 + }, + { + "epoch": 0.23, + "grad_norm": 1.2999486701092993, + "learning_rate": 1.8002674581327096e-05, + "loss": 1.2046, + "step": 4760 + }, + { + "epoch": 0.23, + "grad_norm": 1.2247780356224924, + "learning_rate": 1.8001740366738426e-05, + "loss": 1.1016, + "step": 4761 + }, + { + "epoch": 0.23, + "grad_norm": 1.2123503978391823, + "learning_rate": 1.8000805957971054e-05, + "loss": 1.1504, + "step": 4762 + }, + { + "epoch": 0.23, + "grad_norm": 1.2475202729322248, + "learning_rate": 1.7999871355047647e-05, + "loss": 1.1982, + "step": 4763 + }, + { + "epoch": 0.23, + "grad_norm": 1.3599648169897485, + "learning_rate": 1.799893655799089e-05, + "loss": 1.3276, + "step": 4764 + }, + { + "epoch": 0.23, + "grad_norm": 1.2323664153259568, + "learning_rate": 1.7998001566823466e-05, + "loss": 1.2236, + "step": 4765 + }, + { + "epoch": 0.23, + "grad_norm": 1.1606424902794972, + "learning_rate": 1.7997066381568066e-05, + "loss": 1.2412, + "step": 4766 + }, + { + "epoch": 0.23, + "grad_norm": 1.1299954595629869, + "learning_rate": 1.7996131002247382e-05, + "loss": 1.2407, + "step": 4767 + }, + { + "epoch": 0.23, + "grad_norm": 1.1064547574625627, + "learning_rate": 1.7995195428884114e-05, + "loss": 1.1333, + "step": 4768 + }, + { + "epoch": 0.23, + "grad_norm": 1.3252188493516122, + "learning_rate": 1.7994259661500967e-05, + "loss": 1.1938, + "step": 4769 + }, + { + "epoch": 0.23, + "grad_norm": 1.2156675392033995, + "learning_rate": 1.7993323700120648e-05, + "loss": 1.2422, + "step": 4770 + }, + { + "epoch": 0.23, + "grad_norm": 1.295932156209699, + "learning_rate": 1.7992387544765874e-05, + "loss": 1.0852, + "step": 4771 + }, + { + "epoch": 0.23, + "grad_norm": 1.4342274391199086, + "learning_rate": 1.7991451195459356e-05, + "loss": 1.2583, + "step": 4772 + }, + { + "epoch": 0.23, + "grad_norm": 0.9798805677724934, + "learning_rate": 1.7990514652223818e-05, + "loss": 1.2061, + "step": 4773 + }, + { + "epoch": 0.23, + "grad_norm": 1.2468675401289773, + "learning_rate": 1.798957791508199e-05, + "loss": 1.1191, + "step": 4774 + }, + { + "epoch": 0.23, + "grad_norm": 1.452404609825306, + "learning_rate": 1.7988640984056602e-05, + "loss": 1.293, + "step": 4775 + }, + { + "epoch": 0.23, + "grad_norm": 1.0065926599060373, + "learning_rate": 1.7987703859170393e-05, + "loss": 1.1562, + "step": 4776 + }, + { + "epoch": 0.23, + "grad_norm": 1.3762877160114013, + "learning_rate": 1.79867665404461e-05, + "loss": 1.2427, + "step": 4777 + }, + { + "epoch": 0.23, + "grad_norm": 1.1571259337010766, + "learning_rate": 1.7985829027906475e-05, + "loss": 1.1548, + "step": 4778 + }, + { + "epoch": 0.23, + "grad_norm": 1.1624275712190564, + "learning_rate": 1.798489132157426e-05, + "loss": 1.1709, + "step": 4779 + }, + { + "epoch": 0.23, + "grad_norm": 1.2628034483910522, + "learning_rate": 1.798395342147222e-05, + "loss": 1.2441, + "step": 4780 + }, + { + "epoch": 0.23, + "grad_norm": 1.0570751089905746, + "learning_rate": 1.798301532762311e-05, + "loss": 1.1772, + "step": 4781 + }, + { + "epoch": 0.23, + "grad_norm": 1.1724390717228557, + "learning_rate": 1.7982077040049692e-05, + "loss": 0.9526, + "step": 4782 + }, + { + "epoch": 0.23, + "grad_norm": 1.3380807932209657, + "learning_rate": 1.7981138558774737e-05, + "loss": 1.1899, + "step": 4783 + }, + { + "epoch": 0.23, + "grad_norm": 1.175950190282243, + "learning_rate": 1.7980199883821026e-05, + "loss": 1.187, + "step": 4784 + }, + { + "epoch": 0.23, + "grad_norm": 1.3960075398347727, + "learning_rate": 1.797926101521133e-05, + "loss": 1.2534, + "step": 4785 + }, + { + "epoch": 0.23, + "grad_norm": 1.1427773343251844, + "learning_rate": 1.7978321952968435e-05, + "loss": 1.187, + "step": 4786 + }, + { + "epoch": 0.23, + "grad_norm": 1.328917360343138, + "learning_rate": 1.797738269711513e-05, + "loss": 1.2603, + "step": 4787 + }, + { + "epoch": 0.23, + "grad_norm": 1.3513627504284176, + "learning_rate": 1.797644324767421e-05, + "loss": 1.2188, + "step": 4788 + }, + { + "epoch": 0.23, + "grad_norm": 1.0366685966583733, + "learning_rate": 1.7975503604668468e-05, + "loss": 1.25, + "step": 4789 + }, + { + "epoch": 0.23, + "grad_norm": 1.3898379720692817, + "learning_rate": 1.797456376812071e-05, + "loss": 1.2241, + "step": 4790 + }, + { + "epoch": 0.23, + "grad_norm": 0.9445117226727996, + "learning_rate": 1.797362373805374e-05, + "loss": 1.2661, + "step": 4791 + }, + { + "epoch": 0.23, + "grad_norm": 1.314243341061258, + "learning_rate": 1.7972683514490372e-05, + "loss": 1.123, + "step": 4792 + }, + { + "epoch": 0.23, + "grad_norm": 1.3616706015419788, + "learning_rate": 1.797174309745342e-05, + "loss": 1.3003, + "step": 4793 + }, + { + "epoch": 0.23, + "grad_norm": 1.201533515826434, + "learning_rate": 1.7970802486965712e-05, + "loss": 1.0708, + "step": 4794 + }, + { + "epoch": 0.23, + "grad_norm": 1.1441043823401753, + "learning_rate": 1.7969861683050064e-05, + "loss": 1.1226, + "step": 4795 + }, + { + "epoch": 0.23, + "grad_norm": 1.5796538111838478, + "learning_rate": 1.7968920685729317e-05, + "loss": 1.1914, + "step": 4796 + }, + { + "epoch": 0.23, + "grad_norm": 1.2639130581718852, + "learning_rate": 1.7967979495026296e-05, + "loss": 1.2871, + "step": 4797 + }, + { + "epoch": 0.23, + "grad_norm": 1.16603459755483, + "learning_rate": 1.7967038110963847e-05, + "loss": 1.1216, + "step": 4798 + }, + { + "epoch": 0.23, + "grad_norm": 1.0547743515031858, + "learning_rate": 1.7966096533564813e-05, + "loss": 1.1396, + "step": 4799 + }, + { + "epoch": 0.23, + "grad_norm": 1.2066425090812345, + "learning_rate": 1.7965154762852044e-05, + "loss": 1.2417, + "step": 4800 + }, + { + "epoch": 0.23, + "grad_norm": 1.314674221179987, + "learning_rate": 1.7964212798848393e-05, + "loss": 1.1577, + "step": 4801 + }, + { + "epoch": 0.23, + "grad_norm": 1.6450692987135276, + "learning_rate": 1.796327064157672e-05, + "loss": 1.269, + "step": 4802 + }, + { + "epoch": 0.23, + "grad_norm": 1.6003494322657144, + "learning_rate": 1.7962328291059886e-05, + "loss": 1.1396, + "step": 4803 + }, + { + "epoch": 0.23, + "grad_norm": 1.4306269225126373, + "learning_rate": 1.7961385747320763e-05, + "loss": 1.1934, + "step": 4804 + }, + { + "epoch": 0.23, + "grad_norm": 1.468541914484602, + "learning_rate": 1.796044301038222e-05, + "loss": 1.3286, + "step": 4805 + }, + { + "epoch": 0.23, + "grad_norm": 1.2703703708065843, + "learning_rate": 1.795950008026714e-05, + "loss": 1.0669, + "step": 4806 + }, + { + "epoch": 0.23, + "grad_norm": 1.6668995399979225, + "learning_rate": 1.7958556956998397e-05, + "loss": 1.3218, + "step": 4807 + }, + { + "epoch": 0.23, + "grad_norm": 1.1741470114324215, + "learning_rate": 1.795761364059888e-05, + "loss": 1.1846, + "step": 4808 + }, + { + "epoch": 0.23, + "grad_norm": 1.1277221931292671, + "learning_rate": 1.7956670131091486e-05, + "loss": 1.2593, + "step": 4809 + }, + { + "epoch": 0.23, + "grad_norm": 1.5172018310534539, + "learning_rate": 1.7955726428499107e-05, + "loss": 1.2334, + "step": 4810 + }, + { + "epoch": 0.23, + "grad_norm": 1.252945819130846, + "learning_rate": 1.7954782532844643e-05, + "loss": 1.1846, + "step": 4811 + }, + { + "epoch": 0.23, + "grad_norm": 1.5275761624829902, + "learning_rate": 1.7953838444151004e-05, + "loss": 1.1392, + "step": 4812 + }, + { + "epoch": 0.23, + "grad_norm": 1.209969705119666, + "learning_rate": 1.7952894162441094e-05, + "loss": 1.2251, + "step": 4813 + }, + { + "epoch": 0.23, + "grad_norm": 1.3100477949217855, + "learning_rate": 1.795194968773783e-05, + "loss": 1.1064, + "step": 4814 + }, + { + "epoch": 0.23, + "grad_norm": 1.2553842089208935, + "learning_rate": 1.7951005020064142e-05, + "loss": 1.2485, + "step": 4815 + }, + { + "epoch": 0.23, + "grad_norm": 1.1107125769051212, + "learning_rate": 1.7950060159442934e-05, + "loss": 1.2549, + "step": 4816 + }, + { + "epoch": 0.23, + "grad_norm": 1.2053836021335163, + "learning_rate": 1.7949115105897155e-05, + "loss": 1.165, + "step": 4817 + }, + { + "epoch": 0.23, + "grad_norm": 1.442472508822967, + "learning_rate": 1.7948169859449726e-05, + "loss": 1.2046, + "step": 4818 + }, + { + "epoch": 0.23, + "grad_norm": 1.128925272271494, + "learning_rate": 1.7947224420123587e-05, + "loss": 1.1099, + "step": 4819 + }, + { + "epoch": 0.23, + "grad_norm": 1.3313418654082176, + "learning_rate": 1.7946278787941687e-05, + "loss": 1.1851, + "step": 4820 + }, + { + "epoch": 0.23, + "grad_norm": 1.311067265840723, + "learning_rate": 1.7945332962926966e-05, + "loss": 1.1807, + "step": 4821 + }, + { + "epoch": 0.23, + "grad_norm": 1.602106963558663, + "learning_rate": 1.7944386945102387e-05, + "loss": 1.3335, + "step": 4822 + }, + { + "epoch": 0.23, + "grad_norm": 1.2347212191041443, + "learning_rate": 1.7943440734490893e-05, + "loss": 1.2441, + "step": 4823 + }, + { + "epoch": 0.23, + "grad_norm": 1.261935041212208, + "learning_rate": 1.794249433111546e-05, + "loss": 1.2686, + "step": 4824 + }, + { + "epoch": 0.23, + "grad_norm": 1.2452358253451232, + "learning_rate": 1.7941547734999043e-05, + "loss": 1.2002, + "step": 4825 + }, + { + "epoch": 0.23, + "grad_norm": 1.4188010894400374, + "learning_rate": 1.794060094616462e-05, + "loss": 1.3076, + "step": 4826 + }, + { + "epoch": 0.23, + "grad_norm": 1.232720977092351, + "learning_rate": 1.7939653964635163e-05, + "loss": 1.1558, + "step": 4827 + }, + { + "epoch": 0.23, + "grad_norm": 1.1606859077341687, + "learning_rate": 1.7938706790433655e-05, + "loss": 1.228, + "step": 4828 + }, + { + "epoch": 0.23, + "grad_norm": 0.9413895400191855, + "learning_rate": 1.793775942358308e-05, + "loss": 1.0776, + "step": 4829 + }, + { + "epoch": 0.23, + "grad_norm": 1.3247558943150954, + "learning_rate": 1.7936811864106425e-05, + "loss": 1.1338, + "step": 4830 + }, + { + "epoch": 0.23, + "grad_norm": 1.2420599709159084, + "learning_rate": 1.793586411202669e-05, + "loss": 1.1748, + "step": 4831 + }, + { + "epoch": 0.23, + "grad_norm": 1.1182037019886841, + "learning_rate": 1.793491616736687e-05, + "loss": 1.1768, + "step": 4832 + }, + { + "epoch": 0.23, + "grad_norm": 1.2873283607637003, + "learning_rate": 1.7933968030149972e-05, + "loss": 1.0615, + "step": 4833 + }, + { + "epoch": 0.23, + "grad_norm": 1.3958028992618157, + "learning_rate": 1.7933019700399006e-05, + "loss": 1.2744, + "step": 4834 + }, + { + "epoch": 0.23, + "grad_norm": 1.6302902440836315, + "learning_rate": 1.793207117813698e-05, + "loss": 1.2744, + "step": 4835 + }, + { + "epoch": 0.23, + "grad_norm": 1.3365901088581946, + "learning_rate": 1.793112246338691e-05, + "loss": 1.2334, + "step": 4836 + }, + { + "epoch": 0.23, + "grad_norm": 1.2052241546992237, + "learning_rate": 1.7930173556171824e-05, + "loss": 1.0522, + "step": 4837 + }, + { + "epoch": 0.23, + "grad_norm": 1.2874896991537066, + "learning_rate": 1.792922445651475e-05, + "loss": 1.1042, + "step": 4838 + }, + { + "epoch": 0.23, + "grad_norm": 1.0761764553702429, + "learning_rate": 1.7928275164438715e-05, + "loss": 1.1743, + "step": 4839 + }, + { + "epoch": 0.23, + "grad_norm": 1.2762026316667618, + "learning_rate": 1.792732567996676e-05, + "loss": 1.2715, + "step": 4840 + }, + { + "epoch": 0.23, + "grad_norm": 1.239287824799206, + "learning_rate": 1.7926376003121922e-05, + "loss": 1.1602, + "step": 4841 + }, + { + "epoch": 0.23, + "grad_norm": 1.6974471067355485, + "learning_rate": 1.792542613392725e-05, + "loss": 1.3389, + "step": 4842 + }, + { + "epoch": 0.23, + "grad_norm": 1.2385153571119034, + "learning_rate": 1.7924476072405795e-05, + "loss": 1.2061, + "step": 4843 + }, + { + "epoch": 0.23, + "grad_norm": 1.490074876952386, + "learning_rate": 1.792352581858061e-05, + "loss": 1.2109, + "step": 4844 + }, + { + "epoch": 0.23, + "grad_norm": 1.4414356011566127, + "learning_rate": 1.7922575372474755e-05, + "loss": 1.2192, + "step": 4845 + }, + { + "epoch": 0.23, + "grad_norm": 0.8255915598651289, + "learning_rate": 1.7921624734111292e-05, + "loss": 1.094, + "step": 4846 + }, + { + "epoch": 0.23, + "grad_norm": 1.1501745247902115, + "learning_rate": 1.79206739035133e-05, + "loss": 1.272, + "step": 4847 + }, + { + "epoch": 0.23, + "grad_norm": 1.1884486796434799, + "learning_rate": 1.7919722880703843e-05, + "loss": 1.3252, + "step": 4848 + }, + { + "epoch": 0.23, + "grad_norm": 1.4740860411457277, + "learning_rate": 1.7918771665706e-05, + "loss": 1.186, + "step": 4849 + }, + { + "epoch": 0.23, + "grad_norm": 1.2783430253409893, + "learning_rate": 1.7917820258542863e-05, + "loss": 1.1904, + "step": 4850 + }, + { + "epoch": 0.23, + "grad_norm": 1.4131829380239787, + "learning_rate": 1.791686865923751e-05, + "loss": 1.2471, + "step": 4851 + }, + { + "epoch": 0.23, + "grad_norm": 1.0272441516966353, + "learning_rate": 1.7915916867813037e-05, + "loss": 1.2832, + "step": 4852 + }, + { + "epoch": 0.23, + "grad_norm": 1.4023420395124582, + "learning_rate": 1.7914964884292543e-05, + "loss": 1.2075, + "step": 4853 + }, + { + "epoch": 0.23, + "grad_norm": 1.2076652210030183, + "learning_rate": 1.7914012708699126e-05, + "loss": 1.1355, + "step": 4854 + }, + { + "epoch": 0.23, + "grad_norm": 1.3869291607103191, + "learning_rate": 1.7913060341055895e-05, + "loss": 1.3179, + "step": 4855 + }, + { + "epoch": 0.23, + "grad_norm": 1.08342390083113, + "learning_rate": 1.7912107781385963e-05, + "loss": 1.2021, + "step": 4856 + }, + { + "epoch": 0.23, + "grad_norm": 1.206832994213776, + "learning_rate": 1.7911155029712444e-05, + "loss": 1.1333, + "step": 4857 + }, + { + "epoch": 0.23, + "grad_norm": 1.1264510552471978, + "learning_rate": 1.7910202086058458e-05, + "loss": 1.0659, + "step": 4858 + }, + { + "epoch": 0.23, + "grad_norm": 1.2905997997978627, + "learning_rate": 1.790924895044713e-05, + "loss": 1.1777, + "step": 4859 + }, + { + "epoch": 0.23, + "grad_norm": 1.3101097016670136, + "learning_rate": 1.790829562290159e-05, + "loss": 1.21, + "step": 4860 + }, + { + "epoch": 0.23, + "grad_norm": 1.2403979649533146, + "learning_rate": 1.7907342103444975e-05, + "loss": 1.1846, + "step": 4861 + }, + { + "epoch": 0.23, + "grad_norm": 1.2746990628386623, + "learning_rate": 1.790638839210042e-05, + "loss": 1.3071, + "step": 4862 + }, + { + "epoch": 0.23, + "grad_norm": 1.2747284607947689, + "learning_rate": 1.790543448889107e-05, + "loss": 1.1499, + "step": 4863 + }, + { + "epoch": 0.23, + "grad_norm": 1.1685847828910976, + "learning_rate": 1.7904480393840074e-05, + "loss": 1.2178, + "step": 4864 + }, + { + "epoch": 0.23, + "grad_norm": 1.1109384979867274, + "learning_rate": 1.7903526106970585e-05, + "loss": 1.0681, + "step": 4865 + }, + { + "epoch": 0.23, + "grad_norm": 1.2436524915169016, + "learning_rate": 1.790257162830576e-05, + "loss": 1.1724, + "step": 4866 + }, + { + "epoch": 0.23, + "grad_norm": 1.31575919496647, + "learning_rate": 1.7901616957868766e-05, + "loss": 1.0962, + "step": 4867 + }, + { + "epoch": 0.23, + "grad_norm": 1.253257180362507, + "learning_rate": 1.7900662095682762e-05, + "loss": 1.2891, + "step": 4868 + }, + { + "epoch": 0.23, + "grad_norm": 1.0411566895070299, + "learning_rate": 1.7899707041770925e-05, + "loss": 1.2373, + "step": 4869 + }, + { + "epoch": 0.23, + "grad_norm": 1.213919844737081, + "learning_rate": 1.7898751796156433e-05, + "loss": 1.1631, + "step": 4870 + }, + { + "epoch": 0.23, + "grad_norm": 1.3632067434132507, + "learning_rate": 1.789779635886246e-05, + "loss": 1.0728, + "step": 4871 + }, + { + "epoch": 0.23, + "grad_norm": 1.1393019543225746, + "learning_rate": 1.7896840729912198e-05, + "loss": 1.1274, + "step": 4872 + }, + { + "epoch": 0.23, + "grad_norm": 1.237438298901906, + "learning_rate": 1.7895884909328835e-05, + "loss": 1.0305, + "step": 4873 + }, + { + "epoch": 0.23, + "grad_norm": 1.1540523427972924, + "learning_rate": 1.789492889713557e-05, + "loss": 1.2158, + "step": 4874 + }, + { + "epoch": 0.23, + "grad_norm": 1.5325534815204809, + "learning_rate": 1.7893972693355595e-05, + "loss": 1.2212, + "step": 4875 + }, + { + "epoch": 0.23, + "grad_norm": 1.240533783595969, + "learning_rate": 1.7893016298012117e-05, + "loss": 1.0847, + "step": 4876 + }, + { + "epoch": 0.23, + "grad_norm": 1.4304131228610446, + "learning_rate": 1.7892059711128346e-05, + "loss": 1.2085, + "step": 4877 + }, + { + "epoch": 0.23, + "grad_norm": 1.3855019199928713, + "learning_rate": 1.78911029327275e-05, + "loss": 1.1375, + "step": 4878 + }, + { + "epoch": 0.23, + "grad_norm": 1.309576136328083, + "learning_rate": 1.7890145962832786e-05, + "loss": 1.219, + "step": 4879 + }, + { + "epoch": 0.23, + "grad_norm": 1.4506581637571443, + "learning_rate": 1.788918880146744e-05, + "loss": 1.332, + "step": 4880 + }, + { + "epoch": 0.23, + "grad_norm": 1.2916938964173776, + "learning_rate": 1.788823144865468e-05, + "loss": 1.2129, + "step": 4881 + }, + { + "epoch": 0.23, + "grad_norm": 1.10011917355303, + "learning_rate": 1.7887273904417742e-05, + "loss": 1.1069, + "step": 4882 + }, + { + "epoch": 0.23, + "grad_norm": 1.3572908076976662, + "learning_rate": 1.7886316168779862e-05, + "loss": 1.0693, + "step": 4883 + }, + { + "epoch": 0.23, + "grad_norm": 1.1720517729891105, + "learning_rate": 1.7885358241764282e-05, + "loss": 1.0847, + "step": 4884 + }, + { + "epoch": 0.23, + "grad_norm": 1.4090624946605568, + "learning_rate": 1.7884400123394243e-05, + "loss": 1.1606, + "step": 4885 + }, + { + "epoch": 0.24, + "grad_norm": 1.342055959668762, + "learning_rate": 1.7883441813693006e-05, + "loss": 1.3096, + "step": 4886 + }, + { + "epoch": 0.24, + "grad_norm": 1.2033290836822448, + "learning_rate": 1.7882483312683816e-05, + "loss": 1.187, + "step": 4887 + }, + { + "epoch": 0.24, + "grad_norm": 1.2271961206823496, + "learning_rate": 1.788152462038994e-05, + "loss": 1.1162, + "step": 4888 + }, + { + "epoch": 0.24, + "grad_norm": 1.325104086239715, + "learning_rate": 1.7880565736834642e-05, + "loss": 1.1484, + "step": 4889 + }, + { + "epoch": 0.24, + "grad_norm": 1.2300055391952998, + "learning_rate": 1.7879606662041186e-05, + "loss": 1.0547, + "step": 4890 + }, + { + "epoch": 0.24, + "grad_norm": 1.2248587362007275, + "learning_rate": 1.787864739603285e-05, + "loss": 1.2417, + "step": 4891 + }, + { + "epoch": 0.24, + "grad_norm": 1.1078914218472715, + "learning_rate": 1.7877687938832915e-05, + "loss": 1.3486, + "step": 4892 + }, + { + "epoch": 0.24, + "grad_norm": 1.0592133924622917, + "learning_rate": 1.7876728290464658e-05, + "loss": 1.1431, + "step": 4893 + }, + { + "epoch": 0.24, + "grad_norm": 1.3412408214458693, + "learning_rate": 1.787576845095137e-05, + "loss": 1.2339, + "step": 4894 + }, + { + "epoch": 0.24, + "grad_norm": 1.1061983543608782, + "learning_rate": 1.7874808420316345e-05, + "loss": 0.8818, + "step": 4895 + }, + { + "epoch": 0.24, + "grad_norm": 1.396994611076, + "learning_rate": 1.787384819858288e-05, + "loss": 1.2852, + "step": 4896 + }, + { + "epoch": 0.24, + "grad_norm": 1.2108125250907351, + "learning_rate": 1.787288778577427e-05, + "loss": 1.3237, + "step": 4897 + }, + { + "epoch": 0.24, + "grad_norm": 1.0203444495536762, + "learning_rate": 1.7871927181913832e-05, + "loss": 1.1597, + "step": 4898 + }, + { + "epoch": 0.24, + "grad_norm": 1.1473891799165707, + "learning_rate": 1.787096638702487e-05, + "loss": 1.2539, + "step": 4899 + }, + { + "epoch": 0.24, + "grad_norm": 1.0423639691788777, + "learning_rate": 1.78700054011307e-05, + "loss": 1.2891, + "step": 4900 + }, + { + "epoch": 0.24, + "grad_norm": 1.3553928839144185, + "learning_rate": 1.7869044224254648e-05, + "loss": 1.2939, + "step": 4901 + }, + { + "epoch": 0.24, + "grad_norm": 1.6269742100185638, + "learning_rate": 1.786808285642003e-05, + "loss": 1.3418, + "step": 4902 + }, + { + "epoch": 0.24, + "grad_norm": 1.2910689119201095, + "learning_rate": 1.7867121297650184e-05, + "loss": 1.1641, + "step": 4903 + }, + { + "epoch": 0.24, + "grad_norm": 1.372005169159716, + "learning_rate": 1.786615954796844e-05, + "loss": 1.334, + "step": 4904 + }, + { + "epoch": 0.24, + "grad_norm": 1.1887744708047752, + "learning_rate": 1.7865197607398133e-05, + "loss": 1.23, + "step": 4905 + }, + { + "epoch": 0.24, + "grad_norm": 1.3319805544671728, + "learning_rate": 1.7864235475962616e-05, + "loss": 1.2383, + "step": 4906 + }, + { + "epoch": 0.24, + "grad_norm": 1.3453655278083003, + "learning_rate": 1.786327315368523e-05, + "loss": 1.1528, + "step": 4907 + }, + { + "epoch": 0.24, + "grad_norm": 1.0449369142233305, + "learning_rate": 1.7862310640589328e-05, + "loss": 1.2539, + "step": 4908 + }, + { + "epoch": 0.24, + "grad_norm": 1.1801755222964156, + "learning_rate": 1.786134793669827e-05, + "loss": 1.0686, + "step": 4909 + }, + { + "epoch": 0.24, + "grad_norm": 1.2405030857612542, + "learning_rate": 1.7860385042035418e-05, + "loss": 1.2139, + "step": 4910 + }, + { + "epoch": 0.24, + "grad_norm": 1.3638758352578797, + "learning_rate": 1.7859421956624135e-05, + "loss": 1.2612, + "step": 4911 + }, + { + "epoch": 0.24, + "grad_norm": 1.286899891343759, + "learning_rate": 1.7858458680487798e-05, + "loss": 1.3159, + "step": 4912 + }, + { + "epoch": 0.24, + "grad_norm": 1.263679968761987, + "learning_rate": 1.785749521364978e-05, + "loss": 1.2539, + "step": 4913 + }, + { + "epoch": 0.24, + "grad_norm": 1.1716779480091566, + "learning_rate": 1.7856531556133457e-05, + "loss": 1.3159, + "step": 4914 + }, + { + "epoch": 0.24, + "grad_norm": 1.281444551446763, + "learning_rate": 1.785556770796222e-05, + "loss": 1.1353, + "step": 4915 + }, + { + "epoch": 0.24, + "grad_norm": 1.2561890182054964, + "learning_rate": 1.785460366915946e-05, + "loss": 1.3096, + "step": 4916 + }, + { + "epoch": 0.24, + "grad_norm": 1.4511476586583223, + "learning_rate": 1.7853639439748564e-05, + "loss": 1.0789, + "step": 4917 + }, + { + "epoch": 0.24, + "grad_norm": 1.1232233001210574, + "learning_rate": 1.785267501975294e-05, + "loss": 1.2651, + "step": 4918 + }, + { + "epoch": 0.24, + "grad_norm": 1.066829476618541, + "learning_rate": 1.7851710409195987e-05, + "loss": 1.1602, + "step": 4919 + }, + { + "epoch": 0.24, + "grad_norm": 0.8674621841001435, + "learning_rate": 1.785074560810111e-05, + "loss": 1.0442, + "step": 4920 + }, + { + "epoch": 0.24, + "grad_norm": 1.3115864251158849, + "learning_rate": 1.7849780616491726e-05, + "loss": 1.2861, + "step": 4921 + }, + { + "epoch": 0.24, + "grad_norm": 1.322655778231526, + "learning_rate": 1.7848815434391254e-05, + "loss": 1.3882, + "step": 4922 + }, + { + "epoch": 0.24, + "grad_norm": 1.433923179026627, + "learning_rate": 1.784785006182311e-05, + "loss": 1.2983, + "step": 4923 + }, + { + "epoch": 0.24, + "grad_norm": 1.288801606226345, + "learning_rate": 1.784688449881073e-05, + "loss": 1.168, + "step": 4924 + }, + { + "epoch": 0.24, + "grad_norm": 1.1470195672799404, + "learning_rate": 1.784591874537754e-05, + "loss": 1.2241, + "step": 4925 + }, + { + "epoch": 0.24, + "grad_norm": 1.3686436557433157, + "learning_rate": 1.784495280154697e-05, + "loss": 1.2354, + "step": 4926 + }, + { + "epoch": 0.24, + "grad_norm": 1.335580331481445, + "learning_rate": 1.784398666734247e-05, + "loss": 1.2065, + "step": 4927 + }, + { + "epoch": 0.24, + "grad_norm": 1.3334503623163296, + "learning_rate": 1.784302034278748e-05, + "loss": 1.1118, + "step": 4928 + }, + { + "epoch": 0.24, + "grad_norm": 1.3919412200538406, + "learning_rate": 1.7842053827905457e-05, + "loss": 1.2812, + "step": 4929 + }, + { + "epoch": 0.24, + "grad_norm": 1.0850334105205488, + "learning_rate": 1.784108712271985e-05, + "loss": 1.0513, + "step": 4930 + }, + { + "epoch": 0.24, + "grad_norm": 1.1149797987981631, + "learning_rate": 1.7840120227254115e-05, + "loss": 1.0723, + "step": 4931 + }, + { + "epoch": 0.24, + "grad_norm": 1.5002103952248196, + "learning_rate": 1.783915314153172e-05, + "loss": 1.3828, + "step": 4932 + }, + { + "epoch": 0.24, + "grad_norm": 1.27629226848574, + "learning_rate": 1.783818586557613e-05, + "loss": 1.0608, + "step": 4933 + }, + { + "epoch": 0.24, + "grad_norm": 1.0764915894362816, + "learning_rate": 1.7837218399410822e-05, + "loss": 1.1724, + "step": 4934 + }, + { + "epoch": 0.24, + "grad_norm": 1.2377380650314207, + "learning_rate": 1.783625074305927e-05, + "loss": 1.2451, + "step": 4935 + }, + { + "epoch": 0.24, + "grad_norm": 1.0683279959189522, + "learning_rate": 1.7835282896544963e-05, + "loss": 1.0791, + "step": 4936 + }, + { + "epoch": 0.24, + "grad_norm": 1.2937508499973909, + "learning_rate": 1.783431485989138e-05, + "loss": 1.2378, + "step": 4937 + }, + { + "epoch": 0.24, + "grad_norm": 1.8520149462321878, + "learning_rate": 1.7833346633122013e-05, + "loss": 1.1221, + "step": 4938 + }, + { + "epoch": 0.24, + "grad_norm": 1.3689484250388382, + "learning_rate": 1.7832378216260365e-05, + "loss": 1.3691, + "step": 4939 + }, + { + "epoch": 0.24, + "grad_norm": 1.6504506682226088, + "learning_rate": 1.7831409609329927e-05, + "loss": 1.3696, + "step": 4940 + }, + { + "epoch": 0.24, + "grad_norm": 1.3262818769406042, + "learning_rate": 1.7830440812354216e-05, + "loss": 1.1694, + "step": 4941 + }, + { + "epoch": 0.24, + "grad_norm": 1.4552880071623104, + "learning_rate": 1.782947182535673e-05, + "loss": 1.2402, + "step": 4942 + }, + { + "epoch": 0.24, + "grad_norm": 0.9688034010685299, + "learning_rate": 1.782850264836099e-05, + "loss": 1.2788, + "step": 4943 + }, + { + "epoch": 0.24, + "grad_norm": 1.3676749455560824, + "learning_rate": 1.782753328139051e-05, + "loss": 1.311, + "step": 4944 + }, + { + "epoch": 0.24, + "grad_norm": 1.29895755491313, + "learning_rate": 1.782656372446882e-05, + "loss": 1.1992, + "step": 4945 + }, + { + "epoch": 0.24, + "grad_norm": 1.1218307853685279, + "learning_rate": 1.7825593977619443e-05, + "loss": 1.2095, + "step": 4946 + }, + { + "epoch": 0.24, + "grad_norm": 1.180209126800921, + "learning_rate": 1.782462404086592e-05, + "loss": 1.2866, + "step": 4947 + }, + { + "epoch": 0.24, + "grad_norm": 1.487061209111331, + "learning_rate": 1.782365391423178e-05, + "loss": 1.021, + "step": 4948 + }, + { + "epoch": 0.24, + "grad_norm": 1.2380517682839416, + "learning_rate": 1.7822683597740568e-05, + "loss": 1.1333, + "step": 4949 + }, + { + "epoch": 0.24, + "grad_norm": 1.3501835005580012, + "learning_rate": 1.782171309141583e-05, + "loss": 1.1455, + "step": 4950 + }, + { + "epoch": 0.24, + "grad_norm": 0.9397772784857986, + "learning_rate": 1.782074239528112e-05, + "loss": 1.271, + "step": 4951 + }, + { + "epoch": 0.24, + "grad_norm": 1.4412547854375268, + "learning_rate": 1.781977150935999e-05, + "loss": 1.189, + "step": 4952 + }, + { + "epoch": 0.24, + "grad_norm": 1.0772538275516599, + "learning_rate": 1.7818800433676e-05, + "loss": 1.0056, + "step": 4953 + }, + { + "epoch": 0.24, + "grad_norm": 1.3155811646100826, + "learning_rate": 1.781782916825272e-05, + "loss": 1.1709, + "step": 4954 + }, + { + "epoch": 0.24, + "grad_norm": 1.4229297353596286, + "learning_rate": 1.781685771311372e-05, + "loss": 1.3018, + "step": 4955 + }, + { + "epoch": 0.24, + "grad_norm": 1.2676701758879354, + "learning_rate": 1.781588606828257e-05, + "loss": 1.2173, + "step": 4956 + }, + { + "epoch": 0.24, + "grad_norm": 0.9799388514532664, + "learning_rate": 1.7814914233782848e-05, + "loss": 1.2622, + "step": 4957 + }, + { + "epoch": 0.24, + "grad_norm": 0.895089975317318, + "learning_rate": 1.7813942209638148e-05, + "loss": 1.1523, + "step": 4958 + }, + { + "epoch": 0.24, + "grad_norm": 1.733938010659696, + "learning_rate": 1.7812969995872044e-05, + "loss": 1.4243, + "step": 4959 + }, + { + "epoch": 0.24, + "grad_norm": 1.242460300113702, + "learning_rate": 1.781199759250814e-05, + "loss": 1.1504, + "step": 4960 + }, + { + "epoch": 0.24, + "grad_norm": 1.2739475123840172, + "learning_rate": 1.7811024999570023e-05, + "loss": 1.2847, + "step": 4961 + }, + { + "epoch": 0.24, + "grad_norm": 1.1079936455784112, + "learning_rate": 1.7810052217081306e-05, + "loss": 1.2529, + "step": 4962 + }, + { + "epoch": 0.24, + "grad_norm": 1.1726863997792616, + "learning_rate": 1.7809079245065586e-05, + "loss": 1.1323, + "step": 4963 + }, + { + "epoch": 0.24, + "grad_norm": 1.2875414212580971, + "learning_rate": 1.7808106083546478e-05, + "loss": 1.1851, + "step": 4964 + }, + { + "epoch": 0.24, + "grad_norm": 1.504052652793305, + "learning_rate": 1.7807132732547603e-05, + "loss": 1.272, + "step": 4965 + }, + { + "epoch": 0.24, + "grad_norm": 1.2237363519298097, + "learning_rate": 1.7806159192092575e-05, + "loss": 1.1836, + "step": 4966 + }, + { + "epoch": 0.24, + "grad_norm": 1.1819063182836653, + "learning_rate": 1.7805185462205018e-05, + "loss": 1.147, + "step": 4967 + }, + { + "epoch": 0.24, + "grad_norm": 1.192661765064858, + "learning_rate": 1.7804211542908568e-05, + "loss": 0.9597, + "step": 4968 + }, + { + "epoch": 0.24, + "grad_norm": 1.540314064050184, + "learning_rate": 1.780323743422685e-05, + "loss": 1.1479, + "step": 4969 + }, + { + "epoch": 0.24, + "grad_norm": 0.6709026019589093, + "learning_rate": 1.7802263136183514e-05, + "loss": 1.1641, + "step": 4970 + }, + { + "epoch": 0.24, + "grad_norm": 1.4903600438363398, + "learning_rate": 1.780128864880219e-05, + "loss": 1.1523, + "step": 4971 + }, + { + "epoch": 0.24, + "grad_norm": 1.1248822665318277, + "learning_rate": 1.780031397210654e-05, + "loss": 1.0879, + "step": 4972 + }, + { + "epoch": 0.24, + "grad_norm": 0.8927747292997149, + "learning_rate": 1.7799339106120205e-05, + "loss": 1.2217, + "step": 4973 + }, + { + "epoch": 0.24, + "grad_norm": 1.2911374662243948, + "learning_rate": 1.7798364050866853e-05, + "loss": 1.3936, + "step": 4974 + }, + { + "epoch": 0.24, + "grad_norm": 1.2146881223244403, + "learning_rate": 1.7797388806370132e-05, + "loss": 1.2178, + "step": 4975 + }, + { + "epoch": 0.24, + "grad_norm": 1.308162937555163, + "learning_rate": 1.779641337265372e-05, + "loss": 1.1597, + "step": 4976 + }, + { + "epoch": 0.24, + "grad_norm": 1.252103453433163, + "learning_rate": 1.7795437749741283e-05, + "loss": 1.1172, + "step": 4977 + }, + { + "epoch": 0.24, + "grad_norm": 1.2446202226452117, + "learning_rate": 1.77944619376565e-05, + "loss": 1.2456, + "step": 4978 + }, + { + "epoch": 0.24, + "grad_norm": 1.2668285964413382, + "learning_rate": 1.7793485936423045e-05, + "loss": 1.1309, + "step": 4979 + }, + { + "epoch": 0.24, + "grad_norm": 1.041425536357868, + "learning_rate": 1.7792509746064608e-05, + "loss": 1.1221, + "step": 4980 + }, + { + "epoch": 0.24, + "grad_norm": 1.226855119693531, + "learning_rate": 1.7791533366604876e-05, + "loss": 1.0796, + "step": 4981 + }, + { + "epoch": 0.24, + "grad_norm": 1.283223034793816, + "learning_rate": 1.7790556798067543e-05, + "loss": 1.1938, + "step": 4982 + }, + { + "epoch": 0.24, + "grad_norm": 1.207181001199412, + "learning_rate": 1.7789580040476305e-05, + "loss": 1.3042, + "step": 4983 + }, + { + "epoch": 0.24, + "grad_norm": 1.3137804938992095, + "learning_rate": 1.7788603093854872e-05, + "loss": 1.1812, + "step": 4984 + }, + { + "epoch": 0.24, + "grad_norm": 1.1236509437566278, + "learning_rate": 1.7787625958226947e-05, + "loss": 1.1831, + "step": 4985 + }, + { + "epoch": 0.24, + "grad_norm": 0.9347268155601564, + "learning_rate": 1.778664863361624e-05, + "loss": 1.1436, + "step": 4986 + }, + { + "epoch": 0.24, + "grad_norm": 1.3955904084815802, + "learning_rate": 1.7785671120046472e-05, + "loss": 1.2495, + "step": 4987 + }, + { + "epoch": 0.24, + "grad_norm": 1.2040355581246427, + "learning_rate": 1.7784693417541364e-05, + "loss": 1.1831, + "step": 4988 + }, + { + "epoch": 0.24, + "grad_norm": 1.2946661412713283, + "learning_rate": 1.7783715526124637e-05, + "loss": 1.2144, + "step": 4989 + }, + { + "epoch": 0.24, + "grad_norm": 1.182121091033394, + "learning_rate": 1.778273744582003e-05, + "loss": 1.1416, + "step": 4990 + }, + { + "epoch": 0.24, + "grad_norm": 1.227929835923367, + "learning_rate": 1.7781759176651273e-05, + "loss": 1.145, + "step": 4991 + }, + { + "epoch": 0.24, + "grad_norm": 1.5166415741911123, + "learning_rate": 1.77807807186421e-05, + "loss": 1.1904, + "step": 4992 + }, + { + "epoch": 0.24, + "grad_norm": 1.1879650104998185, + "learning_rate": 1.777980207181627e-05, + "loss": 1.2212, + "step": 4993 + }, + { + "epoch": 0.24, + "grad_norm": 1.525328249469443, + "learning_rate": 1.7778823236197515e-05, + "loss": 1.4395, + "step": 4994 + }, + { + "epoch": 0.24, + "grad_norm": 1.4704145207756008, + "learning_rate": 1.7777844211809602e-05, + "loss": 1.3623, + "step": 4995 + }, + { + "epoch": 0.24, + "grad_norm": 1.4389811187367623, + "learning_rate": 1.7776864998676284e-05, + "loss": 1.0876, + "step": 4996 + }, + { + "epoch": 0.24, + "grad_norm": 1.496230803623764, + "learning_rate": 1.777588559682132e-05, + "loss": 1.1997, + "step": 4997 + }, + { + "epoch": 0.24, + "grad_norm": 1.2646560804873412, + "learning_rate": 1.7774906006268482e-05, + "loss": 1.1143, + "step": 4998 + }, + { + "epoch": 0.24, + "grad_norm": 1.2233090355096585, + "learning_rate": 1.777392622704154e-05, + "loss": 1.2925, + "step": 4999 + }, + { + "epoch": 0.24, + "grad_norm": 1.0835058930794512, + "learning_rate": 1.7772946259164272e-05, + "loss": 1.0217, + "step": 5000 + } + ], + "logging_steps": 1.0, + "max_steps": 20791, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "total_flos": 245260153888768.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}