diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100755--- "a/trainer_state.json" +++ /dev/null @@ -1,13560 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.998172403289674, - "eval_steps": 25.0, - "global_step": 1230, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "full_loss": 0.6115, - "grad_norm": 11.4375, - "learning_rate": 6.756756756756758e-07, - "long_answer_loss": 0.6115, - "loss": 0.6482, - "short_answer_loss": NaN, - "step": 1, - "template_loss": 0.0 - }, - { - "epoch": 0.0, - "full_loss": 0.6372, - "grad_norm": 11.3125, - "learning_rate": 1.3513513513513515e-06, - "long_answer_loss": 0.6372, - "loss": 0.6202, - "short_answer_loss": NaN, - "step": 2, - "template_loss": 0.0 - }, - { - "epoch": 0.0, - "full_loss": 0.7199, - "grad_norm": 10.75, - "learning_rate": 2.0270270270270273e-06, - "long_answer_loss": 0.7199, - "loss": 0.6418, - "short_answer_loss": NaN, - "step": 3, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.6846, - "grad_norm": 10.625, - "learning_rate": 2.702702702702703e-06, - "long_answer_loss": 0.6846, - "loss": 0.6412, - "short_answer_loss": NaN, - "step": 4, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.5437, - "grad_norm": 9.5625, - "learning_rate": 3.3783783783783788e-06, - "long_answer_loss": 0.5437, - "loss": 0.5699, - "short_answer_loss": NaN, - "step": 5, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.5362, - "grad_norm": 8.3125, - "learning_rate": 4.0540540540540545e-06, - "long_answer_loss": 0.5362, - "loss": 0.5227, - "short_answer_loss": NaN, - "step": 6, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.4596, - "grad_norm": 7.0625, - "learning_rate": 4.72972972972973e-06, - "long_answer_loss": 0.4596, - "loss": 0.489, - "short_answer_loss": NaN, - "step": 7, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.3601, - "grad_norm": 5.3125, - "learning_rate": 5.405405405405406e-06, - "long_answer_loss": 0.3601, - "loss": 0.3993, - "short_answer_loss": NaN, - "step": 8, - "template_loss": 0.0 - }, - { - "epoch": 0.01, - "full_loss": 0.361, - "grad_norm": 6.03125, - "learning_rate": 6.081081081081082e-06, - "long_answer_loss": 0.361, - "loss": 0.3465, - "short_answer_loss": NaN, - "step": 9, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.3461, - "grad_norm": 5.78125, - "learning_rate": 6.7567567567567575e-06, - "long_answer_loss": 0.3461, - "loss": 0.3391, - "short_answer_loss": NaN, - "step": 10, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.3227, - "grad_norm": 5.0625, - "learning_rate": 7.432432432432433e-06, - "long_answer_loss": 0.3227, - "loss": 0.3084, - "short_answer_loss": NaN, - "step": 11, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2806, - "grad_norm": 4.25, - "learning_rate": 8.108108108108109e-06, - "long_answer_loss": 0.2806, - "loss": 0.3001, - "short_answer_loss": NaN, - "step": 12, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.3417, - "grad_norm": 3.46875, - "learning_rate": 8.783783783783785e-06, - "long_answer_loss": 0.3417, - "loss": 0.3107, - "short_answer_loss": NaN, - "step": 13, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.3048, - "grad_norm": 3.0625, - "learning_rate": 9.45945945945946e-06, - "long_answer_loss": 0.3048, - "loss": 0.2865, - "short_answer_loss": NaN, - "step": 14, - "template_loss": 0.0 - }, - { - "epoch": 0.02, - "full_loss": 0.2859, - "grad_norm": 2.953125, - "learning_rate": 1.0135135135135136e-05, - "long_answer_loss": 0.2859, - "loss": 0.2784, - "short_answer_loss": NaN, - "step": 15, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.3344, - "grad_norm": 3.140625, - "learning_rate": 1.0810810810810812e-05, - "long_answer_loss": 0.3344, - "loss": 0.2624, - "short_answer_loss": NaN, - "step": 16, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2341, - "grad_norm": 3.15625, - "learning_rate": 1.1486486486486488e-05, - "long_answer_loss": 0.2341, - "loss": 0.2604, - "short_answer_loss": NaN, - "step": 17, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2536, - "grad_norm": 2.796875, - "learning_rate": 1.2162162162162164e-05, - "long_answer_loss": 0.2536, - "loss": 0.2468, - "short_answer_loss": NaN, - "step": 18, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2628, - "grad_norm": 2.828125, - "learning_rate": 1.2837837837837838e-05, - "long_answer_loss": 0.2628, - "loss": 0.2661, - "short_answer_loss": NaN, - "step": 19, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2042, - "grad_norm": 2.875, - "learning_rate": 1.3513513513513515e-05, - "long_answer_loss": 0.2042, - "loss": 0.2447, - "short_answer_loss": NaN, - "step": 20, - "template_loss": 0.0 - }, - { - "epoch": 0.03, - "full_loss": 0.2151, - "grad_norm": 2.40625, - "learning_rate": 1.4189189189189189e-05, - "long_answer_loss": 0.2151, - "loss": 0.233, - "short_answer_loss": NaN, - "step": 21, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.2608, - "grad_norm": 2.734375, - "learning_rate": 1.4864864864864867e-05, - "long_answer_loss": 0.2608, - "loss": 0.2464, - "short_answer_loss": NaN, - "step": 22, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.3018, - "grad_norm": 2.109375, - "learning_rate": 1.554054054054054e-05, - "long_answer_loss": 0.3018, - "loss": 0.2438, - "short_answer_loss": NaN, - "step": 23, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.2322, - "grad_norm": 2.59375, - "learning_rate": 1.6216216216216218e-05, - "long_answer_loss": 0.2322, - "loss": 0.2321, - "short_answer_loss": NaN, - "step": 24, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.2401, - "grad_norm": 2.40625, - "learning_rate": 1.6891891891891892e-05, - "long_answer_loss": 0.2401, - "loss": 0.2233, - "short_answer_loss": NaN, - "step": 25, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.2226, - "grad_norm": 2.609375, - "learning_rate": 1.756756756756757e-05, - "long_answer_loss": 0.2226, - "loss": 0.2298, - "short_answer_loss": NaN, - "step": 26, - "template_loss": 0.0 - }, - { - "epoch": 0.04, - "full_loss": 0.1849, - "grad_norm": 2.5, - "learning_rate": 1.8243243243243244e-05, - "long_answer_loss": 0.1849, - "loss": 0.2131, - "short_answer_loss": NaN, - "step": 27, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.2093, - "grad_norm": 2.421875, - "learning_rate": 1.891891891891892e-05, - "long_answer_loss": 0.2093, - "loss": 0.2247, - "short_answer_loss": NaN, - "step": 28, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.2368, - "grad_norm": 2.59375, - "learning_rate": 1.9594594594594595e-05, - "long_answer_loss": 0.2368, - "loss": 0.2149, - "short_answer_loss": NaN, - "step": 29, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.2029, - "grad_norm": 2.609375, - "learning_rate": 2.0270270270270273e-05, - "long_answer_loss": 0.2029, - "loss": 0.2181, - "short_answer_loss": NaN, - "step": 30, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.2043, - "grad_norm": 2.484375, - "learning_rate": 2.0945945945945947e-05, - "long_answer_loss": 0.2043, - "loss": 0.2165, - "short_answer_loss": NaN, - "step": 31, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.2517, - "grad_norm": 2.546875, - "learning_rate": 2.1621621621621624e-05, - "long_answer_loss": 0.2517, - "loss": 0.2321, - "short_answer_loss": NaN, - "step": 32, - "template_loss": 0.0 - }, - { - "epoch": 0.05, - "full_loss": 0.2206, - "grad_norm": 2.484375, - "learning_rate": 2.2297297297297298e-05, - "long_answer_loss": 0.2206, - "loss": 0.2293, - "short_answer_loss": NaN, - "step": 33, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2023, - "grad_norm": 2.375, - "learning_rate": 2.2972972972972976e-05, - "long_answer_loss": 0.2023, - "loss": 0.2153, - "short_answer_loss": NaN, - "step": 34, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2392, - "grad_norm": 2.796875, - "learning_rate": 2.364864864864865e-05, - "long_answer_loss": 0.2392, - "loss": 0.2223, - "short_answer_loss": NaN, - "step": 35, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2382, - "grad_norm": 2.578125, - "learning_rate": 2.4324324324324327e-05, - "long_answer_loss": 0.2382, - "loss": 0.2365, - "short_answer_loss": NaN, - "step": 36, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2238, - "grad_norm": 2.375, - "learning_rate": 2.5e-05, - "long_answer_loss": 0.2238, - "loss": 0.2225, - "short_answer_loss": NaN, - "step": 37, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2399, - "grad_norm": 2.65625, - "learning_rate": 2.499995665903025e-05, - "long_answer_loss": 0.2399, - "loss": 0.2232, - "short_answer_loss": NaN, - "step": 38, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.2196, - "grad_norm": 2.65625, - "learning_rate": 2.4999826636421537e-05, - "long_answer_loss": 0.2196, - "loss": 0.2163, - "short_answer_loss": NaN, - "step": 39, - "template_loss": 0.0 - }, - { - "epoch": 0.06, - "full_loss": 0.1898, - "grad_norm": 2.28125, - "learning_rate": 2.4999609933075525e-05, - "long_answer_loss": 0.1898, - "loss": 0.2222, - "short_answer_loss": NaN, - "step": 40, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.2008, - "grad_norm": 2.6875, - "learning_rate": 2.4999306550494938e-05, - "long_answer_loss": 0.2008, - "loss": 0.2219, - "short_answer_loss": NaN, - "step": 41, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.2256, - "grad_norm": 2.921875, - "learning_rate": 2.4998916490783615e-05, - "long_answer_loss": 0.2256, - "loss": 0.2265, - "short_answer_loss": NaN, - "step": 42, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.2206, - "grad_norm": 2.640625, - "learning_rate": 2.499843975664644e-05, - "long_answer_loss": 0.2206, - "loss": 0.2157, - "short_answer_loss": NaN, - "step": 43, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.2298, - "grad_norm": 2.453125, - "learning_rate": 2.499787635138935e-05, - "long_answer_loss": 0.2298, - "loss": 0.2231, - "short_answer_loss": NaN, - "step": 44, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1787, - "grad_norm": 2.171875, - "learning_rate": 2.4997226278919313e-05, - "long_answer_loss": 0.1787, - "loss": 0.207, - "short_answer_loss": NaN, - "step": 45, - "template_loss": 0.0 - }, - { - "epoch": 0.07, - "full_loss": 0.1969, - "grad_norm": 2.515625, - "learning_rate": 2.499648954374429e-05, - "long_answer_loss": 0.1969, - "loss": 0.2057, - "short_answer_loss": NaN, - "step": 46, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.2109, - "grad_norm": 2.234375, - "learning_rate": 2.4995666150973213e-05, - "long_answer_loss": 0.2109, - "loss": 0.2189, - "short_answer_loss": NaN, - "step": 47, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.2601, - "grad_norm": 2.34375, - "learning_rate": 2.4994756106315946e-05, - "long_answer_loss": 0.2601, - "loss": 0.2189, - "short_answer_loss": NaN, - "step": 48, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1932, - "grad_norm": 2.203125, - "learning_rate": 2.4993759416083243e-05, - "long_answer_loss": 0.1932, - "loss": 0.2043, - "short_answer_loss": NaN, - "step": 49, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.2208, - "grad_norm": 2.3125, - "learning_rate": 2.4992676087186707e-05, - "long_answer_loss": 0.2208, - "loss": 0.2145, - "short_answer_loss": NaN, - "step": 50, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.2448, - "grad_norm": 2.296875, - "learning_rate": 2.4991506127138743e-05, - "long_answer_loss": 0.2448, - "loss": 0.2071, - "short_answer_loss": NaN, - "step": 51, - "template_loss": 0.0 - }, - { - "epoch": 0.08, - "full_loss": 0.1996, - "grad_norm": 2.265625, - "learning_rate": 2.4990249544052495e-05, - "long_answer_loss": 0.1996, - "loss": 0.2221, - "short_answer_loss": NaN, - "step": 52, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.2223, - "grad_norm": 2.203125, - "learning_rate": 2.4988906346641824e-05, - "long_answer_loss": 0.2223, - "loss": 0.2183, - "short_answer_loss": NaN, - "step": 53, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.2183, - "grad_norm": 2.421875, - "learning_rate": 2.4987476544221195e-05, - "long_answer_loss": 0.2183, - "loss": 0.2144, - "short_answer_loss": NaN, - "step": 54, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.2327, - "grad_norm": 2.25, - "learning_rate": 2.4985960146705657e-05, - "long_answer_loss": 0.2327, - "loss": 0.2208, - "short_answer_loss": NaN, - "step": 55, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.1777, - "grad_norm": 2.0, - "learning_rate": 2.4984357164610743e-05, - "long_answer_loss": 0.1777, - "loss": 0.2029, - "short_answer_loss": NaN, - "step": 56, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.2131, - "grad_norm": 2.359375, - "learning_rate": 2.4982667609052434e-05, - "long_answer_loss": 0.2131, - "loss": 0.2168, - "short_answer_loss": NaN, - "step": 57, - "template_loss": 0.0 - }, - { - "epoch": 0.09, - "full_loss": 0.2355, - "grad_norm": 2.40625, - "learning_rate": 2.4980891491747036e-05, - "long_answer_loss": 0.2355, - "loss": 0.228, - "short_answer_loss": NaN, - "step": 58, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.2016, - "grad_norm": 2.140625, - "learning_rate": 2.4979028825011137e-05, - "long_answer_loss": 0.2016, - "loss": 0.2088, - "short_answer_loss": NaN, - "step": 59, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.197, - "grad_norm": 2.265625, - "learning_rate": 2.49770796217615e-05, - "long_answer_loss": 0.197, - "loss": 0.2219, - "short_answer_loss": NaN, - "step": 60, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.2062, - "grad_norm": 2.109375, - "learning_rate": 2.4975043895514987e-05, - "long_answer_loss": 0.2062, - "loss": 0.2069, - "short_answer_loss": NaN, - "step": 61, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.2231, - "grad_norm": 2.125, - "learning_rate": 2.4972921660388448e-05, - "long_answer_loss": 0.2231, - "loss": 0.2046, - "short_answer_loss": NaN, - "step": 62, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.2231, - "grad_norm": 2.25, - "learning_rate": 2.4970712931098644e-05, - "long_answer_loss": 0.2231, - "loss": 0.2077, - "short_answer_loss": NaN, - "step": 63, - "template_loss": 0.0 - }, - { - "epoch": 0.1, - "full_loss": 0.2322, - "grad_norm": 2.234375, - "learning_rate": 2.496841772296213e-05, - "long_answer_loss": 0.2322, - "loss": 0.2194, - "short_answer_loss": NaN, - "step": 64, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2073, - "grad_norm": 2.140625, - "learning_rate": 2.4966036051895147e-05, - "long_answer_loss": 0.2073, - "loss": 0.2224, - "short_answer_loss": NaN, - "step": 65, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.1991, - "grad_norm": 2.015625, - "learning_rate": 2.4963567934413533e-05, - "long_answer_loss": 0.1991, - "loss": 0.2086, - "short_answer_loss": NaN, - "step": 66, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2302, - "grad_norm": 2.140625, - "learning_rate": 2.4961013387632583e-05, - "long_answer_loss": 0.2302, - "loss": 0.2187, - "short_answer_loss": NaN, - "step": 67, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2221, - "grad_norm": 2.125, - "learning_rate": 2.4958372429266934e-05, - "long_answer_loss": 0.2221, - "loss": 0.2174, - "short_answer_loss": NaN, - "step": 68, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2164, - "grad_norm": 2.28125, - "learning_rate": 2.495564507763047e-05, - "long_answer_loss": 0.2164, - "loss": 0.2186, - "short_answer_loss": NaN, - "step": 69, - "template_loss": 0.0 - }, - { - "epoch": 0.11, - "full_loss": 0.2571, - "grad_norm": 2.125, - "learning_rate": 2.495283135163615e-05, - "long_answer_loss": 0.2571, - "loss": 0.217, - "short_answer_loss": NaN, - "step": 70, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.2136, - "grad_norm": 2.25, - "learning_rate": 2.494993127079592e-05, - "long_answer_loss": 0.2136, - "loss": 0.2178, - "short_answer_loss": NaN, - "step": 71, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1974, - "grad_norm": 2.140625, - "learning_rate": 2.494694485522055e-05, - "long_answer_loss": 0.1974, - "loss": 0.2117, - "short_answer_loss": NaN, - "step": 72, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.2323, - "grad_norm": 2.125, - "learning_rate": 2.49438721256195e-05, - "long_answer_loss": 0.2323, - "loss": 0.2239, - "short_answer_loss": NaN, - "step": 73, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.2445, - "grad_norm": 2.234375, - "learning_rate": 2.4940713103300783e-05, - "long_answer_loss": 0.2445, - "loss": 0.2155, - "short_answer_loss": NaN, - "step": 74, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.2139, - "grad_norm": 2.046875, - "learning_rate": 2.4937467810170818e-05, - "long_answer_loss": 0.2139, - "loss": 0.2217, - "short_answer_loss": NaN, - "step": 75, - "template_loss": 0.0 - }, - { - "epoch": 0.12, - "full_loss": 0.1922, - "grad_norm": 2.15625, - "learning_rate": 2.4934136268734265e-05, - "long_answer_loss": 0.1922, - "loss": 0.2067, - "short_answer_loss": NaN, - "step": 76, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.1909, - "grad_norm": 2.203125, - "learning_rate": 2.493071850209388e-05, - "long_answer_loss": 0.1909, - "loss": 0.2108, - "short_answer_loss": NaN, - "step": 77, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.2392, - "grad_norm": 2.171875, - "learning_rate": 2.4927214533950362e-05, - "long_answer_loss": 0.2392, - "loss": 0.2229, - "short_answer_loss": NaN, - "step": 78, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.2388, - "grad_norm": 2.078125, - "learning_rate": 2.4923624388602164e-05, - "long_answer_loss": 0.2388, - "loss": 0.2164, - "short_answer_loss": NaN, - "step": 79, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.2281, - "grad_norm": 2.28125, - "learning_rate": 2.491994809094535e-05, - "long_answer_loss": 0.2281, - "loss": 0.2121, - "short_answer_loss": NaN, - "step": 80, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.2057, - "grad_norm": 2.21875, - "learning_rate": 2.4916185666473413e-05, - "long_answer_loss": 0.2057, - "loss": 0.2131, - "short_answer_loss": NaN, - "step": 81, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.2526, - "grad_norm": 2.265625, - "learning_rate": 2.4912337141277083e-05, - "long_answer_loss": 0.2526, - "loss": 0.2175, - "short_answer_loss": NaN, - "step": 82, - "template_loss": 0.0 - }, - { - "epoch": 0.13, - "full_loss": 0.2265, - "grad_norm": 2.296875, - "learning_rate": 2.4908402542044178e-05, - "long_answer_loss": 0.2265, - "loss": 0.2176, - "short_answer_loss": NaN, - "step": 83, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.2284, - "grad_norm": 2.21875, - "learning_rate": 2.4904381896059393e-05, - "long_answer_loss": 0.2284, - "loss": 0.2216, - "short_answer_loss": NaN, - "step": 84, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.2304, - "grad_norm": 2.140625, - "learning_rate": 2.490027523120412e-05, - "long_answer_loss": 0.2304, - "loss": 0.2233, - "short_answer_loss": NaN, - "step": 85, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.2291, - "grad_norm": 2.21875, - "learning_rate": 2.4896082575956242e-05, - "long_answer_loss": 0.2291, - "loss": 0.2229, - "short_answer_loss": NaN, - "step": 86, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1959, - "grad_norm": 2.109375, - "learning_rate": 2.4891803959389973e-05, - "long_answer_loss": 0.1959, - "loss": 0.2194, - "short_answer_loss": NaN, - "step": 87, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1872, - "grad_norm": 1.8359375, - "learning_rate": 2.4887439411175605e-05, - "long_answer_loss": 0.1872, - "loss": 0.2153, - "short_answer_loss": NaN, - "step": 88, - "template_loss": 0.0 - }, - { - "epoch": 0.14, - "full_loss": 0.1849, - "grad_norm": 2.03125, - "learning_rate": 2.488298896157935e-05, - "long_answer_loss": 0.1849, - "loss": 0.2242, - "short_answer_loss": NaN, - "step": 89, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1916, - "grad_norm": 1.9765625, - "learning_rate": 2.4878452641463083e-05, - "long_answer_loss": 0.1916, - "loss": 0.2008, - "short_answer_loss": NaN, - "step": 90, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.2591, - "grad_norm": 1.9765625, - "learning_rate": 2.4873830482284173e-05, - "long_answer_loss": 0.2591, - "loss": 0.2244, - "short_answer_loss": NaN, - "step": 91, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.2048, - "grad_norm": 2.078125, - "learning_rate": 2.486912251609524e-05, - "long_answer_loss": 0.2048, - "loss": 0.2217, - "short_answer_loss": NaN, - "step": 92, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.1927, - "grad_norm": 2.171875, - "learning_rate": 2.4864328775543927e-05, - "long_answer_loss": 0.1927, - "loss": 0.2027, - "short_answer_loss": NaN, - "step": 93, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.2159, - "grad_norm": 2.03125, - "learning_rate": 2.48594492938727e-05, - "long_answer_loss": 0.2159, - "loss": 0.2074, - "short_answer_loss": NaN, - "step": 94, - "template_loss": 0.0 - }, - { - "epoch": 0.15, - "full_loss": 0.2118, - "grad_norm": 2.046875, - "learning_rate": 2.485448410491859e-05, - "long_answer_loss": 0.2118, - "loss": 0.2206, - "short_answer_loss": NaN, - "step": 95, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.2104, - "grad_norm": 2.125, - "learning_rate": 2.4849433243112977e-05, - "long_answer_loss": 0.2104, - "loss": 0.2178, - "short_answer_loss": NaN, - "step": 96, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.2231, - "grad_norm": 1.9296875, - "learning_rate": 2.4844296743481338e-05, - "long_answer_loss": 0.2231, - "loss": 0.2093, - "short_answer_loss": NaN, - "step": 97, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.2079, - "grad_norm": 2.1875, - "learning_rate": 2.4839074641643012e-05, - "long_answer_loss": 0.2079, - "loss": 0.2246, - "short_answer_loss": NaN, - "step": 98, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1989, - "grad_norm": 1.9921875, - "learning_rate": 2.4833766973810953e-05, - "long_answer_loss": 0.1989, - "loss": 0.2094, - "short_answer_loss": NaN, - "step": 99, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.2063, - "grad_norm": 2.0, - "learning_rate": 2.482837377679148e-05, - "long_answer_loss": 0.2063, - "loss": 0.2056, - "short_answer_loss": NaN, - "step": 100, - "template_loss": 0.0 - }, - { - "epoch": 0.16, - "full_loss": 0.1934, - "grad_norm": 1.9375, - "learning_rate": 2.482289508798401e-05, - "long_answer_loss": 0.1934, - "loss": 0.2093, - "short_answer_loss": NaN, - "step": 101, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.2258, - "grad_norm": 2.15625, - "learning_rate": 2.4817330945380817e-05, - "long_answer_loss": 0.2258, - "loss": 0.2083, - "short_answer_loss": NaN, - "step": 102, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.1891, - "grad_norm": 1.96875, - "learning_rate": 2.4811681387566755e-05, - "long_answer_loss": 0.1891, - "loss": 0.213, - "short_answer_loss": NaN, - "step": 103, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.202, - "grad_norm": 1.859375, - "learning_rate": 2.4805946453718987e-05, - "long_answer_loss": 0.202, - "loss": 0.2108, - "short_answer_loss": NaN, - "step": 104, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.2068, - "grad_norm": 2.03125, - "learning_rate": 2.4800126183606735e-05, - "long_answer_loss": 0.2068, - "loss": 0.2098, - "short_answer_loss": NaN, - "step": 105, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.2159, - "grad_norm": 2.03125, - "learning_rate": 2.4794220617590985e-05, - "long_answer_loss": 0.2159, - "loss": 0.2147, - "short_answer_loss": NaN, - "step": 106, - "template_loss": 0.0 - }, - { - "epoch": 0.17, - "full_loss": 0.203, - "grad_norm": 1.828125, - "learning_rate": 2.47882297966242e-05, - "long_answer_loss": 0.203, - "loss": 0.1986, - "short_answer_loss": NaN, - "step": 107, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.2326, - "grad_norm": 2.03125, - "learning_rate": 2.478215376225007e-05, - "long_answer_loss": 0.2326, - "loss": 0.2143, - "short_answer_loss": NaN, - "step": 108, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.2194, - "grad_norm": 2.03125, - "learning_rate": 2.4775992556603188e-05, - "long_answer_loss": 0.2194, - "loss": 0.2204, - "short_answer_loss": NaN, - "step": 109, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1908, - "grad_norm": 1.921875, - "learning_rate": 2.476974622240877e-05, - "long_answer_loss": 0.1908, - "loss": 0.2131, - "short_answer_loss": NaN, - "step": 110, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.2184, - "grad_norm": 1.9609375, - "learning_rate": 2.4763414802982364e-05, - "long_answer_loss": 0.2184, - "loss": 0.2103, - "short_answer_loss": NaN, - "step": 111, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.1973, - "grad_norm": 1.90625, - "learning_rate": 2.4756998342229552e-05, - "long_answer_loss": 0.1973, - "loss": 0.1993, - "short_answer_loss": NaN, - "step": 112, - "template_loss": 0.0 - }, - { - "epoch": 0.18, - "full_loss": 0.203, - "grad_norm": 1.9453125, - "learning_rate": 2.4750496884645637e-05, - "long_answer_loss": 0.203, - "loss": 0.2125, - "short_answer_loss": NaN, - "step": 113, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.2308, - "grad_norm": 2.140625, - "learning_rate": 2.474391047531533e-05, - "long_answer_loss": 0.2308, - "loss": 0.2103, - "short_answer_loss": NaN, - "step": 114, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.176, - "grad_norm": 2.015625, - "learning_rate": 2.4737239159912452e-05, - "long_answer_loss": 0.176, - "loss": 0.2066, - "short_answer_loss": NaN, - "step": 115, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.2447, - "grad_norm": 2.046875, - "learning_rate": 2.4730482984699603e-05, - "long_answer_loss": 0.2447, - "loss": 0.213, - "short_answer_loss": NaN, - "step": 116, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1966, - "grad_norm": 2.015625, - "learning_rate": 2.4723641996527863e-05, - "long_answer_loss": 0.1966, - "loss": 0.2085, - "short_answer_loss": NaN, - "step": 117, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.2139, - "grad_norm": 1.96875, - "learning_rate": 2.4716716242836432e-05, - "long_answer_loss": 0.2139, - "loss": 0.2043, - "short_answer_loss": NaN, - "step": 118, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.2305, - "grad_norm": 2.03125, - "learning_rate": 2.4709705771652336e-05, - "long_answer_loss": 0.2305, - "loss": 0.2182, - "short_answer_loss": NaN, - "step": 119, - "template_loss": 0.0 - }, - { - "epoch": 0.19, - "full_loss": 0.1965, - "grad_norm": 1.84375, - "learning_rate": 2.4702610631590073e-05, - "long_answer_loss": 0.1965, - "loss": 0.2, - "short_answer_loss": NaN, - "step": 120, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.234, - "grad_norm": 1.9453125, - "learning_rate": 2.4695430871851283e-05, - "long_answer_loss": 0.234, - "loss": 0.2136, - "short_answer_loss": NaN, - "step": 121, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.19, - "grad_norm": 2.125, - "learning_rate": 2.4688166542224403e-05, - "long_answer_loss": 0.19, - "loss": 0.2076, - "short_answer_loss": NaN, - "step": 122, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.2186, - "grad_norm": 1.9609375, - "learning_rate": 2.4680817693084332e-05, - "long_answer_loss": 0.2186, - "loss": 0.2162, - "short_answer_loss": NaN, - "step": 123, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.234, - "grad_norm": 2.3125, - "learning_rate": 2.467338437539207e-05, - "long_answer_loss": 0.234, - "loss": 0.2225, - "short_answer_loss": NaN, - "step": 124, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.229, - "grad_norm": 2.046875, - "learning_rate": 2.4665866640694367e-05, - "long_answer_loss": 0.229, - "loss": 0.2066, - "short_answer_loss": NaN, - "step": 125, - "template_loss": 0.0 - }, - { - "epoch": 0.2, - "full_loss": 0.2273, - "grad_norm": 1.921875, - "learning_rate": 2.4658264541123365e-05, - "long_answer_loss": 0.2273, - "loss": 0.2122, - "short_answer_loss": NaN, - "step": 126, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.2163, - "grad_norm": 2.109375, - "learning_rate": 2.4650578129396247e-05, - "long_answer_loss": 0.2163, - "loss": 0.2091, - "short_answer_loss": NaN, - "step": 127, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.2265, - "grad_norm": 1.796875, - "learning_rate": 2.4642807458814864e-05, - "long_answer_loss": 0.2265, - "loss": 0.2126, - "short_answer_loss": NaN, - "step": 128, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1896, - "grad_norm": 2.09375, - "learning_rate": 2.463495258326535e-05, - "long_answer_loss": 0.1896, - "loss": 0.2037, - "short_answer_loss": NaN, - "step": 129, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.2125, - "grad_norm": 2.015625, - "learning_rate": 2.4627013557217777e-05, - "long_answer_loss": 0.2125, - "loss": 0.1978, - "short_answer_loss": NaN, - "step": 130, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.2263, - "grad_norm": 1.8984375, - "learning_rate": 2.4618990435725758e-05, - "long_answer_loss": 0.2263, - "loss": 0.2285, - "short_answer_loss": NaN, - "step": 131, - "template_loss": 0.0 - }, - { - "epoch": 0.21, - "full_loss": 0.1875, - "grad_norm": 2.140625, - "learning_rate": 2.4610883274426076e-05, - "long_answer_loss": 0.1875, - "loss": 0.2163, - "short_answer_loss": NaN, - "step": 132, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.1998, - "grad_norm": 2.078125, - "learning_rate": 2.4602692129538286e-05, - "long_answer_loss": 0.1998, - "loss": 0.2166, - "short_answer_loss": NaN, - "step": 133, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.2387, - "grad_norm": 1.9140625, - "learning_rate": 2.4594417057864327e-05, - "long_answer_loss": 0.2387, - "loss": 0.2039, - "short_answer_loss": NaN, - "step": 134, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.2086, - "grad_norm": 1.9765625, - "learning_rate": 2.458605811678815e-05, - "long_answer_loss": 0.2086, - "loss": 0.2059, - "short_answer_loss": NaN, - "step": 135, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.2005, - "grad_norm": 2.015625, - "learning_rate": 2.4577615364275292e-05, - "long_answer_loss": 0.2005, - "loss": 0.1979, - "short_answer_loss": NaN, - "step": 136, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.2188, - "grad_norm": 1.8984375, - "learning_rate": 2.4569088858872478e-05, - "long_answer_loss": 0.2188, - "loss": 0.2078, - "short_answer_loss": NaN, - "step": 137, - "template_loss": 0.0 - }, - { - "epoch": 0.22, - "full_loss": 0.2044, - "grad_norm": 1.953125, - "learning_rate": 2.4560478659707236e-05, - "long_answer_loss": 0.2044, - "loss": 0.2228, - "short_answer_loss": NaN, - "step": 138, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1837, - "grad_norm": 2.015625, - "learning_rate": 2.4551784826487466e-05, - "long_answer_loss": 0.1837, - "loss": 0.2176, - "short_answer_loss": NaN, - "step": 139, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.2002, - "grad_norm": 1.828125, - "learning_rate": 2.4543007419501034e-05, - "long_answer_loss": 0.2002, - "loss": 0.2067, - "short_answer_loss": NaN, - "step": 140, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.227, - "grad_norm": 1.8671875, - "learning_rate": 2.4534146499615356e-05, - "long_answer_loss": 0.227, - "loss": 0.2045, - "short_answer_loss": NaN, - "step": 141, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.2361, - "grad_norm": 1.875, - "learning_rate": 2.4525202128276962e-05, - "long_answer_loss": 0.2361, - "loss": 0.2197, - "short_answer_loss": NaN, - "step": 142, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.1792, - "grad_norm": 1.921875, - "learning_rate": 2.4516174367511095e-05, - "long_answer_loss": 0.1792, - "loss": 0.2087, - "short_answer_loss": NaN, - "step": 143, - "template_loss": 0.0 - }, - { - "epoch": 0.23, - "full_loss": 0.2059, - "grad_norm": 1.875, - "learning_rate": 2.450706327992126e-05, - "long_answer_loss": 0.2059, - "loss": 0.2137, - "short_answer_loss": NaN, - "step": 144, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.213, - "grad_norm": 1.8046875, - "learning_rate": 2.4497868928688794e-05, - "long_answer_loss": 0.213, - "loss": 0.2005, - "short_answer_loss": NaN, - "step": 145, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.2122, - "grad_norm": 2.0, - "learning_rate": 2.4488591377572434e-05, - "long_answer_loss": 0.2122, - "loss": 0.2188, - "short_answer_loss": NaN, - "step": 146, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.2375, - "grad_norm": 1.78125, - "learning_rate": 2.4479230690907868e-05, - "long_answer_loss": 0.2375, - "loss": 0.22, - "short_answer_loss": NaN, - "step": 147, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.163, - "grad_norm": 1.96875, - "learning_rate": 2.4469786933607296e-05, - "long_answer_loss": 0.163, - "loss": 0.2075, - "short_answer_loss": NaN, - "step": 148, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.2084, - "grad_norm": 1.8671875, - "learning_rate": 2.4460260171158973e-05, - "long_answer_loss": 0.2084, - "loss": 0.2139, - "short_answer_loss": NaN, - "step": 149, - "template_loss": 0.0 - }, - { - "epoch": 0.24, - "full_loss": 0.2117, - "grad_norm": 1.8515625, - "learning_rate": 2.4450650469626758e-05, - "long_answer_loss": 0.2117, - "loss": 0.2037, - "short_answer_loss": NaN, - "step": 150, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.2042, - "grad_norm": 1.90625, - "learning_rate": 2.4440957895649658e-05, - "long_answer_loss": 0.2042, - "loss": 0.209, - "short_answer_loss": NaN, - "step": 151, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.214, - "grad_norm": 1.8671875, - "learning_rate": 2.4431182516441363e-05, - "long_answer_loss": 0.214, - "loss": 0.2001, - "short_answer_loss": NaN, - "step": 152, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1751, - "grad_norm": 1.8125, - "learning_rate": 2.4421324399789775e-05, - "long_answer_loss": 0.1751, - "loss": 0.218, - "short_answer_loss": NaN, - "step": 153, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.1983, - "grad_norm": 1.8671875, - "learning_rate": 2.4411383614056554e-05, - "long_answer_loss": 0.1983, - "loss": 0.2138, - "short_answer_loss": NaN, - "step": 154, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.2409, - "grad_norm": 1.7578125, - "learning_rate": 2.440136022817662e-05, - "long_answer_loss": 0.2409, - "loss": 0.2068, - "short_answer_loss": NaN, - "step": 155, - "template_loss": 0.0 - }, - { - "epoch": 0.25, - "full_loss": 0.2302, - "grad_norm": 1.75, - "learning_rate": 2.4391254311657698e-05, - "long_answer_loss": 0.2302, - "loss": 0.2146, - "short_answer_loss": NaN, - "step": 156, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.2086, - "grad_norm": 1.796875, - "learning_rate": 2.4381065934579827e-05, - "long_answer_loss": 0.2086, - "loss": 0.2047, - "short_answer_loss": NaN, - "step": 157, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.208, - "grad_norm": 1.890625, - "learning_rate": 2.4370795167594864e-05, - "long_answer_loss": 0.208, - "loss": 0.2037, - "short_answer_loss": NaN, - "step": 158, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.212, - "grad_norm": 1.828125, - "learning_rate": 2.4360442081926016e-05, - "long_answer_loss": 0.212, - "loss": 0.1997, - "short_answer_loss": NaN, - "step": 159, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.2122, - "grad_norm": 1.7578125, - "learning_rate": 2.435000674936732e-05, - "long_answer_loss": 0.2122, - "loss": 0.21, - "short_answer_loss": NaN, - "step": 160, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.2249, - "grad_norm": 1.8671875, - "learning_rate": 2.4339489242283166e-05, - "long_answer_loss": 0.2249, - "loss": 0.2093, - "short_answer_loss": NaN, - "step": 161, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.2184, - "grad_norm": 1.828125, - "learning_rate": 2.4328889633607794e-05, - "long_answer_loss": 0.2184, - "loss": 0.2061, - "short_answer_loss": NaN, - "step": 162, - "template_loss": 0.0 - }, - { - "epoch": 0.26, - "full_loss": 0.2118, - "grad_norm": 1.8125, - "learning_rate": 2.4318207996844767e-05, - "long_answer_loss": 0.2118, - "loss": 0.2087, - "short_answer_loss": NaN, - "step": 163, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1999, - "grad_norm": 1.7890625, - "learning_rate": 2.4307444406066488e-05, - "long_answer_loss": 0.1999, - "loss": 0.2065, - "short_answer_loss": NaN, - "step": 164, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1897, - "grad_norm": 1.6484375, - "learning_rate": 2.429659893591367e-05, - "long_answer_loss": 0.1897, - "loss": 0.2055, - "short_answer_loss": NaN, - "step": 165, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.2282, - "grad_norm": 1.8046875, - "learning_rate": 2.4285671661594827e-05, - "long_answer_loss": 0.2282, - "loss": 0.2235, - "short_answer_loss": NaN, - "step": 166, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.213, - "grad_norm": 1.7890625, - "learning_rate": 2.427466265888574e-05, - "long_answer_loss": 0.213, - "loss": 0.2049, - "short_answer_loss": NaN, - "step": 167, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.2374, - "grad_norm": 2.0625, - "learning_rate": 2.426357200412895e-05, - "long_answer_loss": 0.2374, - "loss": 0.2186, - "short_answer_loss": NaN, - "step": 168, - "template_loss": 0.0 - }, - { - "epoch": 0.27, - "full_loss": 0.1913, - "grad_norm": 1.890625, - "learning_rate": 2.4252399774233216e-05, - "long_answer_loss": 0.1913, - "loss": 0.1997, - "short_answer_loss": NaN, - "step": 169, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.2179, - "grad_norm": 1.8046875, - "learning_rate": 2.4241146046672972e-05, - "long_answer_loss": 0.2179, - "loss": 0.2187, - "short_answer_loss": NaN, - "step": 170, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1981, - "grad_norm": 1.9921875, - "learning_rate": 2.4229810899487824e-05, - "long_answer_loss": 0.1981, - "loss": 0.2142, - "short_answer_loss": NaN, - "step": 171, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1987, - "grad_norm": 1.8828125, - "learning_rate": 2.421839441128197e-05, - "long_answer_loss": 0.1987, - "loss": 0.2143, - "short_answer_loss": NaN, - "step": 172, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1928, - "grad_norm": 2.328125, - "learning_rate": 2.4206896661223676e-05, - "long_answer_loss": 0.1928, - "loss": 0.2202, - "short_answer_loss": NaN, - "step": 173, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.2017, - "grad_norm": 2.109375, - "learning_rate": 2.4195317729044732e-05, - "long_answer_loss": 0.2017, - "loss": 0.2074, - "short_answer_loss": NaN, - "step": 174, - "template_loss": 0.0 - }, - { - "epoch": 0.28, - "full_loss": 0.1975, - "grad_norm": 1.8515625, - "learning_rate": 2.4183657695039874e-05, - "long_answer_loss": 0.1975, - "loss": 0.2064, - "short_answer_loss": NaN, - "step": 175, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.2251, - "grad_norm": 1.953125, - "learning_rate": 2.417191664006625e-05, - "long_answer_loss": 0.2251, - "loss": 0.2147, - "short_answer_loss": NaN, - "step": 176, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.2047, - "grad_norm": 1.84375, - "learning_rate": 2.4160094645542857e-05, - "long_answer_loss": 0.2047, - "loss": 0.2036, - "short_answer_loss": NaN, - "step": 177, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.2278, - "grad_norm": 1.8125, - "learning_rate": 2.4148191793449974e-05, - "long_answer_loss": 0.2278, - "loss": 0.2078, - "short_answer_loss": NaN, - "step": 178, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.2469, - "grad_norm": 2.046875, - "learning_rate": 2.4136208166328573e-05, - "long_answer_loss": 0.2469, - "loss": 0.2153, - "short_answer_loss": NaN, - "step": 179, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1945, - "grad_norm": 1.84375, - "learning_rate": 2.4124143847279785e-05, - "long_answer_loss": 0.1945, - "loss": 0.1965, - "short_answer_loss": NaN, - "step": 180, - "template_loss": 0.0 - }, - { - "epoch": 0.29, - "full_loss": 0.1612, - "grad_norm": 2.015625, - "learning_rate": 2.4111998919964297e-05, - "long_answer_loss": 0.1612, - "loss": 0.2107, - "short_answer_loss": NaN, - "step": 181, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.2099, - "grad_norm": 2.078125, - "learning_rate": 2.4099773468601773e-05, - "long_answer_loss": 0.2099, - "loss": 0.2054, - "short_answer_loss": NaN, - "step": 182, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1939, - "grad_norm": 1.9765625, - "learning_rate": 2.408746757797028e-05, - "long_answer_loss": 0.1939, - "loss": 0.2171, - "short_answer_loss": NaN, - "step": 183, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.2258, - "grad_norm": 2.015625, - "learning_rate": 2.4075081333405697e-05, - "long_answer_loss": 0.2258, - "loss": 0.21, - "short_answer_loss": NaN, - "step": 184, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.251, - "grad_norm": 2.015625, - "learning_rate": 2.406261482080112e-05, - "long_answer_loss": 0.251, - "loss": 0.2233, - "short_answer_loss": NaN, - "step": 185, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1729, - "grad_norm": 1.9140625, - "learning_rate": 2.4050068126606267e-05, - "long_answer_loss": 0.1729, - "loss": 0.2058, - "short_answer_loss": NaN, - "step": 186, - "template_loss": 0.0 - }, - { - "epoch": 0.3, - "full_loss": 0.1879, - "grad_norm": 1.84375, - "learning_rate": 2.4037441337826884e-05, - "long_answer_loss": 0.1879, - "loss": 0.2084, - "short_answer_loss": NaN, - "step": 187, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1935, - "grad_norm": 1.796875, - "learning_rate": 2.4024734542024135e-05, - "long_answer_loss": 0.1935, - "loss": 0.1992, - "short_answer_loss": NaN, - "step": 188, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.2111, - "grad_norm": 1.8359375, - "learning_rate": 2.401194782731399e-05, - "long_answer_loss": 0.2111, - "loss": 0.2078, - "short_answer_loss": NaN, - "step": 189, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.2188, - "grad_norm": 1.953125, - "learning_rate": 2.3999081282366636e-05, - "long_answer_loss": 0.2188, - "loss": 0.2192, - "short_answer_loss": NaN, - "step": 190, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.2283, - "grad_norm": 1.953125, - "learning_rate": 2.3986134996405832e-05, - "long_answer_loss": 0.2283, - "loss": 0.2084, - "short_answer_loss": NaN, - "step": 191, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.1982, - "grad_norm": 1.75, - "learning_rate": 2.3973109059208315e-05, - "long_answer_loss": 0.1982, - "loss": 0.2095, - "short_answer_loss": NaN, - "step": 192, - "template_loss": 0.0 - }, - { - "epoch": 0.31, - "full_loss": 0.2264, - "grad_norm": 1.8515625, - "learning_rate": 2.396000356110316e-05, - "long_answer_loss": 0.2264, - "loss": 0.2161, - "short_answer_loss": NaN, - "step": 193, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1977, - "grad_norm": 1.9921875, - "learning_rate": 2.3946818592971176e-05, - "long_answer_loss": 0.1977, - "loss": 0.2139, - "short_answer_loss": NaN, - "step": 194, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.2048, - "grad_norm": 1.890625, - "learning_rate": 2.3933554246244243e-05, - "long_answer_loss": 0.2048, - "loss": 0.1998, - "short_answer_loss": NaN, - "step": 195, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.1993, - "grad_norm": 1.703125, - "learning_rate": 2.3920210612904715e-05, - "long_answer_loss": 0.1993, - "loss": 0.204, - "short_answer_loss": NaN, - "step": 196, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.2245, - "grad_norm": 1.8984375, - "learning_rate": 2.3906787785484742e-05, - "long_answer_loss": 0.2245, - "loss": 0.1939, - "short_answer_loss": NaN, - "step": 197, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.2195, - "grad_norm": 2.03125, - "learning_rate": 2.3893285857065666e-05, - "long_answer_loss": 0.2195, - "loss": 0.2158, - "short_answer_loss": NaN, - "step": 198, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.2314, - "grad_norm": 1.9765625, - "learning_rate": 2.3879704921277356e-05, - "long_answer_loss": 0.2314, - "loss": 0.2045, - "short_answer_loss": NaN, - "step": 199, - "template_loss": 0.0 - }, - { - "epoch": 0.32, - "full_loss": 0.2089, - "grad_norm": 1.9765625, - "learning_rate": 2.386604507229756e-05, - "long_answer_loss": 0.2089, - "loss": 0.2042, - "short_answer_loss": NaN, - "step": 200, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1831, - "grad_norm": 1.9921875, - "learning_rate": 2.385230640485125e-05, - "long_answer_loss": 0.1831, - "loss": 0.1976, - "short_answer_loss": NaN, - "step": 201, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1918, - "grad_norm": 1.828125, - "learning_rate": 2.383848901420998e-05, - "long_answer_loss": 0.1918, - "loss": 0.2033, - "short_answer_loss": NaN, - "step": 202, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1952, - "grad_norm": 1.796875, - "learning_rate": 2.3824592996191204e-05, - "long_answer_loss": 0.1952, - "loss": 0.2051, - "short_answer_loss": NaN, - "step": 203, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.2222, - "grad_norm": 1.8359375, - "learning_rate": 2.3810618447157622e-05, - "long_answer_loss": 0.2222, - "loss": 0.2054, - "short_answer_loss": NaN, - "step": 204, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1614, - "grad_norm": 1.921875, - "learning_rate": 2.3796565464016523e-05, - "long_answer_loss": 0.1614, - "loss": 0.2093, - "short_answer_loss": NaN, - "step": 205, - "template_loss": 0.0 - }, - { - "epoch": 0.33, - "full_loss": 0.1581, - "grad_norm": 1.796875, - "learning_rate": 2.378243414421909e-05, - "long_answer_loss": 0.1581, - "loss": 0.2025, - "short_answer_loss": NaN, - "step": 206, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.2477, - "grad_norm": 1.8203125, - "learning_rate": 2.3768224585759735e-05, - "long_answer_loss": 0.2477, - "loss": 0.2058, - "short_answer_loss": NaN, - "step": 207, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.2161, - "grad_norm": 1.8046875, - "learning_rate": 2.3753936887175433e-05, - "long_answer_loss": 0.2161, - "loss": 0.1956, - "short_answer_loss": NaN, - "step": 208, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1787, - "grad_norm": 1.734375, - "learning_rate": 2.3739571147545007e-05, - "long_answer_loss": 0.1787, - "loss": 0.2096, - "short_answer_loss": NaN, - "step": 209, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1975, - "grad_norm": 1.828125, - "learning_rate": 2.3725127466488483e-05, - "long_answer_loss": 0.1975, - "loss": 0.2039, - "short_answer_loss": NaN, - "step": 210, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.2476, - "grad_norm": 1.9140625, - "learning_rate": 2.371060594416636e-05, - "long_answer_loss": 0.2476, - "loss": 0.2051, - "short_answer_loss": NaN, - "step": 211, - "template_loss": 0.0 - }, - { - "epoch": 0.34, - "full_loss": 0.1977, - "grad_norm": 1.875, - "learning_rate": 2.369600668127893e-05, - "long_answer_loss": 0.1977, - "loss": 0.1993, - "short_answer_loss": NaN, - "step": 212, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.198, - "grad_norm": 1.953125, - "learning_rate": 2.3681329779065596e-05, - "long_answer_loss": 0.198, - "loss": 0.2029, - "short_answer_loss": NaN, - "step": 213, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.2019, - "grad_norm": 1.859375, - "learning_rate": 2.366657533930414e-05, - "long_answer_loss": 0.2019, - "loss": 0.201, - "short_answer_loss": NaN, - "step": 214, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.2116, - "grad_norm": 1.8203125, - "learning_rate": 2.3651743464310038e-05, - "long_answer_loss": 0.2116, - "loss": 0.1928, - "short_answer_loss": NaN, - "step": 215, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.2155, - "grad_norm": 2.125, - "learning_rate": 2.3636834256935745e-05, - "long_answer_loss": 0.2155, - "loss": 0.2115, - "short_answer_loss": NaN, - "step": 216, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1885, - "grad_norm": 1.8359375, - "learning_rate": 2.3621847820569988e-05, - "long_answer_loss": 0.1885, - "loss": 0.2007, - "short_answer_loss": NaN, - "step": 217, - "template_loss": 0.0 - }, - { - "epoch": 0.35, - "full_loss": 0.1827, - "grad_norm": 1.7890625, - "learning_rate": 2.3606784259137033e-05, - "long_answer_loss": 0.1827, - "loss": 0.1991, - "short_answer_loss": NaN, - "step": 218, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.2293, - "grad_norm": 1.9921875, - "learning_rate": 2.3591643677095973e-05, - "long_answer_loss": 0.2293, - "loss": 0.2025, - "short_answer_loss": NaN, - "step": 219, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1983, - "grad_norm": 1.828125, - "learning_rate": 2.3576426179440014e-05, - "long_answer_loss": 0.1983, - "loss": 0.2017, - "short_answer_loss": NaN, - "step": 220, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.2172, - "grad_norm": 1.9375, - "learning_rate": 2.3561131871695736e-05, - "long_answer_loss": 0.2172, - "loss": 0.2012, - "short_answer_loss": NaN, - "step": 221, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1724, - "grad_norm": 2.015625, - "learning_rate": 2.3545760859922354e-05, - "long_answer_loss": 0.1724, - "loss": 0.2022, - "short_answer_loss": NaN, - "step": 222, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1781, - "grad_norm": 2.015625, - "learning_rate": 2.3530313250710998e-05, - "long_answer_loss": 0.1781, - "loss": 0.2049, - "short_answer_loss": NaN, - "step": 223, - "template_loss": 0.0 - }, - { - "epoch": 0.36, - "full_loss": 0.1954, - "grad_norm": 1.875, - "learning_rate": 2.351478915118397e-05, - "long_answer_loss": 0.1954, - "loss": 0.1994, - "short_answer_loss": NaN, - "step": 224, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.2026, - "grad_norm": 1.9375, - "learning_rate": 2.349918866899399e-05, - "long_answer_loss": 0.2026, - "loss": 0.2201, - "short_answer_loss": NaN, - "step": 225, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.2047, - "grad_norm": 1.71875, - "learning_rate": 2.348351191232346e-05, - "long_answer_loss": 0.2047, - "loss": 0.1915, - "short_answer_loss": NaN, - "step": 226, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1743, - "grad_norm": 1.8125, - "learning_rate": 2.346775898988372e-05, - "long_answer_loss": 0.1743, - "loss": 0.2042, - "short_answer_loss": NaN, - "step": 227, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1933, - "grad_norm": 1.796875, - "learning_rate": 2.345193001091428e-05, - "long_answer_loss": 0.1933, - "loss": 0.2044, - "short_answer_loss": NaN, - "step": 228, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.1829, - "grad_norm": 1.6796875, - "learning_rate": 2.3436025085182064e-05, - "long_answer_loss": 0.1829, - "loss": 0.1989, - "short_answer_loss": NaN, - "step": 229, - "template_loss": 0.0 - }, - { - "epoch": 0.37, - "full_loss": 0.187, - "grad_norm": 1.9296875, - "learning_rate": 2.3420044322980662e-05, - "long_answer_loss": 0.187, - "loss": 0.2025, - "short_answer_loss": NaN, - "step": 230, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1994, - "grad_norm": 1.859375, - "learning_rate": 2.340398783512954e-05, - "long_answer_loss": 0.1994, - "loss": 0.212, - "short_answer_loss": NaN, - "step": 231, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.2226, - "grad_norm": 2.03125, - "learning_rate": 2.3387855732973307e-05, - "long_answer_loss": 0.2226, - "loss": 0.2044, - "short_answer_loss": NaN, - "step": 232, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.1717, - "grad_norm": 1.8671875, - "learning_rate": 2.3371648128380918e-05, - "long_answer_loss": 0.1717, - "loss": 0.2019, - "short_answer_loss": NaN, - "step": 233, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.2201, - "grad_norm": 2.0, - "learning_rate": 2.3355365133744894e-05, - "long_answer_loss": 0.2201, - "loss": 0.2004, - "short_answer_loss": NaN, - "step": 234, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.166, - "grad_norm": 2.015625, - "learning_rate": 2.3339006861980562e-05, - "long_answer_loss": 0.166, - "loss": 0.2043, - "short_answer_loss": NaN, - "step": 235, - "template_loss": 0.0 - }, - { - "epoch": 0.38, - "full_loss": 0.2252, - "grad_norm": 1.9609375, - "learning_rate": 2.3322573426525262e-05, - "long_answer_loss": 0.2252, - "loss": 0.2121, - "short_answer_loss": NaN, - "step": 236, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.2206, - "grad_norm": 1.9296875, - "learning_rate": 2.330606494133756e-05, - "long_answer_loss": 0.2206, - "loss": 0.216, - "short_answer_loss": NaN, - "step": 237, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.2155, - "grad_norm": 1.8671875, - "learning_rate": 2.328948152089645e-05, - "long_answer_loss": 0.2155, - "loss": 0.2058, - "short_answer_loss": NaN, - "step": 238, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.228, - "grad_norm": 1.9375, - "learning_rate": 2.327282328020058e-05, - "long_answer_loss": 0.228, - "loss": 0.2064, - "short_answer_loss": NaN, - "step": 239, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1713, - "grad_norm": 1.8359375, - "learning_rate": 2.3256090334767443e-05, - "long_answer_loss": 0.1713, - "loss": 0.1939, - "short_answer_loss": NaN, - "step": 240, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1933, - "grad_norm": 1.6484375, - "learning_rate": 2.3239282800632564e-05, - "long_answer_loss": 0.1933, - "loss": 0.2032, - "short_answer_loss": NaN, - "step": 241, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1856, - "grad_norm": 1.8828125, - "learning_rate": 2.322240079434872e-05, - "long_answer_loss": 0.1856, - "loss": 0.2067, - "short_answer_loss": NaN, - "step": 242, - "template_loss": 0.0 - }, - { - "epoch": 0.39, - "full_loss": 0.1947, - "grad_norm": 1.7578125, - "learning_rate": 2.320544443298512e-05, - "long_answer_loss": 0.1947, - "loss": 0.2113, - "short_answer_loss": NaN, - "step": 243, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1911, - "grad_norm": 1.7890625, - "learning_rate": 2.3188413834126573e-05, - "long_answer_loss": 0.1911, - "loss": 0.202, - "short_answer_loss": NaN, - "step": 244, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.2166, - "grad_norm": 1.984375, - "learning_rate": 2.317130911587272e-05, - "long_answer_loss": 0.2166, - "loss": 0.2018, - "short_answer_loss": NaN, - "step": 245, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.2132, - "grad_norm": 1.7734375, - "learning_rate": 2.3154130396837166e-05, - "long_answer_loss": 0.2132, - "loss": 0.1956, - "short_answer_loss": NaN, - "step": 246, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.1885, - "grad_norm": 1.859375, - "learning_rate": 2.313687779614669e-05, - "long_answer_loss": 0.1885, - "loss": 0.2011, - "short_answer_loss": NaN, - "step": 247, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.2018, - "grad_norm": 1.84375, - "learning_rate": 2.31195514334404e-05, - "long_answer_loss": 0.2018, - "loss": 0.2032, - "short_answer_loss": NaN, - "step": 248, - "template_loss": 0.0 - }, - { - "epoch": 0.4, - "full_loss": 0.2596, - "grad_norm": 2.046875, - "learning_rate": 2.3102151428868912e-05, - "long_answer_loss": 0.2596, - "loss": 0.2129, - "short_answer_loss": NaN, - "step": 249, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1831, - "grad_norm": 1.9765625, - "learning_rate": 2.3084677903093528e-05, - "long_answer_loss": 0.1831, - "loss": 0.1961, - "short_answer_loss": NaN, - "step": 250, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.199, - "grad_norm": 1.859375, - "learning_rate": 2.306713097728536e-05, - "long_answer_loss": 0.199, - "loss": 0.1935, - "short_answer_loss": NaN, - "step": 251, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.2525, - "grad_norm": 1.8515625, - "learning_rate": 2.3049510773124546e-05, - "long_answer_loss": 0.2525, - "loss": 0.2168, - "short_answer_loss": NaN, - "step": 252, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.226, - "grad_norm": 1.9453125, - "learning_rate": 2.303181741279936e-05, - "long_answer_loss": 0.226, - "loss": 0.1991, - "short_answer_loss": NaN, - "step": 253, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.1931, - "grad_norm": 1.9609375, - "learning_rate": 2.3014051019005383e-05, - "long_answer_loss": 0.1931, - "loss": 0.2099, - "short_answer_loss": NaN, - "step": 254, - "template_loss": 0.0 - }, - { - "epoch": 0.41, - "full_loss": 0.2221, - "grad_norm": 1.90625, - "learning_rate": 2.2996211714944653e-05, - "long_answer_loss": 0.2221, - "loss": 0.2075, - "short_answer_loss": NaN, - "step": 255, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.2067, - "grad_norm": 2.03125, - "learning_rate": 2.297829962432481e-05, - "long_answer_loss": 0.2067, - "loss": 0.1982, - "short_answer_loss": NaN, - "step": 256, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.234, - "grad_norm": 2.0625, - "learning_rate": 2.296031487135824e-05, - "long_answer_loss": 0.234, - "loss": 0.2171, - "short_answer_loss": NaN, - "step": 257, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.2223, - "grad_norm": 1.8515625, - "learning_rate": 2.294225758076119e-05, - "long_answer_loss": 0.2223, - "loss": 0.2007, - "short_answer_loss": NaN, - "step": 258, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1731, - "grad_norm": 1.75, - "learning_rate": 2.292412787775295e-05, - "long_answer_loss": 0.1731, - "loss": 0.1939, - "short_answer_loss": NaN, - "step": 259, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.1604, - "grad_norm": 2.0625, - "learning_rate": 2.290592588805494e-05, - "long_answer_loss": 0.1604, - "loss": 0.1877, - "short_answer_loss": NaN, - "step": 260, - "template_loss": 0.0 - }, - { - "epoch": 0.42, - "full_loss": 0.2115, - "grad_norm": 2.0, - "learning_rate": 2.2887651737889866e-05, - "long_answer_loss": 0.2115, - "loss": 0.2019, - "short_answer_loss": NaN, - "step": 261, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1951, - "grad_norm": 1.9296875, - "learning_rate": 2.2869305553980823e-05, - "long_answer_loss": 0.1951, - "loss": 0.2116, - "short_answer_loss": NaN, - "step": 262, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.2196, - "grad_norm": 1.9296875, - "learning_rate": 2.2850887463550442e-05, - "long_answer_loss": 0.2196, - "loss": 0.1976, - "short_answer_loss": NaN, - "step": 263, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.2057, - "grad_norm": 1.8671875, - "learning_rate": 2.2832397594319983e-05, - "long_answer_loss": 0.2057, - "loss": 0.2028, - "short_answer_loss": NaN, - "step": 264, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.212, - "grad_norm": 1.734375, - "learning_rate": 2.2813836074508467e-05, - "long_answer_loss": 0.212, - "loss": 0.2052, - "short_answer_loss": NaN, - "step": 265, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.2168, - "grad_norm": 1.8125, - "learning_rate": 2.2795203032831776e-05, - "long_answer_loss": 0.2168, - "loss": 0.1947, - "short_answer_loss": NaN, - "step": 266, - "template_loss": 0.0 - }, - { - "epoch": 0.43, - "full_loss": 0.1731, - "grad_norm": 1.734375, - "learning_rate": 2.2776498598501767e-05, - "long_answer_loss": 0.1731, - "loss": 0.1975, - "short_answer_loss": NaN, - "step": 267, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1762, - "grad_norm": 1.796875, - "learning_rate": 2.2757722901225367e-05, - "long_answer_loss": 0.1762, - "loss": 0.2008, - "short_answer_loss": NaN, - "step": 268, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.2079, - "grad_norm": 1.796875, - "learning_rate": 2.2738876071203688e-05, - "long_answer_loss": 0.2079, - "loss": 0.2111, - "short_answer_loss": NaN, - "step": 269, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.2049, - "grad_norm": 1.796875, - "learning_rate": 2.271995823913111e-05, - "long_answer_loss": 0.2049, - "loss": 0.2021, - "short_answer_loss": NaN, - "step": 270, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1998, - "grad_norm": 1.703125, - "learning_rate": 2.270096953619439e-05, - "long_answer_loss": 0.1998, - "loss": 0.2074, - "short_answer_loss": NaN, - "step": 271, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1979, - "grad_norm": 1.7265625, - "learning_rate": 2.2681910094071724e-05, - "long_answer_loss": 0.1979, - "loss": 0.1884, - "short_answer_loss": NaN, - "step": 272, - "template_loss": 0.0 - }, - { - "epoch": 0.44, - "full_loss": 0.1774, - "grad_norm": 1.7109375, - "learning_rate": 2.2662780044931874e-05, - "long_answer_loss": 0.1774, - "loss": 0.1848, - "short_answer_loss": NaN, - "step": 273, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.2164, - "grad_norm": 1.7890625, - "learning_rate": 2.264357952143322e-05, - "long_answer_loss": 0.2164, - "loss": 0.2013, - "short_answer_loss": NaN, - "step": 274, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.2208, - "grad_norm": 1.8046875, - "learning_rate": 2.2624308656722846e-05, - "long_answer_loss": 0.2208, - "loss": 0.1935, - "short_answer_loss": NaN, - "step": 275, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.2249, - "grad_norm": 2.0, - "learning_rate": 2.260496758443563e-05, - "long_answer_loss": 0.2249, - "loss": 0.2183, - "short_answer_loss": NaN, - "step": 276, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.2462, - "grad_norm": 1.8203125, - "learning_rate": 2.258555643869331e-05, - "long_answer_loss": 0.2462, - "loss": 0.2003, - "short_answer_loss": NaN, - "step": 277, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.2265, - "grad_norm": 1.9921875, - "learning_rate": 2.256607535410354e-05, - "long_answer_loss": 0.2265, - "loss": 0.2058, - "short_answer_loss": NaN, - "step": 278, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.185, - "grad_norm": 1.890625, - "learning_rate": 2.2546524465758973e-05, - "long_answer_loss": 0.185, - "loss": 0.1978, - "short_answer_loss": NaN, - "step": 279, - "template_loss": 0.0 - }, - { - "epoch": 0.45, - "full_loss": 0.1871, - "grad_norm": 1.96875, - "learning_rate": 2.252690390923633e-05, - "long_answer_loss": 0.1871, - "loss": 0.2059, - "short_answer_loss": NaN, - "step": 280, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.2192, - "grad_norm": 1.8125, - "learning_rate": 2.2507213820595435e-05, - "long_answer_loss": 0.2192, - "loss": 0.2039, - "short_answer_loss": NaN, - "step": 281, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1982, - "grad_norm": 1.7578125, - "learning_rate": 2.2487454336378303e-05, - "long_answer_loss": 0.1982, - "loss": 0.1963, - "short_answer_loss": NaN, - "step": 282, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.2183, - "grad_norm": 1.8359375, - "learning_rate": 2.246762559360816e-05, - "long_answer_loss": 0.2183, - "loss": 0.1968, - "short_answer_loss": NaN, - "step": 283, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.2063, - "grad_norm": 1.7421875, - "learning_rate": 2.244772772978852e-05, - "long_answer_loss": 0.2063, - "loss": 0.2033, - "short_answer_loss": NaN, - "step": 284, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1825, - "grad_norm": 1.8203125, - "learning_rate": 2.2427760882902217e-05, - "long_answer_loss": 0.1825, - "loss": 0.2233, - "short_answer_loss": NaN, - "step": 285, - "template_loss": 0.0 - }, - { - "epoch": 0.46, - "full_loss": 0.1893, - "grad_norm": 1.75, - "learning_rate": 2.2407725191410446e-05, - "long_answer_loss": 0.1893, - "loss": 0.1977, - "short_answer_loss": NaN, - "step": 286, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.201, - "grad_norm": 1.6796875, - "learning_rate": 2.2387620794251824e-05, - "long_answer_loss": 0.201, - "loss": 0.207, - "short_answer_loss": NaN, - "step": 287, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.2089, - "grad_norm": 1.6796875, - "learning_rate": 2.2367447830841398e-05, - "long_answer_loss": 0.2089, - "loss": 0.1944, - "short_answer_loss": NaN, - "step": 288, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.2115, - "grad_norm": 1.8046875, - "learning_rate": 2.234720644106969e-05, - "long_answer_loss": 0.2115, - "loss": 0.1969, - "short_answer_loss": NaN, - "step": 289, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1723, - "grad_norm": 1.609375, - "learning_rate": 2.2326896765301746e-05, - "long_answer_loss": 0.1723, - "loss": 0.1827, - "short_answer_loss": NaN, - "step": 290, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.2066, - "grad_norm": 1.828125, - "learning_rate": 2.2306518944376125e-05, - "long_answer_loss": 0.2066, - "loss": 0.2062, - "short_answer_loss": NaN, - "step": 291, - "template_loss": 0.0 - }, - { - "epoch": 0.47, - "full_loss": 0.1793, - "grad_norm": 1.796875, - "learning_rate": 2.2286073119603952e-05, - "long_answer_loss": 0.1793, - "loss": 0.1906, - "short_answer_loss": NaN, - "step": 292, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.2182, - "grad_norm": 1.7734375, - "learning_rate": 2.2265559432767924e-05, - "long_answer_loss": 0.2182, - "loss": 0.2009, - "short_answer_loss": NaN, - "step": 293, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1888, - "grad_norm": 1.7265625, - "learning_rate": 2.224497802612134e-05, - "long_answer_loss": 0.1888, - "loss": 0.1953, - "short_answer_loss": NaN, - "step": 294, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1951, - "grad_norm": 1.96875, - "learning_rate": 2.2224329042387093e-05, - "long_answer_loss": 0.1951, - "loss": 0.2054, - "short_answer_loss": NaN, - "step": 295, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1802, - "grad_norm": 1.78125, - "learning_rate": 2.2203612624756704e-05, - "long_answer_loss": 0.1802, - "loss": 0.1995, - "short_answer_loss": NaN, - "step": 296, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.2195, - "grad_norm": 1.9375, - "learning_rate": 2.218282891688931e-05, - "long_answer_loss": 0.2195, - "loss": 0.1905, - "short_answer_loss": NaN, - "step": 297, - "template_loss": 0.0 - }, - { - "epoch": 0.48, - "full_loss": 0.1794, - "grad_norm": 1.8203125, - "learning_rate": 2.216197806291068e-05, - "long_answer_loss": 0.1794, - "loss": 0.1977, - "short_answer_loss": NaN, - "step": 298, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.199, - "grad_norm": 1.7421875, - "learning_rate": 2.2141060207412224e-05, - "long_answer_loss": 0.199, - "loss": 0.2008, - "short_answer_loss": NaN, - "step": 299, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1984, - "grad_norm": 1.8828125, - "learning_rate": 2.2120075495449944e-05, - "long_answer_loss": 0.1984, - "loss": 0.1934, - "short_answer_loss": NaN, - "step": 300, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.2078, - "grad_norm": 1.9140625, - "learning_rate": 2.2099024072543495e-05, - "long_answer_loss": 0.2078, - "loss": 0.2045, - "short_answer_loss": NaN, - "step": 301, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1886, - "grad_norm": 1.828125, - "learning_rate": 2.2077906084675126e-05, - "long_answer_loss": 0.1886, - "loss": 0.1967, - "short_answer_loss": NaN, - "step": 302, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.1978, - "grad_norm": 1.9453125, - "learning_rate": 2.2056721678288693e-05, - "long_answer_loss": 0.1978, - "loss": 0.2, - "short_answer_loss": NaN, - "step": 303, - "template_loss": 0.0 - }, - { - "epoch": 0.49, - "full_loss": 0.199, - "grad_norm": 1.9375, - "learning_rate": 2.2035471000288628e-05, - "long_answer_loss": 0.199, - "loss": 0.2, - "short_answer_loss": NaN, - "step": 304, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.2371, - "grad_norm": 1.6875, - "learning_rate": 2.201415419803893e-05, - "long_answer_loss": 0.2371, - "loss": 0.2014, - "short_answer_loss": NaN, - "step": 305, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1504, - "grad_norm": 1.8125, - "learning_rate": 2.199277141936214e-05, - "long_answer_loss": 0.1504, - "loss": 0.2031, - "short_answer_loss": NaN, - "step": 306, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1945, - "grad_norm": 1.6953125, - "learning_rate": 2.1971322812538314e-05, - "long_answer_loss": 0.1945, - "loss": 0.1907, - "short_answer_loss": NaN, - "step": 307, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1924, - "grad_norm": 1.8046875, - "learning_rate": 2.1949808526304006e-05, - "long_answer_loss": 0.1924, - "loss": 0.2033, - "short_answer_loss": NaN, - "step": 308, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1645, - "grad_norm": 1.7421875, - "learning_rate": 2.1928228709851212e-05, - "long_answer_loss": 0.1645, - "loss": 0.1964, - "short_answer_loss": NaN, - "step": 309, - "template_loss": 0.0 - }, - { - "epoch": 0.5, - "full_loss": 0.1908, - "grad_norm": 1.7578125, - "learning_rate": 2.190658351282637e-05, - "long_answer_loss": 0.1908, - "loss": 0.1965, - "short_answer_loss": NaN, - "step": 310, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.2053, - "grad_norm": 1.734375, - "learning_rate": 2.1884873085329276e-05, - "long_answer_loss": 0.2053, - "loss": 0.1976, - "short_answer_loss": NaN, - "step": 311, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1693, - "grad_norm": 1.828125, - "learning_rate": 2.1863097577912107e-05, - "long_answer_loss": 0.1693, - "loss": 0.2003, - "short_answer_loss": NaN, - "step": 312, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.191, - "grad_norm": 1.8359375, - "learning_rate": 2.1841257141578304e-05, - "long_answer_loss": 0.191, - "loss": 0.2086, - "short_answer_loss": NaN, - "step": 313, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.217, - "grad_norm": 1.78125, - "learning_rate": 2.181935192778159e-05, - "long_answer_loss": 0.217, - "loss": 0.205, - "short_answer_loss": NaN, - "step": 314, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.1765, - "grad_norm": 1.6171875, - "learning_rate": 2.1797382088424866e-05, - "long_answer_loss": 0.1765, - "loss": 0.1931, - "short_answer_loss": NaN, - "step": 315, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.242, - "grad_norm": 1.765625, - "learning_rate": 2.1775347775859205e-05, - "long_answer_loss": 0.242, - "loss": 0.2047, - "short_answer_loss": NaN, - "step": 316, - "template_loss": 0.0 - }, - { - "epoch": 0.51, - "full_loss": 0.2099, - "grad_norm": 1.765625, - "learning_rate": 2.175324914288276e-05, - "long_answer_loss": 0.2099, - "loss": 0.1956, - "short_answer_loss": NaN, - "step": 317, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1706, - "grad_norm": 1.75, - "learning_rate": 2.173108634273972e-05, - "long_answer_loss": 0.1706, - "loss": 0.2052, - "short_answer_loss": NaN, - "step": 318, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.1948, - "grad_norm": 1.7265625, - "learning_rate": 2.1708859529119242e-05, - "long_answer_loss": 0.1948, - "loss": 0.1881, - "short_answer_loss": NaN, - "step": 319, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.2132, - "grad_norm": 1.796875, - "learning_rate": 2.1686568856154397e-05, - "long_answer_loss": 0.2132, - "loss": 0.1945, - "short_answer_loss": NaN, - "step": 320, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.2051, - "grad_norm": 1.640625, - "learning_rate": 2.166421447842108e-05, - "long_answer_loss": 0.2051, - "loss": 0.2039, - "short_answer_loss": NaN, - "step": 321, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.2143, - "grad_norm": 1.9453125, - "learning_rate": 2.1641796550936964e-05, - "long_answer_loss": 0.2143, - "loss": 0.2026, - "short_answer_loss": NaN, - "step": 322, - "template_loss": 0.0 - }, - { - "epoch": 0.52, - "full_loss": 0.2044, - "grad_norm": 1.8359375, - "learning_rate": 2.1619315229160396e-05, - "long_answer_loss": 0.2044, - "loss": 0.2007, - "short_answer_loss": NaN, - "step": 323, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.2178, - "grad_norm": 1.90625, - "learning_rate": 2.1596770668989347e-05, - "long_answer_loss": 0.2178, - "loss": 0.2133, - "short_answer_loss": NaN, - "step": 324, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1888, - "grad_norm": 1.8671875, - "learning_rate": 2.1574163026760308e-05, - "long_answer_loss": 0.1888, - "loss": 0.1957, - "short_answer_loss": NaN, - "step": 325, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1592, - "grad_norm": 1.7578125, - "learning_rate": 2.1551492459247227e-05, - "long_answer_loss": 0.1592, - "loss": 0.2062, - "short_answer_loss": NaN, - "step": 326, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1695, - "grad_norm": 1.7265625, - "learning_rate": 2.1528759123660404e-05, - "long_answer_loss": 0.1695, - "loss": 0.198, - "short_answer_loss": NaN, - "step": 327, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1931, - "grad_norm": 1.7890625, - "learning_rate": 2.1505963177645404e-05, - "long_answer_loss": 0.1931, - "loss": 0.1908, - "short_answer_loss": NaN, - "step": 328, - "template_loss": 0.0 - }, - { - "epoch": 0.53, - "full_loss": 0.1755, - "grad_norm": 1.7421875, - "learning_rate": 2.1483104779281975e-05, - "long_answer_loss": 0.1755, - "loss": 0.205, - "short_answer_loss": NaN, - "step": 329, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.227, - "grad_norm": 1.765625, - "learning_rate": 2.1460184087082944e-05, - "long_answer_loss": 0.227, - "loss": 0.2101, - "short_answer_loss": NaN, - "step": 330, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1985, - "grad_norm": 1.6640625, - "learning_rate": 2.1437201259993112e-05, - "long_answer_loss": 0.1985, - "loss": 0.2003, - "short_answer_loss": NaN, - "step": 331, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1934, - "grad_norm": 1.7578125, - "learning_rate": 2.141415645738816e-05, - "long_answer_loss": 0.1934, - "loss": 0.201, - "short_answer_loss": NaN, - "step": 332, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.2029, - "grad_norm": 1.765625, - "learning_rate": 2.1391049839073544e-05, - "long_answer_loss": 0.2029, - "loss": 0.1983, - "short_answer_loss": NaN, - "step": 333, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1792, - "grad_norm": 1.6953125, - "learning_rate": 2.136788156528339e-05, - "long_answer_loss": 0.1792, - "loss": 0.1974, - "short_answer_loss": NaN, - "step": 334, - "template_loss": 0.0 - }, - { - "epoch": 0.54, - "full_loss": 0.1899, - "grad_norm": 1.8671875, - "learning_rate": 2.134465179667936e-05, - "long_answer_loss": 0.1899, - "loss": 0.1941, - "short_answer_loss": NaN, - "step": 335, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1943, - "grad_norm": 1.7109375, - "learning_rate": 2.1321360694349573e-05, - "long_answer_loss": 0.1943, - "loss": 0.2083, - "short_answer_loss": NaN, - "step": 336, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1913, - "grad_norm": 1.765625, - "learning_rate": 2.129800841980746e-05, - "long_answer_loss": 0.1913, - "loss": 0.1945, - "short_answer_loss": NaN, - "step": 337, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.2173, - "grad_norm": 1.8125, - "learning_rate": 2.1274595134990656e-05, - "long_answer_loss": 0.2173, - "loss": 0.1996, - "short_answer_loss": NaN, - "step": 338, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1984, - "grad_norm": 1.71875, - "learning_rate": 2.1251121002259875e-05, - "long_answer_loss": 0.1984, - "loss": 0.2004, - "short_answer_loss": NaN, - "step": 339, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.2009, - "grad_norm": 1.84375, - "learning_rate": 2.122758618439779e-05, - "long_answer_loss": 0.2009, - "loss": 0.2014, - "short_answer_loss": NaN, - "step": 340, - "template_loss": 0.0 - }, - { - "epoch": 0.55, - "full_loss": 0.1869, - "grad_norm": 1.6796875, - "learning_rate": 2.120399084460789e-05, - "long_answer_loss": 0.1869, - "loss": 0.1843, - "short_answer_loss": NaN, - "step": 341, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1868, - "grad_norm": 1.8125, - "learning_rate": 2.1180335146513363e-05, - "long_answer_loss": 0.1868, - "loss": 0.1896, - "short_answer_loss": NaN, - "step": 342, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1844, - "grad_norm": 1.75, - "learning_rate": 2.1156619254155948e-05, - "long_answer_loss": 0.1844, - "loss": 0.1888, - "short_answer_loss": NaN, - "step": 343, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.2027, - "grad_norm": 1.6953125, - "learning_rate": 2.1132843331994817e-05, - "long_answer_loss": 0.2027, - "loss": 0.1868, - "short_answer_loss": NaN, - "step": 344, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1783, - "grad_norm": 1.96875, - "learning_rate": 2.1109007544905402e-05, - "long_answer_loss": 0.1783, - "loss": 0.2011, - "short_answer_loss": NaN, - "step": 345, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.2077, - "grad_norm": 1.8046875, - "learning_rate": 2.108511205817829e-05, - "long_answer_loss": 0.2077, - "loss": 0.198, - "short_answer_loss": NaN, - "step": 346, - "template_loss": 0.0 - }, - { - "epoch": 0.56, - "full_loss": 0.1811, - "grad_norm": 1.671875, - "learning_rate": 2.1061157037518057e-05, - "long_answer_loss": 0.1811, - "loss": 0.1946, - "short_answer_loss": NaN, - "step": 347, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.173, - "grad_norm": 1.75, - "learning_rate": 2.1037142649042107e-05, - "long_answer_loss": 0.173, - "loss": 0.1922, - "short_answer_loss": NaN, - "step": 348, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1804, - "grad_norm": 1.765625, - "learning_rate": 2.1013069059279542e-05, - "long_answer_loss": 0.1804, - "loss": 0.1918, - "short_answer_loss": NaN, - "step": 349, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1974, - "grad_norm": 1.65625, - "learning_rate": 2.0988936435170004e-05, - "long_answer_loss": 0.1974, - "loss": 0.1914, - "short_answer_loss": NaN, - "step": 350, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.2522, - "grad_norm": 1.859375, - "learning_rate": 2.09647449440625e-05, - "long_answer_loss": 0.2522, - "loss": 0.2157, - "short_answer_loss": NaN, - "step": 351, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.2153, - "grad_norm": 1.75, - "learning_rate": 2.094049475371426e-05, - "long_answer_loss": 0.2153, - "loss": 0.2018, - "short_answer_loss": NaN, - "step": 352, - "template_loss": 0.0 - }, - { - "epoch": 0.57, - "full_loss": 0.1612, - "grad_norm": 1.609375, - "learning_rate": 2.091618603228957e-05, - "long_answer_loss": 0.1612, - "loss": 0.1818, - "short_answer_loss": NaN, - "step": 353, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.2161, - "grad_norm": 1.765625, - "learning_rate": 2.0891818948358597e-05, - "long_answer_loss": 0.2161, - "loss": 0.1963, - "short_answer_loss": NaN, - "step": 354, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.2255, - "grad_norm": 1.8125, - "learning_rate": 2.086739367089623e-05, - "long_answer_loss": 0.2255, - "loss": 0.2045, - "short_answer_loss": NaN, - "step": 355, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.1808, - "grad_norm": 1.7890625, - "learning_rate": 2.08429103692809e-05, - "long_answer_loss": 0.1808, - "loss": 0.1989, - "short_answer_loss": NaN, - "step": 356, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.2142, - "grad_norm": 1.7421875, - "learning_rate": 2.0818369213293422e-05, - "long_answer_loss": 0.2142, - "loss": 0.2053, - "short_answer_loss": NaN, - "step": 357, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.2077, - "grad_norm": 1.7578125, - "learning_rate": 2.0793770373115782e-05, - "long_answer_loss": 0.2077, - "loss": 0.1966, - "short_answer_loss": NaN, - "step": 358, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.2175, - "grad_norm": 1.7734375, - "learning_rate": 2.0769114019330006e-05, - "long_answer_loss": 0.2175, - "loss": 0.2082, - "short_answer_loss": NaN, - "step": 359, - "template_loss": 0.0 - }, - { - "epoch": 0.58, - "full_loss": 0.2193, - "grad_norm": 1.75, - "learning_rate": 2.074440032291693e-05, - "long_answer_loss": 0.2193, - "loss": 0.1979, - "short_answer_loss": NaN, - "step": 360, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1886, - "grad_norm": 1.796875, - "learning_rate": 2.0719629455255052e-05, - "long_answer_loss": 0.1886, - "loss": 0.1982, - "short_answer_loss": NaN, - "step": 361, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1853, - "grad_norm": 1.7890625, - "learning_rate": 2.0694801588119315e-05, - "long_answer_loss": 0.1853, - "loss": 0.199, - "short_answer_loss": NaN, - "step": 362, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.2133, - "grad_norm": 1.671875, - "learning_rate": 2.066991689367994e-05, - "long_answer_loss": 0.2133, - "loss": 0.1855, - "short_answer_loss": NaN, - "step": 363, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.1858, - "grad_norm": 1.8203125, - "learning_rate": 2.0644975544501206e-05, - "long_answer_loss": 0.1858, - "loss": 0.1977, - "short_answer_loss": NaN, - "step": 364, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.182, - "grad_norm": 1.7578125, - "learning_rate": 2.061997771354028e-05, - "long_answer_loss": 0.182, - "loss": 0.1867, - "short_answer_loss": NaN, - "step": 365, - "template_loss": 0.0 - }, - { - "epoch": 0.59, - "full_loss": 0.2098, - "grad_norm": 1.75, - "learning_rate": 2.0594923574145994e-05, - "long_answer_loss": 0.2098, - "loss": 0.1901, - "short_answer_loss": NaN, - "step": 366, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1777, - "grad_norm": 1.78125, - "learning_rate": 2.0569813300057667e-05, - "long_answer_loss": 0.1777, - "loss": 0.1942, - "short_answer_loss": NaN, - "step": 367, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1621, - "grad_norm": 1.8125, - "learning_rate": 2.054464706540387e-05, - "long_answer_loss": 0.1621, - "loss": 0.1891, - "short_answer_loss": NaN, - "step": 368, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.2067, - "grad_norm": 1.6796875, - "learning_rate": 2.0519425044701256e-05, - "long_answer_loss": 0.2067, - "loss": 0.1956, - "short_answer_loss": NaN, - "step": 369, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.2081, - "grad_norm": 1.875, - "learning_rate": 2.049414741285331e-05, - "long_answer_loss": 0.2081, - "loss": 0.2041, - "short_answer_loss": NaN, - "step": 370, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1938, - "grad_norm": 1.7265625, - "learning_rate": 2.0468814345149173e-05, - "long_answer_loss": 0.1938, - "loss": 0.1946, - "short_answer_loss": NaN, - "step": 371, - "template_loss": 0.0 - }, - { - "epoch": 0.6, - "full_loss": 0.1969, - "grad_norm": 1.71875, - "learning_rate": 2.0443426017262395e-05, - "long_answer_loss": 0.1969, - "loss": 0.1923, - "short_answer_loss": NaN, - "step": 372, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1651, - "grad_norm": 1.640625, - "learning_rate": 2.0417982605249734e-05, - "long_answer_loss": 0.1651, - "loss": 0.191, - "short_answer_loss": NaN, - "step": 373, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1948, - "grad_norm": 1.6328125, - "learning_rate": 2.0392484285549936e-05, - "long_answer_loss": 0.1948, - "loss": 0.1997, - "short_answer_loss": NaN, - "step": 374, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.2046, - "grad_norm": 1.6875, - "learning_rate": 2.0366931234982513e-05, - "long_answer_loss": 0.2046, - "loss": 0.2086, - "short_answer_loss": NaN, - "step": 375, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.2052, - "grad_norm": 1.6875, - "learning_rate": 2.034132363074649e-05, - "long_answer_loss": 0.2052, - "loss": 0.1911, - "short_answer_loss": NaN, - "step": 376, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.2002, - "grad_norm": 1.6875, - "learning_rate": 2.0315661650419222e-05, - "long_answer_loss": 0.2002, - "loss": 0.1858, - "short_answer_loss": NaN, - "step": 377, - "template_loss": 0.0 - }, - { - "epoch": 0.61, - "full_loss": 0.1988, - "grad_norm": 1.703125, - "learning_rate": 2.028994547195512e-05, - "long_answer_loss": 0.1988, - "loss": 0.1907, - "short_answer_loss": NaN, - "step": 378, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.2357, - "grad_norm": 1.84375, - "learning_rate": 2.026417527368445e-05, - "long_answer_loss": 0.2357, - "loss": 0.219, - "short_answer_loss": NaN, - "step": 379, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.2111, - "grad_norm": 1.703125, - "learning_rate": 2.0238351234312063e-05, - "long_answer_loss": 0.2111, - "loss": 0.1933, - "short_answer_loss": NaN, - "step": 380, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.2037, - "grad_norm": 1.6171875, - "learning_rate": 2.0212473532916192e-05, - "long_answer_loss": 0.2037, - "loss": 0.1859, - "short_answer_loss": NaN, - "step": 381, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.2295, - "grad_norm": 1.890625, - "learning_rate": 2.0186542348947185e-05, - "long_answer_loss": 0.2295, - "loss": 0.2089, - "short_answer_loss": NaN, - "step": 382, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1794, - "grad_norm": 1.71875, - "learning_rate": 2.0160557862226266e-05, - "long_answer_loss": 0.1794, - "loss": 0.1905, - "short_answer_loss": NaN, - "step": 383, - "template_loss": 0.0 - }, - { - "epoch": 0.62, - "full_loss": 0.1677, - "grad_norm": 1.859375, - "learning_rate": 2.013452025294429e-05, - "long_answer_loss": 0.1677, - "loss": 0.1936, - "short_answer_loss": NaN, - "step": 384, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.212, - "grad_norm": 1.8828125, - "learning_rate": 2.0108429701660496e-05, - "long_answer_loss": 0.212, - "loss": 0.196, - "short_answer_loss": NaN, - "step": 385, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1945, - "grad_norm": 1.625, - "learning_rate": 2.008228638930125e-05, - "long_answer_loss": 0.1945, - "loss": 0.1926, - "short_answer_loss": NaN, - "step": 386, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.201, - "grad_norm": 1.75, - "learning_rate": 2.0056090497158797e-05, - "long_answer_loss": 0.201, - "loss": 0.1863, - "short_answer_loss": NaN, - "step": 387, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.2279, - "grad_norm": 1.7265625, - "learning_rate": 2.002984220688999e-05, - "long_answer_loss": 0.2279, - "loss": 0.2077, - "short_answer_loss": NaN, - "step": 388, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1748, - "grad_norm": 1.6875, - "learning_rate": 2.0003541700515056e-05, - "long_answer_loss": 0.1748, - "loss": 0.1907, - "short_answer_loss": NaN, - "step": 389, - "template_loss": 0.0 - }, - { - "epoch": 0.63, - "full_loss": 0.1925, - "grad_norm": 1.6328125, - "learning_rate": 1.9977189160416293e-05, - "long_answer_loss": 0.1925, - "loss": 0.1977, - "short_answer_loss": NaN, - "step": 390, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.205, - "grad_norm": 1.7265625, - "learning_rate": 1.9950784769336856e-05, - "long_answer_loss": 0.205, - "loss": 0.1951, - "short_answer_loss": NaN, - "step": 391, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.2024, - "grad_norm": 1.5859375, - "learning_rate": 1.9924328710379443e-05, - "long_answer_loss": 0.2024, - "loss": 0.2014, - "short_answer_loss": NaN, - "step": 392, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.2031, - "grad_norm": 1.78125, - "learning_rate": 1.989782116700506e-05, - "long_answer_loss": 0.2031, - "loss": 0.1955, - "short_answer_loss": NaN, - "step": 393, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.1843, - "grad_norm": 1.8671875, - "learning_rate": 1.9871262323031725e-05, - "long_answer_loss": 0.1843, - "loss": 0.2188, - "short_answer_loss": NaN, - "step": 394, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.2244, - "grad_norm": 1.7578125, - "learning_rate": 1.9844652362633214e-05, - "long_answer_loss": 0.2244, - "loss": 0.2111, - "short_answer_loss": NaN, - "step": 395, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.2076, - "grad_norm": 1.6484375, - "learning_rate": 1.9817991470337756e-05, - "long_answer_loss": 0.2076, - "loss": 0.1906, - "short_answer_loss": NaN, - "step": 396, - "template_loss": 0.0 - }, - { - "epoch": 0.64, - "full_loss": 0.182, - "grad_norm": 1.7734375, - "learning_rate": 1.9791279831026783e-05, - "long_answer_loss": 0.182, - "loss": 0.1974, - "short_answer_loss": NaN, - "step": 397, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1739, - "grad_norm": 1.828125, - "learning_rate": 1.9764517629933632e-05, - "long_answer_loss": 0.1739, - "loss": 0.1918, - "short_answer_loss": NaN, - "step": 398, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.2019, - "grad_norm": 1.859375, - "learning_rate": 1.9737705052642257e-05, - "long_answer_loss": 0.2019, - "loss": 0.1957, - "short_answer_loss": NaN, - "step": 399, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.1593, - "grad_norm": 1.6328125, - "learning_rate": 1.9710842285085963e-05, - "long_answer_loss": 0.1593, - "loss": 0.1871, - "short_answer_loss": NaN, - "step": 400, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.2025, - "grad_norm": 1.84375, - "learning_rate": 1.968392951354609e-05, - "long_answer_loss": 0.2025, - "loss": 0.1932, - "short_answer_loss": NaN, - "step": 401, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.2095, - "grad_norm": 2.0, - "learning_rate": 1.965696692465074e-05, - "long_answer_loss": 0.2095, - "loss": 0.1955, - "short_answer_loss": NaN, - "step": 402, - "template_loss": 0.0 - }, - { - "epoch": 0.65, - "full_loss": 0.2044, - "grad_norm": 1.7109375, - "learning_rate": 1.962995470537346e-05, - "long_answer_loss": 0.2044, - "loss": 0.1873, - "short_answer_loss": NaN, - "step": 403, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.2193, - "grad_norm": 1.6953125, - "learning_rate": 1.960289304303199e-05, - "long_answer_loss": 0.2193, - "loss": 0.1903, - "short_answer_loss": NaN, - "step": 404, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1501, - "grad_norm": 1.7578125, - "learning_rate": 1.9575782125286907e-05, - "long_answer_loss": 0.1501, - "loss": 0.1911, - "short_answer_loss": NaN, - "step": 405, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1722, - "grad_norm": 1.9609375, - "learning_rate": 1.954862214014038e-05, - "long_answer_loss": 0.1722, - "loss": 0.1893, - "short_answer_loss": NaN, - "step": 406, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1493, - "grad_norm": 1.796875, - "learning_rate": 1.952141327593481e-05, - "long_answer_loss": 0.1493, - "loss": 0.1941, - "short_answer_loss": NaN, - "step": 407, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1972, - "grad_norm": 1.78125, - "learning_rate": 1.949415572135158e-05, - "long_answer_loss": 0.1972, - "loss": 0.2043, - "short_answer_loss": NaN, - "step": 408, - "template_loss": 0.0 - }, - { - "epoch": 0.66, - "full_loss": 0.1868, - "grad_norm": 1.765625, - "learning_rate": 1.9466849665409695e-05, - "long_answer_loss": 0.1868, - "loss": 0.1885, - "short_answer_loss": NaN, - "step": 409, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1901, - "grad_norm": 1.65625, - "learning_rate": 1.9439495297464512e-05, - "long_answer_loss": 0.1901, - "loss": 0.1907, - "short_answer_loss": NaN, - "step": 410, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.2132, - "grad_norm": 1.8359375, - "learning_rate": 1.94120928072064e-05, - "long_answer_loss": 0.2132, - "loss": 0.2018, - "short_answer_loss": NaN, - "step": 411, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.2384, - "grad_norm": 1.8125, - "learning_rate": 1.9384642384659446e-05, - "long_answer_loss": 0.2384, - "loss": 0.1993, - "short_answer_loss": NaN, - "step": 412, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.2068, - "grad_norm": 1.765625, - "learning_rate": 1.935714422018011e-05, - "long_answer_loss": 0.2068, - "loss": 0.1958, - "short_answer_loss": NaN, - "step": 413, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1957, - "grad_norm": 1.7578125, - "learning_rate": 1.932959850445594e-05, - "long_answer_loss": 0.1957, - "loss": 0.1832, - "short_answer_loss": NaN, - "step": 414, - "template_loss": 0.0 - }, - { - "epoch": 0.67, - "full_loss": 0.1987, - "grad_norm": 1.703125, - "learning_rate": 1.9302005428504216e-05, - "long_answer_loss": 0.1987, - "loss": 0.1965, - "short_answer_loss": NaN, - "step": 415, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1942, - "grad_norm": 1.7265625, - "learning_rate": 1.9274365183670645e-05, - "long_answer_loss": 0.1942, - "loss": 0.1973, - "short_answer_loss": NaN, - "step": 416, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1949, - "grad_norm": 1.734375, - "learning_rate": 1.9246677961628028e-05, - "long_answer_loss": 0.1949, - "loss": 0.1808, - "short_answer_loss": NaN, - "step": 417, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1828, - "grad_norm": 1.765625, - "learning_rate": 1.921894395437494e-05, - "long_answer_loss": 0.1828, - "loss": 0.1912, - "short_answer_loss": NaN, - "step": 418, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1984, - "grad_norm": 1.7890625, - "learning_rate": 1.9191163354234375e-05, - "long_answer_loss": 0.1984, - "loss": 0.1899, - "short_answer_loss": NaN, - "step": 419, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.2025, - "grad_norm": 1.796875, - "learning_rate": 1.9163336353852447e-05, - "long_answer_loss": 0.2025, - "loss": 0.1932, - "short_answer_loss": NaN, - "step": 420, - "template_loss": 0.0 - }, - { - "epoch": 0.68, - "full_loss": 0.1864, - "grad_norm": 1.78125, - "learning_rate": 1.9135463146197018e-05, - "long_answer_loss": 0.1864, - "loss": 0.1836, - "short_answer_loss": NaN, - "step": 421, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1888, - "grad_norm": 1.5859375, - "learning_rate": 1.9107543924556388e-05, - "long_answer_loss": 0.1888, - "loss": 0.191, - "short_answer_loss": NaN, - "step": 422, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1497, - "grad_norm": 1.640625, - "learning_rate": 1.9079578882537935e-05, - "long_answer_loss": 0.1497, - "loss": 0.1897, - "short_answer_loss": NaN, - "step": 423, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1934, - "grad_norm": 1.7890625, - "learning_rate": 1.905156821406679e-05, - "long_answer_loss": 0.1934, - "loss": 0.187, - "short_answer_loss": NaN, - "step": 424, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1715, - "grad_norm": 1.59375, - "learning_rate": 1.902351211338448e-05, - "long_answer_loss": 0.1715, - "loss": 0.179, - "short_answer_loss": NaN, - "step": 425, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.1962, - "grad_norm": 1.6875, - "learning_rate": 1.8995410775047573e-05, - "long_answer_loss": 0.1962, - "loss": 0.1918, - "short_answer_loss": NaN, - "step": 426, - "template_loss": 0.0 - }, - { - "epoch": 0.69, - "full_loss": 0.183, - "grad_norm": 1.7265625, - "learning_rate": 1.8967264393926355e-05, - "long_answer_loss": 0.183, - "loss": 0.1884, - "short_answer_loss": NaN, - "step": 427, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.2169, - "grad_norm": 1.6171875, - "learning_rate": 1.8939073165203462e-05, - "long_answer_loss": 0.2169, - "loss": 0.1868, - "short_answer_loss": NaN, - "step": 428, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1911, - "grad_norm": 1.640625, - "learning_rate": 1.8910837284372528e-05, - "long_answer_loss": 0.1911, - "loss": 0.1747, - "short_answer_loss": NaN, - "step": 429, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1722, - "grad_norm": 1.6484375, - "learning_rate": 1.8882556947236817e-05, - "long_answer_loss": 0.1722, - "loss": 0.1815, - "short_answer_loss": NaN, - "step": 430, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1825, - "grad_norm": 1.7421875, - "learning_rate": 1.8854232349907885e-05, - "long_answer_loss": 0.1825, - "loss": 0.1875, - "short_answer_loss": NaN, - "step": 431, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1976, - "grad_norm": 1.71875, - "learning_rate": 1.882586368880423e-05, - "long_answer_loss": 0.1976, - "loss": 0.1922, - "short_answer_loss": NaN, - "step": 432, - "template_loss": 0.0 - }, - { - "epoch": 0.7, - "full_loss": 0.1526, - "grad_norm": 1.671875, - "learning_rate": 1.8797451160649875e-05, - "long_answer_loss": 0.1526, - "loss": 0.188, - "short_answer_loss": NaN, - "step": 433, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1688, - "grad_norm": 1.7578125, - "learning_rate": 1.8768994962473085e-05, - "long_answer_loss": 0.1688, - "loss": 0.1917, - "short_answer_loss": NaN, - "step": 434, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.2068, - "grad_norm": 1.7109375, - "learning_rate": 1.8740495291604927e-05, - "long_answer_loss": 0.2068, - "loss": 0.1988, - "short_answer_loss": NaN, - "step": 435, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1973, - "grad_norm": 1.6796875, - "learning_rate": 1.8711952345677936e-05, - "long_answer_loss": 0.1973, - "loss": 0.1863, - "short_answer_loss": NaN, - "step": 436, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.2113, - "grad_norm": 1.8046875, - "learning_rate": 1.868336632262475e-05, - "long_answer_loss": 0.2113, - "loss": 0.1987, - "short_answer_loss": NaN, - "step": 437, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1502, - "grad_norm": 1.734375, - "learning_rate": 1.8654737420676722e-05, - "long_answer_loss": 0.1502, - "loss": 0.1935, - "short_answer_loss": NaN, - "step": 438, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.1857, - "grad_norm": 1.640625, - "learning_rate": 1.8626065838362554e-05, - "long_answer_loss": 0.1857, - "loss": 0.1895, - "short_answer_loss": NaN, - "step": 439, - "template_loss": 0.0 - }, - { - "epoch": 0.71, - "full_loss": 0.2016, - "grad_norm": 1.6328125, - "learning_rate": 1.8597351774506912e-05, - "long_answer_loss": 0.2016, - "loss": 0.1812, - "short_answer_loss": NaN, - "step": 440, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.2299, - "grad_norm": 1.625, - "learning_rate": 1.8568595428229057e-05, - "long_answer_loss": 0.2299, - "loss": 0.1816, - "short_answer_loss": NaN, - "step": 441, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.2218, - "grad_norm": 1.65625, - "learning_rate": 1.8539796998941456e-05, - "long_answer_loss": 0.2218, - "loss": 0.1856, - "short_answer_loss": NaN, - "step": 442, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.2028, - "grad_norm": 1.609375, - "learning_rate": 1.851095668634841e-05, - "long_answer_loss": 0.2028, - "loss": 0.1904, - "short_answer_loss": NaN, - "step": 443, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.2133, - "grad_norm": 1.71875, - "learning_rate": 1.8482074690444652e-05, - "long_answer_loss": 0.2133, - "loss": 0.1965, - "short_answer_loss": NaN, - "step": 444, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.2011, - "grad_norm": 1.8046875, - "learning_rate": 1.8453151211513984e-05, - "long_answer_loss": 0.2011, - "loss": 0.1925, - "short_answer_loss": NaN, - "step": 445, - "template_loss": 0.0 - }, - { - "epoch": 0.72, - "full_loss": 0.1733, - "grad_norm": 1.8125, - "learning_rate": 1.8424186450127857e-05, - "long_answer_loss": 0.1733, - "loss": 0.1821, - "short_answer_loss": NaN, - "step": 446, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1843, - "grad_norm": 1.703125, - "learning_rate": 1.839518060714401e-05, - "long_answer_loss": 0.1843, - "loss": 0.1801, - "short_answer_loss": NaN, - "step": 447, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1795, - "grad_norm": 1.875, - "learning_rate": 1.8366133883705063e-05, - "long_answer_loss": 0.1795, - "loss": 0.1842, - "short_answer_loss": NaN, - "step": 448, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.228, - "grad_norm": 1.9921875, - "learning_rate": 1.833704648123712e-05, - "long_answer_loss": 0.228, - "loss": 0.1947, - "short_answer_loss": NaN, - "step": 449, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.2036, - "grad_norm": 1.7421875, - "learning_rate": 1.830791860144838e-05, - "long_answer_loss": 0.2036, - "loss": 0.189, - "short_answer_loss": NaN, - "step": 450, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.1889, - "grad_norm": 1.84375, - "learning_rate": 1.827875044632773e-05, - "long_answer_loss": 0.1889, - "loss": 0.1839, - "short_answer_loss": NaN, - "step": 451, - "template_loss": 0.0 - }, - { - "epoch": 0.73, - "full_loss": 0.2054, - "grad_norm": 2.0, - "learning_rate": 1.824954221814335e-05, - "long_answer_loss": 0.2054, - "loss": 0.1839, - "short_answer_loss": NaN, - "step": 452, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.2043, - "grad_norm": 1.796875, - "learning_rate": 1.822029411944131e-05, - "long_answer_loss": 0.2043, - "loss": 0.1838, - "short_answer_loss": NaN, - "step": 453, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1935, - "grad_norm": 1.9765625, - "learning_rate": 1.8191006353044165e-05, - "long_answer_loss": 0.1935, - "loss": 0.2006, - "short_answer_loss": NaN, - "step": 454, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1797, - "grad_norm": 1.7734375, - "learning_rate": 1.8161679122049545e-05, - "long_answer_loss": 0.1797, - "loss": 0.1889, - "short_answer_loss": NaN, - "step": 455, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1899, - "grad_norm": 1.8046875, - "learning_rate": 1.8132312629828758e-05, - "long_answer_loss": 0.1899, - "loss": 0.2, - "short_answer_loss": NaN, - "step": 456, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1709, - "grad_norm": 1.703125, - "learning_rate": 1.810290708002535e-05, - "long_answer_loss": 0.1709, - "loss": 0.1758, - "short_answer_loss": NaN, - "step": 457, - "template_loss": 0.0 - }, - { - "epoch": 0.74, - "full_loss": 0.1712, - "grad_norm": 1.625, - "learning_rate": 1.807346267655374e-05, - "long_answer_loss": 0.1712, - "loss": 0.1902, - "short_answer_loss": NaN, - "step": 458, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1963, - "grad_norm": 1.703125, - "learning_rate": 1.8043979623597766e-05, - "long_answer_loss": 0.1963, - "loss": 0.1862, - "short_answer_loss": NaN, - "step": 459, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.162, - "grad_norm": 1.671875, - "learning_rate": 1.801445812560928e-05, - "long_answer_loss": 0.162, - "loss": 0.182, - "short_answer_loss": NaN, - "step": 460, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1656, - "grad_norm": 1.6015625, - "learning_rate": 1.7984898387306743e-05, - "long_answer_loss": 0.1656, - "loss": 0.1773, - "short_answer_loss": NaN, - "step": 461, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1905, - "grad_norm": 1.734375, - "learning_rate": 1.795530061367379e-05, - "long_answer_loss": 0.1905, - "loss": 0.1913, - "short_answer_loss": NaN, - "step": 462, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1724, - "grad_norm": 1.71875, - "learning_rate": 1.7925665009957815e-05, - "long_answer_loss": 0.1724, - "loss": 0.1764, - "short_answer_loss": NaN, - "step": 463, - "template_loss": 0.0 - }, - { - "epoch": 0.75, - "full_loss": 0.1785, - "grad_norm": 1.8359375, - "learning_rate": 1.7895991781668544e-05, - "long_answer_loss": 0.1785, - "loss": 0.1905, - "short_answer_loss": NaN, - "step": 464, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1875, - "grad_norm": 1.7890625, - "learning_rate": 1.786628113457662e-05, - "long_answer_loss": 0.1875, - "loss": 0.1954, - "short_answer_loss": NaN, - "step": 465, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1853, - "grad_norm": 1.9921875, - "learning_rate": 1.783653327471216e-05, - "long_answer_loss": 0.1853, - "loss": 0.1987, - "short_answer_loss": NaN, - "step": 466, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1805, - "grad_norm": 1.8828125, - "learning_rate": 1.7806748408363337e-05, - "long_answer_loss": 0.1805, - "loss": 0.1925, - "short_answer_loss": NaN, - "step": 467, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1832, - "grad_norm": 1.9140625, - "learning_rate": 1.7776926742074957e-05, - "long_answer_loss": 0.1832, - "loss": 0.1964, - "short_answer_loss": NaN, - "step": 468, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.1755, - "grad_norm": 1.6875, - "learning_rate": 1.7747068482647004e-05, - "long_answer_loss": 0.1755, - "loss": 0.1927, - "short_answer_loss": NaN, - "step": 469, - "template_loss": 0.0 - }, - { - "epoch": 0.76, - "full_loss": 0.175, - "grad_norm": 1.7421875, - "learning_rate": 1.7717173837133233e-05, - "long_answer_loss": 0.175, - "loss": 0.2062, - "short_answer_loss": NaN, - "step": 470, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.2087, - "grad_norm": 1.796875, - "learning_rate": 1.7687243012839698e-05, - "long_answer_loss": 0.2087, - "loss": 0.2025, - "short_answer_loss": NaN, - "step": 471, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1843, - "grad_norm": 1.7109375, - "learning_rate": 1.7657276217323364e-05, - "long_answer_loss": 0.1843, - "loss": 0.1831, - "short_answer_loss": NaN, - "step": 472, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1978, - "grad_norm": 1.6640625, - "learning_rate": 1.7627273658390625e-05, - "long_answer_loss": 0.1978, - "loss": 0.1911, - "short_answer_loss": NaN, - "step": 473, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.201, - "grad_norm": 1.7421875, - "learning_rate": 1.7597235544095873e-05, - "long_answer_loss": 0.201, - "loss": 0.1844, - "short_answer_loss": NaN, - "step": 474, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.223, - "grad_norm": 1.5703125, - "learning_rate": 1.7567162082740082e-05, - "long_answer_loss": 0.223, - "loss": 0.1894, - "short_answer_loss": NaN, - "step": 475, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.2016, - "grad_norm": 1.8515625, - "learning_rate": 1.7537053482869316e-05, - "long_answer_loss": 0.2016, - "loss": 0.1947, - "short_answer_loss": NaN, - "step": 476, - "template_loss": 0.0 - }, - { - "epoch": 0.77, - "full_loss": 0.1819, - "grad_norm": 1.8125, - "learning_rate": 1.7506909953273327e-05, - "long_answer_loss": 0.1819, - "loss": 0.1917, - "short_answer_loss": NaN, - "step": 477, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.2318, - "grad_norm": 1.78125, - "learning_rate": 1.7476731702984084e-05, - "long_answer_loss": 0.2318, - "loss": 0.199, - "short_answer_loss": NaN, - "step": 478, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1817, - "grad_norm": 1.734375, - "learning_rate": 1.744651894127433e-05, - "long_answer_loss": 0.1817, - "loss": 0.1928, - "short_answer_loss": NaN, - "step": 479, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1705, - "grad_norm": 1.859375, - "learning_rate": 1.741627187765612e-05, - "long_answer_loss": 0.1705, - "loss": 0.1837, - "short_answer_loss": NaN, - "step": 480, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1718, - "grad_norm": 1.6484375, - "learning_rate": 1.7385990721879393e-05, - "long_answer_loss": 0.1718, - "loss": 0.1797, - "short_answer_loss": NaN, - "step": 481, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1963, - "grad_norm": 1.6796875, - "learning_rate": 1.735567568393049e-05, - "long_answer_loss": 0.1963, - "loss": 0.1869, - "short_answer_loss": NaN, - "step": 482, - "template_loss": 0.0 - }, - { - "epoch": 0.78, - "full_loss": 0.1836, - "grad_norm": 1.8203125, - "learning_rate": 1.7325326974030717e-05, - "long_answer_loss": 0.1836, - "loss": 0.18, - "short_answer_loss": NaN, - "step": 483, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1609, - "grad_norm": 1.78125, - "learning_rate": 1.729494480263487e-05, - "long_answer_loss": 0.1609, - "loss": 0.1895, - "short_answer_loss": NaN, - "step": 484, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1826, - "grad_norm": 1.609375, - "learning_rate": 1.7264529380429805e-05, - "long_answer_loss": 0.1826, - "loss": 0.1836, - "short_answer_loss": NaN, - "step": 485, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.2017, - "grad_norm": 1.7421875, - "learning_rate": 1.723408091833294e-05, - "long_answer_loss": 0.2017, - "loss": 0.1864, - "short_answer_loss": NaN, - "step": 486, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1734, - "grad_norm": 1.7265625, - "learning_rate": 1.7203599627490803e-05, - "long_answer_loss": 0.1734, - "loss": 0.1876, - "short_answer_loss": NaN, - "step": 487, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1711, - "grad_norm": 1.6640625, - "learning_rate": 1.71730857192776e-05, - "long_answer_loss": 0.1711, - "loss": 0.1834, - "short_answer_loss": NaN, - "step": 488, - "template_loss": 0.0 - }, - { - "epoch": 0.79, - "full_loss": 0.1615, - "grad_norm": 1.6328125, - "learning_rate": 1.7142539405293706e-05, - "long_answer_loss": 0.1615, - "loss": 0.1786, - "short_answer_loss": NaN, - "step": 489, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1881, - "grad_norm": 1.75, - "learning_rate": 1.7111960897364226e-05, - "long_answer_loss": 0.1881, - "loss": 0.191, - "short_answer_loss": NaN, - "step": 490, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1699, - "grad_norm": 1.7421875, - "learning_rate": 1.7081350407537497e-05, - "long_answer_loss": 0.1699, - "loss": 0.1849, - "short_answer_loss": NaN, - "step": 491, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1865, - "grad_norm": 1.8046875, - "learning_rate": 1.7050708148083664e-05, - "long_answer_loss": 0.1865, - "loss": 0.1818, - "short_answer_loss": NaN, - "step": 492, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1678, - "grad_norm": 1.7578125, - "learning_rate": 1.7020034331493157e-05, - "long_answer_loss": 0.1678, - "loss": 0.1812, - "short_answer_loss": NaN, - "step": 493, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.1888, - "grad_norm": 1.703125, - "learning_rate": 1.698932917047525e-05, - "long_answer_loss": 0.1888, - "loss": 0.1779, - "short_answer_loss": NaN, - "step": 494, - "template_loss": 0.0 - }, - { - "epoch": 0.8, - "full_loss": 0.2053, - "grad_norm": 1.6953125, - "learning_rate": 1.6958592877956574e-05, - "long_answer_loss": 0.2053, - "loss": 0.1902, - "short_answer_loss": NaN, - "step": 495, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1983, - "grad_norm": 1.734375, - "learning_rate": 1.692782566707965e-05, - "long_answer_loss": 0.1983, - "loss": 0.195, - "short_answer_loss": NaN, - "step": 496, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.2063, - "grad_norm": 1.8359375, - "learning_rate": 1.68970277512014e-05, - "long_answer_loss": 0.2063, - "loss": 0.1885, - "short_answer_loss": NaN, - "step": 497, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1664, - "grad_norm": 1.6796875, - "learning_rate": 1.6866199343891665e-05, - "long_answer_loss": 0.1664, - "loss": 0.1824, - "short_answer_loss": NaN, - "step": 498, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1684, - "grad_norm": 1.6640625, - "learning_rate": 1.6835340658931735e-05, - "long_answer_loss": 0.1684, - "loss": 0.1917, - "short_answer_loss": NaN, - "step": 499, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1759, - "grad_norm": 1.59375, - "learning_rate": 1.680445191031287e-05, - "long_answer_loss": 0.1759, - "loss": 0.1744, - "short_answer_loss": NaN, - "step": 500, - "template_loss": 0.0 - }, - { - "epoch": 0.81, - "full_loss": 0.1767, - "grad_norm": 1.734375, - "learning_rate": 1.6773533312234794e-05, - "long_answer_loss": 0.1767, - "loss": 0.1707, - "short_answer_loss": NaN, - "step": 501, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.2144, - "grad_norm": 1.6328125, - "learning_rate": 1.6742585079104234e-05, - "long_answer_loss": 0.2144, - "loss": 0.1946, - "short_answer_loss": NaN, - "step": 502, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1822, - "grad_norm": 1.78125, - "learning_rate": 1.6711607425533422e-05, - "long_answer_loss": 0.1822, - "loss": 0.1989, - "short_answer_loss": NaN, - "step": 503, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1467, - "grad_norm": 1.75, - "learning_rate": 1.66806005663386e-05, - "long_answer_loss": 0.1467, - "loss": 0.1771, - "short_answer_loss": NaN, - "step": 504, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1862, - "grad_norm": 1.7421875, - "learning_rate": 1.664956471653855e-05, - "long_answer_loss": 0.1862, - "loss": 0.1844, - "short_answer_loss": NaN, - "step": 505, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1797, - "grad_norm": 1.765625, - "learning_rate": 1.661850009135308e-05, - "long_answer_loss": 0.1797, - "loss": 0.1909, - "short_answer_loss": NaN, - "step": 506, - "template_loss": 0.0 - }, - { - "epoch": 0.82, - "full_loss": 0.1752, - "grad_norm": 1.7578125, - "learning_rate": 1.6587406906201546e-05, - "long_answer_loss": 0.1752, - "loss": 0.1763, - "short_answer_loss": NaN, - "step": 507, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1691, - "grad_norm": 1.796875, - "learning_rate": 1.6556285376701357e-05, - "long_answer_loss": 0.1691, - "loss": 0.1866, - "short_answer_loss": NaN, - "step": 508, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1908, - "grad_norm": 1.7578125, - "learning_rate": 1.652513571866648e-05, - "long_answer_loss": 0.1908, - "loss": 0.1844, - "short_answer_loss": NaN, - "step": 509, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.2111, - "grad_norm": 1.71875, - "learning_rate": 1.6493958148105934e-05, - "long_answer_loss": 0.2111, - "loss": 0.1786, - "short_answer_loss": NaN, - "step": 510, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1876, - "grad_norm": 1.734375, - "learning_rate": 1.64627528812223e-05, - "long_answer_loss": 0.1876, - "loss": 0.1928, - "short_answer_loss": NaN, - "step": 511, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1758, - "grad_norm": 1.671875, - "learning_rate": 1.6431520134410222e-05, - "long_answer_loss": 0.1758, - "loss": 0.1751, - "short_answer_loss": NaN, - "step": 512, - "template_loss": 0.0 - }, - { - "epoch": 0.83, - "full_loss": 0.1475, - "grad_norm": 1.78125, - "learning_rate": 1.6400260124254905e-05, - "long_answer_loss": 0.1475, - "loss": 0.1894, - "short_answer_loss": NaN, - "step": 513, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1734, - "grad_norm": 1.6875, - "learning_rate": 1.6368973067530618e-05, - "long_answer_loss": 0.1734, - "loss": 0.1699, - "short_answer_loss": NaN, - "step": 514, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.2011, - "grad_norm": 1.71875, - "learning_rate": 1.6337659181199184e-05, - "long_answer_loss": 0.2011, - "loss": 0.191, - "short_answer_loss": NaN, - "step": 515, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.2202, - "grad_norm": 1.7109375, - "learning_rate": 1.6306318682408468e-05, - "long_answer_loss": 0.2202, - "loss": 0.1879, - "short_answer_loss": NaN, - "step": 516, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1634, - "grad_norm": 1.71875, - "learning_rate": 1.6274951788490887e-05, - "long_answer_loss": 0.1634, - "loss": 0.1849, - "short_answer_loss": NaN, - "step": 517, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.2103, - "grad_norm": 1.7734375, - "learning_rate": 1.6243558716961904e-05, - "long_answer_loss": 0.2103, - "loss": 0.1863, - "short_answer_loss": NaN, - "step": 518, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.1896, - "grad_norm": 1.6171875, - "learning_rate": 1.6212139685518503e-05, - "long_answer_loss": 0.1896, - "loss": 0.1803, - "short_answer_loss": NaN, - "step": 519, - "template_loss": 0.0 - }, - { - "epoch": 0.84, - "full_loss": 0.204, - "grad_norm": 1.71875, - "learning_rate": 1.6180694912037687e-05, - "long_answer_loss": 0.204, - "loss": 0.2032, - "short_answer_loss": NaN, - "step": 520, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1782, - "grad_norm": 1.6171875, - "learning_rate": 1.6149224614574974e-05, - "long_answer_loss": 0.1782, - "loss": 0.1894, - "short_answer_loss": NaN, - "step": 521, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1584, - "grad_norm": 1.734375, - "learning_rate": 1.6117729011362882e-05, - "long_answer_loss": 0.1584, - "loss": 0.1796, - "short_answer_loss": NaN, - "step": 522, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1931, - "grad_norm": 1.7265625, - "learning_rate": 1.6086208320809403e-05, - "long_answer_loss": 0.1931, - "loss": 0.1804, - "short_answer_loss": NaN, - "step": 523, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.174, - "grad_norm": 1.6015625, - "learning_rate": 1.6054662761496504e-05, - "long_answer_loss": 0.174, - "loss": 0.1843, - "short_answer_loss": NaN, - "step": 524, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.1966, - "grad_norm": 1.625, - "learning_rate": 1.6023092552178607e-05, - "long_answer_loss": 0.1966, - "loss": 0.1843, - "short_answer_loss": NaN, - "step": 525, - "template_loss": 0.0 - }, - { - "epoch": 0.85, - "full_loss": 0.201, - "grad_norm": 1.625, - "learning_rate": 1.5991497911781073e-05, - "long_answer_loss": 0.201, - "loss": 0.177, - "short_answer_loss": NaN, - "step": 526, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1943, - "grad_norm": 1.7265625, - "learning_rate": 1.595987905939868e-05, - "long_answer_loss": 0.1943, - "loss": 0.1752, - "short_answer_loss": NaN, - "step": 527, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1929, - "grad_norm": 1.828125, - "learning_rate": 1.5928236214294097e-05, - "long_answer_loss": 0.1929, - "loss": 0.1858, - "short_answer_loss": NaN, - "step": 528, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1678, - "grad_norm": 1.765625, - "learning_rate": 1.5896569595896384e-05, - "long_answer_loss": 0.1678, - "loss": 0.1783, - "short_answer_loss": NaN, - "step": 529, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1622, - "grad_norm": 1.8671875, - "learning_rate": 1.5864879423799454e-05, - "long_answer_loss": 0.1622, - "loss": 0.1778, - "short_answer_loss": NaN, - "step": 530, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1978, - "grad_norm": 1.6484375, - "learning_rate": 1.5833165917760544e-05, - "long_answer_loss": 0.1978, - "loss": 0.1749, - "short_answer_loss": NaN, - "step": 531, - "template_loss": 0.0 - }, - { - "epoch": 0.86, - "full_loss": 0.1788, - "grad_norm": 1.6796875, - "learning_rate": 1.5801429297698725e-05, - "long_answer_loss": 0.1788, - "loss": 0.1911, - "short_answer_loss": NaN, - "step": 532, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.195, - "grad_norm": 1.703125, - "learning_rate": 1.576966978369333e-05, - "long_answer_loss": 0.195, - "loss": 0.1905, - "short_answer_loss": NaN, - "step": 533, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1846, - "grad_norm": 1.7578125, - "learning_rate": 1.5737887595982466e-05, - "long_answer_loss": 0.1846, - "loss": 0.1779, - "short_answer_loss": NaN, - "step": 534, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.1965, - "grad_norm": 1.6484375, - "learning_rate": 1.570608295496146e-05, - "long_answer_loss": 0.1965, - "loss": 0.1819, - "short_answer_loss": NaN, - "step": 535, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.2053, - "grad_norm": 1.625, - "learning_rate": 1.567425608118136e-05, - "long_answer_loss": 0.2053, - "loss": 0.1809, - "short_answer_loss": NaN, - "step": 536, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.2033, - "grad_norm": 1.6328125, - "learning_rate": 1.5642407195347367e-05, - "long_answer_loss": 0.2033, - "loss": 0.1914, - "short_answer_loss": NaN, - "step": 537, - "template_loss": 0.0 - }, - { - "epoch": 0.87, - "full_loss": 0.181, - "grad_norm": 1.640625, - "learning_rate": 1.561053651831734e-05, - "long_answer_loss": 0.181, - "loss": 0.1728, - "short_answer_loss": NaN, - "step": 538, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1719, - "grad_norm": 1.734375, - "learning_rate": 1.557864427110025e-05, - "long_answer_loss": 0.1719, - "loss": 0.2032, - "short_answer_loss": NaN, - "step": 539, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1808, - "grad_norm": 1.765625, - "learning_rate": 1.5546730674854643e-05, - "long_answer_loss": 0.1808, - "loss": 0.1807, - "short_answer_loss": NaN, - "step": 540, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.189, - "grad_norm": 1.75, - "learning_rate": 1.5514795950887107e-05, - "long_answer_loss": 0.189, - "loss": 0.1861, - "short_answer_loss": NaN, - "step": 541, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.2012, - "grad_norm": 1.765625, - "learning_rate": 1.5482840320650756e-05, - "long_answer_loss": 0.2012, - "loss": 0.1855, - "short_answer_loss": NaN, - "step": 542, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1631, - "grad_norm": 1.71875, - "learning_rate": 1.545086400574367e-05, - "long_answer_loss": 0.1631, - "loss": 0.1741, - "short_answer_loss": NaN, - "step": 543, - "template_loss": 0.0 - }, - { - "epoch": 0.88, - "full_loss": 0.1732, - "grad_norm": 1.7734375, - "learning_rate": 1.541886722790736e-05, - "long_answer_loss": 0.1732, - "loss": 0.1891, - "short_answer_loss": NaN, - "step": 544, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.2084, - "grad_norm": 1.671875, - "learning_rate": 1.538685020902525e-05, - "long_answer_loss": 0.2084, - "loss": 0.1823, - "short_answer_loss": NaN, - "step": 545, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1735, - "grad_norm": 1.7734375, - "learning_rate": 1.5354813171121133e-05, - "long_answer_loss": 0.1735, - "loss": 0.1918, - "short_answer_loss": NaN, - "step": 546, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1942, - "grad_norm": 1.6640625, - "learning_rate": 1.53227563363576e-05, - "long_answer_loss": 0.1942, - "loss": 0.1803, - "short_answer_loss": NaN, - "step": 547, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1862, - "grad_norm": 1.84375, - "learning_rate": 1.529067992703455e-05, - "long_answer_loss": 0.1862, - "loss": 0.1843, - "short_answer_loss": NaN, - "step": 548, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1778, - "grad_norm": 1.71875, - "learning_rate": 1.5258584165587614e-05, - "long_answer_loss": 0.1778, - "loss": 0.1847, - "short_answer_loss": NaN, - "step": 549, - "template_loss": 0.0 - }, - { - "epoch": 0.89, - "full_loss": 0.1799, - "grad_norm": 1.7265625, - "learning_rate": 1.5226469274586617e-05, - "long_answer_loss": 0.1799, - "loss": 0.1859, - "short_answer_loss": NaN, - "step": 550, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1702, - "grad_norm": 1.75, - "learning_rate": 1.5194335476734036e-05, - "long_answer_loss": 0.1702, - "loss": 0.1678, - "short_answer_loss": NaN, - "step": 551, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1955, - "grad_norm": 1.71875, - "learning_rate": 1.5162182994863474e-05, - "long_answer_loss": 0.1955, - "loss": 0.1836, - "short_answer_loss": NaN, - "step": 552, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.2061, - "grad_norm": 1.7421875, - "learning_rate": 1.5130012051938089e-05, - "long_answer_loss": 0.2061, - "loss": 0.1799, - "short_answer_loss": NaN, - "step": 553, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1503, - "grad_norm": 1.6484375, - "learning_rate": 1.5097822871049055e-05, - "long_answer_loss": 0.1503, - "loss": 0.1681, - "short_answer_loss": NaN, - "step": 554, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1901, - "grad_norm": 1.6328125, - "learning_rate": 1.506561567541402e-05, - "long_answer_loss": 0.1901, - "loss": 0.1692, - "short_answer_loss": NaN, - "step": 555, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1438, - "grad_norm": 1.6328125, - "learning_rate": 1.503339068837557e-05, - "long_answer_loss": 0.1438, - "loss": 0.1724, - "short_answer_loss": NaN, - "step": 556, - "template_loss": 0.0 - }, - { - "epoch": 0.9, - "full_loss": 0.1826, - "grad_norm": 1.640625, - "learning_rate": 1.500114813339965e-05, - "long_answer_loss": 0.1826, - "loss": 0.1844, - "short_answer_loss": NaN, - "step": 557, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1934, - "grad_norm": 1.71875, - "learning_rate": 1.4968888234074027e-05, - "long_answer_loss": 0.1934, - "loss": 0.1834, - "short_answer_loss": NaN, - "step": 558, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1649, - "grad_norm": 1.6015625, - "learning_rate": 1.4936611214106763e-05, - "long_answer_loss": 0.1649, - "loss": 0.1748, - "short_answer_loss": NaN, - "step": 559, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.203, - "grad_norm": 1.75, - "learning_rate": 1.4904317297324633e-05, - "long_answer_loss": 0.203, - "loss": 0.1752, - "short_answer_loss": NaN, - "step": 560, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1788, - "grad_norm": 1.6328125, - "learning_rate": 1.487200670767158e-05, - "long_answer_loss": 0.1788, - "loss": 0.1727, - "short_answer_loss": NaN, - "step": 561, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.1531, - "grad_norm": 1.703125, - "learning_rate": 1.4839679669207168e-05, - "long_answer_loss": 0.1531, - "loss": 0.1823, - "short_answer_loss": NaN, - "step": 562, - "template_loss": 0.0 - }, - { - "epoch": 0.91, - "full_loss": 0.196, - "grad_norm": 1.5390625, - "learning_rate": 1.4807336406105032e-05, - "long_answer_loss": 0.196, - "loss": 0.168, - "short_answer_loss": NaN, - "step": 563, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1461, - "grad_norm": 1.546875, - "learning_rate": 1.4774977142651316e-05, - "long_answer_loss": 0.1461, - "loss": 0.1678, - "short_answer_loss": NaN, - "step": 564, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1903, - "grad_norm": 1.796875, - "learning_rate": 1.4742602103243113e-05, - "long_answer_loss": 0.1903, - "loss": 0.1829, - "short_answer_loss": NaN, - "step": 565, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.195, - "grad_norm": 1.7421875, - "learning_rate": 1.4710211512386927e-05, - "long_answer_loss": 0.195, - "loss": 0.1775, - "short_answer_loss": NaN, - "step": 566, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1732, - "grad_norm": 1.8671875, - "learning_rate": 1.467780559469709e-05, - "long_answer_loss": 0.1732, - "loss": 0.1705, - "short_answer_loss": NaN, - "step": 567, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.181, - "grad_norm": 1.7265625, - "learning_rate": 1.4645384574894224e-05, - "long_answer_loss": 0.181, - "loss": 0.1701, - "short_answer_loss": NaN, - "step": 568, - "template_loss": 0.0 - }, - { - "epoch": 0.92, - "full_loss": 0.1779, - "grad_norm": 1.7421875, - "learning_rate": 1.4612948677803687e-05, - "long_answer_loss": 0.1779, - "loss": 0.1809, - "short_answer_loss": NaN, - "step": 569, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1518, - "grad_norm": 1.6171875, - "learning_rate": 1.4580498128353992e-05, - "long_answer_loss": 0.1518, - "loss": 0.1701, - "short_answer_loss": NaN, - "step": 570, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1903, - "grad_norm": 1.7890625, - "learning_rate": 1.4548033151575264e-05, - "long_answer_loss": 0.1903, - "loss": 0.1726, - "short_answer_loss": NaN, - "step": 571, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.2106, - "grad_norm": 1.8203125, - "learning_rate": 1.4515553972597672e-05, - "long_answer_loss": 0.2106, - "loss": 0.1886, - "short_answer_loss": NaN, - "step": 572, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1501, - "grad_norm": 1.6796875, - "learning_rate": 1.4483060816649885e-05, - "long_answer_loss": 0.1501, - "loss": 0.1799, - "short_answer_loss": NaN, - "step": 573, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.1709, - "grad_norm": 1.609375, - "learning_rate": 1.4450553909057473e-05, - "long_answer_loss": 0.1709, - "loss": 0.1709, - "short_answer_loss": NaN, - "step": 574, - "template_loss": 0.0 - }, - { - "epoch": 0.93, - "full_loss": 0.2119, - "grad_norm": 1.703125, - "learning_rate": 1.4418033475241388e-05, - "long_answer_loss": 0.2119, - "loss": 0.1827, - "short_answer_loss": NaN, - "step": 575, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1803, - "grad_norm": 1.7421875, - "learning_rate": 1.4385499740716369e-05, - "long_answer_loss": 0.1803, - "loss": 0.1831, - "short_answer_loss": NaN, - "step": 576, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1662, - "grad_norm": 1.765625, - "learning_rate": 1.4352952931089392e-05, - "long_answer_loss": 0.1662, - "loss": 0.1784, - "short_answer_loss": NaN, - "step": 577, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.2044, - "grad_norm": 1.734375, - "learning_rate": 1.4320393272058103e-05, - "long_answer_loss": 0.2044, - "loss": 0.1848, - "short_answer_loss": NaN, - "step": 578, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1684, - "grad_norm": 1.734375, - "learning_rate": 1.4287820989409257e-05, - "long_answer_loss": 0.1684, - "loss": 0.1852, - "short_answer_loss": NaN, - "step": 579, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1639, - "grad_norm": 1.625, - "learning_rate": 1.4255236309017145e-05, - "long_answer_loss": 0.1639, - "loss": 0.1847, - "short_answer_loss": NaN, - "step": 580, - "template_loss": 0.0 - }, - { - "epoch": 0.94, - "full_loss": 0.1733, - "grad_norm": 1.6953125, - "learning_rate": 1.4222639456842024e-05, - "long_answer_loss": 0.1733, - "loss": 0.1781, - "short_answer_loss": NaN, - "step": 581, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1701, - "grad_norm": 1.7265625, - "learning_rate": 1.4190030658928568e-05, - "long_answer_loss": 0.1701, - "loss": 0.1802, - "short_answer_loss": NaN, - "step": 582, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.175, - "grad_norm": 1.8046875, - "learning_rate": 1.415741014140429e-05, - "long_answer_loss": 0.175, - "loss": 0.182, - "short_answer_loss": NaN, - "step": 583, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1588, - "grad_norm": 1.7578125, - "learning_rate": 1.4124778130477962e-05, - "long_answer_loss": 0.1588, - "loss": 0.1865, - "short_answer_loss": NaN, - "step": 584, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1698, - "grad_norm": 1.7421875, - "learning_rate": 1.4092134852438054e-05, - "long_answer_loss": 0.1698, - "loss": 0.1725, - "short_answer_loss": NaN, - "step": 585, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.2202, - "grad_norm": 1.71875, - "learning_rate": 1.4059480533651201e-05, - "long_answer_loss": 0.2202, - "loss": 0.1891, - "short_answer_loss": NaN, - "step": 586, - "template_loss": 0.0 - }, - { - "epoch": 0.95, - "full_loss": 0.1516, - "grad_norm": 1.6328125, - "learning_rate": 1.4026815400560561e-05, - "long_answer_loss": 0.1516, - "loss": 0.1634, - "short_answer_loss": NaN, - "step": 587, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1903, - "grad_norm": 1.65625, - "learning_rate": 1.3994139679684306e-05, - "long_answer_loss": 0.1903, - "loss": 0.1833, - "short_answer_loss": NaN, - "step": 588, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1829, - "grad_norm": 1.7109375, - "learning_rate": 1.3961453597614025e-05, - "long_answer_loss": 0.1829, - "loss": 0.1844, - "short_answer_loss": NaN, - "step": 589, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1826, - "grad_norm": 1.71875, - "learning_rate": 1.3928757381013158e-05, - "long_answer_loss": 0.1826, - "loss": 0.179, - "short_answer_loss": NaN, - "step": 590, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1558, - "grad_norm": 1.6875, - "learning_rate": 1.3896051256615423e-05, - "long_answer_loss": 0.1558, - "loss": 0.1763, - "short_answer_loss": NaN, - "step": 591, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.173, - "grad_norm": 1.640625, - "learning_rate": 1.3863335451223235e-05, - "long_answer_loss": 0.173, - "loss": 0.1724, - "short_answer_loss": NaN, - "step": 592, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.2119, - "grad_norm": 1.7265625, - "learning_rate": 1.3830610191706162e-05, - "long_answer_loss": 0.2119, - "loss": 0.178, - "short_answer_loss": NaN, - "step": 593, - "template_loss": 0.0 - }, - { - "epoch": 0.96, - "full_loss": 0.1676, - "grad_norm": 1.796875, - "learning_rate": 1.3797875704999319e-05, - "long_answer_loss": 0.1676, - "loss": 0.1809, - "short_answer_loss": NaN, - "step": 594, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1737, - "grad_norm": 1.6015625, - "learning_rate": 1.3765132218101806e-05, - "long_answer_loss": 0.1737, - "loss": 0.1718, - "short_answer_loss": NaN, - "step": 595, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1874, - "grad_norm": 1.6875, - "learning_rate": 1.3732379958075142e-05, - "long_answer_loss": 0.1874, - "loss": 0.1743, - "short_answer_loss": NaN, - "step": 596, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1611, - "grad_norm": 1.6171875, - "learning_rate": 1.3699619152041681e-05, - "long_answer_loss": 0.1611, - "loss": 0.1879, - "short_answer_loss": NaN, - "step": 597, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.173, - "grad_norm": 1.6796875, - "learning_rate": 1.366685002718304e-05, - "long_answer_loss": 0.173, - "loss": 0.1777, - "short_answer_loss": NaN, - "step": 598, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1729, - "grad_norm": 1.7109375, - "learning_rate": 1.3634072810738518e-05, - "long_answer_loss": 0.1729, - "loss": 0.1718, - "short_answer_loss": NaN, - "step": 599, - "template_loss": 0.0 - }, - { - "epoch": 0.97, - "full_loss": 0.1439, - "grad_norm": 1.7421875, - "learning_rate": 1.3601287730003534e-05, - "long_answer_loss": 0.1439, - "loss": 0.1749, - "short_answer_loss": NaN, - "step": 600, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1837, - "grad_norm": 1.7109375, - "learning_rate": 1.3568495012328041e-05, - "long_answer_loss": 0.1837, - "loss": 0.182, - "short_answer_loss": NaN, - "step": 601, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1871, - "grad_norm": 1.6484375, - "learning_rate": 1.3535694885114947e-05, - "long_answer_loss": 0.1871, - "loss": 0.1758, - "short_answer_loss": NaN, - "step": 602, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1669, - "grad_norm": 1.7578125, - "learning_rate": 1.350288757581854e-05, - "long_answer_loss": 0.1669, - "loss": 0.1647, - "short_answer_loss": NaN, - "step": 603, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1866, - "grad_norm": 1.703125, - "learning_rate": 1.3470073311942918e-05, - "long_answer_loss": 0.1866, - "loss": 0.1774, - "short_answer_loss": NaN, - "step": 604, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.1827, - "grad_norm": 1.6953125, - "learning_rate": 1.3437252321040406e-05, - "long_answer_loss": 0.1827, - "loss": 0.1828, - "short_answer_loss": NaN, - "step": 605, - "template_loss": 0.0 - }, - { - "epoch": 0.98, - "full_loss": 0.197, - "grad_norm": 1.671875, - "learning_rate": 1.3404424830709978e-05, - "long_answer_loss": 0.197, - "loss": 0.1771, - "short_answer_loss": NaN, - "step": 606, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1834, - "grad_norm": 1.640625, - "learning_rate": 1.337159106859567e-05, - "long_answer_loss": 0.1834, - "loss": 0.1801, - "short_answer_loss": NaN, - "step": 607, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1698, - "grad_norm": 1.7265625, - "learning_rate": 1.333875126238502e-05, - "long_answer_loss": 0.1698, - "loss": 0.18, - "short_answer_loss": NaN, - "step": 608, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1541, - "grad_norm": 1.6875, - "learning_rate": 1.330590563980747e-05, - "long_answer_loss": 0.1541, - "loss": 0.1847, - "short_answer_loss": NaN, - "step": 609, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1769, - "grad_norm": 1.6953125, - "learning_rate": 1.3273054428632814e-05, - "long_answer_loss": 0.1769, - "loss": 0.1792, - "short_answer_loss": NaN, - "step": 610, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.2038, - "grad_norm": 1.578125, - "learning_rate": 1.3240197856669576e-05, - "long_answer_loss": 0.2038, - "loss": 0.1751, - "short_answer_loss": NaN, - "step": 611, - "template_loss": 0.0 - }, - { - "epoch": 0.99, - "full_loss": 0.1696, - "grad_norm": 1.640625, - "learning_rate": 1.3207336151763469e-05, - "long_answer_loss": 0.1696, - "loss": 0.1659, - "short_answer_loss": NaN, - "step": 612, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1555, - "grad_norm": 1.5859375, - "learning_rate": 1.3174469541795806e-05, - "long_answer_loss": 0.1555, - "loss": 0.1661, - "short_answer_loss": NaN, - "step": 613, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1672, - "grad_norm": 1.6484375, - "learning_rate": 1.3141598254681903e-05, - "long_answer_loss": 0.1672, - "loss": 0.1765, - "short_answer_loss": NaN, - "step": 614, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1691, - "grad_norm": 1.828125, - "learning_rate": 1.3108722518369507e-05, - "long_answer_loss": 0.1691, - "loss": 0.1751, - "short_answer_loss": NaN, - "step": 615, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1399, - "grad_norm": 1.59375, - "learning_rate": 1.3075842560837229e-05, - "long_answer_loss": 0.1399, - "loss": 0.1428, - "short_answer_loss": NaN, - "step": 616, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.0948, - "grad_norm": 1.4765625, - "learning_rate": 1.3042958610092953e-05, - "long_answer_loss": 0.0948, - "loss": 0.0965, - "short_answer_loss": NaN, - "step": 617, - "template_loss": 0.0 - }, - { - "epoch": 1.0, - "full_loss": 0.1109, - "grad_norm": 1.421875, - "learning_rate": 1.3010070894172247e-05, - "long_answer_loss": 0.1109, - "loss": 0.1022, - "short_answer_loss": NaN, - "step": 618, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0929, - "grad_norm": 1.4296875, - "learning_rate": 1.297717964113678e-05, - "long_answer_loss": 0.0929, - "loss": 0.0954, - "short_answer_loss": NaN, - "step": 619, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0971, - "grad_norm": 1.46875, - "learning_rate": 1.294428507907278e-05, - "long_answer_loss": 0.0971, - "loss": 0.1051, - "short_answer_loss": NaN, - "step": 620, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0911, - "grad_norm": 1.5234375, - "learning_rate": 1.291138743608939e-05, - "long_answer_loss": 0.0911, - "loss": 0.0948, - "short_answer_loss": NaN, - "step": 621, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0939, - "grad_norm": 1.6640625, - "learning_rate": 1.2878486940317128e-05, - "long_answer_loss": 0.0939, - "loss": 0.0926, - "short_answer_loss": NaN, - "step": 622, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.0889, - "grad_norm": 1.8125, - "learning_rate": 1.2845583819906303e-05, - "long_answer_loss": 0.0889, - "loss": 0.0924, - "short_answer_loss": NaN, - "step": 623, - "template_loss": 0.0 - }, - { - "epoch": 1.01, - "full_loss": 0.1197, - "grad_norm": 1.75, - "learning_rate": 1.2812678303025419e-05, - "long_answer_loss": 0.1197, - "loss": 0.1079, - "short_answer_loss": NaN, - "step": 624, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.105, - "grad_norm": 1.7265625, - "learning_rate": 1.2779770617859593e-05, - "long_answer_loss": 0.105, - "loss": 0.0952, - "short_answer_loss": NaN, - "step": 625, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.1089, - "grad_norm": 1.625, - "learning_rate": 1.2746860992608987e-05, - "long_answer_loss": 0.1089, - "loss": 0.0861, - "short_answer_loss": NaN, - "step": 626, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.1044, - "grad_norm": 1.6640625, - "learning_rate": 1.2713949655487206e-05, - "long_answer_loss": 0.1044, - "loss": 0.1001, - "short_answer_loss": NaN, - "step": 627, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.1123, - "grad_norm": 1.5625, - "learning_rate": 1.2681036834719742e-05, - "long_answer_loss": 0.1123, - "loss": 0.1015, - "short_answer_loss": NaN, - "step": 628, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.1049, - "grad_norm": 1.625, - "learning_rate": 1.2648122758542358e-05, - "long_answer_loss": 0.1049, - "loss": 0.0966, - "short_answer_loss": NaN, - "step": 629, - "template_loss": 0.0 - }, - { - "epoch": 1.02, - "full_loss": 0.0878, - "grad_norm": 1.515625, - "learning_rate": 1.2615207655199534e-05, - "long_answer_loss": 0.0878, - "loss": 0.0913, - "short_answer_loss": NaN, - "step": 630, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0944, - "grad_norm": 1.5078125, - "learning_rate": 1.2582291752942876e-05, - "long_answer_loss": 0.0944, - "loss": 0.096, - "short_answer_loss": NaN, - "step": 631, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0822, - "grad_norm": 1.4609375, - "learning_rate": 1.2549375280029513e-05, - "long_answer_loss": 0.0822, - "loss": 0.0907, - "short_answer_loss": NaN, - "step": 632, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.1061, - "grad_norm": 1.46875, - "learning_rate": 1.2516458464720552e-05, - "long_answer_loss": 0.1061, - "loss": 0.0994, - "short_answer_loss": NaN, - "step": 633, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0888, - "grad_norm": 1.4140625, - "learning_rate": 1.2483541535279453e-05, - "long_answer_loss": 0.0888, - "loss": 0.0921, - "short_answer_loss": NaN, - "step": 634, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.0851, - "grad_norm": 1.4140625, - "learning_rate": 1.2450624719970487e-05, - "long_answer_loss": 0.0851, - "loss": 0.0929, - "short_answer_loss": NaN, - "step": 635, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.1028, - "grad_norm": 1.4375, - "learning_rate": 1.2417708247057127e-05, - "long_answer_loss": 0.1028, - "loss": 0.0945, - "short_answer_loss": NaN, - "step": 636, - "template_loss": 0.0 - }, - { - "epoch": 1.03, - "full_loss": 0.1081, - "grad_norm": 1.4921875, - "learning_rate": 1.2384792344800467e-05, - "long_answer_loss": 0.1081, - "loss": 0.0908, - "short_answer_loss": NaN, - "step": 637, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.1061, - "grad_norm": 1.4609375, - "learning_rate": 1.2351877241457644e-05, - "long_answer_loss": 0.1061, - "loss": 0.0998, - "short_answer_loss": NaN, - "step": 638, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0892, - "grad_norm": 1.3515625, - "learning_rate": 1.2318963165280264e-05, - "long_answer_loss": 0.0892, - "loss": 0.0832, - "short_answer_loss": NaN, - "step": 639, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0781, - "grad_norm": 1.5546875, - "learning_rate": 1.22860503445128e-05, - "long_answer_loss": 0.0781, - "loss": 0.0913, - "short_answer_loss": NaN, - "step": 640, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0827, - "grad_norm": 1.4921875, - "learning_rate": 1.2253139007391018e-05, - "long_answer_loss": 0.0827, - "loss": 0.0909, - "short_answer_loss": NaN, - "step": 641, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0894, - "grad_norm": 1.4453125, - "learning_rate": 1.222022938214041e-05, - "long_answer_loss": 0.0894, - "loss": 0.0957, - "short_answer_loss": NaN, - "step": 642, - "template_loss": 0.0 - }, - { - "epoch": 1.04, - "full_loss": 0.0977, - "grad_norm": 1.53125, - "learning_rate": 1.2187321696974584e-05, - "long_answer_loss": 0.0977, - "loss": 0.0945, - "short_answer_loss": NaN, - "step": 643, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.1224, - "grad_norm": 1.5, - "learning_rate": 1.21544161800937e-05, - "long_answer_loss": 0.1224, - "loss": 0.0942, - "short_answer_loss": NaN, - "step": 644, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0931, - "grad_norm": 1.484375, - "learning_rate": 1.2121513059682873e-05, - "long_answer_loss": 0.0931, - "loss": 0.0946, - "short_answer_loss": NaN, - "step": 645, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.1158, - "grad_norm": 1.640625, - "learning_rate": 1.2088612563910615e-05, - "long_answer_loss": 0.1158, - "loss": 0.1018, - "short_answer_loss": NaN, - "step": 646, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0816, - "grad_norm": 1.5, - "learning_rate": 1.2055714920927221e-05, - "long_answer_loss": 0.0816, - "loss": 0.0944, - "short_answer_loss": NaN, - "step": 647, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.0927, - "grad_norm": 1.4296875, - "learning_rate": 1.202282035886322e-05, - "long_answer_loss": 0.0927, - "loss": 0.0868, - "short_answer_loss": NaN, - "step": 648, - "template_loss": 0.0 - }, - { - "epoch": 1.05, - "full_loss": 0.096, - "grad_norm": 1.515625, - "learning_rate": 1.1989929105827757e-05, - "long_answer_loss": 0.096, - "loss": 0.0928, - "short_answer_loss": NaN, - "step": 649, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.1016, - "grad_norm": 1.5546875, - "learning_rate": 1.1957041389907051e-05, - "long_answer_loss": 0.1016, - "loss": 0.0926, - "short_answer_loss": NaN, - "step": 650, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.1134, - "grad_norm": 1.546875, - "learning_rate": 1.1924157439162774e-05, - "long_answer_loss": 0.1134, - "loss": 0.0901, - "short_answer_loss": NaN, - "step": 651, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0934, - "grad_norm": 1.515625, - "learning_rate": 1.1891277481630497e-05, - "long_answer_loss": 0.0934, - "loss": 0.0917, - "short_answer_loss": NaN, - "step": 652, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0942, - "grad_norm": 1.609375, - "learning_rate": 1.1858401745318105e-05, - "long_answer_loss": 0.0942, - "loss": 0.0914, - "short_answer_loss": NaN, - "step": 653, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0835, - "grad_norm": 1.4921875, - "learning_rate": 1.1825530458204193e-05, - "long_answer_loss": 0.0835, - "loss": 0.0885, - "short_answer_loss": NaN, - "step": 654, - "template_loss": 0.0 - }, - { - "epoch": 1.06, - "full_loss": 0.0952, - "grad_norm": 1.5390625, - "learning_rate": 1.1792663848236529e-05, - "long_answer_loss": 0.0952, - "loss": 0.0942, - "short_answer_loss": NaN, - "step": 655, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.1325, - "grad_norm": 1.7890625, - "learning_rate": 1.1759802143330429e-05, - "long_answer_loss": 0.1325, - "loss": 0.1, - "short_answer_loss": NaN, - "step": 656, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.086, - "grad_norm": 1.5, - "learning_rate": 1.172694557136719e-05, - "long_answer_loss": 0.086, - "loss": 0.093, - "short_answer_loss": NaN, - "step": 657, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0779, - "grad_norm": 1.5703125, - "learning_rate": 1.1694094360192532e-05, - "long_answer_loss": 0.0779, - "loss": 0.091, - "short_answer_loss": NaN, - "step": 658, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0932, - "grad_norm": 1.546875, - "learning_rate": 1.1661248737614989e-05, - "long_answer_loss": 0.0932, - "loss": 0.0948, - "short_answer_loss": NaN, - "step": 659, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.0828, - "grad_norm": 1.5859375, - "learning_rate": 1.1628408931404336e-05, - "long_answer_loss": 0.0828, - "loss": 0.0879, - "short_answer_loss": NaN, - "step": 660, - "template_loss": 0.0 - }, - { - "epoch": 1.07, - "full_loss": 0.1188, - "grad_norm": 1.4921875, - "learning_rate": 1.1595575169290022e-05, - "long_answer_loss": 0.1188, - "loss": 0.0959, - "short_answer_loss": NaN, - "step": 661, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0722, - "grad_norm": 1.5078125, - "learning_rate": 1.1562747678959593e-05, - "long_answer_loss": 0.0722, - "loss": 0.0897, - "short_answer_loss": NaN, - "step": 662, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.1015, - "grad_norm": 1.625, - "learning_rate": 1.1529926688057083e-05, - "long_answer_loss": 0.1015, - "loss": 0.0949, - "short_answer_loss": NaN, - "step": 663, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.098, - "grad_norm": 1.59375, - "learning_rate": 1.149711242418146e-05, - "long_answer_loss": 0.098, - "loss": 0.0972, - "short_answer_loss": NaN, - "step": 664, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0964, - "grad_norm": 1.53125, - "learning_rate": 1.1464305114885057e-05, - "long_answer_loss": 0.0964, - "loss": 0.0938, - "short_answer_loss": NaN, - "step": 665, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0898, - "grad_norm": 1.390625, - "learning_rate": 1.1431504987671963e-05, - "long_answer_loss": 0.0898, - "loss": 0.0874, - "short_answer_loss": NaN, - "step": 666, - "template_loss": 0.0 - }, - { - "epoch": 1.08, - "full_loss": 0.0903, - "grad_norm": 1.546875, - "learning_rate": 1.1398712269996468e-05, - "long_answer_loss": 0.0903, - "loss": 0.0954, - "short_answer_loss": NaN, - "step": 667, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0939, - "grad_norm": 1.53125, - "learning_rate": 1.1365927189261482e-05, - "long_answer_loss": 0.0939, - "loss": 0.0982, - "short_answer_loss": NaN, - "step": 668, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.1123, - "grad_norm": 1.5078125, - "learning_rate": 1.1333149972816961e-05, - "long_answer_loss": 0.1123, - "loss": 0.0886, - "short_answer_loss": NaN, - "step": 669, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0926, - "grad_norm": 1.5234375, - "learning_rate": 1.130038084795832e-05, - "long_answer_loss": 0.0926, - "loss": 0.0901, - "short_answer_loss": NaN, - "step": 670, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0919, - "grad_norm": 1.4921875, - "learning_rate": 1.126762004192486e-05, - "long_answer_loss": 0.0919, - "loss": 0.0873, - "short_answer_loss": NaN, - "step": 671, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0889, - "grad_norm": 1.546875, - "learning_rate": 1.1234867781898195e-05, - "long_answer_loss": 0.0889, - "loss": 0.0982, - "short_answer_loss": NaN, - "step": 672, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.0893, - "grad_norm": 1.4375, - "learning_rate": 1.1202124295000686e-05, - "long_answer_loss": 0.0893, - "loss": 0.0882, - "short_answer_loss": NaN, - "step": 673, - "template_loss": 0.0 - }, - { - "epoch": 1.09, - "full_loss": 0.1006, - "grad_norm": 1.546875, - "learning_rate": 1.1169389808293836e-05, - "long_answer_loss": 0.1006, - "loss": 0.0916, - "short_answer_loss": NaN, - "step": 674, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.1058, - "grad_norm": 1.5390625, - "learning_rate": 1.1136664548776766e-05, - "long_answer_loss": 0.1058, - "loss": 0.09, - "short_answer_loss": NaN, - "step": 675, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.1017, - "grad_norm": 1.46875, - "learning_rate": 1.1103948743384582e-05, - "long_answer_loss": 0.1017, - "loss": 0.0946, - "short_answer_loss": NaN, - "step": 676, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0869, - "grad_norm": 1.4765625, - "learning_rate": 1.1071242618986844e-05, - "long_answer_loss": 0.0869, - "loss": 0.0928, - "short_answer_loss": NaN, - "step": 677, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.083, - "grad_norm": 1.6015625, - "learning_rate": 1.1038546402385978e-05, - "long_answer_loss": 0.083, - "loss": 0.0897, - "short_answer_loss": NaN, - "step": 678, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0731, - "grad_norm": 1.53125, - "learning_rate": 1.1005860320315697e-05, - "long_answer_loss": 0.0731, - "loss": 0.086, - "short_answer_loss": NaN, - "step": 679, - "template_loss": 0.0 - }, - { - "epoch": 1.1, - "full_loss": 0.0819, - "grad_norm": 1.5859375, - "learning_rate": 1.0973184599439443e-05, - "long_answer_loss": 0.0819, - "loss": 0.0897, - "short_answer_loss": NaN, - "step": 680, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.1036, - "grad_norm": 1.578125, - "learning_rate": 1.09405194663488e-05, - "long_answer_loss": 0.1036, - "loss": 0.0921, - "short_answer_loss": NaN, - "step": 681, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0792, - "grad_norm": 1.4765625, - "learning_rate": 1.0907865147561944e-05, - "long_answer_loss": 0.0792, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 682, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.103, - "grad_norm": 1.53125, - "learning_rate": 1.0875221869522043e-05, - "long_answer_loss": 0.103, - "loss": 0.0947, - "short_answer_loss": NaN, - "step": 683, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0753, - "grad_norm": 1.453125, - "learning_rate": 1.0842589858595714e-05, - "long_answer_loss": 0.0753, - "loss": 0.0835, - "short_answer_loss": NaN, - "step": 684, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0965, - "grad_norm": 1.53125, - "learning_rate": 1.0809969341071433e-05, - "long_answer_loss": 0.0965, - "loss": 0.088, - "short_answer_loss": NaN, - "step": 685, - "template_loss": 0.0 - }, - { - "epoch": 1.11, - "full_loss": 0.0702, - "grad_norm": 1.46875, - "learning_rate": 1.0777360543157978e-05, - "long_answer_loss": 0.0702, - "loss": 0.0869, - "short_answer_loss": NaN, - "step": 686, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0926, - "grad_norm": 1.5, - "learning_rate": 1.074476369098286e-05, - "long_answer_loss": 0.0926, - "loss": 0.0914, - "short_answer_loss": NaN, - "step": 687, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0942, - "grad_norm": 1.53125, - "learning_rate": 1.0712179010590742e-05, - "long_answer_loss": 0.0942, - "loss": 0.09, - "short_answer_loss": NaN, - "step": 688, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0905, - "grad_norm": 1.390625, - "learning_rate": 1.0679606727941897e-05, - "long_answer_loss": 0.0905, - "loss": 0.0863, - "short_answer_loss": NaN, - "step": 689, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0831, - "grad_norm": 1.4453125, - "learning_rate": 1.0647047068910613e-05, - "long_answer_loss": 0.0831, - "loss": 0.0872, - "short_answer_loss": NaN, - "step": 690, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0856, - "grad_norm": 1.4921875, - "learning_rate": 1.0614500259283632e-05, - "long_answer_loss": 0.0856, - "loss": 0.0904, - "short_answer_loss": NaN, - "step": 691, - "template_loss": 0.0 - }, - { - "epoch": 1.12, - "full_loss": 0.0705, - "grad_norm": 1.5, - "learning_rate": 1.0581966524758615e-05, - "long_answer_loss": 0.0705, - "loss": 0.0858, - "short_answer_loss": NaN, - "step": 692, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0898, - "grad_norm": 1.5, - "learning_rate": 1.054944609094253e-05, - "long_answer_loss": 0.0898, - "loss": 0.0851, - "short_answer_loss": NaN, - "step": 693, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0935, - "grad_norm": 1.546875, - "learning_rate": 1.051693918335012e-05, - "long_answer_loss": 0.0935, - "loss": 0.0948, - "short_answer_loss": NaN, - "step": 694, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.069, - "grad_norm": 1.453125, - "learning_rate": 1.0484446027402326e-05, - "long_answer_loss": 0.069, - "loss": 0.0836, - "short_answer_loss": NaN, - "step": 695, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0909, - "grad_norm": 1.5234375, - "learning_rate": 1.0451966848424737e-05, - "long_answer_loss": 0.0909, - "loss": 0.0887, - "short_answer_loss": NaN, - "step": 696, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.0903, - "grad_norm": 1.796875, - "learning_rate": 1.0419501871646012e-05, - "long_answer_loss": 0.0903, - "loss": 0.098, - "short_answer_loss": NaN, - "step": 697, - "template_loss": 0.0 - }, - { - "epoch": 1.13, - "full_loss": 0.098, - "grad_norm": 1.4921875, - "learning_rate": 1.0387051322196314e-05, - "long_answer_loss": 0.098, - "loss": 0.0919, - "short_answer_loss": NaN, - "step": 698, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.097, - "grad_norm": 1.625, - "learning_rate": 1.0354615425105779e-05, - "long_answer_loss": 0.097, - "loss": 0.0908, - "short_answer_loss": NaN, - "step": 699, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0693, - "grad_norm": 1.5390625, - "learning_rate": 1.0322194405302917e-05, - "long_answer_loss": 0.0693, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 700, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.1016, - "grad_norm": 1.5546875, - "learning_rate": 1.0289788487613074e-05, - "long_answer_loss": 0.1016, - "loss": 0.0888, - "short_answer_loss": NaN, - "step": 701, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0705, - "grad_norm": 1.578125, - "learning_rate": 1.0257397896756884e-05, - "long_answer_loss": 0.0705, - "loss": 0.0971, - "short_answer_loss": NaN, - "step": 702, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.0892, - "grad_norm": 1.4375, - "learning_rate": 1.0225022857348685e-05, - "long_answer_loss": 0.0892, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 703, - "template_loss": 0.0 - }, - { - "epoch": 1.14, - "full_loss": 0.1117, - "grad_norm": 1.5078125, - "learning_rate": 1.0192663593894971e-05, - "long_answer_loss": 0.1117, - "loss": 0.0937, - "short_answer_loss": NaN, - "step": 704, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0962, - "grad_norm": 1.4296875, - "learning_rate": 1.0160320330792835e-05, - "long_answer_loss": 0.0962, - "loss": 0.0959, - "short_answer_loss": NaN, - "step": 705, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.078, - "grad_norm": 1.4296875, - "learning_rate": 1.0127993292328427e-05, - "long_answer_loss": 0.078, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 706, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0944, - "grad_norm": 1.484375, - "learning_rate": 1.0095682702675375e-05, - "long_answer_loss": 0.0944, - "loss": 0.0924, - "short_answer_loss": NaN, - "step": 707, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0743, - "grad_norm": 1.46875, - "learning_rate": 1.0063388785893236e-05, - "long_answer_loss": 0.0743, - "loss": 0.0943, - "short_answer_loss": NaN, - "step": 708, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.0889, - "grad_norm": 1.53125, - "learning_rate": 1.0031111765925974e-05, - "long_answer_loss": 0.0889, - "loss": 0.0868, - "short_answer_loss": NaN, - "step": 709, - "template_loss": 0.0 - }, - { - "epoch": 1.15, - "full_loss": 0.1036, - "grad_norm": 1.515625, - "learning_rate": 9.998851866600353e-06, - "long_answer_loss": 0.1036, - "loss": 0.0873, - "short_answer_loss": NaN, - "step": 710, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0933, - "grad_norm": 1.4765625, - "learning_rate": 9.966609311624431e-06, - "long_answer_loss": 0.0933, - "loss": 0.092, - "short_answer_loss": NaN, - "step": 711, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0835, - "grad_norm": 1.4921875, - "learning_rate": 9.93438432458598e-06, - "long_answer_loss": 0.0835, - "loss": 0.085, - "short_answer_loss": NaN, - "step": 712, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.092, - "grad_norm": 1.4921875, - "learning_rate": 9.90217712895095e-06, - "long_answer_loss": 0.092, - "loss": 0.0888, - "short_answer_loss": NaN, - "step": 713, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0753, - "grad_norm": 1.5859375, - "learning_rate": 9.869987948061916e-06, - "long_answer_loss": 0.0753, - "loss": 0.0919, - "short_answer_loss": NaN, - "step": 714, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0875, - "grad_norm": 1.7109375, - "learning_rate": 9.837817005136525e-06, - "long_answer_loss": 0.0875, - "loss": 0.0911, - "short_answer_loss": NaN, - "step": 715, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.0663, - "grad_norm": 1.515625, - "learning_rate": 9.805664523265965e-06, - "long_answer_loss": 0.0663, - "loss": 0.0869, - "short_answer_loss": NaN, - "step": 716, - "template_loss": 0.0 - }, - { - "epoch": 1.16, - "full_loss": 0.1043, - "grad_norm": 1.484375, - "learning_rate": 9.773530725413386e-06, - "long_answer_loss": 0.1043, - "loss": 0.0893, - "short_answer_loss": NaN, - "step": 717, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0888, - "grad_norm": 1.4765625, - "learning_rate": 9.741415834412389e-06, - "long_answer_loss": 0.0888, - "loss": 0.0877, - "short_answer_loss": NaN, - "step": 718, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.1257, - "grad_norm": 1.6328125, - "learning_rate": 9.709320072965451e-06, - "long_answer_loss": 0.1257, - "loss": 0.0957, - "short_answer_loss": NaN, - "step": 719, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0669, - "grad_norm": 1.53125, - "learning_rate": 9.677243663642402e-06, - "long_answer_loss": 0.0669, - "loss": 0.0948, - "short_answer_loss": NaN, - "step": 720, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0985, - "grad_norm": 1.421875, - "learning_rate": 9.645186828878875e-06, - "long_answer_loss": 0.0985, - "loss": 0.091, - "short_answer_loss": NaN, - "step": 721, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.0915, - "grad_norm": 1.625, - "learning_rate": 9.613149790974748e-06, - "long_answer_loss": 0.0915, - "loss": 0.0947, - "short_answer_loss": NaN, - "step": 722, - "template_loss": 0.0 - }, - { - "epoch": 1.17, - "full_loss": 0.1125, - "grad_norm": 1.5390625, - "learning_rate": 9.581132772092642e-06, - "long_answer_loss": 0.1125, - "loss": 0.0837, - "short_answer_loss": NaN, - "step": 723, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0838, - "grad_norm": 1.46875, - "learning_rate": 9.549135994256332e-06, - "long_answer_loss": 0.0838, - "loss": 0.0829, - "short_answer_loss": NaN, - "step": 724, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0866, - "grad_norm": 1.515625, - "learning_rate": 9.517159679349244e-06, - "long_answer_loss": 0.0866, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 725, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0817, - "grad_norm": 1.4765625, - "learning_rate": 9.485204049112894e-06, - "long_answer_loss": 0.0817, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 726, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.1044, - "grad_norm": 1.609375, - "learning_rate": 9.453269325145362e-06, - "long_answer_loss": 0.1044, - "loss": 0.0966, - "short_answer_loss": NaN, - "step": 727, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.076, - "grad_norm": 1.6875, - "learning_rate": 9.421355728899752e-06, - "long_answer_loss": 0.076, - "loss": 0.0877, - "short_answer_loss": NaN, - "step": 728, - "template_loss": 0.0 - }, - { - "epoch": 1.18, - "full_loss": 0.0922, - "grad_norm": 1.4765625, - "learning_rate": 9.38946348168266e-06, - "long_answer_loss": 0.0922, - "loss": 0.0925, - "short_answer_loss": NaN, - "step": 729, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.091, - "grad_norm": 1.5, - "learning_rate": 9.357592804652636e-06, - "long_answer_loss": 0.091, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 730, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0953, - "grad_norm": 1.453125, - "learning_rate": 9.325743918818644e-06, - "long_answer_loss": 0.0953, - "loss": 0.089, - "short_answer_loss": NaN, - "step": 731, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0762, - "grad_norm": 1.578125, - "learning_rate": 9.293917045038538e-06, - "long_answer_loss": 0.0762, - "loss": 0.0865, - "short_answer_loss": NaN, - "step": 732, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.1159, - "grad_norm": 1.6484375, - "learning_rate": 9.262112404017536e-06, - "long_answer_loss": 0.1159, - "loss": 0.0933, - "short_answer_loss": NaN, - "step": 733, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0838, - "grad_norm": 1.5078125, - "learning_rate": 9.230330216306674e-06, - "long_answer_loss": 0.0838, - "loss": 0.0895, - "short_answer_loss": NaN, - "step": 734, - "template_loss": 0.0 - }, - { - "epoch": 1.19, - "full_loss": 0.0787, - "grad_norm": 1.5078125, - "learning_rate": 9.198570702301276e-06, - "long_answer_loss": 0.0787, - "loss": 0.0869, - "short_answer_loss": NaN, - "step": 735, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0879, - "grad_norm": 1.5546875, - "learning_rate": 9.166834082239453e-06, - "long_answer_loss": 0.0879, - "loss": 0.0879, - "short_answer_loss": NaN, - "step": 736, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.085, - "grad_norm": 1.515625, - "learning_rate": 9.13512057620055e-06, - "long_answer_loss": 0.085, - "loss": 0.0868, - "short_answer_loss": NaN, - "step": 737, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0678, - "grad_norm": 1.5, - "learning_rate": 9.103430404103619e-06, - "long_answer_loss": 0.0678, - "loss": 0.0869, - "short_answer_loss": NaN, - "step": 738, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0784, - "grad_norm": 1.484375, - "learning_rate": 9.071763785705906e-06, - "long_answer_loss": 0.0784, - "loss": 0.0867, - "short_answer_loss": NaN, - "step": 739, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.1004, - "grad_norm": 1.4453125, - "learning_rate": 9.040120940601326e-06, - "long_answer_loss": 0.1004, - "loss": 0.0932, - "short_answer_loss": NaN, - "step": 740, - "template_loss": 0.0 - }, - { - "epoch": 1.2, - "full_loss": 0.0808, - "grad_norm": 1.4921875, - "learning_rate": 9.008502088218931e-06, - "long_answer_loss": 0.0808, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 741, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0878, - "grad_norm": 1.4375, - "learning_rate": 8.976907447821394e-06, - "long_answer_loss": 0.0878, - "loss": 0.0871, - "short_answer_loss": NaN, - "step": 742, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.1063, - "grad_norm": 1.828125, - "learning_rate": 8.9453372385035e-06, - "long_answer_loss": 0.1063, - "loss": 0.0984, - "short_answer_loss": NaN, - "step": 743, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0968, - "grad_norm": 1.53125, - "learning_rate": 8.913791679190604e-06, - "long_answer_loss": 0.0968, - "loss": 0.0886, - "short_answer_loss": NaN, - "step": 744, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0897, - "grad_norm": 1.5078125, - "learning_rate": 8.882270988637123e-06, - "long_answer_loss": 0.0897, - "loss": 0.0856, - "short_answer_loss": NaN, - "step": 745, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.0761, - "grad_norm": 1.5078125, - "learning_rate": 8.85077538542503e-06, - "long_answer_loss": 0.0761, - "loss": 0.0897, - "short_answer_loss": NaN, - "step": 746, - "template_loss": 0.0 - }, - { - "epoch": 1.21, - "full_loss": 0.093, - "grad_norm": 1.6171875, - "learning_rate": 8.819305087962316e-06, - "long_answer_loss": 0.093, - "loss": 0.0915, - "short_answer_loss": NaN, - "step": 747, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0806, - "grad_norm": 1.65625, - "learning_rate": 8.787860314481502e-06, - "long_answer_loss": 0.0806, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 748, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0772, - "grad_norm": 1.3984375, - "learning_rate": 8.756441283038094e-06, - "long_answer_loss": 0.0772, - "loss": 0.0854, - "short_answer_loss": NaN, - "step": 749, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0762, - "grad_norm": 1.453125, - "learning_rate": 8.725048211509113e-06, - "long_answer_loss": 0.0762, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 750, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0878, - "grad_norm": 1.5234375, - "learning_rate": 8.693681317591535e-06, - "long_answer_loss": 0.0878, - "loss": 0.0871, - "short_answer_loss": NaN, - "step": 751, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0957, - "grad_norm": 1.5625, - "learning_rate": 8.662340818800819e-06, - "long_answer_loss": 0.0957, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 752, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0688, - "grad_norm": 1.3828125, - "learning_rate": 8.631026932469383e-06, - "long_answer_loss": 0.0688, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 753, - "template_loss": 0.0 - }, - { - "epoch": 1.22, - "full_loss": 0.0761, - "grad_norm": 1.5234375, - "learning_rate": 8.599739875745096e-06, - "long_answer_loss": 0.0761, - "loss": 0.0885, - "short_answer_loss": NaN, - "step": 754, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0962, - "grad_norm": 1.4375, - "learning_rate": 8.56847986558978e-06, - "long_answer_loss": 0.0962, - "loss": 0.0855, - "short_answer_loss": NaN, - "step": 755, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.1052, - "grad_norm": 1.453125, - "learning_rate": 8.537247118777703e-06, - "long_answer_loss": 0.1052, - "loss": 0.0882, - "short_answer_loss": NaN, - "step": 756, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0654, - "grad_norm": 1.5625, - "learning_rate": 8.50604185189407e-06, - "long_answer_loss": 0.0654, - "loss": 0.0851, - "short_answer_loss": NaN, - "step": 757, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0946, - "grad_norm": 1.578125, - "learning_rate": 8.474864281333519e-06, - "long_answer_loss": 0.0946, - "loss": 0.0911, - "short_answer_loss": NaN, - "step": 758, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0872, - "grad_norm": 1.3984375, - "learning_rate": 8.443714623298644e-06, - "long_answer_loss": 0.0872, - "loss": 0.079, - "short_answer_loss": NaN, - "step": 759, - "template_loss": 0.0 - }, - { - "epoch": 1.23, - "full_loss": 0.0908, - "grad_norm": 1.53125, - "learning_rate": 8.412593093798457e-06, - "long_answer_loss": 0.0908, - "loss": 0.0877, - "short_answer_loss": NaN, - "step": 760, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0874, - "grad_norm": 1.3984375, - "learning_rate": 8.381499908646925e-06, - "long_answer_loss": 0.0874, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 761, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0735, - "grad_norm": 1.53125, - "learning_rate": 8.350435283461452e-06, - "long_answer_loss": 0.0735, - "loss": 0.086, - "short_answer_loss": NaN, - "step": 762, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0816, - "grad_norm": 1.5234375, - "learning_rate": 8.319399433661402e-06, - "long_answer_loss": 0.0816, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 763, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0738, - "grad_norm": 1.484375, - "learning_rate": 8.288392574466583e-06, - "long_answer_loss": 0.0738, - "loss": 0.09, - "short_answer_loss": NaN, - "step": 764, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0955, - "grad_norm": 1.5390625, - "learning_rate": 8.25741492089577e-06, - "long_answer_loss": 0.0955, - "loss": 0.0844, - "short_answer_loss": NaN, - "step": 765, - "template_loss": 0.0 - }, - { - "epoch": 1.24, - "full_loss": 0.0802, - "grad_norm": 1.421875, - "learning_rate": 8.22646668776521e-06, - "long_answer_loss": 0.0802, - "loss": 0.0868, - "short_answer_loss": NaN, - "step": 766, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0749, - "grad_norm": 1.4296875, - "learning_rate": 8.195548089687138e-06, - "long_answer_loss": 0.0749, - "loss": 0.0829, - "short_answer_loss": NaN, - "step": 767, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0843, - "grad_norm": 1.5625, - "learning_rate": 8.16465934106827e-06, - "long_answer_loss": 0.0843, - "loss": 0.0995, - "short_answer_loss": NaN, - "step": 768, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0919, - "grad_norm": 1.46875, - "learning_rate": 8.13380065610834e-06, - "long_answer_loss": 0.0919, - "loss": 0.0909, - "short_answer_loss": NaN, - "step": 769, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0775, - "grad_norm": 1.546875, - "learning_rate": 8.102972248798602e-06, - "long_answer_loss": 0.0775, - "loss": 0.0862, - "short_answer_loss": NaN, - "step": 770, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0987, - "grad_norm": 1.4453125, - "learning_rate": 8.07217433292035e-06, - "long_answer_loss": 0.0987, - "loss": 0.0893, - "short_answer_loss": NaN, - "step": 771, - "template_loss": 0.0 - }, - { - "epoch": 1.25, - "full_loss": 0.0897, - "grad_norm": 1.5390625, - "learning_rate": 8.041407122043429e-06, - "long_answer_loss": 0.0897, - "loss": 0.0954, - "short_answer_loss": NaN, - "step": 772, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.09, - "grad_norm": 1.46875, - "learning_rate": 8.010670829524753e-06, - "long_answer_loss": 0.09, - "loss": 0.0899, - "short_answer_loss": NaN, - "step": 773, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0913, - "grad_norm": 1.6015625, - "learning_rate": 7.979965668506848e-06, - "long_answer_loss": 0.0913, - "loss": 0.091, - "short_answer_loss": NaN, - "step": 774, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.1082, - "grad_norm": 1.484375, - "learning_rate": 7.949291851916342e-06, - "long_answer_loss": 0.1082, - "loss": 0.0851, - "short_answer_loss": NaN, - "step": 775, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.1277, - "grad_norm": 1.546875, - "learning_rate": 7.918649592462502e-06, - "long_answer_loss": 0.1277, - "loss": 0.091, - "short_answer_loss": NaN, - "step": 776, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.0676, - "grad_norm": 1.46875, - "learning_rate": 7.888039102635778e-06, - "long_answer_loss": 0.0676, - "loss": 0.0814, - "short_answer_loss": NaN, - "step": 777, - "template_loss": 0.0 - }, - { - "epoch": 1.26, - "full_loss": 0.1, - "grad_norm": 1.453125, - "learning_rate": 7.857460594706296e-06, - "long_answer_loss": 0.1, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 778, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.1002, - "grad_norm": 1.5234375, - "learning_rate": 7.826914280722402e-06, - "long_answer_loss": 0.1002, - "loss": 0.0871, - "short_answer_loss": NaN, - "step": 779, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0749, - "grad_norm": 1.4609375, - "learning_rate": 7.796400372509199e-06, - "long_answer_loss": 0.0749, - "loss": 0.0829, - "short_answer_loss": NaN, - "step": 780, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.071, - "grad_norm": 1.3984375, - "learning_rate": 7.765919081667068e-06, - "long_answer_loss": 0.071, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 781, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0827, - "grad_norm": 1.4765625, - "learning_rate": 7.735470619570195e-06, - "long_answer_loss": 0.0827, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 782, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0791, - "grad_norm": 1.4921875, - "learning_rate": 7.705055197365126e-06, - "long_answer_loss": 0.0791, - "loss": 0.0852, - "short_answer_loss": NaN, - "step": 783, - "template_loss": 0.0 - }, - { - "epoch": 1.27, - "full_loss": 0.0813, - "grad_norm": 1.484375, - "learning_rate": 7.674673025969287e-06, - "long_answer_loss": 0.0813, - "loss": 0.0823, - "short_answer_loss": NaN, - "step": 784, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0715, - "grad_norm": 1.5, - "learning_rate": 7.644324316069512e-06, - "long_answer_loss": 0.0715, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 785, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0808, - "grad_norm": 1.46875, - "learning_rate": 7.614009278120611e-06, - "long_answer_loss": 0.0808, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 786, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0748, - "grad_norm": 1.5703125, - "learning_rate": 7.583728122343886e-06, - "long_answer_loss": 0.0748, - "loss": 0.0921, - "short_answer_loss": NaN, - "step": 787, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0905, - "grad_norm": 1.546875, - "learning_rate": 7.553481058725677e-06, - "long_answer_loss": 0.0905, - "loss": 0.0895, - "short_answer_loss": NaN, - "step": 788, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0696, - "grad_norm": 1.515625, - "learning_rate": 7.523268297015916e-06, - "long_answer_loss": 0.0696, - "loss": 0.0877, - "short_answer_loss": NaN, - "step": 789, - "template_loss": 0.0 - }, - { - "epoch": 1.28, - "full_loss": 0.0886, - "grad_norm": 1.4921875, - "learning_rate": 7.4930900467266755e-06, - "long_answer_loss": 0.0886, - "loss": 0.0864, - "short_answer_loss": NaN, - "step": 790, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0733, - "grad_norm": 1.421875, - "learning_rate": 7.462946517130686e-06, - "long_answer_loss": 0.0733, - "loss": 0.0835, - "short_answer_loss": NaN, - "step": 791, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0834, - "grad_norm": 1.4453125, - "learning_rate": 7.432837917259922e-06, - "long_answer_loss": 0.0834, - "loss": 0.0801, - "short_answer_loss": NaN, - "step": 792, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0858, - "grad_norm": 1.4921875, - "learning_rate": 7.402764455904126e-06, - "long_answer_loss": 0.0858, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 793, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0693, - "grad_norm": 1.453125, - "learning_rate": 7.37272634160938e-06, - "long_answer_loss": 0.0693, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 794, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0837, - "grad_norm": 1.5703125, - "learning_rate": 7.342723782676637e-06, - "long_answer_loss": 0.0837, - "loss": 0.0886, - "short_answer_loss": NaN, - "step": 795, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0897, - "grad_norm": 1.5078125, - "learning_rate": 7.312756987160299e-06, - "long_answer_loss": 0.0897, - "loss": 0.0894, - "short_answer_loss": NaN, - "step": 796, - "template_loss": 0.0 - }, - { - "epoch": 1.29, - "full_loss": 0.0807, - "grad_norm": 1.4765625, - "learning_rate": 7.282826162866771e-06, - "long_answer_loss": 0.0807, - "loss": 0.0812, - "short_answer_loss": NaN, - "step": 797, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0864, - "grad_norm": 1.5234375, - "learning_rate": 7.252931517352994e-06, - "long_answer_loss": 0.0864, - "loss": 0.092, - "short_answer_loss": NaN, - "step": 798, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0942, - "grad_norm": 1.40625, - "learning_rate": 7.223073257925047e-06, - "long_answer_loss": 0.0942, - "loss": 0.0787, - "short_answer_loss": NaN, - "step": 799, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0842, - "grad_norm": 1.53125, - "learning_rate": 7.193251591636665e-06, - "long_answer_loss": 0.0842, - "loss": 0.086, - "short_answer_loss": NaN, - "step": 800, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0653, - "grad_norm": 1.515625, - "learning_rate": 7.163466725287844e-06, - "long_answer_loss": 0.0653, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 801, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.0801, - "grad_norm": 1.4609375, - "learning_rate": 7.133718865423388e-06, - "long_answer_loss": 0.0801, - "loss": 0.0912, - "short_answer_loss": NaN, - "step": 802, - "template_loss": 0.0 - }, - { - "epoch": 1.3, - "full_loss": 0.1055, - "grad_norm": 1.484375, - "learning_rate": 7.104008218331457e-06, - "long_answer_loss": 0.1055, - "loss": 0.0875, - "short_answer_loss": NaN, - "step": 803, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0916, - "grad_norm": 1.4296875, - "learning_rate": 7.074334990042189e-06, - "long_answer_loss": 0.0916, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 804, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0818, - "grad_norm": 1.4921875, - "learning_rate": 7.044699386326212e-06, - "long_answer_loss": 0.0818, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 805, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0928, - "grad_norm": 1.453125, - "learning_rate": 7.015101612693259e-06, - "long_answer_loss": 0.0928, - "loss": 0.0891, - "short_answer_loss": NaN, - "step": 806, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0782, - "grad_norm": 1.484375, - "learning_rate": 6.985541874390725e-06, - "long_answer_loss": 0.0782, - "loss": 0.0847, - "short_answer_loss": NaN, - "step": 807, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0712, - "grad_norm": 1.5234375, - "learning_rate": 6.956020376402239e-06, - "long_answer_loss": 0.0712, - "loss": 0.0875, - "short_answer_loss": NaN, - "step": 808, - "template_loss": 0.0 - }, - { - "epoch": 1.31, - "full_loss": 0.0711, - "grad_norm": 1.4375, - "learning_rate": 6.926537323446264e-06, - "long_answer_loss": 0.0711, - "loss": 0.082, - "short_answer_loss": NaN, - "step": 809, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.081, - "grad_norm": 1.4921875, - "learning_rate": 6.897092919974652e-06, - "long_answer_loss": 0.081, - "loss": 0.0894, - "short_answer_loss": NaN, - "step": 810, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0808, - "grad_norm": 1.5234375, - "learning_rate": 6.867687370171247e-06, - "long_answer_loss": 0.0808, - "loss": 0.0906, - "short_answer_loss": NaN, - "step": 811, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0892, - "grad_norm": 1.46875, - "learning_rate": 6.838320877950458e-06, - "long_answer_loss": 0.0892, - "loss": 0.0892, - "short_answer_loss": NaN, - "step": 812, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0722, - "grad_norm": 1.484375, - "learning_rate": 6.808993646955838e-06, - "long_answer_loss": 0.0722, - "loss": 0.087, - "short_answer_loss": NaN, - "step": 813, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0721, - "grad_norm": 1.40625, - "learning_rate": 6.7797058805586926e-06, - "long_answer_loss": 0.0721, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 814, - "template_loss": 0.0 - }, - { - "epoch": 1.32, - "full_loss": 0.0864, - "grad_norm": 1.53125, - "learning_rate": 6.750457781856658e-06, - "long_answer_loss": 0.0864, - "loss": 0.0867, - "short_answer_loss": NaN, - "step": 815, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.083, - "grad_norm": 1.4765625, - "learning_rate": 6.721249553672271e-06, - "long_answer_loss": 0.083, - "loss": 0.0883, - "short_answer_loss": NaN, - "step": 816, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0823, - "grad_norm": 1.4609375, - "learning_rate": 6.6920813985516195e-06, - "long_answer_loss": 0.0823, - "loss": 0.0804, - "short_answer_loss": NaN, - "step": 817, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0912, - "grad_norm": 1.5078125, - "learning_rate": 6.6629535187628825e-06, - "long_answer_loss": 0.0912, - "loss": 0.088, - "short_answer_loss": NaN, - "step": 818, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0763, - "grad_norm": 1.53125, - "learning_rate": 6.633866116294939e-06, - "long_answer_loss": 0.0763, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 819, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0721, - "grad_norm": 1.53125, - "learning_rate": 6.60481939285599e-06, - "long_answer_loss": 0.0721, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 820, - "template_loss": 0.0 - }, - { - "epoch": 1.33, - "full_loss": 0.0835, - "grad_norm": 1.5546875, - "learning_rate": 6.575813549872148e-06, - "long_answer_loss": 0.0835, - "loss": 0.0957, - "short_answer_loss": NaN, - "step": 821, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0824, - "grad_norm": 1.5625, - "learning_rate": 6.546848788486021e-06, - "long_answer_loss": 0.0824, - "loss": 0.0887, - "short_answer_loss": NaN, - "step": 822, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0673, - "grad_norm": 1.5625, - "learning_rate": 6.517925309555348e-06, - "long_answer_loss": 0.0673, - "loss": 0.0811, - "short_answer_loss": NaN, - "step": 823, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0776, - "grad_norm": 1.5234375, - "learning_rate": 6.489043313651591e-06, - "long_answer_loss": 0.0776, - "loss": 0.0906, - "short_answer_loss": NaN, - "step": 824, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0924, - "grad_norm": 1.4296875, - "learning_rate": 6.460203001058543e-06, - "long_answer_loss": 0.0924, - "loss": 0.0833, - "short_answer_loss": NaN, - "step": 825, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0745, - "grad_norm": 1.453125, - "learning_rate": 6.431404571770948e-06, - "long_answer_loss": 0.0745, - "loss": 0.088, - "short_answer_loss": NaN, - "step": 826, - "template_loss": 0.0 - }, - { - "epoch": 1.34, - "full_loss": 0.0834, - "grad_norm": 1.4453125, - "learning_rate": 6.402648225493091e-06, - "long_answer_loss": 0.0834, - "loss": 0.09, - "short_answer_loss": NaN, - "step": 827, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0724, - "grad_norm": 1.4296875, - "learning_rate": 6.373934161637449e-06, - "long_answer_loss": 0.0724, - "loss": 0.0869, - "short_answer_loss": NaN, - "step": 828, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0672, - "grad_norm": 1.4453125, - "learning_rate": 6.345262579323278e-06, - "long_answer_loss": 0.0672, - "loss": 0.09, - "short_answer_loss": NaN, - "step": 829, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.1102, - "grad_norm": 1.4453125, - "learning_rate": 6.31663367737525e-06, - "long_answer_loss": 0.1102, - "loss": 0.0882, - "short_answer_loss": NaN, - "step": 830, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.1072, - "grad_norm": 1.3671875, - "learning_rate": 6.288047654322068e-06, - "long_answer_loss": 0.1072, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 831, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.1034, - "grad_norm": 1.40625, - "learning_rate": 6.259504708395078e-06, - "long_answer_loss": 0.1034, - "loss": 0.089, - "short_answer_loss": NaN, - "step": 832, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.0705, - "grad_norm": 1.3828125, - "learning_rate": 6.231005037526916e-06, - "long_answer_loss": 0.0705, - "loss": 0.0857, - "short_answer_loss": NaN, - "step": 833, - "template_loss": 0.0 - }, - { - "epoch": 1.35, - "full_loss": 0.09, - "grad_norm": 1.4609375, - "learning_rate": 6.202548839350126e-06, - "long_answer_loss": 0.09, - "loss": 0.0854, - "short_answer_loss": NaN, - "step": 834, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0819, - "grad_norm": 1.359375, - "learning_rate": 6.174136311195777e-06, - "long_answer_loss": 0.0819, - "loss": 0.078, - "short_answer_loss": NaN, - "step": 835, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0914, - "grad_norm": 1.3984375, - "learning_rate": 6.1457676500921104e-06, - "long_answer_loss": 0.0914, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 836, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0717, - "grad_norm": 1.53125, - "learning_rate": 6.117443052763188e-06, - "long_answer_loss": 0.0717, - "loss": 0.0852, - "short_answer_loss": NaN, - "step": 837, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0772, - "grad_norm": 1.3828125, - "learning_rate": 6.089162715627474e-06, - "long_answer_loss": 0.0772, - "loss": 0.08, - "short_answer_loss": NaN, - "step": 838, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.089, - "grad_norm": 1.4375, - "learning_rate": 6.060926834796535e-06, - "long_answer_loss": 0.089, - "loss": 0.0844, - "short_answer_loss": NaN, - "step": 839, - "template_loss": 0.0 - }, - { - "epoch": 1.36, - "full_loss": 0.0994, - "grad_norm": 1.5234375, - "learning_rate": 6.0327356060736475e-06, - "long_answer_loss": 0.0994, - "loss": 0.0851, - "short_answer_loss": NaN, - "step": 840, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0764, - "grad_norm": 1.3828125, - "learning_rate": 6.004589224952431e-06, - "long_answer_loss": 0.0764, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 841, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0858, - "grad_norm": 1.5390625, - "learning_rate": 5.976487886615526e-06, - "long_answer_loss": 0.0858, - "loss": 0.0863, - "short_answer_loss": NaN, - "step": 842, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0862, - "grad_norm": 1.484375, - "learning_rate": 5.94843178593321e-06, - "long_answer_loss": 0.0862, - "loss": 0.0879, - "short_answer_loss": NaN, - "step": 843, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0812, - "grad_norm": 1.5, - "learning_rate": 5.9204211174620645e-06, - "long_answer_loss": 0.0812, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 844, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0947, - "grad_norm": 1.46875, - "learning_rate": 5.892456075443616e-06, - "long_answer_loss": 0.0947, - "loss": 0.0863, - "short_answer_loss": NaN, - "step": 845, - "template_loss": 0.0 - }, - { - "epoch": 1.37, - "full_loss": 0.0915, - "grad_norm": 1.4765625, - "learning_rate": 5.864536853802983e-06, - "long_answer_loss": 0.0915, - "loss": 0.0858, - "short_answer_loss": NaN, - "step": 846, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0862, - "grad_norm": 1.40625, - "learning_rate": 5.836663646147554e-06, - "long_answer_loss": 0.0862, - "loss": 0.0796, - "short_answer_loss": NaN, - "step": 847, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0778, - "grad_norm": 1.375, - "learning_rate": 5.808836645765628e-06, - "long_answer_loss": 0.0778, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 848, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0924, - "grad_norm": 1.5625, - "learning_rate": 5.781056045625065e-06, - "long_answer_loss": 0.0924, - "loss": 0.0926, - "short_answer_loss": NaN, - "step": 849, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0687, - "grad_norm": 1.4765625, - "learning_rate": 5.753322038371975e-06, - "long_answer_loss": 0.0687, - "loss": 0.0858, - "short_answer_loss": NaN, - "step": 850, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.0741, - "grad_norm": 1.375, - "learning_rate": 5.7256348163293585e-06, - "long_answer_loss": 0.0741, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 851, - "template_loss": 0.0 - }, - { - "epoch": 1.38, - "full_loss": 0.078, - "grad_norm": 1.4921875, - "learning_rate": 5.697994571495786e-06, - "long_answer_loss": 0.078, - "loss": 0.0831, - "short_answer_loss": NaN, - "step": 852, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.093, - "grad_norm": 1.484375, - "learning_rate": 5.670401495544065e-06, - "long_answer_loss": 0.093, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 853, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0877, - "grad_norm": 1.421875, - "learning_rate": 5.642855779819893e-06, - "long_answer_loss": 0.0877, - "loss": 0.0885, - "short_answer_loss": NaN, - "step": 854, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0833, - "grad_norm": 1.453125, - "learning_rate": 5.615357615340558e-06, - "long_answer_loss": 0.0833, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 855, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.1065, - "grad_norm": 1.484375, - "learning_rate": 5.587907192793601e-06, - "long_answer_loss": 0.1065, - "loss": 0.0852, - "short_answer_loss": NaN, - "step": 856, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0855, - "grad_norm": 1.4296875, - "learning_rate": 5.560504702535489e-06, - "long_answer_loss": 0.0855, - "loss": 0.0799, - "short_answer_loss": NaN, - "step": 857, - "template_loss": 0.0 - }, - { - "epoch": 1.39, - "full_loss": 0.0911, - "grad_norm": 1.4375, - "learning_rate": 5.533150334590305e-06, - "long_answer_loss": 0.0911, - "loss": 0.0865, - "short_answer_loss": NaN, - "step": 858, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0909, - "grad_norm": 1.359375, - "learning_rate": 5.505844278648424e-06, - "long_answer_loss": 0.0909, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 859, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0901, - "grad_norm": 1.5078125, - "learning_rate": 5.47858672406519e-06, - "long_answer_loss": 0.0901, - "loss": 0.0878, - "short_answer_loss": NaN, - "step": 860, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0916, - "grad_norm": 1.6015625, - "learning_rate": 5.451377859859623e-06, - "long_answer_loss": 0.0916, - "loss": 0.0936, - "short_answer_loss": NaN, - "step": 861, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0748, - "grad_norm": 1.5390625, - "learning_rate": 5.424217874713096e-06, - "long_answer_loss": 0.0748, - "loss": 0.0841, - "short_answer_loss": NaN, - "step": 862, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.0997, - "grad_norm": 1.4375, - "learning_rate": 5.397106956968012e-06, - "long_answer_loss": 0.0997, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 863, - "template_loss": 0.0 - }, - { - "epoch": 1.4, - "full_loss": 0.072, - "grad_norm": 1.5234375, - "learning_rate": 5.370045294626544e-06, - "long_answer_loss": 0.072, - "loss": 0.0854, - "short_answer_loss": NaN, - "step": 864, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.095, - "grad_norm": 1.4140625, - "learning_rate": 5.343033075349266e-06, - "long_answer_loss": 0.095, - "loss": 0.0821, - "short_answer_loss": NaN, - "step": 865, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.09, - "grad_norm": 1.40625, - "learning_rate": 5.31607048645391e-06, - "long_answer_loss": 0.09, - "loss": 0.0806, - "short_answer_loss": NaN, - "step": 866, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0883, - "grad_norm": 1.4375, - "learning_rate": 5.289157714914039e-06, - "long_answer_loss": 0.0883, - "loss": 0.0823, - "short_answer_loss": NaN, - "step": 867, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0823, - "grad_norm": 1.3984375, - "learning_rate": 5.262294947357744e-06, - "long_answer_loss": 0.0823, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 868, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0869, - "grad_norm": 1.546875, - "learning_rate": 5.235482370066372e-06, - "long_answer_loss": 0.0869, - "loss": 0.0864, - "short_answer_loss": NaN, - "step": 869, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0882, - "grad_norm": 1.515625, - "learning_rate": 5.208720168973219e-06, - "long_answer_loss": 0.0882, - "loss": 0.0884, - "short_answer_loss": NaN, - "step": 870, - "template_loss": 0.0 - }, - { - "epoch": 1.41, - "full_loss": 0.0815, - "grad_norm": 1.3984375, - "learning_rate": 5.1820085296622455e-06, - "long_answer_loss": 0.0815, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 871, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0988, - "grad_norm": 1.4296875, - "learning_rate": 5.15534763736679e-06, - "long_answer_loss": 0.0988, - "loss": 0.08, - "short_answer_loss": NaN, - "step": 872, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0807, - "grad_norm": 1.515625, - "learning_rate": 5.128737676968274e-06, - "long_answer_loss": 0.0807, - "loss": 0.0867, - "short_answer_loss": NaN, - "step": 873, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0773, - "grad_norm": 1.421875, - "learning_rate": 5.102178832994941e-06, - "long_answer_loss": 0.0773, - "loss": 0.0835, - "short_answer_loss": NaN, - "step": 874, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0802, - "grad_norm": 1.4609375, - "learning_rate": 5.0756712896205625e-06, - "long_answer_loss": 0.0802, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 875, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0838, - "grad_norm": 1.4609375, - "learning_rate": 5.049215230663152e-06, - "long_answer_loss": 0.0838, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 876, - "template_loss": 0.0 - }, - { - "epoch": 1.42, - "full_loss": 0.0724, - "grad_norm": 1.390625, - "learning_rate": 5.022810839583707e-06, - "long_answer_loss": 0.0724, - "loss": 0.0781, - "short_answer_loss": NaN, - "step": 877, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.1062, - "grad_norm": 1.5390625, - "learning_rate": 4.99645829948495e-06, - "long_answer_loss": 0.1062, - "loss": 0.0844, - "short_answer_loss": NaN, - "step": 878, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0807, - "grad_norm": 1.546875, - "learning_rate": 4.970157793110011e-06, - "long_answer_loss": 0.0807, - "loss": 0.0853, - "short_answer_loss": NaN, - "step": 879, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.077, - "grad_norm": 1.40625, - "learning_rate": 4.9439095028412055e-06, - "long_answer_loss": 0.077, - "loss": 0.0821, - "short_answer_loss": NaN, - "step": 880, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0782, - "grad_norm": 1.4375, - "learning_rate": 4.917713610698754e-06, - "long_answer_loss": 0.0782, - "loss": 0.0853, - "short_answer_loss": NaN, - "step": 881, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0888, - "grad_norm": 1.421875, - "learning_rate": 4.891570298339508e-06, - "long_answer_loss": 0.0888, - "loss": 0.0869, - "short_answer_loss": NaN, - "step": 882, - "template_loss": 0.0 - }, - { - "epoch": 1.43, - "full_loss": 0.0763, - "grad_norm": 1.390625, - "learning_rate": 4.8654797470557125e-06, - "long_answer_loss": 0.0763, - "loss": 0.0803, - "short_answer_loss": NaN, - "step": 883, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0933, - "grad_norm": 1.4296875, - "learning_rate": 4.8394421377737354e-06, - "long_answer_loss": 0.0933, - "loss": 0.0853, - "short_answer_loss": NaN, - "step": 884, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0856, - "grad_norm": 1.5234375, - "learning_rate": 4.813457651052815e-06, - "long_answer_loss": 0.0856, - "loss": 0.0894, - "short_answer_loss": NaN, - "step": 885, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0769, - "grad_norm": 1.34375, - "learning_rate": 4.7875264670838096e-06, - "long_answer_loss": 0.0769, - "loss": 0.0781, - "short_answer_loss": NaN, - "step": 886, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.072, - "grad_norm": 1.4296875, - "learning_rate": 4.76164876568794e-06, - "long_answer_loss": 0.072, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 887, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0695, - "grad_norm": 1.5234375, - "learning_rate": 4.735824726315555e-06, - "long_answer_loss": 0.0695, - "loss": 0.0893, - "short_answer_loss": NaN, - "step": 888, - "template_loss": 0.0 - }, - { - "epoch": 1.44, - "full_loss": 0.0804, - "grad_norm": 1.4921875, - "learning_rate": 4.710054528044885e-06, - "long_answer_loss": 0.0804, - "loss": 0.0894, - "short_answer_loss": NaN, - "step": 889, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0988, - "grad_norm": 1.4921875, - "learning_rate": 4.68433834958078e-06, - "long_answer_loss": 0.0988, - "loss": 0.0898, - "short_answer_loss": NaN, - "step": 890, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.084, - "grad_norm": 1.4765625, - "learning_rate": 4.658676369253513e-06, - "long_answer_loss": 0.084, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 891, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0776, - "grad_norm": 1.3359375, - "learning_rate": 4.633068765017493e-06, - "long_answer_loss": 0.0776, - "loss": 0.082, - "short_answer_loss": NaN, - "step": 892, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0767, - "grad_norm": 1.5703125, - "learning_rate": 4.607515714450064e-06, - "long_answer_loss": 0.0767, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 893, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0933, - "grad_norm": 1.3515625, - "learning_rate": 4.582017394750271e-06, - "long_answer_loss": 0.0933, - "loss": 0.0855, - "short_answer_loss": NaN, - "step": 894, - "template_loss": 0.0 - }, - { - "epoch": 1.45, - "full_loss": 0.0737, - "grad_norm": 1.4609375, - "learning_rate": 4.556573982737611e-06, - "long_answer_loss": 0.0737, - "loss": 0.0886, - "short_answer_loss": NaN, - "step": 895, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0906, - "grad_norm": 1.3984375, - "learning_rate": 4.531185654850829e-06, - "long_answer_loss": 0.0906, - "loss": 0.0856, - "short_answer_loss": NaN, - "step": 896, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.1176, - "grad_norm": 1.421875, - "learning_rate": 4.505852587146689e-06, - "long_answer_loss": 0.1176, - "loss": 0.0853, - "short_answer_loss": NaN, - "step": 897, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0898, - "grad_norm": 1.4453125, - "learning_rate": 4.480574955298746e-06, - "long_answer_loss": 0.0898, - "loss": 0.0841, - "short_answer_loss": NaN, - "step": 898, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0729, - "grad_norm": 1.421875, - "learning_rate": 4.4553529345961285e-06, - "long_answer_loss": 0.0729, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 899, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.0803, - "grad_norm": 1.34375, - "learning_rate": 4.430186699942337e-06, - "long_answer_loss": 0.0803, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 900, - "template_loss": 0.0 - }, - { - "epoch": 1.46, - "full_loss": 0.072, - "grad_norm": 1.3515625, - "learning_rate": 4.405076425854007e-06, - "long_answer_loss": 0.072, - "loss": 0.0785, - "short_answer_loss": NaN, - "step": 901, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0764, - "grad_norm": 1.4375, - "learning_rate": 4.380022286459727e-06, - "long_answer_loss": 0.0764, - "loss": 0.0803, - "short_answer_loss": NaN, - "step": 902, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0782, - "grad_norm": 1.4296875, - "learning_rate": 4.3550244554987975e-06, - "long_answer_loss": 0.0782, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 903, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0868, - "grad_norm": 1.484375, - "learning_rate": 4.33008310632006e-06, - "long_answer_loss": 0.0868, - "loss": 0.0886, - "short_answer_loss": NaN, - "step": 904, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0722, - "grad_norm": 1.4609375, - "learning_rate": 4.305198411880686e-06, - "long_answer_loss": 0.0722, - "loss": 0.084, - "short_answer_loss": NaN, - "step": 905, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0877, - "grad_norm": 1.453125, - "learning_rate": 4.280370544744949e-06, - "long_answer_loss": 0.0877, - "loss": 0.0865, - "short_answer_loss": NaN, - "step": 906, - "template_loss": 0.0 - }, - { - "epoch": 1.47, - "full_loss": 0.0978, - "grad_norm": 1.46875, - "learning_rate": 4.25559967708307e-06, - "long_answer_loss": 0.0978, - "loss": 0.086, - "short_answer_loss": NaN, - "step": 907, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0803, - "grad_norm": 1.4609375, - "learning_rate": 4.230885980669999e-06, - "long_answer_loss": 0.0803, - "loss": 0.0876, - "short_answer_loss": NaN, - "step": 908, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.1023, - "grad_norm": 1.5, - "learning_rate": 4.2062296268842196e-06, - "long_answer_loss": 0.1023, - "loss": 0.0847, - "short_answer_loss": NaN, - "step": 909, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0721, - "grad_norm": 1.3984375, - "learning_rate": 4.181630786706582e-06, - "long_answer_loss": 0.0721, - "loss": 0.0744, - "short_answer_loss": NaN, - "step": 910, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.106, - "grad_norm": 1.5, - "learning_rate": 4.157089630719099e-06, - "long_answer_loss": 0.106, - "loss": 0.0985, - "short_answer_loss": NaN, - "step": 911, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0823, - "grad_norm": 1.375, - "learning_rate": 4.1326063291037705e-06, - "long_answer_loss": 0.0823, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 912, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0964, - "grad_norm": 1.5078125, - "learning_rate": 4.108181051641408e-06, - "long_answer_loss": 0.0964, - "loss": 0.0892, - "short_answer_loss": NaN, - "step": 913, - "template_loss": 0.0 - }, - { - "epoch": 1.48, - "full_loss": 0.0932, - "grad_norm": 1.453125, - "learning_rate": 4.083813967710435e-06, - "long_answer_loss": 0.0932, - "loss": 0.0842, - "short_answer_loss": NaN, - "step": 914, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0904, - "grad_norm": 1.5390625, - "learning_rate": 4.059505246285743e-06, - "long_answer_loss": 0.0904, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 915, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0775, - "grad_norm": 1.4296875, - "learning_rate": 4.035255055937507e-06, - "long_answer_loss": 0.0775, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 916, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0812, - "grad_norm": 1.4375, - "learning_rate": 4.011063564829997e-06, - "long_answer_loss": 0.0812, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 917, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0924, - "grad_norm": 1.453125, - "learning_rate": 3.986930940720461e-06, - "long_answer_loss": 0.0924, - "loss": 0.0813, - "short_answer_loss": NaN, - "step": 918, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.071, - "grad_norm": 1.375, - "learning_rate": 3.962857350957896e-06, - "long_answer_loss": 0.071, - "loss": 0.0817, - "short_answer_loss": NaN, - "step": 919, - "template_loss": 0.0 - }, - { - "epoch": 1.49, - "full_loss": 0.0876, - "grad_norm": 1.484375, - "learning_rate": 3.938842962481945e-06, - "long_answer_loss": 0.0876, - "loss": 0.0802, - "short_answer_loss": NaN, - "step": 920, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0865, - "grad_norm": 1.4375, - "learning_rate": 3.9148879418217115e-06, - "long_answer_loss": 0.0865, - "loss": 0.0901, - "short_answer_loss": NaN, - "step": 921, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.08, - "grad_norm": 1.4296875, - "learning_rate": 3.890992455094601e-06, - "long_answer_loss": 0.08, - "loss": 0.0829, - "short_answer_loss": NaN, - "step": 922, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0704, - "grad_norm": 1.46875, - "learning_rate": 3.867156668005188e-06, - "long_answer_loss": 0.0704, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 923, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0806, - "grad_norm": 1.390625, - "learning_rate": 3.8433807458440525e-06, - "long_answer_loss": 0.0806, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 924, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0928, - "grad_norm": 1.5625, - "learning_rate": 3.8196648534866386e-06, - "long_answer_loss": 0.0928, - "loss": 0.0868, - "short_answer_loss": NaN, - "step": 925, - "template_loss": 0.0 - }, - { - "epoch": 1.5, - "full_loss": 0.0807, - "grad_norm": 1.3984375, - "learning_rate": 3.7960091553921094e-06, - "long_answer_loss": 0.0807, - "loss": 0.0868, - "short_answer_loss": NaN, - "step": 926, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0838, - "grad_norm": 1.515625, - "learning_rate": 3.7724138156022132e-06, - "long_answer_loss": 0.0838, - "loss": 0.0835, - "short_answer_loss": NaN, - "step": 927, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.09, - "grad_norm": 1.5078125, - "learning_rate": 3.7488789977401282e-06, - "long_answer_loss": 0.09, - "loss": 0.0887, - "short_answer_loss": NaN, - "step": 928, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.087, - "grad_norm": 1.5078125, - "learning_rate": 3.725404865009348e-06, - "long_answer_loss": 0.087, - "loss": 0.0896, - "short_answer_loss": NaN, - "step": 929, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0751, - "grad_norm": 1.3671875, - "learning_rate": 3.7019915801925474e-06, - "long_answer_loss": 0.0751, - "loss": 0.0813, - "short_answer_loss": NaN, - "step": 930, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.0687, - "grad_norm": 1.328125, - "learning_rate": 3.6786393056504277e-06, - "long_answer_loss": 0.0687, - "loss": 0.0814, - "short_answer_loss": NaN, - "step": 931, - "template_loss": 0.0 - }, - { - "epoch": 1.51, - "full_loss": 0.089, - "grad_norm": 1.4765625, - "learning_rate": 3.6553482033206422e-06, - "long_answer_loss": 0.089, - "loss": 0.0932, - "short_answer_loss": NaN, - "step": 932, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.1103, - "grad_norm": 1.5078125, - "learning_rate": 3.6321184347166124e-06, - "long_answer_loss": 0.1103, - "loss": 0.0907, - "short_answer_loss": NaN, - "step": 933, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.092, - "grad_norm": 1.6875, - "learning_rate": 3.608950160926454e-06, - "long_answer_loss": 0.092, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 934, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0665, - "grad_norm": 1.390625, - "learning_rate": 3.5858435426118426e-06, - "long_answer_loss": 0.0665, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 935, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0871, - "grad_norm": 1.3671875, - "learning_rate": 3.5627987400068924e-06, - "long_answer_loss": 0.0871, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 936, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0774, - "grad_norm": 1.53125, - "learning_rate": 3.5398159129170577e-06, - "long_answer_loss": 0.0774, - "loss": 0.0856, - "short_answer_loss": NaN, - "step": 937, - "template_loss": 0.0 - }, - { - "epoch": 1.52, - "full_loss": 0.0776, - "grad_norm": 1.40625, - "learning_rate": 3.516895220718025e-06, - "long_answer_loss": 0.0776, - "loss": 0.08, - "short_answer_loss": NaN, - "step": 938, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0764, - "grad_norm": 1.484375, - "learning_rate": 3.4940368223545975e-06, - "long_answer_loss": 0.0764, - "loss": 0.0811, - "short_answer_loss": NaN, - "step": 939, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0876, - "grad_norm": 1.40625, - "learning_rate": 3.4712408763396013e-06, - "long_answer_loss": 0.0876, - "loss": 0.0881, - "short_answer_loss": NaN, - "step": 940, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0918, - "grad_norm": 1.4375, - "learning_rate": 3.4485075407527755e-06, - "long_answer_loss": 0.0918, - "loss": 0.0906, - "short_answer_loss": NaN, - "step": 941, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0601, - "grad_norm": 1.390625, - "learning_rate": 3.4258369732396934e-06, - "long_answer_loss": 0.0601, - "loss": 0.0804, - "short_answer_loss": NaN, - "step": 942, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0877, - "grad_norm": 1.4765625, - "learning_rate": 3.40322933101066e-06, - "long_answer_loss": 0.0877, - "loss": 0.0827, - "short_answer_loss": NaN, - "step": 943, - "template_loss": 0.0 - }, - { - "epoch": 1.53, - "full_loss": 0.0845, - "grad_norm": 1.4453125, - "learning_rate": 3.3806847708396054e-06, - "long_answer_loss": 0.0845, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 944, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0625, - "grad_norm": 1.3671875, - "learning_rate": 3.3582034490630366e-06, - "long_answer_loss": 0.0625, - "loss": 0.0862, - "short_answer_loss": NaN, - "step": 945, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0842, - "grad_norm": 1.4453125, - "learning_rate": 3.3357855215789214e-06, - "long_answer_loss": 0.0842, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 946, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0917, - "grad_norm": 1.4140625, - "learning_rate": 3.3134311438456064e-06, - "long_answer_loss": 0.0917, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 947, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0973, - "grad_norm": 1.3828125, - "learning_rate": 3.291140470880759e-06, - "long_answer_loss": 0.0973, - "loss": 0.0817, - "short_answer_loss": NaN, - "step": 948, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0791, - "grad_norm": 1.5859375, - "learning_rate": 3.268913657260285e-06, - "long_answer_loss": 0.0791, - "loss": 0.0956, - "short_answer_loss": NaN, - "step": 949, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.0989, - "grad_norm": 1.484375, - "learning_rate": 3.2467508571172437e-06, - "long_answer_loss": 0.0989, - "loss": 0.0851, - "short_answer_loss": NaN, - "step": 950, - "template_loss": 0.0 - }, - { - "epoch": 1.54, - "full_loss": 0.1028, - "grad_norm": 1.4296875, - "learning_rate": 3.2246522241407965e-06, - "long_answer_loss": 0.1028, - "loss": 0.0879, - "short_answer_loss": NaN, - "step": 951, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0698, - "grad_norm": 1.46875, - "learning_rate": 3.2026179115751344e-06, - "long_answer_loss": 0.0698, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 952, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0667, - "grad_norm": 1.40625, - "learning_rate": 3.1806480722184134e-06, - "long_answer_loss": 0.0667, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 953, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0943, - "grad_norm": 1.484375, - "learning_rate": 3.1587428584216998e-06, - "long_answer_loss": 0.0943, - "loss": 0.0903, - "short_answer_loss": NaN, - "step": 954, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0777, - "grad_norm": 1.40625, - "learning_rate": 3.136902422087898e-06, - "long_answer_loss": 0.0777, - "loss": 0.0794, - "short_answer_loss": NaN, - "step": 955, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0694, - "grad_norm": 1.328125, - "learning_rate": 3.115126914670724e-06, - "long_answer_loss": 0.0694, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 956, - "template_loss": 0.0 - }, - { - "epoch": 1.55, - "full_loss": 0.0919, - "grad_norm": 1.4375, - "learning_rate": 3.093416487173638e-06, - "long_answer_loss": 0.0919, - "loss": 0.0886, - "short_answer_loss": NaN, - "step": 957, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0752, - "grad_norm": 1.3828125, - "learning_rate": 3.071771290148788e-06, - "long_answer_loss": 0.0752, - "loss": 0.0802, - "short_answer_loss": NaN, - "step": 958, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.076, - "grad_norm": 1.453125, - "learning_rate": 3.050191473695997e-06, - "long_answer_loss": 0.076, - "loss": 0.0844, - "short_answer_loss": NaN, - "step": 959, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.1016, - "grad_norm": 1.390625, - "learning_rate": 3.028677187461687e-06, - "long_answer_loss": 0.1016, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 960, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0936, - "grad_norm": 1.4453125, - "learning_rate": 3.007228580637862e-06, - "long_answer_loss": 0.0936, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 961, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0809, - "grad_norm": 1.4296875, - "learning_rate": 2.985845801961075e-06, - "long_answer_loss": 0.0809, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 962, - "template_loss": 0.0 - }, - { - "epoch": 1.56, - "full_loss": 0.0871, - "grad_norm": 1.3984375, - "learning_rate": 2.964528999711376e-06, - "long_answer_loss": 0.0871, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 963, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0838, - "grad_norm": 1.375, - "learning_rate": 2.9432783217113055e-06, - "long_answer_loss": 0.0838, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 964, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.099, - "grad_norm": 1.4609375, - "learning_rate": 2.922093915324875e-06, - "long_answer_loss": 0.099, - "loss": 0.082, - "short_answer_loss": NaN, - "step": 965, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0942, - "grad_norm": 1.375, - "learning_rate": 2.900975927456508e-06, - "long_answer_loss": 0.0942, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 966, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0817, - "grad_norm": 1.453125, - "learning_rate": 2.879924504550058e-06, - "long_answer_loss": 0.0817, - "loss": 0.0899, - "short_answer_loss": NaN, - "step": 967, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.0851, - "grad_norm": 1.421875, - "learning_rate": 2.858939792587782e-06, - "long_answer_loss": 0.0851, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 968, - "template_loss": 0.0 - }, - { - "epoch": 1.57, - "full_loss": 0.069, - "grad_norm": 1.3828125, - "learning_rate": 2.8380219370893192e-06, - "long_answer_loss": 0.069, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 969, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0749, - "grad_norm": 1.3359375, - "learning_rate": 2.817171083110691e-06, - "long_answer_loss": 0.0749, - "loss": 0.081, - "short_answer_loss": NaN, - "step": 970, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.096, - "grad_norm": 1.4765625, - "learning_rate": 2.796387375243298e-06, - "long_answer_loss": 0.096, - "loss": 0.087, - "short_answer_loss": NaN, - "step": 971, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0782, - "grad_norm": 1.390625, - "learning_rate": 2.775670957612908e-06, - "long_answer_loss": 0.0782, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 972, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0783, - "grad_norm": 1.5234375, - "learning_rate": 2.755021973878663e-06, - "long_answer_loss": 0.0783, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 973, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.0794, - "grad_norm": 1.4375, - "learning_rate": 2.734440567232077e-06, - "long_answer_loss": 0.0794, - "loss": 0.0787, - "short_answer_loss": NaN, - "step": 974, - "template_loss": 0.0 - }, - { - "epoch": 1.58, - "full_loss": 0.09, - "grad_norm": 1.4453125, - "learning_rate": 2.713926880396049e-06, - "long_answer_loss": 0.09, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 975, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0732, - "grad_norm": 1.4921875, - "learning_rate": 2.693481055623878e-06, - "long_answer_loss": 0.0732, - "loss": 0.089, - "short_answer_loss": NaN, - "step": 976, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0841, - "grad_norm": 1.4765625, - "learning_rate": 2.673103234698256e-06, - "long_answer_loss": 0.0841, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 977, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.1165, - "grad_norm": 1.5625, - "learning_rate": 2.65279355893031e-06, - "long_answer_loss": 0.1165, - "loss": 0.0844, - "short_answer_loss": NaN, - "step": 978, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0704, - "grad_norm": 1.359375, - "learning_rate": 2.6325521691586057e-06, - "long_answer_loss": 0.0704, - "loss": 0.0821, - "short_answer_loss": NaN, - "step": 979, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0962, - "grad_norm": 1.484375, - "learning_rate": 2.612379205748178e-06, - "long_answer_loss": 0.0962, - "loss": 0.0843, - "short_answer_loss": NaN, - "step": 980, - "template_loss": 0.0 - }, - { - "epoch": 1.59, - "full_loss": 0.0906, - "grad_norm": 1.5546875, - "learning_rate": 2.592274808589558e-06, - "long_answer_loss": 0.0906, - "loss": 0.0907, - "short_answer_loss": NaN, - "step": 981, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0914, - "grad_norm": 1.4765625, - "learning_rate": 2.5722391170977896e-06, - "long_answer_loss": 0.0914, - "loss": 0.0851, - "short_answer_loss": NaN, - "step": 982, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.113, - "grad_norm": 1.375, - "learning_rate": 2.552272270211484e-06, - "long_answer_loss": 0.113, - "loss": 0.0828, - "short_answer_loss": NaN, - "step": 983, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0792, - "grad_norm": 1.5390625, - "learning_rate": 2.5323744063918423e-06, - "long_answer_loss": 0.0792, - "loss": 0.0906, - "short_answer_loss": NaN, - "step": 984, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.1093, - "grad_norm": 1.453125, - "learning_rate": 2.5125456636216987e-06, - "long_answer_loss": 0.1093, - "loss": 0.0864, - "short_answer_loss": NaN, - "step": 985, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.0726, - "grad_norm": 1.453125, - "learning_rate": 2.4927861794045633e-06, - "long_answer_loss": 0.0726, - "loss": 0.0825, - "short_answer_loss": NaN, - "step": 986, - "template_loss": 0.0 - }, - { - "epoch": 1.6, - "full_loss": 0.078, - "grad_norm": 1.4375, - "learning_rate": 2.473096090763674e-06, - "long_answer_loss": 0.078, - "loss": 0.0811, - "short_answer_loss": NaN, - "step": 987, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0892, - "grad_norm": 1.3828125, - "learning_rate": 2.4534755342410286e-06, - "long_answer_loss": 0.0892, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 988, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0843, - "grad_norm": 1.3984375, - "learning_rate": 2.433924645896464e-06, - "long_answer_loss": 0.0843, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 989, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0857, - "grad_norm": 1.4765625, - "learning_rate": 2.4144435613066938e-06, - "long_answer_loss": 0.0857, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 990, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0943, - "grad_norm": 1.5, - "learning_rate": 2.395032415564366e-06, - "long_answer_loss": 0.0943, - "loss": 0.0882, - "short_answer_loss": NaN, - "step": 991, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0844, - "grad_norm": 1.4921875, - "learning_rate": 2.3756913432771547e-06, - "long_answer_loss": 0.0844, - "loss": 0.0887, - "short_answer_loss": NaN, - "step": 992, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0862, - "grad_norm": 1.359375, - "learning_rate": 2.3564204785667815e-06, - "long_answer_loss": 0.0862, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 993, - "template_loss": 0.0 - }, - { - "epoch": 1.61, - "full_loss": 0.0689, - "grad_norm": 1.484375, - "learning_rate": 2.3372199550681263e-06, - "long_answer_loss": 0.0689, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 994, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0752, - "grad_norm": 1.34375, - "learning_rate": 2.318089905928278e-06, - "long_answer_loss": 0.0752, - "loss": 0.0774, - "short_answer_loss": NaN, - "step": 995, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0785, - "grad_norm": 1.484375, - "learning_rate": 2.299030463805614e-06, - "long_answer_loss": 0.0785, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 996, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0745, - "grad_norm": 1.453125, - "learning_rate": 2.2800417608688894e-06, - "long_answer_loss": 0.0745, - "loss": 0.0804, - "short_answer_loss": NaN, - "step": 997, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0745, - "grad_norm": 1.4453125, - "learning_rate": 2.2611239287963133e-06, - "long_answer_loss": 0.0745, - "loss": 0.0836, - "short_answer_loss": NaN, - "step": 998, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0708, - "grad_norm": 1.4765625, - "learning_rate": 2.242277098774634e-06, - "long_answer_loss": 0.0708, - "loss": 0.0813, - "short_answer_loss": NaN, - "step": 999, - "template_loss": 0.0 - }, - { - "epoch": 1.62, - "full_loss": 0.0711, - "grad_norm": 1.3828125, - "learning_rate": 2.2235014014982374e-06, - "long_answer_loss": 0.0711, - "loss": 0.0794, - "short_answer_loss": NaN, - "step": 1000, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0783, - "grad_norm": 1.4765625, - "learning_rate": 2.204796967168225e-06, - "long_answer_loss": 0.0783, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 1001, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0967, - "grad_norm": 1.359375, - "learning_rate": 2.1861639254915343e-06, - "long_answer_loss": 0.0967, - "loss": 0.078, - "short_answer_loss": NaN, - "step": 1002, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0626, - "grad_norm": 1.40625, - "learning_rate": 2.167602405680021e-06, - "long_answer_loss": 0.0626, - "loss": 0.0861, - "short_answer_loss": NaN, - "step": 1003, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.1006, - "grad_norm": 1.5078125, - "learning_rate": 2.1491125364495615e-06, - "long_answer_loss": 0.1006, - "loss": 0.0877, - "short_answer_loss": NaN, - "step": 1004, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0726, - "grad_norm": 1.375, - "learning_rate": 2.130694446019177e-06, - "long_answer_loss": 0.0726, - "loss": 0.0802, - "short_answer_loss": NaN, - "step": 1005, - "template_loss": 0.0 - }, - { - "epoch": 1.63, - "full_loss": 0.0736, - "grad_norm": 1.4453125, - "learning_rate": 2.112348262110138e-06, - "long_answer_loss": 0.0736, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 1006, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.1027, - "grad_norm": 1.484375, - "learning_rate": 2.0940741119450615e-06, - "long_answer_loss": 0.1027, - "loss": 0.0941, - "short_answer_loss": NaN, - "step": 1007, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.1083, - "grad_norm": 1.375, - "learning_rate": 2.075872122247051e-06, - "long_answer_loss": 0.1083, - "loss": 0.0809, - "short_answer_loss": NaN, - "step": 1008, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0752, - "grad_norm": 1.4375, - "learning_rate": 2.0577424192388117e-06, - "long_answer_loss": 0.0752, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 1009, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0827, - "grad_norm": 1.40625, - "learning_rate": 2.0396851286417647e-06, - "long_answer_loss": 0.0827, - "loss": 0.0802, - "short_answer_loss": NaN, - "step": 1010, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0947, - "grad_norm": 1.4375, - "learning_rate": 2.021700375675188e-06, - "long_answer_loss": 0.0947, - "loss": 0.0882, - "short_answer_loss": NaN, - "step": 1011, - "template_loss": 0.0 - }, - { - "epoch": 1.64, - "full_loss": 0.0824, - "grad_norm": 1.4765625, - "learning_rate": 2.0037882850553455e-06, - "long_answer_loss": 0.0824, - "loss": 0.085, - "short_answer_loss": NaN, - "step": 1012, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.063, - "grad_norm": 1.390625, - "learning_rate": 1.9859489809946176e-06, - "long_answer_loss": 0.063, - "loss": 0.081, - "short_answer_loss": NaN, - "step": 1013, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0673, - "grad_norm": 1.3203125, - "learning_rate": 1.9681825872006427e-06, - "long_answer_loss": 0.0673, - "loss": 0.075, - "short_answer_loss": NaN, - "step": 1014, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0714, - "grad_norm": 1.40625, - "learning_rate": 1.950489226875454e-06, - "long_answer_loss": 0.0714, - "loss": 0.0838, - "short_answer_loss": NaN, - "step": 1015, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0823, - "grad_norm": 1.5078125, - "learning_rate": 1.9328690227146394e-06, - "long_answer_loss": 0.0823, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 1016, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0778, - "grad_norm": 1.4296875, - "learning_rate": 1.9153220969064788e-06, - "long_answer_loss": 0.0778, - "loss": 0.0786, - "short_answer_loss": NaN, - "step": 1017, - "template_loss": 0.0 - }, - { - "epoch": 1.65, - "full_loss": 0.0987, - "grad_norm": 1.4296875, - "learning_rate": 1.8978485711310862e-06, - "long_answer_loss": 0.0987, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 1018, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.1099, - "grad_norm": 1.4140625, - "learning_rate": 1.8804485665596033e-06, - "long_answer_loss": 0.1099, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 1019, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0835, - "grad_norm": 1.3984375, - "learning_rate": 1.8631222038533124e-06, - "long_answer_loss": 0.0835, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 1020, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0782, - "grad_norm": 1.3671875, - "learning_rate": 1.845869603162835e-06, - "long_answer_loss": 0.0782, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 1021, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0854, - "grad_norm": 1.484375, - "learning_rate": 1.8286908841272834e-06, - "long_answer_loss": 0.0854, - "loss": 0.0889, - "short_answer_loss": NaN, - "step": 1022, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0844, - "grad_norm": 1.4609375, - "learning_rate": 1.8115861658734287e-06, - "long_answer_loss": 0.0844, - "loss": 0.0833, - "short_answer_loss": NaN, - "step": 1023, - "template_loss": 0.0 - }, - { - "epoch": 1.66, - "full_loss": 0.0728, - "grad_norm": 1.46875, - "learning_rate": 1.7945555670148856e-06, - "long_answer_loss": 0.0728, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 1024, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0882, - "grad_norm": 1.3046875, - "learning_rate": 1.77759920565128e-06, - "long_answer_loss": 0.0882, - "loss": 0.0843, - "short_answer_loss": NaN, - "step": 1025, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0867, - "grad_norm": 1.359375, - "learning_rate": 1.7607171993674371e-06, - "long_answer_loss": 0.0867, - "loss": 0.0762, - "short_answer_loss": NaN, - "step": 1026, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.079, - "grad_norm": 1.40625, - "learning_rate": 1.7439096652325584e-06, - "long_answer_loss": 0.079, - "loss": 0.0806, - "short_answer_loss": NaN, - "step": 1027, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.092, - "grad_norm": 1.4609375, - "learning_rate": 1.7271767197994213e-06, - "long_answer_loss": 0.092, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 1028, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0865, - "grad_norm": 1.4296875, - "learning_rate": 1.710518479103551e-06, - "long_answer_loss": 0.0865, - "loss": 0.0835, - "short_answer_loss": NaN, - "step": 1029, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0924, - "grad_norm": 1.3828125, - "learning_rate": 1.693935058662445e-06, - "long_answer_loss": 0.0924, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 1030, - "template_loss": 0.0 - }, - { - "epoch": 1.67, - "full_loss": 0.0865, - "grad_norm": 1.3671875, - "learning_rate": 1.6774265734747384e-06, - "long_answer_loss": 0.0865, - "loss": 0.0739, - "short_answer_loss": NaN, - "step": 1031, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.1034, - "grad_norm": 1.46875, - "learning_rate": 1.6609931380194358e-06, - "long_answer_loss": 0.1034, - "loss": 0.0891, - "short_answer_loss": NaN, - "step": 1032, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0842, - "grad_norm": 1.5078125, - "learning_rate": 1.644634866255107e-06, - "long_answer_loss": 0.0842, - "loss": 0.0892, - "short_answer_loss": NaN, - "step": 1033, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0779, - "grad_norm": 1.390625, - "learning_rate": 1.628351871619084e-06, - "long_answer_loss": 0.0779, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 1034, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0844, - "grad_norm": 1.4765625, - "learning_rate": 1.6121442670266925e-06, - "long_answer_loss": 0.0844, - "loss": 0.0852, - "short_answer_loss": NaN, - "step": 1035, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.1114, - "grad_norm": 1.3359375, - "learning_rate": 1.5960121648704626e-06, - "long_answer_loss": 0.1114, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 1036, - "template_loss": 0.0 - }, - { - "epoch": 1.68, - "full_loss": 0.0986, - "grad_norm": 1.5234375, - "learning_rate": 1.579955677019343e-06, - "long_answer_loss": 0.0986, - "loss": 0.0879, - "short_answer_loss": NaN, - "step": 1037, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0789, - "grad_norm": 1.3515625, - "learning_rate": 1.5639749148179368e-06, - "long_answer_loss": 0.0789, - "loss": 0.0812, - "short_answer_loss": NaN, - "step": 1038, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.087, - "grad_norm": 1.3671875, - "learning_rate": 1.548069989085721e-06, - "long_answer_loss": 0.087, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 1039, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0908, - "grad_norm": 1.515625, - "learning_rate": 1.5322410101162796e-06, - "long_answer_loss": 0.0908, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 1040, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0962, - "grad_norm": 1.421875, - "learning_rate": 1.516488087676543e-06, - "long_answer_loss": 0.0962, - "loss": 0.0874, - "short_answer_loss": NaN, - "step": 1041, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0835, - "grad_norm": 1.484375, - "learning_rate": 1.5008113310060148e-06, - "long_answer_loss": 0.0835, - "loss": 0.0855, - "short_answer_loss": NaN, - "step": 1042, - "template_loss": 0.0 - }, - { - "epoch": 1.69, - "full_loss": 0.0724, - "grad_norm": 1.3828125, - "learning_rate": 1.4852108488160349e-06, - "long_answer_loss": 0.0724, - "loss": 0.0871, - "short_answer_loss": NaN, - "step": 1043, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0869, - "grad_norm": 1.4296875, - "learning_rate": 1.4696867492890066e-06, - "long_answer_loss": 0.0869, - "loss": 0.0887, - "short_answer_loss": NaN, - "step": 1044, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0802, - "grad_norm": 1.4140625, - "learning_rate": 1.4542391400776484e-06, - "long_answer_loss": 0.0802, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 1045, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0871, - "grad_norm": 1.4296875, - "learning_rate": 1.4388681283042675e-06, - "long_answer_loss": 0.0871, - "loss": 0.0817, - "short_answer_loss": NaN, - "step": 1046, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0739, - "grad_norm": 1.46875, - "learning_rate": 1.423573820559987e-06, - "long_answer_loss": 0.0739, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 1047, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0827, - "grad_norm": 1.421875, - "learning_rate": 1.408356322904028e-06, - "long_answer_loss": 0.0827, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 1048, - "template_loss": 0.0 - }, - { - "epoch": 1.7, - "full_loss": 0.0925, - "grad_norm": 1.6015625, - "learning_rate": 1.3932157408629715e-06, - "long_answer_loss": 0.0925, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 1049, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0879, - "grad_norm": 1.4921875, - "learning_rate": 1.3781521794300145e-06, - "long_answer_loss": 0.0879, - "loss": 0.0884, - "short_answer_loss": NaN, - "step": 1050, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0786, - "grad_norm": 1.4296875, - "learning_rate": 1.363165743064254e-06, - "long_answer_loss": 0.0786, - "loss": 0.0948, - "short_answer_loss": NaN, - "step": 1051, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.098, - "grad_norm": 1.3203125, - "learning_rate": 1.3482565356899637e-06, - "long_answer_loss": 0.098, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 1052, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0926, - "grad_norm": 1.375, - "learning_rate": 1.3334246606958616e-06, - "long_answer_loss": 0.0926, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 1053, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0933, - "grad_norm": 1.3984375, - "learning_rate": 1.3186702209344052e-06, - "long_answer_loss": 0.0933, - "loss": 0.0835, - "short_answer_loss": NaN, - "step": 1054, - "template_loss": 0.0 - }, - { - "epoch": 1.71, - "full_loss": 0.0866, - "grad_norm": 1.4140625, - "learning_rate": 1.303993318721071e-06, - "long_answer_loss": 0.0866, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 1055, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0888, - "grad_norm": 1.390625, - "learning_rate": 1.2893940558336427e-06, - "long_answer_loss": 0.0888, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 1056, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.068, - "grad_norm": 1.4140625, - "learning_rate": 1.2748725335115175e-06, - "long_answer_loss": 0.068, - "loss": 0.0811, - "short_answer_loss": NaN, - "step": 1057, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0896, - "grad_norm": 1.4609375, - "learning_rate": 1.260428852454995e-06, - "long_answer_loss": 0.0896, - "loss": 0.0829, - "short_answer_loss": NaN, - "step": 1058, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0721, - "grad_norm": 1.4140625, - "learning_rate": 1.2460631128245705e-06, - "long_answer_loss": 0.0721, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 1059, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.0784, - "grad_norm": 1.484375, - "learning_rate": 1.2317754142402674e-06, - "long_answer_loss": 0.0784, - "loss": 0.0897, - "short_answer_loss": NaN, - "step": 1060, - "template_loss": 0.0 - }, - { - "epoch": 1.72, - "full_loss": 0.077, - "grad_norm": 1.5, - "learning_rate": 1.2175658557809139e-06, - "long_answer_loss": 0.077, - "loss": 0.0825, - "short_answer_loss": NaN, - "step": 1061, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.1099, - "grad_norm": 1.4609375, - "learning_rate": 1.2034345359834775e-06, - "long_answer_loss": 0.1099, - "loss": 0.0824, - "short_answer_loss": NaN, - "step": 1062, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0767, - "grad_norm": 1.3984375, - "learning_rate": 1.1893815528423793e-06, - "long_answer_loss": 0.0767, - "loss": 0.0823, - "short_answer_loss": NaN, - "step": 1063, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0766, - "grad_norm": 1.484375, - "learning_rate": 1.1754070038087984e-06, - "long_answer_loss": 0.0766, - "loss": 0.0871, - "short_answer_loss": NaN, - "step": 1064, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.1005, - "grad_norm": 1.4921875, - "learning_rate": 1.1615109857900208e-06, - "long_answer_loss": 0.1005, - "loss": 0.0817, - "short_answer_loss": NaN, - "step": 1065, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0747, - "grad_norm": 1.3359375, - "learning_rate": 1.1476935951487494e-06, - "long_answer_loss": 0.0747, - "loss": 0.0799, - "short_answer_loss": NaN, - "step": 1066, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.0735, - "grad_norm": 1.359375, - "learning_rate": 1.1339549277024408e-06, - "long_answer_loss": 0.0735, - "loss": 0.0793, - "short_answer_loss": NaN, - "step": 1067, - "template_loss": 0.0 - }, - { - "epoch": 1.73, - "full_loss": 0.115, - "grad_norm": 1.34375, - "learning_rate": 1.1202950787226453e-06, - "long_answer_loss": 0.115, - "loss": 0.0882, - "short_answer_loss": NaN, - "step": 1068, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0886, - "grad_norm": 1.3984375, - "learning_rate": 1.1067141429343356e-06, - "long_answer_loss": 0.0886, - "loss": 0.0866, - "short_answer_loss": NaN, - "step": 1069, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0841, - "grad_norm": 1.3828125, - "learning_rate": 1.0932122145152601e-06, - "long_answer_loss": 0.0841, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 1070, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0916, - "grad_norm": 1.390625, - "learning_rate": 1.0797893870952897e-06, - "long_answer_loss": 0.0916, - "loss": 0.0862, - "short_answer_loss": NaN, - "step": 1071, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0967, - "grad_norm": 1.4921875, - "learning_rate": 1.0664457537557543e-06, - "long_answer_loss": 0.0967, - "loss": 0.0924, - "short_answer_loss": NaN, - "step": 1072, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0863, - "grad_norm": 1.3671875, - "learning_rate": 1.0531814070288234e-06, - "long_answer_loss": 0.0863, - "loss": 0.0791, - "short_answer_loss": NaN, - "step": 1073, - "template_loss": 0.0 - }, - { - "epoch": 1.74, - "full_loss": 0.0847, - "grad_norm": 1.46875, - "learning_rate": 1.0399964388968397e-06, - "long_answer_loss": 0.0847, - "loss": 0.0862, - "short_answer_loss": NaN, - "step": 1074, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.112, - "grad_norm": 1.90625, - "learning_rate": 1.0268909407916874e-06, - "long_answer_loss": 0.112, - "loss": 0.0878, - "short_answer_loss": NaN, - "step": 1075, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0947, - "grad_norm": 1.453125, - "learning_rate": 1.0138650035941694e-06, - "long_answer_loss": 0.0947, - "loss": 0.0863, - "short_answer_loss": NaN, - "step": 1076, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0711, - "grad_norm": 1.5078125, - "learning_rate": 1.0009187176333672e-06, - "long_answer_loss": 0.0711, - "loss": 0.0869, - "short_answer_loss": NaN, - "step": 1077, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.1091, - "grad_norm": 1.4453125, - "learning_rate": 9.880521726860103e-07, - "long_answer_loss": 0.1091, - "loss": 0.0863, - "short_answer_loss": NaN, - "step": 1078, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.1005, - "grad_norm": 1.4296875, - "learning_rate": 9.752654579758672e-07, - "long_answer_loss": 0.1005, - "loss": 0.087, - "short_answer_loss": NaN, - "step": 1079, - "template_loss": 0.0 - }, - { - "epoch": 1.75, - "full_loss": 0.0745, - "grad_norm": 1.34375, - "learning_rate": 9.625586621731158e-07, - "long_answer_loss": 0.0745, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 1080, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0781, - "grad_norm": 1.4296875, - "learning_rate": 9.499318733937334e-07, - "long_answer_loss": 0.0781, - "loss": 0.0811, - "short_answer_loss": NaN, - "step": 1081, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0762, - "grad_norm": 1.390625, - "learning_rate": 9.373851791988839e-07, - "long_answer_loss": 0.0762, - "loss": 0.0864, - "short_answer_loss": NaN, - "step": 1082, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0854, - "grad_norm": 1.40625, - "learning_rate": 9.249186665943072e-07, - "long_answer_loss": 0.0854, - "loss": 0.0838, - "short_answer_loss": NaN, - "step": 1083, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.087, - "grad_norm": 1.4453125, - "learning_rate": 9.125324220297243e-07, - "long_answer_loss": 0.087, - "loss": 0.0867, - "short_answer_loss": NaN, - "step": 1084, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0991, - "grad_norm": 1.359375, - "learning_rate": 9.002265313982336e-07, - "long_answer_loss": 0.0991, - "loss": 0.0804, - "short_answer_loss": NaN, - "step": 1085, - "template_loss": 0.0 - }, - { - "epoch": 1.76, - "full_loss": 0.0868, - "grad_norm": 1.53125, - "learning_rate": 8.880010800357053e-07, - "long_answer_loss": 0.0868, - "loss": 0.0905, - "short_answer_loss": NaN, - "step": 1086, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0932, - "grad_norm": 1.3515625, - "learning_rate": 8.758561527202169e-07, - "long_answer_loss": 0.0932, - "loss": 0.0892, - "short_answer_loss": NaN, - "step": 1087, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0954, - "grad_norm": 1.40625, - "learning_rate": 8.637918336714291e-07, - "long_answer_loss": 0.0954, - "loss": 0.0799, - "short_answer_loss": NaN, - "step": 1088, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0752, - "grad_norm": 1.3359375, - "learning_rate": 8.518082065500296e-07, - "long_answer_loss": 0.0752, - "loss": 0.0774, - "short_answer_loss": NaN, - "step": 1089, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0756, - "grad_norm": 1.4765625, - "learning_rate": 8.399053544571434e-07, - "long_answer_loss": 0.0756, - "loss": 0.0909, - "short_answer_loss": NaN, - "step": 1090, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.0844, - "grad_norm": 1.359375, - "learning_rate": 8.280833599337512e-07, - "long_answer_loss": 0.0844, - "loss": 0.0809, - "short_answer_loss": NaN, - "step": 1091, - "template_loss": 0.0 - }, - { - "epoch": 1.77, - "full_loss": 0.1157, - "grad_norm": 1.4296875, - "learning_rate": 8.16342304960127e-07, - "long_answer_loss": 0.1157, - "loss": 0.0863, - "short_answer_loss": NaN, - "step": 1092, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0911, - "grad_norm": 1.4609375, - "learning_rate": 8.046822709552704e-07, - "long_answer_loss": 0.0911, - "loss": 0.0874, - "short_answer_loss": NaN, - "step": 1093, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.076, - "grad_norm": 1.375, - "learning_rate": 7.931033387763229e-07, - "long_answer_loss": 0.076, - "loss": 0.0838, - "short_answer_loss": NaN, - "step": 1094, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0807, - "grad_norm": 1.390625, - "learning_rate": 7.816055887180318e-07, - "long_answer_loss": 0.0807, - "loss": 0.0736, - "short_answer_loss": NaN, - "step": 1095, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.1084, - "grad_norm": 1.3984375, - "learning_rate": 7.701891005121794e-07, - "long_answer_loss": 0.1084, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1096, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0891, - "grad_norm": 1.40625, - "learning_rate": 7.588539533270295e-07, - "long_answer_loss": 0.0891, - "loss": 0.0893, - "short_answer_loss": NaN, - "step": 1097, - "template_loss": 0.0 - }, - { - "epoch": 1.78, - "full_loss": 0.0747, - "grad_norm": 1.3828125, - "learning_rate": 7.476002257667897e-07, - "long_answer_loss": 0.0747, - "loss": 0.0752, - "short_answer_loss": NaN, - "step": 1098, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0709, - "grad_norm": 1.453125, - "learning_rate": 7.364279958710516e-07, - "long_answer_loss": 0.0709, - "loss": 0.0832, - "short_answer_loss": NaN, - "step": 1099, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0963, - "grad_norm": 1.359375, - "learning_rate": 7.2533734111426e-07, - "long_answer_loss": 0.0963, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 1100, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0834, - "grad_norm": 1.5390625, - "learning_rate": 7.143283384051763e-07, - "long_answer_loss": 0.0834, - "loss": 0.087, - "short_answer_loss": NaN, - "step": 1101, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0738, - "grad_norm": 1.3046875, - "learning_rate": 7.034010640863301e-07, - "long_answer_loss": 0.0738, - "loss": 0.077, - "short_answer_loss": NaN, - "step": 1102, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0679, - "grad_norm": 1.4609375, - "learning_rate": 6.925555939335124e-07, - "long_answer_loss": 0.0679, - "loss": 0.0847, - "short_answer_loss": NaN, - "step": 1103, - "template_loss": 0.0 - }, - { - "epoch": 1.79, - "full_loss": 0.0554, - "grad_norm": 1.4375, - "learning_rate": 6.81792003155235e-07, - "long_answer_loss": 0.0554, - "loss": 0.082, - "short_answer_loss": NaN, - "step": 1104, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0686, - "grad_norm": 1.4296875, - "learning_rate": 6.711103663922094e-07, - "long_answer_loss": 0.0686, - "loss": 0.0813, - "short_answer_loss": NaN, - "step": 1105, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0825, - "grad_norm": 1.34375, - "learning_rate": 6.605107577168341e-07, - "long_answer_loss": 0.0825, - "loss": 0.0847, - "short_answer_loss": NaN, - "step": 1106, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0875, - "grad_norm": 1.4296875, - "learning_rate": 6.499932506326828e-07, - "long_answer_loss": 0.0875, - "loss": 0.0821, - "short_answer_loss": NaN, - "step": 1107, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0868, - "grad_norm": 1.3359375, - "learning_rate": 6.395579180739866e-07, - "long_answer_loss": 0.0868, - "loss": 0.0813, - "short_answer_loss": NaN, - "step": 1108, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.1108, - "grad_norm": 1.40625, - "learning_rate": 6.292048324051378e-07, - "long_answer_loss": 0.1108, - "loss": 0.0874, - "short_answer_loss": NaN, - "step": 1109, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0865, - "grad_norm": 1.4609375, - "learning_rate": 6.189340654201751e-07, - "long_answer_loss": 0.0865, - "loss": 0.0787, - "short_answer_loss": NaN, - "step": 1110, - "template_loss": 0.0 - }, - { - "epoch": 1.8, - "full_loss": 0.0636, - "grad_norm": 1.3984375, - "learning_rate": 6.08745688342302e-07, - "long_answer_loss": 0.0636, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1111, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0621, - "grad_norm": 1.3828125, - "learning_rate": 5.986397718233838e-07, - "long_answer_loss": 0.0621, - "loss": 0.0949, - "short_answer_loss": NaN, - "step": 1112, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0806, - "grad_norm": 1.3671875, - "learning_rate": 5.886163859434482e-07, - "long_answer_loss": 0.0806, - "loss": 0.0812, - "short_answer_loss": NaN, - "step": 1113, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0829, - "grad_norm": 1.4453125, - "learning_rate": 5.786756002102239e-07, - "long_answer_loss": 0.0829, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 1114, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0766, - "grad_norm": 1.484375, - "learning_rate": 5.688174835586382e-07, - "long_answer_loss": 0.0766, - "loss": 0.0884, - "short_answer_loss": NaN, - "step": 1115, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0892, - "grad_norm": 1.4375, - "learning_rate": 5.590421043503422e-07, - "long_answer_loss": 0.0892, - "loss": 0.085, - "short_answer_loss": NaN, - "step": 1116, - "template_loss": 0.0 - }, - { - "epoch": 1.81, - "full_loss": 0.0944, - "grad_norm": 1.453125, - "learning_rate": 5.493495303732415e-07, - "long_answer_loss": 0.0944, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 1117, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0704, - "grad_norm": 1.421875, - "learning_rate": 5.397398288410293e-07, - "long_answer_loss": 0.0704, - "loss": 0.0751, - "short_answer_loss": NaN, - "step": 1118, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0949, - "grad_norm": 1.3359375, - "learning_rate": 5.30213066392704e-07, - "long_answer_loss": 0.0949, - "loss": 0.0802, - "short_answer_loss": NaN, - "step": 1119, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0618, - "grad_norm": 1.484375, - "learning_rate": 5.207693090921325e-07, - "long_answer_loss": 0.0618, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1120, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0886, - "grad_norm": 1.375, - "learning_rate": 5.114086224275671e-07, - "long_answer_loss": 0.0886, - "loss": 0.0895, - "short_answer_loss": NaN, - "step": 1121, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.1024, - "grad_norm": 1.5078125, - "learning_rate": 5.021310713112057e-07, - "long_answer_loss": 0.1024, - "loss": 0.0876, - "short_answer_loss": NaN, - "step": 1122, - "template_loss": 0.0 - }, - { - "epoch": 1.82, - "full_loss": 0.0832, - "grad_norm": 1.421875, - "learning_rate": 4.929367200787405e-07, - "long_answer_loss": 0.0832, - "loss": 0.0817, - "short_answer_loss": NaN, - "step": 1123, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0724, - "grad_norm": 1.4375, - "learning_rate": 4.838256324889046e-07, - "long_answer_loss": 0.0724, - "loss": 0.082, - "short_answer_loss": NaN, - "step": 1124, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0846, - "grad_norm": 1.421875, - "learning_rate": 4.7479787172303727e-07, - "long_answer_loss": 0.0846, - "loss": 0.0879, - "short_answer_loss": NaN, - "step": 1125, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0958, - "grad_norm": 1.34375, - "learning_rate": 4.6585350038464565e-07, - "long_answer_loss": 0.0958, - "loss": 0.0832, - "short_answer_loss": NaN, - "step": 1126, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0911, - "grad_norm": 1.328125, - "learning_rate": 4.569925804989647e-07, - "long_answer_loss": 0.0911, - "loss": 0.0802, - "short_answer_loss": NaN, - "step": 1127, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0884, - "grad_norm": 1.46875, - "learning_rate": 4.4821517351253533e-07, - "long_answer_loss": 0.0884, - "loss": 0.0843, - "short_answer_loss": NaN, - "step": 1128, - "template_loss": 0.0 - }, - { - "epoch": 1.83, - "full_loss": 0.0768, - "grad_norm": 1.40625, - "learning_rate": 4.3952134029276587e-07, - "long_answer_loss": 0.0768, - "loss": 0.0819, - "short_answer_loss": NaN, - "step": 1129, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0706, - "grad_norm": 1.4375, - "learning_rate": 4.309111411275241e-07, - "long_answer_loss": 0.0706, - "loss": 0.0904, - "short_answer_loss": NaN, - "step": 1130, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.1012, - "grad_norm": 1.3671875, - "learning_rate": 4.223846357247124e-07, - "long_answer_loss": 0.1012, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 1131, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0776, - "grad_norm": 1.4609375, - "learning_rate": 4.139418832118505e-07, - "long_answer_loss": 0.0776, - "loss": 0.081, - "short_answer_loss": NaN, - "step": 1132, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0971, - "grad_norm": 1.6328125, - "learning_rate": 4.0558294213567105e-07, - "long_answer_loss": 0.0971, - "loss": 0.0893, - "short_answer_loss": NaN, - "step": 1133, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0809, - "grad_norm": 1.453125, - "learning_rate": 3.973078704617175e-07, - "long_answer_loss": 0.0809, - "loss": 0.0823, - "short_answer_loss": NaN, - "step": 1134, - "template_loss": 0.0 - }, - { - "epoch": 1.84, - "full_loss": 0.0923, - "grad_norm": 1.3984375, - "learning_rate": 3.89116725573925e-07, - "long_answer_loss": 0.0923, - "loss": 0.0855, - "short_answer_loss": NaN, - "step": 1135, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0775, - "grad_norm": 1.328125, - "learning_rate": 3.810095642742414e-07, - "long_answer_loss": 0.0775, - "loss": 0.0767, - "short_answer_loss": NaN, - "step": 1136, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0858, - "grad_norm": 1.3671875, - "learning_rate": 3.7298644278222494e-07, - "long_answer_loss": 0.0858, - "loss": 0.0772, - "short_answer_loss": NaN, - "step": 1137, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0852, - "grad_norm": 1.4453125, - "learning_rate": 3.6504741673465264e-07, - "long_answer_loss": 0.0852, - "loss": 0.0799, - "short_answer_loss": NaN, - "step": 1138, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0737, - "grad_norm": 1.375, - "learning_rate": 3.57192541185139e-07, - "long_answer_loss": 0.0737, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 1139, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.087, - "grad_norm": 1.4453125, - "learning_rate": 3.4942187060375277e-07, - "long_answer_loss": 0.087, - "loss": 0.0792, - "short_answer_loss": NaN, - "step": 1140, - "template_loss": 0.0 - }, - { - "epoch": 1.85, - "full_loss": 0.0693, - "grad_norm": 1.421875, - "learning_rate": 3.417354588766353e-07, - "long_answer_loss": 0.0693, - "loss": 0.0809, - "short_answer_loss": NaN, - "step": 1141, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0885, - "grad_norm": 1.5078125, - "learning_rate": 3.34133359305637e-07, - "long_answer_loss": 0.0885, - "loss": 0.086, - "short_answer_loss": NaN, - "step": 1142, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0765, - "grad_norm": 1.3984375, - "learning_rate": 3.266156246079316e-07, - "long_answer_loss": 0.0765, - "loss": 0.0848, - "short_answer_loss": NaN, - "step": 1143, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0744, - "grad_norm": 1.3984375, - "learning_rate": 3.1918230691566906e-07, - "long_answer_loss": 0.0744, - "loss": 0.0893, - "short_answer_loss": NaN, - "step": 1144, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0869, - "grad_norm": 1.390625, - "learning_rate": 3.1183345777559964e-07, - "long_answer_loss": 0.0869, - "loss": 0.0794, - "short_answer_loss": NaN, - "step": 1145, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0858, - "grad_norm": 1.421875, - "learning_rate": 3.045691281487198e-07, - "long_answer_loss": 0.0858, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 1146, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.104, - "grad_norm": 1.484375, - "learning_rate": 2.9738936840992967e-07, - "long_answer_loss": 0.104, - "loss": 0.0841, - "short_answer_loss": NaN, - "step": 1147, - "template_loss": 0.0 - }, - { - "epoch": 1.86, - "full_loss": 0.0851, - "grad_norm": 1.3515625, - "learning_rate": 2.9029422834766645e-07, - "long_answer_loss": 0.0851, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 1148, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.081, - "grad_norm": 1.390625, - "learning_rate": 2.832837571635688e-07, - "long_answer_loss": 0.081, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 1149, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.08, - "grad_norm": 1.4140625, - "learning_rate": 2.763580034721394e-07, - "long_answer_loss": 0.08, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1150, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.081, - "grad_norm": 1.421875, - "learning_rate": 2.6951701530039676e-07, - "long_answer_loss": 0.081, - "loss": 0.0814, - "short_answer_loss": NaN, - "step": 1151, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0649, - "grad_norm": 1.515625, - "learning_rate": 2.627608400875503e-07, - "long_answer_loss": 0.0649, - "loss": 0.0764, - "short_answer_loss": NaN, - "step": 1152, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0826, - "grad_norm": 1.5703125, - "learning_rate": 2.5608952468467175e-07, - "long_answer_loss": 0.0826, - "loss": 0.0866, - "short_answer_loss": NaN, - "step": 1153, - "template_loss": 0.0 - }, - { - "epoch": 1.87, - "full_loss": 0.0923, - "grad_norm": 1.328125, - "learning_rate": 2.495031153543631e-07, - "long_answer_loss": 0.0923, - "loss": 0.0838, - "short_answer_loss": NaN, - "step": 1154, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0851, - "grad_norm": 1.4296875, - "learning_rate": 2.430016577704461e-07, - "long_answer_loss": 0.0851, - "loss": 0.0851, - "short_answer_loss": NaN, - "step": 1155, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0785, - "grad_norm": 1.4609375, - "learning_rate": 2.365851970176358e-07, - "long_answer_loss": 0.0785, - "loss": 0.0839, - "short_answer_loss": NaN, - "step": 1156, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0705, - "grad_norm": 1.4140625, - "learning_rate": 2.3025377759123279e-07, - "long_answer_loss": 0.0705, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 1157, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0859, - "grad_norm": 1.390625, - "learning_rate": 2.240074433968134e-07, - "long_answer_loss": 0.0859, - "loss": 0.0822, - "short_answer_loss": NaN, - "step": 1158, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.0678, - "grad_norm": 1.3125, - "learning_rate": 2.178462377499302e-07, - "long_answer_loss": 0.0678, - "loss": 0.0768, - "short_answer_loss": NaN, - "step": 1159, - "template_loss": 0.0 - }, - { - "epoch": 1.88, - "full_loss": 0.1107, - "grad_norm": 1.4765625, - "learning_rate": 2.1177020337579818e-07, - "long_answer_loss": 0.1107, - "loss": 0.0831, - "short_answer_loss": NaN, - "step": 1160, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.09, - "grad_norm": 1.5078125, - "learning_rate": 2.0577938240901873e-07, - "long_answer_loss": 0.09, - "loss": 0.0836, - "short_answer_loss": NaN, - "step": 1161, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.1141, - "grad_norm": 1.3984375, - "learning_rate": 1.998738163932659e-07, - "long_answer_loss": 0.1141, - "loss": 0.0827, - "short_answer_loss": NaN, - "step": 1162, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0861, - "grad_norm": 1.4375, - "learning_rate": 1.9405354628101448e-07, - "long_answer_loss": 0.0861, - "loss": 0.0859, - "short_answer_loss": NaN, - "step": 1163, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0883, - "grad_norm": 1.375, - "learning_rate": 1.8831861243324978e-07, - "long_answer_loss": 0.0883, - "loss": 0.0789, - "short_answer_loss": NaN, - "step": 1164, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0847, - "grad_norm": 1.4375, - "learning_rate": 1.8266905461918475e-07, - "long_answer_loss": 0.0847, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 1165, - "template_loss": 0.0 - }, - { - "epoch": 1.89, - "full_loss": 0.0852, - "grad_norm": 1.421875, - "learning_rate": 1.7710491201599065e-07, - "long_answer_loss": 0.0852, - "loss": 0.08, - "short_answer_loss": NaN, - "step": 1166, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0977, - "grad_norm": 1.3984375, - "learning_rate": 1.7162622320852084e-07, - "long_answer_loss": 0.0977, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 1167, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0761, - "grad_norm": 1.5078125, - "learning_rate": 1.6623302618904713e-07, - "long_answer_loss": 0.0761, - "loss": 0.0821, - "short_answer_loss": NaN, - "step": 1168, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0999, - "grad_norm": 1.5078125, - "learning_rate": 1.6092535835699058e-07, - "long_answer_loss": 0.0999, - "loss": 0.0907, - "short_answer_loss": NaN, - "step": 1169, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0967, - "grad_norm": 1.421875, - "learning_rate": 1.5570325651866475e-07, - "long_answer_loss": 0.0967, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 1170, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0865, - "grad_norm": 1.3828125, - "learning_rate": 1.5056675688702449e-07, - "long_answer_loss": 0.0865, - "loss": 0.0782, - "short_answer_loss": NaN, - "step": 1171, - "template_loss": 0.0 - }, - { - "epoch": 1.9, - "full_loss": 0.0635, - "grad_norm": 1.375, - "learning_rate": 1.4551589508141062e-07, - "long_answer_loss": 0.0635, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1172, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0787, - "grad_norm": 1.359375, - "learning_rate": 1.405507061273001e-07, - "long_answer_loss": 0.0787, - "loss": 0.0826, - "short_answer_loss": NaN, - "step": 1173, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0891, - "grad_norm": 1.4296875, - "learning_rate": 1.3567122445607427e-07, - "long_answer_loss": 0.0891, - "loss": 0.0834, - "short_answer_loss": NaN, - "step": 1174, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.1124, - "grad_norm": 1.46875, - "learning_rate": 1.3087748390476356e-07, - "long_answer_loss": 0.1124, - "loss": 0.0879, - "short_answer_loss": NaN, - "step": 1175, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0836, - "grad_norm": 1.3203125, - "learning_rate": 1.2616951771582674e-07, - "long_answer_loss": 0.0836, - "loss": 0.0808, - "short_answer_loss": NaN, - "step": 1176, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0714, - "grad_norm": 1.3671875, - "learning_rate": 1.2154735853691923e-07, - "long_answer_loss": 0.0714, - "loss": 0.0775, - "short_answer_loss": NaN, - "step": 1177, - "template_loss": 0.0 - }, - { - "epoch": 1.91, - "full_loss": 0.0685, - "grad_norm": 1.4453125, - "learning_rate": 1.1701103842065298e-07, - "long_answer_loss": 0.0685, - "loss": 0.0837, - "short_answer_loss": NaN, - "step": 1178, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0862, - "grad_norm": 1.40625, - "learning_rate": 1.1256058882439386e-07, - "long_answer_loss": 0.0862, - "loss": 0.0776, - "short_answer_loss": NaN, - "step": 1179, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0845, - "grad_norm": 1.390625, - "learning_rate": 1.0819604061002852e-07, - "long_answer_loss": 0.0845, - "loss": 0.0796, - "short_answer_loss": NaN, - "step": 1180, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.1098, - "grad_norm": 1.421875, - "learning_rate": 1.0391742404375765e-07, - "long_answer_loss": 0.1098, - "loss": 0.0813, - "short_answer_loss": NaN, - "step": 1181, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.1155, - "grad_norm": 1.4765625, - "learning_rate": 9.972476879588494e-08, - "long_answer_loss": 0.1155, - "loss": 0.081, - "short_answer_loss": NaN, - "step": 1182, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.1045, - "grad_norm": 1.484375, - "learning_rate": 9.5618103940609e-08, - "long_answer_loss": 0.1045, - "loss": 0.0891, - "short_answer_loss": NaN, - "step": 1183, - "template_loss": 0.0 - }, - { - "epoch": 1.92, - "full_loss": 0.0814, - "grad_norm": 1.40625, - "learning_rate": 9.159745795582209e-08, - "long_answer_loss": 0.0814, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 1184, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0752, - "grad_norm": 1.3203125, - "learning_rate": 8.766285872291725e-08, - "long_answer_loss": 0.0752, - "loss": 0.0789, - "short_answer_loss": NaN, - "step": 1185, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.073, - "grad_norm": 1.3359375, - "learning_rate": 8.38143335265898e-08, - "long_answer_loss": 0.073, - "loss": 0.0779, - "short_answer_loss": NaN, - "step": 1186, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0729, - "grad_norm": 1.5, - "learning_rate": 8.005190905464866e-08, - "long_answer_loss": 0.0729, - "loss": 0.0849, - "short_answer_loss": NaN, - "step": 1187, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0817, - "grad_norm": 1.390625, - "learning_rate": 7.637561139783589e-08, - "long_answer_loss": 0.0817, - "loss": 0.0807, - "short_answer_loss": NaN, - "step": 1188, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0808, - "grad_norm": 1.4609375, - "learning_rate": 7.278546604963937e-08, - "long_answer_loss": 0.0808, - "loss": 0.0888, - "short_answer_loss": NaN, - "step": 1189, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.0643, - "grad_norm": 1.40625, - "learning_rate": 6.928149790611932e-08, - "long_answer_loss": 0.0643, - "loss": 0.0778, - "short_answer_loss": NaN, - "step": 1190, - "template_loss": 0.0 - }, - { - "epoch": 1.93, - "full_loss": 0.077, - "grad_norm": 1.3828125, - "learning_rate": 6.586373126573759e-08, - "long_answer_loss": 0.077, - "loss": 0.0801, - "short_answer_loss": NaN, - "step": 1191, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0875, - "grad_norm": 1.3671875, - "learning_rate": 6.253218982918418e-08, - "long_answer_loss": 0.0875, - "loss": 0.086, - "short_answer_loss": NaN, - "step": 1192, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.07, - "grad_norm": 1.3359375, - "learning_rate": 5.928689669921772e-08, - "long_answer_loss": 0.07, - "loss": 0.0733, - "short_answer_loss": NaN, - "step": 1193, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0873, - "grad_norm": 1.3203125, - "learning_rate": 5.612787438050299e-08, - "long_answer_loss": 0.0873, - "loss": 0.0777, - "short_answer_loss": NaN, - "step": 1194, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.1155, - "grad_norm": 1.3828125, - "learning_rate": 5.305514477945278e-08, - "long_answer_loss": 0.1155, - "loss": 0.0843, - "short_answer_loss": NaN, - "step": 1195, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0967, - "grad_norm": 1.65625, - "learning_rate": 5.006872920408079e-08, - "long_answer_loss": 0.0967, - "loss": 0.0931, - "short_answer_loss": NaN, - "step": 1196, - "template_loss": 0.0 - }, - { - "epoch": 1.94, - "full_loss": 0.0776, - "grad_norm": 1.4296875, - "learning_rate": 4.716864836385171e-08, - "long_answer_loss": 0.0776, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1197, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0943, - "grad_norm": 1.5234375, - "learning_rate": 4.435492236953415e-08, - "long_answer_loss": 0.0943, - "loss": 0.0904, - "short_answer_loss": NaN, - "step": 1198, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0857, - "grad_norm": 1.421875, - "learning_rate": 4.1627570733067386e-08, - "long_answer_loss": 0.0857, - "loss": 0.0818, - "short_answer_loss": NaN, - "step": 1199, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0795, - "grad_norm": 1.359375, - "learning_rate": 3.898661236742124e-08, - "long_answer_loss": 0.0795, - "loss": 0.0795, - "short_answer_loss": NaN, - "step": 1200, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.064, - "grad_norm": 1.3515625, - "learning_rate": 3.643206558646833e-08, - "long_answer_loss": 0.064, - "loss": 0.0757, - "short_answer_loss": NaN, - "step": 1201, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0763, - "grad_norm": 1.4453125, - "learning_rate": 3.39639481048537e-08, - "long_answer_loss": 0.0763, - "loss": 0.0805, - "short_answer_loss": NaN, - "step": 1202, - "template_loss": 0.0 - }, - { - "epoch": 1.95, - "full_loss": 0.0784, - "grad_norm": 1.3671875, - "learning_rate": 3.158227703787264e-08, - "long_answer_loss": 0.0784, - "loss": 0.0806, - "short_answer_loss": NaN, - "step": 1203, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0618, - "grad_norm": 1.5, - "learning_rate": 2.9287068901356907e-08, - "long_answer_loss": 0.0618, - "loss": 0.0823, - "short_answer_loss": NaN, - "step": 1204, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0754, - "grad_norm": 1.3984375, - "learning_rate": 2.7078339611552595e-08, - "long_answer_loss": 0.0754, - "loss": 0.0846, - "short_answer_loss": NaN, - "step": 1205, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0835, - "grad_norm": 1.4921875, - "learning_rate": 2.4956104485014674e-08, - "long_answer_loss": 0.0835, - "loss": 0.0816, - "short_answer_loss": NaN, - "step": 1206, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0722, - "grad_norm": 1.328125, - "learning_rate": 2.292037823849874e-08, - "long_answer_loss": 0.0722, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1207, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.1098, - "grad_norm": 1.3671875, - "learning_rate": 2.0971174988863862e-08, - "long_answer_loss": 0.1098, - "loss": 0.083, - "short_answer_loss": NaN, - "step": 1208, - "template_loss": 0.0 - }, - { - "epoch": 1.96, - "full_loss": 0.0786, - "grad_norm": 1.4140625, - "learning_rate": 1.910850825296573e-08, - "long_answer_loss": 0.0786, - "loss": 0.0785, - "short_answer_loss": NaN, - "step": 1209, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.093, - "grad_norm": 1.3828125, - "learning_rate": 1.7332390947567835e-08, - "long_answer_loss": 0.093, - "loss": 0.0845, - "short_answer_loss": NaN, - "step": 1210, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0732, - "grad_norm": 1.46875, - "learning_rate": 1.5642835389256817e-08, - "long_answer_loss": 0.0732, - "loss": 0.0809, - "short_answer_loss": NaN, - "step": 1211, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0713, - "grad_norm": 1.4609375, - "learning_rate": 1.4039853294346705e-08, - "long_answer_loss": 0.0713, - "loss": 0.0842, - "short_answer_loss": NaN, - "step": 1212, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0793, - "grad_norm": 1.484375, - "learning_rate": 1.2523455778806758e-08, - "long_answer_loss": 0.0793, - "loss": 0.0769, - "short_answer_loss": NaN, - "step": 1213, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0905, - "grad_norm": 1.3828125, - "learning_rate": 1.1093653358176804e-08, - "long_answer_loss": 0.0905, - "loss": 0.0784, - "short_answer_loss": NaN, - "step": 1214, - "template_loss": 0.0 - }, - { - "epoch": 1.97, - "full_loss": 0.0979, - "grad_norm": 1.421875, - "learning_rate": 9.7504559475034e-09, - "long_answer_loss": 0.0979, - "loss": 0.0912, - "short_answer_loss": NaN, - "step": 1215, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0766, - "grad_norm": 1.359375, - "learning_rate": 8.493872861260744e-09, - "long_answer_loss": 0.0766, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1216, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0754, - "grad_norm": 1.484375, - "learning_rate": 7.323912813295142e-09, - "long_answer_loss": 0.0754, - "loss": 0.0914, - "short_answer_loss": NaN, - "step": 1217, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0735, - "grad_norm": 1.421875, - "learning_rate": 6.240583916758414e-09, - "long_answer_loss": 0.0735, - "loss": 0.0827, - "short_answer_loss": NaN, - "step": 1218, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0803, - "grad_norm": 1.4140625, - "learning_rate": 5.2438936840551454e-09, - "long_answer_loss": 0.0803, - "loss": 0.0869, - "short_answer_loss": NaN, - "step": 1219, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0725, - "grad_norm": 1.3515625, - "learning_rate": 4.3338490267871845e-09, - "long_answer_loss": 0.0725, - "loss": 0.0842, - "short_answer_loss": NaN, - "step": 1220, - "template_loss": 0.0 - }, - { - "epoch": 1.98, - "full_loss": 0.0937, - "grad_norm": 1.4296875, - "learning_rate": 3.5104562557120026e-09, - "long_answer_loss": 0.0937, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 1221, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0521, - "grad_norm": 1.4609375, - "learning_rate": 2.7737210806899618e-09, - "long_answer_loss": 0.0521, - "loss": 0.0773, - "short_answer_loss": NaN, - "step": 1222, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0802, - "grad_norm": 1.4375, - "learning_rate": 2.1236486106523957e-09, - "long_answer_loss": 0.0802, - "loss": 0.0854, - "short_answer_loss": NaN, - "step": 1223, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0651, - "grad_norm": 1.40625, - "learning_rate": 1.5602433535627515e-09, - "long_answer_loss": 0.0651, - "loss": 0.0781, - "short_answer_loss": NaN, - "step": 1224, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0774, - "grad_norm": 1.40625, - "learning_rate": 1.0835092163860582e-09, - "long_answer_loss": 0.0774, - "loss": 0.0771, - "short_answer_loss": NaN, - "step": 1225, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.095, - "grad_norm": 1.46875, - "learning_rate": 6.934495050611723e-10, - "long_answer_loss": 0.095, - "loss": 0.0827, - "short_answer_loss": NaN, - "step": 1226, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0651, - "grad_norm": 1.34375, - "learning_rate": 3.9006692447857153e-10, - "long_answer_loss": 0.0651, - "loss": 0.0788, - "short_answer_loss": NaN, - "step": 1227, - "template_loss": 0.0 - }, - { - "epoch": 1.99, - "full_loss": 0.0897, - "grad_norm": 1.484375, - "learning_rate": 1.7336357846231556e-10, - "long_answer_loss": 0.0897, - "loss": 0.0797, - "short_answer_loss": NaN, - "step": 1228, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0987, - "grad_norm": 1.3515625, - "learning_rate": 4.3340969753391526e-11, - "long_answer_loss": 0.0987, - "loss": 0.0815, - "short_answer_loss": NaN, - "step": 1229, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "full_loss": 0.0642, - "grad_norm": 1.421875, - "learning_rate": 0.0, - "long_answer_loss": 0.0642, - "loss": 0.0863, - "short_answer_loss": NaN, - "step": 1230, - "template_loss": 0.0 - }, - { - "epoch": 2.0, - "step": 1230, - "total_flos": 9.273133451978998e+17, - "train_loss": 0.14522774513295997, - "train_runtime": 5219.7207, - "train_samples_per_second": 30.187, - "train_steps_per_second": 0.236 - } - ], - "logging_steps": 1.0, - "max_steps": 1230, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 1.0, - "total_flos": 9.273133451978998e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}