diff --git "a/output/checkpoint-3/trainer_state.json" "b/output/checkpoint-3/trainer_state.json" new file mode 100644--- /dev/null +++ "b/output/checkpoint-3/trainer_state.json" @@ -0,0 +1,5633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9149207505920933, + "eval_steps": 500, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036436509382401167, + "grad_norm": 0.6875, + "learning_rate": 9.987852283770651e-05, + "loss": 3.4902, + "step": 10 + }, + { + "epoch": 0.007287301876480233, + "grad_norm": 0.66796875, + "learning_rate": 9.975704567541302e-05, + "loss": 3.3432, + "step": 20 + }, + { + "epoch": 0.01093095281472035, + "grad_norm": 0.5546875, + "learning_rate": 9.963556851311953e-05, + "loss": 3.2381, + "step": 30 + }, + { + "epoch": 0.014574603752960467, + "grad_norm": 0.65234375, + "learning_rate": 9.951409135082604e-05, + "loss": 3.2931, + "step": 40 + }, + { + "epoch": 0.018218254691200583, + "grad_norm": 0.6328125, + "learning_rate": 9.939261418853257e-05, + "loss": 3.3235, + "step": 50 + }, + { + "epoch": 0.0218619056294407, + "grad_norm": 0.64453125, + "learning_rate": 9.927113702623908e-05, + "loss": 3.2988, + "step": 60 + }, + { + "epoch": 0.025505556567680818, + "grad_norm": 0.59765625, + "learning_rate": 9.914965986394558e-05, + "loss": 3.2927, + "step": 70 + }, + { + "epoch": 0.029149207505920934, + "grad_norm": 0.57421875, + "learning_rate": 9.90281827016521e-05, + "loss": 3.275, + "step": 80 + }, + { + "epoch": 0.03279285844416105, + "grad_norm": 0.640625, + "learning_rate": 9.89067055393586e-05, + "loss": 3.316, + "step": 90 + }, + { + "epoch": 0.036436509382401165, + "grad_norm": 0.57421875, + "learning_rate": 9.878522837706513e-05, + "loss": 3.2611, + "step": 100 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 0.51171875, + "learning_rate": 9.866375121477162e-05, + "loss": 3.268, + "step": 110 + }, + { + "epoch": 0.0437238112588814, + "grad_norm": 0.703125, + "learning_rate": 9.854227405247813e-05, + "loss": 3.3032, + "step": 120 + }, + { + "epoch": 0.04736746219712151, + "grad_norm": 0.5546875, + "learning_rate": 9.842079689018465e-05, + "loss": 3.3334, + "step": 130 + }, + { + "epoch": 0.051011113135361635, + "grad_norm": 0.671875, + "learning_rate": 9.829931972789116e-05, + "loss": 3.1943, + "step": 140 + }, + { + "epoch": 0.05465476407360175, + "grad_norm": 0.6171875, + "learning_rate": 9.817784256559767e-05, + "loss": 3.2574, + "step": 150 + }, + { + "epoch": 0.05829841501184187, + "grad_norm": 0.66015625, + "learning_rate": 9.805636540330418e-05, + "loss": 3.3747, + "step": 160 + }, + { + "epoch": 0.06194206595008198, + "grad_norm": 0.52734375, + "learning_rate": 9.793488824101069e-05, + "loss": 3.2992, + "step": 170 + }, + { + "epoch": 0.0655857168883221, + "grad_norm": 0.50390625, + "learning_rate": 9.781341107871722e-05, + "loss": 3.2342, + "step": 180 + }, + { + "epoch": 0.06922936782656222, + "grad_norm": 0.65234375, + "learning_rate": 9.769193391642371e-05, + "loss": 3.356, + "step": 190 + }, + { + "epoch": 0.07287301876480233, + "grad_norm": 0.57421875, + "learning_rate": 9.757045675413022e-05, + "loss": 3.3618, + "step": 200 + }, + { + "epoch": 0.07651666970304245, + "grad_norm": 0.58984375, + "learning_rate": 9.744897959183674e-05, + "loss": 3.2931, + "step": 210 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 0.77734375, + "learning_rate": 9.732750242954325e-05, + "loss": 3.3246, + "step": 220 + }, + { + "epoch": 0.08380397157952268, + "grad_norm": 0.5859375, + "learning_rate": 9.720602526724975e-05, + "loss": 3.3181, + "step": 230 + }, + { + "epoch": 0.0874476225177628, + "grad_norm": 0.640625, + "learning_rate": 9.708454810495627e-05, + "loss": 3.2757, + "step": 240 + }, + { + "epoch": 0.09109127345600292, + "grad_norm": 0.55859375, + "learning_rate": 9.696307094266278e-05, + "loss": 3.2753, + "step": 250 + }, + { + "epoch": 0.09473492439424303, + "grad_norm": 0.58203125, + "learning_rate": 9.68415937803693e-05, + "loss": 3.3207, + "step": 260 + }, + { + "epoch": 0.09837857533248315, + "grad_norm": 0.63671875, + "learning_rate": 9.67201166180758e-05, + "loss": 3.3035, + "step": 270 + }, + { + "epoch": 0.10202222627072327, + "grad_norm": 0.578125, + "learning_rate": 9.659863945578231e-05, + "loss": 3.3025, + "step": 280 + }, + { + "epoch": 0.10566587720896338, + "grad_norm": 0.5859375, + "learning_rate": 9.647716229348883e-05, + "loss": 3.2066, + "step": 290 + }, + { + "epoch": 0.1093095281472035, + "grad_norm": 0.7109375, + "learning_rate": 9.635568513119534e-05, + "loss": 3.2757, + "step": 300 + }, + { + "epoch": 0.11295317908544361, + "grad_norm": 0.609375, + "learning_rate": 9.623420796890185e-05, + "loss": 3.1904, + "step": 310 + }, + { + "epoch": 0.11659683002368373, + "grad_norm": 0.60546875, + "learning_rate": 9.611273080660836e-05, + "loss": 3.1947, + "step": 320 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 0.6171875, + "learning_rate": 9.599125364431487e-05, + "loss": 3.2016, + "step": 330 + }, + { + "epoch": 0.12388413190016397, + "grad_norm": 0.640625, + "learning_rate": 9.58697764820214e-05, + "loss": 3.329, + "step": 340 + }, + { + "epoch": 0.12752778283840407, + "grad_norm": 0.66796875, + "learning_rate": 9.574829931972789e-05, + "loss": 3.2483, + "step": 350 + }, + { + "epoch": 0.1311714337766442, + "grad_norm": 0.57421875, + "learning_rate": 9.56268221574344e-05, + "loss": 3.2388, + "step": 360 + }, + { + "epoch": 0.13481508471488432, + "grad_norm": 0.58984375, + "learning_rate": 9.550534499514092e-05, + "loss": 3.2722, + "step": 370 + }, + { + "epoch": 0.13845873565312444, + "grad_norm": 0.58203125, + "learning_rate": 9.538386783284743e-05, + "loss": 3.2672, + "step": 380 + }, + { + "epoch": 0.14210238659136454, + "grad_norm": 0.5234375, + "learning_rate": 9.526239067055394e-05, + "loss": 3.3378, + "step": 390 + }, + { + "epoch": 0.14574603752960466, + "grad_norm": 0.55859375, + "learning_rate": 9.514091350826045e-05, + "loss": 3.2637, + "step": 400 + }, + { + "epoch": 0.14938968846784478, + "grad_norm": 0.70703125, + "learning_rate": 9.501943634596696e-05, + "loss": 3.2879, + "step": 410 + }, + { + "epoch": 0.1530333394060849, + "grad_norm": 0.6640625, + "learning_rate": 9.489795918367348e-05, + "loss": 3.2614, + "step": 420 + }, + { + "epoch": 0.156676990344325, + "grad_norm": 0.625, + "learning_rate": 9.477648202137999e-05, + "loss": 3.2469, + "step": 430 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 0.5703125, + "learning_rate": 9.465500485908649e-05, + "loss": 3.1614, + "step": 440 + }, + { + "epoch": 0.16396429222080525, + "grad_norm": 0.59765625, + "learning_rate": 9.453352769679301e-05, + "loss": 3.2658, + "step": 450 + }, + { + "epoch": 0.16760794315904537, + "grad_norm": 0.6953125, + "learning_rate": 9.441205053449952e-05, + "loss": 3.3253, + "step": 460 + }, + { + "epoch": 0.1712515940972855, + "grad_norm": 0.67578125, + "learning_rate": 9.429057337220603e-05, + "loss": 3.2311, + "step": 470 + }, + { + "epoch": 0.1748952450355256, + "grad_norm": 0.625, + "learning_rate": 9.416909620991254e-05, + "loss": 3.3117, + "step": 480 + }, + { + "epoch": 0.1785388959737657, + "grad_norm": 0.6640625, + "learning_rate": 9.404761904761905e-05, + "loss": 3.3513, + "step": 490 + }, + { + "epoch": 0.18218254691200583, + "grad_norm": 0.5703125, + "learning_rate": 9.392614188532556e-05, + "loss": 3.3071, + "step": 500 + }, + { + "epoch": 0.18582619785024596, + "grad_norm": 0.5703125, + "learning_rate": 9.380466472303208e-05, + "loss": 3.3047, + "step": 510 + }, + { + "epoch": 0.18946984878848605, + "grad_norm": 0.58984375, + "learning_rate": 9.368318756073858e-05, + "loss": 3.1964, + "step": 520 + }, + { + "epoch": 0.19311349972672617, + "grad_norm": 0.57421875, + "learning_rate": 9.35617103984451e-05, + "loss": 3.2459, + "step": 530 + }, + { + "epoch": 0.1967571506649663, + "grad_norm": 0.62109375, + "learning_rate": 9.344023323615161e-05, + "loss": 3.205, + "step": 540 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 0.66015625, + "learning_rate": 9.331875607385812e-05, + "loss": 3.2856, + "step": 550 + }, + { + "epoch": 0.20404445254144654, + "grad_norm": 0.52734375, + "learning_rate": 9.319727891156463e-05, + "loss": 3.185, + "step": 560 + }, + { + "epoch": 0.20768810347968664, + "grad_norm": 0.5546875, + "learning_rate": 9.307580174927114e-05, + "loss": 3.3071, + "step": 570 + }, + { + "epoch": 0.21133175441792676, + "grad_norm": 0.63671875, + "learning_rate": 9.295432458697765e-05, + "loss": 3.2363, + "step": 580 + }, + { + "epoch": 0.21497540535616688, + "grad_norm": 0.5625, + "learning_rate": 9.283284742468417e-05, + "loss": 3.2697, + "step": 590 + }, + { + "epoch": 0.218619056294407, + "grad_norm": 0.56640625, + "learning_rate": 9.271137026239067e-05, + "loss": 3.3037, + "step": 600 + }, + { + "epoch": 0.2222627072326471, + "grad_norm": 0.53125, + "learning_rate": 9.258989310009719e-05, + "loss": 3.2371, + "step": 610 + }, + { + "epoch": 0.22590635817088722, + "grad_norm": 0.61328125, + "learning_rate": 9.24684159378037e-05, + "loss": 3.3367, + "step": 620 + }, + { + "epoch": 0.22955000910912735, + "grad_norm": 0.5703125, + "learning_rate": 9.234693877551021e-05, + "loss": 3.2109, + "step": 630 + }, + { + "epoch": 0.23319366004736747, + "grad_norm": 0.59375, + "learning_rate": 9.222546161321672e-05, + "loss": 3.2374, + "step": 640 + }, + { + "epoch": 0.2368373109856076, + "grad_norm": 0.6875, + "learning_rate": 9.210398445092323e-05, + "loss": 3.3066, + "step": 650 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 0.6484375, + "learning_rate": 9.198250728862974e-05, + "loss": 3.2635, + "step": 660 + }, + { + "epoch": 0.2441246128620878, + "grad_norm": 0.60546875, + "learning_rate": 9.186103012633626e-05, + "loss": 3.26, + "step": 670 + }, + { + "epoch": 0.24776826380032793, + "grad_norm": 0.65234375, + "learning_rate": 9.173955296404276e-05, + "loss": 3.2641, + "step": 680 + }, + { + "epoch": 0.25141191473856805, + "grad_norm": 0.6015625, + "learning_rate": 9.161807580174927e-05, + "loss": 3.2907, + "step": 690 + }, + { + "epoch": 0.25505556567680815, + "grad_norm": 0.54296875, + "learning_rate": 9.149659863945579e-05, + "loss": 3.2567, + "step": 700 + }, + { + "epoch": 0.2586992166150483, + "grad_norm": 0.62890625, + "learning_rate": 9.13751214771623e-05, + "loss": 3.2838, + "step": 710 + }, + { + "epoch": 0.2623428675532884, + "grad_norm": 0.546875, + "learning_rate": 9.125364431486881e-05, + "loss": 3.2969, + "step": 720 + }, + { + "epoch": 0.2659865184915285, + "grad_norm": 0.6328125, + "learning_rate": 9.113216715257532e-05, + "loss": 3.2212, + "step": 730 + }, + { + "epoch": 0.26963016942976864, + "grad_norm": 0.6328125, + "learning_rate": 9.101068999028183e-05, + "loss": 3.212, + "step": 740 + }, + { + "epoch": 0.27327382036800874, + "grad_norm": 0.5859375, + "learning_rate": 9.088921282798835e-05, + "loss": 3.3488, + "step": 750 + }, + { + "epoch": 0.2769174713062489, + "grad_norm": 0.546875, + "learning_rate": 9.076773566569486e-05, + "loss": 3.2143, + "step": 760 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 0.56640625, + "learning_rate": 9.064625850340136e-05, + "loss": 3.2518, + "step": 770 + }, + { + "epoch": 0.2842047731827291, + "grad_norm": 0.578125, + "learning_rate": 9.052478134110788e-05, + "loss": 3.2638, + "step": 780 + }, + { + "epoch": 0.2878484241209692, + "grad_norm": 0.58203125, + "learning_rate": 9.040330417881439e-05, + "loss": 3.2584, + "step": 790 + }, + { + "epoch": 0.2914920750592093, + "grad_norm": 0.62890625, + "learning_rate": 9.02818270165209e-05, + "loss": 3.2841, + "step": 800 + }, + { + "epoch": 0.29513572599744947, + "grad_norm": 0.55078125, + "learning_rate": 9.01603498542274e-05, + "loss": 3.261, + "step": 810 + }, + { + "epoch": 0.29877937693568957, + "grad_norm": 0.6171875, + "learning_rate": 9.003887269193392e-05, + "loss": 3.2954, + "step": 820 + }, + { + "epoch": 0.30242302787392966, + "grad_norm": 0.54296875, + "learning_rate": 8.991739552964044e-05, + "loss": 3.2337, + "step": 830 + }, + { + "epoch": 0.3060666788121698, + "grad_norm": 0.6171875, + "learning_rate": 8.979591836734695e-05, + "loss": 3.2881, + "step": 840 + }, + { + "epoch": 0.3097103297504099, + "grad_norm": 0.5546875, + "learning_rate": 8.967444120505344e-05, + "loss": 3.3519, + "step": 850 + }, + { + "epoch": 0.31335398068865, + "grad_norm": 0.5859375, + "learning_rate": 8.955296404275997e-05, + "loss": 3.3147, + "step": 860 + }, + { + "epoch": 0.31699763162689015, + "grad_norm": 0.62890625, + "learning_rate": 8.943148688046648e-05, + "loss": 3.2304, + "step": 870 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 0.60546875, + "learning_rate": 8.931000971817299e-05, + "loss": 3.2526, + "step": 880 + }, + { + "epoch": 0.3242849335033704, + "grad_norm": 0.6640625, + "learning_rate": 8.91885325558795e-05, + "loss": 3.309, + "step": 890 + }, + { + "epoch": 0.3279285844416105, + "grad_norm": 0.6484375, + "learning_rate": 8.9067055393586e-05, + "loss": 3.2513, + "step": 900 + }, + { + "epoch": 0.3315722353798506, + "grad_norm": 0.5703125, + "learning_rate": 8.894557823129253e-05, + "loss": 3.2135, + "step": 910 + }, + { + "epoch": 0.33521588631809074, + "grad_norm": 0.64453125, + "learning_rate": 8.882410106899904e-05, + "loss": 3.3048, + "step": 920 + }, + { + "epoch": 0.33885953725633083, + "grad_norm": 0.6015625, + "learning_rate": 8.870262390670553e-05, + "loss": 3.3047, + "step": 930 + }, + { + "epoch": 0.342503188194571, + "grad_norm": 0.6015625, + "learning_rate": 8.858114674441206e-05, + "loss": 3.2616, + "step": 940 + }, + { + "epoch": 0.3461468391328111, + "grad_norm": 0.5859375, + "learning_rate": 8.845966958211857e-05, + "loss": 3.2697, + "step": 950 + }, + { + "epoch": 0.3497904900710512, + "grad_norm": 0.72265625, + "learning_rate": 8.833819241982508e-05, + "loss": 3.2395, + "step": 960 + }, + { + "epoch": 0.3534341410092913, + "grad_norm": 0.61328125, + "learning_rate": 8.821671525753159e-05, + "loss": 3.2137, + "step": 970 + }, + { + "epoch": 0.3570777919475314, + "grad_norm": 0.625, + "learning_rate": 8.80952380952381e-05, + "loss": 3.2872, + "step": 980 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 0.5859375, + "learning_rate": 8.797376093294462e-05, + "loss": 3.2682, + "step": 990 + }, + { + "epoch": 0.36436509382401167, + "grad_norm": 0.5390625, + "learning_rate": 8.785228377065113e-05, + "loss": 3.204, + "step": 1000 + }, + { + "epoch": 0.36800874476225176, + "grad_norm": 0.71875, + "learning_rate": 8.773080660835762e-05, + "loss": 3.2472, + "step": 1010 + }, + { + "epoch": 0.3716523957004919, + "grad_norm": 0.609375, + "learning_rate": 8.760932944606415e-05, + "loss": 3.2638, + "step": 1020 + }, + { + "epoch": 0.375296046638732, + "grad_norm": 0.60546875, + "learning_rate": 8.748785228377066e-05, + "loss": 3.2803, + "step": 1030 + }, + { + "epoch": 0.3789396975769721, + "grad_norm": 0.66796875, + "learning_rate": 8.736637512147716e-05, + "loss": 3.273, + "step": 1040 + }, + { + "epoch": 0.38258334851521225, + "grad_norm": 0.65625, + "learning_rate": 8.724489795918367e-05, + "loss": 3.2854, + "step": 1050 + }, + { + "epoch": 0.38622699945345235, + "grad_norm": 0.640625, + "learning_rate": 8.712342079689018e-05, + "loss": 3.2373, + "step": 1060 + }, + { + "epoch": 0.3898706503916925, + "grad_norm": 0.55859375, + "learning_rate": 8.700194363459671e-05, + "loss": 3.2259, + "step": 1070 + }, + { + "epoch": 0.3935143013299326, + "grad_norm": 0.5078125, + "learning_rate": 8.688046647230322e-05, + "loss": 3.2402, + "step": 1080 + }, + { + "epoch": 0.3971579522681727, + "grad_norm": 0.61328125, + "learning_rate": 8.675898931000973e-05, + "loss": 3.2379, + "step": 1090 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 0.59375, + "learning_rate": 8.663751214771624e-05, + "loss": 3.2564, + "step": 1100 + }, + { + "epoch": 0.40444525414465293, + "grad_norm": 0.69921875, + "learning_rate": 8.651603498542274e-05, + "loss": 3.2342, + "step": 1110 + }, + { + "epoch": 0.4080889050828931, + "grad_norm": 0.53125, + "learning_rate": 8.639455782312925e-05, + "loss": 3.3336, + "step": 1120 + }, + { + "epoch": 0.4117325560211332, + "grad_norm": 0.63671875, + "learning_rate": 8.627308066083576e-05, + "loss": 3.2684, + "step": 1130 + }, + { + "epoch": 0.4153762069593733, + "grad_norm": 0.61328125, + "learning_rate": 8.615160349854227e-05, + "loss": 3.2581, + "step": 1140 + }, + { + "epoch": 0.4190198578976134, + "grad_norm": 0.50390625, + "learning_rate": 8.603012633624878e-05, + "loss": 3.3428, + "step": 1150 + }, + { + "epoch": 0.4226635088358535, + "grad_norm": 0.58203125, + "learning_rate": 8.59086491739553e-05, + "loss": 3.2331, + "step": 1160 + }, + { + "epoch": 0.42630715977409367, + "grad_norm": 0.63671875, + "learning_rate": 8.578717201166182e-05, + "loss": 3.2203, + "step": 1170 + }, + { + "epoch": 0.42995081071233376, + "grad_norm": 0.57421875, + "learning_rate": 8.566569484936832e-05, + "loss": 3.248, + "step": 1180 + }, + { + "epoch": 0.43359446165057386, + "grad_norm": 0.6015625, + "learning_rate": 8.554421768707483e-05, + "loss": 3.3052, + "step": 1190 + }, + { + "epoch": 0.437238112588814, + "grad_norm": 0.5546875, + "learning_rate": 8.542274052478134e-05, + "loss": 3.2036, + "step": 1200 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 0.64453125, + "learning_rate": 8.530126336248787e-05, + "loss": 3.2199, + "step": 1210 + }, + { + "epoch": 0.4445254144652942, + "grad_norm": 0.68359375, + "learning_rate": 8.517978620019436e-05, + "loss": 3.2594, + "step": 1220 + }, + { + "epoch": 0.44816906540353435, + "grad_norm": 0.6953125, + "learning_rate": 8.505830903790087e-05, + "loss": 3.26, + "step": 1230 + }, + { + "epoch": 0.45181271634177445, + "grad_norm": 0.66015625, + "learning_rate": 8.49368318756074e-05, + "loss": 3.3623, + "step": 1240 + }, + { + "epoch": 0.4554563672800146, + "grad_norm": 0.7421875, + "learning_rate": 8.48153547133139e-05, + "loss": 3.2625, + "step": 1250 + }, + { + "epoch": 0.4591000182182547, + "grad_norm": 0.6875, + "learning_rate": 8.469387755102041e-05, + "loss": 3.2738, + "step": 1260 + }, + { + "epoch": 0.4627436691564948, + "grad_norm": 0.61328125, + "learning_rate": 8.457240038872692e-05, + "loss": 3.2688, + "step": 1270 + }, + { + "epoch": 0.46638732009473494, + "grad_norm": 0.609375, + "learning_rate": 8.445092322643343e-05, + "loss": 3.2392, + "step": 1280 + }, + { + "epoch": 0.47003097103297503, + "grad_norm": 0.56640625, + "learning_rate": 8.432944606413996e-05, + "loss": 3.2414, + "step": 1290 + }, + { + "epoch": 0.4736746219712152, + "grad_norm": 0.640625, + "learning_rate": 8.420796890184645e-05, + "loss": 3.2461, + "step": 1300 + }, + { + "epoch": 0.4773182729094553, + "grad_norm": 0.578125, + "learning_rate": 8.408649173955296e-05, + "loss": 3.3459, + "step": 1310 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 0.6953125, + "learning_rate": 8.396501457725948e-05, + "loss": 3.2631, + "step": 1320 + }, + { + "epoch": 0.4846055747859355, + "grad_norm": 0.59765625, + "learning_rate": 8.3843537414966e-05, + "loss": 3.2883, + "step": 1330 + }, + { + "epoch": 0.4882492257241756, + "grad_norm": 0.625, + "learning_rate": 8.372206025267249e-05, + "loss": 3.2085, + "step": 1340 + }, + { + "epoch": 0.49189287666241577, + "grad_norm": 0.6640625, + "learning_rate": 8.360058309037901e-05, + "loss": 3.3132, + "step": 1350 + }, + { + "epoch": 0.49553652760065586, + "grad_norm": 0.61328125, + "learning_rate": 8.347910592808552e-05, + "loss": 3.3076, + "step": 1360 + }, + { + "epoch": 0.49918017853889596, + "grad_norm": 0.7265625, + "learning_rate": 8.335762876579204e-05, + "loss": 3.3183, + "step": 1370 + }, + { + "epoch": 0.5028238294771361, + "grad_norm": 0.55859375, + "learning_rate": 8.323615160349854e-05, + "loss": 3.1761, + "step": 1380 + }, + { + "epoch": 0.5064674804153763, + "grad_norm": 0.60546875, + "learning_rate": 8.311467444120505e-05, + "loss": 3.2079, + "step": 1390 + }, + { + "epoch": 0.5101111313536163, + "grad_norm": 0.703125, + "learning_rate": 8.299319727891157e-05, + "loss": 3.2844, + "step": 1400 + }, + { + "epoch": 0.5137547822918564, + "grad_norm": 0.578125, + "learning_rate": 8.287172011661808e-05, + "loss": 3.2492, + "step": 1410 + }, + { + "epoch": 0.5173984332300966, + "grad_norm": 0.6328125, + "learning_rate": 8.275024295432459e-05, + "loss": 3.2525, + "step": 1420 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 0.5703125, + "learning_rate": 8.26287657920311e-05, + "loss": 3.2449, + "step": 1430 + }, + { + "epoch": 0.5246857351065768, + "grad_norm": 0.54296875, + "learning_rate": 8.250728862973761e-05, + "loss": 3.2279, + "step": 1440 + }, + { + "epoch": 0.5283293860448169, + "grad_norm": 0.5859375, + "learning_rate": 8.238581146744413e-05, + "loss": 3.2751, + "step": 1450 + }, + { + "epoch": 0.531973036983057, + "grad_norm": 0.57421875, + "learning_rate": 8.226433430515063e-05, + "loss": 3.2404, + "step": 1460 + }, + { + "epoch": 0.5356166879212971, + "grad_norm": 0.67578125, + "learning_rate": 8.214285714285714e-05, + "loss": 3.2911, + "step": 1470 + }, + { + "epoch": 0.5392603388595373, + "grad_norm": 0.6796875, + "learning_rate": 8.202137998056366e-05, + "loss": 3.2637, + "step": 1480 + }, + { + "epoch": 0.5429039897977773, + "grad_norm": 0.61328125, + "learning_rate": 8.189990281827017e-05, + "loss": 3.2004, + "step": 1490 + }, + { + "epoch": 0.5465476407360175, + "grad_norm": 0.6875, + "learning_rate": 8.177842565597668e-05, + "loss": 3.2958, + "step": 1500 + }, + { + "epoch": 0.5501912916742576, + "grad_norm": 0.609375, + "learning_rate": 8.165694849368319e-05, + "loss": 3.2371, + "step": 1510 + }, + { + "epoch": 0.5538349426124978, + "grad_norm": 0.6171875, + "learning_rate": 8.15354713313897e-05, + "loss": 3.2798, + "step": 1520 + }, + { + "epoch": 0.5574785935507378, + "grad_norm": 0.6953125, + "learning_rate": 8.141399416909622e-05, + "loss": 3.2608, + "step": 1530 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 0.62109375, + "learning_rate": 8.129251700680273e-05, + "loss": 3.2374, + "step": 1540 + }, + { + "epoch": 0.5647658954272181, + "grad_norm": 0.625, + "learning_rate": 8.117103984450923e-05, + "loss": 3.189, + "step": 1550 + }, + { + "epoch": 0.5684095463654582, + "grad_norm": 0.57421875, + "learning_rate": 8.104956268221575e-05, + "loss": 3.2008, + "step": 1560 + }, + { + "epoch": 0.5720531973036983, + "grad_norm": 0.58984375, + "learning_rate": 8.092808551992226e-05, + "loss": 3.219, + "step": 1570 + }, + { + "epoch": 0.5756968482419385, + "grad_norm": 0.58203125, + "learning_rate": 8.080660835762877e-05, + "loss": 3.2417, + "step": 1580 + }, + { + "epoch": 0.5793404991801785, + "grad_norm": 0.63671875, + "learning_rate": 8.068513119533528e-05, + "loss": 3.236, + "step": 1590 + }, + { + "epoch": 0.5829841501184186, + "grad_norm": 0.703125, + "learning_rate": 8.056365403304179e-05, + "loss": 3.3037, + "step": 1600 + }, + { + "epoch": 0.5866278010566588, + "grad_norm": 0.703125, + "learning_rate": 8.04421768707483e-05, + "loss": 3.2412, + "step": 1610 + }, + { + "epoch": 0.5902714519948989, + "grad_norm": 0.66796875, + "learning_rate": 8.032069970845482e-05, + "loss": 3.2293, + "step": 1620 + }, + { + "epoch": 0.593915102933139, + "grad_norm": 0.6640625, + "learning_rate": 8.019922254616132e-05, + "loss": 3.2208, + "step": 1630 + }, + { + "epoch": 0.5975587538713791, + "grad_norm": 0.671875, + "learning_rate": 8.007774538386784e-05, + "loss": 3.2251, + "step": 1640 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 0.63671875, + "learning_rate": 7.995626822157435e-05, + "loss": 3.284, + "step": 1650 + }, + { + "epoch": 0.6048460557478593, + "grad_norm": 0.6484375, + "learning_rate": 7.983479105928086e-05, + "loss": 3.2404, + "step": 1660 + }, + { + "epoch": 0.6084897066860995, + "grad_norm": 0.69140625, + "learning_rate": 7.971331389698737e-05, + "loss": 3.3335, + "step": 1670 + }, + { + "epoch": 0.6121333576243396, + "grad_norm": 0.59765625, + "learning_rate": 7.959183673469388e-05, + "loss": 3.276, + "step": 1680 + }, + { + "epoch": 0.6157770085625797, + "grad_norm": 0.63671875, + "learning_rate": 7.947035957240039e-05, + "loss": 3.2263, + "step": 1690 + }, + { + "epoch": 0.6194206595008198, + "grad_norm": 0.546875, + "learning_rate": 7.934888241010691e-05, + "loss": 3.1878, + "step": 1700 + }, + { + "epoch": 0.62306431043906, + "grad_norm": 0.625, + "learning_rate": 7.922740524781341e-05, + "loss": 3.294, + "step": 1710 + }, + { + "epoch": 0.6267079613773, + "grad_norm": 0.578125, + "learning_rate": 7.910592808551993e-05, + "loss": 3.2183, + "step": 1720 + }, + { + "epoch": 0.6303516123155402, + "grad_norm": 0.69140625, + "learning_rate": 7.898445092322644e-05, + "loss": 3.1985, + "step": 1730 + }, + { + "epoch": 0.6339952632537803, + "grad_norm": 0.74609375, + "learning_rate": 7.886297376093295e-05, + "loss": 3.1563, + "step": 1740 + }, + { + "epoch": 0.6376389141920205, + "grad_norm": 0.6484375, + "learning_rate": 7.874149659863946e-05, + "loss": 3.2806, + "step": 1750 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 0.6328125, + "learning_rate": 7.862001943634597e-05, + "loss": 3.2288, + "step": 1760 + }, + { + "epoch": 0.6449262160685006, + "grad_norm": 0.5859375, + "learning_rate": 7.849854227405248e-05, + "loss": 3.2785, + "step": 1770 + }, + { + "epoch": 0.6485698670067408, + "grad_norm": 0.6875, + "learning_rate": 7.8377065111759e-05, + "loss": 3.2952, + "step": 1780 + }, + { + "epoch": 0.6522135179449808, + "grad_norm": 0.6796875, + "learning_rate": 7.82555879494655e-05, + "loss": 3.1665, + "step": 1790 + }, + { + "epoch": 0.655857168883221, + "grad_norm": 0.6796875, + "learning_rate": 7.8134110787172e-05, + "loss": 3.1984, + "step": 1800 + }, + { + "epoch": 0.6595008198214611, + "grad_norm": 0.625, + "learning_rate": 7.801263362487853e-05, + "loss": 3.2051, + "step": 1810 + }, + { + "epoch": 0.6631444707597012, + "grad_norm": 0.6640625, + "learning_rate": 7.789115646258504e-05, + "loss": 3.2141, + "step": 1820 + }, + { + "epoch": 0.6667881216979413, + "grad_norm": 0.59375, + "learning_rate": 7.776967930029155e-05, + "loss": 3.312, + "step": 1830 + }, + { + "epoch": 0.6704317726361815, + "grad_norm": 0.65234375, + "learning_rate": 7.764820213799806e-05, + "loss": 3.2473, + "step": 1840 + }, + { + "epoch": 0.6740754235744215, + "grad_norm": 0.61328125, + "learning_rate": 7.752672497570457e-05, + "loss": 3.2924, + "step": 1850 + }, + { + "epoch": 0.6777190745126617, + "grad_norm": 0.71484375, + "learning_rate": 7.740524781341109e-05, + "loss": 3.2799, + "step": 1860 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 0.55078125, + "learning_rate": 7.72837706511176e-05, + "loss": 3.2251, + "step": 1870 + }, + { + "epoch": 0.685006376389142, + "grad_norm": 0.70703125, + "learning_rate": 7.71622934888241e-05, + "loss": 3.209, + "step": 1880 + }, + { + "epoch": 0.688650027327382, + "grad_norm": 0.63671875, + "learning_rate": 7.704081632653062e-05, + "loss": 3.2312, + "step": 1890 + }, + { + "epoch": 0.6922936782656222, + "grad_norm": 0.6328125, + "learning_rate": 7.691933916423713e-05, + "loss": 3.2487, + "step": 1900 + }, + { + "epoch": 0.6959373292038623, + "grad_norm": 0.5703125, + "learning_rate": 7.679786200194364e-05, + "loss": 3.3157, + "step": 1910 + }, + { + "epoch": 0.6995809801421023, + "grad_norm": 0.63671875, + "learning_rate": 7.667638483965015e-05, + "loss": 3.299, + "step": 1920 + }, + { + "epoch": 0.7032246310803425, + "grad_norm": 0.69140625, + "learning_rate": 7.655490767735666e-05, + "loss": 3.2755, + "step": 1930 + }, + { + "epoch": 0.7068682820185826, + "grad_norm": 0.625, + "learning_rate": 7.643343051506318e-05, + "loss": 3.317, + "step": 1940 + }, + { + "epoch": 0.7105119329568227, + "grad_norm": 0.55078125, + "learning_rate": 7.631195335276969e-05, + "loss": 3.1871, + "step": 1950 + }, + { + "epoch": 0.7141555838950628, + "grad_norm": 0.74609375, + "learning_rate": 7.619047619047618e-05, + "loss": 3.2405, + "step": 1960 + }, + { + "epoch": 0.717799234833303, + "grad_norm": 0.69921875, + "learning_rate": 7.606899902818271e-05, + "loss": 3.3068, + "step": 1970 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 0.578125, + "learning_rate": 7.594752186588922e-05, + "loss": 3.335, + "step": 1980 + }, + { + "epoch": 0.7250865367097832, + "grad_norm": 0.6484375, + "learning_rate": 7.582604470359573e-05, + "loss": 3.2617, + "step": 1990 + }, + { + "epoch": 0.7287301876480233, + "grad_norm": 0.5234375, + "learning_rate": 7.570456754130224e-05, + "loss": 3.2335, + "step": 2000 + }, + { + "epoch": 0.7323738385862635, + "grad_norm": 0.640625, + "learning_rate": 7.558309037900875e-05, + "loss": 3.2604, + "step": 2010 + }, + { + "epoch": 0.7360174895245035, + "grad_norm": 0.57421875, + "learning_rate": 7.546161321671527e-05, + "loss": 3.2632, + "step": 2020 + }, + { + "epoch": 0.7396611404627437, + "grad_norm": 0.61328125, + "learning_rate": 7.534013605442178e-05, + "loss": 3.2184, + "step": 2030 + }, + { + "epoch": 0.7433047914009838, + "grad_norm": 0.6171875, + "learning_rate": 7.521865889212827e-05, + "loss": 3.2848, + "step": 2040 + }, + { + "epoch": 0.7469484423392239, + "grad_norm": 0.6484375, + "learning_rate": 7.50971817298348e-05, + "loss": 3.2473, + "step": 2050 + }, + { + "epoch": 0.750592093277464, + "grad_norm": 0.6953125, + "learning_rate": 7.49757045675413e-05, + "loss": 3.195, + "step": 2060 + }, + { + "epoch": 0.7542357442157042, + "grad_norm": 0.73046875, + "learning_rate": 7.485422740524782e-05, + "loss": 3.2248, + "step": 2070 + }, + { + "epoch": 0.7578793951539442, + "grad_norm": 0.5390625, + "learning_rate": 7.473275024295433e-05, + "loss": 3.1511, + "step": 2080 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 0.66796875, + "learning_rate": 7.461127308066083e-05, + "loss": 3.2719, + "step": 2090 + }, + { + "epoch": 0.7651666970304245, + "grad_norm": 0.57421875, + "learning_rate": 7.448979591836736e-05, + "loss": 3.2339, + "step": 2100 + }, + { + "epoch": 0.7688103479686647, + "grad_norm": 0.61328125, + "learning_rate": 7.436831875607387e-05, + "loss": 3.2863, + "step": 2110 + }, + { + "epoch": 0.7724539989069047, + "grad_norm": 0.55859375, + "learning_rate": 7.424684159378036e-05, + "loss": 3.2057, + "step": 2120 + }, + { + "epoch": 0.7760976498451448, + "grad_norm": 0.73046875, + "learning_rate": 7.412536443148689e-05, + "loss": 3.2397, + "step": 2130 + }, + { + "epoch": 0.779741300783385, + "grad_norm": 0.59375, + "learning_rate": 7.40038872691934e-05, + "loss": 3.2323, + "step": 2140 + }, + { + "epoch": 0.783384951721625, + "grad_norm": 0.63671875, + "learning_rate": 7.38824101068999e-05, + "loss": 3.2764, + "step": 2150 + }, + { + "epoch": 0.7870286026598652, + "grad_norm": 0.60546875, + "learning_rate": 7.376093294460641e-05, + "loss": 3.2668, + "step": 2160 + }, + { + "epoch": 0.7906722535981053, + "grad_norm": 0.63671875, + "learning_rate": 7.363945578231292e-05, + "loss": 3.2953, + "step": 2170 + }, + { + "epoch": 0.7943159045363454, + "grad_norm": 0.5625, + "learning_rate": 7.351797862001945e-05, + "loss": 3.1915, + "step": 2180 + }, + { + "epoch": 0.7979595554745855, + "grad_norm": 0.66015625, + "learning_rate": 7.339650145772596e-05, + "loss": 3.2622, + "step": 2190 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 0.6171875, + "learning_rate": 7.327502429543247e-05, + "loss": 3.2522, + "step": 2200 + }, + { + "epoch": 0.8052468573510657, + "grad_norm": 0.64453125, + "learning_rate": 7.315354713313898e-05, + "loss": 3.1673, + "step": 2210 + }, + { + "epoch": 0.8088905082893059, + "grad_norm": 0.625, + "learning_rate": 7.303206997084548e-05, + "loss": 3.2722, + "step": 2220 + }, + { + "epoch": 0.812534159227546, + "grad_norm": 0.6640625, + "learning_rate": 7.2910592808552e-05, + "loss": 3.2377, + "step": 2230 + }, + { + "epoch": 0.8161778101657862, + "grad_norm": 0.6171875, + "learning_rate": 7.27891156462585e-05, + "loss": 3.179, + "step": 2240 + }, + { + "epoch": 0.8198214611040262, + "grad_norm": 0.57421875, + "learning_rate": 7.266763848396501e-05, + "loss": 3.2588, + "step": 2250 + }, + { + "epoch": 0.8234651120422664, + "grad_norm": 0.578125, + "learning_rate": 7.254616132167152e-05, + "loss": 3.2664, + "step": 2260 + }, + { + "epoch": 0.8271087629805065, + "grad_norm": 0.73046875, + "learning_rate": 7.242468415937805e-05, + "loss": 3.2515, + "step": 2270 + }, + { + "epoch": 0.8307524139187465, + "grad_norm": 0.6328125, + "learning_rate": 7.230320699708455e-05, + "loss": 3.2102, + "step": 2280 + }, + { + "epoch": 0.8343960648569867, + "grad_norm": 0.6484375, + "learning_rate": 7.218172983479106e-05, + "loss": 3.246, + "step": 2290 + }, + { + "epoch": 0.8380397157952268, + "grad_norm": 0.58203125, + "learning_rate": 7.206025267249757e-05, + "loss": 3.3321, + "step": 2300 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 0.59765625, + "learning_rate": 7.193877551020408e-05, + "loss": 3.0889, + "step": 2310 + }, + { + "epoch": 0.845327017671707, + "grad_norm": 0.66015625, + "learning_rate": 7.18172983479106e-05, + "loss": 3.2811, + "step": 2320 + }, + { + "epoch": 0.8489706686099472, + "grad_norm": 0.65234375, + "learning_rate": 7.16958211856171e-05, + "loss": 3.1688, + "step": 2330 + }, + { + "epoch": 0.8526143195481873, + "grad_norm": 0.76171875, + "learning_rate": 7.157434402332361e-05, + "loss": 3.2495, + "step": 2340 + }, + { + "epoch": 0.8562579704864274, + "grad_norm": 0.6484375, + "learning_rate": 7.145286686103013e-05, + "loss": 3.1742, + "step": 2350 + }, + { + "epoch": 0.8599016214246675, + "grad_norm": 0.5859375, + "learning_rate": 7.133138969873664e-05, + "loss": 3.2293, + "step": 2360 + }, + { + "epoch": 0.8635452723629077, + "grad_norm": 0.640625, + "learning_rate": 7.120991253644315e-05, + "loss": 3.2574, + "step": 2370 + }, + { + "epoch": 0.8671889233011477, + "grad_norm": 0.55078125, + "learning_rate": 7.108843537414966e-05, + "loss": 3.2496, + "step": 2380 + }, + { + "epoch": 0.8708325742393879, + "grad_norm": 0.7109375, + "learning_rate": 7.096695821185617e-05, + "loss": 3.2527, + "step": 2390 + }, + { + "epoch": 0.874476225177628, + "grad_norm": 0.6640625, + "learning_rate": 7.08454810495627e-05, + "loss": 3.1984, + "step": 2400 + }, + { + "epoch": 0.8781198761158681, + "grad_norm": 0.58984375, + "learning_rate": 7.072400388726919e-05, + "loss": 3.2517, + "step": 2410 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 0.6171875, + "learning_rate": 7.06025267249757e-05, + "loss": 3.2105, + "step": 2420 + }, + { + "epoch": 0.8854071779923484, + "grad_norm": 0.62890625, + "learning_rate": 7.048104956268222e-05, + "loss": 3.2125, + "step": 2430 + }, + { + "epoch": 0.8890508289305884, + "grad_norm": 0.72265625, + "learning_rate": 7.035957240038873e-05, + "loss": 3.255, + "step": 2440 + }, + { + "epoch": 0.8926944798688285, + "grad_norm": 0.671875, + "learning_rate": 7.023809523809524e-05, + "loss": 3.3331, + "step": 2450 + }, + { + "epoch": 0.8963381308070687, + "grad_norm": 0.65234375, + "learning_rate": 7.011661807580175e-05, + "loss": 3.3545, + "step": 2460 + }, + { + "epoch": 0.8999817817453089, + "grad_norm": 0.62890625, + "learning_rate": 6.999514091350826e-05, + "loss": 3.2776, + "step": 2470 + }, + { + "epoch": 0.9036254326835489, + "grad_norm": 0.76953125, + "learning_rate": 6.987366375121478e-05, + "loss": 3.2331, + "step": 2480 + }, + { + "epoch": 0.907269083621789, + "grad_norm": 0.78515625, + "learning_rate": 6.975218658892128e-05, + "loss": 3.2803, + "step": 2490 + }, + { + "epoch": 0.9109127345600292, + "grad_norm": 0.671875, + "learning_rate": 6.963070942662779e-05, + "loss": 3.256, + "step": 2500 + }, + { + "epoch": 0.9145563854982692, + "grad_norm": 0.59765625, + "learning_rate": 6.950923226433431e-05, + "loss": 3.2896, + "step": 2510 + }, + { + "epoch": 0.9182000364365094, + "grad_norm": 0.62890625, + "learning_rate": 6.938775510204082e-05, + "loss": 3.2555, + "step": 2520 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 0.7421875, + "learning_rate": 6.926627793974733e-05, + "loss": 3.2682, + "step": 2530 + }, + { + "epoch": 0.9254873383129896, + "grad_norm": 0.671875, + "learning_rate": 6.914480077745384e-05, + "loss": 3.1564, + "step": 2540 + }, + { + "epoch": 0.9291309892512297, + "grad_norm": 0.6484375, + "learning_rate": 6.902332361516035e-05, + "loss": 3.1445, + "step": 2550 + }, + { + "epoch": 0.9327746401894699, + "grad_norm": 0.51953125, + "learning_rate": 6.890184645286687e-05, + "loss": 3.2515, + "step": 2560 + }, + { + "epoch": 0.9364182911277099, + "grad_norm": 0.65625, + "learning_rate": 6.878036929057337e-05, + "loss": 3.1962, + "step": 2570 + }, + { + "epoch": 0.9400619420659501, + "grad_norm": 0.59375, + "learning_rate": 6.865889212827988e-05, + "loss": 3.3199, + "step": 2580 + }, + { + "epoch": 0.9437055930041902, + "grad_norm": 0.65234375, + "learning_rate": 6.85374149659864e-05, + "loss": 3.264, + "step": 2590 + }, + { + "epoch": 0.9473492439424304, + "grad_norm": 0.63671875, + "learning_rate": 6.841593780369291e-05, + "loss": 3.1853, + "step": 2600 + }, + { + "epoch": 0.9509928948806704, + "grad_norm": 0.72265625, + "learning_rate": 6.829446064139942e-05, + "loss": 3.3017, + "step": 2610 + }, + { + "epoch": 0.9546365458189106, + "grad_norm": 0.6953125, + "learning_rate": 6.817298347910593e-05, + "loss": 3.2358, + "step": 2620 + }, + { + "epoch": 0.9582801967571507, + "grad_norm": 0.6328125, + "learning_rate": 6.805150631681244e-05, + "loss": 3.2854, + "step": 2630 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 0.5859375, + "learning_rate": 6.793002915451895e-05, + "loss": 3.1873, + "step": 2640 + }, + { + "epoch": 0.9655674986336309, + "grad_norm": 0.59375, + "learning_rate": 6.780855199222547e-05, + "loss": 3.2274, + "step": 2650 + }, + { + "epoch": 0.969211149571871, + "grad_norm": 0.63671875, + "learning_rate": 6.768707482993197e-05, + "loss": 3.2037, + "step": 2660 + }, + { + "epoch": 0.9728548005101111, + "grad_norm": 0.5703125, + "learning_rate": 6.756559766763849e-05, + "loss": 3.3132, + "step": 2670 + }, + { + "epoch": 0.9764984514483512, + "grad_norm": 0.72265625, + "learning_rate": 6.7444120505345e-05, + "loss": 3.2734, + "step": 2680 + }, + { + "epoch": 0.9801421023865914, + "grad_norm": 0.70703125, + "learning_rate": 6.732264334305151e-05, + "loss": 3.1784, + "step": 2690 + }, + { + "epoch": 0.9837857533248315, + "grad_norm": 0.57421875, + "learning_rate": 6.720116618075802e-05, + "loss": 3.2181, + "step": 2700 + }, + { + "epoch": 0.9874294042630716, + "grad_norm": 0.6953125, + "learning_rate": 6.707968901846453e-05, + "loss": 3.2676, + "step": 2710 + }, + { + "epoch": 0.9910730552013117, + "grad_norm": 0.6875, + "learning_rate": 6.695821185617104e-05, + "loss": 3.1952, + "step": 2720 + }, + { + "epoch": 0.9947167061395519, + "grad_norm": 0.609375, + "learning_rate": 6.683673469387756e-05, + "loss": 3.3135, + "step": 2730 + }, + { + "epoch": 0.9983603570777919, + "grad_norm": 0.6484375, + "learning_rate": 6.671525753158406e-05, + "loss": 3.2643, + "step": 2740 + }, + { + "epoch": 1.002004008016032, + "grad_norm": 0.6015625, + "learning_rate": 6.659378036929058e-05, + "loss": 3.1996, + "step": 2750 + }, + { + "epoch": 1.0056476589542722, + "grad_norm": 0.75, + "learning_rate": 6.647230320699709e-05, + "loss": 3.0862, + "step": 2760 + }, + { + "epoch": 1.0092913098925123, + "grad_norm": 0.671875, + "learning_rate": 6.63508260447036e-05, + "loss": 3.1886, + "step": 2770 + }, + { + "epoch": 1.0129349608307525, + "grad_norm": 0.65625, + "learning_rate": 6.622934888241011e-05, + "loss": 3.1478, + "step": 2780 + }, + { + "epoch": 1.0165786117689926, + "grad_norm": 0.69921875, + "learning_rate": 6.610787172011662e-05, + "loss": 3.1577, + "step": 2790 + }, + { + "epoch": 1.0202222627072326, + "grad_norm": 0.77734375, + "learning_rate": 6.598639455782313e-05, + "loss": 3.148, + "step": 2800 + }, + { + "epoch": 1.0238659136454729, + "grad_norm": 0.640625, + "learning_rate": 6.586491739552965e-05, + "loss": 3.1971, + "step": 2810 + }, + { + "epoch": 1.027509564583713, + "grad_norm": 0.58984375, + "learning_rate": 6.574344023323615e-05, + "loss": 3.1351, + "step": 2820 + }, + { + "epoch": 1.031153215521953, + "grad_norm": 0.734375, + "learning_rate": 6.562196307094267e-05, + "loss": 3.2304, + "step": 2830 + }, + { + "epoch": 1.0347968664601932, + "grad_norm": 0.71484375, + "learning_rate": 6.550048590864918e-05, + "loss": 3.1582, + "step": 2840 + }, + { + "epoch": 1.0384405173984332, + "grad_norm": 0.71875, + "learning_rate": 6.537900874635569e-05, + "loss": 3.1183, + "step": 2850 + }, + { + "epoch": 1.0420841683366733, + "grad_norm": 0.8046875, + "learning_rate": 6.52575315840622e-05, + "loss": 3.2056, + "step": 2860 + }, + { + "epoch": 1.0457278192749135, + "grad_norm": 0.765625, + "learning_rate": 6.513605442176871e-05, + "loss": 3.1694, + "step": 2870 + }, + { + "epoch": 1.0493714702131536, + "grad_norm": 0.890625, + "learning_rate": 6.501457725947522e-05, + "loss": 3.1428, + "step": 2880 + }, + { + "epoch": 1.0530151211513936, + "grad_norm": 0.65625, + "learning_rate": 6.489310009718174e-05, + "loss": 3.1052, + "step": 2890 + }, + { + "epoch": 1.0566587720896339, + "grad_norm": 0.83203125, + "learning_rate": 6.477162293488824e-05, + "loss": 3.1195, + "step": 2900 + }, + { + "epoch": 1.060302423027874, + "grad_norm": 0.7421875, + "learning_rate": 6.465014577259475e-05, + "loss": 3.2278, + "step": 2910 + }, + { + "epoch": 1.063946073966114, + "grad_norm": 0.71875, + "learning_rate": 6.452866861030127e-05, + "loss": 3.1563, + "step": 2920 + }, + { + "epoch": 1.0675897249043542, + "grad_norm": 0.69140625, + "learning_rate": 6.440719144800778e-05, + "loss": 3.1505, + "step": 2930 + }, + { + "epoch": 1.0712333758425943, + "grad_norm": 0.8515625, + "learning_rate": 6.428571428571429e-05, + "loss": 3.1681, + "step": 2940 + }, + { + "epoch": 1.0748770267808343, + "grad_norm": 0.71484375, + "learning_rate": 6.41642371234208e-05, + "loss": 3.17, + "step": 2950 + }, + { + "epoch": 1.0785206777190746, + "grad_norm": 0.90625, + "learning_rate": 6.40427599611273e-05, + "loss": 3.1775, + "step": 2960 + }, + { + "epoch": 1.0821643286573146, + "grad_norm": 0.73828125, + "learning_rate": 6.392128279883383e-05, + "loss": 3.0921, + "step": 2970 + }, + { + "epoch": 1.0858079795955549, + "grad_norm": 0.75390625, + "learning_rate": 6.379980563654034e-05, + "loss": 3.1666, + "step": 2980 + }, + { + "epoch": 1.089451630533795, + "grad_norm": 0.80859375, + "learning_rate": 6.367832847424684e-05, + "loss": 3.1935, + "step": 2990 + }, + { + "epoch": 1.093095281472035, + "grad_norm": 0.67578125, + "learning_rate": 6.355685131195336e-05, + "loss": 3.0588, + "step": 3000 + }, + { + "epoch": 1.096738932410275, + "grad_norm": 0.74609375, + "learning_rate": 6.343537414965987e-05, + "loss": 3.1867, + "step": 3010 + }, + { + "epoch": 1.1003825833485152, + "grad_norm": 0.8828125, + "learning_rate": 6.331389698736638e-05, + "loss": 3.162, + "step": 3020 + }, + { + "epoch": 1.1040262342867553, + "grad_norm": 0.78515625, + "learning_rate": 6.319241982507289e-05, + "loss": 3.1737, + "step": 3030 + }, + { + "epoch": 1.1076698852249955, + "grad_norm": 0.76171875, + "learning_rate": 6.30709426627794e-05, + "loss": 3.1974, + "step": 3040 + }, + { + "epoch": 1.1113135361632356, + "grad_norm": 0.7734375, + "learning_rate": 6.294946550048592e-05, + "loss": 3.1584, + "step": 3050 + }, + { + "epoch": 1.1149571871014756, + "grad_norm": 0.74609375, + "learning_rate": 6.282798833819243e-05, + "loss": 3.1856, + "step": 3060 + }, + { + "epoch": 1.1186008380397159, + "grad_norm": 0.7109375, + "learning_rate": 6.270651117589892e-05, + "loss": 3.177, + "step": 3070 + }, + { + "epoch": 1.122244488977956, + "grad_norm": 0.85546875, + "learning_rate": 6.258503401360545e-05, + "loss": 3.2028, + "step": 3080 + }, + { + "epoch": 1.125888139916196, + "grad_norm": 0.93359375, + "learning_rate": 6.246355685131196e-05, + "loss": 3.2031, + "step": 3090 + }, + { + "epoch": 1.1295317908544362, + "grad_norm": 0.82421875, + "learning_rate": 6.234207968901847e-05, + "loss": 3.0629, + "step": 3100 + }, + { + "epoch": 1.1331754417926763, + "grad_norm": 0.6875, + "learning_rate": 6.222060252672498e-05, + "loss": 3.0927, + "step": 3110 + }, + { + "epoch": 1.1368190927309163, + "grad_norm": 0.765625, + "learning_rate": 6.209912536443149e-05, + "loss": 3.2134, + "step": 3120 + }, + { + "epoch": 1.1404627436691566, + "grad_norm": 0.84765625, + "learning_rate": 6.197764820213801e-05, + "loss": 3.2027, + "step": 3130 + }, + { + "epoch": 1.1441063946073966, + "grad_norm": 0.70703125, + "learning_rate": 6.185617103984452e-05, + "loss": 3.1448, + "step": 3140 + }, + { + "epoch": 1.1477500455456366, + "grad_norm": 0.70703125, + "learning_rate": 6.173469387755101e-05, + "loss": 3.1713, + "step": 3150 + }, + { + "epoch": 1.151393696483877, + "grad_norm": 0.77734375, + "learning_rate": 6.161321671525754e-05, + "loss": 3.1612, + "step": 3160 + }, + { + "epoch": 1.155037347422117, + "grad_norm": 0.79296875, + "learning_rate": 6.149173955296405e-05, + "loss": 3.1934, + "step": 3170 + }, + { + "epoch": 1.158680998360357, + "grad_norm": 0.89453125, + "learning_rate": 6.137026239067056e-05, + "loss": 3.1231, + "step": 3180 + }, + { + "epoch": 1.1623246492985972, + "grad_norm": 0.75390625, + "learning_rate": 6.124878522837707e-05, + "loss": 3.1606, + "step": 3190 + }, + { + "epoch": 1.1659683002368373, + "grad_norm": 0.75, + "learning_rate": 6.112730806608357e-05, + "loss": 3.135, + "step": 3200 + }, + { + "epoch": 1.1696119511750775, + "grad_norm": 0.78125, + "learning_rate": 6.10058309037901e-05, + "loss": 3.1592, + "step": 3210 + }, + { + "epoch": 1.1732556021133176, + "grad_norm": 0.84375, + "learning_rate": 6.08843537414966e-05, + "loss": 3.2429, + "step": 3220 + }, + { + "epoch": 1.1768992530515576, + "grad_norm": 0.921875, + "learning_rate": 6.076287657920311e-05, + "loss": 3.1182, + "step": 3230 + }, + { + "epoch": 1.1805429039897977, + "grad_norm": 0.83203125, + "learning_rate": 6.0641399416909626e-05, + "loss": 3.2273, + "step": 3240 + }, + { + "epoch": 1.184186554928038, + "grad_norm": 0.734375, + "learning_rate": 6.0519922254616135e-05, + "loss": 3.2101, + "step": 3250 + }, + { + "epoch": 1.187830205866278, + "grad_norm": 0.76953125, + "learning_rate": 6.0398445092322645e-05, + "loss": 3.1181, + "step": 3260 + }, + { + "epoch": 1.1914738568045182, + "grad_norm": 0.7265625, + "learning_rate": 6.027696793002916e-05, + "loss": 3.1349, + "step": 3270 + }, + { + "epoch": 1.1951175077427583, + "grad_norm": 0.90234375, + "learning_rate": 6.015549076773567e-05, + "loss": 3.152, + "step": 3280 + }, + { + "epoch": 1.1987611586809983, + "grad_norm": 0.75390625, + "learning_rate": 6.003401360544217e-05, + "loss": 3.1806, + "step": 3290 + }, + { + "epoch": 1.2024048096192386, + "grad_norm": 0.85546875, + "learning_rate": 5.991253644314869e-05, + "loss": 3.1708, + "step": 3300 + }, + { + "epoch": 1.2060484605574786, + "grad_norm": 0.78125, + "learning_rate": 5.97910592808552e-05, + "loss": 3.114, + "step": 3310 + }, + { + "epoch": 1.2096921114957186, + "grad_norm": 0.90625, + "learning_rate": 5.9669582118561715e-05, + "loss": 3.1852, + "step": 3320 + }, + { + "epoch": 1.213335762433959, + "grad_norm": 0.7578125, + "learning_rate": 5.9548104956268225e-05, + "loss": 3.2373, + "step": 3330 + }, + { + "epoch": 1.216979413372199, + "grad_norm": 0.8046875, + "learning_rate": 5.9426627793974734e-05, + "loss": 3.2133, + "step": 3340 + }, + { + "epoch": 1.220623064310439, + "grad_norm": 0.7890625, + "learning_rate": 5.930515063168125e-05, + "loss": 3.2556, + "step": 3350 + }, + { + "epoch": 1.2242667152486792, + "grad_norm": 0.71875, + "learning_rate": 5.918367346938776e-05, + "loss": 3.193, + "step": 3360 + }, + { + "epoch": 1.2279103661869193, + "grad_norm": 0.71484375, + "learning_rate": 5.906219630709426e-05, + "loss": 3.1619, + "step": 3370 + }, + { + "epoch": 1.2315540171251593, + "grad_norm": 0.94140625, + "learning_rate": 5.8940719144800785e-05, + "loss": 3.1265, + "step": 3380 + }, + { + "epoch": 1.2351976680633996, + "grad_norm": 0.80859375, + "learning_rate": 5.881924198250729e-05, + "loss": 3.2705, + "step": 3390 + }, + { + "epoch": 1.2388413190016396, + "grad_norm": 0.77734375, + "learning_rate": 5.8697764820213804e-05, + "loss": 3.1545, + "step": 3400 + }, + { + "epoch": 1.2424849699398797, + "grad_norm": 1.015625, + "learning_rate": 5.8576287657920314e-05, + "loss": 3.1632, + "step": 3410 + }, + { + "epoch": 1.24612862087812, + "grad_norm": 0.75390625, + "learning_rate": 5.845481049562682e-05, + "loss": 3.1776, + "step": 3420 + }, + { + "epoch": 1.24977227181636, + "grad_norm": 0.90625, + "learning_rate": 5.833333333333334e-05, + "loss": 3.1733, + "step": 3430 + }, + { + "epoch": 1.2534159227546002, + "grad_norm": 0.890625, + "learning_rate": 5.821185617103985e-05, + "loss": 3.0226, + "step": 3440 + }, + { + "epoch": 1.2570595736928403, + "grad_norm": 0.8046875, + "learning_rate": 5.809037900874635e-05, + "loss": 3.156, + "step": 3450 + }, + { + "epoch": 1.2607032246310803, + "grad_norm": 0.85546875, + "learning_rate": 5.7968901846452875e-05, + "loss": 3.0929, + "step": 3460 + }, + { + "epoch": 1.2643468755693203, + "grad_norm": 0.70703125, + "learning_rate": 5.784742468415938e-05, + "loss": 3.1027, + "step": 3470 + }, + { + "epoch": 1.2679905265075606, + "grad_norm": 0.76171875, + "learning_rate": 5.77259475218659e-05, + "loss": 3.2188, + "step": 3480 + }, + { + "epoch": 1.2716341774458007, + "grad_norm": 0.8671875, + "learning_rate": 5.76044703595724e-05, + "loss": 3.0835, + "step": 3490 + }, + { + "epoch": 1.275277828384041, + "grad_norm": 0.82421875, + "learning_rate": 5.748299319727891e-05, + "loss": 3.0709, + "step": 3500 + }, + { + "epoch": 1.278921479322281, + "grad_norm": 0.79296875, + "learning_rate": 5.736151603498543e-05, + "loss": 3.1397, + "step": 3510 + }, + { + "epoch": 1.282565130260521, + "grad_norm": 0.83203125, + "learning_rate": 5.724003887269194e-05, + "loss": 3.1717, + "step": 3520 + }, + { + "epoch": 1.286208781198761, + "grad_norm": 0.875, + "learning_rate": 5.711856171039844e-05, + "loss": 3.1881, + "step": 3530 + }, + { + "epoch": 1.2898524321370013, + "grad_norm": 0.859375, + "learning_rate": 5.6997084548104964e-05, + "loss": 3.1279, + "step": 3540 + }, + { + "epoch": 1.2934960830752413, + "grad_norm": 0.82421875, + "learning_rate": 5.6875607385811467e-05, + "loss": 3.1212, + "step": 3550 + }, + { + "epoch": 1.2971397340134816, + "grad_norm": 0.95703125, + "learning_rate": 5.6754130223517976e-05, + "loss": 3.1591, + "step": 3560 + }, + { + "epoch": 1.3007833849517216, + "grad_norm": 0.8125, + "learning_rate": 5.663265306122449e-05, + "loss": 3.1113, + "step": 3570 + }, + { + "epoch": 1.3044270358899617, + "grad_norm": 0.90234375, + "learning_rate": 5.6511175898931e-05, + "loss": 3.2222, + "step": 3580 + }, + { + "epoch": 1.308070686828202, + "grad_norm": 0.734375, + "learning_rate": 5.638969873663752e-05, + "loss": 3.1596, + "step": 3590 + }, + { + "epoch": 1.311714337766442, + "grad_norm": 0.76171875, + "learning_rate": 5.626822157434403e-05, + "loss": 3.1784, + "step": 3600 + }, + { + "epoch": 1.315357988704682, + "grad_norm": 0.7734375, + "learning_rate": 5.614674441205054e-05, + "loss": 3.1464, + "step": 3610 + }, + { + "epoch": 1.3190016396429223, + "grad_norm": 0.7265625, + "learning_rate": 5.602526724975705e-05, + "loss": 3.1616, + "step": 3620 + }, + { + "epoch": 1.3226452905811623, + "grad_norm": 0.81640625, + "learning_rate": 5.5903790087463556e-05, + "loss": 3.1747, + "step": 3630 + }, + { + "epoch": 1.3262889415194024, + "grad_norm": 0.88671875, + "learning_rate": 5.5782312925170065e-05, + "loss": 3.137, + "step": 3640 + }, + { + "epoch": 1.3299325924576426, + "grad_norm": 0.75390625, + "learning_rate": 5.566083576287658e-05, + "loss": 3.1302, + "step": 3650 + }, + { + "epoch": 1.3335762433958827, + "grad_norm": 0.79296875, + "learning_rate": 5.553935860058309e-05, + "loss": 3.2009, + "step": 3660 + }, + { + "epoch": 1.337219894334123, + "grad_norm": 0.8203125, + "learning_rate": 5.541788143828961e-05, + "loss": 3.1738, + "step": 3670 + }, + { + "epoch": 1.340863545272363, + "grad_norm": 0.83203125, + "learning_rate": 5.529640427599612e-05, + "loss": 3.0996, + "step": 3680 + }, + { + "epoch": 1.344507196210603, + "grad_norm": 1.1796875, + "learning_rate": 5.5174927113702626e-05, + "loss": 3.2209, + "step": 3690 + }, + { + "epoch": 1.348150847148843, + "grad_norm": 0.84765625, + "learning_rate": 5.505344995140914e-05, + "loss": 3.1315, + "step": 3700 + }, + { + "epoch": 1.3517944980870833, + "grad_norm": 0.78515625, + "learning_rate": 5.493197278911565e-05, + "loss": 3.1241, + "step": 3710 + }, + { + "epoch": 1.3554381490253233, + "grad_norm": 0.73046875, + "learning_rate": 5.4810495626822155e-05, + "loss": 3.2209, + "step": 3720 + }, + { + "epoch": 1.3590817999635636, + "grad_norm": 0.796875, + "learning_rate": 5.468901846452867e-05, + "loss": 3.1234, + "step": 3730 + }, + { + "epoch": 1.3627254509018036, + "grad_norm": 0.78515625, + "learning_rate": 5.456754130223518e-05, + "loss": 3.0762, + "step": 3740 + }, + { + "epoch": 1.3663691018400437, + "grad_norm": 0.8828125, + "learning_rate": 5.444606413994169e-05, + "loss": 3.1506, + "step": 3750 + }, + { + "epoch": 1.3700127527782837, + "grad_norm": 0.796875, + "learning_rate": 5.4324586977648206e-05, + "loss": 3.1047, + "step": 3760 + }, + { + "epoch": 1.373656403716524, + "grad_norm": 0.90234375, + "learning_rate": 5.4203109815354715e-05, + "loss": 3.1776, + "step": 3770 + }, + { + "epoch": 1.377300054654764, + "grad_norm": 0.859375, + "learning_rate": 5.408163265306123e-05, + "loss": 3.1998, + "step": 3780 + }, + { + "epoch": 1.3809437055930043, + "grad_norm": 0.87890625, + "learning_rate": 5.396015549076774e-05, + "loss": 3.3064, + "step": 3790 + }, + { + "epoch": 1.3845873565312443, + "grad_norm": 0.8671875, + "learning_rate": 5.3838678328474244e-05, + "loss": 3.1491, + "step": 3800 + }, + { + "epoch": 1.3882310074694844, + "grad_norm": 0.8828125, + "learning_rate": 5.371720116618077e-05, + "loss": 3.2158, + "step": 3810 + }, + { + "epoch": 1.3918746584077246, + "grad_norm": 0.84765625, + "learning_rate": 5.359572400388727e-05, + "loss": 3.0956, + "step": 3820 + }, + { + "epoch": 1.3955183093459647, + "grad_norm": 0.72265625, + "learning_rate": 5.347424684159378e-05, + "loss": 3.159, + "step": 3830 + }, + { + "epoch": 1.3991619602842047, + "grad_norm": 0.9296875, + "learning_rate": 5.3352769679300295e-05, + "loss": 3.2044, + "step": 3840 + }, + { + "epoch": 1.402805611222445, + "grad_norm": 0.76953125, + "learning_rate": 5.3231292517006805e-05, + "loss": 3.1354, + "step": 3850 + }, + { + "epoch": 1.406449262160685, + "grad_norm": 0.79296875, + "learning_rate": 5.310981535471332e-05, + "loss": 3.2342, + "step": 3860 + }, + { + "epoch": 1.410092913098925, + "grad_norm": 0.81640625, + "learning_rate": 5.298833819241983e-05, + "loss": 3.1566, + "step": 3870 + }, + { + "epoch": 1.4137365640371653, + "grad_norm": 0.8671875, + "learning_rate": 5.286686103012633e-05, + "loss": 3.1535, + "step": 3880 + }, + { + "epoch": 1.4173802149754053, + "grad_norm": 0.8046875, + "learning_rate": 5.2745383867832856e-05, + "loss": 3.1968, + "step": 3890 + }, + { + "epoch": 1.4210238659136456, + "grad_norm": 1.0390625, + "learning_rate": 5.262390670553936e-05, + "loss": 3.2237, + "step": 3900 + }, + { + "epoch": 1.4246675168518856, + "grad_norm": 0.8203125, + "learning_rate": 5.250242954324587e-05, + "loss": 3.154, + "step": 3910 + }, + { + "epoch": 1.4283111677901257, + "grad_norm": 0.921875, + "learning_rate": 5.2380952380952384e-05, + "loss": 3.2096, + "step": 3920 + }, + { + "epoch": 1.4319548187283657, + "grad_norm": 0.84765625, + "learning_rate": 5.2259475218658894e-05, + "loss": 3.1827, + "step": 3930 + }, + { + "epoch": 1.435598469666606, + "grad_norm": 1.0234375, + "learning_rate": 5.213799805636541e-05, + "loss": 3.1439, + "step": 3940 + }, + { + "epoch": 1.439242120604846, + "grad_norm": 0.78515625, + "learning_rate": 5.201652089407192e-05, + "loss": 3.1562, + "step": 3950 + }, + { + "epoch": 1.4428857715430863, + "grad_norm": 0.8828125, + "learning_rate": 5.189504373177842e-05, + "loss": 3.1539, + "step": 3960 + }, + { + "epoch": 1.4465294224813263, + "grad_norm": 0.75390625, + "learning_rate": 5.1773566569484945e-05, + "loss": 3.1449, + "step": 3970 + }, + { + "epoch": 1.4501730734195664, + "grad_norm": 0.94140625, + "learning_rate": 5.165208940719145e-05, + "loss": 3.1867, + "step": 3980 + }, + { + "epoch": 1.4538167243578064, + "grad_norm": 0.7578125, + "learning_rate": 5.153061224489796e-05, + "loss": 3.1182, + "step": 3990 + }, + { + "epoch": 1.4574603752960467, + "grad_norm": 0.83203125, + "learning_rate": 5.1409135082604474e-05, + "loss": 3.1719, + "step": 4000 + }, + { + "epoch": 1.4611040262342867, + "grad_norm": 0.87890625, + "learning_rate": 5.128765792031098e-05, + "loss": 3.2043, + "step": 4010 + }, + { + "epoch": 1.464747677172527, + "grad_norm": 0.875, + "learning_rate": 5.116618075801749e-05, + "loss": 3.1937, + "step": 4020 + }, + { + "epoch": 1.468391328110767, + "grad_norm": 0.72265625, + "learning_rate": 5.104470359572401e-05, + "loss": 3.1864, + "step": 4030 + }, + { + "epoch": 1.472034979049007, + "grad_norm": 0.8828125, + "learning_rate": 5.092322643343052e-05, + "loss": 3.1312, + "step": 4040 + }, + { + "epoch": 1.4756786299872473, + "grad_norm": 0.8828125, + "learning_rate": 5.0801749271137035e-05, + "loss": 3.0836, + "step": 4050 + }, + { + "epoch": 1.4793222809254873, + "grad_norm": 0.9296875, + "learning_rate": 5.068027210884354e-05, + "loss": 3.1631, + "step": 4060 + }, + { + "epoch": 1.4829659318637274, + "grad_norm": 0.77734375, + "learning_rate": 5.055879494655005e-05, + "loss": 3.1043, + "step": 4070 + }, + { + "epoch": 1.4866095828019676, + "grad_norm": 0.71875, + "learning_rate": 5.043731778425656e-05, + "loss": 3.281, + "step": 4080 + }, + { + "epoch": 1.4902532337402077, + "grad_norm": 0.6640625, + "learning_rate": 5.031584062196307e-05, + "loss": 3.1505, + "step": 4090 + }, + { + "epoch": 1.4938968846784477, + "grad_norm": 0.8359375, + "learning_rate": 5.019436345966958e-05, + "loss": 3.1435, + "step": 4100 + }, + { + "epoch": 1.497540535616688, + "grad_norm": 0.828125, + "learning_rate": 5.00728862973761e-05, + "loss": 3.1984, + "step": 4110 + }, + { + "epoch": 1.501184186554928, + "grad_norm": 0.984375, + "learning_rate": 4.995140913508261e-05, + "loss": 3.1526, + "step": 4120 + }, + { + "epoch": 1.5048278374931683, + "grad_norm": 0.84765625, + "learning_rate": 4.982993197278912e-05, + "loss": 3.1732, + "step": 4130 + }, + { + "epoch": 1.5084714884314083, + "grad_norm": 0.75, + "learning_rate": 4.970845481049563e-05, + "loss": 3.2319, + "step": 4140 + }, + { + "epoch": 1.5121151393696484, + "grad_norm": 0.76953125, + "learning_rate": 4.958697764820214e-05, + "loss": 3.1467, + "step": 4150 + }, + { + "epoch": 1.5157587903078884, + "grad_norm": 0.828125, + "learning_rate": 4.946550048590865e-05, + "loss": 3.0744, + "step": 4160 + }, + { + "epoch": 1.5194024412461287, + "grad_norm": 0.76171875, + "learning_rate": 4.934402332361516e-05, + "loss": 3.1253, + "step": 4170 + }, + { + "epoch": 1.5230460921843687, + "grad_norm": 0.83203125, + "learning_rate": 4.922254616132168e-05, + "loss": 3.1691, + "step": 4180 + }, + { + "epoch": 1.526689743122609, + "grad_norm": 0.6796875, + "learning_rate": 4.910106899902818e-05, + "loss": 3.1127, + "step": 4190 + }, + { + "epoch": 1.530333394060849, + "grad_norm": 0.8046875, + "learning_rate": 4.89795918367347e-05, + "loss": 3.199, + "step": 4200 + }, + { + "epoch": 1.533977044999089, + "grad_norm": 0.76171875, + "learning_rate": 4.8858114674441206e-05, + "loss": 3.1458, + "step": 4210 + }, + { + "epoch": 1.537620695937329, + "grad_norm": 0.90234375, + "learning_rate": 4.873663751214772e-05, + "loss": 3.144, + "step": 4220 + }, + { + "epoch": 1.5412643468755693, + "grad_norm": 0.86328125, + "learning_rate": 4.8615160349854225e-05, + "loss": 3.1611, + "step": 4230 + }, + { + "epoch": 1.5449079978138094, + "grad_norm": 0.9921875, + "learning_rate": 4.849368318756074e-05, + "loss": 3.2377, + "step": 4240 + }, + { + "epoch": 1.5485516487520496, + "grad_norm": 0.8046875, + "learning_rate": 4.837220602526725e-05, + "loss": 3.0911, + "step": 4250 + }, + { + "epoch": 1.5521952996902897, + "grad_norm": 0.859375, + "learning_rate": 4.825072886297377e-05, + "loss": 3.1286, + "step": 4260 + }, + { + "epoch": 1.5558389506285297, + "grad_norm": 0.875, + "learning_rate": 4.812925170068027e-05, + "loss": 3.1614, + "step": 4270 + }, + { + "epoch": 1.5594826015667698, + "grad_norm": 0.75390625, + "learning_rate": 4.8007774538386786e-05, + "loss": 3.1076, + "step": 4280 + }, + { + "epoch": 1.56312625250501, + "grad_norm": 0.83984375, + "learning_rate": 4.7886297376093295e-05, + "loss": 3.1806, + "step": 4290 + }, + { + "epoch": 1.5667699034432503, + "grad_norm": 0.80859375, + "learning_rate": 4.776482021379981e-05, + "loss": 3.1957, + "step": 4300 + }, + { + "epoch": 1.5704135543814903, + "grad_norm": 1.0546875, + "learning_rate": 4.7643343051506314e-05, + "loss": 3.1933, + "step": 4310 + }, + { + "epoch": 1.5740572053197304, + "grad_norm": 0.9375, + "learning_rate": 4.752186588921283e-05, + "loss": 3.212, + "step": 4320 + }, + { + "epoch": 1.5777008562579704, + "grad_norm": 0.8671875, + "learning_rate": 4.740038872691934e-05, + "loss": 3.1293, + "step": 4330 + }, + { + "epoch": 1.5813445071962104, + "grad_norm": 0.8359375, + "learning_rate": 4.7278911564625856e-05, + "loss": 3.2165, + "step": 4340 + }, + { + "epoch": 1.5849881581344507, + "grad_norm": 0.84765625, + "learning_rate": 4.715743440233236e-05, + "loss": 3.1911, + "step": 4350 + }, + { + "epoch": 1.588631809072691, + "grad_norm": 1.03125, + "learning_rate": 4.7035957240038875e-05, + "loss": 3.1359, + "step": 4360 + }, + { + "epoch": 1.592275460010931, + "grad_norm": 0.79296875, + "learning_rate": 4.6914480077745385e-05, + "loss": 3.2345, + "step": 4370 + }, + { + "epoch": 1.595919110949171, + "grad_norm": 0.80859375, + "learning_rate": 4.6793002915451894e-05, + "loss": 3.1874, + "step": 4380 + }, + { + "epoch": 1.599562761887411, + "grad_norm": 0.85546875, + "learning_rate": 4.667152575315841e-05, + "loss": 3.2192, + "step": 4390 + }, + { + "epoch": 1.6032064128256514, + "grad_norm": 0.7734375, + "learning_rate": 4.655004859086492e-05, + "loss": 3.1632, + "step": 4400 + }, + { + "epoch": 1.6068500637638914, + "grad_norm": 0.734375, + "learning_rate": 4.642857142857143e-05, + "loss": 3.1723, + "step": 4410 + }, + { + "epoch": 1.6104937147021317, + "grad_norm": 0.91015625, + "learning_rate": 4.630709426627794e-05, + "loss": 3.1858, + "step": 4420 + }, + { + "epoch": 1.6141373656403717, + "grad_norm": 0.84765625, + "learning_rate": 4.6185617103984455e-05, + "loss": 3.1226, + "step": 4430 + }, + { + "epoch": 1.6177810165786117, + "grad_norm": 0.87109375, + "learning_rate": 4.6064139941690965e-05, + "loss": 3.2065, + "step": 4440 + }, + { + "epoch": 1.6214246675168518, + "grad_norm": 0.87890625, + "learning_rate": 4.5942662779397474e-05, + "loss": 3.105, + "step": 4450 + }, + { + "epoch": 1.625068318455092, + "grad_norm": 0.9609375, + "learning_rate": 4.5821185617103983e-05, + "loss": 3.1379, + "step": 4460 + }, + { + "epoch": 1.628711969393332, + "grad_norm": 0.75, + "learning_rate": 4.56997084548105e-05, + "loss": 3.1684, + "step": 4470 + }, + { + "epoch": 1.6323556203315723, + "grad_norm": 0.74609375, + "learning_rate": 4.557823129251701e-05, + "loss": 3.1278, + "step": 4480 + }, + { + "epoch": 1.6359992712698124, + "grad_norm": 0.84765625, + "learning_rate": 4.5456754130223525e-05, + "loss": 3.1971, + "step": 4490 + }, + { + "epoch": 1.6396429222080524, + "grad_norm": 0.9296875, + "learning_rate": 4.533527696793003e-05, + "loss": 3.1004, + "step": 4500 + }, + { + "epoch": 1.6432865731462925, + "grad_norm": 0.81640625, + "learning_rate": 4.5213799805636544e-05, + "loss": 3.1026, + "step": 4510 + }, + { + "epoch": 1.6469302240845327, + "grad_norm": 0.80859375, + "learning_rate": 4.5092322643343054e-05, + "loss": 3.1681, + "step": 4520 + }, + { + "epoch": 1.650573875022773, + "grad_norm": 0.7109375, + "learning_rate": 4.497084548104957e-05, + "loss": 3.185, + "step": 4530 + }, + { + "epoch": 1.654217525961013, + "grad_norm": 0.859375, + "learning_rate": 4.484936831875607e-05, + "loss": 3.1992, + "step": 4540 + }, + { + "epoch": 1.657861176899253, + "grad_norm": 0.953125, + "learning_rate": 4.472789115646259e-05, + "loss": 3.1486, + "step": 4550 + }, + { + "epoch": 1.661504827837493, + "grad_norm": 0.8671875, + "learning_rate": 4.46064139941691e-05, + "loss": 3.1765, + "step": 4560 + }, + { + "epoch": 1.6651484787757331, + "grad_norm": 0.77734375, + "learning_rate": 4.4484936831875615e-05, + "loss": 3.1672, + "step": 4570 + }, + { + "epoch": 1.6687921297139734, + "grad_norm": 0.734375, + "learning_rate": 4.436345966958212e-05, + "loss": 3.1509, + "step": 4580 + }, + { + "epoch": 1.6724357806522137, + "grad_norm": 0.84765625, + "learning_rate": 4.4241982507288634e-05, + "loss": 3.1479, + "step": 4590 + }, + { + "epoch": 1.6760794315904537, + "grad_norm": 0.84765625, + "learning_rate": 4.412050534499514e-05, + "loss": 3.1274, + "step": 4600 + }, + { + "epoch": 1.6797230825286937, + "grad_norm": 0.859375, + "learning_rate": 4.399902818270165e-05, + "loss": 3.1988, + "step": 4610 + }, + { + "epoch": 1.6833667334669338, + "grad_norm": 0.765625, + "learning_rate": 4.387755102040816e-05, + "loss": 3.1433, + "step": 4620 + }, + { + "epoch": 1.687010384405174, + "grad_norm": 0.76171875, + "learning_rate": 4.375607385811468e-05, + "loss": 3.1616, + "step": 4630 + }, + { + "epoch": 1.690654035343414, + "grad_norm": 0.8515625, + "learning_rate": 4.363459669582119e-05, + "loss": 3.2244, + "step": 4640 + }, + { + "epoch": 1.6942976862816543, + "grad_norm": 0.9921875, + "learning_rate": 4.35131195335277e-05, + "loss": 3.2014, + "step": 4650 + }, + { + "epoch": 1.6979413372198944, + "grad_norm": 0.859375, + "learning_rate": 4.3391642371234207e-05, + "loss": 3.1558, + "step": 4660 + }, + { + "epoch": 1.7015849881581344, + "grad_norm": 0.93359375, + "learning_rate": 4.327016520894072e-05, + "loss": 3.1166, + "step": 4670 + }, + { + "epoch": 1.7052286390963745, + "grad_norm": 0.89453125, + "learning_rate": 4.314868804664723e-05, + "loss": 3.1352, + "step": 4680 + }, + { + "epoch": 1.7088722900346147, + "grad_norm": 0.9453125, + "learning_rate": 4.302721088435374e-05, + "loss": 3.1346, + "step": 4690 + }, + { + "epoch": 1.7125159409728548, + "grad_norm": 0.7890625, + "learning_rate": 4.290573372206025e-05, + "loss": 3.1268, + "step": 4700 + }, + { + "epoch": 1.716159591911095, + "grad_norm": 0.8828125, + "learning_rate": 4.278425655976677e-05, + "loss": 3.2035, + "step": 4710 + }, + { + "epoch": 1.719803242849335, + "grad_norm": 0.75390625, + "learning_rate": 4.266277939747328e-05, + "loss": 3.153, + "step": 4720 + }, + { + "epoch": 1.723446893787575, + "grad_norm": 0.78125, + "learning_rate": 4.2541302235179786e-05, + "loss": 3.1211, + "step": 4730 + }, + { + "epoch": 1.7270905447258151, + "grad_norm": 0.75, + "learning_rate": 4.2419825072886296e-05, + "loss": 3.1218, + "step": 4740 + }, + { + "epoch": 1.7307341956640554, + "grad_norm": 0.9140625, + "learning_rate": 4.229834791059281e-05, + "loss": 3.1847, + "step": 4750 + }, + { + "epoch": 1.7343778466022957, + "grad_norm": 0.99609375, + "learning_rate": 4.217687074829932e-05, + "loss": 3.1372, + "step": 4760 + }, + { + "epoch": 1.7380214975405357, + "grad_norm": 0.8359375, + "learning_rate": 4.205539358600583e-05, + "loss": 3.1543, + "step": 4770 + }, + { + "epoch": 1.7416651484787757, + "grad_norm": 0.94140625, + "learning_rate": 4.193391642371235e-05, + "loss": 3.2583, + "step": 4780 + }, + { + "epoch": 1.7453087994170158, + "grad_norm": 1.0703125, + "learning_rate": 4.181243926141886e-05, + "loss": 3.2017, + "step": 4790 + }, + { + "epoch": 1.7489524503552558, + "grad_norm": 0.84375, + "learning_rate": 4.1690962099125366e-05, + "loss": 3.1221, + "step": 4800 + }, + { + "epoch": 1.752596101293496, + "grad_norm": 0.9140625, + "learning_rate": 4.1569484936831876e-05, + "loss": 3.017, + "step": 4810 + }, + { + "epoch": 1.7562397522317363, + "grad_norm": 0.84375, + "learning_rate": 4.144800777453839e-05, + "loss": 3.0838, + "step": 4820 + }, + { + "epoch": 1.7598834031699764, + "grad_norm": 0.83984375, + "learning_rate": 4.13265306122449e-05, + "loss": 3.1651, + "step": 4830 + }, + { + "epoch": 1.7635270541082164, + "grad_norm": 0.74609375, + "learning_rate": 4.120505344995141e-05, + "loss": 3.2177, + "step": 4840 + }, + { + "epoch": 1.7671707050464565, + "grad_norm": 0.91796875, + "learning_rate": 4.108357628765792e-05, + "loss": 3.2003, + "step": 4850 + }, + { + "epoch": 1.7708143559846967, + "grad_norm": 0.8359375, + "learning_rate": 4.0962099125364436e-05, + "loss": 3.2039, + "step": 4860 + }, + { + "epoch": 1.7744580069229368, + "grad_norm": 0.7890625, + "learning_rate": 4.0840621963070946e-05, + "loss": 3.1705, + "step": 4870 + }, + { + "epoch": 1.778101657861177, + "grad_norm": 0.8515625, + "learning_rate": 4.0719144800777455e-05, + "loss": 3.1413, + "step": 4880 + }, + { + "epoch": 1.781745308799417, + "grad_norm": 0.83203125, + "learning_rate": 4.0597667638483965e-05, + "loss": 3.177, + "step": 4890 + }, + { + "epoch": 1.785388959737657, + "grad_norm": 0.79296875, + "learning_rate": 4.047619047619048e-05, + "loss": 3.19, + "step": 4900 + }, + { + "epoch": 1.7890326106758971, + "grad_norm": 0.76171875, + "learning_rate": 4.035471331389699e-05, + "loss": 3.1219, + "step": 4910 + }, + { + "epoch": 1.7926762616141374, + "grad_norm": 0.8046875, + "learning_rate": 4.02332361516035e-05, + "loss": 3.2115, + "step": 4920 + }, + { + "epoch": 1.7963199125523774, + "grad_norm": 0.8671875, + "learning_rate": 4.011175898931001e-05, + "loss": 3.2519, + "step": 4930 + }, + { + "epoch": 1.7999635634906177, + "grad_norm": 0.76953125, + "learning_rate": 3.9990281827016526e-05, + "loss": 3.1165, + "step": 4940 + }, + { + "epoch": 1.8036072144288577, + "grad_norm": 0.890625, + "learning_rate": 3.9868804664723035e-05, + "loss": 3.1574, + "step": 4950 + }, + { + "epoch": 1.8072508653670978, + "grad_norm": 0.765625, + "learning_rate": 3.9747327502429545e-05, + "loss": 3.1719, + "step": 4960 + }, + { + "epoch": 1.8108945163053378, + "grad_norm": 0.87109375, + "learning_rate": 3.9625850340136054e-05, + "loss": 3.204, + "step": 4970 + }, + { + "epoch": 1.814538167243578, + "grad_norm": 0.7421875, + "learning_rate": 3.950437317784257e-05, + "loss": 3.1539, + "step": 4980 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.875, + "learning_rate": 3.938289601554908e-05, + "loss": 3.2391, + "step": 4990 + }, + { + "epoch": 1.8218254691200584, + "grad_norm": 0.88671875, + "learning_rate": 3.926141885325559e-05, + "loss": 3.2341, + "step": 5000 + }, + { + "epoch": 1.8254691200582984, + "grad_norm": 0.9296875, + "learning_rate": 3.91399416909621e-05, + "loss": 3.1369, + "step": 5010 + }, + { + "epoch": 1.8291127709965385, + "grad_norm": 0.83203125, + "learning_rate": 3.9018464528668615e-05, + "loss": 3.2285, + "step": 5020 + }, + { + "epoch": 1.8327564219347785, + "grad_norm": 0.84375, + "learning_rate": 3.8896987366375124e-05, + "loss": 3.1143, + "step": 5030 + }, + { + "epoch": 1.8364000728730188, + "grad_norm": 0.83203125, + "learning_rate": 3.8775510204081634e-05, + "loss": 3.1757, + "step": 5040 + }, + { + "epoch": 1.840043723811259, + "grad_norm": 0.87109375, + "learning_rate": 3.865403304178814e-05, + "loss": 3.0718, + "step": 5050 + }, + { + "epoch": 1.843687374749499, + "grad_norm": 1.0546875, + "learning_rate": 3.853255587949466e-05, + "loss": 3.0605, + "step": 5060 + }, + { + "epoch": 1.847331025687739, + "grad_norm": 0.96484375, + "learning_rate": 3.841107871720116e-05, + "loss": 3.1806, + "step": 5070 + }, + { + "epoch": 1.8509746766259791, + "grad_norm": 1.0234375, + "learning_rate": 3.828960155490768e-05, + "loss": 3.1507, + "step": 5080 + }, + { + "epoch": 1.8546183275642192, + "grad_norm": 0.78515625, + "learning_rate": 3.816812439261419e-05, + "loss": 3.1032, + "step": 5090 + }, + { + "epoch": 1.8582619785024594, + "grad_norm": 0.859375, + "learning_rate": 3.8046647230320704e-05, + "loss": 3.1767, + "step": 5100 + }, + { + "epoch": 1.8619056294406997, + "grad_norm": 0.9609375, + "learning_rate": 3.7925170068027214e-05, + "loss": 3.1871, + "step": 5110 + }, + { + "epoch": 1.8655492803789397, + "grad_norm": 0.83984375, + "learning_rate": 3.780369290573372e-05, + "loss": 3.2559, + "step": 5120 + }, + { + "epoch": 1.8691929313171798, + "grad_norm": 0.875, + "learning_rate": 3.768221574344023e-05, + "loss": 3.1719, + "step": 5130 + }, + { + "epoch": 1.8728365822554198, + "grad_norm": 0.83203125, + "learning_rate": 3.756073858114675e-05, + "loss": 3.1794, + "step": 5140 + }, + { + "epoch": 1.87648023319366, + "grad_norm": 0.7265625, + "learning_rate": 3.743926141885326e-05, + "loss": 3.1099, + "step": 5150 + }, + { + "epoch": 1.8801238841319001, + "grad_norm": 0.81640625, + "learning_rate": 3.731778425655977e-05, + "loss": 3.1963, + "step": 5160 + }, + { + "epoch": 1.8837675350701404, + "grad_norm": 0.8046875, + "learning_rate": 3.7196307094266284e-05, + "loss": 3.1068, + "step": 5170 + }, + { + "epoch": 1.8874111860083804, + "grad_norm": 0.7734375, + "learning_rate": 3.707482993197279e-05, + "loss": 3.1131, + "step": 5180 + }, + { + "epoch": 1.8910548369466205, + "grad_norm": 0.90625, + "learning_rate": 3.69533527696793e-05, + "loss": 3.2152, + "step": 5190 + }, + { + "epoch": 1.8946984878848605, + "grad_norm": 0.99609375, + "learning_rate": 3.683187560738581e-05, + "loss": 3.2625, + "step": 5200 + }, + { + "epoch": 1.8983421388231008, + "grad_norm": 0.95703125, + "learning_rate": 3.671039844509233e-05, + "loss": 3.1989, + "step": 5210 + }, + { + "epoch": 1.901985789761341, + "grad_norm": 0.84765625, + "learning_rate": 3.658892128279884e-05, + "loss": 3.2053, + "step": 5220 + }, + { + "epoch": 1.905629440699581, + "grad_norm": 0.8515625, + "learning_rate": 3.646744412050535e-05, + "loss": 3.2008, + "step": 5230 + }, + { + "epoch": 1.909273091637821, + "grad_norm": 0.80078125, + "learning_rate": 3.634596695821186e-05, + "loss": 3.1759, + "step": 5240 + }, + { + "epoch": 1.9129167425760611, + "grad_norm": 1.0234375, + "learning_rate": 3.622448979591837e-05, + "loss": 3.1949, + "step": 5250 + }, + { + "epoch": 1.9165603935143012, + "grad_norm": 0.93359375, + "learning_rate": 3.6103012633624876e-05, + "loss": 3.1669, + "step": 5260 + }, + { + "epoch": 1.9202040444525414, + "grad_norm": 0.9296875, + "learning_rate": 3.598153547133139e-05, + "loss": 3.1745, + "step": 5270 + }, + { + "epoch": 1.9238476953907817, + "grad_norm": 0.81640625, + "learning_rate": 3.58600583090379e-05, + "loss": 3.1438, + "step": 5280 + }, + { + "epoch": 1.9274913463290217, + "grad_norm": 0.7421875, + "learning_rate": 3.573858114674442e-05, + "loss": 3.1506, + "step": 5290 + }, + { + "epoch": 1.9311349972672618, + "grad_norm": 0.82421875, + "learning_rate": 3.561710398445092e-05, + "loss": 3.1507, + "step": 5300 + }, + { + "epoch": 1.9347786482055018, + "grad_norm": 1.140625, + "learning_rate": 3.549562682215744e-05, + "loss": 3.1188, + "step": 5310 + }, + { + "epoch": 1.9384222991437419, + "grad_norm": 0.88671875, + "learning_rate": 3.5374149659863946e-05, + "loss": 3.1295, + "step": 5320 + }, + { + "epoch": 1.9420659500819821, + "grad_norm": 0.93359375, + "learning_rate": 3.525267249757046e-05, + "loss": 3.2526, + "step": 5330 + }, + { + "epoch": 1.9457096010202224, + "grad_norm": 0.859375, + "learning_rate": 3.5131195335276965e-05, + "loss": 3.1166, + "step": 5340 + }, + { + "epoch": 1.9493532519584624, + "grad_norm": 0.86328125, + "learning_rate": 3.500971817298348e-05, + "loss": 3.1793, + "step": 5350 + }, + { + "epoch": 1.9529969028967025, + "grad_norm": 0.95703125, + "learning_rate": 3.488824101068999e-05, + "loss": 3.078, + "step": 5360 + }, + { + "epoch": 1.9566405538349425, + "grad_norm": 0.89453125, + "learning_rate": 3.476676384839651e-05, + "loss": 3.1149, + "step": 5370 + }, + { + "epoch": 1.9602842047731828, + "grad_norm": 0.8125, + "learning_rate": 3.464528668610301e-05, + "loss": 3.1697, + "step": 5380 + }, + { + "epoch": 1.9639278557114228, + "grad_norm": 0.7578125, + "learning_rate": 3.4523809523809526e-05, + "loss": 3.1781, + "step": 5390 + }, + { + "epoch": 1.967571506649663, + "grad_norm": 0.83984375, + "learning_rate": 3.4402332361516035e-05, + "loss": 3.11, + "step": 5400 + }, + { + "epoch": 1.971215157587903, + "grad_norm": 0.8203125, + "learning_rate": 3.428085519922255e-05, + "loss": 3.1757, + "step": 5410 + }, + { + "epoch": 1.9748588085261432, + "grad_norm": 0.89453125, + "learning_rate": 3.4159378036929054e-05, + "loss": 3.2046, + "step": 5420 + }, + { + "epoch": 1.9785024594643832, + "grad_norm": 0.96484375, + "learning_rate": 3.403790087463557e-05, + "loss": 3.2087, + "step": 5430 + }, + { + "epoch": 1.9821461104026235, + "grad_norm": 0.796875, + "learning_rate": 3.391642371234208e-05, + "loss": 3.1347, + "step": 5440 + }, + { + "epoch": 1.9857897613408635, + "grad_norm": 0.78125, + "learning_rate": 3.3794946550048596e-05, + "loss": 3.14, + "step": 5450 + }, + { + "epoch": 1.9894334122791038, + "grad_norm": 0.81640625, + "learning_rate": 3.36734693877551e-05, + "loss": 3.1691, + "step": 5460 + }, + { + "epoch": 1.9930770632173438, + "grad_norm": 0.86328125, + "learning_rate": 3.3551992225461615e-05, + "loss": 3.1885, + "step": 5470 + }, + { + "epoch": 1.9967207141555838, + "grad_norm": 0.7734375, + "learning_rate": 3.3430515063168125e-05, + "loss": 3.2177, + "step": 5480 + }, + { + "epoch": 2.000364365093824, + "grad_norm": 0.8515625, + "learning_rate": 3.3309037900874634e-05, + "loss": 3.2009, + "step": 5490 + }, + { + "epoch": 2.004008016032064, + "grad_norm": 0.890625, + "learning_rate": 3.318756073858115e-05, + "loss": 3.0367, + "step": 5500 + }, + { + "epoch": 2.0076516669703044, + "grad_norm": 0.875, + "learning_rate": 3.306608357628766e-05, + "loss": 3.0228, + "step": 5510 + }, + { + "epoch": 2.0112953179085444, + "grad_norm": 0.796875, + "learning_rate": 3.294460641399417e-05, + "loss": 3.0782, + "step": 5520 + }, + { + "epoch": 2.0149389688467845, + "grad_norm": 0.93359375, + "learning_rate": 3.282312925170068e-05, + "loss": 3.1693, + "step": 5530 + }, + { + "epoch": 2.0185826197850245, + "grad_norm": 0.81640625, + "learning_rate": 3.2701652089407195e-05, + "loss": 3.0759, + "step": 5540 + }, + { + "epoch": 2.0222262707232646, + "grad_norm": 0.9140625, + "learning_rate": 3.2580174927113704e-05, + "loss": 3.135, + "step": 5550 + }, + { + "epoch": 2.025869921661505, + "grad_norm": 0.9375, + "learning_rate": 3.245869776482022e-05, + "loss": 3.1345, + "step": 5560 + }, + { + "epoch": 2.029513572599745, + "grad_norm": 0.85546875, + "learning_rate": 3.233722060252672e-05, + "loss": 3.0659, + "step": 5570 + }, + { + "epoch": 2.033157223537985, + "grad_norm": 0.8046875, + "learning_rate": 3.221574344023324e-05, + "loss": 3.1813, + "step": 5580 + }, + { + "epoch": 2.036800874476225, + "grad_norm": 0.83984375, + "learning_rate": 3.209426627793975e-05, + "loss": 3.0976, + "step": 5590 + }, + { + "epoch": 2.040444525414465, + "grad_norm": 0.8203125, + "learning_rate": 3.1972789115646265e-05, + "loss": 3.1173, + "step": 5600 + }, + { + "epoch": 2.0440881763527052, + "grad_norm": 0.8203125, + "learning_rate": 3.185131195335277e-05, + "loss": 3.1076, + "step": 5610 + }, + { + "epoch": 2.0477318272909457, + "grad_norm": 0.859375, + "learning_rate": 3.1729834791059284e-05, + "loss": 3.112, + "step": 5620 + }, + { + "epoch": 2.0513754782291858, + "grad_norm": 0.95703125, + "learning_rate": 3.1608357628765794e-05, + "loss": 3.0959, + "step": 5630 + }, + { + "epoch": 2.055019129167426, + "grad_norm": 1.0078125, + "learning_rate": 3.148688046647231e-05, + "loss": 3.0968, + "step": 5640 + }, + { + "epoch": 2.058662780105666, + "grad_norm": 0.8671875, + "learning_rate": 3.136540330417881e-05, + "loss": 3.0484, + "step": 5650 + }, + { + "epoch": 2.062306431043906, + "grad_norm": 0.7734375, + "learning_rate": 3.124392614188533e-05, + "loss": 3.1581, + "step": 5660 + }, + { + "epoch": 2.065950081982146, + "grad_norm": 0.9375, + "learning_rate": 3.112244897959184e-05, + "loss": 3.0967, + "step": 5670 + }, + { + "epoch": 2.0695937329203864, + "grad_norm": 0.88671875, + "learning_rate": 3.1000971817298355e-05, + "loss": 3.0299, + "step": 5680 + }, + { + "epoch": 2.0732373838586264, + "grad_norm": 0.82421875, + "learning_rate": 3.087949465500486e-05, + "loss": 3.1771, + "step": 5690 + }, + { + "epoch": 2.0768810347968665, + "grad_norm": 0.98828125, + "learning_rate": 3.0758017492711373e-05, + "loss": 3.1248, + "step": 5700 + }, + { + "epoch": 2.0805246857351065, + "grad_norm": 0.9609375, + "learning_rate": 3.063654033041788e-05, + "loss": 3.1227, + "step": 5710 + }, + { + "epoch": 2.0841683366733466, + "grad_norm": 0.83203125, + "learning_rate": 3.0515063168124392e-05, + "loss": 3.1019, + "step": 5720 + }, + { + "epoch": 2.0878119876115866, + "grad_norm": 1.015625, + "learning_rate": 3.0393586005830905e-05, + "loss": 3.151, + "step": 5730 + }, + { + "epoch": 2.091455638549827, + "grad_norm": 0.7890625, + "learning_rate": 3.0272108843537418e-05, + "loss": 3.0596, + "step": 5740 + }, + { + "epoch": 2.095099289488067, + "grad_norm": 0.84765625, + "learning_rate": 3.015063168124393e-05, + "loss": 3.1311, + "step": 5750 + }, + { + "epoch": 2.098742940426307, + "grad_norm": 0.8828125, + "learning_rate": 3.0029154518950437e-05, + "loss": 3.1926, + "step": 5760 + }, + { + "epoch": 2.102386591364547, + "grad_norm": 1.03125, + "learning_rate": 2.990767735665695e-05, + "loss": 3.1265, + "step": 5770 + }, + { + "epoch": 2.1060302423027872, + "grad_norm": 0.96875, + "learning_rate": 2.9786200194363463e-05, + "loss": 3.1637, + "step": 5780 + }, + { + "epoch": 2.1096738932410277, + "grad_norm": 0.9296875, + "learning_rate": 2.9664723032069976e-05, + "loss": 3.135, + "step": 5790 + }, + { + "epoch": 2.1133175441792678, + "grad_norm": 0.90234375, + "learning_rate": 2.954324586977648e-05, + "loss": 2.9778, + "step": 5800 + }, + { + "epoch": 2.116961195117508, + "grad_norm": 0.87890625, + "learning_rate": 2.9421768707482994e-05, + "loss": 3.1818, + "step": 5810 + }, + { + "epoch": 2.120604846055748, + "grad_norm": 0.8125, + "learning_rate": 2.9300291545189507e-05, + "loss": 3.0939, + "step": 5820 + }, + { + "epoch": 2.124248496993988, + "grad_norm": 0.94140625, + "learning_rate": 2.917881438289602e-05, + "loss": 3.1183, + "step": 5830 + }, + { + "epoch": 2.127892147932228, + "grad_norm": 0.8515625, + "learning_rate": 2.9057337220602526e-05, + "loss": 3.1304, + "step": 5840 + }, + { + "epoch": 2.1315357988704684, + "grad_norm": 0.8671875, + "learning_rate": 2.893586005830904e-05, + "loss": 3.0758, + "step": 5850 + }, + { + "epoch": 2.1351794498087084, + "grad_norm": 0.91015625, + "learning_rate": 2.8814382896015552e-05, + "loss": 3.0331, + "step": 5860 + }, + { + "epoch": 2.1388231007469485, + "grad_norm": 1.0234375, + "learning_rate": 2.8692905733722065e-05, + "loss": 3.1495, + "step": 5870 + }, + { + "epoch": 2.1424667516851885, + "grad_norm": 0.77734375, + "learning_rate": 2.857142857142857e-05, + "loss": 3.0309, + "step": 5880 + }, + { + "epoch": 2.1461104026234286, + "grad_norm": 0.8984375, + "learning_rate": 2.8449951409135084e-05, + "loss": 3.1425, + "step": 5890 + }, + { + "epoch": 2.1497540535616686, + "grad_norm": 0.890625, + "learning_rate": 2.8328474246841597e-05, + "loss": 3.0591, + "step": 5900 + }, + { + "epoch": 2.153397704499909, + "grad_norm": 1.1796875, + "learning_rate": 2.820699708454811e-05, + "loss": 3.1644, + "step": 5910 + }, + { + "epoch": 2.157041355438149, + "grad_norm": 0.92578125, + "learning_rate": 2.8085519922254615e-05, + "loss": 3.0664, + "step": 5920 + }, + { + "epoch": 2.160685006376389, + "grad_norm": 0.94921875, + "learning_rate": 2.796404275996113e-05, + "loss": 3.1222, + "step": 5930 + }, + { + "epoch": 2.164328657314629, + "grad_norm": 0.875, + "learning_rate": 2.784256559766764e-05, + "loss": 3.1721, + "step": 5940 + }, + { + "epoch": 2.1679723082528692, + "grad_norm": 1.046875, + "learning_rate": 2.7721088435374147e-05, + "loss": 3.2084, + "step": 5950 + }, + { + "epoch": 2.1716159591911097, + "grad_norm": 0.82421875, + "learning_rate": 2.759961127308066e-05, + "loss": 3.1617, + "step": 5960 + }, + { + "epoch": 2.1752596101293498, + "grad_norm": 0.8515625, + "learning_rate": 2.7478134110787173e-05, + "loss": 3.1179, + "step": 5970 + }, + { + "epoch": 2.17890326106759, + "grad_norm": 0.9765625, + "learning_rate": 2.7356656948493686e-05, + "loss": 3.09, + "step": 5980 + }, + { + "epoch": 2.18254691200583, + "grad_norm": 0.9296875, + "learning_rate": 2.7235179786200192e-05, + "loss": 3.0906, + "step": 5990 + }, + { + "epoch": 2.18619056294407, + "grad_norm": 0.83203125, + "learning_rate": 2.7113702623906705e-05, + "loss": 3.118, + "step": 6000 + }, + { + "epoch": 2.18983421388231, + "grad_norm": 0.953125, + "learning_rate": 2.6992225461613218e-05, + "loss": 3.0609, + "step": 6010 + }, + { + "epoch": 2.19347786482055, + "grad_norm": 0.9609375, + "learning_rate": 2.687074829931973e-05, + "loss": 3.1233, + "step": 6020 + }, + { + "epoch": 2.1971215157587904, + "grad_norm": 0.85546875, + "learning_rate": 2.674927113702624e-05, + "loss": 3.1228, + "step": 6030 + }, + { + "epoch": 2.2007651666970305, + "grad_norm": 0.828125, + "learning_rate": 2.662779397473275e-05, + "loss": 3.0195, + "step": 6040 + }, + { + "epoch": 2.2044088176352705, + "grad_norm": 0.921875, + "learning_rate": 2.6506316812439262e-05, + "loss": 3.1056, + "step": 6050 + }, + { + "epoch": 2.2080524685735106, + "grad_norm": 0.921875, + "learning_rate": 2.6384839650145775e-05, + "loss": 3.0823, + "step": 6060 + }, + { + "epoch": 2.2116961195117506, + "grad_norm": 0.97265625, + "learning_rate": 2.6263362487852285e-05, + "loss": 3.0095, + "step": 6070 + }, + { + "epoch": 2.215339770449991, + "grad_norm": 0.98828125, + "learning_rate": 2.6141885325558797e-05, + "loss": 3.1279, + "step": 6080 + }, + { + "epoch": 2.218983421388231, + "grad_norm": 0.875, + "learning_rate": 2.6020408163265307e-05, + "loss": 3.0758, + "step": 6090 + }, + { + "epoch": 2.222627072326471, + "grad_norm": 0.98828125, + "learning_rate": 2.589893100097182e-05, + "loss": 3.0668, + "step": 6100 + }, + { + "epoch": 2.226270723264711, + "grad_norm": 1.1171875, + "learning_rate": 2.577745383867833e-05, + "loss": 3.0601, + "step": 6110 + }, + { + "epoch": 2.2299143742029512, + "grad_norm": 0.94140625, + "learning_rate": 2.5655976676384842e-05, + "loss": 3.0568, + "step": 6120 + }, + { + "epoch": 2.2335580251411913, + "grad_norm": 0.90234375, + "learning_rate": 2.5534499514091355e-05, + "loss": 3.1399, + "step": 6130 + }, + { + "epoch": 2.2372016760794318, + "grad_norm": 0.9375, + "learning_rate": 2.541302235179786e-05, + "loss": 3.1351, + "step": 6140 + }, + { + "epoch": 2.240845327017672, + "grad_norm": 1.078125, + "learning_rate": 2.5291545189504374e-05, + "loss": 3.0368, + "step": 6150 + }, + { + "epoch": 2.244488977955912, + "grad_norm": 1.046875, + "learning_rate": 2.5170068027210887e-05, + "loss": 3.1425, + "step": 6160 + }, + { + "epoch": 2.248132628894152, + "grad_norm": 0.8984375, + "learning_rate": 2.50485908649174e-05, + "loss": 3.1455, + "step": 6170 + }, + { + "epoch": 2.251776279832392, + "grad_norm": 0.9453125, + "learning_rate": 2.492711370262391e-05, + "loss": 3.1195, + "step": 6180 + }, + { + "epoch": 2.255419930770632, + "grad_norm": 0.953125, + "learning_rate": 2.480563654033042e-05, + "loss": 3.1929, + "step": 6190 + }, + { + "epoch": 2.2590635817088724, + "grad_norm": 0.87109375, + "learning_rate": 2.468415937803693e-05, + "loss": 3.0456, + "step": 6200 + }, + { + "epoch": 2.2627072326471125, + "grad_norm": 1.1328125, + "learning_rate": 2.456268221574344e-05, + "loss": 3.1606, + "step": 6210 + }, + { + "epoch": 2.2663508835853525, + "grad_norm": 0.83984375, + "learning_rate": 2.4441205053449954e-05, + "loss": 3.054, + "step": 6220 + }, + { + "epoch": 2.2699945345235926, + "grad_norm": 1.0, + "learning_rate": 2.4319727891156463e-05, + "loss": 3.17, + "step": 6230 + }, + { + "epoch": 2.2736381854618326, + "grad_norm": 1.09375, + "learning_rate": 2.4198250728862976e-05, + "loss": 3.0642, + "step": 6240 + }, + { + "epoch": 2.277281836400073, + "grad_norm": 0.7734375, + "learning_rate": 2.4076773566569485e-05, + "loss": 2.9784, + "step": 6250 + }, + { + "epoch": 2.280925487338313, + "grad_norm": 0.96875, + "learning_rate": 2.3955296404275998e-05, + "loss": 3.0481, + "step": 6260 + }, + { + "epoch": 2.284569138276553, + "grad_norm": 0.91796875, + "learning_rate": 2.3833819241982508e-05, + "loss": 3.1128, + "step": 6270 + }, + { + "epoch": 2.288212789214793, + "grad_norm": 0.9296875, + "learning_rate": 2.371234207968902e-05, + "loss": 3.0554, + "step": 6280 + }, + { + "epoch": 2.2918564401530332, + "grad_norm": 1.1953125, + "learning_rate": 2.359086491739553e-05, + "loss": 3.1442, + "step": 6290 + }, + { + "epoch": 2.2955000910912733, + "grad_norm": 0.86328125, + "learning_rate": 2.3469387755102043e-05, + "loss": 3.1732, + "step": 6300 + }, + { + "epoch": 2.2991437420295133, + "grad_norm": 0.90625, + "learning_rate": 2.3347910592808552e-05, + "loss": 3.1065, + "step": 6310 + }, + { + "epoch": 2.302787392967754, + "grad_norm": 0.90234375, + "learning_rate": 2.3226433430515065e-05, + "loss": 3.1013, + "step": 6320 + }, + { + "epoch": 2.306431043905994, + "grad_norm": 0.859375, + "learning_rate": 2.3104956268221575e-05, + "loss": 3.1159, + "step": 6330 + }, + { + "epoch": 2.310074694844234, + "grad_norm": 0.953125, + "learning_rate": 2.2983479105928087e-05, + "loss": 3.0996, + "step": 6340 + }, + { + "epoch": 2.313718345782474, + "grad_norm": 0.85546875, + "learning_rate": 2.2862001943634597e-05, + "loss": 3.1101, + "step": 6350 + }, + { + "epoch": 2.317361996720714, + "grad_norm": 1.0546875, + "learning_rate": 2.2740524781341106e-05, + "loss": 3.1715, + "step": 6360 + }, + { + "epoch": 2.3210056476589545, + "grad_norm": 0.890625, + "learning_rate": 2.261904761904762e-05, + "loss": 3.0314, + "step": 6370 + }, + { + "epoch": 2.3246492985971945, + "grad_norm": 0.9921875, + "learning_rate": 2.249757045675413e-05, + "loss": 3.1542, + "step": 6380 + }, + { + "epoch": 2.3282929495354345, + "grad_norm": 0.8984375, + "learning_rate": 2.237609329446064e-05, + "loss": 3.1521, + "step": 6390 + }, + { + "epoch": 2.3319366004736746, + "grad_norm": 0.90625, + "learning_rate": 2.225461613216715e-05, + "loss": 3.1132, + "step": 6400 + }, + { + "epoch": 2.3355802514119146, + "grad_norm": 0.94921875, + "learning_rate": 2.2133138969873664e-05, + "loss": 3.0016, + "step": 6410 + }, + { + "epoch": 2.339223902350155, + "grad_norm": 0.9921875, + "learning_rate": 2.2011661807580177e-05, + "loss": 3.1012, + "step": 6420 + }, + { + "epoch": 2.342867553288395, + "grad_norm": 0.9375, + "learning_rate": 2.1890184645286686e-05, + "loss": 3.0911, + "step": 6430 + }, + { + "epoch": 2.346511204226635, + "grad_norm": 0.8984375, + "learning_rate": 2.17687074829932e-05, + "loss": 3.0734, + "step": 6440 + }, + { + "epoch": 2.350154855164875, + "grad_norm": 1.03125, + "learning_rate": 2.1647230320699712e-05, + "loss": 3.1034, + "step": 6450 + }, + { + "epoch": 2.3537985061031153, + "grad_norm": 0.90234375, + "learning_rate": 2.152575315840622e-05, + "loss": 3.1082, + "step": 6460 + }, + { + "epoch": 2.3574421570413553, + "grad_norm": 0.76953125, + "learning_rate": 2.1404275996112734e-05, + "loss": 3.078, + "step": 6470 + }, + { + "epoch": 2.3610858079795953, + "grad_norm": 0.828125, + "learning_rate": 2.1282798833819244e-05, + "loss": 3.077, + "step": 6480 + }, + { + "epoch": 2.364729458917836, + "grad_norm": 0.91015625, + "learning_rate": 2.1161321671525756e-05, + "loss": 3.0717, + "step": 6490 + }, + { + "epoch": 2.368373109856076, + "grad_norm": 0.91015625, + "learning_rate": 2.1039844509232266e-05, + "loss": 3.0983, + "step": 6500 + }, + { + "epoch": 2.372016760794316, + "grad_norm": 0.890625, + "learning_rate": 2.091836734693878e-05, + "loss": 3.0621, + "step": 6510 + }, + { + "epoch": 2.375660411732556, + "grad_norm": 1.015625, + "learning_rate": 2.0796890184645288e-05, + "loss": 3.0199, + "step": 6520 + }, + { + "epoch": 2.379304062670796, + "grad_norm": 0.94921875, + "learning_rate": 2.06754130223518e-05, + "loss": 3.1456, + "step": 6530 + }, + { + "epoch": 2.3829477136090365, + "grad_norm": 1.125, + "learning_rate": 2.055393586005831e-05, + "loss": 3.0592, + "step": 6540 + }, + { + "epoch": 2.3865913645472765, + "grad_norm": 0.96875, + "learning_rate": 2.0432458697764823e-05, + "loss": 3.0988, + "step": 6550 + }, + { + "epoch": 2.3902350154855165, + "grad_norm": 0.875, + "learning_rate": 2.0310981535471333e-05, + "loss": 3.1426, + "step": 6560 + }, + { + "epoch": 2.3938786664237566, + "grad_norm": 1.03125, + "learning_rate": 2.0189504373177842e-05, + "loss": 3.0916, + "step": 6570 + }, + { + "epoch": 2.3975223173619966, + "grad_norm": 0.92578125, + "learning_rate": 2.0068027210884355e-05, + "loss": 3.1088, + "step": 6580 + }, + { + "epoch": 2.4011659683002367, + "grad_norm": 0.91796875, + "learning_rate": 1.9946550048590865e-05, + "loss": 3.0977, + "step": 6590 + }, + { + "epoch": 2.404809619238477, + "grad_norm": 0.875, + "learning_rate": 1.9825072886297377e-05, + "loss": 3.1589, + "step": 6600 + }, + { + "epoch": 2.408453270176717, + "grad_norm": 0.9375, + "learning_rate": 1.9703595724003887e-05, + "loss": 3.0655, + "step": 6610 + }, + { + "epoch": 2.412096921114957, + "grad_norm": 0.95703125, + "learning_rate": 1.95821185617104e-05, + "loss": 3.085, + "step": 6620 + }, + { + "epoch": 2.4157405720531973, + "grad_norm": 1.0, + "learning_rate": 1.946064139941691e-05, + "loss": 3.0235, + "step": 6630 + }, + { + "epoch": 2.4193842229914373, + "grad_norm": 1.015625, + "learning_rate": 1.9339164237123422e-05, + "loss": 3.1214, + "step": 6640 + }, + { + "epoch": 2.4230278739296773, + "grad_norm": 0.8671875, + "learning_rate": 1.921768707482993e-05, + "loss": 3.1466, + "step": 6650 + }, + { + "epoch": 2.426671524867918, + "grad_norm": 1.1640625, + "learning_rate": 1.9096209912536444e-05, + "loss": 3.0947, + "step": 6660 + }, + { + "epoch": 2.430315175806158, + "grad_norm": 0.984375, + "learning_rate": 1.8974732750242954e-05, + "loss": 3.1565, + "step": 6670 + }, + { + "epoch": 2.433958826744398, + "grad_norm": 0.9921875, + "learning_rate": 1.8853255587949467e-05, + "loss": 3.1209, + "step": 6680 + }, + { + "epoch": 2.437602477682638, + "grad_norm": 0.80078125, + "learning_rate": 1.8731778425655976e-05, + "loss": 3.0784, + "step": 6690 + }, + { + "epoch": 2.441246128620878, + "grad_norm": 0.828125, + "learning_rate": 1.861030126336249e-05, + "loss": 3.1211, + "step": 6700 + }, + { + "epoch": 2.4448897795591185, + "grad_norm": 0.828125, + "learning_rate": 1.8488824101069e-05, + "loss": 3.1112, + "step": 6710 + }, + { + "epoch": 2.4485334304973585, + "grad_norm": 0.9375, + "learning_rate": 1.836734693877551e-05, + "loss": 3.1023, + "step": 6720 + }, + { + "epoch": 2.4521770814355985, + "grad_norm": 0.98046875, + "learning_rate": 1.824586977648202e-05, + "loss": 3.1156, + "step": 6730 + }, + { + "epoch": 2.4558207323738386, + "grad_norm": 0.9140625, + "learning_rate": 1.8124392614188534e-05, + "loss": 3.104, + "step": 6740 + }, + { + "epoch": 2.4594643833120786, + "grad_norm": 0.90234375, + "learning_rate": 1.8002915451895043e-05, + "loss": 3.1426, + "step": 6750 + }, + { + "epoch": 2.4631080342503187, + "grad_norm": 0.80859375, + "learning_rate": 1.7881438289601556e-05, + "loss": 3.126, + "step": 6760 + }, + { + "epoch": 2.4667516851885587, + "grad_norm": 0.953125, + "learning_rate": 1.7759961127308065e-05, + "loss": 3.104, + "step": 6770 + }, + { + "epoch": 2.470395336126799, + "grad_norm": 0.8359375, + "learning_rate": 1.7638483965014578e-05, + "loss": 3.1772, + "step": 6780 + }, + { + "epoch": 2.474038987065039, + "grad_norm": 1.0078125, + "learning_rate": 1.7517006802721088e-05, + "loss": 3.0617, + "step": 6790 + }, + { + "epoch": 2.4776826380032793, + "grad_norm": 1.0859375, + "learning_rate": 1.73955296404276e-05, + "loss": 3.0941, + "step": 6800 + }, + { + "epoch": 2.4813262889415193, + "grad_norm": 1.0625, + "learning_rate": 1.7274052478134113e-05, + "loss": 3.1459, + "step": 6810 + }, + { + "epoch": 2.4849699398797593, + "grad_norm": 0.9609375, + "learning_rate": 1.7152575315840623e-05, + "loss": 3.1183, + "step": 6820 + }, + { + "epoch": 2.488613590818, + "grad_norm": 1.25, + "learning_rate": 1.7031098153547136e-05, + "loss": 3.1467, + "step": 6830 + }, + { + "epoch": 2.49225724175624, + "grad_norm": 1.03125, + "learning_rate": 1.6909620991253645e-05, + "loss": 3.1645, + "step": 6840 + }, + { + "epoch": 2.49590089269448, + "grad_norm": 1.171875, + "learning_rate": 1.6788143828960158e-05, + "loss": 3.1622, + "step": 6850 + }, + { + "epoch": 2.49954454363272, + "grad_norm": 0.97265625, + "learning_rate": 1.6666666666666667e-05, + "loss": 3.1339, + "step": 6860 + }, + { + "epoch": 2.50318819457096, + "grad_norm": 0.93359375, + "learning_rate": 1.654518950437318e-05, + "loss": 3.0641, + "step": 6870 + }, + { + "epoch": 2.5068318455092005, + "grad_norm": 0.8984375, + "learning_rate": 1.642371234207969e-05, + "loss": 3.1318, + "step": 6880 + }, + { + "epoch": 2.51047549644744, + "grad_norm": 0.8671875, + "learning_rate": 1.6302235179786203e-05, + "loss": 3.1287, + "step": 6890 + }, + { + "epoch": 2.5141191473856805, + "grad_norm": 0.8203125, + "learning_rate": 1.6180758017492712e-05, + "loss": 3.0884, + "step": 6900 + }, + { + "epoch": 2.5177627983239206, + "grad_norm": 0.83203125, + "learning_rate": 1.6059280855199225e-05, + "loss": 3.1332, + "step": 6910 + }, + { + "epoch": 2.5214064492621606, + "grad_norm": 0.9375, + "learning_rate": 1.5937803692905734e-05, + "loss": 3.129, + "step": 6920 + }, + { + "epoch": 2.5250501002004007, + "grad_norm": 1.125, + "learning_rate": 1.5816326530612247e-05, + "loss": 3.134, + "step": 6930 + }, + { + "epoch": 2.5286937511386407, + "grad_norm": 0.93359375, + "learning_rate": 1.5694849368318757e-05, + "loss": 3.0906, + "step": 6940 + }, + { + "epoch": 2.532337402076881, + "grad_norm": 1.09375, + "learning_rate": 1.557337220602527e-05, + "loss": 3.1158, + "step": 6950 + }, + { + "epoch": 2.5359810530151212, + "grad_norm": 0.80859375, + "learning_rate": 1.545189504373178e-05, + "loss": 3.0513, + "step": 6960 + }, + { + "epoch": 2.5396247039533613, + "grad_norm": 0.87890625, + "learning_rate": 1.5330417881438292e-05, + "loss": 3.0564, + "step": 6970 + }, + { + "epoch": 2.5432683548916013, + "grad_norm": 0.9140625, + "learning_rate": 1.5208940719144801e-05, + "loss": 3.1163, + "step": 6980 + }, + { + "epoch": 2.5469120058298413, + "grad_norm": 0.83984375, + "learning_rate": 1.5087463556851314e-05, + "loss": 3.0554, + "step": 6990 + }, + { + "epoch": 2.550555656768082, + "grad_norm": 0.94921875, + "learning_rate": 1.4965986394557824e-05, + "loss": 3.1519, + "step": 7000 + }, + { + "epoch": 2.554199307706322, + "grad_norm": 0.96484375, + "learning_rate": 1.4844509232264333e-05, + "loss": 3.0809, + "step": 7010 + }, + { + "epoch": 2.557842958644562, + "grad_norm": 0.90234375, + "learning_rate": 1.4723032069970846e-05, + "loss": 3.0894, + "step": 7020 + }, + { + "epoch": 2.561486609582802, + "grad_norm": 1.1015625, + "learning_rate": 1.4601554907677355e-05, + "loss": 3.2001, + "step": 7030 + }, + { + "epoch": 2.565130260521042, + "grad_norm": 1.046875, + "learning_rate": 1.4480077745383868e-05, + "loss": 3.0819, + "step": 7040 + }, + { + "epoch": 2.5687739114592825, + "grad_norm": 1.2109375, + "learning_rate": 1.435860058309038e-05, + "loss": 3.084, + "step": 7050 + }, + { + "epoch": 2.572417562397522, + "grad_norm": 0.9296875, + "learning_rate": 1.423712342079689e-05, + "loss": 3.0735, + "step": 7060 + }, + { + "epoch": 2.5760612133357625, + "grad_norm": 1.0625, + "learning_rate": 1.4115646258503402e-05, + "loss": 3.1273, + "step": 7070 + }, + { + "epoch": 2.5797048642740026, + "grad_norm": 0.91015625, + "learning_rate": 1.3994169096209913e-05, + "loss": 3.1316, + "step": 7080 + }, + { + "epoch": 2.5833485152122426, + "grad_norm": 0.96484375, + "learning_rate": 1.3872691933916424e-05, + "loss": 3.1375, + "step": 7090 + }, + { + "epoch": 2.5869921661504827, + "grad_norm": 0.91015625, + "learning_rate": 1.3751214771622937e-05, + "loss": 3.1154, + "step": 7100 + }, + { + "epoch": 2.5906358170887227, + "grad_norm": 0.92578125, + "learning_rate": 1.3629737609329446e-05, + "loss": 3.1512, + "step": 7110 + }, + { + "epoch": 2.594279468026963, + "grad_norm": 0.9140625, + "learning_rate": 1.350826044703596e-05, + "loss": 3.087, + "step": 7120 + }, + { + "epoch": 2.5979231189652032, + "grad_norm": 1.046875, + "learning_rate": 1.3386783284742469e-05, + "loss": 3.1213, + "step": 7130 + }, + { + "epoch": 2.6015667699034433, + "grad_norm": 0.92578125, + "learning_rate": 1.3265306122448982e-05, + "loss": 3.1229, + "step": 7140 + }, + { + "epoch": 2.6052104208416833, + "grad_norm": 1.03125, + "learning_rate": 1.3143828960155491e-05, + "loss": 3.1252, + "step": 7150 + }, + { + "epoch": 2.6088540717799233, + "grad_norm": 0.890625, + "learning_rate": 1.3022351797862004e-05, + "loss": 3.0816, + "step": 7160 + }, + { + "epoch": 2.612497722718164, + "grad_norm": 1.109375, + "learning_rate": 1.2900874635568513e-05, + "loss": 3.1299, + "step": 7170 + }, + { + "epoch": 2.616141373656404, + "grad_norm": 0.92578125, + "learning_rate": 1.2779397473275026e-05, + "loss": 3.0499, + "step": 7180 + }, + { + "epoch": 2.619785024594644, + "grad_norm": 1.1640625, + "learning_rate": 1.2657920310981536e-05, + "loss": 3.1844, + "step": 7190 + }, + { + "epoch": 2.623428675532884, + "grad_norm": 0.83203125, + "learning_rate": 1.2536443148688048e-05, + "loss": 3.148, + "step": 7200 + }, + { + "epoch": 2.627072326471124, + "grad_norm": 0.90625, + "learning_rate": 1.2414965986394558e-05, + "loss": 3.0752, + "step": 7210 + }, + { + "epoch": 2.630715977409364, + "grad_norm": 0.99609375, + "learning_rate": 1.2293488824101069e-05, + "loss": 3.1968, + "step": 7220 + }, + { + "epoch": 2.634359628347604, + "grad_norm": 1.125, + "learning_rate": 1.217201166180758e-05, + "loss": 3.2027, + "step": 7230 + }, + { + "epoch": 2.6380032792858445, + "grad_norm": 0.890625, + "learning_rate": 1.2050534499514091e-05, + "loss": 3.1299, + "step": 7240 + }, + { + "epoch": 2.6416469302240846, + "grad_norm": 0.92578125, + "learning_rate": 1.1929057337220603e-05, + "loss": 3.1109, + "step": 7250 + }, + { + "epoch": 2.6452905811623246, + "grad_norm": 0.8515625, + "learning_rate": 1.1807580174927114e-05, + "loss": 3.1591, + "step": 7260 + }, + { + "epoch": 2.6489342321005647, + "grad_norm": 1.0703125, + "learning_rate": 1.1686103012633627e-05, + "loss": 3.0862, + "step": 7270 + }, + { + "epoch": 2.6525778830388047, + "grad_norm": 0.87890625, + "learning_rate": 1.1564625850340138e-05, + "loss": 3.1458, + "step": 7280 + }, + { + "epoch": 2.656221533977045, + "grad_norm": 1.1171875, + "learning_rate": 1.1443148688046649e-05, + "loss": 3.1476, + "step": 7290 + }, + { + "epoch": 2.6598651849152852, + "grad_norm": 0.99609375, + "learning_rate": 1.132167152575316e-05, + "loss": 3.136, + "step": 7300 + }, + { + "epoch": 2.6635088358535253, + "grad_norm": 0.87109375, + "learning_rate": 1.1200194363459671e-05, + "loss": 3.0971, + "step": 7310 + }, + { + "epoch": 2.6671524867917653, + "grad_norm": 0.95703125, + "learning_rate": 1.1078717201166182e-05, + "loss": 3.1187, + "step": 7320 + }, + { + "epoch": 2.6707961377300053, + "grad_norm": 0.88671875, + "learning_rate": 1.0957240038872693e-05, + "loss": 3.1171, + "step": 7330 + }, + { + "epoch": 2.674439788668246, + "grad_norm": 0.9453125, + "learning_rate": 1.0835762876579203e-05, + "loss": 3.1523, + "step": 7340 + }, + { + "epoch": 2.6780834396064854, + "grad_norm": 0.94921875, + "learning_rate": 1.0714285714285714e-05, + "loss": 3.1301, + "step": 7350 + }, + { + "epoch": 2.681727090544726, + "grad_norm": 1.0390625, + "learning_rate": 1.0592808551992225e-05, + "loss": 3.1293, + "step": 7360 + }, + { + "epoch": 2.685370741482966, + "grad_norm": 0.96875, + "learning_rate": 1.0471331389698736e-05, + "loss": 3.1171, + "step": 7370 + }, + { + "epoch": 2.689014392421206, + "grad_norm": 1.0859375, + "learning_rate": 1.0349854227405248e-05, + "loss": 3.0375, + "step": 7380 + }, + { + "epoch": 2.692658043359446, + "grad_norm": 1.0390625, + "learning_rate": 1.0228377065111759e-05, + "loss": 3.0265, + "step": 7390 + }, + { + "epoch": 2.696301694297686, + "grad_norm": 0.94921875, + "learning_rate": 1.010689990281827e-05, + "loss": 3.097, + "step": 7400 + }, + { + "epoch": 2.6999453452359266, + "grad_norm": 0.83984375, + "learning_rate": 9.985422740524781e-06, + "loss": 3.0494, + "step": 7410 + }, + { + "epoch": 2.7035889961741666, + "grad_norm": 0.9140625, + "learning_rate": 9.863945578231292e-06, + "loss": 3.0811, + "step": 7420 + }, + { + "epoch": 2.7072326471124066, + "grad_norm": 0.96484375, + "learning_rate": 9.742468415937803e-06, + "loss": 3.1214, + "step": 7430 + }, + { + "epoch": 2.7108762980506467, + "grad_norm": 0.96875, + "learning_rate": 9.620991253644314e-06, + "loss": 3.1006, + "step": 7440 + }, + { + "epoch": 2.7145199489888867, + "grad_norm": 0.99609375, + "learning_rate": 9.499514091350827e-06, + "loss": 3.1645, + "step": 7450 + }, + { + "epoch": 2.718163599927127, + "grad_norm": 0.96484375, + "learning_rate": 9.378036929057338e-06, + "loss": 3.07, + "step": 7460 + }, + { + "epoch": 2.7218072508653672, + "grad_norm": 0.8515625, + "learning_rate": 9.25655976676385e-06, + "loss": 3.1343, + "step": 7470 + }, + { + "epoch": 2.7254509018036073, + "grad_norm": 1.015625, + "learning_rate": 9.13508260447036e-06, + "loss": 3.1049, + "step": 7480 + }, + { + "epoch": 2.7290945527418473, + "grad_norm": 0.90234375, + "learning_rate": 9.013605442176872e-06, + "loss": 3.1182, + "step": 7490 + }, + { + "epoch": 2.7327382036800874, + "grad_norm": 0.84765625, + "learning_rate": 8.892128279883383e-06, + "loss": 3.063, + "step": 7500 + }, + { + "epoch": 2.736381854618328, + "grad_norm": 0.9453125, + "learning_rate": 8.770651117589894e-06, + "loss": 3.0678, + "step": 7510 + }, + { + "epoch": 2.7400255055565674, + "grad_norm": 0.84765625, + "learning_rate": 8.649173955296405e-06, + "loss": 3.1132, + "step": 7520 + }, + { + "epoch": 2.743669156494808, + "grad_norm": 0.9765625, + "learning_rate": 8.527696793002917e-06, + "loss": 3.0649, + "step": 7530 + }, + { + "epoch": 2.747312807433048, + "grad_norm": 0.9140625, + "learning_rate": 8.406219630709428e-06, + "loss": 3.0386, + "step": 7540 + }, + { + "epoch": 2.750956458371288, + "grad_norm": 0.96484375, + "learning_rate": 8.284742468415939e-06, + "loss": 3.0972, + "step": 7550 + }, + { + "epoch": 2.754600109309528, + "grad_norm": 1.0703125, + "learning_rate": 8.163265306122448e-06, + "loss": 3.1145, + "step": 7560 + }, + { + "epoch": 2.758243760247768, + "grad_norm": 0.94140625, + "learning_rate": 8.04178814382896e-06, + "loss": 3.1053, + "step": 7570 + }, + { + "epoch": 2.7618874111860086, + "grad_norm": 0.95703125, + "learning_rate": 7.92031098153547e-06, + "loss": 3.1086, + "step": 7580 + }, + { + "epoch": 2.7655310621242486, + "grad_norm": 0.875, + "learning_rate": 7.798833819241982e-06, + "loss": 3.0831, + "step": 7590 + }, + { + "epoch": 2.7691747130624886, + "grad_norm": 1.015625, + "learning_rate": 7.677356656948493e-06, + "loss": 3.1135, + "step": 7600 + }, + { + "epoch": 2.7728183640007287, + "grad_norm": 0.921875, + "learning_rate": 7.555879494655005e-06, + "loss": 3.0605, + "step": 7610 + }, + { + "epoch": 2.7764620149389687, + "grad_norm": 0.96484375, + "learning_rate": 7.434402332361516e-06, + "loss": 2.9854, + "step": 7620 + }, + { + "epoch": 2.780105665877209, + "grad_norm": 1.078125, + "learning_rate": 7.312925170068027e-06, + "loss": 3.15, + "step": 7630 + }, + { + "epoch": 2.7837493168154492, + "grad_norm": 0.921875, + "learning_rate": 7.191448007774538e-06, + "loss": 3.1166, + "step": 7640 + }, + { + "epoch": 2.7873929677536893, + "grad_norm": 0.9375, + "learning_rate": 7.06997084548105e-06, + "loss": 3.0783, + "step": 7650 + }, + { + "epoch": 2.7910366186919293, + "grad_norm": 0.796875, + "learning_rate": 6.948493683187561e-06, + "loss": 3.0845, + "step": 7660 + }, + { + "epoch": 2.7946802696301694, + "grad_norm": 1.015625, + "learning_rate": 6.827016520894072e-06, + "loss": 3.0717, + "step": 7670 + }, + { + "epoch": 2.7983239205684094, + "grad_norm": 1.109375, + "learning_rate": 6.705539358600584e-06, + "loss": 3.0456, + "step": 7680 + }, + { + "epoch": 2.8019675715066494, + "grad_norm": 0.890625, + "learning_rate": 6.584062196307095e-06, + "loss": 3.071, + "step": 7690 + }, + { + "epoch": 2.80561122244489, + "grad_norm": 1.0390625, + "learning_rate": 6.462585034013606e-06, + "loss": 3.063, + "step": 7700 + }, + { + "epoch": 2.80925487338313, + "grad_norm": 1.0546875, + "learning_rate": 6.341107871720117e-06, + "loss": 3.1031, + "step": 7710 + }, + { + "epoch": 2.81289852432137, + "grad_norm": 0.97265625, + "learning_rate": 6.219630709426628e-06, + "loss": 3.0297, + "step": 7720 + }, + { + "epoch": 2.81654217525961, + "grad_norm": 0.92578125, + "learning_rate": 6.098153547133139e-06, + "loss": 3.119, + "step": 7730 + }, + { + "epoch": 2.82018582619785, + "grad_norm": 0.92578125, + "learning_rate": 5.97667638483965e-06, + "loss": 3.0535, + "step": 7740 + }, + { + "epoch": 2.8238294771360906, + "grad_norm": 0.81640625, + "learning_rate": 5.855199222546161e-06, + "loss": 3.1086, + "step": 7750 + }, + { + "epoch": 2.8274731280743306, + "grad_norm": 0.9609375, + "learning_rate": 5.733722060252672e-06, + "loss": 3.133, + "step": 7760 + }, + { + "epoch": 2.8311167790125706, + "grad_norm": 1.0625, + "learning_rate": 5.612244897959184e-06, + "loss": 3.1374, + "step": 7770 + }, + { + "epoch": 2.8347604299508107, + "grad_norm": 0.9453125, + "learning_rate": 5.4907677356656954e-06, + "loss": 3.1706, + "step": 7780 + }, + { + "epoch": 2.8384040808890507, + "grad_norm": 0.9453125, + "learning_rate": 5.369290573372207e-06, + "loss": 3.0924, + "step": 7790 + }, + { + "epoch": 2.842047731827291, + "grad_norm": 0.87890625, + "learning_rate": 5.247813411078718e-06, + "loss": 3.0695, + "step": 7800 + }, + { + "epoch": 2.845691382765531, + "grad_norm": 0.89453125, + "learning_rate": 5.126336248785229e-06, + "loss": 3.0492, + "step": 7810 + }, + { + "epoch": 2.8493350337037713, + "grad_norm": 0.83984375, + "learning_rate": 5.00485908649174e-06, + "loss": 3.0992, + "step": 7820 + }, + { + "epoch": 2.8529786846420113, + "grad_norm": 0.875, + "learning_rate": 4.88338192419825e-06, + "loss": 3.0975, + "step": 7830 + }, + { + "epoch": 2.8566223355802514, + "grad_norm": 0.93359375, + "learning_rate": 4.7619047619047615e-06, + "loss": 3.1571, + "step": 7840 + }, + { + "epoch": 2.8602659865184914, + "grad_norm": 0.984375, + "learning_rate": 4.640427599611273e-06, + "loss": 3.1478, + "step": 7850 + }, + { + "epoch": 2.8639096374567314, + "grad_norm": 0.94140625, + "learning_rate": 4.518950437317785e-06, + "loss": 3.117, + "step": 7860 + }, + { + "epoch": 2.867553288394972, + "grad_norm": 0.94921875, + "learning_rate": 4.397473275024296e-06, + "loss": 3.0144, + "step": 7870 + }, + { + "epoch": 2.871196939333212, + "grad_norm": 0.8828125, + "learning_rate": 4.275996112730807e-06, + "loss": 3.1565, + "step": 7880 + }, + { + "epoch": 2.874840590271452, + "grad_norm": 1.015625, + "learning_rate": 4.154518950437318e-06, + "loss": 3.2086, + "step": 7890 + }, + { + "epoch": 2.878484241209692, + "grad_norm": 0.82421875, + "learning_rate": 4.033041788143829e-06, + "loss": 3.124, + "step": 7900 + }, + { + "epoch": 2.882127892147932, + "grad_norm": 0.94921875, + "learning_rate": 3.9115646258503405e-06, + "loss": 3.046, + "step": 7910 + }, + { + "epoch": 2.8857715430861726, + "grad_norm": 0.8828125, + "learning_rate": 3.7900874635568516e-06, + "loss": 3.1214, + "step": 7920 + }, + { + "epoch": 2.8894151940244126, + "grad_norm": 0.91796875, + "learning_rate": 3.6686103012633628e-06, + "loss": 3.0823, + "step": 7930 + }, + { + "epoch": 2.8930588449626526, + "grad_norm": 0.765625, + "learning_rate": 3.5471331389698735e-06, + "loss": 3.0588, + "step": 7940 + }, + { + "epoch": 2.8967024959008927, + "grad_norm": 0.93359375, + "learning_rate": 3.4256559766763847e-06, + "loss": 3.1368, + "step": 7950 + }, + { + "epoch": 2.9003461468391327, + "grad_norm": 0.87890625, + "learning_rate": 3.304178814382896e-06, + "loss": 3.0578, + "step": 7960 + }, + { + "epoch": 2.903989797777373, + "grad_norm": 0.890625, + "learning_rate": 3.1827016520894074e-06, + "loss": 3.1724, + "step": 7970 + }, + { + "epoch": 2.907633448715613, + "grad_norm": 0.85546875, + "learning_rate": 3.0612244897959185e-06, + "loss": 3.1477, + "step": 7980 + }, + { + "epoch": 2.9112770996538533, + "grad_norm": 0.96484375, + "learning_rate": 2.9397473275024297e-06, + "loss": 3.1196, + "step": 7990 + }, + { + "epoch": 2.9149207505920933, + "grad_norm": 0.9765625, + "learning_rate": 2.818270165208941e-06, + "loss": 3.1087, + "step": 8000 + } + ], + "logging_steps": 10, + "max_steps": 8232, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7504106874736026e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}