{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 1.7063226699829102, "learning_rate": 4.995e-05, "loss": 9.6305, "step": 10 }, { "epoch": 0.002, "grad_norm": 1.467505693435669, "learning_rate": 4.99e-05, "loss": 8.8474, "step": 20 }, { "epoch": 0.003, "grad_norm": 1.3338744640350342, "learning_rate": 4.9850000000000006e-05, "loss": 8.4272, "step": 30 }, { "epoch": 0.004, "grad_norm": 1.194218635559082, "learning_rate": 4.9800000000000004e-05, "loss": 7.9969, "step": 40 }, { "epoch": 0.005, "grad_norm": 0.9542586207389832, "learning_rate": 4.975e-05, "loss": 7.8018, "step": 50 }, { "epoch": 0.006, "grad_norm": 0.8312947154045105, "learning_rate": 4.97e-05, "loss": 7.5303, "step": 60 }, { "epoch": 0.007, "grad_norm": 0.6978892683982849, "learning_rate": 4.965e-05, "loss": 7.3733, "step": 70 }, { "epoch": 0.008, "grad_norm": 0.6895764470100403, "learning_rate": 4.96e-05, "loss": 7.2434, "step": 80 }, { "epoch": 0.009, "grad_norm": 0.5555976033210754, "learning_rate": 4.9550000000000005e-05, "loss": 7.0877, "step": 90 }, { "epoch": 0.01, "grad_norm": 0.836391806602478, "learning_rate": 4.9500000000000004e-05, "loss": 7.0338, "step": 100 }, { "epoch": 0.011, "grad_norm": 0.782464861869812, "learning_rate": 4.945e-05, "loss": 6.878, "step": 110 }, { "epoch": 0.012, "grad_norm": 1.3705933094024658, "learning_rate": 4.94e-05, "loss": 6.5874, "step": 120 }, { "epoch": 0.013, "grad_norm": 0.7560775876045227, "learning_rate": 4.935e-05, "loss": 6.4978, "step": 130 }, { "epoch": 0.014, "grad_norm": 1.3238508701324463, "learning_rate": 4.93e-05, "loss": 6.3998, "step": 140 }, { "epoch": 0.015, "grad_norm": 0.7834548950195312, "learning_rate": 4.9250000000000004e-05, "loss": 6.2838, "step": 150 }, { "epoch": 0.016, "grad_norm": 0.762347400188446, "learning_rate": 4.92e-05, "loss": 6.0387, "step": 160 }, { "epoch": 0.017, "grad_norm": 0.7799501419067383, "learning_rate": 4.915e-05, "loss": 6.0241, "step": 170 }, { "epoch": 0.018, "grad_norm": 0.7948866486549377, "learning_rate": 4.91e-05, "loss": 5.8776, "step": 180 }, { "epoch": 0.019, "grad_norm": 0.9890483021736145, "learning_rate": 4.905e-05, "loss": 5.747, "step": 190 }, { "epoch": 0.02, "grad_norm": 0.9131263494491577, "learning_rate": 4.9e-05, "loss": 5.644, "step": 200 }, { "epoch": 0.021, "grad_norm": 1.7073436975479126, "learning_rate": 4.8950000000000004e-05, "loss": 5.778, "step": 210 }, { "epoch": 0.022, "grad_norm": 0.8059922456741333, "learning_rate": 4.89e-05, "loss": 5.4755, "step": 220 }, { "epoch": 0.023, "grad_norm": 1.2500686645507812, "learning_rate": 4.885e-05, "loss": 5.3769, "step": 230 }, { "epoch": 0.024, "grad_norm": 1.3848680257797241, "learning_rate": 4.88e-05, "loss": 5.2105, "step": 240 }, { "epoch": 0.025, "grad_norm": 1.2381746768951416, "learning_rate": 4.875e-05, "loss": 5.1444, "step": 250 }, { "epoch": 0.026, "grad_norm": 2.7005224227905273, "learning_rate": 4.87e-05, "loss": 5.1608, "step": 260 }, { "epoch": 0.027, "grad_norm": 1.1472671031951904, "learning_rate": 4.8650000000000003e-05, "loss": 4.9456, "step": 270 }, { "epoch": 0.028, "grad_norm": 1.9849270582199097, "learning_rate": 4.86e-05, "loss": 4.8466, "step": 280 }, { "epoch": 0.029, "grad_norm": 1.857001781463623, "learning_rate": 4.855e-05, "loss": 4.7323, "step": 290 }, { "epoch": 0.03, "grad_norm": 1.6731220483779907, "learning_rate": 4.85e-05, "loss": 4.5786, "step": 300 }, { "epoch": 0.031, "grad_norm": 1.7968906164169312, "learning_rate": 4.845e-05, "loss": 4.4588, "step": 310 }, { "epoch": 0.032, "grad_norm": 1.7908226251602173, "learning_rate": 4.8400000000000004e-05, "loss": 4.3645, "step": 320 }, { "epoch": 0.033, "grad_norm": 2.538881540298462, "learning_rate": 4.835e-05, "loss": 4.1489, "step": 330 }, { "epoch": 0.034, "grad_norm": 2.306257486343384, "learning_rate": 4.83e-05, "loss": 3.9798, "step": 340 }, { "epoch": 0.035, "grad_norm": 2.1730940341949463, "learning_rate": 4.825e-05, "loss": 4.0231, "step": 350 }, { "epoch": 0.036, "grad_norm": 2.4211463928222656, "learning_rate": 4.82e-05, "loss": 3.8495, "step": 360 }, { "epoch": 0.037, "grad_norm": 2.3698794841766357, "learning_rate": 4.815e-05, "loss": 3.6977, "step": 370 }, { "epoch": 0.038, "grad_norm": 2.147799491882324, "learning_rate": 4.8100000000000004e-05, "loss": 3.8008, "step": 380 }, { "epoch": 0.039, "grad_norm": 2.3577606678009033, "learning_rate": 4.805e-05, "loss": 3.6983, "step": 390 }, { "epoch": 0.04, "grad_norm": 2.065912961959839, "learning_rate": 4.8e-05, "loss": 3.5738, "step": 400 }, { "epoch": 0.041, "grad_norm": 2.930288314819336, "learning_rate": 4.795e-05, "loss": 3.5117, "step": 410 }, { "epoch": 0.042, "grad_norm": 2.3703155517578125, "learning_rate": 4.79e-05, "loss": 3.2483, "step": 420 }, { "epoch": 0.043, "grad_norm": 2.6050736904144287, "learning_rate": 4.785e-05, "loss": 3.2342, "step": 430 }, { "epoch": 0.044, "grad_norm": 2.0790674686431885, "learning_rate": 4.78e-05, "loss": 3.1452, "step": 440 }, { "epoch": 0.045, "grad_norm": 2.2497427463531494, "learning_rate": 4.775e-05, "loss": 3.0316, "step": 450 }, { "epoch": 0.046, "grad_norm": 2.507902145385742, "learning_rate": 4.77e-05, "loss": 2.8938, "step": 460 }, { "epoch": 0.047, "grad_norm": 2.517744541168213, "learning_rate": 4.765e-05, "loss": 2.8137, "step": 470 }, { "epoch": 0.048, "grad_norm": 3.9981460571289062, "learning_rate": 4.76e-05, "loss": 2.9864, "step": 480 }, { "epoch": 0.049, "grad_norm": 2.265026569366455, "learning_rate": 4.755e-05, "loss": 2.7839, "step": 490 }, { "epoch": 0.05, "grad_norm": 2.257293701171875, "learning_rate": 4.75e-05, "loss": 2.6834, "step": 500 }, { "epoch": 0.051, "grad_norm": 2.6932270526885986, "learning_rate": 4.745e-05, "loss": 2.5755, "step": 510 }, { "epoch": 0.052, "grad_norm": 1.7177081108093262, "learning_rate": 4.74e-05, "loss": 2.425, "step": 520 }, { "epoch": 0.053, "grad_norm": 2.2452073097229004, "learning_rate": 4.735e-05, "loss": 2.5261, "step": 530 }, { "epoch": 0.054, "grad_norm": 2.2109947204589844, "learning_rate": 4.73e-05, "loss": 2.3825, "step": 540 }, { "epoch": 0.055, "grad_norm": 2.574531078338623, "learning_rate": 4.7249999999999997e-05, "loss": 2.3087, "step": 550 }, { "epoch": 0.056, "grad_norm": 2.3631017208099365, "learning_rate": 4.72e-05, "loss": 2.3099, "step": 560 }, { "epoch": 0.057, "grad_norm": 2.3809709548950195, "learning_rate": 4.715e-05, "loss": 2.3001, "step": 570 }, { "epoch": 0.058, "grad_norm": 2.0683534145355225, "learning_rate": 4.71e-05, "loss": 2.0813, "step": 580 }, { "epoch": 0.059, "grad_norm": 2.5471837520599365, "learning_rate": 4.705e-05, "loss": 2.0378, "step": 590 }, { "epoch": 0.06, "grad_norm": 2.585564374923706, "learning_rate": 4.7e-05, "loss": 2.2062, "step": 600 }, { "epoch": 0.061, "grad_norm": 2.062100648880005, "learning_rate": 4.695e-05, "loss": 1.9914, "step": 610 }, { "epoch": 0.062, "grad_norm": 2.1019210815429688, "learning_rate": 4.69e-05, "loss": 1.9635, "step": 620 }, { "epoch": 0.063, "grad_norm": 2.630436658859253, "learning_rate": 4.685000000000001e-05, "loss": 1.9123, "step": 630 }, { "epoch": 0.064, "grad_norm": 2.1028494834899902, "learning_rate": 4.6800000000000006e-05, "loss": 1.7583, "step": 640 }, { "epoch": 0.065, "grad_norm": 2.392193078994751, "learning_rate": 4.6750000000000005e-05, "loss": 1.7532, "step": 650 }, { "epoch": 0.066, "grad_norm": 2.004413366317749, "learning_rate": 4.6700000000000003e-05, "loss": 1.6978, "step": 660 }, { "epoch": 0.067, "grad_norm": 2.210513114929199, "learning_rate": 4.665e-05, "loss": 1.6311, "step": 670 }, { "epoch": 0.068, "grad_norm": 1.8464936017990112, "learning_rate": 4.660000000000001e-05, "loss": 1.5507, "step": 680 }, { "epoch": 0.069, "grad_norm": 2.0246541500091553, "learning_rate": 4.655000000000001e-05, "loss": 1.5637, "step": 690 }, { "epoch": 0.07, "grad_norm": 2.199751138687134, "learning_rate": 4.6500000000000005e-05, "loss": 1.5603, "step": 700 }, { "epoch": 0.071, "grad_norm": 2.2002196311950684, "learning_rate": 4.6450000000000004e-05, "loss": 1.4558, "step": 710 }, { "epoch": 0.072, "grad_norm": 1.7826759815216064, "learning_rate": 4.64e-05, "loss": 1.4309, "step": 720 }, { "epoch": 0.073, "grad_norm": 1.760297417640686, "learning_rate": 4.635e-05, "loss": 1.3531, "step": 730 }, { "epoch": 0.074, "grad_norm": 2.0505475997924805, "learning_rate": 4.630000000000001e-05, "loss": 1.3641, "step": 740 }, { "epoch": 0.075, "grad_norm": 2.1375396251678467, "learning_rate": 4.6250000000000006e-05, "loss": 1.3259, "step": 750 }, { "epoch": 0.076, "grad_norm": 1.8252328634262085, "learning_rate": 4.6200000000000005e-05, "loss": 1.2026, "step": 760 }, { "epoch": 0.077, "grad_norm": 1.8945906162261963, "learning_rate": 4.6150000000000004e-05, "loss": 1.2878, "step": 770 }, { "epoch": 0.078, "grad_norm": 1.7990881204605103, "learning_rate": 4.61e-05, "loss": 1.1853, "step": 780 }, { "epoch": 0.079, "grad_norm": 1.4897470474243164, "learning_rate": 4.605e-05, "loss": 1.1279, "step": 790 }, { "epoch": 0.08, "grad_norm": 2.2804617881774902, "learning_rate": 4.600000000000001e-05, "loss": 1.0804, "step": 800 }, { "epoch": 0.081, "grad_norm": 1.4800664186477661, "learning_rate": 4.5950000000000006e-05, "loss": 1.0361, "step": 810 }, { "epoch": 0.082, "grad_norm": 1.3526049852371216, "learning_rate": 4.5900000000000004e-05, "loss": 1.0585, "step": 820 }, { "epoch": 0.083, "grad_norm": 1.534173607826233, "learning_rate": 4.585e-05, "loss": 1.0206, "step": 830 }, { "epoch": 0.084, "grad_norm": 1.4844435453414917, "learning_rate": 4.58e-05, "loss": 0.9758, "step": 840 }, { "epoch": 0.085, "grad_norm": 1.533679485321045, "learning_rate": 4.575e-05, "loss": 0.9168, "step": 850 }, { "epoch": 0.086, "grad_norm": 1.456162691116333, "learning_rate": 4.5700000000000006e-05, "loss": 0.8913, "step": 860 }, { "epoch": 0.087, "grad_norm": 1.7335631847381592, "learning_rate": 4.5650000000000005e-05, "loss": 0.9154, "step": 870 }, { "epoch": 0.088, "grad_norm": 1.3331761360168457, "learning_rate": 4.5600000000000004e-05, "loss": 0.8483, "step": 880 }, { "epoch": 0.089, "grad_norm": 1.6703053712844849, "learning_rate": 4.555e-05, "loss": 0.8116, "step": 890 }, { "epoch": 0.09, "grad_norm": 1.275975227355957, "learning_rate": 4.55e-05, "loss": 0.7869, "step": 900 }, { "epoch": 0.091, "grad_norm": 1.3800309896469116, "learning_rate": 4.545000000000001e-05, "loss": 0.7637, "step": 910 }, { "epoch": 0.092, "grad_norm": 1.9472386837005615, "learning_rate": 4.5400000000000006e-05, "loss": 0.7212, "step": 920 }, { "epoch": 0.093, "grad_norm": 1.3451333045959473, "learning_rate": 4.5350000000000005e-05, "loss": 0.6829, "step": 930 }, { "epoch": 0.094, "grad_norm": 1.5209784507751465, "learning_rate": 4.53e-05, "loss": 0.729, "step": 940 }, { "epoch": 0.095, "grad_norm": 1.3944469690322876, "learning_rate": 4.525e-05, "loss": 0.6732, "step": 950 }, { "epoch": 0.096, "grad_norm": 1.2177132368087769, "learning_rate": 4.52e-05, "loss": 0.6188, "step": 960 }, { "epoch": 0.097, "grad_norm": 1.5988528728485107, "learning_rate": 4.5150000000000006e-05, "loss": 0.6622, "step": 970 }, { "epoch": 0.098, "grad_norm": 1.3636531829833984, "learning_rate": 4.5100000000000005e-05, "loss": 0.5792, "step": 980 }, { "epoch": 0.099, "grad_norm": 1.377453088760376, "learning_rate": 4.5050000000000004e-05, "loss": 0.6062, "step": 990 }, { "epoch": 0.1, "grad_norm": 2.295713186264038, "learning_rate": 4.5e-05, "loss": 0.5709, "step": 1000 }, { "epoch": 0.101, "grad_norm": 1.35196852684021, "learning_rate": 4.495e-05, "loss": 0.5521, "step": 1010 }, { "epoch": 0.102, "grad_norm": 1.0617187023162842, "learning_rate": 4.49e-05, "loss": 0.5147, "step": 1020 }, { "epoch": 0.103, "grad_norm": 1.3035167455673218, "learning_rate": 4.4850000000000006e-05, "loss": 0.5081, "step": 1030 }, { "epoch": 0.104, "grad_norm": 1.2835568189620972, "learning_rate": 4.4800000000000005e-05, "loss": 0.5, "step": 1040 }, { "epoch": 0.105, "grad_norm": 1.0403038263320923, "learning_rate": 4.4750000000000004e-05, "loss": 0.4825, "step": 1050 }, { "epoch": 0.106, "grad_norm": 0.9538235068321228, "learning_rate": 4.47e-05, "loss": 0.4316, "step": 1060 }, { "epoch": 0.107, "grad_norm": 1.4246289730072021, "learning_rate": 4.465e-05, "loss": 0.4304, "step": 1070 }, { "epoch": 0.108, "grad_norm": 1.1217833757400513, "learning_rate": 4.46e-05, "loss": 0.4397, "step": 1080 }, { "epoch": 0.109, "grad_norm": 1.0411335229873657, "learning_rate": 4.4550000000000005e-05, "loss": 0.4057, "step": 1090 }, { "epoch": 0.11, "grad_norm": 0.8498069643974304, "learning_rate": 4.4500000000000004e-05, "loss": 0.3933, "step": 1100 }, { "epoch": 0.111, "grad_norm": 1.1270406246185303, "learning_rate": 4.445e-05, "loss": 0.366, "step": 1110 }, { "epoch": 0.112, "grad_norm": 1.189041256904602, "learning_rate": 4.44e-05, "loss": 0.3407, "step": 1120 }, { "epoch": 0.113, "grad_norm": 0.9837467670440674, "learning_rate": 4.435e-05, "loss": 0.3511, "step": 1130 }, { "epoch": 0.114, "grad_norm": 1.0432955026626587, "learning_rate": 4.43e-05, "loss": 0.3381, "step": 1140 }, { "epoch": 0.115, "grad_norm": 0.9529951810836792, "learning_rate": 4.4250000000000005e-05, "loss": 0.3189, "step": 1150 }, { "epoch": 0.116, "grad_norm": 1.008836030960083, "learning_rate": 4.4200000000000004e-05, "loss": 0.3077, "step": 1160 }, { "epoch": 0.117, "grad_norm": 1.0005086660385132, "learning_rate": 4.415e-05, "loss": 0.3001, "step": 1170 }, { "epoch": 0.118, "grad_norm": 1.1065175533294678, "learning_rate": 4.41e-05, "loss": 0.28, "step": 1180 }, { "epoch": 0.119, "grad_norm": 0.6701949834823608, "learning_rate": 4.405e-05, "loss": 0.2692, "step": 1190 }, { "epoch": 0.12, "grad_norm": 0.7154658436775208, "learning_rate": 4.4000000000000006e-05, "loss": 0.2663, "step": 1200 }, { "epoch": 0.121, "grad_norm": 0.6997113823890686, "learning_rate": 4.3950000000000004e-05, "loss": 0.2595, "step": 1210 }, { "epoch": 0.122, "grad_norm": 0.9047608971595764, "learning_rate": 4.39e-05, "loss": 0.2558, "step": 1220 }, { "epoch": 0.123, "grad_norm": 0.8508415222167969, "learning_rate": 4.385e-05, "loss": 0.2459, "step": 1230 }, { "epoch": 0.124, "grad_norm": 0.6505220532417297, "learning_rate": 4.38e-05, "loss": 0.2236, "step": 1240 }, { "epoch": 0.125, "grad_norm": 0.5360460877418518, "learning_rate": 4.375e-05, "loss": 0.2189, "step": 1250 }, { "epoch": 0.126, "grad_norm": 0.560817539691925, "learning_rate": 4.3700000000000005e-05, "loss": 0.2166, "step": 1260 }, { "epoch": 0.127, "grad_norm": 0.7089666128158569, "learning_rate": 4.3650000000000004e-05, "loss": 0.2026, "step": 1270 }, { "epoch": 0.128, "grad_norm": 0.5265817046165466, "learning_rate": 4.36e-05, "loss": 0.197, "step": 1280 }, { "epoch": 0.129, "grad_norm": 0.6629377007484436, "learning_rate": 4.355e-05, "loss": 0.1934, "step": 1290 }, { "epoch": 0.13, "grad_norm": 1.0730735063552856, "learning_rate": 4.35e-05, "loss": 0.1807, "step": 1300 }, { "epoch": 0.131, "grad_norm": 0.6990699172019958, "learning_rate": 4.345e-05, "loss": 0.1845, "step": 1310 }, { "epoch": 0.132, "grad_norm": 0.5047340393066406, "learning_rate": 4.3400000000000005e-05, "loss": 0.1725, "step": 1320 }, { "epoch": 0.133, "grad_norm": 0.6830994486808777, "learning_rate": 4.335e-05, "loss": 0.1687, "step": 1330 }, { "epoch": 0.134, "grad_norm": 0.5861710906028748, "learning_rate": 4.33e-05, "loss": 0.1671, "step": 1340 }, { "epoch": 0.135, "grad_norm": 0.43594300746917725, "learning_rate": 4.325e-05, "loss": 0.1467, "step": 1350 }, { "epoch": 0.136, "grad_norm": 0.44587692618370056, "learning_rate": 4.32e-05, "loss": 0.1509, "step": 1360 }, { "epoch": 0.137, "grad_norm": 0.5523977875709534, "learning_rate": 4.315e-05, "loss": 0.1434, "step": 1370 }, { "epoch": 0.138, "grad_norm": 0.6139170527458191, "learning_rate": 4.3100000000000004e-05, "loss": 0.1433, "step": 1380 }, { "epoch": 0.139, "grad_norm": 0.6169497966766357, "learning_rate": 4.305e-05, "loss": 0.1365, "step": 1390 }, { "epoch": 0.14, "grad_norm": 0.49120134115219116, "learning_rate": 4.3e-05, "loss": 0.1287, "step": 1400 }, { "epoch": 0.141, "grad_norm": 0.451753169298172, "learning_rate": 4.295e-05, "loss": 0.1142, "step": 1410 }, { "epoch": 0.142, "grad_norm": 0.5429627895355225, "learning_rate": 4.29e-05, "loss": 0.134, "step": 1420 }, { "epoch": 0.143, "grad_norm": 0.7613041400909424, "learning_rate": 4.285e-05, "loss": 0.1391, "step": 1430 }, { "epoch": 0.144, "grad_norm": 0.4953358471393585, "learning_rate": 4.2800000000000004e-05, "loss": 0.1197, "step": 1440 }, { "epoch": 0.145, "grad_norm": 0.3657626509666443, "learning_rate": 4.275e-05, "loss": 0.1071, "step": 1450 }, { "epoch": 0.146, "grad_norm": 0.44240206480026245, "learning_rate": 4.27e-05, "loss": 0.1111, "step": 1460 }, { "epoch": 0.147, "grad_norm": 0.5007165670394897, "learning_rate": 4.265e-05, "loss": 0.1056, "step": 1470 }, { "epoch": 0.148, "grad_norm": 0.4580256938934326, "learning_rate": 4.26e-05, "loss": 0.1049, "step": 1480 }, { "epoch": 0.149, "grad_norm": 0.4970822036266327, "learning_rate": 4.2550000000000004e-05, "loss": 0.1032, "step": 1490 }, { "epoch": 0.15, "grad_norm": 0.4138182997703552, "learning_rate": 4.25e-05, "loss": 0.0961, "step": 1500 }, { "epoch": 0.151, "grad_norm": 0.4013712406158447, "learning_rate": 4.245e-05, "loss": 0.0949, "step": 1510 }, { "epoch": 0.152, "grad_norm": 0.3868940770626068, "learning_rate": 4.24e-05, "loss": 0.0837, "step": 1520 }, { "epoch": 0.153, "grad_norm": 0.3113015294075012, "learning_rate": 4.235e-05, "loss": 0.0909, "step": 1530 }, { "epoch": 0.154, "grad_norm": 0.3569623529911041, "learning_rate": 4.23e-05, "loss": 0.0908, "step": 1540 }, { "epoch": 0.155, "grad_norm": 0.3841746151447296, "learning_rate": 4.2250000000000004e-05, "loss": 0.0806, "step": 1550 }, { "epoch": 0.156, "grad_norm": 0.6565550565719604, "learning_rate": 4.22e-05, "loss": 0.075, "step": 1560 }, { "epoch": 0.157, "grad_norm": 0.4816874563694, "learning_rate": 4.215e-05, "loss": 0.0858, "step": 1570 }, { "epoch": 0.158, "grad_norm": 0.30408933758735657, "learning_rate": 4.21e-05, "loss": 0.0704, "step": 1580 }, { "epoch": 0.159, "grad_norm": 0.43388792872428894, "learning_rate": 4.205e-05, "loss": 0.0671, "step": 1590 }, { "epoch": 0.16, "grad_norm": 0.33304253220558167, "learning_rate": 4.2e-05, "loss": 0.07, "step": 1600 }, { "epoch": 0.161, "grad_norm": 0.4260387420654297, "learning_rate": 4.195e-05, "loss": 0.0691, "step": 1610 }, { "epoch": 0.162, "grad_norm": 0.37930798530578613, "learning_rate": 4.19e-05, "loss": 0.0715, "step": 1620 }, { "epoch": 0.163, "grad_norm": 0.3198983669281006, "learning_rate": 4.185e-05, "loss": 0.0651, "step": 1630 }, { "epoch": 0.164, "grad_norm": 0.3510359823703766, "learning_rate": 4.18e-05, "loss": 0.058, "step": 1640 }, { "epoch": 0.165, "grad_norm": 0.41047966480255127, "learning_rate": 4.175e-05, "loss": 0.065, "step": 1650 }, { "epoch": 0.166, "grad_norm": 0.3054174482822418, "learning_rate": 4.17e-05, "loss": 0.0564, "step": 1660 }, { "epoch": 0.167, "grad_norm": 0.29319772124290466, "learning_rate": 4.165e-05, "loss": 0.0599, "step": 1670 }, { "epoch": 0.168, "grad_norm": 0.257354736328125, "learning_rate": 4.16e-05, "loss": 0.0536, "step": 1680 }, { "epoch": 0.169, "grad_norm": 0.25215694308280945, "learning_rate": 4.155e-05, "loss": 0.0587, "step": 1690 }, { "epoch": 0.17, "grad_norm": 0.4573931097984314, "learning_rate": 4.15e-05, "loss": 0.0524, "step": 1700 }, { "epoch": 0.171, "grad_norm": 0.3514876663684845, "learning_rate": 4.145e-05, "loss": 0.0551, "step": 1710 }, { "epoch": 0.172, "grad_norm": 0.3239930272102356, "learning_rate": 4.14e-05, "loss": 0.0499, "step": 1720 }, { "epoch": 0.173, "grad_norm": 0.20213039219379425, "learning_rate": 4.135e-05, "loss": 0.0521, "step": 1730 }, { "epoch": 0.174, "grad_norm": 0.21831783652305603, "learning_rate": 4.13e-05, "loss": 0.0469, "step": 1740 }, { "epoch": 0.175, "grad_norm": 0.2585163712501526, "learning_rate": 4.125e-05, "loss": 0.0469, "step": 1750 }, { "epoch": 0.176, "grad_norm": 0.21717113256454468, "learning_rate": 4.12e-05, "loss": 0.0455, "step": 1760 }, { "epoch": 0.177, "grad_norm": 0.27248838543891907, "learning_rate": 4.115e-05, "loss": 0.046, "step": 1770 }, { "epoch": 0.178, "grad_norm": 0.2503461241722107, "learning_rate": 4.11e-05, "loss": 0.0447, "step": 1780 }, { "epoch": 0.179, "grad_norm": 0.27404382824897766, "learning_rate": 4.105e-05, "loss": 0.0437, "step": 1790 }, { "epoch": 0.18, "grad_norm": 0.23549066483974457, "learning_rate": 4.1e-05, "loss": 0.0423, "step": 1800 }, { "epoch": 0.181, "grad_norm": 0.19369937479496002, "learning_rate": 4.095e-05, "loss": 0.0408, "step": 1810 }, { "epoch": 0.182, "grad_norm": 0.20560242235660553, "learning_rate": 4.09e-05, "loss": 0.0379, "step": 1820 }, { "epoch": 0.183, "grad_norm": 0.34989863634109497, "learning_rate": 4.085e-05, "loss": 0.0364, "step": 1830 }, { "epoch": 0.184, "grad_norm": 0.2310326248407364, "learning_rate": 4.08e-05, "loss": 0.0385, "step": 1840 }, { "epoch": 0.185, "grad_norm": 0.21055462956428528, "learning_rate": 4.075e-05, "loss": 0.0351, "step": 1850 }, { "epoch": 0.186, "grad_norm": 0.3251895308494568, "learning_rate": 4.07e-05, "loss": 0.0381, "step": 1860 }, { "epoch": 0.187, "grad_norm": 0.2887445390224457, "learning_rate": 4.065e-05, "loss": 0.0341, "step": 1870 }, { "epoch": 0.188, "grad_norm": 0.15948843955993652, "learning_rate": 4.0600000000000004e-05, "loss": 0.0313, "step": 1880 }, { "epoch": 0.189, "grad_norm": 0.2413359135389328, "learning_rate": 4.055e-05, "loss": 0.0338, "step": 1890 }, { "epoch": 0.19, "grad_norm": 0.2132706493139267, "learning_rate": 4.05e-05, "loss": 0.0339, "step": 1900 }, { "epoch": 0.191, "grad_norm": 0.17968431115150452, "learning_rate": 4.045000000000001e-05, "loss": 0.0317, "step": 1910 }, { "epoch": 0.192, "grad_norm": 0.15828929841518402, "learning_rate": 4.0400000000000006e-05, "loss": 0.0302, "step": 1920 }, { "epoch": 0.193, "grad_norm": 0.18106874823570251, "learning_rate": 4.0350000000000005e-05, "loss": 0.0331, "step": 1930 }, { "epoch": 0.194, "grad_norm": 0.34827324748039246, "learning_rate": 4.0300000000000004e-05, "loss": 0.032, "step": 1940 }, { "epoch": 0.195, "grad_norm": 0.21621111035346985, "learning_rate": 4.025e-05, "loss": 0.0317, "step": 1950 }, { "epoch": 0.196, "grad_norm": 0.2159423679113388, "learning_rate": 4.02e-05, "loss": 0.0296, "step": 1960 }, { "epoch": 0.197, "grad_norm": 0.17750391364097595, "learning_rate": 4.015000000000001e-05, "loss": 0.0297, "step": 1970 }, { "epoch": 0.198, "grad_norm": 0.13952311873435974, "learning_rate": 4.0100000000000006e-05, "loss": 0.0279, "step": 1980 }, { "epoch": 0.199, "grad_norm": 0.19622887670993805, "learning_rate": 4.0050000000000004e-05, "loss": 0.0278, "step": 1990 }, { "epoch": 0.2, "grad_norm": 0.14959514141082764, "learning_rate": 4e-05, "loss": 0.0251, "step": 2000 }, { "epoch": 0.201, "grad_norm": 0.17456738650798798, "learning_rate": 3.995e-05, "loss": 0.0315, "step": 2010 }, { "epoch": 0.202, "grad_norm": 0.15893588960170746, "learning_rate": 3.99e-05, "loss": 0.0243, "step": 2020 }, { "epoch": 0.203, "grad_norm": 0.14638105034828186, "learning_rate": 3.9850000000000006e-05, "loss": 0.0247, "step": 2030 }, { "epoch": 0.204, "grad_norm": 0.1714017242193222, "learning_rate": 3.9800000000000005e-05, "loss": 0.0252, "step": 2040 }, { "epoch": 0.205, "grad_norm": 0.18679572641849518, "learning_rate": 3.9750000000000004e-05, "loss": 0.0234, "step": 2050 }, { "epoch": 0.206, "grad_norm": 0.10623681545257568, "learning_rate": 3.97e-05, "loss": 0.0256, "step": 2060 }, { "epoch": 0.207, "grad_norm": 0.18566076457500458, "learning_rate": 3.965e-05, "loss": 0.0238, "step": 2070 }, { "epoch": 0.208, "grad_norm": 0.12487553805112839, "learning_rate": 3.960000000000001e-05, "loss": 0.0226, "step": 2080 }, { "epoch": 0.209, "grad_norm": 0.13191473484039307, "learning_rate": 3.9550000000000006e-05, "loss": 0.0232, "step": 2090 }, { "epoch": 0.21, "grad_norm": 0.225613072514534, "learning_rate": 3.9500000000000005e-05, "loss": 0.0226, "step": 2100 }, { "epoch": 0.211, "grad_norm": 0.10896781831979752, "learning_rate": 3.9450000000000003e-05, "loss": 0.0206, "step": 2110 }, { "epoch": 0.212, "grad_norm": 0.16153796017169952, "learning_rate": 3.94e-05, "loss": 0.0195, "step": 2120 }, { "epoch": 0.213, "grad_norm": 0.19171251356601715, "learning_rate": 3.935e-05, "loss": 0.0203, "step": 2130 }, { "epoch": 0.214, "grad_norm": 0.13199982047080994, "learning_rate": 3.9300000000000007e-05, "loss": 0.0194, "step": 2140 }, { "epoch": 0.215, "grad_norm": 0.12839478254318237, "learning_rate": 3.9250000000000005e-05, "loss": 0.0227, "step": 2150 }, { "epoch": 0.216, "grad_norm": 0.10787441581487656, "learning_rate": 3.9200000000000004e-05, "loss": 0.0195, "step": 2160 }, { "epoch": 0.217, "grad_norm": 0.1551046371459961, "learning_rate": 3.915e-05, "loss": 0.019, "step": 2170 }, { "epoch": 0.218, "grad_norm": 0.18844197690486908, "learning_rate": 3.91e-05, "loss": 0.02, "step": 2180 }, { "epoch": 0.219, "grad_norm": 0.21247665584087372, "learning_rate": 3.905e-05, "loss": 0.0206, "step": 2190 }, { "epoch": 0.22, "grad_norm": 0.11881183087825775, "learning_rate": 3.9000000000000006e-05, "loss": 0.0176, "step": 2200 }, { "epoch": 0.221, "grad_norm": 0.16291823983192444, "learning_rate": 3.8950000000000005e-05, "loss": 0.0178, "step": 2210 }, { "epoch": 0.222, "grad_norm": 0.14063787460327148, "learning_rate": 3.8900000000000004e-05, "loss": 0.0177, "step": 2220 }, { "epoch": 0.223, "grad_norm": 0.15583930909633636, "learning_rate": 3.885e-05, "loss": 0.0185, "step": 2230 }, { "epoch": 0.224, "grad_norm": 0.16128882765769958, "learning_rate": 3.88e-05, "loss": 0.0168, "step": 2240 }, { "epoch": 0.225, "grad_norm": 0.1588473916053772, "learning_rate": 3.875e-05, "loss": 0.0167, "step": 2250 }, { "epoch": 0.226, "grad_norm": 0.10487533360719681, "learning_rate": 3.8700000000000006e-05, "loss": 0.0154, "step": 2260 }, { "epoch": 0.227, "grad_norm": 0.2638506591320038, "learning_rate": 3.8650000000000004e-05, "loss": 0.0179, "step": 2270 }, { "epoch": 0.228, "grad_norm": 0.12504911422729492, "learning_rate": 3.86e-05, "loss": 0.016, "step": 2280 }, { "epoch": 0.229, "grad_norm": 0.11655262857675552, "learning_rate": 3.855e-05, "loss": 0.0164, "step": 2290 }, { "epoch": 0.23, "grad_norm": 0.10052930563688278, "learning_rate": 3.85e-05, "loss": 0.0143, "step": 2300 }, { "epoch": 0.231, "grad_norm": 0.07682032138109207, "learning_rate": 3.845e-05, "loss": 0.0165, "step": 2310 }, { "epoch": 0.232, "grad_norm": 0.12146533280611038, "learning_rate": 3.8400000000000005e-05, "loss": 0.0147, "step": 2320 }, { "epoch": 0.233, "grad_norm": 0.16349685192108154, "learning_rate": 3.8350000000000004e-05, "loss": 0.0166, "step": 2330 }, { "epoch": 0.234, "grad_norm": 0.10822432488203049, "learning_rate": 3.83e-05, "loss": 0.0146, "step": 2340 }, { "epoch": 0.235, "grad_norm": 0.0805143415927887, "learning_rate": 3.825e-05, "loss": 0.0131, "step": 2350 }, { "epoch": 0.236, "grad_norm": 0.08285068720579147, "learning_rate": 3.82e-05, "loss": 0.0134, "step": 2360 }, { "epoch": 0.237, "grad_norm": 0.0886882022023201, "learning_rate": 3.8150000000000006e-05, "loss": 0.0133, "step": 2370 }, { "epoch": 0.238, "grad_norm": 0.08793161064386368, "learning_rate": 3.8100000000000005e-05, "loss": 0.0142, "step": 2380 }, { "epoch": 0.239, "grad_norm": 0.08325997740030289, "learning_rate": 3.805e-05, "loss": 0.012, "step": 2390 }, { "epoch": 0.24, "grad_norm": 0.10990972816944122, "learning_rate": 3.8e-05, "loss": 0.0134, "step": 2400 }, { "epoch": 0.241, "grad_norm": 0.06695697456598282, "learning_rate": 3.795e-05, "loss": 0.0119, "step": 2410 }, { "epoch": 0.242, "grad_norm": 0.08304648846387863, "learning_rate": 3.79e-05, "loss": 0.0128, "step": 2420 }, { "epoch": 0.243, "grad_norm": 0.09513472020626068, "learning_rate": 3.7850000000000005e-05, "loss": 0.0146, "step": 2430 }, { "epoch": 0.244, "grad_norm": 0.07892587035894394, "learning_rate": 3.7800000000000004e-05, "loss": 0.0116, "step": 2440 }, { "epoch": 0.245, "grad_norm": 0.12630197405815125, "learning_rate": 3.775e-05, "loss": 0.0132, "step": 2450 }, { "epoch": 0.246, "grad_norm": 0.08250122517347336, "learning_rate": 3.77e-05, "loss": 0.013, "step": 2460 }, { "epoch": 0.247, "grad_norm": 0.09903154522180557, "learning_rate": 3.765e-05, "loss": 0.0117, "step": 2470 }, { "epoch": 0.248, "grad_norm": 0.09059987217187881, "learning_rate": 3.76e-05, "loss": 0.0111, "step": 2480 }, { "epoch": 0.249, "grad_norm": 0.09777077287435532, "learning_rate": 3.7550000000000005e-05, "loss": 0.0142, "step": 2490 }, { "epoch": 0.25, "grad_norm": 0.1801980435848236, "learning_rate": 3.7500000000000003e-05, "loss": 0.0121, "step": 2500 }, { "epoch": 0.251, "grad_norm": 0.08936703950166702, "learning_rate": 3.745e-05, "loss": 0.0112, "step": 2510 }, { "epoch": 0.252, "grad_norm": 0.09601296484470367, "learning_rate": 3.74e-05, "loss": 0.0116, "step": 2520 }, { "epoch": 0.253, "grad_norm": 0.08924739062786102, "learning_rate": 3.735e-05, "loss": 0.0119, "step": 2530 }, { "epoch": 0.254, "grad_norm": 0.07558383047580719, "learning_rate": 3.73e-05, "loss": 0.0108, "step": 2540 }, { "epoch": 0.255, "grad_norm": 0.05701779946684837, "learning_rate": 3.7250000000000004e-05, "loss": 0.011, "step": 2550 }, { "epoch": 0.256, "grad_norm": 0.0955379456281662, "learning_rate": 3.72e-05, "loss": 0.0112, "step": 2560 }, { "epoch": 0.257, "grad_norm": 0.07837249338626862, "learning_rate": 3.715e-05, "loss": 0.0111, "step": 2570 }, { "epoch": 0.258, "grad_norm": 0.09438953548669815, "learning_rate": 3.71e-05, "loss": 0.0121, "step": 2580 }, { "epoch": 0.259, "grad_norm": 0.08802532404661179, "learning_rate": 3.705e-05, "loss": 0.0098, "step": 2590 }, { "epoch": 0.26, "grad_norm": 0.0785641148686409, "learning_rate": 3.7e-05, "loss": 0.0106, "step": 2600 }, { "epoch": 0.261, "grad_norm": 0.10036404430866241, "learning_rate": 3.6950000000000004e-05, "loss": 0.0107, "step": 2610 }, { "epoch": 0.262, "grad_norm": 0.0663432776927948, "learning_rate": 3.69e-05, "loss": 0.0098, "step": 2620 }, { "epoch": 0.263, "grad_norm": 0.06886564195156097, "learning_rate": 3.685e-05, "loss": 0.0094, "step": 2630 }, { "epoch": 0.264, "grad_norm": 0.06641737371683121, "learning_rate": 3.68e-05, "loss": 0.0112, "step": 2640 }, { "epoch": 0.265, "grad_norm": 0.1470363438129425, "learning_rate": 3.675e-05, "loss": 0.0118, "step": 2650 }, { "epoch": 0.266, "grad_norm": 0.08694775402545929, "learning_rate": 3.6700000000000004e-05, "loss": 0.0105, "step": 2660 }, { "epoch": 0.267, "grad_norm": 0.08168693631887436, "learning_rate": 3.665e-05, "loss": 0.0101, "step": 2670 }, { "epoch": 0.268, "grad_norm": 0.06114206463098526, "learning_rate": 3.66e-05, "loss": 0.0097, "step": 2680 }, { "epoch": 0.269, "grad_norm": 0.09011895209550858, "learning_rate": 3.655e-05, "loss": 0.0101, "step": 2690 }, { "epoch": 0.27, "grad_norm": 0.06499819457530975, "learning_rate": 3.65e-05, "loss": 0.0089, "step": 2700 }, { "epoch": 0.271, "grad_norm": 0.08157055824995041, "learning_rate": 3.645e-05, "loss": 0.0099, "step": 2710 }, { "epoch": 0.272, "grad_norm": 0.06255003809928894, "learning_rate": 3.6400000000000004e-05, "loss": 0.0091, "step": 2720 }, { "epoch": 0.273, "grad_norm": 0.13641099631786346, "learning_rate": 3.635e-05, "loss": 0.01, "step": 2730 }, { "epoch": 0.274, "grad_norm": 0.06449054926633835, "learning_rate": 3.63e-05, "loss": 0.0087, "step": 2740 }, { "epoch": 0.275, "grad_norm": 0.09242594987154007, "learning_rate": 3.625e-05, "loss": 0.0084, "step": 2750 }, { "epoch": 0.276, "grad_norm": 0.14216932654380798, "learning_rate": 3.62e-05, "loss": 0.01, "step": 2760 }, { "epoch": 0.277, "grad_norm": 0.11992328613996506, "learning_rate": 3.615e-05, "loss": 0.0107, "step": 2770 }, { "epoch": 0.278, "grad_norm": 0.10537979751825333, "learning_rate": 3.61e-05, "loss": 0.0101, "step": 2780 }, { "epoch": 0.279, "grad_norm": 0.06420467048883438, "learning_rate": 3.605e-05, "loss": 0.0088, "step": 2790 }, { "epoch": 0.28, "grad_norm": 0.10813489556312561, "learning_rate": 3.6e-05, "loss": 0.0093, "step": 2800 }, { "epoch": 0.281, "grad_norm": 0.05735234543681145, "learning_rate": 3.595e-05, "loss": 0.0087, "step": 2810 }, { "epoch": 0.282, "grad_norm": 0.05712522938847542, "learning_rate": 3.59e-05, "loss": 0.0082, "step": 2820 }, { "epoch": 0.283, "grad_norm": 0.07710873335599899, "learning_rate": 3.585e-05, "loss": 0.0088, "step": 2830 }, { "epoch": 0.284, "grad_norm": 0.11007268726825714, "learning_rate": 3.58e-05, "loss": 0.0075, "step": 2840 }, { "epoch": 0.285, "grad_norm": 0.07825978100299835, "learning_rate": 3.575e-05, "loss": 0.0089, "step": 2850 }, { "epoch": 0.286, "grad_norm": 0.06950812041759491, "learning_rate": 3.57e-05, "loss": 0.0077, "step": 2860 }, { "epoch": 0.287, "grad_norm": 0.052572544664144516, "learning_rate": 3.565e-05, "loss": 0.0076, "step": 2870 }, { "epoch": 0.288, "grad_norm": 0.06588669121265411, "learning_rate": 3.56e-05, "loss": 0.0073, "step": 2880 }, { "epoch": 0.289, "grad_norm": 0.052969031035900116, "learning_rate": 3.555e-05, "loss": 0.0066, "step": 2890 }, { "epoch": 0.29, "grad_norm": 0.04204658418893814, "learning_rate": 3.55e-05, "loss": 0.0064, "step": 2900 }, { "epoch": 0.291, "grad_norm": 0.04765693470835686, "learning_rate": 3.545e-05, "loss": 0.0067, "step": 2910 }, { "epoch": 0.292, "grad_norm": 0.06796044856309891, "learning_rate": 3.54e-05, "loss": 0.0069, "step": 2920 }, { "epoch": 0.293, "grad_norm": 0.12173280119895935, "learning_rate": 3.535e-05, "loss": 0.0126, "step": 2930 }, { "epoch": 0.294, "grad_norm": 0.09393921494483948, "learning_rate": 3.53e-05, "loss": 0.0096, "step": 2940 }, { "epoch": 0.295, "grad_norm": 0.08246493339538574, "learning_rate": 3.525e-05, "loss": 0.0084, "step": 2950 }, { "epoch": 0.296, "grad_norm": 0.04482726752758026, "learning_rate": 3.52e-05, "loss": 0.0073, "step": 2960 }, { "epoch": 0.297, "grad_norm": 0.1147686317563057, "learning_rate": 3.515e-05, "loss": 0.0094, "step": 2970 }, { "epoch": 0.298, "grad_norm": 0.09143181890249252, "learning_rate": 3.51e-05, "loss": 0.0075, "step": 2980 }, { "epoch": 0.299, "grad_norm": 0.05911434814333916, "learning_rate": 3.505e-05, "loss": 0.0075, "step": 2990 }, { "epoch": 0.3, "grad_norm": 0.04372965916991234, "learning_rate": 3.5e-05, "loss": 0.0072, "step": 3000 }, { "epoch": 0.301, "grad_norm": 0.05518479272723198, "learning_rate": 3.495e-05, "loss": 0.0068, "step": 3010 }, { "epoch": 0.302, "grad_norm": 0.04555105045437813, "learning_rate": 3.49e-05, "loss": 0.0064, "step": 3020 }, { "epoch": 0.303, "grad_norm": 0.03831150382757187, "learning_rate": 3.485e-05, "loss": 0.007, "step": 3030 }, { "epoch": 0.304, "grad_norm": 0.04596749320626259, "learning_rate": 3.48e-05, "loss": 0.0065, "step": 3040 }, { "epoch": 0.305, "grad_norm": 0.07694078236818314, "learning_rate": 3.475e-05, "loss": 0.0059, "step": 3050 }, { "epoch": 0.306, "grad_norm": 0.12307348102331161, "learning_rate": 3.4699999999999996e-05, "loss": 0.0099, "step": 3060 }, { "epoch": 0.307, "grad_norm": 0.059611763805150986, "learning_rate": 3.465e-05, "loss": 0.0067, "step": 3070 }, { "epoch": 0.308, "grad_norm": 0.07357806712388992, "learning_rate": 3.46e-05, "loss": 0.007, "step": 3080 }, { "epoch": 0.309, "grad_norm": 0.060446444898843765, "learning_rate": 3.455e-05, "loss": 0.0063, "step": 3090 }, { "epoch": 0.31, "grad_norm": 0.05178246274590492, "learning_rate": 3.45e-05, "loss": 0.0064, "step": 3100 }, { "epoch": 0.311, "grad_norm": 0.4560135006904602, "learning_rate": 3.445e-05, "loss": 0.0076, "step": 3110 }, { "epoch": 0.312, "grad_norm": 0.10910088568925858, "learning_rate": 3.4399999999999996e-05, "loss": 0.0082, "step": 3120 }, { "epoch": 0.313, "grad_norm": 0.05087321624159813, "learning_rate": 3.435e-05, "loss": 0.0061, "step": 3130 }, { "epoch": 0.314, "grad_norm": 0.055152568966150284, "learning_rate": 3.430000000000001e-05, "loss": 0.0065, "step": 3140 }, { "epoch": 0.315, "grad_norm": 0.48375555872917175, "learning_rate": 3.4250000000000006e-05, "loss": 0.0081, "step": 3150 }, { "epoch": 0.316, "grad_norm": 0.12223263829946518, "learning_rate": 3.4200000000000005e-05, "loss": 0.0098, "step": 3160 }, { "epoch": 0.317, "grad_norm": 0.15452681481838226, "learning_rate": 3.415e-05, "loss": 0.0087, "step": 3170 }, { "epoch": 0.318, "grad_norm": 0.06133843585848808, "learning_rate": 3.41e-05, "loss": 0.0087, "step": 3180 }, { "epoch": 0.319, "grad_norm": 0.04037950187921524, "learning_rate": 3.405e-05, "loss": 0.0062, "step": 3190 }, { "epoch": 0.32, "grad_norm": 0.05707933381199837, "learning_rate": 3.4000000000000007e-05, "loss": 0.0061, "step": 3200 }, { "epoch": 0.321, "grad_norm": 0.04124099016189575, "learning_rate": 3.3950000000000005e-05, "loss": 0.0058, "step": 3210 }, { "epoch": 0.322, "grad_norm": 0.12988638877868652, "learning_rate": 3.3900000000000004e-05, "loss": 0.0062, "step": 3220 }, { "epoch": 0.323, "grad_norm": 0.04961306229233742, "learning_rate": 3.385e-05, "loss": 0.0057, "step": 3230 }, { "epoch": 0.324, "grad_norm": 0.05354069173336029, "learning_rate": 3.38e-05, "loss": 0.0062, "step": 3240 }, { "epoch": 0.325, "grad_norm": 0.04944461211562157, "learning_rate": 3.375000000000001e-05, "loss": 0.0061, "step": 3250 }, { "epoch": 0.326, "grad_norm": 0.03429180383682251, "learning_rate": 3.3700000000000006e-05, "loss": 0.0055, "step": 3260 }, { "epoch": 0.327, "grad_norm": 0.05271946266293526, "learning_rate": 3.3650000000000005e-05, "loss": 0.0054, "step": 3270 }, { "epoch": 0.328, "grad_norm": 0.03602539002895355, "learning_rate": 3.3600000000000004e-05, "loss": 0.0049, "step": 3280 }, { "epoch": 0.329, "grad_norm": 0.03325178474187851, "learning_rate": 3.355e-05, "loss": 0.0052, "step": 3290 }, { "epoch": 0.33, "grad_norm": 0.03728079795837402, "learning_rate": 3.35e-05, "loss": 0.0057, "step": 3300 }, { "epoch": 0.331, "grad_norm": 0.053768668323755264, "learning_rate": 3.345000000000001e-05, "loss": 0.0055, "step": 3310 }, { "epoch": 0.332, "grad_norm": 0.054501548409461975, "learning_rate": 3.3400000000000005e-05, "loss": 0.0053, "step": 3320 }, { "epoch": 0.333, "grad_norm": 0.05519956722855568, "learning_rate": 3.3350000000000004e-05, "loss": 0.0053, "step": 3330 }, { "epoch": 0.334, "grad_norm": 0.05373954027891159, "learning_rate": 3.33e-05, "loss": 0.006, "step": 3340 }, { "epoch": 0.335, "grad_norm": 0.04272560030221939, "learning_rate": 3.325e-05, "loss": 0.0057, "step": 3350 }, { "epoch": 0.336, "grad_norm": 0.047061894088983536, "learning_rate": 3.32e-05, "loss": 0.0048, "step": 3360 }, { "epoch": 0.337, "grad_norm": 0.032794494181871414, "learning_rate": 3.3150000000000006e-05, "loss": 0.0046, "step": 3370 }, { "epoch": 0.338, "grad_norm": 0.027148200199007988, "learning_rate": 3.3100000000000005e-05, "loss": 0.0046, "step": 3380 }, { "epoch": 0.339, "grad_norm": 0.035516317933797836, "learning_rate": 3.3050000000000004e-05, "loss": 0.0048, "step": 3390 }, { "epoch": 0.34, "grad_norm": 0.046294230967760086, "learning_rate": 3.3e-05, "loss": 0.0063, "step": 3400 }, { "epoch": 0.341, "grad_norm": 0.07840899378061295, "learning_rate": 3.295e-05, "loss": 0.0047, "step": 3410 }, { "epoch": 0.342, "grad_norm": 0.04392802715301514, "learning_rate": 3.29e-05, "loss": 0.0048, "step": 3420 }, { "epoch": 0.343, "grad_norm": 0.04237942770123482, "learning_rate": 3.2850000000000006e-05, "loss": 0.0047, "step": 3430 }, { "epoch": 0.344, "grad_norm": 0.03379204496741295, "learning_rate": 3.2800000000000004e-05, "loss": 0.005, "step": 3440 }, { "epoch": 0.345, "grad_norm": 0.3130718469619751, "learning_rate": 3.275e-05, "loss": 0.0108, "step": 3450 }, { "epoch": 0.346, "grad_norm": 0.13712112605571747, "learning_rate": 3.27e-05, "loss": 0.0095, "step": 3460 }, { "epoch": 0.347, "grad_norm": 0.12321494519710541, "learning_rate": 3.265e-05, "loss": 0.0075, "step": 3470 }, { "epoch": 0.348, "grad_norm": 0.06602940708398819, "learning_rate": 3.26e-05, "loss": 0.0062, "step": 3480 }, { "epoch": 0.349, "grad_norm": 0.08250287175178528, "learning_rate": 3.2550000000000005e-05, "loss": 0.0053, "step": 3490 }, { "epoch": 0.35, "grad_norm": 0.04638442397117615, "learning_rate": 3.2500000000000004e-05, "loss": 0.0052, "step": 3500 }, { "epoch": 0.351, "grad_norm": 0.043373119086027145, "learning_rate": 3.245e-05, "loss": 0.0053, "step": 3510 }, { "epoch": 0.352, "grad_norm": 0.04895636439323425, "learning_rate": 3.24e-05, "loss": 0.0043, "step": 3520 }, { "epoch": 0.353, "grad_norm": 0.04256746545433998, "learning_rate": 3.235e-05, "loss": 0.0044, "step": 3530 }, { "epoch": 0.354, "grad_norm": 0.02967280149459839, "learning_rate": 3.2300000000000006e-05, "loss": 0.0046, "step": 3540 }, { "epoch": 0.355, "grad_norm": 0.02590947411954403, "learning_rate": 3.2250000000000005e-05, "loss": 0.0044, "step": 3550 }, { "epoch": 0.356, "grad_norm": 0.026240160688757896, "learning_rate": 3.2200000000000003e-05, "loss": 0.0041, "step": 3560 }, { "epoch": 0.357, "grad_norm": 0.048163361847400665, "learning_rate": 3.215e-05, "loss": 0.0048, "step": 3570 }, { "epoch": 0.358, "grad_norm": 0.04310280829668045, "learning_rate": 3.21e-05, "loss": 0.0044, "step": 3580 }, { "epoch": 0.359, "grad_norm": 0.027477843686938286, "learning_rate": 3.205e-05, "loss": 0.004, "step": 3590 }, { "epoch": 0.36, "grad_norm": 0.02773194946348667, "learning_rate": 3.2000000000000005e-05, "loss": 0.0046, "step": 3600 }, { "epoch": 0.361, "grad_norm": 0.027110638096928596, "learning_rate": 3.1950000000000004e-05, "loss": 0.004, "step": 3610 }, { "epoch": 0.362, "grad_norm": 0.04346521571278572, "learning_rate": 3.19e-05, "loss": 0.0039, "step": 3620 }, { "epoch": 0.363, "grad_norm": 0.024588119238615036, "learning_rate": 3.185e-05, "loss": 0.0041, "step": 3630 }, { "epoch": 0.364, "grad_norm": 0.03631160408258438, "learning_rate": 3.18e-05, "loss": 0.0039, "step": 3640 }, { "epoch": 0.365, "grad_norm": 0.028497323393821716, "learning_rate": 3.175e-05, "loss": 0.0045, "step": 3650 }, { "epoch": 0.366, "grad_norm": 0.06324070692062378, "learning_rate": 3.1700000000000005e-05, "loss": 0.004, "step": 3660 }, { "epoch": 0.367, "grad_norm": 0.22121182084083557, "learning_rate": 3.1650000000000004e-05, "loss": 0.0078, "step": 3670 }, { "epoch": 0.368, "grad_norm": 0.130402073264122, "learning_rate": 3.16e-05, "loss": 0.0068, "step": 3680 }, { "epoch": 0.369, "grad_norm": 0.11193361133337021, "learning_rate": 3.155e-05, "loss": 0.0052, "step": 3690 }, { "epoch": 0.37, "grad_norm": 0.043261051177978516, "learning_rate": 3.15e-05, "loss": 0.0046, "step": 3700 }, { "epoch": 0.371, "grad_norm": 0.05132100731134415, "learning_rate": 3.145e-05, "loss": 0.0041, "step": 3710 }, { "epoch": 0.372, "grad_norm": 0.03498228266835213, "learning_rate": 3.1400000000000004e-05, "loss": 0.0041, "step": 3720 }, { "epoch": 0.373, "grad_norm": 0.03594733029603958, "learning_rate": 3.135e-05, "loss": 0.0037, "step": 3730 }, { "epoch": 0.374, "grad_norm": 0.05542735382914543, "learning_rate": 3.13e-05, "loss": 0.0042, "step": 3740 }, { "epoch": 0.375, "grad_norm": 0.03302931785583496, "learning_rate": 3.125e-05, "loss": 0.0048, "step": 3750 }, { "epoch": 0.376, "grad_norm": 0.028392167761921883, "learning_rate": 3.12e-05, "loss": 0.0042, "step": 3760 }, { "epoch": 0.377, "grad_norm": 0.05274713411927223, "learning_rate": 3.115e-05, "loss": 0.0045, "step": 3770 }, { "epoch": 0.378, "grad_norm": 0.024890929460525513, "learning_rate": 3.1100000000000004e-05, "loss": 0.0038, "step": 3780 }, { "epoch": 0.379, "grad_norm": 0.02797631174325943, "learning_rate": 3.105e-05, "loss": 0.0036, "step": 3790 }, { "epoch": 0.38, "grad_norm": 0.033390454947948456, "learning_rate": 3.1e-05, "loss": 0.0039, "step": 3800 }, { "epoch": 0.381, "grad_norm": 0.024741416797041893, "learning_rate": 3.095e-05, "loss": 0.0042, "step": 3810 }, { "epoch": 0.382, "grad_norm": 0.05398337543010712, "learning_rate": 3.09e-05, "loss": 0.0049, "step": 3820 }, { "epoch": 0.383, "grad_norm": 0.027936646714806557, "learning_rate": 3.0850000000000004e-05, "loss": 0.0034, "step": 3830 }, { "epoch": 0.384, "grad_norm": 0.02413495071232319, "learning_rate": 3.08e-05, "loss": 0.0036, "step": 3840 }, { "epoch": 0.385, "grad_norm": 0.037689995020627975, "learning_rate": 3.075e-05, "loss": 0.0033, "step": 3850 }, { "epoch": 0.386, "grad_norm": 0.028174949809908867, "learning_rate": 3.07e-05, "loss": 0.0036, "step": 3860 }, { "epoch": 0.387, "grad_norm": 0.064354807138443, "learning_rate": 3.065e-05, "loss": 0.0037, "step": 3870 }, { "epoch": 0.388, "grad_norm": 0.028341595083475113, "learning_rate": 3.06e-05, "loss": 0.0034, "step": 3880 }, { "epoch": 0.389, "grad_norm": 0.06142325699329376, "learning_rate": 3.0550000000000004e-05, "loss": 0.004, "step": 3890 }, { "epoch": 0.39, "grad_norm": 0.03553822636604309, "learning_rate": 3.05e-05, "loss": 0.004, "step": 3900 }, { "epoch": 0.391, "grad_norm": 0.025645367801189423, "learning_rate": 3.045e-05, "loss": 0.0032, "step": 3910 }, { "epoch": 0.392, "grad_norm": 0.053947921842336655, "learning_rate": 3.04e-05, "loss": 0.0042, "step": 3920 }, { "epoch": 0.393, "grad_norm": 0.040126167237758636, "learning_rate": 3.035e-05, "loss": 0.0038, "step": 3930 }, { "epoch": 0.394, "grad_norm": 0.02956206165254116, "learning_rate": 3.03e-05, "loss": 0.0035, "step": 3940 }, { "epoch": 0.395, "grad_norm": 0.11952024698257446, "learning_rate": 3.025e-05, "loss": 0.0057, "step": 3950 }, { "epoch": 0.396, "grad_norm": 0.04155363142490387, "learning_rate": 3.02e-05, "loss": 0.0034, "step": 3960 }, { "epoch": 0.397, "grad_norm": 0.03884551301598549, "learning_rate": 3.015e-05, "loss": 0.0037, "step": 3970 }, { "epoch": 0.398, "grad_norm": 0.033869728446006775, "learning_rate": 3.01e-05, "loss": 0.0033, "step": 3980 }, { "epoch": 0.399, "grad_norm": 0.027508044615387917, "learning_rate": 3.0050000000000002e-05, "loss": 0.0038, "step": 3990 }, { "epoch": 0.4, "grad_norm": 0.019838711246848106, "learning_rate": 3e-05, "loss": 0.0034, "step": 4000 }, { "epoch": 0.401, "grad_norm": 0.042124535888433456, "learning_rate": 2.995e-05, "loss": 0.003, "step": 4010 }, { "epoch": 0.402, "grad_norm": 0.03583139553666115, "learning_rate": 2.9900000000000002e-05, "loss": 0.0029, "step": 4020 }, { "epoch": 0.403, "grad_norm": 0.024187223985791206, "learning_rate": 2.985e-05, "loss": 0.0031, "step": 4030 }, { "epoch": 0.404, "grad_norm": 0.02509123831987381, "learning_rate": 2.98e-05, "loss": 0.003, "step": 4040 }, { "epoch": 0.405, "grad_norm": 0.015798581764101982, "learning_rate": 2.975e-05, "loss": 0.003, "step": 4050 }, { "epoch": 0.406, "grad_norm": 0.01964486949145794, "learning_rate": 2.97e-05, "loss": 0.0032, "step": 4060 }, { "epoch": 0.407, "grad_norm": 0.025820232927799225, "learning_rate": 2.965e-05, "loss": 0.0032, "step": 4070 }, { "epoch": 0.408, "grad_norm": 0.03453589975833893, "learning_rate": 2.96e-05, "loss": 0.003, "step": 4080 }, { "epoch": 0.409, "grad_norm": 0.022311529144644737, "learning_rate": 2.955e-05, "loss": 0.0025, "step": 4090 }, { "epoch": 0.41, "grad_norm": 0.02296466939151287, "learning_rate": 2.95e-05, "loss": 0.003, "step": 4100 }, { "epoch": 0.411, "grad_norm": 0.022816313430666924, "learning_rate": 2.945e-05, "loss": 0.0032, "step": 4110 }, { "epoch": 0.412, "grad_norm": 0.021030904725193977, "learning_rate": 2.94e-05, "loss": 0.003, "step": 4120 }, { "epoch": 0.413, "grad_norm": 0.02336346171796322, "learning_rate": 2.935e-05, "loss": 0.0028, "step": 4130 }, { "epoch": 0.414, "grad_norm": 0.019582638517022133, "learning_rate": 2.93e-05, "loss": 0.0027, "step": 4140 }, { "epoch": 0.415, "grad_norm": 0.031429585069417953, "learning_rate": 2.925e-05, "loss": 0.0027, "step": 4150 }, { "epoch": 0.416, "grad_norm": 0.027804825454950333, "learning_rate": 2.9199999999999998e-05, "loss": 0.0027, "step": 4160 }, { "epoch": 0.417, "grad_norm": 0.022006656974554062, "learning_rate": 2.915e-05, "loss": 0.003, "step": 4170 }, { "epoch": 0.418, "grad_norm": 0.052478939294815063, "learning_rate": 2.91e-05, "loss": 0.0044, "step": 4180 }, { "epoch": 0.419, "grad_norm": 0.03854925185441971, "learning_rate": 2.9049999999999998e-05, "loss": 0.0036, "step": 4190 }, { "epoch": 0.42, "grad_norm": 0.02749469131231308, "learning_rate": 2.9e-05, "loss": 0.0033, "step": 4200 }, { "epoch": 0.421, "grad_norm": 0.01697971485555172, "learning_rate": 2.895e-05, "loss": 0.0032, "step": 4210 }, { "epoch": 0.422, "grad_norm": 0.05183997377753258, "learning_rate": 2.8899999999999998e-05, "loss": 0.0029, "step": 4220 }, { "epoch": 0.423, "grad_norm": 0.030102815479040146, "learning_rate": 2.885e-05, "loss": 0.0028, "step": 4230 }, { "epoch": 0.424, "grad_norm": 0.027241216972470284, "learning_rate": 2.88e-05, "loss": 0.0033, "step": 4240 }, { "epoch": 0.425, "grad_norm": 0.01855759136378765, "learning_rate": 2.8749999999999997e-05, "loss": 0.0024, "step": 4250 }, { "epoch": 0.426, "grad_norm": 0.019300928339362144, "learning_rate": 2.87e-05, "loss": 0.0024, "step": 4260 }, { "epoch": 0.427, "grad_norm": 0.01639522798359394, "learning_rate": 2.865e-05, "loss": 0.0026, "step": 4270 }, { "epoch": 0.428, "grad_norm": 0.027084793895483017, "learning_rate": 2.86e-05, "loss": 0.0027, "step": 4280 }, { "epoch": 0.429, "grad_norm": 0.021206015720963478, "learning_rate": 2.855e-05, "loss": 0.0025, "step": 4290 }, { "epoch": 0.43, "grad_norm": 0.0655827671289444, "learning_rate": 2.8499999999999998e-05, "loss": 0.0027, "step": 4300 }, { "epoch": 0.431, "grad_norm": 0.03779730945825577, "learning_rate": 2.845e-05, "loss": 0.0024, "step": 4310 }, { "epoch": 0.432, "grad_norm": 0.045267749577760696, "learning_rate": 2.84e-05, "loss": 0.0031, "step": 4320 }, { "epoch": 0.433, "grad_norm": 0.017473919317126274, "learning_rate": 2.8349999999999998e-05, "loss": 0.0027, "step": 4330 }, { "epoch": 0.434, "grad_norm": 0.019513197243213654, "learning_rate": 2.83e-05, "loss": 0.0024, "step": 4340 }, { "epoch": 0.435, "grad_norm": 0.01616765186190605, "learning_rate": 2.825e-05, "loss": 0.0025, "step": 4350 }, { "epoch": 0.436, "grad_norm": 0.02270474284887314, "learning_rate": 2.8199999999999998e-05, "loss": 0.0024, "step": 4360 }, { "epoch": 0.437, "grad_norm": 0.02363002672791481, "learning_rate": 2.815e-05, "loss": 0.0025, "step": 4370 }, { "epoch": 0.438, "grad_norm": 0.023898936808109283, "learning_rate": 2.8100000000000005e-05, "loss": 0.0023, "step": 4380 }, { "epoch": 0.439, "grad_norm": 0.01270793005824089, "learning_rate": 2.8050000000000004e-05, "loss": 0.0022, "step": 4390 }, { "epoch": 0.44, "grad_norm": 0.03917006403207779, "learning_rate": 2.8000000000000003e-05, "loss": 0.0029, "step": 4400 }, { "epoch": 0.441, "grad_norm": 0.028284449130296707, "learning_rate": 2.7950000000000005e-05, "loss": 0.0023, "step": 4410 }, { "epoch": 0.442, "grad_norm": 0.017493903636932373, "learning_rate": 2.7900000000000004e-05, "loss": 0.0022, "step": 4420 }, { "epoch": 0.443, "grad_norm": 0.03160572797060013, "learning_rate": 2.7850000000000003e-05, "loss": 0.0025, "step": 4430 }, { "epoch": 0.444, "grad_norm": 0.022926049306988716, "learning_rate": 2.7800000000000005e-05, "loss": 0.0024, "step": 4440 }, { "epoch": 0.445, "grad_norm": 0.032902974635362625, "learning_rate": 2.7750000000000004e-05, "loss": 0.0025, "step": 4450 }, { "epoch": 0.446, "grad_norm": 0.017781972885131836, "learning_rate": 2.7700000000000002e-05, "loss": 0.0022, "step": 4460 }, { "epoch": 0.447, "grad_norm": 0.1209416389465332, "learning_rate": 2.7650000000000005e-05, "loss": 0.0028, "step": 4470 }, { "epoch": 0.448, "grad_norm": 0.03747338801622391, "learning_rate": 2.7600000000000003e-05, "loss": 0.0024, "step": 4480 }, { "epoch": 0.449, "grad_norm": 0.04210735112428665, "learning_rate": 2.7550000000000002e-05, "loss": 0.0024, "step": 4490 }, { "epoch": 0.45, "grad_norm": 0.02306324429810047, "learning_rate": 2.7500000000000004e-05, "loss": 0.0022, "step": 4500 }, { "epoch": 0.451, "grad_norm": 0.027622856199741364, "learning_rate": 2.7450000000000003e-05, "loss": 0.0026, "step": 4510 }, { "epoch": 0.452, "grad_norm": 0.014202162623405457, "learning_rate": 2.7400000000000002e-05, "loss": 0.0022, "step": 4520 }, { "epoch": 0.453, "grad_norm": 0.051466915756464005, "learning_rate": 2.7350000000000004e-05, "loss": 0.0031, "step": 4530 }, { "epoch": 0.454, "grad_norm": 0.052768610417842865, "learning_rate": 2.7300000000000003e-05, "loss": 0.0028, "step": 4540 }, { "epoch": 0.455, "grad_norm": 0.02291076071560383, "learning_rate": 2.725e-05, "loss": 0.0023, "step": 4550 }, { "epoch": 0.456, "grad_norm": 0.027942989021539688, "learning_rate": 2.7200000000000004e-05, "loss": 0.0023, "step": 4560 }, { "epoch": 0.457, "grad_norm": 0.01529670413583517, "learning_rate": 2.7150000000000003e-05, "loss": 0.0024, "step": 4570 }, { "epoch": 0.458, "grad_norm": 0.02945224940776825, "learning_rate": 2.7100000000000005e-05, "loss": 0.0024, "step": 4580 }, { "epoch": 0.459, "grad_norm": 0.027197351679205894, "learning_rate": 2.7050000000000004e-05, "loss": 0.0024, "step": 4590 }, { "epoch": 0.46, "grad_norm": 0.022022951394319534, "learning_rate": 2.7000000000000002e-05, "loss": 0.0023, "step": 4600 }, { "epoch": 0.461, "grad_norm": 0.019739823415875435, "learning_rate": 2.6950000000000005e-05, "loss": 0.0022, "step": 4610 }, { "epoch": 0.462, "grad_norm": 0.06794995814561844, "learning_rate": 2.6900000000000003e-05, "loss": 0.0027, "step": 4620 }, { "epoch": 0.463, "grad_norm": 0.049228962510824203, "learning_rate": 2.6850000000000002e-05, "loss": 0.0026, "step": 4630 }, { "epoch": 0.464, "grad_norm": 0.0241558700799942, "learning_rate": 2.6800000000000004e-05, "loss": 0.0021, "step": 4640 }, { "epoch": 0.465, "grad_norm": 0.024576248601078987, "learning_rate": 2.6750000000000003e-05, "loss": 0.0022, "step": 4650 }, { "epoch": 0.466, "grad_norm": 0.030337205156683922, "learning_rate": 2.6700000000000002e-05, "loss": 0.0024, "step": 4660 }, { "epoch": 0.467, "grad_norm": 0.015081087127327919, "learning_rate": 2.6650000000000004e-05, "loss": 0.002, "step": 4670 }, { "epoch": 0.468, "grad_norm": 0.026368912309408188, "learning_rate": 2.6600000000000003e-05, "loss": 0.0022, "step": 4680 }, { "epoch": 0.469, "grad_norm": 0.018447600305080414, "learning_rate": 2.655e-05, "loss": 0.0022, "step": 4690 }, { "epoch": 0.47, "grad_norm": 0.018314722925424576, "learning_rate": 2.6500000000000004e-05, "loss": 0.0019, "step": 4700 }, { "epoch": 0.471, "grad_norm": 0.02361704409122467, "learning_rate": 2.6450000000000003e-05, "loss": 0.0023, "step": 4710 }, { "epoch": 0.472, "grad_norm": 0.02032247930765152, "learning_rate": 2.64e-05, "loss": 0.002, "step": 4720 }, { "epoch": 0.473, "grad_norm": 0.017889728769659996, "learning_rate": 2.6350000000000004e-05, "loss": 0.0019, "step": 4730 }, { "epoch": 0.474, "grad_norm": 0.01962173730134964, "learning_rate": 2.6300000000000002e-05, "loss": 0.0019, "step": 4740 }, { "epoch": 0.475, "grad_norm": 0.015778113156557083, "learning_rate": 2.625e-05, "loss": 0.0021, "step": 4750 }, { "epoch": 0.476, "grad_norm": 0.01894952729344368, "learning_rate": 2.6200000000000003e-05, "loss": 0.0023, "step": 4760 }, { "epoch": 0.477, "grad_norm": 0.0239462498575449, "learning_rate": 2.6150000000000002e-05, "loss": 0.002, "step": 4770 }, { "epoch": 0.478, "grad_norm": 0.025406278669834137, "learning_rate": 2.61e-05, "loss": 0.0021, "step": 4780 }, { "epoch": 0.479, "grad_norm": 0.01813661865890026, "learning_rate": 2.6050000000000003e-05, "loss": 0.0023, "step": 4790 }, { "epoch": 0.48, "grad_norm": 0.01979188807308674, "learning_rate": 2.6000000000000002e-05, "loss": 0.0019, "step": 4800 }, { "epoch": 0.481, "grad_norm": 0.02219184674322605, "learning_rate": 2.595e-05, "loss": 0.002, "step": 4810 }, { "epoch": 0.482, "grad_norm": 0.012867514044046402, "learning_rate": 2.5900000000000003e-05, "loss": 0.0017, "step": 4820 }, { "epoch": 0.483, "grad_norm": 0.014178570359945297, "learning_rate": 2.585e-05, "loss": 0.0023, "step": 4830 }, { "epoch": 0.484, "grad_norm": 0.013582895509898663, "learning_rate": 2.58e-05, "loss": 0.002, "step": 4840 }, { "epoch": 0.485, "grad_norm": 0.02718137763440609, "learning_rate": 2.5750000000000002e-05, "loss": 0.0019, "step": 4850 }, { "epoch": 0.486, "grad_norm": 0.016559738665819168, "learning_rate": 2.57e-05, "loss": 0.0019, "step": 4860 }, { "epoch": 0.487, "grad_norm": 0.01447515282779932, "learning_rate": 2.5650000000000003e-05, "loss": 0.0017, "step": 4870 }, { "epoch": 0.488, "grad_norm": 0.010251802392303944, "learning_rate": 2.5600000000000002e-05, "loss": 0.0017, "step": 4880 }, { "epoch": 0.489, "grad_norm": 0.021138856187462807, "learning_rate": 2.555e-05, "loss": 0.0023, "step": 4890 }, { "epoch": 0.49, "grad_norm": 0.026664163917303085, "learning_rate": 2.5500000000000003e-05, "loss": 0.0021, "step": 4900 }, { "epoch": 0.491, "grad_norm": 0.012794408947229385, "learning_rate": 2.5450000000000002e-05, "loss": 0.0017, "step": 4910 }, { "epoch": 0.492, "grad_norm": 0.013725240714848042, "learning_rate": 2.54e-05, "loss": 0.0018, "step": 4920 }, { "epoch": 0.493, "grad_norm": 0.01432815007865429, "learning_rate": 2.5350000000000003e-05, "loss": 0.0017, "step": 4930 }, { "epoch": 0.494, "grad_norm": 0.014761424623429775, "learning_rate": 2.5300000000000002e-05, "loss": 0.0019, "step": 4940 }, { "epoch": 0.495, "grad_norm": 0.06742983311414719, "learning_rate": 2.525e-05, "loss": 0.0025, "step": 4950 }, { "epoch": 0.496, "grad_norm": 0.024261673912405968, "learning_rate": 2.5200000000000003e-05, "loss": 0.0021, "step": 4960 }, { "epoch": 0.497, "grad_norm": 0.02116272784769535, "learning_rate": 2.515e-05, "loss": 0.002, "step": 4970 }, { "epoch": 0.498, "grad_norm": 0.014996570535004139, "learning_rate": 2.51e-05, "loss": 0.0017, "step": 4980 }, { "epoch": 0.499, "grad_norm": 0.014970551244914532, "learning_rate": 2.5050000000000002e-05, "loss": 0.002, "step": 4990 }, { "epoch": 0.5, "grad_norm": 0.01756688393652439, "learning_rate": 2.5e-05, "loss": 0.002, "step": 5000 }, { "epoch": 0.501, "grad_norm": 0.012683290056884289, "learning_rate": 2.495e-05, "loss": 0.0016, "step": 5010 }, { "epoch": 0.502, "grad_norm": 0.011495651677250862, "learning_rate": 2.4900000000000002e-05, "loss": 0.0016, "step": 5020 }, { "epoch": 0.503, "grad_norm": 0.014306634664535522, "learning_rate": 2.485e-05, "loss": 0.0018, "step": 5030 }, { "epoch": 0.504, "grad_norm": 0.02241896465420723, "learning_rate": 2.48e-05, "loss": 0.0021, "step": 5040 }, { "epoch": 0.505, "grad_norm": 0.017740361392498016, "learning_rate": 2.4750000000000002e-05, "loss": 0.0016, "step": 5050 }, { "epoch": 0.506, "grad_norm": 0.013199679553508759, "learning_rate": 2.47e-05, "loss": 0.0015, "step": 5060 }, { "epoch": 0.507, "grad_norm": 0.057298243045806885, "learning_rate": 2.465e-05, "loss": 0.0019, "step": 5070 }, { "epoch": 0.508, "grad_norm": 0.03238265961408615, "learning_rate": 2.46e-05, "loss": 0.0026, "step": 5080 }, { "epoch": 0.509, "grad_norm": 0.04820936918258667, "learning_rate": 2.455e-05, "loss": 0.0027, "step": 5090 }, { "epoch": 0.51, "grad_norm": 0.022526515647768974, "learning_rate": 2.45e-05, "loss": 0.0018, "step": 5100 }, { "epoch": 0.511, "grad_norm": 0.1899888962507248, "learning_rate": 2.445e-05, "loss": 0.0026, "step": 5110 }, { "epoch": 0.512, "grad_norm": 0.05366889387369156, "learning_rate": 2.44e-05, "loss": 0.003, "step": 5120 }, { "epoch": 0.513, "grad_norm": 0.028939131647348404, "learning_rate": 2.435e-05, "loss": 0.0021, "step": 5130 }, { "epoch": 0.514, "grad_norm": 0.023352844640612602, "learning_rate": 2.43e-05, "loss": 0.0019, "step": 5140 }, { "epoch": 0.515, "grad_norm": 0.015283104963600636, "learning_rate": 2.425e-05, "loss": 0.0017, "step": 5150 }, { "epoch": 0.516, "grad_norm": 0.0149134686216712, "learning_rate": 2.4200000000000002e-05, "loss": 0.0016, "step": 5160 }, { "epoch": 0.517, "grad_norm": 0.01739874854683876, "learning_rate": 2.415e-05, "loss": 0.0021, "step": 5170 }, { "epoch": 0.518, "grad_norm": 0.012562318705022335, "learning_rate": 2.41e-05, "loss": 0.0016, "step": 5180 }, { "epoch": 0.519, "grad_norm": 0.01181173324584961, "learning_rate": 2.4050000000000002e-05, "loss": 0.0017, "step": 5190 }, { "epoch": 0.52, "grad_norm": 0.0216183140873909, "learning_rate": 2.4e-05, "loss": 0.0017, "step": 5200 }, { "epoch": 0.521, "grad_norm": 0.014552557840943336, "learning_rate": 2.395e-05, "loss": 0.0017, "step": 5210 }, { "epoch": 0.522, "grad_norm": 0.013402258977293968, "learning_rate": 2.39e-05, "loss": 0.0015, "step": 5220 }, { "epoch": 0.523, "grad_norm": 0.017692307010293007, "learning_rate": 2.385e-05, "loss": 0.0017, "step": 5230 }, { "epoch": 0.524, "grad_norm": 0.007425515912473202, "learning_rate": 2.38e-05, "loss": 0.0015, "step": 5240 }, { "epoch": 0.525, "grad_norm": 0.010397032834589481, "learning_rate": 2.375e-05, "loss": 0.0014, "step": 5250 }, { "epoch": 0.526, "grad_norm": 0.013170558027923107, "learning_rate": 2.37e-05, "loss": 0.0017, "step": 5260 }, { "epoch": 0.527, "grad_norm": 0.47324055433273315, "learning_rate": 2.365e-05, "loss": 0.0037, "step": 5270 }, { "epoch": 0.528, "grad_norm": 0.06395496428012848, "learning_rate": 2.36e-05, "loss": 0.003, "step": 5280 }, { "epoch": 0.529, "grad_norm": 0.032293129712343216, "learning_rate": 2.355e-05, "loss": 0.0022, "step": 5290 }, { "epoch": 0.53, "grad_norm": 0.021514760330319405, "learning_rate": 2.35e-05, "loss": 0.002, "step": 5300 }, { "epoch": 0.531, "grad_norm": 0.016594447195529938, "learning_rate": 2.345e-05, "loss": 0.002, "step": 5310 }, { "epoch": 0.532, "grad_norm": 0.020661164075136185, "learning_rate": 2.3400000000000003e-05, "loss": 0.0018, "step": 5320 }, { "epoch": 0.533, "grad_norm": 0.01472094189375639, "learning_rate": 2.3350000000000002e-05, "loss": 0.0022, "step": 5330 }, { "epoch": 0.534, "grad_norm": 0.014501375146210194, "learning_rate": 2.3300000000000004e-05, "loss": 0.0017, "step": 5340 }, { "epoch": 0.535, "grad_norm": 0.01241264771670103, "learning_rate": 2.3250000000000003e-05, "loss": 0.0015, "step": 5350 }, { "epoch": 0.536, "grad_norm": 0.015589526854455471, "learning_rate": 2.32e-05, "loss": 0.0018, "step": 5360 }, { "epoch": 0.537, "grad_norm": 0.013468182645738125, "learning_rate": 2.3150000000000004e-05, "loss": 0.0018, "step": 5370 }, { "epoch": 0.538, "grad_norm": 0.015258733183145523, "learning_rate": 2.3100000000000002e-05, "loss": 0.0015, "step": 5380 }, { "epoch": 0.539, "grad_norm": 0.010932616889476776, "learning_rate": 2.305e-05, "loss": 0.0014, "step": 5390 }, { "epoch": 0.54, "grad_norm": 0.0102313794195652, "learning_rate": 2.3000000000000003e-05, "loss": 0.0014, "step": 5400 }, { "epoch": 0.541, "grad_norm": 0.00674120569601655, "learning_rate": 2.2950000000000002e-05, "loss": 0.0014, "step": 5410 }, { "epoch": 0.542, "grad_norm": 0.015179513022303581, "learning_rate": 2.29e-05, "loss": 0.0014, "step": 5420 }, { "epoch": 0.543, "grad_norm": 0.03448422998189926, "learning_rate": 2.2850000000000003e-05, "loss": 0.0019, "step": 5430 }, { "epoch": 0.544, "grad_norm": 0.028603358194231987, "learning_rate": 2.2800000000000002e-05, "loss": 0.0019, "step": 5440 }, { "epoch": 0.545, "grad_norm": 0.014372209087014198, "learning_rate": 2.275e-05, "loss": 0.0016, "step": 5450 }, { "epoch": 0.546, "grad_norm": 0.031532082706689835, "learning_rate": 2.2700000000000003e-05, "loss": 0.0017, "step": 5460 }, { "epoch": 0.547, "grad_norm": 0.018091056495904922, "learning_rate": 2.265e-05, "loss": 0.0016, "step": 5470 }, { "epoch": 0.548, "grad_norm": 0.014843069948256016, "learning_rate": 2.26e-05, "loss": 0.0015, "step": 5480 }, { "epoch": 0.549, "grad_norm": 0.011632148176431656, "learning_rate": 2.2550000000000003e-05, "loss": 0.0014, "step": 5490 }, { "epoch": 0.55, "grad_norm": 0.009511668235063553, "learning_rate": 2.25e-05, "loss": 0.0014, "step": 5500 }, { "epoch": 0.551, "grad_norm": 0.007981637492775917, "learning_rate": 2.245e-05, "loss": 0.0014, "step": 5510 }, { "epoch": 0.552, "grad_norm": 0.021288806572556496, "learning_rate": 2.2400000000000002e-05, "loss": 0.0015, "step": 5520 }, { "epoch": 0.553, "grad_norm": 0.01468642894178629, "learning_rate": 2.235e-05, "loss": 0.0018, "step": 5530 }, { "epoch": 0.554, "grad_norm": 0.011532713659107685, "learning_rate": 2.23e-05, "loss": 0.0012, "step": 5540 }, { "epoch": 0.555, "grad_norm": 0.00889046210795641, "learning_rate": 2.2250000000000002e-05, "loss": 0.0011, "step": 5550 }, { "epoch": 0.556, "grad_norm": 0.01401284895837307, "learning_rate": 2.22e-05, "loss": 0.0014, "step": 5560 }, { "epoch": 0.557, "grad_norm": 0.012369327247142792, "learning_rate": 2.215e-05, "loss": 0.0015, "step": 5570 }, { "epoch": 0.558, "grad_norm": 0.015258446335792542, "learning_rate": 2.2100000000000002e-05, "loss": 0.0015, "step": 5580 }, { "epoch": 0.559, "grad_norm": 0.009015046060085297, "learning_rate": 2.205e-05, "loss": 0.0012, "step": 5590 }, { "epoch": 0.56, "grad_norm": 0.011163819581270218, "learning_rate": 2.2000000000000003e-05, "loss": 0.0012, "step": 5600 }, { "epoch": 0.561, "grad_norm": 0.016389524564146996, "learning_rate": 2.195e-05, "loss": 0.0016, "step": 5610 }, { "epoch": 0.562, "grad_norm": 0.01325678639113903, "learning_rate": 2.19e-05, "loss": 0.0013, "step": 5620 }, { "epoch": 0.563, "grad_norm": 0.017966121435165405, "learning_rate": 2.1850000000000003e-05, "loss": 0.0014, "step": 5630 }, { "epoch": 0.564, "grad_norm": 0.012039076536893845, "learning_rate": 2.18e-05, "loss": 0.0013, "step": 5640 }, { "epoch": 0.565, "grad_norm": 0.006665175314992666, "learning_rate": 2.175e-05, "loss": 0.0012, "step": 5650 }, { "epoch": 0.566, "grad_norm": 0.0105441864579916, "learning_rate": 2.1700000000000002e-05, "loss": 0.0014, "step": 5660 }, { "epoch": 0.567, "grad_norm": 0.007554101757705212, "learning_rate": 2.165e-05, "loss": 0.0011, "step": 5670 }, { "epoch": 0.568, "grad_norm": 0.009823901578783989, "learning_rate": 2.16e-05, "loss": 0.0013, "step": 5680 }, { "epoch": 0.569, "grad_norm": 0.01720455475151539, "learning_rate": 2.1550000000000002e-05, "loss": 0.0015, "step": 5690 }, { "epoch": 0.57, "grad_norm": 0.01107338909059763, "learning_rate": 2.15e-05, "loss": 0.0012, "step": 5700 }, { "epoch": 0.571, "grad_norm": 0.01756761223077774, "learning_rate": 2.145e-05, "loss": 0.0014, "step": 5710 }, { "epoch": 0.572, "grad_norm": 0.022118983790278435, "learning_rate": 2.1400000000000002e-05, "loss": 0.0015, "step": 5720 }, { "epoch": 0.573, "grad_norm": 0.01616830937564373, "learning_rate": 2.135e-05, "loss": 0.0014, "step": 5730 }, { "epoch": 0.574, "grad_norm": 0.020481310784816742, "learning_rate": 2.13e-05, "loss": 0.0023, "step": 5740 }, { "epoch": 0.575, "grad_norm": 0.018176857382059097, "learning_rate": 2.125e-05, "loss": 0.0015, "step": 5750 }, { "epoch": 0.576, "grad_norm": 0.011317101307213306, "learning_rate": 2.12e-05, "loss": 0.0012, "step": 5760 }, { "epoch": 0.577, "grad_norm": 0.028791502118110657, "learning_rate": 2.115e-05, "loss": 0.0014, "step": 5770 }, { "epoch": 0.578, "grad_norm": 0.013037024065852165, "learning_rate": 2.11e-05, "loss": 0.0013, "step": 5780 }, { "epoch": 0.579, "grad_norm": 0.021426070481538773, "learning_rate": 2.105e-05, "loss": 0.0015, "step": 5790 }, { "epoch": 0.58, "grad_norm": 0.012033521197736263, "learning_rate": 2.1e-05, "loss": 0.0011, "step": 5800 }, { "epoch": 0.581, "grad_norm": 0.014337443746626377, "learning_rate": 2.095e-05, "loss": 0.0012, "step": 5810 }, { "epoch": 0.582, "grad_norm": 0.008603113703429699, "learning_rate": 2.09e-05, "loss": 0.0011, "step": 5820 }, { "epoch": 0.583, "grad_norm": 0.025418557226657867, "learning_rate": 2.085e-05, "loss": 0.0014, "step": 5830 }, { "epoch": 0.584, "grad_norm": 0.008621426299214363, "learning_rate": 2.08e-05, "loss": 0.0011, "step": 5840 }, { "epoch": 0.585, "grad_norm": 0.009969389997422695, "learning_rate": 2.075e-05, "loss": 0.0015, "step": 5850 }, { "epoch": 0.586, "grad_norm": 0.00997992418706417, "learning_rate": 2.07e-05, "loss": 0.0011, "step": 5860 }, { "epoch": 0.587, "grad_norm": 0.019949181005358696, "learning_rate": 2.065e-05, "loss": 0.001, "step": 5870 }, { "epoch": 0.588, "grad_norm": 0.009619793854653835, "learning_rate": 2.06e-05, "loss": 0.0011, "step": 5880 }, { "epoch": 0.589, "grad_norm": 0.007747489493340254, "learning_rate": 2.055e-05, "loss": 0.0012, "step": 5890 }, { "epoch": 0.59, "grad_norm": 0.01052554789930582, "learning_rate": 2.05e-05, "loss": 0.0014, "step": 5900 }, { "epoch": 0.591, "grad_norm": 0.014904200099408627, "learning_rate": 2.045e-05, "loss": 0.0012, "step": 5910 }, { "epoch": 0.592, "grad_norm": 0.00679561635479331, "learning_rate": 2.04e-05, "loss": 0.0011, "step": 5920 }, { "epoch": 0.593, "grad_norm": 0.006072670221328735, "learning_rate": 2.035e-05, "loss": 0.0011, "step": 5930 }, { "epoch": 0.594, "grad_norm": 0.014733157120645046, "learning_rate": 2.0300000000000002e-05, "loss": 0.0011, "step": 5940 }, { "epoch": 0.595, "grad_norm": 0.015511419624090195, "learning_rate": 2.025e-05, "loss": 0.0016, "step": 5950 }, { "epoch": 0.596, "grad_norm": 0.010620438493788242, "learning_rate": 2.0200000000000003e-05, "loss": 0.0012, "step": 5960 }, { "epoch": 0.597, "grad_norm": 0.0075794099830091, "learning_rate": 2.0150000000000002e-05, "loss": 0.0011, "step": 5970 }, { "epoch": 0.598, "grad_norm": 0.007882976904511452, "learning_rate": 2.01e-05, "loss": 0.0011, "step": 5980 }, { "epoch": 0.599, "grad_norm": 0.011548763141036034, "learning_rate": 2.0050000000000003e-05, "loss": 0.0013, "step": 5990 }, { "epoch": 0.6, "grad_norm": 0.0084703853353858, "learning_rate": 2e-05, "loss": 0.0011, "step": 6000 }, { "epoch": 0.601, "grad_norm": 0.007603704463690519, "learning_rate": 1.995e-05, "loss": 0.001, "step": 6010 }, { "epoch": 0.602, "grad_norm": 0.008562711998820305, "learning_rate": 1.9900000000000003e-05, "loss": 0.0012, "step": 6020 }, { "epoch": 0.603, "grad_norm": 0.007590813562273979, "learning_rate": 1.985e-05, "loss": 0.001, "step": 6030 }, { "epoch": 0.604, "grad_norm": 0.020342741161584854, "learning_rate": 1.9800000000000004e-05, "loss": 0.0017, "step": 6040 }, { "epoch": 0.605, "grad_norm": 0.16912633180618286, "learning_rate": 1.9750000000000002e-05, "loss": 0.0089, "step": 6050 }, { "epoch": 0.606, "grad_norm": 0.08793429285287857, "learning_rate": 1.97e-05, "loss": 0.0027, "step": 6060 }, { "epoch": 0.607, "grad_norm": 0.05196760594844818, "learning_rate": 1.9650000000000003e-05, "loss": 0.0022, "step": 6070 }, { "epoch": 0.608, "grad_norm": 0.02118327096104622, "learning_rate": 1.9600000000000002e-05, "loss": 0.0021, "step": 6080 }, { "epoch": 0.609, "grad_norm": 0.013289586640894413, "learning_rate": 1.955e-05, "loss": 0.0013, "step": 6090 }, { "epoch": 0.61, "grad_norm": 0.012911707162857056, "learning_rate": 1.9500000000000003e-05, "loss": 0.0013, "step": 6100 }, { "epoch": 0.611, "grad_norm": 0.018663186579942703, "learning_rate": 1.9450000000000002e-05, "loss": 0.0012, "step": 6110 }, { "epoch": 0.612, "grad_norm": 0.010551884770393372, "learning_rate": 1.94e-05, "loss": 0.0012, "step": 6120 }, { "epoch": 0.613, "grad_norm": 0.015853077173233032, "learning_rate": 1.9350000000000003e-05, "loss": 0.0013, "step": 6130 }, { "epoch": 0.614, "grad_norm": 0.020374910905957222, "learning_rate": 1.93e-05, "loss": 0.001, "step": 6140 }, { "epoch": 0.615, "grad_norm": 0.015159848146140575, "learning_rate": 1.925e-05, "loss": 0.0013, "step": 6150 }, { "epoch": 0.616, "grad_norm": 0.007991676218807697, "learning_rate": 1.9200000000000003e-05, "loss": 0.0013, "step": 6160 }, { "epoch": 0.617, "grad_norm": 0.007849587127566338, "learning_rate": 1.915e-05, "loss": 0.0011, "step": 6170 }, { "epoch": 0.618, "grad_norm": 0.022048622369766235, "learning_rate": 1.91e-05, "loss": 0.001, "step": 6180 }, { "epoch": 0.619, "grad_norm": 0.021215343847870827, "learning_rate": 1.9050000000000002e-05, "loss": 0.0011, "step": 6190 }, { "epoch": 0.62, "grad_norm": 0.012288344092667103, "learning_rate": 1.9e-05, "loss": 0.0012, "step": 6200 }, { "epoch": 0.621, "grad_norm": 0.020313331857323647, "learning_rate": 1.895e-05, "loss": 0.0011, "step": 6210 }, { "epoch": 0.622, "grad_norm": 0.008762447163462639, "learning_rate": 1.8900000000000002e-05, "loss": 0.001, "step": 6220 }, { "epoch": 0.623, "grad_norm": 0.0247616209089756, "learning_rate": 1.885e-05, "loss": 0.0011, "step": 6230 }, { "epoch": 0.624, "grad_norm": 0.09021363407373428, "learning_rate": 1.88e-05, "loss": 0.0016, "step": 6240 }, { "epoch": 0.625, "grad_norm": 0.017945896834135056, "learning_rate": 1.8750000000000002e-05, "loss": 0.0011, "step": 6250 }, { "epoch": 0.626, "grad_norm": 0.011303462088108063, "learning_rate": 1.87e-05, "loss": 0.0011, "step": 6260 }, { "epoch": 0.627, "grad_norm": 0.008381664752960205, "learning_rate": 1.865e-05, "loss": 0.0011, "step": 6270 }, { "epoch": 0.628, "grad_norm": 0.011003987863659859, "learning_rate": 1.86e-05, "loss": 0.0012, "step": 6280 }, { "epoch": 0.629, "grad_norm": 0.015965888276696205, "learning_rate": 1.855e-05, "loss": 0.001, "step": 6290 }, { "epoch": 0.63, "grad_norm": 0.006507181562483311, "learning_rate": 1.85e-05, "loss": 0.0009, "step": 6300 }, { "epoch": 0.631, "grad_norm": 0.015577591024339199, "learning_rate": 1.845e-05, "loss": 0.001, "step": 6310 }, { "epoch": 0.632, "grad_norm": 0.006741558667272329, "learning_rate": 1.84e-05, "loss": 0.0011, "step": 6320 }, { "epoch": 0.633, "grad_norm": 0.016030525788664818, "learning_rate": 1.8350000000000002e-05, "loss": 0.001, "step": 6330 }, { "epoch": 0.634, "grad_norm": 0.010763168334960938, "learning_rate": 1.83e-05, "loss": 0.0011, "step": 6340 }, { "epoch": 0.635, "grad_norm": 0.017273874953389168, "learning_rate": 1.825e-05, "loss": 0.001, "step": 6350 }, { "epoch": 0.636, "grad_norm": 0.010964670218527317, "learning_rate": 1.8200000000000002e-05, "loss": 0.0011, "step": 6360 }, { "epoch": 0.637, "grad_norm": 0.00803497713059187, "learning_rate": 1.815e-05, "loss": 0.0009, "step": 6370 }, { "epoch": 0.638, "grad_norm": 0.007479315157979727, "learning_rate": 1.81e-05, "loss": 0.0014, "step": 6380 }, { "epoch": 0.639, "grad_norm": 0.010598058812320232, "learning_rate": 1.805e-05, "loss": 0.001, "step": 6390 }, { "epoch": 0.64, "grad_norm": 0.009770036675035954, "learning_rate": 1.8e-05, "loss": 0.0009, "step": 6400 }, { "epoch": 0.641, "grad_norm": 0.011602561920881271, "learning_rate": 1.795e-05, "loss": 0.0008, "step": 6410 }, { "epoch": 0.642, "grad_norm": 0.0076597342267632484, "learning_rate": 1.79e-05, "loss": 0.0008, "step": 6420 }, { "epoch": 0.643, "grad_norm": 0.012248953804373741, "learning_rate": 1.785e-05, "loss": 0.0008, "step": 6430 }, { "epoch": 0.644, "grad_norm": 0.005626557394862175, "learning_rate": 1.78e-05, "loss": 0.0008, "step": 6440 }, { "epoch": 0.645, "grad_norm": 0.005482000298798084, "learning_rate": 1.775e-05, "loss": 0.0008, "step": 6450 }, { "epoch": 0.646, "grad_norm": 0.007456011138856411, "learning_rate": 1.77e-05, "loss": 0.0008, "step": 6460 }, { "epoch": 0.647, "grad_norm": 0.008909308351576328, "learning_rate": 1.765e-05, "loss": 0.0008, "step": 6470 }, { "epoch": 0.648, "grad_norm": 0.011135280132293701, "learning_rate": 1.76e-05, "loss": 0.0009, "step": 6480 }, { "epoch": 0.649, "grad_norm": 0.01595783233642578, "learning_rate": 1.755e-05, "loss": 0.001, "step": 6490 }, { "epoch": 0.65, "grad_norm": 0.013902807608246803, "learning_rate": 1.75e-05, "loss": 0.0011, "step": 6500 }, { "epoch": 0.651, "grad_norm": 0.010244622826576233, "learning_rate": 1.745e-05, "loss": 0.0009, "step": 6510 }, { "epoch": 0.652, "grad_norm": 0.007476091384887695, "learning_rate": 1.74e-05, "loss": 0.0009, "step": 6520 }, { "epoch": 0.653, "grad_norm": 0.013044660910964012, "learning_rate": 1.7349999999999998e-05, "loss": 0.0009, "step": 6530 }, { "epoch": 0.654, "grad_norm": 0.004804369527846575, "learning_rate": 1.73e-05, "loss": 0.0009, "step": 6540 }, { "epoch": 0.655, "grad_norm": 0.006042002234607935, "learning_rate": 1.725e-05, "loss": 0.0008, "step": 6550 }, { "epoch": 0.656, "grad_norm": 0.010785943828523159, "learning_rate": 1.7199999999999998e-05, "loss": 0.0009, "step": 6560 }, { "epoch": 0.657, "grad_norm": 0.011350172571837902, "learning_rate": 1.7150000000000004e-05, "loss": 0.0008, "step": 6570 }, { "epoch": 0.658, "grad_norm": 0.007638021372258663, "learning_rate": 1.7100000000000002e-05, "loss": 0.0009, "step": 6580 }, { "epoch": 0.659, "grad_norm": 0.005735939834266901, "learning_rate": 1.705e-05, "loss": 0.0009, "step": 6590 }, { "epoch": 0.66, "grad_norm": 0.02717960625886917, "learning_rate": 1.7000000000000003e-05, "loss": 0.0011, "step": 6600 }, { "epoch": 0.661, "grad_norm": 0.006012643221765757, "learning_rate": 1.6950000000000002e-05, "loss": 0.0008, "step": 6610 }, { "epoch": 0.662, "grad_norm": 0.00599683728069067, "learning_rate": 1.69e-05, "loss": 0.0008, "step": 6620 }, { "epoch": 0.663, "grad_norm": 0.026952974498271942, "learning_rate": 1.6850000000000003e-05, "loss": 0.0008, "step": 6630 }, { "epoch": 0.664, "grad_norm": 0.008171536959707737, "learning_rate": 1.6800000000000002e-05, "loss": 0.0008, "step": 6640 }, { "epoch": 0.665, "grad_norm": 0.007446442265063524, "learning_rate": 1.675e-05, "loss": 0.0009, "step": 6650 }, { "epoch": 0.666, "grad_norm": 0.006456063129007816, "learning_rate": 1.6700000000000003e-05, "loss": 0.0008, "step": 6660 }, { "epoch": 0.667, "grad_norm": 0.008162173442542553, "learning_rate": 1.665e-05, "loss": 0.0007, "step": 6670 }, { "epoch": 0.668, "grad_norm": 0.004432919900864363, "learning_rate": 1.66e-05, "loss": 0.0008, "step": 6680 }, { "epoch": 0.669, "grad_norm": 0.007158307824283838, "learning_rate": 1.6550000000000002e-05, "loss": 0.0008, "step": 6690 }, { "epoch": 0.67, "grad_norm": 0.003983801696449518, "learning_rate": 1.65e-05, "loss": 0.0007, "step": 6700 }, { "epoch": 0.671, "grad_norm": 0.005170087795704603, "learning_rate": 1.645e-05, "loss": 0.0008, "step": 6710 }, { "epoch": 0.672, "grad_norm": 0.004729804117232561, "learning_rate": 1.6400000000000002e-05, "loss": 0.0008, "step": 6720 }, { "epoch": 0.673, "grad_norm": 0.010037174448370934, "learning_rate": 1.635e-05, "loss": 0.001, "step": 6730 }, { "epoch": 0.674, "grad_norm": 0.050949569791555405, "learning_rate": 1.63e-05, "loss": 0.0023, "step": 6740 }, { "epoch": 0.675, "grad_norm": 0.0323474146425724, "learning_rate": 1.6250000000000002e-05, "loss": 0.0017, "step": 6750 }, { "epoch": 0.676, "grad_norm": 0.027231359854340553, "learning_rate": 1.62e-05, "loss": 0.0021, "step": 6760 }, { "epoch": 0.677, "grad_norm": 0.01555855292826891, "learning_rate": 1.6150000000000003e-05, "loss": 0.0013, "step": 6770 }, { "epoch": 0.678, "grad_norm": 0.01804298162460327, "learning_rate": 1.6100000000000002e-05, "loss": 0.0011, "step": 6780 }, { "epoch": 0.679, "grad_norm": 0.011248771101236343, "learning_rate": 1.605e-05, "loss": 0.0011, "step": 6790 }, { "epoch": 0.68, "grad_norm": 0.007389044389128685, "learning_rate": 1.6000000000000003e-05, "loss": 0.0009, "step": 6800 }, { "epoch": 0.681, "grad_norm": 0.014606145210564137, "learning_rate": 1.595e-05, "loss": 0.0012, "step": 6810 }, { "epoch": 0.682, "grad_norm": 0.012476052157580853, "learning_rate": 1.59e-05, "loss": 0.0009, "step": 6820 }, { "epoch": 0.683, "grad_norm": 0.009272475726902485, "learning_rate": 1.5850000000000002e-05, "loss": 0.0009, "step": 6830 }, { "epoch": 0.684, "grad_norm": 0.011705187149345875, "learning_rate": 1.58e-05, "loss": 0.0009, "step": 6840 }, { "epoch": 0.685, "grad_norm": 0.01874556578695774, "learning_rate": 1.575e-05, "loss": 0.0011, "step": 6850 }, { "epoch": 0.686, "grad_norm": 0.01463324110955, "learning_rate": 1.5700000000000002e-05, "loss": 0.0009, "step": 6860 }, { "epoch": 0.687, "grad_norm": 0.012001392431557178, "learning_rate": 1.565e-05, "loss": 0.001, "step": 6870 }, { "epoch": 0.688, "grad_norm": 0.009366356767714024, "learning_rate": 1.56e-05, "loss": 0.0008, "step": 6880 }, { "epoch": 0.689, "grad_norm": 0.010064000263810158, "learning_rate": 1.5550000000000002e-05, "loss": 0.0009, "step": 6890 }, { "epoch": 0.69, "grad_norm": 0.016703909263014793, "learning_rate": 1.55e-05, "loss": 0.0009, "step": 6900 }, { "epoch": 0.691, "grad_norm": 0.0146669652312994, "learning_rate": 1.545e-05, "loss": 0.001, "step": 6910 }, { "epoch": 0.692, "grad_norm": 0.006643705535680056, "learning_rate": 1.54e-05, "loss": 0.0009, "step": 6920 }, { "epoch": 0.693, "grad_norm": 0.011501871049404144, "learning_rate": 1.535e-05, "loss": 0.0008, "step": 6930 }, { "epoch": 0.694, "grad_norm": 0.008170065470039845, "learning_rate": 1.53e-05, "loss": 0.0008, "step": 6940 }, { "epoch": 0.695, "grad_norm": 0.00737554719671607, "learning_rate": 1.525e-05, "loss": 0.0007, "step": 6950 }, { "epoch": 0.696, "grad_norm": 0.006846282631158829, "learning_rate": 1.52e-05, "loss": 0.0009, "step": 6960 }, { "epoch": 0.697, "grad_norm": 0.007784941233694553, "learning_rate": 1.515e-05, "loss": 0.0008, "step": 6970 }, { "epoch": 0.698, "grad_norm": 0.009864069521427155, "learning_rate": 1.51e-05, "loss": 0.0008, "step": 6980 }, { "epoch": 0.699, "grad_norm": 0.007372863125056028, "learning_rate": 1.505e-05, "loss": 0.0009, "step": 6990 }, { "epoch": 0.7, "grad_norm": 0.006507135462015867, "learning_rate": 1.5e-05, "loss": 0.0008, "step": 7000 }, { "epoch": 0.701, "grad_norm": 0.03093353845179081, "learning_rate": 1.4950000000000001e-05, "loss": 0.0014, "step": 7010 }, { "epoch": 0.702, "grad_norm": 0.01417300570756197, "learning_rate": 1.49e-05, "loss": 0.001, "step": 7020 }, { "epoch": 0.703, "grad_norm": 0.010836401022970676, "learning_rate": 1.485e-05, "loss": 0.0012, "step": 7030 }, { "epoch": 0.704, "grad_norm": 0.01000068336725235, "learning_rate": 1.48e-05, "loss": 0.001, "step": 7040 }, { "epoch": 0.705, "grad_norm": 0.008654952049255371, "learning_rate": 1.475e-05, "loss": 0.0009, "step": 7050 }, { "epoch": 0.706, "grad_norm": 0.010761331766843796, "learning_rate": 1.47e-05, "loss": 0.001, "step": 7060 }, { "epoch": 0.707, "grad_norm": 0.006188638508319855, "learning_rate": 1.465e-05, "loss": 0.0008, "step": 7070 }, { "epoch": 0.708, "grad_norm": 0.007858789525926113, "learning_rate": 1.4599999999999999e-05, "loss": 0.0008, "step": 7080 }, { "epoch": 0.709, "grad_norm": 0.02773350477218628, "learning_rate": 1.455e-05, "loss": 0.0014, "step": 7090 }, { "epoch": 0.71, "grad_norm": 0.012381108477711678, "learning_rate": 1.45e-05, "loss": 0.0009, "step": 7100 }, { "epoch": 0.711, "grad_norm": 0.009256324730813503, "learning_rate": 1.4449999999999999e-05, "loss": 0.0008, "step": 7110 }, { "epoch": 0.712, "grad_norm": 0.007005748804658651, "learning_rate": 1.44e-05, "loss": 0.0009, "step": 7120 }, { "epoch": 0.713, "grad_norm": 0.0055755749344825745, "learning_rate": 1.435e-05, "loss": 0.0007, "step": 7130 }, { "epoch": 0.714, "grad_norm": 0.003967254888266325, "learning_rate": 1.43e-05, "loss": 0.0008, "step": 7140 }, { "epoch": 0.715, "grad_norm": 0.0079165268689394, "learning_rate": 1.4249999999999999e-05, "loss": 0.0011, "step": 7150 }, { "epoch": 0.716, "grad_norm": 0.004682580940425396, "learning_rate": 1.42e-05, "loss": 0.0007, "step": 7160 }, { "epoch": 0.717, "grad_norm": 0.008578700013458729, "learning_rate": 1.415e-05, "loss": 0.0011, "step": 7170 }, { "epoch": 0.718, "grad_norm": 0.006943961605429649, "learning_rate": 1.4099999999999999e-05, "loss": 0.0009, "step": 7180 }, { "epoch": 0.719, "grad_norm": 0.0072656250558793545, "learning_rate": 1.4050000000000003e-05, "loss": 0.0007, "step": 7190 }, { "epoch": 0.72, "grad_norm": 0.005639955401420593, "learning_rate": 1.4000000000000001e-05, "loss": 0.0007, "step": 7200 }, { "epoch": 0.721, "grad_norm": 0.005733838304877281, "learning_rate": 1.3950000000000002e-05, "loss": 0.0008, "step": 7210 }, { "epoch": 0.722, "grad_norm": 0.02654002234339714, "learning_rate": 1.3900000000000002e-05, "loss": 0.0008, "step": 7220 }, { "epoch": 0.723, "grad_norm": 0.007308628410100937, "learning_rate": 1.3850000000000001e-05, "loss": 0.0008, "step": 7230 }, { "epoch": 0.724, "grad_norm": 0.006939894054085016, "learning_rate": 1.3800000000000002e-05, "loss": 0.0007, "step": 7240 }, { "epoch": 0.725, "grad_norm": 0.03964811936020851, "learning_rate": 1.3750000000000002e-05, "loss": 0.0013, "step": 7250 }, { "epoch": 0.726, "grad_norm": 0.014138396829366684, "learning_rate": 1.3700000000000001e-05, "loss": 0.001, "step": 7260 }, { "epoch": 0.727, "grad_norm": 0.008445181883871555, "learning_rate": 1.3650000000000001e-05, "loss": 0.0008, "step": 7270 }, { "epoch": 0.728, "grad_norm": 0.01134855579584837, "learning_rate": 1.3600000000000002e-05, "loss": 0.0009, "step": 7280 }, { "epoch": 0.729, "grad_norm": 0.010982022620737553, "learning_rate": 1.3550000000000002e-05, "loss": 0.0015, "step": 7290 }, { "epoch": 0.73, "grad_norm": 0.011698734015226364, "learning_rate": 1.3500000000000001e-05, "loss": 0.0008, "step": 7300 }, { "epoch": 0.731, "grad_norm": 0.006420729216188192, "learning_rate": 1.3450000000000002e-05, "loss": 0.0008, "step": 7310 }, { "epoch": 0.732, "grad_norm": 0.006088167428970337, "learning_rate": 1.3400000000000002e-05, "loss": 0.0008, "step": 7320 }, { "epoch": 0.733, "grad_norm": 0.0071141645312309265, "learning_rate": 1.3350000000000001e-05, "loss": 0.0012, "step": 7330 }, { "epoch": 0.734, "grad_norm": 0.004975921008735895, "learning_rate": 1.3300000000000001e-05, "loss": 0.0006, "step": 7340 }, { "epoch": 0.735, "grad_norm": 0.004499469883739948, "learning_rate": 1.3250000000000002e-05, "loss": 0.0007, "step": 7350 }, { "epoch": 0.736, "grad_norm": 0.009738982655107975, "learning_rate": 1.32e-05, "loss": 0.001, "step": 7360 }, { "epoch": 0.737, "grad_norm": 0.006863337475806475, "learning_rate": 1.3150000000000001e-05, "loss": 0.001, "step": 7370 }, { "epoch": 0.738, "grad_norm": 0.008216536603868008, "learning_rate": 1.3100000000000002e-05, "loss": 0.0007, "step": 7380 }, { "epoch": 0.739, "grad_norm": 0.006803369149565697, "learning_rate": 1.305e-05, "loss": 0.0008, "step": 7390 }, { "epoch": 0.74, "grad_norm": 0.00551017839461565, "learning_rate": 1.3000000000000001e-05, "loss": 0.0008, "step": 7400 }, { "epoch": 0.741, "grad_norm": 0.009463651105761528, "learning_rate": 1.2950000000000001e-05, "loss": 0.0008, "step": 7410 }, { "epoch": 0.742, "grad_norm": 0.01233983039855957, "learning_rate": 1.29e-05, "loss": 0.0019, "step": 7420 }, { "epoch": 0.743, "grad_norm": 0.008470877073705196, "learning_rate": 1.285e-05, "loss": 0.0009, "step": 7430 }, { "epoch": 0.744, "grad_norm": 0.007592742796987295, "learning_rate": 1.2800000000000001e-05, "loss": 0.0008, "step": 7440 }, { "epoch": 0.745, "grad_norm": 0.03596987947821617, "learning_rate": 1.2750000000000002e-05, "loss": 0.001, "step": 7450 }, { "epoch": 0.746, "grad_norm": 0.005849502049386501, "learning_rate": 1.27e-05, "loss": 0.0008, "step": 7460 }, { "epoch": 0.747, "grad_norm": 0.009035659022629261, "learning_rate": 1.2650000000000001e-05, "loss": 0.0007, "step": 7470 }, { "epoch": 0.748, "grad_norm": 0.010397679172456264, "learning_rate": 1.2600000000000001e-05, "loss": 0.0014, "step": 7480 }, { "epoch": 0.749, "grad_norm": 0.014514378271996975, "learning_rate": 1.255e-05, "loss": 0.0008, "step": 7490 }, { "epoch": 0.75, "grad_norm": 0.004837281536310911, "learning_rate": 1.25e-05, "loss": 0.0006, "step": 7500 }, { "epoch": 0.751, "grad_norm": 0.007720770314335823, "learning_rate": 1.2450000000000001e-05, "loss": 0.0006, "step": 7510 }, { "epoch": 0.752, "grad_norm": 0.012046804651618004, "learning_rate": 1.24e-05, "loss": 0.0011, "step": 7520 }, { "epoch": 0.753, "grad_norm": 0.01343387458473444, "learning_rate": 1.235e-05, "loss": 0.0007, "step": 7530 }, { "epoch": 0.754, "grad_norm": 0.00810600072145462, "learning_rate": 1.23e-05, "loss": 0.0007, "step": 7540 }, { "epoch": 0.755, "grad_norm": 0.00925883837044239, "learning_rate": 1.225e-05, "loss": 0.0006, "step": 7550 }, { "epoch": 0.756, "grad_norm": 0.01927885413169861, "learning_rate": 1.22e-05, "loss": 0.0014, "step": 7560 }, { "epoch": 0.757, "grad_norm": 0.010129665955901146, "learning_rate": 1.215e-05, "loss": 0.0006, "step": 7570 }, { "epoch": 0.758, "grad_norm": 0.007863885723054409, "learning_rate": 1.2100000000000001e-05, "loss": 0.0006, "step": 7580 }, { "epoch": 0.759, "grad_norm": 0.005500464700162411, "learning_rate": 1.205e-05, "loss": 0.0007, "step": 7590 }, { "epoch": 0.76, "grad_norm": 0.0040563903748989105, "learning_rate": 1.2e-05, "loss": 0.0006, "step": 7600 }, { "epoch": 0.761, "grad_norm": 0.006361998151987791, "learning_rate": 1.195e-05, "loss": 0.0007, "step": 7610 }, { "epoch": 0.762, "grad_norm": 0.0136310625821352, "learning_rate": 1.19e-05, "loss": 0.0008, "step": 7620 }, { "epoch": 0.763, "grad_norm": 0.005384715739637613, "learning_rate": 1.185e-05, "loss": 0.0007, "step": 7630 }, { "epoch": 0.764, "grad_norm": 0.014707676135003567, "learning_rate": 1.18e-05, "loss": 0.0007, "step": 7640 }, { "epoch": 0.765, "grad_norm": 0.008092684671282768, "learning_rate": 1.175e-05, "loss": 0.0006, "step": 7650 }, { "epoch": 0.766, "grad_norm": 0.007185132242739201, "learning_rate": 1.1700000000000001e-05, "loss": 0.0006, "step": 7660 }, { "epoch": 0.767, "grad_norm": 0.005672789178788662, "learning_rate": 1.1650000000000002e-05, "loss": 0.0006, "step": 7670 }, { "epoch": 0.768, "grad_norm": 0.05434956029057503, "learning_rate": 1.16e-05, "loss": 0.001, "step": 7680 }, { "epoch": 0.769, "grad_norm": 0.00933472067117691, "learning_rate": 1.1550000000000001e-05, "loss": 0.0007, "step": 7690 }, { "epoch": 0.77, "grad_norm": 0.008684621192514896, "learning_rate": 1.1500000000000002e-05, "loss": 0.0006, "step": 7700 }, { "epoch": 0.771, "grad_norm": 0.03054739721119404, "learning_rate": 1.145e-05, "loss": 0.0006, "step": 7710 }, { "epoch": 0.772, "grad_norm": 0.005998207256197929, "learning_rate": 1.1400000000000001e-05, "loss": 0.0006, "step": 7720 }, { "epoch": 0.773, "grad_norm": 0.006153833121061325, "learning_rate": 1.1350000000000001e-05, "loss": 0.0006, "step": 7730 }, { "epoch": 0.774, "grad_norm": 0.007491481024771929, "learning_rate": 1.13e-05, "loss": 0.0007, "step": 7740 }, { "epoch": 0.775, "grad_norm": 0.01078925933688879, "learning_rate": 1.125e-05, "loss": 0.0006, "step": 7750 }, { "epoch": 0.776, "grad_norm": 0.005885554943233728, "learning_rate": 1.1200000000000001e-05, "loss": 0.0006, "step": 7760 }, { "epoch": 0.777, "grad_norm": 0.005423078313469887, "learning_rate": 1.115e-05, "loss": 0.0007, "step": 7770 }, { "epoch": 0.778, "grad_norm": 0.008044522255659103, "learning_rate": 1.11e-05, "loss": 0.0006, "step": 7780 }, { "epoch": 0.779, "grad_norm": 0.00733207818120718, "learning_rate": 1.1050000000000001e-05, "loss": 0.0007, "step": 7790 }, { "epoch": 0.78, "grad_norm": 0.0066906120628118515, "learning_rate": 1.1000000000000001e-05, "loss": 0.0009, "step": 7800 }, { "epoch": 0.781, "grad_norm": 0.004443836398422718, "learning_rate": 1.095e-05, "loss": 0.0006, "step": 7810 }, { "epoch": 0.782, "grad_norm": 0.0058379145339131355, "learning_rate": 1.09e-05, "loss": 0.0007, "step": 7820 }, { "epoch": 0.783, "grad_norm": 0.006808693055063486, "learning_rate": 1.0850000000000001e-05, "loss": 0.0006, "step": 7830 }, { "epoch": 0.784, "grad_norm": 0.008773542940616608, "learning_rate": 1.08e-05, "loss": 0.0006, "step": 7840 }, { "epoch": 0.785, "grad_norm": 0.006700740661472082, "learning_rate": 1.075e-05, "loss": 0.0006, "step": 7850 }, { "epoch": 0.786, "grad_norm": 0.00906393863260746, "learning_rate": 1.0700000000000001e-05, "loss": 0.0006, "step": 7860 }, { "epoch": 0.787, "grad_norm": 0.0030822190456092358, "learning_rate": 1.065e-05, "loss": 0.0005, "step": 7870 }, { "epoch": 0.788, "grad_norm": 0.0029632148798555136, "learning_rate": 1.06e-05, "loss": 0.0005, "step": 7880 }, { "epoch": 0.789, "grad_norm": 0.004798842128366232, "learning_rate": 1.055e-05, "loss": 0.0006, "step": 7890 }, { "epoch": 0.79, "grad_norm": 0.007376812864094973, "learning_rate": 1.05e-05, "loss": 0.0005, "step": 7900 }, { "epoch": 0.791, "grad_norm": 0.009337624534964561, "learning_rate": 1.045e-05, "loss": 0.0009, "step": 7910 }, { "epoch": 0.792, "grad_norm": 0.012847904115915298, "learning_rate": 1.04e-05, "loss": 0.0008, "step": 7920 }, { "epoch": 0.793, "grad_norm": 0.005587203428149223, "learning_rate": 1.035e-05, "loss": 0.0006, "step": 7930 }, { "epoch": 0.794, "grad_norm": 0.008464600890874863, "learning_rate": 1.03e-05, "loss": 0.0006, "step": 7940 }, { "epoch": 0.795, "grad_norm": 0.2516852617263794, "learning_rate": 1.025e-05, "loss": 0.002, "step": 7950 }, { "epoch": 0.796, "grad_norm": 0.04664693772792816, "learning_rate": 1.02e-05, "loss": 0.002, "step": 7960 }, { "epoch": 0.797, "grad_norm": 0.02456306852400303, "learning_rate": 1.0150000000000001e-05, "loss": 0.0013, "step": 7970 }, { "epoch": 0.798, "grad_norm": 0.011320951394736767, "learning_rate": 1.0100000000000002e-05, "loss": 0.0009, "step": 7980 }, { "epoch": 0.799, "grad_norm": 0.01860683411359787, "learning_rate": 1.005e-05, "loss": 0.0012, "step": 7990 }, { "epoch": 0.8, "grad_norm": 0.03227970749139786, "learning_rate": 1e-05, "loss": 0.0009, "step": 8000 }, { "epoch": 0.801, "grad_norm": 0.015873363241553307, "learning_rate": 9.950000000000001e-06, "loss": 0.0008, "step": 8010 }, { "epoch": 0.802, "grad_norm": 0.005454899277538061, "learning_rate": 9.900000000000002e-06, "loss": 0.0008, "step": 8020 }, { "epoch": 0.803, "grad_norm": 0.007948348298668861, "learning_rate": 9.85e-06, "loss": 0.0007, "step": 8030 }, { "epoch": 0.804, "grad_norm": 0.013328757137060165, "learning_rate": 9.800000000000001e-06, "loss": 0.0006, "step": 8040 }, { "epoch": 0.805, "grad_norm": 0.01018743496388197, "learning_rate": 9.750000000000002e-06, "loss": 0.0012, "step": 8050 }, { "epoch": 0.806, "grad_norm": 0.009421809576451778, "learning_rate": 9.7e-06, "loss": 0.0008, "step": 8060 }, { "epoch": 0.807, "grad_norm": 0.005202045664191246, "learning_rate": 9.65e-06, "loss": 0.0007, "step": 8070 }, { "epoch": 0.808, "grad_norm": 0.012956002727150917, "learning_rate": 9.600000000000001e-06, "loss": 0.0007, "step": 8080 }, { "epoch": 0.809, "grad_norm": 0.006403383333235979, "learning_rate": 9.55e-06, "loss": 0.0007, "step": 8090 }, { "epoch": 0.81, "grad_norm": 0.027560915797948837, "learning_rate": 9.5e-06, "loss": 0.0008, "step": 8100 }, { "epoch": 0.811, "grad_norm": 0.005196988116949797, "learning_rate": 9.450000000000001e-06, "loss": 0.0006, "step": 8110 }, { "epoch": 0.812, "grad_norm": 0.009510821662843227, "learning_rate": 9.4e-06, "loss": 0.0006, "step": 8120 }, { "epoch": 0.813, "grad_norm": 0.006430651992559433, "learning_rate": 9.35e-06, "loss": 0.0006, "step": 8130 }, { "epoch": 0.814, "grad_norm": 0.019426727667450905, "learning_rate": 9.3e-06, "loss": 0.0009, "step": 8140 }, { "epoch": 0.815, "grad_norm": 0.011564865708351135, "learning_rate": 9.25e-06, "loss": 0.0006, "step": 8150 }, { "epoch": 0.816, "grad_norm": 0.009036659263074398, "learning_rate": 9.2e-06, "loss": 0.0008, "step": 8160 }, { "epoch": 0.817, "grad_norm": 0.006685588974505663, "learning_rate": 9.15e-06, "loss": 0.0007, "step": 8170 }, { "epoch": 0.818, "grad_norm": 0.005980687215924263, "learning_rate": 9.100000000000001e-06, "loss": 0.0005, "step": 8180 }, { "epoch": 0.819, "grad_norm": 0.0029402158688753843, "learning_rate": 9.05e-06, "loss": 0.0005, "step": 8190 }, { "epoch": 0.82, "grad_norm": 0.0034720194526016712, "learning_rate": 9e-06, "loss": 0.0006, "step": 8200 }, { "epoch": 0.821, "grad_norm": 0.008967465721070766, "learning_rate": 8.95e-06, "loss": 0.0009, "step": 8210 }, { "epoch": 0.822, "grad_norm": 0.007418784312903881, "learning_rate": 8.9e-06, "loss": 0.0007, "step": 8220 }, { "epoch": 0.823, "grad_norm": 0.0077253603376448154, "learning_rate": 8.85e-06, "loss": 0.0006, "step": 8230 }, { "epoch": 0.824, "grad_norm": 0.011202674359083176, "learning_rate": 8.8e-06, "loss": 0.0013, "step": 8240 }, { "epoch": 0.825, "grad_norm": 0.022354573011398315, "learning_rate": 8.75e-06, "loss": 0.0008, "step": 8250 }, { "epoch": 0.826, "grad_norm": 0.01750505343079567, "learning_rate": 8.7e-06, "loss": 0.0013, "step": 8260 }, { "epoch": 0.827, "grad_norm": 0.01153852604329586, "learning_rate": 8.65e-06, "loss": 0.0009, "step": 8270 }, { "epoch": 0.828, "grad_norm": 0.008752427063882351, "learning_rate": 8.599999999999999e-06, "loss": 0.0006, "step": 8280 }, { "epoch": 0.829, "grad_norm": 0.007307702675461769, "learning_rate": 8.550000000000001e-06, "loss": 0.0007, "step": 8290 }, { "epoch": 0.83, "grad_norm": 0.0077101094648242, "learning_rate": 8.500000000000002e-06, "loss": 0.0006, "step": 8300 }, { "epoch": 0.831, "grad_norm": 0.006358897779136896, "learning_rate": 8.45e-06, "loss": 0.0005, "step": 8310 }, { "epoch": 0.832, "grad_norm": 0.003663134528324008, "learning_rate": 8.400000000000001e-06, "loss": 0.0006, "step": 8320 }, { "epoch": 0.833, "grad_norm": 0.005117372144013643, "learning_rate": 8.350000000000001e-06, "loss": 0.0006, "step": 8330 }, { "epoch": 0.834, "grad_norm": 0.004245636984705925, "learning_rate": 8.3e-06, "loss": 0.0005, "step": 8340 }, { "epoch": 0.835, "grad_norm": 0.005357146263122559, "learning_rate": 8.25e-06, "loss": 0.0006, "step": 8350 }, { "epoch": 0.836, "grad_norm": 0.01055213250219822, "learning_rate": 8.200000000000001e-06, "loss": 0.0008, "step": 8360 }, { "epoch": 0.837, "grad_norm": 0.01871907152235508, "learning_rate": 8.15e-06, "loss": 0.0007, "step": 8370 }, { "epoch": 0.838, "grad_norm": 0.013110162690281868, "learning_rate": 8.1e-06, "loss": 0.0005, "step": 8380 }, { "epoch": 0.839, "grad_norm": 0.005271353758871555, "learning_rate": 8.050000000000001e-06, "loss": 0.0007, "step": 8390 }, { "epoch": 0.84, "grad_norm": 0.004324494861066341, "learning_rate": 8.000000000000001e-06, "loss": 0.0005, "step": 8400 }, { "epoch": 0.841, "grad_norm": 0.0031851409003138542, "learning_rate": 7.95e-06, "loss": 0.0006, "step": 8410 }, { "epoch": 0.842, "grad_norm": 0.009736557491123676, "learning_rate": 7.9e-06, "loss": 0.0006, "step": 8420 }, { "epoch": 0.843, "grad_norm": 0.005168536212295294, "learning_rate": 7.850000000000001e-06, "loss": 0.0005, "step": 8430 }, { "epoch": 0.844, "grad_norm": 0.002579685300588608, "learning_rate": 7.8e-06, "loss": 0.0005, "step": 8440 }, { "epoch": 0.845, "grad_norm": 0.008710252121090889, "learning_rate": 7.75e-06, "loss": 0.0005, "step": 8450 }, { "epoch": 0.846, "grad_norm": 0.004952189512550831, "learning_rate": 7.7e-06, "loss": 0.0008, "step": 8460 }, { "epoch": 0.847, "grad_norm": 0.003375423140823841, "learning_rate": 7.65e-06, "loss": 0.0005, "step": 8470 }, { "epoch": 0.848, "grad_norm": 0.13184253871440887, "learning_rate": 7.6e-06, "loss": 0.0012, "step": 8480 }, { "epoch": 0.849, "grad_norm": 0.017549166455864906, "learning_rate": 7.55e-06, "loss": 0.0007, "step": 8490 }, { "epoch": 0.85, "grad_norm": 0.00852286908775568, "learning_rate": 7.5e-06, "loss": 0.0006, "step": 8500 }, { "epoch": 0.851, "grad_norm": 0.005547389388084412, "learning_rate": 7.45e-06, "loss": 0.0005, "step": 8510 }, { "epoch": 0.852, "grad_norm": 0.0061622606590390205, "learning_rate": 7.4e-06, "loss": 0.0005, "step": 8520 }, { "epoch": 0.853, "grad_norm": 0.005182339809834957, "learning_rate": 7.35e-06, "loss": 0.0008, "step": 8530 }, { "epoch": 0.854, "grad_norm": 0.005366960074752569, "learning_rate": 7.2999999999999996e-06, "loss": 0.0006, "step": 8540 }, { "epoch": 0.855, "grad_norm": 0.005542315077036619, "learning_rate": 7.25e-06, "loss": 0.0006, "step": 8550 }, { "epoch": 0.856, "grad_norm": 0.003940809518098831, "learning_rate": 7.2e-06, "loss": 0.0005, "step": 8560 }, { "epoch": 0.857, "grad_norm": 0.003730529686436057, "learning_rate": 7.15e-06, "loss": 0.0006, "step": 8570 }, { "epoch": 0.858, "grad_norm": 0.0033961348235607147, "learning_rate": 7.1e-06, "loss": 0.0005, "step": 8580 }, { "epoch": 0.859, "grad_norm": 0.004546662792563438, "learning_rate": 7.049999999999999e-06, "loss": 0.0006, "step": 8590 }, { "epoch": 0.86, "grad_norm": 0.009168008342385292, "learning_rate": 7.000000000000001e-06, "loss": 0.0005, "step": 8600 }, { "epoch": 0.861, "grad_norm": 0.008373426273465157, "learning_rate": 6.950000000000001e-06, "loss": 0.0008, "step": 8610 }, { "epoch": 0.862, "grad_norm": 0.004947313107550144, "learning_rate": 6.900000000000001e-06, "loss": 0.0006, "step": 8620 }, { "epoch": 0.863, "grad_norm": 0.015127859078347683, "learning_rate": 6.8500000000000005e-06, "loss": 0.0006, "step": 8630 }, { "epoch": 0.864, "grad_norm": 0.0056435600854456425, "learning_rate": 6.800000000000001e-06, "loss": 0.0006, "step": 8640 }, { "epoch": 0.865, "grad_norm": 0.004109732341021299, "learning_rate": 6.750000000000001e-06, "loss": 0.0005, "step": 8650 }, { "epoch": 0.866, "grad_norm": 0.006170314736664295, "learning_rate": 6.700000000000001e-06, "loss": 0.0005, "step": 8660 }, { "epoch": 0.867, "grad_norm": 0.002802550094202161, "learning_rate": 6.650000000000001e-06, "loss": 0.0005, "step": 8670 }, { "epoch": 0.868, "grad_norm": 0.0029788350220769644, "learning_rate": 6.6e-06, "loss": 0.0004, "step": 8680 }, { "epoch": 0.869, "grad_norm": 0.013022363185882568, "learning_rate": 6.550000000000001e-06, "loss": 0.0006, "step": 8690 }, { "epoch": 0.87, "grad_norm": 0.0036853367928415537, "learning_rate": 6.5000000000000004e-06, "loss": 0.0006, "step": 8700 }, { "epoch": 0.871, "grad_norm": 0.002578242914751172, "learning_rate": 6.45e-06, "loss": 0.0005, "step": 8710 }, { "epoch": 0.872, "grad_norm": 0.0036895396187901497, "learning_rate": 6.4000000000000006e-06, "loss": 0.0005, "step": 8720 }, { "epoch": 0.873, "grad_norm": 0.006020987406373024, "learning_rate": 6.35e-06, "loss": 0.0005, "step": 8730 }, { "epoch": 0.874, "grad_norm": 0.006671608425676823, "learning_rate": 6.300000000000001e-06, "loss": 0.0006, "step": 8740 }, { "epoch": 0.875, "grad_norm": 0.0038102639373391867, "learning_rate": 6.25e-06, "loss": 0.0006, "step": 8750 }, { "epoch": 0.876, "grad_norm": 0.006786294747143984, "learning_rate": 6.2e-06, "loss": 0.0004, "step": 8760 }, { "epoch": 0.877, "grad_norm": 0.00381205091252923, "learning_rate": 6.15e-06, "loss": 0.0004, "step": 8770 }, { "epoch": 0.878, "grad_norm": 0.007368630729615688, "learning_rate": 6.1e-06, "loss": 0.0005, "step": 8780 }, { "epoch": 0.879, "grad_norm": 0.0035172586794942617, "learning_rate": 6.0500000000000005e-06, "loss": 0.0006, "step": 8790 }, { "epoch": 0.88, "grad_norm": 0.005555720068514347, "learning_rate": 6e-06, "loss": 0.0007, "step": 8800 }, { "epoch": 0.881, "grad_norm": 0.0076825893484056, "learning_rate": 5.95e-06, "loss": 0.0005, "step": 8810 }, { "epoch": 0.882, "grad_norm": 0.0055446140468120575, "learning_rate": 5.9e-06, "loss": 0.0005, "step": 8820 }, { "epoch": 0.883, "grad_norm": 0.002265618182718754, "learning_rate": 5.850000000000001e-06, "loss": 0.0005, "step": 8830 }, { "epoch": 0.884, "grad_norm": 0.003428585361689329, "learning_rate": 5.8e-06, "loss": 0.0004, "step": 8840 }, { "epoch": 0.885, "grad_norm": 0.0044764927588403225, "learning_rate": 5.750000000000001e-06, "loss": 0.0005, "step": 8850 }, { "epoch": 0.886, "grad_norm": 0.003201392712071538, "learning_rate": 5.7000000000000005e-06, "loss": 0.0005, "step": 8860 }, { "epoch": 0.887, "grad_norm": 0.0029762780759483576, "learning_rate": 5.65e-06, "loss": 0.0006, "step": 8870 }, { "epoch": 0.888, "grad_norm": 0.07450267672538757, "learning_rate": 5.600000000000001e-06, "loss": 0.0009, "step": 8880 }, { "epoch": 0.889, "grad_norm": 0.006392148323357105, "learning_rate": 5.55e-06, "loss": 0.0006, "step": 8890 }, { "epoch": 0.89, "grad_norm": 0.0038995451759546995, "learning_rate": 5.500000000000001e-06, "loss": 0.0005, "step": 8900 }, { "epoch": 0.891, "grad_norm": 0.0028438065201044083, "learning_rate": 5.45e-06, "loss": 0.0004, "step": 8910 }, { "epoch": 0.892, "grad_norm": 0.003168331226333976, "learning_rate": 5.4e-06, "loss": 0.0004, "step": 8920 }, { "epoch": 0.893, "grad_norm": 0.0026163198053836823, "learning_rate": 5.3500000000000004e-06, "loss": 0.0004, "step": 8930 }, { "epoch": 0.894, "grad_norm": 0.0029086521826684475, "learning_rate": 5.3e-06, "loss": 0.0005, "step": 8940 }, { "epoch": 0.895, "grad_norm": 0.011433840729296207, "learning_rate": 5.25e-06, "loss": 0.0007, "step": 8950 }, { "epoch": 0.896, "grad_norm": 0.01782575435936451, "learning_rate": 5.2e-06, "loss": 0.0011, "step": 8960 }, { "epoch": 0.897, "grad_norm": 0.00613692682236433, "learning_rate": 5.15e-06, "loss": 0.0004, "step": 8970 }, { "epoch": 0.898, "grad_norm": 0.02408697083592415, "learning_rate": 5.1e-06, "loss": 0.0007, "step": 8980 }, { "epoch": 0.899, "grad_norm": 0.004028539173305035, "learning_rate": 5.050000000000001e-06, "loss": 0.0005, "step": 8990 }, { "epoch": 0.9, "grad_norm": 0.0032080088276416063, "learning_rate": 5e-06, "loss": 0.0005, "step": 9000 }, { "epoch": 0.901, "grad_norm": 0.0035681568551808596, "learning_rate": 4.950000000000001e-06, "loss": 0.0004, "step": 9010 }, { "epoch": 0.902, "grad_norm": 0.007591512985527515, "learning_rate": 4.9000000000000005e-06, "loss": 0.0005, "step": 9020 }, { "epoch": 0.903, "grad_norm": 0.004855870269238949, "learning_rate": 4.85e-06, "loss": 0.0004, "step": 9030 }, { "epoch": 0.904, "grad_norm": 0.004854188766330481, "learning_rate": 4.800000000000001e-06, "loss": 0.0004, "step": 9040 }, { "epoch": 0.905, "grad_norm": 0.004117886070162058, "learning_rate": 4.75e-06, "loss": 0.0005, "step": 9050 }, { "epoch": 0.906, "grad_norm": 0.0045243133790791035, "learning_rate": 4.7e-06, "loss": 0.0005, "step": 9060 }, { "epoch": 0.907, "grad_norm": 0.001863984507508576, "learning_rate": 4.65e-06, "loss": 0.0004, "step": 9070 }, { "epoch": 0.908, "grad_norm": 0.002472365740686655, "learning_rate": 4.6e-06, "loss": 0.0005, "step": 9080 }, { "epoch": 0.909, "grad_norm": 0.0020466954447329044, "learning_rate": 4.5500000000000005e-06, "loss": 0.0004, "step": 9090 }, { "epoch": 0.91, "grad_norm": 0.004180034622550011, "learning_rate": 4.5e-06, "loss": 0.0004, "step": 9100 }, { "epoch": 0.911, "grad_norm": 0.00341266137547791, "learning_rate": 4.45e-06, "loss": 0.0006, "step": 9110 }, { "epoch": 0.912, "grad_norm": 0.006567875389009714, "learning_rate": 4.4e-06, "loss": 0.0004, "step": 9120 }, { "epoch": 0.913, "grad_norm": 0.003975498490035534, "learning_rate": 4.35e-06, "loss": 0.0006, "step": 9130 }, { "epoch": 0.914, "grad_norm": 0.003391894046217203, "learning_rate": 4.2999999999999995e-06, "loss": 0.0006, "step": 9140 }, { "epoch": 0.915, "grad_norm": 0.005821021273732185, "learning_rate": 4.250000000000001e-06, "loss": 0.0004, "step": 9150 }, { "epoch": 0.916, "grad_norm": 0.0022448371164500713, "learning_rate": 4.2000000000000004e-06, "loss": 0.0004, "step": 9160 }, { "epoch": 0.917, "grad_norm": 0.003718709573149681, "learning_rate": 4.15e-06, "loss": 0.0004, "step": 9170 }, { "epoch": 0.918, "grad_norm": 0.008243223652243614, "learning_rate": 4.1000000000000006e-06, "loss": 0.0007, "step": 9180 }, { "epoch": 0.919, "grad_norm": 0.010773789137601852, "learning_rate": 4.05e-06, "loss": 0.0007, "step": 9190 }, { "epoch": 0.92, "grad_norm": 0.006589268799871206, "learning_rate": 4.000000000000001e-06, "loss": 0.0005, "step": 9200 }, { "epoch": 0.921, "grad_norm": 0.0026856744661927223, "learning_rate": 3.95e-06, "loss": 0.0004, "step": 9210 }, { "epoch": 0.922, "grad_norm": 0.012134186923503876, "learning_rate": 3.9e-06, "loss": 0.0005, "step": 9220 }, { "epoch": 0.923, "grad_norm": 0.004260225687175989, "learning_rate": 3.85e-06, "loss": 0.0005, "step": 9230 }, { "epoch": 0.924, "grad_norm": 0.0023803950753062963, "learning_rate": 3.8e-06, "loss": 0.0004, "step": 9240 }, { "epoch": 0.925, "grad_norm": 0.0037502460181713104, "learning_rate": 3.75e-06, "loss": 0.0005, "step": 9250 }, { "epoch": 0.926, "grad_norm": 0.0017525887815281749, "learning_rate": 3.7e-06, "loss": 0.0003, "step": 9260 }, { "epoch": 0.927, "grad_norm": 0.003996537532657385, "learning_rate": 3.6499999999999998e-06, "loss": 0.0005, "step": 9270 }, { "epoch": 0.928, "grad_norm": 0.009158821776509285, "learning_rate": 3.6e-06, "loss": 0.0007, "step": 9280 }, { "epoch": 0.929, "grad_norm": 0.003372638253495097, "learning_rate": 3.55e-06, "loss": 0.0004, "step": 9290 }, { "epoch": 0.93, "grad_norm": 0.0026602360885590315, "learning_rate": 3.5000000000000004e-06, "loss": 0.0004, "step": 9300 }, { "epoch": 0.931, "grad_norm": 0.014532738365232944, "learning_rate": 3.4500000000000004e-06, "loss": 0.0007, "step": 9310 }, { "epoch": 0.932, "grad_norm": 0.002912462456151843, "learning_rate": 3.4000000000000005e-06, "loss": 0.0004, "step": 9320 }, { "epoch": 0.933, "grad_norm": 0.0052029709331691265, "learning_rate": 3.3500000000000005e-06, "loss": 0.0006, "step": 9330 }, { "epoch": 0.934, "grad_norm": 0.016220854595303535, "learning_rate": 3.3e-06, "loss": 0.0004, "step": 9340 }, { "epoch": 0.935, "grad_norm": 0.0030162036418914795, "learning_rate": 3.2500000000000002e-06, "loss": 0.0004, "step": 9350 }, { "epoch": 0.936, "grad_norm": 0.002491691382601857, "learning_rate": 3.2000000000000003e-06, "loss": 0.0004, "step": 9360 }, { "epoch": 0.937, "grad_norm": 0.022630969062447548, "learning_rate": 3.1500000000000003e-06, "loss": 0.0005, "step": 9370 }, { "epoch": 0.938, "grad_norm": 0.005951160565018654, "learning_rate": 3.1e-06, "loss": 0.0005, "step": 9380 }, { "epoch": 0.939, "grad_norm": 0.0024763622786849737, "learning_rate": 3.05e-06, "loss": 0.0005, "step": 9390 }, { "epoch": 0.94, "grad_norm": 0.0049979290924966335, "learning_rate": 3e-06, "loss": 0.0005, "step": 9400 }, { "epoch": 0.941, "grad_norm": 0.0025999436620622873, "learning_rate": 2.95e-06, "loss": 0.0004, "step": 9410 }, { "epoch": 0.942, "grad_norm": 0.004584169946610928, "learning_rate": 2.9e-06, "loss": 0.0006, "step": 9420 }, { "epoch": 0.943, "grad_norm": 0.005211680196225643, "learning_rate": 2.8500000000000002e-06, "loss": 0.0005, "step": 9430 }, { "epoch": 0.944, "grad_norm": 0.0022507943212985992, "learning_rate": 2.8000000000000003e-06, "loss": 0.0004, "step": 9440 }, { "epoch": 0.945, "grad_norm": 0.0029024691320955753, "learning_rate": 2.7500000000000004e-06, "loss": 0.0004, "step": 9450 }, { "epoch": 0.946, "grad_norm": 0.003968573175370693, "learning_rate": 2.7e-06, "loss": 0.0004, "step": 9460 }, { "epoch": 0.947, "grad_norm": 0.003264777595177293, "learning_rate": 2.65e-06, "loss": 0.0005, "step": 9470 }, { "epoch": 0.948, "grad_norm": 0.0048127188347280025, "learning_rate": 2.6e-06, "loss": 0.0004, "step": 9480 }, { "epoch": 0.949, "grad_norm": 0.004405410494655371, "learning_rate": 2.55e-06, "loss": 0.0006, "step": 9490 }, { "epoch": 0.95, "grad_norm": 0.00462340796366334, "learning_rate": 2.5e-06, "loss": 0.0004, "step": 9500 }, { "epoch": 0.951, "grad_norm": 0.0021721452940255404, "learning_rate": 2.4500000000000003e-06, "loss": 0.0004, "step": 9510 }, { "epoch": 0.952, "grad_norm": 0.002355078933760524, "learning_rate": 2.4000000000000003e-06, "loss": 0.0006, "step": 9520 }, { "epoch": 0.953, "grad_norm": 0.0022414589766412973, "learning_rate": 2.35e-06, "loss": 0.0004, "step": 9530 }, { "epoch": 0.954, "grad_norm": 0.012005253694951534, "learning_rate": 2.3e-06, "loss": 0.0006, "step": 9540 }, { "epoch": 0.955, "grad_norm": 0.00513832364231348, "learning_rate": 2.25e-06, "loss": 0.0005, "step": 9550 }, { "epoch": 0.956, "grad_norm": 0.0027625642251223326, "learning_rate": 2.2e-06, "loss": 0.0005, "step": 9560 }, { "epoch": 0.957, "grad_norm": 0.008645957335829735, "learning_rate": 2.1499999999999997e-06, "loss": 0.0005, "step": 9570 }, { "epoch": 0.958, "grad_norm": 0.00188863230869174, "learning_rate": 2.1000000000000002e-06, "loss": 0.0004, "step": 9580 }, { "epoch": 0.959, "grad_norm": 0.0025561931543052197, "learning_rate": 2.0500000000000003e-06, "loss": 0.0004, "step": 9590 }, { "epoch": 0.96, "grad_norm": 0.0033618698362261057, "learning_rate": 2.0000000000000003e-06, "loss": 0.0004, "step": 9600 }, { "epoch": 0.961, "grad_norm": 0.0018735548947006464, "learning_rate": 1.95e-06, "loss": 0.0004, "step": 9610 }, { "epoch": 0.962, "grad_norm": 0.0019359014695510268, "learning_rate": 1.9e-06, "loss": 0.0005, "step": 9620 }, { "epoch": 0.963, "grad_norm": 0.005369434133172035, "learning_rate": 1.85e-06, "loss": 0.0004, "step": 9630 }, { "epoch": 0.964, "grad_norm": 0.0017576682148501277, "learning_rate": 1.8e-06, "loss": 0.0004, "step": 9640 }, { "epoch": 0.965, "grad_norm": 0.002633103635162115, "learning_rate": 1.7500000000000002e-06, "loss": 0.0004, "step": 9650 }, { "epoch": 0.966, "grad_norm": 0.007023205049335957, "learning_rate": 1.7000000000000002e-06, "loss": 0.0004, "step": 9660 }, { "epoch": 0.967, "grad_norm": 0.0026062438264489174, "learning_rate": 1.65e-06, "loss": 0.0005, "step": 9670 }, { "epoch": 0.968, "grad_norm": 0.0025111304130405188, "learning_rate": 1.6000000000000001e-06, "loss": 0.0005, "step": 9680 }, { "epoch": 0.969, "grad_norm": 0.0028218806255608797, "learning_rate": 1.55e-06, "loss": 0.0004, "step": 9690 }, { "epoch": 0.97, "grad_norm": 0.0024802633561193943, "learning_rate": 1.5e-06, "loss": 0.0005, "step": 9700 }, { "epoch": 0.971, "grad_norm": 0.002883223118260503, "learning_rate": 1.45e-06, "loss": 0.0005, "step": 9710 }, { "epoch": 0.972, "grad_norm": 0.002503247233107686, "learning_rate": 1.4000000000000001e-06, "loss": 0.0005, "step": 9720 }, { "epoch": 0.973, "grad_norm": 0.002495008986443281, "learning_rate": 1.35e-06, "loss": 0.0006, "step": 9730 }, { "epoch": 0.974, "grad_norm": 0.03429775312542915, "learning_rate": 1.3e-06, "loss": 0.0006, "step": 9740 }, { "epoch": 0.975, "grad_norm": 0.003482217201963067, "learning_rate": 1.25e-06, "loss": 0.0004, "step": 9750 }, { "epoch": 0.976, "grad_norm": 0.001837963704019785, "learning_rate": 1.2000000000000002e-06, "loss": 0.0004, "step": 9760 }, { "epoch": 0.977, "grad_norm": 0.0020507893059402704, "learning_rate": 1.15e-06, "loss": 0.0004, "step": 9770 }, { "epoch": 0.978, "grad_norm": 0.0022647210862487555, "learning_rate": 1.1e-06, "loss": 0.0005, "step": 9780 }, { "epoch": 0.979, "grad_norm": 0.0017425378318876028, "learning_rate": 1.0500000000000001e-06, "loss": 0.0004, "step": 9790 }, { "epoch": 0.98, "grad_norm": 0.2319187968969345, "learning_rate": 1.0000000000000002e-06, "loss": 0.0021, "step": 9800 }, { "epoch": 0.981, "grad_norm": 0.01799739897251129, "learning_rate": 9.5e-07, "loss": 0.0006, "step": 9810 }, { "epoch": 0.982, "grad_norm": 0.007147952448576689, "learning_rate": 9e-07, "loss": 0.0005, "step": 9820 }, { "epoch": 0.983, "grad_norm": 0.004181794356554747, "learning_rate": 8.500000000000001e-07, "loss": 0.0005, "step": 9830 }, { "epoch": 0.984, "grad_norm": 0.00277232495136559, "learning_rate": 8.000000000000001e-07, "loss": 0.0004, "step": 9840 }, { "epoch": 0.985, "grad_norm": 0.0024797001387923956, "learning_rate": 7.5e-07, "loss": 0.0006, "step": 9850 }, { "epoch": 0.986, "grad_norm": 0.002748242113739252, "learning_rate": 7.000000000000001e-07, "loss": 0.0005, "step": 9860 }, { "epoch": 0.987, "grad_norm": 0.002988820429891348, "learning_rate": 6.5e-07, "loss": 0.0004, "step": 9870 }, { "epoch": 0.988, "grad_norm": 0.002272873418405652, "learning_rate": 6.000000000000001e-07, "loss": 0.0006, "step": 9880 }, { "epoch": 0.989, "grad_norm": 0.0028824047185480595, "learning_rate": 5.5e-07, "loss": 0.0005, "step": 9890 }, { "epoch": 0.99, "grad_norm": 0.013895529322326183, "learning_rate": 5.000000000000001e-07, "loss": 0.0005, "step": 9900 }, { "epoch": 0.991, "grad_norm": 0.004210934974253178, "learning_rate": 4.5e-07, "loss": 0.0004, "step": 9910 }, { "epoch": 0.992, "grad_norm": 0.0017349456902593374, "learning_rate": 4.0000000000000003e-07, "loss": 0.0005, "step": 9920 }, { "epoch": 0.993, "grad_norm": 0.0036622195038944483, "learning_rate": 3.5000000000000004e-07, "loss": 0.0003, "step": 9930 }, { "epoch": 0.994, "grad_norm": 0.02928483486175537, "learning_rate": 3.0000000000000004e-07, "loss": 0.0006, "step": 9940 }, { "epoch": 0.995, "grad_norm": 0.004271595273166895, "learning_rate": 2.5000000000000004e-07, "loss": 0.0004, "step": 9950 }, { "epoch": 0.996, "grad_norm": 0.004935207776725292, "learning_rate": 2.0000000000000002e-07, "loss": 0.0004, "step": 9960 }, { "epoch": 0.997, "grad_norm": 0.005258087068796158, "learning_rate": 1.5000000000000002e-07, "loss": 0.0004, "step": 9970 }, { "epoch": 0.998, "grad_norm": 0.0014150363858789206, "learning_rate": 1.0000000000000001e-07, "loss": 0.0004, "step": 9980 }, { "epoch": 0.999, "grad_norm": 0.003183445893228054, "learning_rate": 5.0000000000000004e-08, "loss": 0.0004, "step": 9990 }, { "epoch": 1.0, "grad_norm": 0.0063432566821575165, "learning_rate": 0.0, "loss": 0.0004, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.6962203336704e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }