{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 1.7063226699829102, "learning_rate": 4.995e-05, "loss": 9.6305, "step": 10 }, { "epoch": 0.002, "grad_norm": 1.467505693435669, "learning_rate": 4.99e-05, "loss": 8.8474, "step": 20 }, { "epoch": 0.003, "grad_norm": 1.3338744640350342, "learning_rate": 4.9850000000000006e-05, "loss": 8.4272, "step": 30 }, { "epoch": 0.004, "grad_norm": 1.194218635559082, "learning_rate": 4.9800000000000004e-05, "loss": 7.9969, "step": 40 }, { "epoch": 0.005, "grad_norm": 0.9542586207389832, "learning_rate": 4.975e-05, "loss": 7.8018, "step": 50 }, { "epoch": 0.006, "grad_norm": 0.8312947154045105, "learning_rate": 4.97e-05, "loss": 7.5303, "step": 60 }, { "epoch": 0.007, "grad_norm": 0.6978892683982849, "learning_rate": 4.965e-05, "loss": 7.3733, "step": 70 }, { "epoch": 0.008, "grad_norm": 0.6895764470100403, "learning_rate": 4.96e-05, "loss": 7.2434, "step": 80 }, { "epoch": 0.009, "grad_norm": 0.5555976033210754, "learning_rate": 4.9550000000000005e-05, "loss": 7.0877, "step": 90 }, { "epoch": 0.01, "grad_norm": 0.836391806602478, "learning_rate": 4.9500000000000004e-05, "loss": 7.0338, "step": 100 }, { "epoch": 0.011, "grad_norm": 0.782464861869812, "learning_rate": 4.945e-05, "loss": 6.878, "step": 110 }, { "epoch": 0.012, "grad_norm": 1.3705933094024658, "learning_rate": 4.94e-05, "loss": 6.5874, "step": 120 }, { "epoch": 0.013, "grad_norm": 0.7560775876045227, "learning_rate": 4.935e-05, "loss": 6.4978, "step": 130 }, { "epoch": 0.014, "grad_norm": 1.3238508701324463, "learning_rate": 4.93e-05, "loss": 6.3998, "step": 140 }, { "epoch": 0.015, "grad_norm": 0.7834548950195312, "learning_rate": 4.9250000000000004e-05, "loss": 6.2838, "step": 150 }, { "epoch": 0.016, "grad_norm": 0.762347400188446, "learning_rate": 4.92e-05, "loss": 6.0387, "step": 160 }, { "epoch": 0.017, "grad_norm": 0.7799501419067383, "learning_rate": 4.915e-05, "loss": 6.0241, "step": 170 }, { "epoch": 0.018, "grad_norm": 0.7948866486549377, "learning_rate": 4.91e-05, "loss": 5.8776, "step": 180 }, { "epoch": 0.019, "grad_norm": 0.9890483021736145, "learning_rate": 4.905e-05, "loss": 5.747, "step": 190 }, { "epoch": 0.02, "grad_norm": 0.9131263494491577, "learning_rate": 4.9e-05, "loss": 5.644, "step": 200 }, { "epoch": 0.021, "grad_norm": 1.7073436975479126, "learning_rate": 4.8950000000000004e-05, "loss": 5.778, "step": 210 }, { "epoch": 0.022, "grad_norm": 0.8059922456741333, "learning_rate": 4.89e-05, "loss": 5.4755, "step": 220 }, { "epoch": 0.023, "grad_norm": 1.2500686645507812, "learning_rate": 4.885e-05, "loss": 5.3769, "step": 230 }, { "epoch": 0.024, "grad_norm": 1.3848680257797241, "learning_rate": 4.88e-05, "loss": 5.2105, "step": 240 }, { "epoch": 0.025, "grad_norm": 1.2381746768951416, "learning_rate": 4.875e-05, "loss": 5.1444, "step": 250 }, { "epoch": 0.026, "grad_norm": 2.7005224227905273, "learning_rate": 4.87e-05, "loss": 5.1608, "step": 260 }, { "epoch": 0.027, "grad_norm": 1.1472671031951904, "learning_rate": 4.8650000000000003e-05, "loss": 4.9456, "step": 270 }, { "epoch": 0.028, "grad_norm": 1.9849270582199097, "learning_rate": 4.86e-05, "loss": 4.8466, "step": 280 }, { "epoch": 0.029, "grad_norm": 1.857001781463623, "learning_rate": 4.855e-05, "loss": 4.7323, "step": 290 }, { "epoch": 0.03, "grad_norm": 1.6731220483779907, "learning_rate": 4.85e-05, "loss": 
4.5786, "step": 300 }, { "epoch": 0.031, "grad_norm": 1.7968906164169312, "learning_rate": 4.845e-05, "loss": 4.4588, "step": 310 }, { "epoch": 0.032, "grad_norm": 1.7908226251602173, "learning_rate": 4.8400000000000004e-05, "loss": 4.3645, "step": 320 }, { "epoch": 0.033, "grad_norm": 2.538881540298462, "learning_rate": 4.835e-05, "loss": 4.1489, "step": 330 }, { "epoch": 0.034, "grad_norm": 2.306257486343384, "learning_rate": 4.83e-05, "loss": 3.9798, "step": 340 }, { "epoch": 0.035, "grad_norm": 2.1730940341949463, "learning_rate": 4.825e-05, "loss": 4.0231, "step": 350 }, { "epoch": 0.036, "grad_norm": 2.4211463928222656, "learning_rate": 4.82e-05, "loss": 3.8495, "step": 360 }, { "epoch": 0.037, "grad_norm": 2.3698794841766357, "learning_rate": 4.815e-05, "loss": 3.6977, "step": 370 }, { "epoch": 0.038, "grad_norm": 2.147799491882324, "learning_rate": 4.8100000000000004e-05, "loss": 3.8008, "step": 380 }, { "epoch": 0.039, "grad_norm": 2.3577606678009033, "learning_rate": 4.805e-05, "loss": 3.6983, "step": 390 }, { "epoch": 0.04, "grad_norm": 2.065912961959839, "learning_rate": 4.8e-05, "loss": 3.5738, "step": 400 }, { "epoch": 0.041, "grad_norm": 2.930288314819336, "learning_rate": 4.795e-05, "loss": 3.5117, "step": 410 }, { "epoch": 0.042, "grad_norm": 2.3703155517578125, "learning_rate": 4.79e-05, "loss": 3.2483, "step": 420 }, { "epoch": 0.043, "grad_norm": 2.6050736904144287, "learning_rate": 4.785e-05, "loss": 3.2342, "step": 430 }, { "epoch": 0.044, "grad_norm": 2.0790674686431885, "learning_rate": 4.78e-05, "loss": 3.1452, "step": 440 }, { "epoch": 0.045, "grad_norm": 2.2497427463531494, "learning_rate": 4.775e-05, "loss": 3.0316, "step": 450 }, { "epoch": 0.046, "grad_norm": 2.507902145385742, "learning_rate": 4.77e-05, "loss": 2.8938, "step": 460 }, { "epoch": 0.047, "grad_norm": 2.517744541168213, "learning_rate": 4.765e-05, "loss": 2.8137, "step": 470 }, { "epoch": 0.048, "grad_norm": 3.9981460571289062, "learning_rate": 4.76e-05, "loss": 2.9864, "step": 480 }, { "epoch": 0.049, "grad_norm": 2.265026569366455, "learning_rate": 4.755e-05, "loss": 2.7839, "step": 490 }, { "epoch": 0.05, "grad_norm": 2.257293701171875, "learning_rate": 4.75e-05, "loss": 2.6834, "step": 500 }, { "epoch": 0.051, "grad_norm": 2.6932270526885986, "learning_rate": 4.745e-05, "loss": 2.5755, "step": 510 }, { "epoch": 0.052, "grad_norm": 1.7177081108093262, "learning_rate": 4.74e-05, "loss": 2.425, "step": 520 }, { "epoch": 0.053, "grad_norm": 2.2452073097229004, "learning_rate": 4.735e-05, "loss": 2.5261, "step": 530 }, { "epoch": 0.054, "grad_norm": 2.2109947204589844, "learning_rate": 4.73e-05, "loss": 2.3825, "step": 540 }, { "epoch": 0.055, "grad_norm": 2.574531078338623, "learning_rate": 4.7249999999999997e-05, "loss": 2.3087, "step": 550 }, { "epoch": 0.056, "grad_norm": 2.3631017208099365, "learning_rate": 4.72e-05, "loss": 2.3099, "step": 560 }, { "epoch": 0.057, "grad_norm": 2.3809709548950195, "learning_rate": 4.715e-05, "loss": 2.3001, "step": 570 }, { "epoch": 0.058, "grad_norm": 2.0683534145355225, "learning_rate": 4.71e-05, "loss": 2.0813, "step": 580 }, { "epoch": 0.059, "grad_norm": 2.5471837520599365, "learning_rate": 4.705e-05, "loss": 2.0378, "step": 590 }, { "epoch": 0.06, "grad_norm": 2.585564374923706, "learning_rate": 4.7e-05, "loss": 2.2062, "step": 600 }, { "epoch": 0.061, "grad_norm": 2.062100648880005, "learning_rate": 4.695e-05, "loss": 1.9914, "step": 610 }, { "epoch": 0.062, "grad_norm": 2.1019210815429688, "learning_rate": 4.69e-05, "loss": 1.9635, "step": 620 }, { 
"epoch": 0.063, "grad_norm": 2.630436658859253, "learning_rate": 4.685000000000001e-05, "loss": 1.9123, "step": 630 }, { "epoch": 0.064, "grad_norm": 2.1028494834899902, "learning_rate": 4.6800000000000006e-05, "loss": 1.7583, "step": 640 }, { "epoch": 0.065, "grad_norm": 2.392193078994751, "learning_rate": 4.6750000000000005e-05, "loss": 1.7532, "step": 650 }, { "epoch": 0.066, "grad_norm": 2.004413366317749, "learning_rate": 4.6700000000000003e-05, "loss": 1.6978, "step": 660 }, { "epoch": 0.067, "grad_norm": 2.210513114929199, "learning_rate": 4.665e-05, "loss": 1.6311, "step": 670 }, { "epoch": 0.068, "grad_norm": 1.8464936017990112, "learning_rate": 4.660000000000001e-05, "loss": 1.5507, "step": 680 }, { "epoch": 0.069, "grad_norm": 2.0246541500091553, "learning_rate": 4.655000000000001e-05, "loss": 1.5637, "step": 690 }, { "epoch": 0.07, "grad_norm": 2.199751138687134, "learning_rate": 4.6500000000000005e-05, "loss": 1.5603, "step": 700 }, { "epoch": 0.071, "grad_norm": 2.2002196311950684, "learning_rate": 4.6450000000000004e-05, "loss": 1.4558, "step": 710 }, { "epoch": 0.072, "grad_norm": 1.7826759815216064, "learning_rate": 4.64e-05, "loss": 1.4309, "step": 720 }, { "epoch": 0.073, "grad_norm": 1.760297417640686, "learning_rate": 4.635e-05, "loss": 1.3531, "step": 730 }, { "epoch": 0.074, "grad_norm": 2.0505475997924805, "learning_rate": 4.630000000000001e-05, "loss": 1.3641, "step": 740 }, { "epoch": 0.075, "grad_norm": 2.1375396251678467, "learning_rate": 4.6250000000000006e-05, "loss": 1.3259, "step": 750 }, { "epoch": 0.076, "grad_norm": 1.8252328634262085, "learning_rate": 4.6200000000000005e-05, "loss": 1.2026, "step": 760 }, { "epoch": 0.077, "grad_norm": 1.8945906162261963, "learning_rate": 4.6150000000000004e-05, "loss": 1.2878, "step": 770 }, { "epoch": 0.078, "grad_norm": 1.7990881204605103, "learning_rate": 4.61e-05, "loss": 1.1853, "step": 780 }, { "epoch": 0.079, "grad_norm": 1.4897470474243164, "learning_rate": 4.605e-05, "loss": 1.1279, "step": 790 }, { "epoch": 0.08, "grad_norm": 2.2804617881774902, "learning_rate": 4.600000000000001e-05, "loss": 1.0804, "step": 800 }, { "epoch": 0.081, "grad_norm": 1.4800664186477661, "learning_rate": 4.5950000000000006e-05, "loss": 1.0361, "step": 810 }, { "epoch": 0.082, "grad_norm": 1.3526049852371216, "learning_rate": 4.5900000000000004e-05, "loss": 1.0585, "step": 820 }, { "epoch": 0.083, "grad_norm": 1.534173607826233, "learning_rate": 4.585e-05, "loss": 1.0206, "step": 830 }, { "epoch": 0.084, "grad_norm": 1.4844435453414917, "learning_rate": 4.58e-05, "loss": 0.9758, "step": 840 }, { "epoch": 0.085, "grad_norm": 1.533679485321045, "learning_rate": 4.575e-05, "loss": 0.9168, "step": 850 }, { "epoch": 0.086, "grad_norm": 1.456162691116333, "learning_rate": 4.5700000000000006e-05, "loss": 0.8913, "step": 860 }, { "epoch": 0.087, "grad_norm": 1.7335631847381592, "learning_rate": 4.5650000000000005e-05, "loss": 0.9154, "step": 870 }, { "epoch": 0.088, "grad_norm": 1.3331761360168457, "learning_rate": 4.5600000000000004e-05, "loss": 0.8483, "step": 880 }, { "epoch": 0.089, "grad_norm": 1.6703053712844849, "learning_rate": 4.555e-05, "loss": 0.8116, "step": 890 }, { "epoch": 0.09, "grad_norm": 1.275975227355957, "learning_rate": 4.55e-05, "loss": 0.7869, "step": 900 }, { "epoch": 0.091, "grad_norm": 1.3800309896469116, "learning_rate": 4.545000000000001e-05, "loss": 0.7637, "step": 910 }, { "epoch": 0.092, "grad_norm": 1.9472386837005615, "learning_rate": 4.5400000000000006e-05, "loss": 0.7212, "step": 920 }, { "epoch": 0.093, 
"grad_norm": 1.3451333045959473, "learning_rate": 4.5350000000000005e-05, "loss": 0.6829, "step": 930 }, { "epoch": 0.094, "grad_norm": 1.5209784507751465, "learning_rate": 4.53e-05, "loss": 0.729, "step": 940 }, { "epoch": 0.095, "grad_norm": 1.3944469690322876, "learning_rate": 4.525e-05, "loss": 0.6732, "step": 950 }, { "epoch": 0.096, "grad_norm": 1.2177132368087769, "learning_rate": 4.52e-05, "loss": 0.6188, "step": 960 }, { "epoch": 0.097, "grad_norm": 1.5988528728485107, "learning_rate": 4.5150000000000006e-05, "loss": 0.6622, "step": 970 }, { "epoch": 0.098, "grad_norm": 1.3636531829833984, "learning_rate": 4.5100000000000005e-05, "loss": 0.5792, "step": 980 }, { "epoch": 0.099, "grad_norm": 1.377453088760376, "learning_rate": 4.5050000000000004e-05, "loss": 0.6062, "step": 990 }, { "epoch": 0.1, "grad_norm": 2.295713186264038, "learning_rate": 4.5e-05, "loss": 0.5709, "step": 1000 }, { "epoch": 0.101, "grad_norm": 1.35196852684021, "learning_rate": 4.495e-05, "loss": 0.5521, "step": 1010 }, { "epoch": 0.102, "grad_norm": 1.0617187023162842, "learning_rate": 4.49e-05, "loss": 0.5147, "step": 1020 }, { "epoch": 0.103, "grad_norm": 1.3035167455673218, "learning_rate": 4.4850000000000006e-05, "loss": 0.5081, "step": 1030 }, { "epoch": 0.104, "grad_norm": 1.2835568189620972, "learning_rate": 4.4800000000000005e-05, "loss": 0.5, "step": 1040 }, { "epoch": 0.105, "grad_norm": 1.0403038263320923, "learning_rate": 4.4750000000000004e-05, "loss": 0.4825, "step": 1050 }, { "epoch": 0.106, "grad_norm": 0.9538235068321228, "learning_rate": 4.47e-05, "loss": 0.4316, "step": 1060 }, { "epoch": 0.107, "grad_norm": 1.4246289730072021, "learning_rate": 4.465e-05, "loss": 0.4304, "step": 1070 }, { "epoch": 0.108, "grad_norm": 1.1217833757400513, "learning_rate": 4.46e-05, "loss": 0.4397, "step": 1080 }, { "epoch": 0.109, "grad_norm": 1.0411335229873657, "learning_rate": 4.4550000000000005e-05, "loss": 0.4057, "step": 1090 }, { "epoch": 0.11, "grad_norm": 0.8498069643974304, "learning_rate": 4.4500000000000004e-05, "loss": 0.3933, "step": 1100 }, { "epoch": 0.111, "grad_norm": 1.1270406246185303, "learning_rate": 4.445e-05, "loss": 0.366, "step": 1110 }, { "epoch": 0.112, "grad_norm": 1.189041256904602, "learning_rate": 4.44e-05, "loss": 0.3407, "step": 1120 }, { "epoch": 0.113, "grad_norm": 0.9837467670440674, "learning_rate": 4.435e-05, "loss": 0.3511, "step": 1130 }, { "epoch": 0.114, "grad_norm": 1.0432955026626587, "learning_rate": 4.43e-05, "loss": 0.3381, "step": 1140 }, { "epoch": 0.115, "grad_norm": 0.9529951810836792, "learning_rate": 4.4250000000000005e-05, "loss": 0.3189, "step": 1150 }, { "epoch": 0.116, "grad_norm": 1.008836030960083, "learning_rate": 4.4200000000000004e-05, "loss": 0.3077, "step": 1160 }, { "epoch": 0.117, "grad_norm": 1.0005086660385132, "learning_rate": 4.415e-05, "loss": 0.3001, "step": 1170 }, { "epoch": 0.118, "grad_norm": 1.1065175533294678, "learning_rate": 4.41e-05, "loss": 0.28, "step": 1180 }, { "epoch": 0.119, "grad_norm": 0.6701949834823608, "learning_rate": 4.405e-05, "loss": 0.2692, "step": 1190 }, { "epoch": 0.12, "grad_norm": 0.7154658436775208, "learning_rate": 4.4000000000000006e-05, "loss": 0.2663, "step": 1200 }, { "epoch": 0.121, "grad_norm": 0.6997113823890686, "learning_rate": 4.3950000000000004e-05, "loss": 0.2595, "step": 1210 }, { "epoch": 0.122, "grad_norm": 0.9047608971595764, "learning_rate": 4.39e-05, "loss": 0.2558, "step": 1220 }, { "epoch": 0.123, "grad_norm": 0.8508415222167969, "learning_rate": 4.385e-05, "loss": 0.2459, "step": 1230 
}, { "epoch": 0.124, "grad_norm": 0.6505220532417297, "learning_rate": 4.38e-05, "loss": 0.2236, "step": 1240 }, { "epoch": 0.125, "grad_norm": 0.5360460877418518, "learning_rate": 4.375e-05, "loss": 0.2189, "step": 1250 }, { "epoch": 0.126, "grad_norm": 0.560817539691925, "learning_rate": 4.3700000000000005e-05, "loss": 0.2166, "step": 1260 }, { "epoch": 0.127, "grad_norm": 0.7089666128158569, "learning_rate": 4.3650000000000004e-05, "loss": 0.2026, "step": 1270 }, { "epoch": 0.128, "grad_norm": 0.5265817046165466, "learning_rate": 4.36e-05, "loss": 0.197, "step": 1280 }, { "epoch": 0.129, "grad_norm": 0.6629377007484436, "learning_rate": 4.355e-05, "loss": 0.1934, "step": 1290 }, { "epoch": 0.13, "grad_norm": 1.0730735063552856, "learning_rate": 4.35e-05, "loss": 0.1807, "step": 1300 }, { "epoch": 0.131, "grad_norm": 0.6990699172019958, "learning_rate": 4.345e-05, "loss": 0.1845, "step": 1310 }, { "epoch": 0.132, "grad_norm": 0.5047340393066406, "learning_rate": 4.3400000000000005e-05, "loss": 0.1725, "step": 1320 }, { "epoch": 0.133, "grad_norm": 0.6830994486808777, "learning_rate": 4.335e-05, "loss": 0.1687, "step": 1330 }, { "epoch": 0.134, "grad_norm": 0.5861710906028748, "learning_rate": 4.33e-05, "loss": 0.1671, "step": 1340 }, { "epoch": 0.135, "grad_norm": 0.43594300746917725, "learning_rate": 4.325e-05, "loss": 0.1467, "step": 1350 }, { "epoch": 0.136, "grad_norm": 0.44587692618370056, "learning_rate": 4.32e-05, "loss": 0.1509, "step": 1360 }, { "epoch": 0.137, "grad_norm": 0.5523977875709534, "learning_rate": 4.315e-05, "loss": 0.1434, "step": 1370 }, { "epoch": 0.138, "grad_norm": 0.6139170527458191, "learning_rate": 4.3100000000000004e-05, "loss": 0.1433, "step": 1380 }, { "epoch": 0.139, "grad_norm": 0.6169497966766357, "learning_rate": 4.305e-05, "loss": 0.1365, "step": 1390 }, { "epoch": 0.14, "grad_norm": 0.49120134115219116, "learning_rate": 4.3e-05, "loss": 0.1287, "step": 1400 }, { "epoch": 0.141, "grad_norm": 0.451753169298172, "learning_rate": 4.295e-05, "loss": 0.1142, "step": 1410 }, { "epoch": 0.142, "grad_norm": 0.5429627895355225, "learning_rate": 4.29e-05, "loss": 0.134, "step": 1420 }, { "epoch": 0.143, "grad_norm": 0.7613041400909424, "learning_rate": 4.285e-05, "loss": 0.1391, "step": 1430 }, { "epoch": 0.144, "grad_norm": 0.4953358471393585, "learning_rate": 4.2800000000000004e-05, "loss": 0.1197, "step": 1440 }, { "epoch": 0.145, "grad_norm": 0.3657626509666443, "learning_rate": 4.275e-05, "loss": 0.1071, "step": 1450 }, { "epoch": 0.146, "grad_norm": 0.44240206480026245, "learning_rate": 4.27e-05, "loss": 0.1111, "step": 1460 }, { "epoch": 0.147, "grad_norm": 0.5007165670394897, "learning_rate": 4.265e-05, "loss": 0.1056, "step": 1470 }, { "epoch": 0.148, "grad_norm": 0.4580256938934326, "learning_rate": 4.26e-05, "loss": 0.1049, "step": 1480 }, { "epoch": 0.149, "grad_norm": 0.4970822036266327, "learning_rate": 4.2550000000000004e-05, "loss": 0.1032, "step": 1490 }, { "epoch": 0.15, "grad_norm": 0.4138182997703552, "learning_rate": 4.25e-05, "loss": 0.0961, "step": 1500 }, { "epoch": 0.151, "grad_norm": 0.4013712406158447, "learning_rate": 4.245e-05, "loss": 0.0949, "step": 1510 }, { "epoch": 0.152, "grad_norm": 0.3868940770626068, "learning_rate": 4.24e-05, "loss": 0.0837, "step": 1520 }, { "epoch": 0.153, "grad_norm": 0.3113015294075012, "learning_rate": 4.235e-05, "loss": 0.0909, "step": 1530 }, { "epoch": 0.154, "grad_norm": 0.3569623529911041, "learning_rate": 4.23e-05, "loss": 0.0908, "step": 1540 }, { "epoch": 0.155, "grad_norm": 
0.3841746151447296, "learning_rate": 4.2250000000000004e-05, "loss": 0.0806, "step": 1550 }, { "epoch": 0.156, "grad_norm": 0.6565550565719604, "learning_rate": 4.22e-05, "loss": 0.075, "step": 1560 }, { "epoch": 0.157, "grad_norm": 0.4816874563694, "learning_rate": 4.215e-05, "loss": 0.0858, "step": 1570 }, { "epoch": 0.158, "grad_norm": 0.30408933758735657, "learning_rate": 4.21e-05, "loss": 0.0704, "step": 1580 }, { "epoch": 0.159, "grad_norm": 0.43388792872428894, "learning_rate": 4.205e-05, "loss": 0.0671, "step": 1590 }, { "epoch": 0.16, "grad_norm": 0.33304253220558167, "learning_rate": 4.2e-05, "loss": 0.07, "step": 1600 }, { "epoch": 0.161, "grad_norm": 0.4260387420654297, "learning_rate": 4.195e-05, "loss": 0.0691, "step": 1610 }, { "epoch": 0.162, "grad_norm": 0.37930798530578613, "learning_rate": 4.19e-05, "loss": 0.0715, "step": 1620 }, { "epoch": 0.163, "grad_norm": 0.3198983669281006, "learning_rate": 4.185e-05, "loss": 0.0651, "step": 1630 }, { "epoch": 0.164, "grad_norm": 0.3510359823703766, "learning_rate": 4.18e-05, "loss": 0.058, "step": 1640 }, { "epoch": 0.165, "grad_norm": 0.41047966480255127, "learning_rate": 4.175e-05, "loss": 0.065, "step": 1650 }, { "epoch": 0.166, "grad_norm": 0.3054174482822418, "learning_rate": 4.17e-05, "loss": 0.0564, "step": 1660 }, { "epoch": 0.167, "grad_norm": 0.29319772124290466, "learning_rate": 4.165e-05, "loss": 0.0599, "step": 1670 }, { "epoch": 0.168, "grad_norm": 0.257354736328125, "learning_rate": 4.16e-05, "loss": 0.0536, "step": 1680 }, { "epoch": 0.169, "grad_norm": 0.25215694308280945, "learning_rate": 4.155e-05, "loss": 0.0587, "step": 1690 }, { "epoch": 0.17, "grad_norm": 0.4573931097984314, "learning_rate": 4.15e-05, "loss": 0.0524, "step": 1700 }, { "epoch": 0.171, "grad_norm": 0.3514876663684845, "learning_rate": 4.145e-05, "loss": 0.0551, "step": 1710 }, { "epoch": 0.172, "grad_norm": 0.3239930272102356, "learning_rate": 4.14e-05, "loss": 0.0499, "step": 1720 }, { "epoch": 0.173, "grad_norm": 0.20213039219379425, "learning_rate": 4.135e-05, "loss": 0.0521, "step": 1730 }, { "epoch": 0.174, "grad_norm": 0.21831783652305603, "learning_rate": 4.13e-05, "loss": 0.0469, "step": 1740 }, { "epoch": 0.175, "grad_norm": 0.2585163712501526, "learning_rate": 4.125e-05, "loss": 0.0469, "step": 1750 }, { "epoch": 0.176, "grad_norm": 0.21717113256454468, "learning_rate": 4.12e-05, "loss": 0.0455, "step": 1760 }, { "epoch": 0.177, "grad_norm": 0.27248838543891907, "learning_rate": 4.115e-05, "loss": 0.046, "step": 1770 }, { "epoch": 0.178, "grad_norm": 0.2503461241722107, "learning_rate": 4.11e-05, "loss": 0.0447, "step": 1780 }, { "epoch": 0.179, "grad_norm": 0.27404382824897766, "learning_rate": 4.105e-05, "loss": 0.0437, "step": 1790 }, { "epoch": 0.18, "grad_norm": 0.23549066483974457, "learning_rate": 4.1e-05, "loss": 0.0423, "step": 1800 }, { "epoch": 0.181, "grad_norm": 0.19369937479496002, "learning_rate": 4.095e-05, "loss": 0.0408, "step": 1810 }, { "epoch": 0.182, "grad_norm": 0.20560242235660553, "learning_rate": 4.09e-05, "loss": 0.0379, "step": 1820 }, { "epoch": 0.183, "grad_norm": 0.34989863634109497, "learning_rate": 4.085e-05, "loss": 0.0364, "step": 1830 }, { "epoch": 0.184, "grad_norm": 0.2310326248407364, "learning_rate": 4.08e-05, "loss": 0.0385, "step": 1840 }, { "epoch": 0.185, "grad_norm": 0.21055462956428528, "learning_rate": 4.075e-05, "loss": 0.0351, "step": 1850 }, { "epoch": 0.186, "grad_norm": 0.3251895308494568, "learning_rate": 4.07e-05, "loss": 0.0381, "step": 1860 }, { "epoch": 0.187, "grad_norm": 
0.2887445390224457, "learning_rate": 4.065e-05, "loss": 0.0341, "step": 1870 }, { "epoch": 0.188, "grad_norm": 0.15948843955993652, "learning_rate": 4.0600000000000004e-05, "loss": 0.0313, "step": 1880 }, { "epoch": 0.189, "grad_norm": 0.2413359135389328, "learning_rate": 4.055e-05, "loss": 0.0338, "step": 1890 }, { "epoch": 0.19, "grad_norm": 0.2132706493139267, "learning_rate": 4.05e-05, "loss": 0.0339, "step": 1900 }, { "epoch": 0.191, "grad_norm": 0.17968431115150452, "learning_rate": 4.045000000000001e-05, "loss": 0.0317, "step": 1910 }, { "epoch": 0.192, "grad_norm": 0.15828929841518402, "learning_rate": 4.0400000000000006e-05, "loss": 0.0302, "step": 1920 }, { "epoch": 0.193, "grad_norm": 0.18106874823570251, "learning_rate": 4.0350000000000005e-05, "loss": 0.0331, "step": 1930 }, { "epoch": 0.194, "grad_norm": 0.34827324748039246, "learning_rate": 4.0300000000000004e-05, "loss": 0.032, "step": 1940 }, { "epoch": 0.195, "grad_norm": 0.21621111035346985, "learning_rate": 4.025e-05, "loss": 0.0317, "step": 1950 }, { "epoch": 0.196, "grad_norm": 0.2159423679113388, "learning_rate": 4.02e-05, "loss": 0.0296, "step": 1960 }, { "epoch": 0.197, "grad_norm": 0.17750391364097595, "learning_rate": 4.015000000000001e-05, "loss": 0.0297, "step": 1970 }, { "epoch": 0.198, "grad_norm": 0.13952311873435974, "learning_rate": 4.0100000000000006e-05, "loss": 0.0279, "step": 1980 }, { "epoch": 0.199, "grad_norm": 0.19622887670993805, "learning_rate": 4.0050000000000004e-05, "loss": 0.0278, "step": 1990 }, { "epoch": 0.2, "grad_norm": 0.14959514141082764, "learning_rate": 4e-05, "loss": 0.0251, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.3924406673408e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }
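
The JSON above appears to be a trainer state file of the kind the Hugging Face transformers Trainer writes alongside checkpoints, with each log_history entry recording epoch, grad_norm, learning_rate, loss, and step. As a minimal sketch of how such a file could be consumed, the Python below loads it and plots the logged training loss against the global step; the filename "trainer_state.json" and the use of matplotlib are assumptions, not part of the original file.

# Minimal sketch (not from the original file): read the trainer state and plot
# loss vs. global step. Assumes the JSON is saved as "trainer_state.json" (the
# name the Hugging Face Trainer typically uses) and that matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (evaluation entries would not).
history = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in history]
losses = [entry["loss"] for entry in history]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title(f"Loss over the first {state['global_step']} of {state['max_steps']} steps")
plt.savefig("loss_curve.png")

Run as-is in the checkpoint directory, this would produce loss_curve.png showing the smooth decay from roughly 9.6 at step 10 to about 0.025 at step 2000 that the log_history records.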