diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,758 +1,14647 @@ { - "best_metric": 0.897123396396637, - "best_model_checkpoint": "./cocoa_outputs_resnet/checkpoint-980", - "epoch": 5.0, + "best_metric": 0.26566287875175476, + "best_model_checkpoint": "./cocoa_outputs_resnet/checkpoint-6468", + "epoch": 100.0, "eval_steps": 500, - "global_step": 980, + "global_step": 19600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05102040816326531, - "grad_norm": 9.393156051635742, - "learning_rate": 1.979591836734694e-05, - "loss": 1.8572, + "grad_norm": 4.57908821105957, + "learning_rate": 1.998979591836735e-05, + "loss": 1.7911, "step": 10 }, { "epoch": 0.10204081632653061, - "grad_norm": 6.8461503982543945, - "learning_rate": 1.9591836734693877e-05, - "loss": 1.8224, + "grad_norm": 4.909427165985107, + "learning_rate": 1.9979591836734697e-05, + "loss": 1.7576, "step": 20 }, { "epoch": 0.15306122448979592, - "grad_norm": 8.049448013305664, - "learning_rate": 1.9387755102040817e-05, - "loss": 1.7955, + "grad_norm": 4.646119117736816, + "learning_rate": 1.9969387755102042e-05, + "loss": 1.7171, "step": 30 }, { "epoch": 0.20408163265306123, - "grad_norm": 6.181807041168213, - "learning_rate": 1.9183673469387756e-05, - "loss": 1.7641, + "grad_norm": 4.798040866851807, + "learning_rate": 1.9959183673469388e-05, + "loss": 1.6832, "step": 40 }, { "epoch": 0.25510204081632654, - "grad_norm": 6.991772651672363, - "learning_rate": 1.8979591836734696e-05, - "loss": 1.7421, + "grad_norm": 4.766178131103516, + "learning_rate": 1.9948979591836737e-05, + "loss": 1.615, "step": 50 }, { "epoch": 0.30612244897959184, - "grad_norm": 6.233796119689941, - "learning_rate": 1.8775510204081636e-05, - "loss": 1.7018, + "grad_norm": 4.4977617263793945, + "learning_rate": 1.9938775510204083e-05, + "loss": 1.5784, "step": 60 }, { "epoch": 0.35714285714285715, - "grad_norm": 7.064742088317871, - "learning_rate": 1.8571428571428575e-05, - "loss": 1.6763, + "grad_norm": 4.778722286224365, + "learning_rate": 1.992857142857143e-05, + "loss": 1.5465, "step": 70 }, { "epoch": 0.40816326530612246, - "grad_norm": 7.047808647155762, - "learning_rate": 1.836734693877551e-05, - "loss": 1.654, + "grad_norm": 4.473388195037842, + "learning_rate": 1.9918367346938775e-05, + "loss": 1.506, "step": 80 }, { "epoch": 0.45918367346938777, - "grad_norm": 8.950271606445312, - "learning_rate": 1.816326530612245e-05, - "loss": 1.6461, + "grad_norm": 4.812692642211914, + "learning_rate": 1.9908163265306124e-05, + "loss": 1.4882, "step": 90 }, { "epoch": 0.5102040816326531, - "grad_norm": 7.686429023742676, - "learning_rate": 1.795918367346939e-05, - "loss": 1.6049, + "grad_norm": 4.0863494873046875, + "learning_rate": 1.9897959183673473e-05, + "loss": 1.4078, "step": 100 }, { "epoch": 0.5612244897959183, - "grad_norm": 6.434399127960205, - "learning_rate": 1.7755102040816327e-05, - "loss": 1.5926, + "grad_norm": 4.4336323738098145, + "learning_rate": 1.988775510204082e-05, + "loss": 1.404, "step": 110 }, { "epoch": 0.6122448979591837, - "grad_norm": 6.048853397369385, - "learning_rate": 1.7551020408163266e-05, - "loss": 1.5614, + "grad_norm": 4.0491719245910645, + "learning_rate": 1.9877551020408165e-05, + "loss": 1.34, "step": 120 }, { "epoch": 0.6632653061224489, - "grad_norm": 12.146856307983398, - "learning_rate": 1.7346938775510206e-05, - "loss": 1.5344, + "grad_norm": 4.225604057312012, + "learning_rate": 1.986734693877551e-05, + "loss": 1.2999, "step": 130 }, { "epoch": 0.7142857142857143, - "grad_norm": 8.939432144165039, - "learning_rate": 1.7142857142857142e-05, - "loss": 1.4892, + "grad_norm": 4.075715065002441, + "learning_rate": 1.985714285714286e-05, + "loss": 1.2529, "step": 140 }, { "epoch": 0.7653061224489796, - "grad_norm": 16.924053192138672, - "learning_rate": 1.6938775510204085e-05, - "loss": 1.4829, + "grad_norm": 4.563647747039795, + "learning_rate": 1.9846938775510205e-05, + "loss": 1.2366, "step": 150 }, { "epoch": 0.8163265306122449, - "grad_norm": 10.929306983947754, - "learning_rate": 1.673469387755102e-05, - "loss": 1.5136, + "grad_norm": 4.373153209686279, + "learning_rate": 1.983673469387755e-05, + "loss": 1.1994, "step": 160 }, { "epoch": 0.8673469387755102, - "grad_norm": 12.943446159362793, - "learning_rate": 1.653061224489796e-05, - "loss": 1.4316, + "grad_norm": 4.007843971252441, + "learning_rate": 1.9826530612244897e-05, + "loss": 1.149, "step": 170 }, { "epoch": 0.9183673469387755, - "grad_norm": 6.890258312225342, - "learning_rate": 1.63265306122449e-05, - "loss": 1.4373, + "grad_norm": 3.800847291946411, + "learning_rate": 1.9816326530612246e-05, + "loss": 1.137, "step": 180 }, { "epoch": 0.9693877551020408, - "grad_norm": 9.790729522705078, - "learning_rate": 1.612244897959184e-05, - "loss": 1.4094, + "grad_norm": 3.8902652263641357, + "learning_rate": 1.9806122448979595e-05, + "loss": 1.0627, "step": 190 }, { "epoch": 1.0, - "eval_accuracy": 0.8592057761732852, - "eval_loss": 1.4767718315124512, - "eval_runtime": 14.5787, - "eval_samples_per_second": 19.0, - "eval_steps_per_second": 2.401, + "eval_accuracy": 0.5595667870036101, + "eval_loss": 1.5222723484039307, + "eval_runtime": 0.9755, + "eval_samples_per_second": 283.96, + "eval_steps_per_second": 35.879, "step": 196 }, { "epoch": 1.0204081632653061, - "grad_norm": 20.24932861328125, - "learning_rate": 1.5918367346938776e-05, - "loss": 1.4063, + "grad_norm": 5.571325778961182, + "learning_rate": 1.979591836734694e-05, + "loss": 1.0586, "step": 200 }, { "epoch": 1.0714285714285714, - "grad_norm": 14.811591148376465, - "learning_rate": 1.5714285714285715e-05, - "loss": 1.3551, + "grad_norm": 5.324435710906982, + "learning_rate": 1.9785714285714287e-05, + "loss": 1.009, "step": 210 }, { "epoch": 1.1224489795918366, - "grad_norm": 18.894763946533203, - "learning_rate": 1.5510204081632655e-05, - "loss": 1.3314, + "grad_norm": 4.557438850402832, + "learning_rate": 1.9775510204081633e-05, + "loss": 0.9619, "step": 220 }, { "epoch": 1.1734693877551021, - "grad_norm": 13.06529426574707, - "learning_rate": 1.530612244897959e-05, - "loss": 1.3112, + "grad_norm": 4.46217679977417, + "learning_rate": 1.9765306122448982e-05, + "loss": 0.9068, "step": 230 }, { "epoch": 1.2244897959183674, - "grad_norm": 16.423351287841797, - "learning_rate": 1.510204081632653e-05, - "loss": 1.2519, + "grad_norm": 3.9600579738616943, + "learning_rate": 1.9755102040816328e-05, + "loss": 0.8059, "step": 240 }, { "epoch": 1.2755102040816326, - "grad_norm": 11.016277313232422, - "learning_rate": 1.4897959183673472e-05, - "loss": 1.3565, + "grad_norm": 5.61907958984375, + "learning_rate": 1.9744897959183677e-05, + "loss": 0.9749, "step": 250 }, { "epoch": 1.3265306122448979, - "grad_norm": 7.271716594696045, - "learning_rate": 1.469387755102041e-05, - "loss": 1.2681, + "grad_norm": 3.644207239151001, + "learning_rate": 1.9734693877551023e-05, + "loss": 0.8702, "step": 260 }, { "epoch": 1.3775510204081631, - "grad_norm": 8.437969207763672, - "learning_rate": 1.448979591836735e-05, - "loss": 1.2675, + "grad_norm": 3.3476579189300537, + "learning_rate": 1.972448979591837e-05, + "loss": 0.8272, "step": 270 }, { "epoch": 1.4285714285714286, - "grad_norm": 6.352739334106445, - "learning_rate": 1.4285714285714287e-05, - "loss": 1.1948, + "grad_norm": 3.4033284187316895, + "learning_rate": 1.9714285714285718e-05, + "loss": 0.7383, "step": 280 }, { "epoch": 1.4795918367346939, - "grad_norm": 17.921051025390625, - "learning_rate": 1.4081632653061225e-05, - "loss": 1.3366, + "grad_norm": 5.9955034255981445, + "learning_rate": 1.9704081632653063e-05, + "loss": 0.9357, "step": 290 }, { "epoch": 1.5306122448979593, - "grad_norm": 22.707721710205078, - "learning_rate": 1.3877551020408165e-05, - "loss": 1.2684, + "grad_norm": 5.240922451019287, + "learning_rate": 1.969387755102041e-05, + "loss": 0.8436, "step": 300 }, { "epoch": 1.5816326530612246, - "grad_norm": 8.037663459777832, - "learning_rate": 1.3673469387755102e-05, - "loss": 1.2322, + "grad_norm": 2.973389148712158, + "learning_rate": 1.9683673469387755e-05, + "loss": 0.7505, "step": 310 }, { "epoch": 1.6326530612244898, - "grad_norm": 9.421684265136719, - "learning_rate": 1.3469387755102042e-05, - "loss": 1.1945, + "grad_norm": 4.997092247009277, + "learning_rate": 1.9673469387755104e-05, + "loss": 0.7463, "step": 320 }, { "epoch": 1.683673469387755, - "grad_norm": 27.85601043701172, - "learning_rate": 1.326530612244898e-05, - "loss": 1.2519, + "grad_norm": 5.728514671325684, + "learning_rate": 1.966326530612245e-05, + "loss": 0.8559, "step": 330 }, { "epoch": 1.7346938775510203, - "grad_norm": 15.291147232055664, - "learning_rate": 1.3061224489795918e-05, - "loss": 1.1934, + "grad_norm": 4.56866455078125, + "learning_rate": 1.96530612244898e-05, + "loss": 0.7695, "step": 340 }, { "epoch": 1.7857142857142856, - "grad_norm": 10.985498428344727, - "learning_rate": 1.2857142857142859e-05, - "loss": 1.123, + "grad_norm": 5.454906940460205, + "learning_rate": 1.9642857142857145e-05, + "loss": 0.6812, "step": 350 }, { "epoch": 1.836734693877551, - "grad_norm": 9.827385902404785, - "learning_rate": 1.2653061224489798e-05, - "loss": 1.1677, + "grad_norm": 3.0533580780029297, + "learning_rate": 1.963265306122449e-05, + "loss": 0.7244, "step": 360 }, { "epoch": 1.8877551020408163, - "grad_norm": 16.994632720947266, - "learning_rate": 1.2448979591836736e-05, - "loss": 1.171, + "grad_norm": 5.244307994842529, + "learning_rate": 1.962244897959184e-05, + "loss": 0.7089, "step": 370 }, { "epoch": 1.9387755102040818, - "grad_norm": 16.16167449951172, - "learning_rate": 1.2244897959183674e-05, - "loss": 1.1488, + "grad_norm": 2.4669547080993652, + "learning_rate": 1.9612244897959186e-05, + "loss": 0.7261, "step": 380 }, { "epoch": 1.989795918367347, - "grad_norm": 12.466065406799316, - "learning_rate": 1.2040816326530614e-05, - "loss": 1.0664, + "grad_norm": 3.281259059906006, + "learning_rate": 1.960204081632653e-05, + "loss": 0.591, "step": 390 }, { "epoch": 2.0, - "eval_accuracy": 0.8628158844765343, - "eval_loss": 1.2089825868606567, - "eval_runtime": 17.1021, - "eval_samples_per_second": 16.197, - "eval_steps_per_second": 2.047, + "eval_accuracy": 0.8303249097472925, + "eval_loss": 0.8974758386611938, + "eval_runtime": 0.9365, + "eval_samples_per_second": 295.777, + "eval_steps_per_second": 37.373, "step": 392 }, { "epoch": 2.0408163265306123, - "grad_norm": 14.415572166442871, - "learning_rate": 1.1836734693877552e-05, - "loss": 1.1081, + "grad_norm": 3.226776361465454, + "learning_rate": 1.9591836734693877e-05, + "loss": 0.5875, "step": 400 }, { "epoch": 2.0918367346938775, - "grad_norm": 16.120677947998047, - "learning_rate": 1.1632653061224491e-05, - "loss": 1.1347, + "grad_norm": 4.076236724853516, + "learning_rate": 1.9581632653061227e-05, + "loss": 0.5969, "step": 410 }, { "epoch": 2.142857142857143, - "grad_norm": 9.960640907287598, - "learning_rate": 1.1428571428571429e-05, - "loss": 1.0755, + "grad_norm": 6.054863929748535, + "learning_rate": 1.9571428571428572e-05, + "loss": 0.6501, "step": 420 }, { "epoch": 2.193877551020408, - "grad_norm": 14.734106063842773, - "learning_rate": 1.1224489795918367e-05, - "loss": 1.0976, + "grad_norm": 3.8678038120269775, + "learning_rate": 1.956122448979592e-05, + "loss": 0.602, "step": 430 }, { "epoch": 2.2448979591836733, - "grad_norm": 12.98324203491211, - "learning_rate": 1.1020408163265306e-05, - "loss": 0.9844, + "grad_norm": 3.923830032348633, + "learning_rate": 1.9551020408163267e-05, + "loss": 0.5332, "step": 440 }, { "epoch": 2.295918367346939, - "grad_norm": 8.834336280822754, - "learning_rate": 1.0816326530612246e-05, - "loss": 1.0923, + "grad_norm": 2.2238922119140625, + "learning_rate": 1.9540816326530613e-05, + "loss": 0.6505, "step": 450 }, { "epoch": 2.3469387755102042, - "grad_norm": 10.948126792907715, - "learning_rate": 1.0612244897959186e-05, - "loss": 0.9711, + "grad_norm": 3.5019683837890625, + "learning_rate": 1.9530612244897962e-05, + "loss": 0.5529, "step": 460 }, { "epoch": 2.3979591836734695, - "grad_norm": 13.707374572753906, - "learning_rate": 1.0408163265306123e-05, - "loss": 1.0786, + "grad_norm": 3.935518264770508, + "learning_rate": 1.9520408163265308e-05, + "loss": 0.6241, "step": 470 }, { "epoch": 2.4489795918367347, - "grad_norm": 18.081064224243164, - "learning_rate": 1.0204081632653063e-05, - "loss": 1.0399, + "grad_norm": 2.3772923946380615, + "learning_rate": 1.9510204081632654e-05, + "loss": 0.6224, "step": 480 }, { "epoch": 2.5, - "grad_norm": 12.470099449157715, - "learning_rate": 1e-05, - "loss": 0.9344, + "grad_norm": 6.30583381652832, + "learning_rate": 1.95e-05, + "loss": 0.4854, "step": 490 }, { "epoch": 2.5510204081632653, - "grad_norm": 10.96716022491455, - "learning_rate": 9.795918367346939e-06, - "loss": 1.043, + "grad_norm": 2.9505112171173096, + "learning_rate": 1.948979591836735e-05, + "loss": 0.6642, "step": 500 }, { "epoch": 2.6020408163265305, - "grad_norm": 7.244050025939941, - "learning_rate": 9.591836734693878e-06, - "loss": 0.9058, + "grad_norm": 2.7075178623199463, + "learning_rate": 1.9479591836734695e-05, + "loss": 0.5366, "step": 510 }, { "epoch": 2.6530612244897958, - "grad_norm": 10.213088035583496, - "learning_rate": 9.387755102040818e-06, - "loss": 0.9683, + "grad_norm": 2.4478352069854736, + "learning_rate": 1.9469387755102044e-05, + "loss": 0.52, "step": 520 }, { "epoch": 2.704081632653061, - "grad_norm": 30.410015106201172, - "learning_rate": 9.183673469387756e-06, - "loss": 1.0396, + "grad_norm": 6.208697319030762, + "learning_rate": 1.945918367346939e-05, + "loss": 0.6528, "step": 530 }, { "epoch": 2.7551020408163263, - "grad_norm": 10.457310676574707, - "learning_rate": 8.979591836734695e-06, - "loss": 1.0215, + "grad_norm": 5.765097141265869, + "learning_rate": 1.9448979591836735e-05, + "loss": 0.683, "step": 540 }, { "epoch": 2.806122448979592, - "grad_norm": 12.294684410095215, - "learning_rate": 8.775510204081633e-06, - "loss": 0.9301, + "grad_norm": 2.4912049770355225, + "learning_rate": 1.9438775510204085e-05, + "loss": 0.4997, "step": 550 }, { "epoch": 2.857142857142857, - "grad_norm": 10.194703102111816, - "learning_rate": 8.571428571428571e-06, - "loss": 0.9541, + "grad_norm": 3.521888494491577, + "learning_rate": 1.942857142857143e-05, + "loss": 0.5453, "step": 560 }, { "epoch": 2.9081632653061225, - "grad_norm": 17.80396842956543, - "learning_rate": 8.36734693877551e-06, - "loss": 0.964, + "grad_norm": 3.898709535598755, + "learning_rate": 1.941836734693878e-05, + "loss": 0.5242, "step": 570 }, { "epoch": 2.9591836734693877, - "grad_norm": 12.601577758789062, - "learning_rate": 8.16326530612245e-06, - "loss": 1.0295, + "grad_norm": 6.8352437019348145, + "learning_rate": 1.9408163265306122e-05, + "loss": 0.6623, "step": 580 }, { "epoch": 3.0, - "eval_accuracy": 0.8628158844765343, - "eval_loss": 0.9923866987228394, - "eval_runtime": 14.8495, - "eval_samples_per_second": 18.654, - "eval_steps_per_second": 2.357, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.6563544273376465, + "eval_runtime": 0.9345, + "eval_samples_per_second": 296.427, + "eval_steps_per_second": 37.455, "step": 588 }, { "epoch": 3.010204081632653, - "grad_norm": 12.689225196838379, - "learning_rate": 7.959183673469388e-06, - "loss": 0.9564, + "grad_norm": 6.536755084991455, + "learning_rate": 1.939795918367347e-05, + "loss": 0.5816, "step": 590 }, { "epoch": 3.061224489795918, - "grad_norm": 13.568258285522461, - "learning_rate": 7.755102040816327e-06, - "loss": 1.0031, + "grad_norm": 4.247134685516357, + "learning_rate": 1.9387755102040817e-05, + "loss": 0.6536, "step": 600 }, { "epoch": 3.1122448979591835, - "grad_norm": 15.86646842956543, - "learning_rate": 7.551020408163265e-06, - "loss": 0.9434, + "grad_norm": 5.93636417388916, + "learning_rate": 1.9377551020408166e-05, + "loss": 0.5808, "step": 610 }, { "epoch": 3.163265306122449, - "grad_norm": 16.88734245300293, - "learning_rate": 7.346938775510205e-06, - "loss": 0.9468, + "grad_norm": 3.8680365085601807, + "learning_rate": 1.9367346938775512e-05, + "loss": 0.5401, "step": 620 }, { "epoch": 3.2142857142857144, - "grad_norm": 14.894389152526855, - "learning_rate": 7.1428571428571436e-06, - "loss": 0.9284, + "grad_norm": 4.432985305786133, + "learning_rate": 1.9357142857142858e-05, + "loss": 0.5204, "step": 630 }, { "epoch": 3.2653061224489797, - "grad_norm": 13.203142166137695, - "learning_rate": 6.938775510204082e-06, - "loss": 0.9288, + "grad_norm": 6.848782062530518, + "learning_rate": 1.9346938775510207e-05, + "loss": 0.5037, "step": 640 }, { "epoch": 3.316326530612245, - "grad_norm": 13.70313835144043, - "learning_rate": 6.734693877551021e-06, - "loss": 0.8879, + "grad_norm": 3.865110158920288, + "learning_rate": 1.9336734693877553e-05, + "loss": 0.4718, "step": 650 }, { "epoch": 3.36734693877551, - "grad_norm": 22.619674682617188, - "learning_rate": 6.530612244897959e-06, - "loss": 0.9367, + "grad_norm": 7.135827541351318, + "learning_rate": 1.9326530612244902e-05, + "loss": 0.5719, "step": 660 }, { "epoch": 3.4183673469387754, - "grad_norm": 17.59436798095703, - "learning_rate": 6.326530612244899e-06, - "loss": 0.9176, + "grad_norm": 6.565048694610596, + "learning_rate": 1.9316326530612248e-05, + "loss": 0.5502, "step": 670 }, { "epoch": 3.4693877551020407, - "grad_norm": 12.488346099853516, - "learning_rate": 6.122448979591837e-06, - "loss": 0.9448, + "grad_norm": 3.097395420074463, + "learning_rate": 1.9306122448979593e-05, + "loss": 0.6063, "step": 680 }, { "epoch": 3.520408163265306, - "grad_norm": 7.665136337280273, - "learning_rate": 5.918367346938776e-06, - "loss": 0.8446, + "grad_norm": 2.708237886428833, + "learning_rate": 1.929591836734694e-05, + "loss": 0.4113, "step": 690 }, { "epoch": 3.571428571428571, - "grad_norm": 7.547515392303467, - "learning_rate": 5.7142857142857145e-06, - "loss": 0.8465, + "grad_norm": 3.084146738052368, + "learning_rate": 1.928571428571429e-05, + "loss": 0.4986, "step": 700 }, { "epoch": 3.622448979591837, - "grad_norm": 5.4680304527282715, - "learning_rate": 5.510204081632653e-06, - "loss": 0.8302, + "grad_norm": 1.5593074560165405, + "learning_rate": 1.9275510204081634e-05, + "loss": 0.465, "step": 710 }, { "epoch": 3.673469387755102, - "grad_norm": 22.36733055114746, - "learning_rate": 5.306122448979593e-06, - "loss": 0.9601, + "grad_norm": 3.1597211360931396, + "learning_rate": 1.926530612244898e-05, + "loss": 0.611, "step": 720 }, { "epoch": 3.7244897959183674, - "grad_norm": 12.001816749572754, - "learning_rate": 5.1020408163265315e-06, - "loss": 0.8835, + "grad_norm": 4.91085147857666, + "learning_rate": 1.925510204081633e-05, + "loss": 0.5452, "step": 730 }, { "epoch": 3.7755102040816326, - "grad_norm": 23.104984283447266, - "learning_rate": 4.897959183673469e-06, - "loss": 0.8024, + "grad_norm": 4.3785505294799805, + "learning_rate": 1.9244897959183675e-05, + "loss": 0.3833, "step": 740 }, { "epoch": 3.826530612244898, - "grad_norm": 13.406716346740723, - "learning_rate": 4.693877551020409e-06, - "loss": 1.0554, + "grad_norm": 1.9421954154968262, + "learning_rate": 1.9234693877551024e-05, + "loss": 0.7101, "step": 750 }, { "epoch": 3.877551020408163, - "grad_norm": 9.763223648071289, - "learning_rate": 4.489795918367348e-06, - "loss": 0.79, + "grad_norm": 2.2657201290130615, + "learning_rate": 1.922448979591837e-05, + "loss": 0.3817, "step": 760 }, { "epoch": 3.928571428571429, - "grad_norm": 10.93140983581543, - "learning_rate": 4.2857142857142855e-06, - "loss": 0.8532, + "grad_norm": 2.7672324180603027, + "learning_rate": 1.9214285714285716e-05, + "loss": 0.4854, "step": 770 }, { "epoch": 3.979591836734694, - "grad_norm": 10.841426849365234, - "learning_rate": 4.081632653061225e-06, - "loss": 0.8401, + "grad_norm": 2.3359146118164062, + "learning_rate": 1.920408163265306e-05, + "loss": 0.4874, "step": 780 }, { "epoch": 4.0, - "eval_accuracy": 0.8628158844765343, - "eval_loss": 0.9142875075340271, - "eval_runtime": 14.5094, - "eval_samples_per_second": 19.091, - "eval_steps_per_second": 2.412, + "eval_accuracy": 0.8339350180505415, + "eval_loss": 0.6841569542884827, + "eval_runtime": 0.9488, + "eval_samples_per_second": 291.94, + "eval_steps_per_second": 36.888, "step": 784 }, { "epoch": 4.030612244897959, - "grad_norm": 11.179267883300781, - "learning_rate": 3.877551020408164e-06, - "loss": 0.8061, + "grad_norm": 2.9338905811309814, + "learning_rate": 1.919387755102041e-05, + "loss": 0.4634, "step": 790 }, { "epoch": 4.081632653061225, - "grad_norm": 6.5305585861206055, - "learning_rate": 3.6734693877551024e-06, - "loss": 0.8525, + "grad_norm": 1.7526339292526245, + "learning_rate": 1.9183673469387756e-05, + "loss": 0.5248, "step": 800 }, { "epoch": 4.13265306122449, - "grad_norm": 7.101966857910156, - "learning_rate": 3.469387755102041e-06, - "loss": 0.7967, + "grad_norm": 2.1538479328155518, + "learning_rate": 1.9173469387755102e-05, + "loss": 0.3662, "step": 810 }, { "epoch": 4.183673469387755, - "grad_norm": 9.439040184020996, - "learning_rate": 3.2653061224489794e-06, - "loss": 0.8487, + "grad_norm": 4.566152572631836, + "learning_rate": 1.916326530612245e-05, + "loss": 0.483, "step": 820 }, { "epoch": 4.23469387755102, - "grad_norm": 16.658971786499023, - "learning_rate": 3.0612244897959185e-06, - "loss": 0.8698, + "grad_norm": 5.012964725494385, + "learning_rate": 1.9153061224489797e-05, + "loss": 0.5225, "step": 830 }, { "epoch": 4.285714285714286, - "grad_norm": 14.88980484008789, - "learning_rate": 2.8571428571428573e-06, - "loss": 0.7877, + "grad_norm": 1.5150026082992554, + "learning_rate": 1.9142857142857146e-05, + "loss": 0.389, "step": 840 }, { "epoch": 4.336734693877551, - "grad_norm": 10.878366470336914, - "learning_rate": 2.6530612244897964e-06, - "loss": 0.7731, + "grad_norm": 7.2560296058654785, + "learning_rate": 1.9132653061224492e-05, + "loss": 0.3931, "step": 850 }, { "epoch": 4.387755102040816, - "grad_norm": 11.978877067565918, - "learning_rate": 2.4489795918367347e-06, - "loss": 0.8627, + "grad_norm": 1.9941030740737915, + "learning_rate": 1.9122448979591838e-05, + "loss": 0.5159, "step": 860 }, { "epoch": 4.438775510204081, - "grad_norm": 20.543222427368164, - "learning_rate": 2.244897959183674e-06, - "loss": 0.8106, + "grad_norm": 4.368014335632324, + "learning_rate": 1.9112244897959184e-05, + "loss": 0.468, "step": 870 }, { "epoch": 4.489795918367347, - "grad_norm": 5.99076509475708, - "learning_rate": 2.0408163265306125e-06, - "loss": 0.8779, + "grad_norm": 2.1148953437805176, + "learning_rate": 1.9102040816326533e-05, + "loss": 0.5074, "step": 880 }, { "epoch": 4.540816326530612, - "grad_norm": 13.510481834411621, - "learning_rate": 1.8367346938775512e-06, - "loss": 0.8535, + "grad_norm": 8.519877433776855, + "learning_rate": 1.909183673469388e-05, + "loss": 0.5314, "step": 890 }, { "epoch": 4.591836734693878, - "grad_norm": 14.781784057617188, - "learning_rate": 1.6326530612244897e-06, - "loss": 0.8972, + "grad_norm": 2.9456112384796143, + "learning_rate": 1.9081632653061225e-05, + "loss": 0.534, "step": 900 }, { "epoch": 4.642857142857143, - "grad_norm": 14.63200855255127, - "learning_rate": 1.4285714285714286e-06, - "loss": 0.9312, + "grad_norm": 7.007885932922363, + "learning_rate": 1.9071428571428574e-05, + "loss": 0.6075, "step": 910 }, { "epoch": 4.6938775510204085, - "grad_norm": 19.910818099975586, - "learning_rate": 1.2244897959183673e-06, - "loss": 0.8134, + "grad_norm": 5.156570911407471, + "learning_rate": 1.906122448979592e-05, + "loss": 0.4616, "step": 920 }, { "epoch": 4.744897959183674, - "grad_norm": 16.637697219848633, - "learning_rate": 1.0204081632653063e-06, - "loss": 0.8341, + "grad_norm": 1.7120983600616455, + "learning_rate": 1.905102040816327e-05, + "loss": 0.4637, "step": 930 }, { "epoch": 4.795918367346939, - "grad_norm": 5.32245397567749, - "learning_rate": 8.163265306122449e-07, - "loss": 0.7553, + "grad_norm": 1.9243345260620117, + "learning_rate": 1.9040816326530614e-05, + "loss": 0.3998, "step": 940 }, { "epoch": 4.846938775510204, - "grad_norm": 13.930354118347168, - "learning_rate": 6.122448979591837e-07, - "loss": 0.8303, + "grad_norm": 5.795691013336182, + "learning_rate": 1.903061224489796e-05, + "loss": 0.4539, "step": 950 }, { "epoch": 4.8979591836734695, - "grad_norm": 18.44146728515625, - "learning_rate": 4.0816326530612243e-07, - "loss": 0.8247, + "grad_norm": 1.691705584526062, + "learning_rate": 1.9020408163265306e-05, + "loss": 0.4351, "step": 960 }, { "epoch": 4.948979591836735, - "grad_norm": 15.450093269348145, - "learning_rate": 2.0408163265306121e-07, - "loss": 0.9838, + "grad_norm": 6.1225199699401855, + "learning_rate": 1.9010204081632655e-05, + "loss": 0.7722, "step": 970 }, { "epoch": 5.0, - "grad_norm": 38.064430236816406, - "learning_rate": 0.0, - "loss": 0.8213, + "grad_norm": 8.150745391845703, + "learning_rate": 1.9e-05, + "loss": 0.4671, "step": 980 }, { "epoch": 5.0, - "eval_accuracy": 0.8628158844765343, - "eval_loss": 0.897123396396637, - "eval_runtime": 14.3653, - "eval_samples_per_second": 19.283, - "eval_steps_per_second": 2.436, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.48941850662231445, + "eval_runtime": 0.9622, + "eval_samples_per_second": 287.881, + "eval_steps_per_second": 36.375, "step": 980 }, { - "epoch": 5.0, - "step": 980, - "total_flos": 1.6633116935737344e+17, - "train_loss": 1.1215976812401596, - "train_runtime": 1562.8364, - "train_samples_per_second": 5.01, - "train_steps_per_second": 0.627 + "epoch": 5.051020408163265, + "grad_norm": 6.351683139801025, + "learning_rate": 1.898979591836735e-05, + "loss": 0.3814, + "step": 990 + }, + { + "epoch": 5.1020408163265305, + "grad_norm": 3.669525384902954, + "learning_rate": 1.8979591836734696e-05, + "loss": 0.6377, + "step": 1000 + }, + { + "epoch": 5.153061224489796, + "grad_norm": 4.838572025299072, + "learning_rate": 1.8969387755102042e-05, + "loss": 0.283, + "step": 1010 + }, + { + "epoch": 5.204081632653061, + "grad_norm": 5.346072196960449, + "learning_rate": 1.895918367346939e-05, + "loss": 0.3222, + "step": 1020 + }, + { + "epoch": 5.255102040816326, + "grad_norm": 7.749392986297607, + "learning_rate": 1.8948979591836737e-05, + "loss": 0.5852, + "step": 1030 + }, + { + "epoch": 5.3061224489795915, + "grad_norm": 5.017237186431885, + "learning_rate": 1.8938775510204083e-05, + "loss": 0.4062, + "step": 1040 + }, + { + "epoch": 5.357142857142857, + "grad_norm": 3.6488564014434814, + "learning_rate": 1.892857142857143e-05, + "loss": 0.4372, + "step": 1050 + }, + { + "epoch": 5.408163265306122, + "grad_norm": 1.9676110744476318, + "learning_rate": 1.8918367346938778e-05, + "loss": 0.5442, + "step": 1060 + }, + { + "epoch": 5.459183673469388, + "grad_norm": 1.5495046377182007, + "learning_rate": 1.8908163265306123e-05, + "loss": 0.3801, + "step": 1070 + }, + { + "epoch": 5.510204081632653, + "grad_norm": 3.1108131408691406, + "learning_rate": 1.8897959183673473e-05, + "loss": 0.5089, + "step": 1080 + }, + { + "epoch": 5.561224489795919, + "grad_norm": 0.8886075615882874, + "learning_rate": 1.888775510204082e-05, + "loss": 0.6294, + "step": 1090 + }, + { + "epoch": 5.612244897959184, + "grad_norm": 2.3292956352233887, + "learning_rate": 1.8877551020408164e-05, + "loss": 0.7783, + "step": 1100 + }, + { + "epoch": 5.663265306122449, + "grad_norm": 6.512946128845215, + "learning_rate": 1.8867346938775513e-05, + "loss": 0.3822, + "step": 1110 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 1.3272405862808228, + "learning_rate": 1.885714285714286e-05, + "loss": 0.463, + "step": 1120 + }, + { + "epoch": 5.76530612244898, + "grad_norm": 5.868877410888672, + "learning_rate": 1.8846938775510205e-05, + "loss": 0.4467, + "step": 1130 + }, + { + "epoch": 5.816326530612245, + "grad_norm": 3.6009857654571533, + "learning_rate": 1.883673469387755e-05, + "loss": 0.4185, + "step": 1140 + }, + { + "epoch": 5.86734693877551, + "grad_norm": 4.642253875732422, + "learning_rate": 1.88265306122449e-05, + "loss": 0.4854, + "step": 1150 + }, + { + "epoch": 5.918367346938775, + "grad_norm": 3.2264819145202637, + "learning_rate": 1.8816326530612246e-05, + "loss": 0.3584, + "step": 1160 + }, + { + "epoch": 5.969387755102041, + "grad_norm": 2.7333457469940186, + "learning_rate": 1.8806122448979595e-05, + "loss": 0.5623, + "step": 1170 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.41603580117225647, + "eval_runtime": 0.9437, + "eval_samples_per_second": 293.529, + "eval_steps_per_second": 37.088, + "step": 1176 + }, + { + "epoch": 6.020408163265306, + "grad_norm": 3.8172926902770996, + "learning_rate": 1.879591836734694e-05, + "loss": 0.5799, + "step": 1180 + }, + { + "epoch": 6.071428571428571, + "grad_norm": 5.174800395965576, + "learning_rate": 1.8785714285714286e-05, + "loss": 0.3616, + "step": 1190 + }, + { + "epoch": 6.122448979591836, + "grad_norm": 4.672287940979004, + "learning_rate": 1.8775510204081636e-05, + "loss": 0.362, + "step": 1200 + }, + { + "epoch": 6.173469387755102, + "grad_norm": 1.3142999410629272, + "learning_rate": 1.876530612244898e-05, + "loss": 0.3637, + "step": 1210 + }, + { + "epoch": 6.224489795918367, + "grad_norm": 3.4073710441589355, + "learning_rate": 1.8755102040816327e-05, + "loss": 0.3755, + "step": 1220 + }, + { + "epoch": 6.275510204081632, + "grad_norm": 0.8577567338943481, + "learning_rate": 1.8744897959183673e-05, + "loss": 0.2254, + "step": 1230 + }, + { + "epoch": 6.326530612244898, + "grad_norm": 3.5044026374816895, + "learning_rate": 1.8734693877551022e-05, + "loss": 0.3727, + "step": 1240 + }, + { + "epoch": 6.377551020408164, + "grad_norm": 6.610156059265137, + "learning_rate": 1.8724489795918368e-05, + "loss": 0.8675, + "step": 1250 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 2.974456787109375, + "learning_rate": 1.8714285714285717e-05, + "loss": 0.6604, + "step": 1260 + }, + { + "epoch": 6.479591836734694, + "grad_norm": 3.033073902130127, + "learning_rate": 1.8704081632653063e-05, + "loss": 0.3997, + "step": 1270 + }, + { + "epoch": 6.530612244897959, + "grad_norm": 4.493134498596191, + "learning_rate": 1.869387755102041e-05, + "loss": 0.4472, + "step": 1280 + }, + { + "epoch": 6.581632653061225, + "grad_norm": 2.4048378467559814, + "learning_rate": 1.8683673469387758e-05, + "loss": 0.2935, + "step": 1290 + }, + { + "epoch": 6.63265306122449, + "grad_norm": 4.755849838256836, + "learning_rate": 1.8673469387755104e-05, + "loss": 0.5393, + "step": 1300 + }, + { + "epoch": 6.683673469387755, + "grad_norm": 1.5519723892211914, + "learning_rate": 1.866326530612245e-05, + "loss": 0.4035, + "step": 1310 + }, + { + "epoch": 6.73469387755102, + "grad_norm": 2.7238337993621826, + "learning_rate": 1.8653061224489795e-05, + "loss": 0.367, + "step": 1320 + }, + { + "epoch": 6.785714285714286, + "grad_norm": 7.095693588256836, + "learning_rate": 1.8642857142857144e-05, + "loss": 0.5464, + "step": 1330 + }, + { + "epoch": 6.836734693877551, + "grad_norm": 1.5902338027954102, + "learning_rate": 1.863265306122449e-05, + "loss": 0.5423, + "step": 1340 + }, + { + "epoch": 6.887755102040816, + "grad_norm": 3.9222233295440674, + "learning_rate": 1.862244897959184e-05, + "loss": 0.3587, + "step": 1350 + }, + { + "epoch": 6.938775510204081, + "grad_norm": 6.730372905731201, + "learning_rate": 1.8612244897959185e-05, + "loss": 0.384, + "step": 1360 + }, + { + "epoch": 6.989795918367347, + "grad_norm": 3.2234292030334473, + "learning_rate": 1.860204081632653e-05, + "loss": 0.3917, + "step": 1370 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.40221381187438965, + "eval_runtime": 0.994, + "eval_samples_per_second": 278.684, + "eval_steps_per_second": 35.213, + "step": 1372 + }, + { + "epoch": 7.040816326530612, + "grad_norm": 7.335348606109619, + "learning_rate": 1.859183673469388e-05, + "loss": 0.6036, + "step": 1380 + }, + { + "epoch": 7.091836734693878, + "grad_norm": 1.62227201461792, + "learning_rate": 1.8581632653061226e-05, + "loss": 0.1956, + "step": 1390 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 3.523392915725708, + "learning_rate": 1.8571428571428575e-05, + "loss": 0.3493, + "step": 1400 + }, + { + "epoch": 7.1938775510204085, + "grad_norm": 9.984464645385742, + "learning_rate": 1.856122448979592e-05, + "loss": 0.6654, + "step": 1410 + }, + { + "epoch": 7.244897959183674, + "grad_norm": 3.143925189971924, + "learning_rate": 1.8551020408163267e-05, + "loss": 0.3959, + "step": 1420 + }, + { + "epoch": 7.295918367346939, + "grad_norm": 6.098402976989746, + "learning_rate": 1.8540816326530613e-05, + "loss": 0.6455, + "step": 1430 + }, + { + "epoch": 7.346938775510204, + "grad_norm": 3.7786874771118164, + "learning_rate": 1.853061224489796e-05, + "loss": 0.2466, + "step": 1440 + }, + { + "epoch": 7.3979591836734695, + "grad_norm": 5.968966484069824, + "learning_rate": 1.8520408163265307e-05, + "loss": 0.1962, + "step": 1450 + }, + { + "epoch": 7.448979591836735, + "grad_norm": 3.820188045501709, + "learning_rate": 1.8510204081632653e-05, + "loss": 0.5675, + "step": 1460 + }, + { + "epoch": 7.5, + "grad_norm": 7.745204448699951, + "learning_rate": 1.8500000000000002e-05, + "loss": 0.3934, + "step": 1470 + }, + { + "epoch": 7.551020408163265, + "grad_norm": 4.091207981109619, + "learning_rate": 1.8489795918367348e-05, + "loss": 0.4264, + "step": 1480 + }, + { + "epoch": 7.6020408163265305, + "grad_norm": 3.334214448928833, + "learning_rate": 1.8479591836734697e-05, + "loss": 0.3577, + "step": 1490 + }, + { + "epoch": 7.653061224489796, + "grad_norm": 1.8498873710632324, + "learning_rate": 1.8469387755102043e-05, + "loss": 0.6491, + "step": 1500 + }, + { + "epoch": 7.704081632653061, + "grad_norm": 2.124959945678711, + "learning_rate": 1.845918367346939e-05, + "loss": 0.3526, + "step": 1510 + }, + { + "epoch": 7.755102040816326, + "grad_norm": 5.348793983459473, + "learning_rate": 1.8448979591836735e-05, + "loss": 0.5161, + "step": 1520 + }, + { + "epoch": 7.8061224489795915, + "grad_norm": 5.554693222045898, + "learning_rate": 1.8438775510204084e-05, + "loss": 0.3795, + "step": 1530 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 4.88253927230835, + "learning_rate": 1.842857142857143e-05, + "loss": 0.5021, + "step": 1540 + }, + { + "epoch": 7.908163265306122, + "grad_norm": 3.2645864486694336, + "learning_rate": 1.8418367346938776e-05, + "loss": 0.3677, + "step": 1550 + }, + { + "epoch": 7.959183673469388, + "grad_norm": 6.154125690460205, + "learning_rate": 1.8408163265306125e-05, + "loss": 0.3153, + "step": 1560 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.8411552346570397, + "eval_loss": 0.4939042627811432, + "eval_runtime": 0.9307, + "eval_samples_per_second": 297.613, + "eval_steps_per_second": 37.605, + "step": 1568 + }, + { + "epoch": 8.010204081632653, + "grad_norm": 5.937544822692871, + "learning_rate": 1.839795918367347e-05, + "loss": 0.3933, + "step": 1570 + }, + { + "epoch": 8.061224489795919, + "grad_norm": 6.466835021972656, + "learning_rate": 1.838775510204082e-05, + "loss": 0.4626, + "step": 1580 + }, + { + "epoch": 8.112244897959183, + "grad_norm": 2.7466700077056885, + "learning_rate": 1.8377551020408165e-05, + "loss": 0.5429, + "step": 1590 + }, + { + "epoch": 8.16326530612245, + "grad_norm": 0.5636491179466248, + "learning_rate": 1.836734693877551e-05, + "loss": 0.4122, + "step": 1600 + }, + { + "epoch": 8.214285714285714, + "grad_norm": 2.0550692081451416, + "learning_rate": 1.8357142857142857e-05, + "loss": 0.3833, + "step": 1610 + }, + { + "epoch": 8.26530612244898, + "grad_norm": 1.9433763027191162, + "learning_rate": 1.8346938775510206e-05, + "loss": 0.3483, + "step": 1620 + }, + { + "epoch": 8.316326530612244, + "grad_norm": 1.361655831336975, + "learning_rate": 1.8336734693877552e-05, + "loss": 0.5256, + "step": 1630 + }, + { + "epoch": 8.36734693877551, + "grad_norm": 0.9611966013908386, + "learning_rate": 1.8326530612244898e-05, + "loss": 0.4249, + "step": 1640 + }, + { + "epoch": 8.418367346938776, + "grad_norm": 1.3870189189910889, + "learning_rate": 1.8316326530612247e-05, + "loss": 0.2141, + "step": 1650 + }, + { + "epoch": 8.46938775510204, + "grad_norm": 1.577960729598999, + "learning_rate": 1.8306122448979593e-05, + "loss": 0.5005, + "step": 1660 + }, + { + "epoch": 8.520408163265307, + "grad_norm": 3.716379404067993, + "learning_rate": 1.8295918367346942e-05, + "loss": 0.3459, + "step": 1670 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 5.005441188812256, + "learning_rate": 1.8285714285714288e-05, + "loss": 0.3062, + "step": 1680 + }, + { + "epoch": 8.622448979591837, + "grad_norm": 4.600987434387207, + "learning_rate": 1.8275510204081634e-05, + "loss": 0.3241, + "step": 1690 + }, + { + "epoch": 8.673469387755102, + "grad_norm": 5.137531757354736, + "learning_rate": 1.826530612244898e-05, + "loss": 0.5951, + "step": 1700 + }, + { + "epoch": 8.724489795918368, + "grad_norm": 2.3758299350738525, + "learning_rate": 1.825510204081633e-05, + "loss": 0.3967, + "step": 1710 + }, + { + "epoch": 8.775510204081632, + "grad_norm": 1.4290903806686401, + "learning_rate": 1.8244897959183674e-05, + "loss": 0.3694, + "step": 1720 + }, + { + "epoch": 8.826530612244898, + "grad_norm": 5.8770432472229, + "learning_rate": 1.823469387755102e-05, + "loss": 0.4959, + "step": 1730 + }, + { + "epoch": 8.877551020408163, + "grad_norm": 8.544288635253906, + "learning_rate": 1.822448979591837e-05, + "loss": 0.416, + "step": 1740 + }, + { + "epoch": 8.928571428571429, + "grad_norm": 0.7648389339447021, + "learning_rate": 1.8214285714285715e-05, + "loss": 0.3586, + "step": 1750 + }, + { + "epoch": 8.979591836734693, + "grad_norm": 7.747754096984863, + "learning_rate": 1.8204081632653064e-05, + "loss": 0.5814, + "step": 1760 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.35403916239738464, + "eval_runtime": 0.9416, + "eval_samples_per_second": 294.185, + "eval_steps_per_second": 37.171, + "step": 1764 + }, + { + "epoch": 9.03061224489796, + "grad_norm": 7.367336750030518, + "learning_rate": 1.819387755102041e-05, + "loss": 0.39, + "step": 1770 + }, + { + "epoch": 9.081632653061224, + "grad_norm": 5.418698310852051, + "learning_rate": 1.8183673469387756e-05, + "loss": 0.3316, + "step": 1780 + }, + { + "epoch": 9.13265306122449, + "grad_norm": 1.490496277809143, + "learning_rate": 1.81734693877551e-05, + "loss": 0.4636, + "step": 1790 + }, + { + "epoch": 9.183673469387756, + "grad_norm": 4.0167646408081055, + "learning_rate": 1.816326530612245e-05, + "loss": 0.2302, + "step": 1800 + }, + { + "epoch": 9.23469387755102, + "grad_norm": 6.253035545349121, + "learning_rate": 1.8153061224489797e-05, + "loss": 0.4014, + "step": 1810 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 9.081564903259277, + "learning_rate": 1.8142857142857146e-05, + "loss": 0.4289, + "step": 1820 + }, + { + "epoch": 9.33673469387755, + "grad_norm": 3.3292837142944336, + "learning_rate": 1.813265306122449e-05, + "loss": 0.4435, + "step": 1830 + }, + { + "epoch": 9.387755102040817, + "grad_norm": 4.081367492675781, + "learning_rate": 1.8122448979591837e-05, + "loss": 0.6172, + "step": 1840 + }, + { + "epoch": 9.438775510204081, + "grad_norm": 4.028370380401611, + "learning_rate": 1.8112244897959187e-05, + "loss": 0.3676, + "step": 1850 + }, + { + "epoch": 9.489795918367347, + "grad_norm": 2.214043378829956, + "learning_rate": 1.8102040816326532e-05, + "loss": 0.2903, + "step": 1860 + }, + { + "epoch": 9.540816326530612, + "grad_norm": 7.913087844848633, + "learning_rate": 1.8091836734693878e-05, + "loss": 0.5894, + "step": 1870 + }, + { + "epoch": 9.591836734693878, + "grad_norm": 5.671935558319092, + "learning_rate": 1.8081632653061224e-05, + "loss": 0.2988, + "step": 1880 + }, + { + "epoch": 9.642857142857142, + "grad_norm": 5.60788631439209, + "learning_rate": 1.8071428571428573e-05, + "loss": 0.462, + "step": 1890 + }, + { + "epoch": 9.693877551020408, + "grad_norm": 2.223788261413574, + "learning_rate": 1.806122448979592e-05, + "loss": 0.4142, + "step": 1900 + }, + { + "epoch": 9.744897959183673, + "grad_norm": 2.553497791290283, + "learning_rate": 1.8051020408163268e-05, + "loss": 0.3389, + "step": 1910 + }, + { + "epoch": 9.795918367346939, + "grad_norm": 2.754664659500122, + "learning_rate": 1.8040816326530614e-05, + "loss": 0.3824, + "step": 1920 + }, + { + "epoch": 9.846938775510203, + "grad_norm": 0.7434467077255249, + "learning_rate": 1.803061224489796e-05, + "loss": 0.4129, + "step": 1930 + }, + { + "epoch": 9.89795918367347, + "grad_norm": 0.7156695127487183, + "learning_rate": 1.802040816326531e-05, + "loss": 0.3073, + "step": 1940 + }, + { + "epoch": 9.948979591836736, + "grad_norm": 2.1877171993255615, + "learning_rate": 1.8010204081632655e-05, + "loss": 0.3985, + "step": 1950 + }, + { + "epoch": 10.0, + "grad_norm": 4.8080620765686035, + "learning_rate": 1.8e-05, + "loss": 0.5883, + "step": 1960 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.349290668964386, + "eval_runtime": 0.9968, + "eval_samples_per_second": 277.89, + "eval_steps_per_second": 35.113, + "step": 1960 + }, + { + "epoch": 10.051020408163266, + "grad_norm": 0.7896466255187988, + "learning_rate": 1.7989795918367346e-05, + "loss": 0.5032, + "step": 1970 + }, + { + "epoch": 10.10204081632653, + "grad_norm": 0.5900990962982178, + "learning_rate": 1.7979591836734695e-05, + "loss": 0.3054, + "step": 1980 + }, + { + "epoch": 10.153061224489797, + "grad_norm": 2.6685738563537598, + "learning_rate": 1.796938775510204e-05, + "loss": 0.4091, + "step": 1990 + }, + { + "epoch": 10.204081632653061, + "grad_norm": 1.8895847797393799, + "learning_rate": 1.795918367346939e-05, + "loss": 0.337, + "step": 2000 + }, + { + "epoch": 10.255102040816327, + "grad_norm": 5.3162665367126465, + "learning_rate": 1.7948979591836736e-05, + "loss": 0.3889, + "step": 2010 + }, + { + "epoch": 10.306122448979592, + "grad_norm": 4.576247215270996, + "learning_rate": 1.7938775510204082e-05, + "loss": 0.4455, + "step": 2020 + }, + { + "epoch": 10.357142857142858, + "grad_norm": 1.8315520286560059, + "learning_rate": 1.792857142857143e-05, + "loss": 0.2955, + "step": 2030 + }, + { + "epoch": 10.408163265306122, + "grad_norm": 6.811487197875977, + "learning_rate": 1.7918367346938777e-05, + "loss": 0.4975, + "step": 2040 + }, + { + "epoch": 10.459183673469388, + "grad_norm": 6.817623615264893, + "learning_rate": 1.7908163265306123e-05, + "loss": 0.5212, + "step": 2050 + }, + { + "epoch": 10.510204081632653, + "grad_norm": 2.7684004306793213, + "learning_rate": 1.789795918367347e-05, + "loss": 0.3925, + "step": 2060 + }, + { + "epoch": 10.561224489795919, + "grad_norm": 1.4127514362335205, + "learning_rate": 1.7887755102040818e-05, + "loss": 0.4752, + "step": 2070 + }, + { + "epoch": 10.612244897959183, + "grad_norm": 6.725620269775391, + "learning_rate": 1.7877551020408164e-05, + "loss": 0.3967, + "step": 2080 + }, + { + "epoch": 10.66326530612245, + "grad_norm": 3.6214230060577393, + "learning_rate": 1.7867346938775513e-05, + "loss": 0.6277, + "step": 2090 + }, + { + "epoch": 10.714285714285714, + "grad_norm": 1.93522047996521, + "learning_rate": 1.785714285714286e-05, + "loss": 0.214, + "step": 2100 + }, + { + "epoch": 10.76530612244898, + "grad_norm": 5.977639198303223, + "learning_rate": 1.7846938775510204e-05, + "loss": 0.3622, + "step": 2110 + }, + { + "epoch": 10.816326530612244, + "grad_norm": 1.9556264877319336, + "learning_rate": 1.7836734693877553e-05, + "loss": 0.4324, + "step": 2120 + }, + { + "epoch": 10.86734693877551, + "grad_norm": 7.346195697784424, + "learning_rate": 1.78265306122449e-05, + "loss": 0.2873, + "step": 2130 + }, + { + "epoch": 10.918367346938776, + "grad_norm": 6.110283374786377, + "learning_rate": 1.781632653061225e-05, + "loss": 0.5476, + "step": 2140 + }, + { + "epoch": 10.96938775510204, + "grad_norm": 10.262624740600586, + "learning_rate": 1.780612244897959e-05, + "loss": 0.4616, + "step": 2150 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.776173285198556, + "eval_loss": 0.7928438186645508, + "eval_runtime": 0.959, + "eval_samples_per_second": 288.836, + "eval_steps_per_second": 36.495, + "step": 2156 + }, + { + "epoch": 11.020408163265307, + "grad_norm": 1.9167275428771973, + "learning_rate": 1.779591836734694e-05, + "loss": 0.3289, + "step": 2160 + }, + { + "epoch": 11.071428571428571, + "grad_norm": 1.6276081800460815, + "learning_rate": 1.7785714285714286e-05, + "loss": 0.3506, + "step": 2170 + }, + { + "epoch": 11.122448979591837, + "grad_norm": 6.090387344360352, + "learning_rate": 1.7775510204081635e-05, + "loss": 0.3149, + "step": 2180 + }, + { + "epoch": 11.173469387755102, + "grad_norm": 1.20468270778656, + "learning_rate": 1.776530612244898e-05, + "loss": 0.2804, + "step": 2190 + }, + { + "epoch": 11.224489795918368, + "grad_norm": 9.119256019592285, + "learning_rate": 1.7755102040816327e-05, + "loss": 0.3767, + "step": 2200 + }, + { + "epoch": 11.275510204081632, + "grad_norm": 3.9283931255340576, + "learning_rate": 1.7744897959183676e-05, + "loss": 0.5942, + "step": 2210 + }, + { + "epoch": 11.326530612244898, + "grad_norm": 1.8245526552200317, + "learning_rate": 1.773469387755102e-05, + "loss": 0.3046, + "step": 2220 + }, + { + "epoch": 11.377551020408163, + "grad_norm": 4.436433792114258, + "learning_rate": 1.772448979591837e-05, + "loss": 0.4403, + "step": 2230 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 5.837812423706055, + "learning_rate": 1.7714285714285717e-05, + "loss": 0.4211, + "step": 2240 + }, + { + "epoch": 11.479591836734693, + "grad_norm": 1.7079073190689087, + "learning_rate": 1.7704081632653062e-05, + "loss": 0.5829, + "step": 2250 + }, + { + "epoch": 11.53061224489796, + "grad_norm": 0.8393904566764832, + "learning_rate": 1.7693877551020408e-05, + "loss": 0.211, + "step": 2260 + }, + { + "epoch": 11.581632653061224, + "grad_norm": 2.068840742111206, + "learning_rate": 1.7683673469387757e-05, + "loss": 0.2437, + "step": 2270 + }, + { + "epoch": 11.63265306122449, + "grad_norm": 2.982731819152832, + "learning_rate": 1.7673469387755103e-05, + "loss": 0.2618, + "step": 2280 + }, + { + "epoch": 11.683673469387756, + "grad_norm": 4.23781156539917, + "learning_rate": 1.766326530612245e-05, + "loss": 0.3905, + "step": 2290 + }, + { + "epoch": 11.73469387755102, + "grad_norm": 5.205179214477539, + "learning_rate": 1.7653061224489798e-05, + "loss": 0.7493, + "step": 2300 + }, + { + "epoch": 11.785714285714286, + "grad_norm": 1.835451364517212, + "learning_rate": 1.7642857142857144e-05, + "loss": 0.3456, + "step": 2310 + }, + { + "epoch": 11.83673469387755, + "grad_norm": 4.031843662261963, + "learning_rate": 1.7632653061224493e-05, + "loss": 0.3092, + "step": 2320 + }, + { + "epoch": 11.887755102040817, + "grad_norm": 5.397056579589844, + "learning_rate": 1.762244897959184e-05, + "loss": 0.3759, + "step": 2330 + }, + { + "epoch": 11.938775510204081, + "grad_norm": 2.7502498626708984, + "learning_rate": 1.7612244897959185e-05, + "loss": 0.2369, + "step": 2340 + }, + { + "epoch": 11.989795918367347, + "grad_norm": 4.946924209594727, + "learning_rate": 1.760204081632653e-05, + "loss": 0.499, + "step": 2350 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.296028880866426, + "eval_loss": 2.06593918800354, + "eval_runtime": 0.9333, + "eval_samples_per_second": 296.781, + "eval_steps_per_second": 37.499, + "step": 2352 + }, + { + "epoch": 12.040816326530612, + "grad_norm": 3.4574103355407715, + "learning_rate": 1.759183673469388e-05, + "loss": 0.2957, + "step": 2360 + }, + { + "epoch": 12.091836734693878, + "grad_norm": 8.112685203552246, + "learning_rate": 1.7581632653061225e-05, + "loss": 0.3105, + "step": 2370 + }, + { + "epoch": 12.142857142857142, + "grad_norm": 4.740105628967285, + "learning_rate": 1.757142857142857e-05, + "loss": 0.4352, + "step": 2380 + }, + { + "epoch": 12.193877551020408, + "grad_norm": 1.1439720392227173, + "learning_rate": 1.756122448979592e-05, + "loss": 0.397, + "step": 2390 + }, + { + "epoch": 12.244897959183673, + "grad_norm": 0.700265645980835, + "learning_rate": 1.7551020408163266e-05, + "loss": 0.3186, + "step": 2400 + }, + { + "epoch": 12.295918367346939, + "grad_norm": 4.206295013427734, + "learning_rate": 1.7540816326530615e-05, + "loss": 0.5838, + "step": 2410 + }, + { + "epoch": 12.346938775510203, + "grad_norm": 3.2011196613311768, + "learning_rate": 1.753061224489796e-05, + "loss": 0.4628, + "step": 2420 + }, + { + "epoch": 12.39795918367347, + "grad_norm": 6.6271281242370605, + "learning_rate": 1.7520408163265307e-05, + "loss": 0.3546, + "step": 2430 + }, + { + "epoch": 12.448979591836734, + "grad_norm": 2.038639783859253, + "learning_rate": 1.7510204081632653e-05, + "loss": 0.2915, + "step": 2440 + }, + { + "epoch": 12.5, + "grad_norm": 1.8398466110229492, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.3023, + "step": 2450 + }, + { + "epoch": 12.551020408163264, + "grad_norm": 4.673349857330322, + "learning_rate": 1.748979591836735e-05, + "loss": 0.4503, + "step": 2460 + }, + { + "epoch": 12.60204081632653, + "grad_norm": 1.638249397277832, + "learning_rate": 1.7479591836734693e-05, + "loss": 0.3084, + "step": 2470 + }, + { + "epoch": 12.653061224489797, + "grad_norm": 2.2692887783050537, + "learning_rate": 1.7469387755102043e-05, + "loss": 0.3704, + "step": 2480 + }, + { + "epoch": 12.704081632653061, + "grad_norm": 8.47905445098877, + "learning_rate": 1.745918367346939e-05, + "loss": 0.3022, + "step": 2490 + }, + { + "epoch": 12.755102040816327, + "grad_norm": 7.4197492599487305, + "learning_rate": 1.7448979591836738e-05, + "loss": 0.5395, + "step": 2500 + }, + { + "epoch": 12.806122448979592, + "grad_norm": 3.075881242752075, + "learning_rate": 1.7438775510204083e-05, + "loss": 0.316, + "step": 2510 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 3.121246814727783, + "learning_rate": 1.742857142857143e-05, + "loss": 0.5723, + "step": 2520 + }, + { + "epoch": 12.908163265306122, + "grad_norm": 1.1997493505477905, + "learning_rate": 1.7418367346938775e-05, + "loss": 0.409, + "step": 2530 + }, + { + "epoch": 12.959183673469388, + "grad_norm": 3.3804609775543213, + "learning_rate": 1.7408163265306124e-05, + "loss": 0.2236, + "step": 2540 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.851985559566787, + "eval_loss": 0.44439464807510376, + "eval_runtime": 0.9402, + "eval_samples_per_second": 294.63, + "eval_steps_per_second": 37.228, + "step": 2548 + }, + { + "epoch": 13.010204081632653, + "grad_norm": 5.298696994781494, + "learning_rate": 1.7397959183673473e-05, + "loss": 0.538, + "step": 2550 + }, + { + "epoch": 13.061224489795919, + "grad_norm": 5.091602802276611, + "learning_rate": 1.738775510204082e-05, + "loss": 0.4475, + "step": 2560 + }, + { + "epoch": 13.112244897959183, + "grad_norm": 4.266963481903076, + "learning_rate": 1.7377551020408165e-05, + "loss": 0.3297, + "step": 2570 + }, + { + "epoch": 13.16326530612245, + "grad_norm": 4.778618812561035, + "learning_rate": 1.736734693877551e-05, + "loss": 0.3443, + "step": 2580 + }, + { + "epoch": 13.214285714285714, + "grad_norm": 2.724548101425171, + "learning_rate": 1.735714285714286e-05, + "loss": 0.4932, + "step": 2590 + }, + { + "epoch": 13.26530612244898, + "grad_norm": 2.435530185699463, + "learning_rate": 1.7346938775510206e-05, + "loss": 0.2496, + "step": 2600 + }, + { + "epoch": 13.316326530612244, + "grad_norm": 7.0745062828063965, + "learning_rate": 1.733673469387755e-05, + "loss": 0.4175, + "step": 2610 + }, + { + "epoch": 13.36734693877551, + "grad_norm": 2.8941125869750977, + "learning_rate": 1.7326530612244897e-05, + "loss": 0.4203, + "step": 2620 + }, + { + "epoch": 13.418367346938776, + "grad_norm": 4.122393608093262, + "learning_rate": 1.7316326530612246e-05, + "loss": 0.3874, + "step": 2630 + }, + { + "epoch": 13.46938775510204, + "grad_norm": 2.2892613410949707, + "learning_rate": 1.7306122448979596e-05, + "loss": 0.4277, + "step": 2640 + }, + { + "epoch": 13.520408163265307, + "grad_norm": 10.996116638183594, + "learning_rate": 1.729591836734694e-05, + "loss": 0.4704, + "step": 2650 + }, + { + "epoch": 13.571428571428571, + "grad_norm": 3.800659418106079, + "learning_rate": 1.7285714285714287e-05, + "loss": 0.3591, + "step": 2660 + }, + { + "epoch": 13.622448979591837, + "grad_norm": 1.313789963722229, + "learning_rate": 1.7275510204081633e-05, + "loss": 0.2531, + "step": 2670 + }, + { + "epoch": 13.673469387755102, + "grad_norm": 2.3375041484832764, + "learning_rate": 1.7265306122448982e-05, + "loss": 0.3973, + "step": 2680 + }, + { + "epoch": 13.724489795918368, + "grad_norm": 6.231153964996338, + "learning_rate": 1.7255102040816328e-05, + "loss": 0.4401, + "step": 2690 + }, + { + "epoch": 13.775510204081632, + "grad_norm": 4.105325698852539, + "learning_rate": 1.7244897959183674e-05, + "loss": 0.3107, + "step": 2700 + }, + { + "epoch": 13.826530612244898, + "grad_norm": 4.559372901916504, + "learning_rate": 1.723469387755102e-05, + "loss": 0.6584, + "step": 2710 + }, + { + "epoch": 13.877551020408163, + "grad_norm": 6.511418342590332, + "learning_rate": 1.722448979591837e-05, + "loss": 0.3436, + "step": 2720 + }, + { + "epoch": 13.928571428571429, + "grad_norm": 3.735102415084839, + "learning_rate": 1.7214285714285718e-05, + "loss": 0.2158, + "step": 2730 + }, + { + "epoch": 13.979591836734693, + "grad_norm": 0.3978160321712494, + "learning_rate": 1.7204081632653064e-05, + "loss": 0.2083, + "step": 2740 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.46400919556617737, + "eval_runtime": 0.9405, + "eval_samples_per_second": 294.528, + "eval_steps_per_second": 37.215, + "step": 2744 + }, + { + "epoch": 14.03061224489796, + "grad_norm": 10.202120780944824, + "learning_rate": 1.719387755102041e-05, + "loss": 0.5765, + "step": 2750 + }, + { + "epoch": 14.081632653061224, + "grad_norm": 0.6840581893920898, + "learning_rate": 1.7183673469387755e-05, + "loss": 0.2435, + "step": 2760 + }, + { + "epoch": 14.13265306122449, + "grad_norm": 2.577873945236206, + "learning_rate": 1.7173469387755104e-05, + "loss": 0.2914, + "step": 2770 + }, + { + "epoch": 14.183673469387756, + "grad_norm": 2.1203761100769043, + "learning_rate": 1.716326530612245e-05, + "loss": 0.3378, + "step": 2780 + }, + { + "epoch": 14.23469387755102, + "grad_norm": 0.5613865852355957, + "learning_rate": 1.7153061224489796e-05, + "loss": 0.3199, + "step": 2790 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 2.023918867111206, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.2684, + "step": 2800 + }, + { + "epoch": 14.33673469387755, + "grad_norm": 4.932062149047852, + "learning_rate": 1.713265306122449e-05, + "loss": 0.3795, + "step": 2810 + }, + { + "epoch": 14.387755102040817, + "grad_norm": 4.413599967956543, + "learning_rate": 1.712244897959184e-05, + "loss": 0.3618, + "step": 2820 + }, + { + "epoch": 14.438775510204081, + "grad_norm": 9.810211181640625, + "learning_rate": 1.7112244897959186e-05, + "loss": 0.4044, + "step": 2830 + }, + { + "epoch": 14.489795918367347, + "grad_norm": 2.67509126663208, + "learning_rate": 1.7102040816326532e-05, + "loss": 0.4036, + "step": 2840 + }, + { + "epoch": 14.540816326530612, + "grad_norm": 4.9529852867126465, + "learning_rate": 1.7091836734693878e-05, + "loss": 0.3714, + "step": 2850 + }, + { + "epoch": 14.591836734693878, + "grad_norm": 4.113321304321289, + "learning_rate": 1.7081632653061227e-05, + "loss": 0.2794, + "step": 2860 + }, + { + "epoch": 14.642857142857142, + "grad_norm": 1.3466196060180664, + "learning_rate": 1.7071428571428573e-05, + "loss": 0.7391, + "step": 2870 + }, + { + "epoch": 14.693877551020408, + "grad_norm": 6.835224151611328, + "learning_rate": 1.7061224489795922e-05, + "loss": 0.564, + "step": 2880 + }, + { + "epoch": 14.744897959183673, + "grad_norm": 7.45158576965332, + "learning_rate": 1.7051020408163264e-05, + "loss": 0.3609, + "step": 2890 + }, + { + "epoch": 14.795918367346939, + "grad_norm": 6.361330986022949, + "learning_rate": 1.7040816326530613e-05, + "loss": 0.445, + "step": 2900 + }, + { + "epoch": 14.846938775510203, + "grad_norm": 6.676398277282715, + "learning_rate": 1.7030612244897962e-05, + "loss": 0.3851, + "step": 2910 + }, + { + "epoch": 14.89795918367347, + "grad_norm": 2.9715816974639893, + "learning_rate": 1.7020408163265308e-05, + "loss": 0.3791, + "step": 2920 + }, + { + "epoch": 14.948979591836736, + "grad_norm": 3.722626209259033, + "learning_rate": 1.7010204081632654e-05, + "loss": 0.5322, + "step": 2930 + }, + { + "epoch": 15.0, + "grad_norm": 4.071798324584961, + "learning_rate": 1.7e-05, + "loss": 0.3408, + "step": 2940 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.37753719091415405, + "eval_runtime": 0.9331, + "eval_samples_per_second": 296.873, + "eval_steps_per_second": 37.511, + "step": 2940 + }, + { + "epoch": 15.051020408163266, + "grad_norm": 2.8461458683013916, + "learning_rate": 1.698979591836735e-05, + "loss": 0.3063, + "step": 2950 + }, + { + "epoch": 15.10204081632653, + "grad_norm": 2.373788595199585, + "learning_rate": 1.6979591836734695e-05, + "loss": 0.2771, + "step": 2960 + }, + { + "epoch": 15.153061224489797, + "grad_norm": 1.0632851123809814, + "learning_rate": 1.6969387755102044e-05, + "loss": 0.3648, + "step": 2970 + }, + { + "epoch": 15.204081632653061, + "grad_norm": 2.820613145828247, + "learning_rate": 1.695918367346939e-05, + "loss": 0.4814, + "step": 2980 + }, + { + "epoch": 15.255102040816327, + "grad_norm": 1.9779471158981323, + "learning_rate": 1.6948979591836736e-05, + "loss": 0.2805, + "step": 2990 + }, + { + "epoch": 15.306122448979592, + "grad_norm": 1.2560408115386963, + "learning_rate": 1.6938775510204085e-05, + "loss": 0.3092, + "step": 3000 + }, + { + "epoch": 15.357142857142858, + "grad_norm": 2.176464557647705, + "learning_rate": 1.692857142857143e-05, + "loss": 0.5962, + "step": 3010 + }, + { + "epoch": 15.408163265306122, + "grad_norm": 3.4904580116271973, + "learning_rate": 1.6918367346938776e-05, + "loss": 0.3132, + "step": 3020 + }, + { + "epoch": 15.459183673469388, + "grad_norm": 3.03112530708313, + "learning_rate": 1.6908163265306122e-05, + "loss": 0.2164, + "step": 3030 + }, + { + "epoch": 15.510204081632653, + "grad_norm": 1.1290910243988037, + "learning_rate": 1.689795918367347e-05, + "loss": 0.1778, + "step": 3040 + }, + { + "epoch": 15.561224489795919, + "grad_norm": 5.56776237487793, + "learning_rate": 1.6887755102040817e-05, + "loss": 0.2771, + "step": 3050 + }, + { + "epoch": 15.612244897959183, + "grad_norm": 3.38120436668396, + "learning_rate": 1.6877551020408166e-05, + "loss": 0.4485, + "step": 3060 + }, + { + "epoch": 15.66326530612245, + "grad_norm": 1.8314141035079956, + "learning_rate": 1.6867346938775512e-05, + "loss": 0.3359, + "step": 3070 + }, + { + "epoch": 15.714285714285714, + "grad_norm": 6.086407661437988, + "learning_rate": 1.6857142857142858e-05, + "loss": 0.5121, + "step": 3080 + }, + { + "epoch": 15.76530612244898, + "grad_norm": 3.3119635581970215, + "learning_rate": 1.6846938775510207e-05, + "loss": 0.283, + "step": 3090 + }, + { + "epoch": 15.816326530612244, + "grad_norm": 9.957314491271973, + "learning_rate": 1.6836734693877553e-05, + "loss": 0.7381, + "step": 3100 + }, + { + "epoch": 15.86734693877551, + "grad_norm": 6.10607385635376, + "learning_rate": 1.68265306122449e-05, + "loss": 0.5771, + "step": 3110 + }, + { + "epoch": 15.918367346938776, + "grad_norm": 1.159850001335144, + "learning_rate": 1.6816326530612244e-05, + "loss": 0.4173, + "step": 3120 + }, + { + "epoch": 15.96938775510204, + "grad_norm": 1.2680679559707642, + "learning_rate": 1.6806122448979594e-05, + "loss": 0.3529, + "step": 3130 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.3519056737422943, + "eval_runtime": 0.9395, + "eval_samples_per_second": 294.852, + "eval_steps_per_second": 37.256, + "step": 3136 + }, + { + "epoch": 16.020408163265305, + "grad_norm": 5.024880886077881, + "learning_rate": 1.679591836734694e-05, + "loss": 0.2527, + "step": 3140 + }, + { + "epoch": 16.071428571428573, + "grad_norm": 7.212440490722656, + "learning_rate": 1.678571428571429e-05, + "loss": 0.4876, + "step": 3150 + }, + { + "epoch": 16.122448979591837, + "grad_norm": 2.613367795944214, + "learning_rate": 1.6775510204081634e-05, + "loss": 0.1999, + "step": 3160 + }, + { + "epoch": 16.1734693877551, + "grad_norm": 4.305943965911865, + "learning_rate": 1.676530612244898e-05, + "loss": 0.1841, + "step": 3170 + }, + { + "epoch": 16.224489795918366, + "grad_norm": 4.246516227722168, + "learning_rate": 1.675510204081633e-05, + "loss": 0.2093, + "step": 3180 + }, + { + "epoch": 16.275510204081634, + "grad_norm": 1.0356781482696533, + "learning_rate": 1.6744897959183675e-05, + "loss": 0.3482, + "step": 3190 + }, + { + "epoch": 16.3265306122449, + "grad_norm": 6.927212238311768, + "learning_rate": 1.673469387755102e-05, + "loss": 0.4234, + "step": 3200 + }, + { + "epoch": 16.377551020408163, + "grad_norm": 1.6659942865371704, + "learning_rate": 1.6724489795918367e-05, + "loss": 0.3295, + "step": 3210 + }, + { + "epoch": 16.428571428571427, + "grad_norm": 3.484823703765869, + "learning_rate": 1.6714285714285716e-05, + "loss": 0.3618, + "step": 3220 + }, + { + "epoch": 16.479591836734695, + "grad_norm": 9.103923797607422, + "learning_rate": 1.6704081632653062e-05, + "loss": 0.3733, + "step": 3230 + }, + { + "epoch": 16.53061224489796, + "grad_norm": 3.0150811672210693, + "learning_rate": 1.669387755102041e-05, + "loss": 0.2741, + "step": 3240 + }, + { + "epoch": 16.581632653061224, + "grad_norm": 3.678077459335327, + "learning_rate": 1.6683673469387757e-05, + "loss": 0.2475, + "step": 3250 + }, + { + "epoch": 16.632653061224488, + "grad_norm": 4.152568817138672, + "learning_rate": 1.6673469387755102e-05, + "loss": 0.346, + "step": 3260 + }, + { + "epoch": 16.683673469387756, + "grad_norm": 3.0340733528137207, + "learning_rate": 1.666326530612245e-05, + "loss": 0.2615, + "step": 3270 + }, + { + "epoch": 16.73469387755102, + "grad_norm": 6.065981388092041, + "learning_rate": 1.6653061224489797e-05, + "loss": 0.4197, + "step": 3280 + }, + { + "epoch": 16.785714285714285, + "grad_norm": 4.897032260894775, + "learning_rate": 1.6642857142857147e-05, + "loss": 0.1801, + "step": 3290 + }, + { + "epoch": 16.836734693877553, + "grad_norm": 5.482302188873291, + "learning_rate": 1.6632653061224492e-05, + "loss": 0.6186, + "step": 3300 + }, + { + "epoch": 16.887755102040817, + "grad_norm": 3.145723581314087, + "learning_rate": 1.6622448979591838e-05, + "loss": 0.2651, + "step": 3310 + }, + { + "epoch": 16.93877551020408, + "grad_norm": 3.2254679203033447, + "learning_rate": 1.6612244897959184e-05, + "loss": 0.2739, + "step": 3320 + }, + { + "epoch": 16.989795918367346, + "grad_norm": 4.273975372314453, + "learning_rate": 1.6602040816326533e-05, + "loss": 0.3859, + "step": 3330 + }, + { + "epoch": 17.0, + "eval_accuracy": 0.9061371841155235, + "eval_loss": 0.33103030920028687, + "eval_runtime": 0.9376, + "eval_samples_per_second": 295.433, + "eval_steps_per_second": 37.329, + "step": 3332 + }, + { + "epoch": 17.040816326530614, + "grad_norm": 5.280905246734619, + "learning_rate": 1.659183673469388e-05, + "loss": 0.3949, + "step": 3340 + }, + { + "epoch": 17.091836734693878, + "grad_norm": 3.1650149822235107, + "learning_rate": 1.6581632653061225e-05, + "loss": 0.272, + "step": 3350 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 1.1566836833953857, + "learning_rate": 1.6571428571428574e-05, + "loss": 0.174, + "step": 3360 + }, + { + "epoch": 17.193877551020407, + "grad_norm": 3.5513110160827637, + "learning_rate": 1.656122448979592e-05, + "loss": 0.4144, + "step": 3370 + }, + { + "epoch": 17.244897959183675, + "grad_norm": 9.73366928100586, + "learning_rate": 1.655102040816327e-05, + "loss": 0.5116, + "step": 3380 + }, + { + "epoch": 17.29591836734694, + "grad_norm": 4.954830169677734, + "learning_rate": 1.6540816326530615e-05, + "loss": 0.1691, + "step": 3390 + }, + { + "epoch": 17.346938775510203, + "grad_norm": 0.931755781173706, + "learning_rate": 1.653061224489796e-05, + "loss": 0.397, + "step": 3400 + }, + { + "epoch": 17.397959183673468, + "grad_norm": 2.4209439754486084, + "learning_rate": 1.6520408163265306e-05, + "loss": 0.3927, + "step": 3410 + }, + { + "epoch": 17.448979591836736, + "grad_norm": 0.28612983226776123, + "learning_rate": 1.6510204081632655e-05, + "loss": 0.3412, + "step": 3420 + }, + { + "epoch": 17.5, + "grad_norm": 0.8477025628089905, + "learning_rate": 1.65e-05, + "loss": 0.5129, + "step": 3430 + }, + { + "epoch": 17.551020408163264, + "grad_norm": 8.41015338897705, + "learning_rate": 1.6489795918367347e-05, + "loss": 0.337, + "step": 3440 + }, + { + "epoch": 17.602040816326532, + "grad_norm": 1.4178277254104614, + "learning_rate": 1.6479591836734696e-05, + "loss": 0.3139, + "step": 3450 + }, + { + "epoch": 17.653061224489797, + "grad_norm": 3.9586143493652344, + "learning_rate": 1.6469387755102042e-05, + "loss": 0.3928, + "step": 3460 + }, + { + "epoch": 17.70408163265306, + "grad_norm": 3.131901264190674, + "learning_rate": 1.645918367346939e-05, + "loss": 0.3992, + "step": 3470 + }, + { + "epoch": 17.755102040816325, + "grad_norm": 2.3614821434020996, + "learning_rate": 1.6448979591836737e-05, + "loss": 0.5565, + "step": 3480 + }, + { + "epoch": 17.806122448979593, + "grad_norm": 0.8451371788978577, + "learning_rate": 1.6438775510204083e-05, + "loss": 0.6182, + "step": 3490 + }, + { + "epoch": 17.857142857142858, + "grad_norm": 8.3782377243042, + "learning_rate": 1.642857142857143e-05, + "loss": 0.2668, + "step": 3500 + }, + { + "epoch": 17.908163265306122, + "grad_norm": 3.256161689758301, + "learning_rate": 1.6418367346938778e-05, + "loss": 0.2882, + "step": 3510 + }, + { + "epoch": 17.959183673469386, + "grad_norm": 3.928743839263916, + "learning_rate": 1.6408163265306124e-05, + "loss": 0.3557, + "step": 3520 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.34751203656196594, + "eval_runtime": 0.9989, + "eval_samples_per_second": 277.319, + "eval_steps_per_second": 35.04, + "step": 3528 + }, + { + "epoch": 18.010204081632654, + "grad_norm": 0.5895347595214844, + "learning_rate": 1.639795918367347e-05, + "loss": 0.3817, + "step": 3530 + }, + { + "epoch": 18.06122448979592, + "grad_norm": 0.8526825904846191, + "learning_rate": 1.638775510204082e-05, + "loss": 0.4163, + "step": 3540 + }, + { + "epoch": 18.112244897959183, + "grad_norm": 7.634555816650391, + "learning_rate": 1.6377551020408164e-05, + "loss": 0.3727, + "step": 3550 + }, + { + "epoch": 18.163265306122447, + "grad_norm": 7.4755682945251465, + "learning_rate": 1.6367346938775513e-05, + "loss": 0.3624, + "step": 3560 + }, + { + "epoch": 18.214285714285715, + "grad_norm": 10.033106803894043, + "learning_rate": 1.635714285714286e-05, + "loss": 0.2825, + "step": 3570 + }, + { + "epoch": 18.26530612244898, + "grad_norm": 7.6165289878845215, + "learning_rate": 1.6346938775510205e-05, + "loss": 0.441, + "step": 3580 + }, + { + "epoch": 18.316326530612244, + "grad_norm": 2.656892776489258, + "learning_rate": 1.633673469387755e-05, + "loss": 0.3865, + "step": 3590 + }, + { + "epoch": 18.367346938775512, + "grad_norm": 3.905784845352173, + "learning_rate": 1.63265306122449e-05, + "loss": 0.4773, + "step": 3600 + }, + { + "epoch": 18.418367346938776, + "grad_norm": 2.772597551345825, + "learning_rate": 1.6316326530612246e-05, + "loss": 0.435, + "step": 3610 + }, + { + "epoch": 18.46938775510204, + "grad_norm": 8.21883487701416, + "learning_rate": 1.630612244897959e-05, + "loss": 0.3305, + "step": 3620 + }, + { + "epoch": 18.520408163265305, + "grad_norm": 7.803742408752441, + "learning_rate": 1.629591836734694e-05, + "loss": 0.2956, + "step": 3630 + }, + { + "epoch": 18.571428571428573, + "grad_norm": 1.3410613536834717, + "learning_rate": 1.6285714285714287e-05, + "loss": 0.3765, + "step": 3640 + }, + { + "epoch": 18.622448979591837, + "grad_norm": 8.857250213623047, + "learning_rate": 1.6275510204081636e-05, + "loss": 0.535, + "step": 3650 + }, + { + "epoch": 18.6734693877551, + "grad_norm": 7.803662300109863, + "learning_rate": 1.626530612244898e-05, + "loss": 0.3534, + "step": 3660 + }, + { + "epoch": 18.724489795918366, + "grad_norm": 0.8503233194351196, + "learning_rate": 1.6255102040816327e-05, + "loss": 0.3612, + "step": 3670 + }, + { + "epoch": 18.775510204081634, + "grad_norm": 4.057595252990723, + "learning_rate": 1.6244897959183673e-05, + "loss": 0.3237, + "step": 3680 + }, + { + "epoch": 18.8265306122449, + "grad_norm": 3.173178195953369, + "learning_rate": 1.6234693877551022e-05, + "loss": 0.1646, + "step": 3690 + }, + { + "epoch": 18.877551020408163, + "grad_norm": 3.036897659301758, + "learning_rate": 1.6224489795918368e-05, + "loss": 0.1965, + "step": 3700 + }, + { + "epoch": 18.928571428571427, + "grad_norm": 6.649494647979736, + "learning_rate": 1.6214285714285717e-05, + "loss": 0.4771, + "step": 3710 + }, + { + "epoch": 18.979591836734695, + "grad_norm": 1.3008012771606445, + "learning_rate": 1.6204081632653063e-05, + "loss": 0.4979, + "step": 3720 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.8592057761732852, + "eval_loss": 0.38391339778900146, + "eval_runtime": 0.9328, + "eval_samples_per_second": 296.961, + "eval_steps_per_second": 37.522, + "step": 3724 + }, + { + "epoch": 19.03061224489796, + "grad_norm": 5.8646464347839355, + "learning_rate": 1.619387755102041e-05, + "loss": 0.4242, + "step": 3730 + }, + { + "epoch": 19.081632653061224, + "grad_norm": 4.661500453948975, + "learning_rate": 1.6183673469387758e-05, + "loss": 0.2847, + "step": 3740 + }, + { + "epoch": 19.132653061224488, + "grad_norm": 2.1249194145202637, + "learning_rate": 1.6173469387755104e-05, + "loss": 0.4618, + "step": 3750 + }, + { + "epoch": 19.183673469387756, + "grad_norm": 6.6412458419799805, + "learning_rate": 1.616326530612245e-05, + "loss": 0.4795, + "step": 3760 + }, + { + "epoch": 19.23469387755102, + "grad_norm": 5.910946369171143, + "learning_rate": 1.6153061224489795e-05, + "loss": 0.2605, + "step": 3770 + }, + { + "epoch": 19.285714285714285, + "grad_norm": 5.426766395568848, + "learning_rate": 1.6142857142857145e-05, + "loss": 0.3467, + "step": 3780 + }, + { + "epoch": 19.336734693877553, + "grad_norm": 2.4188570976257324, + "learning_rate": 1.613265306122449e-05, + "loss": 0.3199, + "step": 3790 + }, + { + "epoch": 19.387755102040817, + "grad_norm": 3.9347238540649414, + "learning_rate": 1.612244897959184e-05, + "loss": 0.4666, + "step": 3800 + }, + { + "epoch": 19.43877551020408, + "grad_norm": 1.762243628501892, + "learning_rate": 1.6112244897959185e-05, + "loss": 0.1945, + "step": 3810 + }, + { + "epoch": 19.489795918367346, + "grad_norm": 1.250897765159607, + "learning_rate": 1.610204081632653e-05, + "loss": 0.3725, + "step": 3820 + }, + { + "epoch": 19.540816326530614, + "grad_norm": 4.747711658477783, + "learning_rate": 1.609183673469388e-05, + "loss": 0.3012, + "step": 3830 + }, + { + "epoch": 19.591836734693878, + "grad_norm": 6.576929092407227, + "learning_rate": 1.6081632653061226e-05, + "loss": 0.3594, + "step": 3840 + }, + { + "epoch": 19.642857142857142, + "grad_norm": 5.363382816314697, + "learning_rate": 1.6071428571428572e-05, + "loss": 0.32, + "step": 3850 + }, + { + "epoch": 19.693877551020407, + "grad_norm": 1.3070534467697144, + "learning_rate": 1.6061224489795918e-05, + "loss": 0.4294, + "step": 3860 + }, + { + "epoch": 19.744897959183675, + "grad_norm": 7.561715126037598, + "learning_rate": 1.6051020408163267e-05, + "loss": 0.3365, + "step": 3870 + }, + { + "epoch": 19.79591836734694, + "grad_norm": 3.3143508434295654, + "learning_rate": 1.6040816326530613e-05, + "loss": 0.3049, + "step": 3880 + }, + { + "epoch": 19.846938775510203, + "grad_norm": 3.5196471214294434, + "learning_rate": 1.6030612244897962e-05, + "loss": 0.2679, + "step": 3890 + }, + { + "epoch": 19.897959183673468, + "grad_norm": 0.6857045292854309, + "learning_rate": 1.6020408163265308e-05, + "loss": 0.2322, + "step": 3900 + }, + { + "epoch": 19.948979591836736, + "grad_norm": 0.5474905371665955, + "learning_rate": 1.6010204081632653e-05, + "loss": 0.2885, + "step": 3910 + }, + { + "epoch": 20.0, + "grad_norm": 2.948782444000244, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.7133, + "step": 3920 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.9133574007220217, + "eval_loss": 0.3031846880912781, + "eval_runtime": 0.9345, + "eval_samples_per_second": 296.417, + "eval_steps_per_second": 37.453, + "step": 3920 + }, + { + "epoch": 20.051020408163264, + "grad_norm": 8.756710052490234, + "learning_rate": 1.598979591836735e-05, + "loss": 0.4335, + "step": 3930 + }, + { + "epoch": 20.102040816326532, + "grad_norm": 7.383957862854004, + "learning_rate": 1.5979591836734694e-05, + "loss": 0.394, + "step": 3940 + }, + { + "epoch": 20.153061224489797, + "grad_norm": 5.428812503814697, + "learning_rate": 1.596938775510204e-05, + "loss": 0.3772, + "step": 3950 + }, + { + "epoch": 20.20408163265306, + "grad_norm": 8.893187522888184, + "learning_rate": 1.595918367346939e-05, + "loss": 0.3712, + "step": 3960 + }, + { + "epoch": 20.255102040816325, + "grad_norm": 2.3695433139801025, + "learning_rate": 1.5948979591836735e-05, + "loss": 0.3681, + "step": 3970 + }, + { + "epoch": 20.306122448979593, + "grad_norm": 5.1946611404418945, + "learning_rate": 1.5938775510204084e-05, + "loss": 0.4013, + "step": 3980 + }, + { + "epoch": 20.357142857142858, + "grad_norm": 4.6183342933654785, + "learning_rate": 1.592857142857143e-05, + "loss": 0.4489, + "step": 3990 + }, + { + "epoch": 20.408163265306122, + "grad_norm": 2.7871506214141846, + "learning_rate": 1.5918367346938776e-05, + "loss": 0.5088, + "step": 4000 + }, + { + "epoch": 20.459183673469386, + "grad_norm": 4.250699996948242, + "learning_rate": 1.5908163265306125e-05, + "loss": 0.3401, + "step": 4010 + }, + { + "epoch": 20.510204081632654, + "grad_norm": 5.782101631164551, + "learning_rate": 1.589795918367347e-05, + "loss": 0.2461, + "step": 4020 + }, + { + "epoch": 20.56122448979592, + "grad_norm": 5.054870128631592, + "learning_rate": 1.588775510204082e-05, + "loss": 0.2459, + "step": 4030 + }, + { + "epoch": 20.612244897959183, + "grad_norm": 8.228409767150879, + "learning_rate": 1.5877551020408162e-05, + "loss": 0.223, + "step": 4040 + }, + { + "epoch": 20.663265306122447, + "grad_norm": 1.5619908571243286, + "learning_rate": 1.586734693877551e-05, + "loss": 0.3247, + "step": 4050 + }, + { + "epoch": 20.714285714285715, + "grad_norm": 8.718620300292969, + "learning_rate": 1.5857142857142857e-05, + "loss": 0.2373, + "step": 4060 + }, + { + "epoch": 20.76530612244898, + "grad_norm": 0.9933898448944092, + "learning_rate": 1.5846938775510206e-05, + "loss": 0.196, + "step": 4070 + }, + { + "epoch": 20.816326530612244, + "grad_norm": 6.243815898895264, + "learning_rate": 1.5836734693877552e-05, + "loss": 0.1823, + "step": 4080 + }, + { + "epoch": 20.867346938775512, + "grad_norm": 4.635849952697754, + "learning_rate": 1.5826530612244898e-05, + "loss": 0.1604, + "step": 4090 + }, + { + "epoch": 20.918367346938776, + "grad_norm": 4.620690822601318, + "learning_rate": 1.5816326530612247e-05, + "loss": 0.383, + "step": 4100 + }, + { + "epoch": 20.96938775510204, + "grad_norm": 1.3753141164779663, + "learning_rate": 1.5806122448979593e-05, + "loss": 0.4489, + "step": 4110 + }, + { + "epoch": 21.0, + "eval_accuracy": 0.851985559566787, + "eval_loss": 0.4246116876602173, + "eval_runtime": 0.9757, + "eval_samples_per_second": 283.894, + "eval_steps_per_second": 35.871, + "step": 4116 + }, + { + "epoch": 21.020408163265305, + "grad_norm": 10.26496696472168, + "learning_rate": 1.5795918367346942e-05, + "loss": 0.6521, + "step": 4120 + }, + { + "epoch": 21.071428571428573, + "grad_norm": 12.259634971618652, + "learning_rate": 1.5785714285714288e-05, + "loss": 0.5814, + "step": 4130 + }, + { + "epoch": 21.122448979591837, + "grad_norm": 11.424445152282715, + "learning_rate": 1.5775510204081634e-05, + "loss": 0.3028, + "step": 4140 + }, + { + "epoch": 21.1734693877551, + "grad_norm": 3.9125359058380127, + "learning_rate": 1.576530612244898e-05, + "loss": 0.2002, + "step": 4150 + }, + { + "epoch": 21.224489795918366, + "grad_norm": 5.634211540222168, + "learning_rate": 1.575510204081633e-05, + "loss": 0.2863, + "step": 4160 + }, + { + "epoch": 21.275510204081634, + "grad_norm": 0.6257736682891846, + "learning_rate": 1.5744897959183675e-05, + "loss": 0.3551, + "step": 4170 + }, + { + "epoch": 21.3265306122449, + "grad_norm": 5.202589988708496, + "learning_rate": 1.573469387755102e-05, + "loss": 0.3546, + "step": 4180 + }, + { + "epoch": 21.377551020408163, + "grad_norm": 1.4646656513214111, + "learning_rate": 1.572448979591837e-05, + "loss": 0.2139, + "step": 4190 + }, + { + "epoch": 21.428571428571427, + "grad_norm": 2.697577953338623, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.2515, + "step": 4200 + }, + { + "epoch": 21.479591836734695, + "grad_norm": 0.9905484318733215, + "learning_rate": 1.5704081632653065e-05, + "loss": 0.3543, + "step": 4210 + }, + { + "epoch": 21.53061224489796, + "grad_norm": 0.8990330100059509, + "learning_rate": 1.569387755102041e-05, + "loss": 0.2124, + "step": 4220 + }, + { + "epoch": 21.581632653061224, + "grad_norm": 2.288618326187134, + "learning_rate": 1.5683673469387756e-05, + "loss": 0.3029, + "step": 4230 + }, + { + "epoch": 21.632653061224488, + "grad_norm": 5.491371154785156, + "learning_rate": 1.5673469387755102e-05, + "loss": 0.1861, + "step": 4240 + }, + { + "epoch": 21.683673469387756, + "grad_norm": 4.103919982910156, + "learning_rate": 1.566326530612245e-05, + "loss": 0.2237, + "step": 4250 + }, + { + "epoch": 21.73469387755102, + "grad_norm": 5.436093330383301, + "learning_rate": 1.5653061224489797e-05, + "loss": 0.1873, + "step": 4260 + }, + { + "epoch": 21.785714285714285, + "grad_norm": 1.5695730447769165, + "learning_rate": 1.5642857142857143e-05, + "loss": 0.3701, + "step": 4270 + }, + { + "epoch": 21.836734693877553, + "grad_norm": 8.12942886352539, + "learning_rate": 1.5632653061224492e-05, + "loss": 0.5966, + "step": 4280 + }, + { + "epoch": 21.887755102040817, + "grad_norm": 1.9971330165863037, + "learning_rate": 1.5622448979591838e-05, + "loss": 0.4555, + "step": 4290 + }, + { + "epoch": 21.93877551020408, + "grad_norm": 8.867376327514648, + "learning_rate": 1.5612244897959187e-05, + "loss": 0.3845, + "step": 4300 + }, + { + "epoch": 21.989795918367346, + "grad_norm": 3.649428129196167, + "learning_rate": 1.5602040816326533e-05, + "loss": 0.2605, + "step": 4310 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.29509276151657104, + "eval_runtime": 0.9476, + "eval_samples_per_second": 292.329, + "eval_steps_per_second": 36.937, + "step": 4312 + }, + { + "epoch": 22.040816326530614, + "grad_norm": 8.780238151550293, + "learning_rate": 1.559183673469388e-05, + "loss": 0.3356, + "step": 4320 + }, + { + "epoch": 22.091836734693878, + "grad_norm": 4.019279479980469, + "learning_rate": 1.5581632653061224e-05, + "loss": 0.3704, + "step": 4330 + }, + { + "epoch": 22.142857142857142, + "grad_norm": 6.3763556480407715, + "learning_rate": 1.5571428571428573e-05, + "loss": 0.3621, + "step": 4340 + }, + { + "epoch": 22.193877551020407, + "grad_norm": 10.312544822692871, + "learning_rate": 1.556122448979592e-05, + "loss": 0.364, + "step": 4350 + }, + { + "epoch": 22.244897959183675, + "grad_norm": 7.919386386871338, + "learning_rate": 1.5551020408163265e-05, + "loss": 0.2917, + "step": 4360 + }, + { + "epoch": 22.29591836734694, + "grad_norm": 9.648198127746582, + "learning_rate": 1.5540816326530614e-05, + "loss": 0.4103, + "step": 4370 + }, + { + "epoch": 22.346938775510203, + "grad_norm": 10.067304611206055, + "learning_rate": 1.553061224489796e-05, + "loss": 0.4598, + "step": 4380 + }, + { + "epoch": 22.397959183673468, + "grad_norm": 3.0613834857940674, + "learning_rate": 1.552040816326531e-05, + "loss": 0.2636, + "step": 4390 + }, + { + "epoch": 22.448979591836736, + "grad_norm": 4.73079252243042, + "learning_rate": 1.5510204081632655e-05, + "loss": 0.2499, + "step": 4400 + }, + { + "epoch": 22.5, + "grad_norm": 11.9879150390625, + "learning_rate": 1.55e-05, + "loss": 0.486, + "step": 4410 + }, + { + "epoch": 22.551020408163264, + "grad_norm": 2.766538619995117, + "learning_rate": 1.5489795918367346e-05, + "loss": 0.3824, + "step": 4420 + }, + { + "epoch": 22.602040816326532, + "grad_norm": 1.4572681188583374, + "learning_rate": 1.5479591836734696e-05, + "loss": 0.3105, + "step": 4430 + }, + { + "epoch": 22.653061224489797, + "grad_norm": 4.498220920562744, + "learning_rate": 1.546938775510204e-05, + "loss": 0.2207, + "step": 4440 + }, + { + "epoch": 22.70408163265306, + "grad_norm": 4.226017475128174, + "learning_rate": 1.545918367346939e-05, + "loss": 0.2277, + "step": 4450 + }, + { + "epoch": 22.755102040816325, + "grad_norm": 3.2783775329589844, + "learning_rate": 1.5448979591836736e-05, + "loss": 0.207, + "step": 4460 + }, + { + "epoch": 22.806122448979593, + "grad_norm": 2.267697811126709, + "learning_rate": 1.5438775510204082e-05, + "loss": 0.3034, + "step": 4470 + }, + { + "epoch": 22.857142857142858, + "grad_norm": 2.044019937515259, + "learning_rate": 1.542857142857143e-05, + "loss": 0.2496, + "step": 4480 + }, + { + "epoch": 22.908163265306122, + "grad_norm": 1.8587689399719238, + "learning_rate": 1.5418367346938777e-05, + "loss": 0.3425, + "step": 4490 + }, + { + "epoch": 22.959183673469386, + "grad_norm": 9.889906883239746, + "learning_rate": 1.5408163265306123e-05, + "loss": 0.3787, + "step": 4500 + }, + { + "epoch": 23.0, + "eval_accuracy": 0.851985559566787, + "eval_loss": 0.4356595277786255, + "eval_runtime": 0.9334, + "eval_samples_per_second": 296.758, + "eval_steps_per_second": 37.497, + "step": 4508 + }, + { + "epoch": 23.010204081632654, + "grad_norm": 9.485286712646484, + "learning_rate": 1.539795918367347e-05, + "loss": 0.2368, + "step": 4510 + }, + { + "epoch": 23.06122448979592, + "grad_norm": 4.098474025726318, + "learning_rate": 1.5387755102040818e-05, + "loss": 0.3777, + "step": 4520 + }, + { + "epoch": 23.112244897959183, + "grad_norm": 3.54447078704834, + "learning_rate": 1.5377551020408164e-05, + "loss": 0.2732, + "step": 4530 + }, + { + "epoch": 23.163265306122447, + "grad_norm": 12.153559684753418, + "learning_rate": 1.5367346938775513e-05, + "loss": 0.4394, + "step": 4540 + }, + { + "epoch": 23.214285714285715, + "grad_norm": 5.04149055480957, + "learning_rate": 1.535714285714286e-05, + "loss": 0.282, + "step": 4550 + }, + { + "epoch": 23.26530612244898, + "grad_norm": 2.7063755989074707, + "learning_rate": 1.5346938775510204e-05, + "loss": 0.2617, + "step": 4560 + }, + { + "epoch": 23.316326530612244, + "grad_norm": 2.060274124145508, + "learning_rate": 1.5336734693877554e-05, + "loss": 0.2776, + "step": 4570 + }, + { + "epoch": 23.367346938775512, + "grad_norm": 4.662898063659668, + "learning_rate": 1.53265306122449e-05, + "loss": 0.3443, + "step": 4580 + }, + { + "epoch": 23.418367346938776, + "grad_norm": 7.403005123138428, + "learning_rate": 1.5316326530612245e-05, + "loss": 0.3008, + "step": 4590 + }, + { + "epoch": 23.46938775510204, + "grad_norm": 7.6282958984375, + "learning_rate": 1.530612244897959e-05, + "loss": 0.4093, + "step": 4600 + }, + { + "epoch": 23.520408163265305, + "grad_norm": 3.8396637439727783, + "learning_rate": 1.529591836734694e-05, + "loss": 0.2757, + "step": 4610 + }, + { + "epoch": 23.571428571428573, + "grad_norm": 3.220073699951172, + "learning_rate": 1.5285714285714286e-05, + "loss": 0.2751, + "step": 4620 + }, + { + "epoch": 23.622448979591837, + "grad_norm": 5.4397993087768555, + "learning_rate": 1.5275510204081635e-05, + "loss": 0.3633, + "step": 4630 + }, + { + "epoch": 23.6734693877551, + "grad_norm": 1.3962507247924805, + "learning_rate": 1.526530612244898e-05, + "loss": 0.1703, + "step": 4640 + }, + { + "epoch": 23.724489795918366, + "grad_norm": 1.0168291330337524, + "learning_rate": 1.5255102040816327e-05, + "loss": 0.3073, + "step": 4650 + }, + { + "epoch": 23.775510204081634, + "grad_norm": 8.263096809387207, + "learning_rate": 1.5244897959183676e-05, + "loss": 0.2385, + "step": 4660 + }, + { + "epoch": 23.8265306122449, + "grad_norm": 6.277915954589844, + "learning_rate": 1.5234693877551022e-05, + "loss": 0.374, + "step": 4670 + }, + { + "epoch": 23.877551020408163, + "grad_norm": 2.2303617000579834, + "learning_rate": 1.522448979591837e-05, + "loss": 0.4268, + "step": 4680 + }, + { + "epoch": 23.928571428571427, + "grad_norm": 8.869429588317871, + "learning_rate": 1.5214285714285715e-05, + "loss": 0.2736, + "step": 4690 + }, + { + "epoch": 23.979591836734695, + "grad_norm": 0.8636966943740845, + "learning_rate": 1.5204081632653063e-05, + "loss": 0.3015, + "step": 4700 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.399036169052124, + "eval_runtime": 0.9356, + "eval_samples_per_second": 296.067, + "eval_steps_per_second": 37.409, + "step": 4704 + }, + { + "epoch": 24.03061224489796, + "grad_norm": 8.083073616027832, + "learning_rate": 1.5193877551020408e-05, + "loss": 0.3505, + "step": 4710 + }, + { + "epoch": 24.081632653061224, + "grad_norm": 2.8755526542663574, + "learning_rate": 1.5183673469387756e-05, + "loss": 0.3789, + "step": 4720 + }, + { + "epoch": 24.132653061224488, + "grad_norm": 5.0803542137146, + "learning_rate": 1.5173469387755105e-05, + "loss": 0.3118, + "step": 4730 + }, + { + "epoch": 24.183673469387756, + "grad_norm": 2.7088780403137207, + "learning_rate": 1.516326530612245e-05, + "loss": 0.5032, + "step": 4740 + }, + { + "epoch": 24.23469387755102, + "grad_norm": 1.4047300815582275, + "learning_rate": 1.5153061224489798e-05, + "loss": 0.4985, + "step": 4750 + }, + { + "epoch": 24.285714285714285, + "grad_norm": 1.823101282119751, + "learning_rate": 1.5142857142857144e-05, + "loss": 0.3021, + "step": 4760 + }, + { + "epoch": 24.336734693877553, + "grad_norm": 3.2638440132141113, + "learning_rate": 1.5132653061224492e-05, + "loss": 0.2204, + "step": 4770 + }, + { + "epoch": 24.387755102040817, + "grad_norm": 1.3173271417617798, + "learning_rate": 1.5122448979591837e-05, + "loss": 0.1328, + "step": 4780 + }, + { + "epoch": 24.43877551020408, + "grad_norm": 1.607823133468628, + "learning_rate": 1.5112244897959185e-05, + "loss": 0.2578, + "step": 4790 + }, + { + "epoch": 24.489795918367346, + "grad_norm": 4.061956405639648, + "learning_rate": 1.510204081632653e-05, + "loss": 0.3741, + "step": 4800 + }, + { + "epoch": 24.540816326530614, + "grad_norm": 5.889439582824707, + "learning_rate": 1.5091836734693878e-05, + "loss": 0.4102, + "step": 4810 + }, + { + "epoch": 24.591836734693878, + "grad_norm": 10.062840461730957, + "learning_rate": 1.5081632653061227e-05, + "loss": 0.3837, + "step": 4820 + }, + { + "epoch": 24.642857142857142, + "grad_norm": 0.9003174304962158, + "learning_rate": 1.5071428571428573e-05, + "loss": 0.1315, + "step": 4830 + }, + { + "epoch": 24.693877551020407, + "grad_norm": 5.709663391113281, + "learning_rate": 1.506122448979592e-05, + "loss": 0.4183, + "step": 4840 + }, + { + "epoch": 24.744897959183675, + "grad_norm": 8.025142669677734, + "learning_rate": 1.5051020408163266e-05, + "loss": 0.5562, + "step": 4850 + }, + { + "epoch": 24.79591836734694, + "grad_norm": 2.7048165798187256, + "learning_rate": 1.5040816326530614e-05, + "loss": 0.2714, + "step": 4860 + }, + { + "epoch": 24.846938775510203, + "grad_norm": 1.3981176614761353, + "learning_rate": 1.503061224489796e-05, + "loss": 0.2331, + "step": 4870 + }, + { + "epoch": 24.897959183673468, + "grad_norm": 6.711188316345215, + "learning_rate": 1.5020408163265307e-05, + "loss": 0.2991, + "step": 4880 + }, + { + "epoch": 24.948979591836736, + "grad_norm": 1.7855308055877686, + "learning_rate": 1.5010204081632653e-05, + "loss": 0.4607, + "step": 4890 + }, + { + "epoch": 25.0, + "grad_norm": 2.935310125350952, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.1965, + "step": 4900 + }, + { + "epoch": 25.0, + "eval_accuracy": 0.9097472924187726, + "eval_loss": 0.35359257459640503, + "eval_runtime": 0.9318, + "eval_samples_per_second": 297.276, + "eval_steps_per_second": 37.562, + "step": 4900 + }, + { + "epoch": 25.051020408163264, + "grad_norm": 11.405259132385254, + "learning_rate": 1.498979591836735e-05, + "loss": 0.3897, + "step": 4910 + }, + { + "epoch": 25.102040816326532, + "grad_norm": 8.36063003540039, + "learning_rate": 1.4979591836734695e-05, + "loss": 0.3899, + "step": 4920 + }, + { + "epoch": 25.153061224489797, + "grad_norm": 8.212508201599121, + "learning_rate": 1.4969387755102043e-05, + "loss": 0.2401, + "step": 4930 + }, + { + "epoch": 25.20408163265306, + "grad_norm": 0.3883512318134308, + "learning_rate": 1.4959183673469389e-05, + "loss": 0.4655, + "step": 4940 + }, + { + "epoch": 25.255102040816325, + "grad_norm": 8.68212890625, + "learning_rate": 1.4948979591836736e-05, + "loss": 0.4715, + "step": 4950 + }, + { + "epoch": 25.306122448979593, + "grad_norm": 1.6489917039871216, + "learning_rate": 1.4938775510204082e-05, + "loss": 0.1642, + "step": 4960 + }, + { + "epoch": 25.357142857142858, + "grad_norm": 1.582821011543274, + "learning_rate": 1.492857142857143e-05, + "loss": 0.3276, + "step": 4970 + }, + { + "epoch": 25.408163265306122, + "grad_norm": 0.6229325532913208, + "learning_rate": 1.4918367346938775e-05, + "loss": 0.3723, + "step": 4980 + }, + { + "epoch": 25.459183673469386, + "grad_norm": 2.6093995571136475, + "learning_rate": 1.4908163265306124e-05, + "loss": 0.1918, + "step": 4990 + }, + { + "epoch": 25.510204081632654, + "grad_norm": 5.734492301940918, + "learning_rate": 1.4897959183673472e-05, + "loss": 0.2327, + "step": 5000 + }, + { + "epoch": 25.56122448979592, + "grad_norm": 2.498661518096924, + "learning_rate": 1.4887755102040818e-05, + "loss": 0.1976, + "step": 5010 + }, + { + "epoch": 25.612244897959183, + "grad_norm": 4.310215473175049, + "learning_rate": 1.4877551020408165e-05, + "loss": 0.3573, + "step": 5020 + }, + { + "epoch": 25.663265306122447, + "grad_norm": 1.1879396438598633, + "learning_rate": 1.4867346938775511e-05, + "loss": 0.2937, + "step": 5030 + }, + { + "epoch": 25.714285714285715, + "grad_norm": 3.3825039863586426, + "learning_rate": 1.4857142857142858e-05, + "loss": 0.2267, + "step": 5040 + }, + { + "epoch": 25.76530612244898, + "grad_norm": 3.127350091934204, + "learning_rate": 1.4846938775510204e-05, + "loss": 0.3636, + "step": 5050 + }, + { + "epoch": 25.816326530612244, + "grad_norm": 1.4972935914993286, + "learning_rate": 1.4836734693877552e-05, + "loss": 0.1758, + "step": 5060 + }, + { + "epoch": 25.867346938775512, + "grad_norm": 12.983075141906738, + "learning_rate": 1.4826530612244897e-05, + "loss": 0.3909, + "step": 5070 + }, + { + "epoch": 25.918367346938776, + "grad_norm": 5.06458854675293, + "learning_rate": 1.4816326530612247e-05, + "loss": 0.38, + "step": 5080 + }, + { + "epoch": 25.96938775510204, + "grad_norm": 7.696865081787109, + "learning_rate": 1.4806122448979594e-05, + "loss": 0.3903, + "step": 5090 + }, + { + "epoch": 26.0, + "eval_accuracy": 0.8592057761732852, + "eval_loss": 0.4166140854358673, + "eval_runtime": 0.9346, + "eval_samples_per_second": 296.393, + "eval_steps_per_second": 37.45, + "step": 5096 + }, + { + "epoch": 26.020408163265305, + "grad_norm": 1.0229040384292603, + "learning_rate": 1.479591836734694e-05, + "loss": 0.1086, + "step": 5100 + }, + { + "epoch": 26.071428571428573, + "grad_norm": 3.7563860416412354, + "learning_rate": 1.4785714285714287e-05, + "loss": 0.3433, + "step": 5110 + }, + { + "epoch": 26.122448979591837, + "grad_norm": 2.8265397548675537, + "learning_rate": 1.4775510204081633e-05, + "loss": 0.2704, + "step": 5120 + }, + { + "epoch": 26.1734693877551, + "grad_norm": 0.6223945617675781, + "learning_rate": 1.476530612244898e-05, + "loss": 0.2733, + "step": 5130 + }, + { + "epoch": 26.224489795918366, + "grad_norm": 7.791995525360107, + "learning_rate": 1.4755102040816326e-05, + "loss": 0.3287, + "step": 5140 + }, + { + "epoch": 26.275510204081634, + "grad_norm": 6.363265514373779, + "learning_rate": 1.4744897959183676e-05, + "loss": 0.4244, + "step": 5150 + }, + { + "epoch": 26.3265306122449, + "grad_norm": 1.1195369958877563, + "learning_rate": 1.4734693877551021e-05, + "loss": 0.2089, + "step": 5160 + }, + { + "epoch": 26.377551020408163, + "grad_norm": 0.9491761922836304, + "learning_rate": 1.4724489795918369e-05, + "loss": 0.3159, + "step": 5170 + }, + { + "epoch": 26.428571428571427, + "grad_norm": 0.22193892300128937, + "learning_rate": 1.4714285714285716e-05, + "loss": 0.3834, + "step": 5180 + }, + { + "epoch": 26.479591836734695, + "grad_norm": 4.889684200286865, + "learning_rate": 1.4704081632653062e-05, + "loss": 0.3216, + "step": 5190 + }, + { + "epoch": 26.53061224489796, + "grad_norm": 4.601675987243652, + "learning_rate": 1.469387755102041e-05, + "loss": 0.3598, + "step": 5200 + }, + { + "epoch": 26.581632653061224, + "grad_norm": 7.6923441886901855, + "learning_rate": 1.4683673469387756e-05, + "loss": 0.2714, + "step": 5210 + }, + { + "epoch": 26.632653061224488, + "grad_norm": 8.871493339538574, + "learning_rate": 1.4673469387755103e-05, + "loss": 0.4931, + "step": 5220 + }, + { + "epoch": 26.683673469387756, + "grad_norm": 1.7044825553894043, + "learning_rate": 1.4663265306122449e-05, + "loss": 0.34, + "step": 5230 + }, + { + "epoch": 26.73469387755102, + "grad_norm": 2.5381484031677246, + "learning_rate": 1.4653061224489798e-05, + "loss": 0.2527, + "step": 5240 + }, + { + "epoch": 26.785714285714285, + "grad_norm": 11.087688446044922, + "learning_rate": 1.4642857142857144e-05, + "loss": 0.3773, + "step": 5250 + }, + { + "epoch": 26.836734693877553, + "grad_norm": 1.4621928930282593, + "learning_rate": 1.4632653061224491e-05, + "loss": 0.2989, + "step": 5260 + }, + { + "epoch": 26.887755102040817, + "grad_norm": 1.6922417879104614, + "learning_rate": 1.4622448979591839e-05, + "loss": 0.2947, + "step": 5270 + }, + { + "epoch": 26.93877551020408, + "grad_norm": 1.8192353248596191, + "learning_rate": 1.4612244897959185e-05, + "loss": 0.2285, + "step": 5280 + }, + { + "epoch": 26.989795918367346, + "grad_norm": 1.7619237899780273, + "learning_rate": 1.4602040816326532e-05, + "loss": 0.1902, + "step": 5290 + }, + { + "epoch": 27.0, + "eval_accuracy": 0.851985559566787, + "eval_loss": 0.4353993237018585, + "eval_runtime": 0.9607, + "eval_samples_per_second": 288.322, + "eval_steps_per_second": 36.431, + "step": 5292 + }, + { + "epoch": 27.040816326530614, + "grad_norm": 3.745148181915283, + "learning_rate": 1.4591836734693878e-05, + "loss": 0.1512, + "step": 5300 + }, + { + "epoch": 27.091836734693878, + "grad_norm": 6.893754482269287, + "learning_rate": 1.4581632653061227e-05, + "loss": 0.5395, + "step": 5310 + }, + { + "epoch": 27.142857142857142, + "grad_norm": 5.727985382080078, + "learning_rate": 1.4571428571428573e-05, + "loss": 0.3727, + "step": 5320 + }, + { + "epoch": 27.193877551020407, + "grad_norm": 1.6175438165664673, + "learning_rate": 1.456122448979592e-05, + "loss": 0.1489, + "step": 5330 + }, + { + "epoch": 27.244897959183675, + "grad_norm": 1.226585030555725, + "learning_rate": 1.4551020408163266e-05, + "loss": 0.3616, + "step": 5340 + }, + { + "epoch": 27.29591836734694, + "grad_norm": 3.1076860427856445, + "learning_rate": 1.4540816326530614e-05, + "loss": 0.2072, + "step": 5350 + }, + { + "epoch": 27.346938775510203, + "grad_norm": 5.883277416229248, + "learning_rate": 1.4530612244897961e-05, + "loss": 0.1949, + "step": 5360 + }, + { + "epoch": 27.397959183673468, + "grad_norm": 1.5578702688217163, + "learning_rate": 1.4520408163265307e-05, + "loss": 0.5629, + "step": 5370 + }, + { + "epoch": 27.448979591836736, + "grad_norm": 0.5179188847541809, + "learning_rate": 1.4510204081632654e-05, + "loss": 0.2987, + "step": 5380 + }, + { + "epoch": 27.5, + "grad_norm": 7.038111686706543, + "learning_rate": 1.45e-05, + "loss": 0.2434, + "step": 5390 + }, + { + "epoch": 27.551020408163264, + "grad_norm": 9.72697925567627, + "learning_rate": 1.448979591836735e-05, + "loss": 0.3516, + "step": 5400 + }, + { + "epoch": 27.602040816326532, + "grad_norm": 1.2628819942474365, + "learning_rate": 1.4479591836734695e-05, + "loss": 0.2303, + "step": 5410 + }, + { + "epoch": 27.653061224489797, + "grad_norm": 8.881223678588867, + "learning_rate": 1.4469387755102043e-05, + "loss": 0.3717, + "step": 5420 + }, + { + "epoch": 27.70408163265306, + "grad_norm": 0.9033036828041077, + "learning_rate": 1.4459183673469388e-05, + "loss": 0.4453, + "step": 5430 + }, + { + "epoch": 27.755102040816325, + "grad_norm": 2.869086503982544, + "learning_rate": 1.4448979591836736e-05, + "loss": 0.3423, + "step": 5440 + }, + { + "epoch": 27.806122448979593, + "grad_norm": 3.165790319442749, + "learning_rate": 1.4438775510204083e-05, + "loss": 0.3382, + "step": 5450 + }, + { + "epoch": 27.857142857142858, + "grad_norm": 1.9456369876861572, + "learning_rate": 1.4428571428571429e-05, + "loss": 0.4745, + "step": 5460 + }, + { + "epoch": 27.908163265306122, + "grad_norm": 10.290801048278809, + "learning_rate": 1.4418367346938778e-05, + "loss": 0.2637, + "step": 5470 + }, + { + "epoch": 27.959183673469386, + "grad_norm": 1.7097864151000977, + "learning_rate": 1.4408163265306122e-05, + "loss": 0.2089, + "step": 5480 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.8592057761732852, + "eval_loss": 0.4088672697544098, + "eval_runtime": 0.9344, + "eval_samples_per_second": 296.462, + "eval_steps_per_second": 37.459, + "step": 5488 + }, + { + "epoch": 28.010204081632654, + "grad_norm": 10.47132682800293, + "learning_rate": 1.4397959183673472e-05, + "loss": 0.3704, + "step": 5490 + }, + { + "epoch": 28.06122448979592, + "grad_norm": 2.4138362407684326, + "learning_rate": 1.4387755102040817e-05, + "loss": 0.2158, + "step": 5500 + }, + { + "epoch": 28.112244897959183, + "grad_norm": 14.150741577148438, + "learning_rate": 1.4377551020408165e-05, + "loss": 0.4084, + "step": 5510 + }, + { + "epoch": 28.163265306122447, + "grad_norm": 5.0319061279296875, + "learning_rate": 1.436734693877551e-05, + "loss": 0.2888, + "step": 5520 + }, + { + "epoch": 28.214285714285715, + "grad_norm": 8.061519622802734, + "learning_rate": 1.4357142857142858e-05, + "loss": 0.3848, + "step": 5530 + }, + { + "epoch": 28.26530612244898, + "grad_norm": 0.9189940690994263, + "learning_rate": 1.4346938775510206e-05, + "loss": 0.2323, + "step": 5540 + }, + { + "epoch": 28.316326530612244, + "grad_norm": 2.1076314449310303, + "learning_rate": 1.4336734693877551e-05, + "loss": 0.2321, + "step": 5550 + }, + { + "epoch": 28.367346938775512, + "grad_norm": 6.96281623840332, + "learning_rate": 1.43265306122449e-05, + "loss": 0.1701, + "step": 5560 + }, + { + "epoch": 28.418367346938776, + "grad_norm": 6.155792713165283, + "learning_rate": 1.4316326530612246e-05, + "loss": 0.3143, + "step": 5570 + }, + { + "epoch": 28.46938775510204, + "grad_norm": 6.938708782196045, + "learning_rate": 1.4306122448979594e-05, + "loss": 0.6464, + "step": 5580 + }, + { + "epoch": 28.520408163265305, + "grad_norm": 5.764797210693359, + "learning_rate": 1.429591836734694e-05, + "loss": 0.4086, + "step": 5590 + }, + { + "epoch": 28.571428571428573, + "grad_norm": 5.969749450683594, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.324, + "step": 5600 + }, + { + "epoch": 28.622448979591837, + "grad_norm": 3.8588509559631348, + "learning_rate": 1.4275510204081633e-05, + "loss": 0.2058, + "step": 5610 + }, + { + "epoch": 28.6734693877551, + "grad_norm": 0.7087581753730774, + "learning_rate": 1.426530612244898e-05, + "loss": 0.4878, + "step": 5620 + }, + { + "epoch": 28.724489795918366, + "grad_norm": 0.6094661355018616, + "learning_rate": 1.425510204081633e-05, + "loss": 0.2933, + "step": 5630 + }, + { + "epoch": 28.775510204081634, + "grad_norm": 8.96648120880127, + "learning_rate": 1.4244897959183674e-05, + "loss": 0.2122, + "step": 5640 + }, + { + "epoch": 28.8265306122449, + "grad_norm": 4.653097152709961, + "learning_rate": 1.4234693877551023e-05, + "loss": 0.2367, + "step": 5650 + }, + { + "epoch": 28.877551020408163, + "grad_norm": 1.0812416076660156, + "learning_rate": 1.4224489795918369e-05, + "loss": 0.2423, + "step": 5660 + }, + { + "epoch": 28.928571428571427, + "grad_norm": 1.8441146612167358, + "learning_rate": 1.4214285714285716e-05, + "loss": 0.3556, + "step": 5670 + }, + { + "epoch": 28.979591836734695, + "grad_norm": 6.369021892547607, + "learning_rate": 1.4204081632653062e-05, + "loss": 0.3574, + "step": 5680 + }, + { + "epoch": 29.0, + "eval_accuracy": 0.8231046931407943, + "eval_loss": 0.4787161946296692, + "eval_runtime": 0.9366, + "eval_samples_per_second": 295.746, + "eval_steps_per_second": 37.369, + "step": 5684 + }, + { + "epoch": 29.03061224489796, + "grad_norm": 7.944931983947754, + "learning_rate": 1.419387755102041e-05, + "loss": 0.2558, + "step": 5690 + }, + { + "epoch": 29.081632653061224, + "grad_norm": 8.573824882507324, + "learning_rate": 1.4183673469387755e-05, + "loss": 0.2327, + "step": 5700 + }, + { + "epoch": 29.132653061224488, + "grad_norm": 11.918915748596191, + "learning_rate": 1.4173469387755103e-05, + "loss": 0.5803, + "step": 5710 + }, + { + "epoch": 29.183673469387756, + "grad_norm": 1.88558030128479, + "learning_rate": 1.4163265306122452e-05, + "loss": 0.2855, + "step": 5720 + }, + { + "epoch": 29.23469387755102, + "grad_norm": 1.4961941242218018, + "learning_rate": 1.4153061224489798e-05, + "loss": 0.5397, + "step": 5730 + }, + { + "epoch": 29.285714285714285, + "grad_norm": 4.03718376159668, + "learning_rate": 1.4142857142857145e-05, + "loss": 0.3524, + "step": 5740 + }, + { + "epoch": 29.336734693877553, + "grad_norm": 0.6138362884521484, + "learning_rate": 1.4132653061224491e-05, + "loss": 0.2927, + "step": 5750 + }, + { + "epoch": 29.387755102040817, + "grad_norm": 3.0943140983581543, + "learning_rate": 1.4122448979591838e-05, + "loss": 0.2377, + "step": 5760 + }, + { + "epoch": 29.43877551020408, + "grad_norm": 2.0186758041381836, + "learning_rate": 1.4112244897959184e-05, + "loss": 0.2708, + "step": 5770 + }, + { + "epoch": 29.489795918367346, + "grad_norm": 1.613853096961975, + "learning_rate": 1.4102040816326532e-05, + "loss": 0.2674, + "step": 5780 + }, + { + "epoch": 29.540816326530614, + "grad_norm": 5.511127948760986, + "learning_rate": 1.4091836734693877e-05, + "loss": 0.4286, + "step": 5790 + }, + { + "epoch": 29.591836734693878, + "grad_norm": 1.3316036462783813, + "learning_rate": 1.4081632653061225e-05, + "loss": 0.3182, + "step": 5800 + }, + { + "epoch": 29.642857142857142, + "grad_norm": 0.6130739450454712, + "learning_rate": 1.4071428571428574e-05, + "loss": 0.3473, + "step": 5810 + }, + { + "epoch": 29.693877551020407, + "grad_norm": 1.8860011100769043, + "learning_rate": 1.406122448979592e-05, + "loss": 0.2681, + "step": 5820 + }, + { + "epoch": 29.744897959183675, + "grad_norm": 10.419530868530273, + "learning_rate": 1.4051020408163267e-05, + "loss": 0.2665, + "step": 5830 + }, + { + "epoch": 29.79591836734694, + "grad_norm": 4.105865955352783, + "learning_rate": 1.4040816326530613e-05, + "loss": 0.2498, + "step": 5840 + }, + { + "epoch": 29.846938775510203, + "grad_norm": 0.8506119847297668, + "learning_rate": 1.403061224489796e-05, + "loss": 0.2422, + "step": 5850 + }, + { + "epoch": 29.897959183673468, + "grad_norm": 6.38096809387207, + "learning_rate": 1.4020408163265307e-05, + "loss": 0.1484, + "step": 5860 + }, + { + "epoch": 29.948979591836736, + "grad_norm": 1.4350731372833252, + "learning_rate": 1.4010204081632654e-05, + "loss": 0.3318, + "step": 5870 + }, + { + "epoch": 30.0, + "grad_norm": 2.968691110610962, + "learning_rate": 1.4e-05, + "loss": 0.3532, + "step": 5880 + }, + { + "epoch": 30.0, + "eval_accuracy": 0.9097472924187726, + "eval_loss": 0.3165305256843567, + "eval_runtime": 0.9457, + "eval_samples_per_second": 292.897, + "eval_steps_per_second": 37.009, + "step": 5880 + }, + { + "epoch": 30.051020408163264, + "grad_norm": 1.0100151300430298, + "learning_rate": 1.3989795918367349e-05, + "loss": 0.2086, + "step": 5890 + }, + { + "epoch": 30.102040816326532, + "grad_norm": 2.064863681793213, + "learning_rate": 1.3979591836734696e-05, + "loss": 0.2836, + "step": 5900 + }, + { + "epoch": 30.153061224489797, + "grad_norm": 11.070706367492676, + "learning_rate": 1.3969387755102042e-05, + "loss": 0.451, + "step": 5910 + }, + { + "epoch": 30.20408163265306, + "grad_norm": 1.8765898942947388, + "learning_rate": 1.395918367346939e-05, + "loss": 0.2461, + "step": 5920 + }, + { + "epoch": 30.255102040816325, + "grad_norm": 3.754838466644287, + "learning_rate": 1.3948979591836736e-05, + "loss": 0.4792, + "step": 5930 + }, + { + "epoch": 30.306122448979593, + "grad_norm": 1.705304503440857, + "learning_rate": 1.3938775510204083e-05, + "loss": 0.2855, + "step": 5940 + }, + { + "epoch": 30.357142857142858, + "grad_norm": 7.264845371246338, + "learning_rate": 1.3928571428571429e-05, + "loss": 0.3534, + "step": 5950 + }, + { + "epoch": 30.408163265306122, + "grad_norm": 4.922964572906494, + "learning_rate": 1.3918367346938776e-05, + "loss": 0.2287, + "step": 5960 + }, + { + "epoch": 30.459183673469386, + "grad_norm": 11.60462474822998, + "learning_rate": 1.3908163265306122e-05, + "loss": 0.3458, + "step": 5970 + }, + { + "epoch": 30.510204081632654, + "grad_norm": 9.114325523376465, + "learning_rate": 1.3897959183673471e-05, + "loss": 0.3271, + "step": 5980 + }, + { + "epoch": 30.56122448979592, + "grad_norm": 2.0705983638763428, + "learning_rate": 1.3887755102040819e-05, + "loss": 0.3466, + "step": 5990 + }, + { + "epoch": 30.612244897959183, + "grad_norm": 3.5480122566223145, + "learning_rate": 1.3877551020408165e-05, + "loss": 0.267, + "step": 6000 + }, + { + "epoch": 30.663265306122447, + "grad_norm": 2.192810535430908, + "learning_rate": 1.3867346938775512e-05, + "loss": 0.241, + "step": 6010 + }, + { + "epoch": 30.714285714285715, + "grad_norm": 2.5832903385162354, + "learning_rate": 1.3857142857142858e-05, + "loss": 0.2774, + "step": 6020 + }, + { + "epoch": 30.76530612244898, + "grad_norm": 0.5127232074737549, + "learning_rate": 1.3846938775510205e-05, + "loss": 0.1853, + "step": 6030 + }, + { + "epoch": 30.816326530612244, + "grad_norm": 1.949842095375061, + "learning_rate": 1.3836734693877551e-05, + "loss": 0.2198, + "step": 6040 + }, + { + "epoch": 30.867346938775512, + "grad_norm": 5.536325931549072, + "learning_rate": 1.38265306122449e-05, + "loss": 0.4455, + "step": 6050 + }, + { + "epoch": 30.918367346938776, + "grad_norm": 7.894504070281982, + "learning_rate": 1.3816326530612244e-05, + "loss": 0.2814, + "step": 6060 + }, + { + "epoch": 30.96938775510204, + "grad_norm": 8.459007263183594, + "learning_rate": 1.3806122448979594e-05, + "loss": 0.2967, + "step": 6070 + }, + { + "epoch": 31.0, + "eval_accuracy": 0.9133574007220217, + "eval_loss": 0.31053850054740906, + "eval_runtime": 0.9369, + "eval_samples_per_second": 295.665, + "eval_steps_per_second": 37.358, + "step": 6076 + }, + { + "epoch": 31.020408163265305, + "grad_norm": 0.4988439381122589, + "learning_rate": 1.3795918367346941e-05, + "loss": 0.2044, + "step": 6080 + }, + { + "epoch": 31.071428571428573, + "grad_norm": 5.99605131149292, + "learning_rate": 1.3785714285714287e-05, + "loss": 0.4048, + "step": 6090 + }, + { + "epoch": 31.122448979591837, + "grad_norm": 0.37132564187049866, + "learning_rate": 1.3775510204081634e-05, + "loss": 0.2627, + "step": 6100 + }, + { + "epoch": 31.1734693877551, + "grad_norm": 6.084136486053467, + "learning_rate": 1.376530612244898e-05, + "loss": 0.2869, + "step": 6110 + }, + { + "epoch": 31.224489795918366, + "grad_norm": 11.331133842468262, + "learning_rate": 1.3755102040816328e-05, + "loss": 0.273, + "step": 6120 + }, + { + "epoch": 31.275510204081634, + "grad_norm": 3.7786152362823486, + "learning_rate": 1.3744897959183673e-05, + "loss": 0.2437, + "step": 6130 + }, + { + "epoch": 31.3265306122449, + "grad_norm": 3.998039722442627, + "learning_rate": 1.3734693877551023e-05, + "loss": 0.1321, + "step": 6140 + }, + { + "epoch": 31.377551020408163, + "grad_norm": 10.547959327697754, + "learning_rate": 1.3724489795918368e-05, + "loss": 0.43, + "step": 6150 + }, + { + "epoch": 31.428571428571427, + "grad_norm": 1.094752311706543, + "learning_rate": 1.3714285714285716e-05, + "loss": 0.3469, + "step": 6160 + }, + { + "epoch": 31.479591836734695, + "grad_norm": 8.193305969238281, + "learning_rate": 1.3704081632653062e-05, + "loss": 0.1532, + "step": 6170 + }, + { + "epoch": 31.53061224489796, + "grad_norm": 7.120598793029785, + "learning_rate": 1.3693877551020409e-05, + "loss": 0.3421, + "step": 6180 + }, + { + "epoch": 31.581632653061224, + "grad_norm": 6.944319248199463, + "learning_rate": 1.3683673469387757e-05, + "loss": 0.3574, + "step": 6190 + }, + { + "epoch": 31.632653061224488, + "grad_norm": 7.441619873046875, + "learning_rate": 1.3673469387755102e-05, + "loss": 0.384, + "step": 6200 + }, + { + "epoch": 31.683673469387756, + "grad_norm": 5.745674133300781, + "learning_rate": 1.366326530612245e-05, + "loss": 0.4554, + "step": 6210 + }, + { + "epoch": 31.73469387755102, + "grad_norm": 8.12261962890625, + "learning_rate": 1.3653061224489796e-05, + "loss": 0.2278, + "step": 6220 + }, + { + "epoch": 31.785714285714285, + "grad_norm": 1.7826637029647827, + "learning_rate": 1.3642857142857145e-05, + "loss": 0.1841, + "step": 6230 + }, + { + "epoch": 31.836734693877553, + "grad_norm": 8.810450553894043, + "learning_rate": 1.363265306122449e-05, + "loss": 0.3919, + "step": 6240 + }, + { + "epoch": 31.887755102040817, + "grad_norm": 2.942906618118286, + "learning_rate": 1.3622448979591838e-05, + "loss": 0.1034, + "step": 6250 + }, + { + "epoch": 31.93877551020408, + "grad_norm": 7.177513599395752, + "learning_rate": 1.3612244897959184e-05, + "loss": 0.2669, + "step": 6260 + }, + { + "epoch": 31.989795918367346, + "grad_norm": 0.5618299245834351, + "learning_rate": 1.3602040816326531e-05, + "loss": 0.2364, + "step": 6270 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.9061371841155235, + "eval_loss": 0.3559906780719757, + "eval_runtime": 0.9792, + "eval_samples_per_second": 282.89, + "eval_steps_per_second": 35.744, + "step": 6272 + }, + { + "epoch": 32.04081632653061, + "grad_norm": 0.4897635877132416, + "learning_rate": 1.3591836734693879e-05, + "loss": 0.4591, + "step": 6280 + }, + { + "epoch": 32.09183673469388, + "grad_norm": 5.982337474822998, + "learning_rate": 1.3581632653061225e-05, + "loss": 0.4622, + "step": 6290 + }, + { + "epoch": 32.142857142857146, + "grad_norm": 14.7687349319458, + "learning_rate": 1.3571428571428574e-05, + "loss": 0.3969, + "step": 6300 + }, + { + "epoch": 32.19387755102041, + "grad_norm": 1.2857763767242432, + "learning_rate": 1.356122448979592e-05, + "loss": 0.0797, + "step": 6310 + }, + { + "epoch": 32.244897959183675, + "grad_norm": 5.122455596923828, + "learning_rate": 1.3551020408163267e-05, + "loss": 0.2258, + "step": 6320 + }, + { + "epoch": 32.295918367346935, + "grad_norm": 3.5429632663726807, + "learning_rate": 1.3540816326530613e-05, + "loss": 0.3753, + "step": 6330 + }, + { + "epoch": 32.3469387755102, + "grad_norm": 12.08903694152832, + "learning_rate": 1.353061224489796e-05, + "loss": 0.3908, + "step": 6340 + }, + { + "epoch": 32.39795918367347, + "grad_norm": 12.813987731933594, + "learning_rate": 1.3520408163265306e-05, + "loss": 0.2859, + "step": 6350 + }, + { + "epoch": 32.44897959183673, + "grad_norm": 1.3361541032791138, + "learning_rate": 1.3510204081632654e-05, + "loss": 0.2899, + "step": 6360 + }, + { + "epoch": 32.5, + "grad_norm": 8.692803382873535, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.3218, + "step": 6370 + }, + { + "epoch": 32.55102040816327, + "grad_norm": 10.775219917297363, + "learning_rate": 1.3489795918367347e-05, + "loss": 0.3706, + "step": 6380 + }, + { + "epoch": 32.60204081632653, + "grad_norm": 6.802149295806885, + "learning_rate": 1.3479591836734696e-05, + "loss": 0.2627, + "step": 6390 + }, + { + "epoch": 32.6530612244898, + "grad_norm": 1.9009625911712646, + "learning_rate": 1.3469387755102042e-05, + "loss": 0.2294, + "step": 6400 + }, + { + "epoch": 32.704081632653065, + "grad_norm": 5.793837070465088, + "learning_rate": 1.345918367346939e-05, + "loss": 0.3042, + "step": 6410 + }, + { + "epoch": 32.755102040816325, + "grad_norm": 8.77164077758789, + "learning_rate": 1.3448979591836735e-05, + "loss": 0.5283, + "step": 6420 + }, + { + "epoch": 32.80612244897959, + "grad_norm": 6.6339616775512695, + "learning_rate": 1.3438775510204083e-05, + "loss": 0.3025, + "step": 6430 + }, + { + "epoch": 32.857142857142854, + "grad_norm": 9.5437593460083, + "learning_rate": 1.3428571428571429e-05, + "loss": 0.1714, + "step": 6440 + }, + { + "epoch": 32.90816326530612, + "grad_norm": 3.033761739730835, + "learning_rate": 1.3418367346938776e-05, + "loss": 0.36, + "step": 6450 + }, + { + "epoch": 32.95918367346939, + "grad_norm": 5.7934112548828125, + "learning_rate": 1.3408163265306125e-05, + "loss": 0.3136, + "step": 6460 + }, + { + "epoch": 33.0, + "eval_accuracy": 0.9097472924187726, + "eval_loss": 0.26566287875175476, + "eval_runtime": 0.9353, + "eval_samples_per_second": 296.157, + "eval_steps_per_second": 37.421, + "step": 6468 + }, + { + "epoch": 33.01020408163265, + "grad_norm": 1.0668991804122925, + "learning_rate": 1.3397959183673471e-05, + "loss": 0.1893, + "step": 6470 + }, + { + "epoch": 33.06122448979592, + "grad_norm": 1.4086322784423828, + "learning_rate": 1.3387755102040818e-05, + "loss": 0.5995, + "step": 6480 + }, + { + "epoch": 33.11224489795919, + "grad_norm": 4.279899597167969, + "learning_rate": 1.3377551020408164e-05, + "loss": 0.4313, + "step": 6490 + }, + { + "epoch": 33.16326530612245, + "grad_norm": 5.1103410720825195, + "learning_rate": 1.3367346938775512e-05, + "loss": 0.1508, + "step": 6500 + }, + { + "epoch": 33.214285714285715, + "grad_norm": 0.4270355999469757, + "learning_rate": 1.3357142857142858e-05, + "loss": 0.1682, + "step": 6510 + }, + { + "epoch": 33.265306122448976, + "grad_norm": 9.594346046447754, + "learning_rate": 1.3346938775510205e-05, + "loss": 0.2922, + "step": 6520 + }, + { + "epoch": 33.316326530612244, + "grad_norm": 0.859392523765564, + "learning_rate": 1.333673469387755e-05, + "loss": 0.2269, + "step": 6530 + }, + { + "epoch": 33.36734693877551, + "grad_norm": 2.0459938049316406, + "learning_rate": 1.3326530612244898e-05, + "loss": 0.211, + "step": 6540 + }, + { + "epoch": 33.41836734693877, + "grad_norm": 9.009632110595703, + "learning_rate": 1.3316326530612247e-05, + "loss": 0.3548, + "step": 6550 + }, + { + "epoch": 33.46938775510204, + "grad_norm": 3.5498781204223633, + "learning_rate": 1.3306122448979593e-05, + "loss": 0.3769, + "step": 6560 + }, + { + "epoch": 33.52040816326531, + "grad_norm": 2.0291507244110107, + "learning_rate": 1.329591836734694e-05, + "loss": 0.1357, + "step": 6570 + }, + { + "epoch": 33.57142857142857, + "grad_norm": 5.882681369781494, + "learning_rate": 1.3285714285714287e-05, + "loss": 0.5029, + "step": 6580 + }, + { + "epoch": 33.62244897959184, + "grad_norm": 8.612032890319824, + "learning_rate": 1.3275510204081634e-05, + "loss": 0.3223, + "step": 6590 + }, + { + "epoch": 33.673469387755105, + "grad_norm": 2.1690680980682373, + "learning_rate": 1.326530612244898e-05, + "loss": 0.3455, + "step": 6600 + }, + { + "epoch": 33.724489795918366, + "grad_norm": 2.9613852500915527, + "learning_rate": 1.3255102040816327e-05, + "loss": 0.3186, + "step": 6610 + }, + { + "epoch": 33.775510204081634, + "grad_norm": 2.842536687850952, + "learning_rate": 1.3244897959183673e-05, + "loss": 0.2595, + "step": 6620 + }, + { + "epoch": 33.826530612244895, + "grad_norm": 6.094658374786377, + "learning_rate": 1.323469387755102e-05, + "loss": 0.2713, + "step": 6630 + }, + { + "epoch": 33.87755102040816, + "grad_norm": 1.204209804534912, + "learning_rate": 1.322448979591837e-05, + "loss": 0.303, + "step": 6640 + }, + { + "epoch": 33.92857142857143, + "grad_norm": 8.503417015075684, + "learning_rate": 1.3214285714285716e-05, + "loss": 0.3531, + "step": 6650 + }, + { + "epoch": 33.97959183673469, + "grad_norm": 13.13253116607666, + "learning_rate": 1.3204081632653063e-05, + "loss": 0.4061, + "step": 6660 + }, + { + "epoch": 34.0, + "eval_accuracy": 0.9133574007220217, + "eval_loss": 0.2679595947265625, + "eval_runtime": 1.0011, + "eval_samples_per_second": 276.693, + "eval_steps_per_second": 34.961, + "step": 6664 + }, + { + "epoch": 34.03061224489796, + "grad_norm": 10.893752098083496, + "learning_rate": 1.3193877551020409e-05, + "loss": 0.2804, + "step": 6670 + }, + { + "epoch": 34.08163265306123, + "grad_norm": 1.127501368522644, + "learning_rate": 1.3183673469387756e-05, + "loss": 0.2987, + "step": 6680 + }, + { + "epoch": 34.13265306122449, + "grad_norm": 11.394792556762695, + "learning_rate": 1.3173469387755102e-05, + "loss": 0.3087, + "step": 6690 + }, + { + "epoch": 34.183673469387756, + "grad_norm": 3.2792036533355713, + "learning_rate": 1.316326530612245e-05, + "loss": 0.2087, + "step": 6700 + }, + { + "epoch": 34.234693877551024, + "grad_norm": 1.6803452968597412, + "learning_rate": 1.3153061224489795e-05, + "loss": 0.289, + "step": 6710 + }, + { + "epoch": 34.285714285714285, + "grad_norm": 5.636178493499756, + "learning_rate": 1.3142857142857145e-05, + "loss": 0.2163, + "step": 6720 + }, + { + "epoch": 34.33673469387755, + "grad_norm": 2.6255643367767334, + "learning_rate": 1.3132653061224492e-05, + "loss": 0.2604, + "step": 6730 + }, + { + "epoch": 34.38775510204081, + "grad_norm": 0.9665950536727905, + "learning_rate": 1.3122448979591838e-05, + "loss": 0.3136, + "step": 6740 + }, + { + "epoch": 34.43877551020408, + "grad_norm": 5.1290435791015625, + "learning_rate": 1.3112244897959185e-05, + "loss": 0.2152, + "step": 6750 + }, + { + "epoch": 34.48979591836735, + "grad_norm": 4.895906925201416, + "learning_rate": 1.3102040816326531e-05, + "loss": 0.1662, + "step": 6760 + }, + { + "epoch": 34.54081632653061, + "grad_norm": 3.9003512859344482, + "learning_rate": 1.3091836734693879e-05, + "loss": 0.3426, + "step": 6770 + }, + { + "epoch": 34.59183673469388, + "grad_norm": 1.398506760597229, + "learning_rate": 1.3081632653061224e-05, + "loss": 0.2563, + "step": 6780 + }, + { + "epoch": 34.642857142857146, + "grad_norm": 7.401740550994873, + "learning_rate": 1.3071428571428572e-05, + "loss": 0.4108, + "step": 6790 + }, + { + "epoch": 34.69387755102041, + "grad_norm": 16.010316848754883, + "learning_rate": 1.3061224489795918e-05, + "loss": 0.3883, + "step": 6800 + }, + { + "epoch": 34.744897959183675, + "grad_norm": 6.630160808563232, + "learning_rate": 1.3051020408163267e-05, + "loss": 0.3897, + "step": 6810 + }, + { + "epoch": 34.795918367346935, + "grad_norm": 5.36838960647583, + "learning_rate": 1.3040816326530614e-05, + "loss": 0.2516, + "step": 6820 + }, + { + "epoch": 34.8469387755102, + "grad_norm": 3.5793333053588867, + "learning_rate": 1.303061224489796e-05, + "loss": 0.2888, + "step": 6830 + }, + { + "epoch": 34.89795918367347, + "grad_norm": 0.8172181248664856, + "learning_rate": 1.3020408163265308e-05, + "loss": 0.3432, + "step": 6840 + }, + { + "epoch": 34.94897959183673, + "grad_norm": 9.127881050109863, + "learning_rate": 1.3010204081632653e-05, + "loss": 0.3537, + "step": 6850 + }, + { + "epoch": 35.0, + "grad_norm": 2.6013247966766357, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.3296, + "step": 6860 + }, + { + "epoch": 35.0, + "eval_accuracy": 0.9061371841155235, + "eval_loss": 0.3797638416290283, + "eval_runtime": 0.9276, + "eval_samples_per_second": 298.61, + "eval_steps_per_second": 37.73, + "step": 6860 + }, + { + "epoch": 35.05102040816327, + "grad_norm": 9.181253433227539, + "learning_rate": 1.2989795918367347e-05, + "loss": 0.2048, + "step": 6870 + }, + { + "epoch": 35.10204081632653, + "grad_norm": 4.2278666496276855, + "learning_rate": 1.2979591836734696e-05, + "loss": 0.4347, + "step": 6880 + }, + { + "epoch": 35.1530612244898, + "grad_norm": 1.9831122159957886, + "learning_rate": 1.2969387755102042e-05, + "loss": 0.2518, + "step": 6890 + }, + { + "epoch": 35.204081632653065, + "grad_norm": 2.629470109939575, + "learning_rate": 1.2959183673469389e-05, + "loss": 0.1867, + "step": 6900 + }, + { + "epoch": 35.255102040816325, + "grad_norm": 2.3086001873016357, + "learning_rate": 1.2948979591836737e-05, + "loss": 0.1446, + "step": 6910 + }, + { + "epoch": 35.30612244897959, + "grad_norm": 0.5559498071670532, + "learning_rate": 1.2938775510204082e-05, + "loss": 0.4085, + "step": 6920 + }, + { + "epoch": 35.357142857142854, + "grad_norm": 2.5674479007720947, + "learning_rate": 1.292857142857143e-05, + "loss": 0.2453, + "step": 6930 + }, + { + "epoch": 35.40816326530612, + "grad_norm": 1.0306987762451172, + "learning_rate": 1.2918367346938776e-05, + "loss": 0.3145, + "step": 6940 + }, + { + "epoch": 35.45918367346939, + "grad_norm": 0.5135687589645386, + "learning_rate": 1.2908163265306123e-05, + "loss": 0.243, + "step": 6950 + }, + { + "epoch": 35.51020408163265, + "grad_norm": 1.1978002786636353, + "learning_rate": 1.2897959183673469e-05, + "loss": 0.1959, + "step": 6960 + }, + { + "epoch": 35.56122448979592, + "grad_norm": 1.2959942817687988, + "learning_rate": 1.2887755102040818e-05, + "loss": 0.5525, + "step": 6970 + }, + { + "epoch": 35.61224489795919, + "grad_norm": 2.489936351776123, + "learning_rate": 1.2877551020408164e-05, + "loss": 0.2946, + "step": 6980 + }, + { + "epoch": 35.66326530612245, + "grad_norm": 3.4419736862182617, + "learning_rate": 1.2867346938775511e-05, + "loss": 0.3306, + "step": 6990 + }, + { + "epoch": 35.714285714285715, + "grad_norm": 0.7286782264709473, + "learning_rate": 1.2857142857142859e-05, + "loss": 0.1627, + "step": 7000 + }, + { + "epoch": 35.765306122448976, + "grad_norm": 8.017316818237305, + "learning_rate": 1.2846938775510205e-05, + "loss": 0.4873, + "step": 7010 + }, + { + "epoch": 35.816326530612244, + "grad_norm": 3.3010361194610596, + "learning_rate": 1.2836734693877552e-05, + "loss": 0.4093, + "step": 7020 + }, + { + "epoch": 35.86734693877551, + "grad_norm": 0.4932214319705963, + "learning_rate": 1.2826530612244898e-05, + "loss": 0.1981, + "step": 7030 + }, + { + "epoch": 35.91836734693877, + "grad_norm": 7.730387210845947, + "learning_rate": 1.2816326530612247e-05, + "loss": 0.3207, + "step": 7040 + }, + { + "epoch": 35.96938775510204, + "grad_norm": 14.77576732635498, + "learning_rate": 1.2806122448979591e-05, + "loss": 0.2905, + "step": 7050 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.855595667870036, + "eval_loss": 0.5098311305046082, + "eval_runtime": 0.9311, + "eval_samples_per_second": 297.499, + "eval_steps_per_second": 37.59, + "step": 7056 + }, + { + "epoch": 36.02040816326531, + "grad_norm": 3.1446189880371094, + "learning_rate": 1.279591836734694e-05, + "loss": 0.2348, + "step": 7060 + }, + { + "epoch": 36.07142857142857, + "grad_norm": 0.9582618474960327, + "learning_rate": 1.2785714285714286e-05, + "loss": 0.339, + "step": 7070 + }, + { + "epoch": 36.12244897959184, + "grad_norm": 2.8060154914855957, + "learning_rate": 1.2775510204081634e-05, + "loss": 0.55, + "step": 7080 + }, + { + "epoch": 36.173469387755105, + "grad_norm": 3.702188014984131, + "learning_rate": 1.2765306122448981e-05, + "loss": 0.2332, + "step": 7090 + }, + { + "epoch": 36.224489795918366, + "grad_norm": 3.444847345352173, + "learning_rate": 1.2755102040816327e-05, + "loss": 0.2069, + "step": 7100 + }, + { + "epoch": 36.275510204081634, + "grad_norm": 8.948756217956543, + "learning_rate": 1.2744897959183674e-05, + "loss": 0.2245, + "step": 7110 + }, + { + "epoch": 36.326530612244895, + "grad_norm": 0.4190171957015991, + "learning_rate": 1.273469387755102e-05, + "loss": 0.2305, + "step": 7120 + }, + { + "epoch": 36.37755102040816, + "grad_norm": 1.888331651687622, + "learning_rate": 1.272448979591837e-05, + "loss": 0.2685, + "step": 7130 + }, + { + "epoch": 36.42857142857143, + "grad_norm": 2.040560722351074, + "learning_rate": 1.2714285714285715e-05, + "loss": 0.274, + "step": 7140 + }, + { + "epoch": 36.47959183673469, + "grad_norm": 3.657184362411499, + "learning_rate": 1.2704081632653063e-05, + "loss": 0.3129, + "step": 7150 + }, + { + "epoch": 36.53061224489796, + "grad_norm": 7.266042709350586, + "learning_rate": 1.2693877551020409e-05, + "loss": 0.2951, + "step": 7160 + }, + { + "epoch": 36.58163265306123, + "grad_norm": 5.278898239135742, + "learning_rate": 1.2683673469387756e-05, + "loss": 0.1573, + "step": 7170 + }, + { + "epoch": 36.63265306122449, + "grad_norm": 0.6331238746643066, + "learning_rate": 1.2673469387755104e-05, + "loss": 0.1682, + "step": 7180 + }, + { + "epoch": 36.683673469387756, + "grad_norm": 7.0703558921813965, + "learning_rate": 1.266326530612245e-05, + "loss": 0.282, + "step": 7190 + }, + { + "epoch": 36.734693877551024, + "grad_norm": 7.899667263031006, + "learning_rate": 1.2653061224489798e-05, + "loss": 0.6806, + "step": 7200 + }, + { + "epoch": 36.785714285714285, + "grad_norm": 1.6253725290298462, + "learning_rate": 1.2642857142857143e-05, + "loss": 0.1931, + "step": 7210 + }, + { + "epoch": 36.83673469387755, + "grad_norm": 15.306461334228516, + "learning_rate": 1.2632653061224492e-05, + "loss": 0.4, + "step": 7220 + }, + { + "epoch": 36.88775510204081, + "grad_norm": 1.1493027210235596, + "learning_rate": 1.2622448979591838e-05, + "loss": 0.1519, + "step": 7230 + }, + { + "epoch": 36.93877551020408, + "grad_norm": 2.6926989555358887, + "learning_rate": 1.2612244897959185e-05, + "loss": 0.4129, + "step": 7240 + }, + { + "epoch": 36.98979591836735, + "grad_norm": 3.21987247467041, + "learning_rate": 1.260204081632653e-05, + "loss": 0.2763, + "step": 7250 + }, + { + "epoch": 37.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.42190420627593994, + "eval_runtime": 0.9352, + "eval_samples_per_second": 296.199, + "eval_steps_per_second": 37.426, + "step": 7252 + }, + { + "epoch": 37.04081632653061, + "grad_norm": 0.3253488540649414, + "learning_rate": 1.2591836734693878e-05, + "loss": 0.0951, + "step": 7260 + }, + { + "epoch": 37.09183673469388, + "grad_norm": 0.2564552426338196, + "learning_rate": 1.2581632653061226e-05, + "loss": 0.3048, + "step": 7270 + }, + { + "epoch": 37.142857142857146, + "grad_norm": 4.657108306884766, + "learning_rate": 1.2571428571428572e-05, + "loss": 0.2368, + "step": 7280 + }, + { + "epoch": 37.19387755102041, + "grad_norm": 7.701886177062988, + "learning_rate": 1.256122448979592e-05, + "loss": 0.4069, + "step": 7290 + }, + { + "epoch": 37.244897959183675, + "grad_norm": 1.6006007194519043, + "learning_rate": 1.2551020408163267e-05, + "loss": 0.2523, + "step": 7300 + }, + { + "epoch": 37.295918367346935, + "grad_norm": 4.586967945098877, + "learning_rate": 1.2540816326530614e-05, + "loss": 0.2706, + "step": 7310 + }, + { + "epoch": 37.3469387755102, + "grad_norm": 6.338481903076172, + "learning_rate": 1.253061224489796e-05, + "loss": 0.2888, + "step": 7320 + }, + { + "epoch": 37.39795918367347, + "grad_norm": 8.174271583557129, + "learning_rate": 1.2520408163265307e-05, + "loss": 0.1741, + "step": 7330 + }, + { + "epoch": 37.44897959183673, + "grad_norm": 0.7613666653633118, + "learning_rate": 1.2510204081632653e-05, + "loss": 0.337, + "step": 7340 + }, + { + "epoch": 37.5, + "grad_norm": 9.497109413146973, + "learning_rate": 1.25e-05, + "loss": 0.4445, + "step": 7350 + }, + { + "epoch": 37.55102040816327, + "grad_norm": 6.902894973754883, + "learning_rate": 1.248979591836735e-05, + "loss": 0.4536, + "step": 7360 + }, + { + "epoch": 37.60204081632653, + "grad_norm": 0.6783469319343567, + "learning_rate": 1.2479591836734694e-05, + "loss": 0.344, + "step": 7370 + }, + { + "epoch": 37.6530612244898, + "grad_norm": 2.9475955963134766, + "learning_rate": 1.2469387755102043e-05, + "loss": 0.1476, + "step": 7380 + }, + { + "epoch": 37.704081632653065, + "grad_norm": 3.033745527267456, + "learning_rate": 1.2459183673469389e-05, + "loss": 0.2898, + "step": 7390 + }, + { + "epoch": 37.755102040816325, + "grad_norm": 1.2969194650650024, + "learning_rate": 1.2448979591836736e-05, + "loss": 0.2384, + "step": 7400 + }, + { + "epoch": 37.80612244897959, + "grad_norm": 3.2489914894104004, + "learning_rate": 1.2438775510204082e-05, + "loss": 0.1458, + "step": 7410 + }, + { + "epoch": 37.857142857142854, + "grad_norm": 0.2780137360095978, + "learning_rate": 1.242857142857143e-05, + "loss": 0.385, + "step": 7420 + }, + { + "epoch": 37.90816326530612, + "grad_norm": 1.6038283109664917, + "learning_rate": 1.2418367346938775e-05, + "loss": 0.1636, + "step": 7430 + }, + { + "epoch": 37.95918367346939, + "grad_norm": 6.315403461456299, + "learning_rate": 1.2408163265306123e-05, + "loss": 0.2454, + "step": 7440 + }, + { + "epoch": 38.0, + "eval_accuracy": 0.9133574007220217, + "eval_loss": 0.2851579189300537, + "eval_runtime": 0.9342, + "eval_samples_per_second": 296.517, + "eval_steps_per_second": 37.466, + "step": 7448 + }, + { + "epoch": 38.01020408163265, + "grad_norm": 4.942686080932617, + "learning_rate": 1.2397959183673472e-05, + "loss": 0.4006, + "step": 7450 + }, + { + "epoch": 38.06122448979592, + "grad_norm": 8.764945030212402, + "learning_rate": 1.2387755102040818e-05, + "loss": 0.3837, + "step": 7460 + }, + { + "epoch": 38.11224489795919, + "grad_norm": 8.624089241027832, + "learning_rate": 1.2377551020408165e-05, + "loss": 0.2409, + "step": 7470 + }, + { + "epoch": 38.16326530612245, + "grad_norm": 4.476467609405518, + "learning_rate": 1.2367346938775511e-05, + "loss": 0.2747, + "step": 7480 + }, + { + "epoch": 38.214285714285715, + "grad_norm": 13.317136764526367, + "learning_rate": 1.2357142857142859e-05, + "loss": 0.306, + "step": 7490 + }, + { + "epoch": 38.265306122448976, + "grad_norm": 10.13068962097168, + "learning_rate": 1.2346938775510204e-05, + "loss": 0.3725, + "step": 7500 + }, + { + "epoch": 38.316326530612244, + "grad_norm": 3.029904842376709, + "learning_rate": 1.2336734693877552e-05, + "loss": 0.2611, + "step": 7510 + }, + { + "epoch": 38.36734693877551, + "grad_norm": 5.582315921783447, + "learning_rate": 1.2326530612244898e-05, + "loss": 0.1853, + "step": 7520 + }, + { + "epoch": 38.41836734693877, + "grad_norm": 2.4034624099731445, + "learning_rate": 1.2316326530612245e-05, + "loss": 0.3121, + "step": 7530 + }, + { + "epoch": 38.46938775510204, + "grad_norm": 6.040859222412109, + "learning_rate": 1.2306122448979594e-05, + "loss": 0.373, + "step": 7540 + }, + { + "epoch": 38.52040816326531, + "grad_norm": 9.853171348571777, + "learning_rate": 1.229591836734694e-05, + "loss": 0.2384, + "step": 7550 + }, + { + "epoch": 38.57142857142857, + "grad_norm": 1.5734626054763794, + "learning_rate": 1.2285714285714288e-05, + "loss": 0.2368, + "step": 7560 + }, + { + "epoch": 38.62244897959184, + "grad_norm": 6.7449116706848145, + "learning_rate": 1.2275510204081633e-05, + "loss": 0.3006, + "step": 7570 + }, + { + "epoch": 38.673469387755105, + "grad_norm": 5.025670051574707, + "learning_rate": 1.2265306122448981e-05, + "loss": 0.3014, + "step": 7580 + }, + { + "epoch": 38.724489795918366, + "grad_norm": 0.6662245392799377, + "learning_rate": 1.2255102040816327e-05, + "loss": 0.2153, + "step": 7590 + }, + { + "epoch": 38.775510204081634, + "grad_norm": 5.983375549316406, + "learning_rate": 1.2244897959183674e-05, + "loss": 0.3509, + "step": 7600 + }, + { + "epoch": 38.826530612244895, + "grad_norm": 1.9515364170074463, + "learning_rate": 1.223469387755102e-05, + "loss": 0.1339, + "step": 7610 + }, + { + "epoch": 38.87755102040816, + "grad_norm": 2.3098132610321045, + "learning_rate": 1.222448979591837e-05, + "loss": 0.0899, + "step": 7620 + }, + { + "epoch": 38.92857142857143, + "grad_norm": 4.075789451599121, + "learning_rate": 1.2214285714285717e-05, + "loss": 0.2255, + "step": 7630 + }, + { + "epoch": 38.97959183673469, + "grad_norm": 16.4624080657959, + "learning_rate": 1.2204081632653062e-05, + "loss": 0.6077, + "step": 7640 + }, + { + "epoch": 39.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.36030298471450806, + "eval_runtime": 0.9376, + "eval_samples_per_second": 295.437, + "eval_steps_per_second": 37.33, + "step": 7644 + }, + { + "epoch": 39.03061224489796, + "grad_norm": 0.3404889702796936, + "learning_rate": 1.219387755102041e-05, + "loss": 0.1859, + "step": 7650 + }, + { + "epoch": 39.08163265306123, + "grad_norm": 10.761521339416504, + "learning_rate": 1.2183673469387756e-05, + "loss": 0.1371, + "step": 7660 + }, + { + "epoch": 39.13265306122449, + "grad_norm": 4.0601959228515625, + "learning_rate": 1.2173469387755103e-05, + "loss": 0.2587, + "step": 7670 + }, + { + "epoch": 39.183673469387756, + "grad_norm": 2.3847389221191406, + "learning_rate": 1.2163265306122449e-05, + "loss": 0.115, + "step": 7680 + }, + { + "epoch": 39.234693877551024, + "grad_norm": 0.9999655485153198, + "learning_rate": 1.2153061224489796e-05, + "loss": 0.456, + "step": 7690 + }, + { + "epoch": 39.285714285714285, + "grad_norm": 3.566209316253662, + "learning_rate": 1.2142857142857142e-05, + "loss": 0.2196, + "step": 7700 + }, + { + "epoch": 39.33673469387755, + "grad_norm": 6.022881031036377, + "learning_rate": 1.2132653061224491e-05, + "loss": 0.2245, + "step": 7710 + }, + { + "epoch": 39.38775510204081, + "grad_norm": 2.5836009979248047, + "learning_rate": 1.2122448979591839e-05, + "loss": 0.291, + "step": 7720 + }, + { + "epoch": 39.43877551020408, + "grad_norm": 0.4639554023742676, + "learning_rate": 1.2112244897959185e-05, + "loss": 0.3267, + "step": 7730 + }, + { + "epoch": 39.48979591836735, + "grad_norm": 4.0531511306762695, + "learning_rate": 1.2102040816326532e-05, + "loss": 0.2647, + "step": 7740 + }, + { + "epoch": 39.54081632653061, + "grad_norm": 10.204673767089844, + "learning_rate": 1.2091836734693878e-05, + "loss": 0.2902, + "step": 7750 + }, + { + "epoch": 39.59183673469388, + "grad_norm": 7.673137664794922, + "learning_rate": 1.2081632653061225e-05, + "loss": 0.2398, + "step": 7760 + }, + { + "epoch": 39.642857142857146, + "grad_norm": 6.424933910369873, + "learning_rate": 1.2071428571428571e-05, + "loss": 0.2942, + "step": 7770 + }, + { + "epoch": 39.69387755102041, + "grad_norm": 0.6079707741737366, + "learning_rate": 1.206122448979592e-05, + "loss": 0.244, + "step": 7780 + }, + { + "epoch": 39.744897959183675, + "grad_norm": 9.169522285461426, + "learning_rate": 1.2051020408163265e-05, + "loss": 0.2587, + "step": 7790 + }, + { + "epoch": 39.795918367346935, + "grad_norm": 8.324515342712402, + "learning_rate": 1.2040816326530614e-05, + "loss": 0.3697, + "step": 7800 + }, + { + "epoch": 39.8469387755102, + "grad_norm": 13.407546043395996, + "learning_rate": 1.2030612244897961e-05, + "loss": 0.5134, + "step": 7810 + }, + { + "epoch": 39.89795918367347, + "grad_norm": 1.663726568222046, + "learning_rate": 1.2020408163265307e-05, + "loss": 0.3162, + "step": 7820 + }, + { + "epoch": 39.94897959183673, + "grad_norm": 2.873859405517578, + "learning_rate": 1.2010204081632655e-05, + "loss": 0.1389, + "step": 7830 + }, + { + "epoch": 40.0, + "grad_norm": 5.262413024902344, + "learning_rate": 1.2e-05, + "loss": 0.1966, + "step": 7840 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.351894736289978, + "eval_runtime": 0.9359, + "eval_samples_per_second": 295.964, + "eval_steps_per_second": 37.396, + "step": 7840 + }, + { + "epoch": 40.05102040816327, + "grad_norm": 5.261941909790039, + "learning_rate": 1.1989795918367348e-05, + "loss": 0.1488, + "step": 7850 + }, + { + "epoch": 40.10204081632653, + "grad_norm": 1.4000771045684814, + "learning_rate": 1.1979591836734694e-05, + "loss": 0.1324, + "step": 7860 + }, + { + "epoch": 40.1530612244898, + "grad_norm": 1.8460229635238647, + "learning_rate": 1.1969387755102043e-05, + "loss": 0.2866, + "step": 7870 + }, + { + "epoch": 40.204081632653065, + "grad_norm": 1.513795256614685, + "learning_rate": 1.1959183673469389e-05, + "loss": 0.3759, + "step": 7880 + }, + { + "epoch": 40.255102040816325, + "grad_norm": 0.34185507893562317, + "learning_rate": 1.1948979591836736e-05, + "loss": 0.1073, + "step": 7890 + }, + { + "epoch": 40.30612244897959, + "grad_norm": 9.449399948120117, + "learning_rate": 1.1938775510204084e-05, + "loss": 0.4452, + "step": 7900 + }, + { + "epoch": 40.357142857142854, + "grad_norm": 8.813591003417969, + "learning_rate": 1.192857142857143e-05, + "loss": 0.3319, + "step": 7910 + }, + { + "epoch": 40.40816326530612, + "grad_norm": 0.7910783290863037, + "learning_rate": 1.1918367346938777e-05, + "loss": 0.222, + "step": 7920 + }, + { + "epoch": 40.45918367346939, + "grad_norm": 0.7371392846107483, + "learning_rate": 1.1908163265306123e-05, + "loss": 0.304, + "step": 7930 + }, + { + "epoch": 40.51020408163265, + "grad_norm": 3.1124866008758545, + "learning_rate": 1.189795918367347e-05, + "loss": 0.2276, + "step": 7940 + }, + { + "epoch": 40.56122448979592, + "grad_norm": 8.691803932189941, + "learning_rate": 1.1887755102040816e-05, + "loss": 0.4638, + "step": 7950 + }, + { + "epoch": 40.61224489795919, + "grad_norm": 6.327417850494385, + "learning_rate": 1.1877551020408165e-05, + "loss": 0.2015, + "step": 7960 + }, + { + "epoch": 40.66326530612245, + "grad_norm": 3.4650707244873047, + "learning_rate": 1.186734693877551e-05, + "loss": 0.2033, + "step": 7970 + }, + { + "epoch": 40.714285714285715, + "grad_norm": 4.530669689178467, + "learning_rate": 1.1857142857142858e-05, + "loss": 0.3811, + "step": 7980 + }, + { + "epoch": 40.765306122448976, + "grad_norm": 7.533509254455566, + "learning_rate": 1.1846938775510206e-05, + "loss": 0.632, + "step": 7990 + }, + { + "epoch": 40.816326530612244, + "grad_norm": 1.9913711547851562, + "learning_rate": 1.1836734693877552e-05, + "loss": 0.1945, + "step": 8000 + }, + { + "epoch": 40.86734693877551, + "grad_norm": 3.851255178451538, + "learning_rate": 1.1826530612244899e-05, + "loss": 0.1426, + "step": 8010 + }, + { + "epoch": 40.91836734693877, + "grad_norm": 12.677950859069824, + "learning_rate": 1.1816326530612245e-05, + "loss": 0.3442, + "step": 8020 + }, + { + "epoch": 40.96938775510204, + "grad_norm": 6.756103992462158, + "learning_rate": 1.1806122448979594e-05, + "loss": 0.2473, + "step": 8030 + }, + { + "epoch": 41.0, + "eval_accuracy": 0.9025270758122743, + "eval_loss": 0.3343473970890045, + "eval_runtime": 0.9305, + "eval_samples_per_second": 297.683, + "eval_steps_per_second": 37.613, + "step": 8036 + }, + { + "epoch": 41.02040816326531, + "grad_norm": 2.6532843112945557, + "learning_rate": 1.179591836734694e-05, + "loss": 0.5848, + "step": 8040 + }, + { + "epoch": 41.07142857142857, + "grad_norm": 2.1744353771209717, + "learning_rate": 1.1785714285714287e-05, + "loss": 0.2735, + "step": 8050 + }, + { + "epoch": 41.12244897959184, + "grad_norm": 0.46514615416526794, + "learning_rate": 1.1775510204081633e-05, + "loss": 0.1303, + "step": 8060 + }, + { + "epoch": 41.173469387755105, + "grad_norm": 1.9780634641647339, + "learning_rate": 1.176530612244898e-05, + "loss": 0.2968, + "step": 8070 + }, + { + "epoch": 41.224489795918366, + "grad_norm": 2.0282843112945557, + "learning_rate": 1.1755102040816328e-05, + "loss": 0.2812, + "step": 8080 + }, + { + "epoch": 41.275510204081634, + "grad_norm": 9.81309986114502, + "learning_rate": 1.1744897959183674e-05, + "loss": 0.2277, + "step": 8090 + }, + { + "epoch": 41.326530612244895, + "grad_norm": 10.753496170043945, + "learning_rate": 1.1734693877551021e-05, + "loss": 0.3806, + "step": 8100 + }, + { + "epoch": 41.37755102040816, + "grad_norm": 8.6217041015625, + "learning_rate": 1.1724489795918367e-05, + "loss": 0.2826, + "step": 8110 + }, + { + "epoch": 41.42857142857143, + "grad_norm": 0.3755444586277008, + "learning_rate": 1.1714285714285716e-05, + "loss": 0.3433, + "step": 8120 + }, + { + "epoch": 41.47959183673469, + "grad_norm": 6.280050277709961, + "learning_rate": 1.1704081632653062e-05, + "loss": 0.3115, + "step": 8130 + }, + { + "epoch": 41.53061224489796, + "grad_norm": 5.224348068237305, + "learning_rate": 1.169387755102041e-05, + "loss": 0.181, + "step": 8140 + }, + { + "epoch": 41.58163265306123, + "grad_norm": 2.38151478767395, + "learning_rate": 1.1683673469387755e-05, + "loss": 0.2309, + "step": 8150 + }, + { + "epoch": 41.63265306122449, + "grad_norm": 2.4411683082580566, + "learning_rate": 1.1673469387755103e-05, + "loss": 0.4639, + "step": 8160 + }, + { + "epoch": 41.683673469387756, + "grad_norm": 1.6756824254989624, + "learning_rate": 1.166326530612245e-05, + "loss": 0.3495, + "step": 8170 + }, + { + "epoch": 41.734693877551024, + "grad_norm": 7.354377269744873, + "learning_rate": 1.1653061224489796e-05, + "loss": 0.1211, + "step": 8180 + }, + { + "epoch": 41.785714285714285, + "grad_norm": 4.022644519805908, + "learning_rate": 1.1642857142857145e-05, + "loss": 0.2025, + "step": 8190 + }, + { + "epoch": 41.83673469387755, + "grad_norm": 3.0944485664367676, + "learning_rate": 1.1632653061224491e-05, + "loss": 0.389, + "step": 8200 + }, + { + "epoch": 41.88775510204081, + "grad_norm": 0.8570572733879089, + "learning_rate": 1.1622448979591839e-05, + "loss": 0.1641, + "step": 8210 + }, + { + "epoch": 41.93877551020408, + "grad_norm": 6.50732946395874, + "learning_rate": 1.1612244897959184e-05, + "loss": 0.2132, + "step": 8220 + }, + { + "epoch": 41.98979591836735, + "grad_norm": 2.8220293521881104, + "learning_rate": 1.1602040816326532e-05, + "loss": 0.2795, + "step": 8230 + }, + { + "epoch": 42.0, + "eval_accuracy": 0.9169675090252708, + "eval_loss": 0.3383941352367401, + "eval_runtime": 0.9305, + "eval_samples_per_second": 297.686, + "eval_steps_per_second": 37.614, + "step": 8232 + }, + { + "epoch": 42.04081632653061, + "grad_norm": 5.730966091156006, + "learning_rate": 1.1591836734693878e-05, + "loss": 0.3407, + "step": 8240 + }, + { + "epoch": 42.09183673469388, + "grad_norm": 4.490549564361572, + "learning_rate": 1.1581632653061225e-05, + "loss": 0.2667, + "step": 8250 + }, + { + "epoch": 42.142857142857146, + "grad_norm": 1.790333867073059, + "learning_rate": 1.1571428571428573e-05, + "loss": 0.1892, + "step": 8260 + }, + { + "epoch": 42.19387755102041, + "grad_norm": 0.3163524270057678, + "learning_rate": 1.1561224489795918e-05, + "loss": 0.3141, + "step": 8270 + }, + { + "epoch": 42.244897959183675, + "grad_norm": 8.028840065002441, + "learning_rate": 1.1551020408163268e-05, + "loss": 0.0883, + "step": 8280 + }, + { + "epoch": 42.295918367346935, + "grad_norm": 1.7445170879364014, + "learning_rate": 1.1540816326530613e-05, + "loss": 0.3833, + "step": 8290 + }, + { + "epoch": 42.3469387755102, + "grad_norm": 1.6772736310958862, + "learning_rate": 1.1530612244897961e-05, + "loss": 0.3524, + "step": 8300 + }, + { + "epoch": 42.39795918367347, + "grad_norm": 11.508550643920898, + "learning_rate": 1.1520408163265307e-05, + "loss": 0.2454, + "step": 8310 + }, + { + "epoch": 42.44897959183673, + "grad_norm": 2.1842567920684814, + "learning_rate": 1.1510204081632654e-05, + "loss": 0.151, + "step": 8320 + }, + { + "epoch": 42.5, + "grad_norm": 9.214639663696289, + "learning_rate": 1.15e-05, + "loss": 0.246, + "step": 8330 + }, + { + "epoch": 42.55102040816327, + "grad_norm": 12.712059020996094, + "learning_rate": 1.1489795918367347e-05, + "loss": 0.4519, + "step": 8340 + }, + { + "epoch": 42.60204081632653, + "grad_norm": 10.930113792419434, + "learning_rate": 1.1479591836734697e-05, + "loss": 0.1964, + "step": 8350 + }, + { + "epoch": 42.6530612244898, + "grad_norm": 7.795054912567139, + "learning_rate": 1.146938775510204e-05, + "loss": 0.1999, + "step": 8360 + }, + { + "epoch": 42.704081632653065, + "grad_norm": 13.82280445098877, + "learning_rate": 1.145918367346939e-05, + "loss": 0.3961, + "step": 8370 + }, + { + "epoch": 42.755102040816325, + "grad_norm": 3.0605897903442383, + "learning_rate": 1.1448979591836736e-05, + "loss": 0.4642, + "step": 8380 + }, + { + "epoch": 42.80612244897959, + "grad_norm": 2.3108794689178467, + "learning_rate": 1.1438775510204083e-05, + "loss": 0.1843, + "step": 8390 + }, + { + "epoch": 42.857142857142854, + "grad_norm": 8.32869815826416, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.292, + "step": 8400 + }, + { + "epoch": 42.90816326530612, + "grad_norm": 3.965181827545166, + "learning_rate": 1.1418367346938777e-05, + "loss": 0.3194, + "step": 8410 + }, + { + "epoch": 42.95918367346939, + "grad_norm": 6.17874002456665, + "learning_rate": 1.1408163265306122e-05, + "loss": 0.1249, + "step": 8420 + }, + { + "epoch": 43.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.40458711981773376, + "eval_runtime": 0.9348, + "eval_samples_per_second": 296.317, + "eval_steps_per_second": 37.441, + "step": 8428 + }, + { + "epoch": 43.01020408163265, + "grad_norm": 14.760079383850098, + "learning_rate": 1.139795918367347e-05, + "loss": 0.1974, + "step": 8430 + }, + { + "epoch": 43.06122448979592, + "grad_norm": 4.747530937194824, + "learning_rate": 1.1387755102040819e-05, + "loss": 0.237, + "step": 8440 + }, + { + "epoch": 43.11224489795919, + "grad_norm": 2.4478297233581543, + "learning_rate": 1.1377551020408165e-05, + "loss": 0.1684, + "step": 8450 + }, + { + "epoch": 43.16326530612245, + "grad_norm": 11.319709777832031, + "learning_rate": 1.1367346938775512e-05, + "loss": 0.328, + "step": 8460 + }, + { + "epoch": 43.214285714285715, + "grad_norm": 4.042837619781494, + "learning_rate": 1.1357142857142858e-05, + "loss": 0.3286, + "step": 8470 + }, + { + "epoch": 43.265306122448976, + "grad_norm": 2.105111598968506, + "learning_rate": 1.1346938775510206e-05, + "loss": 0.0777, + "step": 8480 + }, + { + "epoch": 43.316326530612244, + "grad_norm": 3.559504985809326, + "learning_rate": 1.1336734693877551e-05, + "loss": 0.2873, + "step": 8490 + }, + { + "epoch": 43.36734693877551, + "grad_norm": 14.684622764587402, + "learning_rate": 1.1326530612244899e-05, + "loss": 0.2682, + "step": 8500 + }, + { + "epoch": 43.41836734693877, + "grad_norm": 4.134817123413086, + "learning_rate": 1.1316326530612245e-05, + "loss": 0.2208, + "step": 8510 + }, + { + "epoch": 43.46938775510204, + "grad_norm": 5.838129997253418, + "learning_rate": 1.1306122448979592e-05, + "loss": 0.2491, + "step": 8520 + }, + { + "epoch": 43.52040816326531, + "grad_norm": 3.218062162399292, + "learning_rate": 1.1295918367346941e-05, + "loss": 0.4019, + "step": 8530 + }, + { + "epoch": 43.57142857142857, + "grad_norm": 9.169731140136719, + "learning_rate": 1.1285714285714287e-05, + "loss": 0.212, + "step": 8540 + }, + { + "epoch": 43.62244897959184, + "grad_norm": 2.351957082748413, + "learning_rate": 1.1275510204081635e-05, + "loss": 0.356, + "step": 8550 + }, + { + "epoch": 43.673469387755105, + "grad_norm": 13.376652717590332, + "learning_rate": 1.126530612244898e-05, + "loss": 0.3859, + "step": 8560 + }, + { + "epoch": 43.724489795918366, + "grad_norm": 9.093658447265625, + "learning_rate": 1.1255102040816328e-05, + "loss": 0.2787, + "step": 8570 + }, + { + "epoch": 43.775510204081634, + "grad_norm": 1.1991856098175049, + "learning_rate": 1.1244897959183674e-05, + "loss": 0.1786, + "step": 8580 + }, + { + "epoch": 43.826530612244895, + "grad_norm": 8.349421501159668, + "learning_rate": 1.1234693877551021e-05, + "loss": 0.2838, + "step": 8590 + }, + { + "epoch": 43.87755102040816, + "grad_norm": 1.6250555515289307, + "learning_rate": 1.1224489795918367e-05, + "loss": 0.1497, + "step": 8600 + }, + { + "epoch": 43.92857142857143, + "grad_norm": 11.57792854309082, + "learning_rate": 1.1214285714285716e-05, + "loss": 0.5656, + "step": 8610 + }, + { + "epoch": 43.97959183673469, + "grad_norm": 7.346845626831055, + "learning_rate": 1.1204081632653062e-05, + "loss": 0.2943, + "step": 8620 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.39525121450424194, + "eval_runtime": 0.9323, + "eval_samples_per_second": 297.115, + "eval_steps_per_second": 37.542, + "step": 8624 + }, + { + "epoch": 44.03061224489796, + "grad_norm": 2.321767807006836, + "learning_rate": 1.119387755102041e-05, + "loss": 0.295, + "step": 8630 + }, + { + "epoch": 44.08163265306123, + "grad_norm": 10.005078315734863, + "learning_rate": 1.1183673469387757e-05, + "loss": 0.216, + "step": 8640 + }, + { + "epoch": 44.13265306122449, + "grad_norm": 10.172088623046875, + "learning_rate": 1.1173469387755103e-05, + "loss": 0.336, + "step": 8650 + }, + { + "epoch": 44.183673469387756, + "grad_norm": 9.204157829284668, + "learning_rate": 1.116326530612245e-05, + "loss": 0.1377, + "step": 8660 + }, + { + "epoch": 44.234693877551024, + "grad_norm": 0.3493322730064392, + "learning_rate": 1.1153061224489796e-05, + "loss": 0.5222, + "step": 8670 + }, + { + "epoch": 44.285714285714285, + "grad_norm": 4.95557975769043, + "learning_rate": 1.1142857142857143e-05, + "loss": 0.292, + "step": 8680 + }, + { + "epoch": 44.33673469387755, + "grad_norm": 1.297506332397461, + "learning_rate": 1.113265306122449e-05, + "loss": 0.1712, + "step": 8690 + }, + { + "epoch": 44.38775510204081, + "grad_norm": 2.6922519207000732, + "learning_rate": 1.1122448979591838e-05, + "loss": 0.2791, + "step": 8700 + }, + { + "epoch": 44.43877551020408, + "grad_norm": 2.3091373443603516, + "learning_rate": 1.1112244897959184e-05, + "loss": 0.3586, + "step": 8710 + }, + { + "epoch": 44.48979591836735, + "grad_norm": 0.26918989419937134, + "learning_rate": 1.1102040816326532e-05, + "loss": 0.3325, + "step": 8720 + }, + { + "epoch": 44.54081632653061, + "grad_norm": 11.546154975891113, + "learning_rate": 1.1091836734693879e-05, + "loss": 0.2399, + "step": 8730 + }, + { + "epoch": 44.59183673469388, + "grad_norm": 4.936864852905273, + "learning_rate": 1.1081632653061225e-05, + "loss": 0.0771, + "step": 8740 + }, + { + "epoch": 44.642857142857146, + "grad_norm": 4.6656813621521, + "learning_rate": 1.1071428571428572e-05, + "loss": 0.1901, + "step": 8750 + }, + { + "epoch": 44.69387755102041, + "grad_norm": 2.98370099067688, + "learning_rate": 1.1061224489795918e-05, + "loss": 0.2181, + "step": 8760 + }, + { + "epoch": 44.744897959183675, + "grad_norm": 11.47069263458252, + "learning_rate": 1.1051020408163267e-05, + "loss": 0.2669, + "step": 8770 + }, + { + "epoch": 44.795918367346935, + "grad_norm": 6.381840229034424, + "learning_rate": 1.1040816326530611e-05, + "loss": 0.2505, + "step": 8780 + }, + { + "epoch": 44.8469387755102, + "grad_norm": 0.29808229207992554, + "learning_rate": 1.103061224489796e-05, + "loss": 0.3411, + "step": 8790 + }, + { + "epoch": 44.89795918367347, + "grad_norm": 6.780035972595215, + "learning_rate": 1.1020408163265306e-05, + "loss": 0.215, + "step": 8800 + }, + { + "epoch": 44.94897959183673, + "grad_norm": 10.969614028930664, + "learning_rate": 1.1010204081632654e-05, + "loss": 0.2879, + "step": 8810 + }, + { + "epoch": 45.0, + "grad_norm": 0.764427900314331, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.3002, + "step": 8820 + }, + { + "epoch": 45.0, + "eval_accuracy": 0.8592057761732852, + "eval_loss": 0.5002943873405457, + "eval_runtime": 0.9404, + "eval_samples_per_second": 294.558, + "eval_steps_per_second": 37.218, + "step": 8820 + }, + { + "epoch": 45.05102040816327, + "grad_norm": 5.812986850738525, + "learning_rate": 1.0989795918367347e-05, + "loss": 0.15, + "step": 8830 + }, + { + "epoch": 45.10204081632653, + "grad_norm": 1.0182520151138306, + "learning_rate": 1.0979591836734695e-05, + "loss": 0.2247, + "step": 8840 + }, + { + "epoch": 45.1530612244898, + "grad_norm": 3.7717044353485107, + "learning_rate": 1.096938775510204e-05, + "loss": 0.1436, + "step": 8850 + }, + { + "epoch": 45.204081632653065, + "grad_norm": 17.78550148010254, + "learning_rate": 1.095918367346939e-05, + "loss": 0.2528, + "step": 8860 + }, + { + "epoch": 45.255102040816325, + "grad_norm": 3.487760305404663, + "learning_rate": 1.0948979591836735e-05, + "loss": 0.4551, + "step": 8870 + }, + { + "epoch": 45.30612244897959, + "grad_norm": 2.6087698936462402, + "learning_rate": 1.0938775510204083e-05, + "loss": 0.3815, + "step": 8880 + }, + { + "epoch": 45.357142857142854, + "grad_norm": 13.519532203674316, + "learning_rate": 1.0928571428571429e-05, + "loss": 0.3009, + "step": 8890 + }, + { + "epoch": 45.40816326530612, + "grad_norm": 3.4221670627593994, + "learning_rate": 1.0918367346938776e-05, + "loss": 0.3341, + "step": 8900 + }, + { + "epoch": 45.45918367346939, + "grad_norm": 0.41899165511131287, + "learning_rate": 1.0908163265306124e-05, + "loss": 0.0961, + "step": 8910 + }, + { + "epoch": 45.51020408163265, + "grad_norm": 1.055353045463562, + "learning_rate": 1.089795918367347e-05, + "loss": 0.4265, + "step": 8920 + }, + { + "epoch": 45.56122448979592, + "grad_norm": 1.2329767942428589, + "learning_rate": 1.0887755102040819e-05, + "loss": 0.3215, + "step": 8930 + }, + { + "epoch": 45.61224489795919, + "grad_norm": 3.0227720737457275, + "learning_rate": 1.0877551020408163e-05, + "loss": 0.502, + "step": 8940 + }, + { + "epoch": 45.66326530612245, + "grad_norm": 4.809809684753418, + "learning_rate": 1.0867346938775512e-05, + "loss": 0.3271, + "step": 8950 + }, + { + "epoch": 45.714285714285715, + "grad_norm": 3.6863017082214355, + "learning_rate": 1.0857142857142858e-05, + "loss": 0.3246, + "step": 8960 + }, + { + "epoch": 45.765306122448976, + "grad_norm": 7.67105770111084, + "learning_rate": 1.0846938775510205e-05, + "loss": 0.2358, + "step": 8970 + }, + { + "epoch": 45.816326530612244, + "grad_norm": 1.7396820783615112, + "learning_rate": 1.0836734693877551e-05, + "loss": 0.2026, + "step": 8980 + }, + { + "epoch": 45.86734693877551, + "grad_norm": 2.3171074390411377, + "learning_rate": 1.0826530612244899e-05, + "loss": 0.148, + "step": 8990 + }, + { + "epoch": 45.91836734693877, + "grad_norm": 4.390339374542236, + "learning_rate": 1.0816326530612246e-05, + "loss": 0.2545, + "step": 9000 + }, + { + "epoch": 45.96938775510204, + "grad_norm": 13.35239028930664, + "learning_rate": 1.0806122448979592e-05, + "loss": 0.1525, + "step": 9010 + }, + { + "epoch": 46.0, + "eval_accuracy": 0.9169675090252708, + "eval_loss": 0.3231888711452484, + "eval_runtime": 0.9348, + "eval_samples_per_second": 296.308, + "eval_steps_per_second": 37.44, + "step": 9016 + }, + { + "epoch": 46.02040816326531, + "grad_norm": 19.203357696533203, + "learning_rate": 1.0795918367346941e-05, + "loss": 0.4035, + "step": 9020 + }, + { + "epoch": 46.07142857142857, + "grad_norm": 2.2039167881011963, + "learning_rate": 1.0785714285714287e-05, + "loss": 0.3711, + "step": 9030 + }, + { + "epoch": 46.12244897959184, + "grad_norm": 1.917140007019043, + "learning_rate": 1.0775510204081634e-05, + "loss": 0.2079, + "step": 9040 + }, + { + "epoch": 46.173469387755105, + "grad_norm": 9.410566329956055, + "learning_rate": 1.076530612244898e-05, + "loss": 0.2509, + "step": 9050 + }, + { + "epoch": 46.224489795918366, + "grad_norm": 4.123296737670898, + "learning_rate": 1.0755102040816328e-05, + "loss": 0.4548, + "step": 9060 + }, + { + "epoch": 46.275510204081634, + "grad_norm": 2.996526002883911, + "learning_rate": 1.0744897959183673e-05, + "loss": 0.1775, + "step": 9070 + }, + { + "epoch": 46.326530612244895, + "grad_norm": 6.14008903503418, + "learning_rate": 1.073469387755102e-05, + "loss": 0.1807, + "step": 9080 + }, + { + "epoch": 46.37755102040816, + "grad_norm": 5.149447441101074, + "learning_rate": 1.072448979591837e-05, + "loss": 0.2278, + "step": 9090 + }, + { + "epoch": 46.42857142857143, + "grad_norm": 0.6238847374916077, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.2204, + "step": 9100 + }, + { + "epoch": 46.47959183673469, + "grad_norm": 8.696666717529297, + "learning_rate": 1.0704081632653063e-05, + "loss": 0.2924, + "step": 9110 + }, + { + "epoch": 46.53061224489796, + "grad_norm": 7.7894978523254395, + "learning_rate": 1.0693877551020409e-05, + "loss": 0.2058, + "step": 9120 + }, + { + "epoch": 46.58163265306123, + "grad_norm": 3.7465615272521973, + "learning_rate": 1.0683673469387757e-05, + "loss": 0.4511, + "step": 9130 + }, + { + "epoch": 46.63265306122449, + "grad_norm": 8.284019470214844, + "learning_rate": 1.0673469387755102e-05, + "loss": 0.2944, + "step": 9140 + }, + { + "epoch": 46.683673469387756, + "grad_norm": 4.210254192352295, + "learning_rate": 1.066326530612245e-05, + "loss": 0.1609, + "step": 9150 + }, + { + "epoch": 46.734693877551024, + "grad_norm": 1.5888322591781616, + "learning_rate": 1.0653061224489796e-05, + "loss": 0.1958, + "step": 9160 + }, + { + "epoch": 46.785714285714285, + "grad_norm": 10.489404678344727, + "learning_rate": 1.0642857142857143e-05, + "loss": 0.2796, + "step": 9170 + }, + { + "epoch": 46.83673469387755, + "grad_norm": 1.8088388442993164, + "learning_rate": 1.0632653061224492e-05, + "loss": 0.1466, + "step": 9180 + }, + { + "epoch": 46.88775510204081, + "grad_norm": 4.326778888702393, + "learning_rate": 1.0622448979591838e-05, + "loss": 0.2615, + "step": 9190 + }, + { + "epoch": 46.93877551020408, + "grad_norm": 6.9802632331848145, + "learning_rate": 1.0612244897959186e-05, + "loss": 0.3217, + "step": 9200 + }, + { + "epoch": 46.98979591836735, + "grad_norm": 9.483171463012695, + "learning_rate": 1.0602040816326531e-05, + "loss": 0.4022, + "step": 9210 + }, + { + "epoch": 47.0, + "eval_accuracy": 0.9169675090252708, + "eval_loss": 0.3112559914588928, + "eval_runtime": 1.0011, + "eval_samples_per_second": 276.691, + "eval_steps_per_second": 34.961, + "step": 9212 + }, + { + "epoch": 47.04081632653061, + "grad_norm": 5.846846103668213, + "learning_rate": 1.0591836734693879e-05, + "loss": 0.2519, + "step": 9220 + }, + { + "epoch": 47.09183673469388, + "grad_norm": 12.650550842285156, + "learning_rate": 1.0581632653061225e-05, + "loss": 0.1602, + "step": 9230 + }, + { + "epoch": 47.142857142857146, + "grad_norm": 6.220149993896484, + "learning_rate": 1.0571428571428572e-05, + "loss": 0.3288, + "step": 9240 + }, + { + "epoch": 47.19387755102041, + "grad_norm": 3.168201208114624, + "learning_rate": 1.0561224489795918e-05, + "loss": 0.1582, + "step": 9250 + }, + { + "epoch": 47.244897959183675, + "grad_norm": 2.817613124847412, + "learning_rate": 1.0551020408163265e-05, + "loss": 0.3362, + "step": 9260 + }, + { + "epoch": 47.295918367346935, + "grad_norm": 2.279808521270752, + "learning_rate": 1.0540816326530615e-05, + "loss": 0.3093, + "step": 9270 + }, + { + "epoch": 47.3469387755102, + "grad_norm": 9.442809104919434, + "learning_rate": 1.053061224489796e-05, + "loss": 0.4158, + "step": 9280 + }, + { + "epoch": 47.39795918367347, + "grad_norm": 7.1183342933654785, + "learning_rate": 1.0520408163265308e-05, + "loss": 0.1822, + "step": 9290 + }, + { + "epoch": 47.44897959183673, + "grad_norm": 1.518853783607483, + "learning_rate": 1.0510204081632654e-05, + "loss": 0.1303, + "step": 9300 + }, + { + "epoch": 47.5, + "grad_norm": 5.1223602294921875, + "learning_rate": 1.0500000000000001e-05, + "loss": 0.6394, + "step": 9310 + }, + { + "epoch": 47.55102040816327, + "grad_norm": 8.16996955871582, + "learning_rate": 1.0489795918367347e-05, + "loss": 0.286, + "step": 9320 + }, + { + "epoch": 47.60204081632653, + "grad_norm": 7.438451766967773, + "learning_rate": 1.0479591836734694e-05, + "loss": 0.2902, + "step": 9330 + }, + { + "epoch": 47.6530612244898, + "grad_norm": 9.90367603302002, + "learning_rate": 1.046938775510204e-05, + "loss": 0.1833, + "step": 9340 + }, + { + "epoch": 47.704081632653065, + "grad_norm": 1.0031007528305054, + "learning_rate": 1.045918367346939e-05, + "loss": 0.27, + "step": 9350 + }, + { + "epoch": 47.755102040816325, + "grad_norm": 5.630500793457031, + "learning_rate": 1.0448979591836737e-05, + "loss": 0.2838, + "step": 9360 + }, + { + "epoch": 47.80612244897959, + "grad_norm": 3.1275367736816406, + "learning_rate": 1.0438775510204083e-05, + "loss": 0.1895, + "step": 9370 + }, + { + "epoch": 47.857142857142854, + "grad_norm": 4.408709526062012, + "learning_rate": 1.042857142857143e-05, + "loss": 0.234, + "step": 9380 + }, + { + "epoch": 47.90816326530612, + "grad_norm": 1.6241774559020996, + "learning_rate": 1.0418367346938776e-05, + "loss": 0.0964, + "step": 9390 + }, + { + "epoch": 47.95918367346939, + "grad_norm": 13.035674095153809, + "learning_rate": 1.0408163265306123e-05, + "loss": 0.4994, + "step": 9400 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.855595667870036, + "eval_loss": 0.4494384229183197, + "eval_runtime": 0.943, + "eval_samples_per_second": 293.731, + "eval_steps_per_second": 37.114, + "step": 9408 + }, + { + "epoch": 48.01020408163265, + "grad_norm": 2.8087539672851562, + "learning_rate": 1.039795918367347e-05, + "loss": 0.2604, + "step": 9410 + }, + { + "epoch": 48.06122448979592, + "grad_norm": 2.577768325805664, + "learning_rate": 1.0387755102040817e-05, + "loss": 0.2844, + "step": 9420 + }, + { + "epoch": 48.11224489795919, + "grad_norm": 11.842287063598633, + "learning_rate": 1.0377551020408162e-05, + "loss": 0.3377, + "step": 9430 + }, + { + "epoch": 48.16326530612245, + "grad_norm": 14.558892250061035, + "learning_rate": 1.0367346938775512e-05, + "loss": 0.2846, + "step": 9440 + }, + { + "epoch": 48.214285714285715, + "grad_norm": 2.9422316551208496, + "learning_rate": 1.0357142857142859e-05, + "loss": 0.2325, + "step": 9450 + }, + { + "epoch": 48.265306122448976, + "grad_norm": 2.472435235977173, + "learning_rate": 1.0346938775510205e-05, + "loss": 0.1945, + "step": 9460 + }, + { + "epoch": 48.316326530612244, + "grad_norm": 6.785258769989014, + "learning_rate": 1.0336734693877552e-05, + "loss": 0.2503, + "step": 9470 + }, + { + "epoch": 48.36734693877551, + "grad_norm": 1.3686105012893677, + "learning_rate": 1.0326530612244898e-05, + "loss": 0.3424, + "step": 9480 + }, + { + "epoch": 48.41836734693877, + "grad_norm": 2.2825608253479004, + "learning_rate": 1.0316326530612246e-05, + "loss": 0.1742, + "step": 9490 + }, + { + "epoch": 48.46938775510204, + "grad_norm": 8.19665813446045, + "learning_rate": 1.0306122448979591e-05, + "loss": 0.2663, + "step": 9500 + }, + { + "epoch": 48.52040816326531, + "grad_norm": 5.659435272216797, + "learning_rate": 1.029591836734694e-05, + "loss": 0.2131, + "step": 9510 + }, + { + "epoch": 48.57142857142857, + "grad_norm": 6.981075763702393, + "learning_rate": 1.0285714285714285e-05, + "loss": 0.209, + "step": 9520 + }, + { + "epoch": 48.62244897959184, + "grad_norm": 5.7839202880859375, + "learning_rate": 1.0275510204081634e-05, + "loss": 0.1524, + "step": 9530 + }, + { + "epoch": 48.673469387755105, + "grad_norm": 6.4543890953063965, + "learning_rate": 1.0265306122448981e-05, + "loss": 0.332, + "step": 9540 + }, + { + "epoch": 48.724489795918366, + "grad_norm": 1.987481713294983, + "learning_rate": 1.0255102040816327e-05, + "loss": 0.2465, + "step": 9550 + }, + { + "epoch": 48.775510204081634, + "grad_norm": 4.892277717590332, + "learning_rate": 1.0244897959183675e-05, + "loss": 0.3518, + "step": 9560 + }, + { + "epoch": 48.826530612244895, + "grad_norm": 6.381310939788818, + "learning_rate": 1.023469387755102e-05, + "loss": 0.2743, + "step": 9570 + }, + { + "epoch": 48.87755102040816, + "grad_norm": 8.336471557617188, + "learning_rate": 1.0224489795918368e-05, + "loss": 0.4257, + "step": 9580 + }, + { + "epoch": 48.92857142857143, + "grad_norm": 1.3258341550827026, + "learning_rate": 1.0214285714285714e-05, + "loss": 0.055, + "step": 9590 + }, + { + "epoch": 48.97959183673469, + "grad_norm": 10.289790153503418, + "learning_rate": 1.0204081632653063e-05, + "loss": 0.6512, + "step": 9600 + }, + { + "epoch": 49.0, + "eval_accuracy": 0.9205776173285198, + "eval_loss": 0.37217509746551514, + "eval_runtime": 0.9375, + "eval_samples_per_second": 295.453, + "eval_steps_per_second": 37.332, + "step": 9604 + }, + { + "epoch": 49.03061224489796, + "grad_norm": 2.5601608753204346, + "learning_rate": 1.0193877551020409e-05, + "loss": 0.2695, + "step": 9610 + }, + { + "epoch": 49.08163265306123, + "grad_norm": 8.381750106811523, + "learning_rate": 1.0183673469387756e-05, + "loss": 0.3243, + "step": 9620 + }, + { + "epoch": 49.13265306122449, + "grad_norm": 9.810985565185547, + "learning_rate": 1.0173469387755104e-05, + "loss": 0.3454, + "step": 9630 + }, + { + "epoch": 49.183673469387756, + "grad_norm": 10.479945182800293, + "learning_rate": 1.016326530612245e-05, + "loss": 0.324, + "step": 9640 + }, + { + "epoch": 49.234693877551024, + "grad_norm": 9.079980850219727, + "learning_rate": 1.0153061224489797e-05, + "loss": 0.427, + "step": 9650 + }, + { + "epoch": 49.285714285714285, + "grad_norm": 11.63463020324707, + "learning_rate": 1.0142857142857143e-05, + "loss": 0.3591, + "step": 9660 + }, + { + "epoch": 49.33673469387755, + "grad_norm": 2.4312291145324707, + "learning_rate": 1.013265306122449e-05, + "loss": 0.2909, + "step": 9670 + }, + { + "epoch": 49.38775510204081, + "grad_norm": 1.2564700841903687, + "learning_rate": 1.0122448979591836e-05, + "loss": 0.1863, + "step": 9680 + }, + { + "epoch": 49.43877551020408, + "grad_norm": 5.644169330596924, + "learning_rate": 1.0112244897959185e-05, + "loss": 0.2788, + "step": 9690 + }, + { + "epoch": 49.48979591836735, + "grad_norm": 6.595949172973633, + "learning_rate": 1.0102040816326531e-05, + "loss": 0.3373, + "step": 9700 + }, + { + "epoch": 49.54081632653061, + "grad_norm": 0.5495048761367798, + "learning_rate": 1.0091836734693879e-05, + "loss": 0.1776, + "step": 9710 + }, + { + "epoch": 49.59183673469388, + "grad_norm": 11.747842788696289, + "learning_rate": 1.0081632653061226e-05, + "loss": 0.352, + "step": 9720 + }, + { + "epoch": 49.642857142857146, + "grad_norm": 2.48116135597229, + "learning_rate": 1.0071428571428572e-05, + "loss": 0.1722, + "step": 9730 + }, + { + "epoch": 49.69387755102041, + "grad_norm": 8.725930213928223, + "learning_rate": 1.006122448979592e-05, + "loss": 0.1544, + "step": 9740 + }, + { + "epoch": 49.744897959183675, + "grad_norm": 1.277961254119873, + "learning_rate": 1.0051020408163265e-05, + "loss": 0.2064, + "step": 9750 + }, + { + "epoch": 49.795918367346935, + "grad_norm": 0.349727064371109, + "learning_rate": 1.0040816326530614e-05, + "loss": 0.1728, + "step": 9760 + }, + { + "epoch": 49.8469387755102, + "grad_norm": 2.0223069190979004, + "learning_rate": 1.003061224489796e-05, + "loss": 0.4544, + "step": 9770 + }, + { + "epoch": 49.89795918367347, + "grad_norm": 0.8309898972511292, + "learning_rate": 1.0020408163265308e-05, + "loss": 0.1539, + "step": 9780 + }, + { + "epoch": 49.94897959183673, + "grad_norm": 2.0115163326263428, + "learning_rate": 1.0010204081632653e-05, + "loss": 0.4388, + "step": 9790 + }, + { + "epoch": 50.0, + "grad_norm": 7.568119525909424, + "learning_rate": 1e-05, + "loss": 0.3152, + "step": 9800 + }, + { + "epoch": 50.0, + "eval_accuracy": 0.9097472924187726, + "eval_loss": 0.2852139472961426, + "eval_runtime": 0.9346, + "eval_samples_per_second": 296.382, + "eval_steps_per_second": 37.449, + "step": 9800 + }, + { + "epoch": 50.05102040816327, + "grad_norm": 1.160243034362793, + "learning_rate": 9.989795918367348e-06, + "loss": 0.2282, + "step": 9810 + }, + { + "epoch": 50.10204081632653, + "grad_norm": 13.818071365356445, + "learning_rate": 9.979591836734694e-06, + "loss": 0.2916, + "step": 9820 + }, + { + "epoch": 50.1530612244898, + "grad_norm": 4.897973537445068, + "learning_rate": 9.969387755102042e-06, + "loss": 0.2524, + "step": 9830 + }, + { + "epoch": 50.204081632653065, + "grad_norm": 5.517409324645996, + "learning_rate": 9.959183673469387e-06, + "loss": 0.2813, + "step": 9840 + }, + { + "epoch": 50.255102040816325, + "grad_norm": 6.2214274406433105, + "learning_rate": 9.948979591836737e-06, + "loss": 0.2462, + "step": 9850 + }, + { + "epoch": 50.30612244897959, + "grad_norm": 0.3451865315437317, + "learning_rate": 9.938775510204082e-06, + "loss": 0.2339, + "step": 9860 + }, + { + "epoch": 50.357142857142854, + "grad_norm": 6.983676433563232, + "learning_rate": 9.92857142857143e-06, + "loss": 0.1045, + "step": 9870 + }, + { + "epoch": 50.40816326530612, + "grad_norm": 1.0857036113739014, + "learning_rate": 9.918367346938776e-06, + "loss": 0.0822, + "step": 9880 + }, + { + "epoch": 50.45918367346939, + "grad_norm": 0.8820812702178955, + "learning_rate": 9.908163265306123e-06, + "loss": 0.2875, + "step": 9890 + }, + { + "epoch": 50.51020408163265, + "grad_norm": 4.251252174377441, + "learning_rate": 9.89795918367347e-06, + "loss": 0.2354, + "step": 9900 + }, + { + "epoch": 50.56122448979592, + "grad_norm": 17.90270233154297, + "learning_rate": 9.887755102040816e-06, + "loss": 0.4852, + "step": 9910 + }, + { + "epoch": 50.61224489795919, + "grad_norm": 11.787466049194336, + "learning_rate": 9.877551020408164e-06, + "loss": 0.4315, + "step": 9920 + }, + { + "epoch": 50.66326530612245, + "grad_norm": 3.597476005554199, + "learning_rate": 9.867346938775511e-06, + "loss": 0.2477, + "step": 9930 + }, + { + "epoch": 50.714285714285715, + "grad_norm": 2.140202522277832, + "learning_rate": 9.857142857142859e-06, + "loss": 0.1874, + "step": 9940 + }, + { + "epoch": 50.765306122448976, + "grad_norm": 11.62266731262207, + "learning_rate": 9.846938775510205e-06, + "loss": 0.3983, + "step": 9950 + }, + { + "epoch": 50.816326530612244, + "grad_norm": 11.113944053649902, + "learning_rate": 9.836734693877552e-06, + "loss": 0.4397, + "step": 9960 + }, + { + "epoch": 50.86734693877551, + "grad_norm": 9.346400260925293, + "learning_rate": 9.8265306122449e-06, + "loss": 0.2636, + "step": 9970 + }, + { + "epoch": 50.91836734693877, + "grad_norm": 1.1383917331695557, + "learning_rate": 9.816326530612245e-06, + "loss": 0.2994, + "step": 9980 + }, + { + "epoch": 50.96938775510204, + "grad_norm": 6.278914451599121, + "learning_rate": 9.806122448979593e-06, + "loss": 0.1165, + "step": 9990 + }, + { + "epoch": 51.0, + "eval_accuracy": 0.8628158844765343, + "eval_loss": 0.4137951135635376, + "eval_runtime": 0.9343, + "eval_samples_per_second": 296.489, + "eval_steps_per_second": 37.463, + "step": 9996 + }, + { + "epoch": 51.02040816326531, + "grad_norm": 5.278802871704102, + "learning_rate": 9.795918367346939e-06, + "loss": 0.251, + "step": 10000 + }, + { + "epoch": 51.07142857142857, + "grad_norm": 10.420355796813965, + "learning_rate": 9.785714285714286e-06, + "loss": 0.1771, + "step": 10010 + }, + { + "epoch": 51.12244897959184, + "grad_norm": 14.361730575561523, + "learning_rate": 9.775510204081634e-06, + "loss": 0.3886, + "step": 10020 + }, + { + "epoch": 51.173469387755105, + "grad_norm": 7.604317665100098, + "learning_rate": 9.765306122448981e-06, + "loss": 0.1775, + "step": 10030 + }, + { + "epoch": 51.224489795918366, + "grad_norm": 1.9585540294647217, + "learning_rate": 9.755102040816327e-06, + "loss": 0.2672, + "step": 10040 + }, + { + "epoch": 51.275510204081634, + "grad_norm": 1.5716756582260132, + "learning_rate": 9.744897959183674e-06, + "loss": 0.093, + "step": 10050 + }, + { + "epoch": 51.326530612244895, + "grad_norm": 10.107017517089844, + "learning_rate": 9.734693877551022e-06, + "loss": 0.4693, + "step": 10060 + }, + { + "epoch": 51.37755102040816, + "grad_norm": 2.6521267890930176, + "learning_rate": 9.724489795918368e-06, + "loss": 0.2922, + "step": 10070 + }, + { + "epoch": 51.42857142857143, + "grad_norm": 3.5833919048309326, + "learning_rate": 9.714285714285715e-06, + "loss": 0.1893, + "step": 10080 + }, + { + "epoch": 51.47959183673469, + "grad_norm": 3.322532892227173, + "learning_rate": 9.704081632653061e-06, + "loss": 0.1881, + "step": 10090 + }, + { + "epoch": 51.53061224489796, + "grad_norm": 12.746216773986816, + "learning_rate": 9.693877551020408e-06, + "loss": 0.4767, + "step": 10100 + }, + { + "epoch": 51.58163265306123, + "grad_norm": 4.599057674407959, + "learning_rate": 9.683673469387756e-06, + "loss": 0.2251, + "step": 10110 + }, + { + "epoch": 51.63265306122449, + "grad_norm": 11.859138488769531, + "learning_rate": 9.673469387755103e-06, + "loss": 0.2792, + "step": 10120 + }, + { + "epoch": 51.683673469387756, + "grad_norm": 0.5115078091621399, + "learning_rate": 9.663265306122451e-06, + "loss": 0.0973, + "step": 10130 + }, + { + "epoch": 51.734693877551024, + "grad_norm": 1.4567404985427856, + "learning_rate": 9.653061224489797e-06, + "loss": 0.1249, + "step": 10140 + }, + { + "epoch": 51.785714285714285, + "grad_norm": 1.8316110372543335, + "learning_rate": 9.642857142857144e-06, + "loss": 0.2642, + "step": 10150 + }, + { + "epoch": 51.83673469387755, + "grad_norm": 7.921043395996094, + "learning_rate": 9.63265306122449e-06, + "loss": 0.2997, + "step": 10160 + }, + { + "epoch": 51.88775510204081, + "grad_norm": 0.15132638812065125, + "learning_rate": 9.622448979591837e-06, + "loss": 0.2621, + "step": 10170 + }, + { + "epoch": 51.93877551020408, + "grad_norm": 3.1799213886260986, + "learning_rate": 9.612244897959185e-06, + "loss": 0.4857, + "step": 10180 + }, + { + "epoch": 51.98979591836735, + "grad_norm": 6.599296569824219, + "learning_rate": 9.60204081632653e-06, + "loss": 0.216, + "step": 10190 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.34130236506462097, + "eval_runtime": 0.9456, + "eval_samples_per_second": 292.95, + "eval_steps_per_second": 37.015, + "step": 10192 + }, + { + "epoch": 52.04081632653061, + "grad_norm": 0.39954033493995667, + "learning_rate": 9.591836734693878e-06, + "loss": 0.2164, + "step": 10200 + }, + { + "epoch": 52.09183673469388, + "grad_norm": 0.30738261342048645, + "learning_rate": 9.581632653061226e-06, + "loss": 0.314, + "step": 10210 + }, + { + "epoch": 52.142857142857146, + "grad_norm": 4.521490573883057, + "learning_rate": 9.571428571428573e-06, + "loss": 0.4708, + "step": 10220 + }, + { + "epoch": 52.19387755102041, + "grad_norm": 8.1461181640625, + "learning_rate": 9.561224489795919e-06, + "loss": 0.3965, + "step": 10230 + }, + { + "epoch": 52.244897959183675, + "grad_norm": 8.73324966430664, + "learning_rate": 9.551020408163266e-06, + "loss": 0.2251, + "step": 10240 + }, + { + "epoch": 52.295918367346935, + "grad_norm": 5.212022304534912, + "learning_rate": 9.540816326530612e-06, + "loss": 0.0885, + "step": 10250 + }, + { + "epoch": 52.3469387755102, + "grad_norm": 3.9524590969085693, + "learning_rate": 9.53061224489796e-06, + "loss": 0.2023, + "step": 10260 + }, + { + "epoch": 52.39795918367347, + "grad_norm": 6.595174789428711, + "learning_rate": 9.520408163265307e-06, + "loss": 0.1592, + "step": 10270 + }, + { + "epoch": 52.44897959183673, + "grad_norm": 10.941976547241211, + "learning_rate": 9.510204081632653e-06, + "loss": 0.4717, + "step": 10280 + }, + { + "epoch": 52.5, + "grad_norm": 8.068604469299316, + "learning_rate": 9.5e-06, + "loss": 0.3821, + "step": 10290 + }, + { + "epoch": 52.55102040816327, + "grad_norm": 6.500422477722168, + "learning_rate": 9.489795918367348e-06, + "loss": 0.171, + "step": 10300 + }, + { + "epoch": 52.60204081632653, + "grad_norm": 18.862512588500977, + "learning_rate": 9.479591836734695e-06, + "loss": 0.3375, + "step": 10310 + }, + { + "epoch": 52.6530612244898, + "grad_norm": 4.508473873138428, + "learning_rate": 9.469387755102041e-06, + "loss": 0.2942, + "step": 10320 + }, + { + "epoch": 52.704081632653065, + "grad_norm": 2.781569004058838, + "learning_rate": 9.459183673469389e-06, + "loss": 0.2091, + "step": 10330 + }, + { + "epoch": 52.755102040816325, + "grad_norm": 14.221521377563477, + "learning_rate": 9.448979591836736e-06, + "loss": 0.3069, + "step": 10340 + }, + { + "epoch": 52.80612244897959, + "grad_norm": 5.661198139190674, + "learning_rate": 9.438775510204082e-06, + "loss": 0.2134, + "step": 10350 + }, + { + "epoch": 52.857142857142854, + "grad_norm": 10.020429611206055, + "learning_rate": 9.42857142857143e-06, + "loss": 0.2813, + "step": 10360 + }, + { + "epoch": 52.90816326530612, + "grad_norm": 10.018217086791992, + "learning_rate": 9.418367346938775e-06, + "loss": 0.3867, + "step": 10370 + }, + { + "epoch": 52.95918367346939, + "grad_norm": 0.26604777574539185, + "learning_rate": 9.408163265306123e-06, + "loss": 0.1455, + "step": 10380 + }, + { + "epoch": 53.0, + "eval_accuracy": 0.9169675090252708, + "eval_loss": 0.30462250113487244, + "eval_runtime": 0.9334, + "eval_samples_per_second": 296.752, + "eval_steps_per_second": 37.496, + "step": 10388 + }, + { + "epoch": 53.01020408163265, + "grad_norm": 2.2215254306793213, + "learning_rate": 9.39795918367347e-06, + "loss": 0.418, + "step": 10390 + }, + { + "epoch": 53.06122448979592, + "grad_norm": 5.510087966918945, + "learning_rate": 9.387755102040818e-06, + "loss": 0.362, + "step": 10400 + }, + { + "epoch": 53.11224489795919, + "grad_norm": 1.2044073343276978, + "learning_rate": 9.377551020408164e-06, + "loss": 0.2571, + "step": 10410 + }, + { + "epoch": 53.16326530612245, + "grad_norm": 6.720277786254883, + "learning_rate": 9.367346938775511e-06, + "loss": 0.2213, + "step": 10420 + }, + { + "epoch": 53.214285714285715, + "grad_norm": 10.51973819732666, + "learning_rate": 9.357142857142859e-06, + "loss": 0.149, + "step": 10430 + }, + { + "epoch": 53.265306122448976, + "grad_norm": 7.785160064697266, + "learning_rate": 9.346938775510204e-06, + "loss": 0.3354, + "step": 10440 + }, + { + "epoch": 53.316326530612244, + "grad_norm": 2.9712700843811035, + "learning_rate": 9.336734693877552e-06, + "loss": 0.2505, + "step": 10450 + }, + { + "epoch": 53.36734693877551, + "grad_norm": 4.9919962882995605, + "learning_rate": 9.326530612244898e-06, + "loss": 0.2684, + "step": 10460 + }, + { + "epoch": 53.41836734693877, + "grad_norm": 2.1093006134033203, + "learning_rate": 9.316326530612245e-06, + "loss": 0.1597, + "step": 10470 + }, + { + "epoch": 53.46938775510204, + "grad_norm": 4.116796016693115, + "learning_rate": 9.306122448979593e-06, + "loss": 0.2162, + "step": 10480 + }, + { + "epoch": 53.52040816326531, + "grad_norm": 4.428661346435547, + "learning_rate": 9.29591836734694e-06, + "loss": 0.1825, + "step": 10490 + }, + { + "epoch": 53.57142857142857, + "grad_norm": 5.626094341278076, + "learning_rate": 9.285714285714288e-06, + "loss": 0.1225, + "step": 10500 + }, + { + "epoch": 53.62244897959184, + "grad_norm": 1.5667091608047485, + "learning_rate": 9.275510204081633e-06, + "loss": 0.2052, + "step": 10510 + }, + { + "epoch": 53.673469387755105, + "grad_norm": 3.0137667655944824, + "learning_rate": 9.26530612244898e-06, + "loss": 0.1321, + "step": 10520 + }, + { + "epoch": 53.724489795918366, + "grad_norm": 7.578505516052246, + "learning_rate": 9.255102040816327e-06, + "loss": 0.2478, + "step": 10530 + }, + { + "epoch": 53.775510204081634, + "grad_norm": 2.6894452571868896, + "learning_rate": 9.244897959183674e-06, + "loss": 0.2133, + "step": 10540 + }, + { + "epoch": 53.826530612244895, + "grad_norm": 13.425848007202148, + "learning_rate": 9.234693877551022e-06, + "loss": 0.2095, + "step": 10550 + }, + { + "epoch": 53.87755102040816, + "grad_norm": 0.28253173828125, + "learning_rate": 9.224489795918367e-06, + "loss": 0.2264, + "step": 10560 + }, + { + "epoch": 53.92857142857143, + "grad_norm": 1.276113748550415, + "learning_rate": 9.214285714285715e-06, + "loss": 0.1314, + "step": 10570 + }, + { + "epoch": 53.97959183673469, + "grad_norm": 3.8790574073791504, + "learning_rate": 9.204081632653062e-06, + "loss": 0.554, + "step": 10580 + }, + { + "epoch": 54.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.2849152684211731, + "eval_runtime": 0.9325, + "eval_samples_per_second": 297.065, + "eval_steps_per_second": 37.535, + "step": 10584 + }, + { + "epoch": 54.03061224489796, + "grad_norm": 9.941154479980469, + "learning_rate": 9.19387755102041e-06, + "loss": 0.2049, + "step": 10590 + }, + { + "epoch": 54.08163265306123, + "grad_norm": 13.770801544189453, + "learning_rate": 9.183673469387756e-06, + "loss": 0.1889, + "step": 10600 + }, + { + "epoch": 54.13265306122449, + "grad_norm": 0.3031742572784424, + "learning_rate": 9.173469387755103e-06, + "loss": 0.3012, + "step": 10610 + }, + { + "epoch": 54.183673469387756, + "grad_norm": 6.222158908843994, + "learning_rate": 9.163265306122449e-06, + "loss": 0.1944, + "step": 10620 + }, + { + "epoch": 54.234693877551024, + "grad_norm": 0.2729008197784424, + "learning_rate": 9.153061224489796e-06, + "loss": 0.1875, + "step": 10630 + }, + { + "epoch": 54.285714285714285, + "grad_norm": 1.3354541063308716, + "learning_rate": 9.142857142857144e-06, + "loss": 0.1959, + "step": 10640 + }, + { + "epoch": 54.33673469387755, + "grad_norm": 3.6860952377319336, + "learning_rate": 9.13265306122449e-06, + "loss": 0.4611, + "step": 10650 + }, + { + "epoch": 54.38775510204081, + "grad_norm": 2.5632665157318115, + "learning_rate": 9.122448979591837e-06, + "loss": 0.1682, + "step": 10660 + }, + { + "epoch": 54.43877551020408, + "grad_norm": 0.9468786120414734, + "learning_rate": 9.112244897959185e-06, + "loss": 0.2813, + "step": 10670 + }, + { + "epoch": 54.48979591836735, + "grad_norm": 7.614074230194092, + "learning_rate": 9.102040816326532e-06, + "loss": 0.235, + "step": 10680 + }, + { + "epoch": 54.54081632653061, + "grad_norm": 10.54065227508545, + "learning_rate": 9.091836734693878e-06, + "loss": 0.2602, + "step": 10690 + }, + { + "epoch": 54.59183673469388, + "grad_norm": 14.505642890930176, + "learning_rate": 9.081632653061225e-06, + "loss": 0.3777, + "step": 10700 + }, + { + "epoch": 54.642857142857146, + "grad_norm": 6.543236255645752, + "learning_rate": 9.071428571428573e-06, + "loss": 0.2374, + "step": 10710 + }, + { + "epoch": 54.69387755102041, + "grad_norm": 1.4957891702651978, + "learning_rate": 9.061224489795919e-06, + "loss": 0.3301, + "step": 10720 + }, + { + "epoch": 54.744897959183675, + "grad_norm": 4.956941604614258, + "learning_rate": 9.051020408163266e-06, + "loss": 0.2131, + "step": 10730 + }, + { + "epoch": 54.795918367346935, + "grad_norm": 16.052499771118164, + "learning_rate": 9.040816326530612e-06, + "loss": 0.3309, + "step": 10740 + }, + { + "epoch": 54.8469387755102, + "grad_norm": 2.2586865425109863, + "learning_rate": 9.03061224489796e-06, + "loss": 0.2048, + "step": 10750 + }, + { + "epoch": 54.89795918367347, + "grad_norm": 6.091680526733398, + "learning_rate": 9.020408163265307e-06, + "loss": 0.1964, + "step": 10760 + }, + { + "epoch": 54.94897959183673, + "grad_norm": 2.350778818130493, + "learning_rate": 9.010204081632654e-06, + "loss": 0.1545, + "step": 10770 + }, + { + "epoch": 55.0, + "grad_norm": 15.803840637207031, + "learning_rate": 9e-06, + "loss": 0.3586, + "step": 10780 + }, + { + "epoch": 55.0, + "eval_accuracy": 0.9133574007220217, + "eval_loss": 0.35168057680130005, + "eval_runtime": 0.9347, + "eval_samples_per_second": 296.337, + "eval_steps_per_second": 37.443, + "step": 10780 + }, + { + "epoch": 55.05102040816327, + "grad_norm": 8.534765243530273, + "learning_rate": 8.989795918367348e-06, + "loss": 0.172, + "step": 10790 + }, + { + "epoch": 55.10204081632653, + "grad_norm": 6.930604457855225, + "learning_rate": 8.979591836734695e-06, + "loss": 0.3134, + "step": 10800 + }, + { + "epoch": 55.1530612244898, + "grad_norm": 4.500041961669922, + "learning_rate": 8.969387755102041e-06, + "loss": 0.3951, + "step": 10810 + }, + { + "epoch": 55.204081632653065, + "grad_norm": 10.691000938415527, + "learning_rate": 8.959183673469388e-06, + "loss": 0.556, + "step": 10820 + }, + { + "epoch": 55.255102040816325, + "grad_norm": 5.5316853523254395, + "learning_rate": 8.948979591836734e-06, + "loss": 0.2841, + "step": 10830 + }, + { + "epoch": 55.30612244897959, + "grad_norm": 11.046544075012207, + "learning_rate": 8.938775510204082e-06, + "loss": 0.1667, + "step": 10840 + }, + { + "epoch": 55.357142857142854, + "grad_norm": 3.7789947986602783, + "learning_rate": 8.92857142857143e-06, + "loss": 0.2502, + "step": 10850 + }, + { + "epoch": 55.40816326530612, + "grad_norm": 3.0397422313690186, + "learning_rate": 8.918367346938777e-06, + "loss": 0.3261, + "step": 10860 + }, + { + "epoch": 55.45918367346939, + "grad_norm": 10.146750450134277, + "learning_rate": 8.908163265306124e-06, + "loss": 0.4223, + "step": 10870 + }, + { + "epoch": 55.51020408163265, + "grad_norm": 14.126100540161133, + "learning_rate": 8.89795918367347e-06, + "loss": 0.3837, + "step": 10880 + }, + { + "epoch": 55.56122448979592, + "grad_norm": 12.385411262512207, + "learning_rate": 8.887755102040817e-06, + "loss": 0.4422, + "step": 10890 + }, + { + "epoch": 55.61224489795919, + "grad_norm": 1.6094768047332764, + "learning_rate": 8.877551020408163e-06, + "loss": 0.2001, + "step": 10900 + }, + { + "epoch": 55.66326530612245, + "grad_norm": 0.18602848052978516, + "learning_rate": 8.86734693877551e-06, + "loss": 0.2053, + "step": 10910 + }, + { + "epoch": 55.714285714285715, + "grad_norm": 7.022019386291504, + "learning_rate": 8.857142857142858e-06, + "loss": 0.3404, + "step": 10920 + }, + { + "epoch": 55.765306122448976, + "grad_norm": 3.6204686164855957, + "learning_rate": 8.846938775510204e-06, + "loss": 0.1141, + "step": 10930 + }, + { + "epoch": 55.816326530612244, + "grad_norm": 0.32874494791030884, + "learning_rate": 8.836734693877552e-06, + "loss": 0.1693, + "step": 10940 + }, + { + "epoch": 55.86734693877551, + "grad_norm": 3.1909286975860596, + "learning_rate": 8.826530612244899e-06, + "loss": 0.333, + "step": 10950 + }, + { + "epoch": 55.91836734693877, + "grad_norm": 6.641414165496826, + "learning_rate": 8.816326530612247e-06, + "loss": 0.2012, + "step": 10960 + }, + { + "epoch": 55.96938775510204, + "grad_norm": 9.18001651763916, + "learning_rate": 8.806122448979592e-06, + "loss": 0.2239, + "step": 10970 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.9025270758122743, + "eval_loss": 0.45381447672843933, + "eval_runtime": 0.9387, + "eval_samples_per_second": 295.088, + "eval_steps_per_second": 37.286, + "step": 10976 + }, + { + "epoch": 56.02040816326531, + "grad_norm": 7.012698173522949, + "learning_rate": 8.79591836734694e-06, + "loss": 0.3606, + "step": 10980 + }, + { + "epoch": 56.07142857142857, + "grad_norm": 0.33083972334861755, + "learning_rate": 8.785714285714286e-06, + "loss": 0.164, + "step": 10990 + }, + { + "epoch": 56.12244897959184, + "grad_norm": 1.1617504358291626, + "learning_rate": 8.775510204081633e-06, + "loss": 0.4036, + "step": 11000 + }, + { + "epoch": 56.173469387755105, + "grad_norm": 18.734424591064453, + "learning_rate": 8.76530612244898e-06, + "loss": 0.2242, + "step": 11010 + }, + { + "epoch": 56.224489795918366, + "grad_norm": 0.8280305862426758, + "learning_rate": 8.755102040816326e-06, + "loss": 0.24, + "step": 11020 + }, + { + "epoch": 56.275510204081634, + "grad_norm": 1.0710173845291138, + "learning_rate": 8.744897959183676e-06, + "loss": 0.2093, + "step": 11030 + }, + { + "epoch": 56.326530612244895, + "grad_norm": 0.26472461223602295, + "learning_rate": 8.734693877551021e-06, + "loss": 0.3167, + "step": 11040 + }, + { + "epoch": 56.37755102040816, + "grad_norm": 17.181556701660156, + "learning_rate": 8.724489795918369e-06, + "loss": 0.3231, + "step": 11050 + }, + { + "epoch": 56.42857142857143, + "grad_norm": 0.4559863209724426, + "learning_rate": 8.714285714285715e-06, + "loss": 0.1493, + "step": 11060 + }, + { + "epoch": 56.47959183673469, + "grad_norm": 14.466240882873535, + "learning_rate": 8.704081632653062e-06, + "loss": 0.3536, + "step": 11070 + }, + { + "epoch": 56.53061224489796, + "grad_norm": 1.9131622314453125, + "learning_rate": 8.69387755102041e-06, + "loss": 0.2293, + "step": 11080 + }, + { + "epoch": 56.58163265306123, + "grad_norm": 3.1065633296966553, + "learning_rate": 8.683673469387755e-06, + "loss": 0.1173, + "step": 11090 + }, + { + "epoch": 56.63265306122449, + "grad_norm": 12.009686470031738, + "learning_rate": 8.673469387755103e-06, + "loss": 0.2443, + "step": 11100 + }, + { + "epoch": 56.683673469387756, + "grad_norm": 2.6055593490600586, + "learning_rate": 8.663265306122449e-06, + "loss": 0.4624, + "step": 11110 + }, + { + "epoch": 56.734693877551024, + "grad_norm": 6.770671367645264, + "learning_rate": 8.653061224489798e-06, + "loss": 0.2318, + "step": 11120 + }, + { + "epoch": 56.785714285714285, + "grad_norm": 6.624651908874512, + "learning_rate": 8.642857142857144e-06, + "loss": 0.3801, + "step": 11130 + }, + { + "epoch": 56.83673469387755, + "grad_norm": 0.4821954667568207, + "learning_rate": 8.632653061224491e-06, + "loss": 0.2533, + "step": 11140 + }, + { + "epoch": 56.88775510204081, + "grad_norm": 7.2710113525390625, + "learning_rate": 8.622448979591837e-06, + "loss": 0.2761, + "step": 11150 + }, + { + "epoch": 56.93877551020408, + "grad_norm": 20.32927131652832, + "learning_rate": 8.612244897959184e-06, + "loss": 0.4597, + "step": 11160 + }, + { + "epoch": 56.98979591836735, + "grad_norm": 0.4477481245994568, + "learning_rate": 8.602040816326532e-06, + "loss": 0.1725, + "step": 11170 + }, + { + "epoch": 57.0, + "eval_accuracy": 0.8592057761732852, + "eval_loss": 0.44916799664497375, + "eval_runtime": 0.9302, + "eval_samples_per_second": 297.771, + "eval_steps_per_second": 37.625, + "step": 11172 + }, + { + "epoch": 57.04081632653061, + "grad_norm": 11.276688575744629, + "learning_rate": 8.591836734693878e-06, + "loss": 0.234, + "step": 11180 + }, + { + "epoch": 57.09183673469388, + "grad_norm": 1.517318844795227, + "learning_rate": 8.581632653061225e-06, + "loss": 0.1559, + "step": 11190 + }, + { + "epoch": 57.142857142857146, + "grad_norm": 7.025010585784912, + "learning_rate": 8.571428571428571e-06, + "loss": 0.2112, + "step": 11200 + }, + { + "epoch": 57.19387755102041, + "grad_norm": 10.036983489990234, + "learning_rate": 8.56122448979592e-06, + "loss": 0.2964, + "step": 11210 + }, + { + "epoch": 57.244897959183675, + "grad_norm": 9.007682800292969, + "learning_rate": 8.551020408163266e-06, + "loss": 0.2345, + "step": 11220 + }, + { + "epoch": 57.295918367346935, + "grad_norm": 15.974291801452637, + "learning_rate": 8.540816326530613e-06, + "loss": 0.5178, + "step": 11230 + }, + { + "epoch": 57.3469387755102, + "grad_norm": 0.5210385918617249, + "learning_rate": 8.530612244897961e-06, + "loss": 0.0861, + "step": 11240 + }, + { + "epoch": 57.39795918367347, + "grad_norm": 0.22501814365386963, + "learning_rate": 8.520408163265307e-06, + "loss": 0.1869, + "step": 11250 + }, + { + "epoch": 57.44897959183673, + "grad_norm": 1.9573888778686523, + "learning_rate": 8.510204081632654e-06, + "loss": 0.2643, + "step": 11260 + }, + { + "epoch": 57.5, + "grad_norm": 3.5790092945098877, + "learning_rate": 8.5e-06, + "loss": 0.1604, + "step": 11270 + }, + { + "epoch": 57.55102040816327, + "grad_norm": 10.310325622558594, + "learning_rate": 8.489795918367347e-06, + "loss": 0.1772, + "step": 11280 + }, + { + "epoch": 57.60204081632653, + "grad_norm": 10.196538925170898, + "learning_rate": 8.479591836734695e-06, + "loss": 0.1882, + "step": 11290 + }, + { + "epoch": 57.6530612244898, + "grad_norm": 6.247244358062744, + "learning_rate": 8.469387755102042e-06, + "loss": 0.2541, + "step": 11300 + }, + { + "epoch": 57.704081632653065, + "grad_norm": 11.577144622802734, + "learning_rate": 8.459183673469388e-06, + "loss": 0.3183, + "step": 11310 + }, + { + "epoch": 57.755102040816325, + "grad_norm": 1.9273695945739746, + "learning_rate": 8.448979591836736e-06, + "loss": 0.1365, + "step": 11320 + }, + { + "epoch": 57.80612244897959, + "grad_norm": 3.144540548324585, + "learning_rate": 8.438775510204083e-06, + "loss": 0.2859, + "step": 11330 + }, + { + "epoch": 57.857142857142854, + "grad_norm": 10.630606651306152, + "learning_rate": 8.428571428571429e-06, + "loss": 0.2768, + "step": 11340 + }, + { + "epoch": 57.90816326530612, + "grad_norm": 17.469684600830078, + "learning_rate": 8.418367346938776e-06, + "loss": 0.3497, + "step": 11350 + }, + { + "epoch": 57.95918367346939, + "grad_norm": 1.6836483478546143, + "learning_rate": 8.408163265306122e-06, + "loss": 0.4689, + "step": 11360 + }, + { + "epoch": 58.0, + "eval_accuracy": 0.8628158844765343, + "eval_loss": 0.4738902747631073, + "eval_runtime": 0.9341, + "eval_samples_per_second": 296.536, + "eval_steps_per_second": 37.468, + "step": 11368 + }, + { + "epoch": 58.01020408163265, + "grad_norm": 12.947834968566895, + "learning_rate": 8.39795918367347e-06, + "loss": 0.3388, + "step": 11370 + }, + { + "epoch": 58.06122448979592, + "grad_norm": 9.694038391113281, + "learning_rate": 8.387755102040817e-06, + "loss": 0.2179, + "step": 11380 + }, + { + "epoch": 58.11224489795919, + "grad_norm": 2.7403390407562256, + "learning_rate": 8.377551020408165e-06, + "loss": 0.2709, + "step": 11390 + }, + { + "epoch": 58.16326530612245, + "grad_norm": 6.763411045074463, + "learning_rate": 8.36734693877551e-06, + "loss": 0.1844, + "step": 11400 + }, + { + "epoch": 58.214285714285715, + "grad_norm": 9.05958080291748, + "learning_rate": 8.357142857142858e-06, + "loss": 0.1588, + "step": 11410 + }, + { + "epoch": 58.265306122448976, + "grad_norm": 8.429780960083008, + "learning_rate": 8.346938775510205e-06, + "loss": 0.3569, + "step": 11420 + }, + { + "epoch": 58.316326530612244, + "grad_norm": 5.592264175415039, + "learning_rate": 8.336734693877551e-06, + "loss": 0.2468, + "step": 11430 + }, + { + "epoch": 58.36734693877551, + "grad_norm": 3.081688642501831, + "learning_rate": 8.326530612244899e-06, + "loss": 0.0708, + "step": 11440 + }, + { + "epoch": 58.41836734693877, + "grad_norm": 1.1242700815200806, + "learning_rate": 8.316326530612246e-06, + "loss": 0.4146, + "step": 11450 + }, + { + "epoch": 58.46938775510204, + "grad_norm": 9.29017162322998, + "learning_rate": 8.306122448979592e-06, + "loss": 0.2877, + "step": 11460 + }, + { + "epoch": 58.52040816326531, + "grad_norm": 1.7449393272399902, + "learning_rate": 8.29591836734694e-06, + "loss": 0.3654, + "step": 11470 + }, + { + "epoch": 58.57142857142857, + "grad_norm": 9.28416919708252, + "learning_rate": 8.285714285714287e-06, + "loss": 0.2411, + "step": 11480 + }, + { + "epoch": 58.62244897959184, + "grad_norm": 2.053839683532715, + "learning_rate": 8.275510204081634e-06, + "loss": 0.2477, + "step": 11490 + }, + { + "epoch": 58.673469387755105, + "grad_norm": 0.5099808573722839, + "learning_rate": 8.26530612244898e-06, + "loss": 0.086, + "step": 11500 + }, + { + "epoch": 58.724489795918366, + "grad_norm": 5.123264789581299, + "learning_rate": 8.255102040816328e-06, + "loss": 0.1873, + "step": 11510 + }, + { + "epoch": 58.775510204081634, + "grad_norm": 0.3827672004699707, + "learning_rate": 8.244897959183674e-06, + "loss": 0.245, + "step": 11520 + }, + { + "epoch": 58.826530612244895, + "grad_norm": 6.846168041229248, + "learning_rate": 8.234693877551021e-06, + "loss": 0.6233, + "step": 11530 + }, + { + "epoch": 58.87755102040816, + "grad_norm": 0.9291605949401855, + "learning_rate": 8.224489795918369e-06, + "loss": 0.2342, + "step": 11540 + }, + { + "epoch": 58.92857142857143, + "grad_norm": 4.6036834716796875, + "learning_rate": 8.214285714285714e-06, + "loss": 0.2538, + "step": 11550 + }, + { + "epoch": 58.97959183673469, + "grad_norm": 10.339995384216309, + "learning_rate": 8.204081632653062e-06, + "loss": 0.3565, + "step": 11560 + }, + { + "epoch": 59.0, + "eval_accuracy": 0.9205776173285198, + "eval_loss": 0.2831103205680847, + "eval_runtime": 0.9374, + "eval_samples_per_second": 295.493, + "eval_steps_per_second": 37.337, + "step": 11564 + }, + { + "epoch": 59.03061224489796, + "grad_norm": 1.1292849779129028, + "learning_rate": 8.19387755102041e-06, + "loss": 0.1716, + "step": 11570 + }, + { + "epoch": 59.08163265306123, + "grad_norm": 0.5711737275123596, + "learning_rate": 8.183673469387757e-06, + "loss": 0.1703, + "step": 11580 + }, + { + "epoch": 59.13265306122449, + "grad_norm": 0.48393628001213074, + "learning_rate": 8.173469387755103e-06, + "loss": 0.1301, + "step": 11590 + }, + { + "epoch": 59.183673469387756, + "grad_norm": 0.1569240391254425, + "learning_rate": 8.16326530612245e-06, + "loss": 0.1911, + "step": 11600 + }, + { + "epoch": 59.234693877551024, + "grad_norm": 11.578142166137695, + "learning_rate": 8.153061224489796e-06, + "loss": 0.1351, + "step": 11610 + }, + { + "epoch": 59.285714285714285, + "grad_norm": 11.53231430053711, + "learning_rate": 8.142857142857143e-06, + "loss": 0.3704, + "step": 11620 + }, + { + "epoch": 59.33673469387755, + "grad_norm": 7.75083589553833, + "learning_rate": 8.13265306122449e-06, + "loss": 0.2867, + "step": 11630 + }, + { + "epoch": 59.38775510204081, + "grad_norm": 1.0234230756759644, + "learning_rate": 8.122448979591837e-06, + "loss": 0.2227, + "step": 11640 + }, + { + "epoch": 59.43877551020408, + "grad_norm": 6.651299953460693, + "learning_rate": 8.112244897959184e-06, + "loss": 0.1946, + "step": 11650 + }, + { + "epoch": 59.48979591836735, + "grad_norm": 7.23628568649292, + "learning_rate": 8.102040816326532e-06, + "loss": 0.1456, + "step": 11660 + }, + { + "epoch": 59.54081632653061, + "grad_norm": 2.716782331466675, + "learning_rate": 8.091836734693879e-06, + "loss": 0.1696, + "step": 11670 + }, + { + "epoch": 59.59183673469388, + "grad_norm": 4.466701030731201, + "learning_rate": 8.081632653061225e-06, + "loss": 0.29, + "step": 11680 + }, + { + "epoch": 59.642857142857146, + "grad_norm": 16.736553192138672, + "learning_rate": 8.071428571428572e-06, + "loss": 0.3853, + "step": 11690 + }, + { + "epoch": 59.69387755102041, + "grad_norm": 0.4944998621940613, + "learning_rate": 8.06122448979592e-06, + "loss": 0.354, + "step": 11700 + }, + { + "epoch": 59.744897959183675, + "grad_norm": 3.3472633361816406, + "learning_rate": 8.051020408163266e-06, + "loss": 0.2751, + "step": 11710 + }, + { + "epoch": 59.795918367346935, + "grad_norm": 2.8295209407806396, + "learning_rate": 8.040816326530613e-06, + "loss": 0.1301, + "step": 11720 + }, + { + "epoch": 59.8469387755102, + "grad_norm": 1.9733431339263916, + "learning_rate": 8.030612244897959e-06, + "loss": 0.5063, + "step": 11730 + }, + { + "epoch": 59.89795918367347, + "grad_norm": 0.1797264963388443, + "learning_rate": 8.020408163265306e-06, + "loss": 0.1427, + "step": 11740 + }, + { + "epoch": 59.94897959183673, + "grad_norm": 9.685842514038086, + "learning_rate": 8.010204081632654e-06, + "loss": 0.2899, + "step": 11750 + }, + { + "epoch": 60.0, + "grad_norm": 13.383129119873047, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2259, + "step": 11760 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.9205776173285198, + "eval_loss": 0.34654152393341064, + "eval_runtime": 0.9505, + "eval_samples_per_second": 291.425, + "eval_steps_per_second": 36.823, + "step": 11760 + }, + { + "epoch": 60.05102040816327, + "grad_norm": 2.9076764583587646, + "learning_rate": 7.989795918367347e-06, + "loss": 0.065, + "step": 11770 + }, + { + "epoch": 60.10204081632653, + "grad_norm": 8.378006935119629, + "learning_rate": 7.979591836734695e-06, + "loss": 0.2488, + "step": 11780 + }, + { + "epoch": 60.1530612244898, + "grad_norm": 12.380623817443848, + "learning_rate": 7.969387755102042e-06, + "loss": 0.3169, + "step": 11790 + }, + { + "epoch": 60.204081632653065, + "grad_norm": 4.164218425750732, + "learning_rate": 7.959183673469388e-06, + "loss": 0.1818, + "step": 11800 + }, + { + "epoch": 60.255102040816325, + "grad_norm": 14.524949073791504, + "learning_rate": 7.948979591836735e-06, + "loss": 0.3417, + "step": 11810 + }, + { + "epoch": 60.30612244897959, + "grad_norm": 6.435640811920166, + "learning_rate": 7.938775510204081e-06, + "loss": 0.274, + "step": 11820 + }, + { + "epoch": 60.357142857142854, + "grad_norm": 4.574618816375732, + "learning_rate": 7.928571428571429e-06, + "loss": 0.4462, + "step": 11830 + }, + { + "epoch": 60.40816326530612, + "grad_norm": 7.529806137084961, + "learning_rate": 7.918367346938776e-06, + "loss": 0.3905, + "step": 11840 + }, + { + "epoch": 60.45918367346939, + "grad_norm": 3.4086387157440186, + "learning_rate": 7.908163265306124e-06, + "loss": 0.301, + "step": 11850 + }, + { + "epoch": 60.51020408163265, + "grad_norm": 0.847341001033783, + "learning_rate": 7.897959183673471e-06, + "loss": 0.1661, + "step": 11860 + }, + { + "epoch": 60.56122448979592, + "grad_norm": 3.0656235218048096, + "learning_rate": 7.887755102040817e-06, + "loss": 0.2628, + "step": 11870 + }, + { + "epoch": 60.61224489795919, + "grad_norm": 1.6243197917938232, + "learning_rate": 7.877551020408164e-06, + "loss": 0.1612, + "step": 11880 + }, + { + "epoch": 60.66326530612245, + "grad_norm": 11.95365047454834, + "learning_rate": 7.86734693877551e-06, + "loss": 0.1425, + "step": 11890 + }, + { + "epoch": 60.714285714285715, + "grad_norm": 7.095501899719238, + "learning_rate": 7.857142857142858e-06, + "loss": 0.1915, + "step": 11900 + }, + { + "epoch": 60.765306122448976, + "grad_norm": 8.617204666137695, + "learning_rate": 7.846938775510205e-06, + "loss": 0.1871, + "step": 11910 + }, + { + "epoch": 60.816326530612244, + "grad_norm": 7.942294120788574, + "learning_rate": 7.836734693877551e-06, + "loss": 0.2194, + "step": 11920 + }, + { + "epoch": 60.86734693877551, + "grad_norm": 4.334501266479492, + "learning_rate": 7.826530612244898e-06, + "loss": 0.2936, + "step": 11930 + }, + { + "epoch": 60.91836734693877, + "grad_norm": 6.984222888946533, + "learning_rate": 7.816326530612246e-06, + "loss": 0.2264, + "step": 11940 + }, + { + "epoch": 60.96938775510204, + "grad_norm": 10.189035415649414, + "learning_rate": 7.806122448979593e-06, + "loss": 0.2212, + "step": 11950 + }, + { + "epoch": 61.0, + "eval_accuracy": 0.9314079422382672, + "eval_loss": 0.28843551874160767, + "eval_runtime": 0.9333, + "eval_samples_per_second": 296.782, + "eval_steps_per_second": 37.5, + "step": 11956 + }, + { + "epoch": 61.02040816326531, + "grad_norm": 5.532245635986328, + "learning_rate": 7.79591836734694e-06, + "loss": 0.2216, + "step": 11960 + }, + { + "epoch": 61.07142857142857, + "grad_norm": 2.590909242630005, + "learning_rate": 7.785714285714287e-06, + "loss": 0.1215, + "step": 11970 + }, + { + "epoch": 61.12244897959184, + "grad_norm": 7.619017601013184, + "learning_rate": 7.775510204081632e-06, + "loss": 0.1876, + "step": 11980 + }, + { + "epoch": 61.173469387755105, + "grad_norm": 17.85263442993164, + "learning_rate": 7.76530612244898e-06, + "loss": 0.2472, + "step": 11990 + }, + { + "epoch": 61.224489795918366, + "grad_norm": 5.559810161590576, + "learning_rate": 7.755102040816327e-06, + "loss": 0.3429, + "step": 12000 + }, + { + "epoch": 61.275510204081634, + "grad_norm": 3.257960796356201, + "learning_rate": 7.744897959183673e-06, + "loss": 0.2473, + "step": 12010 + }, + { + "epoch": 61.326530612244895, + "grad_norm": 2.2071876525878906, + "learning_rate": 7.73469387755102e-06, + "loss": 0.1332, + "step": 12020 + }, + { + "epoch": 61.37755102040816, + "grad_norm": 0.41261130571365356, + "learning_rate": 7.724489795918368e-06, + "loss": 0.1469, + "step": 12030 + }, + { + "epoch": 61.42857142857143, + "grad_norm": 5.125622272491455, + "learning_rate": 7.714285714285716e-06, + "loss": 0.3787, + "step": 12040 + }, + { + "epoch": 61.47959183673469, + "grad_norm": 0.23421549797058105, + "learning_rate": 7.704081632653061e-06, + "loss": 0.0996, + "step": 12050 + }, + { + "epoch": 61.53061224489796, + "grad_norm": 0.6105390787124634, + "learning_rate": 7.693877551020409e-06, + "loss": 0.1756, + "step": 12060 + }, + { + "epoch": 61.58163265306123, + "grad_norm": 9.674683570861816, + "learning_rate": 7.683673469387756e-06, + "loss": 0.2785, + "step": 12070 + }, + { + "epoch": 61.63265306122449, + "grad_norm": 10.404973030090332, + "learning_rate": 7.673469387755102e-06, + "loss": 0.3011, + "step": 12080 + }, + { + "epoch": 61.683673469387756, + "grad_norm": 22.518468856811523, + "learning_rate": 7.66326530612245e-06, + "loss": 0.5065, + "step": 12090 + }, + { + "epoch": 61.734693877551024, + "grad_norm": 7.028233051300049, + "learning_rate": 7.653061224489796e-06, + "loss": 0.1553, + "step": 12100 + }, + { + "epoch": 61.785714285714285, + "grad_norm": 0.5314661860466003, + "learning_rate": 7.642857142857143e-06, + "loss": 0.2187, + "step": 12110 + }, + { + "epoch": 61.83673469387755, + "grad_norm": 6.046501159667969, + "learning_rate": 7.63265306122449e-06, + "loss": 0.1042, + "step": 12120 + }, + { + "epoch": 61.88775510204081, + "grad_norm": 1.112051248550415, + "learning_rate": 7.622448979591838e-06, + "loss": 0.3595, + "step": 12130 + }, + { + "epoch": 61.93877551020408, + "grad_norm": 2.3171072006225586, + "learning_rate": 7.612244897959185e-06, + "loss": 0.19, + "step": 12140 + }, + { + "epoch": 61.98979591836735, + "grad_norm": 11.073331832885742, + "learning_rate": 7.602040816326531e-06, + "loss": 0.2648, + "step": 12150 + }, + { + "epoch": 62.0, + "eval_accuracy": 0.8447653429602888, + "eval_loss": 0.4874745309352875, + "eval_runtime": 0.9313, + "eval_samples_per_second": 297.448, + "eval_steps_per_second": 37.584, + "step": 12152 + }, + { + "epoch": 62.04081632653061, + "grad_norm": 2.1476967334747314, + "learning_rate": 7.591836734693878e-06, + "loss": 0.1787, + "step": 12160 + }, + { + "epoch": 62.09183673469388, + "grad_norm": 0.47355252504348755, + "learning_rate": 7.581632653061225e-06, + "loss": 0.1529, + "step": 12170 + }, + { + "epoch": 62.142857142857146, + "grad_norm": 6.79935359954834, + "learning_rate": 7.571428571428572e-06, + "loss": 0.2228, + "step": 12180 + }, + { + "epoch": 62.19387755102041, + "grad_norm": 4.178977966308594, + "learning_rate": 7.561224489795919e-06, + "loss": 0.1832, + "step": 12190 + }, + { + "epoch": 62.244897959183675, + "grad_norm": 12.742783546447754, + "learning_rate": 7.551020408163265e-06, + "loss": 0.2281, + "step": 12200 + }, + { + "epoch": 62.295918367346935, + "grad_norm": 1.5981056690216064, + "learning_rate": 7.540816326530614e-06, + "loss": 0.2196, + "step": 12210 + }, + { + "epoch": 62.3469387755102, + "grad_norm": 5.777743339538574, + "learning_rate": 7.53061224489796e-06, + "loss": 0.4064, + "step": 12220 + }, + { + "epoch": 62.39795918367347, + "grad_norm": 1.1234701871871948, + "learning_rate": 7.520408163265307e-06, + "loss": 0.2547, + "step": 12230 + }, + { + "epoch": 62.44897959183673, + "grad_norm": 3.0036509037017822, + "learning_rate": 7.5102040816326536e-06, + "loss": 0.2274, + "step": 12240 + }, + { + "epoch": 62.5, + "grad_norm": 0.5534256100654602, + "learning_rate": 7.500000000000001e-06, + "loss": 0.1548, + "step": 12250 + }, + { + "epoch": 62.55102040816327, + "grad_norm": 1.8934872150421143, + "learning_rate": 7.489795918367348e-06, + "loss": 0.0968, + "step": 12260 + }, + { + "epoch": 62.60204081632653, + "grad_norm": 4.307313919067383, + "learning_rate": 7.479591836734694e-06, + "loss": 0.2193, + "step": 12270 + }, + { + "epoch": 62.6530612244898, + "grad_norm": 11.095629692077637, + "learning_rate": 7.469387755102041e-06, + "loss": 0.2613, + "step": 12280 + }, + { + "epoch": 62.704081632653065, + "grad_norm": 2.2190847396850586, + "learning_rate": 7.459183673469388e-06, + "loss": 0.1524, + "step": 12290 + }, + { + "epoch": 62.755102040816325, + "grad_norm": 1.4255743026733398, + "learning_rate": 7.448979591836736e-06, + "loss": 0.0702, + "step": 12300 + }, + { + "epoch": 62.80612244897959, + "grad_norm": 0.909092903137207, + "learning_rate": 7.4387755102040826e-06, + "loss": 0.2265, + "step": 12310 + }, + { + "epoch": 62.857142857142854, + "grad_norm": 7.196882724761963, + "learning_rate": 7.428571428571429e-06, + "loss": 0.2823, + "step": 12320 + }, + { + "epoch": 62.90816326530612, + "grad_norm": 8.029342651367188, + "learning_rate": 7.418367346938776e-06, + "loss": 0.1778, + "step": 12330 + }, + { + "epoch": 62.95918367346939, + "grad_norm": 7.18194055557251, + "learning_rate": 7.408163265306123e-06, + "loss": 0.3438, + "step": 12340 + }, + { + "epoch": 63.0, + "eval_accuracy": 0.9061371841155235, + "eval_loss": 0.3988655209541321, + "eval_runtime": 0.9937, + "eval_samples_per_second": 278.765, + "eval_steps_per_second": 35.223, + "step": 12348 + }, + { + "epoch": 63.01020408163265, + "grad_norm": 8.065620422363281, + "learning_rate": 7.39795918367347e-06, + "loss": 0.2821, + "step": 12350 + }, + { + "epoch": 63.06122448979592, + "grad_norm": 1.9322210550308228, + "learning_rate": 7.387755102040817e-06, + "loss": 0.095, + "step": 12360 + }, + { + "epoch": 63.11224489795919, + "grad_norm": 3.922755002975464, + "learning_rate": 7.377551020408163e-06, + "loss": 0.2767, + "step": 12370 + }, + { + "epoch": 63.16326530612245, + "grad_norm": 10.300689697265625, + "learning_rate": 7.367346938775511e-06, + "loss": 0.1315, + "step": 12380 + }, + { + "epoch": 63.214285714285715, + "grad_norm": 12.600347518920898, + "learning_rate": 7.357142857142858e-06, + "loss": 0.2517, + "step": 12390 + }, + { + "epoch": 63.265306122448976, + "grad_norm": 7.00731897354126, + "learning_rate": 7.346938775510205e-06, + "loss": 0.1496, + "step": 12400 + }, + { + "epoch": 63.316326530612244, + "grad_norm": 9.240813255310059, + "learning_rate": 7.3367346938775515e-06, + "loss": 0.196, + "step": 12410 + }, + { + "epoch": 63.36734693877551, + "grad_norm": 9.863750457763672, + "learning_rate": 7.326530612244899e-06, + "loss": 0.3422, + "step": 12420 + }, + { + "epoch": 63.41836734693877, + "grad_norm": 1.6901395320892334, + "learning_rate": 7.316326530612246e-06, + "loss": 0.1819, + "step": 12430 + }, + { + "epoch": 63.46938775510204, + "grad_norm": 12.429134368896484, + "learning_rate": 7.306122448979592e-06, + "loss": 0.4074, + "step": 12440 + }, + { + "epoch": 63.52040816326531, + "grad_norm": 1.9005409479141235, + "learning_rate": 7.295918367346939e-06, + "loss": 0.323, + "step": 12450 + }, + { + "epoch": 63.57142857142857, + "grad_norm": 5.085525989532471, + "learning_rate": 7.285714285714286e-06, + "loss": 0.1967, + "step": 12460 + }, + { + "epoch": 63.62244897959184, + "grad_norm": 7.6504340171813965, + "learning_rate": 7.275510204081633e-06, + "loss": 0.4315, + "step": 12470 + }, + { + "epoch": 63.673469387755105, + "grad_norm": 0.2195933312177658, + "learning_rate": 7.2653061224489805e-06, + "loss": 0.183, + "step": 12480 + }, + { + "epoch": 63.724489795918366, + "grad_norm": 8.156294822692871, + "learning_rate": 7.255102040816327e-06, + "loss": 0.1558, + "step": 12490 + }, + { + "epoch": 63.775510204081634, + "grad_norm": 3.2828145027160645, + "learning_rate": 7.244897959183675e-06, + "loss": 0.3911, + "step": 12500 + }, + { + "epoch": 63.826530612244895, + "grad_norm": 1.5318927764892578, + "learning_rate": 7.234693877551021e-06, + "loss": 0.1245, + "step": 12510 + }, + { + "epoch": 63.87755102040816, + "grad_norm": 1.8835053443908691, + "learning_rate": 7.224489795918368e-06, + "loss": 0.2946, + "step": 12520 + }, + { + "epoch": 63.92857142857143, + "grad_norm": 10.958850860595703, + "learning_rate": 7.2142857142857145e-06, + "loss": 0.1734, + "step": 12530 + }, + { + "epoch": 63.97959183673469, + "grad_norm": 4.654305934906006, + "learning_rate": 7.204081632653061e-06, + "loss": 0.4785, + "step": 12540 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.851985559566787, + "eval_loss": 0.5952923893928528, + "eval_runtime": 0.9341, + "eval_samples_per_second": 296.557, + "eval_steps_per_second": 37.471, + "step": 12544 + }, + { + "epoch": 64.03061224489795, + "grad_norm": 8.106365203857422, + "learning_rate": 7.193877551020409e-06, + "loss": 0.1046, + "step": 12550 + }, + { + "epoch": 64.08163265306122, + "grad_norm": 18.67780113220215, + "learning_rate": 7.183673469387755e-06, + "loss": 0.294, + "step": 12560 + }, + { + "epoch": 64.13265306122449, + "grad_norm": 0.6589402556419373, + "learning_rate": 7.173469387755103e-06, + "loss": 0.2063, + "step": 12570 + }, + { + "epoch": 64.18367346938776, + "grad_norm": 17.626237869262695, + "learning_rate": 7.16326530612245e-06, + "loss": 0.246, + "step": 12580 + }, + { + "epoch": 64.23469387755102, + "grad_norm": 6.089940071105957, + "learning_rate": 7.153061224489797e-06, + "loss": 0.2377, + "step": 12590 + }, + { + "epoch": 64.28571428571429, + "grad_norm": 3.529348134994507, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.1449, + "step": 12600 + }, + { + "epoch": 64.33673469387755, + "grad_norm": 8.65750789642334, + "learning_rate": 7.13265306122449e-06, + "loss": 0.3385, + "step": 12610 + }, + { + "epoch": 64.38775510204081, + "grad_norm": 1.2404389381408691, + "learning_rate": 7.122448979591837e-06, + "loss": 0.1215, + "step": 12620 + }, + { + "epoch": 64.43877551020408, + "grad_norm": 0.9970741271972656, + "learning_rate": 7.112244897959184e-06, + "loss": 0.0967, + "step": 12630 + }, + { + "epoch": 64.48979591836735, + "grad_norm": 10.057463645935059, + "learning_rate": 7.102040816326531e-06, + "loss": 0.3337, + "step": 12640 + }, + { + "epoch": 64.54081632653062, + "grad_norm": 0.8182612061500549, + "learning_rate": 7.091836734693878e-06, + "loss": 0.2539, + "step": 12650 + }, + { + "epoch": 64.59183673469387, + "grad_norm": 2.4172310829162598, + "learning_rate": 7.081632653061226e-06, + "loss": 0.2421, + "step": 12660 + }, + { + "epoch": 64.64285714285714, + "grad_norm": 12.167418479919434, + "learning_rate": 7.0714285714285726e-06, + "loss": 0.2537, + "step": 12670 + }, + { + "epoch": 64.6938775510204, + "grad_norm": 6.439947605133057, + "learning_rate": 7.061224489795919e-06, + "loss": 0.471, + "step": 12680 + }, + { + "epoch": 64.74489795918367, + "grad_norm": 8.06596565246582, + "learning_rate": 7.051020408163266e-06, + "loss": 0.4742, + "step": 12690 + }, + { + "epoch": 64.79591836734694, + "grad_norm": 6.091670513153076, + "learning_rate": 7.0408163265306125e-06, + "loss": 0.1066, + "step": 12700 + }, + { + "epoch": 64.84693877551021, + "grad_norm": 7.659086227416992, + "learning_rate": 7.03061224489796e-06, + "loss": 0.1774, + "step": 12710 + }, + { + "epoch": 64.89795918367346, + "grad_norm": 4.606033802032471, + "learning_rate": 7.020408163265307e-06, + "loss": 0.157, + "step": 12720 + }, + { + "epoch": 64.94897959183673, + "grad_norm": 7.792344570159912, + "learning_rate": 7.010204081632653e-06, + "loss": 0.2453, + "step": 12730 + }, + { + "epoch": 65.0, + "grad_norm": 9.722793579101562, + "learning_rate": 7e-06, + "loss": 0.06, + "step": 12740 + }, + { + "epoch": 65.0, + "eval_accuracy": 0.927797833935018, + "eval_loss": 0.2953914701938629, + "eval_runtime": 0.9311, + "eval_samples_per_second": 297.487, + "eval_steps_per_second": 37.589, + "step": 12740 + }, + { + "epoch": 65.05102040816327, + "grad_norm": 1.075190544128418, + "learning_rate": 6.989795918367348e-06, + "loss": 0.0614, + "step": 12750 + }, + { + "epoch": 65.10204081632654, + "grad_norm": 0.13435253500938416, + "learning_rate": 6.979591836734695e-06, + "loss": 0.2266, + "step": 12760 + }, + { + "epoch": 65.15306122448979, + "grad_norm": 8.657276153564453, + "learning_rate": 6.9693877551020415e-06, + "loss": 0.1568, + "step": 12770 + }, + { + "epoch": 65.20408163265306, + "grad_norm": 4.1393656730651855, + "learning_rate": 6.959183673469388e-06, + "loss": 0.4365, + "step": 12780 + }, + { + "epoch": 65.25510204081633, + "grad_norm": 9.038256645202637, + "learning_rate": 6.948979591836736e-06, + "loss": 0.1867, + "step": 12790 + }, + { + "epoch": 65.3061224489796, + "grad_norm": 5.405453681945801, + "learning_rate": 6.938775510204082e-06, + "loss": 0.2024, + "step": 12800 + }, + { + "epoch": 65.35714285714286, + "grad_norm": 11.368095397949219, + "learning_rate": 6.928571428571429e-06, + "loss": 0.1922, + "step": 12810 + }, + { + "epoch": 65.40816326530613, + "grad_norm": 2.3432440757751465, + "learning_rate": 6.9183673469387755e-06, + "loss": 0.3727, + "step": 12820 + }, + { + "epoch": 65.45918367346938, + "grad_norm": 6.652347087860107, + "learning_rate": 6.908163265306122e-06, + "loss": 0.2511, + "step": 12830 + }, + { + "epoch": 65.51020408163265, + "grad_norm": 4.0521135330200195, + "learning_rate": 6.8979591836734705e-06, + "loss": 0.2601, + "step": 12840 + }, + { + "epoch": 65.56122448979592, + "grad_norm": 4.677563190460205, + "learning_rate": 6.887755102040817e-06, + "loss": 0.0881, + "step": 12850 + }, + { + "epoch": 65.61224489795919, + "grad_norm": 8.159587860107422, + "learning_rate": 6.877551020408164e-06, + "loss": 0.135, + "step": 12860 + }, + { + "epoch": 65.66326530612245, + "grad_norm": 7.858253479003906, + "learning_rate": 6.867346938775511e-06, + "loss": 0.188, + "step": 12870 + }, + { + "epoch": 65.71428571428571, + "grad_norm": 4.100142478942871, + "learning_rate": 6.857142857142858e-06, + "loss": 0.3009, + "step": 12880 + }, + { + "epoch": 65.76530612244898, + "grad_norm": 16.863325119018555, + "learning_rate": 6.8469387755102046e-06, + "loss": 0.4487, + "step": 12890 + }, + { + "epoch": 65.81632653061224, + "grad_norm": 1.883056402206421, + "learning_rate": 6.836734693877551e-06, + "loss": 0.1038, + "step": 12900 + }, + { + "epoch": 65.86734693877551, + "grad_norm": 10.750667572021484, + "learning_rate": 6.826530612244898e-06, + "loss": 0.2203, + "step": 12910 + }, + { + "epoch": 65.91836734693878, + "grad_norm": 11.105408668518066, + "learning_rate": 6.816326530612245e-06, + "loss": 0.2605, + "step": 12920 + }, + { + "epoch": 65.96938775510205, + "grad_norm": 1.4005035161972046, + "learning_rate": 6.806122448979592e-06, + "loss": 0.1965, + "step": 12930 + }, + { + "epoch": 66.0, + "eval_accuracy": 0.851985559566787, + "eval_loss": 0.5033073425292969, + "eval_runtime": 0.929, + "eval_samples_per_second": 298.158, + "eval_steps_per_second": 37.673, + "step": 12936 + }, + { + "epoch": 66.0204081632653, + "grad_norm": 1.441422462463379, + "learning_rate": 6.7959183673469394e-06, + "loss": 0.3474, + "step": 12940 + }, + { + "epoch": 66.07142857142857, + "grad_norm": 0.8666236996650696, + "learning_rate": 6.785714285714287e-06, + "loss": 0.2731, + "step": 12950 + }, + { + "epoch": 66.12244897959184, + "grad_norm": 1.4008103609085083, + "learning_rate": 6.7755102040816336e-06, + "loss": 0.1877, + "step": 12960 + }, + { + "epoch": 66.1734693877551, + "grad_norm": 3.096766948699951, + "learning_rate": 6.76530612244898e-06, + "loss": 0.2621, + "step": 12970 + }, + { + "epoch": 66.22448979591837, + "grad_norm": 2.314729690551758, + "learning_rate": 6.755102040816327e-06, + "loss": 0.2495, + "step": 12980 + }, + { + "epoch": 66.27551020408163, + "grad_norm": 0.8084143400192261, + "learning_rate": 6.7448979591836735e-06, + "loss": 0.0909, + "step": 12990 + }, + { + "epoch": 66.3265306122449, + "grad_norm": 10.513947486877441, + "learning_rate": 6.734693877551021e-06, + "loss": 0.4269, + "step": 13000 + }, + { + "epoch": 66.37755102040816, + "grad_norm": 21.57809829711914, + "learning_rate": 6.724489795918368e-06, + "loss": 0.3193, + "step": 13010 + }, + { + "epoch": 66.42857142857143, + "grad_norm": 1.8246338367462158, + "learning_rate": 6.714285714285714e-06, + "loss": 0.0993, + "step": 13020 + }, + { + "epoch": 66.4795918367347, + "grad_norm": 10.447412490844727, + "learning_rate": 6.704081632653063e-06, + "loss": 0.1901, + "step": 13030 + }, + { + "epoch": 66.53061224489795, + "grad_norm": 0.9140769243240356, + "learning_rate": 6.693877551020409e-06, + "loss": 0.0921, + "step": 13040 + }, + { + "epoch": 66.58163265306122, + "grad_norm": 7.879315376281738, + "learning_rate": 6.683673469387756e-06, + "loss": 0.1556, + "step": 13050 + }, + { + "epoch": 66.63265306122449, + "grad_norm": 5.356064319610596, + "learning_rate": 6.6734693877551025e-06, + "loss": 0.1502, + "step": 13060 + }, + { + "epoch": 66.68367346938776, + "grad_norm": 3.6449520587921143, + "learning_rate": 6.663265306122449e-06, + "loss": 0.1708, + "step": 13070 + }, + { + "epoch": 66.73469387755102, + "grad_norm": 14.209073066711426, + "learning_rate": 6.653061224489797e-06, + "loss": 0.2792, + "step": 13080 + }, + { + "epoch": 66.78571428571429, + "grad_norm": 0.243174210190773, + "learning_rate": 6.642857142857143e-06, + "loss": 0.2626, + "step": 13090 + }, + { + "epoch": 66.83673469387755, + "grad_norm": 0.8402270674705505, + "learning_rate": 6.63265306122449e-06, + "loss": 0.341, + "step": 13100 + }, + { + "epoch": 66.88775510204081, + "grad_norm": 1.1840009689331055, + "learning_rate": 6.6224489795918365e-06, + "loss": 0.077, + "step": 13110 + }, + { + "epoch": 66.93877551020408, + "grad_norm": 1.2926697731018066, + "learning_rate": 6.612244897959185e-06, + "loss": 0.1844, + "step": 13120 + }, + { + "epoch": 66.98979591836735, + "grad_norm": 0.45596539974212646, + "learning_rate": 6.6020408163265315e-06, + "loss": 0.3548, + "step": 13130 + }, + { + "epoch": 67.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.41321179270744324, + "eval_runtime": 0.9321, + "eval_samples_per_second": 297.182, + "eval_steps_per_second": 37.55, + "step": 13132 + }, + { + "epoch": 67.04081632653062, + "grad_norm": 6.747880458831787, + "learning_rate": 6.591836734693878e-06, + "loss": 0.2125, + "step": 13140 + }, + { + "epoch": 67.09183673469387, + "grad_norm": 12.555801391601562, + "learning_rate": 6.581632653061225e-06, + "loss": 0.1632, + "step": 13150 + }, + { + "epoch": 67.14285714285714, + "grad_norm": 11.827412605285645, + "learning_rate": 6.571428571428572e-06, + "loss": 0.1482, + "step": 13160 + }, + { + "epoch": 67.1938775510204, + "grad_norm": 2.0173394680023193, + "learning_rate": 6.561224489795919e-06, + "loss": 0.3297, + "step": 13170 + }, + { + "epoch": 67.24489795918367, + "grad_norm": 7.96342658996582, + "learning_rate": 6.5510204081632656e-06, + "loss": 0.093, + "step": 13180 + }, + { + "epoch": 67.29591836734694, + "grad_norm": 10.090936660766602, + "learning_rate": 6.540816326530612e-06, + "loss": 0.3328, + "step": 13190 + }, + { + "epoch": 67.34693877551021, + "grad_norm": 8.72697925567627, + "learning_rate": 6.530612244897959e-06, + "loss": 0.1716, + "step": 13200 + }, + { + "epoch": 67.39795918367346, + "grad_norm": 3.7887697219848633, + "learning_rate": 6.520408163265307e-06, + "loss": 0.316, + "step": 13210 + }, + { + "epoch": 67.44897959183673, + "grad_norm": 9.916276931762695, + "learning_rate": 6.510204081632654e-06, + "loss": 0.1719, + "step": 13220 + }, + { + "epoch": 67.5, + "grad_norm": 0.9465085864067078, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.1347, + "step": 13230 + }, + { + "epoch": 67.55102040816327, + "grad_norm": 2.688920259475708, + "learning_rate": 6.489795918367348e-06, + "loss": 0.2941, + "step": 13240 + }, + { + "epoch": 67.60204081632654, + "grad_norm": 4.175118446350098, + "learning_rate": 6.4795918367346946e-06, + "loss": 0.2066, + "step": 13250 + }, + { + "epoch": 67.65306122448979, + "grad_norm": 6.48569917678833, + "learning_rate": 6.469387755102041e-06, + "loss": 0.2816, + "step": 13260 + }, + { + "epoch": 67.70408163265306, + "grad_norm": 2.492340087890625, + "learning_rate": 6.459183673469388e-06, + "loss": 0.1307, + "step": 13270 + }, + { + "epoch": 67.75510204081633, + "grad_norm": 13.288766860961914, + "learning_rate": 6.4489795918367345e-06, + "loss": 0.2548, + "step": 13280 + }, + { + "epoch": 67.8061224489796, + "grad_norm": 2.2034645080566406, + "learning_rate": 6.438775510204082e-06, + "loss": 0.3435, + "step": 13290 + }, + { + "epoch": 67.85714285714286, + "grad_norm": 1.1759686470031738, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.2406, + "step": 13300 + }, + { + "epoch": 67.90816326530613, + "grad_norm": 17.009254455566406, + "learning_rate": 6.418367346938776e-06, + "loss": 0.1782, + "step": 13310 + }, + { + "epoch": 67.95918367346938, + "grad_norm": 7.602895736694336, + "learning_rate": 6.408163265306124e-06, + "loss": 0.1279, + "step": 13320 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.9169675090252708, + "eval_loss": 0.37430626153945923, + "eval_runtime": 0.9377, + "eval_samples_per_second": 295.419, + "eval_steps_per_second": 37.327, + "step": 13328 + }, + { + "epoch": 68.01020408163265, + "grad_norm": 0.3676498830318451, + "learning_rate": 6.39795918367347e-06, + "loss": 0.387, + "step": 13330 + }, + { + "epoch": 68.06122448979592, + "grad_norm": 9.916852951049805, + "learning_rate": 6.387755102040817e-06, + "loss": 0.3069, + "step": 13340 + }, + { + "epoch": 68.11224489795919, + "grad_norm": 1.7918875217437744, + "learning_rate": 6.3775510204081635e-06, + "loss": 0.2864, + "step": 13350 + }, + { + "epoch": 68.16326530612245, + "grad_norm": 12.890216827392578, + "learning_rate": 6.36734693877551e-06, + "loss": 0.2125, + "step": 13360 + }, + { + "epoch": 68.21428571428571, + "grad_norm": 7.735411643981934, + "learning_rate": 6.357142857142858e-06, + "loss": 0.3003, + "step": 13370 + }, + { + "epoch": 68.26530612244898, + "grad_norm": 1.7586148977279663, + "learning_rate": 6.346938775510204e-06, + "loss": 0.1708, + "step": 13380 + }, + { + "epoch": 68.31632653061224, + "grad_norm": 2.7423510551452637, + "learning_rate": 6.336734693877552e-06, + "loss": 0.2475, + "step": 13390 + }, + { + "epoch": 68.36734693877551, + "grad_norm": 10.699454307556152, + "learning_rate": 6.326530612244899e-06, + "loss": 0.2548, + "step": 13400 + }, + { + "epoch": 68.41836734693878, + "grad_norm": 18.133024215698242, + "learning_rate": 6.316326530612246e-06, + "loss": 0.3625, + "step": 13410 + }, + { + "epoch": 68.46938775510205, + "grad_norm": 10.453315734863281, + "learning_rate": 6.3061224489795925e-06, + "loss": 0.1363, + "step": 13420 + }, + { + "epoch": 68.5204081632653, + "grad_norm": 17.546966552734375, + "learning_rate": 6.295918367346939e-06, + "loss": 0.1487, + "step": 13430 + }, + { + "epoch": 68.57142857142857, + "grad_norm": 6.659013271331787, + "learning_rate": 6.285714285714286e-06, + "loss": 0.1051, + "step": 13440 + }, + { + "epoch": 68.62244897959184, + "grad_norm": 10.408591270446777, + "learning_rate": 6.275510204081633e-06, + "loss": 0.1431, + "step": 13450 + }, + { + "epoch": 68.6734693877551, + "grad_norm": 1.8873507976531982, + "learning_rate": 6.26530612244898e-06, + "loss": 0.2465, + "step": 13460 + }, + { + "epoch": 68.72448979591837, + "grad_norm": 2.6099853515625, + "learning_rate": 6.2551020408163266e-06, + "loss": 0.1926, + "step": 13470 + }, + { + "epoch": 68.77551020408163, + "grad_norm": 6.034646034240723, + "learning_rate": 6.244897959183675e-06, + "loss": 0.3591, + "step": 13480 + }, + { + "epoch": 68.8265306122449, + "grad_norm": 9.02139949798584, + "learning_rate": 6.2346938775510215e-06, + "loss": 0.2112, + "step": 13490 + }, + { + "epoch": 68.87755102040816, + "grad_norm": 3.453343391418457, + "learning_rate": 6.224489795918368e-06, + "loss": 0.2187, + "step": 13500 + }, + { + "epoch": 68.92857142857143, + "grad_norm": 1.6684788465499878, + "learning_rate": 6.214285714285715e-06, + "loss": 0.2207, + "step": 13510 + }, + { + "epoch": 68.9795918367347, + "grad_norm": 0.5747165679931641, + "learning_rate": 6.2040816326530614e-06, + "loss": 0.2879, + "step": 13520 + }, + { + "epoch": 69.0, + "eval_accuracy": 0.776173285198556, + "eval_loss": 0.6423271298408508, + "eval_runtime": 0.9319, + "eval_samples_per_second": 297.254, + "eval_steps_per_second": 37.559, + "step": 13524 + }, + { + "epoch": 69.03061224489795, + "grad_norm": 14.904623985290527, + "learning_rate": 6.193877551020409e-06, + "loss": 0.3672, + "step": 13530 + }, + { + "epoch": 69.08163265306122, + "grad_norm": 10.467020034790039, + "learning_rate": 6.1836734693877556e-06, + "loss": 0.17, + "step": 13540 + }, + { + "epoch": 69.13265306122449, + "grad_norm": 0.9199422597885132, + "learning_rate": 6.173469387755102e-06, + "loss": 0.1159, + "step": 13550 + }, + { + "epoch": 69.18367346938776, + "grad_norm": 4.06253719329834, + "learning_rate": 6.163265306122449e-06, + "loss": 0.1432, + "step": 13560 + }, + { + "epoch": 69.23469387755102, + "grad_norm": 4.386868476867676, + "learning_rate": 6.153061224489797e-06, + "loss": 0.2992, + "step": 13570 + }, + { + "epoch": 69.28571428571429, + "grad_norm": 12.222877502441406, + "learning_rate": 6.142857142857144e-06, + "loss": 0.2958, + "step": 13580 + }, + { + "epoch": 69.33673469387755, + "grad_norm": 6.182301044464111, + "learning_rate": 6.1326530612244905e-06, + "loss": 0.1483, + "step": 13590 + }, + { + "epoch": 69.38775510204081, + "grad_norm": 0.8524590134620667, + "learning_rate": 6.122448979591837e-06, + "loss": 0.1975, + "step": 13600 + }, + { + "epoch": 69.43877551020408, + "grad_norm": 0.8242807388305664, + "learning_rate": 6.112244897959185e-06, + "loss": 0.719, + "step": 13610 + }, + { + "epoch": 69.48979591836735, + "grad_norm": 1.1100050210952759, + "learning_rate": 6.102040816326531e-06, + "loss": 0.266, + "step": 13620 + }, + { + "epoch": 69.54081632653062, + "grad_norm": 1.066702961921692, + "learning_rate": 6.091836734693878e-06, + "loss": 0.218, + "step": 13630 + }, + { + "epoch": 69.59183673469387, + "grad_norm": 2.186619281768799, + "learning_rate": 6.0816326530612245e-06, + "loss": 0.1463, + "step": 13640 + }, + { + "epoch": 69.64285714285714, + "grad_norm": 2.3016552925109863, + "learning_rate": 6.071428571428571e-06, + "loss": 0.1605, + "step": 13650 + }, + { + "epoch": 69.6938775510204, + "grad_norm": 9.424914360046387, + "learning_rate": 6.0612244897959195e-06, + "loss": 0.1386, + "step": 13660 + }, + { + "epoch": 69.74489795918367, + "grad_norm": 1.9103364944458008, + "learning_rate": 6.051020408163266e-06, + "loss": 0.1881, + "step": 13670 + }, + { + "epoch": 69.79591836734694, + "grad_norm": 3.562779188156128, + "learning_rate": 6.040816326530613e-06, + "loss": 0.146, + "step": 13680 + }, + { + "epoch": 69.84693877551021, + "grad_norm": 0.7750720977783203, + "learning_rate": 6.03061224489796e-06, + "loss": 0.0724, + "step": 13690 + }, + { + "epoch": 69.89795918367346, + "grad_norm": 0.9294744729995728, + "learning_rate": 6.020408163265307e-06, + "loss": 0.2269, + "step": 13700 + }, + { + "epoch": 69.94897959183673, + "grad_norm": 0.2986402213573456, + "learning_rate": 6.0102040816326535e-06, + "loss": 0.1076, + "step": 13710 + }, + { + "epoch": 70.0, + "grad_norm": 5.967097759246826, + "learning_rate": 6e-06, + "loss": 0.1757, + "step": 13720 + }, + { + "epoch": 70.0, + "eval_accuracy": 0.8014440433212996, + "eval_loss": 0.5979345440864563, + "eval_runtime": 0.9344, + "eval_samples_per_second": 296.449, + "eval_steps_per_second": 37.457, + "step": 13720 + }, + { + "epoch": 70.05102040816327, + "grad_norm": 13.955671310424805, + "learning_rate": 5.989795918367347e-06, + "loss": 0.294, + "step": 13730 + }, + { + "epoch": 70.10204081632654, + "grad_norm": 0.3971206843852997, + "learning_rate": 5.979591836734694e-06, + "loss": 0.071, + "step": 13740 + }, + { + "epoch": 70.15306122448979, + "grad_norm": 6.5772528648376465, + "learning_rate": 5.969387755102042e-06, + "loss": 0.2846, + "step": 13750 + }, + { + "epoch": 70.20408163265306, + "grad_norm": 1.784285068511963, + "learning_rate": 5.959183673469388e-06, + "loss": 0.2171, + "step": 13760 + }, + { + "epoch": 70.25510204081633, + "grad_norm": 2.0860092639923096, + "learning_rate": 5.948979591836735e-06, + "loss": 0.2162, + "step": 13770 + }, + { + "epoch": 70.3061224489796, + "grad_norm": 0.8636461496353149, + "learning_rate": 5.9387755102040825e-06, + "loss": 0.1208, + "step": 13780 + }, + { + "epoch": 70.35714285714286, + "grad_norm": 0.5153712630271912, + "learning_rate": 5.928571428571429e-06, + "loss": 0.115, + "step": 13790 + }, + { + "epoch": 70.40816326530613, + "grad_norm": 9.724133491516113, + "learning_rate": 5.918367346938776e-06, + "loss": 0.2175, + "step": 13800 + }, + { + "epoch": 70.45918367346938, + "grad_norm": 7.4661407470703125, + "learning_rate": 5.9081632653061224e-06, + "loss": 0.2529, + "step": 13810 + }, + { + "epoch": 70.51020408163265, + "grad_norm": 10.019166946411133, + "learning_rate": 5.89795918367347e-06, + "loss": 0.4082, + "step": 13820 + }, + { + "epoch": 70.56122448979592, + "grad_norm": 1.403428316116333, + "learning_rate": 5.8877551020408166e-06, + "loss": 0.2125, + "step": 13830 + }, + { + "epoch": 70.61224489795919, + "grad_norm": 0.517578125, + "learning_rate": 5.877551020408164e-06, + "loss": 0.3907, + "step": 13840 + }, + { + "epoch": 70.66326530612245, + "grad_norm": 9.466553688049316, + "learning_rate": 5.867346938775511e-06, + "loss": 0.3029, + "step": 13850 + }, + { + "epoch": 70.71428571428571, + "grad_norm": 15.54366397857666, + "learning_rate": 5.857142857142858e-06, + "loss": 0.1536, + "step": 13860 + }, + { + "epoch": 70.76530612244898, + "grad_norm": 7.9617228507995605, + "learning_rate": 5.846938775510205e-06, + "loss": 0.1818, + "step": 13870 + }, + { + "epoch": 70.81632653061224, + "grad_norm": 1.013411521911621, + "learning_rate": 5.8367346938775515e-06, + "loss": 0.3629, + "step": 13880 + }, + { + "epoch": 70.86734693877551, + "grad_norm": 0.12895554304122925, + "learning_rate": 5.826530612244898e-06, + "loss": 0.1266, + "step": 13890 + }, + { + "epoch": 70.91836734693878, + "grad_norm": 10.661433219909668, + "learning_rate": 5.816326530612246e-06, + "loss": 0.2916, + "step": 13900 + }, + { + "epoch": 70.96938775510205, + "grad_norm": 16.746891021728516, + "learning_rate": 5.806122448979592e-06, + "loss": 0.3338, + "step": 13910 + }, + { + "epoch": 71.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.43981537222862244, + "eval_runtime": 0.9347, + "eval_samples_per_second": 296.341, + "eval_steps_per_second": 37.444, + "step": 13916 + }, + { + "epoch": 71.0204081632653, + "grad_norm": 12.110264778137207, + "learning_rate": 5.795918367346939e-06, + "loss": 0.3042, + "step": 13920 + }, + { + "epoch": 71.07142857142857, + "grad_norm": 4.592965602874756, + "learning_rate": 5.785714285714286e-06, + "loss": 0.1833, + "step": 13930 + }, + { + "epoch": 71.12244897959184, + "grad_norm": 3.0122435092926025, + "learning_rate": 5.775510204081634e-06, + "loss": 0.3606, + "step": 13940 + }, + { + "epoch": 71.1734693877551, + "grad_norm": 2.8679792881011963, + "learning_rate": 5.7653061224489805e-06, + "loss": 0.2585, + "step": 13950 + }, + { + "epoch": 71.22448979591837, + "grad_norm": 1.3968089818954468, + "learning_rate": 5.755102040816327e-06, + "loss": 0.117, + "step": 13960 + }, + { + "epoch": 71.27551020408163, + "grad_norm": 4.6621174812316895, + "learning_rate": 5.744897959183674e-06, + "loss": 0.1251, + "step": 13970 + }, + { + "epoch": 71.3265306122449, + "grad_norm": 4.598996162414551, + "learning_rate": 5.73469387755102e-06, + "loss": 0.1581, + "step": 13980 + }, + { + "epoch": 71.37755102040816, + "grad_norm": 16.922142028808594, + "learning_rate": 5.724489795918368e-06, + "loss": 0.2025, + "step": 13990 + }, + { + "epoch": 71.42857142857143, + "grad_norm": 3.0897347927093506, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.2834, + "step": 14000 + }, + { + "epoch": 71.4795918367347, + "grad_norm": 18.263383865356445, + "learning_rate": 5.704081632653061e-06, + "loss": 0.4577, + "step": 14010 + }, + { + "epoch": 71.53061224489795, + "grad_norm": 0.9202457070350647, + "learning_rate": 5.6938775510204095e-06, + "loss": 0.318, + "step": 14020 + }, + { + "epoch": 71.58163265306122, + "grad_norm": 0.4576379358768463, + "learning_rate": 5.683673469387756e-06, + "loss": 0.1531, + "step": 14030 + }, + { + "epoch": 71.63265306122449, + "grad_norm": 17.588665008544922, + "learning_rate": 5.673469387755103e-06, + "loss": 0.1395, + "step": 14040 + }, + { + "epoch": 71.68367346938776, + "grad_norm": 14.638072967529297, + "learning_rate": 5.663265306122449e-06, + "loss": 0.2767, + "step": 14050 + }, + { + "epoch": 71.73469387755102, + "grad_norm": 2.2523481845855713, + "learning_rate": 5.653061224489796e-06, + "loss": 0.2819, + "step": 14060 + }, + { + "epoch": 71.78571428571429, + "grad_norm": 10.696439743041992, + "learning_rate": 5.6428571428571435e-06, + "loss": 0.1996, + "step": 14070 + }, + { + "epoch": 71.83673469387755, + "grad_norm": 13.24187183380127, + "learning_rate": 5.63265306122449e-06, + "loss": 0.2872, + "step": 14080 + }, + { + "epoch": 71.88775510204081, + "grad_norm": 4.308647632598877, + "learning_rate": 5.622448979591837e-06, + "loss": 0.1112, + "step": 14090 + }, + { + "epoch": 71.93877551020408, + "grad_norm": 14.375484466552734, + "learning_rate": 5.6122448979591834e-06, + "loss": 0.1914, + "step": 14100 + }, + { + "epoch": 71.98979591836735, + "grad_norm": 6.740991115570068, + "learning_rate": 5.602040816326531e-06, + "loss": 0.1604, + "step": 14110 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.8231046931407943, + "eval_loss": 0.5633912086486816, + "eval_runtime": 0.9307, + "eval_samples_per_second": 297.639, + "eval_steps_per_second": 37.608, + "step": 14112 + }, + { + "epoch": 72.04081632653062, + "grad_norm": 0.3390078544616699, + "learning_rate": 5.591836734693878e-06, + "loss": 0.3075, + "step": 14120 + }, + { + "epoch": 72.09183673469387, + "grad_norm": 4.501482009887695, + "learning_rate": 5.581632653061225e-06, + "loss": 0.1095, + "step": 14130 + }, + { + "epoch": 72.14285714285714, + "grad_norm": 3.729031562805176, + "learning_rate": 5.571428571428572e-06, + "loss": 0.0767, + "step": 14140 + }, + { + "epoch": 72.1938775510204, + "grad_norm": 0.17877550423145294, + "learning_rate": 5.561224489795919e-06, + "loss": 0.3787, + "step": 14150 + }, + { + "epoch": 72.24489795918367, + "grad_norm": 5.799465179443359, + "learning_rate": 5.551020408163266e-06, + "loss": 0.1648, + "step": 14160 + }, + { + "epoch": 72.29591836734694, + "grad_norm": 16.26845359802246, + "learning_rate": 5.5408163265306125e-06, + "loss": 0.1484, + "step": 14170 + }, + { + "epoch": 72.34693877551021, + "grad_norm": 3.746767997741699, + "learning_rate": 5.530612244897959e-06, + "loss": 0.1447, + "step": 14180 + }, + { + "epoch": 72.39795918367346, + "grad_norm": 1.8594543933868408, + "learning_rate": 5.520408163265306e-06, + "loss": 0.1184, + "step": 14190 + }, + { + "epoch": 72.44897959183673, + "grad_norm": 10.922441482543945, + "learning_rate": 5.510204081632653e-06, + "loss": 0.1443, + "step": 14200 + }, + { + "epoch": 72.5, + "grad_norm": 1.249107003211975, + "learning_rate": 5.500000000000001e-06, + "loss": 0.189, + "step": 14210 + }, + { + "epoch": 72.55102040816327, + "grad_norm": 1.8553344011306763, + "learning_rate": 5.489795918367347e-06, + "loss": 0.2099, + "step": 14220 + }, + { + "epoch": 72.60204081632654, + "grad_norm": 19.02747344970703, + "learning_rate": 5.479591836734695e-06, + "loss": 0.4534, + "step": 14230 + }, + { + "epoch": 72.65306122448979, + "grad_norm": 0.5484607219696045, + "learning_rate": 5.4693877551020415e-06, + "loss": 0.1987, + "step": 14240 + }, + { + "epoch": 72.70408163265306, + "grad_norm": 3.448791265487671, + "learning_rate": 5.459183673469388e-06, + "loss": 0.1921, + "step": 14250 + }, + { + "epoch": 72.75510204081633, + "grad_norm": 0.8846081495285034, + "learning_rate": 5.448979591836735e-06, + "loss": 0.141, + "step": 14260 + }, + { + "epoch": 72.8061224489796, + "grad_norm": 9.476802825927734, + "learning_rate": 5.438775510204081e-06, + "loss": 0.2398, + "step": 14270 + }, + { + "epoch": 72.85714285714286, + "grad_norm": 8.394309997558594, + "learning_rate": 5.428571428571429e-06, + "loss": 0.1556, + "step": 14280 + }, + { + "epoch": 72.90816326530613, + "grad_norm": 2.2207627296447754, + "learning_rate": 5.4183673469387755e-06, + "loss": 0.1218, + "step": 14290 + }, + { + "epoch": 72.95918367346938, + "grad_norm": 5.007809162139893, + "learning_rate": 5.408163265306123e-06, + "loss": 0.1078, + "step": 14300 + }, + { + "epoch": 73.0, + "eval_accuracy": 0.776173285198556, + "eval_loss": 0.620398223400116, + "eval_runtime": 0.9313, + "eval_samples_per_second": 297.423, + "eval_steps_per_second": 37.581, + "step": 14308 + }, + { + "epoch": 73.01020408163265, + "grad_norm": 5.273265838623047, + "learning_rate": 5.3979591836734705e-06, + "loss": 0.1099, + "step": 14310 + }, + { + "epoch": 73.06122448979592, + "grad_norm": 0.09366770088672638, + "learning_rate": 5.387755102040817e-06, + "loss": 0.3264, + "step": 14320 + }, + { + "epoch": 73.11224489795919, + "grad_norm": 12.97093677520752, + "learning_rate": 5.377551020408164e-06, + "loss": 0.2472, + "step": 14330 + }, + { + "epoch": 73.16326530612245, + "grad_norm": 23.67729377746582, + "learning_rate": 5.36734693877551e-06, + "loss": 0.3731, + "step": 14340 + }, + { + "epoch": 73.21428571428571, + "grad_norm": 3.248290777206421, + "learning_rate": 5.357142857142857e-06, + "loss": 0.1723, + "step": 14350 + }, + { + "epoch": 73.26530612244898, + "grad_norm": 0.6866236925125122, + "learning_rate": 5.3469387755102045e-06, + "loss": 0.2337, + "step": 14360 + }, + { + "epoch": 73.31632653061224, + "grad_norm": 11.86717700958252, + "learning_rate": 5.336734693877551e-06, + "loss": 0.2086, + "step": 14370 + }, + { + "epoch": 73.36734693877551, + "grad_norm": 0.9687001705169678, + "learning_rate": 5.326530612244898e-06, + "loss": 0.1213, + "step": 14380 + }, + { + "epoch": 73.41836734693878, + "grad_norm": 1.566986322402954, + "learning_rate": 5.316326530612246e-06, + "loss": 0.3542, + "step": 14390 + }, + { + "epoch": 73.46938775510205, + "grad_norm": 11.866606712341309, + "learning_rate": 5.306122448979593e-06, + "loss": 0.5427, + "step": 14400 + }, + { + "epoch": 73.5204081632653, + "grad_norm": 3.2856667041778564, + "learning_rate": 5.295918367346939e-06, + "loss": 0.1367, + "step": 14410 + }, + { + "epoch": 73.57142857142857, + "grad_norm": 7.826207160949707, + "learning_rate": 5.285714285714286e-06, + "loss": 0.1293, + "step": 14420 + }, + { + "epoch": 73.62244897959184, + "grad_norm": 7.829553127288818, + "learning_rate": 5.275510204081633e-06, + "loss": 0.2473, + "step": 14430 + }, + { + "epoch": 73.6734693877551, + "grad_norm": 10.244202613830566, + "learning_rate": 5.26530612244898e-06, + "loss": 0.3114, + "step": 14440 + }, + { + "epoch": 73.72448979591837, + "grad_norm": 15.15924072265625, + "learning_rate": 5.255102040816327e-06, + "loss": 0.1595, + "step": 14450 + }, + { + "epoch": 73.77551020408163, + "grad_norm": 10.309656143188477, + "learning_rate": 5.2448979591836735e-06, + "loss": 0.3066, + "step": 14460 + }, + { + "epoch": 73.8265306122449, + "grad_norm": 2.3161091804504395, + "learning_rate": 5.23469387755102e-06, + "loss": 0.1034, + "step": 14470 + }, + { + "epoch": 73.87755102040816, + "grad_norm": 4.930816173553467, + "learning_rate": 5.2244897959183684e-06, + "loss": 0.1057, + "step": 14480 + }, + { + "epoch": 73.92857142857143, + "grad_norm": 3.2863380908966064, + "learning_rate": 5.214285714285715e-06, + "loss": 0.1034, + "step": 14490 + }, + { + "epoch": 73.9795918367347, + "grad_norm": 3.996938705444336, + "learning_rate": 5.204081632653062e-06, + "loss": 0.258, + "step": 14500 + }, + { + "epoch": 74.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.36853671073913574, + "eval_runtime": 0.9322, + "eval_samples_per_second": 297.152, + "eval_steps_per_second": 37.546, + "step": 14504 + }, + { + "epoch": 74.03061224489795, + "grad_norm": 5.564041614532471, + "learning_rate": 5.193877551020408e-06, + "loss": 0.152, + "step": 14510 + }, + { + "epoch": 74.08163265306122, + "grad_norm": 14.177313804626465, + "learning_rate": 5.183673469387756e-06, + "loss": 0.2189, + "step": 14520 + }, + { + "epoch": 74.13265306122449, + "grad_norm": 1.9523788690567017, + "learning_rate": 5.1734693877551025e-06, + "loss": 0.2653, + "step": 14530 + }, + { + "epoch": 74.18367346938776, + "grad_norm": 0.6890932321548462, + "learning_rate": 5.163265306122449e-06, + "loss": 0.1789, + "step": 14540 + }, + { + "epoch": 74.23469387755102, + "grad_norm": 15.715421676635742, + "learning_rate": 5.153061224489796e-06, + "loss": 0.4299, + "step": 14550 + }, + { + "epoch": 74.28571428571429, + "grad_norm": 1.4377721548080444, + "learning_rate": 5.142857142857142e-06, + "loss": 0.1145, + "step": 14560 + }, + { + "epoch": 74.33673469387755, + "grad_norm": 21.627445220947266, + "learning_rate": 5.132653061224491e-06, + "loss": 0.412, + "step": 14570 + }, + { + "epoch": 74.38775510204081, + "grad_norm": 6.41222620010376, + "learning_rate": 5.122448979591837e-06, + "loss": 0.4198, + "step": 14580 + }, + { + "epoch": 74.43877551020408, + "grad_norm": 1.1630773544311523, + "learning_rate": 5.112244897959184e-06, + "loss": 0.1438, + "step": 14590 + }, + { + "epoch": 74.48979591836735, + "grad_norm": 11.959744453430176, + "learning_rate": 5.1020408163265315e-06, + "loss": 0.1354, + "step": 14600 + }, + { + "epoch": 74.54081632653062, + "grad_norm": 12.669620513916016, + "learning_rate": 5.091836734693878e-06, + "loss": 0.2402, + "step": 14610 + }, + { + "epoch": 74.59183673469387, + "grad_norm": 11.747788429260254, + "learning_rate": 5.081632653061225e-06, + "loss": 0.0847, + "step": 14620 + }, + { + "epoch": 74.64285714285714, + "grad_norm": 15.508429527282715, + "learning_rate": 5.071428571428571e-06, + "loss": 0.2417, + "step": 14630 + }, + { + "epoch": 74.6938775510204, + "grad_norm": 1.273260474205017, + "learning_rate": 5.061224489795918e-06, + "loss": 0.2925, + "step": 14640 + }, + { + "epoch": 74.74489795918367, + "grad_norm": 0.26550906896591187, + "learning_rate": 5.0510204081632655e-06, + "loss": 0.2467, + "step": 14650 + }, + { + "epoch": 74.79591836734694, + "grad_norm": 5.7561936378479, + "learning_rate": 5.040816326530613e-06, + "loss": 0.1374, + "step": 14660 + }, + { + "epoch": 74.84693877551021, + "grad_norm": 10.998912811279297, + "learning_rate": 5.03061224489796e-06, + "loss": 0.1987, + "step": 14670 + }, + { + "epoch": 74.89795918367346, + "grad_norm": 2.961043357849121, + "learning_rate": 5.020408163265307e-06, + "loss": 0.1702, + "step": 14680 + }, + { + "epoch": 74.94897959183673, + "grad_norm": 11.977360725402832, + "learning_rate": 5.010204081632654e-06, + "loss": 0.2016, + "step": 14690 + }, + { + "epoch": 75.0, + "grad_norm": 0.5454691052436829, + "learning_rate": 5e-06, + "loss": 0.1227, + "step": 14700 + }, + { + "epoch": 75.0, + "eval_accuracy": 0.8158844765342961, + "eval_loss": 0.7026161551475525, + "eval_runtime": 0.9329, + "eval_samples_per_second": 296.916, + "eval_steps_per_second": 37.517, + "step": 14700 + }, + { + "epoch": 75.05102040816327, + "grad_norm": 6.788924217224121, + "learning_rate": 4.989795918367347e-06, + "loss": 0.1771, + "step": 14710 + }, + { + "epoch": 75.10204081632654, + "grad_norm": 0.8358808159828186, + "learning_rate": 4.979591836734694e-06, + "loss": 0.4148, + "step": 14720 + }, + { + "epoch": 75.15306122448979, + "grad_norm": 2.5965681076049805, + "learning_rate": 4.969387755102041e-06, + "loss": 0.1705, + "step": 14730 + }, + { + "epoch": 75.20408163265306, + "grad_norm": 2.074181079864502, + "learning_rate": 4.959183673469388e-06, + "loss": 0.2377, + "step": 14740 + }, + { + "epoch": 75.25510204081633, + "grad_norm": 8.877419471740723, + "learning_rate": 4.948979591836735e-06, + "loss": 0.2705, + "step": 14750 + }, + { + "epoch": 75.3061224489796, + "grad_norm": 2.2369658946990967, + "learning_rate": 4.938775510204082e-06, + "loss": 0.1848, + "step": 14760 + }, + { + "epoch": 75.35714285714286, + "grad_norm": 1.3437224626541138, + "learning_rate": 4.928571428571429e-06, + "loss": 0.309, + "step": 14770 + }, + { + "epoch": 75.40816326530613, + "grad_norm": 12.179243087768555, + "learning_rate": 4.918367346938776e-06, + "loss": 0.3758, + "step": 14780 + }, + { + "epoch": 75.45918367346938, + "grad_norm": 0.31163737177848816, + "learning_rate": 4.908163265306123e-06, + "loss": 0.0913, + "step": 14790 + }, + { + "epoch": 75.51020408163265, + "grad_norm": 6.074469089508057, + "learning_rate": 4.897959183673469e-06, + "loss": 0.1946, + "step": 14800 + }, + { + "epoch": 75.56122448979592, + "grad_norm": 0.20314039289951324, + "learning_rate": 4.887755102040817e-06, + "loss": 0.3418, + "step": 14810 + }, + { + "epoch": 75.61224489795919, + "grad_norm": 1.0009397268295288, + "learning_rate": 4.8775510204081635e-06, + "loss": 0.1232, + "step": 14820 + }, + { + "epoch": 75.66326530612245, + "grad_norm": 4.9547505378723145, + "learning_rate": 4.867346938775511e-06, + "loss": 0.179, + "step": 14830 + }, + { + "epoch": 75.71428571428571, + "grad_norm": 14.29842472076416, + "learning_rate": 4.857142857142858e-06, + "loss": 0.2963, + "step": 14840 + }, + { + "epoch": 75.76530612244898, + "grad_norm": 0.6081609129905701, + "learning_rate": 4.846938775510204e-06, + "loss": 0.3408, + "step": 14850 + }, + { + "epoch": 75.81632653061224, + "grad_norm": 16.967266082763672, + "learning_rate": 4.836734693877552e-06, + "loss": 0.2528, + "step": 14860 + }, + { + "epoch": 75.86734693877551, + "grad_norm": 4.57008171081543, + "learning_rate": 4.826530612244898e-06, + "loss": 0.3173, + "step": 14870 + }, + { + "epoch": 75.91836734693878, + "grad_norm": 10.530519485473633, + "learning_rate": 4.816326530612245e-06, + "loss": 0.1651, + "step": 14880 + }, + { + "epoch": 75.96938775510205, + "grad_norm": 15.925687789916992, + "learning_rate": 4.8061224489795925e-06, + "loss": 0.2257, + "step": 14890 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.9169675090252708, + "eval_loss": 0.40481600165367126, + "eval_runtime": 0.942, + "eval_samples_per_second": 294.046, + "eval_steps_per_second": 37.154, + "step": 14896 + }, + { + "epoch": 76.0204081632653, + "grad_norm": 5.312804222106934, + "learning_rate": 4.795918367346939e-06, + "loss": 0.4119, + "step": 14900 + }, + { + "epoch": 76.07142857142857, + "grad_norm": 2.115060567855835, + "learning_rate": 4.785714285714287e-06, + "loss": 0.1965, + "step": 14910 + }, + { + "epoch": 76.12244897959184, + "grad_norm": 2.3201823234558105, + "learning_rate": 4.775510204081633e-06, + "loss": 0.1222, + "step": 14920 + }, + { + "epoch": 76.1734693877551, + "grad_norm": 9.353812217712402, + "learning_rate": 4.76530612244898e-06, + "loss": 0.1391, + "step": 14930 + }, + { + "epoch": 76.22448979591837, + "grad_norm": 11.522284507751465, + "learning_rate": 4.7551020408163265e-06, + "loss": 0.2193, + "step": 14940 + }, + { + "epoch": 76.27551020408163, + "grad_norm": 9.714153289794922, + "learning_rate": 4.744897959183674e-06, + "loss": 0.3002, + "step": 14950 + }, + { + "epoch": 76.3265306122449, + "grad_norm": 1.5741373300552368, + "learning_rate": 4.734693877551021e-06, + "loss": 0.0989, + "step": 14960 + }, + { + "epoch": 76.37755102040816, + "grad_norm": 12.37441635131836, + "learning_rate": 4.724489795918368e-06, + "loss": 0.2207, + "step": 14970 + }, + { + "epoch": 76.42857142857143, + "grad_norm": 1.2197359800338745, + "learning_rate": 4.714285714285715e-06, + "loss": 0.3594, + "step": 14980 + }, + { + "epoch": 76.4795918367347, + "grad_norm": 1.7340853214263916, + "learning_rate": 4.704081632653061e-06, + "loss": 0.1337, + "step": 14990 + }, + { + "epoch": 76.53061224489795, + "grad_norm": 8.35191822052002, + "learning_rate": 4.693877551020409e-06, + "loss": 0.1877, + "step": 15000 + }, + { + "epoch": 76.58163265306122, + "grad_norm": 9.665618896484375, + "learning_rate": 4.6836734693877555e-06, + "loss": 0.2073, + "step": 15010 + }, + { + "epoch": 76.63265306122449, + "grad_norm": 0.48984846472740173, + "learning_rate": 4.673469387755102e-06, + "loss": 0.3014, + "step": 15020 + }, + { + "epoch": 76.68367346938776, + "grad_norm": 2.9640002250671387, + "learning_rate": 4.663265306122449e-06, + "loss": 0.215, + "step": 15030 + }, + { + "epoch": 76.73469387755102, + "grad_norm": 3.568899154663086, + "learning_rate": 4.653061224489796e-06, + "loss": 0.1687, + "step": 15040 + }, + { + "epoch": 76.78571428571429, + "grad_norm": 3.5330944061279297, + "learning_rate": 4.642857142857144e-06, + "loss": 0.2252, + "step": 15050 + }, + { + "epoch": 76.83673469387755, + "grad_norm": 7.781928062438965, + "learning_rate": 4.63265306122449e-06, + "loss": 0.212, + "step": 15060 + }, + { + "epoch": 76.88775510204081, + "grad_norm": 3.0376219749450684, + "learning_rate": 4.622448979591837e-06, + "loss": 0.141, + "step": 15070 + }, + { + "epoch": 76.93877551020408, + "grad_norm": 1.0159159898757935, + "learning_rate": 4.612244897959184e-06, + "loss": 0.2339, + "step": 15080 + }, + { + "epoch": 76.98979591836735, + "grad_norm": 3.5187315940856934, + "learning_rate": 4.602040816326531e-06, + "loss": 0.1786, + "step": 15090 + }, + { + "epoch": 77.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.48912858963012695, + "eval_runtime": 1.0003, + "eval_samples_per_second": 276.912, + "eval_steps_per_second": 34.989, + "step": 15092 + }, + { + "epoch": 77.04081632653062, + "grad_norm": 0.9008731842041016, + "learning_rate": 4.591836734693878e-06, + "loss": 0.2919, + "step": 15100 + }, + { + "epoch": 77.09183673469387, + "grad_norm": 7.379002094268799, + "learning_rate": 4.5816326530612245e-06, + "loss": 0.2478, + "step": 15110 + }, + { + "epoch": 77.14285714285714, + "grad_norm": 1.654058575630188, + "learning_rate": 4.571428571428572e-06, + "loss": 0.1717, + "step": 15120 + }, + { + "epoch": 77.1938775510204, + "grad_norm": 10.02890396118164, + "learning_rate": 4.561224489795919e-06, + "loss": 0.1046, + "step": 15130 + }, + { + "epoch": 77.24489795918367, + "grad_norm": 1.1923608779907227, + "learning_rate": 4.551020408163266e-06, + "loss": 0.1507, + "step": 15140 + }, + { + "epoch": 77.29591836734694, + "grad_norm": 2.4354844093322754, + "learning_rate": 4.540816326530613e-06, + "loss": 0.1582, + "step": 15150 + }, + { + "epoch": 77.34693877551021, + "grad_norm": 7.890419960021973, + "learning_rate": 4.530612244897959e-06, + "loss": 0.2618, + "step": 15160 + }, + { + "epoch": 77.39795918367346, + "grad_norm": 5.627359867095947, + "learning_rate": 4.520408163265306e-06, + "loss": 0.1926, + "step": 15170 + }, + { + "epoch": 77.44897959183673, + "grad_norm": 3.1545939445495605, + "learning_rate": 4.5102040816326535e-06, + "loss": 0.2936, + "step": 15180 + }, + { + "epoch": 77.5, + "grad_norm": 10.855856895446777, + "learning_rate": 4.5e-06, + "loss": 0.6137, + "step": 15190 + }, + { + "epoch": 77.55102040816327, + "grad_norm": 7.537050247192383, + "learning_rate": 4.489795918367348e-06, + "loss": 0.2718, + "step": 15200 + }, + { + "epoch": 77.60204081632654, + "grad_norm": 10.528838157653809, + "learning_rate": 4.479591836734694e-06, + "loss": 0.2494, + "step": 15210 + }, + { + "epoch": 77.65306122448979, + "grad_norm": 0.04168795049190521, + "learning_rate": 4.469387755102041e-06, + "loss": 0.3021, + "step": 15220 + }, + { + "epoch": 77.70408163265306, + "grad_norm": 6.072834014892578, + "learning_rate": 4.459183673469388e-06, + "loss": 0.1545, + "step": 15230 + }, + { + "epoch": 77.75510204081633, + "grad_norm": 1.0188490152359009, + "learning_rate": 4.448979591836735e-06, + "loss": 0.154, + "step": 15240 + }, + { + "epoch": 77.8061224489796, + "grad_norm": 2.8648037910461426, + "learning_rate": 4.438775510204082e-06, + "loss": 0.1568, + "step": 15250 + }, + { + "epoch": 77.85714285714286, + "grad_norm": 2.9121363162994385, + "learning_rate": 4.428571428571429e-06, + "loss": 0.1795, + "step": 15260 + }, + { + "epoch": 77.90816326530613, + "grad_norm": 7.241103172302246, + "learning_rate": 4.418367346938776e-06, + "loss": 0.1361, + "step": 15270 + }, + { + "epoch": 77.95918367346938, + "grad_norm": 18.421598434448242, + "learning_rate": 4.408163265306123e-06, + "loss": 0.2006, + "step": 15280 + }, + { + "epoch": 78.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.42159637808799744, + "eval_runtime": 0.9323, + "eval_samples_per_second": 297.124, + "eval_steps_per_second": 37.543, + "step": 15288 + }, + { + "epoch": 78.01020408163265, + "grad_norm": 0.5753517150878906, + "learning_rate": 4.39795918367347e-06, + "loss": 0.2196, + "step": 15290 + }, + { + "epoch": 78.06122448979592, + "grad_norm": 6.517210960388184, + "learning_rate": 4.3877551020408165e-06, + "loss": 0.1645, + "step": 15300 + }, + { + "epoch": 78.11224489795919, + "grad_norm": 1.0163953304290771, + "learning_rate": 4.377551020408163e-06, + "loss": 0.1594, + "step": 15310 + }, + { + "epoch": 78.16326530612245, + "grad_norm": 0.28174296021461487, + "learning_rate": 4.367346938775511e-06, + "loss": 0.3152, + "step": 15320 + }, + { + "epoch": 78.21428571428571, + "grad_norm": 8.987310409545898, + "learning_rate": 4.357142857142857e-06, + "loss": 0.1839, + "step": 15330 + }, + { + "epoch": 78.26530612244898, + "grad_norm": 3.5911307334899902, + "learning_rate": 4.346938775510205e-06, + "loss": 0.2263, + "step": 15340 + }, + { + "epoch": 78.31632653061224, + "grad_norm": 5.655363082885742, + "learning_rate": 4.336734693877551e-06, + "loss": 0.1143, + "step": 15350 + }, + { + "epoch": 78.36734693877551, + "grad_norm": 8.902242660522461, + "learning_rate": 4.326530612244899e-06, + "loss": 0.3429, + "step": 15360 + }, + { + "epoch": 78.41836734693878, + "grad_norm": 0.3062533140182495, + "learning_rate": 4.3163265306122455e-06, + "loss": 0.1505, + "step": 15370 + }, + { + "epoch": 78.46938775510205, + "grad_norm": 2.60772442817688, + "learning_rate": 4.306122448979592e-06, + "loss": 0.1027, + "step": 15380 + }, + { + "epoch": 78.5204081632653, + "grad_norm": 5.816764831542969, + "learning_rate": 4.295918367346939e-06, + "loss": 0.276, + "step": 15390 + }, + { + "epoch": 78.57142857142857, + "grad_norm": 11.878080368041992, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.3061, + "step": 15400 + }, + { + "epoch": 78.62244897959184, + "grad_norm": 2.4790613651275635, + "learning_rate": 4.275510204081633e-06, + "loss": 0.1698, + "step": 15410 + }, + { + "epoch": 78.6734693877551, + "grad_norm": 2.452232837677002, + "learning_rate": 4.2653061224489804e-06, + "loss": 0.0984, + "step": 15420 + }, + { + "epoch": 78.72448979591837, + "grad_norm": 3.86421799659729, + "learning_rate": 4.255102040816327e-06, + "loss": 0.2261, + "step": 15430 + }, + { + "epoch": 78.77551020408163, + "grad_norm": 6.461427688598633, + "learning_rate": 4.244897959183674e-06, + "loss": 0.1279, + "step": 15440 + }, + { + "epoch": 78.8265306122449, + "grad_norm": 0.5817276835441589, + "learning_rate": 4.234693877551021e-06, + "loss": 0.238, + "step": 15450 + }, + { + "epoch": 78.87755102040816, + "grad_norm": 1.191446304321289, + "learning_rate": 4.224489795918368e-06, + "loss": 0.1597, + "step": 15460 + }, + { + "epoch": 78.92857142857143, + "grad_norm": 0.9815487265586853, + "learning_rate": 4.2142857142857145e-06, + "loss": 0.1815, + "step": 15470 + }, + { + "epoch": 78.9795918367347, + "grad_norm": 13.951576232910156, + "learning_rate": 4.204081632653061e-06, + "loss": 0.3144, + "step": 15480 + }, + { + "epoch": 79.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.2720978558063507, + "eval_runtime": 1.0029, + "eval_samples_per_second": 276.21, + "eval_steps_per_second": 34.9, + "step": 15484 + }, + { + "epoch": 79.03061224489795, + "grad_norm": 0.6935974359512329, + "learning_rate": 4.193877551020409e-06, + "loss": 0.1547, + "step": 15490 + }, + { + "epoch": 79.08163265306122, + "grad_norm": 4.269100666046143, + "learning_rate": 4.183673469387755e-06, + "loss": 0.2228, + "step": 15500 + }, + { + "epoch": 79.13265306122449, + "grad_norm": 16.676679611206055, + "learning_rate": 4.173469387755103e-06, + "loss": 0.2051, + "step": 15510 + }, + { + "epoch": 79.18367346938776, + "grad_norm": 1.743389368057251, + "learning_rate": 4.163265306122449e-06, + "loss": 0.0857, + "step": 15520 + }, + { + "epoch": 79.23469387755102, + "grad_norm": 5.236385822296143, + "learning_rate": 4.153061224489796e-06, + "loss": 0.1267, + "step": 15530 + }, + { + "epoch": 79.28571428571429, + "grad_norm": 6.302170753479004, + "learning_rate": 4.1428571428571435e-06, + "loss": 0.1847, + "step": 15540 + }, + { + "epoch": 79.33673469387755, + "grad_norm": 12.430984497070312, + "learning_rate": 4.13265306122449e-06, + "loss": 0.2319, + "step": 15550 + }, + { + "epoch": 79.38775510204081, + "grad_norm": 0.6306114792823792, + "learning_rate": 4.122448979591837e-06, + "loss": 0.2798, + "step": 15560 + }, + { + "epoch": 79.43877551020408, + "grad_norm": 24.38300323486328, + "learning_rate": 4.112244897959184e-06, + "loss": 0.2502, + "step": 15570 + }, + { + "epoch": 79.48979591836735, + "grad_norm": 8.707317352294922, + "learning_rate": 4.102040816326531e-06, + "loss": 0.2853, + "step": 15580 + }, + { + "epoch": 79.54081632653062, + "grad_norm": 12.659032821655273, + "learning_rate": 4.091836734693878e-06, + "loss": 0.1358, + "step": 15590 + }, + { + "epoch": 79.59183673469387, + "grad_norm": 13.423051834106445, + "learning_rate": 4.081632653061225e-06, + "loss": 0.121, + "step": 15600 + }, + { + "epoch": 79.64285714285714, + "grad_norm": 2.983762502670288, + "learning_rate": 4.071428571428572e-06, + "loss": 0.1123, + "step": 15610 + }, + { + "epoch": 79.6938775510204, + "grad_norm": 3.414855480194092, + "learning_rate": 4.061224489795918e-06, + "loss": 0.1874, + "step": 15620 + }, + { + "epoch": 79.74489795918367, + "grad_norm": 13.161895751953125, + "learning_rate": 4.051020408163266e-06, + "loss": 0.1783, + "step": 15630 + }, + { + "epoch": 79.79591836734694, + "grad_norm": 7.434272766113281, + "learning_rate": 4.040816326530612e-06, + "loss": 0.2543, + "step": 15640 + }, + { + "epoch": 79.84693877551021, + "grad_norm": 16.419912338256836, + "learning_rate": 4.03061224489796e-06, + "loss": 0.3525, + "step": 15650 + }, + { + "epoch": 79.89795918367346, + "grad_norm": 3.6615853309631348, + "learning_rate": 4.0204081632653065e-06, + "loss": 0.1637, + "step": 15660 + }, + { + "epoch": 79.94897959183673, + "grad_norm": 2.119905948638916, + "learning_rate": 4.010204081632653e-06, + "loss": 0.3019, + "step": 15670 + }, + { + "epoch": 80.0, + "grad_norm": 0.8194112181663513, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1969, + "step": 15680 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.8483754512635379, + "eval_loss": 0.42701977491378784, + "eval_runtime": 0.9397, + "eval_samples_per_second": 294.767, + "eval_steps_per_second": 37.245, + "step": 15680 + }, + { + "epoch": 80.05102040816327, + "grad_norm": 9.35610580444336, + "learning_rate": 3.989795918367347e-06, + "loss": 0.19, + "step": 15690 + }, + { + "epoch": 80.10204081632654, + "grad_norm": 8.794118881225586, + "learning_rate": 3.979591836734694e-06, + "loss": 0.2407, + "step": 15700 + }, + { + "epoch": 80.15306122448979, + "grad_norm": 0.1584535837173462, + "learning_rate": 3.969387755102041e-06, + "loss": 0.0312, + "step": 15710 + }, + { + "epoch": 80.20408163265306, + "grad_norm": 4.157765865325928, + "learning_rate": 3.959183673469388e-06, + "loss": 0.3281, + "step": 15720 + }, + { + "epoch": 80.25510204081633, + "grad_norm": 11.907217979431152, + "learning_rate": 3.9489795918367356e-06, + "loss": 0.1984, + "step": 15730 + }, + { + "epoch": 80.3061224489796, + "grad_norm": 2.841599702835083, + "learning_rate": 3.938775510204082e-06, + "loss": 0.1251, + "step": 15740 + }, + { + "epoch": 80.35714285714286, + "grad_norm": 5.652760982513428, + "learning_rate": 3.928571428571429e-06, + "loss": 0.2669, + "step": 15750 + }, + { + "epoch": 80.40816326530613, + "grad_norm": 3.341134548187256, + "learning_rate": 3.9183673469387755e-06, + "loss": 0.1515, + "step": 15760 + }, + { + "epoch": 80.45918367346938, + "grad_norm": 0.34180694818496704, + "learning_rate": 3.908163265306123e-06, + "loss": 0.2429, + "step": 15770 + }, + { + "epoch": 80.51020408163265, + "grad_norm": 1.8589460849761963, + "learning_rate": 3.89795918367347e-06, + "loss": 0.1411, + "step": 15780 + }, + { + "epoch": 80.56122448979592, + "grad_norm": 1.2255891561508179, + "learning_rate": 3.887755102040816e-06, + "loss": 0.2386, + "step": 15790 + }, + { + "epoch": 80.61224489795919, + "grad_norm": 0.7362205386161804, + "learning_rate": 3.877551020408164e-06, + "loss": 0.0887, + "step": 15800 + }, + { + "epoch": 80.66326530612245, + "grad_norm": 11.652864456176758, + "learning_rate": 3.86734693877551e-06, + "loss": 0.2231, + "step": 15810 + }, + { + "epoch": 80.71428571428571, + "grad_norm": 1.9581044912338257, + "learning_rate": 3.857142857142858e-06, + "loss": 0.1942, + "step": 15820 + }, + { + "epoch": 80.76530612244898, + "grad_norm": 1.3246722221374512, + "learning_rate": 3.8469387755102045e-06, + "loss": 0.2956, + "step": 15830 + }, + { + "epoch": 80.81632653061224, + "grad_norm": 3.7175517082214355, + "learning_rate": 3.836734693877551e-06, + "loss": 0.1526, + "step": 15840 + }, + { + "epoch": 80.86734693877551, + "grad_norm": 1.4184467792510986, + "learning_rate": 3.826530612244898e-06, + "loss": 0.0829, + "step": 15850 + }, + { + "epoch": 80.91836734693878, + "grad_norm": 5.230016231536865, + "learning_rate": 3.816326530612245e-06, + "loss": 0.1705, + "step": 15860 + }, + { + "epoch": 80.96938775510205, + "grad_norm": 0.31383898854255676, + "learning_rate": 3.8061224489795923e-06, + "loss": 0.1405, + "step": 15870 + }, + { + "epoch": 81.0, + "eval_accuracy": 0.7833935018050542, + "eval_loss": 0.763225793838501, + "eval_runtime": 0.927, + "eval_samples_per_second": 298.824, + "eval_steps_per_second": 37.758, + "step": 15876 + }, + { + "epoch": 81.0204081632653, + "grad_norm": 5.370396614074707, + "learning_rate": 3.795918367346939e-06, + "loss": 0.1886, + "step": 15880 + }, + { + "epoch": 81.07142857142857, + "grad_norm": 0.8666300773620605, + "learning_rate": 3.785714285714286e-06, + "loss": 0.3796, + "step": 15890 + }, + { + "epoch": 81.12244897959184, + "grad_norm": 15.143610000610352, + "learning_rate": 3.7755102040816327e-06, + "loss": 0.2811, + "step": 15900 + }, + { + "epoch": 81.1734693877551, + "grad_norm": 0.3934500515460968, + "learning_rate": 3.76530612244898e-06, + "loss": 0.1487, + "step": 15910 + }, + { + "epoch": 81.22448979591837, + "grad_norm": 1.4975956678390503, + "learning_rate": 3.7551020408163268e-06, + "loss": 0.1721, + "step": 15920 + }, + { + "epoch": 81.27551020408163, + "grad_norm": 2.16916823387146, + "learning_rate": 3.744897959183674e-06, + "loss": 0.182, + "step": 15930 + }, + { + "epoch": 81.3265306122449, + "grad_norm": 8.05843448638916, + "learning_rate": 3.7346938775510205e-06, + "loss": 0.3428, + "step": 15940 + }, + { + "epoch": 81.37755102040816, + "grad_norm": 0.2246151566505432, + "learning_rate": 3.724489795918368e-06, + "loss": 0.1239, + "step": 15950 + }, + { + "epoch": 81.42857142857143, + "grad_norm": 8.20610237121582, + "learning_rate": 3.7142857142857146e-06, + "loss": 0.1717, + "step": 15960 + }, + { + "epoch": 81.4795918367347, + "grad_norm": 8.254730224609375, + "learning_rate": 3.7040816326530617e-06, + "loss": 0.3008, + "step": 15970 + }, + { + "epoch": 81.53061224489795, + "grad_norm": 0.9908814430236816, + "learning_rate": 3.6938775510204083e-06, + "loss": 0.0522, + "step": 15980 + }, + { + "epoch": 81.58163265306122, + "grad_norm": 3.838620901107788, + "learning_rate": 3.6836734693877554e-06, + "loss": 0.2645, + "step": 15990 + }, + { + "epoch": 81.63265306122449, + "grad_norm": 0.31536588072776794, + "learning_rate": 3.6734693877551024e-06, + "loss": 0.3646, + "step": 16000 + }, + { + "epoch": 81.68367346938776, + "grad_norm": 0.3082359731197357, + "learning_rate": 3.6632653061224495e-06, + "loss": 0.1775, + "step": 16010 + }, + { + "epoch": 81.73469387755102, + "grad_norm": 12.527641296386719, + "learning_rate": 3.653061224489796e-06, + "loss": 0.1322, + "step": 16020 + }, + { + "epoch": 81.78571428571429, + "grad_norm": 0.1477854698896408, + "learning_rate": 3.642857142857143e-06, + "loss": 0.112, + "step": 16030 + }, + { + "epoch": 81.83673469387755, + "grad_norm": 1.7172036170959473, + "learning_rate": 3.6326530612244903e-06, + "loss": 0.2226, + "step": 16040 + }, + { + "epoch": 81.88775510204081, + "grad_norm": 1.1714388132095337, + "learning_rate": 3.6224489795918373e-06, + "loss": 0.1911, + "step": 16050 + }, + { + "epoch": 81.93877551020408, + "grad_norm": 7.487484931945801, + "learning_rate": 3.612244897959184e-06, + "loss": 0.1288, + "step": 16060 + }, + { + "epoch": 81.98979591836735, + "grad_norm": 3.3677072525024414, + "learning_rate": 3.6020408163265306e-06, + "loss": 0.1427, + "step": 16070 + }, + { + "epoch": 82.0, + "eval_accuracy": 0.9025270758122743, + "eval_loss": 0.3249270021915436, + "eval_runtime": 0.9346, + "eval_samples_per_second": 296.382, + "eval_steps_per_second": 37.449, + "step": 16072 + }, + { + "epoch": 82.04081632653062, + "grad_norm": 2.0028810501098633, + "learning_rate": 3.5918367346938777e-06, + "loss": 0.1303, + "step": 16080 + }, + { + "epoch": 82.09183673469387, + "grad_norm": 1.0715469121932983, + "learning_rate": 3.581632653061225e-06, + "loss": 0.232, + "step": 16090 + }, + { + "epoch": 82.14285714285714, + "grad_norm": 6.861850261688232, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.1009, + "step": 16100 + }, + { + "epoch": 82.1938775510204, + "grad_norm": 0.32505741715431213, + "learning_rate": 3.5612244897959184e-06, + "loss": 0.1015, + "step": 16110 + }, + { + "epoch": 82.24489795918367, + "grad_norm": 11.258378028869629, + "learning_rate": 3.5510204081632655e-06, + "loss": 0.2765, + "step": 16120 + }, + { + "epoch": 82.29591836734694, + "grad_norm": 0.8208149671554565, + "learning_rate": 3.540816326530613e-06, + "loss": 0.1769, + "step": 16130 + }, + { + "epoch": 82.34693877551021, + "grad_norm": 14.091160774230957, + "learning_rate": 3.5306122448979596e-06, + "loss": 0.1296, + "step": 16140 + }, + { + "epoch": 82.39795918367346, + "grad_norm": 7.795243740081787, + "learning_rate": 3.5204081632653062e-06, + "loss": 0.4265, + "step": 16150 + }, + { + "epoch": 82.44897959183673, + "grad_norm": 19.263708114624023, + "learning_rate": 3.5102040816326533e-06, + "loss": 0.2897, + "step": 16160 + }, + { + "epoch": 82.5, + "grad_norm": 5.552840709686279, + "learning_rate": 3.5e-06, + "loss": 0.148, + "step": 16170 + }, + { + "epoch": 82.55102040816327, + "grad_norm": 16.170299530029297, + "learning_rate": 3.4897959183673474e-06, + "loss": 0.3484, + "step": 16180 + }, + { + "epoch": 82.60204081632654, + "grad_norm": 4.0577263832092285, + "learning_rate": 3.479591836734694e-06, + "loss": 0.2218, + "step": 16190 + }, + { + "epoch": 82.65306122448979, + "grad_norm": 5.736649990081787, + "learning_rate": 3.469387755102041e-06, + "loss": 0.1525, + "step": 16200 + }, + { + "epoch": 82.70408163265306, + "grad_norm": 21.887012481689453, + "learning_rate": 3.4591836734693878e-06, + "loss": 0.276, + "step": 16210 + }, + { + "epoch": 82.75510204081633, + "grad_norm": 18.0561580657959, + "learning_rate": 3.4489795918367353e-06, + "loss": 0.2404, + "step": 16220 + }, + { + "epoch": 82.8061224489796, + "grad_norm": 2.2050490379333496, + "learning_rate": 3.438775510204082e-06, + "loss": 0.1132, + "step": 16230 + }, + { + "epoch": 82.85714285714286, + "grad_norm": 1.9476908445358276, + "learning_rate": 3.428571428571429e-06, + "loss": 0.1607, + "step": 16240 + }, + { + "epoch": 82.90816326530613, + "grad_norm": 0.8884503841400146, + "learning_rate": 3.4183673469387756e-06, + "loss": 0.1731, + "step": 16250 + }, + { + "epoch": 82.95918367346938, + "grad_norm": 1.7185746431350708, + "learning_rate": 3.4081632653061227e-06, + "loss": 0.2493, + "step": 16260 + }, + { + "epoch": 83.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.38384127616882324, + "eval_runtime": 0.9321, + "eval_samples_per_second": 297.167, + "eval_steps_per_second": 37.548, + "step": 16268 + }, + { + "epoch": 83.01020408163265, + "grad_norm": 7.763315200805664, + "learning_rate": 3.3979591836734697e-06, + "loss": 0.2217, + "step": 16270 + }, + { + "epoch": 83.06122448979592, + "grad_norm": 1.9195549488067627, + "learning_rate": 3.3877551020408168e-06, + "loss": 0.137, + "step": 16280 + }, + { + "epoch": 83.11224489795919, + "grad_norm": 1.0584914684295654, + "learning_rate": 3.3775510204081634e-06, + "loss": 0.3192, + "step": 16290 + }, + { + "epoch": 83.16326530612245, + "grad_norm": 0.7407238483428955, + "learning_rate": 3.3673469387755105e-06, + "loss": 0.2872, + "step": 16300 + }, + { + "epoch": 83.21428571428571, + "grad_norm": 23.612741470336914, + "learning_rate": 3.357142857142857e-06, + "loss": 0.2775, + "step": 16310 + }, + { + "epoch": 83.26530612244898, + "grad_norm": 0.3100240230560303, + "learning_rate": 3.3469387755102046e-06, + "loss": 0.1608, + "step": 16320 + }, + { + "epoch": 83.31632653061224, + "grad_norm": 5.569301128387451, + "learning_rate": 3.3367346938775513e-06, + "loss": 0.2444, + "step": 16330 + }, + { + "epoch": 83.36734693877551, + "grad_norm": 14.591517448425293, + "learning_rate": 3.3265306122448983e-06, + "loss": 0.29, + "step": 16340 + }, + { + "epoch": 83.41836734693878, + "grad_norm": 7.931599140167236, + "learning_rate": 3.316326530612245e-06, + "loss": 0.277, + "step": 16350 + }, + { + "epoch": 83.46938775510205, + "grad_norm": 15.914363861083984, + "learning_rate": 3.3061224489795924e-06, + "loss": 0.1571, + "step": 16360 + }, + { + "epoch": 83.5204081632653, + "grad_norm": 13.947531700134277, + "learning_rate": 3.295918367346939e-06, + "loss": 0.3652, + "step": 16370 + }, + { + "epoch": 83.57142857142857, + "grad_norm": 3.993175506591797, + "learning_rate": 3.285714285714286e-06, + "loss": 0.4236, + "step": 16380 + }, + { + "epoch": 83.62244897959184, + "grad_norm": 9.41831111907959, + "learning_rate": 3.2755102040816328e-06, + "loss": 0.4799, + "step": 16390 + }, + { + "epoch": 83.6734693877551, + "grad_norm": 4.541845321655273, + "learning_rate": 3.2653061224489794e-06, + "loss": 0.2001, + "step": 16400 + }, + { + "epoch": 83.72448979591837, + "grad_norm": 17.684083938598633, + "learning_rate": 3.255102040816327e-06, + "loss": 0.1741, + "step": 16410 + }, + { + "epoch": 83.77551020408163, + "grad_norm": 0.664922297000885, + "learning_rate": 3.244897959183674e-06, + "loss": 0.1669, + "step": 16420 + }, + { + "epoch": 83.8265306122449, + "grad_norm": 4.690441608428955, + "learning_rate": 3.2346938775510206e-06, + "loss": 0.2087, + "step": 16430 + }, + { + "epoch": 83.87755102040816, + "grad_norm": 5.026739120483398, + "learning_rate": 3.2244897959183672e-06, + "loss": 0.1029, + "step": 16440 + }, + { + "epoch": 83.92857142857143, + "grad_norm": 8.364355087280273, + "learning_rate": 3.2142857142857147e-06, + "loss": 0.1695, + "step": 16450 + }, + { + "epoch": 83.9795918367347, + "grad_norm": 1.0005097389221191, + "learning_rate": 3.204081632653062e-06, + "loss": 0.331, + "step": 16460 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.9205776173285198, + "eval_loss": 0.3329784572124481, + "eval_runtime": 0.9402, + "eval_samples_per_second": 294.616, + "eval_steps_per_second": 37.226, + "step": 16464 + }, + { + "epoch": 84.03061224489795, + "grad_norm": 9.732176780700684, + "learning_rate": 3.1938775510204084e-06, + "loss": 0.1133, + "step": 16470 + }, + { + "epoch": 84.08163265306122, + "grad_norm": 1.5133066177368164, + "learning_rate": 3.183673469387755e-06, + "loss": 0.1528, + "step": 16480 + }, + { + "epoch": 84.13265306122449, + "grad_norm": 6.870362758636475, + "learning_rate": 3.173469387755102e-06, + "loss": 0.2269, + "step": 16490 + }, + { + "epoch": 84.18367346938776, + "grad_norm": 16.802181243896484, + "learning_rate": 3.1632653061224496e-06, + "loss": 0.3749, + "step": 16500 + }, + { + "epoch": 84.23469387755102, + "grad_norm": 8.427709579467773, + "learning_rate": 3.1530612244897963e-06, + "loss": 0.4603, + "step": 16510 + }, + { + "epoch": 84.28571428571429, + "grad_norm": 5.263187885284424, + "learning_rate": 3.142857142857143e-06, + "loss": 0.1094, + "step": 16520 + }, + { + "epoch": 84.33673469387755, + "grad_norm": 12.858963012695312, + "learning_rate": 3.13265306122449e-06, + "loss": 0.2149, + "step": 16530 + }, + { + "epoch": 84.38775510204081, + "grad_norm": 4.1460490226745605, + "learning_rate": 3.1224489795918374e-06, + "loss": 0.1401, + "step": 16540 + }, + { + "epoch": 84.43877551020408, + "grad_norm": 16.93732261657715, + "learning_rate": 3.112244897959184e-06, + "loss": 0.4208, + "step": 16550 + }, + { + "epoch": 84.48979591836735, + "grad_norm": 0.18154656887054443, + "learning_rate": 3.1020408163265307e-06, + "loss": 0.2127, + "step": 16560 + }, + { + "epoch": 84.54081632653062, + "grad_norm": 1.393512487411499, + "learning_rate": 3.0918367346938778e-06, + "loss": 0.1961, + "step": 16570 + }, + { + "epoch": 84.59183673469387, + "grad_norm": 3.731569290161133, + "learning_rate": 3.0816326530612244e-06, + "loss": 0.3392, + "step": 16580 + }, + { + "epoch": 84.64285714285714, + "grad_norm": 3.3272573947906494, + "learning_rate": 3.071428571428572e-06, + "loss": 0.1673, + "step": 16590 + }, + { + "epoch": 84.6938775510204, + "grad_norm": 12.289058685302734, + "learning_rate": 3.0612244897959185e-06, + "loss": 0.1943, + "step": 16600 + }, + { + "epoch": 84.74489795918367, + "grad_norm": 1.6420972347259521, + "learning_rate": 3.0510204081632656e-06, + "loss": 0.128, + "step": 16610 + }, + { + "epoch": 84.79591836734694, + "grad_norm": 2.594130754470825, + "learning_rate": 3.0408163265306122e-06, + "loss": 0.1445, + "step": 16620 + }, + { + "epoch": 84.84693877551021, + "grad_norm": 0.44964268803596497, + "learning_rate": 3.0306122448979597e-06, + "loss": 0.3691, + "step": 16630 + }, + { + "epoch": 84.89795918367346, + "grad_norm": 0.9908690452575684, + "learning_rate": 3.0204081632653064e-06, + "loss": 0.2374, + "step": 16640 + }, + { + "epoch": 84.94897959183673, + "grad_norm": 0.32848837971687317, + "learning_rate": 3.0102040816326534e-06, + "loss": 0.1338, + "step": 16650 + }, + { + "epoch": 85.0, + "grad_norm": 1.4169095754623413, + "learning_rate": 3e-06, + "loss": 0.1231, + "step": 16660 + }, + { + "epoch": 85.0, + "eval_accuracy": 0.8700361010830325, + "eval_loss": 0.32456666231155396, + "eval_runtime": 0.9552, + "eval_samples_per_second": 289.984, + "eval_steps_per_second": 36.641, + "step": 16660 + }, + { + "epoch": 85.05102040816327, + "grad_norm": 9.53570556640625, + "learning_rate": 2.989795918367347e-06, + "loss": 0.1539, + "step": 16670 + }, + { + "epoch": 85.10204081632654, + "grad_norm": 6.8302106857299805, + "learning_rate": 2.979591836734694e-06, + "loss": 0.1866, + "step": 16680 + }, + { + "epoch": 85.15306122448979, + "grad_norm": 0.5330386161804199, + "learning_rate": 2.9693877551020413e-06, + "loss": 0.1126, + "step": 16690 + }, + { + "epoch": 85.20408163265306, + "grad_norm": 0.9756630063056946, + "learning_rate": 2.959183673469388e-06, + "loss": 0.2992, + "step": 16700 + }, + { + "epoch": 85.25510204081633, + "grad_norm": 7.213351726531982, + "learning_rate": 2.948979591836735e-06, + "loss": 0.2454, + "step": 16710 + }, + { + "epoch": 85.3061224489796, + "grad_norm": 6.819469928741455, + "learning_rate": 2.938775510204082e-06, + "loss": 0.1805, + "step": 16720 + }, + { + "epoch": 85.35714285714286, + "grad_norm": 6.094568252563477, + "learning_rate": 2.928571428571429e-06, + "loss": 0.248, + "step": 16730 + }, + { + "epoch": 85.40816326530613, + "grad_norm": 1.5939466953277588, + "learning_rate": 2.9183673469387757e-06, + "loss": 0.2055, + "step": 16740 + }, + { + "epoch": 85.45918367346938, + "grad_norm": 1.6548603773117065, + "learning_rate": 2.908163265306123e-06, + "loss": 0.1873, + "step": 16750 + }, + { + "epoch": 85.51020408163265, + "grad_norm": 2.3544037342071533, + "learning_rate": 2.8979591836734694e-06, + "loss": 0.084, + "step": 16760 + }, + { + "epoch": 85.56122448979592, + "grad_norm": 11.432575225830078, + "learning_rate": 2.887755102040817e-06, + "loss": 0.3432, + "step": 16770 + }, + { + "epoch": 85.61224489795919, + "grad_norm": 3.1924562454223633, + "learning_rate": 2.8775510204081636e-06, + "loss": 0.4288, + "step": 16780 + }, + { + "epoch": 85.66326530612245, + "grad_norm": 0.5942087769508362, + "learning_rate": 2.86734693877551e-06, + "loss": 0.2546, + "step": 16790 + }, + { + "epoch": 85.71428571428571, + "grad_norm": 0.3294694423675537, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.2859, + "step": 16800 + }, + { + "epoch": 85.76530612244898, + "grad_norm": 1.8321048021316528, + "learning_rate": 2.8469387755102047e-06, + "loss": 0.3555, + "step": 16810 + }, + { + "epoch": 85.81632653061224, + "grad_norm": 10.187983512878418, + "learning_rate": 2.8367346938775514e-06, + "loss": 0.2943, + "step": 16820 + }, + { + "epoch": 85.86734693877551, + "grad_norm": 10.006871223449707, + "learning_rate": 2.826530612244898e-06, + "loss": 0.0889, + "step": 16830 + }, + { + "epoch": 85.91836734693878, + "grad_norm": 0.727992594242096, + "learning_rate": 2.816326530612245e-06, + "loss": 0.1329, + "step": 16840 + }, + { + "epoch": 85.96938775510205, + "grad_norm": 15.858503341674805, + "learning_rate": 2.8061224489795917e-06, + "loss": 0.2781, + "step": 16850 + }, + { + "epoch": 86.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.3710058927536011, + "eval_runtime": 0.9355, + "eval_samples_per_second": 296.084, + "eval_steps_per_second": 37.411, + "step": 16856 + }, + { + "epoch": 86.0204081632653, + "grad_norm": 8.392712593078613, + "learning_rate": 2.795918367346939e-06, + "loss": 0.1672, + "step": 16860 + }, + { + "epoch": 86.07142857142857, + "grad_norm": 13.223836898803711, + "learning_rate": 2.785714285714286e-06, + "loss": 0.2495, + "step": 16870 + }, + { + "epoch": 86.12244897959184, + "grad_norm": 6.981386661529541, + "learning_rate": 2.775510204081633e-06, + "loss": 0.1335, + "step": 16880 + }, + { + "epoch": 86.1734693877551, + "grad_norm": 16.30123519897461, + "learning_rate": 2.7653061224489795e-06, + "loss": 0.2004, + "step": 16890 + }, + { + "epoch": 86.22448979591837, + "grad_norm": 14.89189624786377, + "learning_rate": 2.7551020408163266e-06, + "loss": 0.201, + "step": 16900 + }, + { + "epoch": 86.27551020408163, + "grad_norm": 0.510366678237915, + "learning_rate": 2.7448979591836737e-06, + "loss": 0.1122, + "step": 16910 + }, + { + "epoch": 86.3265306122449, + "grad_norm": 8.832609176635742, + "learning_rate": 2.7346938775510207e-06, + "loss": 0.2677, + "step": 16920 + }, + { + "epoch": 86.37755102040816, + "grad_norm": 0.24932265281677246, + "learning_rate": 2.7244897959183674e-06, + "loss": 0.2158, + "step": 16930 + }, + { + "epoch": 86.42857142857143, + "grad_norm": 6.414112091064453, + "learning_rate": 2.7142857142857144e-06, + "loss": 0.1902, + "step": 16940 + }, + { + "epoch": 86.4795918367347, + "grad_norm": 0.8077690601348877, + "learning_rate": 2.7040816326530615e-06, + "loss": 0.1207, + "step": 16950 + }, + { + "epoch": 86.53061224489795, + "grad_norm": 1.0012266635894775, + "learning_rate": 2.6938775510204086e-06, + "loss": 0.29, + "step": 16960 + }, + { + "epoch": 86.58163265306122, + "grad_norm": 1.7598363161087036, + "learning_rate": 2.683673469387755e-06, + "loss": 0.4445, + "step": 16970 + }, + { + "epoch": 86.63265306122449, + "grad_norm": 11.231893539428711, + "learning_rate": 2.6734693877551023e-06, + "loss": 0.1348, + "step": 16980 + }, + { + "epoch": 86.68367346938776, + "grad_norm": 11.320291519165039, + "learning_rate": 2.663265306122449e-06, + "loss": 0.1525, + "step": 16990 + }, + { + "epoch": 86.73469387755102, + "grad_norm": 0.5130727291107178, + "learning_rate": 2.6530612244897964e-06, + "loss": 0.2855, + "step": 17000 + }, + { + "epoch": 86.78571428571429, + "grad_norm": 8.888845443725586, + "learning_rate": 2.642857142857143e-06, + "loss": 0.1978, + "step": 17010 + }, + { + "epoch": 86.83673469387755, + "grad_norm": 5.790079593658447, + "learning_rate": 2.63265306122449e-06, + "loss": 0.0955, + "step": 17020 + }, + { + "epoch": 86.88775510204081, + "grad_norm": 12.437826156616211, + "learning_rate": 2.6224489795918367e-06, + "loss": 0.1971, + "step": 17030 + }, + { + "epoch": 86.93877551020408, + "grad_norm": 2.4969210624694824, + "learning_rate": 2.6122448979591842e-06, + "loss": 0.2551, + "step": 17040 + }, + { + "epoch": 86.98979591836735, + "grad_norm": 22.423816680908203, + "learning_rate": 2.602040816326531e-06, + "loss": 0.7193, + "step": 17050 + }, + { + "epoch": 87.0, + "eval_accuracy": 0.9061371841155235, + "eval_loss": 0.33839184045791626, + "eval_runtime": 0.957, + "eval_samples_per_second": 289.444, + "eval_steps_per_second": 36.572, + "step": 17052 + }, + { + "epoch": 87.04081632653062, + "grad_norm": 4.3819355964660645, + "learning_rate": 2.591836734693878e-06, + "loss": 0.2473, + "step": 17060 + }, + { + "epoch": 87.09183673469387, + "grad_norm": 0.5776048302650452, + "learning_rate": 2.5816326530612246e-06, + "loss": 0.2227, + "step": 17070 + }, + { + "epoch": 87.14285714285714, + "grad_norm": 16.769515991210938, + "learning_rate": 2.571428571428571e-06, + "loss": 0.248, + "step": 17080 + }, + { + "epoch": 87.1938775510204, + "grad_norm": 9.135167121887207, + "learning_rate": 2.5612244897959187e-06, + "loss": 0.2463, + "step": 17090 + }, + { + "epoch": 87.24489795918367, + "grad_norm": 11.403244972229004, + "learning_rate": 2.5510204081632657e-06, + "loss": 0.4687, + "step": 17100 + }, + { + "epoch": 87.29591836734694, + "grad_norm": 14.178475379943848, + "learning_rate": 2.5408163265306124e-06, + "loss": 0.3548, + "step": 17110 + }, + { + "epoch": 87.34693877551021, + "grad_norm": 0.38246801495552063, + "learning_rate": 2.530612244897959e-06, + "loss": 0.0965, + "step": 17120 + }, + { + "epoch": 87.39795918367346, + "grad_norm": 0.6379547119140625, + "learning_rate": 2.5204081632653065e-06, + "loss": 0.0286, + "step": 17130 + }, + { + "epoch": 87.44897959183673, + "grad_norm": 1.0402100086212158, + "learning_rate": 2.5102040816326536e-06, + "loss": 0.134, + "step": 17140 + }, + { + "epoch": 87.5, + "grad_norm": 0.46137455105781555, + "learning_rate": 2.5e-06, + "loss": 0.1412, + "step": 17150 + }, + { + "epoch": 87.55102040816327, + "grad_norm": 10.064380645751953, + "learning_rate": 2.489795918367347e-06, + "loss": 0.264, + "step": 17160 + }, + { + "epoch": 87.60204081632654, + "grad_norm": 13.384167671203613, + "learning_rate": 2.479591836734694e-06, + "loss": 0.2698, + "step": 17170 + }, + { + "epoch": 87.65306122448979, + "grad_norm": 10.690115928649902, + "learning_rate": 2.469387755102041e-06, + "loss": 0.2503, + "step": 17180 + }, + { + "epoch": 87.70408163265306, + "grad_norm": 0.05725846067070961, + "learning_rate": 2.459183673469388e-06, + "loss": 0.2594, + "step": 17190 + }, + { + "epoch": 87.75510204081633, + "grad_norm": 0.4028679132461548, + "learning_rate": 2.4489795918367347e-06, + "loss": 0.059, + "step": 17200 + }, + { + "epoch": 87.8061224489796, + "grad_norm": 13.775506019592285, + "learning_rate": 2.4387755102040817e-06, + "loss": 0.3321, + "step": 17210 + }, + { + "epoch": 87.85714285714286, + "grad_norm": 1.2155430316925049, + "learning_rate": 2.428571428571429e-06, + "loss": 0.2333, + "step": 17220 + }, + { + "epoch": 87.90816326530613, + "grad_norm": 15.729743003845215, + "learning_rate": 2.418367346938776e-06, + "loss": 0.1389, + "step": 17230 + }, + { + "epoch": 87.95918367346938, + "grad_norm": 0.565786600112915, + "learning_rate": 2.4081632653061225e-06, + "loss": 0.1149, + "step": 17240 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.9097472924187726, + "eval_loss": 0.3703186511993408, + "eval_runtime": 1.0082, + "eval_samples_per_second": 274.745, + "eval_steps_per_second": 34.715, + "step": 17248 + }, + { + "epoch": 88.01020408163265, + "grad_norm": 0.8142939209938049, + "learning_rate": 2.3979591836734696e-06, + "loss": 0.148, + "step": 17250 + }, + { + "epoch": 88.06122448979592, + "grad_norm": 3.3334763050079346, + "learning_rate": 2.3877551020408166e-06, + "loss": 0.1569, + "step": 17260 + }, + { + "epoch": 88.11224489795919, + "grad_norm": 20.91797637939453, + "learning_rate": 2.3775510204081633e-06, + "loss": 0.1814, + "step": 17270 + }, + { + "epoch": 88.16326530612245, + "grad_norm": 15.569923400878906, + "learning_rate": 2.3673469387755103e-06, + "loss": 0.2453, + "step": 17280 + }, + { + "epoch": 88.21428571428571, + "grad_norm": 5.313724994659424, + "learning_rate": 2.3571428571428574e-06, + "loss": 0.0974, + "step": 17290 + }, + { + "epoch": 88.26530612244898, + "grad_norm": 6.935853958129883, + "learning_rate": 2.3469387755102044e-06, + "loss": 0.1783, + "step": 17300 + }, + { + "epoch": 88.31632653061224, + "grad_norm": 8.016472816467285, + "learning_rate": 2.336734693877551e-06, + "loss": 0.2279, + "step": 17310 + }, + { + "epoch": 88.36734693877551, + "grad_norm": 0.44269347190856934, + "learning_rate": 2.326530612244898e-06, + "loss": 0.2314, + "step": 17320 + }, + { + "epoch": 88.41836734693878, + "grad_norm": 4.441252708435059, + "learning_rate": 2.316326530612245e-06, + "loss": 0.1242, + "step": 17330 + }, + { + "epoch": 88.46938775510205, + "grad_norm": 1.3162168264389038, + "learning_rate": 2.306122448979592e-06, + "loss": 0.731, + "step": 17340 + }, + { + "epoch": 88.5204081632653, + "grad_norm": 3.208286762237549, + "learning_rate": 2.295918367346939e-06, + "loss": 0.357, + "step": 17350 + }, + { + "epoch": 88.57142857142857, + "grad_norm": 13.052005767822266, + "learning_rate": 2.285714285714286e-06, + "loss": 0.0895, + "step": 17360 + }, + { + "epoch": 88.62244897959184, + "grad_norm": 13.767204284667969, + "learning_rate": 2.275510204081633e-06, + "loss": 0.2135, + "step": 17370 + }, + { + "epoch": 88.6734693877551, + "grad_norm": 0.7811937928199768, + "learning_rate": 2.2653061224489797e-06, + "loss": 0.3113, + "step": 17380 + }, + { + "epoch": 88.72448979591837, + "grad_norm": 10.270852088928223, + "learning_rate": 2.2551020408163267e-06, + "loss": 0.2986, + "step": 17390 + }, + { + "epoch": 88.77551020408163, + "grad_norm": 0.29720714688301086, + "learning_rate": 2.244897959183674e-06, + "loss": 0.1637, + "step": 17400 + }, + { + "epoch": 88.8265306122449, + "grad_norm": 5.928308486938477, + "learning_rate": 2.2346938775510204e-06, + "loss": 0.1065, + "step": 17410 + }, + { + "epoch": 88.87755102040816, + "grad_norm": 5.635687351226807, + "learning_rate": 2.2244897959183675e-06, + "loss": 0.3591, + "step": 17420 + }, + { + "epoch": 88.92857142857143, + "grad_norm": 0.88820880651474, + "learning_rate": 2.2142857142857146e-06, + "loss": 0.1143, + "step": 17430 + }, + { + "epoch": 88.9795918367347, + "grad_norm": 5.052770614624023, + "learning_rate": 2.2040816326530616e-06, + "loss": 0.0269, + "step": 17440 + }, + { + "epoch": 89.0, + "eval_accuracy": 0.8592057761732852, + "eval_loss": 0.5013492703437805, + "eval_runtime": 0.9338, + "eval_samples_per_second": 296.635, + "eval_steps_per_second": 37.481, + "step": 17444 + }, + { + "epoch": 89.03061224489795, + "grad_norm": 15.623953819274902, + "learning_rate": 2.1938775510204083e-06, + "loss": 0.2499, + "step": 17450 + }, + { + "epoch": 89.08163265306122, + "grad_norm": 3.665088653564453, + "learning_rate": 2.1836734693877553e-06, + "loss": 0.1199, + "step": 17460 + }, + { + "epoch": 89.13265306122449, + "grad_norm": 1.6090409755706787, + "learning_rate": 2.1734693877551024e-06, + "loss": 0.3602, + "step": 17470 + }, + { + "epoch": 89.18367346938776, + "grad_norm": 16.96292495727539, + "learning_rate": 2.1632653061224495e-06, + "loss": 0.2923, + "step": 17480 + }, + { + "epoch": 89.23469387755102, + "grad_norm": 1.3773144483566284, + "learning_rate": 2.153061224489796e-06, + "loss": 0.2201, + "step": 17490 + }, + { + "epoch": 89.28571428571429, + "grad_norm": 1.2361564636230469, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.301, + "step": 17500 + }, + { + "epoch": 89.33673469387755, + "grad_norm": 0.3144189119338989, + "learning_rate": 2.1326530612244902e-06, + "loss": 0.1185, + "step": 17510 + }, + { + "epoch": 89.38775510204081, + "grad_norm": 2.084606170654297, + "learning_rate": 2.122448979591837e-06, + "loss": 0.2111, + "step": 17520 + }, + { + "epoch": 89.43877551020408, + "grad_norm": 0.8495397567749023, + "learning_rate": 2.112244897959184e-06, + "loss": 0.2766, + "step": 17530 + }, + { + "epoch": 89.48979591836735, + "grad_norm": 1.677476167678833, + "learning_rate": 2.1020408163265306e-06, + "loss": 0.17, + "step": 17540 + }, + { + "epoch": 89.54081632653062, + "grad_norm": 23.526060104370117, + "learning_rate": 2.0918367346938776e-06, + "loss": 0.272, + "step": 17550 + }, + { + "epoch": 89.59183673469387, + "grad_norm": 1.2515920400619507, + "learning_rate": 2.0816326530612247e-06, + "loss": 0.1553, + "step": 17560 + }, + { + "epoch": 89.64285714285714, + "grad_norm": 16.200321197509766, + "learning_rate": 2.0714285714285717e-06, + "loss": 0.1367, + "step": 17570 + }, + { + "epoch": 89.6938775510204, + "grad_norm": 1.1732423305511475, + "learning_rate": 2.0612244897959184e-06, + "loss": 0.1491, + "step": 17580 + }, + { + "epoch": 89.74489795918367, + "grad_norm": 0.4454115331172943, + "learning_rate": 2.0510204081632654e-06, + "loss": 0.1806, + "step": 17590 + }, + { + "epoch": 89.79591836734694, + "grad_norm": 3.885526657104492, + "learning_rate": 2.0408163265306125e-06, + "loss": 0.0694, + "step": 17600 + }, + { + "epoch": 89.84693877551021, + "grad_norm": 6.656656265258789, + "learning_rate": 2.030612244897959e-06, + "loss": 0.2663, + "step": 17610 + }, + { + "epoch": 89.89795918367346, + "grad_norm": 7.046533584594727, + "learning_rate": 2.020408163265306e-06, + "loss": 0.3706, + "step": 17620 + }, + { + "epoch": 89.94897959183673, + "grad_norm": 1.4293006658554077, + "learning_rate": 2.0102040816326533e-06, + "loss": 0.1359, + "step": 17630 + }, + { + "epoch": 90.0, + "grad_norm": 10.948988914489746, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0967, + "step": 17640 + }, + { + "epoch": 90.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.3456423580646515, + "eval_runtime": 0.9381, + "eval_samples_per_second": 295.278, + "eval_steps_per_second": 37.309, + "step": 17640 + }, + { + "epoch": 90.05102040816327, + "grad_norm": 0.12575684487819672, + "learning_rate": 1.989795918367347e-06, + "loss": 0.1386, + "step": 17650 + }, + { + "epoch": 90.10204081632654, + "grad_norm": 1.654032588005066, + "learning_rate": 1.979591836734694e-06, + "loss": 0.3257, + "step": 17660 + }, + { + "epoch": 90.15306122448979, + "grad_norm": 1.0453482866287231, + "learning_rate": 1.969387755102041e-06, + "loss": 0.2971, + "step": 17670 + }, + { + "epoch": 90.20408163265306, + "grad_norm": 4.922945022583008, + "learning_rate": 1.9591836734693877e-06, + "loss": 0.302, + "step": 17680 + }, + { + "epoch": 90.25510204081633, + "grad_norm": 1.6829240322113037, + "learning_rate": 1.948979591836735e-06, + "loss": 0.0836, + "step": 17690 + }, + { + "epoch": 90.3061224489796, + "grad_norm": 13.822141647338867, + "learning_rate": 1.938775510204082e-06, + "loss": 0.1002, + "step": 17700 + }, + { + "epoch": 90.35714285714286, + "grad_norm": 15.23730754852295, + "learning_rate": 1.928571428571429e-06, + "loss": 0.3489, + "step": 17710 + }, + { + "epoch": 90.40816326530613, + "grad_norm": 1.2752635478973389, + "learning_rate": 1.9183673469387756e-06, + "loss": 0.0777, + "step": 17720 + }, + { + "epoch": 90.45918367346938, + "grad_norm": 1.7229952812194824, + "learning_rate": 1.9081632653061226e-06, + "loss": 0.0637, + "step": 17730 + }, + { + "epoch": 90.51020408163265, + "grad_norm": 4.0512542724609375, + "learning_rate": 1.8979591836734695e-06, + "loss": 0.3496, + "step": 17740 + }, + { + "epoch": 90.56122448979592, + "grad_norm": 1.8018417358398438, + "learning_rate": 1.8877551020408163e-06, + "loss": 0.1146, + "step": 17750 + }, + { + "epoch": 90.61224489795919, + "grad_norm": 1.9088629484176636, + "learning_rate": 1.8775510204081634e-06, + "loss": 0.1174, + "step": 17760 + }, + { + "epoch": 90.66326530612245, + "grad_norm": 0.3909110426902771, + "learning_rate": 1.8673469387755102e-06, + "loss": 0.3986, + "step": 17770 + }, + { + "epoch": 90.71428571428571, + "grad_norm": 16.561473846435547, + "learning_rate": 1.8571428571428573e-06, + "loss": 0.2499, + "step": 17780 + }, + { + "epoch": 90.76530612244898, + "grad_norm": 0.9881889224052429, + "learning_rate": 1.8469387755102042e-06, + "loss": 0.2025, + "step": 17790 + }, + { + "epoch": 90.81632653061224, + "grad_norm": 1.2124401330947876, + "learning_rate": 1.8367346938775512e-06, + "loss": 0.1655, + "step": 17800 + }, + { + "epoch": 90.86734693877551, + "grad_norm": 0.8858227729797363, + "learning_rate": 1.826530612244898e-06, + "loss": 0.1717, + "step": 17810 + }, + { + "epoch": 90.91836734693878, + "grad_norm": 8.273017883300781, + "learning_rate": 1.8163265306122451e-06, + "loss": 0.1776, + "step": 17820 + }, + { + "epoch": 90.96938775510205, + "grad_norm": 16.358352661132812, + "learning_rate": 1.806122448979592e-06, + "loss": 0.177, + "step": 17830 + }, + { + "epoch": 91.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.3798539340496063, + "eval_runtime": 0.935, + "eval_samples_per_second": 296.266, + "eval_steps_per_second": 37.434, + "step": 17836 + }, + { + "epoch": 91.0204081632653, + "grad_norm": 11.906256675720215, + "learning_rate": 1.7959183673469388e-06, + "loss": 0.1371, + "step": 17840 + }, + { + "epoch": 91.07142857142857, + "grad_norm": 12.646199226379395, + "learning_rate": 1.7857142857142859e-06, + "loss": 0.133, + "step": 17850 + }, + { + "epoch": 91.12244897959184, + "grad_norm": 3.5620455741882324, + "learning_rate": 1.7755102040816327e-06, + "loss": 0.2049, + "step": 17860 + }, + { + "epoch": 91.1734693877551, + "grad_norm": 15.82820987701416, + "learning_rate": 1.7653061224489798e-06, + "loss": 0.1595, + "step": 17870 + }, + { + "epoch": 91.22448979591837, + "grad_norm": 0.8638943433761597, + "learning_rate": 1.7551020408163267e-06, + "loss": 0.1957, + "step": 17880 + }, + { + "epoch": 91.27551020408163, + "grad_norm": 6.596780776977539, + "learning_rate": 1.7448979591836737e-06, + "loss": 0.1857, + "step": 17890 + }, + { + "epoch": 91.3265306122449, + "grad_norm": 19.93320083618164, + "learning_rate": 1.7346938775510206e-06, + "loss": 0.3028, + "step": 17900 + }, + { + "epoch": 91.37755102040816, + "grad_norm": 1.593752145767212, + "learning_rate": 1.7244897959183676e-06, + "loss": 0.1536, + "step": 17910 + }, + { + "epoch": 91.42857142857143, + "grad_norm": 6.460892200469971, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.1282, + "step": 17920 + }, + { + "epoch": 91.4795918367347, + "grad_norm": 0.6627766489982605, + "learning_rate": 1.7040816326530613e-06, + "loss": 0.3066, + "step": 17930 + }, + { + "epoch": 91.53061224489795, + "grad_norm": 5.6217360496521, + "learning_rate": 1.6938775510204084e-06, + "loss": 0.1714, + "step": 17940 + }, + { + "epoch": 91.58163265306122, + "grad_norm": 2.5595052242279053, + "learning_rate": 1.6836734693877552e-06, + "loss": 0.1745, + "step": 17950 + }, + { + "epoch": 91.63265306122449, + "grad_norm": 6.671074867248535, + "learning_rate": 1.6734693877551023e-06, + "loss": 0.2174, + "step": 17960 + }, + { + "epoch": 91.68367346938776, + "grad_norm": 8.686925888061523, + "learning_rate": 1.6632653061224492e-06, + "loss": 0.2493, + "step": 17970 + }, + { + "epoch": 91.73469387755102, + "grad_norm": 0.4992872476577759, + "learning_rate": 1.6530612244897962e-06, + "loss": 0.0777, + "step": 17980 + }, + { + "epoch": 91.78571428571429, + "grad_norm": 0.5224694609642029, + "learning_rate": 1.642857142857143e-06, + "loss": 0.1589, + "step": 17990 + }, + { + "epoch": 91.83673469387755, + "grad_norm": 20.030405044555664, + "learning_rate": 1.6326530612244897e-06, + "loss": 0.5603, + "step": 18000 + }, + { + "epoch": 91.88775510204081, + "grad_norm": 8.830347061157227, + "learning_rate": 1.622448979591837e-06, + "loss": 0.093, + "step": 18010 + }, + { + "epoch": 91.93877551020408, + "grad_norm": 5.54904317855835, + "learning_rate": 1.6122448979591836e-06, + "loss": 0.1399, + "step": 18020 + }, + { + "epoch": 91.98979591836735, + "grad_norm": 1.547883152961731, + "learning_rate": 1.602040816326531e-06, + "loss": 0.1917, + "step": 18030 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.9061371841155235, + "eval_loss": 0.32391440868377686, + "eval_runtime": 0.9963, + "eval_samples_per_second": 278.038, + "eval_steps_per_second": 35.131, + "step": 18032 + }, + { + "epoch": 92.04081632653062, + "grad_norm": 0.934727132320404, + "learning_rate": 1.5918367346938775e-06, + "loss": 0.0907, + "step": 18040 + }, + { + "epoch": 92.09183673469387, + "grad_norm": 20.481788635253906, + "learning_rate": 1.5816326530612248e-06, + "loss": 0.2082, + "step": 18050 + }, + { + "epoch": 92.14285714285714, + "grad_norm": 0.2531684935092926, + "learning_rate": 1.5714285714285714e-06, + "loss": 0.1877, + "step": 18060 + }, + { + "epoch": 92.1938775510204, + "grad_norm": 1.0499683618545532, + "learning_rate": 1.5612244897959187e-06, + "loss": 0.2461, + "step": 18070 + }, + { + "epoch": 92.24489795918367, + "grad_norm": 7.276120185852051, + "learning_rate": 1.5510204081632654e-06, + "loss": 0.2614, + "step": 18080 + }, + { + "epoch": 92.29591836734694, + "grad_norm": 11.999966621398926, + "learning_rate": 1.5408163265306122e-06, + "loss": 0.2402, + "step": 18090 + }, + { + "epoch": 92.34693877551021, + "grad_norm": 7.187252998352051, + "learning_rate": 1.5306122448979593e-06, + "loss": 0.2077, + "step": 18100 + }, + { + "epoch": 92.39795918367346, + "grad_norm": 7.240973472595215, + "learning_rate": 1.5204081632653061e-06, + "loss": 0.298, + "step": 18110 + }, + { + "epoch": 92.44897959183673, + "grad_norm": 0.2603718936443329, + "learning_rate": 1.5102040816326532e-06, + "loss": 0.1392, + "step": 18120 + }, + { + "epoch": 92.5, + "grad_norm": 0.6799657344818115, + "learning_rate": 1.5e-06, + "loss": 0.1727, + "step": 18130 + }, + { + "epoch": 92.55102040816327, + "grad_norm": 6.5458574295043945, + "learning_rate": 1.489795918367347e-06, + "loss": 0.1966, + "step": 18140 + }, + { + "epoch": 92.60204081632654, + "grad_norm": 13.007467269897461, + "learning_rate": 1.479591836734694e-06, + "loss": 0.2052, + "step": 18150 + }, + { + "epoch": 92.65306122448979, + "grad_norm": 13.764470100402832, + "learning_rate": 1.469387755102041e-06, + "loss": 0.2512, + "step": 18160 + }, + { + "epoch": 92.70408163265306, + "grad_norm": 0.7824844717979431, + "learning_rate": 1.4591836734693879e-06, + "loss": 0.0945, + "step": 18170 + }, + { + "epoch": 92.75510204081633, + "grad_norm": 16.08002471923828, + "learning_rate": 1.4489795918367347e-06, + "loss": 0.3292, + "step": 18180 + }, + { + "epoch": 92.8061224489796, + "grad_norm": 1.7446142435073853, + "learning_rate": 1.4387755102040818e-06, + "loss": 0.3982, + "step": 18190 + }, + { + "epoch": 92.85714285714286, + "grad_norm": 0.45311930775642395, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.2266, + "step": 18200 + }, + { + "epoch": 92.90816326530613, + "grad_norm": 1.0091290473937988, + "learning_rate": 1.4183673469387757e-06, + "loss": 0.3635, + "step": 18210 + }, + { + "epoch": 92.95918367346938, + "grad_norm": 9.684619903564453, + "learning_rate": 1.4081632653061225e-06, + "loss": 0.2082, + "step": 18220 + }, + { + "epoch": 93.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.4861180782318115, + "eval_runtime": 0.9336, + "eval_samples_per_second": 296.71, + "eval_steps_per_second": 37.49, + "step": 18228 + }, + { + "epoch": 93.01020408163265, + "grad_norm": 7.2504401206970215, + "learning_rate": 1.3979591836734696e-06, + "loss": 0.4401, + "step": 18230 + }, + { + "epoch": 93.06122448979592, + "grad_norm": 2.0435526371002197, + "learning_rate": 1.3877551020408165e-06, + "loss": 0.224, + "step": 18240 + }, + { + "epoch": 93.11224489795919, + "grad_norm": 11.459671020507812, + "learning_rate": 1.3775510204081633e-06, + "loss": 0.258, + "step": 18250 + }, + { + "epoch": 93.16326530612245, + "grad_norm": 7.774988651275635, + "learning_rate": 1.3673469387755104e-06, + "loss": 0.1181, + "step": 18260 + }, + { + "epoch": 93.21428571428571, + "grad_norm": 11.726682662963867, + "learning_rate": 1.3571428571428572e-06, + "loss": 0.2191, + "step": 18270 + }, + { + "epoch": 93.26530612244898, + "grad_norm": 10.793588638305664, + "learning_rate": 1.3469387755102043e-06, + "loss": 0.2481, + "step": 18280 + }, + { + "epoch": 93.31632653061224, + "grad_norm": 6.10498571395874, + "learning_rate": 1.3367346938775511e-06, + "loss": 0.4219, + "step": 18290 + }, + { + "epoch": 93.36734693877551, + "grad_norm": 2.345142126083374, + "learning_rate": 1.3265306122448982e-06, + "loss": 0.2457, + "step": 18300 + }, + { + "epoch": 93.41836734693878, + "grad_norm": 10.917703628540039, + "learning_rate": 1.316326530612245e-06, + "loss": 0.2174, + "step": 18310 + }, + { + "epoch": 93.46938775510205, + "grad_norm": 1.5860787630081177, + "learning_rate": 1.3061224489795921e-06, + "loss": 0.1963, + "step": 18320 + }, + { + "epoch": 93.5204081632653, + "grad_norm": 0.7468981742858887, + "learning_rate": 1.295918367346939e-06, + "loss": 0.1721, + "step": 18330 + }, + { + "epoch": 93.57142857142857, + "grad_norm": 5.929870128631592, + "learning_rate": 1.2857142857142856e-06, + "loss": 0.2093, + "step": 18340 + }, + { + "epoch": 93.62244897959184, + "grad_norm": 7.6047821044921875, + "learning_rate": 1.2755102040816329e-06, + "loss": 0.1547, + "step": 18350 + }, + { + "epoch": 93.6734693877551, + "grad_norm": 0.3163466155529022, + "learning_rate": 1.2653061224489795e-06, + "loss": 0.1218, + "step": 18360 + }, + { + "epoch": 93.72448979591837, + "grad_norm": 1.5049079656600952, + "learning_rate": 1.2551020408163268e-06, + "loss": 0.1935, + "step": 18370 + }, + { + "epoch": 93.77551020408163, + "grad_norm": 2.0531539916992188, + "learning_rate": 1.2448979591836734e-06, + "loss": 0.3614, + "step": 18380 + }, + { + "epoch": 93.8265306122449, + "grad_norm": 2.066619634628296, + "learning_rate": 1.2346938775510205e-06, + "loss": 0.1524, + "step": 18390 + }, + { + "epoch": 93.87755102040816, + "grad_norm": 0.1106644943356514, + "learning_rate": 1.2244897959183673e-06, + "loss": 0.1908, + "step": 18400 + }, + { + "epoch": 93.92857142857143, + "grad_norm": 0.2896367609500885, + "learning_rate": 1.2142857142857144e-06, + "loss": 0.0657, + "step": 18410 + }, + { + "epoch": 93.9795918367347, + "grad_norm": 22.562599182128906, + "learning_rate": 1.2040816326530612e-06, + "loss": 0.3836, + "step": 18420 + }, + { + "epoch": 94.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.4443919062614441, + "eval_runtime": 0.9771, + "eval_samples_per_second": 283.506, + "eval_steps_per_second": 35.822, + "step": 18424 + }, + { + "epoch": 94.03061224489795, + "grad_norm": 4.277006149291992, + "learning_rate": 1.1938775510204083e-06, + "loss": 0.1953, + "step": 18430 + }, + { + "epoch": 94.08163265306122, + "grad_norm": 5.866278648376465, + "learning_rate": 1.1836734693877552e-06, + "loss": 0.2129, + "step": 18440 + }, + { + "epoch": 94.13265306122449, + "grad_norm": 13.665555000305176, + "learning_rate": 1.1734693877551022e-06, + "loss": 0.171, + "step": 18450 + }, + { + "epoch": 94.18367346938776, + "grad_norm": 6.091044902801514, + "learning_rate": 1.163265306122449e-06, + "loss": 0.172, + "step": 18460 + }, + { + "epoch": 94.23469387755102, + "grad_norm": 13.635805130004883, + "learning_rate": 1.153061224489796e-06, + "loss": 0.3361, + "step": 18470 + }, + { + "epoch": 94.28571428571429, + "grad_norm": 1.790496826171875, + "learning_rate": 1.142857142857143e-06, + "loss": 0.0893, + "step": 18480 + }, + { + "epoch": 94.33673469387755, + "grad_norm": 3.7977652549743652, + "learning_rate": 1.1326530612244898e-06, + "loss": 0.1547, + "step": 18490 + }, + { + "epoch": 94.38775510204081, + "grad_norm": 7.650182247161865, + "learning_rate": 1.122448979591837e-06, + "loss": 0.2096, + "step": 18500 + }, + { + "epoch": 94.43877551020408, + "grad_norm": 4.0888352394104, + "learning_rate": 1.1122448979591838e-06, + "loss": 0.1384, + "step": 18510 + }, + { + "epoch": 94.48979591836735, + "grad_norm": 4.018925666809082, + "learning_rate": 1.1020408163265308e-06, + "loss": 0.1704, + "step": 18520 + }, + { + "epoch": 94.54081632653062, + "grad_norm": 3.2753589153289795, + "learning_rate": 1.0918367346938777e-06, + "loss": 0.1741, + "step": 18530 + }, + { + "epoch": 94.59183673469387, + "grad_norm": 18.035402297973633, + "learning_rate": 1.0816326530612247e-06, + "loss": 0.1346, + "step": 18540 + }, + { + "epoch": 94.64285714285714, + "grad_norm": 6.348093032836914, + "learning_rate": 1.0714285714285714e-06, + "loss": 0.1886, + "step": 18550 + }, + { + "epoch": 94.6938775510204, + "grad_norm": 6.390273571014404, + "learning_rate": 1.0612244897959184e-06, + "loss": 0.1955, + "step": 18560 + }, + { + "epoch": 94.74489795918367, + "grad_norm": 0.4464976489543915, + "learning_rate": 1.0510204081632653e-06, + "loss": 0.2558, + "step": 18570 + }, + { + "epoch": 94.79591836734694, + "grad_norm": 11.005455017089844, + "learning_rate": 1.0408163265306123e-06, + "loss": 0.2175, + "step": 18580 + }, + { + "epoch": 94.84693877551021, + "grad_norm": 5.856806755065918, + "learning_rate": 1.0306122448979592e-06, + "loss": 0.2982, + "step": 18590 + }, + { + "epoch": 94.89795918367346, + "grad_norm": 1.477173089981079, + "learning_rate": 1.0204081632653063e-06, + "loss": 0.2076, + "step": 18600 + }, + { + "epoch": 94.94897959183673, + "grad_norm": 12.501953125, + "learning_rate": 1.010204081632653e-06, + "loss": 0.1872, + "step": 18610 + }, + { + "epoch": 95.0, + "grad_norm": 1.0849741697311401, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.1, + "step": 18620 + }, + { + "epoch": 95.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.3712524473667145, + "eval_runtime": 0.935, + "eval_samples_per_second": 296.245, + "eval_steps_per_second": 37.432, + "step": 18620 + }, + { + "epoch": 95.05102040816327, + "grad_norm": 0.12624509632587433, + "learning_rate": 9.89795918367347e-07, + "loss": 0.0413, + "step": 18630 + }, + { + "epoch": 95.10204081632654, + "grad_norm": 5.990360736846924, + "learning_rate": 9.795918367346939e-07, + "loss": 0.2007, + "step": 18640 + }, + { + "epoch": 95.15306122448979, + "grad_norm": 10.36894416809082, + "learning_rate": 9.69387755102041e-07, + "loss": 0.1651, + "step": 18650 + }, + { + "epoch": 95.20408163265306, + "grad_norm": 4.233863830566406, + "learning_rate": 9.591836734693878e-07, + "loss": 0.1365, + "step": 18660 + }, + { + "epoch": 95.25510204081633, + "grad_norm": 2.113361358642578, + "learning_rate": 9.489795918367347e-07, + "loss": 0.2163, + "step": 18670 + }, + { + "epoch": 95.3061224489796, + "grad_norm": 1.022985816001892, + "learning_rate": 9.387755102040817e-07, + "loss": 0.1879, + "step": 18680 + }, + { + "epoch": 95.35714285714286, + "grad_norm": 2.9235568046569824, + "learning_rate": 9.285714285714287e-07, + "loss": 0.0615, + "step": 18690 + }, + { + "epoch": 95.40816326530613, + "grad_norm": 3.9105029106140137, + "learning_rate": 9.183673469387756e-07, + "loss": 0.3205, + "step": 18700 + }, + { + "epoch": 95.45918367346938, + "grad_norm": 7.213064670562744, + "learning_rate": 9.081632653061226e-07, + "loss": 0.3415, + "step": 18710 + }, + { + "epoch": 95.51020408163265, + "grad_norm": 3.849168539047241, + "learning_rate": 8.979591836734694e-07, + "loss": 0.1077, + "step": 18720 + }, + { + "epoch": 95.56122448979592, + "grad_norm": 0.49878036975860596, + "learning_rate": 8.877551020408164e-07, + "loss": 0.2587, + "step": 18730 + }, + { + "epoch": 95.61224489795919, + "grad_norm": 18.13332748413086, + "learning_rate": 8.775510204081633e-07, + "loss": 0.3615, + "step": 18740 + }, + { + "epoch": 95.66326530612245, + "grad_norm": 6.391093730926514, + "learning_rate": 8.673469387755103e-07, + "loss": 0.1355, + "step": 18750 + }, + { + "epoch": 95.71428571428571, + "grad_norm": 15.012079238891602, + "learning_rate": 8.571428571428572e-07, + "loss": 0.1515, + "step": 18760 + }, + { + "epoch": 95.76530612244898, + "grad_norm": 0.518864631652832, + "learning_rate": 8.469387755102042e-07, + "loss": 0.2164, + "step": 18770 + }, + { + "epoch": 95.81632653061224, + "grad_norm": 21.811342239379883, + "learning_rate": 8.367346938775512e-07, + "loss": 0.3386, + "step": 18780 + }, + { + "epoch": 95.86734693877551, + "grad_norm": 8.917824745178223, + "learning_rate": 8.265306122448981e-07, + "loss": 0.2287, + "step": 18790 + }, + { + "epoch": 95.91836734693878, + "grad_norm": 2.237412214279175, + "learning_rate": 8.163265306122449e-07, + "loss": 0.0678, + "step": 18800 + }, + { + "epoch": 95.96938775510205, + "grad_norm": 6.585892677307129, + "learning_rate": 8.061224489795918e-07, + "loss": 0.1785, + "step": 18810 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.8303249097472925, + "eval_loss": 0.427941232919693, + "eval_runtime": 0.9327, + "eval_samples_per_second": 296.984, + "eval_steps_per_second": 37.525, + "step": 18816 + }, + { + "epoch": 96.0204081632653, + "grad_norm": 0.1639726758003235, + "learning_rate": 7.959183673469388e-07, + "loss": 0.1298, + "step": 18820 + }, + { + "epoch": 96.07142857142857, + "grad_norm": 2.6104323863983154, + "learning_rate": 7.857142857142857e-07, + "loss": 0.0609, + "step": 18830 + }, + { + "epoch": 96.12244897959184, + "grad_norm": 8.791910171508789, + "learning_rate": 7.755102040816327e-07, + "loss": 0.2782, + "step": 18840 + }, + { + "epoch": 96.1734693877551, + "grad_norm": 0.7717019319534302, + "learning_rate": 7.653061224489796e-07, + "loss": 0.5283, + "step": 18850 + }, + { + "epoch": 96.22448979591837, + "grad_norm": 0.5696031451225281, + "learning_rate": 7.551020408163266e-07, + "loss": 0.1675, + "step": 18860 + }, + { + "epoch": 96.27551020408163, + "grad_norm": 3.209054708480835, + "learning_rate": 7.448979591836736e-07, + "loss": 0.2733, + "step": 18870 + }, + { + "epoch": 96.3265306122449, + "grad_norm": 0.5075634121894836, + "learning_rate": 7.346938775510205e-07, + "loss": 0.0789, + "step": 18880 + }, + { + "epoch": 96.37755102040816, + "grad_norm": 6.202861785888672, + "learning_rate": 7.244897959183674e-07, + "loss": 0.1033, + "step": 18890 + }, + { + "epoch": 96.42857142857143, + "grad_norm": 0.5611489415168762, + "learning_rate": 7.142857142857143e-07, + "loss": 0.201, + "step": 18900 + }, + { + "epoch": 96.4795918367347, + "grad_norm": 1.8156996965408325, + "learning_rate": 7.040816326530613e-07, + "loss": 0.15, + "step": 18910 + }, + { + "epoch": 96.53061224489795, + "grad_norm": 8.122031211853027, + "learning_rate": 6.938775510204082e-07, + "loss": 0.0895, + "step": 18920 + }, + { + "epoch": 96.58163265306122, + "grad_norm": 5.461958408355713, + "learning_rate": 6.836734693877552e-07, + "loss": 0.0865, + "step": 18930 + }, + { + "epoch": 96.63265306122449, + "grad_norm": 2.365891456604004, + "learning_rate": 6.734693877551021e-07, + "loss": 0.2617, + "step": 18940 + }, + { + "epoch": 96.68367346938776, + "grad_norm": 6.673534393310547, + "learning_rate": 6.632653061224491e-07, + "loss": 0.2343, + "step": 18950 + }, + { + "epoch": 96.73469387755102, + "grad_norm": 8.938642501831055, + "learning_rate": 6.530612244897961e-07, + "loss": 0.2346, + "step": 18960 + }, + { + "epoch": 96.78571428571429, + "grad_norm": 1.8851449489593506, + "learning_rate": 6.428571428571428e-07, + "loss": 0.186, + "step": 18970 + }, + { + "epoch": 96.83673469387755, + "grad_norm": 4.962350368499756, + "learning_rate": 6.326530612244898e-07, + "loss": 0.2458, + "step": 18980 + }, + { + "epoch": 96.88775510204081, + "grad_norm": 3.843569755554199, + "learning_rate": 6.224489795918367e-07, + "loss": 0.3118, + "step": 18990 + }, + { + "epoch": 96.93877551020408, + "grad_norm": 0.8823779225349426, + "learning_rate": 6.122448979591837e-07, + "loss": 0.1672, + "step": 19000 + }, + { + "epoch": 96.98979591836735, + "grad_norm": 0.39520347118377686, + "learning_rate": 6.020408163265306e-07, + "loss": 0.19, + "step": 19010 + }, + { + "epoch": 97.0, + "eval_accuracy": 0.8411552346570397, + "eval_loss": 0.6587985157966614, + "eval_runtime": 0.9348, + "eval_samples_per_second": 296.305, + "eval_steps_per_second": 37.439, + "step": 19012 + }, + { + "epoch": 97.04081632653062, + "grad_norm": 1.7464901208877563, + "learning_rate": 5.918367346938776e-07, + "loss": 0.1802, + "step": 19020 + }, + { + "epoch": 97.09183673469387, + "grad_norm": 4.634471893310547, + "learning_rate": 5.816326530612245e-07, + "loss": 0.1461, + "step": 19030 + }, + { + "epoch": 97.14285714285714, + "grad_norm": 0.36374080181121826, + "learning_rate": 5.714285714285715e-07, + "loss": 0.1334, + "step": 19040 + }, + { + "epoch": 97.1938775510204, + "grad_norm": 2.8511147499084473, + "learning_rate": 5.612244897959184e-07, + "loss": 0.1569, + "step": 19050 + }, + { + "epoch": 97.24489795918367, + "grad_norm": 2.9540822505950928, + "learning_rate": 5.510204081632654e-07, + "loss": 0.1175, + "step": 19060 + }, + { + "epoch": 97.29591836734694, + "grad_norm": 0.20516809821128845, + "learning_rate": 5.408163265306124e-07, + "loss": 0.2094, + "step": 19070 + }, + { + "epoch": 97.34693877551021, + "grad_norm": 7.4667816162109375, + "learning_rate": 5.306122448979592e-07, + "loss": 0.1318, + "step": 19080 + }, + { + "epoch": 97.39795918367346, + "grad_norm": 0.7376777529716492, + "learning_rate": 5.204081632653062e-07, + "loss": 0.103, + "step": 19090 + }, + { + "epoch": 97.44897959183673, + "grad_norm": 9.432369232177734, + "learning_rate": 5.102040816326531e-07, + "loss": 0.1561, + "step": 19100 + }, + { + "epoch": 97.5, + "grad_norm": 15.827686309814453, + "learning_rate": 5.000000000000001e-07, + "loss": 0.3899, + "step": 19110 + }, + { + "epoch": 97.55102040816327, + "grad_norm": 7.54927396774292, + "learning_rate": 4.897959183673469e-07, + "loss": 0.0907, + "step": 19120 + }, + { + "epoch": 97.60204081632654, + "grad_norm": 14.595918655395508, + "learning_rate": 4.795918367346939e-07, + "loss": 0.5122, + "step": 19130 + }, + { + "epoch": 97.65306122448979, + "grad_norm": 1.9597631692886353, + "learning_rate": 4.6938775510204085e-07, + "loss": 0.3503, + "step": 19140 + }, + { + "epoch": 97.70408163265306, + "grad_norm": 19.388639450073242, + "learning_rate": 4.591836734693878e-07, + "loss": 0.3461, + "step": 19150 + }, + { + "epoch": 97.75510204081633, + "grad_norm": 2.2139663696289062, + "learning_rate": 4.489795918367347e-07, + "loss": 0.1561, + "step": 19160 + }, + { + "epoch": 97.8061224489796, + "grad_norm": 7.7265520095825195, + "learning_rate": 4.3877551020408166e-07, + "loss": 0.3166, + "step": 19170 + }, + { + "epoch": 97.85714285714286, + "grad_norm": 2.6642425060272217, + "learning_rate": 4.285714285714286e-07, + "loss": 0.0529, + "step": 19180 + }, + { + "epoch": 97.90816326530613, + "grad_norm": 5.023648262023926, + "learning_rate": 4.183673469387756e-07, + "loss": 0.2524, + "step": 19190 + }, + { + "epoch": 97.95918367346938, + "grad_norm": 13.295014381408691, + "learning_rate": 4.0816326530612243e-07, + "loss": 0.099, + "step": 19200 + }, + { + "epoch": 98.0, + "eval_accuracy": 0.8267148014440433, + "eval_loss": 0.66323322057724, + "eval_runtime": 0.9334, + "eval_samples_per_second": 296.754, + "eval_steps_per_second": 37.496, + "step": 19208 + }, + { + "epoch": 98.01020408163265, + "grad_norm": 0.9346858263015747, + "learning_rate": 3.979591836734694e-07, + "loss": 0.1299, + "step": 19210 + }, + { + "epoch": 98.06122448979592, + "grad_norm": 9.008681297302246, + "learning_rate": 3.8775510204081634e-07, + "loss": 0.3403, + "step": 19220 + }, + { + "epoch": 98.11224489795919, + "grad_norm": 2.1377320289611816, + "learning_rate": 3.775510204081633e-07, + "loss": 0.1092, + "step": 19230 + }, + { + "epoch": 98.16326530612245, + "grad_norm": 21.07135581970215, + "learning_rate": 3.6734693877551025e-07, + "loss": 0.1871, + "step": 19240 + }, + { + "epoch": 98.21428571428571, + "grad_norm": 6.922109127044678, + "learning_rate": 3.5714285714285716e-07, + "loss": 0.1675, + "step": 19250 + }, + { + "epoch": 98.26530612244898, + "grad_norm": 1.566651463508606, + "learning_rate": 3.469387755102041e-07, + "loss": 0.2421, + "step": 19260 + }, + { + "epoch": 98.31632653061224, + "grad_norm": 9.104406356811523, + "learning_rate": 3.3673469387755107e-07, + "loss": 0.441, + "step": 19270 + }, + { + "epoch": 98.36734693877551, + "grad_norm": 2.4959702491760254, + "learning_rate": 3.2653061224489803e-07, + "loss": 0.1556, + "step": 19280 + }, + { + "epoch": 98.41836734693878, + "grad_norm": 12.012667655944824, + "learning_rate": 3.163265306122449e-07, + "loss": 0.1268, + "step": 19290 + }, + { + "epoch": 98.46938775510205, + "grad_norm": 0.16600196063518524, + "learning_rate": 3.0612244897959183e-07, + "loss": 0.2915, + "step": 19300 + }, + { + "epoch": 98.5204081632653, + "grad_norm": 5.219106197357178, + "learning_rate": 2.959183673469388e-07, + "loss": 0.2216, + "step": 19310 + }, + { + "epoch": 98.57142857142857, + "grad_norm": 0.5317900776863098, + "learning_rate": 2.8571428571428575e-07, + "loss": 0.0964, + "step": 19320 + }, + { + "epoch": 98.62244897959184, + "grad_norm": 2.0076568126678467, + "learning_rate": 2.755102040816327e-07, + "loss": 0.112, + "step": 19330 + }, + { + "epoch": 98.6734693877551, + "grad_norm": 8.94671630859375, + "learning_rate": 2.653061224489796e-07, + "loss": 0.2102, + "step": 19340 + }, + { + "epoch": 98.72448979591837, + "grad_norm": 1.0493749380111694, + "learning_rate": 2.5510204081632656e-07, + "loss": 0.1934, + "step": 19350 + }, + { + "epoch": 98.77551020408163, + "grad_norm": 0.16692312061786652, + "learning_rate": 2.4489795918367347e-07, + "loss": 0.1121, + "step": 19360 + }, + { + "epoch": 98.8265306122449, + "grad_norm": 14.747842788696289, + "learning_rate": 2.3469387755102042e-07, + "loss": 0.1758, + "step": 19370 + }, + { + "epoch": 98.87755102040816, + "grad_norm": 13.387628555297852, + "learning_rate": 2.2448979591836735e-07, + "loss": 0.2225, + "step": 19380 + }, + { + "epoch": 98.92857142857143, + "grad_norm": 0.7674017548561096, + "learning_rate": 2.142857142857143e-07, + "loss": 0.2042, + "step": 19390 + }, + { + "epoch": 98.9795918367347, + "grad_norm": 6.51180362701416, + "learning_rate": 2.0408163265306121e-07, + "loss": 0.1467, + "step": 19400 + }, + { + "epoch": 99.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.4641692638397217, + "eval_runtime": 0.9294, + "eval_samples_per_second": 298.047, + "eval_steps_per_second": 37.659, + "step": 19404 + }, + { + "epoch": 99.03061224489795, + "grad_norm": 9.76559066772461, + "learning_rate": 1.9387755102040817e-07, + "loss": 0.2495, + "step": 19410 + }, + { + "epoch": 99.08163265306122, + "grad_norm": 1.884592056274414, + "learning_rate": 1.8367346938775513e-07, + "loss": 0.2225, + "step": 19420 + }, + { + "epoch": 99.13265306122449, + "grad_norm": 6.432409763336182, + "learning_rate": 1.7346938775510206e-07, + "loss": 0.2526, + "step": 19430 + }, + { + "epoch": 99.18367346938776, + "grad_norm": 4.175636291503906, + "learning_rate": 1.6326530612244901e-07, + "loss": 0.6109, + "step": 19440 + }, + { + "epoch": 99.23469387755102, + "grad_norm": 15.096162796020508, + "learning_rate": 1.5306122448979592e-07, + "loss": 0.276, + "step": 19450 + }, + { + "epoch": 99.28571428571429, + "grad_norm": 7.944954872131348, + "learning_rate": 1.4285714285714287e-07, + "loss": 0.0812, + "step": 19460 + }, + { + "epoch": 99.33673469387755, + "grad_norm": 3.630222797393799, + "learning_rate": 1.326530612244898e-07, + "loss": 0.2417, + "step": 19470 + }, + { + "epoch": 99.38775510204081, + "grad_norm": 1.4471020698547363, + "learning_rate": 1.2244897959183673e-07, + "loss": 0.1571, + "step": 19480 + }, + { + "epoch": 99.43877551020408, + "grad_norm": 1.2315754890441895, + "learning_rate": 1.1224489795918368e-07, + "loss": 0.199, + "step": 19490 + }, + { + "epoch": 99.48979591836735, + "grad_norm": 3.3233890533447266, + "learning_rate": 1.0204081632653061e-07, + "loss": 0.2458, + "step": 19500 + }, + { + "epoch": 99.54081632653062, + "grad_norm": 2.3438429832458496, + "learning_rate": 9.183673469387756e-08, + "loss": 0.0734, + "step": 19510 + }, + { + "epoch": 99.59183673469387, + "grad_norm": 4.115237712860107, + "learning_rate": 8.163265306122451e-08, + "loss": 0.1425, + "step": 19520 + }, + { + "epoch": 99.64285714285714, + "grad_norm": 16.608760833740234, + "learning_rate": 7.142857142857144e-08, + "loss": 0.1921, + "step": 19530 + }, + { + "epoch": 99.6938775510204, + "grad_norm": 6.704372406005859, + "learning_rate": 6.122448979591837e-08, + "loss": 0.1868, + "step": 19540 + }, + { + "epoch": 99.74489795918367, + "grad_norm": 6.689555644989014, + "learning_rate": 5.1020408163265303e-08, + "loss": 0.0779, + "step": 19550 + }, + { + "epoch": 99.79591836734694, + "grad_norm": 1.298974871635437, + "learning_rate": 4.0816326530612253e-08, + "loss": 0.148, + "step": 19560 + }, + { + "epoch": 99.84693877551021, + "grad_norm": 4.185670375823975, + "learning_rate": 3.0612244897959183e-08, + "loss": 0.2843, + "step": 19570 + }, + { + "epoch": 99.89795918367346, + "grad_norm": 2.600241184234619, + "learning_rate": 2.0408163265306127e-08, + "loss": 0.1492, + "step": 19580 + }, + { + "epoch": 99.94897959183673, + "grad_norm": 14.082707405090332, + "learning_rate": 1.0204081632653063e-08, + "loss": 0.1229, + "step": 19590 + }, + { + "epoch": 100.0, + "grad_norm": 0.1434904783964157, + "learning_rate": 0.0, + "loss": 0.2617, + "step": 19600 + }, + { + "epoch": 100.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.3624129593372345, + "eval_runtime": 1.0133, + "eval_samples_per_second": 273.363, + "eval_steps_per_second": 34.54, + "step": 19600 + }, + { + "epoch": 100.0, + "step": 19600, + "total_flos": 5.678990728814592e+17, + "train_loss": 0.30055975163317455, + "train_runtime": 951.6633, + "train_samples_per_second": 164.554, + "train_steps_per_second": 20.596 } ], "logging_steps": 10, - "max_steps": 980, + "max_steps": 19600, "num_input_tokens_seen": 0, - "num_train_epochs": 5, + "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -766,7 +14655,7 @@ "attributes": {} } }, - "total_flos": 1.6633116935737344e+17, + "total_flos": 5.678990728814592e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null