{ "best_metric": 0.5957671957671957, "best_model_checkpoint": "vivit-b-16x2-kinetics400-ft-3620\\checkpoint-2442", "epoch": 49.01109090909091, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018181818181818182, "grad_norm": 3.741217613220215, "learning_rate": 9.09090909090909e-07, "loss": 1.1478, "step": 10 }, { "epoch": 0.0036363636363636364, "grad_norm": 10.490788459777832, "learning_rate": 1.818181818181818e-06, "loss": 1.1191, "step": 20 }, { "epoch": 0.005454545454545455, "grad_norm": 5.955273151397705, "learning_rate": 2.7272727272727272e-06, "loss": 1.1088, "step": 30 }, { "epoch": 0.007272727272727273, "grad_norm": 7.072606086730957, "learning_rate": 3.636363636363636e-06, "loss": 1.1015, "step": 40 }, { "epoch": 0.00909090909090909, "grad_norm": 8.753866195678711, "learning_rate": 4.5454545454545455e-06, "loss": 1.0891, "step": 50 }, { "epoch": 0.01090909090909091, "grad_norm": 4.427119255065918, "learning_rate": 5.4545454545454545e-06, "loss": 1.1044, "step": 60 }, { "epoch": 0.012727272727272728, "grad_norm": 14.78829288482666, "learning_rate": 6.363636363636363e-06, "loss": 1.1065, "step": 70 }, { "epoch": 0.014545454545454545, "grad_norm": 3.910771131515503, "learning_rate": 7.272727272727272e-06, "loss": 1.0878, "step": 80 }, { "epoch": 0.016363636363636365, "grad_norm": 9.14891529083252, "learning_rate": 8.181818181818183e-06, "loss": 1.0748, "step": 90 }, { "epoch": 0.01818181818181818, "grad_norm": 5.1411004066467285, "learning_rate": 9.090909090909091e-06, "loss": 1.1173, "step": 100 }, { "epoch": 0.02, "grad_norm": 8.358501434326172, "learning_rate": 1e-05, "loss": 1.0684, "step": 110 }, { "epoch": 0.020181818181818183, "eval_accuracy": 0.3798941798941799, "eval_loss": 1.1114321947097778, "eval_runtime": 131.7711, "eval_samples_per_second": 7.172, "eval_steps_per_second": 0.903, "step": 111 }, { "epoch": 1.0016363636363637, "grad_norm": 4.467060565948486, "learning_rate": 1.0909090909090909e-05, "loss": 1.0851, "step": 120 }, { "epoch": 1.0034545454545454, "grad_norm": 4.482585430145264, "learning_rate": 1.1818181818181819e-05, "loss": 1.1045, "step": 130 }, { "epoch": 1.0052727272727273, "grad_norm": 10.72672176361084, "learning_rate": 1.2727272727272727e-05, "loss": 1.0377, "step": 140 }, { "epoch": 1.007090909090909, "grad_norm": 5.500144958496094, "learning_rate": 1.3636363636363637e-05, "loss": 0.9757, "step": 150 }, { "epoch": 1.008909090909091, "grad_norm": 10.576444625854492, "learning_rate": 1.4545454545454545e-05, "loss": 1.006, "step": 160 }, { "epoch": 1.0107272727272727, "grad_norm": 11.98294734954834, "learning_rate": 1.5454545454545454e-05, "loss": 1.0666, "step": 170 }, { "epoch": 1.0125454545454546, "grad_norm": 7.530726909637451, "learning_rate": 1.6363636363636366e-05, "loss": 1.1369, "step": 180 }, { "epoch": 1.0143636363636364, "grad_norm": 7.2719221115112305, "learning_rate": 1.7272727272727274e-05, "loss": 1.0945, "step": 190 }, { "epoch": 1.016181818181818, "grad_norm": 7.152905464172363, "learning_rate": 1.8181818181818182e-05, "loss": 0.9561, "step": 200 }, { "epoch": 1.018, "grad_norm": 7.458053112030029, "learning_rate": 1.9090909090909094e-05, "loss": 1.1724, "step": 210 }, { "epoch": 1.0198181818181817, "grad_norm": 8.600687980651855, "learning_rate": 2e-05, "loss": 1.0415, "step": 220 }, { "epoch": 1.020181818181818, "eval_accuracy": 0.5248677248677248, "eval_loss": 1.0135376453399658, "eval_runtime": 134.43, "eval_samples_per_second": 7.03, "eval_steps_per_second": 0.885, "step": 222 }, { "epoch": 2.0014545454545454, "grad_norm": 4.647419452667236, "learning_rate": 2.090909090909091e-05, "loss": 0.9994, "step": 230 }, { "epoch": 2.0032727272727273, "grad_norm": 3.433703660964966, "learning_rate": 2.1818181818181818e-05, "loss": 1.0031, "step": 240 }, { "epoch": 2.0050909090909093, "grad_norm": 8.593055725097656, "learning_rate": 2.272727272727273e-05, "loss": 1.0471, "step": 250 }, { "epoch": 2.0069090909090908, "grad_norm": 10.779848098754883, "learning_rate": 2.3636363636363637e-05, "loss": 1.015, "step": 260 }, { "epoch": 2.0087272727272727, "grad_norm": 5.282721042633057, "learning_rate": 2.4545454545454545e-05, "loss": 1.1351, "step": 270 }, { "epoch": 2.0105454545454546, "grad_norm": 7.395419597625732, "learning_rate": 2.5454545454545454e-05, "loss": 1.0316, "step": 280 }, { "epoch": 2.0123636363636366, "grad_norm": 7.214972019195557, "learning_rate": 2.636363636363636e-05, "loss": 1.0889, "step": 290 }, { "epoch": 2.014181818181818, "grad_norm": 8.676461219787598, "learning_rate": 2.7272727272727273e-05, "loss": 0.9877, "step": 300 }, { "epoch": 2.016, "grad_norm": 15.418935775756836, "learning_rate": 2.818181818181818e-05, "loss": 0.9833, "step": 310 }, { "epoch": 2.017818181818182, "grad_norm": 12.667856216430664, "learning_rate": 2.909090909090909e-05, "loss": 0.9863, "step": 320 }, { "epoch": 2.0196363636363635, "grad_norm": 9.46337890625, "learning_rate": 3e-05, "loss": 1.0271, "step": 330 }, { "epoch": 2.0201818181818183, "eval_accuracy": 0.4857142857142857, "eval_loss": 1.0630271434783936, "eval_runtime": 135.405, "eval_samples_per_second": 6.979, "eval_steps_per_second": 0.879, "step": 333 }, { "epoch": 3.001272727272727, "grad_norm": 7.892024993896484, "learning_rate": 3.090909090909091e-05, "loss": 1.0633, "step": 340 }, { "epoch": 3.003090909090909, "grad_norm": 6.610105037689209, "learning_rate": 3.181818181818182e-05, "loss": 0.9679, "step": 350 }, { "epoch": 3.004909090909091, "grad_norm": 10.291769981384277, "learning_rate": 3.272727272727273e-05, "loss": 1.0005, "step": 360 }, { "epoch": 3.006727272727273, "grad_norm": 15.271839141845703, "learning_rate": 3.3636363636363636e-05, "loss": 1.1185, "step": 370 }, { "epoch": 3.0085454545454544, "grad_norm": 4.6768879890441895, "learning_rate": 3.454545454545455e-05, "loss": 0.9465, "step": 380 }, { "epoch": 3.0103636363636364, "grad_norm": 16.15147590637207, "learning_rate": 3.545454545454546e-05, "loss": 1.0068, "step": 390 }, { "epoch": 3.0121818181818183, "grad_norm": 14.014873504638672, "learning_rate": 3.6363636363636364e-05, "loss": 1.1818, "step": 400 }, { "epoch": 3.014, "grad_norm": 7.938851356506348, "learning_rate": 3.7272727272727276e-05, "loss": 0.9357, "step": 410 }, { "epoch": 3.0158181818181817, "grad_norm": 7.2831807136535645, "learning_rate": 3.818181818181819e-05, "loss": 1.0555, "step": 420 }, { "epoch": 3.0176363636363637, "grad_norm": 14.499138832092285, "learning_rate": 3.909090909090909e-05, "loss": 1.0117, "step": 430 }, { "epoch": 3.0194545454545456, "grad_norm": 5.862173080444336, "learning_rate": 4e-05, "loss": 1.1609, "step": 440 }, { "epoch": 3.0201818181818183, "eval_accuracy": 0.4222222222222222, "eval_loss": 1.0202808380126953, "eval_runtime": 134.3297, "eval_samples_per_second": 7.035, "eval_steps_per_second": 0.886, "step": 444 }, { "epoch": 4.001090909090909, "grad_norm": 12.035215377807617, "learning_rate": 4.0909090909090915e-05, "loss": 1.065, "step": 450 }, { "epoch": 4.002909090909091, "grad_norm": 14.393060684204102, "learning_rate": 4.181818181818182e-05, "loss": 1.1474, "step": 460 }, { "epoch": 4.004727272727273, "grad_norm": 13.286993026733398, "learning_rate": 4.2727272727272724e-05, "loss": 1.1265, "step": 470 }, { "epoch": 4.006545454545455, "grad_norm": 6.933895587921143, "learning_rate": 4.3636363636363636e-05, "loss": 1.0134, "step": 480 }, { "epoch": 4.008363636363637, "grad_norm": 3.2165865898132324, "learning_rate": 4.454545454545455e-05, "loss": 1.0298, "step": 490 }, { "epoch": 4.0101818181818185, "grad_norm": 14.604901313781738, "learning_rate": 4.545454545454546e-05, "loss": 0.9201, "step": 500 }, { "epoch": 4.012, "grad_norm": 3.9145591259002686, "learning_rate": 4.636363636363636e-05, "loss": 1.1511, "step": 510 }, { "epoch": 4.0138181818181815, "grad_norm": 10.365116119384766, "learning_rate": 4.7272727272727275e-05, "loss": 1.0987, "step": 520 }, { "epoch": 4.0156363636363634, "grad_norm": 4.58921480178833, "learning_rate": 4.8181818181818186e-05, "loss": 1.0293, "step": 530 }, { "epoch": 4.017454545454545, "grad_norm": 4.619180679321289, "learning_rate": 4.909090909090909e-05, "loss": 0.9006, "step": 540 }, { "epoch": 4.019272727272727, "grad_norm": 10.984779357910156, "learning_rate": 5e-05, "loss": 0.9824, "step": 550 }, { "epoch": 4.020181818181818, "eval_accuracy": 0.5248677248677248, "eval_loss": 1.0219199657440186, "eval_runtime": 132.6767, "eval_samples_per_second": 7.123, "eval_steps_per_second": 0.897, "step": 555 }, { "epoch": 5.000909090909091, "grad_norm": 5.827091217041016, "learning_rate": 4.98989898989899e-05, "loss": 1.067, "step": 560 }, { "epoch": 5.002727272727273, "grad_norm": 12.666735649108887, "learning_rate": 4.97979797979798e-05, "loss": 1.0333, "step": 570 }, { "epoch": 5.004545454545455, "grad_norm": 10.529485702514648, "learning_rate": 4.9696969696969694e-05, "loss": 1.1087, "step": 580 }, { "epoch": 5.006363636363637, "grad_norm": 5.182685852050781, "learning_rate": 4.9595959595959594e-05, "loss": 0.9791, "step": 590 }, { "epoch": 5.008181818181818, "grad_norm": 9.57801628112793, "learning_rate": 4.94949494949495e-05, "loss": 0.9401, "step": 600 }, { "epoch": 5.01, "grad_norm": 9.004761695861816, "learning_rate": 4.93939393939394e-05, "loss": 0.9347, "step": 610 }, { "epoch": 5.011818181818182, "grad_norm": 7.18081521987915, "learning_rate": 4.92929292929293e-05, "loss": 1.0563, "step": 620 }, { "epoch": 5.013636363636364, "grad_norm": 4.530564785003662, "learning_rate": 4.919191919191919e-05, "loss": 1.1042, "step": 630 }, { "epoch": 5.015454545454546, "grad_norm": 8.461360931396484, "learning_rate": 4.909090909090909e-05, "loss": 1.073, "step": 640 }, { "epoch": 5.0172727272727276, "grad_norm": 3.762699842453003, "learning_rate": 4.898989898989899e-05, "loss": 1.0176, "step": 650 }, { "epoch": 5.0190909090909095, "grad_norm": 5.622332572937012, "learning_rate": 4.888888888888889e-05, "loss": 1.0247, "step": 660 }, { "epoch": 5.020181818181818, "eval_accuracy": 0.5026455026455027, "eval_loss": 1.020993947982788, "eval_runtime": 132.4215, "eval_samples_per_second": 7.136, "eval_steps_per_second": 0.899, "step": 666 }, { "epoch": 6.000727272727273, "grad_norm": 10.156366348266602, "learning_rate": 4.878787878787879e-05, "loss": 0.9959, "step": 670 }, { "epoch": 6.002545454545454, "grad_norm": 7.045960426330566, "learning_rate": 4.868686868686869e-05, "loss": 0.9444, "step": 680 }, { "epoch": 6.004363636363636, "grad_norm": 16.915678024291992, "learning_rate": 4.858585858585859e-05, "loss": 0.9367, "step": 690 }, { "epoch": 6.006181818181818, "grad_norm": 6.158443927764893, "learning_rate": 4.848484848484849e-05, "loss": 1.05, "step": 700 }, { "epoch": 6.008, "grad_norm": 13.542768478393555, "learning_rate": 4.838383838383839e-05, "loss": 0.9787, "step": 710 }, { "epoch": 6.009818181818182, "grad_norm": 5.779530048370361, "learning_rate": 4.828282828282829e-05, "loss": 0.9801, "step": 720 }, { "epoch": 6.011636363636364, "grad_norm": 5.2666192054748535, "learning_rate": 4.8181818181818186e-05, "loss": 1.0266, "step": 730 }, { "epoch": 6.013454545454546, "grad_norm": 5.806363105773926, "learning_rate": 4.808080808080808e-05, "loss": 0.9121, "step": 740 }, { "epoch": 6.015272727272727, "grad_norm": 7.450347423553467, "learning_rate": 4.797979797979798e-05, "loss": 1.079, "step": 750 }, { "epoch": 6.017090909090909, "grad_norm": 8.759308815002441, "learning_rate": 4.787878787878788e-05, "loss": 0.8933, "step": 760 }, { "epoch": 6.018909090909091, "grad_norm": 7.260105133056641, "learning_rate": 4.7777777777777784e-05, "loss": 1.0824, "step": 770 }, { "epoch": 6.020181818181818, "eval_accuracy": 0.47195767195767196, "eval_loss": 0.9947441220283508, "eval_runtime": 134.8662, "eval_samples_per_second": 7.007, "eval_steps_per_second": 0.882, "step": 777 }, { "epoch": 7.000545454545454, "grad_norm": 7.011205196380615, "learning_rate": 4.7676767676767684e-05, "loss": 1.0872, "step": 780 }, { "epoch": 7.002363636363636, "grad_norm": 5.094061851501465, "learning_rate": 4.7575757575757576e-05, "loss": 0.8856, "step": 790 }, { "epoch": 7.004181818181818, "grad_norm": 8.960957527160645, "learning_rate": 4.7474747474747476e-05, "loss": 0.8666, "step": 800 }, { "epoch": 7.006, "grad_norm": 7.651601314544678, "learning_rate": 4.7373737373737375e-05, "loss": 1.1694, "step": 810 }, { "epoch": 7.007818181818182, "grad_norm": 14.146735191345215, "learning_rate": 4.7272727272727275e-05, "loss": 1.05, "step": 820 }, { "epoch": 7.009636363636363, "grad_norm": 14.381610870361328, "learning_rate": 4.7171717171717174e-05, "loss": 0.9629, "step": 830 }, { "epoch": 7.011454545454545, "grad_norm": 10.896899223327637, "learning_rate": 4.7070707070707074e-05, "loss": 0.9518, "step": 840 }, { "epoch": 7.013272727272727, "grad_norm": 2.556317090988159, "learning_rate": 4.696969696969697e-05, "loss": 1.0336, "step": 850 }, { "epoch": 7.015090909090909, "grad_norm": 8.311176300048828, "learning_rate": 4.686868686868687e-05, "loss": 1.0507, "step": 860 }, { "epoch": 7.016909090909091, "grad_norm": 12.525547981262207, "learning_rate": 4.676767676767677e-05, "loss": 0.8741, "step": 870 }, { "epoch": 7.018727272727273, "grad_norm": 8.925640106201172, "learning_rate": 4.666666666666667e-05, "loss": 0.943, "step": 880 }, { "epoch": 7.020181818181818, "eval_accuracy": 0.4380952380952381, "eval_loss": 1.1085319519042969, "eval_runtime": 134.7633, "eval_samples_per_second": 7.012, "eval_steps_per_second": 0.883, "step": 888 }, { "epoch": 8.000363636363636, "grad_norm": 7.20665168762207, "learning_rate": 4.656565656565657e-05, "loss": 1.0359, "step": 890 }, { "epoch": 8.002181818181818, "grad_norm": 8.184736251831055, "learning_rate": 4.6464646464646464e-05, "loss": 0.9215, "step": 900 }, { "epoch": 8.004, "grad_norm": 6.529689788818359, "learning_rate": 4.636363636363636e-05, "loss": 1.1004, "step": 910 }, { "epoch": 8.005818181818182, "grad_norm": 9.363311767578125, "learning_rate": 4.626262626262626e-05, "loss": 0.9776, "step": 920 }, { "epoch": 8.007636363636363, "grad_norm": 7.587140083312988, "learning_rate": 4.616161616161616e-05, "loss": 0.9948, "step": 930 }, { "epoch": 8.009454545454545, "grad_norm": 3.559080123901367, "learning_rate": 4.606060606060607e-05, "loss": 1.1077, "step": 940 }, { "epoch": 8.011272727272727, "grad_norm": 6.365655422210693, "learning_rate": 4.595959595959596e-05, "loss": 0.9794, "step": 950 }, { "epoch": 8.01309090909091, "grad_norm": 6.0974040031433105, "learning_rate": 4.585858585858586e-05, "loss": 1.0501, "step": 960 }, { "epoch": 8.014909090909091, "grad_norm": 9.754732131958008, "learning_rate": 4.575757575757576e-05, "loss": 0.9277, "step": 970 }, { "epoch": 8.016727272727273, "grad_norm": 6.327628135681152, "learning_rate": 4.565656565656566e-05, "loss": 0.9875, "step": 980 }, { "epoch": 8.018545454545455, "grad_norm": 5.392932415008545, "learning_rate": 4.555555555555556e-05, "loss": 0.8807, "step": 990 }, { "epoch": 8.020181818181818, "eval_accuracy": 0.5767195767195767, "eval_loss": 0.9345303773880005, "eval_runtime": 135.6213, "eval_samples_per_second": 6.968, "eval_steps_per_second": 0.877, "step": 999 }, { "epoch": 9.000181818181819, "grad_norm": 6.117162704467773, "learning_rate": 4.545454545454546e-05, "loss": 1.0491, "step": 1000 }, { "epoch": 9.002, "grad_norm": 6.519684791564941, "learning_rate": 4.535353535353535e-05, "loss": 0.927, "step": 1010 }, { "epoch": 9.003818181818183, "grad_norm": 3.578704595565796, "learning_rate": 4.525252525252526e-05, "loss": 1.0444, "step": 1020 }, { "epoch": 9.005636363636363, "grad_norm": 3.731860637664795, "learning_rate": 4.515151515151516e-05, "loss": 1.0397, "step": 1030 }, { "epoch": 9.007454545454545, "grad_norm": 6.1547064781188965, "learning_rate": 4.5050505050505056e-05, "loss": 0.9507, "step": 1040 }, { "epoch": 9.009272727272727, "grad_norm": 6.028125286102295, "learning_rate": 4.494949494949495e-05, "loss": 0.8981, "step": 1050 }, { "epoch": 9.011090909090909, "grad_norm": 5.517436504364014, "learning_rate": 4.484848484848485e-05, "loss": 0.9094, "step": 1060 }, { "epoch": 9.01290909090909, "grad_norm": 3.5511577129364014, "learning_rate": 4.474747474747475e-05, "loss": 1.0594, "step": 1070 }, { "epoch": 9.014727272727272, "grad_norm": 3.742021322250366, "learning_rate": 4.464646464646465e-05, "loss": 0.9379, "step": 1080 }, { "epoch": 9.016545454545454, "grad_norm": 13.917617797851562, "learning_rate": 4.454545454545455e-05, "loss": 1.1228, "step": 1090 }, { "epoch": 9.018363636363636, "grad_norm": 4.444940090179443, "learning_rate": 4.4444444444444447e-05, "loss": 0.9285, "step": 1100 }, { "epoch": 9.020181818181818, "grad_norm": 20.118894577026367, "learning_rate": 4.4343434343434346e-05, "loss": 1.1009, "step": 1110 }, { "epoch": 9.020181818181818, "eval_accuracy": 0.5164021164021164, "eval_loss": 0.9854611158370972, "eval_runtime": 133.8494, "eval_samples_per_second": 7.06, "eval_steps_per_second": 0.889, "step": 1110 }, { "epoch": 10.001818181818182, "grad_norm": 4.6963090896606445, "learning_rate": 4.4242424242424246e-05, "loss": 1.053, "step": 1120 }, { "epoch": 10.003636363636364, "grad_norm": 7.05235481262207, "learning_rate": 4.4141414141414145e-05, "loss": 1.0504, "step": 1130 }, { "epoch": 10.005454545454546, "grad_norm": 6.721865177154541, "learning_rate": 4.4040404040404044e-05, "loss": 1.0168, "step": 1140 }, { "epoch": 10.007272727272728, "grad_norm": 2.274195909500122, "learning_rate": 4.3939393939393944e-05, "loss": 0.8676, "step": 1150 }, { "epoch": 10.00909090909091, "grad_norm": 3.7177436351776123, "learning_rate": 4.383838383838384e-05, "loss": 1.0794, "step": 1160 }, { "epoch": 10.010909090909092, "grad_norm": 4.672595024108887, "learning_rate": 4.3737373737373736e-05, "loss": 0.9912, "step": 1170 }, { "epoch": 10.012727272727274, "grad_norm": 8.199527740478516, "learning_rate": 4.3636363636363636e-05, "loss": 0.9819, "step": 1180 }, { "epoch": 10.014545454545454, "grad_norm": 8.785588264465332, "learning_rate": 4.3535353535353535e-05, "loss": 0.9493, "step": 1190 }, { "epoch": 10.016363636363636, "grad_norm": 5.248023509979248, "learning_rate": 4.343434343434344e-05, "loss": 0.8966, "step": 1200 }, { "epoch": 10.018181818181818, "grad_norm": 8.876505851745605, "learning_rate": 4.3333333333333334e-05, "loss": 0.9195, "step": 1210 }, { "epoch": 10.02, "grad_norm": 9.349332809448242, "learning_rate": 4.3232323232323234e-05, "loss": 1.0292, "step": 1220 }, { "epoch": 10.020181818181818, "eval_accuracy": 0.43386243386243384, "eval_loss": 1.0506019592285156, "eval_runtime": 134.4694, "eval_samples_per_second": 7.028, "eval_steps_per_second": 0.885, "step": 1221 }, { "epoch": 11.001636363636363, "grad_norm": 2.947324752807617, "learning_rate": 4.313131313131313e-05, "loss": 1.033, "step": 1230 }, { "epoch": 11.003454545454545, "grad_norm": 5.121668815612793, "learning_rate": 4.303030303030303e-05, "loss": 0.941, "step": 1240 }, { "epoch": 11.005272727272727, "grad_norm": 9.943852424621582, "learning_rate": 4.292929292929293e-05, "loss": 0.8642, "step": 1250 }, { "epoch": 11.007090909090909, "grad_norm": 9.663615226745605, "learning_rate": 4.282828282828283e-05, "loss": 0.903, "step": 1260 }, { "epoch": 11.008909090909091, "grad_norm": 6.816159248352051, "learning_rate": 4.2727272727272724e-05, "loss": 1.0538, "step": 1270 }, { "epoch": 11.010727272727273, "grad_norm": 4.542863845825195, "learning_rate": 4.262626262626263e-05, "loss": 0.9424, "step": 1280 }, { "epoch": 11.012545454545455, "grad_norm": 6.401607036590576, "learning_rate": 4.252525252525253e-05, "loss": 0.9769, "step": 1290 }, { "epoch": 11.014363636363637, "grad_norm": 4.389651775360107, "learning_rate": 4.242424242424243e-05, "loss": 0.9633, "step": 1300 }, { "epoch": 11.016181818181819, "grad_norm": 3.288135051727295, "learning_rate": 4.232323232323233e-05, "loss": 1.1248, "step": 1310 }, { "epoch": 11.018, "grad_norm": 6.90913200378418, "learning_rate": 4.222222222222222e-05, "loss": 0.9916, "step": 1320 }, { "epoch": 11.019818181818183, "grad_norm": 7.67244815826416, "learning_rate": 4.212121212121212e-05, "loss": 0.9071, "step": 1330 }, { "epoch": 11.020181818181818, "eval_accuracy": 0.5142857142857142, "eval_loss": 0.9925752878189087, "eval_runtime": 137.6148, "eval_samples_per_second": 6.867, "eval_steps_per_second": 0.865, "step": 1332 }, { "epoch": 12.001454545454546, "grad_norm": 9.924761772155762, "learning_rate": 4.202020202020202e-05, "loss": 0.9088, "step": 1340 }, { "epoch": 12.003272727272726, "grad_norm": 6.308601379394531, "learning_rate": 4.191919191919192e-05, "loss": 1.0805, "step": 1350 }, { "epoch": 12.005090909090908, "grad_norm": 3.287738561630249, "learning_rate": 4.181818181818182e-05, "loss": 0.897, "step": 1360 }, { "epoch": 12.00690909090909, "grad_norm": 11.629283905029297, "learning_rate": 4.171717171717172e-05, "loss": 0.9188, "step": 1370 }, { "epoch": 12.008727272727272, "grad_norm": 15.360862731933594, "learning_rate": 4.161616161616162e-05, "loss": 0.9509, "step": 1380 }, { "epoch": 12.010545454545454, "grad_norm": 3.7027997970581055, "learning_rate": 4.151515151515152e-05, "loss": 0.9146, "step": 1390 }, { "epoch": 12.012363636363636, "grad_norm": 2.7823915481567383, "learning_rate": 4.141414141414142e-05, "loss": 0.83, "step": 1400 }, { "epoch": 12.014181818181818, "grad_norm": 11.256366729736328, "learning_rate": 4.131313131313132e-05, "loss": 0.963, "step": 1410 }, { "epoch": 12.016, "grad_norm": 8.553262710571289, "learning_rate": 4.1212121212121216e-05, "loss": 1.0025, "step": 1420 }, { "epoch": 12.017818181818182, "grad_norm": 10.009743690490723, "learning_rate": 4.111111111111111e-05, "loss": 0.9265, "step": 1430 }, { "epoch": 12.019636363636364, "grad_norm": 14.513312339782715, "learning_rate": 4.101010101010101e-05, "loss": 1.0001, "step": 1440 }, { "epoch": 12.020181818181818, "eval_accuracy": 0.4931216931216931, "eval_loss": 1.0406430959701538, "eval_runtime": 135.3406, "eval_samples_per_second": 6.982, "eval_steps_per_second": 0.879, "step": 1443 }, { "epoch": 13.001272727272728, "grad_norm": 10.787238121032715, "learning_rate": 4.0909090909090915e-05, "loss": 0.9537, "step": 1450 }, { "epoch": 13.00309090909091, "grad_norm": 6.097622394561768, "learning_rate": 4.0808080808080814e-05, "loss": 0.9546, "step": 1460 }, { "epoch": 13.004909090909091, "grad_norm": 3.8867690563201904, "learning_rate": 4.070707070707071e-05, "loss": 0.9046, "step": 1470 }, { "epoch": 13.006727272727273, "grad_norm": 3.548370599746704, "learning_rate": 4.0606060606060606e-05, "loss": 1.0842, "step": 1480 }, { "epoch": 13.008545454545455, "grad_norm": 3.0282113552093506, "learning_rate": 4.0505050505050506e-05, "loss": 1.0189, "step": 1490 }, { "epoch": 13.010363636363637, "grad_norm": 5.710118293762207, "learning_rate": 4.0404040404040405e-05, "loss": 0.9354, "step": 1500 }, { "epoch": 13.012181818181817, "grad_norm": 8.055477142333984, "learning_rate": 4.0303030303030305e-05, "loss": 0.9242, "step": 1510 }, { "epoch": 13.014, "grad_norm": 2.122373342514038, "learning_rate": 4.0202020202020204e-05, "loss": 0.8736, "step": 1520 }, { "epoch": 13.015818181818181, "grad_norm": 3.9817798137664795, "learning_rate": 4.01010101010101e-05, "loss": 0.9973, "step": 1530 }, { "epoch": 13.017636363636363, "grad_norm": 9.886878967285156, "learning_rate": 4e-05, "loss": 1.0104, "step": 1540 }, { "epoch": 13.019454545454545, "grad_norm": 3.8867106437683105, "learning_rate": 3.98989898989899e-05, "loss": 0.9698, "step": 1550 }, { "epoch": 13.020181818181818, "eval_accuracy": 0.5597883597883598, "eval_loss": 0.9440351128578186, "eval_runtime": 135.8312, "eval_samples_per_second": 6.957, "eval_steps_per_second": 0.876, "step": 1554 }, { "epoch": 14.001090909090909, "grad_norm": 5.418400764465332, "learning_rate": 3.97979797979798e-05, "loss": 1.0339, "step": 1560 }, { "epoch": 14.00290909090909, "grad_norm": 3.685300827026367, "learning_rate": 3.96969696969697e-05, "loss": 0.9178, "step": 1570 }, { "epoch": 14.004727272727273, "grad_norm": 8.146584510803223, "learning_rate": 3.9595959595959594e-05, "loss": 0.9786, "step": 1580 }, { "epoch": 14.006545454545455, "grad_norm": 10.083425521850586, "learning_rate": 3.9494949494949494e-05, "loss": 0.9445, "step": 1590 }, { "epoch": 14.008363636363637, "grad_norm": 11.22440242767334, "learning_rate": 3.939393939393939e-05, "loss": 1.0185, "step": 1600 }, { "epoch": 14.010181818181819, "grad_norm": 9.001585960388184, "learning_rate": 3.929292929292929e-05, "loss": 1.0519, "step": 1610 }, { "epoch": 14.012, "grad_norm": 6.203736782073975, "learning_rate": 3.91919191919192e-05, "loss": 0.9379, "step": 1620 }, { "epoch": 14.013818181818182, "grad_norm": 9.380956649780273, "learning_rate": 3.909090909090909e-05, "loss": 0.9295, "step": 1630 }, { "epoch": 14.015636363636364, "grad_norm": 7.258037567138672, "learning_rate": 3.898989898989899e-05, "loss": 0.993, "step": 1640 }, { "epoch": 14.017454545454546, "grad_norm": 4.416772842407227, "learning_rate": 3.888888888888889e-05, "loss": 0.9866, "step": 1650 }, { "epoch": 14.019272727272726, "grad_norm": 5.408873081207275, "learning_rate": 3.878787878787879e-05, "loss": 0.9405, "step": 1660 }, { "epoch": 14.020181818181818, "eval_accuracy": 0.5322751322751322, "eval_loss": 0.9666908979415894, "eval_runtime": 136.0788, "eval_samples_per_second": 6.945, "eval_steps_per_second": 0.874, "step": 1665 }, { "epoch": 15.00090909090909, "grad_norm": 6.522909164428711, "learning_rate": 3.868686868686869e-05, "loss": 1.0054, "step": 1670 }, { "epoch": 15.002727272727272, "grad_norm": 4.72935676574707, "learning_rate": 3.858585858585859e-05, "loss": 0.888, "step": 1680 }, { "epoch": 15.004545454545454, "grad_norm": 11.134079933166504, "learning_rate": 3.848484848484848e-05, "loss": 1.0401, "step": 1690 }, { "epoch": 15.006363636363636, "grad_norm": 3.3430612087249756, "learning_rate": 3.838383838383838e-05, "loss": 0.8786, "step": 1700 }, { "epoch": 15.008181818181818, "grad_norm": 3.8444104194641113, "learning_rate": 3.828282828282829e-05, "loss": 0.8552, "step": 1710 }, { "epoch": 15.01, "grad_norm": 8.576009750366211, "learning_rate": 3.818181818181819e-05, "loss": 0.865, "step": 1720 }, { "epoch": 15.011818181818182, "grad_norm": 11.537275314331055, "learning_rate": 3.8080808080808087e-05, "loss": 0.9144, "step": 1730 }, { "epoch": 15.013636363636364, "grad_norm": 2.3551039695739746, "learning_rate": 3.797979797979798e-05, "loss": 0.8887, "step": 1740 }, { "epoch": 15.015454545454546, "grad_norm": 5.525041103363037, "learning_rate": 3.787878787878788e-05, "loss": 0.842, "step": 1750 }, { "epoch": 15.017272727272728, "grad_norm": 7.051702499389648, "learning_rate": 3.777777777777778e-05, "loss": 0.9548, "step": 1760 }, { "epoch": 15.01909090909091, "grad_norm": 7.506717681884766, "learning_rate": 3.767676767676768e-05, "loss": 0.8802, "step": 1770 }, { "epoch": 15.020181818181818, "eval_accuracy": 0.5862433862433862, "eval_loss": 0.9010878205299377, "eval_runtime": 132.3568, "eval_samples_per_second": 7.14, "eval_steps_per_second": 0.899, "step": 1776 }, { "epoch": 16.00072727272727, "grad_norm": 2.738687515258789, "learning_rate": 3.757575757575758e-05, "loss": 1.0482, "step": 1780 }, { "epoch": 16.002545454545455, "grad_norm": 8.474298477172852, "learning_rate": 3.747474747474748e-05, "loss": 0.9304, "step": 1790 }, { "epoch": 16.004363636363635, "grad_norm": 5.706143379211426, "learning_rate": 3.7373737373737376e-05, "loss": 0.9842, "step": 1800 }, { "epoch": 16.00618181818182, "grad_norm": 10.396550178527832, "learning_rate": 3.7272727272727276e-05, "loss": 1.125, "step": 1810 }, { "epoch": 16.008, "grad_norm": 7.308793544769287, "learning_rate": 3.7171717171717175e-05, "loss": 0.9996, "step": 1820 }, { "epoch": 16.009818181818183, "grad_norm": 3.592374563217163, "learning_rate": 3.7070707070707075e-05, "loss": 0.9729, "step": 1830 }, { "epoch": 16.011636363636363, "grad_norm": 8.693289756774902, "learning_rate": 3.6969696969696974e-05, "loss": 1.0341, "step": 1840 }, { "epoch": 16.013454545454547, "grad_norm": 5.157698154449463, "learning_rate": 3.686868686868687e-05, "loss": 0.9377, "step": 1850 }, { "epoch": 16.015272727272727, "grad_norm": 7.736370086669922, "learning_rate": 3.6767676767676766e-05, "loss": 0.9455, "step": 1860 }, { "epoch": 16.01709090909091, "grad_norm": 6.4704694747924805, "learning_rate": 3.6666666666666666e-05, "loss": 0.9546, "step": 1870 }, { "epoch": 16.01890909090909, "grad_norm": 7.747092247009277, "learning_rate": 3.656565656565657e-05, "loss": 0.9154, "step": 1880 }, { "epoch": 16.02018181818182, "eval_accuracy": 0.5597883597883598, "eval_loss": 0.942902684211731, "eval_runtime": 130.5549, "eval_samples_per_second": 7.238, "eval_steps_per_second": 0.911, "step": 1887 }, { "epoch": 17.000545454545456, "grad_norm": 8.326998710632324, "learning_rate": 3.6464646464646465e-05, "loss": 0.9206, "step": 1890 }, { "epoch": 17.002363636363636, "grad_norm": 9.486376762390137, "learning_rate": 3.6363636363636364e-05, "loss": 0.8327, "step": 1900 }, { "epoch": 17.004181818181817, "grad_norm": 9.206704139709473, "learning_rate": 3.6262626262626264e-05, "loss": 1.0631, "step": 1910 }, { "epoch": 17.006, "grad_norm": 9.73511791229248, "learning_rate": 3.616161616161616e-05, "loss": 1.0724, "step": 1920 }, { "epoch": 17.00781818181818, "grad_norm": 6.431131839752197, "learning_rate": 3.606060606060606e-05, "loss": 0.9529, "step": 1930 }, { "epoch": 17.009636363636364, "grad_norm": 4.713401794433594, "learning_rate": 3.595959595959596e-05, "loss": 0.9926, "step": 1940 }, { "epoch": 17.011454545454544, "grad_norm": 7.5310282707214355, "learning_rate": 3.5858585858585855e-05, "loss": 0.9691, "step": 1950 }, { "epoch": 17.013272727272728, "grad_norm": 5.700111389160156, "learning_rate": 3.575757575757576e-05, "loss": 0.9538, "step": 1960 }, { "epoch": 17.015090909090908, "grad_norm": 4.151130676269531, "learning_rate": 3.565656565656566e-05, "loss": 0.8932, "step": 1970 }, { "epoch": 17.016909090909092, "grad_norm": 3.5625922679901123, "learning_rate": 3.555555555555556e-05, "loss": 1.0085, "step": 1980 }, { "epoch": 17.018727272727272, "grad_norm": 5.718069553375244, "learning_rate": 3.545454545454546e-05, "loss": 0.929, "step": 1990 }, { "epoch": 17.02018181818182, "eval_accuracy": 0.5132275132275133, "eval_loss": 0.9948244690895081, "eval_runtime": 131.1914, "eval_samples_per_second": 7.203, "eval_steps_per_second": 0.907, "step": 1998 }, { "epoch": 18.000363636363637, "grad_norm": 2.451860189437866, "learning_rate": 3.535353535353535e-05, "loss": 0.9883, "step": 2000 }, { "epoch": 18.002181818181818, "grad_norm": 10.55557918548584, "learning_rate": 3.525252525252525e-05, "loss": 0.9586, "step": 2010 }, { "epoch": 18.004, "grad_norm": 3.5876386165618896, "learning_rate": 3.515151515151515e-05, "loss": 0.9265, "step": 2020 }, { "epoch": 18.00581818181818, "grad_norm": 4.748887538909912, "learning_rate": 3.505050505050505e-05, "loss": 1.021, "step": 2030 }, { "epoch": 18.007636363636365, "grad_norm": 6.446643352508545, "learning_rate": 3.494949494949495e-05, "loss": 0.8324, "step": 2040 }, { "epoch": 18.009454545454545, "grad_norm": 5.21691370010376, "learning_rate": 3.484848484848485e-05, "loss": 0.8463, "step": 2050 }, { "epoch": 18.011272727272726, "grad_norm": 6.048694133758545, "learning_rate": 3.474747474747475e-05, "loss": 0.8215, "step": 2060 }, { "epoch": 18.01309090909091, "grad_norm": 7.23938512802124, "learning_rate": 3.464646464646465e-05, "loss": 0.937, "step": 2070 }, { "epoch": 18.01490909090909, "grad_norm": 7.888938903808594, "learning_rate": 3.454545454545455e-05, "loss": 0.8632, "step": 2080 }, { "epoch": 18.016727272727273, "grad_norm": 5.132527828216553, "learning_rate": 3.444444444444445e-05, "loss": 1.0232, "step": 2090 }, { "epoch": 18.018545454545453, "grad_norm": 3.099971294403076, "learning_rate": 3.434343434343435e-05, "loss": 0.9112, "step": 2100 }, { "epoch": 18.02018181818182, "eval_accuracy": 0.5851851851851851, "eval_loss": 0.9056147933006287, "eval_runtime": 132.4218, "eval_samples_per_second": 7.136, "eval_steps_per_second": 0.899, "step": 2109 }, { "epoch": 19.00018181818182, "grad_norm": 6.3844218254089355, "learning_rate": 3.424242424242424e-05, "loss": 0.9953, "step": 2110 }, { "epoch": 19.002, "grad_norm": 5.968114852905273, "learning_rate": 3.414141414141414e-05, "loss": 0.9817, "step": 2120 }, { "epoch": 19.003818181818183, "grad_norm": 3.901590585708618, "learning_rate": 3.4040404040404045e-05, "loss": 0.9323, "step": 2130 }, { "epoch": 19.005636363636363, "grad_norm": 7.752019882202148, "learning_rate": 3.3939393939393945e-05, "loss": 0.9498, "step": 2140 }, { "epoch": 19.007454545454546, "grad_norm": 4.7573771476745605, "learning_rate": 3.3838383838383844e-05, "loss": 1.1847, "step": 2150 }, { "epoch": 19.009272727272727, "grad_norm": 7.086313724517822, "learning_rate": 3.373737373737374e-05, "loss": 0.9449, "step": 2160 }, { "epoch": 19.01109090909091, "grad_norm": 7.621354579925537, "learning_rate": 3.3636363636363636e-05, "loss": 0.9435, "step": 2170 }, { "epoch": 19.01290909090909, "grad_norm": 3.54822039604187, "learning_rate": 3.3535353535353536e-05, "loss": 0.9775, "step": 2180 }, { "epoch": 19.014727272727274, "grad_norm": 5.012094497680664, "learning_rate": 3.3434343434343435e-05, "loss": 0.9856, "step": 2190 }, { "epoch": 19.016545454545454, "grad_norm": 9.499866485595703, "learning_rate": 3.3333333333333335e-05, "loss": 0.9401, "step": 2200 }, { "epoch": 19.018363636363638, "grad_norm": 5.7213873863220215, "learning_rate": 3.3232323232323234e-05, "loss": 0.8776, "step": 2210 }, { "epoch": 19.02018181818182, "grad_norm": 11.805944442749023, "learning_rate": 3.3131313131313134e-05, "loss": 0.9202, "step": 2220 }, { "epoch": 19.02018181818182, "eval_accuracy": 0.5523809523809524, "eval_loss": 0.9488540887832642, "eval_runtime": 132.4819, "eval_samples_per_second": 7.133, "eval_steps_per_second": 0.898, "step": 2220 }, { "epoch": 20.00181818181818, "grad_norm": 3.7236974239349365, "learning_rate": 3.303030303030303e-05, "loss": 0.9245, "step": 2230 }, { "epoch": 20.003636363636364, "grad_norm": 6.91562032699585, "learning_rate": 3.292929292929293e-05, "loss": 0.9469, "step": 2240 }, { "epoch": 20.005454545454544, "grad_norm": 4.592532157897949, "learning_rate": 3.282828282828283e-05, "loss": 0.9701, "step": 2250 }, { "epoch": 20.007272727272728, "grad_norm": 9.3302001953125, "learning_rate": 3.272727272727273e-05, "loss": 0.7942, "step": 2260 }, { "epoch": 20.009090909090908, "grad_norm": 8.965353012084961, "learning_rate": 3.2626262626262624e-05, "loss": 0.9704, "step": 2270 }, { "epoch": 20.01090909090909, "grad_norm": 4.075799942016602, "learning_rate": 3.2525252525252524e-05, "loss": 1.0357, "step": 2280 }, { "epoch": 20.012727272727272, "grad_norm": 4.786118984222412, "learning_rate": 3.2424242424242423e-05, "loss": 0.9595, "step": 2290 }, { "epoch": 20.014545454545456, "grad_norm": 6.418935298919678, "learning_rate": 3.232323232323233e-05, "loss": 0.9382, "step": 2300 }, { "epoch": 20.016363636363636, "grad_norm": 5.954792022705078, "learning_rate": 3.222222222222223e-05, "loss": 0.9991, "step": 2310 }, { "epoch": 20.01818181818182, "grad_norm": 6.49984884262085, "learning_rate": 3.212121212121212e-05, "loss": 0.9113, "step": 2320 }, { "epoch": 20.02, "grad_norm": 7.011244773864746, "learning_rate": 3.202020202020202e-05, "loss": 0.9004, "step": 2330 }, { "epoch": 20.02018181818182, "eval_accuracy": 0.582010582010582, "eval_loss": 0.8995028734207153, "eval_runtime": 133.4171, "eval_samples_per_second": 7.083, "eval_steps_per_second": 0.892, "step": 2331 }, { "epoch": 21.001636363636365, "grad_norm": 4.065828800201416, "learning_rate": 3.191919191919192e-05, "loss": 0.9496, "step": 2340 }, { "epoch": 21.003454545454545, "grad_norm": 8.006439208984375, "learning_rate": 3.181818181818182e-05, "loss": 0.9348, "step": 2350 }, { "epoch": 21.00527272727273, "grad_norm": 3.660148859024048, "learning_rate": 3.171717171717172e-05, "loss": 0.7659, "step": 2360 }, { "epoch": 21.00709090909091, "grad_norm": 6.901710033416748, "learning_rate": 3.161616161616161e-05, "loss": 0.928, "step": 2370 }, { "epoch": 21.00890909090909, "grad_norm": 4.300201892852783, "learning_rate": 3.151515151515151e-05, "loss": 0.8376, "step": 2380 }, { "epoch": 21.010727272727273, "grad_norm": 7.4214768409729, "learning_rate": 3.141414141414142e-05, "loss": 0.8961, "step": 2390 }, { "epoch": 21.012545454545453, "grad_norm": 7.769872188568115, "learning_rate": 3.131313131313132e-05, "loss": 0.9421, "step": 2400 }, { "epoch": 21.014363636363637, "grad_norm": 6.112030029296875, "learning_rate": 3.121212121212122e-05, "loss": 1.003, "step": 2410 }, { "epoch": 21.016181818181817, "grad_norm": 2.415757656097412, "learning_rate": 3.111111111111111e-05, "loss": 0.8616, "step": 2420 }, { "epoch": 21.018, "grad_norm": 7.6159772872924805, "learning_rate": 3.101010101010101e-05, "loss": 1.0072, "step": 2430 }, { "epoch": 21.01981818181818, "grad_norm": 3.9019908905029297, "learning_rate": 3.090909090909091e-05, "loss": 0.9318, "step": 2440 }, { "epoch": 21.02018181818182, "eval_accuracy": 0.5957671957671957, "eval_loss": 0.9031988978385925, "eval_runtime": 133.1845, "eval_samples_per_second": 7.095, "eval_steps_per_second": 0.893, "step": 2442 }, { "epoch": 22.001454545454546, "grad_norm": 5.528774261474609, "learning_rate": 3.080808080808081e-05, "loss": 0.8702, "step": 2450 }, { "epoch": 22.003272727272726, "grad_norm": 4.2135467529296875, "learning_rate": 3.070707070707071e-05, "loss": 0.8018, "step": 2460 }, { "epoch": 22.00509090909091, "grad_norm": 5.549806118011475, "learning_rate": 3.060606060606061e-05, "loss": 0.865, "step": 2470 }, { "epoch": 22.00690909090909, "grad_norm": 8.080962181091309, "learning_rate": 3.050505050505051e-05, "loss": 0.9013, "step": 2480 }, { "epoch": 22.008727272727274, "grad_norm": 6.765378475189209, "learning_rate": 3.0404040404040406e-05, "loss": 0.7961, "step": 2490 }, { "epoch": 22.010545454545454, "grad_norm": 6.4678874015808105, "learning_rate": 3.0303030303030306e-05, "loss": 0.9707, "step": 2500 }, { "epoch": 22.012363636363638, "grad_norm": 6.30830717086792, "learning_rate": 3.0202020202020205e-05, "loss": 0.9456, "step": 2510 }, { "epoch": 22.014181818181818, "grad_norm": 7.281170845031738, "learning_rate": 3.01010101010101e-05, "loss": 0.9573, "step": 2520 }, { "epoch": 22.016, "grad_norm": 2.3276336193084717, "learning_rate": 3e-05, "loss": 0.9226, "step": 2530 }, { "epoch": 22.017818181818182, "grad_norm": 10.788141250610352, "learning_rate": 2.98989898989899e-05, "loss": 0.949, "step": 2540 }, { "epoch": 22.019636363636362, "grad_norm": 12.814167022705078, "learning_rate": 2.9797979797979796e-05, "loss": 0.8493, "step": 2550 }, { "epoch": 22.02018181818182, "eval_accuracy": 0.5238095238095238, "eval_loss": 0.997473418712616, "eval_runtime": 134.17, "eval_samples_per_second": 7.043, "eval_steps_per_second": 0.887, "step": 2553 }, { "epoch": 23.001272727272728, "grad_norm": 7.309125900268555, "learning_rate": 2.96969696969697e-05, "loss": 1.0028, "step": 2560 }, { "epoch": 23.003090909090908, "grad_norm": 3.8795406818389893, "learning_rate": 2.95959595959596e-05, "loss": 0.9076, "step": 2570 }, { "epoch": 23.00490909090909, "grad_norm": 11.91686725616455, "learning_rate": 2.9494949494949498e-05, "loss": 0.9604, "step": 2580 }, { "epoch": 23.00672727272727, "grad_norm": 3.4642341136932373, "learning_rate": 2.9393939393939394e-05, "loss": 0.9178, "step": 2590 }, { "epoch": 23.008545454545455, "grad_norm": 6.674759864807129, "learning_rate": 2.9292929292929294e-05, "loss": 1.0162, "step": 2600 }, { "epoch": 23.010363636363635, "grad_norm": 5.000521659851074, "learning_rate": 2.9191919191919193e-05, "loss": 0.9169, "step": 2610 }, { "epoch": 23.01218181818182, "grad_norm": 4.165314197540283, "learning_rate": 2.909090909090909e-05, "loss": 0.9068, "step": 2620 }, { "epoch": 23.014, "grad_norm": 2.4558990001678467, "learning_rate": 2.898989898989899e-05, "loss": 0.9802, "step": 2630 }, { "epoch": 23.015818181818183, "grad_norm": 4.509535312652588, "learning_rate": 2.8888888888888888e-05, "loss": 1.0425, "step": 2640 }, { "epoch": 23.017636363636363, "grad_norm": 4.437975883483887, "learning_rate": 2.878787878787879e-05, "loss": 0.7647, "step": 2650 }, { "epoch": 23.019454545454547, "grad_norm": 4.480066299438477, "learning_rate": 2.868686868686869e-05, "loss": 0.8587, "step": 2660 }, { "epoch": 23.02018181818182, "eval_accuracy": 0.5259259259259259, "eval_loss": 1.0142112970352173, "eval_runtime": 134.2476, "eval_samples_per_second": 7.039, "eval_steps_per_second": 0.886, "step": 2664 }, { "epoch": 24.00109090909091, "grad_norm": 8.571599960327148, "learning_rate": 2.8585858585858587e-05, "loss": 0.9351, "step": 2670 }, { "epoch": 24.002909090909093, "grad_norm": 6.295083522796631, "learning_rate": 2.8484848484848486e-05, "loss": 1.1825, "step": 2680 }, { "epoch": 24.004727272727273, "grad_norm": 8.257028579711914, "learning_rate": 2.8383838383838386e-05, "loss": 1.003, "step": 2690 }, { "epoch": 24.006545454545453, "grad_norm": 3.9263625144958496, "learning_rate": 2.8282828282828282e-05, "loss": 0.8946, "step": 2700 }, { "epoch": 24.008363636363637, "grad_norm": 5.969736099243164, "learning_rate": 2.818181818181818e-05, "loss": 0.8263, "step": 2710 }, { "epoch": 24.010181818181817, "grad_norm": 7.632034778594971, "learning_rate": 2.808080808080808e-05, "loss": 0.9067, "step": 2720 }, { "epoch": 24.012, "grad_norm": 5.329760551452637, "learning_rate": 2.7979797979797984e-05, "loss": 0.8835, "step": 2730 }, { "epoch": 24.01381818181818, "grad_norm": 5.737499237060547, "learning_rate": 2.7878787878787883e-05, "loss": 0.9203, "step": 2740 }, { "epoch": 24.015636363636364, "grad_norm": 4.075726509094238, "learning_rate": 2.777777777777778e-05, "loss": 0.945, "step": 2750 }, { "epoch": 24.017454545454545, "grad_norm": 5.517560005187988, "learning_rate": 2.767676767676768e-05, "loss": 0.9141, "step": 2760 }, { "epoch": 24.019272727272728, "grad_norm": 3.141531467437744, "learning_rate": 2.7575757575757578e-05, "loss": 0.958, "step": 2770 }, { "epoch": 24.02018181818182, "eval_accuracy": 0.5375661375661376, "eval_loss": 0.9664833545684814, "eval_runtime": 135.1561, "eval_samples_per_second": 6.992, "eval_steps_per_second": 0.88, "step": 2775 }, { "epoch": 25.00090909090909, "grad_norm": 5.119720935821533, "learning_rate": 2.7474747474747474e-05, "loss": 1.0862, "step": 2780 }, { "epoch": 25.002727272727274, "grad_norm": 10.184759140014648, "learning_rate": 2.7373737373737374e-05, "loss": 0.9412, "step": 2790 }, { "epoch": 25.004545454545454, "grad_norm": 2.972130060195923, "learning_rate": 2.7272727272727273e-05, "loss": 0.7545, "step": 2800 }, { "epoch": 25.006363636363638, "grad_norm": 2.982419967651367, "learning_rate": 2.717171717171717e-05, "loss": 0.93, "step": 2810 }, { "epoch": 25.008181818181818, "grad_norm": 3.9260990619659424, "learning_rate": 2.7070707070707075e-05, "loss": 0.7816, "step": 2820 }, { "epoch": 25.01, "grad_norm": 6.232370853424072, "learning_rate": 2.696969696969697e-05, "loss": 0.9919, "step": 2830 }, { "epoch": 25.01181818181818, "grad_norm": 5.925095081329346, "learning_rate": 2.686868686868687e-05, "loss": 0.8871, "step": 2840 }, { "epoch": 25.013636363636362, "grad_norm": 3.562493085861206, "learning_rate": 2.676767676767677e-05, "loss": 0.8277, "step": 2850 }, { "epoch": 25.015454545454546, "grad_norm": 5.1987199783325195, "learning_rate": 2.6666666666666667e-05, "loss": 0.8438, "step": 2860 }, { "epoch": 25.017272727272726, "grad_norm": 7.01310920715332, "learning_rate": 2.6565656565656566e-05, "loss": 0.9062, "step": 2870 }, { "epoch": 25.01909090909091, "grad_norm": 6.749232292175293, "learning_rate": 2.6464646464646466e-05, "loss": 0.996, "step": 2880 }, { "epoch": 25.02018181818182, "eval_accuracy": 0.5703703703703704, "eval_loss": 0.9390914440155029, "eval_runtime": 132.5682, "eval_samples_per_second": 7.128, "eval_steps_per_second": 0.898, "step": 2886 }, { "epoch": 26.00072727272727, "grad_norm": 2.76141619682312, "learning_rate": 2.636363636363636e-05, "loss": 1.0099, "step": 2890 }, { "epoch": 26.002545454545455, "grad_norm": 8.156211853027344, "learning_rate": 2.6262626262626268e-05, "loss": 0.9647, "step": 2900 }, { "epoch": 26.004363636363635, "grad_norm": 6.6141462326049805, "learning_rate": 2.6161616161616164e-05, "loss": 0.9456, "step": 2910 }, { "epoch": 26.00618181818182, "grad_norm": 4.650730133056641, "learning_rate": 2.6060606060606063e-05, "loss": 0.8798, "step": 2920 }, { "epoch": 26.008, "grad_norm": 2.213693618774414, "learning_rate": 2.5959595959595963e-05, "loss": 0.8274, "step": 2930 }, { "epoch": 26.009818181818183, "grad_norm": 9.269267082214355, "learning_rate": 2.585858585858586e-05, "loss": 0.9503, "step": 2940 }, { "epoch": 26.011636363636363, "grad_norm": 4.464992046356201, "learning_rate": 2.575757575757576e-05, "loss": 0.8205, "step": 2950 }, { "epoch": 26.013454545454547, "grad_norm": 6.635782718658447, "learning_rate": 2.5656565656565658e-05, "loss": 0.9112, "step": 2960 }, { "epoch": 26.015272727272727, "grad_norm": 8.023598670959473, "learning_rate": 2.5555555555555554e-05, "loss": 0.9634, "step": 2970 }, { "epoch": 26.01709090909091, "grad_norm": 11.189812660217285, "learning_rate": 2.5454545454545454e-05, "loss": 0.9046, "step": 2980 }, { "epoch": 26.01890909090909, "grad_norm": 3.639500856399536, "learning_rate": 2.5353535353535356e-05, "loss": 0.823, "step": 2990 }, { "epoch": 26.02018181818182, "eval_accuracy": 0.5777777777777777, "eval_loss": 0.9170966744422913, "eval_runtime": 133.3341, "eval_samples_per_second": 7.087, "eval_steps_per_second": 0.892, "step": 2997 }, { "epoch": 27.000545454545456, "grad_norm": 7.965769290924072, "learning_rate": 2.5252525252525256e-05, "loss": 0.9709, "step": 3000 }, { "epoch": 27.002363636363636, "grad_norm": 6.372323036193848, "learning_rate": 2.5151515151515155e-05, "loss": 0.9405, "step": 3010 }, { "epoch": 27.004181818181817, "grad_norm": 8.214998245239258, "learning_rate": 2.505050505050505e-05, "loss": 0.8592, "step": 3020 }, { "epoch": 27.006, "grad_norm": 2.5504863262176514, "learning_rate": 2.494949494949495e-05, "loss": 0.8577, "step": 3030 }, { "epoch": 27.00781818181818, "grad_norm": 7.738058090209961, "learning_rate": 2.4848484848484847e-05, "loss": 0.9339, "step": 3040 }, { "epoch": 27.009636363636364, "grad_norm": 4.073580741882324, "learning_rate": 2.474747474747475e-05, "loss": 0.8894, "step": 3050 }, { "epoch": 27.011454545454544, "grad_norm": 3.962249994277954, "learning_rate": 2.464646464646465e-05, "loss": 1.1608, "step": 3060 }, { "epoch": 27.013272727272728, "grad_norm": 5.425260066986084, "learning_rate": 2.4545454545454545e-05, "loss": 0.9497, "step": 3070 }, { "epoch": 27.015090909090908, "grad_norm": 9.002552032470703, "learning_rate": 2.4444444444444445e-05, "loss": 0.9151, "step": 3080 }, { "epoch": 27.016909090909092, "grad_norm": 4.565546035766602, "learning_rate": 2.4343434343434344e-05, "loss": 0.8929, "step": 3090 }, { "epoch": 27.018727272727272, "grad_norm": 3.171492338180542, "learning_rate": 2.4242424242424244e-05, "loss": 0.8834, "step": 3100 }, { "epoch": 27.02018181818182, "eval_accuracy": 0.5873015873015873, "eval_loss": 0.89227294921875, "eval_runtime": 133.2881, "eval_samples_per_second": 7.09, "eval_steps_per_second": 0.893, "step": 3108 }, { "epoch": 28.000363636363637, "grad_norm": 7.9800896644592285, "learning_rate": 2.4141414141414143e-05, "loss": 0.8895, "step": 3110 }, { "epoch": 28.002181818181818, "grad_norm": 4.383500576019287, "learning_rate": 2.404040404040404e-05, "loss": 0.9218, "step": 3120 }, { "epoch": 28.004, "grad_norm": 5.249983310699463, "learning_rate": 2.393939393939394e-05, "loss": 0.8501, "step": 3130 }, { "epoch": 28.00581818181818, "grad_norm": 9.334481239318848, "learning_rate": 2.3838383838383842e-05, "loss": 0.8836, "step": 3140 }, { "epoch": 28.007636363636365, "grad_norm": 7.620109558105469, "learning_rate": 2.3737373737373738e-05, "loss": 0.9437, "step": 3150 }, { "epoch": 28.009454545454545, "grad_norm": 6.430835247039795, "learning_rate": 2.3636363636363637e-05, "loss": 0.9573, "step": 3160 }, { "epoch": 28.011272727272726, "grad_norm": 5.154223442077637, "learning_rate": 2.3535353535353537e-05, "loss": 0.9988, "step": 3170 }, { "epoch": 28.01309090909091, "grad_norm": 3.8441991806030273, "learning_rate": 2.3434343434343436e-05, "loss": 0.8807, "step": 3180 }, { "epoch": 28.01490909090909, "grad_norm": 12.257723808288574, "learning_rate": 2.3333333333333336e-05, "loss": 0.8627, "step": 3190 }, { "epoch": 28.016727272727273, "grad_norm": 2.526041030883789, "learning_rate": 2.3232323232323232e-05, "loss": 0.8983, "step": 3200 }, { "epoch": 28.018545454545453, "grad_norm": 5.447187900543213, "learning_rate": 2.313131313131313e-05, "loss": 0.8615, "step": 3210 }, { "epoch": 28.02018181818182, "eval_accuracy": 0.5470899470899471, "eval_loss": 0.9576845169067383, "eval_runtime": 131.1178, "eval_samples_per_second": 7.207, "eval_steps_per_second": 0.908, "step": 3219 }, { "epoch": 29.00018181818182, "grad_norm": 4.697556972503662, "learning_rate": 2.3030303030303034e-05, "loss": 0.9178, "step": 3220 }, { "epoch": 29.002, "grad_norm": 7.460103511810303, "learning_rate": 2.292929292929293e-05, "loss": 0.901, "step": 3230 }, { "epoch": 29.003818181818183, "grad_norm": 4.0857343673706055, "learning_rate": 2.282828282828283e-05, "loss": 0.8679, "step": 3240 }, { "epoch": 29.005636363636363, "grad_norm": 5.448916912078857, "learning_rate": 2.272727272727273e-05, "loss": 0.7971, "step": 3250 }, { "epoch": 29.007454545454546, "grad_norm": 9.139603614807129, "learning_rate": 2.262626262626263e-05, "loss": 0.9889, "step": 3260 }, { "epoch": 29.009272727272727, "grad_norm": 7.882384300231934, "learning_rate": 2.2525252525252528e-05, "loss": 0.9392, "step": 3270 }, { "epoch": 29.01109090909091, "grad_norm": 4.343461036682129, "learning_rate": 2.2424242424242424e-05, "loss": 0.9194, "step": 3280 }, { "epoch": 29.01290909090909, "grad_norm": 7.994728088378906, "learning_rate": 2.2323232323232324e-05, "loss": 0.9083, "step": 3290 }, { "epoch": 29.014727272727274, "grad_norm": 3.5908119678497314, "learning_rate": 2.2222222222222223e-05, "loss": 0.987, "step": 3300 }, { "epoch": 29.016545454545454, "grad_norm": 4.692817211151123, "learning_rate": 2.2121212121212123e-05, "loss": 0.8884, "step": 3310 }, { "epoch": 29.018363636363638, "grad_norm": 6.650206565856934, "learning_rate": 2.2020202020202022e-05, "loss": 0.8695, "step": 3320 }, { "epoch": 29.02018181818182, "grad_norm": 26.35285186767578, "learning_rate": 2.191919191919192e-05, "loss": 0.9462, "step": 3330 }, { "epoch": 29.02018181818182, "eval_accuracy": 0.562962962962963, "eval_loss": 0.9467629790306091, "eval_runtime": 131.6292, "eval_samples_per_second": 7.179, "eval_steps_per_second": 0.904, "step": 3330 }, { "epoch": 30.00181818181818, "grad_norm": 3.437410593032837, "learning_rate": 2.1818181818181818e-05, "loss": 0.8162, "step": 3340 }, { "epoch": 30.003636363636364, "grad_norm": 5.97007417678833, "learning_rate": 2.171717171717172e-05, "loss": 1.0401, "step": 3350 }, { "epoch": 30.005454545454544, "grad_norm": 3.841773509979248, "learning_rate": 2.1616161616161617e-05, "loss": 0.7645, "step": 3360 }, { "epoch": 30.007272727272728, "grad_norm": 9.196441650390625, "learning_rate": 2.1515151515151516e-05, "loss": 0.8248, "step": 3370 }, { "epoch": 30.009090909090908, "grad_norm": 8.819738388061523, "learning_rate": 2.1414141414141416e-05, "loss": 0.9833, "step": 3380 }, { "epoch": 30.01090909090909, "grad_norm": 5.484842777252197, "learning_rate": 2.1313131313131315e-05, "loss": 0.9015, "step": 3390 }, { "epoch": 30.012727272727272, "grad_norm": 7.970553398132324, "learning_rate": 2.1212121212121215e-05, "loss": 0.9099, "step": 3400 }, { "epoch": 30.014545454545456, "grad_norm": 5.823868274688721, "learning_rate": 2.111111111111111e-05, "loss": 0.9532, "step": 3410 }, { "epoch": 30.016363636363636, "grad_norm": 3.9710135459899902, "learning_rate": 2.101010101010101e-05, "loss": 0.8651, "step": 3420 }, { "epoch": 30.01818181818182, "grad_norm": 4.124075412750244, "learning_rate": 2.090909090909091e-05, "loss": 0.955, "step": 3430 }, { "epoch": 30.02, "grad_norm": 7.066349983215332, "learning_rate": 2.080808080808081e-05, "loss": 0.8909, "step": 3440 }, { "epoch": 30.02018181818182, "eval_accuracy": 0.5671957671957671, "eval_loss": 0.934297502040863, "eval_runtime": 133.2693, "eval_samples_per_second": 7.091, "eval_steps_per_second": 0.893, "step": 3441 }, { "epoch": 31.001636363636365, "grad_norm": 8.60042667388916, "learning_rate": 2.070707070707071e-05, "loss": 0.9224, "step": 3450 }, { "epoch": 31.003454545454545, "grad_norm": 3.15602970123291, "learning_rate": 2.0606060606060608e-05, "loss": 0.9395, "step": 3460 }, { "epoch": 31.00527272727273, "grad_norm": 2.947406768798828, "learning_rate": 2.0505050505050504e-05, "loss": 0.9115, "step": 3470 }, { "epoch": 31.00709090909091, "grad_norm": 8.351478576660156, "learning_rate": 2.0404040404040407e-05, "loss": 0.9073, "step": 3480 }, { "epoch": 31.00890909090909, "grad_norm": 8.569064140319824, "learning_rate": 2.0303030303030303e-05, "loss": 0.9537, "step": 3490 }, { "epoch": 31.010727272727273, "grad_norm": 5.759737491607666, "learning_rate": 2.0202020202020203e-05, "loss": 0.7879, "step": 3500 }, { "epoch": 31.012545454545453, "grad_norm": 5.850707530975342, "learning_rate": 2.0101010101010102e-05, "loss": 0.9657, "step": 3510 }, { "epoch": 31.014363636363637, "grad_norm": 5.242412567138672, "learning_rate": 2e-05, "loss": 0.8814, "step": 3520 }, { "epoch": 31.016181818181817, "grad_norm": 4.501462936401367, "learning_rate": 1.98989898989899e-05, "loss": 0.8106, "step": 3530 }, { "epoch": 31.018, "grad_norm": 2.6810784339904785, "learning_rate": 1.9797979797979797e-05, "loss": 0.8476, "step": 3540 }, { "epoch": 31.01981818181818, "grad_norm": 2.7699146270751953, "learning_rate": 1.9696969696969697e-05, "loss": 0.8048, "step": 3550 }, { "epoch": 31.02018181818182, "eval_accuracy": 0.5777777777777777, "eval_loss": 0.9107006192207336, "eval_runtime": 134.8089, "eval_samples_per_second": 7.01, "eval_steps_per_second": 0.883, "step": 3552 }, { "epoch": 32.00145454545454, "grad_norm": 6.846524715423584, "learning_rate": 1.95959595959596e-05, "loss": 0.8392, "step": 3560 }, { "epoch": 32.00327272727273, "grad_norm": 6.047547340393066, "learning_rate": 1.9494949494949496e-05, "loss": 0.8437, "step": 3570 }, { "epoch": 32.00509090909091, "grad_norm": 4.787844657897949, "learning_rate": 1.9393939393939395e-05, "loss": 0.9993, "step": 3580 }, { "epoch": 32.00690909090909, "grad_norm": 7.202611923217773, "learning_rate": 1.9292929292929295e-05, "loss": 0.8839, "step": 3590 }, { "epoch": 32.00872727272727, "grad_norm": 3.939385175704956, "learning_rate": 1.919191919191919e-05, "loss": 0.8977, "step": 3600 }, { "epoch": 32.01054545454546, "grad_norm": 2.789416790008545, "learning_rate": 1.9090909090909094e-05, "loss": 0.841, "step": 3610 }, { "epoch": 32.01236363636364, "grad_norm": 6.802464008331299, "learning_rate": 1.898989898989899e-05, "loss": 0.8549, "step": 3620 }, { "epoch": 32.01418181818182, "grad_norm": 8.131646156311035, "learning_rate": 1.888888888888889e-05, "loss": 0.8839, "step": 3630 }, { "epoch": 32.016, "grad_norm": 6.163980007171631, "learning_rate": 1.878787878787879e-05, "loss": 0.8568, "step": 3640 }, { "epoch": 32.01781818181818, "grad_norm": 9.532483100891113, "learning_rate": 1.8686868686868688e-05, "loss": 0.8056, "step": 3650 }, { "epoch": 32.019636363636366, "grad_norm": 7.999500274658203, "learning_rate": 1.8585858585858588e-05, "loss": 0.8109, "step": 3660 }, { "epoch": 32.02018181818182, "eval_accuracy": 0.5492063492063493, "eval_loss": 0.9547345042228699, "eval_runtime": 135.0614, "eval_samples_per_second": 6.997, "eval_steps_per_second": 0.881, "step": 3663 }, { "epoch": 33.00127272727273, "grad_norm": 5.978763103485107, "learning_rate": 1.8484848484848487e-05, "loss": 0.942, "step": 3670 }, { "epoch": 33.00309090909091, "grad_norm": 10.230365753173828, "learning_rate": 1.8383838383838383e-05, "loss": 0.8532, "step": 3680 }, { "epoch": 33.00490909090909, "grad_norm": 8.178292274475098, "learning_rate": 1.8282828282828286e-05, "loss": 0.8999, "step": 3690 }, { "epoch": 33.006727272727275, "grad_norm": 9.503693580627441, "learning_rate": 1.8181818181818182e-05, "loss": 0.9341, "step": 3700 }, { "epoch": 33.008545454545455, "grad_norm": 5.494664192199707, "learning_rate": 1.808080808080808e-05, "loss": 0.926, "step": 3710 }, { "epoch": 33.010363636363635, "grad_norm": 5.923470973968506, "learning_rate": 1.797979797979798e-05, "loss": 0.8584, "step": 3720 }, { "epoch": 33.012181818181816, "grad_norm": 6.926611423492432, "learning_rate": 1.787878787878788e-05, "loss": 0.7687, "step": 3730 }, { "epoch": 33.014, "grad_norm": 5.04495906829834, "learning_rate": 1.777777777777778e-05, "loss": 0.9648, "step": 3740 }, { "epoch": 33.01581818181818, "grad_norm": 6.400146961212158, "learning_rate": 1.7676767676767676e-05, "loss": 0.9959, "step": 3750 }, { "epoch": 33.01763636363636, "grad_norm": 8.008678436279297, "learning_rate": 1.7575757575757576e-05, "loss": 0.7892, "step": 3760 }, { "epoch": 33.01945454545454, "grad_norm": 3.765272378921509, "learning_rate": 1.7474747474747475e-05, "loss": 0.9242, "step": 3770 }, { "epoch": 33.02018181818182, "eval_accuracy": 0.5597883597883598, "eval_loss": 0.9275074601173401, "eval_runtime": 135.4909, "eval_samples_per_second": 6.975, "eval_steps_per_second": 0.878, "step": 3774 }, { "epoch": 34.00109090909091, "grad_norm": 6.494891166687012, "learning_rate": 1.7373737373737375e-05, "loss": 0.7527, "step": 3780 }, { "epoch": 34.00290909090909, "grad_norm": 7.506988525390625, "learning_rate": 1.7272727272727274e-05, "loss": 0.8774, "step": 3790 }, { "epoch": 34.00472727272727, "grad_norm": 6.073853015899658, "learning_rate": 1.7171717171717173e-05, "loss": 1.0006, "step": 3800 }, { "epoch": 34.00654545454545, "grad_norm": 5.657601833343506, "learning_rate": 1.707070707070707e-05, "loss": 0.8368, "step": 3810 }, { "epoch": 34.00836363636363, "grad_norm": 9.511337280273438, "learning_rate": 1.6969696969696972e-05, "loss": 0.8862, "step": 3820 }, { "epoch": 34.01018181818182, "grad_norm": 6.792288303375244, "learning_rate": 1.686868686868687e-05, "loss": 0.9277, "step": 3830 }, { "epoch": 34.012, "grad_norm": 10.69278335571289, "learning_rate": 1.6767676767676768e-05, "loss": 0.9289, "step": 3840 }, { "epoch": 34.01381818181818, "grad_norm": 4.471822261810303, "learning_rate": 1.6666666666666667e-05, "loss": 0.8878, "step": 3850 }, { "epoch": 34.01563636363636, "grad_norm": 8.317084312438965, "learning_rate": 1.6565656565656567e-05, "loss": 0.7268, "step": 3860 }, { "epoch": 34.01745454545455, "grad_norm": 7.521054744720459, "learning_rate": 1.6464646464646466e-05, "loss": 0.8997, "step": 3870 }, { "epoch": 34.01927272727273, "grad_norm": 4.9641594886779785, "learning_rate": 1.6363636363636366e-05, "loss": 0.9046, "step": 3880 }, { "epoch": 34.02018181818182, "eval_accuracy": 0.5830687830687831, "eval_loss": 0.9289540648460388, "eval_runtime": 137.5464, "eval_samples_per_second": 6.87, "eval_steps_per_second": 0.865, "step": 3885 }, { "epoch": 35.00090909090909, "grad_norm": 8.1494722366333, "learning_rate": 1.6262626262626262e-05, "loss": 0.9279, "step": 3890 }, { "epoch": 35.00272727272727, "grad_norm": 4.9748687744140625, "learning_rate": 1.6161616161616165e-05, "loss": 0.6185, "step": 3900 }, { "epoch": 35.00454545454546, "grad_norm": 8.476805686950684, "learning_rate": 1.606060606060606e-05, "loss": 0.9423, "step": 3910 }, { "epoch": 35.00636363636364, "grad_norm": 4.4243974685668945, "learning_rate": 1.595959595959596e-05, "loss": 0.8173, "step": 3920 }, { "epoch": 35.00818181818182, "grad_norm": 10.399567604064941, "learning_rate": 1.585858585858586e-05, "loss": 1.0438, "step": 3930 }, { "epoch": 35.01, "grad_norm": 3.4936625957489014, "learning_rate": 1.5757575757575756e-05, "loss": 0.8161, "step": 3940 }, { "epoch": 35.011818181818185, "grad_norm": 7.2451581954956055, "learning_rate": 1.565656565656566e-05, "loss": 0.8241, "step": 3950 }, { "epoch": 35.013636363636365, "grad_norm": 3.3511080741882324, "learning_rate": 1.5555555555555555e-05, "loss": 0.9642, "step": 3960 }, { "epoch": 35.015454545454546, "grad_norm": 2.77311372756958, "learning_rate": 1.5454545454545454e-05, "loss": 1.0031, "step": 3970 }, { "epoch": 35.017272727272726, "grad_norm": 14.23676872253418, "learning_rate": 1.5353535353535354e-05, "loss": 0.9956, "step": 3980 }, { "epoch": 35.019090909090906, "grad_norm": 4.298346519470215, "learning_rate": 1.5252525252525255e-05, "loss": 0.7677, "step": 3990 }, { "epoch": 35.02018181818182, "eval_accuracy": 0.5724867724867725, "eval_loss": 0.9207894802093506, "eval_runtime": 139.2805, "eval_samples_per_second": 6.785, "eval_steps_per_second": 0.854, "step": 3996 }, { "epoch": 36.000727272727275, "grad_norm": 8.253910064697266, "learning_rate": 1.5151515151515153e-05, "loss": 0.8331, "step": 4000 }, { "epoch": 36.002545454545455, "grad_norm": 12.524259567260742, "learning_rate": 1.505050505050505e-05, "loss": 0.8999, "step": 4010 }, { "epoch": 36.004363636363635, "grad_norm": 13.681807518005371, "learning_rate": 1.494949494949495e-05, "loss": 0.879, "step": 4020 }, { "epoch": 36.006181818181815, "grad_norm": 11.588513374328613, "learning_rate": 1.484848484848485e-05, "loss": 1.0405, "step": 4030 }, { "epoch": 36.008, "grad_norm": 10.12088394165039, "learning_rate": 1.4747474747474749e-05, "loss": 0.9501, "step": 4040 }, { "epoch": 36.00981818181818, "grad_norm": 7.492605686187744, "learning_rate": 1.4646464646464647e-05, "loss": 0.8858, "step": 4050 }, { "epoch": 36.01163636363636, "grad_norm": 6.3661394119262695, "learning_rate": 1.4545454545454545e-05, "loss": 0.8903, "step": 4060 }, { "epoch": 36.01345454545454, "grad_norm": 7.913581848144531, "learning_rate": 1.4444444444444444e-05, "loss": 0.8672, "step": 4070 }, { "epoch": 36.01527272727273, "grad_norm": 5.75011682510376, "learning_rate": 1.4343434343434345e-05, "loss": 0.935, "step": 4080 }, { "epoch": 36.01709090909091, "grad_norm": 8.735386848449707, "learning_rate": 1.4242424242424243e-05, "loss": 0.9386, "step": 4090 }, { "epoch": 36.01890909090909, "grad_norm": 4.150994777679443, "learning_rate": 1.4141414141414141e-05, "loss": 0.8501, "step": 4100 }, { "epoch": 36.02018181818182, "eval_accuracy": 0.580952380952381, "eval_loss": 0.912603497505188, "eval_runtime": 139.8235, "eval_samples_per_second": 6.759, "eval_steps_per_second": 0.851, "step": 4107 }, { "epoch": 37.00054545454545, "grad_norm": 15.24356460571289, "learning_rate": 1.404040404040404e-05, "loss": 0.8454, "step": 4110 }, { "epoch": 37.00236363636364, "grad_norm": 5.617706775665283, "learning_rate": 1.3939393939393942e-05, "loss": 0.8981, "step": 4120 }, { "epoch": 37.00418181818182, "grad_norm": 9.067435264587402, "learning_rate": 1.383838383838384e-05, "loss": 0.8377, "step": 4130 }, { "epoch": 37.006, "grad_norm": 6.194527626037598, "learning_rate": 1.3737373737373737e-05, "loss": 0.8465, "step": 4140 }, { "epoch": 37.00781818181818, "grad_norm": 4.81503963470459, "learning_rate": 1.3636363636363637e-05, "loss": 0.8916, "step": 4150 }, { "epoch": 37.00963636363636, "grad_norm": 7.697696685791016, "learning_rate": 1.3535353535353538e-05, "loss": 0.9936, "step": 4160 }, { "epoch": 37.01145454545455, "grad_norm": 4.091245651245117, "learning_rate": 1.3434343434343436e-05, "loss": 0.8216, "step": 4170 }, { "epoch": 37.01327272727273, "grad_norm": 8.986047744750977, "learning_rate": 1.3333333333333333e-05, "loss": 0.9964, "step": 4180 }, { "epoch": 37.01509090909091, "grad_norm": 3.3684699535369873, "learning_rate": 1.3232323232323233e-05, "loss": 0.9505, "step": 4190 }, { "epoch": 37.01690909090909, "grad_norm": 5.151857376098633, "learning_rate": 1.3131313131313134e-05, "loss": 0.971, "step": 4200 }, { "epoch": 37.018727272727276, "grad_norm": 6.723459243774414, "learning_rate": 1.3030303030303032e-05, "loss": 0.8468, "step": 4210 }, { "epoch": 37.02018181818182, "eval_accuracy": 0.5862433862433862, "eval_loss": 0.9053242206573486, "eval_runtime": 139.4496, "eval_samples_per_second": 6.777, "eval_steps_per_second": 0.853, "step": 4218 }, { "epoch": 38.00036363636364, "grad_norm": 4.479756832122803, "learning_rate": 1.292929292929293e-05, "loss": 0.7653, "step": 4220 }, { "epoch": 38.00218181818182, "grad_norm": 5.094549655914307, "learning_rate": 1.2828282828282829e-05, "loss": 0.9674, "step": 4230 }, { "epoch": 38.004, "grad_norm": 5.455288887023926, "learning_rate": 1.2727272727272727e-05, "loss": 0.9193, "step": 4240 }, { "epoch": 38.005818181818185, "grad_norm": 4.766934871673584, "learning_rate": 1.2626262626262628e-05, "loss": 0.8482, "step": 4250 }, { "epoch": 38.007636363636365, "grad_norm": 10.875009536743164, "learning_rate": 1.2525252525252526e-05, "loss": 0.7313, "step": 4260 }, { "epoch": 38.009454545454545, "grad_norm": 4.407993316650391, "learning_rate": 1.2424242424242424e-05, "loss": 0.8851, "step": 4270 }, { "epoch": 38.011272727272726, "grad_norm": 5.814301490783691, "learning_rate": 1.2323232323232325e-05, "loss": 0.8522, "step": 4280 }, { "epoch": 38.013090909090906, "grad_norm": 4.259603977203369, "learning_rate": 1.2222222222222222e-05, "loss": 0.8599, "step": 4290 }, { "epoch": 38.01490909090909, "grad_norm": 9.286781311035156, "learning_rate": 1.2121212121212122e-05, "loss": 0.9504, "step": 4300 }, { "epoch": 38.01672727272727, "grad_norm": 6.765378952026367, "learning_rate": 1.202020202020202e-05, "loss": 0.9203, "step": 4310 }, { "epoch": 38.01854545454545, "grad_norm": 5.87042236328125, "learning_rate": 1.1919191919191921e-05, "loss": 0.7814, "step": 4320 }, { "epoch": 38.02018181818182, "eval_accuracy": 0.5904761904761905, "eval_loss": 0.8857986330986023, "eval_runtime": 136.9965, "eval_samples_per_second": 6.898, "eval_steps_per_second": 0.869, "step": 4329 }, { "epoch": 39.000181818181815, "grad_norm": 4.036645412445068, "learning_rate": 1.1818181818181819e-05, "loss": 0.896, "step": 4330 }, { "epoch": 39.002, "grad_norm": 6.015566825866699, "learning_rate": 1.1717171717171718e-05, "loss": 1.008, "step": 4340 }, { "epoch": 39.00381818181818, "grad_norm": 8.852757453918457, "learning_rate": 1.1616161616161616e-05, "loss": 0.8159, "step": 4350 }, { "epoch": 39.00563636363636, "grad_norm": 6.896280288696289, "learning_rate": 1.1515151515151517e-05, "loss": 0.7858, "step": 4360 }, { "epoch": 39.00745454545454, "grad_norm": 13.30453109741211, "learning_rate": 1.1414141414141415e-05, "loss": 0.7982, "step": 4370 }, { "epoch": 39.00927272727273, "grad_norm": 5.01935338973999, "learning_rate": 1.1313131313131314e-05, "loss": 0.9107, "step": 4380 }, { "epoch": 39.01109090909091, "grad_norm": 7.737326622009277, "learning_rate": 1.1212121212121212e-05, "loss": 0.8968, "step": 4390 }, { "epoch": 39.01290909090909, "grad_norm": 6.401789665222168, "learning_rate": 1.1111111111111112e-05, "loss": 0.881, "step": 4400 }, { "epoch": 39.01472727272727, "grad_norm": 5.3346171379089355, "learning_rate": 1.1010101010101011e-05, "loss": 0.9034, "step": 4410 }, { "epoch": 39.01654545454546, "grad_norm": 4.072237968444824, "learning_rate": 1.0909090909090909e-05, "loss": 0.7896, "step": 4420 }, { "epoch": 39.01836363636364, "grad_norm": 15.682538986206055, "learning_rate": 1.0808080808080808e-05, "loss": 0.8473, "step": 4430 }, { "epoch": 39.02018181818182, "grad_norm": 8.316577911376953, "learning_rate": 1.0707070707070708e-05, "loss": 0.9354, "step": 4440 }, { "epoch": 39.02018181818182, "eval_accuracy": 0.5724867724867725, "eval_loss": 0.9207160472869873, "eval_runtime": 137.3805, "eval_samples_per_second": 6.879, "eval_steps_per_second": 0.866, "step": 4440 }, { "epoch": 40.00181818181818, "grad_norm": 4.132864475250244, "learning_rate": 1.0606060606060607e-05, "loss": 0.7832, "step": 4450 }, { "epoch": 40.00363636363636, "grad_norm": 3.2783091068267822, "learning_rate": 1.0505050505050505e-05, "loss": 0.8667, "step": 4460 }, { "epoch": 40.00545454545455, "grad_norm": 4.29857063293457, "learning_rate": 1.0404040404040405e-05, "loss": 0.719, "step": 4470 }, { "epoch": 40.00727272727273, "grad_norm": 6.1660590171813965, "learning_rate": 1.0303030303030304e-05, "loss": 0.912, "step": 4480 }, { "epoch": 40.00909090909091, "grad_norm": 5.285783290863037, "learning_rate": 1.0202020202020204e-05, "loss": 0.8702, "step": 4490 }, { "epoch": 40.01090909090909, "grad_norm": 8.145794868469238, "learning_rate": 1.0101010101010101e-05, "loss": 0.9263, "step": 4500 }, { "epoch": 40.012727272727275, "grad_norm": 7.457106113433838, "learning_rate": 1e-05, "loss": 0.9648, "step": 4510 }, { "epoch": 40.014545454545456, "grad_norm": 7.848749160766602, "learning_rate": 9.898989898989899e-06, "loss": 0.8373, "step": 4520 }, { "epoch": 40.016363636363636, "grad_norm": 8.913769721984863, "learning_rate": 9.7979797979798e-06, "loss": 0.8669, "step": 4530 }, { "epoch": 40.018181818181816, "grad_norm": 10.52230167388916, "learning_rate": 9.696969696969698e-06, "loss": 0.8021, "step": 4540 }, { "epoch": 40.02, "grad_norm": 5.463939666748047, "learning_rate": 9.595959595959595e-06, "loss": 0.8849, "step": 4550 }, { "epoch": 40.02018181818182, "eval_accuracy": 0.5650793650793651, "eval_loss": 0.9276704788208008, "eval_runtime": 135.7086, "eval_samples_per_second": 6.963, "eval_steps_per_second": 0.877, "step": 4551 }, { "epoch": 41.001636363636365, "grad_norm": 8.246631622314453, "learning_rate": 9.494949494949495e-06, "loss": 0.8189, "step": 4560 }, { "epoch": 41.003454545454545, "grad_norm": 2.8926265239715576, "learning_rate": 9.393939393939394e-06, "loss": 1.0385, "step": 4570 }, { "epoch": 41.005272727272725, "grad_norm": 9.945596694946289, "learning_rate": 9.292929292929294e-06, "loss": 0.8323, "step": 4580 }, { "epoch": 41.00709090909091, "grad_norm": 4.7715277671813965, "learning_rate": 9.191919191919192e-06, "loss": 0.8293, "step": 4590 }, { "epoch": 41.00890909090909, "grad_norm": 6.302114963531494, "learning_rate": 9.090909090909091e-06, "loss": 0.7819, "step": 4600 }, { "epoch": 41.01072727272727, "grad_norm": 10.754914283752441, "learning_rate": 8.98989898989899e-06, "loss": 0.8525, "step": 4610 }, { "epoch": 41.01254545454545, "grad_norm": 3.367741346359253, "learning_rate": 8.88888888888889e-06, "loss": 0.9213, "step": 4620 }, { "epoch": 41.01436363636363, "grad_norm": 4.3797407150268555, "learning_rate": 8.787878787878788e-06, "loss": 0.8236, "step": 4630 }, { "epoch": 41.01618181818182, "grad_norm": 4.827358722686768, "learning_rate": 8.686868686868687e-06, "loss": 0.8398, "step": 4640 }, { "epoch": 41.018, "grad_norm": 8.580107688903809, "learning_rate": 8.585858585858587e-06, "loss": 0.9439, "step": 4650 }, { "epoch": 41.01981818181818, "grad_norm": 8.491392135620117, "learning_rate": 8.484848484848486e-06, "loss": 0.7856, "step": 4660 }, { "epoch": 41.02018181818182, "eval_accuracy": 0.5915343915343916, "eval_loss": 0.9130398035049438, "eval_runtime": 134.1436, "eval_samples_per_second": 7.045, "eval_steps_per_second": 0.887, "step": 4662 }, { "epoch": 42.00145454545454, "grad_norm": 6.375803470611572, "learning_rate": 8.383838383838384e-06, "loss": 0.7439, "step": 4670 }, { "epoch": 42.00327272727273, "grad_norm": 3.816681146621704, "learning_rate": 8.282828282828283e-06, "loss": 0.8752, "step": 4680 }, { "epoch": 42.00509090909091, "grad_norm": 8.415267944335938, "learning_rate": 8.181818181818183e-06, "loss": 0.8966, "step": 4690 }, { "epoch": 42.00690909090909, "grad_norm": 5.004403114318848, "learning_rate": 8.080808080808082e-06, "loss": 0.7916, "step": 4700 }, { "epoch": 42.00872727272727, "grad_norm": 8.591304779052734, "learning_rate": 7.97979797979798e-06, "loss": 0.8525, "step": 4710 }, { "epoch": 42.01054545454546, "grad_norm": 4.411281585693359, "learning_rate": 7.878787878787878e-06, "loss": 0.7593, "step": 4720 }, { "epoch": 42.01236363636364, "grad_norm": 5.2794718742370605, "learning_rate": 7.777777777777777e-06, "loss": 0.984, "step": 4730 }, { "epoch": 42.01418181818182, "grad_norm": 5.923099517822266, "learning_rate": 7.676767676767677e-06, "loss": 0.8673, "step": 4740 }, { "epoch": 42.016, "grad_norm": 7.30630350112915, "learning_rate": 7.5757575757575764e-06, "loss": 0.8758, "step": 4750 }, { "epoch": 42.01781818181818, "grad_norm": 9.451834678649902, "learning_rate": 7.474747474747475e-06, "loss": 0.8842, "step": 4760 }, { "epoch": 42.019636363636366, "grad_norm": 5.710024833679199, "learning_rate": 7.3737373737373745e-06, "loss": 0.7133, "step": 4770 }, { "epoch": 42.02018181818182, "eval_accuracy": 0.5883597883597883, "eval_loss": 0.9080068469047546, "eval_runtime": 133.6506, "eval_samples_per_second": 7.071, "eval_steps_per_second": 0.89, "step": 4773 }, { "epoch": 43.00127272727273, "grad_norm": 5.078426361083984, "learning_rate": 7.272727272727272e-06, "loss": 0.7062, "step": 4780 }, { "epoch": 43.00309090909091, "grad_norm": 7.484274864196777, "learning_rate": 7.171717171717173e-06, "loss": 0.965, "step": 4790 }, { "epoch": 43.00490909090909, "grad_norm": 4.4427008628845215, "learning_rate": 7.0707070707070704e-06, "loss": 0.7688, "step": 4800 }, { "epoch": 43.006727272727275, "grad_norm": 6.1136627197265625, "learning_rate": 6.969696969696971e-06, "loss": 0.7731, "step": 4810 }, { "epoch": 43.008545454545455, "grad_norm": 8.658069610595703, "learning_rate": 6.8686868686868685e-06, "loss": 0.8958, "step": 4820 }, { "epoch": 43.010363636363635, "grad_norm": 5.886534214019775, "learning_rate": 6.767676767676769e-06, "loss": 0.9499, "step": 4830 }, { "epoch": 43.012181818181816, "grad_norm": 10.104177474975586, "learning_rate": 6.666666666666667e-06, "loss": 0.8922, "step": 4840 }, { "epoch": 43.014, "grad_norm": 8.859838485717773, "learning_rate": 6.565656565656567e-06, "loss": 0.9673, "step": 4850 }, { "epoch": 43.01581818181818, "grad_norm": 5.677845001220703, "learning_rate": 6.464646464646465e-06, "loss": 0.8809, "step": 4860 }, { "epoch": 43.01763636363636, "grad_norm": 6.806046485900879, "learning_rate": 6.363636363636363e-06, "loss": 0.831, "step": 4870 }, { "epoch": 43.01945454545454, "grad_norm": 7.888843059539795, "learning_rate": 6.262626262626263e-06, "loss": 0.932, "step": 4880 }, { "epoch": 43.02018181818182, "eval_accuracy": 0.5576719576719577, "eval_loss": 0.9387907385826111, "eval_runtime": 137.3927, "eval_samples_per_second": 6.878, "eval_steps_per_second": 0.866, "step": 4884 }, { "epoch": 44.00109090909091, "grad_norm": 9.031815528869629, "learning_rate": 6.161616161616162e-06, "loss": 0.9602, "step": 4890 }, { "epoch": 44.00290909090909, "grad_norm": 8.353316307067871, "learning_rate": 6.060606060606061e-06, "loss": 0.8246, "step": 4900 }, { "epoch": 44.00472727272727, "grad_norm": 4.821561336517334, "learning_rate": 5.9595959595959605e-06, "loss": 0.8246, "step": 4910 }, { "epoch": 44.00654545454545, "grad_norm": 9.228032112121582, "learning_rate": 5.858585858585859e-06, "loss": 0.9177, "step": 4920 }, { "epoch": 44.00836363636363, "grad_norm": 6.226870059967041, "learning_rate": 5.7575757575757586e-06, "loss": 0.8679, "step": 4930 }, { "epoch": 44.01018181818182, "grad_norm": 11.385287284851074, "learning_rate": 5.656565656565657e-06, "loss": 0.9084, "step": 4940 }, { "epoch": 44.012, "grad_norm": 9.927154541015625, "learning_rate": 5.555555555555556e-06, "loss": 0.7855, "step": 4950 }, { "epoch": 44.01381818181818, "grad_norm": 6.668172836303711, "learning_rate": 5.4545454545454545e-06, "loss": 0.7826, "step": 4960 }, { "epoch": 44.01563636363636, "grad_norm": 4.65692663192749, "learning_rate": 5.353535353535354e-06, "loss": 0.8195, "step": 4970 }, { "epoch": 44.01745454545455, "grad_norm": 5.107462406158447, "learning_rate": 5.2525252525252526e-06, "loss": 0.9274, "step": 4980 }, { "epoch": 44.01927272727273, "grad_norm": 7.147744655609131, "learning_rate": 5.151515151515152e-06, "loss": 0.6883, "step": 4990 }, { "epoch": 44.02018181818182, "eval_accuracy": 0.5936507936507937, "eval_loss": 0.892484724521637, "eval_runtime": 137.8997, "eval_samples_per_second": 6.853, "eval_steps_per_second": 0.863, "step": 4995 }, { "epoch": 45.00090909090909, "grad_norm": 5.661710739135742, "learning_rate": 5.050505050505051e-06, "loss": 0.7446, "step": 5000 }, { "epoch": 45.00272727272727, "grad_norm": 3.658931016921997, "learning_rate": 4.949494949494949e-06, "loss": 0.9806, "step": 5010 }, { "epoch": 45.00454545454546, "grad_norm": 4.7141194343566895, "learning_rate": 4.848484848484849e-06, "loss": 0.7656, "step": 5020 }, { "epoch": 45.00636363636364, "grad_norm": 3.591520309448242, "learning_rate": 4.747474747474747e-06, "loss": 0.9483, "step": 5030 }, { "epoch": 45.00818181818182, "grad_norm": 9.716415405273438, "learning_rate": 4.646464646464647e-06, "loss": 0.8558, "step": 5040 }, { "epoch": 45.01, "grad_norm": 7.835188388824463, "learning_rate": 4.5454545454545455e-06, "loss": 0.8277, "step": 5050 }, { "epoch": 45.011818181818185, "grad_norm": 5.499917984008789, "learning_rate": 4.444444444444445e-06, "loss": 0.7901, "step": 5060 }, { "epoch": 45.013636363636365, "grad_norm": 6.486261367797852, "learning_rate": 4.343434343434344e-06, "loss": 0.6807, "step": 5070 }, { "epoch": 45.015454545454546, "grad_norm": 5.808882713317871, "learning_rate": 4.242424242424243e-06, "loss": 0.9827, "step": 5080 }, { "epoch": 45.017272727272726, "grad_norm": 8.00777530670166, "learning_rate": 4.141414141414142e-06, "loss": 0.8919, "step": 5090 }, { "epoch": 45.019090909090906, "grad_norm": 9.413071632385254, "learning_rate": 4.040404040404041e-06, "loss": 0.9944, "step": 5100 }, { "epoch": 45.02018181818182, "eval_accuracy": 0.582010582010582, "eval_loss": 0.9143127202987671, "eval_runtime": 137.4856, "eval_samples_per_second": 6.873, "eval_steps_per_second": 0.866, "step": 5106 }, { "epoch": 46.000727272727275, "grad_norm": 11.19728946685791, "learning_rate": 3.939393939393939e-06, "loss": 0.7968, "step": 5110 }, { "epoch": 46.002545454545455, "grad_norm": 3.2283504009246826, "learning_rate": 3.8383838383838385e-06, "loss": 0.8411, "step": 5120 }, { "epoch": 46.004363636363635, "grad_norm": 3.715587854385376, "learning_rate": 3.7373737373737375e-06, "loss": 0.926, "step": 5130 }, { "epoch": 46.006181818181815, "grad_norm": 6.313870906829834, "learning_rate": 3.636363636363636e-06, "loss": 0.9623, "step": 5140 }, { "epoch": 46.008, "grad_norm": 6.3307952880859375, "learning_rate": 3.5353535353535352e-06, "loss": 0.9009, "step": 5150 }, { "epoch": 46.00981818181818, "grad_norm": 5.783283710479736, "learning_rate": 3.4343434343434343e-06, "loss": 0.8005, "step": 5160 }, { "epoch": 46.01163636363636, "grad_norm": 5.894885063171387, "learning_rate": 3.3333333333333333e-06, "loss": 0.7556, "step": 5170 }, { "epoch": 46.01345454545454, "grad_norm": 6.77593994140625, "learning_rate": 3.2323232323232324e-06, "loss": 0.9579, "step": 5180 }, { "epoch": 46.01527272727273, "grad_norm": 5.100948333740234, "learning_rate": 3.1313131313131314e-06, "loss": 0.8254, "step": 5190 }, { "epoch": 46.01709090909091, "grad_norm": 9.13286018371582, "learning_rate": 3.0303030303030305e-06, "loss": 0.8354, "step": 5200 }, { "epoch": 46.01890909090909, "grad_norm": 4.923421859741211, "learning_rate": 2.9292929292929295e-06, "loss": 0.8892, "step": 5210 }, { "epoch": 46.02018181818182, "eval_accuracy": 0.5883597883597883, "eval_loss": 0.9102571606636047, "eval_runtime": 139.3527, "eval_samples_per_second": 6.781, "eval_steps_per_second": 0.854, "step": 5217 }, { "epoch": 47.00054545454545, "grad_norm": 6.632534503936768, "learning_rate": 2.8282828282828286e-06, "loss": 0.904, "step": 5220 }, { "epoch": 47.00236363636364, "grad_norm": 19.992380142211914, "learning_rate": 2.7272727272727272e-06, "loss": 0.832, "step": 5230 }, { "epoch": 47.00418181818182, "grad_norm": 6.258913516998291, "learning_rate": 2.6262626262626263e-06, "loss": 0.9356, "step": 5240 }, { "epoch": 47.006, "grad_norm": 6.109975814819336, "learning_rate": 2.5252525252525253e-06, "loss": 0.8786, "step": 5250 }, { "epoch": 47.00781818181818, "grad_norm": 6.342895030975342, "learning_rate": 2.4242424242424244e-06, "loss": 0.9049, "step": 5260 }, { "epoch": 47.00963636363636, "grad_norm": 8.066116333007812, "learning_rate": 2.3232323232323234e-06, "loss": 0.8481, "step": 5270 }, { "epoch": 47.01145454545455, "grad_norm": 9.589644432067871, "learning_rate": 2.2222222222222225e-06, "loss": 0.9711, "step": 5280 }, { "epoch": 47.01327272727273, "grad_norm": 7.310944080352783, "learning_rate": 2.1212121212121216e-06, "loss": 0.8926, "step": 5290 }, { "epoch": 47.01509090909091, "grad_norm": 5.914597511291504, "learning_rate": 2.0202020202020206e-06, "loss": 0.8469, "step": 5300 }, { "epoch": 47.01690909090909, "grad_norm": 7.06303596496582, "learning_rate": 1.9191919191919192e-06, "loss": 0.773, "step": 5310 }, { "epoch": 47.018727272727276, "grad_norm": 7.100560665130615, "learning_rate": 1.818181818181818e-06, "loss": 0.9071, "step": 5320 }, { "epoch": 47.02018181818182, "eval_accuracy": 0.5904761904761905, "eval_loss": 0.9018059968948364, "eval_runtime": 138.8187, "eval_samples_per_second": 6.807, "eval_steps_per_second": 0.857, "step": 5328 }, { "epoch": 48.00036363636364, "grad_norm": 7.776629447937012, "learning_rate": 1.7171717171717171e-06, "loss": 0.7373, "step": 5330 }, { "epoch": 48.00218181818182, "grad_norm": 7.495334148406982, "learning_rate": 1.6161616161616162e-06, "loss": 0.814, "step": 5340 }, { "epoch": 48.004, "grad_norm": 6.628978729248047, "learning_rate": 1.5151515151515152e-06, "loss": 0.8408, "step": 5350 }, { "epoch": 48.005818181818185, "grad_norm": 6.9452290534973145, "learning_rate": 1.4141414141414143e-06, "loss": 0.8563, "step": 5360 }, { "epoch": 48.007636363636365, "grad_norm": 4.360229015350342, "learning_rate": 1.3131313131313131e-06, "loss": 0.9381, "step": 5370 }, { "epoch": 48.009454545454545, "grad_norm": 4.341993808746338, "learning_rate": 1.2121212121212122e-06, "loss": 0.8765, "step": 5380 }, { "epoch": 48.011272727272726, "grad_norm": 8.818041801452637, "learning_rate": 1.1111111111111112e-06, "loss": 0.8021, "step": 5390 }, { "epoch": 48.013090909090906, "grad_norm": 3.769672155380249, "learning_rate": 1.0101010101010103e-06, "loss": 1.0232, "step": 5400 }, { "epoch": 48.01490909090909, "grad_norm": 4.771209716796875, "learning_rate": 9.09090909090909e-07, "loss": 0.878, "step": 5410 }, { "epoch": 48.01672727272727, "grad_norm": 3.48793363571167, "learning_rate": 8.080808080808081e-07, "loss": 0.8093, "step": 5420 }, { "epoch": 48.01854545454545, "grad_norm": 14.258705139160156, "learning_rate": 7.070707070707071e-07, "loss": 0.7943, "step": 5430 }, { "epoch": 48.02018181818182, "eval_accuracy": 0.5904761904761905, "eval_loss": 0.9021934270858765, "eval_runtime": 139.7838, "eval_samples_per_second": 6.76, "eval_steps_per_second": 0.851, "step": 5439 }, { "epoch": 49.000181818181815, "grad_norm": 5.376201152801514, "learning_rate": 6.060606060606061e-07, "loss": 0.9175, "step": 5440 }, { "epoch": 49.002, "grad_norm": 7.467062473297119, "learning_rate": 5.050505050505052e-07, "loss": 0.7526, "step": 5450 }, { "epoch": 49.00381818181818, "grad_norm": 7.232283115386963, "learning_rate": 4.0404040404040405e-07, "loss": 0.8092, "step": 5460 }, { "epoch": 49.00563636363636, "grad_norm": 3.167647123336792, "learning_rate": 3.0303030303030305e-07, "loss": 0.7768, "step": 5470 }, { "epoch": 49.00745454545454, "grad_norm": 3.3912150859832764, "learning_rate": 2.0202020202020202e-07, "loss": 0.8756, "step": 5480 }, { "epoch": 49.00927272727273, "grad_norm": 9.803848266601562, "learning_rate": 1.0101010101010101e-07, "loss": 0.8963, "step": 5490 }, { "epoch": 49.01109090909091, "grad_norm": 4.264191627502441, "learning_rate": 0.0, "loss": 0.8034, "step": 5500 }, { "epoch": 49.01109090909091, "eval_accuracy": 0.5947089947089947, "eval_loss": 0.9004086256027222, "eval_runtime": 136.9287, "eval_samples_per_second": 6.901, "eval_steps_per_second": 0.869, "step": 5500 }, { "epoch": 49.01109090909091, "step": 5500, "total_flos": 4.916643221082378e+19, "train_loss": 0.930074092431502, "train_runtime": 13888.1034, "train_samples_per_second": 3.168, "train_steps_per_second": 0.396 }, { "epoch": 49.01109090909091, "eval_accuracy": 0.5566137566137566, "eval_loss": 0.9280895590782166, "eval_runtime": 141.6112, "eval_samples_per_second": 6.673, "eval_steps_per_second": 0.84, "step": 5500 }, { "epoch": 49.01109090909091, "eval_accuracy": 0.5566137566137566, "eval_loss": 0.9280895590782166, "eval_runtime": 138.7006, "eval_samples_per_second": 6.813, "eval_steps_per_second": 0.858, "step": 5500 } ], "logging_steps": 10, "max_steps": 5500, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.916643221082378e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }