diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7863 @@ +{ + "best_metric": 0.1347890943288803, + "best_model_checkpoint": "finetuned-fake-food/checkpoint-6200", + "epoch": 10.0, + "eval_steps": 100, + "global_step": 9900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010101010101010102, + "grad_norm": 1.8522064685821533, + "learning_rate": 0.0001997979797979798, + "loss": 0.5857, + "step": 10 + }, + { + "epoch": 0.020202020202020204, + "grad_norm": 6.711320877075195, + "learning_rate": 0.0001995959595959596, + "loss": 0.4223, + "step": 20 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 1.6324388980865479, + "learning_rate": 0.0001993939393939394, + "loss": 0.3733, + "step": 30 + }, + { + "epoch": 0.04040404040404041, + "grad_norm": 3.825958013534546, + "learning_rate": 0.0001991919191919192, + "loss": 0.4517, + "step": 40 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 4.070992469787598, + "learning_rate": 0.000198989898989899, + "loss": 0.3153, + "step": 50 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 13.05056095123291, + "learning_rate": 0.00019878787878787878, + "loss": 0.3505, + "step": 60 + }, + { + "epoch": 0.0707070707070707, + "grad_norm": 2.4650187492370605, + "learning_rate": 0.0001985858585858586, + "loss": 0.4858, + "step": 70 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 3.954157829284668, + "learning_rate": 0.00019838383838383837, + "loss": 0.4539, + "step": 80 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 1.3492755889892578, + "learning_rate": 0.00019818181818181821, + "loss": 0.337, + "step": 90 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 3.1130309104919434, + "learning_rate": 0.000197979797979798, + "loss": 0.3831, + "step": 100 + }, + { + "epoch": 0.10101010101010101, + "eval_accuracy": 0.891156462585034, + "eval_loss": 0.2662810981273651, + "eval_runtime": 25.9983, + "eval_samples_per_second": 107.43, + "eval_steps_per_second": 13.462, + "step": 100 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 1.5025306940078735, + "learning_rate": 0.00019777777777777778, + "loss": 0.2793, + "step": 110 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 2.976517915725708, + "learning_rate": 0.0001975757575757576, + "loss": 0.4034, + "step": 120 + }, + { + "epoch": 0.13131313131313133, + "grad_norm": 2.702010154724121, + "learning_rate": 0.00019737373737373738, + "loss": 0.2592, + "step": 130 + }, + { + "epoch": 0.1414141414141414, + "grad_norm": 0.9340890645980835, + "learning_rate": 0.0001971717171717172, + "loss": 0.3167, + "step": 140 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.42018941044807434, + "learning_rate": 0.00019696969696969698, + "loss": 0.2226, + "step": 150 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 3.9382266998291016, + "learning_rate": 0.00019676767676767677, + "loss": 0.3006, + "step": 160 + }, + { + "epoch": 0.1717171717171717, + "grad_norm": 7.005850791931152, + "learning_rate": 0.00019656565656565658, + "loss": 0.3149, + "step": 170 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.840803623199463, + "learning_rate": 0.00019636363636363636, + "loss": 0.5338, + "step": 180 + }, + { + "epoch": 0.1919191919191919, + "grad_norm": 1.7472269535064697, + "learning_rate": 0.00019616161616161618, + "loss": 0.3341, + "step": 190 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 2.3459396362304688, + "learning_rate": 0.00019595959595959596, + "loss": 0.3699, + "step": 200 + }, + { + "epoch": 0.20202020202020202, + "eval_accuracy": 0.8990332975295381, + "eval_loss": 0.25698739290237427, + "eval_runtime": 25.9306, + "eval_samples_per_second": 107.71, + "eval_steps_per_second": 13.498, + "step": 200 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 2.436138868331909, + "learning_rate": 0.00019575757575757577, + "loss": 0.2933, + "step": 210 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.9893826246261597, + "learning_rate": 0.00019555555555555556, + "loss": 0.3092, + "step": 220 + }, + { + "epoch": 0.23232323232323232, + "grad_norm": 1.9548523426055908, + "learning_rate": 0.00019535353535353534, + "loss": 0.3216, + "step": 230 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.4108979105949402, + "learning_rate": 0.00019515151515151516, + "loss": 0.246, + "step": 240 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 1.4591792821884155, + "learning_rate": 0.00019494949494949494, + "loss": 0.214, + "step": 250 + }, + { + "epoch": 0.26262626262626265, + "grad_norm": 1.3752448558807373, + "learning_rate": 0.00019474747474747476, + "loss": 0.3116, + "step": 260 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 6.0490593910217285, + "learning_rate": 0.00019454545454545457, + "loss": 0.3501, + "step": 270 + }, + { + "epoch": 0.2828282828282828, + "grad_norm": 2.3255205154418945, + "learning_rate": 0.00019434343434343435, + "loss": 0.3766, + "step": 280 + }, + { + "epoch": 0.29292929292929293, + "grad_norm": 1.4832454919815063, + "learning_rate": 0.00019414141414141417, + "loss": 0.2122, + "step": 290 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 4.230952739715576, + "learning_rate": 0.00019393939393939395, + "loss": 0.2135, + "step": 300 + }, + { + "epoch": 0.30303030303030304, + "eval_accuracy": 0.7837450769781596, + "eval_loss": 0.6753332018852234, + "eval_runtime": 25.3952, + "eval_samples_per_second": 109.981, + "eval_steps_per_second": 13.782, + "step": 300 + }, + { + "epoch": 0.31313131313131315, + "grad_norm": 3.7198104858398438, + "learning_rate": 0.00019373737373737376, + "loss": 0.3568, + "step": 310 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 2.953439950942993, + "learning_rate": 0.00019353535353535355, + "loss": 0.2951, + "step": 320 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.3283486366271973, + "learning_rate": 0.00019333333333333333, + "loss": 0.3625, + "step": 330 + }, + { + "epoch": 0.3434343434343434, + "grad_norm": 2.153027296066284, + "learning_rate": 0.00019313131313131315, + "loss": 0.4129, + "step": 340 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 1.3729703426361084, + "learning_rate": 0.00019292929292929293, + "loss": 0.268, + "step": 350 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.5054312348365784, + "learning_rate": 0.00019272727272727274, + "loss": 0.1966, + "step": 360 + }, + { + "epoch": 0.37373737373737376, + "grad_norm": 3.2167739868164062, + "learning_rate": 0.00019252525252525253, + "loss": 0.308, + "step": 370 + }, + { + "epoch": 0.3838383838383838, + "grad_norm": 2.139538049697876, + "learning_rate": 0.00019232323232323232, + "loss": 0.3178, + "step": 380 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 1.4604796171188354, + "learning_rate": 0.00019212121212121213, + "loss": 0.2593, + "step": 390 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.86461341381073, + "learning_rate": 0.00019191919191919191, + "loss": 0.214, + "step": 400 + }, + { + "epoch": 0.40404040404040403, + "eval_accuracy": 0.8900823487289653, + "eval_loss": 0.26901283860206604, + "eval_runtime": 25.5497, + "eval_samples_per_second": 109.316, + "eval_steps_per_second": 13.699, + "step": 400 + }, + { + "epoch": 0.41414141414141414, + "grad_norm": 1.9686763286590576, + "learning_rate": 0.00019171717171717173, + "loss": 0.2308, + "step": 410 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 1.2210965156555176, + "learning_rate": 0.0001915151515151515, + "loss": 0.2931, + "step": 420 + }, + { + "epoch": 0.43434343434343436, + "grad_norm": 5.511355876922607, + "learning_rate": 0.00019131313131313132, + "loss": 0.3356, + "step": 430 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 2.9325032234191895, + "learning_rate": 0.00019111111111111114, + "loss": 0.3229, + "step": 440 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 3.8226077556610107, + "learning_rate": 0.00019090909090909092, + "loss": 0.2119, + "step": 450 + }, + { + "epoch": 0.46464646464646464, + "grad_norm": 1.5739362239837646, + "learning_rate": 0.00019070707070707073, + "loss": 0.2065, + "step": 460 + }, + { + "epoch": 0.47474747474747475, + "grad_norm": 1.6280564069747925, + "learning_rate": 0.00019050505050505052, + "loss": 0.2099, + "step": 470 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 2.8299176692962646, + "learning_rate": 0.0001903030303030303, + "loss": 0.3148, + "step": 480 + }, + { + "epoch": 0.494949494949495, + "grad_norm": 2.1273374557495117, + "learning_rate": 0.00019010101010101012, + "loss": 0.259, + "step": 490 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 2.3539364337921143, + "learning_rate": 0.0001898989898989899, + "loss": 0.1947, + "step": 500 + }, + { + "epoch": 0.5050505050505051, + "eval_accuracy": 0.9112065878983172, + "eval_loss": 0.253263920545578, + "eval_runtime": 26.1993, + "eval_samples_per_second": 106.606, + "eval_steps_per_second": 13.359, + "step": 500 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 1.3514348268508911, + "learning_rate": 0.00018969696969696972, + "loss": 0.2791, + "step": 510 + }, + { + "epoch": 0.5252525252525253, + "grad_norm": 1.6897673606872559, + "learning_rate": 0.0001894949494949495, + "loss": 0.4146, + "step": 520 + }, + { + "epoch": 0.5353535353535354, + "grad_norm": 1.5191751718521118, + "learning_rate": 0.0001892929292929293, + "loss": 0.2541, + "step": 530 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 2.3383405208587646, + "learning_rate": 0.0001890909090909091, + "loss": 0.286, + "step": 540 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.0000243186950684, + "learning_rate": 0.00018888888888888888, + "loss": 0.3235, + "step": 550 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 2.458211660385132, + "learning_rate": 0.0001886868686868687, + "loss": 0.2833, + "step": 560 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 1.1083074808120728, + "learning_rate": 0.00018848484848484848, + "loss": 0.3214, + "step": 570 + }, + { + "epoch": 0.5858585858585859, + "grad_norm": 4.410639762878418, + "learning_rate": 0.0001882828282828283, + "loss": 0.2404, + "step": 580 + }, + { + "epoch": 0.5959595959595959, + "grad_norm": 0.5000817775726318, + "learning_rate": 0.00018808080808080808, + "loss": 0.2423, + "step": 590 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.757890522480011, + "learning_rate": 0.0001878787878787879, + "loss": 0.3618, + "step": 600 + }, + { + "epoch": 0.6060606060606061, + "eval_accuracy": 0.8571428571428571, + "eval_loss": 0.37378671765327454, + "eval_runtime": 25.3641, + "eval_samples_per_second": 110.116, + "eval_steps_per_second": 13.799, + "step": 600 + }, + { + "epoch": 0.6161616161616161, + "grad_norm": 1.08816397190094, + "learning_rate": 0.0001876767676767677, + "loss": 0.2873, + "step": 610 + }, + { + "epoch": 0.6262626262626263, + "grad_norm": 0.5049443244934082, + "learning_rate": 0.0001874747474747475, + "loss": 0.2176, + "step": 620 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.9618722796440125, + "learning_rate": 0.00018727272727272728, + "loss": 0.2006, + "step": 630 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 0.9479689598083496, + "learning_rate": 0.0001870707070707071, + "loss": 0.357, + "step": 640 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 1.7955048084259033, + "learning_rate": 0.00018686868686868687, + "loss": 0.2619, + "step": 650 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.3254830837249756, + "learning_rate": 0.0001866666666666667, + "loss": 0.2783, + "step": 660 + }, + { + "epoch": 0.6767676767676768, + "grad_norm": 1.4659123420715332, + "learning_rate": 0.00018646464646464647, + "loss": 0.3846, + "step": 670 + }, + { + "epoch": 0.6868686868686869, + "grad_norm": 0.815553605556488, + "learning_rate": 0.00018626262626262628, + "loss": 0.2791, + "step": 680 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 3.2888660430908203, + "learning_rate": 0.00018606060606060607, + "loss": 0.3321, + "step": 690 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 0.5592217445373535, + "learning_rate": 0.00018585858585858586, + "loss": 0.2065, + "step": 700 + }, + { + "epoch": 0.7070707070707071, + "eval_accuracy": 0.8918725384890799, + "eval_loss": 0.2918793261051178, + "eval_runtime": 25.5852, + "eval_samples_per_second": 109.165, + "eval_steps_per_second": 13.68, + "step": 700 + }, + { + "epoch": 0.7171717171717171, + "grad_norm": 2.0169355869293213, + "learning_rate": 0.00018565656565656567, + "loss": 0.2735, + "step": 710 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.9667913913726807, + "learning_rate": 0.00018545454545454545, + "loss": 0.2697, + "step": 720 + }, + { + "epoch": 0.7373737373737373, + "grad_norm": 1.8768874406814575, + "learning_rate": 0.00018525252525252527, + "loss": 0.3328, + "step": 730 + }, + { + "epoch": 0.7474747474747475, + "grad_norm": 2.070849657058716, + "learning_rate": 0.00018505050505050505, + "loss": 0.2292, + "step": 740 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.9167304039001465, + "learning_rate": 0.00018484848484848484, + "loss": 0.2775, + "step": 750 + }, + { + "epoch": 0.7676767676767676, + "grad_norm": 1.886155128479004, + "learning_rate": 0.00018464646464646465, + "loss": 0.2064, + "step": 760 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 1.4632712602615356, + "learning_rate": 0.00018444444444444446, + "loss": 0.2472, + "step": 770 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 0.513150155544281, + "learning_rate": 0.00018424242424242427, + "loss": 0.1806, + "step": 780 + }, + { + "epoch": 0.797979797979798, + "grad_norm": 1.805382251739502, + "learning_rate": 0.00018404040404040406, + "loss": 0.2717, + "step": 790 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 1.0646640062332153, + "learning_rate": 0.00018383838383838384, + "loss": 0.3103, + "step": 800 + }, + { + "epoch": 0.8080808080808081, + "eval_accuracy": 0.9169351951306839, + "eval_loss": 0.21647529304027557, + "eval_runtime": 25.8731, + "eval_samples_per_second": 107.95, + "eval_steps_per_second": 13.528, + "step": 800 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.027064323425293, + "learning_rate": 0.00018363636363636366, + "loss": 0.2464, + "step": 810 + }, + { + "epoch": 0.8282828282828283, + "grad_norm": 3.9652762413024902, + "learning_rate": 0.00018343434343434344, + "loss": 0.1533, + "step": 820 + }, + { + "epoch": 0.8383838383838383, + "grad_norm": 1.7400450706481934, + "learning_rate": 0.00018323232323232326, + "loss": 0.3409, + "step": 830 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 1.1549253463745117, + "learning_rate": 0.00018303030303030304, + "loss": 0.3031, + "step": 840 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 0.6372619867324829, + "learning_rate": 0.00018282828282828283, + "loss": 0.2325, + "step": 850 + }, + { + "epoch": 0.8686868686868687, + "grad_norm": 1.2321391105651855, + "learning_rate": 0.00018262626262626264, + "loss": 0.2321, + "step": 860 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 1.4778344631195068, + "learning_rate": 0.00018242424242424242, + "loss": 0.3474, + "step": 870 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.5331010818481445, + "learning_rate": 0.00018222222222222224, + "loss": 0.1914, + "step": 880 + }, + { + "epoch": 0.898989898989899, + "grad_norm": 1.8820871114730835, + "learning_rate": 0.00018202020202020202, + "loss": 0.2703, + "step": 890 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.8015584945678711, + "learning_rate": 0.00018181818181818183, + "loss": 0.1479, + "step": 900 + }, + { + "epoch": 0.9090909090909091, + "eval_accuracy": 0.9172932330827067, + "eval_loss": 0.21352139115333557, + "eval_runtime": 25.4544, + "eval_samples_per_second": 109.726, + "eval_steps_per_second": 13.75, + "step": 900 + }, + { + "epoch": 0.9191919191919192, + "grad_norm": 8.3115873336792, + "learning_rate": 0.00018161616161616162, + "loss": 0.2335, + "step": 910 + }, + { + "epoch": 0.9292929292929293, + "grad_norm": 3.1177310943603516, + "learning_rate": 0.0001814141414141414, + "loss": 0.3137, + "step": 920 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 1.4966002702713013, + "learning_rate": 0.00018121212121212122, + "loss": 0.1858, + "step": 930 + }, + { + "epoch": 0.9494949494949495, + "grad_norm": 2.699180841445923, + "learning_rate": 0.00018101010101010103, + "loss": 0.2697, + "step": 940 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 3.030885934829712, + "learning_rate": 0.00018080808080808082, + "loss": 0.2867, + "step": 950 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 1.1255096197128296, + "learning_rate": 0.00018060606060606063, + "loss": 0.2376, + "step": 960 + }, + { + "epoch": 0.9797979797979798, + "grad_norm": 1.2400470972061157, + "learning_rate": 0.0001804040404040404, + "loss": 0.321, + "step": 970 + }, + { + "epoch": 0.98989898989899, + "grad_norm": 2.516021251678467, + "learning_rate": 0.00018020202020202023, + "loss": 0.2195, + "step": 980 + }, + { + "epoch": 1.0, + "grad_norm": NaN, + "learning_rate": 0.00018002020202020203, + "loss": 0.1921, + "step": 990 + }, + { + "epoch": 1.0101010101010102, + "grad_norm": 2.886852741241455, + "learning_rate": 0.0001798181818181818, + "loss": 0.2421, + "step": 1000 + }, + { + "epoch": 1.0101010101010102, + "eval_accuracy": 0.9183673469387755, + "eval_loss": 0.21867763996124268, + "eval_runtime": 25.6972, + "eval_samples_per_second": 108.689, + "eval_steps_per_second": 13.62, + "step": 1000 + }, + { + "epoch": 1.02020202020202, + "grad_norm": 0.8659813404083252, + "learning_rate": 0.00017961616161616163, + "loss": 0.219, + "step": 1010 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 0.21378262341022491, + "learning_rate": 0.0001794141414141414, + "loss": 0.159, + "step": 1020 + }, + { + "epoch": 1.0404040404040404, + "grad_norm": 0.6388025879859924, + "learning_rate": 0.00017921212121212122, + "loss": 0.2614, + "step": 1030 + }, + { + "epoch": 1.0505050505050506, + "grad_norm": 3.3048598766326904, + "learning_rate": 0.00017901010101010104, + "loss": 0.2088, + "step": 1040 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 1.8534300327301025, + "learning_rate": 0.00017880808080808082, + "loss": 0.1894, + "step": 1050 + }, + { + "epoch": 1.0707070707070707, + "grad_norm": 0.3146544098854065, + "learning_rate": 0.00017860606060606063, + "loss": 0.2314, + "step": 1060 + }, + { + "epoch": 1.0808080808080809, + "grad_norm": 4.180379867553711, + "learning_rate": 0.00017840404040404042, + "loss": 0.2217, + "step": 1070 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 1.628567099571228, + "learning_rate": 0.0001782020202020202, + "loss": 0.1622, + "step": 1080 + }, + { + "epoch": 1.101010101010101, + "grad_norm": 1.6040252447128296, + "learning_rate": 0.00017800000000000002, + "loss": 0.217, + "step": 1090 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 1.2518551349639893, + "learning_rate": 0.0001777979797979798, + "loss": 0.2264, + "step": 1100 + }, + { + "epoch": 1.1111111111111112, + "eval_accuracy": 0.920515574650913, + "eval_loss": 0.18880097568035126, + "eval_runtime": 26.2729, + "eval_samples_per_second": 106.307, + "eval_steps_per_second": 13.322, + "step": 1100 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 3.4157488346099854, + "learning_rate": 0.00017759595959595961, + "loss": 0.1981, + "step": 1110 + }, + { + "epoch": 1.1313131313131313, + "grad_norm": 1.492838978767395, + "learning_rate": 0.0001773939393939394, + "loss": 0.1776, + "step": 1120 + }, + { + "epoch": 1.1414141414141414, + "grad_norm": 2.0432426929473877, + "learning_rate": 0.00017719191919191919, + "loss": 0.2956, + "step": 1130 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 1.3289395570755005, + "learning_rate": 0.000176989898989899, + "loss": 0.17, + "step": 1140 + }, + { + "epoch": 1.1616161616161615, + "grad_norm": 2.753247022628784, + "learning_rate": 0.00017678787878787878, + "loss": 0.1965, + "step": 1150 + }, + { + "epoch": 1.1717171717171717, + "grad_norm": 2.479680061340332, + "learning_rate": 0.0001765858585858586, + "loss": 0.2353, + "step": 1160 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 5.093598365783691, + "learning_rate": 0.00017638383838383838, + "loss": 0.3105, + "step": 1170 + }, + { + "epoch": 1.1919191919191918, + "grad_norm": 1.4242653846740723, + "learning_rate": 0.0001761818181818182, + "loss": 0.1905, + "step": 1180 + }, + { + "epoch": 1.202020202020202, + "grad_norm": 0.864905834197998, + "learning_rate": 0.00017597979797979798, + "loss": 0.182, + "step": 1190 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 1.4929953813552856, + "learning_rate": 0.0001757777777777778, + "loss": 0.1664, + "step": 1200 + }, + { + "epoch": 1.2121212121212122, + "eval_accuracy": 0.8875760830648048, + "eval_loss": 0.2607349455356598, + "eval_runtime": 26.13, + "eval_samples_per_second": 106.889, + "eval_steps_per_second": 13.395, + "step": 1200 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 2.241507053375244, + "learning_rate": 0.0001755757575757576, + "loss": 0.2211, + "step": 1210 + }, + { + "epoch": 1.2323232323232323, + "grad_norm": 8.578768730163574, + "learning_rate": 0.0001753737373737374, + "loss": 0.2079, + "step": 1220 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 2.6742098331451416, + "learning_rate": 0.00017517171717171718, + "loss": 0.1737, + "step": 1230 + }, + { + "epoch": 1.2525252525252526, + "grad_norm": 1.3255178928375244, + "learning_rate": 0.000174969696969697, + "loss": 0.2761, + "step": 1240 + }, + { + "epoch": 1.2626262626262625, + "grad_norm": 1.0575960874557495, + "learning_rate": 0.00017476767676767677, + "loss": 0.2031, + "step": 1250 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.9928869009017944, + "learning_rate": 0.00017456565656565659, + "loss": 0.253, + "step": 1260 + }, + { + "epoch": 1.2828282828282829, + "grad_norm": 5.389945983886719, + "learning_rate": 0.00017436363636363637, + "loss": 0.225, + "step": 1270 + }, + { + "epoch": 1.2929292929292928, + "grad_norm": 0.9251638650894165, + "learning_rate": 0.00017416161616161618, + "loss": 0.2688, + "step": 1280 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 2.1235504150390625, + "learning_rate": 0.00017395959595959597, + "loss": 0.2621, + "step": 1290 + }, + { + "epoch": 1.3131313131313131, + "grad_norm": 1.9619354009628296, + "learning_rate": 0.00017375757575757575, + "loss": 0.2049, + "step": 1300 + }, + { + "epoch": 1.3131313131313131, + "eval_accuracy": 0.9004654493376298, + "eval_loss": 0.25018033385276794, + "eval_runtime": 26.4939, + "eval_samples_per_second": 105.421, + "eval_steps_per_second": 13.211, + "step": 1300 + }, + { + "epoch": 1.3232323232323233, + "grad_norm": 0.6495092511177063, + "learning_rate": 0.00017355555555555557, + "loss": 0.2455, + "step": 1310 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.8858203887939453, + "learning_rate": 0.00017335353535353535, + "loss": 0.1854, + "step": 1320 + }, + { + "epoch": 1.3434343434343434, + "grad_norm": 0.21864072978496552, + "learning_rate": 0.00017315151515151516, + "loss": 0.2113, + "step": 1330 + }, + { + "epoch": 1.3535353535353536, + "grad_norm": 0.12539438903331757, + "learning_rate": 0.00017294949494949495, + "loss": 0.1762, + "step": 1340 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 1.0429880619049072, + "learning_rate": 0.00017274747474747474, + "loss": 0.203, + "step": 1350 + }, + { + "epoch": 1.3737373737373737, + "grad_norm": 3.3977394104003906, + "learning_rate": 0.00017254545454545455, + "loss": 0.2309, + "step": 1360 + }, + { + "epoch": 1.3838383838383839, + "grad_norm": 1.9078309535980225, + "learning_rate": 0.00017234343434343436, + "loss": 0.2731, + "step": 1370 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 0.43644586205482483, + "learning_rate": 0.00017214141414141415, + "loss": 0.2085, + "step": 1380 + }, + { + "epoch": 1.404040404040404, + "grad_norm": 0.9476821422576904, + "learning_rate": 0.00017193939393939396, + "loss": 0.2078, + "step": 1390 + }, + { + "epoch": 1.4141414141414141, + "grad_norm": 0.5371082425117493, + "learning_rate": 0.00017173737373737374, + "loss": 0.1503, + "step": 1400 + }, + { + "epoch": 1.4141414141414141, + "eval_accuracy": 0.9176512710347297, + "eval_loss": 0.23049259185791016, + "eval_runtime": 25.429, + "eval_samples_per_second": 109.835, + "eval_steps_per_second": 13.764, + "step": 1400 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 3.250983953475952, + "learning_rate": 0.00017153535353535356, + "loss": 0.2156, + "step": 1410 + }, + { + "epoch": 1.4343434343434343, + "grad_norm": 0.3875192701816559, + "learning_rate": 0.00017133333333333334, + "loss": 0.1947, + "step": 1420 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 1.1166036128997803, + "learning_rate": 0.00017113131313131315, + "loss": 0.2337, + "step": 1430 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 1.6529600620269775, + "learning_rate": 0.00017092929292929294, + "loss": 0.2475, + "step": 1440 + }, + { + "epoch": 1.4646464646464645, + "grad_norm": 0.8546658754348755, + "learning_rate": 0.00017072727272727273, + "loss": 0.1893, + "step": 1450 + }, + { + "epoch": 1.4747474747474747, + "grad_norm": 1.107015609741211, + "learning_rate": 0.00017052525252525254, + "loss": 0.3021, + "step": 1460 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 2.452491521835327, + "learning_rate": 0.00017032323232323232, + "loss": 0.1447, + "step": 1470 + }, + { + "epoch": 1.494949494949495, + "grad_norm": 0.7565712332725525, + "learning_rate": 0.00017012121212121214, + "loss": 0.2072, + "step": 1480 + }, + { + "epoch": 1.5050505050505052, + "grad_norm": 2.821697235107422, + "learning_rate": 0.00016991919191919192, + "loss": 0.3105, + "step": 1490 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 3.1241745948791504, + "learning_rate": 0.0001697171717171717, + "loss": 0.1846, + "step": 1500 + }, + { + "epoch": 1.5151515151515151, + "eval_accuracy": 0.9219477264590047, + "eval_loss": 0.18808439373970032, + "eval_runtime": 25.8091, + "eval_samples_per_second": 108.218, + "eval_steps_per_second": 13.561, + "step": 1500 + }, + { + "epoch": 1.5252525252525253, + "grad_norm": 0.7479563355445862, + "learning_rate": 0.00016951515151515152, + "loss": 0.2122, + "step": 1510 + }, + { + "epoch": 1.5353535353535355, + "grad_norm": 2.4693410396575928, + "learning_rate": 0.0001693131313131313, + "loss": 0.2165, + "step": 1520 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 1.7394063472747803, + "learning_rate": 0.00016911111111111112, + "loss": 0.2891, + "step": 1530 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 1.5887813568115234, + "learning_rate": 0.00016890909090909093, + "loss": 0.237, + "step": 1540 + }, + { + "epoch": 1.5656565656565657, + "grad_norm": 1.238959550857544, + "learning_rate": 0.00016870707070707071, + "loss": 0.2818, + "step": 1550 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 1.3330116271972656, + "learning_rate": 0.00016850505050505053, + "loss": 0.2991, + "step": 1560 + }, + { + "epoch": 1.5858585858585859, + "grad_norm": 1.5314040184020996, + "learning_rate": 0.0001683030303030303, + "loss": 0.185, + "step": 1570 + }, + { + "epoch": 1.595959595959596, + "grad_norm": 1.6973659992218018, + "learning_rate": 0.00016810101010101013, + "loss": 0.3002, + "step": 1580 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 1.6625034809112549, + "learning_rate": 0.0001678989898989899, + "loss": 0.1839, + "step": 1590 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.5937618613243103, + "learning_rate": 0.0001676969696969697, + "loss": 0.1571, + "step": 1600 + }, + { + "epoch": 1.6161616161616161, + "eval_accuracy": 0.9283924095954171, + "eval_loss": 0.17884130775928497, + "eval_runtime": 25.8191, + "eval_samples_per_second": 108.176, + "eval_steps_per_second": 13.556, + "step": 1600 + }, + { + "epoch": 1.6262626262626263, + "grad_norm": 4.288907527923584, + "learning_rate": 0.0001674949494949495, + "loss": 0.1889, + "step": 1610 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 2.5653529167175293, + "learning_rate": 0.0001672929292929293, + "loss": 0.2897, + "step": 1620 + }, + { + "epoch": 1.6464646464646466, + "grad_norm": 0.3338545858860016, + "learning_rate": 0.0001670909090909091, + "loss": 0.1289, + "step": 1630 + }, + { + "epoch": 1.6565656565656566, + "grad_norm": 2.711522340774536, + "learning_rate": 0.0001668888888888889, + "loss": 0.248, + "step": 1640 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 4.693909168243408, + "learning_rate": 0.0001666868686868687, + "loss": 0.1882, + "step": 1650 + }, + { + "epoch": 1.676767676767677, + "grad_norm": 1.3561639785766602, + "learning_rate": 0.0001664848484848485, + "loss": 0.2793, + "step": 1660 + }, + { + "epoch": 1.6868686868686869, + "grad_norm": 1.355980396270752, + "learning_rate": 0.00016628282828282828, + "loss": 0.2074, + "step": 1670 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 1.0056601762771606, + "learning_rate": 0.0001660808080808081, + "loss": 0.1986, + "step": 1680 + }, + { + "epoch": 1.7070707070707072, + "grad_norm": 2.7240686416625977, + "learning_rate": 0.00016587878787878787, + "loss": 0.2223, + "step": 1690 + }, + { + "epoch": 1.7171717171717171, + "grad_norm": 2.4127962589263916, + "learning_rate": 0.00016567676767676769, + "loss": 0.4091, + "step": 1700 + }, + { + "epoch": 1.7171717171717171, + "eval_accuracy": 0.9215896885069818, + "eval_loss": 0.2227836698293686, + "eval_runtime": 26.1924, + "eval_samples_per_second": 106.634, + "eval_steps_per_second": 13.363, + "step": 1700 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 2.2027206420898438, + "learning_rate": 0.0001654747474747475, + "loss": 0.1898, + "step": 1710 + }, + { + "epoch": 1.7373737373737375, + "grad_norm": 3.467402219772339, + "learning_rate": 0.00016527272727272728, + "loss": 0.2457, + "step": 1720 + }, + { + "epoch": 1.7474747474747474, + "grad_norm": 2.0899109840393066, + "learning_rate": 0.0001650707070707071, + "loss": 0.1725, + "step": 1730 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 2.3485350608825684, + "learning_rate": 0.00016486868686868688, + "loss": 0.2203, + "step": 1740 + }, + { + "epoch": 1.7676767676767677, + "grad_norm": 0.7051331996917725, + "learning_rate": 0.00016466666666666667, + "loss": 0.2484, + "step": 1750 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.6105542182922363, + "learning_rate": 0.00016446464646464648, + "loss": 0.2507, + "step": 1760 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 1.771294116973877, + "learning_rate": 0.00016426262626262626, + "loss": 0.2389, + "step": 1770 + }, + { + "epoch": 1.797979797979798, + "grad_norm": 0.9679822325706482, + "learning_rate": 0.00016406060606060608, + "loss": 0.2528, + "step": 1780 + }, + { + "epoch": 1.808080808080808, + "grad_norm": 0.7424560785293579, + "learning_rate": 0.00016385858585858586, + "loss": 0.1246, + "step": 1790 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 4.788498878479004, + "learning_rate": 0.00016365656565656568, + "loss": 0.2954, + "step": 1800 + }, + { + "epoch": 1.8181818181818183, + "eval_accuracy": 0.9366272824919442, + "eval_loss": 0.16531354188919067, + "eval_runtime": 25.6248, + "eval_samples_per_second": 108.996, + "eval_steps_per_second": 13.659, + "step": 1800 + }, + { + "epoch": 1.8282828282828283, + "grad_norm": 2.367147445678711, + "learning_rate": 0.00016345454545454546, + "loss": 0.1796, + "step": 1810 + }, + { + "epoch": 1.8383838383838382, + "grad_norm": 0.9937007427215576, + "learning_rate": 0.00016325252525252525, + "loss": 0.1957, + "step": 1820 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 2.382399559020996, + "learning_rate": 0.00016305050505050506, + "loss": 0.1189, + "step": 1830 + }, + { + "epoch": 1.8585858585858586, + "grad_norm": 2.237720251083374, + "learning_rate": 0.00016284848484848484, + "loss": 0.1741, + "step": 1840 + }, + { + "epoch": 1.8686868686868687, + "grad_norm": 1.3761154413223267, + "learning_rate": 0.00016264646464646466, + "loss": 0.1808, + "step": 1850 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 1.0046234130859375, + "learning_rate": 0.00016244444444444444, + "loss": 0.4006, + "step": 1860 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5912858843803406, + "learning_rate": 0.00016224242424242423, + "loss": 0.2065, + "step": 1870 + }, + { + "epoch": 1.898989898989899, + "grad_norm": 2.322145700454712, + "learning_rate": 0.00016204040404040407, + "loss": 0.3358, + "step": 1880 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.40922489762306213, + "learning_rate": 0.00016183838383838385, + "loss": 0.1255, + "step": 1890 + }, + { + "epoch": 1.9191919191919191, + "grad_norm": 1.023874044418335, + "learning_rate": 0.00016163636363636366, + "loss": 0.1366, + "step": 1900 + }, + { + "epoch": 1.9191919191919191, + "eval_accuracy": 0.9419978517722879, + "eval_loss": 0.15291424095630646, + "eval_runtime": 25.7466, + "eval_samples_per_second": 108.481, + "eval_steps_per_second": 13.594, + "step": 1900 + }, + { + "epoch": 1.9292929292929293, + "grad_norm": 1.4084643125534058, + "learning_rate": 0.00016143434343434345, + "loss": 0.2258, + "step": 1910 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 3.0177438259124756, + "learning_rate": 0.00016123232323232324, + "loss": 0.1898, + "step": 1920 + }, + { + "epoch": 1.9494949494949494, + "grad_norm": 1.6898006200790405, + "learning_rate": 0.00016103030303030305, + "loss": 0.2047, + "step": 1930 + }, + { + "epoch": 1.9595959595959596, + "grad_norm": 1.7388184070587158, + "learning_rate": 0.00016082828282828283, + "loss": 0.1772, + "step": 1940 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 2.4235947132110596, + "learning_rate": 0.00016062626262626265, + "loss": 0.1882, + "step": 1950 + }, + { + "epoch": 1.9797979797979797, + "grad_norm": 2.8717238903045654, + "learning_rate": 0.00016042424242424243, + "loss": 0.107, + "step": 1960 + }, + { + "epoch": 1.98989898989899, + "grad_norm": 1.5559844970703125, + "learning_rate": 0.00016022222222222222, + "loss": 0.188, + "step": 1970 + }, + { + "epoch": 2.0, + "grad_norm": 0.30049532651901245, + "learning_rate": 0.00016002020202020203, + "loss": 0.2067, + "step": 1980 + }, + { + "epoch": 2.01010101010101, + "grad_norm": 0.3831079602241516, + "learning_rate": 0.00015981818181818181, + "loss": 0.1721, + "step": 1990 + }, + { + "epoch": 2.0202020202020203, + "grad_norm": 2.535356044769287, + "learning_rate": 0.00015961616161616163, + "loss": 0.1657, + "step": 2000 + }, + { + "epoch": 2.0202020202020203, + "eval_accuracy": 0.9255281059792339, + "eval_loss": 0.1745312362909317, + "eval_runtime": 26.0851, + "eval_samples_per_second": 107.072, + "eval_steps_per_second": 13.418, + "step": 2000 + }, + { + "epoch": 2.0303030303030303, + "grad_norm": 2.323450803756714, + "learning_rate": 0.0001594141414141414, + "loss": 0.1716, + "step": 2010 + }, + { + "epoch": 2.04040404040404, + "grad_norm": 3.293064832687378, + "learning_rate": 0.00015921212121212122, + "loss": 0.2175, + "step": 2020 + }, + { + "epoch": 2.0505050505050506, + "grad_norm": 3.534296751022339, + "learning_rate": 0.000159010101010101, + "loss": 0.1455, + "step": 2030 + }, + { + "epoch": 2.0606060606060606, + "grad_norm": 3.079486846923828, + "learning_rate": 0.0001588080808080808, + "loss": 0.2201, + "step": 2040 + }, + { + "epoch": 2.0707070707070705, + "grad_norm": 1.527884602546692, + "learning_rate": 0.00015860606060606064, + "loss": 0.1364, + "step": 2050 + }, + { + "epoch": 2.080808080808081, + "grad_norm": 3.254300594329834, + "learning_rate": 0.00015840404040404042, + "loss": 0.1776, + "step": 2060 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 1.566991925239563, + "learning_rate": 0.0001582020202020202, + "loss": 0.1876, + "step": 2070 + }, + { + "epoch": 2.101010101010101, + "grad_norm": 3.1248011589050293, + "learning_rate": 0.00015800000000000002, + "loss": 0.1674, + "step": 2080 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 2.1448099613189697, + "learning_rate": 0.0001577979797979798, + "loss": 0.0895, + "step": 2090 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 3.84352970123291, + "learning_rate": 0.00015759595959595962, + "loss": 0.2531, + "step": 2100 + }, + { + "epoch": 2.121212121212121, + "eval_accuracy": 0.9380594343000358, + "eval_loss": 0.17440764605998993, + "eval_runtime": 26.0155, + "eval_samples_per_second": 107.359, + "eval_steps_per_second": 13.454, + "step": 2100 + }, + { + "epoch": 2.1313131313131315, + "grad_norm": 3.8544914722442627, + "learning_rate": 0.0001573939393939394, + "loss": 0.2031, + "step": 2110 + }, + { + "epoch": 2.1414141414141414, + "grad_norm": 1.2926223278045654, + "learning_rate": 0.00015719191919191921, + "loss": 0.1186, + "step": 2120 + }, + { + "epoch": 2.1515151515151514, + "grad_norm": 1.7553162574768066, + "learning_rate": 0.000156989898989899, + "loss": 0.1749, + "step": 2130 + }, + { + "epoch": 2.1616161616161618, + "grad_norm": 1.2930015325546265, + "learning_rate": 0.00015678787878787879, + "loss": 0.1639, + "step": 2140 + }, + { + "epoch": 2.1717171717171717, + "grad_norm": 1.4020500183105469, + "learning_rate": 0.0001565858585858586, + "loss": 0.1807, + "step": 2150 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 3.5150535106658936, + "learning_rate": 0.00015638383838383838, + "loss": 0.1325, + "step": 2160 + }, + { + "epoch": 2.191919191919192, + "grad_norm": 0.6746648550033569, + "learning_rate": 0.0001561818181818182, + "loss": 0.1361, + "step": 2170 + }, + { + "epoch": 2.202020202020202, + "grad_norm": 4.897150993347168, + "learning_rate": 0.00015597979797979798, + "loss": 0.2186, + "step": 2180 + }, + { + "epoch": 2.212121212121212, + "grad_norm": 1.577208399772644, + "learning_rate": 0.00015577777777777777, + "loss": 0.3264, + "step": 2190 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 1.9404233694076538, + "learning_rate": 0.00015557575757575758, + "loss": 0.152, + "step": 2200 + }, + { + "epoch": 2.2222222222222223, + "eval_accuracy": 0.8950948800572861, + "eval_loss": 0.2513238191604614, + "eval_runtime": 25.2915, + "eval_samples_per_second": 110.432, + "eval_steps_per_second": 13.839, + "step": 2200 + }, + { + "epoch": 2.2323232323232323, + "grad_norm": 1.1842570304870605, + "learning_rate": 0.00015537373737373736, + "loss": 0.1947, + "step": 2210 + }, + { + "epoch": 2.242424242424242, + "grad_norm": 2.7548556327819824, + "learning_rate": 0.00015517171717171718, + "loss": 0.1633, + "step": 2220 + }, + { + "epoch": 2.2525252525252526, + "grad_norm": 1.2859703302383423, + "learning_rate": 0.000154969696969697, + "loss": 0.1619, + "step": 2230 + }, + { + "epoch": 2.2626262626262625, + "grad_norm": 5.785464286804199, + "learning_rate": 0.00015476767676767677, + "loss": 0.2364, + "step": 2240 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 1.2347121238708496, + "learning_rate": 0.0001545656565656566, + "loss": 0.1638, + "step": 2250 + }, + { + "epoch": 2.282828282828283, + "grad_norm": 2.9314799308776855, + "learning_rate": 0.00015436363636363637, + "loss": 0.2124, + "step": 2260 + }, + { + "epoch": 2.292929292929293, + "grad_norm": 1.1902947425842285, + "learning_rate": 0.00015416161616161619, + "loss": 0.1228, + "step": 2270 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 2.0819814205169678, + "learning_rate": 0.00015395959595959597, + "loss": 0.1572, + "step": 2280 + }, + { + "epoch": 2.313131313131313, + "grad_norm": 1.9850718975067139, + "learning_rate": 0.00015375757575757576, + "loss": 0.1537, + "step": 2290 + }, + { + "epoch": 2.323232323232323, + "grad_norm": 2.0210001468658447, + "learning_rate": 0.00015355555555555557, + "loss": 0.145, + "step": 2300 + }, + { + "epoch": 2.323232323232323, + "eval_accuracy": 0.9301825993555317, + "eval_loss": 0.1718265563249588, + "eval_runtime": 25.859, + "eval_samples_per_second": 108.009, + "eval_steps_per_second": 13.535, + "step": 2300 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 2.4831063747406006, + "learning_rate": 0.00015335353535353535, + "loss": 0.1163, + "step": 2310 + }, + { + "epoch": 2.3434343434343434, + "grad_norm": 1.1499810218811035, + "learning_rate": 0.00015315151515151517, + "loss": 0.1759, + "step": 2320 + }, + { + "epoch": 2.3535353535353534, + "grad_norm": 2.068272113800049, + "learning_rate": 0.00015294949494949495, + "loss": 0.1668, + "step": 2330 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 1.8722370862960815, + "learning_rate": 0.00015274747474747474, + "loss": 0.1656, + "step": 2340 + }, + { + "epoch": 2.3737373737373737, + "grad_norm": 2.1109440326690674, + "learning_rate": 0.00015254545454545455, + "loss": 0.115, + "step": 2350 + }, + { + "epoch": 2.3838383838383836, + "grad_norm": 1.9140312671661377, + "learning_rate": 0.00015234343434343434, + "loss": 0.2942, + "step": 2360 + }, + { + "epoch": 2.393939393939394, + "grad_norm": 0.6813613176345825, + "learning_rate": 0.00015214141414141415, + "loss": 0.1885, + "step": 2370 + }, + { + "epoch": 2.404040404040404, + "grad_norm": 2.1684365272521973, + "learning_rate": 0.00015193939393939393, + "loss": 0.1623, + "step": 2380 + }, + { + "epoch": 2.4141414141414144, + "grad_norm": 1.1685301065444946, + "learning_rate": 0.00015173737373737375, + "loss": 0.1993, + "step": 2390 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 4.124536991119385, + "learning_rate": 0.00015153535353535356, + "loss": 0.202, + "step": 2400 + }, + { + "epoch": 2.4242424242424243, + "eval_accuracy": 0.9033297529538131, + "eval_loss": 0.243553027510643, + "eval_runtime": 26.2432, + "eval_samples_per_second": 106.428, + "eval_steps_per_second": 13.337, + "step": 2400 + }, + { + "epoch": 2.4343434343434343, + "grad_norm": 2.1310033798217773, + "learning_rate": 0.00015133333333333334, + "loss": 0.1787, + "step": 2410 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 1.696519374847412, + "learning_rate": 0.00015113131313131316, + "loss": 0.1145, + "step": 2420 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 0.22621920704841614, + "learning_rate": 0.00015092929292929294, + "loss": 0.1751, + "step": 2430 + }, + { + "epoch": 2.4646464646464645, + "grad_norm": 1.456549048423767, + "learning_rate": 0.00015072727272727273, + "loss": 0.2466, + "step": 2440 + }, + { + "epoch": 2.474747474747475, + "grad_norm": 1.5442981719970703, + "learning_rate": 0.00015052525252525254, + "loss": 0.072, + "step": 2450 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.8132629990577698, + "learning_rate": 0.00015032323232323232, + "loss": 0.1793, + "step": 2460 + }, + { + "epoch": 2.494949494949495, + "grad_norm": 2.552353620529175, + "learning_rate": 0.00015012121212121214, + "loss": 0.2657, + "step": 2470 + }, + { + "epoch": 2.505050505050505, + "grad_norm": 0.9862263798713684, + "learning_rate": 0.00014991919191919192, + "loss": 0.2232, + "step": 2480 + }, + { + "epoch": 2.515151515151515, + "grad_norm": 1.9942890405654907, + "learning_rate": 0.00014971717171717174, + "loss": 0.1905, + "step": 2490 + }, + { + "epoch": 2.525252525252525, + "grad_norm": 4.739780902862549, + "learning_rate": 0.00014951515151515152, + "loss": 0.1346, + "step": 2500 + }, + { + "epoch": 2.525252525252525, + "eval_accuracy": 0.9233798782670963, + "eval_loss": 0.18391837179660797, + "eval_runtime": 26.3453, + "eval_samples_per_second": 106.015, + "eval_steps_per_second": 13.285, + "step": 2500 + }, + { + "epoch": 2.5353535353535355, + "grad_norm": 0.7769536375999451, + "learning_rate": 0.0001493131313131313, + "loss": 0.1755, + "step": 2510 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 1.3806142807006836, + "learning_rate": 0.00014911111111111112, + "loss": 0.1891, + "step": 2520 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.22077137231826782, + "learning_rate": 0.0001489090909090909, + "loss": 0.1044, + "step": 2530 + }, + { + "epoch": 2.5656565656565657, + "grad_norm": 0.8488559126853943, + "learning_rate": 0.00014870707070707072, + "loss": 0.1627, + "step": 2540 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 1.2063629627227783, + "learning_rate": 0.0001485050505050505, + "loss": 0.0899, + "step": 2550 + }, + { + "epoch": 2.5858585858585856, + "grad_norm": 0.09559821337461472, + "learning_rate": 0.0001483030303030303, + "loss": 0.1586, + "step": 2560 + }, + { + "epoch": 2.595959595959596, + "grad_norm": 1.8213800191879272, + "learning_rate": 0.00014810101010101013, + "loss": 0.2368, + "step": 2570 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.23213154077529907, + "learning_rate": 0.0001478989898989899, + "loss": 0.1822, + "step": 2580 + }, + { + "epoch": 2.616161616161616, + "grad_norm": 2.048548936843872, + "learning_rate": 0.00014769696969696972, + "loss": 0.2115, + "step": 2590 + }, + { + "epoch": 2.6262626262626263, + "grad_norm": 0.5447703003883362, + "learning_rate": 0.0001474949494949495, + "loss": 0.1554, + "step": 2600 + }, + { + "epoch": 2.6262626262626263, + "eval_accuracy": 0.9462943071965628, + "eval_loss": 0.14465215802192688, + "eval_runtime": 39.7438, + "eval_samples_per_second": 70.275, + "eval_steps_per_second": 8.806, + "step": 2600 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 0.35208356380462646, + "learning_rate": 0.0001472929292929293, + "loss": 0.1676, + "step": 2610 + }, + { + "epoch": 2.6464646464646466, + "grad_norm": 2.1393558979034424, + "learning_rate": 0.0001470909090909091, + "loss": 0.172, + "step": 2620 + }, + { + "epoch": 2.6565656565656566, + "grad_norm": 3.569701671600342, + "learning_rate": 0.0001468888888888889, + "loss": 0.2052, + "step": 2630 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 2.239469528198242, + "learning_rate": 0.0001466868686868687, + "loss": 0.188, + "step": 2640 + }, + { + "epoch": 2.676767676767677, + "grad_norm": 0.36261606216430664, + "learning_rate": 0.0001464848484848485, + "loss": 0.1819, + "step": 2650 + }, + { + "epoch": 2.686868686868687, + "grad_norm": 1.5869864225387573, + "learning_rate": 0.00014628282828282828, + "loss": 0.1745, + "step": 2660 + }, + { + "epoch": 2.6969696969696972, + "grad_norm": 0.795631468296051, + "learning_rate": 0.0001460808080808081, + "loss": 0.1355, + "step": 2670 + }, + { + "epoch": 2.707070707070707, + "grad_norm": 1.5133333206176758, + "learning_rate": 0.00014587878787878787, + "loss": 0.1599, + "step": 2680 + }, + { + "epoch": 2.717171717171717, + "grad_norm": 0.3388718366622925, + "learning_rate": 0.0001456767676767677, + "loss": 0.16, + "step": 2690 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.9800481200218201, + "learning_rate": 0.00014547474747474747, + "loss": 0.183, + "step": 2700 + }, + { + "epoch": 2.7272727272727275, + "eval_accuracy": 0.8822055137844611, + "eval_loss": 0.2474302053451538, + "eval_runtime": 25.9075, + "eval_samples_per_second": 107.807, + "eval_steps_per_second": 13.51, + "step": 2700 + }, + { + "epoch": 2.7373737373737375, + "grad_norm": 0.34112411737442017, + "learning_rate": 0.00014527272727272726, + "loss": 0.1968, + "step": 2710 + }, + { + "epoch": 2.7474747474747474, + "grad_norm": 0.4181295335292816, + "learning_rate": 0.00014507070707070707, + "loss": 0.1574, + "step": 2720 + }, + { + "epoch": 2.757575757575758, + "grad_norm": 1.8951572179794312, + "learning_rate": 0.00014486868686868686, + "loss": 0.1834, + "step": 2730 + }, + { + "epoch": 2.7676767676767677, + "grad_norm": 0.6324455738067627, + "learning_rate": 0.0001446666666666667, + "loss": 0.1925, + "step": 2740 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 3.5484867095947266, + "learning_rate": 0.00014446464646464648, + "loss": 0.1239, + "step": 2750 + }, + { + "epoch": 2.787878787878788, + "grad_norm": 1.952815055847168, + "learning_rate": 0.00014426262626262627, + "loss": 0.2525, + "step": 2760 + }, + { + "epoch": 2.797979797979798, + "grad_norm": 0.6016231179237366, + "learning_rate": 0.00014406060606060608, + "loss": 0.2239, + "step": 2770 + }, + { + "epoch": 2.808080808080808, + "grad_norm": 2.579787254333496, + "learning_rate": 0.00014385858585858586, + "loss": 0.2118, + "step": 2780 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 1.8643174171447754, + "learning_rate": 0.00014365656565656568, + "loss": 0.1191, + "step": 2790 + }, + { + "epoch": 2.8282828282828283, + "grad_norm": 0.4166184067726135, + "learning_rate": 0.00014345454545454546, + "loss": 0.0972, + "step": 2800 + }, + { + "epoch": 2.8282828282828283, + "eval_accuracy": 0.920515574650913, + "eval_loss": 0.22228243947029114, + "eval_runtime": 25.8946, + "eval_samples_per_second": 107.861, + "eval_steps_per_second": 13.516, + "step": 2800 + }, + { + "epoch": 2.8383838383838382, + "grad_norm": 0.45725560188293457, + "learning_rate": 0.00014325252525252525, + "loss": 0.163, + "step": 2810 + }, + { + "epoch": 2.8484848484848486, + "grad_norm": 1.261326789855957, + "learning_rate": 0.00014305050505050506, + "loss": 0.2321, + "step": 2820 + }, + { + "epoch": 2.8585858585858586, + "grad_norm": 2.7571489810943604, + "learning_rate": 0.00014284848484848485, + "loss": 0.1687, + "step": 2830 + }, + { + "epoch": 2.8686868686868685, + "grad_norm": 3.2817001342773438, + "learning_rate": 0.00014264646464646466, + "loss": 0.2726, + "step": 2840 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.730097770690918, + "learning_rate": 0.00014244444444444444, + "loss": 0.1302, + "step": 2850 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 1.8856103420257568, + "learning_rate": 0.00014224242424242426, + "loss": 0.148, + "step": 2860 + }, + { + "epoch": 2.898989898989899, + "grad_norm": 2.830354690551758, + "learning_rate": 0.00014204040404040404, + "loss": 0.207, + "step": 2870 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 2.36492919921875, + "learning_rate": 0.00014183838383838383, + "loss": 0.1776, + "step": 2880 + }, + { + "epoch": 2.919191919191919, + "grad_norm": 1.9598393440246582, + "learning_rate": 0.00014163636363636364, + "loss": 0.1369, + "step": 2890 + }, + { + "epoch": 2.929292929292929, + "grad_norm": 0.15792728960514069, + "learning_rate": 0.00014143434343434342, + "loss": 0.1073, + "step": 2900 + }, + { + "epoch": 2.929292929292929, + "eval_accuracy": 0.9344790547798066, + "eval_loss": 0.18598826229572296, + "eval_runtime": 25.6958, + "eval_samples_per_second": 108.695, + "eval_steps_per_second": 13.621, + "step": 2900 + }, + { + "epoch": 2.9393939393939394, + "grad_norm": 5.3081278800964355, + "learning_rate": 0.00014123232323232324, + "loss": 0.2393, + "step": 2910 + }, + { + "epoch": 2.9494949494949494, + "grad_norm": 3.75370454788208, + "learning_rate": 0.00014103030303030305, + "loss": 0.1422, + "step": 2920 + }, + { + "epoch": 2.9595959595959593, + "grad_norm": 0.5363875031471252, + "learning_rate": 0.00014082828282828284, + "loss": 0.2477, + "step": 2930 + }, + { + "epoch": 2.9696969696969697, + "grad_norm": 0.6699589490890503, + "learning_rate": 0.00014062626262626265, + "loss": 0.1999, + "step": 2940 + }, + { + "epoch": 2.9797979797979797, + "grad_norm": 0.414679616689682, + "learning_rate": 0.00014042424242424243, + "loss": 0.1885, + "step": 2950 + }, + { + "epoch": 2.98989898989899, + "grad_norm": 2.85548734664917, + "learning_rate": 0.00014022222222222225, + "loss": 0.2321, + "step": 2960 + }, + { + "epoch": 3.0, + "grad_norm": 4.793233394622803, + "learning_rate": 0.00014002020202020203, + "loss": 0.194, + "step": 2970 + }, + { + "epoch": 3.01010101010101, + "grad_norm": 2.4459941387176514, + "learning_rate": 0.00013981818181818182, + "loss": 0.1001, + "step": 2980 + }, + { + "epoch": 3.0202020202020203, + "grad_norm": 4.440774440765381, + "learning_rate": 0.00013961616161616163, + "loss": 0.2352, + "step": 2990 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 1.1308948993682861, + "learning_rate": 0.00013941414141414141, + "loss": 0.1824, + "step": 3000 + }, + { + "epoch": 3.0303030303030303, + "eval_accuracy": 0.9194414607948442, + "eval_loss": 0.23244450986385345, + "eval_runtime": 26.0602, + "eval_samples_per_second": 107.175, + "eval_steps_per_second": 13.43, + "step": 3000 + }, + { + "epoch": 3.04040404040404, + "grad_norm": 0.8572925925254822, + "learning_rate": 0.00013921212121212123, + "loss": 0.173, + "step": 3010 + }, + { + "epoch": 3.0505050505050506, + "grad_norm": 2.714022397994995, + "learning_rate": 0.000139010101010101, + "loss": 0.1666, + "step": 3020 + }, + { + "epoch": 3.0606060606060606, + "grad_norm": 1.6566622257232666, + "learning_rate": 0.0001388080808080808, + "loss": 0.1817, + "step": 3030 + }, + { + "epoch": 3.0707070707070705, + "grad_norm": 1.3201829195022583, + "learning_rate": 0.0001386060606060606, + "loss": 0.0864, + "step": 3040 + }, + { + "epoch": 3.080808080808081, + "grad_norm": 2.294625759124756, + "learning_rate": 0.0001384040404040404, + "loss": 0.1213, + "step": 3050 + }, + { + "epoch": 3.090909090909091, + "grad_norm": 4.557636737823486, + "learning_rate": 0.0001382020202020202, + "loss": 0.2507, + "step": 3060 + }, + { + "epoch": 3.101010101010101, + "grad_norm": 2.418661594390869, + "learning_rate": 0.000138, + "loss": 0.1128, + "step": 3070 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.24477995932102203, + "learning_rate": 0.0001377979797979798, + "loss": 0.1639, + "step": 3080 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 1.70753014087677, + "learning_rate": 0.00013759595959595962, + "loss": 0.1445, + "step": 3090 + }, + { + "epoch": 3.1313131313131315, + "grad_norm": 2.5645668506622314, + "learning_rate": 0.0001373939393939394, + "loss": 0.1221, + "step": 3100 + }, + { + "epoch": 3.1313131313131315, + "eval_accuracy": 0.9448621553884712, + "eval_loss": 0.14752542972564697, + "eval_runtime": 25.6753, + "eval_samples_per_second": 108.782, + "eval_steps_per_second": 13.632, + "step": 3100 + }, + { + "epoch": 3.1414141414141414, + "grad_norm": 1.4956971406936646, + "learning_rate": 0.00013719191919191922, + "loss": 0.1874, + "step": 3110 + }, + { + "epoch": 3.1515151515151514, + "grad_norm": 0.11596371233463287, + "learning_rate": 0.000136989898989899, + "loss": 0.0846, + "step": 3120 + }, + { + "epoch": 3.1616161616161618, + "grad_norm": 1.4664329290390015, + "learning_rate": 0.0001367878787878788, + "loss": 0.0796, + "step": 3130 + }, + { + "epoch": 3.1717171717171717, + "grad_norm": 3.685087203979492, + "learning_rate": 0.0001365858585858586, + "loss": 0.216, + "step": 3140 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 2.8635785579681396, + "learning_rate": 0.00013638383838383838, + "loss": 0.1541, + "step": 3150 + }, + { + "epoch": 3.191919191919192, + "grad_norm": 2.6519153118133545, + "learning_rate": 0.0001361818181818182, + "loss": 0.1022, + "step": 3160 + }, + { + "epoch": 3.202020202020202, + "grad_norm": 3.962721347808838, + "learning_rate": 0.00013597979797979798, + "loss": 0.166, + "step": 3170 + }, + { + "epoch": 3.212121212121212, + "grad_norm": 1.308240532875061, + "learning_rate": 0.00013577777777777777, + "loss": 0.2154, + "step": 3180 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.127321258187294, + "learning_rate": 0.00013557575757575758, + "loss": 0.1097, + "step": 3190 + }, + { + "epoch": 3.2323232323232323, + "grad_norm": 2.5694165229797363, + "learning_rate": 0.00013537373737373737, + "loss": 0.1039, + "step": 3200 + }, + { + "epoch": 3.2323232323232323, + "eval_accuracy": 0.9427139276763337, + "eval_loss": 0.14802759885787964, + "eval_runtime": 25.9394, + "eval_samples_per_second": 107.674, + "eval_steps_per_second": 13.493, + "step": 3200 + }, + { + "epoch": 3.242424242424242, + "grad_norm": 2.3018665313720703, + "learning_rate": 0.00013517171717171718, + "loss": 0.0808, + "step": 3210 + }, + { + "epoch": 3.2525252525252526, + "grad_norm": 1.8823286294937134, + "learning_rate": 0.00013496969696969696, + "loss": 0.2128, + "step": 3220 + }, + { + "epoch": 3.2626262626262625, + "grad_norm": 2.9755682945251465, + "learning_rate": 0.00013476767676767678, + "loss": 0.1185, + "step": 3230 + }, + { + "epoch": 3.2727272727272725, + "grad_norm": 1.15530264377594, + "learning_rate": 0.00013456565656565656, + "loss": 0.1271, + "step": 3240 + }, + { + "epoch": 3.282828282828283, + "grad_norm": 3.966992139816284, + "learning_rate": 0.00013436363636363637, + "loss": 0.145, + "step": 3250 + }, + { + "epoch": 3.292929292929293, + "grad_norm": 2.1390750408172607, + "learning_rate": 0.0001341616161616162, + "loss": 0.0662, + "step": 3260 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.9288390874862671, + "learning_rate": 0.00013395959595959597, + "loss": 0.1553, + "step": 3270 + }, + { + "epoch": 3.313131313131313, + "grad_norm": 1.0771069526672363, + "learning_rate": 0.00013375757575757576, + "loss": 0.1856, + "step": 3280 + }, + { + "epoch": 3.323232323232323, + "grad_norm": 1.9450442790985107, + "learning_rate": 0.00013355555555555557, + "loss": 0.1707, + "step": 3290 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 1.086408257484436, + "learning_rate": 0.00013335353535353536, + "loss": 0.276, + "step": 3300 + }, + { + "epoch": 3.3333333333333335, + "eval_accuracy": 0.9402076620121733, + "eval_loss": 0.1591499000787735, + "eval_runtime": 26.2784, + "eval_samples_per_second": 106.285, + "eval_steps_per_second": 13.319, + "step": 3300 + }, + { + "epoch": 3.3434343434343434, + "grad_norm": 1.670813798904419, + "learning_rate": 0.00013315151515151517, + "loss": 0.1761, + "step": 3310 + }, + { + "epoch": 3.3535353535353534, + "grad_norm": 0.9401443600654602, + "learning_rate": 0.00013294949494949495, + "loss": 0.1101, + "step": 3320 + }, + { + "epoch": 3.3636363636363638, + "grad_norm": 6.867558479309082, + "learning_rate": 0.00013274747474747477, + "loss": 0.0827, + "step": 3330 + }, + { + "epoch": 3.3737373737373737, + "grad_norm": 0.5723597407341003, + "learning_rate": 0.00013254545454545455, + "loss": 0.0541, + "step": 3340 + }, + { + "epoch": 3.3838383838383836, + "grad_norm": 2.7410197257995605, + "learning_rate": 0.00013234343434343434, + "loss": 0.2224, + "step": 3350 + }, + { + "epoch": 3.393939393939394, + "grad_norm": 0.6748982071876526, + "learning_rate": 0.00013214141414141415, + "loss": 0.2135, + "step": 3360 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 1.3334232568740845, + "learning_rate": 0.00013193939393939393, + "loss": 0.1676, + "step": 3370 + }, + { + "epoch": 3.4141414141414144, + "grad_norm": 0.17679575085639954, + "learning_rate": 0.00013173737373737375, + "loss": 0.0689, + "step": 3380 + }, + { + "epoch": 3.4242424242424243, + "grad_norm": 1.2959870100021362, + "learning_rate": 0.00013153535353535353, + "loss": 0.1658, + "step": 3390 + }, + { + "epoch": 3.4343434343434343, + "grad_norm": 2.298753261566162, + "learning_rate": 0.00013133333333333332, + "loss": 0.2498, + "step": 3400 + }, + { + "epoch": 3.4343434343434343, + "eval_accuracy": 0.9097744360902256, + "eval_loss": 0.24474310874938965, + "eval_runtime": 30.7, + "eval_samples_per_second": 90.977, + "eval_steps_per_second": 11.401, + "step": 3400 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.1379319727420807, + "learning_rate": 0.00013113131313131313, + "loss": 0.1168, + "step": 3410 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 3.478100299835205, + "learning_rate": 0.00013092929292929294, + "loss": 0.2747, + "step": 3420 + }, + { + "epoch": 3.4646464646464645, + "grad_norm": 0.6245542764663696, + "learning_rate": 0.00013072727272727276, + "loss": 0.1841, + "step": 3430 + }, + { + "epoch": 3.474747474747475, + "grad_norm": 1.4680737257003784, + "learning_rate": 0.00013052525252525254, + "loss": 0.1082, + "step": 3440 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 0.40652742981910706, + "learning_rate": 0.00013032323232323233, + "loss": 0.1337, + "step": 3450 + }, + { + "epoch": 3.494949494949495, + "grad_norm": 3.755459785461426, + "learning_rate": 0.00013012121212121214, + "loss": 0.1358, + "step": 3460 + }, + { + "epoch": 3.505050505050505, + "grad_norm": 2.8953843116760254, + "learning_rate": 0.00012991919191919192, + "loss": 0.1239, + "step": 3470 + }, + { + "epoch": 3.515151515151515, + "grad_norm": 0.5568174123764038, + "learning_rate": 0.00012971717171717174, + "loss": 0.1052, + "step": 3480 + }, + { + "epoch": 3.525252525252525, + "grad_norm": 0.03225015476346016, + "learning_rate": 0.00012951515151515152, + "loss": 0.1354, + "step": 3490 + }, + { + "epoch": 3.5353535353535355, + "grad_norm": 0.3489704132080078, + "learning_rate": 0.0001293131313131313, + "loss": 0.1453, + "step": 3500 + }, + { + "epoch": 3.5353535353535355, + "eval_accuracy": 0.941639813820265, + "eval_loss": 0.15563398599624634, + "eval_runtime": 26.2929, + "eval_samples_per_second": 106.226, + "eval_steps_per_second": 13.312, + "step": 3500 + }, + { + "epoch": 3.5454545454545454, + "grad_norm": 3.1930465698242188, + "learning_rate": 0.00012911111111111112, + "loss": 0.1166, + "step": 3510 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.3423315286636353, + "learning_rate": 0.0001289090909090909, + "loss": 0.0629, + "step": 3520 + }, + { + "epoch": 3.5656565656565657, + "grad_norm": 1.1209440231323242, + "learning_rate": 0.00012870707070707072, + "loss": 0.2575, + "step": 3530 + }, + { + "epoch": 3.5757575757575757, + "grad_norm": 0.5611622929573059, + "learning_rate": 0.0001285050505050505, + "loss": 0.236, + "step": 3540 + }, + { + "epoch": 3.5858585858585856, + "grad_norm": 1.5175379514694214, + "learning_rate": 0.00012830303030303032, + "loss": 0.1043, + "step": 3550 + }, + { + "epoch": 3.595959595959596, + "grad_norm": 0.04836178198456764, + "learning_rate": 0.0001281010101010101, + "loss": 0.188, + "step": 3560 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 2.2456367015838623, + "learning_rate": 0.0001278989898989899, + "loss": 0.1346, + "step": 3570 + }, + { + "epoch": 3.616161616161616, + "grad_norm": 1.2927310466766357, + "learning_rate": 0.0001276969696969697, + "loss": 0.1676, + "step": 3580 + }, + { + "epoch": 3.6262626262626263, + "grad_norm": 1.7963800430297852, + "learning_rate": 0.0001274949494949495, + "loss": 0.1591, + "step": 3590 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 1.8542633056640625, + "learning_rate": 0.0001272929292929293, + "loss": 0.1794, + "step": 3600 + }, + { + "epoch": 3.6363636363636362, + "eval_accuracy": 0.9083422842821339, + "eval_loss": 0.2271716296672821, + "eval_runtime": 26.144, + "eval_samples_per_second": 106.831, + "eval_steps_per_second": 13.387, + "step": 3600 + }, + { + "epoch": 3.6464646464646466, + "grad_norm": 1.8239818811416626, + "learning_rate": 0.0001270909090909091, + "loss": 0.1963, + "step": 3610 + }, + { + "epoch": 3.6565656565656566, + "grad_norm": 1.0566405057907104, + "learning_rate": 0.0001268888888888889, + "loss": 0.1465, + "step": 3620 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 1.8584214448928833, + "learning_rate": 0.0001266868686868687, + "loss": 0.1604, + "step": 3630 + }, + { + "epoch": 3.676767676767677, + "grad_norm": 1.3602209091186523, + "learning_rate": 0.0001264848484848485, + "loss": 0.2129, + "step": 3640 + }, + { + "epoch": 3.686868686868687, + "grad_norm": 1.3268640041351318, + "learning_rate": 0.00012628282828282828, + "loss": 0.143, + "step": 3650 + }, + { + "epoch": 3.6969696969696972, + "grad_norm": 1.1488856077194214, + "learning_rate": 0.0001260808080808081, + "loss": 0.1078, + "step": 3660 + }, + { + "epoch": 3.707070707070707, + "grad_norm": 4.119093418121338, + "learning_rate": 0.00012587878787878788, + "loss": 0.1451, + "step": 3670 + }, + { + "epoch": 3.717171717171717, + "grad_norm": 0.3795292377471924, + "learning_rate": 0.0001256767676767677, + "loss": 0.1648, + "step": 3680 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 1.455502986907959, + "learning_rate": 0.00012547474747474747, + "loss": 0.1251, + "step": 3690 + }, + { + "epoch": 3.7373737373737375, + "grad_norm": 1.8064128160476685, + "learning_rate": 0.0001252727272727273, + "loss": 0.1467, + "step": 3700 + }, + { + "epoch": 3.7373737373737375, + "eval_accuracy": 0.941281775868242, + "eval_loss": 0.16726158559322357, + "eval_runtime": 26.4833, + "eval_samples_per_second": 105.463, + "eval_steps_per_second": 13.216, + "step": 3700 + }, + { + "epoch": 3.7474747474747474, + "grad_norm": 1.4757534265518188, + "learning_rate": 0.00012507070707070707, + "loss": 0.2673, + "step": 3710 + }, + { + "epoch": 3.757575757575758, + "grad_norm": 0.5617914199829102, + "learning_rate": 0.00012486868686868686, + "loss": 0.1323, + "step": 3720 + }, + { + "epoch": 3.7676767676767677, + "grad_norm": 1.8609440326690674, + "learning_rate": 0.00012466666666666667, + "loss": 0.1589, + "step": 3730 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 1.7290282249450684, + "learning_rate": 0.00012446464646464646, + "loss": 0.1302, + "step": 3740 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 1.9109983444213867, + "learning_rate": 0.00012426262626262627, + "loss": 0.1918, + "step": 3750 + }, + { + "epoch": 3.797979797979798, + "grad_norm": 1.1704823970794678, + "learning_rate": 0.00012406060606060608, + "loss": 0.1635, + "step": 3760 + }, + { + "epoch": 3.808080808080808, + "grad_norm": 0.6442949175834656, + "learning_rate": 0.00012385858585858587, + "loss": 0.1117, + "step": 3770 + }, + { + "epoch": 3.8181818181818183, + "grad_norm": 1.300221562385559, + "learning_rate": 0.00012365656565656568, + "loss": 0.1313, + "step": 3780 + }, + { + "epoch": 3.8282828282828283, + "grad_norm": 2.98504638671875, + "learning_rate": 0.00012345454545454546, + "loss": 0.2008, + "step": 3790 + }, + { + "epoch": 3.8383838383838382, + "grad_norm": 2.339833974838257, + "learning_rate": 0.00012325252525252528, + "loss": 0.1372, + "step": 3800 + }, + { + "epoch": 3.8383838383838382, + "eval_accuracy": 0.9341210168277837, + "eval_loss": 0.176345095038414, + "eval_runtime": 25.3933, + "eval_samples_per_second": 109.99, + "eval_steps_per_second": 13.783, + "step": 3800 + }, + { + "epoch": 3.8484848484848486, + "grad_norm": 0.8106898069381714, + "learning_rate": 0.00012305050505050506, + "loss": 0.0976, + "step": 3810 + }, + { + "epoch": 3.8585858585858586, + "grad_norm": 1.9186642169952393, + "learning_rate": 0.00012284848484848485, + "loss": 0.0574, + "step": 3820 + }, + { + "epoch": 3.8686868686868685, + "grad_norm": 4.3502702713012695, + "learning_rate": 0.00012264646464646466, + "loss": 0.1246, + "step": 3830 + }, + { + "epoch": 3.878787878787879, + "grad_norm": 6.374273777008057, + "learning_rate": 0.00012244444444444445, + "loss": 0.2127, + "step": 3840 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 1.5716196298599243, + "learning_rate": 0.00012224242424242426, + "loss": 0.162, + "step": 3850 + }, + { + "epoch": 3.898989898989899, + "grad_norm": 2.652738332748413, + "learning_rate": 0.00012204040404040404, + "loss": 0.1534, + "step": 3860 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.6810227036476135, + "learning_rate": 0.00012183838383838384, + "loss": 0.139, + "step": 3870 + }, + { + "epoch": 3.919191919191919, + "grad_norm": 2.7403507232666016, + "learning_rate": 0.00012163636363636364, + "loss": 0.1745, + "step": 3880 + }, + { + "epoch": 3.929292929292929, + "grad_norm": 0.3966246545314789, + "learning_rate": 0.00012143434343434344, + "loss": 0.0959, + "step": 3890 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.0760892927646637, + "learning_rate": 0.00012123232323232323, + "loss": 0.2283, + "step": 3900 + }, + { + "epoch": 3.9393939393939394, + "eval_accuracy": 0.9373433583959899, + "eval_loss": 0.1671173870563507, + "eval_runtime": 26.1195, + "eval_samples_per_second": 106.932, + "eval_steps_per_second": 13.4, + "step": 3900 + }, + { + "epoch": 3.9494949494949494, + "grad_norm": 1.0186794996261597, + "learning_rate": 0.00012103030303030302, + "loss": 0.1012, + "step": 3910 + }, + { + "epoch": 3.9595959595959593, + "grad_norm": 2.6325201988220215, + "learning_rate": 0.00012082828282828282, + "loss": 0.1398, + "step": 3920 + }, + { + "epoch": 3.9696969696969697, + "grad_norm": 0.14090034365653992, + "learning_rate": 0.00012062626262626265, + "loss": 0.1904, + "step": 3930 + }, + { + "epoch": 3.9797979797979797, + "grad_norm": 2.340514659881592, + "learning_rate": 0.00012042424242424243, + "loss": 0.1631, + "step": 3940 + }, + { + "epoch": 3.98989898989899, + "grad_norm": 1.8944944143295288, + "learning_rate": 0.00012022222222222223, + "loss": 0.138, + "step": 3950 + }, + { + "epoch": 4.0, + "grad_norm": NaN, + "learning_rate": 0.00012004040404040405, + "loss": 0.0589, + "step": 3960 + }, + { + "epoch": 4.01010101010101, + "grad_norm": 1.4394489526748657, + "learning_rate": 0.00011983838383838383, + "loss": 0.0575, + "step": 3970 + }, + { + "epoch": 4.02020202020202, + "grad_norm": 1.0577999353408813, + "learning_rate": 0.00011963636363636363, + "loss": 0.1652, + "step": 3980 + }, + { + "epoch": 4.03030303030303, + "grad_norm": 0.14677415788173676, + "learning_rate": 0.00011943434343434343, + "loss": 0.1329, + "step": 3990 + }, + { + "epoch": 4.040404040404041, + "grad_norm": 1.0222164392471313, + "learning_rate": 0.00011923232323232323, + "loss": 0.164, + "step": 4000 + }, + { + "epoch": 4.040404040404041, + "eval_accuracy": 0.9477264590046545, + "eval_loss": 0.14898167550563812, + "eval_runtime": 25.5521, + "eval_samples_per_second": 109.306, + "eval_steps_per_second": 13.697, + "step": 4000 + }, + { + "epoch": 4.05050505050505, + "grad_norm": 0.43585485219955444, + "learning_rate": 0.00011903030303030303, + "loss": 0.145, + "step": 4010 + }, + { + "epoch": 4.0606060606060606, + "grad_norm": 0.18842585384845734, + "learning_rate": 0.00011882828282828284, + "loss": 0.0525, + "step": 4020 + }, + { + "epoch": 4.070707070707071, + "grad_norm": 3.4666500091552734, + "learning_rate": 0.00011862626262626264, + "loss": 0.1291, + "step": 4030 + }, + { + "epoch": 4.08080808080808, + "grad_norm": 4.320937156677246, + "learning_rate": 0.00011842424242424244, + "loss": 0.0897, + "step": 4040 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.4111921787261963, + "learning_rate": 0.00011822222222222224, + "loss": 0.0838, + "step": 4050 + }, + { + "epoch": 4.101010101010101, + "grad_norm": 2.491947650909424, + "learning_rate": 0.00011802020202020202, + "loss": 0.113, + "step": 4060 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 1.7671141624450684, + "learning_rate": 0.00011781818181818182, + "loss": 0.0583, + "step": 4070 + }, + { + "epoch": 4.121212121212121, + "grad_norm": 1.7311761379241943, + "learning_rate": 0.00011761616161616162, + "loss": 0.1525, + "step": 4080 + }, + { + "epoch": 4.1313131313131315, + "grad_norm": 0.17230091989040375, + "learning_rate": 0.00011741414141414142, + "loss": 0.1428, + "step": 4090 + }, + { + "epoch": 4.141414141414141, + "grad_norm": 0.9565097093582153, + "learning_rate": 0.00011721212121212122, + "loss": 0.1513, + "step": 4100 + }, + { + "epoch": 4.141414141414141, + "eval_accuracy": 0.9488005728607233, + "eval_loss": 0.1546555459499359, + "eval_runtime": 26.2079, + "eval_samples_per_second": 106.571, + "eval_steps_per_second": 13.355, + "step": 4100 + }, + { + "epoch": 4.151515151515151, + "grad_norm": 2.4946250915527344, + "learning_rate": 0.00011701010101010102, + "loss": 0.1115, + "step": 4110 + }, + { + "epoch": 4.161616161616162, + "grad_norm": 1.1435558795928955, + "learning_rate": 0.0001168080808080808, + "loss": 0.1025, + "step": 4120 + }, + { + "epoch": 4.171717171717171, + "grad_norm": 0.11861927062273026, + "learning_rate": 0.0001166060606060606, + "loss": 0.0999, + "step": 4130 + }, + { + "epoch": 4.181818181818182, + "grad_norm": 2.040846109390259, + "learning_rate": 0.0001164040404040404, + "loss": 0.1288, + "step": 4140 + }, + { + "epoch": 4.191919191919192, + "grad_norm": 0.08840809017419815, + "learning_rate": 0.0001162020202020202, + "loss": 0.0999, + "step": 4150 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 2.014448404312134, + "learning_rate": 0.000116, + "loss": 0.087, + "step": 4160 + }, + { + "epoch": 4.212121212121212, + "grad_norm": 0.20499661564826965, + "learning_rate": 0.0001157979797979798, + "loss": 0.0906, + "step": 4170 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 1.9742176532745361, + "learning_rate": 0.00011559595959595959, + "loss": 0.0942, + "step": 4180 + }, + { + "epoch": 4.232323232323233, + "grad_norm": 1.2514885663986206, + "learning_rate": 0.00011539393939393941, + "loss": 0.0555, + "step": 4190 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 2.868319511413574, + "learning_rate": 0.00011519191919191921, + "loss": 0.0991, + "step": 4200 + }, + { + "epoch": 4.242424242424242, + "eval_accuracy": 0.9430719656283566, + "eval_loss": 0.15357114374637604, + "eval_runtime": 25.7214, + "eval_samples_per_second": 108.586, + "eval_steps_per_second": 13.607, + "step": 4200 + }, + { + "epoch": 4.252525252525253, + "grad_norm": 2.199428081512451, + "learning_rate": 0.00011498989898989901, + "loss": 0.0807, + "step": 4210 + }, + { + "epoch": 4.262626262626263, + "grad_norm": 1.5803449153900146, + "learning_rate": 0.0001147878787878788, + "loss": 0.1197, + "step": 4220 + }, + { + "epoch": 4.2727272727272725, + "grad_norm": 3.1839122772216797, + "learning_rate": 0.0001145858585858586, + "loss": 0.143, + "step": 4230 + }, + { + "epoch": 4.282828282828283, + "grad_norm": 0.22256587445735931, + "learning_rate": 0.00011438383838383839, + "loss": 0.0897, + "step": 4240 + }, + { + "epoch": 4.292929292929293, + "grad_norm": 0.8421348929405212, + "learning_rate": 0.00011418181818181819, + "loss": 0.1286, + "step": 4250 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 2.4368836879730225, + "learning_rate": 0.00011397979797979799, + "loss": 0.0572, + "step": 4260 + }, + { + "epoch": 4.313131313131313, + "grad_norm": 1.6639461517333984, + "learning_rate": 0.00011377777777777779, + "loss": 0.1067, + "step": 4270 + }, + { + "epoch": 4.3232323232323235, + "grad_norm": 0.683029294013977, + "learning_rate": 0.00011357575757575757, + "loss": 0.1497, + "step": 4280 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 2.050398826599121, + "learning_rate": 0.00011337373737373737, + "loss": 0.1382, + "step": 4290 + }, + { + "epoch": 4.343434343434343, + "grad_norm": 3.134596586227417, + "learning_rate": 0.00011317171717171717, + "loss": 0.1419, + "step": 4300 + }, + { + "epoch": 4.343434343434343, + "eval_accuracy": 0.9445041174364482, + "eval_loss": 0.15678761899471283, + "eval_runtime": 25.8061, + "eval_samples_per_second": 108.23, + "eval_steps_per_second": 13.563, + "step": 4300 + }, + { + "epoch": 4.353535353535354, + "grad_norm": 3.690528631210327, + "learning_rate": 0.00011296969696969697, + "loss": 0.1524, + "step": 4310 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.4388722777366638, + "learning_rate": 0.00011276767676767677, + "loss": 0.1008, + "step": 4320 + }, + { + "epoch": 4.373737373737374, + "grad_norm": 2.5544140338897705, + "learning_rate": 0.00011256565656565657, + "loss": 0.1352, + "step": 4330 + }, + { + "epoch": 4.383838383838384, + "grad_norm": 1.1697108745574951, + "learning_rate": 0.00011236363636363635, + "loss": 0.1478, + "step": 4340 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 1.3652334213256836, + "learning_rate": 0.00011216161616161615, + "loss": 0.1539, + "step": 4350 + }, + { + "epoch": 4.404040404040404, + "grad_norm": 0.6423632502555847, + "learning_rate": 0.00011195959595959598, + "loss": 0.1248, + "step": 4360 + }, + { + "epoch": 4.414141414141414, + "grad_norm": 2.489894151687622, + "learning_rate": 0.00011175757575757578, + "loss": 0.0748, + "step": 4370 + }, + { + "epoch": 4.424242424242424, + "grad_norm": 0.39871343970298767, + "learning_rate": 0.00011155555555555556, + "loss": 0.082, + "step": 4380 + }, + { + "epoch": 4.434343434343434, + "grad_norm": 4.258581161499023, + "learning_rate": 0.00011135353535353536, + "loss": 0.0561, + "step": 4390 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 3.4978339672088623, + "learning_rate": 0.00011115151515151516, + "loss": 0.1452, + "step": 4400 + }, + { + "epoch": 4.444444444444445, + "eval_accuracy": 0.9319727891156463, + "eval_loss": 0.23282791674137115, + "eval_runtime": 26.2522, + "eval_samples_per_second": 106.391, + "eval_steps_per_second": 13.332, + "step": 4400 + }, + { + "epoch": 4.454545454545454, + "grad_norm": 2.7257189750671387, + "learning_rate": 0.00011094949494949496, + "loss": 0.1593, + "step": 4410 + }, + { + "epoch": 4.4646464646464645, + "grad_norm": 4.198055267333984, + "learning_rate": 0.00011074747474747476, + "loss": 0.0885, + "step": 4420 + }, + { + "epoch": 4.474747474747475, + "grad_norm": 1.9910240173339844, + "learning_rate": 0.00011054545454545455, + "loss": 0.1419, + "step": 4430 + }, + { + "epoch": 4.484848484848484, + "grad_norm": 2.0958611965179443, + "learning_rate": 0.00011034343434343434, + "loss": 0.1043, + "step": 4440 + }, + { + "epoch": 4.494949494949495, + "grad_norm": 1.305000901222229, + "learning_rate": 0.00011014141414141414, + "loss": 0.1336, + "step": 4450 + }, + { + "epoch": 4.505050505050505, + "grad_norm": 1.2504723072052002, + "learning_rate": 0.00010993939393939394, + "loss": 0.1646, + "step": 4460 + }, + { + "epoch": 4.515151515151516, + "grad_norm": 0.7178593873977661, + "learning_rate": 0.00010973737373737374, + "loss": 0.0496, + "step": 4470 + }, + { + "epoch": 4.525252525252525, + "grad_norm": 0.11532630026340485, + "learning_rate": 0.00010953535353535354, + "loss": 0.0762, + "step": 4480 + }, + { + "epoch": 4.5353535353535355, + "grad_norm": 3.6086583137512207, + "learning_rate": 0.00010933333333333333, + "loss": 0.1567, + "step": 4490 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.928808867931366, + "learning_rate": 0.00010913131313131312, + "loss": 0.1445, + "step": 4500 + }, + { + "epoch": 4.545454545454545, + "eval_accuracy": 0.9513068385248836, + "eval_loss": 0.1351379156112671, + "eval_runtime": 25.9403, + "eval_samples_per_second": 107.67, + "eval_steps_per_second": 13.493, + "step": 4500 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 3.634613513946533, + "learning_rate": 0.00010892929292929292, + "loss": 0.133, + "step": 4510 + }, + { + "epoch": 4.565656565656566, + "grad_norm": 5.251986503601074, + "learning_rate": 0.00010872727272727272, + "loss": 0.0859, + "step": 4520 + }, + { + "epoch": 4.575757575757576, + "grad_norm": 8.674532890319824, + "learning_rate": 0.00010852525252525253, + "loss": 0.141, + "step": 4530 + }, + { + "epoch": 4.585858585858586, + "grad_norm": 0.17104926705360413, + "learning_rate": 0.00010832323232323233, + "loss": 0.11, + "step": 4540 + }, + { + "epoch": 4.595959595959596, + "grad_norm": 0.49020659923553467, + "learning_rate": 0.00010812121212121213, + "loss": 0.1464, + "step": 4550 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 2.5823633670806885, + "learning_rate": 0.00010791919191919193, + "loss": 0.1235, + "step": 4560 + }, + { + "epoch": 4.616161616161616, + "grad_norm": 0.19439224898815155, + "learning_rate": 0.00010771717171717173, + "loss": 0.106, + "step": 4570 + }, + { + "epoch": 4.626262626262626, + "grad_norm": 2.029311418533325, + "learning_rate": 0.00010751515151515153, + "loss": 0.1355, + "step": 4580 + }, + { + "epoch": 4.636363636363637, + "grad_norm": 0.9395115375518799, + "learning_rate": 0.00010731313131313132, + "loss": 0.1616, + "step": 4590 + }, + { + "epoch": 4.646464646464646, + "grad_norm": 1.2442481517791748, + "learning_rate": 0.00010711111111111111, + "loss": 0.1366, + "step": 4600 + }, + { + "epoch": 4.646464646464646, + "eval_accuracy": 0.941639813820265, + "eval_loss": 0.1570775955915451, + "eval_runtime": 25.9636, + "eval_samples_per_second": 107.574, + "eval_steps_per_second": 13.48, + "step": 4600 + }, + { + "epoch": 4.656565656565657, + "grad_norm": 0.11345808207988739, + "learning_rate": 0.00010690909090909091, + "loss": 0.0983, + "step": 4610 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 2.4496922492980957, + "learning_rate": 0.00010670707070707071, + "loss": 0.1403, + "step": 4620 + }, + { + "epoch": 4.6767676767676765, + "grad_norm": 1.0774174928665161, + "learning_rate": 0.00010650505050505051, + "loss": 0.1349, + "step": 4630 + }, + { + "epoch": 4.686868686868687, + "grad_norm": 3.606393814086914, + "learning_rate": 0.00010630303030303031, + "loss": 0.1461, + "step": 4640 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 2.261770486831665, + "learning_rate": 0.0001061010101010101, + "loss": 0.1043, + "step": 4650 + }, + { + "epoch": 4.707070707070707, + "grad_norm": 2.1773946285247803, + "learning_rate": 0.0001058989898989899, + "loss": 0.0915, + "step": 4660 + }, + { + "epoch": 4.717171717171717, + "grad_norm": 0.6248367428779602, + "learning_rate": 0.00010569696969696969, + "loss": 0.1363, + "step": 4670 + }, + { + "epoch": 4.7272727272727275, + "grad_norm": 4.649084091186523, + "learning_rate": 0.00010549494949494949, + "loss": 0.1858, + "step": 4680 + }, + { + "epoch": 4.737373737373737, + "grad_norm": 0.8173200488090515, + "learning_rate": 0.00010529292929292929, + "loss": 0.1659, + "step": 4690 + }, + { + "epoch": 4.747474747474747, + "grad_norm": 1.2802273035049438, + "learning_rate": 0.0001050909090909091, + "loss": 0.097, + "step": 4700 + }, + { + "epoch": 4.747474747474747, + "eval_accuracy": 0.9423558897243107, + "eval_loss": 0.1505608856678009, + "eval_runtime": 25.7027, + "eval_samples_per_second": 108.666, + "eval_steps_per_second": 13.617, + "step": 4700 + }, + { + "epoch": 4.757575757575758, + "grad_norm": 1.5660672187805176, + "learning_rate": 0.0001048888888888889, + "loss": 0.082, + "step": 4710 + }, + { + "epoch": 4.767676767676767, + "grad_norm": 2.602287769317627, + "learning_rate": 0.0001046868686868687, + "loss": 0.0363, + "step": 4720 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 3.42000675201416, + "learning_rate": 0.0001044848484848485, + "loss": 0.1116, + "step": 4730 + }, + { + "epoch": 4.787878787878788, + "grad_norm": 6.267419338226318, + "learning_rate": 0.0001042828282828283, + "loss": 0.2663, + "step": 4740 + }, + { + "epoch": 4.797979797979798, + "grad_norm": 2.1307647228240967, + "learning_rate": 0.00010408080808080808, + "loss": 0.1125, + "step": 4750 + }, + { + "epoch": 4.808080808080808, + "grad_norm": 0.09282152354717255, + "learning_rate": 0.00010387878787878788, + "loss": 0.1423, + "step": 4760 + }, + { + "epoch": 4.818181818181818, + "grad_norm": 2.5265235900878906, + "learning_rate": 0.00010367676767676768, + "loss": 0.1009, + "step": 4770 + }, + { + "epoch": 4.828282828282829, + "grad_norm": 0.6967155933380127, + "learning_rate": 0.00010347474747474748, + "loss": 0.1475, + "step": 4780 + }, + { + "epoch": 4.838383838383838, + "grad_norm": 1.6234831809997559, + "learning_rate": 0.00010327272727272728, + "loss": 0.1423, + "step": 4790 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.5020400285720825, + "learning_rate": 0.00010307070707070708, + "loss": 0.0603, + "step": 4800 + }, + { + "epoch": 4.848484848484849, + "eval_accuracy": 0.949874686716792, + "eval_loss": 0.1435205489397049, + "eval_runtime": 26.1687, + "eval_samples_per_second": 106.73, + "eval_steps_per_second": 13.375, + "step": 4800 + }, + { + "epoch": 4.858585858585858, + "grad_norm": 0.8095561265945435, + "learning_rate": 0.00010286868686868687, + "loss": 0.086, + "step": 4810 + }, + { + "epoch": 4.8686868686868685, + "grad_norm": 0.08542878925800323, + "learning_rate": 0.00010266666666666666, + "loss": 0.0988, + "step": 4820 + }, + { + "epoch": 4.878787878787879, + "grad_norm": 2.0379855632781982, + "learning_rate": 0.00010246464646464646, + "loss": 0.1714, + "step": 4830 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 1.3006110191345215, + "learning_rate": 0.00010226262626262626, + "loss": 0.2249, + "step": 4840 + }, + { + "epoch": 4.898989898989899, + "grad_norm": 0.09019988775253296, + "learning_rate": 0.00010206060606060606, + "loss": 0.1895, + "step": 4850 + }, + { + "epoch": 4.909090909090909, + "grad_norm": 1.2168376445770264, + "learning_rate": 0.00010185858585858586, + "loss": 0.1699, + "step": 4860 + }, + { + "epoch": 4.91919191919192, + "grad_norm": 3.1290087699890137, + "learning_rate": 0.00010165656565656567, + "loss": 0.1385, + "step": 4870 + }, + { + "epoch": 4.929292929292929, + "grad_norm": 0.1061786413192749, + "learning_rate": 0.00010145454545454547, + "loss": 0.097, + "step": 4880 + }, + { + "epoch": 4.9393939393939394, + "grad_norm": 0.17547203600406647, + "learning_rate": 0.00010125252525252527, + "loss": 0.112, + "step": 4890 + }, + { + "epoch": 4.94949494949495, + "grad_norm": 1.5746177434921265, + "learning_rate": 0.00010105050505050506, + "loss": 0.1179, + "step": 4900 + }, + { + "epoch": 4.94949494949495, + "eval_accuracy": 0.9362692445399212, + "eval_loss": 0.17535676062107086, + "eval_runtime": 26.1869, + "eval_samples_per_second": 106.656, + "eval_steps_per_second": 13.365, + "step": 4900 + }, + { + "epoch": 4.959595959595959, + "grad_norm": 0.42469334602355957, + "learning_rate": 0.00010084848484848485, + "loss": 0.0858, + "step": 4910 + }, + { + "epoch": 4.96969696969697, + "grad_norm": 6.303706645965576, + "learning_rate": 0.00010064646464646465, + "loss": 0.1654, + "step": 4920 + }, + { + "epoch": 4.97979797979798, + "grad_norm": 1.2742167711257935, + "learning_rate": 0.00010044444444444445, + "loss": 0.1244, + "step": 4930 + }, + { + "epoch": 4.98989898989899, + "grad_norm": 5.50272274017334, + "learning_rate": 0.00010024242424242425, + "loss": 0.1039, + "step": 4940 + }, + { + "epoch": 5.0, + "grad_norm": 0.027111496776342392, + "learning_rate": 0.00010004040404040405, + "loss": 0.0916, + "step": 4950 + }, + { + "epoch": 5.01010101010101, + "grad_norm": 3.654780626296997, + "learning_rate": 9.983838383838384e-05, + "loss": 0.1179, + "step": 4960 + }, + { + "epoch": 5.02020202020202, + "grad_norm": 0.6909185647964478, + "learning_rate": 9.963636363636363e-05, + "loss": 0.0716, + "step": 4970 + }, + { + "epoch": 5.03030303030303, + "grad_norm": 1.2126760482788086, + "learning_rate": 9.943434343434343e-05, + "loss": 0.1207, + "step": 4980 + }, + { + "epoch": 5.040404040404041, + "grad_norm": 0.04314064979553223, + "learning_rate": 9.923232323232323e-05, + "loss": 0.0951, + "step": 4990 + }, + { + "epoch": 5.05050505050505, + "grad_norm": 6.088857650756836, + "learning_rate": 9.903030303030305e-05, + "loss": 0.1948, + "step": 5000 + }, + { + "epoch": 5.05050505050505, + "eval_accuracy": 0.9402076620121733, + "eval_loss": 0.16094554960727692, + "eval_runtime": 25.955, + "eval_samples_per_second": 107.609, + "eval_steps_per_second": 13.485, + "step": 5000 + }, + { + "epoch": 5.0606060606060606, + "grad_norm": 3.058575391769409, + "learning_rate": 9.882828282828283e-05, + "loss": 0.0802, + "step": 5010 + }, + { + "epoch": 5.070707070707071, + "grad_norm": 0.2111556977033615, + "learning_rate": 9.862626262626263e-05, + "loss": 0.0589, + "step": 5020 + }, + { + "epoch": 5.08080808080808, + "grad_norm": 2.073899745941162, + "learning_rate": 9.842424242424243e-05, + "loss": 0.1131, + "step": 5030 + }, + { + "epoch": 5.090909090909091, + "grad_norm": 4.841446876525879, + "learning_rate": 9.822222222222223e-05, + "loss": 0.0367, + "step": 5040 + }, + { + "epoch": 5.101010101010101, + "grad_norm": 7.904470443725586, + "learning_rate": 9.802020202020203e-05, + "loss": 0.0877, + "step": 5050 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 1.0194205045700073, + "learning_rate": 9.781818181818183e-05, + "loss": 0.0882, + "step": 5060 + }, + { + "epoch": 5.121212121212121, + "grad_norm": 0.17837467789649963, + "learning_rate": 9.761616161616161e-05, + "loss": 0.0991, + "step": 5070 + }, + { + "epoch": 5.1313131313131315, + "grad_norm": 1.0016158819198608, + "learning_rate": 9.741414141414141e-05, + "loss": 0.0396, + "step": 5080 + }, + { + "epoch": 5.141414141414141, + "grad_norm": 0.15566827356815338, + "learning_rate": 9.721212121212122e-05, + "loss": 0.1433, + "step": 5090 + }, + { + "epoch": 5.151515151515151, + "grad_norm": 0.47362661361694336, + "learning_rate": 9.701010101010102e-05, + "loss": 0.1021, + "step": 5100 + }, + { + "epoch": 5.151515151515151, + "eval_accuracy": 0.9459362692445399, + "eval_loss": 0.15655164420604706, + "eval_runtime": 25.4433, + "eval_samples_per_second": 109.774, + "eval_steps_per_second": 13.756, + "step": 5100 + }, + { + "epoch": 5.161616161616162, + "grad_norm": 3.55830979347229, + "learning_rate": 9.680808080808082e-05, + "loss": 0.1389, + "step": 5110 + }, + { + "epoch": 5.171717171717171, + "grad_norm": 1.5197815895080566, + "learning_rate": 9.66060606060606e-05, + "loss": 0.12, + "step": 5120 + }, + { + "epoch": 5.181818181818182, + "grad_norm": 0.4415872097015381, + "learning_rate": 9.64040404040404e-05, + "loss": 0.0656, + "step": 5130 + }, + { + "epoch": 5.191919191919192, + "grad_norm": 2.322957754135132, + "learning_rate": 9.62020202020202e-05, + "loss": 0.091, + "step": 5140 + }, + { + "epoch": 5.202020202020202, + "grad_norm": 1.3364367485046387, + "learning_rate": 9.6e-05, + "loss": 0.12, + "step": 5150 + }, + { + "epoch": 5.212121212121212, + "grad_norm": 1.271172285079956, + "learning_rate": 9.57979797979798e-05, + "loss": 0.0752, + "step": 5160 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 1.848639965057373, + "learning_rate": 9.55959595959596e-05, + "loss": 0.065, + "step": 5170 + }, + { + "epoch": 5.232323232323233, + "grad_norm": 0.22514915466308594, + "learning_rate": 9.53939393939394e-05, + "loss": 0.1079, + "step": 5180 + }, + { + "epoch": 5.242424242424242, + "grad_norm": 2.2983055114746094, + "learning_rate": 9.51919191919192e-05, + "loss": 0.1161, + "step": 5190 + }, + { + "epoch": 5.252525252525253, + "grad_norm": 0.3693287968635559, + "learning_rate": 9.4989898989899e-05, + "loss": 0.0652, + "step": 5200 + }, + { + "epoch": 5.252525252525253, + "eval_accuracy": 0.9480844969566774, + "eval_loss": 0.15643763542175293, + "eval_runtime": 25.855, + "eval_samples_per_second": 108.025, + "eval_steps_per_second": 13.537, + "step": 5200 + }, + { + "epoch": 5.262626262626263, + "grad_norm": 2.0807137489318848, + "learning_rate": 9.47878787878788e-05, + "loss": 0.1401, + "step": 5210 + }, + { + "epoch": 5.2727272727272725, + "grad_norm": 0.10419812053442001, + "learning_rate": 9.45858585858586e-05, + "loss": 0.0812, + "step": 5220 + }, + { + "epoch": 5.282828282828283, + "grad_norm": 0.46770191192626953, + "learning_rate": 9.438383838383838e-05, + "loss": 0.0421, + "step": 5230 + }, + { + "epoch": 5.292929292929293, + "grad_norm": 2.635011672973633, + "learning_rate": 9.418181818181818e-05, + "loss": 0.1885, + "step": 5240 + }, + { + "epoch": 5.303030303030303, + "grad_norm": 0.25498488545417786, + "learning_rate": 9.397979797979798e-05, + "loss": 0.0601, + "step": 5250 + }, + { + "epoch": 5.313131313131313, + "grad_norm": 1.459560751914978, + "learning_rate": 9.377777777777779e-05, + "loss": 0.1223, + "step": 5260 + }, + { + "epoch": 5.3232323232323235, + "grad_norm": 0.1610535830259323, + "learning_rate": 9.357575757575759e-05, + "loss": 0.1514, + "step": 5270 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.09537172317504883, + "learning_rate": 9.337373737373738e-05, + "loss": 0.0844, + "step": 5280 + }, + { + "epoch": 5.343434343434343, + "grad_norm": 2.253643035888672, + "learning_rate": 9.317171717171717e-05, + "loss": 0.0966, + "step": 5290 + }, + { + "epoch": 5.353535353535354, + "grad_norm": 0.7719996571540833, + "learning_rate": 9.296969696969697e-05, + "loss": 0.1029, + "step": 5300 + }, + { + "epoch": 5.353535353535354, + "eval_accuracy": 0.9491586108127461, + "eval_loss": 0.14101244509220123, + "eval_runtime": 25.4761, + "eval_samples_per_second": 109.632, + "eval_steps_per_second": 13.738, + "step": 5300 + }, + { + "epoch": 5.363636363636363, + "grad_norm": 0.09155893325805664, + "learning_rate": 9.276767676767677e-05, + "loss": 0.1047, + "step": 5310 + }, + { + "epoch": 5.373737373737374, + "grad_norm": 0.717546284198761, + "learning_rate": 9.256565656565657e-05, + "loss": 0.1009, + "step": 5320 + }, + { + "epoch": 5.383838383838384, + "grad_norm": 1.9400055408477783, + "learning_rate": 9.236363636363636e-05, + "loss": 0.0928, + "step": 5330 + }, + { + "epoch": 5.393939393939394, + "grad_norm": 2.2842278480529785, + "learning_rate": 9.216161616161617e-05, + "loss": 0.1545, + "step": 5340 + }, + { + "epoch": 5.404040404040404, + "grad_norm": 2.3194849491119385, + "learning_rate": 9.195959595959597e-05, + "loss": 0.0965, + "step": 5350 + }, + { + "epoch": 5.414141414141414, + "grad_norm": 1.7421154975891113, + "learning_rate": 9.175757575757577e-05, + "loss": 0.1106, + "step": 5360 + }, + { + "epoch": 5.424242424242424, + "grad_norm": 0.7392351031303406, + "learning_rate": 9.155555555555557e-05, + "loss": 0.0633, + "step": 5370 + }, + { + "epoch": 5.434343434343434, + "grad_norm": 1.5309923887252808, + "learning_rate": 9.135353535353535e-05, + "loss": 0.1131, + "step": 5380 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 2.1185829639434814, + "learning_rate": 9.115151515151515e-05, + "loss": 0.0428, + "step": 5390 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.35134243965148926, + "learning_rate": 9.094949494949495e-05, + "loss": 0.1014, + "step": 5400 + }, + { + "epoch": 5.454545454545454, + "eval_accuracy": 0.9530970282849982, + "eval_loss": 0.1489526480436325, + "eval_runtime": 26.3602, + "eval_samples_per_second": 105.955, + "eval_steps_per_second": 13.278, + "step": 5400 + }, + { + "epoch": 5.4646464646464645, + "grad_norm": 0.04129509627819061, + "learning_rate": 9.074747474747475e-05, + "loss": 0.0783, + "step": 5410 + }, + { + "epoch": 5.474747474747475, + "grad_norm": 4.56264066696167, + "learning_rate": 9.054545454545455e-05, + "loss": 0.0867, + "step": 5420 + }, + { + "epoch": 5.484848484848484, + "grad_norm": 0.036666784435510635, + "learning_rate": 9.034343434343435e-05, + "loss": 0.0902, + "step": 5430 + }, + { + "epoch": 5.494949494949495, + "grad_norm": 0.4058803915977478, + "learning_rate": 9.014141414141415e-05, + "loss": 0.0481, + "step": 5440 + }, + { + "epoch": 5.505050505050505, + "grad_norm": 0.5836386680603027, + "learning_rate": 8.993939393939394e-05, + "loss": 0.0567, + "step": 5450 + }, + { + "epoch": 5.515151515151516, + "grad_norm": 0.03230315446853638, + "learning_rate": 8.973737373737374e-05, + "loss": 0.0862, + "step": 5460 + }, + { + "epoch": 5.525252525252525, + "grad_norm": 1.033913254737854, + "learning_rate": 8.953535353535354e-05, + "loss": 0.1181, + "step": 5470 + }, + { + "epoch": 5.5353535353535355, + "grad_norm": 5.181567192077637, + "learning_rate": 8.933333333333334e-05, + "loss": 0.1131, + "step": 5480 + }, + { + "epoch": 5.545454545454545, + "grad_norm": 0.06244403123855591, + "learning_rate": 8.913131313131313e-05, + "loss": 0.0977, + "step": 5490 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 6.101346015930176, + "learning_rate": 8.892929292929293e-05, + "loss": 0.1338, + "step": 5500 + }, + { + "epoch": 5.555555555555555, + "eval_accuracy": 0.9405656999641963, + "eval_loss": 0.1865406185388565, + "eval_runtime": 25.7365, + "eval_samples_per_second": 108.523, + "eval_steps_per_second": 13.599, + "step": 5500 + }, + { + "epoch": 5.565656565656566, + "grad_norm": 4.3687968254089355, + "learning_rate": 8.872727272727274e-05, + "loss": 0.1066, + "step": 5510 + }, + { + "epoch": 5.575757575757576, + "grad_norm": 0.08140174299478531, + "learning_rate": 8.852525252525254e-05, + "loss": 0.0473, + "step": 5520 + }, + { + "epoch": 5.585858585858586, + "grad_norm": 2.2927050590515137, + "learning_rate": 8.832323232323234e-05, + "loss": 0.0988, + "step": 5530 + }, + { + "epoch": 5.595959595959596, + "grad_norm": 3.8242876529693604, + "learning_rate": 8.812121212121212e-05, + "loss": 0.0915, + "step": 5540 + }, + { + "epoch": 5.606060606060606, + "grad_norm": 1.7475749254226685, + "learning_rate": 8.791919191919192e-05, + "loss": 0.1052, + "step": 5550 + }, + { + "epoch": 5.616161616161616, + "grad_norm": 1.825736403465271, + "learning_rate": 8.771717171717172e-05, + "loss": 0.132, + "step": 5560 + }, + { + "epoch": 5.626262626262626, + "grad_norm": 2.666304588317871, + "learning_rate": 8.751515151515152e-05, + "loss": 0.1736, + "step": 5570 + }, + { + "epoch": 5.636363636363637, + "grad_norm": 1.111354947090149, + "learning_rate": 8.731313131313132e-05, + "loss": 0.1023, + "step": 5580 + }, + { + "epoch": 5.646464646464646, + "grad_norm": 0.05366240069270134, + "learning_rate": 8.711111111111112e-05, + "loss": 0.0442, + "step": 5590 + }, + { + "epoch": 5.656565656565657, + "grad_norm": 3.5129668712615967, + "learning_rate": 8.690909090909091e-05, + "loss": 0.0844, + "step": 5600 + }, + { + "epoch": 5.656565656565657, + "eval_accuracy": 0.9455782312925171, + "eval_loss": 0.16305388510227203, + "eval_runtime": 25.3176, + "eval_samples_per_second": 110.318, + "eval_steps_per_second": 13.824, + "step": 5600 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.3834172487258911, + "learning_rate": 8.670707070707071e-05, + "loss": 0.1163, + "step": 5610 + }, + { + "epoch": 5.6767676767676765, + "grad_norm": 0.12349528819322586, + "learning_rate": 8.650505050505051e-05, + "loss": 0.0715, + "step": 5620 + }, + { + "epoch": 5.686868686868687, + "grad_norm": 0.23962771892547607, + "learning_rate": 8.630303030303031e-05, + "loss": 0.0783, + "step": 5630 + }, + { + "epoch": 5.696969696969697, + "grad_norm": 3.6592202186584473, + "learning_rate": 8.610101010101011e-05, + "loss": 0.0441, + "step": 5640 + }, + { + "epoch": 5.707070707070707, + "grad_norm": 5.241848945617676, + "learning_rate": 8.58989898989899e-05, + "loss": 0.1715, + "step": 5650 + }, + { + "epoch": 5.717171717171717, + "grad_norm": 0.05768425762653351, + "learning_rate": 8.56969696969697e-05, + "loss": 0.1549, + "step": 5660 + }, + { + "epoch": 5.7272727272727275, + "grad_norm": 1.6297590732574463, + "learning_rate": 8.54949494949495e-05, + "loss": 0.0363, + "step": 5670 + }, + { + "epoch": 5.737373737373737, + "grad_norm": 2.148622512817383, + "learning_rate": 8.52929292929293e-05, + "loss": 0.0904, + "step": 5680 + }, + { + "epoch": 5.747474747474747, + "grad_norm": 0.2656024396419525, + "learning_rate": 8.50909090909091e-05, + "loss": 0.0794, + "step": 5690 + }, + { + "epoch": 5.757575757575758, + "grad_norm": 0.6363499760627747, + "learning_rate": 8.488888888888889e-05, + "loss": 0.1059, + "step": 5700 + }, + { + "epoch": 5.757575757575758, + "eval_accuracy": 0.9409237379162191, + "eval_loss": 0.17375385761260986, + "eval_runtime": 25.566, + "eval_samples_per_second": 109.247, + "eval_steps_per_second": 13.69, + "step": 5700 + }, + { + "epoch": 5.767676767676767, + "grad_norm": 0.7380683422088623, + "learning_rate": 8.468686868686869e-05, + "loss": 0.1321, + "step": 5710 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 4.799917221069336, + "learning_rate": 8.448484848484849e-05, + "loss": 0.1298, + "step": 5720 + }, + { + "epoch": 5.787878787878788, + "grad_norm": 0.126003697514534, + "learning_rate": 8.428282828282829e-05, + "loss": 0.0479, + "step": 5730 + }, + { + "epoch": 5.797979797979798, + "grad_norm": 0.35241806507110596, + "learning_rate": 8.408080808080809e-05, + "loss": 0.0629, + "step": 5740 + }, + { + "epoch": 5.808080808080808, + "grad_norm": 0.16507993638515472, + "learning_rate": 8.387878787878789e-05, + "loss": 0.1027, + "step": 5750 + }, + { + "epoch": 5.818181818181818, + "grad_norm": 5.024535179138184, + "learning_rate": 8.367676767676767e-05, + "loss": 0.0982, + "step": 5760 + }, + { + "epoch": 5.828282828282829, + "grad_norm": 0.24480080604553223, + "learning_rate": 8.347474747474748e-05, + "loss": 0.2305, + "step": 5770 + }, + { + "epoch": 5.838383838383838, + "grad_norm": 0.8840824961662292, + "learning_rate": 8.327272727272728e-05, + "loss": 0.1446, + "step": 5780 + }, + { + "epoch": 5.848484848484849, + "grad_norm": 3.435821533203125, + "learning_rate": 8.307070707070708e-05, + "loss": 0.1347, + "step": 5790 + }, + { + "epoch": 5.858585858585858, + "grad_norm": 2.1608059406280518, + "learning_rate": 8.286868686868687e-05, + "loss": 0.0788, + "step": 5800 + }, + { + "epoch": 5.858585858585858, + "eval_accuracy": 0.9369853204439671, + "eval_loss": 0.1801307052373886, + "eval_runtime": 26.0224, + "eval_samples_per_second": 107.331, + "eval_steps_per_second": 13.45, + "step": 5800 + }, + { + "epoch": 5.8686868686868685, + "grad_norm": 1.0681806802749634, + "learning_rate": 8.266666666666667e-05, + "loss": 0.124, + "step": 5810 + }, + { + "epoch": 5.878787878787879, + "grad_norm": 0.046699218451976776, + "learning_rate": 8.246464646464646e-05, + "loss": 0.0926, + "step": 5820 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 2.9529507160186768, + "learning_rate": 8.226262626262626e-05, + "loss": 0.0832, + "step": 5830 + }, + { + "epoch": 5.898989898989899, + "grad_norm": 0.03451383113861084, + "learning_rate": 8.206060606060606e-05, + "loss": 0.0209, + "step": 5840 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 0.5120105147361755, + "learning_rate": 8.185858585858586e-05, + "loss": 0.1213, + "step": 5850 + }, + { + "epoch": 5.91919191919192, + "grad_norm": 9.386165618896484, + "learning_rate": 8.165656565656566e-05, + "loss": 0.0569, + "step": 5860 + }, + { + "epoch": 5.929292929292929, + "grad_norm": 0.3248002231121063, + "learning_rate": 8.145454545454546e-05, + "loss": 0.1299, + "step": 5870 + }, + { + "epoch": 5.9393939393939394, + "grad_norm": 1.143277883529663, + "learning_rate": 8.125252525252526e-05, + "loss": 0.0873, + "step": 5880 + }, + { + "epoch": 5.94949494949495, + "grad_norm": 1.6092329025268555, + "learning_rate": 8.105050505050506e-05, + "loss": 0.0943, + "step": 5890 + }, + { + "epoch": 5.959595959595959, + "grad_norm": 2.943516492843628, + "learning_rate": 8.084848484848486e-05, + "loss": 0.0941, + "step": 5900 + }, + { + "epoch": 5.959595959595959, + "eval_accuracy": 0.949516648764769, + "eval_loss": 0.1574886590242386, + "eval_runtime": 25.8576, + "eval_samples_per_second": 108.015, + "eval_steps_per_second": 13.536, + "step": 5900 + }, + { + "epoch": 5.96969696969697, + "grad_norm": 0.6184459328651428, + "learning_rate": 8.064646464646464e-05, + "loss": 0.0993, + "step": 5910 + }, + { + "epoch": 5.97979797979798, + "grad_norm": 4.26630973815918, + "learning_rate": 8.044444444444444e-05, + "loss": 0.0598, + "step": 5920 + }, + { + "epoch": 5.98989898989899, + "grad_norm": 4.177489757537842, + "learning_rate": 8.024242424242424e-05, + "loss": 0.1909, + "step": 5930 + }, + { + "epoch": 6.0, + "grad_norm": 0.016284123063087463, + "learning_rate": 8.004040404040405e-05, + "loss": 0.0714, + "step": 5940 + }, + { + "epoch": 6.01010101010101, + "grad_norm": 1.6857322454452515, + "learning_rate": 7.983838383838385e-05, + "loss": 0.0628, + "step": 5950 + }, + { + "epoch": 6.02020202020202, + "grad_norm": 0.1405191570520401, + "learning_rate": 7.963636363636364e-05, + "loss": 0.049, + "step": 5960 + }, + { + "epoch": 6.03030303030303, + "grad_norm": 0.053569547832012177, + "learning_rate": 7.943434343434344e-05, + "loss": 0.0604, + "step": 5970 + }, + { + "epoch": 6.040404040404041, + "grad_norm": 0.10379111021757126, + "learning_rate": 7.923232323232323e-05, + "loss": 0.0351, + "step": 5980 + }, + { + "epoch": 6.05050505050505, + "grad_norm": 0.024483507499098778, + "learning_rate": 7.903030303030303e-05, + "loss": 0.0451, + "step": 5990 + }, + { + "epoch": 6.0606060606060606, + "grad_norm": 0.07992430031299591, + "learning_rate": 7.882828282828283e-05, + "loss": 0.112, + "step": 6000 + }, + { + "epoch": 6.0606060606060606, + "eval_accuracy": 0.9470103831006087, + "eval_loss": 0.1796165257692337, + "eval_runtime": 26.6699, + "eval_samples_per_second": 104.725, + "eval_steps_per_second": 13.123, + "step": 6000 + }, + { + "epoch": 6.070707070707071, + "grad_norm": 0.6754518747329712, + "learning_rate": 7.862626262626263e-05, + "loss": 0.0417, + "step": 6010 + }, + { + "epoch": 6.08080808080808, + "grad_norm": 5.186758995056152, + "learning_rate": 7.842424242424243e-05, + "loss": 0.0814, + "step": 6020 + }, + { + "epoch": 6.090909090909091, + "grad_norm": 4.043176174163818, + "learning_rate": 7.822222222222223e-05, + "loss": 0.0604, + "step": 6030 + }, + { + "epoch": 6.101010101010101, + "grad_norm": 1.122359037399292, + "learning_rate": 7.802020202020203e-05, + "loss": 0.0795, + "step": 6040 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.10599520802497864, + "learning_rate": 7.781818181818183e-05, + "loss": 0.0552, + "step": 6050 + }, + { + "epoch": 6.121212121212121, + "grad_norm": 0.051988355815410614, + "learning_rate": 7.761616161616163e-05, + "loss": 0.0393, + "step": 6060 + }, + { + "epoch": 6.1313131313131315, + "grad_norm": 0.14302538335323334, + "learning_rate": 7.741414141414141e-05, + "loss": 0.0152, + "step": 6070 + }, + { + "epoch": 6.141414141414141, + "grad_norm": 0.022072261199355125, + "learning_rate": 7.721212121212121e-05, + "loss": 0.111, + "step": 6080 + }, + { + "epoch": 6.151515151515151, + "grad_norm": 0.05234786495566368, + "learning_rate": 7.701010101010101e-05, + "loss": 0.0718, + "step": 6090 + }, + { + "epoch": 6.161616161616162, + "grad_norm": 0.22603179514408112, + "learning_rate": 7.680808080808081e-05, + "loss": 0.0691, + "step": 6100 + }, + { + "epoch": 6.161616161616162, + "eval_accuracy": 0.949874686716792, + "eval_loss": 0.16974994540214539, + "eval_runtime": 25.0765, + "eval_samples_per_second": 111.379, + "eval_steps_per_second": 13.957, + "step": 6100 + }, + { + "epoch": 6.171717171717171, + "grad_norm": 0.09569063037633896, + "learning_rate": 7.660606060606062e-05, + "loss": 0.1551, + "step": 6110 + }, + { + "epoch": 6.181818181818182, + "grad_norm": 0.29587113857269287, + "learning_rate": 7.64040404040404e-05, + "loss": 0.045, + "step": 6120 + }, + { + "epoch": 6.191919191919192, + "grad_norm": 0.01917618326842785, + "learning_rate": 7.62020202020202e-05, + "loss": 0.1272, + "step": 6130 + }, + { + "epoch": 6.202020202020202, + "grad_norm": 2.4523396492004395, + "learning_rate": 7.6e-05, + "loss": 0.0166, + "step": 6140 + }, + { + "epoch": 6.212121212121212, + "grad_norm": 3.089550256729126, + "learning_rate": 7.57979797979798e-05, + "loss": 0.103, + "step": 6150 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 0.07542894780635834, + "learning_rate": 7.55959595959596e-05, + "loss": 0.1016, + "step": 6160 + }, + { + "epoch": 6.232323232323233, + "grad_norm": 1.3589491844177246, + "learning_rate": 7.53939393939394e-05, + "loss": 0.119, + "step": 6170 + }, + { + "epoch": 6.242424242424242, + "grad_norm": 0.09127297252416611, + "learning_rate": 7.519191919191919e-05, + "loss": 0.0322, + "step": 6180 + }, + { + "epoch": 6.252525252525253, + "grad_norm": 0.4492006301879883, + "learning_rate": 7.4989898989899e-05, + "loss": 0.0483, + "step": 6190 + }, + { + "epoch": 6.262626262626263, + "grad_norm": 3.9789376258850098, + "learning_rate": 7.47878787878788e-05, + "loss": 0.1385, + "step": 6200 + }, + { + "epoch": 6.262626262626263, + "eval_accuracy": 0.9563193698532044, + "eval_loss": 0.1347890943288803, + "eval_runtime": 25.6815, + "eval_samples_per_second": 108.755, + "eval_steps_per_second": 13.628, + "step": 6200 + }, + { + "epoch": 6.2727272727272725, + "grad_norm": 1.760690689086914, + "learning_rate": 7.45858585858586e-05, + "loss": 0.0754, + "step": 6210 + }, + { + "epoch": 6.282828282828283, + "grad_norm": 0.040993157774209976, + "learning_rate": 7.438383838383838e-05, + "loss": 0.0793, + "step": 6220 + }, + { + "epoch": 6.292929292929293, + "grad_norm": 1.8426438570022583, + "learning_rate": 7.418181818181818e-05, + "loss": 0.1176, + "step": 6230 + }, + { + "epoch": 6.303030303030303, + "grad_norm": 0.208732470870018, + "learning_rate": 7.397979797979798e-05, + "loss": 0.1086, + "step": 6240 + }, + { + "epoch": 6.313131313131313, + "grad_norm": 0.08218652009963989, + "learning_rate": 7.377777777777778e-05, + "loss": 0.0306, + "step": 6250 + }, + { + "epoch": 6.3232323232323235, + "grad_norm": 2.8951575756073, + "learning_rate": 7.357575757575758e-05, + "loss": 0.0742, + "step": 6260 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 3.194607973098755, + "learning_rate": 7.337373737373738e-05, + "loss": 0.1073, + "step": 6270 + }, + { + "epoch": 6.343434343434343, + "grad_norm": 2.866347074508667, + "learning_rate": 7.317171717171718e-05, + "loss": 0.0365, + "step": 6280 + }, + { + "epoch": 6.353535353535354, + "grad_norm": 0.01601150631904602, + "learning_rate": 7.296969696969697e-05, + "loss": 0.0821, + "step": 6290 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 2.505512237548828, + "learning_rate": 7.276767676767677e-05, + "loss": 0.1173, + "step": 6300 + }, + { + "epoch": 6.363636363636363, + "eval_accuracy": 0.9502327246688149, + "eval_loss": 0.1521872878074646, + "eval_runtime": 26.1829, + "eval_samples_per_second": 106.673, + "eval_steps_per_second": 13.367, + "step": 6300 + }, + { + "epoch": 6.373737373737374, + "grad_norm": 4.881263732910156, + "learning_rate": 7.256565656565657e-05, + "loss": 0.0931, + "step": 6310 + }, + { + "epoch": 6.383838383838384, + "grad_norm": 0.1095634326338768, + "learning_rate": 7.236363636363637e-05, + "loss": 0.0494, + "step": 6320 + }, + { + "epoch": 6.393939393939394, + "grad_norm": 4.2075581550598145, + "learning_rate": 7.216161616161616e-05, + "loss": 0.044, + "step": 6330 + }, + { + "epoch": 6.404040404040404, + "grad_norm": 0.23861250281333923, + "learning_rate": 7.195959595959596e-05, + "loss": 0.1207, + "step": 6340 + }, + { + "epoch": 6.414141414141414, + "grad_norm": 0.022121990099549294, + "learning_rate": 7.175757575757576e-05, + "loss": 0.0587, + "step": 6350 + }, + { + "epoch": 6.424242424242424, + "grad_norm": 0.36593952775001526, + "learning_rate": 7.155555555555555e-05, + "loss": 0.141, + "step": 6360 + }, + { + "epoch": 6.434343434343434, + "grad_norm": 2.1025164127349854, + "learning_rate": 7.135353535353537e-05, + "loss": 0.0982, + "step": 6370 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 1.3649953603744507, + "learning_rate": 7.115151515151515e-05, + "loss": 0.1344, + "step": 6380 + }, + { + "epoch": 6.454545454545454, + "grad_norm": 0.5976651906967163, + "learning_rate": 7.094949494949495e-05, + "loss": 0.1288, + "step": 6390 + }, + { + "epoch": 6.4646464646464645, + "grad_norm": 0.1333063542842865, + "learning_rate": 7.074747474747475e-05, + "loss": 0.046, + "step": 6400 + }, + { + "epoch": 6.4646464646464645, + "eval_accuracy": 0.9391335481561045, + "eval_loss": 0.21140703558921814, + "eval_runtime": 25.6858, + "eval_samples_per_second": 108.737, + "eval_steps_per_second": 13.626, + "step": 6400 + }, + { + "epoch": 6.474747474747475, + "grad_norm": 0.41920793056488037, + "learning_rate": 7.054545454545455e-05, + "loss": 0.1177, + "step": 6410 + }, + { + "epoch": 6.484848484848484, + "grad_norm": 6.899296283721924, + "learning_rate": 7.034343434343435e-05, + "loss": 0.0703, + "step": 6420 + }, + { + "epoch": 6.494949494949495, + "grad_norm": 0.05274542421102524, + "learning_rate": 7.014141414141415e-05, + "loss": 0.0128, + "step": 6430 + }, + { + "epoch": 6.505050505050505, + "grad_norm": 0.020516296848654747, + "learning_rate": 6.993939393939393e-05, + "loss": 0.0802, + "step": 6440 + }, + { + "epoch": 6.515151515151516, + "grad_norm": 0.6311562061309814, + "learning_rate": 6.973737373737374e-05, + "loss": 0.0316, + "step": 6450 + }, + { + "epoch": 6.525252525252525, + "grad_norm": 0.04587104544043541, + "learning_rate": 6.953535353535354e-05, + "loss": 0.1231, + "step": 6460 + }, + { + "epoch": 6.5353535353535355, + "grad_norm": 0.025068655610084534, + "learning_rate": 6.933333333333334e-05, + "loss": 0.0788, + "step": 6470 + }, + { + "epoch": 6.545454545454545, + "grad_norm": 0.635887861251831, + "learning_rate": 6.915151515151516e-05, + "loss": 0.0303, + "step": 6480 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.1272686868906021, + "learning_rate": 6.894949494949496e-05, + "loss": 0.0787, + "step": 6490 + }, + { + "epoch": 6.565656565656566, + "grad_norm": 2.4348435401916504, + "learning_rate": 6.874747474747476e-05, + "loss": 0.0319, + "step": 6500 + }, + { + "epoch": 6.565656565656566, + "eval_accuracy": 0.9477264590046545, + "eval_loss": 0.17225638031959534, + "eval_runtime": 25.5065, + "eval_samples_per_second": 109.501, + "eval_steps_per_second": 13.722, + "step": 6500 + }, + { + "epoch": 6.575757575757576, + "grad_norm": 1.989790678024292, + "learning_rate": 6.854545454545454e-05, + "loss": 0.0225, + "step": 6510 + }, + { + "epoch": 6.585858585858586, + "grad_norm": 0.0541650727391243, + "learning_rate": 6.834343434343434e-05, + "loss": 0.1695, + "step": 6520 + }, + { + "epoch": 6.595959595959596, + "grad_norm": 0.4964849352836609, + "learning_rate": 6.814141414141414e-05, + "loss": 0.0415, + "step": 6530 + }, + { + "epoch": 6.606060606060606, + "grad_norm": 0.09336791187524796, + "learning_rate": 6.793939393939395e-05, + "loss": 0.1004, + "step": 6540 + }, + { + "epoch": 6.616161616161616, + "grad_norm": 0.338398277759552, + "learning_rate": 6.773737373737375e-05, + "loss": 0.0221, + "step": 6550 + }, + { + "epoch": 6.626262626262626, + "grad_norm": 0.09210254997015, + "learning_rate": 6.753535353535354e-05, + "loss": 0.0449, + "step": 6560 + }, + { + "epoch": 6.636363636363637, + "grad_norm": 0.05256842449307442, + "learning_rate": 6.733333333333333e-05, + "loss": 0.0527, + "step": 6570 + }, + { + "epoch": 6.646464646464646, + "grad_norm": 0.311467707157135, + "learning_rate": 6.713131313131313e-05, + "loss": 0.035, + "step": 6580 + }, + { + "epoch": 6.656565656565657, + "grad_norm": 0.2883482575416565, + "learning_rate": 6.692929292929293e-05, + "loss": 0.1456, + "step": 6590 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.8831584453582764, + "learning_rate": 6.672727272727273e-05, + "loss": 0.0757, + "step": 6600 + }, + { + "epoch": 6.666666666666667, + "eval_accuracy": 0.9527389903329753, + "eval_loss": 0.15606904029846191, + "eval_runtime": 26.056, + "eval_samples_per_second": 107.192, + "eval_steps_per_second": 13.433, + "step": 6600 + }, + { + "epoch": 6.6767676767676765, + "grad_norm": 0.7311285734176636, + "learning_rate": 6.652525252525253e-05, + "loss": 0.0205, + "step": 6610 + }, + { + "epoch": 6.686868686868687, + "grad_norm": 0.1137022078037262, + "learning_rate": 6.632323232323233e-05, + "loss": 0.0988, + "step": 6620 + }, + { + "epoch": 6.696969696969697, + "grad_norm": 0.35282573103904724, + "learning_rate": 6.612121212121213e-05, + "loss": 0.1312, + "step": 6630 + }, + { + "epoch": 6.707070707070707, + "grad_norm": 0.3823562264442444, + "learning_rate": 6.591919191919193e-05, + "loss": 0.0931, + "step": 6640 + }, + { + "epoch": 6.717171717171717, + "grad_norm": 0.1301673948764801, + "learning_rate": 6.571717171717173e-05, + "loss": 0.0202, + "step": 6650 + }, + { + "epoch": 6.7272727272727275, + "grad_norm": 0.04657525569200516, + "learning_rate": 6.551515151515151e-05, + "loss": 0.0529, + "step": 6660 + }, + { + "epoch": 6.737373737373737, + "grad_norm": 0.28591451048851013, + "learning_rate": 6.531313131313131e-05, + "loss": 0.018, + "step": 6670 + }, + { + "epoch": 6.747474747474747, + "grad_norm": 0.03750978037714958, + "learning_rate": 6.511111111111111e-05, + "loss": 0.0104, + "step": 6680 + }, + { + "epoch": 6.757575757575758, + "grad_norm": 6.778770446777344, + "learning_rate": 6.490909090909091e-05, + "loss": 0.0783, + "step": 6690 + }, + { + "epoch": 6.767676767676767, + "grad_norm": 0.09437508136034012, + "learning_rate": 6.470707070707071e-05, + "loss": 0.0744, + "step": 6700 + }, + { + "epoch": 6.767676767676767, + "eval_accuracy": 0.9566774078052274, + "eval_loss": 0.15866762399673462, + "eval_runtime": 26.4523, + "eval_samples_per_second": 105.586, + "eval_steps_per_second": 13.231, + "step": 6700 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 0.2664666771888733, + "learning_rate": 6.45050505050505e-05, + "loss": 0.0872, + "step": 6710 + }, + { + "epoch": 6.787878787878788, + "grad_norm": 0.04324870929121971, + "learning_rate": 6.43030303030303e-05, + "loss": 0.0573, + "step": 6720 + }, + { + "epoch": 6.797979797979798, + "grad_norm": 0.751238226890564, + "learning_rate": 6.41010101010101e-05, + "loss": 0.1092, + "step": 6730 + }, + { + "epoch": 6.808080808080808, + "grad_norm": 0.10832629352807999, + "learning_rate": 6.38989898989899e-05, + "loss": 0.0776, + "step": 6740 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 0.5286844968795776, + "learning_rate": 6.36969696969697e-05, + "loss": 0.0374, + "step": 6750 + }, + { + "epoch": 6.828282828282829, + "grad_norm": 2.7931125164031982, + "learning_rate": 6.34949494949495e-05, + "loss": 0.1122, + "step": 6760 + }, + { + "epoch": 6.838383838383838, + "grad_norm": 0.05890562757849693, + "learning_rate": 6.329292929292929e-05, + "loss": 0.066, + "step": 6770 + }, + { + "epoch": 6.848484848484849, + "grad_norm": 0.583456814289093, + "learning_rate": 6.309090909090909e-05, + "loss": 0.0537, + "step": 6780 + }, + { + "epoch": 6.858585858585858, + "grad_norm": 0.058590278029441833, + "learning_rate": 6.28888888888889e-05, + "loss": 0.096, + "step": 6790 + }, + { + "epoch": 6.8686868686868685, + "grad_norm": 0.6827572584152222, + "learning_rate": 6.26868686868687e-05, + "loss": 0.0341, + "step": 6800 + }, + { + "epoch": 6.8686868686868685, + "eval_accuracy": 0.9577515216612961, + "eval_loss": 0.14577987790107727, + "eval_runtime": 26.3019, + "eval_samples_per_second": 106.19, + "eval_steps_per_second": 13.307, + "step": 6800 + }, + { + "epoch": 6.878787878787879, + "grad_norm": 5.151520729064941, + "learning_rate": 6.24848484848485e-05, + "loss": 0.0552, + "step": 6810 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 0.39091670513153076, + "learning_rate": 6.228282828282828e-05, + "loss": 0.0373, + "step": 6820 + }, + { + "epoch": 6.898989898989899, + "grad_norm": 0.05881926789879799, + "learning_rate": 6.208080808080808e-05, + "loss": 0.0314, + "step": 6830 + }, + { + "epoch": 6.909090909090909, + "grad_norm": 2.3945934772491455, + "learning_rate": 6.187878787878788e-05, + "loss": 0.115, + "step": 6840 + }, + { + "epoch": 6.91919191919192, + "grad_norm": 0.35756856203079224, + "learning_rate": 6.167676767676768e-05, + "loss": 0.0212, + "step": 6850 + }, + { + "epoch": 6.929292929292929, + "grad_norm": 1.2077807188034058, + "learning_rate": 6.147474747474748e-05, + "loss": 0.1025, + "step": 6860 + }, + { + "epoch": 6.9393939393939394, + "grad_norm": 0.029852261766791344, + "learning_rate": 6.127272727272728e-05, + "loss": 0.0404, + "step": 6870 + }, + { + "epoch": 6.94949494949495, + "grad_norm": 0.8109472990036011, + "learning_rate": 6.107070707070708e-05, + "loss": 0.0539, + "step": 6880 + }, + { + "epoch": 6.959595959595959, + "grad_norm": 0.09217210114002228, + "learning_rate": 6.0868686868686874e-05, + "loss": 0.1452, + "step": 6890 + }, + { + "epoch": 6.96969696969697, + "grad_norm": 0.17790426313877106, + "learning_rate": 6.066666666666667e-05, + "loss": 0.1512, + "step": 6900 + }, + { + "epoch": 6.96969696969697, + "eval_accuracy": 0.9530970282849982, + "eval_loss": 0.1572313755750656, + "eval_runtime": 25.7039, + "eval_samples_per_second": 108.66, + "eval_steps_per_second": 13.617, + "step": 6900 + }, + { + "epoch": 6.97979797979798, + "grad_norm": 1.3701117038726807, + "learning_rate": 6.0464646464646465e-05, + "loss": 0.0949, + "step": 6910 + }, + { + "epoch": 6.98989898989899, + "grad_norm": 2.158742904663086, + "learning_rate": 6.0262626262626264e-05, + "loss": 0.0386, + "step": 6920 + }, + { + "epoch": 7.0, + "grad_norm": 1.495492935180664, + "learning_rate": 6.006060606060606e-05, + "loss": 0.0568, + "step": 6930 + }, + { + "epoch": 7.01010101010101, + "grad_norm": 5.56789493560791, + "learning_rate": 5.9858585858585855e-05, + "loss": 0.1273, + "step": 6940 + }, + { + "epoch": 7.02020202020202, + "grad_norm": 0.0192103311419487, + "learning_rate": 5.9656565656565654e-05, + "loss": 0.0044, + "step": 6950 + }, + { + "epoch": 7.03030303030303, + "grad_norm": 0.2975185215473175, + "learning_rate": 5.945454545454546e-05, + "loss": 0.0533, + "step": 6960 + }, + { + "epoch": 7.040404040404041, + "grad_norm": 5.849196434020996, + "learning_rate": 5.925252525252526e-05, + "loss": 0.0314, + "step": 6970 + }, + { + "epoch": 7.05050505050505, + "grad_norm": 0.017664331942796707, + "learning_rate": 5.905050505050506e-05, + "loss": 0.0925, + "step": 6980 + }, + { + "epoch": 7.0606060606060606, + "grad_norm": 4.201910495758057, + "learning_rate": 5.884848484848485e-05, + "loss": 0.0937, + "step": 6990 + }, + { + "epoch": 7.070707070707071, + "grad_norm": 0.5537464022636414, + "learning_rate": 5.864646464646465e-05, + "loss": 0.0153, + "step": 7000 + }, + { + "epoch": 7.070707070707071, + "eval_accuracy": 0.9616899391335482, + "eval_loss": 0.14021535217761993, + "eval_runtime": 26.1591, + "eval_samples_per_second": 106.77, + "eval_steps_per_second": 13.38, + "step": 7000 + }, + { + "epoch": 7.08080808080808, + "grad_norm": 0.04259471595287323, + "learning_rate": 5.844444444444445e-05, + "loss": 0.0496, + "step": 7010 + }, + { + "epoch": 7.090909090909091, + "grad_norm": 0.04379872977733612, + "learning_rate": 5.824242424242424e-05, + "loss": 0.0736, + "step": 7020 + }, + { + "epoch": 7.101010101010101, + "grad_norm": 25.18751335144043, + "learning_rate": 5.804040404040404e-05, + "loss": 0.0661, + "step": 7030 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.26519498229026794, + "learning_rate": 5.783838383838384e-05, + "loss": 0.0218, + "step": 7040 + }, + { + "epoch": 7.121212121212121, + "grad_norm": 3.203284502029419, + "learning_rate": 5.7636363636363644e-05, + "loss": 0.0396, + "step": 7050 + }, + { + "epoch": 7.1313131313131315, + "grad_norm": 3.1685590744018555, + "learning_rate": 5.743434343434344e-05, + "loss": 0.0262, + "step": 7060 + }, + { + "epoch": 7.141414141414141, + "grad_norm": 2.3913097381591797, + "learning_rate": 5.7232323232323235e-05, + "loss": 0.1202, + "step": 7070 + }, + { + "epoch": 7.151515151515151, + "grad_norm": 0.10309541970491409, + "learning_rate": 5.7030303030303034e-05, + "loss": 0.0539, + "step": 7080 + }, + { + "epoch": 7.161616161616162, + "grad_norm": 0.011513768695294857, + "learning_rate": 5.682828282828283e-05, + "loss": 0.0363, + "step": 7090 + }, + { + "epoch": 7.171717171717171, + "grad_norm": 0.0340239517390728, + "learning_rate": 5.6626262626262625e-05, + "loss": 0.0711, + "step": 7100 + }, + { + "epoch": 7.171717171717171, + "eval_accuracy": 0.9609738632295023, + "eval_loss": 0.15271997451782227, + "eval_runtime": 26.5432, + "eval_samples_per_second": 105.225, + "eval_steps_per_second": 13.186, + "step": 7100 + }, + { + "epoch": 7.181818181818182, + "grad_norm": 3.066012382507324, + "learning_rate": 5.6424242424242424e-05, + "loss": 0.0586, + "step": 7110 + }, + { + "epoch": 7.191919191919192, + "grad_norm": 0.015100710093975067, + "learning_rate": 5.622222222222222e-05, + "loss": 0.0458, + "step": 7120 + }, + { + "epoch": 7.202020202020202, + "grad_norm": 3.451115608215332, + "learning_rate": 5.602020202020203e-05, + "loss": 0.0593, + "step": 7130 + }, + { + "epoch": 7.212121212121212, + "grad_norm": 0.14259886741638184, + "learning_rate": 5.581818181818183e-05, + "loss": 0.0571, + "step": 7140 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 0.04314308986067772, + "learning_rate": 5.561616161616162e-05, + "loss": 0.0532, + "step": 7150 + }, + { + "epoch": 7.232323232323233, + "grad_norm": 0.063353031873703, + "learning_rate": 5.541414141414142e-05, + "loss": 0.0306, + "step": 7160 + }, + { + "epoch": 7.242424242424242, + "grad_norm": 0.026339426636695862, + "learning_rate": 5.521212121212122e-05, + "loss": 0.0427, + "step": 7170 + }, + { + "epoch": 7.252525252525253, + "grad_norm": 2.5659215450286865, + "learning_rate": 5.501010101010101e-05, + "loss": 0.158, + "step": 7180 + }, + { + "epoch": 7.262626262626263, + "grad_norm": 4.935789108276367, + "learning_rate": 5.480808080808081e-05, + "loss": 0.0395, + "step": 7190 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 3.4505467414855957, + "learning_rate": 5.460606060606061e-05, + "loss": 0.0453, + "step": 7200 + }, + { + "epoch": 7.2727272727272725, + "eval_accuracy": 0.9570354457572503, + "eval_loss": 0.15119105577468872, + "eval_runtime": 25.5859, + "eval_samples_per_second": 109.162, + "eval_steps_per_second": 13.679, + "step": 7200 + }, + { + "epoch": 7.282828282828283, + "grad_norm": 1.491591215133667, + "learning_rate": 5.44040404040404e-05, + "loss": 0.0508, + "step": 7210 + }, + { + "epoch": 7.292929292929293, + "grad_norm": 0.20135779678821564, + "learning_rate": 5.420202020202021e-05, + "loss": 0.0119, + "step": 7220 + }, + { + "epoch": 7.303030303030303, + "grad_norm": 0.27936965227127075, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0749, + "step": 7230 + }, + { + "epoch": 7.313131313131313, + "grad_norm": 0.16268154978752136, + "learning_rate": 5.3797979797979804e-05, + "loss": 0.0197, + "step": 7240 + }, + { + "epoch": 7.3232323232323235, + "grad_norm": 4.364110469818115, + "learning_rate": 5.35959595959596e-05, + "loss": 0.0412, + "step": 7250 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.24327309429645538, + "learning_rate": 5.3393939393939395e-05, + "loss": 0.0074, + "step": 7260 + }, + { + "epoch": 7.343434343434343, + "grad_norm": 0.03149246424436569, + "learning_rate": 5.3191919191919194e-05, + "loss": 0.1778, + "step": 7270 + }, + { + "epoch": 7.353535353535354, + "grad_norm": 0.015329347923398018, + "learning_rate": 5.298989898989899e-05, + "loss": 0.0401, + "step": 7280 + }, + { + "epoch": 7.363636363636363, + "grad_norm": 1.2658077478408813, + "learning_rate": 5.2787878787878785e-05, + "loss": 0.0301, + "step": 7290 + }, + { + "epoch": 7.373737373737374, + "grad_norm": 1.1392098665237427, + "learning_rate": 5.258585858585859e-05, + "loss": 0.0052, + "step": 7300 + }, + { + "epoch": 7.373737373737374, + "eval_accuracy": 0.9520229144289295, + "eval_loss": 0.1935870349407196, + "eval_runtime": 26.3847, + "eval_samples_per_second": 105.857, + "eval_steps_per_second": 13.265, + "step": 7300 + }, + { + "epoch": 7.383838383838384, + "grad_norm": 5.420514106750488, + "learning_rate": 5.238383838383839e-05, + "loss": 0.085, + "step": 7310 + }, + { + "epoch": 7.393939393939394, + "grad_norm": 3.4437241554260254, + "learning_rate": 5.218181818181819e-05, + "loss": 0.0803, + "step": 7320 + }, + { + "epoch": 7.404040404040404, + "grad_norm": 0.11205892264842987, + "learning_rate": 5.197979797979798e-05, + "loss": 0.0702, + "step": 7330 + }, + { + "epoch": 7.414141414141414, + "grad_norm": 0.2336532026529312, + "learning_rate": 5.177777777777778e-05, + "loss": 0.0202, + "step": 7340 + }, + { + "epoch": 7.424242424242424, + "grad_norm": 0.2769406735897064, + "learning_rate": 5.157575757575758e-05, + "loss": 0.0833, + "step": 7350 + }, + { + "epoch": 7.434343434343434, + "grad_norm": 0.04012402519583702, + "learning_rate": 5.137373737373737e-05, + "loss": 0.0146, + "step": 7360 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 0.11134153604507446, + "learning_rate": 5.117171717171717e-05, + "loss": 0.0596, + "step": 7370 + }, + { + "epoch": 7.454545454545454, + "grad_norm": 3.452702760696411, + "learning_rate": 5.096969696969697e-05, + "loss": 0.0298, + "step": 7380 + }, + { + "epoch": 7.4646464646464645, + "grad_norm": 0.059644632041454315, + "learning_rate": 5.0767676767676774e-05, + "loss": 0.0785, + "step": 7390 + }, + { + "epoch": 7.474747474747475, + "grad_norm": 0.0173468217253685, + "learning_rate": 5.0565656565656573e-05, + "loss": 0.0477, + "step": 7400 + }, + { + "epoch": 7.474747474747475, + "eval_accuracy": 0.9513068385248836, + "eval_loss": 0.16992482542991638, + "eval_runtime": 25.1969, + "eval_samples_per_second": 110.847, + "eval_steps_per_second": 13.891, + "step": 7400 + }, + { + "epoch": 7.484848484848484, + "grad_norm": 0.07524644583463669, + "learning_rate": 5.0363636363636366e-05, + "loss": 0.0886, + "step": 7410 + }, + { + "epoch": 7.494949494949495, + "grad_norm": 0.541922926902771, + "learning_rate": 5.0161616161616165e-05, + "loss": 0.0421, + "step": 7420 + }, + { + "epoch": 7.505050505050505, + "grad_norm": 3.453490972518921, + "learning_rate": 4.9959595959595964e-05, + "loss": 0.0946, + "step": 7430 + }, + { + "epoch": 7.515151515151516, + "grad_norm": 0.1501472294330597, + "learning_rate": 4.9757575757575756e-05, + "loss": 0.0632, + "step": 7440 + }, + { + "epoch": 7.525252525252525, + "grad_norm": 0.0449078306555748, + "learning_rate": 4.955555555555556e-05, + "loss": 0.0397, + "step": 7450 + }, + { + "epoch": 7.5353535353535355, + "grad_norm": 2.0380120277404785, + "learning_rate": 4.935353535353536e-05, + "loss": 0.0795, + "step": 7460 + }, + { + "epoch": 7.545454545454545, + "grad_norm": 0.324027419090271, + "learning_rate": 4.915151515151515e-05, + "loss": 0.1152, + "step": 7470 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 4.435551166534424, + "learning_rate": 4.894949494949495e-05, + "loss": 0.1028, + "step": 7480 + }, + { + "epoch": 7.565656565656566, + "grad_norm": 0.15692196786403656, + "learning_rate": 4.874747474747475e-05, + "loss": 0.0669, + "step": 7490 + }, + { + "epoch": 7.575757575757576, + "grad_norm": 0.1278918832540512, + "learning_rate": 4.854545454545455e-05, + "loss": 0.091, + "step": 7500 + }, + { + "epoch": 7.575757575757576, + "eval_accuracy": 0.9513068385248836, + "eval_loss": 0.1628435254096985, + "eval_runtime": 25.9324, + "eval_samples_per_second": 107.703, + "eval_steps_per_second": 13.497, + "step": 7500 + }, + { + "epoch": 7.585858585858586, + "grad_norm": 0.06118590012192726, + "learning_rate": 4.834343434343435e-05, + "loss": 0.1065, + "step": 7510 + }, + { + "epoch": 7.595959595959596, + "grad_norm": 4.340107440948486, + "learning_rate": 4.814141414141414e-05, + "loss": 0.0821, + "step": 7520 + }, + { + "epoch": 7.606060606060606, + "grad_norm": 0.033492524176836014, + "learning_rate": 4.793939393939394e-05, + "loss": 0.0412, + "step": 7530 + }, + { + "epoch": 7.616161616161616, + "grad_norm": 0.07362242043018341, + "learning_rate": 4.773737373737374e-05, + "loss": 0.0686, + "step": 7540 + }, + { + "epoch": 7.626262626262626, + "grad_norm": 0.18190206587314606, + "learning_rate": 4.753535353535354e-05, + "loss": 0.023, + "step": 7550 + }, + { + "epoch": 7.636363636363637, + "grad_norm": 4.937658309936523, + "learning_rate": 4.7333333333333336e-05, + "loss": 0.0966, + "step": 7560 + }, + { + "epoch": 7.646464646464646, + "grad_norm": 0.1355772167444229, + "learning_rate": 4.713131313131313e-05, + "loss": 0.0872, + "step": 7570 + }, + { + "epoch": 7.656565656565657, + "grad_norm": 3.2895684242248535, + "learning_rate": 4.6929292929292934e-05, + "loss": 0.1588, + "step": 7580 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 1.176599383354187, + "learning_rate": 4.672727272727273e-05, + "loss": 0.0445, + "step": 7590 + }, + { + "epoch": 7.6767676767676765, + "grad_norm": 0.2387339174747467, + "learning_rate": 4.6525252525252525e-05, + "loss": 0.063, + "step": 7600 + }, + { + "epoch": 7.6767676767676765, + "eval_accuracy": 0.9577515216612961, + "eval_loss": 0.14738556742668152, + "eval_runtime": 25.6577, + "eval_samples_per_second": 108.856, + "eval_steps_per_second": 13.641, + "step": 7600 + }, + { + "epoch": 7.686868686868687, + "grad_norm": 2.4632670879364014, + "learning_rate": 4.6323232323232324e-05, + "loss": 0.0422, + "step": 7610 + }, + { + "epoch": 7.696969696969697, + "grad_norm": 0.07168363779783249, + "learning_rate": 4.612121212121212e-05, + "loss": 0.0169, + "step": 7620 + }, + { + "epoch": 7.707070707070707, + "grad_norm": 2.6026084423065186, + "learning_rate": 4.591919191919192e-05, + "loss": 0.0363, + "step": 7630 + }, + { + "epoch": 7.717171717171717, + "grad_norm": 2.899994373321533, + "learning_rate": 4.571717171717172e-05, + "loss": 0.0688, + "step": 7640 + }, + { + "epoch": 7.7272727272727275, + "grad_norm": 2.033482551574707, + "learning_rate": 4.5515151515151513e-05, + "loss": 0.0456, + "step": 7650 + }, + { + "epoch": 7.737373737373737, + "grad_norm": 0.19459086656570435, + "learning_rate": 4.531313131313131e-05, + "loss": 0.0643, + "step": 7660 + }, + { + "epoch": 7.747474747474747, + "grad_norm": 0.023826055228710175, + "learning_rate": 4.511111111111112e-05, + "loss": 0.0247, + "step": 7670 + }, + { + "epoch": 7.757575757575758, + "grad_norm": 0.21090379357337952, + "learning_rate": 4.490909090909091e-05, + "loss": 0.0417, + "step": 7680 + }, + { + "epoch": 7.767676767676767, + "grad_norm": 0.1651458889245987, + "learning_rate": 4.470707070707071e-05, + "loss": 0.032, + "step": 7690 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.25803717970848083, + "learning_rate": 4.450505050505051e-05, + "loss": 0.0497, + "step": 7700 + }, + { + "epoch": 7.777777777777778, + "eval_accuracy": 0.9613319011815252, + "eval_loss": 0.13887141644954681, + "eval_runtime": 25.6476, + "eval_samples_per_second": 108.899, + "eval_steps_per_second": 13.647, + "step": 7700 + }, + { + "epoch": 7.787878787878788, + "grad_norm": 4.601800441741943, + "learning_rate": 4.430303030303031e-05, + "loss": 0.1117, + "step": 7710 + }, + { + "epoch": 7.797979797979798, + "grad_norm": 0.2994377911090851, + "learning_rate": 4.4101010101010106e-05, + "loss": 0.0368, + "step": 7720 + }, + { + "epoch": 7.808080808080808, + "grad_norm": 6.382791042327881, + "learning_rate": 4.38989898989899e-05, + "loss": 0.1218, + "step": 7730 + }, + { + "epoch": 7.818181818181818, + "grad_norm": 0.1414938122034073, + "learning_rate": 4.36969696969697e-05, + "loss": 0.0296, + "step": 7740 + }, + { + "epoch": 7.828282828282829, + "grad_norm": 0.09703594446182251, + "learning_rate": 4.3494949494949496e-05, + "loss": 0.0436, + "step": 7750 + }, + { + "epoch": 7.838383838383838, + "grad_norm": 0.06831446290016174, + "learning_rate": 4.3292929292929295e-05, + "loss": 0.0748, + "step": 7760 + }, + { + "epoch": 7.848484848484849, + "grad_norm": 0.18760167062282562, + "learning_rate": 4.3090909090909094e-05, + "loss": 0.0534, + "step": 7770 + }, + { + "epoch": 7.858585858585858, + "grad_norm": 5.9486870765686035, + "learning_rate": 4.2888888888888886e-05, + "loss": 0.0627, + "step": 7780 + }, + { + "epoch": 7.8686868686868685, + "grad_norm": 0.156913161277771, + "learning_rate": 4.2686868686868685e-05, + "loss": 0.0586, + "step": 7790 + }, + { + "epoch": 7.878787878787879, + "grad_norm": 2.0493478775024414, + "learning_rate": 4.248484848484849e-05, + "loss": 0.0552, + "step": 7800 + }, + { + "epoch": 7.878787878787879, + "eval_accuracy": 0.9380594343000358, + "eval_loss": 0.25865495204925537, + "eval_runtime": 26.2229, + "eval_samples_per_second": 106.51, + "eval_steps_per_second": 13.347, + "step": 7800 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 4.067117214202881, + "learning_rate": 4.228282828282828e-05, + "loss": 0.0515, + "step": 7810 + }, + { + "epoch": 7.898989898989899, + "grad_norm": 0.11128303408622742, + "learning_rate": 4.208080808080808e-05, + "loss": 0.0604, + "step": 7820 + }, + { + "epoch": 7.909090909090909, + "grad_norm": 0.0496579185128212, + "learning_rate": 4.187878787878788e-05, + "loss": 0.0393, + "step": 7830 + }, + { + "epoch": 7.91919191919192, + "grad_norm": 0.021783454343676567, + "learning_rate": 4.167676767676768e-05, + "loss": 0.094, + "step": 7840 + }, + { + "epoch": 7.929292929292929, + "grad_norm": 3.133892059326172, + "learning_rate": 4.147474747474748e-05, + "loss": 0.0656, + "step": 7850 + }, + { + "epoch": 7.9393939393939394, + "grad_norm": 0.3182319402694702, + "learning_rate": 4.127272727272727e-05, + "loss": 0.0725, + "step": 7860 + }, + { + "epoch": 7.94949494949495, + "grad_norm": 0.10395387560129166, + "learning_rate": 4.107070707070707e-05, + "loss": 0.0352, + "step": 7870 + }, + { + "epoch": 7.959595959595959, + "grad_norm": 0.3641154170036316, + "learning_rate": 4.0868686868686876e-05, + "loss": 0.0894, + "step": 7880 + }, + { + "epoch": 7.96969696969697, + "grad_norm": 0.11004490405321121, + "learning_rate": 4.066666666666667e-05, + "loss": 0.0083, + "step": 7890 + }, + { + "epoch": 7.97979797979798, + "grad_norm": 0.014313346706330776, + "learning_rate": 4.046464646464647e-05, + "loss": 0.0364, + "step": 7900 + }, + { + "epoch": 7.97979797979798, + "eval_accuracy": 0.9602577873254565, + "eval_loss": 0.13607622683048248, + "eval_runtime": 25.5523, + "eval_samples_per_second": 109.305, + "eval_steps_per_second": 13.697, + "step": 7900 + }, + { + "epoch": 7.98989898989899, + "grad_norm": 3.420837879180908, + "learning_rate": 4.0262626262626266e-05, + "loss": 0.0268, + "step": 7910 + }, + { + "epoch": 8.0, + "grad_norm": 0.9288634657859802, + "learning_rate": 4.0060606060606065e-05, + "loss": 0.0731, + "step": 7920 + }, + { + "epoch": 8.01010101010101, + "grad_norm": 2.2030820846557617, + "learning_rate": 3.9858585858585864e-05, + "loss": 0.0636, + "step": 7930 + }, + { + "epoch": 8.02020202020202, + "grad_norm": 0.04353635758161545, + "learning_rate": 3.9656565656565656e-05, + "loss": 0.0463, + "step": 7940 + }, + { + "epoch": 8.030303030303031, + "grad_norm": 0.08897583186626434, + "learning_rate": 3.9454545454545455e-05, + "loss": 0.0529, + "step": 7950 + }, + { + "epoch": 8.04040404040404, + "grad_norm": 6.2125563621521, + "learning_rate": 3.9252525252525254e-05, + "loss": 0.043, + "step": 7960 + }, + { + "epoch": 8.05050505050505, + "grad_norm": 0.050241868942976, + "learning_rate": 3.905050505050505e-05, + "loss": 0.0077, + "step": 7970 + }, + { + "epoch": 8.06060606060606, + "grad_norm": 0.06562667340040207, + "learning_rate": 3.884848484848485e-05, + "loss": 0.023, + "step": 7980 + }, + { + "epoch": 8.070707070707071, + "grad_norm": 3.8004908561706543, + "learning_rate": 3.8646464646464644e-05, + "loss": 0.041, + "step": 7990 + }, + { + "epoch": 8.080808080808081, + "grad_norm": 0.06393536925315857, + "learning_rate": 3.844444444444444e-05, + "loss": 0.0124, + "step": 8000 + }, + { + "epoch": 8.080808080808081, + "eval_accuracy": 0.9606158252774795, + "eval_loss": 0.14380931854248047, + "eval_runtime": 25.6513, + "eval_samples_per_second": 108.883, + "eval_steps_per_second": 13.645, + "step": 8000 + }, + { + "epoch": 8.090909090909092, + "grad_norm": 1.5832247734069824, + "learning_rate": 3.824242424242425e-05, + "loss": 0.0782, + "step": 8010 + }, + { + "epoch": 8.1010101010101, + "grad_norm": 6.523228645324707, + "learning_rate": 3.804040404040404e-05, + "loss": 0.0384, + "step": 8020 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 0.03994593024253845, + "learning_rate": 3.783838383838384e-05, + "loss": 0.0077, + "step": 8030 + }, + { + "epoch": 8.121212121212121, + "grad_norm": 0.7552701830863953, + "learning_rate": 3.763636363636364e-05, + "loss": 0.0651, + "step": 8040 + }, + { + "epoch": 8.131313131313131, + "grad_norm": 0.08890596777200699, + "learning_rate": 3.743434343434344e-05, + "loss": 0.0197, + "step": 8050 + }, + { + "epoch": 8.141414141414142, + "grad_norm": 6.5320634841918945, + "learning_rate": 3.723232323232324e-05, + "loss": 0.0199, + "step": 8060 + }, + { + "epoch": 8.151515151515152, + "grad_norm": 0.024610524997115135, + "learning_rate": 3.703030303030303e-05, + "loss": 0.0322, + "step": 8070 + }, + { + "epoch": 8.16161616161616, + "grad_norm": 0.04751124233007431, + "learning_rate": 3.682828282828283e-05, + "loss": 0.0231, + "step": 8080 + }, + { + "epoch": 8.171717171717171, + "grad_norm": 0.05312546342611313, + "learning_rate": 3.6626262626262634e-05, + "loss": 0.0344, + "step": 8090 + }, + { + "epoch": 8.181818181818182, + "grad_norm": 4.530752182006836, + "learning_rate": 3.6424242424242426e-05, + "loss": 0.0703, + "step": 8100 + }, + { + "epoch": 8.181818181818182, + "eval_accuracy": 0.9584675975653419, + "eval_loss": 0.15765109658241272, + "eval_runtime": 26.1717, + "eval_samples_per_second": 106.718, + "eval_steps_per_second": 13.373, + "step": 8100 + }, + { + "epoch": 8.191919191919192, + "grad_norm": 0.04425670579075813, + "learning_rate": 3.6222222222222225e-05, + "loss": 0.0406, + "step": 8110 + }, + { + "epoch": 8.202020202020202, + "grad_norm": 4.567905426025391, + "learning_rate": 3.6020202020202024e-05, + "loss": 0.0941, + "step": 8120 + }, + { + "epoch": 8.212121212121213, + "grad_norm": 4.495558738708496, + "learning_rate": 3.5818181818181816e-05, + "loss": 0.0667, + "step": 8130 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.028609730303287506, + "learning_rate": 3.561616161616162e-05, + "loss": 0.0721, + "step": 8140 + }, + { + "epoch": 8.232323232323232, + "grad_norm": 1.9185714721679688, + "learning_rate": 3.5414141414141414e-05, + "loss": 0.0724, + "step": 8150 + }, + { + "epoch": 8.242424242424242, + "grad_norm": 0.10920464247465134, + "learning_rate": 3.521212121212121e-05, + "loss": 0.0516, + "step": 8160 + }, + { + "epoch": 8.252525252525253, + "grad_norm": 0.18538707494735718, + "learning_rate": 3.501010101010101e-05, + "loss": 0.0295, + "step": 8170 + }, + { + "epoch": 8.262626262626263, + "grad_norm": 0.3900783956050873, + "learning_rate": 3.480808080808081e-05, + "loss": 0.0083, + "step": 8180 + }, + { + "epoch": 8.272727272727273, + "grad_norm": 0.5472029447555542, + "learning_rate": 3.460606060606061e-05, + "loss": 0.0155, + "step": 8190 + }, + { + "epoch": 8.282828282828282, + "grad_norm": 0.09228463470935822, + "learning_rate": 3.44040404040404e-05, + "loss": 0.025, + "step": 8200 + }, + { + "epoch": 8.282828282828282, + "eval_accuracy": 0.9484425349087003, + "eval_loss": 0.1943020224571228, + "eval_runtime": 26.1968, + "eval_samples_per_second": 106.616, + "eval_steps_per_second": 13.36, + "step": 8200 + }, + { + "epoch": 8.292929292929292, + "grad_norm": 0.02402353845536709, + "learning_rate": 3.42020202020202e-05, + "loss": 0.0514, + "step": 8210 + }, + { + "epoch": 8.303030303030303, + "grad_norm": 0.276614785194397, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0588, + "step": 8220 + }, + { + "epoch": 8.313131313131313, + "grad_norm": 0.09456822276115417, + "learning_rate": 3.37979797979798e-05, + "loss": 0.0226, + "step": 8230 + }, + { + "epoch": 8.323232323232324, + "grad_norm": 0.024134701117873192, + "learning_rate": 3.35959595959596e-05, + "loss": 0.0225, + "step": 8240 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 0.12445070594549179, + "learning_rate": 3.3393939393939397e-05, + "loss": 0.0205, + "step": 8250 + }, + { + "epoch": 8.343434343434343, + "grad_norm": 0.006568551529198885, + "learning_rate": 3.319191919191919e-05, + "loss": 0.0033, + "step": 8260 + }, + { + "epoch": 8.353535353535353, + "grad_norm": 0.018136441707611084, + "learning_rate": 3.2989898989898995e-05, + "loss": 0.0327, + "step": 8270 + }, + { + "epoch": 8.363636363636363, + "grad_norm": 0.13373133540153503, + "learning_rate": 3.278787878787879e-05, + "loss": 0.0371, + "step": 8280 + }, + { + "epoch": 8.373737373737374, + "grad_norm": 2.1574575901031494, + "learning_rate": 3.2585858585858586e-05, + "loss": 0.0525, + "step": 8290 + }, + { + "epoch": 8.383838383838384, + "grad_norm": 0.14095142483711243, + "learning_rate": 3.2383838383838385e-05, + "loss": 0.0259, + "step": 8300 + }, + { + "epoch": 8.383838383838384, + "eval_accuracy": 0.9613319011815252, + "eval_loss": 0.15904253721237183, + "eval_runtime": 26.3335, + "eval_samples_per_second": 106.063, + "eval_steps_per_second": 13.291, + "step": 8300 + }, + { + "epoch": 8.393939393939394, + "grad_norm": 0.018682468682527542, + "learning_rate": 3.2181818181818184e-05, + "loss": 0.0803, + "step": 8310 + }, + { + "epoch": 8.404040404040405, + "grad_norm": 4.237796783447266, + "learning_rate": 3.197979797979798e-05, + "loss": 0.0185, + "step": 8320 + }, + { + "epoch": 8.414141414141413, + "grad_norm": 0.029690474271774292, + "learning_rate": 3.177777777777778e-05, + "loss": 0.0673, + "step": 8330 + }, + { + "epoch": 8.424242424242424, + "grad_norm": 3.0206427574157715, + "learning_rate": 3.1575757575757574e-05, + "loss": 0.0381, + "step": 8340 + }, + { + "epoch": 8.434343434343434, + "grad_norm": 0.01455895509570837, + "learning_rate": 3.137373737373738e-05, + "loss": 0.0247, + "step": 8350 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 0.018579356372356415, + "learning_rate": 3.117171717171717e-05, + "loss": 0.0403, + "step": 8360 + }, + { + "epoch": 8.454545454545455, + "grad_norm": 5.461668014526367, + "learning_rate": 3.096969696969697e-05, + "loss": 0.0659, + "step": 8370 + }, + { + "epoch": 8.464646464646465, + "grad_norm": 1.4274054765701294, + "learning_rate": 3.076767676767677e-05, + "loss": 0.0161, + "step": 8380 + }, + { + "epoch": 8.474747474747474, + "grad_norm": 0.012056349776685238, + "learning_rate": 3.056565656565657e-05, + "loss": 0.0471, + "step": 8390 + }, + { + "epoch": 8.484848484848484, + "grad_norm": 0.0278280358761549, + "learning_rate": 3.0363636363636367e-05, + "loss": 0.0049, + "step": 8400 + }, + { + "epoch": 8.484848484848484, + "eval_accuracy": 0.958109559613319, + "eval_loss": 0.15213708579540253, + "eval_runtime": 26.1963, + "eval_samples_per_second": 106.618, + "eval_steps_per_second": 13.361, + "step": 8400 + }, + { + "epoch": 8.494949494949495, + "grad_norm": 0.025999628007411957, + "learning_rate": 3.0161616161616163e-05, + "loss": 0.0173, + "step": 8410 + }, + { + "epoch": 8.505050505050505, + "grad_norm": 1.6824313402175903, + "learning_rate": 2.995959595959596e-05, + "loss": 0.039, + "step": 8420 + }, + { + "epoch": 8.515151515151516, + "grad_norm": 0.3492492437362671, + "learning_rate": 2.9757575757575757e-05, + "loss": 0.0361, + "step": 8430 + }, + { + "epoch": 8.525252525252526, + "grad_norm": 1.7269563674926758, + "learning_rate": 2.955555555555556e-05, + "loss": 0.0283, + "step": 8440 + }, + { + "epoch": 8.535353535353535, + "grad_norm": 4.346229553222656, + "learning_rate": 2.9353535353535355e-05, + "loss": 0.0795, + "step": 8450 + }, + { + "epoch": 8.545454545454545, + "grad_norm": 3.4249932765960693, + "learning_rate": 2.915151515151515e-05, + "loss": 0.07, + "step": 8460 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 0.16380125284194946, + "learning_rate": 2.894949494949495e-05, + "loss": 0.0338, + "step": 8470 + }, + { + "epoch": 8.565656565656566, + "grad_norm": 0.03398223966360092, + "learning_rate": 2.8747474747474752e-05, + "loss": 0.0512, + "step": 8480 + }, + { + "epoch": 8.575757575757576, + "grad_norm": 2.9662978649139404, + "learning_rate": 2.8545454545454548e-05, + "loss": 0.0719, + "step": 8490 + }, + { + "epoch": 8.585858585858587, + "grad_norm": 0.06849607825279236, + "learning_rate": 2.8343434343434343e-05, + "loss": 0.0174, + "step": 8500 + }, + { + "epoch": 8.585858585858587, + "eval_accuracy": 0.9598997493734336, + "eval_loss": 0.15224336087703705, + "eval_runtime": 25.9168, + "eval_samples_per_second": 107.768, + "eval_steps_per_second": 13.505, + "step": 8500 + }, + { + "epoch": 8.595959595959595, + "grad_norm": 3.5387423038482666, + "learning_rate": 2.8141414141414142e-05, + "loss": 0.0265, + "step": 8510 + }, + { + "epoch": 8.606060606060606, + "grad_norm": 0.2299012988805771, + "learning_rate": 2.7939393939393945e-05, + "loss": 0.0325, + "step": 8520 + }, + { + "epoch": 8.616161616161616, + "grad_norm": 0.1686079502105713, + "learning_rate": 2.773737373737374e-05, + "loss": 0.0665, + "step": 8530 + }, + { + "epoch": 8.626262626262626, + "grad_norm": 0.01624220982193947, + "learning_rate": 2.7535353535353536e-05, + "loss": 0.0316, + "step": 8540 + }, + { + "epoch": 8.636363636363637, + "grad_norm": 5.383854389190674, + "learning_rate": 2.733333333333333e-05, + "loss": 0.0568, + "step": 8550 + }, + { + "epoch": 8.646464646464647, + "grad_norm": 0.014906655997037888, + "learning_rate": 2.7131313131313134e-05, + "loss": 0.0201, + "step": 8560 + }, + { + "epoch": 8.656565656565657, + "grad_norm": 0.8503532409667969, + "learning_rate": 2.6929292929292933e-05, + "loss": 0.0269, + "step": 8570 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.006653611082583666, + "learning_rate": 2.6727272727272728e-05, + "loss": 0.0446, + "step": 8580 + }, + { + "epoch": 8.676767676767676, + "grad_norm": 8.002989768981934, + "learning_rate": 2.6525252525252524e-05, + "loss": 0.0535, + "step": 8590 + }, + { + "epoch": 8.686868686868687, + "grad_norm": 0.20638875663280487, + "learning_rate": 2.6323232323232323e-05, + "loss": 0.0194, + "step": 8600 + }, + { + "epoch": 8.686868686868687, + "eval_accuracy": 0.9606158252774795, + "eval_loss": 0.14563946425914764, + "eval_runtime": 25.8959, + "eval_samples_per_second": 107.855, + "eval_steps_per_second": 13.516, + "step": 8600 + }, + { + "epoch": 8.696969696969697, + "grad_norm": 0.33214274048805237, + "learning_rate": 2.6121212121212125e-05, + "loss": 0.0296, + "step": 8610 + }, + { + "epoch": 8.707070707070708, + "grad_norm": 0.021809542551636696, + "learning_rate": 2.591919191919192e-05, + "loss": 0.0328, + "step": 8620 + }, + { + "epoch": 8.717171717171716, + "grad_norm": 0.011034449562430382, + "learning_rate": 2.5717171717171716e-05, + "loss": 0.0172, + "step": 8630 + }, + { + "epoch": 8.727272727272727, + "grad_norm": 0.17327769100666046, + "learning_rate": 2.5515151515151515e-05, + "loss": 0.0748, + "step": 8640 + }, + { + "epoch": 8.737373737373737, + "grad_norm": 3.269444227218628, + "learning_rate": 2.5313131313131318e-05, + "loss": 0.1305, + "step": 8650 + }, + { + "epoch": 8.747474747474747, + "grad_norm": 0.11276157200336456, + "learning_rate": 2.5111111111111113e-05, + "loss": 0.0737, + "step": 8660 + }, + { + "epoch": 8.757575757575758, + "grad_norm": 4.957470417022705, + "learning_rate": 2.490909090909091e-05, + "loss": 0.0257, + "step": 8670 + }, + { + "epoch": 8.767676767676768, + "grad_norm": 0.06070602312684059, + "learning_rate": 2.4707070707070708e-05, + "loss": 0.0348, + "step": 8680 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 2.432539463043213, + "learning_rate": 2.4505050505050507e-05, + "loss": 0.0426, + "step": 8690 + }, + { + "epoch": 8.787878787878787, + "grad_norm": 0.7551902532577515, + "learning_rate": 2.4303030303030306e-05, + "loss": 0.0315, + "step": 8700 + }, + { + "epoch": 8.787878787878787, + "eval_accuracy": 0.9598997493734336, + "eval_loss": 0.1411319375038147, + "eval_runtime": 26.1071, + "eval_samples_per_second": 106.982, + "eval_steps_per_second": 13.406, + "step": 8700 + }, + { + "epoch": 8.797979797979798, + "grad_norm": 3.6202332973480225, + "learning_rate": 2.41010101010101e-05, + "loss": 0.0255, + "step": 8710 + }, + { + "epoch": 8.808080808080808, + "grad_norm": 1.979330062866211, + "learning_rate": 2.38989898989899e-05, + "loss": 0.0668, + "step": 8720 + }, + { + "epoch": 8.818181818181818, + "grad_norm": 0.09393583983182907, + "learning_rate": 2.36969696969697e-05, + "loss": 0.0424, + "step": 8730 + }, + { + "epoch": 8.828282828282829, + "grad_norm": 0.17870110273361206, + "learning_rate": 2.3494949494949495e-05, + "loss": 0.0429, + "step": 8740 + }, + { + "epoch": 8.83838383838384, + "grad_norm": 0.01010535005480051, + "learning_rate": 2.3292929292929294e-05, + "loss": 0.0289, + "step": 8750 + }, + { + "epoch": 8.848484848484848, + "grad_norm": 0.03681017830967903, + "learning_rate": 2.309090909090909e-05, + "loss": 0.0093, + "step": 8760 + }, + { + "epoch": 8.858585858585858, + "grad_norm": 0.03761085867881775, + "learning_rate": 2.288888888888889e-05, + "loss": 0.0276, + "step": 8770 + }, + { + "epoch": 8.868686868686869, + "grad_norm": 0.04552626237273216, + "learning_rate": 2.2686868686868687e-05, + "loss": 0.035, + "step": 8780 + }, + { + "epoch": 8.878787878787879, + "grad_norm": 15.462836265563965, + "learning_rate": 2.2484848484848486e-05, + "loss": 0.0363, + "step": 8790 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.05760538578033447, + "learning_rate": 2.228282828282828e-05, + "loss": 0.0419, + "step": 8800 + }, + { + "epoch": 8.88888888888889, + "eval_accuracy": 0.9591836734693877, + "eval_loss": 0.14262323081493378, + "eval_runtime": 26.5686, + "eval_samples_per_second": 105.124, + "eval_steps_per_second": 13.173, + "step": 8800 + }, + { + "epoch": 8.8989898989899, + "grad_norm": 0.013880529440939426, + "learning_rate": 2.2080808080808084e-05, + "loss": 0.0038, + "step": 8810 + }, + { + "epoch": 8.909090909090908, + "grad_norm": 0.5507473945617676, + "learning_rate": 2.187878787878788e-05, + "loss": 0.0298, + "step": 8820 + }, + { + "epoch": 8.919191919191919, + "grad_norm": 0.28017669916152954, + "learning_rate": 2.167676767676768e-05, + "loss": 0.0039, + "step": 8830 + }, + { + "epoch": 8.929292929292929, + "grad_norm": 0.017856653779745102, + "learning_rate": 2.1474747474747474e-05, + "loss": 0.0036, + "step": 8840 + }, + { + "epoch": 8.93939393939394, + "grad_norm": 0.6294344663619995, + "learning_rate": 2.1272727272727276e-05, + "loss": 0.042, + "step": 8850 + }, + { + "epoch": 8.94949494949495, + "grad_norm": 2.846271514892578, + "learning_rate": 2.1070707070707072e-05, + "loss": 0.0915, + "step": 8860 + }, + { + "epoch": 8.95959595959596, + "grad_norm": 1.6820452213287354, + "learning_rate": 2.086868686868687e-05, + "loss": 0.0441, + "step": 8870 + }, + { + "epoch": 8.969696969696969, + "grad_norm": 2.357525110244751, + "learning_rate": 2.0666666666666666e-05, + "loss": 0.044, + "step": 8880 + }, + { + "epoch": 8.97979797979798, + "grad_norm": 0.058374661952257156, + "learning_rate": 2.0464646464646465e-05, + "loss": 0.0394, + "step": 8890 + }, + { + "epoch": 8.98989898989899, + "grad_norm": 4.777621746063232, + "learning_rate": 2.0262626262626264e-05, + "loss": 0.0193, + "step": 8900 + }, + { + "epoch": 8.98989898989899, + "eval_accuracy": 0.9641962047977085, + "eval_loss": 0.13753151893615723, + "eval_runtime": 26.4062, + "eval_samples_per_second": 105.771, + "eval_steps_per_second": 13.254, + "step": 8900 + }, + { + "epoch": 9.0, + "grad_norm": NaN, + "learning_rate": 2.008080808080808e-05, + "loss": 0.002, + "step": 8910 + }, + { + "epoch": 9.01010101010101, + "grad_norm": 0.038479045033454895, + "learning_rate": 1.987878787878788e-05, + "loss": 0.0031, + "step": 8920 + }, + { + "epoch": 9.02020202020202, + "grad_norm": 0.06207535043358803, + "learning_rate": 1.9676767676767677e-05, + "loss": 0.0023, + "step": 8930 + }, + { + "epoch": 9.030303030303031, + "grad_norm": 5.280101776123047, + "learning_rate": 1.9474747474747476e-05, + "loss": 0.0608, + "step": 8940 + }, + { + "epoch": 9.04040404040404, + "grad_norm": 8.902334213256836, + "learning_rate": 1.9272727272727272e-05, + "loss": 0.0147, + "step": 8950 + }, + { + "epoch": 9.05050505050505, + "grad_norm": 0.884495735168457, + "learning_rate": 1.907070707070707e-05, + "loss": 0.0188, + "step": 8960 + }, + { + "epoch": 9.06060606060606, + "grad_norm": 0.07071898877620697, + "learning_rate": 1.886868686868687e-05, + "loss": 0.0149, + "step": 8970 + }, + { + "epoch": 9.070707070707071, + "grad_norm": 0.12493721395730972, + "learning_rate": 1.866666666666667e-05, + "loss": 0.006, + "step": 8980 + }, + { + "epoch": 9.080808080808081, + "grad_norm": 0.29426121711730957, + "learning_rate": 1.8464646464646464e-05, + "loss": 0.0794, + "step": 8990 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 0.01998014561831951, + "learning_rate": 1.8262626262626263e-05, + "loss": 0.0027, + "step": 9000 + }, + { + "epoch": 9.090909090909092, + "eval_accuracy": 0.9634801288936627, + "eval_loss": 0.13790033757686615, + "eval_runtime": 26.5693, + "eval_samples_per_second": 105.121, + "eval_steps_per_second": 13.173, + "step": 9000 + }, + { + "epoch": 9.1010101010101, + "grad_norm": 0.024814104661345482, + "learning_rate": 1.8060606060606062e-05, + "loss": 0.03, + "step": 9010 + }, + { + "epoch": 9.11111111111111, + "grad_norm": 0.0391666404902935, + "learning_rate": 1.785858585858586e-05, + "loss": 0.0366, + "step": 9020 + }, + { + "epoch": 9.121212121212121, + "grad_norm": 2.8100287914276123, + "learning_rate": 1.7656565656565657e-05, + "loss": 0.0138, + "step": 9030 + }, + { + "epoch": 9.131313131313131, + "grad_norm": 5.491546154022217, + "learning_rate": 1.7454545454545456e-05, + "loss": 0.0398, + "step": 9040 + }, + { + "epoch": 9.141414141414142, + "grad_norm": 0.3648895025253296, + "learning_rate": 1.7252525252525255e-05, + "loss": 0.018, + "step": 9050 + }, + { + "epoch": 9.151515151515152, + "grad_norm": 0.007288265973329544, + "learning_rate": 1.705050505050505e-05, + "loss": 0.0507, + "step": 9060 + }, + { + "epoch": 9.16161616161616, + "grad_norm": 4.20407247543335, + "learning_rate": 1.684848484848485e-05, + "loss": 0.0398, + "step": 9070 + }, + { + "epoch": 9.171717171717171, + "grad_norm": 0.0796607956290245, + "learning_rate": 1.6646464646464645e-05, + "loss": 0.0536, + "step": 9080 + }, + { + "epoch": 9.181818181818182, + "grad_norm": 0.1267855316400528, + "learning_rate": 1.6444444444444447e-05, + "loss": 0.0298, + "step": 9090 + }, + { + "epoch": 9.191919191919192, + "grad_norm": 0.4628753364086151, + "learning_rate": 1.6242424242424243e-05, + "loss": 0.0345, + "step": 9100 + }, + { + "epoch": 9.191919191919192, + "eval_accuracy": 0.9631220909416398, + "eval_loss": 0.14435383677482605, + "eval_runtime": 26.2973, + "eval_samples_per_second": 106.209, + "eval_steps_per_second": 13.309, + "step": 9100 + }, + { + "epoch": 9.202020202020202, + "grad_norm": 0.060050446540117264, + "learning_rate": 1.604040404040404e-05, + "loss": 0.0057, + "step": 9110 + }, + { + "epoch": 9.212121212121213, + "grad_norm": 0.09056686609983444, + "learning_rate": 1.5838383838383837e-05, + "loss": 0.0191, + "step": 9120 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 0.06281417608261108, + "learning_rate": 1.563636363636364e-05, + "loss": 0.0117, + "step": 9130 + }, + { + "epoch": 9.232323232323232, + "grad_norm": 0.08288536220788956, + "learning_rate": 1.5434343434343435e-05, + "loss": 0.0376, + "step": 9140 + }, + { + "epoch": 9.242424242424242, + "grad_norm": 0.034553345292806625, + "learning_rate": 1.5232323232323234e-05, + "loss": 0.0217, + "step": 9150 + }, + { + "epoch": 9.252525252525253, + "grad_norm": 3.04947829246521, + "learning_rate": 1.5030303030303031e-05, + "loss": 0.0129, + "step": 9160 + }, + { + "epoch": 9.262626262626263, + "grad_norm": 0.06548389047384262, + "learning_rate": 1.482828282828283e-05, + "loss": 0.0484, + "step": 9170 + }, + { + "epoch": 9.272727272727273, + "grad_norm": 0.06305460631847382, + "learning_rate": 1.4626262626262627e-05, + "loss": 0.0013, + "step": 9180 + }, + { + "epoch": 9.282828282828282, + "grad_norm": 0.004533341620117426, + "learning_rate": 1.4424242424242426e-05, + "loss": 0.0163, + "step": 9190 + }, + { + "epoch": 9.292929292929292, + "grad_norm": 0.19153164327144623, + "learning_rate": 1.4222222222222224e-05, + "loss": 0.0291, + "step": 9200 + }, + { + "epoch": 9.292929292929292, + "eval_accuracy": 0.9624060150375939, + "eval_loss": 0.14921478927135468, + "eval_runtime": 26.5425, + "eval_samples_per_second": 105.228, + "eval_steps_per_second": 13.186, + "step": 9200 + }, + { + "epoch": 9.303030303030303, + "grad_norm": 0.1258583813905716, + "learning_rate": 1.402020202020202e-05, + "loss": 0.0169, + "step": 9210 + }, + { + "epoch": 9.313131313131313, + "grad_norm": 5.8700361251831055, + "learning_rate": 1.3818181818181818e-05, + "loss": 0.0391, + "step": 9220 + }, + { + "epoch": 9.323232323232324, + "grad_norm": 0.17452913522720337, + "learning_rate": 1.3616161616161615e-05, + "loss": 0.0058, + "step": 9230 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 0.41943687200546265, + "learning_rate": 1.3414141414141414e-05, + "loss": 0.0564, + "step": 9240 + }, + { + "epoch": 9.343434343434343, + "grad_norm": 0.009540412575006485, + "learning_rate": 1.3212121212121212e-05, + "loss": 0.0021, + "step": 9250 + }, + { + "epoch": 9.353535353535353, + "grad_norm": 0.007924321107566357, + "learning_rate": 1.301010101010101e-05, + "loss": 0.0141, + "step": 9260 + }, + { + "epoch": 9.363636363636363, + "grad_norm": 0.0064546167850494385, + "learning_rate": 1.2808080808080808e-05, + "loss": 0.0138, + "step": 9270 + }, + { + "epoch": 9.373737373737374, + "grad_norm": 0.012489011511206627, + "learning_rate": 1.2606060606060607e-05, + "loss": 0.0034, + "step": 9280 + }, + { + "epoch": 9.383838383838384, + "grad_norm": 0.21840006113052368, + "learning_rate": 1.2404040404040404e-05, + "loss": 0.0035, + "step": 9290 + }, + { + "epoch": 9.393939393939394, + "grad_norm": 0.027711758390069008, + "learning_rate": 1.2202020202020201e-05, + "loss": 0.017, + "step": 9300 + }, + { + "epoch": 9.393939393939394, + "eval_accuracy": 0.9634801288936627, + "eval_loss": 0.1465526521205902, + "eval_runtime": 25.9586, + "eval_samples_per_second": 107.594, + "eval_steps_per_second": 13.483, + "step": 9300 + }, + { + "epoch": 9.404040404040405, + "grad_norm": 0.004929441958665848, + "learning_rate": 1.2e-05, + "loss": 0.0167, + "step": 9310 + }, + { + "epoch": 9.414141414141413, + "grad_norm": 0.013397878967225552, + "learning_rate": 1.1797979797979798e-05, + "loss": 0.0537, + "step": 9320 + }, + { + "epoch": 9.424242424242424, + "grad_norm": 10.173965454101562, + "learning_rate": 1.1595959595959597e-05, + "loss": 0.0854, + "step": 9330 + }, + { + "epoch": 9.434343434343434, + "grad_norm": 0.12551280856132507, + "learning_rate": 1.1393939393939394e-05, + "loss": 0.0048, + "step": 9340 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.21840262413024902, + "learning_rate": 1.1191919191919193e-05, + "loss": 0.0341, + "step": 9350 + }, + { + "epoch": 9.454545454545455, + "grad_norm": 0.01529910322278738, + "learning_rate": 1.098989898989899e-05, + "loss": 0.0544, + "step": 9360 + }, + { + "epoch": 9.464646464646465, + "grad_norm": 0.015066384337842464, + "learning_rate": 1.0787878787878789e-05, + "loss": 0.0196, + "step": 9370 + }, + { + "epoch": 9.474747474747474, + "grad_norm": 2.9213244915008545, + "learning_rate": 1.0585858585858586e-05, + "loss": 0.0645, + "step": 9380 + }, + { + "epoch": 9.484848484848484, + "grad_norm": 0.03714268282055855, + "learning_rate": 1.0383838383838385e-05, + "loss": 0.0572, + "step": 9390 + }, + { + "epoch": 9.494949494949495, + "grad_norm": 0.004297337029129267, + "learning_rate": 1.0181818181818182e-05, + "loss": 0.0269, + "step": 9400 + }, + { + "epoch": 9.494949494949495, + "eval_accuracy": 0.9631220909416398, + "eval_loss": 0.15231722593307495, + "eval_runtime": 26.6893, + "eval_samples_per_second": 104.648, + "eval_steps_per_second": 13.114, + "step": 9400 + }, + { + "epoch": 9.505050505050505, + "grad_norm": 0.20185399055480957, + "learning_rate": 9.979797979797981e-06, + "loss": 0.0069, + "step": 9410 + }, + { + "epoch": 9.515151515151516, + "grad_norm": 0.007642143405973911, + "learning_rate": 9.777777777777779e-06, + "loss": 0.0028, + "step": 9420 + }, + { + "epoch": 9.525252525252526, + "grad_norm": 0.012005124241113663, + "learning_rate": 9.575757575757578e-06, + "loss": 0.0055, + "step": 9430 + }, + { + "epoch": 9.535353535353535, + "grad_norm": 0.0104445805773139, + "learning_rate": 9.373737373737375e-06, + "loss": 0.0222, + "step": 9440 + }, + { + "epoch": 9.545454545454545, + "grad_norm": 0.004772584419697523, + "learning_rate": 9.171717171717172e-06, + "loss": 0.0056, + "step": 9450 + }, + { + "epoch": 9.555555555555555, + "grad_norm": 0.038412462919950485, + "learning_rate": 8.96969696969697e-06, + "loss": 0.0583, + "step": 9460 + }, + { + "epoch": 9.565656565656566, + "grad_norm": 0.007618672680109739, + "learning_rate": 8.767676767676768e-06, + "loss": 0.0903, + "step": 9470 + }, + { + "epoch": 9.575757575757576, + "grad_norm": 0.3378522992134094, + "learning_rate": 8.565656565656566e-06, + "loss": 0.0244, + "step": 9480 + }, + { + "epoch": 9.585858585858587, + "grad_norm": 0.04890254884958267, + "learning_rate": 8.363636363636365e-06, + "loss": 0.0714, + "step": 9490 + }, + { + "epoch": 9.595959595959595, + "grad_norm": 0.11964570730924606, + "learning_rate": 8.161616161616162e-06, + "loss": 0.003, + "step": 9500 + }, + { + "epoch": 9.595959595959595, + "eval_accuracy": 0.9627640529896169, + "eval_loss": 0.14447690546512604, + "eval_runtime": 26.1341, + "eval_samples_per_second": 106.872, + "eval_steps_per_second": 13.392, + "step": 9500 + }, + { + "epoch": 9.606060606060606, + "grad_norm": 0.10158071666955948, + "learning_rate": 7.959595959595959e-06, + "loss": 0.0246, + "step": 9510 + }, + { + "epoch": 9.616161616161616, + "grad_norm": 1.4588532447814941, + "learning_rate": 7.757575757575758e-06, + "loss": 0.0678, + "step": 9520 + }, + { + "epoch": 9.626262626262626, + "grad_norm": 0.060935135930776596, + "learning_rate": 7.555555555555556e-06, + "loss": 0.0119, + "step": 9530 + }, + { + "epoch": 9.636363636363637, + "grad_norm": 0.06521911174058914, + "learning_rate": 7.353535353535354e-06, + "loss": 0.0415, + "step": 9540 + }, + { + "epoch": 9.646464646464647, + "grad_norm": 0.027651334181427956, + "learning_rate": 7.151515151515152e-06, + "loss": 0.0414, + "step": 9550 + }, + { + "epoch": 9.656565656565657, + "grad_norm": 5.914989471435547, + "learning_rate": 6.9494949494949505e-06, + "loss": 0.0293, + "step": 9560 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 0.0250714048743248, + "learning_rate": 6.747474747474749e-06, + "loss": 0.0225, + "step": 9570 + }, + { + "epoch": 9.676767676767676, + "grad_norm": 0.01405611913651228, + "learning_rate": 6.545454545454547e-06, + "loss": 0.0235, + "step": 9580 + }, + { + "epoch": 9.686868686868687, + "grad_norm": 0.03979960083961487, + "learning_rate": 6.343434343434344e-06, + "loss": 0.0022, + "step": 9590 + }, + { + "epoch": 9.696969696969697, + "grad_norm": 0.34945064783096313, + "learning_rate": 6.141414141414142e-06, + "loss": 0.0471, + "step": 9600 + }, + { + "epoch": 9.696969696969697, + "eval_accuracy": 0.9616899391335482, + "eval_loss": 0.14536738395690918, + "eval_runtime": 26.8273, + "eval_samples_per_second": 104.11, + "eval_steps_per_second": 13.046, + "step": 9600 + }, + { + "epoch": 9.707070707070708, + "grad_norm": 0.05112791806459427, + "learning_rate": 5.93939393939394e-06, + "loss": 0.0024, + "step": 9610 + }, + { + "epoch": 9.717171717171716, + "grad_norm": 0.00436112005263567, + "learning_rate": 5.7373737373737374e-06, + "loss": 0.004, + "step": 9620 + }, + { + "epoch": 9.727272727272727, + "grad_norm": 0.041163258254528046, + "learning_rate": 5.5353535353535355e-06, + "loss": 0.0012, + "step": 9630 + }, + { + "epoch": 9.737373737373737, + "grad_norm": 0.08394615352153778, + "learning_rate": 5.333333333333334e-06, + "loss": 0.0622, + "step": 9640 + }, + { + "epoch": 9.747474747474747, + "grad_norm": 0.008766507729887962, + "learning_rate": 5.131313131313131e-06, + "loss": 0.0546, + "step": 9650 + }, + { + "epoch": 9.757575757575758, + "grad_norm": 5.026554107666016, + "learning_rate": 4.929292929292929e-06, + "loss": 0.0243, + "step": 9660 + }, + { + "epoch": 9.767676767676768, + "grad_norm": 0.011669186875224113, + "learning_rate": 4.727272727272727e-06, + "loss": 0.0325, + "step": 9670 + }, + { + "epoch": 9.777777777777779, + "grad_norm": 0.15583041310310364, + "learning_rate": 4.525252525252525e-06, + "loss": 0.0166, + "step": 9680 + }, + { + "epoch": 9.787878787878787, + "grad_norm": 0.19336989521980286, + "learning_rate": 4.323232323232323e-06, + "loss": 0.0084, + "step": 9690 + }, + { + "epoch": 9.797979797979798, + "grad_norm": 0.23474974930286407, + "learning_rate": 4.1212121212121215e-06, + "loss": 0.0356, + "step": 9700 + }, + { + "epoch": 9.797979797979798, + "eval_accuracy": 0.9620479770855711, + "eval_loss": 0.14517508447170258, + "eval_runtime": 25.941, + "eval_samples_per_second": 107.667, + "eval_steps_per_second": 13.492, + "step": 9700 + }, + { + "epoch": 9.808080808080808, + "grad_norm": 0.3716013729572296, + "learning_rate": 3.9191919191919196e-06, + "loss": 0.0885, + "step": 9710 + }, + { + "epoch": 9.818181818181818, + "grad_norm": 4.4450297355651855, + "learning_rate": 3.7171717171717177e-06, + "loss": 0.0399, + "step": 9720 + }, + { + "epoch": 9.828282828282829, + "grad_norm": 0.03690167888998985, + "learning_rate": 3.515151515151515e-06, + "loss": 0.06, + "step": 9730 + }, + { + "epoch": 9.83838383838384, + "grad_norm": 0.028440790250897408, + "learning_rate": 3.313131313131313e-06, + "loss": 0.0362, + "step": 9740 + }, + { + "epoch": 9.848484848484848, + "grad_norm": 0.021634520962834358, + "learning_rate": 3.111111111111111e-06, + "loss": 0.0032, + "step": 9750 + }, + { + "epoch": 9.858585858585858, + "grad_norm": 5.481240272521973, + "learning_rate": 2.9090909090909093e-06, + "loss": 0.0273, + "step": 9760 + }, + { + "epoch": 9.868686868686869, + "grad_norm": 0.009463118389248848, + "learning_rate": 2.7070707070707074e-06, + "loss": 0.0408, + "step": 9770 + }, + { + "epoch": 9.878787878787879, + "grad_norm": 0.008368213661015034, + "learning_rate": 2.5050505050505055e-06, + "loss": 0.0294, + "step": 9780 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 0.005486648064106703, + "learning_rate": 2.303030303030303e-06, + "loss": 0.0016, + "step": 9790 + }, + { + "epoch": 9.8989898989899, + "grad_norm": 0.5021072626113892, + "learning_rate": 2.1010101010101013e-06, + "loss": 0.0034, + "step": 9800 + }, + { + "epoch": 9.8989898989899, + "eval_accuracy": 0.9624060150375939, + "eval_loss": 0.14454838633537292, + "eval_runtime": 26.5093, + "eval_samples_per_second": 105.359, + "eval_steps_per_second": 13.203, + "step": 9800 + }, + { + "epoch": 9.909090909090908, + "grad_norm": 0.03990056738257408, + "learning_rate": 1.8989898989898992e-06, + "loss": 0.025, + "step": 9810 + }, + { + "epoch": 9.919191919191919, + "grad_norm": 0.003882176708430052, + "learning_rate": 1.6969696969696973e-06, + "loss": 0.0707, + "step": 9820 + }, + { + "epoch": 9.929292929292929, + "grad_norm": 0.08585841208696365, + "learning_rate": 1.4949494949494952e-06, + "loss": 0.0364, + "step": 9830 + }, + { + "epoch": 9.93939393939394, + "grad_norm": 1.9239050149917603, + "learning_rate": 1.292929292929293e-06, + "loss": 0.0313, + "step": 9840 + }, + { + "epoch": 9.94949494949495, + "grad_norm": 0.0036682253703475, + "learning_rate": 1.090909090909091e-06, + "loss": 0.0102, + "step": 9850 + }, + { + "epoch": 9.95959595959596, + "grad_norm": 0.014887700788676739, + "learning_rate": 8.88888888888889e-07, + "loss": 0.0558, + "step": 9860 + }, + { + "epoch": 9.969696969696969, + "grad_norm": 0.022091476246714592, + "learning_rate": 6.868686868686869e-07, + "loss": 0.032, + "step": 9870 + }, + { + "epoch": 9.97979797979798, + "grad_norm": 8.908573150634766, + "learning_rate": 4.848484848484849e-07, + "loss": 0.0626, + "step": 9880 + }, + { + "epoch": 9.98989898989899, + "grad_norm": 6.401342391967773, + "learning_rate": 2.8282828282828283e-07, + "loss": 0.0207, + "step": 9890 + }, + { + "epoch": 10.0, + "grad_norm": 0.0033838360104709864, + "learning_rate": 8.080808080808082e-08, + "loss": 0.0162, + "step": 9900 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.9627640529896169, + "eval_loss": 0.14507198333740234, + "eval_runtime": 26.3148, + "eval_samples_per_second": 106.138, + "eval_steps_per_second": 13.3, + "step": 9900 + }, + { + "epoch": 10.0, + "step": 9900, + "total_flos": 1.2263107356509184e+19, + "train_loss": 0.12599933518755316, + "train_runtime": 6423.2981, + "train_samples_per_second": 24.637, + "train_steps_per_second": 1.541 + } + ], + "logging_steps": 10, + "max_steps": 9900, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2263107356509184e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}