diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,48360 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 6889, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00029031789809841774, + "grad_norm": 12.98371410369873, + "learning_rate": 1.1614401858304298e-08, + "loss": 1.5049, + "step": 1 + }, + { + "epoch": 0.0005806357961968355, + "grad_norm": 15.011346817016602, + "learning_rate": 2.3228803716608597e-08, + "loss": 1.5849, + "step": 2 + }, + { + "epoch": 0.0008709536942952533, + "grad_norm": 11.487916946411133, + "learning_rate": 3.484320557491289e-08, + "loss": 1.3669, + "step": 3 + }, + { + "epoch": 0.001161271592393671, + "grad_norm": 10.780348777770996, + "learning_rate": 4.645760743321719e-08, + "loss": 1.4929, + "step": 4 + }, + { + "epoch": 0.0014515894904920889, + "grad_norm": 8.864033699035645, + "learning_rate": 5.807200929152149e-08, + "loss": 1.3715, + "step": 5 + }, + { + "epoch": 0.0017419073885905066, + "grad_norm": 11.861454010009766, + "learning_rate": 6.968641114982578e-08, + "loss": 1.3279, + "step": 6 + }, + { + "epoch": 0.0020322252866889243, + "grad_norm": 12.796159744262695, + "learning_rate": 8.130081300813009e-08, + "loss": 1.5399, + "step": 7 + }, + { + "epoch": 0.002322543184787342, + "grad_norm": 14.083832740783691, + "learning_rate": 9.291521486643439e-08, + "loss": 1.5013, + "step": 8 + }, + { + "epoch": 0.00261286108288576, + "grad_norm": 14.128660202026367, + "learning_rate": 1.045296167247387e-07, + "loss": 1.4663, + "step": 9 + }, + { + "epoch": 0.0029031789809841778, + "grad_norm": 13.425607681274414, + "learning_rate": 1.1614401858304298e-07, + "loss": 1.5617, + "step": 10 + }, + { + "epoch": 0.0031934968790825954, + "grad_norm": 13.83140754699707, + "learning_rate": 1.277584204413473e-07, + "loss": 1.5362, + "step": 11 + }, + { + "epoch": 0.003483814777181013, + "grad_norm": 13.65449047088623, + "learning_rate": 1.3937282229965157e-07, + "loss": 1.5805, + "step": 12 + }, + { + "epoch": 0.003774132675279431, + "grad_norm": 12.831888198852539, + "learning_rate": 1.509872241579559e-07, + "loss": 1.4625, + "step": 13 + }, + { + "epoch": 0.0040644505733778485, + "grad_norm": 11.353224754333496, + "learning_rate": 1.6260162601626018e-07, + "loss": 1.3862, + "step": 14 + }, + { + "epoch": 0.004354768471476266, + "grad_norm": 12.183592796325684, + "learning_rate": 1.7421602787456448e-07, + "loss": 1.4039, + "step": 15 + }, + { + "epoch": 0.004645086369574684, + "grad_norm": 10.883825302124023, + "learning_rate": 1.8583042973286877e-07, + "loss": 1.4896, + "step": 16 + }, + { + "epoch": 0.0049354042676731024, + "grad_norm": 11.417899131774902, + "learning_rate": 1.9744483159117307e-07, + "loss": 1.4953, + "step": 17 + }, + { + "epoch": 0.00522572216577152, + "grad_norm": 12.301745414733887, + "learning_rate": 2.090592334494774e-07, + "loss": 1.5397, + "step": 18 + }, + { + "epoch": 0.005516040063869938, + "grad_norm": 12.54859733581543, + "learning_rate": 2.2067363530778166e-07, + "loss": 1.4311, + "step": 19 + }, + { + "epoch": 0.0058063579619683555, + "grad_norm": 11.264630317687988, + "learning_rate": 2.3228803716608595e-07, + "loss": 1.4226, + "step": 20 + }, + { + "epoch": 0.006096675860066773, + "grad_norm": 9.510553359985352, + "learning_rate": 2.439024390243903e-07, + "loss": 1.2677, + "step": 21 + }, + { + "epoch": 
0.006386993758165191, + "grad_norm": 10.799087524414062, + "learning_rate": 2.555168408826946e-07, + "loss": 1.2587, + "step": 22 + }, + { + "epoch": 0.006677311656263609, + "grad_norm": 12.985727310180664, + "learning_rate": 2.6713124274099886e-07, + "loss": 1.5219, + "step": 23 + }, + { + "epoch": 0.006967629554362026, + "grad_norm": 11.036988258361816, + "learning_rate": 2.7874564459930313e-07, + "loss": 1.2812, + "step": 24 + }, + { + "epoch": 0.007257947452460444, + "grad_norm": 12.813385009765625, + "learning_rate": 2.9036004645760745e-07, + "loss": 1.4295, + "step": 25 + }, + { + "epoch": 0.007548265350558862, + "grad_norm": 11.850518226623535, + "learning_rate": 3.019744483159118e-07, + "loss": 1.5312, + "step": 26 + }, + { + "epoch": 0.00783858324865728, + "grad_norm": 10.156636238098145, + "learning_rate": 3.1358885017421604e-07, + "loss": 1.335, + "step": 27 + }, + { + "epoch": 0.008128901146755697, + "grad_norm": 13.199593544006348, + "learning_rate": 3.2520325203252037e-07, + "loss": 1.4746, + "step": 28 + }, + { + "epoch": 0.008419219044854116, + "grad_norm": 11.168906211853027, + "learning_rate": 3.3681765389082463e-07, + "loss": 1.4153, + "step": 29 + }, + { + "epoch": 0.008709536942952532, + "grad_norm": 10.479500770568848, + "learning_rate": 3.4843205574912896e-07, + "loss": 1.4663, + "step": 30 + }, + { + "epoch": 0.008999854841050951, + "grad_norm": 9.586933135986328, + "learning_rate": 3.600464576074333e-07, + "loss": 1.3906, + "step": 31 + }, + { + "epoch": 0.009290172739149368, + "grad_norm": 8.640244483947754, + "learning_rate": 3.7166085946573755e-07, + "loss": 1.2824, + "step": 32 + }, + { + "epoch": 0.009580490637247786, + "grad_norm": 9.352594375610352, + "learning_rate": 3.832752613240418e-07, + "loss": 1.5556, + "step": 33 + }, + { + "epoch": 0.009870808535346205, + "grad_norm": 9.151625633239746, + "learning_rate": 3.9488966318234614e-07, + "loss": 1.4723, + "step": 34 + }, + { + "epoch": 0.010161126433444622, + "grad_norm": 8.76069164276123, + "learning_rate": 4.0650406504065046e-07, + "loss": 1.3395, + "step": 35 + }, + { + "epoch": 0.01045144433154304, + "grad_norm": 8.062403678894043, + "learning_rate": 4.181184668989548e-07, + "loss": 1.3208, + "step": 36 + }, + { + "epoch": 0.010741762229641457, + "grad_norm": 7.3440117835998535, + "learning_rate": 4.2973286875725905e-07, + "loss": 1.3127, + "step": 37 + }, + { + "epoch": 0.011032080127739876, + "grad_norm": 8.806610107421875, + "learning_rate": 4.413472706155633e-07, + "loss": 1.4392, + "step": 38 + }, + { + "epoch": 0.011322398025838292, + "grad_norm": 7.228657245635986, + "learning_rate": 4.5296167247386764e-07, + "loss": 1.2803, + "step": 39 + }, + { + "epoch": 0.011612715923936711, + "grad_norm": 6.03056001663208, + "learning_rate": 4.645760743321719e-07, + "loss": 1.4196, + "step": 40 + }, + { + "epoch": 0.011903033822035128, + "grad_norm": 6.1218414306640625, + "learning_rate": 4.7619047619047623e-07, + "loss": 1.3285, + "step": 41 + }, + { + "epoch": 0.012193351720133546, + "grad_norm": 5.98799467086792, + "learning_rate": 4.878048780487805e-07, + "loss": 1.4567, + "step": 42 + }, + { + "epoch": 0.012483669618231963, + "grad_norm": 6.054631233215332, + "learning_rate": 4.994192799070848e-07, + "loss": 1.167, + "step": 43 + }, + { + "epoch": 0.012773987516330382, + "grad_norm": 6.573026657104492, + "learning_rate": 5.110336817653892e-07, + "loss": 1.3017, + "step": 44 + }, + { + "epoch": 0.0130643054144288, + "grad_norm": 6.461268424987793, + "learning_rate": 5.226480836236935e-07, + "loss": 
1.3128, + "step": 45 + }, + { + "epoch": 0.013354623312527217, + "grad_norm": 6.853832721710205, + "learning_rate": 5.342624854819977e-07, + "loss": 1.3945, + "step": 46 + }, + { + "epoch": 0.013644941210625636, + "grad_norm": 6.029784202575684, + "learning_rate": 5.45876887340302e-07, + "loss": 1.2451, + "step": 47 + }, + { + "epoch": 0.013935259108724053, + "grad_norm": 5.88099479675293, + "learning_rate": 5.574912891986063e-07, + "loss": 1.3445, + "step": 48 + }, + { + "epoch": 0.014225577006822471, + "grad_norm": 6.424746513366699, + "learning_rate": 5.691056910569106e-07, + "loss": 1.3422, + "step": 49 + }, + { + "epoch": 0.014515894904920888, + "grad_norm": 6.097443103790283, + "learning_rate": 5.807200929152149e-07, + "loss": 1.2858, + "step": 50 + }, + { + "epoch": 0.014806212803019306, + "grad_norm": 5.770637035369873, + "learning_rate": 5.923344947735192e-07, + "loss": 1.3279, + "step": 51 + }, + { + "epoch": 0.015096530701117723, + "grad_norm": 5.4142255783081055, + "learning_rate": 6.039488966318236e-07, + "loss": 1.0764, + "step": 52 + }, + { + "epoch": 0.015386848599216142, + "grad_norm": 5.637355327606201, + "learning_rate": 6.155632984901278e-07, + "loss": 1.3842, + "step": 53 + }, + { + "epoch": 0.01567716649731456, + "grad_norm": 5.330609321594238, + "learning_rate": 6.271777003484321e-07, + "loss": 1.378, + "step": 54 + }, + { + "epoch": 0.015967484395412977, + "grad_norm": 5.984035015106201, + "learning_rate": 6.387921022067365e-07, + "loss": 1.3356, + "step": 55 + }, + { + "epoch": 0.016257802293511394, + "grad_norm": 5.351655006408691, + "learning_rate": 6.504065040650407e-07, + "loss": 1.2484, + "step": 56 + }, + { + "epoch": 0.016548120191609814, + "grad_norm": 5.876321315765381, + "learning_rate": 6.62020905923345e-07, + "loss": 1.3715, + "step": 57 + }, + { + "epoch": 0.01683843808970823, + "grad_norm": 5.923037052154541, + "learning_rate": 6.736353077816493e-07, + "loss": 1.2894, + "step": 58 + }, + { + "epoch": 0.017128755987806648, + "grad_norm": 6.991525650024414, + "learning_rate": 6.852497096399536e-07, + "loss": 1.2768, + "step": 59 + }, + { + "epoch": 0.017419073885905065, + "grad_norm": 6.0120415687561035, + "learning_rate": 6.968641114982579e-07, + "loss": 1.3504, + "step": 60 + }, + { + "epoch": 0.017709391784003485, + "grad_norm": 5.738192081451416, + "learning_rate": 7.084785133565622e-07, + "loss": 1.2148, + "step": 61 + }, + { + "epoch": 0.017999709682101902, + "grad_norm": 5.9735565185546875, + "learning_rate": 7.200929152148666e-07, + "loss": 1.2813, + "step": 62 + }, + { + "epoch": 0.01829002758020032, + "grad_norm": 5.3724045753479, + "learning_rate": 7.317073170731707e-07, + "loss": 1.2319, + "step": 63 + }, + { + "epoch": 0.018580345478298736, + "grad_norm": 5.448958873748779, + "learning_rate": 7.433217189314751e-07, + "loss": 1.3275, + "step": 64 + }, + { + "epoch": 0.018870663376397156, + "grad_norm": 5.127908229827881, + "learning_rate": 7.549361207897795e-07, + "loss": 1.1126, + "step": 65 + }, + { + "epoch": 0.019160981274495573, + "grad_norm": 5.0388078689575195, + "learning_rate": 7.665505226480836e-07, + "loss": 1.1897, + "step": 66 + }, + { + "epoch": 0.01945129917259399, + "grad_norm": 5.600452423095703, + "learning_rate": 7.78164924506388e-07, + "loss": 1.2433, + "step": 67 + }, + { + "epoch": 0.01974161707069241, + "grad_norm": 5.887912750244141, + "learning_rate": 7.897793263646923e-07, + "loss": 1.3351, + "step": 68 + }, + { + "epoch": 0.020031934968790827, + "grad_norm": 5.299606800079346, + "learning_rate": 
8.013937282229965e-07, + "loss": 1.3575, + "step": 69 + }, + { + "epoch": 0.020322252866889243, + "grad_norm": 5.105284214019775, + "learning_rate": 8.130081300813009e-07, + "loss": 1.3878, + "step": 70 + }, + { + "epoch": 0.02061257076498766, + "grad_norm": 5.7982611656188965, + "learning_rate": 8.246225319396052e-07, + "loss": 1.3991, + "step": 71 + }, + { + "epoch": 0.02090288866308608, + "grad_norm": 4.94700288772583, + "learning_rate": 8.362369337979096e-07, + "loss": 1.1393, + "step": 72 + }, + { + "epoch": 0.021193206561184497, + "grad_norm": 5.303609848022461, + "learning_rate": 8.478513356562137e-07, + "loss": 1.2964, + "step": 73 + }, + { + "epoch": 0.021483524459282914, + "grad_norm": 5.495324611663818, + "learning_rate": 8.594657375145181e-07, + "loss": 1.2954, + "step": 74 + }, + { + "epoch": 0.02177384235738133, + "grad_norm": 5.6491618156433105, + "learning_rate": 8.710801393728225e-07, + "loss": 1.2611, + "step": 75 + }, + { + "epoch": 0.02206416025547975, + "grad_norm": 5.240455627441406, + "learning_rate": 8.826945412311266e-07, + "loss": 1.2958, + "step": 76 + }, + { + "epoch": 0.022354478153578168, + "grad_norm": 5.199842929840088, + "learning_rate": 8.94308943089431e-07, + "loss": 1.2819, + "step": 77 + }, + { + "epoch": 0.022644796051676585, + "grad_norm": 5.2357563972473145, + "learning_rate": 9.059233449477353e-07, + "loss": 1.4676, + "step": 78 + }, + { + "epoch": 0.022935113949775005, + "grad_norm": 5.108120918273926, + "learning_rate": 9.175377468060395e-07, + "loss": 1.2405, + "step": 79 + }, + { + "epoch": 0.023225431847873422, + "grad_norm": 5.231665134429932, + "learning_rate": 9.291521486643438e-07, + "loss": 1.4379, + "step": 80 + }, + { + "epoch": 0.02351574974597184, + "grad_norm": 5.028713226318359, + "learning_rate": 9.407665505226482e-07, + "loss": 1.2525, + "step": 81 + }, + { + "epoch": 0.023806067644070256, + "grad_norm": 4.963902473449707, + "learning_rate": 9.523809523809525e-07, + "loss": 1.3688, + "step": 82 + }, + { + "epoch": 0.024096385542168676, + "grad_norm": 5.339515209197998, + "learning_rate": 9.639953542392568e-07, + "loss": 1.3483, + "step": 83 + }, + { + "epoch": 0.024386703440267093, + "grad_norm": 4.661757946014404, + "learning_rate": 9.75609756097561e-07, + "loss": 1.2315, + "step": 84 + }, + { + "epoch": 0.02467702133836551, + "grad_norm": 5.013600826263428, + "learning_rate": 9.872241579558654e-07, + "loss": 1.1119, + "step": 85 + }, + { + "epoch": 0.024967339236463926, + "grad_norm": 5.247570514678955, + "learning_rate": 9.988385598141696e-07, + "loss": 1.2577, + "step": 86 + }, + { + "epoch": 0.025257657134562347, + "grad_norm": 4.793586254119873, + "learning_rate": 1.010452961672474e-06, + "loss": 1.1647, + "step": 87 + }, + { + "epoch": 0.025547975032660764, + "grad_norm": 5.240910530090332, + "learning_rate": 1.0220673635307784e-06, + "loss": 1.2794, + "step": 88 + }, + { + "epoch": 0.02583829293075918, + "grad_norm": 5.577494144439697, + "learning_rate": 1.0336817653890824e-06, + "loss": 1.2242, + "step": 89 + }, + { + "epoch": 0.0261286108288576, + "grad_norm": 5.272243976593018, + "learning_rate": 1.045296167247387e-06, + "loss": 1.2782, + "step": 90 + }, + { + "epoch": 0.026418928726956017, + "grad_norm": 4.999173164367676, + "learning_rate": 1.0569105691056912e-06, + "loss": 1.2035, + "step": 91 + }, + { + "epoch": 0.026709246625054434, + "grad_norm": 5.285266399383545, + "learning_rate": 1.0685249709639955e-06, + "loss": 1.2585, + "step": 92 + }, + { + "epoch": 0.02699956452315285, + "grad_norm": 4.789974212646484, + 
"learning_rate": 1.0801393728222997e-06, + "loss": 1.1728, + "step": 93 + }, + { + "epoch": 0.02728988242125127, + "grad_norm": 4.92954158782959, + "learning_rate": 1.091753774680604e-06, + "loss": 1.0652, + "step": 94 + }, + { + "epoch": 0.027580200319349688, + "grad_norm": 5.096219062805176, + "learning_rate": 1.1033681765389083e-06, + "loss": 1.1034, + "step": 95 + }, + { + "epoch": 0.027870518217448105, + "grad_norm": 4.46090030670166, + "learning_rate": 1.1149825783972125e-06, + "loss": 1.1724, + "step": 96 + }, + { + "epoch": 0.028160836115546522, + "grad_norm": 4.940242767333984, + "learning_rate": 1.126596980255517e-06, + "loss": 1.2639, + "step": 97 + }, + { + "epoch": 0.028451154013644942, + "grad_norm": 5.028652667999268, + "learning_rate": 1.1382113821138213e-06, + "loss": 1.2364, + "step": 98 + }, + { + "epoch": 0.02874147191174336, + "grad_norm": 4.718242645263672, + "learning_rate": 1.1498257839721255e-06, + "loss": 1.1937, + "step": 99 + }, + { + "epoch": 0.029031789809841776, + "grad_norm": 4.432032585144043, + "learning_rate": 1.1614401858304298e-06, + "loss": 1.1513, + "step": 100 + }, + { + "epoch": 0.029322107707940196, + "grad_norm": 4.936924934387207, + "learning_rate": 1.173054587688734e-06, + "loss": 1.2232, + "step": 101 + }, + { + "epoch": 0.029612425606038613, + "grad_norm": 5.688957214355469, + "learning_rate": 1.1846689895470384e-06, + "loss": 1.1863, + "step": 102 + }, + { + "epoch": 0.02990274350413703, + "grad_norm": 5.0107598304748535, + "learning_rate": 1.1962833914053428e-06, + "loss": 1.3997, + "step": 103 + }, + { + "epoch": 0.030193061402235447, + "grad_norm": 5.119560241699219, + "learning_rate": 1.207897793263647e-06, + "loss": 1.2213, + "step": 104 + }, + { + "epoch": 0.030483379300333867, + "grad_norm": 4.578975677490234, + "learning_rate": 1.2195121951219514e-06, + "loss": 1.146, + "step": 105 + }, + { + "epoch": 0.030773697198432284, + "grad_norm": 4.886281490325928, + "learning_rate": 1.2311265969802556e-06, + "loss": 1.0512, + "step": 106 + }, + { + "epoch": 0.0310640150965307, + "grad_norm": 5.57105827331543, + "learning_rate": 1.24274099883856e-06, + "loss": 1.1915, + "step": 107 + }, + { + "epoch": 0.03135433299462912, + "grad_norm": 4.908017158508301, + "learning_rate": 1.2543554006968642e-06, + "loss": 1.3416, + "step": 108 + }, + { + "epoch": 0.031644650892727534, + "grad_norm": 4.684658050537109, + "learning_rate": 1.2659698025551684e-06, + "loss": 1.2278, + "step": 109 + }, + { + "epoch": 0.031934968790825954, + "grad_norm": 4.777091026306152, + "learning_rate": 1.277584204413473e-06, + "loss": 1.2465, + "step": 110 + }, + { + "epoch": 0.032225286688924375, + "grad_norm": 5.166219234466553, + "learning_rate": 1.289198606271777e-06, + "loss": 1.2505, + "step": 111 + }, + { + "epoch": 0.03251560458702279, + "grad_norm": 4.726422309875488, + "learning_rate": 1.3008130081300815e-06, + "loss": 1.0856, + "step": 112 + }, + { + "epoch": 0.03280592248512121, + "grad_norm": 5.305611610412598, + "learning_rate": 1.3124274099883857e-06, + "loss": 1.1678, + "step": 113 + }, + { + "epoch": 0.03309624038321963, + "grad_norm": 5.176076889038086, + "learning_rate": 1.32404181184669e-06, + "loss": 1.3583, + "step": 114 + }, + { + "epoch": 0.03338655828131804, + "grad_norm": 5.078863143920898, + "learning_rate": 1.3356562137049945e-06, + "loss": 1.2707, + "step": 115 + }, + { + "epoch": 0.03367687617941646, + "grad_norm": 4.867222785949707, + "learning_rate": 1.3472706155632985e-06, + "loss": 1.233, + "step": 116 + }, + { + "epoch": 
0.033967194077514876, + "grad_norm": 5.004298210144043, + "learning_rate": 1.3588850174216028e-06, + "loss": 1.1668, + "step": 117 + }, + { + "epoch": 0.034257511975613296, + "grad_norm": 5.02892541885376, + "learning_rate": 1.3704994192799073e-06, + "loss": 1.3564, + "step": 118 + }, + { + "epoch": 0.034547829873711716, + "grad_norm": 5.394801616668701, + "learning_rate": 1.3821138211382116e-06, + "loss": 1.2523, + "step": 119 + }, + { + "epoch": 0.03483814777181013, + "grad_norm": 5.628437042236328, + "learning_rate": 1.3937282229965158e-06, + "loss": 1.4381, + "step": 120 + }, + { + "epoch": 0.03512846566990855, + "grad_norm": 4.8691229820251465, + "learning_rate": 1.4053426248548203e-06, + "loss": 1.25, + "step": 121 + }, + { + "epoch": 0.03541878356800697, + "grad_norm": 5.313623428344727, + "learning_rate": 1.4169570267131244e-06, + "loss": 1.2792, + "step": 122 + }, + { + "epoch": 0.035709101466105383, + "grad_norm": 4.7696943283081055, + "learning_rate": 1.4285714285714286e-06, + "loss": 1.2598, + "step": 123 + }, + { + "epoch": 0.035999419364203804, + "grad_norm": 5.520746231079102, + "learning_rate": 1.4401858304297331e-06, + "loss": 1.3088, + "step": 124 + }, + { + "epoch": 0.036289737262302224, + "grad_norm": 4.788918495178223, + "learning_rate": 1.4518002322880374e-06, + "loss": 1.1841, + "step": 125 + }, + { + "epoch": 0.03658005516040064, + "grad_norm": 4.718968868255615, + "learning_rate": 1.4634146341463414e-06, + "loss": 1.1215, + "step": 126 + }, + { + "epoch": 0.03687037305849906, + "grad_norm": 4.99876070022583, + "learning_rate": 1.475029036004646e-06, + "loss": 1.174, + "step": 127 + }, + { + "epoch": 0.03716069095659747, + "grad_norm": 5.314165115356445, + "learning_rate": 1.4866434378629502e-06, + "loss": 1.2494, + "step": 128 + }, + { + "epoch": 0.03745100885469589, + "grad_norm": 4.882414817810059, + "learning_rate": 1.4982578397212545e-06, + "loss": 1.2868, + "step": 129 + }, + { + "epoch": 0.03774132675279431, + "grad_norm": 4.856612682342529, + "learning_rate": 1.509872241579559e-06, + "loss": 1.1583, + "step": 130 + }, + { + "epoch": 0.038031644650892725, + "grad_norm": 5.177412986755371, + "learning_rate": 1.521486643437863e-06, + "loss": 1.1967, + "step": 131 + }, + { + "epoch": 0.038321962548991145, + "grad_norm": 5.465760231018066, + "learning_rate": 1.5331010452961673e-06, + "loss": 1.2635, + "step": 132 + }, + { + "epoch": 0.038612280447089566, + "grad_norm": 4.9557342529296875, + "learning_rate": 1.5447154471544717e-06, + "loss": 1.1736, + "step": 133 + }, + { + "epoch": 0.03890259834518798, + "grad_norm": 5.2583537101745605, + "learning_rate": 1.556329849012776e-06, + "loss": 1.1179, + "step": 134 + }, + { + "epoch": 0.0391929162432864, + "grad_norm": 5.05612325668335, + "learning_rate": 1.56794425087108e-06, + "loss": 1.2193, + "step": 135 + }, + { + "epoch": 0.03948323414138482, + "grad_norm": 5.267907619476318, + "learning_rate": 1.5795586527293845e-06, + "loss": 1.2736, + "step": 136 + }, + { + "epoch": 0.03977355203948323, + "grad_norm": 4.456612586975098, + "learning_rate": 1.5911730545876888e-06, + "loss": 1.1091, + "step": 137 + }, + { + "epoch": 0.04006386993758165, + "grad_norm": 4.886338710784912, + "learning_rate": 1.602787456445993e-06, + "loss": 1.2456, + "step": 138 + }, + { + "epoch": 0.040354187835680067, + "grad_norm": 4.77720308303833, + "learning_rate": 1.6144018583042976e-06, + "loss": 1.0892, + "step": 139 + }, + { + "epoch": 0.04064450573377849, + "grad_norm": 5.040073394775391, + "learning_rate": 1.6260162601626018e-06, + 
"loss": 1.1725, + "step": 140 + }, + { + "epoch": 0.04093482363187691, + "grad_norm": 4.47899055480957, + "learning_rate": 1.6376306620209059e-06, + "loss": 1.0944, + "step": 141 + }, + { + "epoch": 0.04122514152997532, + "grad_norm": 4.960933208465576, + "learning_rate": 1.6492450638792104e-06, + "loss": 1.1804, + "step": 142 + }, + { + "epoch": 0.04151545942807374, + "grad_norm": 4.783790111541748, + "learning_rate": 1.6608594657375146e-06, + "loss": 1.2976, + "step": 143 + }, + { + "epoch": 0.04180577732617216, + "grad_norm": 4.320231914520264, + "learning_rate": 1.6724738675958191e-06, + "loss": 1.2064, + "step": 144 + }, + { + "epoch": 0.042096095224270574, + "grad_norm": 4.767696380615234, + "learning_rate": 1.6840882694541234e-06, + "loss": 1.1187, + "step": 145 + }, + { + "epoch": 0.042386413122368995, + "grad_norm": 4.700660228729248, + "learning_rate": 1.6957026713124274e-06, + "loss": 1.0453, + "step": 146 + }, + { + "epoch": 0.042676731020467415, + "grad_norm": 4.928901195526123, + "learning_rate": 1.707317073170732e-06, + "loss": 1.1402, + "step": 147 + }, + { + "epoch": 0.04296704891856583, + "grad_norm": 5.0144758224487305, + "learning_rate": 1.7189314750290362e-06, + "loss": 1.2565, + "step": 148 + }, + { + "epoch": 0.04325736681666425, + "grad_norm": 5.311608791351318, + "learning_rate": 1.7305458768873405e-06, + "loss": 1.3904, + "step": 149 + }, + { + "epoch": 0.04354768471476266, + "grad_norm": 5.366107940673828, + "learning_rate": 1.742160278745645e-06, + "loss": 1.2125, + "step": 150 + }, + { + "epoch": 0.04383800261286108, + "grad_norm": 5.120449066162109, + "learning_rate": 1.753774680603949e-06, + "loss": 1.2111, + "step": 151 + }, + { + "epoch": 0.0441283205109595, + "grad_norm": 4.783287525177002, + "learning_rate": 1.7653890824622533e-06, + "loss": 1.309, + "step": 152 + }, + { + "epoch": 0.044418638409057916, + "grad_norm": 5.0367751121521, + "learning_rate": 1.7770034843205577e-06, + "loss": 1.1008, + "step": 153 + }, + { + "epoch": 0.044708956307156336, + "grad_norm": 4.646999835968018, + "learning_rate": 1.788617886178862e-06, + "loss": 1.1683, + "step": 154 + }, + { + "epoch": 0.044999274205254756, + "grad_norm": 4.942159175872803, + "learning_rate": 1.800232288037166e-06, + "loss": 1.1253, + "step": 155 + }, + { + "epoch": 0.04528959210335317, + "grad_norm": 5.135502815246582, + "learning_rate": 1.8118466898954705e-06, + "loss": 1.4273, + "step": 156 + }, + { + "epoch": 0.04557991000145159, + "grad_norm": 4.905440330505371, + "learning_rate": 1.8234610917537748e-06, + "loss": 1.3051, + "step": 157 + }, + { + "epoch": 0.04587022789955001, + "grad_norm": 4.9893798828125, + "learning_rate": 1.835075493612079e-06, + "loss": 1.308, + "step": 158 + }, + { + "epoch": 0.046160545797648424, + "grad_norm": 4.7659759521484375, + "learning_rate": 1.8466898954703836e-06, + "loss": 1.1285, + "step": 159 + }, + { + "epoch": 0.046450863695746844, + "grad_norm": 4.867801189422607, + "learning_rate": 1.8583042973286876e-06, + "loss": 1.1024, + "step": 160 + }, + { + "epoch": 0.04674118159384526, + "grad_norm": 5.107170104980469, + "learning_rate": 1.8699186991869919e-06, + "loss": 1.2959, + "step": 161 + }, + { + "epoch": 0.04703149949194368, + "grad_norm": 5.213975429534912, + "learning_rate": 1.8815331010452964e-06, + "loss": 1.2973, + "step": 162 + }, + { + "epoch": 0.0473218173900421, + "grad_norm": 4.60981559753418, + "learning_rate": 1.8931475029036006e-06, + "loss": 1.2259, + "step": 163 + }, + { + "epoch": 0.04761213528814051, + "grad_norm": 4.348560333251953, + 
"learning_rate": 1.904761904761905e-06, + "loss": 1.0382, + "step": 164 + }, + { + "epoch": 0.04790245318623893, + "grad_norm": 4.841989517211914, + "learning_rate": 1.916376306620209e-06, + "loss": 1.2645, + "step": 165 + }, + { + "epoch": 0.04819277108433735, + "grad_norm": 4.736576557159424, + "learning_rate": 1.9279907084785137e-06, + "loss": 1.2034, + "step": 166 + }, + { + "epoch": 0.048483088982435765, + "grad_norm": 5.0388383865356445, + "learning_rate": 1.9396051103368177e-06, + "loss": 1.2515, + "step": 167 + }, + { + "epoch": 0.048773406880534186, + "grad_norm": 4.488497257232666, + "learning_rate": 1.951219512195122e-06, + "loss": 1.1909, + "step": 168 + }, + { + "epoch": 0.049063724778632606, + "grad_norm": 4.383110523223877, + "learning_rate": 1.9628339140534263e-06, + "loss": 1.0721, + "step": 169 + }, + { + "epoch": 0.04935404267673102, + "grad_norm": 5.338650703430176, + "learning_rate": 1.9744483159117307e-06, + "loss": 1.2284, + "step": 170 + }, + { + "epoch": 0.04964436057482944, + "grad_norm": 4.445425033569336, + "learning_rate": 1.986062717770035e-06, + "loss": 1.1499, + "step": 171 + }, + { + "epoch": 0.04993467847292785, + "grad_norm": 4.984339237213135, + "learning_rate": 1.9976771196283393e-06, + "loss": 1.3605, + "step": 172 + }, + { + "epoch": 0.05022499637102627, + "grad_norm": 4.657524585723877, + "learning_rate": 2.0092915214866433e-06, + "loss": 1.2488, + "step": 173 + }, + { + "epoch": 0.05051531426912469, + "grad_norm": 4.822662353515625, + "learning_rate": 2.020905923344948e-06, + "loss": 1.1577, + "step": 174 + }, + { + "epoch": 0.05080563216722311, + "grad_norm": 4.718631744384766, + "learning_rate": 2.0325203252032523e-06, + "loss": 1.0862, + "step": 175 + }, + { + "epoch": 0.05109595006532153, + "grad_norm": 4.929813861846924, + "learning_rate": 2.0441347270615568e-06, + "loss": 1.2659, + "step": 176 + }, + { + "epoch": 0.05138626796341995, + "grad_norm": 5.136166572570801, + "learning_rate": 2.055749128919861e-06, + "loss": 1.2169, + "step": 177 + }, + { + "epoch": 0.05167658586151836, + "grad_norm": 4.956854343414307, + "learning_rate": 2.067363530778165e-06, + "loss": 1.1328, + "step": 178 + }, + { + "epoch": 0.05196690375961678, + "grad_norm": 4.586047649383545, + "learning_rate": 2.0789779326364694e-06, + "loss": 1.1756, + "step": 179 + }, + { + "epoch": 0.0522572216577152, + "grad_norm": 4.752535820007324, + "learning_rate": 2.090592334494774e-06, + "loss": 1.3709, + "step": 180 + }, + { + "epoch": 0.052547539555813615, + "grad_norm": 5.013321876525879, + "learning_rate": 2.102206736353078e-06, + "loss": 1.1806, + "step": 181 + }, + { + "epoch": 0.052837857453912035, + "grad_norm": 4.766448020935059, + "learning_rate": 2.1138211382113824e-06, + "loss": 0.9959, + "step": 182 + }, + { + "epoch": 0.05312817535201045, + "grad_norm": 4.972908020019531, + "learning_rate": 2.1254355400696864e-06, + "loss": 1.2942, + "step": 183 + }, + { + "epoch": 0.05341849325010887, + "grad_norm": 4.858799934387207, + "learning_rate": 2.137049941927991e-06, + "loss": 1.1823, + "step": 184 + }, + { + "epoch": 0.05370881114820729, + "grad_norm": 4.911069393157959, + "learning_rate": 2.1486643437862954e-06, + "loss": 1.2269, + "step": 185 + }, + { + "epoch": 0.0539991290463057, + "grad_norm": 4.7894368171691895, + "learning_rate": 2.1602787456445995e-06, + "loss": 1.2492, + "step": 186 + }, + { + "epoch": 0.05428944694440412, + "grad_norm": 4.717777729034424, + "learning_rate": 2.1718931475029035e-06, + "loss": 1.2164, + "step": 187 + }, + { + "epoch": 
0.05457976484250254, + "grad_norm": 4.9674763679504395, + "learning_rate": 2.183507549361208e-06, + "loss": 1.2069, + "step": 188 + }, + { + "epoch": 0.054870082740600956, + "grad_norm": 5.091649532318115, + "learning_rate": 2.1951219512195125e-06, + "loss": 1.1534, + "step": 189 + }, + { + "epoch": 0.055160400638699376, + "grad_norm": 4.965774059295654, + "learning_rate": 2.2067363530778165e-06, + "loss": 1.3424, + "step": 190 + }, + { + "epoch": 0.0554507185367978, + "grad_norm": 5.256765842437744, + "learning_rate": 2.218350754936121e-06, + "loss": 1.2947, + "step": 191 + }, + { + "epoch": 0.05574103643489621, + "grad_norm": 5.316900730133057, + "learning_rate": 2.229965156794425e-06, + "loss": 1.3133, + "step": 192 + }, + { + "epoch": 0.05603135433299463, + "grad_norm": 5.373122692108154, + "learning_rate": 2.2415795586527295e-06, + "loss": 1.0931, + "step": 193 + }, + { + "epoch": 0.056321672231093044, + "grad_norm": 5.171296119689941, + "learning_rate": 2.253193960511034e-06, + "loss": 1.2404, + "step": 194 + }, + { + "epoch": 0.056611990129191464, + "grad_norm": 5.496878147125244, + "learning_rate": 2.264808362369338e-06, + "loss": 1.1381, + "step": 195 + }, + { + "epoch": 0.056902308027289884, + "grad_norm": 5.244287014007568, + "learning_rate": 2.2764227642276426e-06, + "loss": 1.3365, + "step": 196 + }, + { + "epoch": 0.0571926259253883, + "grad_norm": 5.100976943969727, + "learning_rate": 2.288037166085947e-06, + "loss": 1.2419, + "step": 197 + }, + { + "epoch": 0.05748294382348672, + "grad_norm": 5.562692642211914, + "learning_rate": 2.299651567944251e-06, + "loss": 1.3099, + "step": 198 + }, + { + "epoch": 0.05777326172158514, + "grad_norm": 5.311895370483398, + "learning_rate": 2.311265969802555e-06, + "loss": 1.1572, + "step": 199 + }, + { + "epoch": 0.05806357961968355, + "grad_norm": 5.604903221130371, + "learning_rate": 2.3228803716608596e-06, + "loss": 1.3625, + "step": 200 + }, + { + "epoch": 0.05835389751778197, + "grad_norm": 5.023021697998047, + "learning_rate": 2.334494773519164e-06, + "loss": 1.2232, + "step": 201 + }, + { + "epoch": 0.05864421541588039, + "grad_norm": 5.409064769744873, + "learning_rate": 2.346109175377468e-06, + "loss": 1.2896, + "step": 202 + }, + { + "epoch": 0.058934533313978806, + "grad_norm": 4.986676216125488, + "learning_rate": 2.3577235772357727e-06, + "loss": 1.1278, + "step": 203 + }, + { + "epoch": 0.059224851212077226, + "grad_norm": 4.7012128829956055, + "learning_rate": 2.3693379790940767e-06, + "loss": 1.2292, + "step": 204 + }, + { + "epoch": 0.05951516911017564, + "grad_norm": 4.756272315979004, + "learning_rate": 2.380952380952381e-06, + "loss": 1.1426, + "step": 205 + }, + { + "epoch": 0.05980548700827406, + "grad_norm": 4.644824504852295, + "learning_rate": 2.3925667828106857e-06, + "loss": 1.1133, + "step": 206 + }, + { + "epoch": 0.06009580490637248, + "grad_norm": 4.655545234680176, + "learning_rate": 2.4041811846689897e-06, + "loss": 1.1316, + "step": 207 + }, + { + "epoch": 0.06038612280447089, + "grad_norm": 5.067546844482422, + "learning_rate": 2.415795586527294e-06, + "loss": 1.1613, + "step": 208 + }, + { + "epoch": 0.06067644070256931, + "grad_norm": 5.918067455291748, + "learning_rate": 2.4274099883855983e-06, + "loss": 1.3104, + "step": 209 + }, + { + "epoch": 0.060966758600667734, + "grad_norm": 4.958433151245117, + "learning_rate": 2.4390243902439027e-06, + "loss": 1.1905, + "step": 210 + }, + { + "epoch": 0.06125707649876615, + "grad_norm": 4.635531902313232, + "learning_rate": 2.4506387921022072e-06, + "loss": 
1.1553, + "step": 211 + }, + { + "epoch": 0.06154739439686457, + "grad_norm": 4.515402793884277, + "learning_rate": 2.4622531939605113e-06, + "loss": 1.1648, + "step": 212 + }, + { + "epoch": 0.06183771229496299, + "grad_norm": 4.840621471405029, + "learning_rate": 2.4738675958188153e-06, + "loss": 1.3244, + "step": 213 + }, + { + "epoch": 0.0621280301930614, + "grad_norm": 4.515079498291016, + "learning_rate": 2.48548199767712e-06, + "loss": 1.0585, + "step": 214 + }, + { + "epoch": 0.06241834809115982, + "grad_norm": 4.8643693923950195, + "learning_rate": 2.4970963995354243e-06, + "loss": 1.2397, + "step": 215 + }, + { + "epoch": 0.06270866598925824, + "grad_norm": 5.038429260253906, + "learning_rate": 2.5087108013937284e-06, + "loss": 1.1628, + "step": 216 + }, + { + "epoch": 0.06299898388735665, + "grad_norm": 5.393674373626709, + "learning_rate": 2.5203252032520324e-06, + "loss": 1.1503, + "step": 217 + }, + { + "epoch": 0.06328930178545507, + "grad_norm": 4.6619038581848145, + "learning_rate": 2.531939605110337e-06, + "loss": 1.2291, + "step": 218 + }, + { + "epoch": 0.0635796196835535, + "grad_norm": 4.9958648681640625, + "learning_rate": 2.5435540069686414e-06, + "loss": 1.1938, + "step": 219 + }, + { + "epoch": 0.06386993758165191, + "grad_norm": 4.516469955444336, + "learning_rate": 2.555168408826946e-06, + "loss": 1.0742, + "step": 220 + }, + { + "epoch": 0.06416025547975032, + "grad_norm": 4.328372001647949, + "learning_rate": 2.56678281068525e-06, + "loss": 1.2531, + "step": 221 + }, + { + "epoch": 0.06445057337784875, + "grad_norm": 4.436943054199219, + "learning_rate": 2.578397212543554e-06, + "loss": 1.2107, + "step": 222 + }, + { + "epoch": 0.06474089127594716, + "grad_norm": 4.877750396728516, + "learning_rate": 2.5900116144018584e-06, + "loss": 1.4222, + "step": 223 + }, + { + "epoch": 0.06503120917404558, + "grad_norm": 5.479437828063965, + "learning_rate": 2.601626016260163e-06, + "loss": 1.1668, + "step": 224 + }, + { + "epoch": 0.065321527072144, + "grad_norm": 4.4991583824157715, + "learning_rate": 2.6132404181184674e-06, + "loss": 0.9982, + "step": 225 + }, + { + "epoch": 0.06561184497024242, + "grad_norm": 4.993007183074951, + "learning_rate": 2.6248548199767715e-06, + "loss": 1.1666, + "step": 226 + }, + { + "epoch": 0.06590216286834083, + "grad_norm": 4.814315319061279, + "learning_rate": 2.6364692218350755e-06, + "loss": 1.2113, + "step": 227 + }, + { + "epoch": 0.06619248076643926, + "grad_norm": 4.64751672744751, + "learning_rate": 2.64808362369338e-06, + "loss": 1.1168, + "step": 228 + }, + { + "epoch": 0.06648279866453767, + "grad_norm": 4.427606582641602, + "learning_rate": 2.659698025551684e-06, + "loss": 1.05, + "step": 229 + }, + { + "epoch": 0.06677311656263608, + "grad_norm": 5.613397121429443, + "learning_rate": 2.671312427409989e-06, + "loss": 1.1802, + "step": 230 + }, + { + "epoch": 0.06706343446073451, + "grad_norm": 5.010979652404785, + "learning_rate": 2.682926829268293e-06, + "loss": 1.376, + "step": 231 + }, + { + "epoch": 0.06735375235883292, + "grad_norm": 4.853494644165039, + "learning_rate": 2.694541231126597e-06, + "loss": 1.006, + "step": 232 + }, + { + "epoch": 0.06764407025693134, + "grad_norm": 4.468390464782715, + "learning_rate": 2.7061556329849016e-06, + "loss": 1.2792, + "step": 233 + }, + { + "epoch": 0.06793438815502975, + "grad_norm": 4.853550910949707, + "learning_rate": 2.7177700348432056e-06, + "loss": 1.2201, + "step": 234 + }, + { + "epoch": 0.06822470605312818, + "grad_norm": 4.637911319732666, + "learning_rate": 
2.7293844367015097e-06, + "loss": 1.1786, + "step": 235 + }, + { + "epoch": 0.06851502395122659, + "grad_norm": 4.544745922088623, + "learning_rate": 2.7409988385598146e-06, + "loss": 1.1948, + "step": 236 + }, + { + "epoch": 0.068805341849325, + "grad_norm": 4.622826099395752, + "learning_rate": 2.7526132404181186e-06, + "loss": 1.0758, + "step": 237 + }, + { + "epoch": 0.06909565974742343, + "grad_norm": 4.711224555969238, + "learning_rate": 2.764227642276423e-06, + "loss": 1.0424, + "step": 238 + }, + { + "epoch": 0.06938597764552185, + "grad_norm": 4.914583206176758, + "learning_rate": 2.775842044134727e-06, + "loss": 1.2045, + "step": 239 + }, + { + "epoch": 0.06967629554362026, + "grad_norm": 4.866950511932373, + "learning_rate": 2.7874564459930316e-06, + "loss": 1.1709, + "step": 240 + }, + { + "epoch": 0.06996661344171869, + "grad_norm": 4.3304123878479, + "learning_rate": 2.7990708478513357e-06, + "loss": 1.1456, + "step": 241 + }, + { + "epoch": 0.0702569313398171, + "grad_norm": 5.298426628112793, + "learning_rate": 2.8106852497096406e-06, + "loss": 1.3114, + "step": 242 + }, + { + "epoch": 0.07054724923791551, + "grad_norm": 4.610419750213623, + "learning_rate": 2.8222996515679447e-06, + "loss": 1.0748, + "step": 243 + }, + { + "epoch": 0.07083756713601394, + "grad_norm": 5.127123832702637, + "learning_rate": 2.8339140534262487e-06, + "loss": 1.1137, + "step": 244 + }, + { + "epoch": 0.07112788503411235, + "grad_norm": 4.717776775360107, + "learning_rate": 2.845528455284553e-06, + "loss": 1.286, + "step": 245 + }, + { + "epoch": 0.07141820293221077, + "grad_norm": 4.651844024658203, + "learning_rate": 2.8571428571428573e-06, + "loss": 1.1565, + "step": 246 + }, + { + "epoch": 0.0717085208303092, + "grad_norm": 4.493513584136963, + "learning_rate": 2.8687572590011613e-06, + "loss": 1.1787, + "step": 247 + }, + { + "epoch": 0.07199883872840761, + "grad_norm": 4.902074813842773, + "learning_rate": 2.8803716608594662e-06, + "loss": 1.232, + "step": 248 + }, + { + "epoch": 0.07228915662650602, + "grad_norm": 4.760148048400879, + "learning_rate": 2.8919860627177703e-06, + "loss": 1.1347, + "step": 249 + }, + { + "epoch": 0.07257947452460445, + "grad_norm": 5.101321697235107, + "learning_rate": 2.9036004645760748e-06, + "loss": 1.067, + "step": 250 + }, + { + "epoch": 0.07286979242270286, + "grad_norm": 5.148083209991455, + "learning_rate": 2.915214866434379e-06, + "loss": 1.3102, + "step": 251 + }, + { + "epoch": 0.07316011032080127, + "grad_norm": 4.3725152015686035, + "learning_rate": 2.926829268292683e-06, + "loss": 1.0622, + "step": 252 + }, + { + "epoch": 0.0734504282188997, + "grad_norm": 5.068384170532227, + "learning_rate": 2.9384436701509873e-06, + "loss": 1.1661, + "step": 253 + }, + { + "epoch": 0.07374074611699812, + "grad_norm": 4.736722946166992, + "learning_rate": 2.950058072009292e-06, + "loss": 1.2684, + "step": 254 + }, + { + "epoch": 0.07403106401509653, + "grad_norm": 4.653499603271484, + "learning_rate": 2.9616724738675963e-06, + "loss": 1.1995, + "step": 255 + }, + { + "epoch": 0.07432138191319494, + "grad_norm": 4.878271102905273, + "learning_rate": 2.9732868757259004e-06, + "loss": 1.1359, + "step": 256 + }, + { + "epoch": 0.07461169981129337, + "grad_norm": 4.5596923828125, + "learning_rate": 2.9849012775842044e-06, + "loss": 1.2118, + "step": 257 + }, + { + "epoch": 0.07490201770939178, + "grad_norm": 4.714583873748779, + "learning_rate": 2.996515679442509e-06, + "loss": 1.1838, + "step": 258 + }, + { + "epoch": 0.0751923356074902, + "grad_norm": 
4.619505405426025, + "learning_rate": 3.0081300813008134e-06, + "loss": 1.005, + "step": 259 + }, + { + "epoch": 0.07548265350558862, + "grad_norm": 4.827937602996826, + "learning_rate": 3.019744483159118e-06, + "loss": 1.2427, + "step": 260 + }, + { + "epoch": 0.07577297140368704, + "grad_norm": 4.799366474151611, + "learning_rate": 3.031358885017422e-06, + "loss": 1.2602, + "step": 261 + }, + { + "epoch": 0.07606328930178545, + "grad_norm": 4.541555404663086, + "learning_rate": 3.042973286875726e-06, + "loss": 1.2551, + "step": 262 + }, + { + "epoch": 0.07635360719988388, + "grad_norm": 4.521805286407471, + "learning_rate": 3.0545876887340305e-06, + "loss": 1.1664, + "step": 263 + }, + { + "epoch": 0.07664392509798229, + "grad_norm": 4.505204677581787, + "learning_rate": 3.0662020905923345e-06, + "loss": 1.119, + "step": 264 + }, + { + "epoch": 0.0769342429960807, + "grad_norm": 4.231343746185303, + "learning_rate": 3.0778164924506394e-06, + "loss": 1.2252, + "step": 265 + }, + { + "epoch": 0.07722456089417913, + "grad_norm": 4.726437568664551, + "learning_rate": 3.0894308943089435e-06, + "loss": 1.3634, + "step": 266 + }, + { + "epoch": 0.07751487879227754, + "grad_norm": 4.76708984375, + "learning_rate": 3.1010452961672475e-06, + "loss": 1.3045, + "step": 267 + }, + { + "epoch": 0.07780519669037596, + "grad_norm": 4.557008743286133, + "learning_rate": 3.112659698025552e-06, + "loss": 1.1581, + "step": 268 + }, + { + "epoch": 0.07809551458847439, + "grad_norm": 5.199429512023926, + "learning_rate": 3.124274099883856e-06, + "loss": 1.2774, + "step": 269 + }, + { + "epoch": 0.0783858324865728, + "grad_norm": 5.509277820587158, + "learning_rate": 3.13588850174216e-06, + "loss": 1.3066, + "step": 270 + }, + { + "epoch": 0.07867615038467121, + "grad_norm": 4.698461055755615, + "learning_rate": 3.147502903600465e-06, + "loss": 1.1477, + "step": 271 + }, + { + "epoch": 0.07896646828276964, + "grad_norm": 4.983335494995117, + "learning_rate": 3.159117305458769e-06, + "loss": 1.0314, + "step": 272 + }, + { + "epoch": 0.07925678618086805, + "grad_norm": 4.76466703414917, + "learning_rate": 3.1707317073170736e-06, + "loss": 1.2794, + "step": 273 + }, + { + "epoch": 0.07954710407896647, + "grad_norm": 4.861992359161377, + "learning_rate": 3.1823461091753776e-06, + "loss": 1.2291, + "step": 274 + }, + { + "epoch": 0.07983742197706489, + "grad_norm": 5.327348709106445, + "learning_rate": 3.1939605110336817e-06, + "loss": 1.1653, + "step": 275 + }, + { + "epoch": 0.0801277398751633, + "grad_norm": 4.695688247680664, + "learning_rate": 3.205574912891986e-06, + "loss": 1.2847, + "step": 276 + }, + { + "epoch": 0.08041805777326172, + "grad_norm": 4.913061141967773, + "learning_rate": 3.2171893147502906e-06, + "loss": 1.1864, + "step": 277 + }, + { + "epoch": 0.08070837567136013, + "grad_norm": 4.667782306671143, + "learning_rate": 3.228803716608595e-06, + "loss": 1.1751, + "step": 278 + }, + { + "epoch": 0.08099869356945856, + "grad_norm": 4.723694324493408, + "learning_rate": 3.240418118466899e-06, + "loss": 1.3455, + "step": 279 + }, + { + "epoch": 0.08128901146755697, + "grad_norm": 5.621630668640137, + "learning_rate": 3.2520325203252037e-06, + "loss": 1.156, + "step": 280 + }, + { + "epoch": 0.08157932936565539, + "grad_norm": 4.824314117431641, + "learning_rate": 3.2636469221835077e-06, + "loss": 1.2029, + "step": 281 + }, + { + "epoch": 0.08186964726375381, + "grad_norm": 4.6834025382995605, + "learning_rate": 3.2752613240418118e-06, + "loss": 1.1729, + "step": 282 + }, + { + "epoch": 
0.08215996516185223, + "grad_norm": 4.411752223968506, + "learning_rate": 3.2868757259001167e-06, + "loss": 1.099, + "step": 283 + }, + { + "epoch": 0.08245028305995064, + "grad_norm": 4.955481052398682, + "learning_rate": 3.2984901277584207e-06, + "loss": 1.3098, + "step": 284 + }, + { + "epoch": 0.08274060095804907, + "grad_norm": 4.61010217666626, + "learning_rate": 3.310104529616725e-06, + "loss": 0.9964, + "step": 285 + }, + { + "epoch": 0.08303091885614748, + "grad_norm": 4.8403000831604, + "learning_rate": 3.3217189314750293e-06, + "loss": 1.2399, + "step": 286 + }, + { + "epoch": 0.0833212367542459, + "grad_norm": 4.739892482757568, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.2173, + "step": 287 + }, + { + "epoch": 0.08361155465234432, + "grad_norm": 4.817641735076904, + "learning_rate": 3.3449477351916382e-06, + "loss": 1.1471, + "step": 288 + }, + { + "epoch": 0.08390187255044274, + "grad_norm": 4.951786518096924, + "learning_rate": 3.3565621370499423e-06, + "loss": 1.3252, + "step": 289 + }, + { + "epoch": 0.08419219044854115, + "grad_norm": 4.856020927429199, + "learning_rate": 3.3681765389082468e-06, + "loss": 1.2437, + "step": 290 + }, + { + "epoch": 0.08448250834663958, + "grad_norm": 4.223579406738281, + "learning_rate": 3.379790940766551e-06, + "loss": 1.0679, + "step": 291 + }, + { + "epoch": 0.08477282624473799, + "grad_norm": 4.6746344566345215, + "learning_rate": 3.391405342624855e-06, + "loss": 1.2439, + "step": 292 + }, + { + "epoch": 0.0850631441428364, + "grad_norm": 4.416624069213867, + "learning_rate": 3.4030197444831594e-06, + "loss": 1.1559, + "step": 293 + }, + { + "epoch": 0.08535346204093483, + "grad_norm": 4.347145080566406, + "learning_rate": 3.414634146341464e-06, + "loss": 1.2274, + "step": 294 + }, + { + "epoch": 0.08564377993903324, + "grad_norm": 4.638583660125732, + "learning_rate": 3.4262485481997683e-06, + "loss": 1.1497, + "step": 295 + }, + { + "epoch": 0.08593409783713166, + "grad_norm": 4.834431171417236, + "learning_rate": 3.4378629500580724e-06, + "loss": 1.2264, + "step": 296 + }, + { + "epoch": 0.08622441573523008, + "grad_norm": 4.830117225646973, + "learning_rate": 3.4494773519163764e-06, + "loss": 1.1305, + "step": 297 + }, + { + "epoch": 0.0865147336333285, + "grad_norm": 4.986152172088623, + "learning_rate": 3.461091753774681e-06, + "loss": 1.258, + "step": 298 + }, + { + "epoch": 0.08680505153142691, + "grad_norm": 4.623694896697998, + "learning_rate": 3.472706155632985e-06, + "loss": 1.1584, + "step": 299 + }, + { + "epoch": 0.08709536942952532, + "grad_norm": 4.773608207702637, + "learning_rate": 3.48432055749129e-06, + "loss": 1.2913, + "step": 300 + }, + { + "epoch": 0.08738568732762375, + "grad_norm": 4.353751182556152, + "learning_rate": 3.495934959349594e-06, + "loss": 1.2131, + "step": 301 + }, + { + "epoch": 0.08767600522572216, + "grad_norm": 4.784504413604736, + "learning_rate": 3.507549361207898e-06, + "loss": 1.145, + "step": 302 + }, + { + "epoch": 0.08796632312382058, + "grad_norm": 4.825213432312012, + "learning_rate": 3.5191637630662025e-06, + "loss": 1.154, + "step": 303 + }, + { + "epoch": 0.088256641021919, + "grad_norm": 5.358443737030029, + "learning_rate": 3.5307781649245065e-06, + "loss": 1.2215, + "step": 304 + }, + { + "epoch": 0.08854695892001742, + "grad_norm": 4.255599021911621, + "learning_rate": 3.5423925667828106e-06, + "loss": 1.1419, + "step": 305 + }, + { + "epoch": 0.08883727681811583, + "grad_norm": 4.947575092315674, + "learning_rate": 3.5540069686411155e-06, + "loss": 1.2882, + "step": 
306 + }, + { + "epoch": 0.08912759471621426, + "grad_norm": 5.248209476470947, + "learning_rate": 3.5656213704994195e-06, + "loss": 1.1513, + "step": 307 + }, + { + "epoch": 0.08941791261431267, + "grad_norm": 4.874551773071289, + "learning_rate": 3.577235772357724e-06, + "loss": 1.3386, + "step": 308 + }, + { + "epoch": 0.08970823051241109, + "grad_norm": 4.576282978057861, + "learning_rate": 3.588850174216028e-06, + "loss": 1.1586, + "step": 309 + }, + { + "epoch": 0.08999854841050951, + "grad_norm": 4.958520889282227, + "learning_rate": 3.600464576074332e-06, + "loss": 1.3224, + "step": 310 + }, + { + "epoch": 0.09028886630860793, + "grad_norm": 4.927209854125977, + "learning_rate": 3.6120789779326366e-06, + "loss": 1.2893, + "step": 311 + }, + { + "epoch": 0.09057918420670634, + "grad_norm": 4.564126968383789, + "learning_rate": 3.623693379790941e-06, + "loss": 1.1545, + "step": 312 + }, + { + "epoch": 0.09086950210480477, + "grad_norm": 4.723407745361328, + "learning_rate": 3.6353077816492456e-06, + "loss": 1.2735, + "step": 313 + }, + { + "epoch": 0.09115982000290318, + "grad_norm": 4.90524435043335, + "learning_rate": 3.6469221835075496e-06, + "loss": 1.2508, + "step": 314 + }, + { + "epoch": 0.0914501379010016, + "grad_norm": 4.609728813171387, + "learning_rate": 3.6585365853658537e-06, + "loss": 1.055, + "step": 315 + }, + { + "epoch": 0.09174045579910002, + "grad_norm": 4.467485427856445, + "learning_rate": 3.670150987224158e-06, + "loss": 1.118, + "step": 316 + }, + { + "epoch": 0.09203077369719843, + "grad_norm": 4.879512310028076, + "learning_rate": 3.6817653890824622e-06, + "loss": 1.1635, + "step": 317 + }, + { + "epoch": 0.09232109159529685, + "grad_norm": 4.552756309509277, + "learning_rate": 3.693379790940767e-06, + "loss": 1.2854, + "step": 318 + }, + { + "epoch": 0.09261140949339527, + "grad_norm": 4.794209003448486, + "learning_rate": 3.704994192799071e-06, + "loss": 1.2284, + "step": 319 + }, + { + "epoch": 0.09290172739149369, + "grad_norm": 4.714296340942383, + "learning_rate": 3.7166085946573752e-06, + "loss": 1.1289, + "step": 320 + }, + { + "epoch": 0.0931920452895921, + "grad_norm": 4.3302106857299805, + "learning_rate": 3.7282229965156797e-06, + "loss": 1.2047, + "step": 321 + }, + { + "epoch": 0.09348236318769051, + "grad_norm": 4.78410005569458, + "learning_rate": 3.7398373983739838e-06, + "loss": 1.2851, + "step": 322 + }, + { + "epoch": 0.09377268108578894, + "grad_norm": 4.550713539123535, + "learning_rate": 3.7514518002322887e-06, + "loss": 1.1176, + "step": 323 + }, + { + "epoch": 0.09406299898388736, + "grad_norm": 4.67529821395874, + "learning_rate": 3.7630662020905927e-06, + "loss": 1.1582, + "step": 324 + }, + { + "epoch": 0.09435331688198577, + "grad_norm": 5.0789361000061035, + "learning_rate": 3.7746806039488972e-06, + "loss": 1.1994, + "step": 325 + }, + { + "epoch": 0.0946436347800842, + "grad_norm": 4.371364593505859, + "learning_rate": 3.7862950058072013e-06, + "loss": 1.185, + "step": 326 + }, + { + "epoch": 0.09493395267818261, + "grad_norm": 4.266092300415039, + "learning_rate": 3.7979094076655053e-06, + "loss": 1.0634, + "step": 327 + }, + { + "epoch": 0.09522427057628102, + "grad_norm": 4.3022141456604, + "learning_rate": 3.80952380952381e-06, + "loss": 1.0949, + "step": 328 + }, + { + "epoch": 0.09551458847437945, + "grad_norm": 4.752735137939453, + "learning_rate": 3.821138211382115e-06, + "loss": 1.1035, + "step": 329 + }, + { + "epoch": 0.09580490637247786, + "grad_norm": 4.965222358703613, + "learning_rate": 3.832752613240418e-06, + 
"loss": 1.1323, + "step": 330 + }, + { + "epoch": 0.09609522427057628, + "grad_norm": 5.181162357330322, + "learning_rate": 3.844367015098723e-06, + "loss": 1.0707, + "step": 331 + }, + { + "epoch": 0.0963855421686747, + "grad_norm": 5.318249225616455, + "learning_rate": 3.855981416957027e-06, + "loss": 1.3433, + "step": 332 + }, + { + "epoch": 0.09667586006677312, + "grad_norm": 4.770782470703125, + "learning_rate": 3.867595818815331e-06, + "loss": 1.2307, + "step": 333 + }, + { + "epoch": 0.09696617796487153, + "grad_norm": 4.776768207550049, + "learning_rate": 3.8792102206736354e-06, + "loss": 1.2659, + "step": 334 + }, + { + "epoch": 0.09725649586296996, + "grad_norm": 4.787647724151611, + "learning_rate": 3.89082462253194e-06, + "loss": 1.149, + "step": 335 + }, + { + "epoch": 0.09754681376106837, + "grad_norm": 4.631190299987793, + "learning_rate": 3.902439024390244e-06, + "loss": 1.0426, + "step": 336 + }, + { + "epoch": 0.09783713165916678, + "grad_norm": 4.632266044616699, + "learning_rate": 3.914053426248549e-06, + "loss": 1.2512, + "step": 337 + }, + { + "epoch": 0.09812744955726521, + "grad_norm": 4.575108528137207, + "learning_rate": 3.9256678281068525e-06, + "loss": 1.1754, + "step": 338 + }, + { + "epoch": 0.09841776745536363, + "grad_norm": 4.5373454093933105, + "learning_rate": 3.937282229965157e-06, + "loss": 1.0264, + "step": 339 + }, + { + "epoch": 0.09870808535346204, + "grad_norm": 4.490976333618164, + "learning_rate": 3.9488966318234615e-06, + "loss": 1.1909, + "step": 340 + }, + { + "epoch": 0.09899840325156047, + "grad_norm": 4.690683841705322, + "learning_rate": 3.960511033681766e-06, + "loss": 1.2999, + "step": 341 + }, + { + "epoch": 0.09928872114965888, + "grad_norm": 5.3299479484558105, + "learning_rate": 3.97212543554007e-06, + "loss": 1.3982, + "step": 342 + }, + { + "epoch": 0.09957903904775729, + "grad_norm": 4.69218635559082, + "learning_rate": 3.983739837398374e-06, + "loss": 1.2105, + "step": 343 + }, + { + "epoch": 0.0998693569458557, + "grad_norm": 4.691149711608887, + "learning_rate": 3.9953542392566785e-06, + "loss": 1.1759, + "step": 344 + }, + { + "epoch": 0.10015967484395413, + "grad_norm": 4.793273448944092, + "learning_rate": 4.006968641114983e-06, + "loss": 1.3496, + "step": 345 + }, + { + "epoch": 0.10044999274205255, + "grad_norm": 4.364034652709961, + "learning_rate": 4.018583042973287e-06, + "loss": 1.1204, + "step": 346 + }, + { + "epoch": 0.10074031064015096, + "grad_norm": 4.571069240570068, + "learning_rate": 4.030197444831592e-06, + "loss": 1.0896, + "step": 347 + }, + { + "epoch": 0.10103062853824939, + "grad_norm": 4.80451774597168, + "learning_rate": 4.041811846689896e-06, + "loss": 1.1484, + "step": 348 + }, + { + "epoch": 0.1013209464363478, + "grad_norm": 5.162931442260742, + "learning_rate": 4.0534262485482e-06, + "loss": 1.2662, + "step": 349 + }, + { + "epoch": 0.10161126433444621, + "grad_norm": 4.779268264770508, + "learning_rate": 4.0650406504065046e-06, + "loss": 1.1183, + "step": 350 + }, + { + "epoch": 0.10190158223254464, + "grad_norm": 4.979952812194824, + "learning_rate": 4.076655052264808e-06, + "loss": 1.4163, + "step": 351 + }, + { + "epoch": 0.10219190013064305, + "grad_norm": 4.158762454986572, + "learning_rate": 4.0882694541231135e-06, + "loss": 1.0298, + "step": 352 + }, + { + "epoch": 0.10248221802874147, + "grad_norm": 4.852020740509033, + "learning_rate": 4.099883855981417e-06, + "loss": 1.2704, + "step": 353 + }, + { + "epoch": 0.1027725359268399, + "grad_norm": 5.023031234741211, + "learning_rate": 
4.111498257839722e-06, + "loss": 1.242, + "step": 354 + }, + { + "epoch": 0.10306285382493831, + "grad_norm": 5.079054355621338, + "learning_rate": 4.123112659698026e-06, + "loss": 1.1842, + "step": 355 + }, + { + "epoch": 0.10335317172303672, + "grad_norm": 4.983884811401367, + "learning_rate": 4.13472706155633e-06, + "loss": 1.2416, + "step": 356 + }, + { + "epoch": 0.10364348962113515, + "grad_norm": 4.6025543212890625, + "learning_rate": 4.146341463414634e-06, + "loss": 1.1068, + "step": 357 + }, + { + "epoch": 0.10393380751923356, + "grad_norm": 5.108760833740234, + "learning_rate": 4.157955865272939e-06, + "loss": 1.2235, + "step": 358 + }, + { + "epoch": 0.10422412541733198, + "grad_norm": 4.9223480224609375, + "learning_rate": 4.169570267131243e-06, + "loss": 1.2334, + "step": 359 + }, + { + "epoch": 0.1045144433154304, + "grad_norm": 4.93304443359375, + "learning_rate": 4.181184668989548e-06, + "loss": 1.2043, + "step": 360 + }, + { + "epoch": 0.10480476121352882, + "grad_norm": 4.894895553588867, + "learning_rate": 4.192799070847851e-06, + "loss": 1.2835, + "step": 361 + }, + { + "epoch": 0.10509507911162723, + "grad_norm": 4.598118782043457, + "learning_rate": 4.204413472706156e-06, + "loss": 1.1895, + "step": 362 + }, + { + "epoch": 0.10538539700972566, + "grad_norm": 4.7202839851379395, + "learning_rate": 4.21602787456446e-06, + "loss": 1.2704, + "step": 363 + }, + { + "epoch": 0.10567571490782407, + "grad_norm": 4.768918991088867, + "learning_rate": 4.227642276422765e-06, + "loss": 1.2544, + "step": 364 + }, + { + "epoch": 0.10596603280592248, + "grad_norm": 4.733092784881592, + "learning_rate": 4.239256678281069e-06, + "loss": 1.2009, + "step": 365 + }, + { + "epoch": 0.1062563507040209, + "grad_norm": 4.309986591339111, + "learning_rate": 4.250871080139373e-06, + "loss": 1.3049, + "step": 366 + }, + { + "epoch": 0.10654666860211932, + "grad_norm": 4.730205535888672, + "learning_rate": 4.262485481997677e-06, + "loss": 1.2298, + "step": 367 + }, + { + "epoch": 0.10683698650021774, + "grad_norm": 4.841794013977051, + "learning_rate": 4.274099883855982e-06, + "loss": 1.2284, + "step": 368 + }, + { + "epoch": 0.10712730439831615, + "grad_norm": 4.516952037811279, + "learning_rate": 4.2857142857142855e-06, + "loss": 1.1675, + "step": 369 + }, + { + "epoch": 0.10741762229641458, + "grad_norm": 4.625637054443359, + "learning_rate": 4.297328687572591e-06, + "loss": 1.055, + "step": 370 + }, + { + "epoch": 0.10770794019451299, + "grad_norm": 4.419715881347656, + "learning_rate": 4.308943089430894e-06, + "loss": 1.2107, + "step": 371 + }, + { + "epoch": 0.1079982580926114, + "grad_norm": 4.896949291229248, + "learning_rate": 4.320557491289199e-06, + "loss": 1.3021, + "step": 372 + }, + { + "epoch": 0.10828857599070983, + "grad_norm": 4.905848503112793, + "learning_rate": 4.332171893147503e-06, + "loss": 1.2083, + "step": 373 + }, + { + "epoch": 0.10857889388880824, + "grad_norm": 5.094426155090332, + "learning_rate": 4.343786295005807e-06, + "loss": 1.2681, + "step": 374 + }, + { + "epoch": 0.10886921178690666, + "grad_norm": 4.462698936462402, + "learning_rate": 4.3554006968641115e-06, + "loss": 1.2045, + "step": 375 + }, + { + "epoch": 0.10915952968500509, + "grad_norm": 4.681826591491699, + "learning_rate": 4.367015098722416e-06, + "loss": 1.1561, + "step": 376 + }, + { + "epoch": 0.1094498475831035, + "grad_norm": 4.762950420379639, + "learning_rate": 4.3786295005807205e-06, + "loss": 1.38, + "step": 377 + }, + { + "epoch": 0.10974016548120191, + "grad_norm": 4.647446155548096, + 
"learning_rate": 4.390243902439025e-06, + "loss": 1.1523, + "step": 378 + }, + { + "epoch": 0.11003048337930034, + "grad_norm": 4.403470039367676, + "learning_rate": 4.4018583042973286e-06, + "loss": 1.1952, + "step": 379 + }, + { + "epoch": 0.11032080127739875, + "grad_norm": 4.534971237182617, + "learning_rate": 4.413472706155633e-06, + "loss": 1.2161, + "step": 380 + }, + { + "epoch": 0.11061111917549717, + "grad_norm": 4.459516525268555, + "learning_rate": 4.4250871080139375e-06, + "loss": 1.165, + "step": 381 + }, + { + "epoch": 0.1109014370735956, + "grad_norm": 4.685759544372559, + "learning_rate": 4.436701509872242e-06, + "loss": 1.2302, + "step": 382 + }, + { + "epoch": 0.111191754971694, + "grad_norm": 4.3947062492370605, + "learning_rate": 4.4483159117305465e-06, + "loss": 1.0562, + "step": 383 + }, + { + "epoch": 0.11148207286979242, + "grad_norm": 4.368214130401611, + "learning_rate": 4.45993031358885e-06, + "loss": 1.1429, + "step": 384 + }, + { + "epoch": 0.11177239076789085, + "grad_norm": 4.556305408477783, + "learning_rate": 4.471544715447155e-06, + "loss": 1.2435, + "step": 385 + }, + { + "epoch": 0.11206270866598926, + "grad_norm": 4.672650337219238, + "learning_rate": 4.483159117305459e-06, + "loss": 1.2102, + "step": 386 + }, + { + "epoch": 0.11235302656408767, + "grad_norm": 4.5687127113342285, + "learning_rate": 4.4947735191637636e-06, + "loss": 1.0996, + "step": 387 + }, + { + "epoch": 0.11264334446218609, + "grad_norm": 4.420834064483643, + "learning_rate": 4.506387921022068e-06, + "loss": 1.0808, + "step": 388 + }, + { + "epoch": 0.11293366236028451, + "grad_norm": 4.193338394165039, + "learning_rate": 4.5180023228803725e-06, + "loss": 1.1776, + "step": 389 + }, + { + "epoch": 0.11322398025838293, + "grad_norm": 4.821009635925293, + "learning_rate": 4.529616724738676e-06, + "loss": 1.0963, + "step": 390 + }, + { + "epoch": 0.11351429815648134, + "grad_norm": 4.469620227813721, + "learning_rate": 4.541231126596981e-06, + "loss": 1.2174, + "step": 391 + }, + { + "epoch": 0.11380461605457977, + "grad_norm": 4.591622352600098, + "learning_rate": 4.552845528455285e-06, + "loss": 1.3208, + "step": 392 + }, + { + "epoch": 0.11409493395267818, + "grad_norm": 4.99096155166626, + "learning_rate": 4.56445993031359e-06, + "loss": 1.2179, + "step": 393 + }, + { + "epoch": 0.1143852518507766, + "grad_norm": 4.644974708557129, + "learning_rate": 4.576074332171894e-06, + "loss": 1.2476, + "step": 394 + }, + { + "epoch": 0.11467556974887502, + "grad_norm": 4.829218864440918, + "learning_rate": 4.587688734030198e-06, + "loss": 1.3546, + "step": 395 + }, + { + "epoch": 0.11496588764697344, + "grad_norm": 4.542574882507324, + "learning_rate": 4.599303135888502e-06, + "loss": 1.0894, + "step": 396 + }, + { + "epoch": 0.11525620554507185, + "grad_norm": 4.826246738433838, + "learning_rate": 4.610917537746807e-06, + "loss": 1.1201, + "step": 397 + }, + { + "epoch": 0.11554652344317028, + "grad_norm": 4.562570095062256, + "learning_rate": 4.62253193960511e-06, + "loss": 1.1192, + "step": 398 + }, + { + "epoch": 0.11583684134126869, + "grad_norm": 4.720918655395508, + "learning_rate": 4.634146341463416e-06, + "loss": 1.2242, + "step": 399 + }, + { + "epoch": 0.1161271592393671, + "grad_norm": 4.849851131439209, + "learning_rate": 4.645760743321719e-06, + "loss": 1.1184, + "step": 400 + }, + { + "epoch": 0.11641747713746553, + "grad_norm": 5.324199199676514, + "learning_rate": 4.657375145180024e-06, + "loss": 1.2376, + "step": 401 + }, + { + "epoch": 0.11670779503556394, + "grad_norm": 
4.4429192543029785, + "learning_rate": 4.668989547038328e-06, + "loss": 1.0613, + "step": 402 + }, + { + "epoch": 0.11699811293366236, + "grad_norm": 4.644254684448242, + "learning_rate": 4.680603948896632e-06, + "loss": 1.1684, + "step": 403 + }, + { + "epoch": 0.11728843083176078, + "grad_norm": 5.012441158294678, + "learning_rate": 4.692218350754936e-06, + "loss": 1.2038, + "step": 404 + }, + { + "epoch": 0.1175787487298592, + "grad_norm": 4.652109622955322, + "learning_rate": 4.703832752613241e-06, + "loss": 1.2779, + "step": 405 + }, + { + "epoch": 0.11786906662795761, + "grad_norm": 4.548923969268799, + "learning_rate": 4.715447154471545e-06, + "loss": 1.2691, + "step": 406 + }, + { + "epoch": 0.11815938452605604, + "grad_norm": 4.364345073699951, + "learning_rate": 4.72706155632985e-06, + "loss": 1.1318, + "step": 407 + }, + { + "epoch": 0.11844970242415445, + "grad_norm": 4.687953472137451, + "learning_rate": 4.738675958188153e-06, + "loss": 1.4441, + "step": 408 + }, + { + "epoch": 0.11874002032225286, + "grad_norm": 4.44487190246582, + "learning_rate": 4.750290360046458e-06, + "loss": 1.2781, + "step": 409 + }, + { + "epoch": 0.11903033822035128, + "grad_norm": 4.23728609085083, + "learning_rate": 4.761904761904762e-06, + "loss": 1.0713, + "step": 410 + }, + { + "epoch": 0.1193206561184497, + "grad_norm": 4.650542736053467, + "learning_rate": 4.773519163763067e-06, + "loss": 1.1529, + "step": 411 + }, + { + "epoch": 0.11961097401654812, + "grad_norm": 4.119630813598633, + "learning_rate": 4.785133565621371e-06, + "loss": 1.0351, + "step": 412 + }, + { + "epoch": 0.11990129191464653, + "grad_norm": 4.689528465270996, + "learning_rate": 4.796747967479675e-06, + "loss": 1.27, + "step": 413 + }, + { + "epoch": 0.12019160981274496, + "grad_norm": 4.582840919494629, + "learning_rate": 4.8083623693379794e-06, + "loss": 1.2461, + "step": 414 + }, + { + "epoch": 0.12048192771084337, + "grad_norm": 4.441833019256592, + "learning_rate": 4.819976771196284e-06, + "loss": 1.2983, + "step": 415 + }, + { + "epoch": 0.12077224560894179, + "grad_norm": 4.192812919616699, + "learning_rate": 4.831591173054588e-06, + "loss": 1.1723, + "step": 416 + }, + { + "epoch": 0.12106256350704021, + "grad_norm": 4.11320686340332, + "learning_rate": 4.843205574912893e-06, + "loss": 1.1548, + "step": 417 + }, + { + "epoch": 0.12135288140513863, + "grad_norm": 4.811589241027832, + "learning_rate": 4.8548199767711965e-06, + "loss": 1.2443, + "step": 418 + }, + { + "epoch": 0.12164319930323704, + "grad_norm": 4.167344570159912, + "learning_rate": 4.866434378629501e-06, + "loss": 1.0633, + "step": 419 + }, + { + "epoch": 0.12193351720133547, + "grad_norm": 4.8188090324401855, + "learning_rate": 4.8780487804878055e-06, + "loss": 1.2695, + "step": 420 + }, + { + "epoch": 0.12222383509943388, + "grad_norm": 4.46505880355835, + "learning_rate": 4.889663182346109e-06, + "loss": 1.1716, + "step": 421 + }, + { + "epoch": 0.1225141529975323, + "grad_norm": 4.715555667877197, + "learning_rate": 4.9012775842044144e-06, + "loss": 1.1526, + "step": 422 + }, + { + "epoch": 0.12280447089563072, + "grad_norm": 4.3485612869262695, + "learning_rate": 4.912891986062718e-06, + "loss": 1.0477, + "step": 423 + }, + { + "epoch": 0.12309478879372913, + "grad_norm": 4.8962483406066895, + "learning_rate": 4.9245063879210226e-06, + "loss": 1.2028, + "step": 424 + }, + { + "epoch": 0.12338510669182755, + "grad_norm": 4.331915378570557, + "learning_rate": 4.936120789779327e-06, + "loss": 0.9834, + "step": 425 + }, + { + "epoch": 
0.12367542458992598, + "grad_norm": 4.94401216506958, + "learning_rate": 4.947735191637631e-06, + "loss": 1.2552, + "step": 426 + }, + { + "epoch": 0.12396574248802439, + "grad_norm": 4.512451648712158, + "learning_rate": 4.959349593495935e-06, + "loss": 1.1289, + "step": 427 + }, + { + "epoch": 0.1242560603861228, + "grad_norm": 4.4072489738464355, + "learning_rate": 4.97096399535424e-06, + "loss": 1.1327, + "step": 428 + }, + { + "epoch": 0.12454637828422123, + "grad_norm": 4.699981212615967, + "learning_rate": 4.982578397212544e-06, + "loss": 1.2143, + "step": 429 + }, + { + "epoch": 0.12483669618231964, + "grad_norm": 4.3754496574401855, + "learning_rate": 4.994192799070849e-06, + "loss": 1.2076, + "step": 430 + }, + { + "epoch": 0.12512701408041807, + "grad_norm": 4.274416446685791, + "learning_rate": 5.005807200929152e-06, + "loss": 1.2851, + "step": 431 + }, + { + "epoch": 0.12541733197851648, + "grad_norm": 3.8760673999786377, + "learning_rate": 5.017421602787457e-06, + "loss": 1.0735, + "step": 432 + }, + { + "epoch": 0.1257076498766149, + "grad_norm": 4.6121015548706055, + "learning_rate": 5.029036004645761e-06, + "loss": 1.1603, + "step": 433 + }, + { + "epoch": 0.1259979677747133, + "grad_norm": 4.314383506774902, + "learning_rate": 5.040650406504065e-06, + "loss": 1.1134, + "step": 434 + }, + { + "epoch": 0.12628828567281172, + "grad_norm": 4.7067036628723145, + "learning_rate": 5.052264808362369e-06, + "loss": 1.2772, + "step": 435 + }, + { + "epoch": 0.12657860357091014, + "grad_norm": 4.6313982009887695, + "learning_rate": 5.063879210220674e-06, + "loss": 1.0923, + "step": 436 + }, + { + "epoch": 0.12686892146900858, + "grad_norm": 4.323302268981934, + "learning_rate": 5.075493612078979e-06, + "loss": 1.1981, + "step": 437 + }, + { + "epoch": 0.127159239367107, + "grad_norm": 4.471177101135254, + "learning_rate": 5.087108013937283e-06, + "loss": 1.456, + "step": 438 + }, + { + "epoch": 0.1274495572652054, + "grad_norm": 4.510197639465332, + "learning_rate": 5.098722415795587e-06, + "loss": 1.256, + "step": 439 + }, + { + "epoch": 0.12773987516330382, + "grad_norm": 4.906876087188721, + "learning_rate": 5.110336817653892e-06, + "loss": 1.1103, + "step": 440 + }, + { + "epoch": 0.12803019306140223, + "grad_norm": 4.39389181137085, + "learning_rate": 5.121951219512195e-06, + "loss": 1.1229, + "step": 441 + }, + { + "epoch": 0.12832051095950064, + "grad_norm": 4.98647403717041, + "learning_rate": 5.1335656213705e-06, + "loss": 1.1612, + "step": 442 + }, + { + "epoch": 0.12861082885759906, + "grad_norm": 4.218196392059326, + "learning_rate": 5.145180023228804e-06, + "loss": 1.2441, + "step": 443 + }, + { + "epoch": 0.1289011467556975, + "grad_norm": 4.1096086502075195, + "learning_rate": 5.156794425087108e-06, + "loss": 1.1238, + "step": 444 + }, + { + "epoch": 0.1291914646537959, + "grad_norm": 4.741826057434082, + "learning_rate": 5.168408826945412e-06, + "loss": 1.3542, + "step": 445 + }, + { + "epoch": 0.12948178255189433, + "grad_norm": 4.725194454193115, + "learning_rate": 5.180023228803717e-06, + "loss": 1.3447, + "step": 446 + }, + { + "epoch": 0.12977210044999274, + "grad_norm": 4.7122273445129395, + "learning_rate": 5.1916376306620205e-06, + "loss": 1.1016, + "step": 447 + }, + { + "epoch": 0.13006241834809115, + "grad_norm": 5.179031848907471, + "learning_rate": 5.203252032520326e-06, + "loss": 1.073, + "step": 448 + }, + { + "epoch": 0.13035273624618957, + "grad_norm": 4.772004127502441, + "learning_rate": 5.21486643437863e-06, + "loss": 1.2237, + "step": 449 + }, + 
{ + "epoch": 0.130643054144288, + "grad_norm": 4.839110374450684, + "learning_rate": 5.226480836236935e-06, + "loss": 1.2604, + "step": 450 + }, + { + "epoch": 0.13093337204238642, + "grad_norm": 4.533593654632568, + "learning_rate": 5.2380952380952384e-06, + "loss": 1.1173, + "step": 451 + }, + { + "epoch": 0.13122368994048483, + "grad_norm": 4.776732444763184, + "learning_rate": 5.249709639953543e-06, + "loss": 1.2574, + "step": 452 + }, + { + "epoch": 0.13151400783858325, + "grad_norm": 4.366232872009277, + "learning_rate": 5.261324041811847e-06, + "loss": 1.1167, + "step": 453 + }, + { + "epoch": 0.13180432573668166, + "grad_norm": 4.264481067657471, + "learning_rate": 5.272938443670151e-06, + "loss": 1.081, + "step": 454 + }, + { + "epoch": 0.13209464363478007, + "grad_norm": 4.251311302185059, + "learning_rate": 5.2845528455284555e-06, + "loss": 1.094, + "step": 455 + }, + { + "epoch": 0.13238496153287851, + "grad_norm": 4.391427516937256, + "learning_rate": 5.29616724738676e-06, + "loss": 1.2025, + "step": 456 + }, + { + "epoch": 0.13267527943097693, + "grad_norm": 4.8531270027160645, + "learning_rate": 5.307781649245064e-06, + "loss": 1.16, + "step": 457 + }, + { + "epoch": 0.13296559732907534, + "grad_norm": 5.001920223236084, + "learning_rate": 5.319396051103368e-06, + "loss": 1.3174, + "step": 458 + }, + { + "epoch": 0.13325591522717375, + "grad_norm": 5.8515849113464355, + "learning_rate": 5.331010452961673e-06, + "loss": 1.2568, + "step": 459 + }, + { + "epoch": 0.13354623312527217, + "grad_norm": 4.972232818603516, + "learning_rate": 5.342624854819978e-06, + "loss": 1.3323, + "step": 460 + }, + { + "epoch": 0.13383655102337058, + "grad_norm": 4.840256690979004, + "learning_rate": 5.3542392566782816e-06, + "loss": 1.0883, + "step": 461 + }, + { + "epoch": 0.13412686892146902, + "grad_norm": 4.309145450592041, + "learning_rate": 5.365853658536586e-06, + "loss": 1.1276, + "step": 462 + }, + { + "epoch": 0.13441718681956744, + "grad_norm": 4.385857582092285, + "learning_rate": 5.3774680603948905e-06, + "loss": 1.0481, + "step": 463 + }, + { + "epoch": 0.13470750471766585, + "grad_norm": 4.541776180267334, + "learning_rate": 5.389082462253194e-06, + "loss": 1.1545, + "step": 464 + }, + { + "epoch": 0.13499782261576426, + "grad_norm": 4.9798712730407715, + "learning_rate": 5.400696864111499e-06, + "loss": 1.2534, + "step": 465 + }, + { + "epoch": 0.13528814051386268, + "grad_norm": 4.9744977951049805, + "learning_rate": 5.412311265969803e-06, + "loss": 1.104, + "step": 466 + }, + { + "epoch": 0.1355784584119611, + "grad_norm": 4.3919878005981445, + "learning_rate": 5.423925667828107e-06, + "loss": 1.1293, + "step": 467 + }, + { + "epoch": 0.1358687763100595, + "grad_norm": 4.843119144439697, + "learning_rate": 5.435540069686411e-06, + "loss": 1.2784, + "step": 468 + }, + { + "epoch": 0.13615909420815794, + "grad_norm": 4.212307453155518, + "learning_rate": 5.447154471544716e-06, + "loss": 1.1175, + "step": 469 + }, + { + "epoch": 0.13644941210625636, + "grad_norm": 4.107914447784424, + "learning_rate": 5.458768873403019e-06, + "loss": 1.1508, + "step": 470 + }, + { + "epoch": 0.13673973000435477, + "grad_norm": 4.234799385070801, + "learning_rate": 5.470383275261324e-06, + "loss": 1.1592, + "step": 471 + }, + { + "epoch": 0.13703004790245318, + "grad_norm": 4.388983726501465, + "learning_rate": 5.481997677119629e-06, + "loss": 1.1882, + "step": 472 + }, + { + "epoch": 0.1373203658005516, + "grad_norm": 4.463111877441406, + "learning_rate": 5.493612078977934e-06, + "loss": 1.3275, + 
"step": 473 + }, + { + "epoch": 0.13761068369865, + "grad_norm": 4.7095255851745605, + "learning_rate": 5.505226480836237e-06, + "loss": 1.3394, + "step": 474 + }, + { + "epoch": 0.13790100159674845, + "grad_norm": 4.3856024742126465, + "learning_rate": 5.516840882694542e-06, + "loss": 1.2117, + "step": 475 + }, + { + "epoch": 0.13819131949484686, + "grad_norm": 4.319365978240967, + "learning_rate": 5.528455284552846e-06, + "loss": 1.1883, + "step": 476 + }, + { + "epoch": 0.13848163739294528, + "grad_norm": 5.07382869720459, + "learning_rate": 5.540069686411151e-06, + "loss": 1.3553, + "step": 477 + }, + { + "epoch": 0.1387719552910437, + "grad_norm": 4.294496059417725, + "learning_rate": 5.551684088269454e-06, + "loss": 1.1118, + "step": 478 + }, + { + "epoch": 0.1390622731891421, + "grad_norm": 4.60385274887085, + "learning_rate": 5.563298490127759e-06, + "loss": 1.1757, + "step": 479 + }, + { + "epoch": 0.13935259108724052, + "grad_norm": 4.500978946685791, + "learning_rate": 5.574912891986063e-06, + "loss": 1.2481, + "step": 480 + }, + { + "epoch": 0.13964290898533896, + "grad_norm": 4.490742206573486, + "learning_rate": 5.586527293844367e-06, + "loss": 1.1312, + "step": 481 + }, + { + "epoch": 0.13993322688343737, + "grad_norm": 4.06981086730957, + "learning_rate": 5.598141695702671e-06, + "loss": 1.1169, + "step": 482 + }, + { + "epoch": 0.14022354478153579, + "grad_norm": 4.395321369171143, + "learning_rate": 5.609756097560977e-06, + "loss": 1.3147, + "step": 483 + }, + { + "epoch": 0.1405138626796342, + "grad_norm": 4.509646415710449, + "learning_rate": 5.621370499419281e-06, + "loss": 1.2163, + "step": 484 + }, + { + "epoch": 0.1408041805777326, + "grad_norm": 4.4350175857543945, + "learning_rate": 5.632984901277585e-06, + "loss": 1.2908, + "step": 485 + }, + { + "epoch": 0.14109449847583103, + "grad_norm": 4.2386698722839355, + "learning_rate": 5.644599303135889e-06, + "loss": 1.1437, + "step": 486 + }, + { + "epoch": 0.14138481637392944, + "grad_norm": 4.659437656402588, + "learning_rate": 5.656213704994194e-06, + "loss": 1.1965, + "step": 487 + }, + { + "epoch": 0.14167513427202788, + "grad_norm": 4.744169235229492, + "learning_rate": 5.6678281068524974e-06, + "loss": 1.2036, + "step": 488 + }, + { + "epoch": 0.1419654521701263, + "grad_norm": 4.265536785125732, + "learning_rate": 5.679442508710802e-06, + "loss": 1.0867, + "step": 489 + }, + { + "epoch": 0.1422557700682247, + "grad_norm": 5.0157999992370605, + "learning_rate": 5.691056910569106e-06, + "loss": 1.3433, + "step": 490 + }, + { + "epoch": 0.14254608796632312, + "grad_norm": 4.504755020141602, + "learning_rate": 5.70267131242741e-06, + "loss": 1.1293, + "step": 491 + }, + { + "epoch": 0.14283640586442153, + "grad_norm": 4.358330726623535, + "learning_rate": 5.7142857142857145e-06, + "loss": 1.138, + "step": 492 + }, + { + "epoch": 0.14312672376251995, + "grad_norm": 4.373233318328857, + "learning_rate": 5.725900116144019e-06, + "loss": 1.0552, + "step": 493 + }, + { + "epoch": 0.1434170416606184, + "grad_norm": 5.096903324127197, + "learning_rate": 5.737514518002323e-06, + "loss": 1.3915, + "step": 494 + }, + { + "epoch": 0.1437073595587168, + "grad_norm": 4.237011432647705, + "learning_rate": 5.749128919860628e-06, + "loss": 1.2089, + "step": 495 + }, + { + "epoch": 0.14399767745681522, + "grad_norm": 4.598453998565674, + "learning_rate": 5.7607433217189324e-06, + "loss": 1.1485, + "step": 496 + }, + { + "epoch": 0.14428799535491363, + "grad_norm": 4.585260391235352, + "learning_rate": 5.772357723577237e-06, + 
"loss": 1.1004, + "step": 497 + }, + { + "epoch": 0.14457831325301204, + "grad_norm": 4.202107906341553, + "learning_rate": 5.7839721254355405e-06, + "loss": 1.2586, + "step": 498 + }, + { + "epoch": 0.14486863115111046, + "grad_norm": 4.69224739074707, + "learning_rate": 5.795586527293845e-06, + "loss": 1.2772, + "step": 499 + }, + { + "epoch": 0.1451589490492089, + "grad_norm": 4.4062323570251465, + "learning_rate": 5.8072009291521495e-06, + "loss": 1.1927, + "step": 500 + }, + { + "epoch": 0.1451589490492089, + "eval_loss": 1.225874662399292, + "eval_runtime": 11.4881, + "eval_samples_per_second": 34.819, + "eval_steps_per_second": 4.352, + "step": 500 + }, + { + "epoch": 0.1454492669473073, + "grad_norm": 4.7002739906311035, + "learning_rate": 5.818815331010453e-06, + "loss": 1.2239, + "step": 501 + }, + { + "epoch": 0.14573958484540572, + "grad_norm": 4.650073528289795, + "learning_rate": 5.830429732868758e-06, + "loss": 1.3074, + "step": 502 + }, + { + "epoch": 0.14602990274350414, + "grad_norm": 4.497559070587158, + "learning_rate": 5.842044134727062e-06, + "loss": 1.398, + "step": 503 + }, + { + "epoch": 0.14632022064160255, + "grad_norm": 4.4335174560546875, + "learning_rate": 5.853658536585366e-06, + "loss": 1.1606, + "step": 504 + }, + { + "epoch": 0.14661053853970096, + "grad_norm": 4.531015396118164, + "learning_rate": 5.86527293844367e-06, + "loss": 1.2087, + "step": 505 + }, + { + "epoch": 0.1469008564377994, + "grad_norm": 4.521320343017578, + "learning_rate": 5.876887340301975e-06, + "loss": 1.0572, + "step": 506 + }, + { + "epoch": 0.14719117433589782, + "grad_norm": 4.088536739349365, + "learning_rate": 5.88850174216028e-06, + "loss": 1.1005, + "step": 507 + }, + { + "epoch": 0.14748149223399623, + "grad_norm": 4.54278039932251, + "learning_rate": 5.900116144018584e-06, + "loss": 1.2687, + "step": 508 + }, + { + "epoch": 0.14777181013209464, + "grad_norm": 4.390741348266602, + "learning_rate": 5.911730545876888e-06, + "loss": 1.4151, + "step": 509 + }, + { + "epoch": 0.14806212803019306, + "grad_norm": 4.438811779022217, + "learning_rate": 5.923344947735193e-06, + "loss": 1.3253, + "step": 510 + }, + { + "epoch": 0.14835244592829147, + "grad_norm": 4.363897800445557, + "learning_rate": 5.934959349593496e-06, + "loss": 1.2319, + "step": 511 + }, + { + "epoch": 0.14864276382638988, + "grad_norm": 4.362700462341309, + "learning_rate": 5.946573751451801e-06, + "loss": 1.2404, + "step": 512 + }, + { + "epoch": 0.14893308172448833, + "grad_norm": 4.311462879180908, + "learning_rate": 5.958188153310105e-06, + "loss": 1.1152, + "step": 513 + }, + { + "epoch": 0.14922339962258674, + "grad_norm": 4.525477886199951, + "learning_rate": 5.969802555168409e-06, + "loss": 1.4097, + "step": 514 + }, + { + "epoch": 0.14951371752068515, + "grad_norm": 4.645956516265869, + "learning_rate": 5.981416957026713e-06, + "loss": 1.2637, + "step": 515 + }, + { + "epoch": 0.14980403541878357, + "grad_norm": 4.705561637878418, + "learning_rate": 5.993031358885018e-06, + "loss": 1.23, + "step": 516 + }, + { + "epoch": 0.15009435331688198, + "grad_norm": 4.898301601409912, + "learning_rate": 6.0046457607433214e-06, + "loss": 1.2903, + "step": 517 + }, + { + "epoch": 0.1503846712149804, + "grad_norm": 4.390701770782471, + "learning_rate": 6.016260162601627e-06, + "loss": 1.1944, + "step": 518 + }, + { + "epoch": 0.15067498911307883, + "grad_norm": 4.7379913330078125, + "learning_rate": 6.027874564459931e-06, + "loss": 1.3016, + "step": 519 + }, + { + "epoch": 0.15096530701117725, + "grad_norm": 
4.652884006500244, + "learning_rate": 6.039488966318236e-06, + "loss": 1.3385, + "step": 520 + }, + { + "epoch": 0.15125562490927566, + "grad_norm": 4.54412317276001, + "learning_rate": 6.051103368176539e-06, + "loss": 1.1898, + "step": 521 + }, + { + "epoch": 0.15154594280737407, + "grad_norm": 4.629741668701172, + "learning_rate": 6.062717770034844e-06, + "loss": 1.2784, + "step": 522 + }, + { + "epoch": 0.1518362607054725, + "grad_norm": 4.372036457061768, + "learning_rate": 6.074332171893148e-06, + "loss": 1.223, + "step": 523 + }, + { + "epoch": 0.1521265786035709, + "grad_norm": 4.209630966186523, + "learning_rate": 6.085946573751452e-06, + "loss": 1.2334, + "step": 524 + }, + { + "epoch": 0.15241689650166934, + "grad_norm": 4.473580360412598, + "learning_rate": 6.0975609756097564e-06, + "loss": 1.1312, + "step": 525 + }, + { + "epoch": 0.15270721439976775, + "grad_norm": 4.313533782958984, + "learning_rate": 6.109175377468061e-06, + "loss": 1.2681, + "step": 526 + }, + { + "epoch": 0.15299753229786617, + "grad_norm": 4.518441200256348, + "learning_rate": 6.1207897793263645e-06, + "loss": 1.2946, + "step": 527 + }, + { + "epoch": 0.15328785019596458, + "grad_norm": 4.112656593322754, + "learning_rate": 6.132404181184669e-06, + "loss": 1.2083, + "step": 528 + }, + { + "epoch": 0.153578168094063, + "grad_norm": 4.3622565269470215, + "learning_rate": 6.1440185830429735e-06, + "loss": 1.2937, + "step": 529 + }, + { + "epoch": 0.1538684859921614, + "grad_norm": 4.5020751953125, + "learning_rate": 6.155632984901279e-06, + "loss": 1.2604, + "step": 530 + }, + { + "epoch": 0.15415880389025982, + "grad_norm": 4.212316989898682, + "learning_rate": 6.1672473867595825e-06, + "loss": 1.1373, + "step": 531 + }, + { + "epoch": 0.15444912178835826, + "grad_norm": 4.951518535614014, + "learning_rate": 6.178861788617887e-06, + "loss": 1.3234, + "step": 532 + }, + { + "epoch": 0.15473943968645668, + "grad_norm": 4.149683475494385, + "learning_rate": 6.1904761904761914e-06, + "loss": 1.191, + "step": 533 + }, + { + "epoch": 0.1550297575845551, + "grad_norm": 4.293402194976807, + "learning_rate": 6.202090592334495e-06, + "loss": 1.3976, + "step": 534 + }, + { + "epoch": 0.1553200754826535, + "grad_norm": 4.633920669555664, + "learning_rate": 6.2137049941927995e-06, + "loss": 1.363, + "step": 535 + }, + { + "epoch": 0.15561039338075192, + "grad_norm": 4.190507888793945, + "learning_rate": 6.225319396051104e-06, + "loss": 1.0997, + "step": 536 + }, + { + "epoch": 0.15590071127885033, + "grad_norm": 4.259326934814453, + "learning_rate": 6.236933797909408e-06, + "loss": 1.2049, + "step": 537 + }, + { + "epoch": 0.15619102917694877, + "grad_norm": 4.1629204750061035, + "learning_rate": 6.248548199767712e-06, + "loss": 1.1561, + "step": 538 + }, + { + "epoch": 0.15648134707504718, + "grad_norm": 4.23039436340332, + "learning_rate": 6.260162601626017e-06, + "loss": 1.1901, + "step": 539 + }, + { + "epoch": 0.1567716649731456, + "grad_norm": 4.121535778045654, + "learning_rate": 6.27177700348432e-06, + "loss": 1.1737, + "step": 540 + }, + { + "epoch": 0.157061982871244, + "grad_norm": 4.287825584411621, + "learning_rate": 6.283391405342625e-06, + "loss": 1.3326, + "step": 541 + }, + { + "epoch": 0.15735230076934242, + "grad_norm": 9.216053009033203, + "learning_rate": 6.29500580720093e-06, + "loss": 1.3105, + "step": 542 + }, + { + "epoch": 0.15764261866744084, + "grad_norm": 4.486374855041504, + "learning_rate": 6.3066202090592345e-06, + "loss": 1.1196, + "step": 543 + }, + { + "epoch": 0.15793293656553928, + 
"grad_norm": 4.181046485900879, + "learning_rate": 6.318234610917538e-06, + "loss": 1.111, + "step": 544 + }, + { + "epoch": 0.1582232544636377, + "grad_norm": 4.662967205047607, + "learning_rate": 6.329849012775843e-06, + "loss": 1.1715, + "step": 545 + }, + { + "epoch": 0.1585135723617361, + "grad_norm": 4.380138397216797, + "learning_rate": 6.341463414634147e-06, + "loss": 1.2221, + "step": 546 + }, + { + "epoch": 0.15880389025983452, + "grad_norm": 4.870767593383789, + "learning_rate": 6.353077816492451e-06, + "loss": 1.1926, + "step": 547 + }, + { + "epoch": 0.15909420815793293, + "grad_norm": 4.571467876434326, + "learning_rate": 6.364692218350755e-06, + "loss": 1.0445, + "step": 548 + }, + { + "epoch": 0.15938452605603134, + "grad_norm": 4.919942378997803, + "learning_rate": 6.37630662020906e-06, + "loss": 1.2702, + "step": 549 + }, + { + "epoch": 0.15967484395412979, + "grad_norm": 4.4647979736328125, + "learning_rate": 6.387921022067363e-06, + "loss": 1.2969, + "step": 550 + }, + { + "epoch": 0.1599651618522282, + "grad_norm": 4.359588146209717, + "learning_rate": 6.399535423925668e-06, + "loss": 1.3191, + "step": 551 + }, + { + "epoch": 0.1602554797503266, + "grad_norm": 4.425624370574951, + "learning_rate": 6.411149825783972e-06, + "loss": 1.2345, + "step": 552 + }, + { + "epoch": 0.16054579764842503, + "grad_norm": 4.439249038696289, + "learning_rate": 6.422764227642278e-06, + "loss": 1.1849, + "step": 553 + }, + { + "epoch": 0.16083611554652344, + "grad_norm": 4.451704025268555, + "learning_rate": 6.434378629500581e-06, + "loss": 1.2828, + "step": 554 + }, + { + "epoch": 0.16112643344462185, + "grad_norm": 4.43411922454834, + "learning_rate": 6.445993031358886e-06, + "loss": 1.4051, + "step": 555 + }, + { + "epoch": 0.16141675134272027, + "grad_norm": 4.6609392166137695, + "learning_rate": 6.45760743321719e-06, + "loss": 1.1596, + "step": 556 + }, + { + "epoch": 0.1617070692408187, + "grad_norm": 4.231972694396973, + "learning_rate": 6.469221835075494e-06, + "loss": 1.2903, + "step": 557 + }, + { + "epoch": 0.16199738713891712, + "grad_norm": 4.471492290496826, + "learning_rate": 6.480836236933798e-06, + "loss": 1.2261, + "step": 558 + }, + { + "epoch": 0.16228770503701553, + "grad_norm": 4.300949573516846, + "learning_rate": 6.492450638792103e-06, + "loss": 1.232, + "step": 559 + }, + { + "epoch": 0.16257802293511395, + "grad_norm": 4.097339153289795, + "learning_rate": 6.504065040650407e-06, + "loss": 1.1599, + "step": 560 + }, + { + "epoch": 0.16286834083321236, + "grad_norm": 3.920823097229004, + "learning_rate": 6.515679442508711e-06, + "loss": 1.1565, + "step": 561 + }, + { + "epoch": 0.16315865873131077, + "grad_norm": 4.790262222290039, + "learning_rate": 6.5272938443670154e-06, + "loss": 1.3051, + "step": 562 + }, + { + "epoch": 0.16344897662940922, + "grad_norm": 4.490232467651367, + "learning_rate": 6.53890824622532e-06, + "loss": 1.2613, + "step": 563 + }, + { + "epoch": 0.16373929452750763, + "grad_norm": 4.4714813232421875, + "learning_rate": 6.5505226480836235e-06, + "loss": 1.2043, + "step": 564 + }, + { + "epoch": 0.16402961242560604, + "grad_norm": 4.994192600250244, + "learning_rate": 6.562137049941929e-06, + "loss": 1.2062, + "step": 565 + }, + { + "epoch": 0.16431993032370446, + "grad_norm": 4.22312068939209, + "learning_rate": 6.573751451800233e-06, + "loss": 1.2887, + "step": 566 + }, + { + "epoch": 0.16461024822180287, + "grad_norm": 4.273190975189209, + "learning_rate": 6.585365853658538e-06, + "loss": 1.2889, + "step": 567 + }, + { + "epoch": 
0.16490056611990128, + "grad_norm": 4.727954387664795, + "learning_rate": 6.5969802555168415e-06, + "loss": 1.3973, + "step": 568 + }, + { + "epoch": 0.16519088401799972, + "grad_norm": 4.461411476135254, + "learning_rate": 6.608594657375146e-06, + "loss": 1.2739, + "step": 569 + }, + { + "epoch": 0.16548120191609814, + "grad_norm": 4.23778772354126, + "learning_rate": 6.62020905923345e-06, + "loss": 1.1162, + "step": 570 + }, + { + "epoch": 0.16577151981419655, + "grad_norm": 4.434848785400391, + "learning_rate": 6.631823461091754e-06, + "loss": 1.2089, + "step": 571 + }, + { + "epoch": 0.16606183771229496, + "grad_norm": 4.056807518005371, + "learning_rate": 6.6434378629500585e-06, + "loss": 1.2375, + "step": 572 + }, + { + "epoch": 0.16635215561039338, + "grad_norm": 4.4226975440979, + "learning_rate": 6.655052264808363e-06, + "loss": 1.1912, + "step": 573 + }, + { + "epoch": 0.1666424735084918, + "grad_norm": 4.397589206695557, + "learning_rate": 6.666666666666667e-06, + "loss": 1.2756, + "step": 574 + }, + { + "epoch": 0.1669327914065902, + "grad_norm": 4.375736236572266, + "learning_rate": 6.678281068524971e-06, + "loss": 1.1205, + "step": 575 + }, + { + "epoch": 0.16722310930468864, + "grad_norm": 4.373353481292725, + "learning_rate": 6.6898954703832765e-06, + "loss": 1.2309, + "step": 576 + }, + { + "epoch": 0.16751342720278706, + "grad_norm": 4.392578125, + "learning_rate": 6.701509872241581e-06, + "loss": 1.3111, + "step": 577 + }, + { + "epoch": 0.16780374510088547, + "grad_norm": 4.608421325683594, + "learning_rate": 6.7131242740998846e-06, + "loss": 1.1501, + "step": 578 + }, + { + "epoch": 0.16809406299898388, + "grad_norm": 4.548303127288818, + "learning_rate": 6.724738675958189e-06, + "loss": 1.2754, + "step": 579 + }, + { + "epoch": 0.1683843808970823, + "grad_norm": 4.56739616394043, + "learning_rate": 6.7363530778164935e-06, + "loss": 1.2028, + "step": 580 + }, + { + "epoch": 0.1686746987951807, + "grad_norm": 4.294614315032959, + "learning_rate": 6.747967479674797e-06, + "loss": 1.1459, + "step": 581 + }, + { + "epoch": 0.16896501669327915, + "grad_norm": 4.636039733886719, + "learning_rate": 6.759581881533102e-06, + "loss": 1.3814, + "step": 582 + }, + { + "epoch": 0.16925533459137757, + "grad_norm": 4.619139671325684, + "learning_rate": 6.771196283391406e-06, + "loss": 1.242, + "step": 583 + }, + { + "epoch": 0.16954565248947598, + "grad_norm": 4.989368915557861, + "learning_rate": 6.78281068524971e-06, + "loss": 1.4686, + "step": 584 + }, + { + "epoch": 0.1698359703875744, + "grad_norm": 4.284407138824463, + "learning_rate": 6.794425087108014e-06, + "loss": 1.1228, + "step": 585 + }, + { + "epoch": 0.1701262882856728, + "grad_norm": 4.518624782562256, + "learning_rate": 6.806039488966319e-06, + "loss": 1.0664, + "step": 586 + }, + { + "epoch": 0.17041660618377122, + "grad_norm": 4.132668495178223, + "learning_rate": 6.817653890824622e-06, + "loss": 1.1725, + "step": 587 + }, + { + "epoch": 0.17070692408186966, + "grad_norm": 4.393999099731445, + "learning_rate": 6.829268292682928e-06, + "loss": 1.2639, + "step": 588 + }, + { + "epoch": 0.17099724197996807, + "grad_norm": 4.1911139488220215, + "learning_rate": 6.840882694541232e-06, + "loss": 1.3127, + "step": 589 + }, + { + "epoch": 0.1712875598780665, + "grad_norm": 4.69661283493042, + "learning_rate": 6.852497096399537e-06, + "loss": 1.2984, + "step": 590 + }, + { + "epoch": 0.1715778777761649, + "grad_norm": 4.060606956481934, + "learning_rate": 6.86411149825784e-06, + "loss": 1.2638, + "step": 591 + }, + { + 
"epoch": 0.1718681956742633, + "grad_norm": 4.7827677726745605, + "learning_rate": 6.875725900116145e-06, + "loss": 1.3978, + "step": 592 + }, + { + "epoch": 0.17215851357236173, + "grad_norm": 4.189406394958496, + "learning_rate": 6.887340301974449e-06, + "loss": 1.2079, + "step": 593 + }, + { + "epoch": 0.17244883147046017, + "grad_norm": 4.125210762023926, + "learning_rate": 6.898954703832753e-06, + "loss": 1.2004, + "step": 594 + }, + { + "epoch": 0.17273914936855858, + "grad_norm": 4.049924373626709, + "learning_rate": 6.910569105691057e-06, + "loss": 1.254, + "step": 595 + }, + { + "epoch": 0.173029467266657, + "grad_norm": 4.361916542053223, + "learning_rate": 6.922183507549362e-06, + "loss": 1.3253, + "step": 596 + }, + { + "epoch": 0.1733197851647554, + "grad_norm": 3.9269027709960938, + "learning_rate": 6.9337979094076655e-06, + "loss": 1.114, + "step": 597 + }, + { + "epoch": 0.17361010306285382, + "grad_norm": 4.094462871551514, + "learning_rate": 6.94541231126597e-06, + "loss": 1.3056, + "step": 598 + }, + { + "epoch": 0.17390042096095223, + "grad_norm": 4.001208305358887, + "learning_rate": 6.957026713124274e-06, + "loss": 1.2286, + "step": 599 + }, + { + "epoch": 0.17419073885905065, + "grad_norm": 4.29280948638916, + "learning_rate": 6.96864111498258e-06, + "loss": 1.2494, + "step": 600 + }, + { + "epoch": 0.1744810567571491, + "grad_norm": 4.355632305145264, + "learning_rate": 6.980255516840883e-06, + "loss": 1.2811, + "step": 601 + }, + { + "epoch": 0.1747713746552475, + "grad_norm": 4.2747273445129395, + "learning_rate": 6.991869918699188e-06, + "loss": 1.2177, + "step": 602 + }, + { + "epoch": 0.17506169255334592, + "grad_norm": 4.914125442504883, + "learning_rate": 7.003484320557492e-06, + "loss": 1.2001, + "step": 603 + }, + { + "epoch": 0.17535201045144433, + "grad_norm": 4.380726337432861, + "learning_rate": 7.015098722415796e-06, + "loss": 1.2322, + "step": 604 + }, + { + "epoch": 0.17564232834954274, + "grad_norm": 4.1070733070373535, + "learning_rate": 7.0267131242741005e-06, + "loss": 1.0689, + "step": 605 + }, + { + "epoch": 0.17593264624764116, + "grad_norm": 4.090858459472656, + "learning_rate": 7.038327526132405e-06, + "loss": 1.0399, + "step": 606 + }, + { + "epoch": 0.1762229641457396, + "grad_norm": 4.439457893371582, + "learning_rate": 7.0499419279907086e-06, + "loss": 1.0798, + "step": 607 + }, + { + "epoch": 0.176513282043838, + "grad_norm": 4.626300811767578, + "learning_rate": 7.061556329849013e-06, + "loss": 1.1793, + "step": 608 + }, + { + "epoch": 0.17680359994193642, + "grad_norm": 4.283360481262207, + "learning_rate": 7.0731707317073175e-06, + "loss": 1.1937, + "step": 609 + }, + { + "epoch": 0.17709391784003484, + "grad_norm": 4.1504669189453125, + "learning_rate": 7.084785133565621e-06, + "loss": 1.0317, + "step": 610 + }, + { + "epoch": 0.17738423573813325, + "grad_norm": 4.170088768005371, + "learning_rate": 7.0963995354239265e-06, + "loss": 1.1571, + "step": 611 + }, + { + "epoch": 0.17767455363623166, + "grad_norm": 4.515710353851318, + "learning_rate": 7.108013937282231e-06, + "loss": 1.1888, + "step": 612 + }, + { + "epoch": 0.1779648715343301, + "grad_norm": 3.985978841781616, + "learning_rate": 7.1196283391405354e-06, + "loss": 1.1603, + "step": 613 + }, + { + "epoch": 0.17825518943242852, + "grad_norm": 4.436974048614502, + "learning_rate": 7.131242740998839e-06, + "loss": 1.2722, + "step": 614 + }, + { + "epoch": 0.17854550733052693, + "grad_norm": 4.694450855255127, + "learning_rate": 7.1428571428571436e-06, + "loss": 1.2873, + 
"step": 615 + }, + { + "epoch": 0.17883582522862534, + "grad_norm": 4.002849578857422, + "learning_rate": 7.154471544715448e-06, + "loss": 1.2664, + "step": 616 + }, + { + "epoch": 0.17912614312672376, + "grad_norm": 4.15142822265625, + "learning_rate": 7.166085946573752e-06, + "loss": 1.1794, + "step": 617 + }, + { + "epoch": 0.17941646102482217, + "grad_norm": 4.208678245544434, + "learning_rate": 7.177700348432056e-06, + "loss": 1.3951, + "step": 618 + }, + { + "epoch": 0.17970677892292058, + "grad_norm": 4.212402820587158, + "learning_rate": 7.189314750290361e-06, + "loss": 1.2183, + "step": 619 + }, + { + "epoch": 0.17999709682101903, + "grad_norm": 4.358024597167969, + "learning_rate": 7.200929152148664e-06, + "loss": 1.2951, + "step": 620 + }, + { + "epoch": 0.18028741471911744, + "grad_norm": 4.831110000610352, + "learning_rate": 7.212543554006969e-06, + "loss": 1.2045, + "step": 621 + }, + { + "epoch": 0.18057773261721585, + "grad_norm": 4.0317206382751465, + "learning_rate": 7.224157955865273e-06, + "loss": 1.1498, + "step": 622 + }, + { + "epoch": 0.18086805051531427, + "grad_norm": 4.493712425231934, + "learning_rate": 7.2357723577235786e-06, + "loss": 1.2473, + "step": 623 + }, + { + "epoch": 0.18115836841341268, + "grad_norm": 4.345702648162842, + "learning_rate": 7.247386759581882e-06, + "loss": 1.1674, + "step": 624 + }, + { + "epoch": 0.1814486863115111, + "grad_norm": 4.302826404571533, + "learning_rate": 7.259001161440187e-06, + "loss": 1.2631, + "step": 625 + }, + { + "epoch": 0.18173900420960953, + "grad_norm": 4.829352855682373, + "learning_rate": 7.270615563298491e-06, + "loss": 1.1601, + "step": 626 + }, + { + "epoch": 0.18202932210770795, + "grad_norm": 4.134838104248047, + "learning_rate": 7.282229965156795e-06, + "loss": 1.1322, + "step": 627 + }, + { + "epoch": 0.18231964000580636, + "grad_norm": 4.196687698364258, + "learning_rate": 7.293844367015099e-06, + "loss": 1.2701, + "step": 628 + }, + { + "epoch": 0.18260995790390477, + "grad_norm": 4.318655490875244, + "learning_rate": 7.305458768873404e-06, + "loss": 1.3027, + "step": 629 + }, + { + "epoch": 0.1829002758020032, + "grad_norm": 4.255601406097412, + "learning_rate": 7.317073170731707e-06, + "loss": 1.1377, + "step": 630 + }, + { + "epoch": 0.1831905937001016, + "grad_norm": 4.319618225097656, + "learning_rate": 7.328687572590012e-06, + "loss": 1.1671, + "step": 631 + }, + { + "epoch": 0.18348091159820004, + "grad_norm": 4.360809803009033, + "learning_rate": 7.340301974448316e-06, + "loss": 1.2979, + "step": 632 + }, + { + "epoch": 0.18377122949629845, + "grad_norm": 3.8124513626098633, + "learning_rate": 7.35191637630662e-06, + "loss": 1.1039, + "step": 633 + }, + { + "epoch": 0.18406154739439687, + "grad_norm": 4.552162170410156, + "learning_rate": 7.3635307781649245e-06, + "loss": 1.1019, + "step": 634 + }, + { + "epoch": 0.18435186529249528, + "grad_norm": 3.8770148754119873, + "learning_rate": 7.37514518002323e-06, + "loss": 1.0831, + "step": 635 + }, + { + "epoch": 0.1846421831905937, + "grad_norm": 4.136161804199219, + "learning_rate": 7.386759581881534e-06, + "loss": 1.1656, + "step": 636 + }, + { + "epoch": 0.1849325010886921, + "grad_norm": 4.266040802001953, + "learning_rate": 7.398373983739838e-06, + "loss": 1.1633, + "step": 637 + }, + { + "epoch": 0.18522281898679055, + "grad_norm": 4.174380779266357, + "learning_rate": 7.409988385598142e-06, + "loss": 1.2005, + "step": 638 + }, + { + "epoch": 0.18551313688488896, + "grad_norm": 4.037458419799805, + "learning_rate": 7.421602787456447e-06, + 
"loss": 1.1017, + "step": 639 + }, + { + "epoch": 0.18580345478298738, + "grad_norm": 4.106693744659424, + "learning_rate": 7.4332171893147505e-06, + "loss": 1.1764, + "step": 640 + }, + { + "epoch": 0.1860937726810858, + "grad_norm": 4.502237319946289, + "learning_rate": 7.444831591173055e-06, + "loss": 1.3775, + "step": 641 + }, + { + "epoch": 0.1863840905791842, + "grad_norm": 4.384480953216553, + "learning_rate": 7.4564459930313594e-06, + "loss": 1.2214, + "step": 642 + }, + { + "epoch": 0.18667440847728262, + "grad_norm": 4.051870346069336, + "learning_rate": 7.468060394889663e-06, + "loss": 1.2507, + "step": 643 + }, + { + "epoch": 0.18696472637538103, + "grad_norm": 3.967947244644165, + "learning_rate": 7.4796747967479676e-06, + "loss": 1.1179, + "step": 644 + }, + { + "epoch": 0.18725504427347947, + "grad_norm": 4.541753768920898, + "learning_rate": 7.491289198606272e-06, + "loss": 1.3501, + "step": 645 + }, + { + "epoch": 0.18754536217157788, + "grad_norm": 4.431195259094238, + "learning_rate": 7.502903600464577e-06, + "loss": 1.2707, + "step": 646 + }, + { + "epoch": 0.1878356800696763, + "grad_norm": 4.027304172515869, + "learning_rate": 7.514518002322881e-06, + "loss": 0.9999, + "step": 647 + }, + { + "epoch": 0.1881259979677747, + "grad_norm": 4.287905693054199, + "learning_rate": 7.5261324041811855e-06, + "loss": 1.2036, + "step": 648 + }, + { + "epoch": 0.18841631586587312, + "grad_norm": 4.41646671295166, + "learning_rate": 7.53774680603949e-06, + "loss": 1.3499, + "step": 649 + }, + { + "epoch": 0.18870663376397154, + "grad_norm": 3.83207106590271, + "learning_rate": 7.5493612078977944e-06, + "loss": 1.0668, + "step": 650 + }, + { + "epoch": 0.18899695166206998, + "grad_norm": 4.674952507019043, + "learning_rate": 7.560975609756098e-06, + "loss": 1.2712, + "step": 651 + }, + { + "epoch": 0.1892872695601684, + "grad_norm": 4.142502784729004, + "learning_rate": 7.5725900116144026e-06, + "loss": 1.2139, + "step": 652 + }, + { + "epoch": 0.1895775874582668, + "grad_norm": 4.170092582702637, + "learning_rate": 7.584204413472707e-06, + "loss": 1.1265, + "step": 653 + }, + { + "epoch": 0.18986790535636522, + "grad_norm": 4.253111362457275, + "learning_rate": 7.595818815331011e-06, + "loss": 1.3418, + "step": 654 + }, + { + "epoch": 0.19015822325446363, + "grad_norm": 4.222099781036377, + "learning_rate": 7.607433217189315e-06, + "loss": 1.1752, + "step": 655 + }, + { + "epoch": 0.19044854115256205, + "grad_norm": 3.9238572120666504, + "learning_rate": 7.61904761904762e-06, + "loss": 1.0777, + "step": 656 + }, + { + "epoch": 0.1907388590506605, + "grad_norm": 4.306210994720459, + "learning_rate": 7.630662020905924e-06, + "loss": 1.3503, + "step": 657 + }, + { + "epoch": 0.1910291769487589, + "grad_norm": 4.187571048736572, + "learning_rate": 7.64227642276423e-06, + "loss": 1.1342, + "step": 658 + }, + { + "epoch": 0.1913194948468573, + "grad_norm": 4.448465824127197, + "learning_rate": 7.653890824622533e-06, + "loss": 1.2355, + "step": 659 + }, + { + "epoch": 0.19160981274495573, + "grad_norm": 4.302551746368408, + "learning_rate": 7.665505226480837e-06, + "loss": 1.1779, + "step": 660 + }, + { + "epoch": 0.19190013064305414, + "grad_norm": 4.034951686859131, + "learning_rate": 7.677119628339142e-06, + "loss": 1.1235, + "step": 661 + }, + { + "epoch": 0.19219044854115255, + "grad_norm": 4.021313190460205, + "learning_rate": 7.688734030197446e-06, + "loss": 1.1306, + "step": 662 + }, + { + "epoch": 0.19248076643925097, + "grad_norm": 4.604819297790527, + "learning_rate": 
7.70034843205575e-06, + "loss": 1.265, + "step": 663 + }, + { + "epoch": 0.1927710843373494, + "grad_norm": 4.1214189529418945, + "learning_rate": 7.711962833914055e-06, + "loss": 1.2959, + "step": 664 + }, + { + "epoch": 0.19306140223544782, + "grad_norm": 4.4705047607421875, + "learning_rate": 7.723577235772358e-06, + "loss": 1.3114, + "step": 665 + }, + { + "epoch": 0.19335172013354623, + "grad_norm": 4.120425701141357, + "learning_rate": 7.735191637630662e-06, + "loss": 1.113, + "step": 666 + }, + { + "epoch": 0.19364203803164465, + "grad_norm": 3.661496877670288, + "learning_rate": 7.746806039488967e-06, + "loss": 1.1437, + "step": 667 + }, + { + "epoch": 0.19393235592974306, + "grad_norm": 4.550029277801514, + "learning_rate": 7.758420441347271e-06, + "loss": 1.3003, + "step": 668 + }, + { + "epoch": 0.19422267382784147, + "grad_norm": 4.394417762756348, + "learning_rate": 7.770034843205574e-06, + "loss": 1.1571, + "step": 669 + }, + { + "epoch": 0.19451299172593992, + "grad_norm": 4.869671821594238, + "learning_rate": 7.78164924506388e-06, + "loss": 1.1525, + "step": 670 + }, + { + "epoch": 0.19480330962403833, + "grad_norm": 4.481640815734863, + "learning_rate": 7.793263646922185e-06, + "loss": 1.273, + "step": 671 + }, + { + "epoch": 0.19509362752213674, + "grad_norm": 4.039763450622559, + "learning_rate": 7.804878048780489e-06, + "loss": 1.1533, + "step": 672 + }, + { + "epoch": 0.19538394542023516, + "grad_norm": 4.439721584320068, + "learning_rate": 7.816492450638792e-06, + "loss": 1.2893, + "step": 673 + }, + { + "epoch": 0.19567426331833357, + "grad_norm": 3.8747873306274414, + "learning_rate": 7.828106852497098e-06, + "loss": 1.0562, + "step": 674 + }, + { + "epoch": 0.19596458121643198, + "grad_norm": 4.250256538391113, + "learning_rate": 7.839721254355401e-06, + "loss": 1.1873, + "step": 675 + }, + { + "epoch": 0.19625489911453042, + "grad_norm": 4.367439270019531, + "learning_rate": 7.851335656213705e-06, + "loss": 1.2494, + "step": 676 + }, + { + "epoch": 0.19654521701262884, + "grad_norm": 3.8989996910095215, + "learning_rate": 7.86295005807201e-06, + "loss": 1.2933, + "step": 677 + }, + { + "epoch": 0.19683553491072725, + "grad_norm": 4.156364917755127, + "learning_rate": 7.874564459930314e-06, + "loss": 1.2346, + "step": 678 + }, + { + "epoch": 0.19712585280882566, + "grad_norm": 3.9347167015075684, + "learning_rate": 7.886178861788618e-06, + "loss": 1.0208, + "step": 679 + }, + { + "epoch": 0.19741617070692408, + "grad_norm": 4.1161627769470215, + "learning_rate": 7.897793263646923e-06, + "loss": 1.2088, + "step": 680 + }, + { + "epoch": 0.1977064886050225, + "grad_norm": 4.2744855880737305, + "learning_rate": 7.909407665505228e-06, + "loss": 1.2502, + "step": 681 + }, + { + "epoch": 0.19799680650312093, + "grad_norm": 4.033324718475342, + "learning_rate": 7.921022067363532e-06, + "loss": 1.2464, + "step": 682 + }, + { + "epoch": 0.19828712440121934, + "grad_norm": 4.08077335357666, + "learning_rate": 7.932636469221836e-06, + "loss": 1.2234, + "step": 683 + }, + { + "epoch": 0.19857744229931776, + "grad_norm": 4.596649646759033, + "learning_rate": 7.94425087108014e-06, + "loss": 1.3688, + "step": 684 + }, + { + "epoch": 0.19886776019741617, + "grad_norm": 4.569955348968506, + "learning_rate": 7.955865272938444e-06, + "loss": 1.2121, + "step": 685 + }, + { + "epoch": 0.19915807809551458, + "grad_norm": 4.908385753631592, + "learning_rate": 7.967479674796748e-06, + "loss": 1.3586, + "step": 686 + }, + { + "epoch": 0.199448395993613, + "grad_norm": 4.093334674835205, + 
"learning_rate": 7.979094076655053e-06, + "loss": 1.2516, + "step": 687 + }, + { + "epoch": 0.1997387138917114, + "grad_norm": 4.448044776916504, + "learning_rate": 7.990708478513357e-06, + "loss": 1.1447, + "step": 688 + }, + { + "epoch": 0.20002903178980985, + "grad_norm": 4.412672519683838, + "learning_rate": 8.00232288037166e-06, + "loss": 1.2134, + "step": 689 + }, + { + "epoch": 0.20031934968790827, + "grad_norm": 3.8759539127349854, + "learning_rate": 8.013937282229966e-06, + "loss": 1.1278, + "step": 690 + }, + { + "epoch": 0.20060966758600668, + "grad_norm": 3.993645668029785, + "learning_rate": 8.02555168408827e-06, + "loss": 1.1997, + "step": 691 + }, + { + "epoch": 0.2008999854841051, + "grad_norm": 4.497583389282227, + "learning_rate": 8.037166085946573e-06, + "loss": 1.2892, + "step": 692 + }, + { + "epoch": 0.2011903033822035, + "grad_norm": 4.036830425262451, + "learning_rate": 8.048780487804879e-06, + "loss": 1.2577, + "step": 693 + }, + { + "epoch": 0.20148062128030192, + "grad_norm": 4.649497985839844, + "learning_rate": 8.060394889663184e-06, + "loss": 1.3546, + "step": 694 + }, + { + "epoch": 0.20177093917840036, + "grad_norm": 4.232790946960449, + "learning_rate": 8.072009291521488e-06, + "loss": 1.0828, + "step": 695 + }, + { + "epoch": 0.20206125707649877, + "grad_norm": 4.427145481109619, + "learning_rate": 8.083623693379791e-06, + "loss": 1.2565, + "step": 696 + }, + { + "epoch": 0.2023515749745972, + "grad_norm": 4.624083042144775, + "learning_rate": 8.095238095238097e-06, + "loss": 1.3997, + "step": 697 + }, + { + "epoch": 0.2026418928726956, + "grad_norm": 4.487246036529541, + "learning_rate": 8.1068524970964e-06, + "loss": 1.2862, + "step": 698 + }, + { + "epoch": 0.202932210770794, + "grad_norm": 4.557520866394043, + "learning_rate": 8.118466898954704e-06, + "loss": 1.1943, + "step": 699 + }, + { + "epoch": 0.20322252866889243, + "grad_norm": 4.63982629776001, + "learning_rate": 8.130081300813009e-06, + "loss": 1.1608, + "step": 700 + }, + { + "epoch": 0.20351284656699087, + "grad_norm": 4.147871017456055, + "learning_rate": 8.141695702671313e-06, + "loss": 1.1881, + "step": 701 + }, + { + "epoch": 0.20380316446508928, + "grad_norm": 4.48539924621582, + "learning_rate": 8.153310104529616e-06, + "loss": 1.2512, + "step": 702 + }, + { + "epoch": 0.2040934823631877, + "grad_norm": 4.378758430480957, + "learning_rate": 8.164924506387922e-06, + "loss": 1.2635, + "step": 703 + }, + { + "epoch": 0.2043838002612861, + "grad_norm": 4.198378562927246, + "learning_rate": 8.176538908246227e-06, + "loss": 1.3167, + "step": 704 + }, + { + "epoch": 0.20467411815938452, + "grad_norm": 4.351714134216309, + "learning_rate": 8.18815331010453e-06, + "loss": 1.3105, + "step": 705 + }, + { + "epoch": 0.20496443605748293, + "grad_norm": 3.9941248893737793, + "learning_rate": 8.199767711962834e-06, + "loss": 1.1611, + "step": 706 + }, + { + "epoch": 0.20525475395558135, + "grad_norm": 4.21259880065918, + "learning_rate": 8.21138211382114e-06, + "loss": 1.2724, + "step": 707 + }, + { + "epoch": 0.2055450718536798, + "grad_norm": 4.212434768676758, + "learning_rate": 8.222996515679443e-06, + "loss": 1.2564, + "step": 708 + }, + { + "epoch": 0.2058353897517782, + "grad_norm": 4.102781295776367, + "learning_rate": 8.234610917537747e-06, + "loss": 1.1005, + "step": 709 + }, + { + "epoch": 0.20612570764987662, + "grad_norm": 4.176314830780029, + "learning_rate": 8.246225319396052e-06, + "loss": 1.3496, + "step": 710 + }, + { + "epoch": 0.20641602554797503, + "grad_norm": 
3.998204469680786, + "learning_rate": 8.257839721254356e-06, + "loss": 1.1549, + "step": 711 + }, + { + "epoch": 0.20670634344607344, + "grad_norm": 4.177518844604492, + "learning_rate": 8.26945412311266e-06, + "loss": 1.2156, + "step": 712 + }, + { + "epoch": 0.20699666134417186, + "grad_norm": 3.991353750228882, + "learning_rate": 8.281068524970965e-06, + "loss": 1.247, + "step": 713 + }, + { + "epoch": 0.2072869792422703, + "grad_norm": 4.021002292633057, + "learning_rate": 8.292682926829268e-06, + "loss": 1.2432, + "step": 714 + }, + { + "epoch": 0.2075772971403687, + "grad_norm": 4.410247802734375, + "learning_rate": 8.304297328687572e-06, + "loss": 1.4163, + "step": 715 + }, + { + "epoch": 0.20786761503846712, + "grad_norm": 4.138284683227539, + "learning_rate": 8.315911730545877e-06, + "loss": 1.1089, + "step": 716 + }, + { + "epoch": 0.20815793293656554, + "grad_norm": 3.8682849407196045, + "learning_rate": 8.327526132404183e-06, + "loss": 1.1813, + "step": 717 + }, + { + "epoch": 0.20844825083466395, + "grad_norm": 4.133089065551758, + "learning_rate": 8.339140534262486e-06, + "loss": 1.2069, + "step": 718 + }, + { + "epoch": 0.20873856873276236, + "grad_norm": 4.1765875816345215, + "learning_rate": 8.35075493612079e-06, + "loss": 1.3223, + "step": 719 + }, + { + "epoch": 0.2090288866308608, + "grad_norm": 4.326620101928711, + "learning_rate": 8.362369337979095e-06, + "loss": 1.1926, + "step": 720 + }, + { + "epoch": 0.20931920452895922, + "grad_norm": 4.258913993835449, + "learning_rate": 8.373983739837399e-06, + "loss": 1.2684, + "step": 721 + }, + { + "epoch": 0.20960952242705763, + "grad_norm": 3.9621224403381348, + "learning_rate": 8.385598141695703e-06, + "loss": 1.1473, + "step": 722 + }, + { + "epoch": 0.20989984032515605, + "grad_norm": 4.3580322265625, + "learning_rate": 8.397212543554008e-06, + "loss": 1.2432, + "step": 723 + }, + { + "epoch": 0.21019015822325446, + "grad_norm": 4.387808799743652, + "learning_rate": 8.408826945412312e-06, + "loss": 1.395, + "step": 724 + }, + { + "epoch": 0.21048047612135287, + "grad_norm": 4.777324199676514, + "learning_rate": 8.420441347270615e-06, + "loss": 1.2738, + "step": 725 + }, + { + "epoch": 0.2107707940194513, + "grad_norm": 3.977665424346924, + "learning_rate": 8.43205574912892e-06, + "loss": 1.2753, + "step": 726 + }, + { + "epoch": 0.21106111191754973, + "grad_norm": 4.048496246337891, + "learning_rate": 8.443670150987224e-06, + "loss": 1.1514, + "step": 727 + }, + { + "epoch": 0.21135142981564814, + "grad_norm": 3.8251259326934814, + "learning_rate": 8.45528455284553e-06, + "loss": 1.1101, + "step": 728 + }, + { + "epoch": 0.21164174771374655, + "grad_norm": 4.291660308837891, + "learning_rate": 8.466898954703833e-06, + "loss": 1.2019, + "step": 729 + }, + { + "epoch": 0.21193206561184497, + "grad_norm": 4.2600555419921875, + "learning_rate": 8.478513356562138e-06, + "loss": 1.2865, + "step": 730 + }, + { + "epoch": 0.21222238350994338, + "grad_norm": 3.9936861991882324, + "learning_rate": 8.490127758420442e-06, + "loss": 1.1312, + "step": 731 + }, + { + "epoch": 0.2125127014080418, + "grad_norm": 4.250927448272705, + "learning_rate": 8.501742160278746e-06, + "loss": 1.2805, + "step": 732 + }, + { + "epoch": 0.21280301930614023, + "grad_norm": 4.299734592437744, + "learning_rate": 8.513356562137051e-06, + "loss": 1.1194, + "step": 733 + }, + { + "epoch": 0.21309333720423865, + "grad_norm": 4.459551811218262, + "learning_rate": 8.524970963995355e-06, + "loss": 1.411, + "step": 734 + }, + { + "epoch": 0.21338365510233706, + 
"grad_norm": 4.234330654144287, + "learning_rate": 8.536585365853658e-06, + "loss": 1.2569, + "step": 735 + }, + { + "epoch": 0.21367397300043547, + "grad_norm": 4.489592552185059, + "learning_rate": 8.548199767711964e-06, + "loss": 1.341, + "step": 736 + }, + { + "epoch": 0.2139642908985339, + "grad_norm": 4.3680739402771, + "learning_rate": 8.559814169570267e-06, + "loss": 1.3785, + "step": 737 + }, + { + "epoch": 0.2142546087966323, + "grad_norm": 4.33858060836792, + "learning_rate": 8.571428571428571e-06, + "loss": 1.2048, + "step": 738 + }, + { + "epoch": 0.21454492669473074, + "grad_norm": 4.339114189147949, + "learning_rate": 8.583042973286876e-06, + "loss": 1.289, + "step": 739 + }, + { + "epoch": 0.21483524459282916, + "grad_norm": 3.8613274097442627, + "learning_rate": 8.594657375145182e-06, + "loss": 1.0864, + "step": 740 + }, + { + "epoch": 0.21512556249092757, + "grad_norm": 4.468837261199951, + "learning_rate": 8.606271777003485e-06, + "loss": 1.2507, + "step": 741 + }, + { + "epoch": 0.21541588038902598, + "grad_norm": 4.397847652435303, + "learning_rate": 8.617886178861789e-06, + "loss": 1.3629, + "step": 742 + }, + { + "epoch": 0.2157061982871244, + "grad_norm": 4.4040303230285645, + "learning_rate": 8.629500580720094e-06, + "loss": 1.2387, + "step": 743 + }, + { + "epoch": 0.2159965161852228, + "grad_norm": 4.0640130043029785, + "learning_rate": 8.641114982578398e-06, + "loss": 1.1718, + "step": 744 + }, + { + "epoch": 0.21628683408332125, + "grad_norm": 4.574658393859863, + "learning_rate": 8.652729384436701e-06, + "loss": 1.3895, + "step": 745 + }, + { + "epoch": 0.21657715198141966, + "grad_norm": 4.159901142120361, + "learning_rate": 8.664343786295007e-06, + "loss": 1.0791, + "step": 746 + }, + { + "epoch": 0.21686746987951808, + "grad_norm": 4.002528667449951, + "learning_rate": 8.67595818815331e-06, + "loss": 1.2234, + "step": 747 + }, + { + "epoch": 0.2171577877776165, + "grad_norm": 4.431401252746582, + "learning_rate": 8.687572590011614e-06, + "loss": 1.3391, + "step": 748 + }, + { + "epoch": 0.2174481056757149, + "grad_norm": 3.9772732257843018, + "learning_rate": 8.69918699186992e-06, + "loss": 1.2431, + "step": 749 + }, + { + "epoch": 0.21773842357381332, + "grad_norm": 3.6207127571105957, + "learning_rate": 8.710801393728223e-06, + "loss": 1.2068, + "step": 750 + }, + { + "epoch": 0.21802874147191173, + "grad_norm": 4.086411952972412, + "learning_rate": 8.722415795586528e-06, + "loss": 1.2978, + "step": 751 + }, + { + "epoch": 0.21831905937001017, + "grad_norm": 3.863708257675171, + "learning_rate": 8.734030197444832e-06, + "loss": 1.2108, + "step": 752 + }, + { + "epoch": 0.21860937726810858, + "grad_norm": 4.488952159881592, + "learning_rate": 8.745644599303137e-06, + "loss": 1.1774, + "step": 753 + }, + { + "epoch": 0.218899695166207, + "grad_norm": 4.089755535125732, + "learning_rate": 8.757259001161441e-06, + "loss": 1.2003, + "step": 754 + }, + { + "epoch": 0.2191900130643054, + "grad_norm": 3.3888041973114014, + "learning_rate": 8.768873403019745e-06, + "loss": 1.0689, + "step": 755 + }, + { + "epoch": 0.21948033096240382, + "grad_norm": 4.007880687713623, + "learning_rate": 8.78048780487805e-06, + "loss": 1.2168, + "step": 756 + }, + { + "epoch": 0.21977064886050224, + "grad_norm": 3.9035606384277344, + "learning_rate": 8.792102206736354e-06, + "loss": 1.0885, + "step": 757 + }, + { + "epoch": 0.22006096675860068, + "grad_norm": 4.004887580871582, + "learning_rate": 8.803716608594657e-06, + "loss": 1.1846, + "step": 758 + }, + { + "epoch": 
0.2203512846566991, + "grad_norm": 4.1913580894470215, + "learning_rate": 8.815331010452962e-06, + "loss": 1.3213, + "step": 759 + }, + { + "epoch": 0.2206416025547975, + "grad_norm": 4.157240867614746, + "learning_rate": 8.826945412311266e-06, + "loss": 1.2908, + "step": 760 + }, + { + "epoch": 0.22093192045289592, + "grad_norm": 4.264801979064941, + "learning_rate": 8.83855981416957e-06, + "loss": 1.2615, + "step": 761 + }, + { + "epoch": 0.22122223835099433, + "grad_norm": 4.292517185211182, + "learning_rate": 8.850174216027875e-06, + "loss": 1.223, + "step": 762 + }, + { + "epoch": 0.22151255624909275, + "grad_norm": 3.773144245147705, + "learning_rate": 8.86178861788618e-06, + "loss": 1.1587, + "step": 763 + }, + { + "epoch": 0.2218028741471912, + "grad_norm": 4.224881172180176, + "learning_rate": 8.873403019744484e-06, + "loss": 1.3653, + "step": 764 + }, + { + "epoch": 0.2220931920452896, + "grad_norm": 4.401252269744873, + "learning_rate": 8.885017421602788e-06, + "loss": 1.2275, + "step": 765 + }, + { + "epoch": 0.222383509943388, + "grad_norm": 4.408329963684082, + "learning_rate": 8.896631823461093e-06, + "loss": 1.3104, + "step": 766 + }, + { + "epoch": 0.22267382784148643, + "grad_norm": 4.158458709716797, + "learning_rate": 8.908246225319397e-06, + "loss": 1.1537, + "step": 767 + }, + { + "epoch": 0.22296414573958484, + "grad_norm": 3.915562868118286, + "learning_rate": 8.9198606271777e-06, + "loss": 1.2162, + "step": 768 + }, + { + "epoch": 0.22325446363768325, + "grad_norm": 3.9854915142059326, + "learning_rate": 8.931475029036006e-06, + "loss": 1.0471, + "step": 769 + }, + { + "epoch": 0.2235447815357817, + "grad_norm": 4.040715217590332, + "learning_rate": 8.94308943089431e-06, + "loss": 1.2871, + "step": 770 + }, + { + "epoch": 0.2238350994338801, + "grad_norm": 4.417214870452881, + "learning_rate": 8.954703832752613e-06, + "loss": 1.2301, + "step": 771 + }, + { + "epoch": 0.22412541733197852, + "grad_norm": 4.276007175445557, + "learning_rate": 8.966318234610918e-06, + "loss": 1.4429, + "step": 772 + }, + { + "epoch": 0.22441573523007693, + "grad_norm": 4.329378604888916, + "learning_rate": 8.977932636469222e-06, + "loss": 1.3906, + "step": 773 + }, + { + "epoch": 0.22470605312817535, + "grad_norm": 4.080763339996338, + "learning_rate": 8.989547038327527e-06, + "loss": 1.1965, + "step": 774 + }, + { + "epoch": 0.22499637102627376, + "grad_norm": 3.89856219291687, + "learning_rate": 9.00116144018583e-06, + "loss": 1.1666, + "step": 775 + }, + { + "epoch": 0.22528668892437217, + "grad_norm": 4.193841457366943, + "learning_rate": 9.012775842044136e-06, + "loss": 1.3002, + "step": 776 + }, + { + "epoch": 0.22557700682247062, + "grad_norm": 4.260502338409424, + "learning_rate": 9.02439024390244e-06, + "loss": 1.2584, + "step": 777 + }, + { + "epoch": 0.22586732472056903, + "grad_norm": 4.089141845703125, + "learning_rate": 9.036004645760745e-06, + "loss": 1.102, + "step": 778 + }, + { + "epoch": 0.22615764261866744, + "grad_norm": 4.167725563049316, + "learning_rate": 9.047619047619049e-06, + "loss": 1.2121, + "step": 779 + }, + { + "epoch": 0.22644796051676586, + "grad_norm": 4.360806941986084, + "learning_rate": 9.059233449477352e-06, + "loss": 1.196, + "step": 780 + }, + { + "epoch": 0.22673827841486427, + "grad_norm": 4.336724281311035, + "learning_rate": 9.070847851335658e-06, + "loss": 1.142, + "step": 781 + }, + { + "epoch": 0.22702859631296268, + "grad_norm": 4.499552249908447, + "learning_rate": 9.082462253193961e-06, + "loss": 1.3475, + "step": 782 + }, + { + 
"epoch": 0.22731891421106112, + "grad_norm": 3.940721273422241, + "learning_rate": 9.094076655052265e-06, + "loss": 1.1308, + "step": 783 + }, + { + "epoch": 0.22760923210915954, + "grad_norm": 4.627920150756836, + "learning_rate": 9.10569105691057e-06, + "loss": 1.3685, + "step": 784 + }, + { + "epoch": 0.22789955000725795, + "grad_norm": 4.070476055145264, + "learning_rate": 9.117305458768874e-06, + "loss": 1.2696, + "step": 785 + }, + { + "epoch": 0.22818986790535636, + "grad_norm": 3.932196617126465, + "learning_rate": 9.12891986062718e-06, + "loss": 1.1755, + "step": 786 + }, + { + "epoch": 0.22848018580345478, + "grad_norm": 4.1085968017578125, + "learning_rate": 9.140534262485483e-06, + "loss": 1.3788, + "step": 787 + }, + { + "epoch": 0.2287705037015532, + "grad_norm": 4.546936511993408, + "learning_rate": 9.152148664343788e-06, + "loss": 1.2131, + "step": 788 + }, + { + "epoch": 0.22906082159965163, + "grad_norm": 3.854112148284912, + "learning_rate": 9.163763066202092e-06, + "loss": 1.2509, + "step": 789 + }, + { + "epoch": 0.22935113949775005, + "grad_norm": 3.6372368335723877, + "learning_rate": 9.175377468060395e-06, + "loss": 1.002, + "step": 790 + }, + { + "epoch": 0.22964145739584846, + "grad_norm": 4.038814067840576, + "learning_rate": 9.1869918699187e-06, + "loss": 1.273, + "step": 791 + }, + { + "epoch": 0.22993177529394687, + "grad_norm": 4.1536712646484375, + "learning_rate": 9.198606271777004e-06, + "loss": 1.3666, + "step": 792 + }, + { + "epoch": 0.23022209319204529, + "grad_norm": 4.179312705993652, + "learning_rate": 9.210220673635308e-06, + "loss": 1.2411, + "step": 793 + }, + { + "epoch": 0.2305124110901437, + "grad_norm": 3.946230411529541, + "learning_rate": 9.221835075493613e-06, + "loss": 1.357, + "step": 794 + }, + { + "epoch": 0.2308027289882421, + "grad_norm": 4.157958030700684, + "learning_rate": 9.233449477351917e-06, + "loss": 1.1273, + "step": 795 + }, + { + "epoch": 0.23109304688634055, + "grad_norm": 4.40532922744751, + "learning_rate": 9.24506387921022e-06, + "loss": 1.4211, + "step": 796 + }, + { + "epoch": 0.23138336478443897, + "grad_norm": 4.301095008850098, + "learning_rate": 9.256678281068526e-06, + "loss": 1.3181, + "step": 797 + }, + { + "epoch": 0.23167368268253738, + "grad_norm": 3.6317696571350098, + "learning_rate": 9.268292682926831e-06, + "loss": 1.0635, + "step": 798 + }, + { + "epoch": 0.2319640005806358, + "grad_norm": 4.2273359298706055, + "learning_rate": 9.279907084785135e-06, + "loss": 1.2817, + "step": 799 + }, + { + "epoch": 0.2322543184787342, + "grad_norm": 4.259072303771973, + "learning_rate": 9.291521486643439e-06, + "loss": 1.2549, + "step": 800 + }, + { + "epoch": 0.23254463637683262, + "grad_norm": 4.03896951675415, + "learning_rate": 9.303135888501744e-06, + "loss": 1.1359, + "step": 801 + }, + { + "epoch": 0.23283495427493106, + "grad_norm": 4.3312907218933105, + "learning_rate": 9.314750290360047e-06, + "loss": 1.3102, + "step": 802 + }, + { + "epoch": 0.23312527217302947, + "grad_norm": 3.9520628452301025, + "learning_rate": 9.326364692218351e-06, + "loss": 1.0503, + "step": 803 + }, + { + "epoch": 0.2334155900711279, + "grad_norm": 4.0430498123168945, + "learning_rate": 9.337979094076656e-06, + "loss": 1.2876, + "step": 804 + }, + { + "epoch": 0.2337059079692263, + "grad_norm": 4.059528350830078, + "learning_rate": 9.34959349593496e-06, + "loss": 1.1509, + "step": 805 + }, + { + "epoch": 0.23399622586732471, + "grad_norm": 3.862774610519409, + "learning_rate": 9.361207897793264e-06, + "loss": 1.1237, + "step": 806 
+ }, + { + "epoch": 0.23428654376542313, + "grad_norm": 4.267635345458984, + "learning_rate": 9.372822299651569e-06, + "loss": 1.2307, + "step": 807 + }, + { + "epoch": 0.23457686166352157, + "grad_norm": 3.8617136478424072, + "learning_rate": 9.384436701509873e-06, + "loss": 1.3029, + "step": 808 + }, + { + "epoch": 0.23486717956161998, + "grad_norm": 4.106259346008301, + "learning_rate": 9.396051103368178e-06, + "loss": 1.2887, + "step": 809 + }, + { + "epoch": 0.2351574974597184, + "grad_norm": 3.966156005859375, + "learning_rate": 9.407665505226482e-06, + "loss": 1.1533, + "step": 810 + }, + { + "epoch": 0.2354478153578168, + "grad_norm": 4.011099338531494, + "learning_rate": 9.419279907084787e-06, + "loss": 1.23, + "step": 811 + }, + { + "epoch": 0.23573813325591522, + "grad_norm": 3.8420891761779785, + "learning_rate": 9.43089430894309e-06, + "loss": 1.2038, + "step": 812 + }, + { + "epoch": 0.23602845115401364, + "grad_norm": 3.7966573238372803, + "learning_rate": 9.442508710801394e-06, + "loss": 1.1904, + "step": 813 + }, + { + "epoch": 0.23631876905211208, + "grad_norm": 3.8873846530914307, + "learning_rate": 9.4541231126597e-06, + "loss": 1.1495, + "step": 814 + }, + { + "epoch": 0.2366090869502105, + "grad_norm": 4.556484699249268, + "learning_rate": 9.465737514518003e-06, + "loss": 1.3733, + "step": 815 + }, + { + "epoch": 0.2368994048483089, + "grad_norm": 3.8360376358032227, + "learning_rate": 9.477351916376307e-06, + "loss": 1.1459, + "step": 816 + }, + { + "epoch": 0.23718972274640732, + "grad_norm": 4.036248683929443, + "learning_rate": 9.488966318234612e-06, + "loss": 1.0859, + "step": 817 + }, + { + "epoch": 0.23748004064450573, + "grad_norm": 4.281419277191162, + "learning_rate": 9.500580720092916e-06, + "loss": 1.1087, + "step": 818 + }, + { + "epoch": 0.23777035854260414, + "grad_norm": 4.298630237579346, + "learning_rate": 9.51219512195122e-06, + "loss": 1.2629, + "step": 819 + }, + { + "epoch": 0.23806067644070256, + "grad_norm": 4.755696773529053, + "learning_rate": 9.523809523809525e-06, + "loss": 1.4463, + "step": 820 + }, + { + "epoch": 0.238350994338801, + "grad_norm": 4.519877910614014, + "learning_rate": 9.53542392566783e-06, + "loss": 1.23, + "step": 821 + }, + { + "epoch": 0.2386413122368994, + "grad_norm": 4.5725812911987305, + "learning_rate": 9.547038327526134e-06, + "loss": 1.2735, + "step": 822 + }, + { + "epoch": 0.23893163013499782, + "grad_norm": 4.227170944213867, + "learning_rate": 9.558652729384437e-06, + "loss": 1.1873, + "step": 823 + }, + { + "epoch": 0.23922194803309624, + "grad_norm": 4.264405727386475, + "learning_rate": 9.570267131242743e-06, + "loss": 1.1793, + "step": 824 + }, + { + "epoch": 0.23951226593119465, + "grad_norm": 3.8990113735198975, + "learning_rate": 9.581881533101046e-06, + "loss": 1.2099, + "step": 825 + }, + { + "epoch": 0.23980258382929306, + "grad_norm": 4.033143997192383, + "learning_rate": 9.59349593495935e-06, + "loss": 1.159, + "step": 826 + }, + { + "epoch": 0.2400929017273915, + "grad_norm": 3.914243459701538, + "learning_rate": 9.605110336817655e-06, + "loss": 1.147, + "step": 827 + }, + { + "epoch": 0.24038321962548992, + "grad_norm": 4.004579067230225, + "learning_rate": 9.616724738675959e-06, + "loss": 1.3154, + "step": 828 + }, + { + "epoch": 0.24067353752358833, + "grad_norm": 4.188416004180908, + "learning_rate": 9.628339140534263e-06, + "loss": 1.2799, + "step": 829 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 4.339681148529053, + "learning_rate": 9.639953542392568e-06, + "loss": 1.2475, + 
"step": 830 + }, + { + "epoch": 0.24125417331978516, + "grad_norm": 4.40482759475708, + "learning_rate": 9.651567944250871e-06, + "loss": 1.2131, + "step": 831 + }, + { + "epoch": 0.24154449121788357, + "grad_norm": 3.721519947052002, + "learning_rate": 9.663182346109177e-06, + "loss": 1.1448, + "step": 832 + }, + { + "epoch": 0.241834809115982, + "grad_norm": 4.03656530380249, + "learning_rate": 9.67479674796748e-06, + "loss": 1.1783, + "step": 833 + }, + { + "epoch": 0.24212512701408043, + "grad_norm": 3.787747621536255, + "learning_rate": 9.686411149825786e-06, + "loss": 1.2477, + "step": 834 + }, + { + "epoch": 0.24241544491217884, + "grad_norm": 4.436072826385498, + "learning_rate": 9.69802555168409e-06, + "loss": 1.3761, + "step": 835 + }, + { + "epoch": 0.24270576281027725, + "grad_norm": 4.418893814086914, + "learning_rate": 9.709639953542393e-06, + "loss": 1.2114, + "step": 836 + }, + { + "epoch": 0.24299608070837567, + "grad_norm": 4.714204788208008, + "learning_rate": 9.721254355400698e-06, + "loss": 1.1931, + "step": 837 + }, + { + "epoch": 0.24328639860647408, + "grad_norm": 4.259952545166016, + "learning_rate": 9.732868757259002e-06, + "loss": 1.1285, + "step": 838 + }, + { + "epoch": 0.2435767165045725, + "grad_norm": 3.6294689178466797, + "learning_rate": 9.744483159117306e-06, + "loss": 1.0827, + "step": 839 + }, + { + "epoch": 0.24386703440267093, + "grad_norm": 4.037003040313721, + "learning_rate": 9.756097560975611e-06, + "loss": 1.1824, + "step": 840 + }, + { + "epoch": 0.24415735230076935, + "grad_norm": 4.08364200592041, + "learning_rate": 9.767711962833915e-06, + "loss": 1.1278, + "step": 841 + }, + { + "epoch": 0.24444767019886776, + "grad_norm": 4.233451843261719, + "learning_rate": 9.779326364692218e-06, + "loss": 1.2704, + "step": 842 + }, + { + "epoch": 0.24473798809696617, + "grad_norm": 4.0865631103515625, + "learning_rate": 9.790940766550524e-06, + "loss": 1.2111, + "step": 843 + }, + { + "epoch": 0.2450283059950646, + "grad_norm": 4.192430019378662, + "learning_rate": 9.802555168408829e-06, + "loss": 1.218, + "step": 844 + }, + { + "epoch": 0.245318623893163, + "grad_norm": 3.8745322227478027, + "learning_rate": 9.814169570267133e-06, + "loss": 1.2443, + "step": 845 + }, + { + "epoch": 0.24560894179126144, + "grad_norm": 3.955824136734009, + "learning_rate": 9.825783972125436e-06, + "loss": 1.1244, + "step": 846 + }, + { + "epoch": 0.24589925968935986, + "grad_norm": 4.057941913604736, + "learning_rate": 9.837398373983741e-06, + "loss": 1.1756, + "step": 847 + }, + { + "epoch": 0.24618957758745827, + "grad_norm": 3.894920587539673, + "learning_rate": 9.849012775842045e-06, + "loss": 1.2709, + "step": 848 + }, + { + "epoch": 0.24647989548555668, + "grad_norm": 3.87312912940979, + "learning_rate": 9.860627177700349e-06, + "loss": 1.0949, + "step": 849 + }, + { + "epoch": 0.2467702133836551, + "grad_norm": 3.991598606109619, + "learning_rate": 9.872241579558654e-06, + "loss": 1.0914, + "step": 850 + }, + { + "epoch": 0.2470605312817535, + "grad_norm": 4.442087650299072, + "learning_rate": 9.883855981416958e-06, + "loss": 1.3785, + "step": 851 + }, + { + "epoch": 0.24735084917985195, + "grad_norm": 4.167323112487793, + "learning_rate": 9.895470383275261e-06, + "loss": 1.1777, + "step": 852 + }, + { + "epoch": 0.24764116707795036, + "grad_norm": 3.8976168632507324, + "learning_rate": 9.907084785133567e-06, + "loss": 1.2094, + "step": 853 + }, + { + "epoch": 0.24793148497604878, + "grad_norm": 4.286924362182617, + "learning_rate": 9.91869918699187e-06, + "loss": 
1.3301, + "step": 854 + }, + { + "epoch": 0.2482218028741472, + "grad_norm": 4.022475242614746, + "learning_rate": 9.930313588850174e-06, + "loss": 1.2017, + "step": 855 + }, + { + "epoch": 0.2485121207722456, + "grad_norm": 3.858656644821167, + "learning_rate": 9.94192799070848e-06, + "loss": 1.2202, + "step": 856 + }, + { + "epoch": 0.24880243867034402, + "grad_norm": 3.9576399326324463, + "learning_rate": 9.953542392566785e-06, + "loss": 1.2639, + "step": 857 + }, + { + "epoch": 0.24909275656844246, + "grad_norm": 3.78914213180542, + "learning_rate": 9.965156794425088e-06, + "loss": 1.0952, + "step": 858 + }, + { + "epoch": 0.24938307446654087, + "grad_norm": 4.147533893585205, + "learning_rate": 9.976771196283392e-06, + "loss": 1.191, + "step": 859 + }, + { + "epoch": 0.24967339236463928, + "grad_norm": 4.042754650115967, + "learning_rate": 9.988385598141697e-06, + "loss": 1.2136, + "step": 860 + }, + { + "epoch": 0.2499637102627377, + "grad_norm": 3.9550065994262695, + "learning_rate": 1e-05, + "loss": 1.1666, + "step": 861 + }, + { + "epoch": 0.25025402816083614, + "grad_norm": 4.223484516143799, + "learning_rate": 9.999999907800993e-06, + "loss": 1.1374, + "step": 862 + }, + { + "epoch": 0.25054434605893455, + "grad_norm": 3.993415355682373, + "learning_rate": 9.999999631203973e-06, + "loss": 1.206, + "step": 863 + }, + { + "epoch": 0.25083466395703297, + "grad_norm": 4.242237091064453, + "learning_rate": 9.99999917020895e-06, + "loss": 1.1703, + "step": 864 + }, + { + "epoch": 0.2511249818551314, + "grad_norm": 4.252773761749268, + "learning_rate": 9.999998524815943e-06, + "loss": 1.2586, + "step": 865 + }, + { + "epoch": 0.2514152997532298, + "grad_norm": 3.9203879833221436, + "learning_rate": 9.999997695024973e-06, + "loss": 1.1088, + "step": 866 + }, + { + "epoch": 0.2517056176513282, + "grad_norm": 4.138311386108398, + "learning_rate": 9.999996680836072e-06, + "loss": 1.2563, + "step": 867 + }, + { + "epoch": 0.2519959355494266, + "grad_norm": 4.038930416107178, + "learning_rate": 9.999995482249281e-06, + "loss": 1.2899, + "step": 868 + }, + { + "epoch": 0.25228625344752503, + "grad_norm": 4.0346879959106445, + "learning_rate": 9.999994099264638e-06, + "loss": 1.1238, + "step": 869 + }, + { + "epoch": 0.25257657134562345, + "grad_norm": 3.8026630878448486, + "learning_rate": 9.999992531882197e-06, + "loss": 1.0621, + "step": 870 + }, + { + "epoch": 0.25286688924372186, + "grad_norm": 4.135496139526367, + "learning_rate": 9.999990780102015e-06, + "loss": 1.1553, + "step": 871 + }, + { + "epoch": 0.2531572071418203, + "grad_norm": 3.8665709495544434, + "learning_rate": 9.999988843924158e-06, + "loss": 1.1897, + "step": 872 + }, + { + "epoch": 0.2534475250399187, + "grad_norm": 3.7282605171203613, + "learning_rate": 9.999986723348697e-06, + "loss": 1.162, + "step": 873 + }, + { + "epoch": 0.25373784293801716, + "grad_norm": 3.997755765914917, + "learning_rate": 9.99998441837571e-06, + "loss": 1.3417, + "step": 874 + }, + { + "epoch": 0.25402816083611557, + "grad_norm": 4.263042449951172, + "learning_rate": 9.999981929005281e-06, + "loss": 1.3103, + "step": 875 + }, + { + "epoch": 0.254318478734214, + "grad_norm": 4.087371826171875, + "learning_rate": 9.999979255237504e-06, + "loss": 1.2355, + "step": 876 + }, + { + "epoch": 0.2546087966323124, + "grad_norm": 4.311849117279053, + "learning_rate": 9.999976397072474e-06, + "loss": 1.25, + "step": 877 + }, + { + "epoch": 0.2548991145304108, + "grad_norm": 3.9726626873016357, + "learning_rate": 9.9999733545103e-06, + "loss": 1.2877, + 
"step": 878 + }, + { + "epoch": 0.2551894324285092, + "grad_norm": 4.184573173522949, + "learning_rate": 9.999970127551094e-06, + "loss": 1.4488, + "step": 879 + }, + { + "epoch": 0.25547975032660764, + "grad_norm": 4.292477130889893, + "learning_rate": 9.999966716194973e-06, + "loss": 1.3899, + "step": 880 + }, + { + "epoch": 0.25577006822470605, + "grad_norm": 3.918590545654297, + "learning_rate": 9.999963120442062e-06, + "loss": 1.2766, + "step": 881 + }, + { + "epoch": 0.25606038612280446, + "grad_norm": 3.896446466445923, + "learning_rate": 9.999959340292497e-06, + "loss": 1.2409, + "step": 882 + }, + { + "epoch": 0.2563507040209029, + "grad_norm": 3.7944939136505127, + "learning_rate": 9.999955375746415e-06, + "loss": 1.1856, + "step": 883 + }, + { + "epoch": 0.2566410219190013, + "grad_norm": 4.00242805480957, + "learning_rate": 9.999951226803963e-06, + "loss": 1.1902, + "step": 884 + }, + { + "epoch": 0.2569313398170997, + "grad_norm": 3.9395718574523926, + "learning_rate": 9.999946893465294e-06, + "loss": 1.2137, + "step": 885 + }, + { + "epoch": 0.2572216577151981, + "grad_norm": 3.7727317810058594, + "learning_rate": 9.999942375730568e-06, + "loss": 1.2436, + "step": 886 + }, + { + "epoch": 0.2575119756132966, + "grad_norm": 3.9272992610931396, + "learning_rate": 9.999937673599951e-06, + "loss": 1.223, + "step": 887 + }, + { + "epoch": 0.257802293511395, + "grad_norm": 4.122605800628662, + "learning_rate": 9.99993278707362e-06, + "loss": 1.2457, + "step": 888 + }, + { + "epoch": 0.2580926114094934, + "grad_norm": 3.6556971073150635, + "learning_rate": 9.999927716151747e-06, + "loss": 1.1214, + "step": 889 + }, + { + "epoch": 0.2583829293075918, + "grad_norm": 4.025891304016113, + "learning_rate": 9.999922460834525e-06, + "loss": 1.2022, + "step": 890 + }, + { + "epoch": 0.25867324720569024, + "grad_norm": 4.0044379234313965, + "learning_rate": 9.99991702112215e-06, + "loss": 1.1408, + "step": 891 + }, + { + "epoch": 0.25896356510378865, + "grad_norm": 3.8944759368896484, + "learning_rate": 9.999911397014816e-06, + "loss": 1.2388, + "step": 892 + }, + { + "epoch": 0.25925388300188706, + "grad_norm": 3.943559169769287, + "learning_rate": 9.999905588512735e-06, + "loss": 1.0437, + "step": 893 + }, + { + "epoch": 0.2595442008999855, + "grad_norm": 3.794334888458252, + "learning_rate": 9.99989959561612e-06, + "loss": 1.1493, + "step": 894 + }, + { + "epoch": 0.2598345187980839, + "grad_norm": 3.97279691696167, + "learning_rate": 9.999893418325193e-06, + "loss": 1.2069, + "step": 895 + }, + { + "epoch": 0.2601248366961823, + "grad_norm": 4.2030534744262695, + "learning_rate": 9.999887056640178e-06, + "loss": 1.3481, + "step": 896 + }, + { + "epoch": 0.2604151545942807, + "grad_norm": 3.7260630130767822, + "learning_rate": 9.999880510561316e-06, + "loss": 1.185, + "step": 897 + }, + { + "epoch": 0.26070547249237913, + "grad_norm": 4.051196575164795, + "learning_rate": 9.999873780088842e-06, + "loss": 1.2857, + "step": 898 + }, + { + "epoch": 0.2609957903904776, + "grad_norm": 3.895303964614868, + "learning_rate": 9.99986686522301e-06, + "loss": 1.1956, + "step": 899 + }, + { + "epoch": 0.261286108288576, + "grad_norm": 3.712827682495117, + "learning_rate": 9.999859765964071e-06, + "loss": 1.255, + "step": 900 + }, + { + "epoch": 0.2615764261866744, + "grad_norm": 4.21458101272583, + "learning_rate": 9.999852482312287e-06, + "loss": 1.2748, + "step": 901 + }, + { + "epoch": 0.26186674408477284, + "grad_norm": 4.291463375091553, + "learning_rate": 9.999845014267928e-06, + "loss": 1.3972, 
+ "step": 902 + }, + { + "epoch": 0.26215706198287125, + "grad_norm": 3.866318464279175, + "learning_rate": 9.999837361831269e-06, + "loss": 1.1126, + "step": 903 + }, + { + "epoch": 0.26244737988096967, + "grad_norm": 3.7740962505340576, + "learning_rate": 9.999829525002593e-06, + "loss": 1.1077, + "step": 904 + }, + { + "epoch": 0.2627376977790681, + "grad_norm": 3.9418838024139404, + "learning_rate": 9.999821503782188e-06, + "loss": 1.1723, + "step": 905 + }, + { + "epoch": 0.2630280156771665, + "grad_norm": 4.411069869995117, + "learning_rate": 9.999813298170349e-06, + "loss": 1.2593, + "step": 906 + }, + { + "epoch": 0.2633183335752649, + "grad_norm": 4.006514549255371, + "learning_rate": 9.99980490816738e-06, + "loss": 1.2224, + "step": 907 + }, + { + "epoch": 0.2636086514733633, + "grad_norm": 4.01617956161499, + "learning_rate": 9.999796333773591e-06, + "loss": 1.3176, + "step": 908 + }, + { + "epoch": 0.26389896937146173, + "grad_norm": 3.717695951461792, + "learning_rate": 9.999787574989297e-06, + "loss": 1.1465, + "step": 909 + }, + { + "epoch": 0.26418928726956015, + "grad_norm": 4.200732231140137, + "learning_rate": 9.999778631814822e-06, + "loss": 1.2268, + "step": 910 + }, + { + "epoch": 0.26447960516765856, + "grad_norm": 4.170313358306885, + "learning_rate": 9.999769504250495e-06, + "loss": 1.1818, + "step": 911 + }, + { + "epoch": 0.26476992306575703, + "grad_norm": 4.117874622344971, + "learning_rate": 9.999760192296651e-06, + "loss": 1.2266, + "step": 912 + }, + { + "epoch": 0.26506024096385544, + "grad_norm": 4.023068428039551, + "learning_rate": 9.999750695953635e-06, + "loss": 1.2564, + "step": 913 + }, + { + "epoch": 0.26535055886195386, + "grad_norm": 3.9565770626068115, + "learning_rate": 9.9997410152218e-06, + "loss": 1.2719, + "step": 914 + }, + { + "epoch": 0.26564087676005227, + "grad_norm": 4.1268510818481445, + "learning_rate": 9.999731150101499e-06, + "loss": 1.1941, + "step": 915 + }, + { + "epoch": 0.2659311946581507, + "grad_norm": 4.024060249328613, + "learning_rate": 9.999721100593098e-06, + "loss": 1.2576, + "step": 916 + }, + { + "epoch": 0.2662215125562491, + "grad_norm": 4.292674541473389, + "learning_rate": 9.999710866696967e-06, + "loss": 1.3313, + "step": 917 + }, + { + "epoch": 0.2665118304543475, + "grad_norm": 3.7949039936065674, + "learning_rate": 9.999700448413483e-06, + "loss": 1.2748, + "step": 918 + }, + { + "epoch": 0.2668021483524459, + "grad_norm": 3.83724308013916, + "learning_rate": 9.99968984574303e-06, + "loss": 1.2568, + "step": 919 + }, + { + "epoch": 0.26709246625054434, + "grad_norm": 3.7601423263549805, + "learning_rate": 9.999679058686e-06, + "loss": 1.1709, + "step": 920 + }, + { + "epoch": 0.26738278414864275, + "grad_norm": 3.65810227394104, + "learning_rate": 9.999668087242789e-06, + "loss": 1.1861, + "step": 921 + }, + { + "epoch": 0.26767310204674116, + "grad_norm": 3.8424625396728516, + "learning_rate": 9.999656931413805e-06, + "loss": 1.2347, + "step": 922 + }, + { + "epoch": 0.2679634199448396, + "grad_norm": 3.8711178302764893, + "learning_rate": 9.999645591199456e-06, + "loss": 1.1713, + "step": 923 + }, + { + "epoch": 0.26825373784293804, + "grad_norm": 3.7193312644958496, + "learning_rate": 9.999634066600162e-06, + "loss": 1.1272, + "step": 924 + }, + { + "epoch": 0.26854405574103646, + "grad_norm": 3.983853578567505, + "learning_rate": 9.999622357616348e-06, + "loss": 1.2762, + "step": 925 + }, + { + "epoch": 0.26883437363913487, + "grad_norm": 4.00912618637085, + "learning_rate": 9.999610464248446e-06, + 
"loss": 1.1777, + "step": 926 + }, + { + "epoch": 0.2691246915372333, + "grad_norm": 4.1947126388549805, + "learning_rate": 9.999598386496893e-06, + "loss": 1.389, + "step": 927 + }, + { + "epoch": 0.2694150094353317, + "grad_norm": 3.9506235122680664, + "learning_rate": 9.999586124362136e-06, + "loss": 1.3365, + "step": 928 + }, + { + "epoch": 0.2697053273334301, + "grad_norm": 3.9439916610717773, + "learning_rate": 9.999573677844627e-06, + "loss": 1.2287, + "step": 929 + }, + { + "epoch": 0.2699956452315285, + "grad_norm": 4.163543224334717, + "learning_rate": 9.999561046944824e-06, + "loss": 1.2869, + "step": 930 + }, + { + "epoch": 0.27028596312962694, + "grad_norm": 3.9208672046661377, + "learning_rate": 9.999548231663194e-06, + "loss": 1.2985, + "step": 931 + }, + { + "epoch": 0.27057628102772535, + "grad_norm": 4.060229778289795, + "learning_rate": 9.99953523200021e-06, + "loss": 1.2768, + "step": 932 + }, + { + "epoch": 0.27086659892582376, + "grad_norm": 3.6714141368865967, + "learning_rate": 9.99952204795635e-06, + "loss": 1.1783, + "step": 933 + }, + { + "epoch": 0.2711569168239222, + "grad_norm": 3.772534132003784, + "learning_rate": 9.999508679532102e-06, + "loss": 1.1146, + "step": 934 + }, + { + "epoch": 0.2714472347220206, + "grad_norm": 4.284186840057373, + "learning_rate": 9.999495126727956e-06, + "loss": 1.3329, + "step": 935 + }, + { + "epoch": 0.271737552620119, + "grad_norm": 3.7998135089874268, + "learning_rate": 9.999481389544414e-06, + "loss": 1.2101, + "step": 936 + }, + { + "epoch": 0.2720278705182175, + "grad_norm": 4.04706335067749, + "learning_rate": 9.999467467981984e-06, + "loss": 1.307, + "step": 937 + }, + { + "epoch": 0.2723181884163159, + "grad_norm": 3.911973237991333, + "learning_rate": 9.999453362041177e-06, + "loss": 1.1824, + "step": 938 + }, + { + "epoch": 0.2726085063144143, + "grad_norm": 4.05914831161499, + "learning_rate": 9.999439071722513e-06, + "loss": 1.2237, + "step": 939 + }, + { + "epoch": 0.2728988242125127, + "grad_norm": 4.172504901885986, + "learning_rate": 9.999424597026521e-06, + "loss": 1.2877, + "step": 940 + }, + { + "epoch": 0.2731891421106111, + "grad_norm": 3.855518341064453, + "learning_rate": 9.999409937953732e-06, + "loss": 1.1341, + "step": 941 + }, + { + "epoch": 0.27347946000870954, + "grad_norm": 4.338953018188477, + "learning_rate": 9.999395094504692e-06, + "loss": 1.2654, + "step": 942 + }, + { + "epoch": 0.27376977790680795, + "grad_norm": 3.9418210983276367, + "learning_rate": 9.999380066679943e-06, + "loss": 1.2278, + "step": 943 + }, + { + "epoch": 0.27406009580490637, + "grad_norm": 3.866417646408081, + "learning_rate": 9.99936485448004e-06, + "loss": 1.3366, + "step": 944 + }, + { + "epoch": 0.2743504137030048, + "grad_norm": 3.783524513244629, + "learning_rate": 9.999349457905545e-06, + "loss": 1.1555, + "step": 945 + }, + { + "epoch": 0.2746407316011032, + "grad_norm": 3.9190661907196045, + "learning_rate": 9.999333876957027e-06, + "loss": 1.2089, + "step": 946 + }, + { + "epoch": 0.2749310494992016, + "grad_norm": 3.7447915077209473, + "learning_rate": 9.99931811163506e-06, + "loss": 1.2385, + "step": 947 + }, + { + "epoch": 0.2752213673973, + "grad_norm": 4.181678295135498, + "learning_rate": 9.999302161940224e-06, + "loss": 1.2333, + "step": 948 + }, + { + "epoch": 0.27551168529539843, + "grad_norm": 3.853498697280884, + "learning_rate": 9.99928602787311e-06, + "loss": 1.1547, + "step": 949 + }, + { + "epoch": 0.2758020031934969, + "grad_norm": 3.614431619644165, + "learning_rate": 9.999269709434308e-06, + 
"loss": 1.1117, + "step": 950 + }, + { + "epoch": 0.2760923210915953, + "grad_norm": 4.468873977661133, + "learning_rate": 9.999253206624425e-06, + "loss": 1.3627, + "step": 951 + }, + { + "epoch": 0.27638263898969373, + "grad_norm": 4.207579135894775, + "learning_rate": 9.999236519444067e-06, + "loss": 1.2428, + "step": 952 + }, + { + "epoch": 0.27667295688779214, + "grad_norm": 3.9187076091766357, + "learning_rate": 9.999219647893852e-06, + "loss": 1.1798, + "step": 953 + }, + { + "epoch": 0.27696327478589056, + "grad_norm": 3.7778027057647705, + "learning_rate": 9.999202591974398e-06, + "loss": 1.1975, + "step": 954 + }, + { + "epoch": 0.27725359268398897, + "grad_norm": 3.8436973094940186, + "learning_rate": 9.999185351686336e-06, + "loss": 1.1884, + "step": 955 + }, + { + "epoch": 0.2775439105820874, + "grad_norm": 4.115079402923584, + "learning_rate": 9.999167927030304e-06, + "loss": 1.2735, + "step": 956 + }, + { + "epoch": 0.2778342284801858, + "grad_norm": 3.7705702781677246, + "learning_rate": 9.999150318006942e-06, + "loss": 1.1011, + "step": 957 + }, + { + "epoch": 0.2781245463782842, + "grad_norm": 4.015285491943359, + "learning_rate": 9.9991325246169e-06, + "loss": 1.2667, + "step": 958 + }, + { + "epoch": 0.2784148642763826, + "grad_norm": 3.9331655502319336, + "learning_rate": 9.999114546860834e-06, + "loss": 1.2667, + "step": 959 + }, + { + "epoch": 0.27870518217448104, + "grad_norm": 4.180220603942871, + "learning_rate": 9.999096384739407e-06, + "loss": 1.2929, + "step": 960 + }, + { + "epoch": 0.27899550007257945, + "grad_norm": 4.194953918457031, + "learning_rate": 9.99907803825329e-06, + "loss": 1.451, + "step": 961 + }, + { + "epoch": 0.2792858179706779, + "grad_norm": 3.872340679168701, + "learning_rate": 9.99905950740316e-06, + "loss": 1.1172, + "step": 962 + }, + { + "epoch": 0.27957613586877633, + "grad_norm": 3.8990437984466553, + "learning_rate": 9.999040792189696e-06, + "loss": 1.2839, + "step": 963 + }, + { + "epoch": 0.27986645376687475, + "grad_norm": 4.102906703948975, + "learning_rate": 9.999021892613594e-06, + "loss": 1.1807, + "step": 964 + }, + { + "epoch": 0.28015677166497316, + "grad_norm": 3.698540210723877, + "learning_rate": 9.999002808675547e-06, + "loss": 1.3311, + "step": 965 + }, + { + "epoch": 0.28044708956307157, + "grad_norm": 4.117794990539551, + "learning_rate": 9.998983540376262e-06, + "loss": 1.2954, + "step": 966 + }, + { + "epoch": 0.28073740746117, + "grad_norm": 4.094895362854004, + "learning_rate": 9.998964087716445e-06, + "loss": 1.2965, + "step": 967 + }, + { + "epoch": 0.2810277253592684, + "grad_norm": 3.921121120452881, + "learning_rate": 9.998944450696818e-06, + "loss": 1.3762, + "step": 968 + }, + { + "epoch": 0.2813180432573668, + "grad_norm": 3.5735599994659424, + "learning_rate": 9.998924629318103e-06, + "loss": 1.227, + "step": 969 + }, + { + "epoch": 0.2816083611554652, + "grad_norm": 3.7150392532348633, + "learning_rate": 9.998904623581032e-06, + "loss": 1.2873, + "step": 970 + }, + { + "epoch": 0.28189867905356364, + "grad_norm": 4.215477466583252, + "learning_rate": 9.998884433486342e-06, + "loss": 1.4844, + "step": 971 + }, + { + "epoch": 0.28218899695166205, + "grad_norm": 3.861442804336548, + "learning_rate": 9.998864059034778e-06, + "loss": 1.1615, + "step": 972 + }, + { + "epoch": 0.28247931484976047, + "grad_norm": 3.7807931900024414, + "learning_rate": 9.998843500227092e-06, + "loss": 1.3308, + "step": 973 + }, + { + "epoch": 0.2827696327478589, + "grad_norm": 4.654616832733154, + "learning_rate": 
9.99882275706404e-06, + "loss": 1.3845, + "step": 974 + }, + { + "epoch": 0.28305995064595735, + "grad_norm": 3.788461685180664, + "learning_rate": 9.998801829546387e-06, + "loss": 1.2098, + "step": 975 + }, + { + "epoch": 0.28335026854405576, + "grad_norm": 3.7853169441223145, + "learning_rate": 9.99878071767491e-06, + "loss": 1.1913, + "step": 976 + }, + { + "epoch": 0.2836405864421542, + "grad_norm": 3.6798760890960693, + "learning_rate": 9.998759421450382e-06, + "loss": 1.0833, + "step": 977 + }, + { + "epoch": 0.2839309043402526, + "grad_norm": 3.5938055515289307, + "learning_rate": 9.998737940873589e-06, + "loss": 1.2577, + "step": 978 + }, + { + "epoch": 0.284221222238351, + "grad_norm": 3.609879970550537, + "learning_rate": 9.998716275945326e-06, + "loss": 1.2261, + "step": 979 + }, + { + "epoch": 0.2845115401364494, + "grad_norm": 4.083144187927246, + "learning_rate": 9.99869442666639e-06, + "loss": 1.37, + "step": 980 + }, + { + "epoch": 0.28480185803454783, + "grad_norm": 3.6036617755889893, + "learning_rate": 9.998672393037587e-06, + "loss": 1.1282, + "step": 981 + }, + { + "epoch": 0.28509217593264624, + "grad_norm": 3.648822784423828, + "learning_rate": 9.99865017505973e-06, + "loss": 1.1288, + "step": 982 + }, + { + "epoch": 0.28538249383074465, + "grad_norm": 3.8245482444763184, + "learning_rate": 9.998627772733638e-06, + "loss": 1.1163, + "step": 983 + }, + { + "epoch": 0.28567281172884307, + "grad_norm": 3.836742877960205, + "learning_rate": 9.998605186060138e-06, + "loss": 1.1848, + "step": 984 + }, + { + "epoch": 0.2859631296269415, + "grad_norm": 3.5548558235168457, + "learning_rate": 9.998582415040061e-06, + "loss": 1.1864, + "step": 985 + }, + { + "epoch": 0.2862534475250399, + "grad_norm": 4.147696018218994, + "learning_rate": 9.99855945967425e-06, + "loss": 1.4001, + "step": 986 + }, + { + "epoch": 0.28654376542313836, + "grad_norm": 3.7722232341766357, + "learning_rate": 9.99853631996355e-06, + "loss": 1.2789, + "step": 987 + }, + { + "epoch": 0.2868340833212368, + "grad_norm": 4.302724838256836, + "learning_rate": 9.998512995908812e-06, + "loss": 1.2114, + "step": 988 + }, + { + "epoch": 0.2871244012193352, + "grad_norm": 4.2343621253967285, + "learning_rate": 9.9984894875109e-06, + "loss": 1.2674, + "step": 989 + }, + { + "epoch": 0.2874147191174336, + "grad_norm": 3.9608490467071533, + "learning_rate": 9.998465794770677e-06, + "loss": 1.0819, + "step": 990 + }, + { + "epoch": 0.287705037015532, + "grad_norm": 3.951963424682617, + "learning_rate": 9.998441917689022e-06, + "loss": 1.2561, + "step": 991 + }, + { + "epoch": 0.28799535491363043, + "grad_norm": 3.7183871269226074, + "learning_rate": 9.998417856266811e-06, + "loss": 1.1932, + "step": 992 + }, + { + "epoch": 0.28828567281172884, + "grad_norm": 3.7486894130706787, + "learning_rate": 9.998393610504933e-06, + "loss": 1.1478, + "step": 993 + }, + { + "epoch": 0.28857599070982726, + "grad_norm": 3.986708402633667, + "learning_rate": 9.998369180404283e-06, + "loss": 1.1849, + "step": 994 + }, + { + "epoch": 0.28886630860792567, + "grad_norm": 3.6684303283691406, + "learning_rate": 9.998344565965761e-06, + "loss": 1.2896, + "step": 995 + }, + { + "epoch": 0.2891566265060241, + "grad_norm": 3.8808441162109375, + "learning_rate": 9.998319767190274e-06, + "loss": 1.3013, + "step": 996 + }, + { + "epoch": 0.2894469444041225, + "grad_norm": 3.917853832244873, + "learning_rate": 9.998294784078739e-06, + "loss": 1.3916, + "step": 997 + }, + { + "epoch": 0.2897372623022209, + "grad_norm": 3.955862045288086, + 
"learning_rate": 9.998269616632075e-06, + "loss": 1.1784, + "step": 998 + }, + { + "epoch": 0.2900275802003193, + "grad_norm": 3.538889169692993, + "learning_rate": 9.998244264851211e-06, + "loss": 0.9783, + "step": 999 + }, + { + "epoch": 0.2903178980984178, + "grad_norm": 3.675344228744507, + "learning_rate": 9.998218728737081e-06, + "loss": 1.3406, + "step": 1000 + }, + { + "epoch": 0.2903178980984178, + "eval_loss": 1.252946376800537, + "eval_runtime": 11.2256, + "eval_samples_per_second": 35.633, + "eval_steps_per_second": 4.454, + "step": 1000 + }, + { + "epoch": 0.2906082159965162, + "grad_norm": 3.7118828296661377, + "learning_rate": 9.99819300829063e-06, + "loss": 1.0587, + "step": 1001 + }, + { + "epoch": 0.2908985338946146, + "grad_norm": 3.9424095153808594, + "learning_rate": 9.998167103512803e-06, + "loss": 1.1582, + "step": 1002 + }, + { + "epoch": 0.29118885179271303, + "grad_norm": 3.7735092639923096, + "learning_rate": 9.998141014404556e-06, + "loss": 1.3521, + "step": 1003 + }, + { + "epoch": 0.29147916969081145, + "grad_norm": 3.752547264099121, + "learning_rate": 9.998114740966853e-06, + "loss": 1.1414, + "step": 1004 + }, + { + "epoch": 0.29176948758890986, + "grad_norm": 3.8838298320770264, + "learning_rate": 9.998088283200662e-06, + "loss": 1.1848, + "step": 1005 + }, + { + "epoch": 0.2920598054870083, + "grad_norm": 4.132805824279785, + "learning_rate": 9.998061641106958e-06, + "loss": 1.288, + "step": 1006 + }, + { + "epoch": 0.2923501233851067, + "grad_norm": 3.8610050678253174, + "learning_rate": 9.998034814686724e-06, + "loss": 1.209, + "step": 1007 + }, + { + "epoch": 0.2926404412832051, + "grad_norm": 3.819197416305542, + "learning_rate": 9.99800780394095e-06, + "loss": 1.1585, + "step": 1008 + }, + { + "epoch": 0.2929307591813035, + "grad_norm": 3.5778913497924805, + "learning_rate": 9.99798060887063e-06, + "loss": 1.0819, + "step": 1009 + }, + { + "epoch": 0.2932210770794019, + "grad_norm": 3.7328646183013916, + "learning_rate": 9.997953229476771e-06, + "loss": 1.1686, + "step": 1010 + }, + { + "epoch": 0.29351139497750034, + "grad_norm": 3.9370815753936768, + "learning_rate": 9.997925665760378e-06, + "loss": 1.1981, + "step": 1011 + }, + { + "epoch": 0.2938017128755988, + "grad_norm": 3.5711724758148193, + "learning_rate": 9.997897917722473e-06, + "loss": 1.162, + "step": 1012 + }, + { + "epoch": 0.2940920307736972, + "grad_norm": 3.807966709136963, + "learning_rate": 9.997869985364073e-06, + "loss": 1.0689, + "step": 1013 + }, + { + "epoch": 0.29438234867179564, + "grad_norm": 3.5610194206237793, + "learning_rate": 9.997841868686211e-06, + "loss": 1.1146, + "step": 1014 + }, + { + "epoch": 0.29467266656989405, + "grad_norm": 3.8267099857330322, + "learning_rate": 9.997813567689926e-06, + "loss": 1.228, + "step": 1015 + }, + { + "epoch": 0.29496298446799246, + "grad_norm": 4.01648473739624, + "learning_rate": 9.99778508237626e-06, + "loss": 1.1465, + "step": 1016 + }, + { + "epoch": 0.2952533023660909, + "grad_norm": 3.702500820159912, + "learning_rate": 9.997756412746262e-06, + "loss": 1.1933, + "step": 1017 + }, + { + "epoch": 0.2955436202641893, + "grad_norm": 3.886366605758667, + "learning_rate": 9.997727558800991e-06, + "loss": 1.2683, + "step": 1018 + }, + { + "epoch": 0.2958339381622877, + "grad_norm": 4.139401912689209, + "learning_rate": 9.997698520541513e-06, + "loss": 1.2807, + "step": 1019 + }, + { + "epoch": 0.2961242560603861, + "grad_norm": 4.107751846313477, + "learning_rate": 9.997669297968895e-06, + "loss": 1.3347, + "step": 1020 + }, + { 
+ "epoch": 0.29641457395848453, + "grad_norm": 3.888638734817505, + "learning_rate": 9.997639891084216e-06, + "loss": 1.2342, + "step": 1021 + }, + { + "epoch": 0.29670489185658294, + "grad_norm": 3.8988595008850098, + "learning_rate": 9.997610299888562e-06, + "loss": 1.2046, + "step": 1022 + }, + { + "epoch": 0.29699520975468136, + "grad_norm": 3.6805219650268555, + "learning_rate": 9.997580524383025e-06, + "loss": 1.1419, + "step": 1023 + }, + { + "epoch": 0.29728552765277977, + "grad_norm": 3.717468500137329, + "learning_rate": 9.997550564568698e-06, + "loss": 1.2272, + "step": 1024 + }, + { + "epoch": 0.29757584555087824, + "grad_norm": 3.684636116027832, + "learning_rate": 9.997520420446694e-06, + "loss": 1.2279, + "step": 1025 + }, + { + "epoch": 0.29786616344897665, + "grad_norm": 3.6968002319335938, + "learning_rate": 9.997490092018117e-06, + "loss": 1.1613, + "step": 1026 + }, + { + "epoch": 0.29815648134707506, + "grad_norm": 4.012862682342529, + "learning_rate": 9.997459579284088e-06, + "loss": 1.1938, + "step": 1027 + }, + { + "epoch": 0.2984467992451735, + "grad_norm": 4.252531051635742, + "learning_rate": 9.997428882245735e-06, + "loss": 1.149, + "step": 1028 + }, + { + "epoch": 0.2987371171432719, + "grad_norm": 3.787094831466675, + "learning_rate": 9.997398000904185e-06, + "loss": 1.2608, + "step": 1029 + }, + { + "epoch": 0.2990274350413703, + "grad_norm": 4.0114970207214355, + "learning_rate": 9.997366935260582e-06, + "loss": 1.185, + "step": 1030 + }, + { + "epoch": 0.2993177529394687, + "grad_norm": 3.625157356262207, + "learning_rate": 9.99733568531607e-06, + "loss": 1.1818, + "step": 1031 + }, + { + "epoch": 0.29960807083756713, + "grad_norm": 3.3687214851379395, + "learning_rate": 9.997304251071802e-06, + "loss": 1.0876, + "step": 1032 + }, + { + "epoch": 0.29989838873566554, + "grad_norm": 3.9616904258728027, + "learning_rate": 9.997272632528933e-06, + "loss": 1.1674, + "step": 1033 + }, + { + "epoch": 0.30018870663376396, + "grad_norm": 4.397826194763184, + "learning_rate": 9.997240829688634e-06, + "loss": 1.3382, + "step": 1034 + }, + { + "epoch": 0.30047902453186237, + "grad_norm": 3.7658543586730957, + "learning_rate": 9.997208842552077e-06, + "loss": 1.1838, + "step": 1035 + }, + { + "epoch": 0.3007693424299608, + "grad_norm": 3.806561231613159, + "learning_rate": 9.99717667112044e-06, + "loss": 1.1805, + "step": 1036 + }, + { + "epoch": 0.3010596603280592, + "grad_norm": 3.5808584690093994, + "learning_rate": 9.997144315394912e-06, + "loss": 1.2062, + "step": 1037 + }, + { + "epoch": 0.30134997822615767, + "grad_norm": 3.2824292182922363, + "learning_rate": 9.997111775376684e-06, + "loss": 1.0395, + "step": 1038 + }, + { + "epoch": 0.3016402961242561, + "grad_norm": 3.9872941970825195, + "learning_rate": 9.997079051066956e-06, + "loss": 1.2192, + "step": 1039 + }, + { + "epoch": 0.3019306140223545, + "grad_norm": 4.112649440765381, + "learning_rate": 9.997046142466935e-06, + "loss": 1.4281, + "step": 1040 + }, + { + "epoch": 0.3022209319204529, + "grad_norm": 3.963346481323242, + "learning_rate": 9.997013049577838e-06, + "loss": 1.2096, + "step": 1041 + }, + { + "epoch": 0.3025112498185513, + "grad_norm": 3.9230425357818604, + "learning_rate": 9.99697977240088e-06, + "loss": 1.2325, + "step": 1042 + }, + { + "epoch": 0.30280156771664973, + "grad_norm": 4.026306629180908, + "learning_rate": 9.996946310937292e-06, + "loss": 1.2818, + "step": 1043 + }, + { + "epoch": 0.30309188561474815, + "grad_norm": 4.02335786819458, + "learning_rate": 9.996912665188308e-06, + 
"loss": 1.3765, + "step": 1044 + }, + { + "epoch": 0.30338220351284656, + "grad_norm": 3.77268123626709, + "learning_rate": 9.996878835155166e-06, + "loss": 1.3176, + "step": 1045 + }, + { + "epoch": 0.303672521410945, + "grad_norm": 4.2044525146484375, + "learning_rate": 9.996844820839115e-06, + "loss": 1.3502, + "step": 1046 + }, + { + "epoch": 0.3039628393090434, + "grad_norm": 3.5329604148864746, + "learning_rate": 9.996810622241412e-06, + "loss": 1.1506, + "step": 1047 + }, + { + "epoch": 0.3042531572071418, + "grad_norm": 3.349825620651245, + "learning_rate": 9.996776239363317e-06, + "loss": 1.0941, + "step": 1048 + }, + { + "epoch": 0.3045434751052402, + "grad_norm": 3.884256362915039, + "learning_rate": 9.996741672206095e-06, + "loss": 1.308, + "step": 1049 + }, + { + "epoch": 0.3048337930033387, + "grad_norm": 3.6708192825317383, + "learning_rate": 9.996706920771024e-06, + "loss": 1.06, + "step": 1050 + }, + { + "epoch": 0.3051241109014371, + "grad_norm": 3.7969107627868652, + "learning_rate": 9.996671985059384e-06, + "loss": 1.2821, + "step": 1051 + }, + { + "epoch": 0.3054144287995355, + "grad_norm": 4.150816917419434, + "learning_rate": 9.996636865072464e-06, + "loss": 1.3209, + "step": 1052 + }, + { + "epoch": 0.3057047466976339, + "grad_norm": 3.5923068523406982, + "learning_rate": 9.99660156081156e-06, + "loss": 1.1685, + "step": 1053 + }, + { + "epoch": 0.30599506459573234, + "grad_norm": 4.074513912200928, + "learning_rate": 9.996566072277974e-06, + "loss": 1.1066, + "step": 1054 + }, + { + "epoch": 0.30628538249383075, + "grad_norm": 3.7009284496307373, + "learning_rate": 9.996530399473012e-06, + "loss": 1.1065, + "step": 1055 + }, + { + "epoch": 0.30657570039192916, + "grad_norm": 3.790034055709839, + "learning_rate": 9.996494542397993e-06, + "loss": 1.2058, + "step": 1056 + }, + { + "epoch": 0.3068660182900276, + "grad_norm": 4.157486915588379, + "learning_rate": 9.996458501054237e-06, + "loss": 1.3369, + "step": 1057 + }, + { + "epoch": 0.307156336188126, + "grad_norm": 4.008849143981934, + "learning_rate": 9.996422275443076e-06, + "loss": 1.3844, + "step": 1058 + }, + { + "epoch": 0.3074466540862244, + "grad_norm": 4.041140556335449, + "learning_rate": 9.996385865565844e-06, + "loss": 1.2306, + "step": 1059 + }, + { + "epoch": 0.3077369719843228, + "grad_norm": 4.257492542266846, + "learning_rate": 9.996349271423883e-06, + "loss": 1.248, + "step": 1060 + }, + { + "epoch": 0.30802728988242123, + "grad_norm": 4.013744354248047, + "learning_rate": 9.996312493018545e-06, + "loss": 1.2645, + "step": 1061 + }, + { + "epoch": 0.30831760778051964, + "grad_norm": 3.783053398132324, + "learning_rate": 9.996275530351184e-06, + "loss": 1.2519, + "step": 1062 + }, + { + "epoch": 0.3086079256786181, + "grad_norm": 4.049034118652344, + "learning_rate": 9.996238383423162e-06, + "loss": 1.2987, + "step": 1063 + }, + { + "epoch": 0.3088982435767165, + "grad_norm": 4.0037078857421875, + "learning_rate": 9.996201052235855e-06, + "loss": 1.3219, + "step": 1064 + }, + { + "epoch": 0.30918856147481494, + "grad_norm": 3.8853280544281006, + "learning_rate": 9.996163536790633e-06, + "loss": 1.3642, + "step": 1065 + }, + { + "epoch": 0.30947887937291335, + "grad_norm": 3.756002902984619, + "learning_rate": 9.996125837088883e-06, + "loss": 1.2355, + "step": 1066 + }, + { + "epoch": 0.30976919727101176, + "grad_norm": 3.9041924476623535, + "learning_rate": 9.996087953131996e-06, + "loss": 1.2097, + "step": 1067 + }, + { + "epoch": 0.3100595151691102, + "grad_norm": 3.773911952972412, + 
"learning_rate": 9.996049884921367e-06, + "loss": 1.1904, + "step": 1068 + }, + { + "epoch": 0.3103498330672086, + "grad_norm": 3.802534341812134, + "learning_rate": 9.996011632458403e-06, + "loss": 1.1983, + "step": 1069 + }, + { + "epoch": 0.310640150965307, + "grad_norm": 3.91593861579895, + "learning_rate": 9.99597319574451e-06, + "loss": 1.3075, + "step": 1070 + }, + { + "epoch": 0.3109304688634054, + "grad_norm": 3.9573280811309814, + "learning_rate": 9.995934574781108e-06, + "loss": 1.3832, + "step": 1071 + }, + { + "epoch": 0.31122078676150383, + "grad_norm": 3.5446033477783203, + "learning_rate": 9.995895769569623e-06, + "loss": 1.1472, + "step": 1072 + }, + { + "epoch": 0.31151110465960224, + "grad_norm": 3.6855850219726562, + "learning_rate": 9.995856780111483e-06, + "loss": 1.1494, + "step": 1073 + }, + { + "epoch": 0.31180142255770066, + "grad_norm": 4.052492618560791, + "learning_rate": 9.995817606408129e-06, + "loss": 1.3019, + "step": 1074 + }, + { + "epoch": 0.3120917404557991, + "grad_norm": 3.6750905513763428, + "learning_rate": 9.995778248461003e-06, + "loss": 1.1294, + "step": 1075 + }, + { + "epoch": 0.31238205835389754, + "grad_norm": 3.975306510925293, + "learning_rate": 9.995738706271559e-06, + "loss": 1.1529, + "step": 1076 + }, + { + "epoch": 0.31267237625199595, + "grad_norm": 3.8198189735412598, + "learning_rate": 9.995698979841253e-06, + "loss": 1.1464, + "step": 1077 + }, + { + "epoch": 0.31296269415009437, + "grad_norm": 3.8802731037139893, + "learning_rate": 9.99565906917155e-06, + "loss": 1.296, + "step": 1078 + }, + { + "epoch": 0.3132530120481928, + "grad_norm": 3.874182939529419, + "learning_rate": 9.995618974263925e-06, + "loss": 1.2741, + "step": 1079 + }, + { + "epoch": 0.3135433299462912, + "grad_norm": 4.022329807281494, + "learning_rate": 9.995578695119856e-06, + "loss": 1.2235, + "step": 1080 + }, + { + "epoch": 0.3138336478443896, + "grad_norm": 3.432136058807373, + "learning_rate": 9.995538231740825e-06, + "loss": 1.1024, + "step": 1081 + }, + { + "epoch": 0.314123965742488, + "grad_norm": 3.90201735496521, + "learning_rate": 9.995497584128326e-06, + "loss": 1.17, + "step": 1082 + }, + { + "epoch": 0.31441428364058643, + "grad_norm": 3.2675185203552246, + "learning_rate": 9.995456752283858e-06, + "loss": 1.0976, + "step": 1083 + }, + { + "epoch": 0.31470460153868485, + "grad_norm": 3.555330991744995, + "learning_rate": 9.99541573620893e-06, + "loss": 1.2697, + "step": 1084 + }, + { + "epoch": 0.31499491943678326, + "grad_norm": 3.853966236114502, + "learning_rate": 9.99537453590505e-06, + "loss": 1.4011, + "step": 1085 + }, + { + "epoch": 0.3152852373348817, + "grad_norm": 3.650466203689575, + "learning_rate": 9.99533315137374e-06, + "loss": 1.1749, + "step": 1086 + }, + { + "epoch": 0.3155755552329801, + "grad_norm": 3.698735475540161, + "learning_rate": 9.995291582616526e-06, + "loss": 1.3977, + "step": 1087 + }, + { + "epoch": 0.31586587313107856, + "grad_norm": 3.5275065898895264, + "learning_rate": 9.99524982963494e-06, + "loss": 1.3007, + "step": 1088 + }, + { + "epoch": 0.31615619102917697, + "grad_norm": 3.885864019393921, + "learning_rate": 9.995207892430525e-06, + "loss": 1.3067, + "step": 1089 + }, + { + "epoch": 0.3164465089272754, + "grad_norm": 3.5765745639801025, + "learning_rate": 9.995165771004821e-06, + "loss": 1.2831, + "step": 1090 + }, + { + "epoch": 0.3167368268253738, + "grad_norm": 4.13949728012085, + "learning_rate": 9.99512346535939e-06, + "loss": 1.3137, + "step": 1091 + }, + { + "epoch": 0.3170271447234722, + 
"grad_norm": 3.839385747909546, + "learning_rate": 9.995080975495786e-06, + "loss": 1.1197, + "step": 1092 + }, + { + "epoch": 0.3173174626215706, + "grad_norm": 3.585883617401123, + "learning_rate": 9.995038301415575e-06, + "loss": 1.043, + "step": 1093 + }, + { + "epoch": 0.31760778051966904, + "grad_norm": 3.585265636444092, + "learning_rate": 9.994995443120338e-06, + "loss": 1.2184, + "step": 1094 + }, + { + "epoch": 0.31789809841776745, + "grad_norm": 3.765455722808838, + "learning_rate": 9.99495240061165e-06, + "loss": 1.1284, + "step": 1095 + }, + { + "epoch": 0.31818841631586586, + "grad_norm": 3.9608914852142334, + "learning_rate": 9.994909173891098e-06, + "loss": 1.2844, + "step": 1096 + }, + { + "epoch": 0.3184787342139643, + "grad_norm": 4.155348777770996, + "learning_rate": 9.99486576296028e-06, + "loss": 1.2291, + "step": 1097 + }, + { + "epoch": 0.3187690521120627, + "grad_norm": 4.106432914733887, + "learning_rate": 9.994822167820794e-06, + "loss": 1.3016, + "step": 1098 + }, + { + "epoch": 0.3190593700101611, + "grad_norm": 3.668353319168091, + "learning_rate": 9.994778388474249e-06, + "loss": 1.1079, + "step": 1099 + }, + { + "epoch": 0.31934968790825957, + "grad_norm": 4.098554611206055, + "learning_rate": 9.994734424922258e-06, + "loss": 1.2308, + "step": 1100 + }, + { + "epoch": 0.319640005806358, + "grad_norm": 3.569974660873413, + "learning_rate": 9.994690277166443e-06, + "loss": 1.144, + "step": 1101 + }, + { + "epoch": 0.3199303237044564, + "grad_norm": 3.9479312896728516, + "learning_rate": 9.994645945208434e-06, + "loss": 1.147, + "step": 1102 + }, + { + "epoch": 0.3202206416025548, + "grad_norm": 3.754945755004883, + "learning_rate": 9.994601429049866e-06, + "loss": 1.2279, + "step": 1103 + }, + { + "epoch": 0.3205109595006532, + "grad_norm": 3.6482317447662354, + "learning_rate": 9.994556728692377e-06, + "loss": 1.1124, + "step": 1104 + }, + { + "epoch": 0.32080127739875164, + "grad_norm": 3.5694377422332764, + "learning_rate": 9.994511844137618e-06, + "loss": 1.1965, + "step": 1105 + }, + { + "epoch": 0.32109159529685005, + "grad_norm": 3.633552312850952, + "learning_rate": 9.994466775387246e-06, + "loss": 1.1248, + "step": 1106 + }, + { + "epoch": 0.32138191319494847, + "grad_norm": 4.080570220947266, + "learning_rate": 9.99442152244292e-06, + "loss": 1.4583, + "step": 1107 + }, + { + "epoch": 0.3216722310930469, + "grad_norm": 3.8583877086639404, + "learning_rate": 9.994376085306309e-06, + "loss": 1.314, + "step": 1108 + }, + { + "epoch": 0.3219625489911453, + "grad_norm": 4.030450820922852, + "learning_rate": 9.994330463979092e-06, + "loss": 1.1375, + "step": 1109 + }, + { + "epoch": 0.3222528668892437, + "grad_norm": 3.8722689151763916, + "learning_rate": 9.994284658462949e-06, + "loss": 1.3931, + "step": 1110 + }, + { + "epoch": 0.3225431847873421, + "grad_norm": 3.761976957321167, + "learning_rate": 9.99423866875957e-06, + "loss": 1.1999, + "step": 1111 + }, + { + "epoch": 0.32283350268544053, + "grad_norm": 3.489006519317627, + "learning_rate": 9.994192494870649e-06, + "loss": 1.1845, + "step": 1112 + }, + { + "epoch": 0.323123820583539, + "grad_norm": 4.012115001678467, + "learning_rate": 9.994146136797893e-06, + "loss": 1.1846, + "step": 1113 + }, + { + "epoch": 0.3234141384816374, + "grad_norm": 4.048895359039307, + "learning_rate": 9.994099594543007e-06, + "loss": 1.2829, + "step": 1114 + }, + { + "epoch": 0.3237044563797358, + "grad_norm": 3.85603666305542, + "learning_rate": 9.994052868107712e-06, + "loss": 1.1342, + "step": 1115 + }, + { + 
"epoch": 0.32399477427783424, + "grad_norm": 3.687089681625366, + "learning_rate": 9.99400595749373e-06, + "loss": 1.1298, + "step": 1116 + }, + { + "epoch": 0.32428509217593265, + "grad_norm": 3.7886598110198975, + "learning_rate": 9.993958862702785e-06, + "loss": 1.4015, + "step": 1117 + }, + { + "epoch": 0.32457541007403107, + "grad_norm": 3.9265501499176025, + "learning_rate": 9.993911583736624e-06, + "loss": 1.2466, + "step": 1118 + }, + { + "epoch": 0.3248657279721295, + "grad_norm": 3.571340560913086, + "learning_rate": 9.993864120596982e-06, + "loss": 1.1224, + "step": 1119 + }, + { + "epoch": 0.3251560458702279, + "grad_norm": 3.711078643798828, + "learning_rate": 9.993816473285615e-06, + "loss": 1.1134, + "step": 1120 + }, + { + "epoch": 0.3254463637683263, + "grad_norm": 3.8613884449005127, + "learning_rate": 9.993768641804279e-06, + "loss": 1.249, + "step": 1121 + }, + { + "epoch": 0.3257366816664247, + "grad_norm": 3.556450605392456, + "learning_rate": 9.993720626154736e-06, + "loss": 1.1877, + "step": 1122 + }, + { + "epoch": 0.32602699956452313, + "grad_norm": 4.229327201843262, + "learning_rate": 9.99367242633876e-06, + "loss": 1.3764, + "step": 1123 + }, + { + "epoch": 0.32631731746262155, + "grad_norm": 3.5248398780822754, + "learning_rate": 9.993624042358123e-06, + "loss": 1.1134, + "step": 1124 + }, + { + "epoch": 0.32660763536071996, + "grad_norm": 3.608933210372925, + "learning_rate": 9.993575474214615e-06, + "loss": 1.1646, + "step": 1125 + }, + { + "epoch": 0.32689795325881843, + "grad_norm": 3.668365001678467, + "learning_rate": 9.993526721910026e-06, + "loss": 1.2625, + "step": 1126 + }, + { + "epoch": 0.32718827115691684, + "grad_norm": 3.6710710525512695, + "learning_rate": 9.993477785446151e-06, + "loss": 1.2838, + "step": 1127 + }, + { + "epoch": 0.32747858905501526, + "grad_norm": 3.607513904571533, + "learning_rate": 9.993428664824798e-06, + "loss": 1.1953, + "step": 1128 + }, + { + "epoch": 0.32776890695311367, + "grad_norm": 4.071550369262695, + "learning_rate": 9.993379360047777e-06, + "loss": 1.1125, + "step": 1129 + }, + { + "epoch": 0.3280592248512121, + "grad_norm": 3.6153531074523926, + "learning_rate": 9.993329871116907e-06, + "loss": 1.0884, + "step": 1130 + }, + { + "epoch": 0.3283495427493105, + "grad_norm": 3.3417906761169434, + "learning_rate": 9.993280198034013e-06, + "loss": 1.046, + "step": 1131 + }, + { + "epoch": 0.3286398606474089, + "grad_norm": 4.090729236602783, + "learning_rate": 9.993230340800926e-06, + "loss": 1.3781, + "step": 1132 + }, + { + "epoch": 0.3289301785455073, + "grad_norm": 3.5112178325653076, + "learning_rate": 9.993180299419487e-06, + "loss": 1.1914, + "step": 1133 + }, + { + "epoch": 0.32922049644360574, + "grad_norm": 4.069597244262695, + "learning_rate": 9.993130073891539e-06, + "loss": 1.2936, + "step": 1134 + }, + { + "epoch": 0.32951081434170415, + "grad_norm": 3.7383646965026855, + "learning_rate": 9.993079664218936e-06, + "loss": 1.1317, + "step": 1135 + }, + { + "epoch": 0.32980113223980256, + "grad_norm": 3.911933422088623, + "learning_rate": 9.993029070403535e-06, + "loss": 1.17, + "step": 1136 + }, + { + "epoch": 0.330091450137901, + "grad_norm": 3.8537962436676025, + "learning_rate": 9.992978292447206e-06, + "loss": 1.2672, + "step": 1137 + }, + { + "epoch": 0.33038176803599945, + "grad_norm": 3.6948013305664062, + "learning_rate": 9.992927330351815e-06, + "loss": 1.2145, + "step": 1138 + }, + { + "epoch": 0.33067208593409786, + "grad_norm": 4.0727362632751465, + "learning_rate": 9.992876184119248e-06, + 
"loss": 1.2109, + "step": 1139 + }, + { + "epoch": 0.3309624038321963, + "grad_norm": 3.8704004287719727, + "learning_rate": 9.99282485375139e-06, + "loss": 1.2516, + "step": 1140 + }, + { + "epoch": 0.3312527217302947, + "grad_norm": 3.7747249603271484, + "learning_rate": 9.99277333925013e-06, + "loss": 1.2104, + "step": 1141 + }, + { + "epoch": 0.3315430396283931, + "grad_norm": 3.8810410499572754, + "learning_rate": 9.992721640617373e-06, + "loss": 1.2335, + "step": 1142 + }, + { + "epoch": 0.3318333575264915, + "grad_norm": 3.924704074859619, + "learning_rate": 9.992669757855022e-06, + "loss": 1.3601, + "step": 1143 + }, + { + "epoch": 0.3321236754245899, + "grad_norm": 3.7031071186065674, + "learning_rate": 9.992617690964992e-06, + "loss": 1.2986, + "step": 1144 + }, + { + "epoch": 0.33241399332268834, + "grad_norm": 3.5863468647003174, + "learning_rate": 9.992565439949202e-06, + "loss": 1.0064, + "step": 1145 + }, + { + "epoch": 0.33270431122078675, + "grad_norm": 3.349553346633911, + "learning_rate": 9.99251300480958e-06, + "loss": 1.0804, + "step": 1146 + }, + { + "epoch": 0.33299462911888517, + "grad_norm": 3.7625350952148438, + "learning_rate": 9.99246038554806e-06, + "loss": 1.2891, + "step": 1147 + }, + { + "epoch": 0.3332849470169836, + "grad_norm": 3.663235664367676, + "learning_rate": 9.992407582166582e-06, + "loss": 1.0882, + "step": 1148 + }, + { + "epoch": 0.333575264915082, + "grad_norm": 4.091626167297363, + "learning_rate": 9.992354594667092e-06, + "loss": 1.3082, + "step": 1149 + }, + { + "epoch": 0.3338655828131804, + "grad_norm": 4.003473281860352, + "learning_rate": 9.99230142305155e-06, + "loss": 1.3269, + "step": 1150 + }, + { + "epoch": 0.3341559007112789, + "grad_norm": 4.316757678985596, + "learning_rate": 9.992248067321908e-06, + "loss": 1.3672, + "step": 1151 + }, + { + "epoch": 0.3344462186093773, + "grad_norm": 3.5924463272094727, + "learning_rate": 9.992194527480141e-06, + "loss": 1.1899, + "step": 1152 + }, + { + "epoch": 0.3347365365074757, + "grad_norm": 3.5745058059692383, + "learning_rate": 9.99214080352822e-06, + "loss": 1.2595, + "step": 1153 + }, + { + "epoch": 0.3350268544055741, + "grad_norm": 3.8298416137695312, + "learning_rate": 9.992086895468126e-06, + "loss": 1.3461, + "step": 1154 + }, + { + "epoch": 0.33531717230367253, + "grad_norm": 3.9122047424316406, + "learning_rate": 9.992032803301852e-06, + "loss": 1.2159, + "step": 1155 + }, + { + "epoch": 0.33560749020177094, + "grad_norm": 3.804358959197998, + "learning_rate": 9.991978527031388e-06, + "loss": 1.2398, + "step": 1156 + }, + { + "epoch": 0.33589780809986935, + "grad_norm": 3.9901576042175293, + "learning_rate": 9.991924066658734e-06, + "loss": 1.3343, + "step": 1157 + }, + { + "epoch": 0.33618812599796777, + "grad_norm": 4.042963027954102, + "learning_rate": 9.991869422185905e-06, + "loss": 1.266, + "step": 1158 + }, + { + "epoch": 0.3364784438960662, + "grad_norm": 3.808166742324829, + "learning_rate": 9.991814593614911e-06, + "loss": 1.3053, + "step": 1159 + }, + { + "epoch": 0.3367687617941646, + "grad_norm": 3.918839931488037, + "learning_rate": 9.991759580947775e-06, + "loss": 1.2586, + "step": 1160 + }, + { + "epoch": 0.337059079692263, + "grad_norm": 4.197708606719971, + "learning_rate": 9.991704384186527e-06, + "loss": 1.4134, + "step": 1161 + }, + { + "epoch": 0.3373493975903614, + "grad_norm": 4.288426876068115, + "learning_rate": 9.991649003333202e-06, + "loss": 1.182, + "step": 1162 + }, + { + "epoch": 0.3376397154884599, + "grad_norm": 3.746020555496216, + 
"learning_rate": 9.991593438389844e-06, + "loss": 1.1078, + "step": 1163 + }, + { + "epoch": 0.3379300333865583, + "grad_norm": 4.072814464569092, + "learning_rate": 9.9915376893585e-06, + "loss": 1.1521, + "step": 1164 + }, + { + "epoch": 0.3382203512846567, + "grad_norm": 3.3874738216400146, + "learning_rate": 9.991481756241228e-06, + "loss": 1.0637, + "step": 1165 + }, + { + "epoch": 0.33851066918275513, + "grad_norm": 3.7892661094665527, + "learning_rate": 9.991425639040088e-06, + "loss": 1.1503, + "step": 1166 + }, + { + "epoch": 0.33880098708085354, + "grad_norm": 3.8184001445770264, + "learning_rate": 9.991369337757152e-06, + "loss": 1.2691, + "step": 1167 + }, + { + "epoch": 0.33909130497895196, + "grad_norm": 3.9826607704162598, + "learning_rate": 9.991312852394495e-06, + "loss": 1.2423, + "step": 1168 + }, + { + "epoch": 0.33938162287705037, + "grad_norm": 3.558635711669922, + "learning_rate": 9.9912561829542e-06, + "loss": 1.1773, + "step": 1169 + }, + { + "epoch": 0.3396719407751488, + "grad_norm": 4.2123847007751465, + "learning_rate": 9.99119932943836e-06, + "loss": 1.2101, + "step": 1170 + }, + { + "epoch": 0.3399622586732472, + "grad_norm": 3.4792020320892334, + "learning_rate": 9.991142291849068e-06, + "loss": 1.172, + "step": 1171 + }, + { + "epoch": 0.3402525765713456, + "grad_norm": 3.54262113571167, + "learning_rate": 9.991085070188429e-06, + "loss": 1.0937, + "step": 1172 + }, + { + "epoch": 0.340542894469444, + "grad_norm": 4.025277614593506, + "learning_rate": 9.991027664458553e-06, + "loss": 1.2719, + "step": 1173 + }, + { + "epoch": 0.34083321236754244, + "grad_norm": 3.762990713119507, + "learning_rate": 9.990970074661558e-06, + "loss": 1.1391, + "step": 1174 + }, + { + "epoch": 0.34112353026564085, + "grad_norm": 3.8915021419525146, + "learning_rate": 9.990912300799567e-06, + "loss": 1.3049, + "step": 1175 + }, + { + "epoch": 0.3414138481637393, + "grad_norm": 4.053305149078369, + "learning_rate": 9.990854342874712e-06, + "loss": 1.302, + "step": 1176 + }, + { + "epoch": 0.34170416606183773, + "grad_norm": 4.007221221923828, + "learning_rate": 9.990796200889129e-06, + "loss": 1.2686, + "step": 1177 + }, + { + "epoch": 0.34199448395993615, + "grad_norm": 3.757418632507324, + "learning_rate": 9.990737874844961e-06, + "loss": 1.1974, + "step": 1178 + }, + { + "epoch": 0.34228480185803456, + "grad_norm": 3.871196746826172, + "learning_rate": 9.99067936474436e-06, + "loss": 1.2349, + "step": 1179 + }, + { + "epoch": 0.342575119756133, + "grad_norm": 4.200139045715332, + "learning_rate": 9.990620670589488e-06, + "loss": 1.2872, + "step": 1180 + }, + { + "epoch": 0.3428654376542314, + "grad_norm": 3.794616460800171, + "learning_rate": 9.990561792382504e-06, + "loss": 1.1205, + "step": 1181 + }, + { + "epoch": 0.3431557555523298, + "grad_norm": 4.073183536529541, + "learning_rate": 9.990502730125583e-06, + "loss": 1.2517, + "step": 1182 + }, + { + "epoch": 0.3434460734504282, + "grad_norm": 3.3805885314941406, + "learning_rate": 9.990443483820899e-06, + "loss": 1.0957, + "step": 1183 + }, + { + "epoch": 0.3437363913485266, + "grad_norm": 3.706669807434082, + "learning_rate": 9.99038405347064e-06, + "loss": 1.1709, + "step": 1184 + }, + { + "epoch": 0.34402670924662504, + "grad_norm": 3.701693296432495, + "learning_rate": 9.990324439077e-06, + "loss": 1.3542, + "step": 1185 + }, + { + "epoch": 0.34431702714472345, + "grad_norm": 3.3958957195281982, + "learning_rate": 9.990264640642175e-06, + "loss": 1.0783, + "step": 1186 + }, + { + "epoch": 0.34460734504282187, + 
"grad_norm": 3.568415641784668, + "learning_rate": 9.990204658168368e-06, + "loss": 1.1532, + "step": 1187 + }, + { + "epoch": 0.34489766294092034, + "grad_norm": 3.5190603733062744, + "learning_rate": 9.990144491657796e-06, + "loss": 1.1625, + "step": 1188 + }, + { + "epoch": 0.34518798083901875, + "grad_norm": 3.578280210494995, + "learning_rate": 9.990084141112674e-06, + "loss": 1.1424, + "step": 1189 + }, + { + "epoch": 0.34547829873711716, + "grad_norm": 3.530015468597412, + "learning_rate": 9.990023606535229e-06, + "loss": 1.2192, + "step": 1190 + }, + { + "epoch": 0.3457686166352156, + "grad_norm": 3.9412999153137207, + "learning_rate": 9.989962887927693e-06, + "loss": 1.2546, + "step": 1191 + }, + { + "epoch": 0.346058934533314, + "grad_norm": 3.7730867862701416, + "learning_rate": 9.989901985292307e-06, + "loss": 1.3085, + "step": 1192 + }, + { + "epoch": 0.3463492524314124, + "grad_norm": 3.756413698196411, + "learning_rate": 9.989840898631316e-06, + "loss": 1.2506, + "step": 1193 + }, + { + "epoch": 0.3466395703295108, + "grad_norm": 3.6548380851745605, + "learning_rate": 9.989779627946974e-06, + "loss": 1.1645, + "step": 1194 + }, + { + "epoch": 0.34692988822760923, + "grad_norm": 3.9941673278808594, + "learning_rate": 9.989718173241537e-06, + "loss": 1.2806, + "step": 1195 + }, + { + "epoch": 0.34722020612570764, + "grad_norm": 4.010866641998291, + "learning_rate": 9.989656534517277e-06, + "loss": 1.283, + "step": 1196 + }, + { + "epoch": 0.34751052402380606, + "grad_norm": 3.7354612350463867, + "learning_rate": 9.98959471177646e-06, + "loss": 1.1878, + "step": 1197 + }, + { + "epoch": 0.34780084192190447, + "grad_norm": 3.2911434173583984, + "learning_rate": 9.989532705021373e-06, + "loss": 1.0222, + "step": 1198 + }, + { + "epoch": 0.3480911598200029, + "grad_norm": 3.4110004901885986, + "learning_rate": 9.989470514254298e-06, + "loss": 1.1306, + "step": 1199 + }, + { + "epoch": 0.3483814777181013, + "grad_norm": 3.56748366355896, + "learning_rate": 9.989408139477532e-06, + "loss": 1.0896, + "step": 1200 + }, + { + "epoch": 0.34867179561619976, + "grad_norm": 3.6176071166992188, + "learning_rate": 9.989345580693372e-06, + "loss": 1.1132, + "step": 1201 + }, + { + "epoch": 0.3489621135142982, + "grad_norm": 4.058175563812256, + "learning_rate": 9.989282837904128e-06, + "loss": 1.2873, + "step": 1202 + }, + { + "epoch": 0.3492524314123966, + "grad_norm": 3.6613640785217285, + "learning_rate": 9.989219911112114e-06, + "loss": 1.1239, + "step": 1203 + }, + { + "epoch": 0.349542749310495, + "grad_norm": 3.6973867416381836, + "learning_rate": 9.989156800319648e-06, + "loss": 1.2085, + "step": 1204 + }, + { + "epoch": 0.3498330672085934, + "grad_norm": 4.278224468231201, + "learning_rate": 9.989093505529061e-06, + "loss": 1.3686, + "step": 1205 + }, + { + "epoch": 0.35012338510669183, + "grad_norm": 3.5927252769470215, + "learning_rate": 9.989030026742683e-06, + "loss": 1.1315, + "step": 1206 + }, + { + "epoch": 0.35041370300479024, + "grad_norm": 3.239856004714966, + "learning_rate": 9.98896636396286e-06, + "loss": 1.1717, + "step": 1207 + }, + { + "epoch": 0.35070402090288866, + "grad_norm": 3.571183204650879, + "learning_rate": 9.988902517191935e-06, + "loss": 1.1082, + "step": 1208 + }, + { + "epoch": 0.35099433880098707, + "grad_norm": 3.4660732746124268, + "learning_rate": 9.988838486432266e-06, + "loss": 1.1124, + "step": 1209 + }, + { + "epoch": 0.3512846566990855, + "grad_norm": 3.6221065521240234, + "learning_rate": 9.988774271686213e-06, + "loss": 1.3044, + "step": 1210 
+ }, + { + "epoch": 0.3515749745971839, + "grad_norm": 3.52908992767334, + "learning_rate": 9.988709872956146e-06, + "loss": 1.1691, + "step": 1211 + }, + { + "epoch": 0.3518652924952823, + "grad_norm": 3.7822394371032715, + "learning_rate": 9.988645290244436e-06, + "loss": 1.3203, + "step": 1212 + }, + { + "epoch": 0.3521556103933807, + "grad_norm": 3.8475475311279297, + "learning_rate": 9.98858052355347e-06, + "loss": 1.1906, + "step": 1213 + }, + { + "epoch": 0.3524459282914792, + "grad_norm": 4.064851760864258, + "learning_rate": 9.988515572885632e-06, + "loss": 1.2655, + "step": 1214 + }, + { + "epoch": 0.3527362461895776, + "grad_norm": 4.1176018714904785, + "learning_rate": 9.98845043824332e-06, + "loss": 1.3391, + "step": 1215 + }, + { + "epoch": 0.353026564087676, + "grad_norm": 3.622924327850342, + "learning_rate": 9.988385119628936e-06, + "loss": 1.1187, + "step": 1216 + }, + { + "epoch": 0.35331688198577443, + "grad_norm": 3.7255032062530518, + "learning_rate": 9.988319617044889e-06, + "loss": 1.25, + "step": 1217 + }, + { + "epoch": 0.35360719988387285, + "grad_norm": 3.67846941947937, + "learning_rate": 9.988253930493592e-06, + "loss": 1.1302, + "step": 1218 + }, + { + "epoch": 0.35389751778197126, + "grad_norm": 3.972423791885376, + "learning_rate": 9.98818805997747e-06, + "loss": 1.2769, + "step": 1219 + }, + { + "epoch": 0.3541878356800697, + "grad_norm": 3.8370683193206787, + "learning_rate": 9.988122005498952e-06, + "loss": 1.3485, + "step": 1220 + }, + { + "epoch": 0.3544781535781681, + "grad_norm": 3.591844320297241, + "learning_rate": 9.988055767060474e-06, + "loss": 1.2438, + "step": 1221 + }, + { + "epoch": 0.3547684714762665, + "grad_norm": 3.620933771133423, + "learning_rate": 9.987989344664479e-06, + "loss": 1.2388, + "step": 1222 + }, + { + "epoch": 0.3550587893743649, + "grad_norm": 3.5270962715148926, + "learning_rate": 9.987922738313417e-06, + "loss": 1.157, + "step": 1223 + }, + { + "epoch": 0.3553491072724633, + "grad_norm": 3.6065704822540283, + "learning_rate": 9.987855948009744e-06, + "loss": 1.1126, + "step": 1224 + }, + { + "epoch": 0.35563942517056174, + "grad_norm": 3.9604432582855225, + "learning_rate": 9.98778897375592e-06, + "loss": 1.3218, + "step": 1225 + }, + { + "epoch": 0.3559297430686602, + "grad_norm": 3.827787160873413, + "learning_rate": 9.987721815554421e-06, + "loss": 1.2084, + "step": 1226 + }, + { + "epoch": 0.3562200609667586, + "grad_norm": 3.869262456893921, + "learning_rate": 9.98765447340772e-06, + "loss": 1.2332, + "step": 1227 + }, + { + "epoch": 0.35651037886485704, + "grad_norm": 3.5749378204345703, + "learning_rate": 9.987586947318302e-06, + "loss": 1.1676, + "step": 1228 + }, + { + "epoch": 0.35680069676295545, + "grad_norm": 3.531912088394165, + "learning_rate": 9.987519237288656e-06, + "loss": 1.2284, + "step": 1229 + }, + { + "epoch": 0.35709101466105386, + "grad_norm": 3.333885431289673, + "learning_rate": 9.98745134332128e-06, + "loss": 1.068, + "step": 1230 + }, + { + "epoch": 0.3573813325591523, + "grad_norm": 3.75718355178833, + "learning_rate": 9.987383265418677e-06, + "loss": 1.1405, + "step": 1231 + }, + { + "epoch": 0.3576716504572507, + "grad_norm": 3.853196859359741, + "learning_rate": 9.987315003583359e-06, + "loss": 1.2619, + "step": 1232 + }, + { + "epoch": 0.3579619683553491, + "grad_norm": 3.7360024452209473, + "learning_rate": 9.987246557817843e-06, + "loss": 1.241, + "step": 1233 + }, + { + "epoch": 0.3582522862534475, + "grad_norm": 3.7324812412261963, + "learning_rate": 9.987177928124651e-06, + 
"loss": 1.053, + "step": 1234 + }, + { + "epoch": 0.35854260415154593, + "grad_norm": 3.9284653663635254, + "learning_rate": 9.98710911450632e-06, + "loss": 1.3436, + "step": 1235 + }, + { + "epoch": 0.35883292204964434, + "grad_norm": 3.787597179412842, + "learning_rate": 9.987040116965381e-06, + "loss": 1.1066, + "step": 1236 + }, + { + "epoch": 0.35912323994774276, + "grad_norm": 3.7411112785339355, + "learning_rate": 9.98697093550438e-06, + "loss": 1.0653, + "step": 1237 + }, + { + "epoch": 0.35941355784584117, + "grad_norm": 3.5020062923431396, + "learning_rate": 9.986901570125873e-06, + "loss": 1.1956, + "step": 1238 + }, + { + "epoch": 0.35970387574393964, + "grad_norm": 3.475775718688965, + "learning_rate": 9.986832020832416e-06, + "loss": 1.1577, + "step": 1239 + }, + { + "epoch": 0.35999419364203805, + "grad_norm": 3.781212568283081, + "learning_rate": 9.98676228762657e-06, + "loss": 1.2847, + "step": 1240 + }, + { + "epoch": 0.36028451154013647, + "grad_norm": 3.5571868419647217, + "learning_rate": 9.98669237051091e-06, + "loss": 1.0893, + "step": 1241 + }, + { + "epoch": 0.3605748294382349, + "grad_norm": 3.7990763187408447, + "learning_rate": 9.986622269488017e-06, + "loss": 1.3096, + "step": 1242 + }, + { + "epoch": 0.3608651473363333, + "grad_norm": 3.936373710632324, + "learning_rate": 9.98655198456047e-06, + "loss": 1.2876, + "step": 1243 + }, + { + "epoch": 0.3611554652344317, + "grad_norm": 3.4436564445495605, + "learning_rate": 9.986481515730868e-06, + "loss": 1.1857, + "step": 1244 + }, + { + "epoch": 0.3614457831325301, + "grad_norm": 3.6510026454925537, + "learning_rate": 9.986410863001806e-06, + "loss": 1.277, + "step": 1245 + }, + { + "epoch": 0.36173610103062853, + "grad_norm": 4.282403469085693, + "learning_rate": 9.986340026375888e-06, + "loss": 1.2899, + "step": 1246 + }, + { + "epoch": 0.36202641892872695, + "grad_norm": 3.948631763458252, + "learning_rate": 9.98626900585573e-06, + "loss": 1.3668, + "step": 1247 + }, + { + "epoch": 0.36231673682682536, + "grad_norm": 3.5207550525665283, + "learning_rate": 9.98619780144395e-06, + "loss": 1.1732, + "step": 1248 + }, + { + "epoch": 0.36260705472492377, + "grad_norm": 3.9342057704925537, + "learning_rate": 9.986126413143173e-06, + "loss": 1.2864, + "step": 1249 + }, + { + "epoch": 0.3628973726230222, + "grad_norm": 4.076601982116699, + "learning_rate": 9.986054840956033e-06, + "loss": 1.3249, + "step": 1250 + }, + { + "epoch": 0.36318769052112065, + "grad_norm": 3.6744585037231445, + "learning_rate": 9.985983084885169e-06, + "loss": 1.1245, + "step": 1251 + }, + { + "epoch": 0.36347800841921907, + "grad_norm": 3.6365158557891846, + "learning_rate": 9.985911144933228e-06, + "loss": 1.1338, + "step": 1252 + }, + { + "epoch": 0.3637683263173175, + "grad_norm": 4.260592937469482, + "learning_rate": 9.985839021102862e-06, + "loss": 1.3485, + "step": 1253 + }, + { + "epoch": 0.3640586442154159, + "grad_norm": 3.7015466690063477, + "learning_rate": 9.985766713396732e-06, + "loss": 1.275, + "step": 1254 + }, + { + "epoch": 0.3643489621135143, + "grad_norm": 3.6575965881347656, + "learning_rate": 9.985694221817504e-06, + "loss": 1.1995, + "step": 1255 + }, + { + "epoch": 0.3646392800116127, + "grad_norm": 3.805546283721924, + "learning_rate": 9.985621546367851e-06, + "loss": 1.2516, + "step": 1256 + }, + { + "epoch": 0.36492959790971113, + "grad_norm": 3.6391587257385254, + "learning_rate": 9.985548687050454e-06, + "loss": 1.1948, + "step": 1257 + }, + { + "epoch": 0.36521991580780955, + "grad_norm": 3.510903835296631, + 
"learning_rate": 9.985475643868e-06, + "loss": 1.1434, + "step": 1258 + }, + { + "epoch": 0.36551023370590796, + "grad_norm": 3.690833806991577, + "learning_rate": 9.985402416823183e-06, + "loss": 1.3163, + "step": 1259 + }, + { + "epoch": 0.3658005516040064, + "grad_norm": 3.6341683864593506, + "learning_rate": 9.985329005918702e-06, + "loss": 1.2531, + "step": 1260 + }, + { + "epoch": 0.3660908695021048, + "grad_norm": 3.9021074771881104, + "learning_rate": 9.985255411157268e-06, + "loss": 1.3222, + "step": 1261 + }, + { + "epoch": 0.3663811874002032, + "grad_norm": 3.5397932529449463, + "learning_rate": 9.985181632541591e-06, + "loss": 1.1676, + "step": 1262 + }, + { + "epoch": 0.3666715052983016, + "grad_norm": 3.973975896835327, + "learning_rate": 9.985107670074394e-06, + "loss": 1.2106, + "step": 1263 + }, + { + "epoch": 0.3669618231964001, + "grad_norm": 3.945737600326538, + "learning_rate": 9.985033523758405e-06, + "loss": 1.2573, + "step": 1264 + }, + { + "epoch": 0.3672521410944985, + "grad_norm": 3.5193498134613037, + "learning_rate": 9.984959193596358e-06, + "loss": 1.1568, + "step": 1265 + }, + { + "epoch": 0.3675424589925969, + "grad_norm": 4.018974781036377, + "learning_rate": 9.984884679590994e-06, + "loss": 1.2194, + "step": 1266 + }, + { + "epoch": 0.3678327768906953, + "grad_norm": 3.666628122329712, + "learning_rate": 9.984809981745061e-06, + "loss": 1.3031, + "step": 1267 + }, + { + "epoch": 0.36812309478879374, + "grad_norm": 3.4612388610839844, + "learning_rate": 9.984735100061313e-06, + "loss": 1.1842, + "step": 1268 + }, + { + "epoch": 0.36841341268689215, + "grad_norm": 4.13927698135376, + "learning_rate": 9.984660034542512e-06, + "loss": 1.3674, + "step": 1269 + }, + { + "epoch": 0.36870373058499056, + "grad_norm": 3.5382606983184814, + "learning_rate": 9.98458478519143e-06, + "loss": 1.1857, + "step": 1270 + }, + { + "epoch": 0.368994048483089, + "grad_norm": 3.827183246612549, + "learning_rate": 9.984509352010839e-06, + "loss": 1.1914, + "step": 1271 + }, + { + "epoch": 0.3692843663811874, + "grad_norm": 3.528890609741211, + "learning_rate": 9.984433735003518e-06, + "loss": 1.1497, + "step": 1272 + }, + { + "epoch": 0.3695746842792858, + "grad_norm": 3.6063666343688965, + "learning_rate": 9.984357934172263e-06, + "loss": 1.2329, + "step": 1273 + }, + { + "epoch": 0.3698650021773842, + "grad_norm": 3.64660382270813, + "learning_rate": 9.984281949519861e-06, + "loss": 1.1589, + "step": 1274 + }, + { + "epoch": 0.37015532007548263, + "grad_norm": 3.4852254390716553, + "learning_rate": 9.984205781049122e-06, + "loss": 1.2945, + "step": 1275 + }, + { + "epoch": 0.3704456379735811, + "grad_norm": 4.028648376464844, + "learning_rate": 9.98412942876285e-06, + "loss": 1.1381, + "step": 1276 + }, + { + "epoch": 0.3707359558716795, + "grad_norm": 3.437859296798706, + "learning_rate": 9.984052892663863e-06, + "loss": 1.099, + "step": 1277 + }, + { + "epoch": 0.3710262737697779, + "grad_norm": 3.5467662811279297, + "learning_rate": 9.983976172754982e-06, + "loss": 1.1857, + "step": 1278 + }, + { + "epoch": 0.37131659166787634, + "grad_norm": 3.897996425628662, + "learning_rate": 9.98389926903904e-06, + "loss": 1.3047, + "step": 1279 + }, + { + "epoch": 0.37160690956597475, + "grad_norm": 3.553786516189575, + "learning_rate": 9.98382218151887e-06, + "loss": 1.1506, + "step": 1280 + }, + { + "epoch": 0.37189722746407317, + "grad_norm": 3.5104734897613525, + "learning_rate": 9.983744910197315e-06, + "loss": 1.2489, + "step": 1281 + }, + { + "epoch": 0.3721875453621716, + 
"grad_norm": 3.6049647331237793, + "learning_rate": 9.983667455077225e-06, + "loss": 1.2921, + "step": 1282 + }, + { + "epoch": 0.37247786326027, + "grad_norm": 3.746884822845459, + "learning_rate": 9.983589816161458e-06, + "loss": 1.0715, + "step": 1283 + }, + { + "epoch": 0.3727681811583684, + "grad_norm": 3.4639060497283936, + "learning_rate": 9.983511993452875e-06, + "loss": 1.2717, + "step": 1284 + }, + { + "epoch": 0.3730584990564668, + "grad_norm": 4.013452529907227, + "learning_rate": 9.983433986954349e-06, + "loss": 1.3516, + "step": 1285 + }, + { + "epoch": 0.37334881695456523, + "grad_norm": 3.8270010948181152, + "learning_rate": 9.983355796668755e-06, + "loss": 1.4126, + "step": 1286 + }, + { + "epoch": 0.37363913485266365, + "grad_norm": 3.6755404472351074, + "learning_rate": 9.983277422598976e-06, + "loss": 1.1109, + "step": 1287 + }, + { + "epoch": 0.37392945275076206, + "grad_norm": 3.8300483226776123, + "learning_rate": 9.983198864747904e-06, + "loss": 1.0732, + "step": 1288 + }, + { + "epoch": 0.37421977064886053, + "grad_norm": 3.9538397789001465, + "learning_rate": 9.983120123118435e-06, + "loss": 1.3122, + "step": 1289 + }, + { + "epoch": 0.37451008854695894, + "grad_norm": 3.865281343460083, + "learning_rate": 9.983041197713473e-06, + "loss": 1.3144, + "step": 1290 + }, + { + "epoch": 0.37480040644505735, + "grad_norm": 3.875990152359009, + "learning_rate": 9.982962088535928e-06, + "loss": 1.1896, + "step": 1291 + }, + { + "epoch": 0.37509072434315577, + "grad_norm": 3.8319966793060303, + "learning_rate": 9.98288279558872e-06, + "loss": 1.1693, + "step": 1292 + }, + { + "epoch": 0.3753810422412542, + "grad_norm": 3.9637584686279297, + "learning_rate": 9.982803318874772e-06, + "loss": 1.3056, + "step": 1293 + }, + { + "epoch": 0.3756713601393526, + "grad_norm": 3.718834400177002, + "learning_rate": 9.982723658397016e-06, + "loss": 1.3783, + "step": 1294 + }, + { + "epoch": 0.375961678037451, + "grad_norm": 3.859952688217163, + "learning_rate": 9.982643814158387e-06, + "loss": 1.224, + "step": 1295 + }, + { + "epoch": 0.3762519959355494, + "grad_norm": 3.4103081226348877, + "learning_rate": 9.982563786161831e-06, + "loss": 1.1378, + "step": 1296 + }, + { + "epoch": 0.37654231383364783, + "grad_norm": 3.879765510559082, + "learning_rate": 9.982483574410302e-06, + "loss": 1.2272, + "step": 1297 + }, + { + "epoch": 0.37683263173174625, + "grad_norm": 3.8443405628204346, + "learning_rate": 9.982403178906755e-06, + "loss": 1.2383, + "step": 1298 + }, + { + "epoch": 0.37712294962984466, + "grad_norm": 3.5465097427368164, + "learning_rate": 9.982322599654156e-06, + "loss": 1.1018, + "step": 1299 + }, + { + "epoch": 0.3774132675279431, + "grad_norm": 4.120823383331299, + "learning_rate": 9.982241836655475e-06, + "loss": 1.4552, + "step": 1300 + }, + { + "epoch": 0.3777035854260415, + "grad_norm": 3.9285216331481934, + "learning_rate": 9.982160889913695e-06, + "loss": 1.3464, + "step": 1301 + }, + { + "epoch": 0.37799390332413996, + "grad_norm": 3.467785596847534, + "learning_rate": 9.982079759431797e-06, + "loss": 1.0364, + "step": 1302 + }, + { + "epoch": 0.37828422122223837, + "grad_norm": 3.7329118251800537, + "learning_rate": 9.981998445212775e-06, + "loss": 1.3733, + "step": 1303 + }, + { + "epoch": 0.3785745391203368, + "grad_norm": 3.560277223587036, + "learning_rate": 9.981916947259627e-06, + "loss": 1.2214, + "step": 1304 + }, + { + "epoch": 0.3788648570184352, + "grad_norm": 3.2049508094787598, + "learning_rate": 9.981835265575358e-06, + "loss": 1.1433, + "step": 1305 
+ }, + { + "epoch": 0.3791551749165336, + "grad_norm": 3.6437489986419678, + "learning_rate": 9.981753400162984e-06, + "loss": 1.1825, + "step": 1306 + }, + { + "epoch": 0.379445492814632, + "grad_norm": 3.253337860107422, + "learning_rate": 9.981671351025519e-06, + "loss": 1.0779, + "step": 1307 + }, + { + "epoch": 0.37973581071273044, + "grad_norm": 3.6426970958709717, + "learning_rate": 9.981589118165993e-06, + "loss": 1.3683, + "step": 1308 + }, + { + "epoch": 0.38002612861082885, + "grad_norm": 3.8423707485198975, + "learning_rate": 9.981506701587437e-06, + "loss": 1.1725, + "step": 1309 + }, + { + "epoch": 0.38031644650892726, + "grad_norm": 3.6762940883636475, + "learning_rate": 9.98142410129289e-06, + "loss": 1.1383, + "step": 1310 + }, + { + "epoch": 0.3806067644070257, + "grad_norm": 3.8239686489105225, + "learning_rate": 9.9813413172854e-06, + "loss": 1.2646, + "step": 1311 + }, + { + "epoch": 0.3808970823051241, + "grad_norm": 3.683504581451416, + "learning_rate": 9.981258349568018e-06, + "loss": 1.2585, + "step": 1312 + }, + { + "epoch": 0.3811874002032225, + "grad_norm": 3.893596649169922, + "learning_rate": 9.981175198143805e-06, + "loss": 1.231, + "step": 1313 + }, + { + "epoch": 0.381477718101321, + "grad_norm": 3.4069478511810303, + "learning_rate": 9.981091863015828e-06, + "loss": 1.0599, + "step": 1314 + }, + { + "epoch": 0.3817680359994194, + "grad_norm": 3.19846248626709, + "learning_rate": 9.981008344187159e-06, + "loss": 1.0661, + "step": 1315 + }, + { + "epoch": 0.3820583538975178, + "grad_norm": 3.7466282844543457, + "learning_rate": 9.98092464166088e-06, + "loss": 1.1942, + "step": 1316 + }, + { + "epoch": 0.3823486717956162, + "grad_norm": 3.7203147411346436, + "learning_rate": 9.980840755440075e-06, + "loss": 1.1872, + "step": 1317 + }, + { + "epoch": 0.3826389896937146, + "grad_norm": 3.3040809631347656, + "learning_rate": 9.980756685527841e-06, + "loss": 1.0091, + "step": 1318 + }, + { + "epoch": 0.38292930759181304, + "grad_norm": 3.2888503074645996, + "learning_rate": 9.980672431927278e-06, + "loss": 1.148, + "step": 1319 + }, + { + "epoch": 0.38321962548991145, + "grad_norm": 3.654926061630249, + "learning_rate": 9.980587994641491e-06, + "loss": 1.3017, + "step": 1320 + }, + { + "epoch": 0.38350994338800987, + "grad_norm": 3.980696439743042, + "learning_rate": 9.980503373673594e-06, + "loss": 1.3312, + "step": 1321 + }, + { + "epoch": 0.3838002612861083, + "grad_norm": 3.6352922916412354, + "learning_rate": 9.980418569026711e-06, + "loss": 1.3227, + "step": 1322 + }, + { + "epoch": 0.3840905791842067, + "grad_norm": 3.5730032920837402, + "learning_rate": 9.980333580703968e-06, + "loss": 1.2282, + "step": 1323 + }, + { + "epoch": 0.3843808970823051, + "grad_norm": 3.418905258178711, + "learning_rate": 9.980248408708497e-06, + "loss": 1.1507, + "step": 1324 + }, + { + "epoch": 0.3846712149804035, + "grad_norm": 3.594193696975708, + "learning_rate": 9.980163053043441e-06, + "loss": 1.2218, + "step": 1325 + }, + { + "epoch": 0.38496153287850193, + "grad_norm": 3.8186099529266357, + "learning_rate": 9.98007751371195e-06, + "loss": 1.1744, + "step": 1326 + }, + { + "epoch": 0.3852518507766004, + "grad_norm": 3.8397912979125977, + "learning_rate": 9.979991790717174e-06, + "loss": 1.2721, + "step": 1327 + }, + { + "epoch": 0.3855421686746988, + "grad_norm": 3.193303346633911, + "learning_rate": 9.97990588406228e-06, + "loss": 1.1092, + "step": 1328 + }, + { + "epoch": 0.38583248657279723, + "grad_norm": 3.7081987857818604, + "learning_rate": 9.97981979375043e-06, 
+ "loss": 1.2054, + "step": 1329 + }, + { + "epoch": 0.38612280447089564, + "grad_norm": 3.6489391326904297, + "learning_rate": 9.979733519784804e-06, + "loss": 1.2679, + "step": 1330 + }, + { + "epoch": 0.38641312236899406, + "grad_norm": 3.412721633911133, + "learning_rate": 9.979647062168582e-06, + "loss": 1.049, + "step": 1331 + }, + { + "epoch": 0.38670344026709247, + "grad_norm": 3.916553258895874, + "learning_rate": 9.979560420904953e-06, + "loss": 1.4672, + "step": 1332 + }, + { + "epoch": 0.3869937581651909, + "grad_norm": 3.6796796321868896, + "learning_rate": 9.97947359599711e-06, + "loss": 1.2632, + "step": 1333 + }, + { + "epoch": 0.3872840760632893, + "grad_norm": 3.4813990592956543, + "learning_rate": 9.979386587448257e-06, + "loss": 1.1071, + "step": 1334 + }, + { + "epoch": 0.3875743939613877, + "grad_norm": 3.768031120300293, + "learning_rate": 9.979299395261604e-06, + "loss": 1.3182, + "step": 1335 + }, + { + "epoch": 0.3878647118594861, + "grad_norm": 3.838653087615967, + "learning_rate": 9.979212019440364e-06, + "loss": 1.3277, + "step": 1336 + }, + { + "epoch": 0.38815502975758454, + "grad_norm": 3.5848910808563232, + "learning_rate": 9.97912445998776e-06, + "loss": 1.1353, + "step": 1337 + }, + { + "epoch": 0.38844534765568295, + "grad_norm": 3.538034439086914, + "learning_rate": 9.979036716907025e-06, + "loss": 1.3567, + "step": 1338 + }, + { + "epoch": 0.3887356655537814, + "grad_norm": 3.8515238761901855, + "learning_rate": 9.978948790201388e-06, + "loss": 1.1621, + "step": 1339 + }, + { + "epoch": 0.38902598345187983, + "grad_norm": 3.3468730449676514, + "learning_rate": 9.978860679874098e-06, + "loss": 1.1637, + "step": 1340 + }, + { + "epoch": 0.38931630134997824, + "grad_norm": 3.7249915599823, + "learning_rate": 9.9787723859284e-06, + "loss": 1.1381, + "step": 1341 + }, + { + "epoch": 0.38960661924807666, + "grad_norm": 3.5593464374542236, + "learning_rate": 9.978683908367555e-06, + "loss": 1.2549, + "step": 1342 + }, + { + "epoch": 0.38989693714617507, + "grad_norm": 3.818927526473999, + "learning_rate": 9.978595247194822e-06, + "loss": 1.3647, + "step": 1343 + }, + { + "epoch": 0.3901872550442735, + "grad_norm": 3.786468744277954, + "learning_rate": 9.978506402413472e-06, + "loss": 1.1994, + "step": 1344 + }, + { + "epoch": 0.3904775729423719, + "grad_norm": 3.9170660972595215, + "learning_rate": 9.97841737402678e-06, + "loss": 1.1363, + "step": 1345 + }, + { + "epoch": 0.3907678908404703, + "grad_norm": 3.4517476558685303, + "learning_rate": 9.978328162038032e-06, + "loss": 1.1165, + "step": 1346 + }, + { + "epoch": 0.3910582087385687, + "grad_norm": 3.631568670272827, + "learning_rate": 9.978238766450518e-06, + "loss": 1.199, + "step": 1347 + }, + { + "epoch": 0.39134852663666714, + "grad_norm": 3.3780012130737305, + "learning_rate": 9.978149187267532e-06, + "loss": 1.0625, + "step": 1348 + }, + { + "epoch": 0.39163884453476555, + "grad_norm": 3.4305973052978516, + "learning_rate": 9.97805942449238e-06, + "loss": 1.1939, + "step": 1349 + }, + { + "epoch": 0.39192916243286396, + "grad_norm": 3.3205480575561523, + "learning_rate": 9.977969478128373e-06, + "loss": 1.2248, + "step": 1350 + }, + { + "epoch": 0.3922194803309624, + "grad_norm": 3.6359150409698486, + "learning_rate": 9.977879348178826e-06, + "loss": 1.3019, + "step": 1351 + }, + { + "epoch": 0.39250979822906085, + "grad_norm": 3.7038495540618896, + "learning_rate": 9.977789034647066e-06, + "loss": 1.2069, + "step": 1352 + }, + { + "epoch": 0.39280011612715926, + "grad_norm": 3.569873094558716, + 
"learning_rate": 9.97769853753642e-06, + "loss": 1.2185, + "step": 1353 + }, + { + "epoch": 0.3930904340252577, + "grad_norm": 4.010556221008301, + "learning_rate": 9.977607856850227e-06, + "loss": 1.4308, + "step": 1354 + }, + { + "epoch": 0.3933807519233561, + "grad_norm": 3.732271432876587, + "learning_rate": 9.977516992591832e-06, + "loss": 1.3511, + "step": 1355 + }, + { + "epoch": 0.3936710698214545, + "grad_norm": 3.649620771408081, + "learning_rate": 9.977425944764585e-06, + "loss": 1.2222, + "step": 1356 + }, + { + "epoch": 0.3939613877195529, + "grad_norm": 3.5589444637298584, + "learning_rate": 9.977334713371844e-06, + "loss": 1.1794, + "step": 1357 + }, + { + "epoch": 0.3942517056176513, + "grad_norm": 3.443727970123291, + "learning_rate": 9.977243298416976e-06, + "loss": 1.2031, + "step": 1358 + }, + { + "epoch": 0.39454202351574974, + "grad_norm": 3.4052302837371826, + "learning_rate": 9.977151699903349e-06, + "loss": 1.3753, + "step": 1359 + }, + { + "epoch": 0.39483234141384815, + "grad_norm": 3.364332675933838, + "learning_rate": 9.977059917834342e-06, + "loss": 1.1101, + "step": 1360 + }, + { + "epoch": 0.39512265931194657, + "grad_norm": 3.46517276763916, + "learning_rate": 9.97696795221334e-06, + "loss": 1.1746, + "step": 1361 + }, + { + "epoch": 0.395412977210045, + "grad_norm": 3.6271650791168213, + "learning_rate": 9.976875803043737e-06, + "loss": 1.1741, + "step": 1362 + }, + { + "epoch": 0.3957032951081434, + "grad_norm": 3.873410224914551, + "learning_rate": 9.976783470328928e-06, + "loss": 1.2825, + "step": 1363 + }, + { + "epoch": 0.39599361300624186, + "grad_norm": 3.7868969440460205, + "learning_rate": 9.97669095407232e-06, + "loss": 1.3593, + "step": 1364 + }, + { + "epoch": 0.3962839309043403, + "grad_norm": 3.300156354904175, + "learning_rate": 9.976598254277324e-06, + "loss": 1.106, + "step": 1365 + }, + { + "epoch": 0.3965742488024387, + "grad_norm": 4.34855318069458, + "learning_rate": 9.97650537094736e-06, + "loss": 1.1779, + "step": 1366 + }, + { + "epoch": 0.3968645667005371, + "grad_norm": 3.3535711765289307, + "learning_rate": 9.976412304085852e-06, + "loss": 1.091, + "step": 1367 + }, + { + "epoch": 0.3971548845986355, + "grad_norm": 3.616659641265869, + "learning_rate": 9.976319053696236e-06, + "loss": 1.1698, + "step": 1368 + }, + { + "epoch": 0.39744520249673393, + "grad_norm": 3.9007325172424316, + "learning_rate": 9.976225619781944e-06, + "loss": 1.3209, + "step": 1369 + }, + { + "epoch": 0.39773552039483234, + "grad_norm": 3.554885149002075, + "learning_rate": 9.976132002346429e-06, + "loss": 1.0978, + "step": 1370 + }, + { + "epoch": 0.39802583829293076, + "grad_norm": 3.662487506866455, + "learning_rate": 9.976038201393138e-06, + "loss": 1.3094, + "step": 1371 + }, + { + "epoch": 0.39831615619102917, + "grad_norm": 3.5315754413604736, + "learning_rate": 9.975944216925533e-06, + "loss": 1.1677, + "step": 1372 + }, + { + "epoch": 0.3986064740891276, + "grad_norm": 3.787691831588745, + "learning_rate": 9.975850048947082e-06, + "loss": 1.294, + "step": 1373 + }, + { + "epoch": 0.398896791987226, + "grad_norm": 3.4021782875061035, + "learning_rate": 9.975755697461254e-06, + "loss": 1.1671, + "step": 1374 + }, + { + "epoch": 0.3991871098853244, + "grad_norm": 3.5344481468200684, + "learning_rate": 9.975661162471531e-06, + "loss": 1.061, + "step": 1375 + }, + { + "epoch": 0.3994774277834228, + "grad_norm": 3.530378580093384, + "learning_rate": 9.9755664439814e-06, + "loss": 1.1311, + "step": 1376 + }, + { + "epoch": 0.3997677456815213, + 
"grad_norm": 3.5945799350738525, + "learning_rate": 9.97547154199435e-06, + "loss": 1.0421, + "step": 1377 + }, + { + "epoch": 0.4000580635796197, + "grad_norm": 3.523029327392578, + "learning_rate": 9.975376456513886e-06, + "loss": 1.1865, + "step": 1378 + }, + { + "epoch": 0.4003483814777181, + "grad_norm": 3.855416774749756, + "learning_rate": 9.975281187543514e-06, + "loss": 1.3703, + "step": 1379 + }, + { + "epoch": 0.40063869937581653, + "grad_norm": 4.034465789794922, + "learning_rate": 9.975185735086745e-06, + "loss": 1.309, + "step": 1380 + }, + { + "epoch": 0.40092901727391494, + "grad_norm": 4.100909233093262, + "learning_rate": 9.9750900991471e-06, + "loss": 1.3027, + "step": 1381 + }, + { + "epoch": 0.40121933517201336, + "grad_norm": 3.6835947036743164, + "learning_rate": 9.974994279728105e-06, + "loss": 1.1245, + "step": 1382 + }, + { + "epoch": 0.40150965307011177, + "grad_norm": 3.456866979598999, + "learning_rate": 9.974898276833298e-06, + "loss": 1.1117, + "step": 1383 + }, + { + "epoch": 0.4017999709682102, + "grad_norm": 3.656215190887451, + "learning_rate": 9.974802090466216e-06, + "loss": 1.2049, + "step": 1384 + }, + { + "epoch": 0.4020902888663086, + "grad_norm": 4.105678081512451, + "learning_rate": 9.974705720630407e-06, + "loss": 1.4034, + "step": 1385 + }, + { + "epoch": 0.402380606764407, + "grad_norm": 3.769406795501709, + "learning_rate": 9.974609167329425e-06, + "loss": 1.365, + "step": 1386 + }, + { + "epoch": 0.4026709246625054, + "grad_norm": 3.7818362712860107, + "learning_rate": 9.974512430566829e-06, + "loss": 1.1959, + "step": 1387 + }, + { + "epoch": 0.40296124256060384, + "grad_norm": 3.7046732902526855, + "learning_rate": 9.974415510346192e-06, + "loss": 1.276, + "step": 1388 + }, + { + "epoch": 0.40325156045870225, + "grad_norm": 4.240913391113281, + "learning_rate": 9.974318406671083e-06, + "loss": 1.3754, + "step": 1389 + }, + { + "epoch": 0.4035418783568007, + "grad_norm": 3.827770948410034, + "learning_rate": 9.974221119545086e-06, + "loss": 1.1494, + "step": 1390 + }, + { + "epoch": 0.40383219625489913, + "grad_norm": 3.8236684799194336, + "learning_rate": 9.974123648971787e-06, + "loss": 1.3407, + "step": 1391 + }, + { + "epoch": 0.40412251415299755, + "grad_norm": 3.5897345542907715, + "learning_rate": 9.974025994954783e-06, + "loss": 1.1962, + "step": 1392 + }, + { + "epoch": 0.40441283205109596, + "grad_norm": 3.6147966384887695, + "learning_rate": 9.973928157497675e-06, + "loss": 1.2777, + "step": 1393 + }, + { + "epoch": 0.4047031499491944, + "grad_norm": 3.617846727371216, + "learning_rate": 9.973830136604068e-06, + "loss": 1.2909, + "step": 1394 + }, + { + "epoch": 0.4049934678472928, + "grad_norm": 3.4171886444091797, + "learning_rate": 9.973731932277581e-06, + "loss": 1.0739, + "step": 1395 + }, + { + "epoch": 0.4052837857453912, + "grad_norm": 3.370614767074585, + "learning_rate": 9.973633544521834e-06, + "loss": 1.1842, + "step": 1396 + }, + { + "epoch": 0.4055741036434896, + "grad_norm": 3.4126060009002686, + "learning_rate": 9.973534973340456e-06, + "loss": 1.1144, + "step": 1397 + }, + { + "epoch": 0.405864421541588, + "grad_norm": 3.8534622192382812, + "learning_rate": 9.97343621873708e-06, + "loss": 1.1885, + "step": 1398 + }, + { + "epoch": 0.40615473943968644, + "grad_norm": 3.420496940612793, + "learning_rate": 9.973337280715351e-06, + "loss": 1.1136, + "step": 1399 + }, + { + "epoch": 0.40644505733778485, + "grad_norm": 3.775999069213867, + "learning_rate": 9.973238159278917e-06, + "loss": 1.2418, + "step": 1400 + }, + { 
+ "epoch": 0.40673537523588327, + "grad_norm": 3.9710724353790283, + "learning_rate": 9.973138854431433e-06, + "loss": 1.2584, + "step": 1401 + }, + { + "epoch": 0.40702569313398174, + "grad_norm": 3.2783279418945312, + "learning_rate": 9.97303936617656e-06, + "loss": 1.1942, + "step": 1402 + }, + { + "epoch": 0.40731601103208015, + "grad_norm": 3.7478137016296387, + "learning_rate": 9.972939694517971e-06, + "loss": 1.1562, + "step": 1403 + }, + { + "epoch": 0.40760632893017856, + "grad_norm": 3.628674030303955, + "learning_rate": 9.97283983945934e-06, + "loss": 1.2307, + "step": 1404 + }, + { + "epoch": 0.407896646828277, + "grad_norm": 3.313133716583252, + "learning_rate": 9.972739801004347e-06, + "loss": 1.0223, + "step": 1405 + }, + { + "epoch": 0.4081869647263754, + "grad_norm": 3.7657248973846436, + "learning_rate": 9.972639579156684e-06, + "loss": 1.2811, + "step": 1406 + }, + { + "epoch": 0.4084772826244738, + "grad_norm": 3.6290464401245117, + "learning_rate": 9.972539173920048e-06, + "loss": 1.1364, + "step": 1407 + }, + { + "epoch": 0.4087676005225722, + "grad_norm": 3.805755376815796, + "learning_rate": 9.972438585298139e-06, + "loss": 1.3117, + "step": 1408 + }, + { + "epoch": 0.40905791842067063, + "grad_norm": 3.6081717014312744, + "learning_rate": 9.972337813294668e-06, + "loss": 1.308, + "step": 1409 + }, + { + "epoch": 0.40934823631876904, + "grad_norm": 3.5983402729034424, + "learning_rate": 9.972236857913354e-06, + "loss": 1.1535, + "step": 1410 + }, + { + "epoch": 0.40963855421686746, + "grad_norm": 3.7803027629852295, + "learning_rate": 9.972135719157916e-06, + "loss": 1.3223, + "step": 1411 + }, + { + "epoch": 0.40992887211496587, + "grad_norm": 3.356072425842285, + "learning_rate": 9.972034397032086e-06, + "loss": 1.103, + "step": 1412 + }, + { + "epoch": 0.4102191900130643, + "grad_norm": 3.7912418842315674, + "learning_rate": 9.9719328915396e-06, + "loss": 1.1965, + "step": 1413 + }, + { + "epoch": 0.4105095079111627, + "grad_norm": 3.382089138031006, + "learning_rate": 9.971831202684203e-06, + "loss": 1.1991, + "step": 1414 + }, + { + "epoch": 0.41079982580926117, + "grad_norm": 3.6623477935791016, + "learning_rate": 9.971729330469644e-06, + "loss": 1.1763, + "step": 1415 + }, + { + "epoch": 0.4110901437073596, + "grad_norm": 3.4154701232910156, + "learning_rate": 9.97162727489968e-06, + "loss": 1.1038, + "step": 1416 + }, + { + "epoch": 0.411380461605458, + "grad_norm": 3.7780191898345947, + "learning_rate": 9.971525035978076e-06, + "loss": 1.315, + "step": 1417 + }, + { + "epoch": 0.4116707795035564, + "grad_norm": 3.626234292984009, + "learning_rate": 9.971422613708602e-06, + "loss": 1.2964, + "step": 1418 + }, + { + "epoch": 0.4119610974016548, + "grad_norm": 3.3718817234039307, + "learning_rate": 9.971320008095031e-06, + "loss": 1.2485, + "step": 1419 + }, + { + "epoch": 0.41225141529975323, + "grad_norm": 3.4189116954803467, + "learning_rate": 9.971217219141156e-06, + "loss": 1.1006, + "step": 1420 + }, + { + "epoch": 0.41254173319785165, + "grad_norm": 3.846132516860962, + "learning_rate": 9.97111424685076e-06, + "loss": 1.283, + "step": 1421 + }, + { + "epoch": 0.41283205109595006, + "grad_norm": 3.672684669494629, + "learning_rate": 9.971011091227642e-06, + "loss": 1.3357, + "step": 1422 + }, + { + "epoch": 0.41312236899404847, + "grad_norm": 3.523810863494873, + "learning_rate": 9.970907752275609e-06, + "loss": 1.2956, + "step": 1423 + }, + { + "epoch": 0.4134126868921469, + "grad_norm": 3.600360155105591, + "learning_rate": 9.970804229998472e-06, + 
"loss": 1.2537, + "step": 1424 + }, + { + "epoch": 0.4137030047902453, + "grad_norm": 3.5895142555236816, + "learning_rate": 9.970700524400047e-06, + "loss": 1.1542, + "step": 1425 + }, + { + "epoch": 0.4139933226883437, + "grad_norm": 3.9078710079193115, + "learning_rate": 9.970596635484158e-06, + "loss": 1.1888, + "step": 1426 + }, + { + "epoch": 0.4142836405864422, + "grad_norm": 3.8377363681793213, + "learning_rate": 9.970492563254638e-06, + "loss": 1.2513, + "step": 1427 + }, + { + "epoch": 0.4145739584845406, + "grad_norm": 3.7490737438201904, + "learning_rate": 9.970388307715326e-06, + "loss": 1.25, + "step": 1428 + }, + { + "epoch": 0.414864276382639, + "grad_norm": 3.806488275527954, + "learning_rate": 9.970283868870065e-06, + "loss": 1.1911, + "step": 1429 + }, + { + "epoch": 0.4151545942807374, + "grad_norm": 3.4695956707000732, + "learning_rate": 9.970179246722707e-06, + "loss": 1.1784, + "step": 1430 + }, + { + "epoch": 0.41544491217883583, + "grad_norm": 3.5068411827087402, + "learning_rate": 9.970074441277111e-06, + "loss": 1.2052, + "step": 1431 + }, + { + "epoch": 0.41573523007693425, + "grad_norm": 3.612985134124756, + "learning_rate": 9.96996945253714e-06, + "loss": 1.2254, + "step": 1432 + }, + { + "epoch": 0.41602554797503266, + "grad_norm": 3.5536611080169678, + "learning_rate": 9.96986428050667e-06, + "loss": 1.2219, + "step": 1433 + }, + { + "epoch": 0.4163158658731311, + "grad_norm": 3.725837469100952, + "learning_rate": 9.96975892518958e-06, + "loss": 1.174, + "step": 1434 + }, + { + "epoch": 0.4166061837712295, + "grad_norm": 3.201591968536377, + "learning_rate": 9.969653386589749e-06, + "loss": 0.9781, + "step": 1435 + }, + { + "epoch": 0.4168965016693279, + "grad_norm": 3.9703338146209717, + "learning_rate": 9.969547664711074e-06, + "loss": 1.1812, + "step": 1436 + }, + { + "epoch": 0.4171868195674263, + "grad_norm": 3.7230799198150635, + "learning_rate": 9.969441759557453e-06, + "loss": 1.102, + "step": 1437 + }, + { + "epoch": 0.41747713746552473, + "grad_norm": 3.4397854804992676, + "learning_rate": 9.969335671132793e-06, + "loss": 1.1384, + "step": 1438 + }, + { + "epoch": 0.41776745536362314, + "grad_norm": 3.201946258544922, + "learning_rate": 9.969229399441006e-06, + "loss": 1.0366, + "step": 1439 + }, + { + "epoch": 0.4180577732617216, + "grad_norm": 3.333623170852661, + "learning_rate": 9.96912294448601e-06, + "loss": 1.1551, + "step": 1440 + }, + { + "epoch": 0.41834809115982, + "grad_norm": 3.6181843280792236, + "learning_rate": 9.969016306271731e-06, + "loss": 1.2059, + "step": 1441 + }, + { + "epoch": 0.41863840905791844, + "grad_norm": 3.383269786834717, + "learning_rate": 9.968909484802103e-06, + "loss": 1.2181, + "step": 1442 + }, + { + "epoch": 0.41892872695601685, + "grad_norm": 3.3849494457244873, + "learning_rate": 9.968802480081065e-06, + "loss": 1.1045, + "step": 1443 + }, + { + "epoch": 0.41921904485411526, + "grad_norm": 3.6936628818511963, + "learning_rate": 9.968695292112564e-06, + "loss": 1.4005, + "step": 1444 + }, + { + "epoch": 0.4195093627522137, + "grad_norm": 3.769911050796509, + "learning_rate": 9.968587920900552e-06, + "loss": 1.3328, + "step": 1445 + }, + { + "epoch": 0.4197996806503121, + "grad_norm": 3.6452932357788086, + "learning_rate": 9.968480366448989e-06, + "loss": 1.2832, + "step": 1446 + }, + { + "epoch": 0.4200899985484105, + "grad_norm": 3.6828529834747314, + "learning_rate": 9.968372628761841e-06, + "loss": 1.2306, + "step": 1447 + }, + { + "epoch": 0.4203803164465089, + "grad_norm": 3.583516836166382, + 
"learning_rate": 9.968264707843083e-06, + "loss": 1.2331, + "step": 1448 + }, + { + "epoch": 0.42067063434460733, + "grad_norm": 4.031094074249268, + "learning_rate": 9.968156603696696e-06, + "loss": 1.2641, + "step": 1449 + }, + { + "epoch": 0.42096095224270574, + "grad_norm": 3.9242236614227295, + "learning_rate": 9.968048316326661e-06, + "loss": 1.2058, + "step": 1450 + }, + { + "epoch": 0.42125127014080416, + "grad_norm": 3.463057041168213, + "learning_rate": 9.967939845736978e-06, + "loss": 1.1357, + "step": 1451 + }, + { + "epoch": 0.4215415880389026, + "grad_norm": 3.4815406799316406, + "learning_rate": 9.967831191931645e-06, + "loss": 1.372, + "step": 1452 + }, + { + "epoch": 0.42183190593700104, + "grad_norm": 3.369882583618164, + "learning_rate": 9.967722354914668e-06, + "loss": 1.0852, + "step": 1453 + }, + { + "epoch": 0.42212222383509945, + "grad_norm": 3.3886513710021973, + "learning_rate": 9.967613334690065e-06, + "loss": 1.2646, + "step": 1454 + }, + { + "epoch": 0.42241254173319787, + "grad_norm": 3.631355047225952, + "learning_rate": 9.96750413126185e-06, + "loss": 1.1813, + "step": 1455 + }, + { + "epoch": 0.4227028596312963, + "grad_norm": 3.5558574199676514, + "learning_rate": 9.967394744634056e-06, + "loss": 1.2245, + "step": 1456 + }, + { + "epoch": 0.4229931775293947, + "grad_norm": 3.1339149475097656, + "learning_rate": 9.967285174810713e-06, + "loss": 1.0773, + "step": 1457 + }, + { + "epoch": 0.4232834954274931, + "grad_norm": 3.7277801036834717, + "learning_rate": 9.967175421795865e-06, + "loss": 1.3972, + "step": 1458 + }, + { + "epoch": 0.4235738133255915, + "grad_norm": 3.4788103103637695, + "learning_rate": 9.967065485593559e-06, + "loss": 1.2236, + "step": 1459 + }, + { + "epoch": 0.42386413122368993, + "grad_norm": 3.0842342376708984, + "learning_rate": 9.966955366207849e-06, + "loss": 1.0713, + "step": 1460 + }, + { + "epoch": 0.42415444912178835, + "grad_norm": 3.700028657913208, + "learning_rate": 9.966845063642795e-06, + "loss": 1.2501, + "step": 1461 + }, + { + "epoch": 0.42444476701988676, + "grad_norm": 3.3011817932128906, + "learning_rate": 9.966734577902469e-06, + "loss": 1.0213, + "step": 1462 + }, + { + "epoch": 0.4247350849179852, + "grad_norm": 3.7596285343170166, + "learning_rate": 9.96662390899094e-06, + "loss": 1.2344, + "step": 1463 + }, + { + "epoch": 0.4250254028160836, + "grad_norm": 3.251818895339966, + "learning_rate": 9.966513056912292e-06, + "loss": 1.1105, + "step": 1464 + }, + { + "epoch": 0.42531572071418206, + "grad_norm": 3.8628876209259033, + "learning_rate": 9.966402021670615e-06, + "loss": 1.2871, + "step": 1465 + }, + { + "epoch": 0.42560603861228047, + "grad_norm": 3.814058542251587, + "learning_rate": 9.966290803270003e-06, + "loss": 1.1547, + "step": 1466 + }, + { + "epoch": 0.4258963565103789, + "grad_norm": 3.737708330154419, + "learning_rate": 9.966179401714556e-06, + "loss": 1.2086, + "step": 1467 + }, + { + "epoch": 0.4261866744084773, + "grad_norm": 3.685622453689575, + "learning_rate": 9.966067817008383e-06, + "loss": 1.209, + "step": 1468 + }, + { + "epoch": 0.4264769923065757, + "grad_norm": 3.5678586959838867, + "learning_rate": 9.9659560491556e-06, + "loss": 1.3042, + "step": 1469 + }, + { + "epoch": 0.4267673102046741, + "grad_norm": 3.4052236080169678, + "learning_rate": 9.965844098160326e-06, + "loss": 1.084, + "step": 1470 + }, + { + "epoch": 0.42705762810277254, + "grad_norm": 3.542491912841797, + "learning_rate": 9.965731964026696e-06, + "loss": 1.2259, + "step": 1471 + }, + { + "epoch": 
0.42734794600087095, + "grad_norm": 3.580087184906006, + "learning_rate": 9.96561964675884e-06, + "loss": 1.1961, + "step": 1472 + }, + { + "epoch": 0.42763826389896936, + "grad_norm": 3.7177071571350098, + "learning_rate": 9.965507146360902e-06, + "loss": 1.2361, + "step": 1473 + }, + { + "epoch": 0.4279285817970678, + "grad_norm": 3.361457586288452, + "learning_rate": 9.965394462837032e-06, + "loss": 1.1595, + "step": 1474 + }, + { + "epoch": 0.4282188996951662, + "grad_norm": 3.8086483478546143, + "learning_rate": 9.965281596191384e-06, + "loss": 1.4176, + "step": 1475 + }, + { + "epoch": 0.4285092175932646, + "grad_norm": 3.709951639175415, + "learning_rate": 9.965168546428122e-06, + "loss": 1.2644, + "step": 1476 + }, + { + "epoch": 0.428799535491363, + "grad_norm": 3.452254295349121, + "learning_rate": 9.965055313551413e-06, + "loss": 1.1387, + "step": 1477 + }, + { + "epoch": 0.4290898533894615, + "grad_norm": 3.2605044841766357, + "learning_rate": 9.964941897565434e-06, + "loss": 1.1387, + "step": 1478 + }, + { + "epoch": 0.4293801712875599, + "grad_norm": 3.717010498046875, + "learning_rate": 9.96482829847437e-06, + "loss": 1.2172, + "step": 1479 + }, + { + "epoch": 0.4296704891856583, + "grad_norm": 3.5657219886779785, + "learning_rate": 9.964714516282407e-06, + "loss": 1.2444, + "step": 1480 + }, + { + "epoch": 0.4299608070837567, + "grad_norm": 3.469438314437866, + "learning_rate": 9.964600550993744e-06, + "loss": 1.1068, + "step": 1481 + }, + { + "epoch": 0.43025112498185514, + "grad_norm": 3.4567294120788574, + "learning_rate": 9.96448640261258e-06, + "loss": 1.0813, + "step": 1482 + }, + { + "epoch": 0.43054144287995355, + "grad_norm": 3.3223202228546143, + "learning_rate": 9.964372071143131e-06, + "loss": 1.143, + "step": 1483 + }, + { + "epoch": 0.43083176077805196, + "grad_norm": 3.2226054668426514, + "learning_rate": 9.96425755658961e-06, + "loss": 1.189, + "step": 1484 + }, + { + "epoch": 0.4311220786761504, + "grad_norm": 3.6389126777648926, + "learning_rate": 9.964142858956239e-06, + "loss": 1.3073, + "step": 1485 + }, + { + "epoch": 0.4314123965742488, + "grad_norm": 3.3728039264678955, + "learning_rate": 9.964027978247248e-06, + "loss": 1.0786, + "step": 1486 + }, + { + "epoch": 0.4317027144723472, + "grad_norm": 3.0883610248565674, + "learning_rate": 9.963912914466877e-06, + "loss": 1.0915, + "step": 1487 + }, + { + "epoch": 0.4319930323704456, + "grad_norm": 3.4856998920440674, + "learning_rate": 9.963797667619368e-06, + "loss": 1.2368, + "step": 1488 + }, + { + "epoch": 0.43228335026854403, + "grad_norm": 3.4481701850891113, + "learning_rate": 9.96368223770897e-06, + "loss": 1.1896, + "step": 1489 + }, + { + "epoch": 0.4325736681666425, + "grad_norm": 3.5037729740142822, + "learning_rate": 9.963566624739939e-06, + "loss": 1.1162, + "step": 1490 + }, + { + "epoch": 0.4328639860647409, + "grad_norm": 3.3900668621063232, + "learning_rate": 9.963450828716543e-06, + "loss": 1.1042, + "step": 1491 + }, + { + "epoch": 0.4331543039628393, + "grad_norm": 3.7949774265289307, + "learning_rate": 9.96333484964305e-06, + "loss": 1.1798, + "step": 1492 + }, + { + "epoch": 0.43344462186093774, + "grad_norm": 3.6395134925842285, + "learning_rate": 9.963218687523737e-06, + "loss": 1.1875, + "step": 1493 + }, + { + "epoch": 0.43373493975903615, + "grad_norm": 3.550593376159668, + "learning_rate": 9.963102342362887e-06, + "loss": 1.2833, + "step": 1494 + }, + { + "epoch": 0.43402525765713457, + "grad_norm": 3.293470859527588, + "learning_rate": 9.962985814164794e-06, + "loss": 
1.1546, + "step": 1495 + }, + { + "epoch": 0.434315575555233, + "grad_norm": 3.5365588665008545, + "learning_rate": 9.962869102933754e-06, + "loss": 1.1175, + "step": 1496 + }, + { + "epoch": 0.4346058934533314, + "grad_norm": 3.716935157775879, + "learning_rate": 9.962752208674069e-06, + "loss": 1.1918, + "step": 1497 + }, + { + "epoch": 0.4348962113514298, + "grad_norm": 3.8409154415130615, + "learning_rate": 9.962635131390054e-06, + "loss": 1.4506, + "step": 1498 + }, + { + "epoch": 0.4351865292495282, + "grad_norm": 3.3539156913757324, + "learning_rate": 9.962517871086023e-06, + "loss": 1.0693, + "step": 1499 + }, + { + "epoch": 0.43547684714762663, + "grad_norm": 3.8182966709136963, + "learning_rate": 9.962400427766304e-06, + "loss": 1.2104, + "step": 1500 + }, + { + "epoch": 0.43547684714762663, + "eval_loss": 1.2276885509490967, + "eval_runtime": 11.6259, + "eval_samples_per_second": 34.406, + "eval_steps_per_second": 4.301, + "step": 1500 + }, + { + "epoch": 0.43576716504572505, + "grad_norm": 3.362107992172241, + "learning_rate": 9.962282801435226e-06, + "loss": 1.1556, + "step": 1501 + }, + { + "epoch": 0.43605748294382346, + "grad_norm": 3.7278659343719482, + "learning_rate": 9.962164992097125e-06, + "loss": 1.2846, + "step": 1502 + }, + { + "epoch": 0.43634780084192193, + "grad_norm": 3.296018362045288, + "learning_rate": 9.962046999756352e-06, + "loss": 1.1573, + "step": 1503 + }, + { + "epoch": 0.43663811874002034, + "grad_norm": 3.632516860961914, + "learning_rate": 9.961928824417252e-06, + "loss": 1.3175, + "step": 1504 + }, + { + "epoch": 0.43692843663811876, + "grad_norm": 4.042605876922607, + "learning_rate": 9.961810466084188e-06, + "loss": 1.2586, + "step": 1505 + }, + { + "epoch": 0.43721875453621717, + "grad_norm": 3.322206735610962, + "learning_rate": 9.961691924761522e-06, + "loss": 1.0772, + "step": 1506 + }, + { + "epoch": 0.4375090724343156, + "grad_norm": 3.485081672668457, + "learning_rate": 9.961573200453627e-06, + "loss": 1.0764, + "step": 1507 + }, + { + "epoch": 0.437799390332414, + "grad_norm": 3.5794100761413574, + "learning_rate": 9.961454293164881e-06, + "loss": 1.1919, + "step": 1508 + }, + { + "epoch": 0.4380897082305124, + "grad_norm": 3.594174861907959, + "learning_rate": 9.96133520289967e-06, + "loss": 1.0832, + "step": 1509 + }, + { + "epoch": 0.4383800261286108, + "grad_norm": 3.7435574531555176, + "learning_rate": 9.961215929662385e-06, + "loss": 1.2706, + "step": 1510 + }, + { + "epoch": 0.43867034402670924, + "grad_norm": 3.7980704307556152, + "learning_rate": 9.961096473457425e-06, + "loss": 1.0843, + "step": 1511 + }, + { + "epoch": 0.43896066192480765, + "grad_norm": 3.568105459213257, + "learning_rate": 9.960976834289197e-06, + "loss": 1.1144, + "step": 1512 + }, + { + "epoch": 0.43925097982290606, + "grad_norm": 3.6921393871307373, + "learning_rate": 9.960857012162111e-06, + "loss": 1.2484, + "step": 1513 + }, + { + "epoch": 0.4395412977210045, + "grad_norm": 3.7946364879608154, + "learning_rate": 9.960737007080588e-06, + "loss": 1.262, + "step": 1514 + }, + { + "epoch": 0.43983161561910294, + "grad_norm": 3.6504087448120117, + "learning_rate": 9.960616819049053e-06, + "loss": 1.3897, + "step": 1515 + }, + { + "epoch": 0.44012193351720136, + "grad_norm": 3.5300283432006836, + "learning_rate": 9.960496448071936e-06, + "loss": 1.0847, + "step": 1516 + }, + { + "epoch": 0.44041225141529977, + "grad_norm": 3.1674747467041016, + "learning_rate": 9.960375894153682e-06, + "loss": 1.1272, + "step": 1517 + }, + { + "epoch": 0.4407025693133982, + 
"grad_norm": 3.515995502471924, + "learning_rate": 9.96025515729873e-06, + "loss": 1.1502, + "step": 1518 + }, + { + "epoch": 0.4409928872114966, + "grad_norm": 3.667440176010132, + "learning_rate": 9.960134237511538e-06, + "loss": 1.263, + "step": 1519 + }, + { + "epoch": 0.441283205109595, + "grad_norm": 3.8216376304626465, + "learning_rate": 9.960013134796564e-06, + "loss": 1.3115, + "step": 1520 + }, + { + "epoch": 0.4415735230076934, + "grad_norm": 3.4460253715515137, + "learning_rate": 9.959891849158275e-06, + "loss": 1.1301, + "step": 1521 + }, + { + "epoch": 0.44186384090579184, + "grad_norm": 3.636212110519409, + "learning_rate": 9.95977038060114e-06, + "loss": 1.3331, + "step": 1522 + }, + { + "epoch": 0.44215415880389025, + "grad_norm": 3.424614191055298, + "learning_rate": 9.959648729129642e-06, + "loss": 1.1076, + "step": 1523 + }, + { + "epoch": 0.44244447670198866, + "grad_norm": 3.6137311458587646, + "learning_rate": 9.959526894748268e-06, + "loss": 1.3869, + "step": 1524 + }, + { + "epoch": 0.4427347946000871, + "grad_norm": 3.550391912460327, + "learning_rate": 9.959404877461512e-06, + "loss": 1.2157, + "step": 1525 + }, + { + "epoch": 0.4430251124981855, + "grad_norm": 3.9449851512908936, + "learning_rate": 9.959282677273869e-06, + "loss": 1.1935, + "step": 1526 + }, + { + "epoch": 0.4433154303962839, + "grad_norm": 3.6746020317077637, + "learning_rate": 9.959160294189852e-06, + "loss": 1.3009, + "step": 1527 + }, + { + "epoch": 0.4436057482943824, + "grad_norm": 3.3976306915283203, + "learning_rate": 9.959037728213968e-06, + "loss": 1.3389, + "step": 1528 + }, + { + "epoch": 0.4438960661924808, + "grad_norm": 3.695160150527954, + "learning_rate": 9.958914979350743e-06, + "loss": 1.1807, + "step": 1529 + }, + { + "epoch": 0.4441863840905792, + "grad_norm": 3.731966257095337, + "learning_rate": 9.9587920476047e-06, + "loss": 1.2079, + "step": 1530 + }, + { + "epoch": 0.4444767019886776, + "grad_norm": 3.5896048545837402, + "learning_rate": 9.958668932980375e-06, + "loss": 1.1836, + "step": 1531 + }, + { + "epoch": 0.444767019886776, + "grad_norm": 3.400681972503662, + "learning_rate": 9.958545635482307e-06, + "loss": 1.1317, + "step": 1532 + }, + { + "epoch": 0.44505733778487444, + "grad_norm": 3.247178077697754, + "learning_rate": 9.958422155115044e-06, + "loss": 1.2038, + "step": 1533 + }, + { + "epoch": 0.44534765568297285, + "grad_norm": 3.610156536102295, + "learning_rate": 9.95829849188314e-06, + "loss": 1.1852, + "step": 1534 + }, + { + "epoch": 0.44563797358107127, + "grad_norm": 3.8021605014801025, + "learning_rate": 9.958174645791154e-06, + "loss": 1.4697, + "step": 1535 + }, + { + "epoch": 0.4459282914791697, + "grad_norm": 3.3716843128204346, + "learning_rate": 9.958050616843655e-06, + "loss": 1.1266, + "step": 1536 + }, + { + "epoch": 0.4462186093772681, + "grad_norm": 3.840357780456543, + "learning_rate": 9.957926405045219e-06, + "loss": 1.2474, + "step": 1537 + }, + { + "epoch": 0.4465089272753665, + "grad_norm": 3.4997823238372803, + "learning_rate": 9.957802010400423e-06, + "loss": 1.1936, + "step": 1538 + }, + { + "epoch": 0.4467992451734649, + "grad_norm": 3.3240110874176025, + "learning_rate": 9.957677432913855e-06, + "loss": 1.1124, + "step": 1539 + }, + { + "epoch": 0.4470895630715634, + "grad_norm": 3.7043850421905518, + "learning_rate": 9.957552672590111e-06, + "loss": 1.1571, + "step": 1540 + }, + { + "epoch": 0.4473798809696618, + "grad_norm": 3.405775308609009, + "learning_rate": 9.957427729433794e-06, + "loss": 1.2005, + "step": 1541 + }, + { + 
"epoch": 0.4476701988677602, + "grad_norm": 3.6422696113586426, + "learning_rate": 9.957302603449508e-06, + "loss": 1.3203, + "step": 1542 + }, + { + "epoch": 0.44796051676585863, + "grad_norm": 3.397426128387451, + "learning_rate": 9.95717729464187e-06, + "loss": 1.2018, + "step": 1543 + }, + { + "epoch": 0.44825083466395704, + "grad_norm": 3.974717617034912, + "learning_rate": 9.9570518030155e-06, + "loss": 1.3058, + "step": 1544 + }, + { + "epoch": 0.44854115256205546, + "grad_norm": 3.8308608531951904, + "learning_rate": 9.956926128575026e-06, + "loss": 1.2463, + "step": 1545 + }, + { + "epoch": 0.44883147046015387, + "grad_norm": 3.5619077682495117, + "learning_rate": 9.956800271325084e-06, + "loss": 1.2587, + "step": 1546 + }, + { + "epoch": 0.4491217883582523, + "grad_norm": 3.4124200344085693, + "learning_rate": 9.956674231270316e-06, + "loss": 1.1719, + "step": 1547 + }, + { + "epoch": 0.4494121062563507, + "grad_norm": 3.5342917442321777, + "learning_rate": 9.95654800841537e-06, + "loss": 1.1438, + "step": 1548 + }, + { + "epoch": 0.4497024241544491, + "grad_norm": 3.613375663757324, + "learning_rate": 9.956421602764899e-06, + "loss": 1.0305, + "step": 1549 + }, + { + "epoch": 0.4499927420525475, + "grad_norm": 3.55999493598938, + "learning_rate": 9.956295014323566e-06, + "loss": 1.122, + "step": 1550 + }, + { + "epoch": 0.45028305995064594, + "grad_norm": 3.425326347351074, + "learning_rate": 9.956168243096039e-06, + "loss": 0.9979, + "step": 1551 + }, + { + "epoch": 0.45057337784874435, + "grad_norm": 3.199810028076172, + "learning_rate": 9.956041289086995e-06, + "loss": 1.1511, + "step": 1552 + }, + { + "epoch": 0.4508636957468428, + "grad_norm": 3.714824914932251, + "learning_rate": 9.955914152301115e-06, + "loss": 1.2827, + "step": 1553 + }, + { + "epoch": 0.45115401364494123, + "grad_norm": 3.588531732559204, + "learning_rate": 9.955786832743089e-06, + "loss": 1.2596, + "step": 1554 + }, + { + "epoch": 0.45144433154303965, + "grad_norm": 3.7227511405944824, + "learning_rate": 9.955659330417608e-06, + "loss": 1.2919, + "step": 1555 + }, + { + "epoch": 0.45173464944113806, + "grad_norm": 3.487367868423462, + "learning_rate": 9.95553164532938e-06, + "loss": 1.118, + "step": 1556 + }, + { + "epoch": 0.45202496733923647, + "grad_norm": 3.4509451389312744, + "learning_rate": 9.955403777483112e-06, + "loss": 1.1279, + "step": 1557 + }, + { + "epoch": 0.4523152852373349, + "grad_norm": 3.383143663406372, + "learning_rate": 9.955275726883517e-06, + "loss": 1.0833, + "step": 1558 + }, + { + "epoch": 0.4526056031354333, + "grad_norm": 3.47957444190979, + "learning_rate": 9.955147493535321e-06, + "loss": 1.0278, + "step": 1559 + }, + { + "epoch": 0.4528959210335317, + "grad_norm": 3.340008497238159, + "learning_rate": 9.95501907744325e-06, + "loss": 1.2062, + "step": 1560 + }, + { + "epoch": 0.4531862389316301, + "grad_norm": 3.7595670223236084, + "learning_rate": 9.954890478612045e-06, + "loss": 1.142, + "step": 1561 + }, + { + "epoch": 0.45347655682972854, + "grad_norm": 3.9946539402008057, + "learning_rate": 9.954761697046445e-06, + "loss": 1.326, + "step": 1562 + }, + { + "epoch": 0.45376687472782695, + "grad_norm": 3.490159273147583, + "learning_rate": 9.954632732751196e-06, + "loss": 1.2648, + "step": 1563 + }, + { + "epoch": 0.45405719262592537, + "grad_norm": 3.5875537395477295, + "learning_rate": 9.954503585731061e-06, + "loss": 1.3082, + "step": 1564 + }, + { + "epoch": 0.4543475105240238, + "grad_norm": 3.562396764755249, + "learning_rate": 9.9543742559908e-06, + "loss": 
1.1663, + "step": 1565 + }, + { + "epoch": 0.45463782842212225, + "grad_norm": 3.3653926849365234, + "learning_rate": 9.954244743535181e-06, + "loss": 1.1193, + "step": 1566 + }, + { + "epoch": 0.45492814632022066, + "grad_norm": 3.2246313095092773, + "learning_rate": 9.954115048368984e-06, + "loss": 1.1123, + "step": 1567 + }, + { + "epoch": 0.4552184642183191, + "grad_norm": 3.596186876296997, + "learning_rate": 9.953985170496989e-06, + "loss": 1.1279, + "step": 1568 + }, + { + "epoch": 0.4555087821164175, + "grad_norm": 3.476844072341919, + "learning_rate": 9.953855109923987e-06, + "loss": 1.1921, + "step": 1569 + }, + { + "epoch": 0.4557991000145159, + "grad_norm": 3.237635612487793, + "learning_rate": 9.953724866654775e-06, + "loss": 1.1454, + "step": 1570 + }, + { + "epoch": 0.4560894179126143, + "grad_norm": 3.8363256454467773, + "learning_rate": 9.953594440694154e-06, + "loss": 1.3695, + "step": 1571 + }, + { + "epoch": 0.45637973581071273, + "grad_norm": 3.6838483810424805, + "learning_rate": 9.953463832046936e-06, + "loss": 1.1117, + "step": 1572 + }, + { + "epoch": 0.45667005370881114, + "grad_norm": 3.7038700580596924, + "learning_rate": 9.953333040717938e-06, + "loss": 1.1542, + "step": 1573 + }, + { + "epoch": 0.45696037160690955, + "grad_norm": 3.6134519577026367, + "learning_rate": 9.953202066711985e-06, + "loss": 1.0318, + "step": 1574 + }, + { + "epoch": 0.45725068950500797, + "grad_norm": 3.8328986167907715, + "learning_rate": 9.953070910033904e-06, + "loss": 1.3801, + "step": 1575 + }, + { + "epoch": 0.4575410074031064, + "grad_norm": 3.912644624710083, + "learning_rate": 9.952939570688532e-06, + "loss": 1.1623, + "step": 1576 + }, + { + "epoch": 0.4578313253012048, + "grad_norm": 3.586677074432373, + "learning_rate": 9.952808048680716e-06, + "loss": 1.1486, + "step": 1577 + }, + { + "epoch": 0.45812164319930326, + "grad_norm": 3.4135282039642334, + "learning_rate": 9.952676344015304e-06, + "loss": 1.1422, + "step": 1578 + }, + { + "epoch": 0.4584119610974017, + "grad_norm": 3.606527090072632, + "learning_rate": 9.952544456697153e-06, + "loss": 1.1445, + "step": 1579 + }, + { + "epoch": 0.4587022789955001, + "grad_norm": 3.4661526679992676, + "learning_rate": 9.95241238673113e-06, + "loss": 1.2884, + "step": 1580 + }, + { + "epoch": 0.4589925968935985, + "grad_norm": 3.521548271179199, + "learning_rate": 9.9522801341221e-06, + "loss": 1.1434, + "step": 1581 + }, + { + "epoch": 0.4592829147916969, + "grad_norm": 3.239595890045166, + "learning_rate": 9.952147698874948e-06, + "loss": 1.0461, + "step": 1582 + }, + { + "epoch": 0.45957323268979533, + "grad_norm": 3.476299524307251, + "learning_rate": 9.95201508099455e-06, + "loss": 1.2104, + "step": 1583 + }, + { + "epoch": 0.45986355058789374, + "grad_norm": 3.398822784423828, + "learning_rate": 9.951882280485805e-06, + "loss": 1.2335, + "step": 1584 + }, + { + "epoch": 0.46015386848599216, + "grad_norm": 3.5042576789855957, + "learning_rate": 9.951749297353605e-06, + "loss": 1.1804, + "step": 1585 + }, + { + "epoch": 0.46044418638409057, + "grad_norm": 3.163114547729492, + "learning_rate": 9.951616131602855e-06, + "loss": 1.0034, + "step": 1586 + }, + { + "epoch": 0.460734504282189, + "grad_norm": 3.456465244293213, + "learning_rate": 9.951482783238468e-06, + "loss": 1.1458, + "step": 1587 + }, + { + "epoch": 0.4610248221802874, + "grad_norm": 3.5666391849517822, + "learning_rate": 9.95134925226536e-06, + "loss": 1.2278, + "step": 1588 + }, + { + "epoch": 0.4613151400783858, + "grad_norm": 3.3443286418914795, + 
"learning_rate": 9.951215538688456e-06, + "loss": 1.1107, + "step": 1589 + }, + { + "epoch": 0.4616054579764842, + "grad_norm": 3.3506739139556885, + "learning_rate": 9.95108164251269e-06, + "loss": 1.0595, + "step": 1590 + }, + { + "epoch": 0.4618957758745827, + "grad_norm": 3.423740863800049, + "learning_rate": 9.950947563742997e-06, + "loss": 1.0907, + "step": 1591 + }, + { + "epoch": 0.4621860937726811, + "grad_norm": 3.432969808578491, + "learning_rate": 9.950813302384322e-06, + "loss": 1.13, + "step": 1592 + }, + { + "epoch": 0.4624764116707795, + "grad_norm": 3.5011508464813232, + "learning_rate": 9.950678858441616e-06, + "loss": 1.2519, + "step": 1593 + }, + { + "epoch": 0.46276672956887793, + "grad_norm": 3.8555173873901367, + "learning_rate": 9.950544231919841e-06, + "loss": 1.2269, + "step": 1594 + }, + { + "epoch": 0.46305704746697635, + "grad_norm": 3.934401750564575, + "learning_rate": 9.950409422823957e-06, + "loss": 1.4339, + "step": 1595 + }, + { + "epoch": 0.46334736536507476, + "grad_norm": 3.346092700958252, + "learning_rate": 9.95027443115894e-06, + "loss": 1.0944, + "step": 1596 + }, + { + "epoch": 0.4636376832631732, + "grad_norm": 3.4434292316436768, + "learning_rate": 9.950139256929765e-06, + "loss": 1.2216, + "step": 1597 + }, + { + "epoch": 0.4639280011612716, + "grad_norm": 3.2562990188598633, + "learning_rate": 9.950003900141418e-06, + "loss": 1.139, + "step": 1598 + }, + { + "epoch": 0.46421831905937, + "grad_norm": 3.5434086322784424, + "learning_rate": 9.949868360798893e-06, + "loss": 1.1897, + "step": 1599 + }, + { + "epoch": 0.4645086369574684, + "grad_norm": 3.396911382675171, + "learning_rate": 9.949732638907186e-06, + "loss": 1.1597, + "step": 1600 + }, + { + "epoch": 0.4647989548555668, + "grad_norm": 3.415681838989258, + "learning_rate": 9.949596734471304e-06, + "loss": 1.0674, + "step": 1601 + }, + { + "epoch": 0.46508927275366524, + "grad_norm": 3.7457592487335205, + "learning_rate": 9.949460647496258e-06, + "loss": 1.4005, + "step": 1602 + }, + { + "epoch": 0.4653795906517637, + "grad_norm": 3.612797737121582, + "learning_rate": 9.949324377987069e-06, + "loss": 1.226, + "step": 1603 + }, + { + "epoch": 0.4656699085498621, + "grad_norm": 3.3432776927948, + "learning_rate": 9.94918792594876e-06, + "loss": 1.1843, + "step": 1604 + }, + { + "epoch": 0.46596022644796053, + "grad_norm": 3.31148624420166, + "learning_rate": 9.949051291386365e-06, + "loss": 1.0242, + "step": 1605 + }, + { + "epoch": 0.46625054434605895, + "grad_norm": 3.4100899696350098, + "learning_rate": 9.948914474304922e-06, + "loss": 1.2697, + "step": 1606 + }, + { + "epoch": 0.46654086224415736, + "grad_norm": 3.507978916168213, + "learning_rate": 9.948777474709477e-06, + "loss": 1.0508, + "step": 1607 + }, + { + "epoch": 0.4668311801422558, + "grad_norm": 3.3306009769439697, + "learning_rate": 9.948640292605081e-06, + "loss": 1.1063, + "step": 1608 + }, + { + "epoch": 0.4671214980403542, + "grad_norm": 3.5736498832702637, + "learning_rate": 9.948502927996797e-06, + "loss": 1.1513, + "step": 1609 + }, + { + "epoch": 0.4674118159384526, + "grad_norm": 3.465364933013916, + "learning_rate": 9.948365380889688e-06, + "loss": 1.1332, + "step": 1610 + }, + { + "epoch": 0.467702133836551, + "grad_norm": 3.7221927642822266, + "learning_rate": 9.948227651288828e-06, + "loss": 1.1749, + "step": 1611 + }, + { + "epoch": 0.46799245173464943, + "grad_norm": 3.762308359146118, + "learning_rate": 9.948089739199296e-06, + "loss": 1.1774, + "step": 1612 + }, + { + "epoch": 0.46828276963274784, + 
"grad_norm": 3.715789794921875, + "learning_rate": 9.947951644626177e-06, + "loss": 1.2722, + "step": 1613 + }, + { + "epoch": 0.46857308753084626, + "grad_norm": 3.3541386127471924, + "learning_rate": 9.947813367574564e-06, + "loss": 1.1911, + "step": 1614 + }, + { + "epoch": 0.46886340542894467, + "grad_norm": 3.2428276538848877, + "learning_rate": 9.94767490804956e-06, + "loss": 1.0802, + "step": 1615 + }, + { + "epoch": 0.46915372332704314, + "grad_norm": 3.3992302417755127, + "learning_rate": 9.947536266056269e-06, + "loss": 1.1518, + "step": 1616 + }, + { + "epoch": 0.46944404122514155, + "grad_norm": 3.9083375930786133, + "learning_rate": 9.947397441599801e-06, + "loss": 1.3027, + "step": 1617 + }, + { + "epoch": 0.46973435912323996, + "grad_norm": 4.152743816375732, + "learning_rate": 9.947258434685281e-06, + "loss": 1.2554, + "step": 1618 + }, + { + "epoch": 0.4700246770213384, + "grad_norm": 4.119356632232666, + "learning_rate": 9.947119245317832e-06, + "loss": 1.2819, + "step": 1619 + }, + { + "epoch": 0.4703149949194368, + "grad_norm": 3.8427681922912598, + "learning_rate": 9.946979873502589e-06, + "loss": 1.2107, + "step": 1620 + }, + { + "epoch": 0.4706053128175352, + "grad_norm": 3.865187883377075, + "learning_rate": 9.94684031924469e-06, + "loss": 1.2721, + "step": 1621 + }, + { + "epoch": 0.4708956307156336, + "grad_norm": 3.146252155303955, + "learning_rate": 9.946700582549285e-06, + "loss": 1.0884, + "step": 1622 + }, + { + "epoch": 0.47118594861373203, + "grad_norm": 3.6837799549102783, + "learning_rate": 9.946560663421525e-06, + "loss": 1.1676, + "step": 1623 + }, + { + "epoch": 0.47147626651183044, + "grad_norm": 3.769131898880005, + "learning_rate": 9.94642056186657e-06, + "loss": 1.3335, + "step": 1624 + }, + { + "epoch": 0.47176658440992886, + "grad_norm": 3.6001875400543213, + "learning_rate": 9.946280277889589e-06, + "loss": 1.1265, + "step": 1625 + }, + { + "epoch": 0.47205690230802727, + "grad_norm": 4.254703044891357, + "learning_rate": 9.946139811495752e-06, + "loss": 1.3297, + "step": 1626 + }, + { + "epoch": 0.4723472202061257, + "grad_norm": 3.61510968208313, + "learning_rate": 9.945999162690243e-06, + "loss": 1.1887, + "step": 1627 + }, + { + "epoch": 0.47263753810422415, + "grad_norm": 3.536651611328125, + "learning_rate": 9.945858331478249e-06, + "loss": 1.098, + "step": 1628 + }, + { + "epoch": 0.47292785600232257, + "grad_norm": 3.742727041244507, + "learning_rate": 9.94571731786496e-06, + "loss": 1.3315, + "step": 1629 + }, + { + "epoch": 0.473218173900421, + "grad_norm": 3.31262469291687, + "learning_rate": 9.94557612185558e-06, + "loss": 1.1736, + "step": 1630 + }, + { + "epoch": 0.4735084917985194, + "grad_norm": 3.649885892868042, + "learning_rate": 9.945434743455315e-06, + "loss": 1.1563, + "step": 1631 + }, + { + "epoch": 0.4737988096966178, + "grad_norm": 3.665729284286499, + "learning_rate": 9.945293182669379e-06, + "loss": 1.1454, + "step": 1632 + }, + { + "epoch": 0.4740891275947162, + "grad_norm": 3.2671260833740234, + "learning_rate": 9.945151439502994e-06, + "loss": 1.1382, + "step": 1633 + }, + { + "epoch": 0.47437944549281463, + "grad_norm": 3.785245180130005, + "learning_rate": 9.945009513961386e-06, + "loss": 1.1418, + "step": 1634 + }, + { + "epoch": 0.47466976339091305, + "grad_norm": 3.435044527053833, + "learning_rate": 9.94486740604979e-06, + "loss": 1.1039, + "step": 1635 + }, + { + "epoch": 0.47496008128901146, + "grad_norm": 3.3379416465759277, + "learning_rate": 9.944725115773444e-06, + "loss": 1.1867, + "step": 1636 + }, + 
{ + "epoch": 0.4752503991871099, + "grad_norm": 3.381946563720703, + "learning_rate": 9.9445826431376e-06, + "loss": 1.1049, + "step": 1637 + }, + { + "epoch": 0.4755407170852083, + "grad_norm": 3.501094341278076, + "learning_rate": 9.944439988147509e-06, + "loss": 1.2041, + "step": 1638 + }, + { + "epoch": 0.4758310349833067, + "grad_norm": 3.4139304161071777, + "learning_rate": 9.944297150808435e-06, + "loss": 1.1924, + "step": 1639 + }, + { + "epoch": 0.4761213528814051, + "grad_norm": 3.5083329677581787, + "learning_rate": 9.944154131125643e-06, + "loss": 1.094, + "step": 1640 + }, + { + "epoch": 0.4764116707795036, + "grad_norm": 3.6780874729156494, + "learning_rate": 9.94401092910441e-06, + "loss": 1.3628, + "step": 1641 + }, + { + "epoch": 0.476701988677602, + "grad_norm": 3.515752077102661, + "learning_rate": 9.943867544750014e-06, + "loss": 1.2409, + "step": 1642 + }, + { + "epoch": 0.4769923065757004, + "grad_norm": 3.191023349761963, + "learning_rate": 9.943723978067747e-06, + "loss": 0.9894, + "step": 1643 + }, + { + "epoch": 0.4772826244737988, + "grad_norm": 3.679292678833008, + "learning_rate": 9.943580229062899e-06, + "loss": 1.2552, + "step": 1644 + }, + { + "epoch": 0.47757294237189724, + "grad_norm": 3.752819299697876, + "learning_rate": 9.943436297740775e-06, + "loss": 1.3449, + "step": 1645 + }, + { + "epoch": 0.47786326026999565, + "grad_norm": 3.826658248901367, + "learning_rate": 9.943292184106684e-06, + "loss": 1.239, + "step": 1646 + }, + { + "epoch": 0.47815357816809406, + "grad_norm": 3.6658759117126465, + "learning_rate": 9.943147888165936e-06, + "loss": 1.2737, + "step": 1647 + }, + { + "epoch": 0.4784438960661925, + "grad_norm": 3.1992828845977783, + "learning_rate": 9.943003409923857e-06, + "loss": 1.1231, + "step": 1648 + }, + { + "epoch": 0.4787342139642909, + "grad_norm": 4.053700923919678, + "learning_rate": 9.942858749385774e-06, + "loss": 1.1836, + "step": 1649 + }, + { + "epoch": 0.4790245318623893, + "grad_norm": 3.2630503177642822, + "learning_rate": 9.942713906557022e-06, + "loss": 1.2698, + "step": 1650 + }, + { + "epoch": 0.4793148497604877, + "grad_norm": 3.746953010559082, + "learning_rate": 9.942568881442942e-06, + "loss": 1.302, + "step": 1651 + }, + { + "epoch": 0.47960516765858613, + "grad_norm": 3.554513692855835, + "learning_rate": 9.942423674048883e-06, + "loss": 1.233, + "step": 1652 + }, + { + "epoch": 0.47989548555668454, + "grad_norm": 3.5243356227874756, + "learning_rate": 9.9422782843802e-06, + "loss": 1.2497, + "step": 1653 + }, + { + "epoch": 0.480185803454783, + "grad_norm": 3.6694653034210205, + "learning_rate": 9.942132712442256e-06, + "loss": 1.1968, + "step": 1654 + }, + { + "epoch": 0.4804761213528814, + "grad_norm": 3.7765867710113525, + "learning_rate": 9.941986958240419e-06, + "loss": 1.3024, + "step": 1655 + }, + { + "epoch": 0.48076643925097984, + "grad_norm": 3.853088855743408, + "learning_rate": 9.941841021780064e-06, + "loss": 1.2627, + "step": 1656 + }, + { + "epoch": 0.48105675714907825, + "grad_norm": 3.233306646347046, + "learning_rate": 9.941694903066572e-06, + "loss": 1.1378, + "step": 1657 + }, + { + "epoch": 0.48134707504717666, + "grad_norm": 3.6022415161132812, + "learning_rate": 9.941548602105333e-06, + "loss": 1.1581, + "step": 1658 + }, + { + "epoch": 0.4816373929452751, + "grad_norm": 3.3151590824127197, + "learning_rate": 9.941402118901743e-06, + "loss": 1.0879, + "step": 1659 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 3.559082508087158, + "learning_rate": 9.941255453461205e-06, + "loss": 
1.2952, + "step": 1660 + }, + { + "epoch": 0.4822180287414719, + "grad_norm": 3.499293565750122, + "learning_rate": 9.941108605789125e-06, + "loss": 1.1496, + "step": 1661 + }, + { + "epoch": 0.4825083466395703, + "grad_norm": 3.5328094959259033, + "learning_rate": 9.940961575890921e-06, + "loss": 1.1707, + "step": 1662 + }, + { + "epoch": 0.48279866453766873, + "grad_norm": 3.5672430992126465, + "learning_rate": 9.940814363772016e-06, + "loss": 1.1496, + "step": 1663 + }, + { + "epoch": 0.48308898243576714, + "grad_norm": 3.3060715198516846, + "learning_rate": 9.940666969437836e-06, + "loss": 1.1478, + "step": 1664 + }, + { + "epoch": 0.48337930033386556, + "grad_norm": 3.711249828338623, + "learning_rate": 9.94051939289382e-06, + "loss": 1.2939, + "step": 1665 + }, + { + "epoch": 0.483669618231964, + "grad_norm": 3.299621343612671, + "learning_rate": 9.94037163414541e-06, + "loss": 1.1671, + "step": 1666 + }, + { + "epoch": 0.48395993613006244, + "grad_norm": 3.329033851623535, + "learning_rate": 9.940223693198054e-06, + "loss": 1.1649, + "step": 1667 + }, + { + "epoch": 0.48425025402816085, + "grad_norm": 3.5311896800994873, + "learning_rate": 9.940075570057209e-06, + "loss": 1.2479, + "step": 1668 + }, + { + "epoch": 0.48454057192625927, + "grad_norm": 3.478177785873413, + "learning_rate": 9.939927264728337e-06, + "loss": 1.0782, + "step": 1669 + }, + { + "epoch": 0.4848308898243577, + "grad_norm": 3.5076146125793457, + "learning_rate": 9.939778777216906e-06, + "loss": 1.1456, + "step": 1670 + }, + { + "epoch": 0.4851212077224561, + "grad_norm": 3.6281466484069824, + "learning_rate": 9.939630107528398e-06, + "loss": 1.1161, + "step": 1671 + }, + { + "epoch": 0.4854115256205545, + "grad_norm": 3.4649016857147217, + "learning_rate": 9.93948125566829e-06, + "loss": 1.1357, + "step": 1672 + }, + { + "epoch": 0.4857018435186529, + "grad_norm": 3.5469138622283936, + "learning_rate": 9.939332221642072e-06, + "loss": 1.1384, + "step": 1673 + }, + { + "epoch": 0.48599216141675133, + "grad_norm": 3.2848334312438965, + "learning_rate": 9.939183005455243e-06, + "loss": 1.1347, + "step": 1674 + }, + { + "epoch": 0.48628247931484975, + "grad_norm": 3.8708393573760986, + "learning_rate": 9.939033607113304e-06, + "loss": 1.2536, + "step": 1675 + }, + { + "epoch": 0.48657279721294816, + "grad_norm": 3.4363129138946533, + "learning_rate": 9.938884026621766e-06, + "loss": 1.2946, + "step": 1676 + }, + { + "epoch": 0.4868631151110466, + "grad_norm": 3.6415820121765137, + "learning_rate": 9.938734263986144e-06, + "loss": 1.2418, + "step": 1677 + }, + { + "epoch": 0.487153433009145, + "grad_norm": 3.5188496112823486, + "learning_rate": 9.938584319211965e-06, + "loss": 1.2058, + "step": 1678 + }, + { + "epoch": 0.48744375090724346, + "grad_norm": 3.30953049659729, + "learning_rate": 9.938434192304756e-06, + "loss": 1.1317, + "step": 1679 + }, + { + "epoch": 0.48773406880534187, + "grad_norm": 3.760052442550659, + "learning_rate": 9.938283883270051e-06, + "loss": 1.1917, + "step": 1680 + }, + { + "epoch": 0.4880243867034403, + "grad_norm": 3.384671688079834, + "learning_rate": 9.938133392113399e-06, + "loss": 1.1273, + "step": 1681 + }, + { + "epoch": 0.4883147046015387, + "grad_norm": 3.7452921867370605, + "learning_rate": 9.937982718840345e-06, + "loss": 1.2016, + "step": 1682 + }, + { + "epoch": 0.4886050224996371, + "grad_norm": 3.7120046615600586, + "learning_rate": 9.937831863456448e-06, + "loss": 1.3403, + "step": 1683 + }, + { + "epoch": 0.4888953403977355, + "grad_norm": 3.808293581008911, + 
"learning_rate": 9.937680825967272e-06, + "loss": 1.165, + "step": 1684 + }, + { + "epoch": 0.48918565829583394, + "grad_norm": 3.2630043029785156, + "learning_rate": 9.937529606378387e-06, + "loss": 1.125, + "step": 1685 + }, + { + "epoch": 0.48947597619393235, + "grad_norm": 3.6727232933044434, + "learning_rate": 9.937378204695368e-06, + "loss": 1.0798, + "step": 1686 + }, + { + "epoch": 0.48976629409203076, + "grad_norm": 3.460695505142212, + "learning_rate": 9.9372266209238e-06, + "loss": 1.1424, + "step": 1687 + }, + { + "epoch": 0.4900566119901292, + "grad_norm": 3.477473258972168, + "learning_rate": 9.937074855069276e-06, + "loss": 1.2076, + "step": 1688 + }, + { + "epoch": 0.4903469298882276, + "grad_norm": 3.641740322113037, + "learning_rate": 9.93692290713739e-06, + "loss": 1.3142, + "step": 1689 + }, + { + "epoch": 0.490637247786326, + "grad_norm": 3.400716543197632, + "learning_rate": 9.936770777133744e-06, + "loss": 1.2266, + "step": 1690 + }, + { + "epoch": 0.49092756568442447, + "grad_norm": 3.436521053314209, + "learning_rate": 9.936618465063955e-06, + "loss": 1.1197, + "step": 1691 + }, + { + "epoch": 0.4912178835825229, + "grad_norm": 3.466358184814453, + "learning_rate": 9.936465970933632e-06, + "loss": 1.2037, + "step": 1692 + }, + { + "epoch": 0.4915082014806213, + "grad_norm": 4.054111480712891, + "learning_rate": 9.936313294748405e-06, + "loss": 1.5063, + "step": 1693 + }, + { + "epoch": 0.4917985193787197, + "grad_norm": 3.775129556655884, + "learning_rate": 9.936160436513902e-06, + "loss": 1.2823, + "step": 1694 + }, + { + "epoch": 0.4920888372768181, + "grad_norm": 3.5445947647094727, + "learning_rate": 9.93600739623576e-06, + "loss": 1.2434, + "step": 1695 + }, + { + "epoch": 0.49237915517491654, + "grad_norm": 3.2320921421051025, + "learning_rate": 9.935854173919625e-06, + "loss": 1.1279, + "step": 1696 + }, + { + "epoch": 0.49266947307301495, + "grad_norm": 3.1317856311798096, + "learning_rate": 9.935700769571148e-06, + "loss": 1.1443, + "step": 1697 + }, + { + "epoch": 0.49295979097111337, + "grad_norm": 3.772987127304077, + "learning_rate": 9.935547183195985e-06, + "loss": 1.2283, + "step": 1698 + }, + { + "epoch": 0.4932501088692118, + "grad_norm": 3.737846851348877, + "learning_rate": 9.935393414799797e-06, + "loss": 1.1608, + "step": 1699 + }, + { + "epoch": 0.4935404267673102, + "grad_norm": 4.081494331359863, + "learning_rate": 9.935239464388262e-06, + "loss": 1.2129, + "step": 1700 + }, + { + "epoch": 0.4938307446654086, + "grad_norm": 3.5556063652038574, + "learning_rate": 9.935085331967054e-06, + "loss": 1.2782, + "step": 1701 + }, + { + "epoch": 0.494121062563507, + "grad_norm": 3.9093804359436035, + "learning_rate": 9.934931017541856e-06, + "loss": 1.2373, + "step": 1702 + }, + { + "epoch": 0.49441138046160543, + "grad_norm": 3.6765968799591064, + "learning_rate": 9.934776521118362e-06, + "loss": 1.2736, + "step": 1703 + }, + { + "epoch": 0.4947016983597039, + "grad_norm": 3.605074644088745, + "learning_rate": 9.934621842702265e-06, + "loss": 1.1006, + "step": 1704 + }, + { + "epoch": 0.4949920162578023, + "grad_norm": 3.1863555908203125, + "learning_rate": 9.934466982299276e-06, + "loss": 1.0095, + "step": 1705 + }, + { + "epoch": 0.4952823341559007, + "grad_norm": 3.9221925735473633, + "learning_rate": 9.934311939915101e-06, + "loss": 1.2584, + "step": 1706 + }, + { + "epoch": 0.49557265205399914, + "grad_norm": 3.368342161178589, + "learning_rate": 9.93415671555546e-06, + "loss": 1.2051, + "step": 1707 + }, + { + "epoch": 0.49586296995209755, + 
"grad_norm": 3.4629364013671875, + "learning_rate": 9.934001309226079e-06, + "loss": 1.0938, + "step": 1708 + }, + { + "epoch": 0.49615328785019597, + "grad_norm": 3.376192331314087, + "learning_rate": 9.933845720932685e-06, + "loss": 1.1602, + "step": 1709 + }, + { + "epoch": 0.4964436057482944, + "grad_norm": 3.689114809036255, + "learning_rate": 9.933689950681021e-06, + "loss": 1.1903, + "step": 1710 + }, + { + "epoch": 0.4967339236463928, + "grad_norm": 3.5950229167938232, + "learning_rate": 9.933533998476828e-06, + "loss": 1.1267, + "step": 1711 + }, + { + "epoch": 0.4970242415444912, + "grad_norm": 3.553500175476074, + "learning_rate": 9.933377864325861e-06, + "loss": 1.1726, + "step": 1712 + }, + { + "epoch": 0.4973145594425896, + "grad_norm": 3.4887280464172363, + "learning_rate": 9.933221548233875e-06, + "loss": 1.1724, + "step": 1713 + }, + { + "epoch": 0.49760487734068803, + "grad_norm": 3.257399082183838, + "learning_rate": 9.933065050206635e-06, + "loss": 1.1709, + "step": 1714 + }, + { + "epoch": 0.49789519523878645, + "grad_norm": 3.813685655593872, + "learning_rate": 9.932908370249914e-06, + "loss": 1.3864, + "step": 1715 + }, + { + "epoch": 0.4981855131368849, + "grad_norm": 3.354031562805176, + "learning_rate": 9.932751508369492e-06, + "loss": 1.2201, + "step": 1716 + }, + { + "epoch": 0.49847583103498333, + "grad_norm": 3.2486491203308105, + "learning_rate": 9.93259446457115e-06, + "loss": 1.2024, + "step": 1717 + }, + { + "epoch": 0.49876614893308174, + "grad_norm": 3.415264368057251, + "learning_rate": 9.932437238860682e-06, + "loss": 1.1056, + "step": 1718 + }, + { + "epoch": 0.49905646683118016, + "grad_norm": 3.367347478866577, + "learning_rate": 9.932279831243884e-06, + "loss": 1.1409, + "step": 1719 + }, + { + "epoch": 0.49934678472927857, + "grad_norm": 3.6677513122558594, + "learning_rate": 9.932122241726565e-06, + "loss": 1.1554, + "step": 1720 + }, + { + "epoch": 0.499637102627377, + "grad_norm": 3.5150060653686523, + "learning_rate": 9.931964470314535e-06, + "loss": 1.2135, + "step": 1721 + }, + { + "epoch": 0.4999274205254754, + "grad_norm": 3.3909170627593994, + "learning_rate": 9.931806517013612e-06, + "loss": 1.0787, + "step": 1722 + }, + { + "epoch": 0.5002177384235739, + "grad_norm": 3.4581210613250732, + "learning_rate": 9.931648381829623e-06, + "loss": 1.0847, + "step": 1723 + }, + { + "epoch": 0.5005080563216723, + "grad_norm": 3.3497819900512695, + "learning_rate": 9.931490064768397e-06, + "loss": 1.1567, + "step": 1724 + }, + { + "epoch": 0.5007983742197707, + "grad_norm": 3.56885027885437, + "learning_rate": 9.931331565835775e-06, + "loss": 1.2172, + "step": 1725 + }, + { + "epoch": 0.5010886921178691, + "grad_norm": 3.825061321258545, + "learning_rate": 9.931172885037604e-06, + "loss": 1.3385, + "step": 1726 + }, + { + "epoch": 0.5013790100159675, + "grad_norm": 3.5020551681518555, + "learning_rate": 9.93101402237973e-06, + "loss": 1.17, + "step": 1727 + }, + { + "epoch": 0.5016693279140659, + "grad_norm": 3.285560369491577, + "learning_rate": 9.930854977868019e-06, + "loss": 1.0894, + "step": 1728 + }, + { + "epoch": 0.5019596458121643, + "grad_norm": 3.811467409133911, + "learning_rate": 9.930695751508333e-06, + "loss": 1.3049, + "step": 1729 + }, + { + "epoch": 0.5022499637102628, + "grad_norm": 3.4026193618774414, + "learning_rate": 9.930536343306542e-06, + "loss": 1.1131, + "step": 1730 + }, + { + "epoch": 0.5025402816083612, + "grad_norm": 3.4770872592926025, + "learning_rate": 9.93037675326853e-06, + "loss": 1.2083, + "step": 1731 + }, + { 
+ "epoch": 0.5028305995064596, + "grad_norm": 3.191282272338867, + "learning_rate": 9.930216981400176e-06, + "loss": 1.0672, + "step": 1732 + }, + { + "epoch": 0.503120917404558, + "grad_norm": 3.5323238372802734, + "learning_rate": 9.93005702770738e-06, + "loss": 1.2014, + "step": 1733 + }, + { + "epoch": 0.5034112353026564, + "grad_norm": 3.5278778076171875, + "learning_rate": 9.929896892196036e-06, + "loss": 1.2671, + "step": 1734 + }, + { + "epoch": 0.5037015532007548, + "grad_norm": 4.011770248413086, + "learning_rate": 9.929736574872052e-06, + "loss": 1.1926, + "step": 1735 + }, + { + "epoch": 0.5039918710988532, + "grad_norm": 3.5172500610351562, + "learning_rate": 9.929576075741335e-06, + "loss": 1.2072, + "step": 1736 + }, + { + "epoch": 0.5042821889969517, + "grad_norm": 3.119262218475342, + "learning_rate": 9.929415394809813e-06, + "loss": 1.2116, + "step": 1737 + }, + { + "epoch": 0.5045725068950501, + "grad_norm": 3.0061795711517334, + "learning_rate": 9.929254532083406e-06, + "loss": 0.9696, + "step": 1738 + }, + { + "epoch": 0.5048628247931485, + "grad_norm": 3.344226598739624, + "learning_rate": 9.929093487568048e-06, + "loss": 1.2049, + "step": 1739 + }, + { + "epoch": 0.5051531426912469, + "grad_norm": 3.819347620010376, + "learning_rate": 9.928932261269679e-06, + "loss": 1.4237, + "step": 1740 + }, + { + "epoch": 0.5054434605893453, + "grad_norm": 3.798185348510742, + "learning_rate": 9.928770853194245e-06, + "loss": 1.2619, + "step": 1741 + }, + { + "epoch": 0.5057337784874437, + "grad_norm": 3.4737367630004883, + "learning_rate": 9.928609263347695e-06, + "loss": 1.168, + "step": 1742 + }, + { + "epoch": 0.5060240963855421, + "grad_norm": 3.425579786300659, + "learning_rate": 9.928447491735994e-06, + "loss": 1.1395, + "step": 1743 + }, + { + "epoch": 0.5063144142836405, + "grad_norm": 3.61008882522583, + "learning_rate": 9.928285538365104e-06, + "loss": 1.2144, + "step": 1744 + }, + { + "epoch": 0.506604732181739, + "grad_norm": 3.7203760147094727, + "learning_rate": 9.928123403240999e-06, + "loss": 1.2222, + "step": 1745 + }, + { + "epoch": 0.5068950500798374, + "grad_norm": 3.9801478385925293, + "learning_rate": 9.927961086369658e-06, + "loss": 1.3081, + "step": 1746 + }, + { + "epoch": 0.5071853679779358, + "grad_norm": 3.47685170173645, + "learning_rate": 9.927798587757068e-06, + "loss": 1.2011, + "step": 1747 + }, + { + "epoch": 0.5074756858760343, + "grad_norm": 3.4289331436157227, + "learning_rate": 9.927635907409224e-06, + "loss": 1.0605, + "step": 1748 + }, + { + "epoch": 0.5077660037741327, + "grad_norm": 3.3467659950256348, + "learning_rate": 9.92747304533212e-06, + "loss": 1.166, + "step": 1749 + }, + { + "epoch": 0.5080563216722311, + "grad_norm": 3.1214489936828613, + "learning_rate": 9.927310001531767e-06, + "loss": 1.122, + "step": 1750 + }, + { + "epoch": 0.5083466395703296, + "grad_norm": 3.7944412231445312, + "learning_rate": 9.927146776014176e-06, + "loss": 1.1537, + "step": 1751 + }, + { + "epoch": 0.508636957468428, + "grad_norm": 3.5604870319366455, + "learning_rate": 9.926983368785367e-06, + "loss": 1.2378, + "step": 1752 + }, + { + "epoch": 0.5089272753665264, + "grad_norm": 3.4572842121124268, + "learning_rate": 9.926819779851366e-06, + "loss": 1.1053, + "step": 1753 + }, + { + "epoch": 0.5092175932646248, + "grad_norm": 3.5131027698516846, + "learning_rate": 9.926656009218208e-06, + "loss": 1.1586, + "step": 1754 + }, + { + "epoch": 0.5095079111627232, + "grad_norm": 3.2035908699035645, + "learning_rate": 9.926492056891932e-06, + "loss": 
1.0894, + "step": 1755 + }, + { + "epoch": 0.5097982290608216, + "grad_norm": 3.468350887298584, + "learning_rate": 9.926327922878582e-06, + "loss": 1.1021, + "step": 1756 + }, + { + "epoch": 0.51008854695892, + "grad_norm": 3.570665121078491, + "learning_rate": 9.926163607184215e-06, + "loss": 1.2883, + "step": 1757 + }, + { + "epoch": 0.5103788648570184, + "grad_norm": 3.7645089626312256, + "learning_rate": 9.925999109814888e-06, + "loss": 1.4159, + "step": 1758 + }, + { + "epoch": 0.5106691827551169, + "grad_norm": 3.5040338039398193, + "learning_rate": 9.925834430776668e-06, + "loss": 1.1979, + "step": 1759 + }, + { + "epoch": 0.5109595006532153, + "grad_norm": 3.4286630153656006, + "learning_rate": 9.92566957007563e-06, + "loss": 1.1681, + "step": 1760 + }, + { + "epoch": 0.5112498185513137, + "grad_norm": 3.727626085281372, + "learning_rate": 9.925504527717855e-06, + "loss": 1.2216, + "step": 1761 + }, + { + "epoch": 0.5115401364494121, + "grad_norm": 4.011921405792236, + "learning_rate": 9.925339303709424e-06, + "loss": 1.3667, + "step": 1762 + }, + { + "epoch": 0.5118304543475105, + "grad_norm": 3.5719776153564453, + "learning_rate": 9.925173898056436e-06, + "loss": 1.3837, + "step": 1763 + }, + { + "epoch": 0.5121207722456089, + "grad_norm": 3.6549808979034424, + "learning_rate": 9.925008310764988e-06, + "loss": 1.21, + "step": 1764 + }, + { + "epoch": 0.5124110901437073, + "grad_norm": 3.3320508003234863, + "learning_rate": 9.924842541841188e-06, + "loss": 1.1101, + "step": 1765 + }, + { + "epoch": 0.5127014080418058, + "grad_norm": 3.2522830963134766, + "learning_rate": 9.924676591291152e-06, + "loss": 1.1344, + "step": 1766 + }, + { + "epoch": 0.5129917259399042, + "grad_norm": 3.3519628047943115, + "learning_rate": 9.924510459120996e-06, + "loss": 1.1014, + "step": 1767 + }, + { + "epoch": 0.5132820438380026, + "grad_norm": 3.706505537033081, + "learning_rate": 9.924344145336847e-06, + "loss": 1.0986, + "step": 1768 + }, + { + "epoch": 0.513572361736101, + "grad_norm": 4.049661636352539, + "learning_rate": 9.924177649944841e-06, + "loss": 1.2321, + "step": 1769 + }, + { + "epoch": 0.5138626796341994, + "grad_norm": 3.567394495010376, + "learning_rate": 9.924010972951116e-06, + "loss": 1.2176, + "step": 1770 + }, + { + "epoch": 0.5141529975322978, + "grad_norm": 3.6421961784362793, + "learning_rate": 9.923844114361823e-06, + "loss": 1.1742, + "step": 1771 + }, + { + "epoch": 0.5144433154303962, + "grad_norm": 3.635004997253418, + "learning_rate": 9.923677074183112e-06, + "loss": 1.3064, + "step": 1772 + }, + { + "epoch": 0.5147336333284948, + "grad_norm": 3.6937615871429443, + "learning_rate": 9.923509852421144e-06, + "loss": 1.1709, + "step": 1773 + }, + { + "epoch": 0.5150239512265932, + "grad_norm": 3.8052608966827393, + "learning_rate": 9.923342449082088e-06, + "loss": 1.3354, + "step": 1774 + }, + { + "epoch": 0.5153142691246916, + "grad_norm": 3.4751036167144775, + "learning_rate": 9.923174864172114e-06, + "loss": 1.19, + "step": 1775 + }, + { + "epoch": 0.51560458702279, + "grad_norm": 3.4713563919067383, + "learning_rate": 9.923007097697406e-06, + "loss": 1.2449, + "step": 1776 + }, + { + "epoch": 0.5158949049208884, + "grad_norm": 3.4838809967041016, + "learning_rate": 9.92283914966415e-06, + "loss": 1.3047, + "step": 1777 + }, + { + "epoch": 0.5161852228189868, + "grad_norm": 3.8674657344818115, + "learning_rate": 9.92267102007854e-06, + "loss": 1.3504, + "step": 1778 + }, + { + "epoch": 0.5164755407170852, + "grad_norm": 3.565331220626831, + "learning_rate": 
9.922502708946776e-06, + "loss": 1.1211, + "step": 1779 + }, + { + "epoch": 0.5167658586151836, + "grad_norm": 3.551572561264038, + "learning_rate": 9.922334216275065e-06, + "loss": 1.0492, + "step": 1780 + }, + { + "epoch": 0.5170561765132821, + "grad_norm": 3.5140163898468018, + "learning_rate": 9.922165542069621e-06, + "loss": 1.2428, + "step": 1781 + }, + { + "epoch": 0.5173464944113805, + "grad_norm": 3.665693759918213, + "learning_rate": 9.921996686336665e-06, + "loss": 1.3825, + "step": 1782 + }, + { + "epoch": 0.5176368123094789, + "grad_norm": 3.2127161026000977, + "learning_rate": 9.921827649082426e-06, + "loss": 1.0095, + "step": 1783 + }, + { + "epoch": 0.5179271302075773, + "grad_norm": 3.358623504638672, + "learning_rate": 9.921658430313136e-06, + "loss": 1.1453, + "step": 1784 + }, + { + "epoch": 0.5182174481056757, + "grad_norm": 3.582932472229004, + "learning_rate": 9.921489030035036e-06, + "loss": 1.2084, + "step": 1785 + }, + { + "epoch": 0.5185077660037741, + "grad_norm": 3.42759108543396, + "learning_rate": 9.921319448254374e-06, + "loss": 1.2317, + "step": 1786 + }, + { + "epoch": 0.5187980839018725, + "grad_norm": 3.603374481201172, + "learning_rate": 9.921149684977402e-06, + "loss": 1.261, + "step": 1787 + }, + { + "epoch": 0.519088401799971, + "grad_norm": 3.466707944869995, + "learning_rate": 9.920979740210383e-06, + "loss": 1.151, + "step": 1788 + }, + { + "epoch": 0.5193787196980694, + "grad_norm": 3.541694164276123, + "learning_rate": 9.920809613959585e-06, + "loss": 1.2843, + "step": 1789 + }, + { + "epoch": 0.5196690375961678, + "grad_norm": 3.0089690685272217, + "learning_rate": 9.920639306231282e-06, + "loss": 1.0789, + "step": 1790 + }, + { + "epoch": 0.5199593554942662, + "grad_norm": 3.8396410942077637, + "learning_rate": 9.920468817031754e-06, + "loss": 1.1977, + "step": 1791 + }, + { + "epoch": 0.5202496733923646, + "grad_norm": 3.1440768241882324, + "learning_rate": 9.920298146367287e-06, + "loss": 0.9836, + "step": 1792 + }, + { + "epoch": 0.520539991290463, + "grad_norm": 3.4042434692382812, + "learning_rate": 9.920127294244178e-06, + "loss": 1.0643, + "step": 1793 + }, + { + "epoch": 0.5208303091885614, + "grad_norm": 3.349679946899414, + "learning_rate": 9.919956260668726e-06, + "loss": 1.2135, + "step": 1794 + }, + { + "epoch": 0.5211206270866598, + "grad_norm": 3.703922986984253, + "learning_rate": 9.91978504564724e-06, + "loss": 1.275, + "step": 1795 + }, + { + "epoch": 0.5214109449847583, + "grad_norm": 3.6810669898986816, + "learning_rate": 9.919613649186034e-06, + "loss": 1.2586, + "step": 1796 + }, + { + "epoch": 0.5217012628828567, + "grad_norm": 3.357159376144409, + "learning_rate": 9.919442071291428e-06, + "loss": 1.0915, + "step": 1797 + }, + { + "epoch": 0.5219915807809552, + "grad_norm": 3.6635754108428955, + "learning_rate": 9.919270311969752e-06, + "loss": 1.2885, + "step": 1798 + }, + { + "epoch": 0.5222818986790536, + "grad_norm": 3.329967975616455, + "learning_rate": 9.919098371227338e-06, + "loss": 1.2306, + "step": 1799 + }, + { + "epoch": 0.522572216577152, + "grad_norm": 3.19476056098938, + "learning_rate": 9.918926249070528e-06, + "loss": 1.1987, + "step": 1800 + }, + { + "epoch": 0.5228625344752504, + "grad_norm": 3.237572431564331, + "learning_rate": 9.918753945505671e-06, + "loss": 1.2641, + "step": 1801 + }, + { + "epoch": 0.5231528523733489, + "grad_norm": 3.1060678958892822, + "learning_rate": 9.91858146053912e-06, + "loss": 1.1233, + "step": 1802 + }, + { + "epoch": 0.5234431702714473, + "grad_norm": 
3.3326449394226074, + "learning_rate": 9.918408794177236e-06, + "loss": 1.0348, + "step": 1803 + }, + { + "epoch": 0.5237334881695457, + "grad_norm": 3.1791276931762695, + "learning_rate": 9.918235946426389e-06, + "loss": 1.0184, + "step": 1804 + }, + { + "epoch": 0.5240238060676441, + "grad_norm": 3.0655264854431152, + "learning_rate": 9.918062917292951e-06, + "loss": 1.0412, + "step": 1805 + }, + { + "epoch": 0.5243141239657425, + "grad_norm": 3.459871768951416, + "learning_rate": 9.917889706783304e-06, + "loss": 1.1433, + "step": 1806 + }, + { + "epoch": 0.5246044418638409, + "grad_norm": 3.5047693252563477, + "learning_rate": 9.917716314903838e-06, + "loss": 1.2992, + "step": 1807 + }, + { + "epoch": 0.5248947597619393, + "grad_norm": 3.4301116466522217, + "learning_rate": 9.917542741660943e-06, + "loss": 1.2329, + "step": 1808 + }, + { + "epoch": 0.5251850776600377, + "grad_norm": 3.4077882766723633, + "learning_rate": 9.917368987061026e-06, + "loss": 1.1486, + "step": 1809 + }, + { + "epoch": 0.5254753955581362, + "grad_norm": 3.492203950881958, + "learning_rate": 9.917195051110492e-06, + "loss": 1.1808, + "step": 1810 + }, + { + "epoch": 0.5257657134562346, + "grad_norm": 3.3428704738616943, + "learning_rate": 9.917020933815753e-06, + "loss": 1.1278, + "step": 1811 + }, + { + "epoch": 0.526056031354333, + "grad_norm": 3.4210922718048096, + "learning_rate": 9.916846635183235e-06, + "loss": 1.1373, + "step": 1812 + }, + { + "epoch": 0.5263463492524314, + "grad_norm": 3.6874492168426514, + "learning_rate": 9.916672155219365e-06, + "loss": 1.4229, + "step": 1813 + }, + { + "epoch": 0.5266366671505298, + "grad_norm": 3.638094902038574, + "learning_rate": 9.916497493930574e-06, + "loss": 1.1038, + "step": 1814 + }, + { + "epoch": 0.5269269850486282, + "grad_norm": 3.680783987045288, + "learning_rate": 9.91632265132331e-06, + "loss": 1.3286, + "step": 1815 + }, + { + "epoch": 0.5272173029467266, + "grad_norm": 3.4472410678863525, + "learning_rate": 9.916147627404016e-06, + "loss": 1.2083, + "step": 1816 + }, + { + "epoch": 0.527507620844825, + "grad_norm": 3.3821465969085693, + "learning_rate": 9.91597242217915e-06, + "loss": 1.1939, + "step": 1817 + }, + { + "epoch": 0.5277979387429235, + "grad_norm": 3.5459840297698975, + "learning_rate": 9.91579703565517e-06, + "loss": 1.162, + "step": 1818 + }, + { + "epoch": 0.5280882566410219, + "grad_norm": 3.6665494441986084, + "learning_rate": 9.915621467838546e-06, + "loss": 1.202, + "step": 1819 + }, + { + "epoch": 0.5283785745391203, + "grad_norm": 3.051856756210327, + "learning_rate": 9.915445718735755e-06, + "loss": 1.0528, + "step": 1820 + }, + { + "epoch": 0.5286688924372187, + "grad_norm": 3.6828055381774902, + "learning_rate": 9.915269788353274e-06, + "loss": 1.1336, + "step": 1821 + }, + { + "epoch": 0.5289592103353171, + "grad_norm": 3.4396843910217285, + "learning_rate": 9.915093676697597e-06, + "loss": 1.2528, + "step": 1822 + }, + { + "epoch": 0.5292495282334156, + "grad_norm": 3.3097896575927734, + "learning_rate": 9.914917383775211e-06, + "loss": 1.1547, + "step": 1823 + }, + { + "epoch": 0.5295398461315141, + "grad_norm": 3.2964537143707275, + "learning_rate": 9.914740909592627e-06, + "loss": 1.1173, + "step": 1824 + }, + { + "epoch": 0.5298301640296125, + "grad_norm": 3.3613085746765137, + "learning_rate": 9.914564254156345e-06, + "loss": 1.0037, + "step": 1825 + }, + { + "epoch": 0.5301204819277109, + "grad_norm": 3.1005563735961914, + "learning_rate": 9.914387417472886e-06, + "loss": 1.1261, + "step": 1826 + }, + { + "epoch": 
0.5304107998258093, + "grad_norm": 3.254185914993286, + "learning_rate": 9.914210399548768e-06, + "loss": 1.2195, + "step": 1827 + }, + { + "epoch": 0.5307011177239077, + "grad_norm": 3.0074527263641357, + "learning_rate": 9.91403320039052e-06, + "loss": 1.0624, + "step": 1828 + }, + { + "epoch": 0.5309914356220061, + "grad_norm": 3.3639132976531982, + "learning_rate": 9.91385582000468e-06, + "loss": 1.1939, + "step": 1829 + }, + { + "epoch": 0.5312817535201045, + "grad_norm": 3.1890807151794434, + "learning_rate": 9.913678258397785e-06, + "loss": 1.1671, + "step": 1830 + }, + { + "epoch": 0.531572071418203, + "grad_norm": 3.707369327545166, + "learning_rate": 9.913500515576388e-06, + "loss": 1.1766, + "step": 1831 + }, + { + "epoch": 0.5318623893163014, + "grad_norm": 3.2515759468078613, + "learning_rate": 9.913322591547042e-06, + "loss": 1.15, + "step": 1832 + }, + { + "epoch": 0.5321527072143998, + "grad_norm": 3.618812322616577, + "learning_rate": 9.913144486316306e-06, + "loss": 1.2448, + "step": 1833 + }, + { + "epoch": 0.5324430251124982, + "grad_norm": 3.4694342613220215, + "learning_rate": 9.912966199890753e-06, + "loss": 1.3931, + "step": 1834 + }, + { + "epoch": 0.5327333430105966, + "grad_norm": 3.811699628829956, + "learning_rate": 9.912787732276955e-06, + "loss": 1.2158, + "step": 1835 + }, + { + "epoch": 0.533023660908695, + "grad_norm": 3.5045254230499268, + "learning_rate": 9.912609083481494e-06, + "loss": 1.1664, + "step": 1836 + }, + { + "epoch": 0.5333139788067934, + "grad_norm": 3.1756935119628906, + "learning_rate": 9.912430253510963e-06, + "loss": 1.1034, + "step": 1837 + }, + { + "epoch": 0.5336042967048918, + "grad_norm": 3.141693115234375, + "learning_rate": 9.912251242371952e-06, + "loss": 0.9284, + "step": 1838 + }, + { + "epoch": 0.5338946146029903, + "grad_norm": 3.484868288040161, + "learning_rate": 9.912072050071063e-06, + "loss": 1.2705, + "step": 1839 + }, + { + "epoch": 0.5341849325010887, + "grad_norm": 3.564931631088257, + "learning_rate": 9.911892676614908e-06, + "loss": 1.1495, + "step": 1840 + }, + { + "epoch": 0.5344752503991871, + "grad_norm": 3.510122060775757, + "learning_rate": 9.9117131220101e-06, + "loss": 1.3716, + "step": 1841 + }, + { + "epoch": 0.5347655682972855, + "grad_norm": 3.416837453842163, + "learning_rate": 9.911533386263262e-06, + "loss": 1.3552, + "step": 1842 + }, + { + "epoch": 0.5350558861953839, + "grad_norm": 3.3100061416625977, + "learning_rate": 9.91135346938102e-06, + "loss": 1.2652, + "step": 1843 + }, + { + "epoch": 0.5353462040934823, + "grad_norm": 3.4213778972625732, + "learning_rate": 9.91117337137001e-06, + "loss": 1.0756, + "step": 1844 + }, + { + "epoch": 0.5356365219915807, + "grad_norm": 3.4177422523498535, + "learning_rate": 9.910993092236878e-06, + "loss": 1.1127, + "step": 1845 + }, + { + "epoch": 0.5359268398896792, + "grad_norm": 3.432579278945923, + "learning_rate": 9.910812631988268e-06, + "loss": 1.117, + "step": 1846 + }, + { + "epoch": 0.5362171577877776, + "grad_norm": 3.2651829719543457, + "learning_rate": 9.910631990630837e-06, + "loss": 1.1663, + "step": 1847 + }, + { + "epoch": 0.5365074756858761, + "grad_norm": 3.6530210971832275, + "learning_rate": 9.910451168171248e-06, + "loss": 1.0423, + "step": 1848 + }, + { + "epoch": 0.5367977935839745, + "grad_norm": 3.6912314891815186, + "learning_rate": 9.910270164616168e-06, + "loss": 1.1442, + "step": 1849 + }, + { + "epoch": 0.5370881114820729, + "grad_norm": 3.458739757537842, + "learning_rate": 9.910088979972272e-06, + "loss": 1.1812, + "step": 
1850 + }, + { + "epoch": 0.5373784293801713, + "grad_norm": 3.281719923019409, + "learning_rate": 9.909907614246244e-06, + "loss": 1.101, + "step": 1851 + }, + { + "epoch": 0.5376687472782697, + "grad_norm": 3.4149019718170166, + "learning_rate": 9.909726067444772e-06, + "loss": 1.1371, + "step": 1852 + }, + { + "epoch": 0.5379590651763682, + "grad_norm": 3.3794870376586914, + "learning_rate": 9.909544339574549e-06, + "loss": 1.1995, + "step": 1853 + }, + { + "epoch": 0.5382493830744666, + "grad_norm": 3.4699738025665283, + "learning_rate": 9.90936243064228e-06, + "loss": 1.2481, + "step": 1854 + }, + { + "epoch": 0.538539700972565, + "grad_norm": 3.468823194503784, + "learning_rate": 9.909180340654674e-06, + "loss": 1.2427, + "step": 1855 + }, + { + "epoch": 0.5388300188706634, + "grad_norm": 3.8242857456207275, + "learning_rate": 9.908998069618445e-06, + "loss": 1.1741, + "step": 1856 + }, + { + "epoch": 0.5391203367687618, + "grad_norm": 3.8072662353515625, + "learning_rate": 9.908815617540314e-06, + "loss": 1.238, + "step": 1857 + }, + { + "epoch": 0.5394106546668602, + "grad_norm": 3.4818027019500732, + "learning_rate": 9.908632984427012e-06, + "loss": 1.3667, + "step": 1858 + }, + { + "epoch": 0.5397009725649586, + "grad_norm": 3.2435076236724854, + "learning_rate": 9.908450170285273e-06, + "loss": 1.133, + "step": 1859 + }, + { + "epoch": 0.539991290463057, + "grad_norm": 3.8168723583221436, + "learning_rate": 9.90826717512184e-06, + "loss": 1.232, + "step": 1860 + }, + { + "epoch": 0.5402816083611555, + "grad_norm": 3.5808327198028564, + "learning_rate": 9.90808399894346e-06, + "loss": 1.2569, + "step": 1861 + }, + { + "epoch": 0.5405719262592539, + "grad_norm": 3.1764636039733887, + "learning_rate": 9.907900641756891e-06, + "loss": 1.0774, + "step": 1862 + }, + { + "epoch": 0.5408622441573523, + "grad_norm": 3.4908952713012695, + "learning_rate": 9.907717103568895e-06, + "loss": 1.239, + "step": 1863 + }, + { + "epoch": 0.5411525620554507, + "grad_norm": 3.6539740562438965, + "learning_rate": 9.907533384386238e-06, + "loss": 1.2073, + "step": 1864 + }, + { + "epoch": 0.5414428799535491, + "grad_norm": 3.764848470687866, + "learning_rate": 9.907349484215698e-06, + "loss": 1.3799, + "step": 1865 + }, + { + "epoch": 0.5417331978516475, + "grad_norm": 3.1396989822387695, + "learning_rate": 9.907165403064057e-06, + "loss": 1.1709, + "step": 1866 + }, + { + "epoch": 0.5420235157497459, + "grad_norm": 3.9477617740631104, + "learning_rate": 9.906981140938102e-06, + "loss": 1.2874, + "step": 1867 + }, + { + "epoch": 0.5423138336478444, + "grad_norm": 3.45196795463562, + "learning_rate": 9.90679669784463e-06, + "loss": 1.2163, + "step": 1868 + }, + { + "epoch": 0.5426041515459428, + "grad_norm": 3.3559353351593018, + "learning_rate": 9.906612073790443e-06, + "loss": 1.1945, + "step": 1869 + }, + { + "epoch": 0.5428944694440412, + "grad_norm": 3.3837227821350098, + "learning_rate": 9.906427268782351e-06, + "loss": 1.1423, + "step": 1870 + }, + { + "epoch": 0.5431847873421396, + "grad_norm": 3.3866822719573975, + "learning_rate": 9.906242282827167e-06, + "loss": 1.1683, + "step": 1871 + }, + { + "epoch": 0.543475105240238, + "grad_norm": 3.538224220275879, + "learning_rate": 9.906057115931716e-06, + "loss": 1.0664, + "step": 1872 + }, + { + "epoch": 0.5437654231383364, + "grad_norm": 3.6277942657470703, + "learning_rate": 9.905871768102824e-06, + "loss": 1.2454, + "step": 1873 + }, + { + "epoch": 0.544055741036435, + "grad_norm": 3.9439074993133545, + "learning_rate": 9.905686239347329e-06, + 
"loss": 1.105, + "step": 1874 + }, + { + "epoch": 0.5443460589345334, + "grad_norm": 3.3200228214263916, + "learning_rate": 9.905500529672072e-06, + "loss": 1.0594, + "step": 1875 + }, + { + "epoch": 0.5446363768326318, + "grad_norm": 3.45715594291687, + "learning_rate": 9.905314639083902e-06, + "loss": 1.1502, + "step": 1876 + }, + { + "epoch": 0.5449266947307302, + "grad_norm": 3.3772661685943604, + "learning_rate": 9.905128567589674e-06, + "loss": 1.2959, + "step": 1877 + }, + { + "epoch": 0.5452170126288286, + "grad_norm": 3.353414297103882, + "learning_rate": 9.904942315196253e-06, + "loss": 1.337, + "step": 1878 + }, + { + "epoch": 0.545507330526927, + "grad_norm": 3.0528392791748047, + "learning_rate": 9.904755881910504e-06, + "loss": 0.9722, + "step": 1879 + }, + { + "epoch": 0.5457976484250254, + "grad_norm": 3.1231632232666016, + "learning_rate": 9.904569267739305e-06, + "loss": 1.0102, + "step": 1880 + }, + { + "epoch": 0.5460879663231238, + "grad_norm": 3.1487677097320557, + "learning_rate": 9.904382472689539e-06, + "loss": 1.1653, + "step": 1881 + }, + { + "epoch": 0.5463782842212223, + "grad_norm": 3.181234359741211, + "learning_rate": 9.904195496768092e-06, + "loss": 1.0459, + "step": 1882 + }, + { + "epoch": 0.5466686021193207, + "grad_norm": 3.1367695331573486, + "learning_rate": 9.904008339981861e-06, + "loss": 1.2362, + "step": 1883 + }, + { + "epoch": 0.5469589200174191, + "grad_norm": 3.5613062381744385, + "learning_rate": 9.90382100233775e-06, + "loss": 1.1144, + "step": 1884 + }, + { + "epoch": 0.5472492379155175, + "grad_norm": 3.170631170272827, + "learning_rate": 9.903633483842666e-06, + "loss": 1.0733, + "step": 1885 + }, + { + "epoch": 0.5475395558136159, + "grad_norm": 3.4632740020751953, + "learning_rate": 9.903445784503525e-06, + "loss": 1.1683, + "step": 1886 + }, + { + "epoch": 0.5478298737117143, + "grad_norm": 3.335059642791748, + "learning_rate": 9.90325790432725e-06, + "loss": 1.1356, + "step": 1887 + }, + { + "epoch": 0.5481201916098127, + "grad_norm": 3.508770704269409, + "learning_rate": 9.903069843320768e-06, + "loss": 1.1451, + "step": 1888 + }, + { + "epoch": 0.5484105095079111, + "grad_norm": 3.5093181133270264, + "learning_rate": 9.902881601491018e-06, + "loss": 1.1466, + "step": 1889 + }, + { + "epoch": 0.5487008274060096, + "grad_norm": 3.2411201000213623, + "learning_rate": 9.902693178844937e-06, + "loss": 1.1672, + "step": 1890 + }, + { + "epoch": 0.548991145304108, + "grad_norm": 3.413616418838501, + "learning_rate": 9.902504575389477e-06, + "loss": 1.1278, + "step": 1891 + }, + { + "epoch": 0.5492814632022064, + "grad_norm": 3.36161732673645, + "learning_rate": 9.902315791131596e-06, + "loss": 1.144, + "step": 1892 + }, + { + "epoch": 0.5495717811003048, + "grad_norm": 3.3496720790863037, + "learning_rate": 9.902126826078254e-06, + "loss": 1.2088, + "step": 1893 + }, + { + "epoch": 0.5498620989984032, + "grad_norm": 3.206235647201538, + "learning_rate": 9.901937680236419e-06, + "loss": 1.0498, + "step": 1894 + }, + { + "epoch": 0.5501524168965016, + "grad_norm": 3.4635236263275146, + "learning_rate": 9.901748353613069e-06, + "loss": 1.2035, + "step": 1895 + }, + { + "epoch": 0.5504427347946, + "grad_norm": 3.303619623184204, + "learning_rate": 9.901558846215185e-06, + "loss": 1.2198, + "step": 1896 + }, + { + "epoch": 0.5507330526926985, + "grad_norm": 3.401362419128418, + "learning_rate": 9.901369158049755e-06, + "loss": 1.1518, + "step": 1897 + }, + { + "epoch": 0.5510233705907969, + "grad_norm": 3.1420388221740723, + "learning_rate": 
9.901179289123775e-06, + "loss": 1.1403, + "step": 1898 + }, + { + "epoch": 0.5513136884888954, + "grad_norm": 3.4058547019958496, + "learning_rate": 9.900989239444248e-06, + "loss": 1.1038, + "step": 1899 + }, + { + "epoch": 0.5516040063869938, + "grad_norm": 3.373687982559204, + "learning_rate": 9.900799009018183e-06, + "loss": 1.1644, + "step": 1900 + }, + { + "epoch": 0.5518943242850922, + "grad_norm": 3.383594512939453, + "learning_rate": 9.900608597852595e-06, + "loss": 1.2449, + "step": 1901 + }, + { + "epoch": 0.5521846421831906, + "grad_norm": 3.35893177986145, + "learning_rate": 9.900418005954506e-06, + "loss": 1.223, + "step": 1902 + }, + { + "epoch": 0.552474960081289, + "grad_norm": 3.3038265705108643, + "learning_rate": 9.900227233330947e-06, + "loss": 1.1816, + "step": 1903 + }, + { + "epoch": 0.5527652779793875, + "grad_norm": 3.53434157371521, + "learning_rate": 9.900036279988953e-06, + "loss": 1.191, + "step": 1904 + }, + { + "epoch": 0.5530555958774859, + "grad_norm": 3.4917852878570557, + "learning_rate": 9.899845145935563e-06, + "loss": 1.279, + "step": 1905 + }, + { + "epoch": 0.5533459137755843, + "grad_norm": 3.4064924716949463, + "learning_rate": 9.899653831177831e-06, + "loss": 1.1646, + "step": 1906 + }, + { + "epoch": 0.5536362316736827, + "grad_norm": 3.37669038772583, + "learning_rate": 9.89946233572281e-06, + "loss": 1.1901, + "step": 1907 + }, + { + "epoch": 0.5539265495717811, + "grad_norm": 3.174514055252075, + "learning_rate": 9.89927065957756e-06, + "loss": 0.989, + "step": 1908 + }, + { + "epoch": 0.5542168674698795, + "grad_norm": 4.024920463562012, + "learning_rate": 9.899078802749153e-06, + "loss": 1.3463, + "step": 1909 + }, + { + "epoch": 0.5545071853679779, + "grad_norm": 3.733576774597168, + "learning_rate": 9.898886765244663e-06, + "loss": 1.1643, + "step": 1910 + }, + { + "epoch": 0.5547975032660764, + "grad_norm": 3.5115084648132324, + "learning_rate": 9.898694547071177e-06, + "loss": 1.2633, + "step": 1911 + }, + { + "epoch": 0.5550878211641748, + "grad_norm": 3.4509117603302, + "learning_rate": 9.898502148235777e-06, + "loss": 1.1849, + "step": 1912 + }, + { + "epoch": 0.5553781390622732, + "grad_norm": 3.595416784286499, + "learning_rate": 9.898309568745562e-06, + "loss": 1.196, + "step": 1913 + }, + { + "epoch": 0.5556684569603716, + "grad_norm": 3.3942644596099854, + "learning_rate": 9.898116808607634e-06, + "loss": 1.1612, + "step": 1914 + }, + { + "epoch": 0.55595877485847, + "grad_norm": 3.5363807678222656, + "learning_rate": 9.897923867829102e-06, + "loss": 1.2277, + "step": 1915 + }, + { + "epoch": 0.5562490927565684, + "grad_norm": 3.9670045375823975, + "learning_rate": 9.897730746417082e-06, + "loss": 1.2816, + "step": 1916 + }, + { + "epoch": 0.5565394106546668, + "grad_norm": 3.706681251525879, + "learning_rate": 9.897537444378696e-06, + "loss": 1.1865, + "step": 1917 + }, + { + "epoch": 0.5568297285527652, + "grad_norm": 3.671945571899414, + "learning_rate": 9.897343961721071e-06, + "loss": 1.284, + "step": 1918 + }, + { + "epoch": 0.5571200464508637, + "grad_norm": 3.5591561794281006, + "learning_rate": 9.897150298451346e-06, + "loss": 1.1434, + "step": 1919 + }, + { + "epoch": 0.5574103643489621, + "grad_norm": 3.2542104721069336, + "learning_rate": 9.89695645457666e-06, + "loss": 1.0362, + "step": 1920 + }, + { + "epoch": 0.5577006822470605, + "grad_norm": 3.382683753967285, + "learning_rate": 9.896762430104163e-06, + "loss": 1.1443, + "step": 1921 + }, + { + "epoch": 0.5579910001451589, + "grad_norm": 3.6778106689453125, + 
"learning_rate": 9.896568225041013e-06, + "loss": 1.2972, + "step": 1922 + }, + { + "epoch": 0.5582813180432573, + "grad_norm": 3.39172101020813, + "learning_rate": 9.896373839394367e-06, + "loss": 1.0591, + "step": 1923 + }, + { + "epoch": 0.5585716359413558, + "grad_norm": 3.2635951042175293, + "learning_rate": 9.8961792731714e-06, + "loss": 1.1169, + "step": 1924 + }, + { + "epoch": 0.5588619538394543, + "grad_norm": 3.4910495281219482, + "learning_rate": 9.895984526379282e-06, + "loss": 1.1934, + "step": 1925 + }, + { + "epoch": 0.5591522717375527, + "grad_norm": 3.5301356315612793, + "learning_rate": 9.895789599025198e-06, + "loss": 1.1124, + "step": 1926 + }, + { + "epoch": 0.5594425896356511, + "grad_norm": 3.9778127670288086, + "learning_rate": 9.895594491116336e-06, + "loss": 1.184, + "step": 1927 + }, + { + "epoch": 0.5597329075337495, + "grad_norm": 3.4717602729797363, + "learning_rate": 9.895399202659892e-06, + "loss": 1.145, + "step": 1928 + }, + { + "epoch": 0.5600232254318479, + "grad_norm": 3.5621719360351562, + "learning_rate": 9.89520373366307e-06, + "loss": 1.1355, + "step": 1929 + }, + { + "epoch": 0.5603135433299463, + "grad_norm": 3.188401460647583, + "learning_rate": 9.895008084133075e-06, + "loss": 1.1089, + "step": 1930 + }, + { + "epoch": 0.5606038612280447, + "grad_norm": 3.5980637073516846, + "learning_rate": 9.894812254077126e-06, + "loss": 1.1874, + "step": 1931 + }, + { + "epoch": 0.5608941791261431, + "grad_norm": 3.370637893676758, + "learning_rate": 9.894616243502442e-06, + "loss": 1.2691, + "step": 1932 + }, + { + "epoch": 0.5611844970242416, + "grad_norm": 3.3705739974975586, + "learning_rate": 9.894420052416253e-06, + "loss": 1.2136, + "step": 1933 + }, + { + "epoch": 0.56147481492234, + "grad_norm": 3.4017226696014404, + "learning_rate": 9.894223680825797e-06, + "loss": 1.1104, + "step": 1934 + }, + { + "epoch": 0.5617651328204384, + "grad_norm": 3.2392518520355225, + "learning_rate": 9.894027128738311e-06, + "loss": 1.2475, + "step": 1935 + }, + { + "epoch": 0.5620554507185368, + "grad_norm": 3.236485004425049, + "learning_rate": 9.893830396161049e-06, + "loss": 1.011, + "step": 1936 + }, + { + "epoch": 0.5623457686166352, + "grad_norm": 3.775726795196533, + "learning_rate": 9.893633483101264e-06, + "loss": 1.3489, + "step": 1937 + }, + { + "epoch": 0.5626360865147336, + "grad_norm": 3.4214670658111572, + "learning_rate": 9.893436389566215e-06, + "loss": 1.1987, + "step": 1938 + }, + { + "epoch": 0.562926404412832, + "grad_norm": 3.5680062770843506, + "learning_rate": 9.893239115563179e-06, + "loss": 1.2214, + "step": 1939 + }, + { + "epoch": 0.5632167223109305, + "grad_norm": 3.623807191848755, + "learning_rate": 9.893041661099422e-06, + "loss": 1.2361, + "step": 1940 + }, + { + "epoch": 0.5635070402090289, + "grad_norm": 3.6621768474578857, + "learning_rate": 9.89284402618223e-06, + "loss": 1.2852, + "step": 1941 + }, + { + "epoch": 0.5637973581071273, + "grad_norm": 3.4510340690612793, + "learning_rate": 9.892646210818894e-06, + "loss": 1.2343, + "step": 1942 + }, + { + "epoch": 0.5640876760052257, + "grad_norm": 3.459193468093872, + "learning_rate": 9.892448215016708e-06, + "loss": 1.115, + "step": 1943 + }, + { + "epoch": 0.5643779939033241, + "grad_norm": 3.3555784225463867, + "learning_rate": 9.892250038782972e-06, + "loss": 1.1979, + "step": 1944 + }, + { + "epoch": 0.5646683118014225, + "grad_norm": 3.4835281372070312, + "learning_rate": 9.892051682124996e-06, + "loss": 1.1841, + "step": 1945 + }, + { + "epoch": 0.5649586296995209, + 
"grad_norm": 3.4608845710754395, + "learning_rate": 9.891853145050097e-06, + "loss": 1.1358, + "step": 1946 + }, + { + "epoch": 0.5652489475976193, + "grad_norm": 3.647038698196411, + "learning_rate": 9.891654427565594e-06, + "loss": 1.3349, + "step": 1947 + }, + { + "epoch": 0.5655392654957178, + "grad_norm": 3.701260805130005, + "learning_rate": 9.891455529678815e-06, + "loss": 1.2177, + "step": 1948 + }, + { + "epoch": 0.5658295833938163, + "grad_norm": 3.342308759689331, + "learning_rate": 9.8912564513971e-06, + "loss": 1.2119, + "step": 1949 + }, + { + "epoch": 0.5661199012919147, + "grad_norm": 3.529751777648926, + "learning_rate": 9.891057192727787e-06, + "loss": 1.1177, + "step": 1950 + }, + { + "epoch": 0.5664102191900131, + "grad_norm": 3.2894372940063477, + "learning_rate": 9.890857753678225e-06, + "loss": 1.3006, + "step": 1951 + }, + { + "epoch": 0.5667005370881115, + "grad_norm": 3.307856798171997, + "learning_rate": 9.890658134255771e-06, + "loss": 1.2037, + "step": 1952 + }, + { + "epoch": 0.5669908549862099, + "grad_norm": 3.4086251258850098, + "learning_rate": 9.890458334467784e-06, + "loss": 1.1736, + "step": 1953 + }, + { + "epoch": 0.5672811728843083, + "grad_norm": 3.872767925262451, + "learning_rate": 9.890258354321638e-06, + "loss": 1.309, + "step": 1954 + }, + { + "epoch": 0.5675714907824068, + "grad_norm": 3.3691158294677734, + "learning_rate": 9.890058193824702e-06, + "loss": 1.1146, + "step": 1955 + }, + { + "epoch": 0.5678618086805052, + "grad_norm": 3.3088929653167725, + "learning_rate": 9.88985785298436e-06, + "loss": 1.1527, + "step": 1956 + }, + { + "epoch": 0.5681521265786036, + "grad_norm": 3.4965968132019043, + "learning_rate": 9.889657331808003e-06, + "loss": 1.2041, + "step": 1957 + }, + { + "epoch": 0.568442444476702, + "grad_norm": 3.3518784046173096, + "learning_rate": 9.889456630303022e-06, + "loss": 1.2014, + "step": 1958 + }, + { + "epoch": 0.5687327623748004, + "grad_norm": 3.304481267929077, + "learning_rate": 9.88925574847682e-06, + "loss": 1.1083, + "step": 1959 + }, + { + "epoch": 0.5690230802728988, + "grad_norm": 3.6226377487182617, + "learning_rate": 9.889054686336808e-06, + "loss": 1.2176, + "step": 1960 + }, + { + "epoch": 0.5693133981709972, + "grad_norm": 3.2320313453674316, + "learning_rate": 9.8888534438904e-06, + "loss": 1.1255, + "step": 1961 + }, + { + "epoch": 0.5696037160690957, + "grad_norm": 3.6871187686920166, + "learning_rate": 9.888652021145015e-06, + "loss": 1.1531, + "step": 1962 + }, + { + "epoch": 0.5698940339671941, + "grad_norm": 3.502007007598877, + "learning_rate": 9.888450418108085e-06, + "loss": 1.226, + "step": 1963 + }, + { + "epoch": 0.5701843518652925, + "grad_norm": 3.3673317432403564, + "learning_rate": 9.888248634787044e-06, + "loss": 1.1027, + "step": 1964 + }, + { + "epoch": 0.5704746697633909, + "grad_norm": 3.250483751296997, + "learning_rate": 9.888046671189331e-06, + "loss": 1.0451, + "step": 1965 + }, + { + "epoch": 0.5707649876614893, + "grad_norm": 3.357563018798828, + "learning_rate": 9.887844527322398e-06, + "loss": 1.0807, + "step": 1966 + }, + { + "epoch": 0.5710553055595877, + "grad_norm": 3.171480655670166, + "learning_rate": 9.887642203193699e-06, + "loss": 1.0291, + "step": 1967 + }, + { + "epoch": 0.5713456234576861, + "grad_norm": 3.627028703689575, + "learning_rate": 9.887439698810694e-06, + "loss": 1.2565, + "step": 1968 + }, + { + "epoch": 0.5716359413557845, + "grad_norm": 3.0720813274383545, + "learning_rate": 9.887237014180853e-06, + "loss": 1.0151, + "step": 1969 + }, + { + 
"epoch": 0.571926259253883, + "grad_norm": 3.1045854091644287, + "learning_rate": 9.88703414931165e-06, + "loss": 1.0729, + "step": 1970 + }, + { + "epoch": 0.5722165771519814, + "grad_norm": 3.7090137004852295, + "learning_rate": 9.886831104210567e-06, + "loss": 1.2588, + "step": 1971 + }, + { + "epoch": 0.5725068950500798, + "grad_norm": 3.418719530105591, + "learning_rate": 9.886627878885093e-06, + "loss": 1.0532, + "step": 1972 + }, + { + "epoch": 0.5727972129481782, + "grad_norm": 4.114670276641846, + "learning_rate": 9.88642447334272e-06, + "loss": 1.2155, + "step": 1973 + }, + { + "epoch": 0.5730875308462767, + "grad_norm": 3.3780999183654785, + "learning_rate": 9.886220887590953e-06, + "loss": 1.0976, + "step": 1974 + }, + { + "epoch": 0.5733778487443751, + "grad_norm": 3.3988523483276367, + "learning_rate": 9.886017121637299e-06, + "loss": 1.1996, + "step": 1975 + }, + { + "epoch": 0.5736681666424736, + "grad_norm": 3.189674139022827, + "learning_rate": 9.885813175489272e-06, + "loss": 1.0011, + "step": 1976 + }, + { + "epoch": 0.573958484540572, + "grad_norm": 3.2904489040374756, + "learning_rate": 9.885609049154395e-06, + "loss": 1.0865, + "step": 1977 + }, + { + "epoch": 0.5742488024386704, + "grad_norm": 3.6734063625335693, + "learning_rate": 9.885404742640192e-06, + "loss": 1.2685, + "step": 1978 + }, + { + "epoch": 0.5745391203367688, + "grad_norm": 3.972599983215332, + "learning_rate": 9.885200255954203e-06, + "loss": 1.4054, + "step": 1979 + }, + { + "epoch": 0.5748294382348672, + "grad_norm": 3.402681589126587, + "learning_rate": 9.884995589103967e-06, + "loss": 1.1284, + "step": 1980 + }, + { + "epoch": 0.5751197561329656, + "grad_norm": 3.822906970977783, + "learning_rate": 9.884790742097032e-06, + "loss": 1.3255, + "step": 1981 + }, + { + "epoch": 0.575410074031064, + "grad_norm": 3.3857011795043945, + "learning_rate": 9.884585714940953e-06, + "loss": 1.1057, + "step": 1982 + }, + { + "epoch": 0.5757003919291624, + "grad_norm": 3.3248820304870605, + "learning_rate": 9.884380507643293e-06, + "loss": 1.1137, + "step": 1983 + }, + { + "epoch": 0.5759907098272609, + "grad_norm": 2.989927053451538, + "learning_rate": 9.884175120211616e-06, + "loss": 0.9767, + "step": 1984 + }, + { + "epoch": 0.5762810277253593, + "grad_norm": 3.6067261695861816, + "learning_rate": 9.8839695526535e-06, + "loss": 1.2875, + "step": 1985 + }, + { + "epoch": 0.5765713456234577, + "grad_norm": 3.1623098850250244, + "learning_rate": 9.883763804976525e-06, + "loss": 1.1397, + "step": 1986 + }, + { + "epoch": 0.5768616635215561, + "grad_norm": 3.215427875518799, + "learning_rate": 9.883557877188276e-06, + "loss": 1.0948, + "step": 1987 + }, + { + "epoch": 0.5771519814196545, + "grad_norm": 3.4448931217193604, + "learning_rate": 9.883351769296355e-06, + "loss": 1.1696, + "step": 1988 + }, + { + "epoch": 0.5774422993177529, + "grad_norm": 3.0168240070343018, + "learning_rate": 9.883145481308356e-06, + "loss": 0.9926, + "step": 1989 + }, + { + "epoch": 0.5777326172158513, + "grad_norm": 3.3162906169891357, + "learning_rate": 9.88293901323189e-06, + "loss": 1.161, + "step": 1990 + }, + { + "epoch": 0.5780229351139498, + "grad_norm": 3.2119832038879395, + "learning_rate": 9.882732365074572e-06, + "loss": 1.0616, + "step": 1991 + }, + { + "epoch": 0.5783132530120482, + "grad_norm": 3.3098132610321045, + "learning_rate": 9.88252553684402e-06, + "loss": 1.1829, + "step": 1992 + }, + { + "epoch": 0.5786035709101466, + "grad_norm": 4.403481960296631, + "learning_rate": 9.882318528547866e-06, + "loss": 1.205, 
+ "step": 1993 + }, + { + "epoch": 0.578893888808245, + "grad_norm": 3.6638176441192627, + "learning_rate": 9.88211134019374e-06, + "loss": 1.1866, + "step": 1994 + }, + { + "epoch": 0.5791842067063434, + "grad_norm": 4.162400245666504, + "learning_rate": 9.881903971789285e-06, + "loss": 1.2875, + "step": 1995 + }, + { + "epoch": 0.5794745246044418, + "grad_norm": 3.9328815937042236, + "learning_rate": 9.88169642334215e-06, + "loss": 1.3008, + "step": 1996 + }, + { + "epoch": 0.5797648425025402, + "grad_norm": 3.249154567718506, + "learning_rate": 9.88148869485999e-06, + "loss": 1.1718, + "step": 1997 + }, + { + "epoch": 0.5800551604006386, + "grad_norm": 3.618673324584961, + "learning_rate": 9.88128078635046e-06, + "loss": 1.228, + "step": 1998 + }, + { + "epoch": 0.5803454782987372, + "grad_norm": 3.3982481956481934, + "learning_rate": 9.881072697821235e-06, + "loss": 1.3055, + "step": 1999 + }, + { + "epoch": 0.5806357961968356, + "grad_norm": 3.4904940128326416, + "learning_rate": 9.880864429279984e-06, + "loss": 1.2941, + "step": 2000 + }, + { + "epoch": 0.5806357961968356, + "eval_loss": 1.2087479829788208, + "eval_runtime": 11.634, + "eval_samples_per_second": 34.382, + "eval_steps_per_second": 4.298, + "step": 2000 + }, + { + "epoch": 0.580926114094934, + "grad_norm": 3.0078070163726807, + "learning_rate": 9.880655980734391e-06, + "loss": 1.0619, + "step": 2001 + }, + { + "epoch": 0.5812164319930324, + "grad_norm": 3.5126662254333496, + "learning_rate": 9.88044735219214e-06, + "loss": 1.1839, + "step": 2002 + }, + { + "epoch": 0.5815067498911308, + "grad_norm": 3.569251537322998, + "learning_rate": 9.88023854366093e-06, + "loss": 1.2774, + "step": 2003 + }, + { + "epoch": 0.5817970677892292, + "grad_norm": 3.7420945167541504, + "learning_rate": 9.880029555148458e-06, + "loss": 1.2724, + "step": 2004 + }, + { + "epoch": 0.5820873856873277, + "grad_norm": 3.3116486072540283, + "learning_rate": 9.879820386662434e-06, + "loss": 1.1711, + "step": 2005 + }, + { + "epoch": 0.5823777035854261, + "grad_norm": 3.6330201625823975, + "learning_rate": 9.879611038210569e-06, + "loss": 1.3515, + "step": 2006 + }, + { + "epoch": 0.5826680214835245, + "grad_norm": 3.2152249813079834, + "learning_rate": 9.879401509800586e-06, + "loss": 1.1697, + "step": 2007 + }, + { + "epoch": 0.5829583393816229, + "grad_norm": 3.21633243560791, + "learning_rate": 9.87919180144021e-06, + "loss": 1.1338, + "step": 2008 + }, + { + "epoch": 0.5832486572797213, + "grad_norm": 3.2678074836730957, + "learning_rate": 9.878981913137178e-06, + "loss": 1.0825, + "step": 2009 + }, + { + "epoch": 0.5835389751778197, + "grad_norm": 3.4714841842651367, + "learning_rate": 9.87877184489923e-06, + "loss": 1.2087, + "step": 2010 + }, + { + "epoch": 0.5838292930759181, + "grad_norm": 3.3108625411987305, + "learning_rate": 9.878561596734112e-06, + "loss": 1.237, + "step": 2011 + }, + { + "epoch": 0.5841196109740165, + "grad_norm": 3.4311110973358154, + "learning_rate": 9.878351168649579e-06, + "loss": 1.1453, + "step": 2012 + }, + { + "epoch": 0.584409928872115, + "grad_norm": 3.5887632369995117, + "learning_rate": 9.878140560653389e-06, + "loss": 1.2367, + "step": 2013 + }, + { + "epoch": 0.5847002467702134, + "grad_norm": 3.0961368083953857, + "learning_rate": 9.877929772753311e-06, + "loss": 1.1024, + "step": 2014 + }, + { + "epoch": 0.5849905646683118, + "grad_norm": 3.4218029975891113, + "learning_rate": 9.87771880495712e-06, + "loss": 1.1504, + "step": 2015 + }, + { + "epoch": 0.5852808825664102, + "grad_norm": 
3.509666919708252, + "learning_rate": 9.877507657272596e-06, + "loss": 1.2652, + "step": 2016 + }, + { + "epoch": 0.5855712004645086, + "grad_norm": 3.555070161819458, + "learning_rate": 9.877296329707522e-06, + "loss": 1.3375, + "step": 2017 + }, + { + "epoch": 0.585861518362607, + "grad_norm": 3.184847116470337, + "learning_rate": 9.877084822269699e-06, + "loss": 1.1544, + "step": 2018 + }, + { + "epoch": 0.5861518362607054, + "grad_norm": 3.6347262859344482, + "learning_rate": 9.87687313496692e-06, + "loss": 1.2741, + "step": 2019 + }, + { + "epoch": 0.5864421541588039, + "grad_norm": 3.189941883087158, + "learning_rate": 9.876661267806995e-06, + "loss": 1.099, + "step": 2020 + }, + { + "epoch": 0.5867324720569023, + "grad_norm": 3.7185311317443848, + "learning_rate": 9.876449220797738e-06, + "loss": 1.3849, + "step": 2021 + }, + { + "epoch": 0.5870227899550007, + "grad_norm": 3.4867780208587646, + "learning_rate": 9.87623699394697e-06, + "loss": 1.2745, + "step": 2022 + }, + { + "epoch": 0.5873131078530991, + "grad_norm": 3.8721914291381836, + "learning_rate": 9.876024587262517e-06, + "loss": 1.2656, + "step": 2023 + }, + { + "epoch": 0.5876034257511976, + "grad_norm": 3.4358508586883545, + "learning_rate": 9.875812000752212e-06, + "loss": 1.1847, + "step": 2024 + }, + { + "epoch": 0.587893743649296, + "grad_norm": 3.7810873985290527, + "learning_rate": 9.875599234423895e-06, + "loss": 1.3291, + "step": 2025 + }, + { + "epoch": 0.5881840615473944, + "grad_norm": 3.518967628479004, + "learning_rate": 9.875386288285413e-06, + "loss": 1.1975, + "step": 2026 + }, + { + "epoch": 0.5884743794454929, + "grad_norm": 3.171295642852783, + "learning_rate": 9.875173162344618e-06, + "loss": 1.2229, + "step": 2027 + }, + { + "epoch": 0.5887646973435913, + "grad_norm": 3.1784987449645996, + "learning_rate": 9.874959856609374e-06, + "loss": 1.1273, + "step": 2028 + }, + { + "epoch": 0.5890550152416897, + "grad_norm": 3.9516916275024414, + "learning_rate": 9.874746371087543e-06, + "loss": 1.1746, + "step": 2029 + }, + { + "epoch": 0.5893453331397881, + "grad_norm": 3.0694680213928223, + "learning_rate": 9.874532705787e-06, + "loss": 1.0642, + "step": 2030 + }, + { + "epoch": 0.5896356510378865, + "grad_norm": 3.7301106452941895, + "learning_rate": 9.874318860715628e-06, + "loss": 1.2201, + "step": 2031 + }, + { + "epoch": 0.5899259689359849, + "grad_norm": 3.441990852355957, + "learning_rate": 9.874104835881308e-06, + "loss": 1.1172, + "step": 2032 + }, + { + "epoch": 0.5902162868340833, + "grad_norm": 3.696392059326172, + "learning_rate": 9.873890631291938e-06, + "loss": 1.3655, + "step": 2033 + }, + { + "epoch": 0.5905066047321818, + "grad_norm": 3.153104066848755, + "learning_rate": 9.873676246955415e-06, + "loss": 1.2507, + "step": 2034 + }, + { + "epoch": 0.5907969226302802, + "grad_norm": 3.5448312759399414, + "learning_rate": 9.873461682879646e-06, + "loss": 1.2732, + "step": 2035 + }, + { + "epoch": 0.5910872405283786, + "grad_norm": 3.10785174369812, + "learning_rate": 9.873246939072543e-06, + "loss": 1.1011, + "step": 2036 + }, + { + "epoch": 0.591377558426477, + "grad_norm": 3.3473784923553467, + "learning_rate": 9.873032015542027e-06, + "loss": 1.2466, + "step": 2037 + }, + { + "epoch": 0.5916678763245754, + "grad_norm": 3.331484794616699, + "learning_rate": 9.872816912296025e-06, + "loss": 1.1508, + "step": 2038 + }, + { + "epoch": 0.5919581942226738, + "grad_norm": 3.114262342453003, + "learning_rate": 9.872601629342468e-06, + "loss": 1.0387, + "step": 2039 + }, + { + "epoch": 
0.5922485121207722, + "grad_norm": 3.09680438041687, + "learning_rate": 9.872386166689298e-06, + "loss": 1.271, + "step": 2040 + }, + { + "epoch": 0.5925388300188706, + "grad_norm": 3.1893184185028076, + "learning_rate": 9.872170524344458e-06, + "loss": 1.2041, + "step": 2041 + }, + { + "epoch": 0.5928291479169691, + "grad_norm": 3.161381959915161, + "learning_rate": 9.871954702315905e-06, + "loss": 0.9993, + "step": 2042 + }, + { + "epoch": 0.5931194658150675, + "grad_norm": 3.595938205718994, + "learning_rate": 9.871738700611593e-06, + "loss": 1.2812, + "step": 2043 + }, + { + "epoch": 0.5934097837131659, + "grad_norm": 3.2868971824645996, + "learning_rate": 9.871522519239493e-06, + "loss": 1.1479, + "step": 2044 + }, + { + "epoch": 0.5937001016112643, + "grad_norm": 3.5087060928344727, + "learning_rate": 9.871306158207575e-06, + "loss": 1.1772, + "step": 2045 + }, + { + "epoch": 0.5939904195093627, + "grad_norm": 3.3445980548858643, + "learning_rate": 9.87108961752382e-06, + "loss": 1.1756, + "step": 2046 + }, + { + "epoch": 0.5942807374074611, + "grad_norm": 3.3986401557922363, + "learning_rate": 9.870872897196211e-06, + "loss": 1.0586, + "step": 2047 + }, + { + "epoch": 0.5945710553055595, + "grad_norm": 3.7029547691345215, + "learning_rate": 9.870655997232743e-06, + "loss": 1.1999, + "step": 2048 + }, + { + "epoch": 0.594861373203658, + "grad_norm": 3.178635597229004, + "learning_rate": 9.870438917641416e-06, + "loss": 1.1305, + "step": 2049 + }, + { + "epoch": 0.5951516911017565, + "grad_norm": 3.1712300777435303, + "learning_rate": 9.870221658430233e-06, + "loss": 1.0952, + "step": 2050 + }, + { + "epoch": 0.5954420089998549, + "grad_norm": 3.49641489982605, + "learning_rate": 9.87000421960721e-06, + "loss": 1.1492, + "step": 2051 + }, + { + "epoch": 0.5957323268979533, + "grad_norm": 2.970425605773926, + "learning_rate": 9.869786601180362e-06, + "loss": 1.1189, + "step": 2052 + }, + { + "epoch": 0.5960226447960517, + "grad_norm": 3.2928876876831055, + "learning_rate": 9.869568803157717e-06, + "loss": 1.1222, + "step": 2053 + }, + { + "epoch": 0.5963129626941501, + "grad_norm": 3.36665678024292, + "learning_rate": 9.869350825547308e-06, + "loss": 1.2153, + "step": 2054 + }, + { + "epoch": 0.5966032805922485, + "grad_norm": 3.5911707878112793, + "learning_rate": 9.86913266835717e-06, + "loss": 1.2397, + "step": 2055 + }, + { + "epoch": 0.596893598490347, + "grad_norm": 3.268590211868286, + "learning_rate": 9.868914331595355e-06, + "loss": 1.1961, + "step": 2056 + }, + { + "epoch": 0.5971839163884454, + "grad_norm": 3.1666085720062256, + "learning_rate": 9.86869581526991e-06, + "loss": 1.0769, + "step": 2057 + }, + { + "epoch": 0.5974742342865438, + "grad_norm": 3.4120047092437744, + "learning_rate": 9.868477119388897e-06, + "loss": 1.1425, + "step": 2058 + }, + { + "epoch": 0.5977645521846422, + "grad_norm": 3.238154888153076, + "learning_rate": 9.868258243960378e-06, + "loss": 1.215, + "step": 2059 + }, + { + "epoch": 0.5980548700827406, + "grad_norm": 3.396493434906006, + "learning_rate": 9.868039188992427e-06, + "loss": 1.1295, + "step": 2060 + }, + { + "epoch": 0.598345187980839, + "grad_norm": 3.3043999671936035, + "learning_rate": 9.867819954493123e-06, + "loss": 1.0419, + "step": 2061 + }, + { + "epoch": 0.5986355058789374, + "grad_norm": 3.630920886993408, + "learning_rate": 9.86760054047055e-06, + "loss": 1.303, + "step": 2062 + }, + { + "epoch": 0.5989258237770358, + "grad_norm": 3.177386522293091, + "learning_rate": 9.867380946932803e-06, + "loss": 1.0805, + "step": 2063 
+ }, + { + "epoch": 0.5992161416751343, + "grad_norm": 3.366000175476074, + "learning_rate": 9.867161173887976e-06, + "loss": 1.1559, + "step": 2064 + }, + { + "epoch": 0.5995064595732327, + "grad_norm": 3.76708984375, + "learning_rate": 9.866941221344176e-06, + "loss": 1.4349, + "step": 2065 + }, + { + "epoch": 0.5997967774713311, + "grad_norm": 3.7043986320495605, + "learning_rate": 9.866721089309516e-06, + "loss": 1.1992, + "step": 2066 + }, + { + "epoch": 0.6000870953694295, + "grad_norm": 3.3993911743164062, + "learning_rate": 9.866500777792115e-06, + "loss": 1.1641, + "step": 2067 + }, + { + "epoch": 0.6003774132675279, + "grad_norm": 4.189173221588135, + "learning_rate": 9.866280286800093e-06, + "loss": 1.3878, + "step": 2068 + }, + { + "epoch": 0.6006677311656263, + "grad_norm": 3.4979851245880127, + "learning_rate": 9.86605961634159e-06, + "loss": 1.2999, + "step": 2069 + }, + { + "epoch": 0.6009580490637247, + "grad_norm": 3.66668963432312, + "learning_rate": 9.865838766424735e-06, + "loss": 1.0979, + "step": 2070 + }, + { + "epoch": 0.6012483669618232, + "grad_norm": 3.5483312606811523, + "learning_rate": 9.86561773705768e-06, + "loss": 1.1104, + "step": 2071 + }, + { + "epoch": 0.6015386848599216, + "grad_norm": 3.277080774307251, + "learning_rate": 9.865396528248572e-06, + "loss": 1.2044, + "step": 2072 + }, + { + "epoch": 0.60182900275802, + "grad_norm": 3.374983549118042, + "learning_rate": 9.865175140005571e-06, + "loss": 1.1618, + "step": 2073 + }, + { + "epoch": 0.6021193206561184, + "grad_norm": 3.7250962257385254, + "learning_rate": 9.864953572336843e-06, + "loss": 1.1848, + "step": 2074 + }, + { + "epoch": 0.6024096385542169, + "grad_norm": 3.2824532985687256, + "learning_rate": 9.864731825250557e-06, + "loss": 1.1748, + "step": 2075 + }, + { + "epoch": 0.6026999564523153, + "grad_norm": 3.4487054347991943, + "learning_rate": 9.864509898754891e-06, + "loss": 1.2878, + "step": 2076 + }, + { + "epoch": 0.6029902743504137, + "grad_norm": 3.3688509464263916, + "learning_rate": 9.864287792858032e-06, + "loss": 1.0886, + "step": 2077 + }, + { + "epoch": 0.6032805922485122, + "grad_norm": 3.505753517150879, + "learning_rate": 9.864065507568168e-06, + "loss": 1.2099, + "step": 2078 + }, + { + "epoch": 0.6035709101466106, + "grad_norm": 3.142094850540161, + "learning_rate": 9.863843042893499e-06, + "loss": 1.1276, + "step": 2079 + }, + { + "epoch": 0.603861228044709, + "grad_norm": 3.47158145904541, + "learning_rate": 9.863620398842229e-06, + "loss": 1.4327, + "step": 2080 + }, + { + "epoch": 0.6041515459428074, + "grad_norm": 3.2158539295196533, + "learning_rate": 9.863397575422569e-06, + "loss": 1.1101, + "step": 2081 + }, + { + "epoch": 0.6044418638409058, + "grad_norm": 3.1480183601379395, + "learning_rate": 9.863174572642736e-06, + "loss": 1.1376, + "step": 2082 + }, + { + "epoch": 0.6047321817390042, + "grad_norm": 3.2654166221618652, + "learning_rate": 9.862951390510953e-06, + "loss": 1.0447, + "step": 2083 + }, + { + "epoch": 0.6050224996371026, + "grad_norm": 3.2870917320251465, + "learning_rate": 9.862728029035454e-06, + "loss": 1.0577, + "step": 2084 + }, + { + "epoch": 0.605312817535201, + "grad_norm": 3.607374429702759, + "learning_rate": 9.862504488224477e-06, + "loss": 1.1754, + "step": 2085 + }, + { + "epoch": 0.6056031354332995, + "grad_norm": 4.0213470458984375, + "learning_rate": 9.86228076808626e-06, + "loss": 1.2472, + "step": 2086 + }, + { + "epoch": 0.6058934533313979, + "grad_norm": 3.1948390007019043, + "learning_rate": 9.86205686862906e-06, + "loss": 
1.0298, + "step": 2087 + }, + { + "epoch": 0.6061837712294963, + "grad_norm": 3.687624454498291, + "learning_rate": 9.861832789861132e-06, + "loss": 1.1702, + "step": 2088 + }, + { + "epoch": 0.6064740891275947, + "grad_norm": 3.001420259475708, + "learning_rate": 9.861608531790741e-06, + "loss": 1.0514, + "step": 2089 + }, + { + "epoch": 0.6067644070256931, + "grad_norm": 3.481722354888916, + "learning_rate": 9.861384094426155e-06, + "loss": 1.1585, + "step": 2090 + }, + { + "epoch": 0.6070547249237915, + "grad_norm": 3.38626766204834, + "learning_rate": 9.861159477775653e-06, + "loss": 1.2134, + "step": 2091 + }, + { + "epoch": 0.60734504282189, + "grad_norm": 3.476393699645996, + "learning_rate": 9.86093468184752e-06, + "loss": 1.1685, + "step": 2092 + }, + { + "epoch": 0.6076353607199884, + "grad_norm": 3.7456226348876953, + "learning_rate": 9.860709706650043e-06, + "loss": 1.3925, + "step": 2093 + }, + { + "epoch": 0.6079256786180868, + "grad_norm": 3.1248064041137695, + "learning_rate": 9.860484552191523e-06, + "loss": 1.3072, + "step": 2094 + }, + { + "epoch": 0.6082159965161852, + "grad_norm": 3.2425031661987305, + "learning_rate": 9.860259218480259e-06, + "loss": 1.1772, + "step": 2095 + }, + { + "epoch": 0.6085063144142836, + "grad_norm": 3.4490549564361572, + "learning_rate": 9.860033705524566e-06, + "loss": 1.149, + "step": 2096 + }, + { + "epoch": 0.608796632312382, + "grad_norm": 3.476717948913574, + "learning_rate": 9.859808013332758e-06, + "loss": 1.1662, + "step": 2097 + }, + { + "epoch": 0.6090869502104804, + "grad_norm": 3.7627527713775635, + "learning_rate": 9.859582141913159e-06, + "loss": 1.2424, + "step": 2098 + }, + { + "epoch": 0.6093772681085788, + "grad_norm": 3.658005952835083, + "learning_rate": 9.859356091274099e-06, + "loss": 1.3146, + "step": 2099 + }, + { + "epoch": 0.6096675860066774, + "grad_norm": 3.8518424034118652, + "learning_rate": 9.859129861423915e-06, + "loss": 1.3079, + "step": 2100 + }, + { + "epoch": 0.6099579039047758, + "grad_norm": 3.3938114643096924, + "learning_rate": 9.858903452370949e-06, + "loss": 1.1353, + "step": 2101 + }, + { + "epoch": 0.6102482218028742, + "grad_norm": 3.4915430545806885, + "learning_rate": 9.858676864123553e-06, + "loss": 1.2039, + "step": 2102 + }, + { + "epoch": 0.6105385397009726, + "grad_norm": 3.37498140335083, + "learning_rate": 9.858450096690082e-06, + "loss": 1.1422, + "step": 2103 + }, + { + "epoch": 0.610828857599071, + "grad_norm": 3.400315761566162, + "learning_rate": 9.858223150078898e-06, + "loss": 1.1419, + "step": 2104 + }, + { + "epoch": 0.6111191754971694, + "grad_norm": 3.458354949951172, + "learning_rate": 9.857996024298374e-06, + "loss": 1.2601, + "step": 2105 + }, + { + "epoch": 0.6114094933952678, + "grad_norm": 3.3237063884735107, + "learning_rate": 9.857768719356884e-06, + "loss": 1.1714, + "step": 2106 + }, + { + "epoch": 0.6116998112933663, + "grad_norm": 3.4677350521087646, + "learning_rate": 9.85754123526281e-06, + "loss": 1.2117, + "step": 2107 + }, + { + "epoch": 0.6119901291914647, + "grad_norm": 3.1911838054656982, + "learning_rate": 9.857313572024545e-06, + "loss": 1.2366, + "step": 2108 + }, + { + "epoch": 0.6122804470895631, + "grad_norm": 3.291783332824707, + "learning_rate": 9.857085729650483e-06, + "loss": 1.1905, + "step": 2109 + }, + { + "epoch": 0.6125707649876615, + "grad_norm": 3.323556900024414, + "learning_rate": 9.856857708149025e-06, + "loss": 1.0904, + "step": 2110 + }, + { + "epoch": 0.6128610828857599, + "grad_norm": 3.2824766635894775, + "learning_rate": 
9.856629507528583e-06, + "loss": 1.2211, + "step": 2111 + }, + { + "epoch": 0.6131514007838583, + "grad_norm": 3.0334928035736084, + "learning_rate": 9.856401127797572e-06, + "loss": 1.1749, + "step": 2112 + }, + { + "epoch": 0.6134417186819567, + "grad_norm": 3.456289291381836, + "learning_rate": 9.856172568964415e-06, + "loss": 1.341, + "step": 2113 + }, + { + "epoch": 0.6137320365800552, + "grad_norm": 3.350088119506836, + "learning_rate": 9.85594383103754e-06, + "loss": 1.2313, + "step": 2114 + }, + { + "epoch": 0.6140223544781536, + "grad_norm": 3.1120920181274414, + "learning_rate": 9.855714914025386e-06, + "loss": 0.9967, + "step": 2115 + }, + { + "epoch": 0.614312672376252, + "grad_norm": 3.164459228515625, + "learning_rate": 9.85548581793639e-06, + "loss": 1.1195, + "step": 2116 + }, + { + "epoch": 0.6146029902743504, + "grad_norm": 3.0181405544281006, + "learning_rate": 9.855256542779006e-06, + "loss": 1.0873, + "step": 2117 + }, + { + "epoch": 0.6148933081724488, + "grad_norm": 3.3663463592529297, + "learning_rate": 9.855027088561686e-06, + "loss": 1.2191, + "step": 2118 + }, + { + "epoch": 0.6151836260705472, + "grad_norm": 3.3779163360595703, + "learning_rate": 9.854797455292892e-06, + "loss": 1.1955, + "step": 2119 + }, + { + "epoch": 0.6154739439686456, + "grad_norm": 3.592621326446533, + "learning_rate": 9.854567642981098e-06, + "loss": 1.1278, + "step": 2120 + }, + { + "epoch": 0.615764261866744, + "grad_norm": 3.7898104190826416, + "learning_rate": 9.854337651634773e-06, + "loss": 1.2219, + "step": 2121 + }, + { + "epoch": 0.6160545797648425, + "grad_norm": 3.6015334129333496, + "learning_rate": 9.854107481262405e-06, + "loss": 1.1104, + "step": 2122 + }, + { + "epoch": 0.6163448976629409, + "grad_norm": 3.665905237197876, + "learning_rate": 9.853877131872475e-06, + "loss": 1.1972, + "step": 2123 + }, + { + "epoch": 0.6166352155610393, + "grad_norm": 3.183523416519165, + "learning_rate": 9.853646603473486e-06, + "loss": 1.1768, + "step": 2124 + }, + { + "epoch": 0.6169255334591378, + "grad_norm": 3.5019726753234863, + "learning_rate": 9.853415896073935e-06, + "loss": 1.1711, + "step": 2125 + }, + { + "epoch": 0.6172158513572362, + "grad_norm": 3.454185962677002, + "learning_rate": 9.853185009682332e-06, + "loss": 1.3214, + "step": 2126 + }, + { + "epoch": 0.6175061692553346, + "grad_norm": 3.2762303352355957, + "learning_rate": 9.852953944307192e-06, + "loss": 1.1759, + "step": 2127 + }, + { + "epoch": 0.617796487153433, + "grad_norm": 3.4141921997070312, + "learning_rate": 9.852722699957036e-06, + "loss": 1.1992, + "step": 2128 + }, + { + "epoch": 0.6180868050515315, + "grad_norm": 3.2765979766845703, + "learning_rate": 9.852491276640393e-06, + "loss": 1.0911, + "step": 2129 + }, + { + "epoch": 0.6183771229496299, + "grad_norm": 3.3329086303710938, + "learning_rate": 9.852259674365798e-06, + "loss": 1.1718, + "step": 2130 + }, + { + "epoch": 0.6186674408477283, + "grad_norm": 3.2059311866760254, + "learning_rate": 9.852027893141791e-06, + "loss": 1.0346, + "step": 2131 + }, + { + "epoch": 0.6189577587458267, + "grad_norm": 3.5297727584838867, + "learning_rate": 9.851795932976919e-06, + "loss": 1.1456, + "step": 2132 + }, + { + "epoch": 0.6192480766439251, + "grad_norm": 3.6350655555725098, + "learning_rate": 9.851563793879742e-06, + "loss": 1.1363, + "step": 2133 + }, + { + "epoch": 0.6195383945420235, + "grad_norm": 3.7481045722961426, + "learning_rate": 9.851331475858813e-06, + "loss": 1.285, + "step": 2134 + }, + { + "epoch": 0.6198287124401219, + "grad_norm": 
3.4366955757141113, + "learning_rate": 9.851098978922708e-06, + "loss": 1.1945, + "step": 2135 + }, + { + "epoch": 0.6201190303382204, + "grad_norm": 3.219010829925537, + "learning_rate": 9.850866303079997e-06, + "loss": 1.15, + "step": 2136 + }, + { + "epoch": 0.6204093482363188, + "grad_norm": 3.1487579345703125, + "learning_rate": 9.850633448339262e-06, + "loss": 1.1192, + "step": 2137 + }, + { + "epoch": 0.6206996661344172, + "grad_norm": 3.2304723262786865, + "learning_rate": 9.85040041470909e-06, + "loss": 1.1732, + "step": 2138 + }, + { + "epoch": 0.6209899840325156, + "grad_norm": 3.366379737854004, + "learning_rate": 9.850167202198075e-06, + "loss": 1.1433, + "step": 2139 + }, + { + "epoch": 0.621280301930614, + "grad_norm": 3.346491575241089, + "learning_rate": 9.849933810814819e-06, + "loss": 1.2081, + "step": 2140 + }, + { + "epoch": 0.6215706198287124, + "grad_norm": 3.2903072834014893, + "learning_rate": 9.849700240567928e-06, + "loss": 1.1726, + "step": 2141 + }, + { + "epoch": 0.6218609377268108, + "grad_norm": 3.296185255050659, + "learning_rate": 9.849466491466017e-06, + "loss": 1.1533, + "step": 2142 + }, + { + "epoch": 0.6221512556249092, + "grad_norm": 3.3224129676818848, + "learning_rate": 9.849232563517706e-06, + "loss": 1.1278, + "step": 2143 + }, + { + "epoch": 0.6224415735230077, + "grad_norm": 3.283273458480835, + "learning_rate": 9.848998456731622e-06, + "loss": 1.1298, + "step": 2144 + }, + { + "epoch": 0.6227318914211061, + "grad_norm": 3.311249017715454, + "learning_rate": 9.848764171116401e-06, + "loss": 1.1447, + "step": 2145 + }, + { + "epoch": 0.6230222093192045, + "grad_norm": 2.9450314044952393, + "learning_rate": 9.84852970668068e-06, + "loss": 1.0555, + "step": 2146 + }, + { + "epoch": 0.6233125272173029, + "grad_norm": 3.447150707244873, + "learning_rate": 9.848295063433108e-06, + "loss": 1.2113, + "step": 2147 + }, + { + "epoch": 0.6236028451154013, + "grad_norm": 3.5015945434570312, + "learning_rate": 9.848060241382339e-06, + "loss": 1.1897, + "step": 2148 + }, + { + "epoch": 0.6238931630134997, + "grad_norm": 2.8743700981140137, + "learning_rate": 9.84782524053703e-06, + "loss": 1.0311, + "step": 2149 + }, + { + "epoch": 0.6241834809115983, + "grad_norm": 3.2919156551361084, + "learning_rate": 9.847590060905851e-06, + "loss": 1.2051, + "step": 2150 + }, + { + "epoch": 0.6244737988096967, + "grad_norm": 3.5934643745422363, + "learning_rate": 9.847354702497475e-06, + "loss": 1.165, + "step": 2151 + }, + { + "epoch": 0.6247641167077951, + "grad_norm": 3.7064201831817627, + "learning_rate": 9.84711916532058e-06, + "loss": 1.1528, + "step": 2152 + }, + { + "epoch": 0.6250544346058935, + "grad_norm": 3.3166534900665283, + "learning_rate": 9.846883449383854e-06, + "loss": 1.08, + "step": 2153 + }, + { + "epoch": 0.6253447525039919, + "grad_norm": 3.3062987327575684, + "learning_rate": 9.84664755469599e-06, + "loss": 1.1791, + "step": 2154 + }, + { + "epoch": 0.6256350704020903, + "grad_norm": 3.352381706237793, + "learning_rate": 9.846411481265687e-06, + "loss": 1.0613, + "step": 2155 + }, + { + "epoch": 0.6259253883001887, + "grad_norm": 3.193981409072876, + "learning_rate": 9.846175229101654e-06, + "loss": 1.1526, + "step": 2156 + }, + { + "epoch": 0.6262157061982871, + "grad_norm": 3.394362449645996, + "learning_rate": 9.8459387982126e-06, + "loss": 1.3341, + "step": 2157 + }, + { + "epoch": 0.6265060240963856, + "grad_norm": 3.4602437019348145, + "learning_rate": 9.845702188607246e-06, + "loss": 1.2346, + "step": 2158 + }, + { + "epoch": 
0.626796341994484, + "grad_norm": 3.1883201599121094, + "learning_rate": 9.845465400294318e-06, + "loss": 1.0331, + "step": 2159 + }, + { + "epoch": 0.6270866598925824, + "grad_norm": 3.407731056213379, + "learning_rate": 9.84522843328255e-06, + "loss": 1.0798, + "step": 2160 + }, + { + "epoch": 0.6273769777906808, + "grad_norm": 3.20255184173584, + "learning_rate": 9.84499128758068e-06, + "loss": 1.0629, + "step": 2161 + }, + { + "epoch": 0.6276672956887792, + "grad_norm": 3.4190375804901123, + "learning_rate": 9.844753963197454e-06, + "loss": 1.3578, + "step": 2162 + }, + { + "epoch": 0.6279576135868776, + "grad_norm": 3.4097230434417725, + "learning_rate": 9.844516460141622e-06, + "loss": 1.1523, + "step": 2163 + }, + { + "epoch": 0.628247931484976, + "grad_norm": 3.4188151359558105, + "learning_rate": 9.844278778421947e-06, + "loss": 1.2496, + "step": 2164 + }, + { + "epoch": 0.6285382493830745, + "grad_norm": 3.40053653717041, + "learning_rate": 9.844040918047194e-06, + "loss": 1.2374, + "step": 2165 + }, + { + "epoch": 0.6288285672811729, + "grad_norm": 3.3556363582611084, + "learning_rate": 9.843802879026135e-06, + "loss": 1.1735, + "step": 2166 + }, + { + "epoch": 0.6291188851792713, + "grad_norm": 3.5495386123657227, + "learning_rate": 9.843564661367547e-06, + "loss": 1.2863, + "step": 2167 + }, + { + "epoch": 0.6294092030773697, + "grad_norm": 3.5947186946868896, + "learning_rate": 9.843326265080215e-06, + "loss": 1.2074, + "step": 2168 + }, + { + "epoch": 0.6296995209754681, + "grad_norm": 3.3430442810058594, + "learning_rate": 9.843087690172933e-06, + "loss": 1.151, + "step": 2169 + }, + { + "epoch": 0.6299898388735665, + "grad_norm": 3.2726352214813232, + "learning_rate": 9.8428489366545e-06, + "loss": 1.1052, + "step": 2170 + }, + { + "epoch": 0.6302801567716649, + "grad_norm": 3.2520737648010254, + "learning_rate": 9.842610004533719e-06, + "loss": 1.189, + "step": 2171 + }, + { + "epoch": 0.6305704746697633, + "grad_norm": 3.6799371242523193, + "learning_rate": 9.842370893819404e-06, + "loss": 1.2571, + "step": 2172 + }, + { + "epoch": 0.6308607925678618, + "grad_norm": 3.68361759185791, + "learning_rate": 9.84213160452037e-06, + "loss": 1.3269, + "step": 2173 + }, + { + "epoch": 0.6311511104659602, + "grad_norm": 3.2377898693084717, + "learning_rate": 9.841892136645445e-06, + "loss": 1.0609, + "step": 2174 + }, + { + "epoch": 0.6314414283640587, + "grad_norm": 3.5017290115356445, + "learning_rate": 9.84165249020346e-06, + "loss": 1.2455, + "step": 2175 + }, + { + "epoch": 0.6317317462621571, + "grad_norm": 3.285425901412964, + "learning_rate": 9.841412665203252e-06, + "loss": 1.1918, + "step": 2176 + }, + { + "epoch": 0.6320220641602555, + "grad_norm": 3.036376476287842, + "learning_rate": 9.841172661653666e-06, + "loss": 0.9972, + "step": 2177 + }, + { + "epoch": 0.6323123820583539, + "grad_norm": 3.130056858062744, + "learning_rate": 9.840932479563555e-06, + "loss": 1.2004, + "step": 2178 + }, + { + "epoch": 0.6326026999564524, + "grad_norm": 3.232766628265381, + "learning_rate": 9.840692118941774e-06, + "loss": 1.3199, + "step": 2179 + }, + { + "epoch": 0.6328930178545508, + "grad_norm": 3.6254005432128906, + "learning_rate": 9.840451579797187e-06, + "loss": 1.2094, + "step": 2180 + }, + { + "epoch": 0.6331833357526492, + "grad_norm": 3.1795482635498047, + "learning_rate": 9.840210862138669e-06, + "loss": 1.1589, + "step": 2181 + }, + { + "epoch": 0.6334736536507476, + "grad_norm": 3.2265725135803223, + "learning_rate": 9.839969965975095e-06, + "loss": 1.1383, + "step": 
2182 + }, + { + "epoch": 0.633763971548846, + "grad_norm": 3.373206615447998, + "learning_rate": 9.839728891315347e-06, + "loss": 1.171, + "step": 2183 + }, + { + "epoch": 0.6340542894469444, + "grad_norm": 4.074607849121094, + "learning_rate": 9.839487638168321e-06, + "loss": 1.2394, + "step": 2184 + }, + { + "epoch": 0.6343446073450428, + "grad_norm": 3.1658074855804443, + "learning_rate": 9.839246206542909e-06, + "loss": 1.0554, + "step": 2185 + }, + { + "epoch": 0.6346349252431412, + "grad_norm": 3.2978014945983887, + "learning_rate": 9.839004596448019e-06, + "loss": 1.1405, + "step": 2186 + }, + { + "epoch": 0.6349252431412397, + "grad_norm": 3.3122334480285645, + "learning_rate": 9.83876280789256e-06, + "loss": 1.3262, + "step": 2187 + }, + { + "epoch": 0.6352155610393381, + "grad_norm": 3.4572861194610596, + "learning_rate": 9.838520840885449e-06, + "loss": 1.2122, + "step": 2188 + }, + { + "epoch": 0.6355058789374365, + "grad_norm": 3.5060789585113525, + "learning_rate": 9.838278695435609e-06, + "loss": 1.1584, + "step": 2189 + }, + { + "epoch": 0.6357961968355349, + "grad_norm": 3.2355239391326904, + "learning_rate": 9.83803637155197e-06, + "loss": 1.0703, + "step": 2190 + }, + { + "epoch": 0.6360865147336333, + "grad_norm": 3.302013635635376, + "learning_rate": 9.837793869243468e-06, + "loss": 1.2737, + "step": 2191 + }, + { + "epoch": 0.6363768326317317, + "grad_norm": 3.2123663425445557, + "learning_rate": 9.83755118851905e-06, + "loss": 1.237, + "step": 2192 + }, + { + "epoch": 0.6366671505298301, + "grad_norm": 3.7422244548797607, + "learning_rate": 9.837308329387664e-06, + "loss": 1.2597, + "step": 2193 + }, + { + "epoch": 0.6369574684279286, + "grad_norm": 3.2257628440856934, + "learning_rate": 9.837065291858267e-06, + "loss": 1.1498, + "step": 2194 + }, + { + "epoch": 0.637247786326027, + "grad_norm": 3.217024087905884, + "learning_rate": 9.83682207593982e-06, + "loss": 1.1622, + "step": 2195 + }, + { + "epoch": 0.6375381042241254, + "grad_norm": 3.39605450630188, + "learning_rate": 9.836578681641295e-06, + "loss": 1.1444, + "step": 2196 + }, + { + "epoch": 0.6378284221222238, + "grad_norm": 3.1654269695281982, + "learning_rate": 9.836335108971668e-06, + "loss": 1.2435, + "step": 2197 + }, + { + "epoch": 0.6381187400203222, + "grad_norm": 3.087963104248047, + "learning_rate": 9.83609135793992e-06, + "loss": 1.0901, + "step": 2198 + }, + { + "epoch": 0.6384090579184206, + "grad_norm": 3.3197085857391357, + "learning_rate": 9.835847428555042e-06, + "loss": 1.152, + "step": 2199 + }, + { + "epoch": 0.6386993758165191, + "grad_norm": 3.3169407844543457, + "learning_rate": 9.835603320826032e-06, + "loss": 1.1586, + "step": 2200 + }, + { + "epoch": 0.6389896937146176, + "grad_norm": 3.6764190196990967, + "learning_rate": 9.835359034761888e-06, + "loss": 1.359, + "step": 2201 + }, + { + "epoch": 0.639280011612716, + "grad_norm": 3.44268798828125, + "learning_rate": 9.835114570371624e-06, + "loss": 1.3031, + "step": 2202 + }, + { + "epoch": 0.6395703295108144, + "grad_norm": 3.2723872661590576, + "learning_rate": 9.834869927664253e-06, + "loss": 1.2116, + "step": 2203 + }, + { + "epoch": 0.6398606474089128, + "grad_norm": 3.278549909591675, + "learning_rate": 9.834625106648796e-06, + "loss": 1.2105, + "step": 2204 + }, + { + "epoch": 0.6401509653070112, + "grad_norm": 3.3444881439208984, + "learning_rate": 9.834380107334284e-06, + "loss": 1.2564, + "step": 2205 + }, + { + "epoch": 0.6404412832051096, + "grad_norm": 3.176098585128784, + "learning_rate": 9.834134929729752e-06, + 
"loss": 1.3411, + "step": 2206 + }, + { + "epoch": 0.640731601103208, + "grad_norm": 3.3489229679107666, + "learning_rate": 9.833889573844245e-06, + "loss": 1.3759, + "step": 2207 + }, + { + "epoch": 0.6410219190013065, + "grad_norm": 3.012814521789551, + "learning_rate": 9.833644039686806e-06, + "loss": 1.0518, + "step": 2208 + }, + { + "epoch": 0.6413122368994049, + "grad_norm": 3.1896815299987793, + "learning_rate": 9.833398327266494e-06, + "loss": 1.317, + "step": 2209 + }, + { + "epoch": 0.6416025547975033, + "grad_norm": 3.2311453819274902, + "learning_rate": 9.83315243659237e-06, + "loss": 1.1722, + "step": 2210 + }, + { + "epoch": 0.6418928726956017, + "grad_norm": 3.300663471221924, + "learning_rate": 9.8329063676735e-06, + "loss": 1.1816, + "step": 2211 + }, + { + "epoch": 0.6421831905937001, + "grad_norm": 3.508462429046631, + "learning_rate": 9.832660120518964e-06, + "loss": 1.1476, + "step": 2212 + }, + { + "epoch": 0.6424735084917985, + "grad_norm": 3.417879581451416, + "learning_rate": 9.832413695137839e-06, + "loss": 1.1925, + "step": 2213 + }, + { + "epoch": 0.6427638263898969, + "grad_norm": 3.324315071105957, + "learning_rate": 9.832167091539215e-06, + "loss": 1.2362, + "step": 2214 + }, + { + "epoch": 0.6430541442879953, + "grad_norm": 3.466980457305908, + "learning_rate": 9.831920309732184e-06, + "loss": 1.0621, + "step": 2215 + }, + { + "epoch": 0.6433444621860938, + "grad_norm": 3.5176475048065186, + "learning_rate": 9.831673349725852e-06, + "loss": 1.1971, + "step": 2216 + }, + { + "epoch": 0.6436347800841922, + "grad_norm": 3.5018646717071533, + "learning_rate": 9.831426211529324e-06, + "loss": 1.1557, + "step": 2217 + }, + { + "epoch": 0.6439250979822906, + "grad_norm": 3.705435276031494, + "learning_rate": 9.831178895151715e-06, + "loss": 1.2571, + "step": 2218 + }, + { + "epoch": 0.644215415880389, + "grad_norm": 4.033883571624756, + "learning_rate": 9.830931400602144e-06, + "loss": 1.3683, + "step": 2219 + }, + { + "epoch": 0.6445057337784874, + "grad_norm": 3.2899346351623535, + "learning_rate": 9.830683727889741e-06, + "loss": 1.1005, + "step": 2220 + }, + { + "epoch": 0.6447960516765858, + "grad_norm": 3.1492760181427, + "learning_rate": 9.830435877023639e-06, + "loss": 1.1345, + "step": 2221 + }, + { + "epoch": 0.6450863695746842, + "grad_norm": 3.2546796798706055, + "learning_rate": 9.830187848012979e-06, + "loss": 1.1064, + "step": 2222 + }, + { + "epoch": 0.6453766874727827, + "grad_norm": 3.29607892036438, + "learning_rate": 9.829939640866907e-06, + "loss": 1.2367, + "step": 2223 + }, + { + "epoch": 0.6456670053708811, + "grad_norm": 3.307436466217041, + "learning_rate": 9.82969125559458e-06, + "loss": 1.1053, + "step": 2224 + }, + { + "epoch": 0.6459573232689795, + "grad_norm": 3.5715291500091553, + "learning_rate": 9.829442692205153e-06, + "loss": 1.2292, + "step": 2225 + }, + { + "epoch": 0.646247641167078, + "grad_norm": 3.4303536415100098, + "learning_rate": 9.829193950707798e-06, + "loss": 1.1351, + "step": 2226 + }, + { + "epoch": 0.6465379590651764, + "grad_norm": 2.975395441055298, + "learning_rate": 9.828945031111686e-06, + "loss": 1.0084, + "step": 2227 + }, + { + "epoch": 0.6468282769632748, + "grad_norm": 3.295159101486206, + "learning_rate": 9.828695933425997e-06, + "loss": 1.1417, + "step": 2228 + }, + { + "epoch": 0.6471185948613732, + "grad_norm": 3.2531330585479736, + "learning_rate": 9.828446657659919e-06, + "loss": 1.1857, + "step": 2229 + }, + { + "epoch": 0.6474089127594717, + "grad_norm": 3.3126182556152344, + "learning_rate": 
9.828197203822645e-06, + "loss": 1.2185, + "step": 2230 + }, + { + "epoch": 0.6476992306575701, + "grad_norm": 3.2954418659210205, + "learning_rate": 9.827947571923373e-06, + "loss": 1.1762, + "step": 2231 + }, + { + "epoch": 0.6479895485556685, + "grad_norm": 3.3297324180603027, + "learning_rate": 9.827697761971311e-06, + "loss": 1.2222, + "step": 2232 + }, + { + "epoch": 0.6482798664537669, + "grad_norm": 3.3421590328216553, + "learning_rate": 9.827447773975672e-06, + "loss": 1.1304, + "step": 2233 + }, + { + "epoch": 0.6485701843518653, + "grad_norm": 3.5584068298339844, + "learning_rate": 9.827197607945673e-06, + "loss": 1.2349, + "step": 2234 + }, + { + "epoch": 0.6488605022499637, + "grad_norm": 3.217658519744873, + "learning_rate": 9.826947263890542e-06, + "loss": 1.1348, + "step": 2235 + }, + { + "epoch": 0.6491508201480621, + "grad_norm": 3.6436023712158203, + "learning_rate": 9.826696741819513e-06, + "loss": 1.1754, + "step": 2236 + }, + { + "epoch": 0.6494411380461605, + "grad_norm": 3.1794240474700928, + "learning_rate": 9.826446041741821e-06, + "loss": 1.1274, + "step": 2237 + }, + { + "epoch": 0.649731455944259, + "grad_norm": 3.486071825027466, + "learning_rate": 9.826195163666717e-06, + "loss": 1.2021, + "step": 2238 + }, + { + "epoch": 0.6500217738423574, + "grad_norm": 3.734785795211792, + "learning_rate": 9.82594410760345e-06, + "loss": 1.2959, + "step": 2239 + }, + { + "epoch": 0.6503120917404558, + "grad_norm": 3.603210926055908, + "learning_rate": 9.825692873561278e-06, + "loss": 1.2613, + "step": 2240 + }, + { + "epoch": 0.6506024096385542, + "grad_norm": 3.3361124992370605, + "learning_rate": 9.825441461549469e-06, + "loss": 1.1428, + "step": 2241 + }, + { + "epoch": 0.6508927275366526, + "grad_norm": 3.122087240219116, + "learning_rate": 9.825189871577294e-06, + "loss": 1.0691, + "step": 2242 + }, + { + "epoch": 0.651183045434751, + "grad_norm": 3.1546952724456787, + "learning_rate": 9.824938103654031e-06, + "loss": 1.1187, + "step": 2243 + }, + { + "epoch": 0.6514733633328494, + "grad_norm": 3.2291035652160645, + "learning_rate": 9.824686157788968e-06, + "loss": 1.0736, + "step": 2244 + }, + { + "epoch": 0.6517636812309479, + "grad_norm": 3.363553762435913, + "learning_rate": 9.82443403399139e-06, + "loss": 1.182, + "step": 2245 + }, + { + "epoch": 0.6520539991290463, + "grad_norm": 3.5415096282958984, + "learning_rate": 9.824181732270601e-06, + "loss": 1.2854, + "step": 2246 + }, + { + "epoch": 0.6523443170271447, + "grad_norm": 3.141082525253296, + "learning_rate": 9.823929252635905e-06, + "loss": 1.155, + "step": 2247 + }, + { + "epoch": 0.6526346349252431, + "grad_norm": 3.1211352348327637, + "learning_rate": 9.823676595096612e-06, + "loss": 1.0612, + "step": 2248 + }, + { + "epoch": 0.6529249528233415, + "grad_norm": 3.169532060623169, + "learning_rate": 9.823423759662039e-06, + "loss": 1.1733, + "step": 2249 + }, + { + "epoch": 0.6532152707214399, + "grad_norm": 3.2215521335601807, + "learning_rate": 9.823170746341513e-06, + "loss": 1.2333, + "step": 2250 + }, + { + "epoch": 0.6535055886195384, + "grad_norm": 3.0309600830078125, + "learning_rate": 9.822917555144364e-06, + "loss": 1.1244, + "step": 2251 + }, + { + "epoch": 0.6537959065176369, + "grad_norm": 3.429142475128174, + "learning_rate": 9.822664186079928e-06, + "loss": 1.1219, + "step": 2252 + }, + { + "epoch": 0.6540862244157353, + "grad_norm": 3.5349714756011963, + "learning_rate": 9.822410639157554e-06, + "loss": 1.1846, + "step": 2253 + }, + { + "epoch": 0.6543765423138337, + "grad_norm": 
3.37827205657959, + "learning_rate": 9.822156914386587e-06, + "loss": 1.083, + "step": 2254 + }, + { + "epoch": 0.6546668602119321, + "grad_norm": 3.172299861907959, + "learning_rate": 9.821903011776385e-06, + "loss": 1.0561, + "step": 2255 + }, + { + "epoch": 0.6549571781100305, + "grad_norm": 3.613541841506958, + "learning_rate": 9.821648931336316e-06, + "loss": 1.2298, + "step": 2256 + }, + { + "epoch": 0.6552474960081289, + "grad_norm": 3.3095669746398926, + "learning_rate": 9.821394673075749e-06, + "loss": 1.1434, + "step": 2257 + }, + { + "epoch": 0.6555378139062273, + "grad_norm": 3.3738560676574707, + "learning_rate": 9.821140237004056e-06, + "loss": 1.0829, + "step": 2258 + }, + { + "epoch": 0.6558281318043258, + "grad_norm": 3.2556138038635254, + "learning_rate": 9.820885623130626e-06, + "loss": 1.2057, + "step": 2259 + }, + { + "epoch": 0.6561184497024242, + "grad_norm": 3.1285338401794434, + "learning_rate": 9.820630831464848e-06, + "loss": 1.0995, + "step": 2260 + }, + { + "epoch": 0.6564087676005226, + "grad_norm": 3.290846109390259, + "learning_rate": 9.820375862016116e-06, + "loss": 1.1008, + "step": 2261 + }, + { + "epoch": 0.656699085498621, + "grad_norm": 3.7028110027313232, + "learning_rate": 9.820120714793837e-06, + "loss": 1.296, + "step": 2262 + }, + { + "epoch": 0.6569894033967194, + "grad_norm": 3.056378126144409, + "learning_rate": 9.819865389807418e-06, + "loss": 1.1055, + "step": 2263 + }, + { + "epoch": 0.6572797212948178, + "grad_norm": 3.3602118492126465, + "learning_rate": 9.819609887066277e-06, + "loss": 1.2804, + "step": 2264 + }, + { + "epoch": 0.6575700391929162, + "grad_norm": 3.4260177612304688, + "learning_rate": 9.819354206579837e-06, + "loss": 1.1645, + "step": 2265 + }, + { + "epoch": 0.6578603570910146, + "grad_norm": 3.3738510608673096, + "learning_rate": 9.819098348357524e-06, + "loss": 1.2217, + "step": 2266 + }, + { + "epoch": 0.6581506749891131, + "grad_norm": 3.576476573944092, + "learning_rate": 9.818842312408776e-06, + "loss": 1.1926, + "step": 2267 + }, + { + "epoch": 0.6584409928872115, + "grad_norm": 3.448089838027954, + "learning_rate": 9.818586098743038e-06, + "loss": 1.3726, + "step": 2268 + }, + { + "epoch": 0.6587313107853099, + "grad_norm": 3.3965907096862793, + "learning_rate": 9.818329707369755e-06, + "loss": 1.2387, + "step": 2269 + }, + { + "epoch": 0.6590216286834083, + "grad_norm": 3.6523730754852295, + "learning_rate": 9.818073138298386e-06, + "loss": 1.1913, + "step": 2270 + }, + { + "epoch": 0.6593119465815067, + "grad_norm": 3.646683931350708, + "learning_rate": 9.817816391538391e-06, + "loss": 1.231, + "step": 2271 + }, + { + "epoch": 0.6596022644796051, + "grad_norm": 2.9595463275909424, + "learning_rate": 9.81755946709924e-06, + "loss": 1.165, + "step": 2272 + }, + { + "epoch": 0.6598925823777035, + "grad_norm": 3.1737749576568604, + "learning_rate": 9.817302364990406e-06, + "loss": 1.0447, + "step": 2273 + }, + { + "epoch": 0.660182900275802, + "grad_norm": 3.2275867462158203, + "learning_rate": 9.817045085221373e-06, + "loss": 1.1765, + "step": 2274 + }, + { + "epoch": 0.6604732181739004, + "grad_norm": 3.4508190155029297, + "learning_rate": 9.81678762780163e-06, + "loss": 1.2429, + "step": 2275 + }, + { + "epoch": 0.6607635360719989, + "grad_norm": 3.456575632095337, + "learning_rate": 9.81652999274067e-06, + "loss": 1.2266, + "step": 2276 + }, + { + "epoch": 0.6610538539700973, + "grad_norm": 3.2471117973327637, + "learning_rate": 9.816272180047996e-06, + "loss": 1.0078, + "step": 2277 + }, + { + "epoch": 
0.6613441718681957, + "grad_norm": 3.268442153930664, + "learning_rate": 9.816014189733114e-06, + "loss": 1.1238, + "step": 2278 + }, + { + "epoch": 0.6616344897662941, + "grad_norm": 3.4898526668548584, + "learning_rate": 9.81575602180554e-06, + "loss": 1.1437, + "step": 2279 + }, + { + "epoch": 0.6619248076643925, + "grad_norm": 3.3566908836364746, + "learning_rate": 9.815497676274796e-06, + "loss": 1.0441, + "step": 2280 + }, + { + "epoch": 0.662215125562491, + "grad_norm": 3.3789467811584473, + "learning_rate": 9.815239153150408e-06, + "loss": 1.1994, + "step": 2281 + }, + { + "epoch": 0.6625054434605894, + "grad_norm": 3.390451669692993, + "learning_rate": 9.81498045244191e-06, + "loss": 1.3149, + "step": 2282 + }, + { + "epoch": 0.6627957613586878, + "grad_norm": 3.3824403285980225, + "learning_rate": 9.814721574158846e-06, + "loss": 1.076, + "step": 2283 + }, + { + "epoch": 0.6630860792567862, + "grad_norm": 3.420539379119873, + "learning_rate": 9.81446251831076e-06, + "loss": 1.193, + "step": 2284 + }, + { + "epoch": 0.6633763971548846, + "grad_norm": 3.389395236968994, + "learning_rate": 9.814203284907207e-06, + "loss": 1.1161, + "step": 2285 + }, + { + "epoch": 0.663666715052983, + "grad_norm": 3.054683208465576, + "learning_rate": 9.813943873957748e-06, + "loss": 1.055, + "step": 2286 + }, + { + "epoch": 0.6639570329510814, + "grad_norm": 2.9350805282592773, + "learning_rate": 9.813684285471947e-06, + "loss": 1.0195, + "step": 2287 + }, + { + "epoch": 0.6642473508491799, + "grad_norm": 3.091355800628662, + "learning_rate": 9.81342451945938e-06, + "loss": 1.0988, + "step": 2288 + }, + { + "epoch": 0.6645376687472783, + "grad_norm": 3.1102099418640137, + "learning_rate": 9.813164575929628e-06, + "loss": 1.0639, + "step": 2289 + }, + { + "epoch": 0.6648279866453767, + "grad_norm": 3.5209128856658936, + "learning_rate": 9.812904454892276e-06, + "loss": 1.2014, + "step": 2290 + }, + { + "epoch": 0.6651183045434751, + "grad_norm": 3.12597393989563, + "learning_rate": 9.812644156356919e-06, + "loss": 1.0899, + "step": 2291 + }, + { + "epoch": 0.6654086224415735, + "grad_norm": 2.8330626487731934, + "learning_rate": 9.812383680333155e-06, + "loss": 1.1208, + "step": 2292 + }, + { + "epoch": 0.6656989403396719, + "grad_norm": 3.543325185775757, + "learning_rate": 9.812123026830589e-06, + "loss": 1.1893, + "step": 2293 + }, + { + "epoch": 0.6659892582377703, + "grad_norm": 3.1367380619049072, + "learning_rate": 9.811862195858837e-06, + "loss": 1.1395, + "step": 2294 + }, + { + "epoch": 0.6662795761358687, + "grad_norm": 3.0807571411132812, + "learning_rate": 9.811601187427516e-06, + "loss": 1.1274, + "step": 2295 + }, + { + "epoch": 0.6665698940339672, + "grad_norm": 3.28458309173584, + "learning_rate": 9.811340001546252e-06, + "loss": 1.0711, + "step": 2296 + }, + { + "epoch": 0.6668602119320656, + "grad_norm": 3.28643798828125, + "learning_rate": 9.81107863822468e-06, + "loss": 1.2233, + "step": 2297 + }, + { + "epoch": 0.667150529830164, + "grad_norm": 3.4898693561553955, + "learning_rate": 9.810817097472436e-06, + "loss": 1.2142, + "step": 2298 + }, + { + "epoch": 0.6674408477282624, + "grad_norm": 3.2157557010650635, + "learning_rate": 9.810555379299166e-06, + "loss": 1.2659, + "step": 2299 + }, + { + "epoch": 0.6677311656263608, + "grad_norm": 3.494442939758301, + "learning_rate": 9.810293483714523e-06, + "loss": 1.2787, + "step": 2300 + }, + { + "epoch": 0.6680214835244593, + "grad_norm": 3.61946702003479, + "learning_rate": 9.810031410728164e-06, + "loss": 1.1851, + "step": 2301 + 
}, + { + "epoch": 0.6683118014225577, + "grad_norm": 3.2607109546661377, + "learning_rate": 9.809769160349758e-06, + "loss": 1.1155, + "step": 2302 + }, + { + "epoch": 0.6686021193206562, + "grad_norm": 3.383884906768799, + "learning_rate": 9.809506732588972e-06, + "loss": 1.2479, + "step": 2303 + }, + { + "epoch": 0.6688924372187546, + "grad_norm": 3.2273740768432617, + "learning_rate": 9.809244127455488e-06, + "loss": 1.1941, + "step": 2304 + }, + { + "epoch": 0.669182755116853, + "grad_norm": 3.4954328536987305, + "learning_rate": 9.808981344958988e-06, + "loss": 1.1645, + "step": 2305 + }, + { + "epoch": 0.6694730730149514, + "grad_norm": 3.2053277492523193, + "learning_rate": 9.808718385109165e-06, + "loss": 1.2592, + "step": 2306 + }, + { + "epoch": 0.6697633909130498, + "grad_norm": 3.0955846309661865, + "learning_rate": 9.808455247915715e-06, + "loss": 1.2793, + "step": 2307 + }, + { + "epoch": 0.6700537088111482, + "grad_norm": 3.197502374649048, + "learning_rate": 9.808191933388345e-06, + "loss": 1.0838, + "step": 2308 + }, + { + "epoch": 0.6703440267092466, + "grad_norm": 3.3631088733673096, + "learning_rate": 9.807928441536762e-06, + "loss": 1.1083, + "step": 2309 + }, + { + "epoch": 0.6706343446073451, + "grad_norm": 2.953148126602173, + "learning_rate": 9.807664772370689e-06, + "loss": 1.0448, + "step": 2310 + }, + { + "epoch": 0.6709246625054435, + "grad_norm": 3.3612277507781982, + "learning_rate": 9.807400925899846e-06, + "loss": 1.0393, + "step": 2311 + }, + { + "epoch": 0.6712149804035419, + "grad_norm": 3.6656582355499268, + "learning_rate": 9.807136902133965e-06, + "loss": 1.2362, + "step": 2312 + }, + { + "epoch": 0.6715052983016403, + "grad_norm": 3.5118401050567627, + "learning_rate": 9.806872701082781e-06, + "loss": 1.2117, + "step": 2313 + }, + { + "epoch": 0.6717956161997387, + "grad_norm": 3.3114728927612305, + "learning_rate": 9.806608322756042e-06, + "loss": 1.1594, + "step": 2314 + }, + { + "epoch": 0.6720859340978371, + "grad_norm": 3.28566837310791, + "learning_rate": 9.806343767163494e-06, + "loss": 1.1699, + "step": 2315 + }, + { + "epoch": 0.6723762519959355, + "grad_norm": 3.1415863037109375, + "learning_rate": 9.806079034314895e-06, + "loss": 1.0319, + "step": 2316 + }, + { + "epoch": 0.672666569894034, + "grad_norm": 3.3450355529785156, + "learning_rate": 9.80581412422001e-06, + "loss": 1.1448, + "step": 2317 + }, + { + "epoch": 0.6729568877921324, + "grad_norm": 3.2889275550842285, + "learning_rate": 9.805549036888605e-06, + "loss": 1.1007, + "step": 2318 + }, + { + "epoch": 0.6732472056902308, + "grad_norm": 3.367488384246826, + "learning_rate": 9.80528377233046e-06, + "loss": 1.1438, + "step": 2319 + }, + { + "epoch": 0.6735375235883292, + "grad_norm": 3.3112919330596924, + "learning_rate": 9.805018330555356e-06, + "loss": 1.3459, + "step": 2320 + }, + { + "epoch": 0.6738278414864276, + "grad_norm": 3.415867567062378, + "learning_rate": 9.804752711573082e-06, + "loss": 1.1417, + "step": 2321 + }, + { + "epoch": 0.674118159384526, + "grad_norm": 3.7435660362243652, + "learning_rate": 9.804486915393437e-06, + "loss": 1.3839, + "step": 2322 + }, + { + "epoch": 0.6744084772826244, + "grad_norm": 3.293759822845459, + "learning_rate": 9.80422094202622e-06, + "loss": 1.103, + "step": 2323 + }, + { + "epoch": 0.6746987951807228, + "grad_norm": 3.387779474258423, + "learning_rate": 9.803954791481239e-06, + "loss": 1.2076, + "step": 2324 + }, + { + "epoch": 0.6749891130788213, + "grad_norm": 3.345348358154297, + "learning_rate": 9.803688463768314e-06, + 
"loss": 1.1311, + "step": 2325 + }, + { + "epoch": 0.6752794309769198, + "grad_norm": 3.251539707183838, + "learning_rate": 9.803421958897264e-06, + "loss": 1.1487, + "step": 2326 + }, + { + "epoch": 0.6755697488750182, + "grad_norm": 3.229526996612549, + "learning_rate": 9.803155276877918e-06, + "loss": 1.1344, + "step": 2327 + }, + { + "epoch": 0.6758600667731166, + "grad_norm": 3.530510187149048, + "learning_rate": 9.802888417720113e-06, + "loss": 1.2112, + "step": 2328 + }, + { + "epoch": 0.676150384671215, + "grad_norm": 3.2944540977478027, + "learning_rate": 9.802621381433687e-06, + "loss": 1.2135, + "step": 2329 + }, + { + "epoch": 0.6764407025693134, + "grad_norm": 3.1269474029541016, + "learning_rate": 9.802354168028491e-06, + "loss": 1.1785, + "step": 2330 + }, + { + "epoch": 0.6767310204674118, + "grad_norm": 3.0783286094665527, + "learning_rate": 9.80208677751438e-06, + "loss": 1.1425, + "step": 2331 + }, + { + "epoch": 0.6770213383655103, + "grad_norm": 3.5151352882385254, + "learning_rate": 9.801819209901214e-06, + "loss": 1.2729, + "step": 2332 + }, + { + "epoch": 0.6773116562636087, + "grad_norm": 3.083354949951172, + "learning_rate": 9.801551465198862e-06, + "loss": 1.0144, + "step": 2333 + }, + { + "epoch": 0.6776019741617071, + "grad_norm": 3.382624387741089, + "learning_rate": 9.801283543417195e-06, + "loss": 1.1739, + "step": 2334 + }, + { + "epoch": 0.6778922920598055, + "grad_norm": 3.231215000152588, + "learning_rate": 9.801015444566097e-06, + "loss": 1.2779, + "step": 2335 + }, + { + "epoch": 0.6781826099579039, + "grad_norm": 3.257922887802124, + "learning_rate": 9.800747168655455e-06, + "loss": 1.2151, + "step": 2336 + }, + { + "epoch": 0.6784729278560023, + "grad_norm": 3.3422892093658447, + "learning_rate": 9.800478715695165e-06, + "loss": 1.1516, + "step": 2337 + }, + { + "epoch": 0.6787632457541007, + "grad_norm": 3.452329158782959, + "learning_rate": 9.800210085695122e-06, + "loss": 1.1959, + "step": 2338 + }, + { + "epoch": 0.6790535636521992, + "grad_norm": 3.49959397315979, + "learning_rate": 9.799941278665237e-06, + "loss": 1.1562, + "step": 2339 + }, + { + "epoch": 0.6793438815502976, + "grad_norm": 3.652210235595703, + "learning_rate": 9.79967229461542e-06, + "loss": 1.1846, + "step": 2340 + }, + { + "epoch": 0.679634199448396, + "grad_norm": 2.9146311283111572, + "learning_rate": 9.799403133555596e-06, + "loss": 1.1545, + "step": 2341 + }, + { + "epoch": 0.6799245173464944, + "grad_norm": 3.4553141593933105, + "learning_rate": 9.79913379549569e-06, + "loss": 1.1622, + "step": 2342 + }, + { + "epoch": 0.6802148352445928, + "grad_norm": 3.6774072647094727, + "learning_rate": 9.798864280445633e-06, + "loss": 1.3584, + "step": 2343 + }, + { + "epoch": 0.6805051531426912, + "grad_norm": 3.1811299324035645, + "learning_rate": 9.798594588415364e-06, + "loss": 1.1414, + "step": 2344 + }, + { + "epoch": 0.6807954710407896, + "grad_norm": 3.348858594894409, + "learning_rate": 9.798324719414833e-06, + "loss": 1.1112, + "step": 2345 + }, + { + "epoch": 0.681085788938888, + "grad_norm": 3.5631508827209473, + "learning_rate": 9.79805467345399e-06, + "loss": 1.2831, + "step": 2346 + }, + { + "epoch": 0.6813761068369865, + "grad_norm": 3.5303027629852295, + "learning_rate": 9.797784450542794e-06, + "loss": 1.1016, + "step": 2347 + }, + { + "epoch": 0.6816664247350849, + "grad_norm": 3.4458773136138916, + "learning_rate": 9.79751405069121e-06, + "loss": 1.2462, + "step": 2348 + }, + { + "epoch": 0.6819567426331833, + "grad_norm": 3.3334274291992188, + "learning_rate": 
9.797243473909214e-06, + "loss": 1.1773, + "step": 2349 + }, + { + "epoch": 0.6822470605312817, + "grad_norm": 3.3247268199920654, + "learning_rate": 9.796972720206783e-06, + "loss": 1.1246, + "step": 2350 + }, + { + "epoch": 0.6825373784293802, + "grad_norm": 3.354071617126465, + "learning_rate": 9.796701789593902e-06, + "loss": 1.1596, + "step": 2351 + }, + { + "epoch": 0.6828276963274786, + "grad_norm": 3.145782709121704, + "learning_rate": 9.79643068208056e-06, + "loss": 1.1527, + "step": 2352 + }, + { + "epoch": 0.683118014225577, + "grad_norm": 3.3376376628875732, + "learning_rate": 9.796159397676758e-06, + "loss": 1.1915, + "step": 2353 + }, + { + "epoch": 0.6834083321236755, + "grad_norm": 3.3845038414001465, + "learning_rate": 9.795887936392502e-06, + "loss": 1.1748, + "step": 2354 + }, + { + "epoch": 0.6836986500217739, + "grad_norm": 3.6921133995056152, + "learning_rate": 9.795616298237802e-06, + "loss": 1.1177, + "step": 2355 + }, + { + "epoch": 0.6839889679198723, + "grad_norm": 4.1208600997924805, + "learning_rate": 9.795344483222675e-06, + "loss": 1.183, + "step": 2356 + }, + { + "epoch": 0.6842792858179707, + "grad_norm": 3.442371368408203, + "learning_rate": 9.795072491357147e-06, + "loss": 1.2422, + "step": 2357 + }, + { + "epoch": 0.6845696037160691, + "grad_norm": 3.38021183013916, + "learning_rate": 9.79480032265125e-06, + "loss": 1.2806, + "step": 2358 + }, + { + "epoch": 0.6848599216141675, + "grad_norm": 3.3694331645965576, + "learning_rate": 9.794527977115019e-06, + "loss": 1.168, + "step": 2359 + }, + { + "epoch": 0.685150239512266, + "grad_norm": 3.2959866523742676, + "learning_rate": 9.794255454758497e-06, + "loss": 1.0299, + "step": 2360 + }, + { + "epoch": 0.6854405574103644, + "grad_norm": 3.3888444900512695, + "learning_rate": 9.793982755591738e-06, + "loss": 1.3449, + "step": 2361 + }, + { + "epoch": 0.6857308753084628, + "grad_norm": 3.2652950286865234, + "learning_rate": 9.793709879624797e-06, + "loss": 1.1281, + "step": 2362 + }, + { + "epoch": 0.6860211932065612, + "grad_norm": 3.525996208190918, + "learning_rate": 9.793436826867737e-06, + "loss": 1.2652, + "step": 2363 + }, + { + "epoch": 0.6863115111046596, + "grad_norm": 3.430039405822754, + "learning_rate": 9.79316359733063e-06, + "loss": 1.2594, + "step": 2364 + }, + { + "epoch": 0.686601829002758, + "grad_norm": 3.4313323497772217, + "learning_rate": 9.792890191023551e-06, + "loss": 1.1357, + "step": 2365 + }, + { + "epoch": 0.6868921469008564, + "grad_norm": 3.3758277893066406, + "learning_rate": 9.792616607956585e-06, + "loss": 1.2663, + "step": 2366 + }, + { + "epoch": 0.6871824647989548, + "grad_norm": 3.622230052947998, + "learning_rate": 9.79234284813982e-06, + "loss": 1.2205, + "step": 2367 + }, + { + "epoch": 0.6874727826970533, + "grad_norm": 3.0984694957733154, + "learning_rate": 9.792068911583353e-06, + "loss": 1.0823, + "step": 2368 + }, + { + "epoch": 0.6877631005951517, + "grad_norm": 3.3490564823150635, + "learning_rate": 9.791794798297286e-06, + "loss": 1.2032, + "step": 2369 + }, + { + "epoch": 0.6880534184932501, + "grad_norm": 3.1726980209350586, + "learning_rate": 9.791520508291728e-06, + "loss": 1.11, + "step": 2370 + }, + { + "epoch": 0.6883437363913485, + "grad_norm": 3.6225693225860596, + "learning_rate": 9.791246041576795e-06, + "loss": 1.3124, + "step": 2371 + }, + { + "epoch": 0.6886340542894469, + "grad_norm": 3.639941692352295, + "learning_rate": 9.790971398162608e-06, + "loss": 1.1873, + "step": 2372 + }, + { + "epoch": 0.6889243721875453, + "grad_norm": 
3.2535839080810547, + "learning_rate": 9.7906965780593e-06, + "loss": 1.1934, + "step": 2373 + }, + { + "epoch": 0.6892146900856437, + "grad_norm": 3.317662000656128, + "learning_rate": 9.790421581277002e-06, + "loss": 1.167, + "step": 2374 + }, + { + "epoch": 0.6895050079837421, + "grad_norm": 3.376481533050537, + "learning_rate": 9.790146407825856e-06, + "loss": 1.1746, + "step": 2375 + }, + { + "epoch": 0.6897953258818407, + "grad_norm": 3.3618693351745605, + "learning_rate": 9.789871057716012e-06, + "loss": 1.2363, + "step": 2376 + }, + { + "epoch": 0.6900856437799391, + "grad_norm": 3.43084979057312, + "learning_rate": 9.789595530957626e-06, + "loss": 1.1278, + "step": 2377 + }, + { + "epoch": 0.6903759616780375, + "grad_norm": 3.321505546569824, + "learning_rate": 9.789319827560854e-06, + "loss": 1.2212, + "step": 2378 + }, + { + "epoch": 0.6906662795761359, + "grad_norm": 3.113330364227295, + "learning_rate": 9.78904394753587e-06, + "loss": 1.0621, + "step": 2379 + }, + { + "epoch": 0.6909565974742343, + "grad_norm": 3.3849680423736572, + "learning_rate": 9.788767890892845e-06, + "loss": 1.2761, + "step": 2380 + }, + { + "epoch": 0.6912469153723327, + "grad_norm": 3.285853147506714, + "learning_rate": 9.78849165764196e-06, + "loss": 1.0482, + "step": 2381 + }, + { + "epoch": 0.6915372332704312, + "grad_norm": 3.0740060806274414, + "learning_rate": 9.788215247793405e-06, + "loss": 1.1211, + "step": 2382 + }, + { + "epoch": 0.6918275511685296, + "grad_norm": 3.0753612518310547, + "learning_rate": 9.78793866135737e-06, + "loss": 1.0918, + "step": 2383 + }, + { + "epoch": 0.692117869066628, + "grad_norm": 3.350917100906372, + "learning_rate": 9.787661898344058e-06, + "loss": 1.348, + "step": 2384 + }, + { + "epoch": 0.6924081869647264, + "grad_norm": 3.713820219039917, + "learning_rate": 9.787384958763674e-06, + "loss": 1.2728, + "step": 2385 + }, + { + "epoch": 0.6926985048628248, + "grad_norm": 3.2374231815338135, + "learning_rate": 9.787107842626434e-06, + "loss": 1.1106, + "step": 2386 + }, + { + "epoch": 0.6929888227609232, + "grad_norm": 3.0998446941375732, + "learning_rate": 9.786830549942556e-06, + "loss": 1.147, + "step": 2387 + }, + { + "epoch": 0.6932791406590216, + "grad_norm": 3.490924835205078, + "learning_rate": 9.786553080722266e-06, + "loss": 1.3013, + "step": 2388 + }, + { + "epoch": 0.69356945855712, + "grad_norm": 3.3626949787139893, + "learning_rate": 9.786275434975797e-06, + "loss": 1.2637, + "step": 2389 + }, + { + "epoch": 0.6938597764552185, + "grad_norm": 3.2617788314819336, + "learning_rate": 9.785997612713391e-06, + "loss": 1.0639, + "step": 2390 + }, + { + "epoch": 0.6941500943533169, + "grad_norm": 3.3937413692474365, + "learning_rate": 9.785719613945293e-06, + "loss": 1.1385, + "step": 2391 + }, + { + "epoch": 0.6944404122514153, + "grad_norm": 3.2378339767456055, + "learning_rate": 9.785441438681755e-06, + "loss": 1.1471, + "step": 2392 + }, + { + "epoch": 0.6947307301495137, + "grad_norm": 3.2014105319976807, + "learning_rate": 9.785163086933034e-06, + "loss": 1.1106, + "step": 2393 + }, + { + "epoch": 0.6950210480476121, + "grad_norm": 3.524437665939331, + "learning_rate": 9.784884558709398e-06, + "loss": 1.1607, + "step": 2394 + }, + { + "epoch": 0.6953113659457105, + "grad_norm": 3.2841367721557617, + "learning_rate": 9.784605854021118e-06, + "loss": 0.9346, + "step": 2395 + }, + { + "epoch": 0.6956016838438089, + "grad_norm": 3.702146291732788, + "learning_rate": 9.784326972878474e-06, + "loss": 1.266, + "step": 2396 + }, + { + "epoch": 
0.6958920017419074, + "grad_norm": 3.6109771728515625, + "learning_rate": 9.784047915291748e-06, + "loss": 1.1987, + "step": 2397 + }, + { + "epoch": 0.6961823196400058, + "grad_norm": 3.68677020072937, + "learning_rate": 9.783768681271234e-06, + "loss": 1.3537, + "step": 2398 + }, + { + "epoch": 0.6964726375381042, + "grad_norm": 2.9631056785583496, + "learning_rate": 9.78348927082723e-06, + "loss": 1.0584, + "step": 2399 + }, + { + "epoch": 0.6967629554362026, + "grad_norm": 3.4369635581970215, + "learning_rate": 9.78320968397004e-06, + "loss": 1.1968, + "step": 2400 + }, + { + "epoch": 0.697053273334301, + "grad_norm": 3.149402379989624, + "learning_rate": 9.782929920709974e-06, + "loss": 1.1627, + "step": 2401 + }, + { + "epoch": 0.6973435912323995, + "grad_norm": 3.3772337436676025, + "learning_rate": 9.782649981057352e-06, + "loss": 1.1989, + "step": 2402 + }, + { + "epoch": 0.6976339091304979, + "grad_norm": 3.39142107963562, + "learning_rate": 9.782369865022495e-06, + "loss": 1.2028, + "step": 2403 + }, + { + "epoch": 0.6979242270285964, + "grad_norm": 3.2515244483947754, + "learning_rate": 9.782089572615737e-06, + "loss": 1.1666, + "step": 2404 + }, + { + "epoch": 0.6982145449266948, + "grad_norm": 2.9869136810302734, + "learning_rate": 9.781809103847411e-06, + "loss": 1.0236, + "step": 2405 + }, + { + "epoch": 0.6985048628247932, + "grad_norm": 3.331195592880249, + "learning_rate": 9.781528458727865e-06, + "loss": 1.1569, + "step": 2406 + }, + { + "epoch": 0.6987951807228916, + "grad_norm": 3.2006444931030273, + "learning_rate": 9.781247637267446e-06, + "loss": 1.0676, + "step": 2407 + }, + { + "epoch": 0.69908549862099, + "grad_norm": 3.203761577606201, + "learning_rate": 9.780966639476513e-06, + "loss": 1.2282, + "step": 2408 + }, + { + "epoch": 0.6993758165190884, + "grad_norm": 3.381657600402832, + "learning_rate": 9.780685465365426e-06, + "loss": 1.1954, + "step": 2409 + }, + { + "epoch": 0.6996661344171868, + "grad_norm": 3.2319588661193848, + "learning_rate": 9.780404114944556e-06, + "loss": 1.1636, + "step": 2410 + }, + { + "epoch": 0.6999564523152852, + "grad_norm": 3.4879820346832275, + "learning_rate": 9.780122588224278e-06, + "loss": 1.2639, + "step": 2411 + }, + { + "epoch": 0.7002467702133837, + "grad_norm": 3.1994943618774414, + "learning_rate": 9.77984088521498e-06, + "loss": 1.1396, + "step": 2412 + }, + { + "epoch": 0.7005370881114821, + "grad_norm": 3.4960827827453613, + "learning_rate": 9.779559005927043e-06, + "loss": 1.1809, + "step": 2413 + }, + { + "epoch": 0.7008274060095805, + "grad_norm": 3.188183307647705, + "learning_rate": 9.779276950370868e-06, + "loss": 1.1677, + "step": 2414 + }, + { + "epoch": 0.7011177239076789, + "grad_norm": 3.095752000808716, + "learning_rate": 9.778994718556856e-06, + "loss": 1.0553, + "step": 2415 + }, + { + "epoch": 0.7014080418057773, + "grad_norm": 3.390242099761963, + "learning_rate": 9.778712310495415e-06, + "loss": 1.2226, + "step": 2416 + }, + { + "epoch": 0.7016983597038757, + "grad_norm": 2.846047878265381, + "learning_rate": 9.77842972619696e-06, + "loss": 1.0552, + "step": 2417 + }, + { + "epoch": 0.7019886776019741, + "grad_norm": 3.244255304336548, + "learning_rate": 9.778146965671915e-06, + "loss": 1.2517, + "step": 2418 + }, + { + "epoch": 0.7022789955000726, + "grad_norm": 3.267493724822998, + "learning_rate": 9.777864028930705e-06, + "loss": 1.1721, + "step": 2419 + }, + { + "epoch": 0.702569313398171, + "grad_norm": 3.073822259902954, + "learning_rate": 9.777580915983765e-06, + "loss": 1.129, + "step": 
2420 + }, + { + "epoch": 0.7028596312962694, + "grad_norm": 3.1357955932617188, + "learning_rate": 9.777297626841536e-06, + "loss": 1.2401, + "step": 2421 + }, + { + "epoch": 0.7031499491943678, + "grad_norm": 3.211599349975586, + "learning_rate": 9.777014161514468e-06, + "loss": 1.203, + "step": 2422 + }, + { + "epoch": 0.7034402670924662, + "grad_norm": 3.394411325454712, + "learning_rate": 9.776730520013013e-06, + "loss": 1.2225, + "step": 2423 + }, + { + "epoch": 0.7037305849905646, + "grad_norm": 3.4315035343170166, + "learning_rate": 9.77644670234763e-06, + "loss": 1.1715, + "step": 2424 + }, + { + "epoch": 0.704020902888663, + "grad_norm": 3.435701847076416, + "learning_rate": 9.776162708528792e-06, + "loss": 1.2022, + "step": 2425 + }, + { + "epoch": 0.7043112207867614, + "grad_norm": 3.5279853343963623, + "learning_rate": 9.775878538566965e-06, + "loss": 1.1028, + "step": 2426 + }, + { + "epoch": 0.70460153868486, + "grad_norm": 3.295423984527588, + "learning_rate": 9.775594192472635e-06, + "loss": 1.2768, + "step": 2427 + }, + { + "epoch": 0.7048918565829584, + "grad_norm": 3.0675647258758545, + "learning_rate": 9.775309670256286e-06, + "loss": 1.2386, + "step": 2428 + }, + { + "epoch": 0.7051821744810568, + "grad_norm": 3.320549726486206, + "learning_rate": 9.77502497192841e-06, + "loss": 1.1444, + "step": 2429 + }, + { + "epoch": 0.7054724923791552, + "grad_norm": 3.095872402191162, + "learning_rate": 9.774740097499509e-06, + "loss": 1.0612, + "step": 2430 + }, + { + "epoch": 0.7057628102772536, + "grad_norm": 3.0651066303253174, + "learning_rate": 9.774455046980087e-06, + "loss": 0.9936, + "step": 2431 + }, + { + "epoch": 0.706053128175352, + "grad_norm": 3.40466570854187, + "learning_rate": 9.77416982038066e-06, + "loss": 1.113, + "step": 2432 + }, + { + "epoch": 0.7063434460734505, + "grad_norm": 3.6496083736419678, + "learning_rate": 9.773884417711743e-06, + "loss": 1.2631, + "step": 2433 + }, + { + "epoch": 0.7066337639715489, + "grad_norm": 3.3464813232421875, + "learning_rate": 9.773598838983863e-06, + "loss": 1.3191, + "step": 2434 + }, + { + "epoch": 0.7069240818696473, + "grad_norm": 3.3084921836853027, + "learning_rate": 9.773313084207552e-06, + "loss": 1.2405, + "step": 2435 + }, + { + "epoch": 0.7072143997677457, + "grad_norm": 3.0100600719451904, + "learning_rate": 9.773027153393349e-06, + "loss": 1.0613, + "step": 2436 + }, + { + "epoch": 0.7075047176658441, + "grad_norm": 3.3531084060668945, + "learning_rate": 9.772741046551798e-06, + "loss": 1.1767, + "step": 2437 + }, + { + "epoch": 0.7077950355639425, + "grad_norm": 3.3284599781036377, + "learning_rate": 9.772454763693453e-06, + "loss": 1.1301, + "step": 2438 + }, + { + "epoch": 0.7080853534620409, + "grad_norm": 3.4888689517974854, + "learning_rate": 9.772168304828869e-06, + "loss": 1.1039, + "step": 2439 + }, + { + "epoch": 0.7083756713601393, + "grad_norm": 3.0899245738983154, + "learning_rate": 9.771881669968611e-06, + "loss": 1.0399, + "step": 2440 + }, + { + "epoch": 0.7086659892582378, + "grad_norm": 3.2881476879119873, + "learning_rate": 9.771594859123252e-06, + "loss": 1.2318, + "step": 2441 + }, + { + "epoch": 0.7089563071563362, + "grad_norm": 4.053572654724121, + "learning_rate": 9.771307872303365e-06, + "loss": 1.2404, + "step": 2442 + }, + { + "epoch": 0.7092466250544346, + "grad_norm": 3.781298875808716, + "learning_rate": 9.77102070951954e-06, + "loss": 1.3447, + "step": 2443 + }, + { + "epoch": 0.709536942952533, + "grad_norm": 3.022076368331909, + "learning_rate": 9.770733370782365e-06, + 
"loss": 1.1249, + "step": 2444 + }, + { + "epoch": 0.7098272608506314, + "grad_norm": 3.1669278144836426, + "learning_rate": 9.770445856102438e-06, + "loss": 0.9911, + "step": 2445 + }, + { + "epoch": 0.7101175787487298, + "grad_norm": 3.3084747791290283, + "learning_rate": 9.770158165490358e-06, + "loss": 1.0994, + "step": 2446 + }, + { + "epoch": 0.7104078966468282, + "grad_norm": 3.027456760406494, + "learning_rate": 9.769870298956739e-06, + "loss": 1.0671, + "step": 2447 + }, + { + "epoch": 0.7106982145449267, + "grad_norm": 3.577392816543579, + "learning_rate": 9.769582256512195e-06, + "loss": 1.2498, + "step": 2448 + }, + { + "epoch": 0.7109885324430251, + "grad_norm": 3.087620735168457, + "learning_rate": 9.76929403816735e-06, + "loss": 1.2372, + "step": 2449 + }, + { + "epoch": 0.7112788503411235, + "grad_norm": 3.3493881225585938, + "learning_rate": 9.769005643932833e-06, + "loss": 1.1223, + "step": 2450 + }, + { + "epoch": 0.7115691682392219, + "grad_norm": 3.309208631515503, + "learning_rate": 9.768717073819282e-06, + "loss": 1.2156, + "step": 2451 + }, + { + "epoch": 0.7118594861373204, + "grad_norm": 3.5544214248657227, + "learning_rate": 9.768428327837339e-06, + "loss": 1.2821, + "step": 2452 + }, + { + "epoch": 0.7121498040354188, + "grad_norm": 3.2072324752807617, + "learning_rate": 9.76813940599765e-06, + "loss": 1.0891, + "step": 2453 + }, + { + "epoch": 0.7124401219335172, + "grad_norm": 3.3209030628204346, + "learning_rate": 9.767850308310872e-06, + "loss": 1.1572, + "step": 2454 + }, + { + "epoch": 0.7127304398316157, + "grad_norm": 3.294210910797119, + "learning_rate": 9.767561034787666e-06, + "loss": 1.0957, + "step": 2455 + }, + { + "epoch": 0.7130207577297141, + "grad_norm": 3.353680372238159, + "learning_rate": 9.767271585438703e-06, + "loss": 1.1803, + "step": 2456 + }, + { + "epoch": 0.7133110756278125, + "grad_norm": 2.933467149734497, + "learning_rate": 9.766981960274653e-06, + "loss": 1.0839, + "step": 2457 + }, + { + "epoch": 0.7136013935259109, + "grad_norm": 3.1124205589294434, + "learning_rate": 9.766692159306202e-06, + "loss": 1.0837, + "step": 2458 + }, + { + "epoch": 0.7138917114240093, + "grad_norm": 3.372271776199341, + "learning_rate": 9.766402182544034e-06, + "loss": 1.1596, + "step": 2459 + }, + { + "epoch": 0.7141820293221077, + "grad_norm": 3.386247396469116, + "learning_rate": 9.766112029998847e-06, + "loss": 1.1766, + "step": 2460 + }, + { + "epoch": 0.7144723472202061, + "grad_norm": 3.4302918910980225, + "learning_rate": 9.76582170168134e-06, + "loss": 1.1653, + "step": 2461 + }, + { + "epoch": 0.7147626651183046, + "grad_norm": 3.3646481037139893, + "learning_rate": 9.765531197602219e-06, + "loss": 1.2086, + "step": 2462 + }, + { + "epoch": 0.715052983016403, + "grad_norm": 3.197026491165161, + "learning_rate": 9.765240517772196e-06, + "loss": 1.1854, + "step": 2463 + }, + { + "epoch": 0.7153433009145014, + "grad_norm": 3.009091377258301, + "learning_rate": 9.764949662201997e-06, + "loss": 1.0761, + "step": 2464 + }, + { + "epoch": 0.7156336188125998, + "grad_norm": 3.1493172645568848, + "learning_rate": 9.764658630902345e-06, + "loss": 1.0669, + "step": 2465 + }, + { + "epoch": 0.7159239367106982, + "grad_norm": 3.1372087001800537, + "learning_rate": 9.764367423883973e-06, + "loss": 1.1141, + "step": 2466 + }, + { + "epoch": 0.7162142546087966, + "grad_norm": 3.358511209487915, + "learning_rate": 9.76407604115762e-06, + "loss": 1.1396, + "step": 2467 + }, + { + "epoch": 0.716504572506895, + "grad_norm": 3.5119621753692627, + 
"learning_rate": 9.763784482734035e-06, + "loss": 1.2956, + "step": 2468 + }, + { + "epoch": 0.7167948904049934, + "grad_norm": 3.1730403900146484, + "learning_rate": 9.763492748623969e-06, + "loss": 1.0829, + "step": 2469 + }, + { + "epoch": 0.7170852083030919, + "grad_norm": 3.2893500328063965, + "learning_rate": 9.763200838838178e-06, + "loss": 1.1184, + "step": 2470 + }, + { + "epoch": 0.7173755262011903, + "grad_norm": 2.979743480682373, + "learning_rate": 9.762908753387432e-06, + "loss": 1.0347, + "step": 2471 + }, + { + "epoch": 0.7176658440992887, + "grad_norm": 3.22346568107605, + "learning_rate": 9.762616492282502e-06, + "loss": 1.0688, + "step": 2472 + }, + { + "epoch": 0.7179561619973871, + "grad_norm": 3.191016912460327, + "learning_rate": 9.762324055534165e-06, + "loss": 1.1585, + "step": 2473 + }, + { + "epoch": 0.7182464798954855, + "grad_norm": 2.974458932876587, + "learning_rate": 9.762031443153207e-06, + "loss": 0.9389, + "step": 2474 + }, + { + "epoch": 0.7185367977935839, + "grad_norm": 3.3603460788726807, + "learning_rate": 9.761738655150419e-06, + "loss": 1.1379, + "step": 2475 + }, + { + "epoch": 0.7188271156916823, + "grad_norm": 3.3447885513305664, + "learning_rate": 9.761445691536598e-06, + "loss": 1.1837, + "step": 2476 + }, + { + "epoch": 0.7191174335897809, + "grad_norm": 3.482642412185669, + "learning_rate": 9.76115255232255e-06, + "loss": 1.1967, + "step": 2477 + }, + { + "epoch": 0.7194077514878793, + "grad_norm": 3.208934783935547, + "learning_rate": 9.760859237519087e-06, + "loss": 1.1285, + "step": 2478 + }, + { + "epoch": 0.7196980693859777, + "grad_norm": 3.199887990951538, + "learning_rate": 9.760565747137023e-06, + "loss": 1.0891, + "step": 2479 + }, + { + "epoch": 0.7199883872840761, + "grad_norm": 3.1284048557281494, + "learning_rate": 9.760272081187183e-06, + "loss": 1.1122, + "step": 2480 + }, + { + "epoch": 0.7202787051821745, + "grad_norm": 3.603379726409912, + "learning_rate": 9.7599782396804e-06, + "loss": 1.2686, + "step": 2481 + }, + { + "epoch": 0.7205690230802729, + "grad_norm": 3.496004581451416, + "learning_rate": 9.759684222627506e-06, + "loss": 1.2055, + "step": 2482 + }, + { + "epoch": 0.7208593409783713, + "grad_norm": 3.3529865741729736, + "learning_rate": 9.759390030039347e-06, + "loss": 1.154, + "step": 2483 + }, + { + "epoch": 0.7211496588764698, + "grad_norm": 3.08897066116333, + "learning_rate": 9.759095661926772e-06, + "loss": 1.0814, + "step": 2484 + }, + { + "epoch": 0.7214399767745682, + "grad_norm": 3.2618985176086426, + "learning_rate": 9.758801118300638e-06, + "loss": 1.1316, + "step": 2485 + }, + { + "epoch": 0.7217302946726666, + "grad_norm": 3.4715993404388428, + "learning_rate": 9.758506399171808e-06, + "loss": 1.2883, + "step": 2486 + }, + { + "epoch": 0.722020612570765, + "grad_norm": 3.0561084747314453, + "learning_rate": 9.758211504551151e-06, + "loss": 1.0894, + "step": 2487 + }, + { + "epoch": 0.7223109304688634, + "grad_norm": 3.1737711429595947, + "learning_rate": 9.75791643444954e-06, + "loss": 1.238, + "step": 2488 + }, + { + "epoch": 0.7226012483669618, + "grad_norm": 3.498148202896118, + "learning_rate": 9.757621188877861e-06, + "loss": 1.3628, + "step": 2489 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 2.9819672107696533, + "learning_rate": 9.757325767846999e-06, + "loss": 0.9908, + "step": 2490 + }, + { + "epoch": 0.7231818841631587, + "grad_norm": 2.9681432247161865, + "learning_rate": 9.757030171367852e-06, + "loss": 1.111, + "step": 2491 + }, + { + "epoch": 0.7234722020612571, + 
"grad_norm": 3.207848072052002, + "learning_rate": 9.756734399451318e-06, + "loss": 1.0846, + "step": 2492 + }, + { + "epoch": 0.7237625199593555, + "grad_norm": 3.4582133293151855, + "learning_rate": 9.756438452108307e-06, + "loss": 1.1117, + "step": 2493 + }, + { + "epoch": 0.7240528378574539, + "grad_norm": 3.1228976249694824, + "learning_rate": 9.756142329349737e-06, + "loss": 1.1891, + "step": 2494 + }, + { + "epoch": 0.7243431557555523, + "grad_norm": 3.249508857727051, + "learning_rate": 9.755846031186521e-06, + "loss": 1.0953, + "step": 2495 + }, + { + "epoch": 0.7246334736536507, + "grad_norm": 3.248222589492798, + "learning_rate": 9.755549557629593e-06, + "loss": 1.1658, + "step": 2496 + }, + { + "epoch": 0.7249237915517491, + "grad_norm": 3.254011869430542, + "learning_rate": 9.755252908689885e-06, + "loss": 1.117, + "step": 2497 + }, + { + "epoch": 0.7252141094498475, + "grad_norm": 3.4545297622680664, + "learning_rate": 9.754956084378336e-06, + "loss": 1.1358, + "step": 2498 + }, + { + "epoch": 0.725504427347946, + "grad_norm": 3.3574445247650146, + "learning_rate": 9.754659084705893e-06, + "loss": 1.1986, + "step": 2499 + }, + { + "epoch": 0.7257947452460444, + "grad_norm": 3.6412932872772217, + "learning_rate": 9.75436190968351e-06, + "loss": 1.2042, + "step": 2500 + }, + { + "epoch": 0.7257947452460444, + "eval_loss": 1.1983624696731567, + "eval_runtime": 11.2813, + "eval_samples_per_second": 35.457, + "eval_steps_per_second": 4.432, + "step": 2500 + }, + { + "epoch": 0.7260850631441428, + "grad_norm": 3.041032314300537, + "learning_rate": 9.754064559322147e-06, + "loss": 1.0758, + "step": 2501 + }, + { + "epoch": 0.7263753810422413, + "grad_norm": 3.4390034675598145, + "learning_rate": 9.753767033632769e-06, + "loss": 1.2908, + "step": 2502 + }, + { + "epoch": 0.7266656989403397, + "grad_norm": 3.178821563720703, + "learning_rate": 9.75346933262635e-06, + "loss": 1.0938, + "step": 2503 + }, + { + "epoch": 0.7269560168384381, + "grad_norm": 3.250523567199707, + "learning_rate": 9.753171456313868e-06, + "loss": 1.143, + "step": 2504 + }, + { + "epoch": 0.7272463347365365, + "grad_norm": 3.777912139892578, + "learning_rate": 9.752873404706309e-06, + "loss": 1.2468, + "step": 2505 + }, + { + "epoch": 0.727536652634635, + "grad_norm": 3.3552846908569336, + "learning_rate": 9.752575177814664e-06, + "loss": 1.0887, + "step": 2506 + }, + { + "epoch": 0.7278269705327334, + "grad_norm": 3.36442232131958, + "learning_rate": 9.752276775649934e-06, + "loss": 1.1639, + "step": 2507 + }, + { + "epoch": 0.7281172884308318, + "grad_norm": 3.309434175491333, + "learning_rate": 9.75197819822312e-06, + "loss": 1.2119, + "step": 2508 + }, + { + "epoch": 0.7284076063289302, + "grad_norm": 3.211569309234619, + "learning_rate": 9.751679445545239e-06, + "loss": 1.1335, + "step": 2509 + }, + { + "epoch": 0.7286979242270286, + "grad_norm": 3.2672746181488037, + "learning_rate": 9.751380517627304e-06, + "loss": 1.0993, + "step": 2510 + }, + { + "epoch": 0.728988242125127, + "grad_norm": 3.273798704147339, + "learning_rate": 9.751081414480342e-06, + "loss": 1.2028, + "step": 2511 + }, + { + "epoch": 0.7292785600232254, + "grad_norm": 3.2062716484069824, + "learning_rate": 9.750782136115381e-06, + "loss": 1.0892, + "step": 2512 + }, + { + "epoch": 0.7295688779213239, + "grad_norm": 3.3710551261901855, + "learning_rate": 9.75048268254346e-06, + "loss": 1.1791, + "step": 2513 + }, + { + "epoch": 0.7298591958194223, + "grad_norm": 3.117218255996704, + "learning_rate": 9.750183053775625e-06, + "loss": 
1.0583, + "step": 2514 + }, + { + "epoch": 0.7301495137175207, + "grad_norm": 2.7797436714172363, + "learning_rate": 9.749883249822923e-06, + "loss": 0.9885, + "step": 2515 + }, + { + "epoch": 0.7304398316156191, + "grad_norm": 3.564326524734497, + "learning_rate": 9.749583270696413e-06, + "loss": 1.3298, + "step": 2516 + }, + { + "epoch": 0.7307301495137175, + "grad_norm": 3.287993907928467, + "learning_rate": 9.749283116407155e-06, + "loss": 1.2807, + "step": 2517 + }, + { + "epoch": 0.7310204674118159, + "grad_norm": 3.1724064350128174, + "learning_rate": 9.74898278696622e-06, + "loss": 1.1995, + "step": 2518 + }, + { + "epoch": 0.7313107853099143, + "grad_norm": 3.066631555557251, + "learning_rate": 9.748682282384685e-06, + "loss": 1.2402, + "step": 2519 + }, + { + "epoch": 0.7316011032080127, + "grad_norm": 3.3963117599487305, + "learning_rate": 9.748381602673633e-06, + "loss": 1.2954, + "step": 2520 + }, + { + "epoch": 0.7318914211061112, + "grad_norm": 3.1889572143554688, + "learning_rate": 9.74808074784415e-06, + "loss": 1.0468, + "step": 2521 + }, + { + "epoch": 0.7321817390042096, + "grad_norm": 3.008392810821533, + "learning_rate": 9.747779717907336e-06, + "loss": 1.0372, + "step": 2522 + }, + { + "epoch": 0.732472056902308, + "grad_norm": 3.1272335052490234, + "learning_rate": 9.747478512874288e-06, + "loss": 1.2067, + "step": 2523 + }, + { + "epoch": 0.7327623748004064, + "grad_norm": 3.072211503982544, + "learning_rate": 9.747177132756117e-06, + "loss": 0.9834, + "step": 2524 + }, + { + "epoch": 0.7330526926985048, + "grad_norm": 3.123993158340454, + "learning_rate": 9.746875577563936e-06, + "loss": 1.1079, + "step": 2525 + }, + { + "epoch": 0.7333430105966032, + "grad_norm": 3.211639404296875, + "learning_rate": 9.746573847308869e-06, + "loss": 1.1979, + "step": 2526 + }, + { + "epoch": 0.7336333284947018, + "grad_norm": 3.380052328109741, + "learning_rate": 9.746271942002042e-06, + "loss": 1.1854, + "step": 2527 + }, + { + "epoch": 0.7339236463928002, + "grad_norm": 3.1952614784240723, + "learning_rate": 9.745969861654589e-06, + "loss": 1.0955, + "step": 2528 + }, + { + "epoch": 0.7342139642908986, + "grad_norm": 3.376279592514038, + "learning_rate": 9.74566760627765e-06, + "loss": 1.3143, + "step": 2529 + }, + { + "epoch": 0.734504282188997, + "grad_norm": 3.431368589401245, + "learning_rate": 9.745365175882372e-06, + "loss": 1.2247, + "step": 2530 + }, + { + "epoch": 0.7347946000870954, + "grad_norm": 3.4958410263061523, + "learning_rate": 9.745062570479912e-06, + "loss": 1.1536, + "step": 2531 + }, + { + "epoch": 0.7350849179851938, + "grad_norm": 3.3066039085388184, + "learning_rate": 9.744759790081426e-06, + "loss": 1.1474, + "step": 2532 + }, + { + "epoch": 0.7353752358832922, + "grad_norm": 3.381757974624634, + "learning_rate": 9.744456834698083e-06, + "loss": 1.2692, + "step": 2533 + }, + { + "epoch": 0.7356655537813906, + "grad_norm": 3.070390224456787, + "learning_rate": 9.744153704341056e-06, + "loss": 1.1146, + "step": 2534 + }, + { + "epoch": 0.7359558716794891, + "grad_norm": 3.0699477195739746, + "learning_rate": 9.743850399021519e-06, + "loss": 1.2264, + "step": 2535 + }, + { + "epoch": 0.7362461895775875, + "grad_norm": 3.2143630981445312, + "learning_rate": 9.743546918750664e-06, + "loss": 1.2258, + "step": 2536 + }, + { + "epoch": 0.7365365074756859, + "grad_norm": 3.471107244491577, + "learning_rate": 9.743243263539681e-06, + "loss": 1.2183, + "step": 2537 + }, + { + "epoch": 0.7368268253737843, + "grad_norm": 3.6511921882629395, + "learning_rate": 
9.742939433399769e-06, + "loss": 1.332, + "step": 2538 + }, + { + "epoch": 0.7371171432718827, + "grad_norm": 2.9969394207000732, + "learning_rate": 9.742635428342133e-06, + "loss": 1.1155, + "step": 2539 + }, + { + "epoch": 0.7374074611699811, + "grad_norm": 3.1637327671051025, + "learning_rate": 9.742331248377985e-06, + "loss": 1.2107, + "step": 2540 + }, + { + "epoch": 0.7376977790680795, + "grad_norm": 3.3259994983673096, + "learning_rate": 9.742026893518541e-06, + "loss": 1.1766, + "step": 2541 + }, + { + "epoch": 0.737988096966178, + "grad_norm": 3.2825498580932617, + "learning_rate": 9.741722363775029e-06, + "loss": 1.1946, + "step": 2542 + }, + { + "epoch": 0.7382784148642764, + "grad_norm": 3.317887783050537, + "learning_rate": 9.741417659158674e-06, + "loss": 1.0025, + "step": 2543 + }, + { + "epoch": 0.7385687327623748, + "grad_norm": 3.05649471282959, + "learning_rate": 9.741112779680721e-06, + "loss": 1.0689, + "step": 2544 + }, + { + "epoch": 0.7388590506604732, + "grad_norm": 3.0476882457733154, + "learning_rate": 9.740807725352408e-06, + "loss": 1.0704, + "step": 2545 + }, + { + "epoch": 0.7391493685585716, + "grad_norm": 2.8864781856536865, + "learning_rate": 9.740502496184989e-06, + "loss": 1.0802, + "step": 2546 + }, + { + "epoch": 0.73943968645667, + "grad_norm": 3.207580089569092, + "learning_rate": 9.740197092189718e-06, + "loss": 1.0071, + "step": 2547 + }, + { + "epoch": 0.7397300043547684, + "grad_norm": 2.972710371017456, + "learning_rate": 9.739891513377859e-06, + "loss": 1.0015, + "step": 2548 + }, + { + "epoch": 0.7400203222528668, + "grad_norm": 3.0222017765045166, + "learning_rate": 9.739585759760684e-06, + "loss": 1.1943, + "step": 2549 + }, + { + "epoch": 0.7403106401509653, + "grad_norm": 3.6331703662872314, + "learning_rate": 9.739279831349466e-06, + "loss": 1.0644, + "step": 2550 + }, + { + "epoch": 0.7406009580490637, + "grad_norm": 3.1713831424713135, + "learning_rate": 9.738973728155487e-06, + "loss": 1.1909, + "step": 2551 + }, + { + "epoch": 0.7408912759471622, + "grad_norm": 3.3440420627593994, + "learning_rate": 9.738667450190041e-06, + "loss": 1.1456, + "step": 2552 + }, + { + "epoch": 0.7411815938452606, + "grad_norm": 3.2886013984680176, + "learning_rate": 9.738360997464417e-06, + "loss": 1.1896, + "step": 2553 + }, + { + "epoch": 0.741471911743359, + "grad_norm": 3.303163528442383, + "learning_rate": 9.73805436998992e-06, + "loss": 1.174, + "step": 2554 + }, + { + "epoch": 0.7417622296414574, + "grad_norm": 3.4284379482269287, + "learning_rate": 9.737747567777859e-06, + "loss": 1.0949, + "step": 2555 + }, + { + "epoch": 0.7420525475395559, + "grad_norm": 3.026108980178833, + "learning_rate": 9.737440590839547e-06, + "loss": 1.2386, + "step": 2556 + }, + { + "epoch": 0.7423428654376543, + "grad_norm": 3.3348286151885986, + "learning_rate": 9.737133439186306e-06, + "loss": 1.1645, + "step": 2557 + }, + { + "epoch": 0.7426331833357527, + "grad_norm": 3.4476053714752197, + "learning_rate": 9.736826112829465e-06, + "loss": 1.2243, + "step": 2558 + }, + { + "epoch": 0.7429235012338511, + "grad_norm": 3.123429298400879, + "learning_rate": 9.736518611780356e-06, + "loss": 1.1967, + "step": 2559 + }, + { + "epoch": 0.7432138191319495, + "grad_norm": 3.2243711948394775, + "learning_rate": 9.73621093605032e-06, + "loss": 1.2283, + "step": 2560 + }, + { + "epoch": 0.7435041370300479, + "grad_norm": 3.192667245864868, + "learning_rate": 9.735903085650704e-06, + "loss": 1.1169, + "step": 2561 + }, + { + "epoch": 0.7437944549281463, + "grad_norm": 
3.227220296859741, + "learning_rate": 9.735595060592861e-06, + "loss": 1.1867, + "step": 2562 + }, + { + "epoch": 0.7440847728262447, + "grad_norm": 3.1448750495910645, + "learning_rate": 9.735286860888153e-06, + "loss": 1.0588, + "step": 2563 + }, + { + "epoch": 0.7443750907243432, + "grad_norm": 3.9255151748657227, + "learning_rate": 9.734978486547943e-06, + "loss": 1.1771, + "step": 2564 + }, + { + "epoch": 0.7446654086224416, + "grad_norm": 3.173152208328247, + "learning_rate": 9.734669937583607e-06, + "loss": 1.0428, + "step": 2565 + }, + { + "epoch": 0.74495572652054, + "grad_norm": 2.9990289211273193, + "learning_rate": 9.734361214006523e-06, + "loss": 1.1064, + "step": 2566 + }, + { + "epoch": 0.7452460444186384, + "grad_norm": 3.705312490463257, + "learning_rate": 9.734052315828073e-06, + "loss": 1.2724, + "step": 2567 + }, + { + "epoch": 0.7455363623167368, + "grad_norm": 3.1329221725463867, + "learning_rate": 9.733743243059656e-06, + "loss": 1.0587, + "step": 2568 + }, + { + "epoch": 0.7458266802148352, + "grad_norm": 3.6346309185028076, + "learning_rate": 9.733433995712665e-06, + "loss": 1.2955, + "step": 2569 + }, + { + "epoch": 0.7461169981129336, + "grad_norm": 3.671525239944458, + "learning_rate": 9.733124573798507e-06, + "loss": 1.3279, + "step": 2570 + }, + { + "epoch": 0.746407316011032, + "grad_norm": 3.5882644653320312, + "learning_rate": 9.732814977328593e-06, + "loss": 1.3109, + "step": 2571 + }, + { + "epoch": 0.7466976339091305, + "grad_norm": 3.4163684844970703, + "learning_rate": 9.73250520631434e-06, + "loss": 1.2869, + "step": 2572 + }, + { + "epoch": 0.7469879518072289, + "grad_norm": 3.318476915359497, + "learning_rate": 9.732195260767175e-06, + "loss": 1.1014, + "step": 2573 + }, + { + "epoch": 0.7472782697053273, + "grad_norm": 3.565654993057251, + "learning_rate": 9.731885140698523e-06, + "loss": 1.3466, + "step": 2574 + }, + { + "epoch": 0.7475685876034257, + "grad_norm": 3.701667308807373, + "learning_rate": 9.73157484611983e-06, + "loss": 1.3208, + "step": 2575 + }, + { + "epoch": 0.7478589055015241, + "grad_norm": 3.6942193508148193, + "learning_rate": 9.73126437704253e-06, + "loss": 1.2147, + "step": 2576 + }, + { + "epoch": 0.7481492233996225, + "grad_norm": 3.2307727336883545, + "learning_rate": 9.73095373347808e-06, + "loss": 1.0473, + "step": 2577 + }, + { + "epoch": 0.7484395412977211, + "grad_norm": 3.1755237579345703, + "learning_rate": 9.730642915437932e-06, + "loss": 1.1311, + "step": 2578 + }, + { + "epoch": 0.7487298591958195, + "grad_norm": 2.977376937866211, + "learning_rate": 9.73033192293355e-06, + "loss": 1.0612, + "step": 2579 + }, + { + "epoch": 0.7490201770939179, + "grad_norm": 3.5205020904541016, + "learning_rate": 9.730020755976405e-06, + "loss": 1.2816, + "step": 2580 + }, + { + "epoch": 0.7493104949920163, + "grad_norm": 3.407058000564575, + "learning_rate": 9.729709414577971e-06, + "loss": 1.3124, + "step": 2581 + }, + { + "epoch": 0.7496008128901147, + "grad_norm": 3.4231269359588623, + "learning_rate": 9.729397898749732e-06, + "loss": 1.3177, + "step": 2582 + }, + { + "epoch": 0.7498911307882131, + "grad_norm": 3.3981311321258545, + "learning_rate": 9.729086208503174e-06, + "loss": 1.3057, + "step": 2583 + }, + { + "epoch": 0.7501814486863115, + "grad_norm": 3.3072404861450195, + "learning_rate": 9.728774343849794e-06, + "loss": 1.1111, + "step": 2584 + }, + { + "epoch": 0.75047176658441, + "grad_norm": 3.3770785331726074, + "learning_rate": 9.728462304801092e-06, + "loss": 1.0387, + "step": 2585 + }, + { + "epoch": 
0.7507620844825084, + "grad_norm": 3.214796304702759, + "learning_rate": 9.728150091368578e-06, + "loss": 1.1361, + "step": 2586 + }, + { + "epoch": 0.7510524023806068, + "grad_norm": 3.14668345451355, + "learning_rate": 9.727837703563763e-06, + "loss": 1.1013, + "step": 2587 + }, + { + "epoch": 0.7513427202787052, + "grad_norm": 3.61557674407959, + "learning_rate": 9.727525141398172e-06, + "loss": 1.1335, + "step": 2588 + }, + { + "epoch": 0.7516330381768036, + "grad_norm": 3.3926947116851807, + "learning_rate": 9.727212404883328e-06, + "loss": 1.2092, + "step": 2589 + }, + { + "epoch": 0.751923356074902, + "grad_norm": 3.5248970985412598, + "learning_rate": 9.726899494030768e-06, + "loss": 1.2138, + "step": 2590 + }, + { + "epoch": 0.7522136739730004, + "grad_norm": 2.885737180709839, + "learning_rate": 9.72658640885203e-06, + "loss": 1.0495, + "step": 2591 + }, + { + "epoch": 0.7525039918710988, + "grad_norm": 3.0727686882019043, + "learning_rate": 9.726273149358661e-06, + "loss": 1.0749, + "step": 2592 + }, + { + "epoch": 0.7527943097691973, + "grad_norm": 3.084850549697876, + "learning_rate": 9.725959715562212e-06, + "loss": 1.2351, + "step": 2593 + }, + { + "epoch": 0.7530846276672957, + "grad_norm": 3.28760027885437, + "learning_rate": 9.725646107474245e-06, + "loss": 1.2275, + "step": 2594 + }, + { + "epoch": 0.7533749455653941, + "grad_norm": 3.085083246231079, + "learning_rate": 9.725332325106326e-06, + "loss": 1.1941, + "step": 2595 + }, + { + "epoch": 0.7536652634634925, + "grad_norm": 3.4755539894104004, + "learning_rate": 9.725018368470025e-06, + "loss": 1.324, + "step": 2596 + }, + { + "epoch": 0.7539555813615909, + "grad_norm": 3.1657776832580566, + "learning_rate": 9.724704237576924e-06, + "loss": 1.0582, + "step": 2597 + }, + { + "epoch": 0.7542458992596893, + "grad_norm": 3.143900156021118, + "learning_rate": 9.724389932438603e-06, + "loss": 1.1709, + "step": 2598 + }, + { + "epoch": 0.7545362171577877, + "grad_norm": 3.3038413524627686, + "learning_rate": 9.724075453066655e-06, + "loss": 1.1156, + "step": 2599 + }, + { + "epoch": 0.7548265350558861, + "grad_norm": 3.384906530380249, + "learning_rate": 9.723760799472681e-06, + "loss": 1.2913, + "step": 2600 + }, + { + "epoch": 0.7551168529539846, + "grad_norm": 3.3545148372650146, + "learning_rate": 9.723445971668284e-06, + "loss": 1.1701, + "step": 2601 + }, + { + "epoch": 0.755407170852083, + "grad_norm": 3.308631181716919, + "learning_rate": 9.723130969665073e-06, + "loss": 1.1446, + "step": 2602 + }, + { + "epoch": 0.7556974887501815, + "grad_norm": 3.1468513011932373, + "learning_rate": 9.722815793474667e-06, + "loss": 1.0866, + "step": 2603 + }, + { + "epoch": 0.7559878066482799, + "grad_norm": 3.327813148498535, + "learning_rate": 9.722500443108687e-06, + "loss": 1.1291, + "step": 2604 + }, + { + "epoch": 0.7562781245463783, + "grad_norm": 3.189318895339966, + "learning_rate": 9.722184918578765e-06, + "loss": 1.0912, + "step": 2605 + }, + { + "epoch": 0.7565684424444767, + "grad_norm": 3.209308385848999, + "learning_rate": 9.721869219896539e-06, + "loss": 1.2015, + "step": 2606 + }, + { + "epoch": 0.7568587603425752, + "grad_norm": 3.2611427307128906, + "learning_rate": 9.72155334707365e-06, + "loss": 1.1894, + "step": 2607 + }, + { + "epoch": 0.7571490782406736, + "grad_norm": 3.0698297023773193, + "learning_rate": 9.721237300121744e-06, + "loss": 1.1468, + "step": 2608 + }, + { + "epoch": 0.757439396138772, + "grad_norm": 3.030074119567871, + "learning_rate": 9.720921079052483e-06, + "loss": 1.0497, + "step": 
2609 + }, + { + "epoch": 0.7577297140368704, + "grad_norm": 3.3314547538757324, + "learning_rate": 9.720604683877524e-06, + "loss": 1.2847, + "step": 2610 + }, + { + "epoch": 0.7580200319349688, + "grad_norm": 3.3319008350372314, + "learning_rate": 9.72028811460854e-06, + "loss": 1.1846, + "step": 2611 + }, + { + "epoch": 0.7583103498330672, + "grad_norm": 2.8318731784820557, + "learning_rate": 9.719971371257201e-06, + "loss": 1.1269, + "step": 2612 + }, + { + "epoch": 0.7586006677311656, + "grad_norm": 2.9825758934020996, + "learning_rate": 9.719654453835192e-06, + "loss": 1.172, + "step": 2613 + }, + { + "epoch": 0.758890985629264, + "grad_norm": 3.155717611312866, + "learning_rate": 9.7193373623542e-06, + "loss": 1.0468, + "step": 2614 + }, + { + "epoch": 0.7591813035273625, + "grad_norm": 3.3703644275665283, + "learning_rate": 9.71902009682592e-06, + "loss": 1.1021, + "step": 2615 + }, + { + "epoch": 0.7594716214254609, + "grad_norm": 3.448974132537842, + "learning_rate": 9.718702657262049e-06, + "loss": 1.3663, + "step": 2616 + }, + { + "epoch": 0.7597619393235593, + "grad_norm": 3.0262529850006104, + "learning_rate": 9.718385043674298e-06, + "loss": 1.0723, + "step": 2617 + }, + { + "epoch": 0.7600522572216577, + "grad_norm": 3.7767655849456787, + "learning_rate": 9.718067256074378e-06, + "loss": 1.2078, + "step": 2618 + }, + { + "epoch": 0.7603425751197561, + "grad_norm": 2.984757900238037, + "learning_rate": 9.71774929447401e-06, + "loss": 1.065, + "step": 2619 + }, + { + "epoch": 0.7606328930178545, + "grad_norm": 3.351996660232544, + "learning_rate": 9.717431158884922e-06, + "loss": 1.2249, + "step": 2620 + }, + { + "epoch": 0.7609232109159529, + "grad_norm": 3.374985933303833, + "learning_rate": 9.717112849318844e-06, + "loss": 1.1868, + "step": 2621 + }, + { + "epoch": 0.7612135288140514, + "grad_norm": 3.2836148738861084, + "learning_rate": 9.716794365787516e-06, + "loss": 1.3113, + "step": 2622 + }, + { + "epoch": 0.7615038467121498, + "grad_norm": 3.3848886489868164, + "learning_rate": 9.716475708302683e-06, + "loss": 1.2438, + "step": 2623 + }, + { + "epoch": 0.7617941646102482, + "grad_norm": 3.5439648628234863, + "learning_rate": 9.716156876876096e-06, + "loss": 1.1124, + "step": 2624 + }, + { + "epoch": 0.7620844825083466, + "grad_norm": 2.9537434577941895, + "learning_rate": 9.715837871519518e-06, + "loss": 1.0228, + "step": 2625 + }, + { + "epoch": 0.762374800406445, + "grad_norm": 3.688227891921997, + "learning_rate": 9.71551869224471e-06, + "loss": 1.1742, + "step": 2626 + }, + { + "epoch": 0.7626651183045434, + "grad_norm": 3.6073129177093506, + "learning_rate": 9.715199339063444e-06, + "loss": 1.1558, + "step": 2627 + }, + { + "epoch": 0.762955436202642, + "grad_norm": 3.2027735710144043, + "learning_rate": 9.714879811987496e-06, + "loss": 1.0795, + "step": 2628 + }, + { + "epoch": 0.7632457541007404, + "grad_norm": 3.0256600379943848, + "learning_rate": 9.714560111028654e-06, + "loss": 1.0514, + "step": 2629 + }, + { + "epoch": 0.7635360719988388, + "grad_norm": 3.2667462825775146, + "learning_rate": 9.714240236198704e-06, + "loss": 1.2406, + "step": 2630 + }, + { + "epoch": 0.7638263898969372, + "grad_norm": 3.4051690101623535, + "learning_rate": 9.713920187509445e-06, + "loss": 1.1812, + "step": 2631 + }, + { + "epoch": 0.7641167077950356, + "grad_norm": 3.3208694458007812, + "learning_rate": 9.713599964972682e-06, + "loss": 1.1577, + "step": 2632 + }, + { + "epoch": 0.764407025693134, + "grad_norm": 3.5661416053771973, + "learning_rate": 9.71327956860022e-06, + 
"loss": 1.2215, + "step": 2633 + }, + { + "epoch": 0.7646973435912324, + "grad_norm": 3.286116361618042, + "learning_rate": 9.712958998403881e-06, + "loss": 1.1043, + "step": 2634 + }, + { + "epoch": 0.7649876614893308, + "grad_norm": 3.0886998176574707, + "learning_rate": 9.712638254395481e-06, + "loss": 1.0814, + "step": 2635 + }, + { + "epoch": 0.7652779793874293, + "grad_norm": 3.3840620517730713, + "learning_rate": 9.712317336586854e-06, + "loss": 1.0548, + "step": 2636 + }, + { + "epoch": 0.7655682972855277, + "grad_norm": 3.4241580963134766, + "learning_rate": 9.711996244989835e-06, + "loss": 1.0526, + "step": 2637 + }, + { + "epoch": 0.7658586151836261, + "grad_norm": 3.7336814403533936, + "learning_rate": 9.711674979616263e-06, + "loss": 1.3548, + "step": 2638 + }, + { + "epoch": 0.7661489330817245, + "grad_norm": 3.1186118125915527, + "learning_rate": 9.711353540477988e-06, + "loss": 1.1147, + "step": 2639 + }, + { + "epoch": 0.7664392509798229, + "grad_norm": 3.3635342121124268, + "learning_rate": 9.711031927586864e-06, + "loss": 1.3023, + "step": 2640 + }, + { + "epoch": 0.7667295688779213, + "grad_norm": 3.2632579803466797, + "learning_rate": 9.710710140954752e-06, + "loss": 1.2382, + "step": 2641 + }, + { + "epoch": 0.7670198867760197, + "grad_norm": 3.1245193481445312, + "learning_rate": 9.710388180593518e-06, + "loss": 1.1616, + "step": 2642 + }, + { + "epoch": 0.7673102046741181, + "grad_norm": 3.439480781555176, + "learning_rate": 9.710066046515039e-06, + "loss": 1.24, + "step": 2643 + }, + { + "epoch": 0.7676005225722166, + "grad_norm": 3.172135353088379, + "learning_rate": 9.709743738731191e-06, + "loss": 0.993, + "step": 2644 + }, + { + "epoch": 0.767890840470315, + "grad_norm": 3.2096140384674072, + "learning_rate": 9.709421257253865e-06, + "loss": 1.2152, + "step": 2645 + }, + { + "epoch": 0.7681811583684134, + "grad_norm": 3.3263416290283203, + "learning_rate": 9.709098602094952e-06, + "loss": 1.1902, + "step": 2646 + }, + { + "epoch": 0.7684714762665118, + "grad_norm": 3.186981201171875, + "learning_rate": 9.708775773266353e-06, + "loss": 1.2518, + "step": 2647 + }, + { + "epoch": 0.7687617941646102, + "grad_norm": 3.4535677433013916, + "learning_rate": 9.708452770779967e-06, + "loss": 1.2558, + "step": 2648 + }, + { + "epoch": 0.7690521120627086, + "grad_norm": 3.2888617515563965, + "learning_rate": 9.708129594647716e-06, + "loss": 1.1457, + "step": 2649 + }, + { + "epoch": 0.769342429960807, + "grad_norm": 3.6258974075317383, + "learning_rate": 9.707806244881513e-06, + "loss": 1.3135, + "step": 2650 + }, + { + "epoch": 0.7696327478589055, + "grad_norm": 3.227768898010254, + "learning_rate": 9.707482721493282e-06, + "loss": 1.3181, + "step": 2651 + }, + { + "epoch": 0.7699230657570039, + "grad_norm": 3.445146322250366, + "learning_rate": 9.707159024494958e-06, + "loss": 1.0569, + "step": 2652 + }, + { + "epoch": 0.7702133836551024, + "grad_norm": 3.3416175842285156, + "learning_rate": 9.706835153898476e-06, + "loss": 1.0999, + "step": 2653 + }, + { + "epoch": 0.7705037015532008, + "grad_norm": 3.45808744430542, + "learning_rate": 9.706511109715782e-06, + "loss": 1.2106, + "step": 2654 + }, + { + "epoch": 0.7707940194512992, + "grad_norm": 3.3738346099853516, + "learning_rate": 9.706186891958826e-06, + "loss": 1.2624, + "step": 2655 + }, + { + "epoch": 0.7710843373493976, + "grad_norm": 3.630474328994751, + "learning_rate": 9.705862500639565e-06, + "loss": 1.3611, + "step": 2656 + }, + { + "epoch": 0.771374655247496, + "grad_norm": 3.3824191093444824, + 
"learning_rate": 9.705537935769962e-06, + "loss": 1.3021, + "step": 2657 + }, + { + "epoch": 0.7716649731455945, + "grad_norm": 3.4706802368164062, + "learning_rate": 9.705213197361989e-06, + "loss": 1.2166, + "step": 2658 + }, + { + "epoch": 0.7719552910436929, + "grad_norm": 3.271436929702759, + "learning_rate": 9.704888285427618e-06, + "loss": 1.232, + "step": 2659 + }, + { + "epoch": 0.7722456089417913, + "grad_norm": 3.3436808586120605, + "learning_rate": 9.704563199978837e-06, + "loss": 1.1864, + "step": 2660 + }, + { + "epoch": 0.7725359268398897, + "grad_norm": 3.2927451133728027, + "learning_rate": 9.70423794102763e-06, + "loss": 1.1901, + "step": 2661 + }, + { + "epoch": 0.7728262447379881, + "grad_norm": 3.3609869480133057, + "learning_rate": 9.703912508585995e-06, + "loss": 1.27, + "step": 2662 + }, + { + "epoch": 0.7731165626360865, + "grad_norm": 3.4284236431121826, + "learning_rate": 9.703586902665932e-06, + "loss": 1.3389, + "step": 2663 + }, + { + "epoch": 0.7734068805341849, + "grad_norm": 3.34993052482605, + "learning_rate": 9.703261123279453e-06, + "loss": 1.2551, + "step": 2664 + }, + { + "epoch": 0.7736971984322834, + "grad_norm": 3.4748470783233643, + "learning_rate": 9.70293517043857e-06, + "loss": 1.138, + "step": 2665 + }, + { + "epoch": 0.7739875163303818, + "grad_norm": 3.0106701850891113, + "learning_rate": 9.702609044155303e-06, + "loss": 1.1568, + "step": 2666 + }, + { + "epoch": 0.7742778342284802, + "grad_norm": 3.5232250690460205, + "learning_rate": 9.70228274444168e-06, + "loss": 1.1744, + "step": 2667 + }, + { + "epoch": 0.7745681521265786, + "grad_norm": 2.9455854892730713, + "learning_rate": 9.701956271309736e-06, + "loss": 1.0484, + "step": 2668 + }, + { + "epoch": 0.774858470024677, + "grad_norm": 3.023559808731079, + "learning_rate": 9.701629624771512e-06, + "loss": 1.074, + "step": 2669 + }, + { + "epoch": 0.7751487879227754, + "grad_norm": 3.59647798538208, + "learning_rate": 9.701302804839052e-06, + "loss": 1.4052, + "step": 2670 + }, + { + "epoch": 0.7754391058208738, + "grad_norm": 3.113689661026001, + "learning_rate": 9.70097581152441e-06, + "loss": 1.031, + "step": 2671 + }, + { + "epoch": 0.7757294237189722, + "grad_norm": 3.235813617706299, + "learning_rate": 9.700648644839647e-06, + "loss": 1.2389, + "step": 2672 + }, + { + "epoch": 0.7760197416170707, + "grad_norm": 3.190761089324951, + "learning_rate": 9.700321304796825e-06, + "loss": 1.1777, + "step": 2673 + }, + { + "epoch": 0.7763100595151691, + "grad_norm": 3.0125646591186523, + "learning_rate": 9.69999379140802e-06, + "loss": 1.1096, + "step": 2674 + }, + { + "epoch": 0.7766003774132675, + "grad_norm": 3.218435287475586, + "learning_rate": 9.69966610468531e-06, + "loss": 1.0491, + "step": 2675 + }, + { + "epoch": 0.7768906953113659, + "grad_norm": 3.141157865524292, + "learning_rate": 9.699338244640779e-06, + "loss": 1.1652, + "step": 2676 + }, + { + "epoch": 0.7771810132094643, + "grad_norm": 3.2786238193511963, + "learning_rate": 9.699010211286516e-06, + "loss": 1.2433, + "step": 2677 + }, + { + "epoch": 0.7774713311075628, + "grad_norm": 2.9467108249664307, + "learning_rate": 9.698682004634624e-06, + "loss": 1.1513, + "step": 2678 + }, + { + "epoch": 0.7777616490056612, + "grad_norm": 3.2171337604522705, + "learning_rate": 9.698353624697202e-06, + "loss": 1.1458, + "step": 2679 + }, + { + "epoch": 0.7780519669037597, + "grad_norm": 3.1761419773101807, + "learning_rate": 9.698025071486363e-06, + "loss": 1.1981, + "step": 2680 + }, + { + "epoch": 0.7783422848018581, + "grad_norm": 
3.1694602966308594, + "learning_rate": 9.697696345014225e-06, + "loss": 1.1642, + "step": 2681 + }, + { + "epoch": 0.7786326026999565, + "grad_norm": 3.392407178878784, + "learning_rate": 9.69736744529291e-06, + "loss": 1.1867, + "step": 2682 + }, + { + "epoch": 0.7789229205980549, + "grad_norm": 3.022423028945923, + "learning_rate": 9.697038372334548e-06, + "loss": 1.0707, + "step": 2683 + }, + { + "epoch": 0.7792132384961533, + "grad_norm": 3.068240165710449, + "learning_rate": 9.696709126151274e-06, + "loss": 1.159, + "step": 2684 + }, + { + "epoch": 0.7795035563942517, + "grad_norm": 3.0357422828674316, + "learning_rate": 9.69637970675523e-06, + "loss": 1.0268, + "step": 2685 + }, + { + "epoch": 0.7797938742923501, + "grad_norm": 3.256622076034546, + "learning_rate": 9.696050114158569e-06, + "loss": 1.2258, + "step": 2686 + }, + { + "epoch": 0.7800841921904486, + "grad_norm": 3.265336275100708, + "learning_rate": 9.69572034837344e-06, + "loss": 1.1091, + "step": 2687 + }, + { + "epoch": 0.780374510088547, + "grad_norm": 3.419400453567505, + "learning_rate": 9.695390409412011e-06, + "loss": 1.2144, + "step": 2688 + }, + { + "epoch": 0.7806648279866454, + "grad_norm": 3.241852045059204, + "learning_rate": 9.695060297286445e-06, + "loss": 1.185, + "step": 2689 + }, + { + "epoch": 0.7809551458847438, + "grad_norm": 3.128333568572998, + "learning_rate": 9.694730012008919e-06, + "loss": 1.166, + "step": 2690 + }, + { + "epoch": 0.7812454637828422, + "grad_norm": 3.2814202308654785, + "learning_rate": 9.694399553591614e-06, + "loss": 1.1328, + "step": 2691 + }, + { + "epoch": 0.7815357816809406, + "grad_norm": 3.5707764625549316, + "learning_rate": 9.694068922046715e-06, + "loss": 1.3243, + "step": 2692 + }, + { + "epoch": 0.781826099579039, + "grad_norm": 3.2367355823516846, + "learning_rate": 9.693738117386419e-06, + "loss": 1.3495, + "step": 2693 + }, + { + "epoch": 0.7821164174771374, + "grad_norm": 3.425107479095459, + "learning_rate": 9.693407139622922e-06, + "loss": 1.1423, + "step": 2694 + }, + { + "epoch": 0.7824067353752359, + "grad_norm": 3.4596445560455322, + "learning_rate": 9.693075988768433e-06, + "loss": 1.2778, + "step": 2695 + }, + { + "epoch": 0.7826970532733343, + "grad_norm": 3.4609477519989014, + "learning_rate": 9.692744664835164e-06, + "loss": 1.159, + "step": 2696 + }, + { + "epoch": 0.7829873711714327, + "grad_norm": 3.192476272583008, + "learning_rate": 9.692413167835334e-06, + "loss": 1.1078, + "step": 2697 + }, + { + "epoch": 0.7832776890695311, + "grad_norm": 2.891274929046631, + "learning_rate": 9.692081497781168e-06, + "loss": 1.012, + "step": 2698 + }, + { + "epoch": 0.7835680069676295, + "grad_norm": 3.200326442718506, + "learning_rate": 9.691749654684899e-06, + "loss": 1.2797, + "step": 2699 + }, + { + "epoch": 0.7838583248657279, + "grad_norm": 3.1819984912872314, + "learning_rate": 9.691417638558764e-06, + "loss": 1.1548, + "step": 2700 + }, + { + "epoch": 0.7841486427638263, + "grad_norm": 2.8674476146698, + "learning_rate": 9.69108544941501e-06, + "loss": 0.9269, + "step": 2701 + }, + { + "epoch": 0.7844389606619248, + "grad_norm": 3.1889965534210205, + "learning_rate": 9.690753087265883e-06, + "loss": 1.2377, + "step": 2702 + }, + { + "epoch": 0.7847292785600233, + "grad_norm": 3.410156488418579, + "learning_rate": 9.690420552123645e-06, + "loss": 1.1583, + "step": 2703 + }, + { + "epoch": 0.7850195964581217, + "grad_norm": 2.966400146484375, + "learning_rate": 9.69008784400056e-06, + "loss": 1.0389, + "step": 2704 + }, + { + "epoch": 
0.7853099143562201, + "grad_norm": 3.1139185428619385, + "learning_rate": 9.689754962908895e-06, + "loss": 1.0267, + "step": 2705 + }, + { + "epoch": 0.7856002322543185, + "grad_norm": 3.2307214736938477, + "learning_rate": 9.689421908860928e-06, + "loss": 1.0453, + "step": 2706 + }, + { + "epoch": 0.7858905501524169, + "grad_norm": 3.1317498683929443, + "learning_rate": 9.689088681868941e-06, + "loss": 1.193, + "step": 2707 + }, + { + "epoch": 0.7861808680505153, + "grad_norm": 2.7882258892059326, + "learning_rate": 9.688755281945226e-06, + "loss": 0.9895, + "step": 2708 + }, + { + "epoch": 0.7864711859486138, + "grad_norm": 3.129871368408203, + "learning_rate": 9.688421709102076e-06, + "loss": 1.2207, + "step": 2709 + }, + { + "epoch": 0.7867615038467122, + "grad_norm": 3.189854621887207, + "learning_rate": 9.688087963351795e-06, + "loss": 1.1442, + "step": 2710 + }, + { + "epoch": 0.7870518217448106, + "grad_norm": 3.1260828971862793, + "learning_rate": 9.68775404470669e-06, + "loss": 1.0848, + "step": 2711 + }, + { + "epoch": 0.787342139642909, + "grad_norm": 3.461789846420288, + "learning_rate": 9.687419953179074e-06, + "loss": 1.3078, + "step": 2712 + }, + { + "epoch": 0.7876324575410074, + "grad_norm": 3.009683132171631, + "learning_rate": 9.687085688781273e-06, + "loss": 0.9739, + "step": 2713 + }, + { + "epoch": 0.7879227754391058, + "grad_norm": 3.2000815868377686, + "learning_rate": 9.68675125152561e-06, + "loss": 1.2845, + "step": 2714 + }, + { + "epoch": 0.7882130933372042, + "grad_norm": 3.3149054050445557, + "learning_rate": 9.686416641424422e-06, + "loss": 1.1578, + "step": 2715 + }, + { + "epoch": 0.7885034112353027, + "grad_norm": 2.903021812438965, + "learning_rate": 9.686081858490047e-06, + "loss": 0.9999, + "step": 2716 + }, + { + "epoch": 0.7887937291334011, + "grad_norm": 3.2274374961853027, + "learning_rate": 9.685746902734834e-06, + "loss": 1.2606, + "step": 2717 + }, + { + "epoch": 0.7890840470314995, + "grad_norm": 3.3526039123535156, + "learning_rate": 9.685411774171133e-06, + "loss": 1.2573, + "step": 2718 + }, + { + "epoch": 0.7893743649295979, + "grad_norm": 3.025444269180298, + "learning_rate": 9.685076472811305e-06, + "loss": 1.12, + "step": 2719 + }, + { + "epoch": 0.7896646828276963, + "grad_norm": 3.1881661415100098, + "learning_rate": 9.684740998667718e-06, + "loss": 1.1475, + "step": 2720 + }, + { + "epoch": 0.7899550007257947, + "grad_norm": 3.1479337215423584, + "learning_rate": 9.68440535175274e-06, + "loss": 1.0881, + "step": 2721 + }, + { + "epoch": 0.7902453186238931, + "grad_norm": 3.6872854232788086, + "learning_rate": 9.684069532078753e-06, + "loss": 1.2607, + "step": 2722 + }, + { + "epoch": 0.7905356365219915, + "grad_norm": 2.9365339279174805, + "learning_rate": 9.68373353965814e-06, + "loss": 1.0904, + "step": 2723 + }, + { + "epoch": 0.79082595442009, + "grad_norm": 2.9232428073883057, + "learning_rate": 9.683397374503293e-06, + "loss": 1.1098, + "step": 2724 + }, + { + "epoch": 0.7911162723181884, + "grad_norm": 3.091132402420044, + "learning_rate": 9.683061036626608e-06, + "loss": 1.191, + "step": 2725 + }, + { + "epoch": 0.7914065902162868, + "grad_norm": 3.380723237991333, + "learning_rate": 9.682724526040493e-06, + "loss": 1.2003, + "step": 2726 + }, + { + "epoch": 0.7916969081143852, + "grad_norm": 3.4118423461914062, + "learning_rate": 9.682387842757354e-06, + "loss": 1.1715, + "step": 2727 + }, + { + "epoch": 0.7919872260124837, + "grad_norm": 3.095881462097168, + "learning_rate": 9.682050986789609e-06, + "loss": 1.1167, + "step": 
2728 + }, + { + "epoch": 0.7922775439105821, + "grad_norm": 3.4140207767486572, + "learning_rate": 9.681713958149683e-06, + "loss": 1.1926, + "step": 2729 + }, + { + "epoch": 0.7925678618086806, + "grad_norm": 3.4278016090393066, + "learning_rate": 9.681376756850003e-06, + "loss": 1.2509, + "step": 2730 + }, + { + "epoch": 0.792858179706779, + "grad_norm": 3.5882339477539062, + "learning_rate": 9.681039382903007e-06, + "loss": 1.3001, + "step": 2731 + }, + { + "epoch": 0.7931484976048774, + "grad_norm": 3.4812803268432617, + "learning_rate": 9.680701836321135e-06, + "loss": 1.2319, + "step": 2732 + }, + { + "epoch": 0.7934388155029758, + "grad_norm": 3.3065333366394043, + "learning_rate": 9.680364117116838e-06, + "loss": 1.1813, + "step": 2733 + }, + { + "epoch": 0.7937291334010742, + "grad_norm": 3.2521045207977295, + "learning_rate": 9.680026225302568e-06, + "loss": 1.2133, + "step": 2734 + }, + { + "epoch": 0.7940194512991726, + "grad_norm": 2.7159008979797363, + "learning_rate": 9.67968816089079e-06, + "loss": 1.0154, + "step": 2735 + }, + { + "epoch": 0.794309769197271, + "grad_norm": 3.323042869567871, + "learning_rate": 9.679349923893968e-06, + "loss": 1.2234, + "step": 2736 + }, + { + "epoch": 0.7946000870953694, + "grad_norm": 3.2154958248138428, + "learning_rate": 9.679011514324579e-06, + "loss": 1.0341, + "step": 2737 + }, + { + "epoch": 0.7948904049934679, + "grad_norm": 3.100257396697998, + "learning_rate": 9.678672932195101e-06, + "loss": 1.0728, + "step": 2738 + }, + { + "epoch": 0.7951807228915663, + "grad_norm": 2.962118625640869, + "learning_rate": 9.678334177518022e-06, + "loss": 1.0618, + "step": 2739 + }, + { + "epoch": 0.7954710407896647, + "grad_norm": 3.430203914642334, + "learning_rate": 9.677995250305836e-06, + "loss": 1.3019, + "step": 2740 + }, + { + "epoch": 0.7957613586877631, + "grad_norm": 3.404595375061035, + "learning_rate": 9.677656150571042e-06, + "loss": 1.2069, + "step": 2741 + }, + { + "epoch": 0.7960516765858615, + "grad_norm": 3.271411418914795, + "learning_rate": 9.677316878326144e-06, + "loss": 1.1914, + "step": 2742 + }, + { + "epoch": 0.7963419944839599, + "grad_norm": 3.5595600605010986, + "learning_rate": 9.676977433583656e-06, + "loss": 1.404, + "step": 2743 + }, + { + "epoch": 0.7966323123820583, + "grad_norm": 3.423607587814331, + "learning_rate": 9.676637816356098e-06, + "loss": 1.2709, + "step": 2744 + }, + { + "epoch": 0.7969226302801568, + "grad_norm": 3.162513017654419, + "learning_rate": 9.676298026655992e-06, + "loss": 1.2843, + "step": 2745 + }, + { + "epoch": 0.7972129481782552, + "grad_norm": 3.023754119873047, + "learning_rate": 9.675958064495869e-06, + "loss": 1.2077, + "step": 2746 + }, + { + "epoch": 0.7975032660763536, + "grad_norm": 3.2960760593414307, + "learning_rate": 9.675617929888271e-06, + "loss": 1.1551, + "step": 2747 + }, + { + "epoch": 0.797793583974452, + "grad_norm": 3.2949986457824707, + "learning_rate": 9.675277622845736e-06, + "loss": 1.2885, + "step": 2748 + }, + { + "epoch": 0.7980839018725504, + "grad_norm": 3.3253605365753174, + "learning_rate": 9.67493714338082e-06, + "loss": 1.1524, + "step": 2749 + }, + { + "epoch": 0.7983742197706488, + "grad_norm": 3.1859922409057617, + "learning_rate": 9.674596491506077e-06, + "loss": 1.2582, + "step": 2750 + }, + { + "epoch": 0.7986645376687472, + "grad_norm": 3.2374212741851807, + "learning_rate": 9.67425566723407e-06, + "loss": 1.3034, + "step": 2751 + }, + { + "epoch": 0.7989548555668456, + "grad_norm": 3.093991279602051, + "learning_rate": 9.673914670577369e-06, 
+ "loss": 1.1687, + "step": 2752 + }, + { + "epoch": 0.7992451734649441, + "grad_norm": 2.8157095909118652, + "learning_rate": 9.67357350154855e-06, + "loss": 1.0783, + "step": 2753 + }, + { + "epoch": 0.7995354913630426, + "grad_norm": 3.2308690547943115, + "learning_rate": 9.673232160160195e-06, + "loss": 1.1821, + "step": 2754 + }, + { + "epoch": 0.799825809261141, + "grad_norm": 2.980912208557129, + "learning_rate": 9.67289064642489e-06, + "loss": 1.1247, + "step": 2755 + }, + { + "epoch": 0.8001161271592394, + "grad_norm": 2.8929474353790283, + "learning_rate": 9.672548960355236e-06, + "loss": 1.0361, + "step": 2756 + }, + { + "epoch": 0.8004064450573378, + "grad_norm": 3.199467658996582, + "learning_rate": 9.672207101963828e-06, + "loss": 1.161, + "step": 2757 + }, + { + "epoch": 0.8006967629554362, + "grad_norm": 3.3019492626190186, + "learning_rate": 9.671865071263278e-06, + "loss": 1.0657, + "step": 2758 + }, + { + "epoch": 0.8009870808535346, + "grad_norm": 3.4587512016296387, + "learning_rate": 9.671522868266197e-06, + "loss": 1.1823, + "step": 2759 + }, + { + "epoch": 0.8012773987516331, + "grad_norm": 3.3693933486938477, + "learning_rate": 9.671180492985207e-06, + "loss": 1.1788, + "step": 2760 + }, + { + "epoch": 0.8015677166497315, + "grad_norm": 3.195629358291626, + "learning_rate": 9.670837945432934e-06, + "loss": 1.1368, + "step": 2761 + }, + { + "epoch": 0.8018580345478299, + "grad_norm": 3.206254243850708, + "learning_rate": 9.670495225622011e-06, + "loss": 1.1581, + "step": 2762 + }, + { + "epoch": 0.8021483524459283, + "grad_norm": 3.264477014541626, + "learning_rate": 9.670152333565078e-06, + "loss": 1.1068, + "step": 2763 + }, + { + "epoch": 0.8024386703440267, + "grad_norm": 3.518728256225586, + "learning_rate": 9.669809269274779e-06, + "loss": 1.1533, + "step": 2764 + }, + { + "epoch": 0.8027289882421251, + "grad_norm": 3.5006842613220215, + "learning_rate": 9.669466032763768e-06, + "loss": 1.2964, + "step": 2765 + }, + { + "epoch": 0.8030193061402235, + "grad_norm": 3.7323036193847656, + "learning_rate": 9.669122624044704e-06, + "loss": 1.2684, + "step": 2766 + }, + { + "epoch": 0.803309624038322, + "grad_norm": 3.5423648357391357, + "learning_rate": 9.668779043130249e-06, + "loss": 1.2638, + "step": 2767 + }, + { + "epoch": 0.8035999419364204, + "grad_norm": 3.037662982940674, + "learning_rate": 9.668435290033076e-06, + "loss": 0.916, + "step": 2768 + }, + { + "epoch": 0.8038902598345188, + "grad_norm": 3.0804009437561035, + "learning_rate": 9.668091364765862e-06, + "loss": 1.1467, + "step": 2769 + }, + { + "epoch": 0.8041805777326172, + "grad_norm": 3.015153169631958, + "learning_rate": 9.66774726734129e-06, + "loss": 1.0218, + "step": 2770 + }, + { + "epoch": 0.8044708956307156, + "grad_norm": 3.360714912414551, + "learning_rate": 9.667402997772052e-06, + "loss": 1.3612, + "step": 2771 + }, + { + "epoch": 0.804761213528814, + "grad_norm": 3.091615915298462, + "learning_rate": 9.667058556070846e-06, + "loss": 1.0789, + "step": 2772 + }, + { + "epoch": 0.8050515314269124, + "grad_norm": 3.4261224269866943, + "learning_rate": 9.66671394225037e-06, + "loss": 1.0892, + "step": 2773 + }, + { + "epoch": 0.8053418493250109, + "grad_norm": 3.1172802448272705, + "learning_rate": 9.666369156323335e-06, + "loss": 1.094, + "step": 2774 + }, + { + "epoch": 0.8056321672231093, + "grad_norm": 3.621525764465332, + "learning_rate": 9.666024198302459e-06, + "loss": 1.2377, + "step": 2775 + }, + { + "epoch": 0.8059224851212077, + "grad_norm": 3.2709341049194336, + 
"learning_rate": 9.665679068200463e-06, + "loss": 1.1966, + "step": 2776 + }, + { + "epoch": 0.8062128030193061, + "grad_norm": 3.9319911003112793, + "learning_rate": 9.66533376603007e-06, + "loss": 1.2563, + "step": 2777 + }, + { + "epoch": 0.8065031209174045, + "grad_norm": 3.317229747772217, + "learning_rate": 9.664988291804025e-06, + "loss": 1.0844, + "step": 2778 + }, + { + "epoch": 0.806793438815503, + "grad_norm": 3.2305257320404053, + "learning_rate": 9.664642645535058e-06, + "loss": 1.2113, + "step": 2779 + }, + { + "epoch": 0.8070837567136014, + "grad_norm": 2.9735424518585205, + "learning_rate": 9.664296827235924e-06, + "loss": 1.1671, + "step": 2780 + }, + { + "epoch": 0.8073740746116999, + "grad_norm": 3.4373373985290527, + "learning_rate": 9.663950836919373e-06, + "loss": 1.2868, + "step": 2781 + }, + { + "epoch": 0.8076643925097983, + "grad_norm": 3.469642400741577, + "learning_rate": 9.663604674598169e-06, + "loss": 1.1692, + "step": 2782 + }, + { + "epoch": 0.8079547104078967, + "grad_norm": 3.3247344493865967, + "learning_rate": 9.663258340285071e-06, + "loss": 1.1078, + "step": 2783 + }, + { + "epoch": 0.8082450283059951, + "grad_norm": 3.2038064002990723, + "learning_rate": 9.662911833992858e-06, + "loss": 1.2648, + "step": 2784 + }, + { + "epoch": 0.8085353462040935, + "grad_norm": 3.3712222576141357, + "learning_rate": 9.662565155734308e-06, + "loss": 1.1988, + "step": 2785 + }, + { + "epoch": 0.8088256641021919, + "grad_norm": 3.159156560897827, + "learning_rate": 9.662218305522204e-06, + "loss": 1.1781, + "step": 2786 + }, + { + "epoch": 0.8091159820002903, + "grad_norm": 2.919067859649658, + "learning_rate": 9.661871283369337e-06, + "loss": 1.048, + "step": 2787 + }, + { + "epoch": 0.8094062998983887, + "grad_norm": 3.1437933444976807, + "learning_rate": 9.66152408928851e-06, + "loss": 1.1889, + "step": 2788 + }, + { + "epoch": 0.8096966177964872, + "grad_norm": 3.3572521209716797, + "learning_rate": 9.661176723292524e-06, + "loss": 1.2144, + "step": 2789 + }, + { + "epoch": 0.8099869356945856, + "grad_norm": 3.069945812225342, + "learning_rate": 9.660829185394189e-06, + "loss": 1.0188, + "step": 2790 + }, + { + "epoch": 0.810277253592684, + "grad_norm": 2.9657914638519287, + "learning_rate": 9.660481475606325e-06, + "loss": 1.0332, + "step": 2791 + }, + { + "epoch": 0.8105675714907824, + "grad_norm": 3.1732230186462402, + "learning_rate": 9.660133593941752e-06, + "loss": 1.2119, + "step": 2792 + }, + { + "epoch": 0.8108578893888808, + "grad_norm": 3.295893430709839, + "learning_rate": 9.659785540413303e-06, + "loss": 1.1986, + "step": 2793 + }, + { + "epoch": 0.8111482072869792, + "grad_norm": 3.3230507373809814, + "learning_rate": 9.65943731503381e-06, + "loss": 1.2974, + "step": 2794 + }, + { + "epoch": 0.8114385251850776, + "grad_norm": 3.2661449909210205, + "learning_rate": 9.65908891781612e-06, + "loss": 1.1286, + "step": 2795 + }, + { + "epoch": 0.811728843083176, + "grad_norm": 3.3028149604797363, + "learning_rate": 9.658740348773079e-06, + "loss": 1.2416, + "step": 2796 + }, + { + "epoch": 0.8120191609812745, + "grad_norm": 3.1426446437835693, + "learning_rate": 9.658391607917543e-06, + "loss": 1.1046, + "step": 2797 + }, + { + "epoch": 0.8123094788793729, + "grad_norm": 3.3629467487335205, + "learning_rate": 9.658042695262373e-06, + "loss": 1.2118, + "step": 2798 + }, + { + "epoch": 0.8125997967774713, + "grad_norm": 3.356700897216797, + "learning_rate": 9.657693610820437e-06, + "loss": 1.0999, + "step": 2799 + }, + { + "epoch": 0.8128901146755697, + 
"grad_norm": 2.8955090045928955, + "learning_rate": 9.65734435460461e-06, + "loss": 1.1629, + "step": 2800 + }, + { + "epoch": 0.8131804325736681, + "grad_norm": 3.2146928310394287, + "learning_rate": 9.656994926627769e-06, + "loss": 1.1164, + "step": 2801 + }, + { + "epoch": 0.8134707504717665, + "grad_norm": 3.1054909229278564, + "learning_rate": 9.656645326902804e-06, + "loss": 1.0392, + "step": 2802 + }, + { + "epoch": 0.813761068369865, + "grad_norm": 4.134510517120361, + "learning_rate": 9.656295555442608e-06, + "loss": 1.1675, + "step": 2803 + }, + { + "epoch": 0.8140513862679635, + "grad_norm": 3.0019631385803223, + "learning_rate": 9.65594561226008e-06, + "loss": 0.9988, + "step": 2804 + }, + { + "epoch": 0.8143417041660619, + "grad_norm": 3.312530994415283, + "learning_rate": 9.655595497368123e-06, + "loss": 1.2161, + "step": 2805 + }, + { + "epoch": 0.8146320220641603, + "grad_norm": 3.215278387069702, + "learning_rate": 9.655245210779653e-06, + "loss": 1.0485, + "step": 2806 + }, + { + "epoch": 0.8149223399622587, + "grad_norm": 3.1792635917663574, + "learning_rate": 9.654894752507589e-06, + "loss": 1.1354, + "step": 2807 + }, + { + "epoch": 0.8152126578603571, + "grad_norm": 3.156052827835083, + "learning_rate": 9.654544122564852e-06, + "loss": 1.1189, + "step": 2808 + }, + { + "epoch": 0.8155029757584555, + "grad_norm": 3.3468096256256104, + "learning_rate": 9.654193320964374e-06, + "loss": 1.2148, + "step": 2809 + }, + { + "epoch": 0.815793293656554, + "grad_norm": 3.149667501449585, + "learning_rate": 9.653842347719094e-06, + "loss": 1.089, + "step": 2810 + }, + { + "epoch": 0.8160836115546524, + "grad_norm": 2.945133686065674, + "learning_rate": 9.653491202841955e-06, + "loss": 1.1251, + "step": 2811 + }, + { + "epoch": 0.8163739294527508, + "grad_norm": 3.5497055053710938, + "learning_rate": 9.653139886345909e-06, + "loss": 1.3452, + "step": 2812 + }, + { + "epoch": 0.8166642473508492, + "grad_norm": 3.0823254585266113, + "learning_rate": 9.652788398243908e-06, + "loss": 1.096, + "step": 2813 + }, + { + "epoch": 0.8169545652489476, + "grad_norm": 2.955162525177002, + "learning_rate": 9.652436738548917e-06, + "loss": 1.1443, + "step": 2814 + }, + { + "epoch": 0.817244883147046, + "grad_norm": 3.125523567199707, + "learning_rate": 9.652084907273908e-06, + "loss": 1.1199, + "step": 2815 + }, + { + "epoch": 0.8175352010451444, + "grad_norm": 3.3003995418548584, + "learning_rate": 9.651732904431852e-06, + "loss": 1.22, + "step": 2816 + }, + { + "epoch": 0.8178255189432428, + "grad_norm": 3.1056740283966064, + "learning_rate": 9.651380730035733e-06, + "loss": 1.112, + "step": 2817 + }, + { + "epoch": 0.8181158368413413, + "grad_norm": 3.12873911857605, + "learning_rate": 9.651028384098538e-06, + "loss": 0.9787, + "step": 2818 + }, + { + "epoch": 0.8184061547394397, + "grad_norm": 3.148348093032837, + "learning_rate": 9.650675866633263e-06, + "loss": 1.1535, + "step": 2819 + }, + { + "epoch": 0.8186964726375381, + "grad_norm": 3.1877403259277344, + "learning_rate": 9.650323177652907e-06, + "loss": 1.1669, + "step": 2820 + }, + { + "epoch": 0.8189867905356365, + "grad_norm": 3.172475576400757, + "learning_rate": 9.649970317170478e-06, + "loss": 1.1416, + "step": 2821 + }, + { + "epoch": 0.8192771084337349, + "grad_norm": 3.273035764694214, + "learning_rate": 9.649617285198988e-06, + "loss": 1.1465, + "step": 2822 + }, + { + "epoch": 0.8195674263318333, + "grad_norm": 3.1054487228393555, + "learning_rate": 9.649264081751457e-06, + "loss": 1.1381, + "step": 2823 + }, + { + "epoch": 
0.8198577442299317, + "grad_norm": 3.0874011516571045, + "learning_rate": 9.648910706840913e-06, + "loss": 1.1209, + "step": 2824 + }, + { + "epoch": 0.8201480621280302, + "grad_norm": 3.571061611175537, + "learning_rate": 9.648557160480387e-06, + "loss": 1.3397, + "step": 2825 + }, + { + "epoch": 0.8204383800261286, + "grad_norm": 3.1404099464416504, + "learning_rate": 9.648203442682917e-06, + "loss": 1.3083, + "step": 2826 + }, + { + "epoch": 0.820728697924227, + "grad_norm": 3.4728198051452637, + "learning_rate": 9.64784955346155e-06, + "loss": 1.1533, + "step": 2827 + }, + { + "epoch": 0.8210190158223254, + "grad_norm": 3.192854404449463, + "learning_rate": 9.647495492829336e-06, + "loss": 1.2375, + "step": 2828 + }, + { + "epoch": 0.8213093337204239, + "grad_norm": 3.1363725662231445, + "learning_rate": 9.64714126079933e-06, + "loss": 0.974, + "step": 2829 + }, + { + "epoch": 0.8215996516185223, + "grad_norm": 3.270286798477173, + "learning_rate": 9.6467868573846e-06, + "loss": 1.1538, + "step": 2830 + }, + { + "epoch": 0.8218899695166207, + "grad_norm": 3.2243921756744385, + "learning_rate": 9.646432282598215e-06, + "loss": 1.1136, + "step": 2831 + }, + { + "epoch": 0.8221802874147192, + "grad_norm": 3.2147057056427, + "learning_rate": 9.646077536453251e-06, + "loss": 1.18, + "step": 2832 + }, + { + "epoch": 0.8224706053128176, + "grad_norm": 3.073420524597168, + "learning_rate": 9.64572261896279e-06, + "loss": 1.1108, + "step": 2833 + }, + { + "epoch": 0.822760923210916, + "grad_norm": 3.137725591659546, + "learning_rate": 9.645367530139925e-06, + "loss": 1.1624, + "step": 2834 + }, + { + "epoch": 0.8230512411090144, + "grad_norm": 3.2300662994384766, + "learning_rate": 9.645012269997747e-06, + "loss": 1.2579, + "step": 2835 + }, + { + "epoch": 0.8233415590071128, + "grad_norm": 3.178576707839966, + "learning_rate": 9.64465683854936e-06, + "loss": 1.1127, + "step": 2836 + }, + { + "epoch": 0.8236318769052112, + "grad_norm": 3.1000449657440186, + "learning_rate": 9.644301235807872e-06, + "loss": 1.0045, + "step": 2837 + }, + { + "epoch": 0.8239221948033096, + "grad_norm": 3.1290085315704346, + "learning_rate": 9.643945461786397e-06, + "loss": 1.0721, + "step": 2838 + }, + { + "epoch": 0.824212512701408, + "grad_norm": 3.3767518997192383, + "learning_rate": 9.643589516498057e-06, + "loss": 1.268, + "step": 2839 + }, + { + "epoch": 0.8245028305995065, + "grad_norm": 3.204231023788452, + "learning_rate": 9.64323339995598e-06, + "loss": 1.1592, + "step": 2840 + }, + { + "epoch": 0.8247931484976049, + "grad_norm": 2.6525983810424805, + "learning_rate": 9.642877112173294e-06, + "loss": 1.0086, + "step": 2841 + }, + { + "epoch": 0.8250834663957033, + "grad_norm": 3.5629663467407227, + "learning_rate": 9.642520653163146e-06, + "loss": 1.1653, + "step": 2842 + }, + { + "epoch": 0.8253737842938017, + "grad_norm": 3.5206522941589355, + "learning_rate": 9.642164022938678e-06, + "loss": 1.1618, + "step": 2843 + }, + { + "epoch": 0.8256641021919001, + "grad_norm": 3.1275205612182617, + "learning_rate": 9.641807221513041e-06, + "loss": 1.0722, + "step": 2844 + }, + { + "epoch": 0.8259544200899985, + "grad_norm": 3.354448080062866, + "learning_rate": 9.641450248899397e-06, + "loss": 1.2366, + "step": 2845 + }, + { + "epoch": 0.8262447379880969, + "grad_norm": 3.196295976638794, + "learning_rate": 9.64109310511091e-06, + "loss": 1.0949, + "step": 2846 + }, + { + "epoch": 0.8265350558861954, + "grad_norm": 3.35182785987854, + "learning_rate": 9.640735790160751e-06, + "loss": 1.3141, + "step": 2847 + 
}, + { + "epoch": 0.8268253737842938, + "grad_norm": 3.4913763999938965, + "learning_rate": 9.640378304062099e-06, + "loss": 1.3896, + "step": 2848 + }, + { + "epoch": 0.8271156916823922, + "grad_norm": 3.162344455718994, + "learning_rate": 9.640020646828134e-06, + "loss": 1.1087, + "step": 2849 + }, + { + "epoch": 0.8274060095804906, + "grad_norm": 2.9598472118377686, + "learning_rate": 9.639662818472051e-06, + "loss": 1.0635, + "step": 2850 + }, + { + "epoch": 0.827696327478589, + "grad_norm": 3.6094932556152344, + "learning_rate": 9.639304819007043e-06, + "loss": 1.286, + "step": 2851 + }, + { + "epoch": 0.8279866453766874, + "grad_norm": 2.9936227798461914, + "learning_rate": 9.638946648446314e-06, + "loss": 1.1463, + "step": 2852 + }, + { + "epoch": 0.8282769632747858, + "grad_norm": 3.988034725189209, + "learning_rate": 9.638588306803075e-06, + "loss": 1.2177, + "step": 2853 + }, + { + "epoch": 0.8285672811728844, + "grad_norm": 3.431546926498413, + "learning_rate": 9.63822979409054e-06, + "loss": 1.0094, + "step": 2854 + }, + { + "epoch": 0.8288575990709828, + "grad_norm": 3.446589231491089, + "learning_rate": 9.63787111032193e-06, + "loss": 1.315, + "step": 2855 + }, + { + "epoch": 0.8291479169690812, + "grad_norm": 3.355750322341919, + "learning_rate": 9.637512255510475e-06, + "loss": 1.1084, + "step": 2856 + }, + { + "epoch": 0.8294382348671796, + "grad_norm": 3.808082103729248, + "learning_rate": 9.637153229669407e-06, + "loss": 1.1741, + "step": 2857 + }, + { + "epoch": 0.829728552765278, + "grad_norm": 3.1000587940216064, + "learning_rate": 9.636794032811968e-06, + "loss": 1.0451, + "step": 2858 + }, + { + "epoch": 0.8300188706633764, + "grad_norm": 3.0135488510131836, + "learning_rate": 9.636434664951407e-06, + "loss": 1.1303, + "step": 2859 + }, + { + "epoch": 0.8303091885614748, + "grad_norm": 3.029987096786499, + "learning_rate": 9.636075126100974e-06, + "loss": 1.2556, + "step": 2860 + }, + { + "epoch": 0.8305995064595733, + "grad_norm": 3.5480244159698486, + "learning_rate": 9.63571541627393e-06, + "loss": 1.0877, + "step": 2861 + }, + { + "epoch": 0.8308898243576717, + "grad_norm": 3.170466423034668, + "learning_rate": 9.635355535483541e-06, + "loss": 1.1736, + "step": 2862 + }, + { + "epoch": 0.8311801422557701, + "grad_norm": 3.1938586235046387, + "learning_rate": 9.634995483743079e-06, + "loss": 1.2071, + "step": 2863 + }, + { + "epoch": 0.8314704601538685, + "grad_norm": 3.2252891063690186, + "learning_rate": 9.634635261065824e-06, + "loss": 1.1202, + "step": 2864 + }, + { + "epoch": 0.8317607780519669, + "grad_norm": 2.953683853149414, + "learning_rate": 9.634274867465058e-06, + "loss": 1.1123, + "step": 2865 + }, + { + "epoch": 0.8320510959500653, + "grad_norm": 3.30548357963562, + "learning_rate": 9.633914302954077e-06, + "loss": 1.1805, + "step": 2866 + }, + { + "epoch": 0.8323414138481637, + "grad_norm": 3.3781816959381104, + "learning_rate": 9.633553567546173e-06, + "loss": 1.2113, + "step": 2867 + }, + { + "epoch": 0.8326317317462621, + "grad_norm": 3.3362321853637695, + "learning_rate": 9.633192661254654e-06, + "loss": 1.2132, + "step": 2868 + }, + { + "epoch": 0.8329220496443606, + "grad_norm": 3.1321659088134766, + "learning_rate": 9.632831584092826e-06, + "loss": 1.1416, + "step": 2869 + }, + { + "epoch": 0.833212367542459, + "grad_norm": 3.464764356613159, + "learning_rate": 9.632470336074009e-06, + "loss": 1.2914, + "step": 2870 + }, + { + "epoch": 0.8335026854405574, + "grad_norm": 3.633310079574585, + "learning_rate": 9.632108917211525e-06, + "loss": 
1.1349, + "step": 2871 + }, + { + "epoch": 0.8337930033386558, + "grad_norm": 2.9007396697998047, + "learning_rate": 9.6317473275187e-06, + "loss": 1.1294, + "step": 2872 + }, + { + "epoch": 0.8340833212367542, + "grad_norm": 3.544186592102051, + "learning_rate": 9.631385567008876e-06, + "loss": 1.1775, + "step": 2873 + }, + { + "epoch": 0.8343736391348526, + "grad_norm": 3.3772568702697754, + "learning_rate": 9.631023635695387e-06, + "loss": 1.2087, + "step": 2874 + }, + { + "epoch": 0.834663957032951, + "grad_norm": 5.305667877197266, + "learning_rate": 9.630661533591584e-06, + "loss": 1.0834, + "step": 2875 + }, + { + "epoch": 0.8349542749310495, + "grad_norm": 2.999448299407959, + "learning_rate": 9.630299260710821e-06, + "loss": 1.1121, + "step": 2876 + }, + { + "epoch": 0.8352445928291479, + "grad_norm": 3.4550819396972656, + "learning_rate": 9.629936817066459e-06, + "loss": 1.2967, + "step": 2877 + }, + { + "epoch": 0.8355349107272463, + "grad_norm": 2.9293079376220703, + "learning_rate": 9.629574202671866e-06, + "loss": 1.0916, + "step": 2878 + }, + { + "epoch": 0.8358252286253448, + "grad_norm": 3.328514814376831, + "learning_rate": 9.629211417540412e-06, + "loss": 1.2201, + "step": 2879 + }, + { + "epoch": 0.8361155465234432, + "grad_norm": 3.393035650253296, + "learning_rate": 9.628848461685479e-06, + "loss": 1.1133, + "step": 2880 + }, + { + "epoch": 0.8364058644215416, + "grad_norm": 3.4126694202423096, + "learning_rate": 9.62848533512045e-06, + "loss": 1.1965, + "step": 2881 + }, + { + "epoch": 0.83669618231964, + "grad_norm": 3.150296688079834, + "learning_rate": 9.62812203785872e-06, + "loss": 1.2777, + "step": 2882 + }, + { + "epoch": 0.8369865002177385, + "grad_norm": 3.2624874114990234, + "learning_rate": 9.627758569913687e-06, + "loss": 1.08, + "step": 2883 + }, + { + "epoch": 0.8372768181158369, + "grad_norm": 3.2924187183380127, + "learning_rate": 9.627394931298752e-06, + "loss": 1.1596, + "step": 2884 + }, + { + "epoch": 0.8375671360139353, + "grad_norm": 3.2016308307647705, + "learning_rate": 9.62703112202733e-06, + "loss": 1.0904, + "step": 2885 + }, + { + "epoch": 0.8378574539120337, + "grad_norm": 2.954402446746826, + "learning_rate": 9.626667142112835e-06, + "loss": 1.0328, + "step": 2886 + }, + { + "epoch": 0.8381477718101321, + "grad_norm": 3.052061080932617, + "learning_rate": 9.626302991568693e-06, + "loss": 1.0774, + "step": 2887 + }, + { + "epoch": 0.8384380897082305, + "grad_norm": 3.575716972351074, + "learning_rate": 9.625938670408332e-06, + "loss": 1.2461, + "step": 2888 + }, + { + "epoch": 0.8387284076063289, + "grad_norm": 3.2799222469329834, + "learning_rate": 9.62557417864519e-06, + "loss": 1.1622, + "step": 2889 + }, + { + "epoch": 0.8390187255044274, + "grad_norm": 3.241396188735962, + "learning_rate": 9.625209516292706e-06, + "loss": 1.2957, + "step": 2890 + }, + { + "epoch": 0.8393090434025258, + "grad_norm": 3.083571195602417, + "learning_rate": 9.62484468336433e-06, + "loss": 1.1481, + "step": 2891 + }, + { + "epoch": 0.8395993613006242, + "grad_norm": 2.80134654045105, + "learning_rate": 9.62447967987352e-06, + "loss": 1.0736, + "step": 2892 + }, + { + "epoch": 0.8398896791987226, + "grad_norm": 3.1099038124084473, + "learning_rate": 9.624114505833732e-06, + "loss": 1.2471, + "step": 2893 + }, + { + "epoch": 0.840179997096821, + "grad_norm": 2.9737226963043213, + "learning_rate": 9.623749161258437e-06, + "loss": 1.2019, + "step": 2894 + }, + { + "epoch": 0.8404703149949194, + "grad_norm": 3.2130281925201416, + "learning_rate": 
9.623383646161108e-06, + "loss": 1.2244, + "step": 2895 + }, + { + "epoch": 0.8407606328930178, + "grad_norm": 3.3365936279296875, + "learning_rate": 9.623017960555226e-06, + "loss": 1.2363, + "step": 2896 + }, + { + "epoch": 0.8410509507911162, + "grad_norm": 3.4677717685699463, + "learning_rate": 9.622652104454274e-06, + "loss": 1.2702, + "step": 2897 + }, + { + "epoch": 0.8413412686892147, + "grad_norm": 3.4130473136901855, + "learning_rate": 9.622286077871748e-06, + "loss": 1.2962, + "step": 2898 + }, + { + "epoch": 0.8416315865873131, + "grad_norm": 3.2819225788116455, + "learning_rate": 9.621919880821145e-06, + "loss": 1.2152, + "step": 2899 + }, + { + "epoch": 0.8419219044854115, + "grad_norm": 3.008981227874756, + "learning_rate": 9.621553513315972e-06, + "loss": 1.0549, + "step": 2900 + }, + { + "epoch": 0.8422122223835099, + "grad_norm": 3.9223222732543945, + "learning_rate": 9.621186975369739e-06, + "loss": 1.0762, + "step": 2901 + }, + { + "epoch": 0.8425025402816083, + "grad_norm": 3.2732174396514893, + "learning_rate": 9.620820266995963e-06, + "loss": 1.174, + "step": 2902 + }, + { + "epoch": 0.8427928581797067, + "grad_norm": 3.5400829315185547, + "learning_rate": 9.620453388208171e-06, + "loss": 1.1838, + "step": 2903 + }, + { + "epoch": 0.8430831760778053, + "grad_norm": 3.2847681045532227, + "learning_rate": 9.620086339019892e-06, + "loss": 1.1586, + "step": 2904 + }, + { + "epoch": 0.8433734939759037, + "grad_norm": 3.5318374633789062, + "learning_rate": 9.619719119444662e-06, + "loss": 1.238, + "step": 2905 + }, + { + "epoch": 0.8436638118740021, + "grad_norm": 3.087456464767456, + "learning_rate": 9.619351729496022e-06, + "loss": 1.1586, + "step": 2906 + }, + { + "epoch": 0.8439541297721005, + "grad_norm": 3.138263702392578, + "learning_rate": 9.618984169187525e-06, + "loss": 1.0592, + "step": 2907 + }, + { + "epoch": 0.8442444476701989, + "grad_norm": 3.5749359130859375, + "learning_rate": 9.618616438532725e-06, + "loss": 1.3117, + "step": 2908 + }, + { + "epoch": 0.8445347655682973, + "grad_norm": 3.131622076034546, + "learning_rate": 9.618248537545182e-06, + "loss": 1.1527, + "step": 2909 + }, + { + "epoch": 0.8448250834663957, + "grad_norm": 3.2335894107818604, + "learning_rate": 9.617880466238468e-06, + "loss": 1.1853, + "step": 2910 + }, + { + "epoch": 0.8451154013644941, + "grad_norm": 3.500901222229004, + "learning_rate": 9.617512224626153e-06, + "loss": 1.2586, + "step": 2911 + }, + { + "epoch": 0.8454057192625926, + "grad_norm": 3.386972188949585, + "learning_rate": 9.61714381272182e-06, + "loss": 1.0863, + "step": 2912 + }, + { + "epoch": 0.845696037160691, + "grad_norm": 3.1666817665100098, + "learning_rate": 9.616775230539057e-06, + "loss": 1.1175, + "step": 2913 + }, + { + "epoch": 0.8459863550587894, + "grad_norm": 3.1548240184783936, + "learning_rate": 9.616406478091453e-06, + "loss": 1.2446, + "step": 2914 + }, + { + "epoch": 0.8462766729568878, + "grad_norm": 3.5241239070892334, + "learning_rate": 9.616037555392612e-06, + "loss": 1.2566, + "step": 2915 + }, + { + "epoch": 0.8465669908549862, + "grad_norm": 3.256432294845581, + "learning_rate": 9.615668462456138e-06, + "loss": 1.1985, + "step": 2916 + }, + { + "epoch": 0.8468573087530846, + "grad_norm": 3.100454807281494, + "learning_rate": 9.615299199295643e-06, + "loss": 1.0913, + "step": 2917 + }, + { + "epoch": 0.847147626651183, + "grad_norm": 3.2860195636749268, + "learning_rate": 9.614929765924743e-06, + "loss": 1.1788, + "step": 2918 + }, + { + "epoch": 0.8474379445492815, + "grad_norm": 
3.776573419570923, + "learning_rate": 9.614560162357065e-06, + "loss": 1.1846, + "step": 2919 + }, + { + "epoch": 0.8477282624473799, + "grad_norm": 3.1679580211639404, + "learning_rate": 9.61419038860624e-06, + "loss": 1.1935, + "step": 2920 + }, + { + "epoch": 0.8480185803454783, + "grad_norm": 3.056650161743164, + "learning_rate": 9.613820444685905e-06, + "loss": 1.2031, + "step": 2921 + }, + { + "epoch": 0.8483088982435767, + "grad_norm": 3.017367362976074, + "learning_rate": 9.613450330609702e-06, + "loss": 1.0897, + "step": 2922 + }, + { + "epoch": 0.8485992161416751, + "grad_norm": 3.249253273010254, + "learning_rate": 9.613080046391283e-06, + "loss": 1.0954, + "step": 2923 + }, + { + "epoch": 0.8488895340397735, + "grad_norm": 3.4139556884765625, + "learning_rate": 9.612709592044302e-06, + "loss": 1.1066, + "step": 2924 + }, + { + "epoch": 0.8491798519378719, + "grad_norm": 2.860640048980713, + "learning_rate": 9.612338967582422e-06, + "loss": 1.0388, + "step": 2925 + }, + { + "epoch": 0.8494701698359703, + "grad_norm": 3.295013666152954, + "learning_rate": 9.61196817301931e-06, + "loss": 1.1332, + "step": 2926 + }, + { + "epoch": 0.8497604877340688, + "grad_norm": 3.217747449874878, + "learning_rate": 9.611597208368643e-06, + "loss": 1.1077, + "step": 2927 + }, + { + "epoch": 0.8500508056321672, + "grad_norm": 3.2518575191497803, + "learning_rate": 9.6112260736441e-06, + "loss": 1.1124, + "step": 2928 + }, + { + "epoch": 0.8503411235302656, + "grad_norm": 3.5065808296203613, + "learning_rate": 9.61085476885937e-06, + "loss": 1.4305, + "step": 2929 + }, + { + "epoch": 0.8506314414283641, + "grad_norm": 3.3835043907165527, + "learning_rate": 9.610483294028146e-06, + "loss": 1.1893, + "step": 2930 + }, + { + "epoch": 0.8509217593264625, + "grad_norm": 3.2871642112731934, + "learning_rate": 9.610111649164128e-06, + "loss": 0.9748, + "step": 2931 + }, + { + "epoch": 0.8512120772245609, + "grad_norm": 3.6779463291168213, + "learning_rate": 9.609739834281023e-06, + "loss": 1.1088, + "step": 2932 + }, + { + "epoch": 0.8515023951226594, + "grad_norm": 3.250479221343994, + "learning_rate": 9.609367849392538e-06, + "loss": 1.2176, + "step": 2933 + }, + { + "epoch": 0.8517927130207578, + "grad_norm": 3.5712833404541016, + "learning_rate": 9.6089956945124e-06, + "loss": 1.2416, + "step": 2934 + }, + { + "epoch": 0.8520830309188562, + "grad_norm": 3.58555269241333, + "learning_rate": 9.608623369654329e-06, + "loss": 1.2917, + "step": 2935 + }, + { + "epoch": 0.8523733488169546, + "grad_norm": 3.2484397888183594, + "learning_rate": 9.608250874832056e-06, + "loss": 1.2379, + "step": 2936 + }, + { + "epoch": 0.852663666715053, + "grad_norm": 3.329904556274414, + "learning_rate": 9.607878210059319e-06, + "loss": 1.1517, + "step": 2937 + }, + { + "epoch": 0.8529539846131514, + "grad_norm": 3.4330637454986572, + "learning_rate": 9.607505375349863e-06, + "loss": 1.1697, + "step": 2938 + }, + { + "epoch": 0.8532443025112498, + "grad_norm": 3.325636386871338, + "learning_rate": 9.607132370717438e-06, + "loss": 1.2163, + "step": 2939 + }, + { + "epoch": 0.8535346204093482, + "grad_norm": 3.112339973449707, + "learning_rate": 9.606759196175799e-06, + "loss": 1.1753, + "step": 2940 + }, + { + "epoch": 0.8538249383074467, + "grad_norm": 2.995211362838745, + "learning_rate": 9.606385851738709e-06, + "loss": 0.9425, + "step": 2941 + }, + { + "epoch": 0.8541152562055451, + "grad_norm": 3.3022618293762207, + "learning_rate": 9.606012337419935e-06, + "loss": 1.0678, + "step": 2942 + }, + { + "epoch": 
0.8544055741036435, + "grad_norm": 3.29768967628479, + "learning_rate": 9.605638653233256e-06, + "loss": 1.0541, + "step": 2943 + }, + { + "epoch": 0.8546958920017419, + "grad_norm": 3.348756790161133, + "learning_rate": 9.605264799192451e-06, + "loss": 1.1323, + "step": 2944 + }, + { + "epoch": 0.8549862098998403, + "grad_norm": 3.145399808883667, + "learning_rate": 9.604890775311306e-06, + "loss": 1.1527, + "step": 2945 + }, + { + "epoch": 0.8552765277979387, + "grad_norm": 3.2217440605163574, + "learning_rate": 9.604516581603618e-06, + "loss": 1.1699, + "step": 2946 + }, + { + "epoch": 0.8555668456960371, + "grad_norm": 3.144026041030884, + "learning_rate": 9.604142218083186e-06, + "loss": 1.1709, + "step": 2947 + }, + { + "epoch": 0.8558571635941356, + "grad_norm": 3.078562021255493, + "learning_rate": 9.603767684763816e-06, + "loss": 1.0826, + "step": 2948 + }, + { + "epoch": 0.856147481492234, + "grad_norm": 3.4146084785461426, + "learning_rate": 9.60339298165932e-06, + "loss": 1.2019, + "step": 2949 + }, + { + "epoch": 0.8564377993903324, + "grad_norm": 3.4038820266723633, + "learning_rate": 9.603018108783518e-06, + "loss": 1.2677, + "step": 2950 + }, + { + "epoch": 0.8567281172884308, + "grad_norm": 2.9391040802001953, + "learning_rate": 9.602643066150235e-06, + "loss": 0.9942, + "step": 2951 + }, + { + "epoch": 0.8570184351865292, + "grad_norm": 3.679786443710327, + "learning_rate": 9.602267853773301e-06, + "loss": 1.3242, + "step": 2952 + }, + { + "epoch": 0.8573087530846276, + "grad_norm": 2.8453195095062256, + "learning_rate": 9.601892471666556e-06, + "loss": 0.9488, + "step": 2953 + }, + { + "epoch": 0.857599070982726, + "grad_norm": 3.4040963649749756, + "learning_rate": 9.601516919843843e-06, + "loss": 1.2333, + "step": 2954 + }, + { + "epoch": 0.8578893888808246, + "grad_norm": 2.9734036922454834, + "learning_rate": 9.601141198319013e-06, + "loss": 1.0074, + "step": 2955 + }, + { + "epoch": 0.858179706778923, + "grad_norm": 3.0131356716156006, + "learning_rate": 9.600765307105919e-06, + "loss": 1.1091, + "step": 2956 + }, + { + "epoch": 0.8584700246770214, + "grad_norm": 3.171550750732422, + "learning_rate": 9.60038924621843e-06, + "loss": 0.9531, + "step": 2957 + }, + { + "epoch": 0.8587603425751198, + "grad_norm": 3.324277639389038, + "learning_rate": 9.600013015670408e-06, + "loss": 1.3101, + "step": 2958 + }, + { + "epoch": 0.8590506604732182, + "grad_norm": 3.18428635597229, + "learning_rate": 9.599636615475731e-06, + "loss": 1.1184, + "step": 2959 + }, + { + "epoch": 0.8593409783713166, + "grad_norm": 3.0067083835601807, + "learning_rate": 9.599260045648281e-06, + "loss": 1.1813, + "step": 2960 + }, + { + "epoch": 0.859631296269415, + "grad_norm": 2.8176536560058594, + "learning_rate": 9.598883306201949e-06, + "loss": 1.062, + "step": 2961 + }, + { + "epoch": 0.8599216141675134, + "grad_norm": 3.1799442768096924, + "learning_rate": 9.598506397150623e-06, + "loss": 1.1755, + "step": 2962 + }, + { + "epoch": 0.8602119320656119, + "grad_norm": 2.9520862102508545, + "learning_rate": 9.598129318508207e-06, + "loss": 0.923, + "step": 2963 + }, + { + "epoch": 0.8605022499637103, + "grad_norm": 3.538482666015625, + "learning_rate": 9.597752070288607e-06, + "loss": 1.2052, + "step": 2964 + }, + { + "epoch": 0.8607925678618087, + "grad_norm": 3.4400877952575684, + "learning_rate": 9.597374652505733e-06, + "loss": 1.1748, + "step": 2965 + }, + { + "epoch": 0.8610828857599071, + "grad_norm": 3.192110300064087, + "learning_rate": 9.596997065173508e-06, + "loss": 1.1613, + "step": 
2966 + }, + { + "epoch": 0.8613732036580055, + "grad_norm": 3.294027328491211, + "learning_rate": 9.596619308305855e-06, + "loss": 1.1743, + "step": 2967 + }, + { + "epoch": 0.8616635215561039, + "grad_norm": 3.0262019634246826, + "learning_rate": 9.596241381916704e-06, + "loss": 1.074, + "step": 2968 + }, + { + "epoch": 0.8619538394542023, + "grad_norm": 3.1539053916931152, + "learning_rate": 9.595863286019997e-06, + "loss": 1.2264, + "step": 2969 + }, + { + "epoch": 0.8622441573523008, + "grad_norm": 2.9892208576202393, + "learning_rate": 9.595485020629676e-06, + "loss": 1.0432, + "step": 2970 + }, + { + "epoch": 0.8625344752503992, + "grad_norm": 3.0038716793060303, + "learning_rate": 9.59510658575969e-06, + "loss": 1.0812, + "step": 2971 + }, + { + "epoch": 0.8628247931484976, + "grad_norm": 3.4315454959869385, + "learning_rate": 9.594727981423998e-06, + "loss": 1.2797, + "step": 2972 + }, + { + "epoch": 0.863115111046596, + "grad_norm": 3.2693030834198, + "learning_rate": 9.594349207636559e-06, + "loss": 1.1986, + "step": 2973 + }, + { + "epoch": 0.8634054289446944, + "grad_norm": 3.197600841522217, + "learning_rate": 9.593970264411348e-06, + "loss": 1.1726, + "step": 2974 + }, + { + "epoch": 0.8636957468427928, + "grad_norm": 3.848891496658325, + "learning_rate": 9.593591151762334e-06, + "loss": 1.1903, + "step": 2975 + }, + { + "epoch": 0.8639860647408912, + "grad_norm": 3.898817539215088, + "learning_rate": 9.593211869703503e-06, + "loss": 1.145, + "step": 2976 + }, + { + "epoch": 0.8642763826389896, + "grad_norm": 3.280470609664917, + "learning_rate": 9.592832418248838e-06, + "loss": 1.2771, + "step": 2977 + }, + { + "epoch": 0.8645667005370881, + "grad_norm": 2.8223423957824707, + "learning_rate": 9.59245279741234e-06, + "loss": 1.035, + "step": 2978 + }, + { + "epoch": 0.8648570184351865, + "grad_norm": 3.2701332569122314, + "learning_rate": 9.592073007208003e-06, + "loss": 1.3028, + "step": 2979 + }, + { + "epoch": 0.865147336333285, + "grad_norm": 3.103128671646118, + "learning_rate": 9.591693047649834e-06, + "loss": 1.1035, + "step": 2980 + }, + { + "epoch": 0.8654376542313834, + "grad_norm": 3.201188802719116, + "learning_rate": 9.591312918751852e-06, + "loss": 1.176, + "step": 2981 + }, + { + "epoch": 0.8657279721294818, + "grad_norm": 3.016108274459839, + "learning_rate": 9.590932620528068e-06, + "loss": 1.0289, + "step": 2982 + }, + { + "epoch": 0.8660182900275802, + "grad_norm": 3.240518093109131, + "learning_rate": 9.590552152992512e-06, + "loss": 1.1196, + "step": 2983 + }, + { + "epoch": 0.8663086079256787, + "grad_norm": 3.302276134490967, + "learning_rate": 9.590171516159214e-06, + "loss": 1.2784, + "step": 2984 + }, + { + "epoch": 0.8665989258237771, + "grad_norm": 3.3650875091552734, + "learning_rate": 9.589790710042212e-06, + "loss": 1.2402, + "step": 2985 + }, + { + "epoch": 0.8668892437218755, + "grad_norm": 3.414092779159546, + "learning_rate": 9.589409734655553e-06, + "loss": 1.2323, + "step": 2986 + }, + { + "epoch": 0.8671795616199739, + "grad_norm": 3.1558945178985596, + "learning_rate": 9.58902859001328e-06, + "loss": 1.0965, + "step": 2987 + }, + { + "epoch": 0.8674698795180723, + "grad_norm": 3.403278350830078, + "learning_rate": 9.588647276129456e-06, + "loss": 1.1815, + "step": 2988 + }, + { + "epoch": 0.8677601974161707, + "grad_norm": 2.8990426063537598, + "learning_rate": 9.588265793018141e-06, + "loss": 1.0713, + "step": 2989 + }, + { + "epoch": 0.8680505153142691, + "grad_norm": 3.296391248703003, + "learning_rate": 9.587884140693404e-06, + 
"loss": 1.146, + "step": 2990 + }, + { + "epoch": 0.8683408332123675, + "grad_norm": 3.0492796897888184, + "learning_rate": 9.58750231916932e-06, + "loss": 1.0286, + "step": 2991 + }, + { + "epoch": 0.868631151110466, + "grad_norm": 3.2753119468688965, + "learning_rate": 9.587120328459973e-06, + "loss": 1.0991, + "step": 2992 + }, + { + "epoch": 0.8689214690085644, + "grad_norm": 2.943715810775757, + "learning_rate": 9.586738168579446e-06, + "loss": 1.0901, + "step": 2993 + }, + { + "epoch": 0.8692117869066628, + "grad_norm": 3.236210584640503, + "learning_rate": 9.586355839541836e-06, + "loss": 1.3409, + "step": 2994 + }, + { + "epoch": 0.8695021048047612, + "grad_norm": 3.17950177192688, + "learning_rate": 9.585973341361244e-06, + "loss": 1.2406, + "step": 2995 + }, + { + "epoch": 0.8697924227028596, + "grad_norm": 2.9284613132476807, + "learning_rate": 9.585590674051775e-06, + "loss": 1.0142, + "step": 2996 + }, + { + "epoch": 0.870082740600958, + "grad_norm": 3.4473886489868164, + "learning_rate": 9.585207837627541e-06, + "loss": 1.3138, + "step": 2997 + }, + { + "epoch": 0.8703730584990564, + "grad_norm": 3.099240303039551, + "learning_rate": 9.58482483210266e-06, + "loss": 1.1775, + "step": 2998 + }, + { + "epoch": 0.8706633763971549, + "grad_norm": 3.1252505779266357, + "learning_rate": 9.584441657491263e-06, + "loss": 1.0392, + "step": 2999 + }, + { + "epoch": 0.8709536942952533, + "grad_norm": 3.072007417678833, + "learning_rate": 9.584058313807474e-06, + "loss": 1.0797, + "step": 3000 + }, + { + "epoch": 0.8709536942952533, + "eval_loss": 1.1775367259979248, + "eval_runtime": 11.589, + "eval_samples_per_second": 34.516, + "eval_steps_per_second": 4.314, + "step": 3000 + }, + { + "epoch": 0.8712440121933517, + "grad_norm": 3.092594861984253, + "learning_rate": 9.583674801065433e-06, + "loss": 1.1061, + "step": 3001 + }, + { + "epoch": 0.8715343300914501, + "grad_norm": 3.2414965629577637, + "learning_rate": 9.583291119279285e-06, + "loss": 1.0196, + "step": 3002 + }, + { + "epoch": 0.8718246479895485, + "grad_norm": 3.3458807468414307, + "learning_rate": 9.58290726846318e-06, + "loss": 1.269, + "step": 3003 + }, + { + "epoch": 0.8721149658876469, + "grad_norm": 3.083974838256836, + "learning_rate": 9.582523248631273e-06, + "loss": 1.1124, + "step": 3004 + }, + { + "epoch": 0.8724052837857454, + "grad_norm": 2.8129920959472656, + "learning_rate": 9.582139059797728e-06, + "loss": 1.0657, + "step": 3005 + }, + { + "epoch": 0.8726956016838439, + "grad_norm": 3.2248311042785645, + "learning_rate": 9.581754701976711e-06, + "loss": 1.2258, + "step": 3006 + }, + { + "epoch": 0.8729859195819423, + "grad_norm": 2.996952533721924, + "learning_rate": 9.581370175182401e-06, + "loss": 1.1067, + "step": 3007 + }, + { + "epoch": 0.8732762374800407, + "grad_norm": 3.218592643737793, + "learning_rate": 9.580985479428975e-06, + "loss": 1.0454, + "step": 3008 + }, + { + "epoch": 0.8735665553781391, + "grad_norm": 3.3797225952148438, + "learning_rate": 9.580600614730624e-06, + "loss": 1.1807, + "step": 3009 + }, + { + "epoch": 0.8738568732762375, + "grad_norm": 3.1415364742279053, + "learning_rate": 9.580215581101539e-06, + "loss": 1.1201, + "step": 3010 + }, + { + "epoch": 0.8741471911743359, + "grad_norm": 3.2598962783813477, + "learning_rate": 9.57983037855592e-06, + "loss": 1.1755, + "step": 3011 + }, + { + "epoch": 0.8744375090724343, + "grad_norm": 3.2180087566375732, + "learning_rate": 9.579445007107977e-06, + "loss": 1.2463, + "step": 3012 + }, + { + "epoch": 0.8747278269705328, + "grad_norm": 
3.349390983581543, + "learning_rate": 9.579059466771918e-06, + "loss": 1.1918, + "step": 3013 + }, + { + "epoch": 0.8750181448686312, + "grad_norm": 3.22566819190979, + "learning_rate": 9.578673757561963e-06, + "loss": 1.1867, + "step": 3014 + }, + { + "epoch": 0.8753084627667296, + "grad_norm": 3.3200433254241943, + "learning_rate": 9.578287879492336e-06, + "loss": 1.0604, + "step": 3015 + }, + { + "epoch": 0.875598780664828, + "grad_norm": 2.9759771823883057, + "learning_rate": 9.577901832577269e-06, + "loss": 1.0893, + "step": 3016 + }, + { + "epoch": 0.8758890985629264, + "grad_norm": 3.5478708744049072, + "learning_rate": 9.577515616831e-06, + "loss": 1.231, + "step": 3017 + }, + { + "epoch": 0.8761794164610248, + "grad_norm": 3.2979137897491455, + "learning_rate": 9.577129232267772e-06, + "loss": 1.1449, + "step": 3018 + }, + { + "epoch": 0.8764697343591232, + "grad_norm": 3.123936653137207, + "learning_rate": 9.576742678901833e-06, + "loss": 1.1683, + "step": 3019 + }, + { + "epoch": 0.8767600522572216, + "grad_norm": 3.3888375759124756, + "learning_rate": 9.57635595674744e-06, + "loss": 1.2465, + "step": 3020 + }, + { + "epoch": 0.8770503701553201, + "grad_norm": 2.825896739959717, + "learning_rate": 9.575969065818856e-06, + "loss": 1.0497, + "step": 3021 + }, + { + "epoch": 0.8773406880534185, + "grad_norm": 3.0169923305511475, + "learning_rate": 9.57558200613035e-06, + "loss": 1.0137, + "step": 3022 + }, + { + "epoch": 0.8776310059515169, + "grad_norm": 3.445631265640259, + "learning_rate": 9.575194777696194e-06, + "loss": 1.1816, + "step": 3023 + }, + { + "epoch": 0.8779213238496153, + "grad_norm": 2.809177875518799, + "learning_rate": 9.57480738053067e-06, + "loss": 1.1858, + "step": 3024 + }, + { + "epoch": 0.8782116417477137, + "grad_norm": 3.311002254486084, + "learning_rate": 9.574419814648065e-06, + "loss": 1.2344, + "step": 3025 + }, + { + "epoch": 0.8785019596458121, + "grad_norm": 2.9318954944610596, + "learning_rate": 9.574032080062673e-06, + "loss": 1.1236, + "step": 3026 + }, + { + "epoch": 0.8787922775439105, + "grad_norm": 3.338117837905884, + "learning_rate": 9.573644176788795e-06, + "loss": 1.272, + "step": 3027 + }, + { + "epoch": 0.879082595442009, + "grad_norm": 3.30912446975708, + "learning_rate": 9.573256104840732e-06, + "loss": 1.1346, + "step": 3028 + }, + { + "epoch": 0.8793729133401074, + "grad_norm": 3.140470027923584, + "learning_rate": 9.572867864232799e-06, + "loss": 1.1724, + "step": 3029 + }, + { + "epoch": 0.8796632312382059, + "grad_norm": 3.1311466693878174, + "learning_rate": 9.572479454979315e-06, + "loss": 1.0638, + "step": 3030 + }, + { + "epoch": 0.8799535491363043, + "grad_norm": 3.1193671226501465, + "learning_rate": 9.572090877094604e-06, + "loss": 1.2142, + "step": 3031 + }, + { + "epoch": 0.8802438670344027, + "grad_norm": 3.0533499717712402, + "learning_rate": 9.571702130592994e-06, + "loss": 1.2326, + "step": 3032 + }, + { + "epoch": 0.8805341849325011, + "grad_norm": 3.523092269897461, + "learning_rate": 9.571313215488824e-06, + "loss": 1.0997, + "step": 3033 + }, + { + "epoch": 0.8808245028305995, + "grad_norm": 3.402045726776123, + "learning_rate": 9.570924131796437e-06, + "loss": 1.06, + "step": 3034 + }, + { + "epoch": 0.881114820728698, + "grad_norm": 3.0997350215911865, + "learning_rate": 9.570534879530182e-06, + "loss": 1.0053, + "step": 3035 + }, + { + "epoch": 0.8814051386267964, + "grad_norm": 2.9039306640625, + "learning_rate": 9.570145458704416e-06, + "loss": 1.0801, + "step": 3036 + }, + { + "epoch": 0.8816954565248948, 
+ "grad_norm": 3.0941872596740723, + "learning_rate": 9.569755869333497e-06, + "loss": 1.16, + "step": 3037 + }, + { + "epoch": 0.8819857744229932, + "grad_norm": 3.2002017498016357, + "learning_rate": 9.569366111431794e-06, + "loss": 1.2813, + "step": 3038 + }, + { + "epoch": 0.8822760923210916, + "grad_norm": 3.665795087814331, + "learning_rate": 9.568976185013685e-06, + "loss": 1.3266, + "step": 3039 + }, + { + "epoch": 0.88256641021919, + "grad_norm": 3.3414106369018555, + "learning_rate": 9.568586090093545e-06, + "loss": 1.1968, + "step": 3040 + }, + { + "epoch": 0.8828567281172884, + "grad_norm": 3.1864659786224365, + "learning_rate": 9.568195826685765e-06, + "loss": 1.2351, + "step": 3041 + }, + { + "epoch": 0.8831470460153868, + "grad_norm": 3.338440179824829, + "learning_rate": 9.567805394804734e-06, + "loss": 1.1602, + "step": 3042 + }, + { + "epoch": 0.8834373639134853, + "grad_norm": 3.411781072616577, + "learning_rate": 9.567414794464854e-06, + "loss": 1.2741, + "step": 3043 + }, + { + "epoch": 0.8837276818115837, + "grad_norm": 2.922380208969116, + "learning_rate": 9.567024025680529e-06, + "loss": 1.0612, + "step": 3044 + }, + { + "epoch": 0.8840179997096821, + "grad_norm": 3.3472232818603516, + "learning_rate": 9.566633088466169e-06, + "loss": 1.0968, + "step": 3045 + }, + { + "epoch": 0.8843083176077805, + "grad_norm": 3.23529052734375, + "learning_rate": 9.566241982836193e-06, + "loss": 1.303, + "step": 3046 + }, + { + "epoch": 0.8845986355058789, + "grad_norm": 3.1247169971466064, + "learning_rate": 9.565850708805025e-06, + "loss": 1.2335, + "step": 3047 + }, + { + "epoch": 0.8848889534039773, + "grad_norm": 3.1896188259124756, + "learning_rate": 9.565459266387096e-06, + "loss": 1.2399, + "step": 3048 + }, + { + "epoch": 0.8851792713020757, + "grad_norm": 3.411284923553467, + "learning_rate": 9.56506765559684e-06, + "loss": 1.3471, + "step": 3049 + }, + { + "epoch": 0.8854695892001742, + "grad_norm": 3.114387273788452, + "learning_rate": 9.5646758764487e-06, + "loss": 1.195, + "step": 3050 + }, + { + "epoch": 0.8857599070982726, + "grad_norm": 3.2049310207366943, + "learning_rate": 9.564283928957126e-06, + "loss": 1.157, + "step": 3051 + }, + { + "epoch": 0.886050224996371, + "grad_norm": 3.156636953353882, + "learning_rate": 9.563891813136571e-06, + "loss": 1.1504, + "step": 3052 + }, + { + "epoch": 0.8863405428944694, + "grad_norm": 3.385990619659424, + "learning_rate": 9.563499529001498e-06, + "loss": 1.1591, + "step": 3053 + }, + { + "epoch": 0.8866308607925678, + "grad_norm": 3.049511671066284, + "learning_rate": 9.563107076566373e-06, + "loss": 1.1171, + "step": 3054 + }, + { + "epoch": 0.8869211786906663, + "grad_norm": 3.1001222133636475, + "learning_rate": 9.56271445584567e-06, + "loss": 1.0276, + "step": 3055 + }, + { + "epoch": 0.8872114965887647, + "grad_norm": 3.2549166679382324, + "learning_rate": 9.562321666853868e-06, + "loss": 1.1241, + "step": 3056 + }, + { + "epoch": 0.8875018144868632, + "grad_norm": 3.0443809032440186, + "learning_rate": 9.561928709605454e-06, + "loss": 1.0743, + "step": 3057 + }, + { + "epoch": 0.8877921323849616, + "grad_norm": 3.459932804107666, + "learning_rate": 9.561535584114919e-06, + "loss": 1.1445, + "step": 3058 + }, + { + "epoch": 0.88808245028306, + "grad_norm": 2.758932113647461, + "learning_rate": 9.561142290396763e-06, + "loss": 1.0656, + "step": 3059 + }, + { + "epoch": 0.8883727681811584, + "grad_norm": 2.894343852996826, + "learning_rate": 9.560748828465486e-06, + "loss": 1.1935, + "step": 3060 + }, + { + "epoch": 
0.8886630860792568, + "grad_norm": 2.8865163326263428, + "learning_rate": 9.560355198335607e-06, + "loss": 0.9562, + "step": 3061 + }, + { + "epoch": 0.8889534039773552, + "grad_norm": 3.2808666229248047, + "learning_rate": 9.559961400021636e-06, + "loss": 1.0705, + "step": 3062 + }, + { + "epoch": 0.8892437218754536, + "grad_norm": 3.1613757610321045, + "learning_rate": 9.559567433538097e-06, + "loss": 1.1494, + "step": 3063 + }, + { + "epoch": 0.889534039773552, + "grad_norm": 3.128833532333374, + "learning_rate": 9.55917329889952e-06, + "loss": 1.1202, + "step": 3064 + }, + { + "epoch": 0.8898243576716505, + "grad_norm": 3.2559049129486084, + "learning_rate": 9.558778996120443e-06, + "loss": 1.2322, + "step": 3065 + }, + { + "epoch": 0.8901146755697489, + "grad_norm": 3.2830514907836914, + "learning_rate": 9.558384525215406e-06, + "loss": 1.2362, + "step": 3066 + }, + { + "epoch": 0.8904049934678473, + "grad_norm": 3.1671226024627686, + "learning_rate": 9.557989886198955e-06, + "loss": 1.3601, + "step": 3067 + }, + { + "epoch": 0.8906953113659457, + "grad_norm": 3.2132253646850586, + "learning_rate": 9.557595079085646e-06, + "loss": 0.9999, + "step": 3068 + }, + { + "epoch": 0.8909856292640441, + "grad_norm": 2.914524555206299, + "learning_rate": 9.557200103890038e-06, + "loss": 0.9415, + "step": 3069 + }, + { + "epoch": 0.8912759471621425, + "grad_norm": 3.0425221920013428, + "learning_rate": 9.556804960626702e-06, + "loss": 1.1311, + "step": 3070 + }, + { + "epoch": 0.891566265060241, + "grad_norm": 3.347184658050537, + "learning_rate": 9.556409649310206e-06, + "loss": 1.1673, + "step": 3071 + }, + { + "epoch": 0.8918565829583394, + "grad_norm": 3.4314563274383545, + "learning_rate": 9.556014169955128e-06, + "loss": 1.2945, + "step": 3072 + }, + { + "epoch": 0.8921469008564378, + "grad_norm": 2.9853997230529785, + "learning_rate": 9.555618522576058e-06, + "loss": 1.0987, + "step": 3073 + }, + { + "epoch": 0.8924372187545362, + "grad_norm": 3.1625750064849854, + "learning_rate": 9.555222707187584e-06, + "loss": 1.0362, + "step": 3074 + }, + { + "epoch": 0.8927275366526346, + "grad_norm": 3.226891279220581, + "learning_rate": 9.554826723804304e-06, + "loss": 1.2553, + "step": 3075 + }, + { + "epoch": 0.893017854550733, + "grad_norm": 3.2344210147857666, + "learning_rate": 9.554430572440822e-06, + "loss": 1.1399, + "step": 3076 + }, + { + "epoch": 0.8933081724488314, + "grad_norm": 3.3998959064483643, + "learning_rate": 9.554034253111747e-06, + "loss": 1.2145, + "step": 3077 + }, + { + "epoch": 0.8935984903469298, + "grad_norm": 3.6094846725463867, + "learning_rate": 9.553637765831697e-06, + "loss": 1.2089, + "step": 3078 + }, + { + "epoch": 0.8938888082450283, + "grad_norm": 2.996131181716919, + "learning_rate": 9.553241110615294e-06, + "loss": 1.0733, + "step": 3079 + }, + { + "epoch": 0.8941791261431268, + "grad_norm": 3.7459475994110107, + "learning_rate": 9.552844287477165e-06, + "loss": 1.3399, + "step": 3080 + }, + { + "epoch": 0.8944694440412252, + "grad_norm": 3.1052403450012207, + "learning_rate": 9.552447296431945e-06, + "loss": 1.1049, + "step": 3081 + }, + { + "epoch": 0.8947597619393236, + "grad_norm": 3.407588005065918, + "learning_rate": 9.552050137494275e-06, + "loss": 1.2035, + "step": 3082 + }, + { + "epoch": 0.895050079837422, + "grad_norm": 3.0574097633361816, + "learning_rate": 9.551652810678804e-06, + "loss": 1.0939, + "step": 3083 + }, + { + "epoch": 0.8953403977355204, + "grad_norm": 3.173433780670166, + "learning_rate": 9.551255316000183e-06, + "loss": 1.1121, + 
"step": 3084 + }, + { + "epoch": 0.8956307156336188, + "grad_norm": 3.04433274269104, + "learning_rate": 9.550857653473072e-06, + "loss": 1.0842, + "step": 3085 + }, + { + "epoch": 0.8959210335317173, + "grad_norm": 2.9734885692596436, + "learning_rate": 9.550459823112134e-06, + "loss": 1.0842, + "step": 3086 + }, + { + "epoch": 0.8962113514298157, + "grad_norm": 3.3427157402038574, + "learning_rate": 9.550061824932047e-06, + "loss": 1.1935, + "step": 3087 + }, + { + "epoch": 0.8965016693279141, + "grad_norm": 3.2677273750305176, + "learning_rate": 9.549663658947484e-06, + "loss": 1.2635, + "step": 3088 + }, + { + "epoch": 0.8967919872260125, + "grad_norm": 3.1517832279205322, + "learning_rate": 9.549265325173132e-06, + "loss": 1.3644, + "step": 3089 + }, + { + "epoch": 0.8970823051241109, + "grad_norm": 3.031965732574463, + "learning_rate": 9.548866823623679e-06, + "loss": 1.1241, + "step": 3090 + }, + { + "epoch": 0.8973726230222093, + "grad_norm": 3.4026827812194824, + "learning_rate": 9.548468154313822e-06, + "loss": 1.2084, + "step": 3091 + }, + { + "epoch": 0.8976629409203077, + "grad_norm": 3.157986879348755, + "learning_rate": 9.548069317258267e-06, + "loss": 1.016, + "step": 3092 + }, + { + "epoch": 0.8979532588184062, + "grad_norm": 3.4387762546539307, + "learning_rate": 9.547670312471718e-06, + "loss": 1.2204, + "step": 3093 + }, + { + "epoch": 0.8982435767165046, + "grad_norm": 3.1353819370269775, + "learning_rate": 9.547271139968893e-06, + "loss": 1.1181, + "step": 3094 + }, + { + "epoch": 0.898533894614603, + "grad_norm": 3.1333255767822266, + "learning_rate": 9.546871799764513e-06, + "loss": 1.2261, + "step": 3095 + }, + { + "epoch": 0.8988242125127014, + "grad_norm": 3.0457921028137207, + "learning_rate": 9.546472291873306e-06, + "loss": 1.0156, + "step": 3096 + }, + { + "epoch": 0.8991145304107998, + "grad_norm": 3.1292712688446045, + "learning_rate": 9.546072616310005e-06, + "loss": 1.0354, + "step": 3097 + }, + { + "epoch": 0.8994048483088982, + "grad_norm": 3.471691131591797, + "learning_rate": 9.54567277308935e-06, + "loss": 1.21, + "step": 3098 + }, + { + "epoch": 0.8996951662069966, + "grad_norm": 3.4814560413360596, + "learning_rate": 9.545272762226086e-06, + "loss": 1.2114, + "step": 3099 + }, + { + "epoch": 0.899985484105095, + "grad_norm": 3.2234396934509277, + "learning_rate": 9.544872583734967e-06, + "loss": 1.1872, + "step": 3100 + }, + { + "epoch": 0.9002758020031935, + "grad_norm": 3.178117275238037, + "learning_rate": 9.544472237630751e-06, + "loss": 1.0513, + "step": 3101 + }, + { + "epoch": 0.9005661199012919, + "grad_norm": 3.4485244750976562, + "learning_rate": 9.544071723928202e-06, + "loss": 1.3207, + "step": 3102 + }, + { + "epoch": 0.9008564377993903, + "grad_norm": 3.10819935798645, + "learning_rate": 9.54367104264209e-06, + "loss": 1.077, + "step": 3103 + }, + { + "epoch": 0.9011467556974887, + "grad_norm": 3.2871968746185303, + "learning_rate": 9.543270193787195e-06, + "loss": 1.1986, + "step": 3104 + }, + { + "epoch": 0.9014370735955871, + "grad_norm": 3.138451099395752, + "learning_rate": 9.542869177378298e-06, + "loss": 1.0721, + "step": 3105 + }, + { + "epoch": 0.9017273914936856, + "grad_norm": 3.0248279571533203, + "learning_rate": 9.542467993430189e-06, + "loss": 0.989, + "step": 3106 + }, + { + "epoch": 0.902017709391784, + "grad_norm": 2.8113856315612793, + "learning_rate": 9.542066641957661e-06, + "loss": 1.0949, + "step": 3107 + }, + { + "epoch": 0.9023080272898825, + "grad_norm": 2.728372573852539, + "learning_rate": 
9.54166512297552e-06, + "loss": 0.9767, + "step": 3108 + }, + { + "epoch": 0.9025983451879809, + "grad_norm": 3.231879472732544, + "learning_rate": 9.541263436498568e-06, + "loss": 1.2046, + "step": 3109 + }, + { + "epoch": 0.9028886630860793, + "grad_norm": 3.1025683879852295, + "learning_rate": 9.540861582541624e-06, + "loss": 1.1099, + "step": 3110 + }, + { + "epoch": 0.9031789809841777, + "grad_norm": 3.0091891288757324, + "learning_rate": 9.540459561119508e-06, + "loss": 1.1656, + "step": 3111 + }, + { + "epoch": 0.9034692988822761, + "grad_norm": 3.297088861465454, + "learning_rate": 9.540057372247044e-06, + "loss": 1.0799, + "step": 3112 + }, + { + "epoch": 0.9037596167803745, + "grad_norm": 3.128406286239624, + "learning_rate": 9.539655015939068e-06, + "loss": 1.0659, + "step": 3113 + }, + { + "epoch": 0.9040499346784729, + "grad_norm": 3.099379777908325, + "learning_rate": 9.539252492210416e-06, + "loss": 1.1781, + "step": 3114 + }, + { + "epoch": 0.9043402525765714, + "grad_norm": 3.0667364597320557, + "learning_rate": 9.538849801075931e-06, + "loss": 1.0704, + "step": 3115 + }, + { + "epoch": 0.9046305704746698, + "grad_norm": 2.9172818660736084, + "learning_rate": 9.538446942550468e-06, + "loss": 0.9518, + "step": 3116 + }, + { + "epoch": 0.9049208883727682, + "grad_norm": 3.077747106552124, + "learning_rate": 9.538043916648884e-06, + "loss": 1.0487, + "step": 3117 + }, + { + "epoch": 0.9052112062708666, + "grad_norm": 3.0355618000030518, + "learning_rate": 9.53764072338604e-06, + "loss": 1.0977, + "step": 3118 + }, + { + "epoch": 0.905501524168965, + "grad_norm": 3.0987133979797363, + "learning_rate": 9.537237362776805e-06, + "loss": 1.2059, + "step": 3119 + }, + { + "epoch": 0.9057918420670634, + "grad_norm": 3.300485134124756, + "learning_rate": 9.53683383483606e-06, + "loss": 1.392, + "step": 3120 + }, + { + "epoch": 0.9060821599651618, + "grad_norm": 3.3400747776031494, + "learning_rate": 9.536430139578683e-06, + "loss": 1.251, + "step": 3121 + }, + { + "epoch": 0.9063724778632603, + "grad_norm": 3.356792688369751, + "learning_rate": 9.536026277019562e-06, + "loss": 1.3177, + "step": 3122 + }, + { + "epoch": 0.9066627957613587, + "grad_norm": 3.4476516246795654, + "learning_rate": 9.53562224717359e-06, + "loss": 1.2698, + "step": 3123 + }, + { + "epoch": 0.9069531136594571, + "grad_norm": 3.273559808731079, + "learning_rate": 9.535218050055672e-06, + "loss": 1.0991, + "step": 3124 + }, + { + "epoch": 0.9072434315575555, + "grad_norm": 3.0915908813476562, + "learning_rate": 9.53481368568071e-06, + "loss": 1.2781, + "step": 3125 + }, + { + "epoch": 0.9075337494556539, + "grad_norm": 3.1454083919525146, + "learning_rate": 9.53440915406362e-06, + "loss": 1.1556, + "step": 3126 + }, + { + "epoch": 0.9078240673537523, + "grad_norm": 3.109560966491699, + "learning_rate": 9.53400445521932e-06, + "loss": 0.9902, + "step": 3127 + }, + { + "epoch": 0.9081143852518507, + "grad_norm": 3.815458059310913, + "learning_rate": 9.533599589162735e-06, + "loss": 1.209, + "step": 3128 + }, + { + "epoch": 0.9084047031499491, + "grad_norm": 3.4106128215789795, + "learning_rate": 9.533194555908796e-06, + "loss": 1.2336, + "step": 3129 + }, + { + "epoch": 0.9086950210480476, + "grad_norm": 3.6380088329315186, + "learning_rate": 9.532789355472441e-06, + "loss": 1.3134, + "step": 3130 + }, + { + "epoch": 0.9089853389461461, + "grad_norm": 2.9199140071868896, + "learning_rate": 9.532383987868615e-06, + "loss": 1.0422, + "step": 3131 + }, + { + "epoch": 0.9092756568442445, + "grad_norm": 
3.188913583755493, + "learning_rate": 9.531978453112263e-06, + "loss": 1.0525, + "step": 3132 + }, + { + "epoch": 0.9095659747423429, + "grad_norm": 3.872431516647339, + "learning_rate": 9.531572751218346e-06, + "loss": 1.2834, + "step": 3133 + }, + { + "epoch": 0.9098562926404413, + "grad_norm": 3.17043399810791, + "learning_rate": 9.531166882201823e-06, + "loss": 1.148, + "step": 3134 + }, + { + "epoch": 0.9101466105385397, + "grad_norm": 3.4306373596191406, + "learning_rate": 9.530760846077664e-06, + "loss": 1.0991, + "step": 3135 + }, + { + "epoch": 0.9104369284366381, + "grad_norm": 3.189354658126831, + "learning_rate": 9.530354642860845e-06, + "loss": 1.2444, + "step": 3136 + }, + { + "epoch": 0.9107272463347366, + "grad_norm": 3.085293769836426, + "learning_rate": 9.52994827256634e-06, + "loss": 1.2831, + "step": 3137 + }, + { + "epoch": 0.911017564232835, + "grad_norm": 3.2537155151367188, + "learning_rate": 9.529541735209145e-06, + "loss": 1.2515, + "step": 3138 + }, + { + "epoch": 0.9113078821309334, + "grad_norm": 3.4304065704345703, + "learning_rate": 9.529135030804246e-06, + "loss": 1.3192, + "step": 3139 + }, + { + "epoch": 0.9115982000290318, + "grad_norm": 3.0350377559661865, + "learning_rate": 9.528728159366644e-06, + "loss": 1.1985, + "step": 3140 + }, + { + "epoch": 0.9118885179271302, + "grad_norm": 3.5521934032440186, + "learning_rate": 9.528321120911345e-06, + "loss": 1.3126, + "step": 3141 + }, + { + "epoch": 0.9121788358252286, + "grad_norm": 3.580925941467285, + "learning_rate": 9.527913915453361e-06, + "loss": 1.2, + "step": 3142 + }, + { + "epoch": 0.912469153723327, + "grad_norm": 3.1894161701202393, + "learning_rate": 9.52750654300771e-06, + "loss": 1.2416, + "step": 3143 + }, + { + "epoch": 0.9127594716214255, + "grad_norm": 3.018322229385376, + "learning_rate": 9.52709900358941e-06, + "loss": 1.1492, + "step": 3144 + }, + { + "epoch": 0.9130497895195239, + "grad_norm": 3.544252634048462, + "learning_rate": 9.526691297213499e-06, + "loss": 1.2548, + "step": 3145 + }, + { + "epoch": 0.9133401074176223, + "grad_norm": 3.4180855751037598, + "learning_rate": 9.526283423895008e-06, + "loss": 1.3203, + "step": 3146 + }, + { + "epoch": 0.9136304253157207, + "grad_norm": 3.4566452503204346, + "learning_rate": 9.525875383648982e-06, + "loss": 1.1988, + "step": 3147 + }, + { + "epoch": 0.9139207432138191, + "grad_norm": 3.160930871963501, + "learning_rate": 9.525467176490467e-06, + "loss": 1.1696, + "step": 3148 + }, + { + "epoch": 0.9142110611119175, + "grad_norm": 3.328986167907715, + "learning_rate": 9.525058802434518e-06, + "loss": 1.2203, + "step": 3149 + }, + { + "epoch": 0.9145013790100159, + "grad_norm": 3.3570051193237305, + "learning_rate": 9.524650261496195e-06, + "loss": 1.1992, + "step": 3150 + }, + { + "epoch": 0.9147916969081143, + "grad_norm": 3.1143946647644043, + "learning_rate": 9.524241553690567e-06, + "loss": 1.0589, + "step": 3151 + }, + { + "epoch": 0.9150820148062128, + "grad_norm": 2.998553514480591, + "learning_rate": 9.523832679032705e-06, + "loss": 1.0533, + "step": 3152 + }, + { + "epoch": 0.9153723327043112, + "grad_norm": 3.413071632385254, + "learning_rate": 9.52342363753769e-06, + "loss": 1.2558, + "step": 3153 + }, + { + "epoch": 0.9156626506024096, + "grad_norm": 3.0415122509002686, + "learning_rate": 9.523014429220607e-06, + "loss": 1.1888, + "step": 3154 + }, + { + "epoch": 0.915952968500508, + "grad_norm": 3.035825490951538, + "learning_rate": 9.522605054096545e-06, + "loss": 1.018, + "step": 3155 + }, + { + "epoch": 
0.9162432863986065, + "grad_norm": 3.2089812755584717, + "learning_rate": 9.522195512180606e-06, + "loss": 1.1775, + "step": 3156 + }, + { + "epoch": 0.9165336042967049, + "grad_norm": 3.3788814544677734, + "learning_rate": 9.521785803487888e-06, + "loss": 1.1407, + "step": 3157 + }, + { + "epoch": 0.9168239221948034, + "grad_norm": 3.256770133972168, + "learning_rate": 9.521375928033505e-06, + "loss": 1.2715, + "step": 3158 + }, + { + "epoch": 0.9171142400929018, + "grad_norm": 3.437924861907959, + "learning_rate": 9.520965885832574e-06, + "loss": 1.1269, + "step": 3159 + }, + { + "epoch": 0.9174045579910002, + "grad_norm": 3.3418171405792236, + "learning_rate": 9.520555676900214e-06, + "loss": 1.1122, + "step": 3160 + }, + { + "epoch": 0.9176948758890986, + "grad_norm": 3.2611937522888184, + "learning_rate": 9.520145301251554e-06, + "loss": 1.0641, + "step": 3161 + }, + { + "epoch": 0.917985193787197, + "grad_norm": 3.1774210929870605, + "learning_rate": 9.519734758901728e-06, + "loss": 1.1638, + "step": 3162 + }, + { + "epoch": 0.9182755116852954, + "grad_norm": 3.2918379306793213, + "learning_rate": 9.51932404986588e-06, + "loss": 1.2033, + "step": 3163 + }, + { + "epoch": 0.9185658295833938, + "grad_norm": 3.268033981323242, + "learning_rate": 9.518913174159153e-06, + "loss": 1.0939, + "step": 3164 + }, + { + "epoch": 0.9188561474814922, + "grad_norm": 3.0575218200683594, + "learning_rate": 9.518502131796701e-06, + "loss": 1.0925, + "step": 3165 + }, + { + "epoch": 0.9191464653795907, + "grad_norm": 3.339613914489746, + "learning_rate": 9.518090922793685e-06, + "loss": 1.2114, + "step": 3166 + }, + { + "epoch": 0.9194367832776891, + "grad_norm": 3.2413666248321533, + "learning_rate": 9.517679547165269e-06, + "loss": 1.1209, + "step": 3167 + }, + { + "epoch": 0.9197271011757875, + "grad_norm": 3.4668829441070557, + "learning_rate": 9.517268004926622e-06, + "loss": 1.13, + "step": 3168 + }, + { + "epoch": 0.9200174190738859, + "grad_norm": 3.3018696308135986, + "learning_rate": 9.516856296092925e-06, + "loss": 1.2597, + "step": 3169 + }, + { + "epoch": 0.9203077369719843, + "grad_norm": 3.127471923828125, + "learning_rate": 9.51644442067936e-06, + "loss": 1.069, + "step": 3170 + }, + { + "epoch": 0.9205980548700827, + "grad_norm": 2.9845657348632812, + "learning_rate": 9.516032378701117e-06, + "loss": 1.1097, + "step": 3171 + }, + { + "epoch": 0.9208883727681811, + "grad_norm": 3.2858119010925293, + "learning_rate": 9.515620170173392e-06, + "loss": 1.2764, + "step": 3172 + }, + { + "epoch": 0.9211786906662796, + "grad_norm": 2.8209214210510254, + "learning_rate": 9.515207795111387e-06, + "loss": 0.9764, + "step": 3173 + }, + { + "epoch": 0.921469008564378, + "grad_norm": 3.091514825820923, + "learning_rate": 9.51479525353031e-06, + "loss": 1.2961, + "step": 3174 + }, + { + "epoch": 0.9217593264624764, + "grad_norm": 2.9070065021514893, + "learning_rate": 9.514382545445376e-06, + "loss": 1.292, + "step": 3175 + }, + { + "epoch": 0.9220496443605748, + "grad_norm": 3.108344316482544, + "learning_rate": 9.513969670871805e-06, + "loss": 1.1846, + "step": 3176 + }, + { + "epoch": 0.9223399622586732, + "grad_norm": 3.2052361965179443, + "learning_rate": 9.513556629824825e-06, + "loss": 1.2653, + "step": 3177 + }, + { + "epoch": 0.9226302801567716, + "grad_norm": 3.103595018386841, + "learning_rate": 9.513143422319667e-06, + "loss": 1.1459, + "step": 3178 + }, + { + "epoch": 0.92292059805487, + "grad_norm": 2.842895984649658, + "learning_rate": 9.51273004837157e-06, + "loss": 1.0839, + "step": 
3179 + }, + { + "epoch": 0.9232109159529684, + "grad_norm": 3.2208235263824463, + "learning_rate": 9.51231650799578e-06, + "loss": 1.1171, + "step": 3180 + }, + { + "epoch": 0.923501233851067, + "grad_norm": 2.9387643337249756, + "learning_rate": 9.511902801207548e-06, + "loss": 1.1748, + "step": 3181 + }, + { + "epoch": 0.9237915517491654, + "grad_norm": 3.3002710342407227, + "learning_rate": 9.51148892802213e-06, + "loss": 1.1812, + "step": 3182 + }, + { + "epoch": 0.9240818696472638, + "grad_norm": 3.609367847442627, + "learning_rate": 9.511074888454793e-06, + "loss": 1.1326, + "step": 3183 + }, + { + "epoch": 0.9243721875453622, + "grad_norm": 3.185091257095337, + "learning_rate": 9.510660682520803e-06, + "loss": 1.2802, + "step": 3184 + }, + { + "epoch": 0.9246625054434606, + "grad_norm": 3.3810675144195557, + "learning_rate": 9.510246310235438e-06, + "loss": 1.13, + "step": 3185 + }, + { + "epoch": 0.924952823341559, + "grad_norm": 2.905977725982666, + "learning_rate": 9.509831771613977e-06, + "loss": 0.9673, + "step": 3186 + }, + { + "epoch": 0.9252431412396575, + "grad_norm": 3.448277473449707, + "learning_rate": 9.50941706667171e-06, + "loss": 1.0962, + "step": 3187 + }, + { + "epoch": 0.9255334591377559, + "grad_norm": 3.034240961074829, + "learning_rate": 9.509002195423934e-06, + "loss": 1.1603, + "step": 3188 + }, + { + "epoch": 0.9258237770358543, + "grad_norm": 3.534836530685425, + "learning_rate": 9.508587157885944e-06, + "loss": 1.2476, + "step": 3189 + }, + { + "epoch": 0.9261140949339527, + "grad_norm": 3.2182629108428955, + "learning_rate": 9.508171954073049e-06, + "loss": 1.1697, + "step": 3190 + }, + { + "epoch": 0.9264044128320511, + "grad_norm": 3.3119056224823, + "learning_rate": 9.50775658400056e-06, + "loss": 1.1276, + "step": 3191 + }, + { + "epoch": 0.9266947307301495, + "grad_norm": 2.935210704803467, + "learning_rate": 9.5073410476838e-06, + "loss": 1.0133, + "step": 3192 + }, + { + "epoch": 0.9269850486282479, + "grad_norm": 2.970475912094116, + "learning_rate": 9.50692534513809e-06, + "loss": 1.3047, + "step": 3193 + }, + { + "epoch": 0.9272753665263463, + "grad_norm": 2.995439291000366, + "learning_rate": 9.50650947637876e-06, + "loss": 1.03, + "step": 3194 + }, + { + "epoch": 0.9275656844244448, + "grad_norm": 2.998599052429199, + "learning_rate": 9.50609344142115e-06, + "loss": 1.2295, + "step": 3195 + }, + { + "epoch": 0.9278560023225432, + "grad_norm": 3.299854040145874, + "learning_rate": 9.505677240280602e-06, + "loss": 1.2555, + "step": 3196 + }, + { + "epoch": 0.9281463202206416, + "grad_norm": 3.150684118270874, + "learning_rate": 9.505260872972466e-06, + "loss": 1.2473, + "step": 3197 + }, + { + "epoch": 0.92843663811874, + "grad_norm": 3.107889175415039, + "learning_rate": 9.504844339512096e-06, + "loss": 0.9754, + "step": 3198 + }, + { + "epoch": 0.9287269560168384, + "grad_norm": 3.0680747032165527, + "learning_rate": 9.504427639914856e-06, + "loss": 1.1238, + "step": 3199 + }, + { + "epoch": 0.9290172739149368, + "grad_norm": 3.120218276977539, + "learning_rate": 9.504010774196111e-06, + "loss": 1.1543, + "step": 3200 + }, + { + "epoch": 0.9293075918130352, + "grad_norm": 3.446390390396118, + "learning_rate": 9.503593742371236e-06, + "loss": 1.2022, + "step": 3201 + }, + { + "epoch": 0.9295979097111337, + "grad_norm": 3.453664541244507, + "learning_rate": 9.503176544455611e-06, + "loss": 1.2489, + "step": 3202 + }, + { + "epoch": 0.9298882276092321, + "grad_norm": 3.372509479522705, + "learning_rate": 9.502759180464621e-06, + "loss": 1.2709, + 
"step": 3203 + }, + { + "epoch": 0.9301785455073305, + "grad_norm": 3.100264072418213, + "learning_rate": 9.50234165041366e-06, + "loss": 1.1211, + "step": 3204 + }, + { + "epoch": 0.9304688634054289, + "grad_norm": 2.9130682945251465, + "learning_rate": 9.501923954318126e-06, + "loss": 1.0133, + "step": 3205 + }, + { + "epoch": 0.9307591813035274, + "grad_norm": 3.162043809890747, + "learning_rate": 9.501506092193424e-06, + "loss": 1.0223, + "step": 3206 + }, + { + "epoch": 0.9310494992016258, + "grad_norm": 3.3077001571655273, + "learning_rate": 9.501088064054963e-06, + "loss": 1.2443, + "step": 3207 + }, + { + "epoch": 0.9313398170997242, + "grad_norm": 3.330491781234741, + "learning_rate": 9.50066986991816e-06, + "loss": 1.128, + "step": 3208 + }, + { + "epoch": 0.9316301349978227, + "grad_norm": 3.372661828994751, + "learning_rate": 9.500251509798438e-06, + "loss": 1.3112, + "step": 3209 + }, + { + "epoch": 0.9319204528959211, + "grad_norm": 3.3673317432403564, + "learning_rate": 9.499832983711226e-06, + "loss": 1.2208, + "step": 3210 + }, + { + "epoch": 0.9322107707940195, + "grad_norm": 3.226531744003296, + "learning_rate": 9.499414291671961e-06, + "loss": 1.2343, + "step": 3211 + }, + { + "epoch": 0.9325010886921179, + "grad_norm": 3.247696876525879, + "learning_rate": 9.498995433696081e-06, + "loss": 1.1313, + "step": 3212 + }, + { + "epoch": 0.9327914065902163, + "grad_norm": 3.215843915939331, + "learning_rate": 9.498576409799034e-06, + "loss": 1.4321, + "step": 3213 + }, + { + "epoch": 0.9330817244883147, + "grad_norm": 3.0820136070251465, + "learning_rate": 9.498157219996275e-06, + "loss": 1.2786, + "step": 3214 + }, + { + "epoch": 0.9333720423864131, + "grad_norm": 3.309765100479126, + "learning_rate": 9.497737864303265e-06, + "loss": 1.1981, + "step": 3215 + }, + { + "epoch": 0.9336623602845115, + "grad_norm": 3.2941930294036865, + "learning_rate": 9.497318342735466e-06, + "loss": 1.1813, + "step": 3216 + }, + { + "epoch": 0.93395267818261, + "grad_norm": 3.4502313137054443, + "learning_rate": 9.49689865530835e-06, + "loss": 1.1647, + "step": 3217 + }, + { + "epoch": 0.9342429960807084, + "grad_norm": 3.085756778717041, + "learning_rate": 9.496478802037396e-06, + "loss": 1.1329, + "step": 3218 + }, + { + "epoch": 0.9345333139788068, + "grad_norm": 3.3223068714141846, + "learning_rate": 9.496058782938088e-06, + "loss": 1.2166, + "step": 3219 + }, + { + "epoch": 0.9348236318769052, + "grad_norm": 3.3261163234710693, + "learning_rate": 9.49563859802592e-06, + "loss": 1.1447, + "step": 3220 + }, + { + "epoch": 0.9351139497750036, + "grad_norm": 3.140730381011963, + "learning_rate": 9.495218247316381e-06, + "loss": 1.1553, + "step": 3221 + }, + { + "epoch": 0.935404267673102, + "grad_norm": 3.2012627124786377, + "learning_rate": 9.494797730824978e-06, + "loss": 1.0707, + "step": 3222 + }, + { + "epoch": 0.9356945855712004, + "grad_norm": 3.489518404006958, + "learning_rate": 9.494377048567218e-06, + "loss": 1.1577, + "step": 3223 + }, + { + "epoch": 0.9359849034692989, + "grad_norm": 3.089207172393799, + "learning_rate": 9.493956200558615e-06, + "loss": 1.202, + "step": 3224 + }, + { + "epoch": 0.9362752213673973, + "grad_norm": 3.1790692806243896, + "learning_rate": 9.493535186814693e-06, + "loss": 1.0798, + "step": 3225 + }, + { + "epoch": 0.9365655392654957, + "grad_norm": 3.196995496749878, + "learning_rate": 9.493114007350976e-06, + "loss": 1.1304, + "step": 3226 + }, + { + "epoch": 0.9368558571635941, + "grad_norm": 3.4054222106933594, + "learning_rate": 
9.492692662182997e-06, + "loss": 1.0787, + "step": 3227 + }, + { + "epoch": 0.9371461750616925, + "grad_norm": 3.0863044261932373, + "learning_rate": 9.492271151326295e-06, + "loss": 1.0259, + "step": 3228 + }, + { + "epoch": 0.9374364929597909, + "grad_norm": 3.0026841163635254, + "learning_rate": 9.491849474796416e-06, + "loss": 1.0096, + "step": 3229 + }, + { + "epoch": 0.9377268108578893, + "grad_norm": 2.935014247894287, + "learning_rate": 9.49142763260891e-06, + "loss": 1.1263, + "step": 3230 + }, + { + "epoch": 0.9380171287559879, + "grad_norm": 2.8023691177368164, + "learning_rate": 9.491005624779337e-06, + "loss": 1.0752, + "step": 3231 + }, + { + "epoch": 0.9383074466540863, + "grad_norm": 3.2768187522888184, + "learning_rate": 9.490583451323258e-06, + "loss": 1.2187, + "step": 3232 + }, + { + "epoch": 0.9385977645521847, + "grad_norm": 3.004180431365967, + "learning_rate": 9.490161112256242e-06, + "loss": 1.1065, + "step": 3233 + }, + { + "epoch": 0.9388880824502831, + "grad_norm": 3.0199663639068604, + "learning_rate": 9.489738607593867e-06, + "loss": 1.175, + "step": 3234 + }, + { + "epoch": 0.9391784003483815, + "grad_norm": 3.029003381729126, + "learning_rate": 9.489315937351715e-06, + "loss": 1.079, + "step": 3235 + }, + { + "epoch": 0.9394687182464799, + "grad_norm": 3.2275280952453613, + "learning_rate": 9.488893101545372e-06, + "loss": 1.1521, + "step": 3236 + }, + { + "epoch": 0.9397590361445783, + "grad_norm": 2.9786007404327393, + "learning_rate": 9.488470100190432e-06, + "loss": 1.0745, + "step": 3237 + }, + { + "epoch": 0.9400493540426768, + "grad_norm": 3.209221839904785, + "learning_rate": 9.488046933302498e-06, + "loss": 1.271, + "step": 3238 + }, + { + "epoch": 0.9403396719407752, + "grad_norm": 3.101375102996826, + "learning_rate": 9.487623600897172e-06, + "loss": 1.1747, + "step": 3239 + }, + { + "epoch": 0.9406299898388736, + "grad_norm": 3.2204413414001465, + "learning_rate": 9.487200102990068e-06, + "loss": 1.205, + "step": 3240 + }, + { + "epoch": 0.940920307736972, + "grad_norm": 3.0944347381591797, + "learning_rate": 9.486776439596808e-06, + "loss": 1.1888, + "step": 3241 + }, + { + "epoch": 0.9412106256350704, + "grad_norm": 3.1230151653289795, + "learning_rate": 9.48635261073301e-06, + "loss": 1.1052, + "step": 3242 + }, + { + "epoch": 0.9415009435331688, + "grad_norm": 3.4646267890930176, + "learning_rate": 9.48592861641431e-06, + "loss": 1.3186, + "step": 3243 + }, + { + "epoch": 0.9417912614312672, + "grad_norm": 3.0284507274627686, + "learning_rate": 9.485504456656343e-06, + "loss": 1.0032, + "step": 3244 + }, + { + "epoch": 0.9420815793293656, + "grad_norm": 2.971484899520874, + "learning_rate": 9.48508013147475e-06, + "loss": 1.0545, + "step": 3245 + }, + { + "epoch": 0.9423718972274641, + "grad_norm": 3.0329430103302, + "learning_rate": 9.484655640885183e-06, + "loss": 1.143, + "step": 3246 + }, + { + "epoch": 0.9426622151255625, + "grad_norm": 3.1452481746673584, + "learning_rate": 9.484230984903296e-06, + "loss": 1.2393, + "step": 3247 + }, + { + "epoch": 0.9429525330236609, + "grad_norm": 3.5928149223327637, + "learning_rate": 9.483806163544749e-06, + "loss": 1.1103, + "step": 3248 + }, + { + "epoch": 0.9432428509217593, + "grad_norm": 3.611189126968384, + "learning_rate": 9.48338117682521e-06, + "loss": 1.3648, + "step": 3249 + }, + { + "epoch": 0.9435331688198577, + "grad_norm": 3.1281070709228516, + "learning_rate": 9.482956024760352e-06, + "loss": 0.9971, + "step": 3250 + }, + { + "epoch": 0.9438234867179561, + "grad_norm": 
3.092606544494629, + "learning_rate": 9.482530707365856e-06, + "loss": 1.0551, + "step": 3251 + }, + { + "epoch": 0.9441138046160545, + "grad_norm": 3.4306132793426514, + "learning_rate": 9.482105224657406e-06, + "loss": 1.2839, + "step": 3252 + }, + { + "epoch": 0.944404122514153, + "grad_norm": 3.2871501445770264, + "learning_rate": 9.481679576650693e-06, + "loss": 1.0642, + "step": 3253 + }, + { + "epoch": 0.9446944404122514, + "grad_norm": 3.144798994064331, + "learning_rate": 9.481253763361415e-06, + "loss": 1.1322, + "step": 3254 + }, + { + "epoch": 0.9449847583103498, + "grad_norm": 4.029135227203369, + "learning_rate": 9.480827784805278e-06, + "loss": 1.1049, + "step": 3255 + }, + { + "epoch": 0.9452750762084483, + "grad_norm": 3.037443161010742, + "learning_rate": 9.480401640997991e-06, + "loss": 1.2186, + "step": 3256 + }, + { + "epoch": 0.9455653941065467, + "grad_norm": 3.2530734539031982, + "learning_rate": 9.479975331955269e-06, + "loss": 1.2415, + "step": 3257 + }, + { + "epoch": 0.9458557120046451, + "grad_norm": 3.5844802856445312, + "learning_rate": 9.479548857692836e-06, + "loss": 1.1883, + "step": 3258 + }, + { + "epoch": 0.9461460299027435, + "grad_norm": 2.8868770599365234, + "learning_rate": 9.479122218226415e-06, + "loss": 1.0488, + "step": 3259 + }, + { + "epoch": 0.946436347800842, + "grad_norm": 3.3103206157684326, + "learning_rate": 9.478695413571747e-06, + "loss": 1.2274, + "step": 3260 + }, + { + "epoch": 0.9467266656989404, + "grad_norm": 2.9122848510742188, + "learning_rate": 9.478268443744569e-06, + "loss": 1.0438, + "step": 3261 + }, + { + "epoch": 0.9470169835970388, + "grad_norm": 3.0058131217956543, + "learning_rate": 9.477841308760628e-06, + "loss": 1.027, + "step": 3262 + }, + { + "epoch": 0.9473073014951372, + "grad_norm": 2.9957618713378906, + "learning_rate": 9.477414008635675e-06, + "loss": 1.2333, + "step": 3263 + }, + { + "epoch": 0.9475976193932356, + "grad_norm": 3.0428504943847656, + "learning_rate": 9.476986543385472e-06, + "loss": 1.13, + "step": 3264 + }, + { + "epoch": 0.947887937291334, + "grad_norm": 2.8519036769866943, + "learning_rate": 9.47655891302578e-06, + "loss": 0.9563, + "step": 3265 + }, + { + "epoch": 0.9481782551894324, + "grad_norm": 2.8498032093048096, + "learning_rate": 9.476131117572373e-06, + "loss": 1.096, + "step": 3266 + }, + { + "epoch": 0.9484685730875309, + "grad_norm": 3.2216978073120117, + "learning_rate": 9.475703157041028e-06, + "loss": 1.2349, + "step": 3267 + }, + { + "epoch": 0.9487588909856293, + "grad_norm": 3.696192502975464, + "learning_rate": 9.475275031447525e-06, + "loss": 1.1619, + "step": 3268 + }, + { + "epoch": 0.9490492088837277, + "grad_norm": 3.411872625350952, + "learning_rate": 9.474846740807655e-06, + "loss": 1.1287, + "step": 3269 + }, + { + "epoch": 0.9493395267818261, + "grad_norm": 3.1810708045959473, + "learning_rate": 9.474418285137214e-06, + "loss": 1.1311, + "step": 3270 + }, + { + "epoch": 0.9496298446799245, + "grad_norm": 3.444535255432129, + "learning_rate": 9.473989664452001e-06, + "loss": 1.1452, + "step": 3271 + }, + { + "epoch": 0.9499201625780229, + "grad_norm": 3.02544903755188, + "learning_rate": 9.473560878767825e-06, + "loss": 1.1944, + "step": 3272 + }, + { + "epoch": 0.9502104804761213, + "grad_norm": 2.964012384414673, + "learning_rate": 9.4731319281005e-06, + "loss": 1.1086, + "step": 3273 + }, + { + "epoch": 0.9505007983742197, + "grad_norm": 3.4347403049468994, + "learning_rate": 9.472702812465843e-06, + "loss": 1.2453, + "step": 3274 + }, + { + "epoch": 
0.9507911162723182, + "grad_norm": 3.0634796619415283, + "learning_rate": 9.47227353187968e-06, + "loss": 1.0154, + "step": 3275 + }, + { + "epoch": 0.9510814341704166, + "grad_norm": 3.2538411617279053, + "learning_rate": 9.471844086357848e-06, + "loss": 1.1605, + "step": 3276 + }, + { + "epoch": 0.951371752068515, + "grad_norm": 2.976386547088623, + "learning_rate": 9.471414475916179e-06, + "loss": 1.0983, + "step": 3277 + }, + { + "epoch": 0.9516620699666134, + "grad_norm": 3.2437491416931152, + "learning_rate": 9.470984700570518e-06, + "loss": 1.1463, + "step": 3278 + }, + { + "epoch": 0.9519523878647118, + "grad_norm": 3.283535957336426, + "learning_rate": 9.470554760336714e-06, + "loss": 1.0749, + "step": 3279 + }, + { + "epoch": 0.9522427057628102, + "grad_norm": 3.1635475158691406, + "learning_rate": 9.470124655230627e-06, + "loss": 1.1702, + "step": 3280 + }, + { + "epoch": 0.9525330236609086, + "grad_norm": 3.6238670349121094, + "learning_rate": 9.469694385268115e-06, + "loss": 1.2376, + "step": 3281 + }, + { + "epoch": 0.9528233415590072, + "grad_norm": 3.029278516769409, + "learning_rate": 9.469263950465048e-06, + "loss": 1.1066, + "step": 3282 + }, + { + "epoch": 0.9531136594571056, + "grad_norm": 2.8746628761291504, + "learning_rate": 9.468833350837301e-06, + "loss": 1.0827, + "step": 3283 + }, + { + "epoch": 0.953403977355204, + "grad_norm": 2.8631439208984375, + "learning_rate": 9.468402586400753e-06, + "loss": 0.8597, + "step": 3284 + }, + { + "epoch": 0.9536942952533024, + "grad_norm": 3.1171255111694336, + "learning_rate": 9.467971657171292e-06, + "loss": 1.086, + "step": 3285 + }, + { + "epoch": 0.9539846131514008, + "grad_norm": 3.133019208908081, + "learning_rate": 9.467540563164808e-06, + "loss": 1.1201, + "step": 3286 + }, + { + "epoch": 0.9542749310494992, + "grad_norm": 3.1883506774902344, + "learning_rate": 9.467109304397201e-06, + "loss": 1.1701, + "step": 3287 + }, + { + "epoch": 0.9545652489475976, + "grad_norm": 3.2414369583129883, + "learning_rate": 9.466677880884376e-06, + "loss": 1.1613, + "step": 3288 + }, + { + "epoch": 0.9548555668456961, + "grad_norm": 2.8469996452331543, + "learning_rate": 9.466246292642243e-06, + "loss": 0.9667, + "step": 3289 + }, + { + "epoch": 0.9551458847437945, + "grad_norm": 3.1720969676971436, + "learning_rate": 9.465814539686719e-06, + "loss": 1.1769, + "step": 3290 + }, + { + "epoch": 0.9554362026418929, + "grad_norm": 3.1476361751556396, + "learning_rate": 9.465382622033727e-06, + "loss": 1.2384, + "step": 3291 + }, + { + "epoch": 0.9557265205399913, + "grad_norm": 3.4708709716796875, + "learning_rate": 9.464950539699195e-06, + "loss": 1.4053, + "step": 3292 + }, + { + "epoch": 0.9560168384380897, + "grad_norm": 3.2307510375976562, + "learning_rate": 9.46451829269906e-06, + "loss": 1.0809, + "step": 3293 + }, + { + "epoch": 0.9563071563361881, + "grad_norm": 3.331270933151245, + "learning_rate": 9.464085881049262e-06, + "loss": 1.1588, + "step": 3294 + }, + { + "epoch": 0.9565974742342865, + "grad_norm": 3.047401189804077, + "learning_rate": 9.46365330476575e-06, + "loss": 1.2303, + "step": 3295 + }, + { + "epoch": 0.956887792132385, + "grad_norm": 2.589224338531494, + "learning_rate": 9.463220563864474e-06, + "loss": 0.9973, + "step": 3296 + }, + { + "epoch": 0.9571781100304834, + "grad_norm": 3.296471357345581, + "learning_rate": 9.462787658361394e-06, + "loss": 1.2449, + "step": 3297 + }, + { + "epoch": 0.9574684279285818, + "grad_norm": 3.164555788040161, + "learning_rate": 9.462354588272478e-06, + "loss": 1.1311, + 
"step": 3298 + }, + { + "epoch": 0.9577587458266802, + "grad_norm": 3.3225278854370117, + "learning_rate": 9.461921353613693e-06, + "loss": 1.2072, + "step": 3299 + }, + { + "epoch": 0.9580490637247786, + "grad_norm": 3.135514259338379, + "learning_rate": 9.461487954401021e-06, + "loss": 1.0418, + "step": 3300 + }, + { + "epoch": 0.958339381622877, + "grad_norm": 3.0921425819396973, + "learning_rate": 9.461054390650444e-06, + "loss": 1.2124, + "step": 3301 + }, + { + "epoch": 0.9586296995209754, + "grad_norm": 3.197275161743164, + "learning_rate": 9.460620662377949e-06, + "loss": 1.2466, + "step": 3302 + }, + { + "epoch": 0.9589200174190738, + "grad_norm": 3.615117311477661, + "learning_rate": 9.460186769599536e-06, + "loss": 1.239, + "step": 3303 + }, + { + "epoch": 0.9592103353171723, + "grad_norm": 3.303147077560425, + "learning_rate": 9.459752712331204e-06, + "loss": 1.2606, + "step": 3304 + }, + { + "epoch": 0.9595006532152707, + "grad_norm": 3.386007308959961, + "learning_rate": 9.459318490588964e-06, + "loss": 1.2938, + "step": 3305 + }, + { + "epoch": 0.9597909711133691, + "grad_norm": 3.0497190952301025, + "learning_rate": 9.458884104388826e-06, + "loss": 1.1553, + "step": 3306 + }, + { + "epoch": 0.9600812890114676, + "grad_norm": 2.7740349769592285, + "learning_rate": 9.458449553746812e-06, + "loss": 1.05, + "step": 3307 + }, + { + "epoch": 0.960371606909566, + "grad_norm": 3.255222797393799, + "learning_rate": 9.458014838678946e-06, + "loss": 0.9898, + "step": 3308 + }, + { + "epoch": 0.9606619248076644, + "grad_norm": 2.9783425331115723, + "learning_rate": 9.457579959201263e-06, + "loss": 1.0716, + "step": 3309 + }, + { + "epoch": 0.9609522427057628, + "grad_norm": 3.041851043701172, + "learning_rate": 9.457144915329802e-06, + "loss": 1.1695, + "step": 3310 + }, + { + "epoch": 0.9612425606038613, + "grad_norm": 3.023836851119995, + "learning_rate": 9.456709707080602e-06, + "loss": 1.0672, + "step": 3311 + }, + { + "epoch": 0.9615328785019597, + "grad_norm": 2.8885133266448975, + "learning_rate": 9.45627433446972e-06, + "loss": 1.0908, + "step": 3312 + }, + { + "epoch": 0.9618231964000581, + "grad_norm": 3.162452459335327, + "learning_rate": 9.455838797513206e-06, + "loss": 1.0913, + "step": 3313 + }, + { + "epoch": 0.9621135142981565, + "grad_norm": 3.567873239517212, + "learning_rate": 9.455403096227126e-06, + "loss": 1.2009, + "step": 3314 + }, + { + "epoch": 0.9624038321962549, + "grad_norm": 2.9521007537841797, + "learning_rate": 9.454967230627549e-06, + "loss": 1.0564, + "step": 3315 + }, + { + "epoch": 0.9626941500943533, + "grad_norm": 3.264430284500122, + "learning_rate": 9.45453120073055e-06, + "loss": 1.167, + "step": 3316 + }, + { + "epoch": 0.9629844679924517, + "grad_norm": 3.638040065765381, + "learning_rate": 9.454095006552204e-06, + "loss": 1.2732, + "step": 3317 + }, + { + "epoch": 0.9632747858905502, + "grad_norm": 3.109283208847046, + "learning_rate": 9.453658648108604e-06, + "loss": 1.0722, + "step": 3318 + }, + { + "epoch": 0.9635651037886486, + "grad_norm": 3.268758535385132, + "learning_rate": 9.45322212541584e-06, + "loss": 1.2237, + "step": 3319 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 3.297163963317871, + "learning_rate": 9.452785438490011e-06, + "loss": 1.2266, + "step": 3320 + }, + { + "epoch": 0.9641457395848454, + "grad_norm": 3.4363367557525635, + "learning_rate": 9.452348587347224e-06, + "loss": 1.3593, + "step": 3321 + }, + { + "epoch": 0.9644360574829438, + "grad_norm": 2.6215686798095703, + "learning_rate": 9.451911572003586e-06, 
+ "loss": 1.0826, + "step": 3322 + }, + { + "epoch": 0.9647263753810422, + "grad_norm": 3.397822380065918, + "learning_rate": 9.451474392475216e-06, + "loss": 1.1542, + "step": 3323 + }, + { + "epoch": 0.9650166932791406, + "grad_norm": 3.1584107875823975, + "learning_rate": 9.451037048778238e-06, + "loss": 1.113, + "step": 3324 + }, + { + "epoch": 0.965307011177239, + "grad_norm": 3.2262637615203857, + "learning_rate": 9.450599540928779e-06, + "loss": 1.1506, + "step": 3325 + }, + { + "epoch": 0.9655973290753375, + "grad_norm": 3.163564443588257, + "learning_rate": 9.450161868942975e-06, + "loss": 1.2236, + "step": 3326 + }, + { + "epoch": 0.9658876469734359, + "grad_norm": 3.1902246475219727, + "learning_rate": 9.449724032836968e-06, + "loss": 1.2597, + "step": 3327 + }, + { + "epoch": 0.9661779648715343, + "grad_norm": 3.5280227661132812, + "learning_rate": 9.449286032626904e-06, + "loss": 1.2247, + "step": 3328 + }, + { + "epoch": 0.9664682827696327, + "grad_norm": 3.1843626499176025, + "learning_rate": 9.448847868328936e-06, + "loss": 1.0195, + "step": 3329 + }, + { + "epoch": 0.9667586006677311, + "grad_norm": 3.1920642852783203, + "learning_rate": 9.448409539959225e-06, + "loss": 1.1452, + "step": 3330 + }, + { + "epoch": 0.9670489185658295, + "grad_norm": 4.158785343170166, + "learning_rate": 9.447971047533936e-06, + "loss": 1.2936, + "step": 3331 + }, + { + "epoch": 0.967339236463928, + "grad_norm": 3.061877727508545, + "learning_rate": 9.447532391069238e-06, + "loss": 1.1663, + "step": 3332 + }, + { + "epoch": 0.9676295543620265, + "grad_norm": 2.6941730976104736, + "learning_rate": 9.447093570581313e-06, + "loss": 0.9278, + "step": 3333 + }, + { + "epoch": 0.9679198722601249, + "grad_norm": 3.301288366317749, + "learning_rate": 9.44665458608634e-06, + "loss": 1.1854, + "step": 3334 + }, + { + "epoch": 0.9682101901582233, + "grad_norm": 3.001420021057129, + "learning_rate": 9.446215437600511e-06, + "loss": 1.0494, + "step": 3335 + }, + { + "epoch": 0.9685005080563217, + "grad_norm": 3.054023504257202, + "learning_rate": 9.44577612514002e-06, + "loss": 1.0131, + "step": 3336 + }, + { + "epoch": 0.9687908259544201, + "grad_norm": 3.395092010498047, + "learning_rate": 9.445336648721073e-06, + "loss": 1.2864, + "step": 3337 + }, + { + "epoch": 0.9690811438525185, + "grad_norm": 3.0327727794647217, + "learning_rate": 9.444897008359871e-06, + "loss": 1.0428, + "step": 3338 + }, + { + "epoch": 0.969371461750617, + "grad_norm": 2.8564300537109375, + "learning_rate": 9.444457204072632e-06, + "loss": 1.0468, + "step": 3339 + }, + { + "epoch": 0.9696617796487154, + "grad_norm": 3.084829330444336, + "learning_rate": 9.444017235875577e-06, + "loss": 1.0957, + "step": 3340 + }, + { + "epoch": 0.9699520975468138, + "grad_norm": 2.857167959213257, + "learning_rate": 9.443577103784927e-06, + "loss": 0.9776, + "step": 3341 + }, + { + "epoch": 0.9702424154449122, + "grad_norm": 2.935952663421631, + "learning_rate": 9.443136807816919e-06, + "loss": 1.1364, + "step": 3342 + }, + { + "epoch": 0.9705327333430106, + "grad_norm": 3.175546884536743, + "learning_rate": 9.442696347987787e-06, + "loss": 1.1864, + "step": 3343 + }, + { + "epoch": 0.970823051241109, + "grad_norm": 3.243807315826416, + "learning_rate": 9.442255724313778e-06, + "loss": 1.1785, + "step": 3344 + }, + { + "epoch": 0.9711133691392074, + "grad_norm": 2.8106155395507812, + "learning_rate": 9.441814936811142e-06, + "loss": 0.9373, + "step": 3345 + }, + { + "epoch": 0.9714036870373058, + "grad_norm": 3.255561113357544, + 
"learning_rate": 9.441373985496133e-06, + "loss": 1.0555, + "step": 3346 + }, + { + "epoch": 0.9716940049354043, + "grad_norm": 3.7151269912719727, + "learning_rate": 9.440932870385011e-06, + "loss": 1.3468, + "step": 3347 + }, + { + "epoch": 0.9719843228335027, + "grad_norm": 3.180406093597412, + "learning_rate": 9.44049159149405e-06, + "loss": 1.0678, + "step": 3348 + }, + { + "epoch": 0.9722746407316011, + "grad_norm": 3.1511247158050537, + "learning_rate": 9.440050148839521e-06, + "loss": 1.1926, + "step": 3349 + }, + { + "epoch": 0.9725649586296995, + "grad_norm": 3.3239786624908447, + "learning_rate": 9.439608542437704e-06, + "loss": 1.0599, + "step": 3350 + }, + { + "epoch": 0.9728552765277979, + "grad_norm": 3.1429460048675537, + "learning_rate": 9.439166772304886e-06, + "loss": 1.1076, + "step": 3351 + }, + { + "epoch": 0.9731455944258963, + "grad_norm": 3.073305368423462, + "learning_rate": 9.438724838457358e-06, + "loss": 1.0712, + "step": 3352 + }, + { + "epoch": 0.9734359123239947, + "grad_norm": 3.4165472984313965, + "learning_rate": 9.438282740911421e-06, + "loss": 1.1699, + "step": 3353 + }, + { + "epoch": 0.9737262302220931, + "grad_norm": 3.339623212814331, + "learning_rate": 9.437840479683377e-06, + "loss": 1.1977, + "step": 3354 + }, + { + "epoch": 0.9740165481201916, + "grad_norm": 2.8841733932495117, + "learning_rate": 9.437398054789537e-06, + "loss": 1.1156, + "step": 3355 + }, + { + "epoch": 0.97430686601829, + "grad_norm": 3.360177516937256, + "learning_rate": 9.436955466246218e-06, + "loss": 1.1148, + "step": 3356 + }, + { + "epoch": 0.9745971839163885, + "grad_norm": 3.3974556922912598, + "learning_rate": 9.436512714069742e-06, + "loss": 1.2665, + "step": 3357 + }, + { + "epoch": 0.9748875018144869, + "grad_norm": 2.819671154022217, + "learning_rate": 9.436069798276438e-06, + "loss": 1.1152, + "step": 3358 + }, + { + "epoch": 0.9751778197125853, + "grad_norm": 3.1836605072021484, + "learning_rate": 9.43562671888264e-06, + "loss": 1.2375, + "step": 3359 + }, + { + "epoch": 0.9754681376106837, + "grad_norm": 3.028640031814575, + "learning_rate": 9.435183475904688e-06, + "loss": 1.1392, + "step": 3360 + }, + { + "epoch": 0.9757584555087822, + "grad_norm": 3.1864209175109863, + "learning_rate": 9.434740069358931e-06, + "loss": 1.217, + "step": 3361 + }, + { + "epoch": 0.9760487734068806, + "grad_norm": 2.9835257530212402, + "learning_rate": 9.434296499261719e-06, + "loss": 1.0562, + "step": 3362 + }, + { + "epoch": 0.976339091304979, + "grad_norm": 3.0759634971618652, + "learning_rate": 9.433852765629412e-06, + "loss": 1.1193, + "step": 3363 + }, + { + "epoch": 0.9766294092030774, + "grad_norm": 3.088196277618408, + "learning_rate": 9.433408868478375e-06, + "loss": 1.1732, + "step": 3364 + }, + { + "epoch": 0.9769197271011758, + "grad_norm": 2.7689590454101562, + "learning_rate": 9.432964807824979e-06, + "loss": 1.0004, + "step": 3365 + }, + { + "epoch": 0.9772100449992742, + "grad_norm": 2.8842358589172363, + "learning_rate": 9.432520583685597e-06, + "loss": 1.0616, + "step": 3366 + }, + { + "epoch": 0.9775003628973726, + "grad_norm": 3.2516438961029053, + "learning_rate": 9.432076196076618e-06, + "loss": 1.1702, + "step": 3367 + }, + { + "epoch": 0.977790680795471, + "grad_norm": 2.9004099369049072, + "learning_rate": 9.431631645014427e-06, + "loss": 1.0521, + "step": 3368 + }, + { + "epoch": 0.9780809986935695, + "grad_norm": 3.162397861480713, + "learning_rate": 9.431186930515419e-06, + "loss": 1.1376, + "step": 3369 + }, + { + "epoch": 0.9783713165916679, + 
"grad_norm": 3.3717830181121826, + "learning_rate": 9.430742052595995e-06, + "loss": 1.1725, + "step": 3370 + }, + { + "epoch": 0.9786616344897663, + "grad_norm": 3.5331950187683105, + "learning_rate": 9.430297011272564e-06, + "loss": 1.2318, + "step": 3371 + }, + { + "epoch": 0.9789519523878647, + "grad_norm": 3.0815625190734863, + "learning_rate": 9.429851806561537e-06, + "loss": 0.9662, + "step": 3372 + }, + { + "epoch": 0.9792422702859631, + "grad_norm": 3.1928114891052246, + "learning_rate": 9.429406438479332e-06, + "loss": 1.2074, + "step": 3373 + }, + { + "epoch": 0.9795325881840615, + "grad_norm": 3.0204803943634033, + "learning_rate": 9.428960907042377e-06, + "loss": 1.0493, + "step": 3374 + }, + { + "epoch": 0.9798229060821599, + "grad_norm": 3.266531467437744, + "learning_rate": 9.4285152122671e-06, + "loss": 1.2921, + "step": 3375 + }, + { + "epoch": 0.9801132239802584, + "grad_norm": 2.9223287105560303, + "learning_rate": 9.42806935416994e-06, + "loss": 1.0824, + "step": 3376 + }, + { + "epoch": 0.9804035418783568, + "grad_norm": 3.335517168045044, + "learning_rate": 9.427623332767338e-06, + "loss": 1.3236, + "step": 3377 + }, + { + "epoch": 0.9806938597764552, + "grad_norm": 3.223524332046509, + "learning_rate": 9.427177148075746e-06, + "loss": 1.2141, + "step": 3378 + }, + { + "epoch": 0.9809841776745536, + "grad_norm": 3.1920454502105713, + "learning_rate": 9.426730800111618e-06, + "loss": 1.0862, + "step": 3379 + }, + { + "epoch": 0.981274495572652, + "grad_norm": 3.0022921562194824, + "learning_rate": 9.426284288891415e-06, + "loss": 1.1349, + "step": 3380 + }, + { + "epoch": 0.9815648134707504, + "grad_norm": 3.008728265762329, + "learning_rate": 9.425837614431601e-06, + "loss": 1.1163, + "step": 3381 + }, + { + "epoch": 0.9818551313688489, + "grad_norm": 2.845618724822998, + "learning_rate": 9.425390776748656e-06, + "loss": 1.0241, + "step": 3382 + }, + { + "epoch": 0.9821454492669474, + "grad_norm": 3.227717876434326, + "learning_rate": 9.424943775859052e-06, + "loss": 1.1405, + "step": 3383 + }, + { + "epoch": 0.9824357671650458, + "grad_norm": 3.4967589378356934, + "learning_rate": 9.424496611779279e-06, + "loss": 1.3153, + "step": 3384 + }, + { + "epoch": 0.9827260850631442, + "grad_norm": 3.4529168605804443, + "learning_rate": 9.424049284525827e-06, + "loss": 1.2027, + "step": 3385 + }, + { + "epoch": 0.9830164029612426, + "grad_norm": 3.211639404296875, + "learning_rate": 9.423601794115194e-06, + "loss": 1.0941, + "step": 3386 + }, + { + "epoch": 0.983306720859341, + "grad_norm": 3.719665765762329, + "learning_rate": 9.42315414056388e-06, + "loss": 1.332, + "step": 3387 + }, + { + "epoch": 0.9835970387574394, + "grad_norm": 3.154254674911499, + "learning_rate": 9.422706323888398e-06, + "loss": 1.1848, + "step": 3388 + }, + { + "epoch": 0.9838873566555378, + "grad_norm": 3.1426172256469727, + "learning_rate": 9.422258344105263e-06, + "loss": 1.1643, + "step": 3389 + }, + { + "epoch": 0.9841776745536363, + "grad_norm": 3.4022419452667236, + "learning_rate": 9.421810201230992e-06, + "loss": 1.3219, + "step": 3390 + }, + { + "epoch": 0.9844679924517347, + "grad_norm": 3.381171464920044, + "learning_rate": 9.421361895282117e-06, + "loss": 1.3257, + "step": 3391 + }, + { + "epoch": 0.9847583103498331, + "grad_norm": 3.2930431365966797, + "learning_rate": 9.42091342627517e-06, + "loss": 1.096, + "step": 3392 + }, + { + "epoch": 0.9850486282479315, + "grad_norm": 3.0404651165008545, + "learning_rate": 9.420464794226691e-06, + "loss": 1.1944, + "step": 3393 + }, + { + 
"epoch": 0.9853389461460299, + "grad_norm": 3.1677844524383545, + "learning_rate": 9.420015999153225e-06, + "loss": 1.1356, + "step": 3394 + }, + { + "epoch": 0.9856292640441283, + "grad_norm": 3.403318166732788, + "learning_rate": 9.41956704107132e-06, + "loss": 1.0833, + "step": 3395 + }, + { + "epoch": 0.9859195819422267, + "grad_norm": 3.0987493991851807, + "learning_rate": 9.419117919997538e-06, + "loss": 1.106, + "step": 3396 + }, + { + "epoch": 0.9862098998403251, + "grad_norm": 3.006129503250122, + "learning_rate": 9.418668635948443e-06, + "loss": 1.0986, + "step": 3397 + }, + { + "epoch": 0.9865002177384236, + "grad_norm": 3.6161084175109863, + "learning_rate": 9.4182191889406e-06, + "loss": 1.3971, + "step": 3398 + }, + { + "epoch": 0.986790535636522, + "grad_norm": 3.079556465148926, + "learning_rate": 9.417769578990586e-06, + "loss": 1.0629, + "step": 3399 + }, + { + "epoch": 0.9870808535346204, + "grad_norm": 3.1218533515930176, + "learning_rate": 9.417319806114984e-06, + "loss": 1.1182, + "step": 3400 + }, + { + "epoch": 0.9873711714327188, + "grad_norm": 2.991771697998047, + "learning_rate": 9.41686987033038e-06, + "loss": 1.0839, + "step": 3401 + }, + { + "epoch": 0.9876614893308172, + "grad_norm": 3.7504146099090576, + "learning_rate": 9.416419771653368e-06, + "loss": 1.4415, + "step": 3402 + }, + { + "epoch": 0.9879518072289156, + "grad_norm": 3.217874526977539, + "learning_rate": 9.415969510100549e-06, + "loss": 1.2136, + "step": 3403 + }, + { + "epoch": 0.988242125127014, + "grad_norm": 3.183932304382324, + "learning_rate": 9.415519085688526e-06, + "loss": 1.0926, + "step": 3404 + }, + { + "epoch": 0.9885324430251125, + "grad_norm": 3.3624684810638428, + "learning_rate": 9.415068498433912e-06, + "loss": 1.1281, + "step": 3405 + }, + { + "epoch": 0.9888227609232109, + "grad_norm": 3.2152488231658936, + "learning_rate": 9.414617748353324e-06, + "loss": 1.2438, + "step": 3406 + }, + { + "epoch": 0.9891130788213094, + "grad_norm": 3.27553391456604, + "learning_rate": 9.414166835463383e-06, + "loss": 1.128, + "step": 3407 + }, + { + "epoch": 0.9894033967194078, + "grad_norm": 3.2097506523132324, + "learning_rate": 9.413715759780722e-06, + "loss": 1.1601, + "step": 3408 + }, + { + "epoch": 0.9896937146175062, + "grad_norm": 3.083144187927246, + "learning_rate": 9.413264521321976e-06, + "loss": 1.0782, + "step": 3409 + }, + { + "epoch": 0.9899840325156046, + "grad_norm": 3.3622589111328125, + "learning_rate": 9.412813120103786e-06, + "loss": 1.1783, + "step": 3410 + }, + { + "epoch": 0.990274350413703, + "grad_norm": 3.3557496070861816, + "learning_rate": 9.412361556142797e-06, + "loss": 1.2824, + "step": 3411 + }, + { + "epoch": 0.9905646683118015, + "grad_norm": 3.4692952632904053, + "learning_rate": 9.411909829455667e-06, + "loss": 1.2376, + "step": 3412 + }, + { + "epoch": 0.9908549862098999, + "grad_norm": 2.9737493991851807, + "learning_rate": 9.411457940059053e-06, + "loss": 0.9969, + "step": 3413 + }, + { + "epoch": 0.9911453041079983, + "grad_norm": 3.2683541774749756, + "learning_rate": 9.41100588796962e-06, + "loss": 1.172, + "step": 3414 + }, + { + "epoch": 0.9914356220060967, + "grad_norm": 2.798372268676758, + "learning_rate": 9.41055367320404e-06, + "loss": 1.0958, + "step": 3415 + }, + { + "epoch": 0.9917259399041951, + "grad_norm": 3.1530799865722656, + "learning_rate": 9.410101295778992e-06, + "loss": 1.1092, + "step": 3416 + }, + { + "epoch": 0.9920162578022935, + "grad_norm": 3.589674711227417, + "learning_rate": 9.409648755711157e-06, + "loss": 1.4038, + 
"step": 3417 + }, + { + "epoch": 0.9923065757003919, + "grad_norm": 2.7075817584991455, + "learning_rate": 9.409196053017227e-06, + "loss": 1.0471, + "step": 3418 + }, + { + "epoch": 0.9925968935984903, + "grad_norm": 3.057220697402954, + "learning_rate": 9.408743187713895e-06, + "loss": 1.1861, + "step": 3419 + }, + { + "epoch": 0.9928872114965888, + "grad_norm": 2.9704697132110596, + "learning_rate": 9.408290159817865e-06, + "loss": 1.1141, + "step": 3420 + }, + { + "epoch": 0.9931775293946872, + "grad_norm": 3.118169069290161, + "learning_rate": 9.407836969345845e-06, + "loss": 1.0851, + "step": 3421 + }, + { + "epoch": 0.9934678472927856, + "grad_norm": 2.885435104370117, + "learning_rate": 9.407383616314545e-06, + "loss": 1.0472, + "step": 3422 + }, + { + "epoch": 0.993758165190884, + "grad_norm": 3.142916202545166, + "learning_rate": 9.406930100740686e-06, + "loss": 1.0709, + "step": 3423 + }, + { + "epoch": 0.9940484830889824, + "grad_norm": 3.0547609329223633, + "learning_rate": 9.406476422640994e-06, + "loss": 1.1419, + "step": 3424 + }, + { + "epoch": 0.9943388009870808, + "grad_norm": 3.3543431758880615, + "learning_rate": 9.4060225820322e-06, + "loss": 1.1565, + "step": 3425 + }, + { + "epoch": 0.9946291188851792, + "grad_norm": 3.0204522609710693, + "learning_rate": 9.405568578931042e-06, + "loss": 1.2616, + "step": 3426 + }, + { + "epoch": 0.9949194367832777, + "grad_norm": 3.07812237739563, + "learning_rate": 9.405114413354261e-06, + "loss": 1.0725, + "step": 3427 + }, + { + "epoch": 0.9952097546813761, + "grad_norm": 2.8966448307037354, + "learning_rate": 9.40466008531861e-06, + "loss": 1.0558, + "step": 3428 + }, + { + "epoch": 0.9955000725794745, + "grad_norm": 3.6422510147094727, + "learning_rate": 9.404205594840843e-06, + "loss": 1.0604, + "step": 3429 + }, + { + "epoch": 0.9957903904775729, + "grad_norm": 3.1371798515319824, + "learning_rate": 9.403750941937723e-06, + "loss": 1.0434, + "step": 3430 + }, + { + "epoch": 0.9960807083756713, + "grad_norm": 3.1310348510742188, + "learning_rate": 9.403296126626014e-06, + "loss": 1.0345, + "step": 3431 + }, + { + "epoch": 0.9963710262737698, + "grad_norm": 3.1864089965820312, + "learning_rate": 9.402841148922493e-06, + "loss": 1.1211, + "step": 3432 + }, + { + "epoch": 0.9966613441718682, + "grad_norm": 3.2112019062042236, + "learning_rate": 9.402386008843935e-06, + "loss": 1.0529, + "step": 3433 + }, + { + "epoch": 0.9969516620699667, + "grad_norm": 3.1958372592926025, + "learning_rate": 9.401930706407129e-06, + "loss": 1.1574, + "step": 3434 + }, + { + "epoch": 0.9972419799680651, + "grad_norm": 3.1686742305755615, + "learning_rate": 9.401475241628867e-06, + "loss": 0.9665, + "step": 3435 + }, + { + "epoch": 0.9975322978661635, + "grad_norm": 2.843740701675415, + "learning_rate": 9.401019614525944e-06, + "loss": 1.0863, + "step": 3436 + }, + { + "epoch": 0.9978226157642619, + "grad_norm": 2.8418521881103516, + "learning_rate": 9.400563825115163e-06, + "loss": 1.0813, + "step": 3437 + }, + { + "epoch": 0.9981129336623603, + "grad_norm": 3.322758913040161, + "learning_rate": 9.400107873413335e-06, + "loss": 1.0213, + "step": 3438 + }, + { + "epoch": 0.9984032515604587, + "grad_norm": 3.388033866882324, + "learning_rate": 9.399651759437276e-06, + "loss": 1.14, + "step": 3439 + }, + { + "epoch": 0.9986935694585571, + "grad_norm": 3.383345127105713, + "learning_rate": 9.399195483203805e-06, + "loss": 1.2244, + "step": 3440 + }, + { + "epoch": 0.9989838873566556, + "grad_norm": 3.070141315460205, + "learning_rate": 
9.39873904472975e-06, + "loss": 1.1552, + "step": 3441 + }, + { + "epoch": 0.999274205254754, + "grad_norm": 3.090776205062866, + "learning_rate": 9.398282444031944e-06, + "loss": 1.1257, + "step": 3442 + }, + { + "epoch": 0.9995645231528524, + "grad_norm": 3.1344099044799805, + "learning_rate": 9.397825681127228e-06, + "loss": 1.278, + "step": 3443 + }, + { + "epoch": 0.9998548410509508, + "grad_norm": 2.9550633430480957, + "learning_rate": 9.397368756032445e-06, + "loss": 1.0698, + "step": 3444 + }, + { + "epoch": 1.0001451589490493, + "grad_norm": 3.0842957496643066, + "learning_rate": 9.39691166876445e-06, + "loss": 1.1084, + "step": 3445 + }, + { + "epoch": 1.0004354768471477, + "grad_norm": 2.8712656497955322, + "learning_rate": 9.396454419340096e-06, + "loss": 0.7726, + "step": 3446 + }, + { + "epoch": 1.0007257947452461, + "grad_norm": 2.9478588104248047, + "learning_rate": 9.395997007776247e-06, + "loss": 0.8716, + "step": 3447 + }, + { + "epoch": 1.0010161126433446, + "grad_norm": 3.1845040321350098, + "learning_rate": 9.395539434089773e-06, + "loss": 0.8577, + "step": 3448 + }, + { + "epoch": 1.001306430541443, + "grad_norm": 2.5706589221954346, + "learning_rate": 9.395081698297549e-06, + "loss": 0.6979, + "step": 3449 + }, + { + "epoch": 1.0015967484395414, + "grad_norm": 3.145312786102295, + "learning_rate": 9.394623800416456e-06, + "loss": 0.8096, + "step": 3450 + }, + { + "epoch": 1.0018870663376398, + "grad_norm": 2.9135537147521973, + "learning_rate": 9.394165740463382e-06, + "loss": 0.7561, + "step": 3451 + }, + { + "epoch": 1.0021773842357382, + "grad_norm": 3.1902127265930176, + "learning_rate": 9.39370751845522e-06, + "loss": 0.7947, + "step": 3452 + }, + { + "epoch": 1.0024677021338366, + "grad_norm": 3.7546684741973877, + "learning_rate": 9.393249134408866e-06, + "loss": 0.959, + "step": 3453 + }, + { + "epoch": 1.002758020031935, + "grad_norm": 3.009138584136963, + "learning_rate": 9.392790588341228e-06, + "loss": 0.8543, + "step": 3454 + }, + { + "epoch": 1.0030483379300335, + "grad_norm": 3.8401989936828613, + "learning_rate": 9.392331880269217e-06, + "loss": 0.9496, + "step": 3455 + }, + { + "epoch": 1.0033386558281319, + "grad_norm": 3.9304797649383545, + "learning_rate": 9.39187301020975e-06, + "loss": 0.8184, + "step": 3456 + }, + { + "epoch": 1.0036289737262303, + "grad_norm": 3.111929416656494, + "learning_rate": 9.391413978179748e-06, + "loss": 0.6968, + "step": 3457 + }, + { + "epoch": 1.0039192916243287, + "grad_norm": 3.6900084018707275, + "learning_rate": 9.390954784196143e-06, + "loss": 0.7946, + "step": 3458 + }, + { + "epoch": 1.004209609522427, + "grad_norm": 3.747096300125122, + "learning_rate": 9.390495428275866e-06, + "loss": 0.8256, + "step": 3459 + }, + { + "epoch": 1.0044999274205255, + "grad_norm": 3.514481782913208, + "learning_rate": 9.39003591043586e-06, + "loss": 0.7779, + "step": 3460 + }, + { + "epoch": 1.004790245318624, + "grad_norm": 3.580620050430298, + "learning_rate": 9.389576230693072e-06, + "loss": 0.8052, + "step": 3461 + }, + { + "epoch": 1.0050805632167223, + "grad_norm": 3.489169120788574, + "learning_rate": 9.389116389064454e-06, + "loss": 0.7664, + "step": 3462 + }, + { + "epoch": 1.0053708811148208, + "grad_norm": 3.3632068634033203, + "learning_rate": 9.388656385566967e-06, + "loss": 0.8239, + "step": 3463 + }, + { + "epoch": 1.0056611990129192, + "grad_norm": 3.4779982566833496, + "learning_rate": 9.388196220217574e-06, + "loss": 0.7322, + "step": 3464 + }, + { + "epoch": 1.0059515169110176, + "grad_norm": 
3.2781569957733154, + "learning_rate": 9.387735893033244e-06, + "loss": 0.7248, + "step": 3465 + }, + { + "epoch": 1.006241834809116, + "grad_norm": 2.9717156887054443, + "learning_rate": 9.387275404030957e-06, + "loss": 0.6981, + "step": 3466 + }, + { + "epoch": 1.0065321527072144, + "grad_norm": 3.2096431255340576, + "learning_rate": 9.386814753227694e-06, + "loss": 0.691, + "step": 3467 + }, + { + "epoch": 1.0068224706053128, + "grad_norm": 3.4639768600463867, + "learning_rate": 9.386353940640442e-06, + "loss": 0.8206, + "step": 3468 + }, + { + "epoch": 1.0071127885034112, + "grad_norm": 3.3985044956207275, + "learning_rate": 9.3858929662862e-06, + "loss": 0.765, + "step": 3469 + }, + { + "epoch": 1.0074031064015097, + "grad_norm": 3.998185634613037, + "learning_rate": 9.385431830181963e-06, + "loss": 0.9247, + "step": 3470 + }, + { + "epoch": 1.007693424299608, + "grad_norm": 4.009119033813477, + "learning_rate": 9.384970532344744e-06, + "loss": 0.8434, + "step": 3471 + }, + { + "epoch": 1.0079837421977065, + "grad_norm": 3.4947729110717773, + "learning_rate": 9.38450907279155e-06, + "loss": 0.7898, + "step": 3472 + }, + { + "epoch": 1.008274060095805, + "grad_norm": 3.387531280517578, + "learning_rate": 9.3840474515394e-06, + "loss": 0.7269, + "step": 3473 + }, + { + "epoch": 1.0085643779939033, + "grad_norm": 3.3790810108184814, + "learning_rate": 9.383585668605321e-06, + "loss": 0.7782, + "step": 3474 + }, + { + "epoch": 1.0088546958920017, + "grad_norm": 3.5012128353118896, + "learning_rate": 9.383123724006343e-06, + "loss": 0.7547, + "step": 3475 + }, + { + "epoch": 1.0091450137901001, + "grad_norm": 3.605910539627075, + "learning_rate": 9.382661617759501e-06, + "loss": 0.7258, + "step": 3476 + }, + { + "epoch": 1.0094353316881985, + "grad_norm": 3.25126576423645, + "learning_rate": 9.382199349881838e-06, + "loss": 0.7431, + "step": 3477 + }, + { + "epoch": 1.009725649586297, + "grad_norm": 3.136561155319214, + "learning_rate": 9.3817369203904e-06, + "loss": 0.7035, + "step": 3478 + }, + { + "epoch": 1.0100159674843954, + "grad_norm": 3.353161334991455, + "learning_rate": 9.381274329302244e-06, + "loss": 0.6801, + "step": 3479 + }, + { + "epoch": 1.0103062853824938, + "grad_norm": 3.6189143657684326, + "learning_rate": 9.38081157663443e-06, + "loss": 0.702, + "step": 3480 + }, + { + "epoch": 1.0105966032805922, + "grad_norm": 3.855806350708008, + "learning_rate": 9.380348662404024e-06, + "loss": 0.7256, + "step": 3481 + }, + { + "epoch": 1.0108869211786906, + "grad_norm": 4.021921634674072, + "learning_rate": 9.379885586628098e-06, + "loss": 0.7767, + "step": 3482 + }, + { + "epoch": 1.011177239076789, + "grad_norm": 3.6086981296539307, + "learning_rate": 9.379422349323728e-06, + "loss": 0.8053, + "step": 3483 + }, + { + "epoch": 1.0114675569748874, + "grad_norm": 3.474881887435913, + "learning_rate": 9.378958950508001e-06, + "loss": 0.7292, + "step": 3484 + }, + { + "epoch": 1.0117578748729859, + "grad_norm": 3.6635396480560303, + "learning_rate": 9.378495390198005e-06, + "loss": 0.8161, + "step": 3485 + }, + { + "epoch": 1.0120481927710843, + "grad_norm": 3.293006420135498, + "learning_rate": 9.378031668410836e-06, + "loss": 0.6933, + "step": 3486 + }, + { + "epoch": 1.0123385106691827, + "grad_norm": 3.572141408920288, + "learning_rate": 9.377567785163597e-06, + "loss": 0.7402, + "step": 3487 + }, + { + "epoch": 1.012628828567281, + "grad_norm": 3.474271535873413, + "learning_rate": 9.377103740473396e-06, + "loss": 0.7938, + "step": 3488 + }, + { + "epoch": 1.0129191464653795, 
+ "grad_norm": 3.1348941326141357, + "learning_rate": 9.376639534357346e-06, + "loss": 0.68, + "step": 3489 + }, + { + "epoch": 1.013209464363478, + "grad_norm": 3.269479990005493, + "learning_rate": 9.376175166832565e-06, + "loss": 0.7249, + "step": 3490 + }, + { + "epoch": 1.0134997822615763, + "grad_norm": 3.5079116821289062, + "learning_rate": 9.375710637916182e-06, + "loss": 0.8077, + "step": 3491 + }, + { + "epoch": 1.0137901001596747, + "grad_norm": 3.673961877822876, + "learning_rate": 9.375245947625326e-06, + "loss": 0.7918, + "step": 3492 + }, + { + "epoch": 1.0140804180577732, + "grad_norm": 3.6237893104553223, + "learning_rate": 9.374781095977137e-06, + "loss": 0.7134, + "step": 3493 + }, + { + "epoch": 1.0143707359558716, + "grad_norm": 3.540834903717041, + "learning_rate": 9.374316082988758e-06, + "loss": 0.8578, + "step": 3494 + }, + { + "epoch": 1.0146610538539702, + "grad_norm": 3.900315046310425, + "learning_rate": 9.373850908677335e-06, + "loss": 0.7959, + "step": 3495 + }, + { + "epoch": 1.0149513717520686, + "grad_norm": 3.6177544593811035, + "learning_rate": 9.373385573060028e-06, + "loss": 0.7218, + "step": 3496 + }, + { + "epoch": 1.015241689650167, + "grad_norm": 3.376136064529419, + "learning_rate": 9.372920076153996e-06, + "loss": 0.7929, + "step": 3497 + }, + { + "epoch": 1.0155320075482654, + "grad_norm": 3.4732577800750732, + "learning_rate": 9.372454417976407e-06, + "loss": 0.7308, + "step": 3498 + }, + { + "epoch": 1.0158223254463639, + "grad_norm": 3.1645116806030273, + "learning_rate": 9.371988598544434e-06, + "loss": 0.782, + "step": 3499 + }, + { + "epoch": 1.0161126433444623, + "grad_norm": 3.3945982456207275, + "learning_rate": 9.371522617875258e-06, + "loss": 0.826, + "step": 3500 + }, + { + "epoch": 1.0161126433444623, + "eval_loss": 1.20125412940979, + "eval_runtime": 13.5944, + "eval_samples_per_second": 29.424, + "eval_steps_per_second": 3.678, + "step": 3500 + }, + { + "epoch": 1.0164029612425607, + "grad_norm": 2.948904037475586, + "learning_rate": 9.371056475986062e-06, + "loss": 0.6515, + "step": 3501 + }, + { + "epoch": 1.016693279140659, + "grad_norm": 3.3584020137786865, + "learning_rate": 9.370590172894037e-06, + "loss": 0.757, + "step": 3502 + }, + { + "epoch": 1.0169835970387575, + "grad_norm": 3.512335777282715, + "learning_rate": 9.370123708616381e-06, + "loss": 0.7603, + "step": 3503 + }, + { + "epoch": 1.017273914936856, + "grad_norm": 3.194840908050537, + "learning_rate": 9.369657083170297e-06, + "loss": 0.6974, + "step": 3504 + }, + { + "epoch": 1.0175642328349543, + "grad_norm": 3.945988178253174, + "learning_rate": 9.369190296572994e-06, + "loss": 0.8559, + "step": 3505 + }, + { + "epoch": 1.0178545507330528, + "grad_norm": 3.3235080242156982, + "learning_rate": 9.368723348841687e-06, + "loss": 0.7431, + "step": 3506 + }, + { + "epoch": 1.0181448686311512, + "grad_norm": 5.137023448944092, + "learning_rate": 9.368256239993597e-06, + "loss": 0.8582, + "step": 3507 + }, + { + "epoch": 1.0184351865292496, + "grad_norm": 3.438002824783325, + "learning_rate": 9.367788970045947e-06, + "loss": 0.7126, + "step": 3508 + }, + { + "epoch": 1.018725504427348, + "grad_norm": 3.168781042098999, + "learning_rate": 9.367321539015977e-06, + "loss": 0.6956, + "step": 3509 + }, + { + "epoch": 1.0190158223254464, + "grad_norm": 3.538299322128296, + "learning_rate": 9.36685394692092e-06, + "loss": 0.6906, + "step": 3510 + }, + { + "epoch": 1.0193061402235448, + "grad_norm": 3.4802918434143066, + "learning_rate": 9.366386193778023e-06, + "loss": 
0.6902, + "step": 3511 + }, + { + "epoch": 1.0195964581216432, + "grad_norm": 2.9979684352874756, + "learning_rate": 9.365918279604536e-06, + "loss": 0.744, + "step": 3512 + }, + { + "epoch": 1.0198867760197416, + "grad_norm": 3.391887664794922, + "learning_rate": 9.365450204417714e-06, + "loss": 0.7245, + "step": 3513 + }, + { + "epoch": 1.02017709391784, + "grad_norm": 3.5485000610351562, + "learning_rate": 9.364981968234823e-06, + "loss": 0.8228, + "step": 3514 + }, + { + "epoch": 1.0204674118159385, + "grad_norm": 3.3044545650482178, + "learning_rate": 9.364513571073129e-06, + "loss": 0.6746, + "step": 3515 + }, + { + "epoch": 1.0207577297140369, + "grad_norm": 3.7134881019592285, + "learning_rate": 9.364045012949904e-06, + "loss": 0.8221, + "step": 3516 + }, + { + "epoch": 1.0210480476121353, + "grad_norm": 3.1946160793304443, + "learning_rate": 9.363576293882432e-06, + "loss": 0.7176, + "step": 3517 + }, + { + "epoch": 1.0213383655102337, + "grad_norm": 3.5327720642089844, + "learning_rate": 9.363107413887999e-06, + "loss": 0.7192, + "step": 3518 + }, + { + "epoch": 1.0216286834083321, + "grad_norm": 3.509906053543091, + "learning_rate": 9.362638372983894e-06, + "loss": 0.7382, + "step": 3519 + }, + { + "epoch": 1.0219190013064305, + "grad_norm": 3.8694610595703125, + "learning_rate": 9.362169171187419e-06, + "loss": 0.7577, + "step": 3520 + }, + { + "epoch": 1.022209319204529, + "grad_norm": 3.1691699028015137, + "learning_rate": 9.361699808515877e-06, + "loss": 0.7352, + "step": 3521 + }, + { + "epoch": 1.0224996371026274, + "grad_norm": 3.552873134613037, + "learning_rate": 9.361230284986573e-06, + "loss": 0.8043, + "step": 3522 + }, + { + "epoch": 1.0227899550007258, + "grad_norm": 3.761043071746826, + "learning_rate": 9.36076060061683e-06, + "loss": 0.9012, + "step": 3523 + }, + { + "epoch": 1.0230802728988242, + "grad_norm": 3.4257898330688477, + "learning_rate": 9.360290755423966e-06, + "loss": 0.7829, + "step": 3524 + }, + { + "epoch": 1.0233705907969226, + "grad_norm": 3.318141460418701, + "learning_rate": 9.359820749425308e-06, + "loss": 0.6867, + "step": 3525 + }, + { + "epoch": 1.023660908695021, + "grad_norm": 3.2003114223480225, + "learning_rate": 9.359350582638193e-06, + "loss": 0.7361, + "step": 3526 + }, + { + "epoch": 1.0239512265931194, + "grad_norm": 3.5448482036590576, + "learning_rate": 9.358880255079957e-06, + "loss": 0.7987, + "step": 3527 + }, + { + "epoch": 1.0242415444912178, + "grad_norm": 3.18243145942688, + "learning_rate": 9.358409766767946e-06, + "loss": 0.7502, + "step": 3528 + }, + { + "epoch": 1.0245318623893163, + "grad_norm": 3.511103868484497, + "learning_rate": 9.357939117719515e-06, + "loss": 0.6952, + "step": 3529 + }, + { + "epoch": 1.0248221802874147, + "grad_norm": 3.4447379112243652, + "learning_rate": 9.357468307952019e-06, + "loss": 0.7581, + "step": 3530 + }, + { + "epoch": 1.025112498185513, + "grad_norm": 4.462029933929443, + "learning_rate": 9.356997337482818e-06, + "loss": 0.9036, + "step": 3531 + }, + { + "epoch": 1.0254028160836115, + "grad_norm": 4.024928092956543, + "learning_rate": 9.356526206329285e-06, + "loss": 0.7405, + "step": 3532 + }, + { + "epoch": 1.02569313398171, + "grad_norm": 3.3090834617614746, + "learning_rate": 9.356054914508796e-06, + "loss": 0.6529, + "step": 3533 + }, + { + "epoch": 1.0259834518798083, + "grad_norm": 3.7456352710723877, + "learning_rate": 9.355583462038728e-06, + "loss": 0.8039, + "step": 3534 + }, + { + "epoch": 1.0262737697779067, + "grad_norm": 3.3236465454101562, + "learning_rate": 
9.355111848936472e-06, + "loss": 0.749, + "step": 3535 + }, + { + "epoch": 1.0265640876760052, + "grad_norm": 3.631131887435913, + "learning_rate": 9.354640075219419e-06, + "loss": 0.7229, + "step": 3536 + }, + { + "epoch": 1.0268544055741036, + "grad_norm": 3.345919132232666, + "learning_rate": 9.35416814090497e-06, + "loss": 0.689, + "step": 3537 + }, + { + "epoch": 1.027144723472202, + "grad_norm": 3.5057573318481445, + "learning_rate": 9.353696046010524e-06, + "loss": 0.6877, + "step": 3538 + }, + { + "epoch": 1.0274350413703004, + "grad_norm": 3.5284013748168945, + "learning_rate": 9.353223790553499e-06, + "loss": 0.7665, + "step": 3539 + }, + { + "epoch": 1.0277253592683988, + "grad_norm": 3.2629342079162598, + "learning_rate": 9.352751374551305e-06, + "loss": 0.7404, + "step": 3540 + }, + { + "epoch": 1.0280156771664972, + "grad_norm": 3.636103630065918, + "learning_rate": 9.35227879802137e-06, + "loss": 0.7259, + "step": 3541 + }, + { + "epoch": 1.0283059950645956, + "grad_norm": 3.3388805389404297, + "learning_rate": 9.35180606098112e-06, + "loss": 0.8092, + "step": 3542 + }, + { + "epoch": 1.028596312962694, + "grad_norm": 3.710493326187134, + "learning_rate": 9.351333163447989e-06, + "loss": 0.7778, + "step": 3543 + }, + { + "epoch": 1.0288866308607925, + "grad_norm": 3.360016345977783, + "learning_rate": 9.350860105439416e-06, + "loss": 0.8075, + "step": 3544 + }, + { + "epoch": 1.029176948758891, + "grad_norm": 3.6781256198883057, + "learning_rate": 9.35038688697285e-06, + "loss": 0.835, + "step": 3545 + }, + { + "epoch": 1.0294672666569895, + "grad_norm": 3.3641157150268555, + "learning_rate": 9.349913508065743e-06, + "loss": 0.8336, + "step": 3546 + }, + { + "epoch": 1.029757584555088, + "grad_norm": 3.334789752960205, + "learning_rate": 9.349439968735551e-06, + "loss": 0.6987, + "step": 3547 + }, + { + "epoch": 1.0300479024531863, + "grad_norm": 4.041718482971191, + "learning_rate": 9.34896626899974e-06, + "loss": 0.6966, + "step": 3548 + }, + { + "epoch": 1.0303382203512848, + "grad_norm": 3.1009633541107178, + "learning_rate": 9.348492408875779e-06, + "loss": 0.6439, + "step": 3549 + }, + { + "epoch": 1.0306285382493832, + "grad_norm": 3.5959973335266113, + "learning_rate": 9.348018388381142e-06, + "loss": 0.7712, + "step": 3550 + }, + { + "epoch": 1.0309188561474816, + "grad_norm": 3.441721200942993, + "learning_rate": 9.347544207533315e-06, + "loss": 0.6931, + "step": 3551 + }, + { + "epoch": 1.03120917404558, + "grad_norm": 3.2447519302368164, + "learning_rate": 9.34706986634978e-06, + "loss": 0.6328, + "step": 3552 + }, + { + "epoch": 1.0314994919436784, + "grad_norm": 3.586515188217163, + "learning_rate": 9.346595364848035e-06, + "loss": 0.8278, + "step": 3553 + }, + { + "epoch": 1.0317898098417768, + "grad_norm": 3.604525327682495, + "learning_rate": 9.346120703045576e-06, + "loss": 0.8527, + "step": 3554 + }, + { + "epoch": 1.0320801277398752, + "grad_norm": 3.301090717315674, + "learning_rate": 9.345645880959912e-06, + "loss": 0.6894, + "step": 3555 + }, + { + "epoch": 1.0323704456379736, + "grad_norm": 3.388200044631958, + "learning_rate": 9.345170898608553e-06, + "loss": 0.7878, + "step": 3556 + }, + { + "epoch": 1.032660763536072, + "grad_norm": 3.0278408527374268, + "learning_rate": 9.344695756009013e-06, + "loss": 0.7222, + "step": 3557 + }, + { + "epoch": 1.0329510814341705, + "grad_norm": 3.474755048751831, + "learning_rate": 9.344220453178821e-06, + "loss": 0.7424, + "step": 3558 + }, + { + "epoch": 1.0332413993322689, + "grad_norm": 3.2388312816619873, + 
"learning_rate": 9.3437449901355e-06, + "loss": 0.7189, + "step": 3559 + }, + { + "epoch": 1.0335317172303673, + "grad_norm": 3.3592824935913086, + "learning_rate": 9.343269366896588e-06, + "loss": 0.7467, + "step": 3560 + }, + { + "epoch": 1.0338220351284657, + "grad_norm": 3.4638187885284424, + "learning_rate": 9.342793583479625e-06, + "loss": 0.7311, + "step": 3561 + }, + { + "epoch": 1.0341123530265641, + "grad_norm": 3.9923784732818604, + "learning_rate": 9.342317639902158e-06, + "loss": 0.944, + "step": 3562 + }, + { + "epoch": 1.0344026709246625, + "grad_norm": 3.471781015396118, + "learning_rate": 9.341841536181742e-06, + "loss": 0.7335, + "step": 3563 + }, + { + "epoch": 1.034692988822761, + "grad_norm": 3.4282989501953125, + "learning_rate": 9.341365272335932e-06, + "loss": 0.8669, + "step": 3564 + }, + { + "epoch": 1.0349833067208594, + "grad_norm": 3.347621440887451, + "learning_rate": 9.340888848382292e-06, + "loss": 0.72, + "step": 3565 + }, + { + "epoch": 1.0352736246189578, + "grad_norm": 3.4983551502227783, + "learning_rate": 9.340412264338394e-06, + "loss": 0.8129, + "step": 3566 + }, + { + "epoch": 1.0355639425170562, + "grad_norm": 3.236875534057617, + "learning_rate": 9.339935520221816e-06, + "loss": 0.7324, + "step": 3567 + }, + { + "epoch": 1.0358542604151546, + "grad_norm": 3.8020715713500977, + "learning_rate": 9.339458616050137e-06, + "loss": 0.6812, + "step": 3568 + }, + { + "epoch": 1.036144578313253, + "grad_norm": 3.6184334754943848, + "learning_rate": 9.338981551840947e-06, + "loss": 0.6708, + "step": 3569 + }, + { + "epoch": 1.0364348962113514, + "grad_norm": 3.225571632385254, + "learning_rate": 9.338504327611839e-06, + "loss": 0.719, + "step": 3570 + }, + { + "epoch": 1.0367252141094498, + "grad_norm": 3.2746708393096924, + "learning_rate": 9.338026943380413e-06, + "loss": 0.7274, + "step": 3571 + }, + { + "epoch": 1.0370155320075483, + "grad_norm": 3.2747983932495117, + "learning_rate": 9.337549399164274e-06, + "loss": 0.7414, + "step": 3572 + }, + { + "epoch": 1.0373058499056467, + "grad_norm": 3.33699369430542, + "learning_rate": 9.337071694981038e-06, + "loss": 0.7898, + "step": 3573 + }, + { + "epoch": 1.037596167803745, + "grad_norm": 3.4813315868377686, + "learning_rate": 9.336593830848315e-06, + "loss": 0.6973, + "step": 3574 + }, + { + "epoch": 1.0378864857018435, + "grad_norm": 2.953972339630127, + "learning_rate": 9.336115806783734e-06, + "loss": 0.6768, + "step": 3575 + }, + { + "epoch": 1.038176803599942, + "grad_norm": 3.2962663173675537, + "learning_rate": 9.335637622804922e-06, + "loss": 0.7336, + "step": 3576 + }, + { + "epoch": 1.0384671214980403, + "grad_norm": 3.4844980239868164, + "learning_rate": 9.335159278929516e-06, + "loss": 0.7695, + "step": 3577 + }, + { + "epoch": 1.0387574393961387, + "grad_norm": 3.954115152359009, + "learning_rate": 9.334680775175154e-06, + "loss": 0.9909, + "step": 3578 + }, + { + "epoch": 1.0390477572942372, + "grad_norm": 3.5708436965942383, + "learning_rate": 9.334202111559487e-06, + "loss": 0.7544, + "step": 3579 + }, + { + "epoch": 1.0393380751923356, + "grad_norm": 2.7870044708251953, + "learning_rate": 9.333723288100167e-06, + "loss": 0.6855, + "step": 3580 + }, + { + "epoch": 1.039628393090434, + "grad_norm": 3.445352554321289, + "learning_rate": 9.33324430481485e-06, + "loss": 0.6626, + "step": 3581 + }, + { + "epoch": 1.0399187109885324, + "grad_norm": 3.9660799503326416, + "learning_rate": 9.332765161721203e-06, + "loss": 0.8, + "step": 3582 + }, + { + "epoch": 1.0402090288866308, + "grad_norm": 
4.004605293273926, + "learning_rate": 9.332285858836898e-06, + "loss": 0.8748, + "step": 3583 + }, + { + "epoch": 1.0404993467847292, + "grad_norm": 3.285799980163574, + "learning_rate": 9.331806396179607e-06, + "loss": 0.8192, + "step": 3584 + }, + { + "epoch": 1.0407896646828276, + "grad_norm": 3.3582661151885986, + "learning_rate": 9.331326773767018e-06, + "loss": 0.6696, + "step": 3585 + }, + { + "epoch": 1.041079982580926, + "grad_norm": 3.596374273300171, + "learning_rate": 9.330846991616814e-06, + "loss": 0.7014, + "step": 3586 + }, + { + "epoch": 1.0413703004790245, + "grad_norm": 3.59114408493042, + "learning_rate": 9.330367049746693e-06, + "loss": 0.7166, + "step": 3587 + }, + { + "epoch": 1.0416606183771229, + "grad_norm": 3.740971565246582, + "learning_rate": 9.329886948174353e-06, + "loss": 0.7826, + "step": 3588 + }, + { + "epoch": 1.0419509362752213, + "grad_norm": 3.2020390033721924, + "learning_rate": 9.329406686917502e-06, + "loss": 0.643, + "step": 3589 + }, + { + "epoch": 1.0422412541733197, + "grad_norm": 3.364518404006958, + "learning_rate": 9.328926265993849e-06, + "loss": 0.8063, + "step": 3590 + }, + { + "epoch": 1.0425315720714181, + "grad_norm": 3.603043556213379, + "learning_rate": 9.328445685421113e-06, + "loss": 0.6926, + "step": 3591 + }, + { + "epoch": 1.0428218899695165, + "grad_norm": 3.999770164489746, + "learning_rate": 9.327964945217018e-06, + "loss": 0.8984, + "step": 3592 + }, + { + "epoch": 1.043112207867615, + "grad_norm": 3.8826112747192383, + "learning_rate": 9.327484045399294e-06, + "loss": 0.7575, + "step": 3593 + }, + { + "epoch": 1.0434025257657134, + "grad_norm": 3.6010074615478516, + "learning_rate": 9.327002985985676e-06, + "loss": 0.8438, + "step": 3594 + }, + { + "epoch": 1.0436928436638118, + "grad_norm": 3.9782824516296387, + "learning_rate": 9.326521766993904e-06, + "loss": 0.7927, + "step": 3595 + }, + { + "epoch": 1.0439831615619104, + "grad_norm": 3.465355157852173, + "learning_rate": 9.326040388441727e-06, + "loss": 0.6731, + "step": 3596 + }, + { + "epoch": 1.0442734794600088, + "grad_norm": 3.5577354431152344, + "learning_rate": 9.325558850346897e-06, + "loss": 0.8736, + "step": 3597 + }, + { + "epoch": 1.0445637973581072, + "grad_norm": 3.6358604431152344, + "learning_rate": 9.325077152727173e-06, + "loss": 0.7572, + "step": 3598 + }, + { + "epoch": 1.0448541152562056, + "grad_norm": 3.9167327880859375, + "learning_rate": 9.324595295600318e-06, + "loss": 0.7054, + "step": 3599 + }, + { + "epoch": 1.045144433154304, + "grad_norm": 4.315560340881348, + "learning_rate": 9.324113278984108e-06, + "loss": 0.9471, + "step": 3600 + }, + { + "epoch": 1.0454347510524025, + "grad_norm": 3.8556084632873535, + "learning_rate": 9.323631102896314e-06, + "loss": 0.7213, + "step": 3601 + }, + { + "epoch": 1.0457250689505009, + "grad_norm": 3.4413363933563232, + "learning_rate": 9.323148767354721e-06, + "loss": 0.7063, + "step": 3602 + }, + { + "epoch": 1.0460153868485993, + "grad_norm": 3.2421858310699463, + "learning_rate": 9.322666272377119e-06, + "loss": 0.7034, + "step": 3603 + }, + { + "epoch": 1.0463057047466977, + "grad_norm": 3.6639201641082764, + "learning_rate": 9.322183617981297e-06, + "loss": 0.8093, + "step": 3604 + }, + { + "epoch": 1.0465960226447961, + "grad_norm": 3.616205930709839, + "learning_rate": 9.321700804185061e-06, + "loss": 0.7865, + "step": 3605 + }, + { + "epoch": 1.0468863405428945, + "grad_norm": 3.593491554260254, + "learning_rate": 9.321217831006214e-06, + "loss": 0.8386, + "step": 3606 + }, + { + "epoch": 
1.047176658440993, + "grad_norm": 3.3423163890838623, + "learning_rate": 9.320734698462569e-06, + "loss": 0.7197, + "step": 3607 + }, + { + "epoch": 1.0474669763390914, + "grad_norm": 3.197126865386963, + "learning_rate": 9.32025140657194e-06, + "loss": 0.6785, + "step": 3608 + }, + { + "epoch": 1.0477572942371898, + "grad_norm": 3.575289487838745, + "learning_rate": 9.319767955352154e-06, + "loss": 0.7922, + "step": 3609 + }, + { + "epoch": 1.0480476121352882, + "grad_norm": 3.8259365558624268, + "learning_rate": 9.319284344821042e-06, + "loss": 0.7762, + "step": 3610 + }, + { + "epoch": 1.0483379300333866, + "grad_norm": 3.8167777061462402, + "learning_rate": 9.318800574996437e-06, + "loss": 0.9812, + "step": 3611 + }, + { + "epoch": 1.048628247931485, + "grad_norm": 3.700352430343628, + "learning_rate": 9.318316645896182e-06, + "loss": 0.7656, + "step": 3612 + }, + { + "epoch": 1.0489185658295834, + "grad_norm": 3.7808494567871094, + "learning_rate": 9.31783255753812e-06, + "loss": 0.6732, + "step": 3613 + }, + { + "epoch": 1.0492088837276818, + "grad_norm": 3.1864030361175537, + "learning_rate": 9.317348309940109e-06, + "loss": 0.664, + "step": 3614 + }, + { + "epoch": 1.0494992016257803, + "grad_norm": 3.409240245819092, + "learning_rate": 9.316863903120004e-06, + "loss": 0.7459, + "step": 3615 + }, + { + "epoch": 1.0497895195238787, + "grad_norm": 3.466313362121582, + "learning_rate": 9.316379337095671e-06, + "loss": 0.7646, + "step": 3616 + }, + { + "epoch": 1.050079837421977, + "grad_norm": 3.3947641849517822, + "learning_rate": 9.315894611884982e-06, + "loss": 0.7207, + "step": 3617 + }, + { + "epoch": 1.0503701553200755, + "grad_norm": 3.1996078491210938, + "learning_rate": 9.315409727505813e-06, + "loss": 0.6923, + "step": 3618 + }, + { + "epoch": 1.050660473218174, + "grad_norm": 3.2390217781066895, + "learning_rate": 9.314924683976044e-06, + "loss": 0.6493, + "step": 3619 + }, + { + "epoch": 1.0509507911162723, + "grad_norm": 3.375798225402832, + "learning_rate": 9.314439481313567e-06, + "loss": 0.7514, + "step": 3620 + }, + { + "epoch": 1.0512411090143707, + "grad_norm": 3.334712028503418, + "learning_rate": 9.313954119536273e-06, + "loss": 0.7673, + "step": 3621 + }, + { + "epoch": 1.0515314269124691, + "grad_norm": 3.1791110038757324, + "learning_rate": 9.313468598662063e-06, + "loss": 0.6983, + "step": 3622 + }, + { + "epoch": 1.0518217448105676, + "grad_norm": 3.7215187549591064, + "learning_rate": 9.312982918708843e-06, + "loss": 0.878, + "step": 3623 + }, + { + "epoch": 1.052112062708666, + "grad_norm": 3.428053617477417, + "learning_rate": 9.312497079694524e-06, + "loss": 0.7427, + "step": 3624 + }, + { + "epoch": 1.0524023806067644, + "grad_norm": 3.332998752593994, + "learning_rate": 9.312011081637025e-06, + "loss": 0.6933, + "step": 3625 + }, + { + "epoch": 1.0526926985048628, + "grad_norm": 3.5585575103759766, + "learning_rate": 9.311524924554268e-06, + "loss": 0.7643, + "step": 3626 + }, + { + "epoch": 1.0529830164029612, + "grad_norm": 3.3463525772094727, + "learning_rate": 9.311038608464183e-06, + "loss": 0.6914, + "step": 3627 + }, + { + "epoch": 1.0532733343010596, + "grad_norm": 3.7298991680145264, + "learning_rate": 9.310552133384703e-06, + "loss": 0.8181, + "step": 3628 + }, + { + "epoch": 1.053563652199158, + "grad_norm": 3.674640655517578, + "learning_rate": 9.310065499333773e-06, + "loss": 0.7731, + "step": 3629 + }, + { + "epoch": 1.0538539700972565, + "grad_norm": 3.8359897136688232, + "learning_rate": 9.309578706329338e-06, + "loss": 0.79, + "step": 
3630 + }, + { + "epoch": 1.0541442879953549, + "grad_norm": 3.7508792877197266, + "learning_rate": 9.30909175438935e-06, + "loss": 0.7695, + "step": 3631 + }, + { + "epoch": 1.0544346058934533, + "grad_norm": 3.3596932888031006, + "learning_rate": 9.308604643531767e-06, + "loss": 0.7073, + "step": 3632 + }, + { + "epoch": 1.0547249237915517, + "grad_norm": 3.5916035175323486, + "learning_rate": 9.308117373774555e-06, + "loss": 0.7361, + "step": 3633 + }, + { + "epoch": 1.05501524168965, + "grad_norm": 3.477250576019287, + "learning_rate": 9.307629945135686e-06, + "loss": 0.6548, + "step": 3634 + }, + { + "epoch": 1.0553055595877485, + "grad_norm": 3.5962586402893066, + "learning_rate": 9.307142357633132e-06, + "loss": 0.8024, + "step": 3635 + }, + { + "epoch": 1.055595877485847, + "grad_norm": 3.7356138229370117, + "learning_rate": 9.306654611284878e-06, + "loss": 0.7214, + "step": 3636 + }, + { + "epoch": 1.0558861953839453, + "grad_norm": 3.799440860748291, + "learning_rate": 9.30616670610891e-06, + "loss": 0.7302, + "step": 3637 + }, + { + "epoch": 1.0561765132820438, + "grad_norm": 4.045415878295898, + "learning_rate": 9.305678642123224e-06, + "loss": 0.8737, + "step": 3638 + }, + { + "epoch": 1.0564668311801422, + "grad_norm": 3.4359524250030518, + "learning_rate": 9.305190419345817e-06, + "loss": 0.6862, + "step": 3639 + }, + { + "epoch": 1.0567571490782406, + "grad_norm": 3.230022430419922, + "learning_rate": 9.304702037794696e-06, + "loss": 0.7209, + "step": 3640 + }, + { + "epoch": 1.057047466976339, + "grad_norm": 3.462850570678711, + "learning_rate": 9.304213497487873e-06, + "loss": 0.7218, + "step": 3641 + }, + { + "epoch": 1.0573377848744374, + "grad_norm": 4.064338684082031, + "learning_rate": 9.303724798443362e-06, + "loss": 0.915, + "step": 3642 + }, + { + "epoch": 1.0576281027725358, + "grad_norm": 3.556943416595459, + "learning_rate": 9.303235940679192e-06, + "loss": 0.7198, + "step": 3643 + }, + { + "epoch": 1.0579184206706342, + "grad_norm": 3.441154718399048, + "learning_rate": 9.302746924213386e-06, + "loss": 0.7755, + "step": 3644 + }, + { + "epoch": 1.0582087385687329, + "grad_norm": 3.428337812423706, + "learning_rate": 9.302257749063981e-06, + "loss": 0.7677, + "step": 3645 + }, + { + "epoch": 1.058499056466831, + "grad_norm": 3.435852289199829, + "learning_rate": 9.301768415249017e-06, + "loss": 0.7581, + "step": 3646 + }, + { + "epoch": 1.0587893743649297, + "grad_norm": 3.674840211868286, + "learning_rate": 9.301278922786543e-06, + "loss": 0.7458, + "step": 3647 + }, + { + "epoch": 1.0590796922630281, + "grad_norm": 3.1479077339172363, + "learning_rate": 9.300789271694607e-06, + "loss": 0.7086, + "step": 3648 + }, + { + "epoch": 1.0593700101611265, + "grad_norm": 3.5983262062072754, + "learning_rate": 9.30029946199127e-06, + "loss": 0.8011, + "step": 3649 + }, + { + "epoch": 1.059660328059225, + "grad_norm": 3.4171347618103027, + "learning_rate": 9.299809493694597e-06, + "loss": 0.7957, + "step": 3650 + }, + { + "epoch": 1.0599506459573234, + "grad_norm": 3.0307910442352295, + "learning_rate": 9.299319366822654e-06, + "loss": 0.6833, + "step": 3651 + }, + { + "epoch": 1.0602409638554218, + "grad_norm": 3.349909543991089, + "learning_rate": 9.29882908139352e-06, + "loss": 0.6931, + "step": 3652 + }, + { + "epoch": 1.0605312817535202, + "grad_norm": 3.658194065093994, + "learning_rate": 9.298338637425276e-06, + "loss": 0.8358, + "step": 3653 + }, + { + "epoch": 1.0608215996516186, + "grad_norm": 3.7426369190216064, + "learning_rate": 9.297848034936007e-06, + 
"loss": 0.807, + "step": 3654 + }, + { + "epoch": 1.061111917549717, + "grad_norm": 3.262444019317627, + "learning_rate": 9.297357273943809e-06, + "loss": 0.7332, + "step": 3655 + }, + { + "epoch": 1.0614022354478154, + "grad_norm": 3.7360541820526123, + "learning_rate": 9.29686635446678e-06, + "loss": 0.8238, + "step": 3656 + }, + { + "epoch": 1.0616925533459138, + "grad_norm": 3.4465503692626953, + "learning_rate": 9.296375276523024e-06, + "loss": 0.8175, + "step": 3657 + }, + { + "epoch": 1.0619828712440122, + "grad_norm": 3.6495959758758545, + "learning_rate": 9.295884040130656e-06, + "loss": 0.7113, + "step": 3658 + }, + { + "epoch": 1.0622731891421107, + "grad_norm": 4.032883167266846, + "learning_rate": 9.295392645307786e-06, + "loss": 0.9692, + "step": 3659 + }, + { + "epoch": 1.062563507040209, + "grad_norm": 3.732147216796875, + "learning_rate": 9.294901092072541e-06, + "loss": 0.906, + "step": 3660 + }, + { + "epoch": 1.0628538249383075, + "grad_norm": 3.5926883220672607, + "learning_rate": 9.294409380443047e-06, + "loss": 0.7899, + "step": 3661 + }, + { + "epoch": 1.063144142836406, + "grad_norm": 3.649583578109741, + "learning_rate": 9.293917510437442e-06, + "loss": 0.7995, + "step": 3662 + }, + { + "epoch": 1.0634344607345043, + "grad_norm": 3.221046209335327, + "learning_rate": 9.293425482073862e-06, + "loss": 0.6568, + "step": 3663 + }, + { + "epoch": 1.0637247786326027, + "grad_norm": 3.4746248722076416, + "learning_rate": 9.292933295370452e-06, + "loss": 0.7059, + "step": 3664 + }, + { + "epoch": 1.0640150965307011, + "grad_norm": 3.3903510570526123, + "learning_rate": 9.292440950345367e-06, + "loss": 0.8072, + "step": 3665 + }, + { + "epoch": 1.0643054144287996, + "grad_norm": 3.920558452606201, + "learning_rate": 9.291948447016764e-06, + "loss": 0.8547, + "step": 3666 + }, + { + "epoch": 1.064595732326898, + "grad_norm": 3.2678873538970947, + "learning_rate": 9.291455785402806e-06, + "loss": 0.7555, + "step": 3667 + }, + { + "epoch": 1.0648860502249964, + "grad_norm": 3.292327404022217, + "learning_rate": 9.29096296552166e-06, + "loss": 0.7205, + "step": 3668 + }, + { + "epoch": 1.0651763681230948, + "grad_norm": 3.6426451206207275, + "learning_rate": 9.290469987391503e-06, + "loss": 0.8298, + "step": 3669 + }, + { + "epoch": 1.0654666860211932, + "grad_norm": 3.2656807899475098, + "learning_rate": 9.289976851030516e-06, + "loss": 0.7498, + "step": 3670 + }, + { + "epoch": 1.0657570039192916, + "grad_norm": 3.449364423751831, + "learning_rate": 9.289483556456883e-06, + "loss": 0.6807, + "step": 3671 + }, + { + "epoch": 1.06604732181739, + "grad_norm": 3.5260181427001953, + "learning_rate": 9.288990103688803e-06, + "loss": 0.7635, + "step": 3672 + }, + { + "epoch": 1.0663376397154885, + "grad_norm": 3.302656650543213, + "learning_rate": 9.288496492744466e-06, + "loss": 0.7195, + "step": 3673 + }, + { + "epoch": 1.0666279576135869, + "grad_norm": 3.234776258468628, + "learning_rate": 9.288002723642082e-06, + "loss": 0.7321, + "step": 3674 + }, + { + "epoch": 1.0669182755116853, + "grad_norm": 3.6483352184295654, + "learning_rate": 9.287508796399858e-06, + "loss": 0.7607, + "step": 3675 + }, + { + "epoch": 1.0672085934097837, + "grad_norm": 3.533311367034912, + "learning_rate": 9.287014711036013e-06, + "loss": 0.771, + "step": 3676 + }, + { + "epoch": 1.067498911307882, + "grad_norm": 3.86702036857605, + "learning_rate": 9.286520467568765e-06, + "loss": 0.7407, + "step": 3677 + }, + { + "epoch": 1.0677892292059805, + "grad_norm": 3.479646921157837, + "learning_rate": 
9.286026066016344e-06, + "loss": 0.7384, + "step": 3678 + }, + { + "epoch": 1.068079547104079, + "grad_norm": 3.4313340187072754, + "learning_rate": 9.285531506396981e-06, + "loss": 0.7239, + "step": 3679 + }, + { + "epoch": 1.0683698650021773, + "grad_norm": 3.6296842098236084, + "learning_rate": 9.28503678872892e-06, + "loss": 0.739, + "step": 3680 + }, + { + "epoch": 1.0686601829002758, + "grad_norm": 3.3509602546691895, + "learning_rate": 9.2845419130304e-06, + "loss": 0.7174, + "step": 3681 + }, + { + "epoch": 1.0689505007983742, + "grad_norm": 3.4982831478118896, + "learning_rate": 9.284046879319675e-06, + "loss": 0.689, + "step": 3682 + }, + { + "epoch": 1.0692408186964726, + "grad_norm": 3.420058488845825, + "learning_rate": 9.283551687615002e-06, + "loss": 0.8226, + "step": 3683 + }, + { + "epoch": 1.069531136594571, + "grad_norm": 3.6235501766204834, + "learning_rate": 9.283056337934642e-06, + "loss": 0.8152, + "step": 3684 + }, + { + "epoch": 1.0698214544926694, + "grad_norm": 3.484602212905884, + "learning_rate": 9.282560830296864e-06, + "loss": 0.6969, + "step": 3685 + }, + { + "epoch": 1.0701117723907678, + "grad_norm": 3.5631332397460938, + "learning_rate": 9.282065164719942e-06, + "loss": 0.6524, + "step": 3686 + }, + { + "epoch": 1.0704020902888662, + "grad_norm": 3.950852155685425, + "learning_rate": 9.281569341222157e-06, + "loss": 0.8027, + "step": 3687 + }, + { + "epoch": 1.0706924081869647, + "grad_norm": 3.7912333011627197, + "learning_rate": 9.281073359821793e-06, + "loss": 0.7996, + "step": 3688 + }, + { + "epoch": 1.070982726085063, + "grad_norm": 3.653871774673462, + "learning_rate": 9.280577220537141e-06, + "loss": 0.8104, + "step": 3689 + }, + { + "epoch": 1.0712730439831615, + "grad_norm": 3.430440902709961, + "learning_rate": 9.280080923386501e-06, + "loss": 0.7232, + "step": 3690 + }, + { + "epoch": 1.07156336188126, + "grad_norm": 4.061392784118652, + "learning_rate": 9.279584468388176e-06, + "loss": 0.971, + "step": 3691 + }, + { + "epoch": 1.0718536797793583, + "grad_norm": 4.008795261383057, + "learning_rate": 9.279087855560474e-06, + "loss": 1.0048, + "step": 3692 + }, + { + "epoch": 1.0721439976774567, + "grad_norm": 3.099137306213379, + "learning_rate": 9.278591084921707e-06, + "loss": 0.6123, + "step": 3693 + }, + { + "epoch": 1.0724343155755551, + "grad_norm": 3.285714864730835, + "learning_rate": 9.278094156490201e-06, + "loss": 0.8407, + "step": 3694 + }, + { + "epoch": 1.0727246334736535, + "grad_norm": 3.100593090057373, + "learning_rate": 9.277597070284281e-06, + "loss": 0.6844, + "step": 3695 + }, + { + "epoch": 1.0730149513717522, + "grad_norm": 3.205623149871826, + "learning_rate": 9.277099826322277e-06, + "loss": 0.7021, + "step": 3696 + }, + { + "epoch": 1.0733052692698504, + "grad_norm": 3.03759765625, + "learning_rate": 9.27660242462253e-06, + "loss": 0.7171, + "step": 3697 + }, + { + "epoch": 1.073595587167949, + "grad_norm": 3.6206579208374023, + "learning_rate": 9.276104865203381e-06, + "loss": 0.7852, + "step": 3698 + }, + { + "epoch": 1.0738859050660474, + "grad_norm": 3.3694751262664795, + "learning_rate": 9.275607148083183e-06, + "loss": 0.7441, + "step": 3699 + }, + { + "epoch": 1.0741762229641458, + "grad_norm": 3.1286754608154297, + "learning_rate": 9.27510927328029e-06, + "loss": 0.6822, + "step": 3700 + }, + { + "epoch": 1.0744665408622442, + "grad_norm": 3.711529493331909, + "learning_rate": 9.274611240813062e-06, + "loss": 0.8291, + "step": 3701 + }, + { + "epoch": 1.0747568587603427, + "grad_norm": 3.498225688934326, + 
"learning_rate": 9.27411305069987e-06, + "loss": 0.7177, + "step": 3702 + }, + { + "epoch": 1.075047176658441, + "grad_norm": 3.874438524246216, + "learning_rate": 9.273614702959084e-06, + "loss": 0.8755, + "step": 3703 + }, + { + "epoch": 1.0753374945565395, + "grad_norm": 3.329667091369629, + "learning_rate": 9.273116197609085e-06, + "loss": 0.7263, + "step": 3704 + }, + { + "epoch": 1.075627812454638, + "grad_norm": 3.8230533599853516, + "learning_rate": 9.272617534668253e-06, + "loss": 0.7387, + "step": 3705 + }, + { + "epoch": 1.0759181303527363, + "grad_norm": 3.4294612407684326, + "learning_rate": 9.272118714154985e-06, + "loss": 0.7991, + "step": 3706 + }, + { + "epoch": 1.0762084482508347, + "grad_norm": 3.3059473037719727, + "learning_rate": 9.271619736087672e-06, + "loss": 0.678, + "step": 3707 + }, + { + "epoch": 1.0764987661489331, + "grad_norm": 3.165100336074829, + "learning_rate": 9.271120600484719e-06, + "loss": 0.7196, + "step": 3708 + }, + { + "epoch": 1.0767890840470316, + "grad_norm": 3.8009140491485596, + "learning_rate": 9.270621307364534e-06, + "loss": 0.9077, + "step": 3709 + }, + { + "epoch": 1.07707940194513, + "grad_norm": 3.789745330810547, + "learning_rate": 9.270121856745529e-06, + "loss": 0.8262, + "step": 3710 + }, + { + "epoch": 1.0773697198432284, + "grad_norm": 3.822162628173828, + "learning_rate": 9.269622248646124e-06, + "loss": 0.8806, + "step": 3711 + }, + { + "epoch": 1.0776600377413268, + "grad_norm": 3.407487392425537, + "learning_rate": 9.269122483084748e-06, + "loss": 0.7972, + "step": 3712 + }, + { + "epoch": 1.0779503556394252, + "grad_norm": 3.5224902629852295, + "learning_rate": 9.268622560079825e-06, + "loss": 0.8497, + "step": 3713 + }, + { + "epoch": 1.0782406735375236, + "grad_norm": 3.553903102874756, + "learning_rate": 9.268122479649796e-06, + "loss": 0.7534, + "step": 3714 + }, + { + "epoch": 1.078530991435622, + "grad_norm": 3.266307830810547, + "learning_rate": 9.267622241813106e-06, + "loss": 0.707, + "step": 3715 + }, + { + "epoch": 1.0788213093337204, + "grad_norm": 3.3318376541137695, + "learning_rate": 9.267121846588201e-06, + "loss": 0.7378, + "step": 3716 + }, + { + "epoch": 1.0791116272318189, + "grad_norm": 3.259420871734619, + "learning_rate": 9.266621293993534e-06, + "loss": 0.7609, + "step": 3717 + }, + { + "epoch": 1.0794019451299173, + "grad_norm": 3.658750295639038, + "learning_rate": 9.26612058404757e-06, + "loss": 0.8624, + "step": 3718 + }, + { + "epoch": 1.0796922630280157, + "grad_norm": 3.5097463130950928, + "learning_rate": 9.265619716768769e-06, + "loss": 0.7934, + "step": 3719 + }, + { + "epoch": 1.079982580926114, + "grad_norm": 3.147826671600342, + "learning_rate": 9.265118692175605e-06, + "loss": 0.7036, + "step": 3720 + }, + { + "epoch": 1.0802728988242125, + "grad_norm": 3.7938437461853027, + "learning_rate": 9.264617510286558e-06, + "loss": 0.788, + "step": 3721 + }, + { + "epoch": 1.080563216722311, + "grad_norm": 3.502878189086914, + "learning_rate": 9.26411617112011e-06, + "loss": 0.6864, + "step": 3722 + }, + { + "epoch": 1.0808535346204093, + "grad_norm": 3.6998252868652344, + "learning_rate": 9.263614674694748e-06, + "loss": 0.8459, + "step": 3723 + }, + { + "epoch": 1.0811438525185078, + "grad_norm": 3.7824223041534424, + "learning_rate": 9.26311302102897e-06, + "loss": 0.8461, + "step": 3724 + }, + { + "epoch": 1.0814341704166062, + "grad_norm": 3.34706711769104, + "learning_rate": 9.262611210141276e-06, + "loss": 0.8156, + "step": 3725 + }, + { + "epoch": 1.0817244883147046, + "grad_norm": 
3.4476208686828613, + "learning_rate": 9.262109242050172e-06, + "loss": 0.7911, + "step": 3726 + }, + { + "epoch": 1.082014806212803, + "grad_norm": 3.309239149093628, + "learning_rate": 9.26160711677417e-06, + "loss": 0.7158, + "step": 3727 + }, + { + "epoch": 1.0823051241109014, + "grad_norm": 3.8136990070343018, + "learning_rate": 9.261104834331788e-06, + "loss": 0.7803, + "step": 3728 + }, + { + "epoch": 1.0825954420089998, + "grad_norm": 3.3151988983154297, + "learning_rate": 9.260602394741551e-06, + "loss": 0.7313, + "step": 3729 + }, + { + "epoch": 1.0828857599070982, + "grad_norm": 3.0309386253356934, + "learning_rate": 9.260099798021988e-06, + "loss": 0.6643, + "step": 3730 + }, + { + "epoch": 1.0831760778051966, + "grad_norm": 3.5916686058044434, + "learning_rate": 9.259597044191635e-06, + "loss": 0.7616, + "step": 3731 + }, + { + "epoch": 1.083466395703295, + "grad_norm": 4.077143669128418, + "learning_rate": 9.259094133269036e-06, + "loss": 0.7774, + "step": 3732 + }, + { + "epoch": 1.0837567136013935, + "grad_norm": 3.529888391494751, + "learning_rate": 9.258591065272733e-06, + "loss": 0.7659, + "step": 3733 + }, + { + "epoch": 1.0840470314994919, + "grad_norm": 3.5668489933013916, + "learning_rate": 9.258087840221281e-06, + "loss": 0.8392, + "step": 3734 + }, + { + "epoch": 1.0843373493975903, + "grad_norm": 3.344179153442383, + "learning_rate": 9.257584458133242e-06, + "loss": 0.858, + "step": 3735 + }, + { + "epoch": 1.0846276672956887, + "grad_norm": 4.286630630493164, + "learning_rate": 9.257080919027175e-06, + "loss": 0.8578, + "step": 3736 + }, + { + "epoch": 1.0849179851937871, + "grad_norm": 3.0358517169952393, + "learning_rate": 9.256577222921654e-06, + "loss": 0.7462, + "step": 3737 + }, + { + "epoch": 1.0852083030918855, + "grad_norm": 3.1172049045562744, + "learning_rate": 9.256073369835255e-06, + "loss": 0.5998, + "step": 3738 + }, + { + "epoch": 1.085498620989984, + "grad_norm": 3.513422727584839, + "learning_rate": 9.255569359786558e-06, + "loss": 0.7894, + "step": 3739 + }, + { + "epoch": 1.0857889388880824, + "grad_norm": 3.954484462738037, + "learning_rate": 9.255065192794153e-06, + "loss": 0.9343, + "step": 3740 + }, + { + "epoch": 1.0860792567861808, + "grad_norm": 3.374732255935669, + "learning_rate": 9.254560868876633e-06, + "loss": 0.6729, + "step": 3741 + }, + { + "epoch": 1.0863695746842792, + "grad_norm": 3.012810230255127, + "learning_rate": 9.254056388052593e-06, + "loss": 0.7632, + "step": 3742 + }, + { + "epoch": 1.0866598925823776, + "grad_norm": 3.49700927734375, + "learning_rate": 9.253551750340643e-06, + "loss": 0.6696, + "step": 3743 + }, + { + "epoch": 1.086950210480476, + "grad_norm": 3.2697410583496094, + "learning_rate": 9.253046955759394e-06, + "loss": 0.6528, + "step": 3744 + }, + { + "epoch": 1.0872405283785747, + "grad_norm": 3.7874979972839355, + "learning_rate": 9.25254200432746e-06, + "loss": 0.8849, + "step": 3745 + }, + { + "epoch": 1.0875308462766728, + "grad_norm": 3.744913101196289, + "learning_rate": 9.252036896063464e-06, + "loss": 0.8497, + "step": 3746 + }, + { + "epoch": 1.0878211641747715, + "grad_norm": 3.6657257080078125, + "learning_rate": 9.251531630986036e-06, + "loss": 0.8023, + "step": 3747 + }, + { + "epoch": 1.08811148207287, + "grad_norm": 3.5472493171691895, + "learning_rate": 9.251026209113806e-06, + "loss": 0.7415, + "step": 3748 + }, + { + "epoch": 1.0884017999709683, + "grad_norm": 3.7813925743103027, + "learning_rate": 9.250520630465419e-06, + "loss": 0.9681, + "step": 3749 + }, + { + "epoch": 
1.0886921178690667, + "grad_norm": 3.2687952518463135, + "learning_rate": 9.250014895059518e-06, + "loss": 0.7353, + "step": 3750 + }, + { + "epoch": 1.0889824357671651, + "grad_norm": 3.2803022861480713, + "learning_rate": 9.249509002914752e-06, + "loss": 0.681, + "step": 3751 + }, + { + "epoch": 1.0892727536652635, + "grad_norm": 3.0684728622436523, + "learning_rate": 9.249002954049781e-06, + "loss": 0.7091, + "step": 3752 + }, + { + "epoch": 1.089563071563362, + "grad_norm": 3.981271982192993, + "learning_rate": 9.24849674848327e-06, + "loss": 0.8076, + "step": 3753 + }, + { + "epoch": 1.0898533894614604, + "grad_norm": 3.0908520221710205, + "learning_rate": 9.247990386233883e-06, + "loss": 0.7367, + "step": 3754 + }, + { + "epoch": 1.0901437073595588, + "grad_norm": 3.574917793273926, + "learning_rate": 9.247483867320295e-06, + "loss": 0.6696, + "step": 3755 + }, + { + "epoch": 1.0904340252576572, + "grad_norm": 3.577314853668213, + "learning_rate": 9.246977191761188e-06, + "loss": 0.8258, + "step": 3756 + }, + { + "epoch": 1.0907243431557556, + "grad_norm": 3.003840446472168, + "learning_rate": 9.246470359575249e-06, + "loss": 0.6683, + "step": 3757 + }, + { + "epoch": 1.091014661053854, + "grad_norm": 3.4558334350585938, + "learning_rate": 9.245963370781168e-06, + "loss": 0.7331, + "step": 3758 + }, + { + "epoch": 1.0913049789519524, + "grad_norm": 4.03562593460083, + "learning_rate": 9.245456225397642e-06, + "loss": 0.868, + "step": 3759 + }, + { + "epoch": 1.0915952968500509, + "grad_norm": 3.536433458328247, + "learning_rate": 9.244948923443376e-06, + "loss": 0.8345, + "step": 3760 + }, + { + "epoch": 1.0918856147481493, + "grad_norm": 3.7021758556365967, + "learning_rate": 9.244441464937077e-06, + "loss": 0.717, + "step": 3761 + }, + { + "epoch": 1.0921759326462477, + "grad_norm": 3.741546869277954, + "learning_rate": 9.243933849897462e-06, + "loss": 0.7938, + "step": 3762 + }, + { + "epoch": 1.092466250544346, + "grad_norm": 3.5878963470458984, + "learning_rate": 9.243426078343251e-06, + "loss": 0.8451, + "step": 3763 + }, + { + "epoch": 1.0927565684424445, + "grad_norm": 3.552255630493164, + "learning_rate": 9.242918150293169e-06, + "loss": 0.8474, + "step": 3764 + }, + { + "epoch": 1.093046886340543, + "grad_norm": 3.8845558166503906, + "learning_rate": 9.24241006576595e-06, + "loss": 0.7834, + "step": 3765 + }, + { + "epoch": 1.0933372042386413, + "grad_norm": 3.360624074935913, + "learning_rate": 9.241901824780331e-06, + "loss": 0.7395, + "step": 3766 + }, + { + "epoch": 1.0936275221367397, + "grad_norm": 3.2982327938079834, + "learning_rate": 9.241393427355056e-06, + "loss": 0.7452, + "step": 3767 + }, + { + "epoch": 1.0939178400348382, + "grad_norm": 3.23142409324646, + "learning_rate": 9.240884873508876e-06, + "loss": 0.6713, + "step": 3768 + }, + { + "epoch": 1.0942081579329366, + "grad_norm": 3.441584348678589, + "learning_rate": 9.240376163260545e-06, + "loss": 0.8075, + "step": 3769 + }, + { + "epoch": 1.094498475831035, + "grad_norm": 3.3424441814422607, + "learning_rate": 9.239867296628821e-06, + "loss": 0.7221, + "step": 3770 + }, + { + "epoch": 1.0947887937291334, + "grad_norm": 3.5608901977539062, + "learning_rate": 9.239358273632476e-06, + "loss": 0.8401, + "step": 3771 + }, + { + "epoch": 1.0950791116272318, + "grad_norm": 3.4727823734283447, + "learning_rate": 9.238849094290279e-06, + "loss": 0.7322, + "step": 3772 + }, + { + "epoch": 1.0953694295253302, + "grad_norm": 3.133427858352661, + "learning_rate": 9.238339758621011e-06, + "loss": 0.7485, + "step": 
3773 + }, + { + "epoch": 1.0956597474234286, + "grad_norm": 3.073030710220337, + "learning_rate": 9.237830266643453e-06, + "loss": 0.6532, + "step": 3774 + }, + { + "epoch": 1.095950065321527, + "grad_norm": 3.06816029548645, + "learning_rate": 9.237320618376398e-06, + "loss": 0.6492, + "step": 3775 + }, + { + "epoch": 1.0962403832196255, + "grad_norm": 3.501046657562256, + "learning_rate": 9.23681081383864e-06, + "loss": 0.8614, + "step": 3776 + }, + { + "epoch": 1.0965307011177239, + "grad_norm": 3.766171455383301, + "learning_rate": 9.236300853048978e-06, + "loss": 0.8673, + "step": 3777 + }, + { + "epoch": 1.0968210190158223, + "grad_norm": 4.0137553215026855, + "learning_rate": 9.235790736026225e-06, + "loss": 0.788, + "step": 3778 + }, + { + "epoch": 1.0971113369139207, + "grad_norm": 3.591977834701538, + "learning_rate": 9.235280462789188e-06, + "loss": 0.7047, + "step": 3779 + }, + { + "epoch": 1.0974016548120191, + "grad_norm": 3.4781503677368164, + "learning_rate": 9.23477003335669e-06, + "loss": 0.7515, + "step": 3780 + }, + { + "epoch": 1.0976919727101175, + "grad_norm": 3.477678060531616, + "learning_rate": 9.234259447747554e-06, + "loss": 0.7738, + "step": 3781 + }, + { + "epoch": 1.097982290608216, + "grad_norm": 3.9467685222625732, + "learning_rate": 9.233748705980607e-06, + "loss": 0.856, + "step": 3782 + }, + { + "epoch": 1.0982726085063144, + "grad_norm": 3.463690996170044, + "learning_rate": 9.233237808074691e-06, + "loss": 0.7639, + "step": 3783 + }, + { + "epoch": 1.0985629264044128, + "grad_norm": 3.620694875717163, + "learning_rate": 9.232726754048643e-06, + "loss": 0.8162, + "step": 3784 + }, + { + "epoch": 1.0988532443025112, + "grad_norm": 3.6893718242645264, + "learning_rate": 9.232215543921313e-06, + "loss": 0.8336, + "step": 3785 + }, + { + "epoch": 1.0991435622006096, + "grad_norm": 3.620185613632202, + "learning_rate": 9.231704177711552e-06, + "loss": 0.8067, + "step": 3786 + }, + { + "epoch": 1.099433880098708, + "grad_norm": 3.3584699630737305, + "learning_rate": 9.231192655438222e-06, + "loss": 0.7664, + "step": 3787 + }, + { + "epoch": 1.0997241979968064, + "grad_norm": 3.5024573802948, + "learning_rate": 9.230680977120184e-06, + "loss": 0.7521, + "step": 3788 + }, + { + "epoch": 1.1000145158949048, + "grad_norm": 3.554534435272217, + "learning_rate": 9.230169142776311e-06, + "loss": 0.7894, + "step": 3789 + }, + { + "epoch": 1.1003048337930033, + "grad_norm": 3.831371784210205, + "learning_rate": 9.22965715242548e-06, + "loss": 0.8239, + "step": 3790 + }, + { + "epoch": 1.1005951516911017, + "grad_norm": 3.851170778274536, + "learning_rate": 9.22914500608657e-06, + "loss": 0.7703, + "step": 3791 + }, + { + "epoch": 1.1008854695892, + "grad_norm": 3.348322868347168, + "learning_rate": 9.22863270377847e-06, + "loss": 0.7707, + "step": 3792 + }, + { + "epoch": 1.1011757874872985, + "grad_norm": 3.3687806129455566, + "learning_rate": 9.228120245520076e-06, + "loss": 0.7372, + "step": 3793 + }, + { + "epoch": 1.101466105385397, + "grad_norm": 3.2010092735290527, + "learning_rate": 9.227607631330285e-06, + "loss": 0.6718, + "step": 3794 + }, + { + "epoch": 1.1017564232834953, + "grad_norm": 4.082772731781006, + "learning_rate": 9.227094861228e-06, + "loss": 0.8161, + "step": 3795 + }, + { + "epoch": 1.102046741181594, + "grad_norm": 3.4030327796936035, + "learning_rate": 9.226581935232135e-06, + "loss": 0.8786, + "step": 3796 + }, + { + "epoch": 1.1023370590796921, + "grad_norm": 3.6011297702789307, + "learning_rate": 9.226068853361607e-06, + "loss": 0.7148, 
+ "step": 3797 + }, + { + "epoch": 1.1026273769777908, + "grad_norm": 3.7094037532806396, + "learning_rate": 9.225555615635336e-06, + "loss": 0.7745, + "step": 3798 + }, + { + "epoch": 1.1029176948758892, + "grad_norm": 3.2469165325164795, + "learning_rate": 9.225042222072251e-06, + "loss": 0.6453, + "step": 3799 + }, + { + "epoch": 1.1032080127739876, + "grad_norm": 3.479039430618286, + "learning_rate": 9.224528672691284e-06, + "loss": 0.7451, + "step": 3800 + }, + { + "epoch": 1.103498330672086, + "grad_norm": 3.0856449604034424, + "learning_rate": 9.224014967511378e-06, + "loss": 0.7583, + "step": 3801 + }, + { + "epoch": 1.1037886485701844, + "grad_norm": 3.4856984615325928, + "learning_rate": 9.223501106551475e-06, + "loss": 0.6975, + "step": 3802 + }, + { + "epoch": 1.1040789664682829, + "grad_norm": 3.5641419887542725, + "learning_rate": 9.222987089830528e-06, + "loss": 0.7357, + "step": 3803 + }, + { + "epoch": 1.1043692843663813, + "grad_norm": 3.8171226978302, + "learning_rate": 9.222472917367492e-06, + "loss": 0.8233, + "step": 3804 + }, + { + "epoch": 1.1046596022644797, + "grad_norm": 3.733131170272827, + "learning_rate": 9.22195858918133e-06, + "loss": 0.7833, + "step": 3805 + }, + { + "epoch": 1.104949920162578, + "grad_norm": 3.9596691131591797, + "learning_rate": 9.221444105291013e-06, + "loss": 0.8639, + "step": 3806 + }, + { + "epoch": 1.1052402380606765, + "grad_norm": 3.4496874809265137, + "learning_rate": 9.22092946571551e-06, + "loss": 0.7427, + "step": 3807 + }, + { + "epoch": 1.105530555958775, + "grad_norm": 3.837810754776001, + "learning_rate": 9.220414670473806e-06, + "loss": 0.7449, + "step": 3808 + }, + { + "epoch": 1.1058208738568733, + "grad_norm": 3.513516902923584, + "learning_rate": 9.219899719584882e-06, + "loss": 0.8359, + "step": 3809 + }, + { + "epoch": 1.1061111917549717, + "grad_norm": 3.4239394664764404, + "learning_rate": 9.21938461306773e-06, + "loss": 0.6477, + "step": 3810 + }, + { + "epoch": 1.1064015096530702, + "grad_norm": 3.192553758621216, + "learning_rate": 9.21886935094135e-06, + "loss": 0.6797, + "step": 3811 + }, + { + "epoch": 1.1066918275511686, + "grad_norm": 3.2809319496154785, + "learning_rate": 9.218353933224743e-06, + "loss": 0.7457, + "step": 3812 + }, + { + "epoch": 1.106982145449267, + "grad_norm": 3.670210361480713, + "learning_rate": 9.217838359936914e-06, + "loss": 0.6784, + "step": 3813 + }, + { + "epoch": 1.1072724633473654, + "grad_norm": 3.770373582839966, + "learning_rate": 9.21732263109688e-06, + "loss": 0.8031, + "step": 3814 + }, + { + "epoch": 1.1075627812454638, + "grad_norm": 3.9848551750183105, + "learning_rate": 9.216806746723666e-06, + "loss": 0.8274, + "step": 3815 + }, + { + "epoch": 1.1078530991435622, + "grad_norm": 3.6225457191467285, + "learning_rate": 9.216290706836288e-06, + "loss": 0.8351, + "step": 3816 + }, + { + "epoch": 1.1081434170416606, + "grad_norm": 3.5515317916870117, + "learning_rate": 9.215774511453784e-06, + "loss": 0.6946, + "step": 3817 + }, + { + "epoch": 1.108433734939759, + "grad_norm": 3.5677294731140137, + "learning_rate": 9.215258160595187e-06, + "loss": 0.7142, + "step": 3818 + }, + { + "epoch": 1.1087240528378575, + "grad_norm": 3.2002451419830322, + "learning_rate": 9.214741654279543e-06, + "loss": 0.7483, + "step": 3819 + }, + { + "epoch": 1.1090143707359559, + "grad_norm": 3.1444714069366455, + "learning_rate": 9.2142249925259e-06, + "loss": 0.7511, + "step": 3820 + }, + { + "epoch": 1.1093046886340543, + "grad_norm": 3.7607555389404297, + "learning_rate": 
9.213708175353311e-06, + "loss": 0.6861, + "step": 3821 + }, + { + "epoch": 1.1095950065321527, + "grad_norm": 3.2420289516448975, + "learning_rate": 9.213191202780835e-06, + "loss": 0.6305, + "step": 3822 + }, + { + "epoch": 1.1098853244302511, + "grad_norm": 3.4901387691497803, + "learning_rate": 9.212674074827542e-06, + "loss": 0.8123, + "step": 3823 + }, + { + "epoch": 1.1101756423283495, + "grad_norm": 3.4428091049194336, + "learning_rate": 9.212156791512502e-06, + "loss": 0.8259, + "step": 3824 + }, + { + "epoch": 1.110465960226448, + "grad_norm": 3.0317587852478027, + "learning_rate": 9.211639352854786e-06, + "loss": 0.67, + "step": 3825 + }, + { + "epoch": 1.1107562781245464, + "grad_norm": 3.2258551120758057, + "learning_rate": 9.211121758873487e-06, + "loss": 0.7019, + "step": 3826 + }, + { + "epoch": 1.1110465960226448, + "grad_norm": 3.6131057739257812, + "learning_rate": 9.210604009587687e-06, + "loss": 0.8236, + "step": 3827 + }, + { + "epoch": 1.1113369139207432, + "grad_norm": 3.5522913932800293, + "learning_rate": 9.21008610501648e-06, + "loss": 0.7453, + "step": 3828 + }, + { + "epoch": 1.1116272318188416, + "grad_norm": 3.3678643703460693, + "learning_rate": 9.20956804517897e-06, + "loss": 0.7088, + "step": 3829 + }, + { + "epoch": 1.11191754971694, + "grad_norm": 3.779475688934326, + "learning_rate": 9.20904983009426e-06, + "loss": 0.7937, + "step": 3830 + }, + { + "epoch": 1.1122078676150384, + "grad_norm": 3.308375597000122, + "learning_rate": 9.208531459781464e-06, + "loss": 0.7086, + "step": 3831 + }, + { + "epoch": 1.1124981855131368, + "grad_norm": 3.5668954849243164, + "learning_rate": 9.208012934259697e-06, + "loss": 0.7745, + "step": 3832 + }, + { + "epoch": 1.1127885034112353, + "grad_norm": 3.0808634757995605, + "learning_rate": 9.207494253548084e-06, + "loss": 0.6845, + "step": 3833 + }, + { + "epoch": 1.1130788213093337, + "grad_norm": 3.044464349746704, + "learning_rate": 9.206975417665751e-06, + "loss": 0.7371, + "step": 3834 + }, + { + "epoch": 1.113369139207432, + "grad_norm": 3.4729931354522705, + "learning_rate": 9.206456426631836e-06, + "loss": 0.7245, + "step": 3835 + }, + { + "epoch": 1.1136594571055305, + "grad_norm": 3.503591775894165, + "learning_rate": 9.205937280465476e-06, + "loss": 0.7385, + "step": 3836 + }, + { + "epoch": 1.113949775003629, + "grad_norm": 3.2636380195617676, + "learning_rate": 9.205417979185818e-06, + "loss": 0.7385, + "step": 3837 + }, + { + "epoch": 1.1142400929017273, + "grad_norm": 4.049813747406006, + "learning_rate": 9.204898522812015e-06, + "loss": 0.8251, + "step": 3838 + }, + { + "epoch": 1.1145304107998257, + "grad_norm": 3.246598958969116, + "learning_rate": 9.204378911363222e-06, + "loss": 0.6892, + "step": 3839 + }, + { + "epoch": 1.1148207286979241, + "grad_norm": 3.6350643634796143, + "learning_rate": 9.203859144858604e-06, + "loss": 0.8535, + "step": 3840 + }, + { + "epoch": 1.1151110465960226, + "grad_norm": 3.558542251586914, + "learning_rate": 9.203339223317328e-06, + "loss": 0.8299, + "step": 3841 + }, + { + "epoch": 1.115401364494121, + "grad_norm": 3.504409074783325, + "learning_rate": 9.20281914675857e-06, + "loss": 0.7694, + "step": 3842 + }, + { + "epoch": 1.1156916823922194, + "grad_norm": 3.365307569503784, + "learning_rate": 9.20229891520151e-06, + "loss": 0.7959, + "step": 3843 + }, + { + "epoch": 1.1159820002903178, + "grad_norm": 3.161320447921753, + "learning_rate": 9.201778528665333e-06, + "loss": 0.6549, + "step": 3844 + }, + { + "epoch": 1.1162723181884162, + "grad_norm": 
3.2018449306488037, + "learning_rate": 9.201257987169233e-06, + "loss": 0.6626, + "step": 3845 + }, + { + "epoch": 1.1165626360865146, + "grad_norm": 3.6142992973327637, + "learning_rate": 9.200737290732402e-06, + "loss": 0.7719, + "step": 3846 + }, + { + "epoch": 1.1168529539846133, + "grad_norm": 3.2540829181671143, + "learning_rate": 9.20021643937405e-06, + "loss": 0.6995, + "step": 3847 + }, + { + "epoch": 1.1171432718827115, + "grad_norm": 3.530956268310547, + "learning_rate": 9.19969543311338e-06, + "loss": 0.857, + "step": 3848 + }, + { + "epoch": 1.11743358978081, + "grad_norm": 3.8063101768493652, + "learning_rate": 9.199174271969612e-06, + "loss": 0.8972, + "step": 3849 + }, + { + "epoch": 1.1177239076789085, + "grad_norm": 3.33796763420105, + "learning_rate": 9.198652955961961e-06, + "loss": 0.7059, + "step": 3850 + }, + { + "epoch": 1.118014225577007, + "grad_norm": 3.4572362899780273, + "learning_rate": 9.198131485109656e-06, + "loss": 0.7459, + "step": 3851 + }, + { + "epoch": 1.1183045434751053, + "grad_norm": 4.223832607269287, + "learning_rate": 9.197609859431928e-06, + "loss": 0.8582, + "step": 3852 + }, + { + "epoch": 1.1185948613732037, + "grad_norm": 3.749410390853882, + "learning_rate": 9.197088078948013e-06, + "loss": 0.7968, + "step": 3853 + }, + { + "epoch": 1.1188851792713022, + "grad_norm": 3.3402292728424072, + "learning_rate": 9.196566143677157e-06, + "loss": 0.7766, + "step": 3854 + }, + { + "epoch": 1.1191754971694006, + "grad_norm": 3.567389488220215, + "learning_rate": 9.196044053638607e-06, + "loss": 0.8716, + "step": 3855 + }, + { + "epoch": 1.119465815067499, + "grad_norm": 3.3039045333862305, + "learning_rate": 9.195521808851615e-06, + "loss": 0.6931, + "step": 3856 + }, + { + "epoch": 1.1197561329655974, + "grad_norm": 3.9325478076934814, + "learning_rate": 9.194999409335446e-06, + "loss": 0.8135, + "step": 3857 + }, + { + "epoch": 1.1200464508636958, + "grad_norm": 3.852951765060425, + "learning_rate": 9.194476855109362e-06, + "loss": 0.8106, + "step": 3858 + }, + { + "epoch": 1.1203367687617942, + "grad_norm": 3.6040732860565186, + "learning_rate": 9.193954146192638e-06, + "loss": 0.766, + "step": 3859 + }, + { + "epoch": 1.1206270866598926, + "grad_norm": 3.2979674339294434, + "learning_rate": 9.193431282604547e-06, + "loss": 0.7364, + "step": 3860 + }, + { + "epoch": 1.120917404557991, + "grad_norm": 3.225715160369873, + "learning_rate": 9.192908264364377e-06, + "loss": 0.7519, + "step": 3861 + }, + { + "epoch": 1.1212077224560895, + "grad_norm": 3.7926652431488037, + "learning_rate": 9.192385091491411e-06, + "loss": 0.7857, + "step": 3862 + }, + { + "epoch": 1.1214980403541879, + "grad_norm": 3.2855775356292725, + "learning_rate": 9.19186176400495e-06, + "loss": 0.674, + "step": 3863 + }, + { + "epoch": 1.1217883582522863, + "grad_norm": 3.847721815109253, + "learning_rate": 9.191338281924288e-06, + "loss": 0.826, + "step": 3864 + }, + { + "epoch": 1.1220786761503847, + "grad_norm": 3.684709072113037, + "learning_rate": 9.190814645268735e-06, + "loss": 0.8217, + "step": 3865 + }, + { + "epoch": 1.1223689940484831, + "grad_norm": 3.2224950790405273, + "learning_rate": 9.1902908540576e-06, + "loss": 0.7144, + "step": 3866 + }, + { + "epoch": 1.1226593119465815, + "grad_norm": 3.4135384559631348, + "learning_rate": 9.1897669083102e-06, + "loss": 0.8183, + "step": 3867 + }, + { + "epoch": 1.12294962984468, + "grad_norm": 3.310356616973877, + "learning_rate": 9.189242808045862e-06, + "loss": 0.8442, + "step": 3868 + }, + { + "epoch": 
1.1232399477427784, + "grad_norm": 3.4118008613586426, + "learning_rate": 9.188718553283912e-06, + "loss": 0.7003, + "step": 3869 + }, + { + "epoch": 1.1235302656408768, + "grad_norm": 3.467306613922119, + "learning_rate": 9.18819414404368e-06, + "loss": 0.6666, + "step": 3870 + }, + { + "epoch": 1.1238205835389752, + "grad_norm": 3.144047737121582, + "learning_rate": 9.187669580344512e-06, + "loss": 0.7123, + "step": 3871 + }, + { + "epoch": 1.1241109014370736, + "grad_norm": 3.6717677116394043, + "learning_rate": 9.187144862205753e-06, + "loss": 0.812, + "step": 3872 + }, + { + "epoch": 1.124401219335172, + "grad_norm": 4.038080215454102, + "learning_rate": 9.186619989646753e-06, + "loss": 0.7922, + "step": 3873 + }, + { + "epoch": 1.1246915372332704, + "grad_norm": 3.4617326259613037, + "learning_rate": 9.186094962686867e-06, + "loss": 0.7475, + "step": 3874 + }, + { + "epoch": 1.1249818551313688, + "grad_norm": 3.546358823776245, + "learning_rate": 9.18556978134546e-06, + "loss": 0.675, + "step": 3875 + }, + { + "epoch": 1.1252721730294672, + "grad_norm": 3.410590648651123, + "learning_rate": 9.185044445641902e-06, + "loss": 0.7824, + "step": 3876 + }, + { + "epoch": 1.1255624909275657, + "grad_norm": 3.880183458328247, + "learning_rate": 9.184518955595567e-06, + "loss": 0.9077, + "step": 3877 + }, + { + "epoch": 1.125852808825664, + "grad_norm": 3.3557281494140625, + "learning_rate": 9.18399331122583e-06, + "loss": 0.699, + "step": 3878 + }, + { + "epoch": 1.1261431267237625, + "grad_norm": 3.676377773284912, + "learning_rate": 9.183467512552082e-06, + "loss": 0.8433, + "step": 3879 + }, + { + "epoch": 1.126433444621861, + "grad_norm": 3.825648069381714, + "learning_rate": 9.182941559593713e-06, + "loss": 0.7285, + "step": 3880 + }, + { + "epoch": 1.1267237625199593, + "grad_norm": 3.3879647254943848, + "learning_rate": 9.182415452370119e-06, + "loss": 0.7921, + "step": 3881 + }, + { + "epoch": 1.1270140804180577, + "grad_norm": 3.6778652667999268, + "learning_rate": 9.181889190900702e-06, + "loss": 0.7911, + "step": 3882 + }, + { + "epoch": 1.1273043983161561, + "grad_norm": 3.598294734954834, + "learning_rate": 9.181362775204871e-06, + "loss": 0.7536, + "step": 3883 + }, + { + "epoch": 1.1275947162142546, + "grad_norm": 3.4838759899139404, + "learning_rate": 9.18083620530204e-06, + "loss": 0.8197, + "step": 3884 + }, + { + "epoch": 1.127885034112353, + "grad_norm": 3.5205631256103516, + "learning_rate": 9.180309481211629e-06, + "loss": 0.7828, + "step": 3885 + }, + { + "epoch": 1.1281753520104514, + "grad_norm": 3.924164295196533, + "learning_rate": 9.179782602953065e-06, + "loss": 0.7685, + "step": 3886 + }, + { + "epoch": 1.1284656699085498, + "grad_norm": 3.336639881134033, + "learning_rate": 9.179255570545775e-06, + "loss": 0.7275, + "step": 3887 + }, + { + "epoch": 1.1287559878066482, + "grad_norm": 3.553356885910034, + "learning_rate": 9.178728384009199e-06, + "loss": 0.7881, + "step": 3888 + }, + { + "epoch": 1.1290463057047466, + "grad_norm": 3.6561996936798096, + "learning_rate": 9.178201043362778e-06, + "loss": 0.876, + "step": 3889 + }, + { + "epoch": 1.129336623602845, + "grad_norm": 7.617891788482666, + "learning_rate": 9.177673548625962e-06, + "loss": 0.6766, + "step": 3890 + }, + { + "epoch": 1.1296269415009434, + "grad_norm": 3.3711862564086914, + "learning_rate": 9.177145899818203e-06, + "loss": 0.79, + "step": 3891 + }, + { + "epoch": 1.1299172593990419, + "grad_norm": 3.308711528778076, + "learning_rate": 9.17661809695896e-06, + "loss": 0.7435, + "step": 3892 + }, 
+ { + "epoch": 1.1302075772971403, + "grad_norm": 3.669429063796997, + "learning_rate": 9.176090140067699e-06, + "loss": 0.647, + "step": 3893 + }, + { + "epoch": 1.130497895195239, + "grad_norm": 3.878659248352051, + "learning_rate": 9.175562029163892e-06, + "loss": 0.7192, + "step": 3894 + }, + { + "epoch": 1.130788213093337, + "grad_norm": 3.555819272994995, + "learning_rate": 9.175033764267013e-06, + "loss": 0.7141, + "step": 3895 + }, + { + "epoch": 1.1310785309914357, + "grad_norm": 3.896650791168213, + "learning_rate": 9.174505345396546e-06, + "loss": 0.823, + "step": 3896 + }, + { + "epoch": 1.131368848889534, + "grad_norm": 3.3993911743164062, + "learning_rate": 9.173976772571978e-06, + "loss": 0.7859, + "step": 3897 + }, + { + "epoch": 1.1316591667876326, + "grad_norm": 3.185831069946289, + "learning_rate": 9.173448045812806e-06, + "loss": 0.8121, + "step": 3898 + }, + { + "epoch": 1.1319494846857308, + "grad_norm": 3.3628885746002197, + "learning_rate": 9.172919165138523e-06, + "loss": 0.6954, + "step": 3899 + }, + { + "epoch": 1.1322398025838294, + "grad_norm": 3.817692995071411, + "learning_rate": 9.172390130568638e-06, + "loss": 0.892, + "step": 3900 + }, + { + "epoch": 1.1325301204819278, + "grad_norm": 3.3503918647766113, + "learning_rate": 9.17186094212266e-06, + "loss": 0.6653, + "step": 3901 + }, + { + "epoch": 1.1328204383800262, + "grad_norm": 3.7152490615844727, + "learning_rate": 9.171331599820106e-06, + "loss": 0.7165, + "step": 3902 + }, + { + "epoch": 1.1331107562781246, + "grad_norm": 3.5846714973449707, + "learning_rate": 9.1708021036805e-06, + "loss": 0.7886, + "step": 3903 + }, + { + "epoch": 1.133401074176223, + "grad_norm": 3.3426952362060547, + "learning_rate": 9.170272453723365e-06, + "loss": 0.7662, + "step": 3904 + }, + { + "epoch": 1.1336913920743215, + "grad_norm": 3.628878355026245, + "learning_rate": 9.169742649968238e-06, + "loss": 0.7641, + "step": 3905 + }, + { + "epoch": 1.1339817099724199, + "grad_norm": 3.536870002746582, + "learning_rate": 9.169212692434658e-06, + "loss": 0.7743, + "step": 3906 + }, + { + "epoch": 1.1342720278705183, + "grad_norm": 3.908158540725708, + "learning_rate": 9.168682581142168e-06, + "loss": 0.8958, + "step": 3907 + }, + { + "epoch": 1.1345623457686167, + "grad_norm": 3.5317137241363525, + "learning_rate": 9.168152316110318e-06, + "loss": 0.7183, + "step": 3908 + }, + { + "epoch": 1.1348526636667151, + "grad_norm": 3.372509479522705, + "learning_rate": 9.167621897358665e-06, + "loss": 0.8437, + "step": 3909 + }, + { + "epoch": 1.1351429815648135, + "grad_norm": 3.9317944049835205, + "learning_rate": 9.16709132490677e-06, + "loss": 0.7876, + "step": 3910 + }, + { + "epoch": 1.135433299462912, + "grad_norm": 4.102498531341553, + "learning_rate": 9.166560598774201e-06, + "loss": 0.8697, + "step": 3911 + }, + { + "epoch": 1.1357236173610104, + "grad_norm": 4.032670974731445, + "learning_rate": 9.16602971898053e-06, + "loss": 0.8186, + "step": 3912 + }, + { + "epoch": 1.1360139352591088, + "grad_norm": 3.8047587871551514, + "learning_rate": 9.165498685545335e-06, + "loss": 0.7771, + "step": 3913 + }, + { + "epoch": 1.1363042531572072, + "grad_norm": 3.7372140884399414, + "learning_rate": 9.164967498488203e-06, + "loss": 0.8252, + "step": 3914 + }, + { + "epoch": 1.1365945710553056, + "grad_norm": 3.7121047973632812, + "learning_rate": 9.164436157828721e-06, + "loss": 0.7722, + "step": 3915 + }, + { + "epoch": 1.136884888953404, + "grad_norm": 3.0970237255096436, + "learning_rate": 9.16390466358649e-06, + "loss": 0.6937, + 
"step": 3916 + }, + { + "epoch": 1.1371752068515024, + "grad_norm": 3.5272154808044434, + "learning_rate": 9.163373015781104e-06, + "loss": 0.7701, + "step": 3917 + }, + { + "epoch": 1.1374655247496008, + "grad_norm": 3.2425544261932373, + "learning_rate": 9.162841214432174e-06, + "loss": 0.771, + "step": 3918 + }, + { + "epoch": 1.1377558426476992, + "grad_norm": 3.648613452911377, + "learning_rate": 9.162309259559313e-06, + "loss": 0.8285, + "step": 3919 + }, + { + "epoch": 1.1380461605457977, + "grad_norm": 3.265514373779297, + "learning_rate": 9.161777151182137e-06, + "loss": 0.7192, + "step": 3920 + }, + { + "epoch": 1.138336478443896, + "grad_norm": 3.608022689819336, + "learning_rate": 9.161244889320271e-06, + "loss": 0.6825, + "step": 3921 + }, + { + "epoch": 1.1386267963419945, + "grad_norm": 3.5768356323242188, + "learning_rate": 9.160712473993347e-06, + "loss": 0.7143, + "step": 3922 + }, + { + "epoch": 1.138917114240093, + "grad_norm": 3.050487518310547, + "learning_rate": 9.160179905220995e-06, + "loss": 0.6958, + "step": 3923 + }, + { + "epoch": 1.1392074321381913, + "grad_norm": 3.143773078918457, + "learning_rate": 9.159647183022862e-06, + "loss": 0.6979, + "step": 3924 + }, + { + "epoch": 1.1394977500362897, + "grad_norm": 3.6614866256713867, + "learning_rate": 9.159114307418589e-06, + "loss": 0.6862, + "step": 3925 + }, + { + "epoch": 1.1397880679343881, + "grad_norm": 4.121794700622559, + "learning_rate": 9.158581278427833e-06, + "loss": 0.9153, + "step": 3926 + }, + { + "epoch": 1.1400783858324866, + "grad_norm": 3.5927717685699463, + "learning_rate": 9.158048096070249e-06, + "loss": 0.7082, + "step": 3927 + }, + { + "epoch": 1.140368703730585, + "grad_norm": 3.526240825653076, + "learning_rate": 9.1575147603655e-06, + "loss": 0.7109, + "step": 3928 + }, + { + "epoch": 1.1406590216286834, + "grad_norm": 3.6357266902923584, + "learning_rate": 9.156981271333258e-06, + "loss": 0.8743, + "step": 3929 + }, + { + "epoch": 1.1409493395267818, + "grad_norm": 3.472874879837036, + "learning_rate": 9.156447628993197e-06, + "loss": 0.7222, + "step": 3930 + }, + { + "epoch": 1.1412396574248802, + "grad_norm": 3.201047420501709, + "learning_rate": 9.155913833364995e-06, + "loss": 0.7311, + "step": 3931 + }, + { + "epoch": 1.1415299753229786, + "grad_norm": 3.7483444213867188, + "learning_rate": 9.15537988446834e-06, + "loss": 0.8526, + "step": 3932 + }, + { + "epoch": 1.141820293221077, + "grad_norm": 3.605494737625122, + "learning_rate": 9.154845782322926e-06, + "loss": 0.8127, + "step": 3933 + }, + { + "epoch": 1.1421106111191754, + "grad_norm": 3.23360013961792, + "learning_rate": 9.154311526948446e-06, + "loss": 0.7632, + "step": 3934 + }, + { + "epoch": 1.1424009290172739, + "grad_norm": 3.5619661808013916, + "learning_rate": 9.153777118364607e-06, + "loss": 0.7391, + "step": 3935 + }, + { + "epoch": 1.1426912469153723, + "grad_norm": 3.4658992290496826, + "learning_rate": 9.153242556591115e-06, + "loss": 0.7462, + "step": 3936 + }, + { + "epoch": 1.1429815648134707, + "grad_norm": 3.5309834480285645, + "learning_rate": 9.152707841647687e-06, + "loss": 0.7752, + "step": 3937 + }, + { + "epoch": 1.143271882711569, + "grad_norm": 3.535386323928833, + "learning_rate": 9.15217297355404e-06, + "loss": 0.8618, + "step": 3938 + }, + { + "epoch": 1.1435622006096675, + "grad_norm": 3.6657485961914062, + "learning_rate": 9.151637952329903e-06, + "loss": 0.806, + "step": 3939 + }, + { + "epoch": 1.143852518507766, + "grad_norm": 3.7661120891571045, + "learning_rate": 9.151102777995007e-06, 
+ "loss": 0.77, + "step": 3940 + }, + { + "epoch": 1.1441428364058643, + "grad_norm": 3.8279857635498047, + "learning_rate": 9.150567450569086e-06, + "loss": 0.8114, + "step": 3941 + }, + { + "epoch": 1.1444331543039628, + "grad_norm": 3.7410404682159424, + "learning_rate": 9.150031970071884e-06, + "loss": 0.7662, + "step": 3942 + }, + { + "epoch": 1.1447234722020612, + "grad_norm": 3.9251410961151123, + "learning_rate": 9.149496336523151e-06, + "loss": 0.7287, + "step": 3943 + }, + { + "epoch": 1.1450137901001596, + "grad_norm": 4.000790596008301, + "learning_rate": 9.14896054994264e-06, + "loss": 0.8683, + "step": 3944 + }, + { + "epoch": 1.1453041079982582, + "grad_norm": 3.3374154567718506, + "learning_rate": 9.148424610350111e-06, + "loss": 0.6767, + "step": 3945 + }, + { + "epoch": 1.1455944258963564, + "grad_norm": 3.548007011413574, + "learning_rate": 9.147888517765326e-06, + "loss": 0.7309, + "step": 3946 + }, + { + "epoch": 1.145884743794455, + "grad_norm": 3.259523630142212, + "learning_rate": 9.147352272208061e-06, + "loss": 0.7373, + "step": 3947 + }, + { + "epoch": 1.1461750616925532, + "grad_norm": 3.932647466659546, + "learning_rate": 9.14681587369809e-06, + "loss": 0.8901, + "step": 3948 + }, + { + "epoch": 1.1464653795906519, + "grad_norm": 4.032466411590576, + "learning_rate": 9.146279322255194e-06, + "loss": 0.8693, + "step": 3949 + }, + { + "epoch": 1.14675569748875, + "grad_norm": 2.9973812103271484, + "learning_rate": 9.14574261789916e-06, + "loss": 0.6747, + "step": 3950 + }, + { + "epoch": 1.1470460153868487, + "grad_norm": 3.4267399311065674, + "learning_rate": 9.145205760649787e-06, + "loss": 0.7947, + "step": 3951 + }, + { + "epoch": 1.147336333284947, + "grad_norm": 3.820967197418213, + "learning_rate": 9.14466875052687e-06, + "loss": 0.87, + "step": 3952 + }, + { + "epoch": 1.1476266511830455, + "grad_norm": 3.9774868488311768, + "learning_rate": 9.144131587550214e-06, + "loss": 0.7757, + "step": 3953 + }, + { + "epoch": 1.147916969081144, + "grad_norm": 3.902125597000122, + "learning_rate": 9.143594271739628e-06, + "loss": 0.8454, + "step": 3954 + }, + { + "epoch": 1.1482072869792423, + "grad_norm": 3.565986156463623, + "learning_rate": 9.14305680311493e-06, + "loss": 0.8091, + "step": 3955 + }, + { + "epoch": 1.1484976048773408, + "grad_norm": 3.5948240756988525, + "learning_rate": 9.142519181695943e-06, + "loss": 0.8775, + "step": 3956 + }, + { + "epoch": 1.1487879227754392, + "grad_norm": 3.324223279953003, + "learning_rate": 9.141981407502492e-06, + "loss": 0.5903, + "step": 3957 + }, + { + "epoch": 1.1490782406735376, + "grad_norm": 3.6919519901275635, + "learning_rate": 9.141443480554408e-06, + "loss": 0.6949, + "step": 3958 + }, + { + "epoch": 1.149368558571636, + "grad_norm": 4.20566987991333, + "learning_rate": 9.140905400871535e-06, + "loss": 0.9056, + "step": 3959 + }, + { + "epoch": 1.1496588764697344, + "grad_norm": 3.5956645011901855, + "learning_rate": 9.140367168473711e-06, + "loss": 0.8069, + "step": 3960 + }, + { + "epoch": 1.1499491943678328, + "grad_norm": 3.393167734146118, + "learning_rate": 9.139828783380791e-06, + "loss": 0.7518, + "step": 3961 + }, + { + "epoch": 1.1502395122659312, + "grad_norm": 3.863666534423828, + "learning_rate": 9.13929024561263e-06, + "loss": 0.8516, + "step": 3962 + }, + { + "epoch": 1.1505298301640297, + "grad_norm": 3.6960501670837402, + "learning_rate": 9.138751555189084e-06, + "loss": 0.8199, + "step": 3963 + }, + { + "epoch": 1.150820148062128, + "grad_norm": 3.536280393600464, + "learning_rate": 
9.138212712130024e-06, + "loss": 0.7833, + "step": 3964 + }, + { + "epoch": 1.1511104659602265, + "grad_norm": 3.709526538848877, + "learning_rate": 9.137673716455322e-06, + "loss": 0.7854, + "step": 3965 + }, + { + "epoch": 1.151400783858325, + "grad_norm": 4.362963676452637, + "learning_rate": 9.137134568184855e-06, + "loss": 1.0421, + "step": 3966 + }, + { + "epoch": 1.1516911017564233, + "grad_norm": 3.34218168258667, + "learning_rate": 9.136595267338507e-06, + "loss": 0.7751, + "step": 3967 + }, + { + "epoch": 1.1519814196545217, + "grad_norm": 3.735380172729492, + "learning_rate": 9.136055813936167e-06, + "loss": 0.7086, + "step": 3968 + }, + { + "epoch": 1.1522717375526201, + "grad_norm": 3.6577022075653076, + "learning_rate": 9.13551620799773e-06, + "loss": 0.792, + "step": 3969 + }, + { + "epoch": 1.1525620554507185, + "grad_norm": 3.5989091396331787, + "learning_rate": 9.134976449543097e-06, + "loss": 0.8835, + "step": 3970 + }, + { + "epoch": 1.152852373348817, + "grad_norm": 3.620215654373169, + "learning_rate": 9.134436538592173e-06, + "loss": 0.8646, + "step": 3971 + }, + { + "epoch": 1.1531426912469154, + "grad_norm": 3.421151638031006, + "learning_rate": 9.13389647516487e-06, + "loss": 0.8433, + "step": 3972 + }, + { + "epoch": 1.1534330091450138, + "grad_norm": 3.528225898742676, + "learning_rate": 9.133356259281106e-06, + "loss": 0.7508, + "step": 3973 + }, + { + "epoch": 1.1537233270431122, + "grad_norm": 3.510094165802002, + "learning_rate": 9.132815890960802e-06, + "loss": 0.8679, + "step": 3974 + }, + { + "epoch": 1.1540136449412106, + "grad_norm": 3.6377060413360596, + "learning_rate": 9.132275370223889e-06, + "loss": 0.8165, + "step": 3975 + }, + { + "epoch": 1.154303962839309, + "grad_norm": 3.446504592895508, + "learning_rate": 9.1317346970903e-06, + "loss": 0.724, + "step": 3976 + }, + { + "epoch": 1.1545942807374074, + "grad_norm": 3.3458592891693115, + "learning_rate": 9.131193871579975e-06, + "loss": 0.7576, + "step": 3977 + }, + { + "epoch": 1.1548845986355059, + "grad_norm": 3.1824088096618652, + "learning_rate": 9.13065289371286e-06, + "loss": 0.7326, + "step": 3978 + }, + { + "epoch": 1.1551749165336043, + "grad_norm": 3.756395101547241, + "learning_rate": 9.130111763508905e-06, + "loss": 0.7555, + "step": 3979 + }, + { + "epoch": 1.1554652344317027, + "grad_norm": 3.877638816833496, + "learning_rate": 9.129570480988067e-06, + "loss": 0.7437, + "step": 3980 + }, + { + "epoch": 1.155755552329801, + "grad_norm": 4.016098499298096, + "learning_rate": 9.129029046170309e-06, + "loss": 0.8865, + "step": 3981 + }, + { + "epoch": 1.1560458702278995, + "grad_norm": 4.113951683044434, + "learning_rate": 9.128487459075596e-06, + "loss": 0.6799, + "step": 3982 + }, + { + "epoch": 1.156336188125998, + "grad_norm": 3.6073343753814697, + "learning_rate": 9.127945719723908e-06, + "loss": 0.7611, + "step": 3983 + }, + { + "epoch": 1.1566265060240963, + "grad_norm": 3.384596347808838, + "learning_rate": 9.127403828135217e-06, + "loss": 0.7778, + "step": 3984 + }, + { + "epoch": 1.1569168239221947, + "grad_norm": 3.194797992706299, + "learning_rate": 9.126861784329511e-06, + "loss": 0.5762, + "step": 3985 + }, + { + "epoch": 1.1572071418202932, + "grad_norm": 3.214726686477661, + "learning_rate": 9.12631958832678e-06, + "loss": 0.7325, + "step": 3986 + }, + { + "epoch": 1.1574974597183916, + "grad_norm": 3.8333182334899902, + "learning_rate": 9.12577724014702e-06, + "loss": 0.7784, + "step": 3987 + }, + { + "epoch": 1.15778777761649, + "grad_norm": 3.346073865890503, + 
"learning_rate": 9.125234739810235e-06, + "loss": 0.6485, + "step": 3988 + }, + { + "epoch": 1.1580780955145884, + "grad_norm": 3.903724431991577, + "learning_rate": 9.12469208733643e-06, + "loss": 0.7638, + "step": 3989 + }, + { + "epoch": 1.1583684134126868, + "grad_norm": 3.9996793270111084, + "learning_rate": 9.124149282745614e-06, + "loss": 0.842, + "step": 3990 + }, + { + "epoch": 1.1586587313107852, + "grad_norm": 3.5065417289733887, + "learning_rate": 9.12360632605781e-06, + "loss": 0.7256, + "step": 3991 + }, + { + "epoch": 1.1589490492088836, + "grad_norm": 3.8333420753479004, + "learning_rate": 9.123063217293043e-06, + "loss": 0.7925, + "step": 3992 + }, + { + "epoch": 1.159239367106982, + "grad_norm": 3.6747794151306152, + "learning_rate": 9.12251995647134e-06, + "loss": 0.8747, + "step": 3993 + }, + { + "epoch": 1.1595296850050805, + "grad_norm": 3.530374050140381, + "learning_rate": 9.121976543612736e-06, + "loss": 0.7956, + "step": 3994 + }, + { + "epoch": 1.1598200029031789, + "grad_norm": 3.2479302883148193, + "learning_rate": 9.121432978737273e-06, + "loss": 0.7378, + "step": 3995 + }, + { + "epoch": 1.1601103208012775, + "grad_norm": 3.1474578380584717, + "learning_rate": 9.120889261864999e-06, + "loss": 0.6483, + "step": 3996 + }, + { + "epoch": 1.1604006386993757, + "grad_norm": 3.481990098953247, + "learning_rate": 9.120345393015962e-06, + "loss": 0.7954, + "step": 3997 + }, + { + "epoch": 1.1606909565974743, + "grad_norm": 3.3965940475463867, + "learning_rate": 9.119801372210224e-06, + "loss": 0.8142, + "step": 3998 + }, + { + "epoch": 1.1609812744955725, + "grad_norm": 3.3146190643310547, + "learning_rate": 9.119257199467846e-06, + "loss": 0.6746, + "step": 3999 + }, + { + "epoch": 1.1612715923936712, + "grad_norm": 3.1795899868011475, + "learning_rate": 9.118712874808897e-06, + "loss": 0.6696, + "step": 4000 + }, + { + "epoch": 1.1612715923936712, + "eval_loss": 1.1864620447158813, + "eval_runtime": 13.2004, + "eval_samples_per_second": 30.302, + "eval_steps_per_second": 3.788, + "step": 4000 + }, + { + "epoch": 1.1615619102917696, + "grad_norm": 3.368516683578491, + "learning_rate": 9.11816839825345e-06, + "loss": 0.799, + "step": 4001 + }, + { + "epoch": 1.161852228189868, + "grad_norm": 3.838491201400757, + "learning_rate": 9.117623769821588e-06, + "loss": 0.8574, + "step": 4002 + }, + { + "epoch": 1.1621425460879664, + "grad_norm": 3.6480486392974854, + "learning_rate": 9.117078989533394e-06, + "loss": 0.7749, + "step": 4003 + }, + { + "epoch": 1.1624328639860648, + "grad_norm": 3.585958480834961, + "learning_rate": 9.116534057408964e-06, + "loss": 0.7411, + "step": 4004 + }, + { + "epoch": 1.1627231818841632, + "grad_norm": 3.195746898651123, + "learning_rate": 9.115988973468387e-06, + "loss": 0.64, + "step": 4005 + }, + { + "epoch": 1.1630134997822617, + "grad_norm": 3.85469913482666, + "learning_rate": 9.115443737731775e-06, + "loss": 0.7704, + "step": 4006 + }, + { + "epoch": 1.16330381768036, + "grad_norm": 3.8025283813476562, + "learning_rate": 9.114898350219227e-06, + "loss": 0.775, + "step": 4007 + }, + { + "epoch": 1.1635941355784585, + "grad_norm": 3.44447660446167, + "learning_rate": 9.114352810950864e-06, + "loss": 0.7815, + "step": 4008 + }, + { + "epoch": 1.163884453476557, + "grad_norm": 3.455094575881958, + "learning_rate": 9.1138071199468e-06, + "loss": 0.7137, + "step": 4009 + }, + { + "epoch": 1.1641747713746553, + "grad_norm": 4.332744121551514, + "learning_rate": 9.113261277227163e-06, + "loss": 0.8485, + "step": 4010 + }, + { + "epoch": 
1.1644650892727537, + "grad_norm": 3.313493490219116, + "learning_rate": 9.112715282812081e-06, + "loss": 0.7353, + "step": 4011 + }, + { + "epoch": 1.1647554071708521, + "grad_norm": 3.056633472442627, + "learning_rate": 9.112169136721693e-06, + "loss": 0.7518, + "step": 4012 + }, + { + "epoch": 1.1650457250689505, + "grad_norm": 3.9191696643829346, + "learning_rate": 9.111622838976139e-06, + "loss": 0.8178, + "step": 4013 + }, + { + "epoch": 1.165336042967049, + "grad_norm": 3.181851387023926, + "learning_rate": 9.111076389595566e-06, + "loss": 0.6374, + "step": 4014 + }, + { + "epoch": 1.1656263608651474, + "grad_norm": 3.6287424564361572, + "learning_rate": 9.110529788600127e-06, + "loss": 0.8051, + "step": 4015 + }, + { + "epoch": 1.1659166787632458, + "grad_norm": 3.4764528274536133, + "learning_rate": 9.109983036009979e-06, + "loss": 0.6772, + "step": 4016 + }, + { + "epoch": 1.1662069966613442, + "grad_norm": 3.713264226913452, + "learning_rate": 9.109436131845291e-06, + "loss": 0.9324, + "step": 4017 + }, + { + "epoch": 1.1664973145594426, + "grad_norm": 3.6909563541412354, + "learning_rate": 9.108889076126226e-06, + "loss": 0.709, + "step": 4018 + }, + { + "epoch": 1.166787632457541, + "grad_norm": 3.3515591621398926, + "learning_rate": 9.108341868872966e-06, + "loss": 0.8808, + "step": 4019 + }, + { + "epoch": 1.1670779503556394, + "grad_norm": 3.6842029094696045, + "learning_rate": 9.107794510105685e-06, + "loss": 0.7281, + "step": 4020 + }, + { + "epoch": 1.1673682682537379, + "grad_norm": 3.2459568977355957, + "learning_rate": 9.107246999844573e-06, + "loss": 0.717, + "step": 4021 + }, + { + "epoch": 1.1676585861518363, + "grad_norm": 3.540125608444214, + "learning_rate": 9.106699338109824e-06, + "loss": 0.7114, + "step": 4022 + }, + { + "epoch": 1.1679489040499347, + "grad_norm": 3.283958911895752, + "learning_rate": 9.10615152492163e-06, + "loss": 0.8662, + "step": 4023 + }, + { + "epoch": 1.168239221948033, + "grad_norm": 2.9903454780578613, + "learning_rate": 9.105603560300199e-06, + "loss": 0.682, + "step": 4024 + }, + { + "epoch": 1.1685295398461315, + "grad_norm": 3.7494277954101562, + "learning_rate": 9.105055444265737e-06, + "loss": 0.8702, + "step": 4025 + }, + { + "epoch": 1.16881985774423, + "grad_norm": 3.8516342639923096, + "learning_rate": 9.10450717683846e-06, + "loss": 0.849, + "step": 4026 + }, + { + "epoch": 1.1691101756423283, + "grad_norm": 3.3459055423736572, + "learning_rate": 9.103958758038587e-06, + "loss": 0.7186, + "step": 4027 + }, + { + "epoch": 1.1694004935404267, + "grad_norm": 3.6910083293914795, + "learning_rate": 9.103410187886343e-06, + "loss": 0.7625, + "step": 4028 + }, + { + "epoch": 1.1696908114385252, + "grad_norm": 3.9832990169525146, + "learning_rate": 9.10286146640196e-06, + "loss": 0.8289, + "step": 4029 + }, + { + "epoch": 1.1699811293366236, + "grad_norm": 3.4876708984375, + "learning_rate": 9.102312593605675e-06, + "loss": 0.891, + "step": 4030 + }, + { + "epoch": 1.170271447234722, + "grad_norm": 3.19136643409729, + "learning_rate": 9.10176356951773e-06, + "loss": 0.726, + "step": 4031 + }, + { + "epoch": 1.1705617651328204, + "grad_norm": 4.043649673461914, + "learning_rate": 9.101214394158371e-06, + "loss": 0.7879, + "step": 4032 + }, + { + "epoch": 1.1708520830309188, + "grad_norm": 3.827148914337158, + "learning_rate": 9.100665067547854e-06, + "loss": 0.7717, + "step": 4033 + }, + { + "epoch": 1.1711424009290172, + "grad_norm": 3.3949193954467773, + "learning_rate": 9.100115589706436e-06, + "loss": 0.7799, + "step": 4034 + 
}, + { + "epoch": 1.1714327188271156, + "grad_norm": 3.4499807357788086, + "learning_rate": 9.09956596065438e-06, + "loss": 0.9016, + "step": 4035 + }, + { + "epoch": 1.171723036725214, + "grad_norm": 3.645195245742798, + "learning_rate": 9.09901618041196e-06, + "loss": 0.7207, + "step": 4036 + }, + { + "epoch": 1.1720133546233125, + "grad_norm": 3.701106071472168, + "learning_rate": 9.09846624899945e-06, + "loss": 0.7687, + "step": 4037 + }, + { + "epoch": 1.1723036725214109, + "grad_norm": 3.188385486602783, + "learning_rate": 9.097916166437131e-06, + "loss": 0.7065, + "step": 4038 + }, + { + "epoch": 1.1725939904195093, + "grad_norm": 4.226047992706299, + "learning_rate": 9.09736593274529e-06, + "loss": 0.9615, + "step": 4039 + }, + { + "epoch": 1.1728843083176077, + "grad_norm": 3.4825079441070557, + "learning_rate": 9.09681554794422e-06, + "loss": 0.7315, + "step": 4040 + }, + { + "epoch": 1.1731746262157061, + "grad_norm": 3.5694072246551514, + "learning_rate": 9.096265012054218e-06, + "loss": 0.7047, + "step": 4041 + }, + { + "epoch": 1.1734649441138045, + "grad_norm": 3.669870615005493, + "learning_rate": 9.095714325095587e-06, + "loss": 0.8166, + "step": 4042 + }, + { + "epoch": 1.173755262011903, + "grad_norm": 3.8622612953186035, + "learning_rate": 9.095163487088639e-06, + "loss": 0.8473, + "step": 4043 + }, + { + "epoch": 1.1740455799100014, + "grad_norm": 3.600687026977539, + "learning_rate": 9.094612498053684e-06, + "loss": 0.7861, + "step": 4044 + }, + { + "epoch": 1.1743358978081, + "grad_norm": 3.816171884536743, + "learning_rate": 9.094061358011047e-06, + "loss": 0.7794, + "step": 4045 + }, + { + "epoch": 1.1746262157061982, + "grad_norm": 3.986691474914551, + "learning_rate": 9.09351006698105e-06, + "loss": 0.8641, + "step": 4046 + }, + { + "epoch": 1.1749165336042968, + "grad_norm": 3.3331282138824463, + "learning_rate": 9.092958624984029e-06, + "loss": 0.7402, + "step": 4047 + }, + { + "epoch": 1.175206851502395, + "grad_norm": 3.6391406059265137, + "learning_rate": 9.092407032040316e-06, + "loss": 0.8001, + "step": 4048 + }, + { + "epoch": 1.1754971694004936, + "grad_norm": 3.1407461166381836, + "learning_rate": 9.091855288170257e-06, + "loss": 0.6524, + "step": 4049 + }, + { + "epoch": 1.1757874872985918, + "grad_norm": 3.806478977203369, + "learning_rate": 9.091303393394197e-06, + "loss": 0.858, + "step": 4050 + }, + { + "epoch": 1.1760778051966905, + "grad_norm": 3.330761194229126, + "learning_rate": 9.090751347732492e-06, + "loss": 0.6516, + "step": 4051 + }, + { + "epoch": 1.1763681230947889, + "grad_norm": 4.271059513092041, + "learning_rate": 9.090199151205502e-06, + "loss": 0.721, + "step": 4052 + }, + { + "epoch": 1.1766584409928873, + "grad_norm": 3.2130672931671143, + "learning_rate": 9.089646803833589e-06, + "loss": 0.7209, + "step": 4053 + }, + { + "epoch": 1.1769487588909857, + "grad_norm": 3.837550163269043, + "learning_rate": 9.089094305637125e-06, + "loss": 0.7907, + "step": 4054 + }, + { + "epoch": 1.1772390767890841, + "grad_norm": 3.642279863357544, + "learning_rate": 9.088541656636487e-06, + "loss": 0.7112, + "step": 4055 + }, + { + "epoch": 1.1775293946871825, + "grad_norm": 3.739576816558838, + "learning_rate": 9.087988856852054e-06, + "loss": 0.8681, + "step": 4056 + }, + { + "epoch": 1.177819712585281, + "grad_norm": 3.580559015274048, + "learning_rate": 9.087435906304214e-06, + "loss": 0.9132, + "step": 4057 + }, + { + "epoch": 1.1781100304833794, + "grad_norm": 3.414616584777832, + "learning_rate": 9.08688280501336e-06, + "loss": 0.715, + 
"step": 4058 + }, + { + "epoch": 1.1784003483814778, + "grad_norm": 3.5943245887756348, + "learning_rate": 9.08632955299989e-06, + "loss": 0.8107, + "step": 4059 + }, + { + "epoch": 1.1786906662795762, + "grad_norm": 3.487362861633301, + "learning_rate": 9.085776150284209e-06, + "loss": 0.7891, + "step": 4060 + }, + { + "epoch": 1.1789809841776746, + "grad_norm": 3.6045470237731934, + "learning_rate": 9.085222596886724e-06, + "loss": 0.7728, + "step": 4061 + }, + { + "epoch": 1.179271302075773, + "grad_norm": 3.904658079147339, + "learning_rate": 9.08466889282785e-06, + "loss": 0.8763, + "step": 4062 + }, + { + "epoch": 1.1795616199738714, + "grad_norm": 3.8356258869171143, + "learning_rate": 9.08411503812801e-06, + "loss": 0.8663, + "step": 4063 + }, + { + "epoch": 1.1798519378719698, + "grad_norm": 3.4146289825439453, + "learning_rate": 9.083561032807626e-06, + "loss": 0.7986, + "step": 4064 + }, + { + "epoch": 1.1801422557700683, + "grad_norm": 3.3314566612243652, + "learning_rate": 9.083006876887132e-06, + "loss": 0.8305, + "step": 4065 + }, + { + "epoch": 1.1804325736681667, + "grad_norm": 3.6700377464294434, + "learning_rate": 9.082452570386966e-06, + "loss": 0.8067, + "step": 4066 + }, + { + "epoch": 1.180722891566265, + "grad_norm": 3.317873954772949, + "learning_rate": 9.08189811332757e-06, + "loss": 0.7411, + "step": 4067 + }, + { + "epoch": 1.1810132094643635, + "grad_norm": 3.274186134338379, + "learning_rate": 9.08134350572939e-06, + "loss": 0.738, + "step": 4068 + }, + { + "epoch": 1.181303527362462, + "grad_norm": 3.3086955547332764, + "learning_rate": 9.08078874761288e-06, + "loss": 0.7723, + "step": 4069 + }, + { + "epoch": 1.1815938452605603, + "grad_norm": 3.8123908042907715, + "learning_rate": 9.080233838998503e-06, + "loss": 0.8489, + "step": 4070 + }, + { + "epoch": 1.1818841631586587, + "grad_norm": 3.341263771057129, + "learning_rate": 9.079678779906718e-06, + "loss": 0.7099, + "step": 4071 + }, + { + "epoch": 1.1821744810567572, + "grad_norm": 3.642395496368408, + "learning_rate": 9.079123570358e-06, + "loss": 0.6924, + "step": 4072 + }, + { + "epoch": 1.1824647989548556, + "grad_norm": 3.3351449966430664, + "learning_rate": 9.078568210372825e-06, + "loss": 0.7104, + "step": 4073 + }, + { + "epoch": 1.182755116852954, + "grad_norm": 3.6893389225006104, + "learning_rate": 9.078012699971673e-06, + "loss": 0.6957, + "step": 4074 + }, + { + "epoch": 1.1830454347510524, + "grad_norm": 3.6875810623168945, + "learning_rate": 9.077457039175028e-06, + "loss": 0.7803, + "step": 4075 + }, + { + "epoch": 1.1833357526491508, + "grad_norm": 3.341475248336792, + "learning_rate": 9.076901228003387e-06, + "loss": 0.8119, + "step": 4076 + }, + { + "epoch": 1.1836260705472492, + "grad_norm": 3.684300422668457, + "learning_rate": 9.076345266477247e-06, + "loss": 0.8527, + "step": 4077 + }, + { + "epoch": 1.1839163884453476, + "grad_norm": 3.4260594844818115, + "learning_rate": 9.075789154617112e-06, + "loss": 0.6892, + "step": 4078 + }, + { + "epoch": 1.184206706343446, + "grad_norm": 3.6972508430480957, + "learning_rate": 9.075232892443488e-06, + "loss": 0.6416, + "step": 4079 + }, + { + "epoch": 1.1844970242415445, + "grad_norm": 3.9194812774658203, + "learning_rate": 9.074676479976894e-06, + "loss": 0.8281, + "step": 4080 + }, + { + "epoch": 1.1847873421396429, + "grad_norm": 3.2946715354919434, + "learning_rate": 9.074119917237849e-06, + "loss": 0.7115, + "step": 4081 + }, + { + "epoch": 1.1850776600377413, + "grad_norm": 3.7364883422851562, + "learning_rate": 
9.073563204246877e-06, + "loss": 0.6713, + "step": 4082 + }, + { + "epoch": 1.1853679779358397, + "grad_norm": 3.7229502201080322, + "learning_rate": 9.07300634102451e-06, + "loss": 0.7803, + "step": 4083 + }, + { + "epoch": 1.1856582958339381, + "grad_norm": 3.2690937519073486, + "learning_rate": 9.072449327591285e-06, + "loss": 0.6948, + "step": 4084 + }, + { + "epoch": 1.1859486137320365, + "grad_norm": 3.791633367538452, + "learning_rate": 9.071892163967749e-06, + "loss": 0.863, + "step": 4085 + }, + { + "epoch": 1.186238931630135, + "grad_norm": 3.2623965740203857, + "learning_rate": 9.071334850174442e-06, + "loss": 0.6323, + "step": 4086 + }, + { + "epoch": 1.1865292495282334, + "grad_norm": 3.938901901245117, + "learning_rate": 9.070777386231921e-06, + "loss": 0.8053, + "step": 4087 + }, + { + "epoch": 1.1868195674263318, + "grad_norm": 3.3571414947509766, + "learning_rate": 9.070219772160748e-06, + "loss": 0.7462, + "step": 4088 + }, + { + "epoch": 1.1871098853244302, + "grad_norm": 3.772347927093506, + "learning_rate": 9.069662007981483e-06, + "loss": 0.8494, + "step": 4089 + }, + { + "epoch": 1.1874002032225286, + "grad_norm": 3.5584139823913574, + "learning_rate": 9.0691040937147e-06, + "loss": 0.7863, + "step": 4090 + }, + { + "epoch": 1.187690521120627, + "grad_norm": 3.906470775604248, + "learning_rate": 9.068546029380971e-06, + "loss": 0.8599, + "step": 4091 + }, + { + "epoch": 1.1879808390187254, + "grad_norm": 3.395383834838867, + "learning_rate": 9.06798781500088e-06, + "loss": 0.7968, + "step": 4092 + }, + { + "epoch": 1.1882711569168238, + "grad_norm": 3.3741462230682373, + "learning_rate": 9.067429450595014e-06, + "loss": 0.7056, + "step": 4093 + }, + { + "epoch": 1.1885614748149222, + "grad_norm": 3.172368049621582, + "learning_rate": 9.066870936183962e-06, + "loss": 0.7439, + "step": 4094 + }, + { + "epoch": 1.1888517927130207, + "grad_norm": 3.850167751312256, + "learning_rate": 9.066312271788323e-06, + "loss": 0.8851, + "step": 4095 + }, + { + "epoch": 1.1891421106111193, + "grad_norm": 3.6464662551879883, + "learning_rate": 9.065753457428703e-06, + "loss": 0.7846, + "step": 4096 + }, + { + "epoch": 1.1894324285092175, + "grad_norm": 4.118659973144531, + "learning_rate": 9.065194493125708e-06, + "loss": 0.9087, + "step": 4097 + }, + { + "epoch": 1.1897227464073161, + "grad_norm": 3.62093448638916, + "learning_rate": 9.064635378899954e-06, + "loss": 0.8598, + "step": 4098 + }, + { + "epoch": 1.1900130643054143, + "grad_norm": 3.810291051864624, + "learning_rate": 9.06407611477206e-06, + "loss": 0.8275, + "step": 4099 + }, + { + "epoch": 1.190303382203513, + "grad_norm": 3.34863018989563, + "learning_rate": 9.06351670076265e-06, + "loss": 0.6317, + "step": 4100 + }, + { + "epoch": 1.1905937001016111, + "grad_norm": 3.578842878341675, + "learning_rate": 9.06295713689236e-06, + "loss": 0.7453, + "step": 4101 + }, + { + "epoch": 1.1908840179997098, + "grad_norm": 3.7192788124084473, + "learning_rate": 9.06239742318182e-06, + "loss": 0.8762, + "step": 4102 + }, + { + "epoch": 1.1911743358978082, + "grad_norm": 3.813288450241089, + "learning_rate": 9.061837559651676e-06, + "loss": 0.8466, + "step": 4103 + }, + { + "epoch": 1.1914646537959066, + "grad_norm": 3.4084653854370117, + "learning_rate": 9.061277546322576e-06, + "loss": 0.8022, + "step": 4104 + }, + { + "epoch": 1.191754971694005, + "grad_norm": 3.501131057739258, + "learning_rate": 9.060717383215169e-06, + "loss": 0.7563, + "step": 4105 + }, + { + "epoch": 1.1920452895921034, + "grad_norm": 3.5633366107940674, + 
"learning_rate": 9.060157070350119e-06, + "loss": 0.8084, + "step": 4106 + }, + { + "epoch": 1.1923356074902018, + "grad_norm": 3.551622152328491, + "learning_rate": 9.059596607748087e-06, + "loss": 0.7899, + "step": 4107 + }, + { + "epoch": 1.1926259253883003, + "grad_norm": 3.22371244430542, + "learning_rate": 9.059035995429743e-06, + "loss": 0.6764, + "step": 4108 + }, + { + "epoch": 1.1929162432863987, + "grad_norm": 3.7044527530670166, + "learning_rate": 9.058475233415763e-06, + "loss": 0.9281, + "step": 4109 + }, + { + "epoch": 1.193206561184497, + "grad_norm": 3.9586267471313477, + "learning_rate": 9.057914321726824e-06, + "loss": 0.9419, + "step": 4110 + }, + { + "epoch": 1.1934968790825955, + "grad_norm": 2.908240556716919, + "learning_rate": 9.057353260383617e-06, + "loss": 0.8072, + "step": 4111 + }, + { + "epoch": 1.193787196980694, + "grad_norm": 3.484663724899292, + "learning_rate": 9.056792049406833e-06, + "loss": 0.804, + "step": 4112 + }, + { + "epoch": 1.1940775148787923, + "grad_norm": 3.4705088138580322, + "learning_rate": 9.056230688817168e-06, + "loss": 0.7696, + "step": 4113 + }, + { + "epoch": 1.1943678327768907, + "grad_norm": 3.4565269947052, + "learning_rate": 9.055669178635322e-06, + "loss": 0.7479, + "step": 4114 + }, + { + "epoch": 1.1946581506749891, + "grad_norm": 3.331815719604492, + "learning_rate": 9.055107518882009e-06, + "loss": 0.6769, + "step": 4115 + }, + { + "epoch": 1.1949484685730876, + "grad_norm": 3.844775438308716, + "learning_rate": 9.054545709577939e-06, + "loss": 0.9125, + "step": 4116 + }, + { + "epoch": 1.195238786471186, + "grad_norm": 3.518406867980957, + "learning_rate": 9.053983750743831e-06, + "loss": 0.7155, + "step": 4117 + }, + { + "epoch": 1.1955291043692844, + "grad_norm": 3.5197741985321045, + "learning_rate": 9.053421642400414e-06, + "loss": 0.786, + "step": 4118 + }, + { + "epoch": 1.1958194222673828, + "grad_norm": 3.6934590339660645, + "learning_rate": 9.052859384568414e-06, + "loss": 0.778, + "step": 4119 + }, + { + "epoch": 1.1961097401654812, + "grad_norm": 3.5394248962402344, + "learning_rate": 9.052296977268566e-06, + "loss": 0.755, + "step": 4120 + }, + { + "epoch": 1.1964000580635796, + "grad_norm": 3.7590219974517822, + "learning_rate": 9.051734420521616e-06, + "loss": 0.8084, + "step": 4121 + }, + { + "epoch": 1.196690375961678, + "grad_norm": 3.022731304168701, + "learning_rate": 9.051171714348309e-06, + "loss": 0.7038, + "step": 4122 + }, + { + "epoch": 1.1969806938597765, + "grad_norm": 3.880645990371704, + "learning_rate": 9.050608858769395e-06, + "loss": 0.7077, + "step": 4123 + }, + { + "epoch": 1.1972710117578749, + "grad_norm": 3.356694459915161, + "learning_rate": 9.050045853805634e-06, + "loss": 0.7646, + "step": 4124 + }, + { + "epoch": 1.1975613296559733, + "grad_norm": 3.812464714050293, + "learning_rate": 9.04948269947779e-06, + "loss": 0.8239, + "step": 4125 + }, + { + "epoch": 1.1978516475540717, + "grad_norm": 3.7726550102233887, + "learning_rate": 9.04891939580663e-06, + "loss": 0.8597, + "step": 4126 + }, + { + "epoch": 1.19814196545217, + "grad_norm": 3.775982141494751, + "learning_rate": 9.048355942812929e-06, + "loss": 0.797, + "step": 4127 + }, + { + "epoch": 1.1984322833502685, + "grad_norm": 3.6224353313446045, + "learning_rate": 9.04779234051747e-06, + "loss": 0.676, + "step": 4128 + }, + { + "epoch": 1.198722601248367, + "grad_norm": 3.9695451259613037, + "learning_rate": 9.047228588941034e-06, + "loss": 0.8476, + "step": 4129 + }, + { + "epoch": 1.1990129191464654, + "grad_norm": 
3.48233962059021, + "learning_rate": 9.046664688104414e-06, + "loss": 0.7039, + "step": 4130 + }, + { + "epoch": 1.1993032370445638, + "grad_norm": 3.5250630378723145, + "learning_rate": 9.046100638028406e-06, + "loss": 0.7195, + "step": 4131 + }, + { + "epoch": 1.1995935549426622, + "grad_norm": 4.188467502593994, + "learning_rate": 9.045536438733814e-06, + "loss": 0.8922, + "step": 4132 + }, + { + "epoch": 1.1998838728407606, + "grad_norm": 3.3059566020965576, + "learning_rate": 9.044972090241439e-06, + "loss": 0.791, + "step": 4133 + }, + { + "epoch": 1.200174190738859, + "grad_norm": 3.44315505027771, + "learning_rate": 9.044407592572102e-06, + "loss": 0.7476, + "step": 4134 + }, + { + "epoch": 1.2004645086369574, + "grad_norm": 3.908571481704712, + "learning_rate": 9.043842945746617e-06, + "loss": 0.8055, + "step": 4135 + }, + { + "epoch": 1.2007548265350558, + "grad_norm": 3.499602794647217, + "learning_rate": 9.04327814978581e-06, + "loss": 0.7689, + "step": 4136 + }, + { + "epoch": 1.2010451444331542, + "grad_norm": 3.504218578338623, + "learning_rate": 9.042713204710509e-06, + "loss": 0.7161, + "step": 4137 + }, + { + "epoch": 1.2013354623312527, + "grad_norm": 3.1022610664367676, + "learning_rate": 9.04214811054155e-06, + "loss": 0.7635, + "step": 4138 + }, + { + "epoch": 1.201625780229351, + "grad_norm": 3.5882506370544434, + "learning_rate": 9.04158286729977e-06, + "loss": 0.7621, + "step": 4139 + }, + { + "epoch": 1.2019160981274495, + "grad_norm": 3.5278327465057373, + "learning_rate": 9.04101747500602e-06, + "loss": 0.7782, + "step": 4140 + }, + { + "epoch": 1.202206416025548, + "grad_norm": 3.5033469200134277, + "learning_rate": 9.040451933681148e-06, + "loss": 0.7269, + "step": 4141 + }, + { + "epoch": 1.2024967339236463, + "grad_norm": 3.472656488418579, + "learning_rate": 9.039886243346013e-06, + "loss": 0.7632, + "step": 4142 + }, + { + "epoch": 1.2027870518217447, + "grad_norm": 3.2979049682617188, + "learning_rate": 9.039320404021475e-06, + "loss": 0.765, + "step": 4143 + }, + { + "epoch": 1.2030773697198431, + "grad_norm": 3.6671695709228516, + "learning_rate": 9.038754415728405e-06, + "loss": 0.6898, + "step": 4144 + }, + { + "epoch": 1.2033676876179416, + "grad_norm": 3.387666940689087, + "learning_rate": 9.038188278487673e-06, + "loss": 0.662, + "step": 4145 + }, + { + "epoch": 1.20365800551604, + "grad_norm": 3.3943850994110107, + "learning_rate": 9.037621992320162e-06, + "loss": 0.7152, + "step": 4146 + }, + { + "epoch": 1.2039483234141386, + "grad_norm": 3.2745096683502197, + "learning_rate": 9.037055557246754e-06, + "loss": 0.7477, + "step": 4147 + }, + { + "epoch": 1.2042386413122368, + "grad_norm": 3.368821859359741, + "learning_rate": 9.036488973288339e-06, + "loss": 0.7086, + "step": 4148 + }, + { + "epoch": 1.2045289592103354, + "grad_norm": 3.569892644882202, + "learning_rate": 9.035922240465813e-06, + "loss": 0.8061, + "step": 4149 + }, + { + "epoch": 1.2048192771084336, + "grad_norm": 4.035867214202881, + "learning_rate": 9.035355358800073e-06, + "loss": 0.8411, + "step": 4150 + }, + { + "epoch": 1.2051095950065323, + "grad_norm": 3.9796719551086426, + "learning_rate": 9.034788328312031e-06, + "loss": 0.8424, + "step": 4151 + }, + { + "epoch": 1.2053999129046307, + "grad_norm": 3.9051156044006348, + "learning_rate": 9.034221149022599e-06, + "loss": 0.8068, + "step": 4152 + }, + { + "epoch": 1.205690230802729, + "grad_norm": 3.869713068008423, + "learning_rate": 9.033653820952689e-06, + "loss": 0.8491, + "step": 4153 + }, + { + "epoch": 
1.2059805487008275, + "grad_norm": 2.9886488914489746, + "learning_rate": 9.033086344123227e-06, + "loss": 0.7795, + "step": 4154 + }, + { + "epoch": 1.206270866598926, + "grad_norm": 4.163388252258301, + "learning_rate": 9.032518718555142e-06, + "loss": 0.8913, + "step": 4155 + }, + { + "epoch": 1.2065611844970243, + "grad_norm": 3.384000539779663, + "learning_rate": 9.031950944269366e-06, + "loss": 0.8076, + "step": 4156 + }, + { + "epoch": 1.2068515023951227, + "grad_norm": 4.030092239379883, + "learning_rate": 9.03138302128684e-06, + "loss": 0.8349, + "step": 4157 + }, + { + "epoch": 1.2071418202932211, + "grad_norm": 3.787898540496826, + "learning_rate": 9.030814949628509e-06, + "loss": 0.7586, + "step": 4158 + }, + { + "epoch": 1.2074321381913196, + "grad_norm": 3.355987787246704, + "learning_rate": 9.03024672931532e-06, + "loss": 0.7544, + "step": 4159 + }, + { + "epoch": 1.207722456089418, + "grad_norm": 3.9991297721862793, + "learning_rate": 9.029678360368232e-06, + "loss": 0.7545, + "step": 4160 + }, + { + "epoch": 1.2080127739875164, + "grad_norm": 3.7311341762542725, + "learning_rate": 9.029109842808205e-06, + "loss": 0.7447, + "step": 4161 + }, + { + "epoch": 1.2083030918856148, + "grad_norm": 4.173926830291748, + "learning_rate": 9.028541176656206e-06, + "loss": 0.9467, + "step": 4162 + }, + { + "epoch": 1.2085934097837132, + "grad_norm": 3.6992671489715576, + "learning_rate": 9.027972361933206e-06, + "loss": 0.7205, + "step": 4163 + }, + { + "epoch": 1.2088837276818116, + "grad_norm": 3.7675483226776123, + "learning_rate": 9.027403398660186e-06, + "loss": 0.8685, + "step": 4164 + }, + { + "epoch": 1.20917404557991, + "grad_norm": 3.525923490524292, + "learning_rate": 9.026834286858125e-06, + "loss": 0.8266, + "step": 4165 + }, + { + "epoch": 1.2094643634780085, + "grad_norm": 3.47044038772583, + "learning_rate": 9.026265026548016e-06, + "loss": 0.8065, + "step": 4166 + }, + { + "epoch": 1.2097546813761069, + "grad_norm": 3.7477779388427734, + "learning_rate": 9.025695617750848e-06, + "loss": 0.7428, + "step": 4167 + }, + { + "epoch": 1.2100449992742053, + "grad_norm": 3.2594008445739746, + "learning_rate": 9.025126060487623e-06, + "loss": 0.7125, + "step": 4168 + }, + { + "epoch": 1.2103353171723037, + "grad_norm": 3.4195213317871094, + "learning_rate": 9.024556354779348e-06, + "loss": 0.8543, + "step": 4169 + }, + { + "epoch": 1.210625635070402, + "grad_norm": 2.9705264568328857, + "learning_rate": 9.02398650064703e-06, + "loss": 0.6412, + "step": 4170 + }, + { + "epoch": 1.2109159529685005, + "grad_norm": 3.3002724647521973, + "learning_rate": 9.023416498111688e-06, + "loss": 0.7906, + "step": 4171 + }, + { + "epoch": 1.211206270866599, + "grad_norm": 3.0194554328918457, + "learning_rate": 9.022846347194343e-06, + "loss": 0.7628, + "step": 4172 + }, + { + "epoch": 1.2114965887646973, + "grad_norm": 3.412965774536133, + "learning_rate": 9.02227604791602e-06, + "loss": 0.7688, + "step": 4173 + }, + { + "epoch": 1.2117869066627958, + "grad_norm": 3.7909467220306396, + "learning_rate": 9.021705600297753e-06, + "loss": 0.8916, + "step": 4174 + }, + { + "epoch": 1.2120772245608942, + "grad_norm": 3.2401669025421143, + "learning_rate": 9.021135004360578e-06, + "loss": 0.6957, + "step": 4175 + }, + { + "epoch": 1.2123675424589926, + "grad_norm": 3.907761812210083, + "learning_rate": 9.020564260125542e-06, + "loss": 0.8673, + "step": 4176 + }, + { + "epoch": 1.212657860357091, + "grad_norm": 3.2626876831054688, + "learning_rate": 9.019993367613689e-06, + "loss": 0.7596, + "step": 
4177 + }, + { + "epoch": 1.2129481782551894, + "grad_norm": 3.8206748962402344, + "learning_rate": 9.019422326846078e-06, + "loss": 0.8473, + "step": 4178 + }, + { + "epoch": 1.2132384961532878, + "grad_norm": 3.7625372409820557, + "learning_rate": 9.018851137843765e-06, + "loss": 0.8529, + "step": 4179 + }, + { + "epoch": 1.2135288140513862, + "grad_norm": 3.553237199783325, + "learning_rate": 9.018279800627818e-06, + "loss": 0.8849, + "step": 4180 + }, + { + "epoch": 1.2138191319494847, + "grad_norm": 3.6299870014190674, + "learning_rate": 9.017708315219307e-06, + "loss": 0.7347, + "step": 4181 + }, + { + "epoch": 1.214109449847583, + "grad_norm": 3.9615767002105713, + "learning_rate": 9.017136681639307e-06, + "loss": 0.8044, + "step": 4182 + }, + { + "epoch": 1.2143997677456815, + "grad_norm": 3.804377555847168, + "learning_rate": 9.0165648999089e-06, + "loss": 0.7135, + "step": 4183 + }, + { + "epoch": 1.21469008564378, + "grad_norm": 3.876023054122925, + "learning_rate": 9.015992970049175e-06, + "loss": 0.8958, + "step": 4184 + }, + { + "epoch": 1.2149804035418783, + "grad_norm": 3.5934906005859375, + "learning_rate": 9.015420892081222e-06, + "loss": 0.7761, + "step": 4185 + }, + { + "epoch": 1.2152707214399767, + "grad_norm": 3.36338210105896, + "learning_rate": 9.014848666026138e-06, + "loss": 0.722, + "step": 4186 + }, + { + "epoch": 1.2155610393380751, + "grad_norm": 3.8048529624938965, + "learning_rate": 9.01427629190503e-06, + "loss": 0.8724, + "step": 4187 + }, + { + "epoch": 1.2158513572361735, + "grad_norm": 3.8319287300109863, + "learning_rate": 9.013703769739007e-06, + "loss": 0.8544, + "step": 4188 + }, + { + "epoch": 1.216141675134272, + "grad_norm": 3.9430227279663086, + "learning_rate": 9.01313109954918e-06, + "loss": 0.7627, + "step": 4189 + }, + { + "epoch": 1.2164319930323704, + "grad_norm": 3.7642529010772705, + "learning_rate": 9.01255828135667e-06, + "loss": 0.7264, + "step": 4190 + }, + { + "epoch": 1.2167223109304688, + "grad_norm": 3.522141933441162, + "learning_rate": 9.011985315182605e-06, + "loss": 0.8301, + "step": 4191 + }, + { + "epoch": 1.2170126288285672, + "grad_norm": 3.0998566150665283, + "learning_rate": 9.011412201048113e-06, + "loss": 0.7483, + "step": 4192 + }, + { + "epoch": 1.2173029467266656, + "grad_norm": 3.6285431385040283, + "learning_rate": 9.010838938974329e-06, + "loss": 0.7769, + "step": 4193 + }, + { + "epoch": 1.217593264624764, + "grad_norm": 4.2689337730407715, + "learning_rate": 9.010265528982398e-06, + "loss": 0.9484, + "step": 4194 + }, + { + "epoch": 1.2178835825228624, + "grad_norm": 3.3270440101623535, + "learning_rate": 9.009691971093467e-06, + "loss": 0.8008, + "step": 4195 + }, + { + "epoch": 1.218173900420961, + "grad_norm": 3.4125139713287354, + "learning_rate": 9.009118265328684e-06, + "loss": 0.7329, + "step": 4196 + }, + { + "epoch": 1.2184642183190593, + "grad_norm": 3.2748773097991943, + "learning_rate": 9.008544411709214e-06, + "loss": 0.69, + "step": 4197 + }, + { + "epoch": 1.218754536217158, + "grad_norm": 3.5631113052368164, + "learning_rate": 9.007970410256216e-06, + "loss": 0.7348, + "step": 4198 + }, + { + "epoch": 1.219044854115256, + "grad_norm": 3.6760542392730713, + "learning_rate": 9.007396260990857e-06, + "loss": 0.8198, + "step": 4199 + }, + { + "epoch": 1.2193351720133547, + "grad_norm": 3.3203012943267822, + "learning_rate": 9.006821963934316e-06, + "loss": 0.7226, + "step": 4200 + }, + { + "epoch": 1.219625489911453, + "grad_norm": 4.029517650604248, + "learning_rate": 9.006247519107771e-06, + 
"loss": 0.7686, + "step": 4201 + }, + { + "epoch": 1.2199158078095516, + "grad_norm": 4.306983470916748, + "learning_rate": 9.005672926532408e-06, + "loss": 0.8475, + "step": 4202 + }, + { + "epoch": 1.22020612570765, + "grad_norm": 3.5306789875030518, + "learning_rate": 9.005098186229417e-06, + "loss": 0.7178, + "step": 4203 + }, + { + "epoch": 1.2204964436057484, + "grad_norm": 3.456655502319336, + "learning_rate": 9.004523298219993e-06, + "loss": 0.7594, + "step": 4204 + }, + { + "epoch": 1.2207867615038468, + "grad_norm": 3.8073463439941406, + "learning_rate": 9.003948262525341e-06, + "loss": 0.82, + "step": 4205 + }, + { + "epoch": 1.2210770794019452, + "grad_norm": 3.5894739627838135, + "learning_rate": 9.003373079166664e-06, + "loss": 0.7883, + "step": 4206 + }, + { + "epoch": 1.2213673973000436, + "grad_norm": 3.461728572845459, + "learning_rate": 9.002797748165178e-06, + "loss": 0.8509, + "step": 4207 + }, + { + "epoch": 1.221657715198142, + "grad_norm": 3.460731267929077, + "learning_rate": 9.002222269542098e-06, + "loss": 0.8584, + "step": 4208 + }, + { + "epoch": 1.2219480330962404, + "grad_norm": 3.5668509006500244, + "learning_rate": 9.00164664331865e-06, + "loss": 0.8295, + "step": 4209 + }, + { + "epoch": 1.2222383509943389, + "grad_norm": 3.156965970993042, + "learning_rate": 9.001070869516062e-06, + "loss": 0.7822, + "step": 4210 + }, + { + "epoch": 1.2225286688924373, + "grad_norm": 3.166682720184326, + "learning_rate": 9.000494948155567e-06, + "loss": 0.7692, + "step": 4211 + }, + { + "epoch": 1.2228189867905357, + "grad_norm": 3.3912453651428223, + "learning_rate": 8.999918879258406e-06, + "loss": 0.7951, + "step": 4212 + }, + { + "epoch": 1.223109304688634, + "grad_norm": 3.546839952468872, + "learning_rate": 8.999342662845826e-06, + "loss": 0.7712, + "step": 4213 + }, + { + "epoch": 1.2233996225867325, + "grad_norm": 3.8041069507598877, + "learning_rate": 8.998766298939074e-06, + "loss": 0.8666, + "step": 4214 + }, + { + "epoch": 1.223689940484831, + "grad_norm": 3.5458247661590576, + "learning_rate": 8.998189787559408e-06, + "loss": 0.8102, + "step": 4215 + }, + { + "epoch": 1.2239802583829293, + "grad_norm": 3.452237367630005, + "learning_rate": 8.997613128728089e-06, + "loss": 0.7241, + "step": 4216 + }, + { + "epoch": 1.2242705762810278, + "grad_norm": 3.775862216949463, + "learning_rate": 8.997036322466385e-06, + "loss": 0.7433, + "step": 4217 + }, + { + "epoch": 1.2245608941791262, + "grad_norm": 3.6754865646362305, + "learning_rate": 8.996459368795567e-06, + "loss": 0.8025, + "step": 4218 + }, + { + "epoch": 1.2248512120772246, + "grad_norm": 3.375824213027954, + "learning_rate": 8.995882267736913e-06, + "loss": 0.7066, + "step": 4219 + }, + { + "epoch": 1.225141529975323, + "grad_norm": 3.4623117446899414, + "learning_rate": 8.995305019311708e-06, + "loss": 0.785, + "step": 4220 + }, + { + "epoch": 1.2254318478734214, + "grad_norm": 3.7280542850494385, + "learning_rate": 8.994727623541237e-06, + "loss": 0.7869, + "step": 4221 + }, + { + "epoch": 1.2257221657715198, + "grad_norm": 4.037339210510254, + "learning_rate": 8.9941500804468e-06, + "loss": 0.8466, + "step": 4222 + }, + { + "epoch": 1.2260124836696182, + "grad_norm": 3.8792598247528076, + "learning_rate": 8.99357239004969e-06, + "loss": 0.9094, + "step": 4223 + }, + { + "epoch": 1.2263028015677166, + "grad_norm": 3.7027788162231445, + "learning_rate": 8.992994552371217e-06, + "loss": 0.7475, + "step": 4224 + }, + { + "epoch": 1.226593119465815, + "grad_norm": 3.8787484169006348, + "learning_rate": 
8.992416567432688e-06, + "loss": 0.9464, + "step": 4225 + }, + { + "epoch": 1.2268834373639135, + "grad_norm": 3.166562080383301, + "learning_rate": 8.991838435255422e-06, + "loss": 0.762, + "step": 4226 + }, + { + "epoch": 1.227173755262012, + "grad_norm": 3.317545175552368, + "learning_rate": 8.991260155860737e-06, + "loss": 0.6764, + "step": 4227 + }, + { + "epoch": 1.2274640731601103, + "grad_norm": 3.3221254348754883, + "learning_rate": 8.990681729269962e-06, + "loss": 0.8601, + "step": 4228 + }, + { + "epoch": 1.2277543910582087, + "grad_norm": 3.914020299911499, + "learning_rate": 8.990103155504428e-06, + "loss": 0.8584, + "step": 4229 + }, + { + "epoch": 1.2280447089563071, + "grad_norm": 3.6654372215270996, + "learning_rate": 8.989524434585473e-06, + "loss": 0.7289, + "step": 4230 + }, + { + "epoch": 1.2283350268544055, + "grad_norm": 3.4380693435668945, + "learning_rate": 8.988945566534442e-06, + "loss": 0.7692, + "step": 4231 + }, + { + "epoch": 1.228625344752504, + "grad_norm": 3.8467538356781006, + "learning_rate": 8.98836655137268e-06, + "loss": 0.9227, + "step": 4232 + }, + { + "epoch": 1.2289156626506024, + "grad_norm": 3.577817916870117, + "learning_rate": 8.987787389121542e-06, + "loss": 0.7317, + "step": 4233 + }, + { + "epoch": 1.2292059805487008, + "grad_norm": 3.5391640663146973, + "learning_rate": 8.987208079802387e-06, + "loss": 0.7497, + "step": 4234 + }, + { + "epoch": 1.2294962984467992, + "grad_norm": 3.71026611328125, + "learning_rate": 8.986628623436583e-06, + "loss": 0.7541, + "step": 4235 + }, + { + "epoch": 1.2297866163448976, + "grad_norm": 3.2825422286987305, + "learning_rate": 8.986049020045495e-06, + "loss": 0.8143, + "step": 4236 + }, + { + "epoch": 1.230076934242996, + "grad_norm": 3.931927442550659, + "learning_rate": 8.9854692696505e-06, + "loss": 0.8363, + "step": 4237 + }, + { + "epoch": 1.2303672521410944, + "grad_norm": 3.6304123401641846, + "learning_rate": 8.984889372272982e-06, + "loss": 0.7422, + "step": 4238 + }, + { + "epoch": 1.2306575700391928, + "grad_norm": 3.913593053817749, + "learning_rate": 8.984309327934326e-06, + "loss": 0.7626, + "step": 4239 + }, + { + "epoch": 1.2309478879372913, + "grad_norm": 3.2616569995880127, + "learning_rate": 8.983729136655921e-06, + "loss": 0.6163, + "step": 4240 + }, + { + "epoch": 1.2312382058353897, + "grad_norm": 4.207817554473877, + "learning_rate": 8.983148798459167e-06, + "loss": 0.8562, + "step": 4241 + }, + { + "epoch": 1.231528523733488, + "grad_norm": 3.02081561088562, + "learning_rate": 8.982568313365467e-06, + "loss": 0.6839, + "step": 4242 + }, + { + "epoch": 1.2318188416315865, + "grad_norm": 3.8226892948150635, + "learning_rate": 8.981987681396226e-06, + "loss": 0.8784, + "step": 4243 + }, + { + "epoch": 1.232109159529685, + "grad_norm": 3.748441696166992, + "learning_rate": 8.981406902572862e-06, + "loss": 0.8386, + "step": 4244 + }, + { + "epoch": 1.2323994774277833, + "grad_norm": 3.492546319961548, + "learning_rate": 8.98082597691679e-06, + "loss": 0.7597, + "step": 4245 + }, + { + "epoch": 1.2326897953258817, + "grad_norm": 3.4718661308288574, + "learning_rate": 8.980244904449436e-06, + "loss": 0.7796, + "step": 4246 + }, + { + "epoch": 1.2329801132239804, + "grad_norm": 3.1242318153381348, + "learning_rate": 8.97966368519223e-06, + "loss": 0.5742, + "step": 4247 + }, + { + "epoch": 1.2332704311220786, + "grad_norm": 3.907931327819824, + "learning_rate": 8.979082319166605e-06, + "loss": 0.8138, + "step": 4248 + }, + { + "epoch": 1.2335607490201772, + "grad_norm": 3.067992925643921, 
+ "learning_rate": 8.978500806394004e-06, + "loss": 0.6971, + "step": 4249 + }, + { + "epoch": 1.2338510669182754, + "grad_norm": 3.232266664505005, + "learning_rate": 8.977919146895872e-06, + "loss": 0.7405, + "step": 4250 + }, + { + "epoch": 1.234141384816374, + "grad_norm": 3.50213623046875, + "learning_rate": 8.977337340693662e-06, + "loss": 0.686, + "step": 4251 + }, + { + "epoch": 1.2344317027144722, + "grad_norm": 3.8020687103271484, + "learning_rate": 8.976755387808826e-06, + "loss": 0.7404, + "step": 4252 + }, + { + "epoch": 1.2347220206125709, + "grad_norm": 3.3541903495788574, + "learning_rate": 8.976173288262832e-06, + "loss": 0.7247, + "step": 4253 + }, + { + "epoch": 1.2350123385106693, + "grad_norm": 3.84443736076355, + "learning_rate": 8.975591042077144e-06, + "loss": 0.8052, + "step": 4254 + }, + { + "epoch": 1.2353026564087677, + "grad_norm": 3.4659833908081055, + "learning_rate": 8.975008649273238e-06, + "loss": 0.7656, + "step": 4255 + }, + { + "epoch": 1.235592974306866, + "grad_norm": 3.320693254470825, + "learning_rate": 8.974426109872587e-06, + "loss": 0.6717, + "step": 4256 + }, + { + "epoch": 1.2358832922049645, + "grad_norm": 3.577528953552246, + "learning_rate": 8.97384342389668e-06, + "loss": 0.7556, + "step": 4257 + }, + { + "epoch": 1.236173610103063, + "grad_norm": 3.8595802783966064, + "learning_rate": 8.973260591367006e-06, + "loss": 0.8209, + "step": 4258 + }, + { + "epoch": 1.2364639280011613, + "grad_norm": 3.4095239639282227, + "learning_rate": 8.972677612305056e-06, + "loss": 0.733, + "step": 4259 + }, + { + "epoch": 1.2367542458992598, + "grad_norm": 3.280168294906616, + "learning_rate": 8.972094486732332e-06, + "loss": 0.6605, + "step": 4260 + }, + { + "epoch": 1.2370445637973582, + "grad_norm": 2.979154586791992, + "learning_rate": 8.971511214670342e-06, + "loss": 0.6957, + "step": 4261 + }, + { + "epoch": 1.2373348816954566, + "grad_norm": 3.2444956302642822, + "learning_rate": 8.970927796140592e-06, + "loss": 0.8197, + "step": 4262 + }, + { + "epoch": 1.237625199593555, + "grad_norm": 3.193018913269043, + "learning_rate": 8.970344231164602e-06, + "loss": 0.7737, + "step": 4263 + }, + { + "epoch": 1.2379155174916534, + "grad_norm": 3.533512830734253, + "learning_rate": 8.969760519763891e-06, + "loss": 0.8184, + "step": 4264 + }, + { + "epoch": 1.2382058353897518, + "grad_norm": 3.282985210418701, + "learning_rate": 8.969176661959989e-06, + "loss": 0.7852, + "step": 4265 + }, + { + "epoch": 1.2384961532878502, + "grad_norm": 3.325979471206665, + "learning_rate": 8.968592657774427e-06, + "loss": 0.7307, + "step": 4266 + }, + { + "epoch": 1.2387864711859486, + "grad_norm": 3.227482318878174, + "learning_rate": 8.96800850722874e-06, + "loss": 0.7528, + "step": 4267 + }, + { + "epoch": 1.239076789084047, + "grad_norm": 3.809748888015747, + "learning_rate": 8.967424210344475e-06, + "loss": 0.8771, + "step": 4268 + }, + { + "epoch": 1.2393671069821455, + "grad_norm": 3.711108684539795, + "learning_rate": 8.96683976714318e-06, + "loss": 0.7809, + "step": 4269 + }, + { + "epoch": 1.2396574248802439, + "grad_norm": 3.6016719341278076, + "learning_rate": 8.96625517764641e-06, + "loss": 0.8463, + "step": 4270 + }, + { + "epoch": 1.2399477427783423, + "grad_norm": 3.190556049346924, + "learning_rate": 8.965670441875722e-06, + "loss": 0.7897, + "step": 4271 + }, + { + "epoch": 1.2402380606764407, + "grad_norm": 3.8056397438049316, + "learning_rate": 8.965085559852682e-06, + "loss": 0.7555, + "step": 4272 + }, + { + "epoch": 1.2405283785745391, + "grad_norm": 
3.822848081588745, + "learning_rate": 8.964500531598859e-06, + "loss": 0.7953, + "step": 4273 + }, + { + "epoch": 1.2408186964726375, + "grad_norm": 3.6595678329467773, + "learning_rate": 8.963915357135831e-06, + "loss": 0.8042, + "step": 4274 + }, + { + "epoch": 1.241109014370736, + "grad_norm": 3.2902088165283203, + "learning_rate": 8.963330036485177e-06, + "loss": 0.6457, + "step": 4275 + }, + { + "epoch": 1.2413993322688344, + "grad_norm": 3.0377769470214844, + "learning_rate": 8.962744569668485e-06, + "loss": 0.7047, + "step": 4276 + }, + { + "epoch": 1.2416896501669328, + "grad_norm": 3.4491989612579346, + "learning_rate": 8.962158956707343e-06, + "loss": 0.7604, + "step": 4277 + }, + { + "epoch": 1.2419799680650312, + "grad_norm": 3.833693027496338, + "learning_rate": 8.961573197623353e-06, + "loss": 0.7477, + "step": 4278 + }, + { + "epoch": 1.2422702859631296, + "grad_norm": 3.5604989528656006, + "learning_rate": 8.960987292438117e-06, + "loss": 0.7044, + "step": 4279 + }, + { + "epoch": 1.242560603861228, + "grad_norm": 4.023108959197998, + "learning_rate": 8.96040124117324e-06, + "loss": 0.8121, + "step": 4280 + }, + { + "epoch": 1.2428509217593264, + "grad_norm": 4.016019821166992, + "learning_rate": 8.959815043850336e-06, + "loss": 0.8181, + "step": 4281 + }, + { + "epoch": 1.2431412396574248, + "grad_norm": 3.4648163318634033, + "learning_rate": 8.959228700491025e-06, + "loss": 0.7576, + "step": 4282 + }, + { + "epoch": 1.2434315575555233, + "grad_norm": 3.7959625720977783, + "learning_rate": 8.958642211116932e-06, + "loss": 0.8032, + "step": 4283 + }, + { + "epoch": 1.2437218754536217, + "grad_norm": 3.156304121017456, + "learning_rate": 8.958055575749685e-06, + "loss": 0.6847, + "step": 4284 + }, + { + "epoch": 1.24401219335172, + "grad_norm": 3.544156789779663, + "learning_rate": 8.957468794410918e-06, + "loss": 0.8136, + "step": 4285 + }, + { + "epoch": 1.2443025112498185, + "grad_norm": 3.452969551086426, + "learning_rate": 8.956881867122272e-06, + "loss": 0.8339, + "step": 4286 + }, + { + "epoch": 1.244592829147917, + "grad_norm": 3.346737861633301, + "learning_rate": 8.956294793905394e-06, + "loss": 0.6818, + "step": 4287 + }, + { + "epoch": 1.2448831470460153, + "grad_norm": 3.5661866664886475, + "learning_rate": 8.955707574781934e-06, + "loss": 0.8036, + "step": 4288 + }, + { + "epoch": 1.2451734649441137, + "grad_norm": 3.5071399211883545, + "learning_rate": 8.955120209773549e-06, + "loss": 0.7945, + "step": 4289 + }, + { + "epoch": 1.2454637828422122, + "grad_norm": 3.2883074283599854, + "learning_rate": 8.954532698901899e-06, + "loss": 0.7716, + "step": 4290 + }, + { + "epoch": 1.2457541007403106, + "grad_norm": 3.3931667804718018, + "learning_rate": 8.953945042188652e-06, + "loss": 0.7448, + "step": 4291 + }, + { + "epoch": 1.246044418638409, + "grad_norm": 3.219741106033325, + "learning_rate": 8.953357239655482e-06, + "loss": 0.739, + "step": 4292 + }, + { + "epoch": 1.2463347365365074, + "grad_norm": 3.6574721336364746, + "learning_rate": 8.952769291324065e-06, + "loss": 0.842, + "step": 4293 + }, + { + "epoch": 1.2466250544346058, + "grad_norm": 3.3695685863494873, + "learning_rate": 8.952181197216086e-06, + "loss": 0.7608, + "step": 4294 + }, + { + "epoch": 1.2469153723327042, + "grad_norm": 3.4170355796813965, + "learning_rate": 8.951592957353233e-06, + "loss": 0.7691, + "step": 4295 + }, + { + "epoch": 1.2472056902308026, + "grad_norm": 3.5159530639648438, + "learning_rate": 8.9510045717572e-06, + "loss": 0.7036, + "step": 4296 + }, + { + "epoch": 
1.247496008128901, + "grad_norm": 3.3947741985321045, + "learning_rate": 8.950416040449684e-06, + "loss": 0.7098, + "step": 4297 + }, + { + "epoch": 1.2477863260269997, + "grad_norm": 3.538968801498413, + "learning_rate": 8.949827363452394e-06, + "loss": 0.7997, + "step": 4298 + }, + { + "epoch": 1.2480766439250979, + "grad_norm": 3.8506956100463867, + "learning_rate": 8.949238540787038e-06, + "loss": 0.8263, + "step": 4299 + }, + { + "epoch": 1.2483669618231965, + "grad_norm": 3.439701795578003, + "learning_rate": 8.948649572475332e-06, + "loss": 0.8389, + "step": 4300 + }, + { + "epoch": 1.2486572797212947, + "grad_norm": 3.6517250537872314, + "learning_rate": 8.948060458538996e-06, + "loss": 0.8981, + "step": 4301 + }, + { + "epoch": 1.2489475976193933, + "grad_norm": 3.491595983505249, + "learning_rate": 8.947471198999758e-06, + "loss": 0.729, + "step": 4302 + }, + { + "epoch": 1.2492379155174917, + "grad_norm": 3.2227985858917236, + "learning_rate": 8.946881793879348e-06, + "loss": 0.7198, + "step": 4303 + }, + { + "epoch": 1.2495282334155902, + "grad_norm": 3.37418532371521, + "learning_rate": 8.946292243199504e-06, + "loss": 0.7225, + "step": 4304 + }, + { + "epoch": 1.2498185513136886, + "grad_norm": 3.6257195472717285, + "learning_rate": 8.94570254698197e-06, + "loss": 0.8104, + "step": 4305 + }, + { + "epoch": 1.250108869211787, + "grad_norm": 3.424806833267212, + "learning_rate": 8.945112705248488e-06, + "loss": 0.7668, + "step": 4306 + }, + { + "epoch": 1.2503991871098854, + "grad_norm": 3.6353793144226074, + "learning_rate": 8.944522718020818e-06, + "loss": 0.6752, + "step": 4307 + }, + { + "epoch": 1.2506895050079838, + "grad_norm": 3.7617337703704834, + "learning_rate": 8.943932585320714e-06, + "loss": 0.9097, + "step": 4308 + }, + { + "epoch": 1.2509798229060822, + "grad_norm": 3.1361441612243652, + "learning_rate": 8.943342307169942e-06, + "loss": 0.6137, + "step": 4309 + }, + { + "epoch": 1.2512701408041806, + "grad_norm": 3.2930431365966797, + "learning_rate": 8.94275188359027e-06, + "loss": 0.6702, + "step": 4310 + }, + { + "epoch": 1.251560458702279, + "grad_norm": 3.5887277126312256, + "learning_rate": 8.942161314603475e-06, + "loss": 0.7784, + "step": 4311 + }, + { + "epoch": 1.2518507766003775, + "grad_norm": 3.7460267543792725, + "learning_rate": 8.941570600231333e-06, + "loss": 0.8589, + "step": 4312 + }, + { + "epoch": 1.2521410944984759, + "grad_norm": 3.7701773643493652, + "learning_rate": 8.940979740495632e-06, + "loss": 0.8413, + "step": 4313 + }, + { + "epoch": 1.2524314123965743, + "grad_norm": 3.804666519165039, + "learning_rate": 8.940388735418163e-06, + "loss": 0.7439, + "step": 4314 + }, + { + "epoch": 1.2527217302946727, + "grad_norm": 3.4871022701263428, + "learning_rate": 8.93979758502072e-06, + "loss": 0.6554, + "step": 4315 + }, + { + "epoch": 1.2530120481927711, + "grad_norm": 4.020226001739502, + "learning_rate": 8.939206289325107e-06, + "loss": 0.8006, + "step": 4316 + }, + { + "epoch": 1.2533023660908695, + "grad_norm": 3.413485527038574, + "learning_rate": 8.938614848353127e-06, + "loss": 0.7265, + "step": 4317 + }, + { + "epoch": 1.253592683988968, + "grad_norm": 3.1707980632781982, + "learning_rate": 8.938023262126596e-06, + "loss": 0.727, + "step": 4318 + }, + { + "epoch": 1.2538830018870664, + "grad_norm": 3.4203269481658936, + "learning_rate": 8.937431530667329e-06, + "loss": 0.7808, + "step": 4319 + }, + { + "epoch": 1.2541733197851648, + "grad_norm": 3.5568814277648926, + "learning_rate": 8.93683965399715e-06, + "loss": 0.8797, + 
"step": 4320 + }, + { + "epoch": 1.2544636376832632, + "grad_norm": 3.493055820465088, + "learning_rate": 8.936247632137886e-06, + "loss": 0.7317, + "step": 4321 + }, + { + "epoch": 1.2547539555813616, + "grad_norm": 3.5168776512145996, + "learning_rate": 8.935655465111372e-06, + "loss": 0.7399, + "step": 4322 + }, + { + "epoch": 1.25504427347946, + "grad_norm": 3.694639205932617, + "learning_rate": 8.935063152939446e-06, + "loss": 0.7509, + "step": 4323 + }, + { + "epoch": 1.2553345913775584, + "grad_norm": 3.880681276321411, + "learning_rate": 8.934470695643955e-06, + "loss": 0.7885, + "step": 4324 + }, + { + "epoch": 1.2556249092756568, + "grad_norm": 3.654292345046997, + "learning_rate": 8.933878093246744e-06, + "loss": 0.7816, + "step": 4325 + }, + { + "epoch": 1.2559152271737553, + "grad_norm": 3.8426339626312256, + "learning_rate": 8.933285345769671e-06, + "loss": 0.7581, + "step": 4326 + }, + { + "epoch": 1.2562055450718537, + "grad_norm": 4.196420192718506, + "learning_rate": 8.932692453234596e-06, + "loss": 0.9055, + "step": 4327 + }, + { + "epoch": 1.256495862969952, + "grad_norm": 3.6766929626464844, + "learning_rate": 8.93209941566338e-06, + "loss": 0.7715, + "step": 4328 + }, + { + "epoch": 1.2567861808680505, + "grad_norm": 3.5587241649627686, + "learning_rate": 8.9315062330779e-06, + "loss": 0.7941, + "step": 4329 + }, + { + "epoch": 1.257076498766149, + "grad_norm": 3.5319676399230957, + "learning_rate": 8.930912905500032e-06, + "loss": 0.7719, + "step": 4330 + }, + { + "epoch": 1.2573668166642473, + "grad_norm": 3.6964783668518066, + "learning_rate": 8.930319432951655e-06, + "loss": 0.8323, + "step": 4331 + }, + { + "epoch": 1.2576571345623457, + "grad_norm": 3.3253002166748047, + "learning_rate": 8.929725815454656e-06, + "loss": 0.7429, + "step": 4332 + }, + { + "epoch": 1.2579474524604441, + "grad_norm": 3.380309581756592, + "learning_rate": 8.929132053030928e-06, + "loss": 0.6763, + "step": 4333 + }, + { + "epoch": 1.2582377703585426, + "grad_norm": 3.194960117340088, + "learning_rate": 8.928538145702372e-06, + "loss": 0.6991, + "step": 4334 + }, + { + "epoch": 1.258528088256641, + "grad_norm": 3.830277681350708, + "learning_rate": 8.927944093490886e-06, + "loss": 0.8593, + "step": 4335 + }, + { + "epoch": 1.2588184061547394, + "grad_norm": 3.335928440093994, + "learning_rate": 8.92734989641838e-06, + "loss": 0.7855, + "step": 4336 + }, + { + "epoch": 1.2591087240528378, + "grad_norm": 3.180267572402954, + "learning_rate": 8.92675555450677e-06, + "loss": 0.6565, + "step": 4337 + }, + { + "epoch": 1.2593990419509362, + "grad_norm": 3.597320795059204, + "learning_rate": 8.926161067777973e-06, + "loss": 0.8024, + "step": 4338 + }, + { + "epoch": 1.2596893598490346, + "grad_norm": 3.2640135288238525, + "learning_rate": 8.925566436253915e-06, + "loss": 0.6889, + "step": 4339 + }, + { + "epoch": 1.259979677747133, + "grad_norm": 3.3412210941314697, + "learning_rate": 8.924971659956523e-06, + "loss": 0.703, + "step": 4340 + }, + { + "epoch": 1.2602699956452315, + "grad_norm": 3.2234513759613037, + "learning_rate": 8.924376738907734e-06, + "loss": 0.8093, + "step": 4341 + }, + { + "epoch": 1.2605603135433299, + "grad_norm": 3.5414047241210938, + "learning_rate": 8.923781673129488e-06, + "loss": 0.7886, + "step": 4342 + }, + { + "epoch": 1.2608506314414283, + "grad_norm": 3.6356825828552246, + "learning_rate": 8.923186462643732e-06, + "loss": 0.8428, + "step": 4343 + }, + { + "epoch": 1.2611409493395267, + "grad_norm": 3.2509765625, + "learning_rate": 8.922591107472413e-06, + 
"loss": 0.6025, + "step": 4344 + }, + { + "epoch": 1.2614312672376253, + "grad_norm": 3.6975150108337402, + "learning_rate": 8.921995607637494e-06, + "loss": 0.8912, + "step": 4345 + }, + { + "epoch": 1.2617215851357235, + "grad_norm": 3.27187180519104, + "learning_rate": 8.921399963160934e-06, + "loss": 0.8242, + "step": 4346 + }, + { + "epoch": 1.2620119030338222, + "grad_norm": 3.6707258224487305, + "learning_rate": 8.920804174064697e-06, + "loss": 0.7924, + "step": 4347 + }, + { + "epoch": 1.2623022209319203, + "grad_norm": 3.329015016555786, + "learning_rate": 8.920208240370757e-06, + "loss": 0.6646, + "step": 4348 + }, + { + "epoch": 1.262592538830019, + "grad_norm": 3.4273433685302734, + "learning_rate": 8.919612162101096e-06, + "loss": 0.7172, + "step": 4349 + }, + { + "epoch": 1.2628828567281172, + "grad_norm": 3.6761045455932617, + "learning_rate": 8.919015939277693e-06, + "loss": 0.7967, + "step": 4350 + }, + { + "epoch": 1.2631731746262158, + "grad_norm": 3.431152105331421, + "learning_rate": 8.918419571922536e-06, + "loss": 0.7262, + "step": 4351 + }, + { + "epoch": 1.263463492524314, + "grad_norm": 3.728382110595703, + "learning_rate": 8.917823060057622e-06, + "loss": 0.8809, + "step": 4352 + }, + { + "epoch": 1.2637538104224126, + "grad_norm": 3.5108156204223633, + "learning_rate": 8.917226403704947e-06, + "loss": 0.8824, + "step": 4353 + }, + { + "epoch": 1.2640441283205108, + "grad_norm": 4.058180809020996, + "learning_rate": 8.916629602886518e-06, + "loss": 0.9238, + "step": 4354 + }, + { + "epoch": 1.2643344462186095, + "grad_norm": 3.4847519397735596, + "learning_rate": 8.916032657624342e-06, + "loss": 0.7447, + "step": 4355 + }, + { + "epoch": 1.2646247641167079, + "grad_norm": 3.2892417907714844, + "learning_rate": 8.915435567940436e-06, + "loss": 0.8063, + "step": 4356 + }, + { + "epoch": 1.2649150820148063, + "grad_norm": 3.6869657039642334, + "learning_rate": 8.914838333856822e-06, + "loss": 0.7635, + "step": 4357 + }, + { + "epoch": 1.2652053999129047, + "grad_norm": 3.4175963401794434, + "learning_rate": 8.914240955395522e-06, + "loss": 0.69, + "step": 4358 + }, + { + "epoch": 1.2654957178110031, + "grad_norm": 3.2602951526641846, + "learning_rate": 8.913643432578567e-06, + "loss": 0.7531, + "step": 4359 + }, + { + "epoch": 1.2657860357091015, + "grad_norm": 3.464566469192505, + "learning_rate": 8.913045765428e-06, + "loss": 0.7623, + "step": 4360 + }, + { + "epoch": 1.2660763536072, + "grad_norm": 3.740095615386963, + "learning_rate": 8.912447953965854e-06, + "loss": 0.7427, + "step": 4361 + }, + { + "epoch": 1.2663666715052984, + "grad_norm": 3.2100818157196045, + "learning_rate": 8.911849998214182e-06, + "loss": 0.7076, + "step": 4362 + }, + { + "epoch": 1.2666569894033968, + "grad_norm": 4.004035472869873, + "learning_rate": 8.911251898195033e-06, + "loss": 0.9656, + "step": 4363 + }, + { + "epoch": 1.2669473073014952, + "grad_norm": 3.215731143951416, + "learning_rate": 8.910653653930466e-06, + "loss": 0.7425, + "step": 4364 + }, + { + "epoch": 1.2672376251995936, + "grad_norm": 3.169572353363037, + "learning_rate": 8.910055265442546e-06, + "loss": 0.606, + "step": 4365 + }, + { + "epoch": 1.267527943097692, + "grad_norm": 3.384373903274536, + "learning_rate": 8.909456732753339e-06, + "loss": 0.7641, + "step": 4366 + }, + { + "epoch": 1.2678182609957904, + "grad_norm": 3.2704479694366455, + "learning_rate": 8.908858055884919e-06, + "loss": 0.7908, + "step": 4367 + }, + { + "epoch": 1.2681085788938888, + "grad_norm": 3.2683961391448975, + "learning_rate": 
8.908259234859365e-06, + "loss": 0.7381, + "step": 4368 + }, + { + "epoch": 1.2683988967919873, + "grad_norm": 3.749446153640747, + "learning_rate": 8.90766026969876e-06, + "loss": 0.7697, + "step": 4369 + }, + { + "epoch": 1.2686892146900857, + "grad_norm": 3.1948935985565186, + "learning_rate": 8.907061160425196e-06, + "loss": 0.7704, + "step": 4370 + }, + { + "epoch": 1.268979532588184, + "grad_norm": 3.549154281616211, + "learning_rate": 8.906461907060766e-06, + "loss": 0.7984, + "step": 4371 + }, + { + "epoch": 1.2692698504862825, + "grad_norm": 3.7574455738067627, + "learning_rate": 8.905862509627573e-06, + "loss": 0.8247, + "step": 4372 + }, + { + "epoch": 1.269560168384381, + "grad_norm": 3.7579362392425537, + "learning_rate": 8.905262968147719e-06, + "loss": 0.8506, + "step": 4373 + }, + { + "epoch": 1.2698504862824793, + "grad_norm": 3.5681581497192383, + "learning_rate": 8.904663282643317e-06, + "loss": 0.8562, + "step": 4374 + }, + { + "epoch": 1.2701408041805777, + "grad_norm": 3.9688186645507812, + "learning_rate": 8.904063453136483e-06, + "loss": 0.7506, + "step": 4375 + }, + { + "epoch": 1.2704311220786761, + "grad_norm": 3.3955612182617188, + "learning_rate": 8.90346347964934e-06, + "loss": 0.8032, + "step": 4376 + }, + { + "epoch": 1.2707214399767746, + "grad_norm": 3.876274585723877, + "learning_rate": 8.90286336220401e-06, + "loss": 0.8659, + "step": 4377 + }, + { + "epoch": 1.271011757874873, + "grad_norm": 3.3711607456207275, + "learning_rate": 8.902263100822628e-06, + "loss": 0.8466, + "step": 4378 + }, + { + "epoch": 1.2713020757729714, + "grad_norm": 3.78266978263855, + "learning_rate": 8.901662695527333e-06, + "loss": 0.7602, + "step": 4379 + }, + { + "epoch": 1.2715923936710698, + "grad_norm": 3.5354392528533936, + "learning_rate": 8.901062146340264e-06, + "loss": 0.7627, + "step": 4380 + }, + { + "epoch": 1.2718827115691682, + "grad_norm": 3.4958252906799316, + "learning_rate": 8.900461453283573e-06, + "loss": 0.7408, + "step": 4381 + }, + { + "epoch": 1.2721730294672666, + "grad_norm": 3.33056902885437, + "learning_rate": 8.899860616379413e-06, + "loss": 0.6797, + "step": 4382 + }, + { + "epoch": 1.272463347365365, + "grad_norm": 3.6068787574768066, + "learning_rate": 8.899259635649937e-06, + "loss": 0.7534, + "step": 4383 + }, + { + "epoch": 1.2727536652634635, + "grad_norm": 3.752138376235962, + "learning_rate": 8.898658511117316e-06, + "loss": 0.862, + "step": 4384 + }, + { + "epoch": 1.2730439831615619, + "grad_norm": 4.157615661621094, + "learning_rate": 8.898057242803715e-06, + "loss": 0.9252, + "step": 4385 + }, + { + "epoch": 1.2733343010596603, + "grad_norm": 3.7800021171569824, + "learning_rate": 8.89745583073131e-06, + "loss": 0.9327, + "step": 4386 + }, + { + "epoch": 1.2736246189577587, + "grad_norm": 3.5581021308898926, + "learning_rate": 8.89685427492228e-06, + "loss": 0.7756, + "step": 4387 + }, + { + "epoch": 1.273914936855857, + "grad_norm": 4.283809185028076, + "learning_rate": 8.896252575398812e-06, + "loss": 0.7042, + "step": 4388 + }, + { + "epoch": 1.2742052547539555, + "grad_norm": 3.8366518020629883, + "learning_rate": 8.895650732183094e-06, + "loss": 0.7813, + "step": 4389 + }, + { + "epoch": 1.274495572652054, + "grad_norm": 3.6063332557678223, + "learning_rate": 8.895048745297324e-06, + "loss": 0.8001, + "step": 4390 + }, + { + "epoch": 1.2747858905501523, + "grad_norm": 3.7101552486419678, + "learning_rate": 8.894446614763703e-06, + "loss": 0.8196, + "step": 4391 + }, + { + "epoch": 1.2750762084482508, + "grad_norm": 
3.490100145339966, + "learning_rate": 8.893844340604433e-06, + "loss": 0.6849, + "step": 4392 + }, + { + "epoch": 1.2753665263463492, + "grad_norm": 3.1747055053710938, + "learning_rate": 8.89324192284173e-06, + "loss": 0.7158, + "step": 4393 + }, + { + "epoch": 1.2756568442444478, + "grad_norm": 3.8452651500701904, + "learning_rate": 8.892639361497812e-06, + "loss": 0.8298, + "step": 4394 + }, + { + "epoch": 1.275947162142546, + "grad_norm": 3.712412118911743, + "learning_rate": 8.892036656594898e-06, + "loss": 0.8208, + "step": 4395 + }, + { + "epoch": 1.2762374800406446, + "grad_norm": 3.924801826477051, + "learning_rate": 8.891433808155217e-06, + "loss": 0.7733, + "step": 4396 + }, + { + "epoch": 1.2765277979387428, + "grad_norm": 3.4314823150634766, + "learning_rate": 8.890830816201002e-06, + "loss": 0.7885, + "step": 4397 + }, + { + "epoch": 1.2768181158368415, + "grad_norm": 3.6019883155822754, + "learning_rate": 8.890227680754488e-06, + "loss": 0.8482, + "step": 4398 + }, + { + "epoch": 1.2771084337349397, + "grad_norm": 3.7721011638641357, + "learning_rate": 8.889624401837922e-06, + "loss": 0.8683, + "step": 4399 + }, + { + "epoch": 1.2773987516330383, + "grad_norm": 4.242115497589111, + "learning_rate": 8.889020979473552e-06, + "loss": 0.7933, + "step": 4400 + }, + { + "epoch": 1.2776890695311365, + "grad_norm": 3.3585760593414307, + "learning_rate": 8.888417413683632e-06, + "loss": 0.7908, + "step": 4401 + }, + { + "epoch": 1.2779793874292351, + "grad_norm": 3.427093744277954, + "learning_rate": 8.88781370449042e-06, + "loss": 0.7503, + "step": 4402 + }, + { + "epoch": 1.2782697053273333, + "grad_norm": 3.113924264907837, + "learning_rate": 8.887209851916184e-06, + "loss": 0.6797, + "step": 4403 + }, + { + "epoch": 1.278560023225432, + "grad_norm": 3.53076171875, + "learning_rate": 8.886605855983186e-06, + "loss": 0.8397, + "step": 4404 + }, + { + "epoch": 1.2788503411235301, + "grad_norm": 3.538825273513794, + "learning_rate": 8.88600171671371e-06, + "loss": 0.8238, + "step": 4405 + }, + { + "epoch": 1.2791406590216288, + "grad_norm": 3.9378812313079834, + "learning_rate": 8.885397434130032e-06, + "loss": 0.93, + "step": 4406 + }, + { + "epoch": 1.2794309769197272, + "grad_norm": 3.679234743118286, + "learning_rate": 8.88479300825444e-06, + "loss": 0.8222, + "step": 4407 + }, + { + "epoch": 1.2797212948178256, + "grad_norm": 3.579631805419922, + "learning_rate": 8.884188439109221e-06, + "loss": 0.712, + "step": 4408 + }, + { + "epoch": 1.280011612715924, + "grad_norm": 3.1883227825164795, + "learning_rate": 8.883583726716675e-06, + "loss": 0.7363, + "step": 4409 + }, + { + "epoch": 1.2803019306140224, + "grad_norm": 3.2501161098480225, + "learning_rate": 8.882978871099104e-06, + "loss": 0.7167, + "step": 4410 + }, + { + "epoch": 1.2805922485121208, + "grad_norm": 3.2505548000335693, + "learning_rate": 8.882373872278811e-06, + "loss": 0.7979, + "step": 4411 + }, + { + "epoch": 1.2808825664102192, + "grad_norm": 3.244330644607544, + "learning_rate": 8.881768730278112e-06, + "loss": 0.7045, + "step": 4412 + }, + { + "epoch": 1.2811728843083177, + "grad_norm": 3.533038854598999, + "learning_rate": 8.88116344511932e-06, + "loss": 0.7283, + "step": 4413 + }, + { + "epoch": 1.281463202206416, + "grad_norm": 3.818068027496338, + "learning_rate": 8.88055801682476e-06, + "loss": 0.7732, + "step": 4414 + }, + { + "epoch": 1.2817535201045145, + "grad_norm": 3.346083164215088, + "learning_rate": 8.879952445416763e-06, + "loss": 0.7547, + "step": 4415 + }, + { + "epoch": 1.282043838002613, + 
"grad_norm": 3.3234782218933105, + "learning_rate": 8.87934673091766e-06, + "loss": 0.7011, + "step": 4416 + }, + { + "epoch": 1.2823341559007113, + "grad_norm": 3.6858558654785156, + "learning_rate": 8.878740873349786e-06, + "loss": 0.7762, + "step": 4417 + }, + { + "epoch": 1.2826244737988097, + "grad_norm": 3.9861769676208496, + "learning_rate": 8.878134872735488e-06, + "loss": 0.7367, + "step": 4418 + }, + { + "epoch": 1.2829147916969081, + "grad_norm": 3.2009475231170654, + "learning_rate": 8.877528729097119e-06, + "loss": 0.6656, + "step": 4419 + }, + { + "epoch": 1.2832051095950066, + "grad_norm": 3.757075071334839, + "learning_rate": 8.876922442457026e-06, + "loss": 0.8415, + "step": 4420 + }, + { + "epoch": 1.283495427493105, + "grad_norm": 3.684903383255005, + "learning_rate": 8.87631601283757e-06, + "loss": 0.7047, + "step": 4421 + }, + { + "epoch": 1.2837857453912034, + "grad_norm": 3.873124122619629, + "learning_rate": 8.875709440261122e-06, + "loss": 0.9507, + "step": 4422 + }, + { + "epoch": 1.2840760632893018, + "grad_norm": 3.6405625343322754, + "learning_rate": 8.875102724750046e-06, + "loss": 0.7636, + "step": 4423 + }, + { + "epoch": 1.2843663811874002, + "grad_norm": 3.4353067874908447, + "learning_rate": 8.874495866326717e-06, + "loss": 0.7197, + "step": 4424 + }, + { + "epoch": 1.2846566990854986, + "grad_norm": 3.651857376098633, + "learning_rate": 8.873888865013522e-06, + "loss": 0.7654, + "step": 4425 + }, + { + "epoch": 1.284947016983597, + "grad_norm": 3.4452688694000244, + "learning_rate": 8.873281720832841e-06, + "loss": 0.7886, + "step": 4426 + }, + { + "epoch": 1.2852373348816954, + "grad_norm": 3.2013700008392334, + "learning_rate": 8.872674433807066e-06, + "loss": 0.7016, + "step": 4427 + }, + { + "epoch": 1.2855276527797939, + "grad_norm": 3.624314546585083, + "learning_rate": 8.872067003958597e-06, + "loss": 0.7305, + "step": 4428 + }, + { + "epoch": 1.2858179706778923, + "grad_norm": 3.3400025367736816, + "learning_rate": 8.871459431309832e-06, + "loss": 0.7687, + "step": 4429 + }, + { + "epoch": 1.2861082885759907, + "grad_norm": 3.594221353530884, + "learning_rate": 8.870851715883181e-06, + "loss": 0.7492, + "step": 4430 + }, + { + "epoch": 1.286398606474089, + "grad_norm": 3.681166172027588, + "learning_rate": 8.870243857701054e-06, + "loss": 0.7178, + "step": 4431 + }, + { + "epoch": 1.2866889243721875, + "grad_norm": 3.8900341987609863, + "learning_rate": 8.86963585678587e-06, + "loss": 0.7441, + "step": 4432 + }, + { + "epoch": 1.286979242270286, + "grad_norm": 3.9225640296936035, + "learning_rate": 8.86902771316005e-06, + "loss": 0.8456, + "step": 4433 + }, + { + "epoch": 1.2872695601683843, + "grad_norm": 4.030943393707275, + "learning_rate": 8.868419426846023e-06, + "loss": 0.917, + "step": 4434 + }, + { + "epoch": 1.2875598780664828, + "grad_norm": 3.665842294692993, + "learning_rate": 8.867810997866224e-06, + "loss": 0.7861, + "step": 4435 + }, + { + "epoch": 1.2878501959645812, + "grad_norm": 3.1855833530426025, + "learning_rate": 8.867202426243089e-06, + "loss": 0.7015, + "step": 4436 + }, + { + "epoch": 1.2881405138626796, + "grad_norm": 3.545858860015869, + "learning_rate": 8.866593711999065e-06, + "loss": 0.6991, + "step": 4437 + }, + { + "epoch": 1.288430831760778, + "grad_norm": 3.6752161979675293, + "learning_rate": 8.865984855156597e-06, + "loss": 0.9095, + "step": 4438 + }, + { + "epoch": 1.2887211496588764, + "grad_norm": 3.5139942169189453, + "learning_rate": 8.865375855738144e-06, + "loss": 0.7329, + "step": 4439 + }, + { + 
"epoch": 1.2890114675569748, + "grad_norm": 3.157313346862793, + "learning_rate": 8.864766713766163e-06, + "loss": 0.7239, + "step": 4440 + }, + { + "epoch": 1.2893017854550732, + "grad_norm": 3.623577117919922, + "learning_rate": 8.864157429263117e-06, + "loss": 0.8599, + "step": 4441 + }, + { + "epoch": 1.2895921033531716, + "grad_norm": 3.468719959259033, + "learning_rate": 8.86354800225148e-06, + "loss": 0.7423, + "step": 4442 + }, + { + "epoch": 1.28988242125127, + "grad_norm": 3.5650932788848877, + "learning_rate": 8.862938432753727e-06, + "loss": 0.7737, + "step": 4443 + }, + { + "epoch": 1.2901727391493685, + "grad_norm": 3.9904751777648926, + "learning_rate": 8.862328720792336e-06, + "loss": 0.8928, + "step": 4444 + }, + { + "epoch": 1.290463057047467, + "grad_norm": 3.572465419769287, + "learning_rate": 8.861718866389794e-06, + "loss": 0.7338, + "step": 4445 + }, + { + "epoch": 1.2907533749455653, + "grad_norm": 3.5529489517211914, + "learning_rate": 8.861108869568595e-06, + "loss": 0.7628, + "step": 4446 + }, + { + "epoch": 1.291043692843664, + "grad_norm": 4.1549763679504395, + "learning_rate": 8.860498730351232e-06, + "loss": 0.8803, + "step": 4447 + }, + { + "epoch": 1.2913340107417621, + "grad_norm": 3.6090340614318848, + "learning_rate": 8.859888448760207e-06, + "loss": 0.7089, + "step": 4448 + }, + { + "epoch": 1.2916243286398608, + "grad_norm": 3.5773282051086426, + "learning_rate": 8.859278024818028e-06, + "loss": 0.7114, + "step": 4449 + }, + { + "epoch": 1.291914646537959, + "grad_norm": 3.5102736949920654, + "learning_rate": 8.858667458547207e-06, + "loss": 0.6933, + "step": 4450 + }, + { + "epoch": 1.2922049644360576, + "grad_norm": 3.4220693111419678, + "learning_rate": 8.858056749970263e-06, + "loss": 0.8308, + "step": 4451 + }, + { + "epoch": 1.2924952823341558, + "grad_norm": 3.735527992248535, + "learning_rate": 8.857445899109716e-06, + "loss": 0.8135, + "step": 4452 + }, + { + "epoch": 1.2927856002322544, + "grad_norm": 3.3440489768981934, + "learning_rate": 8.856834905988095e-06, + "loss": 0.8015, + "step": 4453 + }, + { + "epoch": 1.2930759181303526, + "grad_norm": 3.7086057662963867, + "learning_rate": 8.856223770627932e-06, + "loss": 0.7704, + "step": 4454 + }, + { + "epoch": 1.2933662360284512, + "grad_norm": 3.410614490509033, + "learning_rate": 8.855612493051768e-06, + "loss": 0.7604, + "step": 4455 + }, + { + "epoch": 1.2936565539265494, + "grad_norm": 3.5582363605499268, + "learning_rate": 8.855001073282145e-06, + "loss": 0.7961, + "step": 4456 + }, + { + "epoch": 1.293946871824648, + "grad_norm": 3.860466241836548, + "learning_rate": 8.854389511341613e-06, + "loss": 0.8195, + "step": 4457 + }, + { + "epoch": 1.2942371897227465, + "grad_norm": 3.4531681537628174, + "learning_rate": 8.853777807252724e-06, + "loss": 0.7939, + "step": 4458 + }, + { + "epoch": 1.294527507620845, + "grad_norm": 3.2805068492889404, + "learning_rate": 8.85316596103804e-06, + "loss": 0.7967, + "step": 4459 + }, + { + "epoch": 1.2948178255189433, + "grad_norm": 3.298468828201294, + "learning_rate": 8.852553972720123e-06, + "loss": 0.7372, + "step": 4460 + }, + { + "epoch": 1.2951081434170417, + "grad_norm": 3.193430185317993, + "learning_rate": 8.851941842321545e-06, + "loss": 0.6366, + "step": 4461 + }, + { + "epoch": 1.2953984613151401, + "grad_norm": 3.1615333557128906, + "learning_rate": 8.851329569864882e-06, + "loss": 0.6768, + "step": 4462 + }, + { + "epoch": 1.2956887792132386, + "grad_norm": 3.9289627075195312, + "learning_rate": 8.85071715537271e-06, + "loss": 0.8529, 
+ "step": 4463 + }, + { + "epoch": 1.295979097111337, + "grad_norm": 3.5651650428771973, + "learning_rate": 8.85010459886762e-06, + "loss": 0.7418, + "step": 4464 + }, + { + "epoch": 1.2962694150094354, + "grad_norm": 3.642563819885254, + "learning_rate": 8.849491900372199e-06, + "loss": 0.7399, + "step": 4465 + }, + { + "epoch": 1.2965597329075338, + "grad_norm": 4.064639568328857, + "learning_rate": 8.848879059909043e-06, + "loss": 0.7291, + "step": 4466 + }, + { + "epoch": 1.2968500508056322, + "grad_norm": 3.841418504714966, + "learning_rate": 8.848266077500757e-06, + "loss": 0.7529, + "step": 4467 + }, + { + "epoch": 1.2971403687037306, + "grad_norm": 3.663754463195801, + "learning_rate": 8.847652953169944e-06, + "loss": 0.8091, + "step": 4468 + }, + { + "epoch": 1.297430686601829, + "grad_norm": 3.4412853717803955, + "learning_rate": 8.847039686939218e-06, + "loss": 0.7146, + "step": 4469 + }, + { + "epoch": 1.2977210044999274, + "grad_norm": 3.2278478145599365, + "learning_rate": 8.846426278831193e-06, + "loss": 0.7616, + "step": 4470 + }, + { + "epoch": 1.2980113223980259, + "grad_norm": 3.4113316535949707, + "learning_rate": 8.845812728868496e-06, + "loss": 0.7473, + "step": 4471 + }, + { + "epoch": 1.2983016402961243, + "grad_norm": 3.9025003910064697, + "learning_rate": 8.845199037073748e-06, + "loss": 0.8915, + "step": 4472 + }, + { + "epoch": 1.2985919581942227, + "grad_norm": 4.0561957359313965, + "learning_rate": 8.84458520346959e-06, + "loss": 0.906, + "step": 4473 + }, + { + "epoch": 1.298882276092321, + "grad_norm": 3.336223840713501, + "learning_rate": 8.843971228078652e-06, + "loss": 0.713, + "step": 4474 + }, + { + "epoch": 1.2991725939904195, + "grad_norm": 2.9069418907165527, + "learning_rate": 8.843357110923582e-06, + "loss": 0.6755, + "step": 4475 + }, + { + "epoch": 1.299462911888518, + "grad_norm": 3.5413739681243896, + "learning_rate": 8.842742852027027e-06, + "loss": 0.7062, + "step": 4476 + }, + { + "epoch": 1.2997532297866163, + "grad_norm": 3.9641714096069336, + "learning_rate": 8.84212845141164e-06, + "loss": 0.7989, + "step": 4477 + }, + { + "epoch": 1.3000435476847148, + "grad_norm": 3.441683292388916, + "learning_rate": 8.84151390910008e-06, + "loss": 0.7618, + "step": 4478 + }, + { + "epoch": 1.3003338655828132, + "grad_norm": 3.9780099391937256, + "learning_rate": 8.840899225115012e-06, + "loss": 0.924, + "step": 4479 + }, + { + "epoch": 1.3006241834809116, + "grad_norm": 3.294429302215576, + "learning_rate": 8.840284399479104e-06, + "loss": 0.8258, + "step": 4480 + }, + { + "epoch": 1.30091450137901, + "grad_norm": 3.246403455734253, + "learning_rate": 8.839669432215032e-06, + "loss": 0.7254, + "step": 4481 + }, + { + "epoch": 1.3012048192771084, + "grad_norm": 3.34118390083313, + "learning_rate": 8.839054323345475e-06, + "loss": 0.6937, + "step": 4482 + }, + { + "epoch": 1.3014951371752068, + "grad_norm": 3.6143157482147217, + "learning_rate": 8.83843907289312e-06, + "loss": 0.8604, + "step": 4483 + }, + { + "epoch": 1.3017854550733052, + "grad_norm": 3.6616508960723877, + "learning_rate": 8.837823680880653e-06, + "loss": 0.7709, + "step": 4484 + }, + { + "epoch": 1.3020757729714036, + "grad_norm": 3.775017499923706, + "learning_rate": 8.837208147330772e-06, + "loss": 0.9203, + "step": 4485 + }, + { + "epoch": 1.302366090869502, + "grad_norm": 3.4333863258361816, + "learning_rate": 8.836592472266177e-06, + "loss": 0.693, + "step": 4486 + }, + { + "epoch": 1.3026564087676005, + "grad_norm": 3.2192022800445557, + "learning_rate": 8.835976655709574e-06, 
+ "loss": 0.7076, + "step": 4487 + }, + { + "epoch": 1.3029467266656989, + "grad_norm": 3.2752268314361572, + "learning_rate": 8.835360697683675e-06, + "loss": 0.6535, + "step": 4488 + }, + { + "epoch": 1.3032370445637973, + "grad_norm": 3.480109691619873, + "learning_rate": 8.834744598211195e-06, + "loss": 0.7052, + "step": 4489 + }, + { + "epoch": 1.3035273624618957, + "grad_norm": 3.7845897674560547, + "learning_rate": 8.834128357314856e-06, + "loss": 0.7524, + "step": 4490 + }, + { + "epoch": 1.3038176803599941, + "grad_norm": 3.100076198577881, + "learning_rate": 8.833511975017385e-06, + "loss": 0.6737, + "step": 4491 + }, + { + "epoch": 1.3041079982580925, + "grad_norm": 3.8748559951782227, + "learning_rate": 8.832895451341514e-06, + "loss": 0.7552, + "step": 4492 + }, + { + "epoch": 1.304398316156191, + "grad_norm": 3.216489315032959, + "learning_rate": 8.832278786309979e-06, + "loss": 0.7168, + "step": 4493 + }, + { + "epoch": 1.3046886340542894, + "grad_norm": 3.7815732955932617, + "learning_rate": 8.831661979945522e-06, + "loss": 0.7442, + "step": 4494 + }, + { + "epoch": 1.3049789519523878, + "grad_norm": 3.8493406772613525, + "learning_rate": 8.831045032270895e-06, + "loss": 0.7968, + "step": 4495 + }, + { + "epoch": 1.3052692698504864, + "grad_norm": 3.2961323261260986, + "learning_rate": 8.830427943308846e-06, + "loss": 0.8342, + "step": 4496 + }, + { + "epoch": 1.3055595877485846, + "grad_norm": 3.404946804046631, + "learning_rate": 8.829810713082134e-06, + "loss": 0.6763, + "step": 4497 + }, + { + "epoch": 1.3058499056466832, + "grad_norm": 3.4007487297058105, + "learning_rate": 8.829193341613522e-06, + "loss": 0.6758, + "step": 4498 + }, + { + "epoch": 1.3061402235447814, + "grad_norm": 3.0017502307891846, + "learning_rate": 8.82857582892578e-06, + "loss": 0.629, + "step": 4499 + }, + { + "epoch": 1.30643054144288, + "grad_norm": 3.563961982727051, + "learning_rate": 8.827958175041682e-06, + "loss": 0.7526, + "step": 4500 + }, + { + "epoch": 1.30643054144288, + "eval_loss": 1.1974629163742065, + "eval_runtime": 13.5571, + "eval_samples_per_second": 29.505, + "eval_steps_per_second": 3.688, + "step": 4500 + }, + { + "epoch": 1.3067208593409783, + "grad_norm": 3.8173577785491943, + "learning_rate": 8.827340379984003e-06, + "loss": 0.8251, + "step": 4501 + }, + { + "epoch": 1.307011177239077, + "grad_norm": 3.867654323577881, + "learning_rate": 8.826722443775531e-06, + "loss": 0.8697, + "step": 4502 + }, + { + "epoch": 1.307301495137175, + "grad_norm": 3.384533405303955, + "learning_rate": 8.826104366439054e-06, + "loss": 0.6338, + "step": 4503 + }, + { + "epoch": 1.3075918130352737, + "grad_norm": 3.7659904956817627, + "learning_rate": 8.825486147997366e-06, + "loss": 0.7178, + "step": 4504 + }, + { + "epoch": 1.307882130933372, + "grad_norm": 3.433115243911743, + "learning_rate": 8.824867788473267e-06, + "loss": 0.7663, + "step": 4505 + }, + { + "epoch": 1.3081724488314705, + "grad_norm": 3.6183979511260986, + "learning_rate": 8.824249287889563e-06, + "loss": 0.789, + "step": 4506 + }, + { + "epoch": 1.308462766729569, + "grad_norm": 3.6479341983795166, + "learning_rate": 8.823630646269061e-06, + "loss": 0.8397, + "step": 4507 + }, + { + "epoch": 1.3087530846276674, + "grad_norm": 3.5444366931915283, + "learning_rate": 8.82301186363458e-06, + "loss": 0.7653, + "step": 4508 + }, + { + "epoch": 1.3090434025257658, + "grad_norm": 3.838498830795288, + "learning_rate": 8.822392940008937e-06, + "loss": 0.7974, + "step": 4509 + }, + { + "epoch": 1.3093337204238642, + "grad_norm": 
3.370309352874756, + "learning_rate": 8.82177387541496e-06, + "loss": 0.7828, + "step": 4510 + }, + { + "epoch": 1.3096240383219626, + "grad_norm": 4.022466659545898, + "learning_rate": 8.82115466987548e-06, + "loss": 0.7971, + "step": 4511 + }, + { + "epoch": 1.309914356220061, + "grad_norm": 3.3781039714813232, + "learning_rate": 8.820535323413331e-06, + "loss": 0.7885, + "step": 4512 + }, + { + "epoch": 1.3102046741181594, + "grad_norm": 3.4412195682525635, + "learning_rate": 8.819915836051354e-06, + "loss": 0.7148, + "step": 4513 + }, + { + "epoch": 1.3104949920162579, + "grad_norm": 3.6797571182250977, + "learning_rate": 8.8192962078124e-06, + "loss": 0.7848, + "step": 4514 + }, + { + "epoch": 1.3107853099143563, + "grad_norm": 3.71624755859375, + "learning_rate": 8.818676438719314e-06, + "loss": 0.882, + "step": 4515 + }, + { + "epoch": 1.3110756278124547, + "grad_norm": 3.6649434566497803, + "learning_rate": 8.818056528794958e-06, + "loss": 0.8181, + "step": 4516 + }, + { + "epoch": 1.311365945710553, + "grad_norm": 3.5233776569366455, + "learning_rate": 8.817436478062193e-06, + "loss": 0.7826, + "step": 4517 + }, + { + "epoch": 1.3116562636086515, + "grad_norm": 3.272698402404785, + "learning_rate": 8.816816286543886e-06, + "loss": 0.8691, + "step": 4518 + }, + { + "epoch": 1.31194658150675, + "grad_norm": 3.329058885574341, + "learning_rate": 8.816195954262907e-06, + "loss": 0.755, + "step": 4519 + }, + { + "epoch": 1.3122368994048483, + "grad_norm": 3.4336793422698975, + "learning_rate": 8.815575481242137e-06, + "loss": 0.8395, + "step": 4520 + }, + { + "epoch": 1.3125272173029467, + "grad_norm": 3.476872444152832, + "learning_rate": 8.814954867504457e-06, + "loss": 0.7582, + "step": 4521 + }, + { + "epoch": 1.3128175352010452, + "grad_norm": 3.659498453140259, + "learning_rate": 8.814334113072755e-06, + "loss": 0.751, + "step": 4522 + }, + { + "epoch": 1.3131078530991436, + "grad_norm": 3.768644332885742, + "learning_rate": 8.813713217969926e-06, + "loss": 0.7894, + "step": 4523 + }, + { + "epoch": 1.313398170997242, + "grad_norm": 3.286921977996826, + "learning_rate": 8.813092182218866e-06, + "loss": 0.7101, + "step": 4524 + }, + { + "epoch": 1.3136884888953404, + "grad_norm": 3.3848443031311035, + "learning_rate": 8.81247100584248e-06, + "loss": 0.7923, + "step": 4525 + }, + { + "epoch": 1.3139788067934388, + "grad_norm": 3.8643271923065186, + "learning_rate": 8.811849688863674e-06, + "loss": 0.7354, + "step": 4526 + }, + { + "epoch": 1.3142691246915372, + "grad_norm": 3.531477212905884, + "learning_rate": 8.811228231305368e-06, + "loss": 0.7571, + "step": 4527 + }, + { + "epoch": 1.3145594425896356, + "grad_norm": 3.487464189529419, + "learning_rate": 8.810606633190475e-06, + "loss": 0.803, + "step": 4528 + }, + { + "epoch": 1.314849760487734, + "grad_norm": 3.5375609397888184, + "learning_rate": 8.80998489454192e-06, + "loss": 0.747, + "step": 4529 + }, + { + "epoch": 1.3151400783858325, + "grad_norm": 3.17202091217041, + "learning_rate": 8.809363015382636e-06, + "loss": 0.7476, + "step": 4530 + }, + { + "epoch": 1.3154303962839309, + "grad_norm": 3.4418551921844482, + "learning_rate": 8.808740995735556e-06, + "loss": 0.8416, + "step": 4531 + }, + { + "epoch": 1.3157207141820293, + "grad_norm": 3.173208713531494, + "learning_rate": 8.80811883562362e-06, + "loss": 0.6681, + "step": 4532 + }, + { + "epoch": 1.3160110320801277, + "grad_norm": 3.4615097045898438, + "learning_rate": 8.80749653506977e-06, + "loss": 0.7732, + "step": 4533 + }, + { + "epoch": 1.3163013499782261, + 
"grad_norm": 3.5268373489379883, + "learning_rate": 8.806874094096962e-06, + "loss": 0.7281, + "step": 4534 + }, + { + "epoch": 1.3165916678763245, + "grad_norm": 3.3505427837371826, + "learning_rate": 8.806251512728145e-06, + "loss": 0.8716, + "step": 4535 + }, + { + "epoch": 1.316881985774423, + "grad_norm": 4.016345977783203, + "learning_rate": 8.805628790986284e-06, + "loss": 0.9032, + "step": 4536 + }, + { + "epoch": 1.3171723036725214, + "grad_norm": 3.4283342361450195, + "learning_rate": 8.805005928894346e-06, + "loss": 0.7144, + "step": 4537 + }, + { + "epoch": 1.3174626215706198, + "grad_norm": 3.8195414543151855, + "learning_rate": 8.804382926475296e-06, + "loss": 0.8395, + "step": 4538 + }, + { + "epoch": 1.3177529394687182, + "grad_norm": 3.2591845989227295, + "learning_rate": 8.803759783752113e-06, + "loss": 0.8047, + "step": 4539 + }, + { + "epoch": 1.3180432573668166, + "grad_norm": 3.5877437591552734, + "learning_rate": 8.80313650074778e-06, + "loss": 0.8031, + "step": 4540 + }, + { + "epoch": 1.318333575264915, + "grad_norm": 3.386138439178467, + "learning_rate": 8.802513077485283e-06, + "loss": 0.6563, + "step": 4541 + }, + { + "epoch": 1.3186238931630134, + "grad_norm": 3.528615951538086, + "learning_rate": 8.801889513987612e-06, + "loss": 0.8133, + "step": 4542 + }, + { + "epoch": 1.3189142110611118, + "grad_norm": 3.881578207015991, + "learning_rate": 8.801265810277764e-06, + "loss": 0.8733, + "step": 4543 + }, + { + "epoch": 1.3192045289592103, + "grad_norm": 3.4517362117767334, + "learning_rate": 8.800641966378742e-06, + "loss": 0.7932, + "step": 4544 + }, + { + "epoch": 1.319494846857309, + "grad_norm": 3.6806721687316895, + "learning_rate": 8.800017982313552e-06, + "loss": 0.7803, + "step": 4545 + }, + { + "epoch": 1.319785164755407, + "grad_norm": 3.730502128601074, + "learning_rate": 8.799393858105206e-06, + "loss": 0.7542, + "step": 4546 + }, + { + "epoch": 1.3200754826535057, + "grad_norm": 3.9465389251708984, + "learning_rate": 8.798769593776723e-06, + "loss": 0.9533, + "step": 4547 + }, + { + "epoch": 1.320365800551604, + "grad_norm": 3.670346975326538, + "learning_rate": 8.798145189351127e-06, + "loss": 0.7445, + "step": 4548 + }, + { + "epoch": 1.3206561184497025, + "grad_norm": 3.567537784576416, + "learning_rate": 8.797520644851441e-06, + "loss": 0.8044, + "step": 4549 + }, + { + "epoch": 1.3209464363478007, + "grad_norm": 4.003215312957764, + "learning_rate": 8.7968959603007e-06, + "loss": 0.7814, + "step": 4550 + }, + { + "epoch": 1.3212367542458994, + "grad_norm": 3.2233598232269287, + "learning_rate": 8.796271135721944e-06, + "loss": 0.763, + "step": 4551 + }, + { + "epoch": 1.3215270721439976, + "grad_norm": 3.6300904750823975, + "learning_rate": 8.795646171138215e-06, + "loss": 0.7442, + "step": 4552 + }, + { + "epoch": 1.3218173900420962, + "grad_norm": 3.644545555114746, + "learning_rate": 8.795021066572562e-06, + "loss": 0.7269, + "step": 4553 + }, + { + "epoch": 1.3221077079401944, + "grad_norm": 3.8695108890533447, + "learning_rate": 8.794395822048036e-06, + "loss": 0.8088, + "step": 4554 + }, + { + "epoch": 1.322398025838293, + "grad_norm": 4.05075216293335, + "learning_rate": 8.7937704375877e-06, + "loss": 0.9627, + "step": 4555 + }, + { + "epoch": 1.3226883437363912, + "grad_norm": 4.718074798583984, + "learning_rate": 8.793144913214616e-06, + "loss": 0.9465, + "step": 4556 + }, + { + "epoch": 1.3229786616344898, + "grad_norm": 3.9765820503234863, + "learning_rate": 8.792519248951851e-06, + "loss": 0.7774, + "step": 4557 + }, + { + "epoch": 
1.3232689795325883, + "grad_norm": 3.4047420024871826, + "learning_rate": 8.791893444822483e-06, + "loss": 0.7692, + "step": 4558 + }, + { + "epoch": 1.3235592974306867, + "grad_norm": 3.7445778846740723, + "learning_rate": 8.791267500849589e-06, + "loss": 0.7714, + "step": 4559 + }, + { + "epoch": 1.323849615328785, + "grad_norm": 3.5737650394439697, + "learning_rate": 8.790641417056254e-06, + "loss": 0.8386, + "step": 4560 + }, + { + "epoch": 1.3241399332268835, + "grad_norm": 3.6184606552124023, + "learning_rate": 8.790015193465566e-06, + "loss": 0.8462, + "step": 4561 + }, + { + "epoch": 1.324430251124982, + "grad_norm": 3.231999158859253, + "learning_rate": 8.789388830100625e-06, + "loss": 0.7059, + "step": 4562 + }, + { + "epoch": 1.3247205690230803, + "grad_norm": 3.858499765396118, + "learning_rate": 8.788762326984525e-06, + "loss": 0.9108, + "step": 4563 + }, + { + "epoch": 1.3250108869211787, + "grad_norm": 3.4451212882995605, + "learning_rate": 8.788135684140375e-06, + "loss": 0.7431, + "step": 4564 + }, + { + "epoch": 1.3253012048192772, + "grad_norm": 3.238949775695801, + "learning_rate": 8.787508901591283e-06, + "loss": 0.7886, + "step": 4565 + }, + { + "epoch": 1.3255915227173756, + "grad_norm": 3.2312495708465576, + "learning_rate": 8.786881979360368e-06, + "loss": 0.7297, + "step": 4566 + }, + { + "epoch": 1.325881840615474, + "grad_norm": 4.028390407562256, + "learning_rate": 8.786254917470749e-06, + "loss": 0.8983, + "step": 4567 + }, + { + "epoch": 1.3261721585135724, + "grad_norm": 3.7783362865448, + "learning_rate": 8.785627715945549e-06, + "loss": 0.8377, + "step": 4568 + }, + { + "epoch": 1.3264624764116708, + "grad_norm": 3.3699865341186523, + "learning_rate": 8.7850003748079e-06, + "loss": 0.7171, + "step": 4569 + }, + { + "epoch": 1.3267527943097692, + "grad_norm": 4.025466442108154, + "learning_rate": 8.784372894080942e-06, + "loss": 0.7516, + "step": 4570 + }, + { + "epoch": 1.3270431122078676, + "grad_norm": 3.33362078666687, + "learning_rate": 8.783745273787811e-06, + "loss": 0.7302, + "step": 4571 + }, + { + "epoch": 1.327333430105966, + "grad_norm": 4.020394325256348, + "learning_rate": 8.783117513951658e-06, + "loss": 0.8613, + "step": 4572 + }, + { + "epoch": 1.3276237480040645, + "grad_norm": 3.2039053440093994, + "learning_rate": 8.78248961459563e-06, + "loss": 0.7226, + "step": 4573 + }, + { + "epoch": 1.3279140659021629, + "grad_norm": 3.7454745769500732, + "learning_rate": 8.781861575742888e-06, + "loss": 0.7889, + "step": 4574 + }, + { + "epoch": 1.3282043838002613, + "grad_norm": 3.397183895111084, + "learning_rate": 8.78123339741659e-06, + "loss": 0.7233, + "step": 4575 + }, + { + "epoch": 1.3284947016983597, + "grad_norm": 3.3106231689453125, + "learning_rate": 8.780605079639909e-06, + "loss": 0.7288, + "step": 4576 + }, + { + "epoch": 1.3287850195964581, + "grad_norm": 3.925104856491089, + "learning_rate": 8.779976622436008e-06, + "loss": 0.7683, + "step": 4577 + }, + { + "epoch": 1.3290753374945565, + "grad_norm": 3.4288625717163086, + "learning_rate": 8.779348025828071e-06, + "loss": 0.8009, + "step": 4578 + }, + { + "epoch": 1.329365655392655, + "grad_norm": 3.6718027591705322, + "learning_rate": 8.77871928983928e-06, + "loss": 0.8048, + "step": 4579 + }, + { + "epoch": 1.3296559732907534, + "grad_norm": 3.671327829360962, + "learning_rate": 8.77809041449282e-06, + "loss": 0.7686, + "step": 4580 + }, + { + "epoch": 1.3299462911888518, + "grad_norm": 3.493149757385254, + "learning_rate": 8.777461399811886e-06, + "loss": 0.8484, + "step": 4581 
+ }, + { + "epoch": 1.3302366090869502, + "grad_norm": 3.3519535064697266, + "learning_rate": 8.776832245819672e-06, + "loss": 0.8071, + "step": 4582 + }, + { + "epoch": 1.3305269269850486, + "grad_norm": 3.5099620819091797, + "learning_rate": 8.776202952539386e-06, + "loss": 0.7594, + "step": 4583 + }, + { + "epoch": 1.330817244883147, + "grad_norm": 3.586620569229126, + "learning_rate": 8.775573519994232e-06, + "loss": 0.8284, + "step": 4584 + }, + { + "epoch": 1.3311075627812454, + "grad_norm": 3.604992151260376, + "learning_rate": 8.774943948207427e-06, + "loss": 0.8269, + "step": 4585 + }, + { + "epoch": 1.3313978806793438, + "grad_norm": 4.07960844039917, + "learning_rate": 8.774314237202183e-06, + "loss": 0.8021, + "step": 4586 + }, + { + "epoch": 1.3316881985774423, + "grad_norm": 3.2540876865386963, + "learning_rate": 8.773684387001734e-06, + "loss": 0.6545, + "step": 4587 + }, + { + "epoch": 1.3319785164755407, + "grad_norm": 3.480595111846924, + "learning_rate": 8.773054397629297e-06, + "loss": 0.8309, + "step": 4588 + }, + { + "epoch": 1.332268834373639, + "grad_norm": 3.6046059131622314, + "learning_rate": 8.772424269108113e-06, + "loss": 0.722, + "step": 4589 + }, + { + "epoch": 1.3325591522717375, + "grad_norm": 3.5399723052978516, + "learning_rate": 8.77179400146142e-06, + "loss": 0.7857, + "step": 4590 + }, + { + "epoch": 1.332849470169836, + "grad_norm": 3.6178572177886963, + "learning_rate": 8.77116359471246e-06, + "loss": 0.7778, + "step": 4591 + }, + { + "epoch": 1.3331397880679343, + "grad_norm": 3.778470277786255, + "learning_rate": 8.770533048884483e-06, + "loss": 0.7767, + "step": 4592 + }, + { + "epoch": 1.3334301059660327, + "grad_norm": 3.622446298599243, + "learning_rate": 8.769902364000741e-06, + "loss": 0.8007, + "step": 4593 + }, + { + "epoch": 1.3337204238641311, + "grad_norm": 3.512143611907959, + "learning_rate": 8.7692715400845e-06, + "loss": 0.7634, + "step": 4594 + }, + { + "epoch": 1.3340107417622296, + "grad_norm": 3.5384085178375244, + "learning_rate": 8.768640577159018e-06, + "loss": 0.6932, + "step": 4595 + }, + { + "epoch": 1.3343010596603282, + "grad_norm": 3.703672170639038, + "learning_rate": 8.76800947524757e-06, + "loss": 0.8353, + "step": 4596 + }, + { + "epoch": 1.3345913775584264, + "grad_norm": 3.3167338371276855, + "learning_rate": 8.767378234373425e-06, + "loss": 0.7462, + "step": 4597 + }, + { + "epoch": 1.334881695456525, + "grad_norm": 3.785743236541748, + "learning_rate": 8.766746854559866e-06, + "loss": 0.8629, + "step": 4598 + }, + { + "epoch": 1.3351720133546232, + "grad_norm": 3.8739917278289795, + "learning_rate": 8.766115335830178e-06, + "loss": 0.8669, + "step": 4599 + }, + { + "epoch": 1.3354623312527218, + "grad_norm": 4.050196647644043, + "learning_rate": 8.76548367820765e-06, + "loss": 0.8905, + "step": 4600 + }, + { + "epoch": 1.33575264915082, + "grad_norm": 3.502135753631592, + "learning_rate": 8.764851881715581e-06, + "loss": 0.6934, + "step": 4601 + }, + { + "epoch": 1.3360429670489187, + "grad_norm": 3.472646713256836, + "learning_rate": 8.764219946377268e-06, + "loss": 0.7761, + "step": 4602 + }, + { + "epoch": 1.3363332849470169, + "grad_norm": 3.4790096282958984, + "learning_rate": 8.763587872216016e-06, + "loss": 0.6904, + "step": 4603 + }, + { + "epoch": 1.3366236028451155, + "grad_norm": 3.4734671115875244, + "learning_rate": 8.762955659255137e-06, + "loss": 0.7641, + "step": 4604 + }, + { + "epoch": 1.3369139207432137, + "grad_norm": 3.610750913619995, + "learning_rate": 8.762323307517946e-06, + "loss": 
0.7647, + "step": 4605 + }, + { + "epoch": 1.3372042386413123, + "grad_norm": 3.5902762413024902, + "learning_rate": 8.761690817027764e-06, + "loss": 0.7836, + "step": 4606 + }, + { + "epoch": 1.3374945565394105, + "grad_norm": 3.4237771034240723, + "learning_rate": 8.761058187807921e-06, + "loss": 0.798, + "step": 4607 + }, + { + "epoch": 1.3377848744375092, + "grad_norm": 3.5920920372009277, + "learning_rate": 8.760425419881742e-06, + "loss": 0.8194, + "step": 4608 + }, + { + "epoch": 1.3380751923356076, + "grad_norm": 3.539668321609497, + "learning_rate": 8.759792513272564e-06, + "loss": 0.7582, + "step": 4609 + }, + { + "epoch": 1.338365510233706, + "grad_norm": 3.6332497596740723, + "learning_rate": 8.759159468003734e-06, + "loss": 0.814, + "step": 4610 + }, + { + "epoch": 1.3386558281318044, + "grad_norm": 3.294271469116211, + "learning_rate": 8.758526284098591e-06, + "loss": 0.7436, + "step": 4611 + }, + { + "epoch": 1.3389461460299028, + "grad_norm": 3.5333640575408936, + "learning_rate": 8.757892961580492e-06, + "loss": 0.8189, + "step": 4612 + }, + { + "epoch": 1.3392364639280012, + "grad_norm": 3.9221789836883545, + "learning_rate": 8.757259500472793e-06, + "loss": 0.7984, + "step": 4613 + }, + { + "epoch": 1.3395267818260996, + "grad_norm": 3.526892900466919, + "learning_rate": 8.756625900798852e-06, + "loss": 0.7433, + "step": 4614 + }, + { + "epoch": 1.339817099724198, + "grad_norm": 3.8629260063171387, + "learning_rate": 8.75599216258204e-06, + "loss": 0.8027, + "step": 4615 + }, + { + "epoch": 1.3401074176222965, + "grad_norm": 3.3638052940368652, + "learning_rate": 8.755358285845728e-06, + "loss": 0.8417, + "step": 4616 + }, + { + "epoch": 1.3403977355203949, + "grad_norm": 3.4001290798187256, + "learning_rate": 8.754724270613291e-06, + "loss": 0.7387, + "step": 4617 + }, + { + "epoch": 1.3406880534184933, + "grad_norm": 3.7218117713928223, + "learning_rate": 8.754090116908115e-06, + "loss": 0.7018, + "step": 4618 + }, + { + "epoch": 1.3409783713165917, + "grad_norm": 3.930997848510742, + "learning_rate": 8.753455824753584e-06, + "loss": 0.7548, + "step": 4619 + }, + { + "epoch": 1.3412686892146901, + "grad_norm": 3.2416226863861084, + "learning_rate": 8.752821394173092e-06, + "loss": 0.7009, + "step": 4620 + }, + { + "epoch": 1.3415590071127885, + "grad_norm": 3.5444040298461914, + "learning_rate": 8.752186825190037e-06, + "loss": 0.7432, + "step": 4621 + }, + { + "epoch": 1.341849325010887, + "grad_norm": 3.347137451171875, + "learning_rate": 8.751552117827819e-06, + "loss": 0.7666, + "step": 4622 + }, + { + "epoch": 1.3421396429089854, + "grad_norm": 3.45306396484375, + "learning_rate": 8.750917272109849e-06, + "loss": 0.7356, + "step": 4623 + }, + { + "epoch": 1.3424299608070838, + "grad_norm": 3.755613327026367, + "learning_rate": 8.750282288059538e-06, + "loss": 0.7731, + "step": 4624 + }, + { + "epoch": 1.3427202787051822, + "grad_norm": 3.633800745010376, + "learning_rate": 8.749647165700306e-06, + "loss": 0.785, + "step": 4625 + }, + { + "epoch": 1.3430105966032806, + "grad_norm": 3.169142961502075, + "learning_rate": 8.749011905055572e-06, + "loss": 0.7931, + "step": 4626 + }, + { + "epoch": 1.343300914501379, + "grad_norm": 3.288231611251831, + "learning_rate": 8.748376506148768e-06, + "loss": 0.7093, + "step": 4627 + }, + { + "epoch": 1.3435912323994774, + "grad_norm": 3.5134363174438477, + "learning_rate": 8.747740969003327e-06, + "loss": 0.7093, + "step": 4628 + }, + { + "epoch": 1.3438815502975758, + "grad_norm": 3.5111641883850098, + "learning_rate": 
8.747105293642686e-06, + "loss": 0.7451, + "step": 4629 + }, + { + "epoch": 1.3441718681956742, + "grad_norm": 4.0172343254089355, + "learning_rate": 8.746469480090287e-06, + "loss": 0.9514, + "step": 4630 + }, + { + "epoch": 1.3444621860937727, + "grad_norm": 3.5807735919952393, + "learning_rate": 8.74583352836958e-06, + "loss": 0.7602, + "step": 4631 + }, + { + "epoch": 1.344752503991871, + "grad_norm": 3.807891368865967, + "learning_rate": 8.745197438504021e-06, + "loss": 0.7435, + "step": 4632 + }, + { + "epoch": 1.3450428218899695, + "grad_norm": 3.7621943950653076, + "learning_rate": 8.744561210517067e-06, + "loss": 0.7656, + "step": 4633 + }, + { + "epoch": 1.345333139788068, + "grad_norm": 3.7001357078552246, + "learning_rate": 8.743924844432178e-06, + "loss": 0.7488, + "step": 4634 + }, + { + "epoch": 1.3456234576861663, + "grad_norm": 3.6607542037963867, + "learning_rate": 8.74328834027283e-06, + "loss": 0.788, + "step": 4635 + }, + { + "epoch": 1.3459137755842647, + "grad_norm": 3.2181572914123535, + "learning_rate": 8.742651698062492e-06, + "loss": 0.7679, + "step": 4636 + }, + { + "epoch": 1.3462040934823631, + "grad_norm": 3.7429494857788086, + "learning_rate": 8.742014917824646e-06, + "loss": 0.8146, + "step": 4637 + }, + { + "epoch": 1.3464944113804616, + "grad_norm": 3.507017135620117, + "learning_rate": 8.741377999582774e-06, + "loss": 0.6924, + "step": 4638 + }, + { + "epoch": 1.34678472927856, + "grad_norm": 3.102918863296509, + "learning_rate": 8.740740943360367e-06, + "loss": 0.66, + "step": 4639 + }, + { + "epoch": 1.3470750471766584, + "grad_norm": 3.8772389888763428, + "learning_rate": 8.740103749180916e-06, + "loss": 0.7636, + "step": 4640 + }, + { + "epoch": 1.3473653650747568, + "grad_norm": 3.7670934200286865, + "learning_rate": 8.739466417067926e-06, + "loss": 0.8094, + "step": 4641 + }, + { + "epoch": 1.3476556829728552, + "grad_norm": 3.172104597091675, + "learning_rate": 8.738828947044895e-06, + "loss": 0.7114, + "step": 4642 + }, + { + "epoch": 1.3479460008709536, + "grad_norm": 3.3927054405212402, + "learning_rate": 8.738191339135339e-06, + "loss": 0.7699, + "step": 4643 + }, + { + "epoch": 1.348236318769052, + "grad_norm": 3.920764207839966, + "learning_rate": 8.737553593362769e-06, + "loss": 0.8753, + "step": 4644 + }, + { + "epoch": 1.3485266366671504, + "grad_norm": 3.6030490398406982, + "learning_rate": 8.736915709750704e-06, + "loss": 0.7227, + "step": 4645 + }, + { + "epoch": 1.3488169545652489, + "grad_norm": 3.684602975845337, + "learning_rate": 8.736277688322675e-06, + "loss": 0.7326, + "step": 4646 + }, + { + "epoch": 1.3491072724633475, + "grad_norm": 3.836862802505493, + "learning_rate": 8.735639529102203e-06, + "loss": 0.8414, + "step": 4647 + }, + { + "epoch": 1.3493975903614457, + "grad_norm": 3.029850959777832, + "learning_rate": 8.73500123211283e-06, + "loss": 0.662, + "step": 4648 + }, + { + "epoch": 1.3496879082595443, + "grad_norm": 3.983029842376709, + "learning_rate": 8.734362797378094e-06, + "loss": 0.8949, + "step": 4649 + }, + { + "epoch": 1.3499782261576425, + "grad_norm": 3.8315110206604004, + "learning_rate": 8.733724224921539e-06, + "loss": 0.769, + "step": 4650 + }, + { + "epoch": 1.3502685440557411, + "grad_norm": 3.8778200149536133, + "learning_rate": 8.733085514766715e-06, + "loss": 0.8529, + "step": 4651 + }, + { + "epoch": 1.3505588619538393, + "grad_norm": 3.760114908218384, + "learning_rate": 8.73244666693718e-06, + "loss": 0.7106, + "step": 4652 + }, + { + "epoch": 1.350849179851938, + "grad_norm": 
3.9810686111450195, + "learning_rate": 8.731807681456493e-06, + "loss": 0.9468, + "step": 4653 + }, + { + "epoch": 1.3511394977500362, + "grad_norm": 3.370008707046509, + "learning_rate": 8.73116855834822e-06, + "loss": 0.7507, + "step": 4654 + }, + { + "epoch": 1.3514298156481348, + "grad_norm": 3.6600735187530518, + "learning_rate": 8.73052929763593e-06, + "loss": 0.7098, + "step": 4655 + }, + { + "epoch": 1.351720133546233, + "grad_norm": 3.648756742477417, + "learning_rate": 8.7298898993432e-06, + "loss": 0.748, + "step": 4656 + }, + { + "epoch": 1.3520104514443316, + "grad_norm": 3.0947530269622803, + "learning_rate": 8.729250363493613e-06, + "loss": 0.7099, + "step": 4657 + }, + { + "epoch": 1.35230076934243, + "grad_norm": 3.60309100151062, + "learning_rate": 8.72861069011075e-06, + "loss": 0.9044, + "step": 4658 + }, + { + "epoch": 1.3525910872405285, + "grad_norm": 4.018613815307617, + "learning_rate": 8.727970879218207e-06, + "loss": 0.9816, + "step": 4659 + }, + { + "epoch": 1.3528814051386269, + "grad_norm": 3.9331774711608887, + "learning_rate": 8.727330930839575e-06, + "loss": 0.8805, + "step": 4660 + }, + { + "epoch": 1.3531717230367253, + "grad_norm": 3.5565247535705566, + "learning_rate": 8.726690844998457e-06, + "loss": 0.7209, + "step": 4661 + }, + { + "epoch": 1.3534620409348237, + "grad_norm": 3.6686999797821045, + "learning_rate": 8.726050621718462e-06, + "loss": 0.9374, + "step": 4662 + }, + { + "epoch": 1.353752358832922, + "grad_norm": 3.3711307048797607, + "learning_rate": 8.725410261023198e-06, + "loss": 0.7055, + "step": 4663 + }, + { + "epoch": 1.3540426767310205, + "grad_norm": 3.3798294067382812, + "learning_rate": 8.72476976293628e-06, + "loss": 0.8147, + "step": 4664 + }, + { + "epoch": 1.354332994629119, + "grad_norm": 3.112764596939087, + "learning_rate": 8.724129127481333e-06, + "loss": 0.6867, + "step": 4665 + }, + { + "epoch": 1.3546233125272173, + "grad_norm": 3.7623043060302734, + "learning_rate": 8.723488354681981e-06, + "loss": 0.8379, + "step": 4666 + }, + { + "epoch": 1.3549136304253158, + "grad_norm": 3.341522216796875, + "learning_rate": 8.722847444561857e-06, + "loss": 0.7471, + "step": 4667 + }, + { + "epoch": 1.3552039483234142, + "grad_norm": 3.6576430797576904, + "learning_rate": 8.722206397144596e-06, + "loss": 0.8535, + "step": 4668 + }, + { + "epoch": 1.3554942662215126, + "grad_norm": 3.5284230709075928, + "learning_rate": 8.721565212453841e-06, + "loss": 0.748, + "step": 4669 + }, + { + "epoch": 1.355784584119611, + "grad_norm": 3.8579068183898926, + "learning_rate": 8.720923890513237e-06, + "loss": 0.9345, + "step": 4670 + }, + { + "epoch": 1.3560749020177094, + "grad_norm": 3.495478630065918, + "learning_rate": 8.720282431346437e-06, + "loss": 0.8069, + "step": 4671 + }, + { + "epoch": 1.3563652199158078, + "grad_norm": 3.690916061401367, + "learning_rate": 8.719640834977097e-06, + "loss": 0.8264, + "step": 4672 + }, + { + "epoch": 1.3566555378139062, + "grad_norm": 3.7047739028930664, + "learning_rate": 8.718999101428878e-06, + "loss": 0.8304, + "step": 4673 + }, + { + "epoch": 1.3569458557120047, + "grad_norm": 4.055042743682861, + "learning_rate": 8.71835723072545e-06, + "loss": 0.9142, + "step": 4674 + }, + { + "epoch": 1.357236173610103, + "grad_norm": 4.049088478088379, + "learning_rate": 8.717715222890481e-06, + "loss": 0.9886, + "step": 4675 + }, + { + "epoch": 1.3575264915082015, + "grad_norm": 3.6256749629974365, + "learning_rate": 8.71707307794765e-06, + "loss": 0.8418, + "step": 4676 + }, + { + "epoch": 1.3578168094063, 
+ "grad_norm": 3.9590277671813965, + "learning_rate": 8.71643079592064e-06, + "loss": 0.9358, + "step": 4677 + }, + { + "epoch": 1.3581071273043983, + "grad_norm": 3.463407278060913, + "learning_rate": 8.715788376833136e-06, + "loss": 0.8553, + "step": 4678 + }, + { + "epoch": 1.3583974452024967, + "grad_norm": 3.912795066833496, + "learning_rate": 8.715145820708834e-06, + "loss": 0.8467, + "step": 4679 + }, + { + "epoch": 1.3586877631005951, + "grad_norm": 3.678302526473999, + "learning_rate": 8.714503127571425e-06, + "loss": 0.7989, + "step": 4680 + }, + { + "epoch": 1.3589780809986935, + "grad_norm": 3.615201711654663, + "learning_rate": 8.713860297444617e-06, + "loss": 0.8054, + "step": 4681 + }, + { + "epoch": 1.359268398896792, + "grad_norm": 3.3816769123077393, + "learning_rate": 8.713217330352116e-06, + "loss": 0.7385, + "step": 4682 + }, + { + "epoch": 1.3595587167948904, + "grad_norm": 3.387265682220459, + "learning_rate": 8.71257422631763e-06, + "loss": 0.6743, + "step": 4683 + }, + { + "epoch": 1.3598490346929888, + "grad_norm": 3.653581142425537, + "learning_rate": 8.711930985364882e-06, + "loss": 0.7954, + "step": 4684 + }, + { + "epoch": 1.3601393525910872, + "grad_norm": 2.788815498352051, + "learning_rate": 8.711287607517592e-06, + "loss": 0.6557, + "step": 4685 + }, + { + "epoch": 1.3604296704891856, + "grad_norm": 3.226760149002075, + "learning_rate": 8.710644092799486e-06, + "loss": 0.7571, + "step": 4686 + }, + { + "epoch": 1.360719988387284, + "grad_norm": 3.787449836730957, + "learning_rate": 8.7100004412343e-06, + "loss": 0.8545, + "step": 4687 + }, + { + "epoch": 1.3610103062853824, + "grad_norm": 3.5828921794891357, + "learning_rate": 8.70935665284577e-06, + "loss": 0.8057, + "step": 4688 + }, + { + "epoch": 1.3613006241834809, + "grad_norm": 3.35209321975708, + "learning_rate": 8.70871272765764e-06, + "loss": 0.6767, + "step": 4689 + }, + { + "epoch": 1.3615909420815793, + "grad_norm": 3.3819470405578613, + "learning_rate": 8.708068665693654e-06, + "loss": 0.7925, + "step": 4690 + }, + { + "epoch": 1.3618812599796777, + "grad_norm": 3.8231050968170166, + "learning_rate": 8.707424466977568e-06, + "loss": 0.792, + "step": 4691 + }, + { + "epoch": 1.362171577877776, + "grad_norm": 3.430509567260742, + "learning_rate": 8.706780131533139e-06, + "loss": 0.6875, + "step": 4692 + }, + { + "epoch": 1.3624618957758745, + "grad_norm": 3.384800910949707, + "learning_rate": 8.70613565938413e-06, + "loss": 0.8151, + "step": 4693 + }, + { + "epoch": 1.362752213673973, + "grad_norm": 3.6406893730163574, + "learning_rate": 8.705491050554308e-06, + "loss": 0.6947, + "step": 4694 + }, + { + "epoch": 1.3630425315720713, + "grad_norm": 3.7997663021087646, + "learning_rate": 8.704846305067446e-06, + "loss": 0.7631, + "step": 4695 + }, + { + "epoch": 1.36333284947017, + "grad_norm": 3.516602039337158, + "learning_rate": 8.704201422947325e-06, + "loss": 0.6366, + "step": 4696 + }, + { + "epoch": 1.3636231673682682, + "grad_norm": 3.8194212913513184, + "learning_rate": 8.703556404217723e-06, + "loss": 0.8989, + "step": 4697 + }, + { + "epoch": 1.3639134852663668, + "grad_norm": 3.5797340869903564, + "learning_rate": 8.702911248902432e-06, + "loss": 0.8461, + "step": 4698 + }, + { + "epoch": 1.364203803164465, + "grad_norm": 3.9624059200286865, + "learning_rate": 8.702265957025241e-06, + "loss": 0.8728, + "step": 4699 + }, + { + "epoch": 1.3644941210625636, + "grad_norm": 3.4519660472869873, + "learning_rate": 8.701620528609953e-06, + "loss": 0.7457, + "step": 4700 + }, + { + "epoch": 
1.3647844389606618, + "grad_norm": 3.042926788330078, + "learning_rate": 8.70097496368037e-06, + "loss": 0.7291, + "step": 4701 + }, + { + "epoch": 1.3650747568587605, + "grad_norm": 3.8642966747283936, + "learning_rate": 8.700329262260296e-06, + "loss": 0.7738, + "step": 4702 + }, + { + "epoch": 1.3653650747568586, + "grad_norm": 3.7110559940338135, + "learning_rate": 8.69968342437355e-06, + "loss": 0.8148, + "step": 4703 + }, + { + "epoch": 1.3656553926549573, + "grad_norm": 3.6700289249420166, + "learning_rate": 8.699037450043945e-06, + "loss": 0.8303, + "step": 4704 + }, + { + "epoch": 1.3659457105530555, + "grad_norm": 3.7049381732940674, + "learning_rate": 8.698391339295308e-06, + "loss": 0.7679, + "step": 4705 + }, + { + "epoch": 1.366236028451154, + "grad_norm": 3.5809037685394287, + "learning_rate": 8.697745092151467e-06, + "loss": 0.8532, + "step": 4706 + }, + { + "epoch": 1.3665263463492523, + "grad_norm": 3.510563373565674, + "learning_rate": 8.697098708636254e-06, + "loss": 0.7873, + "step": 4707 + }, + { + "epoch": 1.366816664247351, + "grad_norm": 3.510591745376587, + "learning_rate": 8.696452188773506e-06, + "loss": 0.7684, + "step": 4708 + }, + { + "epoch": 1.3671069821454493, + "grad_norm": 2.9833831787109375, + "learning_rate": 8.69580553258707e-06, + "loss": 0.641, + "step": 4709 + }, + { + "epoch": 1.3673973000435478, + "grad_norm": 3.475123882293701, + "learning_rate": 8.695158740100792e-06, + "loss": 0.7174, + "step": 4710 + }, + { + "epoch": 1.3676876179416462, + "grad_norm": 3.5613787174224854, + "learning_rate": 8.694511811338526e-06, + "loss": 0.8575, + "step": 4711 + }, + { + "epoch": 1.3679779358397446, + "grad_norm": 3.813519239425659, + "learning_rate": 8.69386474632413e-06, + "loss": 0.7598, + "step": 4712 + }, + { + "epoch": 1.368268253737843, + "grad_norm": 3.702981472015381, + "learning_rate": 8.69321754508147e-06, + "loss": 0.7989, + "step": 4713 + }, + { + "epoch": 1.3685585716359414, + "grad_norm": 3.176640033721924, + "learning_rate": 8.692570207634411e-06, + "loss": 0.7673, + "step": 4714 + }, + { + "epoch": 1.3688488895340398, + "grad_norm": 4.3239874839782715, + "learning_rate": 8.691922734006828e-06, + "loss": 0.8715, + "step": 4715 + }, + { + "epoch": 1.3691392074321382, + "grad_norm": 3.727651357650757, + "learning_rate": 8.6912751242226e-06, + "loss": 0.8308, + "step": 4716 + }, + { + "epoch": 1.3694295253302367, + "grad_norm": 3.114671468734741, + "learning_rate": 8.690627378305609e-06, + "loss": 0.7163, + "step": 4717 + }, + { + "epoch": 1.369719843228335, + "grad_norm": 3.7326815128326416, + "learning_rate": 8.689979496279747e-06, + "loss": 0.763, + "step": 4718 + }, + { + "epoch": 1.3700101611264335, + "grad_norm": 3.3478212356567383, + "learning_rate": 8.689331478168906e-06, + "loss": 0.7826, + "step": 4719 + }, + { + "epoch": 1.370300479024532, + "grad_norm": 3.6909072399139404, + "learning_rate": 8.68868332399698e-06, + "loss": 0.8426, + "step": 4720 + }, + { + "epoch": 1.3705907969226303, + "grad_norm": 3.614936351776123, + "learning_rate": 8.688035033787881e-06, + "loss": 0.6808, + "step": 4721 + }, + { + "epoch": 1.3708811148207287, + "grad_norm": 2.883828639984131, + "learning_rate": 8.68738660756551e-06, + "loss": 0.622, + "step": 4722 + }, + { + "epoch": 1.3711714327188271, + "grad_norm": 3.4927828311920166, + "learning_rate": 8.686738045353788e-06, + "loss": 0.7631, + "step": 4723 + }, + { + "epoch": 1.3714617506169255, + "grad_norm": 3.9574286937713623, + "learning_rate": 8.686089347176628e-06, + "loss": 0.8098, + "step": 4724 + 
}, + { + "epoch": 1.371752068515024, + "grad_norm": 3.645223379135132, + "learning_rate": 8.685440513057955e-06, + "loss": 0.7543, + "step": 4725 + }, + { + "epoch": 1.3720423864131224, + "grad_norm": 3.428619146347046, + "learning_rate": 8.6847915430217e-06, + "loss": 0.7329, + "step": 4726 + }, + { + "epoch": 1.3723327043112208, + "grad_norm": 3.4425439834594727, + "learning_rate": 8.684142437091793e-06, + "loss": 0.7378, + "step": 4727 + }, + { + "epoch": 1.3726230222093192, + "grad_norm": 3.855262041091919, + "learning_rate": 8.683493195292177e-06, + "loss": 0.8353, + "step": 4728 + }, + { + "epoch": 1.3729133401074176, + "grad_norm": 3.844834327697754, + "learning_rate": 8.682843817646793e-06, + "loss": 0.8555, + "step": 4729 + }, + { + "epoch": 1.373203658005516, + "grad_norm": 3.6634161472320557, + "learning_rate": 8.682194304179592e-06, + "loss": 0.7366, + "step": 4730 + }, + { + "epoch": 1.3734939759036144, + "grad_norm": 3.33919358253479, + "learning_rate": 8.681544654914525e-06, + "loss": 0.7108, + "step": 4731 + }, + { + "epoch": 1.3737842938017129, + "grad_norm": 3.5919675827026367, + "learning_rate": 8.680894869875551e-06, + "loss": 0.7798, + "step": 4732 + }, + { + "epoch": 1.3740746116998113, + "grad_norm": 3.4425220489501953, + "learning_rate": 8.680244949086635e-06, + "loss": 0.7974, + "step": 4733 + }, + { + "epoch": 1.3743649295979097, + "grad_norm": 3.5473134517669678, + "learning_rate": 8.679594892571748e-06, + "loss": 0.8122, + "step": 4734 + }, + { + "epoch": 1.374655247496008, + "grad_norm": 3.0709292888641357, + "learning_rate": 8.678944700354858e-06, + "loss": 0.702, + "step": 4735 + }, + { + "epoch": 1.3749455653941065, + "grad_norm": 3.581000566482544, + "learning_rate": 8.678294372459951e-06, + "loss": 0.7628, + "step": 4736 + }, + { + "epoch": 1.375235883292205, + "grad_norm": 3.376634120941162, + "learning_rate": 8.677643908911007e-06, + "loss": 0.7925, + "step": 4737 + }, + { + "epoch": 1.3755262011903033, + "grad_norm": 3.8900506496429443, + "learning_rate": 8.676993309732013e-06, + "loss": 0.9143, + "step": 4738 + }, + { + "epoch": 1.3758165190884017, + "grad_norm": 3.7068428993225098, + "learning_rate": 8.676342574946966e-06, + "loss": 0.8024, + "step": 4739 + }, + { + "epoch": 1.3761068369865002, + "grad_norm": 3.0612878799438477, + "learning_rate": 8.675691704579862e-06, + "loss": 0.7633, + "step": 4740 + }, + { + "epoch": 1.3763971548845986, + "grad_norm": 3.1773834228515625, + "learning_rate": 8.675040698654708e-06, + "loss": 0.6102, + "step": 4741 + }, + { + "epoch": 1.376687472782697, + "grad_norm": 3.589904308319092, + "learning_rate": 8.674389557195513e-06, + "loss": 0.7074, + "step": 4742 + }, + { + "epoch": 1.3769777906807954, + "grad_norm": 2.9823384284973145, + "learning_rate": 8.673738280226287e-06, + "loss": 0.6443, + "step": 4743 + }, + { + "epoch": 1.3772681085788938, + "grad_norm": 3.825936794281006, + "learning_rate": 8.673086867771051e-06, + "loss": 0.8828, + "step": 4744 + }, + { + "epoch": 1.3775584264769922, + "grad_norm": 4.055813312530518, + "learning_rate": 8.672435319853831e-06, + "loss": 0.879, + "step": 4745 + }, + { + "epoch": 1.3778487443750906, + "grad_norm": 3.423865556716919, + "learning_rate": 8.671783636498652e-06, + "loss": 0.6735, + "step": 4746 + }, + { + "epoch": 1.3781390622731893, + "grad_norm": 3.7734615802764893, + "learning_rate": 8.67113181772955e-06, + "loss": 0.8025, + "step": 4747 + }, + { + "epoch": 1.3784293801712875, + "grad_norm": 4.315977573394775, + "learning_rate": 8.670479863570565e-06, + "loss": 
0.9297, + "step": 4748 + }, + { + "epoch": 1.378719698069386, + "grad_norm": 3.550494432449341, + "learning_rate": 8.669827774045738e-06, + "loss": 0.7168, + "step": 4749 + }, + { + "epoch": 1.3790100159674843, + "grad_norm": 3.4887938499450684, + "learning_rate": 8.669175549179117e-06, + "loss": 0.7436, + "step": 4750 + }, + { + "epoch": 1.379300333865583, + "grad_norm": 3.508185863494873, + "learning_rate": 8.66852318899476e-06, + "loss": 0.9205, + "step": 4751 + }, + { + "epoch": 1.3795906517636811, + "grad_norm": 3.516390800476074, + "learning_rate": 8.667870693516723e-06, + "loss": 0.8103, + "step": 4752 + }, + { + "epoch": 1.3798809696617798, + "grad_norm": 3.781928300857544, + "learning_rate": 8.667218062769071e-06, + "loss": 0.8471, + "step": 4753 + }, + { + "epoch": 1.380171287559878, + "grad_norm": 3.511101722717285, + "learning_rate": 8.66656529677587e-06, + "loss": 0.8162, + "step": 4754 + }, + { + "epoch": 1.3804616054579766, + "grad_norm": 3.4614226818084717, + "learning_rate": 8.665912395561199e-06, + "loss": 0.7267, + "step": 4755 + }, + { + "epoch": 1.3807519233560748, + "grad_norm": 3.698258638381958, + "learning_rate": 8.665259359149132e-06, + "loss": 0.8931, + "step": 4756 + }, + { + "epoch": 1.3810422412541734, + "grad_norm": 3.715919017791748, + "learning_rate": 8.664606187563755e-06, + "loss": 0.8431, + "step": 4757 + }, + { + "epoch": 1.3813325591522716, + "grad_norm": 3.9970555305480957, + "learning_rate": 8.663952880829156e-06, + "loss": 0.8736, + "step": 4758 + }, + { + "epoch": 1.3816228770503702, + "grad_norm": 3.340646982192993, + "learning_rate": 8.663299438969429e-06, + "loss": 0.8427, + "step": 4759 + }, + { + "epoch": 1.3819131949484686, + "grad_norm": 3.6539206504821777, + "learning_rate": 8.66264586200867e-06, + "loss": 0.8327, + "step": 4760 + }, + { + "epoch": 1.382203512846567, + "grad_norm": 3.351881980895996, + "learning_rate": 8.661992149970987e-06, + "loss": 0.7164, + "step": 4761 + }, + { + "epoch": 1.3824938307446655, + "grad_norm": 3.7830088138580322, + "learning_rate": 8.661338302880486e-06, + "loss": 0.8005, + "step": 4762 + }, + { + "epoch": 1.3827841486427639, + "grad_norm": 3.4923183917999268, + "learning_rate": 8.660684320761283e-06, + "loss": 0.8499, + "step": 4763 + }, + { + "epoch": 1.3830744665408623, + "grad_norm": 3.5462584495544434, + "learning_rate": 8.660030203637495e-06, + "loss": 0.8269, + "step": 4764 + }, + { + "epoch": 1.3833647844389607, + "grad_norm": 3.958484172821045, + "learning_rate": 8.659375951533244e-06, + "loss": 0.8645, + "step": 4765 + }, + { + "epoch": 1.3836551023370591, + "grad_norm": 3.3109965324401855, + "learning_rate": 8.658721564472661e-06, + "loss": 0.7037, + "step": 4766 + }, + { + "epoch": 1.3839454202351575, + "grad_norm": 3.4409313201904297, + "learning_rate": 8.658067042479877e-06, + "loss": 0.7239, + "step": 4767 + }, + { + "epoch": 1.384235738133256, + "grad_norm": 3.4091596603393555, + "learning_rate": 8.657412385579034e-06, + "loss": 0.8077, + "step": 4768 + }, + { + "epoch": 1.3845260560313544, + "grad_norm": 3.524073362350464, + "learning_rate": 8.656757593794273e-06, + "loss": 0.8358, + "step": 4769 + }, + { + "epoch": 1.3848163739294528, + "grad_norm": 3.48581862449646, + "learning_rate": 8.656102667149742e-06, + "loss": 0.7484, + "step": 4770 + }, + { + "epoch": 1.3851066918275512, + "grad_norm": 3.575439929962158, + "learning_rate": 8.655447605669596e-06, + "loss": 0.8364, + "step": 4771 + }, + { + "epoch": 1.3853970097256496, + "grad_norm": 3.5409598350524902, + "learning_rate": 
8.654792409377995e-06, + "loss": 0.817, + "step": 4772 + }, + { + "epoch": 1.385687327623748, + "grad_norm": 3.779784679412842, + "learning_rate": 8.654137078299099e-06, + "loss": 0.8296, + "step": 4773 + }, + { + "epoch": 1.3859776455218464, + "grad_norm": 3.9878501892089844, + "learning_rate": 8.653481612457077e-06, + "loss": 0.9375, + "step": 4774 + }, + { + "epoch": 1.3862679634199448, + "grad_norm": 3.504255533218384, + "learning_rate": 8.652826011876104e-06, + "loss": 0.7396, + "step": 4775 + }, + { + "epoch": 1.3865582813180433, + "grad_norm": 3.366769313812256, + "learning_rate": 8.652170276580357e-06, + "loss": 0.7795, + "step": 4776 + }, + { + "epoch": 1.3868485992161417, + "grad_norm": 3.392413377761841, + "learning_rate": 8.651514406594017e-06, + "loss": 0.7361, + "step": 4777 + }, + { + "epoch": 1.38713891711424, + "grad_norm": 3.5908358097076416, + "learning_rate": 8.650858401941278e-06, + "loss": 0.8597, + "step": 4778 + }, + { + "epoch": 1.3874292350123385, + "grad_norm": 3.968031406402588, + "learning_rate": 8.650202262646327e-06, + "loss": 0.9725, + "step": 4779 + }, + { + "epoch": 1.387719552910437, + "grad_norm": 4.526281356811523, + "learning_rate": 8.649545988733367e-06, + "loss": 0.7876, + "step": 4780 + }, + { + "epoch": 1.3880098708085353, + "grad_norm": 3.4686625003814697, + "learning_rate": 8.648889580226601e-06, + "loss": 0.8439, + "step": 4781 + }, + { + "epoch": 1.3883001887066337, + "grad_norm": 3.479299545288086, + "learning_rate": 8.648233037150233e-06, + "loss": 0.7461, + "step": 4782 + }, + { + "epoch": 1.3885905066047322, + "grad_norm": 3.5957489013671875, + "learning_rate": 8.647576359528479e-06, + "loss": 0.8737, + "step": 4783 + }, + { + "epoch": 1.3888808245028306, + "grad_norm": 3.4597275257110596, + "learning_rate": 8.646919547385554e-06, + "loss": 0.7269, + "step": 4784 + }, + { + "epoch": 1.389171142400929, + "grad_norm": 3.6386306285858154, + "learning_rate": 8.646262600745687e-06, + "loss": 0.9262, + "step": 4785 + }, + { + "epoch": 1.3894614602990274, + "grad_norm": 3.6978306770324707, + "learning_rate": 8.6456055196331e-06, + "loss": 0.757, + "step": 4786 + }, + { + "epoch": 1.3897517781971258, + "grad_norm": 3.7922861576080322, + "learning_rate": 8.64494830407203e-06, + "loss": 0.8727, + "step": 4787 + }, + { + "epoch": 1.3900420960952242, + "grad_norm": 3.6173031330108643, + "learning_rate": 8.644290954086711e-06, + "loss": 0.9186, + "step": 4788 + }, + { + "epoch": 1.3903324139933226, + "grad_norm": 3.2797791957855225, + "learning_rate": 8.643633469701389e-06, + "loss": 0.7659, + "step": 4789 + }, + { + "epoch": 1.390622731891421, + "grad_norm": 3.4677507877349854, + "learning_rate": 8.64297585094031e-06, + "loss": 0.8224, + "step": 4790 + }, + { + "epoch": 1.3909130497895195, + "grad_norm": 3.7027604579925537, + "learning_rate": 8.642318097827728e-06, + "loss": 0.8528, + "step": 4791 + }, + { + "epoch": 1.3912033676876179, + "grad_norm": 3.7726094722747803, + "learning_rate": 8.6416602103879e-06, + "loss": 0.817, + "step": 4792 + }, + { + "epoch": 1.3914936855857163, + "grad_norm": 2.998366117477417, + "learning_rate": 8.641002188645087e-06, + "loss": 0.6437, + "step": 4793 + }, + { + "epoch": 1.3917840034838147, + "grad_norm": 3.7641310691833496, + "learning_rate": 8.64034403262356e-06, + "loss": 0.8539, + "step": 4794 + }, + { + "epoch": 1.3920743213819131, + "grad_norm": 3.446791648864746, + "learning_rate": 8.639685742347588e-06, + "loss": 0.7193, + "step": 4795 + }, + { + "epoch": 1.3923646392800118, + "grad_norm": 3.87794828414917, 
+ "learning_rate": 8.639027317841453e-06, + "loss": 0.8783, + "step": 4796 + }, + { + "epoch": 1.39265495717811, + "grad_norm": 3.71549129486084, + "learning_rate": 8.638368759129433e-06, + "loss": 0.7826, + "step": 4797 + }, + { + "epoch": 1.3929452750762086, + "grad_norm": 3.6566429138183594, + "learning_rate": 8.637710066235816e-06, + "loss": 0.7971, + "step": 4798 + }, + { + "epoch": 1.3932355929743068, + "grad_norm": 3.5347371101379395, + "learning_rate": 8.637051239184896e-06, + "loss": 0.7795, + "step": 4799 + }, + { + "epoch": 1.3935259108724054, + "grad_norm": 3.63020920753479, + "learning_rate": 8.63639227800097e-06, + "loss": 0.7689, + "step": 4800 + }, + { + "epoch": 1.3938162287705036, + "grad_norm": 3.451944589614868, + "learning_rate": 8.635733182708339e-06, + "loss": 0.7747, + "step": 4801 + }, + { + "epoch": 1.3941065466686022, + "grad_norm": 3.4489200115203857, + "learning_rate": 8.635073953331312e-06, + "loss": 0.7529, + "step": 4802 + }, + { + "epoch": 1.3943968645667004, + "grad_norm": 3.329653024673462, + "learning_rate": 8.6344145898942e-06, + "loss": 0.7888, + "step": 4803 + }, + { + "epoch": 1.394687182464799, + "grad_norm": 3.4779586791992188, + "learning_rate": 8.633755092421319e-06, + "loss": 0.7773, + "step": 4804 + }, + { + "epoch": 1.3949775003628972, + "grad_norm": 3.3433725833892822, + "learning_rate": 8.633095460936993e-06, + "loss": 0.7696, + "step": 4805 + }, + { + "epoch": 1.3952678182609959, + "grad_norm": 3.6129310131073, + "learning_rate": 8.632435695465549e-06, + "loss": 0.7715, + "step": 4806 + }, + { + "epoch": 1.395558136159094, + "grad_norm": 3.8630218505859375, + "learning_rate": 8.631775796031316e-06, + "loss": 0.8732, + "step": 4807 + }, + { + "epoch": 1.3958484540571927, + "grad_norm": 3.8463497161865234, + "learning_rate": 8.631115762658635e-06, + "loss": 0.7539, + "step": 4808 + }, + { + "epoch": 1.3961387719552911, + "grad_norm": 3.2628061771392822, + "learning_rate": 8.630455595371846e-06, + "loss": 0.7529, + "step": 4809 + }, + { + "epoch": 1.3964290898533895, + "grad_norm": 3.669912338256836, + "learning_rate": 8.629795294195293e-06, + "loss": 0.8761, + "step": 4810 + }, + { + "epoch": 1.396719407751488, + "grad_norm": 3.5903918743133545, + "learning_rate": 8.629134859153331e-06, + "loss": 0.7032, + "step": 4811 + }, + { + "epoch": 1.3970097256495864, + "grad_norm": 3.8013930320739746, + "learning_rate": 8.628474290270316e-06, + "loss": 0.8091, + "step": 4812 + }, + { + "epoch": 1.3973000435476848, + "grad_norm": 3.3065521717071533, + "learning_rate": 8.627813587570609e-06, + "loss": 0.7613, + "step": 4813 + }, + { + "epoch": 1.3975903614457832, + "grad_norm": 3.47182035446167, + "learning_rate": 8.627152751078576e-06, + "loss": 0.7276, + "step": 4814 + }, + { + "epoch": 1.3978806793438816, + "grad_norm": 3.4294252395629883, + "learning_rate": 8.62649178081859e-06, + "loss": 0.6753, + "step": 4815 + }, + { + "epoch": 1.39817099724198, + "grad_norm": 3.6028592586517334, + "learning_rate": 8.625830676815026e-06, + "loss": 0.8833, + "step": 4816 + }, + { + "epoch": 1.3984613151400784, + "grad_norm": 3.3166987895965576, + "learning_rate": 8.625169439092265e-06, + "loss": 0.6944, + "step": 4817 + }, + { + "epoch": 1.3987516330381768, + "grad_norm": 3.5034635066986084, + "learning_rate": 8.624508067674692e-06, + "loss": 0.8244, + "step": 4818 + }, + { + "epoch": 1.3990419509362753, + "grad_norm": 3.2709362506866455, + "learning_rate": 8.623846562586701e-06, + "loss": 0.7226, + "step": 4819 + }, + { + "epoch": 1.3993322688343737, + 
"grad_norm": 3.6299030780792236, + "learning_rate": 8.623184923852688e-06, + "loss": 0.7935, + "step": 4820 + }, + { + "epoch": 1.399622586732472, + "grad_norm": 3.91402268409729, + "learning_rate": 8.622523151497052e-06, + "loss": 0.8692, + "step": 4821 + }, + { + "epoch": 1.3999129046305705, + "grad_norm": 3.647177219390869, + "learning_rate": 8.6218612455442e-06, + "loss": 0.7919, + "step": 4822 + }, + { + "epoch": 1.400203222528669, + "grad_norm": 4.167767524719238, + "learning_rate": 8.621199206018544e-06, + "loss": 0.8089, + "step": 4823 + }, + { + "epoch": 1.4004935404267673, + "grad_norm": 3.5647425651550293, + "learning_rate": 8.620537032944495e-06, + "loss": 0.6652, + "step": 4824 + }, + { + "epoch": 1.4007838583248657, + "grad_norm": 3.2275984287261963, + "learning_rate": 8.619874726346479e-06, + "loss": 0.6856, + "step": 4825 + }, + { + "epoch": 1.4010741762229642, + "grad_norm": 3.2474308013916016, + "learning_rate": 8.61921228624892e-06, + "loss": 0.7441, + "step": 4826 + }, + { + "epoch": 1.4013644941210626, + "grad_norm": 4.000554084777832, + "learning_rate": 8.618549712676247e-06, + "loss": 0.7875, + "step": 4827 + }, + { + "epoch": 1.401654812019161, + "grad_norm": 3.8246102333068848, + "learning_rate": 8.617887005652898e-06, + "loss": 0.7176, + "step": 4828 + }, + { + "epoch": 1.4019451299172594, + "grad_norm": 3.5609772205352783, + "learning_rate": 8.61722416520331e-06, + "loss": 0.8243, + "step": 4829 + }, + { + "epoch": 1.4022354478153578, + "grad_norm": 3.6110541820526123, + "learning_rate": 8.616561191351934e-06, + "loss": 0.7761, + "step": 4830 + }, + { + "epoch": 1.4025257657134562, + "grad_norm": 3.7265875339508057, + "learning_rate": 8.615898084123214e-06, + "loss": 0.7602, + "step": 4831 + }, + { + "epoch": 1.4028160836115546, + "grad_norm": 3.745316505432129, + "learning_rate": 8.615234843541606e-06, + "loss": 0.8678, + "step": 4832 + }, + { + "epoch": 1.403106401509653, + "grad_norm": 3.5089032649993896, + "learning_rate": 8.614571469631573e-06, + "loss": 0.7717, + "step": 4833 + }, + { + "epoch": 1.4033967194077515, + "grad_norm": 3.9560272693634033, + "learning_rate": 8.613907962417578e-06, + "loss": 0.9322, + "step": 4834 + }, + { + "epoch": 1.4036870373058499, + "grad_norm": 3.922571897506714, + "learning_rate": 8.613244321924092e-06, + "loss": 0.8043, + "step": 4835 + }, + { + "epoch": 1.4039773552039483, + "grad_norm": 3.940345525741577, + "learning_rate": 8.612580548175588e-06, + "loss": 0.9217, + "step": 4836 + }, + { + "epoch": 1.4042676731020467, + "grad_norm": 3.3031015396118164, + "learning_rate": 8.61191664119655e-06, + "loss": 0.7364, + "step": 4837 + }, + { + "epoch": 1.4045579910001451, + "grad_norm": 3.5342633724212646, + "learning_rate": 8.611252601011457e-06, + "loss": 0.8785, + "step": 4838 + }, + { + "epoch": 1.4048483088982435, + "grad_norm": 3.5416972637176514, + "learning_rate": 8.610588427644803e-06, + "loss": 0.7948, + "step": 4839 + }, + { + "epoch": 1.405138626796342, + "grad_norm": 3.5838162899017334, + "learning_rate": 8.60992412112108e-06, + "loss": 0.799, + "step": 4840 + }, + { + "epoch": 1.4054289446944404, + "grad_norm": 3.579805850982666, + "learning_rate": 8.609259681464788e-06, + "loss": 0.6866, + "step": 4841 + }, + { + "epoch": 1.4057192625925388, + "grad_norm": 3.6548197269439697, + "learning_rate": 8.60859510870043e-06, + "loss": 0.7634, + "step": 4842 + }, + { + "epoch": 1.4060095804906372, + "grad_norm": 3.1477739810943604, + "learning_rate": 8.607930402852518e-06, + "loss": 0.7293, + "step": 4843 + }, + { + 
"epoch": 1.4062998983887356, + "grad_norm": 3.979515790939331, + "learning_rate": 8.607265563945563e-06, + "loss": 0.8599, + "step": 4844 + }, + { + "epoch": 1.406590216286834, + "grad_norm": 3.6897566318511963, + "learning_rate": 8.606600592004086e-06, + "loss": 0.7855, + "step": 4845 + }, + { + "epoch": 1.4068805341849324, + "grad_norm": 3.6874310970306396, + "learning_rate": 8.60593548705261e-06, + "loss": 0.828, + "step": 4846 + }, + { + "epoch": 1.407170852083031, + "grad_norm": 3.679901123046875, + "learning_rate": 8.605270249115668e-06, + "loss": 0.8838, + "step": 4847 + }, + { + "epoch": 1.4074611699811292, + "grad_norm": 3.7150042057037354, + "learning_rate": 8.604604878217786e-06, + "loss": 0.7686, + "step": 4848 + }, + { + "epoch": 1.4077514878792279, + "grad_norm": 3.672172784805298, + "learning_rate": 8.603939374383507e-06, + "loss": 0.687, + "step": 4849 + }, + { + "epoch": 1.408041805777326, + "grad_norm": 3.7549571990966797, + "learning_rate": 8.603273737637374e-06, + "loss": 0.8388, + "step": 4850 + }, + { + "epoch": 1.4083321236754247, + "grad_norm": 4.318403720855713, + "learning_rate": 8.602607968003935e-06, + "loss": 0.9144, + "step": 4851 + }, + { + "epoch": 1.408622441573523, + "grad_norm": 3.597714424133301, + "learning_rate": 8.601942065507746e-06, + "loss": 0.7885, + "step": 4852 + }, + { + "epoch": 1.4089127594716215, + "grad_norm": 3.403085947036743, + "learning_rate": 8.601276030173361e-06, + "loss": 0.8434, + "step": 4853 + }, + { + "epoch": 1.4092030773697197, + "grad_norm": 3.6063506603240967, + "learning_rate": 8.600609862025346e-06, + "loss": 0.8667, + "step": 4854 + }, + { + "epoch": 1.4094933952678184, + "grad_norm": 3.697525978088379, + "learning_rate": 8.599943561088268e-06, + "loss": 0.84, + "step": 4855 + }, + { + "epoch": 1.4097837131659166, + "grad_norm": 3.562664031982422, + "learning_rate": 8.5992771273867e-06, + "loss": 0.7553, + "step": 4856 + }, + { + "epoch": 1.4100740310640152, + "grad_norm": 3.5420081615448, + "learning_rate": 8.59861056094522e-06, + "loss": 0.7472, + "step": 4857 + }, + { + "epoch": 1.4103643489621134, + "grad_norm": 3.676253080368042, + "learning_rate": 8.59794386178841e-06, + "loss": 0.8556, + "step": 4858 + }, + { + "epoch": 1.410654666860212, + "grad_norm": 3.7087533473968506, + "learning_rate": 8.59727702994086e-06, + "loss": 0.7977, + "step": 4859 + }, + { + "epoch": 1.4109449847583104, + "grad_norm": 3.540095806121826, + "learning_rate": 8.596610065427158e-06, + "loss": 0.815, + "step": 4860 + }, + { + "epoch": 1.4112353026564088, + "grad_norm": 2.9336438179016113, + "learning_rate": 8.595942968271907e-06, + "loss": 0.7382, + "step": 4861 + }, + { + "epoch": 1.4115256205545073, + "grad_norm": 3.024334669113159, + "learning_rate": 8.595275738499704e-06, + "loss": 0.8273, + "step": 4862 + }, + { + "epoch": 1.4118159384526057, + "grad_norm": 3.550865650177002, + "learning_rate": 8.594608376135159e-06, + "loss": 0.7818, + "step": 4863 + }, + { + "epoch": 1.412106256350704, + "grad_norm": 3.29832124710083, + "learning_rate": 8.593940881202885e-06, + "loss": 0.7025, + "step": 4864 + }, + { + "epoch": 1.4123965742488025, + "grad_norm": 3.7970573902130127, + "learning_rate": 8.593273253727495e-06, + "loss": 0.831, + "step": 4865 + }, + { + "epoch": 1.412686892146901, + "grad_norm": 3.563462257385254, + "learning_rate": 8.592605493733614e-06, + "loss": 0.7108, + "step": 4866 + }, + { + "epoch": 1.4129772100449993, + "grad_norm": 3.863367795944214, + "learning_rate": 8.59193760124587e-06, + "loss": 0.7942, + "step": 4867 + }, 
+ { + "epoch": 1.4132675279430977, + "grad_norm": 3.109443426132202, + "learning_rate": 8.591269576288892e-06, + "loss": 0.7006, + "step": 4868 + }, + { + "epoch": 1.4135578458411961, + "grad_norm": 3.792145252227783, + "learning_rate": 8.590601418887316e-06, + "loss": 0.8134, + "step": 4869 + }, + { + "epoch": 1.4138481637392946, + "grad_norm": 3.6752769947052, + "learning_rate": 8.589933129065786e-06, + "loss": 0.7159, + "step": 4870 + }, + { + "epoch": 1.414138481637393, + "grad_norm": 3.0564382076263428, + "learning_rate": 8.589264706848946e-06, + "loss": 0.7533, + "step": 4871 + }, + { + "epoch": 1.4144287995354914, + "grad_norm": 3.0098416805267334, + "learning_rate": 8.588596152261447e-06, + "loss": 0.6984, + "step": 4872 + }, + { + "epoch": 1.4147191174335898, + "grad_norm": 3.4505839347839355, + "learning_rate": 8.587927465327948e-06, + "loss": 0.7734, + "step": 4873 + }, + { + "epoch": 1.4150094353316882, + "grad_norm": 3.9714856147766113, + "learning_rate": 8.587258646073107e-06, + "loss": 0.8756, + "step": 4874 + }, + { + "epoch": 1.4152997532297866, + "grad_norm": 3.669161081314087, + "learning_rate": 8.58658969452159e-06, + "loss": 0.8002, + "step": 4875 + }, + { + "epoch": 1.415590071127885, + "grad_norm": 3.4111788272857666, + "learning_rate": 8.585920610698068e-06, + "loss": 0.79, + "step": 4876 + }, + { + "epoch": 1.4158803890259835, + "grad_norm": 3.534163236618042, + "learning_rate": 8.585251394627217e-06, + "loss": 0.6854, + "step": 4877 + }, + { + "epoch": 1.4161707069240819, + "grad_norm": 3.521871566772461, + "learning_rate": 8.584582046333719e-06, + "loss": 0.7174, + "step": 4878 + }, + { + "epoch": 1.4164610248221803, + "grad_norm": 3.245898962020874, + "learning_rate": 8.583912565842258e-06, + "loss": 0.7329, + "step": 4879 + }, + { + "epoch": 1.4167513427202787, + "grad_norm": 3.9191839694976807, + "learning_rate": 8.583242953177522e-06, + "loss": 0.8377, + "step": 4880 + }, + { + "epoch": 1.417041660618377, + "grad_norm": 3.0914013385772705, + "learning_rate": 8.582573208364209e-06, + "loss": 0.7686, + "step": 4881 + }, + { + "epoch": 1.4173319785164755, + "grad_norm": 3.8165574073791504, + "learning_rate": 8.581903331427016e-06, + "loss": 0.7768, + "step": 4882 + }, + { + "epoch": 1.417622296414574, + "grad_norm": 3.884101152420044, + "learning_rate": 8.581233322390652e-06, + "loss": 0.8283, + "step": 4883 + }, + { + "epoch": 1.4179126143126723, + "grad_norm": 4.394293308258057, + "learning_rate": 8.580563181279822e-06, + "loss": 0.9988, + "step": 4884 + }, + { + "epoch": 1.4182029322107708, + "grad_norm": 3.411958694458008, + "learning_rate": 8.579892908119244e-06, + "loss": 0.7588, + "step": 4885 + }, + { + "epoch": 1.4184932501088692, + "grad_norm": 3.832937002182007, + "learning_rate": 8.579222502933635e-06, + "loss": 0.7294, + "step": 4886 + }, + { + "epoch": 1.4187835680069676, + "grad_norm": 3.814302921295166, + "learning_rate": 8.578551965747722e-06, + "loss": 0.7515, + "step": 4887 + }, + { + "epoch": 1.419073885905066, + "grad_norm": 3.579897403717041, + "learning_rate": 8.577881296586233e-06, + "loss": 0.8351, + "step": 4888 + }, + { + "epoch": 1.4193642038031644, + "grad_norm": 3.93332576751709, + "learning_rate": 8.5772104954739e-06, + "loss": 0.727, + "step": 4889 + }, + { + "epoch": 1.4196545217012628, + "grad_norm": 3.954401731491089, + "learning_rate": 8.576539562435464e-06, + "loss": 0.7004, + "step": 4890 + }, + { + "epoch": 1.4199448395993612, + "grad_norm": 3.2439942359924316, + "learning_rate": 8.575868497495668e-06, + "loss": 0.7239, + 
"step": 4891 + }, + { + "epoch": 1.4202351574974597, + "grad_norm": 3.3064539432525635, + "learning_rate": 8.575197300679262e-06, + "loss": 0.8092, + "step": 4892 + }, + { + "epoch": 1.420525475395558, + "grad_norm": 3.907304525375366, + "learning_rate": 8.574525972010997e-06, + "loss": 0.851, + "step": 4893 + }, + { + "epoch": 1.4208157932936565, + "grad_norm": 3.5380594730377197, + "learning_rate": 8.573854511515633e-06, + "loss": 0.7994, + "step": 4894 + }, + { + "epoch": 1.421106111191755, + "grad_norm": 3.559415817260742, + "learning_rate": 8.573182919217936e-06, + "loss": 0.76, + "step": 4895 + }, + { + "epoch": 1.4213964290898533, + "grad_norm": 3.537963628768921, + "learning_rate": 8.572511195142665e-06, + "loss": 0.7259, + "step": 4896 + }, + { + "epoch": 1.4216867469879517, + "grad_norm": 3.594255208969116, + "learning_rate": 8.571839339314602e-06, + "loss": 0.856, + "step": 4897 + }, + { + "epoch": 1.4219770648860504, + "grad_norm": 3.629476308822632, + "learning_rate": 8.571167351758522e-06, + "loss": 0.7807, + "step": 4898 + }, + { + "epoch": 1.4222673827841485, + "grad_norm": 3.595150947570801, + "learning_rate": 8.570495232499207e-06, + "loss": 0.801, + "step": 4899 + }, + { + "epoch": 1.4225577006822472, + "grad_norm": 3.8158557415008545, + "learning_rate": 8.569822981561445e-06, + "loss": 0.8622, + "step": 4900 + }, + { + "epoch": 1.4228480185803454, + "grad_norm": 3.8504481315612793, + "learning_rate": 8.569150598970027e-06, + "loss": 0.7183, + "step": 4901 + }, + { + "epoch": 1.423138336478444, + "grad_norm": 3.875899076461792, + "learning_rate": 8.568478084749752e-06, + "loss": 0.7786, + "step": 4902 + }, + { + "epoch": 1.4234286543765422, + "grad_norm": 3.6759371757507324, + "learning_rate": 8.56780543892542e-06, + "loss": 0.8178, + "step": 4903 + }, + { + "epoch": 1.4237189722746408, + "grad_norm": 3.799499034881592, + "learning_rate": 8.567132661521841e-06, + "loss": 0.854, + "step": 4904 + }, + { + "epoch": 1.424009290172739, + "grad_norm": 3.120879888534546, + "learning_rate": 8.566459752563825e-06, + "loss": 0.7493, + "step": 4905 + }, + { + "epoch": 1.4242996080708377, + "grad_norm": 3.856126070022583, + "learning_rate": 8.56578671207619e-06, + "loss": 0.777, + "step": 4906 + }, + { + "epoch": 1.4245899259689359, + "grad_norm": 3.700613021850586, + "learning_rate": 8.565113540083751e-06, + "loss": 0.8536, + "step": 4907 + }, + { + "epoch": 1.4248802438670345, + "grad_norm": 3.3016512393951416, + "learning_rate": 8.564440236611344e-06, + "loss": 0.7961, + "step": 4908 + }, + { + "epoch": 1.4251705617651327, + "grad_norm": 3.592452049255371, + "learning_rate": 8.563766801683794e-06, + "loss": 0.9353, + "step": 4909 + }, + { + "epoch": 1.4254608796632313, + "grad_norm": 2.960012674331665, + "learning_rate": 8.56309323532594e-06, + "loss": 0.6846, + "step": 4910 + }, + { + "epoch": 1.4257511975613297, + "grad_norm": 3.6264259815216064, + "learning_rate": 8.56241953756262e-06, + "loss": 0.727, + "step": 4911 + }, + { + "epoch": 1.4260415154594281, + "grad_norm": 3.664760112762451, + "learning_rate": 8.56174570841868e-06, + "loss": 0.7984, + "step": 4912 + }, + { + "epoch": 1.4263318333575266, + "grad_norm": 3.2246367931365967, + "learning_rate": 8.561071747918973e-06, + "loss": 0.6332, + "step": 4913 + }, + { + "epoch": 1.426622151255625, + "grad_norm": 3.133545160293579, + "learning_rate": 8.560397656088353e-06, + "loss": 0.7211, + "step": 4914 + }, + { + "epoch": 1.4269124691537234, + "grad_norm": 3.770587205886841, + "learning_rate": 8.55972343295168e-06, + "loss": 
0.7908, + "step": 4915 + }, + { + "epoch": 1.4272027870518218, + "grad_norm": 3.3660528659820557, + "learning_rate": 8.559049078533821e-06, + "loss": 0.7996, + "step": 4916 + }, + { + "epoch": 1.4274931049499202, + "grad_norm": 3.4238767623901367, + "learning_rate": 8.558374592859644e-06, + "loss": 0.817, + "step": 4917 + }, + { + "epoch": 1.4277834228480186, + "grad_norm": 3.7060892581939697, + "learning_rate": 8.557699975954023e-06, + "loss": 0.7631, + "step": 4918 + }, + { + "epoch": 1.428073740746117, + "grad_norm": 3.3508338928222656, + "learning_rate": 8.557025227841839e-06, + "loss": 0.7387, + "step": 4919 + }, + { + "epoch": 1.4283640586442155, + "grad_norm": 3.907799243927002, + "learning_rate": 8.556350348547978e-06, + "loss": 0.6976, + "step": 4920 + }, + { + "epoch": 1.4286543765423139, + "grad_norm": 3.8321168422698975, + "learning_rate": 8.555675338097324e-06, + "loss": 0.8515, + "step": 4921 + }, + { + "epoch": 1.4289446944404123, + "grad_norm": 3.4706666469573975, + "learning_rate": 8.555000196514776e-06, + "loss": 0.8331, + "step": 4922 + }, + { + "epoch": 1.4292350123385107, + "grad_norm": 3.963350534439087, + "learning_rate": 8.554324923825233e-06, + "loss": 0.8487, + "step": 4923 + }, + { + "epoch": 1.429525330236609, + "grad_norm": 3.9221112728118896, + "learning_rate": 8.553649520053596e-06, + "loss": 0.8157, + "step": 4924 + }, + { + "epoch": 1.4298156481347075, + "grad_norm": 3.6907260417938232, + "learning_rate": 8.552973985224774e-06, + "loss": 0.8462, + "step": 4925 + }, + { + "epoch": 1.430105966032806, + "grad_norm": 3.558818817138672, + "learning_rate": 8.552298319363682e-06, + "loss": 0.754, + "step": 4926 + }, + { + "epoch": 1.4303962839309043, + "grad_norm": 3.271465539932251, + "learning_rate": 8.551622522495238e-06, + "loss": 0.746, + "step": 4927 + }, + { + "epoch": 1.4306866018290028, + "grad_norm": 3.642778158187866, + "learning_rate": 8.550946594644365e-06, + "loss": 0.7517, + "step": 4928 + }, + { + "epoch": 1.4309769197271012, + "grad_norm": 3.227018117904663, + "learning_rate": 8.550270535835992e-06, + "loss": 0.5879, + "step": 4929 + }, + { + "epoch": 1.4312672376251996, + "grad_norm": 3.576512098312378, + "learning_rate": 8.549594346095049e-06, + "loss": 0.7585, + "step": 4930 + }, + { + "epoch": 1.431557555523298, + "grad_norm": 3.381173849105835, + "learning_rate": 8.548918025446474e-06, + "loss": 0.7194, + "step": 4931 + }, + { + "epoch": 1.4318478734213964, + "grad_norm": 3.5712335109710693, + "learning_rate": 8.548241573915213e-06, + "loss": 0.7103, + "step": 4932 + }, + { + "epoch": 1.4321381913194948, + "grad_norm": 4.106939315795898, + "learning_rate": 8.54756499152621e-06, + "loss": 0.7445, + "step": 4933 + }, + { + "epoch": 1.4324285092175932, + "grad_norm": 3.6397581100463867, + "learning_rate": 8.546888278304416e-06, + "loss": 0.9127, + "step": 4934 + }, + { + "epoch": 1.4327188271156917, + "grad_norm": 3.9541220664978027, + "learning_rate": 8.546211434274791e-06, + "loss": 0.8085, + "step": 4935 + }, + { + "epoch": 1.43300914501379, + "grad_norm": 3.7158708572387695, + "learning_rate": 8.545534459462297e-06, + "loss": 0.7887, + "step": 4936 + }, + { + "epoch": 1.4332994629118885, + "grad_norm": 3.8351891040802, + "learning_rate": 8.544857353891898e-06, + "loss": 0.8938, + "step": 4937 + }, + { + "epoch": 1.433589780809987, + "grad_norm": 3.1466290950775146, + "learning_rate": 8.544180117588567e-06, + "loss": 0.6964, + "step": 4938 + }, + { + "epoch": 1.4338800987080853, + "grad_norm": 3.5582618713378906, + "learning_rate": 
8.54350275057728e-06, + "loss": 0.7432, + "step": 4939 + }, + { + "epoch": 1.4341704166061837, + "grad_norm": 3.1632747650146484, + "learning_rate": 8.542825252883015e-06, + "loss": 0.6981, + "step": 4940 + }, + { + "epoch": 1.4344607345042821, + "grad_norm": 3.2447924613952637, + "learning_rate": 8.542147624530763e-06, + "loss": 0.7172, + "step": 4941 + }, + { + "epoch": 1.4347510524023805, + "grad_norm": 3.235755443572998, + "learning_rate": 8.541469865545513e-06, + "loss": 0.7927, + "step": 4942 + }, + { + "epoch": 1.435041370300479, + "grad_norm": 3.388984203338623, + "learning_rate": 8.540791975952258e-06, + "loss": 0.733, + "step": 4943 + }, + { + "epoch": 1.4353316881985774, + "grad_norm": 3.0334298610687256, + "learning_rate": 8.540113955776001e-06, + "loss": 0.5858, + "step": 4944 + }, + { + "epoch": 1.4356220060966758, + "grad_norm": 3.707620859146118, + "learning_rate": 8.539435805041745e-06, + "loss": 0.7823, + "step": 4945 + }, + { + "epoch": 1.4359123239947742, + "grad_norm": 3.4698052406311035, + "learning_rate": 8.538757523774503e-06, + "loss": 0.8276, + "step": 4946 + }, + { + "epoch": 1.4362026418928728, + "grad_norm": 3.6473255157470703, + "learning_rate": 8.538079111999287e-06, + "loss": 0.7954, + "step": 4947 + }, + { + "epoch": 1.436492959790971, + "grad_norm": 3.7372074127197266, + "learning_rate": 8.537400569741117e-06, + "loss": 0.841, + "step": 4948 + }, + { + "epoch": 1.4367832776890697, + "grad_norm": 4.107751369476318, + "learning_rate": 8.536721897025018e-06, + "loss": 0.8634, + "step": 4949 + }, + { + "epoch": 1.4370735955871679, + "grad_norm": 3.484713077545166, + "learning_rate": 8.536043093876018e-06, + "loss": 0.8296, + "step": 4950 + }, + { + "epoch": 1.4373639134852665, + "grad_norm": 3.7558670043945312, + "learning_rate": 8.535364160319154e-06, + "loss": 0.8254, + "step": 4951 + }, + { + "epoch": 1.4376542313833647, + "grad_norm": 3.655763864517212, + "learning_rate": 8.534685096379463e-06, + "loss": 0.7879, + "step": 4952 + }, + { + "epoch": 1.4379445492814633, + "grad_norm": 3.9244983196258545, + "learning_rate": 8.534005902081985e-06, + "loss": 0.7759, + "step": 4953 + }, + { + "epoch": 1.4382348671795615, + "grad_norm": 3.526134490966797, + "learning_rate": 8.533326577451775e-06, + "loss": 0.8024, + "step": 4954 + }, + { + "epoch": 1.4385251850776601, + "grad_norm": 3.7379188537597656, + "learning_rate": 8.53264712251388e-06, + "loss": 0.7485, + "step": 4955 + }, + { + "epoch": 1.4388155029757583, + "grad_norm": 4.165005683898926, + "learning_rate": 8.531967537293365e-06, + "loss": 0.9631, + "step": 4956 + }, + { + "epoch": 1.439105820873857, + "grad_norm": 3.4370205402374268, + "learning_rate": 8.531287821815286e-06, + "loss": 0.6982, + "step": 4957 + }, + { + "epoch": 1.4393961387719552, + "grad_norm": 3.3375890254974365, + "learning_rate": 8.530607976104712e-06, + "loss": 0.7578, + "step": 4958 + }, + { + "epoch": 1.4396864566700538, + "grad_norm": 3.7006642818450928, + "learning_rate": 8.529928000186721e-06, + "loss": 0.832, + "step": 4959 + }, + { + "epoch": 1.4399767745681522, + "grad_norm": 3.493058443069458, + "learning_rate": 8.529247894086383e-06, + "loss": 0.8828, + "step": 4960 + }, + { + "epoch": 1.4402670924662506, + "grad_norm": 3.9224722385406494, + "learning_rate": 8.528567657828785e-06, + "loss": 0.9021, + "step": 4961 + }, + { + "epoch": 1.440557410364349, + "grad_norm": 3.570800542831421, + "learning_rate": 8.527887291439012e-06, + "loss": 0.7967, + "step": 4962 + }, + { + "epoch": 1.4408477282624474, + "grad_norm": 
4.029253959655762, + "learning_rate": 8.527206794942154e-06, + "loss": 0.7519, + "step": 4963 + }, + { + "epoch": 1.4411380461605459, + "grad_norm": 3.2075116634368896, + "learning_rate": 8.52652616836331e-06, + "loss": 0.673, + "step": 4964 + }, + { + "epoch": 1.4414283640586443, + "grad_norm": 3.6427388191223145, + "learning_rate": 8.525845411727581e-06, + "loss": 0.7974, + "step": 4965 + }, + { + "epoch": 1.4417186819567427, + "grad_norm": 3.2091753482818604, + "learning_rate": 8.525164525060072e-06, + "loss": 0.7223, + "step": 4966 + }, + { + "epoch": 1.442008999854841, + "grad_norm": 3.3279550075531006, + "learning_rate": 8.524483508385895e-06, + "loss": 0.7353, + "step": 4967 + }, + { + "epoch": 1.4422993177529395, + "grad_norm": 3.2981271743774414, + "learning_rate": 8.523802361730162e-06, + "loss": 0.7777, + "step": 4968 + }, + { + "epoch": 1.442589635651038, + "grad_norm": 3.850630760192871, + "learning_rate": 8.523121085118001e-06, + "loss": 0.8775, + "step": 4969 + }, + { + "epoch": 1.4428799535491363, + "grad_norm": 3.483059883117676, + "learning_rate": 8.522439678574528e-06, + "loss": 0.7326, + "step": 4970 + }, + { + "epoch": 1.4431702714472348, + "grad_norm": 3.390303611755371, + "learning_rate": 8.52175814212488e-06, + "loss": 0.7247, + "step": 4971 + }, + { + "epoch": 1.4434605893453332, + "grad_norm": 3.6529483795166016, + "learning_rate": 8.521076475794188e-06, + "loss": 0.7653, + "step": 4972 + }, + { + "epoch": 1.4437509072434316, + "grad_norm": 3.635930061340332, + "learning_rate": 8.520394679607592e-06, + "loss": 0.8241, + "step": 4973 + }, + { + "epoch": 1.44404122514153, + "grad_norm": 3.3492178916931152, + "learning_rate": 8.519712753590241e-06, + "loss": 0.7107, + "step": 4974 + }, + { + "epoch": 1.4443315430396284, + "grad_norm": 4.295066833496094, + "learning_rate": 8.519030697767278e-06, + "loss": 0.8889, + "step": 4975 + }, + { + "epoch": 1.4446218609377268, + "grad_norm": 3.8008925914764404, + "learning_rate": 8.51834851216386e-06, + "loss": 0.8281, + "step": 4976 + }, + { + "epoch": 1.4449121788358252, + "grad_norm": 3.6782050132751465, + "learning_rate": 8.517666196805142e-06, + "loss": 0.7278, + "step": 4977 + }, + { + "epoch": 1.4452024967339236, + "grad_norm": 3.2875430583953857, + "learning_rate": 8.516983751716294e-06, + "loss": 0.7124, + "step": 4978 + }, + { + "epoch": 1.445492814632022, + "grad_norm": 3.449599027633667, + "learning_rate": 8.516301176922482e-06, + "loss": 0.6499, + "step": 4979 + }, + { + "epoch": 1.4457831325301205, + "grad_norm": 3.2835583686828613, + "learning_rate": 8.515618472448875e-06, + "loss": 0.7154, + "step": 4980 + }, + { + "epoch": 1.4460734504282189, + "grad_norm": 3.622060537338257, + "learning_rate": 8.514935638320656e-06, + "loss": 0.8061, + "step": 4981 + }, + { + "epoch": 1.4463637683263173, + "grad_norm": 3.7743592262268066, + "learning_rate": 8.514252674563003e-06, + "loss": 0.781, + "step": 4982 + }, + { + "epoch": 1.4466540862244157, + "grad_norm": 3.5391032695770264, + "learning_rate": 8.513569581201109e-06, + "loss": 0.7509, + "step": 4983 + }, + { + "epoch": 1.4469444041225141, + "grad_norm": 3.4815375804901123, + "learning_rate": 8.512886358260162e-06, + "loss": 0.8138, + "step": 4984 + }, + { + "epoch": 1.4472347220206125, + "grad_norm": 3.804208755493164, + "learning_rate": 8.512203005765358e-06, + "loss": 0.7921, + "step": 4985 + }, + { + "epoch": 1.447525039918711, + "grad_norm": 3.3835744857788086, + "learning_rate": 8.511519523741903e-06, + "loss": 0.7415, + "step": 4986 + }, + { + "epoch": 
1.4478153578168094, + "grad_norm": 3.5784029960632324, + "learning_rate": 8.510835912215001e-06, + "loss": 0.7147, + "step": 4987 + }, + { + "epoch": 1.4481056757149078, + "grad_norm": 3.8594770431518555, + "learning_rate": 8.510152171209864e-06, + "loss": 0.718, + "step": 4988 + }, + { + "epoch": 1.4483959936130062, + "grad_norm": 3.8807501792907715, + "learning_rate": 8.509468300751709e-06, + "loss": 0.7239, + "step": 4989 + }, + { + "epoch": 1.4486863115111046, + "grad_norm": 3.600749969482422, + "learning_rate": 8.508784300865754e-06, + "loss": 0.7901, + "step": 4990 + }, + { + "epoch": 1.448976629409203, + "grad_norm": 3.7116174697875977, + "learning_rate": 8.508100171577226e-06, + "loss": 0.8248, + "step": 4991 + }, + { + "epoch": 1.4492669473073014, + "grad_norm": 4.034679889678955, + "learning_rate": 8.507415912911357e-06, + "loss": 1.0043, + "step": 4992 + }, + { + "epoch": 1.4495572652053998, + "grad_norm": 3.861468553543091, + "learning_rate": 8.50673152489338e-06, + "loss": 0.8495, + "step": 4993 + }, + { + "epoch": 1.4498475831034983, + "grad_norm": 3.6064560413360596, + "learning_rate": 8.506047007548537e-06, + "loss": 0.776, + "step": 4994 + }, + { + "epoch": 1.4501379010015967, + "grad_norm": 3.368307113647461, + "learning_rate": 8.505362360902071e-06, + "loss": 0.8076, + "step": 4995 + }, + { + "epoch": 1.450428218899695, + "grad_norm": 3.307891845703125, + "learning_rate": 8.504677584979233e-06, + "loss": 0.7516, + "step": 4996 + }, + { + "epoch": 1.4507185367977935, + "grad_norm": 3.733379602432251, + "learning_rate": 8.503992679805277e-06, + "loss": 0.8998, + "step": 4997 + }, + { + "epoch": 1.4510088546958921, + "grad_norm": 3.367964029312134, + "learning_rate": 8.503307645405461e-06, + "loss": 0.7692, + "step": 4998 + }, + { + "epoch": 1.4512991725939903, + "grad_norm": 3.5045888423919678, + "learning_rate": 8.502622481805047e-06, + "loss": 0.8215, + "step": 4999 + }, + { + "epoch": 1.451589490492089, + "grad_norm": 3.624884605407715, + "learning_rate": 8.501937189029309e-06, + "loss": 0.8049, + "step": 5000 + }, + { + "epoch": 1.451589490492089, + "eval_loss": 1.1851102113723755, + "eval_runtime": 13.235, + "eval_samples_per_second": 30.223, + "eval_steps_per_second": 3.778, + "step": 5000 + }, + { + "epoch": 1.4518798083901872, + "grad_norm": 3.5284340381622314, + "learning_rate": 8.501251767103515e-06, + "loss": 0.8034, + "step": 5001 + }, + { + "epoch": 1.4521701262882858, + "grad_norm": 3.5684597492218018, + "learning_rate": 8.500566216052948e-06, + "loss": 0.7959, + "step": 5002 + }, + { + "epoch": 1.452460444186384, + "grad_norm": 3.7145283222198486, + "learning_rate": 8.499880535902885e-06, + "loss": 0.9445, + "step": 5003 + }, + { + "epoch": 1.4527507620844826, + "grad_norm": 3.89518666267395, + "learning_rate": 8.499194726678619e-06, + "loss": 0.7677, + "step": 5004 + }, + { + "epoch": 1.4530410799825808, + "grad_norm": 3.8414015769958496, + "learning_rate": 8.498508788405438e-06, + "loss": 0.9152, + "step": 5005 + }, + { + "epoch": 1.4533313978806794, + "grad_norm": 3.748683214187622, + "learning_rate": 8.497822721108642e-06, + "loss": 0.7538, + "step": 5006 + }, + { + "epoch": 1.4536217157787776, + "grad_norm": 3.3457822799682617, + "learning_rate": 8.497136524813534e-06, + "loss": 0.7947, + "step": 5007 + }, + { + "epoch": 1.4539120336768763, + "grad_norm": 3.300783157348633, + "learning_rate": 8.496450199545417e-06, + "loss": 0.7006, + "step": 5008 + }, + { + "epoch": 1.4542023515749745, + "grad_norm": 3.2852492332458496, + "learning_rate": 
8.495763745329604e-06, + "loss": 0.6321, + "step": 5009 + }, + { + "epoch": 1.454492669473073, + "grad_norm": 3.0854744911193848, + "learning_rate": 8.49507716219141e-06, + "loss": 0.6212, + "step": 5010 + }, + { + "epoch": 1.4547829873711715, + "grad_norm": 4.364450931549072, + "learning_rate": 8.49439045015616e-06, + "loss": 0.9948, + "step": 5011 + }, + { + "epoch": 1.45507330526927, + "grad_norm": 3.982003927230835, + "learning_rate": 8.493703609249175e-06, + "loss": 0.8609, + "step": 5012 + }, + { + "epoch": 1.4553636231673683, + "grad_norm": 3.6758294105529785, + "learning_rate": 8.49301663949579e-06, + "loss": 0.7805, + "step": 5013 + }, + { + "epoch": 1.4556539410654667, + "grad_norm": 3.922879934310913, + "learning_rate": 8.492329540921335e-06, + "loss": 0.9171, + "step": 5014 + }, + { + "epoch": 1.4559442589635652, + "grad_norm": 3.4253084659576416, + "learning_rate": 8.491642313551153e-06, + "loss": 0.7327, + "step": 5015 + }, + { + "epoch": 1.4562345768616636, + "grad_norm": 3.4870643615722656, + "learning_rate": 8.490954957410588e-06, + "loss": 0.7023, + "step": 5016 + }, + { + "epoch": 1.456524894759762, + "grad_norm": 3.2392799854278564, + "learning_rate": 8.490267472524989e-06, + "loss": 0.6963, + "step": 5017 + }, + { + "epoch": 1.4568152126578604, + "grad_norm": 3.677802324295044, + "learning_rate": 8.489579858919711e-06, + "loss": 0.8241, + "step": 5018 + }, + { + "epoch": 1.4571055305559588, + "grad_norm": 3.4841086864471436, + "learning_rate": 8.488892116620114e-06, + "loss": 0.7841, + "step": 5019 + }, + { + "epoch": 1.4573958484540572, + "grad_norm": 3.652825117111206, + "learning_rate": 8.48820424565156e-06, + "loss": 0.8396, + "step": 5020 + }, + { + "epoch": 1.4576861663521556, + "grad_norm": 4.243154048919678, + "learning_rate": 8.487516246039415e-06, + "loss": 0.9935, + "step": 5021 + }, + { + "epoch": 1.457976484250254, + "grad_norm": 3.3527235984802246, + "learning_rate": 8.486828117809057e-06, + "loss": 0.7414, + "step": 5022 + }, + { + "epoch": 1.4582668021483525, + "grad_norm": 3.2306318283081055, + "learning_rate": 8.486139860985862e-06, + "loss": 0.7676, + "step": 5023 + }, + { + "epoch": 1.4585571200464509, + "grad_norm": 3.4278311729431152, + "learning_rate": 8.485451475595211e-06, + "loss": 0.7074, + "step": 5024 + }, + { + "epoch": 1.4588474379445493, + "grad_norm": 3.2792117595672607, + "learning_rate": 8.484762961662494e-06, + "loss": 0.7377, + "step": 5025 + }, + { + "epoch": 1.4591377558426477, + "grad_norm": 3.4412848949432373, + "learning_rate": 8.4840743192131e-06, + "loss": 0.7358, + "step": 5026 + }, + { + "epoch": 1.4594280737407461, + "grad_norm": 3.700155258178711, + "learning_rate": 8.48338554827243e-06, + "loss": 0.7138, + "step": 5027 + }, + { + "epoch": 1.4597183916388445, + "grad_norm": 3.4831392765045166, + "learning_rate": 8.482696648865883e-06, + "loss": 0.795, + "step": 5028 + }, + { + "epoch": 1.460008709536943, + "grad_norm": 3.9102721214294434, + "learning_rate": 8.482007621018865e-06, + "loss": 0.7914, + "step": 5029 + }, + { + "epoch": 1.4602990274350414, + "grad_norm": 3.5112287998199463, + "learning_rate": 8.481318464756787e-06, + "loss": 0.6755, + "step": 5030 + }, + { + "epoch": 1.4605893453331398, + "grad_norm": 3.3797972202301025, + "learning_rate": 8.480629180105067e-06, + "loss": 0.7752, + "step": 5031 + }, + { + "epoch": 1.4608796632312382, + "grad_norm": 3.8857204914093018, + "learning_rate": 8.479939767089124e-06, + "loss": 0.7878, + "step": 5032 + }, + { + "epoch": 1.4611699811293366, + "grad_norm": 
3.759293556213379, + "learning_rate": 8.479250225734382e-06, + "loss": 0.767, + "step": 5033 + }, + { + "epoch": 1.461460299027435, + "grad_norm": 3.3629093170166016, + "learning_rate": 8.478560556066274e-06, + "loss": 0.7772, + "step": 5034 + }, + { + "epoch": 1.4617506169255334, + "grad_norm": 3.661879062652588, + "learning_rate": 8.477870758110231e-06, + "loss": 0.8362, + "step": 5035 + }, + { + "epoch": 1.4620409348236318, + "grad_norm": 3.17903995513916, + "learning_rate": 8.477180831891696e-06, + "loss": 0.7094, + "step": 5036 + }, + { + "epoch": 1.4623312527217303, + "grad_norm": 3.840388536453247, + "learning_rate": 8.476490777436113e-06, + "loss": 0.7962, + "step": 5037 + }, + { + "epoch": 1.4626215706198287, + "grad_norm": 3.8354861736297607, + "learning_rate": 8.475800594768929e-06, + "loss": 0.7228, + "step": 5038 + }, + { + "epoch": 1.462911888517927, + "grad_norm": 3.448528289794922, + "learning_rate": 8.475110283915597e-06, + "loss": 0.8893, + "step": 5039 + }, + { + "epoch": 1.4632022064160255, + "grad_norm": 3.4191551208496094, + "learning_rate": 8.474419844901575e-06, + "loss": 0.8896, + "step": 5040 + }, + { + "epoch": 1.463492524314124, + "grad_norm": 3.802597999572754, + "learning_rate": 8.473729277752331e-06, + "loss": 0.7941, + "step": 5041 + }, + { + "epoch": 1.4637828422122223, + "grad_norm": 3.8264427185058594, + "learning_rate": 8.47303858249333e-06, + "loss": 0.823, + "step": 5042 + }, + { + "epoch": 1.4640731601103207, + "grad_norm": 3.2838687896728516, + "learning_rate": 8.472347759150044e-06, + "loss": 0.7341, + "step": 5043 + }, + { + "epoch": 1.4643634780084192, + "grad_norm": 3.4499127864837646, + "learning_rate": 8.47165680774795e-06, + "loss": 0.7497, + "step": 5044 + }, + { + "epoch": 1.4646537959065176, + "grad_norm": 3.102621078491211, + "learning_rate": 8.47096572831253e-06, + "loss": 0.6851, + "step": 5045 + }, + { + "epoch": 1.464944113804616, + "grad_norm": 3.695542097091675, + "learning_rate": 8.470274520869273e-06, + "loss": 0.7494, + "step": 5046 + }, + { + "epoch": 1.4652344317027144, + "grad_norm": 3.250293254852295, + "learning_rate": 8.469583185443669e-06, + "loss": 0.7554, + "step": 5047 + }, + { + "epoch": 1.4655247496008128, + "grad_norm": 3.8266215324401855, + "learning_rate": 8.468891722061211e-06, + "loss": 0.8187, + "step": 5048 + }, + { + "epoch": 1.4658150674989114, + "grad_norm": 3.5755343437194824, + "learning_rate": 8.468200130747406e-06, + "loss": 0.7568, + "step": 5049 + }, + { + "epoch": 1.4661053853970096, + "grad_norm": 3.6069979667663574, + "learning_rate": 8.467508411527754e-06, + "loss": 0.8279, + "step": 5050 + }, + { + "epoch": 1.4663957032951083, + "grad_norm": 3.417710542678833, + "learning_rate": 8.46681656442777e-06, + "loss": 0.7938, + "step": 5051 + }, + { + "epoch": 1.4666860211932065, + "grad_norm": 3.6008191108703613, + "learning_rate": 8.466124589472967e-06, + "loss": 0.8101, + "step": 5052 + }, + { + "epoch": 1.466976339091305, + "grad_norm": 3.4891951084136963, + "learning_rate": 8.465432486688863e-06, + "loss": 0.8224, + "step": 5053 + }, + { + "epoch": 1.4672666569894033, + "grad_norm": 3.3960723876953125, + "learning_rate": 8.464740256100984e-06, + "loss": 0.8218, + "step": 5054 + }, + { + "epoch": 1.467556974887502, + "grad_norm": 3.9240763187408447, + "learning_rate": 8.46404789773486e-06, + "loss": 0.7348, + "step": 5055 + }, + { + "epoch": 1.4678472927856, + "grad_norm": 3.406634569168091, + "learning_rate": 8.463355411616024e-06, + "loss": 0.7603, + "step": 5056 + }, + { + "epoch": 
1.4681376106836987, + "grad_norm": 2.8764095306396484, + "learning_rate": 8.462662797770016e-06, + "loss": 0.5915, + "step": 5057 + }, + { + "epoch": 1.468427928581797, + "grad_norm": 3.4037272930145264, + "learning_rate": 8.461970056222375e-06, + "loss": 0.6647, + "step": 5058 + }, + { + "epoch": 1.4687182464798956, + "grad_norm": 3.4545750617980957, + "learning_rate": 8.461277186998656e-06, + "loss": 0.7738, + "step": 5059 + }, + { + "epoch": 1.469008564377994, + "grad_norm": 3.645581007003784, + "learning_rate": 8.460584190124405e-06, + "loss": 0.7971, + "step": 5060 + }, + { + "epoch": 1.4692988822760924, + "grad_norm": 3.7972629070281982, + "learning_rate": 8.459891065625184e-06, + "loss": 0.7959, + "step": 5061 + }, + { + "epoch": 1.4695892001741908, + "grad_norm": 3.7442901134490967, + "learning_rate": 8.459197813526554e-06, + "loss": 0.8311, + "step": 5062 + }, + { + "epoch": 1.4698795180722892, + "grad_norm": 3.545626640319824, + "learning_rate": 8.45850443385408e-06, + "loss": 0.8391, + "step": 5063 + }, + { + "epoch": 1.4701698359703876, + "grad_norm": 3.4700894355773926, + "learning_rate": 8.457810926633336e-06, + "loss": 0.7605, + "step": 5064 + }, + { + "epoch": 1.470460153868486, + "grad_norm": 3.531576633453369, + "learning_rate": 8.457117291889895e-06, + "loss": 0.706, + "step": 5065 + }, + { + "epoch": 1.4707504717665845, + "grad_norm": 3.6248650550842285, + "learning_rate": 8.456423529649343e-06, + "loss": 0.9177, + "step": 5066 + }, + { + "epoch": 1.4710407896646829, + "grad_norm": 3.7519371509552, + "learning_rate": 8.45572963993726e-06, + "loss": 0.8181, + "step": 5067 + }, + { + "epoch": 1.4713311075627813, + "grad_norm": 3.677908420562744, + "learning_rate": 8.455035622779242e-06, + "loss": 0.8197, + "step": 5068 + }, + { + "epoch": 1.4716214254608797, + "grad_norm": 3.604118824005127, + "learning_rate": 8.45434147820088e-06, + "loss": 0.7423, + "step": 5069 + }, + { + "epoch": 1.4719117433589781, + "grad_norm": 3.7687582969665527, + "learning_rate": 8.453647206227776e-06, + "loss": 0.8346, + "step": 5070 + }, + { + "epoch": 1.4722020612570765, + "grad_norm": 3.278323173522949, + "learning_rate": 8.452952806885533e-06, + "loss": 0.6388, + "step": 5071 + }, + { + "epoch": 1.472492379155175, + "grad_norm": 3.315422296524048, + "learning_rate": 8.45225828019976e-06, + "loss": 0.7371, + "step": 5072 + }, + { + "epoch": 1.4727826970532734, + "grad_norm": 4.38593864440918, + "learning_rate": 8.451563626196072e-06, + "loss": 0.9145, + "step": 5073 + }, + { + "epoch": 1.4730730149513718, + "grad_norm": 3.3235526084899902, + "learning_rate": 8.450868844900088e-06, + "loss": 0.6989, + "step": 5074 + }, + { + "epoch": 1.4733633328494702, + "grad_norm": 3.56598162651062, + "learning_rate": 8.450173936337429e-06, + "loss": 0.7485, + "step": 5075 + }, + { + "epoch": 1.4736536507475686, + "grad_norm": 3.6153061389923096, + "learning_rate": 8.449478900533726e-06, + "loss": 0.8819, + "step": 5076 + }, + { + "epoch": 1.473943968645667, + "grad_norm": 3.7739386558532715, + "learning_rate": 8.448783737514609e-06, + "loss": 0.6986, + "step": 5077 + }, + { + "epoch": 1.4742342865437654, + "grad_norm": 3.4768197536468506, + "learning_rate": 8.448088447305716e-06, + "loss": 0.7281, + "step": 5078 + }, + { + "epoch": 1.4745246044418638, + "grad_norm": 3.376514196395874, + "learning_rate": 8.447393029932692e-06, + "loss": 0.7537, + "step": 5079 + }, + { + "epoch": 1.4748149223399623, + "grad_norm": 3.229945421218872, + "learning_rate": 8.446697485421179e-06, + "loss": 0.7705, + "step": 5080 
+ }, + { + "epoch": 1.4751052402380607, + "grad_norm": 3.3229260444641113, + "learning_rate": 8.446001813796829e-06, + "loss": 0.8065, + "step": 5081 + }, + { + "epoch": 1.475395558136159, + "grad_norm": 4.087240695953369, + "learning_rate": 8.445306015085301e-06, + "loss": 0.8067, + "step": 5082 + }, + { + "epoch": 1.4756858760342575, + "grad_norm": 3.625922203063965, + "learning_rate": 8.444610089312255e-06, + "loss": 0.8401, + "step": 5083 + }, + { + "epoch": 1.475976193932356, + "grad_norm": 3.589026689529419, + "learning_rate": 8.443914036503356e-06, + "loss": 0.7364, + "step": 5084 + }, + { + "epoch": 1.4762665118304543, + "grad_norm": 3.892855405807495, + "learning_rate": 8.443217856684273e-06, + "loss": 0.8431, + "step": 5085 + }, + { + "epoch": 1.4765568297285527, + "grad_norm": 3.6631200313568115, + "learning_rate": 8.442521549880682e-06, + "loss": 0.6817, + "step": 5086 + }, + { + "epoch": 1.4768471476266511, + "grad_norm": 3.349924325942993, + "learning_rate": 8.441825116118264e-06, + "loss": 0.8062, + "step": 5087 + }, + { + "epoch": 1.4771374655247496, + "grad_norm": 3.383465051651001, + "learning_rate": 8.4411285554227e-06, + "loss": 0.8242, + "step": 5088 + }, + { + "epoch": 1.477427783422848, + "grad_norm": 3.3581674098968506, + "learning_rate": 8.44043186781968e-06, + "loss": 0.73, + "step": 5089 + }, + { + "epoch": 1.4777181013209464, + "grad_norm": 3.511465072631836, + "learning_rate": 8.439735053334899e-06, + "loss": 0.7939, + "step": 5090 + }, + { + "epoch": 1.4780084192190448, + "grad_norm": 3.3580431938171387, + "learning_rate": 8.439038111994055e-06, + "loss": 0.7183, + "step": 5091 + }, + { + "epoch": 1.4782987371171432, + "grad_norm": 3.0493764877319336, + "learning_rate": 8.43834104382285e-06, + "loss": 0.6271, + "step": 5092 + }, + { + "epoch": 1.4785890550152416, + "grad_norm": 3.401853561401367, + "learning_rate": 8.43764384884699e-06, + "loss": 0.7419, + "step": 5093 + }, + { + "epoch": 1.47887937291334, + "grad_norm": 3.6405131816864014, + "learning_rate": 8.43694652709219e-06, + "loss": 0.932, + "step": 5094 + }, + { + "epoch": 1.4791696908114385, + "grad_norm": 3.652693510055542, + "learning_rate": 8.436249078584166e-06, + "loss": 0.8069, + "step": 5095 + }, + { + "epoch": 1.4794600087095369, + "grad_norm": 3.3175535202026367, + "learning_rate": 8.43555150334864e-06, + "loss": 0.7172, + "step": 5096 + }, + { + "epoch": 1.4797503266076353, + "grad_norm": 3.5787837505340576, + "learning_rate": 8.434853801411337e-06, + "loss": 0.7607, + "step": 5097 + }, + { + "epoch": 1.480040644505734, + "grad_norm": 3.6244215965270996, + "learning_rate": 8.43415597279799e-06, + "loss": 0.7861, + "step": 5098 + }, + { + "epoch": 1.480330962403832, + "grad_norm": 2.996455430984497, + "learning_rate": 8.433458017534332e-06, + "loss": 0.6984, + "step": 5099 + }, + { + "epoch": 1.4806212803019307, + "grad_norm": 3.1511950492858887, + "learning_rate": 8.432759935646107e-06, + "loss": 0.6542, + "step": 5100 + }, + { + "epoch": 1.480911598200029, + "grad_norm": 3.5917961597442627, + "learning_rate": 8.432061727159056e-06, + "loss": 0.6977, + "step": 5101 + }, + { + "epoch": 1.4812019160981276, + "grad_norm": 3.706416130065918, + "learning_rate": 8.431363392098931e-06, + "loss": 0.7762, + "step": 5102 + }, + { + "epoch": 1.4814922339962258, + "grad_norm": 3.645132303237915, + "learning_rate": 8.430664930491485e-06, + "loss": 0.7918, + "step": 5103 + }, + { + "epoch": 1.4817825518943244, + "grad_norm": 3.448289155960083, + "learning_rate": 8.429966342362478e-06, + "loss": 0.8402, + 
"step": 5104 + }, + { + "epoch": 1.4820728697924226, + "grad_norm": 3.323197364807129, + "learning_rate": 8.429267627737675e-06, + "loss": 0.7244, + "step": 5105 + }, + { + "epoch": 1.4823631876905212, + "grad_norm": 3.8272650241851807, + "learning_rate": 8.428568786642842e-06, + "loss": 0.9625, + "step": 5106 + }, + { + "epoch": 1.4826535055886194, + "grad_norm": 3.6143205165863037, + "learning_rate": 8.427869819103753e-06, + "loss": 0.7005, + "step": 5107 + }, + { + "epoch": 1.482943823486718, + "grad_norm": 3.774230718612671, + "learning_rate": 8.427170725146184e-06, + "loss": 0.8041, + "step": 5108 + }, + { + "epoch": 1.4832341413848162, + "grad_norm": 3.8581020832061768, + "learning_rate": 8.42647150479592e-06, + "loss": 0.8116, + "step": 5109 + }, + { + "epoch": 1.4835244592829149, + "grad_norm": 3.6557369232177734, + "learning_rate": 8.425772158078747e-06, + "loss": 0.8101, + "step": 5110 + }, + { + "epoch": 1.4838147771810133, + "grad_norm": 3.73181414604187, + "learning_rate": 8.425072685020454e-06, + "loss": 0.7939, + "step": 5111 + }, + { + "epoch": 1.4841050950791117, + "grad_norm": 3.7906265258789062, + "learning_rate": 8.424373085646842e-06, + "loss": 0.8461, + "step": 5112 + }, + { + "epoch": 1.4843954129772101, + "grad_norm": 3.1719911098480225, + "learning_rate": 8.423673359983708e-06, + "loss": 0.7403, + "step": 5113 + }, + { + "epoch": 1.4846857308753085, + "grad_norm": 3.553091049194336, + "learning_rate": 8.42297350805686e-06, + "loss": 0.8006, + "step": 5114 + }, + { + "epoch": 1.484976048773407, + "grad_norm": 3.4175894260406494, + "learning_rate": 8.42227352989211e-06, + "loss": 0.8031, + "step": 5115 + }, + { + "epoch": 1.4852663666715054, + "grad_norm": 3.7422873973846436, + "learning_rate": 8.421573425515267e-06, + "loss": 0.8509, + "step": 5116 + }, + { + "epoch": 1.4855566845696038, + "grad_norm": 3.5525622367858887, + "learning_rate": 8.420873194952153e-06, + "loss": 0.8151, + "step": 5117 + }, + { + "epoch": 1.4858470024677022, + "grad_norm": 3.4313485622406006, + "learning_rate": 8.420172838228595e-06, + "loss": 0.7339, + "step": 5118 + }, + { + "epoch": 1.4861373203658006, + "grad_norm": 3.8493919372558594, + "learning_rate": 8.41947235537042e-06, + "loss": 0.7356, + "step": 5119 + }, + { + "epoch": 1.486427638263899, + "grad_norm": 3.501344919204712, + "learning_rate": 8.41877174640346e-06, + "loss": 0.8211, + "step": 5120 + }, + { + "epoch": 1.4867179561619974, + "grad_norm": 3.3096718788146973, + "learning_rate": 8.418071011353556e-06, + "loss": 0.699, + "step": 5121 + }, + { + "epoch": 1.4870082740600958, + "grad_norm": 3.5474376678466797, + "learning_rate": 8.417370150246548e-06, + "loss": 0.8234, + "step": 5122 + }, + { + "epoch": 1.4872985919581942, + "grad_norm": 3.771197557449341, + "learning_rate": 8.416669163108287e-06, + "loss": 0.8363, + "step": 5123 + }, + { + "epoch": 1.4875889098562927, + "grad_norm": 3.500458240509033, + "learning_rate": 8.415968049964623e-06, + "loss": 0.7884, + "step": 5124 + }, + { + "epoch": 1.487879227754391, + "grad_norm": 4.015684127807617, + "learning_rate": 8.415266810841412e-06, + "loss": 0.8161, + "step": 5125 + }, + { + "epoch": 1.4881695456524895, + "grad_norm": 3.5269722938537598, + "learning_rate": 8.414565445764517e-06, + "loss": 0.7957, + "step": 5126 + }, + { + "epoch": 1.488459863550588, + "grad_norm": 3.418762683868408, + "learning_rate": 8.413863954759802e-06, + "loss": 0.9305, + "step": 5127 + }, + { + "epoch": 1.4887501814486863, + "grad_norm": 3.1369898319244385, + "learning_rate": 
8.41316233785314e-06, + "loss": 0.7075, + "step": 5128 + }, + { + "epoch": 1.4890404993467847, + "grad_norm": 3.5826096534729004, + "learning_rate": 8.412460595070405e-06, + "loss": 0.8197, + "step": 5129 + }, + { + "epoch": 1.4893308172448831, + "grad_norm": 3.6522650718688965, + "learning_rate": 8.411758726437478e-06, + "loss": 0.8644, + "step": 5130 + }, + { + "epoch": 1.4896211351429816, + "grad_norm": 3.8462347984313965, + "learning_rate": 8.411056731980243e-06, + "loss": 0.7973, + "step": 5131 + }, + { + "epoch": 1.48991145304108, + "grad_norm": 4.156594753265381, + "learning_rate": 8.41035461172459e-06, + "loss": 0.8661, + "step": 5132 + }, + { + "epoch": 1.4902017709391784, + "grad_norm": 4.024465084075928, + "learning_rate": 8.409652365696411e-06, + "loss": 0.7934, + "step": 5133 + }, + { + "epoch": 1.4904920888372768, + "grad_norm": 3.535364866256714, + "learning_rate": 8.408949993921607e-06, + "loss": 0.76, + "step": 5134 + }, + { + "epoch": 1.4907824067353752, + "grad_norm": 3.9186315536499023, + "learning_rate": 8.40824749642608e-06, + "loss": 0.763, + "step": 5135 + }, + { + "epoch": 1.4910727246334736, + "grad_norm": 3.914283514022827, + "learning_rate": 8.407544873235736e-06, + "loss": 0.7664, + "step": 5136 + }, + { + "epoch": 1.491363042531572, + "grad_norm": 3.8266966342926025, + "learning_rate": 8.40684212437649e-06, + "loss": 0.7919, + "step": 5137 + }, + { + "epoch": 1.4916533604296704, + "grad_norm": 3.40091609954834, + "learning_rate": 8.406139249874261e-06, + "loss": 0.7519, + "step": 5138 + }, + { + "epoch": 1.4919436783277689, + "grad_norm": 3.6243724822998047, + "learning_rate": 8.405436249754965e-06, + "loss": 0.731, + "step": 5139 + }, + { + "epoch": 1.4922339962258673, + "grad_norm": 3.760503053665161, + "learning_rate": 8.404733124044532e-06, + "loss": 0.8834, + "step": 5140 + }, + { + "epoch": 1.4925243141239657, + "grad_norm": 3.4237194061279297, + "learning_rate": 8.404029872768895e-06, + "loss": 0.7376, + "step": 5141 + }, + { + "epoch": 1.492814632022064, + "grad_norm": 4.07653284072876, + "learning_rate": 8.403326495953985e-06, + "loss": 0.9527, + "step": 5142 + }, + { + "epoch": 1.4931049499201625, + "grad_norm": 3.364163875579834, + "learning_rate": 8.402622993625744e-06, + "loss": 0.744, + "step": 5143 + }, + { + "epoch": 1.493395267818261, + "grad_norm": 3.8818440437316895, + "learning_rate": 8.40191936581012e-06, + "loss": 0.8061, + "step": 5144 + }, + { + "epoch": 1.4936855857163593, + "grad_norm": 3.259274482727051, + "learning_rate": 8.401215612533056e-06, + "loss": 0.7186, + "step": 5145 + }, + { + "epoch": 1.4939759036144578, + "grad_norm": 3.872246265411377, + "learning_rate": 8.400511733820513e-06, + "loss": 0.9133, + "step": 5146 + }, + { + "epoch": 1.4942662215125562, + "grad_norm": 4.047363758087158, + "learning_rate": 8.399807729698446e-06, + "loss": 0.9393, + "step": 5147 + }, + { + "epoch": 1.4945565394106546, + "grad_norm": 3.743149995803833, + "learning_rate": 8.399103600192817e-06, + "loss": 0.8807, + "step": 5148 + }, + { + "epoch": 1.4948468573087532, + "grad_norm": 3.3832364082336426, + "learning_rate": 8.398399345329598e-06, + "loss": 0.7135, + "step": 5149 + }, + { + "epoch": 1.4951371752068514, + "grad_norm": 3.180245876312256, + "learning_rate": 8.397694965134759e-06, + "loss": 0.767, + "step": 5150 + }, + { + "epoch": 1.49542749310495, + "grad_norm": 3.3784093856811523, + "learning_rate": 8.39699045963428e-06, + "loss": 0.8157, + "step": 5151 + }, + { + "epoch": 1.4957178110030482, + "grad_norm": 3.921077251434326, + 
"learning_rate": 8.39628582885414e-06, + "loss": 0.8413, + "step": 5152 + }, + { + "epoch": 1.4960081289011469, + "grad_norm": 3.9500279426574707, + "learning_rate": 8.395581072820325e-06, + "loss": 0.7991, + "step": 5153 + }, + { + "epoch": 1.496298446799245, + "grad_norm": 3.412574052810669, + "learning_rate": 8.394876191558828e-06, + "loss": 0.6968, + "step": 5154 + }, + { + "epoch": 1.4965887646973437, + "grad_norm": 3.49398136138916, + "learning_rate": 8.394171185095646e-06, + "loss": 0.7868, + "step": 5155 + }, + { + "epoch": 1.496879082595442, + "grad_norm": 3.4007065296173096, + "learning_rate": 8.393466053456775e-06, + "loss": 0.7535, + "step": 5156 + }, + { + "epoch": 1.4971694004935405, + "grad_norm": 3.4070701599121094, + "learning_rate": 8.392760796668225e-06, + "loss": 0.7558, + "step": 5157 + }, + { + "epoch": 1.4974597183916387, + "grad_norm": 3.5991742610931396, + "learning_rate": 8.392055414756e-06, + "loss": 0.807, + "step": 5158 + }, + { + "epoch": 1.4977500362897374, + "grad_norm": 3.701852560043335, + "learning_rate": 8.39134990774612e-06, + "loss": 0.8775, + "step": 5159 + }, + { + "epoch": 1.4980403541878355, + "grad_norm": 3.2740137577056885, + "learning_rate": 8.390644275664602e-06, + "loss": 0.7085, + "step": 5160 + }, + { + "epoch": 1.4983306720859342, + "grad_norm": 3.118898868560791, + "learning_rate": 8.389938518537468e-06, + "loss": 0.7442, + "step": 5161 + }, + { + "epoch": 1.4986209899840326, + "grad_norm": 3.790092945098877, + "learning_rate": 8.389232636390744e-06, + "loss": 0.7488, + "step": 5162 + }, + { + "epoch": 1.498911307882131, + "grad_norm": 3.3232455253601074, + "learning_rate": 8.388526629250469e-06, + "loss": 0.8099, + "step": 5163 + }, + { + "epoch": 1.4992016257802294, + "grad_norm": 3.8602137565612793, + "learning_rate": 8.387820497142674e-06, + "loss": 0.7779, + "step": 5164 + }, + { + "epoch": 1.4994919436783278, + "grad_norm": 3.3287713527679443, + "learning_rate": 8.387114240093406e-06, + "loss": 0.7674, + "step": 5165 + }, + { + "epoch": 1.4997822615764262, + "grad_norm": 3.3892929553985596, + "learning_rate": 8.386407858128707e-06, + "loss": 0.8281, + "step": 5166 + }, + { + "epoch": 1.5000725794745247, + "grad_norm": 3.7774667739868164, + "learning_rate": 8.38570135127463e-06, + "loss": 0.9089, + "step": 5167 + }, + { + "epoch": 1.500362897372623, + "grad_norm": 3.992173194885254, + "learning_rate": 8.384994719557232e-06, + "loss": 0.8284, + "step": 5168 + }, + { + "epoch": 1.5006532152707215, + "grad_norm": 3.2167437076568604, + "learning_rate": 8.38428796300257e-06, + "loss": 0.7445, + "step": 5169 + }, + { + "epoch": 1.50094353316882, + "grad_norm": 3.703031301498413, + "learning_rate": 8.383581081636712e-06, + "loss": 0.8307, + "step": 5170 + }, + { + "epoch": 1.5012338510669183, + "grad_norm": 3.630709409713745, + "learning_rate": 8.382874075485728e-06, + "loss": 0.7981, + "step": 5171 + }, + { + "epoch": 1.5015241689650167, + "grad_norm": 3.494649887084961, + "learning_rate": 8.382166944575689e-06, + "loss": 0.7494, + "step": 5172 + }, + { + "epoch": 1.5018144868631151, + "grad_norm": 3.6578376293182373, + "learning_rate": 8.381459688932674e-06, + "loss": 0.9244, + "step": 5173 + }, + { + "epoch": 1.5021048047612136, + "grad_norm": 3.397042989730835, + "learning_rate": 8.38075230858277e-06, + "loss": 0.7623, + "step": 5174 + }, + { + "epoch": 1.502395122659312, + "grad_norm": 3.3813118934631348, + "learning_rate": 8.38004480355206e-06, + "loss": 0.6921, + "step": 5175 + }, + { + "epoch": 1.5026854405574104, + "grad_norm": 
3.190324306488037, + "learning_rate": 8.379337173866642e-06, + "loss": 0.7639, + "step": 5176 + }, + { + "epoch": 1.5029757584555088, + "grad_norm": 3.264589786529541, + "learning_rate": 8.37862941955261e-06, + "loss": 0.7104, + "step": 5177 + }, + { + "epoch": 1.5032660763536072, + "grad_norm": 3.360027551651001, + "learning_rate": 8.377921540636062e-06, + "loss": 0.7006, + "step": 5178 + }, + { + "epoch": 1.5035563942517056, + "grad_norm": 3.662677049636841, + "learning_rate": 8.37721353714311e-06, + "loss": 0.7836, + "step": 5179 + }, + { + "epoch": 1.503846712149804, + "grad_norm": 3.459056854248047, + "learning_rate": 8.376505409099865e-06, + "loss": 0.7282, + "step": 5180 + }, + { + "epoch": 1.5041370300479024, + "grad_norm": 3.67429256439209, + "learning_rate": 8.375797156532436e-06, + "loss": 0.8587, + "step": 5181 + }, + { + "epoch": 1.5044273479460009, + "grad_norm": 3.853055715560913, + "learning_rate": 8.375088779466953e-06, + "loss": 0.8487, + "step": 5182 + }, + { + "epoch": 1.5047176658440993, + "grad_norm": 3.8401939868927, + "learning_rate": 8.374380277929532e-06, + "loss": 0.9323, + "step": 5183 + }, + { + "epoch": 1.5050079837421977, + "grad_norm": 3.4322612285614014, + "learning_rate": 8.373671651946306e-06, + "loss": 0.7913, + "step": 5184 + }, + { + "epoch": 1.505298301640296, + "grad_norm": 3.570939064025879, + "learning_rate": 8.372962901543409e-06, + "loss": 0.8441, + "step": 5185 + }, + { + "epoch": 1.5055886195383945, + "grad_norm": 3.5912673473358154, + "learning_rate": 8.372254026746977e-06, + "loss": 0.7455, + "step": 5186 + }, + { + "epoch": 1.505878937436493, + "grad_norm": 3.4715113639831543, + "learning_rate": 8.371545027583154e-06, + "loss": 0.7535, + "step": 5187 + }, + { + "epoch": 1.5061692553345913, + "grad_norm": 3.537951707839966, + "learning_rate": 8.370835904078092e-06, + "loss": 0.7693, + "step": 5188 + }, + { + "epoch": 1.5064595732326898, + "grad_norm": 3.579514265060425, + "learning_rate": 8.370126656257938e-06, + "loss": 0.8167, + "step": 5189 + }, + { + "epoch": 1.5067498911307882, + "grad_norm": 3.547579050064087, + "learning_rate": 8.369417284148849e-06, + "loss": 0.721, + "step": 5190 + }, + { + "epoch": 1.5070402090288866, + "grad_norm": 3.8502068519592285, + "learning_rate": 8.368707787776988e-06, + "loss": 0.8689, + "step": 5191 + }, + { + "epoch": 1.5073305269269852, + "grad_norm": 3.8900763988494873, + "learning_rate": 8.367998167168521e-06, + "loss": 0.8262, + "step": 5192 + }, + { + "epoch": 1.5076208448250834, + "grad_norm": 3.818483591079712, + "learning_rate": 8.367288422349617e-06, + "loss": 0.8494, + "step": 5193 + }, + { + "epoch": 1.507911162723182, + "grad_norm": 3.8860888481140137, + "learning_rate": 8.366578553346455e-06, + "loss": 0.881, + "step": 5194 + }, + { + "epoch": 1.5082014806212802, + "grad_norm": 3.470583915710449, + "learning_rate": 8.365868560185209e-06, + "loss": 0.7415, + "step": 5195 + }, + { + "epoch": 1.5084917985193789, + "grad_norm": 3.4204583168029785, + "learning_rate": 8.365158442892069e-06, + "loss": 0.7979, + "step": 5196 + }, + { + "epoch": 1.508782116417477, + "grad_norm": 4.06003999710083, + "learning_rate": 8.36444820149322e-06, + "loss": 0.8262, + "step": 5197 + }, + { + "epoch": 1.5090724343155757, + "grad_norm": 3.7327427864074707, + "learning_rate": 8.363737836014855e-06, + "loss": 0.8375, + "step": 5198 + }, + { + "epoch": 1.5093627522136739, + "grad_norm": 3.577173948287964, + "learning_rate": 8.363027346483174e-06, + "loss": 0.8699, + "step": 5199 + }, + { + "epoch": 1.5096530701117725, 
+ "grad_norm": 3.675419569015503, + "learning_rate": 8.36231673292438e-06, + "loss": 0.7442, + "step": 5200 + }, + { + "epoch": 1.5099433880098707, + "grad_norm": 3.533881425857544, + "learning_rate": 8.36160599536468e-06, + "loss": 0.8418, + "step": 5201 + }, + { + "epoch": 1.5102337059079693, + "grad_norm": 3.6481122970581055, + "learning_rate": 8.360895133830284e-06, + "loss": 0.7421, + "step": 5202 + }, + { + "epoch": 1.5105240238060675, + "grad_norm": 3.723921298980713, + "learning_rate": 8.360184148347409e-06, + "loss": 0.7413, + "step": 5203 + }, + { + "epoch": 1.5108143417041662, + "grad_norm": 3.4912638664245605, + "learning_rate": 8.359473038942275e-06, + "loss": 0.834, + "step": 5204 + }, + { + "epoch": 1.5111046596022644, + "grad_norm": 3.025022506713867, + "learning_rate": 8.358761805641109e-06, + "loss": 0.64, + "step": 5205 + }, + { + "epoch": 1.511394977500363, + "grad_norm": 3.581099033355713, + "learning_rate": 8.358050448470143e-06, + "loss": 0.8429, + "step": 5206 + }, + { + "epoch": 1.5116852953984612, + "grad_norm": 3.6286072731018066, + "learning_rate": 8.357338967455605e-06, + "loss": 0.778, + "step": 5207 + }, + { + "epoch": 1.5119756132965598, + "grad_norm": 3.345937967300415, + "learning_rate": 8.356627362623742e-06, + "loss": 0.845, + "step": 5208 + }, + { + "epoch": 1.512265931194658, + "grad_norm": 3.4083850383758545, + "learning_rate": 8.35591563400079e-06, + "loss": 0.842, + "step": 5209 + }, + { + "epoch": 1.5125562490927567, + "grad_norm": 3.297445774078369, + "learning_rate": 8.355203781613004e-06, + "loss": 0.6617, + "step": 5210 + }, + { + "epoch": 1.5128465669908548, + "grad_norm": 3.6352899074554443, + "learning_rate": 8.354491805486633e-06, + "loss": 0.8348, + "step": 5211 + }, + { + "epoch": 1.5131368848889535, + "grad_norm": 3.588831663131714, + "learning_rate": 8.353779705647936e-06, + "loss": 0.8025, + "step": 5212 + }, + { + "epoch": 1.5134272027870517, + "grad_norm": 3.7391092777252197, + "learning_rate": 8.353067482123174e-06, + "loss": 0.77, + "step": 5213 + }, + { + "epoch": 1.5137175206851503, + "grad_norm": 4.033049583435059, + "learning_rate": 8.352355134938615e-06, + "loss": 0.8908, + "step": 5214 + }, + { + "epoch": 1.5140078385832485, + "grad_norm": 3.4990336894989014, + "learning_rate": 8.351642664120527e-06, + "loss": 0.6708, + "step": 5215 + }, + { + "epoch": 1.5142981564813471, + "grad_norm": 3.54728627204895, + "learning_rate": 8.35093006969519e-06, + "loss": 0.7484, + "step": 5216 + }, + { + "epoch": 1.5145884743794453, + "grad_norm": 3.496731758117676, + "learning_rate": 8.35021735168888e-06, + "loss": 0.7755, + "step": 5217 + }, + { + "epoch": 1.514878792277544, + "grad_norm": 3.043483257293701, + "learning_rate": 8.349504510127884e-06, + "loss": 0.675, + "step": 5218 + }, + { + "epoch": 1.5151691101756424, + "grad_norm": 3.824181079864502, + "learning_rate": 8.34879154503849e-06, + "loss": 0.8784, + "step": 5219 + }, + { + "epoch": 1.5154594280737408, + "grad_norm": 3.797044515609741, + "learning_rate": 8.348078456446992e-06, + "loss": 0.9087, + "step": 5220 + }, + { + "epoch": 1.5157497459718392, + "grad_norm": 3.5474209785461426, + "learning_rate": 8.347365244379693e-06, + "loss": 0.7362, + "step": 5221 + }, + { + "epoch": 1.5160400638699376, + "grad_norm": 3.3133018016815186, + "learning_rate": 8.346651908862888e-06, + "loss": 0.668, + "step": 5222 + }, + { + "epoch": 1.516330381768036, + "grad_norm": 3.8621597290039062, + "learning_rate": 8.345938449922892e-06, + "loss": 0.8732, + "step": 5223 + }, + { + "epoch": 
1.5166206996661344, + "grad_norm": 3.851616382598877, + "learning_rate": 8.345224867586012e-06, + "loss": 0.927, + "step": 5224 + }, + { + "epoch": 1.5169110175642329, + "grad_norm": 3.438823938369751, + "learning_rate": 8.344511161878567e-06, + "loss": 0.7236, + "step": 5225 + }, + { + "epoch": 1.5172013354623313, + "grad_norm": 3.7797598838806152, + "learning_rate": 8.343797332826877e-06, + "loss": 0.8414, + "step": 5226 + }, + { + "epoch": 1.5174916533604297, + "grad_norm": 3.4886631965637207, + "learning_rate": 8.343083380457269e-06, + "loss": 0.8468, + "step": 5227 + }, + { + "epoch": 1.517781971258528, + "grad_norm": 3.6253554821014404, + "learning_rate": 8.342369304796072e-06, + "loss": 0.8223, + "step": 5228 + }, + { + "epoch": 1.5180722891566265, + "grad_norm": 3.529344081878662, + "learning_rate": 8.341655105869622e-06, + "loss": 0.7312, + "step": 5229 + }, + { + "epoch": 1.518362607054725, + "grad_norm": 3.156813383102417, + "learning_rate": 8.340940783704257e-06, + "loss": 0.7537, + "step": 5230 + }, + { + "epoch": 1.5186529249528233, + "grad_norm": 3.1911001205444336, + "learning_rate": 8.340226338326321e-06, + "loss": 0.7023, + "step": 5231 + }, + { + "epoch": 1.5189432428509217, + "grad_norm": 3.402534246444702, + "learning_rate": 8.339511769762166e-06, + "loss": 0.822, + "step": 5232 + }, + { + "epoch": 1.5192335607490202, + "grad_norm": 3.5590410232543945, + "learning_rate": 8.338797078038139e-06, + "loss": 0.8028, + "step": 5233 + }, + { + "epoch": 1.5195238786471186, + "grad_norm": 3.573758840560913, + "learning_rate": 8.338082263180602e-06, + "loss": 0.8886, + "step": 5234 + }, + { + "epoch": 1.519814196545217, + "grad_norm": 3.4216904640197754, + "learning_rate": 8.337367325215917e-06, + "loss": 0.7472, + "step": 5235 + }, + { + "epoch": 1.5201045144433154, + "grad_norm": 3.222221851348877, + "learning_rate": 8.336652264170447e-06, + "loss": 0.816, + "step": 5236 + }, + { + "epoch": 1.5203948323414138, + "grad_norm": 3.748356342315674, + "learning_rate": 8.335937080070567e-06, + "loss": 0.7983, + "step": 5237 + }, + { + "epoch": 1.5206851502395122, + "grad_norm": 3.761164903640747, + "learning_rate": 8.335221772942652e-06, + "loss": 0.7149, + "step": 5238 + }, + { + "epoch": 1.5209754681376106, + "grad_norm": 3.916020393371582, + "learning_rate": 8.334506342813081e-06, + "loss": 0.9039, + "step": 5239 + }, + { + "epoch": 1.521265786035709, + "grad_norm": 4.239342212677002, + "learning_rate": 8.333790789708238e-06, + "loss": 0.8167, + "step": 5240 + }, + { + "epoch": 1.5215561039338075, + "grad_norm": 3.4121599197387695, + "learning_rate": 8.333075113654516e-06, + "loss": 0.8067, + "step": 5241 + }, + { + "epoch": 1.5218464218319059, + "grad_norm": 3.260080575942993, + "learning_rate": 8.332359314678306e-06, + "loss": 0.7618, + "step": 5242 + }, + { + "epoch": 1.5221367397300045, + "grad_norm": 3.816723346710205, + "learning_rate": 8.331643392806006e-06, + "loss": 0.7984, + "step": 5243 + }, + { + "epoch": 1.5224270576281027, + "grad_norm": 3.672610282897949, + "learning_rate": 8.33092734806402e-06, + "loss": 0.7288, + "step": 5244 + }, + { + "epoch": 1.5227173755262013, + "grad_norm": 3.4358227252960205, + "learning_rate": 8.330211180478754e-06, + "loss": 0.6884, + "step": 5245 + }, + { + "epoch": 1.5230076934242995, + "grad_norm": 3.3935177326202393, + "learning_rate": 8.329494890076623e-06, + "loss": 0.7529, + "step": 5246 + }, + { + "epoch": 1.5232980113223982, + "grad_norm": 3.87324595451355, + "learning_rate": 8.32877847688404e-06, + "loss": 0.8658, + "step": 5247 
+ }, + { + "epoch": 1.5235883292204964, + "grad_norm": 3.859293222427368, + "learning_rate": 8.32806194092743e-06, + "loss": 0.8443, + "step": 5248 + }, + { + "epoch": 1.523878647118595, + "grad_norm": 3.1775994300842285, + "learning_rate": 8.327345282233217e-06, + "loss": 0.6733, + "step": 5249 + }, + { + "epoch": 1.5241689650166932, + "grad_norm": 3.546396255493164, + "learning_rate": 8.326628500827826e-06, + "loss": 0.739, + "step": 5250 + }, + { + "epoch": 1.5244592829147918, + "grad_norm": 3.3907859325408936, + "learning_rate": 8.3259115967377e-06, + "loss": 0.8505, + "step": 5251 + }, + { + "epoch": 1.52474960081289, + "grad_norm": 3.738556146621704, + "learning_rate": 8.325194569989273e-06, + "loss": 0.8121, + "step": 5252 + }, + { + "epoch": 1.5250399187109887, + "grad_norm": 3.676562547683716, + "learning_rate": 8.324477420608989e-06, + "loss": 0.7887, + "step": 5253 + }, + { + "epoch": 1.5253302366090868, + "grad_norm": 3.9967105388641357, + "learning_rate": 8.323760148623298e-06, + "loss": 0.9404, + "step": 5254 + }, + { + "epoch": 1.5256205545071855, + "grad_norm": 3.4851815700531006, + "learning_rate": 8.323042754058652e-06, + "loss": 0.7178, + "step": 5255 + }, + { + "epoch": 1.5259108724052837, + "grad_norm": 3.8005199432373047, + "learning_rate": 8.322325236941507e-06, + "loss": 0.8294, + "step": 5256 + }, + { + "epoch": 1.5262011903033823, + "grad_norm": 3.8124680519104004, + "learning_rate": 8.321607597298326e-06, + "loss": 0.8139, + "step": 5257 + }, + { + "epoch": 1.5264915082014805, + "grad_norm": 3.2474987506866455, + "learning_rate": 8.320889835155577e-06, + "loss": 0.7376, + "step": 5258 + }, + { + "epoch": 1.5267818260995791, + "grad_norm": 3.5540499687194824, + "learning_rate": 8.320171950539726e-06, + "loss": 0.7025, + "step": 5259 + }, + { + "epoch": 1.5270721439976773, + "grad_norm": 3.8556888103485107, + "learning_rate": 8.319453943477252e-06, + "loss": 0.8861, + "step": 5260 + }, + { + "epoch": 1.527362461895776, + "grad_norm": 3.5100462436676025, + "learning_rate": 8.318735813994633e-06, + "loss": 0.8822, + "step": 5261 + }, + { + "epoch": 1.5276527797938741, + "grad_norm": 3.7765297889709473, + "learning_rate": 8.318017562118354e-06, + "loss": 0.908, + "step": 5262 + }, + { + "epoch": 1.5279430976919728, + "grad_norm": 3.2733256816864014, + "learning_rate": 8.317299187874906e-06, + "loss": 0.7915, + "step": 5263 + }, + { + "epoch": 1.528233415590071, + "grad_norm": 3.604302167892456, + "learning_rate": 8.31658069129078e-06, + "loss": 0.7866, + "step": 5264 + }, + { + "epoch": 1.5285237334881696, + "grad_norm": 3.134767532348633, + "learning_rate": 8.315862072392471e-06, + "loss": 0.7795, + "step": 5265 + }, + { + "epoch": 1.5288140513862678, + "grad_norm": 3.620120048522949, + "learning_rate": 8.315143331206488e-06, + "loss": 0.8672, + "step": 5266 + }, + { + "epoch": 1.5291043692843664, + "grad_norm": 3.1708273887634277, + "learning_rate": 8.314424467759334e-06, + "loss": 0.7367, + "step": 5267 + }, + { + "epoch": 1.5293946871824649, + "grad_norm": 3.5994269847869873, + "learning_rate": 8.313705482077521e-06, + "loss": 0.934, + "step": 5268 + }, + { + "epoch": 1.5296850050805633, + "grad_norm": 3.8919529914855957, + "learning_rate": 8.312986374187563e-06, + "loss": 0.7943, + "step": 5269 + }, + { + "epoch": 1.5299753229786617, + "grad_norm": 3.3385488986968994, + "learning_rate": 8.312267144115984e-06, + "loss": 0.6968, + "step": 5270 + }, + { + "epoch": 1.53026564087676, + "grad_norm": 3.434603691101074, + "learning_rate": 8.311547791889307e-06, + "loss": 
0.713, + "step": 5271 + }, + { + "epoch": 1.5305559587748585, + "grad_norm": 3.5553691387176514, + "learning_rate": 8.310828317534061e-06, + "loss": 0.78, + "step": 5272 + }, + { + "epoch": 1.530846276672957, + "grad_norm": 3.590174436569214, + "learning_rate": 8.310108721076782e-06, + "loss": 0.8297, + "step": 5273 + }, + { + "epoch": 1.5311365945710553, + "grad_norm": 3.4327259063720703, + "learning_rate": 8.309389002544005e-06, + "loss": 0.665, + "step": 5274 + }, + { + "epoch": 1.5314269124691537, + "grad_norm": 3.5836644172668457, + "learning_rate": 8.308669161962275e-06, + "loss": 0.908, + "step": 5275 + }, + { + "epoch": 1.5317172303672522, + "grad_norm": 3.7336366176605225, + "learning_rate": 8.30794919935814e-06, + "loss": 0.9022, + "step": 5276 + }, + { + "epoch": 1.5320075482653506, + "grad_norm": 3.6512033939361572, + "learning_rate": 8.307229114758151e-06, + "loss": 0.8058, + "step": 5277 + }, + { + "epoch": 1.532297866163449, + "grad_norm": 3.5464484691619873, + "learning_rate": 8.306508908188866e-06, + "loss": 0.7925, + "step": 5278 + }, + { + "epoch": 1.5325881840615474, + "grad_norm": 3.1770904064178467, + "learning_rate": 8.305788579676843e-06, + "loss": 0.7042, + "step": 5279 + }, + { + "epoch": 1.5328785019596458, + "grad_norm": 3.586550712585449, + "learning_rate": 8.30506812924865e-06, + "loss": 0.8136, + "step": 5280 + }, + { + "epoch": 1.5331688198577442, + "grad_norm": 3.467254638671875, + "learning_rate": 8.304347556930856e-06, + "loss": 0.7584, + "step": 5281 + }, + { + "epoch": 1.5334591377558426, + "grad_norm": 2.9315671920776367, + "learning_rate": 8.303626862750034e-06, + "loss": 0.6456, + "step": 5282 + }, + { + "epoch": 1.533749455653941, + "grad_norm": 3.198570966720581, + "learning_rate": 8.302906046732766e-06, + "loss": 0.7304, + "step": 5283 + }, + { + "epoch": 1.5340397735520395, + "grad_norm": 3.693838596343994, + "learning_rate": 8.302185108905632e-06, + "loss": 0.8126, + "step": 5284 + }, + { + "epoch": 1.5343300914501379, + "grad_norm": 3.353278875350952, + "learning_rate": 8.301464049295224e-06, + "loss": 0.7881, + "step": 5285 + }, + { + "epoch": 1.5346204093482363, + "grad_norm": 3.5820326805114746, + "learning_rate": 8.300742867928128e-06, + "loss": 0.8091, + "step": 5286 + }, + { + "epoch": 1.5349107272463347, + "grad_norm": 3.771308422088623, + "learning_rate": 8.300021564830949e-06, + "loss": 0.7514, + "step": 5287 + }, + { + "epoch": 1.5352010451444331, + "grad_norm": 3.5976288318634033, + "learning_rate": 8.299300140030283e-06, + "loss": 0.718, + "step": 5288 + }, + { + "epoch": 1.5354913630425315, + "grad_norm": 3.889220714569092, + "learning_rate": 8.298578593552737e-06, + "loss": 1.033, + "step": 5289 + }, + { + "epoch": 1.53578168094063, + "grad_norm": 3.3892271518707275, + "learning_rate": 8.29785692542492e-06, + "loss": 0.766, + "step": 5290 + }, + { + "epoch": 1.5360719988387284, + "grad_norm": 3.569516658782959, + "learning_rate": 8.297135135673451e-06, + "loss": 0.8218, + "step": 5291 + }, + { + "epoch": 1.5363623167368268, + "grad_norm": 3.7338545322418213, + "learning_rate": 8.296413224324944e-06, + "loss": 0.8123, + "step": 5292 + }, + { + "epoch": 1.5366526346349252, + "grad_norm": 3.023319959640503, + "learning_rate": 8.295691191406029e-06, + "loss": 0.6148, + "step": 5293 + }, + { + "epoch": 1.5369429525330238, + "grad_norm": 4.053857803344727, + "learning_rate": 8.294969036943328e-06, + "loss": 0.8692, + "step": 5294 + }, + { + "epoch": 1.537233270431122, + "grad_norm": 3.6890289783477783, + "learning_rate": 
8.294246760963477e-06, + "loss": 0.8347, + "step": 5295 + }, + { + "epoch": 1.5375235883292206, + "grad_norm": 3.724935531616211, + "learning_rate": 8.29352436349311e-06, + "loss": 0.7793, + "step": 5296 + }, + { + "epoch": 1.5378139062273188, + "grad_norm": 3.5507402420043945, + "learning_rate": 8.292801844558875e-06, + "loss": 0.7672, + "step": 5297 + }, + { + "epoch": 1.5381042241254175, + "grad_norm": 3.179330348968506, + "learning_rate": 8.292079204187415e-06, + "loss": 0.6646, + "step": 5298 + }, + { + "epoch": 1.5383945420235157, + "grad_norm": 3.5478248596191406, + "learning_rate": 8.291356442405379e-06, + "loss": 0.8077, + "step": 5299 + }, + { + "epoch": 1.5386848599216143, + "grad_norm": 3.900157928466797, + "learning_rate": 8.290633559239422e-06, + "loss": 0.8082, + "step": 5300 + }, + { + "epoch": 1.5389751778197125, + "grad_norm": 3.41748046875, + "learning_rate": 8.289910554716208e-06, + "loss": 0.8316, + "step": 5301 + }, + { + "epoch": 1.5392654957178111, + "grad_norm": 3.5558431148529053, + "learning_rate": 8.289187428862398e-06, + "loss": 0.7715, + "step": 5302 + }, + { + "epoch": 1.5395558136159093, + "grad_norm": 3.833019495010376, + "learning_rate": 8.28846418170466e-06, + "loss": 0.9359, + "step": 5303 + }, + { + "epoch": 1.539846131514008, + "grad_norm": 3.508436679840088, + "learning_rate": 8.287740813269666e-06, + "loss": 0.7736, + "step": 5304 + }, + { + "epoch": 1.5401364494121061, + "grad_norm": 3.5869967937469482, + "learning_rate": 8.2870173235841e-06, + "loss": 0.8337, + "step": 5305 + }, + { + "epoch": 1.5404267673102048, + "grad_norm": 3.26682448387146, + "learning_rate": 8.286293712674636e-06, + "loss": 0.854, + "step": 5306 + }, + { + "epoch": 1.540717085208303, + "grad_norm": 3.3529770374298096, + "learning_rate": 8.285569980567965e-06, + "loss": 0.6657, + "step": 5307 + }, + { + "epoch": 1.5410074031064016, + "grad_norm": 3.2348685264587402, + "learning_rate": 8.284846127290778e-06, + "loss": 0.7903, + "step": 5308 + }, + { + "epoch": 1.5412977210044998, + "grad_norm": 2.985450267791748, + "learning_rate": 8.284122152869766e-06, + "loss": 0.6562, + "step": 5309 + }, + { + "epoch": 1.5415880389025984, + "grad_norm": 3.228339433670044, + "learning_rate": 8.283398057331636e-06, + "loss": 0.7334, + "step": 5310 + }, + { + "epoch": 1.5418783568006966, + "grad_norm": 3.838925361633301, + "learning_rate": 8.282673840703088e-06, + "loss": 0.8747, + "step": 5311 + }, + { + "epoch": 1.5421686746987953, + "grad_norm": 3.7266595363616943, + "learning_rate": 8.28194950301083e-06, + "loss": 0.8117, + "step": 5312 + }, + { + "epoch": 1.5424589925968935, + "grad_norm": 3.6524641513824463, + "learning_rate": 8.281225044281578e-06, + "loss": 0.6544, + "step": 5313 + }, + { + "epoch": 1.542749310494992, + "grad_norm": 3.4742302894592285, + "learning_rate": 8.280500464542047e-06, + "loss": 0.7832, + "step": 5314 + }, + { + "epoch": 1.5430396283930903, + "grad_norm": 3.4193482398986816, + "learning_rate": 8.279775763818962e-06, + "loss": 0.7379, + "step": 5315 + }, + { + "epoch": 1.543329946291189, + "grad_norm": 3.552457571029663, + "learning_rate": 8.279050942139048e-06, + "loss": 0.7344, + "step": 5316 + }, + { + "epoch": 1.543620264189287, + "grad_norm": 3.5372767448425293, + "learning_rate": 8.278325999529037e-06, + "loss": 0.8419, + "step": 5317 + }, + { + "epoch": 1.5439105820873857, + "grad_norm": 3.67195725440979, + "learning_rate": 8.277600936015663e-06, + "loss": 0.8765, + "step": 5318 + }, + { + "epoch": 1.5442008999854842, + "grad_norm": 3.4521541595458984, + 
"learning_rate": 8.276875751625669e-06, + "loss": 0.775, + "step": 5319 + }, + { + "epoch": 1.5444912178835826, + "grad_norm": 2.988212823867798, + "learning_rate": 8.276150446385796e-06, + "loss": 0.6954, + "step": 5320 + }, + { + "epoch": 1.544781535781681, + "grad_norm": 3.3321187496185303, + "learning_rate": 8.275425020322794e-06, + "loss": 0.6975, + "step": 5321 + }, + { + "epoch": 1.5450718536797794, + "grad_norm": 3.7323224544525146, + "learning_rate": 8.274699473463417e-06, + "loss": 0.7937, + "step": 5322 + }, + { + "epoch": 1.5453621715778778, + "grad_norm": 3.2482450008392334, + "learning_rate": 8.273973805834425e-06, + "loss": 0.7083, + "step": 5323 + }, + { + "epoch": 1.5456524894759762, + "grad_norm": 3.227125883102417, + "learning_rate": 8.273248017462579e-06, + "loss": 0.7372, + "step": 5324 + }, + { + "epoch": 1.5459428073740746, + "grad_norm": 3.8536081314086914, + "learning_rate": 8.272522108374643e-06, + "loss": 0.7417, + "step": 5325 + }, + { + "epoch": 1.546233125272173, + "grad_norm": 3.9545321464538574, + "learning_rate": 8.27179607859739e-06, + "loss": 0.759, + "step": 5326 + }, + { + "epoch": 1.5465234431702715, + "grad_norm": 3.3392481803894043, + "learning_rate": 8.271069928157595e-06, + "loss": 0.7807, + "step": 5327 + }, + { + "epoch": 1.5468137610683699, + "grad_norm": 3.7387235164642334, + "learning_rate": 8.270343657082043e-06, + "loss": 0.7683, + "step": 5328 + }, + { + "epoch": 1.5471040789664683, + "grad_norm": 3.2074947357177734, + "learning_rate": 8.26961726539751e-06, + "loss": 0.7249, + "step": 5329 + }, + { + "epoch": 1.5473943968645667, + "grad_norm": 3.8873088359832764, + "learning_rate": 8.268890753130794e-06, + "loss": 0.8258, + "step": 5330 + }, + { + "epoch": 1.5476847147626651, + "grad_norm": 3.9215521812438965, + "learning_rate": 8.268164120308684e-06, + "loss": 0.836, + "step": 5331 + }, + { + "epoch": 1.5479750326607635, + "grad_norm": 3.316826581954956, + "learning_rate": 8.267437366957976e-06, + "loss": 0.7363, + "step": 5332 + }, + { + "epoch": 1.548265350558862, + "grad_norm": 3.273144245147705, + "learning_rate": 8.266710493105476e-06, + "loss": 0.7226, + "step": 5333 + }, + { + "epoch": 1.5485556684569604, + "grad_norm": 3.902099132537842, + "learning_rate": 8.265983498777987e-06, + "loss": 0.845, + "step": 5334 + }, + { + "epoch": 1.5488459863550588, + "grad_norm": 3.659940004348755, + "learning_rate": 8.265256384002326e-06, + "loss": 0.7165, + "step": 5335 + }, + { + "epoch": 1.5491363042531572, + "grad_norm": 3.8005053997039795, + "learning_rate": 8.264529148805303e-06, + "loss": 0.854, + "step": 5336 + }, + { + "epoch": 1.5494266221512556, + "grad_norm": 3.4792816638946533, + "learning_rate": 8.26380179321374e-06, + "loss": 0.8514, + "step": 5337 + }, + { + "epoch": 1.549716940049354, + "grad_norm": 3.3794267177581787, + "learning_rate": 8.263074317254465e-06, + "loss": 0.7644, + "step": 5338 + }, + { + "epoch": 1.5500072579474524, + "grad_norm": 3.5877439975738525, + "learning_rate": 8.262346720954302e-06, + "loss": 0.6902, + "step": 5339 + }, + { + "epoch": 1.5502975758455508, + "grad_norm": 3.7112104892730713, + "learning_rate": 8.261619004340086e-06, + "loss": 0.7891, + "step": 5340 + }, + { + "epoch": 1.5505878937436492, + "grad_norm": 3.597099542617798, + "learning_rate": 8.260891167438655e-06, + "loss": 0.8692, + "step": 5341 + }, + { + "epoch": 1.5508782116417477, + "grad_norm": 3.904702663421631, + "learning_rate": 8.260163210276856e-06, + "loss": 0.9059, + "step": 5342 + }, + { + "epoch": 1.5511685295398463, + 
"grad_norm": 3.292292833328247, + "learning_rate": 8.259435132881528e-06, + "loss": 0.6733, + "step": 5343 + }, + { + "epoch": 1.5514588474379445, + "grad_norm": 3.1722826957702637, + "learning_rate": 8.258706935279526e-06, + "loss": 0.7296, + "step": 5344 + }, + { + "epoch": 1.5517491653360431, + "grad_norm": 3.7739975452423096, + "learning_rate": 8.257978617497706e-06, + "loss": 0.8633, + "step": 5345 + }, + { + "epoch": 1.5520394832341413, + "grad_norm": 3.7184388637542725, + "learning_rate": 8.257250179562926e-06, + "loss": 0.8095, + "step": 5346 + }, + { + "epoch": 1.55232980113224, + "grad_norm": 3.367509603500366, + "learning_rate": 8.256521621502053e-06, + "loss": 0.7923, + "step": 5347 + }, + { + "epoch": 1.5526201190303381, + "grad_norm": 3.6302716732025146, + "learning_rate": 8.255792943341957e-06, + "loss": 0.7699, + "step": 5348 + }, + { + "epoch": 1.5529104369284368, + "grad_norm": 3.957557439804077, + "learning_rate": 8.255064145109507e-06, + "loss": 0.8685, + "step": 5349 + }, + { + "epoch": 1.553200754826535, + "grad_norm": 3.2462220191955566, + "learning_rate": 8.254335226831582e-06, + "loss": 0.7029, + "step": 5350 + }, + { + "epoch": 1.5534910727246336, + "grad_norm": 3.4993910789489746, + "learning_rate": 8.253606188535068e-06, + "loss": 0.8325, + "step": 5351 + }, + { + "epoch": 1.5537813906227318, + "grad_norm": 3.787658452987671, + "learning_rate": 8.252877030246848e-06, + "loss": 0.8423, + "step": 5352 + }, + { + "epoch": 1.5540717085208304, + "grad_norm": 3.5158355236053467, + "learning_rate": 8.252147751993813e-06, + "loss": 0.7064, + "step": 5353 + }, + { + "epoch": 1.5543620264189286, + "grad_norm": 3.4053237438201904, + "learning_rate": 8.25141835380286e-06, + "loss": 0.7082, + "step": 5354 + }, + { + "epoch": 1.5546523443170273, + "grad_norm": 3.5351498126983643, + "learning_rate": 8.25068883570089e-06, + "loss": 0.8094, + "step": 5355 + }, + { + "epoch": 1.5549426622151254, + "grad_norm": 4.113193988800049, + "learning_rate": 8.249959197714803e-06, + "loss": 0.871, + "step": 5356 + }, + { + "epoch": 1.555232980113224, + "grad_norm": 3.213313341140747, + "learning_rate": 8.249229439871513e-06, + "loss": 0.7838, + "step": 5357 + }, + { + "epoch": 1.5555232980113223, + "grad_norm": 3.948580503463745, + "learning_rate": 8.248499562197929e-06, + "loss": 0.8546, + "step": 5358 + }, + { + "epoch": 1.555813615909421, + "grad_norm": 3.2688424587249756, + "learning_rate": 8.24776956472097e-06, + "loss": 0.6867, + "step": 5359 + }, + { + "epoch": 1.556103933807519, + "grad_norm": 3.9987499713897705, + "learning_rate": 8.24703944746756e-06, + "loss": 0.9382, + "step": 5360 + }, + { + "epoch": 1.5563942517056177, + "grad_norm": 3.3232181072235107, + "learning_rate": 8.246309210464623e-06, + "loss": 0.6795, + "step": 5361 + }, + { + "epoch": 1.556684569603716, + "grad_norm": 4.028323650360107, + "learning_rate": 8.24557885373909e-06, + "loss": 0.7453, + "step": 5362 + }, + { + "epoch": 1.5569748875018146, + "grad_norm": 3.309086561203003, + "learning_rate": 8.244848377317896e-06, + "loss": 0.7652, + "step": 5363 + }, + { + "epoch": 1.5572652053999128, + "grad_norm": 3.551588296890259, + "learning_rate": 8.244117781227982e-06, + "loss": 0.7157, + "step": 5364 + }, + { + "epoch": 1.5575555232980114, + "grad_norm": 3.302396535873413, + "learning_rate": 8.243387065496293e-06, + "loss": 0.7068, + "step": 5365 + }, + { + "epoch": 1.5578458411961096, + "grad_norm": 3.6382970809936523, + "learning_rate": 8.242656230149776e-06, + "loss": 0.7192, + "step": 5366 + }, + { + "epoch": 
1.5581361590942082, + "grad_norm": 3.7732627391815186, + "learning_rate": 8.241925275215384e-06, + "loss": 0.8809, + "step": 5367 + }, + { + "epoch": 1.5584264769923066, + "grad_norm": 3.7419416904449463, + "learning_rate": 8.241194200720073e-06, + "loss": 0.9588, + "step": 5368 + }, + { + "epoch": 1.558716794890405, + "grad_norm": 3.50207257270813, + "learning_rate": 8.240463006690807e-06, + "loss": 0.7929, + "step": 5369 + }, + { + "epoch": 1.5590071127885035, + "grad_norm": 3.7464301586151123, + "learning_rate": 8.239731693154552e-06, + "loss": 0.7807, + "step": 5370 + }, + { + "epoch": 1.5592974306866019, + "grad_norm": 3.450807809829712, + "learning_rate": 8.239000260138277e-06, + "loss": 0.819, + "step": 5371 + }, + { + "epoch": 1.5595877485847003, + "grad_norm": 3.783979654312134, + "learning_rate": 8.238268707668957e-06, + "loss": 0.7797, + "step": 5372 + }, + { + "epoch": 1.5598780664827987, + "grad_norm": 3.410276174545288, + "learning_rate": 8.237537035773572e-06, + "loss": 0.7907, + "step": 5373 + }, + { + "epoch": 1.560168384380897, + "grad_norm": 3.077827215194702, + "learning_rate": 8.236805244479109e-06, + "loss": 0.7487, + "step": 5374 + }, + { + "epoch": 1.5604587022789955, + "grad_norm": 3.5620744228363037, + "learning_rate": 8.23607333381255e-06, + "loss": 0.7937, + "step": 5375 + }, + { + "epoch": 1.560749020177094, + "grad_norm": 3.3524978160858154, + "learning_rate": 8.235341303800892e-06, + "loss": 0.657, + "step": 5376 + }, + { + "epoch": 1.5610393380751924, + "grad_norm": 3.698017120361328, + "learning_rate": 8.234609154471129e-06, + "loss": 0.8229, + "step": 5377 + }, + { + "epoch": 1.5613296559732908, + "grad_norm": 3.363804340362549, + "learning_rate": 8.233876885850265e-06, + "loss": 0.7087, + "step": 5378 + }, + { + "epoch": 1.5616199738713892, + "grad_norm": 3.8434033393859863, + "learning_rate": 8.233144497965306e-06, + "loss": 0.822, + "step": 5379 + }, + { + "epoch": 1.5619102917694876, + "grad_norm": 3.6037120819091797, + "learning_rate": 8.23241199084326e-06, + "loss": 0.7207, + "step": 5380 + }, + { + "epoch": 1.562200609667586, + "grad_norm": 3.8489432334899902, + "learning_rate": 8.231679364511142e-06, + "loss": 0.8636, + "step": 5381 + }, + { + "epoch": 1.5624909275656844, + "grad_norm": 3.7548909187316895, + "learning_rate": 8.230946618995972e-06, + "loss": 0.7164, + "step": 5382 + }, + { + "epoch": 1.5627812454637828, + "grad_norm": 3.570434808731079, + "learning_rate": 8.230213754324773e-06, + "loss": 0.7482, + "step": 5383 + }, + { + "epoch": 1.5630715633618812, + "grad_norm": 3.7216358184814453, + "learning_rate": 8.229480770524571e-06, + "loss": 0.7673, + "step": 5384 + }, + { + "epoch": 1.5633618812599797, + "grad_norm": 3.5830092430114746, + "learning_rate": 8.228747667622402e-06, + "loss": 0.7737, + "step": 5385 + }, + { + "epoch": 1.563652199158078, + "grad_norm": 3.990433931350708, + "learning_rate": 8.228014445645299e-06, + "loss": 0.7824, + "step": 5386 + }, + { + "epoch": 1.5639425170561765, + "grad_norm": 3.3041436672210693, + "learning_rate": 8.227281104620307e-06, + "loss": 0.8376, + "step": 5387 + }, + { + "epoch": 1.564232834954275, + "grad_norm": 3.908924102783203, + "learning_rate": 8.226547644574465e-06, + "loss": 0.8597, + "step": 5388 + }, + { + "epoch": 1.5645231528523733, + "grad_norm": 3.409175157546997, + "learning_rate": 8.225814065534827e-06, + "loss": 0.8483, + "step": 5389 + }, + { + "epoch": 1.5648134707504717, + "grad_norm": 3.362900733947754, + "learning_rate": 8.225080367528447e-06, + "loss": 0.7746, + "step": 
5390 + }, + { + "epoch": 1.5651037886485701, + "grad_norm": 4.050478458404541, + "learning_rate": 8.224346550582382e-06, + "loss": 0.8165, + "step": 5391 + }, + { + "epoch": 1.5653941065466686, + "grad_norm": 4.049386978149414, + "learning_rate": 8.223612614723697e-06, + "loss": 0.9072, + "step": 5392 + }, + { + "epoch": 1.565684424444767, + "grad_norm": 3.4654226303100586, + "learning_rate": 8.222878559979458e-06, + "loss": 0.8, + "step": 5393 + }, + { + "epoch": 1.5659747423428656, + "grad_norm": 3.154883861541748, + "learning_rate": 8.222144386376736e-06, + "loss": 0.7033, + "step": 5394 + }, + { + "epoch": 1.5662650602409638, + "grad_norm": 3.412895679473877, + "learning_rate": 8.221410093942608e-06, + "loss": 0.7621, + "step": 5395 + }, + { + "epoch": 1.5665553781390624, + "grad_norm": 4.3008928298950195, + "learning_rate": 8.220675682704153e-06, + "loss": 0.9183, + "step": 5396 + }, + { + "epoch": 1.5668456960371606, + "grad_norm": 3.676053047180176, + "learning_rate": 8.219941152688459e-06, + "loss": 0.8163, + "step": 5397 + }, + { + "epoch": 1.5671360139352593, + "grad_norm": 3.78293776512146, + "learning_rate": 8.219206503922612e-06, + "loss": 0.854, + "step": 5398 + }, + { + "epoch": 1.5674263318333574, + "grad_norm": 3.3566396236419678, + "learning_rate": 8.218471736433706e-06, + "loss": 0.8328, + "step": 5399 + }, + { + "epoch": 1.567716649731456, + "grad_norm": 4.530660629272461, + "learning_rate": 8.217736850248841e-06, + "loss": 0.7557, + "step": 5400 + }, + { + "epoch": 1.5680069676295543, + "grad_norm": 3.4996469020843506, + "learning_rate": 8.217001845395118e-06, + "loss": 0.7436, + "step": 5401 + }, + { + "epoch": 1.568297285527653, + "grad_norm": 3.5606935024261475, + "learning_rate": 8.216266721899642e-06, + "loss": 0.7685, + "step": 5402 + }, + { + "epoch": 1.568587603425751, + "grad_norm": 3.8874683380126953, + "learning_rate": 8.215531479789527e-06, + "loss": 0.8437, + "step": 5403 + }, + { + "epoch": 1.5688779213238497, + "grad_norm": 3.3992443084716797, + "learning_rate": 8.214796119091886e-06, + "loss": 0.8176, + "step": 5404 + }, + { + "epoch": 1.569168239221948, + "grad_norm": 3.601271867752075, + "learning_rate": 8.21406063983384e-06, + "loss": 0.8506, + "step": 5405 + }, + { + "epoch": 1.5694585571200466, + "grad_norm": 3.4691638946533203, + "learning_rate": 8.213325042042512e-06, + "loss": 0.7478, + "step": 5406 + }, + { + "epoch": 1.5697488750181448, + "grad_norm": 3.978273868560791, + "learning_rate": 8.212589325745036e-06, + "loss": 0.9128, + "step": 5407 + }, + { + "epoch": 1.5700391929162434, + "grad_norm": 3.4004123210906982, + "learning_rate": 8.211853490968536e-06, + "loss": 0.7231, + "step": 5408 + }, + { + "epoch": 1.5703295108143416, + "grad_norm": 3.540611982345581, + "learning_rate": 8.211117537740154e-06, + "loss": 0.758, + "step": 5409 + }, + { + "epoch": 1.5706198287124402, + "grad_norm": 3.3469531536102295, + "learning_rate": 8.210381466087035e-06, + "loss": 0.6891, + "step": 5410 + }, + { + "epoch": 1.5709101466105384, + "grad_norm": 3.611398935317993, + "learning_rate": 8.209645276036318e-06, + "loss": 0.8158, + "step": 5411 + }, + { + "epoch": 1.571200464508637, + "grad_norm": 3.818127155303955, + "learning_rate": 8.208908967615159e-06, + "loss": 0.762, + "step": 5412 + }, + { + "epoch": 1.5714907824067352, + "grad_norm": 3.4710285663604736, + "learning_rate": 8.20817254085071e-06, + "loss": 0.7297, + "step": 5413 + }, + { + "epoch": 1.5717811003048339, + "grad_norm": 3.6411702632904053, + "learning_rate": 8.20743599577013e-06, + "loss": 
0.8483, + "step": 5414 + }, + { + "epoch": 1.572071418202932, + "grad_norm": 3.465782403945923, + "learning_rate": 8.206699332400585e-06, + "loss": 0.7407, + "step": 5415 + }, + { + "epoch": 1.5723617361010307, + "grad_norm": 3.5217745304107666, + "learning_rate": 8.20596255076924e-06, + "loss": 0.8056, + "step": 5416 + }, + { + "epoch": 1.5726520539991289, + "grad_norm": 3.9428863525390625, + "learning_rate": 8.205225650903269e-06, + "loss": 0.867, + "step": 5417 + }, + { + "epoch": 1.5729423718972275, + "grad_norm": 3.5911359786987305, + "learning_rate": 8.204488632829848e-06, + "loss": 0.8481, + "step": 5418 + }, + { + "epoch": 1.573232689795326, + "grad_norm": 3.63502836227417, + "learning_rate": 8.203751496576157e-06, + "loss": 0.8925, + "step": 5419 + }, + { + "epoch": 1.5735230076934243, + "grad_norm": 3.740027666091919, + "learning_rate": 8.203014242169382e-06, + "loss": 0.89, + "step": 5420 + }, + { + "epoch": 1.5738133255915228, + "grad_norm": 3.697819232940674, + "learning_rate": 8.202276869636713e-06, + "loss": 0.8272, + "step": 5421 + }, + { + "epoch": 1.5741036434896212, + "grad_norm": 3.058216094970703, + "learning_rate": 8.201539379005346e-06, + "loss": 0.8128, + "step": 5422 + }, + { + "epoch": 1.5743939613877196, + "grad_norm": 3.1359705924987793, + "learning_rate": 8.200801770302474e-06, + "loss": 0.6643, + "step": 5423 + }, + { + "epoch": 1.574684279285818, + "grad_norm": 3.386383533477783, + "learning_rate": 8.200064043555304e-06, + "loss": 0.6573, + "step": 5424 + }, + { + "epoch": 1.5749745971839164, + "grad_norm": 3.152573823928833, + "learning_rate": 8.199326198791044e-06, + "loss": 0.7556, + "step": 5425 + }, + { + "epoch": 1.5752649150820148, + "grad_norm": 3.3397903442382812, + "learning_rate": 8.198588236036902e-06, + "loss": 0.7253, + "step": 5426 + }, + { + "epoch": 1.5755552329801132, + "grad_norm": 3.6608428955078125, + "learning_rate": 8.197850155320094e-06, + "loss": 0.7888, + "step": 5427 + }, + { + "epoch": 1.5758455508782117, + "grad_norm": 3.4254817962646484, + "learning_rate": 8.197111956667842e-06, + "loss": 0.7963, + "step": 5428 + }, + { + "epoch": 1.57613586877631, + "grad_norm": 3.2243576049804688, + "learning_rate": 8.196373640107372e-06, + "loss": 0.6132, + "step": 5429 + }, + { + "epoch": 1.5764261866744085, + "grad_norm": 3.9535470008850098, + "learning_rate": 8.195635205665909e-06, + "loss": 0.8969, + "step": 5430 + }, + { + "epoch": 1.576716504572507, + "grad_norm": 3.825469970703125, + "learning_rate": 8.194896653370686e-06, + "loss": 0.7282, + "step": 5431 + }, + { + "epoch": 1.5770068224706053, + "grad_norm": 4.59237003326416, + "learning_rate": 8.194157983248943e-06, + "loss": 0.9332, + "step": 5432 + }, + { + "epoch": 1.5772971403687037, + "grad_norm": 3.5294547080993652, + "learning_rate": 8.193419195327923e-06, + "loss": 0.7861, + "step": 5433 + }, + { + "epoch": 1.5775874582668021, + "grad_norm": 3.565861701965332, + "learning_rate": 8.192680289634868e-06, + "loss": 0.7375, + "step": 5434 + }, + { + "epoch": 1.5778777761649005, + "grad_norm": 4.323357582092285, + "learning_rate": 8.191941266197032e-06, + "loss": 0.8921, + "step": 5435 + }, + { + "epoch": 1.578168094062999, + "grad_norm": 3.646151065826416, + "learning_rate": 8.19120212504167e-06, + "loss": 0.8274, + "step": 5436 + }, + { + "epoch": 1.5784584119610974, + "grad_norm": 3.351614236831665, + "learning_rate": 8.190462866196038e-06, + "loss": 0.8299, + "step": 5437 + }, + { + "epoch": 1.5787487298591958, + "grad_norm": 3.4705700874328613, + "learning_rate": 
8.189723489687404e-06, + "loss": 0.6837, + "step": 5438 + }, + { + "epoch": 1.5790390477572942, + "grad_norm": 4.0358991622924805, + "learning_rate": 8.188983995543031e-06, + "loss": 0.9315, + "step": 5439 + }, + { + "epoch": 1.5793293656553926, + "grad_norm": 3.4540350437164307, + "learning_rate": 8.188244383790196e-06, + "loss": 0.7148, + "step": 5440 + }, + { + "epoch": 1.579619683553491, + "grad_norm": 3.703850507736206, + "learning_rate": 8.187504654456171e-06, + "loss": 0.7906, + "step": 5441 + }, + { + "epoch": 1.5799100014515894, + "grad_norm": 3.2540676593780518, + "learning_rate": 8.18676480756824e-06, + "loss": 0.8288, + "step": 5442 + }, + { + "epoch": 1.5802003193496879, + "grad_norm": 3.3832411766052246, + "learning_rate": 8.186024843153689e-06, + "loss": 0.7643, + "step": 5443 + }, + { + "epoch": 1.5804906372477863, + "grad_norm": 3.6068215370178223, + "learning_rate": 8.185284761239805e-06, + "loss": 0.8474, + "step": 5444 + }, + { + "epoch": 1.580780955145885, + "grad_norm": 3.1546831130981445, + "learning_rate": 8.184544561853882e-06, + "loss": 0.703, + "step": 5445 + }, + { + "epoch": 1.581071273043983, + "grad_norm": 3.4897522926330566, + "learning_rate": 8.18380424502322e-06, + "loss": 0.8384, + "step": 5446 + }, + { + "epoch": 1.5813615909420817, + "grad_norm": 3.1088387966156006, + "learning_rate": 8.183063810775121e-06, + "loss": 0.7216, + "step": 5447 + }, + { + "epoch": 1.58165190884018, + "grad_norm": 3.126387596130371, + "learning_rate": 8.182323259136893e-06, + "loss": 0.7299, + "step": 5448 + }, + { + "epoch": 1.5819422267382786, + "grad_norm": 3.984802484512329, + "learning_rate": 8.181582590135846e-06, + "loss": 0.6985, + "step": 5449 + }, + { + "epoch": 1.5822325446363767, + "grad_norm": 3.2511186599731445, + "learning_rate": 8.180841803799293e-06, + "loss": 0.6762, + "step": 5450 + }, + { + "epoch": 1.5825228625344754, + "grad_norm": 3.4527862071990967, + "learning_rate": 8.180100900154559e-06, + "loss": 0.7734, + "step": 5451 + }, + { + "epoch": 1.5828131804325736, + "grad_norm": 3.6589744091033936, + "learning_rate": 8.179359879228966e-06, + "loss": 0.8921, + "step": 5452 + }, + { + "epoch": 1.5831034983306722, + "grad_norm": 3.0081589221954346, + "learning_rate": 8.178618741049841e-06, + "loss": 0.6017, + "step": 5453 + }, + { + "epoch": 1.5833938162287704, + "grad_norm": 3.805534839630127, + "learning_rate": 8.177877485644518e-06, + "loss": 0.8037, + "step": 5454 + }, + { + "epoch": 1.583684134126869, + "grad_norm": 3.2553961277008057, + "learning_rate": 8.177136113040337e-06, + "loss": 0.6002, + "step": 5455 + }, + { + "epoch": 1.5839744520249672, + "grad_norm": 3.6897778511047363, + "learning_rate": 8.176394623264634e-06, + "loss": 0.7646, + "step": 5456 + }, + { + "epoch": 1.5842647699230659, + "grad_norm": 4.139689922332764, + "learning_rate": 8.17565301634476e-06, + "loss": 0.9523, + "step": 5457 + }, + { + "epoch": 1.584555087821164, + "grad_norm": 3.3002512454986572, + "learning_rate": 8.17491129230806e-06, + "loss": 0.7418, + "step": 5458 + }, + { + "epoch": 1.5848454057192627, + "grad_norm": 3.755394220352173, + "learning_rate": 8.174169451181893e-06, + "loss": 0.8796, + "step": 5459 + }, + { + "epoch": 1.5851357236173609, + "grad_norm": 3.5037105083465576, + "learning_rate": 8.173427492993617e-06, + "loss": 0.7438, + "step": 5460 + }, + { + "epoch": 1.5854260415154595, + "grad_norm": 3.9173336029052734, + "learning_rate": 8.172685417770595e-06, + "loss": 0.9091, + "step": 5461 + }, + { + "epoch": 1.5857163594135577, + "grad_norm": 
3.251797676086426, + "learning_rate": 8.171943225540193e-06, + "loss": 0.7687, + "step": 5462 + }, + { + "epoch": 1.5860066773116563, + "grad_norm": 3.7072701454162598, + "learning_rate": 8.171200916329782e-06, + "loss": 0.8204, + "step": 5463 + }, + { + "epoch": 1.5862969952097545, + "grad_norm": 3.598876476287842, + "learning_rate": 8.170458490166741e-06, + "loss": 0.8249, + "step": 5464 + }, + { + "epoch": 1.5865873131078532, + "grad_norm": 3.932330846786499, + "learning_rate": 8.16971594707845e-06, + "loss": 0.8425, + "step": 5465 + }, + { + "epoch": 1.5868776310059514, + "grad_norm": 4.134816646575928, + "learning_rate": 8.168973287092292e-06, + "loss": 0.925, + "step": 5466 + }, + { + "epoch": 1.58716794890405, + "grad_norm": 3.6095468997955322, + "learning_rate": 8.168230510235655e-06, + "loss": 0.8141, + "step": 5467 + }, + { + "epoch": 1.5874582668021482, + "grad_norm": 3.84780216217041, + "learning_rate": 8.167487616535937e-06, + "loss": 0.9084, + "step": 5468 + }, + { + "epoch": 1.5877485847002468, + "grad_norm": 3.4866528511047363, + "learning_rate": 8.166744606020532e-06, + "loss": 0.8294, + "step": 5469 + }, + { + "epoch": 1.5880389025983452, + "grad_norm": 3.47239089012146, + "learning_rate": 8.166001478716842e-06, + "loss": 0.7165, + "step": 5470 + }, + { + "epoch": 1.5883292204964437, + "grad_norm": 3.2797508239746094, + "learning_rate": 8.165258234652273e-06, + "loss": 0.7534, + "step": 5471 + }, + { + "epoch": 1.588619538394542, + "grad_norm": 3.6644527912139893, + "learning_rate": 8.164514873854238e-06, + "loss": 0.7998, + "step": 5472 + }, + { + "epoch": 1.5889098562926405, + "grad_norm": 4.518185138702393, + "learning_rate": 8.163771396350149e-06, + "loss": 0.9153, + "step": 5473 + }, + { + "epoch": 1.589200174190739, + "grad_norm": 3.9391283988952637, + "learning_rate": 8.163027802167427e-06, + "loss": 0.7404, + "step": 5474 + }, + { + "epoch": 1.5894904920888373, + "grad_norm": 3.672680616378784, + "learning_rate": 8.162284091333495e-06, + "loss": 0.7028, + "step": 5475 + }, + { + "epoch": 1.5897808099869357, + "grad_norm": 3.7391512393951416, + "learning_rate": 8.16154026387578e-06, + "loss": 0.8395, + "step": 5476 + }, + { + "epoch": 1.5900711278850341, + "grad_norm": 3.7817800045013428, + "learning_rate": 8.160796319821715e-06, + "loss": 0.7917, + "step": 5477 + }, + { + "epoch": 1.5903614457831325, + "grad_norm": 3.9017398357391357, + "learning_rate": 8.160052259198737e-06, + "loss": 0.7596, + "step": 5478 + }, + { + "epoch": 1.590651763681231, + "grad_norm": 3.59230375289917, + "learning_rate": 8.159308082034284e-06, + "loss": 0.8597, + "step": 5479 + }, + { + "epoch": 1.5909420815793294, + "grad_norm": 3.1670892238616943, + "learning_rate": 8.158563788355803e-06, + "loss": 0.7628, + "step": 5480 + }, + { + "epoch": 1.5912323994774278, + "grad_norm": 3.757706880569458, + "learning_rate": 8.157819378190743e-06, + "loss": 0.7866, + "step": 5481 + }, + { + "epoch": 1.5915227173755262, + "grad_norm": 3.193671703338623, + "learning_rate": 8.157074851566558e-06, + "loss": 0.698, + "step": 5482 + }, + { + "epoch": 1.5918130352736246, + "grad_norm": 3.6582417488098145, + "learning_rate": 8.156330208510706e-06, + "loss": 0.7991, + "step": 5483 + }, + { + "epoch": 1.592103353171723, + "grad_norm": 4.1811089515686035, + "learning_rate": 8.155585449050647e-06, + "loss": 0.9821, + "step": 5484 + }, + { + "epoch": 1.5923936710698214, + "grad_norm": 3.4012670516967773, + "learning_rate": 8.15484057321385e-06, + "loss": 0.8423, + "step": 5485 + }, + { + "epoch": 
1.5926839889679199, + "grad_norm": 3.3922324180603027, + "learning_rate": 8.154095581027783e-06, + "loss": 0.8446, + "step": 5486 + }, + { + "epoch": 1.5929743068660183, + "grad_norm": 3.582942008972168, + "learning_rate": 8.153350472519925e-06, + "loss": 0.7196, + "step": 5487 + }, + { + "epoch": 1.5932646247641167, + "grad_norm": 3.835096836090088, + "learning_rate": 8.152605247717753e-06, + "loss": 0.8157, + "step": 5488 + }, + { + "epoch": 1.593554942662215, + "grad_norm": 3.4639639854431152, + "learning_rate": 8.151859906648747e-06, + "loss": 0.7725, + "step": 5489 + }, + { + "epoch": 1.5938452605603135, + "grad_norm": 3.6137194633483887, + "learning_rate": 8.151114449340403e-06, + "loss": 0.8316, + "step": 5490 + }, + { + "epoch": 1.594135578458412, + "grad_norm": 3.6025030612945557, + "learning_rate": 8.150368875820206e-06, + "loss": 0.7249, + "step": 5491 + }, + { + "epoch": 1.5944258963565103, + "grad_norm": 3.8320367336273193, + "learning_rate": 8.149623186115655e-06, + "loss": 0.958, + "step": 5492 + }, + { + "epoch": 1.5947162142546087, + "grad_norm": 3.5915944576263428, + "learning_rate": 8.14887738025425e-06, + "loss": 0.8933, + "step": 5493 + }, + { + "epoch": 1.5950065321527074, + "grad_norm": 3.5409955978393555, + "learning_rate": 8.148131458263499e-06, + "loss": 0.7437, + "step": 5494 + }, + { + "epoch": 1.5952968500508056, + "grad_norm": 3.5840892791748047, + "learning_rate": 8.147385420170907e-06, + "loss": 0.731, + "step": 5495 + }, + { + "epoch": 1.5955871679489042, + "grad_norm": 2.954227924346924, + "learning_rate": 8.146639266003991e-06, + "loss": 0.611, + "step": 5496 + }, + { + "epoch": 1.5958774858470024, + "grad_norm": 3.372689723968506, + "learning_rate": 8.145892995790269e-06, + "loss": 0.7692, + "step": 5497 + }, + { + "epoch": 1.596167803745101, + "grad_norm": 3.156162738800049, + "learning_rate": 8.145146609557259e-06, + "loss": 0.7034, + "step": 5498 + }, + { + "epoch": 1.5964581216431992, + "grad_norm": 3.1837658882141113, + "learning_rate": 8.144400107332491e-06, + "loss": 0.7963, + "step": 5499 + }, + { + "epoch": 1.5967484395412979, + "grad_norm": 3.6337132453918457, + "learning_rate": 8.143653489143495e-06, + "loss": 0.8182, + "step": 5500 + }, + { + "epoch": 1.5967484395412979, + "eval_loss": 1.1764451265335083, + "eval_runtime": 13.4597, + "eval_samples_per_second": 29.718, + "eval_steps_per_second": 3.715, + "step": 5500 + }, + { + "epoch": 1.597038757439396, + "grad_norm": 3.751736879348755, + "learning_rate": 8.142906755017806e-06, + "loss": 0.8149, + "step": 5501 + }, + { + "epoch": 1.5973290753374947, + "grad_norm": 3.2839596271514893, + "learning_rate": 8.142159904982963e-06, + "loss": 0.6112, + "step": 5502 + }, + { + "epoch": 1.5976193932355929, + "grad_norm": 3.4218335151672363, + "learning_rate": 8.14141293906651e-06, + "loss": 0.8055, + "step": 5503 + }, + { + "epoch": 1.5979097111336915, + "grad_norm": 3.7377045154571533, + "learning_rate": 8.140665857295994e-06, + "loss": 0.8185, + "step": 5504 + }, + { + "epoch": 1.5982000290317897, + "grad_norm": 3.6234383583068848, + "learning_rate": 8.139918659698967e-06, + "loss": 0.9353, + "step": 5505 + }, + { + "epoch": 1.5984903469298883, + "grad_norm": 3.7796764373779297, + "learning_rate": 8.139171346302987e-06, + "loss": 0.8076, + "step": 5506 + }, + { + "epoch": 1.5987806648279865, + "grad_norm": 3.846904993057251, + "learning_rate": 8.138423917135613e-06, + "loss": 0.7598, + "step": 5507 + }, + { + "epoch": 1.5990709827260852, + "grad_norm": 3.7689170837402344, + "learning_rate": 
8.13767637222441e-06, + "loss": 0.8609, + "step": 5508 + }, + { + "epoch": 1.5993613006241834, + "grad_norm": 3.787233352661133, + "learning_rate": 8.136928711596948e-06, + "loss": 0.7595, + "step": 5509 + }, + { + "epoch": 1.599651618522282, + "grad_norm": 3.4965553283691406, + "learning_rate": 8.1361809352808e-06, + "loss": 0.7314, + "step": 5510 + }, + { + "epoch": 1.5999419364203802, + "grad_norm": 3.4074811935424805, + "learning_rate": 8.135433043303543e-06, + "loss": 0.7915, + "step": 5511 + }, + { + "epoch": 1.6002322543184788, + "grad_norm": 3.774893283843994, + "learning_rate": 8.134685035692761e-06, + "loss": 0.7789, + "step": 5512 + }, + { + "epoch": 1.600522572216577, + "grad_norm": 3.5672433376312256, + "learning_rate": 8.133936912476038e-06, + "loss": 0.7728, + "step": 5513 + }, + { + "epoch": 1.6008128901146756, + "grad_norm": 3.479285717010498, + "learning_rate": 8.133188673680966e-06, + "loss": 0.7429, + "step": 5514 + }, + { + "epoch": 1.6011032080127738, + "grad_norm": 3.180401563644409, + "learning_rate": 8.132440319335138e-06, + "loss": 0.6545, + "step": 5515 + }, + { + "epoch": 1.6013935259108725, + "grad_norm": 3.3858981132507324, + "learning_rate": 8.131691849466154e-06, + "loss": 0.7118, + "step": 5516 + }, + { + "epoch": 1.6016838438089707, + "grad_norm": 3.231828212738037, + "learning_rate": 8.130943264101618e-06, + "loss": 0.7514, + "step": 5517 + }, + { + "epoch": 1.6019741617070693, + "grad_norm": 3.7033121585845947, + "learning_rate": 8.130194563269137e-06, + "loss": 0.7819, + "step": 5518 + }, + { + "epoch": 1.6022644796051677, + "grad_norm": 3.5103394985198975, + "learning_rate": 8.129445746996322e-06, + "loss": 0.8944, + "step": 5519 + }, + { + "epoch": 1.6025547975032661, + "grad_norm": 3.523192882537842, + "learning_rate": 8.12869681531079e-06, + "loss": 0.7582, + "step": 5520 + }, + { + "epoch": 1.6028451154013645, + "grad_norm": 3.773475408554077, + "learning_rate": 8.127947768240161e-06, + "loss": 0.7963, + "step": 5521 + }, + { + "epoch": 1.603135433299463, + "grad_norm": 3.4685418605804443, + "learning_rate": 8.12719860581206e-06, + "loss": 0.8421, + "step": 5522 + }, + { + "epoch": 1.6034257511975614, + "grad_norm": 3.8262131214141846, + "learning_rate": 8.126449328054115e-06, + "loss": 0.7972, + "step": 5523 + }, + { + "epoch": 1.6037160690956598, + "grad_norm": 3.396672487258911, + "learning_rate": 8.125699934993961e-06, + "loss": 0.724, + "step": 5524 + }, + { + "epoch": 1.6040063869937582, + "grad_norm": 3.644125461578369, + "learning_rate": 8.124950426659231e-06, + "loss": 0.818, + "step": 5525 + }, + { + "epoch": 1.6042967048918566, + "grad_norm": 3.7308244705200195, + "learning_rate": 8.124200803077571e-06, + "loss": 0.7834, + "step": 5526 + }, + { + "epoch": 1.604587022789955, + "grad_norm": 3.544517755508423, + "learning_rate": 8.123451064276625e-06, + "loss": 0.7286, + "step": 5527 + }, + { + "epoch": 1.6048773406880534, + "grad_norm": 3.779484272003174, + "learning_rate": 8.122701210284042e-06, + "loss": 0.879, + "step": 5528 + }, + { + "epoch": 1.6051676585861518, + "grad_norm": 3.2026147842407227, + "learning_rate": 8.12195124112748e-06, + "loss": 0.6605, + "step": 5529 + }, + { + "epoch": 1.6054579764842503, + "grad_norm": 3.595618486404419, + "learning_rate": 8.121201156834595e-06, + "loss": 0.7681, + "step": 5530 + }, + { + "epoch": 1.6057482943823487, + "grad_norm": 3.6730122566223145, + "learning_rate": 8.120450957433048e-06, + "loss": 0.8714, + "step": 5531 + }, + { + "epoch": 1.606038612280447, + "grad_norm": 3.6328916549682617, 
+ "learning_rate": 8.11970064295051e-06, + "loss": 0.9266, + "step": 5532 + }, + { + "epoch": 1.6063289301785455, + "grad_norm": 3.8567254543304443, + "learning_rate": 8.11895021341465e-06, + "loss": 0.8275, + "step": 5533 + }, + { + "epoch": 1.606619248076644, + "grad_norm": 3.534677505493164, + "learning_rate": 8.118199668853141e-06, + "loss": 0.8414, + "step": 5534 + }, + { + "epoch": 1.6069095659747423, + "grad_norm": 3.3644704818725586, + "learning_rate": 8.117449009293668e-06, + "loss": 0.7034, + "step": 5535 + }, + { + "epoch": 1.6071998838728407, + "grad_norm": 3.779590129852295, + "learning_rate": 8.116698234763913e-06, + "loss": 0.6894, + "step": 5536 + }, + { + "epoch": 1.6074902017709392, + "grad_norm": 3.705146312713623, + "learning_rate": 8.115947345291565e-06, + "loss": 0.8024, + "step": 5537 + }, + { + "epoch": 1.6077805196690376, + "grad_norm": 3.603299856185913, + "learning_rate": 8.115196340904312e-06, + "loss": 0.8889, + "step": 5538 + }, + { + "epoch": 1.608070837567136, + "grad_norm": 3.7928433418273926, + "learning_rate": 8.114445221629856e-06, + "loss": 0.987, + "step": 5539 + }, + { + "epoch": 1.6083611554652344, + "grad_norm": 3.6779932975769043, + "learning_rate": 8.113693987495897e-06, + "loss": 0.7934, + "step": 5540 + }, + { + "epoch": 1.6086514733633328, + "grad_norm": 3.47401762008667, + "learning_rate": 8.112942638530137e-06, + "loss": 0.8087, + "step": 5541 + }, + { + "epoch": 1.6089417912614312, + "grad_norm": 3.580387830734253, + "learning_rate": 8.112191174760289e-06, + "loss": 0.8183, + "step": 5542 + }, + { + "epoch": 1.6092321091595296, + "grad_norm": 3.6536662578582764, + "learning_rate": 8.111439596214066e-06, + "loss": 0.8197, + "step": 5543 + }, + { + "epoch": 1.609522427057628, + "grad_norm": 3.348607301712036, + "learning_rate": 8.110687902919185e-06, + "loss": 0.7254, + "step": 5544 + }, + { + "epoch": 1.6098127449557267, + "grad_norm": 3.2281482219696045, + "learning_rate": 8.10993609490337e-06, + "loss": 0.7144, + "step": 5545 + }, + { + "epoch": 1.6101030628538249, + "grad_norm": 3.217322826385498, + "learning_rate": 8.109184172194344e-06, + "loss": 0.7845, + "step": 5546 + }, + { + "epoch": 1.6103933807519235, + "grad_norm": 3.4739761352539062, + "learning_rate": 8.10843213481984e-06, + "loss": 0.7105, + "step": 5547 + }, + { + "epoch": 1.6106836986500217, + "grad_norm": 3.3393218517303467, + "learning_rate": 8.107679982807593e-06, + "loss": 0.7621, + "step": 5548 + }, + { + "epoch": 1.6109740165481203, + "grad_norm": 3.1378118991851807, + "learning_rate": 8.106927716185341e-06, + "loss": 0.7798, + "step": 5549 + }, + { + "epoch": 1.6112643344462185, + "grad_norm": 3.5275983810424805, + "learning_rate": 8.106175334980828e-06, + "loss": 0.7628, + "step": 5550 + }, + { + "epoch": 1.6115546523443172, + "grad_norm": 4.053371906280518, + "learning_rate": 8.105422839221801e-06, + "loss": 0.849, + "step": 5551 + }, + { + "epoch": 1.6118449702424154, + "grad_norm": 3.6364362239837646, + "learning_rate": 8.104670228936014e-06, + "loss": 0.76, + "step": 5552 + }, + { + "epoch": 1.612135288140514, + "grad_norm": 2.8951313495635986, + "learning_rate": 8.103917504151219e-06, + "loss": 0.7134, + "step": 5553 + }, + { + "epoch": 1.6124256060386122, + "grad_norm": 3.7895846366882324, + "learning_rate": 8.103164664895179e-06, + "loss": 0.8141, + "step": 5554 + }, + { + "epoch": 1.6127159239367108, + "grad_norm": 3.412078619003296, + "learning_rate": 8.102411711195657e-06, + "loss": 0.7362, + "step": 5555 + }, + { + "epoch": 1.613006241834809, + 
"grad_norm": 3.6012485027313232, + "learning_rate": 8.101658643080421e-06, + "loss": 0.8171, + "step": 5556 + }, + { + "epoch": 1.6132965597329076, + "grad_norm": 3.7033040523529053, + "learning_rate": 8.100905460577246e-06, + "loss": 0.8706, + "step": 5557 + }, + { + "epoch": 1.6135868776310058, + "grad_norm": 3.526740074157715, + "learning_rate": 8.100152163713911e-06, + "loss": 0.7134, + "step": 5558 + }, + { + "epoch": 1.6138771955291045, + "grad_norm": 3.473214864730835, + "learning_rate": 8.09939875251819e-06, + "loss": 0.8147, + "step": 5559 + }, + { + "epoch": 1.6141675134272027, + "grad_norm": 3.854447603225708, + "learning_rate": 8.098645227017876e-06, + "loss": 0.8453, + "step": 5560 + }, + { + "epoch": 1.6144578313253013, + "grad_norm": 3.334552049636841, + "learning_rate": 8.097891587240754e-06, + "loss": 0.7638, + "step": 5561 + }, + { + "epoch": 1.6147481492233995, + "grad_norm": 3.6212611198425293, + "learning_rate": 8.097137833214621e-06, + "loss": 0.8392, + "step": 5562 + }, + { + "epoch": 1.6150384671214981, + "grad_norm": 3.836317300796509, + "learning_rate": 8.096383964967273e-06, + "loss": 0.8645, + "step": 5563 + }, + { + "epoch": 1.6153287850195963, + "grad_norm": 3.2368345260620117, + "learning_rate": 8.095629982526513e-06, + "loss": 0.7104, + "step": 5564 + }, + { + "epoch": 1.615619102917695, + "grad_norm": 3.441826105117798, + "learning_rate": 8.094875885920148e-06, + "loss": 0.7553, + "step": 5565 + }, + { + "epoch": 1.6159094208157931, + "grad_norm": 3.322342872619629, + "learning_rate": 8.094121675175988e-06, + "loss": 0.7563, + "step": 5566 + }, + { + "epoch": 1.6161997387138918, + "grad_norm": 3.713310956954956, + "learning_rate": 8.09336735032185e-06, + "loss": 0.7707, + "step": 5567 + }, + { + "epoch": 1.61649005661199, + "grad_norm": 3.6072230339050293, + "learning_rate": 8.092612911385551e-06, + "loss": 0.6832, + "step": 5568 + }, + { + "epoch": 1.6167803745100886, + "grad_norm": 3.0848445892333984, + "learning_rate": 8.091858358394915e-06, + "loss": 0.7505, + "step": 5569 + }, + { + "epoch": 1.617070692408187, + "grad_norm": 3.962153673171997, + "learning_rate": 8.09110369137777e-06, + "loss": 0.8181, + "step": 5570 + }, + { + "epoch": 1.6173610103062854, + "grad_norm": 3.5778603553771973, + "learning_rate": 8.090348910361946e-06, + "loss": 0.8057, + "step": 5571 + }, + { + "epoch": 1.6176513282043838, + "grad_norm": 3.639045238494873, + "learning_rate": 8.089594015375281e-06, + "loss": 0.8074, + "step": 5572 + }, + { + "epoch": 1.6179416461024823, + "grad_norm": 3.190915584564209, + "learning_rate": 8.088839006445615e-06, + "loss": 0.6914, + "step": 5573 + }, + { + "epoch": 1.6182319640005807, + "grad_norm": 3.173288345336914, + "learning_rate": 8.088083883600793e-06, + "loss": 0.7042, + "step": 5574 + }, + { + "epoch": 1.618522281898679, + "grad_norm": 3.3784337043762207, + "learning_rate": 8.087328646868663e-06, + "loss": 0.7792, + "step": 5575 + }, + { + "epoch": 1.6188125997967775, + "grad_norm": 3.4538354873657227, + "learning_rate": 8.086573296277078e-06, + "loss": 0.7685, + "step": 5576 + }, + { + "epoch": 1.619102917694876, + "grad_norm": 3.314093828201294, + "learning_rate": 8.085817831853893e-06, + "loss": 0.8075, + "step": 5577 + }, + { + "epoch": 1.6193932355929743, + "grad_norm": 3.4923017024993896, + "learning_rate": 8.085062253626971e-06, + "loss": 0.8034, + "step": 5578 + }, + { + "epoch": 1.6196835534910727, + "grad_norm": 3.724478244781494, + "learning_rate": 8.084306561624177e-06, + "loss": 0.7567, + "step": 5579 + }, + { + "epoch": 
1.6199738713891711, + "grad_norm": 3.650859832763672, + "learning_rate": 8.083550755873384e-06, + "loss": 0.7958, + "step": 5580 + }, + { + "epoch": 1.6202641892872696, + "grad_norm": 3.2904725074768066, + "learning_rate": 8.08279483640246e-06, + "loss": 0.7741, + "step": 5581 + }, + { + "epoch": 1.620554507185368, + "grad_norm": 3.688880205154419, + "learning_rate": 8.082038803239288e-06, + "loss": 0.7899, + "step": 5582 + }, + { + "epoch": 1.6208448250834664, + "grad_norm": 3.716184139251709, + "learning_rate": 8.081282656411746e-06, + "loss": 0.7081, + "step": 5583 + }, + { + "epoch": 1.6211351429815648, + "grad_norm": 3.6786234378814697, + "learning_rate": 8.080526395947722e-06, + "loss": 0.9142, + "step": 5584 + }, + { + "epoch": 1.6214254608796632, + "grad_norm": 3.396521806716919, + "learning_rate": 8.079770021875108e-06, + "loss": 0.7703, + "step": 5585 + }, + { + "epoch": 1.6217157787777616, + "grad_norm": 3.3132734298706055, + "learning_rate": 8.079013534221798e-06, + "loss": 0.7606, + "step": 5586 + }, + { + "epoch": 1.62200609667586, + "grad_norm": 3.5055415630340576, + "learning_rate": 8.078256933015692e-06, + "loss": 0.8032, + "step": 5587 + }, + { + "epoch": 1.6222964145739585, + "grad_norm": 3.584742307662964, + "learning_rate": 8.077500218284689e-06, + "loss": 0.7928, + "step": 5588 + }, + { + "epoch": 1.6225867324720569, + "grad_norm": 3.809736490249634, + "learning_rate": 8.0767433900567e-06, + "loss": 0.8064, + "step": 5589 + }, + { + "epoch": 1.6228770503701553, + "grad_norm": 3.6083149909973145, + "learning_rate": 8.075986448359637e-06, + "loss": 0.7596, + "step": 5590 + }, + { + "epoch": 1.6231673682682537, + "grad_norm": 3.65105938911438, + "learning_rate": 8.075229393221413e-06, + "loss": 0.8699, + "step": 5591 + }, + { + "epoch": 1.623457686166352, + "grad_norm": 3.3243539333343506, + "learning_rate": 8.074472224669952e-06, + "loss": 0.7765, + "step": 5592 + }, + { + "epoch": 1.6237480040644505, + "grad_norm": 3.975712537765503, + "learning_rate": 8.073714942733173e-06, + "loss": 0.9207, + "step": 5593 + }, + { + "epoch": 1.624038321962549, + "grad_norm": 3.689615488052368, + "learning_rate": 8.072957547439006e-06, + "loss": 0.9121, + "step": 5594 + }, + { + "epoch": 1.6243286398606473, + "grad_norm": 3.562192440032959, + "learning_rate": 8.072200038815387e-06, + "loss": 0.7415, + "step": 5595 + }, + { + "epoch": 1.624618957758746, + "grad_norm": 3.7881624698638916, + "learning_rate": 8.071442416890247e-06, + "loss": 0.7459, + "step": 5596 + }, + { + "epoch": 1.6249092756568442, + "grad_norm": 3.2582058906555176, + "learning_rate": 8.070684681691532e-06, + "loss": 0.7617, + "step": 5597 + }, + { + "epoch": 1.6251995935549428, + "grad_norm": 3.686997175216675, + "learning_rate": 8.069926833247181e-06, + "loss": 0.8463, + "step": 5598 + }, + { + "epoch": 1.625489911453041, + "grad_norm": 4.284474849700928, + "learning_rate": 8.06916887158515e-06, + "loss": 0.9876, + "step": 5599 + }, + { + "epoch": 1.6257802293511396, + "grad_norm": 3.551377058029175, + "learning_rate": 8.068410796733388e-06, + "loss": 0.8189, + "step": 5600 + }, + { + "epoch": 1.6260705472492378, + "grad_norm": 3.5549912452697754, + "learning_rate": 8.067652608719854e-06, + "loss": 0.7113, + "step": 5601 + }, + { + "epoch": 1.6263608651473365, + "grad_norm": 3.1862168312072754, + "learning_rate": 8.066894307572507e-06, + "loss": 0.7421, + "step": 5602 + }, + { + "epoch": 1.6266511830454347, + "grad_norm": 3.8687539100646973, + "learning_rate": 8.066135893319316e-06, + "loss": 0.9149, + "step": 5603 
+ }, + { + "epoch": 1.6269415009435333, + "grad_norm": 3.6645760536193848, + "learning_rate": 8.065377365988252e-06, + "loss": 0.7268, + "step": 5604 + }, + { + "epoch": 1.6272318188416315, + "grad_norm": 3.643216609954834, + "learning_rate": 8.064618725607284e-06, + "loss": 0.7743, + "step": 5605 + }, + { + "epoch": 1.6275221367397301, + "grad_norm": 3.8267617225646973, + "learning_rate": 8.063859972204395e-06, + "loss": 0.7137, + "step": 5606 + }, + { + "epoch": 1.6278124546378283, + "grad_norm": 3.9164083003997803, + "learning_rate": 8.063101105807566e-06, + "loss": 0.8744, + "step": 5607 + }, + { + "epoch": 1.628102772535927, + "grad_norm": 3.626497507095337, + "learning_rate": 8.062342126444786e-06, + "loss": 0.7174, + "step": 5608 + }, + { + "epoch": 1.6283930904340251, + "grad_norm": 3.792872428894043, + "learning_rate": 8.06158303414404e-06, + "loss": 0.7724, + "step": 5609 + }, + { + "epoch": 1.6286834083321238, + "grad_norm": 4.004924297332764, + "learning_rate": 8.060823828933329e-06, + "loss": 0.8403, + "step": 5610 + }, + { + "epoch": 1.628973726230222, + "grad_norm": 4.27058219909668, + "learning_rate": 8.060064510840648e-06, + "loss": 0.7813, + "step": 5611 + }, + { + "epoch": 1.6292640441283206, + "grad_norm": 3.6475908756256104, + "learning_rate": 8.059305079894004e-06, + "loss": 0.8612, + "step": 5612 + }, + { + "epoch": 1.6295543620264188, + "grad_norm": 3.181816339492798, + "learning_rate": 8.058545536121402e-06, + "loss": 0.659, + "step": 5613 + }, + { + "epoch": 1.6298446799245174, + "grad_norm": 3.768768072128296, + "learning_rate": 8.057785879550854e-06, + "loss": 0.7758, + "step": 5614 + }, + { + "epoch": 1.6301349978226156, + "grad_norm": 4.072582244873047, + "learning_rate": 8.057026110210378e-06, + "loss": 0.8186, + "step": 5615 + }, + { + "epoch": 1.6304253157207143, + "grad_norm": 3.40413498878479, + "learning_rate": 8.05626622812799e-06, + "loss": 0.7683, + "step": 5616 + }, + { + "epoch": 1.6307156336188124, + "grad_norm": 3.935901403427124, + "learning_rate": 8.055506233331718e-06, + "loss": 0.773, + "step": 5617 + }, + { + "epoch": 1.631005951516911, + "grad_norm": 3.696681499481201, + "learning_rate": 8.054746125849587e-06, + "loss": 0.8155, + "step": 5618 + }, + { + "epoch": 1.6312962694150093, + "grad_norm": 3.344435691833496, + "learning_rate": 8.053985905709632e-06, + "loss": 0.7765, + "step": 5619 + }, + { + "epoch": 1.631586587313108, + "grad_norm": 3.0115749835968018, + "learning_rate": 8.053225572939888e-06, + "loss": 0.6434, + "step": 5620 + }, + { + "epoch": 1.6318769052112063, + "grad_norm": 3.36995005607605, + "learning_rate": 8.052465127568399e-06, + "loss": 0.7216, + "step": 5621 + }, + { + "epoch": 1.6321672231093047, + "grad_norm": 3.2760109901428223, + "learning_rate": 8.051704569623205e-06, + "loss": 0.6746, + "step": 5622 + }, + { + "epoch": 1.6324575410074031, + "grad_norm": 3.5613889694213867, + "learning_rate": 8.050943899132357e-06, + "loss": 0.7582, + "step": 5623 + }, + { + "epoch": 1.6327478589055016, + "grad_norm": 3.7071661949157715, + "learning_rate": 8.05018311612391e-06, + "loss": 0.85, + "step": 5624 + }, + { + "epoch": 1.6330381768036, + "grad_norm": 3.911090135574341, + "learning_rate": 8.049422220625921e-06, + "loss": 0.9153, + "step": 5625 + }, + { + "epoch": 1.6333284947016984, + "grad_norm": 3.132866621017456, + "learning_rate": 8.048661212666449e-06, + "loss": 0.7028, + "step": 5626 + }, + { + "epoch": 1.6336188125997968, + "grad_norm": 3.5012645721435547, + "learning_rate": 8.047900092273562e-06, + "loss": 0.797, + 
"step": 5627 + }, + { + "epoch": 1.6339091304978952, + "grad_norm": 3.3185794353485107, + "learning_rate": 8.047138859475328e-06, + "loss": 0.6393, + "step": 5628 + }, + { + "epoch": 1.6341994483959936, + "grad_norm": 3.0536088943481445, + "learning_rate": 8.046377514299824e-06, + "loss": 0.76, + "step": 5629 + }, + { + "epoch": 1.634489766294092, + "grad_norm": 3.2665023803710938, + "learning_rate": 8.045616056775124e-06, + "loss": 0.7035, + "step": 5630 + }, + { + "epoch": 1.6347800841921905, + "grad_norm": 3.6513378620147705, + "learning_rate": 8.044854486929315e-06, + "loss": 0.7328, + "step": 5631 + }, + { + "epoch": 1.6350704020902889, + "grad_norm": 4.083636283874512, + "learning_rate": 8.04409280479048e-06, + "loss": 0.8992, + "step": 5632 + }, + { + "epoch": 1.6353607199883873, + "grad_norm": 3.52335524559021, + "learning_rate": 8.043331010386709e-06, + "loss": 0.8255, + "step": 5633 + }, + { + "epoch": 1.6356510378864857, + "grad_norm": 3.233189582824707, + "learning_rate": 8.0425691037461e-06, + "loss": 0.6768, + "step": 5634 + }, + { + "epoch": 1.635941355784584, + "grad_norm": 3.613593578338623, + "learning_rate": 8.04180708489675e-06, + "loss": 0.8201, + "step": 5635 + }, + { + "epoch": 1.6362316736826825, + "grad_norm": 3.1805691719055176, + "learning_rate": 8.041044953866758e-06, + "loss": 0.6954, + "step": 5636 + }, + { + "epoch": 1.636521991580781, + "grad_norm": 3.4872689247131348, + "learning_rate": 8.040282710684238e-06, + "loss": 0.8031, + "step": 5637 + }, + { + "epoch": 1.6368123094788793, + "grad_norm": 3.5049612522125244, + "learning_rate": 8.039520355377299e-06, + "loss": 0.7646, + "step": 5638 + }, + { + "epoch": 1.6371026273769778, + "grad_norm": 4.077413558959961, + "learning_rate": 8.038757887974053e-06, + "loss": 0.8644, + "step": 5639 + }, + { + "epoch": 1.6373929452750762, + "grad_norm": 3.759481430053711, + "learning_rate": 8.037995308502625e-06, + "loss": 0.9328, + "step": 5640 + }, + { + "epoch": 1.6376832631731746, + "grad_norm": 3.288496971130371, + "learning_rate": 8.037232616991132e-06, + "loss": 0.7038, + "step": 5641 + }, + { + "epoch": 1.637973581071273, + "grad_norm": 3.1828713417053223, + "learning_rate": 8.036469813467707e-06, + "loss": 0.7033, + "step": 5642 + }, + { + "epoch": 1.6382638989693714, + "grad_norm": 3.8580126762390137, + "learning_rate": 8.03570689796048e-06, + "loss": 0.84, + "step": 5643 + }, + { + "epoch": 1.6385542168674698, + "grad_norm": 3.1978330612182617, + "learning_rate": 8.034943870497589e-06, + "loss": 0.7056, + "step": 5644 + }, + { + "epoch": 1.6388445347655685, + "grad_norm": 3.5237538814544678, + "learning_rate": 8.034180731107171e-06, + "loss": 0.8868, + "step": 5645 + }, + { + "epoch": 1.6391348526636667, + "grad_norm": 3.573692798614502, + "learning_rate": 8.033417479817371e-06, + "loss": 0.6922, + "step": 5646 + }, + { + "epoch": 1.6394251705617653, + "grad_norm": 3.6821346282958984, + "learning_rate": 8.03265411665634e-06, + "loss": 0.8068, + "step": 5647 + }, + { + "epoch": 1.6397154884598635, + "grad_norm": 3.5693955421447754, + "learning_rate": 8.031890641652228e-06, + "loss": 0.7738, + "step": 5648 + }, + { + "epoch": 1.6400058063579621, + "grad_norm": 3.874678134918213, + "learning_rate": 8.031127054833192e-06, + "loss": 0.7949, + "step": 5649 + }, + { + "epoch": 1.6402961242560603, + "grad_norm": 3.197110414505005, + "learning_rate": 8.030363356227393e-06, + "loss": 0.8176, + "step": 5650 + }, + { + "epoch": 1.640586442154159, + "grad_norm": 3.5319745540618896, + "learning_rate": 8.029599545862994e-06, 
+ "loss": 0.8178, + "step": 5651 + }, + { + "epoch": 1.6408767600522571, + "grad_norm": 3.6435129642486572, + "learning_rate": 8.02883562376817e-06, + "loss": 0.8178, + "step": 5652 + }, + { + "epoch": 1.6411670779503558, + "grad_norm": 3.5644171237945557, + "learning_rate": 8.028071589971086e-06, + "loss": 0.7177, + "step": 5653 + }, + { + "epoch": 1.641457395848454, + "grad_norm": 3.39943528175354, + "learning_rate": 8.027307444499927e-06, + "loss": 0.745, + "step": 5654 + }, + { + "epoch": 1.6417477137465526, + "grad_norm": 4.047821521759033, + "learning_rate": 8.02654318738287e-06, + "loss": 0.8497, + "step": 5655 + }, + { + "epoch": 1.6420380316446508, + "grad_norm": 3.406195640563965, + "learning_rate": 8.0257788186481e-06, + "loss": 0.7035, + "step": 5656 + }, + { + "epoch": 1.6423283495427494, + "grad_norm": 3.4617931842803955, + "learning_rate": 8.02501433832381e-06, + "loss": 0.7898, + "step": 5657 + }, + { + "epoch": 1.6426186674408476, + "grad_norm": 3.187101364135742, + "learning_rate": 8.024249746438189e-06, + "loss": 0.6932, + "step": 5658 + }, + { + "epoch": 1.6429089853389462, + "grad_norm": 3.927638053894043, + "learning_rate": 8.023485043019437e-06, + "loss": 0.7909, + "step": 5659 + }, + { + "epoch": 1.6431993032370444, + "grad_norm": 4.530860424041748, + "learning_rate": 8.02272022809576e-06, + "loss": 0.8495, + "step": 5660 + }, + { + "epoch": 1.643489621135143, + "grad_norm": 3.5661168098449707, + "learning_rate": 8.021955301695357e-06, + "loss": 0.8213, + "step": 5661 + }, + { + "epoch": 1.6437799390332413, + "grad_norm": 4.098810195922852, + "learning_rate": 8.021190263846445e-06, + "loss": 0.9182, + "step": 5662 + }, + { + "epoch": 1.64407025693134, + "grad_norm": 3.5405988693237305, + "learning_rate": 8.020425114577232e-06, + "loss": 0.7886, + "step": 5663 + }, + { + "epoch": 1.644360574829438, + "grad_norm": 3.576150894165039, + "learning_rate": 8.01965985391594e-06, + "loss": 0.7849, + "step": 5664 + }, + { + "epoch": 1.6446508927275367, + "grad_norm": 3.515380620956421, + "learning_rate": 8.018894481890793e-06, + "loss": 0.7205, + "step": 5665 + }, + { + "epoch": 1.644941210625635, + "grad_norm": 3.65975284576416, + "learning_rate": 8.018128998530013e-06, + "loss": 0.7721, + "step": 5666 + }, + { + "epoch": 1.6452315285237336, + "grad_norm": 3.3606433868408203, + "learning_rate": 8.017363403861836e-06, + "loss": 0.7938, + "step": 5667 + }, + { + "epoch": 1.6455218464218317, + "grad_norm": 3.917895793914795, + "learning_rate": 8.016597697914492e-06, + "loss": 0.7639, + "step": 5668 + }, + { + "epoch": 1.6458121643199304, + "grad_norm": 3.221787452697754, + "learning_rate": 8.015831880716222e-06, + "loss": 0.6328, + "step": 5669 + }, + { + "epoch": 1.6461024822180288, + "grad_norm": 3.6997926235198975, + "learning_rate": 8.01506595229527e-06, + "loss": 0.7413, + "step": 5670 + }, + { + "epoch": 1.6463928001161272, + "grad_norm": 3.6128926277160645, + "learning_rate": 8.014299912679882e-06, + "loss": 0.82, + "step": 5671 + }, + { + "epoch": 1.6466831180142256, + "grad_norm": 3.536489963531494, + "learning_rate": 8.013533761898308e-06, + "loss": 0.7352, + "step": 5672 + }, + { + "epoch": 1.646973435912324, + "grad_norm": 3.36811900138855, + "learning_rate": 8.012767499978806e-06, + "loss": 0.7863, + "step": 5673 + }, + { + "epoch": 1.6472637538104224, + "grad_norm": 3.6035544872283936, + "learning_rate": 8.012001126949634e-06, + "loss": 0.7394, + "step": 5674 + }, + { + "epoch": 1.6475540717085209, + "grad_norm": 3.541083574295044, + "learning_rate": 
8.011234642839057e-06, + "loss": 0.7212, + "step": 5675 + }, + { + "epoch": 1.6478443896066193, + "grad_norm": 3.6132876873016357, + "learning_rate": 8.010468047675339e-06, + "loss": 0.7709, + "step": 5676 + }, + { + "epoch": 1.6481347075047177, + "grad_norm": 3.5941474437713623, + "learning_rate": 8.009701341486755e-06, + "loss": 0.7479, + "step": 5677 + }, + { + "epoch": 1.648425025402816, + "grad_norm": 3.7650184631347656, + "learning_rate": 8.00893452430158e-06, + "loss": 0.8398, + "step": 5678 + }, + { + "epoch": 1.6487153433009145, + "grad_norm": 3.681375503540039, + "learning_rate": 8.008167596148094e-06, + "loss": 0.7961, + "step": 5679 + }, + { + "epoch": 1.649005661199013, + "grad_norm": 3.072575330734253, + "learning_rate": 8.007400557054581e-06, + "loss": 0.6448, + "step": 5680 + }, + { + "epoch": 1.6492959790971113, + "grad_norm": 3.290656566619873, + "learning_rate": 8.006633407049329e-06, + "loss": 0.7265, + "step": 5681 + }, + { + "epoch": 1.6495862969952098, + "grad_norm": 3.1598901748657227, + "learning_rate": 8.005866146160628e-06, + "loss": 0.6802, + "step": 5682 + }, + { + "epoch": 1.6498766148933082, + "grad_norm": 3.601827383041382, + "learning_rate": 8.005098774416779e-06, + "loss": 0.7517, + "step": 5683 + }, + { + "epoch": 1.6501669327914066, + "grad_norm": 4.136563777923584, + "learning_rate": 8.00433129184608e-06, + "loss": 0.8474, + "step": 5684 + }, + { + "epoch": 1.650457250689505, + "grad_norm": 3.642002582550049, + "learning_rate": 8.003563698476832e-06, + "loss": 0.7993, + "step": 5685 + }, + { + "epoch": 1.6507475685876034, + "grad_norm": 3.2611653804779053, + "learning_rate": 8.00279599433735e-06, + "loss": 0.9312, + "step": 5686 + }, + { + "epoch": 1.6510378864857018, + "grad_norm": 3.619309186935425, + "learning_rate": 8.002028179455941e-06, + "loss": 0.7925, + "step": 5687 + }, + { + "epoch": 1.6513282043838002, + "grad_norm": 2.8980655670166016, + "learning_rate": 8.001260253860926e-06, + "loss": 0.6433, + "step": 5688 + }, + { + "epoch": 1.6516185222818986, + "grad_norm": 3.9477474689483643, + "learning_rate": 8.000492217580623e-06, + "loss": 0.853, + "step": 5689 + }, + { + "epoch": 1.651908840179997, + "grad_norm": 3.8791873455047607, + "learning_rate": 7.999724070643357e-06, + "loss": 0.8406, + "step": 5690 + }, + { + "epoch": 1.6521991580780955, + "grad_norm": 4.422399044036865, + "learning_rate": 7.998955813077457e-06, + "loss": 1.0581, + "step": 5691 + }, + { + "epoch": 1.6524894759761939, + "grad_norm": 3.169612169265747, + "learning_rate": 7.998187444911259e-06, + "loss": 0.7056, + "step": 5692 + }, + { + "epoch": 1.6527797938742923, + "grad_norm": 3.5287580490112305, + "learning_rate": 7.997418966173098e-06, + "loss": 0.7648, + "step": 5693 + }, + { + "epoch": 1.6530701117723907, + "grad_norm": 3.7084598541259766, + "learning_rate": 7.996650376891314e-06, + "loss": 0.7283, + "step": 5694 + }, + { + "epoch": 1.6533604296704891, + "grad_norm": 3.714036226272583, + "learning_rate": 7.995881677094252e-06, + "loss": 0.8884, + "step": 5695 + }, + { + "epoch": 1.6536507475685878, + "grad_norm": 3.511685371398926, + "learning_rate": 7.995112866810264e-06, + "loss": 0.7522, + "step": 5696 + }, + { + "epoch": 1.653941065466686, + "grad_norm": 3.6127731800079346, + "learning_rate": 7.994343946067702e-06, + "loss": 0.7927, + "step": 5697 + }, + { + "epoch": 1.6542313833647846, + "grad_norm": 3.3412842750549316, + "learning_rate": 7.993574914894924e-06, + "loss": 0.8249, + "step": 5698 + }, + { + "epoch": 1.6545217012628828, + "grad_norm": 
3.3941237926483154, + "learning_rate": 7.99280577332029e-06, + "loss": 0.8165, + "step": 5699 + }, + { + "epoch": 1.6548120191609814, + "grad_norm": 3.492751121520996, + "learning_rate": 7.992036521372168e-06, + "loss": 0.9081, + "step": 5700 + }, + { + "epoch": 1.6551023370590796, + "grad_norm": 3.8079779148101807, + "learning_rate": 7.991267159078926e-06, + "loss": 0.9421, + "step": 5701 + }, + { + "epoch": 1.6553926549571782, + "grad_norm": 3.5926706790924072, + "learning_rate": 7.990497686468937e-06, + "loss": 0.821, + "step": 5702 + }, + { + "epoch": 1.6556829728552764, + "grad_norm": 3.417275905609131, + "learning_rate": 7.989728103570582e-06, + "loss": 0.7135, + "step": 5703 + }, + { + "epoch": 1.655973290753375, + "grad_norm": 3.2997395992279053, + "learning_rate": 7.98895841041224e-06, + "loss": 0.8215, + "step": 5704 + }, + { + "epoch": 1.6562636086514733, + "grad_norm": 3.397256374359131, + "learning_rate": 7.988188607022297e-06, + "loss": 0.7057, + "step": 5705 + }, + { + "epoch": 1.656553926549572, + "grad_norm": 4.578166961669922, + "learning_rate": 7.987418693429145e-06, + "loss": 0.8764, + "step": 5706 + }, + { + "epoch": 1.65684424444767, + "grad_norm": 3.8914785385131836, + "learning_rate": 7.986648669661177e-06, + "loss": 0.9121, + "step": 5707 + }, + { + "epoch": 1.6571345623457687, + "grad_norm": 3.776986598968506, + "learning_rate": 7.985878535746791e-06, + "loss": 0.7753, + "step": 5708 + }, + { + "epoch": 1.657424880243867, + "grad_norm": 3.3599276542663574, + "learning_rate": 7.98510829171439e-06, + "loss": 0.818, + "step": 5709 + }, + { + "epoch": 1.6577151981419656, + "grad_norm": 3.421091318130493, + "learning_rate": 7.984337937592379e-06, + "loss": 0.7669, + "step": 5710 + }, + { + "epoch": 1.6580055160400637, + "grad_norm": 3.1329991817474365, + "learning_rate": 7.983567473409171e-06, + "loss": 0.7219, + "step": 5711 + }, + { + "epoch": 1.6582958339381624, + "grad_norm": 3.530151844024658, + "learning_rate": 7.982796899193177e-06, + "loss": 0.6851, + "step": 5712 + }, + { + "epoch": 1.6585861518362606, + "grad_norm": 3.30942702293396, + "learning_rate": 7.982026214972819e-06, + "loss": 0.7465, + "step": 5713 + }, + { + "epoch": 1.6588764697343592, + "grad_norm": 3.1759490966796875, + "learning_rate": 7.981255420776513e-06, + "loss": 0.6359, + "step": 5714 + }, + { + "epoch": 1.6591667876324574, + "grad_norm": 3.953688621520996, + "learning_rate": 7.980484516632693e-06, + "loss": 0.8722, + "step": 5715 + }, + { + "epoch": 1.659457105530556, + "grad_norm": 3.810572862625122, + "learning_rate": 7.979713502569787e-06, + "loss": 0.863, + "step": 5716 + }, + { + "epoch": 1.6597474234286542, + "grad_norm": 3.367386817932129, + "learning_rate": 7.97894237861623e-06, + "loss": 0.7079, + "step": 5717 + }, + { + "epoch": 1.6600377413267529, + "grad_norm": 3.8818647861480713, + "learning_rate": 7.97817114480046e-06, + "loss": 0.9224, + "step": 5718 + }, + { + "epoch": 1.660328059224851, + "grad_norm": 3.8525187969207764, + "learning_rate": 7.97739980115092e-06, + "loss": 0.8674, + "step": 5719 + }, + { + "epoch": 1.6606183771229497, + "grad_norm": 3.035203695297241, + "learning_rate": 7.976628347696057e-06, + "loss": 0.6624, + "step": 5720 + }, + { + "epoch": 1.660908695021048, + "grad_norm": 3.5461297035217285, + "learning_rate": 7.975856784464322e-06, + "loss": 0.7999, + "step": 5721 + }, + { + "epoch": 1.6611990129191465, + "grad_norm": 3.743105888366699, + "learning_rate": 7.975085111484169e-06, + "loss": 0.852, + "step": 5722 + }, + { + "epoch": 1.661489330817245, + 
"grad_norm": 3.713768482208252, + "learning_rate": 7.974313328784056e-06, + "loss": 0.8012, + "step": 5723 + }, + { + "epoch": 1.6617796487153433, + "grad_norm": 3.6123037338256836, + "learning_rate": 7.97354143639245e-06, + "loss": 0.8306, + "step": 5724 + }, + { + "epoch": 1.6620699666134418, + "grad_norm": 2.929441213607788, + "learning_rate": 7.972769434337815e-06, + "loss": 0.6391, + "step": 5725 + }, + { + "epoch": 1.6623602845115402, + "grad_norm": 3.306553602218628, + "learning_rate": 7.971997322648623e-06, + "loss": 0.7801, + "step": 5726 + }, + { + "epoch": 1.6626506024096386, + "grad_norm": 3.7010748386383057, + "learning_rate": 7.971225101353351e-06, + "loss": 0.8044, + "step": 5727 + }, + { + "epoch": 1.662940920307737, + "grad_norm": 3.7694780826568604, + "learning_rate": 7.970452770480474e-06, + "loss": 0.8357, + "step": 5728 + }, + { + "epoch": 1.6632312382058354, + "grad_norm": 3.188607692718506, + "learning_rate": 7.969680330058478e-06, + "loss": 0.8356, + "step": 5729 + }, + { + "epoch": 1.6635215561039338, + "grad_norm": 3.9263787269592285, + "learning_rate": 7.96890778011585e-06, + "loss": 0.8787, + "step": 5730 + }, + { + "epoch": 1.6638118740020322, + "grad_norm": 3.170591115951538, + "learning_rate": 7.968135120681082e-06, + "loss": 0.707, + "step": 5731 + }, + { + "epoch": 1.6641021919001306, + "grad_norm": 3.5900368690490723, + "learning_rate": 7.967362351782668e-06, + "loss": 0.7168, + "step": 5732 + }, + { + "epoch": 1.664392509798229, + "grad_norm": 3.5648303031921387, + "learning_rate": 7.966589473449109e-06, + "loss": 0.8968, + "step": 5733 + }, + { + "epoch": 1.6646828276963275, + "grad_norm": 3.489239454269409, + "learning_rate": 7.965816485708905e-06, + "loss": 0.7251, + "step": 5734 + }, + { + "epoch": 1.6649731455944259, + "grad_norm": 3.441934585571289, + "learning_rate": 7.96504338859057e-06, + "loss": 0.7385, + "step": 5735 + }, + { + "epoch": 1.6652634634925243, + "grad_norm": 3.39975905418396, + "learning_rate": 7.96427018212261e-06, + "loss": 0.7562, + "step": 5736 + }, + { + "epoch": 1.6655537813906227, + "grad_norm": 3.6406619548797607, + "learning_rate": 7.96349686633354e-06, + "loss": 0.8788, + "step": 5737 + }, + { + "epoch": 1.6658440992887211, + "grad_norm": 3.939983606338501, + "learning_rate": 7.962723441251882e-06, + "loss": 0.8964, + "step": 5738 + }, + { + "epoch": 1.6661344171868195, + "grad_norm": 3.801276445388794, + "learning_rate": 7.96194990690616e-06, + "loss": 0.884, + "step": 5739 + }, + { + "epoch": 1.666424735084918, + "grad_norm": 3.2761764526367188, + "learning_rate": 7.961176263324902e-06, + "loss": 0.7168, + "step": 5740 + }, + { + "epoch": 1.6667150529830164, + "grad_norm": 3.361765146255493, + "learning_rate": 7.960402510536635e-06, + "loss": 0.687, + "step": 5741 + }, + { + "epoch": 1.6670053708811148, + "grad_norm": 3.3800249099731445, + "learning_rate": 7.959628648569901e-06, + "loss": 0.8002, + "step": 5742 + }, + { + "epoch": 1.6672956887792132, + "grad_norm": 3.9530911445617676, + "learning_rate": 7.958854677453238e-06, + "loss": 0.8342, + "step": 5743 + }, + { + "epoch": 1.6675860066773116, + "grad_norm": 3.6186470985412598, + "learning_rate": 7.958080597215187e-06, + "loss": 0.7748, + "step": 5744 + }, + { + "epoch": 1.66787632457541, + "grad_norm": 3.4672844409942627, + "learning_rate": 7.957306407884298e-06, + "loss": 0.7663, + "step": 5745 + }, + { + "epoch": 1.6681666424735084, + "grad_norm": 4.060912609100342, + "learning_rate": 7.95653210948912e-06, + "loss": 0.7882, + "step": 5746 + }, + { + "epoch": 
1.668456960371607, + "grad_norm": 3.9443535804748535, + "learning_rate": 7.955757702058213e-06, + "loss": 0.9355, + "step": 5747 + }, + { + "epoch": 1.6687472782697053, + "grad_norm": 3.683994770050049, + "learning_rate": 7.954983185620136e-06, + "loss": 0.6635, + "step": 5748 + }, + { + "epoch": 1.669037596167804, + "grad_norm": 3.9671192169189453, + "learning_rate": 7.95420856020345e-06, + "loss": 0.8924, + "step": 5749 + }, + { + "epoch": 1.669327914065902, + "grad_norm": 3.1241607666015625, + "learning_rate": 7.953433825836725e-06, + "loss": 0.65, + "step": 5750 + }, + { + "epoch": 1.6696182319640007, + "grad_norm": 3.5456268787384033, + "learning_rate": 7.952658982548533e-06, + "loss": 0.7186, + "step": 5751 + }, + { + "epoch": 1.669908549862099, + "grad_norm": 3.425567388534546, + "learning_rate": 7.95188403036745e-06, + "loss": 0.6548, + "step": 5752 + }, + { + "epoch": 1.6701988677601975, + "grad_norm": 3.700671672821045, + "learning_rate": 7.951108969322054e-06, + "loss": 0.8279, + "step": 5753 + }, + { + "epoch": 1.6704891856582957, + "grad_norm": 3.732058525085449, + "learning_rate": 7.95033379944093e-06, + "loss": 0.7564, + "step": 5754 + }, + { + "epoch": 1.6707795035563944, + "grad_norm": 3.4859352111816406, + "learning_rate": 7.949558520752667e-06, + "loss": 0.7317, + "step": 5755 + }, + { + "epoch": 1.6710698214544926, + "grad_norm": 3.258023738861084, + "learning_rate": 7.948783133285858e-06, + "loss": 0.7544, + "step": 5756 + }, + { + "epoch": 1.6713601393525912, + "grad_norm": 3.767179012298584, + "learning_rate": 7.948007637069095e-06, + "loss": 0.8025, + "step": 5757 + }, + { + "epoch": 1.6716504572506894, + "grad_norm": 3.410964012145996, + "learning_rate": 7.947232032130982e-06, + "loss": 0.6954, + "step": 5758 + }, + { + "epoch": 1.671940775148788, + "grad_norm": 3.8064308166503906, + "learning_rate": 7.94645631850012e-06, + "loss": 0.9072, + "step": 5759 + }, + { + "epoch": 1.6722310930468862, + "grad_norm": 3.1589906215667725, + "learning_rate": 7.945680496205117e-06, + "loss": 0.7262, + "step": 5760 + }, + { + "epoch": 1.6725214109449849, + "grad_norm": 3.672649621963501, + "learning_rate": 7.944904565274588e-06, + "loss": 0.8108, + "step": 5761 + }, + { + "epoch": 1.672811728843083, + "grad_norm": 3.2626302242279053, + "learning_rate": 7.944128525737147e-06, + "loss": 0.7403, + "step": 5762 + }, + { + "epoch": 1.6731020467411817, + "grad_norm": 3.6295340061187744, + "learning_rate": 7.943352377621414e-06, + "loss": 0.7882, + "step": 5763 + }, + { + "epoch": 1.6733923646392799, + "grad_norm": 4.048469543457031, + "learning_rate": 7.942576120956014e-06, + "loss": 0.8053, + "step": 5764 + }, + { + "epoch": 1.6736826825373785, + "grad_norm": 3.6602652072906494, + "learning_rate": 7.941799755769573e-06, + "loss": 0.7699, + "step": 5765 + }, + { + "epoch": 1.6739730004354767, + "grad_norm": 3.469912528991699, + "learning_rate": 7.941023282090727e-06, + "loss": 0.6628, + "step": 5766 + }, + { + "epoch": 1.6742633183335753, + "grad_norm": 3.6404995918273926, + "learning_rate": 7.940246699948107e-06, + "loss": 0.8513, + "step": 5767 + }, + { + "epoch": 1.6745536362316735, + "grad_norm": 4.017561435699463, + "learning_rate": 7.939470009370357e-06, + "loss": 0.918, + "step": 5768 + }, + { + "epoch": 1.6748439541297722, + "grad_norm": 3.269432544708252, + "learning_rate": 7.938693210386118e-06, + "loss": 0.7086, + "step": 5769 + }, + { + "epoch": 1.6751342720278704, + "grad_norm": 3.6618704795837402, + "learning_rate": 7.93791630302404e-06, + "loss": 0.8762, + "step": 5770 
+ }, + { + "epoch": 1.675424589925969, + "grad_norm": 3.3765363693237305, + "learning_rate": 7.937139287312777e-06, + "loss": 0.7739, + "step": 5771 + }, + { + "epoch": 1.6757149078240674, + "grad_norm": 3.6694111824035645, + "learning_rate": 7.93636216328098e-06, + "loss": 0.7754, + "step": 5772 + }, + { + "epoch": 1.6760052257221658, + "grad_norm": 3.989017963409424, + "learning_rate": 7.935584930957312e-06, + "loss": 0.8737, + "step": 5773 + }, + { + "epoch": 1.6762955436202642, + "grad_norm": 3.580270528793335, + "learning_rate": 7.934807590370438e-06, + "loss": 0.7978, + "step": 5774 + }, + { + "epoch": 1.6765858615183626, + "grad_norm": 3.720231771469116, + "learning_rate": 7.934030141549024e-06, + "loss": 0.851, + "step": 5775 + }, + { + "epoch": 1.676876179416461, + "grad_norm": 3.835939407348633, + "learning_rate": 7.933252584521743e-06, + "loss": 0.8481, + "step": 5776 + }, + { + "epoch": 1.6771664973145595, + "grad_norm": 3.7228312492370605, + "learning_rate": 7.93247491931727e-06, + "loss": 0.957, + "step": 5777 + }, + { + "epoch": 1.6774568152126579, + "grad_norm": 3.7690441608428955, + "learning_rate": 7.931697145964284e-06, + "loss": 0.8309, + "step": 5778 + }, + { + "epoch": 1.6777471331107563, + "grad_norm": 3.3121449947357178, + "learning_rate": 7.930919264491473e-06, + "loss": 0.7899, + "step": 5779 + }, + { + "epoch": 1.6780374510088547, + "grad_norm": 3.385662794113159, + "learning_rate": 7.930141274927522e-06, + "loss": 0.7839, + "step": 5780 + }, + { + "epoch": 1.6783277689069531, + "grad_norm": 3.8395118713378906, + "learning_rate": 7.929363177301124e-06, + "loss": 0.9903, + "step": 5781 + }, + { + "epoch": 1.6786180868050515, + "grad_norm": 3.8420722484588623, + "learning_rate": 7.928584971640974e-06, + "loss": 0.8054, + "step": 5782 + }, + { + "epoch": 1.67890840470315, + "grad_norm": 3.230956554412842, + "learning_rate": 7.927806657975775e-06, + "loss": 0.7696, + "step": 5783 + }, + { + "epoch": 1.6791987226012484, + "grad_norm": 3.2777044773101807, + "learning_rate": 7.927028236334224e-06, + "loss": 0.694, + "step": 5784 + }, + { + "epoch": 1.6794890404993468, + "grad_norm": 3.614997625350952, + "learning_rate": 7.926249706745036e-06, + "loss": 0.839, + "step": 5785 + }, + { + "epoch": 1.6797793583974452, + "grad_norm": 3.2601284980773926, + "learning_rate": 7.92547106923692e-06, + "loss": 0.7297, + "step": 5786 + }, + { + "epoch": 1.6800696762955436, + "grad_norm": 3.0316452980041504, + "learning_rate": 7.92469232383859e-06, + "loss": 0.768, + "step": 5787 + }, + { + "epoch": 1.680359994193642, + "grad_norm": 3.3039333820343018, + "learning_rate": 7.92391347057877e-06, + "loss": 0.7392, + "step": 5788 + }, + { + "epoch": 1.6806503120917404, + "grad_norm": 3.2324368953704834, + "learning_rate": 7.92313450948618e-06, + "loss": 0.655, + "step": 5789 + }, + { + "epoch": 1.6809406299898388, + "grad_norm": 3.5473809242248535, + "learning_rate": 7.92235544058955e-06, + "loss": 0.7821, + "step": 5790 + }, + { + "epoch": 1.6812309478879373, + "grad_norm": 3.683997392654419, + "learning_rate": 7.921576263917612e-06, + "loss": 0.7927, + "step": 5791 + }, + { + "epoch": 1.6815212657860357, + "grad_norm": 3.726501703262329, + "learning_rate": 7.920796979499098e-06, + "loss": 0.7179, + "step": 5792 + }, + { + "epoch": 1.681811583684134, + "grad_norm": 3.5410258769989014, + "learning_rate": 7.920017587362751e-06, + "loss": 0.6961, + "step": 5793 + }, + { + "epoch": 1.6821019015822325, + "grad_norm": 3.490867853164673, + "learning_rate": 7.919238087537317e-06, + "loss": 
0.8215, + "step": 5794 + }, + { + "epoch": 1.682392219480331, + "grad_norm": 3.436814308166504, + "learning_rate": 7.91845848005154e-06, + "loss": 0.7662, + "step": 5795 + }, + { + "epoch": 1.6826825373784295, + "grad_norm": 2.991690158843994, + "learning_rate": 7.917678764934169e-06, + "loss": 0.7011, + "step": 5796 + }, + { + "epoch": 1.6829728552765277, + "grad_norm": 3.5436899662017822, + "learning_rate": 7.916898942213967e-06, + "loss": 0.6851, + "step": 5797 + }, + { + "epoch": 1.6832631731746264, + "grad_norm": 3.9489386081695557, + "learning_rate": 7.916119011919687e-06, + "loss": 0.9344, + "step": 5798 + }, + { + "epoch": 1.6835534910727246, + "grad_norm": 3.72562313079834, + "learning_rate": 7.915338974080098e-06, + "loss": 0.7195, + "step": 5799 + }, + { + "epoch": 1.6838438089708232, + "grad_norm": 3.375615358352661, + "learning_rate": 7.914558828723961e-06, + "loss": 0.6861, + "step": 5800 + }, + { + "epoch": 1.6841341268689214, + "grad_norm": 3.519691228866577, + "learning_rate": 7.913778575880054e-06, + "loss": 0.8167, + "step": 5801 + }, + { + "epoch": 1.68442444476702, + "grad_norm": 3.521460771560669, + "learning_rate": 7.912998215577147e-06, + "loss": 0.8164, + "step": 5802 + }, + { + "epoch": 1.6847147626651182, + "grad_norm": 3.0612387657165527, + "learning_rate": 7.912217747844022e-06, + "loss": 0.6171, + "step": 5803 + }, + { + "epoch": 1.6850050805632169, + "grad_norm": 3.277848243713379, + "learning_rate": 7.911437172709464e-06, + "loss": 0.6403, + "step": 5804 + }, + { + "epoch": 1.685295398461315, + "grad_norm": 3.4739081859588623, + "learning_rate": 7.910656490202258e-06, + "loss": 0.7629, + "step": 5805 + }, + { + "epoch": 1.6855857163594137, + "grad_norm": 3.8861570358276367, + "learning_rate": 7.909875700351193e-06, + "loss": 0.8584, + "step": 5806 + }, + { + "epoch": 1.6858760342575119, + "grad_norm": 3.6719019412994385, + "learning_rate": 7.909094803185071e-06, + "loss": 0.7888, + "step": 5807 + }, + { + "epoch": 1.6861663521556105, + "grad_norm": 3.4244160652160645, + "learning_rate": 7.908313798732685e-06, + "loss": 0.6949, + "step": 5808 + }, + { + "epoch": 1.6864566700537087, + "grad_norm": 3.7639153003692627, + "learning_rate": 7.907532687022841e-06, + "loss": 0.814, + "step": 5809 + }, + { + "epoch": 1.6867469879518073, + "grad_norm": 3.6842236518859863, + "learning_rate": 7.906751468084343e-06, + "loss": 0.7004, + "step": 5810 + }, + { + "epoch": 1.6870373058499055, + "grad_norm": 3.259575366973877, + "learning_rate": 7.905970141946006e-06, + "loss": 0.6729, + "step": 5811 + }, + { + "epoch": 1.6873276237480042, + "grad_norm": 3.651085138320923, + "learning_rate": 7.905188708636645e-06, + "loss": 0.7953, + "step": 5812 + }, + { + "epoch": 1.6876179416461023, + "grad_norm": 3.5897328853607178, + "learning_rate": 7.904407168185076e-06, + "loss": 0.773, + "step": 5813 + }, + { + "epoch": 1.687908259544201, + "grad_norm": 3.297179937362671, + "learning_rate": 7.903625520620122e-06, + "loss": 0.7771, + "step": 5814 + }, + { + "epoch": 1.6881985774422992, + "grad_norm": 3.8753912448883057, + "learning_rate": 7.902843765970611e-06, + "loss": 0.7852, + "step": 5815 + }, + { + "epoch": 1.6884888953403978, + "grad_norm": 3.782907247543335, + "learning_rate": 7.902061904265375e-06, + "loss": 0.7274, + "step": 5816 + }, + { + "epoch": 1.688779213238496, + "grad_norm": 3.083601713180542, + "learning_rate": 7.901279935533248e-06, + "loss": 0.7227, + "step": 5817 + }, + { + "epoch": 1.6890695311365946, + "grad_norm": 3.3517086505889893, + "learning_rate": 
7.900497859803069e-06, + "loss": 0.6743, + "step": 5818 + }, + { + "epoch": 1.6893598490346928, + "grad_norm": 3.5704421997070312, + "learning_rate": 7.899715677103677e-06, + "loss": 0.7981, + "step": 5819 + }, + { + "epoch": 1.6896501669327915, + "grad_norm": 2.869518280029297, + "learning_rate": 7.898933387463924e-06, + "loss": 0.5827, + "step": 5820 + }, + { + "epoch": 1.6899404848308899, + "grad_norm": 3.6910226345062256, + "learning_rate": 7.898150990912657e-06, + "loss": 0.8739, + "step": 5821 + }, + { + "epoch": 1.6902308027289883, + "grad_norm": 3.580432415008545, + "learning_rate": 7.897368487478733e-06, + "loss": 0.8449, + "step": 5822 + }, + { + "epoch": 1.6905211206270867, + "grad_norm": 3.478239059448242, + "learning_rate": 7.896585877191007e-06, + "loss": 0.7331, + "step": 5823 + }, + { + "epoch": 1.6908114385251851, + "grad_norm": 3.5383105278015137, + "learning_rate": 7.895803160078344e-06, + "loss": 0.7373, + "step": 5824 + }, + { + "epoch": 1.6911017564232835, + "grad_norm": 3.06196928024292, + "learning_rate": 7.89502033616961e-06, + "loss": 0.7516, + "step": 5825 + }, + { + "epoch": 1.691392074321382, + "grad_norm": 3.9107048511505127, + "learning_rate": 7.894237405493675e-06, + "loss": 0.8451, + "step": 5826 + }, + { + "epoch": 1.6916823922194804, + "grad_norm": 3.3762965202331543, + "learning_rate": 7.893454368079413e-06, + "loss": 0.7507, + "step": 5827 + }, + { + "epoch": 1.6919727101175788, + "grad_norm": 3.90720534324646, + "learning_rate": 7.892671223955702e-06, + "loss": 0.8307, + "step": 5828 + }, + { + "epoch": 1.6922630280156772, + "grad_norm": 3.7784852981567383, + "learning_rate": 7.891887973151424e-06, + "loss": 0.8638, + "step": 5829 + }, + { + "epoch": 1.6925533459137756, + "grad_norm": 3.581059455871582, + "learning_rate": 7.891104615695463e-06, + "loss": 0.7242, + "step": 5830 + }, + { + "epoch": 1.692843663811874, + "grad_norm": 3.4262542724609375, + "learning_rate": 7.890321151616716e-06, + "loss": 0.7449, + "step": 5831 + }, + { + "epoch": 1.6931339817099724, + "grad_norm": 3.4586639404296875, + "learning_rate": 7.889537580944068e-06, + "loss": 0.7635, + "step": 5832 + }, + { + "epoch": 1.6934242996080708, + "grad_norm": 4.539180755615234, + "learning_rate": 7.888753903706422e-06, + "loss": 0.8506, + "step": 5833 + }, + { + "epoch": 1.6937146175061693, + "grad_norm": 3.6680805683135986, + "learning_rate": 7.887970119932678e-06, + "loss": 0.707, + "step": 5834 + }, + { + "epoch": 1.6940049354042677, + "grad_norm": 3.8652119636535645, + "learning_rate": 7.887186229651741e-06, + "loss": 0.8594, + "step": 5835 + }, + { + "epoch": 1.694295253302366, + "grad_norm": 3.5802907943725586, + "learning_rate": 7.886402232892525e-06, + "loss": 0.77, + "step": 5836 + }, + { + "epoch": 1.6945855712004645, + "grad_norm": 3.5474374294281006, + "learning_rate": 7.885618129683938e-06, + "loss": 0.806, + "step": 5837 + }, + { + "epoch": 1.694875889098563, + "grad_norm": 3.7476847171783447, + "learning_rate": 7.8848339200549e-06, + "loss": 0.8719, + "step": 5838 + }, + { + "epoch": 1.6951662069966613, + "grad_norm": 3.4608943462371826, + "learning_rate": 7.884049604034331e-06, + "loss": 0.8042, + "step": 5839 + }, + { + "epoch": 1.6954565248947597, + "grad_norm": 3.389352798461914, + "learning_rate": 7.883265181651158e-06, + "loss": 0.7396, + "step": 5840 + }, + { + "epoch": 1.6957468427928581, + "grad_norm": 3.1610846519470215, + "learning_rate": 7.882480652934307e-06, + "loss": 0.7559, + "step": 5841 + }, + { + "epoch": 1.6960371606909566, + "grad_norm": 
3.6229166984558105, + "learning_rate": 7.881696017912716e-06, + "loss": 0.7203, + "step": 5842 + }, + { + "epoch": 1.696327478589055, + "grad_norm": 3.709913492202759, + "learning_rate": 7.880911276615319e-06, + "loss": 0.8945, + "step": 5843 + }, + { + "epoch": 1.6966177964871534, + "grad_norm": 3.5395514965057373, + "learning_rate": 7.880126429071057e-06, + "loss": 0.7933, + "step": 5844 + }, + { + "epoch": 1.6969081143852518, + "grad_norm": 3.6049327850341797, + "learning_rate": 7.879341475308876e-06, + "loss": 0.7339, + "step": 5845 + }, + { + "epoch": 1.6971984322833502, + "grad_norm": 3.444969415664673, + "learning_rate": 7.878556415357721e-06, + "loss": 0.8457, + "step": 5846 + }, + { + "epoch": 1.6974887501814488, + "grad_norm": 3.630948781967163, + "learning_rate": 7.877771249246551e-06, + "loss": 0.7315, + "step": 5847 + }, + { + "epoch": 1.697779068079547, + "grad_norm": 3.8517673015594482, + "learning_rate": 7.876985977004319e-06, + "loss": 0.8216, + "step": 5848 + }, + { + "epoch": 1.6980693859776457, + "grad_norm": 3.088366985321045, + "learning_rate": 7.876200598659984e-06, + "loss": 0.6817, + "step": 5849 + }, + { + "epoch": 1.6983597038757439, + "grad_norm": 3.610283374786377, + "learning_rate": 7.875415114242514e-06, + "loss": 0.7258, + "step": 5850 + }, + { + "epoch": 1.6986500217738425, + "grad_norm": 3.6045877933502197, + "learning_rate": 7.874629523780875e-06, + "loss": 0.7373, + "step": 5851 + }, + { + "epoch": 1.6989403396719407, + "grad_norm": 3.6890554428100586, + "learning_rate": 7.873843827304039e-06, + "loss": 0.9028, + "step": 5852 + }, + { + "epoch": 1.6992306575700393, + "grad_norm": 3.803805112838745, + "learning_rate": 7.873058024840985e-06, + "loss": 0.9551, + "step": 5853 + }, + { + "epoch": 1.6995209754681375, + "grad_norm": 3.7046024799346924, + "learning_rate": 7.87227211642069e-06, + "loss": 0.8835, + "step": 5854 + }, + { + "epoch": 1.6998112933662362, + "grad_norm": 3.598008155822754, + "learning_rate": 7.871486102072138e-06, + "loss": 0.81, + "step": 5855 + }, + { + "epoch": 1.7001016112643343, + "grad_norm": 3.314302921295166, + "learning_rate": 7.870699981824322e-06, + "loss": 0.8002, + "step": 5856 + }, + { + "epoch": 1.700391929162433, + "grad_norm": 3.438389301300049, + "learning_rate": 7.869913755706227e-06, + "loss": 0.697, + "step": 5857 + }, + { + "epoch": 1.7006822470605312, + "grad_norm": 3.140916585922241, + "learning_rate": 7.869127423746852e-06, + "loss": 0.7491, + "step": 5858 + }, + { + "epoch": 1.7009725649586298, + "grad_norm": 3.362424612045288, + "learning_rate": 7.868340985975195e-06, + "loss": 0.8557, + "step": 5859 + }, + { + "epoch": 1.701262882856728, + "grad_norm": 3.793604850769043, + "learning_rate": 7.867554442420262e-06, + "loss": 0.6942, + "step": 5860 + }, + { + "epoch": 1.7015532007548266, + "grad_norm": 3.624799966812134, + "learning_rate": 7.86676779311106e-06, + "loss": 0.7548, + "step": 5861 + }, + { + "epoch": 1.7018435186529248, + "grad_norm": 4.076056957244873, + "learning_rate": 7.865981038076598e-06, + "loss": 0.8502, + "step": 5862 + }, + { + "epoch": 1.7021338365510235, + "grad_norm": 3.5222671031951904, + "learning_rate": 7.865194177345894e-06, + "loss": 0.6433, + "step": 5863 + }, + { + "epoch": 1.7024241544491217, + "grad_norm": 3.4212605953216553, + "learning_rate": 7.864407210947965e-06, + "loss": 0.7633, + "step": 5864 + }, + { + "epoch": 1.7027144723472203, + "grad_norm": 3.3345491886138916, + "learning_rate": 7.863620138911833e-06, + "loss": 0.7564, + "step": 5865 + }, + { + "epoch": 
1.7030047902453185, + "grad_norm": 3.045092821121216, + "learning_rate": 7.862832961266529e-06, + "loss": 0.7526, + "step": 5866 + }, + { + "epoch": 1.7032951081434171, + "grad_norm": 3.5737078189849854, + "learning_rate": 7.862045678041082e-06, + "loss": 0.7683, + "step": 5867 + }, + { + "epoch": 1.7035854260415153, + "grad_norm": 3.4689781665802, + "learning_rate": 7.861258289264524e-06, + "loss": 0.716, + "step": 5868 + }, + { + "epoch": 1.703875743939614, + "grad_norm": 3.78070068359375, + "learning_rate": 7.860470794965896e-06, + "loss": 0.7166, + "step": 5869 + }, + { + "epoch": 1.7041660618377121, + "grad_norm": 3.7463736534118652, + "learning_rate": 7.859683195174242e-06, + "loss": 0.8338, + "step": 5870 + }, + { + "epoch": 1.7044563797358108, + "grad_norm": 3.7229490280151367, + "learning_rate": 7.858895489918605e-06, + "loss": 0.8716, + "step": 5871 + }, + { + "epoch": 1.7047466976339092, + "grad_norm": 3.8799808025360107, + "learning_rate": 7.858107679228037e-06, + "loss": 0.7594, + "step": 5872 + }, + { + "epoch": 1.7050370155320076, + "grad_norm": 3.2937357425689697, + "learning_rate": 7.857319763131592e-06, + "loss": 0.6893, + "step": 5873 + }, + { + "epoch": 1.705327333430106, + "grad_norm": 3.463261127471924, + "learning_rate": 7.856531741658328e-06, + "loss": 0.7997, + "step": 5874 + }, + { + "epoch": 1.7056176513282044, + "grad_norm": 3.727832317352295, + "learning_rate": 7.855743614837307e-06, + "loss": 0.7482, + "step": 5875 + }, + { + "epoch": 1.7059079692263028, + "grad_norm": 3.596024990081787, + "learning_rate": 7.854955382697597e-06, + "loss": 0.6919, + "step": 5876 + }, + { + "epoch": 1.7061982871244012, + "grad_norm": 3.800488233566284, + "learning_rate": 7.854167045268265e-06, + "loss": 0.9058, + "step": 5877 + }, + { + "epoch": 1.7064886050224997, + "grad_norm": 3.0563924312591553, + "learning_rate": 7.853378602578381e-06, + "loss": 0.6268, + "step": 5878 + }, + { + "epoch": 1.706778922920598, + "grad_norm": 4.0375494956970215, + "learning_rate": 7.85259005465703e-06, + "loss": 0.8667, + "step": 5879 + }, + { + "epoch": 1.7070692408186965, + "grad_norm": 3.549715995788574, + "learning_rate": 7.851801401533288e-06, + "loss": 0.6337, + "step": 5880 + }, + { + "epoch": 1.707359558716795, + "grad_norm": 3.2920758724212646, + "learning_rate": 7.851012643236244e-06, + "loss": 0.6598, + "step": 5881 + }, + { + "epoch": 1.7076498766148933, + "grad_norm": 2.9315907955169678, + "learning_rate": 7.850223779794983e-06, + "loss": 0.6499, + "step": 5882 + }, + { + "epoch": 1.7079401945129917, + "grad_norm": 3.107271432876587, + "learning_rate": 7.849434811238601e-06, + "loss": 0.6202, + "step": 5883 + }, + { + "epoch": 1.7082305124110901, + "grad_norm": 3.9191412925720215, + "learning_rate": 7.848645737596193e-06, + "loss": 0.887, + "step": 5884 + }, + { + "epoch": 1.7085208303091886, + "grad_norm": 3.584061861038208, + "learning_rate": 7.847856558896863e-06, + "loss": 0.8037, + "step": 5885 + }, + { + "epoch": 1.708811148207287, + "grad_norm": 3.5416791439056396, + "learning_rate": 7.847067275169711e-06, + "loss": 0.8083, + "step": 5886 + }, + { + "epoch": 1.7091014661053854, + "grad_norm": 3.7633187770843506, + "learning_rate": 7.846277886443849e-06, + "loss": 0.7173, + "step": 5887 + }, + { + "epoch": 1.7093917840034838, + "grad_norm": 3.4615838527679443, + "learning_rate": 7.845488392748387e-06, + "loss": 0.7684, + "step": 5888 + }, + { + "epoch": 1.7096821019015822, + "grad_norm": 3.8253400325775146, + "learning_rate": 7.844698794112444e-06, + "loss": 0.7963, + "step": 
5889 + }, + { + "epoch": 1.7099724197996806, + "grad_norm": 3.686365842819214, + "learning_rate": 7.843909090565136e-06, + "loss": 0.7613, + "step": 5890 + }, + { + "epoch": 1.710262737697779, + "grad_norm": 3.3100762367248535, + "learning_rate": 7.843119282135592e-06, + "loss": 0.743, + "step": 5891 + }, + { + "epoch": 1.7105530555958774, + "grad_norm": 3.4483158588409424, + "learning_rate": 7.842329368852935e-06, + "loss": 0.7322, + "step": 5892 + }, + { + "epoch": 1.7108433734939759, + "grad_norm": 3.625225305557251, + "learning_rate": 7.841539350746299e-06, + "loss": 0.7968, + "step": 5893 + }, + { + "epoch": 1.7111336913920743, + "grad_norm": 3.4722776412963867, + "learning_rate": 7.840749227844819e-06, + "loss": 0.7476, + "step": 5894 + }, + { + "epoch": 1.7114240092901727, + "grad_norm": 3.5864033699035645, + "learning_rate": 7.839959000177637e-06, + "loss": 0.7872, + "step": 5895 + }, + { + "epoch": 1.7117143271882713, + "grad_norm": 3.2345564365386963, + "learning_rate": 7.839168667773891e-06, + "loss": 0.7775, + "step": 5896 + }, + { + "epoch": 1.7120046450863695, + "grad_norm": 3.407197952270508, + "learning_rate": 7.838378230662732e-06, + "loss": 0.7034, + "step": 5897 + }, + { + "epoch": 1.7122949629844681, + "grad_norm": 3.791569948196411, + "learning_rate": 7.837587688873314e-06, + "loss": 0.782, + "step": 5898 + }, + { + "epoch": 1.7125852808825663, + "grad_norm": 4.092060089111328, + "learning_rate": 7.836797042434785e-06, + "loss": 0.8197, + "step": 5899 + }, + { + "epoch": 1.712875598780665, + "grad_norm": 3.3512213230133057, + "learning_rate": 7.836006291376307e-06, + "loss": 0.6995, + "step": 5900 + }, + { + "epoch": 1.7131659166787632, + "grad_norm": 3.657559394836426, + "learning_rate": 7.835215435727042e-06, + "loss": 0.7018, + "step": 5901 + }, + { + "epoch": 1.7134562345768618, + "grad_norm": 3.197721481323242, + "learning_rate": 7.834424475516158e-06, + "loss": 0.755, + "step": 5902 + }, + { + "epoch": 1.71374655247496, + "grad_norm": 3.3309671878814697, + "learning_rate": 7.833633410772823e-06, + "loss": 0.7921, + "step": 5903 + }, + { + "epoch": 1.7140368703730586, + "grad_norm": 3.4525208473205566, + "learning_rate": 7.832842241526212e-06, + "loss": 0.7811, + "step": 5904 + }, + { + "epoch": 1.7143271882711568, + "grad_norm": 3.945049285888672, + "learning_rate": 7.832050967805504e-06, + "loss": 0.702, + "step": 5905 + }, + { + "epoch": 1.7146175061692555, + "grad_norm": 3.4726674556732178, + "learning_rate": 7.83125958963988e-06, + "loss": 0.7474, + "step": 5906 + }, + { + "epoch": 1.7149078240673536, + "grad_norm": 3.5951087474823, + "learning_rate": 7.830468107058527e-06, + "loss": 0.7378, + "step": 5907 + }, + { + "epoch": 1.7151981419654523, + "grad_norm": 3.877894401550293, + "learning_rate": 7.829676520090632e-06, + "loss": 0.855, + "step": 5908 + }, + { + "epoch": 1.7154884598635505, + "grad_norm": 3.470466375350952, + "learning_rate": 7.828884828765391e-06, + "loss": 0.7057, + "step": 5909 + }, + { + "epoch": 1.715778777761649, + "grad_norm": 3.618359088897705, + "learning_rate": 7.828093033112e-06, + "loss": 0.8365, + "step": 5910 + }, + { + "epoch": 1.7160690956597473, + "grad_norm": 3.4028820991516113, + "learning_rate": 7.827301133159659e-06, + "loss": 0.8622, + "step": 5911 + }, + { + "epoch": 1.716359413557846, + "grad_norm": 3.890469789505005, + "learning_rate": 7.826509128937576e-06, + "loss": 0.7958, + "step": 5912 + }, + { + "epoch": 1.7166497314559441, + "grad_norm": 3.6213538646698, + "learning_rate": 7.825717020474957e-06, + "loss": 
0.8028, + "step": 5913 + }, + { + "epoch": 1.7169400493540428, + "grad_norm": 3.528296709060669, + "learning_rate": 7.824924807801015e-06, + "loss": 0.8284, + "step": 5914 + }, + { + "epoch": 1.717230367252141, + "grad_norm": 3.321072816848755, + "learning_rate": 7.824132490944968e-06, + "loss": 0.7871, + "step": 5915 + }, + { + "epoch": 1.7175206851502396, + "grad_norm": 3.2413792610168457, + "learning_rate": 7.823340069936035e-06, + "loss": 0.7666, + "step": 5916 + }, + { + "epoch": 1.7178110030483378, + "grad_norm": 4.080096244812012, + "learning_rate": 7.82254754480344e-06, + "loss": 0.7143, + "step": 5917 + }, + { + "epoch": 1.7181013209464364, + "grad_norm": 3.3351078033447266, + "learning_rate": 7.821754915576415e-06, + "loss": 0.8247, + "step": 5918 + }, + { + "epoch": 1.7183916388445346, + "grad_norm": 3.2570137977600098, + "learning_rate": 7.820962182284183e-06, + "loss": 0.6952, + "step": 5919 + }, + { + "epoch": 1.7186819567426332, + "grad_norm": 3.4597902297973633, + "learning_rate": 7.820169344955991e-06, + "loss": 0.6665, + "step": 5920 + }, + { + "epoch": 1.7189722746407314, + "grad_norm": 3.462433099746704, + "learning_rate": 7.819376403621068e-06, + "loss": 0.7972, + "step": 5921 + }, + { + "epoch": 1.71926259253883, + "grad_norm": 3.6604247093200684, + "learning_rate": 7.818583358308664e-06, + "loss": 0.747, + "step": 5922 + }, + { + "epoch": 1.7195529104369285, + "grad_norm": 3.404092311859131, + "learning_rate": 7.817790209048025e-06, + "loss": 0.7847, + "step": 5923 + }, + { + "epoch": 1.719843228335027, + "grad_norm": 3.8753247261047363, + "learning_rate": 7.8169969558684e-06, + "loss": 0.7468, + "step": 5924 + }, + { + "epoch": 1.7201335462331253, + "grad_norm": 3.532658338546753, + "learning_rate": 7.816203598799046e-06, + "loss": 0.7734, + "step": 5925 + }, + { + "epoch": 1.7204238641312237, + "grad_norm": 3.13362193107605, + "learning_rate": 7.815410137869222e-06, + "loss": 0.6992, + "step": 5926 + }, + { + "epoch": 1.7207141820293221, + "grad_norm": 3.5808610916137695, + "learning_rate": 7.814616573108188e-06, + "loss": 0.7753, + "step": 5927 + }, + { + "epoch": 1.7210044999274206, + "grad_norm": 3.5286667346954346, + "learning_rate": 7.81382290454521e-06, + "loss": 0.6765, + "step": 5928 + }, + { + "epoch": 1.721294817825519, + "grad_norm": 3.8136179447174072, + "learning_rate": 7.813029132209562e-06, + "loss": 0.8816, + "step": 5929 + }, + { + "epoch": 1.7215851357236174, + "grad_norm": 3.408217668533325, + "learning_rate": 7.812235256130515e-06, + "loss": 0.7641, + "step": 5930 + }, + { + "epoch": 1.7218754536217158, + "grad_norm": 3.4473049640655518, + "learning_rate": 7.811441276337348e-06, + "loss": 0.7553, + "step": 5931 + }, + { + "epoch": 1.7221657715198142, + "grad_norm": 3.727487087249756, + "learning_rate": 7.810647192859344e-06, + "loss": 0.8163, + "step": 5932 + }, + { + "epoch": 1.7224560894179126, + "grad_norm": 3.421032667160034, + "learning_rate": 7.809853005725784e-06, + "loss": 0.9554, + "step": 5933 + }, + { + "epoch": 1.722746407316011, + "grad_norm": 3.630430221557617, + "learning_rate": 7.809058714965962e-06, + "loss": 0.719, + "step": 5934 + }, + { + "epoch": 1.7230367252141094, + "grad_norm": 3.216792583465576, + "learning_rate": 7.80826432060917e-06, + "loss": 0.7135, + "step": 5935 + }, + { + "epoch": 1.7233270431122079, + "grad_norm": 3.312319278717041, + "learning_rate": 7.807469822684704e-06, + "loss": 0.7871, + "step": 5936 + }, + { + "epoch": 1.7236173610103063, + "grad_norm": 3.6939849853515625, + "learning_rate": 
7.806675221221862e-06, + "loss": 0.7946, + "step": 5937 + }, + { + "epoch": 1.7239076789084047, + "grad_norm": 3.2859673500061035, + "learning_rate": 7.805880516249955e-06, + "loss": 0.742, + "step": 5938 + }, + { + "epoch": 1.724197996806503, + "grad_norm": 3.6563122272491455, + "learning_rate": 7.805085707798288e-06, + "loss": 0.7939, + "step": 5939 + }, + { + "epoch": 1.7244883147046015, + "grad_norm": 3.717435598373413, + "learning_rate": 7.804290795896172e-06, + "loss": 0.7775, + "step": 5940 + }, + { + "epoch": 1.7247786326027, + "grad_norm": 3.4693424701690674, + "learning_rate": 7.803495780572925e-06, + "loss": 0.7695, + "step": 5941 + }, + { + "epoch": 1.7250689505007983, + "grad_norm": 3.7334964275360107, + "learning_rate": 7.802700661857864e-06, + "loss": 0.853, + "step": 5942 + }, + { + "epoch": 1.7253592683988968, + "grad_norm": 3.2945621013641357, + "learning_rate": 7.801905439780317e-06, + "loss": 0.8119, + "step": 5943 + }, + { + "epoch": 1.7256495862969952, + "grad_norm": 3.5244734287261963, + "learning_rate": 7.80111011436961e-06, + "loss": 0.7805, + "step": 5944 + }, + { + "epoch": 1.7259399041950936, + "grad_norm": 3.339840888977051, + "learning_rate": 7.800314685655072e-06, + "loss": 0.7999, + "step": 5945 + }, + { + "epoch": 1.726230222093192, + "grad_norm": 3.149946928024292, + "learning_rate": 7.79951915366604e-06, + "loss": 0.7761, + "step": 5946 + }, + { + "epoch": 1.7265205399912906, + "grad_norm": 3.8940494060516357, + "learning_rate": 7.798723518431852e-06, + "loss": 0.803, + "step": 5947 + }, + { + "epoch": 1.7268108578893888, + "grad_norm": 3.4763505458831787, + "learning_rate": 7.797927779981854e-06, + "loss": 0.7353, + "step": 5948 + }, + { + "epoch": 1.7271011757874875, + "grad_norm": 3.4645235538482666, + "learning_rate": 7.797131938345386e-06, + "loss": 0.6931, + "step": 5949 + }, + { + "epoch": 1.7273914936855856, + "grad_norm": 3.8292295932769775, + "learning_rate": 7.796335993551805e-06, + "loss": 0.806, + "step": 5950 + }, + { + "epoch": 1.7276818115836843, + "grad_norm": 3.6954762935638428, + "learning_rate": 7.79553994563046e-06, + "loss": 0.8108, + "step": 5951 + }, + { + "epoch": 1.7279721294817825, + "grad_norm": 3.1089465618133545, + "learning_rate": 7.794743794610713e-06, + "loss": 0.668, + "step": 5952 + }, + { + "epoch": 1.728262447379881, + "grad_norm": 3.7287204265594482, + "learning_rate": 7.793947540521922e-06, + "loss": 0.7968, + "step": 5953 + }, + { + "epoch": 1.7285527652779793, + "grad_norm": 3.2793920040130615, + "learning_rate": 7.793151183393458e-06, + "loss": 0.7453, + "step": 5954 + }, + { + "epoch": 1.728843083176078, + "grad_norm": 3.862212896347046, + "learning_rate": 7.792354723254682e-06, + "loss": 0.8377, + "step": 5955 + }, + { + "epoch": 1.7291334010741761, + "grad_norm": 3.502390146255493, + "learning_rate": 7.791558160134975e-06, + "loss": 0.7483, + "step": 5956 + }, + { + "epoch": 1.7294237189722748, + "grad_norm": 3.9124982357025146, + "learning_rate": 7.790761494063712e-06, + "loss": 0.7549, + "step": 5957 + }, + { + "epoch": 1.729714036870373, + "grad_norm": 3.570953845977783, + "learning_rate": 7.789964725070269e-06, + "loss": 0.8017, + "step": 5958 + }, + { + "epoch": 1.7300043547684716, + "grad_norm": 3.851487874984741, + "learning_rate": 7.789167853184036e-06, + "loss": 0.8175, + "step": 5959 + }, + { + "epoch": 1.7302946726665698, + "grad_norm": 3.938213348388672, + "learning_rate": 7.7883708784344e-06, + "loss": 0.7864, + "step": 5960 + }, + { + "epoch": 1.7305849905646684, + "grad_norm": 3.95170521736145, + 
"learning_rate": 7.787573800850752e-06, + "loss": 0.8373, + "step": 5961 + }, + { + "epoch": 1.7308753084627666, + "grad_norm": 3.3376810550689697, + "learning_rate": 7.786776620462488e-06, + "loss": 0.7517, + "step": 5962 + }, + { + "epoch": 1.7311656263608652, + "grad_norm": 3.5237679481506348, + "learning_rate": 7.785979337299008e-06, + "loss": 0.8221, + "step": 5963 + }, + { + "epoch": 1.7314559442589634, + "grad_norm": 3.8222129344940186, + "learning_rate": 7.785181951389718e-06, + "loss": 0.8373, + "step": 5964 + }, + { + "epoch": 1.731746262157062, + "grad_norm": 3.338149070739746, + "learning_rate": 7.784384462764019e-06, + "loss": 0.7124, + "step": 5965 + }, + { + "epoch": 1.7320365800551603, + "grad_norm": 3.3781659603118896, + "learning_rate": 7.783586871451328e-06, + "loss": 0.7377, + "step": 5966 + }, + { + "epoch": 1.732326897953259, + "grad_norm": 3.5843288898468018, + "learning_rate": 7.782789177481057e-06, + "loss": 0.7315, + "step": 5967 + }, + { + "epoch": 1.732617215851357, + "grad_norm": 3.395334005355835, + "learning_rate": 7.781991380882627e-06, + "loss": 0.8184, + "step": 5968 + }, + { + "epoch": 1.7329075337494557, + "grad_norm": 3.441681385040283, + "learning_rate": 7.781193481685459e-06, + "loss": 0.8113, + "step": 5969 + }, + { + "epoch": 1.733197851647554, + "grad_norm": 3.6689629554748535, + "learning_rate": 7.780395479918979e-06, + "loss": 0.7977, + "step": 5970 + }, + { + "epoch": 1.7334881695456525, + "grad_norm": 3.465517520904541, + "learning_rate": 7.779597375612616e-06, + "loss": 0.8234, + "step": 5971 + }, + { + "epoch": 1.733778487443751, + "grad_norm": 3.51955246925354, + "learning_rate": 7.778799168795804e-06, + "loss": 0.7416, + "step": 5972 + }, + { + "epoch": 1.7340688053418494, + "grad_norm": 3.4402823448181152, + "learning_rate": 7.778000859497983e-06, + "loss": 0.7273, + "step": 5973 + }, + { + "epoch": 1.7343591232399478, + "grad_norm": 3.8265280723571777, + "learning_rate": 7.777202447748592e-06, + "loss": 0.8453, + "step": 5974 + }, + { + "epoch": 1.7346494411380462, + "grad_norm": 3.3544716835021973, + "learning_rate": 7.776403933577077e-06, + "loss": 0.6991, + "step": 5975 + }, + { + "epoch": 1.7349397590361446, + "grad_norm": 3.417309045791626, + "learning_rate": 7.775605317012886e-06, + "loss": 0.7992, + "step": 5976 + }, + { + "epoch": 1.735230076934243, + "grad_norm": 3.171778678894043, + "learning_rate": 7.774806598085473e-06, + "loss": 0.6875, + "step": 5977 + }, + { + "epoch": 1.7355203948323414, + "grad_norm": 3.8337888717651367, + "learning_rate": 7.774007776824293e-06, + "loss": 0.8176, + "step": 5978 + }, + { + "epoch": 1.7358107127304399, + "grad_norm": 3.4257326126098633, + "learning_rate": 7.77320885325881e-06, + "loss": 0.7383, + "step": 5979 + }, + { + "epoch": 1.7361010306285383, + "grad_norm": 3.621321201324463, + "learning_rate": 7.772409827418481e-06, + "loss": 0.8088, + "step": 5980 + }, + { + "epoch": 1.7363913485266367, + "grad_norm": 3.9669549465179443, + "learning_rate": 7.77161069933278e-06, + "loss": 0.8997, + "step": 5981 + }, + { + "epoch": 1.736681666424735, + "grad_norm": 3.9241344928741455, + "learning_rate": 7.770811469031176e-06, + "loss": 0.9407, + "step": 5982 + }, + { + "epoch": 1.7369719843228335, + "grad_norm": 3.7991113662719727, + "learning_rate": 7.770012136543144e-06, + "loss": 0.6812, + "step": 5983 + }, + { + "epoch": 1.737262302220932, + "grad_norm": 3.605419158935547, + "learning_rate": 7.769212701898166e-06, + "loss": 0.7869, + "step": 5984 + }, + { + "epoch": 1.7375526201190303, + 
"grad_norm": 3.2687923908233643, + "learning_rate": 7.76841316512572e-06, + "loss": 0.7058, + "step": 5985 + }, + { + "epoch": 1.7378429380171287, + "grad_norm": 3.817347288131714, + "learning_rate": 7.767613526255296e-06, + "loss": 0.8495, + "step": 5986 + }, + { + "epoch": 1.7381332559152272, + "grad_norm": 3.1826589107513428, + "learning_rate": 7.766813785316382e-06, + "loss": 0.792, + "step": 5987 + }, + { + "epoch": 1.7384235738133256, + "grad_norm": 3.6973764896392822, + "learning_rate": 7.766013942338476e-06, + "loss": 0.7691, + "step": 5988 + }, + { + "epoch": 1.738713891711424, + "grad_norm": 3.428189992904663, + "learning_rate": 7.765213997351072e-06, + "loss": 0.8026, + "step": 5989 + }, + { + "epoch": 1.7390042096095224, + "grad_norm": 3.3443777561187744, + "learning_rate": 7.764413950383674e-06, + "loss": 0.7425, + "step": 5990 + }, + { + "epoch": 1.7392945275076208, + "grad_norm": 2.8721110820770264, + "learning_rate": 7.763613801465785e-06, + "loss": 0.6768, + "step": 5991 + }, + { + "epoch": 1.7395848454057192, + "grad_norm": 3.564232587814331, + "learning_rate": 7.762813550626917e-06, + "loss": 0.6933, + "step": 5992 + }, + { + "epoch": 1.7398751633038176, + "grad_norm": 3.7007267475128174, + "learning_rate": 7.76201319789658e-06, + "loss": 0.8681, + "step": 5993 + }, + { + "epoch": 1.740165481201916, + "grad_norm": 3.5045223236083984, + "learning_rate": 7.761212743304294e-06, + "loss": 0.7965, + "step": 5994 + }, + { + "epoch": 1.7404557991000145, + "grad_norm": 3.9651434421539307, + "learning_rate": 7.760412186879579e-06, + "loss": 0.8799, + "step": 5995 + }, + { + "epoch": 1.7407461169981129, + "grad_norm": 3.1684725284576416, + "learning_rate": 7.759611528651954e-06, + "loss": 0.7174, + "step": 5996 + }, + { + "epoch": 1.7410364348962113, + "grad_norm": 3.4137959480285645, + "learning_rate": 7.758810768650954e-06, + "loss": 0.781, + "step": 5997 + }, + { + "epoch": 1.74132675279431, + "grad_norm": 3.7508652210235596, + "learning_rate": 7.758009906906107e-06, + "loss": 0.8172, + "step": 5998 + }, + { + "epoch": 1.7416170706924081, + "grad_norm": 3.002896308898926, + "learning_rate": 7.75720894344695e-06, + "loss": 0.7221, + "step": 5999 + }, + { + "epoch": 1.7419073885905068, + "grad_norm": 3.63832426071167, + "learning_rate": 7.75640787830302e-06, + "loss": 0.7877, + "step": 6000 + }, + { + "epoch": 1.7419073885905068, + "eval_loss": 1.1696195602416992, + "eval_runtime": 13.3242, + "eval_samples_per_second": 30.02, + "eval_steps_per_second": 3.753, + "step": 6000 + }, + { + "epoch": 1.742197706488605, + "grad_norm": 3.424290657043457, + "learning_rate": 7.755606711503861e-06, + "loss": 0.8493, + "step": 6001 + }, + { + "epoch": 1.7424880243867036, + "grad_norm": 3.4848201274871826, + "learning_rate": 7.75480544307902e-06, + "loss": 0.7085, + "step": 6002 + }, + { + "epoch": 1.7427783422848018, + "grad_norm": 3.4856338500976562, + "learning_rate": 7.754004073058048e-06, + "loss": 0.8014, + "step": 6003 + }, + { + "epoch": 1.7430686601829004, + "grad_norm": 3.9823102951049805, + "learning_rate": 7.753202601470499e-06, + "loss": 0.9238, + "step": 6004 + }, + { + "epoch": 1.7433589780809986, + "grad_norm": 3.394909620285034, + "learning_rate": 7.752401028345932e-06, + "loss": 0.8048, + "step": 6005 + }, + { + "epoch": 1.7436492959790972, + "grad_norm": 3.9474101066589355, + "learning_rate": 7.751599353713906e-06, + "loss": 0.8962, + "step": 6006 + }, + { + "epoch": 1.7439396138771954, + "grad_norm": 3.826502799987793, + "learning_rate": 7.750797577603988e-06, + "loss": 
0.8611, + "step": 6007 + }, + { + "epoch": 1.744229931775294, + "grad_norm": 3.7918648719787598, + "learning_rate": 7.749995700045746e-06, + "loss": 0.781, + "step": 6008 + }, + { + "epoch": 1.7445202496733923, + "grad_norm": 3.3785643577575684, + "learning_rate": 7.749193721068754e-06, + "loss": 0.7255, + "step": 6009 + }, + { + "epoch": 1.744810567571491, + "grad_norm": 2.9595866203308105, + "learning_rate": 7.748391640702588e-06, + "loss": 0.752, + "step": 6010 + }, + { + "epoch": 1.745100885469589, + "grad_norm": 3.2847795486450195, + "learning_rate": 7.74758945897683e-06, + "loss": 0.6969, + "step": 6011 + }, + { + "epoch": 1.7453912033676877, + "grad_norm": 3.771801233291626, + "learning_rate": 7.746787175921065e-06, + "loss": 0.7866, + "step": 6012 + }, + { + "epoch": 1.745681521265786, + "grad_norm": 3.230302333831787, + "learning_rate": 7.745984791564876e-06, + "loss": 0.7506, + "step": 6013 + }, + { + "epoch": 1.7459718391638845, + "grad_norm": 4.036153316497803, + "learning_rate": 7.745182305937859e-06, + "loss": 1.0717, + "step": 6014 + }, + { + "epoch": 1.7462621570619827, + "grad_norm": 3.5328006744384766, + "learning_rate": 7.744379719069607e-06, + "loss": 0.791, + "step": 6015 + }, + { + "epoch": 1.7465524749600814, + "grad_norm": 3.5628857612609863, + "learning_rate": 7.74357703098972e-06, + "loss": 0.7866, + "step": 6016 + }, + { + "epoch": 1.7468427928581796, + "grad_norm": 3.3404319286346436, + "learning_rate": 7.742774241727801e-06, + "loss": 0.7193, + "step": 6017 + }, + { + "epoch": 1.7471331107562782, + "grad_norm": 3.2553791999816895, + "learning_rate": 7.741971351313458e-06, + "loss": 0.7112, + "step": 6018 + }, + { + "epoch": 1.7474234286543764, + "grad_norm": 3.824651002883911, + "learning_rate": 7.7411683597763e-06, + "loss": 0.7889, + "step": 6019 + }, + { + "epoch": 1.747713746552475, + "grad_norm": 2.963634967803955, + "learning_rate": 7.740365267145937e-06, + "loss": 0.6034, + "step": 6020 + }, + { + "epoch": 1.7480040644505732, + "grad_norm": 3.501497268676758, + "learning_rate": 7.739562073451994e-06, + "loss": 0.7022, + "step": 6021 + }, + { + "epoch": 1.7482943823486718, + "grad_norm": 3.259615898132324, + "learning_rate": 7.738758778724087e-06, + "loss": 0.7075, + "step": 6022 + }, + { + "epoch": 1.7485847002467703, + "grad_norm": 3.740983009338379, + "learning_rate": 7.737955382991844e-06, + "loss": 0.8299, + "step": 6023 + }, + { + "epoch": 1.7488750181448687, + "grad_norm": 3.5070557594299316, + "learning_rate": 7.737151886284893e-06, + "loss": 0.7363, + "step": 6024 + }, + { + "epoch": 1.749165336042967, + "grad_norm": 3.7931597232818604, + "learning_rate": 7.736348288632866e-06, + "loss": 0.8515, + "step": 6025 + }, + { + "epoch": 1.7494556539410655, + "grad_norm": 3.109853744506836, + "learning_rate": 7.7355445900654e-06, + "loss": 0.669, + "step": 6026 + }, + { + "epoch": 1.749745971839164, + "grad_norm": 3.4060046672821045, + "learning_rate": 7.734740790612137e-06, + "loss": 0.8745, + "step": 6027 + }, + { + "epoch": 1.7500362897372623, + "grad_norm": 3.7956717014312744, + "learning_rate": 7.733936890302716e-06, + "loss": 0.8567, + "step": 6028 + }, + { + "epoch": 1.7503266076353607, + "grad_norm": 3.112710475921631, + "learning_rate": 7.733132889166788e-06, + "loss": 0.7221, + "step": 6029 + }, + { + "epoch": 1.7506169255334592, + "grad_norm": 3.7839791774749756, + "learning_rate": 7.732328787234006e-06, + "loss": 0.8762, + "step": 6030 + }, + { + "epoch": 1.7509072434315576, + "grad_norm": 3.9805736541748047, + "learning_rate": 
7.73152458453402e-06, + "loss": 0.8325, + "step": 6031 + }, + { + "epoch": 1.751197561329656, + "grad_norm": 3.4485583305358887, + "learning_rate": 7.730720281096493e-06, + "loss": 0.7338, + "step": 6032 + }, + { + "epoch": 1.7514878792277544, + "grad_norm": 3.645721912384033, + "learning_rate": 7.729915876951082e-06, + "loss": 0.7995, + "step": 6033 + }, + { + "epoch": 1.7517781971258528, + "grad_norm": 3.793673515319824, + "learning_rate": 7.72911137212746e-06, + "loss": 0.8108, + "step": 6034 + }, + { + "epoch": 1.7520685150239512, + "grad_norm": 3.6693036556243896, + "learning_rate": 7.728306766655294e-06, + "loss": 0.7696, + "step": 6035 + }, + { + "epoch": 1.7523588329220496, + "grad_norm": 3.7668471336364746, + "learning_rate": 7.727502060564257e-06, + "loss": 0.8003, + "step": 6036 + }, + { + "epoch": 1.752649150820148, + "grad_norm": 3.386531352996826, + "learning_rate": 7.726697253884026e-06, + "loss": 0.8003, + "step": 6037 + }, + { + "epoch": 1.7529394687182465, + "grad_norm": 3.680187940597534, + "learning_rate": 7.725892346644281e-06, + "loss": 0.876, + "step": 6038 + }, + { + "epoch": 1.7532297866163449, + "grad_norm": 2.98075795173645, + "learning_rate": 7.72508733887471e-06, + "loss": 0.7267, + "step": 6039 + }, + { + "epoch": 1.7535201045144433, + "grad_norm": 3.63118314743042, + "learning_rate": 7.724282230604998e-06, + "loss": 0.7591, + "step": 6040 + }, + { + "epoch": 1.7538104224125417, + "grad_norm": 3.2664151191711426, + "learning_rate": 7.72347702186484e-06, + "loss": 0.7249, + "step": 6041 + }, + { + "epoch": 1.7541007403106401, + "grad_norm": 3.529172897338867, + "learning_rate": 7.722671712683929e-06, + "loss": 0.7926, + "step": 6042 + }, + { + "epoch": 1.7543910582087385, + "grad_norm": 3.5128173828125, + "learning_rate": 7.721866303091965e-06, + "loss": 0.7381, + "step": 6043 + }, + { + "epoch": 1.754681376106837, + "grad_norm": 3.793933153152466, + "learning_rate": 7.721060793118653e-06, + "loss": 0.8778, + "step": 6044 + }, + { + "epoch": 1.7549716940049354, + "grad_norm": 3.560621500015259, + "learning_rate": 7.7202551827937e-06, + "loss": 0.7361, + "step": 6045 + }, + { + "epoch": 1.7552620119030338, + "grad_norm": 3.519472360610962, + "learning_rate": 7.719449472146814e-06, + "loss": 0.726, + "step": 6046 + }, + { + "epoch": 1.7555523298011324, + "grad_norm": 3.8505566120147705, + "learning_rate": 7.71864366120771e-06, + "loss": 0.9294, + "step": 6047 + }, + { + "epoch": 1.7558426476992306, + "grad_norm": 3.6858813762664795, + "learning_rate": 7.717837750006106e-06, + "loss": 0.7188, + "step": 6048 + }, + { + "epoch": 1.7561329655973292, + "grad_norm": 3.213684320449829, + "learning_rate": 7.717031738571726e-06, + "loss": 0.8008, + "step": 6049 + }, + { + "epoch": 1.7564232834954274, + "grad_norm": 3.483856678009033, + "learning_rate": 7.716225626934293e-06, + "loss": 0.7414, + "step": 6050 + }, + { + "epoch": 1.756713601393526, + "grad_norm": 3.566657781600952, + "learning_rate": 7.715419415123537e-06, + "loss": 0.8229, + "step": 6051 + }, + { + "epoch": 1.7570039192916242, + "grad_norm": 3.8110122680664062, + "learning_rate": 7.71461310316919e-06, + "loss": 0.8532, + "step": 6052 + }, + { + "epoch": 1.7572942371897229, + "grad_norm": 3.7343101501464844, + "learning_rate": 7.71380669110099e-06, + "loss": 0.8744, + "step": 6053 + }, + { + "epoch": 1.757584555087821, + "grad_norm": 3.0625345706939697, + "learning_rate": 7.713000178948675e-06, + "loss": 0.7301, + "step": 6054 + }, + { + "epoch": 1.7578748729859197, + "grad_norm": 3.1641945838928223, + 
"learning_rate": 7.712193566741993e-06, + "loss": 0.6697, + "step": 6055 + }, + { + "epoch": 1.758165190884018, + "grad_norm": 3.662405014038086, + "learning_rate": 7.711386854510685e-06, + "loss": 0.8059, + "step": 6056 + }, + { + "epoch": 1.7584555087821165, + "grad_norm": 3.7662250995635986, + "learning_rate": 7.710580042284507e-06, + "loss": 0.7312, + "step": 6057 + }, + { + "epoch": 1.7587458266802147, + "grad_norm": 3.9004745483398438, + "learning_rate": 7.709773130093213e-06, + "loss": 0.7461, + "step": 6058 + }, + { + "epoch": 1.7590361445783134, + "grad_norm": 3.377485513687134, + "learning_rate": 7.70896611796656e-06, + "loss": 0.8538, + "step": 6059 + }, + { + "epoch": 1.7593264624764116, + "grad_norm": 3.235250473022461, + "learning_rate": 7.708159005934312e-06, + "loss": 0.7092, + "step": 6060 + }, + { + "epoch": 1.7596167803745102, + "grad_norm": 3.500490665435791, + "learning_rate": 7.707351794026236e-06, + "loss": 0.7842, + "step": 6061 + }, + { + "epoch": 1.7599070982726084, + "grad_norm": 3.8645684719085693, + "learning_rate": 7.7065444822721e-06, + "loss": 0.7956, + "step": 6062 + }, + { + "epoch": 1.760197416170707, + "grad_norm": 3.4011542797088623, + "learning_rate": 7.705737070701678e-06, + "loss": 0.8391, + "step": 6063 + }, + { + "epoch": 1.7604877340688052, + "grad_norm": 3.686098337173462, + "learning_rate": 7.704929559344745e-06, + "loss": 0.943, + "step": 6064 + }, + { + "epoch": 1.7607780519669038, + "grad_norm": 3.844574451446533, + "learning_rate": 7.704121948231083e-06, + "loss": 0.9983, + "step": 6065 + }, + { + "epoch": 1.761068369865002, + "grad_norm": 3.554001808166504, + "learning_rate": 7.703314237390478e-06, + "loss": 0.8524, + "step": 6066 + }, + { + "epoch": 1.7613586877631007, + "grad_norm": 3.8397789001464844, + "learning_rate": 7.702506426852715e-06, + "loss": 0.8776, + "step": 6067 + }, + { + "epoch": 1.7616490056611989, + "grad_norm": 3.77868914604187, + "learning_rate": 7.70169851664759e-06, + "loss": 0.9187, + "step": 6068 + }, + { + "epoch": 1.7619393235592975, + "grad_norm": 3.272463321685791, + "learning_rate": 7.700890506804895e-06, + "loss": 0.6733, + "step": 6069 + }, + { + "epoch": 1.7622296414573957, + "grad_norm": 3.852590322494507, + "learning_rate": 7.70008239735443e-06, + "loss": 0.9901, + "step": 6070 + }, + { + "epoch": 1.7625199593554943, + "grad_norm": 3.8171653747558594, + "learning_rate": 7.699274188325995e-06, + "loss": 0.9094, + "step": 6071 + }, + { + "epoch": 1.7628102772535927, + "grad_norm": 3.6177287101745605, + "learning_rate": 7.698465879749404e-06, + "loss": 0.8565, + "step": 6072 + }, + { + "epoch": 1.7631005951516912, + "grad_norm": 3.4823312759399414, + "learning_rate": 7.697657471654459e-06, + "loss": 0.8491, + "step": 6073 + }, + { + "epoch": 1.7633909130497896, + "grad_norm": 3.9708127975463867, + "learning_rate": 7.696848964070976e-06, + "loss": 0.9884, + "step": 6074 + }, + { + "epoch": 1.763681230947888, + "grad_norm": 3.4418365955352783, + "learning_rate": 7.696040357028775e-06, + "loss": 0.7678, + "step": 6075 + }, + { + "epoch": 1.7639715488459864, + "grad_norm": 3.3301215171813965, + "learning_rate": 7.695231650557675e-06, + "loss": 0.7267, + "step": 6076 + }, + { + "epoch": 1.7642618667440848, + "grad_norm": 3.1033713817596436, + "learning_rate": 7.694422844687502e-06, + "loss": 0.6836, + "step": 6077 + }, + { + "epoch": 1.7645521846421832, + "grad_norm": 4.057397365570068, + "learning_rate": 7.693613939448083e-06, + "loss": 0.7511, + "step": 6078 + }, + { + "epoch": 1.7648425025402816, + 
"grad_norm": 3.352520227432251, + "learning_rate": 7.692804934869252e-06, + "loss": 0.7612, + "step": 6079 + }, + { + "epoch": 1.76513282043838, + "grad_norm": 3.3684096336364746, + "learning_rate": 7.691995830980841e-06, + "loss": 0.7262, + "step": 6080 + }, + { + "epoch": 1.7654231383364785, + "grad_norm": 3.3228354454040527, + "learning_rate": 7.691186627812696e-06, + "loss": 0.7095, + "step": 6081 + }, + { + "epoch": 1.7657134562345769, + "grad_norm": 3.406299591064453, + "learning_rate": 7.690377325394653e-06, + "loss": 0.7504, + "step": 6082 + }, + { + "epoch": 1.7660037741326753, + "grad_norm": 3.5867552757263184, + "learning_rate": 7.689567923756563e-06, + "loss": 0.7775, + "step": 6083 + }, + { + "epoch": 1.7662940920307737, + "grad_norm": 3.1561825275421143, + "learning_rate": 7.688758422928275e-06, + "loss": 0.707, + "step": 6084 + }, + { + "epoch": 1.7665844099288721, + "grad_norm": 2.969261646270752, + "learning_rate": 7.687948822939643e-06, + "loss": 0.7095, + "step": 6085 + }, + { + "epoch": 1.7668747278269705, + "grad_norm": 3.4857072830200195, + "learning_rate": 7.687139123820526e-06, + "loss": 0.705, + "step": 6086 + }, + { + "epoch": 1.767165045725069, + "grad_norm": 3.617248296737671, + "learning_rate": 7.686329325600785e-06, + "loss": 0.7477, + "step": 6087 + }, + { + "epoch": 1.7674553636231674, + "grad_norm": 3.9258131980895996, + "learning_rate": 7.685519428310282e-06, + "loss": 0.8036, + "step": 6088 + }, + { + "epoch": 1.7677456815212658, + "grad_norm": 3.5120155811309814, + "learning_rate": 7.684709431978891e-06, + "loss": 0.6849, + "step": 6089 + }, + { + "epoch": 1.7680359994193642, + "grad_norm": 3.392848491668701, + "learning_rate": 7.68389933663648e-06, + "loss": 0.7749, + "step": 6090 + }, + { + "epoch": 1.7683263173174626, + "grad_norm": 4.192860126495361, + "learning_rate": 7.683089142312927e-06, + "loss": 0.8256, + "step": 6091 + }, + { + "epoch": 1.768616635215561, + "grad_norm": 4.079232215881348, + "learning_rate": 7.682278849038109e-06, + "loss": 0.9657, + "step": 6092 + }, + { + "epoch": 1.7689069531136594, + "grad_norm": 3.493929386138916, + "learning_rate": 7.681468456841914e-06, + "loss": 0.7045, + "step": 6093 + }, + { + "epoch": 1.7691972710117578, + "grad_norm": 3.630089044570923, + "learning_rate": 7.680657965754227e-06, + "loss": 0.8063, + "step": 6094 + }, + { + "epoch": 1.7694875889098562, + "grad_norm": 3.227755546569824, + "learning_rate": 7.679847375804938e-06, + "loss": 0.6261, + "step": 6095 + }, + { + "epoch": 1.7697779068079547, + "grad_norm": 3.3954944610595703, + "learning_rate": 7.67903668702394e-06, + "loss": 0.6809, + "step": 6096 + }, + { + "epoch": 1.770068224706053, + "grad_norm": 3.9170215129852295, + "learning_rate": 7.678225899441131e-06, + "loss": 0.8088, + "step": 6097 + }, + { + "epoch": 1.7703585426041517, + "grad_norm": 3.5438239574432373, + "learning_rate": 7.677415013086415e-06, + "loss": 0.7075, + "step": 6098 + }, + { + "epoch": 1.77064886050225, + "grad_norm": 3.731586456298828, + "learning_rate": 7.676604027989695e-06, + "loss": 0.7176, + "step": 6099 + }, + { + "epoch": 1.7709391784003485, + "grad_norm": 3.9872632026672363, + "learning_rate": 7.675792944180884e-06, + "loss": 0.7342, + "step": 6100 + }, + { + "epoch": 1.7712294962984467, + "grad_norm": 3.564387083053589, + "learning_rate": 7.674981761689885e-06, + "loss": 0.8111, + "step": 6101 + }, + { + "epoch": 1.7715198141965454, + "grad_norm": 3.6033754348754883, + "learning_rate": 7.674170480546626e-06, + "loss": 0.6986, + "step": 6102 + }, + { + "epoch": 
1.7718101320946436, + "grad_norm": 3.794177532196045, + "learning_rate": 7.673359100781018e-06, + "loss": 0.8078, + "step": 6103 + }, + { + "epoch": 1.7721004499927422, + "grad_norm": 3.224788188934326, + "learning_rate": 7.67254762242299e-06, + "loss": 0.7318, + "step": 6104 + }, + { + "epoch": 1.7723907678908404, + "grad_norm": 3.258075714111328, + "learning_rate": 7.671736045502462e-06, + "loss": 0.7327, + "step": 6105 + }, + { + "epoch": 1.772681085788939, + "grad_norm": 3.753732919692993, + "learning_rate": 7.67092437004937e-06, + "loss": 0.7999, + "step": 6106 + }, + { + "epoch": 1.7729714036870372, + "grad_norm": 3.635417938232422, + "learning_rate": 7.670112596093649e-06, + "loss": 0.7014, + "step": 6107 + }, + { + "epoch": 1.7732617215851358, + "grad_norm": 4.326013565063477, + "learning_rate": 7.669300723665234e-06, + "loss": 0.9172, + "step": 6108 + }, + { + "epoch": 1.773552039483234, + "grad_norm": 3.5447564125061035, + "learning_rate": 7.668488752794067e-06, + "loss": 0.7672, + "step": 6109 + }, + { + "epoch": 1.7738423573813327, + "grad_norm": 3.6314609050750732, + "learning_rate": 7.667676683510095e-06, + "loss": 0.8618, + "step": 6110 + }, + { + "epoch": 1.7741326752794309, + "grad_norm": 3.521106004714966, + "learning_rate": 7.666864515843266e-06, + "loss": 0.7867, + "step": 6111 + }, + { + "epoch": 1.7744229931775295, + "grad_norm": 3.3227083683013916, + "learning_rate": 7.66605224982353e-06, + "loss": 0.7712, + "step": 6112 + }, + { + "epoch": 1.7747133110756277, + "grad_norm": 3.570622682571411, + "learning_rate": 7.665239885480846e-06, + "loss": 0.7956, + "step": 6113 + }, + { + "epoch": 1.7750036289737263, + "grad_norm": 3.695883274078369, + "learning_rate": 7.664427422845172e-06, + "loss": 0.8755, + "step": 6114 + }, + { + "epoch": 1.7752939468718245, + "grad_norm": 3.544062376022339, + "learning_rate": 7.663614861946474e-06, + "loss": 0.8408, + "step": 6115 + }, + { + "epoch": 1.7755842647699231, + "grad_norm": 3.7533979415893555, + "learning_rate": 7.662802202814717e-06, + "loss": 0.8039, + "step": 6116 + }, + { + "epoch": 1.7758745826680213, + "grad_norm": 3.271301031112671, + "learning_rate": 7.661989445479869e-06, + "loss": 0.7642, + "step": 6117 + }, + { + "epoch": 1.77616490056612, + "grad_norm": 3.6111979484558105, + "learning_rate": 7.661176589971909e-06, + "loss": 0.7683, + "step": 6118 + }, + { + "epoch": 1.7764552184642182, + "grad_norm": 3.15321683883667, + "learning_rate": 7.660363636320809e-06, + "loss": 0.7051, + "step": 6119 + }, + { + "epoch": 1.7767455363623168, + "grad_norm": 3.64837646484375, + "learning_rate": 7.659550584556556e-06, + "loss": 0.716, + "step": 6120 + }, + { + "epoch": 1.777035854260415, + "grad_norm": 3.7064368724823, + "learning_rate": 7.658737434709134e-06, + "loss": 0.7225, + "step": 6121 + }, + { + "epoch": 1.7773261721585136, + "grad_norm": 3.836670160293579, + "learning_rate": 7.657924186808528e-06, + "loss": 0.7857, + "step": 6122 + }, + { + "epoch": 1.777616490056612, + "grad_norm": 3.381930351257324, + "learning_rate": 7.657110840884736e-06, + "loss": 0.7435, + "step": 6123 + }, + { + "epoch": 1.7779068079547105, + "grad_norm": 3.776498317718506, + "learning_rate": 7.656297396967747e-06, + "loss": 0.8766, + "step": 6124 + }, + { + "epoch": 1.7781971258528089, + "grad_norm": 4.0997419357299805, + "learning_rate": 7.655483855087566e-06, + "loss": 0.8466, + "step": 6125 + }, + { + "epoch": 1.7784874437509073, + "grad_norm": 3.578490734100342, + "learning_rate": 7.654670215274194e-06, + "loss": 0.8105, + "step": 6126 + }, + 
{ + "epoch": 1.7787777616490057, + "grad_norm": 3.371166229248047, + "learning_rate": 7.653856477557639e-06, + "loss": 0.7181, + "step": 6127 + }, + { + "epoch": 1.779068079547104, + "grad_norm": 3.529717206954956, + "learning_rate": 7.65304264196791e-06, + "loss": 0.7721, + "step": 6128 + }, + { + "epoch": 1.7793583974452025, + "grad_norm": 3.6220967769622803, + "learning_rate": 7.65222870853502e-06, + "loss": 0.793, + "step": 6129 + }, + { + "epoch": 1.779648715343301, + "grad_norm": 3.6344494819641113, + "learning_rate": 7.651414677288987e-06, + "loss": 0.6975, + "step": 6130 + }, + { + "epoch": 1.7799390332413993, + "grad_norm": 3.3892741203308105, + "learning_rate": 7.650600548259835e-06, + "loss": 0.7217, + "step": 6131 + }, + { + "epoch": 1.7802293511394978, + "grad_norm": 2.9629781246185303, + "learning_rate": 7.649786321477585e-06, + "loss": 0.7099, + "step": 6132 + }, + { + "epoch": 1.7805196690375962, + "grad_norm": 3.578287124633789, + "learning_rate": 7.648971996972268e-06, + "loss": 0.772, + "step": 6133 + }, + { + "epoch": 1.7808099869356946, + "grad_norm": 3.6381213665008545, + "learning_rate": 7.648157574773915e-06, + "loss": 0.712, + "step": 6134 + }, + { + "epoch": 1.781100304833793, + "grad_norm": 3.346418619155884, + "learning_rate": 7.647343054912561e-06, + "loss": 0.7385, + "step": 6135 + }, + { + "epoch": 1.7813906227318914, + "grad_norm": 3.614990472793579, + "learning_rate": 7.646528437418246e-06, + "loss": 0.783, + "step": 6136 + }, + { + "epoch": 1.7816809406299898, + "grad_norm": 3.3961567878723145, + "learning_rate": 7.645713722321013e-06, + "loss": 0.7439, + "step": 6137 + }, + { + "epoch": 1.7819712585280882, + "grad_norm": 3.5309431552886963, + "learning_rate": 7.644898909650906e-06, + "loss": 0.7021, + "step": 6138 + }, + { + "epoch": 1.7822615764261867, + "grad_norm": 3.698122262954712, + "learning_rate": 7.644083999437976e-06, + "loss": 0.7764, + "step": 6139 + }, + { + "epoch": 1.782551894324285, + "grad_norm": 3.429757595062256, + "learning_rate": 7.643268991712281e-06, + "loss": 0.6601, + "step": 6140 + }, + { + "epoch": 1.7828422122223835, + "grad_norm": 3.651519775390625, + "learning_rate": 7.642453886503873e-06, + "loss": 0.7773, + "step": 6141 + }, + { + "epoch": 1.783132530120482, + "grad_norm": 3.704296112060547, + "learning_rate": 7.641638683842814e-06, + "loss": 0.7685, + "step": 6142 + }, + { + "epoch": 1.7834228480185803, + "grad_norm": 3.3031558990478516, + "learning_rate": 7.640823383759169e-06, + "loss": 0.7214, + "step": 6143 + }, + { + "epoch": 1.7837131659166787, + "grad_norm": 3.5565595626831055, + "learning_rate": 7.640007986283006e-06, + "loss": 0.7482, + "step": 6144 + }, + { + "epoch": 1.7840034838147771, + "grad_norm": 4.059230327606201, + "learning_rate": 7.639192491444395e-06, + "loss": 0.848, + "step": 6145 + }, + { + "epoch": 1.7842938017128755, + "grad_norm": 3.8568592071533203, + "learning_rate": 7.638376899273414e-06, + "loss": 0.7522, + "step": 6146 + }, + { + "epoch": 1.784584119610974, + "grad_norm": 3.5061683654785156, + "learning_rate": 7.637561209800137e-06, + "loss": 0.7799, + "step": 6147 + }, + { + "epoch": 1.7848744375090724, + "grad_norm": 3.739004135131836, + "learning_rate": 7.636745423054652e-06, + "loss": 0.8028, + "step": 6148 + }, + { + "epoch": 1.785164755407171, + "grad_norm": 3.494581699371338, + "learning_rate": 7.635929539067042e-06, + "loss": 0.8013, + "step": 6149 + }, + { + "epoch": 1.7854550733052692, + "grad_norm": 3.7833151817321777, + "learning_rate": 7.635113557867395e-06, + "loss": 0.8237, + 
"step": 6150 + }, + { + "epoch": 1.7857453912033678, + "grad_norm": 3.478761911392212, + "learning_rate": 7.634297479485806e-06, + "loss": 0.7016, + "step": 6151 + }, + { + "epoch": 1.786035709101466, + "grad_norm": 3.378567934036255, + "learning_rate": 7.633481303952373e-06, + "loss": 0.8555, + "step": 6152 + }, + { + "epoch": 1.7863260269995647, + "grad_norm": 3.6236679553985596, + "learning_rate": 7.632665031297193e-06, + "loss": 0.8543, + "step": 6153 + }, + { + "epoch": 1.7866163448976629, + "grad_norm": 3.544419050216675, + "learning_rate": 7.631848661550372e-06, + "loss": 0.7616, + "step": 6154 + }, + { + "epoch": 1.7869066627957615, + "grad_norm": 3.239393472671509, + "learning_rate": 7.631032194742017e-06, + "loss": 0.7845, + "step": 6155 + }, + { + "epoch": 1.7871969806938597, + "grad_norm": 3.15290904045105, + "learning_rate": 7.630215630902236e-06, + "loss": 0.7698, + "step": 6156 + }, + { + "epoch": 1.7874872985919583, + "grad_norm": 3.545022964477539, + "learning_rate": 7.62939897006115e-06, + "loss": 0.7294, + "step": 6157 + }, + { + "epoch": 1.7877776164900565, + "grad_norm": 2.9995696544647217, + "learning_rate": 7.628582212248871e-06, + "loss": 0.6932, + "step": 6158 + }, + { + "epoch": 1.7880679343881551, + "grad_norm": 3.410565137863159, + "learning_rate": 7.627765357495526e-06, + "loss": 0.6982, + "step": 6159 + }, + { + "epoch": 1.7883582522862533, + "grad_norm": 3.6005923748016357, + "learning_rate": 7.626948405831235e-06, + "loss": 0.757, + "step": 6160 + }, + { + "epoch": 1.788648570184352, + "grad_norm": 3.7826449871063232, + "learning_rate": 7.626131357286129e-06, + "loss": 0.8267, + "step": 6161 + }, + { + "epoch": 1.7889388880824502, + "grad_norm": 3.534515619277954, + "learning_rate": 7.625314211890342e-06, + "loss": 0.6781, + "step": 6162 + }, + { + "epoch": 1.7892292059805488, + "grad_norm": 3.6266918182373047, + "learning_rate": 7.624496969674009e-06, + "loss": 0.6734, + "step": 6163 + }, + { + "epoch": 1.789519523878647, + "grad_norm": 3.3739120960235596, + "learning_rate": 7.623679630667269e-06, + "loss": 0.6884, + "step": 6164 + }, + { + "epoch": 1.7898098417767456, + "grad_norm": 3.380641222000122, + "learning_rate": 7.622862194900263e-06, + "loss": 0.6936, + "step": 6165 + }, + { + "epoch": 1.7901001596748438, + "grad_norm": 3.769023895263672, + "learning_rate": 7.622044662403143e-06, + "loss": 0.7827, + "step": 6166 + }, + { + "epoch": 1.7903904775729425, + "grad_norm": 3.9562571048736572, + "learning_rate": 7.621227033206055e-06, + "loss": 0.9208, + "step": 6167 + }, + { + "epoch": 1.7906807954710406, + "grad_norm": 3.863774299621582, + "learning_rate": 7.620409307339156e-06, + "loss": 0.8076, + "step": 6168 + }, + { + "epoch": 1.7909711133691393, + "grad_norm": 3.953861951828003, + "learning_rate": 7.6195914848326e-06, + "loss": 0.8365, + "step": 6169 + }, + { + "epoch": 1.7912614312672375, + "grad_norm": 3.024517059326172, + "learning_rate": 7.61877356571655e-06, + "loss": 0.6976, + "step": 6170 + }, + { + "epoch": 1.791551749165336, + "grad_norm": 3.4500885009765625, + "learning_rate": 7.617955550021169e-06, + "loss": 0.7894, + "step": 6171 + }, + { + "epoch": 1.7918420670634343, + "grad_norm": 3.453752040863037, + "learning_rate": 7.617137437776627e-06, + "loss": 0.8166, + "step": 6172 + }, + { + "epoch": 1.792132384961533, + "grad_norm": 3.911886215209961, + "learning_rate": 7.616319229013096e-06, + "loss": 0.9803, + "step": 6173 + }, + { + "epoch": 1.7924227028596313, + "grad_norm": 3.8347620964050293, + "learning_rate": 7.615500923760748e-06, 
+ "loss": 0.7538, + "step": 6174 + }, + { + "epoch": 1.7927130207577298, + "grad_norm": 3.304626226425171, + "learning_rate": 7.614682522049766e-06, + "loss": 0.747, + "step": 6175 + }, + { + "epoch": 1.7930033386558282, + "grad_norm": 3.2706761360168457, + "learning_rate": 7.613864023910329e-06, + "loss": 0.7474, + "step": 6176 + }, + { + "epoch": 1.7932936565539266, + "grad_norm": 3.834886312484741, + "learning_rate": 7.613045429372624e-06, + "loss": 0.8663, + "step": 6177 + }, + { + "epoch": 1.793583974452025, + "grad_norm": 3.344585418701172, + "learning_rate": 7.612226738466841e-06, + "loss": 0.62, + "step": 6178 + }, + { + "epoch": 1.7938742923501234, + "grad_norm": 3.5737040042877197, + "learning_rate": 7.611407951223173e-06, + "loss": 0.7471, + "step": 6179 + }, + { + "epoch": 1.7941646102482218, + "grad_norm": 3.5841925144195557, + "learning_rate": 7.610589067671814e-06, + "loss": 0.8081, + "step": 6180 + }, + { + "epoch": 1.7944549281463202, + "grad_norm": 3.6530447006225586, + "learning_rate": 7.609770087842969e-06, + "loss": 0.7242, + "step": 6181 + }, + { + "epoch": 1.7947452460444187, + "grad_norm": 3.2289116382598877, + "learning_rate": 7.6089510117668365e-06, + "loss": 0.7093, + "step": 6182 + }, + { + "epoch": 1.795035563942517, + "grad_norm": 3.61566424369812, + "learning_rate": 7.608131839473627e-06, + "loss": 0.7938, + "step": 6183 + }, + { + "epoch": 1.7953258818406155, + "grad_norm": 3.7904341220855713, + "learning_rate": 7.607312570993551e-06, + "loss": 0.821, + "step": 6184 + }, + { + "epoch": 1.795616199738714, + "grad_norm": 3.485880136489868, + "learning_rate": 7.606493206356821e-06, + "loss": 0.7012, + "step": 6185 + }, + { + "epoch": 1.7959065176368123, + "grad_norm": 3.770455837249756, + "learning_rate": 7.6056737455936556e-06, + "loss": 0.7758, + "step": 6186 + }, + { + "epoch": 1.7961968355349107, + "grad_norm": 3.34679913520813, + "learning_rate": 7.604854188734278e-06, + "loss": 0.7696, + "step": 6187 + }, + { + "epoch": 1.7964871534330091, + "grad_norm": 3.1228458881378174, + "learning_rate": 7.604034535808909e-06, + "loss": 0.6932, + "step": 6188 + }, + { + "epoch": 1.7967774713311075, + "grad_norm": 3.367436408996582, + "learning_rate": 7.603214786847781e-06, + "loss": 0.8846, + "step": 6189 + }, + { + "epoch": 1.797067789229206, + "grad_norm": 3.469499349594116, + "learning_rate": 7.602394941881126e-06, + "loss": 0.7274, + "step": 6190 + }, + { + "epoch": 1.7973581071273044, + "grad_norm": 3.600771903991699, + "learning_rate": 7.6015750009391776e-06, + "loss": 0.7988, + "step": 6191 + }, + { + "epoch": 1.7976484250254028, + "grad_norm": 3.430292844772339, + "learning_rate": 7.600754964052174e-06, + "loss": 0.8242, + "step": 6192 + }, + { + "epoch": 1.7979387429235012, + "grad_norm": 3.573873281478882, + "learning_rate": 7.5999348312503614e-06, + "loss": 0.859, + "step": 6193 + }, + { + "epoch": 1.7982290608215996, + "grad_norm": 3.5837037563323975, + "learning_rate": 7.5991146025639825e-06, + "loss": 0.7537, + "step": 6194 + }, + { + "epoch": 1.798519378719698, + "grad_norm": 3.798265218734741, + "learning_rate": 7.59829427802329e-06, + "loss": 0.8035, + "step": 6195 + }, + { + "epoch": 1.7988096966177964, + "grad_norm": 3.419114112854004, + "learning_rate": 7.597473857658535e-06, + "loss": 0.6888, + "step": 6196 + }, + { + "epoch": 1.7991000145158949, + "grad_norm": 3.157182216644287, + "learning_rate": 7.596653341499974e-06, + "loss": 0.7266, + "step": 6197 + }, + { + "epoch": 1.7993903324139935, + "grad_norm": 3.9746930599212646, + "learning_rate": 
7.59583272957787e-06, + "loss": 0.9873, + "step": 6198 + }, + { + "epoch": 1.7996806503120917, + "grad_norm": 3.456258535385132, + "learning_rate": 7.595012021922483e-06, + "loss": 0.8182, + "step": 6199 + }, + { + "epoch": 1.7999709682101903, + "grad_norm": 3.296928882598877, + "learning_rate": 7.594191218564084e-06, + "loss": 0.7492, + "step": 6200 + }, + { + "epoch": 1.8002612861082885, + "grad_norm": 3.6365811824798584, + "learning_rate": 7.5933703195329426e-06, + "loss": 0.8622, + "step": 6201 + }, + { + "epoch": 1.8005516040063871, + "grad_norm": 3.2589075565338135, + "learning_rate": 7.592549324859332e-06, + "loss": 0.673, + "step": 6202 + }, + { + "epoch": 1.8008419219044853, + "grad_norm": 4.169826507568359, + "learning_rate": 7.591728234573531e-06, + "loss": 0.8656, + "step": 6203 + }, + { + "epoch": 1.801132239802584, + "grad_norm": 3.259309768676758, + "learning_rate": 7.590907048705822e-06, + "loss": 0.7238, + "step": 6204 + }, + { + "epoch": 1.8014225577006822, + "grad_norm": 4.122686862945557, + "learning_rate": 7.590085767286488e-06, + "loss": 1.0135, + "step": 6205 + }, + { + "epoch": 1.8017128755987808, + "grad_norm": 3.3853394985198975, + "learning_rate": 7.58926439034582e-06, + "loss": 0.7526, + "step": 6206 + }, + { + "epoch": 1.802003193496879, + "grad_norm": 3.3177542686462402, + "learning_rate": 7.5884429179141076e-06, + "loss": 0.7382, + "step": 6207 + }, + { + "epoch": 1.8022935113949776, + "grad_norm": 3.5391876697540283, + "learning_rate": 7.587621350021649e-06, + "loss": 0.8011, + "step": 6208 + }, + { + "epoch": 1.8025838292930758, + "grad_norm": 3.7560062408447266, + "learning_rate": 7.58679968669874e-06, + "loss": 0.8786, + "step": 6209 + }, + { + "epoch": 1.8028741471911744, + "grad_norm": 3.5351386070251465, + "learning_rate": 7.585977927975687e-06, + "loss": 0.726, + "step": 6210 + }, + { + "epoch": 1.8031644650892726, + "grad_norm": 3.548893451690674, + "learning_rate": 7.585156073882793e-06, + "loss": 0.7565, + "step": 6211 + }, + { + "epoch": 1.8034547829873713, + "grad_norm": 3.7670400142669678, + "learning_rate": 7.58433412445037e-06, + "loss": 0.8406, + "step": 6212 + }, + { + "epoch": 1.8037451008854695, + "grad_norm": 3.432896375656128, + "learning_rate": 7.583512079708729e-06, + "loss": 0.7089, + "step": 6213 + }, + { + "epoch": 1.804035418783568, + "grad_norm": 3.5606884956359863, + "learning_rate": 7.582689939688188e-06, + "loss": 0.8647, + "step": 6214 + }, + { + "epoch": 1.8043257366816663, + "grad_norm": 3.3018386363983154, + "learning_rate": 7.581867704419068e-06, + "loss": 0.7557, + "step": 6215 + }, + { + "epoch": 1.804616054579765, + "grad_norm": 3.351177215576172, + "learning_rate": 7.581045373931691e-06, + "loss": 0.8048, + "step": 6216 + }, + { + "epoch": 1.8049063724778631, + "grad_norm": 3.514824151992798, + "learning_rate": 7.580222948256384e-06, + "loss": 0.7764, + "step": 6217 + }, + { + "epoch": 1.8051966903759618, + "grad_norm": 3.573287010192871, + "learning_rate": 7.579400427423479e-06, + "loss": 0.8168, + "step": 6218 + }, + { + "epoch": 1.80548700827406, + "grad_norm": 3.355710506439209, + "learning_rate": 7.57857781146331e-06, + "loss": 0.7323, + "step": 6219 + }, + { + "epoch": 1.8057773261721586, + "grad_norm": 3.2817916870117188, + "learning_rate": 7.577755100406215e-06, + "loss": 0.7215, + "step": 6220 + }, + { + "epoch": 1.8060676440702568, + "grad_norm": 3.442941665649414, + "learning_rate": 7.5769322942825345e-06, + "loss": 0.7334, + "step": 6221 + }, + { + "epoch": 1.8063579619683554, + "grad_norm": 
3.865924596786499, + "learning_rate": 7.576109393122613e-06, + "loss": 0.8406, + "step": 6222 + }, + { + "epoch": 1.8066482798664538, + "grad_norm": 3.839789628982544, + "learning_rate": 7.5752863969568e-06, + "loss": 0.8302, + "step": 6223 + }, + { + "epoch": 1.8069385977645522, + "grad_norm": 3.4474151134490967, + "learning_rate": 7.574463305815446e-06, + "loss": 0.8842, + "step": 6224 + }, + { + "epoch": 1.8072289156626506, + "grad_norm": 3.137389659881592, + "learning_rate": 7.573640119728909e-06, + "loss": 0.8209, + "step": 6225 + }, + { + "epoch": 1.807519233560749, + "grad_norm": 3.777895212173462, + "learning_rate": 7.572816838727544e-06, + "loss": 0.8116, + "step": 6226 + }, + { + "epoch": 1.8078095514588475, + "grad_norm": 3.000427484512329, + "learning_rate": 7.571993462841714e-06, + "loss": 0.6237, + "step": 6227 + }, + { + "epoch": 1.8080998693569459, + "grad_norm": 3.8934295177459717, + "learning_rate": 7.571169992101788e-06, + "loss": 0.9309, + "step": 6228 + }, + { + "epoch": 1.8083901872550443, + "grad_norm": 3.262486457824707, + "learning_rate": 7.570346426538131e-06, + "loss": 0.6841, + "step": 6229 + }, + { + "epoch": 1.8086805051531427, + "grad_norm": 3.2486703395843506, + "learning_rate": 7.56952276618112e-06, + "loss": 0.8261, + "step": 6230 + }, + { + "epoch": 1.8089708230512411, + "grad_norm": 3.4097964763641357, + "learning_rate": 7.568699011061127e-06, + "loss": 0.7107, + "step": 6231 + }, + { + "epoch": 1.8092611409493395, + "grad_norm": 3.5118725299835205, + "learning_rate": 7.5678751612085344e-06, + "loss": 0.7122, + "step": 6232 + }, + { + "epoch": 1.809551458847438, + "grad_norm": 3.1857311725616455, + "learning_rate": 7.567051216653725e-06, + "loss": 0.697, + "step": 6233 + }, + { + "epoch": 1.8098417767455364, + "grad_norm": 4.186178207397461, + "learning_rate": 7.566227177427085e-06, + "loss": 0.8029, + "step": 6234 + }, + { + "epoch": 1.8101320946436348, + "grad_norm": 3.4743754863739014, + "learning_rate": 7.565403043559007e-06, + "loss": 0.7779, + "step": 6235 + }, + { + "epoch": 1.8104224125417332, + "grad_norm": 3.412288188934326, + "learning_rate": 7.5645788150798814e-06, + "loss": 0.7435, + "step": 6236 + }, + { + "epoch": 1.8107127304398316, + "grad_norm": 3.591625690460205, + "learning_rate": 7.563754492020108e-06, + "loss": 0.9457, + "step": 6237 + }, + { + "epoch": 1.81100304833793, + "grad_norm": 3.9877660274505615, + "learning_rate": 7.562930074410084e-06, + "loss": 0.8225, + "step": 6238 + }, + { + "epoch": 1.8112933662360284, + "grad_norm": 3.482994556427002, + "learning_rate": 7.562105562280218e-06, + "loss": 0.8183, + "step": 6239 + }, + { + "epoch": 1.8115836841341268, + "grad_norm": 3.938270330429077, + "learning_rate": 7.561280955660915e-06, + "loss": 0.8329, + "step": 6240 + }, + { + "epoch": 1.8118740020322253, + "grad_norm": 3.121049404144287, + "learning_rate": 7.560456254582586e-06, + "loss": 0.6843, + "step": 6241 + }, + { + "epoch": 1.8121643199303237, + "grad_norm": 3.8467633724212646, + "learning_rate": 7.559631459075646e-06, + "loss": 0.9058, + "step": 6242 + }, + { + "epoch": 1.812454637828422, + "grad_norm": 3.8543753623962402, + "learning_rate": 7.558806569170514e-06, + "loss": 0.8795, + "step": 6243 + }, + { + "epoch": 1.8127449557265205, + "grad_norm": 3.738771438598633, + "learning_rate": 7.557981584897612e-06, + "loss": 0.7087, + "step": 6244 + }, + { + "epoch": 1.813035273624619, + "grad_norm": 3.7522284984588623, + "learning_rate": 7.557156506287364e-06, + "loss": 0.7569, + "step": 6245 + }, + { + "epoch": 
1.8133255915227173, + "grad_norm": 3.697587251663208, + "learning_rate": 7.556331333370199e-06, + "loss": 0.8145, + "step": 6246 + }, + { + "epoch": 1.8136159094208157, + "grad_norm": 3.8390111923217773, + "learning_rate": 7.555506066176549e-06, + "loss": 0.833, + "step": 6247 + }, + { + "epoch": 1.8139062273189142, + "grad_norm": 3.501277208328247, + "learning_rate": 7.5546807047368485e-06, + "loss": 0.717, + "step": 6248 + }, + { + "epoch": 1.8141965452170128, + "grad_norm": 3.8523659706115723, + "learning_rate": 7.553855249081538e-06, + "loss": 0.8559, + "step": 6249 + }, + { + "epoch": 1.814486863115111, + "grad_norm": 3.714585781097412, + "learning_rate": 7.553029699241059e-06, + "loss": 0.7097, + "step": 6250 + }, + { + "epoch": 1.8147771810132096, + "grad_norm": 3.495954751968384, + "learning_rate": 7.552204055245858e-06, + "loss": 0.7008, + "step": 6251 + }, + { + "epoch": 1.8150674989113078, + "grad_norm": 3.6363167762756348, + "learning_rate": 7.551378317126384e-06, + "loss": 0.7602, + "step": 6252 + }, + { + "epoch": 1.8153578168094064, + "grad_norm": 3.7626495361328125, + "learning_rate": 7.5505524849130915e-06, + "loss": 0.8059, + "step": 6253 + }, + { + "epoch": 1.8156481347075046, + "grad_norm": 3.3501880168914795, + "learning_rate": 7.549726558636434e-06, + "loss": 0.7476, + "step": 6254 + }, + { + "epoch": 1.8159384526056033, + "grad_norm": 3.376075267791748, + "learning_rate": 7.548900538326874e-06, + "loss": 0.7685, + "step": 6255 + }, + { + "epoch": 1.8162287705037015, + "grad_norm": 3.886094570159912, + "learning_rate": 7.548074424014873e-06, + "loss": 0.8429, + "step": 6256 + }, + { + "epoch": 1.8165190884018, + "grad_norm": 3.8451836109161377, + "learning_rate": 7.5472482157308975e-06, + "loss": 0.8856, + "step": 6257 + }, + { + "epoch": 1.8168094062998983, + "grad_norm": 3.035158395767212, + "learning_rate": 7.54642191350542e-06, + "loss": 0.6661, + "step": 6258 + }, + { + "epoch": 1.817099724197997, + "grad_norm": 3.0387699604034424, + "learning_rate": 7.545595517368913e-06, + "loss": 0.669, + "step": 6259 + }, + { + "epoch": 1.8173900420960951, + "grad_norm": 3.523467540740967, + "learning_rate": 7.544769027351853e-06, + "loss": 0.7385, + "step": 6260 + }, + { + "epoch": 1.8176803599941938, + "grad_norm": 3.1985654830932617, + "learning_rate": 7.543942443484721e-06, + "loss": 0.7173, + "step": 6261 + }, + { + "epoch": 1.817970677892292, + "grad_norm": 3.688586473464966, + "learning_rate": 7.543115765798002e-06, + "loss": 0.7391, + "step": 6262 + }, + { + "epoch": 1.8182609957903906, + "grad_norm": 3.3867619037628174, + "learning_rate": 7.542288994322181e-06, + "loss": 0.7213, + "step": 6263 + }, + { + "epoch": 1.8185513136884888, + "grad_norm": 3.24111008644104, + "learning_rate": 7.5414621290877525e-06, + "loss": 0.744, + "step": 6264 + }, + { + "epoch": 1.8188416315865874, + "grad_norm": 3.452265739440918, + "learning_rate": 7.540635170125208e-06, + "loss": 0.6929, + "step": 6265 + }, + { + "epoch": 1.8191319494846856, + "grad_norm": 3.555257558822632, + "learning_rate": 7.539808117465047e-06, + "loss": 0.8184, + "step": 6266 + }, + { + "epoch": 1.8194222673827842, + "grad_norm": 3.979184865951538, + "learning_rate": 7.538980971137771e-06, + "loss": 0.85, + "step": 6267 + }, + { + "epoch": 1.8197125852808824, + "grad_norm": 3.006906747817993, + "learning_rate": 7.538153731173885e-06, + "loss": 0.6521, + "step": 6268 + }, + { + "epoch": 1.820002903178981, + "grad_norm": 3.9368133544921875, + "learning_rate": 7.5373263976038944e-06, + "loss": 0.9165, + "step": 
6269 + }, + { + "epoch": 1.8202932210770792, + "grad_norm": 3.690107583999634, + "learning_rate": 7.536498970458314e-06, + "loss": 0.7681, + "step": 6270 + }, + { + "epoch": 1.8205835389751779, + "grad_norm": 3.7240521907806396, + "learning_rate": 7.535671449767659e-06, + "loss": 0.7563, + "step": 6271 + }, + { + "epoch": 1.820873856873276, + "grad_norm": 3.656486988067627, + "learning_rate": 7.534843835562448e-06, + "loss": 0.7902, + "step": 6272 + }, + { + "epoch": 1.8211641747713747, + "grad_norm": 3.4625377655029297, + "learning_rate": 7.5340161278732e-06, + "loss": 0.7638, + "step": 6273 + }, + { + "epoch": 1.8214544926694731, + "grad_norm": 3.750249147415161, + "learning_rate": 7.533188326730444e-06, + "loss": 0.8196, + "step": 6274 + }, + { + "epoch": 1.8217448105675715, + "grad_norm": 3.308974266052246, + "learning_rate": 7.532360432164707e-06, + "loss": 0.7057, + "step": 6275 + }, + { + "epoch": 1.82203512846567, + "grad_norm": 3.5016844272613525, + "learning_rate": 7.531532444206524e-06, + "loss": 0.8291, + "step": 6276 + }, + { + "epoch": 1.8223254463637684, + "grad_norm": 3.492377758026123, + "learning_rate": 7.530704362886428e-06, + "loss": 0.7162, + "step": 6277 + }, + { + "epoch": 1.8226157642618668, + "grad_norm": 3.7556068897247314, + "learning_rate": 7.5298761882349594e-06, + "loss": 0.7858, + "step": 6278 + }, + { + "epoch": 1.8229060821599652, + "grad_norm": 3.9492125511169434, + "learning_rate": 7.5290479202826596e-06, + "loss": 0.8273, + "step": 6279 + }, + { + "epoch": 1.8231964000580636, + "grad_norm": 4.034613609313965, + "learning_rate": 7.528219559060077e-06, + "loss": 0.8135, + "step": 6280 + }, + { + "epoch": 1.823486717956162, + "grad_norm": 3.474411725997925, + "learning_rate": 7.527391104597761e-06, + "loss": 0.8682, + "step": 6281 + }, + { + "epoch": 1.8237770358542604, + "grad_norm": 3.4744694232940674, + "learning_rate": 7.526562556926265e-06, + "loss": 0.7112, + "step": 6282 + }, + { + "epoch": 1.8240673537523588, + "grad_norm": 3.711562395095825, + "learning_rate": 7.525733916076142e-06, + "loss": 0.76, + "step": 6283 + }, + { + "epoch": 1.8243576716504573, + "grad_norm": 3.230764150619507, + "learning_rate": 7.524905182077955e-06, + "loss": 0.6565, + "step": 6284 + }, + { + "epoch": 1.8246479895485557, + "grad_norm": 3.4089322090148926, + "learning_rate": 7.5240763549622685e-06, + "loss": 0.6973, + "step": 6285 + }, + { + "epoch": 1.824938307446654, + "grad_norm": 3.709282636642456, + "learning_rate": 7.523247434759646e-06, + "loss": 0.8532, + "step": 6286 + }, + { + "epoch": 1.8252286253447525, + "grad_norm": 3.3632187843322754, + "learning_rate": 7.522418421500662e-06, + "loss": 0.8516, + "step": 6287 + }, + { + "epoch": 1.825518943242851, + "grad_norm": 3.4261248111724854, + "learning_rate": 7.5215893152158846e-06, + "loss": 0.8845, + "step": 6288 + }, + { + "epoch": 1.8258092611409493, + "grad_norm": 3.668027400970459, + "learning_rate": 7.5207601159358955e-06, + "loss": 0.7571, + "step": 6289 + }, + { + "epoch": 1.8260995790390477, + "grad_norm": 3.609893321990967, + "learning_rate": 7.519930823691272e-06, + "loss": 0.847, + "step": 6290 + }, + { + "epoch": 1.8263898969371462, + "grad_norm": 3.379772186279297, + "learning_rate": 7.519101438512602e-06, + "loss": 0.734, + "step": 6291 + }, + { + "epoch": 1.8266802148352446, + "grad_norm": 3.4122653007507324, + "learning_rate": 7.5182719604304685e-06, + "loss": 0.7448, + "step": 6292 + }, + { + "epoch": 1.826970532733343, + "grad_norm": 3.6492867469787598, + "learning_rate": 7.5174423894754664e-06, + 
"loss": 0.763, + "step": 6293 + }, + { + "epoch": 1.8272608506314414, + "grad_norm": 3.439892292022705, + "learning_rate": 7.5166127256781876e-06, + "loss": 0.7315, + "step": 6294 + }, + { + "epoch": 1.8275511685295398, + "grad_norm": 3.6350364685058594, + "learning_rate": 7.515782969069229e-06, + "loss": 0.7174, + "step": 6295 + }, + { + "epoch": 1.8278414864276382, + "grad_norm": 3.2767841815948486, + "learning_rate": 7.514953119679193e-06, + "loss": 0.714, + "step": 6296 + }, + { + "epoch": 1.8281318043257366, + "grad_norm": 3.690453052520752, + "learning_rate": 7.514123177538686e-06, + "loss": 0.6819, + "step": 6297 + }, + { + "epoch": 1.828422122223835, + "grad_norm": 3.7709054946899414, + "learning_rate": 7.513293142678313e-06, + "loss": 0.7278, + "step": 6298 + }, + { + "epoch": 1.8287124401219335, + "grad_norm": 3.1825685501098633, + "learning_rate": 7.5124630151286845e-06, + "loss": 0.7173, + "step": 6299 + }, + { + "epoch": 1.829002758020032, + "grad_norm": 3.712411880493164, + "learning_rate": 7.511632794920419e-06, + "loss": 0.7861, + "step": 6300 + }, + { + "epoch": 1.8292930759181303, + "grad_norm": 3.5475590229034424, + "learning_rate": 7.510802482084132e-06, + "loss": 0.678, + "step": 6301 + }, + { + "epoch": 1.829583393816229, + "grad_norm": 4.581618309020996, + "learning_rate": 7.509972076650446e-06, + "loss": 0.8925, + "step": 6302 + }, + { + "epoch": 1.8298737117143271, + "grad_norm": 3.616469383239746, + "learning_rate": 7.509141578649986e-06, + "loss": 0.7198, + "step": 6303 + }, + { + "epoch": 1.8301640296124257, + "grad_norm": 3.2408971786499023, + "learning_rate": 7.50831098811338e-06, + "loss": 0.7338, + "step": 6304 + }, + { + "epoch": 1.830454347510524, + "grad_norm": 3.7839319705963135, + "learning_rate": 7.50748030507126e-06, + "loss": 0.8358, + "step": 6305 + }, + { + "epoch": 1.8307446654086226, + "grad_norm": 3.9839742183685303, + "learning_rate": 7.506649529554261e-06, + "loss": 0.8758, + "step": 6306 + }, + { + "epoch": 1.8310349833067208, + "grad_norm": 4.165936470031738, + "learning_rate": 7.505818661593023e-06, + "loss": 0.8142, + "step": 6307 + }, + { + "epoch": 1.8313253012048194, + "grad_norm": 3.3792271614074707, + "learning_rate": 7.504987701218187e-06, + "loss": 0.8431, + "step": 6308 + }, + { + "epoch": 1.8316156191029176, + "grad_norm": 3.979881525039673, + "learning_rate": 7.5041566484603975e-06, + "loss": 0.9142, + "step": 6309 + }, + { + "epoch": 1.8319059370010162, + "grad_norm": 3.540987253189087, + "learning_rate": 7.503325503350307e-06, + "loss": 0.8675, + "step": 6310 + }, + { + "epoch": 1.8321962548991144, + "grad_norm": 3.5563859939575195, + "learning_rate": 7.502494265918563e-06, + "loss": 0.779, + "step": 6311 + }, + { + "epoch": 1.832486572797213, + "grad_norm": 3.8116211891174316, + "learning_rate": 7.501662936195824e-06, + "loss": 0.8108, + "step": 6312 + }, + { + "epoch": 1.8327768906953112, + "grad_norm": 3.5146663188934326, + "learning_rate": 7.500831514212749e-06, + "loss": 0.7253, + "step": 6313 + }, + { + "epoch": 1.8330672085934099, + "grad_norm": 3.380580425262451, + "learning_rate": 7.500000000000001e-06, + "loss": 0.7129, + "step": 6314 + }, + { + "epoch": 1.833357526491508, + "grad_norm": 3.595702886581421, + "learning_rate": 7.499168393588244e-06, + "loss": 0.7543, + "step": 6315 + }, + { + "epoch": 1.8336478443896067, + "grad_norm": 3.2393553256988525, + "learning_rate": 7.498336695008148e-06, + "loss": 0.773, + "step": 6316 + }, + { + "epoch": 1.833938162287705, + "grad_norm": 4.056413650512695, + "learning_rate": 
7.497504904290388e-06, + "loss": 0.8839, + "step": 6317 + }, + { + "epoch": 1.8342284801858035, + "grad_norm": 3.646803617477417, + "learning_rate": 7.496673021465637e-06, + "loss": 0.8599, + "step": 6318 + }, + { + "epoch": 1.8345187980839017, + "grad_norm": 3.5614094734191895, + "learning_rate": 7.495841046564577e-06, + "loss": 0.8281, + "step": 6319 + }, + { + "epoch": 1.8348091159820004, + "grad_norm": 3.5541832447052, + "learning_rate": 7.495008979617887e-06, + "loss": 0.7304, + "step": 6320 + }, + { + "epoch": 1.8350994338800986, + "grad_norm": 3.597524404525757, + "learning_rate": 7.494176820656258e-06, + "loss": 0.757, + "step": 6321 + }, + { + "epoch": 1.8353897517781972, + "grad_norm": 3.2266886234283447, + "learning_rate": 7.493344569710377e-06, + "loss": 0.7391, + "step": 6322 + }, + { + "epoch": 1.8356800696762954, + "grad_norm": 3.777841329574585, + "learning_rate": 7.492512226810938e-06, + "loss": 0.7076, + "step": 6323 + }, + { + "epoch": 1.835970387574394, + "grad_norm": 3.5459866523742676, + "learning_rate": 7.491679791988636e-06, + "loss": 0.7855, + "step": 6324 + }, + { + "epoch": 1.8362607054724924, + "grad_norm": 3.8192386627197266, + "learning_rate": 7.490847265274174e-06, + "loss": 0.7813, + "step": 6325 + }, + { + "epoch": 1.8365510233705908, + "grad_norm": 3.7294278144836426, + "learning_rate": 7.490014646698252e-06, + "loss": 0.7653, + "step": 6326 + }, + { + "epoch": 1.8368413412686893, + "grad_norm": 3.3755605220794678, + "learning_rate": 7.489181936291578e-06, + "loss": 0.7804, + "step": 6327 + }, + { + "epoch": 1.8371316591667877, + "grad_norm": 3.258549928665161, + "learning_rate": 7.488349134084864e-06, + "loss": 0.6664, + "step": 6328 + }, + { + "epoch": 1.837421977064886, + "grad_norm": 3.3586201667785645, + "learning_rate": 7.487516240108819e-06, + "loss": 0.7859, + "step": 6329 + }, + { + "epoch": 1.8377122949629845, + "grad_norm": 3.6065549850463867, + "learning_rate": 7.486683254394164e-06, + "loss": 0.7288, + "step": 6330 + }, + { + "epoch": 1.838002612861083, + "grad_norm": 3.9054665565490723, + "learning_rate": 7.485850176971615e-06, + "loss": 0.7768, + "step": 6331 + }, + { + "epoch": 1.8382929307591813, + "grad_norm": 3.5716748237609863, + "learning_rate": 7.4850170078719e-06, + "loss": 0.7479, + "step": 6332 + }, + { + "epoch": 1.8385832486572797, + "grad_norm": 3.473572254180908, + "learning_rate": 7.484183747125743e-06, + "loss": 0.8524, + "step": 6333 + }, + { + "epoch": 1.8388735665553781, + "grad_norm": 3.5693931579589844, + "learning_rate": 7.483350394763875e-06, + "loss": 0.8059, + "step": 6334 + }, + { + "epoch": 1.8391638844534766, + "grad_norm": 3.8996100425720215, + "learning_rate": 7.48251695081703e-06, + "loss": 0.8808, + "step": 6335 + }, + { + "epoch": 1.839454202351575, + "grad_norm": 3.6452038288116455, + "learning_rate": 7.481683415315947e-06, + "loss": 0.7321, + "step": 6336 + }, + { + "epoch": 1.8397445202496734, + "grad_norm": 3.863975763320923, + "learning_rate": 7.480849788291363e-06, + "loss": 0.8304, + "step": 6337 + }, + { + "epoch": 1.8400348381477718, + "grad_norm": 3.3858823776245117, + "learning_rate": 7.480016069774022e-06, + "loss": 0.7193, + "step": 6338 + }, + { + "epoch": 1.8403251560458702, + "grad_norm": 3.359248161315918, + "learning_rate": 7.479182259794673e-06, + "loss": 0.804, + "step": 6339 + }, + { + "epoch": 1.8406154739439686, + "grad_norm": 3.686079740524292, + "learning_rate": 7.478348358384068e-06, + "loss": 0.8708, + "step": 6340 + }, + { + "epoch": 1.840905791842067, + "grad_norm": 
3.9238340854644775, + "learning_rate": 7.477514365572958e-06, + "loss": 0.8281, + "step": 6341 + }, + { + "epoch": 1.8411961097401655, + "grad_norm": 3.203186273574829, + "learning_rate": 7.4766802813921016e-06, + "loss": 0.7698, + "step": 6342 + }, + { + "epoch": 1.8414864276382639, + "grad_norm": 3.614574432373047, + "learning_rate": 7.475846105872258e-06, + "loss": 0.7622, + "step": 6343 + }, + { + "epoch": 1.8417767455363623, + "grad_norm": 3.4722697734832764, + "learning_rate": 7.475011839044193e-06, + "loss": 0.7134, + "step": 6344 + }, + { + "epoch": 1.8420670634344607, + "grad_norm": 3.442232608795166, + "learning_rate": 7.4741774809386734e-06, + "loss": 0.7563, + "step": 6345 + }, + { + "epoch": 1.842357381332559, + "grad_norm": 4.323866844177246, + "learning_rate": 7.473343031586472e-06, + "loss": 0.8256, + "step": 6346 + }, + { + "epoch": 1.8426476992306575, + "grad_norm": 3.4767138957977295, + "learning_rate": 7.47250849101836e-06, + "loss": 0.6983, + "step": 6347 + }, + { + "epoch": 1.842938017128756, + "grad_norm": 3.646294593811035, + "learning_rate": 7.471673859265115e-06, + "loss": 0.8051, + "step": 6348 + }, + { + "epoch": 1.8432283350268546, + "grad_norm": 3.3605406284332275, + "learning_rate": 7.470839136357521e-06, + "loss": 0.7647, + "step": 6349 + }, + { + "epoch": 1.8435186529249528, + "grad_norm": 3.6406664848327637, + "learning_rate": 7.470004322326358e-06, + "loss": 0.844, + "step": 6350 + }, + { + "epoch": 1.8438089708230514, + "grad_norm": 3.698698043823242, + "learning_rate": 7.469169417202418e-06, + "loss": 0.7931, + "step": 6351 + }, + { + "epoch": 1.8440992887211496, + "grad_norm": 4.0768280029296875, + "learning_rate": 7.468334421016486e-06, + "loss": 0.8189, + "step": 6352 + }, + { + "epoch": 1.8443896066192482, + "grad_norm": 3.440924644470215, + "learning_rate": 7.467499333799364e-06, + "loss": 0.6892, + "step": 6353 + }, + { + "epoch": 1.8446799245173464, + "grad_norm": 3.8425514698028564, + "learning_rate": 7.466664155581844e-06, + "loss": 0.817, + "step": 6354 + }, + { + "epoch": 1.844970242415445, + "grad_norm": 3.595719337463379, + "learning_rate": 7.465828886394729e-06, + "loss": 0.7626, + "step": 6355 + }, + { + "epoch": 1.8452605603135432, + "grad_norm": 3.3320703506469727, + "learning_rate": 7.464993526268822e-06, + "loss": 0.6524, + "step": 6356 + }, + { + "epoch": 1.8455508782116419, + "grad_norm": 3.798980951309204, + "learning_rate": 7.464158075234934e-06, + "loss": 0.7571, + "step": 6357 + }, + { + "epoch": 1.84584119610974, + "grad_norm": 3.508420944213867, + "learning_rate": 7.463322533323874e-06, + "loss": 0.7707, + "step": 6358 + }, + { + "epoch": 1.8461315140078387, + "grad_norm": 3.330502986907959, + "learning_rate": 7.4624869005664554e-06, + "loss": 0.6898, + "step": 6359 + }, + { + "epoch": 1.846421831905937, + "grad_norm": 3.756951332092285, + "learning_rate": 7.4616511769934985e-06, + "loss": 0.8923, + "step": 6360 + }, + { + "epoch": 1.8467121498040355, + "grad_norm": 3.696202516555786, + "learning_rate": 7.460815362635821e-06, + "loss": 0.851, + "step": 6361 + }, + { + "epoch": 1.8470024677021337, + "grad_norm": 3.410972833633423, + "learning_rate": 7.45997945752425e-06, + "loss": 0.7278, + "step": 6362 + }, + { + "epoch": 1.8472927856002324, + "grad_norm": 3.7810752391815186, + "learning_rate": 7.4591434616896156e-06, + "loss": 0.8884, + "step": 6363 + }, + { + "epoch": 1.8475831034983305, + "grad_norm": 3.368793487548828, + "learning_rate": 7.458307375162743e-06, + "loss": 0.6754, + "step": 6364 + }, + { + "epoch": 
1.8478734213964292, + "grad_norm": 3.527655839920044, + "learning_rate": 7.4574711979744705e-06, + "loss": 0.8358, + "step": 6365 + }, + { + "epoch": 1.8481637392945274, + "grad_norm": 3.6964645385742188, + "learning_rate": 7.4566349301556366e-06, + "loss": 0.776, + "step": 6366 + }, + { + "epoch": 1.848454057192626, + "grad_norm": 3.480604410171509, + "learning_rate": 7.45579857173708e-06, + "loss": 0.813, + "step": 6367 + }, + { + "epoch": 1.8487443750907242, + "grad_norm": 3.0932321548461914, + "learning_rate": 7.454962122749648e-06, + "loss": 0.6029, + "step": 6368 + }, + { + "epoch": 1.8490346929888228, + "grad_norm": 3.5673985481262207, + "learning_rate": 7.454125583224186e-06, + "loss": 0.8752, + "step": 6369 + }, + { + "epoch": 1.849325010886921, + "grad_norm": 3.8833866119384766, + "learning_rate": 7.453288953191547e-06, + "loss": 0.8049, + "step": 6370 + }, + { + "epoch": 1.8496153287850197, + "grad_norm": 3.3621320724487305, + "learning_rate": 7.452452232682585e-06, + "loss": 0.778, + "step": 6371 + }, + { + "epoch": 1.8499056466831179, + "grad_norm": 3.439912796020508, + "learning_rate": 7.451615421728158e-06, + "loss": 0.7637, + "step": 6372 + }, + { + "epoch": 1.8501959645812165, + "grad_norm": 3.4569733142852783, + "learning_rate": 7.450778520359127e-06, + "loss": 0.757, + "step": 6373 + }, + { + "epoch": 1.850486282479315, + "grad_norm": 3.3859477043151855, + "learning_rate": 7.449941528606356e-06, + "loss": 0.7486, + "step": 6374 + }, + { + "epoch": 1.8507766003774133, + "grad_norm": 4.253404140472412, + "learning_rate": 7.449104446500713e-06, + "loss": 0.9496, + "step": 6375 + }, + { + "epoch": 1.8510669182755117, + "grad_norm": 3.733933448791504, + "learning_rate": 7.448267274073072e-06, + "loss": 0.8169, + "step": 6376 + }, + { + "epoch": 1.8513572361736101, + "grad_norm": 3.200833320617676, + "learning_rate": 7.447430011354304e-06, + "loss": 0.6549, + "step": 6377 + }, + { + "epoch": 1.8516475540717086, + "grad_norm": 3.777592658996582, + "learning_rate": 7.44659265837529e-06, + "loss": 0.7889, + "step": 6378 + }, + { + "epoch": 1.851937871969807, + "grad_norm": 3.5749125480651855, + "learning_rate": 7.4457552151669085e-06, + "loss": 0.8438, + "step": 6379 + }, + { + "epoch": 1.8522281898679054, + "grad_norm": 3.531050205230713, + "learning_rate": 7.444917681760046e-06, + "loss": 0.8027, + "step": 6380 + }, + { + "epoch": 1.8525185077660038, + "grad_norm": 3.0747227668762207, + "learning_rate": 7.444080058185587e-06, + "loss": 0.6814, + "step": 6381 + }, + { + "epoch": 1.8528088256641022, + "grad_norm": 3.703937530517578, + "learning_rate": 7.443242344474429e-06, + "loss": 0.8243, + "step": 6382 + }, + { + "epoch": 1.8530991435622006, + "grad_norm": 3.3314077854156494, + "learning_rate": 7.442404540657461e-06, + "loss": 0.7393, + "step": 6383 + }, + { + "epoch": 1.853389461460299, + "grad_norm": 3.324211835861206, + "learning_rate": 7.4415666467655835e-06, + "loss": 0.7398, + "step": 6384 + }, + { + "epoch": 1.8536797793583975, + "grad_norm": 3.0877864360809326, + "learning_rate": 7.440728662829697e-06, + "loss": 0.7265, + "step": 6385 + }, + { + "epoch": 1.8539700972564959, + "grad_norm": 3.642578363418579, + "learning_rate": 7.439890588880705e-06, + "loss": 0.7797, + "step": 6386 + }, + { + "epoch": 1.8542604151545943, + "grad_norm": 3.4550280570983887, + "learning_rate": 7.439052424949518e-06, + "loss": 0.7592, + "step": 6387 + }, + { + "epoch": 1.8545507330526927, + "grad_norm": 3.4730403423309326, + "learning_rate": 7.438214171067042e-06, + "loss": 0.7711, + 
"step": 6388 + }, + { + "epoch": 1.854841050950791, + "grad_norm": 3.5537898540496826, + "learning_rate": 7.437375827264198e-06, + "loss": 0.9184, + "step": 6389 + }, + { + "epoch": 1.8551313688488895, + "grad_norm": 3.556471586227417, + "learning_rate": 7.4365373935719e-06, + "loss": 0.7449, + "step": 6390 + }, + { + "epoch": 1.855421686746988, + "grad_norm": 3.9682884216308594, + "learning_rate": 7.435698870021071e-06, + "loss": 0.8094, + "step": 6391 + }, + { + "epoch": 1.8557120046450863, + "grad_norm": 3.6690304279327393, + "learning_rate": 7.434860256642633e-06, + "loss": 0.8124, + "step": 6392 + }, + { + "epoch": 1.8560023225431848, + "grad_norm": 3.4016544818878174, + "learning_rate": 7.434021553467514e-06, + "loss": 0.8016, + "step": 6393 + }, + { + "epoch": 1.8562926404412832, + "grad_norm": 3.5285894870758057, + "learning_rate": 7.433182760526647e-06, + "loss": 0.802, + "step": 6394 + }, + { + "epoch": 1.8565829583393816, + "grad_norm": 3.331476926803589, + "learning_rate": 7.432343877850966e-06, + "loss": 0.6942, + "step": 6395 + }, + { + "epoch": 1.85687327623748, + "grad_norm": 3.557368516921997, + "learning_rate": 7.431504905471407e-06, + "loss": 0.696, + "step": 6396 + }, + { + "epoch": 1.8571635941355784, + "grad_norm": 3.8558270931243896, + "learning_rate": 7.4306658434189126e-06, + "loss": 0.8857, + "step": 6397 + }, + { + "epoch": 1.8574539120336768, + "grad_norm": 3.4773919582366943, + "learning_rate": 7.4298266917244266e-06, + "loss": 0.6939, + "step": 6398 + }, + { + "epoch": 1.8577442299317752, + "grad_norm": 3.5946531295776367, + "learning_rate": 7.428987450418896e-06, + "loss": 0.8188, + "step": 6399 + }, + { + "epoch": 1.8580345478298739, + "grad_norm": 3.5143725872039795, + "learning_rate": 7.428148119533274e-06, + "loss": 0.8558, + "step": 6400 + }, + { + "epoch": 1.858324865727972, + "grad_norm": 3.770815372467041, + "learning_rate": 7.427308699098511e-06, + "loss": 0.7335, + "step": 6401 + }, + { + "epoch": 1.8586151836260707, + "grad_norm": 3.5556554794311523, + "learning_rate": 7.426469189145567e-06, + "loss": 0.7183, + "step": 6402 + }, + { + "epoch": 1.858905501524169, + "grad_norm": 3.102630138397217, + "learning_rate": 7.425629589705401e-06, + "loss": 0.8115, + "step": 6403 + }, + { + "epoch": 1.8591958194222675, + "grad_norm": 3.410172700881958, + "learning_rate": 7.42478990080898e-06, + "loss": 0.7377, + "step": 6404 + }, + { + "epoch": 1.8594861373203657, + "grad_norm": 3.825101613998413, + "learning_rate": 7.423950122487269e-06, + "loss": 0.8198, + "step": 6405 + }, + { + "epoch": 1.8597764552184644, + "grad_norm": 3.740804672241211, + "learning_rate": 7.423110254771238e-06, + "loss": 0.724, + "step": 6406 + }, + { + "epoch": 1.8600667731165625, + "grad_norm": 4.087116718292236, + "learning_rate": 7.4222702976918635e-06, + "loss": 0.8019, + "step": 6407 + }, + { + "epoch": 1.8603570910146612, + "grad_norm": 3.577281951904297, + "learning_rate": 7.421430251280123e-06, + "loss": 0.7734, + "step": 6408 + }, + { + "epoch": 1.8606474089127594, + "grad_norm": 3.1149165630340576, + "learning_rate": 7.420590115566995e-06, + "loss": 0.6023, + "step": 6409 + }, + { + "epoch": 1.860937726810858, + "grad_norm": 3.652672052383423, + "learning_rate": 7.419749890583464e-06, + "loss": 0.8898, + "step": 6410 + }, + { + "epoch": 1.8612280447089562, + "grad_norm": 3.6932666301727295, + "learning_rate": 7.418909576360515e-06, + "loss": 0.8296, + "step": 6411 + }, + { + "epoch": 1.8615183626070548, + "grad_norm": 3.1710166931152344, + "learning_rate": 
7.418069172929144e-06, + "loss": 0.6779, + "step": 6412 + }, + { + "epoch": 1.861808680505153, + "grad_norm": 3.5479466915130615, + "learning_rate": 7.417228680320341e-06, + "loss": 0.7505, + "step": 6413 + }, + { + "epoch": 1.8620989984032517, + "grad_norm": 3.41398549079895, + "learning_rate": 7.416388098565103e-06, + "loss": 0.8062, + "step": 6414 + }, + { + "epoch": 1.8623893163013499, + "grad_norm": 3.561964511871338, + "learning_rate": 7.41554742769443e-06, + "loss": 0.7902, + "step": 6415 + }, + { + "epoch": 1.8626796341994485, + "grad_norm": 3.3961057662963867, + "learning_rate": 7.414706667739327e-06, + "loss": 0.7915, + "step": 6416 + }, + { + "epoch": 1.8629699520975467, + "grad_norm": 3.501466751098633, + "learning_rate": 7.413865818730801e-06, + "loss": 0.829, + "step": 6417 + }, + { + "epoch": 1.8632602699956453, + "grad_norm": 3.181313991546631, + "learning_rate": 7.413024880699861e-06, + "loss": 0.6991, + "step": 6418 + }, + { + "epoch": 1.8635505878937435, + "grad_norm": 3.7406692504882812, + "learning_rate": 7.412183853677522e-06, + "loss": 0.999, + "step": 6419 + }, + { + "epoch": 1.8638409057918421, + "grad_norm": 3.098989248275757, + "learning_rate": 7.4113427376947966e-06, + "loss": 0.7114, + "step": 6420 + }, + { + "epoch": 1.8641312236899403, + "grad_norm": 3.511604070663452, + "learning_rate": 7.4105015327827115e-06, + "loss": 0.6936, + "step": 6421 + }, + { + "epoch": 1.864421541588039, + "grad_norm": 3.8496603965759277, + "learning_rate": 7.409660238972285e-06, + "loss": 0.9334, + "step": 6422 + }, + { + "epoch": 1.8647118594861372, + "grad_norm": 3.1544764041900635, + "learning_rate": 7.4088188562945454e-06, + "loss": 0.7209, + "step": 6423 + }, + { + "epoch": 1.8650021773842358, + "grad_norm": 4.877438068389893, + "learning_rate": 7.4079773847805216e-06, + "loss": 0.9736, + "step": 6424 + }, + { + "epoch": 1.8652924952823342, + "grad_norm": 3.5776352882385254, + "learning_rate": 7.407135824461247e-06, + "loss": 0.7248, + "step": 6425 + }, + { + "epoch": 1.8655828131804326, + "grad_norm": 3.487882375717163, + "learning_rate": 7.406294175367758e-06, + "loss": 0.7247, + "step": 6426 + }, + { + "epoch": 1.865873131078531, + "grad_norm": 3.9391684532165527, + "learning_rate": 7.405452437531098e-06, + "loss": 0.8622, + "step": 6427 + }, + { + "epoch": 1.8661634489766294, + "grad_norm": 3.6147098541259766, + "learning_rate": 7.4046106109823045e-06, + "loss": 0.7524, + "step": 6428 + }, + { + "epoch": 1.8664537668747279, + "grad_norm": 4.174846649169922, + "learning_rate": 7.403768695752426e-06, + "loss": 0.842, + "step": 6429 + }, + { + "epoch": 1.8667440847728263, + "grad_norm": 3.839925527572632, + "learning_rate": 7.402926691872512e-06, + "loss": 0.853, + "step": 6430 + }, + { + "epoch": 1.8670344026709247, + "grad_norm": 3.8808486461639404, + "learning_rate": 7.402084599373616e-06, + "loss": 0.7748, + "step": 6431 + }, + { + "epoch": 1.867324720569023, + "grad_norm": 3.5012404918670654, + "learning_rate": 7.401242418286792e-06, + "loss": 0.8308, + "step": 6432 + }, + { + "epoch": 1.8676150384671215, + "grad_norm": 3.0792105197906494, + "learning_rate": 7.400400148643101e-06, + "loss": 0.6845, + "step": 6433 + }, + { + "epoch": 1.86790535636522, + "grad_norm": 3.1592519283294678, + "learning_rate": 7.399557790473604e-06, + "loss": 0.7151, + "step": 6434 + }, + { + "epoch": 1.8681956742633183, + "grad_norm": 3.6104846000671387, + "learning_rate": 7.398715343809368e-06, + "loss": 0.7171, + "step": 6435 + }, + { + "epoch": 1.8684859921614168, + "grad_norm": 
3.654996633529663, + "learning_rate": 7.397872808681465e-06, + "loss": 0.8835, + "step": 6436 + }, + { + "epoch": 1.8687763100595152, + "grad_norm": 3.450308322906494, + "learning_rate": 7.397030185120962e-06, + "loss": 0.7241, + "step": 6437 + }, + { + "epoch": 1.8690666279576136, + "grad_norm": 4.059999465942383, + "learning_rate": 7.396187473158937e-06, + "loss": 0.8683, + "step": 6438 + }, + { + "epoch": 1.869356945855712, + "grad_norm": 3.439053773880005, + "learning_rate": 7.395344672826469e-06, + "loss": 0.6581, + "step": 6439 + }, + { + "epoch": 1.8696472637538104, + "grad_norm": 3.5375428199768066, + "learning_rate": 7.394501784154641e-06, + "loss": 0.7848, + "step": 6440 + }, + { + "epoch": 1.8699375816519088, + "grad_norm": 3.373065710067749, + "learning_rate": 7.393658807174536e-06, + "loss": 0.6419, + "step": 6441 + }, + { + "epoch": 1.8702278995500072, + "grad_norm": 3.765425682067871, + "learning_rate": 7.392815741917245e-06, + "loss": 0.8696, + "step": 6442 + }, + { + "epoch": 1.8705182174481056, + "grad_norm": 3.7273731231689453, + "learning_rate": 7.391972588413858e-06, + "loss": 0.6883, + "step": 6443 + }, + { + "epoch": 1.870808535346204, + "grad_norm": 3.4617130756378174, + "learning_rate": 7.391129346695472e-06, + "loss": 0.8119, + "step": 6444 + }, + { + "epoch": 1.8710988532443025, + "grad_norm": 3.6720211505889893, + "learning_rate": 7.390286016793185e-06, + "loss": 0.7574, + "step": 6445 + }, + { + "epoch": 1.8713891711424009, + "grad_norm": 3.469089984893799, + "learning_rate": 7.389442598738098e-06, + "loss": 0.8107, + "step": 6446 + }, + { + "epoch": 1.8716794890404993, + "grad_norm": 3.012542963027954, + "learning_rate": 7.388599092561315e-06, + "loss": 0.7098, + "step": 6447 + }, + { + "epoch": 1.8719698069385977, + "grad_norm": 3.592057943344116, + "learning_rate": 7.387755498293947e-06, + "loss": 0.6834, + "step": 6448 + }, + { + "epoch": 1.8722601248366961, + "grad_norm": 3.2716832160949707, + "learning_rate": 7.386911815967104e-06, + "loss": 0.6979, + "step": 6449 + }, + { + "epoch": 1.8725504427347945, + "grad_norm": 3.7392630577087402, + "learning_rate": 7.386068045611899e-06, + "loss": 0.7324, + "step": 6450 + }, + { + "epoch": 1.8728407606328932, + "grad_norm": 3.501025676727295, + "learning_rate": 7.385224187259451e-06, + "loss": 0.8299, + "step": 6451 + }, + { + "epoch": 1.8731310785309914, + "grad_norm": 3.846646547317505, + "learning_rate": 7.384380240940883e-06, + "loss": 0.7621, + "step": 6452 + }, + { + "epoch": 1.87342139642909, + "grad_norm": 3.536499261856079, + "learning_rate": 7.383536206687317e-06, + "loss": 0.7554, + "step": 6453 + }, + { + "epoch": 1.8737117143271882, + "grad_norm": 3.91064715385437, + "learning_rate": 7.382692084529881e-06, + "loss": 0.7909, + "step": 6454 + }, + { + "epoch": 1.8740020322252868, + "grad_norm": 3.2774910926818848, + "learning_rate": 7.381847874499708e-06, + "loss": 0.7301, + "step": 6455 + }, + { + "epoch": 1.874292350123385, + "grad_norm": 4.022462368011475, + "learning_rate": 7.38100357662793e-06, + "loss": 0.7458, + "step": 6456 + }, + { + "epoch": 1.8745826680214837, + "grad_norm": 4.091184139251709, + "learning_rate": 7.380159190945685e-06, + "loss": 0.7613, + "step": 6457 + }, + { + "epoch": 1.8748729859195818, + "grad_norm": 3.5496578216552734, + "learning_rate": 7.379314717484113e-06, + "loss": 0.7163, + "step": 6458 + }, + { + "epoch": 1.8751633038176805, + "grad_norm": 3.375134229660034, + "learning_rate": 7.37847015627436e-06, + "loss": 0.7181, + "step": 6459 + }, + { + "epoch": 
1.8754536217157787, + "grad_norm": 3.6883907318115234, + "learning_rate": 7.3776255073475696e-06, + "loss": 0.7514, + "step": 6460 + }, + { + "epoch": 1.8757439396138773, + "grad_norm": 3.7220544815063477, + "learning_rate": 7.376780770734895e-06, + "loss": 0.8063, + "step": 6461 + }, + { + "epoch": 1.8760342575119755, + "grad_norm": 3.9749653339385986, + "learning_rate": 7.375935946467487e-06, + "loss": 0.8315, + "step": 6462 + }, + { + "epoch": 1.8763245754100741, + "grad_norm": 3.658550500869751, + "learning_rate": 7.375091034576507e-06, + "loss": 0.8187, + "step": 6463 + }, + { + "epoch": 1.8766148933081723, + "grad_norm": 3.2026803493499756, + "learning_rate": 7.374246035093111e-06, + "loss": 0.7014, + "step": 6464 + }, + { + "epoch": 1.876905211206271, + "grad_norm": 3.760976791381836, + "learning_rate": 7.373400948048464e-06, + "loss": 0.8147, + "step": 6465 + }, + { + "epoch": 1.8771955291043692, + "grad_norm": 3.686145544052124, + "learning_rate": 7.372555773473731e-06, + "loss": 0.7361, + "step": 6466 + }, + { + "epoch": 1.8774858470024678, + "grad_norm": 3.6365010738372803, + "learning_rate": 7.371710511400083e-06, + "loss": 0.7642, + "step": 6467 + }, + { + "epoch": 1.877776164900566, + "grad_norm": 3.697004795074463, + "learning_rate": 7.3708651618586925e-06, + "loss": 0.8165, + "step": 6468 + }, + { + "epoch": 1.8780664827986646, + "grad_norm": 3.7043352127075195, + "learning_rate": 7.370019724880734e-06, + "loss": 0.7413, + "step": 6469 + }, + { + "epoch": 1.8783568006967628, + "grad_norm": 3.635573148727417, + "learning_rate": 7.3691742004973906e-06, + "loss": 0.7286, + "step": 6470 + }, + { + "epoch": 1.8786471185948614, + "grad_norm": 3.533658742904663, + "learning_rate": 7.368328588739843e-06, + "loss": 0.7747, + "step": 6471 + }, + { + "epoch": 1.8789374364929596, + "grad_norm": 3.5193533897399902, + "learning_rate": 7.367482889639277e-06, + "loss": 0.7303, + "step": 6472 + }, + { + "epoch": 1.8792277543910583, + "grad_norm": 3.6575841903686523, + "learning_rate": 7.36663710322688e-06, + "loss": 0.8409, + "step": 6473 + }, + { + "epoch": 1.8795180722891565, + "grad_norm": 4.039218425750732, + "learning_rate": 7.365791229533848e-06, + "loss": 0.8452, + "step": 6474 + }, + { + "epoch": 1.879808390187255, + "grad_norm": 3.2911484241485596, + "learning_rate": 7.36494526859137e-06, + "loss": 0.7872, + "step": 6475 + }, + { + "epoch": 1.8800987080853535, + "grad_norm": 3.6404707431793213, + "learning_rate": 7.364099220430654e-06, + "loss": 0.8814, + "step": 6476 + }, + { + "epoch": 1.880389025983452, + "grad_norm": 3.8109161853790283, + "learning_rate": 7.3632530850828934e-06, + "loss": 0.6996, + "step": 6477 + }, + { + "epoch": 1.8806793438815503, + "grad_norm": 3.478952169418335, + "learning_rate": 7.362406862579299e-06, + "loss": 0.745, + "step": 6478 + }, + { + "epoch": 1.8809696617796487, + "grad_norm": 3.923051118850708, + "learning_rate": 7.3615605529510766e-06, + "loss": 0.8903, + "step": 6479 + }, + { + "epoch": 1.8812599796777472, + "grad_norm": 3.3513667583465576, + "learning_rate": 7.360714156229437e-06, + "loss": 0.8369, + "step": 6480 + }, + { + "epoch": 1.8815502975758456, + "grad_norm": 3.3167412281036377, + "learning_rate": 7.359867672445598e-06, + "loss": 0.8021, + "step": 6481 + }, + { + "epoch": 1.881840615473944, + "grad_norm": 3.9195165634155273, + "learning_rate": 7.359021101630775e-06, + "loss": 0.8945, + "step": 6482 + }, + { + "epoch": 1.8821309333720424, + "grad_norm": 3.156968116760254, + "learning_rate": 7.358174443816188e-06, + "loss": 0.7998, + 
"step": 6483 + }, + { + "epoch": 1.8824212512701408, + "grad_norm": 3.577028512954712, + "learning_rate": 7.357327699033065e-06, + "loss": 0.7762, + "step": 6484 + }, + { + "epoch": 1.8827115691682392, + "grad_norm": 3.363496780395508, + "learning_rate": 7.356480867312632e-06, + "loss": 0.7806, + "step": 6485 + }, + { + "epoch": 1.8830018870663376, + "grad_norm": 3.6327083110809326, + "learning_rate": 7.355633948686121e-06, + "loss": 0.8288, + "step": 6486 + }, + { + "epoch": 1.883292204964436, + "grad_norm": 3.394564628601074, + "learning_rate": 7.354786943184763e-06, + "loss": 0.7802, + "step": 6487 + }, + { + "epoch": 1.8835825228625345, + "grad_norm": 3.100290298461914, + "learning_rate": 7.353939850839796e-06, + "loss": 0.7393, + "step": 6488 + }, + { + "epoch": 1.8838728407606329, + "grad_norm": 3.4168612957000732, + "learning_rate": 7.353092671682464e-06, + "loss": 0.7864, + "step": 6489 + }, + { + "epoch": 1.8841631586587313, + "grad_norm": 3.401819944381714, + "learning_rate": 7.352245405744007e-06, + "loss": 0.7972, + "step": 6490 + }, + { + "epoch": 1.8844534765568297, + "grad_norm": 3.8674604892730713, + "learning_rate": 7.351398053055673e-06, + "loss": 0.7671, + "step": 6491 + }, + { + "epoch": 1.8847437944549281, + "grad_norm": 3.5375800132751465, + "learning_rate": 7.35055061364871e-06, + "loss": 0.7949, + "step": 6492 + }, + { + "epoch": 1.8850341123530265, + "grad_norm": 3.1606504917144775, + "learning_rate": 7.349703087554376e-06, + "loss": 0.6934, + "step": 6493 + }, + { + "epoch": 1.885324430251125, + "grad_norm": 3.7292003631591797, + "learning_rate": 7.348855474803923e-06, + "loss": 0.8148, + "step": 6494 + }, + { + "epoch": 1.8856147481492234, + "grad_norm": 3.975048542022705, + "learning_rate": 7.348007775428613e-06, + "loss": 0.7449, + "step": 6495 + }, + { + "epoch": 1.8859050660473218, + "grad_norm": 3.215825319290161, + "learning_rate": 7.347159989459707e-06, + "loss": 0.6939, + "step": 6496 + }, + { + "epoch": 1.8861953839454202, + "grad_norm": 3.75365948677063, + "learning_rate": 7.346312116928473e-06, + "loss": 0.7789, + "step": 6497 + }, + { + "epoch": 1.8864857018435186, + "grad_norm": 3.8031654357910156, + "learning_rate": 7.34546415786618e-06, + "loss": 0.7878, + "step": 6498 + }, + { + "epoch": 1.886776019741617, + "grad_norm": 3.699834108352661, + "learning_rate": 7.3446161123040975e-06, + "loss": 0.7436, + "step": 6499 + }, + { + "epoch": 1.8870663376397157, + "grad_norm": 3.516376256942749, + "learning_rate": 7.3437679802735054e-06, + "loss": 0.7246, + "step": 6500 + }, + { + "epoch": 1.8870663376397157, + "eval_loss": 1.1672762632369995, + "eval_runtime": 13.3449, + "eval_samples_per_second": 29.974, + "eval_steps_per_second": 3.747, + "step": 6500 + }, + { + "epoch": 1.8873566555378138, + "grad_norm": 2.8746337890625, + "learning_rate": 7.342919761805678e-06, + "loss": 0.6085, + "step": 6501 + }, + { + "epoch": 1.8876469734359125, + "grad_norm": 3.8056139945983887, + "learning_rate": 7.342071456931901e-06, + "loss": 0.8326, + "step": 6502 + }, + { + "epoch": 1.8879372913340107, + "grad_norm": 3.5527572631835938, + "learning_rate": 7.3412230656834584e-06, + "loss": 0.7709, + "step": 6503 + }, + { + "epoch": 1.8882276092321093, + "grad_norm": 3.6476054191589355, + "learning_rate": 7.340374588091638e-06, + "loss": 0.7901, + "step": 6504 + }, + { + "epoch": 1.8885179271302075, + "grad_norm": 3.307996988296509, + "learning_rate": 7.339526024187731e-06, + "loss": 0.738, + "step": 6505 + }, + { + "epoch": 1.8888082450283061, + "grad_norm": 
3.871455192565918, + "learning_rate": 7.338677374003032e-06, + "loss": 0.8552, + "step": 6506 + }, + { + "epoch": 1.8890985629264043, + "grad_norm": 3.560155153274536, + "learning_rate": 7.33782863756884e-06, + "loss": 0.8033, + "step": 6507 + }, + { + "epoch": 1.889388880824503, + "grad_norm": 3.363393783569336, + "learning_rate": 7.336979814916456e-06, + "loss": 0.7238, + "step": 6508 + }, + { + "epoch": 1.8896791987226011, + "grad_norm": 3.2523813247680664, + "learning_rate": 7.336130906077183e-06, + "loss": 0.7462, + "step": 6509 + }, + { + "epoch": 1.8899695166206998, + "grad_norm": 3.4237465858459473, + "learning_rate": 7.335281911082332e-06, + "loss": 0.7069, + "step": 6510 + }, + { + "epoch": 1.890259834518798, + "grad_norm": 3.43580961227417, + "learning_rate": 7.334432829963207e-06, + "loss": 0.7886, + "step": 6511 + }, + { + "epoch": 1.8905501524168966, + "grad_norm": 3.4298160076141357, + "learning_rate": 7.333583662751128e-06, + "loss": 0.7729, + "step": 6512 + }, + { + "epoch": 1.8908404703149948, + "grad_norm": 3.8691980838775635, + "learning_rate": 7.332734409477409e-06, + "loss": 0.9029, + "step": 6513 + }, + { + "epoch": 1.8911307882130934, + "grad_norm": 3.4163694381713867, + "learning_rate": 7.331885070173371e-06, + "loss": 0.8358, + "step": 6514 + }, + { + "epoch": 1.8914211061111916, + "grad_norm": 3.1868526935577393, + "learning_rate": 7.331035644870336e-06, + "loss": 0.7406, + "step": 6515 + }, + { + "epoch": 1.8917114240092903, + "grad_norm": 3.5221593379974365, + "learning_rate": 7.3301861335996325e-06, + "loss": 0.7748, + "step": 6516 + }, + { + "epoch": 1.8920017419073885, + "grad_norm": 3.6796584129333496, + "learning_rate": 7.3293365363925894e-06, + "loss": 0.7916, + "step": 6517 + }, + { + "epoch": 1.892292059805487, + "grad_norm": 3.560765266418457, + "learning_rate": 7.328486853280539e-06, + "loss": 0.7967, + "step": 6518 + }, + { + "epoch": 1.8925823777035853, + "grad_norm": 3.809666633605957, + "learning_rate": 7.327637084294818e-06, + "loss": 0.818, + "step": 6519 + }, + { + "epoch": 1.892872695601684, + "grad_norm": 3.327310085296631, + "learning_rate": 7.326787229466762e-06, + "loss": 0.7358, + "step": 6520 + }, + { + "epoch": 1.893163013499782, + "grad_norm": 3.5378589630126953, + "learning_rate": 7.325937288827719e-06, + "loss": 0.7298, + "step": 6521 + }, + { + "epoch": 1.8934533313978807, + "grad_norm": 3.669187068939209, + "learning_rate": 7.325087262409031e-06, + "loss": 0.8244, + "step": 6522 + }, + { + "epoch": 1.893743649295979, + "grad_norm": 3.2379751205444336, + "learning_rate": 7.3242371502420485e-06, + "loss": 0.7645, + "step": 6523 + }, + { + "epoch": 1.8940339671940776, + "grad_norm": 4.166474342346191, + "learning_rate": 7.3233869523581214e-06, + "loss": 0.9135, + "step": 6524 + }, + { + "epoch": 1.894324285092176, + "grad_norm": 3.6318092346191406, + "learning_rate": 7.322536668788605e-06, + "loss": 0.777, + "step": 6525 + }, + { + "epoch": 1.8946146029902744, + "grad_norm": 3.7004711627960205, + "learning_rate": 7.321686299564858e-06, + "loss": 0.6785, + "step": 6526 + }, + { + "epoch": 1.8949049208883728, + "grad_norm": 3.1796281337738037, + "learning_rate": 7.320835844718243e-06, + "loss": 0.7538, + "step": 6527 + }, + { + "epoch": 1.8951952387864712, + "grad_norm": 3.454525947570801, + "learning_rate": 7.319985304280122e-06, + "loss": 0.748, + "step": 6528 + }, + { + "epoch": 1.8954855566845696, + "grad_norm": 3.8724629878997803, + "learning_rate": 7.319134678281863e-06, + "loss": 0.8925, + "step": 6529 + }, + { + "epoch": 
1.895775874582668, + "grad_norm": 3.6467819213867188, + "learning_rate": 7.318283966754838e-06, + "loss": 0.7681, + "step": 6530 + }, + { + "epoch": 1.8960661924807665, + "grad_norm": 3.970150947570801, + "learning_rate": 7.317433169730421e-06, + "loss": 0.815, + "step": 6531 + }, + { + "epoch": 1.8963565103788649, + "grad_norm": 3.366507053375244, + "learning_rate": 7.3165822872399875e-06, + "loss": 0.7705, + "step": 6532 + }, + { + "epoch": 1.8966468282769633, + "grad_norm": 3.836026668548584, + "learning_rate": 7.315731319314919e-06, + "loss": 0.8512, + "step": 6533 + }, + { + "epoch": 1.8969371461750617, + "grad_norm": 3.7245543003082275, + "learning_rate": 7.314880265986598e-06, + "loss": 0.8078, + "step": 6534 + }, + { + "epoch": 1.8972274640731601, + "grad_norm": 3.417665481567383, + "learning_rate": 7.3140291272864116e-06, + "loss": 0.6402, + "step": 6535 + }, + { + "epoch": 1.8975177819712585, + "grad_norm": 3.5677568912506104, + "learning_rate": 7.313177903245749e-06, + "loss": 0.8362, + "step": 6536 + }, + { + "epoch": 1.897808099869357, + "grad_norm": 4.0231218338012695, + "learning_rate": 7.312326593896004e-06, + "loss": 0.9341, + "step": 6537 + }, + { + "epoch": 1.8980984177674554, + "grad_norm": 3.707977056503296, + "learning_rate": 7.311475199268572e-06, + "loss": 0.7686, + "step": 6538 + }, + { + "epoch": 1.8983887356655538, + "grad_norm": 3.406618595123291, + "learning_rate": 7.3106237193948504e-06, + "loss": 0.7152, + "step": 6539 + }, + { + "epoch": 1.8986790535636522, + "grad_norm": 3.426307439804077, + "learning_rate": 7.309772154306245e-06, + "loss": 0.7234, + "step": 6540 + }, + { + "epoch": 1.8989693714617506, + "grad_norm": 3.2683463096618652, + "learning_rate": 7.308920504034157e-06, + "loss": 0.7997, + "step": 6541 + }, + { + "epoch": 1.899259689359849, + "grad_norm": 3.643825054168701, + "learning_rate": 7.308068768609999e-06, + "loss": 0.8139, + "step": 6542 + }, + { + "epoch": 1.8995500072579474, + "grad_norm": 3.777906656265259, + "learning_rate": 7.3072169480651785e-06, + "loss": 0.8931, + "step": 6543 + }, + { + "epoch": 1.8998403251560458, + "grad_norm": 3.40627121925354, + "learning_rate": 7.306365042431115e-06, + "loss": 0.8319, + "step": 6544 + }, + { + "epoch": 1.9001306430541443, + "grad_norm": 3.8914313316345215, + "learning_rate": 7.305513051739222e-06, + "loss": 0.8638, + "step": 6545 + }, + { + "epoch": 1.9004209609522427, + "grad_norm": 4.062667369842529, + "learning_rate": 7.3046609760209255e-06, + "loss": 0.8284, + "step": 6546 + }, + { + "epoch": 1.900711278850341, + "grad_norm": 3.093411684036255, + "learning_rate": 7.303808815307644e-06, + "loss": 0.69, + "step": 6547 + }, + { + "epoch": 1.9010015967484395, + "grad_norm": 3.077059745788574, + "learning_rate": 7.302956569630808e-06, + "loss": 0.7037, + "step": 6548 + }, + { + "epoch": 1.901291914646538, + "grad_norm": 3.481987714767456, + "learning_rate": 7.302104239021849e-06, + "loss": 0.8128, + "step": 6549 + }, + { + "epoch": 1.9015822325446363, + "grad_norm": 3.439530372619629, + "learning_rate": 7.3012518235121976e-06, + "loss": 0.7401, + "step": 6550 + }, + { + "epoch": 1.901872550442735, + "grad_norm": 3.3708889484405518, + "learning_rate": 7.300399323133292e-06, + "loss": 0.7138, + "step": 6551 + }, + { + "epoch": 1.9021628683408331, + "grad_norm": 3.8107917308807373, + "learning_rate": 7.299546737916574e-06, + "loss": 0.8779, + "step": 6552 + }, + { + "epoch": 1.9024531862389318, + "grad_norm": 3.5310473442077637, + "learning_rate": 7.298694067893483e-06, + "loss": 0.7679, + 
"step": 6553 + }, + { + "epoch": 1.90274350413703, + "grad_norm": 3.196654796600342, + "learning_rate": 7.297841313095468e-06, + "loss": 0.7009, + "step": 6554 + }, + { + "epoch": 1.9030338220351286, + "grad_norm": 3.6681015491485596, + "learning_rate": 7.296988473553979e-06, + "loss": 0.7745, + "step": 6555 + }, + { + "epoch": 1.9033241399332268, + "grad_norm": 3.4849812984466553, + "learning_rate": 7.296135549300465e-06, + "loss": 0.7308, + "step": 6556 + }, + { + "epoch": 1.9036144578313254, + "grad_norm": 3.7782578468322754, + "learning_rate": 7.295282540366382e-06, + "loss": 0.8262, + "step": 6557 + }, + { + "epoch": 1.9039047757294236, + "grad_norm": 3.266765832901001, + "learning_rate": 7.29442944678319e-06, + "loss": 0.7228, + "step": 6558 + }, + { + "epoch": 1.9041950936275223, + "grad_norm": 3.374159336090088, + "learning_rate": 7.293576268582352e-06, + "loss": 0.7207, + "step": 6559 + }, + { + "epoch": 1.9044854115256205, + "grad_norm": 3.4658048152923584, + "learning_rate": 7.29272300579533e-06, + "loss": 0.7921, + "step": 6560 + }, + { + "epoch": 1.904775729423719, + "grad_norm": 3.6144564151763916, + "learning_rate": 7.291869658453594e-06, + "loss": 0.7876, + "step": 6561 + }, + { + "epoch": 1.9050660473218173, + "grad_norm": 3.865516424179077, + "learning_rate": 7.2910162265886146e-06, + "loss": 0.8732, + "step": 6562 + }, + { + "epoch": 1.905356365219916, + "grad_norm": 3.7226791381835938, + "learning_rate": 7.2901627102318665e-06, + "loss": 0.9022, + "step": 6563 + }, + { + "epoch": 1.905646683118014, + "grad_norm": 3.6240618228912354, + "learning_rate": 7.289309109414825e-06, + "loss": 0.8165, + "step": 6564 + }, + { + "epoch": 1.9059370010161127, + "grad_norm": 3.4062204360961914, + "learning_rate": 7.2884554241689744e-06, + "loss": 0.7112, + "step": 6565 + }, + { + "epoch": 1.906227318914211, + "grad_norm": 3.518115520477295, + "learning_rate": 7.287601654525793e-06, + "loss": 0.7026, + "step": 6566 + }, + { + "epoch": 1.9065176368123096, + "grad_norm": 3.3960046768188477, + "learning_rate": 7.286747800516771e-06, + "loss": 0.7845, + "step": 6567 + }, + { + "epoch": 1.9068079547104078, + "grad_norm": 3.9768590927124023, + "learning_rate": 7.2858938621734e-06, + "loss": 0.835, + "step": 6568 + }, + { + "epoch": 1.9070982726085064, + "grad_norm": 4.007421493530273, + "learning_rate": 7.285039839527168e-06, + "loss": 0.8687, + "step": 6569 + }, + { + "epoch": 1.9073885905066046, + "grad_norm": 3.7359652519226074, + "learning_rate": 7.284185732609574e-06, + "loss": 0.8011, + "step": 6570 + }, + { + "epoch": 1.9076789084047032, + "grad_norm": 3.613187313079834, + "learning_rate": 7.283331541452117e-06, + "loss": 0.6303, + "step": 6571 + }, + { + "epoch": 1.9079692263028014, + "grad_norm": 3.4708168506622314, + "learning_rate": 7.2824772660863e-06, + "loss": 0.6899, + "step": 6572 + }, + { + "epoch": 1.9082595442009, + "grad_norm": 3.855139970779419, + "learning_rate": 7.281622906543625e-06, + "loss": 0.843, + "step": 6573 + }, + { + "epoch": 1.9085498620989982, + "grad_norm": 3.631195068359375, + "learning_rate": 7.280768462855605e-06, + "loss": 0.8049, + "step": 6574 + }, + { + "epoch": 1.9088401799970969, + "grad_norm": 2.9242310523986816, + "learning_rate": 7.2799139350537466e-06, + "loss": 0.7044, + "step": 6575 + }, + { + "epoch": 1.9091304978951953, + "grad_norm": 3.3103771209716797, + "learning_rate": 7.279059323169569e-06, + "loss": 0.6607, + "step": 6576 + }, + { + "epoch": 1.9094208157932937, + "grad_norm": 3.7364091873168945, + "learning_rate": 
7.278204627234587e-06, + "loss": 0.824, + "step": 6577 + }, + { + "epoch": 1.9097111336913921, + "grad_norm": 4.07366418838501, + "learning_rate": 7.277349847280323e-06, + "loss": 0.8653, + "step": 6578 + }, + { + "epoch": 1.9100014515894905, + "grad_norm": 3.9594852924346924, + "learning_rate": 7.276494983338298e-06, + "loss": 0.8074, + "step": 6579 + }, + { + "epoch": 1.910291769487589, + "grad_norm": 3.387207269668579, + "learning_rate": 7.2756400354400445e-06, + "loss": 0.7408, + "step": 6580 + }, + { + "epoch": 1.9105820873856874, + "grad_norm": 3.4568238258361816, + "learning_rate": 7.274785003617088e-06, + "loss": 0.7503, + "step": 6581 + }, + { + "epoch": 1.9108724052837858, + "grad_norm": 3.83988881111145, + "learning_rate": 7.273929887900965e-06, + "loss": 0.9153, + "step": 6582 + }, + { + "epoch": 1.9111627231818842, + "grad_norm": 3.5666446685791016, + "learning_rate": 7.273074688323209e-06, + "loss": 0.6675, + "step": 6583 + }, + { + "epoch": 1.9114530410799826, + "grad_norm": 3.3551602363586426, + "learning_rate": 7.272219404915359e-06, + "loss": 0.7733, + "step": 6584 + }, + { + "epoch": 1.911743358978081, + "grad_norm": 3.7108700275421143, + "learning_rate": 7.271364037708961e-06, + "loss": 0.765, + "step": 6585 + }, + { + "epoch": 1.9120336768761794, + "grad_norm": 4.1356916427612305, + "learning_rate": 7.270508586735559e-06, + "loss": 0.8728, + "step": 6586 + }, + { + "epoch": 1.9123239947742778, + "grad_norm": 3.6741342544555664, + "learning_rate": 7.269653052026701e-06, + "loss": 0.7273, + "step": 6587 + }, + { + "epoch": 1.9126143126723762, + "grad_norm": 3.7104272842407227, + "learning_rate": 7.268797433613938e-06, + "loss": 0.785, + "step": 6588 + }, + { + "epoch": 1.9129046305704747, + "grad_norm": 3.8318393230438232, + "learning_rate": 7.267941731528827e-06, + "loss": 0.8279, + "step": 6589 + }, + { + "epoch": 1.913194948468573, + "grad_norm": 3.612663507461548, + "learning_rate": 7.267085945802923e-06, + "loss": 0.7359, + "step": 6590 + }, + { + "epoch": 1.9134852663666715, + "grad_norm": 3.567901611328125, + "learning_rate": 7.266230076467792e-06, + "loss": 0.7328, + "step": 6591 + }, + { + "epoch": 1.91377558426477, + "grad_norm": 3.3783185482025146, + "learning_rate": 7.265374123554993e-06, + "loss": 0.7242, + "step": 6592 + }, + { + "epoch": 1.9140659021628683, + "grad_norm": 3.4487850666046143, + "learning_rate": 7.264518087096095e-06, + "loss": 0.7309, + "step": 6593 + }, + { + "epoch": 1.9143562200609667, + "grad_norm": 2.840123176574707, + "learning_rate": 7.26366196712267e-06, + "loss": 0.697, + "step": 6594 + }, + { + "epoch": 1.9146465379590651, + "grad_norm": 3.352851152420044, + "learning_rate": 7.26280576366629e-06, + "loss": 0.747, + "step": 6595 + }, + { + "epoch": 1.9149368558571636, + "grad_norm": 3.16660213470459, + "learning_rate": 7.261949476758531e-06, + "loss": 0.7444, + "step": 6596 + }, + { + "epoch": 1.915227173755262, + "grad_norm": 3.6520681381225586, + "learning_rate": 7.261093106430973e-06, + "loss": 0.7809, + "step": 6597 + }, + { + "epoch": 1.9155174916533604, + "grad_norm": 3.453809976577759, + "learning_rate": 7.260236652715198e-06, + "loss": 0.7439, + "step": 6598 + }, + { + "epoch": 1.9158078095514588, + "grad_norm": 3.48639178276062, + "learning_rate": 7.2593801156427924e-06, + "loss": 0.7891, + "step": 6599 + }, + { + "epoch": 1.9160981274495574, + "grad_norm": 3.535409927368164, + "learning_rate": 7.258523495245344e-06, + "loss": 0.6814, + "step": 6600 + }, + { + "epoch": 1.9163884453476556, + "grad_norm": 3.8680124282836914, 
+ "learning_rate": 7.257666791554448e-06, + "loss": 0.806, + "step": 6601 + }, + { + "epoch": 1.9166787632457543, + "grad_norm": 3.6400327682495117, + "learning_rate": 7.256810004601694e-06, + "loss": 0.8711, + "step": 6602 + }, + { + "epoch": 1.9169690811438524, + "grad_norm": 3.5471885204315186, + "learning_rate": 7.255953134418684e-06, + "loss": 0.8371, + "step": 6603 + }, + { + "epoch": 1.917259399041951, + "grad_norm": 3.0958104133605957, + "learning_rate": 7.255096181037018e-06, + "loss": 0.6935, + "step": 6604 + }, + { + "epoch": 1.9175497169400493, + "grad_norm": 3.3974525928497314, + "learning_rate": 7.254239144488297e-06, + "loss": 0.7654, + "step": 6605 + }, + { + "epoch": 1.917840034838148, + "grad_norm": 3.58324933052063, + "learning_rate": 7.253382024804134e-06, + "loss": 0.7546, + "step": 6606 + }, + { + "epoch": 1.918130352736246, + "grad_norm": 3.7460758686065674, + "learning_rate": 7.252524822016135e-06, + "loss": 0.7191, + "step": 6607 + }, + { + "epoch": 1.9184206706343447, + "grad_norm": 3.605059862136841, + "learning_rate": 7.251667536155915e-06, + "loss": 0.8426, + "step": 6608 + }, + { + "epoch": 1.918710988532443, + "grad_norm": 3.271284580230713, + "learning_rate": 7.250810167255089e-06, + "loss": 0.6733, + "step": 6609 + }, + { + "epoch": 1.9190013064305416, + "grad_norm": 3.9770383834838867, + "learning_rate": 7.2499527153452775e-06, + "loss": 0.9251, + "step": 6610 + }, + { + "epoch": 1.9192916243286398, + "grad_norm": 3.7332961559295654, + "learning_rate": 7.249095180458101e-06, + "loss": 0.8789, + "step": 6611 + }, + { + "epoch": 1.9195819422267384, + "grad_norm": 3.8814618587493896, + "learning_rate": 7.24823756262519e-06, + "loss": 0.8266, + "step": 6612 + }, + { + "epoch": 1.9198722601248366, + "grad_norm": 3.7765979766845703, + "learning_rate": 7.247379861878167e-06, + "loss": 0.7793, + "step": 6613 + }, + { + "epoch": 1.9201625780229352, + "grad_norm": 3.925607442855835, + "learning_rate": 7.24652207824867e-06, + "loss": 0.8038, + "step": 6614 + }, + { + "epoch": 1.9204528959210334, + "grad_norm": 3.446561098098755, + "learning_rate": 7.245664211768327e-06, + "loss": 0.7647, + "step": 6615 + }, + { + "epoch": 1.920743213819132, + "grad_norm": 4.287924289703369, + "learning_rate": 7.24480626246878e-06, + "loss": 0.7625, + "step": 6616 + }, + { + "epoch": 1.9210335317172302, + "grad_norm": 3.6572999954223633, + "learning_rate": 7.24394823038167e-06, + "loss": 0.7498, + "step": 6617 + }, + { + "epoch": 1.9213238496153289, + "grad_norm": 4.16467809677124, + "learning_rate": 7.243090115538639e-06, + "loss": 0.8243, + "step": 6618 + }, + { + "epoch": 1.921614167513427, + "grad_norm": 3.5425045490264893, + "learning_rate": 7.242231917971335e-06, + "loss": 0.7329, + "step": 6619 + }, + { + "epoch": 1.9219044854115257, + "grad_norm": 3.067556858062744, + "learning_rate": 7.241373637711407e-06, + "loss": 0.6621, + "step": 6620 + }, + { + "epoch": 1.922194803309624, + "grad_norm": 3.9090375900268555, + "learning_rate": 7.240515274790508e-06, + "loss": 0.8719, + "step": 6621 + }, + { + "epoch": 1.9224851212077225, + "grad_norm": 3.3259966373443604, + "learning_rate": 7.239656829240296e-06, + "loss": 0.7411, + "step": 6622 + }, + { + "epoch": 1.9227754391058207, + "grad_norm": 3.5183987617492676, + "learning_rate": 7.238798301092429e-06, + "loss": 0.8731, + "step": 6623 + }, + { + "epoch": 1.9230657570039194, + "grad_norm": 3.416977882385254, + "learning_rate": 7.237939690378568e-06, + "loss": 0.7071, + "step": 6624 + }, + { + "epoch": 1.9233560749020175, + 
"grad_norm": 3.410515069961548, + "learning_rate": 7.2370809971303805e-06, + "loss": 0.7712, + "step": 6625 + }, + { + "epoch": 1.9236463928001162, + "grad_norm": 3.2718276977539062, + "learning_rate": 7.236222221379532e-06, + "loss": 0.6932, + "step": 6626 + }, + { + "epoch": 1.9239367106982146, + "grad_norm": 3.1431262493133545, + "learning_rate": 7.235363363157697e-06, + "loss": 0.6635, + "step": 6627 + }, + { + "epoch": 1.924227028596313, + "grad_norm": 3.419757843017578, + "learning_rate": 7.234504422496548e-06, + "loss": 0.8539, + "step": 6628 + }, + { + "epoch": 1.9245173464944114, + "grad_norm": 3.7469241619110107, + "learning_rate": 7.233645399427762e-06, + "loss": 0.8466, + "step": 6629 + }, + { + "epoch": 1.9248076643925098, + "grad_norm": 3.4239110946655273, + "learning_rate": 7.2327862939830204e-06, + "loss": 0.7049, + "step": 6630 + }, + { + "epoch": 1.9250979822906082, + "grad_norm": 3.939842462539673, + "learning_rate": 7.231927106194007e-06, + "loss": 0.8585, + "step": 6631 + }, + { + "epoch": 1.9253883001887067, + "grad_norm": 3.831742286682129, + "learning_rate": 7.231067836092407e-06, + "loss": 0.8349, + "step": 6632 + }, + { + "epoch": 1.925678618086805, + "grad_norm": 3.6673312187194824, + "learning_rate": 7.23020848370991e-06, + "loss": 0.8182, + "step": 6633 + }, + { + "epoch": 1.9259689359849035, + "grad_norm": 3.4564390182495117, + "learning_rate": 7.229349049078211e-06, + "loss": 0.7056, + "step": 6634 + }, + { + "epoch": 1.926259253883002, + "grad_norm": 3.5960192680358887, + "learning_rate": 7.228489532229001e-06, + "loss": 0.8343, + "step": 6635 + }, + { + "epoch": 1.9265495717811003, + "grad_norm": 3.550015926361084, + "learning_rate": 7.227629933193983e-06, + "loss": 0.7848, + "step": 6636 + }, + { + "epoch": 1.9268398896791987, + "grad_norm": 3.623354196548462, + "learning_rate": 7.226770252004858e-06, + "loss": 0.831, + "step": 6637 + }, + { + "epoch": 1.9271302075772971, + "grad_norm": 3.6000454425811768, + "learning_rate": 7.225910488693328e-06, + "loss": 0.8775, + "step": 6638 + }, + { + "epoch": 1.9274205254753956, + "grad_norm": 3.9402761459350586, + "learning_rate": 7.225050643291103e-06, + "loss": 0.786, + "step": 6639 + }, + { + "epoch": 1.927710843373494, + "grad_norm": 3.940194845199585, + "learning_rate": 7.224190715829894e-06, + "loss": 0.9916, + "step": 6640 + }, + { + "epoch": 1.9280011612715924, + "grad_norm": 3.295717239379883, + "learning_rate": 7.223330706341414e-06, + "loss": 0.7205, + "step": 6641 + }, + { + "epoch": 1.9282914791696908, + "grad_norm": 3.6699130535125732, + "learning_rate": 7.22247061485738e-06, + "loss": 0.8265, + "step": 6642 + }, + { + "epoch": 1.9285817970677892, + "grad_norm": 3.288679361343384, + "learning_rate": 7.221610441409509e-06, + "loss": 0.826, + "step": 6643 + }, + { + "epoch": 1.9288721149658876, + "grad_norm": 3.609783411026001, + "learning_rate": 7.220750186029529e-06, + "loss": 0.7258, + "step": 6644 + }, + { + "epoch": 1.929162432863986, + "grad_norm": 3.97063946723938, + "learning_rate": 7.219889848749163e-06, + "loss": 0.8644, + "step": 6645 + }, + { + "epoch": 1.9294527507620844, + "grad_norm": 3.5488922595977783, + "learning_rate": 7.21902942960014e-06, + "loss": 0.8222, + "step": 6646 + }, + { + "epoch": 1.9297430686601829, + "grad_norm": 3.388948678970337, + "learning_rate": 7.2181689286141935e-06, + "loss": 0.782, + "step": 6647 + }, + { + "epoch": 1.9300333865582813, + "grad_norm": 3.740267515182495, + "learning_rate": 7.2173083458230556e-06, + "loss": 0.8105, + "step": 6648 + }, + { + 
"epoch": 1.9303237044563797, + "grad_norm": 3.400404214859009, + "learning_rate": 7.2164476812584676e-06, + "loss": 0.6943, + "step": 6649 + }, + { + "epoch": 1.930614022354478, + "grad_norm": 3.628769874572754, + "learning_rate": 7.215586934952167e-06, + "loss": 0.7671, + "step": 6650 + }, + { + "epoch": 1.9309043402525767, + "grad_norm": 3.8510613441467285, + "learning_rate": 7.2147261069359e-06, + "loss": 0.7778, + "step": 6651 + }, + { + "epoch": 1.931194658150675, + "grad_norm": 3.6918275356292725, + "learning_rate": 7.213865197241412e-06, + "loss": 0.91, + "step": 6652 + }, + { + "epoch": 1.9314849760487736, + "grad_norm": 3.9917702674865723, + "learning_rate": 7.2130042059004554e-06, + "loss": 0.7967, + "step": 6653 + }, + { + "epoch": 1.9317752939468718, + "grad_norm": 3.8993303775787354, + "learning_rate": 7.212143132944782e-06, + "loss": 0.694, + "step": 6654 + }, + { + "epoch": 1.9320656118449704, + "grad_norm": 4.138810634613037, + "learning_rate": 7.2112819784061484e-06, + "loss": 0.8451, + "step": 6655 + }, + { + "epoch": 1.9323559297430686, + "grad_norm": 3.826202392578125, + "learning_rate": 7.210420742316311e-06, + "loss": 0.7908, + "step": 6656 + }, + { + "epoch": 1.9326462476411672, + "grad_norm": 3.8772799968719482, + "learning_rate": 7.209559424707034e-06, + "loss": 0.8552, + "step": 6657 + }, + { + "epoch": 1.9329365655392654, + "grad_norm": 3.5785393714904785, + "learning_rate": 7.208698025610084e-06, + "loss": 0.7256, + "step": 6658 + }, + { + "epoch": 1.933226883437364, + "grad_norm": 3.79007887840271, + "learning_rate": 7.207836545057226e-06, + "loss": 0.8709, + "step": 6659 + }, + { + "epoch": 1.9335172013354622, + "grad_norm": 3.2313954830169678, + "learning_rate": 7.206974983080233e-06, + "loss": 0.735, + "step": 6660 + }, + { + "epoch": 1.9338075192335609, + "grad_norm": 3.2719595432281494, + "learning_rate": 7.206113339710877e-06, + "loss": 0.7153, + "step": 6661 + }, + { + "epoch": 1.934097837131659, + "grad_norm": 3.3899588584899902, + "learning_rate": 7.205251614980938e-06, + "loss": 0.776, + "step": 6662 + }, + { + "epoch": 1.9343881550297577, + "grad_norm": 3.854118824005127, + "learning_rate": 7.204389808922194e-06, + "loss": 0.8208, + "step": 6663 + }, + { + "epoch": 1.9346784729278559, + "grad_norm": 3.564875841140747, + "learning_rate": 7.203527921566429e-06, + "loss": 0.8051, + "step": 6664 + }, + { + "epoch": 1.9349687908259545, + "grad_norm": 3.328470468521118, + "learning_rate": 7.202665952945429e-06, + "loss": 0.7459, + "step": 6665 + }, + { + "epoch": 1.9352591087240527, + "grad_norm": 3.722935438156128, + "learning_rate": 7.201803903090983e-06, + "loss": 0.8839, + "step": 6666 + }, + { + "epoch": 1.9355494266221513, + "grad_norm": 3.237356662750244, + "learning_rate": 7.20094177203488e-06, + "loss": 0.7183, + "step": 6667 + }, + { + "epoch": 1.9358397445202495, + "grad_norm": 3.388763666152954, + "learning_rate": 7.2000795598089215e-06, + "loss": 0.7665, + "step": 6668 + }, + { + "epoch": 1.9361300624183482, + "grad_norm": 3.321420907974243, + "learning_rate": 7.1992172664449e-06, + "loss": 0.75, + "step": 6669 + }, + { + "epoch": 1.9364203803164464, + "grad_norm": 3.4232888221740723, + "learning_rate": 7.1983548919746185e-06, + "loss": 0.7208, + "step": 6670 + }, + { + "epoch": 1.936710698214545, + "grad_norm": 3.926154375076294, + "learning_rate": 7.1974924364298804e-06, + "loss": 0.9375, + "step": 6671 + }, + { + "epoch": 1.9370010161126432, + "grad_norm": 3.0157711505889893, + "learning_rate": 7.196629899842495e-06, + "loss": 0.6688, + 
"step": 6672 + }, + { + "epoch": 1.9372913340107418, + "grad_norm": 3.3146955966949463, + "learning_rate": 7.19576728224427e-06, + "loss": 0.7946, + "step": 6673 + }, + { + "epoch": 1.93758165190884, + "grad_norm": 3.8927111625671387, + "learning_rate": 7.1949045836670195e-06, + "loss": 0.8109, + "step": 6674 + }, + { + "epoch": 1.9378719698069387, + "grad_norm": 3.6933443546295166, + "learning_rate": 7.194041804142556e-06, + "loss": 0.8922, + "step": 6675 + }, + { + "epoch": 1.938162287705037, + "grad_norm": 3.167834758758545, + "learning_rate": 7.193178943702706e-06, + "loss": 0.6685, + "step": 6676 + }, + { + "epoch": 1.9384526056031355, + "grad_norm": 3.4566543102264404, + "learning_rate": 7.192316002379283e-06, + "loss": 0.8034, + "step": 6677 + }, + { + "epoch": 1.938742923501234, + "grad_norm": 3.62845778465271, + "learning_rate": 7.191452980204119e-06, + "loss": 0.7736, + "step": 6678 + }, + { + "epoch": 1.9390332413993323, + "grad_norm": 3.444850206375122, + "learning_rate": 7.190589877209036e-06, + "loss": 0.8366, + "step": 6679 + }, + { + "epoch": 1.9393235592974307, + "grad_norm": 3.749293565750122, + "learning_rate": 7.189726693425869e-06, + "loss": 0.8621, + "step": 6680 + }, + { + "epoch": 1.9396138771955291, + "grad_norm": 3.366168737411499, + "learning_rate": 7.18886342888645e-06, + "loss": 0.7352, + "step": 6681 + }, + { + "epoch": 1.9399041950936275, + "grad_norm": 3.095916509628296, + "learning_rate": 7.1880000836226175e-06, + "loss": 0.7065, + "step": 6682 + }, + { + "epoch": 1.940194512991726, + "grad_norm": 3.3080127239227295, + "learning_rate": 7.187136657666208e-06, + "loss": 0.7082, + "step": 6683 + }, + { + "epoch": 1.9404848308898244, + "grad_norm": 3.6062538623809814, + "learning_rate": 7.186273151049068e-06, + "loss": 0.8088, + "step": 6684 + }, + { + "epoch": 1.9407751487879228, + "grad_norm": 3.4961612224578857, + "learning_rate": 7.185409563803042e-06, + "loss": 0.7568, + "step": 6685 + }, + { + "epoch": 1.9410654666860212, + "grad_norm": 3.556150436401367, + "learning_rate": 7.184545895959978e-06, + "loss": 0.8106, + "step": 6686 + }, + { + "epoch": 1.9413557845841196, + "grad_norm": 3.6129298210144043, + "learning_rate": 7.183682147551729e-06, + "loss": 0.8475, + "step": 6687 + }, + { + "epoch": 1.941646102482218, + "grad_norm": 3.856016159057617, + "learning_rate": 7.182818318610148e-06, + "loss": 0.7978, + "step": 6688 + }, + { + "epoch": 1.9419364203803164, + "grad_norm": 3.685530424118042, + "learning_rate": 7.1819544091670935e-06, + "loss": 0.7201, + "step": 6689 + }, + { + "epoch": 1.9422267382784149, + "grad_norm": 3.1159701347351074, + "learning_rate": 7.1810904192544265e-06, + "loss": 0.713, + "step": 6690 + }, + { + "epoch": 1.9425170561765133, + "grad_norm": 3.259775161743164, + "learning_rate": 7.180226348904012e-06, + "loss": 0.803, + "step": 6691 + }, + { + "epoch": 1.9428073740746117, + "grad_norm": 3.3046481609344482, + "learning_rate": 7.179362198147712e-06, + "loss": 0.7637, + "step": 6692 + }, + { + "epoch": 1.94309769197271, + "grad_norm": 3.818223476409912, + "learning_rate": 7.178497967017401e-06, + "loss": 0.7639, + "step": 6693 + }, + { + "epoch": 1.9433880098708085, + "grad_norm": 3.595642328262329, + "learning_rate": 7.177633655544949e-06, + "loss": 0.7035, + "step": 6694 + }, + { + "epoch": 1.943678327768907, + "grad_norm": 3.9954261779785156, + "learning_rate": 7.1767692637622336e-06, + "loss": 0.9131, + "step": 6695 + }, + { + "epoch": 1.9439686456670053, + "grad_norm": 3.853381872177124, + "learning_rate": 
7.175904791701129e-06, + "loss": 0.7387, + "step": 6696 + }, + { + "epoch": 1.9442589635651037, + "grad_norm": 3.729569911956787, + "learning_rate": 7.17504023939352e-06, + "loss": 0.8771, + "step": 6697 + }, + { + "epoch": 1.9445492814632022, + "grad_norm": 3.354240894317627, + "learning_rate": 7.174175606871291e-06, + "loss": 0.7374, + "step": 6698 + }, + { + "epoch": 1.9448395993613006, + "grad_norm": 3.4094414710998535, + "learning_rate": 7.173310894166328e-06, + "loss": 0.7392, + "step": 6699 + }, + { + "epoch": 1.945129917259399, + "grad_norm": 3.737236976623535, + "learning_rate": 7.172446101310521e-06, + "loss": 0.8732, + "step": 6700 + }, + { + "epoch": 1.9454202351574974, + "grad_norm": 3.317800521850586, + "learning_rate": 7.171581228335764e-06, + "loss": 0.6949, + "step": 6701 + }, + { + "epoch": 1.945710553055596, + "grad_norm": 3.552617311477661, + "learning_rate": 7.170716275273954e-06, + "loss": 0.7557, + "step": 6702 + }, + { + "epoch": 1.9460008709536942, + "grad_norm": 3.36234712600708, + "learning_rate": 7.169851242156988e-06, + "loss": 0.7444, + "step": 6703 + }, + { + "epoch": 1.9462911888517929, + "grad_norm": 3.4189670085906982, + "learning_rate": 7.168986129016771e-06, + "loss": 0.7771, + "step": 6704 + }, + { + "epoch": 1.946581506749891, + "grad_norm": 3.6109812259674072, + "learning_rate": 7.168120935885203e-06, + "loss": 0.6837, + "step": 6705 + }, + { + "epoch": 1.9468718246479897, + "grad_norm": 4.015439510345459, + "learning_rate": 7.1672556627941995e-06, + "loss": 0.7297, + "step": 6706 + }, + { + "epoch": 1.9471621425460879, + "grad_norm": 3.6183969974517822, + "learning_rate": 7.166390309775664e-06, + "loss": 0.7379, + "step": 6707 + }, + { + "epoch": 1.9474524604441865, + "grad_norm": 3.7580604553222656, + "learning_rate": 7.165524876861515e-06, + "loss": 0.7974, + "step": 6708 + }, + { + "epoch": 1.9477427783422847, + "grad_norm": 2.972172975540161, + "learning_rate": 7.164659364083667e-06, + "loss": 0.5959, + "step": 6709 + }, + { + "epoch": 1.9480330962403833, + "grad_norm": 3.764477252960205, + "learning_rate": 7.1637937714740414e-06, + "loss": 0.8297, + "step": 6710 + }, + { + "epoch": 1.9483234141384815, + "grad_norm": 3.675285816192627, + "learning_rate": 7.162928099064559e-06, + "loss": 0.7536, + "step": 6711 + }, + { + "epoch": 1.9486137320365802, + "grad_norm": 3.6830227375030518, + "learning_rate": 7.1620623468871484e-06, + "loss": 0.6829, + "step": 6712 + }, + { + "epoch": 1.9489040499346784, + "grad_norm": 3.798758029937744, + "learning_rate": 7.161196514973735e-06, + "loss": 0.7489, + "step": 6713 + }, + { + "epoch": 1.949194367832777, + "grad_norm": 3.6859729290008545, + "learning_rate": 7.160330603356254e-06, + "loss": 0.7206, + "step": 6714 + }, + { + "epoch": 1.9494846857308752, + "grad_norm": 4.245936393737793, + "learning_rate": 7.159464612066636e-06, + "loss": 0.922, + "step": 6715 + }, + { + "epoch": 1.9497750036289738, + "grad_norm": 3.321417808532715, + "learning_rate": 7.158598541136819e-06, + "loss": 0.7266, + "step": 6716 + }, + { + "epoch": 1.950065321527072, + "grad_norm": 3.910299301147461, + "learning_rate": 7.1577323905987465e-06, + "loss": 0.8134, + "step": 6717 + }, + { + "epoch": 1.9503556394251707, + "grad_norm": 3.536652088165283, + "learning_rate": 7.156866160484358e-06, + "loss": 0.7933, + "step": 6718 + }, + { + "epoch": 1.9506459573232688, + "grad_norm": 3.7989182472229004, + "learning_rate": 7.155999850825604e-06, + "loss": 0.7904, + "step": 6719 + }, + { + "epoch": 1.9509362752213675, + "grad_norm": 
3.8353662490844727, + "learning_rate": 7.155133461654429e-06, + "loss": 0.8632, + "step": 6720 + }, + { + "epoch": 1.9512265931194657, + "grad_norm": 3.9606752395629883, + "learning_rate": 7.154266993002786e-06, + "loss": 0.8703, + "step": 6721 + }, + { + "epoch": 1.9515169110175643, + "grad_norm": 3.5751256942749023, + "learning_rate": 7.1534004449026325e-06, + "loss": 0.6907, + "step": 6722 + }, + { + "epoch": 1.9518072289156625, + "grad_norm": 3.333437442779541, + "learning_rate": 7.152533817385927e-06, + "loss": 0.698, + "step": 6723 + }, + { + "epoch": 1.9520975468137611, + "grad_norm": 3.3084776401519775, + "learning_rate": 7.151667110484626e-06, + "loss": 0.8091, + "step": 6724 + }, + { + "epoch": 1.9523878647118593, + "grad_norm": 3.4940638542175293, + "learning_rate": 7.150800324230696e-06, + "loss": 0.8429, + "step": 6725 + }, + { + "epoch": 1.952678182609958, + "grad_norm": 3.774066209793091, + "learning_rate": 7.149933458656104e-06, + "loss": 0.8869, + "step": 6726 + }, + { + "epoch": 1.9529685005080564, + "grad_norm": 3.1335461139678955, + "learning_rate": 7.149066513792821e-06, + "loss": 0.7892, + "step": 6727 + }, + { + "epoch": 1.9532588184061548, + "grad_norm": 3.268209934234619, + "learning_rate": 7.148199489672816e-06, + "loss": 0.7628, + "step": 6728 + }, + { + "epoch": 1.9535491363042532, + "grad_norm": 3.1325523853302, + "learning_rate": 7.1473323863280666e-06, + "loss": 0.6749, + "step": 6729 + }, + { + "epoch": 1.9538394542023516, + "grad_norm": 4.100725173950195, + "learning_rate": 7.146465203790549e-06, + "loss": 0.8469, + "step": 6730 + }, + { + "epoch": 1.95412977210045, + "grad_norm": 3.513888359069824, + "learning_rate": 7.14559794209225e-06, + "loss": 0.754, + "step": 6731 + }, + { + "epoch": 1.9544200899985484, + "grad_norm": 3.4546597003936768, + "learning_rate": 7.144730601265148e-06, + "loss": 0.6669, + "step": 6732 + }, + { + "epoch": 1.9547104078966469, + "grad_norm": 3.6429920196533203, + "learning_rate": 7.143863181341234e-06, + "loss": 0.7706, + "step": 6733 + }, + { + "epoch": 1.9550007257947453, + "grad_norm": 3.8409531116485596, + "learning_rate": 7.1429956823524956e-06, + "loss": 0.8653, + "step": 6734 + }, + { + "epoch": 1.9552910436928437, + "grad_norm": 3.6878139972686768, + "learning_rate": 7.1421281043309265e-06, + "loss": 0.9262, + "step": 6735 + }, + { + "epoch": 1.955581361590942, + "grad_norm": 3.273050546646118, + "learning_rate": 7.141260447308525e-06, + "loss": 0.7529, + "step": 6736 + }, + { + "epoch": 1.9558716794890405, + "grad_norm": 3.7971384525299072, + "learning_rate": 7.140392711317286e-06, + "loss": 0.8767, + "step": 6737 + }, + { + "epoch": 1.956161997387139, + "grad_norm": 3.7323203086853027, + "learning_rate": 7.139524896389214e-06, + "loss": 0.7121, + "step": 6738 + }, + { + "epoch": 1.9564523152852373, + "grad_norm": 3.6993770599365234, + "learning_rate": 7.138657002556311e-06, + "loss": 0.7983, + "step": 6739 + }, + { + "epoch": 1.9567426331833357, + "grad_norm": 3.204155206680298, + "learning_rate": 7.13778902985059e-06, + "loss": 0.7088, + "step": 6740 + }, + { + "epoch": 1.9570329510814342, + "grad_norm": 3.982203483581543, + "learning_rate": 7.136920978304056e-06, + "loss": 0.9362, + "step": 6741 + }, + { + "epoch": 1.9573232689795326, + "grad_norm": 3.766463279724121, + "learning_rate": 7.136052847948724e-06, + "loss": 0.7985, + "step": 6742 + }, + { + "epoch": 1.957613586877631, + "grad_norm": 3.5600168704986572, + "learning_rate": 7.13518463881661e-06, + "loss": 0.7356, + "step": 6743 + }, + { + "epoch": 
1.9579039047757294, + "grad_norm": 3.440335750579834, + "learning_rate": 7.134316350939736e-06, + "loss": 0.7941, + "step": 6744 + }, + { + "epoch": 1.9581942226738278, + "grad_norm": 3.4422836303710938, + "learning_rate": 7.13344798435012e-06, + "loss": 0.7227, + "step": 6745 + }, + { + "epoch": 1.9584845405719262, + "grad_norm": 3.442683219909668, + "learning_rate": 7.13257953907979e-06, + "loss": 0.7831, + "step": 6746 + }, + { + "epoch": 1.9587748584700246, + "grad_norm": 3.355893611907959, + "learning_rate": 7.1317110151607724e-06, + "loss": 0.7271, + "step": 6747 + }, + { + "epoch": 1.959065176368123, + "grad_norm": 3.4449734687805176, + "learning_rate": 7.130842412625099e-06, + "loss": 0.7658, + "step": 6748 + }, + { + "epoch": 1.9593554942662215, + "grad_norm": 4.014678478240967, + "learning_rate": 7.129973731504802e-06, + "loss": 0.8837, + "step": 6749 + }, + { + "epoch": 1.9596458121643199, + "grad_norm": 3.140547513961792, + "learning_rate": 7.1291049718319214e-06, + "loss": 0.7722, + "step": 6750 + }, + { + "epoch": 1.9599361300624185, + "grad_norm": 3.45985746383667, + "learning_rate": 7.128236133638492e-06, + "loss": 0.8081, + "step": 6751 + }, + { + "epoch": 1.9602264479605167, + "grad_norm": 3.613837718963623, + "learning_rate": 7.127367216956559e-06, + "loss": 0.8547, + "step": 6752 + }, + { + "epoch": 1.9605167658586153, + "grad_norm": 3.648763418197632, + "learning_rate": 7.126498221818167e-06, + "loss": 0.8113, + "step": 6753 + }, + { + "epoch": 1.9608070837567135, + "grad_norm": 2.954113483428955, + "learning_rate": 7.125629148255366e-06, + "loss": 0.6359, + "step": 6754 + }, + { + "epoch": 1.9610974016548122, + "grad_norm": 3.690190076828003, + "learning_rate": 7.1247599963002055e-06, + "loss": 0.7334, + "step": 6755 + }, + { + "epoch": 1.9613877195529104, + "grad_norm": 4.204606533050537, + "learning_rate": 7.123890765984738e-06, + "loss": 0.885, + "step": 6756 + }, + { + "epoch": 1.961678037451009, + "grad_norm": 3.6308844089508057, + "learning_rate": 7.123021457341022e-06, + "loss": 0.8379, + "step": 6757 + }, + { + "epoch": 1.9619683553491072, + "grad_norm": 3.752915620803833, + "learning_rate": 7.1221520704011186e-06, + "loss": 0.7165, + "step": 6758 + }, + { + "epoch": 1.9622586732472058, + "grad_norm": 3.76926326751709, + "learning_rate": 7.121282605197087e-06, + "loss": 0.7306, + "step": 6759 + }, + { + "epoch": 1.962548991145304, + "grad_norm": 3.9330079555511475, + "learning_rate": 7.120413061760996e-06, + "loss": 0.8327, + "step": 6760 + }, + { + "epoch": 1.9628393090434026, + "grad_norm": 3.476900339126587, + "learning_rate": 7.119543440124913e-06, + "loss": 0.8174, + "step": 6761 + }, + { + "epoch": 1.9631296269415008, + "grad_norm": 3.7719175815582275, + "learning_rate": 7.118673740320907e-06, + "loss": 0.6952, + "step": 6762 + }, + { + "epoch": 1.9634199448395995, + "grad_norm": 3.6090521812438965, + "learning_rate": 7.117803962381057e-06, + "loss": 0.7363, + "step": 6763 + }, + { + "epoch": 1.9637102627376977, + "grad_norm": 3.7342145442962646, + "learning_rate": 7.116934106337436e-06, + "loss": 0.7811, + "step": 6764 + }, + { + "epoch": 1.9640005806357963, + "grad_norm": 3.467252731323242, + "learning_rate": 7.1160641722221255e-06, + "loss": 0.7612, + "step": 6765 + }, + { + "epoch": 1.9642908985338945, + "grad_norm": 3.8008577823638916, + "learning_rate": 7.115194160067208e-06, + "loss": 0.8841, + "step": 6766 + }, + { + "epoch": 1.9645812164319931, + "grad_norm": 3.648664951324463, + "learning_rate": 7.114324069904769e-06, + "loss": 0.7991, + 
"step": 6767 + }, + { + "epoch": 1.9648715343300913, + "grad_norm": 3.3115179538726807, + "learning_rate": 7.113453901766898e-06, + "loss": 0.7317, + "step": 6768 + }, + { + "epoch": 1.96516185222819, + "grad_norm": 4.139917373657227, + "learning_rate": 7.112583655685685e-06, + "loss": 0.8714, + "step": 6769 + }, + { + "epoch": 1.9654521701262881, + "grad_norm": 3.2607545852661133, + "learning_rate": 7.1117133316932255e-06, + "loss": 0.6839, + "step": 6770 + }, + { + "epoch": 1.9657424880243868, + "grad_norm": 3.5741126537323, + "learning_rate": 7.110842929821615e-06, + "loss": 0.795, + "step": 6771 + }, + { + "epoch": 1.966032805922485, + "grad_norm": 3.477534294128418, + "learning_rate": 7.109972450102958e-06, + "loss": 0.7614, + "step": 6772 + }, + { + "epoch": 1.9663231238205836, + "grad_norm": 3.491774797439575, + "learning_rate": 7.109101892569351e-06, + "loss": 0.6599, + "step": 6773 + }, + { + "epoch": 1.9666134417186818, + "grad_norm": 3.976912021636963, + "learning_rate": 7.108231257252906e-06, + "loss": 0.9449, + "step": 6774 + }, + { + "epoch": 1.9669037596167804, + "grad_norm": 3.2056448459625244, + "learning_rate": 7.107360544185726e-06, + "loss": 0.8332, + "step": 6775 + }, + { + "epoch": 1.9671940775148788, + "grad_norm": 4.093783855438232, + "learning_rate": 7.1064897533999275e-06, + "loss": 0.849, + "step": 6776 + }, + { + "epoch": 1.9674843954129773, + "grad_norm": 3.3977859020233154, + "learning_rate": 7.105618884927622e-06, + "loss": 0.7746, + "step": 6777 + }, + { + "epoch": 1.9677747133110757, + "grad_norm": 3.2878258228302, + "learning_rate": 7.104747938800929e-06, + "loss": 0.7264, + "step": 6778 + }, + { + "epoch": 1.968065031209174, + "grad_norm": 3.859818696975708, + "learning_rate": 7.1038769150519656e-06, + "loss": 0.8852, + "step": 6779 + }, + { + "epoch": 1.9683553491072725, + "grad_norm": 3.4528026580810547, + "learning_rate": 7.103005813712856e-06, + "loss": 0.7505, + "step": 6780 + }, + { + "epoch": 1.968645667005371, + "grad_norm": 3.6107583045959473, + "learning_rate": 7.1021346348157285e-06, + "loss": 0.7107, + "step": 6781 + }, + { + "epoch": 1.9689359849034693, + "grad_norm": 3.1933040618896484, + "learning_rate": 7.101263378392709e-06, + "loss": 0.6672, + "step": 6782 + }, + { + "epoch": 1.9692263028015677, + "grad_norm": 3.2831835746765137, + "learning_rate": 7.10039204447593e-06, + "loss": 0.7403, + "step": 6783 + }, + { + "epoch": 1.9695166206996662, + "grad_norm": 3.4789631366729736, + "learning_rate": 7.099520633097525e-06, + "loss": 0.8518, + "step": 6784 + }, + { + "epoch": 1.9698069385977646, + "grad_norm": 3.596649646759033, + "learning_rate": 7.098649144289633e-06, + "loss": 0.7417, + "step": 6785 + }, + { + "epoch": 1.970097256495863, + "grad_norm": 3.3953075408935547, + "learning_rate": 7.097777578084394e-06, + "loss": 0.7524, + "step": 6786 + }, + { + "epoch": 1.9703875743939614, + "grad_norm": 3.428148031234741, + "learning_rate": 7.09690593451395e-06, + "loss": 0.738, + "step": 6787 + }, + { + "epoch": 1.9706778922920598, + "grad_norm": 3.6509974002838135, + "learning_rate": 7.096034213610448e-06, + "loss": 0.7525, + "step": 6788 + }, + { + "epoch": 1.9709682101901582, + "grad_norm": 3.5561928749084473, + "learning_rate": 7.095162415406034e-06, + "loss": 0.8845, + "step": 6789 + }, + { + "epoch": 1.9712585280882566, + "grad_norm": 3.5671966075897217, + "learning_rate": 7.0942905399328625e-06, + "loss": 0.6514, + "step": 6790 + }, + { + "epoch": 1.971548845986355, + "grad_norm": 3.40765118598938, + "learning_rate": 
7.093418587223088e-06, + "loss": 0.7776, + "step": 6791 + }, + { + "epoch": 1.9718391638844535, + "grad_norm": 3.513580560684204, + "learning_rate": 7.092546557308866e-06, + "loss": 0.6769, + "step": 6792 + }, + { + "epoch": 1.9721294817825519, + "grad_norm": 3.5958590507507324, + "learning_rate": 7.091674450222357e-06, + "loss": 0.7664, + "step": 6793 + }, + { + "epoch": 1.9724197996806503, + "grad_norm": 3.6124091148376465, + "learning_rate": 7.090802265995723e-06, + "loss": 0.7266, + "step": 6794 + }, + { + "epoch": 1.9727101175787487, + "grad_norm": 3.596867322921753, + "learning_rate": 7.089930004661134e-06, + "loss": 0.7801, + "step": 6795 + }, + { + "epoch": 1.9730004354768471, + "grad_norm": 3.997195243835449, + "learning_rate": 7.089057666250754e-06, + "loss": 0.8244, + "step": 6796 + }, + { + "epoch": 1.9732907533749455, + "grad_norm": 3.457582712173462, + "learning_rate": 7.088185250796757e-06, + "loss": 0.7506, + "step": 6797 + }, + { + "epoch": 1.973581071273044, + "grad_norm": 3.2290596961975098, + "learning_rate": 7.087312758331318e-06, + "loss": 0.8002, + "step": 6798 + }, + { + "epoch": 1.9738713891711424, + "grad_norm": 3.566000461578369, + "learning_rate": 7.08644018888661e-06, + "loss": 0.8488, + "step": 6799 + }, + { + "epoch": 1.9741617070692408, + "grad_norm": 3.3688695430755615, + "learning_rate": 7.085567542494815e-06, + "loss": 0.6546, + "step": 6800 + }, + { + "epoch": 1.9744520249673392, + "grad_norm": 3.4332211017608643, + "learning_rate": 7.08469481918812e-06, + "loss": 0.7747, + "step": 6801 + }, + { + "epoch": 1.9747423428654378, + "grad_norm": 3.266073703765869, + "learning_rate": 7.083822018998706e-06, + "loss": 0.6387, + "step": 6802 + }, + { + "epoch": 1.975032660763536, + "grad_norm": 3.644442558288574, + "learning_rate": 7.082949141958762e-06, + "loss": 0.9104, + "step": 6803 + }, + { + "epoch": 1.9753229786616346, + "grad_norm": 3.220064878463745, + "learning_rate": 7.082076188100483e-06, + "loss": 0.709, + "step": 6804 + }, + { + "epoch": 1.9756132965597328, + "grad_norm": 3.7324562072753906, + "learning_rate": 7.081203157456058e-06, + "loss": 0.7557, + "step": 6805 + }, + { + "epoch": 1.9759036144578315, + "grad_norm": 3.2915639877319336, + "learning_rate": 7.080330050057687e-06, + "loss": 0.7483, + "step": 6806 + }, + { + "epoch": 1.9761939323559297, + "grad_norm": 3.8188564777374268, + "learning_rate": 7.079456865937568e-06, + "loss": 0.8745, + "step": 6807 + }, + { + "epoch": 1.9764842502540283, + "grad_norm": 3.867581844329834, + "learning_rate": 7.078583605127908e-06, + "loss": 0.7953, + "step": 6808 + }, + { + "epoch": 1.9767745681521265, + "grad_norm": 3.83316969871521, + "learning_rate": 7.077710267660908e-06, + "loss": 0.8975, + "step": 6809 + }, + { + "epoch": 1.9770648860502251, + "grad_norm": 3.6134462356567383, + "learning_rate": 7.076836853568778e-06, + "loss": 0.8214, + "step": 6810 + }, + { + "epoch": 1.9773552039483233, + "grad_norm": 3.6381266117095947, + "learning_rate": 7.0759633628837285e-06, + "loss": 0.6846, + "step": 6811 + }, + { + "epoch": 1.977645521846422, + "grad_norm": 3.7517611980438232, + "learning_rate": 7.075089795637974e-06, + "loss": 0.7253, + "step": 6812 + }, + { + "epoch": 1.9779358397445201, + "grad_norm": 3.577470302581787, + "learning_rate": 7.074216151863731e-06, + "loss": 0.7477, + "step": 6813 + }, + { + "epoch": 1.9782261576426188, + "grad_norm": 3.7703053951263428, + "learning_rate": 7.0733424315932195e-06, + "loss": 0.7689, + "step": 6814 + }, + { + "epoch": 1.978516475540717, + "grad_norm": 
3.7044544219970703, + "learning_rate": 7.072468634858663e-06, + "loss": 0.886, + "step": 6815 + }, + { + "epoch": 1.9788067934388156, + "grad_norm": 3.4169695377349854, + "learning_rate": 7.071594761692284e-06, + "loss": 0.7732, + "step": 6816 + }, + { + "epoch": 1.9790971113369138, + "grad_norm": 3.8502378463745117, + "learning_rate": 7.070720812126315e-06, + "loss": 0.8438, + "step": 6817 + }, + { + "epoch": 1.9793874292350124, + "grad_norm": 3.873922348022461, + "learning_rate": 7.069846786192982e-06, + "loss": 0.8482, + "step": 6818 + }, + { + "epoch": 1.9796777471331106, + "grad_norm": 3.5439321994781494, + "learning_rate": 7.068972683924522e-06, + "loss": 0.7929, + "step": 6819 + }, + { + "epoch": 1.9799680650312093, + "grad_norm": 3.0595645904541016, + "learning_rate": 7.068098505353169e-06, + "loss": 0.6958, + "step": 6820 + }, + { + "epoch": 1.9802583829293074, + "grad_norm": 3.681124210357666, + "learning_rate": 7.0672242505111644e-06, + "loss": 0.7487, + "step": 6821 + }, + { + "epoch": 1.980548700827406, + "grad_norm": 3.742825508117676, + "learning_rate": 7.066349919430751e-06, + "loss": 0.7309, + "step": 6822 + }, + { + "epoch": 1.9808390187255043, + "grad_norm": 3.4205269813537598, + "learning_rate": 7.065475512144172e-06, + "loss": 0.7474, + "step": 6823 + }, + { + "epoch": 1.981129336623603, + "grad_norm": 3.3415684700012207, + "learning_rate": 7.064601028683675e-06, + "loss": 0.6876, + "step": 6824 + }, + { + "epoch": 1.981419654521701, + "grad_norm": 3.7281363010406494, + "learning_rate": 7.063726469081511e-06, + "loss": 0.8471, + "step": 6825 + }, + { + "epoch": 1.9817099724197997, + "grad_norm": 3.569338798522949, + "learning_rate": 7.062851833369935e-06, + "loss": 0.8374, + "step": 6826 + }, + { + "epoch": 1.9820002903178982, + "grad_norm": 3.6577813625335693, + "learning_rate": 7.061977121581202e-06, + "loss": 0.7678, + "step": 6827 + }, + { + "epoch": 1.9822906082159966, + "grad_norm": 3.6792924404144287, + "learning_rate": 7.06110233374757e-06, + "loss": 0.7505, + "step": 6828 + }, + { + "epoch": 1.982580926114095, + "grad_norm": 3.227928400039673, + "learning_rate": 7.060227469901304e-06, + "loss": 0.7637, + "step": 6829 + }, + { + "epoch": 1.9828712440121934, + "grad_norm": 3.342305898666382, + "learning_rate": 7.0593525300746635e-06, + "loss": 0.6598, + "step": 6830 + }, + { + "epoch": 1.9831615619102918, + "grad_norm": 3.869431734085083, + "learning_rate": 7.058477514299921e-06, + "loss": 0.7006, + "step": 6831 + }, + { + "epoch": 1.9834518798083902, + "grad_norm": 3.2897863388061523, + "learning_rate": 7.057602422609343e-06, + "loss": 0.821, + "step": 6832 + }, + { + "epoch": 1.9837421977064886, + "grad_norm": 3.5811805725097656, + "learning_rate": 7.056727255035206e-06, + "loss": 0.793, + "step": 6833 + }, + { + "epoch": 1.984032515604587, + "grad_norm": 4.032071113586426, + "learning_rate": 7.0558520116097826e-06, + "loss": 0.8207, + "step": 6834 + }, + { + "epoch": 1.9843228335026855, + "grad_norm": 4.270670413970947, + "learning_rate": 7.054976692365354e-06, + "loss": 0.9153, + "step": 6835 + }, + { + "epoch": 1.9846131514007839, + "grad_norm": 3.341407537460327, + "learning_rate": 7.0541012973342e-06, + "loss": 0.7869, + "step": 6836 + }, + { + "epoch": 1.9849034692988823, + "grad_norm": 3.6755237579345703, + "learning_rate": 7.053225826548605e-06, + "loss": 0.8061, + "step": 6837 + }, + { + "epoch": 1.9851937871969807, + "grad_norm": 3.738955497741699, + "learning_rate": 7.052350280040858e-06, + "loss": 0.7908, + "step": 6838 + }, + { + "epoch": 
1.985484105095079, + "grad_norm": 3.7172625064849854, + "learning_rate": 7.051474657843245e-06, + "loss": 0.812, + "step": 6839 + }, + { + "epoch": 1.9857744229931775, + "grad_norm": 3.776444435119629, + "learning_rate": 7.050598959988062e-06, + "loss": 0.9028, + "step": 6840 + }, + { + "epoch": 1.986064740891276, + "grad_norm": 3.6935839653015137, + "learning_rate": 7.049723186507602e-06, + "loss": 0.8667, + "step": 6841 + }, + { + "epoch": 1.9863550587893744, + "grad_norm": 3.6881377696990967, + "learning_rate": 7.048847337434166e-06, + "loss": 0.8647, + "step": 6842 + }, + { + "epoch": 1.9866453766874728, + "grad_norm": 3.4528255462646484, + "learning_rate": 7.047971412800051e-06, + "loss": 0.775, + "step": 6843 + }, + { + "epoch": 1.9869356945855712, + "grad_norm": 3.9001612663269043, + "learning_rate": 7.047095412637563e-06, + "loss": 0.8675, + "step": 6844 + }, + { + "epoch": 1.9872260124836696, + "grad_norm": 3.6792030334472656, + "learning_rate": 7.04621933697901e-06, + "loss": 0.7322, + "step": 6845 + }, + { + "epoch": 1.987516330381768, + "grad_norm": 3.6226887702941895, + "learning_rate": 7.045343185856701e-06, + "loss": 0.7921, + "step": 6846 + }, + { + "epoch": 1.9878066482798664, + "grad_norm": 3.9914066791534424, + "learning_rate": 7.044466959302945e-06, + "loss": 0.8576, + "step": 6847 + }, + { + "epoch": 1.9880969661779648, + "grad_norm": 3.397376537322998, + "learning_rate": 7.043590657350059e-06, + "loss": 0.6744, + "step": 6848 + }, + { + "epoch": 1.9883872840760632, + "grad_norm": 3.3360671997070312, + "learning_rate": 7.042714280030361e-06, + "loss": 0.6491, + "step": 6849 + }, + { + "epoch": 1.9886776019741617, + "grad_norm": 3.4122045040130615, + "learning_rate": 7.041837827376171e-06, + "loss": 0.8094, + "step": 6850 + }, + { + "epoch": 1.98896791987226, + "grad_norm": 3.3993654251098633, + "learning_rate": 7.040961299419812e-06, + "loss": 0.7477, + "step": 6851 + }, + { + "epoch": 1.9892582377703585, + "grad_norm": 3.57908296585083, + "learning_rate": 7.040084696193611e-06, + "loss": 0.8479, + "step": 6852 + }, + { + "epoch": 1.9895485556684571, + "grad_norm": 3.6195056438446045, + "learning_rate": 7.039208017729895e-06, + "loss": 0.6888, + "step": 6853 + }, + { + "epoch": 1.9898388735665553, + "grad_norm": 3.9272801876068115, + "learning_rate": 7.038331264060996e-06, + "loss": 0.7325, + "step": 6854 + }, + { + "epoch": 1.990129191464654, + "grad_norm": 4.077366352081299, + "learning_rate": 7.037454435219251e-06, + "loss": 0.7975, + "step": 6855 + }, + { + "epoch": 1.9904195093627521, + "grad_norm": 3.617011547088623, + "learning_rate": 7.0365775312369935e-06, + "loss": 0.7656, + "step": 6856 + }, + { + "epoch": 1.9907098272608508, + "grad_norm": 3.4023525714874268, + "learning_rate": 7.0357005521465635e-06, + "loss": 0.6409, + "step": 6857 + }, + { + "epoch": 1.991000145158949, + "grad_norm": 3.8578407764434814, + "learning_rate": 7.034823497980307e-06, + "loss": 0.9175, + "step": 6858 + }, + { + "epoch": 1.9912904630570476, + "grad_norm": 4.258701801300049, + "learning_rate": 7.033946368770568e-06, + "loss": 0.7781, + "step": 6859 + }, + { + "epoch": 1.9915807809551458, + "grad_norm": 3.1676242351531982, + "learning_rate": 7.033069164549692e-06, + "loss": 0.6299, + "step": 6860 + }, + { + "epoch": 1.9918710988532444, + "grad_norm": 3.4303393363952637, + "learning_rate": 7.032191885350034e-06, + "loss": 0.7877, + "step": 6861 + }, + { + "epoch": 1.9921614167513426, + "grad_norm": 3.757079601287842, + "learning_rate": 7.031314531203943e-06, + "loss": 0.8279, + 
"step": 6862 + }, + { + "epoch": 1.9924517346494413, + "grad_norm": 3.5876965522766113, + "learning_rate": 7.030437102143781e-06, + "loss": 0.7769, + "step": 6863 + }, + { + "epoch": 1.9927420525475394, + "grad_norm": 3.210477352142334, + "learning_rate": 7.029559598201903e-06, + "loss": 0.7287, + "step": 6864 + }, + { + "epoch": 1.993032370445638, + "grad_norm": 3.55476713180542, + "learning_rate": 7.028682019410673e-06, + "loss": 0.7846, + "step": 6865 + }, + { + "epoch": 1.9933226883437363, + "grad_norm": 3.201202630996704, + "learning_rate": 7.027804365802454e-06, + "loss": 0.6625, + "step": 6866 + }, + { + "epoch": 1.993613006241835, + "grad_norm": 3.8153321743011475, + "learning_rate": 7.026926637409615e-06, + "loss": 0.8795, + "step": 6867 + }, + { + "epoch": 1.993903324139933, + "grad_norm": 3.239248275756836, + "learning_rate": 7.0260488342645284e-06, + "loss": 0.7628, + "step": 6868 + }, + { + "epoch": 1.9941936420380317, + "grad_norm": 3.5351696014404297, + "learning_rate": 7.0251709563995626e-06, + "loss": 0.7015, + "step": 6869 + }, + { + "epoch": 1.99448395993613, + "grad_norm": 3.6981968879699707, + "learning_rate": 7.024293003847096e-06, + "loss": 0.9076, + "step": 6870 + }, + { + "epoch": 1.9947742778342286, + "grad_norm": 3.6667771339416504, + "learning_rate": 7.023414976639505e-06, + "loss": 0.7591, + "step": 6871 + }, + { + "epoch": 1.9950645957323268, + "grad_norm": 3.423527956008911, + "learning_rate": 7.022536874809176e-06, + "loss": 0.7876, + "step": 6872 + }, + { + "epoch": 1.9953549136304254, + "grad_norm": 3.921292304992676, + "learning_rate": 7.021658698388487e-06, + "loss": 0.8568, + "step": 6873 + }, + { + "epoch": 1.9956452315285236, + "grad_norm": 3.7030389308929443, + "learning_rate": 7.02078044740983e-06, + "loss": 0.7251, + "step": 6874 + }, + { + "epoch": 1.9959355494266222, + "grad_norm": 3.2520456314086914, + "learning_rate": 7.019902121905588e-06, + "loss": 0.7709, + "step": 6875 + }, + { + "epoch": 1.9962258673247204, + "grad_norm": 3.617114543914795, + "learning_rate": 7.019023721908162e-06, + "loss": 0.7272, + "step": 6876 + }, + { + "epoch": 1.996516185222819, + "grad_norm": 3.458791732788086, + "learning_rate": 7.018145247449939e-06, + "loss": 0.7036, + "step": 6877 + }, + { + "epoch": 1.9968065031209175, + "grad_norm": 3.542085886001587, + "learning_rate": 7.017266698563322e-06, + "loss": 0.7234, + "step": 6878 + }, + { + "epoch": 1.9970968210190159, + "grad_norm": 3.0014126300811768, + "learning_rate": 7.016388075280709e-06, + "loss": 0.7739, + "step": 6879 + }, + { + "epoch": 1.9973871389171143, + "grad_norm": 3.6589744091033936, + "learning_rate": 7.015509377634504e-06, + "loss": 0.8309, + "step": 6880 + }, + { + "epoch": 1.9976774568152127, + "grad_norm": 3.2439112663269043, + "learning_rate": 7.014630605657113e-06, + "loss": 0.7759, + "step": 6881 + }, + { + "epoch": 1.997967774713311, + "grad_norm": 3.3802876472473145, + "learning_rate": 7.013751759380944e-06, + "loss": 0.6549, + "step": 6882 + }, + { + "epoch": 1.9982580926114095, + "grad_norm": 3.9446818828582764, + "learning_rate": 7.01287283883841e-06, + "loss": 0.8399, + "step": 6883 + }, + { + "epoch": 1.998548410509508, + "grad_norm": 3.197936773300171, + "learning_rate": 7.011993844061925e-06, + "loss": 0.7113, + "step": 6884 + }, + { + "epoch": 1.9988387284076063, + "grad_norm": 3.189903974533081, + "learning_rate": 7.011114775083905e-06, + "loss": 0.8267, + "step": 6885 + }, + { + "epoch": 1.9991290463057048, + "grad_norm": 3.5381767749786377, + "learning_rate": 
7.010235631936771e-06, + "loss": 0.835, + "step": 6886 + }, + { + "epoch": 1.9994193642038032, + "grad_norm": 3.2049825191497803, + "learning_rate": 7.009356414652944e-06, + "loss": 0.7166, + "step": 6887 + }, + { + "epoch": 1.9997096821019016, + "grad_norm": 3.489812135696411, + "learning_rate": 7.008477123264849e-06, + "loss": 0.8515, + "step": 6888 + }, + { + "epoch": 2.0, + "grad_norm": 3.69612717628479, + "learning_rate": 7.007597757804914e-06, + "loss": 0.785, + "step": 6889 + } + ], + "logging_steps": 1.0, + "max_steps": 17220, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2241570820120904e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}