{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 17429, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.7375638303976134e-05, "grad_norm": 12.767591137701938, "learning_rate": 1.1474469305794607e-08, "loss": 1.3368, "step": 1 }, { "epoch": 0.00028687819151988064, "grad_norm": 12.76410262323862, "learning_rate": 5.7372346528973045e-08, "loss": 1.0792, "step": 5 }, { "epoch": 0.0005737563830397613, "grad_norm": 10.665986967858837, "learning_rate": 1.1474469305794609e-07, "loss": 1.1061, "step": 10 }, { "epoch": 0.0008606345745596419, "grad_norm": 12.094284503174352, "learning_rate": 1.721170395869191e-07, "loss": 1.1508, "step": 15 }, { "epoch": 0.0011475127660795226, "grad_norm": 9.850745427352663, "learning_rate": 2.2948938611589218e-07, "loss": 1.1598, "step": 20 }, { "epoch": 0.0014343909575994032, "grad_norm": 8.394092973406948, "learning_rate": 2.868617326448652e-07, "loss": 1.068, "step": 25 }, { "epoch": 0.0017212691491192839, "grad_norm": 8.266687784005606, "learning_rate": 3.442340791738382e-07, "loss": 0.9866, "step": 30 }, { "epoch": 0.0020081473406391645, "grad_norm": 7.558975739052505, "learning_rate": 4.0160642570281125e-07, "loss": 1.0208, "step": 35 }, { "epoch": 0.002295025532159045, "grad_norm": 7.396202141242944, "learning_rate": 4.5897877223178436e-07, "loss": 1.0417, "step": 40 }, { "epoch": 0.002581903723678926, "grad_norm": 6.895272778801837, "learning_rate": 5.163511187607573e-07, "loss": 0.9907, "step": 45 }, { "epoch": 0.0028687819151988064, "grad_norm": 7.038386369771074, "learning_rate": 5.737234652897304e-07, "loss": 1.0359, "step": 50 }, { "epoch": 0.003155660106718687, "grad_norm": 7.3044903638067495, "learning_rate": 6.310958118187034e-07, "loss": 1.0211, "step": 55 }, { "epoch": 0.0034425382982385677, "grad_norm": 7.573951761572998, "learning_rate": 6.884681583476764e-07, "loss": 1.0024, "step": 60 }, { "epoch": 0.0037294164897584484, "grad_norm": 6.7619192385151825, "learning_rate": 7.458405048766495e-07, "loss": 1.044, "step": 65 }, { "epoch": 0.004016294681278329, "grad_norm": 6.560783856589264, "learning_rate": 8.032128514056225e-07, "loss": 1.0796, "step": 70 }, { "epoch": 0.00430317287279821, "grad_norm": 7.128663899738009, "learning_rate": 8.605851979345956e-07, "loss": 1.0079, "step": 75 }, { "epoch": 0.00459005106431809, "grad_norm": 7.063384462831227, "learning_rate": 9.179575444635687e-07, "loss": 1.0543, "step": 80 }, { "epoch": 0.004876929255837971, "grad_norm": 6.6731784269380015, "learning_rate": 9.753298909925418e-07, "loss": 1.0156, "step": 85 }, { "epoch": 0.005163807447357852, "grad_norm": 6.318581391452399, "learning_rate": 1.0327022375215146e-06, "loss": 0.9685, "step": 90 }, { "epoch": 0.005450685638877732, "grad_norm": 6.453200309004313, "learning_rate": 1.0900745840504877e-06, "loss": 1.0376, "step": 95 }, { "epoch": 0.005737563830397613, "grad_norm": 7.20464597690153, "learning_rate": 1.1474469305794607e-06, "loss": 1.0073, "step": 100 }, { "epoch": 0.0060244420219174935, "grad_norm": 7.829606969862679, "learning_rate": 1.2048192771084338e-06, "loss": 1.0972, "step": 105 }, { "epoch": 0.006311320213437374, "grad_norm": 14.021951345445771, "learning_rate": 1.2621916236374069e-06, "loss": 0.9553, "step": 110 }, { "epoch": 0.006598198404957255, "grad_norm": 5.567300856615818, "learning_rate": 1.31956397016638e-06, "loss": 0.9265, "step": 115 }, { "epoch": 0.0068850765964771354, "grad_norm": 6.390911933022975, "learning_rate": 1.3769363166953528e-06, "loss": 0.9278, "step": 120 }, { "epoch": 0.007171954787997016, "grad_norm": 6.534271866696713, "learning_rate": 1.434308663224326e-06, "loss": 0.9962, "step": 125 }, { "epoch": 0.007458832979516897, "grad_norm": 6.927472604542325, "learning_rate": 1.491681009753299e-06, "loss": 0.973, "step": 130 }, { "epoch": 0.007745711171036777, "grad_norm": 6.188620421298334, "learning_rate": 1.5490533562822722e-06, "loss": 0.9512, "step": 135 }, { "epoch": 0.008032589362556658, "grad_norm": 7.48846483520928, "learning_rate": 1.606425702811245e-06, "loss": 0.9511, "step": 140 }, { "epoch": 0.008319467554076539, "grad_norm": 6.813291706800879, "learning_rate": 1.663798049340218e-06, "loss": 0.9536, "step": 145 }, { "epoch": 0.00860634574559642, "grad_norm": 7.91034766772547, "learning_rate": 1.7211703958691911e-06, "loss": 0.9536, "step": 150 }, { "epoch": 0.0088932239371163, "grad_norm": 6.197564487929476, "learning_rate": 1.7785427423981642e-06, "loss": 0.9519, "step": 155 }, { "epoch": 0.00918010212863618, "grad_norm": 6.828966108860384, "learning_rate": 1.8359150889271374e-06, "loss": 0.9697, "step": 160 }, { "epoch": 0.009466980320156061, "grad_norm": 6.9570208941401654, "learning_rate": 1.8932874354561103e-06, "loss": 0.9908, "step": 165 }, { "epoch": 0.009753858511675942, "grad_norm": 7.430769844482543, "learning_rate": 1.9506597819850836e-06, "loss": 0.9848, "step": 170 }, { "epoch": 0.010040736703195823, "grad_norm": 6.568157937494497, "learning_rate": 2.0080321285140564e-06, "loss": 0.9369, "step": 175 }, { "epoch": 0.010327614894715703, "grad_norm": 6.513701678468512, "learning_rate": 2.0654044750430293e-06, "loss": 1.0107, "step": 180 }, { "epoch": 0.010614493086235584, "grad_norm": 5.968525556797643, "learning_rate": 2.1227768215720025e-06, "loss": 1.0153, "step": 185 }, { "epoch": 0.010901371277755464, "grad_norm": 6.712267780125409, "learning_rate": 2.1801491681009754e-06, "loss": 1.0227, "step": 190 }, { "epoch": 0.011188249469275345, "grad_norm": 6.430776348239323, "learning_rate": 2.2375215146299486e-06, "loss": 0.8964, "step": 195 }, { "epoch": 0.011475127660795226, "grad_norm": 6.51032717922432, "learning_rate": 2.2948938611589215e-06, "loss": 1.0529, "step": 200 }, { "epoch": 0.011762005852315106, "grad_norm": 7.068578907363485, "learning_rate": 2.3522662076878948e-06, "loss": 1.0663, "step": 205 }, { "epoch": 0.012048884043834987, "grad_norm": 6.389933921691043, "learning_rate": 2.4096385542168676e-06, "loss": 1.0433, "step": 210 }, { "epoch": 0.012335762235354868, "grad_norm": 6.505195904119943, "learning_rate": 2.4670109007458405e-06, "loss": 1.0056, "step": 215 }, { "epoch": 0.012622640426874748, "grad_norm": 6.744762337542109, "learning_rate": 2.5243832472748137e-06, "loss": 0.9578, "step": 220 }, { "epoch": 0.012909518618394629, "grad_norm": 6.348403608073718, "learning_rate": 2.581755593803787e-06, "loss": 1.0109, "step": 225 }, { "epoch": 0.01319639680991451, "grad_norm": 6.570708092814506, "learning_rate": 2.63912794033276e-06, "loss": 1.0884, "step": 230 }, { "epoch": 0.01348327500143439, "grad_norm": 6.355493538884563, "learning_rate": 2.696500286861733e-06, "loss": 0.9948, "step": 235 }, { "epoch": 0.013770153192954271, "grad_norm": 6.555605587374978, "learning_rate": 2.7538726333907055e-06, "loss": 1.0316, "step": 240 }, { "epoch": 0.014057031384474152, "grad_norm": 6.001325760234826, "learning_rate": 2.811244979919679e-06, "loss": 1.0465, "step": 245 }, { "epoch": 0.014343909575994032, "grad_norm": 6.634787745986684, "learning_rate": 2.868617326448652e-06, "loss": 1.0084, "step": 250 }, { "epoch": 0.014630787767513913, "grad_norm": 6.077380703245379, "learning_rate": 2.925989672977625e-06, "loss": 0.9853, "step": 255 }, { "epoch": 0.014917665959033793, "grad_norm": 12.041317079256487, "learning_rate": 2.983362019506598e-06, "loss": 0.9607, "step": 260 }, { "epoch": 0.015204544150553674, "grad_norm": 5.808886823566987, "learning_rate": 3.040734366035571e-06, "loss": 1.0258, "step": 265 }, { "epoch": 0.015491422342073555, "grad_norm": 6.04574219959993, "learning_rate": 3.0981067125645443e-06, "loss": 0.9432, "step": 270 }, { "epoch": 0.015778300533593435, "grad_norm": 6.3028089159840075, "learning_rate": 3.1554790590935167e-06, "loss": 0.9541, "step": 275 }, { "epoch": 0.016065178725113316, "grad_norm": 6.411545256601727, "learning_rate": 3.21285140562249e-06, "loss": 0.9757, "step": 280 }, { "epoch": 0.016352056916633197, "grad_norm": 6.301481108298221, "learning_rate": 3.2702237521514633e-06, "loss": 1.0209, "step": 285 }, { "epoch": 0.016638935108153077, "grad_norm": 6.292336057808692, "learning_rate": 3.327596098680436e-06, "loss": 1.0517, "step": 290 }, { "epoch": 0.016925813299672958, "grad_norm": 6.3380087628676725, "learning_rate": 3.3849684452094094e-06, "loss": 1.0638, "step": 295 }, { "epoch": 0.01721269149119284, "grad_norm": 6.548243444778715, "learning_rate": 3.4423407917383822e-06, "loss": 0.963, "step": 300 }, { "epoch": 0.01749956968271272, "grad_norm": 5.936093930412873, "learning_rate": 3.4997131382673555e-06, "loss": 0.998, "step": 305 }, { "epoch": 0.0177864478742326, "grad_norm": 5.841015511407791, "learning_rate": 3.5570854847963284e-06, "loss": 0.9243, "step": 310 }, { "epoch": 0.01807332606575248, "grad_norm": 6.02709706399877, "learning_rate": 3.6144578313253016e-06, "loss": 1.0982, "step": 315 }, { "epoch": 0.01836020425727236, "grad_norm": 6.231290404498924, "learning_rate": 3.671830177854275e-06, "loss": 0.9981, "step": 320 }, { "epoch": 0.018647082448792242, "grad_norm": 6.063568844989174, "learning_rate": 3.7292025243832473e-06, "loss": 1.0253, "step": 325 }, { "epoch": 0.018933960640312122, "grad_norm": 6.488414021587068, "learning_rate": 3.7865748709122206e-06, "loss": 1.0574, "step": 330 }, { "epoch": 0.019220838831832003, "grad_norm": 6.167321729140299, "learning_rate": 3.8439472174411934e-06, "loss": 1.0481, "step": 335 }, { "epoch": 0.019507717023351884, "grad_norm": 6.715228054258956, "learning_rate": 3.901319563970167e-06, "loss": 1.0285, "step": 340 }, { "epoch": 0.019794595214871764, "grad_norm": 6.163122386476777, "learning_rate": 3.958691910499139e-06, "loss": 1.0245, "step": 345 }, { "epoch": 0.020081473406391645, "grad_norm": 6.534958005215632, "learning_rate": 4.016064257028113e-06, "loss": 1.0287, "step": 350 }, { "epoch": 0.020368351597911526, "grad_norm": 6.18828551102654, "learning_rate": 4.073436603557086e-06, "loss": 1.0613, "step": 355 }, { "epoch": 0.020655229789431406, "grad_norm": 5.9993762302144775, "learning_rate": 4.1308089500860585e-06, "loss": 1.0334, "step": 360 }, { "epoch": 0.020942107980951287, "grad_norm": 5.714021559866135, "learning_rate": 4.188181296615032e-06, "loss": 1.0312, "step": 365 }, { "epoch": 0.021228986172471168, "grad_norm": 6.293130802893021, "learning_rate": 4.245553643144005e-06, "loss": 0.9367, "step": 370 }, { "epoch": 0.021515864363991048, "grad_norm": 5.965203982082978, "learning_rate": 4.302925989672978e-06, "loss": 0.9885, "step": 375 }, { "epoch": 0.02180274255551093, "grad_norm": 5.664784692127852, "learning_rate": 4.360298336201951e-06, "loss": 1.0143, "step": 380 }, { "epoch": 0.02208962074703081, "grad_norm": 6.47385979311387, "learning_rate": 4.4176706827309244e-06, "loss": 1.0099, "step": 385 }, { "epoch": 0.02237649893855069, "grad_norm": 6.439095596417777, "learning_rate": 4.475043029259897e-06, "loss": 1.0003, "step": 390 }, { "epoch": 0.02266337713007057, "grad_norm": 6.335344398825625, "learning_rate": 4.53241537578887e-06, "loss": 1.0381, "step": 395 }, { "epoch": 0.02295025532159045, "grad_norm": 5.8276729609707205, "learning_rate": 4.589787722317843e-06, "loss": 0.9782, "step": 400 }, { "epoch": 0.023237133513110332, "grad_norm": 5.717105795961442, "learning_rate": 4.647160068846816e-06, "loss": 1.004, "step": 405 }, { "epoch": 0.023524011704630213, "grad_norm": 6.340719567677661, "learning_rate": 4.7045324153757895e-06, "loss": 0.9929, "step": 410 }, { "epoch": 0.023810889896150093, "grad_norm": 6.198239788060548, "learning_rate": 4.761904761904762e-06, "loss": 1.0673, "step": 415 }, { "epoch": 0.024097768087669974, "grad_norm": 6.045019112550878, "learning_rate": 4.819277108433735e-06, "loss": 0.984, "step": 420 }, { "epoch": 0.024384646279189855, "grad_norm": 5.9467642101586895, "learning_rate": 4.876649454962709e-06, "loss": 1.0305, "step": 425 }, { "epoch": 0.024671524470709735, "grad_norm": 6.450714722668409, "learning_rate": 4.934021801491681e-06, "loss": 1.0685, "step": 430 }, { "epoch": 0.024958402662229616, "grad_norm": 5.8412034560547434, "learning_rate": 4.991394148020655e-06, "loss": 1.0873, "step": 435 }, { "epoch": 0.025245280853749497, "grad_norm": 5.548641758284582, "learning_rate": 5.0487664945496275e-06, "loss": 0.9785, "step": 440 }, { "epoch": 0.025532159045269377, "grad_norm": 5.503425810045256, "learning_rate": 5.1061388410786e-06, "loss": 1.0488, "step": 445 }, { "epoch": 0.025819037236789258, "grad_norm": 5.778947333949336, "learning_rate": 5.163511187607574e-06, "loss": 1.0061, "step": 450 }, { "epoch": 0.02610591542830914, "grad_norm": 6.175759755048395, "learning_rate": 5.220883534136547e-06, "loss": 1.0248, "step": 455 }, { "epoch": 0.02639279361982902, "grad_norm": 6.182303913774293, "learning_rate": 5.27825588066552e-06, "loss": 1.0899, "step": 460 }, { "epoch": 0.0266796718113489, "grad_norm": 6.012408219291012, "learning_rate": 5.3356282271944925e-06, "loss": 0.979, "step": 465 }, { "epoch": 0.02696655000286878, "grad_norm": 5.7031088367428096, "learning_rate": 5.393000573723466e-06, "loss": 1.1099, "step": 470 }, { "epoch": 0.02725342819438866, "grad_norm": 6.068756650964581, "learning_rate": 5.450372920252439e-06, "loss": 1.0386, "step": 475 }, { "epoch": 0.027540306385908542, "grad_norm": 5.904949387403092, "learning_rate": 5.507745266781411e-06, "loss": 1.0139, "step": 480 }, { "epoch": 0.027827184577428422, "grad_norm": 5.873300038734275, "learning_rate": 5.565117613310385e-06, "loss": 1.0716, "step": 485 }, { "epoch": 0.028114062768948303, "grad_norm": 7.521703883788804, "learning_rate": 5.622489959839358e-06, "loss": 0.9733, "step": 490 }, { "epoch": 0.028400940960468184, "grad_norm": 6.443757075666579, "learning_rate": 5.679862306368331e-06, "loss": 1.0007, "step": 495 }, { "epoch": 0.028687819151988064, "grad_norm": 5.925685931567415, "learning_rate": 5.737234652897304e-06, "loss": 1.0444, "step": 500 }, { "epoch": 0.028974697343507945, "grad_norm": 6.1330766468329765, "learning_rate": 5.794606999426276e-06, "loss": 1.0098, "step": 505 }, { "epoch": 0.029261575535027826, "grad_norm": 6.012655632972298, "learning_rate": 5.85197934595525e-06, "loss": 1.0435, "step": 510 }, { "epoch": 0.029548453726547706, "grad_norm": 6.167953985501741, "learning_rate": 5.9093516924842235e-06, "loss": 1.1518, "step": 515 }, { "epoch": 0.029835331918067587, "grad_norm": 6.071810157171307, "learning_rate": 5.966724039013196e-06, "loss": 1.1178, "step": 520 }, { "epoch": 0.030122210109587468, "grad_norm": 6.0953163683142755, "learning_rate": 6.02409638554217e-06, "loss": 1.0585, "step": 525 }, { "epoch": 0.030409088301107348, "grad_norm": 5.549135253828369, "learning_rate": 6.081468732071142e-06, "loss": 1.0108, "step": 530 }, { "epoch": 0.03069596649262723, "grad_norm": 5.932551041380845, "learning_rate": 6.138841078600115e-06, "loss": 1.0223, "step": 535 }, { "epoch": 0.03098284468414711, "grad_norm": 5.517603460387521, "learning_rate": 6.196213425129089e-06, "loss": 1.0598, "step": 540 }, { "epoch": 0.03126972287566699, "grad_norm": 5.846569709494865, "learning_rate": 6.2535857716580615e-06, "loss": 1.0039, "step": 545 }, { "epoch": 0.03155660106718687, "grad_norm": 6.486294567015053, "learning_rate": 6.3109581181870335e-06, "loss": 1.1084, "step": 550 }, { "epoch": 0.03184347925870675, "grad_norm": 5.630959006393395, "learning_rate": 6.368330464716007e-06, "loss": 1.0106, "step": 555 }, { "epoch": 0.03213035745022663, "grad_norm": 5.485882012359311, "learning_rate": 6.42570281124498e-06, "loss": 1.0672, "step": 560 }, { "epoch": 0.03241723564174651, "grad_norm": 6.29655319515772, "learning_rate": 6.483075157773954e-06, "loss": 1.0535, "step": 565 }, { "epoch": 0.03270411383326639, "grad_norm": 5.352699061024898, "learning_rate": 6.5404475043029266e-06, "loss": 1.0992, "step": 570 }, { "epoch": 0.032990992024786274, "grad_norm": 6.637962407048809, "learning_rate": 6.597819850831899e-06, "loss": 1.0772, "step": 575 }, { "epoch": 0.033277870216306155, "grad_norm": 5.773453767339884, "learning_rate": 6.655192197360872e-06, "loss": 1.0009, "step": 580 }, { "epoch": 0.033564748407826035, "grad_norm": 5.678390508058307, "learning_rate": 6.712564543889846e-06, "loss": 1.0902, "step": 585 }, { "epoch": 0.033851626599345916, "grad_norm": 5.663324284228026, "learning_rate": 6.769936890418819e-06, "loss": 1.0915, "step": 590 }, { "epoch": 0.0341385047908658, "grad_norm": 5.945390421080143, "learning_rate": 6.8273092369477925e-06, "loss": 1.1159, "step": 595 }, { "epoch": 0.03442538298238568, "grad_norm": 6.000462757685704, "learning_rate": 6.8846815834767645e-06, "loss": 1.1272, "step": 600 }, { "epoch": 0.03471226117390556, "grad_norm": 6.727310214649698, "learning_rate": 6.942053930005737e-06, "loss": 1.0097, "step": 605 }, { "epoch": 0.03499913936542544, "grad_norm": 5.834127375089645, "learning_rate": 6.999426276534711e-06, "loss": 1.0415, "step": 610 }, { "epoch": 0.03528601755694532, "grad_norm": 5.908656291160888, "learning_rate": 7.056798623063684e-06, "loss": 1.016, "step": 615 }, { "epoch": 0.0355728957484652, "grad_norm": 5.8594607055823085, "learning_rate": 7.114170969592657e-06, "loss": 1.1304, "step": 620 }, { "epoch": 0.03585977393998508, "grad_norm": 5.180225508143497, "learning_rate": 7.1715433161216296e-06, "loss": 1.1146, "step": 625 }, { "epoch": 0.03614665213150496, "grad_norm": 7.5216353389200625, "learning_rate": 7.228915662650603e-06, "loss": 1.0418, "step": 630 }, { "epoch": 0.03643353032302484, "grad_norm": 5.690938295073585, "learning_rate": 7.286288009179576e-06, "loss": 1.0355, "step": 635 }, { "epoch": 0.03672040851454472, "grad_norm": 5.906975180504765, "learning_rate": 7.34366035570855e-06, "loss": 1.0604, "step": 640 }, { "epoch": 0.0370072867060646, "grad_norm": 5.454253225813113, "learning_rate": 7.401032702237522e-06, "loss": 1.0328, "step": 645 }, { "epoch": 0.037294164897584484, "grad_norm": 5.603450078829713, "learning_rate": 7.458405048766495e-06, "loss": 0.9958, "step": 650 }, { "epoch": 0.037581043089104364, "grad_norm": 5.751875002760454, "learning_rate": 7.515777395295468e-06, "loss": 1.1126, "step": 655 }, { "epoch": 0.037867921280624245, "grad_norm": 6.079445844086674, "learning_rate": 7.573149741824441e-06, "loss": 1.0507, "step": 660 }, { "epoch": 0.038154799472144126, "grad_norm": 5.7332979976805065, "learning_rate": 7.630522088353415e-06, "loss": 1.0691, "step": 665 }, { "epoch": 0.038441677663664006, "grad_norm": 5.221074675036754, "learning_rate": 7.687894434882387e-06, "loss": 0.9502, "step": 670 }, { "epoch": 0.03872855585518389, "grad_norm": 5.922889258335442, "learning_rate": 7.74526678141136e-06, "loss": 1.1478, "step": 675 }, { "epoch": 0.03901543404670377, "grad_norm": 5.29083304696582, "learning_rate": 7.802639127940334e-06, "loss": 1.1421, "step": 680 }, { "epoch": 0.03930231223822365, "grad_norm": 5.790330046832938, "learning_rate": 7.860011474469306e-06, "loss": 1.0533, "step": 685 }, { "epoch": 0.03958919042974353, "grad_norm": 6.363431025562162, "learning_rate": 7.917383820998278e-06, "loss": 1.0505, "step": 690 }, { "epoch": 0.03987606862126341, "grad_norm": 5.790173609550593, "learning_rate": 7.974756167527252e-06, "loss": 1.0506, "step": 695 }, { "epoch": 0.04016294681278329, "grad_norm": 6.317369795268714, "learning_rate": 8.032128514056226e-06, "loss": 1.1327, "step": 700 }, { "epoch": 0.04044982500430317, "grad_norm": 5.3728142238173096, "learning_rate": 8.0895008605852e-06, "loss": 1.0658, "step": 705 }, { "epoch": 0.04073670319582305, "grad_norm": 6.46754781144707, "learning_rate": 8.146873207114171e-06, "loss": 1.0065, "step": 710 }, { "epoch": 0.04102358138734293, "grad_norm": 6.093669635592826, "learning_rate": 8.204245553643145e-06, "loss": 1.1248, "step": 715 }, { "epoch": 0.04131045957886281, "grad_norm": 5.500299207612891, "learning_rate": 8.261617900172117e-06, "loss": 1.0621, "step": 720 }, { "epoch": 0.04159733777038269, "grad_norm": 5.425878390088494, "learning_rate": 8.31899024670109e-06, "loss": 1.0736, "step": 725 }, { "epoch": 0.041884215961902574, "grad_norm": 5.68985912700441, "learning_rate": 8.376362593230064e-06, "loss": 1.1347, "step": 730 }, { "epoch": 0.042171094153422455, "grad_norm": 6.417838279997857, "learning_rate": 8.433734939759038e-06, "loss": 1.1104, "step": 735 }, { "epoch": 0.042457972344942335, "grad_norm": 5.095204998636876, "learning_rate": 8.49110728628801e-06, "loss": 1.0487, "step": 740 }, { "epoch": 0.042744850536462216, "grad_norm": 5.449868885704676, "learning_rate": 8.548479632816982e-06, "loss": 1.1217, "step": 745 }, { "epoch": 0.043031728727982096, "grad_norm": 5.36657305640976, "learning_rate": 8.605851979345956e-06, "loss": 1.0742, "step": 750 }, { "epoch": 0.04331860691950198, "grad_norm": 5.277821491497758, "learning_rate": 8.66322432587493e-06, "loss": 1.0556, "step": 755 }, { "epoch": 0.04360548511102186, "grad_norm": 5.962269065527649, "learning_rate": 8.720596672403902e-06, "loss": 1.1027, "step": 760 }, { "epoch": 0.04389236330254174, "grad_norm": 5.572472271181297, "learning_rate": 8.777969018932875e-06, "loss": 1.0429, "step": 765 }, { "epoch": 0.04417924149406162, "grad_norm": 6.214468873238288, "learning_rate": 8.835341365461849e-06, "loss": 1.1393, "step": 770 }, { "epoch": 0.0444661196855815, "grad_norm": 5.390974828977905, "learning_rate": 8.892713711990821e-06, "loss": 1.0453, "step": 775 }, { "epoch": 0.04475299787710138, "grad_norm": 5.713343453322397, "learning_rate": 8.950086058519795e-06, "loss": 1.1194, "step": 780 }, { "epoch": 0.04503987606862126, "grad_norm": 5.888334116815661, "learning_rate": 9.007458405048767e-06, "loss": 1.099, "step": 785 }, { "epoch": 0.04532675426014114, "grad_norm": 5.630001544753435, "learning_rate": 9.06483075157774e-06, "loss": 1.0771, "step": 790 }, { "epoch": 0.04561363245166102, "grad_norm": 5.638480028086103, "learning_rate": 9.122203098106714e-06, "loss": 1.1316, "step": 795 }, { "epoch": 0.0459005106431809, "grad_norm": 5.394231049658397, "learning_rate": 9.179575444635686e-06, "loss": 1.0297, "step": 800 }, { "epoch": 0.046187388834700784, "grad_norm": 5.693543131488364, "learning_rate": 9.23694779116466e-06, "loss": 1.0582, "step": 805 }, { "epoch": 0.046474267026220664, "grad_norm": 6.371537244782153, "learning_rate": 9.294320137693632e-06, "loss": 1.0759, "step": 810 }, { "epoch": 0.046761145217740545, "grad_norm": 6.203470279857053, "learning_rate": 9.351692484222605e-06, "loss": 1.0645, "step": 815 }, { "epoch": 0.047048023409260425, "grad_norm": 5.521927818881918, "learning_rate": 9.409064830751579e-06, "loss": 1.111, "step": 820 }, { "epoch": 0.047334901600780306, "grad_norm": 5.150421054345734, "learning_rate": 9.466437177280551e-06, "loss": 1.1599, "step": 825 }, { "epoch": 0.04762177979230019, "grad_norm": 530.7378900393746, "learning_rate": 9.523809523809525e-06, "loss": 1.1586, "step": 830 }, { "epoch": 0.04790865798382007, "grad_norm": 6.117224272934098, "learning_rate": 9.581181870338497e-06, "loss": 1.1046, "step": 835 }, { "epoch": 0.04819553617533995, "grad_norm": 5.185575012386112, "learning_rate": 9.63855421686747e-06, "loss": 1.0685, "step": 840 }, { "epoch": 0.04848241436685983, "grad_norm": 6.28107325617092, "learning_rate": 9.695926563396444e-06, "loss": 1.061, "step": 845 }, { "epoch": 0.04876929255837971, "grad_norm": 40.04998828068869, "learning_rate": 9.753298909925418e-06, "loss": 1.1407, "step": 850 }, { "epoch": 0.04905617074989959, "grad_norm": 6.6923246604180875, "learning_rate": 9.81067125645439e-06, "loss": 1.1337, "step": 855 }, { "epoch": 0.04934304894141947, "grad_norm": 6.316969793527926, "learning_rate": 9.868043602983362e-06, "loss": 1.1558, "step": 860 }, { "epoch": 0.04962992713293935, "grad_norm": 5.554729483640509, "learning_rate": 9.925415949512336e-06, "loss": 1.1011, "step": 865 }, { "epoch": 0.04991680532445923, "grad_norm": 6.177419229303469, "learning_rate": 9.98278829604131e-06, "loss": 1.1111, "step": 870 }, { "epoch": 0.05020368351597911, "grad_norm": 6.050423400967422, "learning_rate": 1.0040160642570283e-05, "loss": 1.1255, "step": 875 }, { "epoch": 0.05049056170749899, "grad_norm": 5.830676273221124, "learning_rate": 1.0097532989099255e-05, "loss": 1.1709, "step": 880 }, { "epoch": 0.050777439899018874, "grad_norm": 5.631139001028401, "learning_rate": 1.0154905335628229e-05, "loss": 1.145, "step": 885 }, { "epoch": 0.051064318090538754, "grad_norm": 5.801181780741042, "learning_rate": 1.02122776821572e-05, "loss": 1.1262, "step": 890 }, { "epoch": 0.051351196282058635, "grad_norm": 6.110782762372955, "learning_rate": 1.0269650028686173e-05, "loss": 1.0903, "step": 895 }, { "epoch": 0.051638074473578516, "grad_norm": 5.800998980935618, "learning_rate": 1.0327022375215148e-05, "loss": 1.17, "step": 900 }, { "epoch": 0.051924952665098396, "grad_norm": 5.596057684691657, "learning_rate": 1.038439472174412e-05, "loss": 1.081, "step": 905 }, { "epoch": 0.05221183085661828, "grad_norm": 5.9311367465280265, "learning_rate": 1.0441767068273094e-05, "loss": 1.2247, "step": 910 }, { "epoch": 0.05249870904813816, "grad_norm": 6.200884165536612, "learning_rate": 1.0499139414802066e-05, "loss": 1.0895, "step": 915 }, { "epoch": 0.05278558723965804, "grad_norm": 5.824518167988625, "learning_rate": 1.055651176133104e-05, "loss": 1.1809, "step": 920 }, { "epoch": 0.05307246543117792, "grad_norm": 5.62200267754407, "learning_rate": 1.0613884107860013e-05, "loss": 1.1415, "step": 925 }, { "epoch": 0.0533593436226978, "grad_norm": 4.882306452418002, "learning_rate": 1.0671256454388985e-05, "loss": 1.1017, "step": 930 }, { "epoch": 0.05364622181421768, "grad_norm": 7.378770461076936, "learning_rate": 1.0728628800917957e-05, "loss": 1.161, "step": 935 }, { "epoch": 0.05393310000573756, "grad_norm": 28.782079490913052, "learning_rate": 1.0786001147446932e-05, "loss": 1.1201, "step": 940 }, { "epoch": 0.05421997819725744, "grad_norm": 5.815219820170197, "learning_rate": 1.0843373493975904e-05, "loss": 1.0962, "step": 945 }, { "epoch": 0.05450685638877732, "grad_norm": 5.618712939088892, "learning_rate": 1.0900745840504878e-05, "loss": 1.053, "step": 950 }, { "epoch": 0.0547937345802972, "grad_norm": 6.490107470473578, "learning_rate": 1.095811818703385e-05, "loss": 1.1194, "step": 955 }, { "epoch": 0.055080612771817083, "grad_norm": 5.971772549666276, "learning_rate": 1.1015490533562822e-05, "loss": 1.1109, "step": 960 }, { "epoch": 0.055367490963336964, "grad_norm": 5.059401471934044, "learning_rate": 1.1072862880091798e-05, "loss": 1.1043, "step": 965 }, { "epoch": 0.055654369154856845, "grad_norm": 7.344049015356465, "learning_rate": 1.113023522662077e-05, "loss": 1.1168, "step": 970 }, { "epoch": 0.055941247346376725, "grad_norm": 6.047184541116098, "learning_rate": 1.1187607573149743e-05, "loss": 1.0835, "step": 975 }, { "epoch": 0.056228125537896606, "grad_norm": 5.301514111717804, "learning_rate": 1.1244979919678715e-05, "loss": 1.1196, "step": 980 }, { "epoch": 0.05651500372941649, "grad_norm": 5.309999251946666, "learning_rate": 1.1302352266207687e-05, "loss": 1.1134, "step": 985 }, { "epoch": 0.05680188192093637, "grad_norm": 7.000159336305085, "learning_rate": 1.1359724612736663e-05, "loss": 1.0661, "step": 990 }, { "epoch": 0.05708876011245625, "grad_norm": 5.289778220013336, "learning_rate": 1.1417096959265635e-05, "loss": 1.0823, "step": 995 }, { "epoch": 0.05737563830397613, "grad_norm": 5.808248792685078, "learning_rate": 1.1474469305794608e-05, "loss": 1.1794, "step": 1000 }, { "epoch": 0.05766251649549601, "grad_norm": 5.778811751934345, "learning_rate": 1.153184165232358e-05, "loss": 1.1324, "step": 1005 }, { "epoch": 0.05794939468701589, "grad_norm": 5.939627938024098, "learning_rate": 1.1589213998852552e-05, "loss": 1.1598, "step": 1010 }, { "epoch": 0.05823627287853577, "grad_norm": 6.17589475068515, "learning_rate": 1.1646586345381528e-05, "loss": 1.1909, "step": 1015 }, { "epoch": 0.05852315107005565, "grad_norm": 5.551317430785165, "learning_rate": 1.17039586919105e-05, "loss": 1.1444, "step": 1020 }, { "epoch": 0.05881002926157553, "grad_norm": 5.597792202981386, "learning_rate": 1.1761331038439473e-05, "loss": 1.1169, "step": 1025 }, { "epoch": 0.05909690745309541, "grad_norm": 6.834068670425091, "learning_rate": 1.1818703384968447e-05, "loss": 1.141, "step": 1030 }, { "epoch": 0.05938378564461529, "grad_norm": 5.772987633722752, "learning_rate": 1.1876075731497419e-05, "loss": 1.1277, "step": 1035 }, { "epoch": 0.059670663836135174, "grad_norm": 5.962965390351646, "learning_rate": 1.1933448078026393e-05, "loss": 1.1124, "step": 1040 }, { "epoch": 0.059957542027655054, "grad_norm": 5.773279858726362, "learning_rate": 1.1990820424555365e-05, "loss": 1.2577, "step": 1045 }, { "epoch": 0.060244420219174935, "grad_norm": 5.262608546585663, "learning_rate": 1.204819277108434e-05, "loss": 1.1764, "step": 1050 }, { "epoch": 0.060531298410694816, "grad_norm": 5.786215231901187, "learning_rate": 1.2105565117613312e-05, "loss": 1.1338, "step": 1055 }, { "epoch": 0.060818176602214696, "grad_norm": 6.496302177402128, "learning_rate": 1.2162937464142284e-05, "loss": 1.1198, "step": 1060 }, { "epoch": 0.06110505479373458, "grad_norm": 6.0881756967634395, "learning_rate": 1.2220309810671258e-05, "loss": 1.1919, "step": 1065 }, { "epoch": 0.06139193298525446, "grad_norm": 5.547797724966673, "learning_rate": 1.227768215720023e-05, "loss": 1.058, "step": 1070 }, { "epoch": 0.06167881117677434, "grad_norm": 6.135173765458703, "learning_rate": 1.2335054503729202e-05, "loss": 1.1109, "step": 1075 }, { "epoch": 0.06196568936829422, "grad_norm": 5.233454735706023, "learning_rate": 1.2392426850258177e-05, "loss": 1.1455, "step": 1080 }, { "epoch": 0.0622525675598141, "grad_norm": 5.29272963469865, "learning_rate": 1.244979919678715e-05, "loss": 1.1413, "step": 1085 }, { "epoch": 0.06253944575133398, "grad_norm": 5.42684594293495, "learning_rate": 1.2507171543316123e-05, "loss": 1.0925, "step": 1090 }, { "epoch": 0.06282632394285387, "grad_norm": 6.518672606160382, "learning_rate": 1.2564543889845095e-05, "loss": 1.1604, "step": 1095 }, { "epoch": 0.06311320213437374, "grad_norm": 5.52835431510946, "learning_rate": 1.2621916236374067e-05, "loss": 1.1563, "step": 1100 }, { "epoch": 0.06340008032589363, "grad_norm": 5.291740831357666, "learning_rate": 1.2679288582903042e-05, "loss": 1.1189, "step": 1105 }, { "epoch": 0.0636869585174135, "grad_norm": 5.441773422117592, "learning_rate": 1.2736660929432014e-05, "loss": 1.1808, "step": 1110 }, { "epoch": 0.06397383670893339, "grad_norm": 5.283076368840306, "learning_rate": 1.2794033275960988e-05, "loss": 1.1443, "step": 1115 }, { "epoch": 0.06426071490045326, "grad_norm": 5.5868444501688375, "learning_rate": 1.285140562248996e-05, "loss": 1.1244, "step": 1120 }, { "epoch": 0.06454759309197315, "grad_norm": 5.1421542197142776, "learning_rate": 1.2908777969018934e-05, "loss": 1.1654, "step": 1125 }, { "epoch": 0.06483447128349303, "grad_norm": 5.319966133271966, "learning_rate": 1.2966150315547907e-05, "loss": 1.1711, "step": 1130 }, { "epoch": 0.06512134947501291, "grad_norm": 5.032868773934788, "learning_rate": 1.302352266207688e-05, "loss": 1.2274, "step": 1135 }, { "epoch": 0.06540822766653279, "grad_norm": 5.148454072052246, "learning_rate": 1.3080895008605853e-05, "loss": 1.1236, "step": 1140 }, { "epoch": 0.06569510585805267, "grad_norm": 5.25887993252207, "learning_rate": 1.3138267355134827e-05, "loss": 1.2075, "step": 1145 }, { "epoch": 0.06598198404957255, "grad_norm": 5.667309607227554, "learning_rate": 1.3195639701663799e-05, "loss": 1.1989, "step": 1150 }, { "epoch": 0.06626886224109244, "grad_norm": 5.172688963152066, "learning_rate": 1.3253012048192772e-05, "loss": 1.1828, "step": 1155 }, { "epoch": 0.06655574043261231, "grad_norm": 5.261541675150709, "learning_rate": 1.3310384394721744e-05, "loss": 1.2025, "step": 1160 }, { "epoch": 0.0668426186241322, "grad_norm": 6.0228125399033585, "learning_rate": 1.336775674125072e-05, "loss": 1.1908, "step": 1165 }, { "epoch": 0.06712949681565207, "grad_norm": 5.619991187017016, "learning_rate": 1.3425129087779692e-05, "loss": 1.1449, "step": 1170 }, { "epoch": 0.06741637500717196, "grad_norm": 5.579252606383179, "learning_rate": 1.3482501434308664e-05, "loss": 1.1582, "step": 1175 }, { "epoch": 0.06770325319869183, "grad_norm": 5.277094352559649, "learning_rate": 1.3539873780837638e-05, "loss": 1.1088, "step": 1180 }, { "epoch": 0.06799013139021172, "grad_norm": 6.731166433028172, "learning_rate": 1.359724612736661e-05, "loss": 1.1509, "step": 1185 }, { "epoch": 0.0682770095817316, "grad_norm": 5.114773082673724, "learning_rate": 1.3654618473895585e-05, "loss": 1.1268, "step": 1190 }, { "epoch": 0.06856388777325148, "grad_norm": 5.443977334664743, "learning_rate": 1.3711990820424557e-05, "loss": 1.2162, "step": 1195 }, { "epoch": 0.06885076596477135, "grad_norm": 5.522133662651608, "learning_rate": 1.3769363166953529e-05, "loss": 1.1593, "step": 1200 }, { "epoch": 0.06913764415629124, "grad_norm": 5.220230480886979, "learning_rate": 1.3826735513482503e-05, "loss": 1.1425, "step": 1205 }, { "epoch": 0.06942452234781112, "grad_norm": 8.232152130751398, "learning_rate": 1.3884107860011475e-05, "loss": 1.1186, "step": 1210 }, { "epoch": 0.069711400539331, "grad_norm": 5.501254631106599, "learning_rate": 1.3941480206540448e-05, "loss": 1.1538, "step": 1215 }, { "epoch": 0.06999827873085088, "grad_norm": 5.188568423874403, "learning_rate": 1.3998852553069422e-05, "loss": 1.1881, "step": 1220 }, { "epoch": 0.07028515692237076, "grad_norm": 6.829056376936748, "learning_rate": 1.4056224899598394e-05, "loss": 1.139, "step": 1225 }, { "epoch": 0.07057203511389064, "grad_norm": 5.686941426048155, "learning_rate": 1.4113597246127368e-05, "loss": 1.1656, "step": 1230 }, { "epoch": 0.07085891330541053, "grad_norm": 5.743350753801365, "learning_rate": 1.4170969592656341e-05, "loss": 1.1423, "step": 1235 }, { "epoch": 0.0711457914969304, "grad_norm": 28.585738540553507, "learning_rate": 1.4228341939185313e-05, "loss": 1.1535, "step": 1240 }, { "epoch": 0.07143266968845029, "grad_norm": 5.693733343162777, "learning_rate": 1.4285714285714287e-05, "loss": 1.1553, "step": 1245 }, { "epoch": 0.07171954787997016, "grad_norm": 5.622938707973783, "learning_rate": 1.4343086632243259e-05, "loss": 1.2302, "step": 1250 }, { "epoch": 0.07200642607149005, "grad_norm": 6.282252377265223, "learning_rate": 1.4400458978772235e-05, "loss": 1.2047, "step": 1255 }, { "epoch": 0.07229330426300992, "grad_norm": 5.240933485730208, "learning_rate": 1.4457831325301207e-05, "loss": 1.1917, "step": 1260 }, { "epoch": 0.07258018245452981, "grad_norm": 5.4722774529946925, "learning_rate": 1.4515203671830179e-05, "loss": 1.2421, "step": 1265 }, { "epoch": 0.07286706064604968, "grad_norm": 6.0579054741101155, "learning_rate": 1.4572576018359152e-05, "loss": 1.1452, "step": 1270 }, { "epoch": 0.07315393883756957, "grad_norm": 5.738916990431266, "learning_rate": 1.4629948364888124e-05, "loss": 1.1611, "step": 1275 }, { "epoch": 0.07344081702908944, "grad_norm": 5.560633712725496, "learning_rate": 1.46873207114171e-05, "loss": 1.2396, "step": 1280 }, { "epoch": 0.07372769522060933, "grad_norm": 7.0939719159232, "learning_rate": 1.4744693057946072e-05, "loss": 1.2334, "step": 1285 }, { "epoch": 0.0740145734121292, "grad_norm": 5.207083977144141, "learning_rate": 1.4802065404475044e-05, "loss": 1.1542, "step": 1290 }, { "epoch": 0.0743014516036491, "grad_norm": 5.566595661684277, "learning_rate": 1.4859437751004017e-05, "loss": 1.2335, "step": 1295 }, { "epoch": 0.07458832979516897, "grad_norm": 5.201535278255039, "learning_rate": 1.491681009753299e-05, "loss": 1.1597, "step": 1300 }, { "epoch": 0.07487520798668885, "grad_norm": 8.37727292003229, "learning_rate": 1.4974182444061965e-05, "loss": 1.1891, "step": 1305 }, { "epoch": 0.07516208617820873, "grad_norm": 5.682981417719145, "learning_rate": 1.5031554790590937e-05, "loss": 1.1302, "step": 1310 }, { "epoch": 0.07544896436972862, "grad_norm": 5.257447580181923, "learning_rate": 1.5088927137119909e-05, "loss": 1.1492, "step": 1315 }, { "epoch": 0.07573584256124849, "grad_norm": 5.838775804427132, "learning_rate": 1.5146299483648882e-05, "loss": 1.1671, "step": 1320 }, { "epoch": 0.07602272075276838, "grad_norm": 6.026885858423549, "learning_rate": 1.5203671830177854e-05, "loss": 1.2045, "step": 1325 }, { "epoch": 0.07630959894428825, "grad_norm": 5.908399812752056, "learning_rate": 1.526104417670683e-05, "loss": 1.2136, "step": 1330 }, { "epoch": 0.07659647713580814, "grad_norm": 6.387979984619573, "learning_rate": 1.5318416523235802e-05, "loss": 1.177, "step": 1335 }, { "epoch": 0.07688335532732801, "grad_norm": 7.392616466663444, "learning_rate": 1.5375788869764774e-05, "loss": 1.1999, "step": 1340 }, { "epoch": 0.0771702335188479, "grad_norm": 6.914708843383304, "learning_rate": 1.543316121629375e-05, "loss": 1.244, "step": 1345 }, { "epoch": 0.07745711171036777, "grad_norm": 5.007624216323807, "learning_rate": 1.549053356282272e-05, "loss": 1.1794, "step": 1350 }, { "epoch": 0.07774398990188766, "grad_norm": 5.086189016966721, "learning_rate": 1.5547905909351697e-05, "loss": 1.1995, "step": 1355 }, { "epoch": 0.07803086809340753, "grad_norm": 5.205199052229356, "learning_rate": 1.560527825588067e-05, "loss": 1.1628, "step": 1360 }, { "epoch": 0.07831774628492742, "grad_norm": 4.915842430662404, "learning_rate": 1.566265060240964e-05, "loss": 1.2368, "step": 1365 }, { "epoch": 0.0786046244764473, "grad_norm": 4.9595170884845015, "learning_rate": 1.5720022948938613e-05, "loss": 1.1438, "step": 1370 }, { "epoch": 0.07889150266796718, "grad_norm": 10.239936426064354, "learning_rate": 1.5777395295467585e-05, "loss": 1.1871, "step": 1375 }, { "epoch": 0.07917838085948706, "grad_norm": 7.149720706915769, "learning_rate": 1.5834767641996557e-05, "loss": 1.2666, "step": 1380 }, { "epoch": 0.07946525905100695, "grad_norm": 6.697574272118073, "learning_rate": 1.5892139988525532e-05, "loss": 1.1785, "step": 1385 }, { "epoch": 0.07975213724252682, "grad_norm": 5.885629412213772, "learning_rate": 1.5949512335054504e-05, "loss": 1.1904, "step": 1390 }, { "epoch": 0.0800390154340467, "grad_norm": 4.88838746943101, "learning_rate": 1.600688468158348e-05, "loss": 1.1974, "step": 1395 }, { "epoch": 0.08032589362556658, "grad_norm": 5.708731468478139, "learning_rate": 1.606425702811245e-05, "loss": 1.1602, "step": 1400 }, { "epoch": 0.08061277181708647, "grad_norm": 6.019551424076299, "learning_rate": 1.6121629374641423e-05, "loss": 1.3657, "step": 1405 }, { "epoch": 0.08089965000860634, "grad_norm": 6.3810272141018585, "learning_rate": 1.61790017211704e-05, "loss": 1.2877, "step": 1410 }, { "epoch": 0.08118652820012623, "grad_norm": 6.196969937036423, "learning_rate": 1.623637406769937e-05, "loss": 1.2593, "step": 1415 }, { "epoch": 0.0814734063916461, "grad_norm": 5.138494689314316, "learning_rate": 1.6293746414228343e-05, "loss": 1.2321, "step": 1420 }, { "epoch": 0.08176028458316599, "grad_norm": 7.677460329918865, "learning_rate": 1.6351118760757315e-05, "loss": 1.2077, "step": 1425 }, { "epoch": 0.08204716277468586, "grad_norm": 6.464280474545335, "learning_rate": 1.640849110728629e-05, "loss": 1.292, "step": 1430 }, { "epoch": 0.08233404096620575, "grad_norm": 5.853113721357127, "learning_rate": 1.6465863453815262e-05, "loss": 1.1729, "step": 1435 }, { "epoch": 0.08262091915772563, "grad_norm": 6.1335848601725536, "learning_rate": 1.6523235800344234e-05, "loss": 1.2836, "step": 1440 }, { "epoch": 0.08290779734924551, "grad_norm": 5.664288782783335, "learning_rate": 1.658060814687321e-05, "loss": 1.203, "step": 1445 }, { "epoch": 0.08319467554076539, "grad_norm": 5.366662821299216, "learning_rate": 1.663798049340218e-05, "loss": 1.2489, "step": 1450 }, { "epoch": 0.08348155373228527, "grad_norm": 5.415575782211289, "learning_rate": 1.6695352839931153e-05, "loss": 1.2339, "step": 1455 }, { "epoch": 0.08376843192380515, "grad_norm": 6.237910756517021, "learning_rate": 1.675272518646013e-05, "loss": 1.1462, "step": 1460 }, { "epoch": 0.08405531011532504, "grad_norm": 5.986220067400214, "learning_rate": 1.68100975329891e-05, "loss": 1.2849, "step": 1465 }, { "epoch": 0.08434218830684491, "grad_norm": 5.7528528020377125, "learning_rate": 1.6867469879518076e-05, "loss": 1.3098, "step": 1470 }, { "epoch": 0.0846290664983648, "grad_norm": 14.897871295942581, "learning_rate": 1.6924842226047048e-05, "loss": 1.3661, "step": 1475 }, { "epoch": 0.08491594468988467, "grad_norm": 5.7148796959262045, "learning_rate": 1.698221457257602e-05, "loss": 1.2495, "step": 1480 }, { "epoch": 0.08520282288140456, "grad_norm": 5.5043342451192805, "learning_rate": 1.7039586919104992e-05, "loss": 1.1714, "step": 1485 }, { "epoch": 0.08548970107292443, "grad_norm": 5.93990998762541, "learning_rate": 1.7096959265633964e-05, "loss": 1.1981, "step": 1490 }, { "epoch": 0.08577657926444432, "grad_norm": 6.595057366888322, "learning_rate": 1.715433161216294e-05, "loss": 1.1542, "step": 1495 }, { "epoch": 0.08606345745596419, "grad_norm": 5.75947340639385, "learning_rate": 1.721170395869191e-05, "loss": 1.2615, "step": 1500 }, { "epoch": 0.08635033564748408, "grad_norm": 5.441482634188159, "learning_rate": 1.7269076305220884e-05, "loss": 1.2772, "step": 1505 }, { "epoch": 0.08663721383900395, "grad_norm": 6.749149052842351, "learning_rate": 1.732644865174986e-05, "loss": 1.3273, "step": 1510 }, { "epoch": 0.08692409203052384, "grad_norm": 5.738888251396333, "learning_rate": 1.738382099827883e-05, "loss": 1.265, "step": 1515 }, { "epoch": 0.08721097022204372, "grad_norm": 5.506859194287912, "learning_rate": 1.7441193344807803e-05, "loss": 1.2632, "step": 1520 }, { "epoch": 0.0874978484135636, "grad_norm": 5.838627724282759, "learning_rate": 1.749856569133678e-05, "loss": 1.2678, "step": 1525 }, { "epoch": 0.08778472660508348, "grad_norm": 5.359261539819315, "learning_rate": 1.755593803786575e-05, "loss": 1.2039, "step": 1530 }, { "epoch": 0.08807160479660336, "grad_norm": 5.816196332627696, "learning_rate": 1.7613310384394722e-05, "loss": 1.1598, "step": 1535 }, { "epoch": 0.08835848298812324, "grad_norm": 13.318819246180842, "learning_rate": 1.7670682730923698e-05, "loss": 1.2279, "step": 1540 }, { "epoch": 0.08864536117964313, "grad_norm": 5.546948313668278, "learning_rate": 1.772805507745267e-05, "loss": 1.312, "step": 1545 }, { "epoch": 0.088932239371163, "grad_norm": 10.881555958292543, "learning_rate": 1.7785427423981642e-05, "loss": 1.2868, "step": 1550 }, { "epoch": 0.08921911756268289, "grad_norm": 9.674106515116575, "learning_rate": 1.7842799770510614e-05, "loss": 1.2134, "step": 1555 }, { "epoch": 0.08950599575420276, "grad_norm": 5.261270387736301, "learning_rate": 1.790017211703959e-05, "loss": 1.1808, "step": 1560 }, { "epoch": 0.08979287394572265, "grad_norm": 5.375463062721768, "learning_rate": 1.795754446356856e-05, "loss": 1.2478, "step": 1565 }, { "epoch": 0.09007975213724252, "grad_norm": 5.730627344476799, "learning_rate": 1.8014916810097533e-05, "loss": 1.2111, "step": 1570 }, { "epoch": 0.09036663032876241, "grad_norm": 6.042393013402504, "learning_rate": 1.807228915662651e-05, "loss": 1.1699, "step": 1575 }, { "epoch": 0.09065350852028228, "grad_norm": 5.38806300599388, "learning_rate": 1.812966150315548e-05, "loss": 1.1767, "step": 1580 }, { "epoch": 0.09094038671180217, "grad_norm": 6.238106891150774, "learning_rate": 1.8187033849684456e-05, "loss": 1.2093, "step": 1585 }, { "epoch": 0.09122726490332204, "grad_norm": 5.61390892690777, "learning_rate": 1.8244406196213428e-05, "loss": 1.2353, "step": 1590 }, { "epoch": 0.09151414309484193, "grad_norm": 5.767511466161737, "learning_rate": 1.83017785427424e-05, "loss": 1.2365, "step": 1595 }, { "epoch": 0.0918010212863618, "grad_norm": 5.2873559123070155, "learning_rate": 1.8359150889271372e-05, "loss": 1.3039, "step": 1600 }, { "epoch": 0.0920878994778817, "grad_norm": 5.257927269322271, "learning_rate": 1.8416523235800344e-05, "loss": 1.27, "step": 1605 }, { "epoch": 0.09237477766940157, "grad_norm": 5.48089360579949, "learning_rate": 1.847389558232932e-05, "loss": 1.2851, "step": 1610 }, { "epoch": 0.09266165586092145, "grad_norm": 5.051049677807695, "learning_rate": 1.853126792885829e-05, "loss": 1.2937, "step": 1615 }, { "epoch": 0.09294853405244133, "grad_norm": 5.918993397048792, "learning_rate": 1.8588640275387263e-05, "loss": 1.2511, "step": 1620 }, { "epoch": 0.09323541224396122, "grad_norm": 6.093361661851196, "learning_rate": 1.864601262191624e-05, "loss": 1.2887, "step": 1625 }, { "epoch": 0.09352229043548109, "grad_norm": 4.909992854112667, "learning_rate": 1.870338496844521e-05, "loss": 1.2655, "step": 1630 }, { "epoch": 0.09380916862700098, "grad_norm": 5.102241417796393, "learning_rate": 1.8760757314974186e-05, "loss": 1.2072, "step": 1635 }, { "epoch": 0.09409604681852085, "grad_norm": 5.373007289181058, "learning_rate": 1.8818129661503158e-05, "loss": 1.2212, "step": 1640 }, { "epoch": 0.09438292501004074, "grad_norm": 5.960788758054094, "learning_rate": 1.887550200803213e-05, "loss": 1.3007, "step": 1645 }, { "epoch": 0.09466980320156061, "grad_norm": 5.854610634001193, "learning_rate": 1.8932874354561102e-05, "loss": 1.1677, "step": 1650 }, { "epoch": 0.0949566813930805, "grad_norm": 5.855788091255976, "learning_rate": 1.8990246701090078e-05, "loss": 1.2867, "step": 1655 }, { "epoch": 0.09524355958460037, "grad_norm": 4.913693809644809, "learning_rate": 1.904761904761905e-05, "loss": 1.2137, "step": 1660 }, { "epoch": 0.09553043777612026, "grad_norm": 5.302316014179101, "learning_rate": 1.910499139414802e-05, "loss": 1.2499, "step": 1665 }, { "epoch": 0.09581731596764013, "grad_norm": 5.22471811112386, "learning_rate": 1.9162363740676993e-05, "loss": 1.2715, "step": 1670 }, { "epoch": 0.09610419415916002, "grad_norm": 6.093671881012434, "learning_rate": 1.921973608720597e-05, "loss": 1.2553, "step": 1675 }, { "epoch": 0.0963910723506799, "grad_norm": 4.973623394819131, "learning_rate": 1.927710843373494e-05, "loss": 1.3017, "step": 1680 }, { "epoch": 0.09667795054219978, "grad_norm": 5.778627121082467, "learning_rate": 1.9334480780263913e-05, "loss": 1.2567, "step": 1685 }, { "epoch": 0.09696482873371966, "grad_norm": 5.380010042732207, "learning_rate": 1.9391853126792888e-05, "loss": 1.2882, "step": 1690 }, { "epoch": 0.09725170692523954, "grad_norm": 5.883352345109206, "learning_rate": 1.944922547332186e-05, "loss": 1.3161, "step": 1695 }, { "epoch": 0.09753858511675942, "grad_norm": 5.587749127394265, "learning_rate": 1.9506597819850836e-05, "loss": 1.2754, "step": 1700 }, { "epoch": 0.0978254633082793, "grad_norm": 6.05754945151573, "learning_rate": 1.9563970166379808e-05, "loss": 1.243, "step": 1705 }, { "epoch": 0.09811234149979918, "grad_norm": 6.018392286179883, "learning_rate": 1.962134251290878e-05, "loss": 1.2657, "step": 1710 }, { "epoch": 0.09839921969131907, "grad_norm": 9.139326928945549, "learning_rate": 1.967871485943775e-05, "loss": 1.3318, "step": 1715 }, { "epoch": 0.09868609788283894, "grad_norm": 5.972210447166518, "learning_rate": 1.9736087205966724e-05, "loss": 1.3406, "step": 1720 }, { "epoch": 0.09897297607435883, "grad_norm": 6.055783892124681, "learning_rate": 1.97934595524957e-05, "loss": 1.3404, "step": 1725 }, { "epoch": 0.0992598542658787, "grad_norm": 13.003733042486196, "learning_rate": 1.985083189902467e-05, "loss": 1.2561, "step": 1730 }, { "epoch": 0.09954673245739859, "grad_norm": 72.98978905898083, "learning_rate": 1.9908204245553643e-05, "loss": 1.2706, "step": 1735 }, { "epoch": 0.09983361064891846, "grad_norm": 5.41385507169968, "learning_rate": 1.996557659208262e-05, "loss": 1.2691, "step": 1740 }, { "epoch": 0.10012048884043835, "grad_norm": 5.716126166625373, "learning_rate": 1.999999919775815e-05, "loss": 1.2856, "step": 1745 }, { "epoch": 0.10040736703195823, "grad_norm": 5.544710132288757, "learning_rate": 1.9999990172538813e-05, "loss": 1.2483, "step": 1750 }, { "epoch": 0.10069424522347811, "grad_norm": 6.0777827530830155, "learning_rate": 1.999997111930691e-05, "loss": 1.2402, "step": 1755 }, { "epoch": 0.10098112341499799, "grad_norm": 5.941643332187974, "learning_rate": 1.999994203808154e-05, "loss": 1.3477, "step": 1760 }, { "epoch": 0.10126800160651787, "grad_norm": 6.149656691952028, "learning_rate": 1.9999902928891875e-05, "loss": 1.3171, "step": 1765 }, { "epoch": 0.10155487979803775, "grad_norm": 6.421977441034005, "learning_rate": 1.9999853791777125e-05, "loss": 1.2793, "step": 1770 }, { "epoch": 0.10184175798955764, "grad_norm": 5.512103938667319, "learning_rate": 1.9999794626786574e-05, "loss": 1.3635, "step": 1775 }, { "epoch": 0.10212863618107751, "grad_norm": 6.04729828437378, "learning_rate": 1.9999725433979546e-05, "loss": 1.253, "step": 1780 }, { "epoch": 0.1024155143725974, "grad_norm": 5.387551011863025, "learning_rate": 1.9999646213425428e-05, "loss": 1.2918, "step": 1785 }, { "epoch": 0.10270239256411727, "grad_norm": 8.510782213645184, "learning_rate": 1.9999556965203663e-05, "loss": 1.2438, "step": 1790 }, { "epoch": 0.10298927075563716, "grad_norm": 5.494983587129584, "learning_rate": 1.9999457689403754e-05, "loss": 1.1479, "step": 1795 }, { "epoch": 0.10327614894715703, "grad_norm": 5.6028769785309445, "learning_rate": 1.999934838612525e-05, "loss": 1.246, "step": 1800 }, { "epoch": 0.10356302713867692, "grad_norm": 5.5976906705049165, "learning_rate": 1.9999229055477764e-05, "loss": 1.2151, "step": 1805 }, { "epoch": 0.10384990533019679, "grad_norm": 5.7942016047718345, "learning_rate": 1.9999099697580953e-05, "loss": 1.2625, "step": 1810 }, { "epoch": 0.10413678352171668, "grad_norm": 7.66603896532262, "learning_rate": 1.999896031256455e-05, "loss": 1.2328, "step": 1815 }, { "epoch": 0.10442366171323655, "grad_norm": 6.365492384416032, "learning_rate": 1.9998810900568323e-05, "loss": 1.3206, "step": 1820 }, { "epoch": 0.10471053990475644, "grad_norm": 5.372783767319592, "learning_rate": 1.99986514617421e-05, "loss": 1.2886, "step": 1825 }, { "epoch": 0.10499741809627632, "grad_norm": 5.470458482194892, "learning_rate": 1.999848199624577e-05, "loss": 1.2999, "step": 1830 }, { "epoch": 0.1052842962877962, "grad_norm": 5.579611806848224, "learning_rate": 1.9998302504249278e-05, "loss": 1.3056, "step": 1835 }, { "epoch": 0.10557117447931608, "grad_norm": 5.812625790580043, "learning_rate": 1.9998112985932615e-05, "loss": 1.3045, "step": 1840 }, { "epoch": 0.10585805267083596, "grad_norm": 5.859953614240949, "learning_rate": 1.9997913441485826e-05, "loss": 1.276, "step": 1845 }, { "epoch": 0.10614493086235584, "grad_norm": 5.291508646069696, "learning_rate": 1.9997703871109023e-05, "loss": 1.3678, "step": 1850 }, { "epoch": 0.10643180905387573, "grad_norm": 4.931710867809496, "learning_rate": 1.999748427501236e-05, "loss": 1.3257, "step": 1855 }, { "epoch": 0.1067186872453956, "grad_norm": 7.1134834983707425, "learning_rate": 1.9997254653416042e-05, "loss": 1.2181, "step": 1860 }, { "epoch": 0.10700556543691549, "grad_norm": 5.439126729586421, "learning_rate": 1.9997015006550346e-05, "loss": 1.2951, "step": 1865 }, { "epoch": 0.10729244362843536, "grad_norm": 5.411106100852302, "learning_rate": 1.9996765334655578e-05, "loss": 1.2254, "step": 1870 }, { "epoch": 0.10757932181995525, "grad_norm": 5.573941278546742, "learning_rate": 1.9996505637982124e-05, "loss": 1.3048, "step": 1875 }, { "epoch": 0.10786620001147512, "grad_norm": 9.095954082891941, "learning_rate": 1.9996235916790392e-05, "loss": 1.2818, "step": 1880 }, { "epoch": 0.10815307820299501, "grad_norm": 5.366261583317368, "learning_rate": 1.999595617135087e-05, "loss": 1.2437, "step": 1885 }, { "epoch": 0.10843995639451488, "grad_norm": 5.464249069806292, "learning_rate": 1.999566640194409e-05, "loss": 1.2695, "step": 1890 }, { "epoch": 0.10872683458603477, "grad_norm": 7.533222534381487, "learning_rate": 1.999536660886062e-05, "loss": 1.2803, "step": 1895 }, { "epoch": 0.10901371277755464, "grad_norm": 4.818410087209907, "learning_rate": 1.9995056792401105e-05, "loss": 1.3431, "step": 1900 }, { "epoch": 0.10930059096907453, "grad_norm": 9.41795403481959, "learning_rate": 1.9994736952876225e-05, "loss": 1.3158, "step": 1905 }, { "epoch": 0.1095874691605944, "grad_norm": 5.43209918269842, "learning_rate": 1.999440709060672e-05, "loss": 1.2103, "step": 1910 }, { "epoch": 0.1098743473521143, "grad_norm": 5.517151401121562, "learning_rate": 1.9994067205923368e-05, "loss": 1.2588, "step": 1915 }, { "epoch": 0.11016122554363417, "grad_norm": 10.508556098984199, "learning_rate": 1.9993717299167014e-05, "loss": 1.2103, "step": 1920 }, { "epoch": 0.11044810373515405, "grad_norm": 7.046528905523309, "learning_rate": 1.9993357370688543e-05, "loss": 1.3359, "step": 1925 }, { "epoch": 0.11073498192667393, "grad_norm": 6.204889831317087, "learning_rate": 1.9992987420848893e-05, "loss": 1.2338, "step": 1930 }, { "epoch": 0.11102186011819382, "grad_norm": 4.928822623222508, "learning_rate": 1.999260745001905e-05, "loss": 1.3138, "step": 1935 }, { "epoch": 0.11130873830971369, "grad_norm": 5.193151052015445, "learning_rate": 1.9992217458580044e-05, "loss": 1.2329, "step": 1940 }, { "epoch": 0.11159561650123358, "grad_norm": 6.059303027131716, "learning_rate": 1.9991817446922966e-05, "loss": 1.3235, "step": 1945 }, { "epoch": 0.11188249469275345, "grad_norm": 5.530453309807942, "learning_rate": 1.9991407415448948e-05, "loss": 1.2673, "step": 1950 }, { "epoch": 0.11216937288427334, "grad_norm": 4.596323953487856, "learning_rate": 1.999098736456917e-05, "loss": 1.2359, "step": 1955 }, { "epoch": 0.11245625107579321, "grad_norm": 5.915992694751815, "learning_rate": 1.9990557294704857e-05, "loss": 1.317, "step": 1960 }, { "epoch": 0.1127431292673131, "grad_norm": 7.80619444850235, "learning_rate": 1.999011720628729e-05, "loss": 1.3193, "step": 1965 }, { "epoch": 0.11303000745883297, "grad_norm": 4.9286068310188025, "learning_rate": 1.998966709975778e-05, "loss": 1.288, "step": 1970 }, { "epoch": 0.11331688565035286, "grad_norm": 6.095338092587908, "learning_rate": 1.9989206975567708e-05, "loss": 1.2951, "step": 1975 }, { "epoch": 0.11360376384187273, "grad_norm": 8.839996452747812, "learning_rate": 1.998873683417848e-05, "loss": 1.3363, "step": 1980 }, { "epoch": 0.11389064203339262, "grad_norm": 5.143593901584496, "learning_rate": 1.9988256676061555e-05, "loss": 1.3043, "step": 1985 }, { "epoch": 0.1141775202249125, "grad_norm": 5.180088718325215, "learning_rate": 1.9987766501698437e-05, "loss": 1.292, "step": 1990 }, { "epoch": 0.11446439841643238, "grad_norm": 6.184334562368573, "learning_rate": 1.9987266311580678e-05, "loss": 1.3415, "step": 1995 }, { "epoch": 0.11475127660795226, "grad_norm": 6.062017513445739, "learning_rate": 1.9986756106209864e-05, "loss": 1.2377, "step": 2000 }, { "epoch": 0.11503815479947214, "grad_norm": 5.585352753041497, "learning_rate": 1.998623588609763e-05, "loss": 1.253, "step": 2005 }, { "epoch": 0.11532503299099202, "grad_norm": 8.48364352807319, "learning_rate": 1.998570565176566e-05, "loss": 1.3631, "step": 2010 }, { "epoch": 0.1156119111825119, "grad_norm": 6.00042224516325, "learning_rate": 1.998516540374567e-05, "loss": 1.3207, "step": 2015 }, { "epoch": 0.11589878937403178, "grad_norm": 5.202814090751597, "learning_rate": 1.9984615142579426e-05, "loss": 1.2626, "step": 2020 }, { "epoch": 0.11618566756555167, "grad_norm": 4.822876693544526, "learning_rate": 1.9984054868818724e-05, "loss": 1.272, "step": 2025 }, { "epoch": 0.11647254575707154, "grad_norm": 6.02083150521409, "learning_rate": 1.998348458302541e-05, "loss": 1.2349, "step": 2030 }, { "epoch": 0.11675942394859143, "grad_norm": 5.954725041294553, "learning_rate": 1.9982904285771368e-05, "loss": 1.3002, "step": 2035 }, { "epoch": 0.1170463021401113, "grad_norm": 5.53548839428847, "learning_rate": 1.998231397763853e-05, "loss": 1.305, "step": 2040 }, { "epoch": 0.11733318033163119, "grad_norm": 10.58523173031085, "learning_rate": 1.9981713659218846e-05, "loss": 1.1902, "step": 2045 }, { "epoch": 0.11762005852315106, "grad_norm": 5.600999480723141, "learning_rate": 1.9981103331114323e-05, "loss": 1.2968, "step": 2050 }, { "epoch": 0.11790693671467095, "grad_norm": 5.408598766992346, "learning_rate": 1.9980482993936995e-05, "loss": 1.4011, "step": 2055 }, { "epoch": 0.11819381490619082, "grad_norm": 5.529266105186685, "learning_rate": 1.9979852648308945e-05, "loss": 1.3031, "step": 2060 }, { "epoch": 0.11848069309771071, "grad_norm": 5.1026667628897675, "learning_rate": 1.997921229486228e-05, "loss": 1.2735, "step": 2065 }, { "epoch": 0.11876757128923059, "grad_norm": 5.228356429298885, "learning_rate": 1.997856193423915e-05, "loss": 1.2464, "step": 2070 }, { "epoch": 0.11905444948075047, "grad_norm": 5.513956765399148, "learning_rate": 1.997790156709173e-05, "loss": 1.2801, "step": 2075 }, { "epoch": 0.11934132767227035, "grad_norm": 6.018623197988776, "learning_rate": 1.997723119408225e-05, "loss": 1.3733, "step": 2080 }, { "epoch": 0.11962820586379024, "grad_norm": 6.822774275054917, "learning_rate": 1.9976550815882953e-05, "loss": 1.2294, "step": 2085 }, { "epoch": 0.11991508405531011, "grad_norm": 5.444114189975097, "learning_rate": 1.997586043317613e-05, "loss": 1.2364, "step": 2090 }, { "epoch": 0.12020196224683, "grad_norm": 5.370357653910981, "learning_rate": 1.9975160046654092e-05, "loss": 1.2406, "step": 2095 }, { "epoch": 0.12048884043834987, "grad_norm": 5.381387637155637, "learning_rate": 1.9974449657019192e-05, "loss": 1.2444, "step": 2100 }, { "epoch": 0.12077571862986976, "grad_norm": 5.293242508925208, "learning_rate": 1.997372926498381e-05, "loss": 1.2309, "step": 2105 }, { "epoch": 0.12106259682138963, "grad_norm": 7.901636820794866, "learning_rate": 1.9972998871270356e-05, "loss": 1.316, "step": 2110 }, { "epoch": 0.12134947501290952, "grad_norm": 5.1460831224457655, "learning_rate": 1.997225847661127e-05, "loss": 1.2469, "step": 2115 }, { "epoch": 0.12163635320442939, "grad_norm": 5.13996491161827, "learning_rate": 1.9971508081749022e-05, "loss": 1.3522, "step": 2120 }, { "epoch": 0.12192323139594928, "grad_norm": 5.03056592661486, "learning_rate": 1.997074768743611e-05, "loss": 1.2899, "step": 2125 }, { "epoch": 0.12221010958746915, "grad_norm": 6.060127644174636, "learning_rate": 1.996997729443506e-05, "loss": 1.3084, "step": 2130 }, { "epoch": 0.12249698777898904, "grad_norm": 5.79261433481041, "learning_rate": 1.996919690351842e-05, "loss": 1.3091, "step": 2135 }, { "epoch": 0.12278386597050892, "grad_norm": 4.630999108667004, "learning_rate": 1.996840651546877e-05, "loss": 1.2316, "step": 2140 }, { "epoch": 0.1230707441620288, "grad_norm": 5.1291781922647, "learning_rate": 1.996760613107872e-05, "loss": 1.2377, "step": 2145 }, { "epoch": 0.12335762235354868, "grad_norm": 7.049711472192149, "learning_rate": 1.9966795751150884e-05, "loss": 1.3544, "step": 2150 }, { "epoch": 0.12364450054506856, "grad_norm": 5.221199288748738, "learning_rate": 1.996597537649792e-05, "loss": 1.2851, "step": 2155 }, { "epoch": 0.12393137873658844, "grad_norm": 4.7099195433171195, "learning_rate": 1.9965145007942502e-05, "loss": 1.2174, "step": 2160 }, { "epoch": 0.12421825692810833, "grad_norm": 6.57924333005282, "learning_rate": 1.9964304646317325e-05, "loss": 1.3017, "step": 2165 }, { "epoch": 0.1245051351196282, "grad_norm": 4.999480566825913, "learning_rate": 1.9963454292465102e-05, "loss": 1.3311, "step": 2170 }, { "epoch": 0.12479201331114809, "grad_norm": 4.9042422709056535, "learning_rate": 1.9962593947238576e-05, "loss": 1.3238, "step": 2175 }, { "epoch": 0.12507889150266796, "grad_norm": 5.317262801392513, "learning_rate": 1.99617236115005e-05, "loss": 1.2919, "step": 2180 }, { "epoch": 0.12536576969418783, "grad_norm": 4.616116186055363, "learning_rate": 1.9960843286123648e-05, "loss": 1.2356, "step": 2185 }, { "epoch": 0.12565264788570774, "grad_norm": 4.996410686594699, "learning_rate": 1.9959952971990813e-05, "loss": 1.3524, "step": 2190 }, { "epoch": 0.1259395260772276, "grad_norm": 4.89064601217377, "learning_rate": 1.99590526699948e-05, "loss": 1.3365, "step": 2195 }, { "epoch": 0.12622640426874748, "grad_norm": 5.044399020446158, "learning_rate": 1.995814238103844e-05, "loss": 1.2454, "step": 2200 }, { "epoch": 0.12651328246026736, "grad_norm": 5.85877199438017, "learning_rate": 1.9957222106034572e-05, "loss": 1.3137, "step": 2205 }, { "epoch": 0.12680016065178726, "grad_norm": 4.416580316844231, "learning_rate": 1.9956291845906047e-05, "loss": 1.2766, "step": 2210 }, { "epoch": 0.12708703884330713, "grad_norm": 5.562248961104113, "learning_rate": 1.995535160158573e-05, "loss": 1.3279, "step": 2215 }, { "epoch": 0.127373917034827, "grad_norm": 4.791192685057644, "learning_rate": 1.995440137401651e-05, "loss": 1.2202, "step": 2220 }, { "epoch": 0.12766079522634688, "grad_norm": 4.606453779969444, "learning_rate": 1.9953441164151265e-05, "loss": 1.334, "step": 2225 }, { "epoch": 0.12794767341786678, "grad_norm": 4.966823642312285, "learning_rate": 1.9952470972952903e-05, "loss": 1.242, "step": 2230 }, { "epoch": 0.12823455160938665, "grad_norm": 5.9875833908363685, "learning_rate": 1.995149080139433e-05, "loss": 1.3279, "step": 2235 }, { "epoch": 0.12852142980090653, "grad_norm": 5.068265131774375, "learning_rate": 1.9950500650458472e-05, "loss": 1.2968, "step": 2240 }, { "epoch": 0.12880830799242643, "grad_norm": 5.028263103275457, "learning_rate": 1.9949500521138243e-05, "loss": 1.2209, "step": 2245 }, { "epoch": 0.1290951861839463, "grad_norm": 38.82885986972658, "learning_rate": 1.9948490414436582e-05, "loss": 1.3952, "step": 2250 }, { "epoch": 0.12938206437546618, "grad_norm": 9.93461989914911, "learning_rate": 1.9947470331366427e-05, "loss": 1.2712, "step": 2255 }, { "epoch": 0.12966894256698605, "grad_norm": 5.690120274287126, "learning_rate": 1.9946440272950718e-05, "loss": 1.2238, "step": 2260 }, { "epoch": 0.12995582075850595, "grad_norm": 5.588843812157996, "learning_rate": 1.9945400240222397e-05, "loss": 1.438, "step": 2265 }, { "epoch": 0.13024269895002583, "grad_norm": 5.184593098209396, "learning_rate": 1.9944350234224415e-05, "loss": 1.262, "step": 2270 }, { "epoch": 0.1305295771415457, "grad_norm": 4.292967335723752, "learning_rate": 1.994329025600972e-05, "loss": 1.278, "step": 2275 }, { "epoch": 0.13081645533306557, "grad_norm": 5.672412635122899, "learning_rate": 1.9942220306641256e-05, "loss": 1.391, "step": 2280 }, { "epoch": 0.13110333352458547, "grad_norm": 4.971657938544365, "learning_rate": 1.9941140387191978e-05, "loss": 1.2742, "step": 2285 }, { "epoch": 0.13139021171610535, "grad_norm": 8.276010463462544, "learning_rate": 1.994005049874483e-05, "loss": 1.3107, "step": 2290 }, { "epoch": 0.13167708990762522, "grad_norm": 5.22886296139896, "learning_rate": 1.993895064239275e-05, "loss": 1.1941, "step": 2295 }, { "epoch": 0.1319639680991451, "grad_norm": 5.385060022218811, "learning_rate": 1.9937840819238676e-05, "loss": 1.3337, "step": 2300 }, { "epoch": 0.132250846290665, "grad_norm": 5.499843606089984, "learning_rate": 1.9936721030395546e-05, "loss": 1.3267, "step": 2305 }, { "epoch": 0.13253772448218487, "grad_norm": 6.992826190687404, "learning_rate": 1.9935591276986287e-05, "loss": 1.3741, "step": 2310 }, { "epoch": 0.13282460267370474, "grad_norm": 5.381974405870367, "learning_rate": 1.9934451560143816e-05, "loss": 1.3723, "step": 2315 }, { "epoch": 0.13311148086522462, "grad_norm": 5.586081829349455, "learning_rate": 1.993330188101104e-05, "loss": 1.2634, "step": 2320 }, { "epoch": 0.13339835905674452, "grad_norm": 9.194346492424183, "learning_rate": 1.9932142240740865e-05, "loss": 1.3005, "step": 2325 }, { "epoch": 0.1336852372482644, "grad_norm": 8.010558400925598, "learning_rate": 1.993097264049618e-05, "loss": 1.288, "step": 2330 }, { "epoch": 0.13397211543978427, "grad_norm": 4.855224618004592, "learning_rate": 1.9929793081449863e-05, "loss": 1.2951, "step": 2335 }, { "epoch": 0.13425899363130414, "grad_norm": 4.858373628039437, "learning_rate": 1.9928603564784773e-05, "loss": 1.3923, "step": 2340 }, { "epoch": 0.13454587182282404, "grad_norm": 5.877981814606828, "learning_rate": 1.992740409169377e-05, "loss": 1.3465, "step": 2345 }, { "epoch": 0.13483275001434392, "grad_norm": 6.020783973887619, "learning_rate": 1.992619466337968e-05, "loss": 1.297, "step": 2350 }, { "epoch": 0.1351196282058638, "grad_norm": 5.992645837506376, "learning_rate": 1.9924975281055325e-05, "loss": 1.2264, "step": 2355 }, { "epoch": 0.13540650639738366, "grad_norm": 5.062069531992148, "learning_rate": 1.9923745945943502e-05, "loss": 1.2366, "step": 2360 }, { "epoch": 0.13569338458890357, "grad_norm": 5.161333348323579, "learning_rate": 1.992250665927699e-05, "loss": 1.2972, "step": 2365 }, { "epoch": 0.13598026278042344, "grad_norm": 5.786285818122213, "learning_rate": 1.9921257422298553e-05, "loss": 1.2711, "step": 2370 }, { "epoch": 0.1362671409719433, "grad_norm": 4.746742507445327, "learning_rate": 1.991999823626092e-05, "loss": 1.3626, "step": 2375 }, { "epoch": 0.1365540191634632, "grad_norm": 5.441186419417357, "learning_rate": 1.9918729102426818e-05, "loss": 1.334, "step": 2380 }, { "epoch": 0.1368408973549831, "grad_norm": 4.30947724513665, "learning_rate": 1.991745002206893e-05, "loss": 1.2316, "step": 2385 }, { "epoch": 0.13712777554650296, "grad_norm": 5.450095819653387, "learning_rate": 1.9916160996469916e-05, "loss": 1.3249, "step": 2390 }, { "epoch": 0.13741465373802284, "grad_norm": 4.691873935386862, "learning_rate": 1.991486202692242e-05, "loss": 1.3398, "step": 2395 }, { "epoch": 0.1377015319295427, "grad_norm": 4.873355828893619, "learning_rate": 1.9913553114729054e-05, "loss": 1.2423, "step": 2400 }, { "epoch": 0.1379884101210626, "grad_norm": 5.220622336471057, "learning_rate": 1.991223426120239e-05, "loss": 1.3491, "step": 2405 }, { "epoch": 0.13827528831258248, "grad_norm": 5.00781391020583, "learning_rate": 1.9910905467664986e-05, "loss": 1.3142, "step": 2410 }, { "epoch": 0.13856216650410236, "grad_norm": 5.002616596027752, "learning_rate": 1.9909566735449354e-05, "loss": 1.3711, "step": 2415 }, { "epoch": 0.13884904469562223, "grad_norm": 5.165607072074728, "learning_rate": 1.990821806589798e-05, "loss": 1.3098, "step": 2420 }, { "epoch": 0.13913592288714213, "grad_norm": 5.229685213863043, "learning_rate": 1.9906859460363307e-05, "loss": 1.2267, "step": 2425 }, { "epoch": 0.139422801078662, "grad_norm": 4.599230901038231, "learning_rate": 1.9905490920207758e-05, "loss": 1.2987, "step": 2430 }, { "epoch": 0.13970967927018188, "grad_norm": 4.518474115917966, "learning_rate": 1.9904112446803697e-05, "loss": 1.2904, "step": 2435 }, { "epoch": 0.13999655746170175, "grad_norm": 7.983732936317085, "learning_rate": 1.990272404153347e-05, "loss": 1.4243, "step": 2440 }, { "epoch": 0.14028343565322166, "grad_norm": 5.787793716387152, "learning_rate": 1.9901325705789367e-05, "loss": 1.3046, "step": 2445 }, { "epoch": 0.14057031384474153, "grad_norm": 4.648000511325244, "learning_rate": 1.9899917440973642e-05, "loss": 1.3705, "step": 2450 }, { "epoch": 0.1408571920362614, "grad_norm": 4.9838977085073735, "learning_rate": 1.989849924849851e-05, "loss": 1.345, "step": 2455 }, { "epoch": 0.14114407022778128, "grad_norm": 5.32843420537399, "learning_rate": 1.9897071129786136e-05, "loss": 1.3624, "step": 2460 }, { "epoch": 0.14143094841930118, "grad_norm": 5.516203688880067, "learning_rate": 1.989563308626864e-05, "loss": 1.3427, "step": 2465 }, { "epoch": 0.14171782661082105, "grad_norm": 4.708718798919535, "learning_rate": 1.989418511938809e-05, "loss": 1.3134, "step": 2470 }, { "epoch": 0.14200470480234093, "grad_norm": 6.616814041075748, "learning_rate": 1.989272723059652e-05, "loss": 1.3778, "step": 2475 }, { "epoch": 0.1422915829938608, "grad_norm": 6.405381674799359, "learning_rate": 1.9891259421355898e-05, "loss": 1.3104, "step": 2480 }, { "epoch": 0.1425784611853807, "grad_norm": 4.736414551543994, "learning_rate": 1.9889781693138152e-05, "loss": 1.3274, "step": 2485 }, { "epoch": 0.14286533937690057, "grad_norm": 5.622626193331212, "learning_rate": 1.9888294047425142e-05, "loss": 1.3592, "step": 2490 }, { "epoch": 0.14315221756842045, "grad_norm": 4.817859183982059, "learning_rate": 1.9886796485708692e-05, "loss": 1.2315, "step": 2495 }, { "epoch": 0.14343909575994032, "grad_norm": 5.504514506777474, "learning_rate": 1.9885289009490554e-05, "loss": 1.3372, "step": 2500 }, { "epoch": 0.14372597395146022, "grad_norm": 4.424363047129484, "learning_rate": 1.9883771620282432e-05, "loss": 1.2614, "step": 2505 }, { "epoch": 0.1440128521429801, "grad_norm": 5.173245427316084, "learning_rate": 1.988224431960597e-05, "loss": 1.3433, "step": 2510 }, { "epoch": 0.14429973033449997, "grad_norm": 4.98565669295182, "learning_rate": 1.9880707108992737e-05, "loss": 1.4609, "step": 2515 }, { "epoch": 0.14458660852601984, "grad_norm": 6.491951478135267, "learning_rate": 1.9879159989984264e-05, "loss": 1.3005, "step": 2520 }, { "epoch": 0.14487348671753975, "grad_norm": 5.004825028711236, "learning_rate": 1.9877602964131996e-05, "loss": 1.2415, "step": 2525 }, { "epoch": 0.14516036490905962, "grad_norm": 5.788575861155241, "learning_rate": 1.987603603299733e-05, "loss": 1.3298, "step": 2530 }, { "epoch": 0.1454472431005795, "grad_norm": 5.8456707698520365, "learning_rate": 1.9874459198151583e-05, "loss": 1.3323, "step": 2535 }, { "epoch": 0.14573412129209937, "grad_norm": 4.635634054078906, "learning_rate": 1.987287246117601e-05, "loss": 1.3491, "step": 2540 }, { "epoch": 0.14602099948361927, "grad_norm": 5.372393862856524, "learning_rate": 1.9871275823661796e-05, "loss": 1.3327, "step": 2545 }, { "epoch": 0.14630787767513914, "grad_norm": 4.518674873286287, "learning_rate": 1.9869669287210044e-05, "loss": 1.3384, "step": 2550 }, { "epoch": 0.14659475586665902, "grad_norm": 5.717241075195987, "learning_rate": 1.9868052853431806e-05, "loss": 1.2948, "step": 2555 }, { "epoch": 0.1468816340581789, "grad_norm": 4.988819559553401, "learning_rate": 1.9866426523948036e-05, "loss": 1.3254, "step": 2560 }, { "epoch": 0.1471685122496988, "grad_norm": 7.448076837789291, "learning_rate": 1.9864790300389626e-05, "loss": 1.3852, "step": 2565 }, { "epoch": 0.14745539044121866, "grad_norm": 4.634084752774205, "learning_rate": 1.986314418439738e-05, "loss": 1.2902, "step": 2570 }, { "epoch": 0.14774226863273854, "grad_norm": 5.825078538213663, "learning_rate": 1.986148817762203e-05, "loss": 1.3176, "step": 2575 }, { "epoch": 0.1480291468242584, "grad_norm": 5.4969780363320115, "learning_rate": 1.9859822281724218e-05, "loss": 1.3915, "step": 2580 }, { "epoch": 0.1483160250157783, "grad_norm": 5.09530010972399, "learning_rate": 1.985814649837452e-05, "loss": 1.342, "step": 2585 }, { "epoch": 0.1486029032072982, "grad_norm": 4.743324798695775, "learning_rate": 1.98564608292534e-05, "loss": 1.2838, "step": 2590 }, { "epoch": 0.14888978139881806, "grad_norm": 4.903091692260643, "learning_rate": 1.9854765276051266e-05, "loss": 1.2516, "step": 2595 }, { "epoch": 0.14917665959033793, "grad_norm": 4.592392338650179, "learning_rate": 1.985305984046841e-05, "loss": 1.3058, "step": 2600 }, { "epoch": 0.14946353778185784, "grad_norm": 5.481958891304398, "learning_rate": 1.985134452421505e-05, "loss": 1.3883, "step": 2605 }, { "epoch": 0.1497504159733777, "grad_norm": 6.273935933415567, "learning_rate": 1.9849619329011316e-05, "loss": 1.3363, "step": 2610 }, { "epoch": 0.15003729416489758, "grad_norm": 5.760477938886427, "learning_rate": 1.984788425658723e-05, "loss": 1.425, "step": 2615 }, { "epoch": 0.15032417235641746, "grad_norm": 5.906968828811207, "learning_rate": 1.984613930868273e-05, "loss": 1.3671, "step": 2620 }, { "epoch": 0.15061105054793736, "grad_norm": 4.506140333112699, "learning_rate": 1.984438448704765e-05, "loss": 1.2737, "step": 2625 }, { "epoch": 0.15089792873945723, "grad_norm": 4.641644654663601, "learning_rate": 1.9842619793441732e-05, "loss": 1.2891, "step": 2630 }, { "epoch": 0.1511848069309771, "grad_norm": 4.704845543865726, "learning_rate": 1.9840845229634614e-05, "loss": 1.2949, "step": 2635 }, { "epoch": 0.15147168512249698, "grad_norm": 5.712137174681385, "learning_rate": 1.9839060797405834e-05, "loss": 1.3662, "step": 2640 }, { "epoch": 0.15175856331401688, "grad_norm": 5.651098525938301, "learning_rate": 1.9837266498544823e-05, "loss": 1.2461, "step": 2645 }, { "epoch": 0.15204544150553675, "grad_norm": 5.056198837783476, "learning_rate": 1.983546233485091e-05, "loss": 1.3689, "step": 2650 }, { "epoch": 0.15233231969705663, "grad_norm": 4.429363693893526, "learning_rate": 1.983364830813331e-05, "loss": 1.3831, "step": 2655 }, { "epoch": 0.1526191978885765, "grad_norm": 4.770752974313884, "learning_rate": 1.9831824420211135e-05, "loss": 1.3393, "step": 2660 }, { "epoch": 0.1529060760800964, "grad_norm": 4.928048895356641, "learning_rate": 1.982999067291339e-05, "loss": 1.3411, "step": 2665 }, { "epoch": 0.15319295427161628, "grad_norm": 5.028498017383584, "learning_rate": 1.982814706807895e-05, "loss": 1.3067, "step": 2670 }, { "epoch": 0.15347983246313615, "grad_norm": 4.762404063375597, "learning_rate": 1.9826293607556595e-05, "loss": 1.2383, "step": 2675 }, { "epoch": 0.15376671065465602, "grad_norm": 4.9015556462134695, "learning_rate": 1.9824430293204976e-05, "loss": 1.2416, "step": 2680 }, { "epoch": 0.15405358884617593, "grad_norm": 4.286304621358924, "learning_rate": 1.9822557126892628e-05, "loss": 1.3301, "step": 2685 }, { "epoch": 0.1543404670376958, "grad_norm": 5.028619490403861, "learning_rate": 1.9820674110497966e-05, "loss": 1.2733, "step": 2690 }, { "epoch": 0.15462734522921567, "grad_norm": 4.218887233844497, "learning_rate": 1.9818781245909286e-05, "loss": 1.3179, "step": 2695 }, { "epoch": 0.15491422342073555, "grad_norm": 4.838672458394469, "learning_rate": 1.9816878535024756e-05, "loss": 1.3429, "step": 2700 }, { "epoch": 0.15520110161225545, "grad_norm": 4.713647622891565, "learning_rate": 1.9814965979752416e-05, "loss": 1.3588, "step": 2705 }, { "epoch": 0.15548797980377532, "grad_norm": 4.668359260545721, "learning_rate": 1.981304358201018e-05, "loss": 1.3828, "step": 2710 }, { "epoch": 0.1557748579952952, "grad_norm": 5.144652671911242, "learning_rate": 1.9811111343725842e-05, "loss": 1.2683, "step": 2715 }, { "epoch": 0.15606173618681507, "grad_norm": 4.835734187606194, "learning_rate": 1.9809169266837044e-05, "loss": 1.2751, "step": 2720 }, { "epoch": 0.15634861437833497, "grad_norm": 5.633582148681992, "learning_rate": 1.9807217353291312e-05, "loss": 1.3193, "step": 2725 }, { "epoch": 0.15663549256985485, "grad_norm": 4.722273836611023, "learning_rate": 1.980525560504602e-05, "loss": 1.2572, "step": 2730 }, { "epoch": 0.15692237076137472, "grad_norm": 4.6948411777040215, "learning_rate": 1.9803284024068428e-05, "loss": 1.3492, "step": 2735 }, { "epoch": 0.1572092489528946, "grad_norm": 4.014926651874665, "learning_rate": 1.980130261233563e-05, "loss": 1.2927, "step": 2740 }, { "epoch": 0.1574961271444145, "grad_norm": 5.185549660183582, "learning_rate": 1.9799311371834594e-05, "loss": 1.3146, "step": 2745 }, { "epoch": 0.15778300533593437, "grad_norm": 5.062997697658575, "learning_rate": 1.9797310304562143e-05, "loss": 1.2887, "step": 2750 }, { "epoch": 0.15806988352745424, "grad_norm": 4.431801030875766, "learning_rate": 1.9795299412524948e-05, "loss": 1.3785, "step": 2755 }, { "epoch": 0.15835676171897412, "grad_norm": 4.866764660096674, "learning_rate": 1.9793278697739537e-05, "loss": 1.306, "step": 2760 }, { "epoch": 0.15864363991049402, "grad_norm": 5.0283330495361875, "learning_rate": 1.9791248162232287e-05, "loss": 1.3046, "step": 2765 }, { "epoch": 0.1589305181020139, "grad_norm": 4.705095337144273, "learning_rate": 1.9789207808039426e-05, "loss": 1.2905, "step": 2770 }, { "epoch": 0.15921739629353376, "grad_norm": 5.8112392333574805, "learning_rate": 1.978715763720702e-05, "loss": 1.3799, "step": 2775 }, { "epoch": 0.15950427448505364, "grad_norm": 4.356514792727087, "learning_rate": 1.9785097651790992e-05, "loss": 1.2929, "step": 2780 }, { "epoch": 0.15979115267657354, "grad_norm": 4.563713299657952, "learning_rate": 1.9783027853857096e-05, "loss": 1.4164, "step": 2785 }, { "epoch": 0.1600780308680934, "grad_norm": 4.417764446899442, "learning_rate": 1.9780948245480933e-05, "loss": 1.35, "step": 2790 }, { "epoch": 0.1603649090596133, "grad_norm": 6.3027058100642055, "learning_rate": 1.9778858828747935e-05, "loss": 1.3184, "step": 2795 }, { "epoch": 0.16065178725113316, "grad_norm": 4.451223893270837, "learning_rate": 1.9776759605753377e-05, "loss": 1.407, "step": 2800 }, { "epoch": 0.16093866544265306, "grad_norm": 4.571710728376636, "learning_rate": 1.977465057860236e-05, "loss": 1.3268, "step": 2805 }, { "epoch": 0.16122554363417294, "grad_norm": 4.477552590492786, "learning_rate": 1.9772531749409827e-05, "loss": 1.2875, "step": 2810 }, { "epoch": 0.1615124218256928, "grad_norm": 4.813017708042702, "learning_rate": 1.977040312030054e-05, "loss": 1.3184, "step": 2815 }, { "epoch": 0.16179930001721268, "grad_norm": 4.225319546556177, "learning_rate": 1.97682646934091e-05, "loss": 1.3135, "step": 2820 }, { "epoch": 0.16208617820873258, "grad_norm": 4.793367842993655, "learning_rate": 1.9766116470879914e-05, "loss": 1.329, "step": 2825 }, { "epoch": 0.16237305640025246, "grad_norm": 6.468436890534012, "learning_rate": 1.9763958454867237e-05, "loss": 1.3439, "step": 2830 }, { "epoch": 0.16265993459177233, "grad_norm": 5.164197186332296, "learning_rate": 1.9761790647535127e-05, "loss": 1.3216, "step": 2835 }, { "epoch": 0.1629468127832922, "grad_norm": 4.896474077270743, "learning_rate": 1.9759613051057464e-05, "loss": 1.296, "step": 2840 }, { "epoch": 0.1632336909748121, "grad_norm": 4.752163041756587, "learning_rate": 1.9757425667617945e-05, "loss": 1.263, "step": 2845 }, { "epoch": 0.16352056916633198, "grad_norm": 5.4745303889997325, "learning_rate": 1.9755228499410094e-05, "loss": 1.3657, "step": 2850 }, { "epoch": 0.16380744735785185, "grad_norm": 5.8995193545257045, "learning_rate": 1.9753021548637222e-05, "loss": 1.3276, "step": 2855 }, { "epoch": 0.16409432554937173, "grad_norm": 5.030509124729779, "learning_rate": 1.9750804817512475e-05, "loss": 1.3005, "step": 2860 }, { "epoch": 0.16438120374089163, "grad_norm": 5.071501088121094, "learning_rate": 1.9748578308258792e-05, "loss": 1.3353, "step": 2865 }, { "epoch": 0.1646680819324115, "grad_norm": 4.984305663644961, "learning_rate": 1.974634202310892e-05, "loss": 1.2842, "step": 2870 }, { "epoch": 0.16495496012393138, "grad_norm": 5.305838586677253, "learning_rate": 1.9744095964305414e-05, "loss": 1.3298, "step": 2875 }, { "epoch": 0.16524183831545125, "grad_norm": 4.8496343371766, "learning_rate": 1.9741840134100624e-05, "loss": 1.2943, "step": 2880 }, { "epoch": 0.16552871650697115, "grad_norm": 5.260695187506133, "learning_rate": 1.9739574534756703e-05, "loss": 1.3179, "step": 2885 }, { "epoch": 0.16581559469849103, "grad_norm": 4.479426117288476, "learning_rate": 1.97372991685456e-05, "loss": 1.3367, "step": 2890 }, { "epoch": 0.1661024728900109, "grad_norm": 4.981261954415825, "learning_rate": 1.9735014037749055e-05, "loss": 1.4397, "step": 2895 }, { "epoch": 0.16638935108153077, "grad_norm": 4.3215324825099515, "learning_rate": 1.97327191446586e-05, "loss": 1.2536, "step": 2900 }, { "epoch": 0.16667622927305067, "grad_norm": 4.7578741938315785, "learning_rate": 1.9730414491575566e-05, "loss": 1.2255, "step": 2905 }, { "epoch": 0.16696310746457055, "grad_norm": 5.901140124672792, "learning_rate": 1.9728100080811058e-05, "loss": 1.3279, "step": 2910 }, { "epoch": 0.16724998565609042, "grad_norm": 5.98158901123952, "learning_rate": 1.9725775914685975e-05, "loss": 1.2887, "step": 2915 }, { "epoch": 0.1675368638476103, "grad_norm": 5.436797571038331, "learning_rate": 1.9723441995531e-05, "loss": 1.3129, "step": 2920 }, { "epoch": 0.1678237420391302, "grad_norm": 5.388365428404706, "learning_rate": 1.9721098325686585e-05, "loss": 1.2835, "step": 2925 }, { "epoch": 0.16811062023065007, "grad_norm": 4.769193364365567, "learning_rate": 1.971874490750297e-05, "loss": 1.309, "step": 2930 }, { "epoch": 0.16839749842216994, "grad_norm": 4.1695037352343265, "learning_rate": 1.971638174334017e-05, "loss": 1.3448, "step": 2935 }, { "epoch": 0.16868437661368982, "grad_norm": 4.867639508584913, "learning_rate": 1.971400883556797e-05, "loss": 1.2708, "step": 2940 }, { "epoch": 0.16897125480520972, "grad_norm": 5.568082376131052, "learning_rate": 1.971162618656593e-05, "loss": 1.3823, "step": 2945 }, { "epoch": 0.1692581329967296, "grad_norm": 4.7469679473112905, "learning_rate": 1.970923379872337e-05, "loss": 1.3116, "step": 2950 }, { "epoch": 0.16954501118824947, "grad_norm": 5.108797826751623, "learning_rate": 1.9706831674439383e-05, "loss": 1.3951, "step": 2955 }, { "epoch": 0.16983188937976934, "grad_norm": 4.391136711318426, "learning_rate": 1.9704419816122827e-05, "loss": 1.3397, "step": 2960 }, { "epoch": 0.17011876757128924, "grad_norm": 5.213599899215434, "learning_rate": 1.9701998226192322e-05, "loss": 1.3353, "step": 2965 }, { "epoch": 0.17040564576280912, "grad_norm": 4.820725963524994, "learning_rate": 1.9699566907076234e-05, "loss": 1.284, "step": 2970 }, { "epoch": 0.170692523954329, "grad_norm": 4.403861946854548, "learning_rate": 1.9697125861212706e-05, "loss": 1.2886, "step": 2975 }, { "epoch": 0.17097940214584886, "grad_norm": 5.674542392570453, "learning_rate": 1.969467509104962e-05, "loss": 1.2998, "step": 2980 }, { "epoch": 0.17126628033736876, "grad_norm": 5.22897466687227, "learning_rate": 1.969221459904461e-05, "loss": 1.3462, "step": 2985 }, { "epoch": 0.17155315852888864, "grad_norm": 5.006455647820901, "learning_rate": 1.968974438766507e-05, "loss": 1.3756, "step": 2990 }, { "epoch": 0.1718400367204085, "grad_norm": 5.251561029576707, "learning_rate": 1.9687264459388128e-05, "loss": 1.3481, "step": 2995 }, { "epoch": 0.17212691491192839, "grad_norm": 4.564415579129805, "learning_rate": 1.9684774816700664e-05, "loss": 1.2351, "step": 3000 }, { "epoch": 0.1724137931034483, "grad_norm": 4.57711237436247, "learning_rate": 1.96822754620993e-05, "loss": 1.2961, "step": 3005 }, { "epoch": 0.17270067129496816, "grad_norm": 4.520739399851926, "learning_rate": 1.9679766398090386e-05, "loss": 1.3733, "step": 3010 }, { "epoch": 0.17298754948648803, "grad_norm": 4.3399469659638665, "learning_rate": 1.9677247627190025e-05, "loss": 1.3326, "step": 3015 }, { "epoch": 0.1732744276780079, "grad_norm": 5.494981560893319, "learning_rate": 1.9674719151924045e-05, "loss": 1.3649, "step": 3020 }, { "epoch": 0.1735613058695278, "grad_norm": 4.420830692055126, "learning_rate": 1.9672180974828003e-05, "loss": 1.4237, "step": 3025 }, { "epoch": 0.17384818406104768, "grad_norm": 5.268325844547193, "learning_rate": 1.966963309844719e-05, "loss": 1.3215, "step": 3030 }, { "epoch": 0.17413506225256756, "grad_norm": 3.8203099837423147, "learning_rate": 1.9667075525336622e-05, "loss": 1.2901, "step": 3035 }, { "epoch": 0.17442194044408743, "grad_norm": 30.129564604245623, "learning_rate": 1.9664508258061042e-05, "loss": 1.3689, "step": 3040 }, { "epoch": 0.17470881863560733, "grad_norm": 4.653362056955883, "learning_rate": 1.966193129919491e-05, "loss": 1.3032, "step": 3045 }, { "epoch": 0.1749956968271272, "grad_norm": 5.174546113321289, "learning_rate": 1.9659344651322404e-05, "loss": 1.3141, "step": 3050 }, { "epoch": 0.17528257501864708, "grad_norm": 5.307048853775514, "learning_rate": 1.9656748317037424e-05, "loss": 1.3207, "step": 3055 }, { "epoch": 0.17556945321016695, "grad_norm": 5.376066257595926, "learning_rate": 1.9654142298943576e-05, "loss": 1.3391, "step": 3060 }, { "epoch": 0.17585633140168686, "grad_norm": 6.118868051113455, "learning_rate": 1.9651526599654184e-05, "loss": 1.2458, "step": 3065 }, { "epoch": 0.17614320959320673, "grad_norm": 5.020151652995736, "learning_rate": 1.964890122179227e-05, "loss": 1.291, "step": 3070 }, { "epoch": 0.1764300877847266, "grad_norm": 5.089552308332491, "learning_rate": 1.9646266167990576e-05, "loss": 1.2884, "step": 3075 }, { "epoch": 0.17671696597624648, "grad_norm": 5.8251872563528515, "learning_rate": 1.9643621440891542e-05, "loss": 1.2907, "step": 3080 }, { "epoch": 0.17700384416776638, "grad_norm": 4.90738742980356, "learning_rate": 1.96409670431473e-05, "loss": 1.3356, "step": 3085 }, { "epoch": 0.17729072235928625, "grad_norm": 4.639667696064047, "learning_rate": 1.963830297741969e-05, "loss": 1.283, "step": 3090 }, { "epoch": 0.17757760055080613, "grad_norm": 5.203375879111512, "learning_rate": 1.9635629246380242e-05, "loss": 1.3237, "step": 3095 }, { "epoch": 0.177864478742326, "grad_norm": 5.183779296010179, "learning_rate": 1.9632945852710175e-05, "loss": 1.3279, "step": 3100 }, { "epoch": 0.1781513569338459, "grad_norm": 4.886162524479371, "learning_rate": 1.963025279910041e-05, "loss": 1.27, "step": 3105 }, { "epoch": 0.17843823512536577, "grad_norm": 5.426748537274678, "learning_rate": 1.962755008825154e-05, "loss": 1.306, "step": 3110 }, { "epoch": 0.17872511331688565, "grad_norm": 4.9196753858967055, "learning_rate": 1.9624837722873855e-05, "loss": 1.3113, "step": 3115 }, { "epoch": 0.17901199150840552, "grad_norm": 5.114174767052371, "learning_rate": 1.962211570568732e-05, "loss": 1.2708, "step": 3120 }, { "epoch": 0.17929886969992542, "grad_norm": 4.556632062504037, "learning_rate": 1.9619384039421578e-05, "loss": 1.3308, "step": 3125 }, { "epoch": 0.1795857478914453, "grad_norm": 4.697591045605062, "learning_rate": 1.961664272681595e-05, "loss": 1.2536, "step": 3130 }, { "epoch": 0.17987262608296517, "grad_norm": 4.209367068823256, "learning_rate": 1.9613891770619433e-05, "loss": 1.2756, "step": 3135 }, { "epoch": 0.18015950427448504, "grad_norm": 5.2192526610642, "learning_rate": 1.9611131173590686e-05, "loss": 1.3, "step": 3140 }, { "epoch": 0.18044638246600495, "grad_norm": 4.471660835556204, "learning_rate": 1.960836093849805e-05, "loss": 1.336, "step": 3145 }, { "epoch": 0.18073326065752482, "grad_norm": 5.152856997558774, "learning_rate": 1.960558106811952e-05, "loss": 1.3637, "step": 3150 }, { "epoch": 0.1810201388490447, "grad_norm": 4.700432605002514, "learning_rate": 1.9602791565242755e-05, "loss": 1.2068, "step": 3155 }, { "epoch": 0.18130701704056457, "grad_norm": 6.6210382316500676, "learning_rate": 1.9599992432665073e-05, "loss": 1.2788, "step": 3160 }, { "epoch": 0.18159389523208447, "grad_norm": 5.05498754825603, "learning_rate": 1.9597183673193454e-05, "loss": 1.2907, "step": 3165 }, { "epoch": 0.18188077342360434, "grad_norm": 4.537471033574283, "learning_rate": 1.9594365289644527e-05, "loss": 1.2267, "step": 3170 }, { "epoch": 0.18216765161512422, "grad_norm": 4.664614498362583, "learning_rate": 1.9591537284844575e-05, "loss": 1.3642, "step": 3175 }, { "epoch": 0.1824545298066441, "grad_norm": 4.130512318334508, "learning_rate": 1.9588699661629526e-05, "loss": 1.2552, "step": 3180 }, { "epoch": 0.182741407998164, "grad_norm": 4.559252207776005, "learning_rate": 1.958585242284495e-05, "loss": 1.3195, "step": 3185 }, { "epoch": 0.18302828618968386, "grad_norm": 4.38067467326191, "learning_rate": 1.9582995571346073e-05, "loss": 1.3423, "step": 3190 }, { "epoch": 0.18331516438120374, "grad_norm": 4.565919366610676, "learning_rate": 1.958012910999775e-05, "loss": 1.3205, "step": 3195 }, { "epoch": 0.1836020425727236, "grad_norm": 8.152042751820078, "learning_rate": 1.957725304167447e-05, "loss": 1.424, "step": 3200 }, { "epoch": 0.1838889207642435, "grad_norm": 4.72488237961958, "learning_rate": 1.9574367369260364e-05, "loss": 1.2721, "step": 3205 }, { "epoch": 0.1841757989557634, "grad_norm": 4.719012733739851, "learning_rate": 1.9571472095649194e-05, "loss": 1.3084, "step": 3210 }, { "epoch": 0.18446267714728326, "grad_norm": 4.563417510027355, "learning_rate": 1.956856722374434e-05, "loss": 1.3818, "step": 3215 }, { "epoch": 0.18474955533880313, "grad_norm": 4.667983246329887, "learning_rate": 1.956565275645882e-05, "loss": 1.3272, "step": 3220 }, { "epoch": 0.18503643353032304, "grad_norm": 4.520534052965826, "learning_rate": 1.9562728696715263e-05, "loss": 1.2866, "step": 3225 }, { "epoch": 0.1853233117218429, "grad_norm": 4.458089915039921, "learning_rate": 1.955979504744593e-05, "loss": 1.3036, "step": 3230 }, { "epoch": 0.18561018991336278, "grad_norm": 5.6548610780277615, "learning_rate": 1.9556851811592684e-05, "loss": 1.3558, "step": 3235 }, { "epoch": 0.18589706810488266, "grad_norm": 6.422264989474293, "learning_rate": 1.955389899210701e-05, "loss": 1.3209, "step": 3240 }, { "epoch": 0.18618394629640256, "grad_norm": 4.022657580338163, "learning_rate": 1.9550936591950006e-05, "loss": 1.2529, "step": 3245 }, { "epoch": 0.18647082448792243, "grad_norm": 4.9412647808558345, "learning_rate": 1.954796461409237e-05, "loss": 1.2862, "step": 3250 }, { "epoch": 0.1867577026794423, "grad_norm": 4.232666174055594, "learning_rate": 1.954498306151441e-05, "loss": 1.2817, "step": 3255 }, { "epoch": 0.18704458087096218, "grad_norm": 5.133816324780198, "learning_rate": 1.9541991937206027e-05, "loss": 1.3651, "step": 3260 }, { "epoch": 0.18733145906248208, "grad_norm": 4.486004095431311, "learning_rate": 1.953899124416674e-05, "loss": 1.2736, "step": 3265 }, { "epoch": 0.18761833725400195, "grad_norm": 4.575955085739559, "learning_rate": 1.9535980985405638e-05, "loss": 1.3113, "step": 3270 }, { "epoch": 0.18790521544552183, "grad_norm": 5.22163755559649, "learning_rate": 1.9532961163941422e-05, "loss": 1.2747, "step": 3275 }, { "epoch": 0.1881920936370417, "grad_norm": 5.033300154652712, "learning_rate": 1.9529931782802377e-05, "loss": 1.3453, "step": 3280 }, { "epoch": 0.1884789718285616, "grad_norm": 3.919674733316959, "learning_rate": 1.9526892845026366e-05, "loss": 1.3672, "step": 3285 }, { "epoch": 0.18876585002008148, "grad_norm": 4.5364154487046715, "learning_rate": 1.9523844353660848e-05, "loss": 1.2958, "step": 3290 }, { "epoch": 0.18905272821160135, "grad_norm": 4.7145393868603005, "learning_rate": 1.952078631176286e-05, "loss": 1.2597, "step": 3295 }, { "epoch": 0.18933960640312122, "grad_norm": 4.460544478015526, "learning_rate": 1.9517718722399005e-05, "loss": 1.2994, "step": 3300 }, { "epoch": 0.18962648459464113, "grad_norm": 4.005969569441419, "learning_rate": 1.9514641588645472e-05, "loss": 1.2649, "step": 3305 }, { "epoch": 0.189913362786161, "grad_norm": 4.733993064563216, "learning_rate": 1.951155491358802e-05, "loss": 1.3002, "step": 3310 }, { "epoch": 0.19020024097768087, "grad_norm": 5.967145209561638, "learning_rate": 1.9508458700321975e-05, "loss": 1.3502, "step": 3315 }, { "epoch": 0.19048711916920075, "grad_norm": 4.809757063286074, "learning_rate": 1.950535295195222e-05, "loss": 1.3066, "step": 3320 }, { "epoch": 0.19077399736072065, "grad_norm": 4.208850281728846, "learning_rate": 1.9502237671593215e-05, "loss": 1.2743, "step": 3325 }, { "epoch": 0.19106087555224052, "grad_norm": 4.381119475615323, "learning_rate": 1.949911286236896e-05, "loss": 1.2146, "step": 3330 }, { "epoch": 0.1913477537437604, "grad_norm": 5.2335848078440455, "learning_rate": 1.9495978527413028e-05, "loss": 1.3384, "step": 3335 }, { "epoch": 0.19163463193528027, "grad_norm": 4.836938776872508, "learning_rate": 1.9492834669868536e-05, "loss": 1.3391, "step": 3340 }, { "epoch": 0.19192151012680017, "grad_norm": 4.233426980115054, "learning_rate": 1.948968129288815e-05, "loss": 1.3686, "step": 3345 }, { "epoch": 0.19220838831832004, "grad_norm": 4.5059707917104195, "learning_rate": 1.948651839963408e-05, "loss": 1.2983, "step": 3350 }, { "epoch": 0.19249526650983992, "grad_norm": 4.2501629385722675, "learning_rate": 1.948334599327809e-05, "loss": 1.2988, "step": 3355 }, { "epoch": 0.1927821447013598, "grad_norm": 4.280182002052744, "learning_rate": 1.9480164077001477e-05, "loss": 1.3577, "step": 3360 }, { "epoch": 0.1930690228928797, "grad_norm": 4.489469300434685, "learning_rate": 1.9476972653995064e-05, "loss": 1.2953, "step": 3365 }, { "epoch": 0.19335590108439957, "grad_norm": 6.906455799356816, "learning_rate": 1.9473771727459224e-05, "loss": 1.3731, "step": 3370 }, { "epoch": 0.19364277927591944, "grad_norm": 4.833510084271619, "learning_rate": 1.947056130060385e-05, "loss": 1.3379, "step": 3375 }, { "epoch": 0.19392965746743931, "grad_norm": 5.0641539660523245, "learning_rate": 1.9467341376648373e-05, "loss": 1.3079, "step": 3380 }, { "epoch": 0.19421653565895922, "grad_norm": 4.857137401615879, "learning_rate": 1.946411195882173e-05, "loss": 1.3344, "step": 3385 }, { "epoch": 0.1945034138504791, "grad_norm": 4.402046168294648, "learning_rate": 1.946087305036239e-05, "loss": 1.3425, "step": 3390 }, { "epoch": 0.19479029204199896, "grad_norm": 12.786580070325057, "learning_rate": 1.9457624654518344e-05, "loss": 1.3241, "step": 3395 }, { "epoch": 0.19507717023351884, "grad_norm": 4.373452114513639, "learning_rate": 1.9454366774547085e-05, "loss": 1.2425, "step": 3400 }, { "epoch": 0.19536404842503874, "grad_norm": 4.260249618587644, "learning_rate": 1.9451099413715626e-05, "loss": 1.3051, "step": 3405 }, { "epoch": 0.1956509266165586, "grad_norm": 4.382645223440963, "learning_rate": 1.944782257530048e-05, "loss": 1.3067, "step": 3410 }, { "epoch": 0.1959378048080785, "grad_norm": 5.0107236356996605, "learning_rate": 1.944453626258767e-05, "loss": 1.2964, "step": 3415 }, { "epoch": 0.19622468299959836, "grad_norm": 5.136992940523369, "learning_rate": 1.944124047887272e-05, "loss": 1.2986, "step": 3420 }, { "epoch": 0.19651156119111826, "grad_norm": 4.541943226656399, "learning_rate": 1.9437935227460643e-05, "loss": 1.282, "step": 3425 }, { "epoch": 0.19679843938263814, "grad_norm": 4.1820599347590885, "learning_rate": 1.943462051166596e-05, "loss": 1.3754, "step": 3430 }, { "epoch": 0.197085317574158, "grad_norm": 4.688569076359718, "learning_rate": 1.943129633481267e-05, "loss": 1.3072, "step": 3435 }, { "epoch": 0.19737219576567788, "grad_norm": 3.83911158772529, "learning_rate": 1.9427962700234267e-05, "loss": 1.3939, "step": 3440 }, { "epoch": 0.19765907395719778, "grad_norm": 4.271775359188642, "learning_rate": 1.9424619611273726e-05, "loss": 1.3245, "step": 3445 }, { "epoch": 0.19794595214871766, "grad_norm": 4.317213246949932, "learning_rate": 1.942126707128351e-05, "loss": 1.3195, "step": 3450 }, { "epoch": 0.19823283034023753, "grad_norm": 4.276536157413799, "learning_rate": 1.9417905083625546e-05, "loss": 1.4058, "step": 3455 }, { "epoch": 0.1985197085317574, "grad_norm": 4.697302715511732, "learning_rate": 1.9414533651671244e-05, "loss": 1.29, "step": 3460 }, { "epoch": 0.1988065867232773, "grad_norm": 4.900868519821105, "learning_rate": 1.941115277880149e-05, "loss": 1.2759, "step": 3465 }, { "epoch": 0.19909346491479718, "grad_norm": 4.201378256953131, "learning_rate": 1.9407762468406618e-05, "loss": 1.2492, "step": 3470 }, { "epoch": 0.19938034310631705, "grad_norm": 4.122327353040211, "learning_rate": 1.9404362723886452e-05, "loss": 1.3014, "step": 3475 }, { "epoch": 0.19966722129783693, "grad_norm": 4.801045204737037, "learning_rate": 1.9400953548650258e-05, "loss": 1.341, "step": 3480 }, { "epoch": 0.19995409948935683, "grad_norm": 4.115881380918182, "learning_rate": 1.9397534946116762e-05, "loss": 1.3844, "step": 3485 }, { "epoch": 0.2002409776808767, "grad_norm": 4.842934500229018, "learning_rate": 1.9394106919714155e-05, "loss": 1.2293, "step": 3490 }, { "epoch": 0.20052785587239658, "grad_norm": 4.458218172386568, "learning_rate": 1.939066947288006e-05, "loss": 1.38, "step": 3495 }, { "epoch": 0.20081473406391645, "grad_norm": 6.000565731855771, "learning_rate": 1.938722260906156e-05, "loss": 1.2855, "step": 3500 }, { "epoch": 0.20110161225543635, "grad_norm": 4.078795657404665, "learning_rate": 1.9383766331715178e-05, "loss": 1.4004, "step": 3505 }, { "epoch": 0.20138849044695623, "grad_norm": 4.790392206722042, "learning_rate": 1.938030064430688e-05, "loss": 1.394, "step": 3510 }, { "epoch": 0.2016753686384761, "grad_norm": 5.373856872242404, "learning_rate": 1.9376825550312057e-05, "loss": 1.3453, "step": 3515 }, { "epoch": 0.20196224682999597, "grad_norm": 4.995300975324293, "learning_rate": 1.9373341053215545e-05, "loss": 1.2767, "step": 3520 }, { "epoch": 0.20224912502151587, "grad_norm": 4.296172679702256, "learning_rate": 1.936984715651161e-05, "loss": 1.3196, "step": 3525 }, { "epoch": 0.20253600321303575, "grad_norm": 4.286260917370341, "learning_rate": 1.9366343863703933e-05, "loss": 1.3595, "step": 3530 }, { "epoch": 0.20282288140455562, "grad_norm": 4.335157856765925, "learning_rate": 1.936283117830563e-05, "loss": 1.2523, "step": 3535 }, { "epoch": 0.2031097595960755, "grad_norm": 4.362929575494771, "learning_rate": 1.9359309103839227e-05, "loss": 1.2439, "step": 3540 }, { "epoch": 0.2033966377875954, "grad_norm": 4.350249401092582, "learning_rate": 1.935577764383666e-05, "loss": 1.3432, "step": 3545 }, { "epoch": 0.20368351597911527, "grad_norm": 4.752719861447074, "learning_rate": 1.93522368018393e-05, "loss": 1.2663, "step": 3550 }, { "epoch": 0.20397039417063514, "grad_norm": 4.552460902049511, "learning_rate": 1.93486865813979e-05, "loss": 1.2951, "step": 3555 }, { "epoch": 0.20425727236215502, "grad_norm": 5.166536919033785, "learning_rate": 1.9345126986072636e-05, "loss": 1.4059, "step": 3560 }, { "epoch": 0.20454415055367492, "grad_norm": 4.34437652039017, "learning_rate": 1.934155801943307e-05, "loss": 1.2913, "step": 3565 }, { "epoch": 0.2048310287451948, "grad_norm": 4.441293854179579, "learning_rate": 1.933797968505818e-05, "loss": 1.2875, "step": 3570 }, { "epoch": 0.20511790693671467, "grad_norm": 4.226565029868768, "learning_rate": 1.933439198653632e-05, "loss": 1.3266, "step": 3575 }, { "epoch": 0.20540478512823454, "grad_norm": 4.756356620852567, "learning_rate": 1.9330794927465246e-05, "loss": 1.2988, "step": 3580 }, { "epoch": 0.20569166331975444, "grad_norm": 3.5855470798863696, "learning_rate": 1.9327188511452094e-05, "loss": 1.3286, "step": 3585 }, { "epoch": 0.20597854151127432, "grad_norm": 4.396234381189321, "learning_rate": 1.9323572742113388e-05, "loss": 1.2606, "step": 3590 }, { "epoch": 0.2062654197027942, "grad_norm": 4.207734890901994, "learning_rate": 1.931994762307503e-05, "loss": 1.3035, "step": 3595 }, { "epoch": 0.20655229789431406, "grad_norm": 4.698911975706728, "learning_rate": 1.9316313157972297e-05, "loss": 1.325, "step": 3600 }, { "epoch": 0.20683917608583396, "grad_norm": 6.136122416768338, "learning_rate": 1.9312669350449837e-05, "loss": 1.3227, "step": 3605 }, { "epoch": 0.20712605427735384, "grad_norm": 3.7244496061229495, "learning_rate": 1.930901620416167e-05, "loss": 1.1873, "step": 3610 }, { "epoch": 0.2074129324688737, "grad_norm": 4.570541059329514, "learning_rate": 1.9305353722771182e-05, "loss": 1.3602, "step": 3615 }, { "epoch": 0.20769981066039359, "grad_norm": 4.137609177301105, "learning_rate": 1.930168190995111e-05, "loss": 1.3097, "step": 3620 }, { "epoch": 0.2079866888519135, "grad_norm": 4.765174642187809, "learning_rate": 1.9298000769383564e-05, "loss": 1.3498, "step": 3625 }, { "epoch": 0.20827356704343336, "grad_norm": 4.403606581463236, "learning_rate": 1.9294310304759995e-05, "loss": 1.3319, "step": 3630 }, { "epoch": 0.20856044523495323, "grad_norm": 4.531888516697289, "learning_rate": 1.929061051978121e-05, "loss": 1.318, "step": 3635 }, { "epoch": 0.2088473234264731, "grad_norm": 4.40307926713266, "learning_rate": 1.9286901418157367e-05, "loss": 1.3551, "step": 3640 }, { "epoch": 0.209134201617993, "grad_norm": 4.043674471748499, "learning_rate": 1.9283183003607956e-05, "loss": 1.4047, "step": 3645 }, { "epoch": 0.20942107980951288, "grad_norm": 4.882327853131664, "learning_rate": 1.927945527986181e-05, "loss": 1.3626, "step": 3650 }, { "epoch": 0.20970795800103276, "grad_norm": 6.459501198859144, "learning_rate": 1.9275718250657102e-05, "loss": 1.2933, "step": 3655 }, { "epoch": 0.20999483619255263, "grad_norm": 25.195772955552624, "learning_rate": 1.9271971919741332e-05, "loss": 1.2991, "step": 3660 }, { "epoch": 0.21028171438407253, "grad_norm": 5.146484602110404, "learning_rate": 1.926821629087133e-05, "loss": 1.3052, "step": 3665 }, { "epoch": 0.2105685925755924, "grad_norm": 4.32665250140874, "learning_rate": 1.926445136781325e-05, "loss": 1.2635, "step": 3670 }, { "epoch": 0.21085547076711228, "grad_norm": 4.260316373616151, "learning_rate": 1.9260677154342563e-05, "loss": 1.4462, "step": 3675 }, { "epoch": 0.21114234895863215, "grad_norm": 5.114150152209364, "learning_rate": 1.9256893654244064e-05, "loss": 1.4082, "step": 3680 }, { "epoch": 0.21142922715015205, "grad_norm": 4.118823699082449, "learning_rate": 1.9253100871311843e-05, "loss": 1.2547, "step": 3685 }, { "epoch": 0.21171610534167193, "grad_norm": 4.481788914072123, "learning_rate": 1.9249298809349322e-05, "loss": 1.2607, "step": 3690 }, { "epoch": 0.2120029835331918, "grad_norm": 4.063466052815678, "learning_rate": 1.9245487472169218e-05, "loss": 1.3167, "step": 3695 }, { "epoch": 0.21228986172471168, "grad_norm": 3.951649489789216, "learning_rate": 1.9241666863593542e-05, "loss": 1.3018, "step": 3700 }, { "epoch": 0.21257673991623158, "grad_norm": 5.099311998089095, "learning_rate": 1.9237836987453614e-05, "loss": 1.3393, "step": 3705 }, { "epoch": 0.21286361810775145, "grad_norm": 4.302863599891849, "learning_rate": 1.9233997847590035e-05, "loss": 1.2772, "step": 3710 }, { "epoch": 0.21315049629927132, "grad_norm": 4.071457931839553, "learning_rate": 1.9230149447852714e-05, "loss": 1.3271, "step": 3715 }, { "epoch": 0.2134373744907912, "grad_norm": 4.358118659520866, "learning_rate": 1.9226291792100826e-05, "loss": 1.2568, "step": 3720 }, { "epoch": 0.2137242526823111, "grad_norm": 4.440176064497633, "learning_rate": 1.9222424884202843e-05, "loss": 1.3568, "step": 3725 }, { "epoch": 0.21401113087383097, "grad_norm": 5.3612483813783, "learning_rate": 1.9218548728036504e-05, "loss": 1.2705, "step": 3730 }, { "epoch": 0.21429800906535085, "grad_norm": 4.971045026711492, "learning_rate": 1.921466332748883e-05, "loss": 1.291, "step": 3735 }, { "epoch": 0.21458488725687072, "grad_norm": 4.298932921059107, "learning_rate": 1.921076868645611e-05, "loss": 1.3493, "step": 3740 }, { "epoch": 0.21487176544839062, "grad_norm": 4.818118949800486, "learning_rate": 1.9206864808843893e-05, "loss": 1.2891, "step": 3745 }, { "epoch": 0.2151586436399105, "grad_norm": 4.94257010870807, "learning_rate": 1.9202951698567e-05, "loss": 1.2976, "step": 3750 }, { "epoch": 0.21544552183143037, "grad_norm": 4.291025088779294, "learning_rate": 1.919902935954951e-05, "loss": 1.2749, "step": 3755 }, { "epoch": 0.21573240002295024, "grad_norm": 4.389705749116385, "learning_rate": 1.919509779572475e-05, "loss": 1.3532, "step": 3760 }, { "epoch": 0.21601927821447015, "grad_norm": 4.420529678473457, "learning_rate": 1.91911570110353e-05, "loss": 1.3676, "step": 3765 }, { "epoch": 0.21630615640599002, "grad_norm": 5.584387314129881, "learning_rate": 1.918720700943299e-05, "loss": 1.3491, "step": 3770 }, { "epoch": 0.2165930345975099, "grad_norm": 4.435422212208001, "learning_rate": 1.918324779487889e-05, "loss": 1.285, "step": 3775 }, { "epoch": 0.21687991278902977, "grad_norm": 4.150183409255917, "learning_rate": 1.917927937134331e-05, "loss": 1.3447, "step": 3780 }, { "epoch": 0.21716679098054967, "grad_norm": 4.671548413396415, "learning_rate": 1.9175301742805796e-05, "loss": 1.2703, "step": 3785 }, { "epoch": 0.21745366917206954, "grad_norm": 4.724571727561411, "learning_rate": 1.9171314913255114e-05, "loss": 1.3619, "step": 3790 }, { "epoch": 0.21774054736358942, "grad_norm": 4.073883493524989, "learning_rate": 1.9167318886689276e-05, "loss": 1.3166, "step": 3795 }, { "epoch": 0.2180274255551093, "grad_norm": 6.789770915438957, "learning_rate": 1.9163313667115498e-05, "loss": 1.3319, "step": 3800 }, { "epoch": 0.2183143037466292, "grad_norm": 4.778144382007497, "learning_rate": 1.915929925855023e-05, "loss": 1.3332, "step": 3805 }, { "epoch": 0.21860118193814906, "grad_norm": 4.982648963419946, "learning_rate": 1.915527566501912e-05, "loss": 1.2643, "step": 3810 }, { "epoch": 0.21888806012966894, "grad_norm": 4.52057367679852, "learning_rate": 1.9151242890557053e-05, "loss": 1.2273, "step": 3815 }, { "epoch": 0.2191749383211888, "grad_norm": 4.679653012856312, "learning_rate": 1.9147200939208088e-05, "loss": 1.3318, "step": 3820 }, { "epoch": 0.2194618165127087, "grad_norm": 4.186035464485291, "learning_rate": 1.9143149815025514e-05, "loss": 1.3379, "step": 3825 }, { "epoch": 0.2197486947042286, "grad_norm": 3.946445865206007, "learning_rate": 1.91390895220718e-05, "loss": 1.2299, "step": 3830 }, { "epoch": 0.22003557289574846, "grad_norm": 4.576109873643958, "learning_rate": 1.913502006441862e-05, "loss": 1.3019, "step": 3835 }, { "epoch": 0.22032245108726833, "grad_norm": 4.498464678510582, "learning_rate": 1.9130941446146838e-05, "loss": 1.2518, "step": 3840 }, { "epoch": 0.22060932927878824, "grad_norm": 4.703654331348258, "learning_rate": 1.9126853671346496e-05, "loss": 1.2831, "step": 3845 }, { "epoch": 0.2208962074703081, "grad_norm": 3.967903707949483, "learning_rate": 1.9122756744116827e-05, "loss": 1.329, "step": 3850 }, { "epoch": 0.22118308566182798, "grad_norm": 4.336703375177257, "learning_rate": 1.911865066856624e-05, "loss": 1.3626, "step": 3855 }, { "epoch": 0.22146996385334786, "grad_norm": 4.194658549996388, "learning_rate": 1.9114535448812313e-05, "loss": 1.2861, "step": 3860 }, { "epoch": 0.22175684204486776, "grad_norm": 4.1924819994290905, "learning_rate": 1.9110411088981803e-05, "loss": 1.2764, "step": 3865 }, { "epoch": 0.22204372023638763, "grad_norm": 3.7629831158774416, "learning_rate": 1.910627759321062e-05, "loss": 1.2254, "step": 3870 }, { "epoch": 0.2223305984279075, "grad_norm": 4.7298987542971735, "learning_rate": 1.910213496564385e-05, "loss": 1.3354, "step": 3875 }, { "epoch": 0.22261747661942738, "grad_norm": 3.8191602149385573, "learning_rate": 1.9097983210435723e-05, "loss": 1.3803, "step": 3880 }, { "epoch": 0.22290435481094728, "grad_norm": 5.652064217632065, "learning_rate": 1.9093822331749632e-05, "loss": 1.2625, "step": 3885 }, { "epoch": 0.22319123300246715, "grad_norm": 4.724690286243064, "learning_rate": 1.9089652333758117e-05, "loss": 1.3311, "step": 3890 }, { "epoch": 0.22347811119398703, "grad_norm": 4.273744406604878, "learning_rate": 1.9085473220642855e-05, "loss": 1.2921, "step": 3895 }, { "epoch": 0.2237649893855069, "grad_norm": 4.777560822538318, "learning_rate": 1.908128499659468e-05, "loss": 1.2648, "step": 3900 }, { "epoch": 0.2240518675770268, "grad_norm": 4.404632106524898, "learning_rate": 1.9077087665813547e-05, "loss": 1.2847, "step": 3905 }, { "epoch": 0.22433874576854668, "grad_norm": 4.426980557863807, "learning_rate": 1.907288123250854e-05, "loss": 1.3265, "step": 3910 }, { "epoch": 0.22462562396006655, "grad_norm": 4.548486911910833, "learning_rate": 1.9068665700897897e-05, "loss": 1.3077, "step": 3915 }, { "epoch": 0.22491250215158642, "grad_norm": 5.637757447446983, "learning_rate": 1.9064441075208952e-05, "loss": 1.356, "step": 3920 }, { "epoch": 0.22519938034310633, "grad_norm": 3.876820265510319, "learning_rate": 1.9060207359678165e-05, "loss": 1.2262, "step": 3925 }, { "epoch": 0.2254862585346262, "grad_norm": 5.874606668715982, "learning_rate": 1.9055964558551127e-05, "loss": 1.3244, "step": 3930 }, { "epoch": 0.22577313672614607, "grad_norm": 4.489647725951779, "learning_rate": 1.905171267608252e-05, "loss": 1.2592, "step": 3935 }, { "epoch": 0.22606001491766595, "grad_norm": 3.930125289858793, "learning_rate": 1.9047451716536147e-05, "loss": 1.3243, "step": 3940 }, { "epoch": 0.22634689310918585, "grad_norm": 4.725076589597772, "learning_rate": 1.9043181684184902e-05, "loss": 1.3037, "step": 3945 }, { "epoch": 0.22663377130070572, "grad_norm": 3.964658931588981, "learning_rate": 1.9038902583310785e-05, "loss": 1.383, "step": 3950 }, { "epoch": 0.2269206494922256, "grad_norm": 4.004596980891946, "learning_rate": 1.9034614418204893e-05, "loss": 1.3284, "step": 3955 }, { "epoch": 0.22720752768374547, "grad_norm": 4.370526767902233, "learning_rate": 1.90303171931674e-05, "loss": 1.3139, "step": 3960 }, { "epoch": 0.22749440587526537, "grad_norm": 3.9812713630847494, "learning_rate": 1.902601091250758e-05, "loss": 1.3366, "step": 3965 }, { "epoch": 0.22778128406678524, "grad_norm": 5.881221363423829, "learning_rate": 1.9021695580543772e-05, "loss": 1.2745, "step": 3970 }, { "epoch": 0.22806816225830512, "grad_norm": 4.2189107463210105, "learning_rate": 1.901737120160341e-05, "loss": 1.2347, "step": 3975 }, { "epoch": 0.228355040449825, "grad_norm": 4.988992692400158, "learning_rate": 1.9013037780022982e-05, "loss": 1.3976, "step": 3980 }, { "epoch": 0.2286419186413449, "grad_norm": 4.527316582331816, "learning_rate": 1.9008695320148062e-05, "loss": 1.2554, "step": 3985 }, { "epoch": 0.22892879683286477, "grad_norm": 4.894732971978761, "learning_rate": 1.9004343826333273e-05, "loss": 1.3011, "step": 3990 }, { "epoch": 0.22921567502438464, "grad_norm": 4.294746001804963, "learning_rate": 1.8999983302942304e-05, "loss": 1.2549, "step": 3995 }, { "epoch": 0.22950255321590451, "grad_norm": 4.471779873310863, "learning_rate": 1.8995613754347895e-05, "loss": 1.3131, "step": 4000 }, { "epoch": 0.22978943140742442, "grad_norm": 3.886827979656859, "learning_rate": 1.8991235184931844e-05, "loss": 1.2076, "step": 4005 }, { "epoch": 0.2300763095989443, "grad_norm": 4.0123671985468, "learning_rate": 1.898684759908499e-05, "loss": 1.2386, "step": 4010 }, { "epoch": 0.23036318779046416, "grad_norm": 4.801513423690596, "learning_rate": 1.8982451001207204e-05, "loss": 1.3336, "step": 4015 }, { "epoch": 0.23065006598198404, "grad_norm": 4.515227543781069, "learning_rate": 1.897804539570742e-05, "loss": 1.3401, "step": 4020 }, { "epoch": 0.23093694417350394, "grad_norm": 5.192784488153313, "learning_rate": 1.8973630787003576e-05, "loss": 1.3217, "step": 4025 }, { "epoch": 0.2312238223650238, "grad_norm": 4.00465053828836, "learning_rate": 1.896920717952266e-05, "loss": 1.2491, "step": 4030 }, { "epoch": 0.23151070055654369, "grad_norm": 3.844514778752055, "learning_rate": 1.896477457770067e-05, "loss": 1.3132, "step": 4035 }, { "epoch": 0.23179757874806356, "grad_norm": 4.246279893570158, "learning_rate": 1.8960332985982627e-05, "loss": 1.3255, "step": 4040 }, { "epoch": 0.23208445693958346, "grad_norm": 3.9655975148570235, "learning_rate": 1.895588240882258e-05, "loss": 1.3139, "step": 4045 }, { "epoch": 0.23237133513110333, "grad_norm": 4.016530408842844, "learning_rate": 1.895142285068357e-05, "loss": 1.243, "step": 4050 }, { "epoch": 0.2326582133226232, "grad_norm": 3.787568829923831, "learning_rate": 1.894695431603765e-05, "loss": 1.2729, "step": 4055 }, { "epoch": 0.23294509151414308, "grad_norm": 4.316291627539346, "learning_rate": 1.8942476809365878e-05, "loss": 1.2635, "step": 4060 }, { "epoch": 0.23323196970566298, "grad_norm": 4.26853247926681, "learning_rate": 1.8937990335158313e-05, "loss": 1.2818, "step": 4065 }, { "epoch": 0.23351884789718286, "grad_norm": 5.537556731707633, "learning_rate": 1.8933494897913997e-05, "loss": 1.3462, "step": 4070 }, { "epoch": 0.23380572608870273, "grad_norm": 4.45984377206506, "learning_rate": 1.8928990502140966e-05, "loss": 1.2249, "step": 4075 }, { "epoch": 0.2340926042802226, "grad_norm": 3.9824899161737717, "learning_rate": 1.892447715235623e-05, "loss": 1.3881, "step": 4080 }, { "epoch": 0.2343794824717425, "grad_norm": 4.708216775568872, "learning_rate": 1.8919954853085803e-05, "loss": 1.2792, "step": 4085 }, { "epoch": 0.23466636066326238, "grad_norm": 4.316936725489589, "learning_rate": 1.891542360886464e-05, "loss": 1.3908, "step": 4090 }, { "epoch": 0.23495323885478225, "grad_norm": 3.928900310878763, "learning_rate": 1.8910883424236695e-05, "loss": 1.3402, "step": 4095 }, { "epoch": 0.23524011704630213, "grad_norm": 4.821214751737216, "learning_rate": 1.8906334303754873e-05, "loss": 1.3365, "step": 4100 }, { "epoch": 0.23552699523782203, "grad_norm": 4.7486207197697246, "learning_rate": 1.8901776251981033e-05, "loss": 1.2788, "step": 4105 }, { "epoch": 0.2358138734293419, "grad_norm": 3.7202705539400522, "learning_rate": 1.889720927348601e-05, "loss": 1.2971, "step": 4110 }, { "epoch": 0.23610075162086178, "grad_norm": 4.02223268421477, "learning_rate": 1.8892633372849575e-05, "loss": 1.2357, "step": 4115 }, { "epoch": 0.23638762981238165, "grad_norm": 4.327546147835083, "learning_rate": 1.8888048554660458e-05, "loss": 1.2874, "step": 4120 }, { "epoch": 0.23667450800390155, "grad_norm": 4.37778951882171, "learning_rate": 1.8883454823516315e-05, "loss": 1.3196, "step": 4125 }, { "epoch": 0.23696138619542143, "grad_norm": 3.8218923842885637, "learning_rate": 1.8878852184023754e-05, "loss": 1.3441, "step": 4130 }, { "epoch": 0.2372482643869413, "grad_norm": 4.541907116744646, "learning_rate": 1.8874240640798316e-05, "loss": 1.2773, "step": 4135 }, { "epoch": 0.23753514257846117, "grad_norm": 4.195909950107503, "learning_rate": 1.8869620198464462e-05, "loss": 1.3111, "step": 4140 }, { "epoch": 0.23782202076998107, "grad_norm": 4.181505550072813, "learning_rate": 1.8864990861655584e-05, "loss": 1.2353, "step": 4145 }, { "epoch": 0.23810889896150095, "grad_norm": 4.832257435256732, "learning_rate": 1.8860352635013992e-05, "loss": 1.2806, "step": 4150 }, { "epoch": 0.23839577715302082, "grad_norm": 4.911212853953505, "learning_rate": 1.885570552319091e-05, "loss": 1.3068, "step": 4155 }, { "epoch": 0.2386826553445407, "grad_norm": 3.7987226717601863, "learning_rate": 1.885104953084647e-05, "loss": 1.3091, "step": 4160 }, { "epoch": 0.2389695335360606, "grad_norm": 4.714116849520427, "learning_rate": 1.8846384662649713e-05, "loss": 1.3161, "step": 4165 }, { "epoch": 0.23925641172758047, "grad_norm": 4.400513908123231, "learning_rate": 1.884171092327858e-05, "loss": 1.287, "step": 4170 }, { "epoch": 0.23954328991910034, "grad_norm": 3.916023106813228, "learning_rate": 1.883702831741991e-05, "loss": 1.2123, "step": 4175 }, { "epoch": 0.23983016811062022, "grad_norm": 5.767108664472833, "learning_rate": 1.8832336849769427e-05, "loss": 1.2406, "step": 4180 }, { "epoch": 0.24011704630214012, "grad_norm": 5.477182383634469, "learning_rate": 1.882763652503174e-05, "loss": 1.3499, "step": 4185 }, { "epoch": 0.24040392449366, "grad_norm": 4.0324502697599325, "learning_rate": 1.8822927347920355e-05, "loss": 1.2729, "step": 4190 }, { "epoch": 0.24069080268517987, "grad_norm": 4.776378833069471, "learning_rate": 1.881820932315764e-05, "loss": 1.3761, "step": 4195 }, { "epoch": 0.24097768087669974, "grad_norm": 4.175079514097928, "learning_rate": 1.881348245547484e-05, "loss": 1.3401, "step": 4200 }, { "epoch": 0.24126455906821964, "grad_norm": 6.969156296844628, "learning_rate": 1.8808746749612074e-05, "loss": 1.2523, "step": 4205 }, { "epoch": 0.24155143725973952, "grad_norm": 4.413480290735355, "learning_rate": 1.880400221031831e-05, "loss": 1.2379, "step": 4210 }, { "epoch": 0.2418383154512594, "grad_norm": 3.9273926691476237, "learning_rate": 1.8799248842351392e-05, "loss": 1.3025, "step": 4215 }, { "epoch": 0.24212519364277926, "grad_norm": 4.088633601061029, "learning_rate": 1.8794486650478003e-05, "loss": 1.3402, "step": 4220 }, { "epoch": 0.24241207183429916, "grad_norm": 4.205323777042477, "learning_rate": 1.878971563947368e-05, "loss": 1.2547, "step": 4225 }, { "epoch": 0.24269895002581904, "grad_norm": 4.200293308193946, "learning_rate": 1.8784935814122803e-05, "loss": 1.3833, "step": 4230 }, { "epoch": 0.2429858282173389, "grad_norm": 3.7352229935218477, "learning_rate": 1.8780147179218594e-05, "loss": 1.2354, "step": 4235 }, { "epoch": 0.24327270640885879, "grad_norm": 4.685852960081381, "learning_rate": 1.87753497395631e-05, "loss": 1.3228, "step": 4240 }, { "epoch": 0.2435595846003787, "grad_norm": 4.627697304846661, "learning_rate": 1.8770543499967218e-05, "loss": 1.3177, "step": 4245 }, { "epoch": 0.24384646279189856, "grad_norm": 4.916745735283121, "learning_rate": 1.8765728465250643e-05, "loss": 1.3715, "step": 4250 }, { "epoch": 0.24413334098341843, "grad_norm": 4.294009442364749, "learning_rate": 1.8760904640241904e-05, "loss": 1.2615, "step": 4255 }, { "epoch": 0.2444202191749383, "grad_norm": 4.4808744533819125, "learning_rate": 1.8756072029778353e-05, "loss": 1.2323, "step": 4260 }, { "epoch": 0.2447070973664582, "grad_norm": 4.042128432204412, "learning_rate": 1.8751230638706132e-05, "loss": 1.3131, "step": 4265 }, { "epoch": 0.24499397555797808, "grad_norm": 4.131788378075868, "learning_rate": 1.8746380471880204e-05, "loss": 1.2913, "step": 4270 }, { "epoch": 0.24528085374949796, "grad_norm": 4.81658499883742, "learning_rate": 1.8741521534164324e-05, "loss": 1.3249, "step": 4275 }, { "epoch": 0.24556773194101783, "grad_norm": 4.231456248873171, "learning_rate": 1.873665383043105e-05, "loss": 1.3586, "step": 4280 }, { "epoch": 0.24585461013253773, "grad_norm": 4.672126910477844, "learning_rate": 1.8731777365561723e-05, "loss": 1.266, "step": 4285 }, { "epoch": 0.2461414883240576, "grad_norm": 4.459326375748264, "learning_rate": 1.8726892144446473e-05, "loss": 1.2451, "step": 4290 }, { "epoch": 0.24642836651557748, "grad_norm": 4.275991733940975, "learning_rate": 1.872199817198421e-05, "loss": 1.2965, "step": 4295 }, { "epoch": 0.24671524470709735, "grad_norm": 4.45100676472711, "learning_rate": 1.8717095453082627e-05, "loss": 1.3225, "step": 4300 }, { "epoch": 0.24700212289861725, "grad_norm": 3.7611481080748796, "learning_rate": 1.8712183992658176e-05, "loss": 1.2215, "step": 4305 }, { "epoch": 0.24728900109013713, "grad_norm": 6.445526001426653, "learning_rate": 1.8707263795636077e-05, "loss": 1.2592, "step": 4310 }, { "epoch": 0.247575879281657, "grad_norm": 3.892316869363364, "learning_rate": 1.8702334866950322e-05, "loss": 1.21, "step": 4315 }, { "epoch": 0.24786275747317688, "grad_norm": 4.551513491650196, "learning_rate": 1.869739721154365e-05, "loss": 1.4175, "step": 4320 }, { "epoch": 0.24814963566469678, "grad_norm": 4.6840881884312955, "learning_rate": 1.8692450834367546e-05, "loss": 1.382, "step": 4325 }, { "epoch": 0.24843651385621665, "grad_norm": 5.186103761722206, "learning_rate": 1.8687495740382256e-05, "loss": 1.2972, "step": 4330 }, { "epoch": 0.24872339204773652, "grad_norm": 4.0217014627095065, "learning_rate": 1.8682531934556758e-05, "loss": 1.3392, "step": 4335 }, { "epoch": 0.2490102702392564, "grad_norm": 4.1406566398748685, "learning_rate": 1.8677559421868768e-05, "loss": 1.3305, "step": 4340 }, { "epoch": 0.2492971484307763, "grad_norm": 5.077805899218966, "learning_rate": 1.867257820730473e-05, "loss": 1.3019, "step": 4345 }, { "epoch": 0.24958402662229617, "grad_norm": 4.807686569185305, "learning_rate": 1.8667588295859814e-05, "loss": 1.2961, "step": 4350 }, { "epoch": 0.24987090481381605, "grad_norm": 4.205236855277625, "learning_rate": 1.8662589692537922e-05, "loss": 1.3391, "step": 4355 }, { "epoch": 0.2501577830053359, "grad_norm": 4.348001426506396, "learning_rate": 1.8657582402351665e-05, "loss": 1.3524, "step": 4360 }, { "epoch": 0.2504446611968558, "grad_norm": 4.880848205333957, "learning_rate": 1.8652566430322355e-05, "loss": 1.3639, "step": 4365 }, { "epoch": 0.25073153938837567, "grad_norm": 3.616260360908594, "learning_rate": 1.8647541781480033e-05, "loss": 1.286, "step": 4370 }, { "epoch": 0.25101841757989557, "grad_norm": 4.451287538496318, "learning_rate": 1.864250846086342e-05, "loss": 1.3125, "step": 4375 }, { "epoch": 0.25130529577141547, "grad_norm": 4.3390530353034125, "learning_rate": 1.8637466473519938e-05, "loss": 1.3077, "step": 4380 }, { "epoch": 0.2515921739629353, "grad_norm": 4.339921255834637, "learning_rate": 1.863241582450571e-05, "loss": 1.377, "step": 4385 }, { "epoch": 0.2518790521544552, "grad_norm": 3.76112218316654, "learning_rate": 1.8627356518885536e-05, "loss": 1.3058, "step": 4390 }, { "epoch": 0.2521659303459751, "grad_norm": 3.9445758019937447, "learning_rate": 1.86222885617329e-05, "loss": 1.3148, "step": 4395 }, { "epoch": 0.25245280853749497, "grad_norm": 4.145922620691017, "learning_rate": 1.861721195812996e-05, "loss": 1.1987, "step": 4400 }, { "epoch": 0.25273968672901487, "grad_norm": 4.2549342397589776, "learning_rate": 1.8612126713167544e-05, "loss": 1.3596, "step": 4405 }, { "epoch": 0.2530265649205347, "grad_norm": 4.015544327195161, "learning_rate": 1.860703283194515e-05, "loss": 1.2521, "step": 4410 }, { "epoch": 0.2533134431120546, "grad_norm": 3.7999312536710956, "learning_rate": 1.8601930319570933e-05, "loss": 1.3103, "step": 4415 }, { "epoch": 0.2536003213035745, "grad_norm": 4.1544353506022285, "learning_rate": 1.85968191811617e-05, "loss": 1.3332, "step": 4420 }, { "epoch": 0.25388719949509436, "grad_norm": 3.817582018884883, "learning_rate": 1.8591699421842917e-05, "loss": 1.1907, "step": 4425 }, { "epoch": 0.25417407768661426, "grad_norm": 4.675037902612188, "learning_rate": 1.8586571046748683e-05, "loss": 1.3303, "step": 4430 }, { "epoch": 0.25446095587813417, "grad_norm": 3.971739691341492, "learning_rate": 1.8581434061021755e-05, "loss": 1.2449, "step": 4435 }, { "epoch": 0.254747834069654, "grad_norm": 4.092229855269379, "learning_rate": 1.8576288469813506e-05, "loss": 1.338, "step": 4440 }, { "epoch": 0.2550347122611739, "grad_norm": 3.6202172440246585, "learning_rate": 1.8571134278283946e-05, "loss": 1.3441, "step": 4445 }, { "epoch": 0.25532159045269376, "grad_norm": 4.3809314891935704, "learning_rate": 1.8565971491601713e-05, "loss": 1.2999, "step": 4450 }, { "epoch": 0.25560846864421366, "grad_norm": 4.034405117919258, "learning_rate": 1.8560800114944062e-05, "loss": 1.3737, "step": 4455 }, { "epoch": 0.25589534683573356, "grad_norm": 3.589778494841832, "learning_rate": 1.8555620153496855e-05, "loss": 1.2729, "step": 4460 }, { "epoch": 0.2561822250272534, "grad_norm": 4.362416190705672, "learning_rate": 1.8550431612454578e-05, "loss": 1.3517, "step": 4465 }, { "epoch": 0.2564691032187733, "grad_norm": 3.704330935657843, "learning_rate": 1.8545234497020305e-05, "loss": 1.2623, "step": 4470 }, { "epoch": 0.2567559814102932, "grad_norm": 3.696875359223093, "learning_rate": 1.8540028812405717e-05, "loss": 1.3041, "step": 4475 }, { "epoch": 0.25704285960181306, "grad_norm": 4.408804170049381, "learning_rate": 1.8534814563831082e-05, "loss": 1.2329, "step": 4480 }, { "epoch": 0.25732973779333296, "grad_norm": 4.9076585379680076, "learning_rate": 1.852959175652527e-05, "loss": 1.3588, "step": 4485 }, { "epoch": 0.25761661598485286, "grad_norm": 5.056311051410968, "learning_rate": 1.8524360395725715e-05, "loss": 1.2756, "step": 4490 }, { "epoch": 0.2579034941763727, "grad_norm": 3.9102033017367592, "learning_rate": 1.851912048667844e-05, "loss": 1.2529, "step": 4495 }, { "epoch": 0.2581903723678926, "grad_norm": 3.9949314657124626, "learning_rate": 1.851387203463804e-05, "loss": 1.2891, "step": 4500 }, { "epoch": 0.25847725055941245, "grad_norm": 3.537859932455064, "learning_rate": 1.850861504486767e-05, "loss": 1.2152, "step": 4505 }, { "epoch": 0.25876412875093235, "grad_norm": 4.2238834571846215, "learning_rate": 1.8503349522639054e-05, "loss": 1.3347, "step": 4510 }, { "epoch": 0.25905100694245226, "grad_norm": 4.394063913330881, "learning_rate": 1.849807547323247e-05, "loss": 1.2862, "step": 4515 }, { "epoch": 0.2593378851339721, "grad_norm": 4.09438363966619, "learning_rate": 1.8492792901936744e-05, "loss": 1.3133, "step": 4520 }, { "epoch": 0.259624763325492, "grad_norm": 4.1882086900600815, "learning_rate": 1.8487501814049252e-05, "loss": 1.2756, "step": 4525 }, { "epoch": 0.2599116415170119, "grad_norm": 4.335358580684273, "learning_rate": 1.8482202214875907e-05, "loss": 1.3291, "step": 4530 }, { "epoch": 0.26019851970853175, "grad_norm": 3.8699283674163913, "learning_rate": 1.8476894109731166e-05, "loss": 1.3901, "step": 4535 }, { "epoch": 0.26048539790005165, "grad_norm": 3.9589524637119995, "learning_rate": 1.8471577503937998e-05, "loss": 1.3205, "step": 4540 }, { "epoch": 0.2607722760915715, "grad_norm": 3.7392801079336637, "learning_rate": 1.8466252402827916e-05, "loss": 1.338, "step": 4545 }, { "epoch": 0.2610591542830914, "grad_norm": 4.273655714947217, "learning_rate": 1.8460918811740937e-05, "loss": 1.2963, "step": 4550 }, { "epoch": 0.2613460324746113, "grad_norm": 4.013993311005634, "learning_rate": 1.84555767360256e-05, "loss": 1.2451, "step": 4555 }, { "epoch": 0.26163291066613115, "grad_norm": 4.341244236567086, "learning_rate": 1.8450226181038953e-05, "loss": 1.2541, "step": 4560 }, { "epoch": 0.26191978885765105, "grad_norm": 3.9504044773478086, "learning_rate": 1.8444867152146545e-05, "loss": 1.3295, "step": 4565 }, { "epoch": 0.26220666704917095, "grad_norm": 4.42745890883407, "learning_rate": 1.843949965472242e-05, "loss": 1.3222, "step": 4570 }, { "epoch": 0.2624935452406908, "grad_norm": 4.410599454760885, "learning_rate": 1.8434123694149118e-05, "loss": 1.3017, "step": 4575 }, { "epoch": 0.2627804234322107, "grad_norm": 4.514505447599759, "learning_rate": 1.842873927581766e-05, "loss": 1.3669, "step": 4580 }, { "epoch": 0.26306730162373054, "grad_norm": 4.277993543794356, "learning_rate": 1.8423346405127557e-05, "loss": 1.3218, "step": 4585 }, { "epoch": 0.26335417981525044, "grad_norm": 5.110592871272862, "learning_rate": 1.841794508748679e-05, "loss": 1.322, "step": 4590 }, { "epoch": 0.26364105800677035, "grad_norm": 4.462969648037805, "learning_rate": 1.8412535328311813e-05, "loss": 1.3155, "step": 4595 }, { "epoch": 0.2639279361982902, "grad_norm": 3.235055411162824, "learning_rate": 1.8407117133027546e-05, "loss": 1.2542, "step": 4600 }, { "epoch": 0.2642148143898101, "grad_norm": 4.04892337379317, "learning_rate": 1.8401690507067363e-05, "loss": 1.2488, "step": 4605 }, { "epoch": 0.26450169258133, "grad_norm": 3.833828223688341, "learning_rate": 1.83962554558731e-05, "loss": 1.2945, "step": 4610 }, { "epoch": 0.26478857077284984, "grad_norm": 4.270486208496539, "learning_rate": 1.8390811984895044e-05, "loss": 1.3327, "step": 4615 }, { "epoch": 0.26507544896436974, "grad_norm": 4.617299860609044, "learning_rate": 1.838536009959191e-05, "loss": 1.3306, "step": 4620 }, { "epoch": 0.2653623271558896, "grad_norm": 3.842751937963414, "learning_rate": 1.8379899805430864e-05, "loss": 1.3245, "step": 4625 }, { "epoch": 0.2656492053474095, "grad_norm": 4.8900275222161085, "learning_rate": 1.8374431107887504e-05, "loss": 1.259, "step": 4630 }, { "epoch": 0.2659360835389294, "grad_norm": 4.512019463203551, "learning_rate": 1.836895401244585e-05, "loss": 1.309, "step": 4635 }, { "epoch": 0.26622296173044924, "grad_norm": 4.205026999235429, "learning_rate": 1.8363468524598343e-05, "loss": 1.25, "step": 4640 }, { "epoch": 0.26650983992196914, "grad_norm": 3.983799742321726, "learning_rate": 1.8357974649845847e-05, "loss": 1.3009, "step": 4645 }, { "epoch": 0.26679671811348904, "grad_norm": 4.3032420485565925, "learning_rate": 1.8352472393697635e-05, "loss": 1.25, "step": 4650 }, { "epoch": 0.2670835963050089, "grad_norm": 4.211143891935089, "learning_rate": 1.834696176167137e-05, "loss": 1.2757, "step": 4655 }, { "epoch": 0.2673704744965288, "grad_norm": 3.994267737343004, "learning_rate": 1.8341442759293136e-05, "loss": 1.2924, "step": 4660 }, { "epoch": 0.26765735268804863, "grad_norm": 4.6390201053909905, "learning_rate": 1.8335915392097402e-05, "loss": 1.3065, "step": 4665 }, { "epoch": 0.26794423087956853, "grad_norm": 4.17526196912745, "learning_rate": 1.8330379665627017e-05, "loss": 1.2237, "step": 4670 }, { "epoch": 0.26823110907108844, "grad_norm": 5.248232648031216, "learning_rate": 1.8324835585433225e-05, "loss": 1.2596, "step": 4675 }, { "epoch": 0.2685179872626083, "grad_norm": 4.562638638688315, "learning_rate": 1.831928315707564e-05, "loss": 1.312, "step": 4680 }, { "epoch": 0.2688048654541282, "grad_norm": 3.547748251815732, "learning_rate": 1.8313722386122247e-05, "loss": 1.2418, "step": 4685 }, { "epoch": 0.2690917436456481, "grad_norm": 5.297023807681865, "learning_rate": 1.8308153278149404e-05, "loss": 1.2673, "step": 4690 }, { "epoch": 0.26937862183716793, "grad_norm": 4.902813973467405, "learning_rate": 1.8302575838741828e-05, "loss": 1.3668, "step": 4695 }, { "epoch": 0.26966550002868783, "grad_norm": 4.360949585825616, "learning_rate": 1.829699007349258e-05, "loss": 1.3193, "step": 4700 }, { "epoch": 0.2699523782202077, "grad_norm": 3.637634231382023, "learning_rate": 1.829139598800308e-05, "loss": 1.1797, "step": 4705 }, { "epoch": 0.2702392564117276, "grad_norm": 3.8681034860458627, "learning_rate": 1.8285793587883093e-05, "loss": 1.4706, "step": 4710 }, { "epoch": 0.2705261346032475, "grad_norm": 3.8566632942856853, "learning_rate": 1.8280182878750715e-05, "loss": 1.3363, "step": 4715 }, { "epoch": 0.2708130127947673, "grad_norm": 4.05023800635455, "learning_rate": 1.827456386623238e-05, "loss": 1.2646, "step": 4720 }, { "epoch": 0.27109989098628723, "grad_norm": 3.8105680132070114, "learning_rate": 1.8268936555962847e-05, "loss": 1.3872, "step": 4725 }, { "epoch": 0.27138676917780713, "grad_norm": 4.171663771386364, "learning_rate": 1.826330095358519e-05, "loss": 1.2846, "step": 4730 }, { "epoch": 0.271673647369327, "grad_norm": 5.3437517609370175, "learning_rate": 1.825765706475081e-05, "loss": 1.2221, "step": 4735 }, { "epoch": 0.2719605255608469, "grad_norm": 3.6831482627537238, "learning_rate": 1.8252004895119406e-05, "loss": 1.3115, "step": 4740 }, { "epoch": 0.2722474037523667, "grad_norm": 4.167763508272118, "learning_rate": 1.8246344450358987e-05, "loss": 1.3173, "step": 4745 }, { "epoch": 0.2725342819438866, "grad_norm": 4.226259492683696, "learning_rate": 1.8240675736145866e-05, "loss": 1.3839, "step": 4750 }, { "epoch": 0.2728211601354065, "grad_norm": 3.816823181769976, "learning_rate": 1.8234998758164638e-05, "loss": 1.2776, "step": 4755 }, { "epoch": 0.2731080383269264, "grad_norm": 3.9396472868136456, "learning_rate": 1.822931352210819e-05, "loss": 1.3126, "step": 4760 }, { "epoch": 0.2733949165184463, "grad_norm": 4.553375973104562, "learning_rate": 1.8223620033677685e-05, "loss": 1.3179, "step": 4765 }, { "epoch": 0.2736817947099662, "grad_norm": 4.8673983936852, "learning_rate": 1.821791829858257e-05, "loss": 1.2434, "step": 4770 }, { "epoch": 0.273968672901486, "grad_norm": 5.076942430795134, "learning_rate": 1.821220832254056e-05, "loss": 1.3129, "step": 4775 }, { "epoch": 0.2742555510930059, "grad_norm": 3.859810547577042, "learning_rate": 1.8206490111277627e-05, "loss": 1.3037, "step": 4780 }, { "epoch": 0.27454242928452577, "grad_norm": 4.293762574861637, "learning_rate": 1.820076367052801e-05, "loss": 1.2798, "step": 4785 }, { "epoch": 0.27482930747604567, "grad_norm": 5.093808099128152, "learning_rate": 1.8195029006034195e-05, "loss": 1.2868, "step": 4790 }, { "epoch": 0.27511618566756557, "grad_norm": 3.8760024047244603, "learning_rate": 1.8189286123546915e-05, "loss": 1.3683, "step": 4795 }, { "epoch": 0.2754030638590854, "grad_norm": 3.957546331222296, "learning_rate": 1.818353502882515e-05, "loss": 1.256, "step": 4800 }, { "epoch": 0.2756899420506053, "grad_norm": 4.06135094232281, "learning_rate": 1.8177775727636107e-05, "loss": 1.2877, "step": 4805 }, { "epoch": 0.2759768202421252, "grad_norm": 3.59853571783171, "learning_rate": 1.8172008225755226e-05, "loss": 1.2389, "step": 4810 }, { "epoch": 0.27626369843364507, "grad_norm": 3.4750766849157544, "learning_rate": 1.816623252896617e-05, "loss": 1.2129, "step": 4815 }, { "epoch": 0.27655057662516497, "grad_norm": 4.447380340080289, "learning_rate": 1.816044864306082e-05, "loss": 1.2776, "step": 4820 }, { "epoch": 0.2768374548166848, "grad_norm": 3.6406440487587663, "learning_rate": 1.8154656573839276e-05, "loss": 1.49, "step": 4825 }, { "epoch": 0.2771243330082047, "grad_norm": 5.303071755047006, "learning_rate": 1.814885632710983e-05, "loss": 1.2519, "step": 4830 }, { "epoch": 0.2774112111997246, "grad_norm": 3.5502778641428256, "learning_rate": 1.8143047908688995e-05, "loss": 1.2472, "step": 4835 }, { "epoch": 0.27769808939124446, "grad_norm": 3.87409242551997, "learning_rate": 1.8137231324401446e-05, "loss": 1.3481, "step": 4840 }, { "epoch": 0.27798496758276436, "grad_norm": 3.725072127706999, "learning_rate": 1.8131406580080086e-05, "loss": 1.2682, "step": 4845 }, { "epoch": 0.27827184577428427, "grad_norm": 3.657435043614451, "learning_rate": 1.812557368156597e-05, "loss": 1.2507, "step": 4850 }, { "epoch": 0.2785587239658041, "grad_norm": 4.0989159374693696, "learning_rate": 1.8119732634708346e-05, "loss": 1.3101, "step": 4855 }, { "epoch": 0.278845602157324, "grad_norm": 4.441157521152681, "learning_rate": 1.811388344536463e-05, "loss": 1.2818, "step": 4860 }, { "epoch": 0.27913248034884386, "grad_norm": 3.826799864273376, "learning_rate": 1.8108026119400398e-05, "loss": 1.2859, "step": 4865 }, { "epoch": 0.27941935854036376, "grad_norm": 4.303582171337263, "learning_rate": 1.8102160662689394e-05, "loss": 1.2689, "step": 4870 }, { "epoch": 0.27970623673188366, "grad_norm": 4.076766323252899, "learning_rate": 1.809628708111351e-05, "loss": 1.2921, "step": 4875 }, { "epoch": 0.2799931149234035, "grad_norm": 4.194702969113473, "learning_rate": 1.8090405380562785e-05, "loss": 1.3075, "step": 4880 }, { "epoch": 0.2802799931149234, "grad_norm": 4.078076906073747, "learning_rate": 1.8084515566935405e-05, "loss": 1.4172, "step": 4885 }, { "epoch": 0.2805668713064433, "grad_norm": 3.3433076200778156, "learning_rate": 1.807861764613768e-05, "loss": 1.3294, "step": 4890 }, { "epoch": 0.28085374949796316, "grad_norm": 3.9087729679665975, "learning_rate": 1.807271162408407e-05, "loss": 1.2969, "step": 4895 }, { "epoch": 0.28114062768948306, "grad_norm": 4.039294935847034, "learning_rate": 1.8066797506697138e-05, "loss": 1.3407, "step": 4900 }, { "epoch": 0.2814275058810029, "grad_norm": 4.446991552483666, "learning_rate": 1.806087529990758e-05, "loss": 1.3042, "step": 4905 }, { "epoch": 0.2817143840725228, "grad_norm": 4.5160645712337395, "learning_rate": 1.8054945009654196e-05, "loss": 1.2779, "step": 4910 }, { "epoch": 0.2820012622640427, "grad_norm": 4.782932691104381, "learning_rate": 1.8049006641883888e-05, "loss": 1.2802, "step": 4915 }, { "epoch": 0.28228814045556255, "grad_norm": 3.896769650837769, "learning_rate": 1.8043060202551674e-05, "loss": 1.3474, "step": 4920 }, { "epoch": 0.28257501864708245, "grad_norm": 3.8260548735616537, "learning_rate": 1.8037105697620656e-05, "loss": 1.3206, "step": 4925 }, { "epoch": 0.28286189683860236, "grad_norm": 4.994723371237868, "learning_rate": 1.803114313306202e-05, "loss": 1.3075, "step": 4930 }, { "epoch": 0.2831487750301222, "grad_norm": 3.97609179717184, "learning_rate": 1.8025172514855043e-05, "loss": 1.2454, "step": 4935 }, { "epoch": 0.2834356532216421, "grad_norm": 4.265663665814093, "learning_rate": 1.8019193848987072e-05, "loss": 1.2982, "step": 4940 }, { "epoch": 0.28372253141316195, "grad_norm": 3.6602563033435254, "learning_rate": 1.8013207141453524e-05, "loss": 1.2975, "step": 4945 }, { "epoch": 0.28400940960468185, "grad_norm": 3.8489144234228, "learning_rate": 1.800721239825789e-05, "loss": 1.2903, "step": 4950 }, { "epoch": 0.28429628779620175, "grad_norm": 4.0151384187621275, "learning_rate": 1.8001209625411707e-05, "loss": 1.3283, "step": 4955 }, { "epoch": 0.2845831659877216, "grad_norm": 4.116862401444179, "learning_rate": 1.799519882893457e-05, "loss": 1.2482, "step": 4960 }, { "epoch": 0.2848700441792415, "grad_norm": 3.7223138356433196, "learning_rate": 1.7989180014854114e-05, "loss": 1.3155, "step": 4965 }, { "epoch": 0.2851569223707614, "grad_norm": 3.332635756454807, "learning_rate": 1.798315318920603e-05, "loss": 1.2387, "step": 4970 }, { "epoch": 0.28544380056228125, "grad_norm": 4.464137345169728, "learning_rate": 1.7977118358034024e-05, "loss": 1.3065, "step": 4975 }, { "epoch": 0.28573067875380115, "grad_norm": 3.686533444562598, "learning_rate": 1.797107552738984e-05, "loss": 1.2545, "step": 4980 }, { "epoch": 0.286017556945321, "grad_norm": 3.6444832947344485, "learning_rate": 1.7965024703333246e-05, "loss": 1.3347, "step": 4985 }, { "epoch": 0.2863044351368409, "grad_norm": 4.220760224442312, "learning_rate": 1.795896589193202e-05, "loss": 1.3275, "step": 4990 }, { "epoch": 0.2865913133283608, "grad_norm": 4.383334052462153, "learning_rate": 1.7952899099261944e-05, "loss": 1.3112, "step": 4995 }, { "epoch": 0.28687819151988064, "grad_norm": 3.734496391664216, "learning_rate": 1.7946824331406826e-05, "loss": 1.2967, "step": 5000 }, { "epoch": 0.28716506971140054, "grad_norm": 4.370052512654082, "learning_rate": 1.7940741594458444e-05, "loss": 1.4204, "step": 5005 }, { "epoch": 0.28745194790292045, "grad_norm": 4.40671284114377, "learning_rate": 1.7934650894516584e-05, "loss": 1.2223, "step": 5010 }, { "epoch": 0.2877388260944403, "grad_norm": 3.9499605455176168, "learning_rate": 1.7928552237689017e-05, "loss": 1.2871, "step": 5015 }, { "epoch": 0.2880257042859602, "grad_norm": 3.9261957374910255, "learning_rate": 1.7922445630091485e-05, "loss": 1.3165, "step": 5020 }, { "epoch": 0.28831258247748004, "grad_norm": 3.961463551575858, "learning_rate": 1.791633107784771e-05, "loss": 1.2837, "step": 5025 }, { "epoch": 0.28859946066899994, "grad_norm": 3.5584327519627648, "learning_rate": 1.7910208587089383e-05, "loss": 1.2917, "step": 5030 }, { "epoch": 0.28888633886051984, "grad_norm": 3.803030886181798, "learning_rate": 1.7904078163956143e-05, "loss": 1.3209, "step": 5035 }, { "epoch": 0.2891732170520397, "grad_norm": 3.9621469035528922, "learning_rate": 1.7897939814595597e-05, "loss": 1.3332, "step": 5040 }, { "epoch": 0.2894600952435596, "grad_norm": 3.8943100905075325, "learning_rate": 1.7891793545163298e-05, "loss": 1.3239, "step": 5045 }, { "epoch": 0.2897469734350795, "grad_norm": 4.424528230976105, "learning_rate": 1.7885639361822728e-05, "loss": 1.2796, "step": 5050 }, { "epoch": 0.29003385162659934, "grad_norm": 4.067149271716995, "learning_rate": 1.7879477270745328e-05, "loss": 1.2698, "step": 5055 }, { "epoch": 0.29032072981811924, "grad_norm": 4.065154152319611, "learning_rate": 1.787330727811045e-05, "loss": 1.2455, "step": 5060 }, { "epoch": 0.2906076080096391, "grad_norm": 3.975366456445216, "learning_rate": 1.7867129390105384e-05, "loss": 1.312, "step": 5065 }, { "epoch": 0.290894486201159, "grad_norm": 3.652656915391245, "learning_rate": 1.786094361292532e-05, "loss": 1.2728, "step": 5070 }, { "epoch": 0.2911813643926789, "grad_norm": 4.204993941501378, "learning_rate": 1.7854749952773374e-05, "loss": 1.2849, "step": 5075 }, { "epoch": 0.29146824258419873, "grad_norm": 3.986611863117327, "learning_rate": 1.784854841586056e-05, "loss": 1.2455, "step": 5080 }, { "epoch": 0.29175512077571863, "grad_norm": 5.308265939946118, "learning_rate": 1.7842339008405803e-05, "loss": 1.3408, "step": 5085 }, { "epoch": 0.29204199896723854, "grad_norm": 3.4157094652301248, "learning_rate": 1.7836121736635895e-05, "loss": 1.2068, "step": 5090 }, { "epoch": 0.2923288771587584, "grad_norm": 8.490381672192239, "learning_rate": 1.7829896606785543e-05, "loss": 1.3101, "step": 5095 }, { "epoch": 0.2926157553502783, "grad_norm": 4.0457595764401795, "learning_rate": 1.7823663625097314e-05, "loss": 1.2608, "step": 5100 }, { "epoch": 0.29290263354179813, "grad_norm": 4.414589706755519, "learning_rate": 1.7817422797821658e-05, "loss": 1.2942, "step": 5105 }, { "epoch": 0.29318951173331803, "grad_norm": 4.325933912514259, "learning_rate": 1.781117413121689e-05, "loss": 1.2606, "step": 5110 }, { "epoch": 0.29347638992483793, "grad_norm": 7.592878663758047, "learning_rate": 1.780491763154919e-05, "loss": 1.4913, "step": 5115 }, { "epoch": 0.2937632681163578, "grad_norm": 4.078870061564924, "learning_rate": 1.7798653305092585e-05, "loss": 1.361, "step": 5120 }, { "epoch": 0.2940501463078777, "grad_norm": 4.362843010090427, "learning_rate": 1.7792381158128956e-05, "loss": 1.3174, "step": 5125 }, { "epoch": 0.2943370244993976, "grad_norm": 3.9974543118394688, "learning_rate": 1.7786101196948034e-05, "loss": 1.326, "step": 5130 }, { "epoch": 0.2946239026909174, "grad_norm": 4.404817431420152, "learning_rate": 1.7779813427847368e-05, "loss": 1.3743, "step": 5135 }, { "epoch": 0.29491078088243733, "grad_norm": 4.055994395777529, "learning_rate": 1.7773517857132353e-05, "loss": 1.2772, "step": 5140 }, { "epoch": 0.2951976590739572, "grad_norm": 4.876000148714891, "learning_rate": 1.7767214491116198e-05, "loss": 1.2357, "step": 5145 }, { "epoch": 0.2954845372654771, "grad_norm": 3.861794765387172, "learning_rate": 1.7760903336119936e-05, "loss": 1.3305, "step": 5150 }, { "epoch": 0.295771415456997, "grad_norm": 3.813270085862637, "learning_rate": 1.7754584398472407e-05, "loss": 1.1441, "step": 5155 }, { "epoch": 0.2960582936485168, "grad_norm": 3.527822514487711, "learning_rate": 1.7748257684510252e-05, "loss": 1.2656, "step": 5160 }, { "epoch": 0.2963451718400367, "grad_norm": 3.799342235146481, "learning_rate": 1.7741923200577917e-05, "loss": 1.3022, "step": 5165 }, { "epoch": 0.2966320500315566, "grad_norm": 4.244211711518838, "learning_rate": 1.7735580953027637e-05, "loss": 1.3481, "step": 5170 }, { "epoch": 0.2969189282230765, "grad_norm": 3.606845375108358, "learning_rate": 1.772923094821943e-05, "loss": 1.2973, "step": 5175 }, { "epoch": 0.2972058064145964, "grad_norm": 3.8755153556595174, "learning_rate": 1.7722873192521094e-05, "loss": 1.2541, "step": 5180 }, { "epoch": 0.2974926846061162, "grad_norm": 3.537634122331179, "learning_rate": 1.7716507692308208e-05, "loss": 1.2568, "step": 5185 }, { "epoch": 0.2977795627976361, "grad_norm": 3.712179505849638, "learning_rate": 1.77101344539641e-05, "loss": 1.3261, "step": 5190 }, { "epoch": 0.298066440989156, "grad_norm": 4.7692469771463415, "learning_rate": 1.7703753483879876e-05, "loss": 1.3255, "step": 5195 }, { "epoch": 0.29835331918067587, "grad_norm": 4.081974162909123, "learning_rate": 1.7697364788454384e-05, "loss": 1.352, "step": 5200 }, { "epoch": 0.29864019737219577, "grad_norm": 5.325636575184922, "learning_rate": 1.769096837409422e-05, "loss": 1.2139, "step": 5205 }, { "epoch": 0.29892707556371567, "grad_norm": 4.003839547218714, "learning_rate": 1.7684564247213723e-05, "loss": 1.2244, "step": 5210 }, { "epoch": 0.2992139537552355, "grad_norm": 4.107231813929609, "learning_rate": 1.767815241423497e-05, "loss": 1.289, "step": 5215 }, { "epoch": 0.2995008319467554, "grad_norm": 5.483704044777657, "learning_rate": 1.7671732881587758e-05, "loss": 1.3157, "step": 5220 }, { "epoch": 0.29978771013827527, "grad_norm": 3.634184419944545, "learning_rate": 1.766530565570961e-05, "loss": 1.2635, "step": 5225 }, { "epoch": 0.30007458832979517, "grad_norm": 3.953812958072901, "learning_rate": 1.7658870743045757e-05, "loss": 1.3099, "step": 5230 }, { "epoch": 0.30036146652131507, "grad_norm": 4.591495391688045, "learning_rate": 1.7652428150049153e-05, "loss": 1.2685, "step": 5235 }, { "epoch": 0.3006483447128349, "grad_norm": 3.426358303689489, "learning_rate": 1.764597788318044e-05, "loss": 1.2487, "step": 5240 }, { "epoch": 0.3009352229043548, "grad_norm": 3.8735623811570434, "learning_rate": 1.7639519948907963e-05, "loss": 1.2922, "step": 5245 }, { "epoch": 0.3012221010958747, "grad_norm": 4.380854032330455, "learning_rate": 1.7633054353707745e-05, "loss": 1.2446, "step": 5250 }, { "epoch": 0.30150897928739456, "grad_norm": 4.958071053735019, "learning_rate": 1.762658110406351e-05, "loss": 1.2593, "step": 5255 }, { "epoch": 0.30179585747891446, "grad_norm": 4.0888476368067215, "learning_rate": 1.7620100206466637e-05, "loss": 1.3121, "step": 5260 }, { "epoch": 0.3020827356704343, "grad_norm": 3.6879928767690178, "learning_rate": 1.7613611667416192e-05, "loss": 1.2594, "step": 5265 }, { "epoch": 0.3023696138619542, "grad_norm": 3.815479397144886, "learning_rate": 1.7607115493418895e-05, "loss": 1.2701, "step": 5270 }, { "epoch": 0.3026564920534741, "grad_norm": 3.788293529304048, "learning_rate": 1.7600611690989125e-05, "loss": 1.3318, "step": 5275 }, { "epoch": 0.30294337024499396, "grad_norm": 3.9939993124593283, "learning_rate": 1.7594100266648907e-05, "loss": 1.2423, "step": 5280 }, { "epoch": 0.30323024843651386, "grad_norm": 3.9973003455891907, "learning_rate": 1.758758122692791e-05, "loss": 1.2357, "step": 5285 }, { "epoch": 0.30351712662803376, "grad_norm": 4.241066866959136, "learning_rate": 1.7581054578363445e-05, "loss": 1.2365, "step": 5290 }, { "epoch": 0.3038040048195536, "grad_norm": 4.330022914809729, "learning_rate": 1.757452032750045e-05, "loss": 1.316, "step": 5295 }, { "epoch": 0.3040908830110735, "grad_norm": 4.053444720558285, "learning_rate": 1.756797848089149e-05, "loss": 1.2968, "step": 5300 }, { "epoch": 0.30437776120259336, "grad_norm": 4.101949160455034, "learning_rate": 1.7561429045096732e-05, "loss": 1.3452, "step": 5305 }, { "epoch": 0.30466463939411326, "grad_norm": 3.799115264680895, "learning_rate": 1.7554872026683978e-05, "loss": 1.2523, "step": 5310 }, { "epoch": 0.30495151758563316, "grad_norm": 3.8201036502682904, "learning_rate": 1.754830743222861e-05, "loss": 1.3379, "step": 5315 }, { "epoch": 0.305238395777153, "grad_norm": 3.861455030992477, "learning_rate": 1.7541735268313625e-05, "loss": 1.2946, "step": 5320 }, { "epoch": 0.3055252739686729, "grad_norm": 4.264479324851676, "learning_rate": 1.75351555415296e-05, "loss": 1.3226, "step": 5325 }, { "epoch": 0.3058121521601928, "grad_norm": 3.6938216295841775, "learning_rate": 1.7528568258474705e-05, "loss": 1.2597, "step": 5330 }, { "epoch": 0.30609903035171265, "grad_norm": 3.5916618452892397, "learning_rate": 1.7521973425754676e-05, "loss": 1.2157, "step": 5335 }, { "epoch": 0.30638590854323255, "grad_norm": 4.131137930808326, "learning_rate": 1.7515371049982827e-05, "loss": 1.2424, "step": 5340 }, { "epoch": 0.3066727867347524, "grad_norm": 4.077925985495176, "learning_rate": 1.7508761137780037e-05, "loss": 1.3001, "step": 5345 }, { "epoch": 0.3069596649262723, "grad_norm": 3.937893264806396, "learning_rate": 1.7502143695774744e-05, "loss": 1.3126, "step": 5350 }, { "epoch": 0.3072465431177922, "grad_norm": 3.9181073266687982, "learning_rate": 1.7495518730602924e-05, "loss": 1.2065, "step": 5355 }, { "epoch": 0.30753342130931205, "grad_norm": 4.06639123926537, "learning_rate": 1.748888624890812e-05, "loss": 1.2827, "step": 5360 }, { "epoch": 0.30782029950083195, "grad_norm": 3.909012963119109, "learning_rate": 1.748224625734139e-05, "loss": 1.2325, "step": 5365 }, { "epoch": 0.30810717769235185, "grad_norm": 4.046375828326188, "learning_rate": 1.7475598762561333e-05, "loss": 1.3208, "step": 5370 }, { "epoch": 0.3083940558838717, "grad_norm": 3.5318057490899206, "learning_rate": 1.7468943771234075e-05, "loss": 1.2285, "step": 5375 }, { "epoch": 0.3086809340753916, "grad_norm": 3.771963133512019, "learning_rate": 1.7462281290033255e-05, "loss": 1.2488, "step": 5380 }, { "epoch": 0.30896781226691145, "grad_norm": 3.6563066067925107, "learning_rate": 1.7455611325640024e-05, "loss": 1.231, "step": 5385 }, { "epoch": 0.30925469045843135, "grad_norm": 3.96627932674194, "learning_rate": 1.744893388474304e-05, "loss": 1.2517, "step": 5390 }, { "epoch": 0.30954156864995125, "grad_norm": 3.9591731281019764, "learning_rate": 1.744224897403845e-05, "loss": 1.3002, "step": 5395 }, { "epoch": 0.3098284468414711, "grad_norm": 3.990339570411768, "learning_rate": 1.7435556600229904e-05, "loss": 1.2852, "step": 5400 }, { "epoch": 0.310115325032991, "grad_norm": 3.8456999868374724, "learning_rate": 1.7428856770028523e-05, "loss": 1.3359, "step": 5405 }, { "epoch": 0.3104022032245109, "grad_norm": 3.8513782517181454, "learning_rate": 1.7422149490152913e-05, "loss": 1.2831, "step": 5410 }, { "epoch": 0.31068908141603074, "grad_norm": 4.081294091469816, "learning_rate": 1.7415434767329155e-05, "loss": 1.3718, "step": 5415 }, { "epoch": 0.31097595960755064, "grad_norm": 4.662020164806738, "learning_rate": 1.740871260829078e-05, "loss": 1.3111, "step": 5420 }, { "epoch": 0.3112628377990705, "grad_norm": 399.734845392211, "learning_rate": 1.7401983019778793e-05, "loss": 1.3551, "step": 5425 }, { "epoch": 0.3115497159905904, "grad_norm": 8.028795106634284, "learning_rate": 1.739524600854163e-05, "loss": 1.2991, "step": 5430 }, { "epoch": 0.3118365941821103, "grad_norm": 3.497895974531472, "learning_rate": 1.7388501581335192e-05, "loss": 1.3085, "step": 5435 }, { "epoch": 0.31212347237363014, "grad_norm": 4.31752649887084, "learning_rate": 1.7381749744922797e-05, "loss": 1.2689, "step": 5440 }, { "epoch": 0.31241035056515004, "grad_norm": 3.8421156630280406, "learning_rate": 1.7374990506075208e-05, "loss": 1.2862, "step": 5445 }, { "epoch": 0.31269722875666994, "grad_norm": 4.652459454834167, "learning_rate": 1.7368223871570598e-05, "loss": 1.3059, "step": 5450 }, { "epoch": 0.3129841069481898, "grad_norm": 4.657360491959865, "learning_rate": 1.7361449848194572e-05, "loss": 1.3131, "step": 5455 }, { "epoch": 0.3132709851397097, "grad_norm": 3.6215298839165913, "learning_rate": 1.7354668442740128e-05, "loss": 1.2484, "step": 5460 }, { "epoch": 0.31355786333122954, "grad_norm": 3.8376536093873996, "learning_rate": 1.7347879662007676e-05, "loss": 1.347, "step": 5465 }, { "epoch": 0.31384474152274944, "grad_norm": 3.7844049938096154, "learning_rate": 1.7341083512805025e-05, "loss": 1.2392, "step": 5470 }, { "epoch": 0.31413161971426934, "grad_norm": 3.6486857895756777, "learning_rate": 1.7334280001947362e-05, "loss": 1.2549, "step": 5475 }, { "epoch": 0.3144184979057892, "grad_norm": 4.1838934918591795, "learning_rate": 1.7327469136257272e-05, "loss": 1.3049, "step": 5480 }, { "epoch": 0.3147053760973091, "grad_norm": 3.9393028099069207, "learning_rate": 1.73206509225647e-05, "loss": 1.1967, "step": 5485 }, { "epoch": 0.314992254288829, "grad_norm": 3.6603703237063296, "learning_rate": 1.7313825367706965e-05, "loss": 1.2341, "step": 5490 }, { "epoch": 0.31527913248034883, "grad_norm": 4.206351097097503, "learning_rate": 1.7306992478528753e-05, "loss": 1.3189, "step": 5495 }, { "epoch": 0.31556601067186874, "grad_norm": 4.153961570422761, "learning_rate": 1.73001522618821e-05, "loss": 1.243, "step": 5500 }, { "epoch": 0.3158528888633886, "grad_norm": 3.8129522174833985, "learning_rate": 1.7293304724626387e-05, "loss": 1.2989, "step": 5505 }, { "epoch": 0.3161397670549085, "grad_norm": 3.8037990483220265, "learning_rate": 1.728644987362834e-05, "loss": 1.3249, "step": 5510 }, { "epoch": 0.3164266452464284, "grad_norm": 3.64357115411839, "learning_rate": 1.727958771576202e-05, "loss": 1.2337, "step": 5515 }, { "epoch": 0.31671352343794823, "grad_norm": 3.796252621985965, "learning_rate": 1.727271825790882e-05, "loss": 1.2901, "step": 5520 }, { "epoch": 0.31700040162946813, "grad_norm": 3.7996768941422325, "learning_rate": 1.726584150695744e-05, "loss": 1.2322, "step": 5525 }, { "epoch": 0.31728727982098803, "grad_norm": 3.69019029275034, "learning_rate": 1.7258957469803905e-05, "loss": 1.296, "step": 5530 }, { "epoch": 0.3175741580125079, "grad_norm": 3.905115857345142, "learning_rate": 1.725206615335154e-05, "loss": 1.2803, "step": 5535 }, { "epoch": 0.3178610362040278, "grad_norm": 3.7526882886470374, "learning_rate": 1.7245167564510974e-05, "loss": 1.2126, "step": 5540 }, { "epoch": 0.3181479143955476, "grad_norm": 4.538022271748023, "learning_rate": 1.7238261710200132e-05, "loss": 1.2486, "step": 5545 }, { "epoch": 0.31843479258706753, "grad_norm": 3.821349046398445, "learning_rate": 1.723134859734421e-05, "loss": 1.296, "step": 5550 }, { "epoch": 0.31872167077858743, "grad_norm": 3.5519732045712407, "learning_rate": 1.7224428232875704e-05, "loss": 1.1641, "step": 5555 }, { "epoch": 0.3190085489701073, "grad_norm": 3.4602651010888272, "learning_rate": 1.7217500623734363e-05, "loss": 1.2495, "step": 5560 }, { "epoch": 0.3192954271616272, "grad_norm": 4.120466462306713, "learning_rate": 1.7210565776867216e-05, "loss": 1.2852, "step": 5565 }, { "epoch": 0.3195823053531471, "grad_norm": 3.8990374792008904, "learning_rate": 1.720362369922854e-05, "loss": 1.2726, "step": 5570 }, { "epoch": 0.3198691835446669, "grad_norm": 3.8424777641838785, "learning_rate": 1.719667439777986e-05, "loss": 1.2698, "step": 5575 }, { "epoch": 0.3201560617361868, "grad_norm": 4.6143118503359775, "learning_rate": 1.7189717879489958e-05, "loss": 1.2616, "step": 5580 }, { "epoch": 0.32044293992770667, "grad_norm": 4.155692331292962, "learning_rate": 1.7182754151334845e-05, "loss": 1.2455, "step": 5585 }, { "epoch": 0.3207298181192266, "grad_norm": 3.5546806598158054, "learning_rate": 1.7175783220297765e-05, "loss": 1.201, "step": 5590 }, { "epoch": 0.3210166963107465, "grad_norm": 4.122890887557611, "learning_rate": 1.7168805093369175e-05, "loss": 1.2371, "step": 5595 }, { "epoch": 0.3213035745022663, "grad_norm": 3.5029533486809066, "learning_rate": 1.7161819777546766e-05, "loss": 1.2408, "step": 5600 }, { "epoch": 0.3215904526937862, "grad_norm": 3.706453407997344, "learning_rate": 1.715482727983542e-05, "loss": 1.3256, "step": 5605 }, { "epoch": 0.3218773308853061, "grad_norm": 3.5174229339290943, "learning_rate": 1.7147827607247243e-05, "loss": 1.2421, "step": 5610 }, { "epoch": 0.32216420907682597, "grad_norm": 3.8309467851058723, "learning_rate": 1.7140820766801507e-05, "loss": 1.286, "step": 5615 }, { "epoch": 0.32245108726834587, "grad_norm": 3.8945065404320833, "learning_rate": 1.7133806765524693e-05, "loss": 1.2235, "step": 5620 }, { "epoch": 0.3227379654598657, "grad_norm": 18.790071503790184, "learning_rate": 1.712678561045046e-05, "loss": 1.3298, "step": 5625 }, { "epoch": 0.3230248436513856, "grad_norm": 4.2094168807214265, "learning_rate": 1.7119757308619636e-05, "loss": 1.2761, "step": 5630 }, { "epoch": 0.3233117218429055, "grad_norm": 3.5605169493932447, "learning_rate": 1.711272186708022e-05, "loss": 1.3081, "step": 5635 }, { "epoch": 0.32359860003442537, "grad_norm": 4.280438258600744, "learning_rate": 1.710567929288736e-05, "loss": 1.2731, "step": 5640 }, { "epoch": 0.32388547822594527, "grad_norm": 3.660013000566843, "learning_rate": 1.7098629593103378e-05, "loss": 1.2563, "step": 5645 }, { "epoch": 0.32417235641746517, "grad_norm": 3.8330870703488085, "learning_rate": 1.7091572774797714e-05, "loss": 1.312, "step": 5650 }, { "epoch": 0.324459234608985, "grad_norm": 3.7553655279978284, "learning_rate": 1.7084508845046978e-05, "loss": 1.3267, "step": 5655 }, { "epoch": 0.3247461128005049, "grad_norm": 3.596786993852508, "learning_rate": 1.707743781093488e-05, "loss": 1.2194, "step": 5660 }, { "epoch": 0.32503299099202476, "grad_norm": 4.013231120152725, "learning_rate": 1.707035967955228e-05, "loss": 1.2289, "step": 5665 }, { "epoch": 0.32531986918354466, "grad_norm": 3.8225220442081462, "learning_rate": 1.7063274457997137e-05, "loss": 1.2933, "step": 5670 }, { "epoch": 0.32560674737506456, "grad_norm": 3.7503090774734593, "learning_rate": 1.7056182153374528e-05, "loss": 1.2954, "step": 5675 }, { "epoch": 0.3258936255665844, "grad_norm": 4.24979721391911, "learning_rate": 1.7049082772796635e-05, "loss": 1.2512, "step": 5680 }, { "epoch": 0.3261805037581043, "grad_norm": 3.67228540463488, "learning_rate": 1.7041976323382727e-05, "loss": 1.2618, "step": 5685 }, { "epoch": 0.3264673819496242, "grad_norm": 3.7021878409373197, "learning_rate": 1.7034862812259173e-05, "loss": 1.3515, "step": 5690 }, { "epoch": 0.32675426014114406, "grad_norm": 3.3524674013480102, "learning_rate": 1.7027742246559417e-05, "loss": 1.2877, "step": 5695 }, { "epoch": 0.32704113833266396, "grad_norm": 3.9567150254450216, "learning_rate": 1.7020614633423977e-05, "loss": 1.2306, "step": 5700 }, { "epoch": 0.3273280165241838, "grad_norm": 3.728962222323664, "learning_rate": 1.701347998000044e-05, "loss": 1.3105, "step": 5705 }, { "epoch": 0.3276148947157037, "grad_norm": 4.172742369588312, "learning_rate": 1.700633829344345e-05, "loss": 1.3278, "step": 5710 }, { "epoch": 0.3279017729072236, "grad_norm": 4.495358707222596, "learning_rate": 1.699918958091471e-05, "loss": 1.239, "step": 5715 }, { "epoch": 0.32818865109874346, "grad_norm": 21.731098993618843, "learning_rate": 1.6992033849582963e-05, "loss": 1.1724, "step": 5720 }, { "epoch": 0.32847552929026336, "grad_norm": 5.1708139132325615, "learning_rate": 1.698487110662399e-05, "loss": 1.1917, "step": 5725 }, { "epoch": 0.32876240748178326, "grad_norm": 4.1772718544327825, "learning_rate": 1.6977701359220616e-05, "loss": 1.2156, "step": 5730 }, { "epoch": 0.3290492856733031, "grad_norm": 4.178288649134642, "learning_rate": 1.6970524614562663e-05, "loss": 1.272, "step": 5735 }, { "epoch": 0.329336163864823, "grad_norm": 3.5010098464656734, "learning_rate": 1.6963340879847004e-05, "loss": 1.2213, "step": 5740 }, { "epoch": 0.32962304205634285, "grad_norm": 3.955389613128324, "learning_rate": 1.695615016227749e-05, "loss": 1.2397, "step": 5745 }, { "epoch": 0.32990992024786275, "grad_norm": 3.6763010481625664, "learning_rate": 1.6948952469065e-05, "loss": 1.3028, "step": 5750 }, { "epoch": 0.33019679843938265, "grad_norm": 3.8384300777308193, "learning_rate": 1.694174780742739e-05, "loss": 1.2866, "step": 5755 }, { "epoch": 0.3304836766309025, "grad_norm": 3.746348316304465, "learning_rate": 1.6934536184589513e-05, "loss": 1.2955, "step": 5760 }, { "epoch": 0.3307705548224224, "grad_norm": 3.9755251609704403, "learning_rate": 1.69273176077832e-05, "loss": 1.2972, "step": 5765 }, { "epoch": 0.3310574330139423, "grad_norm": 3.679471017908131, "learning_rate": 1.6920092084247255e-05, "loss": 1.2255, "step": 5770 }, { "epoch": 0.33134431120546215, "grad_norm": 3.6947071989944633, "learning_rate": 1.691285962122745e-05, "loss": 1.318, "step": 5775 }, { "epoch": 0.33163118939698205, "grad_norm": 3.9450441974535373, "learning_rate": 1.690562022597652e-05, "loss": 1.2588, "step": 5780 }, { "epoch": 0.3319180675885019, "grad_norm": 4.790690669002136, "learning_rate": 1.6898373905754137e-05, "loss": 1.2547, "step": 5785 }, { "epoch": 0.3322049457800218, "grad_norm": 4.167130625087515, "learning_rate": 1.6891120667826935e-05, "loss": 1.2177, "step": 5790 }, { "epoch": 0.3324918239715417, "grad_norm": 4.648692090390158, "learning_rate": 1.6883860519468474e-05, "loss": 1.2302, "step": 5795 }, { "epoch": 0.33277870216306155, "grad_norm": 3.60819312017151, "learning_rate": 1.687659346795925e-05, "loss": 1.1777, "step": 5800 }, { "epoch": 0.33306558035458145, "grad_norm": 4.6161721078770555, "learning_rate": 1.6869319520586676e-05, "loss": 1.2288, "step": 5805 }, { "epoch": 0.33335245854610135, "grad_norm": 4.3208964312347575, "learning_rate": 1.686203868464508e-05, "loss": 1.2594, "step": 5810 }, { "epoch": 0.3336393367376212, "grad_norm": 3.8849555276367, "learning_rate": 1.6854750967435706e-05, "loss": 1.265, "step": 5815 }, { "epoch": 0.3339262149291411, "grad_norm": 3.8042317417083047, "learning_rate": 1.684745637626669e-05, "loss": 1.3257, "step": 5820 }, { "epoch": 0.33421309312066094, "grad_norm": 3.9539469153511293, "learning_rate": 1.6840154918453064e-05, "loss": 1.2744, "step": 5825 }, { "epoch": 0.33449997131218084, "grad_norm": 3.284073378505744, "learning_rate": 1.683284660131675e-05, "loss": 1.2615, "step": 5830 }, { "epoch": 0.33478684950370075, "grad_norm": 3.5495549552756804, "learning_rate": 1.6825531432186545e-05, "loss": 1.2671, "step": 5835 }, { "epoch": 0.3350737276952206, "grad_norm": 3.9769266856941314, "learning_rate": 1.6818209418398107e-05, "loss": 1.257, "step": 5840 }, { "epoch": 0.3353606058867405, "grad_norm": 4.1398843105783145, "learning_rate": 1.681088056729398e-05, "loss": 1.3432, "step": 5845 }, { "epoch": 0.3356474840782604, "grad_norm": 4.006607228892523, "learning_rate": 1.680354488622355e-05, "loss": 1.2376, "step": 5850 }, { "epoch": 0.33593436226978024, "grad_norm": 3.6044392842661512, "learning_rate": 1.6796202382543046e-05, "loss": 1.2779, "step": 5855 }, { "epoch": 0.33622124046130014, "grad_norm": 4.018606678770412, "learning_rate": 1.6788853063615555e-05, "loss": 1.2398, "step": 5860 }, { "epoch": 0.33650811865282, "grad_norm": 3.8476167129571515, "learning_rate": 1.678149693681099e-05, "loss": 1.2128, "step": 5865 }, { "epoch": 0.3367949968443399, "grad_norm": 3.4984189365018357, "learning_rate": 1.677413400950609e-05, "loss": 1.3237, "step": 5870 }, { "epoch": 0.3370818750358598, "grad_norm": 3.2193933134010693, "learning_rate": 1.6766764289084415e-05, "loss": 1.3111, "step": 5875 }, { "epoch": 0.33736875322737964, "grad_norm": 3.8554527908446286, "learning_rate": 1.6759387782936336e-05, "loss": 1.1803, "step": 5880 }, { "epoch": 0.33765563141889954, "grad_norm": 3.968225129697392, "learning_rate": 1.6752004498459034e-05, "loss": 1.2744, "step": 5885 }, { "epoch": 0.33794250961041944, "grad_norm": 3.73894085266525, "learning_rate": 1.6744614443056477e-05, "loss": 1.2962, "step": 5890 }, { "epoch": 0.3382293878019393, "grad_norm": 3.9083578053611663, "learning_rate": 1.6737217624139435e-05, "loss": 1.2255, "step": 5895 }, { "epoch": 0.3385162659934592, "grad_norm": 5.437687440077169, "learning_rate": 1.672981404912545e-05, "loss": 1.2007, "step": 5900 }, { "epoch": 0.33880314418497903, "grad_norm": 3.848730749566172, "learning_rate": 1.6722403725438846e-05, "loss": 1.2639, "step": 5905 }, { "epoch": 0.33909002237649893, "grad_norm": 4.351229084268987, "learning_rate": 1.6714986660510715e-05, "loss": 1.1786, "step": 5910 }, { "epoch": 0.33937690056801884, "grad_norm": 3.5391483059326827, "learning_rate": 1.6707562861778898e-05, "loss": 1.3636, "step": 5915 }, { "epoch": 0.3396637787595387, "grad_norm": 3.6049681272458955, "learning_rate": 1.6700132336688008e-05, "loss": 1.2161, "step": 5920 }, { "epoch": 0.3399506569510586, "grad_norm": 3.660938826478612, "learning_rate": 1.6692695092689382e-05, "loss": 1.328, "step": 5925 }, { "epoch": 0.3402375351425785, "grad_norm": 3.846514041028459, "learning_rate": 1.6685251137241113e-05, "loss": 1.3168, "step": 5930 }, { "epoch": 0.34052441333409833, "grad_norm": 3.7073246913014013, "learning_rate": 1.667780047780801e-05, "loss": 1.2674, "step": 5935 }, { "epoch": 0.34081129152561823, "grad_norm": 3.2759948462760837, "learning_rate": 1.6670343121861613e-05, "loss": 1.2557, "step": 5940 }, { "epoch": 0.3410981697171381, "grad_norm": 4.245955173874968, "learning_rate": 1.6662879076880178e-05, "loss": 1.3669, "step": 5945 }, { "epoch": 0.341385047908658, "grad_norm": 9.361290471092373, "learning_rate": 1.6655408350348665e-05, "loss": 1.2009, "step": 5950 }, { "epoch": 0.3416719261001779, "grad_norm": 3.686234961131057, "learning_rate": 1.6647930949758734e-05, "loss": 1.3041, "step": 5955 }, { "epoch": 0.3419588042916977, "grad_norm": 3.554606339154031, "learning_rate": 1.6640446882608738e-05, "loss": 1.1979, "step": 5960 }, { "epoch": 0.34224568248321763, "grad_norm": 4.018055302315789, "learning_rate": 1.6632956156403717e-05, "loss": 1.3393, "step": 5965 }, { "epoch": 0.34253256067473753, "grad_norm": 3.9442883837672738, "learning_rate": 1.6625458778655386e-05, "loss": 1.2894, "step": 5970 }, { "epoch": 0.3428194388662574, "grad_norm": 3.686018050107771, "learning_rate": 1.6617954756882143e-05, "loss": 1.3713, "step": 5975 }, { "epoch": 0.3431063170577773, "grad_norm": 3.6584774138010947, "learning_rate": 1.6610444098609028e-05, "loss": 1.1891, "step": 5980 }, { "epoch": 0.3433931952492971, "grad_norm": 4.7315450744857355, "learning_rate": 1.6602926811367742e-05, "loss": 1.2311, "step": 5985 }, { "epoch": 0.343680073440817, "grad_norm": 7.9838575467773545, "learning_rate": 1.6595402902696648e-05, "loss": 1.258, "step": 5990 }, { "epoch": 0.3439669516323369, "grad_norm": 4.344372356604471, "learning_rate": 1.658787238014073e-05, "loss": 1.3688, "step": 5995 }, { "epoch": 0.34425382982385677, "grad_norm": 3.9448270804214256, "learning_rate": 1.6580335251251623e-05, "loss": 1.3228, "step": 6000 }, { "epoch": 0.3445407080153767, "grad_norm": 3.936195752563929, "learning_rate": 1.6572791523587562e-05, "loss": 1.2974, "step": 6005 }, { "epoch": 0.3448275862068966, "grad_norm": 3.8117332648258735, "learning_rate": 1.656524120471343e-05, "loss": 1.3259, "step": 6010 }, { "epoch": 0.3451144643984164, "grad_norm": 3.8414899551179444, "learning_rate": 1.6557684302200693e-05, "loss": 1.2449, "step": 6015 }, { "epoch": 0.3454013425899363, "grad_norm": 3.5301833558734286, "learning_rate": 1.6550120823627432e-05, "loss": 1.1582, "step": 6020 }, { "epoch": 0.34568822078145617, "grad_norm": 3.8300845266643226, "learning_rate": 1.6542550776578322e-05, "loss": 1.2502, "step": 6025 }, { "epoch": 0.34597509897297607, "grad_norm": 4.564113987229508, "learning_rate": 1.6534974168644625e-05, "loss": 1.2829, "step": 6030 }, { "epoch": 0.34626197716449597, "grad_norm": 4.422577559692342, "learning_rate": 1.652739100742417e-05, "loss": 1.3089, "step": 6035 }, { "epoch": 0.3465488553560158, "grad_norm": 4.03236816872727, "learning_rate": 1.6519801300521384e-05, "loss": 1.3503, "step": 6040 }, { "epoch": 0.3468357335475357, "grad_norm": 3.4735891650467665, "learning_rate": 1.651220505554723e-05, "loss": 1.2771, "step": 6045 }, { "epoch": 0.3471226117390556, "grad_norm": 4.076106759765979, "learning_rate": 1.6504602280119244e-05, "loss": 1.2658, "step": 6050 }, { "epoch": 0.34740948993057547, "grad_norm": 4.327437287736069, "learning_rate": 1.6496992981861507e-05, "loss": 1.2594, "step": 6055 }, { "epoch": 0.34769636812209537, "grad_norm": 3.773114362390044, "learning_rate": 1.648937716840464e-05, "loss": 1.2561, "step": 6060 }, { "epoch": 0.3479832463136152, "grad_norm": 4.267105216191694, "learning_rate": 1.6481754847385794e-05, "loss": 1.3621, "step": 6065 }, { "epoch": 0.3482701245051351, "grad_norm": 3.422519013994409, "learning_rate": 1.6474126026448654e-05, "loss": 1.2928, "step": 6070 }, { "epoch": 0.348557002696655, "grad_norm": 3.571883084705498, "learning_rate": 1.6466490713243417e-05, "loss": 1.1459, "step": 6075 }, { "epoch": 0.34884388088817486, "grad_norm": 4.3911774362983085, "learning_rate": 1.6458848915426792e-05, "loss": 1.2741, "step": 6080 }, { "epoch": 0.34913075907969476, "grad_norm": 3.953601582358217, "learning_rate": 1.6451200640661992e-05, "loss": 1.3187, "step": 6085 }, { "epoch": 0.34941763727121466, "grad_norm": 3.6748566850794675, "learning_rate": 1.6443545896618726e-05, "loss": 1.2908, "step": 6090 }, { "epoch": 0.3497045154627345, "grad_norm": 3.7945705077736873, "learning_rate": 1.643588469097318e-05, "loss": 1.3096, "step": 6095 }, { "epoch": 0.3499913936542544, "grad_norm": 6.933516315444646, "learning_rate": 1.642821703140804e-05, "loss": 1.209, "step": 6100 }, { "epoch": 0.35027827184577426, "grad_norm": 3.4742483195061182, "learning_rate": 1.6420542925612442e-05, "loss": 1.2108, "step": 6105 }, { "epoch": 0.35056515003729416, "grad_norm": 4.159938458516438, "learning_rate": 1.6412862381282004e-05, "loss": 1.3296, "step": 6110 }, { "epoch": 0.35085202822881406, "grad_norm": 3.775200383569843, "learning_rate": 1.6405175406118788e-05, "loss": 1.3042, "step": 6115 }, { "epoch": 0.3511389064203339, "grad_norm": 3.5784751590403103, "learning_rate": 1.6397482007831312e-05, "loss": 1.1879, "step": 6120 }, { "epoch": 0.3514257846118538, "grad_norm": 3.5957157766461414, "learning_rate": 1.6389782194134535e-05, "loss": 1.2125, "step": 6125 }, { "epoch": 0.3517126628033737, "grad_norm": 3.727163951068632, "learning_rate": 1.6382075972749843e-05, "loss": 1.2178, "step": 6130 }, { "epoch": 0.35199954099489356, "grad_norm": 3.375389334902129, "learning_rate": 1.6374363351405055e-05, "loss": 1.3041, "step": 6135 }, { "epoch": 0.35228641918641346, "grad_norm": 3.7985193803963613, "learning_rate": 1.6366644337834406e-05, "loss": 1.2927, "step": 6140 }, { "epoch": 0.3525732973779333, "grad_norm": 27.356979745881976, "learning_rate": 1.6358918939778538e-05, "loss": 1.2558, "step": 6145 }, { "epoch": 0.3528601755694532, "grad_norm": 3.6526309138539617, "learning_rate": 1.6351187164984496e-05, "loss": 1.2397, "step": 6150 }, { "epoch": 0.3531470537609731, "grad_norm": 3.5428960017415707, "learning_rate": 1.6343449021205725e-05, "loss": 1.2945, "step": 6155 }, { "epoch": 0.35343393195249295, "grad_norm": 4.368731074309572, "learning_rate": 1.6335704516202052e-05, "loss": 1.2429, "step": 6160 }, { "epoch": 0.35372081014401285, "grad_norm": 3.704064891312114, "learning_rate": 1.632795365773968e-05, "loss": 1.2633, "step": 6165 }, { "epoch": 0.35400768833553276, "grad_norm": 4.687046285275695, "learning_rate": 1.6320196453591194e-05, "loss": 1.3298, "step": 6170 }, { "epoch": 0.3542945665270526, "grad_norm": 5.964488656375001, "learning_rate": 1.631243291153553e-05, "loss": 1.2239, "step": 6175 }, { "epoch": 0.3545814447185725, "grad_norm": 3.7297313623637596, "learning_rate": 1.6304663039357988e-05, "loss": 1.36, "step": 6180 }, { "epoch": 0.35486832291009235, "grad_norm": 3.8331164623322875, "learning_rate": 1.629688684485021e-05, "loss": 1.2014, "step": 6185 }, { "epoch": 0.35515520110161225, "grad_norm": 3.7236216995787963, "learning_rate": 1.6289104335810187e-05, "loss": 1.2536, "step": 6190 }, { "epoch": 0.35544207929313215, "grad_norm": 4.080133384596942, "learning_rate": 1.6281315520042235e-05, "loss": 1.2338, "step": 6195 }, { "epoch": 0.355728957484652, "grad_norm": 3.2761754132542293, "learning_rate": 1.627352040535699e-05, "loss": 1.15, "step": 6200 }, { "epoch": 0.3560158356761719, "grad_norm": 3.797876690214772, "learning_rate": 1.6265718999571416e-05, "loss": 1.285, "step": 6205 }, { "epoch": 0.3563027138676918, "grad_norm": 3.583467881134055, "learning_rate": 1.6257911310508783e-05, "loss": 1.2712, "step": 6210 }, { "epoch": 0.35658959205921165, "grad_norm": 3.820930439929066, "learning_rate": 1.625009734599865e-05, "loss": 1.2533, "step": 6215 }, { "epoch": 0.35687647025073155, "grad_norm": 4.177426226367705, "learning_rate": 1.6242277113876887e-05, "loss": 1.2172, "step": 6220 }, { "epoch": 0.3571633484422514, "grad_norm": 3.8350939589256563, "learning_rate": 1.6234450621985636e-05, "loss": 1.2844, "step": 6225 }, { "epoch": 0.3574502266337713, "grad_norm": 3.7892775574193442, "learning_rate": 1.622661787817332e-05, "loss": 1.2915, "step": 6230 }, { "epoch": 0.3577371048252912, "grad_norm": 4.41506426947533, "learning_rate": 1.6218778890294634e-05, "loss": 1.2773, "step": 6235 }, { "epoch": 0.35802398301681104, "grad_norm": 4.004981337492539, "learning_rate": 1.6210933666210533e-05, "loss": 1.1799, "step": 6240 }, { "epoch": 0.35831086120833094, "grad_norm": 3.630349043157775, "learning_rate": 1.620308221378822e-05, "loss": 1.2313, "step": 6245 }, { "epoch": 0.35859773939985085, "grad_norm": 3.8335676591262775, "learning_rate": 1.619522454090116e-05, "loss": 1.2821, "step": 6250 }, { "epoch": 0.3588846175913707, "grad_norm": 3.422600034826802, "learning_rate": 1.6187360655429034e-05, "loss": 1.2791, "step": 6255 }, { "epoch": 0.3591714957828906, "grad_norm": 4.142762574759638, "learning_rate": 1.617949056525777e-05, "loss": 1.2595, "step": 6260 }, { "epoch": 0.35945837397441044, "grad_norm": 4.056483855691592, "learning_rate": 1.617161427827951e-05, "loss": 1.2313, "step": 6265 }, { "epoch": 0.35974525216593034, "grad_norm": 3.72881231167572, "learning_rate": 1.616373180239261e-05, "loss": 1.242, "step": 6270 }, { "epoch": 0.36003213035745024, "grad_norm": 3.708406451698977, "learning_rate": 1.615584314550164e-05, "loss": 1.2249, "step": 6275 }, { "epoch": 0.3603190085489701, "grad_norm": 4.036571456082381, "learning_rate": 1.614794831551736e-05, "loss": 1.2618, "step": 6280 }, { "epoch": 0.36060588674049, "grad_norm": 5.400978532430585, "learning_rate": 1.614004732035672e-05, "loss": 1.235, "step": 6285 }, { "epoch": 0.3608927649320099, "grad_norm": 3.8057312199527558, "learning_rate": 1.6132140167942863e-05, "loss": 1.3034, "step": 6290 }, { "epoch": 0.36117964312352974, "grad_norm": 3.88407832289184, "learning_rate": 1.6124226866205092e-05, "loss": 1.1741, "step": 6295 }, { "epoch": 0.36146652131504964, "grad_norm": 3.4601726120651914, "learning_rate": 1.611630742307889e-05, "loss": 1.2438, "step": 6300 }, { "epoch": 0.3617533995065695, "grad_norm": 4.20823045494652, "learning_rate": 1.6108381846505884e-05, "loss": 1.2172, "step": 6305 }, { "epoch": 0.3620402776980894, "grad_norm": 3.818169672875593, "learning_rate": 1.610045014443387e-05, "loss": 1.1852, "step": 6310 }, { "epoch": 0.3623271558896093, "grad_norm": 3.6849286166380284, "learning_rate": 1.609251232481677e-05, "loss": 1.2286, "step": 6315 }, { "epoch": 0.36261403408112913, "grad_norm": 3.4903722959433208, "learning_rate": 1.6084568395614647e-05, "loss": 1.2327, "step": 6320 }, { "epoch": 0.36290091227264903, "grad_norm": 4.053828888739159, "learning_rate": 1.6076618364793696e-05, "loss": 1.2986, "step": 6325 }, { "epoch": 0.36318779046416894, "grad_norm": 3.6484515529259958, "learning_rate": 1.606866224032622e-05, "loss": 1.2649, "step": 6330 }, { "epoch": 0.3634746686556888, "grad_norm": 3.534661329762727, "learning_rate": 1.606070003019064e-05, "loss": 1.1773, "step": 6335 }, { "epoch": 0.3637615468472087, "grad_norm": 4.328204178614986, "learning_rate": 1.6052731742371487e-05, "loss": 1.2949, "step": 6340 }, { "epoch": 0.36404842503872853, "grad_norm": 3.67263377916346, "learning_rate": 1.6044757384859365e-05, "loss": 1.2827, "step": 6345 }, { "epoch": 0.36433530323024843, "grad_norm": 3.3090338894709967, "learning_rate": 1.603677696565098e-05, "loss": 1.1731, "step": 6350 }, { "epoch": 0.36462218142176833, "grad_norm": 3.3614035895653713, "learning_rate": 1.602879049274912e-05, "loss": 1.1761, "step": 6355 }, { "epoch": 0.3649090596132882, "grad_norm": 7.186181021996858, "learning_rate": 1.6020797974162638e-05, "loss": 1.3393, "step": 6360 }, { "epoch": 0.3651959378048081, "grad_norm": 4.084010594275213, "learning_rate": 1.6012799417906443e-05, "loss": 1.2145, "step": 6365 }, { "epoch": 0.365482815996328, "grad_norm": 3.973320112570281, "learning_rate": 1.6004794832001507e-05, "loss": 1.2683, "step": 6370 }, { "epoch": 0.3657696941878478, "grad_norm": 3.5185638630386524, "learning_rate": 1.599678422447485e-05, "loss": 1.2706, "step": 6375 }, { "epoch": 0.36605657237936773, "grad_norm": 3.4928902418728542, "learning_rate": 1.5988767603359527e-05, "loss": 1.3497, "step": 6380 }, { "epoch": 0.3663434505708876, "grad_norm": 3.9523256258416635, "learning_rate": 1.598074497669462e-05, "loss": 1.2696, "step": 6385 }, { "epoch": 0.3666303287624075, "grad_norm": 3.4862257756641384, "learning_rate": 1.597271635252524e-05, "loss": 1.2307, "step": 6390 }, { "epoch": 0.3669172069539274, "grad_norm": 4.219397742121651, "learning_rate": 1.596468173890251e-05, "loss": 1.2538, "step": 6395 }, { "epoch": 0.3672040851454472, "grad_norm": 3.56324063039285, "learning_rate": 1.595664114388356e-05, "loss": 1.2109, "step": 6400 }, { "epoch": 0.3674909633369671, "grad_norm": 3.6669052388359376, "learning_rate": 1.594859457553151e-05, "loss": 1.2413, "step": 6405 }, { "epoch": 0.367777841528487, "grad_norm": 3.4367749068160487, "learning_rate": 1.594054204191548e-05, "loss": 1.2632, "step": 6410 }, { "epoch": 0.36806471972000687, "grad_norm": 3.708929157880347, "learning_rate": 1.593248355111057e-05, "loss": 1.221, "step": 6415 }, { "epoch": 0.3683515979115268, "grad_norm": 3.8814002663489133, "learning_rate": 1.5924419111197853e-05, "loss": 1.3026, "step": 6420 }, { "epoch": 0.3686384761030466, "grad_norm": 3.4505745306489777, "learning_rate": 1.5916348730264367e-05, "loss": 1.2203, "step": 6425 }, { "epoch": 0.3689253542945665, "grad_norm": 3.6893116015447234, "learning_rate": 1.5908272416403106e-05, "loss": 1.2169, "step": 6430 }, { "epoch": 0.3692122324860864, "grad_norm": 4.725579675043148, "learning_rate": 1.5900190177713018e-05, "loss": 1.2876, "step": 6435 }, { "epoch": 0.36949911067760627, "grad_norm": 3.5889379930312395, "learning_rate": 1.5892102022298988e-05, "loss": 1.1789, "step": 6440 }, { "epoch": 0.36978598886912617, "grad_norm": 5.19005588994173, "learning_rate": 1.5884007958271838e-05, "loss": 1.2433, "step": 6445 }, { "epoch": 0.37007286706064607, "grad_norm": 3.449774587147724, "learning_rate": 1.5875907993748314e-05, "loss": 1.2183, "step": 6450 }, { "epoch": 0.3703597452521659, "grad_norm": 4.006325935136862, "learning_rate": 1.5867802136851078e-05, "loss": 1.2738, "step": 6455 }, { "epoch": 0.3706466234436858, "grad_norm": 3.4253510914222836, "learning_rate": 1.5859690395708703e-05, "loss": 1.196, "step": 6460 }, { "epoch": 0.37093350163520566, "grad_norm": 3.4408115197111715, "learning_rate": 1.5851572778455658e-05, "loss": 1.2827, "step": 6465 }, { "epoch": 0.37122037982672557, "grad_norm": 3.9070809953127994, "learning_rate": 1.5843449293232305e-05, "loss": 1.2428, "step": 6470 }, { "epoch": 0.37150725801824547, "grad_norm": 3.9508385646005353, "learning_rate": 1.5835319948184903e-05, "loss": 1.2556, "step": 6475 }, { "epoch": 0.3717941362097653, "grad_norm": 3.1564112242384725, "learning_rate": 1.5827184751465573e-05, "loss": 1.1779, "step": 6480 }, { "epoch": 0.3720810144012852, "grad_norm": 3.4330474325099485, "learning_rate": 1.58190437112323e-05, "loss": 1.2484, "step": 6485 }, { "epoch": 0.3723678925928051, "grad_norm": 4.790691720947433, "learning_rate": 1.5810896835648954e-05, "loss": 1.2925, "step": 6490 }, { "epoch": 0.37265477078432496, "grad_norm": 3.4296729650782734, "learning_rate": 1.580274413288523e-05, "loss": 1.3073, "step": 6495 }, { "epoch": 0.37294164897584486, "grad_norm": 3.8367132811388207, "learning_rate": 1.5794585611116672e-05, "loss": 1.2292, "step": 6500 }, { "epoch": 0.3732285271673647, "grad_norm": 3.910230137313282, "learning_rate": 1.578642127852467e-05, "loss": 1.193, "step": 6505 }, { "epoch": 0.3735154053588846, "grad_norm": 3.5242664634248286, "learning_rate": 1.5778251143296437e-05, "loss": 1.2649, "step": 6510 }, { "epoch": 0.3738022835504045, "grad_norm": 3.2770377311073986, "learning_rate": 1.5770075213625e-05, "loss": 1.2182, "step": 6515 }, { "epoch": 0.37408916174192436, "grad_norm": 4.02523672273697, "learning_rate": 1.57618934977092e-05, "loss": 1.1845, "step": 6520 }, { "epoch": 0.37437603993344426, "grad_norm": 3.445304489939561, "learning_rate": 1.575370600375368e-05, "loss": 1.3098, "step": 6525 }, { "epoch": 0.37466291812496416, "grad_norm": 3.600736053491594, "learning_rate": 1.5745512739968877e-05, "loss": 1.218, "step": 6530 }, { "epoch": 0.374949796316484, "grad_norm": 3.4450950371086053, "learning_rate": 1.5737313714571016e-05, "loss": 1.2414, "step": 6535 }, { "epoch": 0.3752366745080039, "grad_norm": 3.951837856202493, "learning_rate": 1.5729108935782095e-05, "loss": 1.2097, "step": 6540 }, { "epoch": 0.37552355269952375, "grad_norm": 4.249652486109699, "learning_rate": 1.5720898411829887e-05, "loss": 1.2839, "step": 6545 }, { "epoch": 0.37581043089104366, "grad_norm": 3.4414363213807264, "learning_rate": 1.5712682150947926e-05, "loss": 1.2126, "step": 6550 }, { "epoch": 0.37609730908256356, "grad_norm": 3.216296401558256, "learning_rate": 1.570446016137549e-05, "loss": 1.1859, "step": 6555 }, { "epoch": 0.3763841872740834, "grad_norm": 4.26056182435401, "learning_rate": 1.5696232451357615e-05, "loss": 1.2246, "step": 6560 }, { "epoch": 0.3766710654656033, "grad_norm": 4.085942648411189, "learning_rate": 1.5687999029145062e-05, "loss": 1.3239, "step": 6565 }, { "epoch": 0.3769579436571232, "grad_norm": 4.196347492907388, "learning_rate": 1.567975990299433e-05, "loss": 1.2343, "step": 6570 }, { "epoch": 0.37724482184864305, "grad_norm": 3.443305866077105, "learning_rate": 1.5671515081167632e-05, "loss": 1.238, "step": 6575 }, { "epoch": 0.37753170004016295, "grad_norm": 3.475272008298972, "learning_rate": 1.5663264571932893e-05, "loss": 1.223, "step": 6580 }, { "epoch": 0.3778185782316828, "grad_norm": 3.7534183088172464, "learning_rate": 1.565500838356374e-05, "loss": 1.2071, "step": 6585 }, { "epoch": 0.3781054564232027, "grad_norm": 4.086155082934382, "learning_rate": 1.5646746524339496e-05, "loss": 1.3238, "step": 6590 }, { "epoch": 0.3783923346147226, "grad_norm": 3.333138265659205, "learning_rate": 1.5638479002545184e-05, "loss": 1.2179, "step": 6595 }, { "epoch": 0.37867921280624245, "grad_norm": 4.576446953101192, "learning_rate": 1.563020582647148e-05, "loss": 1.2959, "step": 6600 }, { "epoch": 0.37896609099776235, "grad_norm": 3.437988703462699, "learning_rate": 1.5621927004414747e-05, "loss": 1.3632, "step": 6605 }, { "epoch": 0.37925296918928225, "grad_norm": 3.751829668146616, "learning_rate": 1.561364254467701e-05, "loss": 1.2602, "step": 6610 }, { "epoch": 0.3795398473808021, "grad_norm": 3.7398036362444445, "learning_rate": 1.5605352455565937e-05, "loss": 1.2591, "step": 6615 }, { "epoch": 0.379826725572322, "grad_norm": 3.472550983022067, "learning_rate": 1.5597056745394858e-05, "loss": 1.3091, "step": 6620 }, { "epoch": 0.38011360376384185, "grad_norm": 3.923330562655186, "learning_rate": 1.558875542248272e-05, "loss": 1.2354, "step": 6625 }, { "epoch": 0.38040048195536175, "grad_norm": 3.382150651983063, "learning_rate": 1.558044849515411e-05, "loss": 1.2311, "step": 6630 }, { "epoch": 0.38068736014688165, "grad_norm": 3.6645195440724727, "learning_rate": 1.5572135971739242e-05, "loss": 1.1676, "step": 6635 }, { "epoch": 0.3809742383384015, "grad_norm": 4.118487272497083, "learning_rate": 1.5563817860573922e-05, "loss": 1.2133, "step": 6640 }, { "epoch": 0.3812611165299214, "grad_norm": 3.6789396389221842, "learning_rate": 1.5555494169999578e-05, "loss": 1.2297, "step": 6645 }, { "epoch": 0.3815479947214413, "grad_norm": 3.9208273875436825, "learning_rate": 1.5547164908363222e-05, "loss": 1.2122, "step": 6650 }, { "epoch": 0.38183487291296114, "grad_norm": 3.3201936650324617, "learning_rate": 1.5538830084017455e-05, "loss": 1.2399, "step": 6655 }, { "epoch": 0.38212175110448104, "grad_norm": 3.9533395701549385, "learning_rate": 1.5530489705320466e-05, "loss": 1.2674, "step": 6660 }, { "epoch": 0.3824086292960009, "grad_norm": 4.230589856248343, "learning_rate": 1.552214378063599e-05, "loss": 1.2892, "step": 6665 }, { "epoch": 0.3826955074875208, "grad_norm": 4.833209570688316, "learning_rate": 1.551379231833335e-05, "loss": 1.2565, "step": 6670 }, { "epoch": 0.3829823856790407, "grad_norm": 4.292810337145479, "learning_rate": 1.5505435326787413e-05, "loss": 1.2997, "step": 6675 }, { "epoch": 0.38326926387056054, "grad_norm": 3.7532061156216137, "learning_rate": 1.5497072814378585e-05, "loss": 1.1932, "step": 6680 }, { "epoch": 0.38355614206208044, "grad_norm": 3.7389195369582087, "learning_rate": 1.5488704789492814e-05, "loss": 1.2364, "step": 6685 }, { "epoch": 0.38384302025360034, "grad_norm": 4.01615984243143, "learning_rate": 1.5480331260521567e-05, "loss": 1.2544, "step": 6690 }, { "epoch": 0.3841298984451202, "grad_norm": 3.42545710969234, "learning_rate": 1.5471952235861842e-05, "loss": 1.2918, "step": 6695 }, { "epoch": 0.3844167766366401, "grad_norm": 3.657165264440433, "learning_rate": 1.546356772391615e-05, "loss": 1.2662, "step": 6700 }, { "epoch": 0.38470365482816, "grad_norm": 4.039202477770144, "learning_rate": 1.545517773309249e-05, "loss": 1.286, "step": 6705 }, { "epoch": 0.38499053301967984, "grad_norm": 3.8350154905294565, "learning_rate": 1.5446782271804365e-05, "loss": 1.2144, "step": 6710 }, { "epoch": 0.38527741121119974, "grad_norm": 3.438412072724366, "learning_rate": 1.5438381348470766e-05, "loss": 1.2472, "step": 6715 }, { "epoch": 0.3855642894027196, "grad_norm": 3.6766678224361025, "learning_rate": 1.5429974971516157e-05, "loss": 1.184, "step": 6720 }, { "epoch": 0.3858511675942395, "grad_norm": 3.404856887041815, "learning_rate": 1.542156314937047e-05, "loss": 1.1647, "step": 6725 }, { "epoch": 0.3861380457857594, "grad_norm": 3.565711314329798, "learning_rate": 1.54131458904691e-05, "loss": 1.2394, "step": 6730 }, { "epoch": 0.38642492397727923, "grad_norm": 3.672278908586729, "learning_rate": 1.5404723203252892e-05, "loss": 1.2137, "step": 6735 }, { "epoch": 0.38671180216879913, "grad_norm": 3.707305957985887, "learning_rate": 1.539629509616814e-05, "loss": 1.3125, "step": 6740 }, { "epoch": 0.38699868036031904, "grad_norm": 3.1726148921071387, "learning_rate": 1.538786157766656e-05, "loss": 1.2729, "step": 6745 }, { "epoch": 0.3872855585518389, "grad_norm": 3.904892325786969, "learning_rate": 1.537942265620531e-05, "loss": 1.2639, "step": 6750 }, { "epoch": 0.3875724367433588, "grad_norm": 3.8826643722727234, "learning_rate": 1.5370978340246957e-05, "loss": 1.2812, "step": 6755 }, { "epoch": 0.38785931493487863, "grad_norm": 3.4387935732681836, "learning_rate": 1.536252863825948e-05, "loss": 1.2123, "step": 6760 }, { "epoch": 0.38814619312639853, "grad_norm": 3.2780841248402663, "learning_rate": 1.535407355871626e-05, "loss": 1.1627, "step": 6765 }, { "epoch": 0.38843307131791843, "grad_norm": 3.3220617697093275, "learning_rate": 1.534561311009607e-05, "loss": 1.1724, "step": 6770 }, { "epoch": 0.3887199495094383, "grad_norm": 3.9237018203947813, "learning_rate": 1.5337147300883065e-05, "loss": 1.3294, "step": 6775 }, { "epoch": 0.3890068277009582, "grad_norm": 3.1177286119169394, "learning_rate": 1.532867613956678e-05, "loss": 1.2309, "step": 6780 }, { "epoch": 0.3892937058924781, "grad_norm": 3.723616095466779, "learning_rate": 1.5320199634642112e-05, "loss": 1.2065, "step": 6785 }, { "epoch": 0.3895805840839979, "grad_norm": 3.6408332003413832, "learning_rate": 1.5311717794609324e-05, "loss": 1.1958, "step": 6790 }, { "epoch": 0.38986746227551783, "grad_norm": 3.6458562612425687, "learning_rate": 1.530323062797402e-05, "loss": 1.1824, "step": 6795 }, { "epoch": 0.3901543404670377, "grad_norm": 5.85167918646838, "learning_rate": 1.5294738143247148e-05, "loss": 1.3147, "step": 6800 }, { "epoch": 0.3904412186585576, "grad_norm": 3.7982908468434107, "learning_rate": 1.5286240348944996e-05, "loss": 1.2115, "step": 6805 }, { "epoch": 0.3907280968500775, "grad_norm": 3.643704453485706, "learning_rate": 1.5277737253589165e-05, "loss": 1.2649, "step": 6810 }, { "epoch": 0.3910149750415973, "grad_norm": 3.693548781963623, "learning_rate": 1.5269228865706584e-05, "loss": 1.3049, "step": 6815 }, { "epoch": 0.3913018532331172, "grad_norm": 3.5643620035659556, "learning_rate": 1.526071519382948e-05, "loss": 1.2574, "step": 6820 }, { "epoch": 0.3915887314246371, "grad_norm": 3.531807312343297, "learning_rate": 1.5252196246495382e-05, "loss": 1.2025, "step": 6825 }, { "epoch": 0.391875609616157, "grad_norm": 3.5396751219575733, "learning_rate": 1.5243672032247111e-05, "loss": 1.462, "step": 6830 }, { "epoch": 0.3921624878076769, "grad_norm": 3.2771077808447333, "learning_rate": 1.5235142559632767e-05, "loss": 1.2122, "step": 6835 }, { "epoch": 0.3924493659991967, "grad_norm": 3.532287013523869, "learning_rate": 1.5226607837205727e-05, "loss": 1.1909, "step": 6840 }, { "epoch": 0.3927362441907166, "grad_norm": 3.042561925999634, "learning_rate": 1.5218067873524627e-05, "loss": 1.2352, "step": 6845 }, { "epoch": 0.3930231223822365, "grad_norm": 3.7128808790727117, "learning_rate": 1.5209522677153364e-05, "loss": 1.2465, "step": 6850 }, { "epoch": 0.39331000057375637, "grad_norm": 3.7870284150532703, "learning_rate": 1.5200972256661075e-05, "loss": 1.3156, "step": 6855 }, { "epoch": 0.39359687876527627, "grad_norm": 3.7624968788341984, "learning_rate": 1.5192416620622146e-05, "loss": 1.3735, "step": 6860 }, { "epoch": 0.39388375695679617, "grad_norm": 3.786406680761254, "learning_rate": 1.5183855777616187e-05, "loss": 1.3295, "step": 6865 }, { "epoch": 0.394170635148316, "grad_norm": 3.7613439802676756, "learning_rate": 1.517528973622803e-05, "loss": 1.2747, "step": 6870 }, { "epoch": 0.3944575133398359, "grad_norm": 3.340850720915856, "learning_rate": 1.5166718505047722e-05, "loss": 1.2856, "step": 6875 }, { "epoch": 0.39474439153135576, "grad_norm": 4.047948893718045, "learning_rate": 1.5158142092670512e-05, "loss": 1.3204, "step": 6880 }, { "epoch": 0.39503126972287567, "grad_norm": 4.347228186284784, "learning_rate": 1.514956050769684e-05, "loss": 1.2337, "step": 6885 }, { "epoch": 0.39531814791439557, "grad_norm": 3.5815082780119583, "learning_rate": 1.5140973758732347e-05, "loss": 1.2619, "step": 6890 }, { "epoch": 0.3956050261059154, "grad_norm": 3.280356282679634, "learning_rate": 1.513238185438784e-05, "loss": 1.1616, "step": 6895 }, { "epoch": 0.3958919042974353, "grad_norm": 3.521325444157095, "learning_rate": 1.5123784803279301e-05, "loss": 1.209, "step": 6900 }, { "epoch": 0.3961787824889552, "grad_norm": 3.7303443120330133, "learning_rate": 1.5115182614027873e-05, "loss": 1.3058, "step": 6905 }, { "epoch": 0.39646566068047506, "grad_norm": 3.484743684988866, "learning_rate": 1.5106575295259846e-05, "loss": 1.1619, "step": 6910 }, { "epoch": 0.39675253887199496, "grad_norm": 4.384039026359902, "learning_rate": 1.5097962855606665e-05, "loss": 1.1966, "step": 6915 }, { "epoch": 0.3970394170635148, "grad_norm": 3.5416918926218446, "learning_rate": 1.5089345303704901e-05, "loss": 1.194, "step": 6920 }, { "epoch": 0.3973262952550347, "grad_norm": 3.435737056354799, "learning_rate": 1.5080722648196254e-05, "loss": 1.2757, "step": 6925 }, { "epoch": 0.3976131734465546, "grad_norm": 3.57562310993039, "learning_rate": 1.507209489772754e-05, "loss": 1.1597, "step": 6930 }, { "epoch": 0.39790005163807446, "grad_norm": 3.921848368302219, "learning_rate": 1.506346206095069e-05, "loss": 1.341, "step": 6935 }, { "epoch": 0.39818692982959436, "grad_norm": 3.474149746833519, "learning_rate": 1.5054824146522733e-05, "loss": 1.2802, "step": 6940 }, { "epoch": 0.39847380802111426, "grad_norm": 3.6571804582700156, "learning_rate": 1.5046181163105786e-05, "loss": 1.1991, "step": 6945 }, { "epoch": 0.3987606862126341, "grad_norm": 3.490907952228987, "learning_rate": 1.5037533119367054e-05, "loss": 1.3036, "step": 6950 }, { "epoch": 0.399047564404154, "grad_norm": 3.716134299761281, "learning_rate": 1.5028880023978811e-05, "loss": 1.1457, "step": 6955 }, { "epoch": 0.39933444259567386, "grad_norm": 3.940991912449712, "learning_rate": 1.5020221885618406e-05, "loss": 1.3037, "step": 6960 }, { "epoch": 0.39962132078719376, "grad_norm": 3.2093349192797604, "learning_rate": 1.5011558712968235e-05, "loss": 1.2573, "step": 6965 }, { "epoch": 0.39990819897871366, "grad_norm": 4.119284605030204, "learning_rate": 1.5002890514715751e-05, "loss": 1.1793, "step": 6970 }, { "epoch": 0.4001950771702335, "grad_norm": 3.5824380068659027, "learning_rate": 1.499421729955344e-05, "loss": 1.2186, "step": 6975 }, { "epoch": 0.4004819553617534, "grad_norm": 3.66645752575848, "learning_rate": 1.4985539076178819e-05, "loss": 1.2915, "step": 6980 }, { "epoch": 0.4007688335532733, "grad_norm": 3.738127063868495, "learning_rate": 1.4976855853294436e-05, "loss": 1.2324, "step": 6985 }, { "epoch": 0.40105571174479315, "grad_norm": 3.3602382800019677, "learning_rate": 1.4968167639607845e-05, "loss": 1.2999, "step": 6990 }, { "epoch": 0.40134258993631305, "grad_norm": 3.471344824552091, "learning_rate": 1.4959474443831599e-05, "loss": 1.2097, "step": 6995 }, { "epoch": 0.4016294681278329, "grad_norm": 3.8432716791614485, "learning_rate": 1.4950776274683264e-05, "loss": 1.2687, "step": 7000 }, { "epoch": 0.4019163463193528, "grad_norm": 3.8145732106462447, "learning_rate": 1.4942073140885377e-05, "loss": 1.2035, "step": 7005 }, { "epoch": 0.4022032245108727, "grad_norm": 3.8608899425127503, "learning_rate": 1.4933365051165463e-05, "loss": 1.2744, "step": 7010 }, { "epoch": 0.40249010270239255, "grad_norm": 3.427937045451496, "learning_rate": 1.4924652014256013e-05, "loss": 1.2022, "step": 7015 }, { "epoch": 0.40277698089391245, "grad_norm": 3.3102669151930106, "learning_rate": 1.4915934038894481e-05, "loss": 1.2904, "step": 7020 }, { "epoch": 0.40306385908543235, "grad_norm": 4.153596247869959, "learning_rate": 1.4907211133823274e-05, "loss": 1.2299, "step": 7025 }, { "epoch": 0.4033507372769522, "grad_norm": 3.215313638814001, "learning_rate": 1.4898483307789733e-05, "loss": 1.1633, "step": 7030 }, { "epoch": 0.4036376154684721, "grad_norm": 3.8018371568008966, "learning_rate": 1.4889750569546149e-05, "loss": 1.2045, "step": 7035 }, { "epoch": 0.40392449365999195, "grad_norm": 3.690958390030654, "learning_rate": 1.488101292784973e-05, "loss": 1.2337, "step": 7040 }, { "epoch": 0.40421137185151185, "grad_norm": 3.4893357996574212, "learning_rate": 1.48722703914626e-05, "loss": 1.2174, "step": 7045 }, { "epoch": 0.40449825004303175, "grad_norm": 3.8571787702600813, "learning_rate": 1.4863522969151797e-05, "loss": 1.1944, "step": 7050 }, { "epoch": 0.4047851282345516, "grad_norm": 3.7178613694463145, "learning_rate": 1.4854770669689254e-05, "loss": 1.2557, "step": 7055 }, { "epoch": 0.4050720064260715, "grad_norm": 3.944967920453989, "learning_rate": 1.4846013501851797e-05, "loss": 1.2739, "step": 7060 }, { "epoch": 0.4053588846175914, "grad_norm": 3.6267193140290774, "learning_rate": 1.4837251474421135e-05, "loss": 1.178, "step": 7065 }, { "epoch": 0.40564576280911124, "grad_norm": 3.7726120384043953, "learning_rate": 1.4828484596183845e-05, "loss": 1.1607, "step": 7070 }, { "epoch": 0.40593264100063114, "grad_norm": 3.9361079386830675, "learning_rate": 1.481971287593138e-05, "loss": 1.264, "step": 7075 }, { "epoch": 0.406219519192151, "grad_norm": 3.5226819814580717, "learning_rate": 1.481093632246003e-05, "loss": 1.1782, "step": 7080 }, { "epoch": 0.4065063973836709, "grad_norm": 3.8264507184366163, "learning_rate": 1.4802154944570952e-05, "loss": 1.2044, "step": 7085 }, { "epoch": 0.4067932755751908, "grad_norm": 3.250921175992061, "learning_rate": 1.4793368751070125e-05, "loss": 1.1712, "step": 7090 }, { "epoch": 0.40708015376671064, "grad_norm": 3.9182171482845556, "learning_rate": 1.4784577750768364e-05, "loss": 1.192, "step": 7095 }, { "epoch": 0.40736703195823054, "grad_norm": 3.209709901642492, "learning_rate": 1.477578195248131e-05, "loss": 1.2314, "step": 7100 }, { "epoch": 0.40765391014975044, "grad_norm": 4.532835202183789, "learning_rate": 1.47669813650294e-05, "loss": 1.3214, "step": 7105 }, { "epoch": 0.4079407883412703, "grad_norm": 3.593339723235817, "learning_rate": 1.4758175997237891e-05, "loss": 1.2836, "step": 7110 }, { "epoch": 0.4082276665327902, "grad_norm": 3.4274364857893427, "learning_rate": 1.4749365857936823e-05, "loss": 1.1589, "step": 7115 }, { "epoch": 0.40851454472431004, "grad_norm": 3.6172641457552035, "learning_rate": 1.4740550955961022e-05, "loss": 1.2261, "step": 7120 }, { "epoch": 0.40880142291582994, "grad_norm": 3.4776576784796034, "learning_rate": 1.4731731300150092e-05, "loss": 1.2466, "step": 7125 }, { "epoch": 0.40908830110734984, "grad_norm": 3.960426755015358, "learning_rate": 1.4722906899348401e-05, "loss": 1.1957, "step": 7130 }, { "epoch": 0.4093751792988697, "grad_norm": 3.4515795029208194, "learning_rate": 1.4714077762405085e-05, "loss": 1.2677, "step": 7135 }, { "epoch": 0.4096620574903896, "grad_norm": 4.0852357088586935, "learning_rate": 1.4705243898174017e-05, "loss": 1.2959, "step": 7140 }, { "epoch": 0.4099489356819095, "grad_norm": 3.3257273436463195, "learning_rate": 1.4696405315513816e-05, "loss": 1.2296, "step": 7145 }, { "epoch": 0.41023581387342933, "grad_norm": 3.7741789823205303, "learning_rate": 1.4687562023287833e-05, "loss": 1.231, "step": 7150 }, { "epoch": 0.41052269206494924, "grad_norm": 3.847021091179078, "learning_rate": 1.4678714030364143e-05, "loss": 1.2601, "step": 7155 }, { "epoch": 0.4108095702564691, "grad_norm": 3.0658589156680724, "learning_rate": 1.4669861345615533e-05, "loss": 1.1801, "step": 7160 }, { "epoch": 0.411096448447989, "grad_norm": 3.741492030099095, "learning_rate": 1.4661003977919493e-05, "loss": 1.2245, "step": 7165 }, { "epoch": 0.4113833266395089, "grad_norm": 3.5634106834408543, "learning_rate": 1.465214193615821e-05, "loss": 1.2756, "step": 7170 }, { "epoch": 0.41167020483102873, "grad_norm": 3.713474963771682, "learning_rate": 1.4643275229218563e-05, "loss": 1.2113, "step": 7175 }, { "epoch": 0.41195708302254863, "grad_norm": 3.37131942316876, "learning_rate": 1.4634403865992106e-05, "loss": 1.2501, "step": 7180 }, { "epoch": 0.41224396121406853, "grad_norm": 5.010365134013607, "learning_rate": 1.462552785537506e-05, "loss": 1.2684, "step": 7185 }, { "epoch": 0.4125308394055884, "grad_norm": 3.362474579606011, "learning_rate": 1.4616647206268306e-05, "loss": 1.1945, "step": 7190 }, { "epoch": 0.4128177175971083, "grad_norm": 4.182521313564589, "learning_rate": 1.4607761927577379e-05, "loss": 1.1933, "step": 7195 }, { "epoch": 0.4131045957886281, "grad_norm": 3.9157152215237487, "learning_rate": 1.4598872028212464e-05, "loss": 1.2188, "step": 7200 }, { "epoch": 0.413391473980148, "grad_norm": 4.395558405377868, "learning_rate": 1.4589977517088366e-05, "loss": 1.2559, "step": 7205 }, { "epoch": 0.41367835217166793, "grad_norm": 2.9711161250704685, "learning_rate": 1.458107840312452e-05, "loss": 1.2001, "step": 7210 }, { "epoch": 0.4139652303631878, "grad_norm": 3.7328015096254985, "learning_rate": 1.4572174695244977e-05, "loss": 1.1647, "step": 7215 }, { "epoch": 0.4142521085547077, "grad_norm": 3.2654551617042515, "learning_rate": 1.4563266402378399e-05, "loss": 1.2876, "step": 7220 }, { "epoch": 0.4145389867462276, "grad_norm": 3.776511297969427, "learning_rate": 1.455435353345804e-05, "loss": 1.2391, "step": 7225 }, { "epoch": 0.4148258649377474, "grad_norm": 3.2297402850706427, "learning_rate": 1.4545436097421745e-05, "loss": 1.2191, "step": 7230 }, { "epoch": 0.4151127431292673, "grad_norm": 3.4840782172413394, "learning_rate": 1.4536514103211941e-05, "loss": 1.298, "step": 7235 }, { "epoch": 0.41539962132078717, "grad_norm": 3.390429568833039, "learning_rate": 1.4527587559775617e-05, "loss": 1.2164, "step": 7240 }, { "epoch": 0.4156864995123071, "grad_norm": 4.14145340933831, "learning_rate": 1.451865647606434e-05, "loss": 1.2668, "step": 7245 }, { "epoch": 0.415973377703827, "grad_norm": 4.306148030685749, "learning_rate": 1.4509720861034213e-05, "loss": 1.1976, "step": 7250 }, { "epoch": 0.4162602558953468, "grad_norm": 3.2491114586707326, "learning_rate": 1.4500780723645897e-05, "loss": 1.1743, "step": 7255 }, { "epoch": 0.4165471340868667, "grad_norm": 4.007003843655357, "learning_rate": 1.4491836072864579e-05, "loss": 1.2878, "step": 7260 }, { "epoch": 0.4168340122783866, "grad_norm": 3.374141318405852, "learning_rate": 1.448288691765997e-05, "loss": 1.2206, "step": 7265 }, { "epoch": 0.41712089046990647, "grad_norm": 3.7475227625821854, "learning_rate": 1.447393326700631e-05, "loss": 1.2244, "step": 7270 }, { "epoch": 0.41740776866142637, "grad_norm": 3.416842308257622, "learning_rate": 1.446497512988234e-05, "loss": 1.2606, "step": 7275 }, { "epoch": 0.4176946468529462, "grad_norm": 3.8663875822672864, "learning_rate": 1.4456012515271294e-05, "loss": 1.2096, "step": 7280 }, { "epoch": 0.4179815250444661, "grad_norm": 3.433046331645397, "learning_rate": 1.4447045432160911e-05, "loss": 1.1464, "step": 7285 }, { "epoch": 0.418268403235986, "grad_norm": 3.525716148988708, "learning_rate": 1.443807388954339e-05, "loss": 1.2367, "step": 7290 }, { "epoch": 0.41855528142750587, "grad_norm": 3.516835066210468, "learning_rate": 1.4429097896415424e-05, "loss": 1.2348, "step": 7295 }, { "epoch": 0.41884215961902577, "grad_norm": 4.061799638421397, "learning_rate": 1.4420117461778156e-05, "loss": 1.1691, "step": 7300 }, { "epoch": 0.41912903781054567, "grad_norm": 3.304097368207438, "learning_rate": 1.4411132594637184e-05, "loss": 1.1693, "step": 7305 }, { "epoch": 0.4194159160020655, "grad_norm": 3.8427363648738857, "learning_rate": 1.4402143304002559e-05, "loss": 1.2439, "step": 7310 }, { "epoch": 0.4197027941935854, "grad_norm": 3.421728303372576, "learning_rate": 1.4393149598888752e-05, "loss": 1.2657, "step": 7315 }, { "epoch": 0.41998967238510526, "grad_norm": 3.56365449578167, "learning_rate": 1.438415148831468e-05, "loss": 1.2842, "step": 7320 }, { "epoch": 0.42027655057662516, "grad_norm": 3.7795674286363936, "learning_rate": 1.4375148981303663e-05, "loss": 1.2779, "step": 7325 }, { "epoch": 0.42056342876814506, "grad_norm": 3.5152116574015513, "learning_rate": 1.4366142086883437e-05, "loss": 1.2611, "step": 7330 }, { "epoch": 0.4208503069596649, "grad_norm": 3.414788920559177, "learning_rate": 1.4357130814086136e-05, "loss": 1.2069, "step": 7335 }, { "epoch": 0.4211371851511848, "grad_norm": 3.340324761349587, "learning_rate": 1.4348115171948283e-05, "loss": 1.2224, "step": 7340 }, { "epoch": 0.4214240633427047, "grad_norm": 3.3043830136788896, "learning_rate": 1.4339095169510786e-05, "loss": 1.2172, "step": 7345 }, { "epoch": 0.42171094153422456, "grad_norm": 3.661319658816841, "learning_rate": 1.4330070815818924e-05, "loss": 1.2406, "step": 7350 }, { "epoch": 0.42199781972574446, "grad_norm": 3.802269331614734, "learning_rate": 1.4321042119922337e-05, "loss": 1.1956, "step": 7355 }, { "epoch": 0.4222846979172643, "grad_norm": 3.5430943508251236, "learning_rate": 1.4312009090875025e-05, "loss": 1.258, "step": 7360 }, { "epoch": 0.4225715761087842, "grad_norm": 3.4187698747141977, "learning_rate": 1.4302971737735325e-05, "loss": 1.2513, "step": 7365 }, { "epoch": 0.4228584543003041, "grad_norm": 3.5917491030337745, "learning_rate": 1.429393006956592e-05, "loss": 1.2196, "step": 7370 }, { "epoch": 0.42314533249182396, "grad_norm": 3.7099832154397063, "learning_rate": 1.428488409543381e-05, "loss": 1.2271, "step": 7375 }, { "epoch": 0.42343221068334386, "grad_norm": 4.182229008020983, "learning_rate": 1.4275833824410322e-05, "loss": 1.1973, "step": 7380 }, { "epoch": 0.42371908887486376, "grad_norm": 3.871847724760893, "learning_rate": 1.4266779265571087e-05, "loss": 1.2353, "step": 7385 }, { "epoch": 0.4240059670663836, "grad_norm": 3.668373522072272, "learning_rate": 1.4257720427996035e-05, "loss": 1.2577, "step": 7390 }, { "epoch": 0.4242928452579035, "grad_norm": 4.209389196028504, "learning_rate": 1.4248657320769392e-05, "loss": 1.2658, "step": 7395 }, { "epoch": 0.42457972344942335, "grad_norm": 3.836815696579933, "learning_rate": 1.4239589952979662e-05, "loss": 1.3607, "step": 7400 }, { "epoch": 0.42486660164094325, "grad_norm": 3.6064143877811103, "learning_rate": 1.4230518333719617e-05, "loss": 1.1828, "step": 7405 }, { "epoch": 0.42515347983246315, "grad_norm": 3.8453398508971492, "learning_rate": 1.4221442472086304e-05, "loss": 1.2623, "step": 7410 }, { "epoch": 0.425440358023983, "grad_norm": 3.6637761386213197, "learning_rate": 1.421236237718101e-05, "loss": 1.1856, "step": 7415 }, { "epoch": 0.4257272362155029, "grad_norm": 3.335917559602314, "learning_rate": 1.4203278058109282e-05, "loss": 1.307, "step": 7420 }, { "epoch": 0.4260141144070228, "grad_norm": 4.098486459343832, "learning_rate": 1.4194189523980892e-05, "loss": 1.3394, "step": 7425 }, { "epoch": 0.42630099259854265, "grad_norm": 3.4194746551067805, "learning_rate": 1.4185096783909838e-05, "loss": 1.2439, "step": 7430 }, { "epoch": 0.42658787079006255, "grad_norm": 3.735697613314367, "learning_rate": 1.4175999847014348e-05, "loss": 1.2121, "step": 7435 }, { "epoch": 0.4268747489815824, "grad_norm": 3.19434389940862, "learning_rate": 1.4166898722416846e-05, "loss": 1.1866, "step": 7440 }, { "epoch": 0.4271616271731023, "grad_norm": 3.3435425432228167, "learning_rate": 1.4157793419243964e-05, "loss": 1.2339, "step": 7445 }, { "epoch": 0.4274485053646222, "grad_norm": 3.704171740022002, "learning_rate": 1.4148683946626516e-05, "loss": 1.1647, "step": 7450 }, { "epoch": 0.42773538355614205, "grad_norm": 4.217406014871696, "learning_rate": 1.4139570313699503e-05, "loss": 1.1821, "step": 7455 }, { "epoch": 0.42802226174766195, "grad_norm": 3.652353949271945, "learning_rate": 1.4130452529602097e-05, "loss": 1.2115, "step": 7460 }, { "epoch": 0.42830913993918185, "grad_norm": 3.900945346425537, "learning_rate": 1.4121330603477633e-05, "loss": 1.2297, "step": 7465 }, { "epoch": 0.4285960181307017, "grad_norm": 3.7421216727015794, "learning_rate": 1.4112204544473598e-05, "loss": 1.2204, "step": 7470 }, { "epoch": 0.4288828963222216, "grad_norm": 3.5991582748292017, "learning_rate": 1.4103074361741625e-05, "loss": 1.2652, "step": 7475 }, { "epoch": 0.42916977451374144, "grad_norm": 3.7993808090926575, "learning_rate": 1.409394006443748e-05, "loss": 1.2424, "step": 7480 }, { "epoch": 0.42945665270526134, "grad_norm": 4.193121714211052, "learning_rate": 1.408480166172106e-05, "loss": 1.2336, "step": 7485 }, { "epoch": 0.42974353089678125, "grad_norm": 4.5072661663842775, "learning_rate": 1.4075659162756372e-05, "loss": 1.2868, "step": 7490 }, { "epoch": 0.4300304090883011, "grad_norm": 3.8912011249397804, "learning_rate": 1.4066512576711538e-05, "loss": 1.2526, "step": 7495 }, { "epoch": 0.430317287279821, "grad_norm": 3.907878380001484, "learning_rate": 1.405736191275877e-05, "loss": 1.256, "step": 7500 }, { "epoch": 0.4306041654713409, "grad_norm": 3.362431597411126, "learning_rate": 1.4048207180074383e-05, "loss": 1.2264, "step": 7505 }, { "epoch": 0.43089104366286074, "grad_norm": 3.491402894271633, "learning_rate": 1.4039048387838757e-05, "loss": 1.213, "step": 7510 }, { "epoch": 0.43117792185438064, "grad_norm": 3.337856378086455, "learning_rate": 1.4029885545236348e-05, "loss": 1.1726, "step": 7515 }, { "epoch": 0.4314648000459005, "grad_norm": 4.427934845090413, "learning_rate": 1.402071866145568e-05, "loss": 1.2526, "step": 7520 }, { "epoch": 0.4317516782374204, "grad_norm": 3.168954286717132, "learning_rate": 1.401154774568932e-05, "loss": 1.1687, "step": 7525 }, { "epoch": 0.4320385564289403, "grad_norm": 3.7772662718259915, "learning_rate": 1.4002372807133886e-05, "loss": 1.1561, "step": 7530 }, { "epoch": 0.43232543462046014, "grad_norm": 3.580818349651549, "learning_rate": 1.3993193854990028e-05, "loss": 1.2422, "step": 7535 }, { "epoch": 0.43261231281198004, "grad_norm": 3.454557643258713, "learning_rate": 1.3984010898462417e-05, "loss": 1.2051, "step": 7540 }, { "epoch": 0.43289919100349994, "grad_norm": 3.5389687822096496, "learning_rate": 1.3974823946759744e-05, "loss": 1.2201, "step": 7545 }, { "epoch": 0.4331860691950198, "grad_norm": 3.7035373569921197, "learning_rate": 1.3965633009094702e-05, "loss": 1.2252, "step": 7550 }, { "epoch": 0.4334729473865397, "grad_norm": 3.3866964184935044, "learning_rate": 1.3956438094683987e-05, "loss": 1.2744, "step": 7555 }, { "epoch": 0.43375982557805953, "grad_norm": 3.6902588145236948, "learning_rate": 1.3947239212748279e-05, "loss": 1.3079, "step": 7560 }, { "epoch": 0.43404670376957943, "grad_norm": 3.5462945173206832, "learning_rate": 1.3938036372512235e-05, "loss": 1.147, "step": 7565 }, { "epoch": 0.43433358196109934, "grad_norm": 3.420473226496287, "learning_rate": 1.392882958320449e-05, "loss": 1.1507, "step": 7570 }, { "epoch": 0.4346204601526192, "grad_norm": 3.2278755474480283, "learning_rate": 1.3919618854057626e-05, "loss": 1.1914, "step": 7575 }, { "epoch": 0.4349073383441391, "grad_norm": 4.361523156079659, "learning_rate": 1.3910404194308188e-05, "loss": 1.2612, "step": 7580 }, { "epoch": 0.435194216535659, "grad_norm": 3.218951017209154, "learning_rate": 1.3901185613196655e-05, "loss": 1.2115, "step": 7585 }, { "epoch": 0.43548109472717883, "grad_norm": 3.296665047964582, "learning_rate": 1.389196311996744e-05, "loss": 1.1645, "step": 7590 }, { "epoch": 0.43576797291869873, "grad_norm": 3.9738885992887227, "learning_rate": 1.3882736723868883e-05, "loss": 1.1801, "step": 7595 }, { "epoch": 0.4360548511102186, "grad_norm": 3.509820437434482, "learning_rate": 1.387350643415323e-05, "loss": 1.1496, "step": 7600 }, { "epoch": 0.4363417293017385, "grad_norm": 3.4018780378290114, "learning_rate": 1.386427226007664e-05, "loss": 1.1877, "step": 7605 }, { "epoch": 0.4366286074932584, "grad_norm": 3.656302473246249, "learning_rate": 1.3855034210899162e-05, "loss": 1.2186, "step": 7610 }, { "epoch": 0.4369154856847782, "grad_norm": 3.3075183268667976, "learning_rate": 1.3845792295884735e-05, "loss": 1.2124, "step": 7615 }, { "epoch": 0.43720236387629813, "grad_norm": 3.4395243014985235, "learning_rate": 1.3836546524301171e-05, "loss": 1.2156, "step": 7620 }, { "epoch": 0.43748924206781803, "grad_norm": 3.593549314865416, "learning_rate": 1.3827296905420144e-05, "loss": 1.279, "step": 7625 }, { "epoch": 0.4377761202593379, "grad_norm": 3.631003081048878, "learning_rate": 1.3818043448517201e-05, "loss": 1.3087, "step": 7630 }, { "epoch": 0.4380629984508578, "grad_norm": 3.5814582878802304, "learning_rate": 1.3808786162871728e-05, "loss": 1.2602, "step": 7635 }, { "epoch": 0.4383498766423776, "grad_norm": 3.454356137827576, "learning_rate": 1.3799525057766946e-05, "loss": 1.2884, "step": 7640 }, { "epoch": 0.4386367548338975, "grad_norm": 3.9541128349476025, "learning_rate": 1.3790260142489922e-05, "loss": 1.1844, "step": 7645 }, { "epoch": 0.4389236330254174, "grad_norm": 3.460581502833067, "learning_rate": 1.3780991426331523e-05, "loss": 1.2202, "step": 7650 }, { "epoch": 0.43921051121693727, "grad_norm": 3.6825103485107546, "learning_rate": 1.3771718918586445e-05, "loss": 1.1859, "step": 7655 }, { "epoch": 0.4394973894084572, "grad_norm": 3.774038102235499, "learning_rate": 1.376244262855318e-05, "loss": 1.2173, "step": 7660 }, { "epoch": 0.4397842675999771, "grad_norm": 3.932067855104383, "learning_rate": 1.3753162565534004e-05, "loss": 1.2368, "step": 7665 }, { "epoch": 0.4400711457914969, "grad_norm": 3.2340769578221926, "learning_rate": 1.3743878738834997e-05, "loss": 1.2221, "step": 7670 }, { "epoch": 0.4403580239830168, "grad_norm": 2.989691390102699, "learning_rate": 1.3734591157765993e-05, "loss": 1.2153, "step": 7675 }, { "epoch": 0.44064490217453667, "grad_norm": 4.138866751119863, "learning_rate": 1.3725299831640601e-05, "loss": 1.1706, "step": 7680 }, { "epoch": 0.44093178036605657, "grad_norm": 3.8242119619296453, "learning_rate": 1.3716004769776188e-05, "loss": 1.187, "step": 7685 }, { "epoch": 0.44121865855757647, "grad_norm": 3.3135980565495573, "learning_rate": 1.3706705981493853e-05, "loss": 1.1901, "step": 7690 }, { "epoch": 0.4415055367490963, "grad_norm": 3.3175390920280283, "learning_rate": 1.3697403476118453e-05, "loss": 1.1702, "step": 7695 }, { "epoch": 0.4417924149406162, "grad_norm": 4.904813335701704, "learning_rate": 1.3688097262978555e-05, "loss": 1.1841, "step": 7700 }, { "epoch": 0.4420792931321361, "grad_norm": 3.167248121383652, "learning_rate": 1.3678787351406451e-05, "loss": 1.1694, "step": 7705 }, { "epoch": 0.44236617132365597, "grad_norm": 3.2947857614500053, "learning_rate": 1.3669473750738142e-05, "loss": 1.2157, "step": 7710 }, { "epoch": 0.44265304951517587, "grad_norm": 3.014642429764263, "learning_rate": 1.3660156470313326e-05, "loss": 1.2128, "step": 7715 }, { "epoch": 0.4429399277066957, "grad_norm": 3.6694985854430757, "learning_rate": 1.3650835519475397e-05, "loss": 1.2722, "step": 7720 }, { "epoch": 0.4432268058982156, "grad_norm": 4.289026327700578, "learning_rate": 1.364151090757142e-05, "loss": 1.2863, "step": 7725 }, { "epoch": 0.4435136840897355, "grad_norm": 3.179150826452536, "learning_rate": 1.3632182643952142e-05, "loss": 1.1896, "step": 7730 }, { "epoch": 0.44380056228125536, "grad_norm": 3.5007016758766025, "learning_rate": 1.3622850737971963e-05, "loss": 1.2364, "step": 7735 }, { "epoch": 0.44408744047277526, "grad_norm": 4.007150125557757, "learning_rate": 1.3613515198988938e-05, "loss": 1.2845, "step": 7740 }, { "epoch": 0.44437431866429516, "grad_norm": 3.4750515988826076, "learning_rate": 1.360417603636477e-05, "loss": 1.1999, "step": 7745 }, { "epoch": 0.444661196855815, "grad_norm": 3.8205263793602575, "learning_rate": 1.359483325946479e-05, "loss": 1.254, "step": 7750 }, { "epoch": 0.4449480750473349, "grad_norm": 3.4539023159118947, "learning_rate": 1.3585486877657958e-05, "loss": 1.2306, "step": 7755 }, { "epoch": 0.44523495323885476, "grad_norm": 3.5007828955752225, "learning_rate": 1.3576136900316844e-05, "loss": 1.1845, "step": 7760 }, { "epoch": 0.44552183143037466, "grad_norm": 3.7345804862497527, "learning_rate": 1.3566783336817629e-05, "loss": 1.2992, "step": 7765 }, { "epoch": 0.44580870962189456, "grad_norm": 3.4453706469520613, "learning_rate": 1.3557426196540081e-05, "loss": 1.2483, "step": 7770 }, { "epoch": 0.4460955878134144, "grad_norm": 3.397805528870216, "learning_rate": 1.3548065488867572e-05, "loss": 1.2285, "step": 7775 }, { "epoch": 0.4463824660049343, "grad_norm": 3.6284092120958404, "learning_rate": 1.3538701223187031e-05, "loss": 1.2287, "step": 7780 }, { "epoch": 0.4466693441964542, "grad_norm": 3.7365245065296593, "learning_rate": 1.352933340888897e-05, "loss": 1.1446, "step": 7785 }, { "epoch": 0.44695622238797406, "grad_norm": 3.685559165732207, "learning_rate": 1.351996205536745e-05, "loss": 1.2466, "step": 7790 }, { "epoch": 0.44724310057949396, "grad_norm": 3.8330763957940546, "learning_rate": 1.3510587172020091e-05, "loss": 1.1835, "step": 7795 }, { "epoch": 0.4475299787710138, "grad_norm": 3.7275702743362604, "learning_rate": 1.3501208768248042e-05, "loss": 1.2311, "step": 7800 }, { "epoch": 0.4478168569625337, "grad_norm": 3.3361813096443345, "learning_rate": 1.349182685345599e-05, "loss": 1.1689, "step": 7805 }, { "epoch": 0.4481037351540536, "grad_norm": 3.312695058291792, "learning_rate": 1.3482441437052137e-05, "loss": 1.3172, "step": 7810 }, { "epoch": 0.44839061334557345, "grad_norm": 3.4183574901261435, "learning_rate": 1.3473052528448203e-05, "loss": 1.2034, "step": 7815 }, { "epoch": 0.44867749153709335, "grad_norm": 4.272749579735019, "learning_rate": 1.3463660137059406e-05, "loss": 1.1797, "step": 7820 }, { "epoch": 0.44896436972861326, "grad_norm": 3.6678134071111863, "learning_rate": 1.3454264272304461e-05, "loss": 1.1279, "step": 7825 }, { "epoch": 0.4492512479201331, "grad_norm": 3.15260411543669, "learning_rate": 1.3444864943605551e-05, "loss": 1.1772, "step": 7830 }, { "epoch": 0.449538126111653, "grad_norm": 3.2182352230431572, "learning_rate": 1.3435462160388353e-05, "loss": 1.1373, "step": 7835 }, { "epoch": 0.44982500430317285, "grad_norm": 3.37912059763861, "learning_rate": 1.3426055932081998e-05, "loss": 1.1593, "step": 7840 }, { "epoch": 0.45011188249469275, "grad_norm": 4.847647209777198, "learning_rate": 1.3416646268119073e-05, "loss": 1.2662, "step": 7845 }, { "epoch": 0.45039876068621265, "grad_norm": 3.5676116370555104, "learning_rate": 1.3407233177935608e-05, "loss": 1.2618, "step": 7850 }, { "epoch": 0.4506856388777325, "grad_norm": 3.9056400816761543, "learning_rate": 1.339781667097107e-05, "loss": 1.2095, "step": 7855 }, { "epoch": 0.4509725170692524, "grad_norm": 3.7340007855397745, "learning_rate": 1.3388396756668357e-05, "loss": 1.148, "step": 7860 }, { "epoch": 0.4512593952607723, "grad_norm": 3.707050185148197, "learning_rate": 1.3378973444473778e-05, "loss": 1.1687, "step": 7865 }, { "epoch": 0.45154627345229215, "grad_norm": 3.365877412697334, "learning_rate": 1.3369546743837052e-05, "loss": 1.2052, "step": 7870 }, { "epoch": 0.45183315164381205, "grad_norm": 3.1051881115264783, "learning_rate": 1.3360116664211294e-05, "loss": 1.1217, "step": 7875 }, { "epoch": 0.4521200298353319, "grad_norm": 3.432355208726948, "learning_rate": 1.3350683215053013e-05, "loss": 1.1644, "step": 7880 }, { "epoch": 0.4524069080268518, "grad_norm": 3.554592583023602, "learning_rate": 1.3341246405822089e-05, "loss": 1.1512, "step": 7885 }, { "epoch": 0.4526937862183717, "grad_norm": 3.115407405769473, "learning_rate": 1.3331806245981776e-05, "loss": 1.1492, "step": 7890 }, { "epoch": 0.45298066440989154, "grad_norm": 3.3569214075906957, "learning_rate": 1.332236274499869e-05, "loss": 1.2728, "step": 7895 }, { "epoch": 0.45326754260141144, "grad_norm": 3.5589682914479126, "learning_rate": 1.3312915912342793e-05, "loss": 1.2453, "step": 7900 }, { "epoch": 0.45355442079293135, "grad_norm": 3.4978316041422906, "learning_rate": 1.330346575748739e-05, "loss": 1.1693, "step": 7905 }, { "epoch": 0.4538412989844512, "grad_norm": 3.5325110736129566, "learning_rate": 1.3294012289909115e-05, "loss": 1.2109, "step": 7910 }, { "epoch": 0.4541281771759711, "grad_norm": 3.3874587405857586, "learning_rate": 1.3284555519087935e-05, "loss": 1.2106, "step": 7915 }, { "epoch": 0.45441505536749094, "grad_norm": 3.4176893864528846, "learning_rate": 1.3275095454507113e-05, "loss": 1.187, "step": 7920 }, { "epoch": 0.45470193355901084, "grad_norm": 3.7668582113627935, "learning_rate": 1.3265632105653221e-05, "loss": 1.1901, "step": 7925 }, { "epoch": 0.45498881175053074, "grad_norm": 3.8304539394728407, "learning_rate": 1.3256165482016136e-05, "loss": 1.2274, "step": 7930 }, { "epoch": 0.4552756899420506, "grad_norm": 3.724817378788915, "learning_rate": 1.3246695593089001e-05, "loss": 1.2364, "step": 7935 }, { "epoch": 0.4555625681335705, "grad_norm": 3.565874052155941, "learning_rate": 1.3237222448368247e-05, "loss": 1.2492, "step": 7940 }, { "epoch": 0.4558494463250904, "grad_norm": 3.5921519709549234, "learning_rate": 1.3227746057353561e-05, "loss": 1.2539, "step": 7945 }, { "epoch": 0.45613632451661024, "grad_norm": 3.567058743124596, "learning_rate": 1.321826642954789e-05, "loss": 1.2113, "step": 7950 }, { "epoch": 0.45642320270813014, "grad_norm": 3.346358502981863, "learning_rate": 1.3208783574457431e-05, "loss": 1.2482, "step": 7955 }, { "epoch": 0.45671008089965, "grad_norm": 3.7132093316839434, "learning_rate": 1.3199297501591604e-05, "loss": 1.2289, "step": 7960 }, { "epoch": 0.4569969590911699, "grad_norm": 3.3660201663170772, "learning_rate": 1.3189808220463071e-05, "loss": 1.2085, "step": 7965 }, { "epoch": 0.4572838372826898, "grad_norm": 3.584911660891008, "learning_rate": 1.3180315740587702e-05, "loss": 1.2963, "step": 7970 }, { "epoch": 0.45757071547420963, "grad_norm": 3.0058657647500784, "learning_rate": 1.3170820071484574e-05, "loss": 1.1818, "step": 7975 }, { "epoch": 0.45785759366572953, "grad_norm": 3.358670497035153, "learning_rate": 1.3161321222675971e-05, "loss": 1.1873, "step": 7980 }, { "epoch": 0.45814447185724944, "grad_norm": 3.164255012483237, "learning_rate": 1.3151819203687355e-05, "loss": 1.1256, "step": 7985 }, { "epoch": 0.4584313500487693, "grad_norm": 3.970281246440322, "learning_rate": 1.3142314024047375e-05, "loss": 1.1701, "step": 7990 }, { "epoch": 0.4587182282402892, "grad_norm": 3.4300405908115605, "learning_rate": 1.3132805693287844e-05, "loss": 1.1764, "step": 7995 }, { "epoch": 0.45900510643180903, "grad_norm": 3.8055383757035197, "learning_rate": 1.312329422094374e-05, "loss": 1.2586, "step": 8000 }, { "epoch": 0.45929198462332893, "grad_norm": 3.502292961175685, "learning_rate": 1.3113779616553189e-05, "loss": 1.1297, "step": 8005 }, { "epoch": 0.45957886281484883, "grad_norm": 3.478663502497207, "learning_rate": 1.3104261889657455e-05, "loss": 1.208, "step": 8010 }, { "epoch": 0.4598657410063687, "grad_norm": 3.3693678988229863, "learning_rate": 1.3094741049800937e-05, "loss": 1.1627, "step": 8015 }, { "epoch": 0.4601526191978886, "grad_norm": 3.3059573340579167, "learning_rate": 1.3085217106531154e-05, "loss": 1.2784, "step": 8020 }, { "epoch": 0.4604394973894085, "grad_norm": 3.1776039172310027, "learning_rate": 1.307569006939874e-05, "loss": 1.2244, "step": 8025 }, { "epoch": 0.4607263755809283, "grad_norm": 4.509375868139583, "learning_rate": 1.3066159947957426e-05, "loss": 1.2461, "step": 8030 }, { "epoch": 0.46101325377244823, "grad_norm": 3.8156802064346196, "learning_rate": 1.305662675176404e-05, "loss": 1.215, "step": 8035 }, { "epoch": 0.4613001319639681, "grad_norm": 3.9544127792604478, "learning_rate": 1.3047090490378495e-05, "loss": 1.1819, "step": 8040 }, { "epoch": 0.461587010155488, "grad_norm": 3.2211729905294657, "learning_rate": 1.3037551173363775e-05, "loss": 1.1768, "step": 8045 }, { "epoch": 0.4618738883470079, "grad_norm": 3.2953881647446663, "learning_rate": 1.3028008810285924e-05, "loss": 1.2253, "step": 8050 }, { "epoch": 0.4621607665385277, "grad_norm": 3.302442434292357, "learning_rate": 1.3018463410714048e-05, "loss": 1.113, "step": 8055 }, { "epoch": 0.4624476447300476, "grad_norm": 3.8925807017902163, "learning_rate": 1.3008914984220296e-05, "loss": 1.1666, "step": 8060 }, { "epoch": 0.4627345229215675, "grad_norm": 4.02294116108428, "learning_rate": 1.2999363540379853e-05, "loss": 1.175, "step": 8065 }, { "epoch": 0.46302140111308737, "grad_norm": 3.366358449745895, "learning_rate": 1.2989809088770923e-05, "loss": 1.1899, "step": 8070 }, { "epoch": 0.4633082793046073, "grad_norm": 3.4016380641417476, "learning_rate": 1.2980251638974734e-05, "loss": 1.2509, "step": 8075 }, { "epoch": 0.4635951574961271, "grad_norm": 4.385247992970491, "learning_rate": 1.297069120057552e-05, "loss": 1.2923, "step": 8080 }, { "epoch": 0.463882035687647, "grad_norm": 3.2694223066436727, "learning_rate": 1.296112778316051e-05, "loss": 1.2068, "step": 8085 }, { "epoch": 0.4641689138791669, "grad_norm": 3.3936752443145637, "learning_rate": 1.2951561396319918e-05, "loss": 1.1963, "step": 8090 }, { "epoch": 0.46445579207068677, "grad_norm": 3.5262670851868965, "learning_rate": 1.2941992049646936e-05, "loss": 1.2336, "step": 8095 }, { "epoch": 0.46474267026220667, "grad_norm": 3.5303198271635616, "learning_rate": 1.2932419752737735e-05, "loss": 1.2545, "step": 8100 }, { "epoch": 0.46502954845372657, "grad_norm": 3.548401507719974, "learning_rate": 1.2922844515191427e-05, "loss": 1.1656, "step": 8105 }, { "epoch": 0.4653164266452464, "grad_norm": 3.511251671096042, "learning_rate": 1.2913266346610088e-05, "loss": 1.1786, "step": 8110 }, { "epoch": 0.4656033048367663, "grad_norm": 3.7048748880792686, "learning_rate": 1.2903685256598722e-05, "loss": 1.1993, "step": 8115 }, { "epoch": 0.46589018302828616, "grad_norm": 3.011221661595092, "learning_rate": 1.2894101254765268e-05, "loss": 1.2124, "step": 8120 }, { "epoch": 0.46617706121980607, "grad_norm": 3.0938219972556382, "learning_rate": 1.2884514350720587e-05, "loss": 1.1332, "step": 8125 }, { "epoch": 0.46646393941132597, "grad_norm": 4.200010342905014, "learning_rate": 1.2874924554078447e-05, "loss": 1.2047, "step": 8130 }, { "epoch": 0.4667508176028458, "grad_norm": 3.173501327396619, "learning_rate": 1.2865331874455517e-05, "loss": 1.1797, "step": 8135 }, { "epoch": 0.4670376957943657, "grad_norm": 3.251634244493627, "learning_rate": 1.2855736321471361e-05, "loss": 1.2701, "step": 8140 }, { "epoch": 0.4673245739858856, "grad_norm": 3.215406649408876, "learning_rate": 1.2846137904748415e-05, "loss": 1.2205, "step": 8145 }, { "epoch": 0.46761145217740546, "grad_norm": 4.31847986082044, "learning_rate": 1.2836536633911995e-05, "loss": 1.2251, "step": 8150 }, { "epoch": 0.46789833036892536, "grad_norm": 3.4219785131058607, "learning_rate": 1.2826932518590282e-05, "loss": 1.199, "step": 8155 }, { "epoch": 0.4681852085604452, "grad_norm": 3.3951818482830967, "learning_rate": 1.2817325568414299e-05, "loss": 1.2416, "step": 8160 }, { "epoch": 0.4684720867519651, "grad_norm": 3.232475225294129, "learning_rate": 1.280771579301792e-05, "loss": 1.2312, "step": 8165 }, { "epoch": 0.468758964943485, "grad_norm": 3.3591486538095823, "learning_rate": 1.2798103202037841e-05, "loss": 1.2413, "step": 8170 }, { "epoch": 0.46904584313500486, "grad_norm": 6.008471334168248, "learning_rate": 1.2788487805113603e-05, "loss": 1.1459, "step": 8175 }, { "epoch": 0.46933272132652476, "grad_norm": 4.159601495396801, "learning_rate": 1.277886961188754e-05, "loss": 1.2785, "step": 8180 }, { "epoch": 0.46961959951804466, "grad_norm": 3.8625068324584184, "learning_rate": 1.2769248632004797e-05, "loss": 1.2852, "step": 8185 }, { "epoch": 0.4699064777095645, "grad_norm": 2.935717416531001, "learning_rate": 1.2759624875113319e-05, "loss": 1.2481, "step": 8190 }, { "epoch": 0.4701933559010844, "grad_norm": 3.413315074751343, "learning_rate": 1.2749998350863826e-05, "loss": 1.1791, "step": 8195 }, { "epoch": 0.47048023409260425, "grad_norm": 3.6105411307887807, "learning_rate": 1.2740369068909821e-05, "loss": 1.2036, "step": 8200 }, { "epoch": 0.47076711228412416, "grad_norm": 3.839015302813599, "learning_rate": 1.2730737038907569e-05, "loss": 1.2015, "step": 8205 }, { "epoch": 0.47105399047564406, "grad_norm": 6.43438655815783, "learning_rate": 1.2721102270516089e-05, "loss": 1.2436, "step": 8210 }, { "epoch": 0.4713408686671639, "grad_norm": 3.238943068589488, "learning_rate": 1.2711464773397153e-05, "loss": 1.1797, "step": 8215 }, { "epoch": 0.4716277468586838, "grad_norm": 3.666192111918968, "learning_rate": 1.270182455721526e-05, "loss": 1.1429, "step": 8220 }, { "epoch": 0.4719146250502037, "grad_norm": 3.565369197400118, "learning_rate": 1.2692181631637644e-05, "loss": 1.1642, "step": 8225 }, { "epoch": 0.47220150324172355, "grad_norm": 3.8496229915904303, "learning_rate": 1.2682536006334249e-05, "loss": 1.1845, "step": 8230 }, { "epoch": 0.47248838143324345, "grad_norm": 3.322646990134722, "learning_rate": 1.2672887690977733e-05, "loss": 1.1971, "step": 8235 }, { "epoch": 0.4727752596247633, "grad_norm": 3.3035054525091048, "learning_rate": 1.2663236695243448e-05, "loss": 1.207, "step": 8240 }, { "epoch": 0.4730621378162832, "grad_norm": 3.4445298657262327, "learning_rate": 1.265358302880943e-05, "loss": 1.1522, "step": 8245 }, { "epoch": 0.4733490160078031, "grad_norm": 3.3129095456613222, "learning_rate": 1.2643926701356403e-05, "loss": 1.2551, "step": 8250 }, { "epoch": 0.47363589419932295, "grad_norm": 3.382714983989345, "learning_rate": 1.2634267722567753e-05, "loss": 1.1849, "step": 8255 }, { "epoch": 0.47392277239084285, "grad_norm": 3.260184694227071, "learning_rate": 1.2624606102129518e-05, "loss": 1.212, "step": 8260 }, { "epoch": 0.47420965058236275, "grad_norm": 3.370431406917315, "learning_rate": 1.2614941849730404e-05, "loss": 1.1633, "step": 8265 }, { "epoch": 0.4744965287738826, "grad_norm": 3.494883010902085, "learning_rate": 1.2605274975061737e-05, "loss": 1.2647, "step": 8270 }, { "epoch": 0.4747834069654025, "grad_norm": 3.7533493720738527, "learning_rate": 1.2595605487817481e-05, "loss": 1.2277, "step": 8275 }, { "epoch": 0.47507028515692235, "grad_norm": 3.193312397455943, "learning_rate": 1.2585933397694225e-05, "loss": 1.203, "step": 8280 }, { "epoch": 0.47535716334844225, "grad_norm": 2.9661514567918763, "learning_rate": 1.2576258714391156e-05, "loss": 1.1741, "step": 8285 }, { "epoch": 0.47564404153996215, "grad_norm": 3.4524165564484095, "learning_rate": 1.2566581447610074e-05, "loss": 1.2537, "step": 8290 }, { "epoch": 0.475930919731482, "grad_norm": 3.6035770789655692, "learning_rate": 1.255690160705536e-05, "loss": 1.1615, "step": 8295 }, { "epoch": 0.4762177979230019, "grad_norm": 3.557954383666333, "learning_rate": 1.254721920243398e-05, "loss": 1.1498, "step": 8300 }, { "epoch": 0.4765046761145218, "grad_norm": 3.456224853889429, "learning_rate": 1.2537534243455473e-05, "loss": 1.1814, "step": 8305 }, { "epoch": 0.47679155430604164, "grad_norm": 3.7235886959592848, "learning_rate": 1.2527846739831935e-05, "loss": 1.2778, "step": 8310 }, { "epoch": 0.47707843249756154, "grad_norm": 4.377329226202487, "learning_rate": 1.251815670127802e-05, "loss": 1.1169, "step": 8315 }, { "epoch": 0.4773653106890814, "grad_norm": 3.557424515393954, "learning_rate": 1.2508464137510919e-05, "loss": 1.3098, "step": 8320 }, { "epoch": 0.4776521888806013, "grad_norm": 3.30620464853212, "learning_rate": 1.2498769058250356e-05, "loss": 1.1904, "step": 8325 }, { "epoch": 0.4779390670721212, "grad_norm": 3.3298443753910476, "learning_rate": 1.2489071473218574e-05, "loss": 1.1651, "step": 8330 }, { "epoch": 0.47822594526364104, "grad_norm": 3.235158055539368, "learning_rate": 1.247937139214034e-05, "loss": 1.2119, "step": 8335 }, { "epoch": 0.47851282345516094, "grad_norm": 3.148364838305951, "learning_rate": 1.2469668824742915e-05, "loss": 1.2037, "step": 8340 }, { "epoch": 0.47879970164668084, "grad_norm": 3.459143110468305, "learning_rate": 1.2459963780756054e-05, "loss": 1.2225, "step": 8345 }, { "epoch": 0.4790865798382007, "grad_norm": 3.6471415972788046, "learning_rate": 1.2450256269911998e-05, "loss": 1.1643, "step": 8350 }, { "epoch": 0.4793734580297206, "grad_norm": 3.1647861114207303, "learning_rate": 1.244054630194546e-05, "loss": 1.1649, "step": 8355 }, { "epoch": 0.47966033622124044, "grad_norm": 3.172889152781508, "learning_rate": 1.2430833886593614e-05, "loss": 1.1, "step": 8360 }, { "epoch": 0.47994721441276034, "grad_norm": 3.4059168557281216, "learning_rate": 1.24211190335961e-05, "loss": 1.1378, "step": 8365 }, { "epoch": 0.48023409260428024, "grad_norm": 3.579896462038547, "learning_rate": 1.2411401752694989e-05, "loss": 1.2695, "step": 8370 }, { "epoch": 0.4805209707958001, "grad_norm": 3.286629283020475, "learning_rate": 1.2401682053634793e-05, "loss": 1.142, "step": 8375 }, { "epoch": 0.48080784898732, "grad_norm": 3.8319545858860176, "learning_rate": 1.2391959946162447e-05, "loss": 1.2096, "step": 8380 }, { "epoch": 0.4810947271788399, "grad_norm": 3.195115435865029, "learning_rate": 1.2382235440027306e-05, "loss": 1.0826, "step": 8385 }, { "epoch": 0.48138160537035973, "grad_norm": 3.393371653729847, "learning_rate": 1.2372508544981121e-05, "loss": 1.2397, "step": 8390 }, { "epoch": 0.48166848356187963, "grad_norm": 3.034101978911756, "learning_rate": 1.236277927077805e-05, "loss": 1.1983, "step": 8395 }, { "epoch": 0.4819553617533995, "grad_norm": 3.461151034610082, "learning_rate": 1.2353047627174626e-05, "loss": 1.1993, "step": 8400 }, { "epoch": 0.4822422399449194, "grad_norm": 3.478754143054256, "learning_rate": 1.2343313623929764e-05, "loss": 1.2084, "step": 8405 }, { "epoch": 0.4825291181364393, "grad_norm": 3.6658547499330667, "learning_rate": 1.2333577270804746e-05, "loss": 1.2163, "step": 8410 }, { "epoch": 0.48281599632795913, "grad_norm": 3.3006000654683727, "learning_rate": 1.232383857756321e-05, "loss": 1.1773, "step": 8415 }, { "epoch": 0.48310287451947903, "grad_norm": 3.891603889427661, "learning_rate": 1.2314097553971136e-05, "loss": 1.2329, "step": 8420 }, { "epoch": 0.48338975271099893, "grad_norm": 3.190670782365789, "learning_rate": 1.2304354209796847e-05, "loss": 1.2048, "step": 8425 }, { "epoch": 0.4836766309025188, "grad_norm": 3.5455787207132476, "learning_rate": 1.2294608554810988e-05, "loss": 1.1961, "step": 8430 }, { "epoch": 0.4839635090940387, "grad_norm": 3.47419000494357, "learning_rate": 1.2284860598786525e-05, "loss": 1.2398, "step": 8435 }, { "epoch": 0.4842503872855585, "grad_norm": 3.247588399566466, "learning_rate": 1.2275110351498728e-05, "loss": 1.218, "step": 8440 }, { "epoch": 0.4845372654770784, "grad_norm": 3.4296964701300237, "learning_rate": 1.2265357822725172e-05, "loss": 1.2379, "step": 8445 }, { "epoch": 0.48482414366859833, "grad_norm": 3.952507089564333, "learning_rate": 1.2255603022245713e-05, "loss": 1.2375, "step": 8450 }, { "epoch": 0.4851110218601182, "grad_norm": 3.6546867401170005, "learning_rate": 1.224584595984248e-05, "loss": 1.2017, "step": 8455 }, { "epoch": 0.4853979000516381, "grad_norm": 3.75371923746037, "learning_rate": 1.2236086645299887e-05, "loss": 1.2518, "step": 8460 }, { "epoch": 0.485684778243158, "grad_norm": 3.4237340975901707, "learning_rate": 1.222632508840459e-05, "loss": 1.1795, "step": 8465 }, { "epoch": 0.4859716564346778, "grad_norm": 3.1844342197475535, "learning_rate": 1.2216561298945502e-05, "loss": 1.1664, "step": 8470 }, { "epoch": 0.4862585346261977, "grad_norm": 3.412244749716502, "learning_rate": 1.2206795286713776e-05, "loss": 1.2232, "step": 8475 }, { "epoch": 0.48654541281771757, "grad_norm": 3.2116306123321117, "learning_rate": 1.2197027061502782e-05, "loss": 1.1993, "step": 8480 }, { "epoch": 0.48683229100923747, "grad_norm": 8.57232583050524, "learning_rate": 1.218725663310813e-05, "loss": 1.1902, "step": 8485 }, { "epoch": 0.4871191692007574, "grad_norm": 3.7457091781088407, "learning_rate": 1.2177484011327618e-05, "loss": 1.1524, "step": 8490 }, { "epoch": 0.4874060473922772, "grad_norm": 3.2975642250161767, "learning_rate": 1.2167709205961255e-05, "loss": 1.1638, "step": 8495 }, { "epoch": 0.4876929255837971, "grad_norm": 3.1374696260224098, "learning_rate": 1.2157932226811246e-05, "loss": 1.13, "step": 8500 }, { "epoch": 0.487979803775317, "grad_norm": 3.511925395028662, "learning_rate": 1.2148153083681956e-05, "loss": 1.2523, "step": 8505 }, { "epoch": 0.48826668196683687, "grad_norm": 3.027484815699034, "learning_rate": 1.213837178637994e-05, "loss": 1.1929, "step": 8510 }, { "epoch": 0.48855356015835677, "grad_norm": 3.2248967396623502, "learning_rate": 1.2128588344713898e-05, "loss": 1.165, "step": 8515 }, { "epoch": 0.4888404383498766, "grad_norm": 3.431300900478561, "learning_rate": 1.211880276849469e-05, "loss": 1.1915, "step": 8520 }, { "epoch": 0.4891273165413965, "grad_norm": 3.3338816971310354, "learning_rate": 1.210901506753532e-05, "loss": 1.1501, "step": 8525 }, { "epoch": 0.4894141947329164, "grad_norm": 3.8227717936865058, "learning_rate": 1.2099225251650908e-05, "loss": 1.2277, "step": 8530 }, { "epoch": 0.48970107292443626, "grad_norm": 3.9655324330996216, "learning_rate": 1.2089433330658706e-05, "loss": 1.2293, "step": 8535 }, { "epoch": 0.48998795111595617, "grad_norm": 3.137977763309509, "learning_rate": 1.2079639314378076e-05, "loss": 1.2642, "step": 8540 }, { "epoch": 0.49027482930747607, "grad_norm": 3.4771058797143684, "learning_rate": 1.2069843212630474e-05, "loss": 1.1129, "step": 8545 }, { "epoch": 0.4905617074989959, "grad_norm": 3.4869954569640953, "learning_rate": 1.2060045035239465e-05, "loss": 1.1256, "step": 8550 }, { "epoch": 0.4908485856905158, "grad_norm": 3.4332331675987606, "learning_rate": 1.2050244792030667e-05, "loss": 1.1346, "step": 8555 }, { "epoch": 0.49113546388203566, "grad_norm": 3.4293659756444788, "learning_rate": 1.20404424928318e-05, "loss": 1.1833, "step": 8560 }, { "epoch": 0.49142234207355556, "grad_norm": 3.583077483914129, "learning_rate": 1.2030638147472624e-05, "loss": 1.144, "step": 8565 }, { "epoch": 0.49170922026507546, "grad_norm": 9.965293439227118, "learning_rate": 1.2020831765784957e-05, "loss": 1.1672, "step": 8570 }, { "epoch": 0.4919960984565953, "grad_norm": 3.6271723444522004, "learning_rate": 1.2011023357602669e-05, "loss": 1.1986, "step": 8575 }, { "epoch": 0.4922829766481152, "grad_norm": 3.0207091623537874, "learning_rate": 1.2001212932761646e-05, "loss": 1.0906, "step": 8580 }, { "epoch": 0.4925698548396351, "grad_norm": 3.4750301039763603, "learning_rate": 1.1991400501099806e-05, "loss": 1.1328, "step": 8585 }, { "epoch": 0.49285673303115496, "grad_norm": 3.528585039414603, "learning_rate": 1.198158607245708e-05, "loss": 1.1892, "step": 8590 }, { "epoch": 0.49314361122267486, "grad_norm": 3.10593326204609, "learning_rate": 1.1971769656675392e-05, "loss": 1.2585, "step": 8595 }, { "epoch": 0.4934304894141947, "grad_norm": 3.3227549265014105, "learning_rate": 1.1961951263598677e-05, "loss": 1.1938, "step": 8600 }, { "epoch": 0.4937173676057146, "grad_norm": 3.5712925491766847, "learning_rate": 1.1952130903072832e-05, "loss": 1.1949, "step": 8605 }, { "epoch": 0.4940042457972345, "grad_norm": 3.434344576996418, "learning_rate": 1.1942308584945742e-05, "loss": 1.2234, "step": 8610 }, { "epoch": 0.49429112398875436, "grad_norm": 3.547403632165294, "learning_rate": 1.1932484319067246e-05, "loss": 1.2216, "step": 8615 }, { "epoch": 0.49457800218027426, "grad_norm": 3.5227689077452786, "learning_rate": 1.1922658115289141e-05, "loss": 1.1556, "step": 8620 }, { "epoch": 0.49486488037179416, "grad_norm": 3.3524876189182624, "learning_rate": 1.1912829983465169e-05, "loss": 1.1144, "step": 8625 }, { "epoch": 0.495151758563314, "grad_norm": 3.392516828134865, "learning_rate": 1.1902999933450997e-05, "loss": 1.1979, "step": 8630 }, { "epoch": 0.4954386367548339, "grad_norm": 3.5507889562155044, "learning_rate": 1.189316797510423e-05, "loss": 1.1743, "step": 8635 }, { "epoch": 0.49572551494635375, "grad_norm": 4.199938534108679, "learning_rate": 1.1883334118284368e-05, "loss": 1.1728, "step": 8640 }, { "epoch": 0.49601239313787365, "grad_norm": 3.7702023361818076, "learning_rate": 1.1873498372852828e-05, "loss": 1.2575, "step": 8645 }, { "epoch": 0.49629927132939355, "grad_norm": 3.595625691100054, "learning_rate": 1.186366074867292e-05, "loss": 1.1064, "step": 8650 }, { "epoch": 0.4965861495209134, "grad_norm": 4.225614743355705, "learning_rate": 1.1853821255609836e-05, "loss": 1.2182, "step": 8655 }, { "epoch": 0.4968730277124333, "grad_norm": 3.849603281903678, "learning_rate": 1.1843979903530637e-05, "loss": 1.1702, "step": 8660 }, { "epoch": 0.4971599059039532, "grad_norm": 5.358502064485114, "learning_rate": 1.1834136702304257e-05, "loss": 1.1606, "step": 8665 }, { "epoch": 0.49744678409547305, "grad_norm": 3.76723224644966, "learning_rate": 1.1824291661801478e-05, "loss": 1.1861, "step": 8670 }, { "epoch": 0.49773366228699295, "grad_norm": 3.383911609465907, "learning_rate": 1.1814444791894935e-05, "loss": 1.2017, "step": 8675 }, { "epoch": 0.4980205404785128, "grad_norm": 4.030487072192011, "learning_rate": 1.180459610245908e-05, "loss": 1.2287, "step": 8680 }, { "epoch": 0.4983074186700327, "grad_norm": 3.1701214013874264, "learning_rate": 1.1794745603370212e-05, "loss": 1.1939, "step": 8685 }, { "epoch": 0.4985942968615526, "grad_norm": 3.2945104584873963, "learning_rate": 1.1784893304506424e-05, "loss": 1.2234, "step": 8690 }, { "epoch": 0.49888117505307245, "grad_norm": 3.940857813692626, "learning_rate": 1.177503921574763e-05, "loss": 1.1713, "step": 8695 }, { "epoch": 0.49916805324459235, "grad_norm": 3.77515747578742, "learning_rate": 1.1765183346975528e-05, "loss": 1.1451, "step": 8700 }, { "epoch": 0.49945493143611225, "grad_norm": 3.196118640268827, "learning_rate": 1.175532570807361e-05, "loss": 1.1677, "step": 8705 }, { "epoch": 0.4997418096276321, "grad_norm": 3.4513105168934164, "learning_rate": 1.1745466308927136e-05, "loss": 1.2317, "step": 8710 }, { "epoch": 0.500028687819152, "grad_norm": 3.612908675681068, "learning_rate": 1.173560515942313e-05, "loss": 1.1163, "step": 8715 }, { "epoch": 0.5003155660106718, "grad_norm": 2.987036113375402, "learning_rate": 1.1725742269450381e-05, "loss": 1.1111, "step": 8720 }, { "epoch": 0.5006024442021918, "grad_norm": 3.598270938292425, "learning_rate": 1.1715877648899414e-05, "loss": 1.1956, "step": 8725 }, { "epoch": 0.5008893223937116, "grad_norm": 3.2944125550075065, "learning_rate": 1.170601130766249e-05, "loss": 1.1236, "step": 8730 }, { "epoch": 0.5011762005852315, "grad_norm": 3.643123053947698, "learning_rate": 1.1696143255633607e-05, "loss": 1.1491, "step": 8735 }, { "epoch": 0.5014630787767513, "grad_norm": 3.2352913693391216, "learning_rate": 1.168627350270846e-05, "loss": 1.2036, "step": 8740 }, { "epoch": 0.5017499569682713, "grad_norm": 3.310821284087894, "learning_rate": 1.1676402058784464e-05, "loss": 1.1458, "step": 8745 }, { "epoch": 0.5020368351597911, "grad_norm": 3.746948328832309, "learning_rate": 1.1666528933760726e-05, "loss": 1.2018, "step": 8750 }, { "epoch": 0.502323713351311, "grad_norm": 3.92709717867608, "learning_rate": 1.1656654137538032e-05, "loss": 1.1794, "step": 8755 }, { "epoch": 0.5026105915428309, "grad_norm": 3.4951786165291114, "learning_rate": 1.1646777680018859e-05, "loss": 1.1419, "step": 8760 }, { "epoch": 0.5028974697343508, "grad_norm": 3.312818987744202, "learning_rate": 1.1636899571107334e-05, "loss": 1.0581, "step": 8765 }, { "epoch": 0.5031843479258706, "grad_norm": 3.4802960402979184, "learning_rate": 1.1627019820709246e-05, "loss": 1.1456, "step": 8770 }, { "epoch": 0.5034712261173906, "grad_norm": 3.700359682471302, "learning_rate": 1.1617138438732036e-05, "loss": 1.148, "step": 8775 }, { "epoch": 0.5037581043089104, "grad_norm": 3.385138509390088, "learning_rate": 1.1607255435084772e-05, "loss": 1.1353, "step": 8780 }, { "epoch": 0.5040449825004303, "grad_norm": 3.4194343490532657, "learning_rate": 1.1597370819678157e-05, "loss": 1.1466, "step": 8785 }, { "epoch": 0.5043318606919502, "grad_norm": 3.293636158865099, "learning_rate": 1.15874846024245e-05, "loss": 1.106, "step": 8790 }, { "epoch": 0.5046187388834701, "grad_norm": 4.159831726291985, "learning_rate": 1.1577596793237722e-05, "loss": 1.2071, "step": 8795 }, { "epoch": 0.5049056170749899, "grad_norm": 3.2235667707253124, "learning_rate": 1.1567707402033345e-05, "loss": 1.1438, "step": 8800 }, { "epoch": 0.5051924952665099, "grad_norm": 3.5303599541501813, "learning_rate": 1.1557816438728467e-05, "loss": 1.1377, "step": 8805 }, { "epoch": 0.5054793734580297, "grad_norm": 3.2838208344349913, "learning_rate": 1.1547923913241774e-05, "loss": 1.1658, "step": 8810 }, { "epoch": 0.5057662516495496, "grad_norm": 3.4746938439396153, "learning_rate": 1.1538029835493507e-05, "loss": 1.1699, "step": 8815 }, { "epoch": 0.5060531298410694, "grad_norm": 3.203664249594409, "learning_rate": 1.1528134215405473e-05, "loss": 1.1853, "step": 8820 }, { "epoch": 0.5063400080325894, "grad_norm": 3.636458135210526, "learning_rate": 1.1518237062901023e-05, "loss": 1.2558, "step": 8825 }, { "epoch": 0.5066268862241092, "grad_norm": 3.3422628648201163, "learning_rate": 1.1508338387905039e-05, "loss": 1.239, "step": 8830 }, { "epoch": 0.5069137644156291, "grad_norm": 3.538622600629751, "learning_rate": 1.149843820034394e-05, "loss": 1.1162, "step": 8835 }, { "epoch": 0.507200642607149, "grad_norm": 3.229648122854182, "learning_rate": 1.148853651014565e-05, "loss": 1.1317, "step": 8840 }, { "epoch": 0.5074875207986689, "grad_norm": 3.6403704107368364, "learning_rate": 1.1478633327239615e-05, "loss": 1.211, "step": 8845 }, { "epoch": 0.5077743989901887, "grad_norm": 3.6611438152183147, "learning_rate": 1.1468728661556761e-05, "loss": 1.1615, "step": 8850 }, { "epoch": 0.5080612771817087, "grad_norm": 3.703797254777598, "learning_rate": 1.1458822523029509e-05, "loss": 1.1831, "step": 8855 }, { "epoch": 0.5083481553732285, "grad_norm": 3.6352397265462346, "learning_rate": 1.1448914921591765e-05, "loss": 1.1382, "step": 8860 }, { "epoch": 0.5086350335647484, "grad_norm": 3.697932795859595, "learning_rate": 1.1439005867178884e-05, "loss": 1.1805, "step": 8865 }, { "epoch": 0.5089219117562683, "grad_norm": 3.57353688255735, "learning_rate": 1.1429095369727696e-05, "loss": 1.1028, "step": 8870 }, { "epoch": 0.5092087899477882, "grad_norm": 3.312940437440683, "learning_rate": 1.1419183439176464e-05, "loss": 1.1226, "step": 8875 }, { "epoch": 0.509495668139308, "grad_norm": 4.075670075276293, "learning_rate": 1.1409270085464898e-05, "loss": 1.1428, "step": 8880 }, { "epoch": 0.509782546330828, "grad_norm": 3.1320454183312956, "learning_rate": 1.1399355318534131e-05, "loss": 1.2194, "step": 8885 }, { "epoch": 0.5100694245223478, "grad_norm": 3.304759803664198, "learning_rate": 1.138943914832671e-05, "loss": 1.0879, "step": 8890 }, { "epoch": 0.5103563027138677, "grad_norm": 3.2608505742943303, "learning_rate": 1.13795215847866e-05, "loss": 1.1122, "step": 8895 }, { "epoch": 0.5106431809053875, "grad_norm": 3.3937044770078293, "learning_rate": 1.136960263785915e-05, "loss": 1.1898, "step": 8900 }, { "epoch": 0.5109300590969075, "grad_norm": 3.731774413765772, "learning_rate": 1.1359682317491098e-05, "loss": 1.1921, "step": 8905 }, { "epoch": 0.5112169372884273, "grad_norm": 3.2723613892022003, "learning_rate": 1.1349760633630575e-05, "loss": 1.1297, "step": 8910 }, { "epoch": 0.5115038154799472, "grad_norm": 3.2375295411070826, "learning_rate": 1.1339837596227061e-05, "loss": 1.1853, "step": 8915 }, { "epoch": 0.5117906936714671, "grad_norm": 3.4791170206884705, "learning_rate": 1.13299132152314e-05, "loss": 1.1949, "step": 8920 }, { "epoch": 0.512077571862987, "grad_norm": 4.2417451711620515, "learning_rate": 1.1319987500595785e-05, "loss": 1.1432, "step": 8925 }, { "epoch": 0.5123644500545068, "grad_norm": 3.5088394825701466, "learning_rate": 1.1310060462273744e-05, "loss": 1.1899, "step": 8930 }, { "epoch": 0.5126513282460268, "grad_norm": 3.293093942477056, "learning_rate": 1.1300132110220135e-05, "loss": 1.1564, "step": 8935 }, { "epoch": 0.5129382064375466, "grad_norm": 3.387864505700452, "learning_rate": 1.129020245439113e-05, "loss": 1.166, "step": 8940 }, { "epoch": 0.5132250846290665, "grad_norm": 3.579426404583115, "learning_rate": 1.128027150474421e-05, "loss": 1.2217, "step": 8945 }, { "epoch": 0.5135119628205864, "grad_norm": 2.983162795014091, "learning_rate": 1.1270339271238154e-05, "loss": 1.1989, "step": 8950 }, { "epoch": 0.5137988410121063, "grad_norm": 3.4058404196621037, "learning_rate": 1.126040576383303e-05, "loss": 1.1454, "step": 8955 }, { "epoch": 0.5140857192036261, "grad_norm": 3.480360327024166, "learning_rate": 1.1250470992490176e-05, "loss": 1.248, "step": 8960 }, { "epoch": 0.5143725973951461, "grad_norm": 3.560413700905197, "learning_rate": 1.1240534967172209e-05, "loss": 1.193, "step": 8965 }, { "epoch": 0.5146594755866659, "grad_norm": 3.263141866085325, "learning_rate": 1.1230597697842997e-05, "loss": 1.1911, "step": 8970 }, { "epoch": 0.5149463537781858, "grad_norm": 3.7690717193531555, "learning_rate": 1.1220659194467652e-05, "loss": 1.1187, "step": 8975 }, { "epoch": 0.5152332319697057, "grad_norm": 3.208015336317504, "learning_rate": 1.121071946701253e-05, "loss": 1.101, "step": 8980 }, { "epoch": 0.5155201101612256, "grad_norm": 3.4510520396759676, "learning_rate": 1.1200778525445211e-05, "loss": 1.2132, "step": 8985 }, { "epoch": 0.5158069883527454, "grad_norm": 3.6567954545644294, "learning_rate": 1.1190836379734496e-05, "loss": 1.1778, "step": 8990 }, { "epoch": 0.5160938665442653, "grad_norm": 3.118933605808038, "learning_rate": 1.1180893039850389e-05, "loss": 1.1484, "step": 8995 }, { "epoch": 0.5163807447357852, "grad_norm": 3.291411404952948, "learning_rate": 1.1170948515764089e-05, "loss": 1.148, "step": 9000 }, { "epoch": 0.5166676229273051, "grad_norm": 3.528303087358735, "learning_rate": 1.1161002817447996e-05, "loss": 1.2483, "step": 9005 }, { "epoch": 0.5169545011188249, "grad_norm": 3.270379404223355, "learning_rate": 1.1151055954875674e-05, "loss": 1.094, "step": 9010 }, { "epoch": 0.5172413793103449, "grad_norm": 3.5289579541990106, "learning_rate": 1.1141107938021858e-05, "loss": 1.2137, "step": 9015 }, { "epoch": 0.5175282575018647, "grad_norm": 3.7801509153393016, "learning_rate": 1.1131158776862445e-05, "loss": 1.2643, "step": 9020 }, { "epoch": 0.5178151356933846, "grad_norm": 4.140322716540474, "learning_rate": 1.112120848137447e-05, "loss": 1.1842, "step": 9025 }, { "epoch": 0.5181020138849045, "grad_norm": 3.1923286619779754, "learning_rate": 1.111125706153612e-05, "loss": 1.0876, "step": 9030 }, { "epoch": 0.5183888920764244, "grad_norm": 3.915544639419879, "learning_rate": 1.1101304527326695e-05, "loss": 1.1827, "step": 9035 }, { "epoch": 0.5186757702679442, "grad_norm": 4.428692682480397, "learning_rate": 1.109135088872662e-05, "loss": 1.2195, "step": 9040 }, { "epoch": 0.5189626484594642, "grad_norm": 3.297016851136949, "learning_rate": 1.108139615571743e-05, "loss": 1.1878, "step": 9045 }, { "epoch": 0.519249526650984, "grad_norm": 3.386700780208732, "learning_rate": 1.1071440338281745e-05, "loss": 1.1954, "step": 9050 }, { "epoch": 0.5195364048425039, "grad_norm": 3.582777302753334, "learning_rate": 1.106148344640329e-05, "loss": 1.1729, "step": 9055 }, { "epoch": 0.5198232830340238, "grad_norm": 3.1279229908858315, "learning_rate": 1.1051525490066852e-05, "loss": 1.1913, "step": 9060 }, { "epoch": 0.5201101612255437, "grad_norm": 3.392925479700777, "learning_rate": 1.1041566479258294e-05, "loss": 1.1423, "step": 9065 }, { "epoch": 0.5203970394170635, "grad_norm": 3.471765531679707, "learning_rate": 1.103160642396454e-05, "loss": 1.1666, "step": 9070 }, { "epoch": 0.5206839176085833, "grad_norm": 3.453242096206292, "learning_rate": 1.1021645334173547e-05, "loss": 1.1688, "step": 9075 }, { "epoch": 0.5209707958001033, "grad_norm": 3.583218943469046, "learning_rate": 1.1011683219874324e-05, "loss": 1.1802, "step": 9080 }, { "epoch": 0.5212576739916231, "grad_norm": 4.271672815592396, "learning_rate": 1.1001720091056897e-05, "loss": 1.1316, "step": 9085 }, { "epoch": 0.521544552183143, "grad_norm": 3.0585902983967044, "learning_rate": 1.0991755957712317e-05, "loss": 1.0957, "step": 9090 }, { "epoch": 0.521831430374663, "grad_norm": 3.6452974496681163, "learning_rate": 1.0981790829832642e-05, "loss": 1.166, "step": 9095 }, { "epoch": 0.5221183085661828, "grad_norm": 4.133618284092286, "learning_rate": 1.0971824717410917e-05, "loss": 1.1267, "step": 9100 }, { "epoch": 0.5224051867577026, "grad_norm": 3.0551315431627004, "learning_rate": 1.0961857630441188e-05, "loss": 1.1745, "step": 9105 }, { "epoch": 0.5226920649492226, "grad_norm": 3.3532962236993495, "learning_rate": 1.095188957891847e-05, "loss": 1.1782, "step": 9110 }, { "epoch": 0.5229789431407424, "grad_norm": 3.163427256236003, "learning_rate": 1.0941920572838747e-05, "loss": 1.0839, "step": 9115 }, { "epoch": 0.5232658213322623, "grad_norm": 3.2460254242023874, "learning_rate": 1.0931950622198966e-05, "loss": 1.2311, "step": 9120 }, { "epoch": 0.5235526995237823, "grad_norm": 3.517439507766329, "learning_rate": 1.0921979736997005e-05, "loss": 1.1949, "step": 9125 }, { "epoch": 0.5238395777153021, "grad_norm": 3.2158709631231686, "learning_rate": 1.0912007927231701e-05, "loss": 1.1558, "step": 9130 }, { "epoch": 0.5241264559068219, "grad_norm": 3.362234490789048, "learning_rate": 1.09020352029028e-05, "loss": 1.2035, "step": 9135 }, { "epoch": 0.5244133340983419, "grad_norm": 3.3215125005273927, "learning_rate": 1.0892061574010972e-05, "loss": 1.2169, "step": 9140 }, { "epoch": 0.5247002122898617, "grad_norm": 3.6038380547080835, "learning_rate": 1.0882087050557804e-05, "loss": 1.1983, "step": 9145 }, { "epoch": 0.5249870904813816, "grad_norm": 3.536213466496912, "learning_rate": 1.0872111642545759e-05, "loss": 1.1953, "step": 9150 }, { "epoch": 0.5252739686729014, "grad_norm": 3.3250812102962346, "learning_rate": 1.0862135359978206e-05, "loss": 1.1403, "step": 9155 }, { "epoch": 0.5255608468644214, "grad_norm": 3.8946132879163504, "learning_rate": 1.0852158212859379e-05, "loss": 1.1371, "step": 9160 }, { "epoch": 0.5258477250559412, "grad_norm": 3.3897603720104352, "learning_rate": 1.0842180211194384e-05, "loss": 1.1699, "step": 9165 }, { "epoch": 0.5261346032474611, "grad_norm": 3.8236274041356744, "learning_rate": 1.0832201364989186e-05, "loss": 1.1183, "step": 9170 }, { "epoch": 0.526421481438981, "grad_norm": 3.5955881839652446, "learning_rate": 1.0822221684250593e-05, "loss": 1.1487, "step": 9175 }, { "epoch": 0.5267083596305009, "grad_norm": 3.2653413433572296, "learning_rate": 1.0812241178986254e-05, "loss": 1.1591, "step": 9180 }, { "epoch": 0.5269952378220207, "grad_norm": 3.3163528623589658, "learning_rate": 1.0802259859204636e-05, "loss": 1.2038, "step": 9185 }, { "epoch": 0.5272821160135407, "grad_norm": 3.8802988058120476, "learning_rate": 1.0792277734915034e-05, "loss": 1.1393, "step": 9190 }, { "epoch": 0.5275689942050605, "grad_norm": 3.1079438915236457, "learning_rate": 1.0782294816127541e-05, "loss": 1.1997, "step": 9195 }, { "epoch": 0.5278558723965804, "grad_norm": 3.521060885686377, "learning_rate": 1.0772311112853053e-05, "loss": 1.2178, "step": 9200 }, { "epoch": 0.5281427505881003, "grad_norm": 3.29659974658219, "learning_rate": 1.076232663510325e-05, "loss": 1.1699, "step": 9205 }, { "epoch": 0.5284296287796202, "grad_norm": 3.7293221412284274, "learning_rate": 1.0752341392890589e-05, "loss": 1.2338, "step": 9210 }, { "epoch": 0.52871650697114, "grad_norm": 3.213258720454421, "learning_rate": 1.0742355396228288e-05, "loss": 1.176, "step": 9215 }, { "epoch": 0.52900338516266, "grad_norm": 3.102185916914755, "learning_rate": 1.0732368655130335e-05, "loss": 1.0939, "step": 9220 }, { "epoch": 0.5292902633541798, "grad_norm": 3.5547568513173586, "learning_rate": 1.072238117961145e-05, "loss": 1.2613, "step": 9225 }, { "epoch": 0.5295771415456997, "grad_norm": 25.014098026245193, "learning_rate": 1.0712392979687101e-05, "loss": 1.1016, "step": 9230 }, { "epoch": 0.5298640197372195, "grad_norm": 3.369144589975072, "learning_rate": 1.0702404065373471e-05, "loss": 1.1547, "step": 9235 }, { "epoch": 0.5301508979287395, "grad_norm": 3.140050050995834, "learning_rate": 1.0692414446687471e-05, "loss": 1.1657, "step": 9240 }, { "epoch": 0.5304377761202593, "grad_norm": 3.7343517380528097, "learning_rate": 1.0682424133646712e-05, "loss": 1.1724, "step": 9245 }, { "epoch": 0.5307246543117792, "grad_norm": 4.243803221304461, "learning_rate": 1.06724331362695e-05, "loss": 1.1296, "step": 9250 }, { "epoch": 0.5310115325032991, "grad_norm": 3.848884966051868, "learning_rate": 1.0662441464574832e-05, "loss": 1.1478, "step": 9255 }, { "epoch": 0.531298410694819, "grad_norm": 4.514346758219499, "learning_rate": 1.0652449128582377e-05, "loss": 1.2152, "step": 9260 }, { "epoch": 0.5315852888863388, "grad_norm": 3.145034926140732, "learning_rate": 1.0642456138312474e-05, "loss": 1.1747, "step": 9265 }, { "epoch": 0.5318721670778588, "grad_norm": 3.4636544681163897, "learning_rate": 1.0632462503786115e-05, "loss": 1.1437, "step": 9270 }, { "epoch": 0.5321590452693786, "grad_norm": 3.8513445986878194, "learning_rate": 1.0622468235024937e-05, "loss": 1.1317, "step": 9275 }, { "epoch": 0.5324459234608985, "grad_norm": 6.316442466311766, "learning_rate": 1.061247334205122e-05, "loss": 1.0976, "step": 9280 }, { "epoch": 0.5327328016524184, "grad_norm": 3.145956543777418, "learning_rate": 1.060247783488786e-05, "loss": 1.0825, "step": 9285 }, { "epoch": 0.5330196798439383, "grad_norm": 3.507666080130733, "learning_rate": 1.0592481723558375e-05, "loss": 1.1189, "step": 9290 }, { "epoch": 0.5333065580354581, "grad_norm": 3.6240744886743927, "learning_rate": 1.0582485018086892e-05, "loss": 1.1336, "step": 9295 }, { "epoch": 0.5335934362269781, "grad_norm": 3.056095403783343, "learning_rate": 1.0572487728498127e-05, "loss": 1.1492, "step": 9300 }, { "epoch": 0.5338803144184979, "grad_norm": 3.386243063643788, "learning_rate": 1.0562489864817382e-05, "loss": 1.1495, "step": 9305 }, { "epoch": 0.5341671926100178, "grad_norm": 3.572548199703092, "learning_rate": 1.0552491437070537e-05, "loss": 1.2254, "step": 9310 }, { "epoch": 0.5344540708015376, "grad_norm": 3.3953493742594576, "learning_rate": 1.0542492455284044e-05, "loss": 1.1421, "step": 9315 }, { "epoch": 0.5347409489930576, "grad_norm": 3.178148248306452, "learning_rate": 1.0532492929484899e-05, "loss": 1.18, "step": 9320 }, { "epoch": 0.5350278271845774, "grad_norm": 2.999938050105646, "learning_rate": 1.052249286970065e-05, "loss": 1.1267, "step": 9325 }, { "epoch": 0.5353147053760973, "grad_norm": 3.039084894869949, "learning_rate": 1.0512492285959381e-05, "loss": 1.1382, "step": 9330 }, { "epoch": 0.5356015835676172, "grad_norm": 3.4935757286795868, "learning_rate": 1.0502491188289697e-05, "loss": 1.1418, "step": 9335 }, { "epoch": 0.5358884617591371, "grad_norm": 3.287017563248424, "learning_rate": 1.0492489586720725e-05, "loss": 1.2135, "step": 9340 }, { "epoch": 0.5361753399506569, "grad_norm": 3.2109892198712466, "learning_rate": 1.0482487491282089e-05, "loss": 1.0682, "step": 9345 }, { "epoch": 0.5364622181421769, "grad_norm": 3.8175384010622717, "learning_rate": 1.0472484912003913e-05, "loss": 1.2386, "step": 9350 }, { "epoch": 0.5367490963336967, "grad_norm": 3.4667580607489574, "learning_rate": 1.0462481858916813e-05, "loss": 1.2045, "step": 9355 }, { "epoch": 0.5370359745252166, "grad_norm": 3.3452671566158916, "learning_rate": 1.0452478342051862e-05, "loss": 1.1539, "step": 9360 }, { "epoch": 0.5373228527167365, "grad_norm": 3.0012973103568092, "learning_rate": 1.0442474371440618e-05, "loss": 1.1566, "step": 9365 }, { "epoch": 0.5376097309082564, "grad_norm": 3.272517767331313, "learning_rate": 1.0432469957115083e-05, "loss": 1.1581, "step": 9370 }, { "epoch": 0.5378966090997762, "grad_norm": 3.4090426230914037, "learning_rate": 1.0422465109107703e-05, "loss": 1.1959, "step": 9375 }, { "epoch": 0.5381834872912962, "grad_norm": 3.357573682900627, "learning_rate": 1.0412459837451367e-05, "loss": 1.2099, "step": 9380 }, { "epoch": 0.538470365482816, "grad_norm": 3.452945886159865, "learning_rate": 1.0402454152179377e-05, "loss": 1.1226, "step": 9385 }, { "epoch": 0.5387572436743359, "grad_norm": 3.483499289667796, "learning_rate": 1.0392448063325464e-05, "loss": 1.1116, "step": 9390 }, { "epoch": 0.5390441218658557, "grad_norm": 3.2357227622894547, "learning_rate": 1.0382441580923752e-05, "loss": 1.1607, "step": 9395 }, { "epoch": 0.5393310000573757, "grad_norm": 4.443362875075011, "learning_rate": 1.0372434715008763e-05, "loss": 1.2313, "step": 9400 }, { "epoch": 0.5396178782488955, "grad_norm": 2.8779847850220412, "learning_rate": 1.0362427475615413e-05, "loss": 1.1739, "step": 9405 }, { "epoch": 0.5399047564404154, "grad_norm": 3.4574677172488126, "learning_rate": 1.035241987277897e-05, "loss": 1.1486, "step": 9410 }, { "epoch": 0.5401916346319353, "grad_norm": 3.6371942990042765, "learning_rate": 1.0342411916535093e-05, "loss": 1.1494, "step": 9415 }, { "epoch": 0.5404785128234552, "grad_norm": 3.361400576299899, "learning_rate": 1.0332403616919779e-05, "loss": 1.1663, "step": 9420 }, { "epoch": 0.540765391014975, "grad_norm": 4.024201036708512, "learning_rate": 1.0322394983969369e-05, "loss": 1.1748, "step": 9425 }, { "epoch": 0.541052269206495, "grad_norm": 3.8243938595138536, "learning_rate": 1.031238602772055e-05, "loss": 1.1135, "step": 9430 }, { "epoch": 0.5413391473980148, "grad_norm": 3.4495758603944915, "learning_rate": 1.030237675821032e-05, "loss": 1.208, "step": 9435 }, { "epoch": 0.5416260255895347, "grad_norm": 3.904033161911523, "learning_rate": 1.0292367185475998e-05, "loss": 1.2021, "step": 9440 }, { "epoch": 0.5419129037810546, "grad_norm": 3.5253168000691786, "learning_rate": 1.0282357319555208e-05, "loss": 1.184, "step": 9445 }, { "epoch": 0.5421997819725745, "grad_norm": 3.2845802461019526, "learning_rate": 1.0272347170485864e-05, "loss": 1.1827, "step": 9450 }, { "epoch": 0.5424866601640943, "grad_norm": 3.549305845558518, "learning_rate": 1.0262336748306165e-05, "loss": 1.1864, "step": 9455 }, { "epoch": 0.5427735383556143, "grad_norm": 3.537431999550678, "learning_rate": 1.0252326063054589e-05, "loss": 1.1786, "step": 9460 }, { "epoch": 0.5430604165471341, "grad_norm": 3.5621448378643676, "learning_rate": 1.0242315124769872e-05, "loss": 1.1199, "step": 9465 }, { "epoch": 0.543347294738654, "grad_norm": 3.4378829175205285, "learning_rate": 1.0232303943491005e-05, "loss": 1.2456, "step": 9470 }, { "epoch": 0.5436341729301738, "grad_norm": 3.54962307357482, "learning_rate": 1.022229252925722e-05, "loss": 1.1783, "step": 9475 }, { "epoch": 0.5439210511216938, "grad_norm": 3.5524383610852355, "learning_rate": 1.021228089210799e-05, "loss": 1.1699, "step": 9480 }, { "epoch": 0.5442079293132136, "grad_norm": 3.23575802163993, "learning_rate": 1.0202269042083001e-05, "loss": 1.1337, "step": 9485 }, { "epoch": 0.5444948075047334, "grad_norm": 3.59531668933501, "learning_rate": 1.019225698922217e-05, "loss": 1.1383, "step": 9490 }, { "epoch": 0.5447816856962534, "grad_norm": 3.3211086960600413, "learning_rate": 1.0182244743565595e-05, "loss": 1.0836, "step": 9495 }, { "epoch": 0.5450685638877732, "grad_norm": 4.046057625527267, "learning_rate": 1.017223231515358e-05, "loss": 1.1491, "step": 9500 }, { "epoch": 0.5453554420792931, "grad_norm": 3.2372649889700726, "learning_rate": 1.0162219714026617e-05, "loss": 1.1136, "step": 9505 }, { "epoch": 0.545642320270813, "grad_norm": 3.1536803980312698, "learning_rate": 1.015220695022536e-05, "loss": 1.1277, "step": 9510 }, { "epoch": 0.5459291984623329, "grad_norm": 3.072864838106788, "learning_rate": 1.0142194033790634e-05, "loss": 1.107, "step": 9515 }, { "epoch": 0.5462160766538527, "grad_norm": 3.291378109409898, "learning_rate": 1.013218097476341e-05, "loss": 1.1337, "step": 9520 }, { "epoch": 0.5465029548453727, "grad_norm": 3.6089616723139177, "learning_rate": 1.0122167783184806e-05, "loss": 1.1872, "step": 9525 }, { "epoch": 0.5467898330368925, "grad_norm": 3.4644174287556546, "learning_rate": 1.0112154469096078e-05, "loss": 1.1436, "step": 9530 }, { "epoch": 0.5470767112284124, "grad_norm": 3.7473488762820923, "learning_rate": 1.0102141042538598e-05, "loss": 1.197, "step": 9535 }, { "epoch": 0.5473635894199324, "grad_norm": 3.3962285093932385, "learning_rate": 1.0092127513553852e-05, "loss": 1.112, "step": 9540 }, { "epoch": 0.5476504676114522, "grad_norm": 3.2112628936637493, "learning_rate": 1.0082113892183422e-05, "loss": 1.1048, "step": 9545 }, { "epoch": 0.547937345802972, "grad_norm": 3.0171115167226676, "learning_rate": 1.0072100188469003e-05, "loss": 1.0991, "step": 9550 }, { "epoch": 0.5482242239944919, "grad_norm": 3.5090499574691956, "learning_rate": 1.0062086412452353e-05, "loss": 1.1264, "step": 9555 }, { "epoch": 0.5485111021860118, "grad_norm": 3.3153750790114134, "learning_rate": 1.0052072574175307e-05, "loss": 1.1884, "step": 9560 }, { "epoch": 0.5487979803775317, "grad_norm": 3.3110656808634324, "learning_rate": 1.004205868367977e-05, "loss": 1.1457, "step": 9565 }, { "epoch": 0.5490848585690515, "grad_norm": 3.298938206242078, "learning_rate": 1.0032044751007685e-05, "loss": 1.17, "step": 9570 }, { "epoch": 0.5493717367605715, "grad_norm": 3.874570808968153, "learning_rate": 1.0022030786201057e-05, "loss": 1.2047, "step": 9575 }, { "epoch": 0.5496586149520913, "grad_norm": 3.070634122620084, "learning_rate": 1.0012016799301908e-05, "loss": 1.107, "step": 9580 }, { "epoch": 0.5499454931436112, "grad_norm": 3.124257595495955, "learning_rate": 1.0002002800352281e-05, "loss": 1.1446, "step": 9585 }, { "epoch": 0.5502323713351311, "grad_norm": 3.560715594041934, "learning_rate": 9.991988799394246e-06, "loss": 1.0488, "step": 9590 }, { "epoch": 0.550519249526651, "grad_norm": 3.4848718813007054, "learning_rate": 9.981974806469858e-06, "loss": 1.1634, "step": 9595 }, { "epoch": 0.5508061277181708, "grad_norm": 4.659851160016056, "learning_rate": 9.971960831621174e-06, "loss": 1.0845, "step": 9600 }, { "epoch": 0.5510930059096908, "grad_norm": 4.41966138702183, "learning_rate": 9.961946884890232e-06, "loss": 1.2125, "step": 9605 }, { "epoch": 0.5513798841012106, "grad_norm": 3.4083544283058385, "learning_rate": 9.951932976319041e-06, "loss": 1.1039, "step": 9610 }, { "epoch": 0.5516667622927305, "grad_norm": 3.2324527187198338, "learning_rate": 9.941919115949565e-06, "loss": 1.1116, "step": 9615 }, { "epoch": 0.5519536404842504, "grad_norm": 4.4319054224316075, "learning_rate": 9.931905313823733e-06, "loss": 1.2477, "step": 9620 }, { "epoch": 0.5522405186757703, "grad_norm": 3.2354754131032304, "learning_rate": 9.921891579983404e-06, "loss": 1.1374, "step": 9625 }, { "epoch": 0.5525273968672901, "grad_norm": 3.4906619872542297, "learning_rate": 9.911877924470373e-06, "loss": 1.1328, "step": 9630 }, { "epoch": 0.55281427505881, "grad_norm": 3.4959679478225154, "learning_rate": 9.901864357326357e-06, "loss": 1.1776, "step": 9635 }, { "epoch": 0.5531011532503299, "grad_norm": 3.043749657437343, "learning_rate": 9.891850888592987e-06, "loss": 1.0919, "step": 9640 }, { "epoch": 0.5533880314418498, "grad_norm": 7.073546482332132, "learning_rate": 9.881837528311787e-06, "loss": 1.1893, "step": 9645 }, { "epoch": 0.5536749096333696, "grad_norm": 3.0616705426767608, "learning_rate": 9.871824286524175e-06, "loss": 1.2151, "step": 9650 }, { "epoch": 0.5539617878248896, "grad_norm": 3.3227028973654096, "learning_rate": 9.86181117327146e-06, "loss": 1.1874, "step": 9655 }, { "epoch": 0.5542486660164094, "grad_norm": 4.102752727873679, "learning_rate": 9.851798198594808e-06, "loss": 1.1601, "step": 9660 }, { "epoch": 0.5545355442079293, "grad_norm": 3.1080919287338395, "learning_rate": 9.841785372535255e-06, "loss": 1.1941, "step": 9665 }, { "epoch": 0.5548224223994492, "grad_norm": 3.6058955303815003, "learning_rate": 9.831772705133685e-06, "loss": 1.1246, "step": 9670 }, { "epoch": 0.5551093005909691, "grad_norm": 3.94513573541649, "learning_rate": 9.821760206430825e-06, "loss": 1.2288, "step": 9675 }, { "epoch": 0.5553961787824889, "grad_norm": 3.7997766431909286, "learning_rate": 9.811747886467227e-06, "loss": 1.0573, "step": 9680 }, { "epoch": 0.5556830569740089, "grad_norm": 3.4261277289111396, "learning_rate": 9.801735755283272e-06, "loss": 1.1282, "step": 9685 }, { "epoch": 0.5559699351655287, "grad_norm": 3.2653650024421133, "learning_rate": 9.791723822919148e-06, "loss": 1.1063, "step": 9690 }, { "epoch": 0.5562568133570486, "grad_norm": 3.440108314750474, "learning_rate": 9.781712099414842e-06, "loss": 1.0975, "step": 9695 }, { "epoch": 0.5565436915485685, "grad_norm": 3.2775400555065586, "learning_rate": 9.771700594810129e-06, "loss": 1.1439, "step": 9700 }, { "epoch": 0.5568305697400884, "grad_norm": 3.4192695416857473, "learning_rate": 9.761689319144572e-06, "loss": 1.0685, "step": 9705 }, { "epoch": 0.5571174479316082, "grad_norm": 3.3777162290926697, "learning_rate": 9.751678282457502e-06, "loss": 1.056, "step": 9710 }, { "epoch": 0.5574043261231281, "grad_norm": 3.464629598258671, "learning_rate": 9.741667494788004e-06, "loss": 1.1206, "step": 9715 }, { "epoch": 0.557691204314648, "grad_norm": 3.1232405206578795, "learning_rate": 9.731656966174924e-06, "loss": 1.0917, "step": 9720 }, { "epoch": 0.5579780825061679, "grad_norm": 3.495836084407134, "learning_rate": 9.72164670665684e-06, "loss": 1.0747, "step": 9725 }, { "epoch": 0.5582649606976877, "grad_norm": 3.237004153205446, "learning_rate": 9.71163672627206e-06, "loss": 1.2104, "step": 9730 }, { "epoch": 0.5585518388892077, "grad_norm": 3.2580691737446195, "learning_rate": 9.70162703505862e-06, "loss": 1.0643, "step": 9735 }, { "epoch": 0.5588387170807275, "grad_norm": 3.6427790647021476, "learning_rate": 9.691617643054261e-06, "loss": 1.1893, "step": 9740 }, { "epoch": 0.5591255952722474, "grad_norm": 3.7543222354562653, "learning_rate": 9.681608560296414e-06, "loss": 1.1317, "step": 9745 }, { "epoch": 0.5594124734637673, "grad_norm": 3.309727930644226, "learning_rate": 9.671599796822223e-06, "loss": 1.1246, "step": 9750 }, { "epoch": 0.5596993516552872, "grad_norm": 2.9113907537605677, "learning_rate": 9.661591362668491e-06, "loss": 1.1459, "step": 9755 }, { "epoch": 0.559986229846807, "grad_norm": 3.560717421219788, "learning_rate": 9.651583267871697e-06, "loss": 1.177, "step": 9760 }, { "epoch": 0.560273108038327, "grad_norm": 4.058201981895635, "learning_rate": 9.641575522467985e-06, "loss": 1.1897, "step": 9765 }, { "epoch": 0.5605599862298468, "grad_norm": 3.5920477802022175, "learning_rate": 9.631568136493142e-06, "loss": 1.217, "step": 9770 }, { "epoch": 0.5608468644213667, "grad_norm": 3.285712086437911, "learning_rate": 9.6215611199826e-06, "loss": 1.0552, "step": 9775 }, { "epoch": 0.5611337426128866, "grad_norm": 3.700367347804019, "learning_rate": 9.61155448297141e-06, "loss": 1.2681, "step": 9780 }, { "epoch": 0.5614206208044065, "grad_norm": 3.396965581090774, "learning_rate": 9.60154823549426e-06, "loss": 1.1415, "step": 9785 }, { "epoch": 0.5617074989959263, "grad_norm": 3.2460698824851457, "learning_rate": 9.591542387585435e-06, "loss": 1.1532, "step": 9790 }, { "epoch": 0.5619943771874462, "grad_norm": 4.017777381330048, "learning_rate": 9.581536949278814e-06, "loss": 1.1305, "step": 9795 }, { "epoch": 0.5622812553789661, "grad_norm": 3.2254343281236446, "learning_rate": 9.571531930607884e-06, "loss": 1.15, "step": 9800 }, { "epoch": 0.562568133570486, "grad_norm": 3.1791149799601657, "learning_rate": 9.561527341605693e-06, "loss": 1.1254, "step": 9805 }, { "epoch": 0.5628550117620058, "grad_norm": 3.4766085811328065, "learning_rate": 9.551523192304863e-06, "loss": 1.1756, "step": 9810 }, { "epoch": 0.5631418899535258, "grad_norm": 3.536568871228603, "learning_rate": 9.541519492737585e-06, "loss": 1.1463, "step": 9815 }, { "epoch": 0.5634287681450456, "grad_norm": 3.568784983846471, "learning_rate": 9.531516252935589e-06, "loss": 1.1893, "step": 9820 }, { "epoch": 0.5637156463365655, "grad_norm": 3.0071401961742152, "learning_rate": 9.521513482930144e-06, "loss": 1.0734, "step": 9825 }, { "epoch": 0.5640025245280854, "grad_norm": 2.8832330157857617, "learning_rate": 9.51151119275205e-06, "loss": 1.1733, "step": 9830 }, { "epoch": 0.5642894027196053, "grad_norm": 3.0312609606457572, "learning_rate": 9.501509392431627e-06, "loss": 1.1025, "step": 9835 }, { "epoch": 0.5645762809111251, "grad_norm": 3.2875358522148526, "learning_rate": 9.491508091998707e-06, "loss": 1.1394, "step": 9840 }, { "epoch": 0.5648631591026451, "grad_norm": 3.647923863006296, "learning_rate": 9.481507301482605e-06, "loss": 1.0862, "step": 9845 }, { "epoch": 0.5651500372941649, "grad_norm": 3.7582454158331884, "learning_rate": 9.471507030912152e-06, "loss": 1.0884, "step": 9850 }, { "epoch": 0.5654369154856848, "grad_norm": 3.224166028493974, "learning_rate": 9.46150729031563e-06, "loss": 1.1272, "step": 9855 }, { "epoch": 0.5657237936772047, "grad_norm": 3.6887490656358444, "learning_rate": 9.451508089720802e-06, "loss": 1.2198, "step": 9860 }, { "epoch": 0.5660106718687246, "grad_norm": 3.7484286366593067, "learning_rate": 9.441509439154896e-06, "loss": 1.1886, "step": 9865 }, { "epoch": 0.5662975500602444, "grad_norm": 3.3094563443830367, "learning_rate": 9.431511348644575e-06, "loss": 1.1785, "step": 9870 }, { "epoch": 0.5665844282517642, "grad_norm": 3.655794529784237, "learning_rate": 9.421513828215946e-06, "loss": 1.168, "step": 9875 }, { "epoch": 0.5668713064432842, "grad_norm": 3.1863094740460713, "learning_rate": 9.41151688789455e-06, "loss": 1.1706, "step": 9880 }, { "epoch": 0.567158184634804, "grad_norm": 3.0287318296932657, "learning_rate": 9.401520537705339e-06, "loss": 1.1531, "step": 9885 }, { "epoch": 0.5674450628263239, "grad_norm": 3.238250924329374, "learning_rate": 9.391524787672677e-06, "loss": 1.085, "step": 9890 }, { "epoch": 0.5677319410178439, "grad_norm": 3.5978072035888613, "learning_rate": 9.381529647820315e-06, "loss": 1.1133, "step": 9895 }, { "epoch": 0.5680188192093637, "grad_norm": 4.0702575724174475, "learning_rate": 9.371535128171417e-06, "loss": 1.1358, "step": 9900 }, { "epoch": 0.5683056974008835, "grad_norm": 3.590624012230649, "learning_rate": 9.361541238748497e-06, "loss": 1.161, "step": 9905 }, { "epoch": 0.5685925755924035, "grad_norm": 3.2522393634037012, "learning_rate": 9.351547989573455e-06, "loss": 1.1474, "step": 9910 }, { "epoch": 0.5688794537839234, "grad_norm": 3.3347426914960434, "learning_rate": 9.341555390667542e-06, "loss": 1.1725, "step": 9915 }, { "epoch": 0.5691663319754432, "grad_norm": 3.500671807964452, "learning_rate": 9.331563452051361e-06, "loss": 1.1356, "step": 9920 }, { "epoch": 0.5694532101669632, "grad_norm": 2.9993534797221546, "learning_rate": 9.321572183744849e-06, "loss": 1.0931, "step": 9925 }, { "epoch": 0.569740088358483, "grad_norm": 3.5776696651345845, "learning_rate": 9.311581595767273e-06, "loss": 1.11, "step": 9930 }, { "epoch": 0.5700269665500028, "grad_norm": 3.015046384852246, "learning_rate": 9.301591698137218e-06, "loss": 1.1374, "step": 9935 }, { "epoch": 0.5703138447415228, "grad_norm": 3.3325546157156998, "learning_rate": 9.29160250087257e-06, "loss": 1.1766, "step": 9940 }, { "epoch": 0.5706007229330426, "grad_norm": 3.8872570660603745, "learning_rate": 9.281614013990526e-06, "loss": 1.2248, "step": 9945 }, { "epoch": 0.5708876011245625, "grad_norm": 3.5644747778672494, "learning_rate": 9.271626247507561e-06, "loss": 1.1099, "step": 9950 }, { "epoch": 0.5711744793160823, "grad_norm": 3.540889204160025, "learning_rate": 9.261639211439427e-06, "loss": 1.1493, "step": 9955 }, { "epoch": 0.5714613575076023, "grad_norm": 3.4973940178756875, "learning_rate": 9.251652915801145e-06, "loss": 1.1482, "step": 9960 }, { "epoch": 0.5717482356991221, "grad_norm": 3.254926555020585, "learning_rate": 9.241667370606999e-06, "loss": 1.1295, "step": 9965 }, { "epoch": 0.572035113890642, "grad_norm": 3.0671798445146408, "learning_rate": 9.231682585870513e-06, "loss": 1.1035, "step": 9970 }, { "epoch": 0.572321992082162, "grad_norm": 3.3527220586025765, "learning_rate": 9.221698571604453e-06, "loss": 1.1144, "step": 9975 }, { "epoch": 0.5726088702736818, "grad_norm": 3.541816084564746, "learning_rate": 9.211715337820812e-06, "loss": 1.155, "step": 9980 }, { "epoch": 0.5728957484652016, "grad_norm": 3.100163631611502, "learning_rate": 9.201732894530797e-06, "loss": 1.1438, "step": 9985 }, { "epoch": 0.5731826266567216, "grad_norm": 3.1402747649588267, "learning_rate": 9.191751251744824e-06, "loss": 1.1415, "step": 9990 }, { "epoch": 0.5734695048482414, "grad_norm": 3.4534437958369564, "learning_rate": 9.18177041947251e-06, "loss": 1.1619, "step": 9995 }, { "epoch": 0.5737563830397613, "grad_norm": 3.1538557453611094, "learning_rate": 9.171790407722657e-06, "loss": 1.1222, "step": 10000 }, { "epoch": 0.5740432612312812, "grad_norm": 3.4137624362801615, "learning_rate": 9.161811226503233e-06, "loss": 1.0701, "step": 10005 }, { "epoch": 0.5743301394228011, "grad_norm": 3.1414293625262797, "learning_rate": 9.151832885821397e-06, "loss": 1.175, "step": 10010 }, { "epoch": 0.5746170176143209, "grad_norm": 3.3235881672679453, "learning_rate": 9.141855395683444e-06, "loss": 1.1674, "step": 10015 }, { "epoch": 0.5749038958058409, "grad_norm": 3.954874299443936, "learning_rate": 9.131878766094822e-06, "loss": 1.0905, "step": 10020 }, { "epoch": 0.5751907739973607, "grad_norm": 3.4654291532152977, "learning_rate": 9.121903007060121e-06, "loss": 1.1557, "step": 10025 }, { "epoch": 0.5754776521888806, "grad_norm": 3.0177327112768872, "learning_rate": 9.111928128583054e-06, "loss": 1.1491, "step": 10030 }, { "epoch": 0.5757645303804004, "grad_norm": 4.441414079824567, "learning_rate": 9.101954140666451e-06, "loss": 1.1291, "step": 10035 }, { "epoch": 0.5760514085719204, "grad_norm": 3.5925182648082297, "learning_rate": 9.091981053312247e-06, "loss": 1.0855, "step": 10040 }, { "epoch": 0.5763382867634402, "grad_norm": 3.248513016170106, "learning_rate": 9.082008876521482e-06, "loss": 1.1048, "step": 10045 }, { "epoch": 0.5766251649549601, "grad_norm": 3.2119677370517907, "learning_rate": 9.072037620294276e-06, "loss": 1.1401, "step": 10050 }, { "epoch": 0.57691204314648, "grad_norm": 4.064022418123313, "learning_rate": 9.06206729462982e-06, "loss": 1.1412, "step": 10055 }, { "epoch": 0.5771989213379999, "grad_norm": 3.0861671066969407, "learning_rate": 9.052097909526389e-06, "loss": 1.0877, "step": 10060 }, { "epoch": 0.5774857995295197, "grad_norm": 3.867884753480938, "learning_rate": 9.042129474981296e-06, "loss": 1.1585, "step": 10065 }, { "epoch": 0.5777726777210397, "grad_norm": 3.058575869308891, "learning_rate": 9.032162000990916e-06, "loss": 1.071, "step": 10070 }, { "epoch": 0.5780595559125595, "grad_norm": 3.491762632212896, "learning_rate": 9.02219549755065e-06, "loss": 1.2051, "step": 10075 }, { "epoch": 0.5783464341040794, "grad_norm": 3.1217013407817475, "learning_rate": 9.012229974654933e-06, "loss": 1.0882, "step": 10080 }, { "epoch": 0.5786333122955993, "grad_norm": 3.5394570337278086, "learning_rate": 9.002265442297213e-06, "loss": 1.1137, "step": 10085 }, { "epoch": 0.5789201904871192, "grad_norm": 2.9601054264443425, "learning_rate": 8.992301910469941e-06, "loss": 1.1447, "step": 10090 }, { "epoch": 0.579207068678639, "grad_norm": 3.086878704905248, "learning_rate": 8.982339389164575e-06, "loss": 1.1593, "step": 10095 }, { "epoch": 0.579493946870159, "grad_norm": 3.322788362759537, "learning_rate": 8.972377888371554e-06, "loss": 1.1234, "step": 10100 }, { "epoch": 0.5797808250616788, "grad_norm": 3.3215495382946223, "learning_rate": 8.962417418080285e-06, "loss": 1.0589, "step": 10105 }, { "epoch": 0.5800677032531987, "grad_norm": 3.4501695166994755, "learning_rate": 8.952457988279162e-06, "loss": 1.1517, "step": 10110 }, { "epoch": 0.5803545814447185, "grad_norm": 3.366665673892304, "learning_rate": 8.942499608955516e-06, "loss": 1.1862, "step": 10115 }, { "epoch": 0.5806414596362385, "grad_norm": 3.080315603312387, "learning_rate": 8.932542290095631e-06, "loss": 1.0953, "step": 10120 }, { "epoch": 0.5809283378277583, "grad_norm": 3.172200153846606, "learning_rate": 8.922586041684733e-06, "loss": 1.1456, "step": 10125 }, { "epoch": 0.5812152160192782, "grad_norm": 2.7798878503350464, "learning_rate": 8.912630873706967e-06, "loss": 1.1171, "step": 10130 }, { "epoch": 0.5815020942107981, "grad_norm": 3.2486255758158085, "learning_rate": 8.902676796145403e-06, "loss": 1.1443, "step": 10135 }, { "epoch": 0.581788972402318, "grad_norm": 3.1387956660105436, "learning_rate": 8.892723818982001e-06, "loss": 1.1281, "step": 10140 }, { "epoch": 0.5820758505938378, "grad_norm": 2.911602430988678, "learning_rate": 8.882771952197642e-06, "loss": 1.1275, "step": 10145 }, { "epoch": 0.5823627287853578, "grad_norm": 4.018933228484199, "learning_rate": 8.872821205772075e-06, "loss": 1.1456, "step": 10150 }, { "epoch": 0.5826496069768776, "grad_norm": 3.2129853285674166, "learning_rate": 8.862871589683925e-06, "loss": 1.1606, "step": 10155 }, { "epoch": 0.5829364851683975, "grad_norm": 3.090772841278142, "learning_rate": 8.852923113910701e-06, "loss": 1.0693, "step": 10160 }, { "epoch": 0.5832233633599174, "grad_norm": 3.475078772241131, "learning_rate": 8.842975788428748e-06, "loss": 1.1128, "step": 10165 }, { "epoch": 0.5835102415514373, "grad_norm": 3.300031319014394, "learning_rate": 8.833029623213267e-06, "loss": 1.0328, "step": 10170 }, { "epoch": 0.5837971197429571, "grad_norm": 3.5380860241063914, "learning_rate": 8.8230846282383e-06, "loss": 1.1651, "step": 10175 }, { "epoch": 0.5840839979344771, "grad_norm": 3.209227173995402, "learning_rate": 8.813140813476705e-06, "loss": 1.1232, "step": 10180 }, { "epoch": 0.5843708761259969, "grad_norm": 3.4510430333305346, "learning_rate": 8.803198188900162e-06, "loss": 1.0926, "step": 10185 }, { "epoch": 0.5846577543175168, "grad_norm": 3.1249652511636623, "learning_rate": 8.793256764479162e-06, "loss": 1.1803, "step": 10190 }, { "epoch": 0.5849446325090366, "grad_norm": 3.205538009390768, "learning_rate": 8.783316550182983e-06, "loss": 1.1574, "step": 10195 }, { "epoch": 0.5852315107005566, "grad_norm": 3.2460471500322496, "learning_rate": 8.7733775559797e-06, "loss": 1.1212, "step": 10200 }, { "epoch": 0.5855183888920764, "grad_norm": 3.342283774838026, "learning_rate": 8.763439791836145e-06, "loss": 1.0857, "step": 10205 }, { "epoch": 0.5858052670835963, "grad_norm": 3.2883148582176536, "learning_rate": 8.753503267717948e-06, "loss": 1.1172, "step": 10210 }, { "epoch": 0.5860921452751162, "grad_norm": 6.664787088613047, "learning_rate": 8.743567993589466e-06, "loss": 1.135, "step": 10215 }, { "epoch": 0.5863790234666361, "grad_norm": 3.28681299122575, "learning_rate": 8.733633979413816e-06, "loss": 1.211, "step": 10220 }, { "epoch": 0.5866659016581559, "grad_norm": 3.52539614749016, "learning_rate": 8.723701235152853e-06, "loss": 1.2069, "step": 10225 }, { "epoch": 0.5869527798496759, "grad_norm": 3.2571052548196597, "learning_rate": 8.713769770767156e-06, "loss": 1.0219, "step": 10230 }, { "epoch": 0.5872396580411957, "grad_norm": 3.2438649438886977, "learning_rate": 8.703839596216012e-06, "loss": 1.1854, "step": 10235 }, { "epoch": 0.5875265362327156, "grad_norm": 3.2350448705189523, "learning_rate": 8.69391072145743e-06, "loss": 1.1251, "step": 10240 }, { "epoch": 0.5878134144242355, "grad_norm": 3.0164375523361646, "learning_rate": 8.683983156448106e-06, "loss": 1.0671, "step": 10245 }, { "epoch": 0.5881002926157554, "grad_norm": 3.7842188284811873, "learning_rate": 8.674056911143421e-06, "loss": 1.1014, "step": 10250 }, { "epoch": 0.5883871708072752, "grad_norm": 3.4395327176674133, "learning_rate": 8.66413199549744e-06, "loss": 1.1471, "step": 10255 }, { "epoch": 0.5886740489987952, "grad_norm": 3.3517151417199913, "learning_rate": 8.654208419462894e-06, "loss": 1.093, "step": 10260 }, { "epoch": 0.588960927190315, "grad_norm": 3.024318926367898, "learning_rate": 8.644286192991158e-06, "loss": 1.0835, "step": 10265 }, { "epoch": 0.5892478053818349, "grad_norm": 3.4498317923781894, "learning_rate": 8.634365326032266e-06, "loss": 1.1365, "step": 10270 }, { "epoch": 0.5895346835733547, "grad_norm": 3.3147864267568217, "learning_rate": 8.624445828534891e-06, "loss": 1.1464, "step": 10275 }, { "epoch": 0.5898215617648747, "grad_norm": 2.97095268935642, "learning_rate": 8.614527710446322e-06, "loss": 1.1782, "step": 10280 }, { "epoch": 0.5901084399563945, "grad_norm": 3.0527075816760822, "learning_rate": 8.604610981712471e-06, "loss": 1.1252, "step": 10285 }, { "epoch": 0.5903953181479143, "grad_norm": 3.3636112882757847, "learning_rate": 8.594695652277857e-06, "loss": 1.1416, "step": 10290 }, { "epoch": 0.5906821963394343, "grad_norm": 4.04568655312239, "learning_rate": 8.584781732085598e-06, "loss": 1.1454, "step": 10295 }, { "epoch": 0.5909690745309542, "grad_norm": 3.800042409401551, "learning_rate": 8.574869231077385e-06, "loss": 1.1786, "step": 10300 }, { "epoch": 0.591255952722474, "grad_norm": 3.0676578781303467, "learning_rate": 8.564958159193505e-06, "loss": 1.1743, "step": 10305 }, { "epoch": 0.591542830913994, "grad_norm": 2.8473090497881213, "learning_rate": 8.555048526372806e-06, "loss": 1.0826, "step": 10310 }, { "epoch": 0.5918297091055138, "grad_norm": 3.289827158060531, "learning_rate": 8.545140342552675e-06, "loss": 1.1355, "step": 10315 }, { "epoch": 0.5921165872970336, "grad_norm": 3.2752142951688903, "learning_rate": 8.535233617669081e-06, "loss": 1.1547, "step": 10320 }, { "epoch": 0.5924034654885536, "grad_norm": 3.087627214650262, "learning_rate": 8.525328361656495e-06, "loss": 1.0938, "step": 10325 }, { "epoch": 0.5926903436800735, "grad_norm": 3.1159884481961724, "learning_rate": 8.515424584447935e-06, "loss": 1.0221, "step": 10330 }, { "epoch": 0.5929772218715933, "grad_norm": 3.7444380425122152, "learning_rate": 8.50552229597493e-06, "loss": 1.1517, "step": 10335 }, { "epoch": 0.5932641000631133, "grad_norm": 3.8550504649827273, "learning_rate": 8.495621506167519e-06, "loss": 1.1438, "step": 10340 }, { "epoch": 0.5935509782546331, "grad_norm": 3.8485404905093654, "learning_rate": 8.485722224954237e-06, "loss": 1.259, "step": 10345 }, { "epoch": 0.593837856446153, "grad_norm": 3.9536407738577326, "learning_rate": 8.475824462262097e-06, "loss": 1.1473, "step": 10350 }, { "epoch": 0.5941247346376728, "grad_norm": 3.3081624546734374, "learning_rate": 8.465928228016609e-06, "loss": 1.1519, "step": 10355 }, { "epoch": 0.5944116128291927, "grad_norm": 3.4448729368817816, "learning_rate": 8.456033532141736e-06, "loss": 1.1354, "step": 10360 }, { "epoch": 0.5946984910207126, "grad_norm": 3.5301344674280246, "learning_rate": 8.446140384559892e-06, "loss": 1.1427, "step": 10365 }, { "epoch": 0.5949853692122324, "grad_norm": 3.823306295723402, "learning_rate": 8.436248795191961e-06, "loss": 1.1503, "step": 10370 }, { "epoch": 0.5952722474037524, "grad_norm": 3.8075614650368594, "learning_rate": 8.426358773957243e-06, "loss": 1.1288, "step": 10375 }, { "epoch": 0.5955591255952722, "grad_norm": 3.564962651686787, "learning_rate": 8.416470330773472e-06, "loss": 1.0675, "step": 10380 }, { "epoch": 0.5958460037867921, "grad_norm": 3.514847600817712, "learning_rate": 8.406583475556808e-06, "loss": 1.1368, "step": 10385 }, { "epoch": 0.596132881978312, "grad_norm": 3.453608817532353, "learning_rate": 8.396698218221806e-06, "loss": 1.0554, "step": 10390 }, { "epoch": 0.5964197601698319, "grad_norm": 3.1683904461347483, "learning_rate": 8.38681456868143e-06, "loss": 1.1125, "step": 10395 }, { "epoch": 0.5967066383613517, "grad_norm": 3.3900342417375406, "learning_rate": 8.376932536847014e-06, "loss": 1.1597, "step": 10400 }, { "epoch": 0.5969935165528717, "grad_norm": 4.017222561070337, "learning_rate": 8.367052132628295e-06, "loss": 1.1438, "step": 10405 }, { "epoch": 0.5972803947443915, "grad_norm": 3.533069820010521, "learning_rate": 8.357173365933361e-06, "loss": 1.1266, "step": 10410 }, { "epoch": 0.5975672729359114, "grad_norm": 2.959372337368788, "learning_rate": 8.347296246668652e-06, "loss": 1.0382, "step": 10415 }, { "epoch": 0.5978541511274313, "grad_norm": 3.4800054751565628, "learning_rate": 8.33742078473898e-06, "loss": 1.1713, "step": 10420 }, { "epoch": 0.5981410293189512, "grad_norm": 3.43667874533604, "learning_rate": 8.327546990047471e-06, "loss": 1.1055, "step": 10425 }, { "epoch": 0.598427907510471, "grad_norm": 3.9036588847379403, "learning_rate": 8.317674872495588e-06, "loss": 1.0974, "step": 10430 }, { "epoch": 0.5987147857019909, "grad_norm": 3.5404323089205447, "learning_rate": 8.30780444198312e-06, "loss": 1.1055, "step": 10435 }, { "epoch": 0.5990016638935108, "grad_norm": 3.2383684032039444, "learning_rate": 8.29793570840815e-06, "loss": 1.2016, "step": 10440 }, { "epoch": 0.5992885420850307, "grad_norm": 3.248184906238019, "learning_rate": 8.288068681667066e-06, "loss": 1.1481, "step": 10445 }, { "epoch": 0.5995754202765505, "grad_norm": 3.1990199249917715, "learning_rate": 8.27820337165455e-06, "loss": 1.1253, "step": 10450 }, { "epoch": 0.5998622984680705, "grad_norm": 3.4223404910910245, "learning_rate": 8.268339788263552e-06, "loss": 1.1561, "step": 10455 }, { "epoch": 0.6001491766595903, "grad_norm": 3.5575953885039295, "learning_rate": 8.258477941385302e-06, "loss": 1.1477, "step": 10460 }, { "epoch": 0.6004360548511102, "grad_norm": 3.2455389844421334, "learning_rate": 8.248617840909269e-06, "loss": 1.0905, "step": 10465 }, { "epoch": 0.6007229330426301, "grad_norm": 3.7521681957818043, "learning_rate": 8.2387594967232e-06, "loss": 1.1958, "step": 10470 }, { "epoch": 0.60100981123415, "grad_norm": 3.5529231978214315, "learning_rate": 8.228902918713053e-06, "loss": 1.1084, "step": 10475 }, { "epoch": 0.6012966894256698, "grad_norm": 3.537452108649779, "learning_rate": 8.219048116763031e-06, "loss": 1.0927, "step": 10480 }, { "epoch": 0.6015835676171898, "grad_norm": 3.369027937516129, "learning_rate": 8.209195100755552e-06, "loss": 1.0944, "step": 10485 }, { "epoch": 0.6018704458087096, "grad_norm": 3.0308811283730135, "learning_rate": 8.199343880571241e-06, "loss": 1.1566, "step": 10490 }, { "epoch": 0.6021573240002295, "grad_norm": 3.2223805297262924, "learning_rate": 8.189494466088923e-06, "loss": 1.2292, "step": 10495 }, { "epoch": 0.6024442021917494, "grad_norm": 3.1826310681882135, "learning_rate": 8.179646867185617e-06, "loss": 1.1169, "step": 10500 }, { "epoch": 0.6027310803832693, "grad_norm": 3.3228339300469334, "learning_rate": 8.169801093736515e-06, "loss": 1.2028, "step": 10505 }, { "epoch": 0.6030179585747891, "grad_norm": 4.291919418679521, "learning_rate": 8.159957155614974e-06, "loss": 1.098, "step": 10510 }, { "epoch": 0.603304836766309, "grad_norm": 4.13549732273055, "learning_rate": 8.15011506269253e-06, "loss": 1.1119, "step": 10515 }, { "epoch": 0.6035917149578289, "grad_norm": 4.206930847058277, "learning_rate": 8.140274824838849e-06, "loss": 1.1413, "step": 10520 }, { "epoch": 0.6038785931493488, "grad_norm": 3.2360343772640907, "learning_rate": 8.130436451921743e-06, "loss": 1.1411, "step": 10525 }, { "epoch": 0.6041654713408686, "grad_norm": 3.5693759086162533, "learning_rate": 8.120599953807153e-06, "loss": 1.1413, "step": 10530 }, { "epoch": 0.6044523495323886, "grad_norm": 3.2648813387145155, "learning_rate": 8.110765340359146e-06, "loss": 1.1917, "step": 10535 }, { "epoch": 0.6047392277239084, "grad_norm": 4.259747246145986, "learning_rate": 8.10093262143989e-06, "loss": 1.1492, "step": 10540 }, { "epoch": 0.6050261059154283, "grad_norm": 3.9012478257310006, "learning_rate": 8.09110180690966e-06, "loss": 1.1591, "step": 10545 }, { "epoch": 0.6053129841069482, "grad_norm": 3.839134632255886, "learning_rate": 8.081272906626821e-06, "loss": 1.1079, "step": 10550 }, { "epoch": 0.6055998622984681, "grad_norm": 3.3716407410300127, "learning_rate": 8.071445930447813e-06, "loss": 1.1919, "step": 10555 }, { "epoch": 0.6058867404899879, "grad_norm": 3.402979606877552, "learning_rate": 8.061620888227147e-06, "loss": 1.1241, "step": 10560 }, { "epoch": 0.6061736186815079, "grad_norm": 3.281529454996215, "learning_rate": 8.051797789817403e-06, "loss": 1.0661, "step": 10565 }, { "epoch": 0.6064604968730277, "grad_norm": 3.4876696940710614, "learning_rate": 8.041976645069208e-06, "loss": 1.0727, "step": 10570 }, { "epoch": 0.6067473750645476, "grad_norm": 3.3730892639036902, "learning_rate": 8.032157463831217e-06, "loss": 1.152, "step": 10575 }, { "epoch": 0.6070342532560675, "grad_norm": 3.227084114387924, "learning_rate": 8.022340255950138e-06, "loss": 1.1295, "step": 10580 }, { "epoch": 0.6073211314475874, "grad_norm": 3.635840850572328, "learning_rate": 8.012525031270686e-06, "loss": 1.1349, "step": 10585 }, { "epoch": 0.6076080096391072, "grad_norm": 3.128822403019229, "learning_rate": 8.002711799635588e-06, "loss": 1.1103, "step": 10590 }, { "epoch": 0.6078948878306271, "grad_norm": 3.1172055807816386, "learning_rate": 7.992900570885574e-06, "loss": 1.0986, "step": 10595 }, { "epoch": 0.608181766022147, "grad_norm": 3.229369540687658, "learning_rate": 7.98309135485937e-06, "loss": 1.1559, "step": 10600 }, { "epoch": 0.6084686442136669, "grad_norm": 3.1476696910239923, "learning_rate": 7.973284161393681e-06, "loss": 1.1274, "step": 10605 }, { "epoch": 0.6087555224051867, "grad_norm": 3.7301236805730693, "learning_rate": 7.963479000323172e-06, "loss": 1.1168, "step": 10610 }, { "epoch": 0.6090424005967067, "grad_norm": 3.087133669603848, "learning_rate": 7.953675881480493e-06, "loss": 1.1549, "step": 10615 }, { "epoch": 0.6093292787882265, "grad_norm": 3.524537690913959, "learning_rate": 7.94387481469623e-06, "loss": 1.1158, "step": 10620 }, { "epoch": 0.6096161569797464, "grad_norm": 3.5219319562582596, "learning_rate": 7.93407580979891e-06, "loss": 1.2244, "step": 10625 }, { "epoch": 0.6099030351712663, "grad_norm": 3.0389587097206006, "learning_rate": 7.924278876615003e-06, "loss": 1.0849, "step": 10630 }, { "epoch": 0.6101899133627862, "grad_norm": 3.46258022579977, "learning_rate": 7.914484024968893e-06, "loss": 1.105, "step": 10635 }, { "epoch": 0.610476791554306, "grad_norm": 3.0519916154153433, "learning_rate": 7.90469126468288e-06, "loss": 1.0713, "step": 10640 }, { "epoch": 0.610763669745826, "grad_norm": 2.9609986352128232, "learning_rate": 7.894900605577162e-06, "loss": 1.135, "step": 10645 }, { "epoch": 0.6110505479373458, "grad_norm": 3.5594693007509677, "learning_rate": 7.88511205746984e-06, "loss": 1.1431, "step": 10650 }, { "epoch": 0.6113374261288657, "grad_norm": 3.281478294153034, "learning_rate": 7.875325630176889e-06, "loss": 1.0953, "step": 10655 }, { "epoch": 0.6116243043203856, "grad_norm": 3.7785552126566295, "learning_rate": 7.865541333512157e-06, "loss": 1.0698, "step": 10660 }, { "epoch": 0.6119111825119055, "grad_norm": 3.2719003289092186, "learning_rate": 7.855759177287367e-06, "loss": 1.0975, "step": 10665 }, { "epoch": 0.6121980607034253, "grad_norm": 3.135005154158364, "learning_rate": 7.84597917131208e-06, "loss": 1.1039, "step": 10670 }, { "epoch": 0.6124849388949452, "grad_norm": 3.2430104395304524, "learning_rate": 7.836201325393706e-06, "loss": 1.1521, "step": 10675 }, { "epoch": 0.6127718170864651, "grad_norm": 3.602884960883519, "learning_rate": 7.8264256493375e-06, "loss": 1.1162, "step": 10680 }, { "epoch": 0.613058695277985, "grad_norm": 3.514933905422461, "learning_rate": 7.816652152946528e-06, "loss": 1.1919, "step": 10685 }, { "epoch": 0.6133455734695048, "grad_norm": 3.348068893615265, "learning_rate": 7.80688084602167e-06, "loss": 1.0981, "step": 10690 }, { "epoch": 0.6136324516610248, "grad_norm": 3.4671713686954146, "learning_rate": 7.797111738361617e-06, "loss": 1.0554, "step": 10695 }, { "epoch": 0.6139193298525446, "grad_norm": 3.632965510069143, "learning_rate": 7.787344839762854e-06, "loss": 1.1347, "step": 10700 }, { "epoch": 0.6142062080440645, "grad_norm": 3.206658235736852, "learning_rate": 7.77758016001965e-06, "loss": 1.0654, "step": 10705 }, { "epoch": 0.6144930862355844, "grad_norm": 3.3597970867937934, "learning_rate": 7.767817708924037e-06, "loss": 1.1416, "step": 10710 }, { "epoch": 0.6147799644271043, "grad_norm": 3.4279758941595238, "learning_rate": 7.75805749626584e-06, "loss": 1.1215, "step": 10715 }, { "epoch": 0.6150668426186241, "grad_norm": 3.511761088525684, "learning_rate": 7.74829953183261e-06, "loss": 1.1508, "step": 10720 }, { "epoch": 0.6153537208101441, "grad_norm": 3.3358171237604113, "learning_rate": 7.738543825409653e-06, "loss": 1.1496, "step": 10725 }, { "epoch": 0.6156405990016639, "grad_norm": 3.238325538879624, "learning_rate": 7.728790386780025e-06, "loss": 1.1205, "step": 10730 }, { "epoch": 0.6159274771931837, "grad_norm": 2.9750996876137283, "learning_rate": 7.719039225724489e-06, "loss": 1.2104, "step": 10735 }, { "epoch": 0.6162143553847037, "grad_norm": 3.3057362574483884, "learning_rate": 7.70929035202153e-06, "loss": 1.017, "step": 10740 }, { "epoch": 0.6165012335762236, "grad_norm": 3.2383250611644425, "learning_rate": 7.699543775447346e-06, "loss": 1.1178, "step": 10745 }, { "epoch": 0.6167881117677434, "grad_norm": 3.122797008150266, "learning_rate": 7.689799505775822e-06, "loss": 1.1405, "step": 10750 }, { "epoch": 0.6170749899592632, "grad_norm": 3.2956149313480383, "learning_rate": 7.68005755277853e-06, "loss": 1.0597, "step": 10755 }, { "epoch": 0.6173618681507832, "grad_norm": 3.0999412012926357, "learning_rate": 7.670317926224729e-06, "loss": 1.1592, "step": 10760 }, { "epoch": 0.617648746342303, "grad_norm": 3.1377118348732687, "learning_rate": 7.660580635881338e-06, "loss": 1.1541, "step": 10765 }, { "epoch": 0.6179356245338229, "grad_norm": 3.624883847656244, "learning_rate": 7.65084569151293e-06, "loss": 1.0653, "step": 10770 }, { "epoch": 0.6182225027253428, "grad_norm": 3.0995811360897845, "learning_rate": 7.641113102881726e-06, "loss": 1.0776, "step": 10775 }, { "epoch": 0.6185093809168627, "grad_norm": 3.310964400087299, "learning_rate": 7.631382879747598e-06, "loss": 1.1113, "step": 10780 }, { "epoch": 0.6187962591083825, "grad_norm": 3.1953191318756433, "learning_rate": 7.621655031868026e-06, "loss": 1.1507, "step": 10785 }, { "epoch": 0.6190831372999025, "grad_norm": 3.4117046070121577, "learning_rate": 7.6119295689981195e-06, "loss": 1.1932, "step": 10790 }, { "epoch": 0.6193700154914223, "grad_norm": 3.619793438981974, "learning_rate": 7.6022065008906e-06, "loss": 1.1154, "step": 10795 }, { "epoch": 0.6196568936829422, "grad_norm": 3.149579933403025, "learning_rate": 7.592485837295776e-06, "loss": 1.1184, "step": 10800 }, { "epoch": 0.6199437718744621, "grad_norm": 3.303832432901427, "learning_rate": 7.5827675879615525e-06, "loss": 1.0939, "step": 10805 }, { "epoch": 0.620230650065982, "grad_norm": 3.49512970220587, "learning_rate": 7.573051762633414e-06, "loss": 1.1345, "step": 10810 }, { "epoch": 0.6205175282575018, "grad_norm": 3.514951199162295, "learning_rate": 7.563338371054411e-06, "loss": 1.1792, "step": 10815 }, { "epoch": 0.6208044064490218, "grad_norm": 3.4168143038006784, "learning_rate": 7.553627422965149e-06, "loss": 1.1442, "step": 10820 }, { "epoch": 0.6210912846405416, "grad_norm": 3.2578545200943703, "learning_rate": 7.543918928103795e-06, "loss": 1.1108, "step": 10825 }, { "epoch": 0.6213781628320615, "grad_norm": 3.6145213361166983, "learning_rate": 7.534212896206051e-06, "loss": 1.0369, "step": 10830 }, { "epoch": 0.6216650410235813, "grad_norm": 3.5985095534387974, "learning_rate": 7.5245093370051415e-06, "loss": 1.0947, "step": 10835 }, { "epoch": 0.6219519192151013, "grad_norm": 3.0794908308926363, "learning_rate": 7.514808260231818e-06, "loss": 1.0591, "step": 10840 }, { "epoch": 0.6222387974066211, "grad_norm": 3.3118310211773796, "learning_rate": 7.505109675614346e-06, "loss": 1.0526, "step": 10845 }, { "epoch": 0.622525675598141, "grad_norm": 3.1306776006377306, "learning_rate": 7.4954135928784846e-06, "loss": 1.0694, "step": 10850 }, { "epoch": 0.6228125537896609, "grad_norm": 3.122367199281352, "learning_rate": 7.485720021747487e-06, "loss": 1.1007, "step": 10855 }, { "epoch": 0.6230994319811808, "grad_norm": 3.066018012864805, "learning_rate": 7.476028971942093e-06, "loss": 1.0839, "step": 10860 }, { "epoch": 0.6233863101727006, "grad_norm": 3.2860871365974824, "learning_rate": 7.466340453180506e-06, "loss": 1.1455, "step": 10865 }, { "epoch": 0.6236731883642206, "grad_norm": 3.388409144732927, "learning_rate": 7.456654475178389e-06, "loss": 1.1214, "step": 10870 }, { "epoch": 0.6239600665557404, "grad_norm": 3.3815885531547525, "learning_rate": 7.446971047648872e-06, "loss": 1.0631, "step": 10875 }, { "epoch": 0.6242469447472603, "grad_norm": 3.139100827646729, "learning_rate": 7.437290180302512e-06, "loss": 1.1298, "step": 10880 }, { "epoch": 0.6245338229387802, "grad_norm": 3.093703499499235, "learning_rate": 7.427611882847302e-06, "loss": 1.1045, "step": 10885 }, { "epoch": 0.6248207011303001, "grad_norm": 3.187663187645134, "learning_rate": 7.4179361649886705e-06, "loss": 1.168, "step": 10890 }, { "epoch": 0.6251075793218199, "grad_norm": 3.1922870360327065, "learning_rate": 7.408263036429441e-06, "loss": 1.0455, "step": 10895 }, { "epoch": 0.6253944575133399, "grad_norm": 3.270107942435463, "learning_rate": 7.398592506869848e-06, "loss": 1.1229, "step": 10900 }, { "epoch": 0.6256813357048597, "grad_norm": 3.6271043112003043, "learning_rate": 7.388924586007523e-06, "loss": 1.0453, "step": 10905 }, { "epoch": 0.6259682138963796, "grad_norm": 3.1619030637738144, "learning_rate": 7.379259283537478e-06, "loss": 1.1224, "step": 10910 }, { "epoch": 0.6262550920878994, "grad_norm": 3.3059971832224027, "learning_rate": 7.3695966091521055e-06, "loss": 1.1546, "step": 10915 }, { "epoch": 0.6265419702794194, "grad_norm": 3.5409392945213556, "learning_rate": 7.359936572541142e-06, "loss": 1.1948, "step": 10920 }, { "epoch": 0.6268288484709392, "grad_norm": 4.03142223368236, "learning_rate": 7.350279183391712e-06, "loss": 1.111, "step": 10925 }, { "epoch": 0.6271157266624591, "grad_norm": 2.9951355314991672, "learning_rate": 7.3406244513882565e-06, "loss": 1.1181, "step": 10930 }, { "epoch": 0.627402604853979, "grad_norm": 2.9586176017825547, "learning_rate": 7.330972386212558e-06, "loss": 1.1008, "step": 10935 }, { "epoch": 0.6276894830454989, "grad_norm": 3.683786306646009, "learning_rate": 7.3213229975437435e-06, "loss": 1.14, "step": 10940 }, { "epoch": 0.6279763612370187, "grad_norm": 3.1744390637289888, "learning_rate": 7.311676295058232e-06, "loss": 1.0642, "step": 10945 }, { "epoch": 0.6282632394285387, "grad_norm": 3.2404438834146454, "learning_rate": 7.3020322884297565e-06, "loss": 1.0692, "step": 10950 }, { "epoch": 0.6285501176200585, "grad_norm": 3.5740660637760775, "learning_rate": 7.292390987329356e-06, "loss": 1.1299, "step": 10955 }, { "epoch": 0.6288369958115784, "grad_norm": 3.3524153005138166, "learning_rate": 7.282752401425344e-06, "loss": 1.1098, "step": 10960 }, { "epoch": 0.6291238740030983, "grad_norm": 3.146892482490224, "learning_rate": 7.273116540383319e-06, "loss": 1.203, "step": 10965 }, { "epoch": 0.6294107521946182, "grad_norm": 3.3450765263829743, "learning_rate": 7.263483413866136e-06, "loss": 1.1603, "step": 10970 }, { "epoch": 0.629697630386138, "grad_norm": 3.1255291935049803, "learning_rate": 7.253853031533929e-06, "loss": 1.1126, "step": 10975 }, { "epoch": 0.629984508577658, "grad_norm": 3.5515104227745318, "learning_rate": 7.244225403044056e-06, "loss": 1.1387, "step": 10980 }, { "epoch": 0.6302713867691778, "grad_norm": 3.355871022473389, "learning_rate": 7.2346005380511246e-06, "loss": 1.0757, "step": 10985 }, { "epoch": 0.6305582649606977, "grad_norm": 3.5550892831854397, "learning_rate": 7.22497844620698e-06, "loss": 1.087, "step": 10990 }, { "epoch": 0.6308451431522175, "grad_norm": 2.9847579570518925, "learning_rate": 7.215359137160673e-06, "loss": 1.0553, "step": 10995 }, { "epoch": 0.6311320213437375, "grad_norm": 3.996925182557025, "learning_rate": 7.2057426205584636e-06, "loss": 1.1199, "step": 11000 }, { "epoch": 0.6314188995352573, "grad_norm": 3.330511185797482, "learning_rate": 7.196128906043822e-06, "loss": 1.215, "step": 11005 }, { "epoch": 0.6317057777267772, "grad_norm": 3.289481056681264, "learning_rate": 7.186518003257401e-06, "loss": 1.0309, "step": 11010 }, { "epoch": 0.6319926559182971, "grad_norm": 4.147730677305381, "learning_rate": 7.176909921837034e-06, "loss": 1.2476, "step": 11015 }, { "epoch": 0.632279534109817, "grad_norm": 3.1825381056603566, "learning_rate": 7.167304671417728e-06, "loss": 1.0784, "step": 11020 }, { "epoch": 0.6325664123013368, "grad_norm": 3.1448813493505594, "learning_rate": 7.157702261631654e-06, "loss": 1.1087, "step": 11025 }, { "epoch": 0.6328532904928568, "grad_norm": 3.159587170061402, "learning_rate": 7.148102702108122e-06, "loss": 1.1429, "step": 11030 }, { "epoch": 0.6331401686843766, "grad_norm": 3.45520592476515, "learning_rate": 7.13850600247359e-06, "loss": 1.1448, "step": 11035 }, { "epoch": 0.6334270468758965, "grad_norm": 3.8561852275373236, "learning_rate": 7.1289121723516645e-06, "loss": 1.0894, "step": 11040 }, { "epoch": 0.6337139250674164, "grad_norm": 3.384594809804652, "learning_rate": 7.119321221363047e-06, "loss": 1.0924, "step": 11045 }, { "epoch": 0.6340008032589363, "grad_norm": 3.0323462598524293, "learning_rate": 7.1097331591255654e-06, "loss": 1.0975, "step": 11050 }, { "epoch": 0.6342876814504561, "grad_norm": 3.154023549281135, "learning_rate": 7.100147995254156e-06, "loss": 1.0714, "step": 11055 }, { "epoch": 0.6345745596419761, "grad_norm": 2.9403034527800673, "learning_rate": 7.090565739360839e-06, "loss": 1.1272, "step": 11060 }, { "epoch": 0.6348614378334959, "grad_norm": 3.3220243260979756, "learning_rate": 7.0809864010547215e-06, "loss": 1.0133, "step": 11065 }, { "epoch": 0.6351483160250158, "grad_norm": 3.4628066122776118, "learning_rate": 7.071409989941988e-06, "loss": 1.1096, "step": 11070 }, { "epoch": 0.6354351942165356, "grad_norm": 3.0744286409872683, "learning_rate": 7.0618365156258865e-06, "loss": 1.1119, "step": 11075 }, { "epoch": 0.6357220724080556, "grad_norm": 3.031991658047844, "learning_rate": 7.052265987706708e-06, "loss": 1.0874, "step": 11080 }, { "epoch": 0.6360089505995754, "grad_norm": 3.05896206496098, "learning_rate": 7.042698415781813e-06, "loss": 1.1248, "step": 11085 }, { "epoch": 0.6362958287910953, "grad_norm": 3.519373429961666, "learning_rate": 7.033133809445577e-06, "loss": 1.0505, "step": 11090 }, { "epoch": 0.6365827069826152, "grad_norm": 3.2951022330449176, "learning_rate": 7.02357217828941e-06, "loss": 1.0918, "step": 11095 }, { "epoch": 0.6368695851741351, "grad_norm": 2.8857756341855203, "learning_rate": 7.014013531901734e-06, "loss": 1.0658, "step": 11100 }, { "epoch": 0.6371564633656549, "grad_norm": 4.564077488695055, "learning_rate": 7.004457879867987e-06, "loss": 1.1134, "step": 11105 }, { "epoch": 0.6374433415571749, "grad_norm": 3.387166750367618, "learning_rate": 6.994905231770593e-06, "loss": 1.0455, "step": 11110 }, { "epoch": 0.6377302197486947, "grad_norm": 3.196694589763004, "learning_rate": 6.985355597188972e-06, "loss": 1.0738, "step": 11115 }, { "epoch": 0.6380170979402146, "grad_norm": 3.7583563455949713, "learning_rate": 6.975808985699518e-06, "loss": 1.1462, "step": 11120 }, { "epoch": 0.6383039761317345, "grad_norm": 3.1346608940341505, "learning_rate": 6.966265406875597e-06, "loss": 1.0805, "step": 11125 }, { "epoch": 0.6385908543232544, "grad_norm": 3.207307378395864, "learning_rate": 6.956724870287523e-06, "loss": 1.0449, "step": 11130 }, { "epoch": 0.6388777325147742, "grad_norm": 2.9326439035336866, "learning_rate": 6.947187385502581e-06, "loss": 1.0995, "step": 11135 }, { "epoch": 0.6391646107062942, "grad_norm": 3.3228516010494316, "learning_rate": 6.93765296208497e-06, "loss": 1.0915, "step": 11140 }, { "epoch": 0.639451488897814, "grad_norm": 3.414745354185812, "learning_rate": 6.928121609595836e-06, "loss": 1.1057, "step": 11145 }, { "epoch": 0.6397383670893338, "grad_norm": 3.349972624215406, "learning_rate": 6.918593337593238e-06, "loss": 1.1021, "step": 11150 }, { "epoch": 0.6400252452808537, "grad_norm": 4.876690850060579, "learning_rate": 6.909068155632154e-06, "loss": 1.1078, "step": 11155 }, { "epoch": 0.6403121234723737, "grad_norm": 3.2418259686856996, "learning_rate": 6.899546073264453e-06, "loss": 1.1288, "step": 11160 }, { "epoch": 0.6405990016638935, "grad_norm": 3.0001670021134594, "learning_rate": 6.8900271000389006e-06, "loss": 1.0425, "step": 11165 }, { "epoch": 0.6408858798554133, "grad_norm": 3.5299791113378194, "learning_rate": 6.880511245501149e-06, "loss": 1.1914, "step": 11170 }, { "epoch": 0.6411727580469333, "grad_norm": 3.2286573128031844, "learning_rate": 6.870998519193717e-06, "loss": 1.1029, "step": 11175 }, { "epoch": 0.6414596362384531, "grad_norm": 3.5557670101589225, "learning_rate": 6.861488930655979e-06, "loss": 1.1295, "step": 11180 }, { "epoch": 0.641746514429973, "grad_norm": 3.4374764294695783, "learning_rate": 6.8519824894241874e-06, "loss": 1.1196, "step": 11185 }, { "epoch": 0.642033392621493, "grad_norm": 3.1108892188156956, "learning_rate": 6.842479205031411e-06, "loss": 1.1208, "step": 11190 }, { "epoch": 0.6423202708130128, "grad_norm": 3.0894146117680252, "learning_rate": 6.832979087007565e-06, "loss": 1.1425, "step": 11195 }, { "epoch": 0.6426071490045326, "grad_norm": 3.4015366187849105, "learning_rate": 6.8234821448793985e-06, "loss": 1.1151, "step": 11200 }, { "epoch": 0.6428940271960526, "grad_norm": 3.201023061197176, "learning_rate": 6.813988388170456e-06, "loss": 1.1346, "step": 11205 }, { "epoch": 0.6431809053875724, "grad_norm": 3.3600172492517935, "learning_rate": 6.804497826401104e-06, "loss": 1.1382, "step": 11210 }, { "epoch": 0.6434677835790923, "grad_norm": 3.0569779165742705, "learning_rate": 6.7950104690884945e-06, "loss": 1.1352, "step": 11215 }, { "epoch": 0.6437546617706122, "grad_norm": 3.42295545803529, "learning_rate": 6.785526325746577e-06, "loss": 1.1494, "step": 11220 }, { "epoch": 0.6440415399621321, "grad_norm": 3.378314499025113, "learning_rate": 6.776045405886067e-06, "loss": 1.1907, "step": 11225 }, { "epoch": 0.6443284181536519, "grad_norm": 3.272441466651842, "learning_rate": 6.76656771901445e-06, "loss": 1.1361, "step": 11230 }, { "epoch": 0.6446152963451719, "grad_norm": 3.024902534223318, "learning_rate": 6.7570932746359795e-06, "loss": 1.1017, "step": 11235 }, { "epoch": 0.6449021745366917, "grad_norm": 3.1096163538072967, "learning_rate": 6.747622082251644e-06, "loss": 1.0555, "step": 11240 }, { "epoch": 0.6451890527282116, "grad_norm": 3.047747253604133, "learning_rate": 6.738154151359172e-06, "loss": 1.0962, "step": 11245 }, { "epoch": 0.6454759309197314, "grad_norm": 3.638852111164968, "learning_rate": 6.728689491453039e-06, "loss": 1.1301, "step": 11250 }, { "epoch": 0.6457628091112514, "grad_norm": 3.075064969437343, "learning_rate": 6.719228112024416e-06, "loss": 1.0948, "step": 11255 }, { "epoch": 0.6460496873027712, "grad_norm": 2.919138408443992, "learning_rate": 6.709770022561197e-06, "loss": 1.1248, "step": 11260 }, { "epoch": 0.6463365654942911, "grad_norm": 3.123136763222794, "learning_rate": 6.7003152325479806e-06, "loss": 1.148, "step": 11265 }, { "epoch": 0.646623443685811, "grad_norm": 3.0808525980449, "learning_rate": 6.690863751466048e-06, "loss": 1.0855, "step": 11270 }, { "epoch": 0.6469103218773309, "grad_norm": 4.00691899721447, "learning_rate": 6.681415588793368e-06, "loss": 1.1233, "step": 11275 }, { "epoch": 0.6471972000688507, "grad_norm": 2.9961310717920675, "learning_rate": 6.67197075400457e-06, "loss": 1.0425, "step": 11280 }, { "epoch": 0.6474840782603707, "grad_norm": 3.4559166390928087, "learning_rate": 6.662529256570968e-06, "loss": 1.1071, "step": 11285 }, { "epoch": 0.6477709564518905, "grad_norm": 3.16419973843326, "learning_rate": 6.653091105960512e-06, "loss": 1.0975, "step": 11290 }, { "epoch": 0.6480578346434104, "grad_norm": 3.0579507329397275, "learning_rate": 6.643656311637796e-06, "loss": 1.0528, "step": 11295 }, { "epoch": 0.6483447128349303, "grad_norm": 3.6683602834121145, "learning_rate": 6.63422488306406e-06, "loss": 1.1063, "step": 11300 }, { "epoch": 0.6486315910264502, "grad_norm": 3.372407090751424, "learning_rate": 6.624796829697158e-06, "loss": 1.204, "step": 11305 }, { "epoch": 0.64891846921797, "grad_norm": 3.2300510669688496, "learning_rate": 6.615372160991561e-06, "loss": 1.1688, "step": 11310 }, { "epoch": 0.64920534740949, "grad_norm": 3.219032858656139, "learning_rate": 6.6059508863983535e-06, "loss": 1.0517, "step": 11315 }, { "epoch": 0.6494922256010098, "grad_norm": 3.1646923626637893, "learning_rate": 6.596533015365207e-06, "loss": 1.0325, "step": 11320 }, { "epoch": 0.6497791037925297, "grad_norm": 3.283182439144784, "learning_rate": 6.5871185573363825e-06, "loss": 1.0787, "step": 11325 }, { "epoch": 0.6500659819840495, "grad_norm": 3.9294961513350066, "learning_rate": 6.577707521752725e-06, "loss": 1.1193, "step": 11330 }, { "epoch": 0.6503528601755695, "grad_norm": 3.904810068983176, "learning_rate": 6.56829991805164e-06, "loss": 1.1131, "step": 11335 }, { "epoch": 0.6506397383670893, "grad_norm": 4.596010228643845, "learning_rate": 6.558895755667092e-06, "loss": 1.1086, "step": 11340 }, { "epoch": 0.6509266165586092, "grad_norm": 3.356451780090185, "learning_rate": 6.549495044029593e-06, "loss": 1.15, "step": 11345 }, { "epoch": 0.6512134947501291, "grad_norm": 3.1132547834668256, "learning_rate": 6.5400977925662025e-06, "loss": 1.0929, "step": 11350 }, { "epoch": 0.651500372941649, "grad_norm": 3.4343325847214916, "learning_rate": 6.530704010700504e-06, "loss": 1.0607, "step": 11355 }, { "epoch": 0.6517872511331688, "grad_norm": 3.372502250681165, "learning_rate": 6.521313707852601e-06, "loss": 1.107, "step": 11360 }, { "epoch": 0.6520741293246888, "grad_norm": 3.345522460623843, "learning_rate": 6.511926893439116e-06, "loss": 1.0842, "step": 11365 }, { "epoch": 0.6523610075162086, "grad_norm": 2.9286985465362525, "learning_rate": 6.502543576873163e-06, "loss": 1.0745, "step": 11370 }, { "epoch": 0.6526478857077285, "grad_norm": 3.304952132136242, "learning_rate": 6.493163767564352e-06, "loss": 1.1161, "step": 11375 }, { "epoch": 0.6529347638992484, "grad_norm": 3.341612581378258, "learning_rate": 6.483787474918779e-06, "loss": 1.1084, "step": 11380 }, { "epoch": 0.6532216420907683, "grad_norm": 3.4244090077374674, "learning_rate": 6.474414708339014e-06, "loss": 1.2071, "step": 11385 }, { "epoch": 0.6535085202822881, "grad_norm": 3.3753083230683707, "learning_rate": 6.46504547722408e-06, "loss": 1.131, "step": 11390 }, { "epoch": 0.6537953984738081, "grad_norm": 3.235050238400293, "learning_rate": 6.455679790969474e-06, "loss": 1.1066, "step": 11395 }, { "epoch": 0.6540822766653279, "grad_norm": 3.419397186092815, "learning_rate": 6.446317658967118e-06, "loss": 1.0519, "step": 11400 }, { "epoch": 0.6543691548568478, "grad_norm": 3.1162712864899715, "learning_rate": 6.436959090605384e-06, "loss": 1.0856, "step": 11405 }, { "epoch": 0.6546560330483676, "grad_norm": 3.7532014293567606, "learning_rate": 6.42760409526906e-06, "loss": 1.0824, "step": 11410 }, { "epoch": 0.6549429112398876, "grad_norm": 3.2689210839800893, "learning_rate": 6.418252682339362e-06, "loss": 1.087, "step": 11415 }, { "epoch": 0.6552297894314074, "grad_norm": 3.4587716777297075, "learning_rate": 6.4089048611939065e-06, "loss": 1.1311, "step": 11420 }, { "epoch": 0.6555166676229273, "grad_norm": 3.0862584668193396, "learning_rate": 6.399560641206706e-06, "loss": 1.1036, "step": 11425 }, { "epoch": 0.6558035458144472, "grad_norm": 3.53250702750914, "learning_rate": 6.390220031748171e-06, "loss": 1.1085, "step": 11430 }, { "epoch": 0.6560904240059671, "grad_norm": 2.916652979111715, "learning_rate": 6.380883042185085e-06, "loss": 1.1006, "step": 11435 }, { "epoch": 0.6563773021974869, "grad_norm": 3.6535146723376517, "learning_rate": 6.371549681880593e-06, "loss": 1.087, "step": 11440 }, { "epoch": 0.6566641803890069, "grad_norm": 3.0392587092398933, "learning_rate": 6.362219960194222e-06, "loss": 1.0228, "step": 11445 }, { "epoch": 0.6569510585805267, "grad_norm": 3.111868934720598, "learning_rate": 6.35289388648183e-06, "loss": 1.0809, "step": 11450 }, { "epoch": 0.6572379367720466, "grad_norm": 3.658633186499982, "learning_rate": 6.343571470095624e-06, "loss": 1.0233, "step": 11455 }, { "epoch": 0.6575248149635665, "grad_norm": 3.634866803388317, "learning_rate": 6.334252720384153e-06, "loss": 1.0735, "step": 11460 }, { "epoch": 0.6578116931550864, "grad_norm": 2.996350960234766, "learning_rate": 6.3249376466922705e-06, "loss": 1.076, "step": 11465 }, { "epoch": 0.6580985713466062, "grad_norm": 3.1687083598610664, "learning_rate": 6.315626258361158e-06, "loss": 1.1335, "step": 11470 }, { "epoch": 0.6583854495381262, "grad_norm": 3.223075596967375, "learning_rate": 6.306318564728293e-06, "loss": 1.1181, "step": 11475 }, { "epoch": 0.658672327729646, "grad_norm": 3.028424361612919, "learning_rate": 6.297014575127456e-06, "loss": 1.1224, "step": 11480 }, { "epoch": 0.6589592059211659, "grad_norm": 3.8361473553055974, "learning_rate": 6.2877142988887094e-06, "loss": 1.1674, "step": 11485 }, { "epoch": 0.6592460841126857, "grad_norm": 2.931021218358947, "learning_rate": 6.278417745338381e-06, "loss": 1.1026, "step": 11490 }, { "epoch": 0.6595329623042057, "grad_norm": 3.088811614778611, "learning_rate": 6.269124923799089e-06, "loss": 1.0696, "step": 11495 }, { "epoch": 0.6598198404957255, "grad_norm": 3.2350144799130542, "learning_rate": 6.259835843589688e-06, "loss": 1.1792, "step": 11500 }, { "epoch": 0.6601067186872454, "grad_norm": 3.2976686337358503, "learning_rate": 6.250550514025286e-06, "loss": 1.1755, "step": 11505 }, { "epoch": 0.6603935968787653, "grad_norm": 3.873848732482631, "learning_rate": 6.241268944417241e-06, "loss": 1.1135, "step": 11510 }, { "epoch": 0.6606804750702852, "grad_norm": 3.1564046097019687, "learning_rate": 6.231991144073126e-06, "loss": 1.0551, "step": 11515 }, { "epoch": 0.660967353261805, "grad_norm": 3.16795356356215, "learning_rate": 6.222717122296738e-06, "loss": 1.1053, "step": 11520 }, { "epoch": 0.661254231453325, "grad_norm": 3.02338647913606, "learning_rate": 6.213446888388093e-06, "loss": 1.0885, "step": 11525 }, { "epoch": 0.6615411096448448, "grad_norm": 2.9701484517767844, "learning_rate": 6.2041804516434005e-06, "loss": 1.001, "step": 11530 }, { "epoch": 0.6618279878363647, "grad_norm": 3.237910149232802, "learning_rate": 6.194917821355062e-06, "loss": 1.0834, "step": 11535 }, { "epoch": 0.6621148660278846, "grad_norm": 3.3040423016230407, "learning_rate": 6.18565900681166e-06, "loss": 1.1528, "step": 11540 }, { "epoch": 0.6624017442194045, "grad_norm": 3.6607947887989507, "learning_rate": 6.176404017297965e-06, "loss": 1.1604, "step": 11545 }, { "epoch": 0.6626886224109243, "grad_norm": 3.10165114067369, "learning_rate": 6.167152862094894e-06, "loss": 1.0387, "step": 11550 }, { "epoch": 0.6629755006024443, "grad_norm": 4.457050079150402, "learning_rate": 6.157905550479525e-06, "loss": 1.1242, "step": 11555 }, { "epoch": 0.6632623787939641, "grad_norm": 3.0385441869908165, "learning_rate": 6.148662091725087e-06, "loss": 1.075, "step": 11560 }, { "epoch": 0.663549256985484, "grad_norm": 3.4936647505356815, "learning_rate": 6.139422495100939e-06, "loss": 1.1599, "step": 11565 }, { "epoch": 0.6638361351770038, "grad_norm": 4.059984957471122, "learning_rate": 6.13018676987257e-06, "loss": 1.0705, "step": 11570 }, { "epoch": 0.6641230133685238, "grad_norm": 3.2537001665191263, "learning_rate": 6.120954925301588e-06, "loss": 1.1138, "step": 11575 }, { "epoch": 0.6644098915600436, "grad_norm": 3.384286980760091, "learning_rate": 6.111726970645703e-06, "loss": 1.1097, "step": 11580 }, { "epoch": 0.6646967697515634, "grad_norm": 3.41419760602678, "learning_rate": 6.102502915158733e-06, "loss": 1.0835, "step": 11585 }, { "epoch": 0.6649836479430834, "grad_norm": 3.238116637613142, "learning_rate": 6.093282768090574e-06, "loss": 1.034, "step": 11590 }, { "epoch": 0.6652705261346032, "grad_norm": 3.531639476487918, "learning_rate": 6.084066538687222e-06, "loss": 1.053, "step": 11595 }, { "epoch": 0.6655574043261231, "grad_norm": 3.2298044240910397, "learning_rate": 6.0748542361907235e-06, "loss": 1.0981, "step": 11600 }, { "epoch": 0.665844282517643, "grad_norm": 3.5127045226237743, "learning_rate": 6.0656458698391965e-06, "loss": 1.0697, "step": 11605 }, { "epoch": 0.6661311607091629, "grad_norm": 3.285099372797378, "learning_rate": 6.056441448866817e-06, "loss": 1.1243, "step": 11610 }, { "epoch": 0.6664180389006827, "grad_norm": 3.28115693204597, "learning_rate": 6.0472409825037924e-06, "loss": 1.1439, "step": 11615 }, { "epoch": 0.6667049170922027, "grad_norm": 3.0091995422983757, "learning_rate": 6.038044479976375e-06, "loss": 1.0445, "step": 11620 }, { "epoch": 0.6669917952837225, "grad_norm": 3.6164224883990777, "learning_rate": 6.028851950506838e-06, "loss": 1.0902, "step": 11625 }, { "epoch": 0.6672786734752424, "grad_norm": 3.069607797158091, "learning_rate": 6.0196634033134704e-06, "loss": 1.1082, "step": 11630 }, { "epoch": 0.6675655516667623, "grad_norm": 3.2508189661760394, "learning_rate": 6.0104788476105646e-06, "loss": 1.1173, "step": 11635 }, { "epoch": 0.6678524298582822, "grad_norm": 3.08434873413725, "learning_rate": 6.001298292608419e-06, "loss": 1.1428, "step": 11640 }, { "epoch": 0.668139308049802, "grad_norm": 3.423831180430705, "learning_rate": 5.9921217475133155e-06, "loss": 1.1113, "step": 11645 }, { "epoch": 0.6684261862413219, "grad_norm": 3.117998282513249, "learning_rate": 5.982949221527506e-06, "loss": 1.0782, "step": 11650 }, { "epoch": 0.6687130644328418, "grad_norm": 3.588437165798767, "learning_rate": 5.973780723849225e-06, "loss": 1.0776, "step": 11655 }, { "epoch": 0.6689999426243617, "grad_norm": 3.3214264079628864, "learning_rate": 5.9646162636726634e-06, "loss": 1.0633, "step": 11660 }, { "epoch": 0.6692868208158815, "grad_norm": 3.3677436009562407, "learning_rate": 5.955455850187962e-06, "loss": 1.0433, "step": 11665 }, { "epoch": 0.6695736990074015, "grad_norm": 3.355644797098839, "learning_rate": 5.946299492581201e-06, "loss": 1.0832, "step": 11670 }, { "epoch": 0.6698605771989213, "grad_norm": 3.708248859127841, "learning_rate": 5.937147200034401e-06, "loss": 1.1154, "step": 11675 }, { "epoch": 0.6701474553904412, "grad_norm": 3.2657649229815267, "learning_rate": 5.9279989817255e-06, "loss": 1.1551, "step": 11680 }, { "epoch": 0.6704343335819611, "grad_norm": 3.5387411910470816, "learning_rate": 5.918854846828348e-06, "loss": 1.0112, "step": 11685 }, { "epoch": 0.670721211773481, "grad_norm": 3.4874484653519247, "learning_rate": 5.909714804512709e-06, "loss": 1.0696, "step": 11690 }, { "epoch": 0.6710080899650008, "grad_norm": 3.2666769356560943, "learning_rate": 5.900578863944239e-06, "loss": 1.0536, "step": 11695 }, { "epoch": 0.6712949681565208, "grad_norm": 3.340737049724393, "learning_rate": 5.8914470342844694e-06, "loss": 1.0156, "step": 11700 }, { "epoch": 0.6715818463480406, "grad_norm": 3.225746488901726, "learning_rate": 5.882319324690835e-06, "loss": 1.1209, "step": 11705 }, { "epoch": 0.6718687245395605, "grad_norm": 3.7788510040402494, "learning_rate": 5.873195744316611e-06, "loss": 1.1069, "step": 11710 }, { "epoch": 0.6721556027310804, "grad_norm": 3.541166956098384, "learning_rate": 5.86407630231095e-06, "loss": 1.0904, "step": 11715 }, { "epoch": 0.6724424809226003, "grad_norm": 3.303678890629549, "learning_rate": 5.854961007818845e-06, "loss": 1.0765, "step": 11720 }, { "epoch": 0.6727293591141201, "grad_norm": 3.2903809506633417, "learning_rate": 5.845849869981137e-06, "loss": 1.1227, "step": 11725 }, { "epoch": 0.67301623730564, "grad_norm": 3.130816846902172, "learning_rate": 5.836742897934497e-06, "loss": 1.0525, "step": 11730 }, { "epoch": 0.6733031154971599, "grad_norm": 3.1457425553885257, "learning_rate": 5.827640100811409e-06, "loss": 1.0405, "step": 11735 }, { "epoch": 0.6735899936886798, "grad_norm": 3.204972389635532, "learning_rate": 5.818541487740188e-06, "loss": 1.0829, "step": 11740 }, { "epoch": 0.6738768718801996, "grad_norm": 3.128989934923962, "learning_rate": 5.80944706784494e-06, "loss": 1.1753, "step": 11745 }, { "epoch": 0.6741637500717196, "grad_norm": 3.665805411594829, "learning_rate": 5.800356850245568e-06, "loss": 1.1102, "step": 11750 }, { "epoch": 0.6744506282632394, "grad_norm": 3.1336867968270474, "learning_rate": 5.791270844057764e-06, "loss": 1.0566, "step": 11755 }, { "epoch": 0.6747375064547593, "grad_norm": 3.6020945570902407, "learning_rate": 5.7821890583929955e-06, "loss": 1.2028, "step": 11760 }, { "epoch": 0.6750243846462792, "grad_norm": 3.4141194186590433, "learning_rate": 5.7731115023584926e-06, "loss": 1.067, "step": 11765 }, { "epoch": 0.6753112628377991, "grad_norm": 3.2340209173616925, "learning_rate": 5.764038185057259e-06, "loss": 0.9734, "step": 11770 }, { "epoch": 0.6755981410293189, "grad_norm": 3.0940275272070794, "learning_rate": 5.754969115588034e-06, "loss": 1.1054, "step": 11775 }, { "epoch": 0.6758850192208389, "grad_norm": 3.341727185222942, "learning_rate": 5.745904303045296e-06, "loss": 1.0924, "step": 11780 }, { "epoch": 0.6761718974123587, "grad_norm": 3.1449601386245494, "learning_rate": 5.736843756519259e-06, "loss": 1.1355, "step": 11785 }, { "epoch": 0.6764587756038786, "grad_norm": 3.579533726803417, "learning_rate": 5.727787485095866e-06, "loss": 1.1518, "step": 11790 }, { "epoch": 0.6767456537953985, "grad_norm": 3.060997246278451, "learning_rate": 5.718735497856762e-06, "loss": 1.1025, "step": 11795 }, { "epoch": 0.6770325319869184, "grad_norm": 3.320564829388344, "learning_rate": 5.709687803879301e-06, "loss": 1.0674, "step": 11800 }, { "epoch": 0.6773194101784382, "grad_norm": 3.2647825660908874, "learning_rate": 5.700644412236531e-06, "loss": 1.0182, "step": 11805 }, { "epoch": 0.6776062883699581, "grad_norm": 3.3540870276702646, "learning_rate": 5.6916053319971854e-06, "loss": 1.1688, "step": 11810 }, { "epoch": 0.677893166561478, "grad_norm": 2.9124268219209277, "learning_rate": 5.682570572225671e-06, "loss": 1.0877, "step": 11815 }, { "epoch": 0.6781800447529979, "grad_norm": 3.46981080120072, "learning_rate": 5.67354014198207e-06, "loss": 1.0712, "step": 11820 }, { "epoch": 0.6784669229445177, "grad_norm": 3.792922759123268, "learning_rate": 5.664514050322122e-06, "loss": 1.0573, "step": 11825 }, { "epoch": 0.6787538011360377, "grad_norm": 3.104448646194611, "learning_rate": 5.655492306297197e-06, "loss": 1.1506, "step": 11830 }, { "epoch": 0.6790406793275575, "grad_norm": 3.542141938360616, "learning_rate": 5.646474918954334e-06, "loss": 1.0717, "step": 11835 }, { "epoch": 0.6793275575190774, "grad_norm": 3.235531177980258, "learning_rate": 5.637461897336185e-06, "loss": 1.0648, "step": 11840 }, { "epoch": 0.6796144357105973, "grad_norm": 3.309010836076304, "learning_rate": 5.628453250481027e-06, "loss": 1.0746, "step": 11845 }, { "epoch": 0.6799013139021172, "grad_norm": 3.0506049802011765, "learning_rate": 5.619448987422751e-06, "loss": 1.0578, "step": 11850 }, { "epoch": 0.680188192093637, "grad_norm": 3.2022098237006316, "learning_rate": 5.610449117190855e-06, "loss": 1.0814, "step": 11855 }, { "epoch": 0.680475070285157, "grad_norm": 3.677563240507885, "learning_rate": 5.601453648810427e-06, "loss": 1.0383, "step": 11860 }, { "epoch": 0.6807619484766768, "grad_norm": 3.2920739611836836, "learning_rate": 5.592462591302139e-06, "loss": 1.0786, "step": 11865 }, { "epoch": 0.6810488266681967, "grad_norm": 3.1574559903256305, "learning_rate": 5.5834759536822515e-06, "loss": 1.1019, "step": 11870 }, { "epoch": 0.6813357048597166, "grad_norm": 3.3369879786929264, "learning_rate": 5.574493744962586e-06, "loss": 1.1044, "step": 11875 }, { "epoch": 0.6816225830512365, "grad_norm": 3.134182535798571, "learning_rate": 5.5655159741505085e-06, "loss": 1.1538, "step": 11880 }, { "epoch": 0.6819094612427563, "grad_norm": 3.0264051191931705, "learning_rate": 5.5565426502489595e-06, "loss": 1.0527, "step": 11885 }, { "epoch": 0.6821963394342762, "grad_norm": 3.438322542304316, "learning_rate": 5.547573782256404e-06, "loss": 1.1223, "step": 11890 }, { "epoch": 0.6824832176257961, "grad_norm": 3.2823947727699023, "learning_rate": 5.538609379166845e-06, "loss": 1.1213, "step": 11895 }, { "epoch": 0.682770095817316, "grad_norm": 3.070918532336758, "learning_rate": 5.529649449969804e-06, "loss": 1.0259, "step": 11900 }, { "epoch": 0.6830569740088358, "grad_norm": 3.1711362951788318, "learning_rate": 5.52069400365032e-06, "loss": 1.1197, "step": 11905 }, { "epoch": 0.6833438522003558, "grad_norm": 3.3006299334911295, "learning_rate": 5.511743049188931e-06, "loss": 1.1217, "step": 11910 }, { "epoch": 0.6836307303918756, "grad_norm": 4.2851581039113205, "learning_rate": 5.502796595561675e-06, "loss": 1.1298, "step": 11915 }, { "epoch": 0.6839176085833955, "grad_norm": 2.7979242318974036, "learning_rate": 5.493854651740081e-06, "loss": 1.0616, "step": 11920 }, { "epoch": 0.6842044867749154, "grad_norm": 3.594235777701993, "learning_rate": 5.48491722669115e-06, "loss": 1.1754, "step": 11925 }, { "epoch": 0.6844913649664353, "grad_norm": 3.2886632472834836, "learning_rate": 5.4759843293773404e-06, "loss": 1.1325, "step": 11930 }, { "epoch": 0.6847782431579551, "grad_norm": 3.2327116295534415, "learning_rate": 5.467055968756595e-06, "loss": 1.0922, "step": 11935 }, { "epoch": 0.6850651213494751, "grad_norm": 3.002397634043456, "learning_rate": 5.458132153782287e-06, "loss": 1.0218, "step": 11940 }, { "epoch": 0.6853519995409949, "grad_norm": 3.27522297483525, "learning_rate": 5.449212893403241e-06, "loss": 1.1085, "step": 11945 }, { "epoch": 0.6856388777325148, "grad_norm": 3.7121488063221464, "learning_rate": 5.4402981965637115e-06, "loss": 1.121, "step": 11950 }, { "epoch": 0.6859257559240347, "grad_norm": 3.330029707900964, "learning_rate": 5.431388072203373e-06, "loss": 1.0962, "step": 11955 }, { "epoch": 0.6862126341155546, "grad_norm": 3.6493339812319023, "learning_rate": 5.422482529257316e-06, "loss": 1.1111, "step": 11960 }, { "epoch": 0.6864995123070744, "grad_norm": 3.7594179786523374, "learning_rate": 5.413581576656048e-06, "loss": 1.1437, "step": 11965 }, { "epoch": 0.6867863904985942, "grad_norm": 3.1042232056336156, "learning_rate": 5.40468522332546e-06, "loss": 1.0943, "step": 11970 }, { "epoch": 0.6870732686901142, "grad_norm": 3.15213302466962, "learning_rate": 5.395793478186838e-06, "loss": 1.1117, "step": 11975 }, { "epoch": 0.687360146881634, "grad_norm": 3.433144578903204, "learning_rate": 5.386906350156833e-06, "loss": 1.0754, "step": 11980 }, { "epoch": 0.6876470250731539, "grad_norm": 3.626296525035197, "learning_rate": 5.378023848147486e-06, "loss": 1.1006, "step": 11985 }, { "epoch": 0.6879339032646739, "grad_norm": 3.4866290211856943, "learning_rate": 5.36914598106619e-06, "loss": 1.0908, "step": 11990 }, { "epoch": 0.6882207814561937, "grad_norm": 3.118931459710941, "learning_rate": 5.36027275781569e-06, "loss": 1.081, "step": 11995 }, { "epoch": 0.6885076596477135, "grad_norm": 3.441107024302871, "learning_rate": 5.3514041872940705e-06, "loss": 1.0597, "step": 12000 }, { "epoch": 0.6887945378392335, "grad_norm": 3.412591049201003, "learning_rate": 5.342540278394757e-06, "loss": 1.0506, "step": 12005 }, { "epoch": 0.6890814160307533, "grad_norm": 3.492561727908128, "learning_rate": 5.333681040006491e-06, "loss": 1.163, "step": 12010 }, { "epoch": 0.6893682942222732, "grad_norm": 3.2474454415917307, "learning_rate": 5.324826481013345e-06, "loss": 1.0829, "step": 12015 }, { "epoch": 0.6896551724137931, "grad_norm": 3.5509099737460676, "learning_rate": 5.31597661029469e-06, "loss": 1.0978, "step": 12020 }, { "epoch": 0.689942050605313, "grad_norm": 3.262645606326154, "learning_rate": 5.307131436725191e-06, "loss": 1.0815, "step": 12025 }, { "epoch": 0.6902289287968328, "grad_norm": 3.491561980496009, "learning_rate": 5.298290969174812e-06, "loss": 1.1116, "step": 12030 }, { "epoch": 0.6905158069883528, "grad_norm": 2.7439541017322213, "learning_rate": 5.289455216508792e-06, "loss": 1.1491, "step": 12035 }, { "epoch": 0.6908026851798726, "grad_norm": 3.450863909010519, "learning_rate": 5.280624187587643e-06, "loss": 1.1524, "step": 12040 }, { "epoch": 0.6910895633713925, "grad_norm": 2.9909990659465207, "learning_rate": 5.271797891267142e-06, "loss": 1.0208, "step": 12045 }, { "epoch": 0.6913764415629123, "grad_norm": 3.2833124418211916, "learning_rate": 5.262976336398318e-06, "loss": 1.0929, "step": 12050 }, { "epoch": 0.6916633197544323, "grad_norm": 3.3138068067098683, "learning_rate": 5.254159531827446e-06, "loss": 1.1701, "step": 12055 }, { "epoch": 0.6919501979459521, "grad_norm": 3.542653283568446, "learning_rate": 5.245347486396033e-06, "loss": 1.0938, "step": 12060 }, { "epoch": 0.692237076137472, "grad_norm": 2.9601173097139406, "learning_rate": 5.2365402089408265e-06, "loss": 1.0282, "step": 12065 }, { "epoch": 0.6925239543289919, "grad_norm": 3.44590490007891, "learning_rate": 5.227737708293781e-06, "loss": 1.121, "step": 12070 }, { "epoch": 0.6928108325205118, "grad_norm": 3.0382558167077542, "learning_rate": 5.218939993282062e-06, "loss": 1.0402, "step": 12075 }, { "epoch": 0.6930977107120316, "grad_norm": 3.7763649752997637, "learning_rate": 5.2101470727280375e-06, "loss": 1.0587, "step": 12080 }, { "epoch": 0.6933845889035516, "grad_norm": 3.4564707697255503, "learning_rate": 5.2013589554492715e-06, "loss": 1.0776, "step": 12085 }, { "epoch": 0.6936714670950714, "grad_norm": 3.2083726696284716, "learning_rate": 5.192575650258503e-06, "loss": 1.0289, "step": 12090 }, { "epoch": 0.6939583452865913, "grad_norm": 3.183388393171332, "learning_rate": 5.183797165963654e-06, "loss": 1.001, "step": 12095 }, { "epoch": 0.6942452234781112, "grad_norm": 3.307211999144673, "learning_rate": 5.175023511367807e-06, "loss": 0.9885, "step": 12100 }, { "epoch": 0.6945321016696311, "grad_norm": 3.7011621972028967, "learning_rate": 5.166254695269202e-06, "loss": 1.0697, "step": 12105 }, { "epoch": 0.6948189798611509, "grad_norm": 3.486997389081723, "learning_rate": 5.157490726461223e-06, "loss": 1.0068, "step": 12110 }, { "epoch": 0.6951058580526709, "grad_norm": 3.418040025936389, "learning_rate": 5.148731613732407e-06, "loss": 1.0224, "step": 12115 }, { "epoch": 0.6953927362441907, "grad_norm": 3.818659415887155, "learning_rate": 5.139977365866406e-06, "loss": 1.0726, "step": 12120 }, { "epoch": 0.6956796144357106, "grad_norm": 3.1225550732507528, "learning_rate": 5.1312279916420015e-06, "loss": 1.0863, "step": 12125 }, { "epoch": 0.6959664926272304, "grad_norm": 3.4753142831947654, "learning_rate": 5.122483499833084e-06, "loss": 1.0794, "step": 12130 }, { "epoch": 0.6962533708187504, "grad_norm": 3.2779238108470206, "learning_rate": 5.1137438992086504e-06, "loss": 1.1199, "step": 12135 }, { "epoch": 0.6965402490102702, "grad_norm": 3.0917591177230515, "learning_rate": 5.105009198532789e-06, "loss": 1.0269, "step": 12140 }, { "epoch": 0.6968271272017901, "grad_norm": 3.5724235258213253, "learning_rate": 5.096279406564686e-06, "loss": 1.1195, "step": 12145 }, { "epoch": 0.69711400539331, "grad_norm": 3.5507192714999616, "learning_rate": 5.087554532058586e-06, "loss": 1.0916, "step": 12150 }, { "epoch": 0.6974008835848299, "grad_norm": 3.103802344500986, "learning_rate": 5.078834583763817e-06, "loss": 1.0118, "step": 12155 }, { "epoch": 0.6976877617763497, "grad_norm": 3.1152850373371517, "learning_rate": 5.0701195704247595e-06, "loss": 1.1292, "step": 12160 }, { "epoch": 0.6979746399678697, "grad_norm": 3.1699024661832946, "learning_rate": 5.061409500780854e-06, "loss": 1.093, "step": 12165 }, { "epoch": 0.6982615181593895, "grad_norm": 3.218968636223778, "learning_rate": 5.052704383566576e-06, "loss": 1.0813, "step": 12170 }, { "epoch": 0.6985483963509094, "grad_norm": 3.4300089461735994, "learning_rate": 5.0440042275114365e-06, "loss": 1.0852, "step": 12175 }, { "epoch": 0.6988352745424293, "grad_norm": 3.4071438177662943, "learning_rate": 5.035309041339971e-06, "loss": 1.0721, "step": 12180 }, { "epoch": 0.6991221527339492, "grad_norm": 3.392283479806577, "learning_rate": 5.02661883377173e-06, "loss": 1.0606, "step": 12185 }, { "epoch": 0.699409030925469, "grad_norm": 3.387290813140784, "learning_rate": 5.017933613521273e-06, "loss": 1.0992, "step": 12190 }, { "epoch": 0.699695909116989, "grad_norm": 2.9916977015366735, "learning_rate": 5.009253389298166e-06, "loss": 1.1039, "step": 12195 }, { "epoch": 0.6999827873085088, "grad_norm": 3.264842828342237, "learning_rate": 5.000578169806948e-06, "loss": 1.0818, "step": 12200 }, { "epoch": 0.7002696655000287, "grad_norm": 3.7866754290259634, "learning_rate": 4.991907963747148e-06, "loss": 1.0288, "step": 12205 }, { "epoch": 0.7005565436915485, "grad_norm": 3.065034225642087, "learning_rate": 4.983242779813276e-06, "loss": 1.0141, "step": 12210 }, { "epoch": 0.7008434218830685, "grad_norm": 3.2594584491287004, "learning_rate": 4.974582626694794e-06, "loss": 1.0399, "step": 12215 }, { "epoch": 0.7011303000745883, "grad_norm": 3.179355566359275, "learning_rate": 4.965927513076123e-06, "loss": 1.0666, "step": 12220 }, { "epoch": 0.7014171782661082, "grad_norm": 3.2325113307940763, "learning_rate": 4.957277447636629e-06, "loss": 1.1296, "step": 12225 }, { "epoch": 0.7017040564576281, "grad_norm": 2.8471815743556697, "learning_rate": 4.94863243905062e-06, "loss": 1.1, "step": 12230 }, { "epoch": 0.701990934649148, "grad_norm": 3.068103221303161, "learning_rate": 4.939992495987327e-06, "loss": 1.0203, "step": 12235 }, { "epoch": 0.7022778128406678, "grad_norm": 3.5571181963565457, "learning_rate": 4.931357627110902e-06, "loss": 1.0413, "step": 12240 }, { "epoch": 0.7025646910321878, "grad_norm": 3.524834758663714, "learning_rate": 4.922727841080422e-06, "loss": 1.1011, "step": 12245 }, { "epoch": 0.7028515692237076, "grad_norm": 3.552034420047552, "learning_rate": 4.914103146549845e-06, "loss": 1.1061, "step": 12250 }, { "epoch": 0.7031384474152275, "grad_norm": 3.376574072285558, "learning_rate": 4.905483552168032e-06, "loss": 1.0642, "step": 12255 }, { "epoch": 0.7034253256067474, "grad_norm": 2.9454764344883415, "learning_rate": 4.896869066578741e-06, "loss": 1.0621, "step": 12260 }, { "epoch": 0.7037122037982673, "grad_norm": 3.8982340918911422, "learning_rate": 4.888259698420595e-06, "loss": 1.1252, "step": 12265 }, { "epoch": 0.7039990819897871, "grad_norm": 3.2331964843527974, "learning_rate": 4.879655456327084e-06, "loss": 1.0561, "step": 12270 }, { "epoch": 0.7042859601813071, "grad_norm": 3.556809507901754, "learning_rate": 4.871056348926563e-06, "loss": 1.0733, "step": 12275 }, { "epoch": 0.7045728383728269, "grad_norm": 3.651259446318736, "learning_rate": 4.8624623848422366e-06, "loss": 1.1181, "step": 12280 }, { "epoch": 0.7048597165643468, "grad_norm": 3.6084092379828334, "learning_rate": 4.853873572692152e-06, "loss": 1.1477, "step": 12285 }, { "epoch": 0.7051465947558666, "grad_norm": 2.8983480053027524, "learning_rate": 4.845289921089183e-06, "loss": 1.124, "step": 12290 }, { "epoch": 0.7054334729473866, "grad_norm": 3.3855691233565994, "learning_rate": 4.836711438641049e-06, "loss": 1.1406, "step": 12295 }, { "epoch": 0.7057203511389064, "grad_norm": 3.219746471228572, "learning_rate": 4.828138133950256e-06, "loss": 1.0396, "step": 12300 }, { "epoch": 0.7060072293304263, "grad_norm": 3.8106133587470437, "learning_rate": 4.819570015614139e-06, "loss": 1.1532, "step": 12305 }, { "epoch": 0.7062941075219462, "grad_norm": 3.538041820459486, "learning_rate": 4.811007092224829e-06, "loss": 1.1505, "step": 12310 }, { "epoch": 0.7065809857134661, "grad_norm": 3.071601926826406, "learning_rate": 4.802449372369242e-06, "loss": 1.069, "step": 12315 }, { "epoch": 0.7068678639049859, "grad_norm": 3.1487721290511175, "learning_rate": 4.793896864629081e-06, "loss": 1.123, "step": 12320 }, { "epoch": 0.7071547420965059, "grad_norm": 3.3452299654754647, "learning_rate": 4.785349577580817e-06, "loss": 1.1142, "step": 12325 }, { "epoch": 0.7074416202880257, "grad_norm": 3.1368973495205306, "learning_rate": 4.77680751979569e-06, "loss": 1.1436, "step": 12330 }, { "epoch": 0.7077284984795456, "grad_norm": 3.389899671274102, "learning_rate": 4.768270699839691e-06, "loss": 1.0933, "step": 12335 }, { "epoch": 0.7080153766710655, "grad_norm": 3.1651731886455607, "learning_rate": 4.759739126273569e-06, "loss": 1.0927, "step": 12340 }, { "epoch": 0.7083022548625854, "grad_norm": 3.067529663524672, "learning_rate": 4.7512128076528065e-06, "loss": 1.0541, "step": 12345 }, { "epoch": 0.7085891330541052, "grad_norm": 3.2751052289331466, "learning_rate": 4.742691752527606e-06, "loss": 1.0856, "step": 12350 }, { "epoch": 0.7088760112456252, "grad_norm": 3.323915481845003, "learning_rate": 4.7341759694429016e-06, "loss": 0.9668, "step": 12355 }, { "epoch": 0.709162889437145, "grad_norm": 3.4384813678408404, "learning_rate": 4.725665466938346e-06, "loss": 1.0559, "step": 12360 }, { "epoch": 0.7094497676286649, "grad_norm": 3.44849654605566, "learning_rate": 4.717160253548288e-06, "loss": 0.9801, "step": 12365 }, { "epoch": 0.7097366458201847, "grad_norm": 3.603004776677077, "learning_rate": 4.708660337801773e-06, "loss": 1.0594, "step": 12370 }, { "epoch": 0.7100235240117047, "grad_norm": 3.5841332203300573, "learning_rate": 4.700165728222538e-06, "loss": 1.0868, "step": 12375 }, { "epoch": 0.7103104022032245, "grad_norm": 3.7514209691641134, "learning_rate": 4.691676433328993e-06, "loss": 1.133, "step": 12380 }, { "epoch": 0.7105972803947443, "grad_norm": 3.614079366530346, "learning_rate": 4.683192461634222e-06, "loss": 1.0655, "step": 12385 }, { "epoch": 0.7108841585862643, "grad_norm": 3.5317816399521083, "learning_rate": 4.674713821645976e-06, "loss": 1.0337, "step": 12390 }, { "epoch": 0.7111710367777841, "grad_norm": 3.130018164882358, "learning_rate": 4.666240521866653e-06, "loss": 1.0207, "step": 12395 }, { "epoch": 0.711457914969304, "grad_norm": 3.4429610568989197, "learning_rate": 4.6577725707932895e-06, "loss": 1.036, "step": 12400 }, { "epoch": 0.711744793160824, "grad_norm": 3.856593678689411, "learning_rate": 4.649309976917574e-06, "loss": 1.107, "step": 12405 }, { "epoch": 0.7120316713523438, "grad_norm": 2.9443616050955255, "learning_rate": 4.640852748725812e-06, "loss": 1.0833, "step": 12410 }, { "epoch": 0.7123185495438636, "grad_norm": 2.866458207815026, "learning_rate": 4.632400894698932e-06, "loss": 1.0922, "step": 12415 }, { "epoch": 0.7126054277353836, "grad_norm": 2.981932793885333, "learning_rate": 4.62395442331247e-06, "loss": 1.0364, "step": 12420 }, { "epoch": 0.7128923059269034, "grad_norm": 3.131326633976429, "learning_rate": 4.615513343036567e-06, "loss": 1.0493, "step": 12425 }, { "epoch": 0.7131791841184233, "grad_norm": 3.216167487724314, "learning_rate": 4.6070776623359595e-06, "loss": 1.0695, "step": 12430 }, { "epoch": 0.7134660623099432, "grad_norm": 3.370191207948025, "learning_rate": 4.59864738966996e-06, "loss": 1.1013, "step": 12435 }, { "epoch": 0.7137529405014631, "grad_norm": 3.0309078389695596, "learning_rate": 4.5902225334924734e-06, "loss": 1.1346, "step": 12440 }, { "epoch": 0.7140398186929829, "grad_norm": 3.685427295671719, "learning_rate": 4.581803102251966e-06, "loss": 1.1465, "step": 12445 }, { "epoch": 0.7143266968845028, "grad_norm": 3.249264354750655, "learning_rate": 4.573389104391449e-06, "loss": 1.0096, "step": 12450 }, { "epoch": 0.7146135750760227, "grad_norm": 3.360200216074128, "learning_rate": 4.564980548348511e-06, "loss": 1.0598, "step": 12455 }, { "epoch": 0.7149004532675426, "grad_norm": 3.2519887996359724, "learning_rate": 4.5565774425552655e-06, "loss": 1.0661, "step": 12460 }, { "epoch": 0.7151873314590624, "grad_norm": 3.70151236078968, "learning_rate": 4.548179795438368e-06, "loss": 1.0575, "step": 12465 }, { "epoch": 0.7154742096505824, "grad_norm": 3.4543527639854603, "learning_rate": 4.539787615418996e-06, "loss": 1.1087, "step": 12470 }, { "epoch": 0.7157610878421022, "grad_norm": 3.4334386656825675, "learning_rate": 4.531400910912846e-06, "loss": 1.0575, "step": 12475 }, { "epoch": 0.7160479660336221, "grad_norm": 2.7712039348034203, "learning_rate": 4.5230196903301275e-06, "loss": 1.0254, "step": 12480 }, { "epoch": 0.716334844225142, "grad_norm": 3.2656241235991206, "learning_rate": 4.51464396207554e-06, "loss": 1.0149, "step": 12485 }, { "epoch": 0.7166217224166619, "grad_norm": 3.412373825936961, "learning_rate": 4.506273734548292e-06, "loss": 1.0865, "step": 12490 }, { "epoch": 0.7169086006081817, "grad_norm": 3.2183713236121934, "learning_rate": 4.497909016142065e-06, "loss": 1.0934, "step": 12495 }, { "epoch": 0.7171954787997017, "grad_norm": 3.1300421800363796, "learning_rate": 4.489549815245008e-06, "loss": 1.043, "step": 12500 }, { "epoch": 0.7174823569912215, "grad_norm": 3.3884146586682613, "learning_rate": 4.481196140239756e-06, "loss": 1.0645, "step": 12505 }, { "epoch": 0.7177692351827414, "grad_norm": 3.28415827255224, "learning_rate": 4.472847999503389e-06, "loss": 1.0916, "step": 12510 }, { "epoch": 0.7180561133742613, "grad_norm": 3.0634574010676277, "learning_rate": 4.464505401407443e-06, "loss": 1.0282, "step": 12515 }, { "epoch": 0.7183429915657812, "grad_norm": 3.6877667394876563, "learning_rate": 4.456168354317892e-06, "loss": 1.0335, "step": 12520 }, { "epoch": 0.718629869757301, "grad_norm": 3.451853658791238, "learning_rate": 4.447836866595148e-06, "loss": 1.0256, "step": 12525 }, { "epoch": 0.7189167479488209, "grad_norm": 3.2348251288454057, "learning_rate": 4.43951094659404e-06, "loss": 1.0388, "step": 12530 }, { "epoch": 0.7192036261403408, "grad_norm": 3.7569316513160436, "learning_rate": 4.431190602663828e-06, "loss": 1.0599, "step": 12535 }, { "epoch": 0.7194905043318607, "grad_norm": 3.4517007468934846, "learning_rate": 4.422875843148166e-06, "loss": 1.0923, "step": 12540 }, { "epoch": 0.7197773825233805, "grad_norm": 3.1213374966054146, "learning_rate": 4.414566676385118e-06, "loss": 1.1115, "step": 12545 }, { "epoch": 0.7200642607149005, "grad_norm": 3.1095918465158245, "learning_rate": 4.406263110707125e-06, "loss": 1.0827, "step": 12550 }, { "epoch": 0.7203511389064203, "grad_norm": 3.587247054232583, "learning_rate": 4.397965154441031e-06, "loss": 1.088, "step": 12555 }, { "epoch": 0.7206380170979402, "grad_norm": 3.2681909957242716, "learning_rate": 4.389672815908043e-06, "loss": 1.0897, "step": 12560 }, { "epoch": 0.7209248952894601, "grad_norm": 3.1834099044792126, "learning_rate": 4.381386103423735e-06, "loss": 1.1239, "step": 12565 }, { "epoch": 0.72121177348098, "grad_norm": 3.879350112787823, "learning_rate": 4.373105025298041e-06, "loss": 1.0234, "step": 12570 }, { "epoch": 0.7214986516724998, "grad_norm": 3.2355917671392413, "learning_rate": 4.364829589835245e-06, "loss": 1.0491, "step": 12575 }, { "epoch": 0.7217855298640198, "grad_norm": 3.331335854283612, "learning_rate": 4.356559805333971e-06, "loss": 1.133, "step": 12580 }, { "epoch": 0.7220724080555396, "grad_norm": 3.423839440564336, "learning_rate": 4.348295680087181e-06, "loss": 1.0695, "step": 12585 }, { "epoch": 0.7223592862470595, "grad_norm": 3.3567861860557793, "learning_rate": 4.340037222382156e-06, "loss": 1.0309, "step": 12590 }, { "epoch": 0.7226461644385794, "grad_norm": 3.6229868942702597, "learning_rate": 4.331784440500501e-06, "loss": 1.0931, "step": 12595 }, { "epoch": 0.7229330426300993, "grad_norm": 3.226343240566295, "learning_rate": 4.3235373427181115e-06, "loss": 1.0575, "step": 12600 }, { "epoch": 0.7232199208216191, "grad_norm": 3.514278129467416, "learning_rate": 4.315295937305207e-06, "loss": 0.9839, "step": 12605 }, { "epoch": 0.723506799013139, "grad_norm": 3.571817534202396, "learning_rate": 4.307060232526283e-06, "loss": 1.1241, "step": 12610 }, { "epoch": 0.7237936772046589, "grad_norm": 3.3813562351625244, "learning_rate": 4.298830236640126e-06, "loss": 1.043, "step": 12615 }, { "epoch": 0.7240805553961788, "grad_norm": 3.856566174507739, "learning_rate": 4.290605957899789e-06, "loss": 1.0132, "step": 12620 }, { "epoch": 0.7243674335876986, "grad_norm": 3.225720012293306, "learning_rate": 4.282387404552603e-06, "loss": 1.1176, "step": 12625 }, { "epoch": 0.7246543117792186, "grad_norm": 2.7620282249029198, "learning_rate": 4.274174584840143e-06, "loss": 1.0917, "step": 12630 }, { "epoch": 0.7249411899707384, "grad_norm": 3.144787262458617, "learning_rate": 4.265967506998253e-06, "loss": 1.1358, "step": 12635 }, { "epoch": 0.7252280681622583, "grad_norm": 2.944493076033099, "learning_rate": 4.2577661792570046e-06, "loss": 1.1075, "step": 12640 }, { "epoch": 0.7255149463537782, "grad_norm": 3.242234596608046, "learning_rate": 4.2495706098407085e-06, "loss": 1.0236, "step": 12645 }, { "epoch": 0.7258018245452981, "grad_norm": 3.342788965080397, "learning_rate": 4.241380806967899e-06, "loss": 1.0308, "step": 12650 }, { "epoch": 0.7260887027368179, "grad_norm": 3.9910919517728902, "learning_rate": 4.2331967788513295e-06, "loss": 1.1002, "step": 12655 }, { "epoch": 0.7263755809283379, "grad_norm": 2.914134531015643, "learning_rate": 4.225018533697962e-06, "loss": 1.0471, "step": 12660 }, { "epoch": 0.7266624591198577, "grad_norm": 2.9498892791535143, "learning_rate": 4.216846079708958e-06, "loss": 1.081, "step": 12665 }, { "epoch": 0.7269493373113776, "grad_norm": 3.2552405055999274, "learning_rate": 4.208679425079674e-06, "loss": 1.0805, "step": 12670 }, { "epoch": 0.7272362155028975, "grad_norm": 3.1341959667919106, "learning_rate": 4.200518577999649e-06, "loss": 1.0759, "step": 12675 }, { "epoch": 0.7275230936944174, "grad_norm": 3.1933366453908882, "learning_rate": 4.1923635466525936e-06, "loss": 1.0524, "step": 12680 }, { "epoch": 0.7278099718859372, "grad_norm": 3.7022309947009644, "learning_rate": 4.1842143392164e-06, "loss": 1.0813, "step": 12685 }, { "epoch": 0.7280968500774571, "grad_norm": 3.7221073296824234, "learning_rate": 4.17607096386311e-06, "loss": 1.0614, "step": 12690 }, { "epoch": 0.728383728268977, "grad_norm": 3.127170291263601, "learning_rate": 4.167933428758915e-06, "loss": 1.0231, "step": 12695 }, { "epoch": 0.7286706064604969, "grad_norm": 2.9852642431343974, "learning_rate": 4.159801742064158e-06, "loss": 1.0061, "step": 12700 }, { "epoch": 0.7289574846520167, "grad_norm": 3.389257068419293, "learning_rate": 4.151675911933308e-06, "loss": 1.0662, "step": 12705 }, { "epoch": 0.7292443628435367, "grad_norm": 3.495457498976014, "learning_rate": 4.143555946514964e-06, "loss": 1.0512, "step": 12710 }, { "epoch": 0.7295312410350565, "grad_norm": 3.3430950928021885, "learning_rate": 4.135441853951857e-06, "loss": 1.0081, "step": 12715 }, { "epoch": 0.7298181192265764, "grad_norm": 3.2727666271142244, "learning_rate": 4.127333642380807e-06, "loss": 0.9744, "step": 12720 }, { "epoch": 0.7301049974180963, "grad_norm": 3.046992106232037, "learning_rate": 4.119231319932747e-06, "loss": 0.9904, "step": 12725 }, { "epoch": 0.7303918756096162, "grad_norm": 2.9877203909841503, "learning_rate": 4.111134894732703e-06, "loss": 1.0675, "step": 12730 }, { "epoch": 0.730678753801136, "grad_norm": 3.4825658352988604, "learning_rate": 4.103044374899797e-06, "loss": 1.057, "step": 12735 }, { "epoch": 0.730965631992656, "grad_norm": 3.306344575100828, "learning_rate": 4.094959768547214e-06, "loss": 1.0379, "step": 12740 }, { "epoch": 0.7312525101841758, "grad_norm": 3.194480094730791, "learning_rate": 4.086881083782216e-06, "loss": 1.0368, "step": 12745 }, { "epoch": 0.7315393883756957, "grad_norm": 3.2950722741521905, "learning_rate": 4.078808328706128e-06, "loss": 1.0266, "step": 12750 }, { "epoch": 0.7318262665672156, "grad_norm": 3.204172938155854, "learning_rate": 4.070741511414324e-06, "loss": 0.9951, "step": 12755 }, { "epoch": 0.7321131447587355, "grad_norm": 3.3218409122429526, "learning_rate": 4.062680639996225e-06, "loss": 1.0959, "step": 12760 }, { "epoch": 0.7324000229502553, "grad_norm": 3.3625441263618443, "learning_rate": 4.054625722535301e-06, "loss": 1.0589, "step": 12765 }, { "epoch": 0.7326869011417751, "grad_norm": 3.74491897461003, "learning_rate": 4.0465767671090305e-06, "loss": 0.9427, "step": 12770 }, { "epoch": 0.7329737793332951, "grad_norm": 2.986576442766941, "learning_rate": 4.038533781788925e-06, "loss": 1.0196, "step": 12775 }, { "epoch": 0.733260657524815, "grad_norm": 3.39145804125488, "learning_rate": 4.030496774640515e-06, "loss": 1.0509, "step": 12780 }, { "epoch": 0.7335475357163348, "grad_norm": 2.98684423239601, "learning_rate": 4.0224657537233236e-06, "loss": 1.0153, "step": 12785 }, { "epoch": 0.7338344139078548, "grad_norm": 3.0965640549972315, "learning_rate": 4.014440727090879e-06, "loss": 1.0558, "step": 12790 }, { "epoch": 0.7341212920993746, "grad_norm": 3.3986820021551023, "learning_rate": 4.006421702790695e-06, "loss": 1.051, "step": 12795 }, { "epoch": 0.7344081702908944, "grad_norm": 3.074110661459439, "learning_rate": 3.998408688864267e-06, "loss": 1.0075, "step": 12800 }, { "epoch": 0.7346950484824144, "grad_norm": 3.40070396447483, "learning_rate": 3.990401693347065e-06, "loss": 1.0917, "step": 12805 }, { "epoch": 0.7349819266739342, "grad_norm": 3.0828266878119095, "learning_rate": 3.982400724268516e-06, "loss": 1.0257, "step": 12810 }, { "epoch": 0.7352688048654541, "grad_norm": 2.916084153469261, "learning_rate": 3.9744057896520216e-06, "loss": 1.0513, "step": 12815 }, { "epoch": 0.735555683056974, "grad_norm": 3.165915835234275, "learning_rate": 3.96641689751491e-06, "loss": 1.0208, "step": 12820 }, { "epoch": 0.7358425612484939, "grad_norm": 3.0505319363917347, "learning_rate": 3.95843405586846e-06, "loss": 1.1075, "step": 12825 }, { "epoch": 0.7361294394400137, "grad_norm": 3.8320470337002113, "learning_rate": 3.950457272717889e-06, "loss": 1.1244, "step": 12830 }, { "epoch": 0.7364163176315337, "grad_norm": 3.25590967479276, "learning_rate": 3.9424865560623305e-06, "loss": 1.1169, "step": 12835 }, { "epoch": 0.7367031958230535, "grad_norm": 3.138966042720696, "learning_rate": 3.9345219138948365e-06, "loss": 1.0893, "step": 12840 }, { "epoch": 0.7369900740145734, "grad_norm": 4.463964296766966, "learning_rate": 3.9265633542023685e-06, "loss": 1.1042, "step": 12845 }, { "epoch": 0.7372769522060932, "grad_norm": 3.5919603596997374, "learning_rate": 3.918610884965789e-06, "loss": 1.1353, "step": 12850 }, { "epoch": 0.7375638303976132, "grad_norm": 3.0934752578370364, "learning_rate": 3.91066451415985e-06, "loss": 1.0332, "step": 12855 }, { "epoch": 0.737850708589133, "grad_norm": 3.2781879427097222, "learning_rate": 3.902724249753187e-06, "loss": 1.1071, "step": 12860 }, { "epoch": 0.7381375867806529, "grad_norm": 3.235349997714651, "learning_rate": 3.8947900997083255e-06, "loss": 1.0513, "step": 12865 }, { "epoch": 0.7384244649721728, "grad_norm": 3.8022587923825752, "learning_rate": 3.886862071981639e-06, "loss": 1.1193, "step": 12870 }, { "epoch": 0.7387113431636927, "grad_norm": 3.0951584648720964, "learning_rate": 3.878940174523371e-06, "loss": 1.0406, "step": 12875 }, { "epoch": 0.7389982213552125, "grad_norm": 3.252183226767297, "learning_rate": 3.871024415277627e-06, "loss": 1.057, "step": 12880 }, { "epoch": 0.7392850995467325, "grad_norm": 3.657248159913111, "learning_rate": 3.86311480218234e-06, "loss": 1.0376, "step": 12885 }, { "epoch": 0.7395719777382523, "grad_norm": 3.013889472797598, "learning_rate": 3.855211343169293e-06, "loss": 1.0218, "step": 12890 }, { "epoch": 0.7398588559297722, "grad_norm": 3.445043243738392, "learning_rate": 3.847314046164089e-06, "loss": 1.0236, "step": 12895 }, { "epoch": 0.7401457341212921, "grad_norm": 3.2891185526964883, "learning_rate": 3.839422919086157e-06, "loss": 1.0476, "step": 12900 }, { "epoch": 0.740432612312812, "grad_norm": 3.419660850337131, "learning_rate": 3.831537969848731e-06, "loss": 1.1234, "step": 12905 }, { "epoch": 0.7407194905043318, "grad_norm": 7.810072200630034, "learning_rate": 3.823659206358865e-06, "loss": 1.113, "step": 12910 }, { "epoch": 0.7410063686958518, "grad_norm": 3.3998030704149667, "learning_rate": 3.8157866365174e-06, "loss": 1.1139, "step": 12915 }, { "epoch": 0.7412932468873716, "grad_norm": 3.05949110449083, "learning_rate": 3.807920268218961e-06, "loss": 1.0127, "step": 12920 }, { "epoch": 0.7415801250788915, "grad_norm": 3.1507791326448626, "learning_rate": 3.8000601093519573e-06, "loss": 1.0086, "step": 12925 }, { "epoch": 0.7418670032704113, "grad_norm": 3.140909509722921, "learning_rate": 3.792206167798582e-06, "loss": 0.9705, "step": 12930 }, { "epoch": 0.7421538814619313, "grad_norm": 3.3505003660222363, "learning_rate": 3.784358451434783e-06, "loss": 1.0204, "step": 12935 }, { "epoch": 0.7424407596534511, "grad_norm": 3.294501991385837, "learning_rate": 3.776516968130266e-06, "loss": 1.0957, "step": 12940 }, { "epoch": 0.742727637844971, "grad_norm": 3.166429331912014, "learning_rate": 3.768681725748489e-06, "loss": 1.0691, "step": 12945 }, { "epoch": 0.7430145160364909, "grad_norm": 4.301791100468454, "learning_rate": 3.7608527321466493e-06, "loss": 1.0117, "step": 12950 }, { "epoch": 0.7433013942280108, "grad_norm": 3.2047067933845717, "learning_rate": 3.7530299951756776e-06, "loss": 1.0458, "step": 12955 }, { "epoch": 0.7435882724195306, "grad_norm": 3.28666203786594, "learning_rate": 3.745213522680239e-06, "loss": 1.0474, "step": 12960 }, { "epoch": 0.7438751506110506, "grad_norm": 3.403761458495736, "learning_rate": 3.7374033224987084e-06, "loss": 1.1127, "step": 12965 }, { "epoch": 0.7441620288025704, "grad_norm": 3.79923478224165, "learning_rate": 3.7295994024631623e-06, "loss": 1.078, "step": 12970 }, { "epoch": 0.7444489069940903, "grad_norm": 3.6171395105013233, "learning_rate": 3.7218017703993993e-06, "loss": 1.0507, "step": 12975 }, { "epoch": 0.7447357851856102, "grad_norm": 3.3797925670757585, "learning_rate": 3.714010434126899e-06, "loss": 1.0771, "step": 12980 }, { "epoch": 0.7450226633771301, "grad_norm": 3.402269303849736, "learning_rate": 3.7062254014588317e-06, "loss": 0.9972, "step": 12985 }, { "epoch": 0.7453095415686499, "grad_norm": 3.2708972420804625, "learning_rate": 3.6984466802020436e-06, "loss": 1.1152, "step": 12990 }, { "epoch": 0.7455964197601699, "grad_norm": 3.539579383215263, "learning_rate": 3.6906742781570557e-06, "loss": 1.0172, "step": 12995 }, { "epoch": 0.7458832979516897, "grad_norm": 3.2343178988949464, "learning_rate": 3.68290820311805e-06, "loss": 1.0318, "step": 13000 }, { "epoch": 0.7461701761432096, "grad_norm": 3.1717835685973252, "learning_rate": 3.6751484628728594e-06, "loss": 1.0488, "step": 13005 }, { "epoch": 0.7464570543347294, "grad_norm": 3.1996042188421576, "learning_rate": 3.6673950652029766e-06, "loss": 0.9712, "step": 13010 }, { "epoch": 0.7467439325262494, "grad_norm": 3.579604065399662, "learning_rate": 3.6596480178835258e-06, "loss": 1.0437, "step": 13015 }, { "epoch": 0.7470308107177692, "grad_norm": 3.2216557045468397, "learning_rate": 3.651907328683254e-06, "loss": 1.0361, "step": 13020 }, { "epoch": 0.7473176889092891, "grad_norm": 3.495650132429074, "learning_rate": 3.6441730053645507e-06, "loss": 1.0634, "step": 13025 }, { "epoch": 0.747604567100809, "grad_norm": 3.016946994645015, "learning_rate": 3.6364450556834096e-06, "loss": 1.0659, "step": 13030 }, { "epoch": 0.7478914452923289, "grad_norm": 3.5563886129216593, "learning_rate": 3.6287234873894372e-06, "loss": 1.0393, "step": 13035 }, { "epoch": 0.7481783234838487, "grad_norm": 3.2853021995660656, "learning_rate": 3.6210083082258374e-06, "loss": 0.9779, "step": 13040 }, { "epoch": 0.7484652016753687, "grad_norm": 3.5156647209937364, "learning_rate": 3.61329952592941e-06, "loss": 1.0177, "step": 13045 }, { "epoch": 0.7487520798668885, "grad_norm": 3.645523090120582, "learning_rate": 3.605597148230541e-06, "loss": 1.0283, "step": 13050 }, { "epoch": 0.7490389580584084, "grad_norm": 3.427403358592197, "learning_rate": 3.597901182853185e-06, "loss": 1.0297, "step": 13055 }, { "epoch": 0.7493258362499283, "grad_norm": 3.348744819471747, "learning_rate": 3.5902116375148844e-06, "loss": 1.0815, "step": 13060 }, { "epoch": 0.7496127144414482, "grad_norm": 2.9912141119245166, "learning_rate": 3.582528519926729e-06, "loss": 1.0859, "step": 13065 }, { "epoch": 0.749899592632968, "grad_norm": 3.2401990408018735, "learning_rate": 3.5748518377933573e-06, "loss": 1.0278, "step": 13070 }, { "epoch": 0.750186470824488, "grad_norm": 3.2240370919498753, "learning_rate": 3.567181598812973e-06, "loss": 1.0631, "step": 13075 }, { "epoch": 0.7504733490160078, "grad_norm": 3.223331366213257, "learning_rate": 3.5595178106773074e-06, "loss": 1.059, "step": 13080 }, { "epoch": 0.7507602272075277, "grad_norm": 2.9266830694898043, "learning_rate": 3.5518604810716238e-06, "loss": 0.9639, "step": 13085 }, { "epoch": 0.7510471053990475, "grad_norm": 3.081679182216585, "learning_rate": 3.5442096176747074e-06, "loss": 1.0577, "step": 13090 }, { "epoch": 0.7513339835905675, "grad_norm": 3.060819892184341, "learning_rate": 3.536565228158864e-06, "loss": 1.0051, "step": 13095 }, { "epoch": 0.7516208617820873, "grad_norm": 3.6260246904758775, "learning_rate": 3.5289273201899033e-06, "loss": 0.9896, "step": 13100 }, { "epoch": 0.7519077399736072, "grad_norm": 3.404676858906788, "learning_rate": 3.521295901427132e-06, "loss": 1.0919, "step": 13105 }, { "epoch": 0.7521946181651271, "grad_norm": 3.2562490921391287, "learning_rate": 3.5136709795233624e-06, "loss": 1.0786, "step": 13110 }, { "epoch": 0.752481496356647, "grad_norm": 3.05105003736866, "learning_rate": 3.506052562124883e-06, "loss": 0.9627, "step": 13115 }, { "epoch": 0.7527683745481668, "grad_norm": 3.9196549339991864, "learning_rate": 3.498440656871449e-06, "loss": 1.0177, "step": 13120 }, { "epoch": 0.7530552527396868, "grad_norm": 3.2136165629001394, "learning_rate": 3.490835271396308e-06, "loss": 1.0393, "step": 13125 }, { "epoch": 0.7533421309312066, "grad_norm": 3.0967137941983895, "learning_rate": 3.483236413326151e-06, "loss": 1.027, "step": 13130 }, { "epoch": 0.7536290091227265, "grad_norm": 3.3974156135869475, "learning_rate": 3.4756440902811326e-06, "loss": 1.0543, "step": 13135 }, { "epoch": 0.7539158873142464, "grad_norm": 3.019521344880212, "learning_rate": 3.468058309874851e-06, "loss": 1.0027, "step": 13140 }, { "epoch": 0.7542027655057663, "grad_norm": 3.2076049283908734, "learning_rate": 3.460479079714343e-06, "loss": 1.0494, "step": 13145 }, { "epoch": 0.7544896436972861, "grad_norm": 3.5320056920484686, "learning_rate": 3.452906407400074e-06, "loss": 1.0749, "step": 13150 }, { "epoch": 0.7547765218888061, "grad_norm": 3.9456728030951718, "learning_rate": 3.4453403005259443e-06, "loss": 1.0568, "step": 13155 }, { "epoch": 0.7550634000803259, "grad_norm": 4.024966909597105, "learning_rate": 3.4377807666792597e-06, "loss": 1.0252, "step": 13160 }, { "epoch": 0.7553502782718458, "grad_norm": 3.3538868078329425, "learning_rate": 3.4302278134407407e-06, "loss": 0.9832, "step": 13165 }, { "epoch": 0.7556371564633656, "grad_norm": 3.2928763639454823, "learning_rate": 3.422681448384495e-06, "loss": 1.0846, "step": 13170 }, { "epoch": 0.7559240346548856, "grad_norm": 3.621964129781018, "learning_rate": 3.4151416790780454e-06, "loss": 1.0409, "step": 13175 }, { "epoch": 0.7562109128464054, "grad_norm": 3.1425890865596493, "learning_rate": 3.4076085130822868e-06, "loss": 1.0582, "step": 13180 }, { "epoch": 0.7564977910379252, "grad_norm": 2.993581863409231, "learning_rate": 3.400081957951492e-06, "loss": 1.0055, "step": 13185 }, { "epoch": 0.7567846692294452, "grad_norm": 3.4899385704034738, "learning_rate": 3.392562021233311e-06, "loss": 1.0046, "step": 13190 }, { "epoch": 0.757071547420965, "grad_norm": 3.7117292318420794, "learning_rate": 3.3850487104687502e-06, "loss": 1.0832, "step": 13195 }, { "epoch": 0.7573584256124849, "grad_norm": 3.604589194534151, "learning_rate": 3.3775420331921737e-06, "loss": 1.0567, "step": 13200 }, { "epoch": 0.7576453038040049, "grad_norm": 3.324170011305961, "learning_rate": 3.3700419969312994e-06, "loss": 1.0337, "step": 13205 }, { "epoch": 0.7579321819955247, "grad_norm": 3.132929625445994, "learning_rate": 3.3625486092071767e-06, "loss": 0.9911, "step": 13210 }, { "epoch": 0.7582190601870445, "grad_norm": 3.177337592142767, "learning_rate": 3.3550618775341927e-06, "loss": 1.0848, "step": 13215 }, { "epoch": 0.7585059383785645, "grad_norm": 3.1714435291732235, "learning_rate": 3.3475818094200584e-06, "loss": 1.0214, "step": 13220 }, { "epoch": 0.7587928165700843, "grad_norm": 3.455008243212108, "learning_rate": 3.3401084123658035e-06, "loss": 1.0717, "step": 13225 }, { "epoch": 0.7590796947616042, "grad_norm": 3.3887386969709214, "learning_rate": 3.332641693865766e-06, "loss": 1.0402, "step": 13230 }, { "epoch": 0.7593665729531242, "grad_norm": 3.0845457800367084, "learning_rate": 3.3251816614075882e-06, "loss": 1.0219, "step": 13235 }, { "epoch": 0.759653451144644, "grad_norm": 3.5182455479085037, "learning_rate": 3.317728322472209e-06, "loss": 1.0556, "step": 13240 }, { "epoch": 0.7599403293361638, "grad_norm": 3.2043633511988228, "learning_rate": 3.3102816845338513e-06, "loss": 1.0161, "step": 13245 }, { "epoch": 0.7602272075276837, "grad_norm": 3.426021974559271, "learning_rate": 3.3028417550600177e-06, "loss": 1.0714, "step": 13250 }, { "epoch": 0.7605140857192036, "grad_norm": 3.420613485956261, "learning_rate": 3.2954085415114946e-06, "loss": 1.0415, "step": 13255 }, { "epoch": 0.7608009639107235, "grad_norm": 4.089380902286974, "learning_rate": 3.287982051342319e-06, "loss": 1.0898, "step": 13260 }, { "epoch": 0.7610878421022433, "grad_norm": 2.9713433528548645, "learning_rate": 3.2805622919997937e-06, "loss": 1.0145, "step": 13265 }, { "epoch": 0.7613747202937633, "grad_norm": 3.4176473412959694, "learning_rate": 3.273149270924468e-06, "loss": 1.0264, "step": 13270 }, { "epoch": 0.7616615984852831, "grad_norm": 3.1492729057398345, "learning_rate": 3.2657429955501396e-06, "loss": 1.0306, "step": 13275 }, { "epoch": 0.761948476676803, "grad_norm": 3.1961655618047873, "learning_rate": 3.2583434733038322e-06, "loss": 1.0403, "step": 13280 }, { "epoch": 0.762235354868323, "grad_norm": 3.0319427626152704, "learning_rate": 3.2509507116058135e-06, "loss": 1.0024, "step": 13285 }, { "epoch": 0.7625222330598428, "grad_norm": 3.4280161490280854, "learning_rate": 3.2435647178695517e-06, "loss": 1.055, "step": 13290 }, { "epoch": 0.7628091112513626, "grad_norm": 3.6290240964274383, "learning_rate": 3.2361854995017417e-06, "loss": 1.0333, "step": 13295 }, { "epoch": 0.7630959894428826, "grad_norm": 3.012562178559695, "learning_rate": 3.2288130639022764e-06, "loss": 1.0211, "step": 13300 }, { "epoch": 0.7633828676344024, "grad_norm": 3.0252931326775196, "learning_rate": 3.2214474184642575e-06, "loss": 1.0712, "step": 13305 }, { "epoch": 0.7636697458259223, "grad_norm": 3.0279831094840275, "learning_rate": 3.214088570573968e-06, "loss": 1.0722, "step": 13310 }, { "epoch": 0.7639566240174422, "grad_norm": 3.1850730406068877, "learning_rate": 3.2067365276108753e-06, "loss": 1.0941, "step": 13315 }, { "epoch": 0.7642435022089621, "grad_norm": 3.965488685634366, "learning_rate": 3.199391296947627e-06, "loss": 1.0924, "step": 13320 }, { "epoch": 0.7645303804004819, "grad_norm": 3.5151124213078857, "learning_rate": 3.1920528859500343e-06, "loss": 1.0597, "step": 13325 }, { "epoch": 0.7648172585920018, "grad_norm": 3.106944557506971, "learning_rate": 3.184721301977072e-06, "loss": 1.0084, "step": 13330 }, { "epoch": 0.7651041367835217, "grad_norm": 2.91540434547735, "learning_rate": 3.177396552380876e-06, "loss": 1.1492, "step": 13335 }, { "epoch": 0.7653910149750416, "grad_norm": 3.2501919061203632, "learning_rate": 3.1700786445067133e-06, "loss": 1.0153, "step": 13340 }, { "epoch": 0.7656778931665614, "grad_norm": 3.4276316162769893, "learning_rate": 3.1627675856929975e-06, "loss": 0.9982, "step": 13345 }, { "epoch": 0.7659647713580814, "grad_norm": 3.1629143197023057, "learning_rate": 3.155463383271282e-06, "loss": 0.9592, "step": 13350 }, { "epoch": 0.7662516495496012, "grad_norm": 2.9466540502880556, "learning_rate": 3.1481660445662333e-06, "loss": 1.0629, "step": 13355 }, { "epoch": 0.7665385277411211, "grad_norm": 3.5614734145645763, "learning_rate": 3.14087557689564e-06, "loss": 1.0637, "step": 13360 }, { "epoch": 0.766825405932641, "grad_norm": 3.937224681066208, "learning_rate": 3.133591987570399e-06, "loss": 1.0645, "step": 13365 }, { "epoch": 0.7671122841241609, "grad_norm": 3.166072654086745, "learning_rate": 3.12631528389451e-06, "loss": 1.0301, "step": 13370 }, { "epoch": 0.7673991623156807, "grad_norm": 3.1941073435335414, "learning_rate": 3.1190454731650675e-06, "loss": 1.0568, "step": 13375 }, { "epoch": 0.7676860405072007, "grad_norm": 3.1523523828533535, "learning_rate": 3.1117825626722507e-06, "loss": 1.057, "step": 13380 }, { "epoch": 0.7679729186987205, "grad_norm": 3.2212643811667454, "learning_rate": 3.1045265596993337e-06, "loss": 1.0004, "step": 13385 }, { "epoch": 0.7682597968902404, "grad_norm": 3.1432799204838293, "learning_rate": 3.0972774715226406e-06, "loss": 0.998, "step": 13390 }, { "epoch": 0.7685466750817603, "grad_norm": 3.2958130576324303, "learning_rate": 3.090035305411575e-06, "loss": 1.0245, "step": 13395 }, { "epoch": 0.7688335532732802, "grad_norm": 3.2414356613620265, "learning_rate": 3.0828000686286032e-06, "loss": 1.018, "step": 13400 }, { "epoch": 0.7691204314648, "grad_norm": 3.362950010942062, "learning_rate": 3.075571768429233e-06, "loss": 1.071, "step": 13405 }, { "epoch": 0.76940730965632, "grad_norm": 3.2202319354669595, "learning_rate": 3.06835041206202e-06, "loss": 1.0307, "step": 13410 }, { "epoch": 0.7696941878478398, "grad_norm": 4.185092967382134, "learning_rate": 3.061136006768558e-06, "loss": 1.033, "step": 13415 }, { "epoch": 0.7699810660393597, "grad_norm": 3.097619752718529, "learning_rate": 3.053928559783468e-06, "loss": 1.0041, "step": 13420 }, { "epoch": 0.7702679442308795, "grad_norm": 2.9871748063625954, "learning_rate": 3.0467280783343946e-06, "loss": 1.0542, "step": 13425 }, { "epoch": 0.7705548224223995, "grad_norm": 3.001370689607345, "learning_rate": 3.039534569641992e-06, "loss": 1.0292, "step": 13430 }, { "epoch": 0.7708417006139193, "grad_norm": 3.324880933422377, "learning_rate": 3.032348040919938e-06, "loss": 1.0353, "step": 13435 }, { "epoch": 0.7711285788054392, "grad_norm": 3.5270615633184255, "learning_rate": 3.0251684993748888e-06, "loss": 1.0566, "step": 13440 }, { "epoch": 0.7714154569969591, "grad_norm": 3.102821337413082, "learning_rate": 3.017995952206506e-06, "loss": 1.0854, "step": 13445 }, { "epoch": 0.771702335188479, "grad_norm": 3.290277462918213, "learning_rate": 3.0108304066074412e-06, "loss": 0.9978, "step": 13450 }, { "epoch": 0.7719892133799988, "grad_norm": 3.5048194465045754, "learning_rate": 3.0036718697633174e-06, "loss": 1.1485, "step": 13455 }, { "epoch": 0.7722760915715188, "grad_norm": 3.3447595783886452, "learning_rate": 2.9965203488527315e-06, "loss": 1.0277, "step": 13460 }, { "epoch": 0.7725629697630386, "grad_norm": 3.55767531267622, "learning_rate": 2.9893758510472436e-06, "loss": 1.0398, "step": 13465 }, { "epoch": 0.7728498479545585, "grad_norm": 3.3057104008510447, "learning_rate": 2.9822383835113733e-06, "loss": 1.1015, "step": 13470 }, { "epoch": 0.7731367261460784, "grad_norm": 3.460810888607055, "learning_rate": 2.975107953402585e-06, "loss": 1.1069, "step": 13475 }, { "epoch": 0.7734236043375983, "grad_norm": 3.915309514000913, "learning_rate": 2.967984567871297e-06, "loss": 1.0715, "step": 13480 }, { "epoch": 0.7737104825291181, "grad_norm": 3.2160777675760444, "learning_rate": 2.960868234060855e-06, "loss": 1.0237, "step": 13485 }, { "epoch": 0.7739973607206381, "grad_norm": 3.3227098348336885, "learning_rate": 2.95375895910753e-06, "loss": 1.0766, "step": 13490 }, { "epoch": 0.7742842389121579, "grad_norm": 3.0398656732104197, "learning_rate": 2.9466567501405186e-06, "loss": 0.9919, "step": 13495 }, { "epoch": 0.7745711171036778, "grad_norm": 3.297864739911567, "learning_rate": 2.9395616142819363e-06, "loss": 1.1134, "step": 13500 }, { "epoch": 0.7748579952951976, "grad_norm": 3.5126988245847444, "learning_rate": 2.9324735586468e-06, "loss": 1.0378, "step": 13505 }, { "epoch": 0.7751448734867176, "grad_norm": 3.184441936027504, "learning_rate": 2.9253925903430267e-06, "loss": 0.9992, "step": 13510 }, { "epoch": 0.7754317516782374, "grad_norm": 3.1500509354935926, "learning_rate": 2.9183187164714287e-06, "loss": 1.0838, "step": 13515 }, { "epoch": 0.7757186298697573, "grad_norm": 3.290171143387618, "learning_rate": 2.9112519441257003e-06, "loss": 0.9638, "step": 13520 }, { "epoch": 0.7760055080612772, "grad_norm": 3.187794228198158, "learning_rate": 2.904192280392416e-06, "loss": 1.0002, "step": 13525 }, { "epoch": 0.7762923862527971, "grad_norm": 3.186888491336794, "learning_rate": 2.8971397323510277e-06, "loss": 1.0655, "step": 13530 }, { "epoch": 0.7765792644443169, "grad_norm": 3.257000917538965, "learning_rate": 2.890094307073845e-06, "loss": 0.998, "step": 13535 }, { "epoch": 0.7768661426358369, "grad_norm": 3.726881880705207, "learning_rate": 2.883056011626032e-06, "loss": 1.0659, "step": 13540 }, { "epoch": 0.7771530208273567, "grad_norm": 2.937926382347164, "learning_rate": 2.8760248530656064e-06, "loss": 1.0639, "step": 13545 }, { "epoch": 0.7774398990188766, "grad_norm": 3.2755414321307175, "learning_rate": 2.8690008384434364e-06, "loss": 1.0229, "step": 13550 }, { "epoch": 0.7777267772103965, "grad_norm": 3.5527537839660064, "learning_rate": 2.861983974803215e-06, "loss": 1.0275, "step": 13555 }, { "epoch": 0.7780136554019164, "grad_norm": 3.303720482432539, "learning_rate": 2.854974269181471e-06, "loss": 1.055, "step": 13560 }, { "epoch": 0.7783005335934362, "grad_norm": 3.361327927784108, "learning_rate": 2.8479717286075505e-06, "loss": 1.0251, "step": 13565 }, { "epoch": 0.7785874117849562, "grad_norm": 3.378655408953542, "learning_rate": 2.840976360103619e-06, "loss": 1.0682, "step": 13570 }, { "epoch": 0.778874289976476, "grad_norm": 3.7712991432974756, "learning_rate": 2.8339881706846427e-06, "loss": 1.1341, "step": 13575 }, { "epoch": 0.7791611681679959, "grad_norm": 3.192128055231739, "learning_rate": 2.827007167358401e-06, "loss": 1.0642, "step": 13580 }, { "epoch": 0.7794480463595157, "grad_norm": 2.929747340496945, "learning_rate": 2.8200333571254603e-06, "loss": 1.0539, "step": 13585 }, { "epoch": 0.7797349245510357, "grad_norm": 3.2943031006804255, "learning_rate": 2.813066746979163e-06, "loss": 0.9392, "step": 13590 }, { "epoch": 0.7800218027425555, "grad_norm": 3.462486111635248, "learning_rate": 2.8061073439056508e-06, "loss": 0.9742, "step": 13595 }, { "epoch": 0.7803086809340753, "grad_norm": 3.6603762389871726, "learning_rate": 2.799155154883826e-06, "loss": 1.0046, "step": 13600 }, { "epoch": 0.7805955591255953, "grad_norm": 3.0859834356208857, "learning_rate": 2.792210186885358e-06, "loss": 1.0386, "step": 13605 }, { "epoch": 0.7808824373171152, "grad_norm": 2.8784631122899738, "learning_rate": 2.785272446874677e-06, "loss": 0.9889, "step": 13610 }, { "epoch": 0.781169315508635, "grad_norm": 3.5087353179281773, "learning_rate": 2.7783419418089653e-06, "loss": 1.1021, "step": 13615 }, { "epoch": 0.781456193700155, "grad_norm": 3.1628580370057278, "learning_rate": 2.771418678638147e-06, "loss": 1.0732, "step": 13620 }, { "epoch": 0.7817430718916748, "grad_norm": 3.90076538323039, "learning_rate": 2.7645026643048856e-06, "loss": 1.1002, "step": 13625 }, { "epoch": 0.7820299500831946, "grad_norm": 4.39245952717636, "learning_rate": 2.7575939057445787e-06, "loss": 1.1035, "step": 13630 }, { "epoch": 0.7823168282747146, "grad_norm": 3.3021791829604883, "learning_rate": 2.7506924098853473e-06, "loss": 1.0398, "step": 13635 }, { "epoch": 0.7826037064662345, "grad_norm": 3.351756023896106, "learning_rate": 2.7437981836480164e-06, "loss": 1.0884, "step": 13640 }, { "epoch": 0.7828905846577543, "grad_norm": 3.154711630980806, "learning_rate": 2.7369112339461413e-06, "loss": 0.9976, "step": 13645 }, { "epoch": 0.7831774628492743, "grad_norm": 2.9376808666029715, "learning_rate": 2.730031567685968e-06, "loss": 1.0371, "step": 13650 }, { "epoch": 0.7834643410407941, "grad_norm": 3.2174256405791986, "learning_rate": 2.723159191766439e-06, "loss": 1.035, "step": 13655 }, { "epoch": 0.783751219232314, "grad_norm": 3.074110206616267, "learning_rate": 2.716294113079192e-06, "loss": 0.9672, "step": 13660 }, { "epoch": 0.7840380974238338, "grad_norm": 3.3610909466147776, "learning_rate": 2.7094363385085397e-06, "loss": 1.1409, "step": 13665 }, { "epoch": 0.7843249756153537, "grad_norm": 3.3834592537727057, "learning_rate": 2.7025858749314758e-06, "loss": 1.092, "step": 13670 }, { "epoch": 0.7846118538068736, "grad_norm": 3.458072029689745, "learning_rate": 2.6957427292176576e-06, "loss": 0.9578, "step": 13675 }, { "epoch": 0.7848987319983934, "grad_norm": 3.269532655994686, "learning_rate": 2.6889069082294115e-06, "loss": 1.1074, "step": 13680 }, { "epoch": 0.7851856101899134, "grad_norm": 3.1979666872677734, "learning_rate": 2.6820784188217164e-06, "loss": 1.0732, "step": 13685 }, { "epoch": 0.7854724883814332, "grad_norm": 2.969435227209521, "learning_rate": 2.675257267842185e-06, "loss": 1.0333, "step": 13690 }, { "epoch": 0.7857593665729531, "grad_norm": 3.1151155567241773, "learning_rate": 2.668443462131094e-06, "loss": 1.06, "step": 13695 }, { "epoch": 0.786046244764473, "grad_norm": 3.710586113700483, "learning_rate": 2.6616370085213395e-06, "loss": 1.1171, "step": 13700 }, { "epoch": 0.7863331229559929, "grad_norm": 3.3366889788600473, "learning_rate": 2.6548379138384484e-06, "loss": 1.0572, "step": 13705 }, { "epoch": 0.7866200011475127, "grad_norm": 3.6367226183478145, "learning_rate": 2.6480461849005677e-06, "loss": 1.005, "step": 13710 }, { "epoch": 0.7869068793390327, "grad_norm": 3.72602201200232, "learning_rate": 2.6412618285184587e-06, "loss": 1.0589, "step": 13715 }, { "epoch": 0.7871937575305525, "grad_norm": 3.4584777063593832, "learning_rate": 2.6344848514954856e-06, "loss": 1.0546, "step": 13720 }, { "epoch": 0.7874806357220724, "grad_norm": 3.2121730285394343, "learning_rate": 2.6277152606276236e-06, "loss": 1.0733, "step": 13725 }, { "epoch": 0.7877675139135923, "grad_norm": 3.3530698352915067, "learning_rate": 2.6209530627034297e-06, "loss": 0.9983, "step": 13730 }, { "epoch": 0.7880543921051122, "grad_norm": 3.7727800517862398, "learning_rate": 2.614198264504053e-06, "loss": 1.016, "step": 13735 }, { "epoch": 0.788341270296632, "grad_norm": 3.127495197816606, "learning_rate": 2.6074508728032135e-06, "loss": 1.0806, "step": 13740 }, { "epoch": 0.7886281484881519, "grad_norm": 2.8677017054804783, "learning_rate": 2.6007108943672187e-06, "loss": 1.0132, "step": 13745 }, { "epoch": 0.7889150266796718, "grad_norm": 3.218048148109048, "learning_rate": 2.593978335954931e-06, "loss": 1.1038, "step": 13750 }, { "epoch": 0.7892019048711917, "grad_norm": 3.4884111842133905, "learning_rate": 2.5872532043177744e-06, "loss": 1.0042, "step": 13755 }, { "epoch": 0.7894887830627115, "grad_norm": 3.0939706247306424, "learning_rate": 2.5805355061997274e-06, "loss": 1.077, "step": 13760 }, { "epoch": 0.7897756612542315, "grad_norm": 3.6266584703533558, "learning_rate": 2.5738252483373117e-06, "loss": 1.0291, "step": 13765 }, { "epoch": 0.7900625394457513, "grad_norm": 3.2131062734829947, "learning_rate": 2.567122437459586e-06, "loss": 1.04, "step": 13770 }, { "epoch": 0.7903494176372712, "grad_norm": 3.3981206347932074, "learning_rate": 2.5604270802881505e-06, "loss": 1.0456, "step": 13775 }, { "epoch": 0.7906362958287911, "grad_norm": 3.0204477036223367, "learning_rate": 2.5537391835371215e-06, "loss": 1.1827, "step": 13780 }, { "epoch": 0.790923174020311, "grad_norm": 3.0651840343719368, "learning_rate": 2.5470587539131366e-06, "loss": 1.0233, "step": 13785 }, { "epoch": 0.7912100522118308, "grad_norm": 3.7784003631276035, "learning_rate": 2.5403857981153458e-06, "loss": 0.9958, "step": 13790 }, { "epoch": 0.7914969304033508, "grad_norm": 3.3134048481122944, "learning_rate": 2.5337203228354034e-06, "loss": 1.0771, "step": 13795 }, { "epoch": 0.7917838085948706, "grad_norm": 3.2711725100964286, "learning_rate": 2.527062334757464e-06, "loss": 1.1059, "step": 13800 }, { "epoch": 0.7920706867863905, "grad_norm": 3.579972432294722, "learning_rate": 2.5204118405581725e-06, "loss": 1.0489, "step": 13805 }, { "epoch": 0.7923575649779104, "grad_norm": 3.567187015453122, "learning_rate": 2.513768846906659e-06, "loss": 1.0803, "step": 13810 }, { "epoch": 0.7926444431694303, "grad_norm": 3.21850717436034, "learning_rate": 2.507133360464533e-06, "loss": 1.0696, "step": 13815 }, { "epoch": 0.7929313213609501, "grad_norm": 3.4265417306798276, "learning_rate": 2.500505387885872e-06, "loss": 1.0471, "step": 13820 }, { "epoch": 0.79321819955247, "grad_norm": 3.2686557619457868, "learning_rate": 2.493884935817228e-06, "loss": 1.1052, "step": 13825 }, { "epoch": 0.7935050777439899, "grad_norm": 3.301765558270053, "learning_rate": 2.487272010897601e-06, "loss": 0.9986, "step": 13830 }, { "epoch": 0.7937919559355098, "grad_norm": 3.2245365970240214, "learning_rate": 2.4806666197584485e-06, "loss": 0.9955, "step": 13835 }, { "epoch": 0.7940788341270296, "grad_norm": 3.128402030718763, "learning_rate": 2.474068769023671e-06, "loss": 1.0235, "step": 13840 }, { "epoch": 0.7943657123185496, "grad_norm": 3.8100601267249585, "learning_rate": 2.4674784653096085e-06, "loss": 1.0609, "step": 13845 }, { "epoch": 0.7946525905100694, "grad_norm": 3.0286026816911034, "learning_rate": 2.460895715225028e-06, "loss": 1.1271, "step": 13850 }, { "epoch": 0.7949394687015893, "grad_norm": 3.464385712740268, "learning_rate": 2.4543205253711357e-06, "loss": 1.0666, "step": 13855 }, { "epoch": 0.7952263468931092, "grad_norm": 3.470839966821716, "learning_rate": 2.4477529023415382e-06, "loss": 0.9996, "step": 13860 }, { "epoch": 0.7955132250846291, "grad_norm": 3.202369102513249, "learning_rate": 2.441192852722265e-06, "loss": 1.0153, "step": 13865 }, { "epoch": 0.7958001032761489, "grad_norm": 3.415654754401935, "learning_rate": 2.4346403830917464e-06, "loss": 1.012, "step": 13870 }, { "epoch": 0.7960869814676689, "grad_norm": 3.4141438225150638, "learning_rate": 2.4280955000208182e-06, "loss": 0.9759, "step": 13875 }, { "epoch": 0.7963738596591887, "grad_norm": 3.258626995190857, "learning_rate": 2.421558210072702e-06, "loss": 1.0974, "step": 13880 }, { "epoch": 0.7966607378507086, "grad_norm": 3.148105531234528, "learning_rate": 2.4150285198030067e-06, "loss": 1.003, "step": 13885 }, { "epoch": 0.7969476160422285, "grad_norm": 3.821882671154222, "learning_rate": 2.40850643575972e-06, "loss": 1.0423, "step": 13890 }, { "epoch": 0.7972344942337484, "grad_norm": 3.355343539224275, "learning_rate": 2.4019919644832023e-06, "loss": 1.0408, "step": 13895 }, { "epoch": 0.7975213724252682, "grad_norm": 3.3993644103895275, "learning_rate": 2.395485112506177e-06, "loss": 1.1074, "step": 13900 }, { "epoch": 0.7978082506167881, "grad_norm": 2.9738946948855016, "learning_rate": 2.3889858863537398e-06, "loss": 1.0464, "step": 13905 }, { "epoch": 0.798095128808308, "grad_norm": 3.188949912323467, "learning_rate": 2.3824942925433192e-06, "loss": 1.0075, "step": 13910 }, { "epoch": 0.7983820069998279, "grad_norm": 3.3702129493902744, "learning_rate": 2.376010337584701e-06, "loss": 1.0291, "step": 13915 }, { "epoch": 0.7986688851913477, "grad_norm": 2.863481002277995, "learning_rate": 2.369534027980015e-06, "loss": 1.0856, "step": 13920 }, { "epoch": 0.7989557633828677, "grad_norm": 3.014222828872076, "learning_rate": 2.363065370223716e-06, "loss": 1.0208, "step": 13925 }, { "epoch": 0.7992426415743875, "grad_norm": 3.8734300847198084, "learning_rate": 2.356604370802588e-06, "loss": 1.1486, "step": 13930 }, { "epoch": 0.7995295197659074, "grad_norm": 3.297892833624107, "learning_rate": 2.350151036195737e-06, "loss": 1.1089, "step": 13935 }, { "epoch": 0.7998163979574273, "grad_norm": 3.185930204866702, "learning_rate": 2.3437053728745807e-06, "loss": 1.0311, "step": 13940 }, { "epoch": 0.8001032761489472, "grad_norm": 3.351319179999831, "learning_rate": 2.337267387302844e-06, "loss": 1.0822, "step": 13945 }, { "epoch": 0.800390154340467, "grad_norm": 3.2801573447753674, "learning_rate": 2.3308370859365524e-06, "loss": 1.0212, "step": 13950 }, { "epoch": 0.800677032531987, "grad_norm": 3.216852746710273, "learning_rate": 2.3244144752240337e-06, "loss": 1.0108, "step": 13955 }, { "epoch": 0.8009639107235068, "grad_norm": 3.4935252068027065, "learning_rate": 2.3179995616058883e-06, "loss": 1.0838, "step": 13960 }, { "epoch": 0.8012507889150267, "grad_norm": 3.0542321407417776, "learning_rate": 2.311592351515004e-06, "loss": 1.0219, "step": 13965 }, { "epoch": 0.8015376671065466, "grad_norm": 3.8472481362332394, "learning_rate": 2.3051928513765544e-06, "loss": 1.0113, "step": 13970 }, { "epoch": 0.8018245452980665, "grad_norm": 3.2028690114433602, "learning_rate": 2.2988010676079677e-06, "loss": 0.932, "step": 13975 }, { "epoch": 0.8021114234895863, "grad_norm": 3.7546566086926125, "learning_rate": 2.2924170066189388e-06, "loss": 1.027, "step": 13980 }, { "epoch": 0.8023983016811062, "grad_norm": 3.5461065750746195, "learning_rate": 2.2860406748114196e-06, "loss": 1.1195, "step": 13985 }, { "epoch": 0.8026851798726261, "grad_norm": 3.243902367517484, "learning_rate": 2.279672078579609e-06, "loss": 1.0858, "step": 13990 }, { "epoch": 0.802972058064146, "grad_norm": 3.3904573664417836, "learning_rate": 2.273311224309951e-06, "loss": 1.057, "step": 13995 }, { "epoch": 0.8032589362556658, "grad_norm": 3.4803965734261397, "learning_rate": 2.2669581183811195e-06, "loss": 1.0421, "step": 14000 }, { "epoch": 0.8035458144471858, "grad_norm": 3.22024968759766, "learning_rate": 2.2606127671640333e-06, "loss": 1.0393, "step": 14005 }, { "epoch": 0.8038326926387056, "grad_norm": 3.4159484367845723, "learning_rate": 2.2542751770218164e-06, "loss": 0.9959, "step": 14010 }, { "epoch": 0.8041195708302254, "grad_norm": 3.0090413684703803, "learning_rate": 2.2479453543098172e-06, "loss": 1.0583, "step": 14015 }, { "epoch": 0.8044064490217454, "grad_norm": 3.7649925087073064, "learning_rate": 2.241623305375603e-06, "loss": 1.0658, "step": 14020 }, { "epoch": 0.8046933272132653, "grad_norm": 3.0698731711314546, "learning_rate": 2.2353090365589348e-06, "loss": 1.0783, "step": 14025 }, { "epoch": 0.8049802054047851, "grad_norm": 2.9738998205462974, "learning_rate": 2.229002554191777e-06, "loss": 1.0642, "step": 14030 }, { "epoch": 0.8052670835963051, "grad_norm": 3.3870144041856887, "learning_rate": 2.2227038645982835e-06, "loss": 1.0229, "step": 14035 }, { "epoch": 0.8055539617878249, "grad_norm": 3.0308107643355977, "learning_rate": 2.216412974094794e-06, "loss": 1.0116, "step": 14040 }, { "epoch": 0.8058408399793447, "grad_norm": 3.6855548795147475, "learning_rate": 2.2101298889898273e-06, "loss": 1.0577, "step": 14045 }, { "epoch": 0.8061277181708647, "grad_norm": 3.381599852367108, "learning_rate": 2.2038546155840733e-06, "loss": 1.0796, "step": 14050 }, { "epoch": 0.8064145963623846, "grad_norm": 3.4603851207076195, "learning_rate": 2.197587160170398e-06, "loss": 1.0606, "step": 14055 }, { "epoch": 0.8067014745539044, "grad_norm": 3.948726757360776, "learning_rate": 2.191327529033812e-06, "loss": 1.081, "step": 14060 }, { "epoch": 0.8069883527454242, "grad_norm": 2.9267428641158295, "learning_rate": 2.1850757284514877e-06, "loss": 1.0226, "step": 14065 }, { "epoch": 0.8072752309369442, "grad_norm": 3.4370238022105264, "learning_rate": 2.178831764692749e-06, "loss": 1.0305, "step": 14070 }, { "epoch": 0.807562109128464, "grad_norm": 3.6290745918672793, "learning_rate": 2.1725956440190545e-06, "loss": 1.0048, "step": 14075 }, { "epoch": 0.8078489873199839, "grad_norm": 4.219545655990287, "learning_rate": 2.1663673726840006e-06, "loss": 1.035, "step": 14080 }, { "epoch": 0.8081358655115038, "grad_norm": 3.119322522240256, "learning_rate": 2.1601469569333112e-06, "loss": 1.0598, "step": 14085 }, { "epoch": 0.8084227437030237, "grad_norm": 3.210918273581893, "learning_rate": 2.153934403004834e-06, "loss": 1.0249, "step": 14090 }, { "epoch": 0.8087096218945435, "grad_norm": 3.192341962950796, "learning_rate": 2.1477297171285283e-06, "loss": 0.953, "step": 14095 }, { "epoch": 0.8089965000860635, "grad_norm": 3.3749641731935203, "learning_rate": 2.141532905526472e-06, "loss": 0.9792, "step": 14100 }, { "epoch": 0.8092833782775833, "grad_norm": 3.1290937679929987, "learning_rate": 2.1353439744128436e-06, "loss": 1.0659, "step": 14105 }, { "epoch": 0.8095702564691032, "grad_norm": 3.501716912986199, "learning_rate": 2.1291629299939097e-06, "loss": 1.143, "step": 14110 }, { "epoch": 0.8098571346606231, "grad_norm": 3.179510335349071, "learning_rate": 2.1229897784680363e-06, "loss": 1.0332, "step": 14115 }, { "epoch": 0.810144012852143, "grad_norm": 3.0326262256217875, "learning_rate": 2.116824526025679e-06, "loss": 0.9452, "step": 14120 }, { "epoch": 0.8104308910436628, "grad_norm": 3.58727821628984, "learning_rate": 2.1106671788493637e-06, "loss": 1.0743, "step": 14125 }, { "epoch": 0.8107177692351828, "grad_norm": 3.903098855059959, "learning_rate": 2.104517743113693e-06, "loss": 0.995, "step": 14130 }, { "epoch": 0.8110046474267026, "grad_norm": 3.57748939863304, "learning_rate": 2.0983762249853346e-06, "loss": 1.0595, "step": 14135 }, { "epoch": 0.8112915256182225, "grad_norm": 3.4861270465626584, "learning_rate": 2.092242630623016e-06, "loss": 1.0534, "step": 14140 }, { "epoch": 0.8115784038097423, "grad_norm": 3.5780227255481356, "learning_rate": 2.086116966177516e-06, "loss": 0.9991, "step": 14145 }, { "epoch": 0.8118652820012623, "grad_norm": 3.195388327908311, "learning_rate": 2.0799992377916722e-06, "loss": 1.0739, "step": 14150 }, { "epoch": 0.8121521601927821, "grad_norm": 3.0987960000692873, "learning_rate": 2.0738894516003538e-06, "loss": 1.0161, "step": 14155 }, { "epoch": 0.812439038384302, "grad_norm": 3.3919892570118404, "learning_rate": 2.067787613730462e-06, "loss": 1.051, "step": 14160 }, { "epoch": 0.8127259165758219, "grad_norm": 3.9349475026248366, "learning_rate": 2.061693730300941e-06, "loss": 1.0851, "step": 14165 }, { "epoch": 0.8130127947673418, "grad_norm": 4.398223627683151, "learning_rate": 2.055607807422748e-06, "loss": 1.1218, "step": 14170 }, { "epoch": 0.8132996729588616, "grad_norm": 3.2469936222520497, "learning_rate": 2.0495298511988605e-06, "loss": 1.0982, "step": 14175 }, { "epoch": 0.8135865511503816, "grad_norm": 3.714137870005456, "learning_rate": 2.0434598677242655e-06, "loss": 1.0756, "step": 14180 }, { "epoch": 0.8138734293419014, "grad_norm": 3.6752140069175687, "learning_rate": 2.037397863085957e-06, "loss": 1.1396, "step": 14185 }, { "epoch": 0.8141603075334213, "grad_norm": 3.0696305402725272, "learning_rate": 2.0313438433629263e-06, "loss": 1.0391, "step": 14190 }, { "epoch": 0.8144471857249412, "grad_norm": 3.5781864266349266, "learning_rate": 2.0252978146261558e-06, "loss": 1.0641, "step": 14195 }, { "epoch": 0.8147340639164611, "grad_norm": 3.1323966070937073, "learning_rate": 2.0192597829386217e-06, "loss": 1.1204, "step": 14200 }, { "epoch": 0.8150209421079809, "grad_norm": 3.1666646416612827, "learning_rate": 2.013229754355276e-06, "loss": 0.9772, "step": 14205 }, { "epoch": 0.8153078202995009, "grad_norm": 3.1036136850075975, "learning_rate": 2.007207734923036e-06, "loss": 0.9909, "step": 14210 }, { "epoch": 0.8155946984910207, "grad_norm": 3.1711355183453227, "learning_rate": 2.0011937306808048e-06, "loss": 0.9849, "step": 14215 }, { "epoch": 0.8158815766825406, "grad_norm": 3.690660076781952, "learning_rate": 1.9951877476594385e-06, "loss": 0.9628, "step": 14220 }, { "epoch": 0.8161684548740604, "grad_norm": 3.3754721866933646, "learning_rate": 1.989189791881747e-06, "loss": 1.0554, "step": 14225 }, { "epoch": 0.8164553330655804, "grad_norm": 3.177762415697452, "learning_rate": 1.9831998693624964e-06, "loss": 1.0169, "step": 14230 }, { "epoch": 0.8167422112571002, "grad_norm": 3.33120488030606, "learning_rate": 1.977217986108393e-06, "loss": 1.038, "step": 14235 }, { "epoch": 0.8170290894486201, "grad_norm": 3.748508895909544, "learning_rate": 1.971244148118083e-06, "loss": 1.0023, "step": 14240 }, { "epoch": 0.81731596764014, "grad_norm": 3.2797437302784647, "learning_rate": 1.9652783613821434e-06, "loss": 1.0211, "step": 14245 }, { "epoch": 0.8176028458316599, "grad_norm": 3.4562169724522853, "learning_rate": 1.9593206318830814e-06, "loss": 1.0531, "step": 14250 }, { "epoch": 0.8178897240231797, "grad_norm": 2.8416396474378027, "learning_rate": 1.953370965595324e-06, "loss": 1.0529, "step": 14255 }, { "epoch": 0.8181766022146997, "grad_norm": 3.0695444032381722, "learning_rate": 1.9474293684851987e-06, "loss": 1.0002, "step": 14260 }, { "epoch": 0.8184634804062195, "grad_norm": 3.053326265798995, "learning_rate": 1.9414958465109637e-06, "loss": 1.0715, "step": 14265 }, { "epoch": 0.8187503585977394, "grad_norm": 3.4866087676065285, "learning_rate": 1.935570405622763e-06, "loss": 1.1583, "step": 14270 }, { "epoch": 0.8190372367892593, "grad_norm": 3.170122262287769, "learning_rate": 1.9296530517626443e-06, "loss": 0.986, "step": 14275 }, { "epoch": 0.8193241149807792, "grad_norm": 3.3539387054181886, "learning_rate": 1.9237437908645417e-06, "loss": 0.9767, "step": 14280 }, { "epoch": 0.819610993172299, "grad_norm": 3.2860664958739436, "learning_rate": 1.9178426288542752e-06, "loss": 1.1129, "step": 14285 }, { "epoch": 0.819897871363819, "grad_norm": 3.5613618147051382, "learning_rate": 1.911949571649542e-06, "loss": 1.1014, "step": 14290 }, { "epoch": 0.8201847495553388, "grad_norm": 3.077254365582193, "learning_rate": 1.9060646251599157e-06, "loss": 1.0934, "step": 14295 }, { "epoch": 0.8204716277468587, "grad_norm": 3.152780183870098, "learning_rate": 1.900187795286834e-06, "loss": 1.0398, "step": 14300 }, { "epoch": 0.8207585059383785, "grad_norm": 3.4588891266808752, "learning_rate": 1.8943190879235972e-06, "loss": 1.0745, "step": 14305 }, { "epoch": 0.8210453841298985, "grad_norm": 3.7687536093215095, "learning_rate": 1.8884585089553497e-06, "loss": 1.0675, "step": 14310 }, { "epoch": 0.8213322623214183, "grad_norm": 3.4524116064093566, "learning_rate": 1.8826060642591003e-06, "loss": 1.0753, "step": 14315 }, { "epoch": 0.8216191405129382, "grad_norm": 2.8069052572319233, "learning_rate": 1.8767617597036924e-06, "loss": 0.9568, "step": 14320 }, { "epoch": 0.8219060187044581, "grad_norm": 3.4451527032912046, "learning_rate": 1.870925601149808e-06, "loss": 0.981, "step": 14325 }, { "epoch": 0.822192896895978, "grad_norm": 2.8367410711068524, "learning_rate": 1.8650975944499584e-06, "loss": 0.9386, "step": 14330 }, { "epoch": 0.8224797750874978, "grad_norm": 3.734487033286228, "learning_rate": 1.8592777454484834e-06, "loss": 0.9972, "step": 14335 }, { "epoch": 0.8227666532790178, "grad_norm": 3.1030023463224854, "learning_rate": 1.8534660599815368e-06, "loss": 1.0224, "step": 14340 }, { "epoch": 0.8230535314705376, "grad_norm": 3.5619294905430228, "learning_rate": 1.8476625438770944e-06, "loss": 1.1101, "step": 14345 }, { "epoch": 0.8233404096620575, "grad_norm": 3.0224036871213324, "learning_rate": 1.8418672029549356e-06, "loss": 1.041, "step": 14350 }, { "epoch": 0.8236272878535774, "grad_norm": 3.3826375770808417, "learning_rate": 1.836080043026638e-06, "loss": 1.0687, "step": 14355 }, { "epoch": 0.8239141660450973, "grad_norm": 3.1375190320269217, "learning_rate": 1.8303010698955803e-06, "loss": 1.0012, "step": 14360 }, { "epoch": 0.8242010442366171, "grad_norm": 3.696292943828924, "learning_rate": 1.8245302893569294e-06, "loss": 1.0433, "step": 14365 }, { "epoch": 0.8244879224281371, "grad_norm": 2.9533240124141424, "learning_rate": 1.8187677071976362e-06, "loss": 0.9926, "step": 14370 }, { "epoch": 0.8247748006196569, "grad_norm": 2.898317023251736, "learning_rate": 1.8130133291964325e-06, "loss": 1.0007, "step": 14375 }, { "epoch": 0.8250616788111768, "grad_norm": 3.4569634358531474, "learning_rate": 1.8072671611238202e-06, "loss": 0.9083, "step": 14380 }, { "epoch": 0.8253485570026966, "grad_norm": 3.272255361880805, "learning_rate": 1.80152920874207e-06, "loss": 0.9976, "step": 14385 }, { "epoch": 0.8256354351942166, "grad_norm": 3.142044168660313, "learning_rate": 1.7957994778052113e-06, "loss": 0.9733, "step": 14390 }, { "epoch": 0.8259223133857364, "grad_norm": 3.253870860010789, "learning_rate": 1.7900779740590346e-06, "loss": 1.1107, "step": 14395 }, { "epoch": 0.8262091915772563, "grad_norm": 3.2603176162216156, "learning_rate": 1.784364703241076e-06, "loss": 1.0328, "step": 14400 }, { "epoch": 0.8264960697687762, "grad_norm": 3.1491323147663706, "learning_rate": 1.7786596710806159e-06, "loss": 1.0081, "step": 14405 }, { "epoch": 0.826782947960296, "grad_norm": 3.570359656056488, "learning_rate": 1.7729628832986722e-06, "loss": 1.0953, "step": 14410 }, { "epoch": 0.8270698261518159, "grad_norm": 4.948259515366424, "learning_rate": 1.7672743456079977e-06, "loss": 1.0414, "step": 14415 }, { "epoch": 0.8273567043433359, "grad_norm": 4.227222211812507, "learning_rate": 1.7615940637130679e-06, "loss": 1.0324, "step": 14420 }, { "epoch": 0.8276435825348557, "grad_norm": 3.214619890475981, "learning_rate": 1.7559220433100899e-06, "loss": 1.0546, "step": 14425 }, { "epoch": 0.8279304607263755, "grad_norm": 3.209953965606381, "learning_rate": 1.7502582900869703e-06, "loss": 0.9859, "step": 14430 }, { "epoch": 0.8282173389178955, "grad_norm": 3.356410448607741, "learning_rate": 1.744602809723337e-06, "loss": 1.0174, "step": 14435 }, { "epoch": 0.8285042171094154, "grad_norm": 3.3052937765782335, "learning_rate": 1.7389556078905145e-06, "loss": 1.1184, "step": 14440 }, { "epoch": 0.8287910953009352, "grad_norm": 2.998034953702605, "learning_rate": 1.7333166902515364e-06, "loss": 0.9667, "step": 14445 }, { "epoch": 0.8290779734924552, "grad_norm": 3.4888211433056613, "learning_rate": 1.727686062461118e-06, "loss": 1.0482, "step": 14450 }, { "epoch": 0.829364851683975, "grad_norm": 3.2379125955384156, "learning_rate": 1.7220637301656652e-06, "loss": 0.9535, "step": 14455 }, { "epoch": 0.8296517298754948, "grad_norm": 3.2388883592124733, "learning_rate": 1.7164496990032663e-06, "loss": 1.008, "step": 14460 }, { "epoch": 0.8299386080670147, "grad_norm": 3.118178274554452, "learning_rate": 1.7108439746036842e-06, "loss": 0.9443, "step": 14465 }, { "epoch": 0.8302254862585347, "grad_norm": 3.476718383345352, "learning_rate": 1.7052465625883496e-06, "loss": 1.0379, "step": 14470 }, { "epoch": 0.8305123644500545, "grad_norm": 3.501269386573678, "learning_rate": 1.699657468570367e-06, "loss": 0.9906, "step": 14475 }, { "epoch": 0.8307992426415743, "grad_norm": 3.0516656796953407, "learning_rate": 1.6940766981544844e-06, "loss": 1.0174, "step": 14480 }, { "epoch": 0.8310861208330943, "grad_norm": 2.9207559576024615, "learning_rate": 1.6885042569371147e-06, "loss": 1.0836, "step": 14485 }, { "epoch": 0.8313729990246141, "grad_norm": 3.4225626743010866, "learning_rate": 1.6829401505063103e-06, "loss": 1.0172, "step": 14490 }, { "epoch": 0.831659877216134, "grad_norm": 2.9343346803716663, "learning_rate": 1.6773843844417759e-06, "loss": 0.9628, "step": 14495 }, { "epoch": 0.831946755407654, "grad_norm": 3.7823106245595035, "learning_rate": 1.6718369643148435e-06, "loss": 1.0195, "step": 14500 }, { "epoch": 0.8322336335991738, "grad_norm": 3.2527968060186265, "learning_rate": 1.6662978956884778e-06, "loss": 0.9755, "step": 14505 }, { "epoch": 0.8325205117906936, "grad_norm": 3.3605418497856894, "learning_rate": 1.66076718411727e-06, "loss": 0.9635, "step": 14510 }, { "epoch": 0.8328073899822136, "grad_norm": 3.312857934392767, "learning_rate": 1.6552448351474305e-06, "loss": 1.0222, "step": 14515 }, { "epoch": 0.8330942681737334, "grad_norm": 3.3290524660615732, "learning_rate": 1.6497308543167789e-06, "loss": 1.0047, "step": 14520 }, { "epoch": 0.8333811463652533, "grad_norm": 3.2927961022613585, "learning_rate": 1.644225247154756e-06, "loss": 1.0013, "step": 14525 }, { "epoch": 0.8336680245567732, "grad_norm": 3.2125946025575933, "learning_rate": 1.6387280191823895e-06, "loss": 1.0585, "step": 14530 }, { "epoch": 0.8339549027482931, "grad_norm": 2.9512920282867516, "learning_rate": 1.6332391759123123e-06, "loss": 0.9975, "step": 14535 }, { "epoch": 0.8342417809398129, "grad_norm": 3.7325097338639237, "learning_rate": 1.6277587228487536e-06, "loss": 1.201, "step": 14540 }, { "epoch": 0.8345286591313328, "grad_norm": 3.145634391023428, "learning_rate": 1.6222866654875214e-06, "loss": 1.1047, "step": 14545 }, { "epoch": 0.8348155373228527, "grad_norm": 3.457211611732761, "learning_rate": 1.6168230093160065e-06, "loss": 1.1028, "step": 14550 }, { "epoch": 0.8351024155143726, "grad_norm": 2.9316720110222394, "learning_rate": 1.611367759813176e-06, "loss": 1.0398, "step": 14555 }, { "epoch": 0.8353892937058924, "grad_norm": 2.923063049449617, "learning_rate": 1.6059209224495676e-06, "loss": 1.0094, "step": 14560 }, { "epoch": 0.8356761718974124, "grad_norm": 3.4014022601531666, "learning_rate": 1.6004825026872806e-06, "loss": 1.0133, "step": 14565 }, { "epoch": 0.8359630500889322, "grad_norm": 3.3044458598921116, "learning_rate": 1.5950525059799716e-06, "loss": 1.0841, "step": 14570 }, { "epoch": 0.8362499282804521, "grad_norm": 3.359797745164939, "learning_rate": 1.5896309377728624e-06, "loss": 1.0507, "step": 14575 }, { "epoch": 0.836536806471972, "grad_norm": 3.4887748897468733, "learning_rate": 1.5842178035027044e-06, "loss": 1.0948, "step": 14580 }, { "epoch": 0.8368236846634919, "grad_norm": 3.4684381302706933, "learning_rate": 1.5788131085978032e-06, "loss": 1.0085, "step": 14585 }, { "epoch": 0.8371105628550117, "grad_norm": 3.160138253271482, "learning_rate": 1.573416858478003e-06, "loss": 1.0402, "step": 14590 }, { "epoch": 0.8373974410465317, "grad_norm": 3.1663423459649227, "learning_rate": 1.568029058554672e-06, "loss": 1.0603, "step": 14595 }, { "epoch": 0.8376843192380515, "grad_norm": 3.66197811325555, "learning_rate": 1.5626497142307085e-06, "loss": 1.0836, "step": 14600 }, { "epoch": 0.8379711974295714, "grad_norm": 3.718239150018343, "learning_rate": 1.5572788309005315e-06, "loss": 1.0312, "step": 14605 }, { "epoch": 0.8382580756210913, "grad_norm": 3.600274188590663, "learning_rate": 1.5519164139500741e-06, "loss": 1.062, "step": 14610 }, { "epoch": 0.8385449538126112, "grad_norm": 3.213284725110922, "learning_rate": 1.5465624687567815e-06, "loss": 1.0582, "step": 14615 }, { "epoch": 0.838831832004131, "grad_norm": 3.5888107795222752, "learning_rate": 1.5412170006895987e-06, "loss": 0.968, "step": 14620 }, { "epoch": 0.8391187101956509, "grad_norm": 3.2536285747417844, "learning_rate": 1.5358800151089803e-06, "loss": 1.0332, "step": 14625 }, { "epoch": 0.8394055883871708, "grad_norm": 3.167185417959817, "learning_rate": 1.5305515173668595e-06, "loss": 1.0481, "step": 14630 }, { "epoch": 0.8396924665786907, "grad_norm": 3.214925553599641, "learning_rate": 1.5252315128066663e-06, "loss": 0.9746, "step": 14635 }, { "epoch": 0.8399793447702105, "grad_norm": 4.211883058265854, "learning_rate": 1.519920006763319e-06, "loss": 1.1113, "step": 14640 }, { "epoch": 0.8402662229617305, "grad_norm": 3.425412294282702, "learning_rate": 1.5146170045632036e-06, "loss": 1.0242, "step": 14645 }, { "epoch": 0.8405531011532503, "grad_norm": 3.403819604526518, "learning_rate": 1.509322511524184e-06, "loss": 0.9971, "step": 14650 }, { "epoch": 0.8408399793447702, "grad_norm": 3.016146877497939, "learning_rate": 1.5040365329555894e-06, "loss": 0.9803, "step": 14655 }, { "epoch": 0.8411268575362901, "grad_norm": 3.6723515144985286, "learning_rate": 1.4987590741582102e-06, "loss": 1.0688, "step": 14660 }, { "epoch": 0.84141373572781, "grad_norm": 2.7860925278992568, "learning_rate": 1.493490140424293e-06, "loss": 0.9491, "step": 14665 }, { "epoch": 0.8417006139193298, "grad_norm": 2.867919210453954, "learning_rate": 1.4882297370375388e-06, "loss": 0.9671, "step": 14670 }, { "epoch": 0.8419874921108498, "grad_norm": 3.354997782640784, "learning_rate": 1.4829778692730945e-06, "loss": 1.0808, "step": 14675 }, { "epoch": 0.8422743703023696, "grad_norm": 3.2763995872035583, "learning_rate": 1.4777345423975375e-06, "loss": 1.0508, "step": 14680 }, { "epoch": 0.8425612484938895, "grad_norm": 3.4645120930855766, "learning_rate": 1.4724997616688907e-06, "loss": 1.0475, "step": 14685 }, { "epoch": 0.8428481266854094, "grad_norm": 3.0878449841882096, "learning_rate": 1.467273532336606e-06, "loss": 0.98, "step": 14690 }, { "epoch": 0.8431350048769293, "grad_norm": 3.312382208661438, "learning_rate": 1.462055859641558e-06, "loss": 0.9655, "step": 14695 }, { "epoch": 0.8434218830684491, "grad_norm": 3.456890249088566, "learning_rate": 1.4568467488160387e-06, "loss": 0.9945, "step": 14700 }, { "epoch": 0.843708761259969, "grad_norm": 3.505636522399719, "learning_rate": 1.4516462050837565e-06, "loss": 1.0802, "step": 14705 }, { "epoch": 0.8439956394514889, "grad_norm": 3.405078050669893, "learning_rate": 1.4464542336598276e-06, "loss": 1.0276, "step": 14710 }, { "epoch": 0.8442825176430088, "grad_norm": 3.1023006372913233, "learning_rate": 1.4412708397507724e-06, "loss": 0.9899, "step": 14715 }, { "epoch": 0.8445693958345286, "grad_norm": 3.4090833033742247, "learning_rate": 1.4360960285545134e-06, "loss": 1.0416, "step": 14720 }, { "epoch": 0.8448562740260486, "grad_norm": 3.370214386001877, "learning_rate": 1.4309298052603626e-06, "loss": 1.0539, "step": 14725 }, { "epoch": 0.8451431522175684, "grad_norm": 3.449541281387495, "learning_rate": 1.4257721750490129e-06, "loss": 0.9906, "step": 14730 }, { "epoch": 0.8454300304090883, "grad_norm": 3.1730343363618916, "learning_rate": 1.4206231430925555e-06, "loss": 1.0682, "step": 14735 }, { "epoch": 0.8457169086006082, "grad_norm": 3.1886877665036937, "learning_rate": 1.4154827145544492e-06, "loss": 1.0275, "step": 14740 }, { "epoch": 0.8460037867921281, "grad_norm": 2.8872393005880865, "learning_rate": 1.410350894589525e-06, "loss": 1.0428, "step": 14745 }, { "epoch": 0.8462906649836479, "grad_norm": 3.6463747749644027, "learning_rate": 1.4052276883439864e-06, "loss": 0.9975, "step": 14750 }, { "epoch": 0.8465775431751679, "grad_norm": 3.620112139225195, "learning_rate": 1.4001131009553937e-06, "loss": 1.0837, "step": 14755 }, { "epoch": 0.8468644213666877, "grad_norm": 3.1268230905121843, "learning_rate": 1.3950071375526686e-06, "loss": 1.0912, "step": 14760 }, { "epoch": 0.8471512995582076, "grad_norm": 3.145266631767835, "learning_rate": 1.3899098032560787e-06, "loss": 0.9818, "step": 14765 }, { "epoch": 0.8474381777497275, "grad_norm": 3.048690137430667, "learning_rate": 1.3848211031772474e-06, "loss": 1.0265, "step": 14770 }, { "epoch": 0.8477250559412474, "grad_norm": 3.009950323797931, "learning_rate": 1.3797410424191338e-06, "loss": 1.0082, "step": 14775 }, { "epoch": 0.8480119341327672, "grad_norm": 3.389088047393967, "learning_rate": 1.3746696260760296e-06, "loss": 0.9921, "step": 14780 }, { "epoch": 0.848298812324287, "grad_norm": 3.22639077889437, "learning_rate": 1.3696068592335676e-06, "loss": 1.0427, "step": 14785 }, { "epoch": 0.848585690515807, "grad_norm": 3.146468378984391, "learning_rate": 1.3645527469686992e-06, "loss": 1.0405, "step": 14790 }, { "epoch": 0.8488725687073269, "grad_norm": 3.3243609796752605, "learning_rate": 1.3595072943497012e-06, "loss": 1.0215, "step": 14795 }, { "epoch": 0.8491594468988467, "grad_norm": 3.1641028686769483, "learning_rate": 1.354470506436163e-06, "loss": 0.9677, "step": 14800 }, { "epoch": 0.8494463250903667, "grad_norm": 3.290469618640695, "learning_rate": 1.3494423882789876e-06, "loss": 1.0448, "step": 14805 }, { "epoch": 0.8497332032818865, "grad_norm": 4.487985072871688, "learning_rate": 1.3444229449203828e-06, "loss": 1.1275, "step": 14810 }, { "epoch": 0.8500200814734064, "grad_norm": 3.3925955533195817, "learning_rate": 1.3394121813938554e-06, "loss": 1.0212, "step": 14815 }, { "epoch": 0.8503069596649263, "grad_norm": 3.2254225403935504, "learning_rate": 1.3344101027242162e-06, "loss": 0.9769, "step": 14820 }, { "epoch": 0.8505938378564462, "grad_norm": 3.076672954838583, "learning_rate": 1.3294167139275594e-06, "loss": 0.989, "step": 14825 }, { "epoch": 0.850880716047966, "grad_norm": 3.0417263940101313, "learning_rate": 1.3244320200112593e-06, "loss": 1.0175, "step": 14830 }, { "epoch": 0.851167594239486, "grad_norm": 3.219946358863101, "learning_rate": 1.3194560259739863e-06, "loss": 0.9673, "step": 14835 }, { "epoch": 0.8514544724310058, "grad_norm": 2.9221454387718198, "learning_rate": 1.3144887368056758e-06, "loss": 1.0079, "step": 14840 }, { "epoch": 0.8517413506225257, "grad_norm": 3.191886690391079, "learning_rate": 1.3095301574875362e-06, "loss": 1.0798, "step": 14845 }, { "epoch": 0.8520282288140456, "grad_norm": 3.3953416947674255, "learning_rate": 1.3045802929920414e-06, "loss": 1.0504, "step": 14850 }, { "epoch": 0.8523151070055655, "grad_norm": 3.0561263228081943, "learning_rate": 1.2996391482829274e-06, "loss": 0.9564, "step": 14855 }, { "epoch": 0.8526019851970853, "grad_norm": 3.577494677880719, "learning_rate": 1.2947067283151838e-06, "loss": 0.9425, "step": 14860 }, { "epoch": 0.8528888633886051, "grad_norm": 3.3980536554921943, "learning_rate": 1.289783038035055e-06, "loss": 1.098, "step": 14865 }, { "epoch": 0.8531757415801251, "grad_norm": 3.257755868344329, "learning_rate": 1.2848680823800275e-06, "loss": 0.9684, "step": 14870 }, { "epoch": 0.853462619771645, "grad_norm": 3.4150650112971723, "learning_rate": 1.2799618662788316e-06, "loss": 0.9709, "step": 14875 }, { "epoch": 0.8537494979631648, "grad_norm": 3.683561620530894, "learning_rate": 1.2750643946514252e-06, "loss": 1.0469, "step": 14880 }, { "epoch": 0.8540363761546848, "grad_norm": 3.0496572168479057, "learning_rate": 1.2701756724090108e-06, "loss": 0.9765, "step": 14885 }, { "epoch": 0.8543232543462046, "grad_norm": 3.145376446245544, "learning_rate": 1.2652957044540082e-06, "loss": 1.0235, "step": 14890 }, { "epoch": 0.8546101325377244, "grad_norm": 3.1413940013327575, "learning_rate": 1.2604244956800593e-06, "loss": 0.9717, "step": 14895 }, { "epoch": 0.8548970107292444, "grad_norm": 3.1968578375525762, "learning_rate": 1.2555620509720235e-06, "loss": 1.0566, "step": 14900 }, { "epoch": 0.8551838889207642, "grad_norm": 3.5252822125750725, "learning_rate": 1.2507083752059723e-06, "loss": 1.0525, "step": 14905 }, { "epoch": 0.8554707671122841, "grad_norm": 3.8657312393785204, "learning_rate": 1.2458634732491782e-06, "loss": 1.0447, "step": 14910 }, { "epoch": 0.855757645303804, "grad_norm": 3.3638161998778906, "learning_rate": 1.2410273499601267e-06, "loss": 1.0658, "step": 14915 }, { "epoch": 0.8560445234953239, "grad_norm": 3.5652282159144972, "learning_rate": 1.2362000101884885e-06, "loss": 1.0217, "step": 14920 }, { "epoch": 0.8563314016868437, "grad_norm": 3.020690194030179, "learning_rate": 1.2313814587751316e-06, "loss": 1.0496, "step": 14925 }, { "epoch": 0.8566182798783637, "grad_norm": 3.299665485775949, "learning_rate": 1.2265717005521116e-06, "loss": 1.005, "step": 14930 }, { "epoch": 0.8569051580698835, "grad_norm": 3.8164987991685226, "learning_rate": 1.2217707403426626e-06, "loss": 1.038, "step": 14935 }, { "epoch": 0.8571920362614034, "grad_norm": 3.099859975446559, "learning_rate": 1.2169785829612003e-06, "loss": 1.0091, "step": 14940 }, { "epoch": 0.8574789144529232, "grad_norm": 3.331847007676515, "learning_rate": 1.2121952332133091e-06, "loss": 1.1276, "step": 14945 }, { "epoch": 0.8577657926444432, "grad_norm": 3.220288769172157, "learning_rate": 1.2074206958957446e-06, "loss": 1.0472, "step": 14950 }, { "epoch": 0.858052670835963, "grad_norm": 2.8185124266536175, "learning_rate": 1.2026549757964212e-06, "loss": 0.9181, "step": 14955 }, { "epoch": 0.8583395490274829, "grad_norm": 3.333939200140748, "learning_rate": 1.1978980776944138e-06, "loss": 1.0189, "step": 14960 }, { "epoch": 0.8586264272190028, "grad_norm": 2.9694843383385052, "learning_rate": 1.1931500063599545e-06, "loss": 1.0502, "step": 14965 }, { "epoch": 0.8589133054105227, "grad_norm": 3.111790712472915, "learning_rate": 1.1884107665544165e-06, "loss": 0.9934, "step": 14970 }, { "epoch": 0.8592001836020425, "grad_norm": 3.2523215450793623, "learning_rate": 1.1836803630303206e-06, "loss": 1.0206, "step": 14975 }, { "epoch": 0.8594870617935625, "grad_norm": 3.787962026560641, "learning_rate": 1.1789588005313257e-06, "loss": 1.076, "step": 14980 }, { "epoch": 0.8597739399850823, "grad_norm": 3.302574349762841, "learning_rate": 1.1742460837922264e-06, "loss": 1.0074, "step": 14985 }, { "epoch": 0.8600608181766022, "grad_norm": 2.840469731100635, "learning_rate": 1.1695422175389447e-06, "loss": 0.986, "step": 14990 }, { "epoch": 0.8603476963681221, "grad_norm": 3.28413559651193, "learning_rate": 1.1648472064885286e-06, "loss": 1.0877, "step": 14995 }, { "epoch": 0.860634574559642, "grad_norm": 3.08564945422835, "learning_rate": 1.1601610553491461e-06, "loss": 0.9613, "step": 15000 }, { "epoch": 0.8609214527511618, "grad_norm": 3.2894994570878424, "learning_rate": 1.1554837688200793e-06, "loss": 1.0731, "step": 15005 }, { "epoch": 0.8612083309426818, "grad_norm": 3.048484194342337, "learning_rate": 1.1508153515917198e-06, "loss": 0.9837, "step": 15010 }, { "epoch": 0.8614952091342016, "grad_norm": 2.913190266397635, "learning_rate": 1.1461558083455703e-06, "loss": 1.0062, "step": 15015 }, { "epoch": 0.8617820873257215, "grad_norm": 3.1561515167614296, "learning_rate": 1.1415051437542302e-06, "loss": 0.9995, "step": 15020 }, { "epoch": 0.8620689655172413, "grad_norm": 3.6485253193673577, "learning_rate": 1.1368633624813974e-06, "loss": 1.055, "step": 15025 }, { "epoch": 0.8623558437087613, "grad_norm": 2.9573646299141654, "learning_rate": 1.1322304691818576e-06, "loss": 1.083, "step": 15030 }, { "epoch": 0.8626427219002811, "grad_norm": 3.2275794770010315, "learning_rate": 1.1276064685014888e-06, "loss": 1.0163, "step": 15035 }, { "epoch": 0.862929600091801, "grad_norm": 3.610590560935629, "learning_rate": 1.1229913650772473e-06, "loss": 1.0702, "step": 15040 }, { "epoch": 0.8632164782833209, "grad_norm": 3.4093420930188048, "learning_rate": 1.1183851635371735e-06, "loss": 1.0092, "step": 15045 }, { "epoch": 0.8635033564748408, "grad_norm": 3.466754595695979, "learning_rate": 1.1137878685003723e-06, "loss": 1.0283, "step": 15050 }, { "epoch": 0.8637902346663606, "grad_norm": 3.3940122523149396, "learning_rate": 1.1091994845770227e-06, "loss": 1.0204, "step": 15055 }, { "epoch": 0.8640771128578806, "grad_norm": 3.3040862281731624, "learning_rate": 1.104620016368364e-06, "loss": 1.1229, "step": 15060 }, { "epoch": 0.8643639910494004, "grad_norm": 3.402465392361589, "learning_rate": 1.1000494684667017e-06, "loss": 1.0343, "step": 15065 }, { "epoch": 0.8646508692409203, "grad_norm": 3.2701605895298473, "learning_rate": 1.0954878454553907e-06, "loss": 0.9513, "step": 15070 }, { "epoch": 0.8649377474324402, "grad_norm": 3.2269794466242425, "learning_rate": 1.0909351519088352e-06, "loss": 0.951, "step": 15075 }, { "epoch": 0.8652246256239601, "grad_norm": 3.532583445868963, "learning_rate": 1.0863913923924863e-06, "loss": 1.0989, "step": 15080 }, { "epoch": 0.8655115038154799, "grad_norm": 3.4425232298969872, "learning_rate": 1.081856571462837e-06, "loss": 1.0445, "step": 15085 }, { "epoch": 0.8657983820069999, "grad_norm": 4.076612538007008, "learning_rate": 1.0773306936674134e-06, "loss": 1.0555, "step": 15090 }, { "epoch": 0.8660852601985197, "grad_norm": 5.387414991469258, "learning_rate": 1.0728137635447821e-06, "loss": 1.1601, "step": 15095 }, { "epoch": 0.8663721383900396, "grad_norm": 2.7703200947228304, "learning_rate": 1.068305785624526e-06, "loss": 1.0007, "step": 15100 }, { "epoch": 0.8666590165815594, "grad_norm": 3.307033980430676, "learning_rate": 1.0638067644272532e-06, "loss": 1.0558, "step": 15105 }, { "epoch": 0.8669458947730794, "grad_norm": 3.1912321768413388, "learning_rate": 1.059316704464598e-06, "loss": 1.0263, "step": 15110 }, { "epoch": 0.8672327729645992, "grad_norm": 3.329497930296765, "learning_rate": 1.0548356102391998e-06, "loss": 1.026, "step": 15115 }, { "epoch": 0.8675196511561191, "grad_norm": 3.6129897546653633, "learning_rate": 1.0503634862447098e-06, "loss": 0.9842, "step": 15120 }, { "epoch": 0.867806529347639, "grad_norm": 3.4390547712629984, "learning_rate": 1.045900336965785e-06, "loss": 0.9948, "step": 15125 }, { "epoch": 0.8680934075391589, "grad_norm": 3.3344108430773676, "learning_rate": 1.0414461668780806e-06, "loss": 0.9673, "step": 15130 }, { "epoch": 0.8683802857306787, "grad_norm": 3.23022075280538, "learning_rate": 1.0370009804482483e-06, "loss": 0.9661, "step": 15135 }, { "epoch": 0.8686671639221987, "grad_norm": 3.115117191110591, "learning_rate": 1.032564782133929e-06, "loss": 1.0152, "step": 15140 }, { "epoch": 0.8689540421137185, "grad_norm": 3.378590596328516, "learning_rate": 1.0281375763837598e-06, "loss": 0.9825, "step": 15145 }, { "epoch": 0.8692409203052384, "grad_norm": 3.072197193401539, "learning_rate": 1.0237193676373437e-06, "loss": 0.9981, "step": 15150 }, { "epoch": 0.8695277984967583, "grad_norm": 3.2718749389971, "learning_rate": 1.019310160325273e-06, "loss": 1.0487, "step": 15155 }, { "epoch": 0.8698146766882782, "grad_norm": 3.1595785625607817, "learning_rate": 1.0149099588691135e-06, "loss": 1.0049, "step": 15160 }, { "epoch": 0.870101554879798, "grad_norm": 3.6630456272185943, "learning_rate": 1.0105187676813955e-06, "loss": 1.0684, "step": 15165 }, { "epoch": 0.870388433071318, "grad_norm": 3.3509642311486143, "learning_rate": 1.006136591165614e-06, "loss": 1.1047, "step": 15170 }, { "epoch": 0.8706753112628378, "grad_norm": 3.2901079374549327, "learning_rate": 1.0017634337162275e-06, "loss": 1.0297, "step": 15175 }, { "epoch": 0.8709621894543577, "grad_norm": 3.276358745180128, "learning_rate": 9.973992997186465e-07, "loss": 1.0139, "step": 15180 }, { "epoch": 0.8712490676458775, "grad_norm": 3.4152014030371727, "learning_rate": 9.930441935492362e-07, "loss": 1.0112, "step": 15185 }, { "epoch": 0.8715359458373975, "grad_norm": 3.0632430921268043, "learning_rate": 9.88698119575302e-07, "loss": 1.0528, "step": 15190 }, { "epoch": 0.8718228240289173, "grad_norm": 3.4192035052667697, "learning_rate": 9.843610821551052e-07, "loss": 1.0041, "step": 15195 }, { "epoch": 0.8721097022204372, "grad_norm": 3.205921962062353, "learning_rate": 9.800330856378304e-07, "loss": 1.0535, "step": 15200 }, { "epoch": 0.8723965804119571, "grad_norm": 3.256766299272756, "learning_rate": 9.757141343636002e-07, "loss": 1.0111, "step": 15205 }, { "epoch": 0.872683458603477, "grad_norm": 3.3058707211830014, "learning_rate": 9.714042326634744e-07, "loss": 1.0064, "step": 15210 }, { "epoch": 0.8729703367949968, "grad_norm": 3.392452618124447, "learning_rate": 9.671033848594301e-07, "loss": 1.0043, "step": 15215 }, { "epoch": 0.8732572149865168, "grad_norm": 3.187254454815171, "learning_rate": 9.628115952643657e-07, "loss": 1.0104, "step": 15220 }, { "epoch": 0.8735440931780366, "grad_norm": 3.356535463913503, "learning_rate": 9.585288681820993e-07, "loss": 1.0723, "step": 15225 }, { "epoch": 0.8738309713695565, "grad_norm": 3.5444285367734145, "learning_rate": 9.542552079073586e-07, "loss": 1.077, "step": 15230 }, { "epoch": 0.8741178495610764, "grad_norm": 3.044256623713826, "learning_rate": 9.499906187257768e-07, "loss": 1.0278, "step": 15235 }, { "epoch": 0.8744047277525963, "grad_norm": 3.330777327580303, "learning_rate": 9.457351049138974e-07, "loss": 1.0619, "step": 15240 }, { "epoch": 0.8746916059441161, "grad_norm": 3.383508692425511, "learning_rate": 9.414886707391613e-07, "loss": 1.023, "step": 15245 }, { "epoch": 0.8749784841356361, "grad_norm": 3.25170497964211, "learning_rate": 9.372513204598954e-07, "loss": 0.9315, "step": 15250 }, { "epoch": 0.8752653623271559, "grad_norm": 3.0297795706445183, "learning_rate": 9.330230583253264e-07, "loss": 0.9996, "step": 15255 }, { "epoch": 0.8755522405186758, "grad_norm": 3.301593667966574, "learning_rate": 9.288038885755679e-07, "loss": 1.0734, "step": 15260 }, { "epoch": 0.8758391187101956, "grad_norm": 3.5791076872187855, "learning_rate": 9.245938154416112e-07, "loss": 1.0414, "step": 15265 }, { "epoch": 0.8761259969017156, "grad_norm": 3.507568444780706, "learning_rate": 9.203928431453268e-07, "loss": 1.1006, "step": 15270 }, { "epoch": 0.8764128750932354, "grad_norm": 3.075640159079677, "learning_rate": 9.162009758994595e-07, "loss": 1.071, "step": 15275 }, { "epoch": 0.8766997532847552, "grad_norm": 3.807724262370148, "learning_rate": 9.12018217907622e-07, "loss": 0.9752, "step": 15280 }, { "epoch": 0.8769866314762752, "grad_norm": 3.256582065320333, "learning_rate": 9.078445733642927e-07, "loss": 1.0469, "step": 15285 }, { "epoch": 0.877273509667795, "grad_norm": 3.3001862405089195, "learning_rate": 9.036800464548157e-07, "loss": 1.0431, "step": 15290 }, { "epoch": 0.8775603878593149, "grad_norm": 2.9958275964024335, "learning_rate": 8.995246413553871e-07, "loss": 0.9018, "step": 15295 }, { "epoch": 0.8778472660508349, "grad_norm": 3.954096662042772, "learning_rate": 8.953783622330514e-07, "loss": 0.9576, "step": 15300 }, { "epoch": 0.8781341442423547, "grad_norm": 3.685465957418405, "learning_rate": 8.912412132457116e-07, "loss": 0.9867, "step": 15305 }, { "epoch": 0.8784210224338745, "grad_norm": 3.4171256637567353, "learning_rate": 8.871131985421089e-07, "loss": 1.0628, "step": 15310 }, { "epoch": 0.8787079006253945, "grad_norm": 3.267952499620587, "learning_rate": 8.829943222618242e-07, "loss": 1.0632, "step": 15315 }, { "epoch": 0.8789947788169143, "grad_norm": 3.466011203962954, "learning_rate": 8.788845885352781e-07, "loss": 0.9263, "step": 15320 }, { "epoch": 0.8792816570084342, "grad_norm": 3.1720369233154324, "learning_rate": 8.747840014837194e-07, "loss": 1.0359, "step": 15325 }, { "epoch": 0.8795685351999541, "grad_norm": 3.180029368930501, "learning_rate": 8.706925652192255e-07, "loss": 0.9976, "step": 15330 }, { "epoch": 0.879855413391474, "grad_norm": 5.262823763910098, "learning_rate": 8.666102838446977e-07, "loss": 1.0676, "step": 15335 }, { "epoch": 0.8801422915829938, "grad_norm": 3.031951523377811, "learning_rate": 8.625371614538591e-07, "loss": 0.9984, "step": 15340 }, { "epoch": 0.8804291697745137, "grad_norm": 2.943409606855121, "learning_rate": 8.584732021312469e-07, "loss": 1.0153, "step": 15345 }, { "epoch": 0.8807160479660336, "grad_norm": 4.189034950910965, "learning_rate": 8.544184099522024e-07, "loss": 1.0437, "step": 15350 }, { "epoch": 0.8810029261575535, "grad_norm": 3.8247966974737104, "learning_rate": 8.503727889828861e-07, "loss": 1.0261, "step": 15355 }, { "epoch": 0.8812898043490733, "grad_norm": 2.8245375401427473, "learning_rate": 8.46336343280254e-07, "loss": 0.9941, "step": 15360 }, { "epoch": 0.8815766825405933, "grad_norm": 3.942681236071897, "learning_rate": 8.423090768920628e-07, "loss": 1.0035, "step": 15365 }, { "epoch": 0.8818635607321131, "grad_norm": 3.6386071592012064, "learning_rate": 8.382909938568651e-07, "loss": 1.0531, "step": 15370 }, { "epoch": 0.882150438923633, "grad_norm": 3.09098582612325, "learning_rate": 8.342820982040012e-07, "loss": 1.088, "step": 15375 }, { "epoch": 0.8824373171151529, "grad_norm": 3.4341785591491485, "learning_rate": 8.302823939536031e-07, "loss": 1.0159, "step": 15380 }, { "epoch": 0.8827241953066728, "grad_norm": 2.9591593615070697, "learning_rate": 8.262918851165813e-07, "loss": 0.9647, "step": 15385 }, { "epoch": 0.8830110734981926, "grad_norm": 3.4642742555729775, "learning_rate": 8.223105756946293e-07, "loss": 1.0021, "step": 15390 }, { "epoch": 0.8832979516897126, "grad_norm": 2.9987052232601985, "learning_rate": 8.183384696802133e-07, "loss": 0.9908, "step": 15395 }, { "epoch": 0.8835848298812324, "grad_norm": 3.5991823641627065, "learning_rate": 8.143755710565649e-07, "loss": 1.0751, "step": 15400 }, { "epoch": 0.8838717080727523, "grad_norm": 3.2744632605286235, "learning_rate": 8.10421883797694e-07, "loss": 1.0393, "step": 15405 }, { "epoch": 0.8841585862642722, "grad_norm": 2.8146779636778354, "learning_rate": 8.064774118683638e-07, "loss": 0.9477, "step": 15410 }, { "epoch": 0.8844454644557921, "grad_norm": 3.558668542485769, "learning_rate": 8.025421592241012e-07, "loss": 1.0661, "step": 15415 }, { "epoch": 0.8847323426473119, "grad_norm": 3.1238491799771326, "learning_rate": 7.98616129811185e-07, "loss": 1.0409, "step": 15420 }, { "epoch": 0.8850192208388318, "grad_norm": 3.743266293054315, "learning_rate": 7.94699327566647e-07, "loss": 1.1042, "step": 15425 }, { "epoch": 0.8853060990303517, "grad_norm": 3.040702122165473, "learning_rate": 7.907917564182632e-07, "loss": 0.9024, "step": 15430 }, { "epoch": 0.8855929772218716, "grad_norm": 3.1647370681641247, "learning_rate": 7.86893420284559e-07, "loss": 0.9602, "step": 15435 }, { "epoch": 0.8858798554133914, "grad_norm": 3.2479724596247728, "learning_rate": 7.830043230747919e-07, "loss": 0.9976, "step": 15440 }, { "epoch": 0.8861667336049114, "grad_norm": 3.2677041679783834, "learning_rate": 7.791244686889587e-07, "loss": 1.0375, "step": 15445 }, { "epoch": 0.8864536117964312, "grad_norm": 3.310141304509953, "learning_rate": 7.752538610177818e-07, "loss": 1.0216, "step": 15450 }, { "epoch": 0.8867404899879511, "grad_norm": 3.058362870661739, "learning_rate": 7.713925039427206e-07, "loss": 1.1004, "step": 15455 }, { "epoch": 0.887027368179471, "grad_norm": 3.112974460970513, "learning_rate": 7.675404013359511e-07, "loss": 1.0263, "step": 15460 }, { "epoch": 0.8873142463709909, "grad_norm": 3.889513585559512, "learning_rate": 7.636975570603689e-07, "loss": 1.004, "step": 15465 }, { "epoch": 0.8876011245625107, "grad_norm": 3.402450078538814, "learning_rate": 7.59863974969588e-07, "loss": 1.0661, "step": 15470 }, { "epoch": 0.8878880027540307, "grad_norm": 3.357551465364782, "learning_rate": 7.560396589079322e-07, "loss": 1.0379, "step": 15475 }, { "epoch": 0.8881748809455505, "grad_norm": 3.410597858581791, "learning_rate": 7.522246127104349e-07, "loss": 0.999, "step": 15480 }, { "epoch": 0.8884617591370704, "grad_norm": 2.9495407534481144, "learning_rate": 7.484188402028336e-07, "loss": 1.0125, "step": 15485 }, { "epoch": 0.8887486373285903, "grad_norm": 3.512594805644136, "learning_rate": 7.446223452015644e-07, "loss": 1.0297, "step": 15490 }, { "epoch": 0.8890355155201102, "grad_norm": 3.4505095683041813, "learning_rate": 7.40835131513764e-07, "loss": 1.0563, "step": 15495 }, { "epoch": 0.88932239371163, "grad_norm": 2.9672506329594253, "learning_rate": 7.370572029372525e-07, "loss": 1.0172, "step": 15500 }, { "epoch": 0.8896092719031499, "grad_norm": 3.0288536590480786, "learning_rate": 7.332885632605513e-07, "loss": 0.9292, "step": 15505 }, { "epoch": 0.8898961500946698, "grad_norm": 3.03740693791089, "learning_rate": 7.295292162628576e-07, "loss": 1.0191, "step": 15510 }, { "epoch": 0.8901830282861897, "grad_norm": 3.9133740646864723, "learning_rate": 7.257791657140545e-07, "loss": 1.0286, "step": 15515 }, { "epoch": 0.8904699064777095, "grad_norm": 3.1400743463466325, "learning_rate": 7.220384153746996e-07, "loss": 1.0608, "step": 15520 }, { "epoch": 0.8907567846692295, "grad_norm": 3.4363149929940464, "learning_rate": 7.183069689960265e-07, "loss": 1.0573, "step": 15525 }, { "epoch": 0.8910436628607493, "grad_norm": 3.4376195006717483, "learning_rate": 7.145848303199366e-07, "loss": 1.0647, "step": 15530 }, { "epoch": 0.8913305410522692, "grad_norm": 3.4163356113217147, "learning_rate": 7.108720030790028e-07, "loss": 1.0296, "step": 15535 }, { "epoch": 0.8916174192437891, "grad_norm": 3.8463372845661654, "learning_rate": 7.071684909964527e-07, "loss": 1.063, "step": 15540 }, { "epoch": 0.891904297435309, "grad_norm": 3.0679941882699118, "learning_rate": 7.034742977861786e-07, "loss": 0.9877, "step": 15545 }, { "epoch": 0.8921911756268288, "grad_norm": 3.425335754984763, "learning_rate": 6.99789427152725e-07, "loss": 0.9831, "step": 15550 }, { "epoch": 0.8924780538183488, "grad_norm": 3.097560512787204, "learning_rate": 6.961138827912883e-07, "loss": 1.0663, "step": 15555 }, { "epoch": 0.8927649320098686, "grad_norm": 3.2602618125572347, "learning_rate": 6.924476683877123e-07, "loss": 1.042, "step": 15560 }, { "epoch": 0.8930518102013885, "grad_norm": 3.209737191929221, "learning_rate": 6.887907876184863e-07, "loss": 1.0347, "step": 15565 }, { "epoch": 0.8933386883929084, "grad_norm": 2.959812451775001, "learning_rate": 6.851432441507377e-07, "loss": 0.9944, "step": 15570 }, { "epoch": 0.8936255665844283, "grad_norm": 3.31648999288441, "learning_rate": 6.815050416422309e-07, "loss": 1.0657, "step": 15575 }, { "epoch": 0.8939124447759481, "grad_norm": 3.343465992322855, "learning_rate": 6.778761837413627e-07, "loss": 1.0708, "step": 15580 }, { "epoch": 0.894199322967468, "grad_norm": 3.473196681968828, "learning_rate": 6.742566740871626e-07, "loss": 1.031, "step": 15585 }, { "epoch": 0.8944862011589879, "grad_norm": 3.7890308894308626, "learning_rate": 6.706465163092824e-07, "loss": 1.0924, "step": 15590 }, { "epoch": 0.8947730793505078, "grad_norm": 4.0213246560319655, "learning_rate": 6.67045714027994e-07, "loss": 1.0428, "step": 15595 }, { "epoch": 0.8950599575420276, "grad_norm": 3.2102187563933575, "learning_rate": 6.634542708541936e-07, "loss": 1.0302, "step": 15600 }, { "epoch": 0.8953468357335476, "grad_norm": 3.793853097687329, "learning_rate": 6.598721903893846e-07, "loss": 0.9984, "step": 15605 }, { "epoch": 0.8956337139250674, "grad_norm": 3.161455315753596, "learning_rate": 6.562994762256869e-07, "loss": 0.9366, "step": 15610 }, { "epoch": 0.8959205921165873, "grad_norm": 3.0693727567890314, "learning_rate": 6.527361319458292e-07, "loss": 1.0221, "step": 15615 }, { "epoch": 0.8962074703081072, "grad_norm": 3.414534365604592, "learning_rate": 6.491821611231364e-07, "loss": 1.0081, "step": 15620 }, { "epoch": 0.8964943484996271, "grad_norm": 3.903134824495282, "learning_rate": 6.456375673215409e-07, "loss": 1.0711, "step": 15625 }, { "epoch": 0.8967812266911469, "grad_norm": 3.0623766276645914, "learning_rate": 6.421023540955684e-07, "loss": 0.9339, "step": 15630 }, { "epoch": 0.8970681048826669, "grad_norm": 3.296768035994017, "learning_rate": 6.385765249903397e-07, "loss": 0.9931, "step": 15635 }, { "epoch": 0.8973549830741867, "grad_norm": 2.94649123156447, "learning_rate": 6.350600835415632e-07, "loss": 1.0235, "step": 15640 }, { "epoch": 0.8976418612657066, "grad_norm": 3.605010713860185, "learning_rate": 6.31553033275536e-07, "loss": 1.0143, "step": 15645 }, { "epoch": 0.8979287394572265, "grad_norm": 3.5427069125473527, "learning_rate": 6.280553777091336e-07, "loss": 1.0285, "step": 15650 }, { "epoch": 0.8982156176487464, "grad_norm": 3.0771127951250588, "learning_rate": 6.245671203498149e-07, "loss": 0.9885, "step": 15655 }, { "epoch": 0.8985024958402662, "grad_norm": 3.2776254181095745, "learning_rate": 6.210882646956084e-07, "loss": 0.9426, "step": 15660 }, { "epoch": 0.8987893740317862, "grad_norm": 3.1506917627009856, "learning_rate": 6.176188142351247e-07, "loss": 0.9917, "step": 15665 }, { "epoch": 0.899076252223306, "grad_norm": 3.6249452802186157, "learning_rate": 6.141587724475318e-07, "loss": 1.0267, "step": 15670 }, { "epoch": 0.8993631304148259, "grad_norm": 3.4187890526421145, "learning_rate": 6.107081428025675e-07, "loss": 1.0693, "step": 15675 }, { "epoch": 0.8996500086063457, "grad_norm": 3.646593604145915, "learning_rate": 6.072669287605327e-07, "loss": 1.102, "step": 15680 }, { "epoch": 0.8999368867978657, "grad_norm": 3.239914643917391, "learning_rate": 6.038351337722837e-07, "loss": 1.0038, "step": 15685 }, { "epoch": 0.9002237649893855, "grad_norm": 3.386271455551777, "learning_rate": 6.004127612792332e-07, "loss": 0.978, "step": 15690 }, { "epoch": 0.9005106431809053, "grad_norm": 3.314999093002035, "learning_rate": 5.969998147133415e-07, "loss": 1.0505, "step": 15695 }, { "epoch": 0.9007975213724253, "grad_norm": 2.9018401488546597, "learning_rate": 5.935962974971221e-07, "loss": 1.0142, "step": 15700 }, { "epoch": 0.9010843995639451, "grad_norm": 3.1455786305793576, "learning_rate": 5.902022130436269e-07, "loss": 0.9786, "step": 15705 }, { "epoch": 0.901371277755465, "grad_norm": 3.459405968036804, "learning_rate": 5.868175647564522e-07, "loss": 1.0455, "step": 15710 }, { "epoch": 0.901658155946985, "grad_norm": 3.1492121712757672, "learning_rate": 5.834423560297353e-07, "loss": 1.0528, "step": 15715 }, { "epoch": 0.9019450341385048, "grad_norm": 3.4728125191182784, "learning_rate": 5.800765902481365e-07, "loss": 1.0172, "step": 15720 }, { "epoch": 0.9022319123300246, "grad_norm": 2.996168333516266, "learning_rate": 5.767202707868558e-07, "loss": 1.0205, "step": 15725 }, { "epoch": 0.9025187905215446, "grad_norm": 3.259214063434123, "learning_rate": 5.733734010116187e-07, "loss": 0.982, "step": 15730 }, { "epoch": 0.9028056687130644, "grad_norm": 3.0767297251061048, "learning_rate": 5.700359842786729e-07, "loss": 0.9307, "step": 15735 }, { "epoch": 0.9030925469045843, "grad_norm": 3.214833605217749, "learning_rate": 5.667080239347888e-07, "loss": 1.0476, "step": 15740 }, { "epoch": 0.9033794250961042, "grad_norm": 3.1930808978555847, "learning_rate": 5.633895233172503e-07, "loss": 1.0431, "step": 15745 }, { "epoch": 0.9036663032876241, "grad_norm": 3.5410818126624037, "learning_rate": 5.600804857538589e-07, "loss": 1.0502, "step": 15750 }, { "epoch": 0.9039531814791439, "grad_norm": 3.186157826439203, "learning_rate": 5.567809145629244e-07, "loss": 1.0097, "step": 15755 }, { "epoch": 0.9042400596706638, "grad_norm": 3.4409503205251775, "learning_rate": 5.534908130532624e-07, "loss": 1.026, "step": 15760 }, { "epoch": 0.9045269378621837, "grad_norm": 3.533616401250333, "learning_rate": 5.50210184524198e-07, "loss": 0.9851, "step": 15765 }, { "epoch": 0.9048138160537036, "grad_norm": 3.163716199173939, "learning_rate": 5.469390322655499e-07, "loss": 1.0061, "step": 15770 }, { "epoch": 0.9051006942452234, "grad_norm": 3.148077536817484, "learning_rate": 5.436773595576361e-07, "loss": 0.9699, "step": 15775 }, { "epoch": 0.9053875724367434, "grad_norm": 3.610239090234733, "learning_rate": 5.404251696712715e-07, "loss": 1.0881, "step": 15780 }, { "epoch": 0.9056744506282632, "grad_norm": 3.856415476062301, "learning_rate": 5.371824658677594e-07, "loss": 1.0727, "step": 15785 }, { "epoch": 0.9059613288197831, "grad_norm": 3.2449629393906525, "learning_rate": 5.339492513988898e-07, "loss": 1.0094, "step": 15790 }, { "epoch": 0.906248207011303, "grad_norm": 2.8755633031840704, "learning_rate": 5.307255295069369e-07, "loss": 0.9925, "step": 15795 }, { "epoch": 0.9065350852028229, "grad_norm": 3.4296429010969347, "learning_rate": 5.275113034246571e-07, "loss": 1.0844, "step": 15800 }, { "epoch": 0.9068219633943427, "grad_norm": 3.340634387581154, "learning_rate": 5.243065763752819e-07, "loss": 1.0412, "step": 15805 }, { "epoch": 0.9071088415858627, "grad_norm": 3.5080320549638953, "learning_rate": 5.211113515725208e-07, "loss": 1.0549, "step": 15810 }, { "epoch": 0.9073957197773825, "grad_norm": 3.324468767223721, "learning_rate": 5.179256322205539e-07, "loss": 1.085, "step": 15815 }, { "epoch": 0.9076825979689024, "grad_norm": 3.747612549935855, "learning_rate": 5.147494215140236e-07, "loss": 1.0596, "step": 15820 }, { "epoch": 0.9079694761604223, "grad_norm": 2.9379510525238968, "learning_rate": 5.115827226380421e-07, "loss": 0.9321, "step": 15825 }, { "epoch": 0.9082563543519422, "grad_norm": 3.579305185479052, "learning_rate": 5.084255387681836e-07, "loss": 0.9907, "step": 15830 }, { "epoch": 0.908543232543462, "grad_norm": 3.966618888843476, "learning_rate": 5.052778730704788e-07, "loss": 1.0504, "step": 15835 }, { "epoch": 0.9088301107349819, "grad_norm": 3.7109297843126026, "learning_rate": 5.021397287014129e-07, "loss": 1.0681, "step": 15840 }, { "epoch": 0.9091169889265018, "grad_norm": 3.348490045633641, "learning_rate": 4.990111088079264e-07, "loss": 1.0005, "step": 15845 }, { "epoch": 0.9094038671180217, "grad_norm": 3.2858621757795268, "learning_rate": 4.958920165274039e-07, "loss": 1.0976, "step": 15850 }, { "epoch": 0.9096907453095415, "grad_norm": 3.8376432225682295, "learning_rate": 4.927824549876781e-07, "loss": 1.0371, "step": 15855 }, { "epoch": 0.9099776235010615, "grad_norm": 3.1762583929006376, "learning_rate": 4.896824273070256e-07, "loss": 0.9316, "step": 15860 }, { "epoch": 0.9102645016925813, "grad_norm": 3.4419765083230507, "learning_rate": 4.865919365941629e-07, "loss": 1.1183, "step": 15865 }, { "epoch": 0.9105513798841012, "grad_norm": 3.3376461484392377, "learning_rate": 4.835109859482368e-07, "loss": 1.0407, "step": 15870 }, { "epoch": 0.9108382580756211, "grad_norm": 2.9851927873887516, "learning_rate": 4.804395784588334e-07, "loss": 1.0122, "step": 15875 }, { "epoch": 0.911125136267141, "grad_norm": 3.0999427117033433, "learning_rate": 4.77377717205969e-07, "loss": 1.0525, "step": 15880 }, { "epoch": 0.9114120144586608, "grad_norm": 3.057941534334267, "learning_rate": 4.743254052600821e-07, "loss": 0.9614, "step": 15885 }, { "epoch": 0.9116988926501808, "grad_norm": 3.8547392264729567, "learning_rate": 4.7128264568203853e-07, "loss": 0.9935, "step": 15890 }, { "epoch": 0.9119857708417006, "grad_norm": 3.562364601316341, "learning_rate": 4.6824944152312534e-07, "loss": 0.9529, "step": 15895 }, { "epoch": 0.9122726490332205, "grad_norm": 3.4945795718896013, "learning_rate": 4.652257958250461e-07, "loss": 1.0535, "step": 15900 }, { "epoch": 0.9125595272247404, "grad_norm": 3.050317126422796, "learning_rate": 4.622117116199187e-07, "loss": 0.9908, "step": 15905 }, { "epoch": 0.9128464054162603, "grad_norm": 3.215288118093743, "learning_rate": 4.592071919302743e-07, "loss": 1.0064, "step": 15910 }, { "epoch": 0.9131332836077801, "grad_norm": 3.6966144676298516, "learning_rate": 4.562122397690538e-07, "loss": 1.0046, "step": 15915 }, { "epoch": 0.9134201617993, "grad_norm": 3.331633645599991, "learning_rate": 4.532268581395982e-07, "loss": 0.9336, "step": 15920 }, { "epoch": 0.9137070399908199, "grad_norm": 3.375415198782261, "learning_rate": 4.5025105003565717e-07, "loss": 0.9663, "step": 15925 }, { "epoch": 0.9139939181823398, "grad_norm": 3.0597976864152847, "learning_rate": 4.4728481844137693e-07, "loss": 1.0562, "step": 15930 }, { "epoch": 0.9142807963738596, "grad_norm": 3.2420990586913923, "learning_rate": 4.443281663313026e-07, "loss": 1.0023, "step": 15935 }, { "epoch": 0.9145676745653796, "grad_norm": 3.5686658620531846, "learning_rate": 4.413810966703702e-07, "loss": 1.0868, "step": 15940 }, { "epoch": 0.9148545527568994, "grad_norm": 3.00759800163779, "learning_rate": 4.3844361241390797e-07, "loss": 0.9109, "step": 15945 }, { "epoch": 0.9151414309484193, "grad_norm": 2.9974477611778267, "learning_rate": 4.355157165076318e-07, "loss": 0.9818, "step": 15950 }, { "epoch": 0.9154283091399392, "grad_norm": 3.1620855538218913, "learning_rate": 4.325974118876408e-07, "loss": 1.0792, "step": 15955 }, { "epoch": 0.9157151873314591, "grad_norm": 3.3651149160092246, "learning_rate": 4.2968870148042076e-07, "loss": 1.0133, "step": 15960 }, { "epoch": 0.9160020655229789, "grad_norm": 3.514686789278442, "learning_rate": 4.267895882028328e-07, "loss": 1.044, "step": 15965 }, { "epoch": 0.9162889437144989, "grad_norm": 3.0902783624461403, "learning_rate": 4.239000749621092e-07, "loss": 1.01, "step": 15970 }, { "epoch": 0.9165758219060187, "grad_norm": 3.159053667311339, "learning_rate": 4.2102016465586535e-07, "loss": 1.0389, "step": 15975 }, { "epoch": 0.9168627000975386, "grad_norm": 3.2930604738196916, "learning_rate": 4.1814986017208013e-07, "loss": 0.9844, "step": 15980 }, { "epoch": 0.9171495782890585, "grad_norm": 3.31514901825254, "learning_rate": 4.1528916438910104e-07, "loss": 0.9738, "step": 15985 }, { "epoch": 0.9174364564805784, "grad_norm": 3.4613419684532225, "learning_rate": 4.1243808017564115e-07, "loss": 1.017, "step": 15990 }, { "epoch": 0.9177233346720982, "grad_norm": 3.499877472813896, "learning_rate": 4.0959661039077224e-07, "loss": 1.0704, "step": 15995 }, { "epoch": 0.9180102128636181, "grad_norm": 3.0598457465287097, "learning_rate": 4.0676475788392845e-07, "loss": 1.0308, "step": 16000 }, { "epoch": 0.918297091055138, "grad_norm": 3.654333336698881, "learning_rate": 4.039425254948959e-07, "loss": 1.0645, "step": 16005 }, { "epoch": 0.9185839692466579, "grad_norm": 3.517694449671513, "learning_rate": 4.0112991605381847e-07, "loss": 1.0868, "step": 16010 }, { "epoch": 0.9188708474381777, "grad_norm": 3.155748332395742, "learning_rate": 3.9832693238118556e-07, "loss": 0.9957, "step": 16015 }, { "epoch": 0.9191577256296977, "grad_norm": 3.329825358214544, "learning_rate": 3.9553357728783434e-07, "loss": 0.9758, "step": 16020 }, { "epoch": 0.9194446038212175, "grad_norm": 2.9848896572668635, "learning_rate": 3.9274985357494856e-07, "loss": 1.0859, "step": 16025 }, { "epoch": 0.9197314820127374, "grad_norm": 3.357348732752412, "learning_rate": 3.89975764034054e-07, "loss": 0.9666, "step": 16030 }, { "epoch": 0.9200183602042573, "grad_norm": 3.8213560613918407, "learning_rate": 3.872113114470122e-07, "loss": 1.0804, "step": 16035 }, { "epoch": 0.9203052383957772, "grad_norm": 3.272022898868998, "learning_rate": 3.8445649858602217e-07, "loss": 1.0273, "step": 16040 }, { "epoch": 0.920592116587297, "grad_norm": 3.160301516680516, "learning_rate": 3.817113282136176e-07, "loss": 0.9318, "step": 16045 }, { "epoch": 0.920878994778817, "grad_norm": 3.320716273837095, "learning_rate": 3.7897580308265955e-07, "loss": 1.0336, "step": 16050 }, { "epoch": 0.9211658729703368, "grad_norm": 3.3161837149636106, "learning_rate": 3.762499259363417e-07, "loss": 0.9963, "step": 16055 }, { "epoch": 0.9214527511618567, "grad_norm": 3.499786492464985, "learning_rate": 3.7353369950817954e-07, "loss": 1.0337, "step": 16060 }, { "epoch": 0.9217396293533766, "grad_norm": 3.3980212861493713, "learning_rate": 3.708271265220087e-07, "loss": 1.0346, "step": 16065 }, { "epoch": 0.9220265075448965, "grad_norm": 3.7662532529192805, "learning_rate": 3.6813020969198586e-07, "loss": 1.0166, "step": 16070 }, { "epoch": 0.9223133857364163, "grad_norm": 3.3740408158763655, "learning_rate": 3.654429517225877e-07, "loss": 0.9899, "step": 16075 }, { "epoch": 0.9226002639279361, "grad_norm": 3.3676839119705013, "learning_rate": 3.62765355308603e-07, "loss": 0.9701, "step": 16080 }, { "epoch": 0.9228871421194561, "grad_norm": 3.0961248245635633, "learning_rate": 3.6009742313513063e-07, "loss": 0.9807, "step": 16085 }, { "epoch": 0.923174020310976, "grad_norm": 3.748873219743657, "learning_rate": 3.574391578775771e-07, "loss": 1.0064, "step": 16090 }, { "epoch": 0.9234608985024958, "grad_norm": 3.1573502249942442, "learning_rate": 3.5479056220166006e-07, "loss": 1.1051, "step": 16095 }, { "epoch": 0.9237477766940158, "grad_norm": 3.047411028442806, "learning_rate": 3.5215163876339273e-07, "loss": 0.924, "step": 16100 }, { "epoch": 0.9240346548855356, "grad_norm": 3.7018208677629993, "learning_rate": 3.495223902090983e-07, "loss": 1.0349, "step": 16105 }, { "epoch": 0.9243215330770554, "grad_norm": 3.6051859162408233, "learning_rate": 3.46902819175392e-07, "loss": 1.0172, "step": 16110 }, { "epoch": 0.9246084112685754, "grad_norm": 3.2770047869944943, "learning_rate": 3.442929282891827e-07, "loss": 1.0599, "step": 16115 }, { "epoch": 0.9248952894600952, "grad_norm": 3.425657111227946, "learning_rate": 3.4169272016767674e-07, "loss": 0.9718, "step": 16120 }, { "epoch": 0.9251821676516151, "grad_norm": 3.5112971750266992, "learning_rate": 3.3910219741836947e-07, "loss": 1.0073, "step": 16125 }, { "epoch": 0.925469045843135, "grad_norm": 3.171701311825449, "learning_rate": 3.365213626390418e-07, "loss": 0.9967, "step": 16130 }, { "epoch": 0.9257559240346549, "grad_norm": 3.657790244652206, "learning_rate": 3.339502184177612e-07, "loss": 1.0694, "step": 16135 }, { "epoch": 0.9260428022261747, "grad_norm": 3.9956030446431523, "learning_rate": 3.313887673328764e-07, "loss": 1.0389, "step": 16140 }, { "epoch": 0.9263296804176947, "grad_norm": 4.140876753012415, "learning_rate": 3.2883701195301597e-07, "loss": 1.0252, "step": 16145 }, { "epoch": 0.9266165586092145, "grad_norm": 2.999212070485959, "learning_rate": 3.262949548370853e-07, "loss": 1.0366, "step": 16150 }, { "epoch": 0.9269034368007344, "grad_norm": 3.4955239328780006, "learning_rate": 3.237625985342674e-07, "loss": 1.0385, "step": 16155 }, { "epoch": 0.9271903149922542, "grad_norm": 3.5334103492863247, "learning_rate": 3.212399455840154e-07, "loss": 1.0156, "step": 16160 }, { "epoch": 0.9274771931837742, "grad_norm": 3.352061298373847, "learning_rate": 3.1872699851604573e-07, "loss": 1.007, "step": 16165 }, { "epoch": 0.927764071375294, "grad_norm": 3.2409562978994546, "learning_rate": 3.1622375985035367e-07, "loss": 1.0096, "step": 16170 }, { "epoch": 0.9280509495668139, "grad_norm": 3.422888402426888, "learning_rate": 3.1373023209718913e-07, "loss": 0.9978, "step": 16175 }, { "epoch": 0.9283378277583338, "grad_norm": 3.5354127502976898, "learning_rate": 3.112464177570662e-07, "loss": 0.9977, "step": 16180 }, { "epoch": 0.9286247059498537, "grad_norm": 2.955355122438612, "learning_rate": 3.087723193207648e-07, "loss": 0.984, "step": 16185 }, { "epoch": 0.9289115841413735, "grad_norm": 3.6408829350813336, "learning_rate": 3.0630793926931136e-07, "loss": 0.9803, "step": 16190 }, { "epoch": 0.9291984623328935, "grad_norm": 3.345583929170256, "learning_rate": 3.038532800739935e-07, "loss": 0.9816, "step": 16195 }, { "epoch": 0.9294853405244133, "grad_norm": 3.1678992181235723, "learning_rate": 3.0140834419634777e-07, "loss": 1.0424, "step": 16200 }, { "epoch": 0.9297722187159332, "grad_norm": 3.6655040491254174, "learning_rate": 2.989731340881641e-07, "loss": 1.0242, "step": 16205 }, { "epoch": 0.9300590969074531, "grad_norm": 3.3770248151797855, "learning_rate": 2.965476521914756e-07, "loss": 0.9112, "step": 16210 }, { "epoch": 0.930345975098973, "grad_norm": 3.1429279154487664, "learning_rate": 2.9413190093855794e-07, "loss": 0.9762, "step": 16215 }, { "epoch": 0.9306328532904928, "grad_norm": 3.24555069389365, "learning_rate": 2.9172588275193536e-07, "loss": 0.976, "step": 16220 }, { "epoch": 0.9309197314820128, "grad_norm": 3.5590016705896685, "learning_rate": 2.8932960004436795e-07, "loss": 1.0481, "step": 16225 }, { "epoch": 0.9312066096735326, "grad_norm": 3.5463689551011512, "learning_rate": 2.8694305521885014e-07, "loss": 1.0188, "step": 16230 }, { "epoch": 0.9314934878650525, "grad_norm": 3.3247859148365597, "learning_rate": 2.8456625066861977e-07, "loss": 0.9919, "step": 16235 }, { "epoch": 0.9317803660565723, "grad_norm": 3.0579881052690796, "learning_rate": 2.8219918877713806e-07, "loss": 0.9801, "step": 16240 }, { "epoch": 0.9320672442480923, "grad_norm": 3.1619170726876606, "learning_rate": 2.7984187191810066e-07, "loss": 0.9667, "step": 16245 }, { "epoch": 0.9323541224396121, "grad_norm": 3.325673080493218, "learning_rate": 2.7749430245542994e-07, "loss": 0.9958, "step": 16250 }, { "epoch": 0.932641000631132, "grad_norm": 3.5977592516314756, "learning_rate": 2.751564827432751e-07, "loss": 1.0478, "step": 16255 }, { "epoch": 0.9329278788226519, "grad_norm": 3.4588290432572033, "learning_rate": 2.728284151260063e-07, "loss": 0.9333, "step": 16260 }, { "epoch": 0.9332147570141718, "grad_norm": 3.7337522657391897, "learning_rate": 2.7051010193821393e-07, "loss": 1.0686, "step": 16265 }, { "epoch": 0.9335016352056916, "grad_norm": 3.7964644858484604, "learning_rate": 2.682015455047093e-07, "loss": 1.0877, "step": 16270 }, { "epoch": 0.9337885133972116, "grad_norm": 3.818876858498564, "learning_rate": 2.659027481405163e-07, "loss": 0.9824, "step": 16275 }, { "epoch": 0.9340753915887314, "grad_norm": 3.6050265254425136, "learning_rate": 2.636137121508753e-07, "loss": 1.0644, "step": 16280 }, { "epoch": 0.9343622697802513, "grad_norm": 3.917739816262115, "learning_rate": 2.6133443983123783e-07, "loss": 1.1094, "step": 16285 }, { "epoch": 0.9346491479717712, "grad_norm": 3.69248753880542, "learning_rate": 2.5906493346726127e-07, "loss": 0.9754, "step": 16290 }, { "epoch": 0.9349360261632911, "grad_norm": 3.719427069910604, "learning_rate": 2.5680519533481055e-07, "loss": 1.0047, "step": 16295 }, { "epoch": 0.9352229043548109, "grad_norm": 3.3124631191609044, "learning_rate": 2.5455522769995967e-07, "loss": 1.048, "step": 16300 }, { "epoch": 0.9355097825463309, "grad_norm": 3.5987114215336344, "learning_rate": 2.5231503281897827e-07, "loss": 0.9976, "step": 16305 }, { "epoch": 0.9357966607378507, "grad_norm": 3.0662521611320797, "learning_rate": 2.5008461293834164e-07, "loss": 1.0246, "step": 16310 }, { "epoch": 0.9360835389293706, "grad_norm": 3.143497617171298, "learning_rate": 2.478639702947172e-07, "loss": 1.0331, "step": 16315 }, { "epoch": 0.9363704171208904, "grad_norm": 3.516419941407218, "learning_rate": 2.4565310711497146e-07, "loss": 1.0547, "step": 16320 }, { "epoch": 0.9366572953124104, "grad_norm": 3.4929249296353815, "learning_rate": 2.434520256161632e-07, "loss": 1.0159, "step": 16325 }, { "epoch": 0.9369441735039302, "grad_norm": 3.606205392540055, "learning_rate": 2.4126072800554015e-07, "loss": 1.0481, "step": 16330 }, { "epoch": 0.9372310516954501, "grad_norm": 3.1454881476347025, "learning_rate": 2.3907921648054335e-07, "loss": 0.9474, "step": 16335 }, { "epoch": 0.93751792988697, "grad_norm": 3.99896927472956, "learning_rate": 2.3690749322879626e-07, "loss": 1.1079, "step": 16340 }, { "epoch": 0.9378048080784899, "grad_norm": 3.166605528911609, "learning_rate": 2.347455604281057e-07, "loss": 1.012, "step": 16345 }, { "epoch": 0.9380916862700097, "grad_norm": 3.5595844568104456, "learning_rate": 2.3259342024646524e-07, "loss": 1.0039, "step": 16350 }, { "epoch": 0.9383785644615297, "grad_norm": 3.621001626674501, "learning_rate": 2.304510748420463e-07, "loss": 1.073, "step": 16355 }, { "epoch": 0.9386654426530495, "grad_norm": 2.9272911747023973, "learning_rate": 2.2831852636319597e-07, "loss": 1.0065, "step": 16360 }, { "epoch": 0.9389523208445694, "grad_norm": 3.0589147561610766, "learning_rate": 2.2619577694843907e-07, "loss": 0.9657, "step": 16365 }, { "epoch": 0.9392391990360893, "grad_norm": 3.596055657751487, "learning_rate": 2.2408282872647292e-07, "loss": 0.9676, "step": 16370 }, { "epoch": 0.9395260772276092, "grad_norm": 3.283293749593224, "learning_rate": 2.2197968381616807e-07, "loss": 0.9544, "step": 16375 }, { "epoch": 0.939812955419129, "grad_norm": 3.3039815540054507, "learning_rate": 2.1988634432656198e-07, "loss": 1.0539, "step": 16380 }, { "epoch": 0.940099833610649, "grad_norm": 3.5379821128755067, "learning_rate": 2.1780281235686206e-07, "loss": 0.9665, "step": 16385 }, { "epoch": 0.9403867118021688, "grad_norm": 3.1709282601341973, "learning_rate": 2.1572908999643706e-07, "loss": 0.9836, "step": 16390 }, { "epoch": 0.9406735899936887, "grad_norm": 3.4200140993662074, "learning_rate": 2.13665179324819e-07, "loss": 0.9771, "step": 16395 }, { "epoch": 0.9409604681852085, "grad_norm": 3.2133770448351346, "learning_rate": 2.1161108241170458e-07, "loss": 0.9313, "step": 16400 }, { "epoch": 0.9412473463767285, "grad_norm": 3.503796842204677, "learning_rate": 2.0956680131694608e-07, "loss": 1.0085, "step": 16405 }, { "epoch": 0.9415342245682483, "grad_norm": 3.5882154401774145, "learning_rate": 2.075323380905536e-07, "loss": 1.0317, "step": 16410 }, { "epoch": 0.9418211027597682, "grad_norm": 3.9443984712995417, "learning_rate": 2.0550769477269084e-07, "loss": 1.0346, "step": 16415 }, { "epoch": 0.9421079809512881, "grad_norm": 3.440526177016731, "learning_rate": 2.0349287339367362e-07, "loss": 1.0737, "step": 16420 }, { "epoch": 0.942394859142808, "grad_norm": 3.4841209289403894, "learning_rate": 2.0148787597397135e-07, "loss": 1.0263, "step": 16425 }, { "epoch": 0.9426817373343278, "grad_norm": 2.9346825047112874, "learning_rate": 1.99492704524199e-07, "loss": 1.0429, "step": 16430 }, { "epoch": 0.9429686155258478, "grad_norm": 2.9638802350968807, "learning_rate": 1.9750736104511947e-07, "loss": 1.0089, "step": 16435 }, { "epoch": 0.9432554937173676, "grad_norm": 3.161151717577515, "learning_rate": 1.955318475276391e-07, "loss": 0.9911, "step": 16440 }, { "epoch": 0.9435423719088875, "grad_norm": 3.1189916029478773, "learning_rate": 1.935661659528054e-07, "loss": 1.016, "step": 16445 }, { "epoch": 0.9438292501004074, "grad_norm": 3.9198315794652787, "learning_rate": 1.9161031829181275e-07, "loss": 1.0325, "step": 16450 }, { "epoch": 0.9441161282919273, "grad_norm": 3.6737714545352955, "learning_rate": 1.8966430650598556e-07, "loss": 1.0349, "step": 16455 }, { "epoch": 0.9444030064834471, "grad_norm": 3.5554725761179395, "learning_rate": 1.8772813254679166e-07, "loss": 0.98, "step": 16460 }, { "epoch": 0.9446898846749671, "grad_norm": 2.9030808023971937, "learning_rate": 1.85801798355828e-07, "loss": 1.0342, "step": 16465 }, { "epoch": 0.9449767628664869, "grad_norm": 3.4830372059436256, "learning_rate": 1.8388530586482932e-07, "loss": 1.0187, "step": 16470 }, { "epoch": 0.9452636410580068, "grad_norm": 3.1208673080618916, "learning_rate": 1.8197865699565498e-07, "loss": 1.0226, "step": 16475 }, { "epoch": 0.9455505192495266, "grad_norm": 3.023010445395173, "learning_rate": 1.8008185366030219e-07, "loss": 1.0013, "step": 16480 }, { "epoch": 0.9458373974410466, "grad_norm": 3.4593628761058453, "learning_rate": 1.7819489776088494e-07, "loss": 1.0044, "step": 16485 }, { "epoch": 0.9461242756325664, "grad_norm": 3.542336690606002, "learning_rate": 1.7631779118964855e-07, "loss": 1.0213, "step": 16490 }, { "epoch": 0.9464111538240862, "grad_norm": 3.7492795477820247, "learning_rate": 1.7445053582895942e-07, "loss": 1.0611, "step": 16495 }, { "epoch": 0.9466980320156062, "grad_norm": 3.4815306017852374, "learning_rate": 1.7259313355130648e-07, "loss": 0.9984, "step": 16500 }, { "epoch": 0.946984910207126, "grad_norm": 3.2170931463808095, "learning_rate": 1.7074558621929526e-07, "loss": 1.0299, "step": 16505 }, { "epoch": 0.9472717883986459, "grad_norm": 3.537261411489333, "learning_rate": 1.6890789568565158e-07, "loss": 0.9685, "step": 16510 }, { "epoch": 0.9475586665901659, "grad_norm": 3.5355775130610287, "learning_rate": 1.6708006379321462e-07, "loss": 1.0052, "step": 16515 }, { "epoch": 0.9478455447816857, "grad_norm": 3.005035402968236, "learning_rate": 1.652620923749393e-07, "loss": 1.0158, "step": 16520 }, { "epoch": 0.9481324229732055, "grad_norm": 3.3368023848080504, "learning_rate": 1.634539832538895e-07, "loss": 1.0759, "step": 16525 }, { "epoch": 0.9484193011647255, "grad_norm": 3.6303732151413497, "learning_rate": 1.6165573824324486e-07, "loss": 1.0574, "step": 16530 }, { "epoch": 0.9487061793562453, "grad_norm": 3.1706360768582846, "learning_rate": 1.5986735914628626e-07, "loss": 1.0304, "step": 16535 }, { "epoch": 0.9489930575477652, "grad_norm": 3.553666395136711, "learning_rate": 1.5808884775640464e-07, "loss": 0.9789, "step": 16540 }, { "epoch": 0.9492799357392852, "grad_norm": 3.546729015098772, "learning_rate": 1.5632020585709672e-07, "loss": 0.9756, "step": 16545 }, { "epoch": 0.949566813930805, "grad_norm": 3.3843933445295815, "learning_rate": 1.545614352219593e-07, "loss": 1.0673, "step": 16550 }, { "epoch": 0.9498536921223248, "grad_norm": 3.284932515134134, "learning_rate": 1.5281253761469162e-07, "loss": 0.946, "step": 16555 }, { "epoch": 0.9501405703138447, "grad_norm": 3.581866942529998, "learning_rate": 1.5107351478909293e-07, "loss": 0.9938, "step": 16560 }, { "epoch": 0.9504274485053646, "grad_norm": 3.2944443729417108, "learning_rate": 1.4934436848905832e-07, "loss": 1.0225, "step": 16565 }, { "epoch": 0.9507143266968845, "grad_norm": 3.326441515749694, "learning_rate": 1.476251004485796e-07, "loss": 0.9842, "step": 16570 }, { "epoch": 0.9510012048884043, "grad_norm": 2.92246789901786, "learning_rate": 1.459157123917432e-07, "loss": 1.0245, "step": 16575 }, { "epoch": 0.9512880830799243, "grad_norm": 2.996748823661829, "learning_rate": 1.442162060327279e-07, "loss": 0.9697, "step": 16580 }, { "epoch": 0.9515749612714441, "grad_norm": 3.2707464013409098, "learning_rate": 1.4252658307580046e-07, "loss": 1.0358, "step": 16585 }, { "epoch": 0.951861839462964, "grad_norm": 4.111556590962565, "learning_rate": 1.4084684521531888e-07, "loss": 1.0439, "step": 16590 }, { "epoch": 0.952148717654484, "grad_norm": 3.4854178594445644, "learning_rate": 1.3917699413573016e-07, "loss": 1.0508, "step": 16595 }, { "epoch": 0.9524355958460038, "grad_norm": 3.085684230577412, "learning_rate": 1.375170315115637e-07, "loss": 1.0557, "step": 16600 }, { "epoch": 0.9527224740375236, "grad_norm": 3.2905500666048044, "learning_rate": 1.3586695900743353e-07, "loss": 1.0882, "step": 16605 }, { "epoch": 0.9530093522290436, "grad_norm": 3.500158725297431, "learning_rate": 1.34226778278036e-07, "loss": 1.0806, "step": 16610 }, { "epoch": 0.9532962304205634, "grad_norm": 3.1604785154112527, "learning_rate": 1.3259649096814763e-07, "loss": 1.0065, "step": 16615 }, { "epoch": 0.9535831086120833, "grad_norm": 3.6561913946684963, "learning_rate": 1.3097609871262296e-07, "loss": 1.0618, "step": 16620 }, { "epoch": 0.9538699868036032, "grad_norm": 3.3291498516552513, "learning_rate": 1.2936560313639879e-07, "loss": 1.0301, "step": 16625 }, { "epoch": 0.9541568649951231, "grad_norm": 3.058915497801384, "learning_rate": 1.2776500585448215e-07, "loss": 1.0121, "step": 16630 }, { "epoch": 0.9544437431866429, "grad_norm": 3.5334267920688642, "learning_rate": 1.2617430847195356e-07, "loss": 0.9336, "step": 16635 }, { "epoch": 0.9547306213781628, "grad_norm": 3.2065015859320334, "learning_rate": 1.245935125839681e-07, "loss": 1.0507, "step": 16640 }, { "epoch": 0.9550174995696827, "grad_norm": 3.1572354719088005, "learning_rate": 1.2302261977575446e-07, "loss": 0.9926, "step": 16645 }, { "epoch": 0.9553043777612026, "grad_norm": 3.346396702059747, "learning_rate": 1.2146163162260581e-07, "loss": 1.1468, "step": 16650 }, { "epoch": 0.9555912559527224, "grad_norm": 3.2002953612568974, "learning_rate": 1.1991054968988335e-07, "loss": 1.0105, "step": 16655 }, { "epoch": 0.9558781341442424, "grad_norm": 3.4476254797309, "learning_rate": 1.1836937553301731e-07, "loss": 0.945, "step": 16660 }, { "epoch": 0.9561650123357622, "grad_norm": 3.55062903107159, "learning_rate": 1.1683811069749917e-07, "loss": 1.0799, "step": 16665 }, { "epoch": 0.9564518905272821, "grad_norm": 3.7956120336707073, "learning_rate": 1.1531675671888621e-07, "loss": 1.0099, "step": 16670 }, { "epoch": 0.956738768718802, "grad_norm": 3.4650463557170106, "learning_rate": 1.138053151227947e-07, "loss": 1.0229, "step": 16675 }, { "epoch": 0.9570256469103219, "grad_norm": 3.3902435613232598, "learning_rate": 1.1230378742490223e-07, "loss": 1.0003, "step": 16680 }, { "epoch": 0.9573125251018417, "grad_norm": 3.3386238949020237, "learning_rate": 1.1081217513094211e-07, "loss": 0.9831, "step": 16685 }, { "epoch": 0.9575994032933617, "grad_norm": 4.16300773546596, "learning_rate": 1.0933047973670896e-07, "loss": 1.029, "step": 16690 }, { "epoch": 0.9578862814848815, "grad_norm": 3.527907465383239, "learning_rate": 1.0785870272804977e-07, "loss": 0.9901, "step": 16695 }, { "epoch": 0.9581731596764014, "grad_norm": 3.5143005269814322, "learning_rate": 1.0639684558086505e-07, "loss": 1.0105, "step": 16700 }, { "epoch": 0.9584600378679213, "grad_norm": 3.603826755455907, "learning_rate": 1.0494490976110883e-07, "loss": 1.0587, "step": 16705 }, { "epoch": 0.9587469160594412, "grad_norm": 3.6630516733401137, "learning_rate": 1.0350289672478642e-07, "loss": 1.0342, "step": 16710 }, { "epoch": 0.959033794250961, "grad_norm": 3.3657531478004024, "learning_rate": 1.0207080791794999e-07, "loss": 1.0309, "step": 16715 }, { "epoch": 0.9593206724424809, "grad_norm": 3.388531857377306, "learning_rate": 1.0064864477670189e-07, "loss": 1.0133, "step": 16720 }, { "epoch": 0.9596075506340008, "grad_norm": 3.464912490466396, "learning_rate": 9.923640872719131e-08, "loss": 0.9916, "step": 16725 }, { "epoch": 0.9598944288255207, "grad_norm": 3.037763310140967, "learning_rate": 9.78341011856121e-08, "loss": 1.0065, "step": 16730 }, { "epoch": 0.9601813070170405, "grad_norm": 3.501549343907844, "learning_rate": 9.644172355819936e-08, "loss": 1.133, "step": 16735 }, { "epoch": 0.9604681852085605, "grad_norm": 3.2602067773759167, "learning_rate": 9.505927724123509e-08, "loss": 1.0007, "step": 16740 }, { "epoch": 0.9607550634000803, "grad_norm": 3.3840338254922684, "learning_rate": 9.368676362103701e-08, "loss": 1.0449, "step": 16745 }, { "epoch": 0.9610419415916002, "grad_norm": 3.2662104429704493, "learning_rate": 9.232418407396638e-08, "loss": 1.0476, "step": 16750 }, { "epoch": 0.9613288197831201, "grad_norm": 2.9861336482078666, "learning_rate": 9.097153996642238e-08, "loss": 1.0084, "step": 16755 }, { "epoch": 0.96161569797464, "grad_norm": 3.6415768673222177, "learning_rate": 8.962883265483669e-08, "loss": 1.0596, "step": 16760 }, { "epoch": 0.9619025761661598, "grad_norm": 3.504855504458576, "learning_rate": 8.829606348567999e-08, "loss": 1.021, "step": 16765 }, { "epoch": 0.9621894543576798, "grad_norm": 3.490746842413399, "learning_rate": 8.697323379545652e-08, "loss": 1.004, "step": 16770 }, { "epoch": 0.9624763325491996, "grad_norm": 3.351408967432194, "learning_rate": 8.566034491070407e-08, "loss": 1.0293, "step": 16775 }, { "epoch": 0.9627632107407195, "grad_norm": 2.9578059514565602, "learning_rate": 8.435739814798949e-08, "loss": 1.0301, "step": 16780 }, { "epoch": 0.9630500889322394, "grad_norm": 3.3349459138470334, "learning_rate": 8.306439481390871e-08, "loss": 1.0118, "step": 16785 }, { "epoch": 0.9633369671237593, "grad_norm": 3.5334338987982075, "learning_rate": 8.178133620509232e-08, "loss": 0.9897, "step": 16790 }, { "epoch": 0.9636238453152791, "grad_norm": 2.9102180128554127, "learning_rate": 8.050822360819222e-08, "loss": 1.1079, "step": 16795 }, { "epoch": 0.963910723506799, "grad_norm": 3.2345027221171576, "learning_rate": 7.924505829988716e-08, "loss": 1.0069, "step": 16800 }, { "epoch": 0.9641976016983189, "grad_norm": 3.024672030439771, "learning_rate": 7.79918415468861e-08, "loss": 0.9468, "step": 16805 }, { "epoch": 0.9644844798898388, "grad_norm": 3.3165022424033, "learning_rate": 7.674857460591378e-08, "loss": 1.0041, "step": 16810 }, { "epoch": 0.9647713580813586, "grad_norm": 3.311240044454259, "learning_rate": 7.551525872372289e-08, "loss": 1.0437, "step": 16815 }, { "epoch": 0.9650582362728786, "grad_norm": 3.7040343635762794, "learning_rate": 7.429189513708523e-08, "loss": 1.1004, "step": 16820 }, { "epoch": 0.9653451144643984, "grad_norm": 3.257033103892997, "learning_rate": 7.30784850727917e-08, "loss": 1.0128, "step": 16825 }, { "epoch": 0.9656319926559183, "grad_norm": 3.181038844003521, "learning_rate": 7.187502974765448e-08, "loss": 0.9792, "step": 16830 }, { "epoch": 0.9659188708474382, "grad_norm": 3.203524364074193, "learning_rate": 7.068153036849934e-08, "loss": 1.0386, "step": 16835 }, { "epoch": 0.9662057490389581, "grad_norm": 3.7025523068103645, "learning_rate": 6.949798813217002e-08, "loss": 1.0037, "step": 16840 }, { "epoch": 0.9664926272304779, "grad_norm": 3.78948212559206, "learning_rate": 6.832440422552711e-08, "loss": 1.0521, "step": 16845 }, { "epoch": 0.9667795054219979, "grad_norm": 3.1381124557525326, "learning_rate": 6.716077982544256e-08, "loss": 0.9984, "step": 16850 }, { "epoch": 0.9670663836135177, "grad_norm": 3.2638718681493426, "learning_rate": 6.600711609880073e-08, "loss": 1.0067, "step": 16855 }, { "epoch": 0.9673532618050376, "grad_norm": 3.1984200282959274, "learning_rate": 6.486341420249842e-08, "loss": 1.0512, "step": 16860 }, { "epoch": 0.9676401399965575, "grad_norm": 3.3406668228425573, "learning_rate": 6.372967528344265e-08, "loss": 1.0601, "step": 16865 }, { "epoch": 0.9679270181880774, "grad_norm": 3.0724355889400408, "learning_rate": 6.260590047854953e-08, "loss": 1.0886, "step": 16870 }, { "epoch": 0.9682138963795972, "grad_norm": 3.490537832380911, "learning_rate": 6.149209091474317e-08, "loss": 0.9627, "step": 16875 }, { "epoch": 0.968500774571117, "grad_norm": 3.3700805548713624, "learning_rate": 6.038824770895457e-08, "loss": 1.0468, "step": 16880 }, { "epoch": 0.968787652762637, "grad_norm": 3.937306866068547, "learning_rate": 5.929437196811827e-08, "loss": 0.9531, "step": 16885 }, { "epoch": 0.9690745309541569, "grad_norm": 3.546582541846896, "learning_rate": 5.821046478917791e-08, "loss": 1.046, "step": 16890 }, { "epoch": 0.9693614091456767, "grad_norm": 3.0919267502172763, "learning_rate": 5.713652725907626e-08, "loss": 1.0911, "step": 16895 }, { "epoch": 0.9696482873371967, "grad_norm": 3.311507474901811, "learning_rate": 5.6072560454759615e-08, "loss": 0.9968, "step": 16900 }, { "epoch": 0.9699351655287165, "grad_norm": 3.3605841403792427, "learning_rate": 5.501856544317896e-08, "loss": 0.9734, "step": 16905 }, { "epoch": 0.9702220437202363, "grad_norm": 3.836272520049859, "learning_rate": 5.397454328128104e-08, "loss": 1.0033, "step": 16910 }, { "epoch": 0.9705089219117563, "grad_norm": 3.2237908468376166, "learning_rate": 5.294049501601284e-08, "loss": 0.9958, "step": 16915 }, { "epoch": 0.9707958001032762, "grad_norm": 2.9821956103126563, "learning_rate": 5.1916421684321536e-08, "loss": 1.0408, "step": 16920 }, { "epoch": 0.971082678294796, "grad_norm": 3.2249582691685967, "learning_rate": 5.090232431315123e-08, "loss": 1.021, "step": 16925 }, { "epoch": 0.971369556486316, "grad_norm": 3.2168181889222485, "learning_rate": 4.9898203919438445e-08, "loss": 1.0257, "step": 16930 }, { "epoch": 0.9716564346778358, "grad_norm": 3.3833638563153756, "learning_rate": 4.890406151011884e-08, "loss": 1.0409, "step": 16935 }, { "epoch": 0.9719433128693556, "grad_norm": 2.936479243100045, "learning_rate": 4.7919898082121607e-08, "loss": 1.0667, "step": 16940 }, { "epoch": 0.9722301910608756, "grad_norm": 3.429910715379141, "learning_rate": 4.694571462236619e-08, "loss": 0.9867, "step": 16945 }, { "epoch": 0.9725170692523954, "grad_norm": 3.4308391438300996, "learning_rate": 4.598151210776669e-08, "loss": 1.0191, "step": 16950 }, { "epoch": 0.9728039474439153, "grad_norm": 2.860349840870402, "learning_rate": 4.5027291505227446e-08, "loss": 0.9091, "step": 16955 }, { "epoch": 0.9730908256354351, "grad_norm": 3.457818040428893, "learning_rate": 4.408305377164302e-08, "loss": 0.9628, "step": 16960 }, { "epoch": 0.9733777038269551, "grad_norm": 3.0170916467110973, "learning_rate": 4.314879985389708e-08, "loss": 0.941, "step": 16965 }, { "epoch": 0.9736645820184749, "grad_norm": 3.451343502349111, "learning_rate": 4.2224530688862453e-08, "loss": 0.9755, "step": 16970 }, { "epoch": 0.9739514602099948, "grad_norm": 3.2180684348653075, "learning_rate": 4.1310247203396605e-08, "loss": 0.9932, "step": 16975 }, { "epoch": 0.9742383384015147, "grad_norm": 3.1136670547203904, "learning_rate": 4.040595031434724e-08, "loss": 1.0252, "step": 16980 }, { "epoch": 0.9745252165930346, "grad_norm": 3.2784615448976697, "learning_rate": 3.9511640928543425e-08, "loss": 1.001, "step": 16985 }, { "epoch": 0.9748120947845544, "grad_norm": 3.321029419979409, "learning_rate": 3.862731994280111e-08, "loss": 0.9997, "step": 16990 }, { "epoch": 0.9750989729760744, "grad_norm": 3.2992327895907305, "learning_rate": 3.7752988243919817e-08, "loss": 1.0259, "step": 16995 }, { "epoch": 0.9753858511675942, "grad_norm": 3.5287874418874634, "learning_rate": 3.688864670868153e-08, "loss": 1.0335, "step": 17000 }, { "epoch": 0.9756727293591141, "grad_norm": 3.37385079770515, "learning_rate": 3.603429620384846e-08, "loss": 1.0147, "step": 17005 }, { "epoch": 0.975959607550634, "grad_norm": 3.146427428993973, "learning_rate": 3.51899375861664e-08, "loss": 1.0046, "step": 17010 }, { "epoch": 0.9762464857421539, "grad_norm": 3.198092055835174, "learning_rate": 3.4355571702360255e-08, "loss": 1.0139, "step": 17015 }, { "epoch": 0.9765333639336737, "grad_norm": 2.74024475377541, "learning_rate": 3.353119938913296e-08, "loss": 0.9576, "step": 17020 }, { "epoch": 0.9768202421251937, "grad_norm": 3.3730482673641227, "learning_rate": 3.271682147316879e-08, "loss": 0.9802, "step": 17025 }, { "epoch": 0.9771071203167135, "grad_norm": 3.404189887949835, "learning_rate": 3.1912438771125596e-08, "loss": 1.0381, "step": 17030 }, { "epoch": 0.9773939985082334, "grad_norm": 3.196361659418212, "learning_rate": 3.111805208964036e-08, "loss": 0.984, "step": 17035 }, { "epoch": 0.9776808766997532, "grad_norm": 3.4732612241486454, "learning_rate": 3.033366222532807e-08, "loss": 1.0141, "step": 17040 }, { "epoch": 0.9779677548912732, "grad_norm": 3.160419521162163, "learning_rate": 2.9559269964773984e-08, "loss": 1.0141, "step": 17045 }, { "epoch": 0.978254633082793, "grad_norm": 3.5324295353224398, "learning_rate": 2.8794876084541347e-08, "loss": 0.9984, "step": 17050 }, { "epoch": 0.9785415112743129, "grad_norm": 3.298279858427205, "learning_rate": 2.8040481351166992e-08, "loss": 0.9908, "step": 17055 }, { "epoch": 0.9788283894658328, "grad_norm": 3.8401157778896917, "learning_rate": 2.7296086521158004e-08, "loss": 0.9852, "step": 17060 }, { "epoch": 0.9791152676573527, "grad_norm": 3.880346434325686, "learning_rate": 2.6561692340997258e-08, "loss": 1.0267, "step": 17065 }, { "epoch": 0.9794021458488725, "grad_norm": 3.8517165099645494, "learning_rate": 2.583729954713454e-08, "loss": 0.9616, "step": 17070 }, { "epoch": 0.9796890240403925, "grad_norm": 3.4385496876053923, "learning_rate": 2.5122908865994332e-08, "loss": 0.9867, "step": 17075 }, { "epoch": 0.9799759022319123, "grad_norm": 3.878707449651252, "learning_rate": 2.441852101396802e-08, "loss": 1.1015, "step": 17080 }, { "epoch": 0.9802627804234322, "grad_norm": 3.1887851095737596, "learning_rate": 2.3724136697418353e-08, "loss": 1.0049, "step": 17085 }, { "epoch": 0.9805496586149521, "grad_norm": 3.478654023963993, "learning_rate": 2.303975661267499e-08, "loss": 0.9594, "step": 17090 }, { "epoch": 0.980836536806472, "grad_norm": 3.851570194111132, "learning_rate": 2.2365381446035617e-08, "loss": 0.9541, "step": 17095 }, { "epoch": 0.9811234149979918, "grad_norm": 3.9455813921558365, "learning_rate": 2.1701011873765943e-08, "loss": 1.1032, "step": 17100 }, { "epoch": 0.9814102931895118, "grad_norm": 3.2318812137029256, "learning_rate": 2.104664856209637e-08, "loss": 0.9833, "step": 17105 }, { "epoch": 0.9816971713810316, "grad_norm": 3.3531204445949196, "learning_rate": 2.0402292167224226e-08, "loss": 1.0396, "step": 17110 }, { "epoch": 0.9819840495725515, "grad_norm": 3.5172641640266105, "learning_rate": 1.9767943335311512e-08, "loss": 1.0299, "step": 17115 }, { "epoch": 0.9822709277640713, "grad_norm": 3.348037011622541, "learning_rate": 1.9143602702484943e-08, "loss": 1.033, "step": 17120 }, { "epoch": 0.9825578059555913, "grad_norm": 3.3928865885937407, "learning_rate": 1.8529270894833697e-08, "loss": 0.9867, "step": 17125 }, { "epoch": 0.9828446841471111, "grad_norm": 3.228369463046365, "learning_rate": 1.7924948528412756e-08, "loss": 0.9745, "step": 17130 }, { "epoch": 0.983131562338631, "grad_norm": 3.0440561806050854, "learning_rate": 1.733063620923625e-08, "loss": 0.9993, "step": 17135 }, { "epoch": 0.9834184405301509, "grad_norm": 3.294059778953701, "learning_rate": 1.6746334533284116e-08, "loss": 1.0442, "step": 17140 }, { "epoch": 0.9837053187216708, "grad_norm": 3.146268183438361, "learning_rate": 1.6172044086492088e-08, "loss": 0.9659, "step": 17145 }, { "epoch": 0.9839921969131906, "grad_norm": 3.1142192335208034, "learning_rate": 1.5607765444762834e-08, "loss": 1.0157, "step": 17150 }, { "epoch": 0.9842790751047106, "grad_norm": 3.6715910985047575, "learning_rate": 1.5053499173955933e-08, "loss": 1.0786, "step": 17155 }, { "epoch": 0.9845659532962304, "grad_norm": 3.3580754382617015, "learning_rate": 1.4509245829888995e-08, "loss": 1.0079, "step": 17160 }, { "epoch": 0.9848528314877503, "grad_norm": 3.178900203187281, "learning_rate": 1.3975005958341004e-08, "loss": 1.0196, "step": 17165 }, { "epoch": 0.9851397096792702, "grad_norm": 3.4824506039989265, "learning_rate": 1.3450780095051185e-08, "loss": 0.9241, "step": 17170 }, { "epoch": 0.9854265878707901, "grad_norm": 3.4354801181358128, "learning_rate": 1.2936568765711254e-08, "loss": 0.9565, "step": 17175 }, { "epoch": 0.9857134660623099, "grad_norm": 2.959318447747334, "learning_rate": 1.2432372485975397e-08, "loss": 0.9869, "step": 17180 }, { "epoch": 0.9860003442538299, "grad_norm": 3.529882554155149, "learning_rate": 1.193819176145361e-08, "loss": 1.033, "step": 17185 }, { "epoch": 0.9862872224453497, "grad_norm": 2.9197052155335186, "learning_rate": 1.1454027087708376e-08, "loss": 0.9109, "step": 17190 }, { "epoch": 0.9865741006368696, "grad_norm": 3.168890043343197, "learning_rate": 1.0979878950263533e-08, "loss": 0.9887, "step": 17195 }, { "epoch": 0.9868609788283894, "grad_norm": 3.3475344102144247, "learning_rate": 1.0515747824595412e-08, "loss": 1.081, "step": 17200 }, { "epoch": 0.9871478570199094, "grad_norm": 3.4056759763656057, "learning_rate": 1.0061634176136147e-08, "loss": 1.0633, "step": 17205 }, { "epoch": 0.9874347352114292, "grad_norm": 3.282206386237861, "learning_rate": 9.617538460270359e-09, "loss": 0.9866, "step": 17210 }, { "epoch": 0.9877216134029491, "grad_norm": 3.4276957874349523, "learning_rate": 9.183461122339587e-09, "loss": 0.9642, "step": 17215 }, { "epoch": 0.988008491594469, "grad_norm": 3.7253362185760324, "learning_rate": 8.759402597637855e-09, "loss": 1.0203, "step": 17220 }, { "epoch": 0.9882953697859889, "grad_norm": 3.6246427600266142, "learning_rate": 8.345363311410559e-09, "loss": 1.0749, "step": 17225 }, { "epoch": 0.9885822479775087, "grad_norm": 3.315159184457952, "learning_rate": 7.941343678857793e-09, "loss": 0.9991, "step": 17230 }, { "epoch": 0.9888691261690287, "grad_norm": 3.4896590553265745, "learning_rate": 7.547344105132136e-09, "loss": 1.047, "step": 17235 }, { "epoch": 0.9891560043605485, "grad_norm": 3.4348912253432604, "learning_rate": 7.163364985336429e-09, "loss": 0.9584, "step": 17240 }, { "epoch": 0.9894428825520684, "grad_norm": 3.471039147360216, "learning_rate": 6.789406704527102e-09, "loss": 1.0145, "step": 17245 }, { "epoch": 0.9897297607435883, "grad_norm": 3.3479590328068785, "learning_rate": 6.425469637708626e-09, "loss": 1.0558, "step": 17250 }, { "epoch": 0.9900166389351082, "grad_norm": 3.229069227909241, "learning_rate": 6.071554149837955e-09, "loss": 1.0154, "step": 17255 }, { "epoch": 0.990303517126628, "grad_norm": 3.2629651523594547, "learning_rate": 5.727660595823415e-09, "loss": 1.0136, "step": 17260 }, { "epoch": 0.990590395318148, "grad_norm": 3.1241654414072744, "learning_rate": 5.39378932052248e-09, "loss": 0.997, "step": 17265 }, { "epoch": 0.9908772735096678, "grad_norm": 3.202440116383696, "learning_rate": 5.069940658740669e-09, "loss": 1.0555, "step": 17270 }, { "epoch": 0.9911641517011877, "grad_norm": 3.136120933093717, "learning_rate": 4.756114935234868e-09, "loss": 1.0201, "step": 17275 }, { "epoch": 0.9914510298927075, "grad_norm": 3.328637372765008, "learning_rate": 4.452312464710007e-09, "loss": 1.0729, "step": 17280 }, { "epoch": 0.9917379080842275, "grad_norm": 3.0965421127693755, "learning_rate": 4.158533551820165e-09, "loss": 1.0924, "step": 17285 }, { "epoch": 0.9920247862757473, "grad_norm": 3.3339990224797034, "learning_rate": 3.8747784911674635e-09, "loss": 1.006, "step": 17290 }, { "epoch": 0.9923116644672672, "grad_norm": 3.166422326502353, "learning_rate": 3.6010475673009525e-09, "loss": 1.0299, "step": 17295 }, { "epoch": 0.9925985426587871, "grad_norm": 3.0429971654693517, "learning_rate": 3.337341054721055e-09, "loss": 1.0276, "step": 17300 }, { "epoch": 0.992885420850307, "grad_norm": 3.40127291670829, "learning_rate": 3.083659217871793e-09, "loss": 0.9814, "step": 17305 }, { "epoch": 0.9931722990418268, "grad_norm": 3.0281421258257666, "learning_rate": 2.840002311145229e-09, "loss": 0.9582, "step": 17310 }, { "epoch": 0.9934591772333468, "grad_norm": 3.606488183607485, "learning_rate": 2.606370578882578e-09, "loss": 1.0396, "step": 17315 }, { "epoch": 0.9937460554248666, "grad_norm": 3.26542386995938, "learning_rate": 2.3827642553686527e-09, "loss": 1.0718, "step": 17320 }, { "epoch": 0.9940329336163864, "grad_norm": 3.166852809558139, "learning_rate": 2.1691835648374183e-09, "loss": 0.9801, "step": 17325 }, { "epoch": 0.9943198118079064, "grad_norm": 3.346107741429317, "learning_rate": 1.9656287214686596e-09, "loss": 0.9513, "step": 17330 }, { "epoch": 0.9946066899994263, "grad_norm": 3.1044533841437265, "learning_rate": 1.7720999293857622e-09, "loss": 1.0253, "step": 17335 }, { "epoch": 0.9948935681909461, "grad_norm": 3.1883282607685053, "learning_rate": 1.588597382661261e-09, "loss": 1.0665, "step": 17340 }, { "epoch": 0.995180446382466, "grad_norm": 3.001896807736475, "learning_rate": 1.4151212653112922e-09, "loss": 1.0277, "step": 17345 }, { "epoch": 0.9954673245739859, "grad_norm": 3.697578153319097, "learning_rate": 1.2516717512989217e-09, "loss": 1.0376, "step": 17350 }, { "epoch": 0.9957542027655057, "grad_norm": 3.0796949875105897, "learning_rate": 1.098249004530816e-09, "loss": 1.0004, "step": 17355 }, { "epoch": 0.9960410809570256, "grad_norm": 3.2400655488307697, "learning_rate": 9.548531788605709e-10, "loss": 1.0567, "step": 17360 }, { "epoch": 0.9963279591485455, "grad_norm": 3.1208318454741004, "learning_rate": 8.21484418084273e-10, "loss": 1.0233, "step": 17365 }, { "epoch": 0.9966148373400654, "grad_norm": 2.8875143489197206, "learning_rate": 6.98142855946049e-10, "loss": 0.9895, "step": 17370 }, { "epoch": 0.9969017155315852, "grad_norm": 3.789005790743376, "learning_rate": 5.848286161314054e-10, "loss": 1.1136, "step": 17375 }, { "epoch": 0.9971885937231052, "grad_norm": 3.9578140318126565, "learning_rate": 4.815418122738891e-10, "loss": 1.0698, "step": 17380 }, { "epoch": 0.997475471914625, "grad_norm": 3.2247497097160256, "learning_rate": 3.882825479495367e-10, "loss": 1.0103, "step": 17385 }, { "epoch": 0.9977623501061449, "grad_norm": 3.0710151356946165, "learning_rate": 3.050509166779847e-10, "loss": 1.025, "step": 17390 }, { "epoch": 0.9980492282976648, "grad_norm": 3.6011376429760924, "learning_rate": 2.3184700192357967e-10, "loss": 1.0349, "step": 17395 }, { "epoch": 0.9983361064891847, "grad_norm": 3.2828123242337326, "learning_rate": 1.6867087709759866e-10, "loss": 0.9699, "step": 17400 }, { "epoch": 0.9986229846807045, "grad_norm": 3.4486833303944877, "learning_rate": 1.1552260555047768e-10, "loss": 0.9672, "step": 17405 }, { "epoch": 0.9989098628722245, "grad_norm": 3.4416426216010434, "learning_rate": 7.240224058180368e-11, "loss": 1.0408, "step": 17410 }, { "epoch": 0.9991967410637443, "grad_norm": 3.2220472025524605, "learning_rate": 3.93098254314328e-11, "loss": 1.0822, "step": 17415 }, { "epoch": 0.9994836192552642, "grad_norm": 3.4988767661557105, "learning_rate": 1.6245393286151713e-11, "loss": 1.0027, "step": 17420 }, { "epoch": 0.9997704974467841, "grad_norm": 3.7985696594822804, "learning_rate": 3.208967271906005e-12, "loss": 1.0295, "step": 17425 }, { "epoch": 1.0, "eval_loss": 1.0397080183029175, "eval_runtime": 1067.7803, "eval_samples_per_second": 14.451, "eval_steps_per_second": 1.807, "step": 17429 }, { "epoch": 1.0, "step": 17429, "total_flos": 456159752355840.0, "train_loss": 1.1550761079944154, "train_runtime": 57821.4994, "train_samples_per_second": 2.411, "train_steps_per_second": 0.301 } ], "logging_steps": 5, "max_steps": 17429, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 456159752355840.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }